def get_api_hastag_posts(self, response: Response):
    """Handle one JSON API page of hashtag posts.

    Emits an item per post and queues both the next API page and the
    detail pages of popular posts.
    """
    hashtag = response.json()['data']['hashtag']
    # Queue the next API page first, if the paginator produces one.
    next_url = self.get_url_to_query_next_posts_for_api(hashtag)
    if next_url:
        yield response.follow(next_url, callback=self.get_api_hastag_posts)
    for edge in hashtag['edge_hashtag_to_media']['edges']:
        node = edge['node']
        yield InstaPostItem(data=node)
        # Popular posts (by comments or likes) get a full page crawl.
        if (node['edge_media_to_comment']['count'] > 30
                or node['edge_liked_by']['count'] > 100):
            yield response.follow(f'/p/{node["shortcode"]}/',
                                  callback=self.post_page_parse)
def first_tag_page_parse(self, response: Response):
    """Parse the initial HTML tag page: emit the hashtag item, its posts,
    and seed the JSON API pagination."""
    shared = self.get_js_shared_data(response)
    hashtag: dict = shared['entry_data']['TagPage'][0]['graphql']['hashtag']
    next_url = self.get_url_to_query_next_posts_for_api(hashtag)
    if next_url:
        yield response.follow(next_url, callback=self.get_api_hastag_posts)
    hashtag['posts_count'] = hashtag['edge_hashtag_to_media']['count']
    # pop() keeps the heavy edge list out of the hashtag item payload.
    edges = hashtag.pop('edge_hashtag_to_media')['edges']
    yield InstaHashTagItem(data=hashtag)
    for edge in edges:
        node = edge['node']
        yield InstaPostItem(data=node)
        # Popular posts (by comments or likes) get a full page crawl.
        if (node['edge_media_to_comment']['count'] > 30
                or node['edge_liked_by']['count'] > 100):
            yield response.follow(f'/p/{node["shortcode"]}/',
                                  callback=self.post_page_parse)
def parse(self, response: Response, current_page: Optional[int] = None) -> Generator:
    """Parse a catalogue listing page.

    Schedules every other catalogue page (Scrapy's dupefilter collapses
    the repeats) and follows each book link with Playwright enabled,
    grouping pages into per-catalogue-page browser contexts.

    :param response: listing page response
    :param current_page: 1-based page number; None for the start page
    """
    page_count = response.css(".pager .current::text").re_first(r"Page \d+ of (\d+)")
    # BUG FIX: re_first returns None when the pager is missing (e.g. a
    # single-page catalogue); int(None) raised TypeError. Treat a missing
    # pager as "only this page".
    page_count = int(page_count) if page_count else 1
    for page in range(2, page_count + 1):
        yield response.follow(f"/catalogue/page-{page}.html",
                              cb_kwargs={"current_page": page})
    current_page = current_page or 1
    for book in response.css("article.product_pod a"):
        yield response.follow(
            book,
            callback=self.parse_book,
            meta={
                "playwright": True,
                "playwright_include_page": True,
                # One browser context per catalogue page.
                "playwright_context": f"page-{current_page}",
            },
        )
def parse_propertymodel(self, response: Response, model: Dict) -> PR:
    """Request the detail page of every non-banned property in *model*."""
    for prop in model["properties"]:
        _make_id(prop)
        known = self.property_repository.get(prop["id"])
        # Banned properties are skipped entirely.
        if known and known.banned:
            continue
        # Bind the raw data and repository record into the callback.
        follow_up = functools.partial(self.parse_property, prop, known)
        yield response.follow(prop["propertyUrl"], callback=follow_up)
def get_api_hashtag_posts(self, response: Response):
    """Parse one JSON API page of hashtag posts.

    Deduplicates posts across pages, queues popular posts' pages for
    owner extraction, emits an item per new post, and paginates while
    the API reports more pages.
    """
    js_data = json.loads(response.text)
    hashtag = js_data['data']['hashtag']
    for post in hashtag['edge_hashtag_to_media']['edges']:
        node = post['node']
        # '__typename' is renamed so the item field is a valid identifier.
        node['typename'] = node.pop('__typename')
        # IDIOM FIX: `x not in y` instead of `not x in y`.
        # NOTE(review): posts_id is a list, so this membership test is
        # O(n) per post; a set would scale better — confirm nothing
        # depends on insertion order before changing the shared state.
        if node['id'] not in self.collected_id['posts_id']:
            self.collected_id['posts_id'].append(node['id'])
            # Popular posts (likes or comments) get a full page crawl.
            if (node['edge_liked_by']['count'] > 100
                    or node['edge_media_to_comment']['count'] > 30):
                post_url = f'{self.start_urls[0]}p/{node["shortcode"]}/'
                yield response.follow(
                    post_url, callback=self.get_popular_posts_owners)
            yield InstagramPostitem(node)
    if hashtag['edge_hashtag_to_media']['page_info']['has_next_page']:
        url = self.get_api_url(hashtag)
        yield response.follow(url, callback=self.get_api_hashtag_posts)
def get_api_hastag_posts(self, response: Response):
    """Parse the tag's HTML page and queue posts plus the next API page.

    Example of the GraphQL URL built below:
    https://www.instagram.com/graphql/query/?query_hash=c769cb6c71b24c8a86590b22402fda50&variables={"tag_name":"datascience","first":7,"after":"QVFCYVRCb0RSVUNFeEE4MUhJWHUwZGNuNUJQQzdrQy1xQkhnd2JoSUY3STZRZC1kaWhMTW9BN0llZXV5eU1wZ3pPUkE0UHY3UEVyWmNmcWtPV3E5d2ZYTg=="}
    """
    js_data = self.get_js_shared_data(response)
    hashtag = js_data['entry_data']['TagPage'][0]['graphql']['hashtag']
    variables = {
        "tag_name": hashtag['name'],
        "first": 50,
        "after": hashtag['edge_hashtag_to_media']['page_info']['end_cursor'],
    }
    url = (f'{self.__api_tag_url}?query_hash={self.__query_hash}'
           f'&variables={json.dumps(variables)}')
    # NOTE(review): the follow-up response is JSON, but this callback
    # starts by parsing shared HTML data — a dedicated JSON callback
    # looks intended here; confirm against the rest of the spider.
    yield response.follow(url, callback=self.get_api_hastag_posts)
    posts = hashtag['edge_hashtag_to_media']['edges']
    for post in posts:
        node = post['node']
        # BUG FIX: post nodes carry 'edge_media_to_comment', not
        # 'edge_hashtag_to_media'; the original key raised KeyError.
        # (The >30 threshold matches the comment-count checks used by
        # the sibling spiders in this project.)
        if (node['edge_media_to_comment']['count'] > 30
                or node['edge_liked_by']['count'] > 100):
            # NOTE(review): `post_parse` is not defined in this block's
            # scope — verify it exists at module level or should be a
            # method (self.post_parse).
            yield response.follow(f'/p/{node["shortcode"]}/',
                                  callback=post_parse)
        # BUG FIX: the original yielded InstaPostsItem(data=post['node'])
        # where `post` was rebound to the whole edge *list* — indexing a
        # list with a string is a TypeError. Emit each node instead.
        yield InstaPostsItem(data=node)
def parse(self, response: Response):
    """Parse URLs.

    :param Response response: HTTP response returned by URL requested
    """
    articles = Selector(response).css("article")
    logging.info("La page {} contient {} articles".format(
        response.url, len(articles)))
    for article in articles:
        # The title block holds the article's relative link.
        header = article.css("div.title-and-meta")
        relative_url = header.css("h2.node__title a::attr(href)").get()
        if relative_url is not None:
            yield response.follow(relative_url, callback=self.parse_article)
    # Bottom pagination: keep crawling while a "next" link exists.
    next_page = response.css("li.pager-next a::attr(href)").get()
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse)
def parse(self, response: Response):
    """Parse URLs.

    :param Response response: HTTP response returned by URL requested
    """
    rows = Selector(response).css("div.views-row")
    logging.info("La page {} contient {} tutoriels".format(
        response.url, len(rows)))
    for row in rows:
        # Follow each tutorial's link when present.
        link = row.css("a::attr(href)").get()
        if link is not None:
            yield response.follow(link, callback=self.parse_article)
def parse(self, response: Response, **kwargs):
    # Login flow entry point. Called twice: first with the HTML login
    # page, then re-entered as the callback of the login POST (JSON).
    try:
        # HTML branch: extract the shared JS blob and submit credentials.
        js_data = self.get_js_shared_data(response)
        yield scrapy.FormRequest(
            self.__login_url,
            method='POST',
            callback=self.parse,  # re-enter this method with the JSON reply
            formdata={
                'username': self.__login,
                'enc_password': self.__password
            },
            # Instagram requires the CSRF token mirrored in this header.
            headers={'X-CSRFToken': js_data['config']['csrf_token']})
    except AttributeError as e:
        # JSON branch: presumably get_js_shared_data raises AttributeError
        # on non-HTML bodies — TODO confirm. On successful authentication,
        # proceed to the hashtag page.
        if response.json().get('authenticated'):
            yield response.follow(self.__tag_url,
                                  callback=self.first_tag_page_parse)
def parse(self, response: Response, **kwargs):
    """Report every product on this listing page, then follow pagination."""
    for container in response.css('div.product'):
        name = container.css('div.productTitleContent a ::text').get().strip()
        raw_price = container.css('span.product_price_text ::text').get().strip()
        href = container.css('div.productTitleContent a ::attr(href)').get().strip()
        self.data_read_callback({
            'name': name,
            'price': self._get_price_from_string(raw_price),
            # Product links are relative to the current page URL.
            'link': response.request.url + href,
        })
    next_page = response.css('div.pagination a.next ::attr(href)').get()
    if next_page:
        yield response.follow(next_page, self.parse)
def parse(self, response: Response, **kwargs):
    """Report products from this page, then follow the pagination link."""
    for container in response.css('div.product-container'):
        name = container.css('a.product-name ::text').get().strip()
        raw_price = container.css('span.product-price ::text').get().strip()
        link = container.css('a.product-name ::attr(href)').get().strip()
        self.data_read_callback({
            'name': name,
            'price': self._get_price_from_string(raw_price),
            'link': link,
        })
    next_page = response.css('li.pagination_next a::attr(href)').get()
    if next_page:
        # The next-page href is site-relative; rebuild an absolute URL.
        parts = urlsplit(response.request.url)
        yield response.follow(
            f"{parts.scheme}://{parts.netloc}{next_page}", self.parse)
def parse(self, response: Response, **kwargs):
    """Report products from this page; stop paginating at the first
    page with no products."""
    containers = response.css('div.ramecekshop')
    for container in containers:
        name = container.css('a.nadpisramecek ::text').get().strip()
        raw_price = container.css('a.objednejkosobr ::text').get().strip()
        href = container.css('a.nadpisramecek ::attr(href)').get().strip()
        self.data_read_callback({
            'name': name,
            'price': self._get_price_from_string(raw_price),
            'link': self._get_product_link(response, href),
        })
    # An empty selector list marks the page past the last one.
    if containers:
        yield response.follow(self._get_next_page_url(response), self.parse)
def parse(self, response: Response, **kwargs):
    """Report in-stock products from this page, then follow pagination."""
    for container in response.css('div.product'):
        stock = container.css("span.p-cat-availability ::text").get().strip()
        # Only items whose availability starts with "Skladem" (in stock).
        if not stock.startswith("Skladem"):
            continue
        name = container.css('a.p-name span ::text').get().strip()
        raw_price = container.css('span.p-det-main-price ::text').get().strip()
        # href starts with a duplicate slash; drop the first character
        # before appending to the page URL.
        href = container.css('a.p-name ::attr(href)').get().strip()[1:]
        self.data_read_callback({
            'name': name,
            'price': self._get_price_from_string(raw_price),
            'link': response.request.url + href,
        })
    next_page = response.css(
        'div.pagination a.s-page.pagination-page ::attr(href)').get()
    if next_page:
        # The next-page href is site-relative; rebuild an absolute URL.
        parts = urlsplit(response.request.url)
        yield response.follow(
            f"{parts.scheme}://{parts.netloc}{next_page}", self.parse)
def parse(self, response: Response):
    """Yield every joke on this listing page, then advance to the next
    numbered page (URL pattern: .../<page_num>).

    Stops when a page contains no jokes or the URL has no trailing
    page number to increment.
    """
    # List of jokes on the page; empty once we run past the last page.
    l_jokes = response.css('article[class="chiste mb10"]')
    if l_jokes:
        for joke in l_jokes:
            # A joke body may span several text nodes; join them.
            l_strings = [x.get() for x in
                         joke.css("p[itemprop='articleBody']::text")]
            s_joke = "".join(l_strings)
            # The share link doubles as a stable per-joke identifier.
            url_id = joke.css("a[class='compartir']::attr('href')")[0].get()
            yield {"hash_id": url_id,
                   "user_str_id": "1000Chistes",
                   "user_name": "1000Chistes",
                   "joke": s_joke}
        # NOTE(review): time.sleep blocks Scrapy's reactor; the
        # DOWNLOAD_DELAY setting is the proper throttle. Kept as-is to
        # preserve the existing crawl pacing.
        time.sleep(5)
        base, _, last_segment = response.url.rpartition(r"/")
        # BUG FIX: the original did int(...) unconditionally and crashed
        # with ValueError when the URL did not end in a page number
        # (e.g. the landing page). Stop paginating in that case.
        try:
            new_page_number = int(last_segment) + 1
        except ValueError:
            return
        new_url = "{url}/{page_num}".format(url=base,
                                            page_num=new_page_number)
        print(new_url)
        yield response.follow(new_url, self.parse)
def tag_page_parse(self, response: Response):
    """From the tag's HTML page, jump straight into the JSON API feed."""
    shared = self.get_js_shared_data(response)
    hashtag = shared['entry_data']['TagPage'][0]['graphql']['hashtag']
    api_url = self.get_api_url(hashtag)
    yield response.follow(api_url, callback=self.get_api_hashtag_posts)