Exemple #1
0
 def get_api_hastag_posts(self, response: Response):
     """Parse one page of the hashtag API and emit its posts.

     Yields, in order: a request for the next API page when
     ``get_url_to_query_next_posts_for_api`` returns a truthy URL, then for
     every post edge an ``InstaPostItem`` followed, for posts with more than
     30 comments or more than 100 likes, by a request for the post page.

     Posts are emitted whether or not a next-page URL is available, so the
     posts carried by a response without a follow-up page (presumably the
     last page -- the URL helper is not visible here) are not dropped.

     :param Response response: API response whose JSON body contains
         ``data.hashtag``
     """
     hashtag = response.json()['data']['hashtag']
     url = self.get_url_to_query_next_posts_for_api(hashtag)
     if url:
         yield response.follow(url, callback=self.get_api_hastag_posts)

     for post in hashtag['edge_hashtag_to_media']['edges']:
         node = post['node']
         yield InstaPostItem(data=node)
         # Popular posts additionally get their own page crawled.
         if (node['edge_media_to_comment']['count'] > 30
                 or node['edge_liked_by']['count'] > 100):
             yield response.follow(f'/p/{node["shortcode"]}/',
                                   callback=self.post_page_parse)
Exemple #2
0
 def first_tag_page_parse(self, response: Response):
     """Handle the first (HTML) page of a hashtag.

     Reads the hashtag data from the page's shared JS data, schedules the
     next API page when a URL is available, emits one ``InstaHashTagItem``
     for the hashtag itself and then one ``InstaPostItem`` per post. Posts
     with more than 30 comments or more than 100 likes also get a request
     for their own page.

     :param Response response: response of the hashtag page
     """
     shared_data = self.get_js_shared_data(response)
     tag_page = shared_data['entry_data']['TagPage'][0]
     hashtag: dict = tag_page['graphql']['hashtag']

     next_api_url = self.get_url_to_query_next_posts_for_api(hashtag)
     if next_api_url:
         yield response.follow(next_api_url,
                               callback=self.get_api_hastag_posts)

     # Keep only the post count on the hashtag item; the media edges are
     # detached and emitted separately as post items below.
     media = hashtag['edge_hashtag_to_media']
     hashtag['posts_count'] = media['count']
     del hashtag['edge_hashtag_to_media']
     edges = media['edges']
     yield InstaHashTagItem(data=hashtag)

     for edge in edges:
         node = edge['node']
         yield InstaPostItem(data=node)
         many_comments = node['edge_media_to_comment']['count'] > 30
         if many_comments or node['edge_liked_by']['count'] > 100:
             yield response.follow(f'/p/{node["shortcode"]}/',
                                   callback=self.post_page_parse)
Exemple #3
0
    def parse(self, response: Response, current_page: Optional[int] = None) -> Generator:
        """Schedule the catalogue pages and the book pages of one listing.

        On the start page (``current_page`` not supplied) a request is
        yielded for every further catalogue page, each carrying its page
        number through ``cb_kwargs``. On every page, one request per book
        link is yielded with ``parse_book`` as callback and a Playwright
        context named after the current page.

        :param Response response: catalogue listing response
        :param current_page: page number passed along by the pagination
            requests; ``None`` for the start page, which is treated as page 1
        """
        # Only the start page fans out to the other pages: the paginated
        # responses arrive with ``current_page`` set and would otherwise
        # re-create the same page requests over and over.
        if current_page is None:
            page_count = response.css(".pager .current::text").re_first(r"Page \d+ of (\d+)")
            # ``re_first`` gives None when no pager text matches; treat that
            # as a single-page catalogue instead of failing in ``int(None)``.
            if page_count is not None:
                for page in range(2, int(page_count) + 1):
                    yield response.follow(f"/catalogue/page-{page}.html", cb_kwargs={"current_page": page})

        current_page = current_page or 1
        for book in response.css("article.product_pod a"):
            yield response.follow(
                book,
                callback=self.parse_book,
                meta={
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_context": f"page-{current_page}",
                },
            )
Exemple #4
0
 def parse_propertymodel(self, response: Response, model: Dict) -> PR:
     """Yield a follow-up request for each property listed in ``model``.

     Every entry of ``model["properties"]`` is first passed to ``_make_id``
     (presumably to populate its ``"id"`` key -- the helper is not visible
     here), then looked up in the property repository. Entries whose stored
     record is flagged as banned are skipped; for all others a request to
     ``propertyUrl`` is yielded whose callback is ``parse_property`` with
     the entry and the stored record (possibly falsy) pre-bound.

     :param Response response: response used to resolve the property URLs
     :param model: mapping with a ``"properties"`` iterable of property dicts
     """
     for entry in model["properties"]:
         _make_id(entry)
         known = self.property_repository.get(entry["id"])
         if not (known and known.banned):
             on_property_page = functools.partial(
                 self.parse_property, entry, known)
             yield response.follow(entry["propertyUrl"],
                                   callback=on_property_page)
    def get_api_hashtag_posts(self, response: Response):
        """Emit the not-yet-seen posts of one hashtag API page.

        For each post node the ``__typename`` key is renamed to
        ``typename``. Nodes whose id is already recorded in
        ``self.collected_id['posts_id']`` are skipped; new ids are recorded.
        A new post with more than 100 likes or more than 30 comments first
        yields a request for its page (callback
        ``get_popular_posts_owners``), and every new post yields an
        ``InstagramPostitem``. When the page info reports a next page, the
        next API request is yielded last.

        :param Response response: API response with a JSON text body
        """
        payload = json.loads(response.text)
        hashtag = payload['data']['hashtag']
        media = hashtag['edge_hashtag_to_media']

        for edge in media['edges']:
            node = edge['node']
            node['typename'] = node.pop('__typename')

            seen_ids = self.collected_id['posts_id']
            if node['id'] in seen_ids:
                continue
            seen_ids.append(node['id'])

            is_popular = (node['edge_liked_by']['count'] > 100
                          or node['edge_media_to_comment']['count'] > 30)
            if is_popular:
                post_url = f'{self.start_urls[0]}p/{node["shortcode"]}/'
                yield response.follow(
                    post_url, callback=self.get_popular_posts_owners)
            yield InstagramPostitem(node)

        if media['page_info']['has_next_page']:
            next_url = self.get_api_url(hashtag)
            yield response.follow(next_url,
                                  callback=self.get_api_hashtag_posts)
Exemple #6
0
    def get_api_hastag_posts(self, response: Response):
        """Request the next hashtag page and emit the posts of this one.

        Reads the hashtag data from the response's shared JS data. When the
        page info does not report the end of the listing, a request for the
        next 50 posts (after ``end_cursor``) is yielded. Then, for every
        post edge, a request for the post page is yielded if the post has
        more than 30 comments or more than 100 likes, followed by an
        ``InstaPostsItem`` built from the post node.

        :param Response response: response carrying the hashtag data
        """
        js_data = self.get_js_shared_data(response)
        hashtag = js_data['entry_data']['TagPage'][0]['graphql']['hashtag']
        media = hashtag['edge_hashtag_to_media']
        page_info = media['page_info']

        # Stop paginating once the API says there is no further page. A
        # missing flag keeps the previous behavior of always requesting.
        if page_info.get('has_next_page', True):
            variables = {
                "tag_name": hashtag['name'],
                "first": 50,
                "after": page_info['end_cursor'],
            }
            # Resulting URL shape:
            # https://www.instagram.com/graphql/query/?query_hash=<hash>&variables={"tag_name":"datascience","first":7,"after":"<end_cursor>"}
            url = f'{self.__api_tag_url}?query_hash={self.__query_hash}&variables={json.dumps(variables)}'
            yield response.follow(url, callback=self.get_api_hastag_posts)

        for pst in media['edges']:
            node = pst['node']
            # The comment count lives under ``edge_media_to_comment`` on a
            # post node; ``edge_hashtag_to_media`` is a hashtag-level key.
            if (node['edge_media_to_comment']['count'] > 30
                    or node['edge_liked_by']['count'] > 100):
                # NOTE(review): ``post_parse`` is resolved as a module-level
                # name, not as a method of this spider -- confirm it exists.
                yield response.follow(f'/p/{node["shortcode"]}/',
                                      callback=post_parse)
            # Build the item from the current edge's node (the edges list
            # itself cannot be indexed by 'node').
            yield InstaPostsItem(data=node)
Exemple #7
0
    def parse(self, response: Response):
        """Follow every article link of a listing page, then the next page.

        Logs how many ``article`` elements the page contains, yields one
        request per article link (callback ``parse_article``) and finally a
        request for the next listing page when the pager provides one.

        :param Response response: HTTP response returned by URL requested
        """
        articles = Selector(response).css("article")
        logging.info("La page {} contient {} articles".format(
            response.url, len(articles)))

        for article in articles:
            # The link to the article sits in the heading of the title block.
            title_block = article.css("div.title-and-meta")
            href = title_block.css("h2.node__title a::attr(href)").get()
            if href is None:
                continue
            yield response.follow(href, callback=self.parse_article)

        # The bottom pager links to the next listing page, if there is one.
        next_href = response.css("li.pager-next a::attr(href)").get()
        if next_href is None:
            return
        yield response.follow(next_href, callback=self.parse)
Exemple #8
0
    def parse(self, response: Response):
        """Follow the link of every tutorial row found on the page.

        Logs how many ``div.views-row`` elements the page contains and
        yields one request (callback ``parse_article``) for each row that
        has a link.

        :param Response response: HTTP response returned by URL requested
        """
        rows = Selector(response).css("div.views-row")
        logging.info("La page {} contient {} tutoriels".format(
            response.url, len(rows)))

        for row in rows:
            # First link of the row; rows without any link are skipped.
            href = row.css("a::attr(href)").get()
            if href is None:
                continue
            yield response.follow(href, callback=self.parse_article)
Exemple #9
0
 def parse(self, response: Response, **kwargs):
     """Log in, then start crawling the tag page once authenticated.

     When the shared JS data can be read from the response, a login
     ``FormRequest`` is yielded with the CSRF token taken from that data;
     its response comes back to this same callback. When reading the shared
     data raises ``AttributeError`` (presumably because the response is the
     JSON login reply rather than an HTML page -- the helper is not visible
     here), the JSON body is checked and, if it reports ``authenticated``,
     the tag page is requested.

     :param Response response: login page or login reply
     :param kwargs: accepted for callback compatibility; unused
     """
     # Guard only the shared-data lookup: an AttributeError raised while
     # building the login request (e.g. a missing credential attribute)
     # must surface instead of being mistaken for a login reply.
     try:
         js_data = self.get_js_shared_data(response)
     except AttributeError:
         if response.json().get('authenticated'):
             yield response.follow(self.__tag_url,
                                   callback=self.first_tag_page_parse)
         return

     yield scrapy.FormRequest(
         self.__login_url,
         method='POST',
         callback=self.parse,
         formdata={
             'username': self.__login,
             'enc_password': self.__password
         },
         headers={'X-CSRFToken': js_data['config']['csrf_token']})
Exemple #10
0
 def parse(self, response: Response, **kwargs):
     """Report every product on the page, then follow the next page.

     Each ``div.product`` element is turned into a dict with ``name``,
     ``price`` (converted by ``_get_price_from_string``) and ``link`` (the
     request URL with the product href appended) and handed to
     ``data_read_callback``. A request for the next page is yielded when
     the pagination contains a "next" link.

     :param Response response: product listing response
     :param kwargs: accepted for callback compatibility; unused
     """
     for product in response.css('div.product'):
         name = product.css(
             'div.productTitleContent a ::text').get().strip()
         price_text = product.css(
             'span.product_price_text ::text').get().strip()
         price = self._get_price_from_string(price_text)
         base_url = response.request.url
         href = product.css(
             'div.productTitleContent a ::attr(href)').get().strip()
         self.data_read_callback({
             'name': name,
             'price': price,
             'link': base_url + href,
         })

     next_href = response.css('div.pagination a.next ::attr(href)').get()
     if not next_href:
         return
     yield response.follow(next_href, self.parse)
Exemple #11
0
 def parse(self, response: Response, **kwargs):
     """Report every product on the page, then follow the next page.

     Each ``div.product-container`` element is turned into a dict with
     ``name``, ``price`` (converted by ``_get_price_from_string``) and
     ``link`` (the product href as found) and handed to
     ``data_read_callback``. When a next-page href exists, it is appended
     to the scheme and host of the request URL and followed.

     :param Response response: product listing response
     :param kwargs: accepted for callback compatibility; unused
     """
     for product in response.css('div.product-container'):
         name = product.css('a.product-name ::text').get().strip()
         price_text = product.css(
             'span.product-price ::text').get().strip()
         price = self._get_price_from_string(price_text)
         href = product.css('a.product-name ::attr(href)').get().strip()
         self.data_read_callback({
             'name': name,
             'price': price,
             'link': href,
         })

     next_href = response.css('li.pagination_next a::attr(href)').get()
     if not next_href:
         return
     # Rebuild an absolute URL from the request's scheme and host.
     origin = urlsplit(response.request.url)
     yield response.follow(
         f"{origin.scheme}://{origin.netloc}{next_href}",
         self.parse)
Exemple #12
0
 def parse(self, response: Response, **kwargs):
     """Report every product on the page; continue while pages have products.

     Each ``div.ramecekshop`` element is turned into a dict with ``name``,
     ``price`` (converted by ``_get_price_from_string``) and ``link``
     (built by ``_get_product_link``) and handed to ``data_read_callback``.
     If the page contained at least one product, the URL returned by
     ``_get_next_page_url`` is followed; an empty page ends the crawl.

     :param Response response: product listing response
     :param kwargs: accepted for callback compatibility; unused
     """
     products = response.css('div.ramecekshop')
     for product in products:
         name = product.css('a.nadpisramecek ::text').get().strip()
         price_text = product.css('a.objednejkosobr ::text').get().strip()
         price = self._get_price_from_string(price_text)
         href = product.css('a.nadpisramecek ::attr(href)').get().strip()
         self.data_read_callback({
             'name': name,
             'price': price,
             'link': self._get_product_link(response, href),
         })

     # A page without any product marks the end of the listing.
     if products:
         yield response.follow(self._get_next_page_url(response),
                               self.parse)
Exemple #13
0
 def parse(self, response: Response, **kwargs):
     """Report the in-stock products of the page, then follow the next page.

     Products whose availability text does not start with ``Skladem`` are
     skipped. Every other ``div.product`` element is turned into a dict
     with ``name``, ``price`` (converted by ``_get_price_from_string``) and
     ``link`` (the request URL plus the product href without its first
     character) and handed to ``data_read_callback``. When a pagination
     href exists, it is appended to the scheme and host of the request URL
     and followed.

     :param Response response: product listing response
     :param kwargs: accepted for callback compatibility; unused
     """
     for product in response.css('div.product'):
         stock_text = product.css(
             "span.p-cat-availability ::text").get().strip()
         if not stock_text.startswith("Skladem"):
             continue

         name = product.css('a.p-name span ::text').get().strip()
         price_text = product.css(
             'span.p-det-main-price ::text').get().strip()
         price = self._get_price_from_string(price_text)
         base_url = response.request.url
         href = product.css('a.p-name ::attr(href)').get().strip()
         self.data_read_callback({
             'name': name,
             'price': price,
             # Drop the href's first character before appending it.
             'link': base_url + href[1:],
         })

     next_href = response.css(
         'div.pagination a.s-page.pagination-page ::attr(href)').get()
     if not next_href:
         return
     # Rebuild an absolute URL from the request's scheme and host.
     origin = urlsplit(response.request.url)
     yield response.follow(
         f"{origin.scheme}://{origin.netloc}{next_href}",
         self.parse)
Exemple #14
0
    def parse(self, response: Response):
        """Yield the jokes of one listing page, then follow the next page.

        Every joke article yields a dict with the share link as
        ``hash_id``, the fixed user fields and the joined body text. A page
        without joke articles ends the crawl. Otherwise the method waits
        five seconds, increments the numeric last path segment of the
        current URL, prints the resulting URL and follows it.

        :param Response response: joke listing response whose URL ends in
            a page number
        """
        jokes = response.css('article[class="chiste mb10"]')

        # No jokes on the page means the last page has been passed.
        if not jokes:
            return

        for joke in jokes:
            body_parts = joke.css("p[itemprop='articleBody']::text").getall()
            text = "".join(body_parts)
            share_href = joke.css("a[class='compartir']::attr('href')")[0].get()
            yield {
                "hash_id": share_href,
                "user_str_id": "1000Chistes",
                "user_name": "1000Chistes",
                "joke": text,
            }

        # NOTE(review): a blocking sleep inside a callback pauses the whole
        # process, not just this request -- a crawler-level download delay
        # may be the better tool; confirm before changing.
        time.sleep(5)

        # The page number is the last path segment of the current URL.
        url_parts = response.url.split(r"/")
        next_number = int(url_parts[-1]) + 1
        next_url = "{url}/{page_num}".format(
            url=r"/".join(url_parts[:-1]), page_num=next_number)
        print(next_url)
        yield response.follow(next_url, self.parse)
 def tag_page_parse(self, response: Response):
     """Request the hashtag API page for the hashtag shown on a tag page.

     Reads the hashtag data from the page's shared JS data, builds the API
     URL with ``get_api_url`` and yields a request for it with
     ``get_api_hashtag_posts`` as callback.

     :param Response response: response of the hashtag page
     """
     shared_data = self.get_js_shared_data(response)
     tag_page = shared_data['entry_data']['TagPage'][0]
     api_url = self.get_api_url(tag_page['graphql']['hashtag'])
     yield response.follow(api_url, callback=self.get_api_hashtag_posts)