Example #1
    def download_image(self, response):
        # full_img_url = response.xpath('//a[contains(text(), "View")]')
        img_urls = response.meta['img_urls']
        remaining_images = response.meta['remaining_images']
        new = ItemLoader(item=FbcrawlItem(),response=response, parent=response.meta['item'])
        new.context['lang'] = self.lang

        # Facebook's image CDN serves from scontent-*.fbcdn.net hosts,
        # so match on the 'scontent' fragment and keep the first hit
        for img_url in response.xpath('//img/@src').extract():
            if 'scontent' in img_url:
                img_urls.append(img_url)
                break

        if len(remaining_images) > 0:
            img_url = remaining_images.pop()
            yield scrapy.Request(img_url, callback=self.download_image, 
                meta={'remaining_images' : remaining_images, 'item': new, 'img_urls': img_urls, 'check_reactions': response.meta['check_reactions'] })
        else:
            new.add_value('image_urls', img_urls)
            reaction_payload = response.meta['check_reactions']
            if reaction_payload['check']:
                yield scrapy.Request(reaction_payload['url'], callback=self.parse_reactions, meta={'item':new})
            else:
                yield new.load_item()
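All of the examples on this page are methods of Scrapy spider classes from forks of the fbcrawl project, so the module-level imports are not shown. A plausible import block covering them is sketched below; the exact home of FbcrawlItem and of the parse_date/parse_date2 helpers is an assumption about the project layout (url_strip, used in Example #20, is another project-specific helper):

    import re
    import requests
    import scrapy
    from datetime import datetime
    from scrapy.loader import ItemLoader
    from scrapy.exceptions import CloseSpider
    # assumed project layout:
    from fbcrawl.items import FbcrawlItem, parse_date, parse_date2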
Example #2
    def parse_post(self, response):
        new = ItemLoader(item=FbcrawlItem(),
                         response=response,
                         parent=response.meta['item'])
        new.context['lang'] = self.lang
        new.add_xpath(
            'source',
            "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()"
        )
        new.add_xpath(
            'shared_from',
            '//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()'
        )
        new.add_xpath(
            'post_text',
            '//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()'
        )

        #check reactions for old posts
        check_reactions = response.xpath(
            "//a[contains(@href,'reaction/profile')]/div/div/text()").get()
        if not check_reactions:
            yield new.load_item()
        else:
            reactions = response.xpath(
                "//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href"
            )
            reactions = response.urljoin(reactions[0].extract())
            yield scrapy.Request(reactions,
                                 callback=self.parse_reactions,
                                 meta={
                                     'item': new,
                                     'current_date':
                                     response.meta['current_date']
                                 })
Example #3
    def parse_post(self,response):
        new = ItemLoader(item=FbcrawlItem(),response=response,parent=response.meta['item'])
        new.context['lang'] = self.lang           
        new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
        new.add_xpath('shared_from','//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
        # new.add_xpath('date','//div/div/abbr/text()')
        new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')


        # Crawling links that lead to external websites
        try:
            internal_link = response.xpath("//div//a[contains(@href, '://')]/@href").get()
            internal_redirection_page = requests.get(internal_link)
            internal_redirection_page_content = internal_redirection_page.content.decode("utf-8").replace('\\', '')
            # find the first link-like substring in the content of the redirecting page
            # (this includes links that get wrapped, like "https://trib.al/<some_shortened_link>"):
            shortened_external_link = re.findall(r'https:[a-zA-Z0-9/.?=\n_]*', internal_redirection_page_content)[0]
            external_link = requests.get(shortened_external_link).url
            new.add_value('link', external_link)
        except Exception:
            new.add_value('link', '')

        #check reactions for old posts
        check_reactions = response.xpath("//a[contains(@href,'reaction/profile')]/div/div/text()").get()
        if not check_reactions:
            yield new.load_item()       
        else:
            new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")              
            reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
            reactions = response.urljoin(reactions[0].extract())
            yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item':new})
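Example #3 resolves the shortened outbound link by issuing a second GET and reading the final URL after redirects. When only the destination address matters, a HEAD request does the same job without downloading the target page; note that requests.head() disables redirects by default, so allow_redirects must be passed explicitly. A minimal sketch:

    import requests

    def resolve_short_link(short_url, timeout=10):
        # follow the redirect chain; HEAD skips the final page body
        return requests.head(short_url, allow_redirects=True, timeout=timeout).url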
Example #4
    def parse_page(self, response):
        '''
        Parse the given page selecting the posts.
        Then ask recursively for another page.
        '''
        #select all posts
        for post in response.xpath(
                "//div[contains(@id,'m_group_stories')]//div[contains(@data-ft,'top_level_post_id')]"
        ):
            new = ItemLoader(item=FbcrawlItem(), selector=post)
            self.logger.info('Parsing post n = {}'.format(abs(self.count)))
            new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")

            #returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])
            self.count -= 1
            yield scrapy.Request(temp_post,
                                 self.parse_post,
                                 priority=1000,
                                 meta={'item': new})

        #load following page
        #tries to click on "more"
        new_page = response.xpath(
            "//div[contains(@id,'m_group_stories')]/div[2]/a/@href").extract()
        if not new_page:
            self.logger.info('Crawling has finished with no errors!')
        else:
            self.logger.info('new page')
            self.k -= 1
            new_page = response.urljoin(new_page[0])
            yield scrapy.Request(new_page, callback=self.parse_page)
Example #5
    def parse_post(self, response):
        new = ItemLoader(item=FbcrawlItem(),
                         response=response,
                         parent=response.meta['item'])
        new.add_xpath(
            'source',
            "substring-before(.//div[1]/div/div/div/table//strong[1]/a[1]/@href, concat(substring('&', 1 div contains(.//div[1]/div/div/div/table//strong[1]/a[1]/@href, 'profile.php')), substring('?', 1 div not(contains(.//div[1]/div/div/div/table//strong[1]/a[1]/@href, 'profile.php')))))"
        )
        new.add_xpath(
            'shared_from',
            '//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()'
        )
        new.add_xpath('date', '//div/div/abbr/text()')
        new.add_xpath(
            'text',
            '//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()'
        )
        new.add_xpath(
            'reactions',
            "//a[contains(@href,'reaction/profile')]/div/div/text()")
        new.add_value('url', response.url)

        reactions = response.xpath(
            "//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href"
        )
        reactions = response.urljoin(reactions[0].extract())
        yield scrapy.Request(reactions,
                             callback=self.parse_reactions,
                             meta={'item': new})
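The convoluted 'source' selector above is the standard XPath 1.0 substitute for a conditional: concat(substring(a, 1 div cond), substring(b, 1 div not(cond))) evaluates to a when cond is true (1 div true() is 1, so the substring is the whole string) and to b when it is false (1 div false() is Infinity, so the substring is empty). Here it picks '&' as the cut-off character when the author link points at profile.php and '?' otherwise, and substring-before() then strips the query string from the href.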
Example #6
    def parse_group(self, response):
        '''
        Parse the given page selecting the posts.
        Then ask recursively for another page.
        '''
        #select all posts
        for post in response.xpath(
                "//div[contains(@data-ft,'top_level_post_id')]"):
            new = ItemLoader(item=FbcrawlItem(), selector=post)
            self.logger.info('Parsing post n = {}'.format(abs(self.count)))
            new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
            new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
            new.add_xpath('membersgroup',
                          "//td/span[contains(@id,'u_0_2')]/text()")
            new.add_xpath('photosgroup',
                          "//td/span[contains(@id,'u_0_4')]/text()")

            #page_url #new.add_value('url',response.url)
            #returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])
            self.count -= 1
            yield scrapy.Request(temp_post,
                                 self.parse_post,
                                 priority=self.count,
                                 meta={'item': new})

        #load following page
        #tries to click on "more", otherwise it looks for the appropriate
        #year for 1-click only and proceeds to click on others
        new_group = response.xpath(
            "//div[2]/a[contains(@href,'permalinks&refid')]/@href").extract()
        if not new_group:
            self.logger.info('Crawling has finished with no errors!')
        else:
            new_group = response.urljoin(new_group[0])
            yield scrapy.Request(new_group, callback=self.parse_group)
Example #7
    def parse_page(self, response):
        '''
        Parse the given page selecting the posts.
        Then ask recursively for another page.
        '''
        #select all posts
        for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):            
            new = ItemLoader(item=FbcrawlItem(),selector=post)
            self.logger.info('Parsing post n = {}'.format(abs(self.count)))
            new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")        
            new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")

            #page_url #new.add_value('url',response.url)
            #returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract() 
            temp_post = response.urljoin(post[0])
            self.count -= 1
            yield scrapy.Request(temp_post, self.parse_post, priority = self.count, meta={'item':new})       

        #load following page
        #tries to click on "more", otherwise it looks for the appropriate
        #year for 1-click only and proceeds to click on others
        new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()      
        if not new_page: 
            if response.meta['flag'] == self.k and self.k >= self.year:                
                self.logger.info('There are no more, flag set at = {}'.format(self.k))
                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                new_page = response.xpath(xpath).extract()
                if new_page:
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    self.logger.info('Everything OK, new flag: {}'.format(self.k))                                
                    yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
                else:
                    while not new_page: #sometimes the years are skipped 
                        self.logger.info('XPATH not found for year {}'.format(self.k-1))
                        self.k -= 1
                        self.logger.info('Trying with previous year, flag={}'.format(self.k))
                        if self.k < self.year:
                            self.logger.info('The previous year to crawl is less than the parameter year: {} < {}'.format(self.k,self.year))
                            self.logger.info('This is not handled well, please re-run with -a year="{}" or less'.format(self.k))
                            return #nothing left to crawl; avoid indexing the empty new_page list below
                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                        new_page = response.xpath(xpath).extract()
                    self.logger.info('New page found with flag {}'.format(self.k))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    self.logger.info('Now going with flag {}'.format(self.k))
                    yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k}) 
            else:
                self.logger.info('Crawling has finished with no errors!')
        else:
            new_page = response.urljoin(new_page[0])
            if 'flag' in response.meta:
                self.logger.info('Page scraped, click on more! flag = {}'.format(response.meta['flag']))
                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':response.meta['flag']})
            else:
                self.logger.info('FLAG DOES NOT ALWAYS REPRESENT ACTUAL YEAR')
                self.logger.info('First page scraped, click on more! Flag not set, default flag = {}'.format(self.k))
                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
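The flag/self.k protocol used here (and in Examples #16, #17, #19 and #20) works as follows: self.k counts down from the most recent year toward the -a year argument (self.year), and each request carries the current value in meta['flag']. While the "more" link exists the flag is passed along unchanged; when it disappears, the spider looks for the year link whose text matches self.k, follows it once, decrements self.k, and resumes clicking "more".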
Example #8
    def parse_post(self, response):
        new = ItemLoader(item=FbcrawlItem(),
                         response=response,
                         parent=response.meta['item'])
        new.add_xpath(
            'source',
            "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()"
        )
        new.add_xpath(
            'shared_from',
            '//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()'
        )
        new.add_xpath('date', '//div/div/abbr/text()')
        new.add_xpath(
            'text',
            '//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()'
        )
        new.add_xpath(
            'reactions',
            "//a[contains(@href,'reaction/profile')]/div/div/text()")

        reactions = response.xpath(
            "//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href"
        )
        reactions = response.urljoin(reactions[0].extract())
        yield scrapy.Request(reactions,
                             callback=self.parse_reactions,
                             meta={'item': new})
Example #9
    def parse_page(self, response):
        for post in response.xpath(
                "//div[contains(@data-ft,'top_level_post_id')]"
        ):  #select all posts
            new = ItemLoader(item=FbcrawlItem(), selector=post)
            new.add_xpath('comments',
                          ".//div/a[contains(text(),'comment')]/text()")
            new.add_xpath('url',
                          ".//a[contains(text(),'Notizia completa')]/@href")

            post = post.xpath(".//a[contains(text(),'Notizia completa')]/@href"
                              ).extract()  #returns full post-link in a list
            temp_post = response.urljoin(post[0])
            yield scrapy.Request(temp_post,
                                 self.parse_post,
                                 dont_filter=True,
                                 meta={'item': new})

        next_page = response.xpath("//div/a[contains(text(),'Altri')]/@href")
        if len(next_page) > 0:
            next_page = response.urljoin(next_page[0].extract())
            yield scrapy.Request(next_page, callback=self.parse_page)
        else:
            next_page = response.xpath(
                "//div/a[contains(text(),'2017')]/@href")
            if len(next_page) > 0:
                next_page = response.urljoin(next_page[0].extract())
                yield scrapy.Request(next_page, callback=self.parse_page)
Example #10
    def parse_reactions(self,response):
        new = ItemLoader(item=FbcrawlItem(),response=response, parent=response.meta['item'])
        new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
        new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
        new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")
        new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
        new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
        new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
        yield new.load_item()
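The six add_xpath calls above differ only in the reaction_type code, so a mapping can drive them; a sketch using the field names and codes from this example:

    REACTIONS = {'likes': 1, 'love': 2, 'wow': 3, 'ahah': 4, 'sigh': 7, 'grrr': 8}

    def parse_reactions(self, response):
        new = ItemLoader(item=FbcrawlItem(), response=response, parent=response.meta['item'])
        for field, code in REACTIONS.items():
            # one counter per reaction, keyed by its reaction_type query parameter
            new.add_xpath(field, "//a[contains(@href,'reaction_type={}')]/span/text()".format(code))
        yield new.load_item()

One caveat either way: contains(@href,'reaction_type=1') also matches reaction_type=16, so Example #15, which adds the 'care' reaction (code 16), would need a stricter match to avoid over-counting likes.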
Example #11
    def parse_rispostina(self, response):
        for daje in response.xpath(
                "//div[contains(@id,'root')]/div/div/div"):  #select all posts
            new = ItemLoader(item=FbcrawlItem(), selector=daje)
            new.add_xpath('source',
                          ".//h3/a/text()")  #| ./div/div/h3/a/text()")
            new.add_xpath(
                'text',
                ".//span[not(contains(text(),' · ')) and not(contains(text(),'Visualizza'))]/text() | .//div/text()"
            )
            yield new.load_item()
Example #12
    def parse_post(self, response):
        new = ItemLoader(item=FbcrawlItem(),response=response,parent=response.meta['item'])
        new.context['lang'] = self.lang
        new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
        new.add_xpath('shared_from','//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
        # new.add_xpath('date','//div/div/abbr/text()')
        new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
        #check reactions for old posts
        check_reactions = response.xpath("//a[contains(@href,'reaction/profile')]/div/div/text()").get()

        if check_reactions: 
            new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")
            reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
            reactions = response.urljoin(reactions[0].extract())
            reaction_payload = {
                'check': True,
                'url': reactions
            }
        else:
            reaction_payload = {
                'check': False,
            }

        image_path = response.xpath('//div[@data-ft]/div[@class]/a/@href')
        if image_path.get() and self.parse_image:
            image_urls = image_path
            img_prefix = '/photo.php'
            found_img_urls = []
            for selected_url in image_urls:
                url = selected_url.extract()
                if img_prefix in url:
                    found_img_urls.append(response.urljoin(url))
            if len(found_img_urls) > 0:
                first_url = found_img_urls.pop()
                yield scrapy.Request(first_url, callback=self.download_image,  
                    meta = {'remaining_images' : found_img_urls, 'item': new, 'img_urls': [], 'check_reactions': reaction_payload })
            else:
                yield new.load_item()
        else:
            new.add_value('image_urls', [])
            if reaction_payload['check']:
                yield scrapy.Request(reaction_payload['url'], callback=self.parse_reactions, meta={'item':new})
            else:
                yield new.load_item()
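Example #12 is the upstream half of Example #1: when a post has photos, the decision about the reactions page cannot be taken yet, so it is packed into reaction_payload and threaded through meta across the whole chain of download_image requests; only after the last image URL has been collected does download_image either visit reaction_payload['url'] or load the item directly.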
Example #13
    def parse_post(self, response):
        with open('comment_urls.csv', 'a+') as f:
            f.write(str(response.url) + '\n')
        new = ItemLoader(item=FbcrawlItem(),
                         response=response,
                         parent=response.meta['item'])
        new.add_xpath(
            'source',
            "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()"
        )
        new.add_xpath(
            'shared_from',
            '//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()'
        )
        new.add_xpath('date', '//div/div/abbr/text()')
        content = response.xpath(
            '//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()'
        ).extract()
        # strip semicolons from the text (e.g. so a ';'-separated CSV export is not broken)
        contents = [c.replace(';', ' ') for c in content]
        new.add_value('text', contents)
        new.add_xpath(
            'reactions',
            "//a[contains(@href,'reaction/profile')]/div/div/text()")

        reactions = response.xpath(
            "//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href"
        )
        reactions = response.urljoin(reactions[0].extract())
        yield scrapy.Request(reactions,
                             callback=self.parse_reactions,
                             meta={'item': new})
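Appending response.url with a bare f.write works until a value ever needs quoting; the csv module handles that for free. A sketch of the same bookkeeping (same file name as above):

    import csv

    with open('comment_urls.csv', 'a+', newline='') as f:
        csv.writer(f).writerow([response.url])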
Example #14
    def parse_page(self, response):
        for post in response.xpath(
                '//div[@id="MPhotoContent"]/div/div/div/div/div[not(contains(@id,"see"))]'
        ):  #select all posts
            new = ItemLoader(item=FbcrawlItem(), selector=post)
            new.add_xpath('source', "./div/h3/a/text()")
            new.add_xpath(
                'text',
                "div/div/span[not(contains(text(),' · '))]/text() | ./div/div/text()"
            )
            yield new.load_item()

        rispostina = response.xpath(
            '//div/a[contains(text(),"rispost")]/@href')

        for i in range(len(rispostina)):
            risp = response.urljoin(rispostina[i].extract())
            yield scrapy.Request(risp, callback=self.parse_rispostina)

        next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
        if len(next_page) > 0:
            next_page = response.urljoin(next_page[0].extract())
            yield scrapy.Request(next_page, callback=self.parse_page)
Example #15
    def parse_reactions(self, response):
        new = ItemLoader(item=FbcrawlItem(),
                         response=response,
                         parent=response.meta['item'])
        new.context['lang'] = self.lang
        new.add_xpath('likes',
                      "//a[contains(@href,'reaction_type=1')]/span/text()")
        new.add_xpath('haha',
                      "//a[contains(@href,'reaction_type=4')]/span/text()")
        new.add_xpath('love',
                      "//a[contains(@href,'reaction_type=2')]/span/text()")
        new.add_xpath('wow',
                      "//a[contains(@href,'reaction_type=3')]/span/text()")
        new.add_xpath('sigh',
                      "//a[contains(@href,'reaction_type=7')]/span/text()")
        new.add_xpath('angry',
                      "//a[contains(@href,'reaction_type=8')]/span/text()")
        new.add_xpath('care',
                      "//a[contains(@href,'reaction_type=16')]/span/text()")
        yield new.load_item()
        current_date = response.meta['current_date']
        if self.date >= current_date:
            #time.sleep(60)
            raise CloseSpider('Reached date: {}'.format(self.date))
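Note the order of the last statements: parse_reactions is a generator, so the code after yield new.load_item() only runs when Scrapy resumes the generator, i.e. after the item has already been handed over. Raising CloseSpider before the yield would drop that final item.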
Example #16
    def parse_page(self, response):
        '''
        Parse the given page selecting the posts.
        Then ask recursively for another page.
        '''
        #        #open page in browser for debug
        #        from scrapy.utils.response import open_in_browser
        #        open_in_browser(response)

        #select all posts
        for post in response.xpath(
                "//div[contains(@data-ft,'top_level_post_id')]"):

            many_features = post.xpath('./@data-ft').get()
            date = []
            date.append(many_features)
            date = parse_date2(date)
            current_date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')

            if self.date > current_date:
                raise CloseSpider('Reached date: {}'.format(self.date))
            new = ItemLoader(item=FbcrawlItem(), selector=post)
            self.logger.info('Parsing post n = {}'.format(abs(self.count)))
            new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
            new.add_xpath('date', './@data-ft')
            new.add_xpath('post_id', './@data-ft')
            new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")

            #page_url #new.add_value('url',response.url)
            #returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])
            self.count -= 1
            yield scrapy.Request(temp_post,
                                 self.parse_post,
                                 priority=self.count,
                                 meta={'item': new})

        #load following page, try to click on "more"
        #after a few pages have been scraped, the "more" link might disappear
        #if not present look for the highest year not parsed yet, click once
        #and keep looking for "more"
        new_page = response.xpath(
            "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href"
        ).extract()
        if not new_page:
            if response.meta['flag'] == self.k and self.k >= self.year:
                self.logger.info('There are no more, flag set at = {}'.format(
                    self.k))
                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                    self.k) + "')]/@href"
                new_page = response.xpath(xpath).extract()
                if new_page:
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    self.logger.info('Everything OK, new flag: {}'.format(
                        self.k))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': self.k})
                else:
                    while not new_page:  #sometimes the years are skipped; this handles small year gaps
                        self.logger.info(
                            'XPATH not found for year {}'.format(self.k - 1))
                        self.k -= 1
                        self.logger.info(
                            'Trying with previous year, flag={}'.format(
                                self.k))
                        if self.k < self.year:
                            self.logger.info(
                                'The previous year to crawl is less than the parameter year: {} < {}'
                                .format(self.k, self.year))
                            self.logger.info(
                                'This is not handled well, please re-run with -a year="{}" or less'
                                .format(self.k))
                            return #nothing left to crawl; avoid indexing the empty new_page list below
                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                            self.k) + "')]/@href"
                        new_page = response.xpath(xpath).extract()
                    self.logger.info('New page found with flag {}'.format(
                        self.k))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    self.logger.info('Now going with flag {}'.format(self.k))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': self.k})
            else:
                self.logger.info('Crawling has finished with no errors!')
        else:
            new_page = response.urljoin(new_page[0])
            if 'flag' in response.meta:
                self.logger.info(
                    'Page scraped, click on more! new_page = {} flag = {}'.
                    format(new_page, response.meta['flag']))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': response.meta['flag']})
            else:
                #                self.logger.info('FLAG DOES NOT ALWAYS REPRESENT ACTUAL YEAR')
                self.logger.info(
                    'First page scraped, click on more {}! Flag not set, default flag = {}'
                    .format(new_page, self.k))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': self.k})
Example #17
    def parse_page(self, response):
        for post in response.xpath(
                "//article[contains(@data-ft,'top_level_post_id')]"):
            try:
                many_features = post.xpath('./@data-ft').get()
                date = []
                date.append(many_features)
                date = parse_date(date, {'lang': self.lang})
                current_date = datetime.strptime(
                    date, '%Y-%m-%d %H:%M:%S') if date is not None else date

                if current_date is None:
                    date_string = post.xpath('.//abbr/text()').get()
                    date = parse_date2([date_string], {'lang': self.lang})
                    current_date = datetime(
                        date.year, date.month,
                        date.day) if date is not None else date
                    date = str(date)

                #if the 'date' argument is reached, stop crawling
                if self.date > current_date:
                    raise CloseSpider('Reached date: {}'.format(self.date))

                new = ItemLoader(item=FbcrawlItem(), selector=post)
                if abs(self.count) + 1 > self.max:
                    raise CloseSpider(
                        'Reached max num of post: {}. Crawling finished'.
                        format(abs(self.count)))
                self.logger.info('Parsing post n = {}, post_date = {}'.format(
                    abs(self.count) + 1, date))
                new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
                new.add_value('date', date)
                new.add_xpath('post_id', './@data-ft')
                new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
                #page_url #new.add_value('url',response.url)

                #returns full post-link in a list
                post = post.xpath(
                    ".//a[contains(@href,'footer')]/@href").extract()
                temp_post = response.urljoin(post[0])
                self.count -= 1
                yield scrapy.Request(temp_post,
                                     self.parse_post,
                                     priority=self.count,
                                     meta={'item': new})
            except CloseSpider:
                raise  #let the date/max-post stop conditions actually close the spider
            except Exception:
                continue

        #load following page, try to click on "more"
        #after a few pages have been scraped, the "more" link might disappear
        #if not present look for the highest year not parsed yet
        #click once on the year and go back to clicking "more"

        #this literally only gets the "more" button link if there is one...
        #new_page is different for groups
        if self.group == 1:
            new_page = response.xpath(
                "//div[contains(@id,'stories_container')]/div[2]/a/@href"
            ).extract()
            self.logger.debug(new_page)
        else:
            new_page = response.xpath(
                '//*[@id="structured_composer_async_container"]/div[2]/a/@href'
            ).extract()
            #            with open('0.html', 'wb') as f:
            #                f.write(response.body)
            #the "more" selector below is language/markup dependent; this is why lang is needed:
            #'//*[@id="structured_composer_async_container"]/div[2]/a/@href'

        if not new_page or 'Recent' in str(
                response.xpath(
                    '//*[@id="structured_composer_async_container"]/div[2]/a').
                extract()):
            self.logger.info(
                '[!] "more" link not found, will look for a "year" link')
            #self.k is the year link that we look for
            if response.meta['flag'] == self.k and self.k >= self.year:
                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                    self.k) + "')]/@href"
                new_page = response.xpath(xpath).extract()
                if new_page:
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': self.k})
                else:
                    #                    print('does this ever happen????')
                    while not new_page:  #sometimes the years are skipped; this handles small year gaps
                        self.logger.info(
                            'Link not found for year {}, trying with previous year {}'
                            .format(self.k, self.k - 1))
                        self.k -= 1
                        if self.k < self.year:
                            raise CloseSpider(
                                'Reached date: {}. Crawling finished'.format(
                                    self.date))
                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                            self.k) + "')]/@href"
                        new_page = response.xpath(xpath).extract()
                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': self.k})
            else:
                self.logger.info('Crawling has finished with no errors!')
        else:
            new_page = response.urljoin(new_page[0])
            #            print(new_page)
            #            print(new_page[0])
            #            sys.exit("this is what it looks like")

            if 'flag' in response.meta:
                self.logger.info(
                    'Page scraped, clicking on "more"! new_page = {}'.format(
                        new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': response.meta['flag']})
            else:
                self.logger.info(
                    'First page scraped, clicking on "more"! new_page = {}'.
                    format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': self.k})
Example #18
    def parse_page(self, response):
        '''
        Parse the given page selecting the posts.
        Then ask recursively for another page.
        '''
        #select all posts
        for post in response.xpath(
                "//div[contains(@data-ft,'top_level_post_id')]"):
            new = ItemLoader(item=FbcrawlItem(), selector=post)
            new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
            new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
            new.add_xpath('reactions',
                          ".//a[contains(@aria-label,'reactions')]/text()")

            #page_url #new.add_value('url',response.url)
            #returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])
            yield scrapy.Request(temp_post,
                                 self.parse_post,
                                 meta={'item': new})

        #load following page
        next_page = response.xpath(
            "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href"
        ).extract()
        if len(next_page) == 0:
            if response.meta['flag'] == 4 and self.year <= 2014:
                self.logger.info('2014 reached, flag = 5')
                next_page = response.xpath(
                    "//div/a[contains(@href,'time') and contains(text(),'2014')]/@href"
                ).extract()
                self.logger.info('next_page = {}'.format(next_page[0]))
                new_page = response.urljoin(next_page[0])
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': 5})
            elif response.meta['flag'] == 3 and self.year <= 2015:
                self.logger.info('2015 reached, flag = 4')
                next_page = response.xpath(
                    "//div/a[contains(@href,'time') and contains(text(),'2015')]/@href"
                ).extract()
                self.logger.info('next_page = {}'.format(next_page[0]))
                new_page = response.urljoin(next_page[0])
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': 4})
            elif response.meta['flag'] == 2 and self.year <= 2016:
                self.logger.info('2016 reached, flag = 3')
                next_page = response.xpath(
                    "//div/a[contains(@href,'time') and contains(text(),'2016')]/@href"
                ).extract()
                self.logger.info('next_page = {}'.format(next_page[0]))
                new_page = response.urljoin(next_page[0])
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': 3})
            elif response.meta['flag'] == 1 and self.year <= 2017:
                self.logger.info('2017 reached, flag = 2')
                next_page = response.xpath(
                    "//div/a[contains(@href,'time') and contains(text(),'2017')]/@href"
                ).extract()
                self.logger.info('next_page = {}'.format(next_page[0]))
                new_page = response.urljoin(next_page[0])
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': 2})
            elif response.meta['flag'] == 0 and self.year <= 2018:
                self.logger.info('2018 reached, flag = 1')
                next_page = response.xpath(
                    "//div/a[contains(@href,'time') and contains(text(),'2018')]/@href"
                ).extract()
                self.logger.info('next_page = {}'.format(next_page[0]))
                new_page = response.urljoin(next_page[0])
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': 1})
        else:
            new_page = response.urljoin(next_page[0])
            if 'flag' in response.meta:
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': response.meta['flag']})
            else:
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': 0})
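The elif ladder above hardcodes one branch per year, with the invariant that flag N pairs with the year link for 2018 - N. Under that invariant the whole ladder collapses into one computed branch; a sketch (same behavior, plus a guard the original lacks for a missing year link):

        if len(next_page) == 0:
            flag = response.meta['flag']
            year = 2018 - flag  #flag N corresponds to the year link for 2018 - N
            if flag <= 4 and self.year <= year:
                self.logger.info('{} reached, flag = {}'.format(year, flag + 1))
                next_page = response.xpath(
                    "//div/a[contains(@href,'time') and contains(text(),'{}')]/@href".format(year)
                ).extract()
                if next_page:
                    new_page = response.urljoin(next_page[0])
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': flag + 1})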
Example #19
    def parse_page(self, response):
        '''
        Parse the given page selecting the posts.
        Then ask recursively for another page.
        '''
#        #open page in browser for debug
#        from scrapy.utils.response import open_in_browser
#        open_in_browser(response)

        #select all posts
        for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):

            many_features = post.xpath('./@data-ft').get()
            date = []
            date.append(many_features)
            date = parse_date(date,{'lang':self.lang})
            current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date

            if current_date is None:
                date_string = post.xpath('.//abbr/text()').get()
                date = parse_date2([date_string],{'lang':self.lang})
                current_date = datetime(date.year,date.month,date.day) if date is not None else date
                date = str(date)

            #if the 'date' argument is reached, stop crawling
            if self.date > current_date:
                raise CloseSpider('Reached date: {}'.format(self.date))

            new = ItemLoader(item=FbcrawlItem(),selector=post)
            if abs(self.count) + 1 > self.max:
                raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count)))
            self.logger.info('Parsing post n = {}, post_date = {}'.format(abs(self.count)+1,date))
            new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
            new.add_value('date',date)
            new.add_xpath('post_id','./@data-ft')
            new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
            #page_url #new.add_value('url',response.url)

            #returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])
            self.count -= 1
            yield scrapy.Request(temp_post, self.parse_post, priority = self.count, meta={'item':new})

        #load following page, try to click on "more"
        #after a few pages have been scraped, the "more" link might disappear
        #if not present look for the highest year not parsed yet
        #click once on the year and go back to clicking "more"

        #new_page is different for groups
        if self.group == 1:
            new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract()
        else:
            new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
            #the text() checks above ('ent', number()) are language-dependent; this is why lang is needed

        if not new_page:
            self.logger.info('[!] "more" link not found, will look for a "year" link')
            #self.k is the year link that we look for
            if response.meta['flag'] == self.k and self.k >= self.year:
                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                new_page = response.xpath(xpath).extract()
                if new_page:
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
                    yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
                else:
                    while not new_page: #sometimes the years are skipped; this handles small year gaps
                        self.logger.info('Link not found for year {}, trying with previous year {}'.format(self.k,self.k-1))
                        self.k -= 1
                        if self.k < self.year:
                            raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date))
                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                        new_page = response.xpath(xpath).extract()
                    self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
            else:
                self.logger.info('Crawling has finished with no errors!')
        else:
            new_page = response.urljoin(new_page[0])
            if 'flag' in response.meta:
                self.logger.info('Page scraped, clicking on "more"! new_page = {}'.format(new_page))
                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':response.meta['flag']})
            else:
                self.logger.info('First page scraped, clicking on "more"! new_page = {}'.format(new_page))
                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
Example #20
    def parse_page(self, response):
        '''
        Parse the given page for all the posts, then recursively ask for another page.
        '''
        #select all posts
        for post in response.xpath(
                "//article[contains(@data-ft,'top_level_post_id')]"):

            many_features = post.xpath('./@data-ft').get()
            date = []
            date.append(many_features)
            date = parse_date(date, {'lang': self.lang})
            current_date = datetime.strptime(
                date, '%Y-%m-%d %H:%M:%S') if date is not None else date

            if current_date is None:
                date_string = post.xpath('.//abbr/text()').get()
                date = parse_date2([date_string], {'lang': self.lang})
                current_date = datetime(date.year, date.month,
                                        date.day) if date is not None else date
                date = str(date)

        #when the 'date' argument is reached crawling should stop; the check is
        #disabled here so a few extra pages are scraped, because the spider would
        #otherwise stop immediately and could miss data required up to that date
        #prev_date = self.date -  d1.timedelta(days=1)
        #if prev_date >= current_date:
        # if self.date >= current_date:
        #     time.sleep(60)
        #     raise CloseSpider('Reached date: {}'.format(self.date))

            new = ItemLoader(item=FbcrawlItem(), selector=post)
            if abs(self.count) + 1 > self.max:
                raise CloseSpider(
                    'Reached max num of post: {}. Crawling finished'.format(
                        abs(self.count)))
            self.logger.info('Parsing post n = {}, post_date = {}'.format(
                abs(self.count) + 1, date))
            new.add_xpath('comments', './footer/div[2]/a[1]/text()')
            new.add_xpath('reactions', "./footer/div[2]/span/a[1]/text()")
            new.add_xpath('content', ".//tr/td/h3/text()")
            #new.add_xpath('image',".//div/div[2]/div/a/@href")
            new.add_value('date', date)
            new.add_xpath('post_id', './@data-ft')
            story_url = url_strip(
                post.xpath(".//a[contains(@href,'footer')]/@href").extract())
            post_url = response.urljoin(story_url)
            new.add_value('url', post_url)

            #returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])
            self.count -= 1
            yield scrapy.Request(temp_post,
                                 self.parse_post,
                                 priority=self.count,
                                 meta={
                                     'item': new,
                                     'current_date': current_date
                                 })

        #load following page, try to click on "more"
        #if "more" link not present look for the highest year not parsed yet

        #new_page is different for groups
        if self.group == 1:
            new_page = response.xpath(
                "//div[contains(@id,'stories_container')]/div[2]/a/@href"
            ).extract()
        else:
            new_page = response.xpath(
                "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href"
            ).extract()

        if not new_page:
            self.logger.info(
                '[!] "more" link not found, will look for a "year" link')
            #self.k is the year link that we look for
            if response.meta['flag'] == self.k and self.k >= self.year:
                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                    self.k) + "')]/@href"
                new_page = response.xpath(xpath).extract()
                if new_page:
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': self.k})
                else:
                    while not new_page:
                        self.logger.info(
                            'Link not found for year {}, trying with previous year {}'
                            .format(self.k, self.k - 1))
                        self.k -= 1
                        if self.k < self.year:
                            raise CloseSpider(
                                'Reached date: {}. Crawling finished'.format(
                                    self.date))
                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                            self.k) + "')]/@href"
                        new_page = response.xpath(xpath).extract()
                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': self.k})
            else:
                self.logger.info('Crawling has finished with no errors!')
        else:
            new_page = response.urljoin(new_page[0])
            if 'flag' in response.meta:
                self.logger.info(
                    'Page scraped, clicking on "more"! new_page = {}'.format(
                        new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': response.meta['flag']})

            else:
                self.logger.info(
                    'First page scraped, clicking on "more"! new_page = {}'.
                    format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': self.k})
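The date handling in Examples #16-#20 delegates to fbcrawl's parse_date/parse_date2 helpers, which are not shown on this page. On the mobile markup the data-ft attribute is a JSON blob, and assuming it carries an epoch timestamp (the page_insights/publish_time path below is an assumption about the blob's shape, not a documented structure), a minimal stand-in for the first branch could look like:

    import json
    from datetime import datetime

    def date_from_data_ft(data_ft):
        '''Best-effort: pull an epoch timestamp out of the data-ft JSON blob.'''
        try:
            blob = json.loads(data_ft)
            # assumed shape: {"page_insights": {"<id>": {"post_context": {"publish_time": 1550000000}}}}
            insights = next(iter(blob['page_insights'].values()))
            return datetime.fromtimestamp(insights['post_context']['publish_time'])
        except (KeyError, StopIteration, TypeError, ValueError):
            return None  #caller falls back to parsing the abbr text, as in the examples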