Example #1
    def parse_page(self, response):
        '''
        Parse the given page selecting the posts.
        Then ask recursively for another page.
        '''
#        #open page in browser for debug
#        from scrapy.utils.response import open_in_browser
#        open_in_browser(response)

        #select all posts
        for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):

            many_features = post.xpath('./@data-ft').get()
            date = []
            date.append(many_features)
            date = parse_date(date,{'lang':self.lang})
            current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date

            if current_date is None:
                date_string = post.xpath('.//abbr/text()').get()
                date = parse_date2([date_string],{'lang':self.lang})
                current_date = datetime(date.year,date.month,date.day) if date is not None else date
                date = str(date)

            #if the 'date' argument is reached, stop crawling
            if self.date > current_date:
                raise CloseSpider('Reached date: {}'.format(self.date))

            new = ItemLoader(item=FbcrawlItem(),selector=post)
            if abs(self.count) + 1 > self.max:
                raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count)))
            self.logger.info('Parsing post n = {}, post_date = {}'.format(abs(self.count)+1,date))
            new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
            new.add_value('date',date)
            new.add_xpath('post_id','./@data-ft')
            new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
            #page_url #new.add_value('url',response.url)

            #returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])
            self.count -= 1
            yield scrapy.Request(temp_post, self.parse_post, priority = self.count, meta={'item':new})

        #load following page, try to click on "more"
        #after a few pages have been scraped, the "more" link might disappear
        #if not present look for the highest year not parsed yet
        #click once on the year and go back to clicking "more"

        #new_page is different for groups
        if self.group == 1:
            new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract()
        else:
            new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
            #the text()-based filters above are why lang is needed

        if not new_page:
            self.logger.info('[!] "more" link not found, will look for a "year" link')
            #self.k is the year link that we look for
            if response.meta['flag'] == self.k and self.k >= self.year:
                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                new_page = response.xpath(xpath).extract()
                if new_page:
                    new_page = response.urljoin(new_page[0])
                    self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
                    self.k -= 1
                    yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
                else:
                    while not new_page: #sometimes years are skipped; this handles small year gaps
                        self.logger.info('Link not found for year {}, trying with previous year {}'.format(self.k,self.k-1))
                        self.k -= 1
                        if self.k < self.year:
                            raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date))
                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                        new_page = response.xpath(xpath).extract()
                    self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
            else:
                self.logger.info('Crawling has finished with no errors!')
        else:
            new_page = response.urljoin(new_page[0])
            if 'flag' in response.meta:
                self.logger.info('Page scraped, clicking on "more"! new_page = {}'.format(new_page))
                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':response.meta['flag']})
            else:
                self.logger.info('First page scraped, clicking on "more"! new_page = {}'.format(new_page))
                yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k})
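
Example #1 depends on two date helpers: parse_date, which reads the post's data-ft JSON attribute, and parse_date2, which falls back on the human-readable abbr text. The real, locale-aware helpers ship with fbcrawl in fbcrawl.items; the following is only a minimal sketch of the data-ft path, assuming the attribute nests a unix publish_time under page_insights.

#Minimal, hypothetical stand-in for fbcrawl's parse_date helper.
#Assumes an English locale and a data-ft JSON layout with
#page_insights -> post_context -> publish_time.
import json
from datetime import datetime

def parse_date(init_date, loader_context=None):
    try:
        data = json.loads(init_date[0])
        insights = next(iter(data['page_insights'].values()))
        timestamp = insights['post_context']['publish_time']
        return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
    except (TypeError, KeyError, ValueError, StopIteration):
        return None  #caller falls back to parse_date2 on the abbr text
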
Example #2
    def parse_reply(self, response):
        '''
        Parse replies to comments; the root comment is added when
        the 'init' flag is set.
        '''
        #        from scrapy.utils.response import open_in_browser
        #        open_in_browser(response)

        if response.meta['flag'] == 'init':
            #parse root comment
            for root in response.xpath(
                    '//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'
            ):
                new = ItemLoader(item=CommentsItem(), selector=root)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_xpath('source_url', './/h3/a/@href')
                profile_img = ""
                # source_url = str(response.xpath(".//h3/a/@href").get())
                # index1 = source_url.find("/profile.php?id=")
                # if index1 != -1:
                #     index2 = source_url.find("&fref=nf&refid=18&__tn__=")
                #     if index2 == -1:
                #         index2 = source_url.find("&refid=18&__tn__=")
                #     source_url = source_url[index1+16:index2]
                #     profile_img = "https://graph.facebook.com/{}/picture?type=large".format(source_url)
                # else:
                #     index2 = source_url.find("?fref=nf&refid=18&__tn__=-R")
                #     source_url = source_url[1:index2]
                #     profile_img = "https://avatars.io/facebook/{}".format(source_url)
                # new._add_value('source_url', source_url)
                new._add_value('profile_img', profile_img)
                new.add_value('reply_to', 'ROOT')
                new.add_xpath('text', './/div[1]//text()')
                # new.add_xpath('date','.//abbr/text()')
                date_string = root.xpath('.//abbr/text()').get()
                date = parse_date2([date_string], {'lang': self.lang})
                new._add_value('date', date)
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                # new.add_value('url',response.url)
                yield new.load_item()
            #parse all replies in the page
            for reply in response.xpath(
                    '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
            ):
                new = ItemLoader(item=CommentsItem(), selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_xpath('source_url', './/h3/a/@href')
                profile_img = ""
                # source_url = str(response.xpath(".//h3/a/@href").get())
                # index1 = source_url.find("/profile.php?id=")
                # if index1 != -1:
                #     index2 = source_url.find("&fref=nf&refid=18&__tn__=")
                #     if index2 == -1:
                #         index2 = source_url.find("&refid=18&__tn__=")
                #     source_url = source_url[index1+16:index2]
                #     profile_img = "https://graph.facebook.com/{}/picture?type=large".format(source_url)
                # else:
                #     index2 = source_url.find("?fref=nf&refid=18&__tn__=-R")
                #     source_url = source_url[1:index2]
                #     profile_img = "https://avatars.io/facebook/{}".format(source_url)
                # new._add_value('source_url', source_url)
                new._add_value('profile_img', profile_img)
                new.add_value('reply_to', response.meta['reply_to'])
                new.add_xpath('text', './/div[h3]/div[1]//text()')

                # new.add_xpath('date','.//abbr/text()')
                date_string = reply.xpath('.//abbr/text()').get()
                date = parse_date2([date_string], {'lang': self.lang})
                new._add_value('date', date)

                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                # new.add_value('url',response.url)
                yield new.load_item()

            back = response.xpath(
                '//div[contains(@id,"comment_replies_more_1")]/a/@href'
            ).extract()
            if back:
                self.logger.info('Back found, more nested comments')
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=1000,
                                     meta={
                                         'reply_to': response.meta['reply_to'],
                                         'flag': 'back',
                                         'url': response.meta['url'],
                                         'index': response.meta['index'],
                                         'group': response.meta['group']
                                     })

            else:
                next_reply = response.meta['url']
                self.logger.info(
                    'Nested comments crawl finished, heading to proper page: {}'
                    .format(response.meta['url']))
                yield scrapy.Request(next_reply,
                                     callback=self.parse_post,
                                     meta={
                                         'index': response.meta['index'] + 1,
                                         'group': response.meta['group']
                                     })

        elif response.meta['flag'] == 'back':
            #parse all comments
            for reply in response.xpath(
                    '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
            ):
                new = ItemLoader(item=CommentsItem(), selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_xpath('source_url', './/h3/a/@href')
                profile_img = ""
                # source_url = str(response.xpath(".//h3/a/@href").get())
                # index1 = source_url.find("/profile.php?id=")
                # if index1 != -1:
                #     index2 = source_url.find("&fref=nf&refid=18&__tn__=")
                #     if index2 == -1:
                #         index2 = source_url.find("&refid=18&__tn__=")
                #     source_url = source_url[index1+16:index2]
                #     profile_img = "https://graph.facebook.com/{}/picture?type=large".format(source_url)
                # else:
                #     index2 = source_url.find("?fref=nf&refid=18&__tn__=-R")
                #     source_url = source_url[1:index2]
                #     profile_img = "https://avatars.io/facebook/{}".format(source_url)
                # new._add_value('source_url', source_url)
                new._add_value('profile_img', profile_img)
                new.add_value('reply_to', response.meta['reply_to'])
                new.add_xpath('text', './/div[h3]/div[1]//text()')
                # new.add_xpath('date','.//abbr/text()')
                date_string = reply.xpath('.//abbr/text()').get()
                date = parse_date2([date_string], {'lang': self.lang})
                new._add_value('date', date)
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                # new.add_value('url',response.url)
                yield new.load_item()
            #keep going backwards
            back = response.xpath(
                '//div[contains(@id,"comment_replies_more_1")]/a/@href'
            ).extract()
            if back:
                self.logger.info('Back found, more nested comments')
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=1000,
                                     meta={
                                         'reply_to': response.meta['reply_to'],
                                         'flag': 'back',
                                         'url': response.meta['url'],
                                         'index': response.meta['index'],
                                         'group': response.meta['group']
                                     })

            else:
                next_reply = response.meta['url']
                self.logger.info(
                    'Nested comments crawl finished, heading to home page: {}'.
                    format(response.meta['url']))
                yield scrapy.Request(next_reply,
                                     callback=self.parse_post,
                                     meta={
                                         'index': response.meta['index'] + 1,
                                         'group': response.meta['group']
                                     })


# =============================================================================
# CRAWL REACTIONS
# =============================================================================
#    def parse_reactions(self,response):
#        new = ItemLoader(item=CommentsItem(),response=response, parent=response.meta['item'])
#        new.context['lang'] = self.lang
#        new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
#        new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
#        new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")
#        new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
#        new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
#        new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
#        yield new.load_item()
#
#    #substitute each `yield new.load_item()` above with the following
#    #block (with `response` replaced by the reply/root selector):
#    reactions = response.xpath(".//a[contains(@href,'reaction/profile')]/@href").extract()
#    if reactions:
#        reactions = response.urljoin(reactions[0])
#        yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item':new})
#    else:
#        yield new.load_item()
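
Every loader in Example #2 populates a CommentsItem. A minimal sketch of what that item could look like is below; the field names are taken from the add_xpath/add_value calls above (plus the reaction fields used by the commented-out parse_reactions), while the real class in fbcrawl.items also attaches input/output processors for date parsing and text cleanup.

#Sketch of the CommentsItem populated above; field names mirror the
#loader calls, processors omitted (the real item defines them).
import scrapy

class CommentsItem(scrapy.Item):
    source = scrapy.Field()
    source_url = scrapy.Field()
    profile_img = scrapy.Field()
    reply_to = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    reactions = scrapy.Field()
    url = scrapy.Field()
    likes = scrapy.Field()   #used by the commented-out reactions crawl
    ahah = scrapy.Field()
    love = scrapy.Field()
    wow = scrapy.Field()
    sigh = scrapy.Field()
    grrr = scrapy.Field()
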
Example #3
    def parse_page(self, response):
        '''
        Dispatch a single post or parse a full page of posts,
        depending on self.type.
        '''
        if self.type == 'post':
            yield scrapy.Request(url=response.url,
                                 callback=self.parse_post,
                                 priority=10,
                                 meta={'index': 1})
        elif self.type == 'page':
            #select all posts
            for post in response.xpath(
                    "//div[contains(@data-ft,'top_level_post_id')]"):
                many_features = post.xpath('./@data-ft').get()
                date = []
                date.append(many_features)
                date = parse_date(date, {'lang': self.lang})
                current_date = datetime.strptime(
                    date, '%Y-%m-%d %H:%M:%S') if date is not None else date

                if current_date is None:
                    date_string = post.xpath('.//abbr/text()').get()
                    date = parse_date2([date_string], {'lang': self.lang})
                    current_date = datetime(
                        date.year, date.month,
                        date.day) if date is not None else date
                    # try:
                    #     current_date = datetime(date.year,date.month,date.day) if date is not None else date
                    # except:
                    #     date_month = month.index(date[1]) + 1
                    #     current_date = datetime(2019,int(date_month),int(date[0]))
                    # current_date = datetime(date.year,date.month,date.day) if date is not None else date
                    date = str(date)

                if abs(self.count) + 1 > self.max:
                    raise CloseSpider(
                        'Reached max num of post: {}. Crawling finished'.
                        format(abs(self.count)))
                self.logger.info('Parsing post n = {}, post_date = {}'.format(
                    abs(self.count) + 1, date))

                #returns full post-link in a list
                post = post.xpath(
                    ".//a[contains(@href,'footer')]/@href").extract()
                temp_post = response.urljoin(post[0])
                self.count -= 1
                yield scrapy.Request(temp_post,
                                     self.parse_post,
                                     priority=self.count,
                                     meta={'index': 1})

            #load following page, try to click on "more"
            #after a few pages have been scraped, the "more" link might disappear
            #if not present look for the highest year not parsed yet
            #click once on the year and go back to clicking "more"

            #new_page is different for groups
            if self.group == 1:
                new_page = response.xpath(
                    "//div[contains(@id,'stories_container')]/div[2]/a/@href"
                ).extract()
            else:
                new_page = response.xpath(
                    "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href"
                ).extract()
                #the text()-based filters above are why lang is needed

            if not new_page:
                self.logger.info(
                    '[!] "more" link not found, will look for a "year" link')
                #self.k is the year link that we look for
                if response.meta['flag'] == self.k and self.k >= self.year:
                    xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                        self.k) + "')]/@href"
                    new_page = response.xpath(xpath).extract()
                    if new_page:
                        new_page = response.urljoin(new_page[0])
                        self.logger.info(
                            'Found a link for year "{}", new_page = {}'.format(
                                self.k, new_page))
                        self.k -= 1
                        yield scrapy.Request(new_page,
                                             callback=self.parse_page,
                                             priority=-1000,
                                             meta={'flag': self.k})
                    else:
                        while not new_page:  #sometimes years are skipped; this handles small year gaps
                            self.logger.info(
                                'Link not found for year {}, trying with previous year {}'
                                .format(self.k, self.k - 1))
                            self.k -= 1
                            if self.k < self.year:
                                raise CloseSpider(
                                    'Reached date: {}. Crawling finished'.
                                    format(self.date))
                            xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                                self.k) + "')]/@href"
                            new_page = response.xpath(xpath).extract()
                        self.logger.info(
                            'Found a link for year "{}", new_page = {}'.format(
                                self.k, new_page))
                        new_page = response.urljoin(new_page[0])
                        self.k -= 1
                        yield scrapy.Request(new_page,
                                             callback=self.parse_page,
                                             priority=-1000,
                                             meta={'flag': self.k})
                else:
                    self.logger.info('Crawling has finished with no errors!')
            else:
                new_page = response.urljoin(new_page[0])
                if 'flag' in response.meta:
                    self.logger.info(
                        'Page scraped, clicking on "more"! new_page = {}'.
                        format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         priority=-1000,
                                         meta={'flag': response.meta['flag']})
                else:
                    self.logger.info(
                        'First page scraped, clicking on "more"! new_page = {}'
                        .format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         priority=-1000,
                                         meta={'flag': self.k})
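
The spiders in these examples read several attributes set at init time (self.type, self.lang, self.max, self.count, self.year, self.k, self.group, self.date). A hedged sketch of launching such a spider programmatically is below; the spider name 'fb' and the exact argument names are assumptions and should be adjusted to the real __init__ signature.

#Hypothetical launch sketch: argument names mirror the attributes the
#parse_page methods rely on, but the real __init__ signature may differ.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('fb',            #assumed spider name
              page='SomePage', #page or group to crawl
              year='2018',     #lowest year to reach (self.year / self.k)
              lang='en',       #locale for the date parsers (self.lang)
              max='100')       #maximum number of posts (self.max)
process.start()
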
Example #4
    def parse_page(self, response):
        '''
        Parse the given page selecting the posts.
        Then ask recursively for another page.
        '''
        #        #open page in browser for debug
        #        from scrapy.utils.response import open_in_browser
        #        open_in_browser(response)

        #select all posts
        for post in response.xpath(
                "//div[contains(@data-ft,'top_level_post_id')]"):

            many_features = post.xpath('./@data-ft').get()
            date = []
            date.append(many_features)
            date = parse_date2(date)
            current_date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')

            if self.date > current_date:
                raise CloseSpider('Reached date: {}'.format(self.date))
            new = ItemLoader(item=FbcrawlItem(), selector=post)
            self.logger.info('Parsing post n = {}'.format(abs(self.count)))
            new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
            new.add_xpath('date', './@data-ft')
            new.add_xpath('post_id', './@data-ft')
            new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")

            #page_url #new.add_value('url',response.url)
            #returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])
            self.count -= 1
            yield scrapy.Request(temp_post,
                                 self.parse_post,
                                 priority=self.count,
                                 meta={'item': new})

        #load following page, try to click on "more"
        #after a few pages have been scraped, the "more" link might disappear
        #if not present look for the highest year not parsed yet, click once
        #and keep looking for "more"
        new_page = response.xpath(
            "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href"
        ).extract()
        if not new_page:
            if response.meta['flag'] == self.k and self.k >= self.year:
                self.logger.info('There are no more, flag set at = {}'.format(
                    self.k))
                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                    self.k) + "')]/@href"
                new_page = response.xpath(xpath).extract()
                if new_page:
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    self.logger.info('Everything OK, new flag: {}'.format(
                        self.k))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': self.k})
                else:
                    while not new_page:  #sometimes years are skipped; this handles small year gaps
                        self.logger.info(
                            'XPATH not found for year {}'.format(self.k - 1))
                        self.k -= 1
                        self.logger.info(
                            'Trying with previous year, flag={}'.format(
                                self.k))
                        if self.k < self.year:
                            self.logger.info(
                                'The previous year to crawl is less than the parameter year: {} < {}'
                                .format(self.k, self.year))
                            self.logger.info(
                                'This is not handled well, please re-run with -a year="{}" or less'
                                .format(self.k))
                            break
                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                            self.k) + "')]/@href"
                        new_page = response.xpath(xpath).extract()
                    self.logger.info('New page found with flag {}'.format(
                        self.k))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    self.logger.info('Now going with flag {}'.format(self.k))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': self.k})
            else:
                self.logger.info('Crawling has finished with no errors!')
        else:
            new_page = response.urljoin(new_page[0])
            if 'flag' in response.meta:
                self.logger.info(
                    'Page scraped, click on more! new_page = {} flag = {}'.
                    format(new_page, response.meta['flag']))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': response.meta['flag']})
            else:
                #                self.logger.info('FLAG DOES NOT ALWAYS REPRESENT ACTUAL YEAR')
                self.logger.info(
                    'First page scraped, click on more {}! Flag not set, default flag = {}'
                    .format(new_page, self.k))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': self.k})
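
The year-fallback loop above (and its variants in the other examples) amounts to a small backwards search: when the "more" link disappears, try year links one by one until a match is found or the target year is passed. A condensed sketch, assuming the same self.k / self.year bookkeeping as the spiders here:

#Condensed sketch of the year-fallback pagination used above; returns
#the next page URL or None when self.year has been passed.
def find_year_link(self, response):
    while self.k >= self.year:
        xpath = ("//div/a[contains(@href,'time') and "
                 "contains(text(),'{}')]/@href".format(self.k))
        links = response.xpath(xpath).extract()
        self.k -= 1                    #next call starts one year earlier
        if links:
            return response.urljoin(links[0])
    return None                        #exhausted: caller stops crawling
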
Example #5
    def parse_post(self, response):
        '''
        parse_post does multiple things:
            1) loads replied-to-comments pages one by one (for DFS)
            2) calls parse_reply on the nested comments
            3) adds simple (not-replied-to) comments
            4) follows to a new comment page
        '''
        #load replied-to comments pages
        #select nested comment one-by-one matching with the index: response.meta['index']
        path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '[' + str(
            response.meta['index']) + ']'
        group_flag = response.meta[
            'group'] if 'group' in response.meta else None

        for reply in response.xpath(path):
            source = reply.xpath('.//h3/a/text()').extract()
            answer = reply.xpath(
                './/a[contains(@href,"repl")]/@href').extract()
            ans = response.urljoin(answer[::-1][0])
            self.logger.info('{} nested comment'.format(
                str(response.meta['index'])))
            yield scrapy.Request(ans,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta={
                                     'reply_to': source,
                                     'url': response.url,
                                     'index': response.meta['index'],
                                     'flag': 'init',
                                     'group': group_flag
                                 })
        #load regular comments
        if not response.xpath(path):  #only run when no replied-to threads are left (prevents double processing)
            path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
            for i, reply in enumerate(response.xpath(path2)):
                self.logger.info('{} regular comment'.format(i + 1))
                new = ItemLoader(item=CommentsItem(), selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_xpath('source_url', './/h3/a/@href')
                profile_img = ""
                # source_url = str(response.xpath(".//h3/a/@href").get())
                # index1 = source_url.find("/profile.php?id=")
                # if index1 != -1:
                #     index2 = source_url.find("&fref=nf&refid=18&__tn__=")
                #     if index2 == -1:
                #         index2 = source_url.find("&refid=18&__tn__=")
                #     source_url = source_url[index1+16:index2]
                #     profile_img = "https://graph.facebook.com/{}/picture?type=large".format(source_url)
                # else:
                #     index2 = source_url.find("?fref=nf&refid=18&__tn__=-R")
                #     source_url = source_url[1:index2]
                #     profile_img = "https://avatars.io/facebook/{}".format(source_url)
                # new._add_value('source_url', source_url)
                new._add_value('profile_img', profile_img)
                new.add_xpath('text', './/div[h3]/div[1]//text()')
                new.add_xpath('img', './/div[h3]/div[2]/img/@src')
                # new.add_xpath('date','.//abbr/text()')
                date_string = reply.xpath('.//abbr/text()').get()
                date = parse_date2([date_string], {'lang': self.lang})
                new._add_value('date', date)
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                # new.add_value('url',response.url)
                yield new.load_item()

        #new comment page
        if not response.xpath(path):
            #for groups
            next_xpath = './/div[contains(@id,"see_next")]'
            prev_xpath = './/div[contains(@id,"see_prev")]'
            if not response.xpath(next_xpath) or group_flag == 1:
                for next_page in response.xpath(prev_xpath):
                    new_page = next_page.xpath('.//@href').extract()
                    new_page = response.urljoin(new_page[0])
                    self.logger.info(
                        'New page to be crawled {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_post,
                                         meta={
                                             'index': 1,
                                             'group': 1
                                         })
            else:
                for next_page in response.xpath(next_xpath):
                    new_page = next_page.xpath('.//@href').extract()
                    new_page = response.urljoin(new_page[0])
                    self.logger.info(
                        'New page to be crawled {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_post,
                                         meta={
                                             'index': 1,
                                             'group': group_flag
                                         })
Example #6
    def parse_page(self, response):
        '''
        Parse the given page selecting the posts.
        Then ask recursively for another page.
        '''

        #select all posts
        for post in response.xpath(
                "//article[contains(@data-ft,'top_level_post_id')]"):
            many_features = post.xpath('./@data-ft').get()
            date = []
            date.append(many_features)
            date = parse_date(date, {'lang': self.lang})
            current_date = datetime.strptime(
                date, '%Y-%m-%d %H:%M:%S') if date is not None else date

            if current_date is None:
                date_string = post.xpath('.//abbr/text()').get()
                date = parse_date2([date_string], {'lang': self.lang})
                current_date = datetime(date.year, date.month,
                                        date.day) if date is not None else date
                date = str(date)

            # prev_date = self.date -  d1.timedelta(days=1)
            # #print(prev_date > current_date)
            # if prev_date >= current_date:
            #     raise CloseSpider('Reached date: {}'.format(self.date))

            if abs(self.count) + 1 > self.max:
                raise CloseSpider(
                    'Reached max num of post: {}. Crawling finished'.format(
                        abs(self.count)))
            self.logger.info('Parsing post n = {}, post_date = {}'.format(
                abs(self.count) + 1, date))

            post_id = id_strip(post.xpath("./@data-ft").extract())

            #returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])
            self.count -= 1
            yield scrapy.Request(temp_post,
                                 self.parse_post,
                                 priority=self.count,
                                 meta={
                                     'index': 1,
                                     'post_id': post_id,
                                     'current_date': current_date
                                 })

        #load following page, try to click on "more"
        #if "more" link not present, look for the highest year not parsed yet

        if self.group == 1:
            new_page = response.xpath(
                "//div[contains(@id,'stories_container')]/div[2]/a/@href"
            ).extract()
        else:
            new_page = response.xpath(
                "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href"
            ).extract()

        if not new_page:
            self.logger.info(
                '[!] "more" link not found, will look for a "year" link')
            #self.k is the year link that we look for
            if response.meta['flag'] == self.k and self.k >= self.year:
                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                    self.k) + "')]/@href"
                new_page = response.xpath(xpath).extract()
                if new_page:
                    new_page = response.urljoin(new_page[0])
                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    self.k -= 1
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         priority=-1000,
                                         meta={'flag': self.k})
                else:
                    while not new_page:
                        self.logger.info(
                            'Link not found for year {}, trying with previous year {}'
                            .format(self.k, self.k - 1))
                        self.k -= 1
                        if self.k < self.year:
                            raise CloseSpider(
                                'Reached date: {}. Crawling finished'.format(
                                    self.date))
                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                            self.k) + "')]/@href"
                        new_page = response.xpath(xpath).extract()
                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         priority=-1000,
                                         meta={'flag': self.k})
            else:
                self.logger.info('Crawling has finished with no errors!')
        else:
            new_page = response.urljoin(new_page[0])
            if 'flag' in response.meta:
                self.logger.info(
                    'Page scraped, clicking on "more"! new_page = {}'.format(
                        new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     priority=-1000,
                                     meta={'flag': response.meta['flag']})
            else:
                self.logger.info(
                    'First page scraped, clicking on "more"! new_page = {}'.
                    format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     priority=-1000,
                                     meta={'flag': self.k})
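
Example #6 uses an id_strip helper to pull the post id out of the data-ft attribute. A hypothetical minimal version is sketched below, assuming the id sits under the top_level_post_id key of the data-ft JSON; the real helper ships with fbcrawl and may differ.

#Hypothetical id_strip: extract top_level_post_id from the data-ft JSON.
import json

def id_strip(post_id):
    try:
        data = json.loads(post_id[0])
        return str(data['top_level_post_id'])
    except (IndexError, TypeError, KeyError, ValueError):
        return None
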
Example #7
    def parse_page(self, response):
        '''
        Dispatch a single post or parse a full page of posts,
        depending on self.type.
        '''
        if self.type == 'post':
            yield scrapy.Request(url=response.url,
                                 callback=self.parse_post,
                                 priority=10,
                                 meta={'index':1})
        elif self.type == 'page':
            #select all posts
            for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):     
                many_features = post.xpath('./@data-ft').get()
                date = []
                date.append(many_features)
                date = parse_date(date,{'lang':self.lang})
                current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date
    
                if current_date is None:
                    date_string = post.xpath('.//abbr/text()').get()
                    date = parse_date2([date_string],{'lang':self.lang})
                    current_date = datetime(date.year,date.month,date.day) if date is not None else date                
                    date = str(date)

                if abs(self.count) + 1 > self.max:
                    raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count)))
                self.logger.info('Parsing post n = {}, post_date = {}'.format(abs(self.count)+1,date))

                #returns full post-link in a list
                post = post.xpath(".//a[contains(@href,'footer')]/@href").extract() 
                temp_post = response.urljoin(post[0])
                self.count -= 1
                yield scrapy.Request(temp_post, 
                                     self.parse_post, 
                                     priority = self.count,
                                     meta={'index':1})
    
            #load following page, try to click on "more"
            #after a few pages have been scraped, the "more" link might disappear
            #if not present look for the highest year not parsed yet
            #click once on the year and go back to clicking "more"

            #new_page is different for groups
            if self.group == 1:
                new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract()      
            else:
                new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()      
                #the text()-based filters above are why lang is needed

            if not new_page: 
                self.logger.info('[!] "more" link not found, will look for a "year" link')
                #self.k is the year link that we look for 
                if response.meta['flag'] == self.k and self.k >= self.year:                
                    xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                    new_page = response.xpath(xpath).extract()
                    if new_page:
                        new_page = response.urljoin(new_page[0])
                        self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
                        self.k -= 1
                        yield scrapy.Request(new_page, 
                                             callback=self.parse_page, 
                                             priority = -1000, 
                                             meta={'flag':self.k})
                    else:
                        while not new_page: #sometimes years are skipped; this handles small year gaps
                            self.logger.info('Link not found for year {}, trying with previous year {}'.format(self.k,self.k-1))
                            self.k -= 1
                            if self.k < self.year:
                                raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date))
                            xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                            new_page = response.xpath(xpath).extract()
                        self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
                        new_page = response.urljoin(new_page[0])
                        self.k -= 1
                        yield scrapy.Request(new_page, 
                                             callback=self.parse_page,
                                             priority = -1000,
                                             meta={'flag':self.k}) 
                else:
                    self.logger.info('Crawling has finished with no errors!')
            else:
                new_page = response.urljoin(new_page[0])
                if 'flag' in response.meta:
                    self.logger.info('Page scraped, clicking on "more"! new_page = {}'.format(new_page))
                    yield scrapy.Request(new_page, 
                                         callback=self.parse_page, 
                                         priority = -1000, 
                                         meta={'flag':response.meta['flag']})
                else:
                    self.logger.info('First page scraped, clicking on "more"! new_page = {}'.format(new_page))
                    yield scrapy.Request(new_page, 
                                         callback=self.parse_page, 
                                         priority = -1000, 
                                         meta={'flag':self.k})
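
A note on the priorities used throughout these examples: Scrapy serves higher-priority requests first, so post requests at priority=self.count (0, -1, -2, ...) always outrank the pagination request at priority=-1000, and every post on the current page is drained before the crawl moves on. A schematic sketch of that scheduling pattern (method and argument names are illustrative only):

#Schematic of the priority scheme: posts first, pagination last.
import scrapy

def schedule_requests(self, response, post_urls, next_page):
    for url in post_urls:
        self.count -= 1
        yield scrapy.Request(url, callback=self.parse_post,
                             priority=self.count)   #served before paging
    if next_page:
        yield scrapy.Request(response.urljoin(next_page),
                             callback=self.parse_page,
                             priority=-1000)        #served last
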
Example #8
    def parse_page(self, response):
        '''
        Parse the given page for all the posts, then recursively ask for another page.
        '''
        #select all posts
        for post in response.xpath(
                "//article[contains(@data-ft,'top_level_post_id')]"):

            many_features = post.xpath('./@data-ft').get()
            date = []
            date.append(many_features)
            date = parse_date(date, {'lang': self.lang})
            current_date = datetime.strptime(
                date, '%Y-%m-%d %H:%M:%S') if date is not None else date

            if current_date is None:
                date_string = post.xpath('.//abbr/text()').get()
                date = parse_date2([date_string], {'lang': self.lang})
                current_date = datetime(date.year, date.month,
                                        date.day) if date is not None else date
                date = str(date)

            #when the 'date' argument is reached the spider stops immediately,
            #so a few extra posts are scraped to capture all data up to that date
            #prev_date = self.date -  d1.timedelta(days=1)
            #if prev_date >= current_date:
            # if self.date >= current_date:
            #     time.sleep(60)
            #     raise CloseSpider('Reached date: {}'.format(self.date))

            new = ItemLoader(item=FbcrawlItem(), selector=post)
            if abs(self.count) + 1 > self.max:
                raise CloseSpider(
                    'Reached max num of post: {}. Crawling finished'.format(
                        abs(self.count)))
            self.logger.info('Parsing post n = {}, post_date = {}'.format(
                abs(self.count) + 1, date))
            new.add_xpath('comments', './footer/div[2]/a[1]/text()')
            new.add_xpath('reactions', "./footer/div[2]/span/a[1]/text()")
            new.add_xpath('content', ".//tr/td/h3/text()")
            #new.add_xpath('image',".//div/div[2]/div/a/@href")
            new.add_value('date', date)
            new.add_xpath('post_id', './@data-ft')
            story_url = url_strip(
                post.xpath(".//a[contains(@href,'footer')]/@href").extract())
            post_url = response.urljoin(story_url)
            new.add_value('url', post_url)

            #returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])
            self.count -= 1
            yield scrapy.Request(temp_post,
                                 self.parse_post,
                                 priority=self.count,
                                 meta={
                                     'item': new,
                                     'current_date': current_date
                                 })

        #load following page, try to click on "more"
        #if "more" link not present look for the highest year not parsed yet

        #new_page is different for groups
        if self.group == 1:
            new_page = response.xpath(
                "//div[contains(@id,'stories_container')]/div[2]/a/@href"
            ).extract()
        else:
            new_page = response.xpath(
                "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href"
            ).extract()

        if not new_page:
            self.logger.info(
                '[!] "more" link not found, will look for a "year" link')
            #self.k is the year link that we look for
            if response.meta['flag'] == self.k and self.k >= self.year:
                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                    self.k) + "')]/@href"
                new_page = response.xpath(xpath).extract()
                if new_page:
                    new_page = response.urljoin(new_page[0])
                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    self.k -= 1
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': self.k})
                else:
                    while not new_page:
                        self.logger.info(
                            'Link not found for year {}, trying with previous year {}'
                            .format(self.k, self.k - 1))
                        self.k -= 1
                        if self.k < self.year:
                            raise CloseSpider(
                                'Reached date: {}. Crawling finished'.format(
                                    self.date))
                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                            self.k) + "')]/@href"
                        new_page = response.xpath(xpath).extract()
                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': self.k})
            else:
                self.logger.info('Crawling has finished with no errors!')
        else:
            new_page = response.urljoin(new_page[0])
            if 'flag' in response.meta:
                self.logger.info(
                    'Page scraped, clicking on "more"! new_page = {}'.format(
                        new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': response.meta['flag']})

            else:
                self.logger.info(
                    'First page scraped, clicking on "more"! new_page = {}'.
                    format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': self.k})
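
Example #8 cleans the post link with a url_strip helper before storing it. A hypothetical stand-in is sketched below, keeping only what are assumed to be the identifying query parameters (story_fbid and id); the real helper ships with fbcrawl and may behave differently.

#Hypothetical url_strip: keep only the identifying query parameters.
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

def url_strip(urls):
    parts = urlparse(urls[0])
    query = parse_qs(parts.query)
    kept = {k: v[0] for k, v in query.items() if k in ('story_fbid', 'id')}
    return urlunparse(parts._replace(query=urlencode(kept)))
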
Example #9
    def parse_page(self, response):
        for post in response.xpath(
                "//article[contains(@data-ft,'top_level_post_id')]"):
            try:
                many_features = post.xpath('./@data-ft').get()
                date = []
                date.append(many_features)
                date = parse_date(date, {'lang': self.lang})
                current_date = datetime.strptime(
                    date, '%Y-%m-%d %H:%M:%S') if date is not None else date

                if current_date is None:
                    date_string = post.xpath('.//abbr/text()').get()
                    date = parse_date2([date_string], {'lang': self.lang})
                    current_date = datetime(
                        date.year, date.month,
                        date.day) if date is not None else date
                    date = str(date)

                #if the 'date' argument is reached, stop crawling
                if self.date > current_date:
                    raise CloseSpider('Reached date: {}'.format(self.date))

                new = ItemLoader(item=FbcrawlItem(), selector=post)
                if abs(self.count) + 1 > self.max:
                    raise CloseSpider(
                        'Reached max num of post: {}. Crawling finished'.
                        format(abs(self.count)))
                self.logger.info('Parsing post n = {}, post_date = {}'.format(
                    abs(self.count) + 1, date))
                new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
                new.add_value('date', date)
                new.add_xpath('post_id', './@data-ft')
                new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
                #page_url #new.add_value('url',response.url)

                #returns full post-link in a list
                post = post.xpath(
                    ".//a[contains(@href,'footer')]/@href").extract()
                temp_post = response.urljoin(post[0])
                self.count -= 1
                yield scrapy.Request(temp_post,
                                     self.parse_post,
                                     priority=self.count,
                                     meta={'item': new})
            except CloseSpider:
                raise  #don't swallow the date/max stop conditions above
            except Exception:
                continue

        #load following page, try to click on "more"
        #after a few pages have been scraped, the "more" link might disappear
        #if not present look for the highest year not parsed yet
        #click once on the year and go back to clicking "more"

        #this only gets the "more" button link, if one exists
        #new_page is different for groups
        if self.group == 1:
            new_page = response.xpath(
                "//div[contains(@id,'stories_container')]/div[2]/a/@href"
            ).extract()
            self.logger.debug(new_page)
        else:
            new_page = response.xpath(
                '//*[@id="structured_composer_async_container"]/div[2]/a/@href'
            ).extract()
            #            with open('0.html', 'wb') as f:
            #                f.write(response.body)
            #the text()-based filters are why lang is needed

        if not new_page or 'Recent' in str(
                response.xpath(
                    '//*[@id="structured_composer_async_container"]/div[2]/a').
                extract()):
            self.logger.info(
                '[!] "more" link not found, will look for a "year" link')
            #self.k is the year link that we look for
            if response.meta['flag'] == self.k and self.k >= self.year:
                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                    self.k) + "')]/@href"
                new_page = response.xpath(xpath).extract()
                if new_page:
                    new_page = response.urljoin(new_page[0])
                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    self.k -= 1
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': self.k})
                else:
                    #                    print('does this ever happen????')
                    while not new_page:  #sometimes years are skipped; this handles small year gaps
                        self.logger.info(
                            'Link not found for year {}, trying with previous year {}'
                            .format(self.k, self.k - 1))
                        self.k -= 1
                        if self.k < self.year:
                            raise CloseSpider(
                                'Reached date: {}. Crawling finished'.format(
                                    self.date))
                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                            self.k) + "')]/@href"
                        new_page = response.xpath(xpath).extract()
                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': self.k})
            else:
                self.logger.info('Crawling has finished with no errors!')
        else:
            new_page = response.urljoin(new_page[0])
            #            print(new_page)
            #            print(new_page[0])
            #            sys.exit("this is what it looks like")

            if 'flag' in response.meta:
                self.logger.info(
                    'Page scraped, clicking on "more"! new_page = {}'.format(
                        new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': response.meta['flag']})
            else:
                self.logger.info(
                    'First page scraped, clicking on "more"! new_page = {}'.format(
                        new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': self.k})
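
The year-fallback above is easier to follow in isolation. Below is a minimal sketch of the same walk-backwards idea; paginate_by_year and find_year_link are hypothetical names, with find_year_link standing in for the response.xpath(...).extract() lookup, and are not part of the original spider:

def paginate_by_year(k, min_year, find_year_link):
    """Walk back from year k until a link is found or min_year is passed.

    find_year_link(year) returns a list of hrefs (possibly empty),
    mirroring response.xpath(...).extract().
    """
    while k >= min_year:
        links = find_year_link(k)
        if links:
            return k, links[0]
        k -= 1
    return None, None

#usage: years with no link (gaps) are skipped silently
links_by_year = {2017: ['/page?timestart=1483228800']}
print(paginate_by_year(2019, 2015, lambda y: links_by_year.get(y, [])))
#-> (2017, '/page?timestart=1483228800')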
Example No. 10
0
    def parse_page(self, response):
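        '''
        Select the posts in the page that fall inside the date window,
        then recursively request the next page.
        '''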

        #maximum number of outdated posts tolerated in a page before crawling stops
        maximum_outdated_count = 3
        outdated_count = 0

        posts = []
        if response.meta['group'] == 1:
            posts = response.xpath(
                "//div[@id='m_group_stories_container']//div[contains(@data-ft,'mf_story_key')]"
            )
        else:
            posts = response.xpath(
                "//div[contains(@data-ft,'top_level_post_id')]")

        for post in posts:
            many_features = post.xpath('./@data-ft').get()
            date = parse_date([many_features], {'lang': self.lang})
            current_date = datetime.strptime(
                date, '%Y-%m-%d %H:%M:%S') if date is not None else None

            if current_date is None:
                date_string = post.xpath('.//abbr/text()').get()
                if date_string is None:
                    continue
                date = parse_date2([date_string], {'lang': self.lang})
                current_date = datetime(date.year, date.month,
                                        date.day) if date is not None else None
                date = str(date)

            #count posts older than the 'date' argument and stop once the tolerance is exceeded
            if self.date > current_date:
                outdated_count += 1
                if outdated_count > maximum_outdated_count:
                    self.logger.info(
                        'Reached date: {} for crawling page {}. Crawling finished'
                        .format(self.date, response.url))
                    return
                else:
                    continue
            #skip posts newer than the 'skipto_date' argument
            if self.skipto_date < current_date:
                continue

            #extract() returns the full post link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])

            yield scrapy.Request(temp_post,
                                 self.parse_post,
                                 meta={
                                     'index': 1,
                                     'page_url': response.url
                                 })

        #load the following page, trying to click on "more"
        #after a few pages have been scraped, the "more" link might disappear
        #if it is not present, look for the highest year not yet parsed,
        #click once on that year, then go back to clicking "more"

        #new_page is different for groups
        if response.meta['group'] == 1:
            new_page = response.xpath(
                "//div[contains(@id,'stories_container')]/div[2]/a/@href"
            ).extract()
        else:
            new_page = response.xpath(
                "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href"
            ).extract()
            #the text checks in the xpath above are language-dependent: this is why lang is needed

        if not new_page:
            self.logger.info(
                '[!] "more" link not found, will look for a "year" link')
            #self.k is the year whose link we are looking for
            if response.meta['flag'] == self.k and self.k >= self.year:
                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                    self.k) + "')]/@href"
                new_page = response.xpath(xpath).extract()
                if new_page:
                    new_page = response.urljoin(new_page[0])
                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    self.k -= 1
                    #low priority (-1000): parse already-queued posts before paginating
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         priority=-1000,
                                         meta={
                                             'flag': self.k,
                                             'group': response.meta['group']
                                         })
                else:
                    while not new_page:  #sometimes years are skipped; this handles small gaps
                        self.logger.info(
                            'Link not found for year {}, trying with previous year {}'
                            .format(self.k, self.k - 1))
                        self.k -= 1
                        if self.k < self.year:
                            self.logger.info(
                                'Reached date: {} for crawling page {}. Crawling finished'
                                .format(self.date, response.url))
                            return
                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                            self.k) + "')]/@href"
                        new_page = response.xpath(xpath).extract()
                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         priority=-1000,
                                         meta={
                                             'flag': self.k,
                                             'group': response.meta['group']
                                         })
            else:
                self.logger.info('Crawling has finished with no errors!')
        else:
            new_page = response.urljoin(new_page[0])
            if 'flag' in response.meta:
                self.logger.info(
                    'Page scraped, clicking on "more"! new_page = {}'.format(
                        new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     priority=-1000,
                                     meta={
                                         'flag': response.meta['flag'],
                                         'group': response.meta['group']
                                     })
            else:
                self.logger.info(
                    'First page scraped, clicking on "more"! new_page = {}'.format(
                        new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     priority=-1000,
                                     meta={
                                         'flag': self.k,
                                         'group': response.meta['group']
                                     })
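
Example No. 10 restricts crawling to a window between the 'date' and 'skipto_date' arguments, tolerating a few out-of-order posts before giving up. Below is a standalone sketch of that filtering policy; filter_window and its parameters are hypothetical names, not taken from the spider:

from datetime import datetime

def filter_window(posts, start, stop, max_outdated=3):
    """Yield posts dated within [start, stop].

    Posts older than start count as outdated; after more than
    max_outdated of them the generator stops, mirroring the
    early return in parse_page above.
    """
    outdated = 0
    for post_date, post in posts:
        if post_date < start:
            outdated += 1
            if outdated > max_outdated:
                return
            continue
        if post_date > stop:  #newer than the window: skip, keep crawling
            continue
        yield post

#usage
posts = [(datetime(2019, 1, 5), 'in window'),
         (datetime(2018, 12, 1), 'too old'),
         (datetime(2019, 3, 1), 'too new')]
print(list(filter_window(posts, datetime(2019, 1, 1), datetime(2019, 2, 1))))
#-> ['in window']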