Beispiel #1
0
    def parse_page(self, response):
        '''
        Crawl one page of comments.

        While the page holds a replied-to comment at position
        response.meta['index'], only a request for its reply thread is
        emitted (callback parse_reply) and nothing else is done.
        Otherwise every not-replied-to comment is emitted as a
        CommentsItem and each "see_next" link is followed with the index
        reset to 1.
        '''
        position = response.meta['index']
        # author href, cut before '&' for profile.php links and before '?'
        # for every other link
        source_xpath = "substring-before(.//h3/a/@href, concat(substring('&', 1 div contains(.//h3/a/@href, 'profile.php')), substring('?', 1 div not(contains(.//h3/a/@href, 'profile.php')))))"
        nested_xpath = ''.join((
            './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]',
            '[', str(position), ']'))
        nested = response.xpath(nested_xpath)

        # replied-to comment at the current position: request its thread
        for thread in nested:
            author = thread.xpath(source_xpath).extract()
            reply_links = thread.xpath(
                './/a[contains(@href,"repl")]/@href').extract()
            # the last matching link is the one that gets followed
            thread_url = response.urljoin(reply_links[-1])
            self.logger.info('{} nested comment @ page {}'.format(
                str(position), thread_url))
            request_meta = {
                'reply_to': author,
                'url': response.url,
                'index': position,
                'flag': 'init'
            }
            yield scrapy.Request(thread_url,
                                 callback=self.parse_reply,
                                 meta=request_meta)

        # regular comments and paging are only handled once no nested
        # comment matched
        if nested:
            return

        plain_xpath = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
        for number, comment in enumerate(response.xpath(plain_xpath)):
            self.logger.info('{} regular comment @ page {}'.format(
                number, response.url))
            loader = ItemLoader(item=CommentsItem(), selector=comment)
            loader.context['lang'] = self.lang
            loader.add_xpath('source', source_xpath)
            loader.add_xpath('text', './/div[h3]/div[1]//text()')
            loader.add_xpath('date', './/abbr/text()')
            loader.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            loader.add_value('url', response.url)
            yield loader.load_item()

        # follow the links to further comments
        for pager in response.xpath('.//div[contains(@id,"see_next")]'):
            hrefs = pager.xpath('.//@href').extract()
            target = response.urljoin(hrefs[0])
            self.logger.info('New page to be crawled {}'.format(target))
            yield scrapy.Request(target,
                                 callback=self.parse_page,
                                 meta={'index': 1})
Beispiel #2
0
    def parse_reply(self, response):
        '''
        Parse one page of replies to a nested comment.

        Reads from response.meta:
            flag      'init' for the first page of a thread (the root
                      comment is emitted as well), 'back' for the
                      following pages
            reply_to  value stored in the 'reply_to' field of every reply
            url       page to return to once the thread is exhausted
            index     position of the thread on that page
            group     passed through unchanged to the follow-up request

        Yields one CommentsItem per comment found, then exactly one
        request: the next page of replies (callback parse_reply) when a
        "comment_replies_more_1" link exists, otherwise the originating
        page with index + 1 (callback parse_post). Any other flag value
        yields nothing.
        '''
        root_xpath = '//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'
        reply_xpath = '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'

        def load_comment(node, reply_to, text_xpath):
            # build the item for a single comment node
            new = ItemLoader(item=CommentsItem(), selector=node)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            # profile_img is always passed as an empty string
            new._add_value('profile_img', "")
            new.add_value('reply_to', reply_to)
            new.add_xpath('text', text_xpath)
            # the date has to come from this comment node: querying the
            # whole response would give every item the first date of the page
            date_string = node.xpath('.//abbr/text()').get()
            if date_string is not None:
                date = parse_date2([date_string], {'lang': self.lang})
                new._add_value('date', date)
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            return new.load_item()

        flag = response.meta['flag']
        if flag == 'init':
            #parse root comment
            for root in response.xpath(root_xpath):
                yield load_comment(root, 'ROOT', './/div[1]//text()')
            finished_message = 'Nested comments crawl finished, heading to proper page: {}'
        elif flag == 'back':
            finished_message = 'Nested comments crawl finished, heading to home page: {}'
        else:
            return

        #parse all replies in the page
        for reply in response.xpath(reply_xpath):
            yield load_comment(reply, response.meta['reply_to'],
                               './/div[h3]/div[1]//text()')

        back = response.xpath(
            '//div[contains(@id,"comment_replies_more_1")]/a/@href'
        ).extract()
        if back:
            #keep going backwards through the thread
            self.logger.info('Back found, more nested comments')
            back_page = response.urljoin(back[0])
            yield scrapy.Request(back_page,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta={
                                     'reply_to': response.meta['reply_to'],
                                     'flag': 'back',
                                     'url': response.meta['url'],
                                     'index': response.meta['index'],
                                     'group': response.meta['group']
                                 })
        else:
            #thread exhausted: return to the originating page, next index
            next_reply = response.meta['url']
            self.logger.info(finished_message.format(response.meta['url']))
            yield scrapy.Request(next_reply,
                                 callback=self.parse_post,
                                 meta={
                                     'index': response.meta['index'] + 1,
                                     'group': response.meta['group']
                                 })


# =============================================================================
# CRAWL REACTIONS
# =============================================================================
#    def parse_reactions(self,response):
#        new = ItemLoader(item=CommentsItem(),response=response, parent=response.meta['item'])
#        new.context['lang'] = self.lang
#        new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
#        new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
#        new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")
#        new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
#        new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
#        new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
#        yield new.load_item()
#
#    #substitute
#    yield new.load_item()
#    ‾‾‾‾‾‾‾‾‾|‾‾‾‾‾‾‾‾‾‾‾
#    _________v___
#    #response --> reply/root
#    reactions = response.xpath(".//a[contains(@href,'reaction/profile')]/@href")
#    reactions = response.urljoin(reactions[0].extract())
#    if reactions:
#        yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item':new})
#    else:
#        yield new.load_item()
Beispiel #3
0
    def parse_post(self, response):
        '''
        Crawl one page of comments of a post.

        Reads from response.meta:
            index  1-based position of the replied-to comment to follow
            group  optional; when equal to 1 the "see_prev" links are
                   followed even if "see_next" links exist

        Behaviour:
            1) if a replied-to comment exists at position index, only a
               request for its reply thread is yielded (callback
               parse_reply, flag 'init'); the request carries this page's
               url and the index in its meta
            2) otherwise every not-replied-to comment is yielded as a
               CommentsItem
            3) and the next comment page is requested with index reset to 1
        '''
        index = response.meta['index']
        group_flag = response.meta.get('group')

        #select nested comment one-by-one matching with the index
        path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '[' + str(
            index) + ']'
        # evaluated once; the result is reused for the guard below
        nested = response.xpath(path)

        for reply in nested:
            source = reply.xpath('.//h3/a/text()').extract()
            answer = reply.xpath(
                './/a[contains(@href,"repl")]/@href').extract()
            # the last matching link is the one that gets followed
            ans = response.urljoin(answer[-1])
            self.logger.info('{} nested comment'.format(str(index)))
            yield scrapy.Request(ans,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta={
                                     'reply_to': source,
                                     'url': response.url,
                                     'index': index,
                                     'flag': 'init',
                                     'group': group_flag
                                 })

        # regular comments and paging are only handled once no nested
        # comment matched
        if nested:
            return

        #load regular comments
        path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
        for i, reply in enumerate(response.xpath(path2)):
            self.logger.info('{} regular comment'.format(i + 1))
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            # profile_img is always passed as an empty string
            new._add_value('profile_img', "")
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            new.add_xpath('img', './/div[h3]/div[2]/img/@src')
            # the date has to come from this comment node: querying the
            # whole response would give every item the first date of the page
            date_string = reply.xpath('.//abbr/text()').get()
            if date_string is not None:
                date = parse_date2([date_string], {'lang': self.lang})
                new._add_value('date', date)
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            yield new.load_item()

        #new comment page
        next_xpath = './/div[contains(@id,"see_next")]'
        prev_xpath = './/div[contains(@id,"see_prev")]'
        if not response.xpath(next_xpath) or group_flag == 1:
            #for groups: follow "see_prev" and keep doing so on later pages
            pager_xpath, next_group = prev_xpath, 1
        else:
            pager_xpath, next_group = next_xpath, group_flag

        for next_page in response.xpath(pager_xpath):
            links = next_page.xpath('.//@href').extract()
            if not links:
                # pager element without any href: nothing to follow
                continue
            new_page = response.urljoin(links[0])
            self.logger.info('New page to be crawled {}'.format(new_page))
            yield scrapy.Request(new_page,
                                 callback=self.parse_post,
                                 meta={
                                     'index': 1,
                                     'group': next_group
                                 })
Beispiel #4
0
    def parse_reply(self,response):
        '''
        Parse one page of replies to a nested comment.

        Reads from response.meta:
            flag      'init' for the first page of a thread (the root
                      comment is emitted as well), 'back' for the
                      following pages
            reply_to  value stored in the 'reply_to' field of every reply
            url       page to return to once the thread is exhausted
            index     position of the thread on that page
            group     passed through unchanged to the follow-up request

        Yields one CommentsItem per comment found, then exactly one
        request: the next page of replies (callback parse_reply) when a
        "comment_replies_more_1" link exists, otherwise the originating
        page with index + 1 (callback parse_post). Any other flag value
        yields nothing.
        '''
        flag = response.meta['flag']

        if flag == 'init':
            #parse root comment
            for root in response.xpath(xAll_ROOT_DIV):
                new = ItemLoader(item=CommentsItem(), selector=root)
                new.context['lang'] = self.lang
                new.add_xpath('source', xREPLY_['attributes']['source'])
                new.add_xpath('source_url', xREPLY_['attributes']['source_url'])
                new.add_value('reply_to', 'ROOT')
                new.add_xpath('text', xREPLY_['attributes']['text_root'])
                new.add_xpath('date', xREPLY_['attributes']['date'])
                new.add_xpath('reactions', xREPLY_['attributes']['reactions'])
                new.add_value('url', response.url)
                yield new.load_item()
            #parse all replies in the page
            for reply in response.xpath(xAll_REPLIES_DIV):
                new = ItemLoader(item=CommentsItem(), selector=reply)
                new.context['lang'] = self.lang
                # 'source' was missing here although the root comment and
                # the replies of the 'back' pages both carry it
                new.add_xpath('source', xREPLY_['attributes']['source'])
                new.add_xpath('source_url', xREPLY_['attributes']['source_url'])
                new.add_value('reply_to', response.meta['reply_to'])
                new.add_xpath('text', xREPLY_['attributes']['text_child'])
                new.add_xpath('date', xREPLY_['attributes']['date'])
                new.add_xpath('reactions', './/a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url', response.url)
                yield new.load_item()
            finished_message = 'Nested comments crawl finished, heading to proper page: {}'

        elif flag == 'back':
            #parse all comments
            for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
                new = ItemLoader(item=CommentsItem(), selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source', xREPLY_['attributes']['source'])
                new.add_xpath('source_url', xREPLY_['attributes']['source_url'])
                new.add_value('reply_to', response.meta['reply_to'])
                new.add_xpath('text', xREPLY_['attributes']['text_child'])
                new.add_xpath('date', xREPLY_['attributes']['date'])
                new.add_xpath('reactions', xREPLY_['attributes']['reactions'])
                new.add_value('url', response.url)
                yield new.load_item()
            finished_message = 'Nested comments crawl finished, heading to home page: {}'

        else:
            return

        back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
        if back:
            #keep going backwards; logged only when a link was really found
            self.logger.info('Back found, more nested comments')
            back_page = response.urljoin(back[0])
            yield scrapy.Request(back_page,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta={'reply_to': response.meta['reply_to'],
                                       'flag': 'back',
                                       'url': response.meta['url'],
                                       'index': response.meta['index'],
                                       'group': response.meta['group']})
        else:
            #thread exhausted: return to the originating page, next index
            next_reply = response.meta['url']
            self.logger.info(finished_message.format(response.meta['url']))
            yield scrapy.Request(next_reply,
                                 callback=self.parse_post,
                                 meta={'index': response.meta['index'] + 1,
                                       'group': response.meta['group']})
Beispiel #5
0
    def parse_post(self, response):
        '''
        Crawl one page of comments of a post.

        If a replied-to comment exists at position response.meta['index'],
        only a request for its reply thread is emitted (callback
        parse_reply, flag 'init'). Otherwise the not-replied-to comments
        are emitted as CommentsItem objects and the next comment page is
        requested with the index reset to 1. The optional meta key
        'group', when equal to 1, forces the "previous comments" links
        to be followed.
        '''
        nested_xpath = xNESTED_COMMENT_['root'] % str(response.meta['index'])
        group_flag = response.meta.get('group')
        nested = response.xpath(nested_xpath)

        # replied-to comment at the current position: request its thread
        for thread in nested:
            author = thread.xpath(xNESTED_COMMENT_['attributes']['source']).extract()
            reply_links = thread.xpath(xNESTED_COMMENT_['attributes']['answer']).extract()
            # the last matching link is the one that gets followed
            thread_url = response.urljoin(reply_links[-1])
            self.logger.info('{} nested comment'.format(str(response.meta['index'])))
            request_meta = {'reply_to': author,
                            'url': response.url,
                            'index': response.meta['index'],
                            'flag': 'init',
                            'group': group_flag}
            yield scrapy.Request(thread_url,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta=request_meta)

        # regular comments and paging are only handled once no nested
        # comment matched
        if nested:
            return

        # not-replied-to comments: each field is filled from the XPath
        # registered under the same name
        for number, comment in enumerate(response.xpath(xREGULAR_COMMENT_['root']), 1):
            self.logger.info('{} regular comment'.format(number))
            loader = ItemLoader(item=CommentsItem(), selector=comment)
            loader.context['lang'] = self.lang
            for field in ('source', 'source_url', 'text', 'date', 'reactions'):
                loader.add_xpath(field, xREGULAR_COMMENT_['attributes'][field])
            loader.add_value('url', response.url)
            yield loader.load_item()

        # new comment page
        next_xpath = xNEXT_COMMENTS_['root']
        prev_xpath = xPREV_COMMENTS_DIV
        follow_previous = group_flag == 1 or not response.xpath(next_xpath)
        if follow_previous:
            # for groups: follow the "previous comments" links
            for pager in response.xpath(prev_xpath):
                hrefs = pager.xpath(xNEXT_COMMENTS_['attributes']['new_page']).extract()
                target = response.urljoin(hrefs[0])
                self.logger.info('New page to be crawled {}'.format(target))
                yield scrapy.Request(target,
                                     callback=self.parse_post,
                                     meta={'index': 1, 'group': 1})
            return

        for pager in response.xpath(next_xpath):
            hrefs = pager.xpath('.//@href').extract()
            target = response.urljoin(hrefs[0])
            self.logger.info('New page to be crawled {}'.format(target))
            yield scrapy.Request(target,
                                 callback=self.parse_post,
                                 meta={'index': 1, 'group': group_flag})
Beispiel #6
0
    def parse_post(self, response):
        '''
        Crawl one page of comments of a post.

        Reads 'index', 'post_id' and 'current_date' (and optionally
        'group') from response.meta. If a replied-to comment exists at
        position index, only a request for its reply thread is emitted
        (callback parse_reply, flag 'init'). Otherwise the not-replied-to
        comments are emitted as CommentsItem objects and the next comment
        page is requested with the index reset to 1. After each page
        request, CloseSpider is raised when self.date >= current_date.
        '''
        position = response.meta['index']
        nested_xpath = ''.join((
            './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]',
            '[', str(position), ']'))
        group_flag = response.meta.get('group')

        post_id = response.meta['post_id']
        current_date = response.meta['current_date']

        nested = response.xpath(nested_xpath)

        # replied-to comment at the current position: request its thread
        for thread in nested:
            author = thread.xpath('.//h3/a/text()').extract()
            reply_links = thread.xpath(
                './/a[contains(@href,"repl")]/@href').extract()
            # the last matching link is the one that gets followed
            thread_url = response.urljoin(reply_links[-1])
            self.logger.info('{} nested comment'.format(str(position)))
            request_meta = {
                'reply_to': author,
                'url': response.url,
                'index': position,
                'flag': 'init',
                'group': group_flag,
                'post_id': post_id,
                'current_date': current_date
            }
            yield scrapy.Request(thread_url,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta=request_meta)

        # regular comments and paging are only handled once no nested
        # comment matched
        if nested:
            return

        plain_xpath = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
        for number, comment in enumerate(response.xpath(plain_xpath), 1):
            self.logger.info('{} regular comment'.format(number))
            loader = ItemLoader(item=CommentsItem(), selector=comment)
            loader.context['lang'] = self.lang
            loader.add_xpath('source', './/h3/a/text()')
            loader.add_xpath('source_url', './/h3/a/@href')
            loader.add_xpath('text', './/div[h3]/div[1]//text()')
            loader.add_value('reply_to', 'ROOT')
            loader.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            loader.add_value('url', response.url)
            loader.add_value('post_id', post_id)
            yield loader.load_item()

        # new comment page: "see_prev" links when there is no "see_next"
        # link or group is 1, "see_next" links otherwise
        next_xpath = './/div[contains(@id,"see_next")]'
        prev_xpath = './/div[contains(@id,"see_prev")]'
        if not response.xpath(next_xpath) or group_flag == 1:
            pager_xpath, next_group = prev_xpath, 1
        else:
            pager_xpath, next_group = next_xpath, group_flag

        for pager in response.xpath(pager_xpath):
            hrefs = pager.xpath('.//@href').extract()
            target = response.urljoin(hrefs[0])
            self.logger.info('New page to be crawled {}'.format(target))
            yield scrapy.Request(target,
                                 callback=self.parse_post,
                                 meta={
                                     'index': 1,
                                     'group': next_group,
                                     'post_id': post_id,
                                     'current_date': current_date
                                 })
            # checked after the page request has been handed out
            if self.date >= current_date:
                raise CloseSpider('Reached date: {}'.format(self.date))
Beispiel #7
0
    def parse_page(self, response):
        '''
        Crawl one page of comments, depth-first over comments with replies.

        Exactly one of two things happens per call:
            1) if the page holds a replied-to comment at position
               response.meta['index'], its replies page is requested with
               parse_reply as callback and nothing else is done;
            2) otherwise every not-replied-to comment is loaded into a
               CommentsItem and the next comment page is requested with
               index reset to 1.

        Reads response.meta['index'] (required) and response.meta['group']
        (optional, None when absent). When group is 1, or the page has no
        "see_next" block, pagination follows the "see_prev" blocks instead
        and the new request carries group 1.

        Link blocks without an href are logged and skipped.
        '''
        index = response.meta['index']
        group_flag = response.meta.get('group')

        #select nested comment one-by-one matching with the index;
        #evaluated once and reused for the branch decision below
        nested = response.xpath(
            './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]'
            + '[' + str(index) + ']')
        for reply in nested:
            source = reply.xpath('.//h3/a/text()').extract()
            answer = reply.xpath(
                './/a[contains(@href,"repl")]/@href').extract()
            if not answer:
                #nothing to follow: skip instead of raising IndexError
                self.logger.warning(
                    '{} nested comment without replies link @ page {}'.format(
                        index, response.url))
                continue
            #the last matching link is the one that gets followed
            ans = response.urljoin(answer[-1])
            self.logger.info('{} nested comment @ page {}'.format(
                str(index), ans))
            #url and index travel in meta so the callback knows which page
            #and position this request came from
            yield scrapy.Request(ans,
                                 callback=self.parse_reply,
                                 meta={
                                     'reply_to': source,
                                     'url': response.url,
                                     'index': index,
                                     'flag': 'init',
                                     'group': group_flag
                                 })
        if nested:
            #a nested comment matched: regular comments and pagination are
            #not processed for this response
            return

        #load regular comments
        path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
        for i, reply in enumerate(response.xpath(path2)):
            self.logger.info('{} regular comment @ page {}'.format(
                i, response.url))
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            new.add_xpath('date', './/abbr/text()')
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value('url', response.url)
            yield new.load_item()

        #new comment page
        next_links = response.xpath('.//div[contains(@id,"see_next")]')
        if not next_links or group_flag == 1:
            #for groups (or when there is no "see_next"): follow "see_prev"
            links = response.xpath('.//div[contains(@id,"see_prev")]')
            next_group = 1
        else:
            links = next_links
            next_group = group_flag
        for next_page in links:
            hrefs = next_page.xpath('.//@href').extract()
            if not hrefs:
                #pagination block without a link: skip instead of IndexError
                self.logger.warning(
                    'Pagination block without href @ page {}'.format(
                        response.url))
                continue
            new_page = response.urljoin(hrefs[0])
            self.logger.info('New page to be crawled {}'.format(new_page))
            yield scrapy.Request(new_page,
                                 callback=self.parse_page,
                                 meta={
                                     'index': 1,
                                     'group': next_group
                                 })
Beispiel #8
0
    def parse_reply(self, response):
        '''
        Parse one page of replies to a comment.

        response.meta['flag'] selects the mode:
            'init' - the root comment is loaded too (reply_to='ROOT'),
                     followed by every reply on the page;
            'back' - only the replies on the page are loaded.
        Any other flag yields nothing.

        After the items, either the "comment_replies_more_1" link is
        requested with flag 'back', or, when the page has no such link, the
        page stored in response.meta['url'] is requested through parse_page
        with response.meta['index'] + 1.

        Reads response.meta keys 'flag', 'reply_to', 'url', 'index' and
        'group'.
        '''
        flag = response.meta['flag']
        if flag not in ('init', 'back'):
            return

        #(xpath, is_root) pairs, in the order the comments are emitted
        selections = []
        if flag == 'init':
            #parse root comment
            selections.append((
                '//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]',
                True))
        #parse all replies in the page
        selections.append((
            '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]',
            False))

        for query, is_root in selections:
            for comment in response.xpath(query):
                new = ItemLoader(item=CommentsItem(), selector=comment)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_xpath('source_url', './/h3/a/@href')
                if is_root:
                    new.add_value('reply_to', 'ROOT')
                    new.add_xpath('text', './/div[1]//text()')
                else:
                    new.add_value('reply_to', response.meta['reply_to'])
                    new.add_xpath('text', './/div[h3]/div[1]//text()')
                new.add_xpath('date', './/abbr/text()')
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url', response.url)
                yield new.load_item()

        #keep going backwards while the page links to more replies
        back = response.xpath(
            '//div[contains(@id,"comment_replies_more_1")]/a/@href'
        ).extract()
        if back:
            #logged only when a link was actually found
            self.logger.info('Back found, more nested comments')
            back_page = response.urljoin(back[0])
            yield scrapy.Request(back_page,
                                 callback=self.parse_reply,
                                 priority=100,
                                 meta={
                                     'reply_to': response.meta['reply_to'],
                                     'flag': 'back',
                                     'url': response.meta['url'],
                                     'index': response.meta['index'],
                                     'group': response.meta['group']
                                 })
            return

        next_reply = response.meta['url']
        #the two modes word this message differently; both wordings are kept
        if flag == 'init':
            self.logger.info(
                'Nested comments crawl finished, heading to proper page: {}'
                .format(response.meta['url']))
        else:
            self.logger.info(
                'Nested comments crawl finished, heading to home page: {}'.
                format(response.meta['url']))
        yield scrapy.Request(next_reply,
                             callback=self.parse_page,
                             meta={
                                 'index': response.meta['index'] + 1,
                                 'group': response.meta['group']
                             })


# =============================================================================
# CRAWL REACTIONS
# =============================================================================
#    def parse_reactions(self,response):
#        new = ItemLoader(item=CommentsItem(),response=response, parent=response.meta['item'])
#        new.context['lang'] = self.lang
#        new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
#        new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
#        new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")
#        new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
#        new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
#        new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
#        yield new.load_item()
#
#    #substitute
#    yield new.load_item()
#    ‾‾‾‾‾‾‾‾‾|‾‾‾‾‾‾‾‾‾‾‾
#    _________v___
#    #response --> reply/root
#    reactions = response.xpath(".//a[contains(@href,'reaction/profile')]/@href")
#    reactions = response.urljoin(reactions[0].extract())
#    if reactions:
#        yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item':new})
#    else:
#        yield new.load_item()
Beispiel #9
0
    def parse_reply(self, response):
        '''
        Parse one page of replies to a comment.

        response.meta['flag'] selects the mode:
            'init' - the root comment is loaded too (reply_to='ROOT'),
                     followed by every reply on the page;
            'back' - only the replies on the page are loaded.
        Any other flag yields nothing.

        After the items, either the "comment_replies_more_1" link is
        requested with flag 'back', or, when the page has no such link, the
        page stored in response.meta['url'] is requested through parse_page
        with response.meta['index'] + 1 and dont_filter=True.

        Reads response.meta keys 'flag', 'reply_to', 'url' and 'index'.
        '''
        flag = response.meta['flag']
        if flag not in ('init', 'back'):
            return

        #(xpath, is_root) pairs, in the order the comments are emitted
        selections = []
        if flag == 'init':
            #parse root comment
            selections.append((
                '//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]',
                True))
        #parse all replies in the page
        selections.append((
            '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]',
            False))

        for query, is_root in selections:
            for comment in response.xpath(query):
                new = ItemLoader(item=CommentsItem(), selector=comment)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                if is_root:
                    new.add_value('reply_to', 'ROOT')
                    new.add_xpath('text', './/div[1]//text()')
                else:
                    new.add_value('reply_to', response.meta['reply_to'])
                    new.add_xpath('text', './/div[h3]/div[1]//text()')
                new.add_xpath('date', './/abbr/text()')
                new.add_value('url', response.url)
                yield new.load_item()

        #keep going backwards while the page links to more replies
        back = response.xpath(
            '//div[contains(@id,"comment_replies_more_1")]/a/@href'
        ).extract()
        if back:
            #logged only when a link was actually found
            self.logger.info('Back found, trying to go back')
            back_page = response.urljoin(back[0])
            yield scrapy.Request(back_page,
                                 callback=self.parse_reply,
                                 priority=100,
                                 meta={
                                     'reply_to': response.meta['reply_to'],
                                     'flag': 'back',
                                     'url': response.meta['url'],
                                     'index': response.meta['index']
                                 })
            return

        next_reply = response.meta['url']
        self.logger.info(
            'Nested comments crawl finished, heading to home page: {}'.
            format(response.meta['url']))
        #dont_filter=True is kept from the original request; presumably
        #needed because this URL was already requested once - confirm
        yield scrapy.Request(
            next_reply,
            dont_filter=True,
            callback=self.parse_page,
            meta={'index': response.meta['index'] + 1})
Beispiel #10
0
    def parse_reply(self, response):
        """
        Parse one page of replies to a comment.

        response.meta["flag"] selects the mode:
            "init" - the root comment is loaded too (reply_to="ROOT"),
                     followed by every reply on the page;
            "back" - only the replies on the page are loaded.
        Any other flag yields nothing.

        After the items, either the "comment_replies_more_1" link is
        requested with flag "back", or, when the page has no such link, the
        page stored in response.meta["url"] is requested through parse_post
        with response.meta["index"] + 1.

        Reads response.meta keys "flag", "reply_to", "url", "index",
        "post_id" and "group".
        """
        # debugging aid:
        #        from scrapy.utils.response import open_in_browser
        #        open_in_browser(response)

        flag = response.meta["flag"]
        if flag not in ("init", "back"):
            return

        # (xpath, is_root) pairs, in the order the comments are emitted
        selections = []
        if flag == "init":
            # parse root comment
            selections.append((
                '//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]',
                True,
            ))
        # parse all replies in the page
        selections.append((
            '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]',
            False,
        ))

        for query, is_root in selections:
            for comment in response.xpath(query):
                new = ItemLoader(item=CommentsItem(), selector=comment)
                new.context["lang"] = self.lang
                new.add_xpath("source", ".//h3/a/text()")
                new.add_xpath("source_url", ".//h3/a/@href")
                if is_root:
                    new.add_value("reply_to", "ROOT")
                    new.add_xpath("text", ".//div[1]//text()")
                else:
                    new.add_value("reply_to", response.meta["reply_to"])
                    new.add_xpath("text", ".//div[h3]/div[1]//text()")
                new.add_xpath("date", ".//abbr/text()")
                new.add_xpath(
                    "reactions",
                    './/a[contains(@href,"reaction/profile")]//text()')
                new.add_value("post_id", response.meta["post_id"])
                new.add_value("url", response.url)
                yield new.load_item()

        # keep going backwards while the page links to more replies
        back = response.xpath(
            '//div[contains(@id,"comment_replies_more_1")]/a/@href'
        ).extract()
        if back:
            # logged only when a link was actually found
            self.logger.info("Back found, more nested comments")
            back_page = response.urljoin(back[0])
            yield scrapy.Request(
                back_page,
                callback=self.parse_reply,
                priority=1000,
                meta={
                    "reply_to": response.meta["reply_to"],
                    "flag": "back",
                    "url": response.meta["url"],
                    "index": response.meta["index"],
                    "post_id": response.meta["post_id"],
                    "group": response.meta["group"],
                },
            )
            return

        next_reply = response.meta["url"]
        # the two modes word this message differently; both wordings are kept
        if flag == "init":
            self.logger.info(
                "Nested comments crawl finished, heading to proper page: {}"
                .format(response.meta["url"]))
        else:
            self.logger.info(
                "Nested comments crawl finished, heading to home page: {}".
                format(response.meta["url"]))
        yield scrapy.Request(
            next_reply,
            callback=self.parse_post,
            meta={
                "index": response.meta["index"] + 1,
                "post_id": response.meta["post_id"],
                "group": response.meta["group"],
            },
        )
Beispiel #11
0
    def parse_post(self, response):
        """
        Crawl one page of a post's comments, depth-first over comments with
        replies.

        Steps, in order:
            1) if the comment ids on this page equal response.meta["testpath"]
               (the ids recorded by the page that requested this one), the
               page is treated as a repeat and nothing is yielded;
            2) if the page holds a replied-to comment at position
               response.meta["index"], its replies page is requested with
               parse_reply as callback and nothing else is done;
            3) otherwise every not-replied-to comment is loaded into a
               CommentsItem and the next comment page is requested with
               index reset to 1 and this page's comment ids as "testpath".

        Reads response.meta["index"] (required) and the optional keys
        "post_id", "testpath" and "group". A missing "post_id" is logged as
        "single post" and handled as None. When group is 1, or the page has
        no "see_next" block, pagination follows the "see_prev" blocks instead
        and the new request carries group 1.

        Link blocks without an href are logged and skipped.
        """
        index = response.meta["index"]

        # ids of all comments on this page, compared with the ids of the
        # page that linked here to detect pagination that does not advance
        comment_ids = response.xpath(
            './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]/@id'
        ).extract()
        repeated_page = False
        if comment_ids:
            if "testpath" not in response.meta:
                self.logger.info("seems ok to continue")
            elif response.meta["testpath"] == comment_ids:
                repeated_page = True

        # post_id is always bound: None when the request carried none
        if "post_id" not in response.meta:
            self.logger.info("single post")
        post_id = response.meta.get("post_id")
        group_flag = response.meta.get("group")

        if repeated_page:
            return

        # load replied-to comments pages
        # select nested comment one-by-one matching with the index;
        # evaluated once and reused for the branch decision below
        nested = response.xpath(
            './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]'
            + "[" + str(index) + "]")
        for reply in nested:
            source = reply.xpath(".//h3/a/text()").extract()
            answer = reply.xpath(
                './/a[contains(@href,"repl")]/@href').extract()
            if not answer:
                # nothing to follow: skip instead of raising IndexError
                self.logger.warning(
                    "{} nested comment without replies link".format(index))
                continue
            # the last matching link is the one that gets followed
            ans = response.urljoin(answer[-1])
            self.logger.info("{} nested comment".format(str(index)))
            yield scrapy.Request(
                ans,
                callback=self.parse_reply,
                priority=1000,
                meta={
                    "reply_to": source,
                    "url": response.url,
                    "index": index,
                    "flag": "init",
                    "post_id": post_id,
                    "group": group_flag,
                },
            )
        if nested:
            # a nested comment matched: regular comments and pagination are
            # not processed for this response
            return

        # load regular comments
        path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
        for i, reply in enumerate(response.xpath(path2)):
            self.logger.info("{} regular comment".format(i + 1))
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context["lang"] = self.lang
            new.add_xpath("source", ".//h3/a/text()")
            new.add_xpath("source_url", ".//h3/a/@href")
            new.add_xpath("text", ".//div[h3]/div[1]//text()")
            new.add_xpath("date", ".//abbr/text()")
            new.add_xpath(
                "reactions",
                './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value("post_id", post_id)
            new.add_value("url", response.url)
            yield new.load_item()

        # new comment page
        next_links = response.xpath('.//div[contains(@id,"see_next")]')
        if not next_links or group_flag == 1:
            # for groups (or when there is no "see_next"): follow "see_prev"
            links = response.xpath('.//div[contains(@id,"see_prev")]')
            next_group = 1
        else:
            links = next_links
            next_group = group_flag
        for next_page in links:
            hrefs = next_page.xpath(".//@href").extract()
            if not hrefs:
                # pagination block without a link: skip instead of IndexError
                self.logger.warning("Pagination block without href")
                continue
            new_page = response.urljoin(hrefs[0])
            self.logger.info("New page to be crawled {}".format(new_page))
            yield scrapy.Request(
                new_page,
                callback=self.parse_post,
                meta={
                    "index": 1,
                    "post_id": post_id,
                    "testpath": comment_ids,
                    "group": next_group,
                },
            )
Beispiel #12
0
    def parse_reply(self, response):
        '''
        parse reply to comments, root comment is added if flag
        '''
        #        from scrapy.utils.response import open_in_browser
        #        open_in_browser(response)

        if response.meta['flag'] == 'init':
            #parse root comment
            for root in response.xpath(
                    '//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'
            ):
                new = ItemLoader(item=CommentsItem(), selector=root)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_xpath('source_url', './/h3/a/@href')
                new.add_value('reply_to', 'ROOT')
                new.add_xpath('text', './/div[1]//text()')
                new.add_xpath('date', './/abbr/text()')
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url', response.url)
                #response --> reply/root
                """
                PROFILE REACTIONS SECTION (REPEAT SEE LINE 176 )
                the only difference is that, when getting the item temporarily
                the selector is the root instead of the reply, (it matches the for loop)
                """
                #profile = response.xpath(".//h3/a/@href")
                #profile = response.urljoin(profile[0].extract())
                profile = "https://mbasic.facebook.com" + new.get_collected_values(
                    'source_url')[0]
                print('profile', profile)
                #print('new item', new.get_collected_values('name'))
                item = new.load_item()
                check = 0
                if profile:
                    check += 1
                    yield scrapy.Request(profile,
                                         callback=self.parse_profile,
                                         meta={'item': item})

                #reactions = new.get_value('reactions')
                #print("reactions",reactions)

                temp = ItemLoader(item=CommentsItem(), selector=root)
                temp.context['lang'] = self.lang

                temp.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]/@href')
                reactions = temp.get_collected_values('reactions')
                if reactions:
                    check += 1
                    reactions = "https://mbasic.facebook.com" + temp.get_collected_values(
                        'reactions')[0]
                    temp = 0
                    yield scrapy.Request(reactions,
                                         callback=self.parse_reactions,
                                         meta={'item': item})

                if check == 0:
                    yield item

            #parse all replies in the page
            for reply in response.xpath(
                    '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
            ):
                new = ItemLoader(item=CommentsItem(), selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_xpath('source_url', './/h3/a/@href')
                new.add_value('reply_to', response.meta['reply_to'])
                new.add_xpath('text', './/div[h3]/div[1]//text()')
                new.add_xpath('date', './/abbr/text()')
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url', response.url)
                """
                PROFILE REACTIONS SECTION SECTION (REPEAT SEE LINE 176)
                """
                #profile = response.xpath(".//h3/a/@href")
                #profile = response.urljoin(profile[0].extract())
                profile = "https://mbasic.facebook.com" + new.get_collected_values(
                    'source_url')[0]

                #print('new item', new.get_collected_values('name'))
                item = new.load_item()
                check = 0
                if profile:
                    check += 1
                    yield scrapy.Request(profile,
                                         callback=self.parse_profile,
                                         meta={'item': item})

                temp = ItemLoader(item=CommentsItem(), selector=reply)
                temp.context['lang'] = self.lang

                temp.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]/@href')
                reactions = temp.get_collected_values('reactions')
                if reactions:
                    check += 1
                    reactions = "https://mbasic.facebook.com" + temp.get_collected_values(
                        'reactions')[0]
                    temp = 0
                    yield scrapy.Request(reactions,
                                         callback=self.parse_reactions,
                                         meta={'item': item})

                if check == 0:
                    yield item

            back = response.xpath(
                '//div[contains(@id,"comment_replies_more_1")]/a/@href'
            ).extract()
            if back:
                self.logger.info('Back found, more nested comments')
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=1000,
                                     meta={
                                         'reply_to': response.meta['reply_to'],
                                         'flag': 'back',
                                         'url': response.meta['url'],
                                         'index': response.meta['index'],
                                         'group': response.meta['group']
                                     })

            else:
                next_reply = response.meta['url']
                self.logger.info(
                    'Nested comments crawl finished, heading to proper page: {}'
                    .format(response.meta['url']))
                yield scrapy.Request(next_reply,
                                     callback=self.parse_post,
                                     meta={
                                         'index': response.meta['index'] + 1,
                                         'group': response.meta['group']
                                     })

        elif response.meta['flag'] == 'back':
            """
            adds random time pauses to prevent blocking
            DOWNSIDE: the algorithm will go slower, but still
            runs pretty quickly
            the greater the length of time, the more 
            likely you'll go undetected, but if you're using a large amount 
            of data, this may be unreasonable
            """
            #print("did we make it")
            r = randrange(0, 20)
            time.sleep(r)
            #parse all comments
            for reply in response.xpath(
                    '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
            ):
                #print("reply")
                new = ItemLoader(item=CommentsItem(), selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_xpath('source_url', './/h3/a/@href')
                new.add_value('reply_to', response.meta['reply_to'])
                new.add_xpath('text', './/div[h3]/div[1]//text()')
                new.add_xpath('date', './/abbr/text()')
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url', response.url)
                """
                SECTION (REPEAT SEE LINE 176)
                """

                profile = "https://mbasic.facebook.com" + new.get_collected_values(
                    'source_url')[0]

                #profile = response.xpath(".//h3/a/@href")
                #profile = response.urljoin(profile[0].extract())
                #print('profile', profile)
                #print('new item', new.get_collected_values('name'))
                check = 0
                item = new.load_item()
                if profile:
                    check += 1
                    print(1)
                    yield scrapy.Request(profile,
                                         callback=self.parse_profile,
                                         meta={'item': item})

                #response --> reply/root
                #print("before ", item)
                temp = ItemLoader(item=CommentsItem(), selector=reply)
                temp.context['lang'] = self.lang

                temp.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]/@href')
                reactions = temp.get_collected_values('reactions')
                if reactions:
                    check += 1
                    reactions = "https://mbasic.facebook.com" + temp.get_collected_values(
                        'reactions')[0]
                    temp = 0
                    print(2)
                    yield scrapy.Request(reactions,
                                         callback=self.parse_reactions,
                                         meta={'item': item})

                if check == 0:
                    print(3)
                    yield item
                #print("after ", item)

            #keep going backwards
            back = response.xpath(
                '//div[contains(@id,"comment_replies_more_1")]/a/@href'
            ).extract()
            self.logger.info('Back found, more nested comments')
            if back:
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=1000,
                                     meta={
                                         'reply_to': response.meta['reply_to'],
                                         'flag': 'back',
                                         'url': response.meta['url'],
                                         'index': response.meta['index'],
                                         'group': response.meta['group']
                                     })

            else:
                next_reply = response.meta['url']
                self.logger.info(
                    'Nested comments crawl finished, heading to home page: {}'.
                    format(response.meta['url']))
                yield scrapy.Request(next_reply,
                                     callback=self.parse_post,
                                     meta={
                                         'index': response.meta['index'] + 1,
                                         'group': response.meta['group']
                                     })
Beispiel #13
0
    def parse_post(self, response):
        '''
        Parse one page of comments under a post.

        Each call does exactly one of two things, depending on whether the
        page has a replied-to comment at position response.meta['index']:

            1) if it does, follow that comment's replies link with
               parse_reply (one nested thread per call, for DFS) and
               emit nothing else for this page;
            2) otherwise, load every simple (not-replied-to) comment,
               schedule its profile / reactions requests (or yield the
               item directly when it has neither), and follow the link
               to the next comment page.

        Reads response.meta['index'] (required; used as the XPath position
        of the nested comment to visit) and response.meta['group']
        (optional; None when absent).

        Malformed nodes (a nested comment without a replies link, a comment
        without an author href, a pager block without an href) are skipped
        instead of raising IndexError, so one bad node no longer aborts the
        rest of the page.
        '''
        base_url = 'https://mbasic.facebook.com'
        index = response.meta['index']
        group_flag = response.meta.get('group')

        #load replied-to comments pages
        #select nested comment one-by-one matching with the index
        path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '[' + str(
            index) + ']'
        #evaluated once; the result decides which of the two modes runs
        nested = response.xpath(path)

        for reply in nested:
            source = reply.xpath('.//h3/a/text()').extract()
            answer = reply.xpath(
                './/a[contains(@href,"repl")]/@href').extract()
            if not answer:
                #nothing to follow; previously this raised IndexError
                self.logger.warning(
                    '{} nested comment has no replies link, skipped'.format(
                        index))
                continue
            #the last matching link is the one that is followed
            ans = response.urljoin(answer[-1])
            self.logger.info('{} nested comment'.format(str(index)))
            yield scrapy.Request(ans,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta={
                                     'reply_to': source,
                                     'url': response.url,
                                     'index': index,
                                     'flag': 'init',
                                     'group': group_flag
                                 })

        #regular comments and pagination are only handled once no nested
        #comment is left at this index
        if nested:
            return

        #load regular comments
        path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
        for i, reply in enumerate(response.xpath(path2), start=1):
            self.logger.info('{} regular comment'.format(i))
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            new.add_xpath('date', './/abbr/text()')
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value('url', response.url)

            #PROFILE REACTIONS SECTION
            #The item is handed to parse_profile and/or parse_reactions via
            #request meta; it is yielded here only when neither request is
            #scheduled.
            #NOTE(review): both requests carry the same item object; whether
            #that leads to the item being emitted twice depends on the
            #callbacks, which are not visible here -- confirm.
            source_urls = new.get_collected_values('source_url')
            item = new.load_item()
            handed_off = False

            if source_urls:
                handed_off = True
                yield scrapy.Request(base_url + source_urls[0],
                                     callback=self.parse_profile,
                                     meta={'item': item})

            #a second loader is used only to collect the reactions href
            temp = ItemLoader(item=CommentsItem(), selector=reply)
            temp.context['lang'] = self.lang
            temp.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]/@href')
            reaction_links = temp.get_collected_values('reactions')
            if reaction_links:
                handed_off = True
                yield scrapy.Request(base_url + reaction_links[0],
                                     callback=self.parse_reactions,
                                     meta={'item': item})

            if not handed_off:
                yield item

        #new comment page
        #for groups: with no "see_next" block, or once the group flag is 1,
        #paging follows "see_prev" and the flag is set to 1 from then on
        next_xpath = './/div[contains(@id,"see_next")]'
        prev_xpath = './/div[contains(@id,"see_prev")]'
        if not response.xpath(next_xpath) or group_flag == 1:
            pager_xpath, next_group = prev_xpath, 1
        else:
            pager_xpath, next_group = next_xpath, group_flag

        for pager in response.xpath(pager_xpath):
            hrefs = pager.xpath('.//@href').extract()
            if not hrefs:
                #pager block without a link; previously raised IndexError
                continue
            new_page = response.urljoin(hrefs[0])
            self.logger.info('New page to be crawled {}'.format(new_page))
            yield scrapy.Request(new_page,
                                 callback=self.parse_post,
                                 meta={
                                     'index': 1,
                                     'group': next_group
                                 })