コード例 #1
0
    def parse(self, response):
        """Extract article fields from a FashionLady post page into a
        FashionLadyItem.

        Expects ``response.meta`` to carry ``url`` (the canonical article
        URL) and ``temp_file`` (a scratch file removed once parsing is done).
        """
        url = response.meta.get('url')

        title = response.css("h1.entry-title::text").get()
        time = response.css("time::text").get()
        # Collect every text node inside the article body paragraphs and
        # flatten them into a single space-joined string.
        content = response.css("div.td-post-content p ::text").getall()
        content = ' '.join(content)

        loader = ItemLoader(item=FashionLadyItem(), selector=response)
        loader.add_value("url", url)
        loader.add_value("title", title)
        # Use the public add_value() API; _add_value() is a private
        # ItemLoader helper and not a stable interface.
        loader.add_value("time", time)
        loader.add_value("content", content)

        # Clean up the temporary file created by the upstream request.
        os.remove(response.meta.get('temp_file'))
        return loader.load_item()


# title = response.css("h1.entry-title::text").get()
# time = response.css("time::text").get()
# content = response.css("div.td-post-content p::text").getall()
# content = ' '.join(content)
コード例 #2
0
    def parse(self, response):
        """Extract article fields from a L'Oreal Paris article page into a
        LorealParisItem.

        Expects ``response.meta`` to carry ``url`` (the canonical article
        URL) and ``temp_file`` (a scratch file removed once parsing is done).
        """
        url = response.meta.get('url')

        title = response.css("h1.bm-content-header__title::text").get()
        category = response.css("h2.bm-content-header__subtitle::text").get()
        # XPath string() flattens the whole article body into one text value.
        content = response.xpath(
            "string(//div[@class='bm-article-body'])").getall()
        content = ' '.join(content)
        # Strip only the first "SHARE" (the share-widget label) so any
        # legitimate later occurrence of the word survives.
        content = content.replace("SHARE", "", 1)

        loader = ItemLoader(item=LorealParisItem(), selector=response)
        loader.add_value("url", url)
        loader.add_value("title", title)
        # Use the public add_value() API; _add_value() is a private
        # ItemLoader helper and not a stable interface.
        loader.add_value("category", category)
        loader.add_value("content", content)

        # Clean up the temporary file created by the upstream request.
        os.remove(response.meta.get('temp_file'))
        return loader.load_item()


# title = response.css("h1.bm-content-header__title::text").get()
# category = response.css("h2.bm-content-header__subtitle::text").get()
# content = content=response.css("div.bm-article-body p.bm-article-body__copy::text").getall()
コード例 #3
0
    def parse(self, response):
        """Load the page's main heading link text and its tag list into a
        QuotesSpiderItem."""
        loader = ItemLoader(item=QuotesSpiderItem(), response=response)
        h1_tag = response.xpath('//h1/a/text()').get()
        tags = response.xpath('//*[@class="tag-item"]/a/text()').getall()
        # Use the public add_value() API; _add_value() is a private
        # ItemLoader helper and not a stable interface.
        loader.add_value('h1_tag', h1_tag)
        loader.add_value('tags', tags)

        return loader.load_item()
コード例 #4
0
 def parse_detail(self, response):
     """Parse an ArticleHeader/StandardArticleBody-style article page into
     a NewsContext item.

     Pages with an empty headline or empty body yield nothing.
     """
     title = "".join(response.css(".ArticleHeader_headline::text").extract())
     content = "".join(response.css('.StandardArticleBody_body > p::text').extract())
     if title and content:
         loader = ItemLoader(item=NewsContext(), response=response)
         # Use the public add_value() API; _add_value() is a private
         # ItemLoader helper and not a stable interface.
         loader.add_value("url", response.url)
         loader.add_value("title", title)
         loader.add_value("content", content)
         # Crawl timestamp in epoch seconds (not the article publish date).
         loader.add_value("date", int(time.time()))
         loader.add_value("domain", self.task_domain)
         return loader.load_item()
コード例 #5
0
 def parse_detail(self, response):
     """Parse an articleBody-style story page into a NewsContext item.

     Pages with an empty headline or empty body yield nothing.
     """
     # NOTE(review): extract() returns a list here; add_value() accepts
     # iterables, so each headline fragment is added as a separate value.
     title = response.css(".story-body h1::text").extract()
     content = "".join(
         response.css('div[property=articleBody] p::text').extract())
     if title and content:
         loader = ItemLoader(item=NewsContext(), response=response)
         # Use the public add_value() API; _add_value() is a private
         # ItemLoader helper and not a stable interface.
         loader.add_value("url", response.url)
         loader.add_value("title", title)
         loader.add_value("content", content)
         # Crawl timestamp in epoch seconds (not the article publish date).
         loader.add_value("date", int(time.time()))
         loader.add_value("domain", self.task_domain)
         return loader.load_item()
コード例 #6
0
    def parse_lot(self, response):
        """Parse a single auction-lot page into a LarsenDelpetersonItem.

        The item location is expected as "City, Region"; splitting is done
        once from the left, so extra commas end up in the region part and a
        missing comma or missing address yields empty fields instead of
        raising.
        """
        loader = ItemLoader(item=LarsenDelpetersonItem(), response=response)
        loader.default_output_processor = TakeFirst()

        loader.add_xpath('LotNum', '//h1/text()')
        loader.add_xpath(
            'LotDescription',
            '//h2[contains(text(), "Item Details:")]/following-sibling::p[1]/text()[1]'
        )

        address = response.xpath(
            '//b[contains(text(), "Item Location:")]/following-sibling::text()[1]'
        ).extract_first()
        # partition() splits at the first comma only and never raises, unlike
        # `city, region = address.split(',')` which fails on extra commas,
        # a missing comma, or a None address.
        city, _, region = (address or '').partition(',')
        # Public add_value() instead of the private _add_value() helper.
        loader.add_value('City', city)
        loader.add_value('State', region)
        # NOTE(review): ZIP receives the same raw region string as State --
        # presumably the item's processors extract the ZIP code; confirm.
        loader.add_value('ZIP', region)
        loader.add_xpath(
            'Contact',
            '//b[contains(text(), "Equipment Contact:")]/following-sibling::text()[1]'
        )
        loader.add_xpath(
            'Phone',
            '//b[contains(text(), "Phone Number:")]/following-sibling::text()[1]'
        )
        loader.add_xpath(
            'Category',
            '//strong[contains(text(), "Category:")]/following-sibling::text()[1]'
        )
        loader.add_xpath(
            'ClosesOn',
            '//strong[contains(text(), "Closes On")]/following-sibling::text()[1]'
        )
        loader.add_xpath('image_urls', '//div[@id="gallery"]//a/@href')
        loader.add_value('folder_name', self.auction_id)

        yield loader.load_item()
コード例 #7
0
    def parse_reply(self, response):
        '''
        Parse replies to comments; the root comment is also emitted when
        the request was flagged 'init'.

        response.meta contract:
            flag     -- 'init' on first visit (emit root comment + replies),
                        'back' while paginating backwards through older
                        nested replies
            reply_to -- source of the comment being replied to
            url      -- post page to return to once replies are exhausted
            index    -- position of the nested comment on the post page
            group    -- group-crawl flag, forwarded unchanged
        '''

        def load_comment(node, reply_to, text_xpath):
            # Build one CommentsItem from a comment node; shared by the
            # root-comment and reply loops below.
            new = ItemLoader(item=CommentsItem(), selector=node)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            # Profile-image resolution is disabled; store a placeholder.
            new.add_value('profile_img', "")
            new.add_value('reply_to', reply_to)
            new.add_xpath('text', text_xpath)
            # NOTE(review): the date is read from the whole response, not
            # from `node`, so every comment on this page gets the page's
            # first <abbr> date -- confirm whether per-comment dates were
            # intended.
            date_string = response.xpath('.//abbr/text()').get()
            date = parse_date2([date_string], {'lang': self.lang})
            new.add_value('date', date)
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            return new.load_item()

        if response.meta['flag'] == 'init':
            # Root comment of this reply thread.
            for root in response.xpath(
                    '//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'
            ):
                yield load_comment(root, 'ROOT', './/div[1]//text()')
            # All replies on this page.
            for reply in response.xpath(
                    '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
            ):
                yield load_comment(reply, response.meta['reply_to'],
                                   './/div[h3]/div[1]//text()')

            back = response.xpath(
                '//div[contains(@id,"comment_replies_more_1")]/a/@href'
            ).extract()
            if back:
                self.logger.info('Back found, more nested comments')
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=1000,
                                     meta={
                                         'reply_to': response.meta['reply_to'],
                                         'flag': 'back',
                                         'url': response.meta['url'],
                                         'index': response.meta['index'],
                                         'group': response.meta['group']
                                     })

            else:
                # No more nested replies: resume the post page at the next
                # nested-comment index.
                next_reply = response.meta['url']
                self.logger.info(
                    'Nested comments crawl finished, heading to proper page: {}'
                    .format(response.meta['url']))
                yield scrapy.Request(next_reply,
                                     callback=self.parse_post,
                                     meta={
                                         'index': response.meta['index'] + 1,
                                         'group': response.meta['group']
                                     })

        elif response.meta['flag'] == 'back':
            # Older replies reached through the "back" link.
            for reply in response.xpath(
                    '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
            ):
                yield load_comment(reply, response.meta['reply_to'],
                                   './/div[h3]/div[1]//text()')
            # Keep paginating backwards while a "back" link exists.
            back = response.xpath(
                '//div[contains(@id,"comment_replies_more_1")]/a/@href'
            ).extract()
            if back:
                # Log only when a back link was actually found (the original
                # logged unconditionally here, unlike the 'init' branch).
                self.logger.info('Back found, more nested comments')
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=1000,
                                     meta={
                                         'reply_to': response.meta['reply_to'],
                                         'flag': 'back',
                                         'url': response.meta['url'],
                                         'index': response.meta['index'],
                                         'group': response.meta['group']
                                     })

            else:
                # Backwards pagination exhausted: resume the post page at the
                # next nested-comment index.
                next_reply = response.meta['url']
                self.logger.info(
                    'Nested comments crawl finished, heading to home page: {}'.
                    format(response.meta['url']))
                yield scrapy.Request(next_reply,
                                     callback=self.parse_post,
                                     meta={
                                         'index': response.meta['index'] + 1,
                                         'group': response.meta['group']
                                     })


# =============================================================================
# CRAWL REACTIONS
# =============================================================================
#    def parse_reactions(self,response):
#        new = ItemLoader(item=CommentsItem(),response=response, parent=response.meta['item'])
#        new.context['lang'] = self.lang
#        new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
#        new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
#        new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")
#        new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
#        new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
#        new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
#        yield new.load_item()
#
#    #substitute
#    yield new.load_item()
#    ‾‾‾‾‾‾‾‾‾|‾‾‾‾‾‾‾‾‾‾‾
#    _________v___
#    #response --> reply/root
#    reactions = response.xpath(".//a[contains(@href,'reaction/profile')]/@href")
#    reactions = response.urljoin(reactions[0].extract())
#    if reactions:
#        yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item':new})
#    else:
#        yield new.load_item()
コード例 #8
0
    def parse_post(self, response):
        '''
        Parse a post page. This does multiple things:
            1) loads replied-to-comments pages one-by-one (for DFS)
            2) calls parse_reply on the nested comments
            3) adds simple (not-replied-to) comments
            4) follows to the next comment page
        '''
        # Select the single nested (replied-to) comment matching the DFS
        # position carried in response.meta['index'].
        nested_path = (
            './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]'
            + '[' + str(response.meta['index']) + ']')
        group_flag = response.meta.get('group')

        # Evaluate the selector once; it also gates the regular-comment and
        # pagination stages below (the original evaluated it three times).
        nested = response.xpath(nested_path)
        for reply in nested:
            source = reply.xpath('.//h3/a/text()').extract()
            answer = reply.xpath(
                './/a[contains(@href,"repl")]/@href').extract()
            # The last matching link is the replies page (answer[-1] is the
            # idiomatic form of answer[::-1][0]).
            ans = response.urljoin(answer[-1])
            self.logger.info('{} nested comment'.format(
                str(response.meta['index'])))
            yield scrapy.Request(ans,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta={
                                     'reply_to': source,
                                     'url': response.url,
                                     'index': response.meta['index'],
                                     'flag': 'init',
                                     'group': group_flag
                                 })

        # Regular (not-replied-to) comments; only emitted once all nested
        # comments at this index are exhausted.
        if not nested:
            path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
            for i, reply in enumerate(response.xpath(path2)):
                self.logger.info('{} regular comment'.format(i + 1))
                new = ItemLoader(item=CommentsItem(), selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_xpath('source_url', './/h3/a/@href')
                # Profile-image resolution is disabled; store a placeholder
                # via the public add_value() API (not the private
                # _add_value() helper).
                new.add_value('profile_img', "")
                new.add_xpath('text', './/div[h3]/div[1]//text()')
                new.add_xpath('img', './/div[h3]/div[2]/img/@src')
                # NOTE(review): the date is read from the whole response, so
                # every comment gets the page's first <abbr> date -- confirm
                # whether per-comment dates were intended.
                date_string = response.xpath('.//abbr/text()').get()
                date = parse_date2([date_string], {'lang': self.lang})
                new.add_value('date', date)
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                yield new.load_item()

        # Follow to the next comment page once nested comments are done.
        if not nested:
            # For groups (or when "see_next" is absent) paginate through the
            # "see_prev" link; otherwise follow "see_next".
            next_xpath = './/div[contains(@id,"see_next")]'
            prev_xpath = './/div[contains(@id,"see_prev")]'
            if not response.xpath(next_xpath) or group_flag == 1:
                for next_page in response.xpath(prev_xpath):
                    new_page = next_page.xpath('.//@href').extract()
                    new_page = response.urljoin(new_page[0])
                    self.logger.info(
                        'New page to be crawled {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_post,
                                         meta={
                                             'index': 1,
                                             'group': 1
                                         })
            else:
                for next_page in response.xpath(next_xpath):
                    new_page = next_page.xpath('.//@href').extract()
                    new_page = response.urljoin(new_page[0])
                    self.logger.info(
                        'New page to be crawled {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_post,
                                         meta={
                                             'index': 1,
                                             'group': group_flag
                                         })