def get_news(self, response):
    try:
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', response.xpath('//span[@id="thread_subject"]/text()').extract())
        l.add_value('date', response.xpath('//div[@class="authi"]/em/text()').extract())
        r1 = r"\d{4}\-\d{1,2}\-\d{1,2}\s\d{2}\:\d{2}"
        date0 = re.compile(r1)
        date = ''.join(l.get_collected_values('date'))
        date1 = date0.findall(date)
        l.replace_value('date', date1[0])
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/div/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/br/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/div/font/font/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/font/font/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/font/font/font/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/p/font/font/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/div/div/div/font/font/strong/text()').extract())
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        return l.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
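
# The function above is the first instance of a pattern repeated throughout
# this file: collect raw date strings on the loader, regex out the first
# timestamp, then overwrite the field with replace_value(). A minimal sketch
# of that step in isolation; extract_first_date and its fallback value are
# hypothetical and not part of the original spiders.
import re

def extract_first_date(raw, pattern=r"\d{4}-\d{1,2}-\d{1,2}\s\d{2}:\d{2}",
                       fallback='1970-01-01 00:00'):
    """Return the first timestamp in `raw` matching `pattern`, else `fallback`."""
    matches = re.findall(pattern, raw)
    return matches[0] if matches else fallback

# Usage against a loader that already holds raw date fragments:
#   l.replace_value('date', extract_first_date(''.join(l.get_collected_values('date'))))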
def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//div[@id="contentwrap"]/h1/text()').extract())
    l.add_value('date', response.xpath('//div[@class="infos"]/p/text()').extract())
    r1 = r"\d{4}\-\d{1,2}\-\d{1,2}\s\d{2}\:\d{2}\:\d{2}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    l.replace_value('date', date1[0])
    l.add_value('content', response.xpath('//div[@class="content"]/text()').extract())
    l.add_value('content', response.xpath('//div[@class="description"]/text()').extract())
    l.add_value('content', response.xpath('//div[@class="content"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="content"]/div/p/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    return l.load_item()
def get_news(self, response):
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value('title', response.xpath('//p[@class="title1"]/text()').extract_first())
        date = response.xpath('//pre[@class="f_title"]/text()').extract_first()
        # Take the ten characters that follow the "日期:" ("date:") label.
        loader.replace_value('date', date[date.find(u"日期:") + 3:][0:10])
        loader.add_value('content', ''.join(response.xpath('//div[@class="contents"]/descendant-or-self::text()').extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    return loader.load_item()
def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//table/tr[3]/td[2]/text()').extract())
    l.add_value('date', response.xpath('//table/tr[4]/td/text()').extract())
    r1 = r"\d{4}\-\d{1,2}\-\d{1,2}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    l.replace_value('date', date1[0])
    l.add_value('content', response.xpath('//td[@class="tdbg"]/div/font/text()').extract())
    l.add_value('content', response.xpath('//td[@class="tdbg"]/p/font/text()').extract())
    l.add_value('content', response.xpath('//td[@class="tdbg"]/p/span/text()').extract())
    l.add_value('content', response.xpath('//td[@class="tdbg"]/p/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    l.add_value('website', self.website)
    return l.load_item()
def get_news(self, response):
    try:
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', response.xpath('//div[@id="biaoti"]/text()').extract())
        l.add_value('title', response.xpath('//h1[@id="biaoti"]/text()').extract())
        l.add_value('date', response.xpath('//span[@id="pubtime_baidu"]/text()').extract())
        l.add_value('date', response.xpath('//div[@class="center lh32 grey12a"]/text()').extract())
        l.add_value('date', response.xpath('//div[@id="left"]/h2/text()').extract())
        l.add_value('content', response.xpath('//div[@id="zw"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@id="zw"]/strong/p/text()').extract())
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        url = response.url
        # Pages on the shzfzz host publish dates like "2018年01月01日";
        # convert those to the "YYYY-MM-DD HH:MM:SS" form used elsewhere.
        if url[11:17] == "shzfzz":
            date = ''.join(l.get_collected_values('date'))
            date = time.strptime(date.split()[0], u'%Y年%m月%d日')
            l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))
        return l.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def get_news(self, response):
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value("title", response.xpath('//h1[@id="title"]/text()').extract_first())
        loader.add_value("title", response.xpath('//span[@id="title"]/text()').extract_first())
        loader.add_value("date", response.xpath('//span[@class="time"]/text()').extract_first())
        loader.add_value("date", response.xpath('//span[@id="pubtime"]/text()').extract_first())
        date = ''.join(loader.get_collected_values("date")).strip()
        date = time.strptime(date, '%Y年%m月%d日 %H:%M:%S')
        loader.replace_value("date", time.strftime("%Y-%m-%d %H:%M:%S", date))
        loader.add_value("content", ''.join(response.xpath('//div[@id="content"]/descendant-or-self::text()').extract()))
        loader.add_value("content", ''.join(response.xpath('//div[@class="article"]/descendant-or-self::text()').extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    return loader.load_item()
def parse(self, response):
    filename = response.url.split('/')[-1]
    if filename == 'teacher_list.js':
        # The teacher list sits in the last eight "var x = [...];" lines.
        contents = response.text.strip().split('\n')[-8:]
        for content in contents:
            content = content.split('=')[1].strip().replace(';', '')
            teachers = eval(content)  # each line holds a JS array literal
            teacher_loader = ItemLoader(item=TeacherItem())
            for teacher in teachers:
                # One shared loader: replace_value overwrites the fields per row.
                teacher_loader.replace_value('number', teacher[0])
                teacher_loader.replace_value('name', teacher[1])
                yield teacher_loader.load_item()
    else:
        content = response.text.strip().split('\n')[-4]
        content = content.split('=')[1].strip().replace(';', '')
        class_numbers = eval(content)
        class_number_loader = ItemLoader(item=ClassesItem())
        for class_number in class_numbers:
            class_number_loader.replace_value('number', class_number[0])
            class_number_loader.replace_value('full_name', class_number[1])
            class_number_loader.replace_value('name', class_number[1])
            yield class_number_loader.load_item()
def parse(self, response):
    all_row = response.css(".empty_html tr")
    for i in range(1, len(all_row)):
        row = all_row[i]
        course_loader = ItemLoader(item=CourseItem(), selector=row)
        # https://aisap.nutc.edu.tw/public/day/course_list.aspx?sem=1081&clsno=1120170121&_p=2 -> 1120170121
        class_id = re.search(r'clsno=[\w\d]*', response.url)[0][6:]
        course_loader.replace_css('number', 'td:nth-child(2)::text')
        course_loader.replace_value('class_id', class_id)
        course_loader.replace_css('name', 'td:nth-child(4)::text, td:nth-child(4) > strong::text')
        course_loader.replace_css('time', 'td:nth-child(6)::text')
        course_loader.replace_css('location', 'td:nth-child(6)::text')
        course_loader.replace_css('compulsory', 'td:nth-child(7)::text')
        course_loader.replace_css('credit', 'td:nth-child(8)::text')
        course_loader.replace_css('popular', 'td:nth-child(9) > strong::text')
        course_loader.replace_css('teacher_name', 'td:nth-child(10)::text')
        course_loader.replace_css('popular_limit', 'td:nth-child(11)::text')
        yield course_loader.load_item()
    next_page = response.css('.page > b:last-child > a::attr(href)').get()
    if next_page:
        yield response.follow(next_page)
def get_news(self, response):
    try:
        loader = ItemLoader(item=SpiderItem(), response=response)
        loader.add_value('title', response.xpath('//div[@class="left"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//h1[@class="h1"]/text()').extract_first())
        loader.add_value('date', response.xpath('//div[@class="zuoze"]/text()').extract_first())
        loader.add_value('date', response.xpath('//span[@class="post-time"]/text()').extract_first())
        date = ''.join(loader.get_collected_values('date'))
        if date == '':
            return
        loader.replace_value('date', date.strip() + ":00")
        loader.add_value('content', ''.join(response.xpath('//span[@id="zoom"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//p[@class="summary"]/descendant-or-self::text()').extract()))
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        yield loader.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        yield l.load_item()
def parse_art_url(self, response):
    # ItemLoader provides the mechanism for populating a container (the item,
    # in dict form); the constructor returns a loader object such as
    # <scrapy.loader.ItemLoader object at 0x7f3f1d459f28>.
    items = ItemLoader(item=CrawlSpiderItem(), response=response)
    # add_xpath would append whatever the expression matches to a field, e.g.:
    # items.add_xpath('art_content', '//div[@class="entry"]/p/text()')
    # items.add_xpath('art_create_time', '//div[@class="entry-meta"]/p/text()')
    # Parse the page and pull out the article body.
    art_content = response.xpath('//div[@class="entry"]')
    art_content_list = art_content.xpath('string(.)').extract()[0].strip().split('\r\n')
    art_contents = ''
    # Clean up the extracted text.
    for i in art_content_list:
        art_contents += i
    art_contents = art_contents.strip().replace('\n', '').replace(' ', '').replace('\t', '')
    # Grab the publication time.
    art_create_time = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract_first()
    # replace_value overwrites any value a field already holds.
    items.replace_value('art_img', response.meta['art_img'])
    items.replace_value('art_url', response.meta['art_url'])
    items.replace_value('art_title', response.meta['art_title'])
    items.replace_value('art_content', art_contents)
    items.replace_value('art_create_time', art_create_time.strip()[0:-2])
    # Hand the item to the pipelines.
    return items.load_item()
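
# The comments above describe ItemLoader as a fill-then-load container. Much
# of the manual joining and stripping done in these callbacks can instead live
# in input/output processors declared on the loader. A minimal sketch, assuming
# Scrapy 2.x (older versions expose the same processors under
# scrapy.loader.processors) and a hypothetical ArticleItem with art_content
# and art_create_time fields:
from itemloaders.processors import Join, MapCompose, TakeFirst
from scrapy.loader import ItemLoader

class ArticleLoader(ItemLoader):
    default_output_processor = TakeFirst()
    # Strip whitespace from every extracted fragment, then join them.
    art_content_in = MapCompose(str.strip)
    art_content_out = Join('')
    art_create_time_in = MapCompose(str.strip)

# A callback would then reduce to:
#   loader = ArticleLoader(item=ArticleItem(), response=response)
#   loader.add_xpath('art_content', '//div[@class="entry"]//text()')
#   return loader.load_item()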
def get_news(self, response):
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value('title', response.xpath('//div[@id="title_tex"]/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="dc-title"]/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="xl-tit"]/text()').extract_first())
        url = response.url
        url = url[url.rfind('/') + 2:url.rfind('_')]
        loader.replace_value('date', url[0:4] + '-' + url[4:6] + '-' + url[6:8])
        loader.add_value('content', ''.join(response.xpath('//div[@id="tex"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@class="tex"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@class="dc-text02"]/descendant-or-self::text()').extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    return loader.load_item()
def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//div[@class="article_title"]/text()').extract())
    l.add_value('date', response.xpath('//div[@class="article_title1"]/text()').extract())
    # The date appears as separate runs of digits (year, month, day);
    # pull the first three and rejoin them as YYYY-MM-DD.
    r1 = r"\d{1,4}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    date1 = date1[0] + '-' + date1[1] + '-' + date1[2]
    l.replace_value('date', date1)
    l.add_value('content', response.xpath('//div[@id="MyContent"]/p/span/text()').extract())
    l.add_value('content', response.xpath('//div[@id="MyContent"]/p/font/span/text()').extract())
    l.add_value('content', response.xpath('//p[@class="MsoNormal"]/span/span/font/span/text()').extract())
    l.add_value('content', response.xpath('//p[@class="MsoNormal"]/span/span/font/text()').extract())
    l.add_value('content', response.xpath('//div[@class="article_intro"]/text()').extract())
    l.add_value('content', response.xpath('//div[@id="MyContent"]/p/font/text()').extract())
    l.add_value('content', response.xpath('//p[@id="MsoNormal"]/span/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    l.add_value('website', self.website)
    return l.load_item()
def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//div[@id="lbyright_xwxq_title"]/text()').extract())
    l.add_value('date', response.xpath('//div[@id="lbyright_xwxq_xxx"]/text()').extract())
    r1 = r"\d{4}\-\d{1,2}\-\d{1,2}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    l.replace_value('date', date1[0])
    l.add_value('content', response.xpath('//div[@id="lbyright_xwxq_txt"]/p/span/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    l.add_value('website', self.website)
    return l.load_item()
def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//h2[@class="titleH2"]/text()').extract())
    l.add_value('title', response.xpath('//div[@class="Article-Left"]/h3/text()').extract())
    l.add_value('title', response.xpath('//div[@class="tit"]/h1/text()').extract())
    l.add_value('date', response.xpath('//div[@class="from"]/span/text()').extract())
    l.add_value('date', response.xpath('//div[@class="CopyFrom"]/text()').extract())
    l.add_value('date', response.xpath('//div[@class="auther-from"]/text()').extract())
    r1 = r"\d{4}\-\d{1,2}\-\d{1,2}\s\d{2}\:\d{2}\:\d{2}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    l.replace_value('date', date1[0])
    l.add_value('content', response.xpath('//div[@class="content"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="content"]/p/font/text()').extract())
    l.add_value('content', response.xpath('//div[@class="content"]/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    l.add_value('website', self.website)
    return l.load_item()
def get_news(self, response):
    try:
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', response.xpath('//div[@class="layout"]/h2/text()').extract())
        l.add_value('title', response.xpath('//div[@id="wrapper"]/h1/text()').extract())
        l.add_value('title', response.xpath('//div[@class="top"]/h1/text()').extract())
        l.add_value('date', response.xpath('//div[@class="layout"]/div/text()').extract())
        l.add_value('date', response.xpath('//div[@class="left"]/span/text()').extract())
        l.add_value('title', response.xpath('//div[@class="top"]/p/text()').extract())
        date = ''.join(l.get_collected_values('date'))
        date = time.strptime(date.split()[0], '%Y-%m-%d')
        l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))
        l.add_value('content', response.xpath('//div[@class="news-con"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@id="news-con"]/div/div/p/text()').extract())
        l.add_value('content', response.xpath('//div[@class="news-con"]/div/p/text()').extract())
        l.add_value('content', response.xpath('//div[@class="news-con"]/div/div/div/font/text()').extract())
        l.add_value('content', response.xpath('//div[@id="news-con"]/div/p/text()').extract())
        l.add_value('content', response.xpath('//div[@id="news-con"]/div/font/font/p/text()').extract())
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        return l.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def parse(self, response):
    items = ItemLoader(item=ScrapyamazonItem())
    all_div_quotes = response.css('div.a-section.a-spacing-medium')
    for quote in all_div_quotes:
        product_name = quote.css('.a-color-base.a-text-normal::text').extract()
        product_author = quote.css('.a-color-secondary .a-size-base.a-link-normal').css('::text').extract()
        product_price = quote.css('.a-spacing-top-small .a-price:nth-child(1) span.a-offscreen').css('::text').extract()
        product_imagelink = quote.css('.s-image::attr(src)').extract()
        items.replace_value('product_name', product_name)
        items.replace_value('product_author', product_author)
        items.replace_value('product_price', product_price)
        items.replace_value('product_imagelink', product_imagelink)
        yield items.load_item()
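
# The loader above is created once outside the loop, which is why
# replace_value (not add_value) is required: it overwrites the previous
# product's fields before each yield. Creating a fresh loader per row, bound
# to the row's selector, is the more conventional shape. A sketch under the
# same assumptions (ScrapyamazonItem comes from the project's items module;
# parse_per_row is a hypothetical name):
from scrapy.loader import ItemLoader

def parse_per_row(self, response):
    for quote in response.css('div.a-section.a-spacing-medium'):
        loader = ItemLoader(item=ScrapyamazonItem(), selector=quote)
        loader.add_css('product_name', '.a-color-base.a-text-normal::text')
        loader.add_css('product_author', '.a-color-secondary .a-size-base.a-link-normal::text')
        loader.add_css('product_price', '.a-spacing-top-small .a-price:nth-child(1) span.a-offscreen::text')
        loader.add_css('product_imagelink', '.s-image::attr(src)')
        yield loader.load_item()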
def parse_price(self, response):
    try:
        lproduct = response.meta["lproduct"]
        lprice = ItemLoader(item=AsosPrice(), response=response)
        pricejson = json.loads(response.body)
        outletprice = pricejson[0]["productPrice"]["current"]["value"]
        if str(outletprice) == "0.0":
            outletprice = pricejson[0]["productPrice"]["xrp"]["value"]
        retailprice = pricejson[0]["productPrice"]["rrp"]["value"]
        if str(retailprice) == "0.0":
            retailprice = pricejson[0]["productPrice"]["previous"]["value"]
        currency = pricejson[0]["productPrice"]["currency"]
        lprice.replace_value("outlet", float(outletprice))
        lprice.replace_value("retail", float(retailprice))
        lprice.replace_value("currency", str(currency).upper())
        lproduct.replace_value("price", dict(lprice.load_item()))
        yield lproduct.load_item()
    # Exception for products that have other products inside, like suits and vests
    except AttributeError as e:
        self.logger.info(str(e))
def get_news(self, response):
    try:
        loader = ItemLoader(item=SpiderItem(), response=response)
        loader.add_value('title', response.xpath('//div[@class="text"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="text_c clearfix"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="text_c"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="d2_left wb_left fl"]/h1/text()').extract_first())
        loader.add_value('date', response.xpath('//p[@class="text_tools"]/text()').extract_first())
        loader.add_value('date', response.xpath('//div[@class="text_c clearfix"]/h5/text()').extract_first())
        loader.add_value('date', response.xpath('//p[@class="sou"]/text()').extract_first())
        loader.add_value('date', response.xpath('//span[@id="p_publishtime"]/text()').extract_first())
        date = ''.join(loader.get_collected_values('date'))
        date = time.strptime(date.split()[0], '%Y年%m月%d日%H:%M')
        loader.replace_value('date', time.strftime('%Y-%m-%d', date))
        loader.add_value('content', ''.join(response.xpath('//div[@class="text_c"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@class="text_show"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@class="show_text"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@id="p_content"]/descendant-or-self::text()').extract()))
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        return loader.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def parse(self, response):
    api = self.api_keys[self.flag]
    yt = build('youtube', 'v3', developerKey=api, cache_discovery=False)
    for city in self.cities[0]:
        logging.info('++++++++++++++++++ {} ++++++++++++++++++'.format(city))
        # If the current API key has run out of quota the search raises an
        # error; rotate to the next key and retry once.
        try:
            search = yt.search().list(q='{}'.format(city), part='snippet',
                                      maxResults=self.max_videos, type='video',
                                      publishedBefore=self.date).execute()
        except Exception:
            self.flag += 1
            yt = build('youtube', 'v3', developerKey=self.api_keys[self.flag],
                       cache_discovery=False)
            print(""" | quotaExceeded | {} """.format(self.flag))
            search = yt.search().list(q='{}'.format(city), part='snippet',
                                      maxResults=self.max_videos, type='video',
                                      publishedBefore=self.date).execute()
        for i in range(len(search['items'])):
            # Basic snippet data for each video.
            l = ItemLoader(item=YoutubeItem(), response=response)
            l.add_value('city', city)
            l.add_value('videoId', str(search['items'][i]['id']['videoId']))
            l.add_value('title', str(search['items'][i]['snippet']['title']))
            l.add_value('datetime', str(search['items'][i]['snippet']['publishedAt']))
            l.add_value('description', str(search['items'][i]['snippet']['description']))
            l.add_value('channelId', str(search['items'][i]['snippet']['channelId']))
            statistic = yt.videos().list(id='{}'.format(search['items'][i]['id']['videoId']),
                                         part='statistics').execute()
            # Default to '0' when a video has likes or dislikes hidden.
            try:
                l.add_value('like', statistic['items'][0]['statistics']['likeCount'])
            except Exception:
                l.add_value('like', '0')
            try:
                l.add_value('dislike', statistic['items'][0]['statistics']['dislikeCount'])
            except Exception:
                l.add_value('dislike', '0')
            yield l.load_item()
            try:
                # Some videos have comments disabled; skip those.
                try:
                    comments = yt.commentThreads().list(videoId=str(search['items'][i]['id']['videoId']),
                                                        part='snippet',
                                                        maxResults=self.max_comments).execute()
                    c = ItemLoader(item=Youtubecomments(), response=response)
                    c.add_value('videoId', str(search['items'][i]['id']['videoId']))
                    if len(comments['items']) != 0:
                        # Fill the loader from the first comment, then reuse it
                        # via replace_value for each remaining comment.
                        c.add_value('c_id', str(comments['items'][0]['id']))
                        c.add_value('authorDisplayName', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['authorDisplayName']))
                        c.add_value('authorChannelUrl', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['authorChannelUrl']))
                        c.add_value('textOriginal', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['textOriginal']))
                        c.add_value('publishedAt', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['publishedAt']))
                        c.add_value('updatedAt', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['updatedAt']))
                        c.add_value('likeCount', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['likeCount']))
                        c.add_value('totalReplyCount', str(comments['items'][0]['snippet']['totalReplyCount']))
                        yield c.load_item()
                        for j in range(1, len(comments['items'])):
                            c.replace_value('c_id', str(comments['items'][j]['id']))
                            c.replace_value('authorDisplayName', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['authorDisplayName']))
                            c.replace_value('authorChannelUrl', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['authorChannelUrl']))
                            c.replace_value('textOriginal', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['textOriginal']))
                            c.replace_value('publishedAt', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['publishedAt']))
                            c.replace_value('updatedAt', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['updatedAt']))
                            c.replace_value('likeCount', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['likeCount']))
                            c.replace_value('totalReplyCount', str(comments['items'][j]['snippet']['totalReplyCount']))
                            yield c.load_item()
                # API exceptions: comments disabled, or quota exhausted.
                except Exception as e:
                    if 'parameter has disabled comments' in str(e):
                        print(''' | parameter has disabled comments | ''')
                    elif 'quota' in str(e):
                        print('+-+-+-+-+-+-+-+-++-+-+-++-+-{}-+-+-+-+-+-+-+-++--+-+'.format(e))
                        print('+-+-+-+-+-+-+-+-++-+-+-++-+-{}-+-+-+-+-+-+-+-++--+-+'.format(self.flag))
                        print(""" | quotaExceeded | """)
                        self.flag += 1
                        yt = build('youtube', 'v3', developerKey=self.api_keys[self.flag],
                                   cache_discovery=False)
                        comments = yt.commentThreads().list(videoId=str(search['items'][i]['id']['videoId']),
                                                            part='snippet',
                                                            maxResults=self.max_comments).execute()
                        c = ItemLoader(item=Youtubecomments(), response=response)
                        c.add_value('videoId', str(search['items'][i]['id']['videoId']))
                        if len(comments['items']) != 0:
                            c.add_value('c_id', str(comments['items'][0]['id']))
                            c.add_value('authorDisplayName', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['authorDisplayName']))
                            c.add_value('authorChannelUrl', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['authorChannelUrl']))
                            c.add_value('textOriginal', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['textOriginal']))
                            c.add_value('publishedAt', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['publishedAt']))
                            c.add_value('updatedAt', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['updatedAt']))
                            c.add_value('likeCount', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['likeCount']))
                            c.add_value('totalReplyCount', str(comments['items'][0]['snippet']['totalReplyCount']))
                            yield c.load_item()
                            for j in range(1, len(comments['items'])):
                                c.replace_value('c_id', str(comments['items'][j]['id']))
                                c.replace_value('authorDisplayName', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['authorDisplayName']))
                                c.replace_value('authorChannelUrl', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['authorChannelUrl']))
                                c.replace_value('textOriginal', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['textOriginal']))
                                c.replace_value('publishedAt', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['publishedAt']))
                                c.replace_value('updatedAt', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['updatedAt']))
                                c.replace_value('likeCount', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['likeCount']))
                                c.replace_value('totalReplyCount', str(comments['items'][j]['snippet']['totalReplyCount']))
                                yield c.load_item()
            except Exception:
                pass
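
# The quota handling above repeats the "advance self.flag, rebuild the client,
# retry once" dance in several places. A small helper can centralize it; this
# is a sketch, not part of the original spider, and rotate_client is a
# hypothetical name. It assumes the same self.api_keys and self.flag.
from googleapiclient.discovery import build

def rotate_client(self):
    """Advance to the next API key and rebuild the YouTube client."""
    self.flag += 1
    if self.flag >= len(self.api_keys):
        raise RuntimeError('all YouTube API keys exhausted')
    return build('youtube', 'v3', developerKey=self.api_keys[self.flag],
                 cache_discovery=False)

# Usage: on a quota error, call `yt = self.rotate_client()` and retry the request.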
def parse_movie_info(self, response, tpp_id):
    logger.info(f"crawled movie info of {response.url}")
    item_loader = ItemLoader(item=MovieInfoItem(), response=response)
    person_item_loader = ItemLoader(item=PersonInfoItem(), response=response)
    data = response.xpath("//script[@type='application/ld+json']/text()").extract()[0]
    try:
        text = json.loads(data, strict=False)
    except json.decoder.JSONDecodeError as de:
        logger.error(f"json decode error {de} in url = {response.url}")
        return
    logger.info(f"len of movie info = {len(text)}")
    item_loader.replace_value('movieName', text.get('name', ''))
    item_loader.replace_value('dbMovieID', text.get('url', '')[9:-1])
    item_loader.replace_value('tppMovieID', tpp_id)
    base_column = {
        'director': 'directors',
        'author': 'writers',
        'actor': 'actors'
    }

    def get_name_list(parent):
        # Keep at most ten names; fall back to a single empty string.
        result = [
            child.get('name', '').split(' ')[0]
            for child in text.get(parent, [])
        ][:10]
        return result if len(result) else [""]

    def get_person_info(parent):
        logger.info("start to crawl person info")
        for detail in text.get(parent, []):
            person_item_loader.replace_value('name', detail.get('name', ''))
            person_item_loader.replace_value('url', detail.get('url', ''))
            person_item_loader.replace_value('identity', parent)
            logger.info(f"get person info with identity {parent}")
            yield person_item_loader

    for column, item_name in base_column.items():
        item_loader.replace_value(item_name, get_name_list(column))
    item_loader.replace_value('genre', text.get('genre', []))
    info = response.xpath('//*[@id="info"]').get()
    # "制片国家/地区" is the "country/region of production" label on the page.
    pattern = '<span class="pl">制片国家/地区:</span>(.*?)<br>'
    item_loader.replace_value('area', re.findall(pattern, info))
    item_loader.replace_value('duration', text.get('duration', ''))
    item_loader.replace_value('publishedDate', text.get('datePublished', ''))
    rating = text.get('aggregateRating', {})
    item_loader.replace_value('rateCount', rating.get('ratingCount', '0.0'))
    item_loader.replace_value('doubanRate', rating.get('ratingValue', '0.0'))
    logger.info("finish parse one movie info, ready to parse person")
    for column in base_column.keys():
        for item in get_person_info(column):
            yield item.load_item()
    yield item_loader.load_item()
    time.sleep(random.uniform(1, 2))
def parse_box_info(self, response):
    """Fetch the daily per-movie box-office figures."""
    time.sleep(random.uniform(0, 1))
    logger.info(f"now crawl url for boxOffice: {response.url}")
    item_loader = ItemLoader(item=BoxOfficeItem(), response=response)
    text = json.loads(response.text, strict=False)
    query_date = text.get('calendar', {}).get('selectDate', "")
    for i, movie_info in enumerate(text.get('movieList', {}).get('list', [])):
        if i == 30:
            break
        # Map item fields to the attribute names used in the JSON payload.
        field_map = {
            'seatRate': 'avgSeatView',
            'boxRate': 'boxRate',
            'showRate': 'showCountRate',
            'splitSumBoxInfo': 'sumSplitBoxDesc',
            'sumBoxInfo': 'sumBoxDesc',
            'showView': 'avgShowView'
        }
        movie_name = movie_info.get('movieInfo', {}).get('movieName')
        movie_id = movie_info.get('movieInfo', {}).get('movieId')
        for field, json_attr in field_map.items():
            item_loader.replace_value(field, movie_info.get(json_attr, ''))
        item_loader.replace_value('movieID', movie_id)
        item_loader.replace_value('movieName', movie_name)
        item_loader.replace_value('releaseInfo', movie_info.get('movieInfo', {}).get('releaseInfo', ""))
        item_loader.replace_value('showInfo', movie_info.get('showCount', 0))
        item_loader.replace_value('boxInfo', movie_info.get('boxSplitUnit', {}).get('num', ""))
        item_loader.replace_value('splitBoxInfo', movie_info.get('splitBoxSplitUnit', {}).get('num', ""))
        item_loader.replace_value('crawlDate', query_date)
        item_loader.replace_value('yearRate', get_year_rate(query_date, i + 1))
        logger.info(f"get {i + 1} boxOffice, named {movie_info.get('movieName', '')}.")
        yield item_loader.load_item()
        # Look up the movie's Douban detail-page link by its name.
        search_url = self.search_base_url + movie_name
        time.sleep(random.uniform(0, 1))
        yield scrapy.Request(url=search_url,
                             cookies=self.cookies,
                             callback=self.parse_movie_info_url,
                             dont_filter=True,
                             cb_kwargs=dict(movie_name=movie_name,
                                            movie_year=query_date,
                                            tpp_id=movie_id))
def getInfo(self, res):
    if not mch(res):
        return
    response = etree.HTML(res.text)
    loader = ItemLoader(item=booking.Booking(), response=res)
    supplier_obj_id = res.meta.get('statics.hotels.id')
    supplier_name = res.meta.get('statics.hotels.supplier')
    if supplier_obj_id:
        loader.add_value('statics_hotels_id', supplier_obj_id)
        loader.add_value('statics_hotels_supplier', supplier_name)
    pic = []
    for e in self.allXpath:
        xpath = eval('bk.' + e)  # look up the expression constant on the bk module
        # Expression names are encoded as "<field>_<label>"; the label picks
        # the extraction strategy below.
        field_name, label = '_'.join(e.split('_')[:-1]), e.split('_')[-1]
        temp_result = ''
        if label == 'non':  # plain xpath, first match
            if response.xpath(xpath):
                temp_result = response.xpath(xpath)[0].strip()
        elif label == 'ren':  # regex, first match
            if re.findall(xpath, res.text):
                temp_result = re.findall(xpath, res.text)[0].strip()
        elif label == 'rea':  # regex, all matches concatenated
            for each in re.findall(xpath, res.text):
                temp_result += each.strip()
        elif label == 'sub':  # xpath subtree flattened to text
            if response.xpath(xpath):
                temp_result = re.sub('\\n+', '\\n', response.xpath(xpath)[0].xpath('string(.)')).strip()
        elif label == 'sua':  # nested xpath: outer selects nodes, inner extracts
            selects, sub_selects = xpath.split('weego')[0], xpath.split('weego')[1]
            for each in response.xpath(selects):
                temp = each.xpath(sub_selects)
                if isinstance(temp, list):
                    temp_result += temp[0]
                elif isinstance(temp, str):
                    temp_result += temp
            temp_result = re.sub('\\n+', '\\n', temp_result).strip()
        elif label == 'pic':  # collect image urls via xpath
            selects, sub_selects = xpath.split('weego')[0], xpath.split('weego')[1]
            for each in response.xpath(selects):
                temp = each.xpath(sub_selects)
                pic.append(temp[0])
            temp_result = pic
        elif label == 'pir':  # collect image urls via regex
            for each in re.findall(xpath, res.text):
                pic.append(each)
            temp_result = pic
        elif label == 'xpl':  # list field: one entry per outer node
            selects, sub_selects = xpath.split('weego')[0], xpath.split('weego')[1]
            tl = []
            for each in response.xpath(selects):
                temp = re.sub('\\n+', ' - ', each.xpath(sub_selects).strip())
                tl.append(temp)
            loader.add_value(field_name.lower(), tl)
        if label != 'xpl':
            # Overwrite an earlier empty value, otherwise append a new one.
            if loader.get_collected_values(field_name.lower()):
                if loader.get_collected_values(field_name.lower())[0] == '':
                    loader.replace_value(field_name.lower(), temp_result)
            else:
                loader.add_value(field_name.lower(), temp_result)
    yield loader.load_item()
def get_news(self, response):
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value('title', response.xpath('//div[@class="text"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="text_c clearfix"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="text_c"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="d2_left wb_left fl"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="clearfix w1000_320 text_title"]/h1/text()').extract_first())
        url = response.url
        url = url[url.find('n1') + 3:url.rfind('/')]
        loader.replace_value('date', url[0:4] + '-' + url[5:7] + '-' + url[7:])
        loader.add_value('content', ''.join(response.xpath('//div[@class="text_c"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@class="text_show"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@class="show_text"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@id="p_content"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@id="rwb_zw"]/descendant-or-self::text()').extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    return loader.load_item()
def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//h1[@class="title"]/text()').extract())
    l.add_value('title', response.xpath('//span[@class="articletitle_p22"]/text()').extract())
    l.add_value('title', response.xpath('//h1[@class="tit_h2"]/text()').extract())
    l.add_value('title', response.xpath('//span[@class="gog_title"]/text()').extract())
    l.add_value('title', response.xpath('//td[@class="gog_title"]/text()').extract())
    l.add_value('date', response.xpath('//div[@class="info"]/text()').extract())
    l.add_value('date', response.xpath('//span[@class="p12 LightGray2"]/text()').extract())
    l.add_value('date', response.xpath('//div[@class="articletime"]/text()').extract())
    l.add_value('date', response.xpath('//body/table[5]/tr[5]/td[2]/div/text()').extract())
    l.add_value('date', response.xpath('//body/table[6]/tr/td/table/tr/td/table[3]/tr/td/text()').extract())
    r1 = r"\d{4}.\d{1,2}.\d{1,2}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    l.replace_value('date', date1[0])
    l.add_value('content', response.xpath('//div[@class="content"]/p/text()').extract())
    l.add_value('content', response.xpath('//td[@class="p16"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="content01 p16"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="content"]/div/p/text()').extract())
    l.add_value('content', response.xpath('//span[@class="gog_content"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="content"]/p/a/text()').extract())
    l.add_value('content', response.xpath('//td[@class="gog_content"]/p/text()').extract())
    l.add_value('content', response.xpath('//td[@class="gog_content"]/font/p/text()').extract())
    l.add_value('content', response.xpath('//td[@class="p16"]/div/p/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    l.add_value('website', self.website)
    return l.load_item()
def get_news(self, response):
    try:
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', response.xpath('//div[@class="post_content_main"]/h1/text()').extract())
        l.add_value('title', response.xpath('//div[@class="endContent"]/h1/text()').extract())
        l.add_value('title', response.xpath('//div[@class="theTitle"]/h1/text()').extract())
        l.add_value('title', response.xpath('//div[@class="ep-main-bg"]/h1/text()').extract())
        l.add_value('title', response.xpath('//div[@class="ep-content-main"]/h1/text()').extract())
        l.add_value('title', response.xpath('//div[@class="endContent bg_endPage_Lblue"]/h1/text()').extract())
        l.add_value('date', response.xpath('//div[@class="post_time_source"]/text()').extract())
        l.add_value('date', response.xpath('//span[@class="info"]/text()').extract())
        l.add_value('date', response.xpath('//div[@class="text"]/text()').extract())
        l.add_value('date', response.xpath('//div[@class="ep-time-soure cDGray"]/text()').extract())
        date = ''.join(l.get_collected_values('date'))
        date = time.strptime(date.split()[0], u'%Y-%m-%d')
        l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))
        l.add_value('content', response.xpath('//div[@class="post_text"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@class="endText"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@class="endText"]/text()').extract())
        l.add_value('content', response.xpath('//div[@class="end-text"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@class="end-text"]/div/text()').extract())
        l.add_value('content', response.xpath('//div[@id="content"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@id="endText"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@id="endText"]/div/p/text()').extract())
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        return l.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//div[@class="crumbs"]/h1/text()').extract())
    l.add_value('title', response.xpath('//div[@class="headBox"]/h1/text()').extract())
    l.add_value('title', response.xpath('//h1[@class="artTitle"]/text()').extract())
    l.add_value('title', response.xpath('//h1[@class="artiTitle"]/text()').extract())
    l.add_value('title', response.xpath('//h1[@class="artiTitle clearB"]/text()').extract())
    l.add_value('title', response.xpath('//h1[@class="c_title"]/text()').extract())
    l.add_value('title', response.xpath('//h1[@class="c_title"]/span/text()').extract())
    l.add_value('title', response.xpath('//td[@class="a4"]/text()').extract())
    l.add_value('title', response.xpath('//div[@class="Left"]/div/h1/text()').extract())
    l.add_value('title', response.xpath('//div[@class="wrapl"]/h1/text()').extract())
    l.add_value('title', response.xpath('//div[@class="big_img2"]/h1/text()').extract())
    l.add_value('title', response.xpath('//div[@id="contit"]/h1/text()').extract())
    l.add_value('title', response.xpath('//div[@class="headBox"]/div/h1/text()').extract())
    l.add_value('date', response.xpath('//div[@class="pub_date"]/span/text()').extract())
    l.add_value('date', response.xpath('//div[@class="artiInfo pub_date fl"]/span/text()').extract())
    l.add_value('date', response.xpath('//td[@class="a5"]/span/text()').extract())
    l.add_value('date', response.xpath('//span[@id="pubtime_baidu"]/text()').extract())
    l.add_value('date', response.xpath('//div[@class="wrapl"]/h3/text()').extract())
    l.add_value('date', response.xpath('//div[@class="more"]/text()').extract())
    r1 = r"\d{4}\-\d{1,2}\-\d{1,2}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    l.replace_value('date', date1[0] + " " + "00:00:00")
    l.add_value('content', response.xpath('//div[@id="articleBody"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@id="articleBody"]/p/font/text()').extract())
    l.add_value('content', response.xpath('//div[@id="artiContent"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@id="artiContent"]/p/font/text()').extract())
    l.add_value('content', response.xpath('//div[@id="c_body"]/p/text()').extract())
    l.add_value('content', response.xpath('//td[@class="a1"]/p/text()').extract())
    l.add_value('content', response.xpath('//td[@class="a1"]/p/font/text()').extract())
    l.add_value('content', response.xpath('//div[@id="artbody"]/p/span/text()').extract())
    l.add_value('content', response.xpath('//div[@id="artbody"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="artCon"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="artCon"]/text()').extract())
    l.add_value('content', response.xpath('//div[@id="box3"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="c_content"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="c_content"]/span/p/text()').extract())
    l.add_value('content', response.xpath('//div[@id="cc"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@id="content"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@id="bigpic"]/p/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    return l.load_item()
def get_news(self, response):
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value("title", response.xpath('//h1[@id="title"]/text()').extract_first())
        loader.add_value("title", response.xpath('//span[@id="title"]/text()').extract_first())
        loader.add_value("title", response.xpath('//div[@id="Title"]/text()').extract_first())
        loader.add_value("date", response.xpath('//span[@class="time"]/text()').extract_first())
        loader.add_value("date", response.xpath('//span[@id="pubtime"]/text()').extract_first())
        try:
            date = ''.join(loader.get_collected_values("date")).strip()
            if date != '':
                date = time.strptime(date, u'%Y年%m月%d日 %H:%M:%S')
                loader.replace_value("date", time.strftime("%Y-%m-%d %H:%M:%S", date))
            else:
                date = ''.join(response.xpath('//div[@class="info"]/text()').extract()).strip()
                if date != '':
                    loader.replace_value("date", date.strip()[:16] + ':00')
        except ValueError:
            # Fall back to the date embedded in the URL path before "/_c".
            end = response.url.find('/_c')
            loader.replace_value("date", response.url[end - 10:end].replace('/', '-') + " 00:00:00")
        loader.add_value("content", ''.join(response.xpath('//div[@id="content"]/descendant-or-self::text()').extract()))
        loader.add_value("content", ''.join(response.xpath('//div[@class="article"]/descendant-or-self::text()').extract()))
        loader.add_value("content", ''.join(response.xpath('//td[@class="p1"]//p/descendant::text()').extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    yield loader.load_item()
def parse_product(self, response):
    try:
        sessionid = response.meta["sessionid"]
        sessionname = response.meta["sessionname"]
        gender = response.meta["gender"]
        productlink = response.meta["productlink"]
        lproduct = ItemLoader(item=AsosProduct(), response=response)
        name = response.css("div.product-hero>h1::text").extract_first()
        code = response.css("div.product-code>span::text").extract_first()
        kind = response.css("div.product-description span strong:nth-child(1)::text").extract_first()
        brand = response.css("div.brand-description span strong::text").extract_first()
        details = response.css("div.product-description span ul>li::text").extract()
        # Fixing discrepancies in the ASOS store webpage structure
        if (kind is None) or (brand is None):
            temp = getByPos(details)
            details = temp.details
            if (kind is None) or (kind == " "):
                kind = temp.kind
            if brand is None:
                brand = temp.brand
        # Further fixing for the Parisian Petite brand
        if re.search(r"\s+$", str(kind)) is not None:
            kind = str(kind) + str(response.css("div.product-description span a>strong::text").extract_first())
        care = response.css("div.care-info span::text").extract_first()
        # Fixing discrepancies in the care section
        if care is None:
            care = response.css("div.care-info span>*::text").extract_first()
        lstimages = response.css("div.product-gallery li.image-thumbnail img::attr(src)").extract()
        images = [(img.split("?", 1)[0] + "?wid=" + str(self.imgwidth)) for img in lstimages]
        about = response.css("div.about-me span::text").extract_first()
        # Fixing discrepancies in the about section
        if about is None:
            about = response.css("div.about-me span>*::text").extract_first()
        # TODO: keep only fully populated products in order to deploy
        if name is None:
            raise AttributeError("No name")
        elif code is None:
            raise AttributeError("No code")
        elif productlink is None:
            raise AttributeError("No product link")
        elif kind is None:
            raise AttributeError("No kind")
        elif brand is None:
            raise AttributeError("No brand")
        elif details is None:
            raise AttributeError("No details")
        elif care is None:
            raise AttributeError("No care")
        elif about is None:
            raise AttributeError("No about")
        elif images is None:
            raise AttributeError("No images")
        lproduct.replace_value("name", stripSpaces(str(name)).title())
        lproduct.replace_value("code", stripSpaces(str(code)))
        lproduct.replace_value("link", stripSpaces(str(productlink)))
        lproduct.replace_value("kind", stripSpaces(str(kind)).title())
        lproduct.replace_value("brand", stripSpaces(str(brand)).title())
        lproduct.replace_value("details", [str(x).strip() for x in details])
        lproduct.replace_value("care", stripSpaces(str(care)))
        lproduct.replace_value("about", stripSpaces(str(about)))
        lproduct.replace_value("images", images)
        lproduct.replace_value("storename", self.storename.title())
        lproduct.replace_value("sessionid", str(sessionid))
        lproduct.replace_value("sessionname", stripSpaces(str(sessionname)).title())
        lproduct.replace_value("gender", stripSpaces(str(gender)).title())
        iid = re.findall("iid=[0-9]+", productlink)[0][4:]
        pricelink = ("http://www.asos.com/api/product/catalogue/v2/stockprice?"
                     "productIds=" + iid + "&store=COM&currency=GBP")
        requestprice = response.follow(pricelink, callback=self.parse_price)
        requestprice.meta["lproduct"] = lproduct
        yield requestprice
    # Exception for products that have other products inside, like suits and vests
    except AttributeError as e:
        self.logger.info(str(e))
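
# parse_product hands its half-filled loader to parse_price through
# request.meta (parse_price above reads response.meta["lproduct"]). In current
# Scrapy the same hand-off is usually written with cb_kwargs, which keeps the
# loader out of the meta dict. A sketch under the same names; request_price is
# a hypothetical helper, and parse_price would then accept lproduct as a
# keyword argument instead of reading response.meta.
def request_price(self, response, lproduct, iid):
    """Chain the price request, passing the loader via cb_kwargs."""
    pricelink = ("http://www.asos.com/api/product/catalogue/v2/stockprice?"
                 "productIds=" + iid + "&store=COM&currency=GBP")
    return response.follow(pricelink, callback=self.parse_price,
                           cb_kwargs={"lproduct": lproduct})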