Example #1
def parse_titles(self, response):
     loader = ItemLoader(item=BlogCategory(), response=response)
     loader.add_value('hub', response.meta['hname'])
     loader.add_css('title', 'div.company_post h1 span::text')
     loader.add_css('date', 'div.published::text')
     loader.add_css('article', 'div.content::text')
     yield loader.load_item()
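A minimal sketch of the BlogCategory item this loader implies; the field list matches the example, but the processor choices are assumptions rather than the original project's code:

import scrapy
from scrapy.loader.processors import Join, MapCompose, TakeFirst

class BlogCategory(scrapy.Item):
    hub = scrapy.Field(output_processor=TakeFirst())
    title = scrapy.Field(input_processor=MapCompose(str.strip),
                         output_processor=TakeFirst())
    date = scrapy.Field(output_processor=TakeFirst())
    article = scrapy.Field(input_processor=MapCompose(str.strip),
                           output_processor=Join())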
Example #2
	def parse_item(self, response):
		l = ItemLoader(item=PageItem(), response=response)
		l.add_value('title', response.request.cookies['title'])
		l.add_value('url', response.url)
		l.add_value('name', self.name)
		l.add_xpath('image_urls', '//div[@class="l_effect_img_mid"]/a/img/@src')
		return l.load_item()
Example #3
	def parse_item(self, response):
		l = ItemLoader(item=MeizituItem(), response=response)
		l.add_xpath('name', '//h2/a/text()')
		l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta  clearfix']/div[@class='metaRight']/p")
		l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
		l.add_value('url', response.url)
		return l.load_item()
Example #4
 def _parse(self, response):
     l = ItemLoader(item=BookmarksItem(), response=response)
     l.add_xpath(u"name", u"/html/head/title")
     l.add_xpath(u"anchors", u"//a/@href")
     l.add_xpath(u"description", u"/html/body/text()")
     l.add_value(u"last_updated", datetime.datetime.now())  # you can also use literal values
     return l.load_item()
Example #5
	def parse_item(self, response):
		l = ItemLoader(item=PageItem(), response=response)
		l.add_value('title', response.request.cookies['title'])
		l.add_value('name', self.name)
		l.add_value('url', response.url)
		l.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
		return l.load_item()
Example #6
    def parse_rate(self, response):

        loader = ItemLoader(item=RateItem(), response=response)

        for attr, xpath in self.settings.getdict('RATE_XPATH').items():
            loader.add_xpath(attr, xpath)

        return loader.load_item()
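This spider reads its XPaths from the RATE_XPATH project setting, so fields can be reconfigured without touching the code; a hypothetical settings.py entry (the field names and paths are illustrative only):

RATE_XPATH = {
    'currency': '//table[@id="rates"]//tr/td[1]/text()',
    'buy': '//table[@id="rates"]//tr/td[2]/text()',
    'sell': '//table[@id="rates"]//tr/td[3]/text()',
}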
Example #7
    def parse(self, response):
        for item in self.find_items(response):
            loader = ItemLoader(item=self.item_class())
            for target in self.get_targets():
                loader.add_value(target.name, target.get_value(item, response))

            val = self.Meta.detail_path.get_value(item, response)
            yield gen_request(val, self.parse_details, loader.load_item())
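gen_request is a helper that does not appear in this example; a plausible sketch (the name and meta key are assumptions, not confirmed by the source) is a thin wrapper that forwards the partially loaded item to the detail callback:

import scrapy

def gen_request(url, callback, item):
    # carry the half-filled item along so the detail callback can finish it
    return scrapy.Request(url, callback=callback, meta={'item': item})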
Example #8
    def parse_content(self,response):
        bbsItem_loader = ItemLoader(item=BbsDmozItem(),response = response)
        url = str(response.url)
        bbsItem_loader.add_value('url',url)
        bbsItem_loader.add_xpath('forum',self._x_query['forum'])
        bbsItem_loader.add_xpath('poster',self._x_query['poster'])
        bbsItem_loader.add_xpath('content',self._x_query['page_content'])

        return bbsItem_loader.load_item()
Example #9
 def test_load_item_using_default_loader(self):
     i = TestItem()
     i['summary'] = u'lala'
     il = ItemLoader(item=i)
     il.add_value('name', u'marta')
     item = il.load_item()
     assert item is i
     self.assertEqual(item['summary'], u'lala')
     self.assertEqual(item['name'], [u'marta'])
Example #10
    def parse(self, response):
        item = Item()
        l = ItemLoader(item=item, response=response)
        for name, xpath in response.meta['fields'].iteritems():
            if xpath:
                item.fields[name] = Field()
                l.add_xpath(name, xpath)

        return l.load_item()
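The field map arrives through request meta, which is why the callback can create Fields on the fly with item.fields[name] = Field(); a minimal sketch of a request that would feed it (the URL and XPaths are illustrative):

yield Request(
    'http://example.com/listing',
    meta={'fields': {
        'title': '//h1/text()',
        'price': '//span[@class="price"]/text()',
    }},
    callback=self.parse,
)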
Example #11
    def parse_detail(self, response):
        il = ItemLoader(NewsItem(), response=response)

        il.add_css("title", "%s::text" % self.title)
        il.add_css("date", "%s::text" % self.date)
        il.add_css("auth", "%s::text" % self.auth)
        il.add_css("content", "%s > p::text" % self.content)
        il.add_value("cate", response.meta["cate"])
        return il.load_item()
Example #12
    def parse(self, response):
        l = ItemLoader(item=PlantItem(), response=response)

        l.add_xpath('name', "//div[@id='bodycontent']/div[@class='post']/div[@class='pagebanner']/h2/text()")
        l.add_xpath('species', "//div[@id='bodycontent']/div[@class='post']/div[@class='pagebanner']/div[@class='clear resultSpecies']/text()")
        l.add_xpath('key', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-key']/text()")
        l.add_xpath('value', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-value']/child::node()")
        # l.add_xpath('value', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-value']/a/text()")

        return l.load_item()
Example #13
    def parse_stuff(self, response):
        hxs = Selector(response)
        sites = hxs.xpath('//body')
        items_main = []

        for site in sites:
            loader = ItemLoader(item=Items_Main(), response=response)
            loader.add_xpath('fragment', '//*[not(self::script)]/text()')
            items_main.append(loader.load_item())
        return items_main
Example #14
    def parse(self, response):
        l = ItemLoader(item=UniprotItem(), response=response)
        l.add_xpath('proteinName', "//*[@id='page-header']/h2/span/text()")
        l.add_value('uniprotAccession', response.url)
        l.add_xpath('uniprotProteinLength', "//*[@id='sequences-section']/div[1]/div[2]/div[1]/span[2]/text()")
        listing = response.xpath("//*[@id='subcellular_location']/div[1]/ul")
        subcellular_location = []
        for li in listing:
            subcellular_location.append(li.xpath("./li/a/text()").extract())
        l.add_value('uniprotLocalization', subcellular_location)

        yield l.load_item()
Example #15
    def parse_event_detail(self, response):
        event = response.meta['event']
        events = response.meta['events']
        players = response.xpath('//table[@class="sticky-enabled"]/tbody/tr')
        event_loader = ItemLoader(event)

        for player in players:
            event_loader.add_value(
                'players', player.xpath('td/text()').extract())

        events.append(event_loader.load_item())
        return events
Example #16
 def print_url(self, response):
     """
         @url http://www.ura.org.hk/en/schemes-and-policies/redevelopment/ura-implemented-projects/reimbursement.aspx
         @returns items 1 1
         @returns requests 0 0
         @scrapes title link html text last_updated file_urls
     """
     l = ItemLoader(item=UrbanRenewalItem(), response=response)
     l.add_xpath('title', '//title')
     l.add_value('link', response.url)
     l.add_xpath('text', '//div[@id="content"]')
     l.add_xpath('last_updated', '//div[@class="lastUpdated"]')
     return l.load_item()
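The docstring above is a Scrapy spider contract: @url names the page to test against, @returns items 1 1 asserts exactly one item and @returns requests 0 0 asserts no follow-up requests, and @scrapes lists fields the scraped item must populate. Contracts are run from the command line with:

scrapy check <spider_name>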
Example #17
    def parse(self, response):
        l=ItemLoader(item=RentalItem(),response=response)
        l.add_xpath('price','//*[(@id = "main-info")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-big", " " )) and contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()')
        l.add_xpath('adress','//*[(@id = "addressPromo")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()')
        l.add_value('url', response.url)

        return l.load_item()
Example #18
    def parse(self, response):
        sel = Selector(response)
        last_page = sel.xpath('//span[@class="step-links"]/a/text()')[-1].extract()
        self.num_page = int(last_page)

        loader = ItemLoader(item=User(), response=response)
        loader.add_value('uid', self.uid)
        loader.add_xpath('name', '//a[@class="username"]/text()')

        for i in range(1, self.num_page + 1):
            url = self.start_urls[0] + '/' + str(i)
            yield Request(url,
                          callback=self.parse_list,
                          meta={'loader': loader})
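parse_list is not shown here; a plausible sketch (the field name and selector are assumptions) of a callback that keeps filling the loader it receives through meta:

def parse_list(self, response):
    loader = response.meta['loader']
    loader.add_value('posts', response.css('div.post-content::text').extract())
    # emits the item as collected so far; a real spider would likely
    # only do this once the final page has been visited
    return loader.load_item()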
Example #19
    def parse(self, response):
        if not self.fields:
            # init database fields from saved state
            self.fields = self.next_window()

        search_fos = urlparse.parse_qs(urlparse.urlparse(response.url).query)['mauthors'][0].split(':')[1]
        self.logger.debug('Search fos: %s' % search_fos)
        # get the first 10 author divs
        for divs in response.xpath('//div[@class="gsc_1usr gs_scl"]')[0:10]:
            user = divs.extract()
            # Content in the img's alt tag is the actual name, shown on the profile
            # However, the name in the actual link differs sometimes slightly
            # EH Roberts (link) instead of E H Roberts (on profile + alt)

            id = re.search('citations\?user=([^&]+)(&|)',user)
            name = re.search('alt="([^"]+)"', user)
            citecount = re.search('<div class="gsc_1usr_cby">.*([0-9]+)</div>', user)
            fostmp = re.findall('label:([^"]+)("|)', user)
            fos = [i[0] for i in fostmp]
            if id and name:
                item = ItemLoader(item=AuthorItem(), response=response)
                item.add_value('fos', fos)
                item.add_value('id', id.group(1))
                item.add_value('name', name.group(1))

                # unknown citation count:
                cited = citecount.group(1) if citecount else None
                item.add_value('cited', cited)
                yield item.load_item()
            # Also scrape field of studies while we are at it
            for f in fos:
                if f != search_fos:
                    fos_item = FOSItem()
                    fos_item['field_name'] = f
                    yield fos_item
        # generate  next url
        new1 = response.xpath('//*[@id="gsc_authors_bottom_pag"]/span/button[2]').extract_first()
        if new1:
            new2 = re.search('mauthors(.*)\'"', new1)
            if new2:
                newUrl = str(new2.group(1)).replace('\\x3d','=').replace('\\x26', '&')
                newUrl = 'https://scholar.google.de/citations?view_op=search_authors&hl=de&mauthors' + newUrl
                self.container.append(newUrl)

        # proceed with another random url or label to randomize access pattern to gscholar
        next_url = self.choose_next()

        if next_url:
            yield Request(url=next_url)
Example #20
	def parse_first_page(self, response):
		count = int(response.xpath('//div[@id="aplist"]/ul/li[1]/a/text()')[0].re(r'.*?(\d+).*?')[0])
		title = response.request.cookies['title']
		albumURL = response.url.replace(".html", '')
		for x in xrange(1,count+1):
			suffix = ".html"
			if x > 1:
				suffix = "_"+str(x)+".html"
				request = scrapy.Request(albumURL+suffix, callback=self.parse_item, cookies={'title': title})
				yield request
		l = ItemLoader(item=PageItem(), response=response)
		l.add_value('title', title)
		l.add_value('name', self.name)
		l.add_value('url', response.url)
		l.add_xpath('image_urls', '//p[@id="contents"]/a/img/@src')
		yield l.load_item()
Example #21
    def parse_content(self, response):
        logger.info('Dealing with images: %s', response.url)
        item_load = ItemLoader(item=ScrapyMeizituItem(), response=response)
        item_load.add_value('url', response.url)
        item_load.add_xpath('name', self._x_query['name'])
        item_load.add_xpath('tags', self._x_query['tags'])
        item_load.add_xpath('image_urls', self._x_query['image_urls'])

        return item_load.load_item()
Example #22
    def parse_depth_chart(self, response):
        loader = ItemLoader(item=NFL_Team_2015(), response=response)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        loader.add_xpath("division", '//*[@id="sub-branding"]/div[2]/text()')
        loader.add_xpath("name", '//*[@id="sub-branding"]/h2/a/b/text()')

        yield loader.load_item()
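The two defaults set here shape every field: MapCompose(unicode.strip) runs over each extracted string on the way in, and Join() concatenates the collected values with spaces on the way out. Roughly:

from scrapy.loader.processors import Join, MapCompose

MapCompose(unicode.strip)([u'  NFC East '])  # -> [u'NFC East']
Join()([u'New York', u'Giants'])             # -> u'New York Giants'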
Example #23
 def _set_loader(self, response, from_page, xs, item):
     self.from_page = from_page
     rpt = self.scraper.get_rpt(from_page)
     if not self.from_page == 'MP':
         item = response.request.meta['item']
         if rpt.content_type == 'J':
             json_resp = json.loads(response.body_as_unicode())
             self.loader = JsonItemLoader(item=item, selector=json_resp)
         else:
             self.loader = ItemLoader(item=item, response=response)
     else:
         if rpt.content_type == 'J':
             self.loader = JsonItemLoader(item=item, selector=xs)
         else:
             self.loader = ItemLoader(item=item, selector=xs)
     self.loader.default_output_processor = TakeFirst()
     self.loader.log = self.log
Example #24
    def parse(self,response):
        l = ItemLoader(item = NytimesItem(),response = response)
        l.add_xpath('topnews','//*[contains(@id,"topnews-100")]/h2/a/text()')
        l.add_xpath('sectionnews','//h3[contains(@class,"story-heading")]/text()')
        #print(type(l.load_item()))
        x = l.load_item()
        #print(len(x['date']),len(x['topnews']),len(x['sectionnews']))
        nytdict = dict()
        today = datetime.date.today()
        topnewslist = []
        sectionnewslist = []
        nytdict['date'] = str(today)

        for t in x['topnews']:
            topnewslist.append(str(t.encode('ascii','ignore')))
        nytdict['topnews']=topnewslist

        for t in x['sectionnews']:
            sectionnewslist.append(str(t.encode('ascii','ignore')).strip())
        nytdict['sectionnews']=sectionnewslist

        filename = datetime.date.today()
        with open('{}.json'.format(filename), 'w') as f:
            json.dump(nytdict, f)
        return l.load_item()
Example #25
    def parse(self, response):

        for outer in response.css('#comapreTable tr:not(:first-child)'):

            if outer.css('td[align="center"]'):
                ccode = outer.css('td[align="center"]>a::attr(id)').extract_first()
                cname = outer.css('td[align="center"]>a::text').extract_first()

            for inner in outer.xpath('td[div[@align="left"]/a]'):
                loader = ItemLoader(item=EolZhuanyeItem(), selector=inner)
                loader.add_value('ccode', ccode)
                loader.add_value('cname', cname)
                loader.add_css('url', 'a::attr(href)', lambda urls: urljoin(self.start_urls[0], urls[0]))
                loader.add_xpath('code', 'following-sibling::td[1]/text()', MapCompose(unicode.strip))
                loader.add_css('name', 'a::text', MapCompose(unicode.strip))
                item = loader.load_item()

                yield Request(url=item['url'][0], meta={'item': item}, callback=self.parse_item)
Example #26
 def parse_CatalogRecord(self, response):
     CatalogRecord = ItemLoader(item=catalogscraperItem(), response=response)
     CatalogRecord.default_output_processor = TakeFirst()
     keywords = '|'.join(r"\b" + re.escape(word.strip()) + r"\b" for word in open('Catalog_Scraper/spiders/keys.txt'))
     r = re.compile('.*(%s).*' % keywords, re.IGNORECASE|re.MULTILINE|re.UNICODE)
     if r.search(response.body_as_unicode()):
         # The following lines tell the spider how to populate the fields defined in "items.py".
         # The first argument of "CatalogRecord.add_xpath" indicates which field to fill; the
         # second is an XPath directing the spider to where the relevant information is on a given webpage.
         CatalogRecord.add_xpath('title', './/div[@id="dublin-core-title"]/div[@class="element-text"]/text()')
         # CatalogRecord.add_xpath('subject', '')
         # CatalogRecord.add_xpath('description', '')
         # CatalogRecord.add_xpath('creator', '')
         # CatalogRecord.add_xpath('source', '')
         # CatalogRecord.add_xpath('published', '')
         # CatalogRecord.add_xpath('published', '')
         # CatalogRecord.add_xpath('rights', '')
         # CatalogRecord.add_xpath('citation', '')
         # CatalogRecord.add_xpath('url', '')
         return CatalogRecord.load_item()
Example #27
    def parse_details(self, response):

        item = response.meta["item"]
        urlLast = response.meta["urlLast"]

        loader = ItemLoader(item,response=response)
        loader.add_xpath("Description","//*[@id='body']/p[3]/text()")
        loader.add_xpath("Education","//td[. = 'Education Level (Highest Grade Completed)']/following-sibling::td[1]/text()")

        if urlLast.endswith("no_last_statement.html"):
            loader.add_value('Message',u'')
            return loader.load_item()
        else:
            request = scrapy.Request(urlLast, meta={"item": loader.load_item()}, callback=self.parse_details2)
            return request
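parse_details2 does not appear in this example; a plausible sketch (the XPath is an assumption) that resumes loading the item carried through meta and fills in the last statement:

def parse_details2(self, response):
    loader = ItemLoader(item=response.meta['item'], response=response)
    loader.add_xpath('Message', "//*[@id='body']/p/text()")
    return loader.load_item()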
Example #28
	def parse_first_page(self, response):
		count = int(response.xpath('//ul[@class="image"]/text()')[0].re(r'.*?(\d+).*?')[0])
		title = response.request.cookies['title']
		albumURL = response.url.replace(".shtml", '')
		# print u'', count, title, albumURL
		for x in xrange(1,count+1):
			suffix = ".shtml"
			if x > 1:
				suffix = "_"+str(x)+".shtml"
				# print u'',albumURL+suffix
				request = scrapy.Request(albumURL+suffix, callback=self.parse_item, cookies={'title': title})
				yield request
				
		l = ItemLoader(item=PageItem(), response=response)
		l.add_value('title', title)
		l.add_value('name', self.name)
		l.add_value('url', response.url)
		l.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
		yield l.load_item()
Example #29
    def parse(self, response):
        def strip_dollar(x):
            return x.strip('$')

        self.driver.get(response.url)
        try:
            WebDriverWait(self.driver, 15).until(
                EC.presence_of_element_located(
                    (By.XPATH,
                        '//*[@id="depart-container"]/div[2]/div[1]/div[@style="width: 0%;"]')))
        except TimeoutException:
            print 'Page load timed out'

        while True:
            try:
                try:
                    WebDriverWait(self.driver, 15).until(
                        EC.presence_of_element_located(
                            (By.XPATH,
                                '//*[@id="depart-container"]/div/div/div/button')))
                except TimeoutException:
                    break

                next = self.driver.find_element_by_xpath(
                    '//*[@id="depart-container"]/div/div/div/button')
                next.click()

            except ElementNotVisibleException:
                break
        for trips in Selector(
                text=self.driver.page_source).xpath(self.trips_list_xpath):
            loader = ItemLoader(BusTrip(), selector=trips)

            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            loader.price_in = MapCompose(strip_dollar)


            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)
            dateoftrip = str(response.url).split("/")[-1]
            loader.add_value('dateoftrip', dateoftrip.decode('unicode-escape'))
            yield loader.load_item()
Example #30
    def parse_item(self, response):
        l = ItemLoader(item=GetEmailsItem(), response=response)
        l.default_output_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)

        emails = response.xpath('//text()').re(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}")

        l.add_value('email', emails)
        l.add_value('url', response.url)

        return l.load_item()
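replace_escape_chars comes from w3lib (a Scrapy dependency), so this snippet assumes:

from w3lib.html import replace_escape_chars

MapCompose applies its functions left to right: each value is stripped first and then has \n, \t and \r removed.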
Example #31
    def parse(self, response):

        productos = response.css('div.product-tile-inner')
        promedio = 0.0
        num_items = 0
        for prod in productos:
            text_price = prod.css('.price::attr(data-bind)')
            precio = str(text_price).replace(").formatMoney(2, '.', '\">]","").replace(").formatMoney(2, '.', ',\">]","").replace("[<Selector xpath=\"descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' price ')]/@data-bind\" data=\"text:'$' + (","")
            try:
                promedio = promedio + float(precio)
                num_items = num_items +1
            except ValueError:
                print(precio)

        for producto in productos:
            existe_producto = len( producto.css('div.detail'))
            if(existe_producto > 0):
                # titulo = producto.css('a.name::text')
                # url = producto.xpath('//div[contains(@class,"detail")]/a[contains(@class,"image")]/img[contains(@id,"gImg")]/@src')
                producto_loader = ItemLoader(
                    item = ProductoFybeca(),
                    selector = producto
                )
                
                producto_loader.default_output_processor = TakeFirst()

                producto_loader.add_css(
                    'titulo',
                    'a.name::text'
                    )
                
                producto_loader.add_xpath(
                    'imagen',
                    'div[contains(@class,"detail")]/a[contains(@class,"image")]/img[contains(@id,"gImg")]/@src'
                )
                producto_loader.add_value(
                    'promedio',
                    promedio/num_items
                )

                producto_loader.add_css(
                    'precio',
                    '.price::attr(data-bind)'
                )

                #producto_imprimir = producto_loader.load_item()
                #print(producto_imprimir)
                yield producto_loader.load_item()
Example #32
    def parse(self, response):
        # Bing serves 10 results per page
        for i in range(1, 11):
            for row in response.xpath("//li[@class='b_algo'][%s]" % i):
                l = ItemLoader(item=CandcrawlerItem(), selector=row)
                l.add_xpath("headline", "h2//text()")
                l.add_xpath(
                    "metadata",
                    "div[@class='b_caption']/div[@class='b_factrow b_twofr']/div[@class='b_vlist2col']/ul/li/div//text()"
                )
                l.add_xpath(
                    "li_url",
                    "div[@class='b_caption']/div[@class='b_attribution']/cite/text()"
                )
                l.add_xpath("summary", "div[@class='b_caption']/p//text()")
                l.add_xpath("search",
                            "//div[@class='b_searchboxForm']/input/@value")
                l.add_value("link", response.request.url)
                #this is to get only the LinkedIn results
                if 'linkedin.com/in' in response.xpath(
                        "//li[@class='b_algo'][%s]/div[@class='b_caption']/div[@class='b_attribution']/cite/text()"
                        % i).get():
                    yield l.load_item()

        next_page = response.xpath(
            "//li[@class='b_pag']/nav/ul/li/a[@aria-label='Page 2']/@href"
        ).get()
        if next_page is not None:
            next_page = "http://www.bing.com" + next_page
            yield response.follow(next_page, callback=self.parse)
Example #33
    def parse(self, response, **kwargs):
        loader = ItemLoader(item=YelpItem(), response=response)
        for script in response.css('script').getall():
            if '{"gaConfig' in script:
                detail_json = json.loads(re.search(r'({"gaConfig.*?)-->', script).group(1))
        loader.add_value('direct_url', detail_json['staticUrl'])
        loader.add_value('business_id', detail_json['bizDetailsPageProps']['bizContactInfoProps']['businessId'])
        loader.add_value('categories', detail_json['gaConfig']['dimensions']['www']['second_level_categories'][1])
        if detail_json['bizDetailsPageProps']['bizContactInfoProps']['businessWebsite']:
            loader.add_value('site', detail_json['bizDetailsPageProps']['bizContactInfoProps']['businessWebsite']['linkText'])
        loader.add_value('title', detail_json['bizDetailsPageProps']['businessName'])
        loader.add_value('review_count', detail_json['bizDetailsPageProps']['ratingDetailsProps']['numReviews'])
        #TODO: find way to not use hardcoded documentIds
        post_data = [{"operationName":"getLocalBusinessJsonLinkedData","variables":{"BizEncId": "".join(loader.get_output_value('business_id'))},"extensions":{"documentId":"1cf362b8e8f9b3dae26d9f55e7204acd8355c916348a038f913845670139f60a"}}]

        yield scrapy.Request('https://www.yelp.com/gql/batch', method='POST', body=json.dumps(post_data),
            headers={'Content-Type': 'application/json'}, callback=self.linkedData, meta={'item': loader.load_item()})
Example #34
 def parse_item(self, response):
     loader = ItemLoader(item=SpiderItem(), response=response)
     content = ''
     try:
         title = response.xpath(r'//*[@class="dbt"]//text()').extract()
         date = response.xpath(r'//*[@class="lf"]//text()').extract_first()
         if date is not None:
             date = date.split(" ")[0]
         else:
             date = '1970-01-01'
         content = response.xpath(
             r'//*[@class="nra"]//text() | //*[@class="bzzx_xjnr"]//text()'
         ).extract()
         loader.add_value('date', date)
         loader.add_value('title', title)
         loader.add_value('content', content)
     except Exception as e:
         self.logger.error('error url: %s error msg: %s' %
                           (response.url, e))
         loader.add_value('date', '1970-01-01')
         loader.add_value('title', 'unknown')
         loader.add_value('content', '')
     finally:
         self.logger.info("crawled url: %s" % response.url)
         loader.add_value('url', response.url)
         loader.add_value('collection_name', self.name)
         loader.add_value("website", self.website)
         if content == '':
             self.logger.warning(' url: %s msg: %s' %
                                 (response.url, ' content is None'))
         yield loader.load_item()
Example #35
    def populate_item(self, selector, url):
        item_loader = ItemLoader(item=MediumScraperItem(), selector=selector)
        item_loader.default_output_processor = TakeFirst()
        item_loader.add_xpath('author',
                              './/a[@data-action="show-user-card"]/text()')
        item_loader.add_xpath('title',
                              './/*[contains(@class, "title")]/text()')
        item_loader.add_xpath('title',
                              './/h3[contains(@class, "title")]/*/text()')
        item_loader.add_xpath('subtitle_preview',
                              './/*[@name="previewSubtitle"]/text()')
        item_loader.add_xpath(
            'collection', './/a[@data-action="show-collection-card"]/text()')
        item_loader.add_xpath('read_time', './/*[@class="readingTime"]/@title')
        item_loader.add_xpath(
            'claps', './/button[@data-action="show-recommends"]/text()')
        item_loader.add_xpath(
            'responses',
            './/a[@class="button button--chromeless u-baseColor--buttonNormal"]/text()'
        )
        item_loader.add_xpath('published_date', './/time/text()')
        item_loader.add_xpath(
            'article_url', './/a[contains(@class, "button--smaller")]/@href')
        item_loader.add_value('scraped_date', datetime.now())

        return item_loader.load_item()
Example #36
    def parse(self, response):
        sites = response.xpath('//table/tbody/tr')
        for site in sites:

            url = urljoin(response.url,
                          site.xpath("td[2]/a/@href").extract_first())
            urlLast = urljoin(response.url,
                              site.xpath("td[3]/a/@href").extract_first())
            item = DeathItem()
            loader = ItemLoader(item, selector=site)
            loader.add_xpath('Mid', 'td[1]/text()')
            loader.add_xpath('firstName', 'td[5]/text()')
            loader.add_xpath('lastName', 'td[4]/text()')
            loader.add_xpath('Date', 'td[8]/text()')
            loader.add_xpath('Race', 'td[9]/text()')
            loader.add_xpath('County', 'td[10]/text()')
            loader.add_xpath('Age', 'td[7]/text()')
            loader.add_value('OILink', url)
            loader.add_value('OLastStatement', urlLast)

            if url.endswith(("jpg", "no_info_available.html")):
                loader.add_value('Description', u'')
                loader.add_value('Education', u'')
                if urlLast.endswith("no_last_statement.html"):
                    loader.add_value('Message', u'')
                    yield loader.load_item()
                else:
                    request = scrapy.Request(urlLast,
                                             meta={"item": loader.load_item()},
                                             callback=self.parse_details2)
                    yield request
            else:
                request = scrapy.Request(url,
                                         meta={
                                             "item": loader.load_item(),
                                             "urlLast": urlLast
                                         },
                                         callback=self.parse_details)
                yield request
Example #37
    def parse_profile(self, response: Response):
        query = urllib.parse.urlparse(response.url).query
        id = int(urllib.parse.parse_qs(query)['id'][0])

        l = ItemLoader(item=UserItem(), response=response)
        l.add_value('id', id)
        l.add_xpath(
            'name',
            '//div[@id="viewprofile"]//td[@id="profile-left"]/li[@id="profile-name"]/strong/text()'
        )
        l.add_xpath(
            'avatar_url',
            '//div[@id="viewprofile"]//td[@id="profile-left"]//img//@src')
        l.add_xpath(
            'registration_date',
            '//div[@id="viewprofile"]//td[@id="profile-right"]//li/span[text()="Зарегистрирован:"]/following-sibling::strong/text()'
        )
        return l.load_item()
Example #38
    def parse_item(self, response):
        l = ItemLoader(item=SpiderItem(), response=response)
        try:
            for attr in ['title', 'date', 'content']:
                function = getattr(self, 'get' + attr, None)
                if function:
                    l.add_value(attr, function(response))
                else:
                    self.logger.error('no method for %s' % attr)

        except Exception as e:
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
        finally:
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()
Example #39
    def parse(self, response):
        curr_url = response.url
        key = get_schemenetloc(curr_url)
        if key in self.websites:
            rule = self.websites[key]
            item = ItemLoader(item=GovernwebcrawlerItem(), response=response)
            root = rule['root_div']
            title = rule['title']
            content = rule['content']
            time = rule['time']
            desc = rule['desc']
            item.add_xpath('title', root + title)
            item.add_xpath('time', root + time)
            item.add_xpath('content', root + content)
            item.add_value('url', curr_url)
            item.add_value('desc', desc)

            yield item.load_item()

        body = response.body
        content = body.decode('utf8', errors='ignore')

        results = Selector(text=content).xpath('//a').extract()
        for res in results:
            sel = Selector(text=res)
            url = sel.xpath('//a/@href').extract()
            name = sel.xpath('//a/text()').extract()
            if len(url) != 0:
                url = urljoin(curr_url, url[0])
                req = Request(url=url, callback=self.parse)
                if not url.endswith('.html'):
                    req.meta['PhantomJS'] = True
                yield req
Example #40
 def parse_goods(self, response: HtmlResponse):
     loader = ItemLoader(item=LeroyItem(), response=response)
     loader.add_xpath('name', '//h1[@class="header-2"]/text()')
     loader.add_xpath('photos',
                      '//uc-pdp-media-carousel//img[@slot="thumbs"]/@src')
     loader.add_xpath('params', '//dl[@class="def-list"]/div')
     loader.add_value('url', response.url)
     loader.add_xpath('price', '//span[@slot="price"]/text()')
     yield loader.load_item()
Example #41
 def item_parse(self, response: HtmlResponse):
     loader = ItemLoader(item=LeroymerlinItem(), response=response)
     loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
     loader.add_xpath('parameters', '//div[@class="def-list__group"]')
     loader.add_xpath('photos', '//img[@itemprop="image"]/@src')
     loader.add_xpath('price', '//span[@slot="price"]/text()')
     loader.add_value('link', response.url)
     yield loader.load_item()
Example #42
 def parse_author(self, response):
     quote_item = response.meta['quote_item']
     loader = ItemLoader(item=quote_item, response=response)
     loader.add_css('author_name', '.author-title::text')
     loader.add_css('author_birthday', '.author-born-date::text')
     loader.add_css('author_bornlocation', '.author-born-location::text')
     loader.add_css('author_bio', '.author-description::text')
     yield loader.load_item()
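quote_item arrives through response.meta, so the author fields are merged into an item started on the listing page; a plausible originating request (the selector and loader name are assumptions):

yield response.follow(
    quote.css('span a::attr(href)').get(),
    callback=self.parse_author,
    meta={'quote_item': quote_loader.load_item()},
)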
Example #43
    def parse_content(self, response):
        print('in parseMore')

        def deal_publish_time(publish_time_raw):
            if type(publish_time_raw) == type([]):
                publish_time_raw_str = publish_time_raw.pop()
            else:
                publish_time_raw_str = publish_time_raw
            time_splited = publish_time_raw_str.split(',')
            year = str(time_splited[1]).strip()
            mounth_day = time_splited[0].split(' ')
            day = str(mounth_day[1]).strip()
            mounth = mounth_day[0]

            mounth_dict = {
                u'一月': '01',
                u'二月': '02',
                u'三月': '03',
                u'四月': '04',
                u'五月': '05',
                u'六月': '06',
                u'七月': '07',
                u'八月': '08',
                u'九月': '09',
                u'十月': '10',
                u'十一月': '11',
                u'十二月': '12',
            }

            mounth_num_str = mounth_dict[mounth]

            if len(day) < 2:
                day = '0' + day

            publish_time_dealed = year + '-' + mounth_num_str + '-' + day + ' 00:00:00'
            return publish_time_dealed

        def deal_publish_user(publish_user_raw):
            if type(publish_user_raw) == type([]):
                if publish_user_raw:
                    publish_user_name = publish_user_raw.pop()
                else:
                    publish_user_name = ''
            else:
                publish_user_name = publish_user_raw
            return publish_user_name.strip()

        def deal_read_count(read_count_raw):
        if read_count_raw:  # this is always a list object
                read_count_str = read_count_raw.pop()
                read_count_str = str(read_count_str)
                read_count = str(read_count_str).replace('阅读次数:',
                                                         '').replace(',', '')
                return int(read_count)
            else:
                return 0

        loader1 = ItemLoader(response=response, item=YfspiderspeakItem())
        loader1.add_value('url', response.url)
        loader1.add_value('id', response.url.split('/')[-1])
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath(
            'title', '//div[@id="main"]//h1[@class="entry-title"]/text()',
            lambda x: x[0].strip())
        loader1.add_xpath(
            'content',
            '//div[@id="main"]//div[@class="entry-content"]//text()',
            lambda x: ''.join([oneP.strip() for oneP in x]))
        loader1.add_xpath('publish_time',
                          '//div[@id="main"]//span[@class="date"]/text()',
                          deal_publish_time)
        loader1.add_xpath('publish_user',
                          '//div[@id="main"]//span[@class="author"]//text()',
                          deal_publish_user)
        loader1.add_value(
            'read_count',
            response.xpath(
                "//div[@id='content']/article/div[contains(@class,'tags')]//text()"
            ).re(u'阅读次数\:(.*)'), deal_read_count)
        loader1.add_xpath(
            'video_urls',
            '//div[@id="main"]//div[@class="entry-content"]//iframe/@src')
        loader1.add_xpath(
            'img_urls',
            '//div[@id="main"]//div[@class="entry-content"]//img/@src')

        item1 = loader1.load_item()
        return item1
Example #44
    def parse_content_english(self, response):
        def deal_publish_time(publish_time):
            if publish_time:
                publish_time_split = publish_time[0].strip().split('/')
                return publish_time_split[2] + '-' + publish_time_split[
                    1] + '-' + publish_time_split[0]
            else:
                return None

        loader1 = ItemLoader(response=response, item=YfspiderspeakItem())
        loader1.add_value('url', response.url)
        loader1.add_value('id', response.url.split('/')[-1])
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath(
            'title',
            '//div[@id="container"]//h1[@class="entry-title"]//text()',
            lambda x: x[0].strip())
        loader1.add_xpath(
            'content',
            '//div[@id="container"]//div[@class="entry-content"]//p//text()',
            lambda x: ''.join([oneP.strip() for oneP in x]))

        loader1.add_xpath(
            'publish_time',
            '//div[@id="container"]//div[@class="entry-meta"]//span[@class="entry-date"]/text()',
            deal_publish_time)
        loader1.add_xpath(
            'publish_user',
            '//div[@id="container"]//div[@class="entry-meta"]//span[@class="author vcard"]/a/text()',
            lambda x: x[0].strip() if x else None)
        loader1.add_value(
            'read_count',
            response.xpath('//div[@id="content"]/text()').re('^\s*\d+\s*'),
            lambda x: x[0].strip() if x else 0)

        item1 = loader1.load_item()
        return item1
Example #45
    def parse_lot(self, response):

        l = ItemLoader(item=LarsenDelpetersonItem(), response=response)
        l.default_output_processor = TakeFirst()

        l.add_xpath('LotNum', '//h1/text()')
        l.add_xpath(
            'LotDescription',
            '//h2[contains(text(), "Item Details:")]/following-sibling::p[1]/text()[1]'
        )

        address = response.xpath(
            '//b[contains(text(), "Item Location:")]/following-sibling::text()[1]'
        ).extract_first()
        city, region = address.split(',')
        l.add_value('City', city)
        l.add_value('State', region)
        l.add_value('ZIP', region)
        l.add_xpath(
            'Contact',
            '//b[contains(text(), "Equipment Contact:")]/following-sibling::text()[1]'
        )
        l.add_xpath(
            'Phone',
            '//b[contains(text(), "Phone Number:")]/following-sibling::text()[1]'
        )
        l.add_xpath(
            'Category',
            '//strong[contains(text(), "Category:")]/following-sibling::text()[1]'
        )
        l.add_xpath(
            'ClosesOn',
            '//strong[contains(text(), "Closes On")]/following-sibling::text()[1]'
        )
        l.add_xpath('image_urls', '//div[@id="gallery"]//a/@href')
        l.add_value('folder_name', self.auction_id)

        yield l.load_item()
Example #46
    def parse_article_child_page(self, response):
        """Extracts and yields article item & author-article relation item from article child page"""

        self.logger.info('Parsing article child page {}'.format(response.url))
        article_loader = ItemLoader(item=ArticleItem(), response=response)

        article_loader.add_value('url', response.url)
        article_loader.add_css('title', '#woe #hero h2::text')
        article_loader.add_css('pub_date', '#woe #hero .authwrp .sdate::text')
        article_loader.add_css('text', '#woe .postbody *::text')
        article_loader.add_css(
            'tags', "head meta[property='article:tag'] ::attr(content)")
        article_item = article_loader.load_item()

        article_author_loader = ItemLoader(item=ArticleAuthorItem(),
                                           response=response)
        article_author_loader.add_css('authors', '.goauthor::attr(href)')
        article_author_loader.add_value('article_url', article_item['url'])
        article_author_item = article_author_loader.load_item()

        yield article_item
        yield article_author_item
Example #47
    def read_news(self, response):
      print('simple_spider: read_news')
      titulo = response.xpath(self.tituloPath).get()
      cuerpo = response.xpath(self.cuerpoPath).getall()
      fecha_publicacion   = response.xpath(self.fechaPath).get()
      
      # Date should have the format YYYY-MM-DDTHH:MM:SS
      fecha_publicacion = self.format_fecha(fecha_publicacion)
	  
      if datetime.strptime(fecha_publicacion, '%Y-%m-%dT%H:%M:%S') < self.date_pbl_min:
          self.date_pbl_min = datetime.strptime(fecha_publicacion, '%Y-%m-%dT%H:%M:%S')

      news = ItemLoader(item=News())
      news.add_value('titulo', titulo)
      news.add_value('cuerpo', cuerpo)
      news.add_value('fecha_publicacion', fecha_publicacion)
      news.add_value('url', response.url)
      news.add_value('diario', self.name)
      news.add_value('page', self.current_page)
      return news.load_item()
Example #48
 def parse_definition(self, response:HtmlResponse):
     loader = ItemLoader(item=DictionaryItem(), response=response)
     loader.add_xpath('aword', "//h1/text() | //h1/span/text()")
     loader.add_xpath('definition', "//div[@id='medical-entry-1']/div[@class='vg']//span[@class='dtText']/em[@class='mw_t_it']/text() | //div[@id='medical-entry-1']/div[@class='vg']//span[@class='dtText']/text()")
     loader.add_value('link', response.url)
     yield loader.load_item()
Example #49
    def parse_detail(self, response):
        # process the question page and extract the question item from it

        match_obj = re.match('(.*zhihu.com/question/(\d+))(/|$).*',
                             response.url)
        question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css('title', 'h1.QuestionHeader-title::text')
        item_loader.add_css('content', '.QuestionHeader-detail'
                            )  # .QuestionHeader-detail span.RichText::text
        item_loader.add_value('url', response.url)
        item_loader.add_value('zhihu_id', question_id)
        item_loader.add_css('answer_num', '.List-headerText span::text')
        item_loader.add_css(
            'comments_num',
            '.QuestionHeader-Comment button.Button--plain::text')
        item_loader.add_css(
            'watch_user_num',
            '.NumberBoard-itemInner strong.NumberBoard-itemValue::attr("title")'
        )
        item_loader.add_css('topics',
                            '.QuestionHeader-topics .Popover div::text')

        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_urls.format(question_id, 3, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
        yield question_item
Example #50
    def parse(self, response):
        filename = response.url.split("/")[-2]
        with open(filename, 'wb') as f:
            f.write(response.body)
            print response.css('title::text').extract()
#          for quote in response.css('div.quote'):
#             yield {
#                 'text': ''.split(quote.css('span.text::text').extract_first()),
#                 'author': quote.css('small.author::text').extract_first(),
#                 'tags': quote.css('div.tags a.tag::text').extract(),
#             }
        l = ItemLoader(item=Product(), response=response)
        l.add_xpath('name', '//div[@class="product_name"]')
        l.add_xpath('name', '//div[@class="product_title"]')
        l.add_xpath('price', '//p[@id="price"]')
        #          l.add_css('stock', 'p#stock]')
        l.add_value('last_updated', 'today')  # you can also use literal values
        print l.load_item()
        return l.load_item()
Example #51
	def parse_post(self, response):
		date = response.xpath('//div[@class="section simple Component-StandardContent "]/p[position()<4]//text()|//div[@class="section simple Component-StandardContent "]/span[position()<2]//text()').getall()
		date = re.findall(r'\b(?:\w+\s\d+\s)?\w+\S+(?:\s\d+(?:th)?)?\,\s\d+\S+', ' '.join(date))
		if not date:
			date = "Date is not published"
		title = response.xpath('//div[@class="section simple Component-StandardContent "]/strong/text()|//div[@class="section simple Component-StandardContent "]/p/strong/text()').get()
		content = response.xpath('//div[@class="section simple Component-StandardContent "]//text()[not (ancestor::strong)]').getall()
		content = [p.strip() for p in content if p.strip()]
		content = re.sub(pattern, "",' '.join(content))

		item = ItemLoader(item=WwealthonebankofcanadaItem(), response=response)
		item.default_output_processor = TakeFirst()

		item.add_value('title', title)
		item.add_value('link', response.url)
		item.add_value('content', content)
		item.add_value('date', date)

		yield item.load_item()
Example #52
    def parse_question(self, response):

        if 'QuestionHeader-title' in response.text:
            match_obj = re.match('(.*www.zhihu.com/question/(\d+))(/|$).*',
                                 response.url)
            if match_obj:
                question_id = int(match_obj.group(2))
            item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                     response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num",
                                ".QuestionHeader-Comment button::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
            item_loader.add_css("topics",
                                ".QuestionHeader-topics .Popover div::text")
            question_item = item_loader.load_item()
        else:
            # item extraction for the old-style page layout
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*",
                                 response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                     response=response)
            # item_loader.add_css("title", ".zh-question-title h2 a::text")
            item_loader.add_xpath(
                "title",
                "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()"
            )
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css(
                "comments_num",
                "#zh-question-meta-wrap a[name='addcomment']::text")
            # item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
            item_loader.add_xpath(
                "watch_user_num",
                "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()"
            )
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

            question_item = item_loader.load_item()
        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 2),
                             headers=self.header,
                             callback=self.parse_answer)
        yield question_item
Example #53
    def parse(self, response):
        # change the code here
        articles = response.xpath("//div[@data-article-body]")
        for article in articles:
            info = article.css(".css-jy1umg").xpath(".//text()").getall()
            article_info = ''.join(info)
            article_topics = article.css("div.css-0")
            topics = []
            for topic in article_topics[1:]:
                # content_loader = ItemLoader(
                #     item=HealthlineContentItem(), selector=topic)
                # content_loader.add_css("topic_name", "a::attr(name)")
                topic_data = topic.xpath(".//text()").getall()
                topic_data = ' '.join(topic_data[1:])
                # content_loader.add_value("topic_data", topic_data)
                topics.append(topic_data)

            content = ' '.join(topics)
            loader = ItemLoader(item=HealthlineArticleItem(), selector=article)
            loader.add_css("title", "h1::text")
            loader.add_value("url", response.meta.get('url'))
            loader.add_value("article_info", article_info)
            loader.add_value("content", content)
            os.remove(response.meta.get('temp_file'))
            yield loader.load_item()
Example #54
    def parse_article(self, response):
        if 'pdf' in response.url:
            return

        item = ItemLoader(Article())
        item.default_output_processor = TakeFirst()

        title = response.xpath('//h1/text()').get()
        if title:
            title = title.strip()

        date = response.xpath('//span[@class="date"]/text()').get()
        if date:
            date = date.strip()

        content = response.xpath(
            '//div[@class="text__inner"]//text()').getall()
        content = [text for text in content if text.strip()]
        content = "\n".join(content[2:]).strip()

        item.add_value('title', title)
        item.add_value('date', date)
        item.add_value('link', response.url)
        item.add_value('content', content)

        return item.load_item()
Example #55
    def parse_article_child_page(self, response):
        selector = response.css('article #postcontent')
        article_loader = ItemLoader(item=ArticleItem(), selector=selector)

        article_loader.add_value('url', response.url)
        article_loader.add_css('title', 'h1::text')
        article_loader.add_css('pub_date',
                               'meta[itemprop=datePublished]::attr(content)')
        article_loader.add_css('text', '#mypost *::text')
        article_loader.add_css('tags',
                               'article #postcontent a.tag.secondary::text')
        article_item = article_loader.load_item()

        article_author_loader = ItemLoader(item=ArticleAuthorItem(),
                                           selector=selector)
        article_author_loader.add_css(
            'authors', 'span[itemprop=author] a.goauthor::attr(href)')
        article_author_loader.add_value('article_url', article_item['url'])
        article_author_item = article_author_loader.load_item()

        yield article_item
        yield article_author_item
Example #56
								print Exception,":",e
				if Some_Info:
						for key in Some_Info.keys():
								item.fields[key] = Field()
								l.add_value(key , Some_Info[key])
				yield l.load_item()
		else:
		# It seems the ItemLoader add_* methods cannot be used here: we first have to find every block on the page that contains a target item, then extract each single item from its block. The loader pulls everything out in one pass and add_xpath cannot subdivide further, so the plan is to use add_value instead.
				my_Final_Xpath = Final_Xpath.copy()
				All_Xpath = my_Final_Xpath['All_Xpath'].copy()
				del my_Final_Xpath['All_Xpath']
				all_xpath = All_Xpath['all_xpath']
				del All_Xpath['all_xpath']
				for i in response.xpath(all_xpath[0]):
						item = NettvSpiderItem()
						l = ItemLoader(item=item, response=response)
						# extract the data described in All_Xpath
						for key in All_Xpath.keys():
								item.fields[key] = Field()
								try:
										# when an add_* method finds no value, the ItemLoader silently drops the field; we do not want that, so set it to an empty string ("") instead
										if map(lambda x:1 if x else 0, map(lambda x:response.xpath(x).extract() if x != "/" else "",Final_Xpath[key])) in [[0,0],[0]]:
												map(lambda x:l.add_value(key , ""),["just_one"])
										else:
												map(lambda x:l.add_value(key, i.xpath(x).extract()) if i.xpath(x).extract() != [] else "",Final_Xpath[key])
								except Exception,e:
										print Exception,",",e
						# extract everything outside All_Xpath; sites like Douban especially need this: the data below is usually fetched repeatedly, while All_Xpath holds the genuinely single-valued data
						for key in my_Final_Xpath.keys():
								item.fields[key] = Field()
								try:
Example #57
 def product_parse(self, response: HtmlResponse):
     loader = ItemLoader(item=ShopparserItem(), response=response)
     loader.add_value('_id', response.url, re='-(\d+)\/$')
     loader.add_xpath('name', "//h1/text()")
     loader.add_value('link', response.url)
     loader.add_xpath('price', "//span[@slot='price']/text()")
     loader.add_xpath('params', "//div[@class='def-list__group']/dt/text()")
     loader.add_xpath('params',
                      "//div[@class='def-list__group']/dd//text()")
     loader.add_xpath('photos', "//img[@alt='product image']/@src")
     yield loader.load_item()
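The re= keyword on add_value applies a regex extraction before the field's processors run; for a URL shaped like the one below (illustrative), _id collects the trailing numeric id:

loader = ItemLoader(item=ShopparserItem())
loader.add_value('_id', 'https://shop.example/goods/lamp-12345/', re='-(\d+)\/$')
# '_id' now holds ['12345'] before output processing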
Example #58
    def parse_sub_item_detail(self, response):
        l = ItemLoader(item=AmzGenericCrawlerItem(), response=response)

        try:
            supplier = response.xpath(
                '//*[@id="bylineInfo"]/text()').extract_first()
            if supplier is None:
                supplier = response.xpath(
                    '//*[@id="brand"]/text()').extract_first().strip()
                if supplier is None:
                    supplier = "no info"
        except AttributeError:
            supplier = ""

        try:
            product_name = response.xpath(
                '//*[@id="productTitle"]/text()').extract_first().strip(' \n')
        except AttributeError:
            product_name = ""

        try:
            availability = response.xpath('//*[@id="availability"]/span/text()'
                                          ).extract_first().strip(' \n')
        except AttributeError:
            try:
                availability = response.xpath('//*[@id="availability"]/text()'
                                              ).extract_first().strip(' \n')
            except AttributeError:
                availability = "no info"

        try:
            review = response.xpath(
                '//*[@id="acrPopover"]/span[1]/a/i[1]/span/text()'
            ).extract_first().split(" ", 1)[0]
        except (AttributeError, IndexError):
            review = "no review"

        try:
            rank = response.xpath(
                '//th[contains(text(),"Best Sellers Rank")]/following-sibling::td/span/span[1]/text()'
            ).extract_first().split(" ", 1)[0].strip('#').replace(',', '')
        except (AttributeError, IndexError):
            try:
                rank = response.xpath(
                    '//*[@id="SalesRank"]/text()').extract()[1].strip().split(
                        " ", 1)[0].strip('#').replace(',', '')
            except (AttributeError, IndexError):
                rank = "no rank"

        try:
            category = response.xpath(
                '//th[contains(text(),"Best Sellers Rank")]/following-sibling::td/span/span[1]/text()'
            ).extract_first().split(" ", 1)[1].rsplit(" ", 1)[0].split(" ",
                                                                       1)[1]
        except (AttributeError, IndexError):
            try:
                category = \
                response.xpath('//*[@id="SalesRank"]/text()').extract()[1].strip().split(" ", 1)[1].rsplit(" ", 1)[
                    0].split(" ", 1)[1]
            except (AttributeError, IndexError):
                category = "no info"

        item_url = response.request.url

        l.add_value('supplier', supplier)
        l.add_value('product_name', product_name)
        l.add_value('availability', availability)
        l.add_value('review', review)
        l.add_value('rank', rank)
        l.add_value('category', category)
        l.add_value('item_url', item_url)

        return l.load_item()
Example #59
    def parse_content(self,response):
        print (response.url)

        def deal_img_urls(img_url_list):
            # for one_img_url in img_url_list:
            #     print (one_img_url)
            return img_url_list

        def deal_publish_time(publish_time_raw_list):
            try:
                year=str(publish_time_raw_list[0])
                mounth=str(publish_time_raw_list[1]) if len(str(publish_time_raw_list[1]))==2 else '0'+str(publish_time_raw_list[1])
                days=str(publish_time_raw_list[2]) if len(str(publish_time_raw_list[2]))==2 else '0'+str(publish_time_raw_list[2])

                hourse=str(publish_time_raw_list[3])
                minite=str(publish_time_raw_list[4])

                publish_time=year+'-'+mounth+'-'+days+' '+hourse+':'+minite+':00'
                return publish_time
            except Exception as e:
                print(e)

        def deal_reply_nodes(response_url):
            # for one_reply_nodes in reply_nodes:
            #     one_reply_nodes.xpath('')
            # The comments here require a separate request, so we only store the link for now and generate the corresponding reply_nodes in later processing. ------mark!
            reply_id=response_url.split('/')[-1].split('?')[0]
            reply_url='http://www.ftchinese.com/index.php/c/newcomment/'+reply_id+'?v=1'
            return reply_url

        def deal_publish_user(publisher_list):
            publish_user_list=[]
            for one_user in publisher_list:
                _=one_user.strip()
                publish_user_list.append(_)
            return publish_user_list


        if not response.xpath('//span[@class="story-time"]/text()').re('(\d{4}).(\d{1,2}).(\d{1,2}). (\d{1,2})\:(\d{1,2})'):
            return  # is this how we judge that the content is empty?

        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath('title','//h1[@class="story-headline"]/text()',TakeFirst())
        # loader1.add_xpath('abstract','//div[@class="story-lead"]/text()')  # there is no abstract field
        loader1.add_value('id',response.url.split('/')[-1].split('?')[0])
        loader1.add_value('img_urls',response.xpath('//div[@class="story-container"]//img/@src|//div[@class="story-container"]//figure/@data-url').extract(),deal_img_urls)
        loader1.add_xpath('content','//div[@class="story-body"]//p//text()',Join())
        loader1.add_value('publish_time',response.xpath('//span[@class="story-time"]/text()').re('(\d{4}).(\d{1,2}).(\d{1,2}). (\d{1,2})\:(\d{1,2})'),deal_publish_time)
        loader1.add_xpath('publish_user','//span[@class="story-author"]/a/text()',deal_publish_user)
        loader1.add_value('reply_count',response.xpath('//div[@id="allcomments"]/div[@class="commentcontainer"]'),lambda x:len(x))
        # loader1.add_value('reply_nodes',response.url,deal_reply_nodes)

        item1=loader1.load_item()
        return item1
Example #60
    def parse(self, response):
        response.selector.remove_namespaces()
        document = response.xpath('//document')
        manu_products = document.xpath('.//subject/manufacturedProduct')

        spl_il = ItemLoader(item=SplItem(), selector=document)
        spl_il.add_xpath('id', './id/@root')
        spl_il.add_xpath('set_id', './setId/@root')
        spl_il.add_xpath('labeler', './/representedOrganization/name/text()')

        for product in manu_products:
            product_il = ItemLoader(item=ProductItem(), selector=product)
            product_il.add_xpath('code', './manufacturedProduct/code/@code')
            product_il.add_xpath('name', './manufacturedProduct/name/text()')
            product_il.add_xpath(
                'schedule',
                './/policy[@classCode="DEADrugSchedule"]/code/@displayName')

            inactive_ingredients = product.xpath(
                './/ingredient[starts-with(@classCode, "IACT")]')

            for inactive_ingredient in inactive_ingredients:
                inactive_il = ItemLoader(
                    item=InactiveIngredient(),
                    selector=inactive_ingredient,
                )
                inactive_il.add_xpath(
                    'name',
                    './ingredientSubstance/name/text()',
                )
                inactive_il.add_xpath(
                    'unii',
                    './ingredientSubstance/code/@code',
                )

                product_il.add_value(
                    'inactive_ingredients',
                    inactive_il.load_item(),
                )

            for package in product.xpath('.//containerPackagedProduct'):
                package_il = ItemLoader(item=PackageItem(), selector=package)
                package_il.add_xpath('code', './code/@code')

                if not package_il.load_item():
                    continue

                product_il.add_value('packages', package_il.load_item())

            spl_il.add_value('products', product_il.load_item())

        return spl_il.load_item()