def parse_link_page(response):
    for post in response.xpath('//div[@data-type="link"]'):
        l = ItemLoader(RedditPostItem(), selector=post)
        post_root_xpath = './div[contains(@class, "entry")]'
        title = post_root_xpath + '/p[@class="title"]'
        tagline = post_root_xpath + '/p[@class="tagline"]'
        buttons = post_root_xpath + '/ul'
        l.add_xpath('title', title + '/a/text()')
        l.add_xpath('link', title + '/a/@href')
        l.add_xpath('poster', tagline + '/a[contains(@class, "author")]/text()')
        l.add_xpath('score', './div[contains(@class, "midcol")]/div[@class="score unvoted"]/text()')
        l.add_xpath('number_of_comments', buttons + '//a[contains(@class, "comments")]/text()')
        l.add_xpath('comments_link', buttons + '//a[contains(@class, "comments")]/@href')
        l.add_xpath('subreddit', './@data-subreddit')
        l.add_xpath('post_timestamp', tagline + '/time/@datetime')
        l.add_value('scrape_timestamp', datetime.datetime.now())

        item = l.load_item()
        # if there are any comments for the post, go scrape them;
        # otherwise the item is already complete
        item["comments"] = []
        if item["number_of_comments"] > 0:
            yield scrapy.Request(item["comments_link"] + "?limit=500",
                                 callback=parse_comments,
                                 meta={'item': item})
        else:
            yield item
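The `number_of_comments > 0` comparison above only works if the field is cast to an integer on input. `RedditPostItem` is not shown on this page; a minimal sketch of a matching item, with field names taken from the loader calls and the processors as assumptions:

import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst

def extract_int(text):
    # "1234 comments" -> 1234; non-numeric text -> 0
    digits = ''.join(ch for ch in text if ch.isdigit())
    return int(digits) if digits else 0

class RedditPostItem(scrapy.Item):
    # hypothetical definition inferred from the loader above
    title = scrapy.Field(output_processor=TakeFirst())
    link = scrapy.Field(output_processor=TakeFirst())
    poster = scrapy.Field(output_processor=TakeFirst())
    score = scrapy.Field(input_processor=MapCompose(extract_int),
                         output_processor=TakeFirst())
    number_of_comments = scrapy.Field(input_processor=MapCompose(extract_int),
                                      output_processor=TakeFirst())
    comments_link = scrapy.Field(output_processor=TakeFirst())
    subreddit = scrapy.Field(output_processor=TakeFirst())
    post_timestamp = scrapy.Field(output_processor=TakeFirst())
    scrape_timestamp = scrapy.Field(output_processor=TakeFirst())
    comments = scrapy.Field()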
Example #2
    def parse_item(self, response):
        sel = response.css("div.path")

        loader = ItemLoader(item=SeriesItem(), selector=sel)
        loader.add_css("series_id", "a:last-child::attr(href)")
        loader.add_css("series_name", "a:last-child::text")

        series = loader.load_item()
        print(series)

        # Upcoming & currently on sale
        for sel in response.css("div.interval01-list-cars-infor"):
            loader = ItemLoader(item=ModelItem(), selector=sel)
            loader.add_css("model_id", "a::attr(href)")
            loader.add_css("model_name", "a::text")
            loader.add_value("series_id", series['series_id'])
            loader.add_value("series_name", series['series_name'])

            yield loader.load_item()

        # Discontinued
        url = "http://www.autohome.com.cn/ashx/series_allspec.ashx"

        years = response.css(".dropdown-content a::attr(data)")

        for year in years.extract():
            qs = {
                "y": year,
                "s": series["series_id"]
            }

            yield Request(url + "?" + urlencode(qs), self.stop_sale)
    def parse_info(self, response):

        loaderJob = ItemLoader(item=JobInfoItem(), response=response)
        loaderCom = ItemLoader(item=ComInfoItem(), response=response)
        loaderJob.add_value('url', value=response.url)
        loaderJob.add_xpath('job_name', '//div[@class="inner-left fl"][1]/h1/text()', TakeFirstL())
        loaderJob.add_xpath('job_company', '//div[@class="inner-left fl"][1]/h2/a/text()', TakeFirstL())
        loaderJob.add_xpath('job_benefits', '//div[@class="inner-left fl"][1]/div/span/text()', JoinL('|'))
        divs = '//ul[@class="terminal-ul clearfix"]/li'
        loaderJob.add_xpath('job_salary', divs, TakeFirstL(), re=u'(?<=职位月薪:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_location', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=工作地点:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_update', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=发布日期:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_nature', divs, TakeFirstL(), re=u'(?<=工作性质:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_experience', divs, TakeFirstL(), re=u'(?<=工作经验:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_miniEdu', divs, TakeFirstL(), re=u'(?<=最低学历:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_recruNums', divs, TakeFirstL(), re=u'(?<=招聘人数:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_category', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=职位类别:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_desc', '//div[@class="tab-inner-cont"][1]', ExtractTextL(), StripBlankL(), JoinL('|'))
        loaderJob.add_xpath('job_desc_resp', '//div[@class="tab-inner-cont"][1]', ExtractTextL(), TakeFirstL(), re=u'(?<=岗位职责|工作职责).*?(?=任职资格|岗位要求)')
        loaderJob.add_xpath('job_desc_req', '//div[@class="tab-inner-cont"][1]', ExtractTextL(), TakeFirstL(), re=u'(?<=任职资格|岗位要求).*?(?=。)')
        loaderJob.add_xpath('job_desc_loc', '//div[@class="tab-inner-cont"][1]/h2/text()', TakeFirstL())

        loaderCom.add_xpath('url', '//div[@class="company-box"]/p[@class="company-name-t"]/a/@href', TakeFirstL())
        loaderCom.add_xpath('com_name', '//div[@class="company-box"]/p[@class="company-name-t"]/a/text()', TakeFirstL())
        divs = '//div[@class="company-box"]/ul/li'
        loaderCom.add_xpath('com_size', divs, ExtractTextL(), TakeFirstL(),  re=u'(?<=公司规模[:,:]).*')
        loaderCom.add_xpath('com_nature', divs, ExtractTextL(), TakeFirstL(),  re=u'(?<=公司性质[:,:]).*')
        loaderCom.add_xpath('com_industry', divs, ExtractTextL(), TakeFirstL(),  re=u'(?<=公司行业[:,:]).*')
        loaderCom.add_xpath('com_intro', '//div[@class="tab-inner-cont"][2]', ExtractTextL(), StripBlankL(), JoinL('|'))
        loaderCom.add_xpath('com_link', divs, ExtractTextL(), TakeFirstL(),  re=u'(?<=公司主页[:,:]).*')
        loaderCom.add_xpath('com_address', divs, RemoveTagsL(), TakeFirstL(),  re=u'(?<=公司地址[:,:])[\s\S]*(?=</strong>)')

        return loaderJob.load_item(), loaderCom.load_item()
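The `*L` processors used above (TakeFirstL, JoinL, RemoveTagsL, ExtractTextL, StripBlankL) are project-specific and not shown on this page; a plausible minimal sketch, assuming they wrap Scrapy's built-in processors and w3lib's HTML helpers:

# Hypothetical definitions for the custom processors used above; the names
# match the calls, but the implementations are assumptions.
from scrapy.loader.processors import TakeFirst, Join, MapCompose
from w3lib.html import remove_tags, replace_escape_chars

class TakeFirstL(TakeFirst):
    """Alias for the built-in TakeFirst."""

class JoinL(Join):
    """Alias for the built-in Join (separator passed at construction)."""

class RemoveTagsL(MapCompose):
    def __init__(self):
        super(RemoveTagsL, self).__init__(remove_tags)

class ExtractTextL(MapCompose):
    """Flatten markup to plain text."""
    def __init__(self):
        super(ExtractTextL, self).__init__(remove_tags, replace_escape_chars)

class StripBlankL(MapCompose):
    """Drop surrounding whitespace and blank strings."""
    def __init__(self):
        super(StripBlankL, self).__init__(lambda s: s.strip() or None)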
Example #4
    def parse(self, response):
        sites = response.xpath('//table/tbody/tr')
        for site in sites:

            url = urljoin(response.url, site.xpath("td[2]/a/@href").extract_first())
            urlLast = urljoin(response.url, site.xpath("td[3]/a/@href").extract_first())
            item = DeathItem()
            loader = ItemLoader(item,selector=site)
            loader.add_xpath('Mid','td[1]/text()')
            loader.add_xpath('firstName','td[5]/text()')
            loader.add_xpath('lastName','td[4]/text()')
            loader.add_xpath('Date','td[8]/text()')
            loader.add_xpath('Race','td[9]/text()')
            loader.add_xpath('County','td[10]/text()')
            loader.add_xpath('Age','td[7]/text()')
            loader.add_value('OILink',url)
            loader.add_value('OLastStatement',urlLast)

 
            if url.endswith(("jpg","no_info_available.html")):
                loader.add_value('Description',u'')
                loader.add_value('Education',u'')
                if urlLast.endswith("no_last_statement.html"):
                    loader.add_value('Message',u'')
                    yield loader.load_item()
                else:
                    request = scrapy.Request(urlLast, meta={"item" : loader.load_item()}, callback =self.parse_details2)
                    yield request
            else:        
                request = scrapy.Request(url, meta={"item": loader.load_item(),"urlLast" : urlLast}, callback=self.parse_details)
                yield request
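`parse_details2` is not included on this page; a minimal sketch of what it might look like, assuming the last statement sits in the page body (the XPath is an assumption):

    # Hypothetical callback for the last-statement requests above; the
    # XPath for the statement text is an assumption.
    def parse_details2(self, response):
        item = response.meta["item"]
        loader = ItemLoader(item, response=response)
        loader.add_xpath("Message", "//*[@id='body']/p[last()]/text()")
        yield loader.load_item()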
Example #5
    def parse_colleagues(self, response, author_id):
        self.logger.info('Parsing colleagues for author %s.' % author_id)

        # get all authors listed
        num_authors = 0
        for div in response.xpath('//*[@class="gsc_1usr gs_scl"]'):
            num_authors += 1
            name_xp = './*[@class="gsc_1usr_name"]/text()'
            # use a relative XPath so each colleague div yields its own profile id
            id_val = urlparse.parse_qs(urlparse.urlparse(div.xpath('.//h3/a/@href').extract_first()).query)['user']
            cited_by_xp = './*[@class="gsc_1usr_cby"]/text()'
            fos_xp = './/a[@class="gsc_co_int"]/@href' # --> ["foo", "bar",...]

            # load general author item for colleague
            co_auth = ItemLoader(item=AuthorItem(), response=response, selector=div)
            co_auth.add_value('id', id_val)
            co_auth.add_xpath('name', name_xp)
            co_auth.add_xpath('cited', cited_by_xp)
            co_auth.add_xpath('fos', fos_xp)
            yield co_auth.load_item()

            # load co-authorship
            relation = [author_id, id_val]
            relation.sort()
            co_rel = ItemLoader(item=CoAuthorItem(), response=response)
            co_rel.add_value('author1', relation[0])
            co_rel.add_value('author2', relation[1])
            yield co_rel.load_item()

        self.logger.info('Found %d colleagues for author %s.' % (num_authors, author_id))

        next_url = self.choose_next()

        if next_url:
            yield Request(url=next_url)
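The two items filled here aren't shown; a minimal sketch with field names taken from the loader calls (the definitions themselves are assumptions):

import scrapy

# Hypothetical item definitions inferred from the add_value/add_xpath calls.
class AuthorItem(scrapy.Item):
    id = scrapy.Field()
    name = scrapy.Field()
    cited = scrapy.Field()
    fos = scrapy.Field()  # fields of study

class CoAuthorItem(scrapy.Item):
    author1 = scrapy.Field()
    author2 = scrapy.Field()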
Example #6
    def parse(self,response):
        l = ItemLoader(item = NytimesItem(),response = response)
        l.add_xpath('topnews','//*[contains(@id,"topnews-100")]/h2/a/text()')
        l.add_xpath('sectionnews','//h3[contains(@class,"story-heading")]/text()')
        #print(type(l.load_item()))
        x = l.load_item()
        #print(len(x['date']),len(x['topnews']),len(x['sectionnews']))
        nytdict = dict()
        today = datetime.date.today()
        topnewslist = []
        sectionnewslist = []
        nytdict['date'] = str(today)

        for t in x['topnews']:
            topnewslist.append(str(t.encode('ascii','ignore')))
        nytdict['topnews']=topnewslist

        for t in x['sectionnews']:
            sectionnewslist.append(str(t.encode('ascii','ignore')).strip())
        nytdict['sectionnews']=sectionnewslist

        filename = datetime.date.today()
        with open('{}.json'.format(filename), 'w') as f:
            json.dump(nytdict, f)
        return l.load_item()
Example #7
    def parse_item(self, response):
        if "Digi-Key Part Number" not in response.body:
            return
        i = DetailsItem()
        if response.meta.get("callback_result_queue"):
            i['hitlist'] = response.meta.get('callback_result_queue')
        i['site_name'] = self.site_name
        i['site_url'] = self.base_url
        loader = ItemLoader(i, response=response)
        loader.add_xpath("site_part_id", "//meta[@itemprop='productID']/@content", re="sku.(.*)")
        loader.add_xpath("manuf_part_id", "//meta[@itemprop='name']/@content", )
        loader.add_xpath("manuf_name", "//span[@itemprop='name']/text()")
        loader.add_xpath("description", "//td[@itemprop='description']/text()")
        loader.add_xpath("datasheet_link", "//a[@class='lnkDatasheet']/@href")
        loader.add_xpath("image_url", "//a[@class='lnkProductPhoto']/@href")
        loader.add_value("page_url", response.url)
        loader.add_xpath("part_detail", "//td[@class='attributes-table-main']")
        loader.add_xpath("packaging", "//th[contains(text(),'Packaging')]/following-sibling::td/text()")
        loader.add_xpath("package", "//th[contains(text(),'Standard Package')]/following-sibling::td/text()")
        loader.add_value("package", PACKAGE_DEFAULT)
        loader.add_value("packaging", PACKAGING_DEFUALT)
        loader.add_xpath("type", "//th[text()='Accessory Type']/following-sibling::td/text()")
        loader.add_value("version", VERSION_DEFAULT)
        loader.add_value("date_created", self.timestamp())
        i = loader.load_item()

        prices = response.xpath("//table[@id='pricing']/tr[td and not(contains(.//text(),'Call'))]")
        for price in prices:
            td = price.xpath("td")
            if len(td) == 3:
                pi = PriceItem()
                pi['site_name'] = self.site_name
                pi['site_part_id'] = i['site_part_id']
                pi['date_created'] = self.timestamp()
                pi['price_type'] = i['packaging']
                pi['quantity'] = td[0].xpath("text()").extract()[0].replace(",", "")
                pi['price'] = td[1].xpath("text()").extract()[0].replace(",", "")
                i['price_data'].append(pi)

        avail = AvailabilityItem()
        avail['site_name'] = self.site_name
        avail['site_part_id'] = i['site_part_id']
        avail['date_created'] = self.timestamp()
        loader = ItemLoader(avail, response=response)
        loader.add_xpath("stock", "//td[@id='quantityavailable']", re='":\s([\d|\,]*)')
        loader.add_value("factory_leadtime", FACTORY_LEAD_TIME_DEFAULT)
        loader.add_value("factory_lead_uom", FACTORY_LEAD_UOM_DEFAULT)
        avail = loader.load_item()
        i['inventory_data'].append(avail)
        yield i
Example #8
    def parse_details(self, response):

        item = response.meta["item"]
        urlLast = response.meta["urlLast"]

        loader = ItemLoader(item,response=response)
        loader.add_xpath("Description","//*[@id='body']/p[3]/text()")
        loader.add_xpath("Education","//td[. = 'Education Level (Highest Grade Completed)']/following-sibling::td[1]/text()")

        if urlLast.endswith("no_last_statement.html"):
            loader.add_value('Message',u'')
            return loader.load_item()
        else:
            request = scrapy.Request(urlLast, meta={"item": loader.load_item()}, callback=self.parse_details2)
            return request
Example #9
    def parse_item(self, response):
        """ This function parses a property page.

        @url http://web:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """

        # Create the loader using the response
        l = ItemLoader(item=PropertiesItem(), response=response)

        # Load fields using XPath expressions
        l.add_xpath("title", '//*[@itemprop="name"][1]/text()', MapCompose(unicode.strip, unicode.title))
        l.add_xpath(
            "price", './/*[@itemprop="price"][1]/text()', MapCompose(lambda i: i.replace(",", ""), float), re="[,.0-9]+"
        )
        l.add_xpath("description", '//*[@itemprop="description"][1]/text()', MapCompose(unicode.strip), Join())
        l.add_xpath("address", '//*[@itemtype="http://schema.org/Place"][1]/text()', MapCompose(unicode.strip))
        l.add_xpath(
            "image_urls", '//*[@itemprop="image"][1]/@src', MapCompose(lambda i: urlparse.urljoin(response.url, i))
        )

        # Housekeeping fields
        l.add_value("url", response.url)
        l.add_value("project", self.settings.get("BOT_NAME"))
        l.add_value("spider", self.name)
        l.add_value("server", socket.gethostname())
        l.add_value("date", datetime.datetime.now())

        return l.load_item()
Example #10
 def parse_item(self, response):
     logging.info(u"start crawl  --->  " + response.url)
     item = ItemLoader(item=NewsItem(), response=response)
     images = []
     sel = Selector(response)
     item.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
     if sel.xpath('//div[@class="neirong-shouquan"]'):
         return
     item.add_xpath('title', '//div[@class="article-wrap"]/h1/text()')
     item.add_xpath('author', '//span[@class="author-name"]/text()')
     item.add_value('source', u'虎嗅网')
     item.add_value('original_link', response.url)
     item.add_value('category', CATEGORY.TECHNOLOGY)
     article_time = sel.xpath('//span[@class="article-time"]/text()').extract()
     date_time = compare_time(article_time, "%Y-%m-%d %H:%M")
     if not date_time:
         return
     item.add_value('date_time', date_time)
     image_url = sel.xpath('//div[@class="article-img-box"]/img/@src').extract()[0]
     images.append(image_url)
     elements = sel.xpath('//div[@id="article_content"]/p').extract()
     images, content = translate_content(elements)
     if images:
         item.add_value('image_url', hashlib.sha1(images[0]).hexdigest() + ".jpg")
     item.add_value('image_urls', images)
     item.add_value('content', content)
     logging.info(u"finished crawl  --->  " + response.url)
     yield item.load_item()
    def parse_item(self,response):
        sel = Selector(response)
        il = ItemLoader(item=Product(), response=response)

        cat = il.get_xpath('//div[contains(@id, "ctl00_pnlBreadCrumbs")]/a[last()]/text()')
        availability = il.get_xpath('//a[contains(@id,"hplddToCart") or contains(@class,"addToCart")]/text()')
        price = il.get_css('span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblRegprice > font::text')
        sale = il.get_css('span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblSaleprice > font::text')
       
        """If the xpath doesn't retunr a category, the product belongs to the Bundle category"""
        if not cat:
            il.add_value("category", "Bundle")
        else:
            il.add_value("category", cat)
       
        il.add_css("title", "span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblProdTitle::text")
        il.add_value("url",response.url)
       
        """If a product can be added to the cart, the product is available online, if not, the product is not available online"""
        if "ADD TO CART" in availability:
            il.add_value("availability", "Product is available online")
        else:
            il.add_value("availability", "Product is not available online")

        """If there's a sale price present but not a regular price present, it switches the sale price for the regular price as shown in the website"""
        if not price:
            il.add_value("regPrice",sale)
            il.add_value("salePrice", None)
        else:
            il.add_value("regPrice", price)
            il.add_value("salePrice",sale)
        return il.load_item()
Example #12
    def parse_news(self, response):
        if response.status == 200:
            to_group_id = response.meta['group_id']
            url = response.url
            ac = response.xpath('//div[@id="article-main"]')
            title = ac.xpath('.//h1[@class="article-title"]/text()')
            title = title.extract_first(default='')
            info = ac.xpath('.//div[@class="articleInfo"]')
            source = info.xpath('.//span[@class="src"]/text()')
            source = source.extract_first(default='')
            source = source.strip('\n\t ')
            ctime = info.xpath('.//span[@class="time"]/text()')
            ctime = ctime.extract_first(default='')
            contents = ac.xpath('.//p/text()').extract()
            contents = map(lambda x: re.sub(r'<.*?>', '', x), contents)
            text = '\n'.join(contents)
            labels = ac.xpath('.//a[@class="label-link"]/text()').extract()

            il = ItemLoader(item=ArticleItem())
            il.add_value('to_group_id', to_group_id)
            il.add_value('url', url)
            il.add_value('title', title)
            il.add_value('source', source)
            il.add_value('ctime', ctime)
            il.add_value('text', text)
            il.add_value('labels', labels)
            yield il.load_item()
        else:
            return
    def parse_movie(self,response):
        
        loader = ItemLoader(item=DoubanItem(),response=response)
        
        for attr,xpath in self.settings.getdict('INFO_XPATH').items():
            loader.add_xpath(attr,xpath)

        s = response.xpath('//div[@id="info"]').extract_first()
        for attr,regex in self.settings.getdict('RE').items():
            loader.add_value(attr,re.findall(regex,s))
            
        loader.add_value('rate',self.parse_rate(response))
        loader.add_value('url',response.url)
  
        if self.settings.getbool('ALLOW_COVER'):
            image_urls = self._get_urls(
                self.image_base_url,
                urljoin,
                response.xpath('//div[@id="mainpic"]/a/img/@src').extract(),
                lambda s:s.split('/')[-1],
            )

            loader.add_value('image_urls',image_urls)
        
        return loader.load_item()
Example #14
	def parse_item(self, response):
		l = ItemLoader(item=PageItem(), response=response)
		l.add_value('title', response.request.cookies['title'])
		l.add_value('name', self.name)
		l.add_value('url', response.url)
		l.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
		return l.load_item()
	def parse_item(self, response):
		l = ItemLoader(item=MeizituItem(), response=response)
		l.add_xpath('name', '//h2/a/text()')
		l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta  clearfix']/div[@class='metaRight']/p")
		l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
		l.add_value('url', response.url)
		return l.load_item()
Example #16
	def parse_item(self, response):
		l = ItemLoader(item=PageItem(), response=response)
		l.add_value('title', response.request.cookies['title'])
		l.add_value('url', response.url)
		l.add_value('name', self.name)
		l.add_xpath('image_urls', '//div[@class="l_effect_img_mid"]/a/img/@src')
		return l.load_item()
Example #17
    def parse_content_page(self, response):

        # Detect if this is a redirection page
        m = redirect_re.search(response.body)
        if m:
            import requests
            new_url = m.group(1)
            new_content = requests.get(new_url).content
            response = scrapy.http.HtmlResponse(new_url, body=new_content)

        # Start scraping
        il = ItemLoader(item = LuliItem(), response=response)
        
        il.add_css('content', 'div#articleNew > p::text')
        il.add_css('content', 'div[itemprop="articleBody"] > p')
        
        il.add_css('date', 'div#articleDate::text')
        il.add_css('date', 'header > time[datetime]::attr(datetime)')
        
        il.add_css('title', 'div#articleNew > h1::text')
        il.add_css('title', 'h1[itemprop="headline"]::text')
        
        il.add_value('url', response.url)

        item = il.load_item() 
        yield item
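`redirect_re` is defined outside this snippet; a plausible module-level definition, assuming the target pages redirect via a meta-refresh tag (the exact pattern is an assumption):

# Hypothetical pattern for the redirection check above; matches pages of
# the form <meta http-equiv="refresh" content="0; url=...">.
import re
redirect_re = re.compile(r'http-equiv=["\']refresh["\'][^>]*url=([^"\'>]+)',
                         re.IGNORECASE)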
	def parse(self, response):
		jsonresponse = json.loads(response.body_as_unicode())
		for entry in jsonresponse['data']['list']:
			l = ItemLoader(item=LianjiaErshouItem(), response=response)
			l.add_value('house_code', entry['house_code'])
			l.add_value('price_total', entry['price_total'])
			l.add_value('ctime', entry['ctime'])
			l.add_value('title', entry['title'])
			l.add_value('frame_hall_num', entry['frame_hall_num'])
			l.add_value('tags', entry['tags'])
			l.add_value('house_area', entry['house_area'])
			l.add_value('community_id', entry['community_id'])
			l.add_value('community_name', entry['community_name'])
			l.add_value('is_two_five', entry['is_two_five'])
			l.add_value('frame_bedroom_num', entry['frame_bedroom_num'])
			print(l)
			yield l.load_item()
Example #19
	def parse_page(self, response):
		# crawl the images on this page
		l = ItemLoader(item=PageItem(), response=response)
		l.add_value('title', response.request.cookies['title'])
		l.add_value('name', self.config["id"])
		l.add_value('url', response.url)
		if self.config.has_key("imageUrlReplacement"):
			l.add_value('replace', self.config["imageUrlReplacement"])
			
		if self.config.has_key("xpathImagesPath"):
			l.add_xpath('image_urls', self.config["xpathImagesPath"])
		if self.config.has_key("xpathFilesPath"):
			l.add_xpath('file_urls', self.config["xpathFilesPath"])
		yield l.load_item()
		
		# TODO: get the next-page URL and recurse into parse_page
		if self.config.has_key("xpathNextImageUrl"):
			nextUrls = response.xpath(self.config["xpathNextImageUrl"])
			if len(nextUrls) > 0:
				nextPage = nextUrls.extract()[0]
				if not nextPage.startswith("http"):
					if nextPage.startswith("/"):
						nextPage = response.url[0:response.url.index("/",10)+1]+nextPage 
					else:
						nextPage = response.url[0:response.url.rfind("/")+1]+nextPage 
				request = scrapy.Request(nextPage, callback=self.parse_page, cookies={'title': response.request.cookies['title']})
				yield request
Example #20
    def parse(self, response):
        """ This function parses the categories and its subcategories on a gscholar web page.

        @url https://scholar.google.com/citations?view_op=top_venues&hl=de&vq=bus
        @returns items 1 1
        @returns requests 0 0
        @scrapes name subs
        """

        # We need the div that is 'selected' i.e. contains gs_sel as a css class
        title_xp = '//*[@id="gs_m_broad"]/div[contains(@class,\'gs_sel\')]/a/span/text()'

        item = ItemLoader(item=CategoryItem(), response=response)
        title = response.xpath(title_xp).extract_first()

        item.add_value('name', title)
        subs = []
        for sub in response.xpath('//*[@id="gs_m_rbs"]/ul/li/a'):
            s = {'name' : sub.xpath('text()').extract_first()}
            rel_url = sub.xpath('@href').extract_first()
            s['vq'] = parse_qs(urlparse(rel_url).query)[u'vq'][0]
            subs.append(s)
            req = Request(urljoin(response.url,rel_url), callback=self.parse_item)
            req.meta['parent'] = title
            yield req
        item.add_value('subs', subs)
        yield item.load_item()
Example #21
	def parse_item(self, response):
		"""
		This function parses a property page.

		@url http://localhost:9312/properties/property_000000.html
		@returns items 1
		@scrapes title price description address image_urls
		@scrapes url project spider server date
		"""
		l = ItemLoader(item=PropertiesItem(), response=response)
		l.add_xpath('title', '//*[@itemprop="name"][1]/text()', MapCompose(unicode.strip, unicode.title))
		l.add_xpath('price', '//*[@itemprop="price"][1]/text()', MapCompose(lambda i: i.replace(',', ''), float),
					re='[,.0-9]+')
		l.add_xpath('description', '//*[@itemprop="description"][1]/text()', MapCompose(unicode.strip), Join())
		l.add_xpath('address', '//*[@itemtype="http://schema.org/Place"][1]/text()', MapCompose(unicode.strip))
		l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
					MapCompose(lambda i: urlparse.urljoin(response.url, i)))

		# Housekeeping fields
		l.add_value('url', response.url)
		l.add_value('project', self.settings.get('BOT_NAME'))
		l.add_value('spider', self.name)
		l.add_value('server', socket.gethostname())
		l.add_value('date', datetime.datetime.now())
		return l.load_item()
    def parse_item(self, response):
        # FIXME: fix array issue
        i = ItemLoader(item=SalefinderItem(), response=response)
        title = r'//div[@id="product-details-container"]//h1/text()'
        price = r'//div[@id="product-details-container"]//span[@class="price"]/text()'
        per = r'//div[@id="product-details-container"]//span[@class="price"]/text()'
        image_url = r'//a[@id="product-image-container"]//img/@src'

        i.add_xpath('title', title, MapCompose(unicode.lower))
        i.add_xpath('price', price, re=r'[,.0-9]+')
        i.add_xpath('per', per, re=r'pk|each|kg')
        i.add_xpath('image_url', image_url)

        i.add_value('url', response.url)
        i.add_value('date', date.today().isoformat())

        product_buy = response.xpath("//div[@class='product-container']//div[@id='product-buy']")
        product_buy_text = product_buy.extract_first().lower()

        # Detect the vendor from a product-buy div
        if 'coles' in product_buy_text:
            i.add_value('vendor', 'coles')
        elif 'woolworths' in product_buy_text:
            i.add_value('vendor', 'woolworths')
        else:
            i.add_value('vendor', 'unknown')
        return i.load_item()
 def parse_titles(self, response):
     loader = ItemLoader(item=BlogCategory(), response=response)
     loader.add_value('hub', response.meta['hname'])
     loader.add_css('title', 'div.company_post h1 span::text')
     loader.add_css('date', 'div.published::text')
     loader.add_css('article', 'div.content::text')
     yield loader.load_item()
Example #24
    def parse_artwork(self, response):
        """Extracts information from an artwork detail page
        """

        # create a url version free of search query noise
        url_bits = urlparse.urlparse(response.url)
        url_bits = url_bits._replace(query='')
        clean_url = urlparse.urlunparse(url_bits)

        loader = ItemLoader(item=ArtworkItem(), response=response)
        loader.add_value('museum_code', self.name)
        loader.add_value('url', clean_url)
        loader.add_xpath('artist_name',
                         '//div[@id="tombstone"]/p[1]/a/text()[1]')

        artist_url = response.xpath('//div[@id="tombstone"]/p[1]/a/@href')
        artist_url = urlparse.urljoin(response.url, artist_url.extract()[0])
        loader.add_value('artist_url', artist_url)

        loader.add_css('title', '#tombstone span:nth-of-type(1)::text')
        loader.add_xpath('thumbnail',
                         '//div[@id="artwork-image"]/a/img/@src')
        loader.add_xpath('on_display', ON_DISPLAY_SELECTOR)
        item = loader.load_item()

        self.logger.info('Scraped ' + item['title'][0])

        yield item
Example #25
    def parse(self, response):
        l=ItemLoader(item=RentalItem(),response=response)
        l.add_xpath('price','//*[(@id = "main-info")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-big", " " )) and contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()')
        l.add_xpath('adress','//*[(@id = "addressPromo")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()')
        l.add_value('url', response.url)

        return l.load_item()
Example #26
    def parse_image(self, response):
        logger.info("正在收集页面数据: %s ..." % response.url)
        loader = ItemLoader(item=MeiTuItem(), response=response)

        loader.add_xpath('publisher', "//div[@class='width']/div[@class='c_l']/p[1]/text()")
        loader.add_xpath('publisher', "//div[@class='width']/div[@class='c_l']/p[1]/a[@class='tags']/text()")
        loader.add_xpath('model_name', "//div[@class='width']/div[@class='c_l']/p[5]/text()")
        loader.add_xpath('model_name', "//div[@class='width']/div[@class='c_l']/p[5]/a[@class='tags']/text()")
        loader.add_xpath('publishtime', "//div[@class='width']/div[@class='c_l']/p[6]/text()")
        loader.add_xpath('magazine_no', "//div[@class='width']/div[@class='c_l']/p[2]/text()")
        loader.add_xpath('pic_qty', "//div[@class='width']/div[@class='c_l']/p[3]/text()")
        loader.add_xpath('pixel', "//div[@class='width']/div[@class='c_l']/p[4]/text()")

        try:
            loader.add_xpath('desc', "//p[@class='buchongshuoming']/text()")
        except ValueError:
            pass

        loader.add_xpath('tag', "//div[@class='fenxiang_l']/a[@class='tags']/text()")
        loader.add_xpath('sort', "//div[@class='weizhi']/span/a[2]/text()")
        loader.add_xpath('image_url', "//div[@class='content']/center/img[@class='content_img']/@src")
        loader.add_value("page_url", response.url)


        yield loader.load_item()
Example #27
    def parse_item(self, response):

        loader = ItemLoader(GaokaopaiZhiyeItem(), response)
        loader.add_value('url', response.url)
        loader.add_value('code', response.url, re=ur'-([^-]+)\.html')
        loader.add_css('name', u'.modTitle>h1::text')

        def parse_category():
            for e in response.css(u'.catType>a'):
                yield {
                    'url': e.css('::attr(href)').extract_first(),
                    'code': e.css('::attr(href)').re_first(ur'-([^-]+)\.html'),
                    'name': e.css('::text').extract_first(),
                }

        loader.add_value('category', list(parse_category()))
        loader.add_css('detail', u'.zhiyeShow')

        item = loader.load_item()

        return FormRequest(
            url='http://www.gaokaopai.com/ajax-career-getRelateMajor.html',
            formdata={'code': item['code'][0]},
            meta={'item': item},
            dont_filter=True,
            callback=self.parse_majors
        )
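The `parse_majors` callback isn't shown; a minimal sketch, assuming the AJAX endpoint replies with JSON whose `data` key holds the related majors (the response shape is an assumption, and `json` must be imported):

    # Hypothetical callback for the FormRequest above; the JSON layout of
    # the reply ('data' holding the related majors) is an assumption.
    def parse_majors(self, response):
        item = response.meta['item']
        data = json.loads(response.text)
        item['majors'] = data.get('data', [])
        return item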
Example #28
 def parse_item(self, response):
     logging.info(u"start crawl  --->  " + response.url)
     item = ItemLoader(item=NewsItem(), response=response)
     sel = Selector(response)
     content = sel.xpath('//div[@id="Cnt-Main-Article-QQ"]/p')
     article_time = content.xpath('//span[@class="pubTime"]/text()').extract()
     date_time = compare_time(article_time, u"%Y年%m月%d日%H:%M")
     if not date_time:
         return
     item.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
     item.add_value('date_time', date_time)
     item.add_xpath('title', '//div[@class="hd"]/h1/text()')
     item.add_xpath('reading_number', '//em[@id="top_count"]/text()')
     item.add_xpath('author', '//span[@class="auth"]/text()')
     item.add_value('original_link', response.url)
     elements = sel.xpath('//div[@id="Cnt-Main-Article-QQ"]/p').extract()
     images, content = translate_content(elements)
     if images:
         item.add_value('image_url', hashlib.sha1(images[0]).hexdigest() + ".jpg")
     item.add_value('content', content)
     item.add_value('image_urls', images)
     item.add_value('source', u'腾讯科技')
     item.add_value('category', CATEGORY.TECHNOLOGY)
     logging.info(u"finished crawl  --->  " + response.url)
     yield item.load_item()
    def parse_item(self,response):
        sel = Selector(response)

        # department information (kept for reference, unused):
        # li = response.xpath("//li[@class='menulevel-0']|//li[@class='menulevel-0 menulevel-0-extra']")
        # dName = li.xpath("a/span/text()").extract()

        # category name
        cName = response.xpath("//div/a[2]/text()").extract()[0]
        print("Category name: " + cName)
        # product information
        il = ItemLoader(item=Product(), response=response)
        il.add_xpath("title","//div[contains(@class,'productdetail-container')]//span[contains(@id,'ProdTitle')]/..//text()")
        #il.add_xpath("title","//div[@class='catalogueTitle']/*/text()")   #@class='catalogueTitle'  id='subcatemenu-container'
        il.add_value("url",response.url)
        il.add_xpath("current_price","//div[contains(@class,'pricing') or contains(@class,'price')]//span[contains(@id,'Saleprice') or contains(@class,'salePrice')]//text()")
        il.add_xpath("regular_price","//div[contains(@class,'pricing') or contains(@class,'price')]//span[contains(@id,'Regprice') or contains(@class,'regPrice')]//text()")
        limited = sel.xpath("//div[contains(@id,'FinalClearance')]").extract()
        if len(limited) > 0:
            il.add_value("availability","Limited Quantities")
        else:
            il.add_value("availability","Available")
        return il.load_item()
Example #30
 def _parse(self, response):
     l = ItemLoader(item=BookmarksItem(), response=response)
     l.add_xpath(u"name", u"/html/head/title")
     l.add_xpath(u"anchors", u"//a/@href'")
     l.add_xpath(u"description", u"/html/body/text()")
     l.add_value(u"last_updated", datetime.datetime)  # you can also use literal values
     return l.load_item()
Example #31
 def parse_article(self, response):
     article = ItemLoader(item=NewsCrawlerItem(), response=response)
     article.add_value('country', 'uk')
     article.add_value('language', 'english')
     article.nested_css('div.content__article-body').add_xpath(
         'body', './p//text()')
     article.nested_css('meta[property="og:title"]').add_xpath(
         'headline', './@content')
     # Function to parse published time to iso6801
     time_in = Compose(
         Join(), lambda v: '' if (ciso8601.parse_datetime(v) is None) else
         ciso8601.parse_datetime(v).isoformat(sep='T'))
     article.nested_css(
         'meta[property="article:published_time"]').add_xpath(
             'published_time',
             './@content',
             time_in,
         )
     article.add_xpath('category',
                       '//head/meta[@property="article:section"]/@content')
     article.add_xpath('keywords', '//head/meta[@name="keywords"]/@content')
     article.add_value('url', response.url)
     article.add_value('encoding', response.encoding)
     return article.load_item()
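The `time_in` processor above is an ordinary callable and can be exercised outside the spider; a quick standalone check (the sample timestamp is made up, and ciso8601 must be installed):

import ciso8601
from scrapy.loader.processors import Compose, Join

time_in = Compose(
    Join(), lambda v: '' if (ciso8601.parse_datetime(v) is None) else
    ciso8601.parse_datetime(v).isoformat(sep='T'))

print(time_in(['2019-05-01T10:30:00Z']))  # -> 2019-05-01T10:30:00+00:00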
Example #32
    def parse(self, response):
        self.driver.get(response.url)
        iframe = self.driver.find_element_by_css_selector('iframe')
        self.driver.switch_to_frame(iframe)
        for i in range(len(self.data)):
            if self.data.iloc[i, 0] in self.alr_crawled:
                continue
            file_name = self.data.iloc[i, 0] + ".mp3"
            text = self.data.iloc[i, 1]

            # Input text
            element = self.driver.find_element_by_xpath(
                "/html/body/form/textarea")
            element.clear()
            element.send_keys(text)

            # Adjust speed
            speed_element = self.driver.find_element_by_xpath(
                "/html/body/form/input[4]")
            self.driver.execute_script("arguments[0].value = '0.85';",
                                       speed_element)

            # Submit
            self.driver.find_element_by_xpath(
                "/html/body/form/input[5]").click()
            time.sleep(2)

            # Get link to download file
            download_element = self.driver.find_element_by_xpath(
                "/html/body/audio/source")
            loader = ItemLoader(item=WavItem(), selector=download_element)
            relative_url = download_element.get_attribute("src")
            absolute_url = response.urljoin(relative_url)
            loader.add_value("file_urls", absolute_url)
            loader.add_value("file_name", file_name)
            yield loader.load_item()
Example #33
    def qidian_parse(self, response):
        list_selector = response.xpath("// div[@class='book-mid-info']")
        for one_selector in list_selector:
            novel = ItemLoader(item=QidianHotItem(), selector=one_selector)
            novel.add_xpath("name", "h4/a/text()")
            novel.add_xpath("author", "p[1]/a[1]/text()")
            novel.add_xpath("type", "p[1]/a[2]/text()")
            novel.add_css("form", ".author span::text")
            # name = one_selector.xpath("h4/a/text()").extract()[0]
            # author = one_selector.xpath("p[1]/a[1]/text()").extract()[0]
            # type = one_selector.xpath("p[1]/a[2]/text()").extract()[0]
            # form = one_selector.xpath('p[1]/span/text()').extract()[0]
            # item = QidianHotItem()
            # item["name"] = name
            # item["author"] = author
            # item["type"] = type
            # item["form"] = form
            yield novel.load_item()

        self.current_page += 1
        if self.current_page <= 5:
            next_url = "https://www.qidian.com/rank/hotsales?style=1&page=%d" % (
                self.current_page)
            yield Request(next_url, callback=self.qidian_parse)
Example #34
    def parse(self, response):
        '''
        This function extracts all the reviews and pagination links from the reviews of above amazon link
        '''

        all_reviews_div = response.xpath('//div[@data-hook="review"]')

        for review_div in all_reviews_div:

            i = ItemLoader(AmazonreviewsItem(), review_div)

            # required  items
            i.add_xpath(
                'review_txt', './/*[@data-hook="review-body"]//text()', Join())
            i.add_xpath(
                'rating', './/*[@data-hook="review-star-rating"]//text()', re='^[0-9]')

            # extra info fields
            i.add_value('url', response.url)
            i.add_value('project', self.settings.get('BOT_NAME'))
            i.add_value('spider', self.name)
            i.add_value('server', socket.gethostname())
            # using MapCompose for preprocessing items: converting datatime object to string
            i.add_value('date', datetime.date.today(),
                        MapCompose(lambda x: x.strftime('%Y/%m/%d')))

            yield i.load_item()

    # identify the next button for looping review urls to extract more urls
        next_link = response.xpath(
            '//*[@data-hook="pagination-bar"]//li[@class ="a-last"]//a/@href').extract_first()

        if next_link:
            next_link = response.urljoin(next_link)
            # next_link = urljoin('https://www.amazon.in', next_link)
            yield scrapy.Request(url=next_link, callback=self.parse, dont_filter=True)
Example #35
    def parse_question(self, response):
        """
        获取 详细的 item
        :param response:
        :return:
        """
        question_id = response.meta.get('question_id')
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css('title', '.QuestionHeader-title::text')
        item_loader.add_css('content', '.QuestionHeader-detail')
        item_loader.add_value('url', response.url)
        item_loader.add_value('question_id', question_id)
        item_loader.add_css('answer_nums', '.List-headerText span::text')
        item_loader.add_css('comment_nums',
                            '.QuestionHeader-Comment button::text')
        item_loader.add_css('watch_user_nums', '.NumberBoard-itemValue::text')
        item_loader.add_css('topics',
                            '.QuestionHeader-topics .Popover div::text')

        question_item = item_loader.load_item()
        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
        yield question_item
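`parse_answer` and `start_answer_url` are defined elsewhere in the spider; a minimal sketch of the callback, assuming the endpoint returns JSON with `data` and `paging` keys (the field names are assumptions, and `json` must be imported):

    # Hypothetical answer callback for the request above; the JSON field
    # names ('data', 'paging', 'is_end', 'next') are assumptions.
    def parse_answer(self, response):
        ans_json = json.loads(response.text)
        for answer in ans_json['data']:
            pass  # fill a per-answer item here
        if not ans_json['paging']['is_end']:
            yield scrapy.Request(ans_json['paging']['next'],
                                 headers=self.headers,
                                 callback=self.parse_answer)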
Example #36
    def parse_item(self, response):
        item_loader = ItemLoader(item=HouseRentingLianjiaItem(),
                                 response=response)

        item_loader.add_css(field_name='title', css='div.title *::text')
        item_loader.add_value(field_name='source', value=self.name)
        item_loader.add_css(field_name='author',
                            css='div.brokerName > a.name::text')
        item_loader.add_css(field_name='image_urls',
                            css='div.thumbnail > ul > li > img::attr(src)')
        item_loader.add_css(field_name='author_link',
                            css='div.brokerName > a.name::attr(href)')
        item_loader.add_css(field_name='content',
                            css='div.introduction *::text',
                            re=r'\s*(.*)\s*')
        item_loader.add_value(field_name='source_url', value=response.url)
        item_loader.add_css(field_name='publish_time',
                            css='div.zf-room > p::text')

        item_loader.add_css(field_name='price',
                            css='div.price > span.total::text')
        item_loader.add_css(field_name='detail', css='div.zf-room *::text')

        yield item_loader.load_item()
Example #37
 def ads_parse(self, response):
     item = ItemLoader(AvitoRealEstateItem(), response)
     item.add_value('url', response.url)
     item.add_css('title',
                  'div.title-info-main h1.title-info-title span::text')
     item.add_xpath(
         'photos', "//div[contains(@class, 'gallery-img-frame')]/@data-url")
     item.add_xpath(
         'dpublic',
         "//div[@class='title-info-metadata-item-redesign']/text()")
     item.add_xpath('floor',
                    "//span[text() = 'Этаж: ']/parent::li/text()[2]")
     item.add_xpath(
         'floorcnt',
         "//span[text() = 'Этажей в доме: ']/parent::li/text()[2]")
     item.add_xpath('housetype',
                    "//span[text() = 'Тип дома: ']/parent::li/text()[2]")
     item.add_xpath(
         'roomcnt',
         "//span[text() = 'Количество комнат: ']/parent::li/text()[2]")
     item.add_xpath(
         'square',
         "//span[text() = 'Общая площадь: ']/parent::li/text()[2]")
     item.add_xpath(
         'kitchensquare',
         "//span[text() = 'Площадь кухни: ']/parent::li/text()[2]")
     item.add_xpath(
         'buildyear',
         "//span[text() = 'Год постройки: ']/parent::li/text()[2]")
     item.add_xpath(
         'authorname',
         '//div[@class="seller-info-name js-seller-info-name"]/a[1]/text()')
     item.add_xpath(
         'authorurl',
         '//div[@class="seller-info-name js-seller-info-name"]/a[1]/@href')
     yield item.load_item()
Example #38
 def parse_torrent_page(self, response):
     torrent_item = response.meta['torrent_item']
     # compute the shared values once instead of re-running the selectors
     title = str(response.css("h1 .breaker-breaker::text").get())
     link = "https://archive.org{}".format(
         response.css(".item-download-options div.format-group:nth-last-child(2) a::attr(href)").get())
     date = str(datetime.datetime.strptime(response.css("time::text").get(),
                                           "%B %d, %Y")).split(sep=" ")[0]

     loader = ItemLoader(item=torrent_item, response=response)
     loader.add_css('title', "h1 .breaker-breaker::text")
     loader.add_css('description', "#descript::text")
     loader.add_value('link', link)
     loader.add_value('date', date)

     if link.endswith('.torrent'):
         self.torrents_list.append({
             'title': title,
             'description': response.css("#descript::text").get(),
             'link': link,
             'date': date
         })
         yield loader.load_item()
     else:
         return
Example #39
    def _parse_item(self, resp):
        il = ItemLoader(item=_ISArticleItem(), response=resp)
        il.add_value('url', resp.url)
        il.add_value('time', int(time.time()))
        il.add_xpath('title', '//article//h1//text()')
        il.add_xpath(
            'ingress',
            '//section//article//p[contains(@class, "ingress")]//text()')

        pgraphs_xpath = '//article//p[contains(@class, "body")]'
        content = [
            ''.join(Selector(text=pgraph).xpath('//text()').getall())
            for pgraph in resp.xpath(pgraphs_xpath).getall()
        ]
        il.add_value('content', content)

        il.add_xpath('published',
                     '//article//div[contains(@class, "timestamp")]//text()')
        il.add_xpath('author',
                     '//article//div[contains(@itemprop, "author")]//text()')
        il.add_xpath(
            'images',
            '//section//article//div[contains(@class, "clearing-container")]')
        return il.load_item()
Example #40
    def parse(self, response):
        games = response.xpath("//div[@id='search_resultsRows']/a")
        for game in games:
            loader = ItemLoader(item=SteamItem(),
                                selector=game,
                                response=response)
            loader.add_xpath('game_url', ".//@href")
            loader.add_xpath('img_url',
                             ".//div[@class='col search_capsule']/img/@src")
            loader.add_xpath('game_name', ".//span[@class='title']/text()")
            loader.add_xpath(
                'release_date',
                "//div[@class='col search_released responsive_secondrow']/text()"
            )
            loader.add_xpath(
                'platform',
                ".//span[contains(@class,'platform_img') or @class='vr_supported']/@class"
            )
            loader.add_xpath(
                'reviews_summary',
                ".//span[contains(@class, 'search_review_summary')]/@data-tooltip-html"
            )
            loader.add_xpath(
                'discount_rate',
                ".//div[@class='col search_discount responsive_secondrow']/span/text()"
            )
            loader.add_xpath(
                'original_price',
                ".//div[contains(@class, 'search_price_discount_combined')]")
            loader.add_xpath('discounted_price', ".//@href")

            yield loader.load_item()
        next_page = response.xpath(
            "//a[@class='pagebtn' and text()='>']/@href").get()
        if next_page:
            yield scrapy.Request(url=next_page, callback=self.parse)
Example #41
    def parse_question(self, response):
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css('title',
                            '.QuestionHeader h1.QuestionHeader-title::text')
        item_loader.add_css('content', '.QuestionHeader-detail span')
        item_loader.add_value('url', response.url)

        match_obj = re.match(r'(.*www\.zhihu\.com/question/(\d+))', response.url)
        if not match_obj:
            return
        question_id = int(match_obj.group(2))
        item_loader.add_value('zhihu_id', question_id)
        item_loader.add_css('answer_num', 'h4.List-headerText span::text')
        item_loader.add_css('comments_num',
                            '.QuestionHeader-Comment button::text')
        item_loader.add_css('watch_user_num', '.NumberBoard-value::text')
        item_loader.add_css('topics',
                            '.QuestionHeader-topics .Popover div::text')

        question_item = item_loader.load_item()
        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer,
                             dont_filter=True)
        yield question_item
Example #42
 def second_food_parse(self, response):
     item = response.meta['item']
     print('>>>: %s connected successfully' % response.status)
     print(
         '------------------------------------ first-level data crawl complete -----------------------------------'
     )
     print(
         '---------------------------------- starting second-level product data crawl -----------------------------------'
     )
     print('>>> merchant name:', item['business_name'])
     print('>>> merchant URL: ', response.url)
     count = 0
     json_file = json.loads(response.text.encode('utf-8'))
     for index in json_file:
         for food in index.get('foods'):
             business_food = ItemLoader(item=MyElmBusinessFoodItem(),
                                        response=response)
             business_food.add_value('business_food', item['business_name'])
             business_food.add_value('food_name', food.get('name'))
             business_food.add_value('food_rating', food.get('rating'))
             business_food.add_value('food_month_sale',
                                     food.get('month_sales'))
             business_food.add_value(
                 'food_recent_rating',
                 food.get('specfoods')[0].get('recent_rating'))
             business_food.add_value('food_price',
                                     food.get('specfoods')[0].get('price'))
             business_food.add_value(
                 'food_original_price',
                 food.get('specfoods')[0].get('original_price'))
             yield business_food.load_item()
             count += 1
     print('---------- successfully scraped %d products ----------' % count)
     print(
         '----------------------------------#######################-----------------------------------\n'
     )
Example #43
    def parse_abs_page(self, response):
        """
        From arXiv abstract page, fetches: 
        - submisison date and time
        - all categories including cross-references
        """

        new = ItemLoader(item=ArxivItem(),
                         response=response,
                         parent=response.meta['item'])

        # all arXiv categories
        other_cat_full_cont = response.css(
            'td[class*=subjects]').extract()[0].split('</span>;')
        if len(other_cat_full_cont) > 1:
            other_cats = other_cat_full_cont[1]
            other_cats_list = [
                x.strip('\(').strip('\)')
                for x in re.findall('\(.*?\)', other_cats)
            ]
        else:
            other_cats_list = []

        main_cat = re.findall(
            '\(.*?\)',
            response.css('div.metatable span::text').extract()[0])[0].strip(
                '\(').strip('\)')
        all_cats = [main_cat] + other_cats_list
        new.add_value('all_cat', all_cats)

        # submission date
        new.add_value(
            'date',
            response.css('div.submission-history::text').extract()[-2])

        yield new.load_item()
Example #44
    def parse_item(self, response):
        l = ItemLoader(item=SpiderItem(), response=response)
        try:
            for attr in ['title', 'date', 'content']:
                function = getattr(self, 'get' + attr, None)
                if function:
                    l.add_value(attr, function(response))
                else:
                    self.logger.error('no method for %s' % attr)

        except Exception as e:
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
            pass
        finally:
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()
Example #45
    def parse_post(self, response):

        new = ItemLoader(item=FbcrawlItem(),
                         response=response,
                         parent=response.meta['item'])
        new.context['lang'] = self.lang
        new.add_xpath(
            'source',
            "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()"
        )
        new.add_xpath(
            'shared_from',
            '//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()'
        )
        #   new.add_xpath('date','//div/div/abbr/text()')
        new.add_xpath(
            'text',
            '//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()'
        )

        #check reactions for old posts
        check_reactions = response.xpath(
            "//a[contains(@href,'reaction/profile')]/div/div/text()").get()
        if not check_reactions:
            yield new.load_item()
        else:
            new.add_xpath(
                'reactions',
                "//a[contains(@href,'reaction/profile')]/div/div/text()")
            reactions = response.xpath(
                "//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href"
            )
            reactions = response.urljoin(reactions[0].extract())
            yield scrapy.Request(reactions,
                                 callback=self.parse_reactions,
                                 meta={'item': new})
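`parse_reactions` is defined elsewhere in the crawler; a minimal sketch, assuming the reactions page links carry a `reaction_type` parameter per reaction (the XPaths are assumptions):

    # Hypothetical reactions callback for the request above; note that
    # meta={'item': new} passes the loader itself, so it is used as parent.
    def parse_reactions(self, response):
        new = ItemLoader(item=FbcrawlItem(), response=response,
                         parent=response.meta['item'])
        new.context['lang'] = self.lang
        new.add_xpath('likes',
                      "//a[contains(@href,'reaction_type=1')]/span/text()")
        new.add_xpath('love',
                      "//a[contains(@href,'reaction_type=2')]/span/text()")
        yield new.load_item()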
Example #46
    def parse_item(self, response):
        l = ItemLoader(item=SpiderItem(), response=response)
        try:
            l.add_value('title', response.xpath('//span[contains(@id, "zt")]/text()').extract_first() or '')
            l.add_value('title', response.xpath('//span[@id="show_bt"]/descendant-or-self::text()').extract_first() or '')
            l.add_value('title', response.xpath('//span[@class="show_bt"]/descendant-or-self::text()').extract_first() or '')

            date = response.xpath('//span[contains(@id, "sj")]/text()').re_first(r'\d+-\d+-\d+')
            if date is None:
                date = response.xpath('//div[contains(@class, "show_date")]/text()').re_first(r'\d+-\d+-\d+')
            date = (date if date else '1970-01-01') + ' 00:00:00'

            l.add_value('date', date)

            l.add_value('source', self.website)

            l.add_value('content',
                        ''.join(response.xpath('//div[@id="Main1_txt"]/descendant-or-self::text()').extract()))
            l.add_value('content',
                        ''.join(response.xpath('//span[@id="Main1_txt"]/descendant-or-self::text()').extract()))

            l.add_value('content', ''.join(response.xpath('//span[@id="txt"]/descendant-or-self::text()').extract()))

        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
        finally:
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()
    def parse_taobao_item(self, response):
        item_loader = ItemLoader(item=TaobaoGoodsDetailItem(),
                                 response=response)
        item_loader.add_value('goods_id', response.meta['goods_id'])
        item_loader.add_xpath(
            'store_name',
            '//*[@id="J_ShopInfo"]/div/div[1]/div[1]/dl/dd/strong/a/text()')
        item_loader.add_xpath('original_price',
                              '//*[@id="J_StrPrice"]/em[2]/text()')
        item_loader.add_xpath('actual_price',
                              '//*[@id="J_PromoPriceNum"]/text()')
        item_loader.add_xpath('sales_volume',
                              '//*[@id="J_Counter"]/div/div[2]/a/@title')
        item_loader.add_xpath('postage', '//*[@id="J_WlServiceTitle"]/text()')
        # 'attribute' had an empty XPath in the original, which would raise a
        # ValueError at runtime; left commented out until a selector is known.
        # item_loader.add_xpath('attribute', '')
        item_loader.add_xpath('details', '//*[@id="attributes"]/ul/li/text()')
        item_loader.add_xpath(
            'score', '//*[@id="J_ShopInfo"]/div/div[2]/div/dl[1]/dd/a/text()')
        item_loader.add_xpath(
            'score', '//*[@id="J_ShopInfo"]/div/div[2]/div/dl[2]/dd/a/text()')
        item_loader.add_xpath(
            'score', '//*[@id="J_ShopInfo"]/div/div[2]/div/dl[3]/dd/a/text()')
        goodsDetailItem = item_loader.load_item()
        yield goodsDetailItem
Beispiel #48
    def parse_page(self, response):
        # extract the JSON response from the tag: <script id="__NEXT_DATA__" type="application/json">
        jsonresponse = json.loads(response.css('#__NEXT_DATA__::text').extract()[0])
        # access the historical data within the JSON object
        nestedJson = jsonresponse['props']['initialState']['cryptocurrency']['ohlcvHistorical']
        # retrieve the id of the crypto (a key value)
        id = [str(k) for k in nestedJson.keys()][0]
        # get the name of the respective crypto
        name = nestedJson[id]['name']
        # save the ticker symbol
        ticker = nestedJson[id]['symbol']
        # access the historical data: e.g. Open, Close, High, Low, etc.
        data = nestedJson[id]['quotes']
        for d in data:
            loader = ItemLoader(item=CryptoItem())
            loader.default_input_processor = MapCompose(str)
            loader.default_output_processor = Join('')

            for (field, path) in self.jmes_paths.items():
                loader.add_value(field, SelectJmes(path)(d))
            loader.add_value("Name", name)
            loader.add_value("Ticker", ticker)

            yield loader.load_item()
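The loop relies on a `jmes_paths` mapping defined elsewhere on the spider, pairing each item field with a JMESPath into the quote dict `d`. A plausible sketch, assuming CoinMarketCap's usual OHLCV layout (the exact paths are an assumption and may differ):

    # Hypothetical field-to-JMESPath mapping assumed by the loop above.
    jmes_paths = {
        'Date': 'time_open',
        'Open': 'quote.USD.open',
        'High': 'quote.USD.high',
        'Low': 'quote.USD.low',
        'Close': 'quote.USD.close',
        'Volume': 'quote.USD.volume',
    }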
    def parse_page(self, response, title, description, pubDate, author):
        x = response.xpath('//meta[contains(@property, "og:type")]//@content'
                           ).extract_first()

        if x == 'website':
            file_id = hashlib.md5(title.encode('utf-8')).hexdigest()

            l = ItemLoader(item=NewsItem(), response=response)
            l.add_xpath('headline',
                        '//meta[contains(@property, "og:title")]//@content')
            l.add_value('file_id', file_id)
            l.add_value('title', title)
            l.add_value('link', response.url)
            l.add_value('description', description)
            #l.add_value('author', '')
            l.add_xpath('author', '//*//a[contains(@href,"author")]//text()')
            l.add_xpath('content',
                        '//div[contains(@class, "article-body")]//p//text()')
            l.add_value('pubDate', pubDate)
            l.add_value('source', 'thetimes')

            yield l.load_item()
        else:
            return
Beispiel #50
    def parse_busi_art(self, res):
        tag = res.meta['tag']

        url = res.url
        main = res.css('.container.js-social-anchor-start')
        ci = ItemLoader(item=CNN(), selector=main)

        ci.add_value('tag', tag)
        ci.add_value('crawled_at', self.crawled_at)
        ci.add_value('url', url)

        ci.add_css('title', 'h1.article-title.speakable::text')

        ci.add_xpath('timestamp', './/span[@class="cnnDateStamp"]/text()')

        img_ = main.xpath('.//div[@id="storytext"]//img/@src').extract()

        ci.add_value('image_urls', img_)
        ci.add_css('summary', 'h2.speakable::text')

        ci.add_xpath('text', './/p/text()')
        ci.add_value('source', self.source)

        return ci.load_item()
    def parse_product(self, response):
        #  name = scrapy.Field(output_processor=TakeFirst())
        # category = scrapy.Field()
        # description = scrapy.Field()
        # specification = scrapy.Field()
        # price = scrapy.Field(output_processor=TakeFirst())
        # images = scrapy.Field()
        # image_urls = scrapy.Field()
        l = ItemLoader(item=ToystoresItem(), response=response)
        l.add_xpath(
            'name',
            "normalize-space(//h1[@class='ui header pdp']/div[1]/text())")
        l.add_xpath(
            'category',
            "normalize-space(//div[@class='ui breadcrumb mobile hidden'])")
        l.add_xpath('description',
                    "normalize-space(//div[@id='productDescription'])")
        l.add_xpath('specification',
                    "//div[@id='productSpecifications']//table//tr")
        l.add_xpath(
            'price',
            "concat(normalize-space(//div[@class='ui price']/text()),normalize-space(//div[@class='ui price']/span/text()))"
        )

        urls = response.xpath(
            "//div[contains(@class,'pdp') and contains(@class,'thumb')]//img/@src"
        ).extract()
        urls = [
            'https://www.intertoys.nl' + url.replace('thumb', 'full')
            for url in urls
        ]
        l.add_value('image_urls', urls)
        # referer = response.request.headers.get('Referer', '')
        # # referer = referer.split('/')
        # l.add_value('category', type(referer)) #[referer[-3], referer[-2]])
        return l.load_item()
Beispiel #52
    def parse_article(self, response):
        item = ItemLoader(Article())
        item.default_output_processor = TakeFirst()

        title = "".join(response.xpath('//h1//text()').getall())
        if title:
            title = title.strip()

        date = response.xpath('//span[@class="datum"]/text()').get()
        if date:
            date = date.strip()
        else:
            return

        content = response.xpath('//article//text()').getall()
        content = [text for text in content if text.strip()]
        content = "\n".join(content).strip()

        item.add_value('title', title)
        item.add_value('date', date)
        item.add_value('link', response.url)
        item.add_value('content', content)

        return item.load_item()
    def parse(self, response):
        productos = response.css('div.product-tile-inner')
        for producto in productos:
            detalles = producto.css('div.detail')
            if detalles:
                producto_loader = ItemLoader(  # loader for the item's fields
                    item=ProductoFybeca(),  # item class
                    selector=producto  # default selector
                )

                # keep only the first extracted value, not the whole list
                producto_loader.default_output_processor = TakeFirst()

                producto_loader.add_css(
                    'titulo',  # item field name
                    'a.name::text'  # CSS selector for the value
                )
                producto_loader.add_xpath(
                    'imagen',  # item field name
                    'div[contains(@class,"detail")]/a[contains(@class,"image")]/img[contains(@id,"gImg")]/@src'  # XPath for the value
                )
                yield producto_loader.load_item()
    def parse(self, response):
        
        json_resp = json.loads(response.body)
        houses = json_resp.get('cat1').get('searchResults').get('listResults')

        for house in houses:
            loader = ItemLoader(item = ZillowItem())
            loader.add_value('id',house.get('id'))
            loader.add_value('image_urls',house.get('imgSrc'))
            loader.add_value('detail_url',house.get('detailUrl'))
            loader.add_value('status_type',house.get('statusType'))
            loader.add_value('status_text',house.get('statusText'))
            loader.add_value('price',house.get('price'))
            loader.add_value('address',house.get('address'))
            loader.add_value('beds',house.get('beds'))
            loader.add_value('baths',house.get('baths'))
            loader.add_value('area_sqft',house.get('area'))
            loader.add_value('latitude',house.get('latLong').get('latitude'))
            loader.add_value('longitude',house.get('latLong').get('longitude'))
            loader.add_value('broker_name',house.get('brokerName'))
            yield loader.load_item()

        current_page = response.meta['currentPage']
        total_pages = json_resp.get('cat1').get('searchList').get('totalPages')
        
        if current_page < total_pages:
            nxt_pg = current_page + 1

            yield scrapy.Request(
                url= parse_new_url(URL,pg_num=nxt_pg),
                callback=self.parse,
                cookies=cookie_parser(),
                meta={
                    'currentPage': nxt_pg
                }
            )
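`parse_new_url` and `cookie_parser` are module-level helpers that the snippet does not show. A rough sketch of their likely shape, assuming `URL` is a search-results URL template with a page placeholder (names and details are assumptions):

    def parse_new_url(url, pg_num):
        # Hypothetical helper: rebuild the search URL for the requested page.
        return url.format(page=pg_num)

    def cookie_parser():
        # Hypothetical helper: return session cookies captured from a browser,
        # which Zillow generally requires before serving the JSON payload.
        return {'zguid': '...', 'zgsession': '...'}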
Beispiel #55
    def parse_question(self, response):
        # parse the question page and extract the concrete question item

        zhihu_id = response.meta.get("zhihu_id", "")

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)

        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover div::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", zhihu_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num",
                            ".QuestionHeaderActions button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")

        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_urls.format(zhihu_id, 20, 0),
                             headers=self.header,
                             callback=self.parse_answer)
        yield question_item
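`self.start_answer_urls` is a format string defined elsewhere on the spider; the `(zhihu_id, 20, 0)` call suggests it takes a question id, a page size, and an offset. A plausible template, assuming Zhihu's v4 answers API (the include fields are an assumption):

    # Hypothetical template matching the .format(zhihu_id, limit, offset) call above.
    start_answer_urls = ("https://www.zhihu.com/api/v4/questions/{0}/answers"
                         "?include=content,voteup_count&limit={1}&offset={2}")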
Beispiel #56
    def parse_item(self, response):

        loader = ItemLoader(item=SephoraScrapyProjectItem(), response=response)
        loader.default_output_processor = TakeFirst()

        loader.add_xpath(
            'brand_name', "//h1[@data-comp='DisplayName Flex Box']" +
            "//span[@class='css-euydo4']//text()")

        loader.add_xpath(
            'item_name', "//h1[@data-comp='DisplayName Flex Box']" +
            "//span[@class='css-0']//text()")

        loader.add_xpath('price', "//div[@data-comp='Price Box']//text()")

        loader.add_xpath('category', "//a[@class='css-1ylrown ']//text()")

        loader.add_xpath('subcategory', "//a[@class='css-1ylrown ']//text()")

        loader.add_xpath('subsubcategory',
                         "//h1[@class='css-bnsadm ']//text()")

        details_xpath = \
            self.get_detail_and_ingredient_xpath(response, 'Details')
        if details_xpath:
            loader.add_xpath('details', details_xpath)

        ingredient_xpath = \
            self.get_detail_and_ingredient_xpath(response, 'Ingredients')
        if ingredient_xpath:
            loader.add_xpath('ingredients', ingredient_xpath)

        image_url = self.get_image_url(response)
        loader.add_value('image_url', image_url)

        yield loader.load_item()
Beispiel #57
    def parse_item(self, response):
        """Parse a page with an apartment.

        @url http://www.merkur-berlin.de/?page_id=39&showExpose=1&exposeID=926C081BECA043C9BE7756469D94722F
        @returns items 1 1
        @scrapes url title address rooms size warm_rent description location
        """
        self.shutdown_on_error()
        item = ItemLoader(ApartmentItem(), response=response)
        item.add_value('url', response.url)
        item.add_xpath('title', '//h4[@class="entry-title"]/text()')
        item.add_xpath('address', '//address/text()')

        for field, info in dict(rooms='Rooms', size='AreaLiving', warm_rent='PriceWarmmiete',
                                cold_rent='Price').items():
            item.add_xpath(field, '//div[@class="infotables"]//tr[@id="infotable_{info}"]/td[@class='
                                  '"infotable_value"]/text()'.format(info=info))

        for field, h2 in dict(description='Objekt', equipment='Ausstattung',
                              location='Lage', other='Mehr Angebote').items():
            item.add_xpath(field, '//div[@class="infoblock"]/h2[starts-with(normalize-space(.),'
                                  ' "{h2}")]/following-sibling::p/text()'.format(h2=h2))

        return item.load_item()
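The docstring here doubles as a Scrapy contract: running `scrapy check` on this spider fetches the `@url`, asserts that the callback returns exactly one item (`@returns items 1 1`), and verifies that the item populates every field listed under `@scrapes`.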
    def parse(self, response):
        sel = Selector(response)
        noticias = sel.xpath(
            '//div[@class="view-content"]/div[@class="posts"]')
        for i, elem in enumerate(noticias):
            item = ItemLoader(Noticia(), elem)  # load the item

            # Populate the item with XPath expressions
            item.add_xpath('titular', './/h2/a/text()')
            item.add_xpath('descripcion', './/p/text()')
            item.add_value('id', i)
            yield item.load_item()  # return the populated item

        # METHOD #2: USING BEAUTIFUL SOUP

        # soup = BeautifulSoup(response.body)
        # contenedor_noticias = soup.find_all(class_='view-content')
        # id = 0
        # for contenedor in contenedor_noticias:
        #   noticias = contenedor.find_all(class_='posts', recursive = False)
        #   for noticia in noticias:
        #     item = ItemLoader(Noticia(), response.body)
        #     titular = noticia.find('h2').text.replace('\n', '').replace('\r', '')
        #     descripcion = noticia.find('p')
        #     if (descripcion):
        #       item.add_value('descripcion', descripcion.text.replace('\n', '').replace('\r', ''))
        #     else:
        #       item.add_value('descripcion', 'N/A')
        #     item.add_value('titular', titular)
        #     item.add_value('id', id)
        #     id += 1
        #     yield item.load_item()


# EXECUTION
# scrapy runspider 4_eluniverso.py -o resultados.csv -t csv
Beispiel #59
    def parse_article(self, response):
        item = ItemLoader(Article())
        item.default_output_processor = TakeFirst()

        title = response.xpath('//h1[@class="page-title"]/text()').get()
        if title:
            title = title.strip()

        date = response.xpath('//span[@class="date"]/text()').get()
        if date:
            date = datetime.strptime(date.strip(), '%d/%m/%Y')
            date = date.strftime('%Y/%m/%d')

        content = response.xpath(
            '//span[@class="description"]//text()').getall()
        content = [text for text in content if text.strip()]
        content = "\n".join(content).strip()

        item.add_value('title', title)
        item.add_value('date', date)
        item.add_value('link', response.url)
        item.add_value('content', content)

        return item.load_item()
Beispiel #60
    def collectDetailInfo(self, response: scrapy.http.Response):
        loader = ItemLoader(item=BuddhacrawlerItem(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('hostUrl', self.host)
        loader.add_value('city', self.city)
        loader.add_xpath(
            'articleTitle',
            "//div[@class='newsBox']/div[@class='bt']/h2[@class='bt1']/text()")
        loader.add_xpath('articleTag', "//div[@class='newslink']/a/text()")
        loader.add_xpath('articleText',
                         "//div[@class='newsBox']/div[@class='newsCon']")
        loader.add_xpath(
            'publishTime',
            "//div[@class='newsBox']/div[@class='bt']/div[@class='bt2']/text()"
        )
        loader.add_value('coverPictureUrl', '')
        loader.add_xpath(
            'articlePictureUrls',
            "//div[@class='newsBox']/div[@class='newsCon']/div[@class='pgc-img']/img/@src"
        )
        loader.add_value('articleVideoUrls', [])
        loader.add_value('createTime',
                         datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        return loader.load_item()