def parse(self, response): jsonresponse = json.loads(response.body_as_unicode()) # yield jsonresponse for c in jsonresponse['list']: loader = ItemLoader(item=StockCubesItem()) loader.default_input_processor = MapCompose(str) loader.default_output_processor = Join(' ') for (field, path) in self.jmes_paths.items(): loader.add_value(field, SelectJmes(path)(c)) item = loader.load_item() ownerLoader = ItemLoader(item=OwnerItem()) ownerLoader.default_input_processor = MapCompose(str) ownerLoader.default_output_processor = Join(' ') for (field, path) in self.owner_jmes_paths.items(): ownerLoader.add_value(field, SelectJmes(path)(c['owner'])) owner = ownerLoader.load_item() item['owner'] = owner yield item # 开始提取用户信息 uid = owner['id'] # https://stock.xueqiu.com/v5/stock/portfolio/stock/list.json?size=1000&category=3&uid=6626771620&pid=-24(创建的组合) createdCubeUrl = f'https://stock.xueqiu.com/v5/stock/portfolio/stock/list.json?size=1000&category=3&uid={uid}&pid=-24' # 请求用户创建的组合 # 通过cb_kwargs的方式,给解析函数传递参数 yield scrapy.Request( createdCubeUrl, self.parseCubeList, headers=self.send_headers, cb_kwargs=dict(uid=uid, screen_name=owner['screen_name'])) # 请求用户关注的组合,这个地方不去传递uid和screen_name信息,这种情况下,通过请求网页去解析, # TODO 请求网页的速度超慢,想办法优化,开启多线程? followedCubeUrl = f'https://stock.xueqiu.com/v5/stock/portfolio/stock/list.json?size=1000&category=3&uid={uid}&pid=-120' yield scrapy.Request(followedCubeUrl, self.parseCubeList, headers=self.send_headers) # 组合信息: # https://xueqiu.com/cubes/quote.json?code=ZH976766,SP1034535,SP1012810,ZH1160206,ZH2003755,ZH1996976,ZH1079481,ZH1174824,ZH1079472,SP1040320 page = jsonresponse['page'] maxPage = jsonresponse['maxPage'] if (page < maxPage): url = f'{self.cube_discover_url}{page+1}' yield scrapy.Request(url, headers=self.send_headers)
def parse(self, response, **kwargs):
    """Load one item from the page using the spider's XPath field map.

    :param response: the downloaded page
    :param kwargs: extra callback arguments (unused)
    :return: yields the populated item
    """
    loader = ItemLoader(item=self.item_loader_cls,
                        response=response,
                        spider_name=self.name)
    loader.default_input_processor = default_input_processor
    loader.default_output_processor = default_output_processor
    # One add_xpath call per configured field.
    for name, expression in self.item_loader_xpath.items():
        loader.add_xpath(field_name=name, xpath=expression)
    # The looked-up word is the last path segment of the request URL.
    loader.add_value('word', response.request.url.split("/")[-1])
    yield loader.load_item()
def parse_page(self, response):
    """Yield one JobPost per row of the search-results page."""
    for job in response.css('.srp_container > .row'):
        loader = ItemLoader(JobPost(), selector=job)
        loader.default_input_processor = MapCompose(lambda value: value.strip())
        loader.default_output_processor = TakeFirst()
        # field name -> CSS selector, relative to the row.
        css_map = (
            ('job_title', '.content > ul > .desig::text'),
            ('experience_required', '.content > .exp::text'),
            ('location', '.content > .loc > span::text'),
            ('company_name', '.content > .orgRating > .org::text'),
            ('job_description_url', 'div::attr(data-url)'),
            ('key_skills',
             '.content > .more > div[class = "desc"] >span[class = "skill"]::text'),
            ('job_description', '.content > .more > span[class = "desc"]::text'),
            ('salary', '.other_details > .salary::text'),
            ('posted_by', '.other_details > .rec_details > .rec_name::text'),
            ('posted_on',
             '.other_details > .rec_details > span[class = "date"]::text'),
        )
        for field, css in css_map:
            loader.add_css(field, css)
        yield loader.load_item()
def parse(self, response):
    """Route seminar pages: expand list pages, otherwise scrape seminar rows.

    Pages whose URL contains /displaySeminarList/ are listing pages; each
    linked page is requested with this same callback. Any other page is a
    company's seminar table and yields one SeminarItem per row.
    """
    match = re.search('/displaySeminarList/', response.url)
    if match:
        urls = response.xpath('//div[@class="internList splitEntry"]//@href').extract()
        for url in urls:
            url = response.urljoin(url)
            yield scrapy.Request(url, self.parse)
    else:
        table = response.xpath(self.seminar_list_xpath)
        # corpId comes from the page URL's query string (parse_qs -> list).
        corpId = parse_qs(urlparse(response.url).query)['corpId']
        # FIX: dropped the unused enumerate() index from the original loop.
        for semi in table:
            loader = ItemLoader(SeminarItem(), semi)
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            loader.add_value('companyid', corpId)
            loader.add_xpath('name', '//div[@id="headerWrap"]//h3/text()')
            # FIX: regex patterns are now raw strings - identical patterns,
            # but no invalid-escape-sequence warnings on Python 3.
            loader.add_xpath('date', './/td[@class="date"]/text()',
                             re=r'\d+\/\d+\/\d+')
            loader.add_xpath('time', './/td[@class="time"]/text()')
            loader.add_xpath('area', './/td[@class="area"]/text()')
            loader.add_xpath('place', './/td[@class="place"]/text()')
            # Latitude/longitude are embedded in a "mycom_loc|lat,lon" link.
            loader.add_xpath('loc_n', './/td[@class="place"]//a',
                             re=r'mycom_loc\|(\d+\/\d+\/\d+\.\d+)\,\d+\/\d+\/\d+\.\d+')
            loader.add_xpath('loc_e', './/td[@class="place"]//a',
                             re=r'mycom_loc\|\d+\/\d+\/\d+\.\d+\,(\d+\/\d+\/\d+\.\d+)')
            loader.add_xpath('target', './/td[@class="target"]/text()')
            yield loader.load_item()
def parse_subjects(response):
    """Extract the DC.subject.keyword meta content into SUBJECTS."""
    loader = ItemLoader(item=TnaWebsiteItem(), response=response)
    loader.default_input_processor = MapCompose(lambda value: value.split(),
                                                replace_escape_chars)
    loader.default_output_processor = Join()
    loader.add_xpath('SUBJECTS', '//meta[@name="DC.subject.keyword"]/@content')
    return loader.load_item()
def save_to_csv(self, response, **meta):
    """Build an NmSosSpiderItem loader from the scraped meta fields.

    Returns the ItemLoader itself (not a loaded item) - callers are
    expected to call load_item() or add further values.
    """
    il = ItemLoader(item=NmSosSpiderItem(), response=response)
    il.default_input_processor = MapCompose(lambda v: v.strip(),
                                            remove_tags,
                                            replace_escape_chars)
    # item field name -> key in the incoming meta dict. Some field names
    # contain spaces and 'peroid_of_duration' is misspelled; both appear to
    # be part of the existing item/caller contract - confirm before renaming.
    field_map = (
        ('company_name', 'company_name'),
        ('entity_id', 'business_id'),
        ('dba_name', 'dba_name'),
        ('company_subtype', 'company_subtype'),
        ('non_profit_indicator', 'non_profit_indicator'),
        ('location_address_string', 'location_address_string'),
        ('status', 'status'),
        ('creation_date', 'creation_date'),
        ('domestic state', 'domestic_state'),
        ('period of duration', 'peroid_of_duration'),
        ('business purpose', 'business_purpose'),
        ('mixed_subtype', 'officer_title'),
        ('mixed_name', 'officer_name'),
        ('person_address_string', 'officer_address'),
    )
    for field, key in field_map:
        il.add_value(field, meta[key])
    il.add_value('permit_type', 'business_license')
    il.add_value('sourceName', 'NM_SOS')
    il.add_value(
        'url',
        'https://portal.sos.state.nm.us/BFS/online/CorporationBusinessSearch'
    )
    return il
def parse_corporate_bodies(response):
    """Collect CORPORATE_BODIES text from the two link styles on the page."""
    loader = ItemLoader(item=TnaWebsiteItem(), response=response)
    loader.default_input_processor = MapCompose(lambda value: value.split(),
                                                replace_escape_chars)
    loader.default_output_processor = Join()
    for xpath in ('//a[contains(@class, "bigblack")]//text()',
                  '//p[contains(@class, "bodytext")]//a/text()'):
        loader.add_xpath('CORPORATE_BODIES', xpath)
    return loader.load_item()
def parse_description(response):
    """Collect DESCRIPTION text from the page heading and tab body lists."""
    loader = ItemLoader(item=TnaWebsiteItem(), response=response)
    loader.default_input_processor = MapCompose(lambda value: value.split(),
                                                replace_escape_chars)
    loader.default_output_processor = Join()
    for xpath in ('//h1[contains(@class, "parchment")]//text()',
                  '//td[contains(@class, "tabbody")]//ul/li/text()'):
        loader.add_xpath('DESCRIPTION', xpath)
    return loader.load_item()
def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.allocine.fr/films
    @returns items
    @scrapes title
    """
    sel = Selector(response)
    # One AllocineMovies item per movie node on the listing page.
    for movie_sel in sel.xpath(self.movies_list_xpath):
        loader = ItemLoader(AllocineMovies(), selector=movie_sel)
        # Strip trailing commas, split on whitespace, drop escape chars.
        loader.default_input_processor = MapCompose(
            lambda text: text.strip(',').split(), replace_escape_chars)
        loader.default_output_processor = Join()
        for field, xpath in self.item_fields.items():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def parse_item(self, response):
    """Populate a BwtTestItem by looking up each declared field's selector."""
    loader = ItemLoader(item=BwtTestItem(), response=response)
    loader.default_input_processor = MapCompose(str.strip)
    loader.default_output_processor = Join()
    for field_name in BwtTestItem.fields:
        loader.add_xpath(field_name, self.__get_selector(field_name))
    yield loader.load_item()
def parse_book(self, response):
    """Load one BookItem from a book detail page."""
    loader = ItemLoader(item=BookItem(), response=response)
    loader.default_input_processor = MapCompose(remove_tags)
    # Cover image: first slide of the carousel, made absolute.
    cover = response.css(".item.active > img::attr(src)").extract_first()
    loader.add_value("image_urls", response.urljoin(cover))
    # field name -> CSS selector; each takes the first match only.
    css_map = (
        ("title", ".col-sm-6.product_main > h1"),
        ("price", ".price_color"),
        ("upc", ".table.table-striped > tr:nth-child(1) > td"),
        ("product_type", ".table.table-striped > tr:nth-child(2) > td"),
        ("tax", ".table.table-striped > tr:nth-child(5) > td"),
        ("stock", ".table.table-striped > tr:nth-child(6) > td"),
        ("reviews", ".table.table-striped > tr:nth-child(7) > td"),
        ("rating", ".star-rating::attr(class)"),
    )
    for field, css in css_map:
        loader.add_css(field, css, TakeFirst())
    return loader.load_item()
def save_to_csv(self, response, **meta):
    """Build and load an IlDupageFoodInspectionsSpiderItem from meta fields.

    Returns the loaded item. meta['company_name'] is split into a legal
    name and a DBA alias via self._getDBA.
    """
    il = ItemLoader(item=IlDupageFoodInspectionsSpiderItem(),
                    response=response)
    il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags,
                                            replace_escape_chars)
    # FIX: the original called self._getDBA(meta['company_name']) twice;
    # call it once and reuse the (name, dba) pair.
    company_raw = meta['company_name']
    name_and_dba = self._getDBA(company_raw) if company_raw else ('', '')
    il.add_value('sourceName', 'IL_Dupage_Food_Inspections')
    il.add_value('url', 'https://eco.dupagehealth.org/#/pa1/search')
    # Fall back to the bare state when no address was scraped.
    il.add_value(
        'location_address_string', meta['location_address_string']
        if meta['location_address_string'] else 'IL')
    il.add_value('abate_date', meta['abate_date'])
    il.add_value('inspection_date', meta['inspection_date'])
    il.add_value('company_name', name_and_dba[0])
    il.add_value('violation_type', meta['violation_type'])
    il.add_value('violation_description', meta['violation_description'])
    il.add_value('dba_name', name_and_dba[1])
    il.add_value('inspection_type', meta['inspection_type'])
    il.add_value('violation_date', meta['violation_date'])
    il.add_value('abate_status', meta['abate_status'])
    il.add_value('inspection_subtype', meta['inspection_subtype'])
    il.add_value('violation_rule', meta['violation_rule'])
    return il.load_item()
def parse_description(response):
    """Extract DESCRIPTION text from the archive list paragraphs."""
    loader = ItemLoader(item=TnaMediaWebcrawlerItem(), response=response)
    loader.default_input_processor = MapCompose(lambda value: value.split(),
                                                replace_escape_chars)
    loader.default_output_processor = Join()
    loader.add_xpath('DESCRIPTION', '//ul[contains(@id, "archive")]//p//text()')
    return loader.load_item()
def parse(self, response):
    """Scrape thread rows from a forum listing page and follow pagination.

    Yields one Thread item per row, then a request for the next page while
    a rel="next" link exists.
    """
    selector = Selector(response)
    # iterate over threads
    for thread in selector.xpath(self.thread_list_xpath):
        print("Thread: " + str(thread))
        loader = ItemLoader(item=Thread(), selector=thread)
        # define processors
        loader.default_input_processor = MapCompose(str.strip)
        loader.default_output_processor = Join()
        # FIX: the original constructed a scrapy.Request for the thread's
        # detail page (callback=self.parse_details) but never yielded it,
        # so it was dead code; removed. If detail scraping is intended,
        # build the URL from './/a[@class="title"]/@href' and yield the
        # Request here.
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.items():
            print("Load: " + field + " " + xpath)
            loader.add_xpath(field, xpath)
        yield loader.load_item()
    # follow next page links
    if response.css("a[rel='next']::attr(href)").extract():
        next_page = 'http://www.clubsnap.com/forums/' + response.css(
            "a[rel='next']::attr(href)").extract()[0]
        print("NEXT PAGE: " + next_page)
        yield scrapy.Request(next_page, callback=self.parse, dont_filter=True)
def parse(self, response):
    """Yield one IndeedOffer per offer node on the results page."""
    # Debug output left in place (observable behavior).
    print("response", response)
    print("type response", type(response))
    selector = Selector(response)
    print("selector", selector)
    # One item per offer matched by the spider's list XPath.
    for offer_sel in selector.xpath(self.offer_list_xpath):
        loader = ItemLoader(IndeedOffer(), selector=offer_sel)
        loader.default_input_processor = MapCompose(str.strip)
        loader.default_output_processor = Join()
        for field, xpath in self.item_fields.items():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def parse_description(response):
    """Collect DESCRIPTION text across the site's various page layouts."""
    loader = ItemLoader(item=TnaWebsiteItem(), response=response)
    loader.default_input_processor = MapCompose(lambda value: value.split(),
                                                replace_escape_chars)
    loader.default_output_processor = Join()
    description_xpaths = (
        '//div[contains(@class, "breather")]/p/text()',
        '//div[contains(@class, "breather")]//ul/li/text()',
        '//table[contains(@class, "table table-striped")]//tbody/tr/td/text()',
        '//div[contains(@class, "accordion-content")]//p/text()',
        '//div[contains(@class, "two-thirds pad-horizontal-large margin-none margin-bottom-large")]//p/text()',
        '//div[contains(@class, "video-box")]//p/text()',
        '//div[contains(@class, "entry-content")]//h3/text()',
        '//div[contains(@class, "entry-content clearfix")]//p/text()',
    )
    for xpath in description_xpaths:
        loader.add_xpath('DESCRIPTION', xpath)
    return loader.load_item()
def get_player_info(self, response):
    """Load one NFL_Player_2015 item from a player profile page.

    The first roster <li> is expected to contain "<number> <position>";
    name and team come from fixed XPaths.
    """
    loader = ItemLoader(item=NFL_Player_2015(), response=response)
    loader.default_input_processor = MapCompose(unicode.strip)
    loader.default_output_processor = Join()
    # FIX: the original first called .extract()[0] and immediately threw
    # the result away by re-extracting the full list; that dead statement
    # could raise IndexError on pages missing the node, so it was removed.
    number_and_position = response.xpath(
        '//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[1]/text()').extract()
    # extract() returns a list; guard against the empty-page case instead
    # of crashing on [0].
    if isinstance(number_and_position, list) and number_and_position:
        parts = number_and_position[0].split()
        number = parts[0] if parts else ''
        position = parts[1] if len(parts) > 1 else ''
    else:
        number = ''
        position = ''
    loader.add_value('number', number)
    loader.add_value('position', position)
    loader.add_xpath('name', '//*[@id="content"]/div[3]/div[2]/h1/text()')
    loader.add_xpath('team', '//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[3]/a/text()')
    yield loader.load_item()
def parse_content(response):
    """Extract CONTENT text from the entry-content paragraphs."""
    loader = ItemLoader(item=TnaWebsiteItem(), response=response)
    loader.default_input_processor = MapCompose(lambda value: value.split(),
                                                replace_escape_chars)
    loader.default_output_processor = Join()
    loader.add_xpath('CONTENT',
                     '//div[contains(@class, "entry-content")]/p//text()')
    return loader.load_item()
def parse_title(response):
    """Build the TITLE field: a fixed prefix plus the page's top title span."""
    loader = ItemLoader(item=TnaWebsiteItem(), response=response)
    loader.default_input_processor = MapCompose(lambda value: value.split(),
                                                replace_escape_chars)
    loader.default_output_processor = Join()
    # Fixed prefix (trailing space is intentional; values are joined).
    loader.add_value('TITLE', 'Design Registers ')
    loader.add_xpath('TITLE', '//span[contains(@class, "btprojtoptitle")]/text()')
    return loader.load_item()
def populate_item(self, response):
    """Template callback: load a MySpiderItem from the response."""
    loader = ItemLoader(item=MySpiderItem(), response=response)
    loader.default_input_processor = MapCompose(remove_tags)
    # TODO: map fields here, e.g. loader.add_css("field", "selector")
    yield loader.load_item()
def parse_catalogue_reference(response):
    """Extract the catalogue reference (catRef span) into CATALOGUE_REFERENCE."""
    loader = ItemLoader(item=TnaWebsiteItem(), response=response)
    loader.default_input_processor = MapCompose(lambda value: value.split(),
                                                replace_escape_chars)
    loader.default_output_processor = Join()
    loader.add_xpath('CATALOGUE_REFERENCE',
                     '//span[contains(@class, "catRef")]/text()')
    return loader.load_item()
def scrape_product(self, response):
    """Template callback: load MyItem, keeping the first match per field."""
    loader = ItemLoader(item=MyItem(), response=response)
    loader.default_input_processor = MapCompose(remove_tags)
    loader.default_output_processor = TakeFirst()
    loader.add_css("my_field", "selector")
    return loader.load_item()
def populate_item(self, selector):
    """Template callback: load a MySpiderItem from a pre-selected node."""
    loader = ItemLoader(item=MySpiderItem(), selector=selector)
    loader.default_input_processor = MapCompose(remove_tags)
    loader.default_output_processor = TakeFirst()
    # TODO: map fields here with add_css / add_xpath.
    return loader.load_item()
def parse_shop(self, response):
    """Scrape title, price, and discount price from a product page."""
    loader = ItemLoader(item=ExclusiveScraperItem(), response=response)
    loader.default_input_processor = MapCompose(remove_tags,
                                                replace_escape_chars,
                                                strip_html5_whitespace)
    loader.default_output_processor = TakeFirst()
    css_map = (
        ("title", ".grid__item.large-up--one-third.product__selector-container > h1"),
        ("price", "span.product__price.js-product-price"),
        ("discount_price", "span.product__price.product__discount.js-product-price"),
    )
    for field, css in css_map:
        loader.add_css(field, css)
    yield loader.load_item()
def parse_description(response):
    """Collect DESCRIPTION text from the btprojtxt spans and paragraphs."""
    loader = ItemLoader(item=TnaWebsiteItem(), response=response)
    loader.default_input_processor = MapCompose(lambda value: value.split(),
                                                replace_escape_chars)
    loader.default_output_processor = Join()
    for xpath in ('//span[contains(@class, "btprojtxt")]/text()',
                  '//p[contains(@class, "btprojtxt")]/text()'):
        loader.add_xpath('DESCRIPTION', xpath)
    return loader.load_item()
def parse_keywords(response):
    """Collect KEYWORDS from the keywords meta tag in its casing variants."""
    loader = ItemLoader(item=TnaWebsiteItem(), response=response)
    loader.default_input_processor = MapCompose(lambda value: value.split(),
                                                replace_escape_chars)
    loader.default_output_processor = Join()
    # Three casing variants observed across the site's pages.
    for xpath in ('//meta[@name="keywords"]/@content',
                  '//meta[@name="Keywords"]/@content',
                  '//META[@NAME="Keywords"]/@CONTENT'):
        loader.add_xpath('KEYWORDS', xpath)
    return loader.load_item()
def parse(self, response):
    """Template callback: load MyItem, keeping the first match per field."""
    loader = ItemLoader(item=MyItem(), response=response)
    loader.default_input_processor = MapCompose(remove_tags)
    loader.default_output_processor = TakeFirst()
    # TODO: map fields here with add_css / add_xpath.
    return loader.load_item()
def parse_depth_chart(self, response):
    """Yield one NFL_Team_2015 item holding the team's division and name."""
    loader = ItemLoader(item=NFL_Team_2015(), response=response)
    # NOTE: `unicode` is Python-2-only; kept to preserve behavior.
    loader.default_input_processor = MapCompose(unicode.strip)
    loader.default_output_processor = Join()
    for field, xpath in (("division", '//*[@id="sub-branding"]/div[2]/text()'),
                         ("name", '//*[@id="sub-branding"]/h2/a/b/text()')):
        loader.add_xpath(field, xpath)
    yield loader.load_item()
def parse_cube_info(self, response, symbol_list):
    """Yield one CubeItem per requested symbol from the quote JSON payload."""
    payload = json.loads(response.body_as_unicode())
    for symbol in symbol_list:
        loader = ItemLoader(item=CubeItem())
        loader.default_input_processor = MapCompose(str)
        loader.default_output_processor = Join(' ')
        # Each field is pulled from the symbol's record via JMESPath.
        for field, path in self.jmes_paths.items():
            loader.add_value(field, SelectJmes(path)(payload[symbol]))
        yield loader.load_item()
def save_csv(self, response, data_dic):
    """Fill an AlFoodInspectionsSpiderItem loader from data_dic.

    Returns the ItemLoader itself (callers load/extend it further).
    """
    loader = ItemLoader(item=AlFoodInspectionsSpiderItem(), response=response)
    # Strip, drop tags, collapse runs of whitespace, remove escape chars.
    loader.default_input_processor = MapCompose(
        lambda value: value.strip(),
        remove_tags,
        lambda value: re.sub(r'\s+', ' ', value) if value else '',
        replace_escape_chars,
    )
    loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    loader.add_value('sourceName', 'AL_Food_Inspections')
    loader.add_value('url',
                     'http://www.alabamapublichealth.gov/foodscores/index.html')
    for key, value in data_dic.items():
        loader.add_value(key, value)
    return loader
def parse_raw(self, symbol, raw):
    """Build a BranchSettlementItem from a raw row aligned with Meta.fields.

    raw must supply a value per declared field, positionally.
    """
    loader = ItemLoader(item=BranchSettlementItem())
    loader.default_input_processor = MapCompose(str, str.strip)
    loader.default_output_processor = TakeFirst()
    loader.add_value('date', self.date)
    loader.add_value('code', symbol)
    # Index-based access keeps the original IndexError on short rows.
    for idx, field in enumerate(BranchSettlementItem.Meta.fields):
        loader.add_value(field, raw[idx])
    return loader.load_item()
def parse(self, response):
    """Template callback: load MyItem, then POST it onward for enrichment."""
    loader = ItemLoader(item=MyItem(), response=response)
    loader.default_input_processor = MapCompose(remove_tags)
    # TODO: map fields here with add_css / add_xpath.
    # Hand the partially-loaded item to populate_field via request meta.
    yield FormRequest("POST_URL",
                      formdata={'parameter': 'p'},
                      meta={'item': loader.load_item()},
                      callback=self.populate_field)
def getDescription(response):
    """Collect DESCRIPTION text across the breather/column page layouts."""
    loader = ItemLoader(item=TnaWebsiteItem(), response=response)
    loader.default_input_processor = MapCompose(lambda value: value.split(),
                                                replace_escape_chars)
    loader.default_output_processor = Join()
    description_xpaths = (
        '//div[contains(@class, "breather")]/p//text()',
        '//div[contains(@class, "breather")]/ul/li//text()',
        '//div[contains(@id, "col starts-at-full ends-at-two-thirds clr")]/p//text()',
        '//div[contains(@id, "col starts-at-full ends-at-half clr")]/p//text()',
        '//div[contains(@class, "col starts-at-full ends-at-half clr")]/p//text()',
    )
    for xpath in description_xpaths:
        loader.add_xpath('DESCRIPTION', xpath)
    return loader.load_item()
def parse_auction_item(self, response):
    """Yield one AuctionItems record loaded via the auction_item_fields map."""
    loader = ItemLoader(AuctionItems(), response=response)
    loader.default_input_processor = MapCompose(lambda v: v.split(),
                                                replace_escape_chars)
    loader.default_output_processor = Join()
    # FIX: dict.iteritems() does not exist on Python 3; items() performs
    # the same read-only iteration on both versions.
    for field, xpath in auction_item_fields.items():
        loader.add_xpath(field, xpath)
    yield loader.load_item()
def parse_bids(self, response):
    """Yield one BidItems record per bid node on the page."""
    selector = Selector(response)
    # FIX: Selector.select() is the long-removed pre-1.0 Scrapy API;
    # .xpath() is the supported equivalent. iteritems() -> items() for
    # Python 3 compatibility.
    for bid in selector.xpath(self.bid_list_xpath):
        loader = ItemLoader(BidItems(), selector=bid)
        loader.default_input_processor = MapCompose(lambda v: v.split(),
                                                    replace_escape_chars)
        loader.default_output_processor = Join()
        for field, xpath in auction_bid_fields.items():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def parse(self, response):
    """Scrape bus trips from a Selenium-rendered departures page.

    Drives self.driver to the page, waits for it to load, clicks the
    "load more" button until it disappears, then loads one BusTrip item
    per trip row from the final page source. Python-2-only code (print
    statement, `unicode`, dict.iteritems).
    """
    # Strips a leading/trailing '$' from price strings (price_in below).
    def strip_dollar(x):
        return x.strip('$')
    self.driver.get(response.url)
    try:
        # Wait for the departures progress bar to finish.
        # NOTE(review): '/div/[@style=...]' looks like a malformed XPath
        # (slash before the predicate) - confirm it ever matches.
        WebDriverWait(self.driver, 15).until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@id="depart-container"]/div[2]/div[1]/div/[@style="width: 0%;"]')))
    except TimeoutException:
        print 'Page load time out'
        pass
    # Keep clicking the "load more" button until it no longer appears
    # (timeout) or is no longer visible.
    while True:
        try:
            try:
                WebDriverWait(self.driver, 15).until(
                    EC.presence_of_element_located(
                        (By.XPATH, '//*[@id="depart-container"]/div/div/div/button')))
            except TimeoutException:
                break
            next = self.driver.find_element_by_xpath(
                '//*[@id="depart-container"]/div/div/div/button')
            next.click()
        except ElementNotVisibleException:
            break
    # Parse the fully-expanded page source with Scrapy's Selector.
    for trips in Selector(
            text=self.driver.page_source).xpath(self.trips_list_xpath):
        loader = ItemLoader(BusTrip(), selector=trips)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # Price field additionally drops the '$' sign.
        loader.price_in = MapCompose(strip_dollar)
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        # Trip date is the last path segment of the request URL.
        dateoftrip = str(response.url).split("/")[-1]
        loader.add_value('dateoftrip', dateoftrip.decode('unicode-escape'))
        yield loader.load_item()
def parse_answers(self, response):
    """Extract Zhihu answers from the page and normalize each item.

    Loads one ZhihuAnswer per node matched by answers_list_xpath, then
    post-processes the loaded item: HTML-escapes text fields, defaults
    missing ones, converts the vote count to int, absolutizes links, and
    stamps today's date. Python-2-only code (`unicode`, dict.iteritems).
    """
    # use selector to extract answers
    selector = Selector(response)
    # iterate over answers
    for answer in selector.xpath(self.answers_list_xpath):
        loader = ItemLoader(item=ZhihuAnswer(), selector=answer)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        item = loader.load_item()
        # convert the full text of the answer into HTML-safe ASCII
        item["answer"] = item["answer"].encode('ascii', 'xmlcharrefreplace')
        # if the summary has an image, convert it the same way
        if "summary_img" in item:
            item["summary_img"] = item["summary_img"].encode('ascii', 'xmlcharrefreplace')
        else:
            item['summary_img'] = ""
        # change vote count to an integer
        item["vote"] = int(item["vote"])
        # in case of anonymous authors, fill in the site's placeholder name
        if "author" not in item:
            item["author"] = u'匿名用户'
        # complete relative links to absolute URLs
        item["question_link"] = u"http://www.zhihu.com" + item["question_link"]
        if "author_link" in item:
            item["author_link"] = u"http://www.zhihu.com" + item["author_link"]
        else:
            item["author_link"] = ""
        # add the date when scraped
        item["date"] = date.today()
        yield item