def get_car_url(self, response):
    html_body = response.body
    page = response.request.meta.get('page', 0)
    city = response.request.meta.get('city', 0)
    cookie_old = response.request.meta.get('cookie_old', '')
    hxs = Selector(text=html_body)
    cars = hxs.xpath('//ul[@class="car-info-ul clearfix"]/li/a/@href').extract()
    for car in cars:
        car_id = re.compile(r'/info/v/(\d+)').findall(car)
        if car_id:
            car_id = car_id[0]
        else:
            continue
        url = 'http://ic5u.com%s' % car
        item = {'car_id': car_id, 'city': city, 'url': url}
        facade.update_item('taoche0515', {'car_id': car_id, 'city': city}, item)
    # A full list page carries 12 cars; if we got 12, schedule the next page.
    if len(cars) == 12:
        next_page = page + 1
        next_url = 'http://ic5u.com/index.php/Home/CarSearch/index/p/%d.html' % next_page
        facade.update_item('taoche_linkbase',
                           {'url': next_url, 'city': city},
                           {'url': next_url, 'cookie_old': cookie_old, 'page': page, 'city': city})
        logging.info('Saved next-page link for city %s, page %d: %s' % (city, page, next_url))
        return Request(next_url,
                       meta={'q_flag': 2, 'city': city, 'page': next_page, 'cookie_old': cookie_old},
                       cookies={'PHPSESSID': cookie_old})
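# Illustrative only: a minimal sketch of how get_car_url is assumed to be wired up as a
# Scrapy callback. The entry URL, the city value, and the empty PHPSESSID are assumptions
# based on the meta keys the callback reads; they are not taken from the original code.
def start_requests(self):
    start_url = 'http://ic5u.com/index.php/Home/CarSearch/index/p/1.html'  # hypothetical entry page
    yield Request(start_url,
                  callback=self.get_car_url,
                  meta={'q_flag': 2, 'city': 'beijing', 'page': 1, 'cookie_old': ''},
                  cookies={'PHPSESSID': ''})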
def get_car_url(self, response):
    html_body = response.body
    result = json.loads(html_body)
    data_node = result.get('data', {})
    page = data_node.get('page', 0)
    total = data_node.get('total', 0)
    third_ids = data_node.get('third_ids', '')
    round_id = data_node.get('round_ids', '')
    tags = data_node.get('tags', '')
    tags = urllib.quote(tags.encode('utf-8'))
    area = data_node.get('area', '')
    # On the first page, derive the remaining list pages (10 items per page).
    if page == 1 and total > 10:
        for pn in range(2, total / 10 + 2):
            new_url = ('https://www.innotree.cn/ajax/bigdata/company/getList'
                       '?type=&page=%d&size=10&first_ids=&second_ids=&third_ids=%s'
                       '&round=%s&area=%s&tags=%s') % (pn, third_ids, round_id, area, tags)
            logging.info('Discovered new link [%s] from: %s' % (new_url, response.url))
            facade.save_new_yinguoshu(new_url)
    data = data_node.get('list', [])
    if len(data) == 0:
        facade.update_item('yinguoshu_no_item', {'from_url': response.url}, {'from_url': response.url})
    for item in data:
        item['from_url'] = response.url
        item['crawl_time'] = datetime.datetime.now().strftime("%Y%m%d %H:%M:%S")
        facade.update_item('yinguoshu_item', {'cid': item['cid']}, item)
        logging.info('save item:{"cid":"%s"}' % item['cid'])
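# Illustrative only: the page fan-out performed above, pulled into a standalone helper so
# the Python 2 integer-division arithmetic is easy to check. The helper name and signature
# are hypothetical; only the page-size assumption (10 items per page) comes from the code.
def _expected_extra_pages(total, size=10):
    # e.g. total=25, size=10 -> [2, 3]; together with page 1 this covers 30 >= 25 results.
    if total <= size:
        return []
    return list(range(2, total / size + 2))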
def process_step4(self, response, page_item):
    body = response.body_as_unicode().encode('utf-8')
    real_url = response.url
    logging.info('Step 4: resolved the encrypted link via redirect, real_url: %s' % real_url)
    if 'www.baidu.com/link?url' in real_url:
        logging.warning('Resolution failed, link is still a Baidu-encrypted url: %s' % real_url)
        return []
    # Save the search result.
    item = {
        'real_url': real_url,
        'crawl_time': math.floor(time.time()),
        'query': response.request.meta.get('query', '')
    }
    self.process_tst_info(target=item, src=response.request.meta)
    logging.debug('Decoded-link item: %s' % item)
    # Persist the decoded url keyed by (query, real_url).
    where = {
        'query': item.get('query', ''),
        'real_url': item.get('real_url', '')
    }
    facade.update_item(self.decode_url_table_name, where, item)
    for parser in self.parsers:
        items = parser.process_page(page_item,
                                    html_body=page_item.get('html_body'),
                                    p_facade=facade)
        for item in items:
            # Pass-through fields from the request meta.
            self.process_tst_info(target=item, src=response.request.meta)
            parser.save_item(item)
            logging.debug('Parsed full item:\n %s' %
                          ' \n '.join(['%s:%s' % (k, v) for k, v in item.items()]))
    return []
def parse(self, response):
    info_content = response.body
    url = response.url
    info_json = json.loads(info_content)
    info_json['crawl_time'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    facade.update_item('koubei_autohome_app', {'from_url': url}, info_json)
def save_item(self, item):
    # The where clause must not be None or empty (an empty filter would update every
    # document); anything without usable replace_keys falls back to a fixed _id.
    where = {}
    for key in self.replace_keys:
        where[key] = item[key]
    if len(where) == 0:
        where['_id'] = ObjectId('123456789012')
        logging.error('get_update_where error! replace_keys not configured, or replace_key is not in must_keys')
    old_item = facade.find_one_item(self.table_name, where)
    if old_item is None:
        item['first_crawl_time'] = datetime.datetime.now().strftime("%Y%m%d %H:%M:%S")
        if 'selled' in item:
            item['sell_time'] = datetime.datetime.now().strftime("%Y%m%d %H:%M:%S")
    else:
        # Older records may carry the misspelled 'frist_crawl_time' field; fall back to it.
        item['first_crawl_time'] = old_item['first_crawl_time'] \
            if 'first_crawl_time' in old_item else old_item['frist_crawl_time']
        if 'selled' in item and 'sell_time' not in old_item:
            item['sell_time'] = item['crawl_time']
        if 'selled' in item and 'sell_time' in old_item:
            item['sell_time'] = old_item['sell_time']
    logging.debug('Saving parsed item. from url:%s where:%s' % (item['from_url'], where))
    facade.update_item(self.table_name, where, item)
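# Illustrative only: a minimal sketch of the MongoDB-backed facade interface these methods
# rely on (update_item / find_one_item), inferred from the call sites above. The class name,
# the pymongo wiring, and the upsert semantics are assumptions, not the project's actual code.
import pymongo

class MongoFacade(object):
    def __init__(self, uri='mongodb://localhost:27017', db_name='spider'):
        self.db = pymongo.MongoClient(uri)[db_name]

    def update_item(self, table_name, where, item):
        # Upsert: replace the matching document, or insert it if none exists.
        self.db[table_name].update(where, item, True)

    def find_one_item(self, table_name, where):
        return self.db[table_name].find_one(where)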
def drop_item(self, url):
    # If the rule carries a non-empty drop_item, the item is obsolete: mark the stored
    # record with drop_item: 'yes' and refresh its crawl_time.
    car_id = re.findall(r'infoid=(\d+)', url)[0]
    where = {'car_id': car_id}
    old_item = facade.find_one_item(self.table_name, where)
    # find_one_item returns a single document (or None), not a cursor.
    if old_item is None:
        return
    old_item['drop_item'] = 'yes'
    old_item['crawl_time'] = datetime.datetime.now().strftime("%Y%m%d %H:%M:%S")
    facade.update_item(self.table_name, where, old_item)
def extract_links_withrule(self, response, dups, rule):
    logging.debug('Running extract-link. rule:%s' % rule)
    linkinfos = []
    if rule.get('xpath', None) is None or len(rule.get('xpath')) < 1:
        logging.debug('Rule has no xpath, skipping extraction')
        return linkinfos
    if rule['xpath'] == 'regex':
        # Special rule: xpath == 'regex' means the 'allow' field is a regex applied to the
        # unescaped page body instead of a restrict_xpaths expression.
        html_parser = HTMLParser.HTMLParser()
        txt = html_parser.unescape(response.body)
        urls = re.findall(rule['allow'], txt)
        logging.info('Regex extracted [%d] item-urls, url[%s] xpath:%s' % (len(urls), response.url, rule['xpath']))
        for url in urls:
            if url in dups:
                continue
            dups.add(url)
            linkinfo = {'url': url, 'from_url': response.url, 'spider_name': self.name,
                        'status': 0, 'schedule_cnt': 0, 'failed_cnt': 0,
                        'in_time': math.floor(time.time()), 'update_time': math.floor(time.time()),
                        'dont_redirect': self.dont_redirect}
            linkinfos.append(linkinfo)
            logging.debug('target link:%s from_url:%s' % (url, response.url))
    else:
        sgml = LinkExtractor(allow_domains=self.allowed_domain,
                             restrict_xpaths=(rule['xpath']),
                             allow=rule['allow'], deny=rule['deny'], tags=('a',))
        links = sgml.extract_links(response)
        logging.info('Xpath extracted [%d] item-urls, url[%s] xpath:%s' % (len(links), response.url, rule['xpath']))
        for link in links:
            if link.url in dups:
                continue
            dups.add(link.url)
            linkinfo = {'url': link.url, 'from_url': response.url, 'spider_name': self.name,
                        'status': 0, 'schedule_cnt': 0, 'failed_cnt': 0,
                        'in_time': math.floor(time.time()), 'update_time': math.floor(time.time()),
                        'dont_redirect': self.dont_redirect,
                        'parser_ranks': rule.get('parser_ranks', []),
                        'link_type': rule.get('link_type', '')}
            if rule.get('rule', None) is not None:
                linkinfo['rule'] = rule.get('rule')
            linkinfos.append(linkinfo)
            logging.debug('target link:%s from_url:%s' % (link.url, response.url))
        logging.debug('[%d] target links extracted from url:%s' % (len(linkinfos), response.url))
    if len(linkinfos) == 0:
        spider_warning = {'spider_name': self.name, 'rule': str(rule), 'url': response.request.url}
        facade.update_item('spider_warning', {'rule': str(rule), 'url': response.request.url}, spider_warning)
        logging.warning('spider_name:%s, target rule:%s, url:%s extracted 0 new links, which is unexpected' % (self.name, str(rule), response.request.url))
    return linkinfos
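# Illustrative only: two example rule dicts in the shape extract_links_withrule expects,
# inferred from the keys it reads. The xpaths, regexes, and link_type values are made up.
XPATH_RULE = {
    'xpath': '//ul[@class="list"]',   # restrict_xpaths region for the LinkExtractor
    'allow': r'/info/v/\d+',          # allow pattern for extracted hrefs
    'deny': r'/login',                # deny pattern
    'parser_ranks': [1],              # passed through on each linkinfo
    'link_type': 'detail',
}
REGEX_RULE = {
    'xpath': 'regex',                 # sentinel: apply 'allow' as a regex to the page body
    'allow': r'http://ic5u\.com/info/v/\d+\.html',
    'deny': '',
}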