def process_item(self, item, spider):
    """Persist LY ticket items into the raw_ly database.

    Dispatches on item.name:
      - 'ticket': upsert ticket info keyed by pid.
      - 'ticket-delete': currently a no-op (removal logic is disabled).
      - 'vs-ticket-info': attach ticket info to the view-spot record.

    Returns the item unchanged so downstream pipelines still run.
    (Fix: removed the unused local ``test = item``.)
    """
    con_ticket = get_mongodb('raw_ly', 'Ticket', profile='mongo-raw')
    con_vs = get_mongodb('raw_ly', 'ViewSpot', profile='mongo-raw')
    if item.name == 'ticket':
        con_ticket.update({'pid': item['pid']},
                          {'$set': {'pid': item['pid'],
                                    'lyId': item['lyId'],
                                    'info': item['info'],
                                    'stockList': item['stock_list']}},
                          upsert=True)
    elif item.name == 'ticket-delete':
        # Removal is intentionally disabled for now.
        pass
        # con_ticket.remove({'lyId': item['ly_id'], 'pid': {'$nin': item['id_list']}}, multi=True)
    elif item.name == 'vs-ticket-info':
        con_vs.update({'lyId': item['ly_id']},
                      {'$set': {'ticketInfo': item['info'], 'crawl': True}})
    return item
def populate_tasks(self):
    """Queue merge tasks transferring processed Baidu data into target collections.

    Depending on --type, either localities ('mdd') or POIs are read from
    proc_baidu and merged into geo.LocalityTransfer / poi.ViewSpotTransfer.
    """
    # Pick source/target collections based on the task type.
    col_src, db_tar, col_tar = ('BaiduLocality', 'geo', 'LocalityTransfer') if self.args.type == 'mdd' else (
        'BaiduPoi', 'poi', 'ViewSpotTransfer')
    col = get_mongodb('proc_baidu', col_src, profile='mongo-raw')
    col_target = get_mongodb(db_tar, col_tar, profile='mongo')
    cursor = col.find({})
    cursor.skip(self.args.skip)
    if self.args.limit:
        cursor.limit(self.args.limit)
    for val in cursor:
        def func(entry=val):  # bind val now: avoids the late-binding closure bug
            surl = entry['source']['baidu']['surl'] if 'surl' in entry['source']['baidu'] else ''
            self.log(u'Processing: zhName=%s, sid=%s, surl=%s' % (
                entry['zhName'], entry['source']['baidu']['id'], surl))
            self.resolve_targets(entry)
            # Merge into the existing target document when one is present.
            target = col_target.find_one({'source.baidu.id': entry['source']['baidu']['id']})
            if not target:
                target = {}
            for m in self.mergers:
                m.process(entry, target)
            if target:
                # Mark as enabled for both front-ends before saving.
                target['taoziEna'] = True
                target['lxpEna'] = True
                col_target.save(target)
        self.add_task(func)
def run(self): from utils.database import get_mongodb col = get_mongodb(self.args.db, self.args.col, profile='mongo') cursor = col.find({'images': {'$ne': None}}, snapshot=True) cursor.skip(self.args.skip) if self.args.limit: cursor.limit(self.args.limit) col_im = get_mongodb('imagestore', 'Images', profile='mongo') cursor = col.find({'images': {'$ne': None}}, {'images': 1}).sort('_id', pymongo.ASCENDING) cursor.skip(self.args.skip) if self.args.limit: cursor.limit(self.args.limit) self.total = 0 super(ImageValidator, self).run() for entry in cursor: def func(val=entry): modified = False if 'images' not in val or not val['images']: return for img in val['images']: key = img['key'] match = re.search(r'[0-9a-f]{32}', key) if not match: continue new_key = match.group() # 使用new_key去imagestore中查询 ret = col_im.find_one({'$or': [{'key': new_key}, {'url_hash': new_key}]}) if not ret: print 'Image not exists: %s' % key continue if img['key'] != new_key: modified = True img['key'] = new_key if 'url' in img: modified = True img.pop('url') if 'cropHint' in img: ch = img['cropHint'] if ch['bottom'] == 0 and ch['right'] == 0: modified = True img.pop('cropHint') if modified: print 'Updating %s' % val['_id'] col.update({'_id': val['_id']}, {'$set': {'images': val['images']}}) self.add_task(func) gevent.sleep(0) self._join()
def parse(self, entry):
    """Resolve the POI a comment entry refers to, then store its parsed contents."""
    poi_dbs = {
        'vs': get_mongodb('poi', 'ViewSpot', 'mongo'),
        'dining': get_mongodb('poi', 'Restaurant', 'mongo'),
        'shopping': get_mongodb('poi', 'Shopping', 'mongo')
    }

    def fetch_poi_item(mfw_id, poi_type):
        # Map a mafengwo id to our own item id within one POI collection.
        doc = poi_dbs[poi_type].find_one({'source.mafengwo.id': mfw_id}, {'_id': 1})
        if not doc:
            return None
        return {'type': poi_type, 'item_id': doc['_id']}

    matched = None
    for poi_type in ('vs', 'dining', 'shopping'):
        matched = fetch_poi_item(entry['poi_id'], poi_type)
        if matched:
            break
    if not matched:
        return

    for item_type, item_data in self.parse_contents(entry['contents']):
        if item_type != 'image':
            item_data['source'] = {'mafengwo': {'id': entry['comment_id']}}
            item_data['type'] = matched['type']
            item_data['itemId'] = matched['item_id']
        self.update(item_type, item_data)
def build_city_map(self, refresh_redis=False):
    """
    Build a mapping from cities in our own database to Dianping cities.

    The normalized-city lookup is cached in redis under
    'dianping:norm_city_<name>'; pass refresh_redis=True to rebuild the cache.
    Ambiguous (duplicate) or unknown city names are logged and skipped.
    """
    redis = self.engine.redis
    city_map = {}
    col_dp = get_mongodb('raw_dianping', 'City', 'mongo-raw')
    col_loc = get_mongodb('geo', 'Locality', 'mongo')
    for city_item in col_dp.find({}):
        city_name = city_item['city_name']
        redis_key = 'dianping:norm_city_%s' % city_name
        norm_city_info = None
        if refresh_redis or not redis.exists(redis_key):
            candidates = list(
                col_loc.find({'alias': city_name}, {'_id': 1}))
            if len(candidates) > 1:
                self.log('Duplicate cities found for %s' % city_name, logging.WARN)
            elif not candidates:
                self.log('No city found for %s' % city_name, logging.WARN)
            else:
                norm_city_info = candidates[0]
                redis.set(redis_key, norm_city_info)
        else:
            # HACK: the cached value is the repr() of a dict containing an
            # ObjectId, restored via eval(). Safe only as long as this redis
            # instance is fully trusted — never feed it external data.
            exec 'from bson import ObjectId'
            norm_city_info = eval(redis.get(redis_key))
        if norm_city_info:
            city_id = norm_city_info['_id']
            city_map[city_id] = city_item
    self.city_map = city_map
def look_up_vs(self):
    """
    Match LY view spots against our own database: first by exact alias,
    then (as a fallback) via Baidu travel suggestions plus a distance check.
    """
    conn_taozi = get_mongodb('poi', 'ViewSpot', 'mongo')
    conn_raw_ly = get_mongodb('raw_ly', 'ViewSpot', 'mongo-raw')
    coon_mapping = get_mongodb('poi', 'LyMapping', 'mongo')
    for vs in self.vs_generate():
        def map_vs(vs_info=vs):  # bind vs now: avoids the late-binding closure bug
            ly_id = int(vs_info['lyId'])
            ly_name = vs_info['lyName']
            # Direct name match against our alias index.
            res = conn_taozi.find_one({'alias': ly_name}, {'_id': True, 'zhName': True})
            if res is not None:
                coon_mapping.update({'itemId': res['_id']},
                                    {'$set': {'itemId': res['_id'], 'zhNameLxp': res['zhName'],
                                              'zhNameLy': ly_name, 'lyId': ly_id}},
                                    upsert=True)
                conn_raw_ly.update({'lyId': ly_id}, {'$set': {'mapOk': True}}, upsert=False)
            else:
                # Fall back to a Baidu suggestion lookup.
                suggs = self.get_baidu_sug(ly_name, None)
                if len(suggs):
                    target = suggs[0]  # take only the first suggestion
                    # type_code >= 6 presumably filters to view-spot-like
                    # suggestion types — TODO confirm against the sug format.
                    # Distance threshold is in km.
                    if target['type_code'] >= 6 and self.cal_dist(
                            vs_info['lat'], vs_info['lng'],
                            target['lat'], target['lng']) < 50:
                        res = conn_taozi.find_one({'source.baidu.id': target['sid']},
                                                  {'_id': True, 'zhName': True})
                        if res is not None:
                            # mapEstimated marks this as a fuzzy (not exact) match.
                            coon_mapping.update({'itemId': res['_id']},
                                                {'$set': {'itemId': res['_id'],
                                                          'zhNameLxp': res['zhName'],
                                                          'zhNameLy': ly_name, 'lyId': ly_id,
                                                          'mapEstimated': True}},
                                                upsert=True)
                            conn_raw_ly.update({'lyId': ly_id},
                                               {'$set': {'mapEstimated': True}}, upsert=False)
        self.add_task(map_vs)
def sync_images(self):
    """
    Sync the image editing state (isDone / images fields) from the main
    ViewSpot collection to its CMS counterpart.
    :return:
    """
    src_col = get_mongodb('poi', 'ViewSpot', 'mongo')
    dst_col = get_mongodb('poi', 'ViewSpot', 'mongo-cms')
    for doc in src_col.find({}, {'isDone': 1, 'images': 1}):
        def task_func(entry=doc):
            # Fields present in the source are copied; absent ones are unset.
            to_set = dict((field, entry[field])
                          for field in ('isDone', 'images') if field in entry)
            to_unset = dict((field, 1)
                            for field in ('isDone', 'images') if field not in entry)
            update_ops = {}
            if to_set:
                update_ops['$set'] = to_set
            if to_unset:
                update_ops['$unset'] = to_unset
            if update_ops:
                dst_col.update({'_id': entry['_id']}, update_ops)
        self.add_task(task_func)
def process_item(self, item, spider):
    """Upsert a Qyer city item into geo.Locality.

    Matching strategy: first by qyer source id; failing that, by alias plus
    proximity (within 100km) inside the same country; otherwise a new
    document is created. (Fix: 'assert country != None' -> 'is not None'.)
    """
    col_loc = get_mongodb('geo', 'Locality', profile='mongodb-general')
    # Resolve the country, with a class-level cache to avoid repeated lookups.
    country_code = item['country_code']
    if country_code not in QyerCityProcPipeline.country_map:
        col_country = get_mongodb('geo', 'Country', profile='mongodb-general')
        country = col_country.find_one({'code': country_code})
        assert country is not None
        QyerCityProcPipeline.country_map[country_code] = country
    else:
        country = QyerCityProcPipeline.country_map[country_code]
    city_id = item['city_id']
    city = col_loc.find_one({'source.qyer.id': city_id})
    if not city:
        # Fall back to alias + proximity match within the same country.
        city = col_loc.find_one({'alias': item['zh_name'].lower(),
                                 'location': {
                                     '$near': {'type': 'Point',
                                               'coordinates': [item['lng'], item['lat']]}},
                                 'country._id': country['_id']})
        if city:
            dist = utils.haversine(city['location']['coordinates'][0],
                                   city['location']['coordinates'][1],
                                   item['lng'], item['lat'])
            if dist > 100:
                # Too far away to be the same city: start from scratch.
                city = {}
    if not city:
        city = {}
    city['enName'] = item['en_name']
    zh_name = item['zh_name']
    short_name = utils.get_short_loc(zh_name)
    city['zhName'] = short_name
    # Merge aliases from the existing record and the item; normalize to lowercase.
    alias1 = city['alias'] if 'alias' in city and city['alias'] else []
    alias2 = item['alias'] if 'alias' in item and item['alias'] else []
    alias1.extend(alias2)
    alias1.append(short_name)
    city['alias'] = list(set(filter(lambda val: val, [tmp.lower().strip() for tmp in alias1])))
    source = city['source'] if 'source' in city else {}
    source['qyer'] = {'id': item['city_id'], 'url': item['url']}
    city['source'] = source
    city['country'] = {'id': country['_id'], '_id': country['_id']}
    for k in ('enName', 'zhName'):
        if k in country:
            city['country'][k] = country[k]
    city['level'] = 2
    city['desc'] = item['desc']
    city['imageList'] = item['imageList']
    city['images'] = []
    city['location'] = {'type': 'Point', 'coordinates': [item['lng'], item['lat']]}
    city['abroad'] = country_code != 'CN'
    city['isHot'] = item['is_hot'] > 0
    col_loc.save(city)
    return item
def add_image(image_url):
    """Register image_url as a candidate (unless it is already in Images); return its key."""
    from hashlib import md5
    url_hash = md5(image_url).hexdigest()
    image = {'url_hash': url_hash, 'key': url_hash, 'url': image_url}
    col_im = get_mongodb('imagestore', 'Images', 'mongo')
    already_stored = col_im.find_one({'key': image['key']}, {'_id': 1})
    if not already_stored:
        col_cand = get_mongodb('imagestore', 'ImageCandidates', 'mongo')
        col_cand.update({'key': image['key']}, {'$set': image}, upsert=True)
    return image['key']
def populate_tasks(self):
    """Queue tasks fetching Mafengwo search suggestions for every Baidu POI/locality name."""
    from urllib import quote
    col_raw1 = get_mongodb('raw_baidu', 'BaiduPoi', 'mongo-raw')
    col_raw2 = get_mongodb('raw_baidu', 'BaiduLocality', 'mongo-raw')
    col = get_mongodb('raw_mfw', 'MfwSug', 'mongo-raw')
    query = json.loads(self.args.query) if self.args.query else {}
    for col_raw in [col_raw1, col_raw2]:
        cursor = col_raw.find(query, {
            'ambiguity_sname': 1,
            'sname': 1,
            'sid': 1
        }).skip(self.args.skip)
        if self.args.limit:
            cursor.limit(self.args.limit)
        for val in cursor:
            def func(entry=val):  # bind val now: avoids the late-binding closure bug
                # De-duplicate the (possibly identical) ambiguous/plain names.
                for name in set(
                        filter(lambda v: v.strip(), [
                            entry[k] for k in ['ambiguity_sname', 'sname']
                        ])):
                    self.log(u'Parsing: %s, id=%s' % (name, entry['sid']))
                    url = 'http://www.mafengwo.cn/group/ss.php?callback=j&key=%s' % quote(
                        name.encode('utf-8'))
                    # The md5 of the url is the cache key within MfwSug.
                    key = md5(url).hexdigest()
                    if col.find_one({'key': key}, {'_id': 1}):
                        # The record already exists
                        self.log(u'Already exists, skipping: %s, id=%s' % (name, entry['sid']))
                        continue
                    response = self.request.get(url)
                    if not response:
                        self.log(
                            u'Failed to query url: %s, %s, id=%s' % (url, name, entry['sid']),
                            logging.ERROR)
                        continue
                    col.update({'key': key}, {
                        'key': key,
                        'body': response.text,
                        'name': name,
                        'url': url
                    }, upsert=True)
            self.add_task(func)
def parse(self, response):
    """Build CityItem objects from GeoNames records and emit geocoding requests.

    Fix: the alias seed read ``item['alias']`` while testing ``'alias' in
    city`` — item is a fresh CityItem, so any city that actually had aliases
    raised KeyError. It now reads ``city['alias']``.
    """
    col = get_mongodb('raw_data', 'GeoNames', profile='mongodb-crawler')
    countries = response.meta['country']
    # Populated places only.
    query = {'featureClass': 'P', 'population': {'$gt': 0}}
    if countries:
        if len(countries) > 1:
            query['$or'] = [{'country': tmp.upper()} for tmp in countries]
        else:
            query['country'] = countries[0].upper()
    for entry in col.find(query):
        # city = col.find_one({'_id': entry['_id']})
        city = entry
        item = CityItem()
        item['city_id'] = city['_id']
        item['en_name'] = city['asciiName']
        item['zh_name'] = city['enName']
        item['lat'] = city['lat']
        item['lng'] = city['lng']
        item['population'] = city['population']
        item['level'] = city['featureCode']
        # Collect aliases: existing aliases plus names, all lowercased.
        s = set([tmp.lower().strip() for tmp in (city['alias'] if 'alias' in city else [])])
        s.add(city['asciiName'].lower())
        s.add(city['enName'].lower())
        for val in city['altName']:
            s.add(val.lower())
        item['alias'] = list(s)
        # Resolve country info, cached per code; unknown codes are remembered
        # so they are only logged once.
        country_code = city['country']
        item['country_code'] = country_code
        if country_code in GeoNamesProcSpider.country_map:
            country = GeoNamesProcSpider.country_map[country_code]
        elif country_code not in GeoNamesProcSpider.missed_countries:
            col_country = get_mongodb('geo', 'Country', profile='mongodb-general')
            country = col_country.find_one({'code': country_code})
            if not country:
                self.log('MISSED COUNTRY: %s' % country_code, log.WARNING)
                GeoNamesProcSpider.missed_countries.add(country_code)
                continue
            else:
                GeoNamesProcSpider.country_map[country_code] = country
        else:
            continue
        item['en_country'] = country['enName'] if 'enName' in country else None
        item['zh_country'] = country['zhName'] if 'zhName' in country else None
        yield Request(url='http://maps.googleapis.com/maps/api/geocode/json?address=%s,%s&sensor=false' % (
            item['en_name'], item['en_country']),
            callback=self.parse_geocode,
            meta={'item': item, 'lang': 'zh'},
            headers={'Accept-Language': 'zh-CN'},
            dont_filter=True)
def process_item(self, item, spider):
    """Store raw Mafengwo Q&A items, upserting questions by q_id and answers by a_id."""
    if not self.is_handler(item, spider):
        return item
    data = item['data']
    if item['type'] == 'question':
        col_name, id_field = 'MafengwoQuestion', 'q_id'
    else:
        col_name, id_field = 'MafengwoAnswer', 'a_id'
    col = get_mongodb('raw', col_name, 'mongo-raw')
    col.update({id_field: data[id_field]}, {'$set': data}, upsert=True)
    return item
def resolve_targets(item):
    """Resolve mafengwo crumb ids into our own location ids.

    Fills data['targets'] with the resolved object ids, data['country'] with
    the first crumb that matches a country, and data['locality'] with the
    deepest crumb that resolves to a locality (taken as the city).

    (Fix: removed a no-op loop that copied the country dict into itself.)
    """
    data = item['data']
    col_mdd = get_mongodb('geo', 'Locality', 'mongodb-general')
    col_country = get_mongodb('geo', 'Country', 'mongodb-general')
    country_flag = False
    crumb_list = data.pop('crumbIds')
    crumb = []
    for cid in crumb_list:
        ret = col_mdd.find_one({'source.mafengwo.id': cid},
                               {'_id': 1, 'zhName': 1, 'enName': 1})
        if not ret and not country_flag:
            # Not a locality: try resolving it as a country (at most once).
            ret = col_country.find_one({'source.mafengwo.id': cid},
                                       {'_id': 1, 'zhName': 1, 'enName': 1, 'code': 1})
            if ret:
                # Attach the match as the country field.
                data['country'] = ret
                country_flag = True
        if ret:
            crumb.append(ret['_id'])
    data['targets'] = crumb
    # Walk the crumbs from the end: the first locality hit is the city.
    city = None
    for idx in xrange(len(crumb_list) - 1, -1, -1):
        cid = crumb_list[idx]
        ret = col_mdd.find_one({'source.mafengwo.id': cid},
                               {'_id': 1, 'zhName': 1, 'enName': 1})
        if ret:
            city = {'_id': ret['_id']}
            for key in ['zhName', 'enName']:
                if key in ret:
                    city[key] = ret[key]
            break
    if city:
        data['locality'] = city
def resolve_targets(data):
    """
    Resolve the Baidu sids in data['locList'] into our own documents.

    The first entry matching a country (by alias) becomes data['country'];
    subsequent entries are matched as localities by Baidu sid. Also sets
    data['abroad'] (None when no country could be resolved).

    :param data: dict that may carry a 'locList' of {'sname', 'sid'} items
    :return: None (data is modified in place)
    """
    if 'locList' not in data:
        return
    col_country = get_mongodb('geo', 'Country', 'mongo')
    col_mdd = get_mongodb('geo', 'Locality', 'mongo')

    def func(loc_list):
        """
        Look up loc_list items in order and return (country, target_list).
        Entries are matched as countries until the first hit, then as
        localities; misses are skipped.
        """
        target_list = []
        country = None
        country_flag = True
        for item in loc_list:
            if country_flag:
                ret = col_country.find_one({'alias': item['sname']}, {'zhName': 1, 'enName': 1})
            else:
                ret = col_mdd.find_one({'source.baidu.id': item['sid']}, {'zhName': 1, 'enName': 1})
            if not ret:
                continue
            if country_flag:
                country = ret
                country_flag = False
            target_list.append(ret)
        return country, target_list

    country, target_list = func(data.pop('locList'))
    if country:
        data['country'] = country
        # Mainland China plus Macau/Hong Kong/Taiwan count as domestic.
        data['abroad'] = country['zhName'] not in [u'中国', u'澳门', u'香港', u'台湾']
    else:
        data['abroad'] = None
    if target_list:
        data['locList'] = target_list
def process_item(self, item, spider):
    """Upsert Yahoo city/state info into the raw YahooCityInfo collection."""
    col = get_mongodb('raw_data', 'YahooCityInfo', profile='mongodb-crawler')
    # Cities (level 2) are keyed by woeid; states (level 1) by country+state.
    if 'woeid' in item:
        data = col.find_one({'woeid': item['woeid'], 'level': 2})
    else:
        data = col.find_one({'country': item['country'],
                             'state': item['state'],
                             'level': 1})
    if not data:
        data = {}
    for field in ('country', 'state', 'city', 'abroad', 'coords', 'woeid', 'level'):
        if field in item:
            data[field] = item[field]
    col.save(data)
    return item
def parse(self, response):
    """Emit reverse-geocoding requests for every Qyer POI in the requested countries."""
    col_raw = get_mongodb('raw_data', 'QyerSpot', profile='mongodb-crawler')
    for country in response.meta['countries']:
        # Iterate over all POIs belonging to this country.
        for entry in col_raw.find({'country_info.country_engname': country}):
            lat = entry['poi_lat']
            lng = entry['poi_lng']
            if not lat or not lng:
                continue
            item = QyerPoiItem()
            for field in entry:
                if field in item.fields:
                    item[field] = entry[field]
            # Reverse-geocode to discover which city the POI belongs to.
            url = 'http://maps.googleapis.com/maps/api/geocode/json?address=%f,%f' % (lat, lng)
            yield Request(url=url, meta={'item': item},
                          callback=self.parse_geocode, dont_filter=True)
def process_item(self, item, spider):
    """Persist LY ticket items into the raw_ly database.

    Dispatches on item.name:
      - 'ticket': upsert ticket info keyed by pid.
      - 'ticket-delete': currently a no-op (removal logic is disabled).
      - 'vs-ticket-info': attach ticket info to the view-spot record.

    Returns the item unchanged so downstream pipelines still run.
    (Fix: removed the unused local ``test = item``.)
    """
    con_ticket = get_mongodb('raw_ly', 'Ticket', profile='mongo-raw')
    con_vs = get_mongodb('raw_ly', 'ViewSpot', profile='mongo-raw')
    if item.name == 'ticket':
        con_ticket.update({'pid': item['pid']},
                          {'$set': {'pid': item['pid'],
                                    'lyId': item['lyId'],
                                    'info': item['info'],
                                    'stockList': item['stock_list']}},
                          upsert=True)
    elif item.name == 'ticket-delete':
        # Removal is intentionally disabled for now.
        pass
        # con_ticket.remove({'lyId': item['ly_id'], 'pid': {'$nin': item['id_list']}}, multi=True)
    elif item.name == 'vs-ticket-info':
        con_vs.update({'lyId': item['ly_id']},
                      {'$set': {'ticketInfo': item['info'], 'crawl': True}})
    return item
def crawl_vs(self):
    """
    Crawl view spots via the LY API, iterating over cities.
    :return:
    """
    conn = get_mongodb('raw_ly', 'ViewSpot', 'mongo-raw')
    city_list = self.crawl_city()
    self.logger.info('-=-=-=-=length: %s' % len(city_list))
    for ct in city_list:
        def func(city=ct):  # bind ct now: avoids the late-binding closure bug
            self.logger.info('================%s==============' % city)
            # location_id ranges: <=35 province, <=404 city, else country —
            # presumably LY's id partitioning; TODO confirm against their API.
            if int(city['location_id']) <= 35:
                query_obj = {'clientIp': '127.0.0.1', 'provinceId': int(city['location_id'])}
            elif int(city['location_id']) <= 404:
                query_obj = {'clientIp': '127.0.0.1', 'cityId': int(city['location_id'])}
            else:
                query_obj = {'clientIp': '127.0.0.1', 'countryId': int(city['location_id'])}
            raw_xml = self.scenerylist().send_request(query_obj)
            node = etree.fromstring(raw_xml)
            responce_code = node.xpath('//rspCode/text()')[0]
            # rspCode '0000' is treated as success.
            if '0000' == str(responce_code):
                total_page = node.xpath('//sceneryList')[0].attrib['totalPage']
                # Page through the full result set for this region.
                for page in xrange(int(total_page)):
                    temp_query = copy.deepcopy(query_obj)
                    temp_query['page'] = page + 1
                    raw_xml = self.scenerylist().send_request(temp_query)
                    node = etree.fromstring(raw_xml)
                    vs_nodes = node.xpath('//sceneryName')
                    for vs in vs_nodes:
                        name = vs.text.encode('utf-8')
                        ly_id = int(vs.xpath('../sceneryId')[0].text)
                        self.logger.info('----%s-----%s' % (name, ly_id))
                        conn.update({'lyId': ly_id},
                                    {'$set': {'lyId': ly_id, 'lyName': name}}, upsert=True)
        self.add_task(func)
def build_cursor(self):
    """Build a cursor over restaurants lacking a Dianping binding, limited to mapped cities."""
    col = get_mongodb('poi', 'Restaurant', 'mongo')
    query = {
        'source.dianping.id': None,
        'locality._id': {
            '$in': self.city_map.keys()
        }
    }
    if self.args.query:
        # SECURITY: exec/eval on a command-line argument. Acceptable only
        # because this is an internal operator tool — never expose it to
        # untrusted input. ObjectId is imported so queries may reference it.
        exec 'from bson import ObjectId'
        extra_query = eval(self.args.query)
    else:
        extra_query = {}
    if extra_query:
        query = {'$and': [query, extra_query]}
    cursor = col.find(query, {
        'locality': 1,
        'zhName': 1,
        'alias': 1,
        'location': 1
    }).skip(self.args.skip)
    if self.args.limit:
        cursor.limit(self.args.limit)
    return cursor
def process(self, entry):
    """Fetch the Qunar image list for one POI and register every image in imagestore.

    Images already imported (matched by url_hash) are updated in Images;
    new ones are upserted into ImageCandidates for later processing.
    """
    qunar_id = entry['source']['qunar']['id']
    image_list_url = 'http://travel.qunar.com/place/api/poi/image?offset=0&limit=1000&poiId=%d' % qunar_id
    self.logger.debug('Processing poi: %d, url: %s' % (qunar_id, image_list_url))
    try:
        validators = [qunar_validator, qunar_json_validator]
        response = self.request.get(
            image_list_url,
            user_data={'ProxyMiddleware': {
                'validator': validators
            }})
        images = response.json()['data']
    except (IOError, ValueError, KeyError) as e:
        # Network failure or unexpected payload: log and give up on this POI.
        self.logger.warn('Failed: %s' % image_list_url)
        return
    if not images:
        return
    col_im = get_mongodb('imagestore', 'Images', 'mongo')
    col_cand = get_mongodb('imagestore', 'ImageCandidates', 'mongo')
    for idx, img_entry in enumerate(images):
        url = img_entry['url']
        key = md5(url).hexdigest()
        url_hash = key
        ord_idx = idx  # preserve Qunar's original image ordering
        image = {
            'url': url,
            'key': key,
            'url_hash': url_hash,
            'ord': ord_idx
        }
        if img_entry['userName']:
            image['meta'] = {'userName': img_entry['userName']}
        self.logger.debug('Retrieved image: %s, url=%s, poi=%d' % (key, url, qunar_id))
        ops = {'$set': image, '$addToSet': {'itemIds': entry['_id']}}
        # Update an already-imported image first; otherwise queue a candidate.
        # Relies on the legacy pymongo update() result dict ('updatedExisting').
        ret = col_im.update({'url_hash': url_hash}, ops)
        if not ret['updatedExisting']:
            col_cand.update({'url_hash': url_hash}, ops, upsert=True)
def update_city(self, item):
    """Resolve item['poi_city'] (a list of candidate names) into a locality document.

    Each candidate is tried as a prefix alias within 100km of the POI's
    coordinates, constrained to the item's country. On success,
    item['poi_city'] is replaced by id/zhName/enName fields and the item is
    returned; returns None when no candidate matches.
    """
    city_candidates = item['poi_city']
    country_info = item['country_info']
    # lookup the city
    city = None
    col_loc = get_mongodb('geo', 'Locality', profile='mongodb-general')
    for city_name in city_candidates:
        city_list = list(
            col_loc.find(
                {
                    'country.id': country_info['_id'],
                    # Prefix match against the alias index.
                    'alias': re.compile(r'^%s' % city_name.lower()),
                    'location': {
                        '$near': {
                            '$geometry': {
                                'type': 'Point',
                                'coordinates': [item['poi_lng'], item['poi_lat']]
                            },
                            '$minDistance': 0,
                            '$maxDistance': 100 * 1000  # metres: within 100km
                        }
                    }
                },
                {
                    'zhName': 1,
                    'enName': 1,
                    'coords': 1
                }).limit(5))
        if city_list:
            # $near returns results ordered by distance, so take the closest.
            city = city_list[0]
            break
    if not city:
        self.log(
            'Failed to find locality from DB: %s' % ', '.join(city_candidates),
            log.WARNING)
        return
    # Non-empty names available on the record, used as fallbacks below.
    alias_names = list(
        set(
            filter(lambda val: val,
                   [(city[k].strip() if k in city and city[k] else '')
                    for k in ['zhName', 'enName']])))
    try:
        zhName = city['zhName'].strip()
    except (ValueError, KeyError, AttributeError):
        zhName = alias_names[0]
    try:
        enName = city['enName'].strip()
    except (ValueError, KeyError, AttributeError):
        enName = alias_names[0]
    item['poi_city'] = {
        'id': city['_id'],
        '_id': city['_id'],
        'zhName': zhName,
        'enName': enName
    }
    return item
def process_item(self, item, spider):
    """Store raw Qunar Q&A items, upserting by post_id."""
    if not self.is_handler(item, spider):
        return item
    data = item['data']
    # Questions and answers share the post_id key; only the collection differs.
    col_name = 'QunarQuestion' if item['type'] == 'question' else 'QunarAnswer'
    col = get_mongodb('raw', col_name, 'mongo-raw')
    col.update({'post_id': data['post_id']}, {'$set': data}, upsert=True)
    return item
def mv_candidates(image): """ 将图像添加到ImageCandidates里面 """ from utils.database import get_mongodb col_cand = get_mongodb('imagestore', 'ImageCandidates', profile='mongo') col_img = get_mongodb('imagestore', 'Images', profile='mongo') image_id = image.pop('_id') print 'Moving %s' % image['url'] image['url_hash'] = md5(image['url']).hexdigest() image['key'] = image['url_hash'] col_cand.update({'url_hash': image['url_hash']}, {'$set': image}) col_img.remove({'_id': image_id})
def bind_shop_id(shop, dianping_id):
    """Attach the given Dianping id to a restaurant document."""
    col = get_mongodb('poi', 'Restaurant', 'mongo')
    ops = {'$set': {'source.dianping': {'id': dianping_id}}}
    col.update({'_id': shop['_id']}, ops)
def vs_generate(self):
    """
    Generator of view spots still waiting to be processed.

    Each yielded entry is flagged mapped=True up front so that restarted
    runs do not pick it up again.
    """
    conn = get_mongodb('raw_ly', 'ViewSpot', 'mongo-raw')
    # Materialize the list first so the cursor is not affected by the
    # mapped=True updates issued while iterating.
    for entry in list(conn.find({'mapped': False}, {'lyId': 1, 'lyName': 1, 'lat': 1, 'lng': 1})):
        conn.update({'lyId': entry['lyId']}, {'$set': {'mapped': True}}, upsert=False)
        yield entry
def process_item(self, item, spider):
    """Store raw Ctrip FAQ items, upserting questions by q_id and answers by a_id."""
    if not self.is_handler(item, spider):
        return item
    data = item['data']
    if item['type'] == 'question':
        col_name, id_field = 'CtripQuestion', 'q_id'
    else:
        col_name, id_field = 'CtripAnswer', 'a_id'
    col = get_mongodb('raw_faq', col_name, 'mongo-raw')
    col.update({id_field: data[id_field]}, {'$set': data}, upsert=True)
    return item
def process_item(self, item, spider):
    """Merge a YahooCityItem into geo.Locality.

    Builds a normalized locality document (names, aliases, country, level,
    optional province superAdm) and merges it into any existing record,
    matched by yahoo woeid or by country+alias.
    """
    if type(item).__name__ != YahooCityItem.__name__:
        return item
    col_loc = get_mongodb('geo', 'Locality', profile='mongodb-general')
    data = {}
    level = item['level']
    data['zhName'] = item['zh_name']
    data['enName'] = item['en_name']
    abroad = item['abroad']
    data['abroad'] = abroad
    # Foreign localities are labeled by English name, domestic by Chinese.
    data['shortName'] = item['en_name' if abroad else 'zh_name']
    data['alias'] = list(set(item['alias']))
    data['pinyin'] = []
    country_info = item['country']
    data['country'] = {'id': country_info['_id'], 'zhName': country_info['zhName'],
                       'enName': country_info['enName']}
    data['level'] = level
    data['images'] = []
    if 'coords' in item:
        data['coords'] = item['coords']
    data['source'] = {'name': 'yahoo'}
    if 'woeid' in item:
        data['source']['id'] = item['woeid']
    if level > 1:
        # cities: attach the containing province (level-1 locality) if known.
        prov = col_loc.find_one({'country.id': country_info['_id'], 'alias': item['state'].lower(),
                                 'level': 1})
        if prov:
            data['superAdm'] = {'id': prov['_id'], 'zhName': prov['zhName'], 'enName': prov['enName']}
        else:
            spider.log('Cannot find province: %s, %s' % (item['state'], item['en_country']))
    # Locate an existing record, preferring the yahoo woeid binding.
    if 'woeid' in item:
        entry = col_loc.find_one({'source.name': 'yahoo', 'source.id': item['woeid']})
    else:
        entry = col_loc.find_one({'country.id': country_info['_id'], 'alias': data['enName'].lower()})
    if not entry:
        entry = {}
    # Overwrite scalar fields but merge aliases as a set union.
    key_set = set(data.keys()) - {'alias'}
    for k in key_set:
        entry[k] = data[k]
    if 'alias' not in entry:
        entry['alias'] = []
    entry['alias'] = list(set(entry['alias']).union(data['alias']))
    col_loc.save(entry)
    return item
def process(self, entry):
    """
    Fetch and store the full comment list for one Qunar POI.

    Pages through the comment API (50 per page, responses cached in redis
    for 24h) and upserts each comment into raw_qunar.PoiComment, stopping
    when a page comes back failed, empty, or short.
    """
    col_raw = get_mongodb('raw_qunar', 'PoiComment', 'mongo-raw')
    tmpl = 'http://travel.qunar.com/place/api/html/comments/poi/%d?sortField=1&pageSize=%d&page=%d'
    qunar_id = entry['source']['qunar']['id']
    page = 0
    page_size = 50
    while True:
        page += 1
        comments_list_url = tmpl % (qunar_id, page_size, page)
        self.logger.debug('Fetching: poi: %d, page: %d, url: %s' % (qunar_id, page, comments_list_url))
        redis_key = 'qunar:poi-comment:list:%d:%d:%d' % (qunar_id, page_size, page)

        def get_comments_list():
            """
            Return the response body of the comment list page.
            """
            validators = [qunar_validator, qunar_json_validator]
            response = self.request.get(
                comments_list_url,
                timeout=15,
                user_data={'ProxyMiddleware': {
                    'validator': validators
                }})
            return response.text

        try:
            comments_list_expire = 3600 * 24  # cache each page for one day
            search_result_text = self.redis.get_cache(
                redis_key, get_comments_list, expire=comments_list_expire)
            data = json.loads(search_result_text)
        except (IOError, ValueError):
            self.logger.warn('Fetching failed: %s' % comments_list_url)
            break
        if data['errmsg'] != 'success':
            self.logger.warn('Fetching failed %s, errmsg: %s' % (comments_list_url, data['errmsg']))
            break
        tmp = self.parse_comments(data['data'])
        comments = list(tmp) if tmp else []
        for c in comments:
            c['poi_id'] = qunar_id
            col_raw.update({'comment_id': c['comment_id']}, {'$set': c}, upsert=True)
        # An empty page, or fewer comments than page_size, means we have
        # reached the last page.
        if not comments or len(comments) < page_size:
            return
def start_requests(self):
    """Issue one Yahoo weather-forecast request per known city."""
    # CityInfo holds the city list with their Yahoo woeids.
    col = get_mongodb('raw_data', 'CityInfo', profile='mongodb-crawler')
    for doc in col.find({}, {'city': 1, 'woeid': 1}):
        meta_data = {
            'city_id': doc['_id'],
            'city': doc['city'],
            'woeid': doc['woeid']
        }
        url = 'http://weather.yahooapis.com/forecastrss?w=%d&u=c' % doc['woeid']
        yield Request(url=url, callback=self.parse, meta={'data': meta_data})
def process_item(self, item, spider):
    """Merge a Qyer spot item into raw_data.QyerSpot, keyed by poi_id."""
    col = get_mongodb('raw_data', 'QyerSpot', profile='mongodb-crawler')
    data = col.find_one({'poi_id': item['poi_id']}) or {}
    for field in item.keys():
        data[field] = item[field]
    col.save(data)
    return item
def process_item(self, item, spider):
    """Upsert a proxy record, keyed by host and port."""
    if not self.is_handler(item, spider):
        return item
    fields = dict((k, item[k]) for k in item.keys())
    col = get_mongodb('misc', 'Proxy', 'mongo')
    col.update({'host': item['host'], 'port': item['port']},
               {'$set': fields}, upsert=True)
    return item
def get_baidu_sug(self, name, location):
    """
    Query Baidu travel suggestions for a name, with a mongo-backed cache.

    Returns a list of dicts (sname/parents/sid/surl/parent_sid/type_code
    plus lng/lat converted from Baidu mercator coordinates), or an empty
    list on failure.

    :param name: place name to look up
    :param location: currently unused
    """
    from utils import mercator2wgs
    from urllib import quote
    url = u'http://lvyou.baidu.com/destination/ajax/sug?wd=%s&prod=lvyou_new&su_num=20' % name
    key = quote(name.encode('utf-8'))
    col = get_mongodb('raw_baidu', 'BaiduSug', 'mongo-raw')
    # Check the mongo cache before going to the network.
    ret = col.find_one({'key': key}, {'body': 1})
    body = None
    if ret:
        body = ret['body']
    else:
        try:
            response = ProcessorEngine.get_instance().request.get(url)
            if response:
                body = response.text
                col.update({'key': key}, {
                    'key': key,
                    'body': body,
                    'url': url
                }, upsert=True)
        except IOError:
            pass
    if not body:
        return []
    try:
        # The payload is JSON-in-JSON: data.sug is itself a JSON string.
        sug = json.loads(json.loads(body)['data']['sug'])
        result = []
        for s in sug['s']:
            # Each suggestion is a '$'-separated record; the field positions
            # below follow Baidu's sug format.
            tmp = re.split(r'\$', s)
            entry = {
                'sname': tmp[0].strip(),
                'parents': tmp[6].strip(),
                'sid': tmp[8].strip(),
                'surl': tmp[22].strip(),
                'parent_sid': tmp[26].strip(),
                'type_code': int(tmp[24])
            }
            mx = float(tmp[14])
            my = float(tmp[16])
            entry['lng'], entry['lat'] = self.bd_mc_to_ll(mx, my)
            # entry['lng'], entry['lat'] = mercator2wgs(mx, my)
            result.append(entry)
        return result
    except IOError as e:
        e.message += 'url: %s' % url
        raise e
    except (ValueError, KeyError):
        return []
def update(item_type, item_data):
    """Dispatch a parsed item to the proper comment collection, or to ImageCandidates."""
    if item_type == 'comment':
        comment_cols = {
            'vs': 'ViewSpotComment',
            'dining': 'DiningComment',
            'shopping': 'ShoppingComment'
        }
        col = get_mongodb('comment', comment_cols[item_data.pop('type')], 'mongo')
        mfw_id = item_data['source']['mafengwo']['id']
        col.update({'source.mafengwo.id': mfw_id}, {'$set': item_data}, upsert=True)
    elif item_type == 'image':
        col = get_mongodb('imagestore', 'ImageCandidates', 'mongo')
        col.update({'key': item_data['key']}, {'$set': item_data}, upsert=True)
    else:
        assert False, 'Invalid type: %s' % item_type
def store_shops(shop_list):
    """
    Save shops into the raw_dianping database. Insert-only: records whose
    shop_id already exists are left untouched.
    """
    col = get_mongodb('raw_dianping', 'Dining', 'mongo-raw')
    for shop in shop_list:
        existing = col.find_one({'shop_id': shop['shop_id']}, {'_id': 1})
        if existing:
            continue
        col.update({'shop_id': shop['shop_id']}, {'$set': shop}, upsert=True)
def update_country(self, item):
    """Replace item['country_info'] (a name) with the matching country document fields."""
    country_name = item['country_info']
    col_country = get_mongodb('geo', 'Country', profile='mongodb-general')
    match = col_country.find_one({'alias': country_name.lower()},
                                 {'zhName': 1, 'enName': 1})
    if not match:
        self.log('Failed to find country: %s' % country_name, log.WARNING)
        return
    item['country_info'] = {
        'id': match['_id'],
        '_id': match['_id'],
        'zhName': match['zhName'],
        'enName': match['enName']
    }
    return item
def parse(self, response):
    """Build YahooCityItem objects from raw YahooCityInfo and emit geocoding requests."""
    col = get_mongodb('raw_data', 'YahooCityInfo', profile='mongodb-crawler')
    countries = response.meta['countries']
    level = response.meta['level']
    query = {'$or': [{'country.countrycode': tmp} for tmp in countries]} if countries else {}
    if level:
        query['level'] = level
    # Materialize the id list first, then re-fetch each document fresh.
    # NOTE(review): this is an N+1 query pattern — presumably done so the
    # long-running crawl never holds a server cursor open; confirm before
    # changing it to a single full-projection find().
    for entry in list(col.find(query, {'_id': 1})):
        city = col.find_one({'_id': entry['_id']})
        item = YahooCityItem()
        for k in ['country', 'state', 'city', 'coords', 'woeid', 'abroad', 'level']:
            if k in city:
                item[k] = city[k]
        # Resolve country info, cached per country code.
        country_code = city['country']['countrycode']
        if country_code not in self.country_map:
            col_country = get_mongodb('geo', 'Country', profile='mongodb-general')
            country_info = col_country.find_one({'code': country_code})
            if not country_info:
                self.log('Unable to find country: %s' % country_code, log.WARNING)
                continue
            self.country_map[country_code] = country_info
        country_info = self.country_map[country_code]
        item['country'] = country_info
        item['en_country'] = country_info['enName']
        item['zh_country'] = country_info['zhName']
        # City records carry a 'city' name; state records only have 'state'.
        if 'city' in city:
            item['en_name'] = city['city']
            item['zh_name'] = city['city']
        else:
            item['en_name'] = city['state']
            item['zh_name'] = city['state']
        item['alias'] = list({item['en_name'].lower()})
        yield Request(url='http://maps.googleapis.com/maps/api/geocode/json?address=%s,%s&sensor=false' % (
            item['en_name'], item['en_country']),
            callback=self.parse_geocode,
            meta={'item': item, 'lang': 'zh'},
            headers={'Accept-Language': 'zh-CN'},
            dont_filter=True)
def parse(self, response):
    """Emit a QyerCountryItem for every raw Qyer country document.

    Fix: the original iterated the pymongo Collection object itself, which
    raises TypeError ('Collection' object is not iterable); a find({})
    cursor must be iterated instead.
    """
    col = get_mongodb('raw_data', 'QyerCountry', profile='mongodb-crawler')
    for entry in col.find({}):
        item = QyerCountryItem()
        item['country_id'] = entry['countryId']
        item['country_zh'] = entry['zhName']
        item['country_en'] = entry['enName']
        item['cont_zh'] = entry['zhContinent']
        item['cont_en'] = entry['enContinent']
        item['is_hot'] = entry['isHot']
        yield item
def check_exist(entry):
    """
    Check if an image is already processed (i.e. present in imagestore.Images).
    """
    col_im = get_mongodb('imagestore', 'Images', 'mongo')
    url_hash = md5(entry['url']).hexdigest()
    # The stored url_hash must agree with the hash of the url itself.
    assert url_hash == entry['url_hash']
    found = col_im.find_one({'url_hash': url_hash}, {'_id': 1})
    return bool(found)
def get_poi_image(self, shop_id, page_idx=1):
    """Scrape one page of a Dianping shop's photo album into DianpingImage.

    Skips default placeholder images, rewrites thumbnail urls to the
    1024c1024 rendition, and upserts each image keyed by the md5 of its
    rewritten url.
    """
    template = 'http://www.dianping.com/shop/%d/photos?pg=%d'
    album_url = template % (shop_id, page_idx)
    # 404 is acceptable (the shop/page may be gone); undersized responses
    # are rejected as likely error/captcha pages.
    validators = [
        lambda v: status_code_validator(v, [200, 404]),
        lambda v: response_size_validator(v, 4096)
    ]
    response = self.request.get(
        album_url,
        timeout=15,
        user_data={'ProxyMiddleware': {
            'validator': validators
        }})
    from lxml import etree
    from hashlib import md5
    col = get_mongodb('raw_dianping', 'DianpingImage', 'mongo-raw')
    root_node = etree.fromstring(response.text, parser=etree.HTMLParser())
    for image_node in root_node.xpath(
            '//div[@class="picture-list"]/ul/li[@class="J_list"]'):
        try:
            image_title = image_node.xpath(
                './div[@class="picture-info"]/div[@class="name"]//a[@href and @title and @onclick]/@title'
            )[0]
            # Skip placeholder "default picture" entries.
            if u'默认图片' in image_title:
                continue
        except IndexError:
            continue
        try:
            image_src = image_node.xpath(
                './div[@class="img"]/a[@href and @onclick]/img[@src and @title]/@src'
            )[0]
            # Thumbnail urls look like /pc/<32-hex>(WxH)/...; swap in the
            # large 1024c1024 rendition before hashing.
            pattern = re.compile(r'(/pc/[0-9a-z]{32})\(\d+[cx]\d+\)/')
            match = re.search(pattern, image_src)
            if not match:
                continue
            image_src = re.sub(pattern, '\\1(1024c1024)/', image_src)
            key = md5(image_src).hexdigest()
            image_entry = {
                'url_hash': key,
                'key': key,
                'url': image_src,
                'shop_id': shop_id
            }
            col.update({'key': key}, {'$set': image_entry}, upsert=True)
        except IndexError:
            continue
def build_cursor(self):
    """Build a cursor over raw_dianping.Dining honouring --query/--skip/--limit.

    SECURITY NOTE: self.args.query is eval'ed as a Python expression. This is
    only acceptable because it comes from the operator's own command line —
    never feed it untrusted input.

    Fix: replaced the original ``exec 'from bson import ObjectId'`` string
    with a direct import; the local name is equally visible to eval().
    """
    col = get_mongodb('raw_dianping', 'Dining', 'mongo-raw')
    query = {}
    if self.args.query:
        # Imported so that queries like {'_id': ObjectId('...')} evaluate.
        from bson import ObjectId  # noqa: F401 - used inside eval'ed queries
        query = eval(self.args.query)
    cursor = col.find(query).skip(self.args.skip)
    if self.args.limit:
        cursor.limit(self.args.limit)
    return cursor
def crawl_vs(self):
    """Crawl view spots through the LY API, iterating over all known cities.

    For every city returned by crawl_city(), queue a task that pages through
    the sceneryList API and upserts each scenery's id/name into
    raw_ly.ViewSpot.

    :return: None (work is queued via self.add_task)
    """
    conn = get_mongodb('raw_ly', 'ViewSpot', 'mongo-raw')
    city_list = self.crawl_city()
    self.logger.info('-=-=-=-=length: %s' % len(city_list))
    for ct in city_list:
        # Bind the loop variable as a default argument so every queued task
        # captures its own city (avoids the late-binding closure pitfall).
        def func(city=ct):
            self.logger.info('================%s==============' % city)
            # location_id ranges appear to encode the region type:
            # <=35 province, <=404 city, otherwise country — TODO confirm
            # against the LY API documentation.
            if int(city['location_id']) <= 35:
                query_obj = {
                    'clientIp': '127.0.0.1',
                    'provinceId': int(city['location_id'])
                }
            elif int(city['location_id']) <= 404:
                query_obj = {
                    'clientIp': '127.0.0.1',
                    'cityId': int(city['location_id'])
                }
            else:
                query_obj = {
                    'clientIp': '127.0.0.1',
                    'countryId': int(city['location_id'])
                }
            raw_xml = self.scenerylist().send_request(query_obj)
            node = etree.fromstring(raw_xml)
            responce_code = node.xpath('//rspCode/text()')[0]
            # '0000' is the API's success response code.
            if '0000' == str(responce_code):
                total_page = node.xpath(
                    '//sceneryList')[0].attrib['totalPage']
                # API pages are 1-based, hence page + 1 below.
                for page in xrange(int(total_page)):
                    temp_query = copy.deepcopy(query_obj)
                    temp_query['page'] = page + 1
                    raw_xml = self.scenerylist().send_request(temp_query)
                    node = etree.fromstring(raw_xml)
                    vs_nodes = node.xpath('//sceneryName')
                    for vs in vs_nodes:
                        name = vs.text.encode('utf-8')
                        # sceneryId is a sibling of sceneryName in the XML.
                        ly_id = int(vs.xpath('../sceneryId')[0].text)
                        self.logger.info('----%s-----%s' % (name, ly_id))
                        # Idempotent upsert keyed by the LY scenery id.
                        conn.update(
                            {'lyId': ly_id},
                            {'$set': {
                                'lyId': ly_id,
                                'lyName': name
                            }},
                            upsert=True)
        self.add_task(func)
def start_requests(self):
    """Yield one sceneryprice POST request per view spot not yet crawled."""
    col = get_mongodb('raw_ly', 'ViewSpot', profile='mongo-raw')
    pending = col.find({'crawl': False}, {'lyId': 1, 'lyName': 1})
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'X-Requested-With': 'XMLHttpRequest'}
    for doc in pending:
        ly_id = int(doc['lyId'])
        payload = {'clientIp': '127.0.0.1', 'sceneryIds': ly_id, 'payType': 0}
        xml_body = self.assemble_req_xml(self.api['sceneryprice'], payload)
        yield Request(url=self.url,
                      method='POST',
                      body=xml_body,
                      headers=headers,
                      callback=self.parse_ticket_types,
                      meta={'lyId': ly_id})
def process_item(self, item, spider):
    """Upsert a Koubei POI item into the raw collection matching its type."""
    if not self.is_handler(item, spider):
        return item

    # Map the item's POI type to its raw_koubei collection name.
    type_to_col = {'locality': 'Locality',
                   'attraction': 'Viewspot',
                   'restaurant': 'Dining',
                   'shopping': 'Shopping',
                   'activity': 'Activity',
                   'hotel': 'Hotel'}
    payload = item['data']
    target = get_mongodb('raw_koubei', type_to_col[item['type']], 'mongo-raw')
    target.update({'id': payload['id']}, {'$set': payload}, upsert=True)
    return item
def zhName_to_file(filename):
    """Append the zhName of every ViewSpot document to *filename*, one per line.

    Fixes over the original: the file is managed by a context manager so it
    is closed even when an exception occurs, the deprecated dict.has_key()
    is replaced by the 'in' operator, and the manual counter is replaced by
    enumerate(). The periodic flush every 1000 documents is kept so partial
    progress survives a crash.
    """
    viewspot_conn = get_mongodb("poi", "ViewSpot", "mongo")
    cursor = viewspot_conn.find()
    with open(filename, 'a') as f:
        for i, val in enumerate(cursor, 1):
            if 'zhName' in val:
                f.write(str(val['zhName']) + '\n')
            if i % 1000 == 0:
                f.flush()
def process_item(self, item, spider):
    """Persist a weather item into yahooweather.CityTemprature.

    Only the known weather fields are copied from the item; the repeated
    per-field if-blocks of the original are collapsed into a single
    comprehension over the field list (matching the loop style used by the
    other pipelines in this file). Note: 'data' never carries an _id, so
    col.save() always inserts a new document here.
    """
    fields = ['loc', 'current_temprature', 'forecast', 'current', 'source']
    data = {k: item[k] for k in fields if k in item}
    col = get_mongodb('yahooweather', 'CityTemprature', profile=None)
    col.save(data)
    return item
def start_requests(self):
    """Yield one request per country landing page on weather.yahoo.com.

    The country list comes from self.param['country'] when the spider was
    given one; otherwise every country code in geo.Country is crawled.

    Cleanups: the redundant ``'param' in dir(self)`` check is dropped
    (getattr with a default already covers the missing-attribute case), and
    the superfluous ``list([...])`` wrapper around the comprehension is
    removed.
    """
    param = getattr(self, 'param', [])
    country_list = param['country'] if 'country' in param else []
    if not country_list:
        country_list = [tmp['code'] for tmp in
                        get_mongodb('geo', 'Country').find({}, {'code': 1})]

    base_url = 'https://weather.yahoo.com/'
    for country_code in country_list:
        # Everything except mainland China counts as "abroad".
        abroad = (country_code.lower() != 'cn')
        data = {'countrycode': country_code, 'abroad': abroad}
        yield Request(url=base_url + country_code,
                      callback=self.parse_state_url,
                      meta={'data': data})
def process_item(self, item, spider):
    """Upsert a Yahoo city/state record into raw_data.YahooCityInfo."""
    col = get_mongodb('raw_data', 'YahooCityInfo', profile='mongodb-crawler')

    # Level-2 records (cities) are keyed by woeid; level-1 records (states)
    # by country + state.
    if 'woeid' in item:
        existing = col.find_one({'woeid': item['woeid'], 'level': 2})
    else:
        existing = col.find_one({'country': item['country'],
                                 'state': item['state'], 'level': 1})

    doc = existing if existing else {}
    for field in ['country', 'state', 'city', 'abroad', 'coords', 'woeid', 'level']:
        if field in item:
            doc[field] = item[field]
    col.save(doc)
    return item
def process_item(self, item, spider):
    """Save a TravelGis city item into raw_data.TravelGisCity (localhost:27027)."""
    # Class-name comparison instead of isinstance — presumably to tolerate
    # the item class being imported under two module paths; verify.
    if type(item).__name__ != CityItem.__name__:
        return item
    col = get_mongodb('raw_data', 'TravelGisCity', 'localhost', 27027)
    doc = col.find_one({'name': item['city'], 'countryCode': item['code']}) or {}
    doc['name'] = item['city']
    doc['country'] = item['country']
    doc['countryCode'] = item['code']
    doc['lat'] = item['lat']
    doc['lng'] = item['lng']
    col.save(doc)
    return item
def process_dining_item(item, spider):
    """Upsert one Dianping dining record; list-valued fields merge via $addToSet."""
    from pymongo.errors import OperationFailure

    data = item['data']
    # Fields whose values must be merged into existing arrays instead of
    # being overwritten by $set.
    merge_ops = {}
    for field in ('tags',):
        values = DianpingPipeline.add_to_set(data, field)
        if values:
            merge_ops[field] = {'$each': values}

    update_doc = {'$set': data}
    if merge_ops:
        update_doc['$addToSet'] = merge_ops

    col = get_mongodb('raw_dianping', 'Dining', 'mongo-raw')
    try:
        col.update({'shop_id': data['shop_id']}, update_doc, upsert=True)
    except OperationFailure as err:
        # Log and continue: one bad record must not kill the pipeline.
        spider.log(err.message, level=log.ERROR)
    return item
def parse(self, response):
    """For each POI of the requested countries, yield a reverse-geocode request."""
    countries = response.meta['countries']
    col_raw = get_mongodb('raw_data', 'QyerSpot', profile='mongodb-crawler')
    for country in countries:
        # All POIs belonging to this country.
        for entry in col_raw.find({'country_info.country_engname': country}):
            lat = entry['poi_lat']
            lng = entry['poi_lng']
            if not lat or not lng:
                continue
            item = QyerPoiItem()
            for field in entry:
                if field in item.fields:
                    item[field] = entry[field]
            # Reverse-geocode to discover which city the POI belongs to.
            geo_url = 'http://maps.googleapis.com/maps/api/geocode/json?address=%f,%f' % (lat, lng)
            yield Request(url=geo_url,
                          meta={'item': item},
                          callback=self.parse_geocode,
                          dont_filter=True)
def process_item(self, item, spider):
    """Save a GeoNames city item into raw_data.GeoNamesCity (localhost:27027)."""
    if type(item).__name__ != CityItem.__name__:
        return item
    col = get_mongodb('raw_data', 'GeoNamesCity', 'localhost', 27027)
    doc = col.find_one({'_id': item['city_id']}) or {}
    doc.update({'enName': item['en_name'],
                'zhName': item['zh_name'],
                'countryCode': item['country_code'],
                'lat': item['lat'],
                'lng': item['lng'],
                'population': item['population'],
                '_id': item['city_id']})
    col.save(doc)
    return item
def update_city(self, item):
    """Resolve item['poi_city'] (candidate name list) to a Locality document.

    Each candidate name is tried as a case-insensitive prefix match against
    geo.Locality aliases, restricted to the item's country and to localities
    within 100 km of the POI coordinates. On success item['poi_city'] is
    replaced with a small id/zhName/enName dict and the item is returned; on
    failure a warning is logged and None is returned.
    """
    city_candidates = item['poi_city']
    country_info = item['country_info']

    # lookup the city
    city = None
    col_loc = get_mongodb('geo', 'Locality', profile='mongodb-general')
    for city_name in city_candidates:
        # Prefix regex on alias; $near restricts hits to 100 km of the POI.
        city_list = list(col_loc.find({'country.id': country_info['_id'],
                                       'alias': re.compile(r'^%s' % city_name.lower()),
                                       'location': {
                                           '$near': {
                                               '$geometry': {'type': 'Point',
                                                             'coordinates': [item['poi_lng'], item['poi_lat']]},
                                               '$minDistance': 0,
                                               '$maxDistance': 100 * 1000
                                           }
                                       }},
                                      {'zhName': 1, 'enName': 1, 'coords': 1}).limit(5))
        if city_list:
            # Take the nearest match ($near sorts by distance) and stop.
            city = city_list[0]
            break

    if not city:
        self.log('Failed to find locality from DB: %s' % ', '.join(city_candidates), log.WARNING)
        return

    # Non-empty name variants, used as fallback when zhName/enName is
    # missing or blank. NOTE(review): if both names are blank, alias_names
    # is empty and alias_names[0] below would raise IndexError — the DB
    # presumably guarantees at least one name; confirm.
    alias_names = list(set(filter(lambda val: val,
                                  [(city[k].strip() if k in city and city[k] else '')
                                   for k in ['zhName', 'enName']])))
    try:
        zhName = city['zhName'].strip()
    except (ValueError, KeyError, AttributeError):
        # Missing key or None value: fall back to any available name.
        zhName = alias_names[0]
    try:
        enName = city['enName'].strip()
    except (ValueError, KeyError, AttributeError):
        enName = alias_names[0]

    item['poi_city'] = {'id': city['_id'], '_id': city['_id'], 'zhName': zhName, 'enName': enName}

    return item
def process_item(self, item, spider):
    """Store a Chanyouji travel note into raw_data.ChanyoujiNote1."""
    if not isinstance(item, ChanyoujiYoujiItem):
        return item
    col = get_mongodb('raw_data', 'ChanyoujiNote1', profile='mongodb-crawler')
    note = {'noteId': item['trips_id'],
            'title': item['title'],
            'authorName': item['authorName'],
            'favorCnt': item['favorCnt'],
            'commentCnt': item['commentCnt'],
            'viewCnt': item['viewCnt'],
            'note': item['data'],
            'authorAvatar': item['authorAvatar'],
            'authorId': item['authorId']}
    # Merge into the existing document (if any) so its _id is preserved and
    # save() performs an update rather than an insert.
    doc = col.find_one({'noteId': note['noteId']}) or {}
    doc.update(note)
    col.save(doc)
    return item
def process_item(self, item, spider):
    """Save a bus station item into raw_data.BusStation (localhost:27027)."""
    if type(item).__name__ != BusStatoinItem.__name__:
        return item
    col = get_mongodb('raw_data', 'BusStation', 'localhost', 27027)
    doc = col.find_one({'stationId': item['station_id']}) or {}
    doc['stationId'] = item['station_id']
    doc['city'] = item['city']
    doc['province'] = item['prov']
    doc['name'] = item['name']
    # Optional single fields: stored only when present and non-empty.
    for field in ('addr', 'tel'):
        if field in item and item[field]:
            doc[field] = item[field]
    # Coordinates only make sense as a pair.
    if 'blat' in item and 'blng' in item and item['blat'] and item['blng']:
        doc['blat'] = item['blat']
        doc['blng'] = item['blng']
    col.save(doc)
    return item
def fetch_db_col(db, col, profile):
    """Thin wrapper: return collection *col* of database *db* via get_mongodb."""
    connection = get_mongodb(db, col, profile)
    return connection
def fetch_db_col(self, db, col, profile):
    """Return a cached mongodb collection, creating and caching it on first use.

    The cache key format '<db>.<col>.<profile>' is preserved so existing
    entries in self.col_dict remain valid.
    """
    cache_key = '%s.%s.%s' % (db, col, profile)
    try:
        # EAFP: the common case is a cache hit.
        return self.col_dict[cache_key]
    except KeyError:
        connection = get_mongodb(db, col, profile)
        self.col_dict[cache_key] = connection
        return connection
"type": "geo_point" }, "type": { "type": "string" } } } } } } es_client.indices.put_mapping(type_name, viewspot_map, index_name) bulk_data = [] viewspot_conn = get_mongodb("poi", "ViewSpot", 'mongo') cursor = viewspot_conn.find() i = 0 for val in cursor: i = i + 1 #元数据 bulk_data.append({ "index": { "_index": index_name, "_type": type_name, "_id": i } }) doc = {}
def process_item(self, item, spider): city_info = item['poi_city'] country_info = item['country_info'] # lookup the poi col_vs = get_mongodb('poi', 'ViewSpot', profile='mongodb-general') vs = col_vs.find_one({'source.qyer.id': item['poi_id']}) if not vs: vs = {} source = vs['source'] if 'source' in vs else {} source['qyer'] = {'id': item['poi_id'], 'url': item['poi_url']} vs['source'] = source desc = vs['description'] if 'description' in vs else {} desc['desc'] = item['poi_summary'] vs['description'] = desc vs['name'] = item['poi_name'] vs['zhName'] = item['poi_name'] vs['enName'] = item['poi_englishName'] def _image_proc(url): m = re.search(r'^(.+pic\.qyer\.com/album/.+/index)/[0-9x]+$', url) return m.group(1) if m else url vs['imageList'] = map(_image_proc, item['poi_photo'] if 'poi_photo' in item and item['poi_photo'] else []) vs['country'] = country_info vs['city'] = city_info alias = filter(lambda val: val, list(set([vs[k].strip().lower() if vs[k] else '' for k in ['name', 'zhName', 'enName']]))) alias.extend(item['alias']) vs['alias'] = list(set(alias)) vs['rating'] = item['rating'] if 'rating' in item else None vs['targets'] = [city_info['_id'], country_info['_id']] vs['enabled'] = True vs['abroad'] = True vs['location'] = {'type': 'Point', 'coordinates': [item['poi_lng'], item['poi_lat']]} if 'viewport' in item: vs['viewport'] = {'northeast': {'type': 'Point', 'coordinates': [item['viewport']['northeast']['lng'], item['viewport']['northeast']['lat']]}, 'southwest': {'type': 'Point', 'coordinates': [item['viewport']['southwest']['lng'], item['viewport']['southwest']['lat']]}, } details = item['poi_detail'] if 'poi_detail' in item else [] new_det = [] for entry in details: if entry['title'][:2] == u'门票': vs['priceDesc'] = entry['content'] elif entry['title'][:4] == u'到达方式': vs['trafficInfo'] = entry['content'] elif entry['title'][:4] == u'开放时间': vs['openTime'] = entry['content'] elif entry['title'][:2] == u'地址': vs['address'] = entry['content'] elif 
entry['title'][:2] == u'网址': vs['website'] = entry['content'] elif entry['title'][:4] == u'所属分类': tags = set(vs['tags'] if 'tags' in vs else []) for t in re.split(ur'[/\||\s,]', entry['content']): # for t in re.split(r'[/\|\s,]', entry['content']): t = t.strip() if t: tags.add(t) vs['tags'] = list(tags) else: new_det.append(entry['title'] + entry['content'])