def parse_list(self, response):
    """Turn a station-list JSON response into Baidu geocoding requests.

    One ``BusStatoinItem`` is built per entry of ``stationList``; for each
    of them a geocoder request for the station name is yielded, carrying
    the partially filled item to ``parse_addr`` through ``meta``.

    :param response: response whose body is JSON with a ``stationList``
        array and whose ``meta['data']`` provides ``city_name`` and
        ``prov_name``.
    :return: generator of ``Request`` objects, one per station entry.
    """
    data = json.loads(response.body)
    m = response.meta['data']
    for entry in data['stationList']:
        item = BusStatoinItem()
        item['station_id'] = entry['PK_STATION_ID']
        item['city'] = m['city_name']
        item['prov'] = m['prov_name']
        item['name'] = entry['STATION_NAME']
        # Address and phone are optional in the feed; only copy them when
        # the entry actually has the key.
        if 'STATION_ADDRESS' in entry:
            item['addr'] = entry['STATION_ADDRESS']
        if 'TELPHONE' in entry:
            item['tel'] = entry['TELPHONE']

        # Pick one of the configured API keys at random.  The values are
        # copied into a list first because dict views cannot be indexed on
        # Python 3 (``d.values()[i]`` only works on Python 2).
        key = random.choice(list(self.baidu_key.values()))
        url = u'http://api.map.baidu.com/geocoder/v2/?ak=%s&output=json&address=%s' % (
            key, item['name'])
        # Add the city as a hint unless the value returned by
        # utils.get_short_loc() for it is already part of the station name.
        if utils.get_short_loc(item['city']) not in item['name']:
            url += u'&city=%s' % item['city']

        yield Request(url=url, callback=self.parse_addr,
                      meta={'item': item, 'key': key})
def parse_addr(self, response):
    """Store the geocoder's coordinates on the item, retrying by address.

    On success the item gets ``blat``/``blng`` and is yielded.  When the
    response carries no usable location, the lookup is repeated once using
    the item's ``addr`` field (the retry is marked with ``stop_flag`` in
    ``meta``); if that is not possible the item is yielded without
    coordinates.

    :param response: geocoder response; ``meta['item']`` is the item being
        completed and ``meta['stop_flag']`` is present on the retry.
    :return: generator yielding either the item or one follow-up
        ``Request``.
    """
    item = response.meta['item']
    try:
        result = json.loads(response.body)['result']
        loc = result['location']
        # Read both coordinates before touching the item so a partial
        # payload never leaves only one of them set.
        lat, lng = loc['lat'], loc['lng']
        item['blat'] = lat
        item['blng'] = lng
    except (KeyError, TypeError):
        # KeyError: an expected key is missing.  TypeError: ``result`` (or
        # ``location``) is not a mapping, e.g. an empty list; that case is
        # handled the same way instead of aborting the callback.
        if 'stop_flag' in response.meta or 'addr' not in item or not item['addr']:
            # Already retried, or there is no address to retry with.
            yield item
        else:
            # dict views cannot be indexed on Python 3, hence the list().
            key = random.choice(list(self.baidu_key.values()))
            url = u'http://api.map.baidu.com/geocoder/v2/?ak=%s&output=json&address=%s' % (
                key, item['addr'])
            # Add the city as a hint unless the value returned by
            # utils.get_short_loc() for it is already part of the address.
            if utils.get_short_loc(item['city']) not in item['addr']:
                url += u'&city=%s' % item['city']
            yield Request(url=url, callback=self.parse_addr,
                          meta={'item': item, 'key': key, 'stop_flag': True})
    else:
        yield item
def process_item(self, item, spider):
    """Merge a scraped city item into the ``geo.Locality`` collection.

    The matching locality is looked up by its stored Qyer id, then by
    alias near the item's coordinates within the same country; if neither
    matches a new document is started.  The document's fields are then
    overwritten from the item and saved.

    :param item: scraped city; the keys read are ``country_code``,
        ``city_id``, ``zh_name``, ``en_name``, ``lng``, ``lat``, ``url``,
        ``desc``, ``imageList``, ``is_hot`` and, optionally, ``alias``.
    :param spider: the running spider (unused).
    :return: ``item``, unchanged.
    :raises AssertionError: if no ``Country`` document has the item's
        country code.
    """
    col_loc = get_mongodb('geo', 'Locality', profile='mongodb-general')

    # Resolve the country document, cached on the pipeline class by code.
    country_code = item['country_code']
    if country_code not in QyerCityProcPipeline.country_map:
        col_country = get_mongodb('geo', 'Country', profile='mongodb-general')
        country = col_country.find_one({'code': country_code})
        if country is None:
            # Raised explicitly (same exception type the former ``assert``
            # produced) so the check is not stripped under ``python -O``.
            raise AssertionError(
                'no Country document for code %r' % (country_code,))
        QyerCityProcPipeline.country_map[country_code] = country
    else:
        country = QyerCityProcPipeline.country_map[country_code]

    # First choice: a locality already linked to this Qyer city id.
    city_id = item['city_id']
    city = col_loc.find_one({'source.qyer.id': city_id})

    if not city:
        # Fallback: same alias, same country, nearest to the coordinates.
        city = col_loc.find_one({
            'alias': item['zh_name'].lower(),
            'location': {
                '$near': {
                    'type': 'Point',
                    'coordinates': [item['lng'], item['lat']],
                },
            },
            'country._id': country['_id'],
        })
        if city:
            coords = city['location']['coordinates']
            dist = utils.haversine(coords[0], coords[1],
                                   item['lng'], item['lat'])
            # Too far away to be the same place; treat as no match.
            # NOTE(review): the unit of 100 depends on utils.haversine
            # (presumably kilometres) - confirm.
            if dist > 100:
                city = {}

    if not city:
        city = {}

    city['enName'] = item['en_name']
    zh_name = item['zh_name']
    short_name = utils.get_short_loc(zh_name)
    city['zhName'] = short_name

    # Merge stored aliases, scraped aliases and the short name, then
    # normalise (lower-case, stripped), drop empties and de-duplicate.
    # The stored list is copied so it is not extended in place.
    aliases = list(city['alias']) if 'alias' in city and city['alias'] else []
    if 'alias' in item and item['alias']:
        aliases.extend(item['alias'])
    aliases.append(short_name)
    normalised = (tmp.lower().strip() for tmp in aliases)
    city['alias'] = list({val for val in normalised if val})

    source = city['source'] if 'source' in city else {}
    source['qyer'] = {'id': item['city_id'], 'url': item['url']}
    city['source'] = source

    # Embed a reference to the country plus whichever names it has.
    city['country'] = {'id': country['_id'], '_id': country['_id']}
    for k in ('enName', 'zhName'):
        if k in country:
            city['country'][k] = country[k]

    city['level'] = 2
    city['desc'] = item['desc']
    city['imageList'] = item['imageList']
    city['images'] = []
    city['location'] = {
        'type': 'Point',
        'coordinates': [item['lng'], item['lat']],
    }
    city['abroad'] = country_code != 'CN'
    city['isHot'] = item['is_hot'] > 0

    # NOTE(review): assuming col_loc is a PyMongo collection, save() is
    # deprecated since PyMongo 3.0 and removed in 4.0 - confirm the
    # installed version before upgrading.
    col_loc.save(city)

    return item
def parse_list(self, response):
    """Turn a station-list JSON response into Baidu geocoding requests.

    One ``BusStatoinItem`` is built per entry of ``stationList``; for each
    of them a geocoder request for the station name is yielded, carrying
    the partially filled item to ``parse_addr`` through ``meta``.

    :param response: response whose body is JSON with a ``stationList``
        array and whose ``meta['data']`` provides ``city_name`` and
        ``prov_name``.
    :return: generator of ``Request`` objects, one per station entry.
    """
    data = json.loads(response.body)
    m = response.meta['data']
    for entry in data['stationList']:
        item = BusStatoinItem()
        item['station_id'] = entry['PK_STATION_ID']
        item['city'] = m['city_name']
        item['prov'] = m['prov_name']
        item['name'] = entry['STATION_NAME']
        # Address and phone are optional in the feed; only copy them when
        # the entry actually has the key.
        if 'STATION_ADDRESS' in entry:
            item['addr'] = entry['STATION_ADDRESS']
        if 'TELPHONE' in entry:
            item['tel'] = entry['TELPHONE']

        # Pick one of the configured API keys at random.  The values are
        # copied into a list first because dict views cannot be indexed on
        # Python 3 (``d.values()[i]`` only works on Python 2).
        key = random.choice(list(self.baidu_key.values()))
        url = u'http://api.map.baidu.com/geocoder/v2/?ak=%s&output=json&address=%s' % (
            key, item['name'])
        # Add the city as a hint unless the value returned by
        # utils.get_short_loc() for it is already part of the station name.
        if utils.get_short_loc(item['city']) not in item['name']:
            url += u'&city=%s' % item['city']

        yield Request(url=url, callback=self.parse_addr,
                      meta={'item': item, 'key': key})
def parse_addr(self, response):
    """Store the geocoder's coordinates on the item, retrying by address.

    On success the item gets ``blat``/``blng`` and is yielded.  When the
    response carries no usable location, the lookup is repeated once using
    the item's ``addr`` field (the retry is marked with ``stop_flag`` in
    ``meta``); if that is not possible the item is yielded without
    coordinates.

    :param response: geocoder response; ``meta['item']`` is the item being
        completed and ``meta['stop_flag']`` is present on the retry.
    :return: generator yielding either the item or one follow-up
        ``Request``.
    """
    item = response.meta['item']
    try:
        result = json.loads(response.body)['result']
        loc = result['location']
        # Read both coordinates before touching the item so a partial
        # payload never leaves only one of them set.
        lat, lng = loc['lat'], loc['lng']
        item['blat'] = lat
        item['blng'] = lng
    except (KeyError, TypeError):
        # KeyError: an expected key is missing.  TypeError: ``result`` (or
        # ``location``) is not a mapping, e.g. an empty list; that case is
        # handled the same way instead of aborting the callback.
        if 'stop_flag' in response.meta or 'addr' not in item or not item['addr']:
            # Already retried, or there is no address to retry with.
            yield item
        else:
            # dict views cannot be indexed on Python 3, hence the list().
            key = random.choice(list(self.baidu_key.values()))
            url = u'http://api.map.baidu.com/geocoder/v2/?ak=%s&output=json&address=%s' % (
                key, item['addr'])
            # Add the city as a hint unless the value returned by
            # utils.get_short_loc() for it is already part of the address.
            if utils.get_short_loc(item['city']) not in item['addr']:
                url += u'&city=%s' % item['city']
            yield Request(url=url, callback=self.parse_addr,
                          meta={'item': item, 'key': key, 'stop_flag': True})
    else:
        yield item
def process_item(self, item, spider):
    """Merge a scraped city item into the ``geo.Locality`` collection.

    The matching locality is looked up by its stored Qyer id, then by
    alias near the item's coordinates within the same country; if neither
    matches a new document is started.  The document's fields are then
    overwritten from the item and saved.

    :param item: scraped city; the keys read are ``country_code``,
        ``city_id``, ``zh_name``, ``en_name``, ``lng``, ``lat``, ``url``,
        ``desc``, ``imageList``, ``is_hot`` and, optionally, ``alias``.
    :param spider: the running spider (unused).
    :return: ``item``, unchanged.
    :raises AssertionError: if no ``Country`` document has the item's
        country code.
    """
    col_loc = get_mongodb('geo', 'Locality', profile='mongodb-general')

    # Resolve the country document, cached on the pipeline class by code.
    country_code = item['country_code']
    if country_code not in QyerCityProcPipeline.country_map:
        col_country = get_mongodb('geo', 'Country', profile='mongodb-general')
        country = col_country.find_one({'code': country_code})
        if country is None:
            # Raised explicitly (same exception type the former ``assert``
            # produced) so the check is not stripped under ``python -O``.
            raise AssertionError(
                'no Country document for code %r' % (country_code,))
        QyerCityProcPipeline.country_map[country_code] = country
    else:
        country = QyerCityProcPipeline.country_map[country_code]

    # First choice: a locality already linked to this Qyer city id.
    city_id = item['city_id']
    city = col_loc.find_one({'source.qyer.id': city_id})

    if not city:
        # Fallback: same alias, same country, nearest to the coordinates.
        city = col_loc.find_one({
            'alias': item['zh_name'].lower(),
            'location': {
                '$near': {
                    'type': 'Point',
                    'coordinates': [item['lng'], item['lat']],
                },
            },
            'country._id': country['_id'],
        })
        if city:
            coords = city['location']['coordinates']
            dist = utils.haversine(coords[0], coords[1],
                                   item['lng'], item['lat'])
            # Too far away to be the same place; treat as no match.
            # NOTE(review): the unit of 100 depends on utils.haversine
            # (presumably kilometres) - confirm.
            if dist > 100:
                city = {}

    if not city:
        city = {}

    city['enName'] = item['en_name']
    zh_name = item['zh_name']
    short_name = utils.get_short_loc(zh_name)
    city['zhName'] = short_name

    # Merge stored aliases, scraped aliases and the short name, then
    # normalise (lower-case, stripped), drop empties and de-duplicate.
    # The stored list is copied so it is not extended in place.
    aliases = list(city['alias']) if 'alias' in city and city['alias'] else []
    if 'alias' in item and item['alias']:
        aliases.extend(item['alias'])
    aliases.append(short_name)
    normalised = (tmp.lower().strip() for tmp in aliases)
    city['alias'] = list({val for val in normalised if val})

    source = city['source'] if 'source' in city else {}
    source['qyer'] = {'id': item['city_id'], 'url': item['url']}
    city['source'] = source

    # Embed a reference to the country plus whichever names it has.
    city['country'] = {'id': country['_id'], '_id': country['_id']}
    for k in ('enName', 'zhName'):
        if k in country:
            city['country'][k] = country[k]

    city['level'] = 2
    city['desc'] = item['desc']
    city['imageList'] = item['imageList']
    city['images'] = []
    city['location'] = {
        'type': 'Point',
        'coordinates': [item['lng'], item['lat']],
    }
    city['abroad'] = country_code != 'CN'
    city['isHot'] = item['is_hot'] > 0

    # NOTE(review): assuming col_loc is a PyMongo collection, save() is
    # deprecated since PyMongo 3.0 and removed in 4.0 - confirm the
    # installed version before upgrading.
    col_loc.save(city)

    return item