def dump_shops():
    """Search every configured city for the dump keyword and append the hits.

    Iterates over ``DUMP_CITY_NAMES``; for each name a ``City`` is built,
    ``inject_cookie`` is applied to both of its header objects, the city is
    fetched, and the search results for ``DUMP_KEYWORD`` are handed to
    ``add_results_to_output`` together with ``DUMP_FILE_SHOPS``.

    The closing log line is emitted from a ``finally`` clause, so it is
    written whether the loop finished or an exception is propagating.
    """
    try:
        for city_name in DUMP_CITY_NAMES:
            target = City(city_name)
            # Both header sets receive the cookie before any request is made.
            inject_cookie(target.headers)
            inject_cookie(target.map_headers)
            target.get()
            add_results_to_output(target.search(DUMP_KEYWORD), DUMP_FILE_SHOPS)
    finally:
        logger.info(f'Data dumped to {DUMP_FILE_SHOPS}')
def to_dict(self):
    '''
    Returns a hash of the Place in the database.

    The related City and User rows are looked up (City first, then User) and
    their ids are stored under 'city_id' and 'owner_id'. The remaining keys
    are copied from the attributes of the same name on this instance. The
    resulting dict is then handed to the parent class' to_dict, whose return
    value is returned unchanged.
    '''
    related_city = City.get(City.id == self.city)
    related_owner = User.get(User.id == self.owner)

    data = {
        'owner_id': related_owner.id,
        'city_id': related_city.id,
    }
    # Plain columns are exported under their own attribute name, in this order.
    plain_fields = (
        'name',
        'description',
        'number_rooms',
        'number_bathrooms',
        'max_guest',
        'price_by_night',
        'latitude',
        'longitude',
    )
    for field_name in plain_fields:
        data[field_name] = getattr(self, field_name)

    # NOTE(review): `self` is passed explicitly on top of the bound call, so
    # the parent receives it twice - confirm this matches the base signature.
    return super(Place, self).to_dict(self, data)
def to_dict(self):
    """Return a plain dict describing this record.

    The User and City rows referenced by ``self.owner`` and ``self.city`` are
    looked up and their ids are exported as 'owner_id' and 'city_id'. Both
    timestamps are rendered with the format ``%Y/%m/%d %H:%M:%S``; every other
    key mirrors the attribute of the same name.
    """
    timestamp_format = '%Y/%m/%d %H:%M:%S'

    owner_row = User.get(User.id == self.owner)
    city_row = City.get(City.id == self.city)

    serialized = {
        'id': self.id,
        'created_at': self.created_at.strftime(timestamp_format),
        'updated_at': self.updated_at.strftime(timestamp_format),
        'owner_id': owner_row.id,
        'city_id': city_row.id,
    }
    # Remaining columns are copied verbatim, keeping the original key order.
    for column in (
        'name',
        'description',
        'number_rooms',
        'number_bathrooms',
        'max_guest',
        'price_by_night',
        'latitude',
        'longitude',
    ):
        serialized[column] = getattr(self, column)
    return serialized
from city import City
from dbhelper import Database
from config import MongoDB

if __name__ == '__main__':
    # Manual run: build a City backed by a Database created from the MongoDB
    # config, fetch it, then run one keyword search with save=True.
    db = Database(MongoDB)
    nanning = City('南宁', searchDB=db)
    nanning.get()

    search_options = {
        'keyword': '朝阳广场地铁站',
        'category': '美食',
        'save': True,
        'details': False,
    }
    results = nanning.search(**search_options)
class CitySpider(object):
    """Crawl shop information and shop comments for one city.

    On construction the spider builds a ``City`` (with separate ``Database``
    handles for search results and comments), fetches it, and derives three
    views of the city's category tree:

    * ``category_list`` - nested structure produced by ``process_category``
    * ``coa_category``  - top-level (coarse) category names
    * ``fin_category``  - leaf (fine-grained) category names
    """

    def __init__(self, cityName, area=None):
        """Create the spider and pre-compute the category views.

        :param cityName: name passed to ``City`` and used in log messages
        :param area: stored on the instance as ``self.area``
        """
        self.cityName = cityName
        self.area = area
        self.city = City(cityName, searchDB=Database(MongoDB), commentsDB=Database(MongoDB))
        self.city.get()
        self.category_list = []
        self.coa_category = []
        self.fin_category = []
        self.process_category(self.city.category, self.category_list)
        self.coarsness_category(self.category_list)
        self.fine_grained_category(self.category_list)

    def get_area(self, save=False):
        """Return the names of all districts of the city.

        Entries of ``self.city.locations`` whose ``'text'`` equals the
        "all districts" placeholder are skipped. An entry that cannot be read
        is logged at debug level and skipped.

        :param save: when true, also store the district list through the
            object returned by ``init_area_db``
        :return: list of district names
        """
        area_list = []
        for item in self.city.locations:
            try:
                if item['text'] == '全部地区':
                    continue
                area_list.append(item['text'])
            except Exception:
                # Best effort per entry. Unlike a bare ``except``, this lets
                # KeyboardInterrupt / SystemExit propagate.
                logger.debug(f'获取城市分区失败:[城市:{self.cityName}]')
        logger.info(f'获取 “{self.cityName}” 所有区成功.')
        if save:
            areaDB = init_area_db(Database(MongoDB))
            areaDB.save({'area': area_list}, self.cityName)
            logger.info(f'已将 “{self.cityName}” 所有区信息保存到数据库中.')
        return area_list

    def process_category(self, obj, category_list):
        """Flatten raw category nodes into a nested list structure.

        A node with a ``'children'`` key becomes ``{text: [...]}`` where the
        list is filled recursively from its children; any other node becomes
        its ``'text'`` string.

        :param obj: iterable of category nodes (mappings with a ``'text'`` key)
        :param category_list: list that receives the converted nodes (mutated)
        :return: category_list
        """
        for item in obj:
            if 'children' in item:
                children = []
                category_list.append({item['text']: children})
                self.process_category(item['children'], children)
            else:
                category_list.append(item['text'])
        return category_list

    def coarsness_category(self, category_list):
        """Append the top-level category names to ``self.coa_category``.

        Strings are taken as-is; for a dict entry its first key is used.
        Entries of any other type are ignored.
        """
        for item in category_list:
            if isinstance(item, str):
                self.coa_category.append(item)
            elif isinstance(item, dict):
                self.coa_category.append(list(item)[0])

    def fine_grained_category(self, category_list):
        """Append the leaf category names to ``self.fin_category``.

        Strings are leaves; for a dict entry the list stored under its first
        key is walked recursively. Entries of any other type are ignored.
        """
        for item in category_list:
            if isinstance(item, str):
                self.fin_category.append(item)
            elif isinstance(item, dict):
                self.fine_grained_category(item[list(item)[0]])

    def get_category(self, save=False):
        """Return the city's raw shop categories.

        :param save: when true, also store ``self.category_list`` through the
            object returned by ``init_category_db``
        :return: ``self.city.category``
        """
        logger.info(f'获取 “{self.cityName}” 所有店铺分类成功.')
        if save:
            categoryDB = init_category_db(Database(MongoDB))
            categoryDB.save({'category': self.category_list}, tname=self.cityName)
            logger.info(f'已将 “{self.cityName}” 所有店铺分类信息保存到数据库中.')
        return self.city.category

    def save_shop_info(self):
        """Search every (district, leaf category) pair with ``save=True``.

        The "all categories" placeholder is skipped.

        :return: True
        """
        for area in self.get_area():
            for category in self.fin_category:
                if category == '全部分类':
                    continue
                # Alternative call kept from the original for reference:
                # self.city.search('', category=category, location=area, filter=None, sort='按人气排序', save=True, details=True, comments=False)
                self.city.async_search('', category=category, location=area, filter=None, sort='按人气排序', save=True, details=False, comments=False)
        return True

    def save_shop_comments(self):
        """Request comments for all districts and leaf categories of the city.

        Delegates to ``self.city.get_comments``.

        :return: True
        """
        self.city.get_comments(self.get_area(), self.fin_category)
        return True
'山西': {'areaId': 1, 'provinceId': '4'}, '内蒙古': {'areaId': 1, 'provinceId': '5'}, '辽宁': {'areaId': 2, 'provinceId': '6'}, '吉林': {'areaId': 2, 'provinceId': '7'}, '黑龙江': {'areaId': 2, 'provinceId': '8'}, '上海': {'areaId': 3, 'provinceId': '9'}, '江苏': {'areaId': 3, 'provinceId': '10'}, '浙江': {'areaId': 3, 'provinceId': '11'}, '安徽': {'areaId': 3, 'provinceId': '12'}, '福建': {'areaId': 3, 'provinceId': '13'}, '江西': {'areaId': 3, 'provinceId': '14'}, '山东': {'areaId': 3, 'provinceId': '15'}, '河南': {'areaId': 4, 'provinceId': '16'}, '湖北': {'areaId': 4, 'provinceId': '17'}, '湖南': {'areaId': 4, 'provinceId': '18'}, '广东': {'areaId': 4, 'provinceId': '19'}, '广西': {'areaId': 4, 'provinceId': '20'}, '海南': {'areaId': 4, 'provinceId': '21'}, '重庆': {'areaId': 5, 'provinceId': '22'}, '四川': {'areaId': 5, 'provinceId': '23'}, '贵州': {'areaId': 5, 'provinceId': '24'}, '云南': {'areaId': 5, 'provinceId': '25'}, '西藏': {'areaId': 5, 'provinceId': '26'}, '陕西': {'areaId': 6, 'provinceId': '27'}, '甘肃': {'areaId': 6, 'provinceId': '28'}, '青海': {'areaId': 6, 'provinceId': '29'}, '宁夏': {'areaId': 6, 'provinceId': '30'}, '新疆': {'areaId': 6, 'provinceId': '31'}, '香港': {'areaId': 7, 'provinceId': '32'}, '澳门': {'areaId': 7, 'provinceId': '33'}, '台湾': {'areaId': 7, 'provinceId': '34'}} """ sh = City('上海') # url = sh.url sh.get() # locations = sh.locations results = sh.search('咖啡', category='咖啡厅', location='嘉定区', save=False, details=True) print(results)