# Property usages (CJ_FWYT values) for which the matching API is consulted.
_MATCH_RESIDENTIAL_USAGES = ('住宅', '综合社区', '别墅', '里弄房', '老公房')

# An address is only looked up in the match table when it contains digits
# followed by one of these suffix markers.
_MATCH_NUMBERED_ADDRESS_RE = re.compile(
    r'\d+(号|弄|支|支弄|单号|双号|甲号|乙号|丙号|丁号)', re.S | re.M)


def _match_store(doc_id, fj_city, fj_region, fj_name):
    """Write the matched city/region/name onto one crawler document and flag it."""
    crawler_collection.find_one_and_update(
        {'_id': doc_id},
        {'$set': {
            'fj_city': fj_city,
            'fj_region': fj_region,
            'fj_name': fj_name,
            'fj_flag': 1,
            # Always timezone-aware UTC so every pass writes the same kind of value.
            'update_time': datetime.now(timezone.utc),
        }})
    log.info('更新数据 _id={}'.format(doc_id))


def _match_exact(city, region, keyword):
    """Return the matcher result for `keyword` when flagged '精确匹配', else None."""
    # NOTE(review): `match` resolves to the module-level name at call time. If
    # that is the zero-argument `match()` defined below, this call raises
    # TypeError. Presumably an imported matcher with the same name was
    # intended -- confirm and alias that import so it is not shadowed.
    result = match(city=city, region=region, keyword=keyword)
    if result and result['flag'] == '精确匹配':
        return result
    return None


def match():
    """Fill fj_city / fj_region / fj_name / fj_flag on documents of `crawler_collection`.

    Look in the match table first; fall back to the matching API when nothing
    is found. Three passes are made:

    1. Every document is looked up in `match_collection` by
       (source='youda', city='上海', region=CJ_XQ, friendsName=CJ_LPMC). If
       that misses and CJ_ZL contains a numbered address, the lookup is
       retried with friendsAddress=CJ_ZL.
    2. Documents whose fj_flag is still unset and whose CJ_FWYT is a
       residential usage are sent to the matching API once per entry of
       `name_list`, stopping at the first exact match.
    3. Same as pass 2, but using the document's `address` field (when
       present) as the keyword.

    Returns:
        None. Results are written back to `crawler_collection`.
    """
    city = '上海'

    # Cursors opened with no_cursor_timeout are never reaped by the server, so
    # each one is used as a context manager to guarantee it is closed even if
    # a lookup raises part-way through.
    with crawler_collection.find(no_cursor_timeout=True) as cursor:
        for doc in cursor:
            query = {'source': 'youda', 'city': city, 'region': doc['CJ_XQ']}
            hit = match_collection.find_one(
                dict(query, friendsName=doc['CJ_LPMC']))
            if not hit:
                address = doc['CJ_ZL']
                # Skip empty/None addresses instead of crashing in the regex.
                if address and _MATCH_NUMBERED_ADDRESS_RE.search(address):
                    hit = match_collection.find_one(
                        dict(query, friendsAddress=address))
            if hit:
                _match_store(doc['_id'], hit['city'], hit['region'],
                             hit['fjName'])

    with crawler_collection.find({'fj_flag': None},
                                 no_cursor_timeout=True) as cursor:
        for doc in cursor:
            if doc['CJ_FWYT'] not in _MATCH_RESIDENTIAL_USAGES:
                continue
            for name in doc['name_list']:
                result = _match_exact(city, doc['CJ_XQ'], name)
                if result:
                    _match_store(doc['_id'], result['mcity'],
                                 result['mregion'], result['mname'])
                    # One exact match is enough for this document.
                    break

    with crawler_collection.find({'fj_flag': None},
                                 no_cursor_timeout=True) as cursor:
        for doc in cursor:
            if doc['CJ_FWYT'] not in _MATCH_RESIDENTIAL_USAGES:
                continue
            if 'address' not in doc:
                continue
            result = _match_exact(city, doc['CJ_XQ'], doc['address'])
            if result:
                _match_store(doc['_id'], result['mcity'], result['mregion'],
                             result['mname'])
def match_address_estate_type2(data):
    """Copy the matched address and estate_type2 onto a `collection_lianjia` document.

    The document's district name is sent to the matcher. Only when the result
    is flagged '精确匹配' is the document's `address` set to the matcher's
    `maddress`. The matched `_id` is then looked up in the `fangjia.seaweed`
    collection, and its `estate_type2` (when present) is copied as well.

    Args:
        data: mapping with the keys `_id`, `city`, `region` and
            `district_name`.

    Returns:
        None. Updates are written to `collection_lianjia`.
    """
    _id = data['_id']
    match_data = match(city=data['city'],
                       region=data['region'],
                       keyword=data['district_name'])
    if not match_data or match_data['flag'] != '精确匹配':
        return

    address = match_data['maddress']
    collection_lianjia.find_one_and_update(
        {'_id': _id}, {'$set': {'address': address}})
    print('更新地址 _id={} address={}'.format(_id, address))

    match_id = match_data['_id']
    m = MongoClient(host='192.168.0.136', port=27017)
    try:
        collection_seaweed = m['fangjia']['seaweed']
        seaweed_data = collection_seaweed.find_one({'_id': ObjectId(match_id)})
        if seaweed_data and 'estate_type2' in seaweed_data:
            collection_lianjia.find_one_and_update(
                {'_id': _id},
                {'$set': {'estate_type2': seaweed_data['estate_type2']}})
            print('更新地址 _id={} estate_type2={}'.format(
                _id, seaweed_data['estate_type2']))
    finally:
        # Close the per-call client even when a lookup or update raises, so
        # failed calls do not leak connections.
        m.close()
def match_data(i, m_address):
    """Match one document by address keyword and store an exact match.

    Args:
        i: mapping with the keys `_id`, `city` and `region`.
        m_address: keyword passed to the matcher.

    Returns:
        1 when the matcher result's flag contains '精确匹配' and the document
        in `collection_delete_repeat` was updated, otherwise 0.
    """
    result = match(city=i['city'], region=i['region'], keyword=m_address)

    # Anything other than an exact match leaves the document untouched.
    if not result:
        return 0
    if '精确匹配' not in result['flag']:
        return 0

    matched_fields = {
        'fj_city': result['mcity'],
        'fj_region': result['mregion'],
        'fj_name': result['mname'],
        'fj_id': result['_id'],
        'fj_flag': 1,
    }
    collection_delete_repeat.find_one_and_update(
        {'_id': i['_id']}, {'$set': matched_fields})
    print('匹配一条数据')
    return 1
def start(i):
    """Match one document by its district name and store an exact match.

    Args:
        i: mapping with the keys `_id`, `city`, `region` and `district_name`.

    Returns:
        None. On an exact match the document in `collection_delete_repeat`
        gets fj_city, fj_region, fj_name, fj_id and fj_flag set.
    """
    result = match(city=i['city'],
                   region=i['region'],
                   keyword=i['district_name'])

    # Only results whose flag contains the exact-match marker are stored.
    if not result or '精确匹配' not in result['flag']:
        return

    matched_fields = {
        'fj_city': result['mcity'],
        'fj_region': result['mregion'],
        'fj_name': result['mname'],
        'fj_id': result['_id'],
        'fj_flag': 1,
    }
    collection_delete_repeat.find_one_and_update(
        {'_id': i['_id']}, {'$set': matched_fields})
    print('匹配一条数据')
# Property types for which the matching API is consulted.
_FJ_RESIDENTIAL_TYPES = ('住宅', '综合社区', '别墅')

# An address is only looked up in the match table when it contains digits
# followed by one of these suffix markers.
_FJ_NUMBERED_ADDRESS_RE = re.compile(
    r'\d+(号|弄|支|支弄|单号|双号|甲号|乙号|丙号|丁号)', re.S | re.M)


def _fj_store(doc_id, fj_city, fj_region, fj_name):
    """Write the matched city/region/name onto one crawler document and flag it."""
    crawler_collection.find_one_and_update(
        {'_id': doc_id},
        {'$set': {
            'fj_city': fj_city,
            'fj_region': fj_region,
            'fj_name': fj_name,
            'fj_flag': 1,
            # Always timezone-aware UTC so every branch writes the same kind of value.
            'update_time': datetime.now(timezone.utc),
        }})


def _fj_exact(city, region, keyword):
    """Return the matcher result for `keyword` when flagged '精确匹配', else None."""
    result = match(city=city, region=region, keyword=keyword)
    if result and result['flag'] == '精确匹配':
        return result
    return None


def _fj_shared_chars(first, second):
    """Return the characters that occur in both strings, each one once."""
    # NOTE(review): repeated characters are collapsed and the result need not
    # be a contiguous substring of either input -- confirm this is the
    # intended way to derive the estate name.
    return "".join((Counter(first) & Counter(second)).keys())


def add_fj_name():
    """Fill fj_city / fj_region / fj_name / fj_flag on documents of `crawler_collection`.

    Look in the match table first; fall back to the matching API when nothing
    is found. Two passes are made:

    1. Every document is looked up in `collection_match` by
       (source='res', city='上海', region=area, friendsName=fullhousingname).
       If that misses and `housingaddressall` contains a numbered address,
       the lookup is retried with friendsAddress=housingaddressall.
    2. Documents whose fj_flag is still unset and whose `propertytype` is
       residential are sent to the matching API, first with the characters
       shared by `fullhousingname` and `newdiskname`, then -- if that gives no
       exact match -- with `houseaddress`.

    Returns:
        None. Results are written back to `crawler_collection`.
    """
    city = '上海'

    # Cursors opened with no_cursor_timeout are never reaped by the server, so
    # each one is used as a context manager to guarantee it is closed even if
    # a lookup raises part-way through.
    with crawler_collection.find(no_cursor_timeout=True) as cursor:
        for doc in cursor:
            query = {'source': 'res', 'city': city, 'region': doc['area']}
            hit = collection_match.find_one(
                dict(query, friendsName=doc['fullhousingname']))
            if not hit:
                address = doc['housingaddressall']
                # Skip empty/None addresses instead of crashing in the regex.
                if address and _FJ_NUMBERED_ADDRESS_RE.search(address):
                    hit = collection_match.find_one(
                        dict(query, friendsAddress=address))
            if hit:
                _fj_store(doc['_id'], hit['city'], hit['region'],
                          hit['fjName'])
                # Report the id of the document that was updated, not the id
                # of the match-table row.
                print('更新数据 添加格式化城市区域小区名 _id={}'.format(doc['_id']))

    with crawler_collection.find({'fj_flag': None},
                                 no_cursor_timeout=True) as cursor:
        for doc in cursor:
            if doc['propertytype'] not in _FJ_RESIDENTIAL_TYPES:
                continue
            region = doc['area']

            # Use the part common to both name fields as the estate name.
            keyword = _fj_shared_chars(doc['fullhousingname'],
                                       doc['newdiskname'])
            result = _fj_exact(city, region, keyword)
            if result is None:
                # Fall back to the address whenever the name gave no exact
                # match, whether the matcher returned nothing or only a
                # non-exact result.
                result = _fj_exact(city, region, doc['houseaddress'])
            if result:
                _fj_store(doc['_id'], result['mcity'], result['mregion'],
                          result['mname'])
                print('更新数据 _id={}'.format(doc['_id']))
username='******', password='******') collection_res_2018 = m['deal_price']['res_second_2018_11'] for i in collection_res_2018.find({"fj_name": None}): fullhousingname = i['fullhousingname'] print(fullhousingname) newdiskname = i['newdiskname'] print(newdiskname) Counter(fullhousingname) Counter(newdiskname) c = Counter(fullhousingname) & Counter(newdiskname) new_name = "".join(c.keys()) print(new_name) data = match(city='上海', region=i['area'], keyword=new_name) print(data) if data: if data['flag'] == '精确匹配': collection_res_2018.find_one_and_update({'_id': i['_id']}, {'$set': {'fj_city': data['mcity'], 'fj_region': data['mregion'], 'fj_name': data['mname'], 'fj_flag': 1, 'update_time': datetime.utcnow()}}) print('更新一条数据 fj_name={}'.format(data['mname'])) # count = 0 # for i in collection_res_2018.find({'fj_flag': 1}): # if '地下' in i['houseaddress']: # collection_res_2018.find_one_and_update({'_id': i['_id']}, {'$set': {'floor': None}})