def test_case_32(self):
    """Mon–Sat and Sunday ranges, with a trailing admission note, split per group."""
    raw = '周一~周六:10:00~17:00,周日:12:00–17:00,关门前15分钟禁止入场'
    expected = {
        '<*><周1-周6><10:00-17:00><SURE>',
        '<*><周7><12:00-17:00><SURE>',
    }
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_2(self):
    """An overnight 7pm–10am span is split across the midnight boundary."""
    raw = '周一 - 周六:下午7点00分 - 上午10点00分'
    expected = {
        '<*><周1><19:00-23:59><SURE>',
        '<*><周7><00:00-10:00><SURE>',
        '<*><周2-周6><00:00-10:00,19:00-23:59><SURE>',
    }
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_23(self):
    """Every 'always open' phrasing collapses to the single all-day record."""
    always_open = {'<*><*><00:00-23:59><SURE>'}
    cases = [
        '永久', '全日开放', '全年24小时', '24hours', '全年常开', '完全开放',
        '周一-周日:24小时', '一直开放', '全日全时段', '每天', '无限制',
        '24小时参观游览。', '24h', '24小时', '整天', '全年开放。', '24hr',
    ]
    for raw in cases:
        self.assertSetEqual(
            set(fix_daodao_open_time(raw).split('|')), always_open)
def test_case_11(self):
    """Concatenated day-range/time-range pairs without separators are parsed."""
    raw = '周一 - 周三10:00 - 17:00周四 - 周五10:00 - 21:00周六 - 周日10:00 - 17:00'
    expected = {
        '<*><周1-周3><10:00-17:00><SURE>',
        '<*><周4-周5><10:00-21:00><SURE>',
        '<*><周6-周7><10:00-17:00><SURE>',
    }
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_3(self):
    """Pipe-separated day groups each keep their own hours."""
    raw = '周二 - 周四:11:00 - 17:00|周五:11:00 - 21:00|周六 - 周日:11:00 - 17:00'
    expected = {
        '<*><周6-周7><11:00-17:00><SURE>',
        '<*><周2-周4><11:00-17:00><SURE>',
        '<*><周5><11:00-21:00><SURE>',
    }
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_6(self):
    """Past-midnight closing times spill the extra hours onto the next day."""
    raw = '周一 - 周四 10:00 - 22:00|周五 10:00 - 2:00|周六 8:00 - 2:00'
    expected = {
        '<*><周1-周4><10:00-22:00><SURE>',
        '<*><周5><10:00-23:59><SURE>',
        '<*><周6><00:00-02:00,08:00-23:59><SURE>',
        '<*><周7><00:00-02:00><SURE>',
    }
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_7(self):
    """Every group closing at 2:00 carries an early-morning slot into the next day."""
    raw = '周一 - 周四 7:00 - 2:00|周五 16:00 - 2:00|周六 13:00 - 2:00'
    expected = {
        '<*><周7><00:00-02:00><SURE>',
        '<*><周1><07:00-23:59><SURE>',
        '<*><周6><00:00-02:00,13:00-23:59><SURE>',
        '<*><周2-周4><00:00-02:00,07:00-23:59><SURE>',
        '<*><周5><00:00-02:00,16:00-23:59><SURE>',
    }
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_12(self):
    """'星期' day names and back-to-back time ranges on one day are both handled."""
    raw = ('星期日09:00 - 11:3018:00 - 19:00星期一 - 星期二08:00 - 17:00'
           '星期三08:00 - 19:30星期四08:00 - 17:00星期五08:00 - 16:30')
    expected = {
        '<*><周7><09:00-11:30,18:00-19:00><SURE>',
        '<*><周3><08:00-19:30><SURE>',
        '<*><周4><08:00-17:00><SURE>',
        '<*><周1-周2><08:00-17:00><SURE>',
        '<*><周5><08:00-16:30><SURE>',
    }
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def add_open_time_filter(_v):
    """special_filter callback: True only when *_v* normalizes to a legal open-time.

    Used as the ``special_filter`` for the open-time priority lookup in
    ``poi_insert_data``.  Values that ``fix_daodao_open_time`` cannot parse are
    recorded via ``insert_unknown_keywords`` for later offline review instead
    of raising.
    """
    if not is_legal(_v):
        return False
    try:
        _open_time = fix_daodao_open_time(_v)
        if is_legal(_open_time):
            return True
    except Exception:
        # persist the unrecognized open-time text so it can be analyzed later
        insert_unknown_keywords('{}_opentime'.format(poi_type), _v)
        logger.debug("[unknown open time][data: {}]".format(_v))
    # parseable-but-illegal results fall through to a rejection
    return False
def test_case_5(self):
    """Multiple slots per day, overnight spans, and overlapping groups all merge."""
    raw = ('周一 - 周二 11:00 - 14:00| 17:30 - 22:00|周三 5:30 - 22:00| 23:00 - 14:00|'
           '周四 11:00 - 14:00| 17:30 - 22:00|周五 - 周日 17:30 - 22:30|'
           '周五 11:30 - 14:00|周六 - 周日 11:30 - 14:30')
    expected = {
        '<*><周4><00:00-14:00,17:30-22:00><SURE>',
        '<*><周6-周7><11:30-14:30,17:30-22:30><SURE>',
        '<*><周1-周2><11:00-14:00,17:30-22:00><SURE>',
        '<*><周3><05:30-22:00,23:00-23:59><SURE>',
        '<*><周5><11:30-14:00,17:30-22:30><SURE>',
    }
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_35(self):
    """A seasonal-closing note after the hours is ignored."""
    raw = '周一至周日9:00-18:00,冬季提前1小时关门'
    expected = {'<*><周1-周7><09:00-18:00><SURE>'}
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_36(self):
    """'7 days a week' phrasing expands to the full Mon–Sun range."""
    raw = '08:00-20:00,一周7天运行。'
    expected = {'<*><周1-周7><08:00-20:00><SURE>'}
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_4(self):
    """Single-digit hours are zero-padded in the normalized output."""
    raw = '周一 - 周日:8:00 - 18:00'
    expected = {'<*><周1-周7><08:00-18:00><SURE>'}
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_40(self):
    """Excluded days (Thu, Sun, holidays) are carved out of the week."""
    raw = '14:00-17:30,周四、周日、节假日不开放'
    expected = {
        '<*><周5-周6><14:00-17:30><SURE>',
        '<*><周1-周3><14:00-17:30><SURE>',
    }
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_49(self):
    """A parenthesized caveat about Sunday closing earlier is ignored."""
    raw = '部分商店营业时间:9:00-20:00(周日关门时间会早于平时)。'
    expected = {'<*><周1-周7><09:00-20:00><SURE>'}
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_45(self):
    """'逢星期一关闭' (closed on Mondays) removes Monday from the range."""
    raw = '09:00-17:00 逢星期一关闭'
    expected = {'<*><周2-周7><09:00-17:00><SURE>'}
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_34(self):
    """A ticket-office note after the hours does not affect the result."""
    raw = '6:00-18:00,通常售票处下午5点停止售票'
    expected = {'<*><周1-周7><06:00-18:00><SURE>'}
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def poi_insert_data(cid, _poi_type):
    """Merge per-source POI records for city *cid* and upsert the results.

    Loads the raw per-source POI data plus the currently-online official /
    non-official rows, merges every field via priority rules, applies a chain
    of data-quality filters (empty names, closed businesses, bad coordinates,
    distance from city center, tag-based type filters), and upserts the merged
    rows into the poi_merge process table in batches of 300.

    NOTE(review): this body was reformatted from a whitespace-mangled
    original; the reconstructed statement grouping should be verified
    against version control.

    :param cid: city id whose POIs are merged
    :param _poi_type: one of 'attr' / 'rest' / 'shop' (anything else raises
        TypeError in the per-type branch below)
    """
    init_global_name(_poi_type)
    # Legacy final-table SQL kept for reference in an inert string literal.
    ''' 数据最终入库表 if _poi_type == 'attr': sql = 'replace into chat_attraction(`id`,`name`,`name_en`,`data_source`,`city_id`,' \ '`map_info`,`address`,`star`,`plantocount`,`beentocount`,`real_ranking`,' \ '`grade`,`commentcount`,`tagid`,`norm_tagid`,`norm_tagid_en`,`url`,`website_url`,`phone`,`introduction`,' \ '`open`, `open_desc`,`recommend_lv`,`prize`,`traveler_choice`, `alias`, ' \ '`image`, `ori_grade`,`nearCity`, `ranking`,`rcmd_open`,`add_info`,`address_en`,`event_mark`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,' \ '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,-1,"","","","")' elif _poi_type == 'rest': sql = 'replace into chat_restaurant(`id`,`name`,`name_en`,' \ '`source`,`city_id`,`map_info`,`address`,`real_ranking`,' \ '`grade`,`res_url`,`telphone`,`introduction`,`open_time`,`open_time_desc`,`prize`,' \ '`traveler_choice`,`review_num`,`price`,`price_level`,`cuisines`, ' \ '`image_urls`,`tagid`,`norm_tagid`,`norm_tagid_en`,`nearCity`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,' \ '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)' elif _poi_type == 'shop': sql = 'replace into ' \ 'chat_shopping(`id`,`name`,`name_en`,`data_source`,`city_id`,' \ '`map_info`,`address`,`star`,`plantocount`,`beentocount`,' \ '`real_ranking`,`grade`,`commentcount`,`tagid`,`norm_tagid`,`norm_tagid_en`,`url`,`website_url`,' \ '`phone`,`introduction`,`open`,`open_desc`,`recommend_lv`,`prize`,' \ '`traveler_choice`,`image`,`nearCity`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,' \ '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)' else: raise TypeError("Unknown Type: {}".format(poi_type)) '''
    conn = poi_ori_pool.connection()
    # for task_dict in get_task(cid):
    count = 0
    data = []
    # fetch everything needed for the merge
    _info_dict, _online_official_data, _online_nonofficial_data = get_poi_dict(
        cid)
    _city_poi = get_poi_union_info(cid)
    # merge each POI of the city
    for miaoji_id, city_id, union_info in _city_poi:
        # per-POI merge state: field name -> {source: value}
        data_dict = defaultdict(dict)
        # whether any offline source contributed data
        other_source = False
        # whether online official / nonofficial rows exist for this POI
        has_official = False
        has_nonofficial = False
        # online rows for this POI (may be absent)
        o_official_data = _online_official_data.get(miaoji_id, None)
        o_nonofficial_data = _online_nonofficial_data.get(miaoji_id, None)
        if o_official_data is not None:
            has_official = True
        if o_nonofficial_data is not None:
            has_nonofficial = True
        # seed data_dict from the online rows, field by field
        for each_name in (json_name_list + norm_name_list + others_name_list):
            data_dict[each_name] = {}

            def get_data(src_dict, is_official=False):
                # map the online column name back to its source column name
                if each_name in online2source:
                    source_name = online2source[each_name]
                else:
                    source_name = each_name
                if each_name in json_name_list:
                    # JSON columns: keep only entries from allowed sources
                    if source_name in src_dict:
                        try:
                            _res = json.loads(src_dict[source_name])
                            if isinstance(_res, dict):
                                data_dict[each_name] = {
                                    k: v
                                    for k, v in _res.items()
                                    if k in available_source
                                }
                            else:
                                pass
                        except Exception:
                            pass
                else:
                    # scalar columns: store under a pseudo-source key
                    data_dict[each_name][
                        'mioji_official' if is_official else
                        'mioji_nonofficial'] = src_dict.get(source_name, {})

            if o_official_data is not None:
                get_data(o_official_data, is_official=True)
            if o_nonofficial_data is not None:
                get_data(o_nonofficial_data, is_official=False)
        # walk each "source|id" pair of the union and merge its record
        for s_sid in union_info.split('|_||_|'):
            source, source_id = s_sid.split('|')
            # todo: handle online data — load first, then update
            # todo: use online base data to refresh data-process fields
            # missing merge id information
            if not source_id or not source:
                continue
            # skip sources we do not merge from
            if source not in available_source:
                logger.debug("[not available source: {}]".format(source))
                continue
            # no merged data available for this pair
            poi_info = _info_dict[(source, source_id)]
            if poi_info == {}:
                continue
            other_source = True
            # copy every legal field; strings are converted to simplified Chinese
            for each_name in (json_name_list + norm_name_list +
                              others_name_list):
                if is_legal(poi_info[each_name]):
                    if isinstance(poi_info[each_name], str):
                        data_dict[each_name][source] = tradition2simple(
                            poi_info[each_name]).decode()
                    else:
                        data_dict[each_name][source] = poi_info[each_name]
        # backfill fields that received no data at all
        for each_name in (json_name_list + norm_name_list +
                          others_name_list):
            if each_name not in data_dict:
                data_dict[each_name] = {}
        # nothing to merge from any source
        if not o_official_data and not o_nonofficial_data and not other_source:
            if 'online' in union_info:
                filter_data_already_online(poi_type, miaoji_id, "没有可供融合的数据")
            logger.debug('[union_info: {}]'.format(union_info))
            continue
        new_data_dict = {}

        # priority lookup for the (Chinese-preferred) display name
        def get_name_by_priority():
            # standard priority over the Chinese name field
            name_tmp = get_key.get_key_by_priority_or_default(
                data_dict['name'], norm_name, '',
                special_filter=check_chinese)
            # Chinese text hiding in the English field
            if not name_tmp:
                name_tmp = get_key.get_key_by_priority_or_default(
                    data_dict['name_en'], norm_name, '',
                    special_filter=check_chinese)
            # Latin text from the English field
            if not name_tmp:
                name_tmp = get_key.get_key_by_priority_or_default(
                    data_dict['name_en'], norm_name, '',
                    special_filter=check_latin)
            # Latin text from the Chinese field
            if not name_tmp:
                name_tmp = get_key.get_key_by_priority_or_default(
                    data_dict['name'], norm_name, '',
                    special_filter=check_latin)
            return name_tmp

        # priority lookup for the Latin name
        def get_name_en_by_priority():
            name_en_tmp = get_key.get_key_by_priority_or_default(
                data_dict['name_en'], norm_name, '',
                special_filter=check_latin)
            if not name_en_tmp:
                # NOTE(review): return value is discarded here — the fallback
                # lookup never takes effect; likely meant to assign to
                # name_en_tmp. Confirm before changing.
                get_key.get_key_by_priority_or_default(
                    data_dict['name'], norm_name, '',
                    special_filter=check_latin)
            return name_en_tmp

        for norm_name in norm_name_list:
            # name / name_en get special treatment; other fields use priority
            if norm_name == 'name':
                if has_official:
                    # official == 1: never update name
                    new_data_dict['name'] = data_dict['name']['mioji_official']
                elif has_nonofficial:
                    # official == 0: keep the name if it is already Chinese
                    if any([
                            toolbox.Common.is_chinese(c)
                            for c in data_dict['name']['mioji_nonofficial']
                    ]):
                        new_data_dict['name'] = data_dict['name'][
                            'mioji_nonofficial']
                    else:
                        new_data_dict['name'] = get_name_by_priority()
                else:
                    new_data_dict['name'] = get_name_by_priority()
            elif norm_name == 'name_en':
                # official == 1 keeps its English name, otherwise use priority
                if has_official:
                    new_data_dict['name_en'] = data_dict['name_en'][
                        'mioji_official']
                else:
                    new_data_dict['name_en'] = get_name_en_by_priority()
            else:
                new_data_dict[
                    norm_name] = get_key.get_key_by_priority_or_default(
                        data_dict[norm_name], norm_name, '')
        # daodao url: point the HK domain at the cn one
        if 'daodao' in data_dict['url']:
            data_dict['url']['daodao'] = data_dict['url']['daodao'].replace(
                'www.tripadvisor.com.hk', 'www.tripadvisor.cn')
        # restaurants: tagid is derived from cuisines
        if poi_type == 'rest':
            data_dict['tagid'] = copy.deepcopy(data_dict['cuisines'])
            new_data_dict['tagid'] = json.dumps({
                k: v
                for k, v in data_dict['tagid'].items() if k in final_source
            })
        # JSON columns keep only the final sources
        for json_name in json_name_list:
            new_data_dict[json_name] = json.dumps({
                k: v
                for k, v in data_dict[json_name].items() if k in final_source
            })
        new_data_dict['phone'] = new_data_dict['phone'].replace('电话号码:',
                                                                '').strip()
        # ori_grade: merge both online rows, then overlay freshly merged grades
        tmp_ori_grade = {}
        if has_official:
            try:
                tmp_ori_grade.update(json.loads(o_official_data['ori_grade']))
            except Exception as exc:
                logger.exception(msg="[load ori grade error]", exc_info=exc)
        if has_nonofficial:
            try:
                tmp_ori_grade.update(
                    json.loads(o_nonofficial_data['ori_grade']))
            except Exception as exc:
                logger.exception(msg="[load ori grade error]", exc_info=exc)
        tmp_ori_grade.update({k: v for k, v in data_dict['grade'].items()})
        new_data_dict['ori_grade'] = json.dumps(
            {k: v for k, v in tmp_ori_grade.items() if k in final_source})
        # merged source list, e.g. "daodao|qyer"
        source = '|'.join(
            map(lambda x: x.split('|')[0], union_info.split('|_||_|')))
        # aliases: every name variant except the chosen name / name_en
        alias = '|'.join(
            filter(
                lambda x: x != new_data_dict['name'] and x != new_data_dict[
                    'name_en'],
                set(
                    list(data_dict['name'].values()) +
                    list(data_dict['name_en'].values()))))
        # normalized open time from the best parseable description
        final_open_time_desc = get_key.get_key_by_priority_or_default(
            data_dict['opentime'], 'opentime',
            special_filter=add_open_time_filter)
        if final_open_time_desc:
            norm_open_time = fix_daodao_open_time(final_open_time_desc)
        else:
            norm_open_time = ''
        # normalized tags
        # todo change make qyer and other can be used
        unknown_tag = set()
        if 'daodao' in data_dict['tagid']:
            try:
                daodao_tagid, daodao_tagid_en, _unknown_tag = get_norm_tag(
                    data_dict['tagid']['daodao'], poi_type)
                unknown_tag.update(_unknown_tag)
            except Exception:
                daodao_tagid, daodao_tagid_en = '', ''
        else:
            daodao_tagid, daodao_tagid_en = '', ''
        # # rest tag
        # if 'daodao' in data_dict['tagid']:
        #     try:
        #         daodao_rest_tagid, daodao_rest_tagid_en, _ = get_norm_tag(data_dict['tagid']['daodao'],
        #                                                                   'rest')
        #     except Exception:
        #         daodao_rest_tagid, daodao_rest_tagid_en = '', ''
        # else:
        #     daodao_rest_tagid, daodao_rest_tagid_en = '', ''
        # shop tag (daodao)
        if 'daodao' in data_dict['tagid']:
            try:
                daodao_shop_tagid, daodao_shop_tagid_en, _ = get_norm_tag(
                    data_dict['tagid']['daodao'], 'shop')
            except Exception:
                daodao_shop_tagid, daodao_shop_tagid_en = '', ''
        else:
            daodao_shop_tagid, daodao_shop_tagid_en = '', ''
        if 'qyer' in data_dict['tagid']:
            try:
                qyer_tagid, qyer_tagid_en, _unknown_tag = get_norm_tag(
                    data_dict['tagid']['qyer'], poi_type)
                unknown_tag.update(_unknown_tag)
            except Exception:
                qyer_tagid, qyer_tagid_en = '', ''
        else:
            qyer_tagid, qyer_tagid_en = '', ''
        # # rest tag
        # if 'qyer' in data_dict['tagid']:
        #     try:
        #         qyer_rest_tagid, qyer_rest_tagid_en, _ = get_norm_tag(data_dict['tagid']['qyer'], 'rest')
        #     except Exception:
        #         qyer_rest_tagid, qyer_rest_tagid_en = '', ''
        # else:
        #     qyer_rest_tagid, qyer_rest_tagid_en = '', ''
        # shop tag (qyer)
        if 'qyer' in data_dict['tagid']:
            try:
                qyer_shop_tagid, qyer_shop_tagid_en, _ = get_norm_tag(
                    data_dict['tagid']['qyer'], 'shop')
            except Exception:
                qyer_shop_tagid, qyer_shop_tagid_en = '', ''
        else:
            qyer_shop_tagid, qyer_shop_tagid_en = '', ''
        l_norm_tag = []
        l_norm_tag_en = []
        l_norm_tag.extend(daodao_tagid.split('|'))
        l_norm_tag_en.extend(daodao_tagid_en.split('|'))
        l_norm_tag.extend(qyer_tagid.split('|'))
        l_norm_tag_en.extend(qyer_tagid_en.split('|'))
        l_other_norm_tag = []
        l_other_norm_tag.extend(daodao_shop_tagid.split('|'))
        l_other_norm_tag.extend(qyer_shop_tagid.split('|'))
        # drop empty and duplicate tags
        norm_tag = '|'.join(filter(lambda x: is_legal(x), set(l_norm_tag)))
        norm_tag_en = '|'.join(
            filter(lambda x: is_legal(x), set(l_norm_tag_en)))
        other_tag = '|'.join(
            filter(lambda x: is_legal(x), set(l_other_norm_tag)))
        # persistence section — from here on data_dict is the merged result
        data_dict = new_data_dict
        # drop rows whose Chinese AND English names are both empty-ish
        if data_dict['name'].lower() in (
                '', 'null',
                '0') and data_dict['name_en'].lower() in ('', 'null', '0'):
            if 'online' in union_info:
                filter_data_already_online(poi_type, miaoji_id, "中英文名为空")
            logger.debug("[filter by name][name: {}][name_en: {}]".format(
                data_dict['name'], data_dict['name_en']))
            continue
        # drop POIs marked as permanently closed (停业)
        if '停业' in data_dict['name'] or '停业' in data_dict['name_en']:
            if 'online' in union_info:
                filter_data_already_online(poi_type, miaoji_id, "停业 POI")
            logger.debug(
                "[filter by name with close business][name: {}][name_en: {}]".
                format(data_dict['name'], data_dict['name_en']))
            continue
        # removed as too crude:
        # # name / name_en comparison
        # if data_dict['name'] != data_dict['name_en']:
        #     if data_dict['name_en'] in data_dict['name']:
        #         data_dict['name'] = data_dict['name'].replace(data_dict['name_en'], '')
        # phone placeholders ("add a phone number" links) become empty
        if data_dict['phone'] in ('+ 新增電話號碼', '+ 新增电话号码'):
            data_dict['phone'] = ''
        # restaurants map the textual price level to a number
        if poi_type == 'rest':
            data_dict['price_level'] = W2N.get(
                data_dict.get('price_level', ''), '0')
        # nearCity field
        nearby_city = get_nearby_city(poi_city_id=city_id,
                                      poi_map_info=data_dict['map_info'])
        # drop rows whose coordinates do not parse as "lng,lat" floats
        try:
            lng, lat = data_dict['map_info'].split(',')
            lng = float(lng)
            lat = float(lat)
            data_dict['map_info'] = '{},{}'.format(lng, lat)
        except Exception as exc:
            logger.exception(msg="[map_info filter error][data: {}]".format(
                data_dict['map_info']),
                             exc_info=exc)
            continue
        # strip claim-your-business boilerplate from the name
        data_dict['name'] = data_dict['name'].replace('这是您的企业吗?', '').strip()
        if data_dict['name_en'] in data_dict[
                'name'] and data_dict['name_en'] != data_dict['name']:
            # NOTE(review): str.replace result is discarded — this line is a
            # no-op as written; likely meant to reassign data_dict['name'].
            data_dict['name'].replace(data_dict['name_en'], '')
        # address normalization
        if data_dict['address'].lower() in ('null', '0'):
            data_dict['address'] = ''
        # default open time when normalization produced nothing
        if norm_open_time.lower() in ('', 'null', '0'):
            if poi_type in ('attr', 'rest'):
                norm_open_time = '<*><*><00:00-23:55><SURE>'
            else:
                norm_open_time = '<*><*><08:00-20:00><SURE>'
        # persist unrecognized tags for later review
        if unknown_tag:
            insert_unknown_keywords('{}_tag'.format(poi_type), unknown_tag)
            logger.debug("[unknown tag][tags: {}]".format(unknown_tag))
        # drop POIs too far from the city center
        result = poi_is_too_far(city_id, poi_map_info=data_dict['map_info'])
        if not result:
            if 'online' in union_info:
                filter_data_already_online(poi_type, miaoji_id, "距城市中心距离过远")
            logger.debug(
                "[poi filter by poi city distance][cid: {}][city_map: {}][poi_map_info: {}][distance: {}]"
                .format(city_id, result.city_map, data_dict['map_info'],
                        result.dist))
            continue
        # phones longer than 55 chars do not fit the column — blank them
        if len(data_dict['phone']) > 55:
            logger.debug(
                "[phone length too long][poi_id: {}][len: {}][phone: {}]".
                format(miaoji_id, len(data_dict['phone']),
                       data_dict['phone']))
            data_dict['phone'] = ''
        if poi_type == 'attr':
            per_data = {
                'id': miaoji_id,
                'name': data_dict['name'],
                'name_en': data_dict['name_en'],
                'data_source': source,
                'city_id': city_id,
                'map_info': data_dict['map_info'],
                'address': data_dict['address'],
                'star': data_dict['star'],
                'plantocount': data_dict['plantocounts'],
                'beentocount': data_dict['beentocounts'],
                'real_ranking': data_dict['ranking'],
                # 'grade': data_dict['grade'],
                'commentcount': data_dict['commentcounts'],
                'tagid': data_dict['tagid'],
                'norm_tagid': norm_tag,
                'norm_tagid_en': norm_tag_en,
                'website_url': data_dict['site'],
                'phone': data_dict['phone'],
                'open': norm_open_time,
                'open_desc': data_dict['opentime'],
                'recommend_lv': data_dict['recommend_lv'],
                'prize': data_dict['prize'],
                'traveler_choice': data_dict['traveler_choice'],
                'alias': alias,
                'image': data_dict['imgurl'],
                'ori_grade': data_dict['ori_grade'],
                'nearCity': nearby_city
            }
            # fields frozen when official == 1; updated for nonofficial / new rows
            if not has_official:
                per_data.update({
                    'introduction': data_dict['introduction'],
                    'url': data_dict['url'],
                })
            if not has_official and not has_nonofficial:
                per_data.update({
                    # only rows never merged before get these defaults
                    'ranking': -1.0,
                    'rcmd_open': '',
                    'add_info': '',
                    'address_en': '',
                    'event_mark': '',
                    'grade': -1.0,
                    # status fields only set when the row was never merged
                    'status_online': 'Open',
                    'status_test': 'Open'
                })
            # attraction cleanup: drop sightseeing-tour ("游览") entries,
            # except online rows which must still be updated
            try:
                tagid_data = json.loads(data_dict['tagid'])
                if 'daodao' in tagid_data:
                    if is_legal(tagid_data['daodao']):
                        if '游览' in tagid_data['daodao']:
                            if 'online' in union_info:
                                # would normally be deleted, but some online rows
                                # must keep updating, so they are kept;
                                # new rows must never gain the tour tag
                                # filter_data_already_online(poi_type, miaoji_id, "tag 中包含游览被过滤")
                                pass
                            else:
                                logger.debug("[tour filter][data: {}]".format(
                                    tagid_data['daodao']))
                                continue
            except Exception as exc:
                logger.exception(msg="[tour filter error]", exc_info=exc)
            # attractions carrying only shopping tags are dropped (unless online)
            if norm_tag == '' and other_tag != '':
                if 'online' in union_info:
                    # kept for the same online-update reason as above
                    # filter_data_already_online(poi_type, miaoji_id, "景点类中存在购物数据被过滤")
                    pass
                else:
                    continue
            data.append(per_data)
        elif poi_type == 'rest':
            # NOTE(review): `key` is not defined anywhere in this function —
            # this branch would raise NameError if reached; verify intent
            # (possibly city_id) against version control.
            data.append(
                (miaoji_id, data_dict['name'], data_dict['name_en'], source,
                 key, data_dict['map_info'], data_dict['address'],
                 data_dict['ranking'], data_dict['grade'], data_dict['url'],
                 data_dict['phone'], data_dict['introduction'],
                 norm_open_time, data_dict['opentime'], data_dict['prize'],
                 data_dict['traveler_choice'], data_dict['commentcounts'],
                 data_dict['price'], data_dict['price_level'],
                 data_dict['cuisines'], data_dict['imgurl'],
                 data_dict['tagid'], norm_tag, norm_tag_en, nearby_city))
        elif poi_type == 'shop':
            per_data = {
                'id': miaoji_id,
                'name': data_dict['name'],
                'name_en': data_dict['name_en'],
                'data_source': source,
                'city_id': city_id,
                'map_info': data_dict['map_info'],
                'address': data_dict['address'],
                'star': data_dict['star'],
                'plantocount': data_dict['plantocounts'],
                'beentocount': data_dict['beentocounts'],
                'real_ranking': data_dict['ranking'],
                # 'grade': data_dict['grade'],
                'commentcount': data_dict['commentcounts'],
                'tagid': data_dict['tagid'],
                'norm_tagid': norm_tag,
                'norm_tagid_en': norm_tag_en,
                'website_url': data_dict['site'],
                'phone': data_dict['phone'],
                'open': norm_open_time,
                'open_desc': data_dict['opentime'],
                'recommend_lv': data_dict['recommend_lv'],
                'prize': data_dict['prize'],
                'traveler_choice': data_dict['traveler_choice'],
                'image': data_dict['imgurl'],
                'nearCity': nearby_city
            }
            # fields frozen when official == 1; updated for nonofficial / new rows
            if not has_official:
                per_data.update({
                    'introduction': data_dict['introduction'],
                    'url': data_dict['url'],
                })
            if not has_official and not has_nonofficial:
                per_data.update({
                    # defaults required for insertion
                    'ranking': -1.0,
                    'rcmd_open': '',
                    'image_list': '',
                    'grade': -1.0,
                    # status fields only set when the row was never merged
                    'status_online': 'Open',
                    'status_test': 'Open'
                })
            shopping_tag = [
                '礼品与特产商店', '大型购物中心', '农贸市场', '跳蚤市场与街边市场', '古董店', '百货商场',
                '厂家直营店', '购物'
            ]
            important_shopping_tag = [
                '礼品与特产商店', '大型购物中心', '百货商场', '厂家直营店', '购物'
            ]
            # keep only rows whose tags look like shopping
            tag_list = norm_tag.split('|')
            if not all([tag.strip() in shopping_tag for tag in tag_list]):
                if not any([
                        tag.strip() in important_shopping_tag
                        for tag in tag_list
                ]):
                    if 'online' in union_info:
                        filter_data_already_online(poi_type, miaoji_id,
                                                   "非购物数据被过滤")
                    continue
            data.append(per_data)
        else:
            raise TypeError("Unknown Type: {}".format(poi_type))
        # flush accumulated rows every 300 POIs
        # NOTE(review): DB credentials are hardcoded in the DSN — move to config.
        if count % 300 == 0:
            db = dataset.connect(
                "mysql+pymysql://mioji_admin:[email protected]/poi_merge?charset=utf8"
            )
            table = db[data_process_table_name]
            _insert = 0
            logger.debug("Total: {}".format(count))
            _t = time.time()
            for d in data:
                _res = table.upsert(d, keys=['id'])
                if _res:
                    _insert += 1
            logger.debug(
                '[data upsert][count: {}][insert: {}][takes: {}]'.format(
                    count, _insert, time.time() - _t))
            logger.debug("[city_id: {}][insert_count_this_times: {}]".format(
                cid, _insert))
            db.commit()
            data = []
        count += 1
    logger.debug("[city_id: {}][total: {}]".format(cid, count))
    # final flush of the remaining rows
    _insert = 0
    db = dataset.connect(
        "mysql+pymysql://mioji_admin:[email protected]/poi_merge?charset=utf8"
    )
    table = db[data_process_table_name]
    for d in data:
        _res = table.upsert(d, keys=['id'])
        if _res:
            _insert += 1
    logger.debug("Insert: {}".format(_insert))
    db.commit()
    logger.debug("Insert: {}".format(_insert))
    conn.close()
    update_already_merge_city("{}_data".format(poi_type), cid)
def test_case_37(self):
    """A Thursday-extension note does not alter the base hours."""
    raw = '9:30-17:00,周四延长至21:00'
    expected = {'<*><周1-周7><09:30-17:00><SURE>'}
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_42(self):
    """Weekend closure ('周六、周日歇业') restricts the range to Mon–Fri."""
    raw = '8:30 - 18:00 ,周六、周日歇业'
    expected = {'<*><周1-周5><08:30-18:00><SURE>'}
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_38(self):
    """En-dash time range with Monday closure ('周一闭馆') yields Tue–Sun."""
    raw = '10:00–17:00,周一闭馆'
    expected = {'<*><周2-周7><10:00-17:00><SURE>'}
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_41(self):
    """Monday closure ('周一歇业') yields the Tue–Sun range."""
    raw = '12:00 - 18:00 ,周一歇业'
    expected = {'<*><周2-周7><12:00-18:00><SURE>'}
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_8(self):
    """'周一至周日' plus a trailing full stop normalizes to Mon–Sun."""
    raw = '周一至周日8:00-17:30。'
    expected = {'<*><周1-周7><08:00-17:30><SURE>'}
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_46(self):
    """Parenthesized Monday closure ('(周一休馆)') yields Tue–Sun."""
    raw = '8:30-19:30(周一休馆)'
    expected = {'<*><周2-周7><08:30-19:30><SURE>'}
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_10(self):
    """A Sunday-to-Saturday range covers the whole week."""
    raw = '星期日-星期六 10:00-19:00'
    expected = {'<*><周1-周7><10:00-19:00><SURE>'}
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_47(self):
    """Double-dash range with Wednesday closure splits around Wednesday."""
    raw = '8:30--19:30 星期三关闭'
    expected = {
        '<*><周1-周2><08:30-19:30><SURE>',
        '<*><周4-周7><08:30-19:30><SURE>',
    }
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_43(self):
    """'逢星期二关闭' (closed on Tuesdays) splits the week around Tuesday."""
    raw = '9:00-19:30,逢星期二关闭'
    expected = {
        '<*><周3-周7><09:00-19:30><SURE>',
        '<*><周1><09:00-19:30><SURE>',
    }
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
def test_case_39(self):
    """Friday closure ('周五不开放') splits the week around Friday."""
    raw = '8:30-13:30,周五不开放'
    expected = {
        '<*><周6-周7><08:30-13:30><SURE>',
        '<*><周1-周4><08:30-13:30><SURE>',
    }
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)
# NOTE(review): orphaned fragment — this begins mid-expression (the opening of
# the enclosing call, presumably a '|'.join(...) building `alias`, is not
# present here) and appears to be a leftover of an older version of the merge
# routine in poi_insert_data (it rebuilds alias / norm_open_time / norm_tag
# from data_dict and then swaps in new_data_dict). It is not valid standalone
# Python; verify against version control and remove or restore the missing
# context. Kept byte-identical below.
filter( lambda x: x != new_data_dict['name'] and x != new_data_dict['name_en'], set( list(data_dict['name'].values()) + list(data_dict['name_en'].values())))) # ori_grade modify new_data_dict['ori_grade'] = json.dumps(data_dict['grade']) # todo modify opentime, norm_tagid, comment and so on if 'daodao' in data_dict['opentime']: open_desc = data_dict['opentime']['daodao'] try: norm_open_time = fix_daodao_open_time(open_desc) except Exception: norm_open_time = '' print(open_desc) else: norm_open_time = '' if 'daodao' in data_dict['tagid']: norm_tag, norm_tag_en = get_norm_tag(data_dict['tagid']) else: norm_tag, norm_tag_en = '', '' # 数据入库部分 # 替换旧的 data_dict data_dict = new_data_dict
def test_case_44(self):
    """Tuesday closure ('周二闭馆') splits the week around Tuesday."""
    raw = '10:00-18:00 周二闭馆'
    expected = {
        '<*><周1><10:00-18:00><SURE>',
        '<*><周3-周7><10:00-18:00><SURE>',
    }
    self.assertSetEqual(set(fix_daodao_open_time(raw).split('|')), expected)