def process_zone_group(zone_name, fliggy_zones, city_name):
    """Match every sub-zone of a '/'-separated zone group against the
    fliggy zone list (by pinyin containment) and, only when ALL
    sub-zones match, append one combined row to the final result file.

    :param zone_name: zone names joined by '/', e.g. u'A/B'
    :param fliggy_zones: candidate fliggy zone names for the same city
    :param city_name: city the zones belong to
    """
    z_name_list = zone_name.split('/')
    total_match_list = []
    for z_name in z_name_list:
        # Hoisted: z_name's pinyin does not depend on fliggy_zone, so it
        # is computed once per sub-zone instead of once per candidate.
        haoqiao_zone_name_pinyin = pypinyin.slug(unicode(z_name), separator='')
        match_list = []
        for fliggy_zone in fliggy_zones:
            fliggy_zone_name_pinyin = pypinyin.slug(unicode(fliggy_zone), separator='')
            # A zone matches when either pinyin string contains the other.
            if haoqiao_zone_name_pinyin in fliggy_zone_name_pinyin or fliggy_zone_name_pinyin in haoqiao_zone_name_pinyin:
                match_list.append(fliggy_zone)
        if len(match_list) == 0:
            # One sub-zone failed to match: the whole group is rejected.
            break
        elif len(match_list) == 1:
            total_match_list.append(match_list[0])
        else:
            # Several candidates: keep the most similar one.
            selected_zone = calculate_similarity(z_name, match_list)
            total_match_list.append(selected_zone)
    if len(z_name_list) == len(total_match_list):
        # Every sub-zone matched -> write one combined result line.
        ratio = get_ratio(city_name, zone_name, haoqiao_setp1_file_name)
        match_id_list = []
        for match in total_match_list:
            match_id_list.append(
                get_zid_by_cname_zname(city_name, match, fliggy_setp1_file_name))
        content = city_name + '\t' + '####'.join(match_id_list) + '\t' + \
            '####'.join(total_match_list) + '\t' + ratio + '\n'
        write2file(final_result_file_name, content)
def process_single_zone(zone_name, fliggy_zones, city_name):
    """Match one ctrip zone against the fliggy zones of the same city.

    Pinyin containment decides a match.  Exactly one match is written to
    the final result file; several matches are disambiguated by
    similarity; no match sends the best longest-common-substring
    candidates to the step-2 file for manual review.
    """
    # Hoisted out of the loop: zone_name's pinyin is loop-invariant.
    ctrip_zone_name_pinyin = pypinyin.slug(unicode(zone_name), separator='')
    match_list = []
    for fliggy_zone in fliggy_zones:
        fliggy_zone_name_pinyin = pypinyin.slug(unicode(fliggy_zone), separator='')
        if ctrip_zone_name_pinyin in fliggy_zone_name_pinyin or fliggy_zone_name_pinyin in ctrip_zone_name_pinyin:
            match_list.append(fliggy_zone)
    if len(match_list) == 0:
        # No match: record longest-common-substring candidates for manual handling.
        sorted_selected_zones = get_zone_list_by_lcs(zone_name, fliggy_zones)
        if len(sorted_selected_zones) == 1:
            content = zone_name + '\t' + sorted_selected_zones[0][0] + '\t' + \
                str(sorted_selected_zones[0][1]) + '\n'
            write2file(ctrip_setp2_file_name, content)
        elif len(sorted_selected_zones) >= 2:
            content = zone_name + '\t' + sorted_selected_zones[0][0] + '\t' + str(sorted_selected_zones[0][1]) + '\t' + \
                sorted_selected_zones[1][0] + '\t' + str(sorted_selected_zones[1][1]) + '\n'
            write2file(ctrip_setp2_file_name, content)
    elif len(match_list) == 1:
        ratio = get_ratio(city_name, zone_name, ctrip_setp1_file_name)
        fliggy_zone_id = get_zid_by_cname_zname(city_name, match_list[0], fliggy_setp1_file_name)
        content = city_name + '\t' + fliggy_zone_id + '\t' + match_list[0] + '\t' + ratio + '\n'
        write2file(final_result_file_name, content)
    elif len(match_list) > 1:
        # Several candidates: keep the one most similar to zone_name.
        selected_zone = calculate_similarity(zone_name, match_list)
        ratio = get_ratio(city_name, zone_name, ctrip_setp1_file_name)
        fliggy_zone_id = get_zid_by_cname_zname(city_name, selected_zone, fliggy_setp1_file_name)
        content = city_name + '\t' + fliggy_zone_id + '\t' + selected_zone + '\t' + ratio + '\n'
        write2file(final_result_file_name, content)
def process_single_zone(zone_name, fliggy_zones, city_name): match_list = [] for fliggy_zone in fliggy_zones: haoqiao_zone_name_pinyin = pypinyin.slug(unicode(zone_name), separator='') fliggy_zone_name_pinyin = pypinyin.slug(unicode(fliggy_zone), separator='') if haoqiao_zone_name_pinyin in fliggy_zone_name_pinyin or fliggy_zone_name_pinyin in haoqiao_zone_name_pinyin: match_list.append(fliggy_zone) if len(match_list) == 0: print '未匹配上' elif len(match_list) == 1: ratio = get_ratio(city_name, zone_name, haoqiao_setp1_file_name) fliggy_zone_id = get_zid_by_cname_zname(city_name, match_list[0], fliggy_setp1_file_name) content = city_name + '\t' + fliggy_zone_id + '\t' + match_list[ 0] + '\t' + ratio + '\n' write2file(final_result_file_name, content) elif len(match_list) > 1: selected_zone = calculate_similarity(zone_name, match_list) ratio = get_ratio(city_name, zone_name, haoqiao_setp1_file_name) fliggy_zone_id = get_zid_by_cname_zname(city_name, selected_zone, fliggy_setp1_file_name) content = city_name + '\t' + fliggy_zone_id + '\t' + selected_zone + '\t' + ratio + '\n' write2file(final_result_file_name, content)
def set2py(self, runType='once'):
    """Backfill pinyin fields for episode records, 10 rows per batch.

    Fetches up to 10 rows that still lack pinyin, computes slug /
    first-letter / display pinyin for each title and saves them.
    With runType='loop' it recurses until the DB reports completion.
    """
    # Process 10 records per batch; recurse until everything is done.
    vlist = self._db.getNonpySetList(10)
    # getNonpySetList returns False when nothing is left — presumably
    # (False, not an empty list); confirm against the DB layer.
    if False == vlist:
        print("所有剧集拼音全部处理完毕")
        exit()  # NOTE(review): exit() inside a method kills the whole process
    for data in vlist:
        newData = {}
        # Full pinyin, no separators.
        newData['title_py'] = slug(data['title'], errors='ignore', separator='')
        # First letter of each syllable only.
        newData['title_sp'] = slug(data['title'], style=Style.FIRST_LETTER, errors='ignore', separator='')
        # Display pinyin: flatten pinyin()'s list-of-lists into one list.
        newData['title_pyshow'] = reduce(lambda x, y: x + y, pinyin(data['title']))
        # if data['summary']:
        #     newData['summary_pyshow'] = reduce(lambda x,y: x + y, pinyin(data['summary']))
        #newData['tags'] = list(set(list(filter(lambda v: len(v) > 1, list(seg.cut(data['title'])))) + list(filter(lambda v: len(v) > 1, list(seg.cut(data['summary']))))))
        # print(newData['tags'])
        self._db.saveSetPy(newData, data['_id'])  # persist the pinyin data
        print("{} 剧集拼音处理完毕".format(data['_id']))
        del newData
    del vlist, data
    if runType == 'loop':
        # Fixed: the recursive call dropped runType (defaulting to
        # 'once'), so loop mode stopped after the second batch.
        self.set2py(runType)
def to_pinyin(hans, initials=False):
    '''utils.to_pinyin(hans, initials=False)

    Convert *hans* to a separator-free pinyin string; when *initials*
    is true, keep only the first letter of each syllable.
    '''
    chosen_style = Style.FIRST_LETTER if initials else Style.NORMAL
    return slug(hans=hans, style=chosen_style, separator='', errors='ignore')
def tid(self):
    """Return the item's id: the stored _tid when present, otherwise a
    slug built from the pinyin of title and author."""
    if self._tid is None:
        title_py = pypinyin.slug(self.title, errors='ignore', separator='_')
        author_py = pypinyin.slug(self.author, errors='ignore', separator='_')
        return '{} {}'.format(title_py, author_py)
    return str(self._tid)
def tq_init(xcods, xinxs=['000001']):
    """Initialise and return a zsys.TQ_bar trading context.

    Loads the stock and index code/name tables plus per-code CSV price
    data into the zsys module-level caches, attaches a first-letter
    pinyin column ('enam') to each matched name row, and selects the
    first code/index as the current working ones.

    NOTE(review): the mutable default xinxs=['000001'] is shared across
    calls; harmless while only read, but fragile if ever mutated.
    """
    qx = zsys.TQ_bar()
    qx.CodPool = xcods
    #qx.codID,qx.codFN=xcod,zsys.rdatCN+xcod+'.csv'
    #
    #
    print('tq_init code...')
    #f_stkCodNamTbl='stk_code.csv'
    fss = zsys.rdatInx + zsys.f_stkCodNamTbl  #;print('f,',fss)
    # GBK-encoded table; 'code' kept as str to preserve leading zeros.
    zsys.stkCodNamTbl = pd.read_csv(fss, dtype={'code': str}, encoding='GBK')
    #
    for xcod in xcods:
        print('xcod:', xcod)
        xd = zsys.stkCodNamTbl[zsys.stkCodNamTbl['code'] == xcod]
        css = xd['name']
        # First-letter pinyin of the (Chinese) stock name.
        ess = pypinyin.slug(css, style=pypinyin.FIRST_LETTER, separator='')
        # NOTE(review): assigns into a DataFrame slice (SettingWithCopy
        # warning territory); confirm the write is intended to be local.
        xd['enam'] = ess
        zsys.stkLibCodX[xcod] = xd
        #
        fcod = zsys.rdatCN + xcod + '.csv'
        df = pd.read_csv(fcod, index_col=0)
        zsys.stkLib[xcod] = df.sort_index()
    #
    # First code becomes the current working code.
    xcod = xcods[0]
    qx.wrkCod = xcod
    qx.wrkCodDat = zsys.stkLib[xcod]
    qx.wrkCodInfo = zsys.stkLibCodX[xcod]
    #
    print('tq_init inx...')
    #f_stkInxNamTbl='inx_code.csv'
    fss = zsys.rdatInx + zsys.f_stkInxNamTbl  #;print('f,',fss)
    zsys.stkInxNamTbl = pd.read_csv(fss, dtype={'code': str}, encoding='GBK')
    for xinx in xinxs:
        print('xinx:', xinx)
        xd = zsys.stkInxNamTbl[zsys.stkInxNamTbl['code'] == xinx]
        css = xd['name']
        ess = pypinyin.slug(css, style=pypinyin.FIRST_LETTER, separator='')
        xd['enam'] = ess
        zsys.stkInxLibCodX[xinx] = xd
        #
        fcod = zsys.rdatCNX + xinx + '.csv'
        df = pd.read_csv(fcod, index_col=0)
        zsys.stkInxLib[xinx] = df.sort_index()
    #
    # First index becomes the current working index.
    xinx = xinxs[0]
    qx.wrkInx = xinx
    qx.wrkInxDat = zsys.stkInxLib[xinx]
    qx.wrkInxInfo = zsys.stkInxLibCodX[xinx]
    #
    #
    #df=pd.read_csv(fdat)
    #df=df.sort_values('date')
    #
    return qx
def calculate_similarity(zone_name, fliggy_zones):
    """Return the fliggy zone whose pinyin is most similar to zone_name's.

    Similarity is Levenshtein.ratio over separator-free pinyin strings.
    Returns None when fliggy_zones is empty.
    """
    # Hoisted: zone_name's pinyin is loop-invariant (the original
    # recomputed it once per candidate).
    zone_name_py = pypinyin.slug(unicode(zone_name), separator='')
    curr_similarity = -1
    curr_fliggy_zone = None
    for fliggy_zone in fliggy_zones:
        fliggy_zone_py = pypinyin.slug(unicode(fliggy_zone), separator='')
        # Compute the ratio once per candidate instead of twice.
        ratio = Levenshtein.ratio(zone_name_py, fliggy_zone_py)
        if ratio > curr_similarity:
            curr_similarity = ratio
            curr_fliggy_zone = fliggy_zone
    return curr_fliggy_zone
def test_simple_seg():
    """slug() transliterates only the Chinese part of mixed-script input."""
    cases = {
        '北京abcc': 'be3i ji1ng abcc',
        '你好にほんごРусский язык': 'ni3 ha3o にほんごРусский язык',
    }
    for hans, expected in cases.items():
        assert slug([hans], style=TONE2, separator=' ') == expected
    # With errors=... returning None, non-Chinese runs are dropped.
    source = '你好にほんごРусский язык'
    assert slug(source, style=TONE2, separator=' ', errors=lambda x: None) == 'ni3 ha3o'
def showlist(self):  # display search results in the list widget
    """Filter the URL list by the search keyword.

    An item is shown when the keyword case-insensitively matches the
    item itself, its full pinyin, or its pinyin initials.  An empty
    keyword restores the complete list.
    """
    keywd = self.lineEdit.text().strip()
    if keywd:
        self.listWidget.clear()  # clear previous results
        # Hoisted: lower-case the keyword once instead of 3x per item.
        kw = keywd.lower()
        for item in Data.urllist:
            low = item.lower()
            # Match raw text, full pinyin, or first-letter pinyin.
            if (kw in low
                    or kw in pypinyin.slug(low, separator='')
                    or kw in pypinyin.slug(low, style=Style.FIRST_LETTER, separator='')):
                self.listWidget.addItem(item)  # add matching item
    else:
        self.listWidget.clear()
        for item in Data.urllist:
            self.listWidget.addItem(item)  # empty keyword: show everything
def post(self, request):
    """
    DingTalk OAuth login.

    Exchanges the temporary auth code for DingTalk user info, finds or
    creates the matching local User/UserProfile, and returns API token
    data.  New usernames are the pinyin of the DingTalk nickname, with
    a random numeric suffix on collision.

    :param request: DRF request whose JSON body carries 'code'
    :return: JsonResponse with token data, or a failure response
    """
    data = JSONParser().parse(request)
    # DingTalk signatures require a millisecond timestamp.
    timestamp = str(int(time.time() * 1000))
    response = requests.post(
        url=
        'https://oapi.dingtalk.com/sns/getuserinfo_bycode?signature={}&timestamp={}&accessKey=dingoapfjxo0dzezwe47sy'
        .format(parse.quote(signature(timestamp)), timestamp),
        json={"tmp_auth_code": data['code']})
    try:
        response = response.json()
        if response["errcode"] == 0:
            try:
                # Existing account: look up by DingTalk unionid.
                user = UserProfile.objects.get(
                    unionid=response['user_info']['unionid'])
                user = User.objects.get(id=user.user_id)
            except Exception as e:
                # First login: create the user inside one transaction.
                # NOTE(review): every new account gets the fixed
                # password 'admin' — confirm this is intentional.
                password = make_password('admin')
                with transaction.atomic():
                    try:
                        user = User.objects.create(
                            username=pypinyin.slug(
                                response['user_info']['nick'],
                                separator=''),
                            password=password,
                            first_name=response['user_info']['nick'])
                    except Exception as e:
                        # Username collision: retry with a random suffix.
                        user = User.objects.create(
                            username=pypinyin.slug(
                                response['user_info']['nick'],
                                separator='') + str(random.randint(0, 9999)),
                            password=password,
                            first_name=response['user_info']['nick'])
                    UserProfile.objects.create(
                        user=user,
                        openId=response['user_info']['openid'],
                        unionid=response['user_info']['unionid'])
            data = TokenSerializer(Token.objects.get(user=user)).data
            data["userphoto"] = '/file/userphoto.jpg'
            return JsonResponse(data=data, code="999999", msg="成功")
        else:
            return JsonResponse(code="999998", msg='登录失败!')
    except:
        return JsonResponse(code="999998", msg='登录失败!')
def showlist(self, event):
    """Filter the listbox by the search keyword (raw text, full pinyin,
    or pinyin initials, all case-insensitive); an empty keyword
    restores the complete list.
    """
    keywd = self.keywdbox.get().strip()
    if keywd:
        self.listbox.delete(0, END)
        # print(urllist)
        # Hoisted: lower-case the keyword once instead of 3x per item.
        kw = keywd.lower()
        for item in self.urllist:
            low = item.lower()
            cond_1 = kw in low
            cond_2 = kw in pypinyin.slug(low, separator='')
            cond_3 = kw in pypinyin.slug(low, style=Style.FIRST_LETTER, separator='')
            if any([cond_1, cond_2, cond_3]):
                self.listbox.insert(END, item)  # add matching item
    else:
        self.listbox.delete(0, END)
        for item in self.urllist:
            self.listbox.insert(END, item)  # empty keyword: show everything
def str_to_pinyin(chi_characters):
    '''
    Return (initials, full) pinyin for a Chinese string (Python 2).

    e.g. a two-character name yields something like ('zs', 'zhangsan').
    Byte strings are decoded as UTF-8 first.
    '''
    if not isinstance(chi_characters, unicode):
        chi_characters = chi_characters.decode('utf-8')
    # Initials, e.g. 'zs'.
    initials = pypinyin.slug(chi_characters, separator='', style=pypinyin.FIRST_LETTER)
    # Full pinyin, e.g. 'zhangsan'.
    full_spelling = pypinyin.slug(chi_characters, separator='')
    return initials, full_spelling
def update_user_photo_indexer(user_id, image):
    """Add *image* to the user's tag->photos index and persist it.

    The index maps the pinyin of each tag to the list of image names
    carrying that tag.  It is looked up in memcache first, loaded from
    the on-disk pickle on a miss, and after updating is written back to
    both places.

    Returns the updated index dict, or None when the pickle on disk
    deserialised to None.
    """
    filename = get_user_path(user_id) + "/" + "indexer.dat"
    indexer = mc.get(user_id)  # memcache hit avoids the disk read
    if not indexer:
        if not os.path.exists(filename):
            indexer = {}  # first photo for this user
        else:
            with open(filename, 'rb') as fp:
                indexer = pickle.load(fp)
    if indexer is None:
        return
    tags = image['tags']
    image_name = image['image_name']
    for t in tags:
        # Index by pinyin so Chinese tags are searchable in latin input.
        pt = pypinyin.slug(t)
        photo_list = indexer.get(pt, [])
        photo_list.append(image_name)
        indexer[pt] = photo_list
    with open(filename, 'wb') as fp:
        pickle.dump(indexer, fp)
    mc.set(user_id, indexer)
    return indexer
def correct_txt_with_info(txt, dic):
    """
    Correct domain terms in *txt* and report the vocabulary found.

    Scans the text position by position, converts the remaining suffix
    to pinyin, and looks up the longest vocabulary entry whose pinyin
    key is a prefix of it; the matched span of the original text is
    then replaced by the standard term.

    :param txt: text to correct
    :param dic: dict form of the standard vocabulary (pinyin keyed)
    :return: {'text': corrected text, 'num': term count, 'word': terms}
    """
    txt_num = len(txt)  # length of the text
    word_num = 0  # number of domain terms found, starts at 0
    word_list = []  # domain terms found so far
    value = 0  # current scan position (slice start)
    txt1 = txt
    # Trie over the pinyin vocabulary for longest-prefix matching.
    vocabulary_trie = pytrie.SortedStringTrie(dic)
    while value < txt_num:
        tem_txt = txt[value:txt_num]  # remaining suffix
        tem_py = pypinyin.slug(tem_txt)  # its pinyin
        # Match the suffix (as pinyin) against the vocabulary trie.
        result_match = vocabulary_trie.longest_prefix_value(tem_py, default='false')
        if result_match == 'false':
            value = value + 1
            continue
        else:
            # NOTE(review): len(result_match) is a length in the matched
            # *term*, yet it slices the raw text — this assumes the term
            # and the matched text have equal length; confirm against
            # the dictionary format.
            need_change = tem_txt[0:len(result_match)]  # span to correct
            # NOTE(review): str.replace substitutes every occurrence,
            # not only the one at the current position.
            txt1 = txt1.replace(need_change, result_match)
            word_num = word_num + 1  # one more domain term
            word_list.append(result_match)  # record it
            value = value + len(need_change)
    result = {'text': txt1, 'num': word_num, 'word': word_list}
    return result
def create(self, validated_data):
    """Create a product with its pictures and initial storage.

    Pops 'storage' and 'pictures' out of the serializer data, derives a
    separator-free pinyin of the name (stored as name_acronym), and
    runs product creation, picture creation and the storage update
    inside one transaction with an explicit savepoint; any failure
    rolls back to the savepoint and re-raises.
    """
    user = self.context["self"].current_user
    storage = validated_data.pop("storage")
    product_pictures = validated_data.pop("pictures")
    # Separator-free pinyin of the product name.
    validated_data["name_acronym"] = slug(validated_data["name"], separator="")
    with transaction.atomic():
        # Create a savepoint
        save_id = transaction.savepoint()
        try:
            # Create the product
            product = create_product(validated_data, user.id)
            # Create the product carousel pictures
            create_product_pictures(product.id, product_pictures)
            # Update storage and generate a storage-change record
            update_product_storage_and_create_record(
                product,
                user.id,
                storage,
                ProductStorageRecordType.MANUAL_MODIFY,
                ProductStorageRecordOperatorType.STAFF,
            )
        except Exception as e:
            print(e)  # NOTE(review): prefer logging over print here
            # Roll back to the savepoint
            transaction.savepoint_rollback(save_id)
            raise
        # Commit the savepoint
        transaction.savepoint_commit(save_id)
    return product
def generate_user(self, university=None, gender="女"):
    """Generate and register one fake user (Python 2).

    Picks a random family + given name and an avatar by *gender*,
    builds a mail address from the name's pinyin plus a random number,
    chooses a random 211 university unless *university* is given,
    registers the user synchronously and returns a profile dict.

    NOTE(review): `psw` is not defined in this method — presumably a
    module-level constant; confirm.
    """
    if gender == "女":
        name = self.family_names[int(
            random.uniform(0, len(
                self.family_names)))] + self.girl_names[int(
                    random.uniform(0, len(self.girl_names)))]
        pic = self.girl_imgs[int(random.uniform(0, len(self.girl_imgs)))]
    else:
        name = self.family_names[int(
            random.uniform(0, len(
                self.family_names)))] + self.boy_names[int(
                    random.uniform(0, len(self.boy_names)))]
        pic = self.boy_imgs[int(random.uniform(0, len(self.boy_imgs)))]
    # Mail: full pinyin of the name + random 10-digit number + random domain.
    pinyin = pypinyin.slug(name.decode('utf-8'), separator='')
    mail = '%s%d@%s.com' % (pinyin, random.uniform(
        1000000000, 9999999999), self.mail_postfix[int(
            random.uniform(0, len(self.mail_postfix)))])
    if not university:
        school = university_211[int(random.uniform(0, len(university_211)))]
    else:
        school = university
    # print(mail, psw, pic, name, school, '', name, gender)
    # uid = 1234
    uid = self.um.register_user_sync(mail, psw, pic, name, school, '',
                                     name, gender)
    return {
        'user': mail,
        'icon': pic,
        'real_name': name,
        'university': school,
        'uid': uid
    }
def data_process(input_df):
    '''
    Group scholar rows by person_id and romanise each Chinese name.

    :param input_df: dataframe,columns=['person_id', 'name', 'rankaff_name', 'rankaff_id', 'ins_en', 'aff_id']
    :return: [{'person_id':1234564, 'name':'liu bo', 'ins':['fudan university', 'xx university', 'xxx university'], 'ins_id':[111, 222, 333], 'name_zh':'刘博'}, {...}]
    '''
    records = []
    for person_id, group in input_df.groupby('person_id'):
        name_zh = group.iloc[0]['name']
        # 'v' is pypinyin's ASCII placeholder for the vowel ü.
        syllables = slug(name_zh, separator='-').replace('v', 'ü').split('-')
        if name_zh[:2] in compound_surname:
            # Two-character (compound) surname.
            romanised = ''.join(syllables[:2]).capitalize() + ' ' + ''.join(syllables[2:]).capitalize()
        elif name_zh[:1] in polyphony_surname.keys():
            # Surname with a non-default reading, taken from the lookup table.
            romanised = polyphony_surname[name_zh[:1]] + ' ' + ''.join(syllables[1:]).capitalize()
        else:
            # Regular single-character surname.
            romanised = ''.join(syllables[:1]).capitalize() + ' ' + ''.join(syllables[1:]).capitalize()
        records.append({
            'person_id': person_id,
            'ins': list(group['ins_en']),
            'ins_id': list(group['aff_id']),
            'name_zh': name_zh,
            'name': romanised,
        })
    return records
def update_user_photo_indexer(user_id, image):
    """Add *image* to the user's tag->photos index and persist it.

    The index maps the pinyin of each tag to the list of image names
    carrying that tag.  Looked up in memcache first, loaded from the
    on-disk pickle on a miss, and written back to both after updating.

    Returns the updated index dict, or None when the pickle on disk
    deserialised to None.
    """
    filename = get_user_path(user_id) + "/" + "indexer.dat"
    indexer = mc.get(user_id)  # memcache hit avoids the disk read
    if not indexer:
        if not os.path.exists(filename):
            indexer = {}  # first photo for this user
        else:
            with open(filename, "rb") as fp:
                indexer = pickle.load(fp)
    if indexer is None:
        return
    tags = image["tags"]
    image_name = image["image_name"]
    for t in tags:
        # Index by pinyin so Chinese tags are searchable in latin input.
        pt = pypinyin.slug(t)
        photo_list = indexer.get(pt, [])
        photo_list.append(image_name)
        indexer[pt] = photo_list
    with open(filename, "wb") as fp:
        pickle.dump(indexer, fp)
    mc.set(user_id, indexer)
    return indexer
def generate_user(self, university=None, gender="女"):
    """Generate and register one fake user (Python 2).

    Random family + given name and avatar chosen by *gender*; mail
    address built from the name's pinyin plus a random number; random
    211 university unless *university* is given.  Registers the user
    synchronously and returns a profile dict.

    NOTE(review): `psw` is not defined in this method — presumably a
    module-level constant; confirm.
    """
    if gender == "女":
        name = self.family_names[int(random.uniform(0, len(self.family_names)))] + self.girl_names[
            int(random.uniform(0, len(self.girl_names)))]
        pic = self.girl_imgs[int(random.uniform(0, len(self.girl_imgs)))]
    else:
        name = self.family_names[int(random.uniform(0, len(self.family_names)))] + self.boy_names[
            int(random.uniform(0, len(self.boy_names)))]
        pic = self.boy_imgs[int(random.uniform(0, len(self.boy_imgs)))]
    # Mail: full pinyin of the name + random 10-digit number + random domain.
    pinyin = pypinyin.slug(name.decode('utf-8'), separator='')
    mail = '%s%d@%s.com' % (pinyin, random.uniform(1000000000, 9999999999),
                            self.mail_postfix[int(random.uniform(0, len(self.mail_postfix)))])
    if not university:
        school = university_211[int(random.uniform(0, len(university_211)))]
    else:
        school = university
    # print(mail, psw, pic, name, school, '', name, gender)
    # uid = 1234
    uid = self.um.register_user_sync(mail, psw, pic, name, school, '', name, gender)
    return {
        'user': mail,
        'icon': pic,
        'real_name': name,
        'university': school,
        'uid': uid
    }
def str2pinyin(hans, nameset, style=pypinyin.FIRST_LETTER):
    """Convert *hans* to pinyin (initials by default), de-duplicated
    against *nameset*: while the result is already taken, successive
    integers (2, 3, ...) are appended until it is unique."""
    result = pypinyin.slug(hans, style=style, separator="")
    suffix = 2
    while result in nameset:
        result = result + str(suffix)
        suffix += 1
    return result
def get_classify_dir_path(category_name): pypinyin_slug = pypinyin.slug(category_name, separator='', style=Style.FIRST_LETTER) # print(pypinyin_slug) dir_path = rootDir + 'p**n' + os.sep if pypinyin_slug.endswith('JH') and 'zpdrycsq' in pypinyin_slug: # dir_path = '../jh/zpdr_ycsq_jh/' dir_path = dir_path + 'jh/zpdr_ycsq_jh/' elif (not pypinyin_slug.endswith('JH')) and 'zpdrycsq' in pypinyin_slug: # dir_path = '../all/zpdr_ycsq_all/' dir_path = dir_path + 'all/zpdr_ycsq_all/' elif pypinyin_slug.endswith('JH') and 'wawq' in pypinyin_slug: # dir_path = '../jh/wawq_jh/' dir_path = dir_path + 'jh/wawq_jh/' elif 'wawq' in pypinyin_slug: # dir_path = '../all/wawq_all/' dir_path = dir_path + 'all/wawq_all/' elif 'xqfx' in pypinyin_slug: # dir_path = '../all/xqfx/' dir_path = dir_path + 'all/xqfx/' elif pypinyin_slug.endswith('JH') and 'yczp' in pypinyin_slug: # dir_path = '../jh/yczp_jh/' dir_path = dir_path + 'jh/yczp_jh/' elif (not pypinyin_slug.endswith('JH')) and 'yczp' in pypinyin_slug: # dir_path = '../all/yczp_all/' dir_path = dir_path + 'all/yczp_all/' return dir_path
def convertpinyin(list):
    """Convert all table-head values from Chinese to pinyin.

    The parameter keeps its historical name `list` (it shadows the
    builtin); renaming it would break keyword callers.

    :param list: iterable of header strings
    :return: list of separator-free pinyin strings
    """
    # Comprehension replaces the manual append loop.
    return [pypinyin.slug(value, separator='') for value in list]
def get_correct_trans_str(match_str):
    """Return the i18n key ('k' + name) for *match_str*, creating and
    registering a new pinyin-derived key when it is not yet in the
    bundle.  Mutates the module-level bundle_data_dict,
    string_key_list_add and string_name_list registries.
    """
    # Check whether this string is already in the i18n bundle.
    trans_res_str = ''
    if match_str in bundle_data_dict.keys():
        trans_res_str = 'k' + str(bundle_data_dict.get(match_str)).strip('"')
    else:
        # New string: derive a key from its pinyin.
        trans_pinyin_str = slug(match_str, errors='ignore', separator='')
        # '3PH' + pinyin summary: the digit counts the %@ placeholders.
        ph_num = str(match_str).count('%@')
        if ph_num > 0:
            trans_pinyin_str = str(ph_num) + 'PH' + trans_pinyin_str
        # Keep the whole key (head + pinyin) within 50 characters.
        if len(trans_pinyin_str) > 50 - len(string_name_head):
            trans_pinyin_str = trans_pinyin_str[0:50 - len(string_name_head)]
        trans_res_str = '"' + string_name_head + trans_pinyin_str + '"'
        # On key collision, append an increasing numeric suffix.
        index = 0
        while (trans_res_str in bundle_data_dict.keys()):
            index += 1
            trans_res_str = '"' + string_name_head + trans_pinyin_str + str(
                index) + '"'
        # Register the new key in both directions plus the add-list.
        bundle_data_dict[trans_res_str] = match_str
        bundle_data_dict[match_str] = trans_res_str
        string_key_list_add[trans_res_str] = match_str
        trans_res_str = 'k' + trans_res_str.strip('"')
        string_name_list.add(match_str)
    return trans_res_str
def create_user_by_employee(self, employee_id, password, active=True):
    """
    Create an Odoo user from an employee (DingTalk integration).

    Requires pypinyin: pip install pypinyin

    The login is full-pinyin(name) + last digits of the mobile number
    at the mail.catchall.domain (falling back to dingtalk.com).  An
    existing user with that login is simply linked to the employee;
    otherwise a new user is created, a DingTalk work message with the
    initial credentials is sent, and the employee is linked to it.
    """
    employee = self.env['hr.employee'].sudo().search([('id', '=',
                                                       employee_id)])
    if employee:
        # Login format: full pinyin of the name + tail of the phone number.
        email_name1 = pypinyin.slug(employee.name, separator='')  # full pinyin
        # email_name1 = pypinyin.slug(employee.name, style=Style.FIRST_LETTER, separator='')  # initials only
        # Characters from index 7 on — the last 4 digits of an
        # 11-digit CN mobile number (presumably; confirm format).
        email_name2 = employee.mobile_phone[7:]
        email_name = email_name1 + email_name2
        # TODO: could become a setting so admins choose another domain.
        url = self.env['ir.config_parameter'].sudo().get_param(
            'mail.catchall.domain')
        if url:
            email_host = url
        else:
            email_host = 'dingtalk.com'
        email_count = len(
            self.search([('login', 'like', email_name)]).sudo())
        if email_count > 0:
            # A user with this login exists already: just link it.
            user = self.env['res.users'].sudo().search([
                ('login', '=', email_name + '@' + email_host)
            ])
            values = {'user_id': user.id}
            employee.sudo().write(values)
        else:
            email = email_name + '@' + email_host
            # Make the display name unique.
            name = employee.name
            name_count = len(self.search([('name', 'like', name)]).sudo())
            if name_count > 0:
                name = name + str(name_count + 1)
            # Create the Odoo user.
            values = {
                'active': active,
                "login": email,
                "password": password,
                "name": name,
                'email': employee.work_email,
                'groups_id': self.env.ref('base.group_user')
            }
            user = self.sudo().create(values)
            # After first creation, notify the employee via DingTalk.
            msg = {
                'msgtype': 'text',
                'text': {
                    "content":
                    "尊敬的{},欢迎加入odoo,您的登陆名为{},初始登陆密码为{},请登陆后及时修改密码!".format(
                        name, email, password),
                }
            }
            self.env['dindin.work.message'].sudo().send_work_message(
                userstr=employee.din_id, msg=msg)
            # Link the employee to the freshly created user.
            values = {'user_id': user.id}
            employee.sudo().write(values)
def trans(value):
    """Transliterate *value* to non-strict NORMAL-style pinyin.

    Side effects: records the value->pinyin mapping in the module-level
    test_dcit dict and appends the result to test_list.
    """
    result = pypinyin.slug(value,
                           style=Style.NORMAL,
                           strict=False,
                           separator='')
    # NOTE(review): the global name looks like a typo for test_dict;
    # it is defined elsewhere, so it cannot be renamed here alone.
    test_dcit[value] = result
    test_list.append(result)
    return result
def __jieba_html(self,html):
    """jieba-segment *html* and return the pinyin of every token whose
    part of speech is noun-like (ns/n/nt/nz).
    POS tag reference: https://www.cnblogs.com/adienhsuan/p/5674033.html
    """
    noun_like = ['ns', 'n', 'nt', 'nz']
    return [slug(token, separator='')
            for token, flag in pseg.cut(html)
            if flag in noun_like]
def get_zone_list_from_fliggy_city_pinyin(city_name, file_name):
    """Collect zone names (field 5) of TSV lines whose city (field 3)
    has pinyin equal to *city_name*.

    :param city_name: separator-free pinyin of the city to look up
    :param file_name: tab-separated file to scan
    :return: list of zone-name fields for the matching city
    """
    zone_list = []
    # Fixed: the original opened the file twice — once, never closed,
    # just to count lines — and then re-read every line via linecache.
    # A single pass over an explicitly closed handle does the same work.
    with open(file_name, "rU") as f:
        for line in f:
            field_list = line.split('\t')
            if city_name == pypinyin.slug(unicode(field_list[2]), separator=''):
                zone_list.append(field_list[4])
    return zone_list
def str2pinyin(hans, style=pypinyin.FIRST_LETTER):
    """Convert a string to pinyin (first letters by default),
    appending 2, 3, ... until the result is absent from the
    module-level _pinyin_names set.
    """
    candidate = pypinyin.slug(hans, style=style, separator="")
    counter = 2
    while candidate in _pinyin_names:
        candidate = candidate + str(counter)
        counter += 1
    return candidate
def addition_pinyin(data, num):
    """Pair the first *num* entries of *data* with their pinyin.

    data is the list produced by str_of_result; num is its length.
    Returns a list of [word, pinyin] pairs.
    """
    pairs = []
    for idx in range(num):
        word = data[idx]
        pairs.append([word, pypinyin.slug(word)])
    return pairs
def h2p(hanzi):
    """
    :param hanzi: Chinese text
    :return: its full pinyin, no separators
    """
    # For initials only, use style=Style.FIRST_LETTER instead.
    return pypinyin.slug(hanzi, separator='')
def data_process2(input_df):
    '''
    Like data_process, but expands heteronymic given names into every
    possible romanisation.

    :param input_df: dataframe,columns=['person_id', 'name', 'rankaff_name', 'rankaff_id', 'ins_en', 'aff_id']
    :return: [{'person_id':1234564, 'name':['Huang Ka', 'Huang Qia'], 'ins':['fudan university', 'xx university', 'xxx university'], 'ins_id':[111, 222, 333], 'name_zh':'黄卡'}, {...}]
    '''
    input_data = []
    for value, sub_df in input_df.groupby('person_id'):
        row_dict = {}
        row_dict['person_id'] = value
        # NOTE(review): checks for column 'ins' but reads 'ins_en' /
        # 'aff_id' — confirm the guard tests the intended column.
        if 'ins' in sub_df.columns:
            row_dict['ins'] = list(sub_df['ins_en'])
            row_dict['ins_id'] = list(sub_df['aff_id'])
        else:
            row_dict['ins'] = ['']
            row_dict['ins_id'] = ['']
        name_zh = sub_df.iloc[0]['name']
        row_dict['name_zh'] = name_zh.strip()
        # Two-character (compound) surname
        if name_zh[:2] in compound_surname:
            k = 2
            first_name = slug(name_zh[:2], separator='')
        # Surname with a non-default reading (polyphonic character)
        elif name_zh[:1] in polyphony_surname.keys():
            k = 1
            first_name = polyphony_surname[name_zh[:1]]
        # Regular single-character surname
        else:
            k = 1
            first_name = slug(name_zh[:1], separator='')
        # All candidate readings for each given-name character.
        last_name = pinyin(name_zh[k:], heteronym=True, style=Style.NORMAL)
        name_list = []
        # Cartesian product over the per-character readings.
        for x in product(*last_name):
            name_list.append(first_name.capitalize() + ' ' + ''.join(x).capitalize())
        # 'v' is pypinyin's ASCII placeholder for ü.
        row_dict['name'] = [i.replace('v', 'ü') for i in name_list]
        input_data.append(row_dict)
    return input_data
def get_object_keywords(key_words):
    """Return the pinyin of every 'word_POS' token whose POS tag is
    listed in Config.config['object_pos']; tokens without a POS part
    are skipped."""
    split_tokens = (token.split("_") for token in key_words)
    return [pypinyin.slug(parts[0])
            for parts in split_tokens
            if parts is not None and len(parts) >= 2
            and parts[1] in Config.config["object_pos"]]
def get_meaningful_keywords(key_words):
    """Return the pinyin of every 'word_POS' token whose POS tag is in
    Config.config['meaningful_pos']; malformed tokens are skipped."""
    results = []
    for token in key_words:
        pieces = token.split('_')
        if pieces is None or len(pieces) < 2:
            continue
        if pieces[1] in Config.config['meaningful_pos']:
            results.append(pypinyin.slug(pieces[0]))
    return results
def get_face_final(face):
    ##face: ['face_id,face_name','face_id,face_name',...]
    """Attach the pinyin of each face name as the third field.

    Fixed: for the documented two-field input, the original assigned
    pair[2] on a 2-element list, which raises IndexError; the pinyin is
    now appended (or overwritten when a third field already exists).
    Entries without a name field are skipped instead of crashing.
    """
    face_final = []
    for k in face:
        pair = k.split(',')
        if pair is None or len(pair) < 2:
            continue  # no face_name to transliterate
        if len(pair) > 2:
            pair[2] = pypinyin.slug(pair[1])
        else:
            pair.append(pypinyin.slug(pair[1]))
        face_final.append(pair)
    return face_final
def test_update():
    """Phrase-dictionary entries for 便宜 take effect in slug() with TONE2."""
    expected = {
        '便宜': 'pia2n yi2',
        '便宜从事': 'bia4n yi2 co2ng shi4',
        '便宜施行': 'bia4n yi2 shi1 xi2ng',
        '便宜货': 'pia2n yi2 huo4',
        '贪便宜': 'ta1n pia2n yi2',
        '讨便宜': 'ta3o pia2n yi2',
        '小便宜': 'xia3o pia2n yi2',
        '占便宜': 'zha4n pia2n yi2',
    }
    for hans, pinyin_text in expected.items():
        assert slug([hans], style=TONE2, separator=' ') == pinyin_text
def get_human_names(raw):
    """Extract the pinyin of person-name words from a POS-tagged string.

    :param raw: space-separated 'word_POS' tokens
    :return: list of pinyin strings for tokens whose POS tag is in
        Config.config['human_name_pos'].  Fixed: always returns a list
        — the original's bare `return` yielded None on empty input,
        surprising callers that iterate the result.
    """
    keys = []
    key_words = raw.split(" ")
    if key_words is None or len(key_words) == 0:
        return keys  # fixed: empty list instead of None
    for k in key_words:
        pair = k.split("_")
        if pair is None or len(pair) < 2:
            continue  # token has no POS part
        if pair[1] in Config.config["human_name_pos"]:
            keys.append(pypinyin.slug(pair[0]))
    return keys
def translate_tags(tags):
    """Translate CV object words into category tags.

    Each word is converted to pinyin and looked up in the pinyin->tags
    table, which is cached in memcache under 'cv_tags' and lazily
    loaded from file on a miss.  Returns the concatenated tag lists
    (unknown words contribute nothing).
    """
    Logger.debug("translate_tags in")
    cv_tags = mc.get("cv_tags")
    if not cv_tags:
        Logger.debug("translate_tags load from file")
        cv_tags = load_cv_tags()
        mc.set("cv_tags", cv_tags)
    Logger.debug("translate_tags 3")
    ret = []
    # The table is keyed by pinyin, so transliterate first.
    pytags = [pypinyin.slug(w) for w in tags]
    for tag in pytags:
        cand = cv_tags.get(tag, [])
        ret.extend(cand)
    return ret
def test_update():
    """Phrase-dict entries for 便宜 plus single CJK characters from the
    ext-A / basic / compatibility blocks round-trip through slug()."""
    expected = {
        '便宜': 'pia2n yi2',
        '便宜从事': 'bia4n yi2 co2ng shi4',
        '便宜施行': 'bia4n yi2 shi1 xi2ng',
        '便宜货': 'pia2n yi2 huo4',
        '贪便宜': 'ta1n pia2n yi2',
        '讨便宜': 'ta3o pia2n yi2',
        '小便宜': 'xia3o pia2n yi2',
        '占便宜': 'zha4n pia2n yi2',
        '\u3400': 'qiu1',  # CJK extension A: [3400-4DBF]
        '\u4E00': 'yi1',  # CJK basic: [4E00-9FFF]
        '\uFA29': 'da3o',  # CJK compatibility: [F900-FAFF]
    }
    for hans, pinyin_text in expected.items():
        assert slug([hans], style=TONE2, separator=' ') == pinyin_text
def load_cv_tags():
    """Load the CV tag translation table from category.txt.

    Each line is 'tag:word1-word2-...'; the result maps the pinyin of
    every word to the list of tags it belongs to.  A missing file
    yields {}.
    """
    cv_tags = {}
    path = os.path.dirname(os.path.realpath(__file__)) + "/category.txt"
    if not os.path.exists(path):
        return {}
    # Fixed: the original leaked the file handle; 'with' closes it.
    with open(path, encoding="utf-8") as file:
        for line in file:
            items = line.strip().split(":")
            if len(items) < 2:
                continue  # robustness: skip malformed lines without ':'
            tag = items[0]
            words = [pypinyin.slug(w) for w in items[1].split("-")]
            for word in words:
                # setdefault replaces the manual key-presence check.
                cv_tags.setdefault(word, []).append(tag)
    return cv_tags
def test_slug():
    """Default slug of 中心 is 'zhong-xin', with or without heteronym."""
    text = '中心'
    for heteronym in (False, True):
        assert slug(text, heteronym=heteronym) == 'zhong-xin'
def get_tag_from_rawlocation(key_location):
    """Return the pinyin of every location tag past the first two
    fields (which hold latitude and longitude)."""
    return [pypinyin.slug(tag) for tag in key_location[2:]]
def getPinyin(word):
    """Return TONE2-style pinyin of *word*, space-separated.
    Heteronyms are not handled (per the original author's note).
    """
    pinyin = pypinyin.slug(word, style=pypinyin.TONE2)
    # slug() joins syllables with '-'; turn those into spaces, then
    # collapse doubled/padded runs the replacement can leave behind.
    for old, new in (("-", " "), ("  ", " "), ("   ", " ")):
        pinyin = pinyin.replace(old, new)
    # print pinyin
    return pinyin
def post(self):
    """Search photos by tag / person / time / location.

    Reads user_id, tag (POS-tagged words), optional loc ('lat,long')
    and token; validates the token, expands the tag words into
    meaningful keywords, similar-person face ids, CV category tags and
    a time filter, then returns matching images sorted by distance.
    Exactly one JSON result (with a 'status' flag) is always written.
    """
    result = {'status': False}
    Logger.debug('in search')
    try:
        user_id = self.get_argument('user_id', '')
        desc = self.get_argument('desc', '')
        # desc = ['我_r 想_v 找_v 在_p 天安门_ns 的_u 照片_n']
        rawTag = self.get_argument('tag', '')
        # rawTag = '我_r 想_v 找_v 今年_nt 在_p 兵马俑_ns 的_u 照片_n'
        # 我_r 想_v 找_v 去年_nt 夏天_nt 在_p 兵马俑_ns 农贸市场_n 的_u 照片_n
        rawLocation = self.get_argument('loc', '')
        token = self.get_argument('token', '')
        user = MongoHelper.get_user_by_id(user_id)
        # Fixed: the raw tag / location values were missing from the log line.
        Logger.info('user_id:' + user_id + ', raw tag:' + rawTag +
                    ', raw location:' + rawLocation + ', token:' + token)
        # Fixed: `is not` compared object identity, so equal token
        # strings from different objects were rejected; compare values.
        if token != user['token']:
            Logger.debug('token wrong')
            return  # the finally block writes the failure result once
        if user_id == '' or rawTag == '':
            Logger.debug('user id or tag null')
            return
        key_words = rawTag.split(' ')
        if key_words is None or len(key_words) == 0:
            Logger.debug('key words none')
            return
        meaningful = Utils.get_meaningful_keywords(key_words)
        if not meaningful:
            Logger.debug('meaningful key words none')
            return
        Logger.info('meaningful:' + str(meaningful))
        if rawLocation:
            key_location = rawLocation.split(',')
            latitude = float(key_location[0])
            longitude = float(key_location[1])
            Logger.info('Latitude: ' + str(latitude) + ', Longitude: ' + str(longitude))
        else:
            latitude = None
            longitude = None
            Logger.info('No location info in search')
        # Person names in the query -> ids of visually similar faces.
        face_name = Utils.get_human_names(rawTag)
        Logger.info('face name:' + str(face_name))
        face_id = list(MongoHelper.get_similar_persons(user_id, face_name))
        Logger.info('Similar person face id:' + str(face_id))
        if face_id:
            meaningful.extend(face_id)
            Logger.info('meaningful_add_face_id:' + str(meaningful))
        Logger.debug("before cv: " + str(key_words))
        # Object words -> computer-vision category tags.
        object_name = Utils.get_object_keywords(key_words)
        Logger.debug("before cv: " + str(object_name))
        cv_tags = Utils.translate_tags(object_name)
        Logger.debug("after cv: " + str(cv_tags))
        if cv_tags:
            meaningful.extend(cv_tags)
            Logger.debug('meaningful_add_cv_tag:' + str(meaningful))
        # NLP time parsing: processed_time[0] appears to be a time
        # filter and [1] an extra tag word — confirm against
        # NLPTimeConvertor.time_api.
        processed_time = NLPTimeConvertor.time_api(rawTag, user_id)
        Logger.debug('time api return:' + str(processed_time))
        image = []
        if processed_time[0]:
            image = Utils.get_image_by_time(user_id, processed_time[0])
        if processed_time[1]:
            meaningful.append(pypinyin.slug(processed_time[1]))
        if meaningful:
            image = Utils.get_images_by_tag(user_id, meaningful, image)
        image = Utils.sort_by_location(user_id, latitude, longitude, image)
        Logger.info('returned image:' + str(image))
        result['status'] = True
        result['image'] = image
        Logger.debug('result: ' + str(result))
    finally:
        # Fixed: early-return branches previously also wrote the result
        # here AND in the branch, duplicating the JSON body; this is
        # now the single write point for success and failure alike.
        self.write(json.dumps(result))
def process():
    """Derive a unique pinyin variable name for every field and hand
    the (fields, pinyins) pairs to make(); raises when two fields
    collapse to the same pinyin name.  Python 2 only (list-returning
    map/filter, print statement, str.decode).
    """
    global nummap
    global SEP
    fields = get_field_list()
    # Pipeline, innermost first:
    #   1. slug(): field name -> pinyin, syllables joined by SEP
    #   2. a digit-leading name gets its first digit spelled out via nummap
    #   3. drop every character that is not alphanumeric or SEP
    #   4. collapse the doubled separators step 3 can leave behind
    pinyins = map(lambda x: x.replace('__', '_'),
                  map(lambda x: filter(lambda x: x if x.isalnum() or x == SEP else '', x),
                      map(lambda x: nummap[x[0]] + SEP + x[1:] if x[0].isdigit() else x,
                          map(lambda x: pypinyin.slug(x.decode('utf-8'), separator=SEP),
                              fields))))
    set_ = set(pinyins)
    if len(pinyins) != len(set_):
        # Count occurrences so every colliding name can be reported.
        dict_ = dict(zip(list(set_), [0] * len(set_)))
        for pinyin in pinyins:
            dict_[pinyin] += 1
        for key in dict_.keys():
            if dict_[key] > 1:
                print 'the same name: %s' % key
        raise Exception('variables having the same name')
    make(fields, pinyins)
# -*- coding: utf-8 -*- import mysql.connector; import csv; import codecs; from collections import OrderedDict; import re; import time; from pypinyin import pinyin,lazy_pinyin; import pypinyin cnx=mysql.connector.connect(user='******',password='******',host='localhost',database='wholeren',charset='utf8'); cursor=cnx.cursor(); updateClient=("UPDATE client set pinyin=%s where id=%s"); cursor.execute('select chineseName,id from client;'); aa=cursor.fetchall(); for row in aa: if row[0] is not None: p= pypinyin.slug(unicode(row[0]),style=pypinyin.NORMAL,separator=''); cursor.execute("""UPDATE client SET pinyin=%s WHERE id=%s""",(p,row[1])); print p #print cursor.fetchOne(); cnx.commit(); cursor.close(); cnx.close();
def test_slug():
    """slug() joins the pinyin of 中心 with the default '-' separator."""
    expected = 'zhong-xin'
    assert slug(u'中心') == expected