def writeCityName(): if not os.path.exists('cityName.csv'): url = "http://www.zxinc.org/gb2260.htm" print 'start reading ...' response = urllib.urlopen(url) page = response.read() page = page.decode('utf8') print 'reading done...' pattern = re.compile(ur'([\u4e00-\u9fa5]{2,5}市)') match = pattern.findall(page) if match: try: with open('cityName.csv', 'wb') as csvfile: csvWrite = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL) csvfile.write(codecs.BOM_UTF8) test = PinYin() test.load_word() for result in match: result = result.encode('utf8') py = test.hanzi2pinyin(string=result[:-3]) csvWrite.writerow([result[:-3], py[-1]]) print 'write done!' except Exception as e: print e finally: csvfile.close() else: print 'cityName.csv detected'
def __init__(self):
    """Create the pinyin converter and load the JSON lookup dictionary.

    'pinyin_dict' is expected to hold a single JSON line.
    """
    self.pp = PinYin()
    self.pp.load_word()
    with open('pinyin_dict', 'r') as ff:
        line = ff.readline()
        self.jj_dict = json.loads(line)
    # FIX: removed redundant ff.close() — the `with` statement already
    # closes the file.
def get_pinyin_data():
    """Lazily create and return the module-wide PinYin singleton."""
    global gPinYin
    if gPinYin is None:
        # first use: build the converter from the on-disk word data
        gPinYin = PinYin('pinyin_word.data')
        gPinYin.load_word()
    return gPinYin
def get_item(marc_no, status=0): dict = {} test = PinYin() test.load_word('word.data') hm = requests.get(ourl + 'item.php?marc_no=' + str(marc_no)).text.encode( encoder).decode('utf8').replace(' ', '') parser = HTMLParser.HTMLParser() s1 = parser.unescape(hm) static = re.findall('<div id="book_info">(.*?)<div class="clear"></div>', s1, re.S)[0] booklist = re.findall('<dl class="booklist">(.*?)</dl>', static, re.S) for each in booklist: pm = re.findall('<dt>(.*?)</dt>', each, re.S)[0] if pm == '': continue st = re.findall('<dd>(.*?)</dd>', each, re.S)[0] try: st1 = re.findall('>(.*?)</a>', st, re.S)[0] except: st1 = st pms = test.hanzi2pinyin_split(string=pm, split="", firstcode=True).replace('/', '') dict[pms] = st1 if status == 1: print pm, print st1 return dict
def __init__(self):
    """Set up the parser, the pinyin converter, and the JSON lookup dict.

    'pinyin_dict' (next to this module) holds a single JSON line.
    """
    self.pa = Parser()
    self.pp = PinYin()
    self.pp.load_word()
    with open(os.path.join(os.path.dirname(__file__), 'pinyin_dict'), 'r') as ff:
        line = ff.readline()
        self.jj_dict = json.loads(line)
    # FIX: removed redundant ff.close() — `with` already closes the file.
def _record_author(authors, key, zh):
    """Record one sighting of `key` in `authors`, keeping its Chinese form."""
    if key in authors:
        authors[key]['zh'] = zh
        authors[key]['count'] += 1
    else:
        authors[key] = {'zh': zh, 'count': 1}


def get_authors_by_venue(cached_list, cached_set, cdblp_venue, dblp_venue):
    """Match C-DBLP authors of a venue against the DBLP author list.

    Returns {english_full_name: {'zh': chinese_name, 'count': n}} for
    authors appearing in both datasets, or None when the C-DBLP venue
    is not cached.
    """
    d = DBLPQuery.get_cache('cdblp-pub-cache.data')
    if cdblp_venue.get('title') not in d:
        print('This C-DBLP venue is not on file.')
        return
    res = urlopen('http://www.dblp.org/search/api/?q=ce:venue:{}:*&h=750&format=json'.format(dblp_venue.get('title').lower()))
    # fix titles as { "Title ..." }
    fixed_json = re.compile('({\s*)(".+")(\s*})').sub(lambda m: m.group(2), res.read().decode('utf-8'))
    # get publications
    cdblp_pubs = d.get(cdblp_venue.get('title'))
    dblp_pubs = json.loads(fixed_json)
    cdblp_authors = set()
    dblp_authors = set()
    authors = dict()
    # cdblp cache layout: {year: {issue: [publication, ...]}}
    for year in cdblp_pubs:
        for issue in cdblp_pubs[year]:
            for pub in cdblp_pubs[year][issue]:
                for author in pub.get('authors'):
                    cdblp_authors.add(author)
    for pub in dblp_pubs.get('result').get('hits').get('hit'):
        try:
            for author in pub.get('info').get('authors').get('author'):
                dblp_authors.add(author)
        except AttributeError:
            print('PublicationException: %s' % pub.get('@id'))
    pinyin = PinYin()
    pinyin.load_word()
    for author in cdblp_authors:
        name_comp = CDBLPAuthor.get_english_name(author, pinyin)
        if name_comp['full_name'] in dblp_authors:
            _record_author(authors, name_comp['full_name'], name_comp['zh'])
        # BUG FIX: the original tested `authors.__contains__(full_name_dash)`
        # here, which made the add-new-entry branch below it unreachable and
        # never counted the *first* sighting of a dashed name.  The dashed
        # variant must be matched against the DBLP author set, parallel to
        # the branch above.
        elif len(author) == 3 and name_comp['full_name_dash'] in dblp_authors:
            _record_author(authors, name_comp['full_name_dash'], name_comp['zh'])
    return authors
def t2():
    """Ad-hoc manual test: split a mixed Latin/hanzi shop name and print
    every Cartesian-product pinyin rendering of its hanzi runs.

    Python 2 only (print statements, u'' literals).  Relies on the
    module-level helpers is_hz_py and Cartesian_product.
    """
    test = PinYin()
    test.load_word()
    #string = u"Kottlers古玩城"
    #string = u"Head 2 Toe发型店"
    #string = u"蓝"
    #print string
    #print test.hanzi2pinyin(string=string)
    #print Cartesian_product(test.hanzi2pinyin(string=string))
    # earlier assignments are dead — only the last `name` is used
    name = u"普季(商城)"
    name = u"Kottlers古玩城"
    name = u"hello 艾压(重庆店)山"
    name = u"库兰达(库兰达热带雨林)"
    #name = u"盛文甘hello店(店)"
    #name = u"义乌三期市场(原篁园市场)"
    print name
    p = re.compile(u'[\u4e00-\u9fa5]+')      # runs of hanzi
    p_eng = re.compile(u'[a-zA-Z]+')         # NOTE(review): unused below
    j = 0
    strs = []
    # insert a space between adjacent chars that are NOT a hanzi/pinyin pair
    while (j < len(name)):
        #for j in xrange(len(name)):
        #    if j
        if j + 1 == len(name):
            strs.append(name[j])
        else:
            print(name[j], name[j + 1]), is_hz_py(name[j], name[j + 1])
            if not is_hz_py(name[j], name[j + 1]):
                print name[j], j
                strs.append(name[j] + u" ")
            else:
                strs.append(name[j])
        j += 1
    name = "".join(strs)
    ch_names = p.findall(name)
    tmp = name           # NOTE(review): tmp/ll/mydict are never used
    ll = []
    mydict = {}
    cnames = "".join([ch_name for ch_name in ch_names])
    #pys = test.hanzi2pinyin(string=cnames)
    # all pinyin combinations for the concatenated hanzi runs
    pys = Cartesian_product(test.hanzi2pinyin(string=cnames))
    print cnames, pys, ch_names
    for p in pys:        # NOTE(review): rebinds `p`, clobbering the regex
        tmp2 = name
        for ch_name in ch_names:
            # locate this run inside the concatenated hanzi string and
            # substitute the matching slice of the pinyin candidate
            m = re.search(ch_name, cnames)
            _start = m.start()
            _end = m.end()
            replace = " ".join([k for k in p.split()[_start:_end]])
            print _start, _end, replace, tmp2
            tmp2 = re.sub(ch_name, replace, tmp2, 1)
        print tmp2
def getciyun(self):
    """Render word clouds for this topic: answerers, questions, locations.

    Output names are prefixed with the topic's dash-joined pinyin.
    """
    converter = PinYin()
    converter.load_word()
    prefix = str(converter.hanzi2pinyin_split(string=str(self.aa.topic), split="-"))
    # (source file, output suffix) pairs, rendered in this order
    jobs = [
        ('F:/zhihu/answer/people_qb.txt', 'people'),          # answerer info
        ('F:/zhihu/answer/question_top10.txt', 'question'),   # question info
        ('F:/zhihu/answer/p_location.txt', 'slocation'),      # locations
    ]
    for path, suffix in jobs:
        cloud.ciyun1(path, prefix + suffix)
def name_tran(str):
    """Convert a Chinese full name to English 'Given Family' order.

    The first character is the family name, the rest the given name;
    both parts are title-cased pinyin.

    NOTE: the parameter name shadows the builtin `str`; kept unchanged
    for interface compatibility with existing callers.
    """
    test = PinYin()
    test.load_word()
    # FIX: removed the dead no-op expression statement `str[0]`
    family = test.hanzi2pinyin(string=str[0])[0]
    print(str[1:])
    # FIX: join instead of a quadratic `+=` accumulation loop
    last = u''.join(test.hanzi2pinyin(string=str[1:]))
    name_en = last.title() + u' ' + family.title()
    return name_en
def __init__(self):
    """Load the pinyin engine and the Mandarin syllable tables."""
    self.pinYinRobot = PinYin()
    self.pinYinRobot.load_word()
    # Mandarin initials (shengmu)
    self.shengMu = ("b p m f d t n l g k h j q x "
                    "zh ch sh r z c s y w").split()
    # syllables read as a whole (zhengti yindu jieci)
    self.zhengTi = ("zhi chi shi ri zi ci si "
                    "yu ye yue yuan yin yun ying").split()
    print("pinYinRobot is loaded")
def _generate_name(self):
    """Build candidate strings from the real name, username and email.

    Returns a de-duplicated list (order not guaranteed); empty when the
    account has neither a name nor an email.
    """
    if not (self.name or self.email):
        return []
    # per-character pinyin of the real name feeds the name formats
    engine = PinYin(PINYIN)
    engine.load_word()
    candidates = []
    candidates.extend(self._format(map(engine.hanzi2pinyin, self.name),
                                   built_in.name_formats))
    candidates.extend(self._format(self.username, built_in.general_formats))
    candidates.extend(self._generate_email())
    return list(set(candidates))
def draw_frame(faces, img, gray, move):
    """Draw face boxes + recognized names on `img` and display the frame.

    faces: iterable of (x, y, w, h) detection boxes.
    gray:  grayscale frame fed to the recognizer model.
    move:  when 2, also steer the camera toward the faces.
    """
    global xdeg
    global ydeg
    global fps
    global time_t
    if move == 2:
        steering_control(faces, img)
    # Draw a rectangle around every face
    for (x, y, w, h) in faces:
        cv2.rectangle(img, (x, y), (x + w, y + h), (200, 255, 0), 2)
        # BUG FIX: NumPy images index as [row(y), col(x)].  The original
        # sliced gray[x:x+w, y:y+h], cropping the wrong (transposed)
        # region and breaking recognition away from the diagonal.
        roi = gray[y:y + h, x:x + w]
        try:
            roi = cv2.resize(roi, (200, 200), interpolation=cv2.INTER_LINEAR)
            params = model.predict(roi)
            if params[1] < 500.0:  # confidence threshold
                pyin = PinYin()
                pyin.load_word()
                pname = names[params[0]]
                change_identity(pname)
                # transliterate so cv2.putText (ASCII-only) can render it
                pname = pyin.hanzi2pinyin_split(string=pname, split='')
                s = ''.join(pname)
                sign = ("%s %.2f" % (s, params[1]))
                cv2.putText(img, sign, (x, y - 2), cv2.FONT_HERSHEY_SIMPLEX,
                            0.5, (0, 0, 255), 2)
        except Exception:  # FIX: was a bare except (also caught SystemExit)
            # best-effort: skip faces the recognizer cannot process
            continue
    # Calculate and show the FPS
    fps = fps + 1
    sfps = fps / (time.time() - t_start)
    cv2.putText(img, "FPS : " + str(int(sfps)), (10, 15),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)
    cv2.imshow("recognize-face", img)
def get_sample_users():
    """Scrape easyscholar author names, cache them as JSON, return the list."""
    piy = PinYin()
    piy.load_word()
    author_list = []
    res = urlopen('http://easyscholar.ruc.edu.cn/moreuser.html')
    dom = BeautifulSoup(res)
    # author links all live under homepage/
    author_tags = dom.find_all(href=re.compile('^homepage/'))
    for author_tag in author_tags:
        if author_tag.findChild('strong'):
            author_name = CDBLPAuthor.getEnglishName(
                author_tag.findChild('strong').contents[0])
            author_list.append(author_name)
    # FIX: the cache file was opened without `with` at function start and
    # leaked if anything above raised; open it only when there is data.
    with open('author-cache.data', 'w') as cache:
        cache.write(json.dumps(author_list))
    return author_list
def idiomFind(x):
    """Return a random idiom from idiom.txt whose first character's
    pinyin equals `x`, or None when nothing matches.

    Raises Exception when x is None.  (Python 2: lines are byte strings.)
    """
    if x is None:  # FIX: was `== None`
        raise Exception
    with open('idiom.txt', 'r') as f:
        base = f.readlines()
    random.shuffle(base)
    # FIX: hoisted out of the loop — the original constructed a PinYin()
    # and re-loaded the whole word table for every single line.
    test = PinYin()
    test.load_word()
    for i in base:
        c = i[:3].decode('utf8')  # first hanzi occupies 3 UTF-8 bytes
        if len(i) > 1:
            try:
                py = test.hanzi2pinyin(c)[0]
            except Exception:  # FIX: was a bare except
                continue
            if py == x:
                return i
    return None
def main(args):
    """Backfill py_name for rooms that lack one.

    py_name = full first syllable + the initial of each later syllable.
    """
    converter = PinYin()
    converter.load_word()
    conn = getconn()
    cursor = conn.cursor()
    cursor.execute('select rname,rid from roominfo where py_name is null')
    # cursor.execute('select cname,area from area_name_map where py_name is null')
    for rname, rid in cursor.fetchall():
        syllables = converter.hanzi2pinyin(string=rname.encode("utf8"))
        py_name = syllables[0] + "".join(s[0] for s in syllables[1:])
        # cursor.execute('update area_name_map set py_name=? where cname=? and area=?',(pystr,row[0],row[1]))
        cursor.execute('update roominfo set py_name=? where rid=?',
                       (py_name, rid))
    conn.commit()
    conn.close()
def Convert(self):
    """Add X-PHONETIC-LAST/FIRST-NAME pinyin lines after each N: record
    of a vCard file; unmatched lines pass through unchanged.

    Reads self.filename; writes the result to 'ok_' + self.filename.
    """
    py_engine = PinYin()
    py_engine.load_word()
    contact = list()
    # BUG FIX: the original opened the input file twice (`f = open(...)`
    # plus `for line in open(...)`), read from one while iterating the
    # other, and never closed either handle (nor the output file).
    with open(self.filename, 'r') as f:
        for line in f:
            # N:<family>;<given>;... — four semicolon-separated parts
            k = re.findall(r"(\N\:[^\;]*\;[^\;]*\;[^\;]*\;[^\;]*\;)", line)
            if k:
                if k[0].find(';') - 2 > 3:
                    # presumably a two-character surname — TODO confirm
                    xing = k[0][2: 5]
                    ming = k[0][5: k[0].find(';')] + k[0][k[0].find(';') + 1: k[0].find(';', k[0].find(';') + 1)]
                else:
                    xing = k[0][2: k[0].find(';')]
                    ming = k[0][k[0].find(';') + 1: k[0].find(';', k[0].find(';') + 1)]
                contact.append('N:' + xing + ';' + ming + ';' + ";;\n")
                contact.append(self._phonetic_line(py_engine, "X-PHONETIC-LAST-NAME:", xing))
                contact.append(self._phonetic_line(py_engine, "X-PHONETIC-FIRST-NAME:", ming))
            else:
                contact.append(line)
    with open("ok_" + self.filename, 'w') as fout:
        fout.writelines(contact)

def _phonetic_line(self, py_engine, prefix, hanzi):
    """Build one phonetic vCard line: prefix + capitalized pinyin syllables."""
    line = prefix
    for item in py_engine.hanzi2pinyin(string=hanzi):
        if item != '':
            line = line + item.capitalize()
    return line + "\n"
host='192.100.2.31', user='******', passwd='opensesame', db='traincrawler', port=3306) default_encoding = 'utf-8' if sys.getdefaultencoding() != default_encoding: reload(sys) sys.setdefaultencoding(default_encoding) base_url = "http://trains.ctrip.com/TrainBooking/Ajax/GetTrainDataV2.aspx?DepartureCity=%s&ArrivalCity=%s&DepartureDate=2017-03-30&NO=01" # post_param = 'http://trains.ctrip.com/TrainBooking/Ajax/SearchListHandler.ashx?Action=getSearchList&value={"IsBus": False, "Filter": "0", "Catalog": "", "IsGaoTie":False, "IsDongChe":False, "CatalogName": "", "DepartureCity": %s, "ArrivalCity": %s, "HubCity": "", "DepartureCityName": %s, "ArrivalCityName": %s, "DepartureDate": "2017-03-24", "DepartureDateReturn": "2017-03-26", "ArrivalDate": "", "TrainNumber": ""}' base_path = 'xc-price/%s' getStations_sql = 'select id,begin_stop,begin_alia,end_stop,end_alia from train_stop_20170331_task_xc where task=0 limit 100' update_sql = 'update train_stop_20170331_task_xc set task = 1 where id =%s' py_util = PinYin() py_util.load_word('word.data') def get(p): time.sleep(1) content = '' try: p = p.encode('utf-8') response = urllib.urlopen(p) content = response.read() response.close() return content.decode('gb2312') except Exception as e: print e content = '500'
# -*- coding: utf-8 -*- # Author: [email protected] # Copyright 2015 @ NLPJob #bug fixed : can be use with pyenv environment #update: this is Python3 script import codecs import sys from langconv import * from pinyin import PinYin py = PinYin() py.load_word() def make_word_4tag(word): if len(word) == 0: return "N" if len(word) == 1: return "S" else: tag = "B" for w in word[1:len(word) - 1]: tag += "M" tag += "E" return tag def make_mecab_train_data(input_file, output_file): input_data = codecs.open(input_file, 'r', 'utf-8')
from common import get_response_by_url from mongoservice import Insert,get_category_by_cid,get_by_pinyin,get_all from bs4 import BeautifulSoup from pinyin import PinYin import os _cid = 160 base_url = "http://www.meishij.net/shiliao.php?cid=" s_pinyin = PinYin() s_pinyin.load_word() # filepath =os.path.abspath("./1.json") '''获取 理疗分类''' def get_meishijie_categories(cid,category_pinyin='',category_cn=''): url=base_url+str(cid) html =get_html_by_url(url) # print(html) # soup = BeautifulSoup(html) # print(soup) # print(soup.prettify()) sop = BeautifulSoup(html) # h = sop.prettify() # print( h ) # head = sop.find('head') # print(head) # p_categories = sop.findAll(attrs={'id':'listnav_ul'})[0] # print(p_categories) # dds = sop.select(".listnav_dl_style1 dd a") dds = sop.select(".listnav_dl_style1 .current a")
class CDBLPAuthor:
    """Scraper for C-DBLP (cdblp.cn) author and journal pages.

    Converts Chinese author names to English forms via pinyin and parses
    author / journal-issue pages with BeautifulSoup.  Relies on module
    imports visible elsewhere in the file: urlopen, quote, unquote,
    BeautifulSoup, NavigableString, re, json, urllib, PinYin.
    """

    # shared pinyin engine, loaded once at class-definition time
    pinyin = PinYin()
    pinyin.load_word()

    def __init__(self, author_name, link=''):
        """Fetch the search-result page for `author_name` (Chinese).

        `link` overrides the default search URL; the '王伟' branch points
        at a local host — presumably a debugging shortcut (NOTE(review)).
        """
        self.author_name = CDBLPAuthor.getEnglishName(author_name)
        if not link:
            link = 'http://cdblp.cn/search_result.php?author_name={}&area=computer'.format(
                quote(self.author_name['zh']))
        elif author_name == '王伟':
            link = 'http://127.0.0.1/ww'
        self.res = urlopen(link)
        self.dom = BeautifulSoup(self.res)
        #self.get_all_authors()
        # placeholder record; overwritten by get_author()
        self.author = {
            'author_name': {},
            'coauthors': [],
            'publications': [{
                'title':
                'Ranking the Difficulty Level of the Knowledge Units Based on Learning Dependency',
                'authors': ['Jun Liu', 'Sha Sha', 'Qinghua Zheng', 'Wei Zhang'],
                'venue-type': 'journal',
                'venue': 'IJDET',
                'volume': '',
                'number': '',
                'pages': '',
                'year': '2012',
                'cdblpkey': '83594'
            }]
        }

    def get_all_authors(self):
        """Interactively disambiguate same-name authors.

        Prints each candidate, prompts on stdin for an index, reloads
        self.res/self.dom from the chosen page, and returns its URL.
        """
        l = []
        all_name_tags = self.dom.find_all(
            href=re.compile('namedisambiguation'))
        i = 0
        for name_tag in all_name_tags:
            if name_tag.string != 'Unknown':
                print(i, self.author_name['zh'], 'from', name_tag.string)
                l.append('http://cdblp.cn' + name_tag['href'][5:])
                i += 1
        c = int(
            input(
                'There are several authors under this name, which one do you want to choose?\n> '
            ))
        if c < 0:
            c = 0
        self.res = urlopen(l[c])
        self.dom = BeautifulSoup(self.res)
        return l[c]

    def get_author(self):
        """Parse the loaded author page into the author record.

        Returns and stores a dict with keys 'author_name', 'coauthors'
        and 'publications' (each publication a dict of journal metadata).
        """
        coauthors = self.get_coauthors()
        publications = []
        paper_link_tags = self.dom.find_all(href=re.compile('^/paper'))
        for paper_link_tag in paper_link_tags:
            # table cell tag
            td_tag = paper_link_tag.parent
            # title
            title = paper_link_tag.string
            link = paper_link_tag['href']
            # numeric id embedded in the paper URL before '.html'
            cdblbkey = re.findall('(\d+)(\.html$)', link)[0][0]
            # authors
            authors = []
            counter = 0
            for author_tag in td_tag.find_all(href=re.compile('^/author')):
                if counter == 0:
                    # the queried author may appear as bare text (not a
                    # link) just before the first author link
                    current_author = author_tag.previous_sibling
                    if type(current_author
                            ) == NavigableString and self.author_name[
                                'zh'] in current_author.string:
                        authors.append(current_author.string.strip())
                if isinstance(author_tag.string, str):
                    authors.append(author_tag.string.strip())
                # ...or as bare text following an author link
                current_author = author_tag.next_sibling
                if type(current_author
                        ) == NavigableString and self.author_name[
                            'zh'] in current_author.string:
                    authors.append(
                        current_author.string.replace('.', '').strip())
                counter += 1
            # publication data
            venue_rec = td_tag.find_all(href=re.compile('^/journal'))
            venue = venue_rec[0].string
            volume_result = re.compile('(/journal)/(.*)/(\d*)/(.*)').findall(
                venue_rec[1]['href'])[0]
            issue_result = re.compile(
                '(/journal_issue)/(.*)/(\d*)/(.*)').findall(
                    venue_rec[2]['href'])[0]
            year = volume_result[-2]
            volume = volume_result[-1]
            number = issue_result[-1]
            pages = venue_rec[-1].next_sibling.string.replace(':', '').strip()
            publication = {
                'title': title,
                'authors': authors,
                'venue-type': 'journal',
                'venue': venue,
                'volume': unquote(volume),
                'number': unquote(number),
                'pages': pages,
                'year': year,
                'cdblpkey': cdblbkey
            }
            publications.append(publication)
        self.author = {
            'author_name': self.author_name,
            'coauthors': coauthors,
            'publications': publications
        }
        return self.author

    def get_coauthors(self):
        """Return coauthor records from the page's coauthor table.

        Each entry is a getEnglishName() dict plus 'count' (number of
        shared papers) and 'pubs' (their relative URLs, as a lazy map).
        """
        coauthors = []
        # the coauthor table is the second-to-last table on the page
        coauthor_table = self.dom.find_all('table')[-2]
        coauthor_tags = coauthor_table.find_all(href=re.compile('^/author'))
        for coauthor_tag in coauthor_tags:
            coauthored_pub_tags = coauthor_tag.parent.find_next_sibling(
                'td').find_all('a')
            author = CDBLPAuthor.getEnglishName(coauthor_tag.string.strip())
            author['count'] = len(coauthored_pub_tags)
            author['pubs'] = map(lambda t: t['href'][1:], coauthored_pub_tags)
            coauthors.append(author)
        return coauthors
        #return list(map(lambda a: '{} {}'.format(a['first_name'], a['last_name']), self.coauthors_en))

    @staticmethod
    def getEnglishName(author_name_zh):
        """Build English name forms for a Chinese name via the class pinyin.

        Returns a dict with 'zh', 'last_name', 'first_name', 'full_name',
        'full_name_reverse' and (for 3-character names) 'full_name_dash';
        when the converter returns a plain string, only 'full_name'.
        """
        author_name_en_split = CDBLPAuthor.pinyin.hanzi2pinyin(
            author_name_zh.strip())
        # return author's English name
        if isinstance(author_name_en_split, str):
            author_name = {'full_name': author_name_en_split}
        else:
            if len(author_name_zh) > 1:
                # first character is the family name; the rest, joined,
                # forms the given name
                author_name = {
                    'zh': author_name_zh,
                    'last_name': author_name_en_split[0].capitalize(),
                    'first_name':
                    author_name_en_split[1].capitalize() +
                    ''.join(author_name_en_split[2:])
                }
                author_name['full_name'] = '{} {}'.format(
                    author_name['first_name'], author_name['last_name'])
                author_name['full_name_reverse'] = '{} {}'.format(
                    author_name['last_name'], author_name['first_name'])
                if len(author_name_zh) == 3:
                    # e.g. "Xiao-ming Wang" variant used by some venues
                    author_name['full_name_dash'] = '{}-{} {}'.format(
                        author_name_en_split[1].capitalize(),
                        author_name_en_split[2], author_name['last_name'])
            else:
                # single-character name: family name only
                author_name = {
                    'zh': author_name_zh,
                    'last_name': author_name_en_split[0].capitalize(),
                    'first_name': ''
                }
                author_name['full_name'] = '{} {}'.format(
                    author_name['first_name'], author_name['last_name'])
                author_name['full_name_reverse'] = '{} {}'.format(
                    author_name['last_name'], author_name['first_name'])
        return author_name

    @staticmethod
    def get_english_name(author_name_zh, py_obj):
        """Same as getEnglishName but uses the caller-supplied `py_obj`
        converter instead of the class-level one."""
        author_name_en_split = py_obj.hanzi2pinyin(author_name_zh.strip())
        # return author's English name
        if isinstance(author_name_en_split, str):
            author_name = {'full_name': author_name_en_split}
        else:
            if len(author_name_zh) > 1:
                author_name = {
                    'zh': author_name_zh,
                    'last_name': author_name_en_split[0].capitalize(),
                    'first_name':
                    author_name_en_split[1].capitalize() +
                    ''.join(author_name_en_split[2:])
                }
                author_name['full_name'] = '{} {}'.format(
                    author_name['first_name'], author_name['last_name'])
                author_name['full_name_reverse'] = '{} {}'.format(
                    author_name['last_name'], author_name['first_name'])
                if len(author_name_zh) == 3:
                    author_name['full_name_dash'] = '{}-{} {}'.format(
                        author_name_en_split[1].capitalize(),
                        author_name_en_split[2], author_name['last_name'])
            else:
                author_name = {
                    'zh': author_name_zh,
                    'last_name': author_name_en_split[0].capitalize(),
                    'first_name': ''
                }
                author_name['full_name'] = '{} {}'.format(
                    author_name['first_name'], author_name['last_name'])
                author_name['full_name_reverse'] = '{} {}'.format(
                    author_name['last_name'], author_name['first_name'])
        return author_name

    @staticmethod
    def get_publications_by_journal(journal, year, issue):
        """Fetch one journal issue page and return its publication dicts."""
        res = urlopen('http://cdblp.cn/journal_issue/' +
                      quote('{}/{}/{}'.format(journal, year, issue)))
        dom = BeautifulSoup(res)
        publications = []
        paper_link_tags = dom.find_all(href=re.compile('^/paper'))
        for paper_link_tag in paper_link_tags:
            # table cell tag
            td_tag = paper_link_tag.parent
            # title
            title = paper_link_tag.string
            link = paper_link_tag['href']
            cdblbkey = re.findall('(\d+)(\.html$)', link)[0][0]
            # authors
            authors = []
            for author_tag in td_tag.find_all(href=re.compile('^/author')):
                author_name = author_tag.contents[0]
                if isinstance(author_name, str):
                    authors.append(author_name.strip())
            # publication data
            venue_rec = td_tag.find_all(href=re.compile('^/journal'))
            venue = venue_rec[0].string
            volume_result = re.compile('(/journal)/(.*)/(\d*)/(.*)').findall(
                venue_rec[1]['href'])[0]
            issue_result = re.compile(
                '(/journal_issue)/(.*)/(\d*)/(.*)').findall(
                    venue_rec[2]['href'])[0]
            year = volume_result[-2]
            volume = volume_result[-1]
            number = issue_result[-1]
            pages = venue_rec[-1].next_sibling.string.replace(':', '').strip()
            publication = {
                'title': title,
                'authors': authors,
                'venue-type': 'journal',
                'venue': venue,
                'volume': unquote(volume),
                'number': unquote(number),
                'pages': pages,
                'year': year,
                'cdblpkey': cdblbkey
            }
            publications.append(publication)
        return publications

    @staticmethod
    def get_publication_dict():
        """Crawl every journal listed on the scan page.

        Returns {journal: {year: {issue: [publication, ...]}}}.
        """
        publication_dict = {}
        res = urlopen('http://cdblp.cn/jour_scan.php?fid=journalscan')
        category_dom = BeautifulSoup(res)
        print(
            list(
                map(
                    lambda c: {
                        'title': c.string,
                        'href': 'http://cdblp.cn' + c['href']
                    }, category_dom.find_all(href=re.compile('^/journal')))))
        for journal_tag in category_dom.find_all(href=re.compile('^/journal')):
            journal = journal_tag.string
            print(journal)
            print('http://cdblp.cn' + journal_tag['href'])
            publication_dict[journal] = {}
            res = urlopen('http://cdblp.cn' + journal_tag['href'])
            journal_dom = BeautifulSoup(res)
            for issue_tag in journal_dom.find_all(
                    href=re.compile('^/journal_issue')):
                print(issue_tag.string)
                print(issue_tag['href'])
                issue_result = re.compile(
                    '(/journal_issue)/(.*)/(\d*)/(.*)').findall(
                        issue_tag['href'])[0]
                year = issue_result[-2]
                issue = unquote(issue_result[-1])
                publications = CDBLPAuthor.get_publications_by_journal(
                    journal, year, issue)
                if not publication_dict[journal].__contains__(year):
                    publication_dict[journal][year] = {}
                publication_dict[journal][year][issue] = publications
        return publication_dict

    @staticmethod
    def parallel_get(journal, link):
        """Crawl one journal's issues and cache the result to disk.

        Writes '<journal>-pub-cache.data' (JSON) and returns
        {year: {issue: [publication, ...]}}.  Per-issue errors are
        printed and skipped.  NOTE(review): the except handlers reference
        `year`/`issue`, which are unbound if the regex findall itself
        raised — confirm before relying on the error output.
        """
        publication_dict = {}
        print(journal)
        print(link)
        res = urlopen(link)
        journal_dom = BeautifulSoup(res)
        for issue_tag in journal_dom.find_all(
                href=re.compile('^/journal_issue')):
            #print(issue_tag.string)
            #print(issue_tag['href'])
            try:
                issue_result = re.compile(
                    '(/journal_issue)/(.*)/(\d*)/(.*)').findall(
                        issue_tag['href'])[0]
                year = issue_result[-2]
                issue = unquote(issue_result[-1])
                publications = CDBLPAuthor.get_publications_by_journal(
                    journal, year, issue)
                if not publication_dict.__contains__(year):
                    publication_dict[year] = {}
                publication_dict[year][issue] = publications
            except AttributeError as e:
                print(journal + year + issue)
                print(e)
            except TypeError as et:
                print(journal + year + issue)
                print(et)
            except urllib.error.HTTPError as eh:
                print(journal + year + issue)
                print(eh)
        cache = open('{}-pub-cache.data'.format(journal), 'w')
        cache.write(json.dumps(publication_dict))
        cache.close()
        return publication_dict
def __init__(self):
    """Build the pinyin converter once and keep it on the instance."""
    converter = PinYin()
    converter.load_word()
    self.test = converter
def get_phonetic(word):
    """Return the pinyin of `word` as one concatenated string."""
    pinyin = PinYin()
    pinyin.load_word()
    # BUG FIX: the converter's method is hanzi2pinyin (as used by every
    # other call site in this codebase); the original called the
    # nonexistent `hanzipinyin`, raising AttributeError at runtime.
    return ''.join(pinyin.hanzi2pinyin(word))
# -*- coding: utf-8 -*- from pinyin import PinYin if __name__ == '__main__': test = PinYin(no_digit=False, no_letter=False) test.load_word() s = '钓鱼岛是中国的' print('"%s" 转换成拼音: %s' % (s, test.to_pinyin(s))) print('"%s" 转换成带分隔符的拼音: %s' % (s, test.to_pinyin(s, join_with=' '))) print('"%s" 转换成带分隔符的拼音(保留多音字): %s' % (s, test.to_pinyin(s, join_with=' ', multi=True))) print('"%s" 转换成带分隔符的拼音且首字符大写: %s' % (s, test.to_pinyin(s, join_with=' ', capitalize=True))) print('"%s" 转换成带分隔符的拼音且首字符大写(保留多音字): %s' % (s, test.to_pinyin(s, join_with=' ', capitalize=True, multi=True))) print('"%s" 转换成首字母缩写: %s' % (s, test.to_abbr(s))) print('"%s" 转换成首字母缩写(保留多音字): %s' % (s, test.to_abbr(s, multi=True))) s = '加油中国!加油华为!你行!' print('\n\n原始字符串:"%s"' % s) # 原始字符串:"加油中国!加油华为!你行!" print('拼音首字母: %s ' % test.to_abbr(s)) print('全拼: %s' % test.to_pinyin(s))
#!/usr/bin/env python # -*- coding:utf-8 -*- from pinyin import PinYin import sys test = PinYin('application/libraries/pinyin.py/word.data') test.load_word() print test.hanzi2pinyin_split(string=sys.argv[1], split='_')
#!/usr/bin/env python # -*- coding:utf-8 -*- import re import xlwt from pinyin import PinYin, Cartesian_product test = PinYin() test.load_word() wbk = xlwt.Workbook() def main(): #files = ['shop_2.csv', 'sight_2.csv'] files = ['shop_2.csv', 'sight_2.csv', 'district_2_n.csv'] for f in files: func(f) wbk.save("/home/chenyp/sharefolder/cn2pinyin.xls") def func2(filename): #餐馆的输入文档 poitype = filename.split(".")[0].decode('utf-8') column2 = u"%sid" % poitype count = 0 lines = open(filename).readlines()[1:100] MAX = 35000 if len(lines) % MAX == 0: total = len(lines) / MAX else:
u"天气预报", u"京东", u"淘宝", u"百度", u"微信", u"斗鱼", u"爱奇艺", u"腾讯视频", u"qq", u"熊猫tv", u"快递", u'4399', } word2pinyin = PinYin() word2pinyin.load_word() alphabet = {'a':1, 'b':1, 'c':1, 'd':1, 'e':1, 'f':1, 'g':1, 'h':1, 'i':1, 'j':1, 'k':1, 'l':1, 'm':1, 'n':1, 'o':1, 'p':1, 'q':1, 'r':1, 's':1, 't':1, 'u':1, 'v':1, 'w':1, 'x':1, 'y':1, 'z':1} def hanzi2pinyi(word): result = [] for hanzi in word: if hanzi.lower() in alphabet: result.append(hanzi.lower()) else: result.append(word2pinyin.hanzi2pinyin(hanzi)) return ''.join(result)
#/usr/bin/env python # coding=utf-8 import os import sys import numpy as np import pandas as pd import jieba import re sys.path.append('utils/') import config from pinyin import PinYin str2pinyin = PinYin() jieba.load_userdict(config.jieba_dict) stopwords = [ line.strip() for line in open(config.stopwords_path, 'r').readlines() ] stopwords = [w.decode('utf8') for w in stopwords] # stopwords=[] #if config.cut_char_level: stopwords = [ u'?', u'。', u',', ] use_pinyin = False def clean_str(x): punc = "蚂蚁 了 吗 的 !?。,:;."
import config from urllib import quote from pinyin import PinYin import magic import shutil import sys reload(sys) sys.setdefaultencoding( "utf-8" ) if config.uncompression_enable: import uncompression # load config file root = config.root host = config.host py = PinYin(dict_file=os.getcwd()+'/pinyin/word.data') py.load_word() types = [ ".h",".cpp",".cxx",".cc",".c",".cs",".html",".js", ".php",".java",".py",".rb",".as",".jpeg",".jpg",".png", ".gif",".ai",".psd",".mp3",".avi",".rmvb",".mp4",".wmv", ".mkv",".doc",".docx",".ppt",".pptx",".xls",".xlsx", ".zip",".tar",".gz",".7z",".rar",".pdf",".txt",".exe", ".apk",".torrent",".srt",".pyc" ] preview=[] render = web.template.render('template')
# -*- coding: utf-8 -*- import re from openerp import models, fields, api from openerp.osv.expression import get_unaccent_wrapper, NEGATIVE_TERM_OPERATORS from pinyin import PinYin han2py = PinYin() han2py.load_word() # class multiple_name_search(models.Model): # _name = 'multiple_name_search.multiple_name_search' # name = fields.Char() # 将name转化为拼音的公共方法 def comman_change_name(name): pinyinStr, pyStr = False, False if name: #如果有name pinyinArr = han2py.str2pinyin(name) print pinyinArr pyStr = ''.join([p[0] for p in pinyinArr]) pinyinStr = ''.join(pinyinArr) return {'pinyin': pinyinStr, 'py': pyStr} class WithPinyinProductTemplate(models.Model): _inherit = 'product.template'
def check_contain_english(check_str): for ch in check_str.decode('utf-8'): if ch <= u'\u4e00' or ch >= u'\u9fff': return True return False # import city name into pandas Dataframe with open('ChinaCityList.json') as json_data: d = json.load(json_data) # extract city name into list city = json_normalize(data=d, record_path=['city', 'county']) city_name = city.name.tolist() city_name = [x.encode('utf-8') for x in city_name] # Build a dictionary in the form of city:pinying trans = PinYin() trans.load_word() to_py = trans.hanzi2pinyin city_py = [to_py(x) for x in city_name] city_dict = dict(zip(city_name, city_py)) # city chain game def city_chain(city): if len(city) == 0: print '错误:请确认是否输入汉字' elif check_contain_english(city): print '错误:请确认是否输入了非汉字' else: candidate = [] py_city = to_py(city)