def zhsJieba(self, text):
    """Convert ``text`` with OpenCC, tokenize it with jieba, and join the tokens.

    The text is first passed through an OpenCC converter built from the
    ``zht2zhs.ini`` configuration. If the converted text has a length below
    200 it is segmented with ``jieba.cut``; otherwise
    ``jieba.analyse.extract_tags`` is used with ``topK=30``.

    Returns the resulting tokens joined by single spaces.
    """
    # Convert to Simplified Chinese.
    simplified = pyopencc.OpenCC('zht2zhs.ini').convert(text)

    # Short text gets segmented in full; long text is reduced to its tags.
    needs_full_cut = len(simplified) < 200
    if needs_full_cut:
        tokens = jieba.cut(simplified)
    else:
        tokens = jieba.analyse.extract_tags(simplified, topK=30)

    # Diagnostic output of the token container (single-argument print, so the
    # parenthesised form emits exactly what the statement form does).
    print(tokens)
    return ' '.join(tokens)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
#
# Defines the ``ftoj`` text converter and the filesystem locations used by
# the tipitaka scripts. Every path is built relative to this file's directory.

from os.path import join
from os.path import dirname

# Prefer pyopencc (with the 'zht2zhs.ini' configuration) for ``ftoj``. If it
# cannot be imported or initialised, fall back to ``jianfan.ftoj`` loaded from
# ../common/pylib.
try:
    import pyopencc
    ftoj = pyopencc.OpenCC('zht2zhs.ini').convert
except Exception:
    # ``except Exception`` instead of a bare ``except`` so that
    # KeyboardInterrupt and SystemExit are not swallowed by the fallback.
    print('cannot import opencc, import jianfan')
    import sys
    sys.path.append(join(dirname(__file__), '../common/pylib'))
    from jianfan import ftoj

# Directory one level above this file.
TIPITAKA_DIR = join(dirname(__file__), '..')
# Data directory two levels above this file.
DATA_REPO_DIR = join(dirname(__file__), '../../data/')
# Locale directory under ../../common/.
localedir = join(dirname(__file__), '../../common/locale/')
# Translation directory inside the data directory.
TranslationDir = join(DATA_REPO_DIR, 'tipitaka/translation/')
# Location of treeview.json under ../build/.
TreeviewJsonPath = join(dirname(__file__), '../build/treeview.json')


def getSDKPath():
    """Return the ``google_appengine/`` directory path, three levels above this file."""
    return join(dirname(__file__), "../../../google_appengine/")


def getRomnDir():
    """Return the ``tipitaka/romn/`` directory path inside ``DATA_REPO_DIR``."""
    return join(DATA_REPO_DIR, 'tipitaka/romn/')


def getInfoFilePath():
    """Return the path of ``tocsInfo.txt`` under ``../build/``."""
    return join(dirname(__file__), '../build/tocsInfo.txt')
#!/usr/bin/env python
# -*- coding:utf-8 -*-
#
# Exposes ``CN2TW``: the ``convert`` method of an OpenCC instance built from
# the 'zhs2zhtw_vp.ini' configuration. Running the file as a script prints
# the conversion of a sample string.

import pyopencc

_converter = pyopencc.OpenCC('zhs2zhtw_vp.ini')
CN2TW = _converter.convert


def _demo():
    """Print the ``CN2TW`` conversion of a fixed sample string."""
    sample = "中国鼠标软件打印机"
    print(CN2TW(sample))


if __name__ == '__main__':
    _demo()
#!/usr/bin/env python
# -*- coding:utf-8 -*-
#
# Defines the ``jtof`` text converter and the filesystem locations used by
# the dictionary scripts. Every path is built relative to this file's
# directory.

from os.path import join
from os.path import dirname

# Prefer pyopencc (with the 'zhs2zht.ini' configuration) for ``jtof``. If it
# cannot be imported or initialised, fall back to ``jianfan.jtof`` loaded from
# ../common/pylib.
try:
    import pyopencc
    jtof = pyopencc.OpenCC('zhs2zht.ini').convert
except Exception:
    # ``except Exception`` instead of a bare ``except`` so that
    # KeyboardInterrupt and SystemExit are not swallowed by the fallback.
    print('cannot import opencc, import jianfan')
    import sys
    sys.path.append(join(dirname(__file__), '../common/pylib'))
    from jianfan import jtof

# Disabled identity stand-in for the converter, left in place from the original:
#jtof = lambda x: x

# Directory one level above this file.
DICTIONARY_DIR = join(dirname(__file__), '..')
# Data directory two levels above this file.
DATA_REPO_DIR = join(dirname(__file__), '../../data/')
# Service data directory under ../../common/app/scripts/.
APP_COMMON_DATA_DIR = join(dirname(__file__), "../../common/app/scripts/services/data/")


def getSDKPath():
    """Return the ``google_appengine/`` directory path, three levels above this file."""
    return join(dirname(__file__), "../../../google_appengine/")


def getDictBooksCSVPath():
    """Return the path of ``dictionary/dict-books.csv`` inside ``DATA_REPO_DIR``."""
    return join(DATA_REPO_DIR, 'dictionary/dict-books.csv')


def getDictWordsCSV1Path():
    """Return the path of ``dictionary/dict_words_1.csv`` inside ``DATA_REPO_DIR``."""
    return join(DATA_REPO_DIR, 'dictionary/dict_words_1.csv')
def __init__(self, config_name):
    """Build an OpenCC instance from ``config_name`` and store it as ``self.translator``."""
    # pyopencc is imported when the object is constructed, not when the
    # module is loaded.
    from pyopencc import OpenCC
    self.translator = OpenCC(config_name)
def importNews(self):
    """Import the items of ``self.newsList`` as zh-tw and zh-cn news entries.

    Does nothing when ``self.newsList`` is ``None``. Otherwise, for every
    ``item`` element of the feed:

    1. The item link is normalised (``self.transZhUrl`` for ``dw-`` and
       ``rfi-`` site codes) and the site's ``@search`` endpoint is queried
       for that ``originalUrl``; the item is skipped when ``items_total``
       is greater than 1.
    2. The page is fetched with ``self.getDocs`` and parsed by the
       site-specific extractor; items with no page or with a text shorter
       than 50 are skipped.
    3. Title and text are converted with the ``mix2zht.ini`` and
       ``mix2zhs.ini`` OpenCC configurations and stored through
       ``self.addNews`` under the ``zh-tw`` and ``zh-cn`` folders.
    4. The site's ``reg_trans`` URL is requested with the new news id.

    Any ordinary exception raised while extracting, converting or storing
    one item is reported on stdout and the loop moves on to the next item.
    """
    if self.newsList is None:
        return

    listSoup = BeautifulSoup(self.newsList, newsSiteType)
    for item in listSoup.findAll('item'):
        link = unicode(item.link.string)
        # Pre-process URLs that may contain Chinese characters.
        if siteCode.startswith(('dw-', 'rfi-')):
            link = self.transZhUrl(link)
        link = str(link)

        query = requests.get(
            '%s/@search?originalUrl=%s' % (siteURL, urllib.quote(link)),
            headers={'Accept': 'application/json'},
            auth=(username, paswd),
        )
        # Decode the JSON body once and reuse the value for both the log
        # line and the duplicate check.
        items_total = query.json().get('items_total')
        print('items_total: %s' % items_total)
        # NOTE(review): presumably two hits means both language versions are
        # already stored -- confirm that "> 1" (not "> 0") is intended.
        if items_total > 1:
            continue

        webPage = self.getDocs(link)
        if webPage is None:
            continue
        pageSoup = BeautifulSoup(webPage, "lxml")

        try:
            # Extract the html and the keywords (full list).
            if siteCode.startswith('dw-'):
                result, oldPicturePath = self.dwNewsContent(pageSoup)
            elif siteCode.startswith('rfi-'):
                result, oldPicturePath = self.rfiNewsContent(pageSoup)
            elif siteCode.startswith('cna-'):
                result, oldPicturePath = self.ncaNewsContent(pageSoup)
            else:
                # Previously this case only surfaced as a NameError on
                # ``result``; fail with an explicit message instead.
                raise ValueError('unsupported siteCode: %r' % (siteCode,))

            title, text = result['title'], result['text']
            if len(text) < 50:
                continue

            targetURL_TW = '%s/zh-tw/%s' % (siteURL, self.folder)
            targetURL_CN = '%s/zh-cn/%s' % (siteURL, self.folder)
            newsId = datetime.now().strftime('%Y%m%d%H%M%S')

            m2s = pyopencc.OpenCC('mix2zhs.ini')
            m2t = pyopencc.OpenCC('mix2zht.ini')
            title_t = m2t.convert(title)
            title_s = m2s.convert(title)
            text_t = m2t.convert(text)
            text_s = m2s.convert(text)

            self.addNews(targetURL_TW, newsId, title_t, text_t, link, oldPicturePath)
            self.addNews(targetURL_CN, newsId, title_s, text_s, link, oldPicturePath)

            # NOTE(review): the credentials are embedded in the URL here;
            # consider sending them the way the ``requests.get`` call above
            # does.
            response = urllib.urlopen(
                'http://%s:%s@%s/reg_trans?id=%s'
                % (username, paswd, siteURL.replace('http://', ''), newsId))
            # Only the request itself matters; release the connection.
            response.close()
        except Exception as exc:
            # ``except Exception`` keeps the per-item best-effort behaviour
            # while letting KeyboardInterrupt/SystemExit stop the import, and
            # the message now says what actually went wrong.
            print('line 227: %r' % (exc,))
            continue
def s2t(sentences):
    """Return ``sentences`` converted by OpenCC with the 'zhs2zht.ini' configuration."""
    return opencc.OpenCC('zhs2zht.ini').convert(sentences)
def t2s(sentences):
    """Return ``sentences`` converted by OpenCC with the 'zht2zhs.ini' configuration."""
    return opencc.OpenCC('zht2zhs.ini').convert(sentences)