Esempio n. 1
0
    def zhsJieba(self, text):
        """Convert ``text`` with OpenCC and return its jieba tokens joined by spaces.

        The text is first run through the ``zht2zhs.ini`` OpenCC
        configuration (conversion to Simplified Chinese).  Converted texts
        shorter than 200 characters are segmented in full with ``jieba.cut``;
        longer ones are reduced to the result of
        ``jieba.analyse.extract_tags`` with ``topK=30``.

        :param text: the text to convert and segment.
        :return: the resulting tokens joined with single spaces.
        """
        # Convert to Simplified Chinese.
        converter = pyopencc.OpenCC('zht2zhs.ini')
        text = converter.convert(text)

        # Segment with jieba.
        if len(text) < 200:
            seg_list = jieba.cut(text)
        else:
            seg_list = jieba.analyse.extract_tags(text, topK=30)
        return ' '.join(seg_list)
Esempio n. 2
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-

from os.path import join
from os.path import dirname

# Build ``ftoj`` from the OpenCC binding ('zht2zhs.ini' configuration) when it
# is usable; otherwise fall back to the ``ftoj`` provided by the jianfan
# module, looked up after adding ../common/pylib to sys.path.
try:
  import pyopencc
  ftoj = pyopencc.OpenCC('zht2zhs.ini').convert
except Exception:
  # ``Exception`` (not a bare ``except``) keeps the fallback for both import
  # and converter-setup failures while letting KeyboardInterrupt and
  # SystemExit propagate.
  print('cannot import opencc, import jianfan')
  import sys
  sys.path.append(join(dirname(__file__), '../common/pylib'))
  from jianfan import ftoj

# Every path below is built relative to the directory containing this file.
_HERE = dirname(__file__)

# Parent directory of this file's directory.
TIPITAKA_DIR = join(_HERE, '..')
# Data repository directory, two levels up.
DATA_REPO_DIR = join(_HERE, '../../data/')

# Locale directory under ../../common/.
localedir = join(_HERE, '../../common/locale/')

# Translation directory inside the data repository.
TranslationDir = join(DATA_REPO_DIR, 'tipitaka/translation/')
# Path of treeview.json inside ../build/.
TreeviewJsonPath = join(_HERE, '../build/treeview.json')

def getSDKPath():
  """Return the google_appengine directory path, three levels above this file's directory."""
  here = dirname(__file__)
  return join(here, "../../../google_appengine/")

def getRomnDir():
  """Return the path of the ``tipitaka/romn/`` directory under DATA_REPO_DIR."""
  romnDir = join(DATA_REPO_DIR, 'tipitaka/romn/')
  return romnDir

def getInfoFilePath():
  """Return the path of ``tocsInfo.txt`` in the ``../build/`` directory next to this file's directory."""
  here = dirname(__file__)
  return join(here, '../build/tocsInfo.txt')
Esempio n. 3
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import pyopencc
# Conversion callable backed by the 'zhs2zhtw_vp.ini' OpenCC configuration.
_converter = pyopencc.OpenCC('zhs2zhtw_vp.ini')
CN2TW = _converter.convert

if __name__ == '__main__':
  # Quick manual check: convert a sample string and print the result.
  sample = "中国鼠标软件打印机"
  print(CN2TW(sample))
Esempio n. 4
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-

from os.path import join
from os.path import dirname

# Build ``jtof`` from the OpenCC binding ('zhs2zht.ini' configuration) when it
# is usable; otherwise fall back to the ``jtof`` provided by the jianfan
# module, looked up after adding ../common/pylib to sys.path.
try:
  import pyopencc
  jtof = pyopencc.OpenCC('zhs2zht.ini').convert
except Exception:
  # ``Exception`` (not a bare ``except``) keeps the fallback for both import
  # and converter-setup failures while letting KeyboardInterrupt and
  # SystemExit propagate.
  print('cannot import opencc, import jianfan')
  import sys
  sys.path.append(join(dirname(__file__), '../common/pylib'))
  from jianfan import jtof

#jtof = lambda x: x

# Every path below is built relative to the directory containing this file.
_THIS_DIR = dirname(__file__)

# Parent directory of this file's directory.
DICTIONARY_DIR = join(_THIS_DIR, '..')
# Data repository directory, two levels up.
DATA_REPO_DIR = join(_THIS_DIR, '../../data/')
# Shared application data directory under ../../common/app/.
APP_COMMON_DATA_DIR = join(
    _THIS_DIR, "../../common/app/scripts/services/data/")

def getSDKPath():
  """Return the google_appengine directory path, three levels above this file's directory."""
  baseDir = dirname(__file__)
  return join(baseDir, "../../../google_appengine/")

def getDictBooksCSVPath():
  """Return the path of ``dictionary/dict-books.csv`` under DATA_REPO_DIR."""
  csvPath = join(DATA_REPO_DIR, 'dictionary/dict-books.csv')
  return csvPath

def getDictWordsCSV1Path():
  """Return the path of ``dictionary/dict_words_1.csv`` under DATA_REPO_DIR."""
  csvPath = join(DATA_REPO_DIR, 'dictionary/dict_words_1.csv')
  return csvPath
Esempio n. 5
0
 def __init__(self, config_name):
     """Create an OpenCC converter for ``config_name`` and keep it as ``self.translator``."""
     # Imported here so pyopencc is only loaded when an instance is created.
     import pyopencc
     converter = pyopencc.OpenCC(config_name)
     self.translator = converter
    def importNews(self):
        """Import the items of the fetched news feed into the target site.

        Does nothing when ``self.newsList`` is ``None``.  For every ``item``
        element of the feed:

        1. the item link is read (and passed through ``self.transZhUrl`` for
           'dw-' and 'rfi-' site codes);
        2. the site is searched for that ``originalUrl``; the item is skipped
           when the search reports more than one match;
        3. the page is downloaded with ``self.getDocs`` and parsed by the
           site-specific content extractor;
        4. items whose text is shorter than 50 characters are skipped;
        5. title and text are converted with the 'mix2zht.ini' and
           'mix2zhs.ini' OpenCC configurations and stored through
           ``self.addNews`` under the zh-tw and zh-cn folders respectively;
        6. the ``reg_trans`` URL of the site is requested with the new id.

        Any ``Exception`` raised while processing one item is reported by
        printing 'line 227' and the loop moves on to the next item.
        """
        if self.newsList is None:
            return

        listSoup = BeautifulSoup(self.newsList, newsSiteType)

        for item in listSoup.findAll('item'):
            link = unicode(item.link.string)

            # Pre-process URLs that contain Chinese characters.
            if siteCode.startswith(('dw-', 'rfi-')):
                link = self.transZhUrl(link)

            link = str(link)

            query = requests.get(
                '%s/@search?originalUrl=%s' % (siteURL, urllib.quote(link)),
                headers={'Accept': 'application/json'},
                auth=(username, paswd),
            )
            # Decode the JSON body once and reuse the value.
            itemsTotal = query.json().get('items_total')
            print('items_total: %s' % itemsTotal)
            if itemsTotal > 1:
                continue

            webPage = self.getDocs(link)
            if webPage is None:
                continue
            pageSoup = BeautifulSoup(webPage, "lxml")

            try:
                # Fetch the HTML and keywords (full list).
                # NOTE: when siteCode matches none of the prefixes below,
                # ``result`` stays unbound and the resulting NameError is
                # handled by the ``except`` clause of this ``try``.
                if siteCode.startswith('dw-'):
                    result, oldPicturePath = self.dwNewsContent(pageSoup)
                elif siteCode.startswith('rfi-'):
                    result, oldPicturePath = self.rfiNewsContent(pageSoup)
                elif siteCode.startswith('cna-'):
                    result, oldPicturePath = self.ncaNewsContent(pageSoup)

                title, text = result['title'], result['text']

                if len(text) < 50:
                    continue

                targetURL_TW = '%s/zh-tw/%s' % (siteURL, self.folder)
                targetURL_CN = '%s/zh-cn/%s' % (siteURL, self.folder)
                newsId = datetime.now().strftime('%Y%m%d%H%M%S')
                m2s = pyopencc.OpenCC('mix2zhs.ini')
                m2t = pyopencc.OpenCC('mix2zht.ini')
                title_t = m2t.convert(title)
                title_s = m2s.convert(title)
                text_t = m2t.convert(text)
                text_s = m2s.convert(text)
                self.addNews(targetURL_TW, newsId, title_t, text_t, link,
                             oldPicturePath)
                self.addNews(targetURL_CN, newsId, title_s, text_s, link,
                             oldPicturePath)
                urllib.urlopen(
                    'http://%s:%s@%s/reg_trans?id=%s' %
                    (username, paswd, siteURL.replace('http://', ''), newsId))

            except Exception:
                # ``Exception`` instead of a bare ``except`` so that
                # KeyboardInterrupt/SystemExit can still stop the import.
                print('line 227')
                continue
Esempio n. 7
0
def s2t(sentences):
    """Return ``sentences`` converted with the 'zhs2zht.ini' OpenCC configuration."""
    return opencc.OpenCC('zhs2zht.ini').convert(sentences)
Esempio n. 8
0
def t2s(sentences):
    """Return ``sentences`` converted with the 'zht2zhs.ini' OpenCC configuration."""
    return opencc.OpenCC('zht2zhs.ini').convert(sentences)