Code example #1
def convert_to_strings(wikipage):
    # Given a wikipage object, return a structured dictionary holding
    # all of the page's information (title, summary, sections, links).
    from hanziconv import HanziConv
    import wikitextparser as wtp

    parsed = wtp.parse(wikipage.content)
    try:
        summary = HanziConv.toTraditional(parsed.sections[0].pprint())
    except Exception:
        summary = None
    try:
        sections = [HanziConv.toTraditional(sec.pprint())
                    for sec in parsed.sections[1:]]
        try:
            sub_titles = [HanziConv.toTraditional(sec.title[1:-1])
                          for sec in parsed.sections[1:]]
        except Exception:
            sub_titles = None
        try:
            section_content = [s[s.find('\n') + 1:] for s in sections]
        except Exception:
            section_content = None
    except Exception:
        sections = None

    # Pair each section title with its content; any failure above
    # (including an undefined name) is caught here.
    try:
        sections = list(zip(sub_titles, section_content))
    except Exception:
        sections = None
    try:
        links = wikipage.links
    except Exception:
        links = None
    return {'title': wikipage.title, 'summary': summary,
            'sections': sections, 'links': links}
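A minimal call sketch for convert_to_strings. The snippet does not say where wikipage comes from; the .title/.content/.links attributes are assumed here to match a page object from the wikipedia package:

import wikipedia

wikipedia.set_lang('zh')
page = wikipedia.page('長城')  # hypothetical article title
info = convert_to_strings(page)
print(info['title'], len(info['links'] or []))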
Code example #2
def convert_encoding_to_utf_8(filename):
    # Detect the file's encoding with chardet, convert the text to
    # Traditional Chinese, and rewrite it as UTF-8 (with BOM).
    # total_cnt/success_cnt and backup() are defined elsewhere in the project.
    global total_cnt, success_cnt

    flag1 = True  # filename needs conversion to Traditional
    flag2 = True  # content needs re-encoding
    flag3 = True  # content needs conversion to Traditional
    with codecs.open(filename, 'rb') as f:
        content = f.read()
    source_encoding = chardet.detect(content)['encoding']
    total_cnt += 1

    filename_trans = HanziConv.toTraditional(filename)
    if filename_trans == filename:
        flag1 = False

    if source_encoding != 'utf-8' and source_encoding != 'UTF-8-SIG':
        content = content.decode(source_encoding, 'ignore')
    else:
        # Already UTF-8; still decode so HanziConv gets text, not bytes.
        content = content.decode('utf-8-sig', 'ignore')
        flag2 = False

    content_trans = HanziConv.toTraditional(content)
    if content_trans == content:
        flag3 = False

    if flag1 or flag2 or flag3:
        backup(filename)
        os.rename(filename, filename_trans)

        with open(filename_trans, 'w', encoding='UTF-8-SIG') as file:
            file.write(content_trans)
        success_cnt += 1
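For reference, a hedged sketch of the chardet detection step this function relies on (the file path is hypothetical):

import chardet

raw = open('legacy_gbk_file.txt', 'rb').read()
print(chardet.detect(raw))
# e.g. {'encoding': 'GB2312', 'confidence': 0.99, 'language': 'Chinese'}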
Code example #3
File: ConvertFile.py  Project: Nickie-Li/FileConvert
def convert_dir(root_dir):
    global function_list

    # Check that the root path is valid
    if not os.path.exists(root_dir):
        print("[error] dir:", root_dir, "does not exist")
        return

    print("work in", root_dir)

    for root, dirs, files in os.walk(root_dir):  # recursively walk the folder
        '''Convert folder names'''
        ############################### Function: convert folder names to Traditional, start ###############################
        if function_list[0] == 1:
            root_trans = HanziConv.toTraditional(root)
            if root_trans != root:
                os.rename(root, root_trans)
        ############################### Function: convert folder names to Traditional, end #################################

    for root, dirs, files in os.walk(root_dir):  # recursively walk the folder

        # Work on each file
        for f in files:
            filename = os.path.join(root, f)
            filename_trans = filename  # default when filename conversion is disabled

            ########################### Function: convert filenames to Traditional, start ##################################
            if function_list[0] == 1:
                filename_trans = HanziConv.toTraditional(filename)
                if filename_trans != filename:
                    os.rename(filename, filename_trans)
            ########################### Function: convert filenames to Traditional, end ####################################

            # Read the file once
            if (function_list[1] == 1 or function_list[2] == 1) and any(suf in filename_trans for suf in suffix):
                content = codecs.open(filename_trans, 'rb').read()
                backup(filename_trans)

            ########################### Function: convert file encoding, start #############################################
            if function_list[1] == 1:
                try:
                    if any(suf in filename_trans for suf in suffix):
                        content = convert_encoding_to_utf_8(filename_trans, content)
                except Exception:
                    print("Fail Convert utf-8", filename)
            ########################### Function: convert file encoding, end ###############################################

            ########################### Function: convert file content to Traditional, start ###############################
            if function_list[2] == 1:
                try:
                    if any(suf in filename_trans for suf in suffix):
                        toTraditional(filename_trans, content)
                except Exception:
                    print("Fail Convert", filename)
            ########################### Function: convert file content to Traditional, end #################################
Code example #4
 def __init__(self, title, author, author_role, body, form='simplified'):
     if form == 'simplified':
         self.title = HanziConv.toSimplified(title)
         self.author = HanziConv.toSimplified(author)
         self.author_role = HanziConv.toSimplified(author_role)
         self.body = HanziConv.toSimplified(body)
     elif form == 'traditional':
         self.title = HanziConv.toTraditional(title)
         self.author = HanziConv.toTraditional(author)
         self.author_role = HanziConv.toTraditional(author_role)
         self.body = HanziConv.toTraditional(body)
     else:
         raise ValueError(f'Unrecognized form: {form}')
Code example #5
def trad_and_simp(inputString):
    '''
    Takes a unicode string containing Chinese characters and makes sure
    it contains both the traditional and simplified versions of every
    character. If both versions are not present, whatever is missing is
    added. The returned string is in no guaranteed order, just
    guaranteed to contain both character sets where possible.

    Parameters
    ----------
    inputString : String
        A string containing traditional and/or simplified Chinese
        characters. These will be expanded so that all simplified and
        traditional characters are present.

    Returns
    -------
    String
        A string that contains the traditional and simplified versions
        of every Chinese character found in the input string.
    '''
    totalSet = set(inputString)
    totalSet = totalSet.union(HanziConv.toSimplified(inputString))
    totalSet = totalSet.union(HanziConv.toTraditional(inputString))
    return "".join(totalSet)
Code example #6
def run(
    app: str = typer.Option(default="Spotify", help="Application to track"),
    debug: bool = typer.Option(default=False,
                               is_flag=True,
                               help="To show debug messages or not"),
    traditional: bool = typer.Option(
        default=False,
        is_flag=True,
        help="Translate lyrics into Traditional Chinese if possible",
    ),
):  # pragma: no cover
    {True: logger.enable, False: logger.disable}[debug]("touchbar_lyric")

    if not debug:
        logger.disable("touchbar_lyric")
        logger.disable("__main__")

    media_info = get_info(app)
    if media_info is None:
        return

    songs = universal_search(media_info.name, media_info.artists)

    for song in songs:
        if song.anchor(media_info.position):
            line: str = song.anchor(media_info.position)
            if traditional:
                line = HanziConv.toTraditional(line)
            print(line)
            break
Code example #7
File: tuling_utils.py  Project: chyikwei/pongibot
def send_reuqest(user_id, req_text, api_key):

    req_data = {
        "key": api_key,
        "info": HanziConv.toSimplified(req_text),
        #"loc": ""
        "userid": user_id
    }

    ret_data = {
        "success": False,
    }
    try:
        ret = requests.post(TULING_123_URL, data=req_data, timeout=TIMEOUT)
        if ret.status_code == 200:
            ret_data["success"] = True
            ret_data.update(ret.json())
            if 'text' in ret_data:
                txt = ret_data['text']
                ret_data['text'] = HanziConv.toTraditional(txt)
        else:
            print(ret.text)
    except requests.RequestException:
        pass

    return ret_data
Code example #8
def translate(translate_file_path):
    # Read the file, convert its text to Traditional Chinese, and
    # write it back in place.
    with open(file=translate_file_path, mode="r", encoding="utf-8") as file:
        content = file.read()
    with open(file=translate_file_path, mode="w", encoding="utf-8") as file:
        if content:
            content = HanziConv.toTraditional(content)
            file.write(content)
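A hedged usage sketch against a throwaway file:

import os
import tempfile

path = os.path.join(tempfile.gettempdir(), 'demo.txt')  # hypothetical path
with open(path, 'w', encoding='utf-8') as f:
    f.write('汉字转换')
translate(path)
print(open(path, encoding='utf-8').read())  # 漢字轉換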
Code example #9
File: util.py  Project: CheHaoKang/US_Stock
    def get_Xueqiu_categories(self):
        from hanziconv import HanziConv
        from selenium import webdriver
        from webdriver_manager.chrome import ChromeDriverManager

        url = 'https://xueqiu.com/hq#exchange=US&industry=3_2&firstName=3&page=1'
        while True:
            driver = None
            try:
                driver = webdriver.Chrome(ChromeDriverManager().install())
                driver.get(url)
                driver.implicitly_wait(10)

                soup = BeautifulSoup(driver.page_source, 'html.parser')
                categories = {}
                for ele in soup.find_all('i', {'class': 'list-style'}):
                    if re.search("明星股", ele.parent.text):  # "star stocks" section
                        for li in ele.parent.find_all('li'):
                            key = HanziConv.toTraditional(li.text).strip()
                            link = "https://xueqiu.com/hq{}".format(li.select('a')[0]['href'].strip())
                            categories[key] = link

                driver.quit()
                break
            except Exception:
                traceback.print_exc()
                if driver is not None:
                    driver.quit()

        self.GICS_csvs(categories)
Code example #10
File: util.py  Project: CheHaoKang/US_Stock
    def get_stock_info(self, stock_name, use_proxy=True):
        from hanziconv import HanziConv

        headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': 'http://xueqiu.com/p/ZH010389',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
            'Host': 'xueqiu.com',
            #'Connection':'keep-alive',
            #'Accept':'*/*',
            'cookie':'s=iabht2os.1dgjn9z; xq_a_token=02a16c8dd2d87980d1b3ddced673bd6a74288bde; xq_r_token=024b1e233fea42dd2e0a74832bde2c914ed30e79; __utma=1.2130135756.1433017807.1433017807.1433017807.1;'
            '__utmc=1; __utmz=1.1433017807.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); Hm_lvt_1db88642e346389874251b5a1eded6e3=1433017809; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1433017809'
        }

        counter = 0
        while counter < self.RETRY:
            counter += 1

            try:
                proxies = {}
                if use_proxy:
                    proxies = self.get_proxy()
                    print("PROXY => {:}".format(proxies))
                res = requests.get("https://xueqiu.com/S/" + stock_name, headers=headers, proxies=proxies, timeout=self.REQUEST_TIMEOUT)
                reGetStockInfo = re.compile(r"profile-detail.*?\">(.*?)<", re.S | re.UNICODE)
                for stockInfo in reGetStockInfo.findall(res.text):
                    return HanziConv.toTraditional(stockInfo)
            except Exception:
                traceback.print_exc()
                time.sleep(1)

        return ''
Code example #11
def cut(string, using_stopwords=True, simplified_convert=True, log=False):
    string = string.lower()
    if simplified_convert:
        string = HanziConv.toSimplified(string)
    with open(os.path.join(BASE_DIR, 'digit_mark.json'),
              encoding='utf-8') as data_file:
        digit_mark = json.load(data_file)
        for digit in digit_mark:
            string = string.replace(digit, ' ')
        tokens = list(jieba.cut_for_search(string))
        if simplified_convert:
            tokens = [HanziConv.toTraditional(i) for i in tokens]
        tokens = [i for i in tokens if i.strip() != '']
    if using_stopwords:
        with open(os.path.join(BASE_DIR, 'stopwords.txt'),
                  encoding='utf-8') as data_file:
            stopwords = [
                line.replace('\n', '') for line in data_file.readlines()
            ]
            if log:
                removed_tokens = [i for i in list(tokens) if i in stopwords]
                if len(removed_tokens) > 0:
                    print('token removed : ' + ", ".join(removed_tokens))
            tokens = [i for i in list(tokens) if i not in stopwords]
    else:
        tokens = list(tokens)
    return tokens
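A hedged call sketch; digit_mark.json and stopwords.txt under BASE_DIR are project assets this snippet assumes, and the exact tokens depend on jieba's dictionary:

tokens = cut('我今天很無聊,想聽個笑話')  # hypothetical input
print(tokens)  # e.g. ['無聊', '想', '聽', '笑話'] after stopword removal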
Code example #12
def print_in_line_reverse(row, msg, style, lang):
    '''
    msg: the text to lay out; row: the number of rows; style: the
    string used to join characters. Principle: each character's printed
    coordinates come from dividing its index by the row count (the
    quotient gives the column, the remainder gives the row).
    '''
    if not msg:
        return None
    if lang == 'S':
        msg = HanziConv.toSimplified(msg)
    elif lang == 'T':
        msg = HanziConv.toTraditional(msg)

    msg = sub(msg)
    len_col = math.ceil(len(msg) / row)  # round up

    big_line = ''
    for i in range(row):
        line = ''
        for j in range(len_col):
            try:
                line += msg[j * row + i] + style  # pick characters by direct indexing
            except IndexError:
                line += '㍐' + style  # pad when the grid runs past the text
        line = line[::-1]
        big_line += line + '<br>'
    return big_line
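A hedged call sketch. sub() is a project helper that is not shown here, so a pass-through stand-in is defined to make the call runnable:

sub = lambda s: s  # stand-in for the project's sub() helper (assumption)
html = print_in_line_reverse(2, '床前明月光', '|', 'T')
print(html)  # two reversed rows joined by '<br>', padded with '㍐'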
Code example #13
 def get_people_name(self):
     content = self.get_main_content()
     if content is not None:
         term_list = segment.seg(HanziConv.toSimplified(content))
         for term in term_list:
             if str(term.nature) == NLP_Constant.people_name_pos:
                 return HanziConv.toTraditional(str(term.word))
     return None
Code example #14
 def simplified_to_traditional(self):
     logging.info("Waiting... (Simplified to Traditional)")
     with open("traditional.txt", "w", encoding="utf-8") as traditional, \
          open("wiki_text.txt", "r", encoding="utf-8") as simplified:
         for s in simplified:
             traditional.write(HanziConv.toTraditional(s))
     print("Conversion from Simplified to Traditional succeeded!")
Code example #15
File: ConvertFile.py  Project: Nickie-Li/FileConvert
def toTraditional(filename, content):

    content_trans = HanziConv.toTraditional(content)
    
    if content_trans != content:
        # Write with utf8 encoding
        with open(filename, 'w', encoding='UTF-8-SIG') as file:
            file.write(content_trans)
Code example #16
import jieba as jb  # jb is assumed to be jieba, matching jb.cut below


def pre_process(text):
    text = HanziConv.toTraditional(text)

    # load cantonese corpus
    # jb.load_userdict('util/dict/canto_dict.txt')
    vocabs = list(jb.cut(text))
    pp_text = " ".join(vocabs)
    return pp_text
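A hedged usage sketch; the exact segmentation depends on jieba's dictionary:

print(pre_process('我想买一杯咖啡'))  # e.g. '我 想 買 一杯 咖啡'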
Code example #17
def create_post():
    form = PostForm()
    if form.validate_on_submit():
        chinese = HanziConv.toTraditional(form.chinese_content.data)
        title = HanziConv.toTraditional(form.title.data)
        post = Post(author=current_user,
                    title=title,
                    chinese_content=chinese,
                    content=form.content.data,
                    tags=form.tags.data)
        db.session.add(post)
        db.session.commit()
        flash('Your post has been created!', 'success')
        return redirect(url_for('home'))
    return render_template('create_post.html',
                           title='New Post',
                           form=form,
                           legend='New Post')
Code example #18
 def process_text(self):
     logging.info("等待中..(簡 to 繁)")
     with open('./word2vec_data/traditional.txt', 'w',
               encoding='utf-8') as fw:
         with open('./word2vec_data/wiki_text.txt', 'r',
                   encoding='utf-8') as f:
             for line in f:
                 line = HanziConv.toTraditional(line)
                 fw.write(line)
Code example #19
File: QA_v2.py  Project: ziu-ting/Team-Project
def inputTest():
    x = input("請說話:")
    # x:token
    y = jerry.get_response(x)
    y = HanziConv.toTraditional(y.text)

    print(type(x))
    print(type(y))
    print(y)
Code example #20
File: NLP_Main.py  Project: JE-Chen/Python_NLP_JE
 def Transform_ZhTw_Save(self, File_Name, Next_FileName):
     # Read the source as UTF-8 text, convert each line to Traditional
     # Chinese, and write the result to a new UTF-8 file.
     FileRead = []
     with open(File_Name, 'r', encoding='utf-8') as RawFile:
         for line in RawFile:
             FileRead.append(HanziConv.toTraditional(line))
     with open(Next_FileName, 'w', encoding='utf-8') as Next_File:
         Next_File.writelines(FileRead)
Code example #21
File: TextRankJob.py  Project: aurora314156/textrank
def textrankJob(n):
    # get keyword
    keyword = textrankGet(n)
    # read testdata line by line
    for i in range(1, 8):
        with open('./finalResult/' + n + 'dataset' + str(i) + '.csv',
                  'w',
                  newline='',
                  encoding='utf-8') as res:
            writer = csv.writer(res)
            with open('./testData/dataset' + str(i) + '.txt',
                      'r',
                      newline='',
                      encoding='utf-8') as txtfile:
                tr = txtfile.readlines()
                flag = True
                for t in tr:
                    if flag:
                        article = t
                    else:
                        # store keyword match on article content
                        keywordMatch = []
                        content = t
                        # start match keyword and content
                        for index in keyword:
                            temp = []
                            for k in index:
                                if n == 'tfidf':
                                    k = HanziConv.toTraditional(k)
                                if k in content:
                                    temp.append(k)
                            keywordMatch.append(temp)
                        # write match result to csv
                        writer.writerow([article.strip()])
                        writer.writerow([content.strip()])
                        if n == 'tfidf':
                            tempkeyword = []
                            string = "Result:"
                            tempkeyword.append(string)
                            for k in keywordMatch[0]:
                                tempkeyword.append(k)
                            writer.writerow(tempkeyword)
                            writer.writerow("\n")
                        else:
                            exp_value = [0.4, 0.5, 0.6]
                            for j in range(3):
                                tempkeyword = []
                                tempkeyword.append(exp_value[j])
                                for k in keywordMatch[j]:
                                    tempkeyword.append(k)
                                writer.writerow(tempkeyword)
                            writer.writerow("\n")

                    flag = not flag

    print("------------------------------------------")
Code example #22
 def preprocess(self, line, cond=None):
     line = HanziConv.toTraditional(line)
     # line = re.sub(r"\@[a-z0-9][a-z0-9]*", '', line)
     # line = re.sub(r"\#[a-z0-9][a-z0-9]*", '', line)
     # line = re.split(r"\([a-z][a-z]\)", line.lower())[0]
     if cond == 'only_zh':
         words = [w for w in jieba.cut(line) if is_zh.search(w)]
         line = ' '.join(words)
     line = re.sub("\s+", ' ', line).strip().lower()
     return line
Code example #23
def get_words(path):
    # Read one word per line and convert each to Traditional Chinese.
    words = []
    with codecs.open(path, 'r', 'utf8') as f:
        for line in f:
            word = line.strip()
            if word:
                words.append(HanziConv.toTraditional(word))
    return words
Code example #24
File: processwords.py  Project: wyhfrank/kindle2anki
def to_traditional_chinese(content):
    converted = content
    try:
        from hanziconv import HanziConv
        converted = HanziConv.toTraditional(content)
    except ImportError:
        logging.warning(
            'You need to install the Python module "hanziconv" to convert to traditional Chinese.'
        )
    return converted
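A quick check of the graceful-degradation design: with hanziconv installed the text is converted, otherwise it comes back unchanged after a warning:

print(to_traditional_chinese('汉字'))  # '漢字' if hanziconv is installed, '汉字' otherwise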
Code example #25
    def checkUpdate(self):
        # Re-fetch the latest chapter and report whether it changed.
        _, latest_chapter_title = self.getLatestChapter()

        if latest_chapter_title != self.latest_chapter_title:
            self.latest_chapter_url, self.latest_chapter_title = self.getLatestChapter()
            self.latest_chapter_title_cht = HanziConv.toTraditional(
                self.latest_chapter_title)
            return True
        else:
            return False
Code example #26
 def __init__(self, name, url) -> None:
     self.name = name
     self.url = url
     self.code = url.rsplit("/")[-2]
     self.a_link = f"/comic/{self.code}/"
     self.chapter_count = 0
     self.latest_chapter_url, self.latest_chapter_title = self.getLatestChapter()
     self.latest_chapter_title_cht = HanziConv.toTraditional(
         self.latest_chapter_title)
Code example #27
def chatBot_GET_Google(question):
    url = 'https://www.google.com.tw/search?q=' + question + '+維基百科'
    response = requests.get(url)
    if response.status_code == 200:
        bs = BeautifulSoup(response.text, 'lxml')
        wiki_url = bs.find('cite')
        kwd = wiki_url.text.split('/')[-1]
        keyword_trad = HanziConv.toTraditional(kwd)
        return keyword_trad
    else:
        print('Request failed')
Code example #28
    def concept_lookup(self):

        print('Found only one concept, so fetch at most 10 of its commonsense entries')

        # Look up in Chinese first
        local_commonsense = Query.base_lookup(HanziConv.toTraditional(self.conceptions))

        if not local_commonsense:
            # If nothing was found, translate to English and look up again
            local_commonsense = Query.base_lookup(self.translator.zh_to_en(self.conceptions))
        self.commonsense = set(local_commonsense)
Code example #29
def subot_getGoogle(question):
    url = f'https://www.google.com.tw/search?q={question}+維基百科'
    response = requests.get(url)
    if response.status_code == 200:
        bs = BeautifulSoup(response.text, 'lxml')
        wiki_url = bs.find('cite')
        kwd = wiki_url.text.split('/')[-1]
        keyword_trad = HanziConv.toTraditional(kwd)
        return keyword_trad
    else:
        print('Failed to parse and convert the keyword....')
Code example #30
 def articles_parser_insert_mysql(self):  #74218
     self.cursor.execute(
         "SELECT id, title, content FROM articles where id >= 198886 and id <= 200000"
     )
     sql = "INSERT INTO articles_parser (id, title_parser_result, content_parser_result) VALUES (%s, %s, %s)"
     results = self.cursor.fetchall()
     for record in results:
         index = record[0]
         title = record[1]
         content = record[2]
         print(index)
         print(title, end="\n\n")
         print(content)
         if title != "":
             title_parser_result = parsing.Parser(
                 re.sub(
                     r"[\s+\.\【\】\‧\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+",
                     "", title))
             if len(title_parser_result) != 0:
                 if title_parser_result[0] == "error":
                     title = HanziConv.toTraditional(title)
                     title_parser_result = parsing.Parser(
                         re.sub(
                             r"[\s+\.\【\】\‧\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+",
                             "", title))
             else:
                 continue
         else:
             continue
         content = re.sub(r'\、|\,|\。|\?|\?|\;|\;|\:|\~|\:|\⋯|\!', '\n',
                          content)
         content_parser_result = ""
         for line in content.split("\n"):
             line = re.sub(
                 r"[\s+\.\【\】\‧\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+",
                 "", line)
             if len(line) >= 4 and '★' not in line and '◆' not in line:
                 print(line)
                 parser_result = parsing.Parser(line)
             else:
                 continue
             if line == "" or len(
                     parser_result) is not 1 or parser_result[0] == 'error':
                 continue
             content_parser_result += parser_result[0]
             content_parser_result += "@"
         time.sleep(self.sleep)
         val = (index, title_parser_result[0], content_parser_result)
         print(title_parser_result[0], end="\n\n")
         print(content_parser_result)
         self.cursor.execute(sql, val)
         self.db.commit()
     self.db.close()
Code example #31
def generate():
    if win.img_shown == 0:
        win.textBrowser.setText("請先開啟圖片")
        win.textBrowser.setFont(QtGui.QFont("Noto Sans Mono CJK TC", 17))
    else:
        win.textBrowser.setText('請稍等...')
        win.textBrowser.setFont(QtGui.QFont("Noto Sans Mono CJK TC", 17))
        predicted_cap = HanziConv.toTraditional(
            predict('./train_captions', "./ckpt-20", win.img_path))
        win.textBrowser.setText(predicted_cap)
        # win.textBrowser.setFont(win.def_font)
        speak(predicted_cap)
Code example #32
File: classical.py  Project: athoo/classicalera
def hello():
    name = request.form['checking']
    temp_name = HanziConv.toTraditional(name)
    # name = HanziConv.toSimplified(name)
    name = name.encode('utf-8')
    name = urllib2.quote(name)
    url_tem = "http://csclab11.cs.nthu.edu.tw:5000/?q=%s" % name
    result = urllib2.urlopen(url_tem).read()
    d = json.loads(result)
    kangxi = HanziConv.toTraditional(d["result"])
    kangxi = kangxi.encode('utf-8')
    kangxi = urllib2.quote(kangxi)
    url_kang = "http://kxgen.mqstudiotw.com/?%s" % kangxi
    kangxi_result = urllib2.urlopen(url_kang)  # fire the Kangxi generator request
    return render_template('index.html', name=temp_name, result=d["result"])
Code example #33
def writeDBF(filePattern, fullFilePath, dicInput):
	global dbfFileHandle
	global dbfFileIndex
	global writeMax
	# dbfFileHandle = None
	# dbfFileIndex = None

	insertCount = 0
	updateCount = 0
	bFileExists = os.path.exists(fullFilePath) 

	dtWriteDBFStart = datetime.datetime.now()
	# logger.debug("write DBF start")
	today = dtWriteDBFStart.strftime("%Y%m%d")
	fileName = today
	strToken = ""
	if filePattern == "0":
		strToken = "SH"
		fileName += ".SH.txt"
	elif filePattern == "1":
		strToken = "SZ"
		fileName += ".SZ.txt"

	with open(fileName, "w") as text_file:
		for key, value in dicInput.iteritems():
			insertCount += 1

			value = HanziConv.toTraditional(value)
			try:
				value = value.decode("utf8")
			except:
				pass

			strWrite = (u"%s.%s,%s\n" % (key, strToken, value))
			text_file.write(strWrite.encode('utf8'))

	dtWriteDBFEnd = datetime.datetime.now()

	logger.debug("write count : " + str(insertCount) + "/" + str(updateCount))
	logger.debug("write DBF end (" + str(dtWriteDBFEnd - dtWriteDBFStart) + ")")
Code example #34
def get_json_from_page(page):
    from hanziconv import HanziConv
    stopwords = load_stop_words()
    # page.categories is a list, so its items are joined; page.summary
    # is already a single string.
    cat_constrain_set = set(tokenize(HanziConv.toTraditional("。".join(page.categories)), stopwords))
    summary_constrain_set = set(tokenize(HanziConv.toTraditional(page.summary), stopwords))
    return get_places(page.title, cat_constrain_set | summary_constrain_set)
Code example #35
def gen_response(keyword_list):
    # Canned replies keyed by Traditional keyword:
    # "笑話" ("joke") -> "Do you want to hear me tell a joke?"
    # "無聊" ("bored") -> "Then how about listening to a joke?"
    dic = {"笑話": "你想要聽我說個笑話嗎", "無聊": "那聽個笑話好嗎"}

    ans = dic[HanziConv.toTraditional(keyword_list[0])]
    print(ans)
Code example #36
File: test_chinese.py  Project: wiki-ai/revscoring
    # but it's used this way
    "操你", "草你", "日你",  # f**k you
    "操他", "草他", "日他",  # f**k his
    "操她", "草她", "日她",  # f**k her

    # Discrimination (racial slurs)
    "小日本",  # little Japanese
    "台湾狗",  # Taiwanese dogs
    "共产中国",  # communist Chinese
    "流氓国家",  # rogue country
    "人渣",  # human slag
    "我去",  # this is verbal and bad
    "鬼子"  # devil, usually a suffix
]
BAD = [HanziConv.toSimplified(word) for word in bad_init] + \
      [HanziConv.toTraditional(word) for word in bad_init]

INFORMAL = [
    # Hello
    "你好",  # nǐ hǎo; The standard "hello" greeting.
    "您好",  # nín hǎo; The same "hello" greeting as above
    "你怎么样",  # nǐ zěnmeyàng?; "What's up?", "How are you doing?"

    # Good afternoon
    "午安",  # wǔ'an; note: seldom used in the Mainland.
    "下午好",  # xìawǔ hǎo! Seldom used in the Republic of China

    # Good evening / Good night
    "晚安",  # wǎn'an; Literally "Peace at night", Good night.
    "晚上好",  # wǎnshang hǎo; Good evening!
Code example #37
        m = re.search(ur"^(\[.+?\])(.+?):", s)
        if m:
            s = m.group(2) + m.group(1)
        else:
            m = re.search(ur"^\[.+?\](.*)", s)
            if m:
                s = m.group(1)
    return s

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()

    parser.add_argument("input", action="store", nargs = 1)
    parser.add_argument("output", action="store", nargs = 1)
    parser.add_argument("--encoding", action="store", default="utf_8_sig", nargs=1)
    parser.add_argument("--traditional", action="store_true", default=False)
    args = parser.parse_args()

    buf = codecs.open(args.input[0], "rb", args.encoding).read()

    if args.traditional:
        buf = HanziConv.toTraditional(buf)
    else:
        buf = HanziConv.toSimplified(buf)

    lines = buf.split("\n")
    lines.sort(key=sort_func)
    codecs.open(args.output[0], "wb", args.encoding).writelines(lines)
Code example #38
def get_sentences(page):
    from hanziconv import HanziConv
    sentences = []
    for line in HanziConv.toTraditional(page.content).splitlines():
        sentences.extend(line.split('。'))
    return sentences
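A hedged sketch with a stand-in page object (the real page presumably comes from a wikipedia API wrapper exposing .content):

class _Page:  # hypothetical stand-in
    content = '今天天气很好。我们去公园吧。'

print(get_sentences(_Page()))  # ['今天天氣很好', '我們去公園吧', '']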
Code example #39
new_lines = []

n = 0
for line in lines:
    if line[0] in "#%":
        new_lines.append(line)
        continue
    try:
        cmd, value = line.strip(' ').decode('utf-8').split(u' ', 1)
    except ValueError as e:
        # '\t' 鍵盤對應部份
        new_lines.append(line)
        continue

    newv = HanziConv.toTraditional(value)
    if newv != value:
        # print value ,
        # print ' -> ',
        # print newv
        n += 1
    elif len(value.strip()) > 1:
        print value.strip()
    else:
        newl = line.strip().split(' ')[0].decode('utf-8') + ' ' + newv
        new_lines.append(newl.encode('utf-8'))

print len(lines)
print n