def traditional2simple(file_path, out_path):
    try:
        cc = opencc.OpenCC("t2s")
        with codecs.open(file_path, mode='r', encoding='utf-8') as rf:
            text_content = cc.convert(rf.read())
        with codecs.open(out_path, mode='w', encoding='utf-8') as wf:
            wf.write(text_content)
        return True
    except Exception:
        return False
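# Usage sketch for traditional2simple above (not part of the original snippet;
# the file names are hypothetical placeholders). Assumes `codecs` and `opencc`
# are imported at module level.
ok = traditional2simple("poems_traditional.txt", "poems_simplified.txt")
print("converted" if ok else "conversion failed")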
def read_corpus_2(dir_path):
    """Read the most recent dataset: Tang and Song poems."""
    sents_src = []
    sents_tgt = []
    tokenizer = Tokenizer(word2idx)
    cc = opencc.OpenCC('t2s')  # the converter is created once and reused for every poem
    files = os.listdir(dir_path)  # all file names under the directory
    for file1 in files:  # iterate over the directory
        if not os.path.isdir(file1):  # only open entries that are not directories
            file_path = dir_path + "/" + file1
            print(file_path)
            # data = json.load(file_path)
            with open(file_path) as f:
                poem_list = eval(f.read())
            for each_poem in poem_list:
                string_list = each_poem["paragraphs"]
                poem = ""
                for each_s in string_list:
                    poem += each_s
                poem = cc.convert(poem)
                encode_text = tokenizer.encode(poem)[0]
                if word2idx["[UNK]"] in encode_text:  # filter out poems containing [UNK]
                    continue
                title = cc.convert(each_poem["title"])
                if len(title) > 10 or len(title) < 1:  # drop poems whose titles are too long or too short
                    continue
                if len(poem) == 24 and (poem[5] == "," or poem[5] == "。"):  # 五言绝句 (five-character quatrain)
                    sents_src.append(title + "##" + "五言绝句")
                    sents_tgt.append(poem)
                elif len(poem) == 32 and (poem[7] == "," or poem[7] == "。"):  # 七言绝句 (seven-character quatrain)
                    sents_src.append(title + "##" + "七言绝句")
                    sents_tgt.append(poem)
                elif len(poem) == 48 and (poem[5] == "," or poem[5] == "。"):  # 五言律诗 (five-character regulated verse)
                    sents_src.append(title + "##" + "五言律诗")
                    sents_tgt.append(poem)
                elif len(poem) == 64 and (poem[7] == "," or poem[7] == "。"):  # 七言律诗 (seven-character regulated verse)
                    sents_src.append(title + "##" + "七言律诗")
                    sents_tgt.append(poem)
    print("Second poem dataset: " + str(len(sents_src)) + " poems in total")
    return sents_src, sents_tgt
def convert(infile: str, outfile: str, cfg: str):
    """read >> convert >> write file

    Args:
        infile (str): input file
        outfile (str): output file
        cfg (str): config
    """
    converter = opencc.OpenCC(cfg)
    with open(infile, "r") as inf, open(outfile, "w+") as outf:
        # strip the trailing newline so the "\n".join does not double-space the output
        outf.write("\n".join(converter.convert(line.rstrip("\n")) for line in inf))
    print(f"Convert to {outfile}")
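# Usage sketch for convert above (not part of the original snippet). "t2s" is
# one of OpenCC's built-in configurations (Traditional -> Simplified); the
# file names are hypothetical placeholders.
convert("input_traditional.txt", "output_simplified.txt", "t2s")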
def __init__(self, use_cuda, pre_process=False):
    self.simplified_to_traditional = opencc.OpenCC('s2t')
    self.use_cuda = use_cuda
    self.train_x = None
    self.train_y = None
    self.test_x = None
    self.test_y = None
    self.tag2id = None
    if pre_process:
        self.pre_process()
    else:
        self.load_data()
def convert(src_path, dst_path, cfg='s2twp.json'):
    converter = opencc.OpenCC(cfg)
    with open(src_path, "r", encoding='utf-8') as src, open(dst_path, "w+", encoding='utf-8') as dst:
        # fix relative image paths and patch common s2twp over-conversions of 發/髮
        dst.write("\n".join(
            converter.convert(line.rstrip())
            .replace('(img/', '(../img/')
            .replace('髮送', '傳送')
            .replace('髮布', '釋出')
            .replace('髮生', '發生')
            .replace('髮出', '發出')
            for line in src))
    print("convert %s to %s" % (src_path, dst_path))
def setOutputSimplifiedChinese(self, outputSimpChinese):
    self.outputSimpChinese = outputSimpChinese
    # create an OpenCC instance for Traditional-to-Simplified conversion
    if outputSimpChinese:
        if not self.opencc:
            self.opencc = opencc.OpenCC(opencc.OPENCC_DEFAULT_CONFIG_TRAD_TO_SIMP)
    else:
        self.opencc = None
    self.updateSwitchLangIcon = True
    self.updateLangButtons()
def load_dev(path='/home/dy/flat-chinese-ner/data/test.txt', simplify=True):
    test_data = []
    with open(path, 'r', encoding='utf8') as f:
        file_text = f.read().encode('utf-8').decode('utf-8-sig')
    converter = opencc.OpenCC('t2s.json')
    if simplify:
        file_text = converter.convert(file_text)
    datas = file_text.split('\n\n--------------------\n\n')[:-1]
    for doc in datas:
        _, doc = doc.split('\n')
        test_data.append(doc)
    return test_data
def convert(infile: str, outfile: str, cfg: str):
    """read >> convert >> write file

    Args:
        infile (str): input file
        outfile (str): output file
        cfg (str): config
    """
    converter = opencc.OpenCC(cfg)
    with open(infile, "r") as inf, open(outfile, "w+") as outf:
        data = inf.readlines()
        data = list(map(converter.convert, data))
        outf.writelines(data)
    print(f"Convert to {outfile}")
def __init__(self, options):
    self.critical = False
    super().__init__(options)
    self.drop_zh = False
    self.opencc_version = "N/A"
    try:
        import opencc
        self.s2tw = opencc.OpenCC('s2twp.json')
        self.tw2s = opencc.OpenCC('tw2sp.json')
        self.opencc_version = opencc.__version__
    except Exception as e:
        if self.critical:
            raise e
        else:
            print("[warning] zhconv: opencc load failed, zhconv disabled")
            print("[warning] zhconv: disabling zh-cn, zh-tw build")
            self.drop_zh = True
def wiki_to_txt(file_name, output_name):
    logging.info("wiki_to_txt started")
    wiki_corpus = WikiCorpus(file_name, dictionary={})
    texts_num = 0
    converter = opencc.OpenCC('s2t.json')
    with open(output_name, 'w', encoding='utf-8') as output:
        for texts in wiki_corpus.get_texts():
            r = converter.convert(' '.join(texts))
            output.write(r + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("processed %d articles" % texts_num)
    logging.info("wiki_to_txt finished")
def convert2simple():
    # dir_path is assumed to be defined at module level
    cc = opencc.OpenCC('t2s')
    for i in range(1, 5):
        src_file = dir_path + "wiki_texts" + str(i) + ".txt"
        des_file = dir_path + "wiki_simple" + str(i) + ".txt"
        with open(src_file, 'r', encoding='utf-8') as f, \
                open(des_file, 'w', encoding='utf-8') as des_f:
            for line in f:
                content = cc.convert(line.rstrip('\n'))
                print(content)
                des_f.write(content + '\n')
        print(str(i) + " finished.")
def merge_files(root, target, convert):
    converter = opencc.OpenCC('s2t.json')
    file_names = get_all_files(root)
    output = open(target, "w+", encoding='utf-8', errors='ignore')
    for file_name in file_names:
        f = open(root + file_name, "r", encoding='utf-8', errors='ignore')
        for line in tqdm(f.readlines(), desc="merge lines in a file"):
            if convert:
                output.write(converter.convert(line))
            else:
                output.write(line)
        f.close()
    output.close()
def load_texts_labels(self, filename_label_dict, pickle_name):
    '''
    Load the text-format dataset, split it into training and test sets,
    and save the dataset as pickle files.
    :param filename_label_dict: mapping from file name to label
    :param pickle_name: base name of the saved pickle files
    :return:
    '''
    texts, labels = [], []
    cc = opencc.OpenCC('mix2s')  # the converter is created once and reused for every line
    # load the dataset
    for filename, label in filename_label_dict.items():
        # label = filename_label_dict[filename]
        file_path = os.path.join(self.DATA_DIR, filename)
        data = open(file_path, 'r', encoding='utf-8')
        cnt = 0
        for line in data:
            try:
                tokens = [t for t in jieba.lcut(line.strip()) if t not in self.stopwords]
                text = ' ' + ' '.join(tokens)
                # preprocessing: convert Traditional Chinese to Simplified
                texts.append(cc.convert(text))
                labels.append(label)
            except Exception as e:
                print('{}:\n{}'.format(e, data))
            cnt += 1
            if cnt % 1000 == 0:
                print('Processed {} records.'.format(cnt))
        print('Done processing {} records.'.format(cnt))
        data.close()
    self.dataset['texts'] = texts
    self.dataset['labels'] = labels
    # save the dataset as pickle files
    pickle_texts = '{}_texts.pk'.format(pickle_name)
    pickle_labels = '{}_labels.pk'.format(pickle_name)
    with open(os.path.join(self.DATA_DIR, pickle_texts), 'wb') as f_texts:
        pickle.dump(self.dataset['texts'], f_texts)
    with open(os.path.join(self.DATA_DIR, pickle_labels), 'wb') as f_labels:
        pickle.dump(self.dataset['labels'], f_labels)
    # split the dataset into training and validation sets
    self.train_x, self.valid_x, self.train_y, self.valid_y = \
        model_selection.train_test_split(self.dataset['texts'], self.dataset['labels'])
def main():
    converter = opencc.OpenCC('t2s.json')
    book = sys.argv[1]
    with open(book, "rb") as f:
        data = f.read()
    text = data.decode('utf-8', errors='ignore')
    # RE = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎|,|。]', re.UNICODE)
    RE = re.compile('[\u4e00-\u9fff|\u3001-\u303F|\uff01-\uff5d|\u2160-\u217F|\u203B|\u30FB|\u2027|\u25a0|\u2500|\uff5e|\u2026\u25c6|\u2460-\u2487]', re.UNICODE)
    chinese = RE.findall(text)
    tcscript = ''.join(chinese)
    sc = converter.convert(tcscript)
    sc = strQ2B(sc)
    with open(book + ".txt", "a+", encoding="utf-8") as f1:
        f1.write(sc)
        f1.write('\n\n')
def t2s(self, traditional_file, clean_file):
    """
    Traditional-to-Simplified conversion.
    :param traditional_file:
    :param clean_file:
    :return:
    """
    converter = opencc.OpenCC('t2s.json')
    with open(traditional_file, 'r', encoding='utf-8') as rf:
        batch_data = []
        for line in rf:
            line = line.strip().split('\t')
            line[1] = converter.convert(line[1])
            batch_data.append(line[0] + '\t' + line[1])
    _write_data2file(batch_data, clean_file)
def _create_examples(lines, set_type=None):
    """Creates examples for the training and dev sets."""
    # re_ENUM = re.compile(r"([-.a-zA-Z0-9]+)")
    re_ENUM = re.compile(r'(([-–+])?\d+(([.·])\d+)?%?|([0-9_.·]*[A-Za-z]+[0-9_.·]*)+)')
    converter = opencc.OpenCC('t2s')

    def _labels_words(p_text_segment):
        inside_tokens = []
        inside_labels = []
        for segment in p_text_segment:
            hyper_tokens = segment.split()
            segment_tokens = []
            for hyper_token in hyper_tokens:
                hyper_token = hyper_token.strip()
                if len(hyper_token) > 0:
                    is_chinese = False
                    for c in hyper_token:
                        if process.process_utils.is_cjk_char(ord(c)):
                            is_chinese = True
                            break
                    if is_chinese:
                        segment_tokens.extend(list(hyper_token))
                    else:
                        segment_tokens.append(hyper_token)
            inside_tokens.extend(segment_tokens)
            if len(segment_tokens) == 1:
                inside_labels.extend(["A"])
            elif len(segment_tokens) > 1:
                inside_labels.extend(["BS"] + ["A"] * (len(segment_tokens) - 2) + ["ES"])
        return inside_tokens, inside_labels

    for (i, line) in enumerate(lines):
        # Only the test set has a header
        line = convert_to_unicode(line.strip())
        text = str.lower(process.process_utils.strQ2B(line))
        text = converter.convert(text)
        text = re_ENUM.sub(" \\1 ", text)
        text_segment = text.split("☃")
        tokens, labels = _labels_words(text_segment)
        o_text = re.sub(r"\s|☃", "", line)
        offset = 0
        o_tokens = []
        for token in tokens:
            o_tokens.append(o_text[offset: offset + len(token)])
            offset += len(token)
        yield InputExample(guid=o_tokens, text=tokens, labels=labels)
def wiki_replace(d):
    s = d[1]
    s = re.sub(r':*{\|[\s\S]*?\|}', '', s)
    s = re.sub(r'<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}', r'\1[[\2]]', s)
    s = filter_wiki(s)
    s = re.sub(r'\* *\n|\'{2,}', '', s)
    s = re.sub(r'\n+', '\n', s)
    s = re.sub(r'\n[:;]|\n +', '\n', s)
    s = re.sub(r'\n==', '\n\n==', s)
    s = u'【' + d[0] + u'】\n' + s
    cc = opencc.OpenCC('t2s')
    s = cc.convert(s)
    # print('start.............')
    # print(s)
    return s
def getLSTMPredict(ans):
    data = {}
    module_lstm_dir = os.path.join(module_dir, 'lstmB')
    jieba.set_dictionary(os.path.join(module_lstm_dir, 'dict_v2.txt'))
    with open(os.path.join(module_lstm_dir, 'stopwords_only_symbol_v2.txt'), 'r', encoding='utf8') as f:
        stops_symbol = f.read().split('\n')
    input_str = ans  # input: the news headline
    # print(f'input_str:{input_str}')
    converter = opencc.OpenCC('s2twp.json')
    s2twp_str = converter.convert(input_str)
    # print(f's2twp_str:{s2twp_str}')
    jieba_str = ' '.join([t for t in jieba.cut_for_search(str(s2twp_str)) if t not in stops_symbol])
    input_data_np = np.array([jieba_str])
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore(
        os.path.join(module_lstm_dir, 'search_jieba_no_stopwords_train_vocab.pickle'))
    input_data_pd = np.array(list(vocab_processor.transform(input_data_np)))
    tf.reset_default_graph()
    saver = tf.train.import_meta_graph(
        os.path.join(module_lstm_dir, 'search_jieba_no_stopwords_train_vocab.ckpt.meta'))
    with tf.Session() as sess:
        saver.restore(sess, os.path.join(module_lstm_dir, 'search_jieba_no_stopwords_train_vocab.ckpt'))
        prob_and_ans = {"Placeholder:0": input_data_pd, "Placeholder_2:0": 1}
        prob = sess.run("probability:0", feed_dict=prob_and_ans)
        ans = sess.run("ans:0", feed_dict=prob_and_ans)
        # print(f'probability: {prob}')  # print the higher probability
        # print(f'ans: {ans}')  # print real or fake (1 = real, 0 = fake)
        if ans[0].item() == 0:
            data['result'] = False
        else:
            data['result'] = True
        data['confidence'] = prob[0].item()
        # print(f'判斷:{ans},信心:{prob}')
        # print(f'ans:{type(ans[0])},prob:{type(prob[0])}')
    data['success'] = True
    return data
def __init__(self, vocab_file, lowercase=True, strip_accents=False, clean_text=True, cc=None):
    self.vocab_file = vocab_file
    self.cc = None
    if cc is not None:
        # pip install opencc-python-reimplemented
        import opencc
        self.cc = opencc.OpenCC(cc)
    from tokenizers import BertWordPieceTokenizer
    self._tokenizer = BertWordPieceTokenizer(self.vocab_file,
                                             lowercase=lowercase,
                                             strip_accents=strip_accents,
                                             clean_text=clean_text)
def preproc():
    rst_items = []
    convertor = opencc.OpenCC('tw2sp.json')
    test_items = proc_test_set('data', convertor)
    for item in read_data(get_abs_path('data')):
        rst_items += proc_item(item, convertor)
    for item in read_confusion_data(get_abs_path('data')):
        rst_items += proc_confusion_item(item)

    # split into train and dev sets
    dev_set_len = len(rst_items) // 10
    print(len(rst_items))
    random.seed(666)
    random.shuffle(rst_items)
    dump_json(rst_items[:dev_set_len], get_abs_path('data', 'dev.json'))
    dump_json(rst_items[dev_set_len:], get_abs_path('data', 'train.json'))
    dump_json(test_items, get_abs_path('data', 'test.json'))
    gc.collect()
def gen_data(file_name):
    converter = opencc.OpenCC('t2s.json')
    actions = list()
    with open(file_name) as f:
        for line in tqdm(f):
            doc = json.loads(line.strip())
            actions.append({
                "_index": 'zhwiki',
                '_source': {
                    'title': converter.convert(doc['title']),
                    'text': converter.convert(doc['text'])
                }
            })
            if len(actions) >= 100:
                yield actions
                actions = list()
    if len(actions) > 0:
        yield actions
def read2df(mnt_txt):
    cc = oc.OpenCC("t2s")
    with open(mnt_txt, "r", encoding="utf-8") as f:
        data = f.read()
    data_list = data.split("\n")
    eng_list, chn_list = [], []
    df = pd.DataFrame()
    for dl in data_list[:-1]:
        dls = dl.split("\t")
        # print(dls)
        eng_list.append(split_dot(dls[0]))
        chn_list.append(cc.convert(dls[1]))
    df["eng"] = eng_list
    df["chn"] = chn_list
    print(df.head(5))
    df.to_csv("cmn.csv", index=None)
    print("save csv")
    return df
def test_conversion():
    import opencc
    for inpath in glob(os.path.join(_test_assets_dir, '*.in')):
        pref = os.path.splitext(inpath)[0]
        config = os.path.basename(pref)
        converter = opencc.OpenCC(config)
        anspath = '{}.{}'.format(pref, 'ans')
        assert os.path.isfile(anspath)
        with open(inpath, 'rb') as f:
            intexts = [l.strip().decode('utf-8') for l in f]
        with open(anspath, 'rb') as f:
            anstexts = [l.strip().decode('utf-8') for l in f]
        assert len(intexts) == len(anstexts)
        for text, ans in zip(intexts, anstexts):
            assert converter.convert(text) == ans, \
                'Failed to convert {} for {} -> {}'.format(pref, text, ans)
def export(words):
    result = ""
    converter = opencc.OpenCC('t2s.json')
    HANZI_RE = re.compile('^[\u4e00-\u9fa5]+$')
    count = 0
    last_word = None
    for line in words:
        line = line.rstrip("\n")
        if not HANZI_RE.match(line):
            continue
        # Skip single character & too long pages
        if not 1 < len(line):
            continue
        # Skip list pages
        if line.endswith(('列表', '对照表')):
            continue
        if last_word and len(last_word) >= 4 and line.startswith(last_word):
            continue
        pinyin = "'".join(lazy_pinyin(line))
        if pinyin == line:
            # print("Failed to convert, ignoring:", pinyin, file=sys.stderr)
            continue
        if manual_fix(line):
            pinyin = manual_fix(line)
            console.debug(f"Fixing {line} to {pinyin}")
        last_word = line
        result += "\t".join((converter.convert(line), pinyin, "0"))
        result += "\n"
        count += 1
        if count % 1000 == 0:
            console.debug(str(count) + " converted")
    if count % 1000 != 0 or count == 0:
        console.debug(str(count) + " converted")
    return result
def search_from_qq(song_full_name):
    cc = opencc.OpenCC('t2s')
    try:
        singer, song = song_full_name.split(' - ')
    except Exception:
        singer = False
        song = False
    headers = {
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
        'Referer': 'https://y.qq.com/portal/search.html'
    }
    s = requests.Session()
    s.mount('http://', HTTPAdapter(max_retries=5))
    s.mount('https://', HTTPAdapter(max_retries=5))
    r = s.get('https://c.y.qq.com/soso/fcgi-bin/client_search_cp',
              params={'w': song_full_name, 'format': 'json'},
              timeout=None, headers=headers)
    resp = r.json()
    try:
        if singer:
            if resp['data']['song']['list']:
                for songinfo in resp['data']['song']['list']:
                    for singer_r in songinfo['singer']:
                        if (cc.convert(singer_r['name'].lower()) in cc.convert(singer.lower())) or \
                                (cc.convert(singer.lower()) in cc.convert(singer_r['name'].lower())):
                            return songinfo['songmid']
                return False
            return False
        return resp['data']['song']['list'][0]['songmid']
    except Exception:
        return False
def cut_word():
    """Tokenize the corpus text."""
    wiki = codecs.open('wiki', 'r', encoding="utf8")
    train = codecs.open('corpus', 'a', encoding="utf8")
    i = 0
    line = wiki.readline()
    cc = opencc.OpenCC('t2s')
    while line:
        ste = re.findall("[\u4e00-\u9fa5]+", line)
        if len(ste):
            line_data = "".join(ste)
            seg_list = jieba.cut(line_data, cut_all=False)
            train.write(cc.convert(" ".join(seg_list)))
            train.write('\n')
        if i % 100 == 0:
            print("Tokenizing: line " + str(i))
        i += 1
        line = wiki.readline()
    wiki.close()
    train.close()
def conversion():
    afterSimplify = open('afterSim.txt', mode='w+')
    conversion_type = opencc.OpenCC('mix2s')
    with open('wiki.zh.txt') as preText:
        for line in preText:
            print(type(line))
            try:
                line = line.split()
                # print(line)
                for word in line:
                    # print(word, type(word))
                    afterWord = conversion_type.convert(word)
                    # print(afterWord, type(afterWord))
                    afterSimplify.write('{} '.format(afterWord))
            except UnicodeDecodeError:
                pass
            afterSimplify.write('\n')
    afterSimplify.close()
def search_from_netease(song_full_name):
    cc = opencc.OpenCC('t2s')
    try:
        singer, song = song_full_name.split(' - ')
    except Exception:
        singer = False
        song = False
    headers = {
        'Cache-Control': 'no-cache',
        'Host': 'musicapi.leanapp.cn',
        'Pragma': 'no-cache',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
    }
    s = requests.Session()
    s.mount('http://', HTTPAdapter(max_retries=5))
    s.mount('https://', HTTPAdapter(max_retries=5))
    req = s.get('http://127.0.0.1:3000/search',
                params={'keywords': song_full_name},
                headers=headers, timeout=None)
    resp = req.json()
    try:
        if singer:
            for x in resp['result']['songs']:
                for artist in x['artists']:
                    if (cc.convert(artist['name'].lower()) in cc.convert(singer.lower())) or \
                            (cc.convert(singer.lower()) in cc.convert(artist['name'].lower())):
                        return x['id']
            return False
        return resp['result']['songs'][0]['id']
    except Exception:
        return False
def preprocess(path):
    r = input("type a directory name:")
    fw = open(path, 'w')
    ivLIST = file2list('./iv.txt')
    cc = opencc.OpenCC('t2s')  # the converter is created once and reused for every line
    for root, dirs, files in os.walk(r):
        for f in files:
            path = os.path.join(root, f)
            fo = open(path)
            for line in fo:
                line = line.strip()
                line = cc.convert(line)
                if line:
                    # remove the content in ()
                    match = par.findall(line)
                    if match:
                        for i in match:
                            if i == '(*^__^*)' or i == '(∩_∩)':
                                line = line.replace(i, ' 微笑 ')
                            else:
                                line = line.replace(i, ' ')
                    line = applyPAT(quote, line, isCH=None, sub=' ')
                    line = applyPAT(par2, line, 1)
                    line = applyPAT(quote2, line, 1)
                    line = applyPAT(period, line, 1, ' 无语 ')
                    # remove intensional verbs and anything uncertain
                    lineCOPY = line
                    lineCOPY = lineCOPY.replace('。', '\n').replace(',', '\n').replace(',', '\n')
                    clauses = lineCOPY.split('\n')
                    for i in clauses:
                        for j in ivLIST:
                            if i.find(j) != -1:
                                line = line.replace(i, ' ')
                    if line:
                        if line.startswith('宾馆反馈'):
                            continue
                        fw.write(line + '\n')
            fo.close()
            fw.write("----------\n")
    fw.close()
def chinese_t2s(writefile, readfile):
    import opencc
    cc = opencc.OpenCC('t2s')  # t2s: Traditional to Simplified
    # Pass encoding='utf-8' explicitly to avoid the UnicodeEncodeError caused by
    # the GBK / UTF-8 default-encoding conflict.
    file = open(writefile, 'w', encoding='utf-8')
    prompt = 0  # counter used to report progress
    for line in open(readfile, 'rb').readlines():
        # l = cc.convert(l).encode('utf8', 'ignore')
        # file.write(l + '\n')
        l = line.decode('utf8', 'ignore').rstrip(u'\n')
        file.write(cc.convert(l) + u'\n')
        # report progress
        print('traditional to simplified, processing: ' + str(prompt))
        prompt += 1
    file.close()