def trans(name):
    """Convert one poetry file to the ``zh-cn`` variant.

    Reads ``name`` from ``POETRY_DIRECTORY``, runs the whole text through
    ``convert_for_mw`` and writes the result to ``./poetry/<name>``.

    Args:
        name: File name, relative to ``POETRY_DIRECTORY``; the same name is
            reused for the output file.
    """
    file_path = os.path.join(POETRY_DIRECTORY, name)
    # Read inside a context manager so the handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(file_path, 'r') as src:
        raw = src.read()

    if six.PY2:
        # On Python 2 ``read()`` yields a byte ``str``; convert to ``unicode``
        # before handing it to the converter.
        # NOTE(review): ``unicode(raw)`` decodes with the interpreter default
        # encoding -- confirm that matches the encoding of the input files.
        content = convert_for_mw(unicode(raw), 'zh-cn')
    else:
        content = convert_for_mw(raw, 'zh-cn')

    output_path = os.path.join('./poetry/', name)
    with open(output_path, 'w') as f:
        f.write(content)
def get_tagging_words(paragraphs, verbose=False):
    """Collect candidate keywords from ``paragraphs`` by paragraph frequency.

    Each paragraph is converted with ``zhconv.convert_for_mw(..., 'zh-cn')``,
    passed through ``preprocess_text`` and tokenized by a fresh
    ``CountVectorizer``. Every token is counted once per paragraph in which
    it occurs; tokens seen in more than 1 and at most 20 paragraphs are kept,
    sorted in descending order with the module's ``takeSecond`` key
    (presumably the count -- confirm), and tokens containing ASCII letters,
    digits or underscores are dropped.

    Args:
        paragraphs: Sequence of paragraph strings.
        verbose: When true, additionally write the per-paragraph tagging
            table (without the raw text column) as CSV to
            ``outfile + '.<timestamp>.tagging'``.

    Returns:
        list: The surviving keywords, in sorted order.
    """
    logger.info('tagging start')
    tagging = pd.DataFrame({'snippets': paragraphs})
    snippet_count = defaultdict(set)
    # NOTE(review): nothing in this function ever adds to snippet_count2, so
    # the 'snippet_tags_stat2' column below always holds empty sets.
    snippet_count2 = defaultdict(set)
    snippet_tags_kwds_cnt = Counter()

    for i, p in enumerate(paragraphs):
        sentences = []
        preprocess_text([zhconv.convert_for_mw(p, 'zh-cn')],
                        sentences=sentences)
        vectorizer = CountVectorizer()
        try:
            counts = vectorizer.fit_transform(sentences)
            # scikit-learn 1.2 removed get_feature_names(); prefer the
            # replacement and fall back to the old name on older releases.
            get_names = (getattr(vectorizer, 'get_feature_names_out', None)
                         or vectorizer.get_feature_names)
            present = counts.toarray().sum(axis=0) > 0
            snippet_count[i].update(np.array(get_names())[present])
            snippet_tags_kwds_cnt.update(snippet_count[i])
            snippet_tags_kwds_cnt.update(snippet_count2[i])
        except Exception as exc:
            # Best effort: a paragraph that cannot be vectorized (e.g. it
            # yields no tokens) simply contributes no tags. The defaultdicts
            # supply an empty set for it when the table is built below.
            logger.debug('tagging skipped paragraph %s: %r', i, exc)

    # Per-paragraph tag sets, looked up by row index.
    tagging['snippet_tags'] = tagging.apply(
        lambda x: snippet_count[x.name], axis=1)

    # Keep words that occur in more than 1 and at most 20 paragraphs.
    valid_wd = [(k, v) for k, v in snippet_tags_kwds_cnt.items()
                if 1 < v <= 20]
    # Intersect against the words themselves; the (word, count) tuples in
    # valid_wd could never equal a plain word.
    valid_words = {k for k, _ in valid_wd}
    tagging['snippet_tags_stat2'] = tagging.apply(
        lambda x: snippet_count2[x.name].intersection(valid_words), axis=1)

    valid_wd.sort(key=takeSecond, reverse=True)
    pattern = re.compile(r'[a-z0-9A-Z_]')
    result = [k for k, _ in valid_wd if not pattern.search(k)]
    logger.debug('valid_wd: %s', valid_wd)
    logger.info('tagging end')

    if verbose:
        tagging.drop(labels=['snippets'], axis=1, inplace=True,
                     errors='ignore')
        tagging.to_csv(
            outfile + '.' + strftime('%Y-%m-%d_%H%M%S') + '.tagging',
            encoding='utf-8')
    return result
def trans(file_path):
    """Write a ``zh-cn`` converted copy of ``file_path`` beside the original.

    The copy is placed in the same directory and named ``final_<basename>``.
    Both files are handled as UTF-8 and the input is converted line by line
    with ``convert_for_mw``.
    """
    folder, base_name = os.path.split(file_path)
    out_path = os.path.join(folder, 'final_' + base_name)
    # The output is opened first, then the input, matching the order in which
    # the two handles are acquired and released.
    with open(out_path, 'w', encoding='utf-8') as out_file:
        with open(file_path, 'r', encoding='utf-8') as raw_file:
            out_file.writelines(
                convert_for_mw(line, 'zh-cn') for line in raw_file)
def trans(name):
    """Convert one UTF-8 poetry file to the ``zh-cn`` variant.

    Reads ``name`` from ``POETRY_DIRECTORY``, converts the whole text with
    ``convert_for_mw`` and writes it to ``./simplify_poetry/<name>``. The
    output path is printed before the file is written.

    Args:
        name: File name, relative to ``POETRY_DIRECTORY``; the same name is
            reused for the output file.
    """
    file_path = os.path.join(POETRY_DIRECTORY, name)
    # Read inside a context manager so the handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(file_path, 'r', encoding='utf-8') as src:
        raw = src.read()
    content = convert_for_mw(raw, 'zh-cn')

    output_path = os.path.join('./simplify_poetry/', name)
    print(output_path)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(content)
# Read the source document once, then write one converted copy per script
# variant. Every handle is managed by ``with`` so the output is flushed and
# closed before the next file is processed.
src_path = os.path.join(path, file_name)
with open(src_path, 'r', encoding='utf8') as f:
    content = f.read()

# (directory name / label prefix, zhconv locale) for each generated copy.
for variant, locale in (('zh_hant', 'zh-tw'), ('zh_hans', 'zh-hans')):
    prefix = variant + '_'
    # NOTE(review): str.replace rewrites every 'zh' substring in ``path``,
    # not only the language directory -- confirm paths contain it just once.
    dst_dir = path.replace('zh', variant)
    os.makedirs(dst_dir, exist_ok=True)
    dst_path = os.path.join(dst_dir, file_name)

    # Prefix link targets, :ref: targets, :label: names and :eq: names with
    # the variant so the names differ between the generated copies.
    converted = re.sub(r'\.\. _(.+?):', r'.. _' + prefix + r'\1:', content)
    converted = re.sub(r':ref:`(.+?) <(.+?)>`',
                       r':ref:`\1 <' + prefix + r'\2>`', converted)
    converted = re.sub(r':label: (.+?)', r':label: ' + prefix + r'\1',
                       converted)
    converted = re.sub(r':eq:`(.+?)`', r':eq:`' + prefix + r'\1`', converted)

    with open(dst_path, 'w', encoding='utf8') as f_:
        f_.write(zhconv.convert_for_mw(converted, locale))
    print(src_path + ' -> ' + dst_path)
def convert(txt):
    """Return ``txt`` passed through ``convert_for_mw`` for the ``zh-cn`` variant."""
    target_variant = 'zh-cn'
    converted = convert_for_mw(txt, target_variant)
    return converted
def _chinese_convert_to_trad(self, text):
    """Return ``text`` converted with zhconv's ``zh-hant`` variant.

    Every '群' is wrapped as '-{群}-' before conversion.
    NOTE(review): presumably this is the MediaWiki no-conversion markup,
    used to keep that character unchanged -- confirm against zhconv docs.
    """
    # Splitting on the character and re-joining with the wrapped form
    # substitutes every occurrence.
    guarded = u'-{群}-'.join(text.split(u'群'))
    return zhconv.convert_for_mw(guarded, 'zh-hant')