def trans(name):
    """Convert one file from ``POETRY_DIRECTORY`` to 'zh-cn' and save it.

    Reads ``POETRY_DIRECTORY/<name>``, runs the text through
    ``convert_for_mw(..., 'zh-cn')`` and writes the result to
    ``./poetry/<name>``.

    Args:
        name: File name inside ``POETRY_DIRECTORY``; the same name is reused
            for the output file under ``./poetry/``.
    """
    file_path = os.path.join(POETRY_DIRECTORY, name)

    # Use a context manager so the input handle is closed even if the read
    # fails (the handle was previously left for the garbage collector).
    with open(file_path, 'r') as src:
        raw = src.read()

    if six.PY2:
        # On Python 2 the text is coerced to ``unicode`` before conversion.
        content = convert_for_mw(unicode(raw), 'zh-cn')
    else:
        content = convert_for_mw(raw, 'zh-cn')

    output_path = os.path.join('./poetry/', name)

    with open(output_path, 'w') as f:
        f.write(content)
def get_tagging_words(paragraphs, verbose=False):
    """Collect candidate tag words from a sequence of paragraphs.

    Each paragraph is converted with ``zhconv.convert_for_mw(p, 'zh-cn')``,
    passed through ``preprocess_text`` and vectorised with a fresh
    ``CountVectorizer``. The vocabulary found for a paragraph is stored as a
    set, and a counter records in how many paragraphs each word appeared.

    Args:
        paragraphs: Sequence of paragraph strings.
        verbose: When true, the tagging frame (without its ``snippets``
            column) is also written to a timestamped ``.tagging`` CSV whose
            name is built from ``outfile`` (not defined in this function;
            presumably a module-level name -- TODO confirm).

    Returns:
        A list of words whose paragraph count ``v`` satisfies
        ``1 < v <= 20``, ordered by ``valid_wd.sort(key=takeSecond,
        reverse=True)`` (presumably descending count -- ``takeSecond`` is
        defined elsewhere), excluding any word that contains an ASCII letter,
        digit or underscore.
    """
    # tagging start
    logger.info('tagging start')
    tagging = pd.DataFrame({'snippets': paragraphs})
    collect_snippet_stat = defaultdict(list)
    snippet_count = defaultdict(set)
    # NOTE(review): nothing ever adds words to snippet_count2, so every set
    # in it stays empty and the 'snippet_tags_stat2' column below is always
    # an empty set. Kept as-is to preserve the output frame's columns.
    snippet_count2 = defaultdict(set)
    snippet_tags_kwds_cnt = Counter()

    for i, p in enumerate(paragraphs):
        snippets = [zhconv.convert_for_mw(p, 'zh-cn')]
        sentences = []
        preprocess_text(snippets, sentences=sentences)
        collect_snippet_stat[i] = sentences
        snippet_cnt_vectorizer = CountVectorizer()
        try:
            snippet_cnt = snippet_cnt_vectorizer.fit_transform(
                collect_snippet_stat[i])
            # snippet_tags for p: keep the features with a positive count
            snippet_count[i].update(
                np.array(snippet_cnt_vectorizer.get_feature_names())[
                    snippet_cnt.toarray().sum(axis=0) > 0])
            # Updating with a set counts each word once per paragraph.
            snippet_tags_kwds_cnt.update(snippet_count[i])
            snippet_tags_kwds_cnt.update(snippet_count2[i])
        except Exception as exc:
            # Best effort: a paragraph that cannot be vectorised simply
            # contributes no tags. ``Exception`` (rather than a bare
            # ``except``) lets KeyboardInterrupt/SystemExit propagate.
            logger.debug('tagging skipped paragraph %d: %r' % (i, exc))
            snippet_count[i].update([])
            snippet_count2[i].update([])

    # tagging snippet_tags
    tagging['snippet_tags'] = tagging.apply(lambda x: snippet_count[x.name],
                                            axis=1)

    # tagging snippet_tags_stat2
    valid_wd = [(k, v) for k, v in snippet_tags_kwds_cnt.items()
                if (1 < v <= 20)]
    tagging['snippet_tags_stat2'] = tagging.apply(
        lambda x: snippet_count2[x.name].intersection(valid_wd), axis=1)

    valid_wd.sort(key=takeSecond, reverse=True)

    # Words containing any ASCII letter, digit or underscore are dropped.
    pattern = re.compile(r'[a-z0-9A-Z_]')

    result = [k for k, v in valid_wd if pattern.search(k) is None]
    print(valid_wd)
    # tagging end
    logger.info('tagging end')
    if verbose:
        tagging.drop(labels=['snippets'],
                     axis=1,
                     inplace=True,
                     errors='ignore')
        tagging.to_csv(outfile + '.' + strftime('%Y-%m-%d_%H%M%S') +
                       '.tagging',
                       encoding='utf-8')

    return result
Exemple #3
0
def trans(file_path):
    """Write a 'zh-cn'-converted copy of ``file_path`` beside the original.

    The copy lives in the same directory and is named ``final_<basename>``.
    The input is converted line by line with ``convert_for_mw``; both files
    are handled as UTF-8.

    Args:
        file_path: Path of the text file to convert.
    """
    folder, base_name = os.path.split(file_path)
    target = os.path.join(folder, 'final_' + base_name)
    # The output file is opened first, then the input.
    with open(target, 'w', encoding='utf-8') as sink:
        with open(file_path, 'r', encoding='utf-8') as source:
            sink.writelines(
                convert_for_mw(row, 'zh-cn') for row in source)
Exemple #4
0
def trans(name):
    """Convert one file from ``POETRY_DIRECTORY`` to 'zh-cn' and save it.

    Reads ``POETRY_DIRECTORY/<name>`` as UTF-8, runs the text through
    ``convert_for_mw(..., 'zh-cn')``, prints the output path and writes the
    result to ``./simplify_poetry/<name>`` as UTF-8.

    Args:
        name: File name inside ``POETRY_DIRECTORY``; the same name is reused
            for the output file under ``./simplify_poetry/``.
    """
    file_path = os.path.join(POETRY_DIRECTORY, name)

    # Use a context manager so the input handle is closed even if the read
    # fails (the handle was previously left for the garbage collector).
    with open(file_path, 'r', encoding='utf-8') as src:
        raw = src.read()

    content = convert_for_mw(raw, 'zh-cn')

    output_path = os.path.join('./simplify_poetry/', name)
    print(output_path)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(content)
        # NOTE(review): the statements from here on read `path` and
        # `file_name`, neither of which is defined in the visible code. This
        # looks like the body of a different routine whose header is missing
        # -- confirm before relying on it.
        #
        # Purpose (as far as the visible code shows): read one source file,
        # then write two copies of it -- one under a 'zh_hant' directory and
        # one under a 'zh_hans' directory -- with cross-reference names
        # prefixed accordingly and the text converted by zhconv.
        src_path = os.path.join(path, file_name)
        # NOTE(review): this handle and the two `f_` handles below are never
        # closed in the visible code.
        f = open(src_path, 'r', encoding='utf8')
        content = ''.join(f.readlines())

        # --- zh_hant copy ---
        # str.replace substitutes every occurrence of 'zh' in `path`, not
        # just a single path component.
        dst_dir = path.replace('zh', 'zh_hant')
        os.makedirs(dst_dir, exist_ok=True)
        dst_path = os.path.join(dst_dir, file_name)
        f_ = open(dst_path, 'w', encoding='utf8')
        # Prefix names with 'zh_hant_' in `.. _name:` targets, in the
        # `<target>` part of :ref: roles, after ':label: ' and inside :eq:
        # roles (these look like reStructuredText/Sphinx constructs --
        # TODO confirm).
        content_zh_hant = re.sub(r'\.\. _(.+?):', r'.. _zh_hant_\1:', content)
        content_zh_hant = re.sub(r':ref:`(.+?) <(.+?)>`',
                                 r':ref:`\1 <zh_hant_\2>`', content_zh_hant)
        # The lazy `(.+?)` has nothing after it, so it captures exactly one
        # character; the net effect is inserting the prefix right after
        # ':label: '.
        content_zh_hant = re.sub(r':label: (.+?)', r':label: zh_hant_\1',
                                 content_zh_hant)
        content_zh_hant = re.sub(r':eq:`(.+?)`', r':eq:`zh_hant_\1`',
                                 content_zh_hant)
        # Convert the rewritten text with the 'zh-tw' locale and write it out.
        f_.write(zhconv.convert_for_mw(content_zh_hant, 'zh-tw'))
        print(src_path + ' -> ' + dst_path)

        # --- zh_hans copy ---
        # Same steps as above, starting again from the unmodified `content`,
        # with the 'zh_hans' prefix and the 'zh-hans' locale.
        dst_dir = path.replace('zh', 'zh_hans')
        os.makedirs(dst_dir, exist_ok=True)
        dst_path = os.path.join(dst_dir, file_name)
        f_ = open(dst_path, 'w', encoding='utf8')
        content_zh_hans = re.sub(r'\.\. _(.+?):', r'.. _zh_hans_\1:', content)
        content_zh_hans = re.sub(r':ref:`(.+?) <(.+?)>`',
                                 r':ref:`\1 <zh_hans_\2>`', content_zh_hans)
        content_zh_hans = re.sub(r':label: (.+?)', r':label: zh_hans_\1',
                                 content_zh_hans)
        content_zh_hans = re.sub(r':eq:`(.+?)`', r':eq:`zh_hans_\1`',
                                 content_zh_hans)
        f_.write(zhconv.convert_for_mw(content_zh_hans, 'zh-hans'))
        print(src_path + ' -> ' + dst_path)
Exemple #6
0
def convert(txt):
    """Return ``txt`` converted by ``convert_for_mw`` with locale 'zh-cn'."""
    target_locale = 'zh-cn'
    return convert_for_mw(txt, target_locale)
Exemple #7
0
    def _chinese_convert_to_trad(self, text):
        """Convert ``text`` with ``zhconv.convert_for_mw`` and locale 'zh-hant'.

        Before conversion every '群' is wrapped as '-{群}-'. Presumably this
        is the MediaWiki-style markup that exempts the character from
        conversion -- confirm against the zhconv documentation.
        """
        shielded = text.replace(u'群', u'-{群}-')
        return zhconv.convert_for_mw(shielded, 'zh-hant')