Beispiel #1
0
def make_biake_corpora(baike_input_file_path, output_file_path):
    """Build a word-id map from a one-document-per-line file and save it.

    Every line of the input file is stripped of surrounding whitespace,
    wrapped in a ``Document``, split into sentences and then into words
    (each step receiving its own ``WhiteSpaceSegmenter``), and added to a
    single ``WordIdMap``. After the whole file has been consumed, the map
    is written out with ``save_as_text``.

    Args:
        baike_input_file_path: Path of the text file to read, one document
            per line.
        output_file_path: Path handed to ``WordIdMap.save_as_text``.
    """
    vocabulary = WordIdMap()

    # NOTE(review): the file is opened without an explicit encoding, so the
    # interpreter's default applies -- confirm that matches the corpus.
    with open(baike_input_file_path, 'r') as corpus_file:
        for raw_line in corpus_file:
            doc = Document(raw_line.strip())

            # Sentence splitting runs before word segmentation; a fresh
            # segmenter instance is created for each of the two calls.
            doc.split_sentences(WhiteSpaceSegmenter())
            # NOTE(review): `segement_words` is the spelling used by the
            # existing call -- verify against the Document API before renaming.
            doc.segement_words(WhiteSpaceSegmenter())

            vocabulary.add_document(doc)

    # Persist only after the input file has been closed.
    vocabulary.save_as_text(output_file_path)