Beispiel #1
0
def make_biake_corpora(baike_input_file_path, output_file_path):
    """Build a word-id map from a corpus file and save it as text.

    Each line of ``baike_input_file_path`` is stripped and wrapped in a
    ``Document``; the document is sentence-split and word-segmented with a
    ``WhiteSpaceSegmenter`` and then added to a ``WordIdMap``. Once every
    line has been processed the map is written to ``output_file_path`` via
    ``WordIdMap.save_as_text``.

    Args:
        baike_input_file_path: Path of the input file, read line by line.
        output_file_path: Path handed to ``WordIdMap.save_as_text``.
    """
    vocabulary = WordIdMap()

    with open(baike_input_file_path, 'r') as corpus_file:
        for raw_line in corpus_file:
            text = raw_line.strip()
            doc = Document(text)

            # A fresh segmenter instance is created for each step, exactly
            # as callers have always gotten.
            doc.split_sentences(WhiteSpaceSegmenter())
            doc.segement_words(WhiteSpaceSegmenter())

            vocabulary.add_document(doc)

    # The map is saved only after the input file has been closed.
    vocabulary.save_as_text(output_file_path)
Beispiel #2
0
def make_sentence_corpora(baike_input_file_path, output_file_path):
    """Split every input line into sentences and write them as TSV rows.

    Each line of ``baike_input_file_path`` (decoded as UTF-8) is stripped
    and wrapped in a ``Document``, which is then split with a
    ``SentenceSegmenter``. For every resulting sentence one row is written
    to ``output_file_path`` (encoded as UTF-8) in the form::

        <document.id>\\t<sentence.id>\\t<sentence.content>\\n

    Args:
        baike_input_file_path: Path of the UTF-8 input file, one document
            per line.
        output_file_path: Path of the UTF-8 output file; it is created or
            overwritten.
    """
    # The output must be opened for writing ('w'); it was previously opened
    # read-only, which made every write fail. Both handles live in the
    # ``with`` statement so they are flushed and closed even on error. The
    # input is opened first so a missing input does not create or truncate
    # the output file.
    with codecs.open(baike_input_file_path, 'r', 'utf-8') as source, \
            codecs.open(output_file_path, 'w', 'utf-8') as sink:
        for line in source:
            document = Document(line.strip())
            document.split_sentences(SentenceSegmenter())

            for sentence in document.sentences:
                sink.write(str(document.id) + "\t" + str(sentence.id) + "\t" + sentence.content + "\n")