コード例 #1
0
ファイル: em.py プロジェクト: ssquinntran/HDlanguageDetection
def seed():
    """Run the seeding pass over the English "alice" stream text.

    Steps, in order:

    1. Load the language-vector structures with ``lvu.initialize_from_file``
       and derive the vocabulary vector, maximum word length and vocabulary.
    2. Read ``preprocessed_texts/english/alice-only-stream.txt`` and run
       ``dict_explain_away`` on it with that vocabulary.
    3. Write the text rebuilt by ``tuples_to_text`` to
       ``intermediate/processed_dict_explain_away_results.txt``.
    4. Run ``hard_em_discover_words`` and pass its result to
       ``record_results`` (target: ``output/explain_away_results.txt``).
    5. Persist the lookup structures through ``lvu.write_data_structures``.

    Takes no arguments and returns ``None``; every result goes to disk.
    The ``intermediate/`` and ``output/`` paths are relative to the current
    working directory.
    """
    # lvu.initialize()
    lv, lang_vectors, n_gram_frequencies = lvu.initialize_from_file()
    vocab_vec, max_word_length = lvu.vocab_vector(lv, lang_vectors)
    vocab = lvu.vocab(max_word_length)

    # The original also assigned the "alice-only-spaced.txt" path first, but
    # it was overwritten before being read, so only the stream file is used.
    # (A leftover comment suggests a_christmas_carol.txt was an alternative.)
    filepath = "preprocessed_texts/english/alice-only-stream.txt"
    text = read_file(filepath)
    processed_indices = dict_explain_away(vocab, max_word_length, text)

    processed = tuples_to_text(processed_indices, text)

    # Context manager guarantees the handle is closed even if write() raises.
    with open("intermediate/processed_dict_explain_away_results.txt", "w") as fwrite:
        fwrite.write(processed)

    # Hard-EM word discovery.
    # NOTE(review): an older comment here said this step is "not necessary in
    # seeding phase", yet it is executed -- confirm which is intended.
    discovered_words = hard_em_discover_words(processed_indices, text, vocab, lv)
    record_results(text, processed_indices, discovered_words, "output/explain_away_results.txt")

    # Save data to file: the two lists are parallel (structure i -> path i).
    lvu.write_data_structures(
        [lv, lang_vectors, n_gram_frequencies, vocab_vec, vocab],
        [
            "intermediate/lookup_lv",
            "intermediate/lookup_lang_vectors",
            "intermediate/lookup_n_gram_frequencies",
            "intermediate/lookup_vocab_vec",
            "intermediate/lookup_vocab",
        ],
    )
コード例 #2
0
def seed():
    """Seed the lookup structures using the dictionary-based vocabulary.

    Loads the language-vector data with ``lvu.initialize_from_file``, builds
    the vocabulary vector and ``vocab_dict``, runs ``dict_explain_away`` on
    the stream text path, and saves the five lookup structures under
    ``intermediate/lookup_*``.

    In this variant the explain-away result is neither written out nor fed
    to the hard-EM discovery step; both were disabled (commented out) in the
    original. Takes no arguments and returns ``None``.
    """
    # The "alice-only-spaced.txt" path is assigned but never read: it is
    # replaced by the stream path below before first use.
    filepath = "preprocessed_texts/english/alice-only-spaced.txt"

    # lvu.initialize() is not called; state is restored from file instead.
    loaded = lvu.initialize_from_file()
    lv, lang_vectors, n_gram_frequencies = loaded
    vocab_vec, max_word_length = lvu.vocab_vector(lv, lang_vectors)
    vocab_dict = lvu.vocab_dict(max_word_length)

    filepath = "preprocessed_texts/english/alice-only-stream.txt"
    # Unlike the text-based variants, the *path* is handed to
    # dict_explain_away here. The result is kept only for the disabled
    # dump / hard-EM steps and is otherwise unused.
    aea = dict_explain_away(vocab_dict, max_word_length, filepath)

    # Save data to file: structures and destination paths are parallel lists.
    structures = [lv, lang_vectors, n_gram_frequencies, vocab_vec, vocab_dict]
    destinations = [
        "intermediate/lookup_lv",
        "intermediate/lookup_lang_vectors",
        "intermediate/lookup_n_gram_frequencies",
        "intermediate/lookup_vocab_vec",
        "intermediate/lookup_vocab_dict",
    ]
    lvu.write_data_structures(structures, destinations)
コード例 #3
0
def seed():
    """Seed word discovery from the English "alice" stream text.

    Restores the language-vector structures via ``lvu.initialize_from_file``,
    builds the vocabulary, applies ``dict_explain_away`` to the text read
    from ``preprocessed_texts/english/alice-only-stream.txt``, and then:

    * writes the output of ``tuples_to_text`` to
      ``intermediate/processed_dict_explain_away_results.txt``;
    * runs ``hard_em_discover_words`` and hands the result to
      ``record_results`` with ``output/explain_away_results.txt``;
    * saves the lookup structures with ``lvu.write_data_structures``.

    Takes no arguments and returns ``None``. All paths are relative to the
    current working directory.
    """
    # lvu.initialize()
    lv, lang_vectors, n_gram_frequencies = lvu.initialize_from_file()
    vocab_vec, max_word_length = lvu.vocab_vector(lv, lang_vectors)
    vocab = lvu.vocab(max_word_length)

    # Only the stream file is read; the earlier assignment of the
    # "alice-only-spaced.txt" path was a dead store and has been dropped.
    filepath = "preprocessed_texts/english/alice-only-stream.txt"
    text = read_file(filepath)
    processed_indices = dict_explain_away(vocab, max_word_length, text)

    processed = tuples_to_text(processed_indices, text)

    # "with" closes the file even when write() fails, unlike open()/close().
    with open("intermediate/processed_dict_explain_away_results.txt", "w") as fwrite:
        fwrite.write(processed)

    # Now for the EM.
    # NOTE(review): previously annotated "not necessary in seeding phase",
    # but the step does run here -- verify the intent.
    discovered_words = hard_em_discover_words(processed_indices, text, vocab, lv)
    record_results(
        text,
        processed_indices,
        discovered_words,
        "output/explain_away_results.txt",
    )

    # Save data to file. Brackets already allow line continuation, so the
    # backslashes of the original are unnecessary.
    lvu.write_data_structures(
        [lv, lang_vectors, n_gram_frequencies, vocab_vec, vocab],
        [
            "intermediate/lookup_lv",
            "intermediate/lookup_lang_vectors",
            "intermediate/lookup_n_gram_frequencies",
            "intermediate/lookup_vocab_vec",
            "intermediate/lookup_vocab",
        ],
    )