Example #1
def word_freq_test():
    penta_freq = gen_vector.gen_word_pentagram_freq(1000,
                                                    './data/corpus/runeberg/')
    tri_freq = gen_vector.gen_trigram_freq(15000)
    values = []
    size = 0
    max_size = 20000

    if os.path.exists(c.word_freq_path):
        os.remove(c.word_freq_path)

    word_freq = error_correction.calc_freq(0, max_size)

    # Sweep the word-frequency list size from 0 to max_size in steps of 500
    # and store the result of each run.
    while size <= max_size:
        if os.path.exists(c.training_data):
            os.remove(c.training_data)
        # Keep only the `size` most frequent words.
        sortedOutput = {}
        count = 0
        for key, value in sorted(word_freq.items(),
                                 key=lambda item: item[1],
                                 reverse=True):
            if count >= size:
                break
            sortedOutput[key] = value
            count += 1
        # print(sortedOutput)
        gen_vector.get_training_data(c.training_data, c.main_db, 13000,
                                     tri_freq, penta_freq, sortedOutput)
        values.append(main())
        print(values)
        size += 500
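
The count loop in word_freq_test() keeps the `size` most frequent words. As a compact standard-library equivalent (a sketch only; the word_freq values below are made up for illustration), the same truncated dictionary can be built with itertools.islice:

from itertools import islice

# Hypothetical frequency dict standing in for error_correction.calc_freq(...)
word_freq = {'och': 412, 'att': 390, 'det': 275, 'som': 240, 'en': 198}
size = 3

# Take the `size` highest-count entries, mirroring the loop in word_freq_test().
sortedOutput = dict(islice(
    sorted(word_freq.items(), key=lambda item: item[1], reverse=True), size))
print(sortedOutput)  # {'och': 412, 'att': 390, 'det': 275}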
Example #2
def filter_test():
    values = []
    penta_freq = gen_vector.gen_word_pentagram_freq(1000,
                                                    './data/corpus/runeberg/')
    tri_freq = gen_vector.gen_trigram_freq(10000)
    word_freq = error_correction.calc_freq(0, 10000)
    if os.path.exists(c.training_data):
        os.remove(c.training_data)
    gen_vector.get_training_data(c.training_data, c.main_db, 13000, tri_freq,
                                 penta_freq, word_freq)
    values.append(main())
Example #3
def process_file(plain_text, output_file, db_size, training_size, svm_kernal,
                 c_value, gamma, word_freq_size, tri_freq, penta_freq, word_freq):
    # Build the training vectors and the feature vectors for the file to classify.
    gen_vector.get_training_data(c.training_data, c.main_db, db_size, tri_freq,
                                 penta_freq, word_freq)
    gen_vector.get_input(plain_text, c.input, tri_freq, penta_freq, word_freq)
    # Train the SVM classifier and label every word of the input.
    svclassifier = word_classifier.train(c.svm_model, c.training_data,
                                         training_size, svm_kernal, c_value, gamma)
    classified_words = word_classifier.predict(c.input, svclassifier)

    output = []
    for word in classified_words:
        # A label of 0 marks the word for correction.
        if word[1] == 0:
            corr_word = error_correction.updated_correct_word(word[0], word_freq)
        else:
            corr_word = word[0]
        # The correction may come back as a single word or a list of words.
        if isinstance(corr_word, list):
            output.extend(corr_word)
        else:
            output.append(corr_word)

    with open(output_file, 'w') as f:
        for item in output:
            f.write("%s " % item)
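
As a usage sketch, process_file() would be driven roughly as below, reusing the frequency tables built in the earlier examples. The input/output paths and the SVM hyperparameter values here are made up for illustration, not taken from the project.

penta_freq = gen_vector.gen_word_pentagram_freq(1000, './data/corpus/runeberg/')
tri_freq = gen_vector.gen_trigram_freq(10000)
word_freq = error_correction.calc_freq(0, 10000)

# Hypothetical paths and hyperparameters for illustration only.
process_file('./data/ocr_page.txt', './data/corrected_page.txt',
             db_size=13000, training_size=10000, svm_kernal='rbf',
             c_value=1.0, gamma=0.1, word_freq_size=10000,
             tri_freq=tri_freq, penta_freq=penta_freq, word_freq=word_freq)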