Ejemplo n.º 1
0
def word_freq_test():
    """Sweep the word-frequency table size and record main()'s result each time.

    For every size in 0, 500, 1000, ..., 20000 the training-data file is
    deleted and regenerated using only the ``size`` highest-count entries of
    the word-frequency table, then ``main()`` is run and its return value is
    appended to a running list that is printed after each step.

    The list of results is only printed, not returned.
    NOTE(review): what ``main()`` returns is not visible from this function —
    presumably an evaluation score; confirm against its definition.
    """
    penta_freq = gen_vector.gen_word_pentagram_freq(
        1000, './data/corpus/runeberg/')
    tri_freq = gen_vector.gen_trigram_freq(15000)
    values = []
    max_size = 20000
    step = 500

    # Remove any existing word-frequency file before calc_freq runs.
    # NOTE(review): presumably this forces calc_freq to rebuild it — confirm.
    if os.path.exists(c.word_freq_path):
        os.remove(c.word_freq_path)

    word_freq = error_correction.calc_freq(0, max_size)

    # The ranking does not depend on the sweep variable, so sort once here
    # instead of re-sorting the whole table on every iteration.
    ranked = sorted(word_freq.items(),
                    key=lambda item: item[1],
                    reverse=True)

    for size in range(0, max_size + 1, step):
        # Start each run from a clean training-data file.
        if os.path.exists(c.training_data):
            os.remove(c.training_data)

        # Keep only the `size` most frequent entries (empty when size == 0).
        top_words = dict(ranked[:size])

        gen_vector.get_training_data(c.training_data, c.main_db, 13000,
                                     tri_freq, penta_freq, top_words)
        values.append(main())
        print(values)
Ejemplo n.º 2
0
def filter_test():
    """Rebuild the training data once with fixed settings and run main().

    Generates the pentagram, trigram and word-frequency tables, deletes any
    existing training-data file, regenerates it, and then calls ``main()``.
    """
    pentagrams = gen_vector.gen_word_pentagram_freq(
        1000, './data/corpus/runeberg/'
    )
    trigrams = gen_vector.gen_trigram_freq(10000)
    frequencies = error_correction.calc_freq(0, 10000)

    # Make sure the training data is generated from scratch.
    training_path_exists = os.path.exists(c.training_data)
    if training_path_exists:
        os.remove(c.training_data)

    gen_vector.get_training_data(
        c.training_data,
        c.main_db,
        13000,
        trigrams,
        pentagrams,
        frequencies,
    )

    # NOTE(review): the result of main() is collected but never returned or
    # printed here — confirm whether that is intentional.
    results = []
    results.append(main())
Ejemplo n.º 3
0
def process_dir(input_dir, test, sample_size, db_size, training_size,
                svm_kernal, c_value, gamma,word_freq_size, tri_freq_size):
    count=1
    tri_freq=gen_vector.gen_trigram_freq(tri_freq_size)
    penta_freq=gen_vector.gen_word_pentagram_freq(1000,'./data/corpus/runeberg/')
    word_freq=error_correction.calc_freq(0, word_freq_size)

    for file in os.listdir(input_dir):
        plain = input_dir+file
        output_dir= "./output/%s/%s"%(test,file)
        print(plain)
        if(not os.path.isfile(output_dir)):
            process_file(plain, output_dir, db_size, training_size,
                            svm_kernal, c_value, gamma,word_freq_size,tri_freq,penta_freq,word_freq)
        print("Corrected page %i out of %i)" %(count, len(os.listdir(input_dir))))
        count+=1
        if(sample_size):
            if(sample_size<count):
                break