Ejemplo n.º 1
0
def execute(dataset, hmm_file, tag_file, k):
    """Decode and print up to ``k`` best tag sequences for every sentence.

    For each sentence, the sentence itself is written to stdout, followed by
    one line per decoded tag sequence (best first) and a blank line.  Progress
    and convergence messages go to stderr, and a per-rank convergence summary
    is written to stderr once all sentences are processed.

    Args:
        dataset: forwarded to ``data_reader.read_tagging_data``.
        hmm_file: forwarded, with ``tag_file``, to
            ``hmm_utils.get_param_tagset``.
        tag_file: see ``hmm_file``.
        k: number of tag sequences to produce per sentence.  Values below 2
            are handled by simply keeping fewer sequences.
    """
    hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file)
    ts, test_tags = data_reader.read_tagging_data(dataset)
    test_sentences = replace_test(ts, hmm, tagset)

    # conv_rates[r] counts the sentences whose (r + 1)-th best sequence
    # converged.  Floats keep the final percentages from being truncated.
    conv_rates = [0.0] * k

    for i, sentence in enumerate(test_sentences, 1):
        sys.stderr.write('\n' + str(i) + '\n')
        sys.stderr.write(' '.join(sentence) + "\n")
        sys.stdout.write(' '.join(sentence) + "\n")

        # One call yields both the best and the second-best sequence;
        # num_iter == -1 means the second best did not converge.
        best_tags, num_iter, second_best, sb2 = dd_tagger_fst.run(sentence, tagset, hmm)

        k_best = []
        if k >= 1:
            conv_rates[0] += 1
            k_best.append(best_tags)

        if k >= 2:
            # Count the second best only when it really converged, mirroring
            # the handling of the lower ranks below.
            if num_iter != -1:
                conv_rates[1] += 1
                sys.stderr.write("2 best converges in " + str(num_iter) + " iterations \n")
            else:
                sys.stderr.write("2nd best does not converge :( \n")
            k_best.append(second_best)

        # Ranks 3..k: each run receives every sequence found so far.
        for rank in range(2, k):
            next_best, num_iter = dd_k_best.run(sentence, tagset, hmm, k_best)
            # Non-converged results are still kept and printed.
            k_best.append(next_best)
            if num_iter != -1:
                conv_rates[rank] += 1
                sys.stderr.write(str(rank + 1) + " best converges in " + str(num_iter) + " iterations \n")
            else:
                sys.stderr.write(str(rank + 1) + "th best does not converge\n")

        for best in k_best:
            sys.stdout.write(' '.join(best) + "\n")
        sys.stdout.write("\n")

    # conv_rates[0] is the number of sentences decoded; guard against an
    # empty dataset so the summary does not divide by zero.
    decoded = conv_rates[0] if conv_rates else 0.0
    for j in range(k):
        rate = conv_rates[j] * 100 / decoded if decoded else 0.0
        # Ranks are reported 1-based, like the per-sentence messages above.
        sys.stderr.write("convergence rate of " + str(j + 1) + " best = " + str(rate) + "% \n")
Ejemplo n.º 2
0
def execute(dataset, hmm_file, tag_file):
    """Decode up to the four best tag sequences per sentence and report
    how often the 2nd and 3rd best converge.

    Each sentence is written to stdout.  When both the 2nd and the 3rd best
    converge, four tag sequences are written to stdout as well.  A blank
    line follows every sentence whose 2nd best converged.  Progress and
    convergence messages, plus the final percentages, go to stderr.

    Args:
        dataset: forwarded to ``data_reader.read_tagging_data``.
        hmm_file: forwarded, with ``tag_file``, to
            ``hmm_utils.get_param_tagset``.
        tag_file: see ``hmm_file``.
    """
    hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file)
    ts, test_tags = data_reader.read_tagging_data(dataset)
    test_sentences = replace_test(ts, hmm, tagset)

    i = 0         # sentences processed
    conv = 0      # sentences whose 3rd best converged
    sec_conv = 0  # sentences whose 2nd best converged
    for sentence in test_sentences:
        i += 1

        sys.stderr.write('\n' + str(i) + '\n')
        sys.stderr.write(' '.join(sentence) + "\n")
        sys.stdout.write(' '.join(sentence) + "\n")

        best_tags, num_iter, tags1, tags2 = dd_tagger_fst.run(sentence, tagset, hmm)
        if tags2 == best_tags:
            sys.stderr.write("YOU ARE WRONG!\n")

        # -1 is the "did not converge" marker for the iteration counts.
        if num_iter == -1:
            sys.stderr.write("2nd best does not converge :(\n")
            # No blank separator line is emitted for these sentences.
            continue

        sec_conv += 1
        sys.stderr.write("2nd best converges in " + str(num_iter) + "\n")
        k_best = [best_tags, tags1]

        third_best, num_iter2 = dd_k_best.run(sentence, tagset, hmm, k_best)
        if num_iter2 != -1:
            sys.stderr.write("3rd best converges in " + str(num_iter2) + "\n")
            conv += 1
            k_best.append(third_best)
            fourth_best, num_iter3 = dd_k_best.run(sentence, tagset, hmm, k_best)
            # Only claim convergence of the 4th best when it happened.
            if num_iter3 != -1:
                sys.stderr.write("4th best converges in " + str(num_iter3) + "\n")
            else:
                sys.stderr.write("4th best does not converge :(\n")
            sys.stdout.write(' '.join(best_tags) + "\n")
            # NOTE(review): tags1 is fed to dd_k_best as the second best but
            # tags2 is what gets printed here -- confirm which is intended.
            sys.stdout.write(' '.join(tags2) + "\n")
            sys.stdout.write(' '.join(third_best) + "\n")
            sys.stdout.write(' '.join(fourth_best) + "\n")
        else:
            sys.stderr.write("3rd best does not converge :(\n")
        sys.stdout.write("\n")

    # Guard both ratios: the dataset may be empty, or no 2nd best converged.
    second_pct = sec_conv * 100 / i if i else 0
    third_pct = conv * 100 / sec_conv if sec_conv else 0
    sys.stderr.write("% convergence of 2nd best =" + str(second_pct) + "\n")
    sys.stderr.write("% convergence of 3rd best =" + str(third_pct) + "\n")
Ejemplo n.º 3
0
def execute(dataset, hmm_file, tag_file):
    """Tag every non-empty sentence and report accuracy, convergence and time.

    For each non-empty sentence the raw sentence, the two decoded tag
    sequences and the gold tags are passed to ``tagprint``.  Per-sentence
    accuracies and convergence messages go to stderr, followed at the end
    by the average iteration count, the convergence percentage and the
    elapsed wall-clock time.

    Args:
        dataset: forwarded to ``data_reader.read_tagging_data``.
        hmm_file: forwarded, with ``tag_file``, to
            ``hmm_utils.get_param_tagset``.
        tag_file: see ``hmm_file``.
    """
    sys.stderr.write("loading learnt parameters...\n")
    hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file)

    sys.stderr.write("reading dev data...\n")
    test_sentences, test_tags = data_reader.read_tagging_data(dataset)
    test_sentences = replace_test(test_sentences, hmm, tagset)

    i = 0               # non-empty sentences processed
    converges = 0       # sentences for which decoding converged
    avg_iterations = 0  # running sum of iterations over converged sentences
    start_time = time.time()

    # Positional lookup instead of list.index(): index() returns the first
    # equal sentence, which pairs duplicates with the wrong gold tags.
    for idx, sentence in enumerate(test_sentences):
        if len(sentence) == 0:
            continue
        i += 1
        truetags = test_tags[idx]

        sys.stderr.write('\n' + str(i) + '\n')
        # NOTE(review): test_sents_not_rare is not defined in this function;
        # presumably a module-level list of the unreplaced sentences -- confirm.
        tagprint(test_sents_not_rare[idx])
        best_tags, num_iterations, tags1, tags2 = dd_tagger_fst.run(sentence, tagset, hmm)

        tagprint(best_tags)
        tagprint(tags2)
        # -1 is the "did not converge" marker for the iteration count.
        if num_iterations != -1:
            sys.stderr.write("fst tagger accuracy = " + str(evaluate.accuracy(truetags, tags2)) + "\n")
            sys.stderr.write("best tags accuracy = " + str(evaluate.accuracy(truetags, best_tags)) + "\n")
            sys.stderr.write("converges in " + str(num_iterations) + " iterations \n")
            converges += 1
            avg_iterations += num_iterations
        else:
            sys.stdout.write("\n")
            sys.stderr.write("does not converge :(\n")
        tagprint(truetags)
        sys.stdout.write("\n")

    # Guard the summary: nothing may have converged, or the dataset may
    # contain no non-empty sentence.
    mean_iterations = avg_iterations / converges if converges else 0
    convergence_pct = converges * 100 / i if i else 0
    sys.stderr.write("\n" + str(mean_iterations) + " iterations on average\n")
    sys.stderr.write(str(convergence_pct) + " % convergence\n")
    sys.stderr.write("time_taken = " + str(time.time() - start_time) + "\n")
def execute(dataset, hmm_file, tag_file):
    """Tag every sentence and report aggregate accuracy and efficiency.

    Each sentence is written to stdout followed by the two decoded tag
    sequences and a blank line.  Per-sentence progress goes to stderr.  A
    final stderr report gives the share of sentences where both sequences
    coincide, the mean accuracy of each sequence over converged sentences,
    the mean iteration count, the convergence percentage and the elapsed
    wall-clock time.

    Args:
        dataset: forwarded to ``data_reader.read_tagging_data``.
        hmm_file: forwarded, with ``tag_file``, to
            ``hmm_utils.get_param_tagset``.
        tag_file: see ``hmm_file``.
    """
    hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file)

    ts, test_tags = data_reader.read_tagging_data(dataset)
    test_sentences = replace_test(ts, hmm, tagset)

    converges = 0       # sentences for which decoding converged
    avg_iterations = 0  # running sum of iterations over converged sentences
    start_time = time.time()

    fst_acc = 0   # summed accuracy of tags2 over converged sentences
    best_acc = 0  # summed accuracy of best_tags over converged sentences
    wrong = 0     # sentences where tags2 equals best_tags
    i = 0         # sentences processed

    # Positional lookup instead of list.index(): index() returns the first
    # equal sentence, which pairs duplicates with the wrong gold tags.
    for idx, sentence in enumerate(test_sentences):
        i += 1
        truetags = test_tags[idx]

        sys.stderr.write("\n" + str(i) + "\n")
        sys.stdout.write(" ".join(sentence) + "\n")

        best_tags, num_iterations, tags1, tags2 = dd_tagger_fst.run(sentence, tagset, hmm)
        if tags2 == best_tags:
            sys.stderr.write("YOU ARE WRONG!\n")
            wrong += 1

        # -1 is the "did not converge" marker for the iteration count.
        if num_iterations != -1:
            fst_acc += evaluate.accuracy(truetags, tags2)
            best_acc += evaluate.accuracy(truetags, best_tags)

            sys.stderr.write("converges in " + str(num_iterations) + " iterations \n")
            converges += 1
            avg_iterations += num_iterations
        else:
            sys.stderr.write("does not converge :(\n")
        sys.stdout.write(" ".join(best_tags) + "\n")
        sys.stdout.write(" ".join(tags2) + "\n")
        sys.stdout.write("\n")

    # Guard every ratio: nothing may have converged, or the dataset may be
    # empty; report zeros instead of raising ZeroDivisionError.
    if converges:
        wrong_pct = wrong * 100 / converges
        best_mean = best_acc / converges
        second_mean = fst_acc / converges
        mean_iterations = avg_iterations / converges
    else:
        wrong_pct = best_mean = second_mean = mean_iterations = 0
    convergence_pct = converges * 100 / i if i else 0

    sys.stderr.write("\nsystem performance\n--------------------\n")
    sys.stderr.write("\n" + str(wrong_pct) + "% sequences are wrong:\n")
    sys.stderr.write("\naverage accuracy of best: " + str(best_mean) + "\n")
    sys.stderr.write("average accuracy of 2nd best: " + str(second_mean) + "\n")

    sys.stderr.write("\nsystem efficiency\n---------------------\n")
    sys.stderr.write("\n" + str(mean_iterations) + " iterations on average\n")
    sys.stderr.write(str(convergence_pct) + " % convergence\n")
    sys.stderr.write("time_taken = " + str(time.time() - start_time) + "\n")