def main():
    hmm_file = sys.argv[1]
    tag_file = sys.argv[2]
    hmm, tagset = get_param_tagset(hmm_file, tag_file)

    ts, test_tags = data_reader.read_tagging_data(sys.argv[3])
    test_sentences = replace_test(ts, hmm, tagset)

    index = 0
    for sentence in test_sentences:
        index += 1
        print ' '.join(ts[index-1])
	#sentence = "`` We would have to wait until we have collected on those assets before we can move forward , '' he said ."
	#sentence = replace_test([sentence.split(' ')], hmm, tagset)[0]
	#sys.stderr.write(str(index) + " " + ' '.join(sentence) + "\n")

	tagmap = filter_tagset(sentence, tagset, hmm)
	tagseqs = find_all_tagseqs(len(sentence), tagmap, "", [])

	resultmap = {}
	for tagline in tagseqs:
	    tagseq = tagline.strip().split(' ')
	    score = find_log_prob(hmm, tagset, sentence, tagseq)
	    if score != '':
		key = ' '.join(tagseq)
		resultmap[key] = score
	sorted_x = sorted(resultmap.iteritems(), key=operator.itemgetter(1))[-10:]
	for k, v in sorted_x:
	    print k, '\t', "{0:.2f}".format(v)
        print
        if index > 100:
           break 
def execute(dataset, hmm_file, tag_file, k):
    hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file)
    ts, test_tags = data_reader.read_tagging_data(dataset)
    test_sentences = replace_test(ts, hmm, tagset)

    i = 0
    conv_rates = [] # to keep track of the convergence rates varying with k
    for j in range(k):
        conv_rates.append(0.0)

    for sentence in test_sentences:
        k_best = []
        if True: 
            i += 1
            truetags = test_tags[test_sentences.index(sentence)]

            sys.stderr.write('\n' + str(i)+ '\n')
            sys.stderr.write(' '.join(sentence) + "\n")
            print ' '.join(sentence)
            #TODO remove redundancy
            best_tags, num_iter, second_best, sb2 = dd_tagger_fst.run(sentence, tagset, hmm)
            conv_rates[0] += 1
            k_best.append(best_tags)
            #next_best, num_iter = dd_k_best.run(sentence, tagset, hmm, k_best)
            if num_iter == -1:
                sys.stderr.write("2nd best does not converge :( \n")
                #print ' '.join(best_tags)
                #print
                #continue
            j = 2 # we have the best, and the second best now
            conv_rates[j-1] += 1
            sys.stderr.write(str(j) + " best converges in " + str(num_iter) + " iterations \n")
            k_best.append(second_best)
         
            while j < k:
                next_best, num_iter = dd_k_best.run(sentence, tagset, hmm, k_best)
                k_best.append(next_best)
                if num_iter != -1:
                    conv_rates[j] += 1
                    #k_best.append(next_best)
                    sys.stderr.write(str(j+1) + " best converges in " + str(num_iter) + " iterations \n")
                else:
                    sys.stderr.write(str(j+1) + "th best does not converge\n")
                    #break
                j += 1

        for best in k_best:
            print ' '.join(best)                
        print

    for j in range(k):
        sys.stderr.write("convergence rate of " + str(j) + " best = " + str(conv_rates[j]*100/conv_rates[0]) + "% \n")
def execute(dataset, hmm_file, tag_file):
    hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file)
    ts, test_tags = data_reader.read_tagging_data(dataset)
    test_sentences = replace_test(ts, hmm, tagset)

    i = 0
    conv = 0
    sec_conv = 0
    for sentence in test_sentences:
       
        if True: #len(sentence) < 15:
            i += 1
            truetags = test_tags[test_sentences.index(sentence)]
            
            sys.stderr.write('\n' + str(i)+ '\n')
            sys.stderr.write(' '.join(sentence) + "\n")
            print ' '.join(sentence)
            
            k_best = []
            best_tags, num_iter, tags1, tags2 = dd_tagger_fst.run(sentence, tagset, hmm)
            if tags2 == best_tags:
                sys.stderr.write("YOU ARE WRONG!\n")
            if num_iter != -1:
                sec_conv += 1
                sys.stderr.write("2nd best converges in " + str(num_iter) + "\n")
                k_best.append(best_tags)
                k_best.append(tags1)
                
                third_best, num_iter2 = dd_k_best.run(sentence, tagset, hmm, k_best)
                if num_iter2 != -1:
                    sys.stderr.write("3rd best converges in " + str(num_iter2) + "\n")
                    conv += 1
                    k_best.append(third_best)
                    fourth_best, num_iter3 = dd_k_best.run(sentence, tagset, hmm, k_best)
                    sys.stderr.write("4th best converges in " + str(num_iter3) + "\n")
                    print ' '.join(best_tags)
                    print ' '.join(tags2)
                    print ' '.join(third_best)
                    print ' '.join(fourth_best)
                else:
                    sys.stderr.write( "3rd best does not converge :(\n")
            else:
                sys.stderr.write("2nd best does not converge :(\n")
                continue
            print
    
    sys.stderr.write("% convergence of 2nd best =" + str(sec_conv*100/i) + "\n")
    sys.stderr.write("% convergence of 3rd best =" + str(conv*100/sec_conv) + "\n")
Exemple #4
0
def execute(dataset, hmm_file, tag_file):
    """Tag every non-empty sentence of a dataset and report accuracy/convergence.

    Args:
        dataset: path handed to ``data_reader.read_tagging_data``.
        hmm_file, tag_file: paths handed to ``hmm_utils.get_param_tagset``.

    For each non-empty sentence the original words, the two tag sequences
    returned by ``dd_tagger_fst.run`` and the gold tags are emitted through
    ``tagprint``. Accuracies (via ``evaluate.accuracy``), iteration counts
    and the final averages are written to stderr. An iteration count of -1
    is treated as "did not converge".
    """
    sys.stderr.write("loading learnt parameters...\n")
    hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file)

    sys.stderr.write("reading dev data...\n")
    # Keep the sentences as read so they can still be shown after rare-word
    # replacement. NOTE(review): assumes replace_test returns new lists rather
    # than mutating its input -- confirm against replace_test.
    test_sents_not_rare, test_tags = data_reader.read_tagging_data(dataset)
    test_sentences = replace_test(test_sents_not_rare, hmm, tagset)

    i = 0
    converges = 0
    avg_iterations = 0
    start_time = time.time()

    # Walk by position: looking the sentence up with list.index() would return
    # the first of several identical sentences and pair it with the wrong
    # gold tags.
    for pos, sentence in enumerate(test_sentences):
        if len(sentence) == 0:
            continue
        i += 1
        truetags = test_tags[pos]

        sys.stderr.write('\n' + str(i) + '\n')
        tagprint(test_sents_not_rare[pos])
        best_tags, num_iterations, tags1, tags2 = dd_tagger_fst.run(sentence, tagset, hmm)

        tagprint(best_tags)
        tagprint(tags2)
        if num_iterations != -1:
            sys.stderr.write("fst tagger accuracy = " + str(evaluate.accuracy(truetags, tags2)) + "\n")
            sys.stderr.write("best tags accuracy = " + str(evaluate.accuracy(truetags, best_tags)) + "\n")
            sys.stderr.write("converges in " + str(num_iterations) + " iterations \n")
            converges += 1
            avg_iterations += num_iterations
        else:
            print
            sys.stderr.write("does not converge :(\n")
        tagprint(truetags)
        print

    # Guard the denominators: nothing may have converged, or the dataset may
    # contain no non-empty sentence at all.
    if converges:
        sys.stderr.write("\n" + str(avg_iterations / converges) + " iterations on average\n")
    else:
        sys.stderr.write("\nno sentence converged\n")
    if i:
        sys.stderr.write(str(converges * 100 / i) + " % convergence\n")
    sys.stderr.write("time_taken = " + str(time.time() - start_time) + "\n")
Exemple #5
0
def main():
    hmm_file = sys.argv[1]
    tag_file = sys.argv[2]
    hmm, tagset = get_param_tagset(hmm_file, tag_file)

    sentence = "We 're about to see if advertising works ."
    sentence = replace_test([sentence.split(' ')], hmm, tagset)[0]
    sys.stderr.write(' '.join(sentence) + "\n")

    tagmap = filter_tagset(sentence, tagset, hmm)
    tagseqs = find_all_tagseqs(len(sentence), tagmap, "", [])

    resultmap = {}
    for tagline in tagseqs:
        tagseq = tagline.strip().split(' ')
        score = find_log_prob(hmm, tagset, sentence, tagseq)
        if score != '':
            key = ' '.join(tagseq)
            resultmap[key] = score
    sorted_x = sorted(resultmap.iteritems(), key=operator.itemgetter(1))
    for k, v in sorted_x:
        print k,"{0:.2f}".format(v)
def execute(dataset, hmm_file, tag_file):
    # sys.stderr.write("loading learnt parameters...\n")
    hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file)

    # sys.stderr.write("reading dev data...\n")
    ts, test_tags = data_reader.read_tagging_data(dataset)
    test_sentences = replace_test(ts, hmm, tagset)

    i = 0
    converges = 0
    avg_iterations = 0
    start_time = time.time()

    fst_acc = 0
    best_acc = 0
    wrong = 0
    for sentence in test_sentences:

        if True:
            i += 1
            truetags = test_tags[test_sentences.index(sentence)]

            sys.stderr.write("\n" + str(i) + "\n")
            # sys.stderr.write(' '.join(ts[i-1]) + "\n")
            print " ".join(sentence)
            # print ' '.join(ts[i-1])

            best_tags, num_iterations, tags1, tags2 = dd_tagger_fst.run(sentence, tagset, hmm)
            if tags2 == best_tags:
                sys.stderr.write("YOU ARE WRONG!\n")
                wrong += 1

            if num_iterations != -1:
                facc = evaluate.accuracy(truetags, tags2)
                # sys.stderr.write("fst tagger accuracy = " + str(facc) + "\n")
                fst_acc += facc

                bacc = evaluate.accuracy(truetags, best_tags)
                # sys.stderr.write("best tags accuracy = " + str(bacc) + "\n")
                best_acc += bacc

                sys.stderr.write("converges in " + str(num_iterations) + " iterations \n")
                converges += 1
                avg_iterations += num_iterations
            else:
                sys.stderr.write("does not converge :(\n")
            print " ".join(best_tags)
            print " ".join(tags2)
            # print "gold  : ", ' '.join(truetags)
            print

            # if i == 100:
            # break
    sys.stderr.write("\nsystem performance\n--------------------\n")
    sys.stderr.write("\n" + str(wrong * 100 / converges) + "% sequences are wrong:\n")
    sys.stderr.write("\naverage accuracy of best: " + str(best_acc / converges) + "\n")
    sys.stderr.write("average accuracy of 2nd best: " + str(fst_acc / converges) + "\n")

    sys.stderr.write("\nsystem efficiency\n---------------------\n")
    sys.stderr.write("\n" + str(avg_iterations / converges) + " iterations on average\n")
    sys.stderr.write(str(converges * 100 / i) + " % convergence\n")
    sys.stderr.write("time_taken = " + str(time.time() - start_time) + "\n")
'''
Replaces all emissions with frequency <= 5 with the word
-RARE-
'''
def smooth_emission(emission_counts, rare_threshold=5):
    """Fold low-frequency emissions into a per-tag ``-RARE-`` pseudo-word.

    Args:
        emission_counts: mapping from ``"<tag>~><word>"`` keys to counts.
        rare_threshold: entries whose count is less than or equal to this
            value are treated as rare. Defaults to 5.

    Returns:
        A ``defaultdict`` without a default factory (missing keys still raise
        ``KeyError``) in which every rare entry has been added to
        ``"<tag>~>-RARE-"`` and every other entry is carried over.

    Raises:
        ValueError: if a rare key does not contain the ``~>`` separator.
    """
    e_counts = defaultdict()
    for key, val in emission_counts.items():
        if val <= rare_threshold:
            # Split on the first separator only, so a word that itself
            # contains '~>' does not break the unpacking.
            tag, _word = key.split('~>', 1)
            key = tag + '~>-RARE-'
        # Always accumulate: an input that already has a '<tag>~>-RARE-' entry
        # must add to the folded mass rather than overwrite it (or be
        # overwritten), whatever the iteration order.
        e_counts[key] = e_counts.get(key, 0) + val

    return e_counts

if __name__=='__main__':
    # Script entry point: dump the test sentences, one word per line, after
    # they have been passed through replace_test.
    #   sys.argv[1]              -> tagged test data read by data_reader
    #   sys.argv[2], sys.argv[3] -> arguments of hmm_utils.get_param_tagset
    #replace(sys.argv[1])
    test_sent_tags = sys.argv[1]
    hmm, tagset = hmm_utils.get_param_tagset(sys.argv[2], sys.argv[3])
    ts, test_tags = data_reader.read_tagging_data(test_sent_tags)
    test_sentences = replace_test(ts, hmm, tagset)

    # NOTE: the loop variable reuses the name `ts`, so after the loop `ts` no
    # longer refers to the sentences returned by read_tagging_data.
    for ts in test_sentences:
        for word in ts:
            print word
        # A blank line separates consecutive sentences.
        print