Example #1

from argparse import Namespace

# Project-local modules assumed to be importable: learn (bitext loading),
# skinnyhmm (MFS and Viterbi taggers), searches (beam searches), guarani
# (test-set line ids), plus the BEAMWIDTH and UNTRANSLATED constants.

def randomvalidate(model, lm, emissions, cfd):
    """Evaluate one tagging model on the Guarani test sentences and print
    per-sentence and overall accuracy."""
    args = Namespace()
    args.targetlang = "gn"
    ## args.sourcetext = "/space/Europarl_Intersection_preprocessed/intersection.en.txt.ascii.taggedlemmas"
    ## args.targettext = "/space/Europarl_Intersection_preprocessed/intersection.es.txt.lemmas"
    ## args.alignments = "/space/output_en_es/training.align"
    args.sourcetext = "/space/es_gn_bibles/bible.es.txt"
    args.targettext = "/space/es_gn_bibles/bible.gn.txt"
    args.alignments = "/space/output_es_gn/training.align"
    args.fast = False

    # Load the Spanish-Guarani bitext and pair each source sentence with the
    # corresponding target-language sequence used as the gold reference.
    triple_sentences = learn.load_bitext(args)
    tl_sentences = learn.get_target_language_sentences(triple_sentences)
    sl_sentences = [source for (source, target, align) in triple_sentences]
    sentence_pairs = list(zip(sl_sentences, tl_sentences))

    # Model components bundled together for the MEMM beam search.
    hmmparts = HMMParts(lm, emissions, cfd)

    totalcorrect = 0
    totalwords = 0

    for lineid in guarani.testset:
        (ss, ts) = sentence_pairs[lineid]
        print(" ".join(ss))

        # Tag the source sentence with the requested model.
        if model == "unigram":
            tagged = skinnyhmm.mfs(cfd, ss)
        elif model == "bigram":
            tagged = skinnyhmm.viterbi(lm, emissions, cfd, ss)
        elif model == "trigram":
            tagged = searches.beam(lm, emissions, cfd, ss, beamwidth=BEAMWIDTH)
        elif model == "memm":
            tagged = searches.beam_memm(ss, hmmparts, beamwidth=BEAMWIDTH)
        else:
            raise ValueError("unknown model: " + model)

        print("ORIGINAL:", list(zip(ss,ts)))
        print("TAGGED:", tagged)

        predicted = [t for (s,t) in tagged]
        correct = 0
        print(list(zip(ts, predicted)))
        for actual, pred in zip(ts, predicted):
            if actual == UNTRANSLATED: continue
            totalwords += 1
            if actual == pred:
                correct += 1
                totalcorrect += 1
        print("sentence accuracy:", correct / len(ss))
        print("considered words:", len(ss))
    accuracy = totalcorrect / totalwords
    print("accuracy:", accuracy)
    print("considered words:", totalwords)
    return accuracy