def randomvalidate(model, lm, emissions, cfd):
    """Evaluate a tagging model on the held-out Guarani test sentences.

    Loads the es->gn Bible bitext, tags each sentence listed in
    ``guarani.testset`` with the requested model, and prints per-sentence
    and overall tagging accuracy. Words whose gold tag is ``UNTRANSLATED``
    are excluded from both the per-sentence and the overall counts.

    Args:
        model: one of "unigram", "bigram", "trigram", or "memm".
        lm: language model used by the bigram/trigram/MEMM taggers.
        emissions: emission distribution used by the HMM-based taggers.
        cfd: conditional frequency distribution over word/tag counts.

    Raises:
        ValueError: if ``model`` is not a recognized model name.
    """
    args = Namespace()
    args.targetlang = "gn"
    args.sourcetext = "/space/es_gn_bibles/bible.es.txt"
    args.targettext = "/space/es_gn_bibles/bible.gn.txt"
    args.alignments = "/space/output_es_gn/training.align"
    args.fast = False

    triple_sentences = learn.load_bitext(args)
    tl_sentences = learn.get_target_language_sentences(triple_sentences)
    sl_sentences = [source for (source, target, align) in triple_sentences]
    sentence_pairs = list(zip(sl_sentences, tl_sentences))

    hmmparts = HMMParts(lm, emissions, cfd)
    totalcorrect = 0
    totalwords = 0
    for lineid in guarani.testset:
        (ss, ts) = sentence_pairs[lineid]
        print(" ".join(ss))

        # Single dispatch chain (the original used a stray bare `if` for
        # "bigram") with an explicit error for unknown models, instead of
        # letting `tagged` be unbound and crashing with a NameError below.
        if model == "unigram":
            tagged = skinnyhmm.mfs(cfd, ss)
        elif model == "bigram":
            tagged = skinnyhmm.viterbi(lm, emissions, cfd, ss)
        elif model == "trigram":
            tagged = searches.beam(lm, emissions, cfd, ss,
                                   beamwidth=BEAMWIDTH)
        elif model == "memm":
            tagged = searches.beam_memm(ss, hmmparts, beamwidth=BEAMWIDTH)
        else:
            raise ValueError("unknown model: {}".format(model))

        print("ORIGINAL:", list(zip(ss, ts)))
        print("TAGGED:", tagged)
        predicted = [t for (s, t) in tagged]
        print(list(zip(ts, predicted)))

        correct = 0
        considered = 0
        for actual, pred in zip(ts, predicted):
            if actual == UNTRANSLATED:
                # Untranslated gold tags carry no signal; skip them.
                continue
            considered += 1
            totalwords += 1
            if actual == pred:
                correct += 1
                totalcorrect += 1

        # Score against the words actually considered (the original divided
        # by len(ss), inconsistently counting UNTRANSLATED words in the
        # per-sentence denominator while excluding them from the total).
        print("sentence accuracy:",
              correct / considered if considered else 0.0)
        print("considered words:", considered)

    accuracy = totalcorrect / totalwords
    print("accuracy:", accuracy)
    print("considered words:", totalwords)