class UnknownWordTagger:

    """ Fallback tagger for words that are unknown to the main model,
        using a BÍN database lookup plus an n-gram tagger. """

    def __init__(self):
        self._ngram_tagger = NgramTagger()

    def tagset(self, word, at_sentence_start=False):
        """ Return a list of (tag, probability) tuples for the given
            single-word list.

            Note: the tuple order is (tag, probability) — tag() below
            sorts on index 1 as the probability and returns index 0 as
            the tag. The original docstring claimed the opposite order.
        """
        toklist = list(parse_tokens(" ".join(word)))
        token = toklist[0]
        w = word[0]
        if token.kind == TOK.WORD and token.val is None:
            # The tokenizer didn't recognize the word: try a direct
            # BÍN database lookup, best-effort — fall back to an empty
            # meaning list if the database is unavailable
            try:
                with BIN_Db.get_db() as db:
                    w, m = db.lookup_word(token.txt, at_sentence_start)
            except Exception:
                # Fix: the exception object was previously bound to an
                # unused variable 'e'
                w, m = token.txt, []
            token = TOK.Word(w, m)
        return self._ngram_tagger.tag_single_token(token)

    def tag(self, word, at_sentence_start=False):
        """ Return a list with a single (word, tag) tuple for the given
            word list, containing a single word """
        taglist = self.tagset(word, at_sentence_start)
        w = word[0]
        if taglist:
            # Sort in descending order of probability
            taglist.sort(key=lambda x: x[1], reverse=True)
            # Return the most likely tag
            return [(w, taglist[0][0])]
        # No taglist: give up and return 'Unk' as the tag
        return [(w, 'Unk')]
class UnknownWordTagger:

    """ Assigns a likely tag to a word that the main tagging model
        does not know, via a BÍN lookup and an n-gram tagger. """

    def __init__(self):
        self._ngram_tagger = NgramTagger()

    def tagset(self, word, at_sentence_start=False):
        """ Return a list of (probability, tag) tuples for the given word """
        tokens = list(parse_tokens(" ".join(word)))
        token = tokens[0]
        w = word[0]
        if token.kind == TOK.WORD and token.val is None:
            # Unrecognized word: consult the BÍN database, falling back
            # to an empty meaning list if the lookup fails
            try:
                with BIN_Db.get_db() as db:
                    w, m = db.lookup_word(token.txt, at_sentence_start)
            except Exception as e:
                w, m = token.txt, []
            token = TOK.Word(w, m)
        return self._ngram_tagger.tag_single_token(token)

    def tag(self, word, at_sentence_start=False):
        """ Return a list with a single (word, tag) tuple for the given
            word list, containing a single word """
        w = word[0]
        candidates = self.tagset(word, at_sentence_start)
        if not candidates:
            # Nothing known about this word: return the 'Unk' tag
            return [(w, 'Unk')]
        # Pick the entry with the highest probability (second element);
        # max() returns the first maximal entry, matching a stable
        # descending sort followed by taking the head
        best = max(candidates, key=lambda pair: pair[1])
        return [(w, best[0])]
def test_tagger():
    """ Load (or optionally retrain) an n-gram tagger and a TnT tagger,
        run them over a slice of the IFD corpus, print a per-word grade
        for each tagger and summary accuracy statistics at the end. """
    print("Initializing tagger")

    # Number of training and test sentences
    TRAINING_SET = 500
    IFD_TRAINING_SET = 21000  # There are only about 20.800 sentences in the IFD corpus
    TEST_SET = 400
    BEAM_SIZE = 250  # A higher number does not seem to yield improved results

    # noinspection PyUnreachableCode
    if False:
        # Training branch, deliberately disabled: build fresh models and
        # store the TnT model to disk. Flip the condition to retrain.
        tnt_tagger = TnT(N=BEAM_SIZE, C=True)
        tagger = NgramTagger(n=3, verbose=False)
        # Create a new model and store it
        with timeit("Train NgramTagger"):
            # Get a sentence stream from parsed articles
            # Number of sentences, size of training set
            sentence_stream = Article.sentence_stream(limit=TRAINING_SET, skip=TEST_SET)
            tagger.train(sentence_stream)
        with timeit("Train TnT_Tagger on articles"):
            # Get a sentence stream from parsed articles
            # Number of sentences, size of training set
            sentence_stream = Article.sentence_stream(limit=TRAINING_SET, skip=TEST_SET)
            word_tag_stream = IFD_Tagset.word_tag_stream(sentence_stream)
            tnt_tagger.train(word_tag_stream)
        with timeit("Train TnT_Tagger on IFD"):
            # Get a sentence stream from parsed articles
            # Number of sentences, size of training set
            word_tag_stream = IFD_Corpus().word_tag_stream(
                limit=IFD_TRAINING_SET, skip=TEST_SET)
            tnt_tagger.train(word_tag_stream)
        with timeit("Store TnT model"):
            tnt_tagger.store(_TNT_MODEL_FILE)
    else:
        # Evaluation-only branch: no n-gram tagger, load a stored TnT model
        tagger = None
        # Load an existing model
        with timeit("load_model()"):
            tnt_tagger = TnT.load(_TNT_MODEL_FILE)
        if tnt_tagger is None:
            print(
                f"Unable to load TnT model from {_TNT_MODEL_FILE}, test aborted"
            )
            return

    #tagger.show_model()
    #return

    # Accumulators updated via nonlocal from test_ifd_file() below
    total_tags = 0
    correct_tag = 0
    partial_tag = 0
    missing_tag = 0
    correct_tag_tnt = 0
    partial_tag_tnt = 0
    missing_tag_tnt = 0

    def simple_test(session):
        # Smoke test: tag one hard-coded sentence and print the result
        txt = "Þau segja að börn hafi gott af því."
        toklist = tokenize(txt, enclosing_session=session)
        dlist = tagger.tag(toklist)
        print("Sentence: '{0}'".format(txt))
        print("Tagging result:\n{0}".format("\n".join(str(d) for d in dlist)))

    def article_test(session):
        # Tag sentences from parsed articles and print the results
        sentence_stream = Article.sentence_stream(limit=TEST_SET)
        for sent in sentence_stream:
            txt = " ".join(t["x"] for t in sent if "x" in t)
            if txt:
                toklist = tokenize(txt, enclosing_session=session)
                dlist = tagger.tag(toklist)
                print("Sentence: '{0}'".format(txt))
                print("Tagging result:\n{0}".format("\n".join(
                    str(d) for d in dlist)))

    def test_ifd_file(session):
        # Compare both taggers against the gold tags of the IFD test set,
        # updating the enclosing accumulator counters
        print("\n\n*** IFD TEST SET ***\n\n")
        gen = IFD_Corpus().raw_sentence_stream(limit=TEST_SET)
        dlist = None
        for sent in gen:
            # Each sentence is a list of (word, tag, lemma) triples
            orðalisti = [triple[0] for triple in sent]
            mörk_OTB = [triple[1] for triple in sent]
            lemmur_OTB = [triple[2] for triple in sent]
            txt = " ".join(orðalisti)
            if tagger is not None:
                toklist = tokenize(txt, enclosing_session=session)
                dlist = tagger.tag(toklist)
            tntlist = tnt_tagger.tag(orðalisti)
            ix = 0
            print("\n{0}\n".format(txt))
            for tag, lemma, word, tnt_wt in zip(mörk_OTB, lemmur_OTB, orðalisti, tntlist):
                tnt_tag = tnt_wt[1]
                # Scan forward in dlist from the last match to align the
                # n-gram tagger's token list with the corpus word list
                j = ix
                if dlist is None:
                    gtag = "?"
                else:
                    while j < len(dlist) and dlist[j].get("x", "") != word:
                        j += 1
                    if j < len(dlist):
                        ix = j
                        gtag = dlist[ix].get("i", "?")
                        if gtag == "?" and dlist[ix].get(
                                "k") == TOK.PUNCTUATION:
                            # Punctuation tokens are tagged with themselves
                            gtag = word
                        ix += 1
                    else:
                        gtag = "?"

                def grade(gtag):
                    # "M" = missing, " " = correct, "P" = partial
                    # (first letter of tag matches), "E" = error;
                    # closes over the gold 'tag' of the current word
                    if gtag == "?" and tag != "?":
                        return "M"
                    if gtag == tag:
                        return " "
                    if gtag[0] == tag[0]:
                        return "P"
                    return "E"

                grade_g = grade(gtag)
                grade_tnt = grade(tnt_tag)
                # NOTE(review): grade() is recomputed inside the print call
                # although grade_g/grade_tnt already hold the same values
                print("{0:20} | {1:20} | {2:8} | {3:8} | {4} | {5:8} | {6}".
                      format(word, lemma or word, tag, gtag, grade(gtag),
                             tnt_tag, grade(tnt_tag)))
                nonlocal total_tags, missing_tag, correct_tag, partial_tag
                nonlocal missing_tag_tnt, correct_tag_tnt, partial_tag_tnt
                total_tags += 1
                if grade_g == "M":
                    missing_tag += 1
                elif grade_g == " ":
                    correct_tag += 1
                elif grade_g == "P":
                    partial_tag += 1
                if grade_tnt == "M":
                    missing_tag_tnt += 1
                elif grade_tnt == " ":
                    correct_tag_tnt += 1
                elif grade_tnt == "P":
                    partial_tag_tnt += 1

    with SessionContext(read_only=True, commit=True) as session:
        #simple_test(session)
        #article_test(session)
        test_ifd_file(session)

    # Print summary statistics for both taggers
    if total_tags:
        print("\n-----------------------------------\n")
        print("Total tags: {0:8}".format(total_tags))
        print("\nNgram tagger:\n")
        print("Missing tags: {0:8} {1:6.2f}%".format(
            missing_tag, 100.0 * missing_tag / total_tags))
        print("Tagged: {0:8} {1:6.2f}%".format(
            total_tags - missing_tag,
            100.0 * (total_tags - missing_tag) / total_tags))
        print("Correct tags: {0:8} {1:6.2f}%".format(
            correct_tag, 100.0 * correct_tag / total_tags))
        print("Partial tags: {0:8} {1:6.2f}%".format(
            partial_tag + correct_tag,
            100.0 * (partial_tag + correct_tag) / total_tags))
        print("Partial prec: {0:8} {1:6.2f}%".format(
            "", 100.0 * (partial_tag + correct_tag) /
            (total_tags - missing_tag)))
        print("Precision: {0:8} {1:6.2f}%".format(
            "", 100.0 * correct_tag / (total_tags - missing_tag)))
        print("\nTnT tagger:\n")
        print("Missing tags: {0:8} {1:6.2f}%".format(
            missing_tag_tnt, 100.0 * missing_tag_tnt / total_tags))
        print("Tagged: {0:8} {1:6.2f}%".format(
            total_tags - missing_tag_tnt,
            100.0 * (total_tags - missing_tag_tnt) / total_tags))
        print("Correct tags: {0:8} {1:6.2f}%".format(
            correct_tag_tnt, 100.0 * correct_tag_tnt / total_tags))
        print("Partial tags: {0:8} {1:6.2f}%".format(
            partial_tag_tnt + correct_tag_tnt,
            100.0 * (partial_tag_tnt + correct_tag_tnt) / total_tags))
        print("Partial prec: {0:8} {1:6.2f}%".format(
            "", 100.0 * (partial_tag_tnt + correct_tag_tnt) /
            (total_tags - missing_tag_tnt)))
        print("Precision: {0:8} {1:6.2f}%".format(
            "", 100.0 * correct_tag_tnt / (total_tags - missing_tag_tnt)))
        print("\n-----------------------------------\n")
def __init__(self):
    # Create the n-gram tagger used by this instance
    # NOTE(review): method fragment — the enclosing class is not visible here
    self._ngram_tagger = NgramTagger()
def test_tagger():
    """ Load (or optionally retrain) an n-gram tagger and a TnT tagger,
        run them over a slice of the IFD corpus, print a per-word grade
        for each tagger and summary accuracy statistics at the end. """
    print("Initializing tagger")

    # Number of training and test sentences
    TRAINING_SET = 500
    IFD_TRAINING_SET = 21000  # There are only about 20.800 sentences in the IFD corpus
    TEST_SET = 400
    BEAM_SIZE = 250  # A higher number does not seem to yield improved results

    # noinspection PyUnreachableCode
    if False:
        # Training branch, deliberately disabled: build fresh models and
        # store the TnT model to disk. Flip the condition to retrain.
        tnt_tagger = TnT(N = BEAM_SIZE, C = True)
        tagger = NgramTagger(n = 3, verbose = False)
        # Create a new model and store it
        with timeit("Train NgramTagger"):
            # Get a sentence stream from parsed articles
            # Number of sentences, size of training set
            sentence_stream = Article.sentence_stream(limit = TRAINING_SET,
                skip = TEST_SET)
            tagger.train(sentence_stream)
        with timeit("Train TnT_Tagger on articles"):
            # Get a sentence stream from parsed articles
            # Number of sentences, size of training set
            sentence_stream = Article.sentence_stream(limit = TRAINING_SET,
                skip = TEST_SET)
            word_tag_stream = IFD_Tagset.word_tag_stream(sentence_stream)
            tnt_tagger.train(word_tag_stream)
        with timeit("Train TnT_Tagger on IFD"):
            # Get a sentence stream from parsed articles
            # Number of sentences, size of training set
            word_tag_stream = IFD_Corpus().word_tag_stream(limit = IFD_TRAINING_SET,
                skip = TEST_SET)
            tnt_tagger.train(word_tag_stream)
        with timeit("Store TnT model"):
            tnt_tagger.store(_TNT_MODEL_FILE)
    else:
        # Evaluation-only branch: no n-gram tagger, load a stored TnT model
        tagger = None
        # Load an existing model
        with timeit("load_model()"):
            tnt_tagger = TnT.load(_TNT_MODEL_FILE)
        if tnt_tagger is None:
            print(f"Unable to load TnT model from {_TNT_MODEL_FILE}, test aborted")
            return

    #tagger.show_model()
    #return

    # Accumulators updated via nonlocal from test_ifd_file() below
    total_tags = 0
    correct_tag = 0
    partial_tag = 0
    missing_tag = 0
    correct_tag_tnt = 0
    partial_tag_tnt = 0
    missing_tag_tnt = 0

    def simple_test(session):
        # Smoke test: tag one hard-coded sentence and print the result
        txt = "Þau segja að börn hafi gott af því."
        toklist = tokenize(txt, enclosing_session = session)
        dlist = tagger.tag(toklist)
        print("Sentence: '{0}'".format(txt))
        print("Tagging result:\n{0}".format("\n".join(str(d) for d in dlist)))

    def article_test(session):
        # Tag sentences from parsed articles and print the results
        sentence_stream = Article.sentence_stream(limit = TEST_SET)
        for sent in sentence_stream:
            txt = " ".join(t["x"] for t in sent if "x" in t)
            if txt:
                toklist = tokenize(txt, enclosing_session = session)
                dlist = tagger.tag(toklist)
                print("Sentence: '{0}'".format(txt))
                print("Tagging result:\n{0}".format("\n".join(str(d) for d in dlist)))

    def test_ifd_file(session):
        # Compare both taggers against the gold tags of the IFD test set,
        # updating the enclosing accumulator counters
        print("\n\n*** IFD TEST SET ***\n\n")
        gen = IFD_Corpus().raw_sentence_stream(limit = TEST_SET)
        dlist = None
        for sent in gen:
            # Each sentence is a list of (word, tag, lemma) triples
            orðalisti = [ triple[0] for triple in sent ]
            mörk_OTB = [ triple[1] for triple in sent ]
            lemmur_OTB = [ triple[2] for triple in sent ]
            txt = " ".join(orðalisti)
            if tagger is not None:
                toklist = tokenize(txt, enclosing_session = session)
                dlist = tagger.tag(toklist)
            tntlist = tnt_tagger.tag(orðalisti)
            ix = 0
            print("\n{0}\n".format(txt))
            for tag, lemma, word, tnt_wt in zip(mörk_OTB, lemmur_OTB, orðalisti, tntlist):
                tnt_tag = tnt_wt[1]
                # Scan forward in dlist from the last match to align the
                # n-gram tagger's token list with the corpus word list
                j = ix
                if dlist is None:
                    gtag = "?"
                else:
                    while j < len(dlist) and dlist[j].get("x", "") != word:
                        j += 1
                    if j < len(dlist):
                        ix = j
                        gtag = dlist[ix].get("i", "?")
                        if gtag == "?" and dlist[ix].get("k") == TOK.PUNCTUATION:
                            # Punctuation tokens are tagged with themselves
                            gtag = word
                        ix += 1
                    else:
                        gtag = "?"

                def grade(gtag):
                    # "M" = missing, " " = correct, "P" = partial
                    # (first letter of tag matches), "E" = error;
                    # closes over the gold 'tag' of the current word
                    if gtag == "?" and tag != "?":
                        return "M"
                    if gtag == tag:
                        return " "
                    if gtag[0] == tag[0]:
                        return "P"
                    return "E"

                grade_g = grade(gtag)
                grade_tnt = grade(tnt_tag)
                # NOTE(review): grade() is recomputed inside the print call
                # although grade_g/grade_tnt already hold the same values
                print("{0:20} | {1:20} | {2:8} | {3:8} | {4} | {5:8} | {6}"
                      .format(word, lemma or word, tag, gtag, grade(gtag),
                              tnt_tag, grade(tnt_tag)))
                nonlocal total_tags, missing_tag, correct_tag, partial_tag
                nonlocal missing_tag_tnt, correct_tag_tnt, partial_tag_tnt
                total_tags += 1
                if grade_g == "M":
                    missing_tag += 1
                elif grade_g == " ":
                    correct_tag += 1
                elif grade_g == "P":
                    partial_tag += 1
                if grade_tnt == "M":
                    missing_tag_tnt += 1
                elif grade_tnt == " ":
                    correct_tag_tnt += 1
                elif grade_tnt == "P":
                    partial_tag_tnt += 1

    with SessionContext(read_only = True, commit = True) as session:
        #simple_test(session)
        #article_test(session)
        test_ifd_file(session)

    # Print summary statistics for both taggers
    if total_tags:
        print("\n-----------------------------------\n")
        print("Total tags: {0:8}".format(total_tags))
        print("\nNgram tagger:\n")
        print("Missing tags: {0:8} {1:6.2f}%"
              .format(missing_tag, 100.0 * missing_tag / total_tags))
        print("Tagged: {0:8} {1:6.2f}%"
              .format(total_tags - missing_tag,
                      100.0 * (total_tags - missing_tag) / total_tags))
        print("Correct tags: {0:8} {1:6.2f}%"
              .format(correct_tag, 100.0 * correct_tag / total_tags))
        print("Partial tags: {0:8} {1:6.2f}%"
              .format(partial_tag + correct_tag,
                      100.0 * (partial_tag + correct_tag) / total_tags))
        print("Partial prec: {0:8} {1:6.2f}%"
              .format("", 100.0 * (partial_tag + correct_tag) /
                      (total_tags - missing_tag)))
        print("Precision: {0:8} {1:6.2f}%"
              .format("", 100.0 * correct_tag / (total_tags - missing_tag)))
        print("\nTnT tagger:\n")
        print("Missing tags: {0:8} {1:6.2f}%"
              .format(missing_tag_tnt, 100.0 * missing_tag_tnt / total_tags))
        print("Tagged: {0:8} {1:6.2f}%"
              .format(total_tags - missing_tag_tnt,
                      100.0 * (total_tags - missing_tag_tnt) / total_tags))
        print("Correct tags: {0:8} {1:6.2f}%"
              .format(correct_tag_tnt, 100.0 * correct_tag_tnt / total_tags))
        print("Partial tags: {0:8} {1:6.2f}%"
              .format(partial_tag_tnt + correct_tag_tnt,
                      100.0 * (partial_tag_tnt + correct_tag_tnt) / total_tags))
        print("Partial prec: {0:8} {1:6.2f}%"
              .format("", 100.0 * (partial_tag_tnt + correct_tag_tnt) /
                      (total_tags - missing_tag_tnt)))
        print("Precision: {0:8} {1:6.2f}%"
              .format("", 100.0 * correct_tag_tnt / (total_tags - missing_tag_tnt)))
        print("\n-----------------------------------\n")