Code example #1
0
class UnknownWordTagger:

    """ Tagger for words unknown to the main model: looks the word up
        in the BIN database and lets an n-gram tagger pick the most
        likely tag from the resulting meanings """

    def __init__(self):
        self._ngram_tagger = NgramTagger()

    def tagset(self, word, at_sentence_start=False):
        """ Return a list of (tag, probability) tuples for the given word.
            word is a list containing a single word form. """
        toklist = list(parse_tokens(" ".join(word)))
        token = toklist[0]
        w = word[0]
        if token.kind == TOK.WORD and token.val is None:
            # The tokenizer found no meanings for the word:
            # try a direct database lookup
            try:
                with BIN_Db.get_db() as db:
                    w, m = db.lookup_word(token.txt, at_sentence_start)
            except Exception:
                # Best-effort: if the lookup fails for any reason,
                # fall back to an empty meaning list
                w, m = token.txt, []
            token = TOK.Word(w, m)
        return self._ngram_tagger.tag_single_token(token)

    def tag(self, word, at_sentence_start=False):
        """ Return a list with a single (word, tag) tuple for the given
            word list, containing a single word """
        taglist = self.tagset(word, at_sentence_start)
        w = word[0]
        if taglist:
            # Sort in descending order of probability
            taglist.sort(key=lambda x: x[1], reverse=True)
            # Return the most likely tag
            return [(w, taglist[0][0])]
        # No taglist: give up and return 'Unk' as the tag
        return [(w, 'Unk')]
Code example #2
0
File: tnttagger.py — Project: vthorsteinsson/Reynir
class UnknownWordTagger:

    """ Tagger for words unknown to the main model: looks the word up
        in the BIN database and lets an n-gram tagger pick the most
        likely tag from the resulting meanings """

    def __init__(self):
        self._ngram_tagger = NgramTagger()

    def tagset(self, word, at_sentence_start=False):
        """ Return a list of (tag, probability) tuples for the given word.
            word is a list containing a single word form. """
        toklist = list(parse_tokens(" ".join(word)))
        token = toklist[0]
        w = word[0]
        if token.kind == TOK.WORD and token.val is None:
            # The tokenizer found no meanings for the word:
            # try a direct database lookup
            try:
                with BIN_Db.get_db() as db:
                    w, m = db.lookup_word(token.txt, at_sentence_start)
            except Exception:
                # Best-effort: if the lookup fails for any reason,
                # fall back to an empty meaning list
                w, m = token.txt, []
            token = TOK.Word(w, m)
        return self._ngram_tagger.tag_single_token(token)

    def tag(self, word, at_sentence_start=False):
        """ Return a list with a single (word, tag) tuple for the given
            word list, containing a single word """
        taglist = self.tagset(word, at_sentence_start)
        w = word[0]
        if taglist:
            # Sort in descending order of probability
            taglist.sort(key=lambda x: x[1], reverse=True)
            # Return the most likely tag
            return [(w, taglist[0][0])]
        # No taglist: give up and return 'Unk' as the tag
        return [(w, 'Unk')]
Code example #3
0
def test_tagger():
    """ Evaluate the n-gram tagger and the TnT tagger against the IFD
        test corpus, printing a per-word grade for each tagger and
        summary accuracy statistics at the end """

    print("Initializing tagger")

    # Number of training and test sentences
    TRAINING_SET = 500
    IFD_TRAINING_SET = 21000  # There are only about 20.800 sentences in the IFD corpus
    TEST_SET = 400
    BEAM_SIZE = 250  # A higher number does not seem to yield improved results

    # Training branch, deliberately disabled: flip to True to train
    # fresh models and store the TnT model to disk
    # noinspection PyUnreachableCode
    if False:
        tnt_tagger = TnT(N=BEAM_SIZE, C=True)
        tagger = NgramTagger(n=3, verbose=False)
        # Create a new model and store it
        with timeit("Train NgramTagger"):
            # Get a sentence stream from parsed articles
            # Number of sentences, size of training set
            sentence_stream = Article.sentence_stream(limit=TRAINING_SET,
                                                      skip=TEST_SET)
            tagger.train(sentence_stream)
        with timeit("Train TnT_Tagger on articles"):
            # Get a sentence stream from parsed articles
            # Number of sentences, size of training set
            sentence_stream = Article.sentence_stream(limit=TRAINING_SET,
                                                      skip=TEST_SET)
            word_tag_stream = IFD_Tagset.word_tag_stream(sentence_stream)
            tnt_tagger.train(word_tag_stream)
        with timeit("Train TnT_Tagger on IFD"):
            # Get a sentence stream from parsed articles
            # Number of sentences, size of training set
            word_tag_stream = IFD_Corpus().word_tag_stream(
                limit=IFD_TRAINING_SET, skip=TEST_SET)
            tnt_tagger.train(word_tag_stream)
        with timeit("Store TnT model"):
            tnt_tagger.store(_TNT_MODEL_FILE)
    else:
        # No n-gram tagger in this branch; test_ifd_file() handles tagger is None
        tagger = None
        # Load an existing model
        with timeit("load_model()"):
            tnt_tagger = TnT.load(_TNT_MODEL_FILE)
            if tnt_tagger is None:
                print(
                    f"Unable to load TnT model from {_TNT_MODEL_FILE}, test aborted"
                )
                return
    #tagger.show_model()
    #return

    # Evaluation counters, updated via nonlocal from test_ifd_file()
    total_tags = 0
    correct_tag = 0
    partial_tag = 0
    missing_tag = 0
    correct_tag_tnt = 0
    partial_tag_tnt = 0
    missing_tag_tnt = 0

    def simple_test(session):
        # Tag a single hard-wired sentence and print the result
        txt = "Þau segja að börn hafi gott af því."
        toklist = tokenize(txt, enclosing_session=session)
        dlist = tagger.tag(toklist)
        print("Sentence: '{0}'".format(txt))
        print("Tagging result:\n{0}".format("\n".join(str(d) for d in dlist)))

    def article_test(session):
        # Tag sentences streamed from parsed articles and print the results
        sentence_stream = Article.sentence_stream(limit=TEST_SET)
        for sent in sentence_stream:
            txt = " ".join(t["x"] for t in sent if "x" in t)
            if txt:
                toklist = tokenize(txt, enclosing_session=session)
                dlist = tagger.tag(toklist)
                print("Sentence: '{0}'".format(txt))
                print("Tagging result:\n{0}".format("\n".join(
                    str(d) for d in dlist)))

    def test_ifd_file(session):
        """ Run both taggers over the IFD test set, grade every word
            and accumulate the enclosing evaluation counters """
        print("\n\n*** IFD TEST SET ***\n\n")
        gen = IFD_Corpus().raw_sentence_stream(limit=TEST_SET)
        dlist = None
        for sent in gen:
            # Each sentence is a sequence of (word, tag, lemma) triples
            orðalisti = [triple[0] for triple in sent]  # Word forms
            mörk_OTB = [triple[1] for triple in sent]  # Gold tags
            lemmur_OTB = [triple[2] for triple in sent]  # Gold lemmas
            txt = " ".join(orðalisti)
            if tagger is not None:
                toklist = tokenize(txt, enclosing_session=session)
                dlist = tagger.tag(toklist)
            tntlist = tnt_tagger.tag(orðalisti)
            ix = 0
            print("\n{0}\n".format(txt))
            for tag, lemma, word, tnt_wt in zip(mörk_OTB, lemmur_OTB,
                                                orðalisti, tntlist):
                tnt_tag = tnt_wt[1]
                j = ix
                if dlist is None:
                    # No n-gram tagger output available
                    gtag = "?"
                else:
                    # Advance through the tagger output until the current
                    # corpus word is found (tokenization may differ)
                    while j < len(dlist) and dlist[j].get("x", "") != word:
                        j += 1
                    if j < len(dlist):
                        ix = j
                        gtag = dlist[ix].get("i", "?")
                        if gtag == "?" and dlist[ix].get(
                                "k") == TOK.PUNCTUATION:
                            # Punctuation is "tagged" with the token itself
                            gtag = word
                        ix += 1
                    else:
                        gtag = "?"

                def grade(gtag):
                    # "M": missing, " ": correct, "P": partial
                    # (first letter of the tag matches), "E": error
                    if gtag == "?" and tag != "?":
                        return "M"
                    if gtag == tag:
                        return " "
                    if gtag[0] == tag[0]:
                        return "P"
                    return "E"

                grade_g = grade(gtag)
                grade_tnt = grade(tnt_tag)

                # word | lemma | gold tag | ngram tag | grade | tnt tag | grade
                print("{0:20} | {1:20} | {2:8} | {3:8} | {4} | {5:8} | {6}".
                      format(word, lemma or word, tag, gtag, grade(gtag),
                             tnt_tag, grade(tnt_tag)))
                # Update the counters in the enclosing function scope
                nonlocal total_tags, missing_tag, correct_tag, partial_tag
                nonlocal missing_tag_tnt, correct_tag_tnt, partial_tag_tnt
                total_tags += 1
                if grade_g == "M":
                    missing_tag += 1
                elif grade_g == " ":
                    correct_tag += 1
                elif grade_g == "P":
                    partial_tag += 1
                if grade_tnt == "M":
                    missing_tag_tnt += 1
                elif grade_tnt == " ":
                    correct_tag_tnt += 1
                elif grade_tnt == "P":
                    partial_tag_tnt += 1

    with SessionContext(read_only=True, commit=True) as session:

        #simple_test(session)

        #article_test(session)

        test_ifd_file(session)

    if total_tags:
        # Print summary accuracy statistics for both taggers
        print("\n-----------------------------------\n")
        print("Total tags:   {0:8}".format(total_tags))
        print("\nNgram tagger:\n")
        print("Missing tags: {0:8} {1:6.2f}%".format(
            missing_tag, 100.0 * missing_tag / total_tags))
        print("Tagged:       {0:8} {1:6.2f}%".format(
            total_tags - missing_tag,
            100.0 * (total_tags - missing_tag) / total_tags))
        print("Correct tags: {0:8} {1:6.2f}%".format(
            correct_tag, 100.0 * correct_tag / total_tags))
        print("Partial tags: {0:8} {1:6.2f}%".format(
            partial_tag + correct_tag,
            100.0 * (partial_tag + correct_tag) / total_tags))
        print("Partial prec: {0:8} {1:6.2f}%".format(
            "",
            100.0 * (partial_tag + correct_tag) / (total_tags - missing_tag)))
        print("Precision:    {0:8} {1:6.2f}%".format(
            "", 100.0 * correct_tag / (total_tags - missing_tag)))
        print("\nTnT tagger:\n")
        print("Missing tags: {0:8} {1:6.2f}%".format(
            missing_tag_tnt, 100.0 * missing_tag_tnt / total_tags))
        print("Tagged:       {0:8} {1:6.2f}%".format(
            total_tags - missing_tag_tnt,
            100.0 * (total_tags - missing_tag_tnt) / total_tags))
        print("Correct tags: {0:8} {1:6.2f}%".format(
            correct_tag_tnt, 100.0 * correct_tag_tnt / total_tags))
        print("Partial tags: {0:8} {1:6.2f}%".format(
            partial_tag_tnt + correct_tag_tnt,
            100.0 * (partial_tag_tnt + correct_tag_tnt) / total_tags))
        print("Partial prec: {0:8} {1:6.2f}%".format(
            "", 100.0 * (partial_tag_tnt + correct_tag_tnt) /
            (total_tags - missing_tag_tnt)))
        print("Precision:    {0:8} {1:6.2f}%".format(
            "", 100.0 * correct_tag_tnt / (total_tags - missing_tag_tnt)))
        print("\n-----------------------------------\n")
Code example #4
0
 def __init__(self):
     """ Set up the underlying n-gram tagger instance """
     tagger = NgramTagger()
     self._ngram_tagger = tagger
Code example #5
0
File: tnttagger.py — Project: vthorsteinsson/Reynir
 def __init__(self):
     """ Set up the underlying n-gram tagger instance """
     tagger = NgramTagger()
     self._ngram_tagger = tagger
Code example #6
0
File: tagtest.py — Project: vthorsteinsson/Reynir
def test_tagger():
    """ Evaluate the n-gram tagger and the TnT tagger against the IFD
        test corpus, printing a per-word grade for each tagger and
        summary accuracy statistics at the end """

    print("Initializing tagger")

    # Number of training and test sentences
    TRAINING_SET = 500
    IFD_TRAINING_SET = 21000 # There are only about 20.800 sentences in the IFD corpus
    TEST_SET = 400
    BEAM_SIZE = 250 # A higher number does not seem to yield improved results

    # Training branch, deliberately disabled: flip to True to train
    # fresh models and store the TnT model to disk
    # noinspection PyUnreachableCode
    if False:
        tnt_tagger = TnT(N = BEAM_SIZE, C = True)
        tagger = NgramTagger(n = 3, verbose = False)
        # Create a new model and store it
        with timeit("Train NgramTagger"):
            # Get a sentence stream from parsed articles
            # Number of sentences, size of training set
            sentence_stream = Article.sentence_stream(limit = TRAINING_SET, skip = TEST_SET)
            tagger.train(sentence_stream)
        with timeit("Train TnT_Tagger on articles"):
            # Get a sentence stream from parsed articles
            # Number of sentences, size of training set
            sentence_stream = Article.sentence_stream(limit = TRAINING_SET, skip = TEST_SET)
            word_tag_stream = IFD_Tagset.word_tag_stream(sentence_stream)
            tnt_tagger.train(word_tag_stream)
        with timeit("Train TnT_Tagger on IFD"):
            # Get a sentence stream from parsed articles
            # Number of sentences, size of training set
            word_tag_stream = IFD_Corpus().word_tag_stream(limit = IFD_TRAINING_SET, skip = TEST_SET)
            tnt_tagger.train(word_tag_stream)
        with timeit("Store TnT model"):
            tnt_tagger.store(_TNT_MODEL_FILE)
    else:
        # No n-gram tagger in this branch; test_ifd_file() handles tagger is None
        tagger = None
        # Load an existing model
        with timeit("load_model()"):
            tnt_tagger = TnT.load(_TNT_MODEL_FILE)
            if tnt_tagger is None:
                print(f"Unable to load TnT model from {_TNT_MODEL_FILE}, test aborted")
                return
    #tagger.show_model()
    #return

    # Evaluation counters, updated via nonlocal from test_ifd_file()
    total_tags = 0
    correct_tag = 0
    partial_tag = 0
    missing_tag = 0
    correct_tag_tnt = 0
    partial_tag_tnt = 0
    missing_tag_tnt = 0


    def simple_test(session):
        # Tag a single hard-wired sentence and print the result
        txt = "Þau segja að börn hafi gott af því."
        toklist = tokenize(txt, enclosing_session = session)
        dlist = tagger.tag(toklist)
        print("Sentence: '{0}'".format(txt))
        print("Tagging result:\n{0}".format("\n".join(str(d) for d in dlist)))


    def article_test(session):
        # Tag sentences streamed from parsed articles and print the results
        sentence_stream = Article.sentence_stream(limit = TEST_SET)
        for sent in sentence_stream:
            txt = " ".join(t["x"] for t in sent if "x" in t)
            if txt:
                toklist = tokenize(txt, enclosing_session = session)
                dlist = tagger.tag(toklist)
                print("Sentence: '{0}'".format(txt))
                print("Tagging result:\n{0}".format("\n".join(str(d) for d in dlist)))


    def test_ifd_file(session):
        """ Run both taggers over the IFD test set, grade every word
            and accumulate the enclosing evaluation counters """
        print("\n\n*** IFD TEST SET ***\n\n")
        gen = IFD_Corpus().raw_sentence_stream(limit = TEST_SET)
        dlist = None
        for sent in gen:
            # Each sentence is a sequence of (word, tag, lemma) triples
            orðalisti = [ triple[0] for triple in sent ]  # Word forms
            mörk_OTB = [ triple[1] for triple in sent ]  # Gold tags
            lemmur_OTB = [ triple[2] for triple in sent ]  # Gold lemmas
            txt = " ".join(orðalisti)
            if tagger is not None:
                toklist = tokenize(txt, enclosing_session = session)
                dlist = tagger.tag(toklist)
            tntlist = tnt_tagger.tag(orðalisti)
            ix = 0
            print("\n{0}\n".format(txt))
            for tag, lemma, word, tnt_wt in zip(mörk_OTB, lemmur_OTB, orðalisti, tntlist):
                tnt_tag = tnt_wt[1]
                j = ix
                if dlist is None:
                    # No n-gram tagger output available
                    gtag = "?"
                else:
                    # Advance through the tagger output until the current
                    # corpus word is found (tokenization may differ)
                    while j < len(dlist) and dlist[j].get("x", "") != word:
                        j += 1
                    if j < len(dlist):
                        ix = j
                        gtag = dlist[ix].get("i", "?")
                        if gtag == "?" and dlist[ix].get("k") == TOK.PUNCTUATION:
                            # Punctuation is "tagged" with the token itself
                            gtag = word
                        ix += 1
                    else:
                        gtag = "?"

                def grade(gtag):
                    # "M": missing, " ": correct, "P": partial
                    # (first letter of the tag matches), "E": error
                    if gtag == "?" and tag != "?":
                        return "M"
                    if gtag == tag:
                        return " "
                    if gtag[0] == tag[0]:
                        return "P"
                    return "E"

                grade_g = grade(gtag)
                grade_tnt = grade(tnt_tag)

                # word | lemma | gold tag | ngram tag | grade | tnt tag | grade
                print("{0:20} | {1:20} | {2:8} | {3:8} | {4} | {5:8} | {6}"
                    .format(word, lemma or word, tag, gtag, grade(gtag), tnt_tag, grade(tnt_tag)))
                # Update the counters in the enclosing function scope
                nonlocal total_tags, missing_tag, correct_tag, partial_tag
                nonlocal missing_tag_tnt, correct_tag_tnt, partial_tag_tnt
                total_tags += 1
                if grade_g == "M":
                    missing_tag += 1
                elif grade_g == " ":
                    correct_tag += 1
                elif grade_g == "P":
                    partial_tag += 1
                if grade_tnt == "M":
                    missing_tag_tnt += 1
                elif grade_tnt == " ":
                    correct_tag_tnt += 1
                elif grade_tnt == "P":
                    partial_tag_tnt += 1

    with SessionContext(read_only = True, commit = True) as session:

        #simple_test(session)

        #article_test(session)

        test_ifd_file(session)

    if total_tags:
        # Print summary accuracy statistics for both taggers
        print("\n-----------------------------------\n")
        print("Total tags:   {0:8}".format(total_tags))
        print("\nNgram tagger:\n")
        print("Missing tags: {0:8} {1:6.2f}%"
            .format(missing_tag, 100.0 * missing_tag / total_tags))
        print("Tagged:       {0:8} {1:6.2f}%"
            .format(total_tags - missing_tag, 100.0 * (total_tags - missing_tag) / total_tags))
        print("Correct tags: {0:8} {1:6.2f}%"
            .format(correct_tag, 100.0 * correct_tag / total_tags))
        print("Partial tags: {0:8} {1:6.2f}%"
            .format(partial_tag + correct_tag, 100.0 * (partial_tag + correct_tag) / total_tags))
        print("Partial prec: {0:8} {1:6.2f}%"
            .format("", 100.0 * (partial_tag + correct_tag) / (total_tags - missing_tag)))
        print("Precision:    {0:8} {1:6.2f}%"
            .format("", 100.0 * correct_tag / (total_tags - missing_tag)))
        print("\nTnT tagger:\n")
        print("Missing tags: {0:8} {1:6.2f}%"
            .format(missing_tag_tnt, 100.0 * missing_tag_tnt / total_tags))
        print("Tagged:       {0:8} {1:6.2f}%"
            .format(total_tags - missing_tag_tnt, 100.0 * (total_tags - missing_tag_tnt) / total_tags))
        print("Correct tags: {0:8} {1:6.2f}%"
            .format(correct_tag_tnt, 100.0 * correct_tag_tnt / total_tags))
        print("Partial tags: {0:8} {1:6.2f}%"
            .format(partial_tag_tnt + correct_tag_tnt, 100.0 * (partial_tag_tnt + correct_tag_tnt) / total_tags))
        print("Partial prec: {0:8} {1:6.2f}%"
            .format("", 100.0 * (partial_tag_tnt + correct_tag_tnt) / (total_tags - missing_tag_tnt)))
        print("Precision:    {0:8} {1:6.2f}%"
            .format("", 100.0 * correct_tag_tnt / (total_tags - missing_tag_tnt)))
        print("\n-----------------------------------\n")