Example #1
    def process_test_sentence(sent):
        """ 
        Helper function that processes a test sentence indivivdually
        """

        # create a transducer for the test sentence
        # it transduces from word -> word
        with open("w/test_sent_fsa.txt", "w") as test_file:
            i = 0

            for w, _ in sent:  # write only the word; the gold tag is kept aside for scoring
                test_file.write(f"{i}\t{i+1}\t{w}\t{w}\n")
                i += 1

            test_file.write(f"{i}")  # mark the last state as final

        # compile the FST for the test sentence, compose it with the model under
        # test, and save the shortest-path predictions to a file
        call(
            "fstcompile --isymbols=w/lex.syms --osymbols=w/lex.syms "
            "--keep_osymbols --keep_isymbols w/test_sent_fsa.txt | "
            f"fstcompose - w/{kind}_wfst_ngrm.fsa | "
            "fstrmepsilon | fstshortestpath | fsttopsort | "
            "fstprint - > w/prediction_on_sent.txt")

        # read the file back and extract the (word, tag) predictions
        preds = pandas.read_csv("w/prediction_on_sent.txt",
                                delimiter="\t",
                                header=None)

        # keep only the input/output label columns (word, tag), dropping the
        # state-id and weight columns and the trailing final-state row
        preds = preds[:-1][[2, 3]].to_numpy()
        preds = [[w, re.sub(r"__.*", "", t)]
                 for w, t in preds]  # collapse the O__word tags to plain O

        return preds  # return [word, predicted tag] pairs
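For orientation, a hedged usage sketch. The sentence and the tags `O` and `B-loc` are made up for illustration; the helper expects (word, gold_tag) pairs and uses only the words:

# Hypothetical held-out sentence; gold tags are kept for scoring only.
predictions = process_test_sentence([("flights", "O"), ("boston", "B-loc")])
# w/test_sent_fsa.txt then contains one identity arc per word plus a final state:
#   0	1	flights	flights
#   1	2	boston	boston
#   2
# and `predictions` holds the [word, predicted_tag] pairs read back from the
# shortest path through the composed model.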
Example #2
def process_preds_to_score(score_file_name):
    """
    Passes the `w/pred_coneval.txt` file through the conlleval script and
    saves the resulting scores to the `scores/<score_file_name>` file
    """
    call(
        f"cat w/pred_coneval.txt | ../P1_data/scripts/conlleval.pl > scores/{score_file_name}"
    )
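A usage sketch; the file name below is illustrative and follows the naming scheme used by the driver script in Example #6:

# e.g. after scores.process_test_sentences(...) has presumably written
# w/pred_coneval.txt for the current model variant
process_preds_to_score("iob_method-kneser_ney_order-3.txt")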
Example #3
def create_iob_ngram_model(method="kneser_ney", order="3"):
    """
    Creates the transducer + n-gram model, considering only words and plain IOB tags
    """

    call(
        "farcompilestrings --symbols=w/lex.syms --keep_symbols "
        "--unknown_symbol='<unk>' w/iob_ngram_file.txt | "
        f"ngramcount --order={order} --require_symbols=false - | "
        f"ngrammake --method={method} - | "
        # now compose the word -> IOB tag transducer with the n-gram model
        "fstcompose w/iob_tagger_trans.fsa - > w/iob_wfst_ngrm.fsa")
Example #4
def create_baseline_model():
    """
    Baseline model is just the word -> IOB tag transducer + unigram model with no smoothing
    """
    call(
        "farcompilestrings --symbols=w/lex.syms --keep_symbols "
        "--unknown_symbol='<unk>' w/iob_ngram_file.txt | "
        "ngramcount --order=1 --require_symbols=false - | "  # unigram counts
        "ngrammake --method=unsmoothed - | "  # with no smoothing
        # compose the word -> IOB tag transducer with the unigram model
        "fstcompose w/iob_tagger_trans.fsa - > w/baseline_wfst_ngrm.fsa")
Example #5
def create_transducer_from_data(all_training_pairs, name):
    """
    `all_training_pairs` - a list of pairs, where each pair is [word, tag]
    `name` - the name to give the transducer

    The transducer is written to a file and then compiled with fstcompile
    """

    # first count the occurrences of each word given its tag
    cfd = nltk.ConditionalFreqDist(
        reversed(pair)  # reversed to (tag, word): the condition is the tag
        for pair in all_training_pairs)
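    # e.g. cfd["B-loc"]["boston"] is how many times "boston" was tagged B-loc
    # in the training data (the tag and word here are illustrative)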

    with open(f"w/{name}.txt", "w") as tagger_trans:

        for word, tag in set(map(tuple, all_training_pairs)):  # unique pairs; tuples so they hash

            # calculate probability of word given the tag
            # probability(word | tag) =
            #          count(word, tag) / count(tag)

            freqs = cfd[tag]

            val = freqs[word]  # count(word, tag)
            total_w = sum(freqs.values())  # count(tag)

            # negative log probability: arc weights are costs, so lower is better
            probab = -math.log(val / total_w)

            # write transition rule to file
            tagger_trans.write(f"0\t0\t{word}\t{tag}\t{probab}\n")

        # Now handle the probability of an unknown word:
        # <unk> can be tagged with any tag with equal probability, so use the
        # negative log of a uniform distribution over the tags as the weight
        unkprob = -math.log(1 / len(cfd.keys()))

        for tag in cfd.keys():
            tagger_trans.write(f"0\t0\t<unk>\t{tag}\t{unkprob}\n")

        tagger_trans.write("0")  # finally, mark state 0 as final

    # finally we compile the file we just generated into a transducer
    call(
        "fstcompile --isymbols=w/lex.syms --osymbols=w/lex.syms "
        f"--keep_osymbols --keep_isymbols w/{name}.txt | "
        f"fstarcsort > w/{name}.fsa")
Example #6
        continue

    print(
        f"Processing:\t version: {kind} \t method: {method} \t order: {count}")


    if kind == "iob":  # create model with smoothing and ngram order
        model.create_iob_ngram_model(method, count)

    elif kind == "iob_and_w":
        model.create_iob_and_wrds_ngram_model(method, count)

    # process all test sentences with the created model
    scores.process_test_sentences(test_set, kind)

    # score how well the model predicted the test set and save the
    # results to a file
    scores.process_preds_to_score(f"{kind}_method-{method}_order-{count}.txt") 
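The fragment above is a loop body; the loop header itself is not part of the excerpt. A plausible reconstruction, purely for orientation (the iterable and the skip condition are assumptions; only the names `kind`, `method`, and `count` come from the fragment):

# Hypothetical loop header for the fragment above; the real one is not shown.
import itertools, os

for kind, method, count in itertools.product(
        ("iob", "iob_and_w"), ("kneser_ney", "witten_bell"), ("1", "2", "3")):
    # presumably skip combinations whose score file already exists
    if os.path.exists(f"scores/{kind}_method-{method}_order-{count}.txt"):
        continue
    ...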


# finally, if baseline scores haven't been calculated yet, we calculate them
if not os.path.exists("scores/baseline.txt"):
    print("Creating baseline model and processing test set with it")
    model.create_baseline_model()
    scores.process_test_sentences(test_set, "baseline")
    scores.process_preds_to_score("baseline.txt")


# finally, run the notebook to generate the graphics needed for the report
call("jupyter nbconvert --execute Graphics.ipynb")