def process_test_sentence(sent):
    """
    Tag a single test sentence with the model currently under test.

    `sent` - iterable of (word, gold_tag) pairs; only the word is used here,
             the gold tag is kept by the caller for scoring.

    Returns a list of [word, predicted_tag] pairs, one per arc of the best
    path, as printed by `fstprint`.

    NOTE: `kind` is not a parameter of this function; it is resolved from the
    surrounding scope and selects the `w/{kind}_wfst_ngrm.fsa` model file.

    Side effects: overwrites `w/test_sent_fsa.txt` and
    `w/prediction_on_sent.txt`.
    """
    import csv  # local import: only needed for the QUOTE_NONE constant below

    # Write the sentence as a linear word -> word acceptor in OpenFst text
    # format: one arc per word, followed by the final state on its own line.
    final_state = 0
    with open("w/test_sent_fsa.txt", "w") as test_file:
        for final_state, (word, _) in enumerate(sent, start=1):
            test_file.write(
                f"{final_state - 1}\t{final_state}\t{word}\t{word}\n")
        test_file.write(f"{final_state}")

    # Compile the sentence acceptor, compose it with the model under test,
    # keep only the best path and dump that path as text.
    call(
        f"fstcompile --isymbols=w/lex.syms --osymbols=w/lex.syms --keep_osymbols --keep_isymbols w/test_sent_fsa.txt | "
        + f"fstcompose - w/{kind}_wfst_ngrm.fsa | fstrmepsilon | fstshortestpath | fsttopsort | "
        + f"fstprint - "
        + f" > w/prediction_on_sent.txt")

    # Read every field as a raw string: the default pandas parsing would turn
    # tokens such as "null" or "NA" into NaN, convert all-numeric columns to
    # numbers, and treat a double quote inside a token as a quoting character.
    table = pandas.read_csv(
        "w/prediction_on_sent.txt",
        delimiter="\t",
        header=None,
        dtype=str,
        keep_default_na=False,
        quoting=csv.QUOTE_NONE)

    # Drop the last row (the final-state line) and keep only the word and tag
    # columns (discarding state ids and weight). `.values` is used because
    # `DataFrame.get_values()` no longer exists in pandas >= 1.0.
    rows = table.iloc[:-1][[2, 3]].values

    # Strip the "__<word>" suffix so that tags like O__word become plain O.
    return [[word, re.sub(r"__.*", "", tag)] for word, tag in rows]
def process_preds_to_score(score_file_name):
    """
    Score the predictions stored in `w/pred_coneval.txt`.

    The file is piped through the `conlleval.pl` script and the script's
    output is redirected to `scores/<score_file_name>`.
    """
    evaluation_cmd = (
        "cat w/pred_coneval.txt | ../P1_data/scripts/conlleval.pl"
        f" > scores/{score_file_name}"
    )
    call(evaluation_cmd)
def create_iob_ngram_model(method="kneser_ney", order="3"):
    """
    Build the word/IOB-tag model: an n-gram model composed with the tagger.

    `method` - value passed to `ngrammake --method`
    `order`  - value passed to `ngramcount --order`

    The result is written to `w/iob_wfst_ngrm.fsa`.
    """
    stages = [
        # compile the training strings into a FAR archive
        "farcompilestrings --symbols=w/lex.syms --keep_symbols --unknown_symbol='<unk>' w/iob_ngram_file.txt",
        # count n-grams up to the requested order
        f"ngramcount --order={order} --require_symbols=false -",
        # turn the counts into a model using the requested method
        f"ngrammake --method={method} -",
        # compose the word -> tag transducer with the n-gram model
        "fstcompose w/iob_tagger_trans.fsa -",
    ]
    pipeline = " | ".join(stages)
    call(pipeline + "  > w/iob_wfst_ngrm.fsa")
def create_baseline_model():
    """
    Build the baseline: the word -> IOB tag transducer composed with an
    unsmoothed unigram model.

    The result is written to `w/baseline_wfst_ngrm.fsa`.
    """
    stages = [
        # compile the training strings into a FAR archive
        "farcompilestrings --symbols=w/lex.syms --keep_symbols --unknown_symbol='<unk>' w/iob_ngram_file.txt",
        # unigram counts only
        "ngramcount --order=1 --require_symbols=false -",
        # no smoothing
        "ngrammake --method=unsmoothed -",
        # compose the word -> tag transducer with the unigram model
        "fstcompose w/iob_tagger_trans.fsa -",
    ]
    pipeline = " | ".join(stages)
    call(pipeline + "  > w/baseline_wfst_ngrm.fsa")
def create_transducer_from_data(all_training_pairs, name):
    """
    Build a single-state word -> tag transducer from the training pairs.

    `all_training_pairs` - iterable of pairs, where each pair is [word, tag]
                           (lists or tuples are both accepted)
    `name` - name given to the transducer; the text description is written to
             `w/<name>.txt` and the compiled, arc-sorted transducer to
             `w/<name>.fsa`

    Every arc weight is a cost, i.e. -log(P(word | tag)). The `<unk>` word
    may be emitted for any tag with uniform probability 1 / number_of_tags.
    """
    # Materialise as tuples: the input is traversed twice, and tuples (unlike
    # lists) are hashable, which is needed to de-duplicate the pairs below.
    pairs = [tuple(pair) for pair in all_training_pairs]

    # Count occurrences of each word given the tag (the condition is the tag).
    cfd = nltk.ConditionalFreqDist((tag, word) for word, tag in pairs)
    tags = sorted(cfd)

    # count(tag), computed once per tag rather than once per (word, tag) pair
    tag_totals = {tag: sum(cfd[tag].values()) for tag in tags}

    with open(f"w/{name}.txt", "w") as tagger_trans:
        # Only unique pairs; sorted so that the generated file is deterministic.
        for word, tag in sorted(set(pairs)):
            # P(word | tag) = count(word, tag) / count(tag),
            # stored as a negative log because arc weights are costs
            weight = -math.log(cfd[tag][word] / tag_totals[tag])
            tagger_trans.write(f"0\t0\t{word}\t{tag}\t{weight}\n")

        # Unknown words: <unk> can carry any tag with equal probability.
        # The weight is written as -log(p), consistent with the arcs above
        # (previously the raw probability was written as if it were a cost).
        if tags:
            unk_weight = -math.log(1 / len(tags))
            for tag in tags:
                tagger_trans.write(f"0\t0\t<unk>\t{tag}\t{unk_weight}\n")

        # state 0 is the final state
        tagger_trans.write("0")

    # Compile the text description into a transducer and sort its arcs.
    call(
        f"fstcompile --isymbols=w/lex.syms --osymbols=w/lex.syms --keep_osymbols --keep_isymbols w/{name}.txt | "
        + f"fstarcsort > w/{name}.fsa")
continue print( f"Processing:\t version: {kind} \t method: {method} \t order: {count}") if kind == "iob": # create model with smoothing and ngram order model.create_iob_ngram_model(method, count) elif kind == "iob_and_w": model.create_iob_and_wrds_ngram_model(method, count) # process all test sentences with the created model scores.process_test_sentences(test_set, kind) # calculate how well the model made the predictions for the test set and save # results to a file scores.process_preds_to_score(f"{kind}_method-{method}_order-{count}.txt") # finally, if baseline scores haven't been calculated yet, we calculate them if not os.path.exists(f"scores/baseline.txt"): print("Creating baseline model and processing test set with it") model.create_baseline_model() scores.process_test_sentences(test_set, "baseline") scores.process_preds_to_score("baseline.txt") # finally run notebook to generate graphics needed for report call("jupyter nbconvert --execute Graphics.ipynb")