def getE(word, tag):
    """Return the score of ``tag`` for ``word``, memoised in ``e_cache``.

    The score is ``e[word][tag] / q[tag]`` (presumably an emission
    probability estimate -- confirm against the code that fills ``e``/``q``).

    Args:
        word: Token to score. When it is not a key of ``e`` it is replaced
            by ``General.assignWrodClass(word)`` before the lookup.
        tag: Tag to score the token against.

    Returns:
        The cached value when ``"<word> <tag>"`` is already in ``e_cache``;
        otherwise the freshly computed score, which is stored in ``e_cache``
        under that key before being returned.
    """
    # The cache key is built from the original word, before any back-off.
    cache_key = word + " " + tag
    if cache_key in e_cache:
        return e_cache[cache_key]

    # Word not seen in training: fall back to its word class.
    if word not in e:
        word = General.assignWrodClass(word)

    # 100000 is the denominator used when the tag has no count in q.
    tag_amt = q.get(tag, 100000)

    # 0.5 stands in for the count when the tag was never seen with this word,
    # which yields a low score. The outer .get() extends the same treatment
    # to a word class that is itself missing from e; that case previously
    # raised KeyError.
    count = e.get(word, {}).get(tag, 0.5)

    score = count / tag_amt
    e_cache[cache_key] = score
    return score
def prun(word):
    """Return the candidate tags for ``word`` based on its surface form.

    The rules are tried in order and the first match wins:

    1. ends with ``ing``                        -> ``["VBG"]``
    2. contains a hyphen                        -> ``["JJ"]``
    3. longer than one character, all uppercase -> ``["NNP"]``
    4. more than half the characters are digits -> ``["CD"]``
    5. starts with an uppercase letter          -> ``["NNP", "NN", "NNS"]``
    6. ends with ``able``                       -> ``["JJ"]``
    7. ends with ``ly``                         -> ``["RB"]``
    8. ends with ``ers``                        -> ``["NNS"]``
    9. ends with ``tion``, ``ist`` or ``ty``    -> ``["NN"]``

    When no rule matches, the word is mapped through
    ``General.assignWrodClass`` and the keys of ``e`` for that class are
    returned.

    Args:
        word: The token to inspect.

    Returns:
        A list of tag strings.
    """
    if word.endswith('ing'):
        return ["VBG"]
    # A plain substring test; no regular expression is needed for a literal.
    if "-" in word:
        return ["JJ"]
    if len(word) > 1 and word.isupper():
        return ["NNP"]
    # Divide by 2.0 so the threshold stays fractional on any interpreter.
    if sum(c.isdigit() for c in word) > len(word) / 2.0:
        return ["CD"]
    if word and word[0].isupper():
        return ["NNP", "NN", "NNS"]
    if word.endswith('able'):
        return ["JJ"]
    if word.endswith('ly'):
        return ["RB"]
    if word.endswith('ers'):
        return ["NNS"]
    if word.endswith(('tion', 'ist', 'ty')):
        return ["NN"]

    # No surface rule applied: use the tags recorded for the word's class.
    word_class = General.assignWrodClass(word)
    return list(e[word_class])
# NOTE(review): this statement run has lost its line breaks and indentation,
# and the loops that bind `line_i` and `i` start outside the visible text, so
# the intended nesting must be confirmed before reformatting.
# Reading the statements in order, the code appears to:
#   1. add 1 to the count e[x][y], where x = lst[line_i][i][0] and
#      y = lst[line_i][i][1] (presumably a word and its tag -- confirm);
#   2. for j in 0..2, join the [1] fields of positions i-j..i with spaces
#      (using 'SS' for positions before the start) and count that string in q;
#   3. fold e into e_lf: an entry whose count e[k][t] is below low_freq_cutoff
#      is re-keyed by General.assignWrodClass(k), otherwise it keeps its key k,
#      and counts that land on the same (key, t) pair are summed;
#   4. append one "<key> <t>\t<count>" string per e_lf entry to output_e.
e[lst[line_i][i][0]][lst[line_i][i][1]] += 1 #fill q data for j in range(3): triple = ' '.join(map(lambda n: 'SS' if n < 0 else lst[line_i][n][1], range(i - j, i + 1))) #triple = ' '.join(map(lambda n: 'SS' if n < 0 else line[n][1], range(i - j, i + 1))) if triple not in q: q[triple] = 0 q[triple] += 1 #change low frequency data of e for k in e: for t in e[k]: #assign word class from pattern if less than frequency cuttoff if e[k][t] < low_freq_cutoff: word_class = General.assignWrodClass(k) else: word_class = k if word_class not in e_lf: e_lf[word_class] = {} if t not in e_lf[word_class]: e_lf[word_class][t] =e[k][t] else: e_lf[word_class][t] += e[k][t] #prepare format of e for output file for k in e_lf: for t in e_lf[k]: output_e.append(k + " " + t + "\t" + str(e_lf[k][t]))
def getE(word):
    """Return the ``e`` entry for ``word``.

    A word that is not a key of ``e`` is first mapped to its class with
    ``General.assignWrodClass`` and the entry for that class is returned.
    """
    # Words missing from e are looked up by their word class instead.
    lookup_key = word if word in e else General.assignWrodClass(word)
    return e[lookup_key]
# NOTE(review): line breaks and indentation were lost here, and the enclosing
# definition (which binds V, tags, row and n) is outside the visible text.
# This looks like the forward pass of a trigram Viterbi-style decoder --
# confirm. In statement order the code:
#   - sets V[0][t][r] = 0 for every pair of tags, then V[0]["SS"]["SS"] = 1;
#   - builds bp with one empty dict per element of row plus one extra;
#   - starts tags_p2 and tags_p as ["SS"];
#   - for each i in range(n + 1): takes word = row[i][0]; if the word is not a
#     key of e, the candidate tags come from prun(word) and the word is
#     replaced by General.assignWrodClass(word), otherwise the candidates are
#     the keys of e[word];
#   - for each t in tags_p and each candidate r, scores every tT in tags_p2 as
#     V[i][tT][t] * getScore(word, r, tT, t), stores the maximum in
#     V[i + 1][t][r] and General.argmax of the scores in bp[i + 1][t][r].
# tags_p and tags_p2 are not reassigned in the visible statements; presumably
# they are advanced after this point -- TODO confirm.
for t in tags: V[0][t] = {} for r in tags: V[0][t][r] = 0 V[0]["SS"]["SS"] = 1 bp = [{} for w in row] + [{}] tags_p2 = ["SS"] tags_p = ["SS"] for i in range(n + 1): word = row[i][0] if word not in e: tags_curr = prun(word) word = General.assignWrodClass(word) else: tags_curr = list(e[word].keys()) V[i + 1] = {} bp[i + 1] = {} for t in tags_p: V[i + 1][t] = {} bp[i + 1][t] = {} for r in tags_curr: l = {} for tT in tags_p2: l[tT] = (V[i][tT][t]) * getScore(word, r, tT, t) V[i + 1][t][r] = max(list(l.values())) bp[i + 1][t][r] = General.argmax(l)