def getE(word, tag):
    """Return the score ``e[word][tag] / q[tag]`` for a word/tag pair.

    Results are memoised in ``e_cache`` under the key ``"<word> <tag>"``
    (built from the word as passed in, before any substitution).

    A word that is not a key of ``e`` is replaced by
    ``General.assignWrodClass(word)`` before the lookup.  When ``tag`` is
    not in ``q`` the divisor is 100000; when ``tag`` is not recorded for
    the word the numerator is 0.5, so unseen pairs still get a low,
    non-zero score.
    """
    cache_key = " ".join((word, tag))
    if cache_key in e_cache:
        return e_cache[cache_key]

    # Word not seen in training: look it up under its word class instead.
    lookup = word if word in e else General.assignWrodClass(word)

    # Divisor is the count stored for the tag, or a large stand-in value.
    denominator = q[tag] if tag in q else 100000

    # A tag never seen with this word contributes a pseudo-count of 0.5.
    tag_counts = e[lookup]
    numerator = tag_counts[tag] if tag in tag_counts else 0.5

    score = numerator / denominator
    e_cache[cache_key] = score
    return score
def prun(word):
    """Return the list of candidate tags for ``word`` based on its surface form.

    The checks run in a fixed order and the first one that matches wins:
    ``ing`` suffix, contains a hyphen, all upper-case (length > 1), more
    than half of the characters are digits, leading capital, then the
    suffixes ``able``, ``ly``, ``ers`` and ``tion``/``ist``/``ty``.  When
    nothing matches, the tags stored in ``e`` for the word's class
    (``General.assignWrodClass(word)``) are returned.
    """
    if word.endswith('ing'):
        return ["VBG"]
    if "-" in word:
        return ["JJ"]
    if len(word) > 1 and word.isupper():
        return ["NNP"]

    # Strictly more than half of the characters must be digits.
    digit_count = sum(1 for ch in word if ch.isdigit())
    if digit_count > len(word) / 2.0:
        return ["CD"]

    # word[:1] is '' for an empty word, and ''.isupper() is False.
    if word[:1].isupper():
        return ["NNP", "NN", "NNS"]

    # Remaining suffix rules, tried in order.
    suffix_rules = (
        (('able',), "JJ"),
        (('ly',), "RB"),
        (('ers',), "NNS"),
        (('tion', 'ist', 'ty'), "NN"),
    )
    for suffixes, tag_name in suffix_rules:
        if word.endswith(suffixes):
            return [tag_name]

    # No pattern matched: fall back to every tag recorded for the word class.
    word_class = General.assignWrodClass(word)
    return list(e[word_class].keys())
Example #3
0
        e[lst[line_i][i][0]][lst[line_i][i][1]] += 1
        
        #fill q data
        for j in range(3):
            triple = ' '.join(map(lambda n: 'SS' if n < 0 else lst[line_i][n][1], range(i - j, i + 1)))
                #triple = ' '.join(map(lambda n: 'SS' if n < 0 else line[n][1], range(i - j, i + 1)))
            if triple not in q:
                q[triple] = 0
            q[triple] += 1
            
# Fold the counts of e into e_lf, pooling low-frequency entries by word class.
for word, tag_counts in e.items():
    for tag, count in tag_counts.items():
        # Below the cutoff the count is filed under the word's pattern class;
        # otherwise it stays under the word itself.
        if count < low_freq_cutoff:
            bucket_key = General.assignWrodClass(word)
        else:
            bucket_key = word

        # Create the per-key tag table on first use, then accumulate.
        bucket = e_lf.setdefault(bucket_key, {})
        bucket[tag] = bucket.get(tag, 0) + count

# Format e_lf for the output file: one "<key> <tag>\t<count>" string per entry.
for bucket_key, tag_counts in e_lf.items():
    for tag, count in tag_counts.items():
        output_e.append(" ".join((bucket_key, tag)) + "\t" + str(count))
Example #4
0
def getE(word):
    """Return ``e[word]``, or the entry for the word's class when ``word`` is not a key of ``e``."""
    key = word if word in e else General.assignWrodClass(word)
    return e[key]
    for t in tags:
        V[0][t] = {}
        for r in tags:
            V[0][t][r] = 0

    V[0]["SS"]["SS"] = 1
    bp = [{} for w in row] + [{}]
    tags_p2 = ["SS"]
    tags_p = ["SS"]

    for i in range(n + 1):
        word = row[i][0]

        if word not in e:
            tags_curr = prun(word)
            word = General.assignWrodClass(word)
        else:
            tags_curr = list(e[word].keys())

        V[i + 1] = {}
        bp[i + 1] = {}
        for t in tags_p:
            V[i + 1][t] = {}
            bp[i + 1][t] = {}
            for r in tags_curr:
                l = {}
                for tT in tags_p2:
                    l[tT] = (V[i][tT][t]) * getScore(word, r, tT, t)

                V[i + 1][t][r] = max(list(l.values()))
                bp[i + 1][t][r] = General.argmax(l)