Example #1
0
def decode(sents, goldtagseqs, postagseqs, info, weights) : #estfile, weightsfile):
    labelset = ['B', 'I', 'O', '*']

    tp_bi = 0
    tp_o = 0
    acc = 0.0
    tot = 0
    tot_rec_bi = 0
    tot_rec_o = 0
    tot_prec_bi = 0
    tot_prec_o = 0
    
    sys.stderr.write("total test sentences = " + str(len(sents)) + "\n")
    for i in range(len(sents)):
        sys.stderr.write(str(i) + "\r")
	sent = sents[i]
        postags = postagseqs[i]

	tags = viterbi.execute(sent, labelset, postags, weights, info)
        for j in range(len(tags)):
            if tags[j] == goldtagseqs[i][j]:
                acc += 1
            if goldtagseqs[i][j] in ('B','I') and tags[j] in ('B','I'):
                tp_bi += 1
            elif goldtagseqs[i][j] == "O" and tags[j] == "O":
                tp_o += 1

            if goldtagseqs[i][j] in ('B', 'I'):
                tot_rec_bi += 1
            else:
                tot_rec_o += 1
            if tags[j] in ('B', 'I'):
                tot_prec_bi += 1
            else:
                tot_prec_o += 1
            print sent[j]+"\t"+postags[j]+"\t"+goldtagseqs[i][j]+"\t"+tags[j]
        print
        
        tot += len(tags)
    sys.stderr.write("accuracy     = "    + str(acc/tot) + "\n")
    sys.stderr.write("BI recall    = " + str(tp_bi/tot_rec_bi) + "\n")
    if tot_prec_bi > 0:
        sys.stderr.write("BI precision = " + str(tp_bi/tot_prec_bi) + "\n")
    sys.stderr.write("O recall     = "     + str(tp_o/tot_rec_o) + "\n")
    if tot_prec_o > 0:
        sys.stderr.write("O precision  = "  + str(tp_o/tot_prec_o) + "\n\n")
Example #2
0
def run(sentset, labelset, postagseqs, vecs1, vecs2, num_iter, all_feats):
    weights = init(all_feats)
    weights_avg = init(all_feats)

    for i in range(num_iter):
        sys.stderr.write(str(i)+"\r")
        for j in range(len(sentset)):
            sent = sentset[j]
            labelseq = labelset[j]
            postagseq = postagseqs[j]
            vec1 = vecs1[j]
            vec2 = vecs2[j]
            predseq, f = execute(sent, all_labels, postagseq, vec1, vec2, weights)
            if labelseq != predseq:
                update(weights, predseq, labelseq, postagseq, vec1, vec2, sent)
                add_weights(weights_avg, weights)
    for f in weights_avg.iterkeys():
        weights_avg[f] /= num_iter*len(sentset)
        print f, weights_avg[f]
    return weights_avg
Example #3
0
def run(sentset, labelset, postagseqs, vecs1, vecs2, num_iter, all_feats):
    weights = init(all_feats)
    weights_avg = init(all_feats)

    for i in range(num_iter):
        sys.stderr.write(str(i) + "\r")
        for j in range(len(sentset)):
            sent = sentset[j]
            labelseq = labelset[j]
            postagseq = postagseqs[j]
            vec1 = vecs1[j]
            vec2 = vecs2[j]
            predseq, f = execute(sent, all_labels, postagseq, vec1, vec2,
                                 weights)
            if labelseq != predseq:
                update(weights, predseq, labelseq, postagseq, vec1, vec2, sent)
                add_weights(weights_avg, weights)
    for f in weights_avg.iterkeys():
        weights_avg[f] /= num_iter * len(sentset)
        print f, weights_avg[f]
    return weights_avg
Example #4
0
def main(testfile, featsfile):
    """Tag a tab-separated test file with Viterbi and print the result.

    ``testfile`` is read as UTF-8 with one token per line and sentences
    separated by blank lines.  Columns 1, 3, 10 and 11 (0-based) supply the
    word, POS tag and two extra per-token values.  ``featsfile`` holds one
    ``feature weight`` pair per line, separated by a single space.  Each
    token row is emitted through ``print_line_withmodification`` together
    with its predicted tag, and an empty line follows every sentence.

    Args:
        testfile: path of the test data file.
        featsfile: path of the feature-weights file.
    """
    labelset = ['0', '1', '*']

    # One tuple per sentence: (words, tags, postags, vec1, vec2, raw rows).
    sentences = []
    sent, tags, postags, vec1, vec2, content = [], [], [], [], [], []
    with codecs.open(testfile, 'r', 'utf-8') as test:
        for line in test:
            line = line.strip()
            if line == "":
                sentences.append((sent, tags, postags, vec1, vec2, content))
                sent, tags, postags, vec1, vec2, content = \
                    [], [], [], [], [], []
                continue
            cline = line.split("\t")
            sent.append(cline[1].strip())
            # No gold tag is read from the file; every token gets the
            # placeholder tag '1'.
            tags.append('1')
            postags.append(cline[3].strip())
            vec1.append(cline[10].strip())
            vec2.append(cline[11].strip())
            content.append(cline)
    if sent:
        # The file did not end with a blank line: keep the last sentence
        # instead of silently dropping it.
        sentences.append((sent, tags, postags, vec1, vec2, content))

    weights = {}
    with open(featsfile, 'r') as feats:
        for line in feats:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing one).
                continue
            f, wt = line.split(' ')
            weights[f] = float(wt)

    # Tallied against the placeholder tags above; nothing in this function
    # reports these two values.
    acc = 0.0
    tot = 0
    for sent, gold, postags, vec1, vec2, content in sentences:
        tags, _ = viterbi.execute(sent, labelset, postags, vec1, vec2, weights)
        for j, predicted in enumerate(tags):
            print_line_withmodification(content[j], predicted)
            if predicted == gold[j]:
                acc += 1
        print
        tot += len(tags)
Example #5
0
def main(testfile, featsfile):
    """Read test sentences and feature weights, then tag and print each row.

    The test file is UTF-8, one token per line, with tab-separated columns
    and a blank line between sentences; columns 1, 3, 10 and 11 (0-based)
    are used.  The weights file has one ``feature weight`` pair per line
    separated by a single space.  Every sentence is decoded with
    ``viterbi.execute`` and each original row is passed, with its predicted
    tag, to ``print_line_withmodification``; an empty line is printed after
    each sentence.

    Args:
        testfile: path of the test data file.
        featsfile: path of the feature-weights file.
    """
    labelset = ['0', '1', '*']

    # Parallel per-sentence lists; contents keeps the raw split rows.
    sents = []
    tagseqs = []
    postagseqs = []
    vecs1 = []
    vecs2 = []
    contents = []

    # Buffers for the sentence currently being read.
    sent = []
    tags = []
    postags = []
    vec1 = []
    vec2 = []
    content = []

    with codecs.open(testfile, 'r', 'utf-8') as test:
        for raw in test:
            line = raw.strip()
            if line == "":
                # Blank line: the current sentence is complete.
                sents.append(sent)
                tagseqs.append(tags)
                postagseqs.append(postags)
                vecs1.append(vec1)
                vecs2.append(vec2)
                contents.append(content)

                sent = []
                tags = []
                postags = []
                vec1 = []
                vec2 = []
                content = []
                continue
            cline = line.split("\t")

            sent.append(cline[1].strip())
            # The gold tag is not taken from the file; '1' is a placeholder.
            tags.append('1')
            postags.append(cline[3].strip())
            vec1.append(cline[10].strip())
            vec2.append(cline[11].strip())
            content.append(cline)

    if sent:
        # No trailing blank line in the file: store the pending sentence so
        # it is tagged like the others rather than lost.
        sents.append(sent)
        tagseqs.append(tags)
        postagseqs.append(postags)
        vecs1.append(vec1)
        vecs2.append(vec2)
        contents.append(content)

    weights = {}
    with open(featsfile, 'r') as feats:
        for raw in feats:
            line = raw.strip()
            if line == "":
                # Skip blank lines instead of failing on the unpack below.
                continue
            f, wt = line.split(' ')
            weights[f] = float(wt)

    # Match counts against the placeholder tags; not reported in the code
    # visible here.
    acc = 0.0
    tot = 0
    for i in range(len(sents)):
        sent = sents[i]
        postags = postagseqs[i]
        vec1 = vecs1[i]
        vec2 = vecs2[i]
        tags, f = viterbi.execute(sent, labelset, postags, vec1, vec2, weights)
        for j in range(len(tags)):
            print_line_withmodification(contents[i][j], tags[j])
            if tags[j] == tagseqs[i][j]:
                acc += 1
        print
        tot += len(tags)