def _write_ratio(label, numerator, denominator, end="\n"):
    """Write ``label = numerator / denominator`` to stderr.

    Nothing is written when the denominator is zero, because the ratio is
    undefined in that case.
    """
    if denominator > 0:
        # float() forces true division; int / int truncates under Python 2.
        ratio = float(numerator) / denominator
        sys.stderr.write(label + " = " + str(ratio) + end)


def decode(sents, goldtagseqs, postagseqs, info, weights):
    """Tag every test sentence and report token-level B/I/O scores.

    The predicted tags of each sentence come from ``viterbi.execute``.  For
    every token one tab-separated line (token, POS tag, gold tag, predicted
    tag) is printed on stdout, and a blank line follows each sentence.
    Accuracy plus recall and precision for the merged B/I class and for the
    remaining tags are written to stderr; a score whose denominator is zero
    is not reported.

    Args:
        sents: list of sentences, each a sequence of token strings.
        goldtagseqs: gold tag sequences, parallel to ``sents``.
        postagseqs: POS tag sequences, parallel to ``sents``.
        info: passed through unchanged to ``viterbi.execute``.
        weights: passed through unchanged to ``viterbi.execute``.
    """
    labelset = ['B', 'I', 'O', '*']
    chunk_tags = ('B', 'I')

    correct = 0
    total = 0
    tp_bi = 0
    tp_o = 0
    gold_bi = 0
    gold_o = 0
    pred_bi = 0
    pred_o = 0

    sys.stderr.write("total test sentences = " + str(len(sents)) + "\n")
    for i, sent in enumerate(sents):
        sys.stderr.write(str(i) + "\r")
        postags = postagseqs[i]
        golds = goldtagseqs[i]
        tags = viterbi.execute(sent, labelset, postags, weights, info)

        for j, tag in enumerate(tags):
            gold = golds[j]
            if tag == gold:
                correct += 1

            # B and I are scored as one class, so B-vs-I confusion still
            # counts as a true positive for that class.
            if gold in chunk_tags and tag in chunk_tags:
                tp_bi += 1
            elif gold == "O" and tag == "O":
                tp_o += 1

            # Every tag outside B/I falls into the "O" denominators.
            if gold in chunk_tags:
                gold_bi += 1
            else:
                gold_o += 1
            if tag in chunk_tags:
                pred_bi += 1
            else:
                pred_o += 1

            print("\t".join((sent[j], postags[j], gold, tag)))
        # Python 2 bare print statement: blank line that ends the sentence.
        print
        total += len(tags)

    _write_ratio("accuracy", correct, total)
    _write_ratio("BI recall", tp_bi, gold_bi)
    _write_ratio("BI precision", tp_bi, pred_bi)
    _write_ratio("O recall", tp_o, gold_o)
    _write_ratio("O precision", tp_o, pred_o, end="\n\n")
def run(sentset, labelset, postagseqs, vecs1, vecs2, num_iter, all_feats):
    """Train for ``num_iter`` passes and return the averaged weights.

    Each sentence is decoded with ``execute`` under the current weights;
    ``update`` is called only when the prediction differs from the gold
    sequence.  After every sentence the current weights are accumulated into
    a running total with ``add_weights``.  The total is finally divided by
    the number of accumulated snapshots (``num_iter * len(sentset)``) and
    each ``feature weight`` pair is printed on stdout.

    Args:
        sentset: list of training sentences.
        labelset: gold label sequences, parallel to ``sentset``.
        postagseqs: POS tag sequences, parallel to ``sentset``.
        vecs1: first per-sentence vector input, parallel to ``sentset``.
        vecs2: second per-sentence vector input, parallel to ``sentset``.
        num_iter: number of passes over the training data.
        all_feats: feature inventory handed to ``init``.

    Returns:
        The accumulated weights from ``init``/``add_weights``, averaged in
        place.  When nothing was accumulated (no passes or no sentences) the
        values are returned exactly as ``init`` produced them.
    """
    weights = init(all_feats)
    weights_avg = init(all_feats)

    for epoch in range(num_iter):
        sys.stderr.write(str(epoch) + "\r")
        for j, sent in enumerate(sentset):
            gold_seq = labelset[j]
            postagseq = postagseqs[j]
            vec1 = vecs1[j]
            vec2 = vecs2[j]
            # The second value returned by execute is not needed here.
            predseq, _ = execute(sent, all_labels, postagseq, vec1, vec2, weights)
            if gold_seq != predseq:
                update(weights, predseq, gold_seq, postagseq, vec1, vec2, sent)
            add_weights(weights_avg, weights)

    num_snapshots = num_iter * len(sentset)
    for feat in weights_avg:
        if num_snapshots != 0:
            # A float divisor forces true division; an int divisor would
            # floor-divide int totals under Python 2.
            weights_avg[feat] /= float(num_snapshots)
        print("%s %s" % (feat, weights_avg[feat]))
    return weights_avg
def run(sentset, labelset, postagseqs, vecs1, vecs2, num_iter, all_feats):
    """Train for ``num_iter`` passes and return the averaged weights.

    Each sentence is decoded with ``execute`` under the current weights;
    ``update`` is called only when the prediction differs from the gold
    sequence.  After every sentence the current weights are accumulated into
    a running total with ``add_weights``.  The total is finally divided by
    the number of accumulated snapshots (``num_iter * len(sentset)``) and
    each ``feature weight`` pair is printed on stdout.

    Args:
        sentset: list of training sentences.
        labelset: gold label sequences, parallel to ``sentset``.
        postagseqs: POS tag sequences, parallel to ``sentset``.
        vecs1: first per-sentence vector input, parallel to ``sentset``.
        vecs2: second per-sentence vector input, parallel to ``sentset``.
        num_iter: number of passes over the training data.
        all_feats: feature inventory handed to ``init``.

    Returns:
        The accumulated weights from ``init``/``add_weights``, averaged in
        place.  When nothing was accumulated (no passes or no sentences) the
        values are returned exactly as ``init`` produced them.
    """
    weights = init(all_feats)
    weights_avg = init(all_feats)

    for epoch in range(num_iter):
        sys.stderr.write(str(epoch) + "\r")
        for j, sent in enumerate(sentset):
            gold_seq = labelset[j]
            postagseq = postagseqs[j]
            vec1 = vecs1[j]
            vec2 = vecs2[j]
            # The second value returned by execute is not needed here.
            predseq, _ = execute(sent, all_labels, postagseq, vec1, vec2, weights)
            if gold_seq != predseq:
                update(weights, predseq, gold_seq, postagseq, vec1, vec2, sent)
            add_weights(weights_avg, weights)

    num_snapshots = num_iter * len(sentset)
    for feat in weights_avg:
        if num_snapshots != 0:
            # A float divisor forces true division; an int divisor would
            # floor-divide int totals under Python 2.
            weights_avg[feat] /= float(num_snapshots)
        print("%s %s" % (feat, weights_avg[feat]))
    return weights_avg
def _read_test_sentences(testfile):
    """Parse the UTF-8, tab-separated test file into per-sentence columns.

    Returns a list of ``(words, postags, vec1, vec2, rows)`` tuples, one per
    sentence, where ``rows`` holds the full split line of every token.
    """
    sentences = []
    words, postags, vec1, vec2, rows = [], [], [], [], []
    with codecs.open(testfile, 'r', 'utf-8') as test:
        for raw in test:
            line = raw.strip()
            if line == "":
                # A blank line closes the current sentence.
                sentences.append((words, postags, vec1, vec2, rows))
                words, postags, vec1, vec2, rows = [], [], [], [], []
                continue
            cline = line.split("\t")
            word = cline[1].strip()
            pos = cline[3].strip()
            v1 = cline[10].strip()
            v2 = cline[11].strip()
            words.append(word)
            postags.append(pos)
            vec1.append(v1)
            vec2.append(v2)
            rows.append(cline)
    if words:
        # The file did not end with a blank line: keep the last sentence.
        sentences.append((words, postags, vec1, vec2, rows))
    return sentences


def _read_weights(featsfile):
    """Load ``feature weight`` lines into a dict of float weights."""
    weights = {}
    with open(featsfile, 'r') as feats:
        for raw in feats:
            line = raw.strip()
            if not line:
                continue
            # Split on the last space so feature names may contain spaces.
            name, wt = line.rsplit(' ', 1)
            weights[name] = float(wt)
    return weights


def main(testfile, featsfile):
    """Tag a test file with stored feature weights and print the result.

    ``testfile`` is read as UTF-8.  Sentences are separated by blank lines
    and every token line is split on tabs; 0-based column 1 is used as the
    word, column 3 as the POS tag and columns 10 and 11 as the two extra
    per-token inputs.  ``featsfile`` holds one ``feature weight`` pair per
    line, the weight being the text after the last space; blank lines are
    ignored.

    Each sentence is decoded with ``viterbi.execute``.  Every token's split
    line is passed to ``print_line_withmodification`` together with its
    predicted tag, and a blank line is printed after each sentence.

    Args:
        testfile: path of the tab-separated test file.
        featsfile: path of the feature-weight file.
    """
    labelset = ['0', '1', '*']

    sentences = _read_test_sentences(testfile)
    weights = _read_weights(featsfile)

    for sent, postags, vec1, vec2, rows in sentences:
        # The second value returned by execute is not needed here.
        tags, _ = viterbi.execute(sent, labelset, postags, vec1, vec2, weights)
        for j, tag in enumerate(tags):
            print_line_withmodification(rows[j], tag)
        # Python 2 bare print statement: blank line that ends the sentence.
        print
def _read_test_sentences(testfile):
    """Parse the UTF-8, tab-separated test file into per-sentence columns.

    Returns a list of ``(words, postags, vec1, vec2, rows)`` tuples, one per
    sentence, where ``rows`` holds the full split line of every token.
    """
    sentences = []
    words, postags, vec1, vec2, rows = [], [], [], [], []
    with codecs.open(testfile, 'r', 'utf-8') as test:
        for raw in test:
            line = raw.strip()
            if line == "":
                # A blank line closes the current sentence.
                sentences.append((words, postags, vec1, vec2, rows))
                words, postags, vec1, vec2, rows = [], [], [], [], []
                continue
            cline = line.split("\t")
            word = cline[1].strip()
            pos = cline[3].strip()
            v1 = cline[10].strip()
            v2 = cline[11].strip()
            words.append(word)
            postags.append(pos)
            vec1.append(v1)
            vec2.append(v2)
            rows.append(cline)
    if words:
        # The file did not end with a blank line: keep the last sentence.
        sentences.append((words, postags, vec1, vec2, rows))
    return sentences


def _read_weights(featsfile):
    """Load ``feature weight`` lines into a dict of float weights."""
    weights = {}
    with open(featsfile, 'r') as feats:
        for raw in feats:
            line = raw.strip()
            if not line:
                continue
            # Split on the last space so feature names may contain spaces.
            name, wt = line.rsplit(' ', 1)
            weights[name] = float(wt)
    return weights


def main(testfile, featsfile):
    """Tag a test file with stored feature weights and print the result.

    ``testfile`` is read as UTF-8.  Sentences are separated by blank lines
    and every token line is split on tabs; 0-based column 1 is used as the
    word, column 3 as the POS tag and columns 10 and 11 as the two extra
    per-token inputs.  ``featsfile`` holds one ``feature weight`` pair per
    line, the weight being the text after the last space; blank lines are
    ignored.

    Each sentence is decoded with ``viterbi.execute``.  Every token's split
    line is passed to ``print_line_withmodification`` together with its
    predicted tag, and a blank line is printed after each sentence.

    Args:
        testfile: path of the tab-separated test file.
        featsfile: path of the feature-weight file.
    """
    labelset = ['0', '1', '*']

    sentences = _read_test_sentences(testfile)
    weights = _read_weights(featsfile)

    for sent, postags, vec1, vec2, rows in sentences:
        # The second value returned by execute is not needed here.
        tags, _ = viterbi.execute(sent, labelset, postags, vec1, vec2, weights)
        for j, tag in enumerate(tags):
            print_line_withmodification(rows[j], tag)
        # Python 2 bare print statement: blank line that ends the sentence.
        print