def load_annotation(gold_file):
    """Parse a single-annotator M2 gold-annotation file.

    Returns a pair ``(source_sentences, gold_edits)`` where
    ``gold_edits[i]`` is the list of ``(start, end, original, corrections)``
    edits applying to ``source_sentences[i]``.  ``start`` and ``end`` are
    *token* offsets into the space-joined paragraph.
    """
    source_sentences = []
    gold_edits = []
    handle = smart_open(gold_file, 'r')
    raw = handle.read()
    handle.close()
    raw = raw.decode('utf8')
    for paragraph in paragraphs(raw.splitlines(True)):
        lines = paragraph.splitlines(False)
        sentence = [ln[2:].strip() for ln in lines if ln.startswith('S ')]
        assert sentence != []
        annotation = []
        for ln in lines[1:]:
            # Skip identity / source lines; everything else must be an edit.
            if ln.startswith('I ') or ln.startswith('S '):
                continue
            assert ln.startswith('A ')
            fields = ln[2:].split('|||')
            span = fields[0].split()
            start_offset = int(span[0])
            end_offset = int(span[1])
            etype = fields[1]
            corrections = [c.strip() if c != '-NONE-' else ''
                           for c in fields[2].split('||')]
            # NOTE: start and end are *token* offsets
            original = ' '.join(
                ' '.join(sentence).split()[start_offset:end_offset])
            annotation.append((start_offset, end_offset, original, corrections))
        tok_offset = 0
        for this_sentence in sentence:
            tok_offset += len(this_sentence.split())
            this_edits = [e for e in annotation
                          if e[0] <= tok_offset and e[1] <= tok_offset]
            source_sentences.append(this_sentence)
            gold_edits.append(this_edits)
    return (source_sentences, gold_edits)
def load_annotation(gold_file):
    """Parse a multi-annotator M2 gold-annotation file.

    Returns a pair ``(source_sentences, gold_edits)`` where
    ``gold_edits[i]`` maps each annotator id to its list of
    ``(start, end, original, corrections, etype)`` edits for
    ``source_sentences[i]``.  ``start``/``end`` are *token* offsets.
    ``noop`` edits get offsets (-1, -1) and are filtered out of the
    per-sentence lists; a sentence with no annotators gets ``{0: []}``.
    """
    source_sentences = []
    gold_edits = []
    fgold = smart_open(gold_file, 'r')
    puffer = fgold.read()
    fgold.close()
    for item in paragraphs(puffer.splitlines(True)):
        item = item.splitlines(False)
        sentence = [line[2:].strip() for line in item if line.startswith('S ')]
        assert sentence != []
        annotations = {}
        for line in item[1:]:
            # Skip identity / source lines; everything else must be an edit.
            if line.startswith('I ') or line.startswith('S '):
                continue
            assert line.startswith('A ')
            line = line[2:]
            fields = line.split('|||')
            start_offset = int(fields[0].split()[0])
            end_offset = int(fields[0].split()[1])
            etype = fields[1]
            if etype == 'noop':
                # Sentinel offsets so noop edits are dropped by the >= 0
                # filter below.
                start_offset = -1
                end_offset = -1
            corrections = [c.strip() if c != '-NONE-' else ''
                           for c in fields[2].split('||')]
            # NOTE: start and end are *token* offsets
            original = ' '.join(
                ' '.join(sentence).split()[start_offset:end_offset])
            annotator = int(fields[5])
            # setdefault replaces the `not in list(annotations.keys())`
            # membership test, which built a throwaway list per edit line.
            annotations.setdefault(annotator, []).append(
                (start_offset, end_offset, original, corrections, etype))
        tok_offset = 0
        for this_sentence in sentence:
            tok_offset += len(this_sentence.split())
            source_sentences.append(this_sentence)
            this_edits = {}
            for annotator, annotation in annotations.items():
                this_edits[annotator] = [
                    edit for edit in annotation
                    if edit[0] <= tok_offset and edit[1] <= tok_offset
                    and edit[0] >= 0 and edit[1] >= 0
                ]
            if len(this_edits) == 0:
                this_edits[0] = []
            gold_edits.append(this_edits)
    return (source_sentences, gold_edits)
def load_annotation(gold_file):
    """Parse a multi-annotator M2 gold-annotation file (byte input).

    The file handle's contents are decoded as UTF-8.  Returns
    ``(source_sentences, gold_edits)`` where ``gold_edits[i]`` maps each
    annotator id to its ``(start, end, original, corrections)`` edits for
    ``source_sentences[i]``; ``noop`` edits get offsets (-1, -1) and are
    filtered out, and a sentence with no annotators gets ``{0: []}``.

    Fix: the original called ``annotations.iteritems()``, which is
    Python-2-only and raises AttributeError on Python 3 (the rest of this
    file uses Python 3 ``print()``); replaced with ``.items()``.
    """
    source_sentences = []
    gold_edits = []
    fgold = smart_open(gold_file, "r")
    puffer = fgold.read()
    fgold.close()
    puffer = puffer.decode("utf8")
    for item in paragraphs(puffer.splitlines(True)):
        item = item.splitlines(False)
        sentence = [line[2:].strip() for line in item if line.startswith("S ")]
        assert sentence != []
        annotations = {}
        for line in item[1:]:
            # Skip identity / source lines; everything else must be an edit.
            if line.startswith("I ") or line.startswith("S "):
                continue
            assert line.startswith("A ")
            line = line[2:]
            fields = line.split("|||")
            start_offset = int(fields[0].split()[0])
            end_offset = int(fields[0].split()[1])
            etype = fields[1]
            if etype == "noop":
                # Sentinel offsets so noop edits are dropped by the >= 0
                # filter below.
                start_offset = -1
                end_offset = -1
            corrections = [c.strip() if c != "-NONE-" else ""
                           for c in fields[2].split("||")]
            # NOTE: start and end are *token* offsets
            original = " ".join(
                " ".join(sentence).split()[start_offset:end_offset])
            annotator = int(fields[5])
            if annotator not in annotations:
                annotations[annotator] = []
            annotations[annotator].append(
                (start_offset, end_offset, original, corrections))
        tok_offset = 0
        for this_sentence in sentence:
            tok_offset += len(this_sentence.split())
            source_sentences.append(this_sentence)
            this_edits = {}
            for annotator, annotation in annotations.items():
                this_edits[annotator] = [
                    edit for edit in annotation
                    if edit[0] <= tok_offset and edit[1] <= tok_offset
                    and edit[0] >= 0 and edit[1] >= 0
                ]
            if len(this_edits) == 0:
                this_edits[0] = []
            gold_edits.append(this_edits)
    return (source_sentences, gold_edits)
def evaluateIt(system_file, gold_file, verbose=False):
    """Score a system's corrections against an M2 gold file.

    Parameters
    ----------
    system_file : path to the system hypotheses, one sentence per line
        (read as bytes and decoded as UTF-8).
    gold_file : path to the M2-format gold annotation file.
    verbose : pass-through verbosity flag for the scorer.

    Returns
    -------
    (p, r, f1) : precision, recall and F1 as computed by
        ``levenshtein.batch_multi_pre_rec_f1``.
    """
    # Fixed scorer settings (formerly parsed from the command line; the
    # dead getopt block has been removed).
    max_unchanged_words = 2
    ignore_whitespace_casing = False
    very_verbose = False
    # load source sentences and gold edits
    source_sentences, gold_edits = load_annotation(gold_file)
    # load system hypotheses
    fin = smart_open(system_file, 'r')
    system_sentences = [line.decode("utf8").strip() for line in fin.readlines()]
    fin.close()
    p, r, f1 = levenshtein.batch_multi_pre_rec_f1(
        system_sentences, source_sentences, gold_edits,
        max_unchanged_words, ignore_whitespace_casing, verbose, very_verbose)
    return p, r, f1
max_unchanged_words = int(v) elif o == '--ignore_whitespace_casing': ignore_whitespace_casing = True else: print >> sys.stderr, "Unknown option :", o print_usage() sys.exit(-1) # starting point if len(args) != 2: print_usage() sys.exit(-1) system_file = args[0] gold_file = args[1] # load source sentences and gold edits source_sentences, gold_edits = load_annotation(gold_file) # load system hypotheses fin = smart_open(system_file, 'r') system_sentences = [line.decode("utf8").strip() for line in fin.readlines()] fin.close() p, r, f1 = levenshtein.batch_pre_rec_f1(system_sentences, source_sentences, gold_edits, max_unchanged_words, ignore_whitespace_casing, verbose, very_verbose) print "Precision : %.4f" % p print "Recall : %.4f" % r print "F1 : %.4f" % f1
elif o == "--ignore_whitespace_casing": ignore_whitespace_casing = True else: print >> sys.stderr, "Unknown option :", o print_usage() sys.exit(-1) # starting point if len(args) != 2: print_usage() sys.exit(-1) system_file = args[0] gold_file = args[1] # load source sentences and gold edits source_sentences, gold_edits = load_annotation(gold_file) # load system hypotheses fin = smart_open(system_file, "r") system_sentences = [line.decode("utf8").strip() for line in fin.readlines()] fin.close() p, r, f1 = levenshtein.batch_multi_pre_rec_f1( system_sentences, source_sentences, gold_edits, max_unchanged_words, ignore_whitespace_casing, verbose, very_verbose ) print "Precision : %.4f" % p print "Recall : %.4f" % r print "F1 : %.4f" % f1
else: print("Unknown option :", o, file=sys.stderr) print_usage() sys.exit(-1) # starting point if len(args) != 2: print_usage() sys.exit(-1) system_file = args[0] gold_file = args[1] # load source sentences and gold edits source_sentences, gold_edits = load_annotation(gold_file) # load system hypotheses fin = smart_open(system_file, mode='r') system_sentences = [line.strip() for line in fin.readlines()] fin.close() p, r, f1 = levenshtein.batch_multi_pre_rec_f1(system_sentences, source_sentences, gold_edits, max_unchanged_words, beta, ignore_whitespace_casing, verbose, very_verbose) print(("Precision : %.4f" % p)) print(("Recall : %.4f" % r)) print(("F_%.1f : %.4f" % (beta, f1)))