def load_annotation(gold_file):
    """Parse an M2 gold-standard file into sentences and token-level edits.

    Returns a pair (source_sentences, gold_edits) where gold_edits[i] is a
    list of (start_offset, end_offset, original, corrections) tuples whose
    token offsets fall within the running token count up to sentence i.
    """
    source_sentences = []
    gold_edits = []
    fgold = smart_open(gold_file, 'r')
    puffer = fgold.read()
    fgold.close()
    # NOTE(review): .decode assumes the read returned a byte string
    # (Python 2 file object) -- confirm against smart_open.
    puffer = puffer.decode('utf8')
    for item in paragraphs(puffer.splitlines(True)):
        entry_lines = item.splitlines(False)
        sentence = [ln[2:].strip() for ln in entry_lines if ln.startswith('S ')]
        assert sentence != []
        annotation = []
        for ln in entry_lines[1:]:
            # 'I ' and 'S ' lines carry no edit information here.
            if ln.startswith('I ') or ln.startswith('S '):
                continue
            assert ln.startswith('A ')
            fields = ln[2:].split('|||')
            span = fields[0].split()
            start_offset = int(span[0])
            end_offset = int(span[1])
            etype = fields[1]  # parsed but unused, mirroring the original
            corrections = [c.strip() if c != '-NONE-' else ''
                           for c in fields[2].split('||')]
            # NOTE: start and end are *token* offsets into the joined sentence.
            original = ' '.join(' '.join(sentence).split()[start_offset:end_offset])
            annotation.append((start_offset, end_offset, original, corrections))
        tok_offset = 0
        for this_sentence in sentence:
            tok_offset += len(this_sentence.split())
            this_edits = [e for e in annotation
                          if e[0] <= tok_offset and e[1] <= tok_offset]
            source_sentences.append(this_sentence)
            gold_edits.append(this_edits)
    return (source_sentences, gold_edits)
def load_annotation(gold_file):
    """Parse an M2 gold-standard file, grouping edits by annotator id.

    Returns a pair (source_sentences, gold_edits) where gold_edits[i] is a
    dict mapping annotator id -> list of
    (start_offset, end_offset, original, corrections, etype) tuples valid for
    sentence i.  'noop' edits get sentinel offsets (-1, -1) and are filtered
    out of every sentence's edit list.
    """
    source_sentences = []
    gold_edits = []
    fgold = smart_open(gold_file, 'r')
    puffer = fgold.read()
    fgold.close()
    for item in paragraphs(puffer.splitlines(True)):
        item = item.splitlines(False)
        sentence = [line[2:].strip() for line in item if line.startswith('S ')]
        assert sentence != []
        annotations = {}
        for line in item[1:]:
            if line.startswith('I ') or line.startswith('S '):
                continue
            assert line.startswith('A ')
            line = line[2:]
            fields = line.split('|||')
            start_offset = int(fields[0].split()[0])
            end_offset = int(fields[0].split()[1])
            etype = fields[1]
            if etype == 'noop':
                # noop annotations carry no span; mark with sentinel offsets
                # so they are excluded by the >= 0 filter below.
                start_offset = -1
                end_offset = -1
            corrections = [c.strip() if c != '-NONE-' else ''
                           for c in fields[2].split('||')]
            # NOTE: start and end are *token* offsets into the joined sentence.
            original = ' '.join(' '.join(sentence).split()[start_offset:end_offset])
            annotator = int(fields[5])
            # setdefault replaces the O(n) `not in list(annotations.keys())` test.
            annotations.setdefault(annotator, []).append(
                (start_offset, end_offset, original, corrections, etype))
        tok_offset = 0
        for this_sentence in sentence:
            tok_offset += len(this_sentence.split())
            source_sentences.append(this_sentence)
            this_edits = {}
            for annotator, annotation in annotations.items():
                this_edits[annotator] = [
                    edit for edit in annotation
                    if 0 <= edit[0] <= tok_offset and 0 <= edit[1] <= tok_offset
                ]
            if len(this_edits) == 0:
                # No annotations at all: provide an empty default annotator 0.
                this_edits[0] = []
            gold_edits.append(this_edits)
    return (source_sentences, gold_edits)
def load_annotation(gold_file):
    """Parse an M2 gold-standard file, grouping edits by annotator id.

    Returns a pair (source_sentences, gold_edits) where gold_edits[i] is a
    dict mapping annotator id -> list of
    (start_offset, end_offset, original, corrections) tuples valid for
    sentence i.  'noop' edits get sentinel offsets (-1, -1) and are filtered
    out of every sentence's edit list.
    """
    source_sentences = []
    gold_edits = []
    fgold = smart_open(gold_file, "r")
    puffer = fgold.read()
    fgold.close()
    # NOTE(review): .decode assumes a byte string (Python 2 file read) --
    # confirm against smart_open before running on Python 3.
    puffer = puffer.decode("utf8")
    for item in paragraphs(puffer.splitlines(True)):
        item = item.splitlines(False)
        sentence = [line[2:].strip() for line in item if line.startswith("S ")]
        assert sentence != []
        annotations = {}
        for line in item[1:]:
            if line.startswith("I ") or line.startswith("S "):
                continue
            assert line.startswith("A ")
            line = line[2:]
            fields = line.split("|||")
            start_offset = int(fields[0].split()[0])
            end_offset = int(fields[0].split()[1])
            etype = fields[1]
            if etype == "noop":
                # noop annotations carry no span; sentinel offsets are
                # excluded by the >= 0 filter below.
                start_offset = -1
                end_offset = -1
            corrections = [c.strip() if c != "-NONE-" else ""
                           for c in fields[2].split("||")]
            # NOTE: start and end are *token* offsets into the joined sentence.
            original = " ".join(" ".join(sentence).split()[start_offset:end_offset])
            annotator = int(fields[5])
            # Membership test on the dict itself (was `in annotations.keys()`).
            if annotator not in annotations:
                annotations[annotator] = []
            annotations[annotator].append(
                (start_offset, end_offset, original, corrections))
        tok_offset = 0
        for this_sentence in sentence:
            tok_offset += len(this_sentence.split())
            source_sentences.append(this_sentence)
            this_edits = {}
            # .items() behaves identically on Python 2 here and also works on
            # Python 3 (the original .iteritems() is Python-2-only).
            for annotator, annotation in annotations.items():
                this_edits[annotator] = [
                    edit for edit in annotation
                    if edit[0] <= tok_offset and edit[1] <= tok_offset
                    and edit[0] >= 0 and edit[1] >= 0
                ]
            if len(this_edits) == 0:
                # No annotations at all: provide an empty default annotator 0.
                this_edits[0] = []
            gold_edits.append(this_edits)
    return (source_sentences, gold_edits)
def load_annotation(gold_file):
    """Read an M2 gold-standard file.

    Returns (source_sentences, gold_edits): gold_edits[i] lists the
    (start_offset, end_offset, original, corrections) tuples whose token
    offsets fit within the cumulative token count up to sentence i.
    """
    def _parse_edit(raw, joined_tokens):
        # Parse one 'A ' line into (start, end, original, corrections).
        fields = raw[2:].split('|||')
        span = fields[0].split()
        start, end = int(span[0]), int(span[1])
        etype = fields[1]  # parsed but unused, mirroring the original
        fixes = [c.strip() if c != '-NONE-' else ''
                 for c in fields[2].split('||')]
        # NOTE: start and end are *token* offsets.
        return (start, end, ' '.join(joined_tokens[start:end]), fixes)

    source_sentences = []
    gold_edits = []
    fgold = smart_open(gold_file, 'r')
    puffer = fgold.read()
    fgold.close()
    # NOTE(review): assumes a byte string was read (Python 2) -- confirm.
    puffer = puffer.decode('utf8')
    for item in paragraphs(puffer.splitlines(True)):
        entry = item.splitlines(False)
        sentence = [ln[2:].strip() for ln in entry if ln.startswith('S ')]
        assert sentence != []
        joined_tokens = ' '.join(sentence).split()
        annotation = []
        for ln in entry[1:]:
            if ln.startswith('I ') or ln.startswith('S '):
                continue
            assert ln.startswith('A ')
            annotation.append(_parse_edit(ln, joined_tokens))
        tok_offset = 0
        for this_sentence in sentence:
            tok_offset += len(this_sentence.split())
            source_sentences.append(this_sentence)
            gold_edits.append([e for e in annotation
                               if e[0] <= tok_offset and e[1] <= tok_offset])
    return (source_sentences, gold_edits)
# Split paragraphs read from stdin into train/dev/test files.
# args: two percentages (train, dev); the remainder goes to test.
opts, args = getopt.getopt(sys.argv[1:], "", ["out_train=", "out_dev=", "out_test="])
train_out = 'train'
dev_out = 'dev'
test_out = 'test'
for o, v in opts:
    if o == "--out_train":
        train_out = v
    elif o == "--out_dev":
        dev_out = v
    elif o == "--out_test":
        test_out = v
    else:
        # Unknown options are reported but not fatal (original behavior).
        sys.stderr.write("Unknown option :  " + o + "\n")
assert len(args) == 2
data = list(paragraphs(sys.stdin))
total = len(data)
# Floor division keeps the split sizes integral on both Python 2 and 3
# (the original relied on Python 2's integer '/').
train_split, dev_split = [int(i) * total // 100 for i in args]
test_split = total - train_split - dev_split
data_iter = iter(data)
for no_instances, output in zip([train_split, dev_split, test_split],
                                [train_out, dev_out, test_out]):
    # 'with' guarantees the output file is flushed and closed
    # (the original leaked the handle and never closed it).
    with open(output, 'w') as fout:
        for _ in range(no_instances):
            fout.write(next(data_iter) + '\n')
def load_annotation(gold_file):
    """Parse an M2 gold-standard file, grouping edits by annotator id.

    Returns a pair (source_sentences, gold_edits) where gold_edits[i] is a
    dict mapping annotator id -> list of
    (start_offset, end_offset, original, corrections) tuples valid for
    sentence i.  'noop' edits get sentinel offsets (-1, -1) and are filtered
    out of every sentence's edit list.
    """
    source_sentences = []
    gold_edits = []
    fgold = smart_open(gold_file, 'r')
    puffer = fgold.read()
    fgold.close()
    # NOTE(review): .decode assumes a byte string (Python 2 file read) --
    # confirm against smart_open.
    puffer = puffer.decode('utf8')
    for item in paragraphs(puffer.splitlines(True)):
        item = item.splitlines(False)
        sentence = [line[2:].strip() for line in item if line.startswith('S ')]
        assert sentence != []
        annotations = {}
        for line in item[1:]:
            if line.startswith('I ') or line.startswith('S '):
                continue
            assert line.startswith('A ')
            line = line[2:]
            fields = line.split('|||')
            start_offset = int(fields[0].split()[0])
            end_offset = int(fields[0].split()[1])
            etype = fields[1]
            if etype == 'noop':
                # noop annotations carry no span; sentinel offsets are
                # excluded by the >= 0 filter below.
                start_offset = -1
                end_offset = -1
            corrections = [c.strip() if c != '-NONE-' else ''
                           for c in fields[2].split('||')]
            # NOTE: start and end are *token* offsets into the joined sentence.
            original = ' '.join(' '.join(sentence).split()[start_offset:end_offset])
            annotator = int(fields[5])
            # Membership test on the dict itself (was `in annotations.keys()`).
            if annotator not in annotations:
                annotations[annotator] = []
            annotations[annotator].append(
                (start_offset, end_offset, original, corrections))
        tok_offset = 0
        for this_sentence in sentence:
            tok_offset += len(this_sentence.split())
            source_sentences.append(this_sentence)
            this_edits = {}
            # .items() behaves identically on Python 2 here and also works on
            # Python 3 (the original .iteritems() is Python-2-only).
            for annotator, annotation in annotations.items():
                this_edits[annotator] = [
                    edit for edit in annotation
                    if edit[0] <= tok_offset and edit[1] <= tok_offset
                    and edit[0] >= 0 and edit[1] >= 0
                ]
            if len(this_edits) == 0:
                # No annotations at all: provide an empty default annotator 0.
                this_edits[0] = []
            gold_edits.append(this_edits)
    return (source_sentences, gold_edits)


# Usage message.  sys.stderr.write lines replace the Python-2-only
# `print >> sys.stderr` statements with identical output on both versions;
# the --max_unchanged_words string literal was broken across a line break
# (a syntax error) and has been rejoined.
sys.stderr.write("Usage: m2scorer.py [OPTIONS] proposed_sentences gold_source\n")
sys.stderr.write("where\n")
sys.stderr.write(" proposed_sentences - system output, sentence per line\n")
sys.stderr.write(" source_gold - source sentences with gold token edits\n")
sys.stderr.write("OPTIONS\n")
sys.stderr.write(" -v --verbose - print verbose output\n")
sys.stderr.write(" --very_verbose - print lots of verbose output\n")
sys.stderr.write(" --max_unchanged_words N - Maximum unchanged words when extracting edits. Default 2.\n")
sys.stderr.write(" --beta B - Beta value for F-measure. Default 0.5.\n")
sys.stderr.write(" --ignore_whitespace_casing - Ignore edits that only affect whitespace and casing. Default no.\n")
import getopt

# Split paragraphs read from stdin into train/dev/test files.
# args: two percentages (train, dev); the remainder goes to test.
# NOTE(review): relies on `sys` and `paragraphs` being imported/defined
# elsewhere in this file.
opts, args = getopt.getopt(sys.argv[1:], "", ["out_train=", "out_dev=", "out_test="])
train_out = 'train'
dev_out = 'dev'
test_out = 'test'
for o, v in opts:
    if o == "--out_train":
        train_out = v
    elif o == "--out_dev":
        dev_out = v
    elif o == "--out_test":
        test_out = v
    else:
        # Unknown options are reported but not fatal (original behavior).
        sys.stderr.write("Unknown option :  " + o + "\n")
assert len(args) == 2
data = list(paragraphs(sys.stdin))
total = len(data)
# Floor division keeps the split sizes integral on both Python 2 and 3
# (the original relied on Python 2's integer '/').
train_split, dev_split = [int(i) * total // 100 for i in args]
test_split = total - train_split - dev_split
data_iter = iter(data)
for no_instances, output in zip([train_split, dev_split, test_split],
                                [train_out, dev_out, test_out]):
    # 'with' guarantees the output file is flushed and closed
    # (the original leaked the handle; 'range' replaces py2-only 'xrange').
    with open(output, 'w') as fout:
        for _ in range(no_instances):
            fout.write(next(data_iter) + '\n')