def load_annotation(gold_file):
    """Read an M2-format gold-standard file.

    Returns a pair (source_sentences, gold_edits): for each source sentence,
    the list of (start, end, original, corrections) edits whose *token*
    offsets fall within the running token count up to that sentence.
    """
    sources = []
    edits_by_sentence = []
    handle = smart_open(gold_file, 'r')
    raw = handle.read()
    handle.close()
    raw = raw.decode('utf8')
    for block in paragraphs(raw.splitlines(True)):
        rows = block.splitlines(False)
        sentences = [r[2:].strip() for r in rows if r.startswith('S ')]
        assert sentences != []
        parsed_edits = []
        for row in rows[1:]:
            # Skip identity ("I ") and source ("S ") lines; everything else
            # must be an annotation ("A ") line.
            if row.startswith('I ') or row.startswith('S '):
                continue
            assert row.startswith('A ')
            fields = row[2:].split('|||')
            span = fields[0].split()
            start, end = int(span[0]), int(span[1])
            etype = fields[1]  # edit type is parsed but not recorded here
            corrections = []
            for cand in fields[2].split('||'):
                corrections.append('' if cand == '-NONE-' else cand.strip())
            # NOTE: start/end index *tokens* of the joined sentence block
            tokens = ' '.join(sentences).split()
            original = ' '.join(tokens[start:end])
            parsed_edits.append((start, end, original, corrections))
        seen_tokens = 0
        for sent in sentences:
            seen_tokens += len(sent.split())
            sources.append(sent)
            edits_by_sentence.append(
                [e for e in parsed_edits
                 if e[0] <= seen_tokens and e[1] <= seen_tokens])
    return (sources, edits_by_sentence)
Exemple #2
0
def load_annotation(gold_file):
    """Load an M2 gold annotation file with per-annotator edits.

    Returns (source_sentences, gold_edits). gold_edits[i] maps each
    annotator id to the list of (start, end, original, corrections, etype)
    edits whose *token* offsets fall within the cumulative token count up
    to sentence i. Sentences with no annotations get {0: []}.
    """
    source_sentences = []
    gold_edits = []
    fgold = smart_open(gold_file, 'r')
    puffer = fgold.read()
    fgold.close()
    for item in paragraphs(puffer.splitlines(True)):
        item = item.splitlines(False)
        sentence = [line[2:].strip() for line in item if line.startswith('S ')]
        assert sentence != []
        annotations = {}
        for line in item[1:]:
            # Skip identity ("I ") and source ("S ") lines; everything else
            # must be an annotation ("A ") line.
            if line.startswith('I ') or line.startswith('S '):
                continue
            assert line.startswith('A ')
            fields = line[2:].split('|||')
            start_offset = int(fields[0].split()[0])
            end_offset = int(fields[0].split()[1])
            etype = fields[1]
            if etype == 'noop':
                # "noop" edits carry no real span; flag with -1 so they are
                # filtered out when edits are assigned to sentences below.
                start_offset = -1
                end_offset = -1
            corrections = [
                c.strip() if c != '-NONE-' else ''
                for c in fields[2].split('||')
            ]
            # NOTE: start and end are *token* offsets
            original = ' '.join(
                ' '.join(sentence).split()[start_offset:end_offset])
            annotator = int(fields[5])
            # setdefault replaces the former "not in list(annotations.keys())"
            # test, which built a throwaway list on every annotation line.
            annotations.setdefault(annotator, []).append(
                (start_offset, end_offset, original, corrections, etype))
        tok_offset = 0
        for this_sentence in sentence:
            tok_offset += len(this_sentence.split())
            source_sentences.append(this_sentence)
            this_edits = {}
            for annotator, annotation in annotations.items():
                this_edits[annotator] = [
                    edit for edit in annotation
                    if 0 <= edit[0] <= tok_offset and 0 <= edit[1] <= tok_offset
                ]
            if not this_edits:
                this_edits[0] = []
            gold_edits.append(this_edits)
    return (source_sentences, gold_edits)
Exemple #3
0
def load_annotation(gold_file):
    """Load an M2 gold annotation file keyed by annotator.

    Returns (source_sentences, gold_edits). gold_edits[i] maps each
    annotator id to the list of (start, end, original, corrections) edits
    whose *token* offsets fall within the cumulative token count up to
    sentence i. Sentences with no annotations get {0: []}.
    """
    source_sentences = []
    gold_edits = []
    fgold = smart_open(gold_file, "r")
    puffer = fgold.read()
    fgold.close()
    puffer = puffer.decode("utf8")
    for item in paragraphs(puffer.splitlines(True)):
        item = item.splitlines(False)
        sentence = [line[2:].strip() for line in item if line.startswith("S ")]
        assert sentence != []
        annotations = {}
        for line in item[1:]:
            # Skip identity ("I ") and source ("S ") lines; everything else
            # must be an annotation ("A ") line.
            if line.startswith("I ") or line.startswith("S "):
                continue
            assert line.startswith("A ")
            fields = line[2:].split("|||")
            start_offset = int(fields[0].split()[0])
            end_offset = int(fields[0].split()[1])
            etype = fields[1]
            if etype == "noop":
                # "noop" edits carry no real span; flag with -1 so they are
                # filtered out when edits are assigned to sentences below.
                start_offset = -1
                end_offset = -1
            corrections = [c.strip() if c != "-NONE-" else "" for c in fields[2].split("||")]
            # NOTE: start and end are *token* offsets
            original = " ".join(" ".join(sentence).split()[start_offset:end_offset])
            annotator = int(fields[5])
            # setdefault replaces the redundant keys() membership test
            annotations.setdefault(annotator, []).append(
                (start_offset, end_offset, original, corrections))
        tok_offset = 0
        for this_sentence in sentence:
            tok_offset += len(this_sentence.split())
            source_sentences.append(this_sentence)
            this_edits = {}
            # items() instead of the Python-2-only iteritems(): identical
            # iteration behavior, and the function no longer breaks on
            # Python 3 at this line.
            for annotator, annotation in annotations.items():
                this_edits[annotator] = [
                    edit
                    for edit in annotation
                    if 0 <= edit[0] <= tok_offset and 0 <= edit[1] <= tok_offset
                ]
            if not this_edits:
                this_edits[0] = []
            gold_edits.append(this_edits)
    return (source_sentences, gold_edits)
Exemple #4
0
def load_annotation(gold_file):
    """Parse M2-format gold edits.

    Returns (source_sentences, gold_edits): one list of source sentences
    and, aligned with it, one list per sentence of (start, end, original,
    corrections) edits whose *token* offsets lie within the running token
    count up to that sentence.
    """
    src_out, edit_out = [], []
    fin = smart_open(gold_file, 'r')
    text = fin.read()
    fin.close()
    text = text.decode('utf8')
    for para in paragraphs(text.splitlines(True)):
        lines = para.splitlines(False)
        sents = []
        for ln in lines:
            if ln.startswith('S '):
                sents.append(ln[2:].strip())
        assert sents != []
        all_edits = []
        # The span offsets index tokens of the whole joined paragraph.
        joined_tokens = ' '.join(sents).split()
        for ln in lines[1:]:
            # Only annotation ("A ") lines carry edits.
            if ln.startswith('I ') or ln.startswith('S '):
                continue
            assert ln.startswith('A ')
            parts = ln[2:].split('|||')
            offs = parts[0].split()
            start, end = int(offs[0]), int(offs[1])
            etype = parts[1]  # parsed but unused downstream
            fixes = ['' if c == '-NONE-' else c.strip()
                     for c in parts[2].split('||')]
            all_edits.append(
                (start, end, ' '.join(joined_tokens[start:end]), fixes))
        count = 0
        for sent in sents:
            count += len(sent.split())
            selected = [e for e in all_edits
                        if e[0] <= count and e[1] <= count]
            src_out.append(sent)
            edit_out.append(selected)
    return (src_out, edit_out)
Exemple #5
0
    # --- command-line handling (fragment; the enclosing "def" is outside
    # this excerpt). Splits paragraphs read from stdin into train/dev/test
    # files; --out_train/--out_dev/--out_test override the default names.
    opts, args = getopt.getopt(sys.argv[1:], "", ["out_train=", "out_dev=", "out_test="])
    train_out = 'train'
    dev_out = 'dev'
    test_out = 'test'
    for o, v in opts:
        if o == "--out_train":
            train_out = v
        elif o == "--out_dev":
            dev_out = v
        elif o == "--out_test":
            test_out = v
        else:
            # Python 2 print-to-stderr syntax; unknown options are reported
            # but not fatal.
            print >> sys.stderr, "Unknown option : ", o

    # The two positional args are the train and dev percentages; the
    # remainder becomes the test split.
    assert len(args) == 2

    data = list(paragraphs(sys.stdin))
    total = len(data)
    # NOTE(review): relies on Python 2 integer division — under Python 3
    # this expression yields floats; confirm the intended interpreter.
    train_split, dev_split = map(lambda i : int(i) * total / 100, args)
    test_split = total - train_split - dev_split
    data_iter = iter(data)

    # Write each split to its own output file. NOTE(review): fout is never
    # closed and xrange/'wb'-with-str are Python-2-isms.
    for no_instances, output in zip([train_split, dev_split, test_split], [train_out, dev_out, test_out]):
        fout = open(output, 'wb')
        for i in xrange(no_instances):            
            fout.write(next(data_iter) + '\n')
        

        
Exemple #6
0
def load_annotation(gold_file):
    """Load an M2 gold annotation file keyed by annotator.

    Returns (source_sentences, gold_edits). gold_edits[i] maps each
    annotator id to the list of (start, end, original, corrections) edits
    whose *token* offsets fall within the cumulative token count up to
    sentence i. Sentences with no annotations get {0: []}.
    """
    source_sentences = []
    gold_edits = []
    fgold = smart_open(gold_file, 'r')
    puffer = fgold.read()
    fgold.close()
    puffer = puffer.decode('utf8')
    for item in paragraphs(puffer.splitlines(True)):
        item = item.splitlines(False)
        sentence = [line[2:].strip() for line in item if line.startswith('S ')]
        assert sentence != []
        annotations = {}
        for line in item[1:]:
            # Skip identity ("I ") and source ("S ") lines; everything else
            # must be an annotation ("A ") line.
            if line.startswith('I ') or line.startswith('S '):
                continue
            assert line.startswith('A ')
            fields = line[2:].split('|||')
            start_offset = int(fields[0].split()[0])
            end_offset = int(fields[0].split()[1])
            etype = fields[1]
            if etype == 'noop':
                # "noop" edits carry no real span; flag with -1 so they are
                # filtered out when edits are assigned to sentences below.
                start_offset = -1
                end_offset = -1
            corrections = [
                c.strip() if c != '-NONE-' else ''
                for c in fields[2].split('||')
            ]
            # NOTE: start and end are *token* offsets
            original = ' '.join(
                ' '.join(sentence).split()[start_offset:end_offset])
            annotator = int(fields[5])
            # setdefault replaces the redundant keys() membership test
            annotations.setdefault(annotator, []).append(
                (start_offset, end_offset, original, corrections))
        tok_offset = 0
        for this_sentence in sentence:
            tok_offset += len(this_sentence.split())
            source_sentences.append(this_sentence)
            this_edits = {}
            # items() instead of the Python-2-only iteritems(): identical
            # iteration behavior, and the function no longer breaks on
            # Python 3 at this line.
            for annotator, annotation in annotations.items():
                this_edits[annotator] = [
                    edit for edit in annotation
                    if 0 <= edit[0] <= tok_offset and 0 <= edit[1] <= tok_offset
                ]
            if not this_edits:
                this_edits[0] = []
            gold_edits.append(this_edits)
    return (source_sentences, gold_edits)

    # Usage-message fragment (body of a help/usage function; the enclosing
    # "def" is outside this excerpt). Uses Python 2 print-to-stderr syntax.
    print >> sys.stderr, "Usage: m2scorer.py [OPTIONS] proposed_sentences gold_source"
    print >> sys.stderr, "where"
    print >> sys.stderr, "  proposed_sentences   -   system output, sentence per line"
    print >> sys.stderr, "  source_gold          -   source sentences with gold token edits"
    print >> sys.stderr, "OPTIONS"
    print >> sys.stderr, "  -v    --verbose                   -  print verbose output"
    print >> sys.stderr, "        --very_verbose              -  print lots of verbose output"
    print >> sys.stderr, "        --max_unchanged_words N     -  Maximum unchanged words when extraction edit. Default 2."
    print >> sys.stderr, "        --beta B                    -  Beta value for F-measure. Default 0.5."
    print >> sys.stderr, "        --ignore_whitespace_casing  -  Ignore edits that only affect whitespace and caseing. Default no."
Exemple #7
0
    # --- command-line handling (fragment; the enclosing "def" is outside
    # this excerpt). Same train/dev/test splitter as elsewhere in this file,
    # reformatted. Function-scope import keeps getopt local to this routine.
    import getopt

    opts, args = getopt.getopt(sys.argv[1:], "",
                               ["out_train=", "out_dev=", "out_test="])
    train_out = 'train'
    dev_out = 'dev'
    test_out = 'test'
    for o, v in opts:
        if o == "--out_train":
            train_out = v
        elif o == "--out_dev":
            dev_out = v
        elif o == "--out_test":
            test_out = v
        else:
            # Python 2 print-to-stderr syntax; unknown options are reported
            # but not fatal.
            print >> sys.stderr, "Unknown option : ", o

    # The two positional args are the train and dev percentages; the
    # remainder becomes the test split.
    assert len(args) == 2

    data = list(paragraphs(sys.stdin))
    total = len(data)
    # NOTE(review): relies on Python 2 integer division — under Python 3
    # this expression yields floats; confirm the intended interpreter.
    train_split, dev_split = map(lambda i: int(i) * total / 100, args)
    test_split = total - train_split - dev_split
    data_iter = iter(data)

    # Write each split to its own output file. NOTE(review): fout is never
    # closed and xrange/'wb'-with-str are Python-2-isms.
    for no_instances, output in zip([train_split, dev_split, test_split],
                                    [train_out, dev_out, test_out]):
        fout = open(output, 'wb')
        for i in xrange(no_instances):
            fout.write(next(data_iter) + '\n')