Example #1
0
def random_Implicit_subset(sample_size=30, corpus_filename='pdtb2.csv'):
    """
    Write pdtb-random-Implicit-subset.csv, a CSV of randomly chosen
    Implicit examples drawn from each primary semantic class.

    sample_size -- number of examples sampled per class (default: 30).
    corpus_filename -- path to the PDTB CSV corpus.

    The output columns are named for the attributes/methods that
    determined their values.
    """
    by_class = defaultdict(list)
    reader = CorpusReader(corpus_filename)
    for item in reader.iter_data(display_progress=True):
        # Keep only Implicit relations with no supplementary text.
        if item.Relation != 'Implicit':
            continue
        if item.Sup1_RawText or item.Sup2_RawText:
            continue
        by_class[item.primary_semclass1()].append(item)
    with open('pdtb-random-Implicit-subset.csv', 'w') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(
            ['Arg1_RawText', 'conn_str', 'Arg2_RawText', 'primary_semclass1'])
        for semclass, items in list(by_class.items()):
            # Shuffle, then take the first sample_size as the random sample.
            shuffle(items)
            for item in items[:sample_size]:
                writer.writerow([
                    item.Arg1_RawText,
                    item.conn_str(),
                    item.Arg2_RawText,
                    semclass,
                ])
Example #2
0
def print_attribution_texts(corpus_filename='pdtb2.csv'):
    """Inspect the strings characterizing attribution values."""
    # Print every non-empty Attribution_RawText in corpus order.
    for datum in CorpusReader(corpus_filename).iter_data(display_progress=False):
        raw = datum.Attribution_RawText
        if raw:
            print(raw)
Example #3
0
def distribution_of_relative_arg_order(corpus_filename='pdtb2.csv'):
    """Print counts of relative_arg_order() values, largest count first."""
    counts = defaultdict(int)
    reader = CorpusReader(corpus_filename)
    for datum in reader.iter_data(display_progress=True):
        counts[datum.relative_arg_order()] += 1
    ordered = sorted(counts.items(), key=itemgetter(1), reverse=True)
    for arg_order, total in ordered:
        print(arg_order, total)
Example #4
0
def attribution_counts(corpus_filename='pdtb2.csv'):
    """Create a count dictionary of non-null attribution values."""
    counts = defaultdict(int)
    for datum in CorpusReader(corpus_filename).iter_data():
        source = datum.Attribution_Source
        # Skip null/empty attribution sources.
        if source:
            counts[source] += 1
    return counts
Example #5
0
def count_semantic_classes(corpus_filename='pdtb2.csv'):
    """Count ConnHeadSemClass1 values."""
    counts = defaultdict(int)
    for datum in CorpusReader(corpus_filename).iter_data():
        semclass = datum.ConnHeadSemClass1
        # Skip None values (should be just EntRel/NonRel data):
        if semclass:
            counts[semclass] += 1
    return counts
Example #6
0
def relation_count(corpus_filename='pdtb2.csv'):
    """Calculate and display the distribution of relations."""
    # Tally relation types across the corpus.
    counts = defaultdict(int)
    for datum in CorpusReader(corpus_filename).iter_data():
        counts[datum.Relation] += 1
    # Report one "<relation> <count>" line per relation type.
    for relation, total in counts.items():
        print("{} {}".format(relation, total))
Example #7
0
def connective_distribution(corpus_filename='pdtb2.csv'):
    """Counts of connectives by relation type."""
    counts = defaultdict(lambda: defaultdict(int))
    for datum in CorpusReader(corpus_filename).iter_data():
        conn = datum.conn_str(distinguish_implicit=False)
        # Skip None values (should be just EntRel/NoRel data); lowercase
        # the connective to collapse case variants before counting.
        if conn:
            counts[datum.Relation][conn.lower()] += 1
    return counts
Example #8
0
def semantic_classes_in_implicit_relations(corpus_filename='pdtb2.csv'):
    """Count the primary semantic classes for connectives
    limited to Implicit relations."""
    counts = defaultdict(int)
    for datum in CorpusReader(corpus_filename).iter_data(display_progress=True):
        if datum.Relation == 'Implicit':
            counts[datum.primary_semclass1()] += 1
    # Print, sorted by counts, largest first:
    ordered = sorted(counts.items(), key=itemgetter(1), reverse=True)
    for semclass, total in ordered:
        print(semclass, total)
Example #9
0
def connective_initial(sem_re, output_filename, corpus_filename='pdtb2.csv'):
    """
    Pull out examples of Explicit or Implicit relations in which

    (i) Arg1 immediately precedes Arg2, with only the connective
        intervening in the case of Explicit.
    (ii) There is no supplementary text on either argument.
    (iii) ConnHeadSemClass1 matches the user-supplied regex sem_re

    The results go into a CSV file named output_filename.
    """
    keepers = {}  # Items that pass muster, keyed by "Section/FileNumber".
    for datum in CorpusReader(corpus_filename).iter_data(display_progress=False):
        rel = datum.Relation
        # Restrict to Implicit/Explicit examples with no supplementary text.
        if rel not in ('Implicit', 'Explicit'):
            continue
        if datum.Sup1_RawText or datum.Sup2_RawText:
            continue
        # Further restrict to the class of semantic relations captured by sem_re:
        if not sem_re.search(datum.ConnHeadSemClass1):
            continue
        # Arg1, the connective, and Arg2 must all be adjacent:
        if not adjacency_check(datum):
            continue
        # Stick to simple connectives: for Explicit, the connective and its
        # head are the same; for Implicit, there is no secondary connective.
        is_simple = (
            (rel == 'Explicit' and datum.ConnHead == datum.Connective_RawText)
            or (rel == 'Implicit' and not datum.Conn2))
        if not is_simple:
            continue
        itemId = "%s/%s" % (datum.Section, datum.FileNumber)
        print(itemId)
        # We needn't flag them, since column 2 does that.
        conn = datum.conn_str(distinguish_implicit=False)
        # Dict keyed by file number avoids taking two sentences from the
        # same file (later hits overwrite earlier ones).
        keepers[itemId] = [
            itemId, rel, datum.ConnHeadSemClass1,
            datum.Arg1_RawText, conn, datum.Arg2_RawText
        ]
    # Store the results in a CSV file:
    with open(output_filename, 'wt') as outfile:
        writer = csv.writer(outfile)
        writer.writerow([
            'ItemId', 'Relation', 'ConnHeadSemClass1', 'Arg1', 'Connective',
            'Arg2'
        ])
        writer.writerows(list(keepers.values()))
    print("CSV created.")
def preprocess(splitting):
    """
    Extract Implicit PDTB-2 relations and pickle train/dev/test splits.

    splitting -- 1 for the Lin et al. split (train 02-21, dev 22, test 23);
        2 for the Ji & Eisenstein split (train 02-20, dev 00-01,
        test 21-22); 3 for the same sections as 2 but without the
        selected_sense filter.

    Writes train.pkl/dev.pkl/test.pkl under ./interim/lin/, ./interim/ji/
    or ./interim/l/ respectively. Each pickle holds, in order: arg1 list,
    arg2 list, then (train) connective + sense lists or (dev/test) the
    primary and secondary sense lists, which are kept index-parallel.

    Raises Exception('wrong splitting') for any other value of splitting.
    """
    # Section splits: 1 = Lin, 2/3 = Ji.
    if splitting == 1:
        train_sec = ['%02d' % i for i in range(2, 22)]
        dev_sec = ['22']
        test_sec = ['23']
    elif splitting == 2 or splitting == 3:
        train_sec = ['%02d' % i for i in range(2, 21)]
        dev_sec = ['00', '01']
        test_sec = ['21', '22']
    else:
        raise Exception('wrong splitting')

    arg1_train = []
    arg2_train = []
    conn_train = []
    sense_train = []

    arg1_dev = []
    arg2_dev = []
    sense1_dev = []
    sense2_dev = []

    arg1_test = []
    arg2_test = []
    sense1_test = []
    sense2_test = []

    for corpus in CorpusReader('./raw/pdtb2.csv').iter_data():
        if corpus.Relation != 'Implicit':
            continue
        # Level-2 sense is the first two dot-separated components.
        sense_split = corpus.ConnHeadSemClass1.split('.')
        sense_l2 = '.'.join(sense_split[0:2])
        if (sense_l2 in selected_sense) or splitting == 3:
            arg1, _ = arg_filter(corpus.arg1_pos(wn_format=True))
            arg2, _ = arg_filter(corpus.arg2_pos(wn_format=True))
            if corpus.Section in train_sec:
                arg1_train.append(arg1)
                arg2_train.append(arg2)
                conn_train.append(corpus.Conn1)
                sense_train.append([sense_l2, sense_split[0]])
            elif corpus.Section in dev_sec:
                arg1_dev.append(arg1)
                arg2_dev.append(arg2)
                sense1_dev.append([sense_l2, sense_split[0]])
            elif corpus.Section in test_sec:
                arg1_test.append(arg1)
                arg2_test.append(arg2)
                sense1_test.append([sense_l2, sense_split[0]])
            else:
                continue
            # Handle the secondary connective/sense, if any.
            if corpus.Conn2 is not None:
                sense_split = corpus.Conn2SemClass1.split('.')
                sense_l2 = '.'.join(sense_split[0:2])
                if (sense_l2 in selected_sense) or splitting == 3:
                    if corpus.Section in train_sec:
                        # Train: duplicate the argument pair with the
                        # second connective/sense as an extra example.
                        arg1_train.append(arg1)
                        arg2_train.append(arg2)
                        conn_train.append(corpus.Conn2)
                        sense_train.append([sense_l2, sense_split[0]])
                    elif corpus.Section in dev_sec:
                        sense2_dev.append([sense_l2, sense_split[0]])
                    elif corpus.Section in test_sec:
                        sense2_test.append([sense_l2, sense_split[0]])
                else:
                    # BUG FIX: a secondary sense exists but is filtered out
                    # by selected_sense. Dev/test still need a placeholder,
                    # otherwise sense1_*/sense2_* lose index alignment and
                    # the length asserts below fail.
                    if corpus.Section in dev_sec:
                        sense2_dev.append([None, None])
                    elif corpus.Section in test_sec:
                        sense2_test.append([None, None])
            else:
                # No secondary connective: keep dev/test lists parallel.
                if corpus.Section in dev_sec:
                    sense2_dev.append([None, None])
                elif corpus.Section in test_sec:
                    sense2_test.append([None, None])

    # All per-split lists must stay index-parallel.
    assert len(arg1_train) == len(arg2_train) == len(conn_train) == len(
        sense_train)
    assert len(arg1_dev) == len(arg2_dev) == len(sense1_dev) == len(sense2_dev)
    assert len(arg1_test) == len(arg2_test) == len(sense1_test) == len(
        sense2_test)
    print('train size:', len(arg1_train))
    print('dev size:', len(arg1_dev))
    print('test size:', len(arg1_test))

    if splitting == 1:
        pre = './interim/lin/'
    elif splitting == 2:
        pre = './interim/ji/'
    elif splitting == 3:
        pre = './interim/l/'
    # Pickle order matters: readers must unpickle in the same order.
    with open(pre + 'train.pkl', 'wb') as f:
        pickle.dump(arg1_train, f)
        pickle.dump(arg2_train, f)
        pickle.dump(conn_train, f)
        pickle.dump(sense_train, f)
    with open(pre + 'dev.pkl', 'wb') as f:
        pickle.dump(arg1_dev, f)
        pickle.dump(arg2_dev, f)
        pickle.dump(sense1_dev, f)
        pickle.dump(sense2_dev, f)
    with open(pre + 'test.pkl', 'wb') as f:
        pickle.dump(arg1_test, f)
        pickle.dump(arg2_test, f)
        pickle.dump(sense1_test, f)
        pickle.dump(sense2_test, f)
Example #11
0
def pdtb2_make_splits_xval(path, write_path):
    """Make 12 cross-validation splits for PDTB 2.0.

    path -- PDTB 2.0 CSV file, read via CorpusReader.
    write_path -- directory under which one 'fold_<k>' subdirectory is
        written per fold (via write_to_file).

    Each fold uses 2 dev sections, 2 test sections, and 21 train
    sections, rotating in steps of 2 around the 25 WSJ sections.
    Only Implicit relations whose level-2 sense is in
    SELECTED_SENSES_PDTB2 are kept.
    """

    sections = [
        '00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11',
        '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
        '24'
    ]

    dev_sections = []
    test_sections = []
    train_sections = []

    # Rotate the dev window in steps of 2 (mod 25); test is the 2 sections
    # just before dev, train the 21 sections after it. range(0, 25, 2)
    # yields 13 candidates; the last is dropped below ([:-1]) to keep 12.
    for i in range(0, 25, 2):
        dev_sections.append([sections[i], sections[(i + 1) % 25]])
        test_sections.append(
            [sections[(i + 23) % 25], sections[(i + 24) % 25]])
        train_sections.append([sections[(i + j) % 25] for j in range(2, 23)])

    # Running totals of line counts per split, accumulated across folds
    # and averaged at the end.
    means_d = {'train': 0, 'dev': 0, 'test': 0}

    # Materialize once so every fold iterates the same data.
    pdtb_data = list(CorpusReader(path).iter_data())
    for fold_no, (dev, test, train) in enumerate(
            zip(dev_sections[:-1], test_sections[:-1], train_sections[:-1])):
        # Sanity check: the three splits partition all 25 sections.
        all_splits = dev + test + train
        assert len(set(all_splits)) == 25

        split_d = {'train': train, 'dev': dev, 'test': test}
        lines_d = {'train': [], 'dev': [], 'test': []}
        label_d = {}  # label -> count for this fold

        for corpus in pdtb_data:
            # NOTE(review): this loop variable shadows the outer
            # `sections` list, which is not used again afterwards.
            for split, sections in split_d.items():
                if corpus.Relation == 'Implicit' and corpus.Section in sections:
                    # Up to four (full sense, connective) candidates:
                    # two senses for Conn1, two for Conn2.
                    sense1 = (corpus.ConnHeadSemClass1, corpus.Conn1)
                    sense2 = (corpus.ConnHeadSemClass2, corpus.Conn1)
                    sense3 = (corpus.Conn2SemClass1, corpus.Conn2)
                    sense4 = (corpus.Conn2SemClass2, corpus.Conn2)

                    # use list instead of set to preserve order
                    sense_list = [sense1, sense2, sense3, sense4]
                    formatted_sense_list = []
                    for sense_full, conn in sense_list:
                        if sense_full is not None:
                            # Level-2 sense = first two dot-separated parts.
                            sense = '.'.join(sense_full.split('.')[0:2])
                            # Keep only new, selected senses (dedup by
                            # level-2 sense, first occurrence wins).
                            if (sense not in [
                                    s for s, c, sf in formatted_sense_list
                            ] and sense in SELECTED_SENSES_PDTB2):
                                formatted_sense_list.append(
                                    (sense, conn, sense_full))

                    # No useable senses
                    if len(formatted_sense_list) == 0:
                        continue

                    arg1 = corpus.Arg1_RawText
                    arg2 = corpus.Arg2_RawText

                    if split == 'train':
                        # Train: one line per usable sense (multi-label
                        # examples become multiple single-label lines).
                        for sense, conn, sense_full in formatted_sense_list:
                            lines_d[split].append(
                                tab_delimited([
                                    split, corpus.Section, corpus.FileNumber,
                                    sense, corpus.Relation, arg1, arg2, conn,
                                    sense_full
                                ]))
                            label_d[sense] = label_d.get(sense, 0) + 1

                    else:
                        # Dev/test: one line carrying both senses; pad with
                        # Nones when there is only one.
                        if len(formatted_sense_list) == 1:
                            formatted_sense_list.append((None, None, None))
                        # zip of two (sense, conn, full) triples yields the
                        # pairs (senses, conns, fulls) unpacked below.
                        sense_paired = zip(formatted_sense_list[0],
                                           formatted_sense_list[1])
                        senses, conns, senses_full = sense_paired
                        lines_d[split].append(
                            tab_delimited([
                                split, corpus.Section, corpus.FileNumber,
                                senses[0], senses[1], corpus.Relation, arg1,
                                arg2, conns[0], senses_full[0], conns[1],
                                senses_full[1]
                            ]))
                        label_d[senses[0]] = label_d.get(senses[0], 0) + 1
                        if senses[1] is not None:
                            label_d[senses[1]] = label_d.get(senses[1], 0) + 1

                    # At most two distinct selected senses are expected.
                    assert len(formatted_sense_list) <= 2
                    if len(formatted_sense_list) == 2:
                        if formatted_sense_list[0][0] == formatted_sense_list[
                                1][0]:
                            print('redundant!')

        for split, lines in lines_d.items():
            # NOTE(review): the -1 presumably discounts a header line added
            # elsewhere (tab_delimited/write_to_file?) — confirm against
            # those helpers before relying on the reported means.
            means_d[split] += len(lines) - 1

        # Write to file
        write_path_fold = os.path.join(write_path,
                                       'fold_{}'.format(fold_no + 1))
        write_to_file(lines_d, write_path_fold)

        print('Cross-validation fold {}'.format(fold_no + 1))
        print('Label counts: ', label_d)

        total = 0
        for _, count in label_d.items():
            total += count

        print('Total: ', total)

    # Average example counts over the 12 folds actually used.
    for split, total in means_d.items():
        print('Mean {}: {}'.format(split, total / len(dev_sections[:-1])))
Example #12
0
def pdtb2_make_splits_single_l1(path, write_path, split_name):
    """
    Make single standard split for PDTB 2.0, using Level-1 labels (4-way classification).

    path -- PDTB 2.0 CSV file, read via CorpusReader.
    write_path -- destination passed to write_to_file.
    split_name -- one of 'ji', 'lin', 'patterson':
        'ji': Split from Ji & Eisenstein (2015), 2-20 train, 0-1 dev, 21-22 test
        'lin': Split from Lin et al. (2009) and dev set as indicated by Qin et al. (2017),
               2-21 train, 22 dev, 23 test
        'patterson': Split from Patterson & Kehler (2013), 2-22 train, 0-1 dev, 23-24 test

    Raises ValueError for any other split_name.
    """
    if split_name == 'ji':
        train_sections = [
            '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12',
            '13', '14', '15', '16', '17', '18', '19', '20'
        ]
        dev_sections = ['00', '01']
        test_sections = ['21', '22']

    elif split_name == 'lin':
        train_sections = [
            '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12',
            '13', '14', '15', '16', '17', '18', '19', '20', '21'
        ]
        dev_sections = ['22']
        test_sections = ['23']

    elif split_name == 'patterson':
        train_sections = [
            '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12',
            '13', '14', '15', '16', '17', '18', '19', '20', '21', '22'
        ]
        dev_sections = ['00', '01']
        test_sections = ['23', '24']

    else:
        # BUG FIX: previously an unknown split_name fell through and
        # crashed later with UnboundLocalError on train_sections.
        raise ValueError(
            "split_name must be one of 'ji', 'lin', 'patterson'; "
            "got {!r}".format(split_name))

    sections = train_sections + dev_sections + test_sections

    lines_d = {'train': [], 'dev': [], 'test': []}
    label_d = {}  # Level-1 label -> count across the whole corpus.

    for corpus in CorpusReader(path).iter_data():
        if corpus.Relation == 'Implicit' and corpus.Section in sections:
            # Up to four (full sense, connective) candidates: two senses
            # for Conn1, two for Conn2.
            sense1 = (corpus.ConnHeadSemClass1, corpus.Conn1)
            sense2 = (corpus.ConnHeadSemClass2, corpus.Conn1)
            sense3 = (corpus.Conn2SemClass1, corpus.Conn2)
            sense4 = (corpus.Conn2SemClass2, corpus.Conn2)

            # use list instead of set to preserve order
            sense_list = [sense1, sense2, sense3, sense4]
            formatted_sense_list = []
            for sense_full, conn in sense_list:
                if sense_full is not None:
                    # Level-1 label = first dot-separated component.
                    sense = sense_full.split('.')[0]
                    # Deduplicate by Level-1 label, first occurrence wins.
                    if sense not in [s for s, c, sf in formatted_sense_list]:
                        formatted_sense_list.append((sense, conn, sense_full))

            # Should be at least one sense
            assert len(formatted_sense_list) > 0

            arg1 = corpus.Arg1_RawText
            arg2 = corpus.Arg2_RawText

            if corpus.Section in train_sections:
                split = 'train'
            elif corpus.Section in dev_sections:
                split = 'dev'
            else:
                split = 'test'

            if split == 'train':
                # Train: one line per distinct Level-1 sense.
                for sense, conn, sense_full in formatted_sense_list:
                    lines_d[split].append(
                        tab_delimited([
                            split, corpus.Section, corpus.FileNumber, sense,
                            corpus.Relation, arg1, arg2, conn, sense_full
                        ]))
                    label_d[sense] = label_d.get(sense, 0) + 1

            else:
                # Dev/test: one line carrying both senses; pad with Nones
                # when there is only one.
                if len(formatted_sense_list) == 1:
                    formatted_sense_list.append((None, None, None))
                # zip of two (sense, conn, full) triples yields the pairs
                # (senses, conns, fulls) unpacked below.
                sense_paired = zip(formatted_sense_list[0],
                                   formatted_sense_list[1])
                senses, conns, senses_full = sense_paired
                lines_d[split].append(
                    tab_delimited([
                        split, corpus.Section, corpus.FileNumber, senses[0],
                        senses[1], corpus.Relation, arg1, arg2, conns[0],
                        senses_full[0], conns[1], senses_full[1]
                    ]))

                label_d[senses[0]] = label_d.get(senses[0], 0) + 1
                if senses[1] is not None:
                    label_d[senses[1]] = label_d.get(senses[1], 0) + 1

            # Dedup by Level-1 label caps this at the 4-way label count;
            # two equal labels here would mean the dedup above failed.
            assert len(formatted_sense_list) <= 2
            if len(formatted_sense_list) == 2:
                if formatted_sense_list[0][0] == formatted_sense_list[1][0]:
                    raise ValueError('Redundant labels!')

    # Write to file
    write_to_file(lines_d, write_path)
    print('Label count: ', label_d)