Example 1
def load_training_as_dataframe():
    """Load training section of the RST-WSJ corpus as a pandas.DataFrame.

    Returns
    -------
    df: pandas.DataFrame
        DataFrame of all instances of relations in the training section.
        Interesting columns are 'rel', 'nuc_sig', 'arity'
    """
    rst_phrases = []  # list of rows, each represented as a dict

    rst_reader = RstReader(CD_TRAIN)
    rst_corpus = rst_reader.slurp()
    for doc_id, rtree_ref in sorted(rst_corpus.items()):
        # convert labels to coarse
        coarse_rtree_ref = REL_CONV(rtree_ref)
        # store "same-unit" subtrees
        heterogeneous_nodes = []
        internal_nodes = lambda t: isinstance(t, RSTTree) and len(t) > 1
        for su_subtree in coarse_rtree_ref.subtrees(filter=internal_nodes):
            # get each kid's relation
            kid_rels = tuple(treenode(kid).rel for kid in su_subtree)
            # filter out nodes whose kids have different relations
            rels = [r for r in set(kid_rels) if r != 'span']
            if len(rels) > 1:
                heterogeneous_nodes.append(kid_rels)
                continue

            # process homogeneous nodes
            res = dict()
            rel = rels[0]
            res['rel'] = rel
            # arity
            res['arity'] = len(su_subtree)  # number of kids
            # nuclearity signature
            kid_nucs = tuple(treenode(kid).nuclearity for kid in su_subtree)
            nuc_sig = ''.join('S' if kid_nuc == 'Satellite' else 'N'
                              for kid_nuc in kid_nucs)
            res['nuc_sig'] = (nuc_sig
                              if nuc_sig in frozenset(['SN', 'NS']) else 'NN')
            # TODO: len(kid_rels) - 1 is the number of binary relations

            # height
            rel_hgt = su_subtree.height()
            res['height'] = rel_hgt

            # TODO disc relations of the grandchildren
            #

            rst_phrases.append(res)

    # turn into a DataFrame
    df = pd.DataFrame(rst_phrases)
    # add calculated columns
    # * "undirected" nuclearity, e.g. NS == SN
    df['unuc_sig'] = df['nuc_sig'].map(
        lambda nuc_sig: 'NS' if nuc_sig in ('NS', 'SN') else 'NN')
    return df
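
A usage sketch for the function above; the aggregation below is illustrative and assumes only the columns named in the docstring, it is not part of the original module:

# Usage sketch (assumed, not from the original module): summarise the
# relation instances returned by load_training_as_dataframe().
df = load_training_as_dataframe()
# instance counts per relation label and nuclearity signature
counts = df.groupby(['rel', 'nuc_sig']).size().sort_values(ascending=False)
print(counts.head(20))
# proportion of multinuclear ('NN') instances per relation
nn_share = (df['nuc_sig'] == 'NN').groupby(df['rel']).mean()
print(nn_share.sort_values(ascending=False).head(10))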
Example 2
def load_training_as_dataframe():
    """Load training section of the RST-WSJ corpus as a pandas.DataFrame.

    Returns
    -------
    df: pandas.DataFrame
        DataFrame of all instances of relations in the training section.
        Interesting columns are 'rel', 'nuc_sig', 'arity'
    """
    rst_phrases = []  # list of rows, each represented as a dict

    rst_reader = RstReader(CD_TRAIN)
    rst_corpus = rst_reader.slurp()
    for doc_id, rtree_ref in sorted(rst_corpus.items()):
        # convert labels to coarse
        coarse_rtree_ref = REL_CONV(rtree_ref)
        # store "same-unit" subtrees
        heterogeneous_nodes = []
        internal_nodes = lambda t: isinstance(t, RSTTree) and len(t) > 1
        for su_subtree in coarse_rtree_ref.subtrees(filter=internal_nodes):
            # get each kid's relation
            kid_rels = tuple(treenode(kid).rel for kid in su_subtree)
            # filter out nodes whose kids have different relations
            rels = [r for r in set(kid_rels) if r != 'span']
            if len(rels) > 1:
                heterogeneous_nodes.append(kid_rels)
                continue

            # process homogeneous nodes
            res = dict()
            rel = rels[0]
            res['rel'] = rel
            # arity
            res['arity'] = len(su_subtree)  # number of kids
            # nuclearity signature
            kid_nucs = tuple(treenode(kid).nuclearity for kid in su_subtree)
            nuc_sig = ''.join('S' if kid_nuc == 'Satellite' else 'N'
                              for kid_nuc in kid_nucs)
            res['nuc_sig'] = (nuc_sig if nuc_sig in frozenset(['SN', 'NS'])
                              else 'NN')
            # TODO: len(kid_rels) - 1 is the number of binary relations

            # height
            rel_hgt = su_subtree.height()
            res['height'] = rel_hgt

            # TODO disc relations of the grandchildren
            #

            rst_phrases.append(res)

    # turn into a DataFrame
    df = pd.DataFrame(rst_phrases)
    # add calculated columns
    # * "undirected" nuclearity, e.g. NS == SN
    df['unuc_sig'] = df['nuc_sig'].map(
        lambda nuc_sig: 'NS' if nuc_sig in ('NS', 'SN') else 'NN')
    return df
Example 3
def dump_dep_rstdt(corpus_dir, out_dir, nary_enc):
    """Convert and dump the RST-DT corpus as dependency trees."""
    # convert and dump RST trees from train
    dir_train = os.path.join(corpus_dir, TRAIN_FOLDER)
    if not os.path.isdir(dir_train):
        raise ValueError('No such folder: {}'.format(dir_train))
    reader_train = Reader(dir_train)
    trees_train = reader_train.slurp()
    dtrees_train = {doc_name: RstDepTree.from_rst_tree(rst_tree,
                                                       nary_enc=nary_enc)
                    for doc_name, rst_tree in trees_train.items()}
    dump_disdep_files(dtrees_train.values(),
                      os.path.join(out_dir, os.path.basename(dir_train)))

    # convert and dump RST trees from test
    dir_test = os.path.join(corpus_dir, TEST_FOLDER)
    if not os.path.isdir(dir_test):
        raise ValueError('No such folder: {}'.format(dir_test))
    reader_test = Reader(dir_test)
    trees_test = reader_test.slurp()
    dtrees_test = {doc_name: RstDepTree.from_rst_tree(rst_tree,
                                                      nary_enc=nary_enc)
                   for doc_name, rst_tree in trees_test.items()}
    dump_disdep_files(dtrees_test.values(),
                      os.path.join(out_dir, os.path.basename(dir_test)))
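
A small driver sketch for dump_dep_rstdt; the command-line arguments below are assumptions for illustration, not the project's actual CLI:

# Illustrative driver (assumed, not the project's actual CLI) for the
# converter above.
import argparse

def main():
    parser = argparse.ArgumentParser(
        description='Dump the RST-DT corpus as dependency trees')
    parser.add_argument('corpus_dir', help='root folder of the RST-DT corpus')
    parser.add_argument('out_dir', help='output folder for the dumped files')
    parser.add_argument('--nary_enc', default='chain',
                        choices=['chain', 'tree'],
                        help='encoding of n-ary relations in the deptrees')
    args = parser.parse_args()
    dump_dep_rstdt(args.corpus_dir, args.out_dir, args.nary_enc)

if __name__ == '__main__':
    main()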
Example 4
    if not os.path.exists(PTB_DIR):
        raise ValueError("Unable to find PTB dir {}".format(PTB_DIR))
    if not os.path.exists(RST_DIR):
        raise ValueError("Unable to find RST dir {}".format(RST_DIR))
    if not os.path.exists(CORENLP_OUT_DIR):
        raise ValueError("Unable to find parsed dir {}".format(
            CORENLP_OUT_DIR))

    corpus = 'RSTtrees-WSJ-main-1.0/TRAINING'
    corpus_dir = os.path.join(RST_DIR, corpus)
    # syntactic parsers to compare
    ptb_reader = BracketParseCorpusReader(PTB_DIR,
                                          r'../wsj_.*\.mrg',
                                          encoding='ascii')
    # read the RST corpus
    rst_reader = Reader(corpus_dir)
    rst_corpus = rst_reader.slurp()
    # for each file, compare tokenizations between PTB and CoreNLP
    for key, rst_tree in sorted(rst_corpus.items()):
        doc_name = key.doc.split('.', 1)[0]
        if doc_name.startswith('wsj_'):
            print(doc_name)
            doc_wsj_num = doc_name.split('_')[1]
            section = doc_wsj_num[:2]

            # corenlp stuff
            core_fname = os.path.join(CORENLP_OUT_DIR, corpus,
                                      doc_name + '.out.xml')
            core_reader = PreprocessingSource()
            core_reader.read(core_fname, suffix='')
            corenlp_doc = read_corenlp_result(None, core_reader)
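
The excerpt ends once the CoreNLP output is loaded; a hypothetical continuation of the tokenization comparison might look like the following (the corenlp_doc attributes and the PTB file id construction are assumptions, not verified against educe's API):

            # Hypothetical continuation (assumed attributes): compare token
            # counts between the PTB gold tokenization and CoreNLP's output.
            ptb_fileid = os.path.join(section, doc_name + '.mrg')
            ptb_words = list(ptb_reader.words(ptb_fileid))
            core_words = [tok.word for tok in corenlp_doc.tokens]  # assumed attr
            if len(ptb_words) != len(core_words):
                print('  token counts differ: PTB={}, CoreNLP={}'.format(
                    len(ptb_words), len(core_words)))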
Example 5
def load_corpus_as_dataframe_new(selection='train', binarize=False,
                                 verbose=0):
    """Load training section of the RST-WSJ corpus as a pandas.DataFrame.

    Parameters
    ----------
    selection : one of {'train', 'test'}  TODO: add 'both'
        Select the part of the corpus to load.

    binarize : boolean, default: False
        If True, apply right-heavy binarization on RST trees.

    Returns
    -------
    node_df: pandas.DataFrame
        DataFrame of all nodes from the constituency trees.
    rel_df: pandas.DataFrame
        DataFrame of all relations.
    edu_df: pandas.DataFrame
        DataFrame of all EDUs.
    sent_df: pandas.DataFrame
        DataFrame of all sentences.
    para_df: pandas.DataFrame
        DataFrame of all paragraphs.

    TODO
    ----
    [ ] intra-sentential-first right-heavy binarization
    [ ] left-heavy binarization (?)
    [ ] add selection='both'

    Notes
    -----
    `selection='both'` can currently be done as:
    ```
    train_df = load_corpus_as_dataframe_new(selection='train')
    test_df = load_corpus_as_dataframe_new(selection='test')
    both_df = train_df.append(test_df)
    ```
    """
    node_rows = []  # list of dicts, one dict per node
    rel_rows = []  # list of dicts, one dict per relation
    # edu_rows contains pre-EDUs rather than EDUs themselves, but maybe
    # conflating both does no harm
    edu_rows = []  # list of dicts, one dict per EDU
    sent_rows = []  # ibid
    para_rows = []  # ibid
    strad_rels_rows = []  # ibid; relations straddling a sentence boundary

    if selection == 'train':
        rst_reader = RstReader(CD_TRAIN)
    elif selection == 'test':
        rst_reader = RstReader(CD_TEST)
    else:
        raise ValueError('Unknown selection {}'.format(selection))

    rst_corpus = rst_reader.slurp()

    for doc_id, rtree_ref in sorted(rst_corpus.items()):
        doc_ctx = rtree_ref.label().context
        doc_text = doc_ctx.text()
        doc_edus = rtree_ref.leaves()

        # 0. collect EDUs
        doc_edu_rows = load_edus(doc_edus)

        # 1. collect spans (constituency nodes) from the gold RST trees
        #
        # transform the original RST tree: convert labels to their
        # coarse equivalent, binarize if required
        coarse_rtree_ref = REL_CONV(rtree_ref)
        if binarize:
            coarse_rtree_ref = _binarize(coarse_rtree_ref)
        # collect spans
        doc_span_rows = load_spans(coarse_rtree_ref)

        # prepare this info to find "leaky" substructures:
        # sentences and paragraphs
        # dict of EDU spans to constituent node from the RST tree
        rst_tree_node_spans = {
            (row['edu_start'], row['edu_end']): row['treepos']
            for row in doc_span_rows
        }
        # list of EDU spans of constituent nodes, sorted by length of span
        # then start
        rst_tree_node_spans_by_len = list(sorted(
            rst_tree_node_spans, key=lambda x: (x[1] - x[0], x[0])))

        # 2. Collect sentences
        doc_sent_rows = []
        # use dirty PTB tokenizer + parser
        # NB: the two following lines eat up 67% of total time
        doc_tkd_toks = tokenize_doc_ptb(doc_id, doc_text)
        doc_tkd_trees = parse_doc_ptb(doc_id, doc_tkd_toks)

        # sentence <-> EDU mapping and the information that depends on this
        # mapping might be more appropriate as a separate DataFrame
        # align EDUs with sentences
        edu2sent = align_edus_with_sentences(doc_edus, doc_tkd_trees,
                                             strict=False)
        # get the codomain of edu2sent
        # if we want to be strict, we can assert that the codomain is
        # a gapless interval
        # assert sent_idc == list(range(len(doc_tkd_trees)))
        # this assertion is currently known to fail on:
        # * RST-WSJ/TRAINING/wsj_0678.out: wrong sentence segmentation in PTB
        #     (1 sentence is split in 2)
        edu2sent_codom = set([sent_idx for sent_idx in edu2sent
                              if sent_idx is not None])

        # find the index of the first and last EDU of each sentence
        # indices in both lists are offset by 1 to map to real EDU
        # numbering (which is 1-based)
        sent_edu_starts = [(edu2sent.index(i) + 1 if i in edu2sent_codom
                            else None)
                           for i in range(len(doc_tkd_trees))]
        sent_edu_ends = [(len(edu2sent) - 1 - edu2sent[::-1].index(i) + 1
                          if i in edu2sent_codom
                          else None)
                         for i in range(len(doc_tkd_trees))]
        # sentences that don't have their own RST subtree are 'leaky'

        # WIP propagate sentence-EDU mapping to RST tree spans
        for row in doc_span_rows:
            # offset by -1 because edu2sent is 0-based
            row['sent_start'] = edu2sent[row['edu_start'] - 1]
            row['sent_end'] = edu2sent[row['edu_end'] - 1]
            # inferred columns
            # special cases because edu2sent is None for EDUs whose text
            # is missing from PTB
            if ((row['sent_start'] is not None and
                 row['sent_end'] is not None)):
                row['sent_len'] = row['sent_end'] - row['sent_start'] + 1
                row['intra_sent'] = (row['sent_start'] == row['sent_end'])
                row['strad_sent'] = (not row['intra_sent'] and
                                     (not row['edu_start'] in sent_edu_starts or
                                      not row['edu_end'] in sent_edu_ends))
            else:
                row['sent_len'] = None
                row['intra_sent'] = None
                row['strad_sent'] = None
        # end WIP propagate

        # end of sentence <-> EDU mapping et al.

        # iterate over syntactic trees as proxy for sentences
        for sent_idx, tkd_tree in enumerate(doc_tkd_trees):
            row = {
                # data directly from the sentence segmenter
                'sent_id': '{}_sent{}'.format(doc_id.doc, sent_idx),
                'char_start': tkd_tree.span.char_start,
                'char_end': tkd_tree.span.char_end,
            }
            # sentence <-> EDU mapping dependent data
            # should probably have its own dataframe
            # to better handle disagreement between sentence and EDU
            # segmentation, that translates in the following entries
            # as None for missing data
            if sent_idx in edu2sent_codom:
                row.update({
                    'edu_start': sent_edu_starts[sent_idx],
                    'edu_end': sent_edu_ends[sent_idx],
                    # computed column
                    'edu_len': (sent_edu_ends[sent_idx] -
                                sent_edu_starts[sent_idx]) + 1,
                })

                # use alignment: sentence <-> RST tree spans (via EDUs)
                # leaky sentences: complex (2+ EDUs) sentences that don't
                # have a corresponding span in the RST tree
                leaky = (row['edu_len'] > 1 and
                         ((sent_edu_starts[sent_idx],
                           sent_edu_ends[sent_idx])
                          not in rst_tree_node_spans))
                row.update({
                    'leaky': leaky,
                })
                # find for each leaky sentence the smallest RST subtree
                # that covers it
                if row['leaky']:
                    sent_edu_first = sent_edu_starts[sent_idx]
                    sent_edu_last = sent_edu_ends[sent_idx]
                    # find parent span and straddling spans ;
                    # straddling spans exist for type 3 and 4
                    strad_spans = []
                    for edu_span in rst_tree_node_spans_by_len:
                        # parent span
                        if ((edu_span[0] <= sent_edu_first and
                             sent_edu_last <= edu_span[1])):
                            parent_span = edu_span
                            break
                        # straddling spans
                        if ((edu_span[0] < sent_edu_first and
                             sent_edu_first <= edu_span[1]) or
                            (edu_span[0] <= sent_edu_last and
                             sent_edu_last < edu_span[1])):
                            strad_spans.append(edu_span)
                    else:
                        raise ValueError(
                            'No minimal spanning node for {}'.format(row))
                    # leaky types {1, 2} vs {3, 4}:
                    # for types 1 and 2, members of the parent
                    # constituent are "pure" wrt sentence span:
                    # each member is either fully inside or fully
                    # outside the sentence ;
                    # no member straddles either of the sentence
                    # boundaries
                    leaky_type_12 = not strad_spans
                    # DEBUG
                    if verbose:
                        print(doc_id.doc)
                        print(parent_span, strad_spans if strad_spans else '')
                    # end DEBUG

                    # leaky types {1, 3} vs {2, 4}
                    # {1, 3} have at least one coordinative (aka multinuclear)
                    # relation in the chain of spans between the parent span
                    # and the EDU(s) of the leaky sentence ;
                    # {2, 4} have only subordinative (aka mononuclear)
                    # relations
                    leaky_coord = False
                    # first, check the kids of the parent span
                    parent_tpos = rst_tree_node_spans[parent_span]
                    parent_subtree = coarse_rtree_ref[parent_tpos]
                    if all(kid.label().nuclearity == 'Nucleus'
                           for kid in parent_subtree):
                        leaky_coord = True

                    # then, check the kids of all straddling spans
                    strad_rels = []  # TEMPORARY
                    for strad_span in strad_spans:
                        strad_tpos = rst_tree_node_spans[strad_span]
                        strad_subtree = coarse_rtree_ref[strad_tpos]
                        if all(kid.label().nuclearity == 'Nucleus'
                               for kid in strad_subtree):
                            leaky_coord = True
                        # TEMPORARY: store straddling relations (from kids)
                        kid_rels = [kid.label().rel
                                    for kid in strad_subtree
                                    if kid.label().rel != 'span']
                        # if all kids bear the same relation label, store
                        # only this value
                        if len(set(kid_rels)) == 1:
                            kid_rels = kid_rels[0]
                        else:
                            kid_rels = '+'.join(kid_rels)
                        strad_rels.append(kid_rels)
                        # WIP list of straddling relations
                        strad_rels_rows.append({
                            'node_id': '{}_const{}'.format(
                                strad_subtree.origin.doc,
                                '-'.join(str(x) for x in strad_tpos)),
                            'sent_id': '{}_sent{}'.format(
                                doc_id.doc, sent_idx),
                            'kid_rels': kid_rels,
                        })
                        # end WIP running counter
                    # determine type of leaky (ugly)
                    if leaky_type_12:
                        if leaky_coord:
                            leaky_type = 1
                        else:
                            leaky_type = 2
                    else:
                        if leaky_coord:
                            leaky_type = 3
                        else:
                            leaky_type = 4
                    # display type of leaky
                    if verbose:
                        print('Type {} ({}-level {} structure)\t{}'.format(
                            leaky_type,
                            'Same' if leaky_type_12 else 'Multi',
                            'coordination' if leaky_coord else 'subordination',
                            '; '.join(strad_rels)))
                        print()
                    # end WIP nuclearity of straddling spans

                    # add info to row
                    row.update({
                        # parent span, in EDUs
                        'parent_edu_start': parent_span[0],
                        'parent_edu_end': parent_span[1],
                        # length of parent span, in sentences
                        'parent_sent_len': (
                            edu2sent[parent_span[1] - 1] -
                            edu2sent[parent_span[0] - 1] + 1),
                        # distance between the current sentence and the most
                        # remote sentence covered by the parent span,
                        # in sentences
                        'parent_sent_dist': (
                            max([(edu2sent[parent_span[1] - 1] - sent_idx),
                                 (sent_idx - edu2sent[parent_span[0] - 1])])),
                        # types of leaky, in the taxonomy of
                        # (van der Vliet et al. 2011)
                        'leaky_type': leaky_type,
                    })
                else:
                    row.update({
                        'parent_span_start': row['edu_start'],
                        'parent_span_end': row['edu_end'],
                        # default value for leaky_type; provides a means for
                        # easy comparison, on complex sentences, between
                        # non-leaky and the various types of leaky
                        'leaky_type': 0,
                    })
                # end WIP
            doc_sent_rows.append(row)

        # 3. collect paragraphs
        doc_para_rows = []
        doc_paras = doc_ctx.paragraphs
        doc_text = doc_ctx.text()
        # doc_paras is None when the original text has no explicit marking
        # for paragraphs ; this is true for 'fileX' documents in the RST-WSJ
        # corpus
        if doc_paras is not None:
            # EDU to paragraph mapping
            edu2para = align_edus_with_paragraphs(doc_edus, doc_paras,
                                                  doc_text, strict=False)
            edu2para_codom = set([para_idx for para_idx in edu2para
                                  if para_idx is not None])
            # index of the first and last EDU of each paragraph
            para_edu_starts = [(edu2para.index(i) + 1 if i in edu2para_codom
                                else None)
                               for i in range(len(doc_paras))]
            para_edu_ends = [(len(edu2para) - 1 - edu2para[::-1].index(i) + 1
                              if i in edu2para_codom
                              else None)
                             for i in range(len(doc_paras))]
            # paragraphs that don't have their own RST subtree are "leaky" ;
            # end of paragraph <-> EDU mapping et al.

            # iterate over paragraphs
            for para_idx, para in enumerate(doc_paras):
                # dirty, educe.rst_dt.text.Paragraph should have a span
                para_span = Span(para.sentences[0].span.char_start,
                                 para.sentences[-1].span.char_end)
                # end dirty
                row = {
                    # data directly from the paragraph segmenter
                    'para_id': '{}_para{}'.format(doc_id.doc, para_idx),
                    'char_start': para_span.char_start,
                    'char_end': para_span.char_end,
                }
                # paragraph <-> EDU mapping dependent data
                # should probably have its own dataframe etc.
                if para_idx in edu2para_codom:
                    row.update({
                        'edu_start': para_edu_starts[para_idx],
                        'edu_end': para_edu_ends[para_idx],
                        # computed column
                        'edu_len': (para_edu_ends[para_idx] -
                                    para_edu_starts[para_idx]) + 1,
                    })
                    # use paragraph <-> RST tree alignment
                    if row['edu_len'] > 1:  # complex paragraphs only
                        row.update({
                            'leaky': ((para_edu_starts[para_idx],
                                       para_edu_ends[para_idx])
                                      not in rst_tree_node_spans),
                        })
                    else:
                        row.update({'leaky': False})
                    # WIP find for each leaky paragraph the smallest RST
                    # subtree that covers it
                    if row['leaky']:
                        for edu_span in rst_tree_node_spans_by_len:
                            if ((edu_span[0] <= para_edu_starts[para_idx] and
                                 para_edu_ends[para_idx] <= edu_span[1])):
                                parent_span = edu_span
                                break
                        else:
                            raise ValueError(
                                'No minimal spanning node for {}'.format(row))
                        # add info to row
                        row.update({
                            # parent span, on EDUs
                            'parent_edu_start': parent_span[0],
                            'parent_edu_end': parent_span[1]
                        })
                        # length of parent span, in paragraphs
                        if ((edu2para[parent_span[1] - 1] is not None and
                             edu2para[parent_span[0] - 1] is not None)):
                            row.update({
                                'parent_para_len': (
                                    edu2para[parent_span[1] - 1] -
                                    edu2para[parent_span[0] - 1] + 1),
                                # distance between the current paragraph and the
                                # most remote paragraph covered by the parent
                                # span, in paragraphs
                                'parent_para_dist': (
                                    max([(edu2para[parent_span[1] - 1] -
                                          para_idx),
                                         (para_idx -
                                          edu2para[parent_span[0] - 1])])),
                            })
                    else:
                        row.update({
                            'parent_edu_start': row['edu_start'],
                            'parent_edu_end': row['edu_end'],
                        })
                    # end WIP
                doc_para_rows.append(row)

        # NB: these are leaky sentences wrt the original constituency
        # trees ; leaky sentences wrt the binarized constituency trees
        # might be different (TODO), similarly for the dependency trees
        # (TODO too) ;
        # I should count them, see if the ~5% Joty mentions are on the
        # original or binarized ctrees, and compare with the number of
        # leaky for deptrees ; I suspect the latter will be much lower...
        # HYPOTHESIS: (some or all?) leaky sentences in ctrees correspond
        # to cases where nodes that are not the head of their sentence
        # have dependents in other sentences
        # this would capture the set (or a subset) of edges that fall
        # outside of the search space for the "iheads" intra/inter
        # strategy

        # add doc entries to corpus entries
        para_rows.extend(doc_para_rows)
        sent_rows.extend(doc_sent_rows)
        rel_rows.extend(doc_span_rows)
        edu_rows.extend(doc_edu_rows)

    # turn list into a DataFrame
    node_df = pd.DataFrame(node_rows)
    rel_df = pd.DataFrame(rel_rows)
    edu_df = pd.DataFrame(edu_rows)
    sent_df = pd.DataFrame(sent_rows)
    para_df = pd.DataFrame(para_rows)
    # add calculated columns here? (leaky and complex sentences)

    return node_df, rel_df, edu_df, sent_df, para_df
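
A usage sketch for the loader above, assuming only the columns created in this function; it is illustrative and not part of the original module:

# Usage sketch (assumed): proportion of leaky sentences among complex
# (2+ EDU) sentences in the training section.
node_df, rel_df, edu_df, sent_df, para_df = load_corpus_as_dataframe_new(
    selection='train', binarize=False)
complex_sents = sent_df[sent_df['edu_len'] > 1]
print('leaky: {} / {} complex sentences ({:.1%})'.format(
    int(complex_sents['leaky'].sum()), len(complex_sents),
    complex_sents['leaky'].mean()))
# distribution over leaky types (0 = non-leaky, 1-4 as in van der Vliet et al. 2011)
print(complex_sents['leaky_type'].value_counts(dropna=False))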
Example 6
    # properly recast strip_accents if None
    strip_accents = (args.strip_accents if args.strip_accents != 'None'
                     else None)
    lowercase = args.lowercase
    stop_words = (args.stop_words if args.stop_words != 'None'
                  else None)
    outfile = args.outfile
    n_jobs = args.n_jobs
    verbose = args.verbose
    sel_pairs = args.pairs
    distance_range = (args.scale if args.scale != 'None'
                      else None)

    # * read the corpus
    rst_corpus_dir = RST_CORPUS['double']
    rst_reader = Reader(rst_corpus_dir)
    rst_corpus = rst_reader.slurp(verbose=True)
    corpus_texts = [v.text() for k, v in sorted(rst_corpus.items())]

    # MOVE ~ WMD.__init__()
    # load word embeddings
    vocab_dict, W = load_embedding("embed")
    # end MOVE

    # MOVE ~ WMD.fit(corpus_texts?)
    # fit CountVectorizer to the vocabulary of the corpus
    vect = CountVectorizer(
        strip_accents=strip_accents, lowercase=lowercase,
        stop_words=stop_words
    ).fit(corpus_texts)
    # compute the vocabulary common to the embeddings and corpus, restrict
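
A sketch of how the truncated vocabulary-restriction step could continue, assuming vocab_dict maps words to row indices of the embedding matrix W; this is illustrative, not the project's actual code:

    # Sketch (assumed): intersect the corpus vocabulary with the embedding
    # vocabulary and keep only the matching embedding rows.
    import numpy as np  # would normally live at module level
    common_words = sorted(set(vect.vocabulary_) & set(vocab_dict))
    W_common = np.vstack([W[vocab_dict[w]] for w in common_words])
    # re-fit the vectorizer on the common vocabulary so that document-term
    # counts stay aligned with the rows of W_common
    vect_common = CountVectorizer(vocabulary=common_words)
    X = vect_common.fit_transform(corpus_texts)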
Example 7
def load_corpus_as_dataframe_new(selection='train', binarize=False, verbose=0):
    """Load training section of the RST-WSJ corpus as a pandas.DataFrame.

    Parameters
    ----------
    selection : one of {'train', 'test'}  TODO: add 'both'
        Select the part of the corpus to load.

    binarize : boolean, default: False
        If True, apply right-heavy binarization on RST trees.

    Returns
    -------
    node_df: pandas.DataFrame
        DataFrame of all nodes from the constituency trees.
    rel_df: pandas.DataFrame
        DataFrame of all relations.
    edu_df: pandas.DataFrame
        DataFrame of all EDUs.
    sent_df: pandas.DataFrame
        DataFrame of all sentences.
    para_df: pandas.DataFrame
        DataFrame of all paragraphs.

    TODO
    ----
    [ ] intra-sentential-first right-heavy binarization
    [ ] left-heavy binarization (?)
    [ ] add selection='both'

    Notes
    -----
    `selection='both'` can currently be done as:
    ```
    train_df = load_corpus_as_dataframe_new(selection='train')
    test_df = load_corpus_as_dataframe_new(selection='test')
    both_df = train_df.append(test_df)
    ```
    """
    node_rows = []  # list of dicts, one dict per node
    rel_rows = []  # list of dicts, one dict per relation
    # edu_rows contains pre-EDUs rather than EDUs themselves, but maybe
    # conflating both does no harm
    edu_rows = []  # list of dicts, one dict per EDU
    sent_rows = []  # ibid
    para_rows = []  # ibid
    strad_rels_rows = []  # ibid; relations straddling a sentence boundary

    if selection == 'train':
        rst_reader = RstReader(CD_TRAIN)
    elif selection == 'test':
        rst_reader = RstReader(CD_TEST)
    else:
        raise ValueError('Unknown selection {}'.format(selection))

    rst_corpus = rst_reader.slurp()

    for doc_id, rtree_ref in sorted(rst_corpus.items()):
        doc_ctx = rtree_ref.label().context
        doc_text = doc_ctx.text()
        doc_edus = rtree_ref.leaves()

        # 0. collect EDUs
        doc_edu_rows = load_edus(doc_edus)

        # 1. collect spans (constituency nodes) from the gold RST trees
        #
        # transform the original RST tree: convert labels to their
        # coarse equivalent, binarize if required
        coarse_rtree_ref = REL_CONV(rtree_ref)
        if binarize:
            coarse_rtree_ref = _binarize(coarse_rtree_ref)
        # collect spans
        doc_span_rows = load_spans(coarse_rtree_ref)

        # prepare this info to find "leaky" substructures:
        # sentences and paragraphs
        # dict of EDU spans to constituent node from the RST tree
        rst_tree_node_spans = {(row['edu_start'], row['edu_end']):
                               row['treepos']
                               for row in doc_span_rows}
        # list of EDU spans of constituent nodes, sorted by length of span
        # then start
        rst_tree_node_spans_by_len = list(
            sorted(rst_tree_node_spans, key=lambda x: (x[1] - x[0], x[0])))

        # 2. Collect sentences
        doc_sent_rows = []
        # use dirty PTB tokenizer + parser
        # NB: the two following lines eat up 67% of total time
        doc_tkd_toks = tokenize_doc_ptb(doc_id, doc_text)
        doc_tkd_trees = parse_doc_ptb(doc_id, doc_tkd_toks)

        # sentence <-> EDU mapping and the information that depends on this
        # mapping might be more appropriate as a separate DataFrame
        # align EDUs with sentences
        edu2sent = align_edus_with_sentences(doc_edus,
                                             doc_tkd_trees,
                                             strict=False)
        # get the codomain of edu2sent
        # if we want to be strict, we can assert that the codomain is
        # a gapless interval
        # assert sent_idc == list(range(len(doc_tkd_trees)))
        # this assertion is currently known to fail on:
        # * RST-WSJ/TRAINING/wsj_0678.out: wrong sentence segmentation in PTB
        #     (1 sentence is split in 2)
        edu2sent_codom = set(
            [sent_idx for sent_idx in edu2sent if sent_idx is not None])

        # find the index of the first and last EDU of each sentence
        # indices in both lists are offset by 1 to map to real EDU
        # numbering (which is 1-based)
        sent_edu_starts = [
            (edu2sent.index(i) + 1 if i in edu2sent_codom else None)
            for i in range(len(doc_tkd_trees))
        ]
        sent_edu_ends = [(len(edu2sent) - 1 - edu2sent[::-1].index(i) +
                          1 if i in edu2sent_codom else None)
                         for i in range(len(doc_tkd_trees))]
        # sentences that don't have their own RST subtree are 'leaky'

        # WIP propagate sentence-EDU mapping to RST tree spans
        for row in doc_span_rows:
            # offset by -1 because edu2sent is 0-based
            row['sent_start'] = edu2sent[row['edu_start'] - 1]
            row['sent_end'] = edu2sent[row['edu_end'] - 1]
            # inferred columns
            # special cases because edu2sent is None for EDUs whose text
            # is missing from PTB
            if ((row['sent_start'] is not None
                 and row['sent_end'] is not None)):
                row['sent_len'] = row['sent_end'] - row['sent_start'] + 1
                row['intra_sent'] = (row['sent_start'] == row['sent_end'])
                row['strad_sent'] = (not row['intra_sent'] and
                                     (not row['edu_start'] in sent_edu_starts
                                      or not row['edu_end'] in sent_edu_ends))
            else:
                row['sent_len'] = None
                row['intra_sent'] = None
                row['strad_sent'] = None
        # end WIP propagate

        # end of sentence <-> EDU mapping et al.

        # iterate over syntactic trees as proxy for sentences
        for sent_idx, tkd_tree in enumerate(doc_tkd_trees):
            row = {
                # data directly from the sentence segmenter
                'sent_id': '{}_sent{}'.format(doc_id.doc, sent_idx),
                'char_start': tkd_tree.span.char_start,
                'char_end': tkd_tree.span.char_end,
            }
            # sentence <-> EDU mapping dependent data
            # should probably have its own dataframe
            # to better handle disagreement between sentence and EDU
            # segmentation, that translates in the following entries
            # as None for missing data
            if sent_idx in edu2sent_codom:
                row.update({
                    'edu_start':
                    sent_edu_starts[sent_idx],
                    'edu_end':
                    sent_edu_ends[sent_idx],
                    # computed column
                    'edu_len':
                    (sent_edu_ends[sent_idx] - sent_edu_starts[sent_idx]) + 1,
                })

                # use alignment: sentence <-> RST tree spans (via EDUs)
                # leaky sentences: complex (2+ EDUs) sentences that don't
                # have a corresponding span in the RST tree
                leaky = (row['edu_len'] > 1 and
                         ((sent_edu_starts[sent_idx], sent_edu_ends[sent_idx])
                          not in rst_tree_node_spans))
                row.update({
                    'leaky': leaky,
                })
                # find for each leaky sentence the smallest RST subtree
                # that covers it
                if row['leaky']:
                    sent_edu_first = sent_edu_starts[sent_idx]
                    sent_edu_last = sent_edu_ends[sent_idx]
                    # find parent span and straddling spans ;
                    # straddling spans exist for type 3 and 4
                    strad_spans = []
                    for edu_span in rst_tree_node_spans_by_len:
                        # parent span
                        if ((edu_span[0] <= sent_edu_first
                             and sent_edu_last <= edu_span[1])):
                            parent_span = edu_span
                            break
                        # straddling spans
                        if ((edu_span[0] < sent_edu_first
                             and sent_edu_first <= edu_span[1])
                                or (edu_span[0] <= sent_edu_last
                                    and sent_edu_last < edu_span[1])):
                            strad_spans.append(edu_span)
                    else:
                        raise ValueError(
                            'No minimal spanning node for {}'.format(row))
                    # leaky types {1, 2} vs {3, 4}:
                    # for types 1 and 2, members of the parent
                    # constituent are "pure" wrt sentence span:
                    # each member is either fully inside or fully
                    # outside the sentence ;
                    # no member straddles either of the sentence
                    # boundaries
                    leaky_type_12 = not strad_spans
                    # DEBUG
                    if verbose:
                        print(doc_id.doc)
                        print(parent_span, strad_spans if strad_spans else '')
                    # end DEBUG

                    # leaky types {1, 3} vs {2, 4}
                    # {1, 3} have at least one coordinative (aka multinuclear)
                    # relation in the chain of spans between the parent span
                    # and the EDU(s) of the leaky sentence ;
                    # {2, 4} have only subordinative (aka mononuclear)
                    # relations
                    leaky_coord = False
                    # first, check the kids of the parent span
                    parent_tpos = rst_tree_node_spans[parent_span]
                    parent_subtree = coarse_rtree_ref[parent_tpos]
                    if all(kid.label().nuclearity == 'Nucleus'
                           for kid in parent_subtree):
                        leaky_coord = True

                    # then, check the kids of all straddling spans
                    strad_rels = []  # TEMPORARY
                    for strad_span in strad_spans:
                        strad_tpos = rst_tree_node_spans[strad_span]
                        strad_subtree = coarse_rtree_ref[strad_tpos]
                        if all(kid.label().nuclearity == 'Nucleus'
                               for kid in strad_subtree):
                            leaky_coord = True
                        # TEMPORARY: store straddling relations (from kids)
                        kid_rels = [
                            kid.label().rel for kid in strad_subtree
                            if kid.label().rel != 'span'
                        ]
                        # if all kids bear the same relation label, store
                        # only this value
                        if len(set(kid_rels)) == 1:
                            kid_rels = kid_rels[0]
                        else:
                            kid_rels = '+'.join(kid_rels)
                        strad_rels.append(kid_rels)
                        # WIP list of straddling relations
                        strad_rels_rows.append({
                            'node_id':
                            '{}_const{}'.format(
                                strad_subtree.origin.doc,
                                '-'.join(str(x) for x in strad_tpos)),
                            'sent_id':
                            '{}_sent{}'.format(doc_id.doc, sent_idx),
                            'kid_rels':
                            kid_rels,
                        })
                        # end WIP running counter
                    # determine type of leaky (ugly)
                    if leaky_type_12:
                        if leaky_coord:
                            leaky_type = 1
                        else:
                            leaky_type = 2
                    else:
                        if leaky_coord:
                            leaky_type = 3
                        else:
                            leaky_type = 4
                    # display type of leaky
                    if verbose:
                        print('Type {} ({}-level {} structure)\t{}'.format(
                            leaky_type, 'Same' if leaky_type_12 else 'Multi',
                            'coordination' if leaky_coord else 'subordination',
                            '; '.join(strad_rels)))
                        print()
                    # end WIP nuclearity of straddling spans

                    # add info to row
                    row.update({
                        # parent span, in EDUs
                        'parent_edu_start':
                        parent_span[0],
                        'parent_edu_end':
                        parent_span[1],
                        # length of parent span, in sentences
                        'parent_sent_len': (edu2sent[parent_span[1] - 1] -
                                            edu2sent[parent_span[0] - 1] + 1),
                        # distance between the current sentence and the most
                        # remote sentence covered by the parent span,
                        # in sentences
                        'parent_sent_dist':
                        (max([(edu2sent[parent_span[1] - 1] - sent_idx),
                              (sent_idx - edu2sent[parent_span[0] - 1])])),
                        # types of leaky, in the taxonomy of
                        # (van der Vliet et al. 2011)
                        'leaky_type':
                        leaky_type,
                    })
                else:
                    row.update({
                        'parent_span_start': row['edu_start'],
                        'parent_span_end': row['edu_end'],
                        # default value for leaky_type; provides a means for
                        # easy comparison, on complex sentences, between
                        # non-leaky and the various types of leaky
                        'leaky_type': 0,
                    })
                # end WIP
            doc_sent_rows.append(row)

        # 3. collect paragraphs
        doc_para_rows = []
        doc_paras = doc_ctx.paragraphs
        doc_text = doc_ctx.text()
        # doc_paras is None when the original text has no explicit marking
        # for paragraphs ; this is true for 'fileX' documents in the RST-WSJ
        # corpus
        if doc_paras is not None:
            # EDU to paragraph mapping
            edu2para = align_edus_with_paragraphs(doc_edus,
                                                  doc_paras,
                                                  doc_text,
                                                  strict=False)
            edu2para_codom = set(
                [para_idx for para_idx in edu2para if para_idx is not None])
            # index of the first and last EDU of each paragraph
            para_edu_starts = [
                (edu2para.index(i) + 1 if i in edu2para_codom else None)
                for i in range(len(doc_paras))
            ]
            para_edu_ends = [(len(edu2para) - 1 - edu2para[::-1].index(i) +
                              1 if i in edu2para_codom else None)
                             for i in range(len(doc_paras))]
            # paragraphs that don't have their own RST subtree are "leaky" ;
            # end of paragraph <-> EDU mapping et al.

            # iterate over paragraphs
            for para_idx, para in enumerate(doc_paras):
                # dirty, educe.rst_dt.text.Paragraph should have a span
                para_span = Span(para.sentences[0].span.char_start,
                                 para.sentences[-1].span.char_end)
                # end dirty
                row = {
                    # data directly from the paragraph segmenter
                    'para_id': '{}_para{}'.format(doc_id.doc, para_idx),
                    'char_start': para_span.char_start,
                    'char_end': para_span.char_end,
                }
                # paragraph <-> EDU mapping dependent data
                # should probably have its own dataframe etc.
                if para_idx in edu2para_codom:
                    row.update({
                        'edu_start':
                        para_edu_starts[para_idx],
                        'edu_end':
                        para_edu_ends[para_idx],
                        # computed column
                        'edu_len':
                        (para_edu_ends[para_idx] - para_edu_starts[para_idx]) +
                        1,
                    })
                    # use paragraph <-> RST tree alignment
                    if row['edu_len'] > 1:  # complex paragraphs only
                        row.update({
                            'leaky': ((para_edu_starts[para_idx],
                                       para_edu_ends[para_idx])
                                      not in rst_tree_node_spans),
                        })
                    else:
                        row.update({'leaky': False})
                    # WIP find for each leaky paragraph the smallest RST
                    # subtree that covers it
                    if row['leaky']:
                        for edu_span in rst_tree_node_spans_by_len:
                            if ((edu_span[0] <= para_edu_starts[para_idx]
                                 and para_edu_ends[para_idx] <= edu_span[1])):
                                parent_span = edu_span
                                break
                        else:
                            raise ValueError(
                                'No minimal spanning node for {}'.format(row))
                        # add info to row
                        row.update({
                            # parent span, on EDUs
                            'parent_edu_start': parent_span[0],
                            'parent_edu_end': parent_span[1]
                        })
                        # length of parent span, in paragraphs
                        if ((edu2para[parent_span[1] - 1] is not None
                             and edu2para[parent_span[0] - 1] is not None)):
                            row.update({
                                'parent_para_len':
                                (edu2para[parent_span[1] - 1] -
                                 edu2para[parent_span[0] - 1] + 1),
                                # distance between the current paragraph and
                                # the most remote paragraph covered by the
                                # parent span, in paragraphs
                                'parent_para_dist': (max([
                                    (edu2para[parent_span[1] - 1] - para_idx),
                                    (para_idx - edu2para[parent_span[0] - 1])
                                ])),
                            })
                    else:
                        row.update({
                            'parent_edu_start': row['edu_start'],
                            'parent_edu_end': row['edu_end'],
                        })
                    # end WIP
                doc_para_rows.append(row)

        # NB: these are leaky sentences wrt the original constituency
        # trees ; leaky sentences wrt the binarized constituency trees
        # might be different (TODO), similarly for the dependency trees
        # (TODO too) ;
        # I should count them, see if the ~5% Joty mentions are on the
        # original or binarized ctrees, and compare with the number of
        # leaky for deptrees ; I suspect the latter will be much lower...
        # HYPOTHESIS: (some or all?) leaky sentences in ctrees correspond
        # to cases where nodes that are not the head of their sentence
        # have dependents in other sentences
        # this would capture the set (or a subset) of edges that fall
        # outside of the search space for the "iheads" intra/inter
        # strategy

        # add doc entries to corpus entries
        para_rows.extend(doc_para_rows)
        sent_rows.extend(doc_sent_rows)
        rel_rows.extend(doc_span_rows)
        edu_rows.extend(doc_edu_rows)

    # turn list into a DataFrame
    node_df = pd.DataFrame(node_rows)
    rel_df = pd.DataFrame(rel_rows)
    edu_df = pd.DataFrame(edu_rows)
    sent_df = pd.DataFrame(sent_rows)
    para_df = pd.DataFrame(para_rows)
    # add calculated columns here? (leaky and complex sentences)

    return node_df, rel_df, edu_df, sent_df, para_df
Example 9
    if not os.path.exists(PTB_DIR):
        raise ValueError("Unable to find PTB dir {}".format(PTB_DIR))
    if not os.path.exists(RST_DIR):
        raise ValueError("Unable to find RST dir {}".format(RST_DIR))
    if not os.path.exists(CORENLP_OUT_DIR):
        raise ValueError(
            "Unable to find parsed dir {}".format(CORENLP_OUT_DIR))

    corpus = 'RSTtrees-WSJ-main-1.0/TRAINING'
    corpus_dir = os.path.join(RST_DIR, corpus)
    # syntactic parsers to compare
    ptb_reader = BracketParseCorpusReader(PTB_DIR,
                                          r'../wsj_.*\.mrg',
                                          encoding='ascii')
    # read the RST corpus
    rst_reader = Reader(corpus_dir)
    rst_corpus = rst_reader.slurp()
    # for each file, compare tokenizations between PTB and CoreNLP
    for key, rst_tree in sorted(rst_corpus.items()):
        doc_name = key.doc.split('.', 1)[0]
        if doc_name.startswith('wsj_'):
            print(doc_name)
            doc_wsj_num = doc_name.split('_')[1]
            section = doc_wsj_num[:2]

            # corenlp stuff
            core_fname = os.path.join(CORENLP_OUT_DIR, corpus,
                                      doc_name + '.out.xml')
            core_reader = PreprocessingSource()
            core_reader.read(core_fname, suffix='')
            corenlp_doc = read_corenlp_result(None, core_reader)