Beispiel #1
0
def convert(corpus, multinuclear, odir):
    """
    Round-trip every RST tree of the corpus through a dependency tree,
    dumping each intermediate representation to disk.

    For each document, three files named after the document (its
    `doc` attribute minus the extension) are written:

    * ``<odir>/rst-binarised``: the SimpleRSTTree built from the
      original RST tree,
    * ``<odir>/rst-to-dt``: the RstDepTree built from that simple tree,
    * ``<odir>/dt-to-rst``: the SimpleRSTTree rebuilt from the
      dependency tree.

    Parameters
    ----------
    corpus : mapping
        Maps keys exposing a `doc` attribute to RST trees.

    multinuclear :
        Accepted for interface compatibility; not used in this body.

    odir : str
        Output directory; the three subdirectories are created under
        it when they do not exist yet.
    """
    bin_dir, dt_dir, rst2_dir = [
        os.path.join(odir, name)
        for name in ("rst-binarised", "rst-to-dt", "dt-to-rst")
    ]
    for out_dir in (bin_dir, dt_dir, rst2_dir):
        if os.path.exists(out_dir):
            continue
        os.makedirs(out_dir)

    def dump(out_dir, basename, tree):
        """Write the string form of `tree` to `out_dir/basename`."""
        with open(os.path.join(out_dir, basename), 'w') as stream:
            stream.write(str(tree))

    for key in corpus:
        basename = os.path.splitext(key.doc)[0]

        # each stage is dumped as soon as it is built, so that a failure
        # in a later stage still leaves the earlier outputs on disk
        simple_tree = SimpleRSTTree.from_rst_tree(corpus[key])
        dump(bin_dir, basename, simple_tree)

        dep_tree = RstDepTree.from_simple_rst_tree(simple_tree)
        dump(dt_dir, basename, dep_tree)

        rebuilt_tree = deptree_to_simple_rst_tree(dep_tree)
        dump(rst2_dir, basename, rebuilt_tree)
Beispiel #2
0
def convert(corpus, multinuclear, odir):
    """
    Convert each RST tree of the corpus to a dependency tree and back,
    saving every intermediate representation under `odir`.

    Three subdirectories of `odir` are used (and created if missing):
    ``rst-binarised`` for the SimpleRSTTree, ``rst-to-dt`` for the
    RstDepTree and ``dt-to-rst`` for the SimpleRSTTree rebuilt from the
    dependency tree. Output files are named after the `doc` attribute
    of the corpus key, stripped of its extension.

    Parameters
    ----------
    corpus : mapping
        Maps keys exposing a `doc` attribute to RST trees.

    multinuclear :
        Accepted for interface compatibility; not used in this body.

    odir : str
        Root output directory.
    """
    subdir_names = ("rst-binarised", "rst-to-dt", "dt-to-rst")
    bin_dir, dt_dir, rst2_dir = [
        os.path.join(odir, name) for name in subdir_names
    ]
    for target_dir in (bin_dir, dt_dir, rst2_dir):
        if os.path.exists(target_dir):
            continue
        os.makedirs(target_dir)

    def save(target_dir, doc_name, tree):
        """Write the string form of `tree` to `target_dir/doc_name`."""
        with open(os.path.join(target_dir, doc_name), 'w') as handle:
            handle.write(str(tree))

    for doc_key in corpus:
        doc_name = os.path.splitext(doc_key.doc)[0]

        # build and save one stage at a time: outputs of the earlier
        # stages survive a failure in a later one
        binarised = SimpleRSTTree.from_rst_tree(corpus[doc_key])
        save(bin_dir, doc_name, binarised)

        as_deptree = RstDepTree.from_simple_rst_tree(binarised)
        save(dt_dir, doc_name, as_deptree)

        back_to_rst = deptree_to_simple_rst_tree(as_deptree)
        save(rst2_dir, doc_name, back_to_rst)
Beispiel #3
0
    def walk(ancestor, subtree):
        """
        Recursive driver of the conversion: fold the dependency subtree
        headed by `subtree` into a SimpleRSTTree, then attach that tree
        to `ancestor`.

        Three layers of the dependency tree are in play at once:

                     r0       r1
            ancestor --> src +--> tgt1
                             |
                             |r2
                             +--> tgt2
                             |
                             ..
                             |
                             |rN
                             +--> tgtN

        When src has no dependents, it is just a leaf that gets
        connected to its ancestor. Otherwise the dependents are folded
        one after the other into src, each call wrapping the tree built
        so far, and only the resulting tree is connected to the
        ancestor.

        Parameters
        ----------
        ancestor : SimpleRSTTree
            SimpleRSTTree of the ancestor; falsy (None) on the initial
            call, in which case the tree for src is returned as is.

        subtree : int
            Index of the head of the subtree

        Returns
        -------
        res : SimpleRSTTree
            SimpleRSTTree covering ancestor and subtree.

        Raises
        ------
        RstDtException
            If the text spans of ancestor and src overlap.
        """
        # leaf tree for the head EDU of this subtree
        head_edu = dtree.edus[subtree]
        leaf_label = Node("leaf", (head_edu.num, head_edu.num),
                          head_edu.text_span(), "leaf")
        src = SimpleRSTTree(leaf_label, [head_edu])

        # fold (not map) over the dependents: the tree built so far is
        # threaded from one sibling to the next
        for child in dtree.deps(subtree):
            src = walk(src, child)

        if not ancestor:
            # initial call: nothing to attach to, `subtree` is the index
            # of the (presumably unique) real root
            return src

        # attach src to its ancestor
        anc_node = treenode(ancestor)
        src_node = treenode(src)
        rel = dtree.labels[subtree]
        src_nuc = dtree.nucs[subtree]

        if anc_node.span.overlaps(src_node.span):
            raise RstDtException("Span %s overlaps with %s " %
                                 (anc_node.span, src_node.span))

        # order the two kids by text span; the ancestor side always
        # gets NUC_N
        if anc_node.span <= src_node.span:
            kids = [ancestor, src]
            kid_nucs = [NUC_N, src_nuc]
        else:
            kids = [src, ancestor]
            kid_nucs = [src_nuc, NUC_N]
        # nuclearity of a SimpleRSTTree node is the concatenation of the
        # initial letter of each kid's nuclearity, eg. {NS, SN, NN}
        nuc = ''.join(kid_nuc[0] for kid_nuc in kid_nucs)

        # EDU span of the new parent: union of the kids' EDU spans
        left_edus = treenode(kids[0]).edu_span
        right_edus = treenode(kids[1]).edu_span
        first_edu = min(left_edus[0], right_edus[0])
        last_edu = max(left_edus[1], right_edus[1])
        txt_span = anc_node.span.merge(src_node.span)
        return SimpleRSTTree(
            Node(nuc, (first_edu, last_edu), txt_span, rel),
            kids)
Beispiel #4
0
def get_oracle_ctrees(dep_edges,
                      att_edus,
                      nuc_strategy="unamb_else_most_frequent",
                      rank_strategy="closest-intra-rl-inter-rl",
                      prioritize_same_unit=True,
                      strict=False):
    """Build the oracle constituency tree(s) for a dependency tree.

    Parameters
    ----------
    dep_edges: iterable of (string, string, string)
        Dependency edges of one document, as triples
        (source id, target id, label) ; the source id 'ROOT' marks the
        target as a root of the dependency tree.
    att_edus: cf return type of attelo.io.load_edus
        EDUs as they are known to attelo ; the attributes `id`,
        `grouping`, `subgrouping`, `start`, `end` and `text` are read.
        They must all belong to the same document (grouping).
    nuc_strategy: string
        Strategy passed to the DummyNuclearityClassifier.
    rank_strategy: string
        Strategy passed to the InsideOutAttachmentRanker.
    prioritize_same_unit: boolean, True by default
        Passed as is to the InsideOutAttachmentRanker.
    strict: boolean, False by default
        If True, any link from ROOT to an EDU whose label is neither
        'ROOT' nor UNKNOWN raises an exception, otherwise a warning is
        printed and the EDU is set as a root all the same.

    Returns
    -------
    ctrees: list of RstTree
        There can be several e.g. for leaky sentences.

    Raises
    ------
    ValueError
        If `strict` and an edge from ROOT carries an unexpected label.
    RstDtException
        Re-raised (after being printed) if the dependency tree cannot
        be converted to constituency trees.
    """
    # rebuild educe EDUs from their attelo description
    # and group them by doc_name
    educe_edus = defaultdict(list)
    edu2sent_idx = defaultdict(dict)
    gid2num = dict()
    for att_edu in att_edus:
        # doc name
        doc_name = att_edu.grouping
        # EDU info
        # skip ROOT (automatically added by RstDepTree.__init__)
        if att_edu.id == 'ROOT':
            continue
        edu_num = int(att_edu.id.rsplit('_', 1)[1])
        edu_span = EduceSpan(att_edu.start, att_edu.end)
        edu_text = att_edu.text
        educe_edus[doc_name].append(EduceEDU(edu_num, edu_span, edu_text))
        # map global id of EDU to num of EDU inside doc
        gid2num[att_edu.id] = edu_num
        # map EDU to sentence
        try:
            sent_idx = int(att_edu.subgrouping.split('_sent')[1])
        except IndexError:
            # this EDU could not be attached to any sentence (ex: missing
            # text in the PTB), so a default subgrouping identifier was used ;
            # we aim for consistency with educe and map these to "None"
            sent_idx = None
        edu2sent_idx[doc_name][edu_num] = sent_idx
    # check that our info covers only one document
    assert len(educe_edus) == 1
    # then restrict to this document ; dict views cannot be indexed in
    # python 3, so fetch the unique key through an iterator
    doc_name = next(iter(educe_edus))
    educe_edus = educe_edus[doc_name]
    edu2sent_idx = edu2sent_idx[doc_name]
    # sort EDUs by num
    educe_edus = sorted(educe_edus, key=lambda e: e.num)
    # rebuild educe-style edu2sent ; prepend 0 for the fake root
    edu2sent = [0] + [edu2sent_idx[e.num] for e in educe_edus]
    # classifiers for nuclearity and ranking
    # FIXME declare, fit and predict upstream...
    # nuclearity
    nuc_classifier = DummyNuclearityClassifier(strategy=nuc_strategy)
    nuc_classifier.fit([], [])  # empty X and y for dummy fit
    # ranking classifier
    rank_classifier = InsideOutAttachmentRanker(
        strategy=rank_strategy, prioritize_same_unit=prioritize_same_unit)

    # rebuild RstDepTrees
    dtree = RstDepTree(educe_edus)
    for src_id, tgt_id, lbl in dep_edges:
        if src_id == 'ROOT':
            if lbl not in ['ROOT', UNKNOWN]:
                err_msg = 'weird root label: {} {} {}'.format(
                    src_id, tgt_id, lbl)
                if strict:
                    raise ValueError(err_msg)
                else:
                    print('W: {}, using ROOT instead'.format(err_msg))
            dtree.set_root(gid2num[tgt_id])
        else:
            dtree.add_dependency(gid2num[src_id], gid2num[tgt_id], lbl)
    # add nuclearity: heuristic baseline
    dtree.nucs = nuc_classifier.predict([dtree])[0]
    # add rank: some strategies require a mapping from EDU to sentence
    # EXPERIMENTAL attach array of sentence index for each EDU in tree
    dtree.sent_idx = edu2sent
    # end EXPERIMENTAL
    dtree.ranks = rank_classifier.predict([dtree])[0]

    # create pred ctree
    try:
        bin_srtrees = deptree_to_simple_rst_tree(dtree, allow_forest=True)
        if False:  # EXPERIMENTAL
            # currently False to run on output that already has
            # labels embedding nuclearity
            bin_srtrees = [
                SimpleRSTTree.incorporate_nuclearity_into_label(bin_srtree)
                for bin_srtree in bin_srtrees
            ]
        bin_rtrees = [
            SimpleRSTTree.to_binary_rst_tree(bin_srtree)
            for bin_srtree in bin_srtrees
        ]
    except RstDtException as rst_e:
        print(rst_e)
        if False:  # debugging aid, disabled
            # educe_edus is by now the sorted list of EDUs of the
            # document, no longer a dict indexed by doc name
            print('\n'.join('{}: {}'.format(edu.text_span(), edu)
                            for edu in educe_edus))
        raise
    ctrees = bin_rtrees

    return ctrees
Beispiel #5
0
def get_oracle_ctrees(dep_edges, att_edus,
                      nuc_strategy="unamb_else_most_frequent",
                      rank_strategy="closest-intra-rl-inter-rl",
                      prioritize_same_unit=True,
                      strict=False):
    """Build the oracle constituency tree(s) for a dependency tree.

    Parameters
    ----------
    dep_edges: iterable of (string, string, string)
        Dependency edges of one document, as triples
        (source id, target id, label) ; the source id 'ROOT' marks the
        target as a root of the dependency tree.
    att_edus: cf return type of attelo.io.load_edus
        EDUs as they are known to attelo ; the attributes `id`,
        `grouping`, `subgrouping`, `start`, `end` and `text` are read.
        They must all belong to the same document (grouping).
    nuc_strategy: string
        Strategy passed to the DummyNuclearityClassifier.
    rank_strategy: string
        Strategy passed to the InsideOutAttachmentRanker.
    prioritize_same_unit: boolean, True by default
        Passed as is to the InsideOutAttachmentRanker.
    strict: boolean, False by default
        If True, any link from ROOT to an EDU whose label is neither
        'ROOT' nor UNKNOWN raises an exception, otherwise a warning is
        printed and the EDU is set as a root all the same.

    Returns
    -------
    ctrees: list of RstTree
        There can be several e.g. for leaky sentences.

    Raises
    ------
    ValueError
        If `strict` and an edge from ROOT carries an unexpected label.
    RstDtException
        Re-raised (after being printed) if the dependency tree cannot
        be converted to constituency trees.
    """
    # rebuild educe EDUs from their attelo description
    # and group them by doc_name
    educe_edus = defaultdict(list)
    edu2sent_idx = defaultdict(dict)
    gid2num = dict()
    for att_edu in att_edus:
        # doc name
        doc_name = att_edu.grouping
        # EDU info
        # skip ROOT (automatically added by RstDepTree.__init__)
        if att_edu.id == 'ROOT':
            continue
        edu_num = int(att_edu.id.rsplit('_', 1)[1])
        edu_span = EduceSpan(att_edu.start, att_edu.end)
        edu_text = att_edu.text
        educe_edus[doc_name].append(EduceEDU(edu_num, edu_span, edu_text))
        # map global id of EDU to num of EDU inside doc
        gid2num[att_edu.id] = edu_num
        # map EDU to sentence
        try:
            sent_idx = int(att_edu.subgrouping.split('_sent')[1])
        except IndexError:
            # this EDU could not be attached to any sentence (ex: missing
            # text in the PTB), so a default subgrouping identifier was used ;
            # we aim for consistency with educe and map these to "None"
            sent_idx = None
        edu2sent_idx[doc_name][edu_num] = sent_idx
    # check that our info covers only one document
    assert len(educe_edus) == 1
    # then restrict to this document ; dict views cannot be indexed in
    # python 3, so fetch the unique key through an iterator
    doc_name = next(iter(educe_edus))
    educe_edus = educe_edus[doc_name]
    edu2sent_idx = edu2sent_idx[doc_name]
    # sort EDUs by num
    educe_edus = sorted(educe_edus, key=lambda e: e.num)
    # rebuild educe-style edu2sent ; prepend 0 for the fake root
    edu2sent = [0] + [edu2sent_idx[e.num] for e in educe_edus]
    # classifiers for nuclearity and ranking
    # FIXME declare, fit and predict upstream...
    # nuclearity
    nuc_classifier = DummyNuclearityClassifier(strategy=nuc_strategy)
    nuc_classifier.fit([], [])  # empty X and y for dummy fit
    # ranking classifier
    rank_classifier = InsideOutAttachmentRanker(
        strategy=rank_strategy,
        prioritize_same_unit=prioritize_same_unit)

    # rebuild RstDepTrees
    dtree = RstDepTree(educe_edus)
    for src_id, tgt_id, lbl in dep_edges:
        if src_id == 'ROOT':
            if lbl not in ['ROOT', UNKNOWN]:
                err_msg = 'weird root label: {} {} {}'.format(
                    src_id, tgt_id, lbl)
                if strict:
                    raise ValueError(err_msg)
                else:
                    print('W: {}, using ROOT instead'.format(err_msg))
            dtree.set_root(gid2num[tgt_id])
        else:
            dtree.add_dependency(gid2num[src_id], gid2num[tgt_id], lbl)
    # add nuclearity: heuristic baseline
    dtree.nucs = nuc_classifier.predict([dtree])[0]
    # add rank: some strategies require a mapping from EDU to sentence
    # EXPERIMENTAL attach array of sentence index for each EDU in tree
    dtree.sent_idx = edu2sent
    # end EXPERIMENTAL
    dtree.ranks = rank_classifier.predict([dtree])[0]

    # create pred ctree
    try:
        bin_srtrees = deptree_to_simple_rst_tree(dtree, allow_forest=True)
        if False:  # EXPERIMENTAL
            # currently False to run on output that already has
            # labels embedding nuclearity
            bin_srtrees = [SimpleRSTTree.incorporate_nuclearity_into_label(
                bin_srtree) for bin_srtree in bin_srtrees]
        bin_rtrees = [SimpleRSTTree.to_binary_rst_tree(bin_srtree)
                      for bin_srtree in bin_srtrees]
    except RstDtException as rst_e:
        print(rst_e)
        if False:  # debugging aid, disabled
            # educe_edus is by now the sorted list of EDUs of the
            # document, no longer a dict indexed by doc name
            print('\n'.join('{}: {}'.format(edu.text_span(), edu)
                            for edu in educe_edus))
        raise
    ctrees = bin_rtrees

    return ctrees
Beispiel #6
0
        'corpus': os.path.relpath(rst_corpus_dir, start=DATA_DIR),
        'strip_accents': strip_accents,
        'lowercase': lowercase,
        'stop_words': stop_words,
        'n_jobs': n_jobs,
        'verbose': verbose,
    }
    print('# parameters: ({})'.format(params),
          file=outfile)

    # do the real job
    corpus_items = sorted(rst_corpus.items())
    doc_keys = [key.doc for key, doc in corpus_items]
    doc_key_dtrees = [
        (doc_key.doc,
         RstDepTree.from_simple_rst_tree(SimpleRSTTree.from_rst_tree(doc)))
        for doc_key, doc in corpus_items
    ]
    edu_txts = list(e.text().replace('\n', ' ')
                    for doc_key, dtree in doc_key_dtrees
                    for e in dtree.edus)
    # vectorize each EDU using its text
    edu_vecs = vect.transform(edu_txts)
    # normalize each row of the count matrix using the l1 norm
    # (copy=False to perform in place)
    edu_vecs = normalize(edu_vecs, norm='l1', copy=False)
    # get all pairs of EDUs of interest, here as triples
    # (gov_idx, dep_idx, lbl)
    # TODO maybe sort edu pairs so that dependents with
    # the same governor are grouped (potential speed up?)
    edu_pairs = [
Beispiel #7
0
        'corpus': os.path.relpath(rst_corpus_dir, start=DATA_DIR),
        'strip_accents': strip_accents,
        'lowercase': lowercase,
        'stop_words': stop_words,
        'n_jobs': n_jobs,
        'verbose': verbose,
    }
    print('# parameters: ({})'.format(params),
          file=outfile)

    # do the real job
    corpus_items = sorted(rst_corpus.items())
    doc_keys = [key.doc for key, doc in corpus_items]
    doc_key_dtrees = [
        (doc_key.doc,
         RstDepTree.from_simple_rst_tree(SimpleRSTTree.from_rst_tree(doc)))
        for doc_key, doc in corpus_items
    ]
    edu_txts = list(e.text().replace('\n', ' ')
                    for doc_key, dtree in doc_key_dtrees
                    for e in dtree.edus)
    # vectorize each EDU using its text
    edu_vecs = vect.transform(edu_txts)
    # normalize each row of the count matrix using the l1 norm
    # (copy=False to perform in place)
    edu_vecs = normalize(edu_vecs, norm='l1', copy=False)
    # get all pairs of EDUs of interest, here as triples
    # (gov_idx, dep_idx, lbl)
    # TODO maybe sort edu pairs so that dependents with
    # the same governor are grouped (potential speed up?)
    edu_pairs = [