Example #1
def read_tags(corpus, root_dir):
    """
    Read stored POS tagger output from a directory and convert it to
    educe.annotation.Standoff objects.

    Return a dictionary mapping 'FileId's to lists of tokens.
    """
    pos_tags = {}
    for k in corpus:
        doc = corpus[k]
        turns = sorted_by_span(x for x in doc.units if stac.is_turn(x))

        tagged_file = tagger_file_name(k, root_dir)
        raw_toks = ext.read_token_file(tagged_file)
        pos_tags[k] = []
        for turn, seg in zip(turns, raw_toks):
            prefix, body = stac.split_turn_text(doc.text(turn.text_span()))
            start = turn.span.char_start + len(prefix)
            toks = ext.token_spans(body, seg, start)
            for t in toks:
                t.origin = doc
                dtxt = doc.text(t.text_span())
                assert dtxt == t.word
            pos_tags[k].extend(toks)
    return pos_tags
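Every example on this page relies on the same contract: stac.split_turn_text takes the raw text of a turn and returns a (prefix, body) pair, where the prefix carries the turn number and speaker and the body carries the actual utterance. The offset arithmetic above (start = turn.span.char_start + len(prefix)) only works because the prefix length is counted in the same character coordinates as the turn span. The sketch below is not educe code; the turn-text format is an assumption inferred from the surrounding examples.

# Minimal sketch (not from educe) of the (prefix, body) contract the examples
# rely on.  The turn-text format "<turn id> : <speaker> : <body>" is an
# assumption inferred from the surrounding code.
def split_turn_text_sketch(text):
    """Split a turn's raw text into (prefix, body) at the second ':'."""
    first = text.index(':')
    second = text.index(':', first + 1)
    return text[:second + 1], text[second + 1:]

prefix, body = split_turn_text_sketch("163 : sabercat : anyone got wood?")
assert prefix == "163 : sabercat :"
assert body == " anyone got wood?"
# Tokens tagged over `body` must therefore be shifted right by
# turn.span.char_start + len(prefix) to land on document offsets.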
Example #2
    def tgt_html(grandparent, anno, naughty=False):
        """
        Describe the given annotation in HTML and append that
        description to the given HTML grandparent node.
        """
        parent = h.span(grandparent)
        h.span(parent, anno_code(anno))
        type_span = h.span(parent, '[%s] ' % anno.type)
        if naughty:
            type_span.attrib['class'] = 'naughty'

        if anno in contexts:
            turn = contexts[anno].turn
            turn_info = stac.split_turn_text(doc.text(turn.span))[0]
            turn_splits = turn_info.split(":")
            if len(turn_splits) > 1:
                tid = ET.SubElement(parent, 'b')
                tid.text = turn_splits[0] + ":"
                h.span(parent, ":".join(turn_splits[1:]))
            else:
                h.span(parent, turn_info)

        if not stac.is_relation_instance(anno):
            t_text = text(anno)
            if stac.is_cdu(anno):
                trange = turn_range(anno)
                if trange:
                    h.elem(parent, 'b', trange)
            h.span(parent,
                   text=snippet(t_text, 100),
                   attrib={'class': 'snippet'})
            h.span(parent, ' %s' % anno.text_span())
        return parent
Example #3
def read_tags(corpus, dir):
    """
    Read stored POS tagger output from a directory and convert it to
    educe.annotation.Standoff objects.

    Return a dictionary mapping 'FileId's to lists of tokens.
    """
    pos_tags = {}
    for k in corpus:
        doc   = corpus[k]
        turns = sorted_by_span(filter(stac.is_turn, doc.units))

        tagged_file = tagger_file_name(k, dir)
        raw_toks    = ext.read_token_file(tagged_file)
        pos_tags[k] = []
        for turn, seg in zip(turns, raw_toks):
            prefix, body = stac.split_turn_text(doc.text_for(turn))
            start        = turn.span.char_start + len(prefix)
            toks = ext.token_spans(body, seg, start)
            for t in toks:
                t.origin = doc
                dtxt = doc.text_for(t)
                assert dtxt == t.word
            pos_tags[k].extend(toks)
    return pos_tags
Example #4
def is_disconnected(gra, contexts, node):
    """
    An EDU is considered disconnected unless:

    * it has an incoming link or
    * it has an outgoing Conditional link or
    * it's at the beginning of a dialogue

    In principle we don't need to look at EDUs that are disconnected
    on the outgoing end because (1) it can be legitimate for
    non-dialogue-ending EDUs to not have outgoing links and (2) such
    information would be redundant with the incoming anyway.
    """
    def rel_type(rel):
        "relation type for a given link (string)"
        return gra.annotation(gra.mirror(rel)).type

    edu = gra.annotation(node)
    if edu not in contexts:
        return True
    else:
        ctx = contexts[edu]
        first_turn_span = ctx.dialogue_turns[0].text_span()
        first_turn_text = gra.doc.text(first_turn_span)
        first_turn_pref = stac.split_turn_text(first_turn_text)[0]
        first_turn_start = first_turn_span.char_start + len(first_turn_pref)
        rel_links = [x for x in gra.links(node) if gra.is_relation(x)]
        has_incoming = any(node == gra.rel_links(x)[1] for x in rel_links)
        has_outgoing_whitelist = any(node == gra.rel_links(r)[0] and
                                     rel_type(r) in BACKWARDS_WHITELIST
                                     for r in rel_links)
        is_at_start = edu.text_span().char_start == first_turn_start
        return not (has_incoming or has_outgoing_whitelist or is_at_start)
Example #5
    def html(self):
        doc = self.doc
        contexts = self.contexts
        t = self.unit

        parent = ET.Element('span')
        html_anno_id(parent, self.unit)
        html_span(parent, " " + anno_code(t))
        type_span = html_span(parent, '[%s] ' % t.type)

        if t in contexts:
            turn = contexts[t].turn
            turn_info = stac.split_turn_text(doc.text(turn.span))[0]
            turn_splits = turn_info.split(":")
            if len(turn_splits) > 1:
                tid = ET.SubElement(parent, 'b')
                tid.text = turn_splits[0] + ":"
                trest = html_span(parent, ":".join(turn_splits[1:]))
            else:
                html_span(parent, turn_info)

        t_span = t.text_span()
        t_text = doc.text(t_span)
        if t_span.char_start > 0:
            before_idx = t_span.char_start - 1
            before_sp = html_span(parent, doc.text()[before_idx])
            before_sp.attrib['class'] = 'spillover'
        text_sp = html_span(parent, t_text)
        text_sp.attrib['class'] = 'snippet'
        if t_span.char_end < len(doc.text()):
            after_idx = t_span.char_end
            after_sp = html_span(parent, doc.text()[after_idx])
            after_sp.attrib['class'] = 'spillover'
        html_span(parent, ' %s' % t_span)
        return parent
Example #6
def is_disconnected(gra, contexts, node):
    """Return True if an EDU is disconnected from a discourse structure.

    An EDU is considered disconnected unless:

    * it has an incoming link or
    * it has an outgoing Conditional link or
    * it's at the beginning of a dialogue

    In principle we don't need to look at EDUs that are disconnected
    on the outgoing end because (1) it can be legitimate for
    non-dialogue-ending EDUs to not have outgoing links and (2) such
    information would be redundant with the incoming anyway.
    """
    def rel_type(rel):
        "relation type for a given link (string)"
        return gra.annotation(gra.mirror(rel)).type

    edu = gra.annotation(node)
    if edu not in contexts:
        return True
    else:
        ctx = contexts[edu]
        first_turn_span = ctx.dialogue_turns[0].text_span()
        first_turn_text = gra.doc.text(first_turn_span)
        first_turn_pref = stac.split_turn_text(first_turn_text)[0]
        first_turn_start = first_turn_span.char_start + len(first_turn_pref)
        rel_links = [x for x in gra.links(node) if gra.is_relation(x)]
        has_incoming = any(node == gra.rel_links(x)[1] for x in rel_links)
        has_outgoing_whitelist = any(
            node == gra.rel_links(r)[0] and rel_type(r) in BACKWARDS_WHITELIST
            for r in rel_links)
        is_at_start = edu.text_span().char_start == first_turn_start
        return not (has_incoming or has_outgoing_whitelist or is_at_start)
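The second bullet of the docstring implies that BACKWARDS_WHITELIST contains at least the Conditional relation type. The constant itself is not shown in this example; a presumed definition, inferred from the docstring rather than from the source, would look like this.

# Presumed definition (not shown in the example): relation types whose
# outgoing end is enough to count an EDU as connected.
BACKWARDS_WHITELIST = ["Conditional"]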
Example #7
def turn_id_text(doc):
    """
    Return a list of (turn id, text) tuples
    in span order (no speaker prefix)
    """
    turns = sorted((x for x in doc.units if stac.is_turn(x)),
                   key=lambda k: k.text_span())
    return [(stac.turn_id(turn),
             stac.split_turn_text(doc.text(turn.text_span()))[1])
            for turn in turns]
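A hypothetical usage sketch for turn_id_text follows; doc is assumed to be a GlozzDocument from an already-loaded STAC corpus, and the sample values in the comment are illustrative only.

# Hypothetical usage; only the shape of the result is being illustrated.
for tid, ttext in turn_id_text(doc):
    # e.g. tid=163, ttext="anyone got wood?" (speaker prefix already stripped)
    print(tid, ttext)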
Example #8
 def _get_turn_info(self, u):
     enclosing_turns = [ t for t in self.turns if t.span.encloses(u.span) ]
     if len(enclosing_turns) > 0:
         turn      = enclosing_turns[0]
         speaker   = turn.features['Emitter']
         turn_text = stac.split_turn_text(self.doc.text(turn.span))[0]
         turn_id   = turn_text.split(':')[0].strip()
         return speaker, turn_id
     else:
         return None, None
Example #9
File: glozz.py Project: kowey/educe
 def html_turn_info(self, parent, turn):
     """
     Given a turn annotation, append a prettified HTML
     representation of the turn text (highlighting parts
     of it, such as the turn number)
     """
     turn_text = self.doc.text(turn.text_span())
     turn_info = stac.split_turn_text(turn_text)[0]
     turn_splits = turn_info.split(":")
     if len(turn_splits) > 1:
         tid = turn_splits[0]
         trest = turn_splits[1:]
         h.elem(parent, 'b', text=tid + ":")
         h.span(parent, text=":".join(trest))
     else:
         h.span(parent, turn_info)
Example #10
    def tgt_html(grandparent, t, naughty=False):
        def tid(x):
            if x in contexts:
                tid_str = contexts[x].turn.features['Identifier']
                return int(tid_str) if tid_str else None
            else:
                return None

        parent = html_span(grandparent)
        html_span(parent, anno_code(t))
        type_span = html_span(parent, '[%s] ' % t.type)
        if naughty:
            type_span.attrib['class'] = 'naughty'

        if t in contexts:
            turn = contexts[t].turn
            turn_info = stac.split_turn_text(doc.text(turn.span))[0]
            turn_splits = turn_info.split(":")
            if len(turn_splits) > 1:
                # renamed to avoid shadowing the tid() helper defined above
                tid_el = ET.SubElement(parent, 'b')
                tid_el.text = turn_splits[0] + ":"
                trest = html_span(parent, ":".join(turn_splits[1:]))
            else:
                html_span(parent, turn_info)

        if not stac.is_relation_instance(t):
            t_span = t.text_span()
            t_text = doc.text(t_span)
            if stac.is_cdu(t):
                tids = [x for x in map(tid, t.terminals()) if x]
                if tids:
                    tspan = ET.SubElement(parent, 'b')
                    min_tid = min(tids)
                    max_tid = max(tids)
                    if min_tid == max_tid:
                        tspan.text = "%d: " % min_tid
                    else:
                        tspan.text = "%d-%d: " % (min_tid, max_tid)
            text_sp = html_span(parent, snippet(t_text, 100))
            text_sp.attrib['class'] = 'snippet'
            html_span(parent, ' %s' % t_span)
        return parent
Example #11
def read_tags(corpus, root_dir):
    """
    Read stored POS tagger output from a directory and convert it to
    educe.annotation.Standoff objects.

    Return a dictionary mapping 'FileId's to lists of tokens.

    Parameters
    ----------
    corpus : dict(FileId, GlozzDocument)
        Dictionary of documents keyed by their FileId.
    root_dir : str
        Path to the directory containing the output of the POS tagger,
        one file per document.

    Returns
    -------
    pos_tags : dict(FileId, list(Token))
        Map from each document id to the list of tokens predicted by a
        POS tagger.
    """
    pos_tags = {}
    for k in corpus:
        doc = corpus[k]
        turns = sorted_by_span(x for x in doc.units if stac.is_turn(x))

        tagged_file = tagger_file_name(k, root_dir)
        raw_toks = ext.read_token_file(tagged_file)
        pos_tags[k] = []
        for turn, seg in zip(turns, raw_toks):
            prefix, body = stac.split_turn_text(doc.text(turn.text_span()))
            start = turn.span.char_start + len(prefix)
            toks = ext.token_spans(body, seg, start)
            for t in toks:
                t.origin = doc
                dtxt = doc.text(t.text_span())
                assert dtxt == t.word
            pos_tags[k].extend(toks)
    return pos_tags
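A hypothetical usage sketch matching the Parameters/Returns block above: corpus is assumed to be a dict(FileId, GlozzDocument) loaded elsewhere with educe's STAC corpus reader, and 'data/pilot-pos' is an assumed location for the tagger output; neither is part of read_tags itself.

# Hypothetical usage sketch; `corpus` and the directory name are assumptions.
pos_tags = read_tags(corpus, 'data/pilot-pos')
for key, tokens in pos_tags.items():
    for tok in tokens[:5]:
        # each Token pairs the tagged word with its character span in the doc
        print(key.doc, tok.word, tok.text_span())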
Example #12
def read_corenlp_result(doc, corenlp_doc, tid=None):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe Document (?)
        The original document (?)

    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document

    tid: turn id
        Turn id (?)

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information.
    """
    def is_matching_turn(x):
        """Check whether x corresponds to the current turn"""
        if tid is None:
            return stac.is_turn(x)
        else:
            x_tid = stac.turn_id(x)
            return stac.is_turn(x) and tid == x_tid

    turns = sorted((x for x in doc.units if is_matching_turn(x)),
                   key=lambda k: k.span)
    sentences = corenlp_doc.get_ordered_sentence_list()

    if len(turns) != len(sentences):
        msg = 'Uh-oh, mismatch between the number of turns in the corpus (%d) '\
              'and parsed sentences (%d) %s'\
              % (len(turns), len(sentences), doc.origin)
        raise Exception(msg)

    sentence_toks = defaultdict(list)
    for t in corenlp_doc.get_ordered_token_list():
        sid = t['s_id']
        sentence_toks[sid].append(t)

    # build dict from sid to (dict from tid to fancy token)
    educe_tokens = defaultdict(dict)
    for turn, sent in zip(turns, sentences):
        sid = sent['id']

        # the token offsets are global, ie. for all sentences/turns
        # in the file; so we have to shift them to left to zero them
        # and then shift them back to the right
        sentence_begin = min(t['extent'][0] for t in sentence_toks[sid])

        ttext = doc.text(turn.text_span())
        offset = (turn.span.char_start
                  + len(stac.split_turn_text(ttext)[0])
                  - sentence_begin)

        for t in sentence_toks[sid]:
            tid = t['id']
            educe_tokens[sid][tid] = CoreNlpToken(t, offset)

    all_tokens = []
    all_trees = []
    all_dtrees = []
    for turn, sent in zip(turns, sentences):
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # FIXME tokens are probably not properly ordered because token ids
        # are global ids, i.e. strings like "1-18" (sentence 1, token 18)
        # which means basic sorting ranks "1-10" before "1-2"
        # cf. educe.rst_dt.corenlp
        sorted_tokens = [tokens_dict[x] for x in sorted(tokens_dict.keys())]
        # end FIXME
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        educe_tree = ConstituencyTree.build(tree, sorted_tokens)

        deps = defaultdict(list)
        for ty, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((ty, dep_id))

        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')

        all_tokens.extend(sorted_tokens)
        all_trees.append(educe_tree)
        all_dtrees.append(educe_dtree)

    all_chains = []
    for ctr, chain in enumerate(corenlp_doc.get_coref_chains()):
        mentions = []
        for m in chain:
            sid = m['sentence']

            local_id = lambda x: int(x[len(sid) + 1:])
            global_id = lambda x: sid + '-' + str(x)

            start = local_id(m['start'])
            end = local_id(m['end'])
            token_range = [global_id(x) for x in range(start, end)]
            tokens = [educe_tokens[sid][t] for t in token_range]
            head = educe_tokens[sid][m['head']]
            mentions.append(Mention(tokens, head, m['most_representative']))
        all_chains.append(Chain(mentions))

    return CoreNlpDocument(all_tokens, all_trees, all_dtrees, all_chains)
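The FIXME in the example above notes that token ids such as "1-18" are strings, so plain sorting ranks "1-10" before "1-2". Within one sentence all ids share the same "<sid>-" prefix, so sorting by the numeric part after the dash would restore the intended order; the helper below is an illustrative sketch, not educe code.

# Illustrative sketch of a numeric sort key for CoreNLP-style token ids.
def local_token_id(tok_id):
    return int(tok_id.rsplit('-', 1)[1])

ids = ["1-10", "1-2", "1-1"]
assert sorted(ids) == ["1-1", "1-10", "1-2"]                  # lexical: wrong
assert sorted(ids, key=local_token_id) == ["1-1", "1-2", "1-10"]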
Example #13
def run_pipeline(corpus, outdir, corenlp_dir, split=False):
    """
    Run the standard corenlp pipeline on all the (unannotated) documents in
    the corpus and save the results in the specified directory

    If `split=True`, we output one file per turn, an experimental mode
    to account for switching between multiple speakers.  We don't have
    all the infrastructure to read these back in (it should just be a
    matter of some filename manipulation though) and hope to flesh this
    out later.  We also intend to tweak the notion of splitting
    by aggregating consecutive turns with the same speaker, which may somewhat
    mitigate the loss of coreference information.
    """

    # for each document, how many digits do we need to represent the turns
    # in that document; for essentially cosmetic purposes (padding)
    digits = {}
    for d in frozenset([ k.doc for k in corpus ]):
        turns = []
        for k in corpus:
            if k.doc == d:
                turns.extend(filter(stac.is_turn, corpus[k].units))
        turn_ids  = [ int(t.features['Identifier']) for t in turns ]
        digits[d] = max(2,int(math.ceil(math.log10(max(turn_ids)))))

    # dump the turn text
    # TODO: aggregate consecutive turns by same speaker
    txt_files = []
    for k in corpus:
        doc   = corpus[k]
        turns = sorted(filter(stac.is_turn, doc.units),
                       key=lambda k:k.span)

        k_txt           = copy.copy(k)
        k_txt.stage     = 'turns'
        k_txt.annotator = None

        if split:
            for turn in turns:
                ttext = stac.split_turn_text(doc.text_for(turn))[1]
                tid   = turn.features['Identifier']
                root  = stac.id_to_path(k_txt) + '_' + tid.zfill(digits[k.doc])

                txt_file = os.path.join(outdir, 'tmp', root + '.txt')
                txt_dir  = os.path.split(txt_file)[0]
                if not os.path.exists(txt_dir):
                    os.makedirs(txt_dir)

                with codecs.open(txt_file, 'w', 'utf-8') as f:
                    print >> f, ttext

                txt_files.append(txt_file)
        else:
            root     = stac.id_to_path(k_txt)
            txt_file = os.path.join(outdir, 'tmp', root + '.txt')
            txt_dir  = os.path.split(txt_file)[0]
            if not os.path.exists(txt_dir):
                os.makedirs(txt_dir)
            with codecs.open(txt_file, 'w', 'utf-8') as f:
                for turn in turns:
                    ttext = stac.split_turn_text(doc.text_for(turn))[1]
                    print >> f, ttext
            txt_files.append(txt_file)

    # manifest tells corenlp which files to read as input
    manifest_dir  = os.path.join(outdir, 'tmp')
    manifest_file = os.path.join(manifest_dir, 'manifest')
    with codecs.open(manifest_file, 'w', 'utf-8') as f:
        print >> f, '\n'.join(txt_files)

    # java properties to control behaviour of corenlp
    properties = [] if split else ['ssplit.eolonly=true']
    props_file = os.path.join(manifest_dir, 'corenlp.properties')
    with codecs.open(props_file, 'w', 'utf-8') as f:
        print >> f, '\n'.join(properties)

    # run corenlp (will take a while for it to load its various models)
    jars   = [ x for x in os.listdir(corenlp_dir) if os.path.splitext(x)[1] == '.jar' ]
    cp_sep = ':' if os.name != 'nt' else ';'

    corenlp_outdir = os.path.join(outdir, 'corenlp')
    if not os.path.exists(corenlp_outdir):
        os.makedirs(corenlp_outdir)

    cmd = [ 'java'
          , '-cp', cp_sep.join(jars)
          , '-Xmx3g'
          , 'edu.stanford.nlp.pipeline.StanfordCoreNLP'
          , '-filelist',  manifest_file
          , '-props',     props_file
          , '-outputDirectory', corenlp_outdir
          ]
    subprocess.call(cmd, cwd=corenlp_dir)

    # corenlp dumps all the output into one flat directory;
    # move them to the standard STAC layout paths
    for sfile in os.listdir(corenlp_outdir):
        if os.path.splitext(sfile)[1] != '.xml': continue
        k, tid = from_corenlp_output_filename(sfile)
        from_path = os.path.join(corenlp_outdir, sfile)
        to_path   = parsed_file_name(k, outdir)
        to_dir    = os.path.dirname(to_path)
        if not os.path.exists(to_dir):
            os.makedirs(to_dir)
        os.rename(from_path, to_path)
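The docstring above mentions a planned tweak for split mode: aggregating consecutive turns from the same speaker before dumping them, to limit the loss of coreference information. A sketch of that grouping might look like the following; it reads the speaker from the 'Emitter' feature (as the other examples do) but is otherwise an assumption, not part of run_pipeline.

import itertools

# Hypothetical sketch of the "aggregate consecutive turns by speaker" idea
# mentioned in the docstring; not part of run_pipeline as it stands.
def group_turns_by_speaker(turns):
    """Group span-sorted turns into (speaker, [turns]) runs."""
    speaker = lambda turn: turn.features['Emitter']
    return [(spkr, list(group))
            for spkr, group in itertools.groupby(turns, key=speaker)]
# Each run could then be written out as a single text file, giving CoreNLP's
# coreference resolver a larger window per speaker.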
Example #14
def read_corenlp_result(doc, corenlp_doc, tid=None):
    def is_matching_turn(x):
        if tid is None:
            return stac.is_turn(x)
        else:
            x_tid = x.features['Identifier']
            return stac.is_turn(x) and tid == x_tid

    turns     = sorted(filter(is_matching_turn, doc.units), key=lambda k:k.span)
    sentences = corenlp_doc.get_ordered_sentence_list()

    if len(turns) != len(sentences):
        msg = 'Uh-oh, mismatch between the number of turns in the corpus (%d) '\
              'and parsed sentences (%d) %s'\
                % (len(turns), len(sentences), doc.origin)
        raise Exception(msg)

    sentence_toks = collections.defaultdict(list)
    for t in corenlp_doc.get_ordered_token_list():
        sid    = t['s_id']
        sentence_toks[sid].append(t)

    # build dict from sid to (dict from tid to fancy token)
    educe_tokens = collections.defaultdict(dict)
    for turn, sent in zip(turns, sentences):
        sid = sent['id']

        # the token offsets are global, ie. for all sentences/turns
        # in the file; so we have to shift them to left to zero them
        # and then shift them back to the right
        sentence_begin = min(t['extent'][0] for t in sentence_toks[sid])

        ttext  = doc.text_for(turn)
        offset = turn.span.char_start + len(stac.split_turn_text(ttext)[0]) - sentence_begin

        for t in sentence_toks[sid]:
            tid = t['id']
            educe_tokens[sid][tid] = CoreNlpToken(t, offset)

    all_tokens = []
    all_trees  = []
    all_dtrees = []
    for turn, sent in zip(turns, sentences):
        sid         = sent['id']
        tokens      = educe_tokens[sid]
        tree        = nltk.tree.Tree(sent['parse'])
        educe_tree  = ConstituencyTree.build(tree, tokens.values())

        deps   = collections.defaultdict(list)
        for ty, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((ty,dep_id))

        educe_dtree = DependencyTree.build(deps, tokens, sid + '-0')

        all_tokens.extend(tokens.values())
        all_trees.append(educe_tree)
        all_dtrees.append(educe_dtree)

    all_chains = []
    for ctr,chain in enumerate(corenlp_doc.get_coref_chains()):
        mentions = []
        for m in chain:
            sid         = m['sentence']

            local_id    = lambda x : int(x[len(sid) + 1:])
            global_id   = lambda x : sid + '-' + str(x)

            start       = local_id(m['start'])
            end         = local_id(m['end'])
            token_range = map(global_id, range(start, end))
            tokens      = [ educe_tokens[sid][t] for t in token_range ]
            head        = educe_tokens[sid][m['head']]
            mentions.append(Mention(tokens, head, m['most_representative']))
        all_chains.append(Chain(mentions))

    return CoreNlpDocument(all_tokens, all_trees, all_dtrees, all_chains)
Example #15
 def ttext(turn):
     return stac.split_turn_text(doc.text(turn.text_span()))[1]
Example #16
 def ttext(turn):
     return stac.split_turn_text(doc.text_for(turn))[1]
Example #17
 def ttext(turn):
     """Get the turn text"""
     return stac.split_turn_text(doc.text(turn.text_span()))[1]
Example #18
def read_corenlp_result(doc, corenlp_doc, tid=None):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe Document (?)
        The original document (?)

    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document

    tid: turn id
        Turn id (?)

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information.
    """
    def is_matching_turn(x):
        """Check whether x corresponds to the current turn"""
        if tid is None:
            return stac.is_turn(x)
        else:
            x_tid = stac.turn_id(x)
            return stac.is_turn(x) and tid == x_tid

    turns = sorted((x for x in doc.units if is_matching_turn(x)),
                   key=lambda k: k.span)
    sentences = corenlp_doc.get_ordered_sentence_list()

    if len(turns) != len(sentences):
        msg = 'Uh-oh, mismatch between the number of turns in the corpus (%d) '\
              'and parsed sentences (%d) %s'\
              % (len(turns), len(sentences), doc.origin)
        raise Exception(msg)

    sentence_toks = defaultdict(list)
    for t in corenlp_doc.get_ordered_token_list():
        sid = t['s_id']
        sentence_toks[sid].append(t)

    # build dict from sid to (dict from tid to fancy token)
    educe_tokens = defaultdict(dict)
    for turn, sent in zip(turns, sentences):
        sid = sent['id']

        # the token offsets are global, ie. for all sentences/turns
        # in the file; so we have to shift them to left to zero them
        # and then shift them back to the right
        sentence_begin = min(t['extent'][0] for t in sentence_toks[sid])

        ttext = doc.text(turn.text_span())
        offset = (turn.span.char_start + len(stac.split_turn_text(ttext)[0]) -
                  sentence_begin)

        for t in sentence_toks[sid]:
            tid = t['id']
            educe_tokens[sid][tid] = CoreNlpToken(t, offset)

    all_tokens = []
    all_trees = []
    all_dtrees = []
    for turn, sent in zip(turns, sentences):
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # FIXME tokens are probably not properly ordered because token ids
        # are global ids, i.e. strings like "1-18" (sentence 1, token 18)
        # which means basic sorting ranks "1-10" before "1-2"
        # cf. educe.rst_dt.corenlp
        sorted_tokens = [tokens_dict[x] for x in sorted(tokens_dict.keys())]
        # end FIXME
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        educe_tree = ConstituencyTree.build(tree, sorted_tokens)

        deps = defaultdict(list)
        for ty, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((ty, dep_id))

        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')

        all_tokens.extend(sorted_tokens)
        all_trees.append(educe_tree)
        all_dtrees.append(educe_dtree)

    all_chains = []
    for ctr, chain in enumerate(corenlp_doc.get_coref_chains()):
        mentions = []
        for m in chain:
            sid = m['sentence']

            local_id = lambda x: int(x[len(sid) + 1:])
            global_id = lambda x: sid + '-' + str(x)

            start = local_id(m['start'])
            end = local_id(m['end'])
            token_range = [global_id(x) for x in range(start, end)]
            tokens = [educe_tokens[sid][t] for t in token_range]
            head = educe_tokens[sid][m['head']]
            mentions.append(Mention(tokens, head, m['most_representative']))
        all_chains.append(Chain(mentions))

    return CoreNlpDocument(all_tokens, all_trees, all_dtrees, all_chains)