Example #1
 def is_matching_turn(x):
     """Check whether x corresponds to the current turn"""
     if tid is None:
         return stac.is_turn(x)
     else:
         x_tid = stac.turn_id(x)
         return stac.is_turn(x) and tid == x_tid
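`is_matching_turn` is a closure: `tid` and `stac` come from its enclosing scope, so the snippet only runs when embedded in a function that defines them. A minimal sketch of such an embedding, assuming `stac` is the educe.stac module and `doc` a GlozzDocument; the wrapper name `find_turns` is hypothetical, not from the original code:

def find_turns(doc, tid=None):
    """Return the turns in doc, or only the one whose id is tid (sketch)."""
    def is_matching_turn(x):
        """Check whether x corresponds to the current turn"""
        if tid is None:
            return stac.is_turn(x)
        else:
            # `and`, not `&`: bitwise-& binds tighter than `==`
            return stac.is_turn(x) and tid == stac.turn_id(x)
    return [x for x in doc.units if is_matching_turn(x)]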
Example #2
 def is_matching_turn(x):
     """Check whether x corresponds to the current turn"""
     if tid is None:
         return stac.is_turn(x)
     else:
         x_tid = x.features['Identifier']
         return stac.is_turn(x) and tid == x_tid
Example #3
def read_tags(corpus, root_dir):
    """
    Read stored POS tagger output from a directory and convert it to
    educe.annotation.Standoff objects.

    Return a dictionary mapping 'FileId's to lists of tokens.
    """
    pos_tags = {}
    for k in corpus:
        doc = corpus[k]
        turns = sorted_by_span(x for x in doc.units if stac.is_turn(x))

        tagged_file = tagger_file_name(k, root_dir)
        raw_toks = ext.read_token_file(tagged_file)
        pos_tags[k] = []
        for turn, seg in zip(turns, raw_toks):
            prefix, body = stac.split_turn_text(doc.text(turn.text_span()))
            start = turn.span.char_start + len(prefix)
            toks = ext.token_spans(body, seg, start)
            for t in toks:
                t.origin = doc
                dtxt = doc.text(t.text_span())
                assert dtxt == t.word
            pos_tags[k].extend(toks)
    return pos_tags
Example #4
def turn_id_text(doc):
    """
    Return a list of (turn id, text) tuples
    in span order (speaker prefix stripped)
    """
    turns = sorted((x for x in doc.units if stac.is_turn(x)),
                   key=lambda k: k.text_span())
    return [(stac.turn_id(turn),
             stac.split_turn_text(doc.text(turn.text_span()))[1])
            for turn in turns]
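A brief usage sketch: because the pairs come back in span order, they can feed per-turn processing directly, as `run_pipeline` does below. Here `doc` is assumed to be a GlozzDocument as in the other examples:

# sketch: print each turn id with its speaker-stripped text
for tid, ttext in turn_id_text(doc):
    print(tid, ttext)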
Example #5
def extract_turns(doc):
    """
    Return a string representation of the document's turn text
    for use by a tagger
    """
    turns = sorted_by_span(x for x in doc.units if stac.is_turn(x))

    def ttext(turn):
        """Get the turn text"""
        return stac.split_turn_text(doc.text(turn.text_span()))[1]

    return "\n".join(ttext(x) for x in turns)
Example #6
def _split_dialogue(tcache, doc, tid):
    """Split a dialogue at a turn

    Turns at or after the given tid are pushed into a new empty
    dialogue.

    Returns
    -------
    Span for the dialogue that was split
    """

    wanted_t = "turn %d" % tid
    wanted_d = "dialogue for " + wanted_t
    turn = _the(wanted_t, [x for x in doc.units if st.is_turn(x) and
                           st.turn_id(x) == tid])
    dialogue = _the(wanted_d, [x for x in doc.units if st.is_dialogue(x) and
                               x.encloses(turn)])
    dspan = dialogue.text_span()
    _actually_split(tcache, doc, dialogue, turn)
    return dspan
Example #7
def _split_dialogue(tcache, doc, tid):
    """Split a dialogue at a turn

    Turns at or after the given tid are pushed into a new empty
    dialogue.

    Returns
    -------
    Span for the dialogue that was split
    """

    wanted_t = 'turn {}'.format(tid)
    wanted_d = 'dialogue for ' + wanted_t
    turn = _the(wanted_t, [x for x in doc.units if st.is_turn(x) and
                           st.turn_id(x) == tid])
    dialogue = _the(wanted_d, [x for x in doc.units if st.is_dialogue(x) and
                               x.encloses(turn)])
    dspan = dialogue.text_span()
    _actually_split(tcache, doc, dialogue, turn)
    return dspan
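A hedged usage sketch: the construction of `tcache` (presumably a timestamp cache used when minting the new dialogue annotation) is not shown in these examples, so it is assumed as given here:

# sketch: split the dialogue containing turn 7 and report the
# region an annotator should re-inspect; tcache is assumed given
dspan = _split_dialogue(tcache, doc, 7)
print('split dialogue spanning', dspan)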
Example #8
def read_tags(corpus, root_dir):
    """
    Read stored POS tagger output from a directory and convert it to
    educe.annotation.Standoff objects.

    Return a dictionary mapping 'FileId's to lists of tokens.

    Parameters
    ----------
    corpus : dict(FileId, GlozzDocument)
        Dictionary of documents keyed by their FileId.
    root_dir : str
        Path to the directory containing the output of the POS tagger,
        one file per document.

    Returns
    -------
    pos_tags : dict(FileId, list(Token))
        Map from each document id to the list of tokens predicted by a
        POS tagger.
    """
    pos_tags = {}
    for k in corpus:
        doc = corpus[k]
        turns = sorted_by_span(x for x in doc.units if stac.is_turn(x))

        tagged_file = tagger_file_name(k, root_dir)
        raw_toks = ext.read_token_file(tagged_file)
        pos_tags[k] = []
        for turn, seg in zip(turns, raw_toks):
            prefix, body = stac.split_turn_text(doc.text(turn.text_span()))
            start = turn.span.char_start + len(prefix)
            toks = ext.token_spans(body, seg, start)
            for t in toks:
                t.origin = doc
                dtxt = doc.text(t.text_span())
                assert dtxt == t.word
            pos_tags[k].extend(toks)
    return pos_tags
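A usage sketch, assuming `corpus` is the FileId-to-GlozzDocument mapping described in the Parameters section and that `root_dir` holds one tagger output file per document (the path is illustrative):

# sketch: load tagger output and sanity-check token counts per doc
pos_tags = read_tags(corpus, 'data/pos-tagged')
for k in pos_tags:
    print(k, len(pos_tags[k]), 'tokens')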
Example #9
def _nudge_dialogue(doc, tid, direction):
    """
    Move a turn either up or down.
    For feedback purposes, return the span of the affected region
    """
    prev_turn, turn, next_turn = \
        _window1(lambda x: st.turn_id(x) == tid,
                 [x for x in doc.units if st.is_turn(x)])
    if not turn:
        sys.exit("Could not find turn %d" % tid)

    tspan = turn.text_span()
    prev_dialogue, dialogue, next_dialogue = \
        _window1(lambda x: x.text_span().encloses(tspan),
                 [x for x in doc.units if st.is_dialogue(x)])

    if direction == "up":
        return _nudge_up(turn, dialogue, next_turn, prev_dialogue)
    elif direction == "down":
        return _nudge_down(turn, dialogue, prev_turn, next_dialogue)
    else:
        raise ValueError("Unknown direction " + direction)
Example #10
def _nudge_dialogue(doc, tid, direction):
    """
    Move a turn either up or down.
    For feedback purposes, return the span of the affected region
    """
    prev_turn, turn, next_turn = _window1(
        lambda x: st.turn_id(x) == tid,
        [x for x in doc.units if st.is_turn(x)]
    )
    if not turn:
        sys.exit("Could not find turn %d" % tid)

    tspan = turn.text_span()
    prev_dialogue, dialogue, next_dialogue = _window1(
        lambda x: x.text_span().encloses(tspan),
        [x for x in doc.units if st.is_dialogue(x)]
    )

    if direction == "up":
        return _nudge_up(turn, dialogue, next_turn, prev_dialogue)
    elif direction == "down":
        return _nudge_down(turn, dialogue, prev_turn, next_dialogue)
    else:
        raise ValueError("Unknown direction " + direction)
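A usage sketch; as the docstring says, the returned span exists for feedback, e.g. so a caller can tell the user which region of text changed:

# sketch: move turn 12 up into the preceding dialogue
span = _nudge_dialogue(doc, 12, "up")
print("nudged turn 12; affected region:", span)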
Example #11
 def is_matching_turn(x):
     """Check whether x corresponds to the current turn"""
     if tid is None:
         return stac.is_turn(x)
     else:
         x_tid = x.features['Identifier']
         return stac.is_turn(x) and tid == x_tid
Example #12
def run_pipeline(corpus, outdir, corenlp_dir, split=False):
    """
    Run the standard corenlp pipeline on all the (unannotated) documents
    in the corpus and save the results in the specified directory.

    If `split=True`, we output one file per turn, an experimental mode
    to account for switching between multiple speakers.  We don't have
    all the infrastructure to read these back in (it should just be a
    matter of some filename manipulation though) and hope to flesh this
    out later.  We also intend to tweak the notion of splitting
    by aggregating consecutive turns with the same speaker, which may
    somewhat mitigate the loss of coreference information.
    """

    if split:
        # for each document, how many digits do we need to represent the
        # turns in that document; for essentially cosmetic purposes
        # (padding)
        digits = {}
        for d in frozenset([k.doc for k in corpus]):
            turns = []
            for k in corpus:
                if k.doc == d:
                    turns.extend([x for x in corpus[k].units
                                  if stac.is_turn(x)])
            turn_ids = [stac.turn_id(t)[0] for t in turns]
            # use len(str(...)) rather than ceil(log10(...)), which
            # undercounts exact powers of ten (e.g. turn 100 needs 3)
            digits[d] = max(2, len(str(max(turn_ids))))

    # dump the turn text
    # TODO: aggregate consecutive turns by same speaker
    txt_files = []
    for k in corpus:
        doc = corpus[k]

        k_txt = copy.copy(k)
        k_txt.stage = 'turns'
        k_txt.annotator = None

        if split:
            nb_digits = digits[k.doc]
            for tid, ttext in turn_id_text(doc):
                root = stac.id_to_path(k_txt) + '_' + tid.zfill(nb_digits)

                txt_file = os.path.join(outdir, 'tmp', root + '.txt')
                txt_dir = os.path.split(txt_file)[0]
                if not os.path.exists(txt_dir):
                    os.makedirs(txt_dir)

                with codecs.open(txt_file, 'w', 'utf-8') as f:
                    print(ttext, file=f)

                txt_files.append(txt_file)
        else:
            root = stac.id_to_path(k_txt)
            txt_file = os.path.join(outdir, 'tmp', root + '.txt')
            txt_dir = os.path.split(txt_file)[0]
            if not os.path.exists(txt_dir):
                os.makedirs(txt_dir)
            with codecs.open(txt_file, 'w', 'utf-8') as f:
                for _, ttext in turn_id_text(doc):
                    print(ttext, file=f)
            txt_files.append(txt_file)

    # run CoreNLP
    corenlp_wrapper = CoreNlpWrapper(corenlp_dir)
    corenlp_props = [] if split else ['ssplit.eolonly=true']
    corenlp_outdir = corenlp_wrapper.process(txt_files, outdir,
                                             properties=corenlp_props)

    # corenlp dumps all the output into one flat directory;
    # move them to the standard STAC layout paths
    for sfile in os.listdir(corenlp_outdir):
        if os.path.splitext(sfile)[1] != '.xml':
            continue
        from_path = os.path.join(corenlp_outdir, sfile)
        # targeted (STAC) filename
        k, tid = from_corenlp_output_filename(sfile)
        to_path = parsed_file_name(k, outdir)
        to_dir = os.path.dirname(to_path)
        if not os.path.exists(to_dir):
            os.makedirs(to_dir)
        os.rename(from_path, to_path)
Example #13
def run_pipeline(corpus, outdir, corenlp_dir, split=False):
    """
    Run the standard corenlp pipeline on all the (unannotated) documents
    in the corpus and save the results in the specified directory.

    If `split=True`, we output one file per turn, an experimental mode
    to account for switching between multiple speakers.  We don't have
    all the infrastructure to read these back in (it should just be a
    matter of some filename manipulation though) and hope to flesh this
    out later.  We also intend to tweak the notion of splitting
    by aggregating consecutive turns with the same speaker, which may
    somewhat mitigate the loss of coreference information.
    """

    if split:
        # for each document, how many digits do we need to represent the
        # turns in that document; for essentially cosmetic purposes
        # (padding)
        digits = {}
        for d in frozenset([k.doc for k in corpus]):
            turns = []
            for k in corpus:
                if k.doc == d:
                    turns.extend(
                        [x for x in corpus[k].units if stac.is_turn(x)])
            turn_ids = [stac.turn_id(t)[0] for t in turns]
            # use len(str(...)) rather than ceil(log10(...)), which
            # undercounts exact powers of ten (e.g. turn 100 needs 3)
            digits[d] = max(2, len(str(max(turn_ids))))

    # dump the turn text
    # TODO: aggregate consecutive turns by same speaker
    txt_files = []
    for k in corpus:
        doc = corpus[k]

        k_txt = copy.copy(k)
        k_txt.stage = 'turns'
        k_txt.annotator = None

        if split:
            nb_digits = digits[k.doc]
            for tid, ttext in turn_id_text(doc):
                root = stac.id_to_path(k_txt) + '_' + tid.zfill(nb_digits)

                txt_file = os.path.join(outdir, 'tmp', root + '.txt')
                txt_dir = os.path.split(txt_file)[0]
                if not os.path.exists(txt_dir):
                    os.makedirs(txt_dir)

                with codecs.open(txt_file, 'w', 'utf-8') as f:
                    print(ttext, file=f)

                txt_files.append(txt_file)
        else:
            root = stac.id_to_path(k_txt)
            txt_file = os.path.join(outdir, 'tmp', root + '.txt')
            txt_dir = os.path.split(txt_file)[0]
            if not os.path.exists(txt_dir):
                os.makedirs(txt_dir)
            with codecs.open(txt_file, 'w', 'utf-8') as f:
                for _, ttext in turn_id_text(doc):
                    print(ttext, file=f)
            txt_files.append(txt_file)

    # run CoreNLP
    corenlp_wrapper = CoreNlpWrapper(corenlp_dir)
    corenlp_props = [] if split else ['ssplit.eolonly=true']
    corenlp_outdir = corenlp_wrapper.process(txt_files,
                                             outdir,
                                             properties=corenlp_props)

    # corenlp dumps all the output into one flat directory;
    # move them to the standard STAC layout paths
    for sfile in os.listdir(corenlp_outdir):
        if os.path.splitext(sfile)[1] != '.xml':
            continue
        from_path = os.path.join(corenlp_outdir, sfile)
        # targeted (STAC) filename
        k, tid = from_corenlp_output_filename(sfile)
        to_path = parsed_file_name(k, outdir)
        to_dir = os.path.dirname(to_path)
        if not os.path.exists(to_dir):
            os.makedirs(to_dir)
        os.rename(from_path, to_path)
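Finally, an end-to-end sketch; the paths are illustrative and `corpus` is assumed to be the same FileId-to-GlozzDocument mapping used throughout these examples:

# sketch only: writes one CoreNLP XML file per document under outdir
run_pipeline(corpus,
             outdir='out/corenlp',
             corenlp_dir='/opt/corenlp',
             split=False)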