Ejemplo n.º 1
0
 def is_matching_turn(x):
     """Check whether x corresponds to the current turn.

     ``tid`` is a free variable here (it is not a parameter or a local).
     When it is None, the result is simply ``stac.is_turn(x)``; otherwise
     x must be a turn AND its turn id must equal ``tid``.
     """
     if tid is None:
         return stac.is_turn(x)
     # Use the boolean `and`, not bitwise `&`: `a & b == c` parses as
     # `(a & b) == c` because `&` binds tighter than `==`, so the former
     # expression compared `is_turn(x) & tid` against the turn id.
     # Short-circuiting also means turn_id is only looked up for turns.
     return stac.is_turn(x) and stac.turn_id(x) == tid
Ejemplo n.º 2
0
 def is_matching_turn(x):
     """Check whether x corresponds to the current turn.

     ``tid`` is a free variable here (it is not a parameter or a local).
     When it is None, the result is simply ``stac.is_turn(x)``; otherwise
     x must be a turn AND its turn id must equal ``tid``.
     """
     if tid is None:
         return stac.is_turn(x)
     # Use the boolean `and`, not bitwise `&`: `a & b == c` parses as
     # `(a & b) == c` because `&` binds tighter than `==`, so the former
     # expression compared `is_turn(x) & tid` against the turn id.
     # Short-circuiting also means turn_id is only looked up for turns.
     return stac.is_turn(x) and stac.turn_id(x) == tid
Ejemplo n.º 3
0
def turn_id_text(doc):
    """
    List the turns of a document as (turn id, text) pairs.

    The turns are the units of `doc` accepted by `stac.is_turn`,
    visited in the order of their text spans. For each one, the text
    is the second component of `stac.split_turn_text` applied to the
    document text under the turn's span (no speaker).
    """
    def by_span(unit):
        """Sort key: the unit's text span"""
        return unit.text_span()

    ordered = sorted((unit for unit in doc.units if stac.is_turn(unit)),
                     key=by_span)
    pairs = []
    for turn in ordered:
        tid = stac.turn_id(turn)
        raw = doc.text(turn.text_span())
        pairs.append((tid, stac.split_turn_text(raw)[1]))
    return pairs
Ejemplo n.º 4
0
def turn_id_text(doc):
    """
    List the turns of a document as (turn id, text) pairs.

    The turns are the units of `doc` accepted by `stac.is_turn`,
    visited in the order of their text spans. For each one, the text
    is the second component of `stac.split_turn_text` applied to the
    document text under the turn's span (no speaker).
    """
    def by_span(unit):
        """Sort key: the unit's text span"""
        return unit.text_span()

    ordered = sorted((unit for unit in doc.units if stac.is_turn(unit)),
                     key=by_span)
    pairs = []
    for turn in ordered:
        tid = stac.turn_id(turn)
        raw = doc.text(turn.text_span())
        pairs.append((tid, stac.split_turn_text(raw)[1]))
    return pairs
Ejemplo n.º 5
0
def _split_dialogue(tcache, doc, tid):
    """Split a dialogue at a turn

    Looks up the turn whose id is `tid` and the dialogue enclosing it
    (both selected through `_the`, presumably "the one and only"
    match), then hands them to `_actually_split`. As originally
    described: turns at or after the given tid are pushed into a new
    empty dialogue.

    Returns
    -------
    Span of the dialogue, as read before the split was performed
    """
    turn_label = "turn %d" % tid
    dialogue_label = "dialogue for " + turn_label

    turn_matches = [unit for unit in doc.units
                    if st.is_turn(unit) and st.turn_id(unit) == tid]
    turn = _the(turn_label, turn_matches)

    dialogue_matches = [unit for unit in doc.units
                        if st.is_dialogue(unit) and unit.encloses(turn)]
    dialogue = _the(dialogue_label, dialogue_matches)

    # read the span before splitting
    span_before = dialogue.text_span()
    _actually_split(tcache, doc, dialogue, turn)
    return span_before
Ejemplo n.º 6
0
def _split_dialogue(tcache, doc, tid):
    """Split a dialogue at a turn

    Looks up the turn whose id is `tid` and the dialogue enclosing it
    (both selected through `_the`, presumably "the one and only"
    match), then hands them to `_actually_split`. As originally
    described: turns at or after the given tid are pushed into a new
    empty dialogue.

    Returns
    -------
    Span of the dialogue, as read before the split was performed
    """
    turn_label = 'turn {}'.format(tid)
    dialogue_label = 'dialogue for ' + turn_label

    turn_matches = [unit for unit in doc.units
                    if st.is_turn(unit) and st.turn_id(unit) == tid]
    turn = _the(turn_label, turn_matches)

    dialogue_matches = [unit for unit in doc.units
                        if st.is_dialogue(unit) and unit.encloses(turn)]
    dialogue = _the(dialogue_label, dialogue_matches)

    # read the span before splitting
    span_before = dialogue.text_span()
    _actually_split(tcache, doc, dialogue, turn)
    return span_before
Ejemplo n.º 7
0
def _nudge_dialogue(doc, tid, direction):
    """
    Move the turn with id `tid` either "up" or "down" by dispatching
    to `_nudge_up` or `_nudge_down`.

    The turn and the dialogue enclosing its span are located with
    `_window1` (presumably yielding the previous, matching and next
    item). Exits through `sys.exit` when no turn matches, and raises
    `Exception` for any other `direction`.

    For feedback purposes, return the span of the affected region
    """
    all_turns = [unit for unit in doc.units if st.is_turn(unit)]
    prev_turn, turn, next_turn = _window1(
        lambda cand: st.turn_id(cand) == tid, all_turns)
    if not turn:
        sys.exit("Could not find turn %d" % tid)

    tspan = turn.text_span()
    all_dialogues = [unit for unit in doc.units if st.is_dialogue(unit)]
    prev_dialogue, dialogue, next_dialogue = _window1(
        lambda cand: cand.text_span().encloses(tspan), all_dialogues)

    if direction == "up":
        return _nudge_up(turn, dialogue, next_turn, prev_dialogue)
    if direction == "down":
        return _nudge_down(turn, dialogue, prev_turn, next_dialogue)
    raise Exception("Unknown direction " + direction)
Ejemplo n.º 8
0
def _nudge_down(turn, dialogue, prev_turn, next_dialogue):
    """
    Move last turn to next dialogue (ie. shorten the right
    boundary of this dialogue and extend the left boundary of
    the next dialogue)

    The spans of `dialogue` and `next_dialogue` are modified in place.

    Exits through `sys.exit` when there is no previous turn, no next
    dialogue, or when the turn does not end where its dialogue ends.

    Return encompassing span to show what we've changed
    """
    if not prev_turn:
        sys.exit("Can't move very first turn. "
                 "Try `stac-util merge-dialogue` instead")
    elif not next_dialogue:
        # the two literals are concatenated: keep the separating space
        sys.exit("Can't move from last dialogue. "
                 "Try `stac-util move` instead")
    elif turn.span.char_end != dialogue.span.char_end:
        sys.exit("Turn %d %s is not at the end of its dialogue %s" %
                 (st.turn_id(turn), turn.span, dialogue.span))

    # distance from the end of this turn back to the end of the
    # previous one
    offset = prev_turn.span.char_end - turn.span.char_end
    # shift both dialogue boundaries by that amount, so that this
    # dialogue now ends where the previous turn ends
    next_dialogue.span.char_start += offset
    dialogue.span.char_end += offset
    return Span.merge_all([dialogue.span, next_dialogue.span])
Ejemplo n.º 9
0
def _nudge_up(turn, dialogue, next_turn, prev_dialogue):
    """
    Hand the first turn of a dialogue over to the previous dialogue:
    the previous dialogue's right boundary and this dialogue's left
    boundary both advance by the distance between the start of this
    turn and the start of the next one. Both spans are modified in
    place.

    Exits through `sys.exit` when there is no next turn, no previous
    dialogue, or when the turn does not start one character after the
    start of its dialogue.

    Returns the span covering both dialogues, to show what changed
    """
    if not next_turn:
        sys.exit("Can't move very last turn. "
                 "Try `stac-util merge-dialogue` instead")
    if not prev_dialogue:
        sys.exit("Can't move from first dialogue. "
                 "Try `stac-util move` instead")
    if turn.span.char_start - 1 != dialogue.span.char_start:
        details = (st.turn_id(turn), turn.span, dialogue.span)
        sys.exit("Turn %d %s is not at the start of its dialogue %s" %
                 details)

    # how far the shared boundary has to travel
    shift = next_turn.span.char_start - turn.span.char_start
    prev_dialogue.span.char_end += shift
    dialogue.span.char_start += shift
    affected = [prev_dialogue.span, dialogue.span]
    return Span.merge_all(affected)
Ejemplo n.º 10
0
def _nudge_down(turn, dialogue, prev_turn, next_dialogue):
    """
    Move last turn to next dialogue (ie. shorten the right
    boundary of this dialogue and extend the left boundary of
    the next dialogue)

    The spans of `dialogue` and `next_dialogue` are modified in place.

    Exits through `sys.exit` when there is no previous turn, no next
    dialogue, or when the turn does not end where its dialogue ends.

    Return encompassing span to show what we've changed
    """
    if not prev_turn:
        sys.exit("Can't move very first turn. "
                 "Try `stac-util merge-dialogue` instead")
    elif not next_dialogue:
        # the two literals are concatenated: keep the separating space
        sys.exit("Can't move from last dialogue. "
                 "Try `stac-util move` instead")
    elif turn.span.char_end != dialogue.span.char_end:
        sys.exit("Turn %d %s is not at the end of its dialogue %s" %
                 (st.turn_id(turn), turn.span, dialogue.span))

    # distance from the end of this turn back to the end of the
    # previous one
    offset = prev_turn.span.char_end - turn.span.char_end
    # shift both dialogue boundaries by that amount, so that this
    # dialogue now ends where the previous turn ends
    next_dialogue.span.char_start += offset
    dialogue.span.char_end += offset
    return Span.merge_all([dialogue.span, next_dialogue.span])
Ejemplo n.º 11
0
def _nudge_up(turn, dialogue, next_turn, prev_dialogue):
    """
    Move first turn to previous dialogue (ie. extend the
    previous dialogue to incorporate this turn, and push
    this dialogue to exclude it)

    The spans of `prev_dialogue` and `dialogue` are modified in place.

    Exits through `sys.exit` when there is no next turn, no previous
    dialogue, or when the turn does not start one character after the
    start of its dialogue.

    Return encompassing span to show what we've changed
    """
    if not next_turn:
        sys.exit("Can't move very last turn. "
                 "Try `stac-util merge-dialogue` instead")
    elif not prev_dialogue:
        # the two literals are concatenated: keep the separating space
        sys.exit("Can't move from first dialogue. "
                 "Try `stac-util move` instead")
    elif turn.span.char_start - 1 != dialogue.span.char_start:
        sys.exit("Turn %d %s is not at the start of its dialogue %s" %
                 (st.turn_id(turn), turn.span, dialogue.span))

    # distance from the start of this turn to the start of the next one
    offset = next_turn.span.char_start - turn.span.char_start
    # shift both dialogue boundaries by that amount
    prev_dialogue.span.char_end += offset
    dialogue.span.char_start += offset
    return Span.merge_all([prev_dialogue.span, dialogue.span])
Ejemplo n.º 12
0
def _nudge_dialogue(doc, tid, direction):
    """
    Move the turn with id `tid` either "up" or "down" by dispatching
    to `_nudge_up` or `_nudge_down`.

    The turn and the dialogue enclosing its span are located with
    `_window1` (presumably yielding the previous, matching and next
    item). Exits through `sys.exit` when no turn matches, and raises
    `Exception` for any other `direction`.

    For feedback purposes, return the span of the affected region
    """
    all_turns = [unit for unit in doc.units if st.is_turn(unit)]
    prev_turn, turn, next_turn = _window1(
        lambda cand: st.turn_id(cand) == tid, all_turns)
    if not turn:
        sys.exit("Could not find turn %d" % tid)

    tspan = turn.text_span()
    all_dialogues = [unit for unit in doc.units if st.is_dialogue(unit)]
    prev_dialogue, dialogue, next_dialogue = _window1(
        lambda cand: cand.text_span().encloses(tspan), all_dialogues)

    if direction == "up":
        return _nudge_up(turn, dialogue, next_turn, prev_dialogue)
    if direction == "down":
        return _nudge_down(turn, dialogue, prev_turn, next_dialogue)
    raise Exception("Unknown direction " + direction)
Ejemplo n.º 13
0
def run_pipeline(corpus, outdir, corenlp_dir, split=False):
    """
    Run the standard corenlp pipeline on all the (unannotated) documents
    in the corpus and save the results in the specified directory.

    If `split=True`, we output one file per turn, an experimental mode
    to account for switching between multiple speakers.  We don't have
    all the infrastructure to read these back in (it should just be a
    matter of some filename manipulation though) and hope to flesh this
    out later.  We also intend to tweak the notion of splitting
    by aggregating consecutive turns with the same speaker, which may
    somewhat mitigate the loss of coreference information.

    Parameters
    ----------
    corpus
        Mapping from document keys to documents. Keys are read for
        their `doc` attribute and copied with `stage`/`annotator`
        reset; documents are read for their `units`.
    outdir
        Output directory. The intermediate turn text files are written
        under its `tmp` subdirectory, and the parsed files are moved to
        the paths given by `parsed_file_name(k, outdir)`.
    corenlp_dir
        Passed to `CoreNlpWrapper`.
    split
        Write one text file per turn instead of one per document.
    """

    if split:
        # for each document, how many digits do we need to represent the
        # turns in that document; for essentially cosmetic purposes
        # (padding)
        digits = {}
        for d in frozenset([k.doc for k in corpus]):
            turns = []
            for k in corpus:
                if k.doc == d:
                    turns.extend([x for x in corpus[k].units
                                  if stac.is_turn(x)])
            # NOTE(review): the `[0]` indexing is kept as found; confirm
            # what stac.turn_id returns for a turn
            turn_ids = [stac.turn_id(t)[0] for t in turns]
            # Width of the largest turn id once printed. Counting
            # characters is exact, whereas ceil(log10(n)) comes out one
            # short when n is an exact power of ten (100 needs three
            # digits, not two). A document without any turn simply gets
            # the minimum width instead of crashing on max([]).
            widest = len(str(max(turn_ids))) if turn_ids else 0
            digits[d] = max(2, widest)

    # dump the turn text
    # TODO: aggregate consecutive turns by same speaker
    txt_files = []
    for k in corpus:
        doc = corpus[k]

        # key used only to name the intermediate text files
        k_txt = copy.copy(k)
        k_txt.stage = 'turns'
        k_txt.annotator = None

        if split:
            # one text file per turn, suffixed with the padded turn id
            nb_digits = digits[k.doc]
            for tid, ttext in turn_id_text(doc):
                root = stac.id_to_path(k_txt) + '_' + tid.zfill(nb_digits)

                txt_file = os.path.join(outdir, 'tmp', root + '.txt')
                txt_dir = os.path.split(txt_file)[0]
                if not os.path.exists(txt_dir):
                    os.makedirs(txt_dir)

                with codecs.open(txt_file, 'w', 'utf-8') as f:
                    print(ttext, file=f)

                txt_files.append(txt_file)
        else:
            # one text file per document, one turn per line
            root = stac.id_to_path(k_txt)
            txt_file = os.path.join(outdir, 'tmp', root + '.txt')
            txt_dir = os.path.split(txt_file)[0]
            if not os.path.exists(txt_dir):
                os.makedirs(txt_dir)
            with codecs.open(txt_file, 'w', 'utf-8') as f:
                for _, ttext in turn_id_text(doc):
                    print(ttext, file=f)
            txt_files.append(txt_file)

    # run CoreNLP
    corenlp_wrapper = CoreNlpWrapper(corenlp_dir)
    # in the one-file-per-document mode each line holds one turn, so
    # sentence splitting is restricted to line ends
    corenlp_props = [] if split else ['ssplit.eolonly=true']
    corenlp_outdir = corenlp_wrapper.process(txt_files, outdir,
                                             properties=corenlp_props)

    # corenlp dumps all the output into one flat directory;
    # move them to the standard STAC layout paths
    for sfile in os.listdir(corenlp_outdir):
        if os.path.splitext(sfile)[1] != '.xml':
            continue
        from_path = os.path.join(corenlp_outdir, sfile)
        # targeted (STAC) filename
        # NOTE(review): tid plays no part in the target path; confirm
        # that per-turn outputs cannot collide in split mode
        k, tid = from_corenlp_output_filename(sfile)
        to_path = parsed_file_name(k, outdir)
        to_dir = os.path.dirname(to_path)
        if not os.path.exists(to_dir):
            os.makedirs(to_dir)
        os.rename(from_path, to_path)
Ejemplo n.º 14
0
 def turn_id(anno):
     """Return `stac.turn_id(anno)` when `anno` is in `contexts`, else None"""
     if anno in contexts:
         return stac.turn_id(anno)
     return None
Ejemplo n.º 15
0
def run_pipeline(corpus, outdir, corenlp_dir, split=False):
    """
    Run the standard corenlp pipeline on all the (unannotated) documents
    in the corpus and save the results in the specified directory.

    If `split=True`, we output one file per turn, an experimental mode
    to account for switching between multiple speakers.  We don't have
    all the infrastructure to read these back in (it should just be a
    matter of some filename manipulation though) and hope to flesh this
    out later.  We also intend to tweak the notion of splitting
    by aggregating consecutive turns with the same speaker, which may
    somewhat mitigate the loss of coreference information.

    Parameters
    ----------
    corpus
        Mapping from document keys to documents. Keys are read for
        their `doc` attribute and copied with `stage`/`annotator`
        reset; documents are read for their `units`.
    outdir
        Output directory. The intermediate turn text files are written
        under its `tmp` subdirectory, and the parsed files are moved to
        the paths given by `parsed_file_name(k, outdir)`.
    corenlp_dir
        Passed to `CoreNlpWrapper`.
    split
        Write one text file per turn instead of one per document.
    """

    if split:
        # for each document, how many digits do we need to represent the
        # turns in that document; for essentially cosmetic purposes
        # (padding)
        digits = {}
        for d in frozenset([k.doc for k in corpus]):
            turns = []
            for k in corpus:
                if k.doc == d:
                    turns.extend(
                        [x for x in corpus[k].units if stac.is_turn(x)])
            # NOTE(review): the `[0]` indexing is kept as found; confirm
            # what stac.turn_id returns for a turn
            turn_ids = [stac.turn_id(t)[0] for t in turns]
            # Width of the largest turn id once printed. Counting
            # characters is exact, whereas ceil(log10(n)) comes out one
            # short when n is an exact power of ten (100 needs three
            # digits, not two). A document without any turn simply gets
            # the minimum width instead of crashing on max([]).
            widest = len(str(max(turn_ids))) if turn_ids else 0
            digits[d] = max(2, widest)

    # dump the turn text
    # TODO: aggregate consecutive turns by same speaker
    txt_files = []
    for k in corpus:
        doc = corpus[k]

        # key used only to name the intermediate text files
        k_txt = copy.copy(k)
        k_txt.stage = 'turns'
        k_txt.annotator = None

        if split:
            # one text file per turn, suffixed with the padded turn id
            nb_digits = digits[k.doc]
            for tid, ttext in turn_id_text(doc):
                root = stac.id_to_path(k_txt) + '_' + tid.zfill(nb_digits)

                txt_file = os.path.join(outdir, 'tmp', root + '.txt')
                txt_dir = os.path.split(txt_file)[0]
                if not os.path.exists(txt_dir):
                    os.makedirs(txt_dir)

                with codecs.open(txt_file, 'w', 'utf-8') as f:
                    print(ttext, file=f)

                txt_files.append(txt_file)
        else:
            # one text file per document, one turn per line
            root = stac.id_to_path(k_txt)
            txt_file = os.path.join(outdir, 'tmp', root + '.txt')
            txt_dir = os.path.split(txt_file)[0]
            if not os.path.exists(txt_dir):
                os.makedirs(txt_dir)
            with codecs.open(txt_file, 'w', 'utf-8') as f:
                for _, ttext in turn_id_text(doc):
                    print(ttext, file=f)
            txt_files.append(txt_file)

    # run CoreNLP
    corenlp_wrapper = CoreNlpWrapper(corenlp_dir)
    # in the one-file-per-document mode each line holds one turn, so
    # sentence splitting is restricted to line ends
    corenlp_props = [] if split else ['ssplit.eolonly=true']
    corenlp_outdir = corenlp_wrapper.process(txt_files,
                                             outdir,
                                             properties=corenlp_props)

    # corenlp dumps all the output into one flat directory;
    # move them to the standard STAC layout paths
    for sfile in os.listdir(corenlp_outdir):
        if os.path.splitext(sfile)[1] != '.xml':
            continue
        from_path = os.path.join(corenlp_outdir, sfile)
        # targeted (STAC) filename
        # NOTE(review): tid plays no part in the target path; confirm
        # that per-turn outputs cannot collide in split mode
        k, tid = from_corenlp_output_filename(sfile)
        to_path = parsed_file_name(k, outdir)
        to_dir = os.path.dirname(to_path)
        if not os.path.exists(to_dir):
            os.makedirs(to_dir)
        os.rename(from_path, to_path)
Ejemplo n.º 16
0
 def turn_id(anno):
     """Return `stac.turn_id(anno)` when `anno` is in `contexts`, else None"""
     if anno in contexts:
         return stac.turn_id(anno)
     return None