def is_matching_turn(x):
    """Check whether annotation `x` corresponds to the current turn.

    If `tid` (free variable from the enclosing scope) is None, any turn
    matches; otherwise `x` must be a turn whose turn id equals `tid`.
    """
    if tid is None:
        return stac.is_turn(x)
    else:
        x_tid = stac.turn_id(x)
        # BUG FIX: the original read `stac.is_turn(x) & tid == x_tid`;
        # `&` binds tighter than `==`, so that evaluated
        # `(stac.is_turn(x) & tid) == x_tid`, which is wrong whenever
        # tid != 0/1.  Logical `and` expresses the intended test.
        return stac.is_turn(x) and tid == x_tid
def is_matching_turn(x):
    """Check whether annotation `x` corresponds to the current turn.

    If `tid` (free variable from the enclosing scope) is None, any turn
    matches; otherwise `x` must be a turn whose 'Identifier' feature
    equals `tid`.
    """
    if tid is None:
        return stac.is_turn(x)
    else:
        # NOTE(review): feature values are typically strings — confirm
        # that `tid` is normalised to the same type by the caller
        x_tid = x.features['Identifier']
        # BUG FIX: the original read `stac.is_turn(x) & tid == x_tid`;
        # `&` binds tighter than `==`, so that evaluated
        # `(stac.is_turn(x) & tid) == x_tid`.  Use logical `and`.
        return stac.is_turn(x) and tid == x_tid
def read_tags(corpus, root_dir):
    """
    Read stored POS tagger output from a directory, and convert them to
    educe.annotation.Standoff objects.

    Return a dictionary mapping 'FileId's to sets of tokens.
    """
    pos_tags = {}
    for key in corpus:
        doc = corpus[key]
        turns = sorted_by_span(ann for ann in doc.units if stac.is_turn(ann))
        raw_toks = ext.read_token_file(tagger_file_name(key, root_dir))
        doc_tokens = []
        # tagger output is one segment per turn, in the same span order
        for turn, seg in zip(turns, raw_toks):
            prefix, body = stac.split_turn_text(doc.text(turn.text_span()))
            offset = turn.span.char_start + len(prefix)
            for tok in ext.token_spans(body, seg, offset):
                tok.origin = doc
                # sanity check: the computed span must point at the word
                assert doc.text(tok.text_span()) == tok.word
                doc_tokens.append(tok)
        pos_tags[key] = doc_tokens
    return pos_tags
def turn_id_text(doc):
    """
    Return a list of (turn id, text) tuples in span order (no speaker).

    Parameters
    ----------
    doc :
        Document whose `units` are scanned for turns.

    Returns
    -------
    list of (turn id, string) pairs, one per turn, ordered by text span.
    """
    # consistency: use the same `sorted_by_span` helper as the other
    # turn-extraction functions in this module instead of an ad-hoc
    # `sorted(..., key=lambda k: k.text_span())`
    turns = sorted_by_span(x for x in doc.units if stac.is_turn(x))
    return [(stac.turn_id(turn),
             stac.split_turn_text(doc.text(turn.text_span()))[1])
            for turn in turns]
def extract_turns(doc):
    """
    Return a string representation of the document's turn text
    for use by a tagger
    """
    def turn_body(turn):
        # text of the turn minus its speaker prefix
        return stac.split_turn_text(doc.text(turn.text_span()))[1]

    ordered = sorted_by_span(u for u in doc.units if stac.is_turn(u))
    return "\n".join(turn_body(t) for t in ordered)
def _split_dialogue(tcache, doc, tid):
    """Split a dialogue at a turn

    Turns at or after the given tid are pushed into a new empty dialogue.

    Returns
    -------
    Span for the dialogue that was split
    """
    turn_descr = "turn %d" % tid
    # locate the unique turn with this id, then the unique dialogue
    # whose span encloses it
    turn = _the(turn_descr,
                [u for u in doc.units
                 if st.is_turn(u) and st.turn_id(u) == tid])
    dialogue = _the("dialogue for " + turn_descr,
                    [u for u in doc.units
                     if st.is_dialogue(u) and u.encloses(turn)])
    affected_span = dialogue.text_span()
    _actually_split(tcache, doc, dialogue, turn)
    return affected_span
def _split_dialogue(tcache, doc, tid):
    """Split a dialogue at a turn

    Turns at or after the given tid are pushed into a new empty dialogue.

    Returns
    -------
    Span for the dialogue that was split
    """
    label_turn = 'turn {}'.format(tid)
    label_dialogue = 'dialogue for ' + label_turn
    matching_turns = [u for u in doc.units
                      if st.is_turn(u) and st.turn_id(u) == tid]
    turn = _the(label_turn, matching_turns)
    # the (unique) dialogue whose span contains that turn
    enclosing = [u for u in doc.units
                 if st.is_dialogue(u) and u.encloses(turn)]
    dialogue = _the(label_dialogue, enclosing)
    result_span = dialogue.text_span()
    _actually_split(tcache, doc, dialogue, turn)
    return result_span
def read_tags(corpus, root_dir):
    """
    Read stored POS tagger output from a directory, and convert them to
    educe.annotation.Standoff objects.

    Return a dictionary mapping 'FileId's to sets of tokens.

    Parameters
    ----------
    corpus : dict(FileId, GlozzDocument)
        Dictionary of documents keyed by their FileId.
    root_dir : str
        Path to the directory containing the output of the POS tagger,
        one file per document.

    Returns
    -------
    pos_tags : dict(FileId, list(Token))
        Map from each document id to the list of tokens predicted by a
        POS tagger.
    """
    pos_tags = {}
    for key in corpus:
        doc = corpus[key]
        turns = sorted_by_span(ann for ann in doc.units if stac.is_turn(ann))
        raw_toks = ext.read_token_file(tagger_file_name(key, root_dir))
        tokens = []
        # one tagged segment per turn, matched up in span order
        for turn, seg in zip(turns, raw_toks):
            turn_text = doc.text(turn.text_span())
            prefix, body = stac.split_turn_text(turn_text)
            span_start = turn.span.char_start + len(prefix)
            turn_toks = ext.token_spans(body, seg, span_start)
            for tok in turn_toks:
                tok.origin = doc
                # the computed span must point back at the token's word
                assert doc.text(tok.text_span()) == tok.word
            tokens.extend(turn_toks)
        pos_tags[key] = tokens
    return pos_tags
def _nudge_dialogue(doc, tid, direction):
    """
    Move a turn either up or down.
    For feedback purposes, return the span of the affected region
    """
    all_turns = [u for u in doc.units if st.is_turn(u)]
    prev_turn, turn, next_turn = _window1(
        lambda u: st.turn_id(u) == tid, all_turns)
    if not turn:
        sys.exit("Could not find turn %d" % tid)
    tspan = turn.text_span()
    all_dialogues = [u for u in doc.units if st.is_dialogue(u)]
    prev_dialogue, dialogue, next_dialogue = _window1(
        lambda u: u.text_span().encloses(tspan), all_dialogues)
    # dispatch on the requested direction (guard-clause style)
    if direction == "up":
        return _nudge_up(turn, dialogue, next_turn, prev_dialogue)
    if direction == "down":
        return _nudge_down(turn, dialogue, prev_turn, next_dialogue)
    raise Exception("Unknown direction " + direction)
def _nudge_dialogue(doc, tid, direction):
    """
    Move a turn either up or down.
    For feedback purposes, return the span of the affected region
    """
    turns = [x for x in doc.units if st.is_turn(x)]
    before_t, this_turn, after_t = _window1(
        lambda x: st.turn_id(x) == tid, turns)
    if not this_turn:
        sys.exit("Could not find turn %d" % tid)
    span = this_turn.text_span()
    dialogues = [x for x in doc.units if st.is_dialogue(x)]
    before_d, this_dlg, after_d = _window1(
        lambda x: x.text_span().encloses(span), dialogues)
    if direction == "up":
        return _nudge_up(this_turn, this_dlg, after_t, before_d)
    elif direction == "down":
        return _nudge_down(this_turn, this_dlg, before_t, after_d)
    else:
        raise Exception("Unknown direction " + direction)
def is_matching_turn(x):
    """Check whether annotation `x` corresponds to the current turn.

    If `tid` (free variable from the enclosing scope) is None, any turn
    matches; otherwise `x` must be a turn whose 'Identifier' feature
    equals `tid`.
    """
    if tid is None:
        return stac.is_turn(x)
    else:
        x_tid = x.features['Identifier']
        # BUG FIX: the original read `stac.is_turn(x) & tid == x_tid`;
        # `&` binds tighter than `==`, so that evaluated
        # `(stac.is_turn(x) & tid) == x_tid`.  Use logical `and`.
        return stac.is_turn(x) and tid == x_tid
def run_pipeline(corpus, outdir, corenlp_dir, split=False):
    """
    Run the standard corenlp pipeline on all the (unannotated) documents
    in the corpus and save the results in the specified directory.

    If `split=True`, we output one file per turn, an experimental mode
    to account for switching between multiple speakers.  We don't have
    all the infrastructure to read these back in (it should just be a
    matter of some filename manipulation though) and hope to flesh this
    out later.  We also intend to tweak the notion of splitting by
    aggregating consecutive turns with the same speaker, which may
    somewhat mitigate the loss of coreference information.
    """
    if split:
        # for each document, how many digits do we need to represent the
        # turns in that document; for essentially cosmetic purposes
        # (padding)
        digits = {}
        for d in frozenset([k.doc for k in corpus]):
            turns = []
            for k in corpus:
                if k.doc == d:
                    turns.extend(
                        [x for x in corpus[k].units if stac.is_turn(x)])
            # NOTE(review): `stac.turn_id(t)[0]` suggests turn_id returns
            # a sequence here, while `tid.zfill(...)` below treats the id
            # as a string — confirm the two agree
            turn_ids = [stac.turn_id(t)[0] for t in turns]
            # at least 2 digits of zero-padding, enough for the max id
            digits[d] = max(2, int(math.ceil(math.log10(max(turn_ids)))))
    # dump the turn text
    # TODO: aggregate consecutive turns by same speaker
    txt_files = []
    for k in corpus:
        doc = corpus[k]
        # derive a 'turns'-stage key so id_to_path gives us a fresh
        # filename that cannot clash with annotated stages
        k_txt = copy.copy(k)
        k_txt.stage = 'turns'
        k_txt.annotator = None
        if split:
            # one temp file per turn, zero-padded so files sort nicely
            nb_digits = digits[k.doc]
            for tid, ttext in turn_id_text(doc):
                root = stac.id_to_path(k_txt) + '_' + tid.zfill(nb_digits)
                txt_file = os.path.join(outdir, 'tmp', root + '.txt')
                txt_dir = os.path.split(txt_file)[0]
                if not os.path.exists(txt_dir):
                    os.makedirs(txt_dir)
                with codecs.open(txt_file, 'w', 'utf-8') as f:
                    print(ttext, file=f)
                txt_files.append(txt_file)
        else:
            # one temp file per document, one turn per line
            root = stac.id_to_path(k_txt)
            txt_file = os.path.join(outdir, 'tmp', root + '.txt')
            txt_dir = os.path.split(txt_file)[0]
            if not os.path.exists(txt_dir):
                os.makedirs(txt_dir)
            with codecs.open(txt_file, 'w', 'utf-8') as f:
                for _, ttext in turn_id_text(doc):
                    print(ttext, file=f)
            txt_files.append(txt_file)
    # run CoreNLP
    corenlp_wrapper = CoreNlpWrapper(corenlp_dir)
    # in per-document mode each line is a turn, so force sentence
    # splitting on newlines only; per-turn files need no such constraint
    corenlp_props = [] if split else ['ssplit.eolonly=true']
    corenlp_outdir = corenlp_wrapper.process(txt_files, outdir,
                                             properties=corenlp_props)
    # corenlp dumps all the output into one flat directory;
    # move them to the standard STAC layout paths
    for sfile in os.listdir(corenlp_outdir):
        if os.path.splitext(sfile)[1] != '.xml':
            continue
        from_path = os.path.join(corenlp_outdir, sfile)
        # targeted (STAC) filename (tid is unused here)
        k, tid = from_corenlp_output_filename(sfile)
        to_path = parsed_file_name(k, outdir)
        to_dir = os.path.dirname(to_path)
        if not os.path.exists(to_dir):
            os.makedirs(to_dir)
        os.rename(from_path, to_path)
def run_pipeline(corpus, outdir, corenlp_dir, split=False):
    """
    Run the standard corenlp pipeline on all the (unannotated) documents
    in the corpus and save the results in the specified directory.

    If `split=True`, we output one file per turn, an experimental mode
    to account for switching between multiple speakers.  We don't have
    all the infrastructure to read these back in (it should just be a
    matter of some filename manipulation though) and hope to flesh this
    out later.  We also intend to tweak the notion of splitting by
    aggregating consecutive turns with the same speaker, which may
    somewhat mitigate the loss of coreference information.
    """
    if split:
        # for each document, how many digits do we need to represent the
        # turns in that document; for essentially cosmetic purposes
        # (padding)
        digits = {}
        for d in frozenset([k.doc for k in corpus]):
            turns = []
            for k in corpus:
                if k.doc == d:
                    turns.extend(
                        [x for x in corpus[k].units if stac.is_turn(x)])
            # NOTE(review): `stac.turn_id(t)[0]` suggests turn_id returns
            # a sequence here, while `tid.zfill(...)` below treats the id
            # as a string — confirm the two agree
            turn_ids = [stac.turn_id(t)[0] for t in turns]
            # at least 2 digits of zero-padding, enough for the max id
            digits[d] = max(2, int(math.ceil(math.log10(max(turn_ids)))))
    # dump the turn text
    # TODO: aggregate consecutive turns by same speaker
    txt_files = []
    for k in corpus:
        doc = corpus[k]
        # derive a 'turns'-stage key so id_to_path gives us a fresh
        # filename that cannot clash with annotated stages
        k_txt = copy.copy(k)
        k_txt.stage = 'turns'
        k_txt.annotator = None
        if split:
            # one temp file per turn, zero-padded so files sort nicely
            nb_digits = digits[k.doc]
            for tid, ttext in turn_id_text(doc):
                root = stac.id_to_path(k_txt) + '_' + tid.zfill(nb_digits)
                txt_file = os.path.join(outdir, 'tmp', root + '.txt')
                txt_dir = os.path.split(txt_file)[0]
                if not os.path.exists(txt_dir):
                    os.makedirs(txt_dir)
                with codecs.open(txt_file, 'w', 'utf-8') as f:
                    print(ttext, file=f)
                txt_files.append(txt_file)
        else:
            # one temp file per document, one turn per line
            root = stac.id_to_path(k_txt)
            txt_file = os.path.join(outdir, 'tmp', root + '.txt')
            txt_dir = os.path.split(txt_file)[0]
            if not os.path.exists(txt_dir):
                os.makedirs(txt_dir)
            with codecs.open(txt_file, 'w', 'utf-8') as f:
                for _, ttext in turn_id_text(doc):
                    print(ttext, file=f)
            txt_files.append(txt_file)
    # run CoreNLP
    corenlp_wrapper = CoreNlpWrapper(corenlp_dir)
    # in per-document mode each line is a turn, so force sentence
    # splitting on newlines only; per-turn files need no such constraint
    corenlp_props = [] if split else ['ssplit.eolonly=true']
    corenlp_outdir = corenlp_wrapper.process(txt_files, outdir,
                                             properties=corenlp_props)
    # corenlp dumps all the output into one flat directory;
    # move them to the standard STAC layout paths
    for sfile in os.listdir(corenlp_outdir):
        if os.path.splitext(sfile)[1] != '.xml':
            continue
        from_path = os.path.join(corenlp_outdir, sfile)
        # targeted (STAC) filename (tid is unused here)
        k, tid = from_corenlp_output_filename(sfile)
        to_path = parsed_file_name(k, outdir)
        to_dir = os.path.dirname(to_path)
        if not os.path.exists(to_dir):
            os.makedirs(to_dir)
        os.rename(from_path, to_path)