# Imports shared by all four revisions below; the module paths follow
# the layout of the educe code base (an assumption -- adjust them if
# these snippets live elsewhere).
from collections import defaultdict

import nltk.tree

from educe import stac
from educe.external.coref import Chain, Mention
from educe.external.corenlp import CoreNlpDocument, CoreNlpToken
from educe.external.parser import ConstituencyTree, DependencyTree


def read_corenlp_result(doc, corenlp_doc, tid=None):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe Document
        The original document the CoreNLP output corresponds to.
    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document.
    tid: string, optional
        Turn id; if given, read only the turn with this id.

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information.
    """
    def is_matching_turn(x):
        """Check whether x corresponds to the current turn"""
        if tid is None:
            return stac.is_turn(x)
        else:
            x_tid = stac.turn_id(x)
            # `and`, not `&`: the bitwise form parses as
            # `(stac.is_turn(x) & tid) == x_tid` because `&` binds
            # tighter than `==`
            return stac.is_turn(x) and tid == x_tid

    turns = sorted((x for x in doc.units if is_matching_turn(x)),
                   key=lambda k: k.span)
    sentences = corenlp_doc.get_ordered_sentence_list()

    if len(turns) != len(sentences):
        msg = ('Uh-oh, mismatch between number of turns in the corpus '
               '(%d) and parsed sentences (%d) %s'
               % (len(turns), len(sentences), doc.origin))
        raise Exception(msg)

    sentence_toks = defaultdict(list)
    for t in corenlp_doc.get_ordered_token_list():
        sid = t['s_id']
        sentence_toks[sid].append(t)

    # build dict from sid to (dict from tid to fancy token)
    educe_tokens = defaultdict(dict)
    for turn, sent in zip(turns, sentences):
        sid = sent['id']
        # the token offsets are global, i.e. for all sentences/turns
        # in the file; so we have to shift them to the left to zero
        # them, then shift them back to the right
        sentence_begin = min(t['extent'][0] for t in sentence_toks[sid])
        ttext = doc.text(turn.text_span())
        offset = (turn.span.char_start
                  + len(stac.split_turn_text(ttext)[0])
                  - sentence_begin)
        for t in sentence_toks[sid]:
            tok_id = t['id']  # renamed from `tid`: avoid shadowing the parameter
            educe_tokens[sid][tok_id] = CoreNlpToken(t, offset)

    all_tokens = []
    all_trees = []
    all_dtrees = []
    for turn, sent in zip(turns, sentences):
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # FIXME tokens are probably not properly ordered because token
        # ids are global ids, i.e. strings like "1-18" (sentence 1,
        # token 18), which means basic sorting ranks "1-10" before
        # "1-2"; cf. educe.rst_dt.corenlp
        sorted_tokens = [tokens_dict[x] for x in sorted(tokens_dict.keys())]
        # end FIXME
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        educe_tree = ConstituencyTree.build(tree, sorted_tokens)
        deps = defaultdict(list)
        for ty, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((ty, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')
        all_tokens.extend(sorted_tokens)
        all_trees.append(educe_tree)
        all_dtrees.append(educe_dtree)

    all_chains = []
    for chain in corenlp_doc.get_coref_chains():
        mentions = []
        for m in chain:
            sid = m['sentence']
            local_id = lambda x: int(x[len(sid) + 1:])
            global_id = lambda x: sid + '-' + str(x)
            start = local_id(m['start'])
            end = local_id(m['end'])
            token_range = [global_id(x) for x in range(start, end)]
            tokens = [educe_tokens[sid][t] for t in token_range]
            head = educe_tokens[sid][m['head']]
            mentions.append(Mention(tokens, head, m['most_representative']))
        all_chains.append(Chain(mentions))

    return CoreNlpDocument(all_tokens, all_trees, all_dtrees, all_chains)
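# A quick illustration of the FIXME above: the global token ids are
# strings, so a plain `sorted()` orders them lexicographically rather
# than numerically; keying the sort on the integer part of the id (as
# the next revision does) restores the intended order.
ids = ['1-1', '1-2', '1-10', '1-18']
print(sorted(ids))
# -> ['1-1', '1-10', '1-18', '1-2']   ("1-10" ranks before "1-2")
print(sorted(ids, key=lambda x: int(x.split('-')[1])))
# -> ['1-1', '1-2', '1-10', '1-18']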
def read_corenlp_result(doc, corenlp_doc):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe.rst_dt.document_plus.DocumentPlus
        The original document (currently unused; it could be necessary
        to determine e.g. token offsets for specific file formats. If
        it never gets used, this function should probably become the
        generic default and be moved to `educe.external.corenlp`).
    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document.

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information.
    """
    # sentences
    sentences = corenlp_doc.get_ordered_sentence_list()

    # tokens
    sentence_toks = defaultdict(list)
    for tok in corenlp_doc.get_ordered_token_list():
        sid = tok['s_id']
        sentence_toks[sid].append(tok)

    # educe tokens
    educe_tokens = defaultdict(dict)
    for sent in sentences:
        sid = sent['id']
        sent_toks = sentence_toks[sid]
        offset = 0  # was: sent_begin
        for tok in sent_toks:
            tid = tok['id']
            educe_tokens[sid][tid] = CoreNlpToken(tok, offset)

    # educe tokens, ctrees and dtrees
    all_tokens = []
    all_ctrees = []
    all_dtrees = []
    for sent in sentences:
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # NEW extract local id to properly sort tokens
        tok_local_id = lambda x: int(x[len(sid) + 1:])
        sorted_tokens = [tokens_dict[x]
                         for x in sorted(tokens_dict, key=tok_local_id)]
        # ctree
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        educe_ctree = ConstituencyTree.build(tree, sorted_tokens)
        # dtree
        deps = defaultdict(list)
        for lbl, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((lbl, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')
        # store educe tokens, ctrees and dtrees
        all_tokens.extend(sorted_tokens)
        all_ctrees.append(educe_ctree)
        all_dtrees.append(educe_dtree)

    # coreference chains
    all_chains = []
    for chain in corenlp_doc.get_coref_chains():
        mentions = []
        for mntn in chain:
            sid = mntn['sentence']
            # helper functions to extract local ids and generate global ids
            local_id = lambda x: int(x[len(sid) + 1:])
            global_id = lambda x: sid + '-' + str(x)
            # retrieve tokens for this mention
            start = local_id(mntn['start'])
            end = local_id(mntn['end'])
            tokens = [educe_tokens[sid][global_id(tok_idx)]
                      for tok_idx in range(start, end)]
            head = educe_tokens[sid][mntn['head']]
            mentions.append(Mention(tokens, head,
                                    mntn['most_representative']))
        all_chains.append(Chain(mentions))

    corenlp_doc = CoreNlpDocument(all_tokens, all_ctrees, all_dtrees,
                                  all_chains)
    return corenlp_doc
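# A minimal sketch of the dependency input consumed above.  The shape
# of `sent['dependencies']` -- (label, governor id, dependent id)
# triples, with "<sid>-0" as the artificial root governor passed to
# `DependencyTree.build` -- is read off the loop; the concrete values
# here are made up for illustration.
from collections import defaultdict

sent_deps = [('root', '1-0', '1-2'),
             ('nsubj', '1-2', '1-1'),
             ('dobj', '1-2', '1-3')]
deps = defaultdict(list)
for lbl, gov_id, dep_id in sent_deps:
    deps[gov_id].append((lbl, dep_id))
print(deps['1-0'])  # -> [('root', '1-2')]: the real root hangs off '1-0'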
def read_corenlp_result(doc, corenlp_doc):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe.rst_dt.document_plus.DocumentPlus
        The original document (currently unused; it could be necessary
        to determine e.g. token offsets for specific file formats. If
        it never gets used, this function should probably become the
        generic default and be moved to `educe.external.corenlp`).
    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document.

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information.
    """
    # sentences
    sentences = corenlp_doc.get_ordered_sentence_list()

    # tokens
    sentence_toks = defaultdict(list)
    for tok in corenlp_doc.get_ordered_token_list():
        sid = tok['s_id']
        sentence_toks[sid].append(tok)

    # educe tokens
    educe_tokens = defaultdict(dict)
    for sent in sentences:
        sid = sent['id']
        sent_toks = sentence_toks[sid]
        offset = 0  # was: sent_begin
        for tok in sent_toks:
            tid = tok['id']
            educe_tokens[sid][tid] = CoreNlpToken(tok, offset)

    # educe tokens, ctrees and dtrees
    all_tokens = []
    all_ctrees = []
    all_dtrees = []
    for sent in sentences:
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # NEW extract local id to properly sort tokens
        tok_local_id = lambda x: int(x[len(sid) + 1:])
        sorted_tokens = [tokens_dict[x]
                         for x in sorted(tokens_dict, key=tok_local_id)]
        # ctree
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        # FIXME 2016-06-13 skip the ROOT node, as in PTB;
        # maybe we'd better add ROOT to the empty parentheses in the
        # PTB version, but just getting rid of ROOT here seems simpler:
        # the type of the root node of a tree is informative: usually
        # S, but more interestingly SINV, NP...
        if tree.label() != 'ROOT' or len(tree) > 1:
            print(tree)
            raise ValueError('Atypical root of CoreNLP tree')
        tree = tree[0]  # go down from ROOT to the real root
        educe_ctree = ConstituencyTree.build(tree, sorted_tokens)
        # dtree
        deps = defaultdict(list)
        for lbl, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((lbl, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')
        # store educe tokens, ctrees and dtrees
        all_tokens.extend(sorted_tokens)
        all_ctrees.append(educe_ctree)
        all_dtrees.append(educe_dtree)

    # coreference chains
    all_chains = []
    for chain in corenlp_doc.get_coref_chains():
        mentions = []
        for mntn in chain:
            sid = mntn['sentence']
            # helper functions to extract local ids and generate global ids
            local_id = lambda x: int(x[len(sid) + 1:])
            global_id = lambda x: sid + '-' + str(x)
            # retrieve tokens for this mention
            start = local_id(mntn['start'])
            end = local_id(mntn['end'])
            tokens = [educe_tokens[sid][global_id(tok_idx)]
                      for tok_idx in range(start, end)]
            head = educe_tokens[sid][mntn['head']]
            mentions.append(Mention(tokens, head,
                                    mntn['most_representative']))
        all_chains.append(Chain(mentions))

    corenlp_doc = CoreNlpDocument(all_tokens, all_ctrees, all_dtrees,
                                  all_chains)
    return corenlp_doc
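# The ROOT guard above encodes the expectation that CoreNLP wraps every
# parse in an extra ROOT node with exactly one child, so descending one
# level exposes the informative label (usually S, but also SINV, NP,
# ...).  A small self-contained check:
import nltk.tree

t = nltk.tree.Tree.fromstring('(ROOT (S (NP (PRP I)) (VP (VBP agree))))')
assert t.label() == 'ROOT' and len(t) == 1
print(t[0].label())  # -> 'S', the "real" root used to build the ctree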
def read_corenlp_result(doc, corenlp_doc):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe.rst_dt.document_plus.DocumentPlus
        The original document (currently unused; it could be necessary
        to determine e.g. token offsets for specific file formats. If
        it never gets used, this function should probably become the
        generic default and be moved to `educe.external.corenlp`).
    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document.

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information.
    """
    # sentences
    sentences = corenlp_doc.get_ordered_sentence_list()

    # tokens
    sentence_toks = defaultdict(list)
    for tok in corenlp_doc.get_ordered_token_list():
        sid = tok['s_id']
        sentence_toks[sid].append(tok)

    # educe tokens
    educe_tokens = defaultdict(dict)
    for sent in sentences:
        sid = sent['id']
        sent_toks = sentence_toks[sid]
        offset = 0  # was: sent_begin
        for tok in sent_toks:
            tid = tok['id']
            educe_tokens[sid][tid] = CoreNlpToken(tok, offset)

    # educe tokens, ctrees and dtrees
    all_tokens = []
    all_ctrees = []
    all_dtrees = []
    for sent in sentences:
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # sort tokens by their (integer) local id
        tok_local_id = tok_lid(sid)
        sorted_tokens = [tokens_dict[x]
                         for x in sorted(tokens_dict, key=tok_local_id)]
        # ctree
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        # FIXME 2016-06-13 skip the ROOT node, as in PTB;
        # maybe we'd better add ROOT to the empty parentheses in the
        # PTB version, but just getting rid of ROOT here seems simpler:
        # the type of the root node of a tree is informative: usually
        # S, but more interestingly SINV, NP...
        if tree.label() != 'ROOT' or len(tree) > 1:
            print(tree)
            raise ValueError('Atypical root of CoreNLP tree')
        tree = tree[0]  # go down from ROOT to the real root
        educe_ctree = ConstituencyTree.build(tree, sorted_tokens)
        # dtree
        deps = defaultdict(list)
        for lbl, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((lbl, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')
        # store educe tokens, ctrees and dtrees
        all_tokens.extend(sorted_tokens)
        all_ctrees.append(educe_ctree)
        all_dtrees.append(educe_dtree)

    # coreference chains
    all_chains = []
    for chain in corenlp_doc.get_coref_chains():
        mentions = []
        for mntn in chain:
            sid = mntn['sentence']
            # helper functions to map from/to local and global ids
            tok_local_id = tok_lid(sid)
            tok_global_id = tok_gid(sid)
            # retrieve tokens for this mention
            start = tok_local_id(mntn['start'])
            end = tok_local_id(mntn['end'])
            tokens = [educe_tokens[sid][tok_global_id(tok_idx)]
                      for tok_idx in range(start, end)]
            head = educe_tokens[sid][mntn['head']]
            mentions.append(Mention(tokens, head,
                                    mntn['most_representative']))
        all_chains.append(Chain(mentions))

    corenlp_doc = CoreNlpDocument(all_tokens, all_ctrees, all_dtrees,
                                  all_chains)
    return corenlp_doc
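# This last revision factors the id-conversion lambdas out into
# `tok_lid` and `tok_gid` factories, whose definitions are not shown.
# A minimal sketch consistent with the inline lambdas of the earlier
# revisions (the factoring into module-level helpers is an assumption):

def tok_lid(sid):
    """Return a function mapping a global token id to its (integer)
    local id, e.g. for sid='1', '1-18' -> 18."""
    return lambda gid: int(gid[len(sid) + 1:])


def tok_gid(sid):
    """Return a function mapping a local token id to its global id,
    e.g. for sid='1', 18 -> '1-18'."""
    return lambda lid: sid + '-' + str(lid)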