def corenlp_to_xmltree(obj: Union[Dict, Sentence], prune_root: bool = True) -> XMLTree:
    """Transform a CoreNLP-style parse into an XMLTree.

    Transforms an object with CoreNLP dep_path and dep_parent attributes into
    an XMLTree. Will include elements of any array having the same dimension
    as dep_* as node attributes. Also adds special word_idx attribute
    corresponding to original sequence order in sentence.

    :param obj: A dictionary or Sentence; it is converted to a dictionary with
        ``get_as_dict`` before use. It must provide a ``dep_parents`` list and
        a ``words`` entry.
    :param prune_root: If True, childless children of the root are removed,
        and the root is replaced by its only child when exactly one remains.
    :raises ValueError: If ``dep_parents`` is missing, is not a list, or holds
        values that cannot be converted to ``int``. ``min()`` also raises
        ``ValueError`` when the list is empty.
    :return: An ``XMLTree`` wrapping the resulting root element.
    """
    # Convert input object to dictionary
    s: Dict = get_as_dict(obj)

    # Use the dep_parents array as a guide: ensure it is present and a list of
    # ints
    if not ("dep_parents" in s and isinstance(s["dep_parents"], list)):
        raise ValueError(
            "Input CoreNLP object must have a 'dep_parents' attribute which is a list"
        )
    try:
        dep_parents = list(map(int, s["dep_parents"]))
    except Exception as err:
        # Keep the original failure attached as the cause for easier debugging.
        raise ValueError("'dep_parents' attribute must be a list of ints") from err

    # Also ensure that we are using CoreNLP-native indexing
    # (root=0, 1-base word indexes)!
    b = min(dep_parents)
    if b != 0:
        dep_parents = [j - b for j in dep_parents]

    # Parse recursively
    root = corenlp_to_xmltree_sub(s, dep_parents, 0)

    # Often the return tree will have several roots, where one is the actual
    # root and the rest are just singletons not included in the dep tree
    # parse...
    # We optionally remove these singletons and then collapse the root if only
    # one child left.
    if prune_root:
        # Iterate over a snapshot of the children: removing from ``root``
        # while iterating over it directly can skip siblings, depending on the
        # element implementation.
        for c in list(root):
            if len(c) == 0:
                root.remove(c)
        if len(root) == 1:
            root = root.findall("./*")[0]
    return XMLTree(root, words=s["words"])
def extract_textual_features(
    candidates: Union[Candidate, List[Candidate]],
) -> Iterator[Tuple[int, str, int]]:
    """Extract textual features.

    Yields ``(candidate.id, feature_name, DEF_VALUE)`` triples. Feature names
    are prefixed with ``DDL_``, ``TDL_`` or ``BASIC_``; for candidates with
    more than one mention the ``DDL_`` and ``BASIC_`` names also carry the
    mention position (``e0``, ``e1``, ...).

    TreeDLib features are memoized in ``unary_tdl_feats`` (keyed by the span's
    ``stable_id``) and ``multinary_tdl_feats`` (keyed by ``candidate.id``).

    :param candidates: A candidate or a list of candidates to extract
        features from
    :raises ValueError: If the first mention context of a candidate is neither
        a ``SpanMention`` nor an ``ImplicitSpanMention``
    """
    if not isinstance(candidates, list):
        candidates = [candidates]

    for candidate in candidates:
        args = tuple(mention.context for mention in candidate.get_mentions())
        if not isinstance(args[0], (SpanMention, ImplicitSpanMention)):
            raise ValueError(
                f"Accepts Span/ImplicitSpan-type mentions, {type(args[0])}-type found."
            )

        # Unary candidates
        if len(args) == 1:
            span: Union[SpanMention, ImplicitSpanMention] = args[0]
            if span.sentence.is_lingual():
                get_tdl_feats = _compile_entity_feature_generator()
                xmltree = corenlp_to_xmltree(span.sentence)
                first_word = span.get_word_start_index()
                last_word = span.get_word_end_index()
                sidxs = list(range(first_word, last_word + 1))
                if sidxs:
                    # Add DDLIB entity features
                    sent_dict = get_as_dict(span.sentence)
                    for f in _get_ddlib_feats(span, sent_dict, sidxs):
                        yield candidate.id, f"DDL_{f}", DEF_VALUE

                    # Add TreeDLib entity features (computed once per span)
                    if span.stable_id not in unary_tdl_feats:
                        cached = unary_tdl_feats[span.stable_id] = set()
                        cached.update(get_tdl_feats(xmltree.root, sidxs))
                    for f in unary_tdl_feats[span.stable_id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE

            # Word features are emitted whether or not the sentence is lingual
            for f in _get_word_feats(span):
                yield candidate.id, f"BASIC_{f}", DEF_VALUE
            continue

        # Multinary candidates
        spans = args
        lingual_flags = [sp.sentence.is_lingual() for sp in spans]
        if all(lingual_flags):
            get_tdl_feats = compile_relation_feature_generator(is_multary=True)
            sents = [get_as_dict(sp.sentence) for sp in spans]
            # The dependency tree is built from the first span's sentence only
            xmltree = corenlp_to_xmltree(spans[0].sentence)
            s_idxs = [
                list(range(sp.get_word_start_index(), sp.get_word_end_index() + 1))
                for sp in spans
            ]
            if all(s_idxs):
                # Add DDLIB entity features for relation
                for i, (sp, sent, s_idx) in enumerate(zip(spans, sents, s_idxs)):
                    for f in _get_ddlib_feats(sp, sent, s_idx):
                        yield candidate.id, f"DDL_e{i}_{f}", DEF_VALUE

                # Add TreeDLib relation features (computed once per candidate)
                if candidate.id not in multinary_tdl_feats:
                    cached = multinary_tdl_feats[candidate.id] = set()
                    cached.update(get_tdl_feats(xmltree.root, s_idxs))
                for f in multinary_tdl_feats[candidate.id]:
                    yield candidate.id, f"TDL_{f}", DEF_VALUE

        # Word features are emitted for every span, lingual or not
        for i, sp in enumerate(spans):
            for f in _get_word_feats(sp):
                yield candidate.id, f"BASIC_e{i}_{f}", DEF_VALUE
def get_content_feats(candidates):
    """Yield content (textual) features for unary and binary candidates.

    Each yielded item is a ``(candidate.id, feature_name, DEF_VALUE)`` triple.
    For mentions in lingual sentences, ``DDL_``- and ``TDL_``-prefixed
    features are produced; otherwise ``BASIC_``-prefixed word features are
    produced. For binary candidates the ``DDL_``/``BASIC_`` names carry an
    ``e1``/``e2`` marker for the mention they describe.

    TreeDLib features are memoized in ``unary_tdl_feats`` (keyed by the span's
    ``stable_id``) and ``binary_tdl_feats`` (keyed by ``candidate.id``).

    :param candidates: A candidate or a list of candidates
    :raises ValueError: If the first mention context of a candidate is not a
        ``TemporarySpanMention``
    :raises NotImplementedError: If a candidate has more than two mentions
    """
    candidates = candidates if isinstance(candidates, list) else [candidates]
    for candidate in candidates:
        args = tuple(m.context for m in candidate.get_mentions())
        if not isinstance(args[0], TemporarySpanMention):
            # Report the type of the value that was actually checked (the
            # mention context), not the type of the candidate wrapping it.
            raise ValueError(
                f"Accepts Span-type arguments, {type(args[0])}-type found."
            )

        # Unary candidates
        if len(args) == 1:
            span = args[0]
            if span.sentence.is_lingual():
                get_tdl_feats = compile_entity_feature_generator()
                sent = get_as_dict(span.sentence)
                xmltree = corenlp_to_xmltree(sent)
                sidxs = list(
                    range(span.get_word_start_index(), span.get_word_end_index() + 1)
                )
                if len(sidxs) > 0:
                    # Add DDLIB entity features
                    for f in get_ddlib_feats(span, sent, sidxs):
                        yield candidate.id, f"DDL_{f}", DEF_VALUE

                    # Add TreeDLib entity features (computed once per span)
                    if span.stable_id not in unary_tdl_feats:
                        unary_tdl_feats[span.stable_id] = set()
                        for f in get_tdl_feats(xmltree.root, sidxs):
                            unary_tdl_feats[span.stable_id].add(f)
                    for f in unary_tdl_feats[span.stable_id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            else:
                # Non-lingual sentences fall back to basic word features
                for f in get_word_feats(span):
                    yield candidate.id, f"BASIC_{f}", DEF_VALUE

        # Binary candidates
        elif len(args) == 2:
            span1, span2 = args
            if span1.sentence.is_lingual() and span2.sentence.is_lingual():
                get_tdl_feats = compile_relation_feature_generator()
                sent1 = get_as_dict(span1.sentence)
                sent2 = get_as_dict(span2.sentence)
                # Reuse the already-converted first sentence instead of
                # converting it a second time (as the unary branch does).
                xmltree = corenlp_to_xmltree(sent1)
                s1_idxs = list(
                    range(span1.get_word_start_index(), span1.get_word_end_index() + 1)
                )
                s2_idxs = list(
                    range(span2.get_word_start_index(), span2.get_word_end_index() + 1)
                )
                if len(s1_idxs) > 0 and len(s2_idxs) > 0:
                    # Add DDLIB entity features for relation
                    for f in get_ddlib_feats(span1, sent1, s1_idxs):
                        yield candidate.id, f"DDL_e1_{f}", DEF_VALUE
                    for f in get_ddlib_feats(span2, sent2, s2_idxs):
                        yield candidate.id, f"DDL_e2_{f}", DEF_VALUE

                    # Add TreeDLib relation features (computed once per
                    # candidate)
                    if candidate.id not in binary_tdl_feats:
                        binary_tdl_feats[candidate.id] = set()
                        for f in get_tdl_feats(xmltree.root, s1_idxs, s2_idxs):
                            binary_tdl_feats[candidate.id].add(f)
                    for f in binary_tdl_feats[candidate.id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            else:
                # Non-lingual sentences fall back to basic word features
                for f in get_word_feats(span1):
                    yield candidate.id, f"BASIC_e1_{f}", DEF_VALUE
                for f in get_word_feats(span2):
                    yield candidate.id, f"BASIC_e2_{f}", DEF_VALUE
        else:
            raise NotImplementedError(
                "Only handles unary and binary candidates currently"
            )
def extract_textual_features(
    candidates: Union[Candidate, List[Candidate]],
) -> Iterator[Tuple[int, str, int]]:
    """Extract textual features.

    Yields ``(candidate.id, feature_name, DEF_VALUE)`` triples for unary and
    binary candidates. Feature names are prefixed with ``DDL_``, ``TDL_`` or
    ``BASIC_``; for binary candidates the ``DDL_`` and ``BASIC_`` names also
    carry an ``e1``/``e2`` marker for the mention they describe.

    TreeDLib features are memoized in ``unary_tdl_feats`` (keyed by the span's
    ``stable_id``) and ``binary_tdl_feats`` (keyed by ``candidate.id``).

    :param candidates: A list of candidates to extract features from
    :type candidates: list
    :raises ValueError: If the first mention context of a candidate is neither
        a ``SpanMention`` nor an ``ImplicitSpanMention``
    :raises NotImplementedError: If a candidate has more than two mentions
    """
    if not isinstance(candidates, list):
        candidates = [candidates]

    for candidate in candidates:
        args = tuple(mention.context for mention in candidate.get_mentions())
        if not isinstance(args[0], (SpanMention, ImplicitSpanMention)):
            raise ValueError(
                f"Accepts Span/ImplicitSpan-type mentions, {type(args[0])}-type found."
            )

        # Anything beyond unary/binary is rejected before any work is done
        if len(args) not in (1, 2):
            raise NotImplementedError(
                "Only handles unary and binary candidates currently"
            )

        # Unary candidates
        if len(args) == 1:
            span: Union[SpanMention, ImplicitSpanMention] = args[0]
            if span.sentence.is_lingual():
                get_tdl_feats = _compile_entity_feature_generator()
                xmltree = corenlp_to_xmltree(span.sentence)
                first_word = span.get_word_start_index()
                last_word = span.get_word_end_index()
                sidxs = list(range(first_word, last_word + 1))
                if sidxs:
                    # Add DDLIB entity features
                    sent_dict = get_as_dict(span.sentence)
                    for f in _get_ddlib_feats(span, sent_dict, sidxs):
                        yield candidate.id, f"DDL_{f}", DEF_VALUE

                    # Add TreeDLib entity features (computed once per span)
                    if span.stable_id not in unary_tdl_feats:
                        cached = unary_tdl_feats[span.stable_id] = set()
                        cached.update(get_tdl_feats(xmltree.root, sidxs))
                    for f in unary_tdl_feats[span.stable_id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE

            # Word features are emitted whether or not the sentence is lingual
            for f in _get_word_feats(span):
                yield candidate.id, f"BASIC_{f}", DEF_VALUE
            continue

        # Binary candidates
        span1, span2 = args
        if span1.sentence.is_lingual() and span2.sentence.is_lingual():
            get_tdl_feats = compile_relation_feature_generator()
            sent1 = get_as_dict(span1.sentence)
            sent2 = get_as_dict(span2.sentence)
            # The dependency tree is built from the first span's sentence only
            xmltree = corenlp_to_xmltree(span1.sentence)
            s1_idxs = list(
                range(span1.get_word_start_index(), span1.get_word_end_index() + 1)
            )
            s2_idxs = list(
                range(span2.get_word_start_index(), span2.get_word_end_index() + 1)
            )
            if s1_idxs and s2_idxs:
                # Add DDLIB entity features for relation
                for f in _get_ddlib_feats(span1, sent1, s1_idxs):
                    yield candidate.id, f"DDL_e1_{f}", DEF_VALUE
                for f in _get_ddlib_feats(span2, sent2, s2_idxs):
                    yield candidate.id, f"DDL_e2_{f}", DEF_VALUE

                # Add TreeDLib relation features (computed once per candidate)
                if candidate.id not in binary_tdl_feats:
                    cached = binary_tdl_feats[candidate.id] = set()
                    cached.update(get_tdl_feats(xmltree.root, s1_idxs, s2_idxs))
                for f in binary_tdl_feats[candidate.id]:
                    yield candidate.id, f"TDL_{f}", DEF_VALUE

        # Word features are emitted for both spans, lingual or not
        for f in _get_word_feats(span1):
            yield candidate.id, f"BASIC_e1_{f}", DEF_VALUE
        for f in _get_word_feats(span2):
            yield candidate.id, f"BASIC_e2_{f}", DEF_VALUE