def get_definition_annotations(text: str, decode_unicode=True) \
        -> Generator[DefinitionAnnotation, None, None]:
    """Yield a :class:`DefinitionAnnotation` for every definition found in *text*.

    :param text: plain text to scan for defined terms
    :param decode_unicode: forwarded to ``get_definition_objects_list``
    :return: generator of annotations, one per detected definition object
    """
    definition_objects = get_definition_objects_list(
        text, decode_unicode=decode_unicode)
    for definition in definition_objects:
        annotation = DefinitionAnnotation(
            coords=definition.coords,
            text=definition.text,
            name=definition.name)
        yield annotation
def get_annotations(self, sentence: str) -> List[DefinitionAnnotation]:
    """Locate defined terms in *sentence* and pair each with its closest definition.

    Strategy: first locate terms (a term is a simpler object to locate),
    then locate definitions around them, and finally attach to each term
    the definition span nearest to it.

    :param sentence: sentence text to annotate
    :return: list of DefinitionAnnotation, one per detected term; empty
        list when no terms are found
    """
    annotations = []  # type: List[DefinitionAnnotation]
    terms = list(self.model_term.predict_text(
        sentence, join_settings=self.term_join_sets))
    if not terms:
        return annotations

    # Find definitions around the terms.
    # The "mask" tells the underlying model where each term is located.
    feature_mask = [0] * len(sentence)
    for t_s, t_e in terms:
        for i in range(t_s, t_e):
            feature_mask[i] = 1
    definitions = list(self.model_definition.predict_text(
        sentence, feature_mask=feature_mask,
        join_settings=self.definition_join_sets))

    # Measure the distance between each term and each definition.
    # definition_distances maps (t_s, t_e) -> [(def_index, distance), ...]
    # sorted so the closest definition comes first.  Negative distances
    # mean the spans overlap.
    definition_distances = {}
    for t_s, t_e in terms:
        distances = []
        for i, (df_s, df_e) in enumerate(definitions):
            if df_e <= t_s:
                distance = t_s - df_e       # definition entirely before term
            elif df_s >= t_e:
                distance = df_s - t_e       # definition entirely after term
            else:
                # Overlapping spans: distance is minus the overlap size.
                if df_s > t_s:
                    distance = -(t_e - df_s + 1)
                elif df_e < t_e:
                    distance = -(df_e - t_s + 1)
                else:
                    distance = -(t_e - t_s + 1)
            distances.append((i, distance))
        distances.sort(key=lambda d: d[1])  # closest definitions go first
        definition_distances[(t_s, t_e)] = distances

    # Build one annotation per term, spanning the term plus its closest
    # definition (or just the term itself when no definition was found).
    for term in terms:
        ranked = definition_distances[term]
        if ranked:
            # BUG FIX: ranked entries are (definition_index, distance)
            # pairs; the index must be dereferenced into `definitions`
            # to obtain coordinates.  The previous code used the
            # index/distance pair itself as (start, end).
            df_s, df_e = definitions[ranked[0][0]]
        else:
            df_s, df_e = term[0], term[1]
        def_start = min(df_s, term[0])
        def_end = max(df_e, term[1])
        term_phrase = sentence[term[0]: term[1]]
        ant = DefinitionAnnotation(
            coords=(def_start, def_end),
            name=term_phrase,
            text=sentence[def_start: def_end],
            locale='en')
        annotations.append(ant)
    # TODO: check if annotations overlap and cut overlapping parts
    return annotations
def annotate_document(self, text: str, definitions: List[dict], output_path: str) -> None:
    """Render *definitions* as HTML annotations over *text* and save the result.

    :param text: full document text the coordinates refer to
    :param definitions: dicts with ``tags`` (entity text / definition name)
        and ``attrs`` (``start`` / ``end`` character offsets)
    :param output_path: file path passed to ``save_test_document``
    """
    # Removed dead `index` counter that was incremented but never read.
    annotations = [
        DefinitionAnnotation(
            name=df["tags"]["Extracted Entity Definition Name"],
            coords=(df["attrs"]["start"], df["attrs"]["end"]),
            text=df["tags"]["Extracted Entity Text"],
            locale="en")
        for df in definitions
    ]
    html = annotate_text(text, annotations)
    save_test_document(output_path, html)
def get_definition_annotations(
        text: str,
        decode_unicode=True,
        locator_type: AnnotationLocatorType = AnnotationLocatorType.RegexpBased) \
        -> Generator[DefinitionAnnotation, None, None]:
    """Yield definition annotations for *text* using the selected locator.

    :param text: plain text to scan for defined terms
    :param decode_unicode: forwarded to ``get_definition_objects_list``
        (regexp locator only)
    :param locator_type: which locator to use; the ML locator requires
        ``parser_ml_classifier`` to be loaded beforehand
    :raises Exception: when the ML locator is requested but
        ``parser_ml_classifier`` has not been initialized
    """
    if locator_type == AnnotationLocatorType.MlWordVectorBased:
        if not parser_ml_classifier.initialized:
            # Message unchanged; dropped the needless f-string prefix
            # (it contained no placeholders).
            raise Exception(
                '"parser_ml_classifier" object should be initialized (call load_compressed method)'
            )
        yield from parser_ml_classifier.get_annotations(text)
        return
    # use Regexp-based locator
    for d in get_definition_objects_list(text, decode_unicode=decode_unicode):
        yield DefinitionAnnotation(coords=d.coords, text=d.text, name=d.name)
def make_annotation_from_pattrn(self, locale: str, ptrn: PatternFound, phrase: LineOrPhrase) -> TextAnnotation:
    """Build a DefinitionAnnotation for the matched pattern inside *phrase*.

    :param locale: locale code to stamp on the annotation
    :param ptrn: located pattern (name plus start/end offsets into the phrase)
    :param phrase: phrase whose text the offsets index into
    :return: annotation covering the matched span
    """
    span = (ptrn.start, ptrn.end)
    matched_text = phrase.text[ptrn.start:ptrn.end]
    return DefinitionAnnotation(
        name=ptrn.name,
        coords=span,
        text=matched_text,
        locale=locale)