コード例 #1
0
def get_definition_annotations(text: str, decode_unicode=True) \
        -> Generator[DefinitionAnnotation, None, None]:
    """
    Lazily yield a DefinitionAnnotation for every definition found in ``text``.

    Each object produced by ``get_definition_objects_list`` is converted by
    copying its ``coords``, ``text`` and ``name`` attributes.

    :param text: source text to search for definitions
    :param decode_unicode: forwarded unchanged to ``get_definition_objects_list``
    :return: generator of DefinitionAnnotation objects
    """
    found_definitions = get_definition_objects_list(
        text, decode_unicode=decode_unicode)
    yield from (
        DefinitionAnnotation(coords=found.coords,
                             text=found.text,
                             name=found.name)
        for found in found_definitions
    )
コード例 #2
0
    def get_annotations(self, sentence: str) -> List[DefinitionAnnotation]:
        """
        Find definitions in ``sentence`` and return one annotation per term.

        Terms are located first with ``self.model_term``. Their character
        positions are then passed to ``self.model_definition`` as a feature
        mask. Every term is paired with the closest definition span and the
        resulting annotation covers the union of the term span and that
        definition span (or just the term span when no definition was found).

        :param sentence: text to search for definitions
        :return: list of annotations, one per detected term; an empty list
                 when no terms are detected
        """
        annotations = []  # type: List[DefinitionAnnotation]
        # we go from term to definition because a term is a simpler object to locate
        terms = list(self.model_term.predict_text(
            sentence, join_settings=self.term_join_sets))
        if not terms:
            return annotations

        # find definitions around the terms
        # the "mask" tells the underlying model where the terms are located
        feature_mask = [0] * len(sentence)
        for t_s, t_e in terms:
            for i in range(t_s, t_e):
                feature_mask[i] = 1

        definitions = list(self.model_definition.predict_text(
            sentence, feature_mask=feature_mask,
            join_settings=self.definition_join_sets))

        # combine each term with the closest surrounding definition
        for t_s, t_e in terms:
            closest = None  # (start, end) of the best definition seen so far
            closest_distance = None
            for df_s, df_e in definitions:
                if df_e <= t_s:
                    # definition ends before the term starts
                    distance = t_s - df_e
                elif df_s >= t_e:
                    # definition starts after the term ends
                    distance = df_s - t_e
                # the spans overlap: the distance is negative, so overlapping
                # definitions are preferred over any non-overlapping one
                elif df_s > t_s:
                    distance = -(t_e - df_s + 1)
                elif df_e < t_e:
                    distance = -(df_e - t_s + 1)
                else:
                    distance = -(t_e - t_s + 1)

                # strict comparison keeps the earliest definition on ties,
                # matching what a stable sort by distance would select
                if closest_distance is None or distance < closest_distance:
                    closest = (df_s, df_e)
                    closest_distance = distance

            # use the coordinates of the chosen definition itself (not its
            # index / distance) to widen the annotation around the term
            if closest is None:
                def_start, def_end = t_s, t_e
            else:
                def_start = min(closest[0], t_s)
                def_end = max(closest[1], t_e)

            ant = DefinitionAnnotation(
                coords=(def_start, def_end),
                name=sentence[t_s: t_e],
                text=sentence[def_start: def_end],
                locale='en')
            annotations.append(ant)

        # TODO: check if annotations overlap and cut overlapping parts
        return annotations
コード例 #3
0
    def annotate_document(self, text: str, definitions: List[dict],
                          output_path: str) -> None:
        """
        Build annotations from ``definitions``, mark them up in ``text``
        and save the result.

        Every item of ``definitions`` is read as a dictionary with
        ``["tags"]["Extracted Entity Definition Name"]``,
        ``["tags"]["Extracted Entity Text"]``, ``["attrs"]["start"]`` and
        ``["attrs"]["end"]`` entries.

        :param text: document text handed to ``annotate_text``
        :param definitions: definition records to convert into annotations
        :param output_path: destination handed to ``save_test_document``
        """
        annotations = [
            DefinitionAnnotation(
                name=df["tags"]["Extracted Entity Definition Name"],
                coords=(df["attrs"]["start"], df["attrs"]["end"]),
                text=df["tags"]["Extracted Entity Text"],
                locale="en")
            for df in definitions
        ]

        # presumably annotate_text returns HTML markup - confirm against the helper
        html = annotate_text(text, annotations)
        save_test_document(output_path, html)
コード例 #4
0
def get_definition_annotations(text: str,
                               decode_unicode=True,
                               locator_type: AnnotationLocatorType = AnnotationLocatorType.RegexpBased) \
        -> Generator[DefinitionAnnotation, None, None]:
    """
    Lazily yield definition annotations found in ``text``.

    With ``AnnotationLocatorType.MlWordVectorBased`` the annotations come from
    ``parser_ml_classifier.get_annotations``; for any other locator type the
    regexp-based ``get_definition_objects_list`` is used.

    :param text: source text to search for definitions
    :param decode_unicode: forwarded to ``get_definition_objects_list``
                           (regexp-based path only)
    :param locator_type: selects the ML-based or the regexp-based locator
    :return: generator of DefinitionAnnotation objects
    :raises Exception: when the ML locator is requested but
                       ``parser_ml_classifier`` is not initialized; as this is
                       a generator, the error surfaces on first iteration
    """
    ml_requested = locator_type == AnnotationLocatorType.MlWordVectorBased

    if not ml_requested:
        # Regexp-based locator: convert every located definition object
        regexp_matches = get_definition_objects_list(
            text, decode_unicode=decode_unicode)
        yield from (
            DefinitionAnnotation(coords=found.coords,
                                 text=found.text,
                                 name=found.name)
            for found in regexp_matches
        )
        return

    # ML-based locator needs a loaded classifier
    if not parser_ml_classifier.initialized:
        raise Exception(
            f'"parser_ml_classifier" object should be initialized (call load_compressed method)'
        )
    for annotation in parser_ml_classifier.get_annotations(text):
        yield annotation
コード例 #5
0
 def make_annotation_from_pattrn(self, locale: str, ptrn: PatternFound,
                                 phrase: LineOrPhrase) -> TextAnnotation:
     """
     Create a DefinitionAnnotation for a pattern found inside ``phrase``.

     :param locale: locale stored on the annotation
     :param ptrn: found pattern; supplies the annotation name and its
                  ``start`` / ``end`` bounds
     :param phrase: phrase whose ``text`` is sliced with the pattern bounds
     :return: annotation covering ``phrase.text[ptrn.start:ptrn.end]``
     """
     span_start, span_end = ptrn.start, ptrn.end
     matched_text = phrase.text[span_start:span_end]
     return DefinitionAnnotation(name=ptrn.name,
                                 coords=(span_start, span_end),
                                 text=matched_text,
                                 locale=locale)