Example #1
from typing import Dict, List, Optional

# Document, Entity and get_batch_size are defined in the enclosing package.
class _NERLikeClassifier:
    _PREDICTION_BATCH_SIZE = get_batch_size()

    def __init__(self, net_classifier):
        self.net_classifier = net_classifier

    def predict_docs(
            self,
            docs: List[Document],
            batch_size: int = _PREDICTION_BATCH_SIZE) -> List[List[Entity]]:
        predictions = self.net_classifier.predict_docs(docs, batch_size)
        return [
            self._type_entities(doc.extras["ne"], ent_typing)
            for doc, ent_typing in zip(docs, predictions)
        ]

    def predict_doc(self, doc: Document) -> List[Entity]:
        return self.predict_docs([doc])[0]

    @staticmethod
    def _type_entities(
            entities: List[Entity],
            entities_typing: Dict[Entity, Optional[str]]) -> List[Entity]:
        ret = []
        for ent in entities:
            prediction = entities_typing[ent]
            # entities with no predicted type are dropped
            if prediction is not None:
                ret.append(ent.with_type(prediction))

        return ret
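
# A minimal usage sketch, assuming `net` is an entity-typing model whose
# predict_docs returns one Dict[Entity, Optional[str]] per document and that
# `doc` already carries raw NER output under doc.extras["ne"] (both names
# are placeholders):
classifier = _NERLikeClassifier(net)
typed_entities = classifier.predict_doc(doc)  # entities typed as None are dropped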
Example #2
from itertools import chain
from typing import List, Tuple

# Document, Entity, NERFeatureExtractor and the remaining helpers are defined
# in the enclosing package.
class _Classifier:
    _PREDICTION_BATCH_SIZE = get_batch_size()

    def __init__(self, graph, feature_extractor: NERFeatureExtractor,
                 feature_computer, session, saver, post_processor):
        self.graph = graph
        self.extractor = feature_extractor
        self.feature_computer = feature_computer
        self.session = session
        self.saver = saver
        self.post_processor = post_processor

    def predict_docs_with_scores(
            self,
            docs: List[Document],
            batch_size: int = _PREDICTION_BATCH_SIZE
    ) -> Tuple[List[List[Entity]], List[List[float]]]:

        docs = self.feature_computer.create_features_for_docs(docs)
        samples = chain.from_iterable(
            map(self.extractor.extract_features_from_doc, docs))
        batcher = get_standard_batcher_factory(
            samples, batch_size, self.extractor.get_padding_value_and_rank)

        sent_labels, scores = predict_for_samples(self.graph, self.session,
                                                  ["predictions", "scores"],
                                                  batcher)
        # shared iterators: each document below consumes exactly one result
        # per sentence, so predictions stay aligned across documents
        sent_labels, scores = iter(sent_labels), iter(scores)

        docs_predicted_entities, docs_confidences = [], []

        for doc in docs:
            predicted_entities, sentences_confidences = [], []

            for sent, labels, score in zip(doc.sentences, sent_labels, scores):
                # remove padding
                predicted_entities.extend(
                    self.extractor.encoded_labels_to_entities(
                        sent, labels[:len(sent)]))
                sentences_confidences.append(score)

            if self.post_processor is not None:
                predicted_entities = self.post_processor(
                    doc, predicted_entities)

            docs_predicted_entities.append(predicted_entities)
            docs_confidences.append(sentences_confidences)

        return docs_predicted_entities, docs_confidences

    def predict_docs(
            self,
            docs: List[Document],
            batch_size: int = _PREDICTION_BATCH_SIZE) -> List[List[Entity]]:
        return self.predict_docs_with_scores(docs, batch_size)[0]

    def predict_doc_with_scores(
            self, doc: Document) -> Tuple[List[Entity], List[float]]:
        """
        :return: (List[Entity] found in doc, List[float] containing sequence labelling score for each Sentence in doc)
        """
        docs_entities, docs_confidences = self.predict_docs_with_scores([doc])
        return docs_entities[0], docs_confidences[0]

    def predict_doc(self, doc: Document) -> List[Entity]:
        predicted, _ = self.predict_doc_with_scores(doc)
        return predicted

    def save(self, out_path):
        save_classifier(out_path, self.extractor, self.feature_computer,
                        self.graph, self.session, self.saver)
        save_with_pickle(self.post_processor, out_path, "post_processor")

    @classmethod
    def load(cls, path, session):
        extractor, feature_computer, graph, saver = load_classifier(
            path, NERFeatureExtractor, SyntacticFeatureComputer, session)

        return cls(graph, extractor, feature_computer, session, saver,
                   load_with_pickle(path, "post_processor"))
Example #3
# Relation, RelExtFeatureExtractor, EntitiesCollapser and the remaining
# helpers are defined in the enclosing package.
class _Classifier:
    _PREDICTION_BATCH_SIZE = get_batch_size()

    def __init__(self, graph: dict, extractor: RelExtFeatureExtractor,
                 feature_computer: AbstractFeatureComputer, session, saver,
                 collapser: EntitiesCollapser):
        self.session = session
        self.graph = graph
        self.extractor = extractor
        self.feature_computer = feature_computer
        self.saver = saver
        self.collapser = collapser

    def predict_docs(self,
                     docs: list,
                     *,
                     print_progress=False,
                     include_probs=False) -> dict:
        rels = {}

        for i, doc in enumerate(docs):
            rels[doc.name] = self.predict_doc(doc, include_probs)
            if print_progress:
                update_progress((i + 1) / len(docs))

        return rels

    def predict_doc(self, doc, include_probs=False):
        doc, direct_mapping = self.collapser.transform_with_mapping(doc)
        reversed_mapping = {v: k for k, v in direct_mapping.items()}
        doc = self.feature_computer.create_features_for_doc(doc)

        samples, entity_pairs = self.extractor.extract_features_from_doc(doc)
        entity_pairs = [(reversed_mapping[e1], reversed_mapping[e2])
                        for e1, e2 in entity_pairs]
        outputs = ["predictions"]

        if include_probs:
            outputs.append("scores")

        batcher = get_standard_batcher_factory(
            samples, self._PREDICTION_BATCH_SIZE,
            self.extractor.get_padding_value_and_rank)

        out = predict_for_samples(self.graph, self.session, outputs,
                                  batcher)  # labels, [scores]

        relations = self._collect_pair_results(out[0], entity_pairs)
        relations = self._get_relations(relations)
        ret = relations

        if include_probs:
            scores = self._collect_pair_results(out[1], entity_pairs)
            scores = self._get_scores(scores)
            ret = (relations, scores)

        return ret

    def _collect_pair_results(self, out, entity_pairs):
        return dict(zip(entity_pairs, out))

    def _get_scores(self, scores: dict) -> dict:
        ret = {}
        for pair, pair_scores in scores.items():
            scores_dict = {}
            for i, score in enumerate(pair_scores):
                scores_dict[self.extractor.get_type(i)] = score
            ret[pair] = scores_dict
        return ret

    def _get_relations(self, predictions: dict) -> set:
        rels = set()
        for (e1, e2), label in predictions.items():
            rel_type = self.extractor.get_type(label)
            if rel_type is not None:
                rels.add(Relation(e1, e2, rel_type))
        return rels

    def save(self, out_path):
        save_classifier(out_path, self.extractor, self.feature_computer,
                        self.graph, self.session, self.saver)
        save_with_pickle(self.collapser, out_path, "collapser")

    @classmethod
    def load(cls, path, session):
        extractor, feature_computer, graph, saver = load_classifier(
            path, RelExtFeatureExtractor, CompositeFeatureComputer, session)

        return cls(graph, extractor, feature_computer, session, saver,
                   load_with_pickle(path, "collapser"))
Example #4
from os import listdir
from os.path import isdir, join
from typing import Dict, List, Optional

# Document, Entity, GroupingFeatureExtractor, NERClassifier, NETClassifier and
# the remaining helpers are defined in the enclosing package.
class _Classifier:
    _PREDICTION_BATCH_SIZE = get_batch_size()

    def __init__(self, graph, feature_extractor: GroupingFeatureExtractor,
                 feature_computer, session, saver,
                 grouper_collapser: '_GrouperCollapser'):
        self.graph = graph
        self.extractor = feature_extractor
        self.feature_computer = feature_computer
        self.session = session
        self.saver = saver
        self.grouper_collapser = grouper_collapser

    def predict_docs(
            self,
            docs: List[Document],
            batch_size: int = _PREDICTION_BATCH_SIZE
    ) -> List[Dict[Entity, Optional[str]]]:

        docs_answers, docs_reversed_mappings = [], []
        docs_chains_to_predict, samples_to_predict = [], []

        for doc in docs:
            reversed_mapping, chains_to_predict, doc_samples, ready_answers = self._doc_input(
                doc)
            docs_answers.append(ready_answers)
            docs_reversed_mappings.append(reversed_mapping)
            docs_chains_to_predict.append(chains_to_predict)
            samples_to_predict.extend(doc_samples)

        batcher = get_standard_batcher_factory(
            samples_to_predict, batch_size,
            self.extractor.get_padding_value_and_rank)

        # "predictions" is the only requested output, hence the single-element unpacking
        predicted_labels, = predict_for_samples(self.graph, self.session,
                                                ["predictions"], batcher)
        # lazy map: each document's chains consume their own slice of predictions
        predicted_types = map(self.extractor.get_type, predicted_labels)

        for chains_to_predict, reversed_mapping, ready_answers in zip(
                docs_chains_to_predict, docs_reversed_mappings, docs_answers):
            for chain, predicted_type in zip(chains_to_predict,
                                             predicted_types):
                ready_answers.update(
                    dict.fromkeys(map(reversed_mapping.get, chain),
                                  predicted_type))

        return docs_answers

    def _doc_input(self, doc: Document):
        collapsed_doc, groups, reversed_mapping = self.grouper_collapser.prepare_doc_with_collapsing(
            doc)
        doc = self.feature_computer.create_features_for_doc(collapsed_doc)
        chain_samples = self.extractor.extract_features_from_doc(doc, groups)
        chains_to_predict, samples_to_predict = [], []
        ready_answers = {}

        for chain, sample in chain_samples:
            if isinstance(sample, dict):
                # a real feature sample: the chain needs a network prediction
                chains_to_predict.append(chain)
                samples_to_predict.append(sample)
            else:
                # the extractor already produced the answer for this chain
                ready_answers.update(
                    dict.fromkeys(map(reversed_mapping.get, chain), sample))

        return reversed_mapping, chains_to_predict, samples_to_predict, ready_answers

    def predict_doc(self, doc: Document) -> Dict[Entity, Optional[str]]:
        return self.predict_docs([doc])[0]

    def save(self, out_path):
        save_classifier(out_path, self.extractor, self.feature_computer,
                        self.graph, self.session, self.saver)
        save_with_pickle(self.grouper_collapser, out_path, "grouper_collapser")

    @classmethod
    def load(cls, path, session):
        extractor, feature_computer, graph, saver = load_classifier(
            path, NETFeatureExtractor, SyntacticFeatureComputer, session)

        return cls(graph, extractor, feature_computer, session, saver,
                   load_with_pickle(path, "grouper_collapser"))

class ChainedNERClassifier:
    _PREDICTION_BATCH_SIZE = get_batch_size()

    def __init__(self, model_path: str):
        self.path = model_path

    def __enter__(self):
        if not self.__is_chained_model():
            # plain model: delegate entirely to the wrapped NERClassifier
            self.__ner_manager, self.__net_manager = NERClassifier(
                self.path), None
            return self.__ner_manager.__enter__()

        self.__ner_manager = ChainedNERClassifier(join(self.path, "ner"))
        self.__ner = self.__ner_manager.__enter__()
        self.__net_manager = NETClassifier(join(self.path, "net"))
        self.__net = self.__net_manager.__enter__()

        return self

    def predict_docs(
            self,
            docs: List[Document],
            batch_size: int = _PREDICTION_BATCH_SIZE) -> List[List[Entity]]:
        docs_entities = self.__ner.predict_docs(docs, batch_size)
        docs = [
            doc.with_additional_extras({"ne": ents})
            for doc, ents in zip(docs, docs_entities)
        ]
        entities_typing = self.__net.predict_docs(docs, batch_size)
        return [
            self._type_entities(ents, typing)
            for ents, typing in zip(docs_entities, entities_typing)
        ]

    def predict_doc(self, doc: Document) -> List[Entity]:
        entities = self.__ner.predict_doc(doc)
        entities_typing = self.__net.predict_doc(
            doc.with_additional_extras({"ne": entities}))
        return self._type_entities(entities, entities_typing)

    @staticmethod
    def _type_entities(
            entities: List[Entity],
            entities_typing: Dict[Entity, Optional[str]]) -> List[Entity]:
        def type_entity(ent: Entity) -> Entity:
            new_type = entities_typing.get(ent, ent.type)
            return ent.with_type(new_type)

        return list(map(type_entity, entities))

    def __exit__(self, *exc):
        if self.__net_manager is not None:
            self.__net_manager.__exit__(*exc)

        self.__ner_manager.__exit__(*exc)

        self.__net_manager, self.__ner_manager, self.__ner, self.__net = None, None, None, None

    def __is_chained_model(self):
        path_files = listdir(self.path)
        return set(path_files) == {"ner", "net"} and all(
            isdir(join(self.path, p)) for p in ["ner", "net"])
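
# A minimal usage sketch, assuming `model_path` and `doc` are placeholders for
# a model directory and a Document; the path may hold either a plain NER model
# or "ner"/"net" subdirectories for the chained case:
with ChainedNERClassifier(model_path) as classifier:
    entities = classifier.predict_doc(doc)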