class _NERLikeClassifier:
    """Thin wrapper over a NET-style network classifier.

    Types the entities stored in each document's ``extras["ne"]`` and keeps
    only those that received a non-``None`` predicted type.
    """

    _PREDICTION_BATCH_SIZE = get_batch_size()

    def __init__(self, net_classifier):
        # Underlying classifier whose predict_docs(docs, batch_size) returns,
        # per document, a mapping Entity -> Optional[str].
        self.net_classifier = net_classifier

    def predict_docs(
            self, docs: List[Document],
            batch_size: int = _PREDICTION_BATCH_SIZE) -> List[List[Entity]]:
        """Type each doc's pre-extracted entities (from extras["ne"]).

        :return: per document, the entities whose predicted type is not None,
            retyped accordingly.
        """
        docs_typing = self.net_classifier.predict_docs(docs, batch_size)
        return [
            self._type_entities(doc.extras["ne"], typing)
            for doc, typing in zip(docs, docs_typing)
        ]

    def predict_doc(self, doc: Document) -> List[Entity]:
        """Single-document convenience wrapper around predict_docs."""
        return self.predict_docs([doc])[0]

    @staticmethod
    def _type_entities(entities: List[Entity],
                       entities_typing: Dict[Entity, Optional[str]]):
        # Drop entities predicted as None; retype the rest.
        return [
            entity.with_type(entities_typing[entity])
            for entity in entities
            if entities_typing[entity] is not None
        ]
class _Classifier:
    """Sequence-labelling NER classifier.

    Turns documents into per-sentence feature samples, runs the graph on
    batches of them and decodes the predicted label sequences back into
    entities, optionally post-processing them per document.
    """

    _PREDICTION_BATCH_SIZE = get_batch_size()

    def __init__(self, graph, feature_extractor: NERFeatureExtractor,
                 feature_computer, session, saver, post_processor):
        self.graph = graph
        self.extractor = feature_extractor
        self.feature_computer = feature_computer
        self.session = session
        self.saver = saver
        # Optional callable (doc, entities) -> entities, applied after decoding.
        self.post_processor = post_processor

    def predict_docs_with_scores(
            self, docs: List[Document],
            batch_size: int = _PREDICTION_BATCH_SIZE) \
            -> Tuple[List[List[Entity]], List[List[float]]]:
        """Predict entities and a per-sentence labelling score for each doc."""
        docs = self.feature_computer.create_features_for_docs(docs)
        # One sample per sentence, lazily, in document order.
        samples = chain.from_iterable(
            self.extractor.extract_features_from_doc(doc) for doc in docs)
        batcher = get_standard_batcher_factory(
            samples, batch_size, self.extractor.get_padding_value_and_rank)
        sent_labels, scores = predict_for_samples(
            self.graph, self.session, ["predictions", "scores"], batcher)
        # Shared iterators: each doc's inner zip below consumes exactly
        # len(doc.sentences) items, keeping predictions aligned with docs.
        labels_iter, scores_iter = iter(sent_labels), iter(scores)

        docs_entities, docs_scores = [], []
        for doc in docs:
            entities, sentence_scores = [], []
            for sent, labels, score in zip(doc.sentences, labels_iter,
                                           scores_iter):
                # Labels are padded to batch width: truncate to sentence size.
                entities.extend(
                    self.extractor.encoded_labels_to_entities(
                        sent, labels[:len(sent)]))
                sentence_scores.append(score)
            if self.post_processor is not None:
                entities = self.post_processor(doc, entities)
            docs_entities.append(entities)
            docs_scores.append(sentence_scores)
        return docs_entities, docs_scores

    def predict_docs(
            self, docs: List[Document],
            batch_size: int = _PREDICTION_BATCH_SIZE) -> List[List[Entity]]:
        """Predict entities only, discarding the per-sentence scores."""
        return self.predict_docs_with_scores(docs, batch_size)[0]

    def predict_doc_with_scores(
            self, doc: Document) -> Tuple[List[Entity], List[float]]:
        """
        :return: (List[Entity] found in doc, List[float] containing sequence
            labelling score for each Sentence in doc)
        """
        docs_entities, docs_confidences = self.predict_docs_with_scores([doc])
        return docs_entities[0], docs_confidences[0]

    def predict_doc(self, doc: Document) -> List[Entity]:
        """Predict entities for a single document."""
        entities, _ = self.predict_doc_with_scores(doc)
        return entities

    def save(self, out_path):
        """Persist graph, extractor, feature computer and post-processor."""
        save_classifier(out_path, self.extractor, self.feature_computer,
                        self.graph, self.session, self.saver)
        save_with_pickle(self.post_processor, out_path, "post_processor")

    @classmethod
    def load(cls, path, session):
        """Restore a classifier previously written by save()."""
        extractor, feature_computer, graph, saver = load_classifier(
            path, NERFeatureExtractor, SyntacticFeatureComputer, session)
        return cls(graph, extractor, feature_computer, session, saver,
                   load_with_pickle(path, "post_processor"))
class _Classifier:
    """Relation-extraction classifier.

    Collapses entities, extracts features per entity pair, runs the graph
    and maps predicted labels back onto Relations between the original
    (uncollapsed) entities.
    """

    _PREDICTION_BATCH_SIZE = get_batch_size()

    def __init__(self, graph: dict, extractor: RelExtFeatureExtractor,
                 feature_computer: AbstractFeatureComputer, session, saver,
                 collapser: EntitiesCollapser):
        self.session = session
        self.graph = graph
        self.extractor = extractor
        self.feature_computer = feature_computer
        self.saver = saver
        self.collapser = collapser

    def predict_docs(self, docs: list, *, print_progress=False,
                     include_probs=False) -> dict:
        """Predict relations for every doc, keyed by ``doc.name``."""
        rels = {}
        for done, doc in enumerate(docs, start=1):
            rels[doc.name] = self.predict_doc(doc, include_probs)
            if print_progress:
                update_progress(done / len(docs))
        return rels

    def predict_doc(self, doc, include_probs=False):
        """Predict relations for one doc.

        :return: set of Relation, or (relations, scores) when include_probs.
        """
        doc, direct_mapping = self.collapser.transform_with_mapping(doc)
        # Invert the mapping so predicted pairs refer to original entities.
        reversed_mapping = {collapsed: original
                            for original, collapsed in direct_mapping.items()}
        doc = self.feature_computer.create_features_for_doc(doc)
        samples, entity_pairs = self.extractor.extract_features_from_doc(doc)
        entity_pairs = [(reversed_mapping[e1], reversed_mapping[e2])
                        for e1, e2 in entity_pairs]

        outputs = ["predictions"]
        if include_probs:
            outputs.append("scores")
        batcher = get_standard_batcher_factory(
            samples, self._PREDICTION_BATCH_SIZE,
            self.extractor.get_padding_value_and_rank)
        # out is [labels] or [labels, scores] matching `outputs`.
        out = predict_for_samples(self.graph, self.session, outputs, batcher)

        relations = self._get_relations(
            self._collect_pair_results(out[0], entity_pairs))
        if include_probs:
            scores = self._get_scores(
                self._collect_pair_results(out[1], entity_pairs))
            return relations, scores
        return relations

    def _collect_pair_results(self, out, entity_pairs):
        # Align network output row-for-row with the entity pairs.
        return dict(zip(entity_pairs, out))

    def _get_scores(self, scores: dict) -> dict:
        """Replace class indices with relation type names in each score row."""
        return {
            pair: {self.extractor.get_type(idx): score
                   for idx, score in enumerate(pair_scores)}
            for pair, pair_scores in scores.items()
        }

    def _get_relations(self, predictions: dict) -> set:
        """Build Relation objects, skipping pairs typed as None."""
        typed_pairs = ((pair, self.extractor.get_type(label))
                       for pair, label in predictions.items())
        return {Relation(e1, e2, rel_type)
                for (e1, e2), rel_type in typed_pairs
                if rel_type is not None}

    def save(self, out_path):
        """Persist graph, extractor, feature computer and collapser."""
        save_classifier(out_path, self.extractor, self.feature_computer,
                        self.graph, self.session, self.saver)
        save_with_pickle(self.collapser, out_path, "collapser")

    @classmethod
    def load(cls, path, session):
        """Restore a classifier previously written by save()."""
        extractor, feature_computer, graph, saver = load_classifier(
            path, RelExtFeatureExtractor, CompositeFeatureComputer, session)
        return cls(graph, extractor, feature_computer, session, saver,
                   load_with_pickle(path, "collapser"))
class _Classifier:
    """Entity-chain typing classifier.

    Collapses grouped entities, extracts one sample per chain that still
    needs the network, then spreads each predicted type over all original
    entities of that chain. Chains whose answer the extractor already knows
    bypass the network entirely.
    """

    _PREDICTION_BATCH_SIZE = get_batch_size()

    def __init__(self, graph, feature_extractor: GroupingFeatureExtractor,
                 feature_computer, session, saver,
                 grouper_collapser: '_GrouperCollapser'):
        self.graph = graph
        self.extractor = feature_extractor
        self.feature_computer = feature_computer
        self.session = session
        self.saver = saver
        self.grouper_collapser = grouper_collapser

    def predict_docs(self, docs: List[Document],
                     batch_size: int = _PREDICTION_BATCH_SIZE) \
            -> List[Dict[Entity, Optional[str]]]:
        """Predict a type (or None) for every entity of every doc."""
        docs_answers, docs_reversed_mappings = [], []
        docs_chains_to_predict, samples_to_predict = [], []
        for doc in docs:
            reversed_mapping, chains_to_predict, doc_samples, ready_answers = \
                self._doc_input(doc)
            docs_answers.append(ready_answers)
            docs_reversed_mappings.append(reversed_mapping)
            docs_chains_to_predict.append(chains_to_predict)
            samples_to_predict.extend(doc_samples)

        batcher = get_standard_batcher_factory(
            samples_to_predict, batch_size,
            self.extractor.get_padding_value_and_rank)
        # The graph exposes only one output here: the predicted labels.
        predicted_labels, = predict_for_samples(
            self.graph, self.session, ["predictions"], batcher)
        # Single shared iterator: each doc's zip below consumes exactly as
        # many types as chains it contributed, keeping docs aligned.
        predicted_types = iter(map(self.extractor.get_type, predicted_labels))

        for chains_to_predict, reversed_mapping, ready_answers in zip(
                docs_chains_to_predict, docs_reversed_mappings, docs_answers):
            for entity_chain, predicted_type in zip(chains_to_predict,
                                                    predicted_types):
                # Every original entity of the chain gets the chain's type.
                ready_answers.update(dict.fromkeys(
                    map(reversed_mapping.get, entity_chain), predicted_type))
        return docs_answers

    def _doc_input(self, doc: Document):
        """Collapse one doc and split its chains into "needs the network"
        (dict feature samples) and "answer already known" (anything else)."""
        collapsed_doc, groups, reversed_mapping = \
            self.grouper_collapser.prepare_doc_with_collapsing(doc)
        doc = self.feature_computer.create_features_for_doc(collapsed_doc)
        chain_samples = self.extractor.extract_features_from_doc(doc, groups)

        chains_to_predict, samples_to_predict = [], []
        ready_answers = {}
        for entity_chain, sample in chain_samples:
            if isinstance(sample, dict):
                # Real feature sample: defer this chain to the network.
                chains_to_predict.append(entity_chain)
                samples_to_predict.append(sample)
            else:
                # Non-dict sample is used directly as the chain's answer.
                ready_answers.update(dict.fromkeys(
                    map(reversed_mapping.get, entity_chain), sample))
        return (reversed_mapping, chains_to_predict, samples_to_predict,
                ready_answers)

    def predict_doc(self, doc: Document) -> Dict[Entity, Optional[str]]:
        """Single-document convenience wrapper around predict_docs."""
        return self.predict_docs([doc])[0]

    def save(self, out_path):
        """Persist graph, extractor, feature computer and grouper-collapser."""
        save_classifier(out_path, self.extractor, self.feature_computer,
                        self.graph, self.session, self.saver)
        save_with_pickle(self.grouper_collapser, out_path, "grouper_collapser")

    @classmethod
    def load(cls, path, session):
        """Restore a classifier previously written by save()."""
        extractor, feature_computer, graph, saver = load_classifier(
            path, NETFeatureExtractor, SyntacticFeatureComputer, session)
        return cls(graph, extractor, feature_computer, session, saver,
                   load_with_pickle(path, "grouper_collapser"))
class ChainedNERClassifier:
    """Context manager over a model directory that is either a plain NER
    model or a chained model with "ner" and "net" sub-directories, where the
    "ner" part may itself be chained recursively.

    Fixes applied: __exit__ now always closes the NER manager and resets
    state even if the NET manager's __exit__ raises, and __enter__ no longer
    leaks an already-entered NER manager when NETClassifier fails to enter.
    """

    _PREDICTION_BATCH_SIZE = get_batch_size()

    def __init__(self, model_path: str):
        self.path = model_path

    def __enter__(self):
        if not self.__is_chained_model():
            # Plain model: delegate entirely to NERClassifier; caller gets
            # that classifier's entered object, not self.
            self.__ner_manager, self.__net_manager = NERClassifier(
                self.path), None
            return self.__ner_manager.__enter__()
        self.__ner_manager = ChainedNERClassifier(join(self.path, "ner"))
        self.__ner = self.__ner_manager.__enter__()
        self.__net_manager = NETClassifier(join(self.path, "net"))
        try:
            self.__net = self.__net_manager.__enter__()
        except BaseException:
            # Don't leak the already-entered NER model if NET fails to load.
            self.__ner_manager.__exit__(None, None, None)
            raise
        return self

    def predict_docs(
            self, docs: List[Document],
            batch_size: int = _PREDICTION_BATCH_SIZE) -> List[List[Entity]]:
        """Run NER, feed its entities to NET via the "ne" extra, and return
        the retyped entities for each document."""
        docs_entities = self.__ner.predict_docs(docs, batch_size)
        docs = [
            doc.with_additional_extras({"ne": ents})
            for doc, ents in zip(docs, docs_entities)
        ]
        entities_typing = self.__net.predict_docs(docs, batch_size)
        return [
            self._type_entities(ents, typing)
            for ents, typing in zip(docs_entities, entities_typing)
        ]

    def predict_doc(self, doc: Document) -> List[Entity]:
        """Single-document variant of predict_docs."""
        entities = self.__ner.predict_doc(doc)
        entities_typing = self.__net.predict_doc(
            doc.with_additional_extras({"ne": entities}))
        return self._type_entities(entities, entities_typing)

    @staticmethod
    def _type_entities(
            entities: List[Entity],
            entities_typing: Dict[Entity, Optional[str]]) -> List[Entity]:
        def type_entity(ent: Entity) -> Entity:
            # Entities absent from the typing dict keep their original type.
            # NOTE(review): an entity present with a None prediction yields
            # with_type(None) — confirm this is intended.
            new_type = entities_typing.get(ent, ent.type)
            return ent.with_type(new_type)

        return list(map(type_entity, entities))

    def __exit__(self, *exc):
        try:
            if self.__net_manager is not None:
                self.__net_manager.__exit__(*exc)
        finally:
            # Always close the NER model and reset state, even if the NET
            # manager's __exit__ raised.
            try:
                self.__ner_manager.__exit__(*exc)
            finally:
                self.__net_manager, self.__ner_manager = None, None
                self.__ner, self.__net = None, None

    def __is_chained_model(self):
        # Chained layout: the directory holds exactly "ner" and "net",
        # both of which are themselves directories.
        path_files = listdir(self.path)
        return set(path_files) == {"ner", "net"} and all(
            isdir(join(self.path, p)) for p in ["ner", "net"])