def _get_classifier_features(props, docs):
    """Assemble the feature configs used by the relation classifier.

    Builds two feature dicts and merges them via ``duplicate_features_config``:
      * ``classifier_features`` — classifier-only features (agreement flags,
        mention/entity distances, mention interrelation, binary features);
      * ``dual_features`` — features duplicated for both relation arguments
        (entity types, head NE types).

    Args:
        props: configuration dict; distance caps (``max_mention_distance``,
            ``max_entity_distance``, ``max_token_entities_distance``,
            ``max_sent_entities_distance``) are required, agreement settings
            are optional.
        docs: documents used to collect the sets of entity / NE types.

    Returns:
        Whatever ``duplicate_features_config(dual_features,
        classifier_features)`` produces (project-defined config object).
    """
    classifier_features = {}
    dual_features = {}

    classifier_agreement_size = props.get('classifier_agreement_size', -1)
    if classifier_agreement_size >= 0:
        # The category set is closed and fixed, so build the converter once;
        # the original re-created an identical converter on every iteration.
        agreement_converter = create_categorical_converter(
            {"agreement", "disagreement", "unknown"})
        for agreement_type in props.get('agreement_types', []):
            classifier_features.update(
                create_feature(agreement_type + "_agreement", props,
                               agreement_converter, 'classifier_agreement'))

    classifier_features.update(
        create_feature(
            'mention_distance', props,
            create_unsigned_integers_converter(props["max_mention_distance"])))

    classifier_features.update(
        create_feature(
            'mention_interrelation', props,
            create_categorical_converter(
                {"CONTAINS", "CONTAINED", "INTERSECTS", "SEPARATED"})))

    classifier_features.update(
        create_feature(
            'classifier_entity_distance', props,
            create_unsigned_integers_converter(props["max_entity_distance"])))

    classifier_features.update(
        create_feature(
            'entities_token_distance_in_classifier', props,
            create_unsigned_integers_converter(
                props["max_token_entities_distance"])))

    classifier_features.update(
        create_feature(
            'entities_sent_distance_in_classifier', props,
            create_unsigned_integers_converter(
                props["max_sent_entities_distance"])))

    dual_features.update(
        create_feature(
            'entities_types_in_classifier', props,
            create_categorical_converter(collect_entities_types(docs))))

    # NOTE: use an explicit one-element set here; ``set.union('O')`` only
    # worked because a string is an iterable of characters and 'O' happens
    # to be a single character.
    dual_features.update(
        create_feature(
            'head_ne_types', props,
            create_categorical_converter(
                collect_entities_types(docs, extras=True).union({'O'}),
                has_oov=True)))

    classifier_features.update(_get_binary_features(props))

    return duplicate_features_config(dual_features, classifier_features)
def generate_feature_extractor(docs: Iterable[Document], props: dict, char_padding_size: int = 0):
    """Create an NER feature extractor and its token-features meta.

    Optionally augments the training docs with unquoted-entity variants so
    every feature factory sees the augmented vocabulary, then wires token and
    NE sub-extractors, the labelling strategy and the label converter into a
    ``NERFeatureExtractor``.
    """
    unquote_types = props.get("types_to_unquote", [])
    unquote_prob = props.get("prob_to_unquote", 0.0)

    if unquote_types and unquote_prob:
        # Feed the factories both original and fully-unquoted documents so
        # all reachable feature values are registered up front.
        full_augmentor = EntitiesUnquoteAugmentor(1.0, unquote_types)
        original_docs = docs
        docs = FuncIterable(
            lambda: chain(original_docs,
                          map(full_augmentor.transform, original_docs)))

    token_fe, token_meta = generate_token_feature_extractor(
        docs, props, char_padding_size)
    ne_fe, ne_meta = generate_ne_feature_extractor(docs, props)
    token_meta.basic_meta += ne_meta

    strategy = get_labelling_strategy(props.get("labelling_strategy", "BIO"))
    label_categories = strategy.get_possible_categories(
        collect_entities_types(docs))
    label_converter = create_categorical_converter(
        label_categories, zero_padding=False)

    # At training time unquoting is applied stochastically with the
    # configured probability.
    stochastic_augmentor = EntitiesUnquoteAugmentor(unquote_prob, unquote_types)

    extractor = NERFeatureExtractor(
        token_fe, ne_fe, strategy, label_converter, stochastic_augmentor)
    return extractor, token_meta
def generate_feature_extractor(
        docs: Iterable[Document], props: dict,
        shared_feature_extractor: SpansCommonFeatureExtractor):
    """Build the relation-extraction feature extractor and its metas.

    Collects candidate-pair filters, relation/argument type inventories and
    the categorical converters for encoder, attention and classifier feature
    groups, then assembles a ``NegativeSamplesFilteringFeatureExtractor``.
    """
    ent_types = collect_entities_types(docs)

    filters = [
        DifferentEntitiesCandidateFilter(),
        InSameSentenceCandidateFilter(),
        MaxTokenDistanceCandidateFilter(props['max_candidate_distance']),
    ]
    if props.get("filter_intersecting", False):
        filters.append(IntersectingCandidateFilter())

    # Relations must be collected BEFORE the arg-types filter is added:
    # that filter is derived from these very relations.
    rels = [_filter_doc_rels(doc, AndFilter(filters)) for doc in docs]
    rel_arg_types = create_rel_arg_types(rels)
    filters.append(RelArgTypesCandidateFilter(rel_arg_types))

    candidate_extractor = DefaultCandidateExtractionStrategy(
        DefaultPairExtractionStrategy(AndFilter(filters)))

    rel_dict = create_rel_dict(rels)
    valid_rel_types = collect_valid_rel_types(rels)

    encoder_meta, encoder_converters = get_categorical_meta_converters(
        _get_entities_encoder_features(props, ent_types))

    position_fe, position_meta = generate_token_position_feature_extractor(props)
    attn_token_meta = (position_meta.namespaced('e1')
                       + position_meta.namespaced('e2'))
    attn_rel_meta, attn_converters = get_categorical_meta_converters(
        _get_relation_level_features(props, rel_arg_types, ent_types,
                                     "attention"))
    attention_meta = AttentionFeaturesMeta(attn_token_meta, attn_rel_meta)

    classifier_meta, classifier_converters = get_categorical_meta_converters(
        _get_relation_level_features(props, rel_arg_types, ent_types))

    extractor = NegativeSamplesFilteringFeatureExtractor(
        shared_feature_extractor, rel_dict, encoder_converters,
        position_fe, attn_converters, classifier_converters,
        candidate_extractor, valid_rel_types,
        negative_ratio=props.get("negative_samples_ratio", float("inf")))

    return extractor, Metas(encoder_meta, attention_meta, classifier_meta)
def _get_encoder_features(props, docs):
    """Assemble the entity-encoder feature config.

    Always includes entity-type and NE-type features; conditionally adds
    speech features (when ``speech_size`` >= 0) and morphological-feature
    features (when ``morph_feats_size`` >= 0), one per configured name.

    Args:
        props: configuration dict; reads ``speech_types``/``speech_size``
            and ``morph_feats_list``/``morph_feats_size``.
        docs: documents used to collect entity types and morph feature labels.

    Returns:
        dict mapping feature names to their feature configs.
    """
    encoder_features = {}

    encoder_features.update(
        create_feature(
            'encoder_entity_types', props,
            create_categorical_converter(collect_entities_types(docs),
                                         zero_padding=True)))

    # NOTE: use an explicit one-element set; ``set.union('O')`` only worked
    # because a string is an iterable of characters and 'O' happens to be a
    # single character.
    encoder_features.update(
        create_feature(
            'encoder_entity_ne', props,
            create_categorical_converter(
                collect_entities_types(docs, extras=True).union({'O'}),
                zero_padding=True, has_oov=True, oov_object='O')))

    speech_types = props.get('speech_types', [])
    speech_size = props.get('speech_size', -1)
    if speech_size >= 0:
        for speech_type in speech_types:
            encoder_features.update(
                create_feature(
                    'encoder_' + speech_type, props,
                    create_categorical_converter({True, False},
                                                 zero_padding=True),
                    'speech'))

    feats_types = props.get('morph_feats_list', [])
    feats_size = props.get('morph_feats_size', -1)
    if feats_size >= 0:
        for feat_name in feats_types:
            encoder_features.update(
                create_feature(
                    'encoder_' + feat_name, props,
                    create_categorical_converter(
                        collect_feature_labels(docs, feat_name),
                        zero_padding=True),
                    'morph_feats'))

    return encoder_features
def generate_ne_feature_extractor(docs: Iterable[Document], props: dict):
    """Create the named-entity feature extractor and its meta.

    The 'ne' feature is only configured when ``ne_emb_size`` >= 0; with a
    strictly positive size an embedding of that size is requested, while 0
    means the feature is used without a dedicated embedding.
    """
    feature_config = {}
    strategy = None

    if props.get("ne_emb_size", -1) >= 0:
        ne_types = collect_entities_types(docs, extras=True)
        strategy = get_labelling_strategy(
            props.get("ne_labelling_strategy", "IO"))
        ne_entry = {
            'converter': create_categorical_converter(
                strategy.get_possible_categories(ne_types), has_oov=True)
        }
        emb_size = props["ne_emb_size"]
        if emb_size != 0:
            ne_entry['embedding_size'] = emb_size
        feature_config['ne'] = ne_entry

    meta, converters = get_categorical_meta_converters(feature_config)
    return NEFeatureExtractor(converters, strategy), meta