Code example #1
File: factory.py — project: wayne9qiu/derek
def _get_classifier_features(props, docs):
    """Assemble the feature configs used by the classifier.

    Builds two dicts of feature specs — `shared` (features duplicated for
    both mentions, e.g. entity/NE types) and `single` (pairwise features
    such as distances and agreement) — and merges them via
    ``duplicate_features_config``.
    """
    single = {}
    shared = {}

    # Agreement features are optional: only emitted when an embedding size
    # for them is configured (>= 0).
    if props.get('classifier_agreement_size', -1) >= 0:
        for kind in props.get('agreement_types', []):
            agreement_converter = create_categorical_converter(
                {"agreement", "disagreement", "unknown"})
            single.update(
                create_feature(kind + "_agreement", props,
                               agreement_converter, 'classifier_agreement'))

    # Pairwise distance / relative-position features. Converters are built
    # in the same order as the feature definitions.
    single.update(
        create_feature(
            'mention_distance', props,
            create_unsigned_integers_converter(props["max_mention_distance"])))
    single.update(
        create_feature(
            'mention_interrelation', props,
            create_categorical_converter(
                {"CONTAINS", "CONTAINED", "INTERSECTS", "SEPARATED"})))
    single.update(
        create_feature(
            'classifier_entity_distance', props,
            create_unsigned_integers_converter(props["max_entity_distance"])))
    single.update(
        create_feature(
            'entities_token_distance_in_classifier', props,
            create_unsigned_integers_converter(
                props["max_token_entities_distance"])))
    single.update(
        create_feature(
            'entities_sent_distance_in_classifier', props,
            create_unsigned_integers_converter(
                props["max_sent_entities_distance"])))

    # Per-mention features, duplicated for each side of the pair.
    shared.update(
        create_feature(
            'entities_types_in_classifier', props,
            create_categorical_converter(collect_entities_types(docs))))
    shared.update(
        create_feature(
            'head_ne_types', props,
            # 'O' marks the outside-any-entity label; OOV enabled because
            # unseen NE types may appear at prediction time.
            create_categorical_converter(collect_entities_types(
                docs, extras=True).union('O'),
                                         has_oov=True)))

    single.update(_get_binary_features(props))

    return duplicate_features_config(shared, single)
Code example #2
def generate_feature_extractor(docs: Iterable[Document], props: dict, char_padding_size: int = 0):
    """Create the NER feature extractor and its combined features meta.

    Returns a ``(NERFeatureExtractor, token_features_meta)`` pair.
    NOTE(review): `docs` is iterated several times by the factories below —
    callers presumably pass a re-iterable collection, not a one-shot
    generator; confirm against call sites.
    """
    unquote_types = props.get("types_to_unquote", [])
    unquote_prob = props.get("prob_to_unquote", 0.0)

    if unquote_types and unquote_prob:
        # Append a fully-unquoted copy of every doc so the FE factories
        # observe all feature values the augmentor can produce at train time.
        full_augmentor = EntitiesUnquoteAugmentor(1.0, unquote_types)
        original_docs = docs
        docs = FuncIterable(
            lambda: chain(original_docs,
                          map(full_augmentor.transform, original_docs)))

    token_fe, token_meta = generate_token_feature_extractor(
        docs, props, char_padding_size)

    ne_fe, ne_meta = generate_ne_feature_extractor(docs, props)
    token_meta.basic_meta += ne_meta

    entity_types = collect_entities_types(docs)

    strategy = get_labelling_strategy(props.get("labelling_strategy", "BIO"))
    # Labels are targets, not inputs, so no zero-padding slot is reserved.
    label_converter = create_categorical_converter(
        strategy.get_possible_categories(entity_types), zero_padding=False)

    extractor = NERFeatureExtractor(
        token_fe, ne_fe, strategy, label_converter,
        EntitiesUnquoteAugmentor(unquote_prob, unquote_types))

    return extractor, token_meta
Code example #3
File: factory.py — project: ispras-texterra/derek
def generate_feature_extractor(
        docs: Iterable[Document], props: dict,
        shared_feature_extractor: SpansCommonFeatureExtractor):
    """Build the relation-extraction feature extractor and its metas.

    Returns ``(NegativeSamplesFilteringFeatureExtractor, Metas)`` where the
    Metas bundle encoder, attention and classifier feature metadata.
    """
    entity_types = collect_entities_types(docs)

    # Base candidate-pair filters; more are appended below, so the final
    # AndFilter must be constructed only after all appends.
    filters = [
        DifferentEntitiesCandidateFilter(),
        InSameSentenceCandidateFilter(),
        MaxTokenDistanceCandidateFilter(props['max_candidate_distance']),
    ]
    if props.get("filter_intersecting", False):
        filters.append(IntersectingCandidateFilter())

    # Collect gold relations surviving the current filters, then restrict
    # candidates further to argument-type pairs actually seen in the data.
    doc_rels = [_filter_doc_rels(d, AndFilter(filters)) for d in docs]
    arg_types = create_rel_arg_types(doc_rels)
    filters.append(RelArgTypesCandidateFilter(arg_types))
    candidates = DefaultCandidateExtractionStrategy(
        DefaultPairExtractionStrategy(AndFilter(filters)))

    rel_dict = create_rel_dict(doc_rels)
    valid_rel_types = collect_valid_rel_types(doc_rels)

    encoder_meta, encoder_converters = get_categorical_meta_converters(
        _get_entities_encoder_features(props, entity_types))

    position_fe, position_meta = generate_token_position_feature_extractor(
        props)
    # Token-position features are produced relative to each entity of the pair.
    att_token_meta = (position_meta.namespaced('e1')
                      + position_meta.namespaced('e2'))

    att_rel_meta, att_converters = get_categorical_meta_converters(
        _get_relation_level_features(props, arg_types, entity_types,
                                     "attention"))
    att_meta = AttentionFeaturesMeta(att_token_meta, att_rel_meta)

    cls_meta, cls_converters = get_categorical_meta_converters(
        _get_relation_level_features(props, arg_types, entity_types))

    extractor = NegativeSamplesFilteringFeatureExtractor(
        shared_feature_extractor,
        rel_dict,
        encoder_converters,
        position_fe,
        att_converters,
        cls_converters,
        candidates,
        valid_rel_types,
        # inf ratio => keep every negative sample by default
        negative_ratio=props.get("negative_samples_ratio", float("inf")))

    return extractor, Metas(encoder_meta, att_meta, cls_meta)
Code example #4
def _get_encoder_features(props, docs):
    """Collect the encoder-side categorical feature configs.

    Always includes entity-type and NE-type features; speech and
    morphological features are added only when their embedding sizes are
    configured (>= 0).
    """
    features = {}

    features.update(
        create_feature(
            'encoder_entity_types', props,
            create_categorical_converter(collect_entities_types(docs),
                                         zero_padding=True)))

    features.update(
        create_feature(
            'encoder_entity_ne', props,
            # 'O' = outside-any-entity; unseen NE types fall back to it.
            create_categorical_converter(collect_entities_types(
                docs, extras=True).union('O'),
                                         zero_padding=True,
                                         has_oov=True,
                                         oov_object='O')))

    if props.get('speech_size', -1) >= 0:
        for speech in props.get('speech_types', []):
            features.update(
                create_feature(
                    'encoder_' + speech, props,
                    create_categorical_converter({True, False},
                                                 zero_padding=True), 'speech'))

    if props.get('morph_feats_size', -1) >= 0:
        for morph in props.get('morph_feats_list', []):
            features.update(
                create_feature(
                    'encoder_' + morph, props,
                    create_categorical_converter(collect_feature_labels(
                        docs, morph),
                                                 zero_padding=True),
                    'morph_feats'))

    return features
Code example #5
def generate_ne_feature_extractor(docs: Iterable[Document], props: dict):
    """Build the NE (named-entity tag) feature extractor and its meta.

    The feature is disabled entirely when ``ne_emb_size`` is absent or
    negative; a size of 0 means "one-hot" (no explicit embedding size).
    """
    ne_features = {}
    strategy = None

    emb_size = props.get("ne_emb_size", -1)
    if emb_size >= 0:
        strategy = get_labelling_strategy(
            props.get("ne_labelling_strategy", "IO"))
        categories = strategy.get_possible_categories(
            collect_entities_types(docs, extras=True))
        # OOV enabled: prediction-time NE labels may be unseen in training.
        spec = {'converter': create_categorical_converter(categories,
                                                          has_oov=True)}
        if emb_size != 0:
            spec['embedding_size'] = emb_size
        ne_features['ne'] = spec

    meta, converters = get_categorical_meta_converters(ne_features)
    return NEFeatureExtractor(converters, strategy), meta