Esempio n. 1
0
def _get_classifier_features(props, docs):
    """Build classifier-level feature configs plus dual (per-entity) ones.

    :param props: feature configuration properties
    :param docs: documents used to collect entity-type vocabularies
    :return: result of ``duplicate_features_config(dual, single)``
    """
    classifier_features = {}
    dual_features = {}

    classifier_agreement_size = props.get('classifier_agreement_size', -1)
    if classifier_agreement_size >= 0:
        agreement_types = props.get('agreement_types', [])
        # The converter does not depend on the loop variable, so build it
        # once instead of once per agreement type.
        agreement_converter = create_categorical_converter(
            {"agreement", "disagreement", "unknown"})
        for agreement_type in agreement_types:
            classifier_features.update(
                create_feature(agreement_type + "_agreement", props,
                               agreement_converter, 'classifier_agreement'))

    # Distance / interrelation features between the two mentions.
    classifier_features.update(
        create_feature(
            'mention_distance', props,
            create_unsigned_integers_converter(props["max_mention_distance"])))

    classifier_features.update(
        create_feature(
            'mention_interrelation', props,
            create_categorical_converter(
                {"CONTAINS", "CONTAINED", "INTERSECTS", "SEPARATED"})))

    classifier_features.update(
        create_feature(
            'classifier_entity_distance', props,
            create_unsigned_integers_converter(props["max_entity_distance"])))

    classifier_features.update(
        create_feature(
            'entities_token_distance_in_classifier', props,
            create_unsigned_integers_converter(
                props["max_token_entities_distance"])))

    classifier_features.update(
        create_feature(
            'entities_sent_distance_in_classifier', props,
            create_unsigned_integers_converter(
                props["max_sent_entities_distance"])))

    # Dual features are passed separately to duplicate_features_config.
    dual_features.update(
        create_feature(
            'entities_types_in_classifier', props,
            create_categorical_converter(collect_entities_types(docs))))

    dual_features.update(
        create_feature(
            'head_ne_types', props,
            create_categorical_converter(collect_entities_types(
                docs, extras=True).union('O'),
                                         has_oov=True)))

    classifier_features.update(_get_binary_features(props))

    return duplicate_features_config(dual_features, classifier_features)
Esempio n. 2
0
def _init_we_features(docs: Iterable[Document], props: dict):
    """Initialise word-embedding features from the configured models.

    For every model in ``props['models']`` (plus an optional internal
    trainable embedding) a token converter, its preprocessor and a vector
    matrix are created.

    :return: (name -> (converter, preprocessor) mapping, WordEmbeddingsMeta)
    """
    converters_by_name = {}
    feature_configs = []

    next_idx = 0
    for model_config in props.get('models', []):
        reader_type = model_config.get("type", "w2v")
        error_mode = ('ignore'
                      if model_config.get("ignore_utf_errors", False)
                      else 'strict')
        reader = embedding_readers[reader_type](errors=error_mode)

        logger.info(f"Loading {reader_type} model...")
        feature_name = f'words_{next_idx}'

        we_model = reader.read(model_config["path"])
        trainable = model_config.get("trainable", False)
        preprocessor = StandardTokenProcessor.from_props(model_config)

        converter = create_categorical_converter(
            extract_tokens(docs, preprocessor, we_model, trainable),
            has_oov=True)
        vectors = init_vectors(converter, we_model.vector_size, we_model,
                               trainable)

        converters_by_name[feature_name] = (converter, preprocessor)
        feature_configs.append({
            'name': feature_name,
            'vectors': vectors,
            'trainable': trainable
        })

        logger.info("Initialised embeddings ({}, {})".format(
            vectors.shape[0], vectors.shape[1]))
        next_idx += 1

    if props.get("internal_emb_size", 0) != 0:
        # Internal randomly-initialised trainable embedding (no preprocessor).
        feature_name = f'words_{next_idx}'

        converter = create_categorical_converter(
            extract_tokens(docs, trainable=True), has_oov=True)
        vectors = init_vectors(converter,
                               props["internal_emb_size"],
                               trainable=True)

        converters_by_name[feature_name] = (converter, None)
        feature_configs.append({
            'name': feature_name,
            'vectors': vectors,
            'trainable': True
        })

    return converters_by_name, WordEmbeddingsMeta(feature_configs)
Esempio n. 3
0
def _init_we_features(docs: Iterable[Document], props: dict):
    """Initialise word-embedding features from word2vec models.

    For every model in ``props['models']`` (plus an optional internal
    trainable embedding) a token converter, its preprocessor and a vector
    matrix are created.

    :return: (name -> (converter, preprocessor) mapping, WordEmbeddingsMeta)
    """
    converters_by_name = {}
    feature_configs = []

    next_idx = 0
    for model_config in props.get('models', []):
        logger.info("Loading w2v model...")
        feature_name = f'words_{next_idx}'

        we_model = KeyedVectors.load_word2vec_format(
            model_config["path"],
            binary=model_config.get("binary", True),
            datatype=float)
        trainable = model_config.get("trainable", False)
        preprocessor = StandardTokenProcessor.from_props(model_config)

        converter = create_categorical_converter(
            extract_tokens(docs, preprocessor, we_model, trainable),
            has_oov=True)
        vectors = init_vectors(converter, we_model.vector_size, we_model,
                               trainable)

        converters_by_name[feature_name] = (converter, preprocessor)
        feature_configs.append({
            'name': feature_name,
            'vectors': vectors,
            'trainable': trainable
        })

        logger.info("Initialised embeddings ({}, {})".format(
            vectors.shape[0], vectors.shape[1]))
        next_idx += 1

    if props.get("internal_emb_size", 0) != 0:
        # Internal randomly-initialised trainable embedding (no preprocessor).
        feature_name = f'words_{next_idx}'

        converter = create_categorical_converter(
            extract_tokens(docs, trainable=True), has_oov=True)
        vectors = init_vectors(converter,
                               props["internal_emb_size"],
                               trainable=True)

        converters_by_name[feature_name] = (converter, None)
        feature_configs.append({
            'name': feature_name,
            'vectors': vectors,
            'trainable': True
        })

    return converters_by_name, WordEmbeddingsMeta(feature_configs)
Esempio n. 4
0
def _get_entities_encoder_features(props, entities_types):
    """Build encoder-side entity feature configs from *props* sizes.

    A feature is enabled when its ``*_emb_size`` prop is >= 0; a non-zero
    size additionally requests an embedding of that size.
    """
    # ``None`` doubles as the OOV object for entity types.
    oov_type = None
    all_types = entities_types.union({oov_type})
    features = {}

    types_size = props.get("entities_types_emb_size", -1)
    if types_size >= 0:
        entry = {
            'converter':
            create_categorical_converter(all_types,
                                         has_oov=True,
                                         oov_object=oov_type)
        }
        if types_size != 0:
            entry['embedding_size'] = types_size
        features['entities_types'] = entry

    depth_size = props.get("entities_depth_emb_size", -1)
    if depth_size >= 0:
        entry = {
            'converter':
            create_unsigned_integers_converter(props["max_entities_depth"])
        }
        if depth_size != 0:
            entry['embedding_size'] = depth_size
        features['entities_depths'] = entry

    return features
Esempio n. 5
0
def create_rel_dict(rels: list):
    """Return a categorical converter over every relation type in *rels*.

    ``None`` is always included (the no-relation label).
    """
    rel_types = {None} | {rel.type for doc_rels in rels for rel in doc_rels}
    return create_categorical_converter(rel_types, zero_padding=False)
Esempio n. 6
0
def generate_feature_extractor(docs: Iterable[Document],
                               props: dict,
                               char_padding_size: int = 0):
    """Assemble the NET feature extractor together with its metas.

    :param docs: documents used to collect type vocabularies
    :param props: feature configuration properties
    :param char_padding_size: padding size forwarded to the token-level
        feature extractor
    :return: (NETFeatureExtractor, Metas, token features meta)
    """
    token_feature_extractor, token_features_meta = generate_token_feature_extractor(
        docs, props, char_padding_size)
    # Map each NE type to its allowed labels, excluding restricted NE types.
    types_mapping = _get_ne_type_to_label_mapping(
        docs, set(props.get("restricted_ne_types", set())))
    valid_ne_types = types_mapping.keys()
    valid_ent_types = set(chain.from_iterable(types_mapping.values()))
    types_converter = create_categorical_converter(valid_ent_types,
                                                   zero_padding=False)

    token_position_fe, tp_meta = generate_token_position_feature_extractor(
        props)

    # NE-type features are built twice: once for attention, once for the
    # classifier (different "_size" props via the name postfix).
    attention_ent_meta, attention_converters = get_categorical_meta_converters(
        _get_features(props, valid_ne_types, "attention"))

    attention_meta = AttentionFeaturesMeta(tp_meta, attention_ent_meta)

    classifier_meta, classifier_converters = get_categorical_meta_converters(
        _get_features(props, valid_ne_types))

    feature_extractor = NETFeatureExtractor(token_feature_extractor,
                                            token_position_fe,
                                            attention_converters,
                                            classifier_converters,
                                            types_converter, types_mapping)
    # no encoder meta in this task
    metas = Metas(get_empty_basic_meta(), attention_meta, classifier_meta)

    return feature_extractor, metas, token_features_meta
Esempio n. 7
0
def generate_feature_extractor(
        docs: Iterable[Document], props: dict,
        shared_feature_extractor: SpansCommonFeatureExtractor):
    """Assemble the parser (arc-level) feature extractor and its metas.

    :param docs: documents used to collect the possible arc types
    :param props: feature configuration properties
    :param shared_feature_extractor: span-level extractor shared with other
        tasks
    :return: (ParserFeatureExtractor, Metas)
    """

    strategy = _get_sampling_strategy(props)
    arc_converter = create_categorical_converter(
        strategy.get_possible_arc_types(docs), zero_padding=False)

    token_position_fe, tp_meta = generate_token_position_feature_extractor(
        props)
    # Token-position meta is duplicated under 'head' and 'dep' namespaces,
    # one copy per arc endpoint.
    attention_token_meta = tp_meta.namespaced('head') + tp_meta.namespaced(
        'dep')

    attention_arc_meta, attention_converters = get_categorical_meta_converters(
        _get_arc_level_features(props, "attention"))

    attention_meta = AttentionFeaturesMeta(attention_token_meta,
                                           attention_arc_meta)

    classifier_meta, classifier_converters = get_categorical_meta_converters(
        _get_arc_level_features(props))

    feature_extractor = ParserFeatureExtractor(shared_feature_extractor,
                                               arc_converter,
                                               token_position_fe,
                                               attention_converters,
                                               classifier_converters, strategy)

    return feature_extractor, Metas(get_empty_basic_meta(), attention_meta,
                                    classifier_meta)
Esempio n. 8
0
def generate_feature_extractor(docs: Iterable[Document], props: dict, char_padding_size: int = 0):
    """Assemble the NER feature extractor and its token features meta.

    :param docs: training documents (may be lazily augmented, see below)
    :param props: feature configuration properties
    :param char_padding_size: padding size forwarded to the token-level
        feature extractor
    :return: (NERFeatureExtractor, token features meta)
    """
    types_to_unquote = props.get("types_to_unquote", [])
    unquote_prob = props.get("prob_to_unquote", 0.0)

    if types_to_unquote and unquote_prob:
        # concat augmented docs with original ones to be sure all possible features are processed by FE factories
        # (augmentation probability 1.0 here so every unquoted variant is seen;
        # the runtime augmentor below uses the configured probability instead)
        augmentor = EntitiesUnquoteAugmentor(1.0, types_to_unquote)
        prev_docs = docs
        docs = FuncIterable(lambda: chain(prev_docs, map(augmentor.transform, prev_docs)))

    token_feature_extractor, token_features_meta = generate_token_feature_extractor(docs, props, char_padding_size)

    ne_feature_extractor, ne_meta = generate_ne_feature_extractor(docs, props)
    token_features_meta.basic_meta += ne_meta

    ent_types = collect_entities_types(docs)

    # Label set is derived from the chosen labelling scheme (default BIO).
    labelling_strategy = get_labelling_strategy(props.get("labelling_strategy", "BIO"))
    labels_converter = create_categorical_converter(
        labelling_strategy.get_possible_categories(ent_types),
        zero_padding=False
    )
    prob_augmentor = EntitiesUnquoteAugmentor(unquote_prob, types_to_unquote)
    feature_extractor = NERFeatureExtractor(
        token_feature_extractor, ne_feature_extractor, labelling_strategy, labels_converter, prob_augmentor)

    return feature_extractor, token_features_meta
Esempio n. 9
0
def _init_char_level_features(docs: Iterable[Document], props: dict,
                              char_padding_size: int):
    """Configure the character feature when char embeddings are enabled.

    Returns an empty dict unless ``props['char_embedding_size']`` is > 0.
    """
    emb_size = props.get("char_embedding_size", -1)
    if emb_size <= 0:
        return {}
    return {
        'chars': {
            'converter':
            create_categorical_converter(collect_chars_set(docs),
                                         has_oov=True),
            'embedding_size': emb_size,
            'padding_size': char_padding_size
        }
    }
def _get_encoder_features(props, docs):
    """Build encoder-side feature configs from props and docs."""
    features = {}

    features.update(
        create_feature(
            'encoder_entity_types', props,
            create_categorical_converter(collect_entities_types(docs),
                                         zero_padding=True)))

    ne_labels = collect_entities_types(docs, extras=True).union('O')
    features.update(
        create_feature(
            'encoder_entity_ne', props,
            create_categorical_converter(ne_labels,
                                         zero_padding=True,
                                         has_oov=True,
                                         oov_object='O')))

    # One boolean feature per configured speech type.
    if props.get('speech_size', -1) >= 0:
        for speech_type in props.get('speech_types', []):
            bool_converter = create_categorical_converter({True, False},
                                                          zero_padding=True)
            features.update(
                create_feature('encoder_' + speech_type, props,
                               bool_converter, 'speech'))

    # One categorical feature per configured morphological feature name.
    if props.get('morph_feats_size', -1) >= 0:
        for feat_name in props.get('morph_feats_list', []):
            labels = collect_feature_labels(docs, feat_name)
            features.update(
                create_feature('encoder_' + feat_name, props,
                               create_categorical_converter(
                                   labels, zero_padding=True),
                               'morph_feats'))
    return features
def _get_features(props):
    """Build token-position feature configs driven by *props*.

    Each feature ``<name>`` is enabled when ``props["<name>_size"]`` is
    >= 0; a positive size additionally requests an embedding of that size.

    :param props: feature configuration properties
    :return: mapping feature name -> feature config dict
    """
    features = {}

    def _add(name, converter_factory):
        # Register feature *name* if its "<name>_size" prop enables it.
        # The factory is only invoked for enabled features, so converter
        # props (e.g. "max_word_distance") stay optional otherwise.
        size = props.get(name + "_size", -1)
        if size < 0:
            return
        feature = {"converter": converter_factory()}
        if size > 0:
            feature["embedding_size"] = size
        features[name] = feature

    _add("token_position",
         lambda: create_signed_integers_converter(props["max_word_distance"]))
    _add("token_log_position",
         lambda: create_signed_log_integers_converter(
             props["max_word_distance"]))
    _add("sent_position",
         lambda: create_signed_integers_converter(props["max_sent_distance"]))
    _add("at_root_dt_path",
         lambda: create_categorical_converter({False, True}))
    _add("root_dt_path_position",
         lambda: create_unsigned_integers_converter(
             props["max_dt_depth"], additional_labels={False}))

    return features
Esempio n. 12
0
def _get_features(props, ne_types, name_postfix="classifier"):
    """Build the NE-type feature config for the given component.

    :param props: feature configuration properties
    :param ne_types: set of valid NE types
    :param name_postfix: component name used in the feature/prop keys
    :return: mapping with at most one entry, "ne_type_in_<postfix>"
    """
    feature_name = "ne_type_in_{}".format(name_postfix)
    size = props.get(feature_name + "_size", -1)
    if size < 0:
        return {}

    config = {"converter": create_categorical_converter(ne_types)}
    if size > 0:
        config["embedding_size"] = size
    return {feature_name: config}
Esempio n. 13
0
    def test_custom_oov_in_set(self):
        """An oov_object already in the set maps unknown keys to its index."""
        categories = {"$HABITAT$", "$BACTERIA$", "hello", "laboratory"}
        converter = create_categorical_converter(
            categories, zero_padding=False, has_oov=True, oov_object="hello")
        expected = {
            "$BACTERIA$": 0,
            "$HABITAT$": 1,
            "hello": 2,
            "laboratory": 3,
            '1': 2,
            1: 2,
            'privet': 2,
        }

        actual = {key: converter[key] for key in expected}
        self.assertEqual(expected, actual)
Esempio n. 14
0
    def test_categorical_converter_1(self):
        """Without OOV, only the original categories are convertible."""
        categories = {"$HABITAT$", "$BACTERIA$", "hello", "laboratory"}
        converter = create_categorical_converter(
            categories, zero_padding=False, has_oov=False)
        expected = {
            "$BACTERIA$": 0,
            "$HABITAT$": 1,
            "hello": 2,
            "laboratory": 3
        }

        actual = {key: converter[key] for key in expected}
        self.assertEqual(expected, actual)

        # Any key outside the category set must raise KeyError.
        for bad_key in ["$OOV$", "$PADDING$", 0, 1, "privet"]:
            self.assertRaises(KeyError, getitem, converter, bad_key)
Esempio n. 15
0
    def test_categorical_converter_4(self):
        """With OOV enabled, unknown keys map to the $OOV$ index."""
        categories = {"$HABITAT$", "$BACTERIA$", "hello", "laboratory"}
        converter = create_categorical_converter(
            categories, zero_padding=False, has_oov=True)
        expected = {
            "$BACTERIA$": 0,
            "$HABITAT$": 1,
            "hello": 2,
            "laboratory": 3,
            "$OOV$": 4,
            '1': 4,
            1: 4,
            'privet': 4,
        }

        actual = {key: converter[key] for key in expected}
        self.assertEqual(expected, actual)
        # Padding is not part of the vocabulary when zero_padding=False.
        self.assertRaises(KeyError, getitem, converter, "$PADDING$")
Esempio n. 16
0
    def test_categorical_converter_3(self):
        """With padding and OOV, padding gets index 0 and OOV the last index."""
        categories = {"$HABITAT$", "$BACTERIA$", "hello", "laboratory"}
        converter = create_categorical_converter(
            categories, zero_padding=True, has_oov=True)
        expected = {
            "$PADDING$": 0,
            "$BACTERIA$": 1,
            "$HABITAT$": 2,
            "hello": 3,
            "laboratory": 4,
            "$OOV$": 5,
            '1': 5,
            1: 5,
            'privet': 5,
        }

        actual = {key: converter[key] for key in expected}
        self.assertEqual(expected, actual)
Esempio n. 17
0
def generate_ne_feature_extractor(docs: Iterable[Document], props: dict):
    """Build the NE feature extractor and its meta.

    The 'ne' feature is enabled when ``props['ne_emb_size']`` is >= 0; a
    non-zero size additionally requests an embedding of that size.
    """
    features = {}
    labelling_strategy = None

    if props.get("ne_emb_size", -1) >= 0:
        ne_types = collect_entities_types(docs, extras=True)
        labelling_strategy = get_labelling_strategy(
            props.get("ne_labelling_strategy", "IO"))
        categories = labelling_strategy.get_possible_categories(ne_types)
        ne_config = {
            'converter': create_categorical_converter(categories,
                                                      has_oov=True)
        }
        emb_size = props["ne_emb_size"]
        if emb_size != 0:
            ne_config['embedding_size'] = emb_size
        features['ne'] = ne_config

    meta, converters = get_categorical_meta_converters(features)
    return NEFeatureExtractor(converters, labelling_strategy), meta
def generate_gazetteers_feature_extractors(props: dict):
    """Build one boolean feature and extractor per configured gazetteer.

    :param props: configuration containing a 'gazetteers' list
    :return: (meta, name -> GazetteerFeatureExtractor mapping)
    """
    features = {}
    extractors = {}
    # A single membership converter is shared by all gazetteer features.
    bool_converter = create_categorical_converter({True, False},
                                                  zero_padding=True,
                                                  has_oov=False)

    for index, config in enumerate(props.get('gazetteers', [])):
        name = f"gazetteer_{index}"
        feature = {'converter': bool_converter}
        emb_size = config.get('emb_size', -1)
        if emb_size > 0:
            feature['embedding_size'] = emb_size
        features[name] = feature

        processor = StandardTokenProcessor.from_props(config)
        entries = set(map(processor, _read_gazetteer(config["path"])))

        extractors[name] = GazetteerFeatureExtractor(
            entries, processor, bool_converter,
            config.get("lemmatize", False))

    meta, _ = get_categorical_meta_converters(features)
    return meta, extractors
Esempio n. 19
0
def _get_relation_level_features(props,
                                 rel_arg_types,
                                 entities_types,
                                 name_postfix="classifier"):
    """Build relation-level feature configs for the given component.

    Each feature "<base>_in_<postfix>" is enabled when
    ``props["<base>_in_<postfix>_size"]`` is >= 0; a non-zero size
    additionally requests an embedding of that size. Dual features are
    passed separately to ``duplicate_features_config``.

    :param props: feature configuration properties
    :param rel_arg_types: possible relation-argument types
    :param entities_types: possible entity types
    :param name_postfix: component name used in feature/prop keys
    :return: result of ``duplicate_features_config(dual, single)``
    """
    dual_features = {}
    single_features = {}

    def _add(target, base_name, converter_factory):
        # Register "<base>_in_<postfix>" in *target* if enabled by props.
        # The factory is only invoked for enabled features, so converter
        # props (e.g. "max_token_entities_distance") stay optional.
        feature_name = "{}_in_{}".format(base_name, name_postfix)
        size = props.get(feature_name + "_size", -1)
        if size < 0:
            return
        feature = {"converter": converter_factory()}
        if size != 0:
            feature["embedding_size"] = size
        target[feature_name] = feature

    _add(single_features, "rel_args",
         lambda: create_categorical_converter(rel_arg_types))
    _add(dual_features, "entities_types",
         lambda: create_categorical_converter(entities_types))
    _add(single_features, "entities_token_distance",
         lambda: create_unsigned_integers_converter(
             props["max_token_entities_distance"]))
    _add(single_features, "entities_token_log_distance",
         lambda: create_unsigned_log_integers_converter(
             props["max_token_entities_distance"]))
    _add(single_features, "entities_sent_distance",
         lambda: create_unsigned_integers_converter(
             props["max_sent_entities_distance"]))
    _add(single_features, "rel_dir",
         lambda: create_categorical_converter(
             {"e1_e2", "e2_e1", "e1_in_e2", "e2_in_e1"}))

    return duplicate_features_config(dual_features, single_features)
Esempio n. 20
0
def _init_word_level_features(docs: Iterable[Document], props: dict,
                              morph_features: List[str]):
    """Build word-level feature configs from props and docs.

    Each feature is enabled when its ``*_emb_size`` / ``*_size`` prop is
    >= 0 (or, for dt_deltas, when ``max_dt_delta`` is non-zero); a non-zero
    size additionally requests an embedding of that size.

    :param docs: documents used to collect label vocabularies
    :param props: feature configuration properties
    :param morph_features: morphological feature names to consider
    :return: mapping feature name -> feature config dict
    """
    features = {}

    # Part-of-speech tags.
    if props.get("pos_emb_size", -1) >= 0:
        pos_types = collect_feature_labels(docs, "pos")
        features['pos'] = {
            'converter': create_categorical_converter(pos_types, has_oov=True)
        }
        if props["pos_emb_size"] != 0:
            features['pos']['embedding_size'] = props["pos_emb_size"]

    # Token position within its sentence boundary (start / in / end).
    if props.get("borders_size", -1) >= 0:
        features['borders'] = {
            'converter': create_categorical_converter({'start', 'in', 'end'})
        }
        if props["borders_size"] != 0:
            features["borders"]['embedding_size'] = props["borders_size"]

    # Dependency-tree arc labels.
    if props.get("dt_label_emb_size", -1) >= 0:
        dt_label_types = collect_feature_labels(docs, "dt_labels")
        features['dt_labels'] = {
            'converter': create_categorical_converter(dt_label_types,
                                                      has_oov=True)
        }
        if props["dt_label_emb_size"] != 0:
            features["dt_labels"]['embedding_size'] = props[
                "dt_label_emb_size"]

    # Signed distance to the dependency-tree head.
    if props.get("dt_distance_emb_size", -1) >= 0:
        features['dt_head_distances'] = {
            'converter':
            create_signed_integers_converter(props["max_dt_distance"])
        }
        if props["dt_distance_emb_size"] != 0:
            features["dt_head_distances"]['embedding_size'] = props[
                "dt_distance_emb_size"]

    # Depth in the dependency tree.
    if props.get("dt_depth_emb_size", -1) >= 0:
        features['dt_depths'] = {
            'converter':
            create_unsigned_integers_converter(props["max_dt_depth"])
        }
        if props["dt_depth_emb_size"] != 0:
            features["dt_depths"]['embedding_size'] = props[
                "dt_depth_emb_size"]

    # Per-direction dt_deltas features, gated by a shared max_dt_delta.
    max_dt_delta = props.get("max_dt_delta", 0)
    if max_dt_delta:
        for direction in [Direction.FORWARD, Direction.BACKWARD]:
            key = "dt_deltas_" + direction.value
            emb_size = props.get(key + "_emb_size", -1)
            if emb_size >= 0:
                features[key] = {
                    'converter':
                    create_signed_integers_converter(
                        max_dt_delta, additional_labels={"$START$"})
                }

                if emb_size != 0:
                    features[key]['embedding_size'] = emb_size

    # Per-direction dt_breakups features with label sets from the docs.
    for direction in [Direction.FORWARD, Direction.BACKWARD]:
        key = "dt_breakups_" + direction.value
        emb_size = props.get(key + "_emb_size", -1)
        if emb_size >= 0:
            features[key] = {
                'converter':
                create_categorical_converter(collect_feature_labels(docs, key))
            }

            if emb_size != 0:
                features[key]['embedding_size'] = emb_size

    # One feature per morphological feature name; a single shared emb size.
    if props.get('morph_feats_emb_size', -1) >= 0:
        for feat in morph_features:
            feat_types = collect_feature_labels(docs, feat)
            # Skip features that never occur in the given documents.
            if not feat_types:
                continue
            features[feat] = {
                'converter': create_categorical_converter(feat_types,
                                                          has_oov=True)
            }
            if props["morph_feats_emb_size"] != 0:
                features[feat]['embedding_size'] = props[
                    "morph_feats_emb_size"]

    return features
Esempio n. 21
0
def _create_binary_feature(size):
    """Config for a boolean feature; a non-zero *size* requests an embedding."""
    config = {"converter": create_categorical_converter({True, False})}
    if size:
        config["embedding_size"] = size
    return config