Example #1
def _get_classifier_features(props, docs):
    classifier_features = {}
    dual_features = {}

    classifier_agreement_size = props.get('classifier_agreement_size', -1)
    if classifier_agreement_size >= 0:
        agreement_types = props.get('agreement_types', [])
        for agreement_type in agreement_types:
            converter = create_categorical_converter(
                {"agreement", "disagreement", "unknown"})
            classifier_features.update(
                create_feature(agreement_type + "_agreement", props, converter,
                               'classifier_agreement'))

    classifier_features.update(
        create_feature(
            'mention_distance', props,
            create_unsigned_integers_converter(props["max_mention_distance"])))

    classifier_features.update(
        create_feature(
            'mention_interrelation', props,
            create_categorical_converter(
                {"CONTAINS", "CONTAINED", "INTERSECTS", "SEPARATED"})))

    classifier_features.update(
        create_feature(
            'classifier_entity_distance', props,
            create_unsigned_integers_converter(props["max_entity_distance"])))

    classifier_features.update(
        create_feature(
            'entities_token_distance_in_classifier', props,
            create_unsigned_integers_converter(
                props["max_token_entities_distance"])))

    classifier_features.update(
        create_feature(
            'entities_sent_distance_in_classifier', props,
            create_unsigned_integers_converter(
                props["max_sent_entities_distance"])))

    dual_features.update(
        create_feature(
            'entities_types_in_classifier', props,
            create_categorical_converter(collect_entities_types(docs))))

    dual_features.update(
        create_feature(
            'head_ne_types', props,
            create_categorical_converter(
                collect_entities_types(docs, extras=True).union({'O'}),
                has_oov=True)))

    classifier_features.update(_get_binary_features(props))

    return duplicate_features_config(dual_features, classifier_features)
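
create_feature itself is not shown in these examples. Below is a hedged reconstruction of the contract implied by the hand-written equivalents in Examples #5, #7 and #8 (a "<name>_size" prop gates the feature; a positive size becomes its embedding_size), with the optional fourth argument guessed from the 'classifier_agreement' call above; it is a sketch, not the project's actual code.

# Hedged reconstruction of create_feature, inferred from the hand-written
# equivalents in the later examples; not the project's actual code.
def create_feature(name, props, converter, size_prop_base=None):
    size = props.get((size_prop_base or name) + "_size", -1)
    if size < 0:
        return {}  # a negative (or missing) size disables the feature
    feature = {"converter": converter}
    if size > 0:
        feature["embedding_size"] = size  # 0 keeps the feature without an embedding
    return {name: feature}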
Example #2
    def test_unsigned_integers_converter_4(self):
        right_border = 4
        additional_labels = {"$SENTEND$", "$HABITAT$"}
        converter = create_unsigned_integers_converter(
            right_border, additional_labels=additional_labels, zero_padding=True)
        indexed_integers = {
            "$PADDING$": 0,
            "$HABITAT$": 1,
            "$SENTEND$": 2,
            0: 3,
            1: 4,
            2: 5,
            3: 6,
            4: 7,
            5: 8,
            6: 8,
            7: 8
        }

        converter_indexed_integers = {key: converter[key] for key in indexed_integers}
        self.assertEqual(indexed_integers, converter_indexed_integers)

        error_keys = ["$OOV$", -1, -2, -3, -1000, "habitat"]
        for key in error_keys:
            # getitem here is presumably operator.getitem
            self.assertRaises(KeyError, getitem, converter, key)
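
The tests pin down the converter's contract: "$PADDING$" (when zero_padding=True) and the sorted additional labels take the leading indices, non-negative integers follow, and everything past right_border collapses into a single overflow index; every other key raises KeyError. A minimal sketch consistent with those expected mappings, inferred from the tests rather than taken from the library:

# Hedged reconstruction inferred from the tests above (and the two later
# converter tests); not the library's real implementation.
class _UnsignedIntegersConverter:

    def __init__(self, right_border, additional_labels=None, zero_padding=False):
        self._labels = {}
        if zero_padding:
            self._labels["$PADDING$"] = 0
        for label in sorted(additional_labels or ()):
            self._labels[label] = len(self._labels)
        self._base = len(self._labels)
        self._right_border = right_border

    def __getitem__(self, key):
        if key in self._labels:
            return self._labels[key]
        # booleans are excluded defensively (an assumption; the tests do not cover them)
        if isinstance(key, int) and not isinstance(key, bool) and key >= 0:
            # values beyond right_border share one overflow index
            return self._base + min(key, self._right_border + 1)
        raise KeyError(key)


def create_unsigned_integers_converter(right_border, additional_labels=None,
                                       zero_padding=False):
    return _UnsignedIntegersConverter(right_border, additional_labels,
                                      zero_padding)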
Example #3
def _get_entities_encoder_features(props, entities_types):
    # None serves as the OOV sentinel; adding it to the set reserves an index for it
    oov_type = None
    entities_types = entities_types.union({oov_type})
    features = {}

    if props.get("entities_types_emb_size", -1) >= 0:
        features['entities_types'] = {
            'converter':
            create_categorical_converter(entities_types,
                                         has_oov=True,
                                         oov_object=oov_type)
        }
        if props["entities_types_emb_size"] != 0:
            features["entities_types"]['embedding_size'] = props[
                "entities_types_emb_size"]

    if props.get("entities_depth_emb_size", -1) >= 0:
        features['entities_depths'] = {
            'converter':
            create_unsigned_integers_converter(props["max_entities_depth"])
        }
        if props["entities_depth_emb_size"] != 0:
            features["entities_depths"]['embedding_size'] = props[
                "entities_depth_emb_size"]

    return features
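
create_categorical_converter can likewise be guessed at from its call sites: categories map to consecutive indices in a deterministic order, and with has_oov=True unknown keys fall back to the index of oov_object. A speculative sketch follows; the ordering, the "$OOV$" default, and the fallback behavior are all assumptions.

# Speculative sketch of create_categorical_converter, inferred from its
# call sites in these examples; not the library's actual code.
def create_categorical_converter(categories, has_oov=False, oov_object="$OOV$"):
    if has_oov:
        categories = set(categories) | {oov_object}
    # key=repr gives a deterministic order even for mixed types (None, bool, str)
    mapping = {cat: i for i, cat in enumerate(sorted(categories, key=repr))}

    class _Converter(dict):
        def __missing__(self, key):
            if has_oov:
                return mapping[oov_object]  # unknown keys fall back to the OOV index
            raise KeyError(key)

    return _Converter(mapping)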
Example #4
    def test_unsigned_integers_converter_3(self):
        right_border = 4
        additional_labels = set()
        converter = create_unsigned_integers_converter(
            right_border,
            additional_labels=additional_labels,
            zero_padding=True)
        indexed_integers = {
            "$PADDING$": 0,
            0: 1,
            1: 2,
            2: 3,
            3: 4,
            4: 5,
            5: 6,
            6: 6,
            7: 6
        }

        converter_indexed_integers = {
            key: converter[key]
            for key in indexed_integers
        }
        self.assertEqual(indexed_integers, converter_indexed_integers)

        error_keys = ["$OOV$", -1, -2, -3, -1000, "habitat"]
        for key in error_keys:
            self.assertRaises(KeyError, getitem, converter, key)
Example #5
def _get_features(props):
    features = {}

    size = props.get("token_position_size", -1)
    if size >= 0:
        feature = {
            "converter":
            create_signed_integers_converter(props["max_word_distance"])
        }
        if size > 0:
            feature["embedding_size"] = size
        features["token_position"] = feature

    size = props.get("token_log_position_size", -1)
    if size >= 0:
        feature = {
            "converter":
            create_signed_log_integers_converter(props["max_word_distance"])
        }
        if size > 0:
            feature["embedding_size"] = size
        features["token_log_position"] = feature

    size = props.get("sent_position_size", -1)
    if size >= 0:
        feature = {
            "converter":
            create_signed_integers_converter(props["max_sent_distance"])
        }
        if size > 0:
            feature["embedding_size"] = size
        features["sent_position"] = feature

    size = props.get("at_root_dt_path_size", -1)
    if size >= 0:
        feature = {"converter": create_categorical_converter({False, True})}
        if size > 0:
            feature["embedding_size"] = size
        features["at_root_dt_path"] = feature

    size = props.get("root_dt_path_position_size", -1)
    if size >= 0:
        feature = {
            "converter":
            create_unsigned_integers_converter(props["max_dt_depth"],
                                               additional_labels={False})
        }
        if size > 0:
            feature["embedding_size"] = size
        features["root_dt_path_position"] = feature

    return features
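
The pattern above recurs throughout these examples: a size prop of -1 (or one that is absent) disables a feature, 0 enables it without an embedding_size, and a positive value becomes the embedding size. An illustrative call, with made-up prop values:

# Illustrative only; the prop values here are made up.
props = {
    "token_position_size": 8,    # enabled, with an 8-dim embedding
    "max_word_distance": 10,
    "at_root_dt_path_size": 0,   # enabled, but no embedding_size entry
    # "sent_position_size" is absent -> defaults to -1 -> feature disabled
}
features = _get_features(props)
# features == {
#     "token_position": {"converter": ..., "embedding_size": 8},
#     "at_root_dt_path": {"converter": ...},
# }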
Example #6
    def test_unsigned_integers_converter_2(self):
        right_border = 0
        additional_labels = set()
        converter = create_unsigned_integers_converter(
            right_border,
            additional_labels=additional_labels,
            zero_padding=False)
        indexed_integers = {0: 0, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1}

        converter_indexed_integers = {
            key: converter[key]
            for key in indexed_integers
        }
        self.assertEqual(indexed_integers, converter_indexed_integers)

        error_keys = ["$OOV$", "$PADDING$", -1, -2, -3, -1000, "habitat"]
        for key in error_keys:
            self.assertRaises(KeyError, getitem, converter, key)
Example #7
def _get_arc_level_features(props, name_postfix="classifier"):
    dual_features = {}
    single_features = {}

    token_distance_feature_name = "arc_token_distance_in_{}".format(
        name_postfix)
    token_distance_size = props.get(token_distance_feature_name + "_size", -1)

    if token_distance_size >= 0:
        single_features[token_distance_feature_name] = {
            "converter":
            create_unsigned_integers_converter(props["max_arc_token_distance"])
        }

        if token_distance_size != 0:
            single_features[token_distance_feature_name][
                "embedding_size"] = token_distance_size

    return duplicate_features_config(dual_features, single_features)
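
duplicate_features_config is likewise not shown. The dual/single split suggests dual features are emitted once per relation argument and merged with the singles; only that split is implied by the call sites, so the suffix scheme below is invented for illustration.

# Hypothetical sketch of duplicate_features_config; the per-argument
# suffixes are made up, not taken from the real code.
def duplicate_features_config(dual_features, single_features):
    features = dict(single_features)
    for name, config in dual_features.items():
        for suffix in ("_first", "_second"):  # hypothetical suffixes
            features[name + suffix] = config
    return features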
Example #8
def _get_relation_level_features(props,
                                 rel_arg_types,
                                 entities_types,
                                 name_postfix="classifier"):
    dual_features = {}
    single_features = {}

    rel_args_feature_name = "rel_args_in_{}".format(name_postfix)
    rel_args_size = props.get(rel_args_feature_name + "_size", -1)

    if rel_args_size >= 0:
        single_features[rel_args_feature_name] = {
            "converter": create_categorical_converter(rel_arg_types)
        }

        if rel_args_size != 0:
            single_features[rel_args_feature_name][
                "embedding_size"] = rel_args_size

    entities_types_feature_name = "entities_types_in_{}".format(name_postfix)
    entities_types_size = props.get(entities_types_feature_name + "_size", -1)

    if entities_types_size >= 0:
        dual_features[entities_types_feature_name] = {
            "converter": create_categorical_converter(entities_types)
        }

        if entities_types_size != 0:
            dual_features[entities_types_feature_name][
                "embedding_size"] = entities_types_size

    token_distance_feature_name = "entities_token_distance_in_{}".format(
        name_postfix)
    token_distance_size = props.get(token_distance_feature_name + "_size", -1)

    if token_distance_size >= 0:
        single_features[token_distance_feature_name] = {
            "converter":
            create_unsigned_integers_converter(
                props["max_token_entities_distance"])
        }

        if token_distance_size != 0:
            single_features[token_distance_feature_name][
                "embedding_size"] = token_distance_size

    token_log_distance_feature_name = "entities_token_log_distance_in_{}".format(
        name_postfix)
    token_log_distance_size = props.get(
        token_log_distance_feature_name + "_size", -1)

    if token_log_distance_size >= 0:
        single_features[token_log_distance_feature_name] = {
            "converter":
            create_unsigned_log_integers_converter(
                props["max_token_entities_distance"])
        }

        if token_log_distance_size != 0:
            single_features[token_log_distance_feature_name][
                "embedding_size"] = token_log_distance_size

    sent_distance_feature_name = "entities_sent_distance_in_{}".format(
        name_postfix)
    sent_distance_size = props.get(sent_distance_feature_name + "_size", -1)

    if sent_distance_size >= 0:
        single_features[sent_distance_feature_name] = {
            "converter":
            create_unsigned_integers_converter(
                props["max_sent_entities_distance"])
        }

        if sent_distance_size != 0:
            single_features[sent_distance_feature_name][
                "embedding_size"] = sent_distance_size

    rel_direction_feature_name = "rel_dir_in_{}".format(name_postfix)
    rel_direction_size = props.get(rel_direction_feature_name + "_size", -1)

    if rel_direction_size >= 0:
        categories = {"e1_e2", "e2_e1", "e1_in_e2", "e2_in_e1"}
        single_features[rel_direction_feature_name] = {
            "converter": create_categorical_converter(categories)
        }

        if rel_direction_size != 0:
            single_features[rel_direction_feature_name][
                "embedding_size"] = rel_direction_size

    return duplicate_features_config(dual_features, single_features)
Example #9
def _init_word_level_features(docs: Iterable[Document], props: dict,
                              morph_features: List[str]):
    features = {}

    if props.get("pos_emb_size", -1) >= 0:
        pos_types = collect_feature_labels(docs, "pos")
        features['pos'] = {
            'converter': create_categorical_converter(pos_types, has_oov=True)
        }
        if props["pos_emb_size"] != 0:
            features['pos']['embedding_size'] = props["pos_emb_size"]

    if props.get("borders_size", -1) >= 0:
        features['borders'] = {
            'converter': create_categorical_converter({'start', 'in', 'end'})
        }
        if props["borders_size"] != 0:
            features["borders"]['embedding_size'] = props["borders_size"]

    if props.get("dt_label_emb_size", -1) >= 0:
        dt_label_types = collect_feature_labels(docs, "dt_labels")
        features['dt_labels'] = {
            'converter': create_categorical_converter(dt_label_types,
                                                      has_oov=True)
        }
        if props["dt_label_emb_size"] != 0:
            features["dt_labels"]['embedding_size'] = props[
                "dt_label_emb_size"]

    if props.get("dt_distance_emb_size", -1) >= 0:
        features['dt_head_distances'] = {
            'converter':
            create_signed_integers_converter(props["max_dt_distance"])
        }
        if props["dt_distance_emb_size"] != 0:
            features["dt_head_distances"]['embedding_size'] = props[
                "dt_distance_emb_size"]

    if props.get("dt_depth_emb_size", -1) >= 0:
        features['dt_depths'] = {
            'converter':
            create_unsigned_integers_converter(props["max_dt_depth"])
        }
        if props["dt_depth_emb_size"] != 0:
            features["dt_depths"]['embedding_size'] = props[
                "dt_depth_emb_size"]

    max_dt_delta = props.get("max_dt_delta", 0)
    if max_dt_delta:
        for direction in [Direction.FORWARD, Direction.BACKWARD]:
            key = "dt_deltas_" + direction.value
            emb_size = props.get(key + "_emb_size", -1)
            if emb_size >= 0:
                features[key] = {
                    'converter':
                    create_signed_integers_converter(
                        max_dt_delta, additional_labels={"$START$"})
                }

                if emb_size != 0:
                    features[key]['embedding_size'] = emb_size

    for direction in [Direction.FORWARD, Direction.BACKWARD]:
        key = "dt_breakups_" + direction.value
        emb_size = props.get(key + "_emb_size", -1)
        if emb_size >= 0:
            features[key] = {
                'converter':
                create_categorical_converter(collect_feature_labels(docs, key))
            }

            if emb_size != 0:
                features[key]['embedding_size'] = emb_size

    if props.get('morph_feats_emb_size', -1) >= 0:
        for feat in morph_features:
            feat_types = collect_feature_labels(docs, feat)
            if not feat_types:
                continue
            features[feat] = {
                'converter': create_categorical_converter(feat_types,
                                                          has_oov=True)
            }
            if props["morph_feats_emb_size"] != 0:
                features[feat]['embedding_size'] = props[
                    "morph_feats_emb_size"]

    return features
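
collect_feature_labels is assumed throughout this example. A plausible shape, assuming a hypothetical per-token accessor on Document (its real API does not appear in these examples):

# Plausible sketch only; token_features_for(...) is a hypothetical helper,
# since Document's real accessor is not shown in these examples.
def collect_feature_labels(docs, feature_name):
    labels = set()
    for doc in docs:
        labels.update(doc.token_features_for(feature_name))
    return labels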