def _get_features(props):
    """Build the feature descriptors that are enabled in ``props``.

    Each feature ``<name>`` is controlled by the ``<name>_size`` entry of
    ``props``:

    * missing or negative -- the feature is skipped;
    * ``0`` -- the feature gets a converter but no ``"embedding_size"``;
    * positive -- the value is additionally stored as ``"embedding_size"``.

    The ``max_word_distance``, ``max_sent_distance`` and ``max_dt_depth``
    entries are read with ``props[...]`` only when a feature that needs them
    is enabled, so a missing one raises ``KeyError`` only in that case.

    Returns:
        dict mapping feature name to ``{"converter": ...}`` with an optional
        ``"embedding_size"`` key.
    """
    # Converter factories are wrapped in lambdas so that nothing is built (and
    # no props key is read) for a disabled feature.
    converter_factories = (
        ("token_position",
         lambda: create_signed_integers_converter(
             props["max_word_distance"])),
        ("token_log_position",
         lambda: create_signed_log_integers_converter(
             props["max_word_distance"])),
        ("sent_position",
         lambda: create_signed_integers_converter(
             props["max_sent_distance"])),
        ("at_root_dt_path",
         lambda: create_categorical_converter({False, True})),
        ("root_dt_path_position",
         lambda: create_unsigned_integers_converter(
             props["max_dt_depth"], additional_labels={False})),
    )

    result = {}
    for name, build_converter in converter_factories:
        emb_size = props.get(name + "_size", -1)
        if emb_size >= 0:
            descriptor = {"converter": build_converter()}
            if emb_size > 0:
                descriptor["embedding_size"] = emb_size
            result[name] = descriptor
    return result
Beispiel #2
0
    def test_signed_integers_converter_5(self):
        # Scenario: offset 4, one additional label, zero_padding disabled.
        extra_labels = {"privet"}
        max_offset = 4
        converter = create_signed_integers_converter(
            max_offset, additional_labels=extra_labels, zero_padding=False)

        expected = {
            # -1..-4 are expected at distinct indices
            -1: 0, -2: 1, -3: 2, -4: 3,
            # -5..-8 are expected to share a single index
            -5: 4, -6: 4, -7: 4, -8: 4,
            0: 5,
            # 1..4 are expected at distinct indices
            1: 6, 2: 7, 3: 8, 4: 9,
            # 5..7 are expected to share a single index
            5: 10, 6: 10, 7: 10,
            # the additional label is expected after all integers
            "privet": 11,
        }

        actual = {}
        for key in expected:
            actual[key] = converter[key]
        self.assertEqual(expected, actual)

        # None of these keys is expected to be known to the converter.
        for missing in ("$OOV$", "$habitat$", "$PADDING$"):
            with self.assertRaises(KeyError):
                getitem(converter, missing)
Beispiel #3
0
    def test_signed_integers_converter_4(self):
        # Scenario: offset 4, one additional label, zero_padding enabled.
        extra_labels = {"privet"}
        max_offset = 4
        converter = create_signed_integers_converter(
            max_offset, additional_labels=extra_labels, zero_padding=True)

        expected = {
            # the padding label is expected at index 0
            "$PADDING$": 0,
            # -1..-4 are expected at distinct indices
            -1: 1, -2: 2, -3: 3, -4: 4,
            # -5..-8 are expected to share a single index
            -5: 5, -6: 5, -7: 5, -8: 5,
            0: 6,
            # 1..4 are expected at distinct indices
            1: 7, 2: 8, 3: 9, 4: 10,
            # 5..7 are expected to share a single index
            5: 11, 6: 11, 7: 11,
            # the additional label is expected after all integers
            "privet": 12,
        }

        actual = {}
        for key in expected:
            actual[key] = converter[key]
        self.assertEqual(expected, actual)

        # None of these keys is expected to be known to the converter.
        for missing in ("$OOV$", "$habitat$"):
            with self.assertRaises(KeyError):
                getitem(converter, missing)
Beispiel #4
0
    def test_signed_integers_converter_2(self):
        # Scenario: offset 0, no additional labels, zero_padding disabled.
        extra_labels = set()
        max_offset = 0
        converter = create_signed_integers_converter(
            max_offset, additional_labels=extra_labels, zero_padding=False)

        # Same table as spelling out every key: -1..-8 -> 0, 0 -> 1,
        # 1..7 -> 2 (insertion order is kept as well).
        expected = {}
        for negative in range(-1, -9, -1):
            expected[negative] = 0
        expected[0] = 1
        for positive in range(1, 8):
            expected[positive] = 2

        actual = {}
        for key in expected:
            actual[key] = converter[key]
        self.assertEqual(expected, actual)

        # None of these keys is expected to be known to the converter.
        for missing in ("$OOV$", "$PADDING$", "habitat"):
            with self.assertRaises(KeyError):
                getitem(converter, missing)
Beispiel #5
0
def _init_word_level_features(docs: Iterable[Document], props: dict,
                              morph_features: List[str]):
    """Build descriptors for the word-level features enabled in ``props``.

    A feature is enabled when its size entry in ``props`` is present and
    non-negative; a size of ``0`` keeps the converter but omits the
    ``'embedding_size'`` key.  Size keys and the features they control:

    * ``pos_emb_size`` -> ``pos``
    * ``borders_size`` -> ``borders``
    * ``dt_label_emb_size`` -> ``dt_labels``
    * ``dt_distance_emb_size`` -> ``dt_head_distances``
      (reads ``props["max_dt_distance"]``)
    * ``dt_depth_emb_size`` -> ``dt_depths`` (reads ``props["max_dt_depth"]``)
    * ``dt_deltas_<direction>_emb_size`` -> ``dt_deltas_<direction>``, only
      considered when ``max_dt_delta`` is present and truthy
    * ``dt_breakups_<direction>_emb_size`` -> ``dt_breakups_<direction>``
    * ``morph_feats_emb_size`` -> one feature per entry of ``morph_features``
      for which ``collect_feature_labels`` returns a non-empty result

    NOTE(review): ``docs`` is handed to ``collect_feature_labels`` several
    times; presumably it has to be re-iterable (not a one-shot generator) --
    confirm against the callers.

    Returns:
        dict mapping feature name to ``{'converter': ...}`` with an optional
        ``'embedding_size'`` key.
    """
    result = {}

    def register(name, emb_size, converter):
        # A zero size keeps the feature but leaves out 'embedding_size'.
        descriptor = {'converter': converter}
        if emb_size != 0:
            descriptor['embedding_size'] = emb_size
        result[name] = descriptor

    pos_size = props.get("pos_emb_size", -1)
    if pos_size >= 0:
        pos_labels = collect_feature_labels(docs, "pos")
        register('pos', pos_size,
                 create_categorical_converter(pos_labels, has_oov=True))

    borders_size = props.get("borders_size", -1)
    if borders_size >= 0:
        register('borders', borders_size,
                 create_categorical_converter({'start', 'in', 'end'}))

    dt_label_size = props.get("dt_label_emb_size", -1)
    if dt_label_size >= 0:
        dt_labels = collect_feature_labels(docs, "dt_labels")
        register('dt_labels', dt_label_size,
                 create_categorical_converter(dt_labels, has_oov=True))

    dt_distance_size = props.get("dt_distance_emb_size", -1)
    if dt_distance_size >= 0:
        register('dt_head_distances', dt_distance_size,
                 create_signed_integers_converter(props["max_dt_distance"]))

    dt_depth_size = props.get("dt_depth_emb_size", -1)
    if dt_depth_size >= 0:
        register('dt_depths', dt_depth_size,
                 create_unsigned_integers_converter(props["max_dt_depth"]))

    # The two direction loops stay separate so that all dt_deltas_* entries
    # are inserted before any dt_breakups_* entry.
    delta_limit = props.get("max_dt_delta", 0)
    if delta_limit:
        for direction in (Direction.FORWARD, Direction.BACKWARD):
            name = "dt_deltas_" + direction.value
            size = props.get(name + "_emb_size", -1)
            if size >= 0:
                register(name, size,
                         create_signed_integers_converter(
                             delta_limit, additional_labels={"$START$"}))

    for direction in (Direction.FORWARD, Direction.BACKWARD):
        name = "dt_breakups_" + direction.value
        size = props.get(name + "_emb_size", -1)
        if size >= 0:
            register(name, size,
                     create_categorical_converter(
                         collect_feature_labels(docs, name)))

    morph_size = props.get('morph_feats_emb_size', -1)
    if morph_size >= 0:
        for feat in morph_features:
            feat_labels = collect_feature_labels(docs, feat)
            if feat_labels:
                register(feat, morph_size,
                         create_categorical_converter(feat_labels,
                                                      has_oov=True))

    return result