def _get_classifier_features(props, docs):
    """Build the feature configs consumed by the coreference classifier.

    Args:
        props: configuration dict; ``*_size`` keys enable individual features
            (absent or negative means disabled).
        docs: corpus documents used to collect entity-type vocabularies.

    Returns:
        The result of ``duplicate_features_config(dual_features,
        classifier_features)`` — dual features are duplicated per argument.
    """
    classifier_features = {}
    dual_features = {}

    classifier_agreement_size = props.get('classifier_agreement_size', -1)
    if classifier_agreement_size >= 0:
        # The label set is the same for every agreement type, so build the
        # converter once instead of once per loop iteration.
        agreement_converter = create_categorical_converter(
            {"agreement", "disagreement", "unknown"})
        for agreement_type in props.get('agreement_types', []):
            classifier_features.update(
                create_feature(agreement_type + "_agreement", props,
                               agreement_converter, 'classifier_agreement'))

    classifier_features.update(
        create_feature(
            'mention_distance', props,
            create_unsigned_integers_converter(props["max_mention_distance"])))
    classifier_features.update(
        create_feature(
            'mention_interrelation', props,
            create_categorical_converter(
                {"CONTAINS", "CONTAINED", "INTERSECTS", "SEPARATED"})))
    classifier_features.update(
        create_feature(
            'classifier_entity_distance', props,
            create_unsigned_integers_converter(props["max_entity_distance"])))
    classifier_features.update(
        create_feature(
            'entities_token_distance_in_classifier', props,
            create_unsigned_integers_converter(
                props["max_token_entities_distance"])))
    classifier_features.update(
        create_feature(
            'entities_sent_distance_in_classifier', props,
            create_unsigned_integers_converter(
                props["max_sent_entities_distance"])))

    dual_features.update(
        create_feature(
            'entities_types_in_classifier', props,
            create_categorical_converter(collect_entities_types(docs))))
    # Explicit singleton set: union('O') happens to work only because 'O' is
    # a one-character string (strings are iterables of characters).
    dual_features.update(
        create_feature(
            'head_ne_types', props,
            create_categorical_converter(
                collect_entities_types(docs, extras=True).union({'O'}),
                has_oov=True)))

    classifier_features.update(_get_binary_features(props))
    return duplicate_features_config(dual_features, classifier_features)
def test_unsigned_integers_converter_4(self):
    """Padding + extra labels: padding index 0, labels next, then integers;
    values beyond the right border all collapse onto the last index."""
    converter = create_unsigned_integers_converter(
        4, additional_labels={"$SENTEND$", "$HABITAT$"}, zero_padding=True)

    expected = {
        "$PADDING$": 0,
        "$HABITAT$": 1,
        "$SENTEND$": 2,
        0: 3, 1: 4, 2: 5, 3: 6, 4: 7,
        5: 8, 6: 8, 7: 8,
    }
    actual = {key: converter[key] for key in expected}
    self.assertEqual(expected, actual)

    # Negative integers and unknown labels are rejected outright.
    for bad_key in ["$OOV$", -1, -2, -3, -1000, "habitat"]:
        self.assertRaises(KeyError, getitem, converter, bad_key)
def _get_entities_encoder_features(props, entities_types):
    """Feature configs for the entities encoder: entity types (with an OOV
    slot represented by ``None``) and entity nesting depths.

    A ``*_emb_size`` of 0 enables the feature without a trainable embedding;
    a positive value also sets ``embedding_size``; negative/absent disables.
    """
    oov_type = None
    all_types = entities_types.union({oov_type})

    features = {}

    types_emb_size = props.get("entities_types_emb_size", -1)
    if types_emb_size >= 0:
        feature = {
            'converter': create_categorical_converter(
                all_types, has_oov=True, oov_object=oov_type)
        }
        if types_emb_size != 0:
            feature['embedding_size'] = types_emb_size
        features['entities_types'] = feature

    depth_emb_size = props.get("entities_depth_emb_size", -1)
    if depth_emb_size >= 0:
        feature = {
            'converter':
                create_unsigned_integers_converter(props["max_entities_depth"])
        }
        if depth_emb_size != 0:
            feature['embedding_size'] = depth_emb_size
        features['entities_depths'] = feature

    return features
def test_unsigned_integers_converter_3(self):
    """Padding enabled, no extra labels: padding takes index 0 and the
    integers 0..border follow; out-of-range values share the last index."""
    converter = create_unsigned_integers_converter(
        4, additional_labels=set(), zero_padding=True)

    expected = {
        "$PADDING$": 0,
        0: 1, 1: 2, 2: 3, 3: 4, 4: 5,
        5: 6, 6: 6, 7: 6,
    }
    actual = {key: converter[key] for key in expected}
    self.assertEqual(expected, actual)

    # Anything negative or non-integer (other than padding) must raise.
    for bad_key in ["$OOV$", -1, -2, -3, -1000, "habitat"]:
        self.assertRaises(KeyError, getitem, converter, bad_key)
def _get_features(props):
    """Build positional feature configs driven by ``*_size`` props.

    Each feature is enabled when its ``<name>_size`` prop is >= 0; a positive
    size additionally sets ``embedding_size``. Converter construction is kept
    lazy so related props (e.g. ``max_word_distance``) are only read when the
    feature is actually enabled — matching the original control flow.
    """
    features = {}

    def _maybe_add(name, size_key, make_converter):
        # Shared pattern: size gate, converter, optional embedding size.
        size = props.get(size_key, -1)
        if size < 0:
            return
        feature = {"converter": make_converter()}
        if size > 0:
            feature["embedding_size"] = size
        features[name] = feature

    _maybe_add(
        "token_position", "token_position_size",
        lambda: create_signed_integers_converter(props["max_word_distance"]))
    _maybe_add(
        "token_log_position", "token_log_position_size",
        lambda: create_signed_log_integers_converter(
            props["max_word_distance"]))
    _maybe_add(
        "sent_position", "sent_position_size",
        lambda: create_signed_integers_converter(props["max_sent_distance"]))
    _maybe_add(
        "at_root_dt_path", "at_root_dt_path_size",
        lambda: create_categorical_converter({False, True}))
    _maybe_add(
        "root_dt_path_position", "root_dt_path_position_size",
        lambda: create_unsigned_integers_converter(
            props["max_dt_depth"], additional_labels={False}))

    return features
def test_unsigned_integers_converter_2(self):
    """Degenerate border of 0 with no padding: 0 maps to index 0 and every
    larger value shares index 1; padding/OOV/negatives all raise."""
    converter = create_unsigned_integers_converter(
        0, additional_labels=set(), zero_padding=False)

    expected = {0: 0, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1}
    actual = {key: converter[key] for key in expected}
    self.assertEqual(expected, actual)

    # Without zero_padding even "$PADDING$" is an unknown key.
    for bad_key in ["$OOV$", "$PADDING$", -1, -2, -3, -1000, "habitat"]:
        self.assertRaises(KeyError, getitem, converter, bad_key)
def _get_arc_level_features(props, name_postfix="classifier"):
    """Arc-level feature configs — currently only the token distance along
    the arc, gated by ``arc_token_distance_in_<postfix>_size``."""
    dual_features = {}
    single_features = {}

    feature_name = "arc_token_distance_in_{}".format(name_postfix)
    emb_size = props.get(feature_name + "_size", -1)
    if emb_size >= 0:
        feature = {
            "converter": create_unsigned_integers_converter(
                props["max_arc_token_distance"])
        }
        # Size 0 keeps the feature without a trainable embedding.
        if emb_size != 0:
            feature["embedding_size"] = emb_size
        single_features[feature_name] = feature

    return duplicate_features_config(dual_features, single_features)
def _get_relation_level_features(props, rel_arg_types, entities_types,
                                 name_postfix="classifier"):
    """Build relation-level feature configs for the ``name_postfix`` model.

    Args:
        props: configuration dict; ``<feature>_size`` keys gate each feature.
        rel_arg_types: label set for the relation-argument feature.
        entities_types: label set for the (dual) entity-type feature.
        name_postfix: suffix embedded in every feature/prop name.

    Returns:
        ``duplicate_features_config(dual_features, single_features)``.
    """
    dual_features = {}
    single_features = {}

    def _add(target, feature_name, make_converter):
        # Shared registration pattern: a feature exists when its "_size"
        # prop is >= 0; a non-zero size also sets "embedding_size".
        # (Under the >= 0 gate, "!= 0" and "> 0" are equivalent.)
        size = props.get(feature_name + "_size", -1)
        if size < 0:
            return
        target[feature_name] = {"converter": make_converter()}
        if size != 0:
            target[feature_name]["embedding_size"] = size

    # Lambdas keep props lookups lazy: e.g. "max_token_entities_distance"
    # is only read when the corresponding feature is enabled.
    _add(single_features, "rel_args_in_{}".format(name_postfix),
         lambda: create_categorical_converter(rel_arg_types))
    _add(dual_features, "entities_types_in_{}".format(name_postfix),
         lambda: create_categorical_converter(entities_types))
    _add(single_features,
         "entities_token_distance_in_{}".format(name_postfix),
         lambda: create_unsigned_integers_converter(
             props["max_token_entities_distance"]))
    _add(single_features,
         "entities_token_log_distance_in_{}".format(name_postfix),
         lambda: create_unsigned_log_integers_converter(
             props["max_token_entities_distance"]))
    _add(single_features,
         "entities_sent_distance_in_{}".format(name_postfix),
         lambda: create_unsigned_integers_converter(
             props["max_sent_entities_distance"]))
    _add(single_features, "rel_dir_in_{}".format(name_postfix),
         lambda: create_categorical_converter(
             {"e1_e2", "e2_e1", "e1_in_e2", "e2_in_e1"}))

    return duplicate_features_config(dual_features, single_features)
def _init_word_level_features(docs: Iterable[Document], props: dict,
                              morph_features: List[str]) -> dict:
    """Build word-level feature configs (POS, borders, dependency-tree and
    morphological features) from the corpus and configuration.

    Args:
        docs: corpus documents used to collect label vocabularies.
        props: configuration dict; ``*_emb_size`` keys gate each feature
            (absent/negative disables; 0 enables without an embedding).
        morph_features: names of morphological features to consider.

    Returns:
        Mapping of feature name -> {"converter": ..., ["embedding_size": ...]}.
    """
    features = {}

    def _add(name, size_key, make_converter):
        # Shared gate: feature enabled when its emb-size prop is >= 0;
        # the converter factory runs only when enabled (lazy props/docs use).
        emb_size = props.get(size_key, -1)
        if emb_size < 0:
            return
        features[name] = {"converter": make_converter()}
        if emb_size != 0:
            features[name]["embedding_size"] = emb_size

    _add("pos", "pos_emb_size",
         lambda: create_categorical_converter(
             collect_feature_labels(docs, "pos"), has_oov=True))
    _add("borders", "borders_size",
         lambda: create_categorical_converter({'start', 'in', 'end'}))
    _add("dt_labels", "dt_label_emb_size",
         lambda: create_categorical_converter(
             collect_feature_labels(docs, "dt_labels"), has_oov=True))
    _add("dt_head_distances", "dt_distance_emb_size",
         lambda: create_signed_integers_converter(props["max_dt_distance"]))
    _add("dt_depths", "dt_depth_emb_size",
         lambda: create_unsigned_integers_converter(props["max_dt_depth"]))

    max_dt_delta = props.get("max_dt_delta", 0)
    if max_dt_delta:
        # Directional delta features share one converter configuration.
        for direction in [Direction.FORWARD, Direction.BACKWARD]:
            _add("dt_deltas_" + direction.value,
                 "dt_deltas_" + direction.value + "_emb_size",
                 lambda: create_signed_integers_converter(
                     max_dt_delta, additional_labels={"$START$"}))

    for direction in [Direction.FORWARD, Direction.BACKWARD]:
        key = "dt_breakups_" + direction.value
        # Bind `key` as a default to avoid the late-binding closure pitfall.
        _add(key, key + "_emb_size",
             lambda k=key: create_categorical_converter(
                 collect_feature_labels(docs, k)))

    if props.get('morph_feats_emb_size', -1) >= 0:
        for feat in morph_features:
            feat_types = collect_feature_labels(docs, feat)
            if not feat_types:
                # Feature never occurs in the corpus — skip it entirely.
                continue
            _add(feat, "morph_feats_emb_size",
                 lambda ft=feat_types: create_categorical_converter(
                     ft, has_oov=True))

    return features