def default_features(cls) -> feature.Features:
    """Build the default feature schema: raw text, its edits, and text length.

    Returns:
        A ``feature.Features`` holding the string ``text``, an ``edits`` dict
        of index/correction sequences, and the bucketed float ``text_length``.
    """

    def _int_sequence():
        # Fresh Sequence/Value objects on every call so nothing is shared
        # between "start_idx" and "end_idx".
        return feature.Sequence(feature=feature.Value("int32"))

    edits = feature.Dict(
        feature={
            "start_idx": _int_sequence(),
            "end_idx": _int_sequence(),
            # Nested sequence: a sequence of string sequences.
            "corrections": feature.Sequence(
                feature=feature.Sequence(feature=feature.Value("string"))
            ),
        }
    )
    text_length = feature.Value(
        dtype="float",
        description="length of the text",
        is_bucket=True,
        bucket_info=feature.BucketInfo(
            method="bucket_attribute_specified_bucket_value",
            number=4,
            setting=(),
        ),
    )
    schema = {
        "text": feature.Value("string"),
        "edits": edits,
        "text_length": text_length,
    }
    return feature.Features(schema)
def _customize_features(self, metadata: dict) -> Optional[Features]:
    """Declare the customized features for this processor.

    Starts from a deep copy of ``self._default_features`` (so the defaults
    are never mutated) and adds one bucketed, custom ``feature.Value`` per
    entry of ``metadata``.

    Args:
        metadata: the metadata information of system output. When not None,
            it is iterated as a mapping from feature name to a config mapping
            that is read for the keys ``"dtype"`` (``"string"`` or
            ``"float"``), ``"description"`` and ``"num_buckets"``.

    Returns:
        The copied default features, extended with the user-defined ones.

    Raises:
        NotImplementedError: if a feature's ``"dtype"`` is neither
            ``"string"`` nor ``"float"``.
        KeyError: if a feature config lacks one of the keys listed above.
    """
    features = copy.deepcopy(self._default_features)
    if metadata is None:
        return features

    # add user-defined features into features list
    for feature_name, feature_config in metadata.items():
        dtype = feature_config["dtype"]

        # Only the bucketing method and its ``setting`` depend on the dtype;
        # everything else about the declared feature is shared.
        if dtype == "string":
            value_dtype = "string"
            bucket_method = "bucket_attribute_discrete_value"
            bucket_setting = 1
        elif dtype == "float":
            value_dtype = "float"
            bucket_method = "bucket_attribute_specified_bucket_value"
            bucket_setting = ()
        else:
            # Name the offending feature and dtype so the failure can be
            # diagnosed from the traceback alone.
            raise NotImplementedError(
                "unsupported dtype {!r} for custom feature {!r}; "
                "expected 'string' or 'float'".format(dtype, feature_name)
            )

        features[feature_name] = feature.Value(
            dtype=value_dtype,
            description=feature_config["description"],
            is_bucket=True,
            is_custom=True,
            bucket_info=feature.BucketInfo(
                method=bucket_method,
                number=feature_config["num_buckets"],
                setting=bucket_setting,
            ),
        )

    return features
def default_features(cls) -> feature.Features:
    """Extend the parent's default features with ``attr_compression``.

    Returns:
        The ``feature.Features`` obtained from ``super().default_features()``,
        updated with the bucketed float feature ``attr_compression``.
    """
    inherited = super().default_features()

    # Task-specific feature declared on top of the inherited ones.
    compression = feature.Value(
        dtype="float",
        description="the ratio between source and reference length",
        is_bucket=True,
        bucket_info=feature.BucketInfo(
            method="bucket_attribute_specified_bucket_value",
            number=4,
            setting=(),
        ),
    )
    task_specific = feature.Features({"attr_compression": compression})

    inherited.update(task_specific)
    return inherited
def default_features(cls) -> feature.Features:
    """Build the default feature schema for head/link/tail prediction outputs.

    Returns:
        A ``feature.Features`` with the plain string fields, the string
        sequence ``predictions``, five bucketed float features (three of them
        flagged ``require_training_set=True``) and two bucketed string
        features.
    """

    def _plain_string():
        return feature.Value("string")

    def _bucketed_float(description, **extra):
        # ``extra`` forwards optional flags such as require_training_set
        # without passing them when the feature does not set them.
        return feature.Value(
            dtype="float",
            description=description,
            is_bucket=True,
            bucket_info=feature.BucketInfo(
                method="bucket_attribute_specified_bucket_value",
                number=4,
                setting=(),
            ),
            **extra,
        )

    def _bucketed_string(description, number):
        return feature.Value(
            dtype="string",
            description=description,
            is_bucket=True,
            bucket_info=feature.BucketInfo(
                method="bucket_attribute_discrete_value",
                number=number,
                setting=1,
            ),
        )

    schema = {
        "true_head": _plain_string(),
        "true_head_decipher": _plain_string(),
        "true_link": _plain_string(),
        "true_tail": _plain_string(),
        "true_tail_decipher": _plain_string(),
        "predict": _plain_string(),
        "true_label": _plain_string(),
        "predictions": feature.Sequence(feature=feature.Value("string")),
        "tail_entity_length": _bucketed_float(
            "number of words in the tail entity"
        ),
        "head_entity_length": _bucketed_float(
            "number of words in the head entity"
        ),
        "tail_fre": _bucketed_float(
            "the frequency of tail entity in the training set",
            require_training_set=True,
        ),
        "link_fre": _bucketed_float(
            "the frequency of link relation in the training set",
            require_training_set=True,
        ),
        # NOTE(review): "head relation" may be meant as "head entity"; the
        # text is kept unchanged because it is emitted at runtime.
        "head_fre": _bucketed_float(
            "the frequency of head relation in the training set",
            require_training_set=True,
        ),
        "symmetry": _bucketed_string(
            "boolean feature: 'symmetric' or 'asymmetric'; more "
            "granularity to be added",
            2,
        ),
        "entity_type_level": _bucketed_string(
            "most specific (highest) entity type level of true tail entity",
            8,
        ),
    }
    return feature.Features(schema)
def test_get_bucket_features(self):
    """Check that ``get_bucket_features`` reports every bucketed feature.

    The schema mixes top-level bucketed features with bucketed features nested
    inside the ``true_entity_info`` sequence; the names of both groups are
    expected in the result.
    """

    def _string_sequence():
        return feature.Sequence(feature=feature.Value("string"))

    def _bucketed_float(description, **extra):
        return feature.Value(
            dtype="float",
            description=description,
            is_bucket=True,
            bucket_info=feature.BucketInfo(
                method="bucket_attribute_specified_bucket_value",
                number=4,
                setting=(),
            ),
            **extra,
        )

    def _bucketed_string(description):
        return feature.Value(
            dtype="string",
            description=description,
            is_bucket=True,
            bucket_info=feature.BucketInfo(
                method="bucket_attribute_discrete_value",
                number=4,
                setting=1,
            ),
        )

    # --- features of each entity ---
    entity_features = {
        "span_text": feature.Value("string"),
        "span_tokens": _bucketed_float("entity length"),
        "span_pos": feature.Position(positions=[0, 0]),
        "span_tag": _bucketed_string("entity tag"),
        "span_capitalness": _bucketed_string(
            "The capitalness of an entity. For example, "
            "first_caps represents only the first character of "
            "the entity is capital. full_caps denotes all "
            "characters of the entity are capital"
        ),
        "span_rel_pos": _bucketed_float(
            "The relative position of an entity in a sentence"
        ),
        "span_chars": _bucketed_float("The number of characters of an entity"),
        "span_econ": _bucketed_float(
            "entity label consistency", require_training_set=True
        ),
        "span_efre": _bucketed_float(
            "entity frequency", require_training_set=True
        ),
    }

    ner_task_features = feature.Features(
        {
            "tokens": _string_sequence(),
            "true_tags": _string_sequence(),
            "pred_tags": _string_sequence(),
            # --- features of the sentences ---
            "sentence_length": _bucketed_float("sentence length"),
            # The description (including its trailing space) is reproduced
            # exactly as originally written.
            "entity_density": _bucketed_float(
                "the ration between all entity " "tokens and sentence tokens "
            ),
            "num_oov": _bucketed_float(
                "the number of out-of-vocabulary words",
                require_training_set=True,
            ),
            "fre_rank": _bucketed_float(
                "the average rank of each word based on its frequency in "
                "training set",
                require_training_set=True,
            ),
            "true_entity_info": feature.Sequence(
                feature=feature.Dict(feature=entity_features)
            ),
        }
    )

    expected_names = {
        'sentence_length',
        'entity_density',
        'num_oov',
        'fre_rank',
        'span_tokens',
        'span_tag',
        'span_capitalness',
        'span_rel_pos',
        'span_chars',
        'span_econ',
        'span_efre',
    }
    bucket_features = ner_task_features.get_bucket_features()
    self.assertEqual(set(bucket_features), expected_names)
def default_features(cls) -> feature.Features:
    """Extend the parent's default features with six bucketed float features.

    Returns:
        The ``feature.Features`` obtained from ``super().default_features()``,
        updated with the ``attr_*`` and ``oracle_*`` features declared below.
    """
    inherited = super().default_features()

    # (feature name, description) pairs, in declaration order.
    float_features = (
        ("attr_compression", "compression"),
        ("attr_copy_len", "copy length"),
        ("attr_coverage", "coverage"),
        ("attr_novelty", "novelty"),
        ("oracle_score", "the sample-level oracle score"),
        ("oracle_position", "the sample-level oracle position"),
    )

    additions = {
        name: feature.Value(
            dtype="float",
            description=description,
            is_bucket=True,
            bucket_info=feature.BucketInfo(
                method="bucket_attribute_specified_bucket_value",
                number=4,
                setting=(),
            ),
        )
        for name, description in float_features
    }

    inherited.update(feature.Features(additions))
    return inherited
def default_features(cls) -> feature.Features:
    """Build the default feature schema for context/question/options outputs.

    Returns:
        A ``feature.Features`` with the string ``context`` and ``question``,
        the string sequence ``options``, the ``answers`` sequence of
        text/option-index dicts, and five bucketed float features (two of
        them flagged ``require_training_set=True``).
    """

    def _bucketed_float(description, **extra):
        # ``extra`` forwards require_training_set only where it is set.
        return feature.Value(
            dtype="float",
            description=description,
            is_bucket=True,
            bucket_info=feature.BucketInfo(
                method="bucket_attribute_specified_bucket_value",
                number=4,
                setting=(),
            ),
            **extra,
        )

    answer_entry = feature.Dict(
        feature={
            "text": feature.Value("string"),
            "option_index": feature.Value("int32"),
        }
    )

    schema = {
        "context": feature.Value("string"),
        "question": feature.Value("string"),
        "options": feature.Sequence(feature=feature.Value("string")),
        "answers": feature.Sequence(feature=answer_entry),
        "context_length": _bucketed_float("the length of context"),
        "question_length": _bucketed_float("the length of question"),
        "answer_length": _bucketed_float("the length of answer"),
        "num_oov": _bucketed_float(
            "the number of out-of-vocabulary words",
            require_training_set=True,
        ),
        "fre_rank": _bucketed_float(
            "the average rank of each word based on its frequency in "
            "training set",
            require_training_set=True,
        ),
    }
    return feature.Features(schema)
def default_features(cls) -> feature.Features:
    """Build the default feature schema for aspect/text/label outputs.

    Returns:
        A ``feature.Features`` with four plain string fields, the bucketed
        string feature ``label`` and five bucketed float features.
    """

    def _bucketed_float(description):
        return feature.Value(
            dtype="float",
            description=description,
            is_bucket=True,
            bucket_info=feature.BucketInfo(
                method="bucket_attribute_specified_bucket_value",
                number=4,
                setting=(),
            ),
        )

    schema = {
        "aspect": feature.Value("string"),
        "text": feature.Value("string"),
        "true_label": feature.Value("string"),
        "predicted_label": feature.Value("string"),
        "label": feature.Value(
            dtype="string",
            description="category",
            is_bucket=True,
            bucket_info=feature.BucketInfo(
                method="bucket_attribute_discrete_value",
                number=4,
                setting=1,
            ),
        ),
        "sentence_length": _bucketed_float("sentence length"),
        # NOTE(review): the description says "chars" while the key says
        # "token"; kept unchanged because it is emitted at runtime.
        "token_number": _bucketed_float("the number of chars"),
        "entity_number": _bucketed_float("entity numbers"),
        "aspect_length": _bucketed_float("aspect length"),
        "aspect_index": _bucketed_float("aspect position"),
    }
    return feature.Features(schema)
def default_features(cls) -> feature.Features:
    """Build the default feature schema for source/reference/hypothesis outputs.

    Returns:
        A ``feature.Features`` with three plain string fields, seven bucketed
        float sentence-level features, and ``ref_tok_info``, a sequence of
        per-token feature dicts.
    """

    def _bucketed_float(description, **extra):
        # ``extra`` forwards require_training_set exactly as declared per
        # feature (True, False, or not passed at all).
        return feature.Value(
            dtype="float",
            description=description,
            is_bucket=True,
            bucket_info=feature.BucketInfo(
                method="bucket_attribute_specified_bucket_value",
                number=4,
                setting=(),
            ),
            **extra,
        )

    frequency_rank_description = (
        "average training-set frequency rank of words in sentence"
    )

    # --- features of each token ---
    token_features = {
        "tok_text": feature.Value("string"),
        "tok_pos": feature.Position(positions=[0, 0]),
        # Declared as "float" although the value is conceptually an int,
        # because int is not supported.
        "tok_matched": feature.Value(
            dtype="float",
            description=(
                "which token the ref/hyp token matches in the "
                "hyp/ref sentence, or -1 if none"
            ),
            is_bucket=False,
        ),
        "tok_capitalness": feature.Value(
            dtype="string",
            description="capitalness of token",
            is_bucket=True,
            bucket_info=feature.BucketInfo(
                method="bucket_attribute_discrete_value",
                number=4,
                setting=1,
            ),
        ),
        "tok_position": _bucketed_float(
            "relative position of token in sentence"
        ),
        "tok_chars": _bucketed_float("number of characters in the token"),
        "tok_test_freq": _bucketed_float(
            "tok frequency in the test set", require_training_set=False
        ),
        "tok_train_freq": _bucketed_float(
            "tok frequency in the training set", require_training_set=True
        ),
    }

    schema = {
        "source": feature.Value("string"),
        "reference": feature.Value("string"),
        "hypothesis": feature.Value("string"),
        "source_length": _bucketed_float("length of the source"),
        "reference_length": _bucketed_float("length of the reference"),
        "hypothesis_length": _bucketed_float("length of the hypothesis"),
        "src_num_oov": _bucketed_float(
            "OOV words in the source", require_training_set=True
        ),
        "src_fre_rank": _bucketed_float(
            frequency_rank_description, require_training_set=True
        ),
        "ref_num_oov": _bucketed_float(
            "number of OOV words in reference", require_training_set=True
        ),
        "ref_fre_rank": _bucketed_float(
            frequency_rank_description, require_training_set=True
        ),
        "ref_tok_info": feature.Sequence(
            feature=feature.Dict(feature=token_features)
        ),
    }
    return feature.Features(schema)
def default_features(cls) -> feature.Features:
    """Build the default feature schema for context/hint/answers outputs.

    Returns:
        A ``feature.Features`` with four plain string fields and six bucketed
        float features (two of them flagged ``require_training_set=True``).
    """

    def _bucketed_float(description, **extra):
        # ``extra`` forwards require_training_set only where it is set.
        return feature.Value(
            dtype="float",
            description=description,
            is_bucket=True,
            bucket_info=feature.BucketInfo(
                method="bucket_attribute_specified_bucket_value",
                number=4,
                setting=(),
            ),
            **extra,
        )

    schema = {
        "context": feature.Value("string"),
        "question_mark": feature.Value("string"),
        "hint": feature.Value("string"),
        "answers": feature.Value("string"),
        "context_length": _bucketed_float("the length of context"),
        "relative_blank_position": _bucketed_float(
            "the relative position of blank (question mark)"
            " in the whole context"
        ),
        "absolute_blank_position": _bucketed_float(
            "the absolute position of blank (question mark)"
            " in the whole context"
        ),
        "answer_length": _bucketed_float("the length of answer"),
        "num_oov": _bucketed_float(
            "the number of out-of-vocabulary words",
            require_training_set=True,
        ),
        "fre_rank": _bucketed_float(
            "the average rank of each word based on its frequency in "
            "training set",
            require_training_set=True,
        ),
    }
    return feature.Features(schema)
def default_features(cls) -> feature.Features:
    """Build the default feature schema for text/log-probability outputs.

    Returns:
        A ``feature.Features`` with the string ``text`` and ``log_probs``,
        five bucketed float text-level features, and ``tok_info``, a sequence
        of per-token feature dicts.
    """

    def _bucketed_float(description, **extra):
        # ``extra`` forwards require_training_set exactly as declared per
        # feature (True, False, or not passed at all).
        return feature.Value(
            dtype="float",
            description=description,
            is_bucket=True,
            bucket_info=feature.BucketInfo(
                method="bucket_attribute_specified_bucket_value",
                number=4,
                setting=(),
            ),
            **extra,
        )

    # --- features of each token ---
    token_features = {
        "tok_text": feature.Value("string"),
        "tok_pos": feature.Position(positions=[0, 0]),
        "tok_log_prob": feature.Value(
            dtype="float",
            description="log probability of the token according to the LM",
            is_bucket=False,
        ),
        "tok_capitalness": feature.Value(
            dtype="string",
            description=(
                "The capitalness of an token. For example, "
                "first_caps represents only the first character of "
                "the token is capital. full_caps denotes all "
                "characters of the token are capital"
            ),
            is_bucket=True,
            bucket_info=feature.BucketInfo(
                method="bucket_attribute_discrete_value",
                number=4,
                setting=1,
            ),
        ),
        "tok_position": _bucketed_float(
            "The relative position of a token in a sentence"
        ),
        "tok_chars": _bucketed_float("The number of characters in a token"),
        "tok_test_freq": _bucketed_float(
            "tok frequency in the test set", require_training_set=False
        ),
        "tok_train_freq": _bucketed_float(
            "tok frequency in the training set", require_training_set=True
        ),
    }

    schema = {
        "text": feature.Value("string"),
        "log_probs": feature.Value("string"),
        "text_length": _bucketed_float("text length in tokens"),
        "text_chars": _bucketed_float("text length in characters"),
        "num_oov": _bucketed_float(
            "the number of out-of-vocabulary words",
            require_training_set=True,
        ),
        # NOTE(review): "work" looks like a typo for "word"; kept unchanged
        # because the description is emitted at runtime.
        "fre_rank": _bucketed_float(
            "the average rank of each work based on its frequency in "
            "training set",
            require_training_set=True,
        ),
        "length_fre": _bucketed_float(
            "the frequency of text length in training set",
            require_training_set=True,
        ),
        "tok_info": feature.Sequence(
            feature=feature.Dict(feature=token_features)
        ),
    }
    return feature.Features(schema)
def default_features(cls) -> feature.Features:
    """Build the default feature schema for text/label outputs.

    Returns:
        A ``feature.Features`` with three plain string fields, the bucketed
        string feature ``label`` and seven bucketed float features (three of
        them flagged ``require_training_set=True``).
    """

    def _bucketed_float(description, **extra):
        # ``extra`` forwards require_training_set only where it is set.
        return feature.Value(
            dtype="float",
            description=description,
            is_bucket=True,
            bucket_info=feature.BucketInfo(
                method="bucket_attribute_specified_bucket_value",
                number=4,
                setting=(),
            ),
            **extra,
        )

    category = feature.Value(
        dtype="string",
        description="category",
        is_bucket=True,
        bucket_info=feature.BucketInfo(
            method="bucket_attribute_discrete_value",
            number=4,
            setting=1,
        ),
    )

    schema = {
        "text": feature.Value("string"),
        "true_label": feature.Value("string"),
        "predicted_label": feature.Value("string"),
        "label": category,
        "text_length": _bucketed_float("text length in tokens"),
        "text_chars": _bucketed_float("text length in characters"),
        "basic_words": _bucketed_float("the ratio of basic words"),
        "lexical_richness": _bucketed_float("lexical diversity"),
        "num_oov": _bucketed_float(
            "the number of out-of-vocabulary words",
            require_training_set=True,
        ),
        "fre_rank": _bucketed_float(
            "the average rank of each word based on its frequency in "
            "training set",
            require_training_set=True,
        ),
        "length_fre": _bucketed_float(
            "the frequency of text length in training set",
            require_training_set=True,
        ),
    }
    return feature.Features(schema)