Exemple #1
0
 def default_features(cls) -> feature.Features:
     """Return the default feature schema for this task.

     The schema has three top-level entries: the ``text`` string, an
     ``edits`` dictionary (``start_idx``/``end_idx`` as int32 sequences and
     ``corrections`` as a sequence of string sequences), and ``text_length``,
     a float feature marked with ``is_bucket=True``.
     """
     # Both index fields have the same shape; a fresh object is built for each.
     edit_fields = {
         key: feature.Sequence(feature=feature.Value("int32"))
         for key in ("start_idx", "end_idx")
     }
     edit_fields["corrections"] = feature.Sequence(
         feature=feature.Sequence(feature=feature.Value("string"))
     )

     length_buckets = feature.BucketInfo(
         method="bucket_attribute_specified_bucket_value",
         number=4,
         setting=(),
     )

     schema = {
         "text": feature.Value("string"),
         "edits": feature.Dict(feature=edit_fields),
         "text_length": feature.Value(
             dtype="float",
             description="length of the text",
             is_bucket=True,
             bucket_info=length_buckets,
         ),
     }
     return feature.Features(schema)
 def default_features(cls) -> feature.Features:
     """Return the default feature schema for this task.

     The schema consists of seven plain string fields (``true_head`` through
     ``true_label``), a ``predictions`` string sequence, five float features
     flagged with ``is_bucket=True`` (three of which also set
     ``require_training_set=True``) and two string features bucketed with
     the ``bucket_attribute_discrete_value`` method.
     """

     def _string() -> feature.Value:
         """Build a plain string-valued feature."""
         return feature.Value("string")

     def _bucketed_float(description: str, **extra) -> feature.Value:
         """Build a float feature bucketed by specified bucket values.

         ``extra`` is forwarded verbatim so that optional arguments such as
         ``require_training_set`` are only passed when explicitly requested.
         """
         return feature.Value(
             dtype="float",
             description=description,
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_specified_bucket_value",
                 number=4,
                 setting=(),
             ),
             **extra,
         )

     def _bucketed_discrete(description: str, number: int) -> feature.Value:
         """Build a string feature bucketed by its discrete values."""
         return feature.Value(
             dtype="string",
             description=description,
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_discrete_value",
                 number=number,
                 setting=1,
             ),
         )

     return feature.Features({
         "true_head": _string(),
         "true_head_decipher": _string(),
         "true_link": _string(),
         "true_tail": _string(),
         "true_tail_decipher": _string(),
         "predict": _string(),
         "true_label": _string(),
         "predictions": feature.Sequence(feature=_string()),
         "tail_entity_length": _bucketed_float(
             "number of words in the tail entity"
         ),
         "head_entity_length": _bucketed_float(
             "number of words in the head entity"
         ),
         "tail_fre": _bucketed_float(
             "the frequency of tail entity in the training set",
             require_training_set=True,
         ),
         "link_fre": _bucketed_float(
             "the frequency of link relation in the training set",
             require_training_set=True,
         ),
         # The head is an entity (like the tail), not a relation; the
         # description is kept parallel to ``tail_fre``.
         "head_fre": _bucketed_float(
             "the frequency of head entity in the training set",
             require_training_set=True,
         ),
         "symmetry": _bucketed_discrete(
             "boolean feature: 'symmetric' or 'asymmetric'; more "
             "granularity to be added",
             number=2,
         ),
         "entity_type_level": _bucketed_discrete(
             "most specific (highest) entity type level of true tail entity",
             number=8,
         ),
     })
 def test_get_bucket_features(self):
     """Check that ``get_bucket_features`` reports every bucketed feature.

     A feature schema with sentence-level features and a nested sequence of
     per-entity features is built, and the names returned by
     ``get_bucket_features`` are compared (as a set) with the names of all
     features declared with ``is_bucket=True``, including the nested ones.
     """

     def _string_seq() -> feature.Sequence:
         """Build a sequence-of-strings feature."""
         return feature.Sequence(feature=feature.Value("string"))

     def _bucketed_float(description: str, **extra) -> feature.Value:
         """Build a float feature bucketed by specified bucket values.

         ``extra`` is forwarded verbatim so ``require_training_set`` is only
         passed for the features that explicitly request it.
         """
         return feature.Value(
             dtype="float",
             description=description,
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_specified_bucket_value",
                 number=4,
                 setting=(),
             ),
             **extra,
         )

     def _bucketed_discrete(description: str) -> feature.Value:
         """Build a string feature bucketed by its discrete values."""
         return feature.Value(
             dtype="string",
             description=description,
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_discrete_value",
                 number=4,
                 setting=1,
             ),
         )

     ner_task_features = feature.Features({
         "tokens": _string_seq(),
         "true_tags": _string_seq(),
         "pred_tags": _string_seq(),
         # --- the following are features of the sentences ---
         "sentence_length": _bucketed_float("sentence length"),
         "entity_density": _bucketed_float(
             "the ratio between all entity tokens and sentence tokens"
         ),
         "num_oov": _bucketed_float(
             "the number of out-of-vocabulary words",
             require_training_set=True,
         ),
         "fre_rank": _bucketed_float(
             "the average rank of each word based on its frequency in "
             "training set",
             require_training_set=True,
         ),
         # --- the following are features of each entity ---
         "true_entity_info": feature.Sequence(feature=feature.Dict(
             feature={
                 "span_text": feature.Value("string"),
                 "span_tokens": _bucketed_float("entity length"),
                 "span_pos": feature.Position(positions=[0, 0]),
                 "span_tag": _bucketed_discrete("entity tag"),
                 "span_capitalness": _bucketed_discrete(
                     "The capitalness of an entity. For example, "
                     "first_caps represents only the first character of "
                     "the entity is capital. full_caps denotes all "
                     "characters of the entity are capital"
                 ),
                 "span_rel_pos": _bucketed_float(
                     "The relative position of an entity in a sentence"
                 ),
                 "span_chars": _bucketed_float(
                     "The number of characters of an entity"
                 ),
                 "span_econ": _bucketed_float(
                     "entity label consistency",
                     require_training_set=True,
                 ),
                 "span_efre": _bucketed_float(
                     "entity frequency",
                     require_training_set=True,
                 ),
             })),
     })

     bucket_features = ner_task_features.get_bucket_features()

     # Order is irrelevant; only the collection of names matters.
     expected_names = {
         'sentence_length',
         'entity_density',
         'num_oov',
         'fre_rank',
         'span_tokens',
         'span_tag',
         'span_capitalness',
         'span_rel_pos',
         'span_chars',
         'span_econ',
         'span_efre',
     }
     self.assertEqual(set(bucket_features), expected_names)
 def default_features(cls) -> feature.Features:
     """Return the default feature schema for this task.

     The schema holds the ``context`` and ``question`` strings, the list of
     ``options``, the ``answers`` (each with ``text`` and ``option_index``),
     and five float features flagged with ``is_bucket=True``. ``num_oov``
     and ``fre_rank`` additionally set ``require_training_set=True``.
     """
     schema = {
         "context": feature.Value("string"),
         "question": feature.Value("string"),
         "options": feature.Sequence(feature=feature.Value("string")),
         "answers": feature.Sequence(
             feature=feature.Dict(
                 feature={
                     "text": feature.Value("string"),
                     "option_index": feature.Value("int32"),
                 }
             )
         ),
     }

     # (name, description, extra keyword arguments) for each bucketed float
     # feature, listed in the order they must appear in the schema.
     needs_training = {"require_training_set": True}
     bucketed_specs = (
         ("context_length", "the length of context", {}),
         ("question_length", "the length of question", {}),
         ("answer_length", "the length of answer", {}),
         ("num_oov", "the number of out-of-vocabulary words", needs_training),
         (
             "fre_rank",
             "the average rank of each word based on its frequency in "
             "training set",
             needs_training,
         ),
     )
     for name, description, extra in bucketed_specs:
         schema[name] = feature.Value(
             dtype="float",
             description=description,
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_specified_bucket_value",
                 number=4,
                 setting=(),
             ),
             **extra,
         )

     return feature.Features(schema)
Exemple #5
0
 def default_features(cls) -> feature.Features:
     """Return the default feature schema for this task.

     Top-level entries are the ``source``, ``reference`` and ``hypothesis``
     strings followed by seven float features flagged with
     ``is_bucket=True``. ``ref_tok_info`` is a sequence of dictionaries that
     carries the per-token features.
     """

     def _bucketed_float(description: str, **extra) -> feature.Value:
         """Build a float feature bucketed by specified bucket values.

         ``extra`` is forwarded verbatim, so ``require_training_set`` is
         passed only where a caller supplies it.
         """
         return feature.Value(
             dtype="float",
             description=description,
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_specified_bucket_value",
                 number=4,
                 setting=(),
             ),
             **extra,
         )

     # --- the following are features of each token ---
     token_fields = {
         "tok_text": feature.Value("string"),
         "tok_pos": feature.Position(positions=[0, 0]),
         "tok_matched": feature.Value(
             # this is actually "int" but int is not supported
             dtype="float",
             description=(
                 "which token the ref/hyp token matches in the "
                 "hyp/ref sentence, or -1 if none"
             ),
             is_bucket=False,
         ),
         "tok_capitalness": feature.Value(
             dtype="string",
             description="capitalness of token",
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_discrete_value",
                 number=4,
                 setting=1,
             ),
         ),
         "tok_position": _bucketed_float(
             "relative position of token in sentence"
         ),
         "tok_chars": _bucketed_float("number of characters in the token"),
         "tok_test_freq": _bucketed_float(
             "tok frequency in the test set",
             require_training_set=False,
         ),
         "tok_train_freq": _bucketed_float(
             "tok frequency in the training set",
             require_training_set=True,
         ),
     }

     rank_description = (
         "average training-set frequency rank of words in sentence"
     )
     schema = {
         "source": feature.Value("string"),
         "reference": feature.Value("string"),
         "hypothesis": feature.Value("string"),
         "source_length": _bucketed_float("length of the source"),
         "reference_length": _bucketed_float("length of the reference"),
         "hypothesis_length": _bucketed_float("length of the hypothesis"),
         "src_num_oov": _bucketed_float(
             "OOV words in the source",
             require_training_set=True,
         ),
         "src_fre_rank": _bucketed_float(
             rank_description,
             require_training_set=True,
         ),
         "ref_num_oov": _bucketed_float(
             "number of OOV words in reference",
             require_training_set=True,
         ),
         "ref_fre_rank": _bucketed_float(
             rank_description,
             require_training_set=True,
         ),
         "ref_tok_info": feature.Sequence(
             feature=feature.Dict(feature=token_fields)
         ),
     }
     return feature.Features(schema)
 def default_features(cls) -> feature.Features:
     """Return the default feature schema for this task.

     Top-level entries are the ``text`` and ``log_probs`` strings followed
     by five float features flagged with ``is_bucket=True`` (``num_oov``,
     ``fre_rank`` and ``length_fre`` also set ``require_training_set=True``).
     ``tok_info`` is a sequence of dictionaries that carries the per-token
     features.
     """

     def _bucketed_float(description: str, **extra) -> feature.Value:
         """Build a float feature bucketed by specified bucket values.

         ``extra`` is forwarded verbatim, so ``require_training_set`` is
         passed only where a caller supplies it.
         """
         return feature.Value(
             dtype="float",
             description=description,
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_specified_bucket_value",
                 number=4,
                 setting=(),
             ),
             **extra,
         )

     return feature.Features({
         "text": feature.Value("string"),
         "log_probs": feature.Value("string"),
         "text_length": _bucketed_float("text length in tokens"),
         "text_chars": _bucketed_float("text length in characters"),
         "num_oov": _bucketed_float(
             "the number of out-of-vocabulary words",
             require_training_set=True,
         ),
         "fre_rank": _bucketed_float(
             "the average rank of each word based on its frequency in "
             "training set",
             require_training_set=True,
         ),
         "length_fre": _bucketed_float(
             "the frequency of text length in training set",
             require_training_set=True,
         ),
         # --- the following are features of each token ---
         "tok_info": feature.Sequence(feature=feature.Dict(
             feature={
                 "tok_text": feature.Value("string"),
                 "tok_pos": feature.Position(positions=[0, 0]),
                 "tok_log_prob": feature.Value(
                     dtype="float",
                     description=(
                         "log probability of the token according to the LM"
                     ),
                     is_bucket=False,
                 ),
                 "tok_capitalness": feature.Value(
                     dtype="string",
                     description=(
                         "The capitalness of a token. For example, "
                         "first_caps represents only the first character of "
                         "the token is capital. full_caps denotes all "
                         "characters of the token are capital"
                     ),
                     is_bucket=True,
                     bucket_info=feature.BucketInfo(
                         method="bucket_attribute_discrete_value",
                         number=4,
                         setting=1,
                     ),
                 ),
                 "tok_position": _bucketed_float(
                     "The relative position of a token in a sentence"
                 ),
                 "tok_chars": _bucketed_float(
                     "The number of characters in a token"
                 ),
                 "tok_test_freq": _bucketed_float(
                     "tok frequency in the test set",
                     require_training_set=False,
                 ),
                 "tok_train_freq": _bucketed_float(
                     "tok frequency in the training set",
                     require_training_set=True,
                 ),
             })),
     })