Ejemplo n.º 1
0
def _ConvertContextAndExamplesToElwc(
    context_feature_and_examples: Tuple[bytes, List[tf.train.Example]]
) -> input_pb2.ExampleListWithContext:
  """Builds an ELWC proto from a (serialized context, examples) pair.

  Args:
    context_feature_and_examples: Two-element tuple. The first element is a
      serialized `tf.train.Example` holding the context features, the second
      is the list of example protos that share this context.

  Returns:
    An `ExampleListWithContext` whose `context` is the parsed context proto
    and whose `examples` are the given example protos.
  """
  serialized_context, example_protos = context_feature_and_examples

  # The context arrives as bytes, so it has to be decoded into a proto first.
  parsed_context = tf.train.Example()
  parsed_context.ParseFromString(serialized_context)

  elwc = input_pb2.ExampleListWithContext(
      context=parsed_context, examples=example_protos)
  return elwc
Ejemplo n.º 2
0
 def CombineContextAndExamples(
         self,
         context_feature_and_examples) -> input_pb2.ExampleListWithContext:
     """Combines a context record and its examples into one ELWC proto.

     Args:
       context_feature_and_examples: Two-element pair of (context, examples).
         The context must expose `_asdict()` (presumably a namedtuple --
         confirm against the caller).

     Returns:
       An `ExampleListWithContext` whose context is a `tf.train.Example`
       built from `self.DataToFeatures(...)` and whose examples are the
       given examples.
     """
     context_record, example_list = context_feature_and_examples
     # Turn the record into a plain dict, then into the features message.
     context_features = self.DataToFeatures(context_record._asdict())
     context_proto = tf.train.Example(features=context_features)
     return input_pb2.ExampleListWithContext(
         context=context_proto, examples=example_list)
Ejemplo n.º 3
0
def generate_tf_record(input_path, output_path, feature_names):
    """Convert a whitespace-separated ranking file into a TFRecord of ELWCs.

    Consecutive input lines that share the same query id are grouped into a
    single ``ExampleListWithContext`` (with an empty context); every group is
    written as one record, including the last one in the file.

    Args:
        input_path: Path of the text file to read. Each line is
            ``<label> <qid> <feature value> ...`` separated by whitespace.
        output_path: Path of the TFRecord file to create.
        feature_names: Names paired positionally with the feature tokens of
            each line; surplus names or tokens are dropped by ``zip``.

    Raises:
        AssertionError: If a line has fewer than two tokens.
        ValueError: If the label token is not a valid float.
    """
    def _parse_line(line):
        """Parses a single line in LibSVM format."""
        tokens = line.split()
        assert len(tokens) >= 2, "Ill-formatted line: {}".format(line)
        label = float(tokens[0])
        qid = tokens[1]
        features = dict(zip(feature_names, tokens[2:]))
        return qid, features, label

    def _generate_per_example(features, label):
        """Builds a tf.train.Example with bytes features and a float label."""
        example_feature_dict = {
            k:
            tf.train.Feature(bytes_list=tf.train.BytesList(value=[v.encode()]))
            for k, v in features.items()
        }
        example_feature_dict['label'] = tf.train.Feature(
            float_list=tf.train.FloatList(value=[label]))
        return tf.train.Example(features=tf.train.Features(
            feature=example_feature_dict))

    tf.compat.v1.logging.info("Start to convert {} to {}".format(
        input_path, output_path))
    # Both handles are context-managed so the record file is flushed and
    # closed even when a malformed line aborts the conversion.
    with tf.io.TFRecordWriter(output_path) as writer, \
            open(input_path, "rt") as f:
        qid_mark = None  # query id of the group being accumulated
        elwc = input_pb2.ExampleListWithContext()
        for line in f:
            qid, features, label = _parse_line(line)
            if qid_mark is not None and qid != qid_mark:
                # A new query starts: emit the finished group first.
                writer.write(elwc.SerializeToString())
                elwc = input_pb2.ExampleListWithContext()
            elwc.examples.add().CopyFrom(
                _generate_per_example(features, label))
            qid_mark = qid
        if qid_mark is not None:
            # The final group has no following line to trigger its write.
            writer.write(elwc.SerializeToString())
Ejemplo n.º 4
0
    def _write_train_eval_tfrecord_examples(self):
        """Writes a fixed two-example ELWC to the train and eval TFRecord files.

        The same proto is written 10 times to `self._train_file` and 5 times
        to `self._eval_file`.
        """
        elwc_text = """
        context: {}
        examples: {
          features: {
            feature: {
              key: "relevance"
              value: { int64_list: { value: [ 1 ] } }
            }
            feature: {
              key: "input_ids"
              value: { int64_list: { value: [ 1, 4, 3, 0 ] } }
            }
            feature: {
              key: "input_mask"
              value: { int64_list: { value: [ 1, 1, 1, 0 ] } }
            }
            feature: {
              key: "segment_ids"
              value: { int64_list: { value: [ 0, 0, 0, 1 ] } }
            }
          }
        }
        examples: {
          features: {
            feature: {
              key: "relevance"
              value: { int64_list: { value: [ 0 ] } }
            }
            feature: {
              key: "input_ids"
              value: { int64_list: { value: [ 2, 5, 8, 9 ] } }
            }
            feature: {
              key: "input_mask"
              value: { int64_list: { value: [ 1, 1, 1, 1 ] } }
            }
            feature: {
              key: "segment_ids"
              value: { int64_list: { value: [ 0, 0, 0, 0 ] } }
            }
          }
        }
      """
        elwc_example = text_format.Parse(elwc_text,
                                         input_pb2.ExampleListWithContext())

        def _dump(record_path, copies):
            """Writes `copies` serialized copies of the ELWC to `record_path`."""
            with tf.io.TFRecordWriter(record_path) as writer:
                for _ in range(copies):
                    writer.write(elwc_example.SerializeToString())

        # Training file first, then the smaller evaluation file.
        _dump(self._train_file, 10)
        _dump(self._eval_file, 5)
def read_and_print_tf_record(target_filename, num_of_examples_to_read):
    """Reads ELWC protos from the start of a TFRecord file.

    Despite the name, nothing is printed; the parsed protos are returned.

    Args:
        target_filename: Path of the TFRecord file to read.
        num_of_examples_to_read: Maximum number of records to take.

    Returns:
        A list of parsed `ExampleListWithContext` protos, in file order.
    """
    def _decode(serialized):
        """Parses one serialized record into a fresh ELWC proto."""
        elwc = input_pb2.ExampleListWithContext()
        elwc.ParseFromString(serialized)
        return elwc

    dataset = tf.data.TFRecordDataset([target_filename])
    first_records = dataset.take(num_of_examples_to_read)
    return [_decode(record.numpy()) for record in first_records]
Ejemplo n.º 6
0
    def encode(self, instance):
        """Encodes `instance` as an ELWC proto.

        The context and examples come from `self._encode_context` and
        `self._encode_examples`.

        Returns:
            The serialized bytes when `self._serialized` is truthy, otherwise
            the `ExampleListWithContext` proto itself.
        """
        encoded_context = self._encode_context(instance)
        encoded_examples = self._encode_examples(instance)
        elwc = input_pb2.ExampleListWithContext(context=encoded_context,
                                                examples=encoded_examples)
        return elwc.SerializeToString() if self._serialized else elwc
Ejemplo n.º 7
0
 def _write_train_eval_data(self, data_file):
     """Writes 8 copies of a fixed two-example ELWC proto to `data_file`."""
     elwc_text = """
       context: {}
       examples: {
         features: {
           feature: {
             key: "relevance"
             value: { int64_list: { value: [ 1 ] } }
           }
           feature: {
             key: "input_ids"
             value: { int64_list: { value: [ 1, 4, 3, 0 ] } }
           }
           feature: {
             key: "input_mask"
             value: { int64_list: { value: [ 1, 1, 1, 0 ] } }
           }
           feature: {
             key: "segment_ids"
             value: { int64_list: { value: [ 0, 0, 0, 1 ] } }
           }
         }
       }
       examples: {
         features: {
           feature: {
             key: "relevance"
             value: { int64_list: { value: [ 0 ] } }
           }
           feature: {
             key: "input_ids"
             value: { int64_list: { value: [ 2, 5, 8, 9 ] } }
           }
           feature: {
             key: "input_mask"
             value: { int64_list: { value: [ 1, 1, 1, 1 ] } }
           }
           feature: {
             key: "segment_ids"
             value: { int64_list: { value: [ 0, 0, 0, 0 ] } }
           }
         }
       }
     """
     elwc_example = text_format.Parse(elwc_text,
                                      input_pb2.ExampleListWithContext())
     copies = 8
     with tf.io.TFRecordWriter(data_file) as writer:
         for _ in range(copies):
             writer.write(elwc_example.SerializeToString())
def create_records(df, output_dir, num_of_records=5, prefix="movielens_"):
    """
    Takes a pandas dataframe and number of records to create and creates TFRecords.
    Saves records in output_dir

    One ELWC proto is written per distinct `user_id`: its context is built by
    `context_example` from the user's first row and its examples by
    `movie_example` from the values `process_df` returns for that user.

    Users are split into chunks of roughly `len(users) / num_of_records`, one
    file per chunk, so a remainder can produce one file more than requested.
    Files are named `<prefix><chunk:03d>.tfrecords`, or `<prefix>.tfrecords`
    when `num_of_records` is 1.

    Args:
        df: Dataframe with `user_id`, `agegroup`, `occupation`, `zipcode` and
            `sex` columns, plus whatever `process_df` reads.
        output_dir: Directory the record files are written to.
        num_of_records: Approximate number of files to create; must be >= 1.
        prefix: File-name prefix for the record files.

    Raises:
        ValueError: If `num_of_records` is smaller than 1.
    """
    if num_of_records < 1:
        raise ValueError(
            "num_of_records must be at least 1, got {}".format(num_of_records))

    all_users = list(set(df.user_id.values.tolist()))

    record_prefix = os.path.join(output_dir, prefix)
    # Approximate number of users per file. Clamped to 1 because a chunk size
    # of 0 (fewer users than files, or an empty frame) is an invalid range step.
    users_per_record = max(1, len(all_users) // num_of_records)

    chunk_starts = range(0, len(all_users), users_per_record)
    for chunk_number, start in enumerate(chunk_starts):
        print("Writing chunk ", str(chunk_number))
        user_chunk = all_users[start:start + users_per_record]

        if num_of_records == 1:
            record_file = record_prefix + ".tfrecords"
        else:
            record_file = record_prefix + str(chunk_number).zfill(3) + ".tfrecords"

        with tf.io.TFRecordWriter(record_file) as writer:
            for user in user_chunk:
                user_df = df.loc[df["user_id"] == user]
                # User attributes are taken from the user's first row.
                agegroup = user_df["agegroup"].values.tolist()[0]
                occupation = user_df["occupation"].values.tolist()[0]
                zipcode = user_df["zipcode"].values.tolist()[0]
                sex = user_df["sex"].values.tolist()[0]
                context = context_example(user, agegroup, occupation, zipcode, sex)

                movie_ids, movie_titles, title_descriptions, ratings = process_df(user_df)

                elwc = input_pb2.ExampleListWithContext()
                elwc.context.CopyFrom(context)
                # Index-based on purpose: a length mismatch between the
                # parallel lists raises instead of being silently truncated.
                for idx in range(len(movie_ids)):
                    elwc.examples.add().CopyFrom(
                        movie_example(movie_ids[idx], movie_titles[idx],
                                      title_descriptions[idx], ratings[idx]))

                writer.write(elwc.SerializeToString())
Ejemplo n.º 9
0
def _create_fake_preprocessed_dataset(output_path, seq_length, label_type):
    """Creates a fake dataset.

    Writes 32 ELWC records of 12 examples each to `output_path`. Every example
    carries random `input_ids` of length `seq_length`, all-ones `input_mask`
    and `segment_ids`, a constant `relevance` label, and its `query_id` and
    `document_id`.

    Args:
        output_path: Path of the TFRecord file to create.
        seq_length: Number of token ids generated per example.
        label_type: `tf.int64` (label 1) or `tf.float32` (label 0.5).

    Raises:
        ValueError: If `label_type` is neither `tf.int64` nor `tf.float32`.
    """
    def create_int_feature(values):
        return tf.train.Feature(
            int64_list=tf.train.Int64List(value=list(values)))

    def create_float_feature(values):
        return tf.train.Feature(
            float_list=tf.train.FloatList(value=list(values)))

    # The label is the same for every example, so resolve it once. Doing this
    # before opening the writer also means a bad type fails without creating
    # the output file.
    if label_type == tf.int64:
        relevance = create_int_feature([1])
    elif label_type == tf.float32:
        relevance = create_float_feature([0.5])
    else:
        raise ValueError('Unsupported label_type: %s' % label_type)

    elwc_num = 32
    list_size = 12
    # Context-managed so the file is flushed and closed even on error.
    with tf.io.TFRecordWriter(output_path) as writer:
        for query_id in range(elwc_num):
            elwc = input_pb2.ExampleListWithContext()
            for doc_id in range(list_size):
                input_ids = np.random.randint(100, size=(seq_length))
                features = {
                    'input_ids': create_int_feature(input_ids),
                    'input_mask': create_int_feature(np.ones_like(input_ids)),
                    'segment_ids': create_int_feature(np.ones_like(input_ids)),
                    'relevance': relevance,
                    'query_id': create_int_feature([query_id]),
                    'document_id': create_int_feature([doc_id]),
                }
                example = tf.train.Example(features=tf.train.Features(
                    feature=features))
                elwc.examples.append(example)

            writer.write(elwc.SerializeToString())
Ejemplo n.º 10
0
    def convert_to_elwc(self, context, examples, labels, label_name):
        """Builds an ELWC proto from one context and its list of examples.

        Args:
          context: (str) raw text for a context (aka. query).
          examples: (list) raw texts for a list of examples (aka. documents).
          labels: (list) a list of labels (int) for the `examples`.
          label_name: (str) name of the label feature in each example.

        Returns:
          A tensorflow.serving.ExampleListWithContext whose examples each hold
          the `input_ids`, `input_mask`, `segment_ids` and `label_name`
          int64 features. The context field is left empty.

        Raises:
          ValueError: if `examples` and `labels` differ in length.
        """
        if len(examples) != len(labels):
            raise ValueError(
                "`examples` and `labels` should have the same size!")

        def _int64(values):
            """Wraps a sequence of ints in an int64 tf.train.Feature."""
            return tf.train.Feature(int64_list=tf.train.Int64List(
                value=values))

        result = input_pb2.ExampleListWithContext()
        for doc_text, doc_label in zip(examples, labels):
            token_ids, mask, segments = self._to_bert_ids(context, doc_text)
            per_doc = {
                "input_ids": _int64(token_ids),
                "input_mask": _int64(mask),
                "segment_ids": _int64(segments),
                label_name: _int64([doc_label]),
            }
            result.examples.append(
                tf.train.Example(features=tf.train.Features(feature=per_doc)))

        return result
Ejemplo n.º 11
0
def create_records(df):
    """
    Builds one ELWC proto per distinct `device_id` value of a pandas dataframe.

    Each proto's context comes from `context_example`, using the first row
    selected for that user. The examples come from a single `movie_example()`
    call, so every returned proto carries the same examples.

    Returns a list of `ExampleListWithContext` protos; nothing is written to disk.
    """
    # NOTE(review): users are enumerated from `device_id` but rows are then
    # selected by `user_id` -- confirm both columns hold the same ids.
    all_users = list(set(df.device_id.values.tolist()))
    EXAMPLES = movie_example()

    def _first_value(frame, column):
        """Returns the first value of `column` in `frame`."""
        return frame[column].values.tolist()[0]

    all_records = []
    for user in all_users:
        user_df = df.loc[df["user_id"] == user]
        agegroup = _first_value(user_df, "agegroup")
        occupation = _first_value(user_df, "occupation")
        zipcode = _first_value(user_df, "zipcode")
        sex = _first_value(user_df, "sex")
        user_context = context_example(user, agegroup, occupation, zipcode, sex)

        elwc = input_pb2.ExampleListWithContext()
        elwc.context.CopyFrom(user_context)
        for example in EXAMPLES:
            elwc.examples.add().CopyFrom(example)

        all_records.append(elwc)
    return all_records
Ejemplo n.º 12
0
def create_elwc(features: Dict[str, tf.train.Feature]) -> Any:
    """Returns an ELWC proto whose context holds `features` and no examples.

    Args:
        features: Mapping from feature name to its `tf.train.Feature`.
    """
    context_proto = tf.train.Example(
        features=tf.train.Features(feature=features))
    result: Any = input_pb2.ExampleListWithContext()
    result.context.CopyFrom(context_proto)
    return result
          value: [22]
        }
      }
    }
    # example_float is not present.
    feature {
      key: "example_bytes"
      value {
        bytes_list {
          value: ["w"]
        }
      }
    }
  }
}
""", input_pb2.ExampleListWithContext()).SerializeToString(),
    text_format.Parse(
        """
context {
  features {
    feature {
      key: "ctx.int"
      value {
        int64_list {
          value: [3]
        }
      }
    }
    feature {
      key: "ctx.float"
      value {
Ejemplo n.º 14
0
    def testBigQueryToElwc(self, mock_client):
        """Checks that the BigQuery-to-ELWC transform yields the expected protos."""
        # Mock query result schema for _BigQueryConverter.
        mock_client.return_value.query.return_value.result.return_value.schema = self._schema

        def _expected_elwc(qid, row_ids):
            """Builds an ELWC with context `qid` and one example per row id.

            Each example carries the row id as an int64, a float and a bytes
            feature.
            """
            elwc = input_pb2.ExampleListWithContext()
            elwc.context.features.feature['qid'].int64_list.value.append(qid)
            for row_id in row_ids:
                row = elwc.examples.add().features.feature
                row['feature_id_1'].int64_list.value.append(row_id)
                row['feature_id_2'].float_list.value.append(float(row_id))
                row['feature_id_3'].bytes_list.value.append(
                    tf.compat.as_bytes(str(row_id)))
            return elwc

        query = 'SELECT qid, feature_id_1, feature_id_2, feature_id_3 FROM `fake`'

        with beam.Pipeline() as pipeline:
            to_elwc = executor._BigQueryToElwcExample(
                elwc_config=example_gen_pb2.ElwcConfig(
                    context_feature_fields=['qid']),
                input_dict={},
                exec_properties={},
                split_pattern=query)
            elwc_examples = pipeline | 'ToElwc' >> to_elwc

            expected_elwc_examples = [
                _expected_elwc(1, [1, 2]),
                _expected_elwc(2, [3, 4]),
                _expected_elwc(5, [5]),
            ]

            util.assert_that(elwc_examples,
                             util.equal_to(expected_elwc_examples))
Ejemplo n.º 15
0
    def test_convert_to_elwc(self):
        """Checks `convert_to_elwc` output for one query and three documents."""
        self._bert_max_seq_length = 8
        helper = self._create_tfrbert_util_with_vocab()
        actual_elwc = helper.convert_to_elwc(
            context="test",
            examples=["This", "This is simple test", "test"],
            labels=[1, 0, 1],
            label_name="label")

        expected_text = """
        examples: {
          features: {
            feature: {
              key: "label"
              value: { int64_list: { value: [ 1 ] } }
            }
            feature: {
              key: "input_ids"
              value: { int64_list: { value: [ 7, 5, 8, 1, 8, 0, 0, 0 ] } }
            }
            feature: {
              key: "input_mask"
              value: { int64_list: { value: [ 1, 1, 1, 1, 1, 0, 0, 0 ] } }
            }
            feature: {
              key: "segment_ids"
              value: { int64_list: { value: [ 0, 0, 0, 1, 1, 0, 0, 0 ] } }
            }
          }
        }
        examples: {
          features: {
            feature: {
              key: "label"
              value: { int64_list: { value: [ 0 ] } }
            }
            feature: {
              key: "input_ids"
              value: { int64_list: { value: [ 7, 5, 8, 1, 2, 4, 5, 8 ] } }
            }
            feature: {
              key: "input_mask"
              value: { int64_list: { value: [ 1, 1, 1, 1, 1, 1, 1, 1 ] } }
            }
            feature: {
              key: "segment_ids"
              value: { int64_list: { value: [ 0, 0, 0, 1, 1, 1, 1, 1 ] } }
            }
          }
        }
        examples: {
          features: {
            feature: {
              key: "label"
              value: { int64_list: { value: [ 1 ] } }
            }
            feature: {
              key: "input_ids"
              value: { int64_list: { value: [ 7, 5, 8, 5, 8, 0, 0, 0 ] } }
            }
            feature: {
              key: "input_mask"
              value: { int64_list: { value: [ 1, 1, 1, 1, 1, 0, 0, 0 ] } }
            }
            feature: {
              key: "segment_ids"
              value: { int64_list: { value: [ 0, 0, 0, 1, 1, 0, 0, 0 ] } }
            }
          }
        }"""
        expected_elwc = text_format.Parse(expected_text,
                                          input_pb2.ExampleListWithContext())

        # Compare the text renderings of both protos.
        expected_rendering = text_format.MessageToString(expected_elwc)
        actual_rendering = text_format.MessageToString(actual_elwc)
        self.assertEqual(expected_rendering, actual_rendering)
Ejemplo n.º 16
0
        value { bytes_list { value: ["irrelevant", "data"] } }
      }
      feature {
        key: "relevance"
        value { int64_list { value: 1 } }
      }
    }""", tf.train.Example()),
]

try:
  from tensorflow_serving.apis import input_pb2
except ImportError:
  !pip install - q tensorflow-serving-api
  from tensorflow_serving.apis import input_pb2

# Assemble one ELWC proto: the shared context plus a copy of every example.
ELWC = input_pb2.ExampleListWithContext()
ELWC.context.CopyFrom(CONTEXT)
for example in EXAMPLES:
  # `add()` appends an empty example to the list; fill it from the source proto.
  example_features = ELWC.examples.add()
  example_features.CopyFrom(example)

# Show the assembled proto.
print(ELWC)

# Paths to the TFRecord files containing the training and test instances.
_TRAIN_DATA_PATH = "/tmp/train.tfrecords"
_TEST_DATA_PATH = "/tmp/test.tfrecords"

# Path to the vocabulary file for query and document tokens.
_VOCAB_PATH = "/tmp/vocab.txt"

# The maximum number of documents per query in the dataset.
Ejemplo n.º 17
0
      features {
        feature {
          key: "custom_features_1"
          value { float_list { value: 1.0 } }
        }
        feature {
          key: "custom_features_3"
          value { float_list { value: 1.0 } }
        }
        feature {
          key: "utility"
          value { float_list { value: 1.0 } }
        }
      }
    }
    """, input_pb2.ExampleListWithContext())

EXAMPLE_PROTO_1 = text_format.Parse(
    """
    features {
      feature {
        key: "cf_1"
        value { float_list { value: 1.0 } }
      }
      feature {
        key: "custom_features_1"
        value { float_list { value: 1.0 } }
      }
      feature {
        key: "custom_features_2"
        value { float_list { value: 1.0 } }
Ejemplo n.º 18
0
def write_context_examples(path, samples):
    """Converts every sample into an ELWC proto and writes them to a TFRecord.

    When the module-level `_FAKE_ELWC` flag is truthy, each sample becomes a
    synthetic ELWC: one example with relevance 1 and `rv_tokens` [1], and a
    context whose `event_tokens` is a random integer in {1, 2}. Otherwise each
    sample's `rvli` items become the examples and `s.features()` the context,
    with every value encoded by `dispatch_fn[<feature name>]`.

    All protos are built first; the file is only written afterwards.

    Args:
        path: Path of the TFRecord file to write.
        samples: Iterable of objects exposing `rvli` and `features()`.
    """
    def _to_example(feature_map):
        """Wraps a name -> tf.train.Feature mapping in a tf.train.Example."""
        return tf.train.Example(
            features=tf.train.Features(feature=feature_map))

    def _fake_example(relevance, rvfeatures):
        """Synthetic example: token list plus relevance label."""
        return _to_example({
            'rv_tokens': _int64_list_feature(rvfeatures),  # _RV_FEATURE
            'relevance': _int64_feature(relevance),  # _LABEL_FEATURE
        })

    def _fake_context(contfeatures):
        """Synthetic context holding only the event tokens."""
        return _to_example({
            'event_tokens': _int64_list_feature(contfeatures),
        })

    def _real_example(rv):
        """Example for one `rv`: its relevance plus each dispatched feature."""
        feature_map = {
            'relevance': _int64_feature(rv.relevance),  # _LABEL_FEATURE
        }
        # Values are matched to feature names by position.
        for position, value in enumerate(rv.features()):
            name = rv_feature_names[position]
            feature_map[name] = dispatch_fn[name](value)
        return _to_example(feature_map)

    def _real_context(contfeatures):
        """Context built from positional event feature values."""
        feature_map = {}
        for position, value in enumerate(contfeatures):
            name = event_feature_names[position]
            feature_map[name] = dispatch_fn[name](value)
        return _to_example(feature_map)

    event_feature_names = _EVENT_FEATURES
    rv_feature_names = _RV_FEATURES

    built = []
    for sample in samples:
        candidates = sample.rvli
        sample_examples = []
        if _FAKE_ELWC:
            # The range covers only i == 1, so exactly one relevant example
            # is produced.
            for i in range(1, 2):
                relevance = 1 if i == 1 else 0
                sample_examples.append(_fake_example(relevance, [i]))
            sample_context = _fake_context([random.randint(1, 2)])
        else:
            for candidate in candidates:
                sample_examples.append(_real_example(candidate))
            sample_context = _real_context(sample.features())

        elwc = input_pb2.ExampleListWithContext()
        elwc.context.CopyFrom(sample_context)
        for sample_example in sample_examples:
            elwc.examples.add().CopyFrom(sample_example)
        built.append(elwc)

    with tf.io.TFRecordWriter(path) as writer:
        for elwc in built:
            writer.write(elwc.SerializeToString())