def _ConvertContextAndExamplesToElwc(
    context_feature_and_examples: Tuple[bytes, List[tf.train.Example]]
) -> input_pb2.ExampleListWithContext:
  """Converts a context feature and a list of examples to an ELWC proto."""
  context_feature, examples = context_feature_and_examples
  context_feature_proto = tf.train.Example()
  context_feature_proto.ParseFromString(context_feature)
  return input_pb2.ExampleListWithContext(
      context=context_feature_proto, examples=examples)
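
# A minimal usage sketch (not from the original source; feature names are
# illustrative): it assumes a serialized tf.train.Example holding the context
# features plus a list of per-document tf.train.Example protos, and shows how
# the helper above pairs them into one ELWC.
context_example = tf.train.Example()
context_example.features.feature['qid'].int64_list.value.append(1)
doc_example = tf.train.Example()
doc_example.features.feature['relevance'].int64_list.value.append(1)
elwc = _ConvertContextAndExamplesToElwc(
    (context_example.SerializeToString(), [doc_example]))
print(elwc)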
def CombineContextAndExamples(
    self, context_feature_and_examples) -> input_pb2.ExampleListWithContext:
  (context_feature, examples) = context_feature_and_examples
  context_feature_dict = context_feature._asdict()
  context_features = self.DataToFeatures(context_feature_dict)
  return input_pb2.ExampleListWithContext(
      context=tf.train.Example(features=context_features),
      examples=examples)
def generate_tf_record(input_path, output_path, feature_names):
  """Converts a LibSVM-style text file into ELWC TFRecords, grouping lines by qid."""

  def _parse_line(line):
    """Parses a single line in LibSVM format."""
    tokens = line.split()
    assert len(tokens) >= 2, "Ill-formatted line: {}".format(line)
    label = float(tokens[0])
    qid = tokens[1]
    features = {k: v for k, v in zip(feature_names, tokens[2:])}
    return qid, features, label

  def _generate_per_example(features, label):
    example_feature_dict = {
        k: tf.train.Feature(bytes_list=tf.train.BytesList(value=[v.encode()]))
        for k, v in features.items()
    }
    example_feature_dict['label'] = tf.train.Feature(
        float_list=tf.train.FloatList(value=[label]))
    return tf.train.Example(
        features=tf.train.Features(feature=example_feature_dict))

  tf.compat.v1.logging.info("Start to convert {} to {}".format(
      input_path, output_path))
  writer = tf.io.TFRecordWriter(output_path)
  with open(input_path, "rt") as f:
    qid_mark = ''
    elwc = input_pb2.ExampleListWithContext()
    for line in f:
      qid, features, label = _parse_line(line)
      if qid_mark == '' or qid == qid_mark:
        # Same query as the previous line: append to the current ELWC.
        elwc.examples.add().CopyFrom(_generate_per_example(features, label))
        qid_mark = qid
      else:
        # New query: flush the finished ELWC and start a new one.
        writer.write(elwc.SerializeToString())
        elwc = input_pb2.ExampleListWithContext()
        elwc.examples.add().CopyFrom(_generate_per_example(features, label))
        qid_mark = qid
    # Flush the last query group, which the loop above never writes.
    if elwc.examples:
      writer.write(elwc.SerializeToString())
  writer.close()
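
# A hypothetical usage sketch (file paths and feature names are illustrative,
# not from the original source). Each input line carries a label, a qid token,
# and one value per entry in `feature_names`, matching `_parse_line` above;
# lines that share a qid end up in the same ELWC.
with open("/tmp/train.txt", "wt") as f:
  f.write("1.0 qid:10 0.2 0.7 0.5\n"
          "0.0 qid:10 0.1 0.4 0.9\n"
          "2.0 qid:11 0.8 0.3 0.6\n")
generate_tf_record("/tmp/train.txt", "/tmp/train.tfrecords",
                   feature_names=["f1", "f2", "f3"])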
def _write_train_eval_tfrecord_examples(self):
  elwc_example = text_format.Parse(
      """
      context: {}
      examples: {
        features: {
          feature: {
            key: "relevance"
            value: { int64_list: { value: [ 1 ] } }
          }
          feature: {
            key: "input_ids"
            value: { int64_list: { value: [ 1, 4, 3, 0 ] } }
          }
          feature: {
            key: "input_mask"
            value: { int64_list: { value: [ 1, 1, 1, 0 ] } }
          }
          feature: {
            key: "segment_ids"
            value: { int64_list: { value: [ 0, 0, 0, 1 ] } }
          }
        }
      }
      examples: {
        features: {
          feature: {
            key: "relevance"
            value: { int64_list: { value: [ 0 ] } }
          }
          feature: {
            key: "input_ids"
            value: { int64_list: { value: [ 2, 5, 8, 9 ] } }
          }
          feature: {
            key: "input_mask"
            value: { int64_list: { value: [ 1, 1, 1, 1 ] } }
          }
          feature: {
            key: "segment_ids"
            value: { int64_list: { value: [ 0, 0, 0, 0 ] } }
          }
        }
      }
      """, input_pb2.ExampleListWithContext())

  # Writes TFRecord examples for training.
  with tf.io.TFRecordWriter(self._train_file) as writer:
    for example in [elwc_example] * 10:
      writer.write(example.SerializeToString())

  # Writes TFRecord examples for evaluation.
  with tf.io.TFRecordWriter(self._eval_file) as writer:
    for example in [elwc_example] * 5:
      writer.write(example.SerializeToString())
def read_and_print_tf_record(target_filename, num_of_examples_to_read):
  """Reads up to `num_of_examples_to_read` serialized ELWC protos from a TFRecord file."""
  filenames = [target_filename]
  tf_record_dataset = tf.data.TFRecordDataset(filenames)

  all_examples = []
  for raw_record in tf_record_dataset.take(num_of_examples_to_read):
    example_list_with_context = input_pb2.ExampleListWithContext()
    example_list_with_context.ParseFromString(raw_record.numpy())
    all_examples.append(example_list_with_context)

  return all_examples
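
# A minimal usage sketch (the file path is illustrative, not from the original
# source): reads back ELWC protos written earlier and prints them. It assumes
# eager execution, which the `raw_record.numpy()` call in the reader requires.
for elwc in read_and_print_tf_record("/tmp/train.tfrecords", 2):
  print(elwc)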
def encode(self, instance):
  context = self._encode_context(instance)
  examples = self._encode_examples(instance)
  proto = input_pb2.ExampleListWithContext(context=context, examples=examples)
  if self._serialized:
    return proto.SerializeToString()
  else:
    return proto
def _write_train_eval_data(self, data_file):
  elwc_example = text_format.Parse(
      """
      context: {}
      examples: {
        features: {
          feature: {
            key: "relevance"
            value: { int64_list: { value: [ 1 ] } }
          }
          feature: {
            key: "input_ids"
            value: { int64_list: { value: [ 1, 4, 3, 0 ] } }
          }
          feature: {
            key: "input_mask"
            value: { int64_list: { value: [ 1, 1, 1, 0 ] } }
          }
          feature: {
            key: "segment_ids"
            value: { int64_list: { value: [ 0, 0, 0, 1 ] } }
          }
        }
      }
      examples: {
        features: {
          feature: {
            key: "relevance"
            value: { int64_list: { value: [ 0 ] } }
          }
          feature: {
            key: "input_ids"
            value: { int64_list: { value: [ 2, 5, 8, 9 ] } }
          }
          feature: {
            key: "input_mask"
            value: { int64_list: { value: [ 1, 1, 1, 1 ] } }
          }
          feature: {
            key: "segment_ids"
            value: { int64_list: { value: [ 0, 0, 0, 0 ] } }
          }
        }
      }
      """, input_pb2.ExampleListWithContext())

  with tf.io.TFRecordWriter(data_file) as writer:
    for example in [elwc_example] * 8:
      writer.write(example.SerializeToString())
def create_records(df, output_dir, num_of_records=5, prefix="movielens_"):
  """Splits the users in a pandas DataFrame into `num_of_records` chunks and
  writes one ELWC TFRecord file per chunk under `output_dir`."""
  all_users = list(set(df.user_id.values.tolist()))
  record_prefix = os.path.join(output_dir, prefix)
  # Approximate number of users (and hence ELWCs) per record file.
  files_per_record = int(len(all_users) / num_of_records)
  chunk_number = 0
  for i in range(0, len(all_users), files_per_record):
    print("Writing chunk ", str(chunk_number))
    user_chunk = all_users[i:i + files_per_record]
    if num_of_records == 1:
      record_file = record_prefix + ".tfrecords"
    else:
      record_file = record_prefix + str(chunk_number).zfill(3) + ".tfrecords"
    with tf.io.TFRecordWriter(record_file) as writer:
      for user in user_chunk:
        user_df = df.loc[df["user_id"] == user]
        agegroup = user_df["agegroup"].values.tolist()[0]
        occupation = user_df["occupation"].values.tolist()[0]
        zipcode = user_df["zipcode"].values.tolist()[0]
        sex = user_df["sex"].values.tolist()[0]
        CONTEXT = context_example(user, agegroup, occupation, zipcode, sex)
        EXAMPLES = []
        movie_ids, movie_titles, title_descriptions, ratings = process_df(user_df)
        # Use a separate index so the outer chunking variable `i` is not shadowed.
        for j in range(len(movie_ids)):
          EXAMPLES.append(
              movie_example(movie_ids[j], movie_titles[j],
                            title_descriptions[j], ratings[j]))
        ELWC = input_pb2.ExampleListWithContext()
        ELWC.context.CopyFrom(CONTEXT)
        for example in EXAMPLES:
          example_features = ELWC.examples.add()
          example_features.CopyFrom(example)
        writer.write(ELWC.SerializeToString())
    chunk_number += 1
def _create_fake_preprocessed_dataset(output_path, seq_length, label_type):
  """Creates a fake dataset."""
  writer = tf.io.TFRecordWriter(output_path)

  def create_int_feature(values):
    f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
    return f

  def create_float_feature(values):
    f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
    return f

  elwc_num = 32
  list_size = 12
  for query_id in range(elwc_num):
    elwc = input_pb2.ExampleListWithContext()
    for doc_id in range(list_size):
      features = {}
      input_ids = np.random.randint(100, size=(seq_length))
      features['input_ids'] = create_int_feature(input_ids)
      features['input_mask'] = create_int_feature(np.ones_like(input_ids))
      features['segment_ids'] = create_int_feature(np.ones_like(input_ids))
      if label_type == tf.int64:
        features['relevance'] = create_int_feature([1])
      elif label_type == tf.float32:
        features['relevance'] = create_float_feature([0.5])
      else:
        raise ValueError('Unsupported label_type: %s' % label_type)
      features['query_id'] = create_int_feature([query_id])
      features['document_id'] = create_int_feature([doc_id])
      example = tf.train.Example(features=tf.train.Features(feature=features))
      elwc.examples.append(example)
    writer.write(elwc.SerializeToString())
  writer.close()
def convert_to_elwc(self, context, examples, labels, label_name):
  """Converts a <context, example list> pair to an ELWC example.

  Args:
    context: (str) raw text for a context (aka. query).
    examples: (list) raw texts for a list of examples (aka. documents).
    labels: (list) a list of labels (int) for the `examples`.
    label_name: (str) name of the label in the ELWC example.

  Returns:
    A tensorflow.serving.ExampleListWithContext example containing the
    `input_ids`, `input_masks`, `segment_ids` and `label_id` fields.
  """
  if len(examples) != len(labels):
    raise ValueError("`examples` and `labels` should have the same size!")

  elwc = input_pb2.ExampleListWithContext()
  for example, label in zip(examples, labels):
    (input_ids, input_mask, segment_ids) = self._to_bert_ids(context, example)

    feature = {
        "input_ids":
            tf.train.Feature(int64_list=tf.train.Int64List(value=input_ids)),
        "input_mask":
            tf.train.Feature(int64_list=tf.train.Int64List(value=input_mask)),
        "segment_ids":
            tf.train.Feature(int64_list=tf.train.Int64List(value=segment_ids)),
        label_name:
            tf.train.Feature(int64_list=tf.train.Int64List(value=[label]))
    }
    tf_example = tf.train.Example(features=tf.train.Features(feature=feature))
    elwc.examples.append(tf_example)
  return elwc
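
# A hypothetical usage sketch (the `bert_helper` instance, query, documents,
# and labels are illustrative). `convert_to_elwc` is a method, so it needs a
# helper object that provides `_to_bert_ids`; one ELWC is produced per query,
# with one example per document.
elwc = bert_helper.convert_to_elwc(
    context="where to eat in paris",
    examples=["Restaurant guide for Paris", "Weather in Berlin"],
    labels=[1, 0],
    label_name="relevance")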
def create_records(df):
  """Builds one ELWC proto per user in a pandas DataFrame and returns them as a list."""
  all_users = list(set(df.user_id.values.tolist()))
  EXAMPLES = movie_example()
  all_records = []
  for user in all_users:
    user_df = df.loc[df["user_id"] == user]
    agegroup = user_df["agegroup"].values.tolist()[0]
    occupation = user_df["occupation"].values.tolist()[0]
    zipcode = user_df["zipcode"].values.tolist()[0]
    sex = user_df["sex"].values.tolist()[0]
    CONTEXT = context_example(user, agegroup, occupation, zipcode, sex)
    ELWC = input_pb2.ExampleListWithContext()
    ELWC.context.CopyFrom(CONTEXT)
    for example in EXAMPLES:
      example_features = ELWC.examples.add()
      example_features.CopyFrom(example)
    all_records.append(ELWC)
  return all_records
def create_elwc(features: Dict[str, tf.train.Feature]) -> Any:
  elwc: Any = input_pb2.ExampleListWithContext()
  context_feature = tf.train.Example(
      features=tf.train.Features(feature=features))
  elwc.context.CopyFrom(context_feature)
  return elwc
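
# A minimal usage sketch (feature names are illustrative, not from the original
# source): wraps a dict of context features into an ELWC with an empty example
# list, which callers can then populate via `elwc.examples.add()`.
elwc = create_elwc({
    "query_tokens": tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[b"best", b"pizza"])),
})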
                value: [22]
              }
            }
          }
          # example_float is not present.
          feature {
            key: "example_bytes"
            value { bytes_list { value: ["w"] } }
          }
        }
      }
      """, input_pb2.ExampleListWithContext()).SerializeToString(),
  text_format.Parse(
      """
      context {
        features {
          feature {
            key: "ctx.int"
            value { int64_list { value: [3] } }
          }
          feature {
            key: "ctx.float"
            value {
def testBigQueryToElwc(self, mock_client):
  # Mock query result schema for _BigQueryConverter.
  mock_client.return_value.query.return_value.result.return_value.schema = self._schema

  with beam.Pipeline() as pipeline:
    elwc_examples = (
        pipeline
        | 'ToElwc' >> executor._BigQueryToElwcExample(
            elwc_config=example_gen_pb2.ElwcConfig(
                context_feature_fields=['qid']),
            input_dict={},
            exec_properties={},
            split_pattern='SELECT qid, feature_id_1, feature_id_2, feature_id_3 FROM `fake`'))

    elwc_1 = input_pb2.ExampleListWithContext()
    elwc_1.context.features.feature['qid'].int64_list.value.append(1)
    example1 = elwc_1.examples.add()
    example1.features.feature['feature_id_1'].int64_list.value.append(1)
    example1.features.feature['feature_id_2'].float_list.value.append(1.0)
    example1.features.feature['feature_id_3'].bytes_list.value.append(
        tf.compat.as_bytes('1'))
    example2 = elwc_1.examples.add()
    example2.features.feature['feature_id_1'].int64_list.value.append(2)
    example2.features.feature['feature_id_2'].float_list.value.append(2.0)
    example2.features.feature['feature_id_3'].bytes_list.value.append(
        tf.compat.as_bytes('2'))

    elwc_2 = input_pb2.ExampleListWithContext()
    elwc_2.context.features.feature['qid'].int64_list.value.append(2)
    example3 = elwc_2.examples.add()
    example3.features.feature['feature_id_1'].int64_list.value.append(3)
    example3.features.feature['feature_id_2'].float_list.value.append(3.0)
    example3.features.feature['feature_id_3'].bytes_list.value.append(
        tf.compat.as_bytes('3'))
    example4 = elwc_2.examples.add()
    example4.features.feature['feature_id_1'].int64_list.value.append(4)
    example4.features.feature['feature_id_2'].float_list.value.append(4.0)
    example4.features.feature['feature_id_3'].bytes_list.value.append(
        tf.compat.as_bytes('4'))

    elwc_3 = input_pb2.ExampleListWithContext()
    elwc_3.context.features.feature['qid'].int64_list.value.append(5)
    example5 = elwc_3.examples.add()
    example5.features.feature['feature_id_1'].int64_list.value.append(5)
    example5.features.feature['feature_id_2'].float_list.value.append(5.0)
    example5.features.feature['feature_id_3'].bytes_list.value.append(
        tf.compat.as_bytes('5'))

    expected_elwc_examples = [elwc_1, elwc_2, elwc_3]
    util.assert_that(elwc_examples, util.equal_to(expected_elwc_examples))
def test_convert_to_elwc(self):
  query = "test"
  documents = ["This", "This is simple test", "test"]
  label_name = "label"
  labels = [1, 0, 1]
  self._bert_max_seq_length = 8
  bert_helper = self._create_tfrbert_util_with_vocab()
  elwc = bert_helper.convert_to_elwc(
      context=query, examples=documents, labels=labels, label_name=label_name)

  expected_elwc = text_format.Parse(
      """
      examples: {
        features: {
          feature: {
            key: "label"
            value: { int64_list: { value: [ 1 ] } }
          }
          feature: {
            key: "input_ids"
            value: { int64_list: { value: [ 7, 5, 8, 1, 8, 0, 0, 0 ] } }
          }
          feature: {
            key: "input_mask"
            value: { int64_list: { value: [ 1, 1, 1, 1, 1, 0, 0, 0 ] } }
          }
          feature: {
            key: "segment_ids"
            value: { int64_list: { value: [ 0, 0, 0, 1, 1, 0, 0, 0 ] } }
          }
        }
      }
      examples: {
        features: {
          feature: {
            key: "label"
            value: { int64_list: { value: [ 0 ] } }
          }
          feature: {
            key: "input_ids"
            value: { int64_list: { value: [ 7, 5, 8, 1, 2, 4, 5, 8 ] } }
          }
          feature: {
            key: "input_mask"
            value: { int64_list: { value: [ 1, 1, 1, 1, 1, 1, 1, 1 ] } }
          }
          feature: {
            key: "segment_ids"
            value: { int64_list: { value: [ 0, 0, 0, 1, 1, 1, 1, 1 ] } }
          }
        }
      }
      examples: {
        features: {
          feature: {
            key: "label"
            value: { int64_list: { value: [ 1 ] } }
          }
          feature: {
            key: "input_ids"
            value: { int64_list: { value: [ 7, 5, 8, 5, 8, 0, 0, 0 ] } }
          }
          feature: {
            key: "input_mask"
            value: { int64_list: { value: [ 1, 1, 1, 1, 1, 0, 0, 0 ] } }
          }
          feature: {
            key: "segment_ids"
            value: { int64_list: { value: [ 0, 0, 0, 1, 1, 0, 0, 0 ] } }
          }
        }
      }""", input_pb2.ExampleListWithContext())

  self.assertEqual(text_format.MessageToString(expected_elwc),
                   text_format.MessageToString(elwc))
value { bytes_list { value: ["irrelevant", "data"] } } } feature { key: "relevance" value { int64_list { value: 1 } } } }""", tf.train.Example()), ] try: from tensorflow_serving.apis import input_pb2 except ImportError: !pip install - q tensorflow-serving-api from tensorflow_serving.apis import input_pb2 ELWC = input_pb2.ExampleListWithContext() ELWC.context.CopyFrom(CONTEXT) for example in EXAMPLES: example_features = ELWC.examples.add() example_features.CopyFrom(example) print(ELWC) # Store the paths to files containing training and test instances. _TRAIN_DATA_PATH = "/tmp/train.tfrecords" _TEST_DATA_PATH = "/tmp/test.tfrecords" # Store the vocabulary path for query and document tokens. _VOCAB_PATH = "/tmp/vocab.txt" # The maximum number of documents per query in the dataset.
features { feature { key: "custom_features_1" value { float_list { value: 1.0 } } } feature { key: "custom_features_3" value { float_list { value: 1.0 } } } feature { key: "utility" value { float_list { value: 1.0 } } } } } """, input_pb2.ExampleListWithContext()) EXAMPLE_PROTO_1 = text_format.Parse( """ features { feature { key: "cf_1" value { float_list { value: 1.0 } } } feature { key: "custom_features_1" value { float_list { value: 1.0 } } } feature { key: "custom_features_2" value { float_list { value: 1.0 } }
def write_context_examples(path, samples):
  """Serializes `samples` into ELWC protos and writes them to a TFRecord file at `path`."""

  def serialize_example_fake(relevance, rvfeatures):
    """Fake example: the same number appears in the context and the relevant example."""
    # Create a dictionary mapping the feature name to the tf.Example-compatible
    # data type.
    feature = {
        'rv_tokens': _int64_list_feature(rvfeatures),  # _RV_FEATURE
        'relevance': _int64_feature(relevance),  # _LABEL_FEATURE
    }
    # Create a Features message using tf.train.Example.
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    return example  # .SerializeToString()

  def serialize_context_fake(contfeatures):
    """Creates a tf.Example message ready to be written to a file."""
    feature = {
        'event_tokens': _int64_list_feature(contfeatures),
    }
    context = tf.train.Example(features=tf.train.Features(feature=feature))
    return context  # .SerializeToString()

  def serialize_example():
    """Creates a tf.Example message ready to be written to a file.

    Concatenates 'rv.feat + rv.tline'.
    """
    feature = {
        'relevance': _int64_feature(rv.relevance),  # _LABEL_FEATURE
    }
    for k, val in enumerate(rv.features()):
      fname = rv_features[k]
      feature[fname] = dispatch_fn[fname](val)
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    return example  # .SerializeToString()

  def serialize_context(contfeatures):
    """Creates a tf.Example message ready to be written to a file."""
    feature = {}
    for k, val in enumerate(contfeatures):
      fname = ev_features[k]
      feature[fname] = dispatch_fn[fname](val)
    context = tf.train.Example(features=tf.train.Features(feature=feature))
    return context  # .SerializeToString()

  elwc_list = []
  ev_features = _EVENT_FEATURES
  rv_features = _RV_FEATURES
  for s in samples:
    rvli = s.rvli
    example_list = []
    if _FAKE_ELWC:
      # 1 is relevant, the rest is not.
      # example = serialize_example_fake(1, [1])
      # example_list.append(example)
      for i in range(1, 2):
        # relev = random.randint(0, 1)
        if i == 1:
          relev = 1
        else:
          relev = 0
        example = serialize_example_fake(relev, [i])
        example_list.append(example)
      cont_feature = random.randint(1, 2)
      # cont_feature = 1
      context = serialize_context_fake([cont_feature])
    else:
      for rv in rvli:
        # `serialize_example` reads the loop variable `rv` from this scope.
        example = serialize_example()
        example_list.append(example)
      # context = serialize_context(s.features)
      context = serialize_context(s.features())

    ELWC = input_pb2.ExampleListWithContext()
    ELWC.context.CopyFrom(context)
    for example in example_list:
      example_features = ELWC.examples.add()
      example_features.CopyFrom(example)
    elwc_list.append(ELWC)

  file_path = path
  with tf.io.TFRecordWriter(file_path) as writer:
    for elwc in elwc_list:  # [:2]:
      # print(elwc)
      writer.write(elwc.SerializeToString())