def _df_to_input_fn(df, name, dataset, data_dir, batch_size, repeat, shuffle):
  """Serialize a dataframe to a buffer file and return an input_fn to read it."""
  buffer_path = _buffer_path(data_dir, dataset, name)
  expected_size = _BUFFER_SIZE[dataset].get(name)
  file_io.write_to_buffer(
      dataframe=df, buffer_path=buffer_path,
      columns=list(_FEATURE_MAP.keys()), expected_size=expected_size)

  def input_fn():
    dataset = tf.data.TFRecordDataset(buffer_path)
    # batch comes before map because map can deserialize multiple examples.
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(_deserialize, num_parallel_calls=16)
    if shuffle:
      # `shuffle` doubles as the shuffle buffer size when it is truthy.
      dataset = dataset.shuffle(shuffle)
    dataset = dataset.repeat(repeat)
    return dataset.prefetch(1)

  return input_fn
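# Illustrative usage sketch only (not part of the pipeline). It assumes that
# integer-valued placeholder rows are acceptable for every column listed in
# _FEATURE_MAP, and that "ml-1m" is a key of _BUFFER_SIZE; adjust both for the
# dataset actually in use.
def _example_df_to_input_fn_usage():
  """Sketch: build and consume an input_fn from an in-memory dataframe."""
  example_df = pd.DataFrame({col: [0, 1, 2, 3] for col in _FEATURE_MAP})
  train_input_fn = _df_to_input_fn(
      df=example_df, name="train", dataset="ml-1m", data_dir="/tmp/ncf_data",
      batch_size=2, repeat=1, shuffle=4)
  # The returned callable builds the tf.data.Dataset lazily, typically inside
  # an Estimator call such as estimator.train(input_fn=train_input_fn).
  return train_input_fn()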
def get_input_fn(training, batch_size, ncf_dataset, data_dir, dataset,
                 repeat=1):
  """Input function for model training and evaluation.

  The train input consists of 1 positive instance (user and item have
  interactions) followed by some number of negative instances in which the
  items are randomly chosen. The number of negative instances is
  "num_negatives" which is 4 by default. Note that for each epoch, we need to
  re-generate the negative instances. Together with positive instances, they
  form a new train dataset.

  Args:
    training: A boolean flag for training mode.
    batch_size: An integer, batch size for training and evaluation.
    ncf_dataset: An NCFDataSet object, which contains the information about
      training and test data.
    data_dir: A string, the directory in which buffer files are written.
    dataset: A string, the name of the dataset, used to locate buffer files
      and their expected sizes.
    repeat: An integer, how many times to repeat the dataset.

  Returns:
    input_fn: A function which returns a tf.data.Dataset object containing
      examples loaded from the buffer files.
  """
  # Generate random negative instances for training in each epoch.
  if training:
    tf.logging.info("Generating training data.")
    train_data = generate_train_dataset(
        ncf_dataset.train_data, ncf_dataset.num_items,
        ncf_dataset.num_negatives)
    df = pd.DataFrame(data=train_data, columns=_COLUMNS)

    # Write the buffer alongside the data when it lives on GCS; otherwise fall
    # back to a local temporary file.
    if data_dir.startswith("gs://"):
      buffer_dir = os.path.join(data_dir, _BUFFER_SUBDIR)
    else:
      buffer_dir = None

    buffer_path = file_io.write_to_temp_buffer(df, buffer_dir, _COLUMNS)
    map_fn = _deserialize_train

  else:
    df = pd.DataFrame(ncf_dataset.all_eval_data, columns=_EVAL_COLUMNS)
    buffer_path = os.path.join(
        data_dir, _BUFFER_SUBDIR, dataset + "_eval_buffer")
    file_io.write_to_buffer(
        dataframe=df, buffer_path=buffer_path, columns=_EVAL_COLUMNS,
        expected_size=_EVAL_BUFFER_SIZE[dataset])
    map_fn = _deserialize_eval

  def input_fn():  # pylint: disable=missing-docstring
    dataset = tf.data.TFRecordDataset(buffer_path)
    if training:
      dataset = dataset.shuffle(buffer_size=_SHUFFLE_BUFFER_SIZE)
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(map_fn, num_parallel_calls=16)
    dataset = dataset.repeat(repeat)
    # Prefetch to improve speed of input pipeline.
    dataset = dataset.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)
    return dataset

  return input_fn
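# Illustrative sketch only: how the input_fn factories above are typically
# wired into an Estimator train/evaluate loop. The estimator argument and the
# batch size are assumptions for the example; this module only builds the
# input functions.
def _example_train_and_eval(estimator, ncf_dataset, data_dir, dataset):
  """Sketch: one training epoch followed by one evaluation pass."""
  train_input_fn = get_input_fn(
      training=True, batch_size=256, ncf_dataset=ncf_dataset,
      data_dir=data_dir, dataset=dataset, repeat=1)
  eval_input_fn = get_input_fn(
      training=False, batch_size=256, ncf_dataset=ncf_dataset,
      data_dir=data_dir, dataset=dataset, repeat=1)
  estimator.train(input_fn=train_input_fn)
  return estimator.evaluate(input_fn=eval_input_fn)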