Example 1
def _df_to_input_fn(df, name, dataset, data_dir, batch_size, repeat, shuffle):
  """Serialize a dataframe and write it to a buffer file."""
  buffer_path = _buffer_path(data_dir, dataset, name)
  expected_size = _BUFFER_SIZE[dataset].get(name)

  file_io.write_to_buffer(
      dataframe=df, buffer_path=buffer_path,
      columns=list(_FEATURE_MAP.keys()), expected_size=expected_size)

  def input_fn():
    dataset = tf.data.TFRecordDataset(buffer_path)
    # batch comes before map because map can deserialize multiple examples.
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(_deserialize, num_parallel_calls=16)
    if shuffle:
      dataset = dataset.shuffle(shuffle)

    dataset = dataset.repeat(repeat)
    return dataset.prefetch(1)

  return input_fn
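The "batch comes before map" comment relies on the map function deserializing a whole batch of serialized tf.Example protos in one call. Neither _deserialize nor _FEATURE_MAP is shown above; as a minimal sketch (assuming TF 1.x, with made-up feature names and dtypes rather than the real feature spec), a vectorized parser for this pattern could look like:

import tensorflow as tf

# Hypothetical feature spec; the real _FEATURE_MAP is not part of the example.
_SKETCH_FEATURE_MAP = {
    "user_id": tf.FixedLenFeature([1], dtype=tf.int64),
    "item_id": tf.FixedLenFeature([1], dtype=tf.int64),
}

def _deserialize_sketch(serialized_batch):
  # tf.parse_example parses every serialized tf.Example in the batch at once,
  # which is why batch() is applied before map() in the pipeline above.
  return tf.parse_example(serialized_batch, _SKETCH_FEATURE_MAP)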
Example 2
def _df_to_input_fn(df, name, dataset, data_dir, batch_size, repeat, shuffle):
    """Serialize a dataframe and write it to a buffer file."""
    buffer_path = _buffer_path(data_dir, dataset, name)
    expected_size = _BUFFER_SIZE[dataset].get(name)
    # Args for file_io.write_to_buffer:
    #   dataframe: The pandas dataframe to be serialized.
    #   buffer_path: The path where the serialized results will be written.
    #   columns: The dataframe columns to be serialized.
    #   expected_size: The size in bytes of the serialized results. This is
    #     used to lazily construct the buffer.
    file_io.write_to_buffer(
        dataframe=df, buffer_path=buffer_path,
        columns=list(_FEATURE_MAP.keys()), expected_size=expected_size)

    def input_fn():
        dataset = tf.data.TFRecordDataset(buffer_path)
        # batch comes before map because map can deserialize multiple examples.
        dataset = dataset.batch(batch_size)

        # Args for Dataset.map:
        #   map_func: A function mapping a nested structure of tensors (having
        #     shapes and types defined by `self.output_shapes` and
        #     `self.output_types`) to another nested structure of tensors.
        #   num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
        #     representing the number of elements to process in parallel. If
        #     not specified, elements will be processed sequentially.
        #
        # Returns:
        #   A `Dataset`.

        dataset = dataset.map(_deserialize, num_parallel_calls=16)
        if shuffle:
            dataset = dataset.shuffle(shuffle)

        dataset = dataset.repeat(repeat)
        return dataset.prefetch(1)

    return input_fn
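The comment block above documents the arguments of file_io.write_to_buffer, but the helper itself is not included in these examples. As a rough, self-contained illustration of the underlying idea (serializing dataframe rows into tf.Example records in a TFRecord buffer file), and not the actual helper, a TF 1.x sketch could be:

import pandas as pd
import tensorflow as tf

def write_to_buffer_sketch(dataframe, buffer_path, columns):
  # Write one tf.Example per dataframe row; integer-valued columns only,
  # to keep the sketch short.
  with tf.python_io.TFRecordWriter(buffer_path) as writer:
    for _, row in dataframe[columns].iterrows():
      feature = {
          col: tf.train.Feature(
              int64_list=tf.train.Int64List(value=[int(row[col])]))
          for col in columns
      }
      example = tf.train.Example(features=tf.train.Features(feature=feature))
      writer.write(example.SerializeToString())

write_to_buffer_sketch(
    pd.DataFrame({"user_id": [0, 1], "item_id": [10, 20]}),
    "/tmp/example_buffer.tfrecords", ["user_id", "item_id"])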
def get_input_fn(training, batch_size, ncf_dataset, data_dir, dataset,
                 repeat=1):
  """Input function for model training and evaluation.

  The train input consists of 1 positive instance (a user-item pair with an
  observed interaction) followed by a number of negative instances in which the
  items are randomly chosen. The number of negative instances is
  "num_negatives", which is 4 by default. Note that the negative instances are
  re-generated for each epoch; together with the positive instances, they form
  a new train dataset.

  Args:
    training: A boolean flag for training mode.
    batch_size: An integer, batch size for training and evaluation.
    ncf_dataset: An NCFDataSet object, which contains the information about
      training and test data.
    data_dir: A string, the directory where the serialized buffer files are
      written.
    dataset: A string, the dataset name, used to build the buffer file path and
      to look up the expected buffer size.
    repeat: An integer, how many times to repeat the dataset.

  Returns:
    input_fn: A function that, when called, returns a tf.data.Dataset of
      examples loaded from the buffer file.
  """
  # Generate random negative instances for training in each epoch
  if training:
    tf.logging.info("Generating training data.")
    train_data = generate_train_dataset(
        ncf_dataset.train_data, ncf_dataset.num_items,
        ncf_dataset.num_negatives)

    df = pd.DataFrame(data=train_data, columns=_COLUMNS)

    if data_dir.startswith("gs://"):
      buffer_dir = os.path.join(data_dir, _BUFFER_SUBDIR)
    else:
      buffer_dir = None

    buffer_path = file_io.write_to_temp_buffer(df, buffer_dir, _COLUMNS)
    map_fn = _deserialize_train

  else:
    df = pd.DataFrame(ncf_dataset.all_eval_data, columns=_EVAL_COLUMNS)
    buffer_path = os.path.join(
        data_dir, _BUFFER_SUBDIR, dataset + "_eval_buffer")

    file_io.write_to_buffer(
        dataframe=df, buffer_path=buffer_path, columns=_EVAL_COLUMNS,
        expected_size=_EVAL_BUFFER_SIZE[dataset])
    map_fn = _deserialize_eval


  def input_fn():  # pylint: disable=missing-docstring
    dataset = tf.data.TFRecordDataset(buffer_path)
    if training:
      dataset = dataset.shuffle(buffer_size=_SHUFFLE_BUFFER_SIZE)

    dataset = dataset.batch(batch_size)
    dataset = dataset.map(map_fn, num_parallel_calls=16)
    dataset = dataset.repeat(repeat)

    # Prefetch to improve speed of input pipeline.
    dataset = dataset.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)
    return dataset

  return input_fn
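generate_train_dataset is not shown here. The docstring above describes its output: each observed (user, item) pair, labeled positive, followed by num_negatives randomly drawn items labeled negative. A simplified sketch of that description (omitting the usual check that a sampled item is not among the user's known positives) is:

import numpy as np

def generate_train_dataset_sketch(train_data, num_items, num_negatives=4):
  rows = []
  for user, item in train_data:
    rows.append((user, item, 1))  # observed interaction -> positive instance
    for _ in range(num_negatives):
      # A randomly drawn item id serves as a negative instance.
      rows.append((user, np.random.randint(num_items), 0))
  return rows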
def get_input_fn(training,
                 batch_size,
                 ncf_dataset,
                 data_dir,
                 dataset,
                 repeat=1):
    """Input function for model training and evaluation.

  The train input consists of 1 positive instance (user and item have
  interactions) followed by some number of negative instances in which the items
  are randomly chosen. The number of negative instances is "num_negatives" which
  is 4 by default. Note that for each epoch, we need to re-generate the negative
  instances. Together with positive instances, they form a new train dataset.

  Args:
    training: A boolean flag for training mode.
    batch_size: An integer, batch size for training and evaluation.
    ncf_dataset: An NCFDataSet object, which contains the information about
      training and test data.
    repeat: An integer, how many times to repeat the dataset.

  Returns:
    dataset: A tf.data.Dataset object containing examples loaded from the files.
  """
    # Generate random negative instances for training in each epoch
    if training:
        tf.logging.info("Generating training data.")
        train_data = generate_train_dataset(ncf_dataset.train_data,
                                            ncf_dataset.num_items,
                                            ncf_dataset.num_negatives)

        df = pd.DataFrame(data=train_data, columns=_COLUMNS)

        if data_dir.startswith("gs://"):
            buffer_dir = os.path.join(data_dir, _BUFFER_SUBDIR)
        else:
            buffer_dir = None

        buffer_path = file_io.write_to_temp_buffer(df, buffer_dir, _COLUMNS)
        map_fn = _deserialize_train

    else:
        df = pd.DataFrame(ncf_dataset.all_eval_data, columns=_EVAL_COLUMNS)
        buffer_path = os.path.join(data_dir, _BUFFER_SUBDIR,
                                   dataset + "_eval_buffer")

        file_io.write_to_buffer(dataframe=df,
                                buffer_path=buffer_path,
                                columns=_EVAL_COLUMNS,
                                expected_size=_EVAL_BUFFER_SIZE[dataset])
        map_fn = _deserialize_eval

    def input_fn():  # pylint: disable=missing-docstring
        dataset = tf.data.TFRecordDataset(buffer_path)
        if training:
            dataset = dataset.shuffle(buffer_size=_SHUFFLE_BUFFER_SIZE)

        dataset = dataset.batch(batch_size)
        dataset = dataset.map(map_fn, num_parallel_calls=16)
        dataset = dataset.repeat(repeat)

        # Prefetch to improve speed of input pipeline.
        dataset = dataset.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)
        return dataset

    return input_fn
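The closure returned by get_input_fn is meant to be handed to a TF 1.x Estimator. A short wiring sketch, in which the estimator, the ncf_dataset object, the directory, and the dataset name are placeholders rather than objects defined in these examples:

train_input_fn = get_input_fn(
    training=True, batch_size=256, ncf_dataset=ncf_dataset,
    data_dir="/tmp/ncf_data", dataset="ml-1m", repeat=1)
estimator.train(input_fn=train_input_fn)  # estimator: a tf.estimator.Estimator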