Example 1
def _transform_row(i, df, normalize_X, normalize_y, truncate_X, truncate_y,
                   log_X, log_y, X_means, X_stds, y_means, y_stds, trunc):
    """
  Transforms the data (X, y, w,...) in a single row.

  Writes X-transforme,d y-transformed to disk.
  """
    row = df.iloc[i]
    X = load_from_disk(row['X'])
    if normalize_X or log_X:
        if normalize_X:
            # Turns NaNs to zeros
            X = np.nan_to_num((X - X_means) / X_stds)
            if truncate_X:
                X[X > trunc] = trunc
                X[X < (-1.0 * trunc)] = -1.0 * trunc
        if log_X:
            X = np.log(X)
    save_to_disk(X, row['X-transformed'])

    y = load_from_disk(row['y'])
    if normalize_y or log_y:
        if normalize_y:
            y = np.nan_to_num((y - y_means) / y_stds)
            if truncate_y:
                y[y > trunc] = trunc
                y[y < (-1.0 * trunc)] = -1.0 * trunc
        if log_y:
            y = np.log(y)
    save_to_disk(y, row['y-transformed'])
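For reference, a minimal in-memory sketch of the normalize-and-truncate step above, using hypothetical per-feature statistics in place of the X_means/X_stds loaded from metadata (no disk I/O):

import numpy as np

# Hypothetical statistics; in the pipeline above these come from the metadata df.
X = np.array([[1.0, 200.0], [3.0, np.nan]])
X_means = np.array([2.0, 10.0])
X_stds = np.array([1.0, 5.0])
trunc = 3.0

# Normalize (NaNs become zeros), then truncate to [-trunc, trunc].
X = np.nan_to_num((X - X_means) / X_stds)
X[X > trunc] = trunc
X[X < -trunc] = -trunc
print(X)  # [[-1.  3.]
          #  [ 1.  0.]]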
Example 2
  def transform_row(self, i, df, data_dir):
    """Logarithmically transforms data in dataset."""
    """Select features and tasks of interest for transformation."""
    row = df.iloc[i]
    if self.transform_X:
      X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
      num_features = len(X[0])
      if self.features is None:
        X = np.log(X + 1)
      else:
        for j in range(num_features):  # xrange (Python 2) replaced with range
          if j in self.features:
            X[:, j] = np.log(X[:, j] + 1)
          # columns not listed in self.features are left unchanged
      save_to_disk(X, os.path.join(data_dir, row['X-transformed']))

    if self.transform_y:
      y = load_from_disk(os.path.join(data_dir, row['y-transformed']))
      num_tasks = len(y[0])
      if self.tasks is None:
        y = np.log(y + 1)
      else:
        for j in range(num_tasks):  # xrange (Python 2) replaced with range
          if j in self.tasks:
            y[:, j] = np.log(y[:, j] + 1)
          # tasks not listed in self.tasks are left unchanged
      save_to_disk(y, os.path.join(data_dir, row['y-transformed']))
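A standalone sketch of the selective log transform above, with a hypothetical feature subset (no disk I/O):

import numpy as np

X = np.arange(12, dtype=float).reshape(4, 3)
features = [0, 2]  # hypothetical columns of interest
for j in range(X.shape[1]):
    if j in features:
        X[:, j] = np.log(X[:, j] + 1)  # log(x + 1) avoids log(0)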
Example 3
    def transform_row(self, i, df, data_dir):
        """
    Normalizes the data (X, y, w, ...) in a single row).
    """
        row = df.iloc[i]

        if self.transform_X:
            X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
            X = np.nan_to_num((X - self.X_means) / self.X_stds)
            save_to_disk(X, os.path.join(data_dir, row['X-transformed']))

        if self.transform_y:

            y = load_from_disk(os.path.join(data_dir, row['y-transformed']))

            # transform tasks as normal
            y = np.nan_to_num((y - self.y_means) / self.y_stds)

            # add 2nd order correction term to gradients
            grad_var = 1 / self.y_stds[0] * (
                self.ydely_means - self.y_means[0] * self.y_means[1:])
            # use a fresh loop variable so the row index argument `i` is not shadowed
            for sample in range(y.shape[0]):
                y[sample, 1:] = y[sample, 1:] - grad_var * y[sample, 0] / self.y_stds[0]

            save_to_disk(y, os.path.join(data_dir, row['y-transformed']))
Example 4
  def create_dataset(shard_generator, data_dir=None, tasks=[], verbose=True):
    """Creates a new DiskDataset

    Parameters
    ----------
    shard_generator: Iterable
      An iterable (either a list or generator) that provides tuples of data
      (X, y, w, ids). Each tuple will be written to a separate shard on disk.
    data_dir: str
      Filename for data directory. Creates a temp directory if none specified.
    tasks: list
      List of tasks for this dataset.
    """
    if data_dir is None:
      data_dir = tempfile.mkdtemp()
    elif not os.path.exists(data_dir):
      os.makedirs(data_dir)

    metadata_rows = []
    time1 = time.time()
    for shard_num, (X, y, w, ids) in enumerate(shard_generator):
      basename = "shard-%d" % shard_num
      metadata_rows.append(
          DiskDataset.write_data_to_disk(data_dir, basename, tasks, X, y, w,
                                         ids))
    metadata_df = DiskDataset._construct_metadata(metadata_rows)
    metadata_filename = os.path.join(data_dir, "metadata.joblib")
    save_to_disk((tasks, metadata_df), metadata_filename)
    time2 = time.time()
    log("TIMING: dataset construction took %0.3f s" % (time2 - time1), verbose)
    return DiskDataset(data_dir, verbose=verbose)
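A usage sketch for create_dataset, assuming the DiskDataset class above is in scope; the shard generator and its shapes are hypothetical:

import numpy as np

def shard_generator():
  """Yield two hypothetical shards of (X, y, w, ids) tuples."""
  for start in (0, 4):
    X = np.random.rand(4, 8)
    y = np.random.rand(4, 1)
    w = np.ones((4, 1))
    ids = np.arange(start, start + 4)
    yield X, y, w, ids

dataset = DiskDataset.create_dataset(shard_generator(), tasks=["task0"])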
Example 5
    def transform_row(self, i, df, data_dir):
        """Logarithmically transforms data in dataset."""
        """Select features and tasks of interest for transformation."""
        row = df.iloc[i]
        if self.transform_X:
            X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
            num_features = len(X[0])
            if self.features is None:
                X = np.log(X + 1)
            else:
                for j in range(num_features):
                    if j in self.features:
                        X[:, j] = np.log(X[:, j] + 1)
                    # columns not listed in self.features are left unchanged
            save_to_disk(X, os.path.join(data_dir, row['X-transformed']))

        if self.transform_y:
            y = load_from_disk(os.path.join(data_dir, row['y-transformed']))
            num_tasks = len(y[0])
            if self.tasks is None:
                y = np.log(y + 1)
            else:
                for j in range(num_tasks):
                    if j in self.tasks:
                        y[:, j] = np.log(y[:, j] + 1)
                    # tasks not listed in self.tasks are left unchanged
            save_to_disk(y, os.path.join(data_dir, row['y-transformed']))
Example 6
  def featurize(self, input_file, feature_types, feature_dir, shard_size=128):
    """Featurize provided file and write to specified location."""
    input_type = _get_input_type(input_file)

    print("Loading raw samples now.")
    raw_df = load_pandas_from_disk(input_file)
    fields = raw_df.keys()
    print("Loaded raw data frame from file.")
    def process_raw_sample_helper(row, fields, input_type):
      return self._process_raw_sample(input_type, row, fields)
    process_raw_sample_helper_partial = partial(process_raw_sample_helper,
                                                fields=fields,
                                                input_type=input_type)

    processed_rows = raw_df.apply(process_raw_sample_helper_partial, axis=1)
    print("finished processing rows")
    raw_df = pd.DataFrame.from_records(processed_rows)

    nb_sample = raw_df.shape[0]
    interval_points = np.linspace(
        0, nb_sample, int(np.ceil(float(nb_sample) / shard_size)) + 1, dtype=int)
    shard_files = []
    for j in range(len(interval_points)-1):
      print("Sharding and standardizing into shard-%s / %s shards" % (str(j+1), len(interval_points)-1))
      raw_df_shard = raw_df.iloc[range(interval_points[j], interval_points[j+1])]
      df = self._standardize_df(raw_df_shard)
      for feature_type in feature_types:
        print("Currently feauturizing feature_type: %s" % feature_type)
        self._featurize_df(df, feature_type)

      shard_out = os.path.join(feature_dir, "features_shard%d.joblib" % j)
      save_to_disk(df, shard_out)
      shard_files.append(shard_out)
    return shard_files
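The shard boundaries come from np.linspace; a standalone sketch of the computation with hypothetical sizes:

import numpy as np

nb_sample, shard_size = 10, 4
interval_points = np.linspace(
    0, nb_sample, int(np.ceil(float(nb_sample) / shard_size)) + 1, dtype=int)
# interval_points == array([ 0,  3,  6, 10]): shards of sizes 3, 3 and 4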
Example 7
  def transform_row(self, i, df, data_dir):
    """
    Normalizes the data (X, y, w, ...) in a single row).
    """
    row = df.iloc[i]

    if self.transform_X:
      X = load_from_disk(
          os.path.join(data_dir, row['X-transformed']))
      X = np.nan_to_num((X - self.X_means) / self.X_stds)
      save_to_disk(X, os.path.join(data_dir, row['X-transformed']))

    if self.transform_y:

      y = load_from_disk(os.path.join(data_dir, row['y-transformed']))

      # transform tasks as normal
      y = np.nan_to_num((y - self.y_means) / self.y_stds)

      # add 2nd order correction term to gradients
      grad_var = 1 / self.y_stds[0] * (
          self.ydely_means - self.y_means[0] * self.y_means[1:])
      # use a fresh loop variable so the row index argument `i` is not shadowed
      for sample in range(y.shape[0]):
        y[sample, 1:] = y[sample, 1:] - grad_var * y[sample, 0] / self.y_stds[0]

      save_to_disk(y, os.path.join(data_dir, row['y-transformed']))
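The per-sample correction loop above can also be written in vectorized form; a standalone sketch with hypothetical shapes (y[:, 0] is the scalar task, y[:, 1:] its gradients):

import numpy as np

y = np.random.rand(5, 4)
y_stds = np.array([2.0, 1.0, 1.0, 1.0])
grad_var = np.random.rand(3)  # stands in for the correction term computed above

# Equivalent to: for each sample s, y[s, 1:] -= grad_var * y[s, 0] / y_stds[0]
y[:, 1:] -= np.outer(y[:, 0], grad_var) / y_stds[0]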
Example 8
  def create_dataset(shard_generator, data_dir=None, tasks=[], verbose=True):
    """Creates a new DiskDataset

    Parameters
    ----------
    shard_generator: Iterable
      An iterable (either a list or generator) that provides tuples of data
      (X, y, w, ids). Each tuple will be written to a separate shard on disk.
    data_dir: str
      Filename for data directory. Creates a temp directory if none specified.
    tasks: list
      List of tasks for this dataset.
    """
    if data_dir is None:
      data_dir = tempfile.mkdtemp()
    elif not os.path.exists(data_dir):
      os.makedirs(data_dir)

    metadata_rows = []
    time1 = time.time()
    for shard_num, (X, y, w, ids) in enumerate(shard_generator):
      basename = "shard-%d" % shard_num
      metadata_rows.append(
          DiskDataset.write_data_to_disk(data_dir, basename, tasks, X, y, w,
                                         ids))
    metadata_df = DiskDataset._construct_metadata(metadata_rows)
    metadata_filename = os.path.join(data_dir, "metadata.joblib")
    save_to_disk((tasks, metadata_df), metadata_filename)
    time2 = time.time()
    log("TIMING: dataset construction took %0.3f s" % (time2 - time1), verbose)
    return DiskDataset(data_dir, verbose=verbose)
Example 9
    def __init__(self,
                 data_dir=None,
                 tasks=[],
                 samples=None,
                 featurizers=None,
                 use_user_specified_features=False):
        """
    Turns featurized dataframes into numpy files, writes them & metadata to disk.
    """
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
        self.data_dir = data_dir

        if featurizers is not None:
            feature_types = [
                featurizer.__class__.__name__ for featurizer in featurizers
            ]
        else:
            feature_types = None

        if use_user_specified_features:
            feature_types = ["user-specified-features"]

        if samples is not None and feature_types is not None:
            if not isinstance(feature_types, list):
                raise ValueError("feature_types must be a list or None.")

            write_dataset_single_partial = partial(write_dataset_single,
                                                   data_dir=self.data_dir,
                                                   feature_types=feature_types,
                                                   tasks=tasks)

            metadata_rows = []
            # TODO(rbharath): Still a bit of information leakage.
            for df_file, df in zip(samples.dataset_files,
                                   samples.iterdataframes()):
                retval = write_dataset_single_partial((df_file, df))
                if retval is not None:
                    metadata_rows.append(retval)

            # TODO(rbharath): FeaturizedSamples should not be responsible for
            # X-transform, X_sums, etc. Move that stuff over to Dataset.
            self.metadata_df = pd.DataFrame(
                metadata_rows,
                columns=('df_file', 'task_names', 'ids', 'X', 'X-transformed',
                         'y', 'y-transformed', 'w', 'X_sums', 'X_sum_squares',
                         'X_n', 'y_sums', 'y_sum_squares', 'y_n'))
            save_to_disk(self.metadata_df, self._get_metadata_filename())
            # input/output transforms not specified yet, so
            # self.transforms = (input_transforms, output_transforms) =>
            self.transforms = ([], [])
            save_to_disk(self.transforms, self._get_transforms_filename())
        else:
            if os.path.exists(self._get_metadata_filename()):
                self.metadata_df = load_from_disk(
                    self._get_metadata_filename())
                self.transforms = load_from_disk(
                    self._get_transforms_filename())
            else:
                raise ValueError("No metadata found.")
Example 10
def _transform_row(i, df, normalize_X, normalize_y, truncate_X, truncate_y,
                   log_X, log_y, X_means, X_stds, y_means, y_stds, trunc):
  """
  Transforms the data (X, y, w, ...) in a single row.

  Writes X-transformed, y-transformed to disk.
  """
  row = df.iloc[i]
  X = load_from_disk(row['X'])
  if normalize_X or log_X:
    if normalize_X:
      # Turns NaNs to zeros
      X = np.nan_to_num((X - X_means) / X_stds)
      if truncate_X:
        X[X > trunc] = trunc
        X[X < (-1.0*trunc)] = -1.0 * trunc
    if log_X:
      X = np.log(X)
  save_to_disk(X, row['X-transformed'])

  y = load_from_disk(row['y'])
  if normalize_y or log_y:
    if normalize_y:
      y = np.nan_to_num((y - y_means) / y_stds)
      if truncate_y:
        y[y > trunc] = trunc
        y[y < (-1.0*trunc)] = -1.0 * trunc
    if log_y:
      y = np.log(y)
  save_to_disk(y, row['y-transformed'])
Example 11
 def save(self):
     """Dispatcher function for saving."""
     params = {
         "model_params": self.model_params,
         "task_types": self.task_types,
         "model_class": self.__class__
     }
     save_to_disk(params, Model.get_params_filename(self.model_dir))
Example 12
  def featurize(self, input_file, feature_dir, samples_dir,
                shard_size=1024, worker_pool=None,
                reload=False):
    """Featurize provided file and write to specified location."""
    # If we are not to reload data, or data has not already been featurized.
    if not reload or not os.path.exists(feature_dir):
      if not os.path.exists(feature_dir):
        os.makedirs(feature_dir)
      input_type = _get_input_type(input_file)

      log("Loading raw samples now.", self.verbosity)
      raw_df = load_pandas_from_disk(input_file)
      fields = raw_df.keys()
      log("Loaded raw data frame from file.", self.verbosity)
      log("About to preprocess samples.", self.verbosity)

      def process_raw_sample_helper(row, fields, input_type):
        return self._process_raw_sample(input_type, row, fields)
      process_raw_sample_helper_partial = partial(process_raw_sample_helper,
                                                  fields=fields,
                                                  input_type=input_type)


      raw_df = raw_df.apply(process_raw_sample_helper_partial, axis=1, reduce=False)
      nb_sample = raw_df.shape[0]
      interval_points = np.linspace(
          0, nb_sample, int(np.ceil(float(nb_sample) / shard_size)) + 1, dtype=int)
      shard_files = []
      for j in range(len(interval_points)-1):
        log("Sharding and standardizing into shard-%s / %s shards"
            % (str(j+1), len(interval_points)-1), self.verbosity)
        raw_df_shard = raw_df.iloc[range(interval_points[j], interval_points[j+1])]
        
        df = self._standardize_df(raw_df_shard) 

        for compound_featurizer in self.compound_featurizers:
          log("Currently featurizing feature_type: %s"
              % compound_featurizer.__class__.__name__, self.verbosity)
          self._featurize_compounds(df, compound_featurizer, worker_pool=worker_pool)

        for complex_featurizer in self.complex_featurizers:
          log("Currently featurizing feature_type: %s"
              % complex_featurizer.__class__.__name__, self.verbosity)
          self._featurize_complexes(df, complex_featurizer, worker_pool=worker_pool)

        shard_out = os.path.join(feature_dir, "features_shard%d.joblib" % j)
        save_to_disk(df, shard_out)
        shard_files.append(shard_out)
    else:
      # Reload should automatically find required files
      shard_files = None

    featurizers = self.compound_featurizers + self.complex_featurizers
    samples = FeaturizedSamples(samples_dir=samples_dir, featurizers=featurizers, 
                                dataset_files=shard_files, reload=reload,
                                verbosity=self.verbosity)

    return samples
Example 13
  def transform_row(self, i, df):
    """Logarithmically transforms data in dataset."""
    row = df.iloc[i]
    if self.transform_X:
      X = load_from_disk(row['X-transformed'])
      X = np.log(X)
      save_to_disk(X, row['X-transformed'])

    if self.transform_y:
      y = load_from_disk(row['y-transformed'])
      y = np.log(y)
      save_to_disk(y, row['y-transformed'])
Example 14
  def featurize(self, input_file, feature_dir, samples_dir, shard_size=128):
    """Featurize provided file and write to specified location."""
    input_type = _get_input_type(input_file)

    log("Loading raw samples now.", self.verbose)
    raw_df = load_pandas_from_disk(input_file)
    fields = raw_df.keys()
    log("Loaded raw data frame from file.", self.verbose)

    def process_raw_sample_helper(row, fields, input_type):
      return self._process_raw_sample(input_type, row, fields)
    process_raw_sample_helper_partial = partial(process_raw_sample_helper,
                                                fields=fields,
                                                input_type=input_type)


    #processed_rows = raw_df.apply(process_raw_sample_helper_partial, axis=1)
    raw_df = raw_df.apply(process_raw_sample_helper_partial, axis=1, reduce=False)
    #raw_df = pd.DataFrame.from_records(processed_rows)

    nb_sample = raw_df.shape[0]
    interval_points = np.linspace(
        0, nb_sample, int(np.ceil(float(nb_sample) / shard_size)) + 1, dtype=int)
    shard_files = []
    for j in range(len(interval_points)-1):
      log("Sharding and standardizing into shard-%s / %s shards" % (str(j+1), len(interval_points)-1), self.verbose)
      raw_df_shard = raw_df.iloc[range(interval_points[j], interval_points[j+1])]
      
      df = self._standardize_df(raw_df_shard) 
      log("Aggregating User-Specified Features", self.verbose)
      self._add_user_specified_features(df)

      for compound_featurizer in self.compound_featurizers:
        log("Currently feauturizing feature_type: %s"
            % compound_featurizer.__class__.__name__, self.verbose)
        self._featurize_compounds(df, compound_featurizer)

      for complex_featurizer in self.complex_featurizers:
        log("Currently feauturizing feature_type: %s"
            % complex_featurizer.__class__.__name__, self.verbose)
        self._featurize_complexes(df, complex_featurizer)

      shard_out = os.path.join(feature_dir, "features_shard%d.joblib" % j)
      save_to_disk(df, shard_out)
      shard_files.append(shard_out)

    featurizers = self.compound_featurizers + self.complex_featurizers
    samples = FeaturizedSamples(samples_dir=samples_dir, featurizers=featurizers, 
                                dataset_files=shard_files,
                                reload_data=False)

    return samples
Example 15
 def transform_row(self, i, df, data_dir):
     """Reweight the labels for this data."""
     row = df.iloc[i]
     y = load_from_disk(os.path.join(data_dir, row['y-transformed']))
     w = load_from_disk(os.path.join(data_dir, row['w-transformed']))
     w_balanced = np.zeros_like(w)
     for ind, task in enumerate(self.dataset.get_task_names()):
         task_y = y[:, ind]
         task_w = w[:, ind]
         zero_indices = np.logical_and(task_y == 0, task_w != 0)
         one_indices = np.logical_and(task_y == 1, task_w != 0)
         w_balanced[zero_indices, ind] = self.weights[ind][0]
         w_balanced[one_indices, ind] = self.weights[ind][1]
     save_to_disk(w_balanced, os.path.join(data_dir, row['w-transformed']))
Example 16
 def transform_row(self, i, df, data_dir):
   """Reweight the labels for this data."""
   row = df.iloc[i]
   y = load_from_disk(os.path.join(data_dir, row['y-transformed']))
   w = load_from_disk(os.path.join(data_dir, row['w-transformed']))
   w_balanced = np.zeros_like(w)
   for ind, task in enumerate(self.dataset.get_task_names()):
     task_y = y[:, ind]
     task_w = w[:, ind]
     zero_indices = np.logical_and(task_y==0, task_w != 0)
     one_indices = np.logical_and(task_y==1, task_w != 0)
     w_balanced[zero_indices, ind] = self.weights[ind][0]
     w_balanced[one_indices, ind] = self.weights[ind][1]
   save_to_disk(w_balanced, os.path.join(data_dir, row['w-transformed']))
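The transformer above only applies precomputed self.weights; a standalone sketch of one way such (weight_for_0, weight_for_1) pairs could be derived so that both classes contribute equal total weight (the formula is an assumption, not taken from the source):

import numpy as np

y = np.array([0, 0, 0, 1])   # labels for one task
w = np.ones_like(y)          # 1 = example present, 0 = missing
n0 = np.sum((y == 0) & (w != 0))
n1 = np.sum((y == 1) & (w != 0))
n = n0 + n1
weights = (n / (2.0 * n0), n / (2.0 * n1))  # (0.67, 2.0) for this y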
Example 17
def write_dataset_single(val, data_dir, feature_types):
  """Writes files for single row (X, y, w, X-transformed, ...) to disk."""
  (df_file, df) = val
  # TODO(rbharath): This is a hack. clean up.
  if not len(df):
    return None
  task_names = FeaturizedSamples.get_sorted_task_names(df)
  ids, X, y, w = _df_to_numpy(df, feature_types)
  X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
  y_sums, y_sum_squares, y_n = compute_sums_and_nb_sample(y, w)

  basename = os.path.splitext(os.path.basename(df_file))[0]
  out_X = os.path.join(data_dir, "%s-X.joblib" % basename)
  out_X_transformed = os.path.join(data_dir, "%s-X-transformed.joblib" % basename)
  out_y = os.path.join(data_dir, "%s-y.joblib" % basename)
  out_y_transformed = os.path.join(data_dir, "%s-y-transformed.joblib" % basename)
  out_w = os.path.join(data_dir, "%s-w.joblib" % basename)
  out_ids = os.path.join(data_dir, "%s-ids.joblib" % basename)

  save_to_disk(X, out_X)
  save_to_disk(y, out_y)
  save_to_disk(w, out_w)
  save_to_disk(ids, out_ids)
  # TODO(rbharath): Should X be saved to out_X_transformed as well? Since
  # itershards expects to loop over X-transformed? (Ditto for y/w)
  return([df_file, task_names, out_ids, out_X, out_X_transformed, out_y,
          out_y_transformed, out_w,
          X_sums, X_sum_squares, X_n,
          y_sums, y_sum_squares, y_n])
Example 18
  def write_data_to_disk(data_dir,
                         basename,
                         tasks,
                         X=None,
                         y=None,
                         w=None,
                         ids=None):
    if X is not None:
      out_X = "%s-X.joblib" % basename
      save_to_disk(X, os.path.join(data_dir, out_X))
    else:
      out_X = None

    if y is not None:
      out_y = "%s-y.joblib" % basename
      save_to_disk(y, os.path.join(data_dir, out_y))
    else:
      out_y = None

    if w is not None:
      out_w = "%s-w.joblib" % basename
      save_to_disk(w, os.path.join(data_dir, out_w))
    else:
      out_w = None

    if ids is not None:
      out_ids = "%s-ids.joblib" % basename
      save_to_disk(ids, os.path.join(data_dir, out_ids))
    else:
      out_ids = None

    # note that this corresponds to the _construct_metadata column order
    return [basename, tasks, out_ids, out_X, out_y, out_w]
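A usage sketch for write_data_to_disk, assuming the DiskDataset class above and its save_to_disk helper are in scope; the arrays are hypothetical:

import tempfile
import numpy as np

data_dir = tempfile.mkdtemp()
X = np.random.rand(10, 4)
y = np.random.rand(10, 1)
w = np.ones((10, 1))
ids = np.arange(10)
row = DiskDataset.write_data_to_disk(data_dir, "shard-0", ["task0"], X, y, w, ids)
# row == ["shard-0", ["task0"], "shard-0-ids.joblib", "shard-0-X.joblib",
#         "shard-0-y.joblib", "shard-0-w.joblib"]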
Example 19
def write_dataset_single(val, data_dir, feature_types, tasks):
    """Writes files for single row (X, y, w, X-transformed, ...) to disk."""
    (df_file, df) = val
    # TODO(rbharath): This is a hack. clean up.
    if not len(df):
        return None
    task_names = sorted(tasks)
    ids, X, y, w = _df_to_numpy(df, feature_types, tasks)
    X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
    y_sums, y_sum_squares, y_n = compute_sums_and_nb_sample(y, w)

    basename = os.path.splitext(os.path.basename(df_file))[0]
    out_X = os.path.join(data_dir, "%s-X.joblib" % basename)
    out_X_transformed = os.path.join(data_dir,
                                     "%s-X-transformed.joblib" % basename)
    out_y = os.path.join(data_dir, "%s-y.joblib" % basename)
    out_y_transformed = os.path.join(data_dir,
                                     "%s-y-transformed.joblib" % basename)
    out_w = os.path.join(data_dir, "%s-w.joblib" % basename)
    out_ids = os.path.join(data_dir, "%s-ids.joblib" % basename)

    save_to_disk(X, out_X)
    save_to_disk(y, out_y)
    save_to_disk(w, out_w)
    save_to_disk(ids, out_ids)
    # TODO(rbharath): Should X be saved to out_X_transformed as well? Since
    # itershards expects to loop over X-transformed? (Ditto for y/w)
    return ([
        df_file, task_names, out_ids, out_X, out_X_transformed, out_y,
        out_y_transformed, out_w, X_sums, X_sum_squares, X_n, y_sums,
        y_sum_squares, y_n
    ])
Example 20
  def write_data_to_disk(data_dir,
                         basename,
                         tasks,
                         X=None,
                         y=None,
                         w=None,
                         ids=None):
    if X is not None:
      out_X = "%s-X.joblib" % basename
      save_to_disk(X, os.path.join(data_dir, out_X))
    else:
      out_X = None

    if y is not None:
      out_y = "%s-y.joblib" % basename
      save_to_disk(y, os.path.join(data_dir, out_y))
    else:
      out_y = None

    if w is not None:
      out_w = "%s-w.joblib" % basename
      save_to_disk(w, os.path.join(data_dir, out_w))
    else:
      out_w = None

    if ids is not None:
      out_ids = "%s-ids.joblib" % basename
      save_to_disk(ids, os.path.join(data_dir, out_ids))
    else:
      out_ids = None

    # note that this corresponds to the _construct_metadata column order
    return [out_ids, out_X, out_y, out_w]
Example 21
  def transform_row(self, i, df):
    """
    Randomly permute a Coulomb Matrix in a dataset
    """
    row = df.iloc[i]
    if self.transform_X:
      X = load_from_disk(row['X-transformed'])
      for j in range(len(X)):  # xrange (Python 2) replaced with range
        cm = self.construct_cm_from_triu(X[j])
        X[j] = self.unpad_randomize_and_flatten(cm)
      save_to_disk(X, row['X-transformed'])

    if self.transform_y:
      print("y will not be transformed by CoulombRandomizationTransformer.")
Example 22
    def transform_row(self, i, df, data_dir):
        """
    Randomly permute a Coulomb Matrix in a dataset
    """
        row = df.iloc[i]
        if self.transform_X:
            X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
            for j in range(len(X)):
                cm = self.construct_cm_from_triu(X[j])
                X[j] = self.unpad_randomize_and_flatten(cm)
            save_to_disk(X, os.path.join(data_dir, row['X-transformed']))

        if self.transform_y:
            print("y will not be transformed by "
                  "CoulombRandomizationTransformer.")
Example 23
 def transform_row(self, i, df, data_dir):
     """
 Clips outliers for the data (X, y, w, ...) in a single row).
 """
     row = df.iloc[i]
     if self.transform_X:
         X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
         X[X > self.max_val] = self.max_val
         X[X < (-1.0 * self.max_val)] = -1.0 * self.max_val
         save_to_disk(X, os.path.join(data_dir, row['X-transformed']))
     if self.transform_y:
         y = load_from_disk(os.path.join(data_dir, row['y-transformed']))
         # note: `trunc` was undefined in the original; clip with self.max_val
         y[y > self.max_val] = self.max_val
         y[y < (-1.0 * self.max_val)] = -1.0 * self.max_val
         save_to_disk(y, os.path.join(data_dir, row['y-transformed']))
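The two masked assignments used for clipping are equivalent to a single np.clip call; a standalone sketch:

import numpy as np

max_val = 5.0
X = np.array([-12.0, -3.0, 0.0, 7.0])
X = np.clip(X, -max_val, max_val)  # array([-5., -3.,  0.,  5.])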
Example 24
  def transform_row(self, i, df):
    """
    Normalizes the data (X, y, w, ...) in a single row).
    """
    row = df.iloc[i]

    if self.transform_X:
      X = load_from_disk(row['X-transformed'])
      X = np.nan_to_num((X - self.X_means) / self.X_stds)
      save_to_disk(X, row['X-transformed'])

    if self.transform_y:
      y = load_from_disk(row['y-transformed'])
      y = np.nan_to_num((y - self.y_means) / self.y_stds)
      save_to_disk(y, row['y-transformed'])
Example 25
    def transform_row(self, i, df, data_dir):
        """
    Normalizes the data (X, y, w, ...) in a single row).
    """
        row = df.iloc[i]

        if self.transform_X:
            X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
            X = np.nan_to_num((X - self.X_means) / self.X_stds)
            save_to_disk(X, os.path.join(data_dir, row['X-transformed']))

        if self.transform_y:
            y = load_from_disk(os.path.join(data_dir, row['y-transformed']))
            y = np.nan_to_num((y - self.y_means) / self.y_stds)
            save_to_disk(y, os.path.join(data_dir, row['y-transformed']))
Example 26
 def transform_row(self, i, df):
   """
   Clips outliers for the data (X, y, w, ...) in a single row).
   """
   row = df.iloc[i]
   if self.transform_X:
     X = load_from_disk(row['X-transformed'])
     X[X > self.max_val] = self.max_val
     X[X < (-1.0*self.max_val)] = -1.0 * self.max_val
     save_to_disk(X, row['X-transformed'])
   if self.transform_y:
     y = load_from_disk(row['y-transformed'])
     # note: `trunc` was undefined in the original; clip with self.max_val
     y[y > self.max_val] = self.max_val
     y[y < (-1.0 * self.max_val)] = -1.0 * self.max_val
     save_to_disk(y, row['y-transformed'])
Example 27
  def __init__(self, data_dir, samples=None, feature_types=None):
    """
    Turns featurized dataframes into numpy files, writes them & metadata to disk.
    """
    if not os.path.exists(data_dir):
      os.makedirs(data_dir)
    self.data_dir = data_dir

    if samples is not None and feature_types is not None:
      if not isinstance(feature_types, list):
        raise ValueError("feature_types must be a list or None.")

      write_dataset_single_partial = partial(
          write_dataset_single, data_dir=self.data_dir,
          feature_types=feature_types)

      metadata_rows = []
      # TODO(rbharath): Still a bit of information leakage.
      for df_file, df in zip(samples.dataset_files, samples.itersamples()):
        retval = write_dataset_single_partial((df_file, df))
        if retval is not None:
          metadata_rows.append(retval)

      # TODO(rbharath): FeaturizedSamples should not be responsible for
      # X-transform, X_sums, etc. Move that stuff over to Dataset.
      self.metadata_df = pd.DataFrame(
          metadata_rows,
          columns=('df_file', 'task_names', 'ids',
                   'X', 'X-transformed', 'y', 'y-transformed',
                   'w',
                   'X_sums', 'X_sum_squares', 'X_n',
                   'y_sums', 'y_sum_squares', 'y_n'))
      save_to_disk(
          self.metadata_df, self._get_metadata_filename())
      # input/output transforms not specified yet, so
      # self.transforms = (input_transforms, output_transforms) =>
      self.transforms = ([], [])
      save_to_disk(
          self.transforms, self._get_transforms_filename())
    else:
      if os.path.exists(self._get_metadata_filename()):
        self.metadata_df = load_from_disk(self._get_metadata_filename())
        self.transforms = load_from_disk(self._get_transforms_filename())
      else:
        raise ValueError("No metadata found.")
Example 28
  def transform(self, dataset, parallel=False):

    super(CoulombBinarizationTransformer, self).transform(dataset,
          parallel=parallel)

    df = dataset.metadata_df
    Xt = []

    for _, row in df.iterrows():
      X_t = load_from_disk(row['X-transformed'])
      Xt.append(np.array(X_t))

    X = np.vstack(Xt)
    X_means = X.mean(axis=0)
    # note: .std() with no axis is a single scalar std over all entries
    X_stds = (X - X_means).std()

    for i, row in df.iterrows():
      X_t = (Xt[i]-X_means)/X_stds
      save_to_disk(X_t, row['X-transformed'])
Example 29
  def transform(self, dataset, parallel=False):

    super(CoulombBinarizationTransformer, self).transform(dataset,
          parallel=parallel)

    df = dataset.metadata_df
    Xt = []

    for _, row in df.iterrows():
      X_t = load_from_disk(os.path.join(dataset.data_dir, row['X-transformed']))
      Xt.append(np.array(X_t))

    X = np.vstack(Xt)
    X_means = X.mean(axis=0)
    # note: .std() with no axis is a single scalar std over all entries
    X_stds = (X - X_means).std()

    for i, row in df.iterrows():
      X_t = (Xt[i]-X_means)/X_stds
      save_to_disk(X_t, os.path.join(dataset.data_dir, row['X-transformed']))
Example 30
  def transform_row(self, i, df):
    """
    Binarizes data in dataset with sigmoid function
    """

    row = df.iloc[i]
    X_bin = []
    if self.update_state: 
      self.set_max(df)
      self.update_state = False
    if self.transform_X:
      X = load_from_disk(row['X-transformed'])
      # use a fresh loop variable so the row index argument `i` is not shadowed
      for col in range(X.shape[1]):
        for k in np.arange(0, self.feature_max[col] + self.theta, self.theta):
          X_bin += [np.tanh((X[:, col] - k) / self.theta)]

      X_bin = np.array(X_bin).T
      save_to_disk(X_bin, row['X-transformed'])

    if self.transform_y:
      print("y will not be transformed by CoulombBinarizationTransformer.")
Example 31
  def transform_row(self, i, df, data_dir):
    """
    Binarizes data in dataset with sigmoid function
    """
    row = df.iloc[i]
    X_bin = []
    if self.update_state: 
      self.set_max(df, data_dir)
      self.update_state = False
    if self.transform_X:
      X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
      # use a fresh loop variable so the row index argument `i` is not shadowed
      for col in range(X.shape[1]):
        for k in np.arange(0, self.feature_max[col] + self.theta, self.theta):
          X_bin += [np.tanh((X[:, col] - k) / self.theta)]

      X_bin = np.array(X_bin).T
      save_to_disk(X_bin, os.path.join(data_dir, row['X-transformed']))

    if self.transform_y:
      print("y will not be transformed by "
            "CoulombBinarizationTransformer.")
Example 32
    def transform(self, input_transforms, output_transforms, parallel=False):
        """
    Transforms all internally stored data.

    Adds X-transform, y-transform columns to metadata.
    """
        (normalize_X, truncate_x, normalize_y, truncate_y, log_X,
         log_y) = (False, False, False, False, False, False)

        if "truncate" in input_transforms:
            truncate_x = True
        if "normalize" in input_transforms:
            normalize_X = True
        if "log" in input_transforms:
            log_X = True

        if "normalize" in output_transforms:
            normalize_y = True
        if "log" in output_transforms:
            log_y = True

        # Store input_transforms/output_transforms so the dataset remembers its state.

        X_means, X_stds, y_means, y_stds = self._transform(normalize_X,
                                                           normalize_y,
                                                           truncate_x,
                                                           truncate_y,
                                                           log_X,
                                                           log_y,
                                                           parallel=parallel)
        nrow = self.metadata_df.shape[0]
        # TODO(rbharath): These lines are puzzling. Better way to avoid storage
        # duplication here?
        self.metadata_df['X_means'] = [X_means for _ in range(nrow)]
        self.metadata_df['X_stds'] = [X_stds for _ in range(nrow)]
        self.metadata_df['y_means'] = [y_means for _ in range(nrow)]
        self.metadata_df['y_stds'] = [y_stds for _ in range(nrow)]
        save_to_disk(self.metadata_df, self._get_metadata_filename())
        self.transforms = (input_transforms, output_transforms)
        save_to_disk(self.transforms, self._get_transforms_filename())
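A standalone sketch of the flag parsing performed at the top of transform, showing how transform names map onto the boolean flags:

input_transforms = ["normalize", "truncate"]
output_transforms = ["log"]
normalize_X = "normalize" in input_transforms   # True
truncate_x = "truncate" in input_transforms     # True
log_X = "log" in input_transforms               # False
normalize_y = "normalize" in output_transforms  # False
log_y = "log" in output_transforms              # True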
Example 33
  def __init__(self, feature_dir, dataset_files=None, overwrite=True,
               reload_data=False):
    """
    Initialize FeaturizedSamples.

    If feature_dir does not exist, dataset_files must be specified; feature_dir
    is then created and populated. If feature_dir exists (created by a previous
    call to FeaturizedSamples), then dataset_files cannot be specified. If
    overwrite is set and dataset_files is provided, the old dataset_files are
    overwritten with the new ones.
    """
    self.dataset_files = dataset_files

    if not os.path.exists(feature_dir):
      os.makedirs(feature_dir)
    self.feature_dir = feature_dir
    if os.path.exists(self._get_compounds_filename()) and reload_data:
      compounds_df = load_from_disk(self._get_compounds_filename())
    else:
      compounds_df = self._get_compounds()
      # compounds_df is not altered by any method after initialization, so it's
      # safe to keep a copy in memory and on disk.
      save_to_disk(compounds_df, self._get_compounds_filename())
    _check_validity(compounds_df)
    self.compounds_df = compounds_df

    if os.path.exists(self._get_dataset_paths_filename()):
      if dataset_files is not None:
        if overwrite:
          save_to_disk(dataset_files, self._get_dataset_paths_filename())
        else:
          raise ValueError("Can't change dataset_files already stored on disk")
      self.dataset_files = load_from_disk(self._get_dataset_paths_filename())
    else:
      save_to_disk(dataset_files, self._get_dataset_paths_filename())
Example 34
  def transform(self, input_transforms, output_transforms, parallel=False):
    """
    Transforms all internally stored data.

    Adds X-transform, y-transform columns to metadata.
    """
    (normalize_X, truncate_x, normalize_y, truncate_y, log_X, log_y) = (
        False, False, False, False, False, False)

    if "truncate" in input_transforms:
      truncate_x = True
    if "normalize" in input_transforms:
      normalize_X = True
    if "log" in input_transforms:
      log_X = True

    if "normalize" in output_transforms:
      normalize_y = True
    if "log" in output_transforms:
      log_y = True

    # Store input_transforms/output_transforms so the dataset remembers its state.

    X_means, X_stds, y_means, y_stds = self._transform(normalize_X, normalize_y,
                                                       truncate_x, truncate_y,
                                                       log_X, log_y,
                                                       parallel=parallel)
    nrow = self.metadata_df.shape[0]
    # TODO(rbharath): These lines are puzzling. Better way to avoid storage
    # duplication here?
    self.metadata_df['X_means'] = [X_means for _ in range(nrow)]
    self.metadata_df['X_stds'] = [X_stds for _ in range(nrow)]
    self.metadata_df['y_means'] = [y_means for _ in range(nrow)]
    self.metadata_df['y_stds'] = [y_stds for _ in range(nrow)]
    save_to_disk(
        self.metadata_df, self._get_metadata_filename())
    self.transforms = (input_transforms, output_transforms)
    save_to_disk(
        self.transforms, self._get_transforms_filename())
Example 35
  def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None,
                         ids=None):
    out_X = "%s-X.joblib" % basename
    out_y = "%s-y.joblib" % basename
    out_w = "%s-w.joblib" % basename
    out_ids = "%s-ids.joblib" % basename

    if X is not None:
      save_to_disk(X, os.path.join(data_dir, out_X))
    if y is not None:
      save_to_disk(y, os.path.join(data_dir, out_y))
    if w is not None:
      save_to_disk(w, os.path.join(data_dir, out_w))
    if ids is not None:
      save_to_disk(ids, os.path.join(data_dir, out_ids))
    return [basename, tasks, out_ids, out_X, out_y, out_w]
Example 36
  def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None,
                         ids=None):
    out_X = "%s-X.joblib" % basename
    out_y = "%s-y.joblib" % basename
    out_w = "%s-w.joblib" % basename
    out_ids = "%s-ids.joblib" % basename

    if X is not None:
      save_to_disk(X, os.path.join(data_dir, out_X))
    if y is not None:
      save_to_disk(y, os.path.join(data_dir, out_y))
    if w is not None:
      save_to_disk(w, os.path.join(data_dir, out_w))
    if ids is not None:
      save_to_disk(ids, os.path.join(data_dir, out_ids))
    return [basename, tasks, out_ids, out_X, out_y, out_w]
Example 37
def update_mean_and_std(df):
  """
  Compute means/stds of X/y from sums/sum_squares of tensors.
  """
  X_transform = []
  for _, row in df.iterrows():
    Xt = load_from_disk(row['X-transformed'])
    Xs = np.sum(Xt,axis=0)
    Xss = np.sum(np.square(Xt),axis=0)
    save_to_disk(Xs, row['X_sums'])
    save_to_disk(Xss, row['X_sum_squares'])

  for _, row in df.iterrows():
    yt = load_from_disk(row['y-transformed'])
    ys = np.sum(yt,axis=0)
    yss = np.sum(np.square(yt),axis=0)
    save_to_disk(ys, row['y_sums'])
    save_to_disk(yss, row['y_sum_squares'])
Example 38
  def _update_mean_and_std(self, df, X_stats, y_stats):
    """
    Compute means/stds of X/y from sums/sum_squares of tensors.
    """
    if X_stats:
      X_transform = []
      for _, row in df.iterrows():
        Xt = load_from_disk(os.path.join(self.data_dir, row['X-transformed']))
        Xs = np.sum(Xt,axis=0)
        Xss = np.sum(np.square(Xt),axis=0)
        save_to_disk(Xs, os.path.join(self.data_dir, row['X_sums']))
        save_to_disk(Xss, os.path.join(self.data_dir, row['X_sum_squares']))

    if y_stats:
      for _, row in df.iterrows():
        yt = load_from_disk(os.path.join(self.data_dir, row['y-transformed']))
        ys = np.sum(yt,axis=0)
        yss = np.sum(np.square(yt),axis=0)
        save_to_disk(ys, os.path.join(self.data_dir, row['y_sums']))
        save_to_disk(yss, os.path.join(self.data_dir, row['y_sum_squares']))
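Means and stds follow from these sufficient statistics; a standalone sketch of the derivation (variance = E[X^2] - E[X]^2):

import numpy as np

X = np.random.rand(100, 4)
X_sums = np.sum(X, axis=0)
X_sum_squares = np.sum(np.square(X), axis=0)
n = X.shape[0]
X_means = X_sums / n
X_stds = np.sqrt(X_sum_squares / n - np.square(X_means))
assert np.allclose(X_means, X.mean(axis=0))
assert np.allclose(X_stds, X.std(axis=0))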
Example 39
    def __init__(self,
                 samples_dir,
                 featurizers,
                 dataset_files=None,
                 overwrite=True,
                 reload_data=False):
        """
    Initialiize FeaturizedSamples

    If samples_dir does not exist, must specify dataset_files. Then samples_dir
    is created and populated. If samples_dir exists (created by previous call to
    FeaturizedSamples), then dataset_files cannot be specified. If overwrite is
    set and dataset_files is provided, will overwrite old dataset_files with
    new.
    """
        self.dataset_files = dataset_files
        self.feature_types = (
            ["user-specified-features"] +
            [featurizer.__class__.__name__ for featurizer in featurizers])

        self.featurizers = featurizers

        if not os.path.exists(samples_dir):
            os.makedirs(samples_dir)
        self.samples_dir = samples_dir
        if os.path.exists(self._get_compounds_filename()) and reload_data:
            compounds_df = load_from_disk(self._get_compounds_filename())
        else:
            compounds_df = self._get_compounds()
            # compounds_df is not altered by any method after initialization, so it's
            # safe to keep a copy in memory and on disk.
            save_to_disk(compounds_df, self._get_compounds_filename())
        _check_validity(compounds_df)
        self.compounds_df = compounds_df
        self.num_samples = len(compounds_df)

        if os.path.exists(self._get_dataset_paths_filename()):
            if dataset_files is not None:
                if overwrite:
                    save_to_disk(dataset_files,
                                 self._get_dataset_paths_filename())
                else:
                    raise ValueError(
                        "Can't change dataset_files already stored on disk")
            self.dataset_files = load_from_disk(
                self._get_dataset_paths_filename())
        else:
            save_to_disk(dataset_files, self._get_dataset_paths_filename())
Example 40
  def __init__(self, samples_dir, featurizers, dataset_files=None, 
               reload=False, verbosity=None):
    """
    Initialize FeaturizedSamples.

    If samples_dir does not exist, dataset_files must be specified; samples_dir
    is then created and populated. If samples_dir exists (created by a previous
    call to FeaturizedSamples), then dataset_files cannot be specified. If
    reload is False and dataset_files is provided, the old dataset_files are
    overwritten with the new ones.
    """
    assert verbosity in [None, "low", "high"]
    self.verbosity = verbosity
    self.dataset_files = dataset_files
    self.feature_types = (
        ["user-specified-features"] + 
        [featurizer.__class__.__name__ for featurizer in featurizers])

    self.featurizers = featurizers

    if not os.path.exists(samples_dir):
      os.makedirs(samples_dir)
    self.samples_dir = samples_dir

    if os.path.exists(self._get_dataset_paths_filename()):
      if dataset_files is not None:
        if not reload:
          save_to_disk(dataset_files, self._get_dataset_paths_filename())
        else:
          raise ValueError("Can't change dataset_files already stored on disk")
    else:
      save_to_disk(dataset_files, self._get_dataset_paths_filename())
    self.dataset_files = load_from_disk(self._get_dataset_paths_filename())

    if os.path.exists(self._get_compounds_filename()) and reload:
      compounds_df = load_from_disk(self._get_compounds_filename())
    else:
      compounds_df = self._get_compounds()
      # compounds_df is not altered by any method after initialization, so it's
      # safe to keep a copy in memory and on disk.
      save_to_disk(compounds_df, self._get_compounds_filename())
    _check_validity(compounds_df)
    self.compounds_df = compounds_df
    self.num_samples = len(compounds_df)
Example 41
  def transform_on_batch(self, X, y, w, batch_dataset):
    """
    Transforms data in a 1-shard Dataset object with Transformer objects.
    """
    # Save X, y, and w to batch_dataset
    # The save/load operations work correctly with 1-shard dataframe
    df = batch_dataset.metadata_df
    for _, row in df.iterrows():
      save_to_disk(X, row['X-transformed'])
      save_to_disk(y, row['y-transformed'])
      save_to_disk(w, row['w'])

    # Transform batch_dataset
    for transformer in self.fit_transformers:
      transformer.transform(batch_dataset)

    # Return numpy arrays from batch_dataset
    for _, row in df.iterrows(): 
      X = load_from_disk(row['X-transformed'])
      y = load_from_disk(row['y-transformed'])
      w = load_from_disk(row['w'])

    return X, y, w
Example 42
 def save(self):
     """Saves sklearn model to disk using joblib."""
     super(SklearnModel, self).save()
     save_to_disk(self.raw_model, self.get_model_filename(self.model_dir))
Example 43
feature_dir = os.path.join(base_dir, "features")
if not os.path.exists(feature_dir):
    os.makedirs(feature_dir)

samples_dir = os.path.join(base_dir, "samples")
if not os.path.exists(samples_dir):
    os.makedirs(samples_dir)


from deepchem.featurizers.featurize import DataFeaturizer

featurizers = compound_featurizers + complex_featurizers
featurizer = DataFeaturizer(tasks=["label"],
                            smiles_field="smiles",
                            protein_pdb_field="protein_pdb",
                            ligand_pdb_field="ligand_pdb",
                            compound_featurizers=compound_featurizers,
                            complex_featurizers=complex_featurizers,
                            id_field="complex_id",
                            verbose=False)
from ipyparallel import Client
c = Client()
print("c.ids")
print(c.ids)
dview = c[:]
featurized_samples = featurizer.featurize(dataset_file, feature_dir, samples_dir,
                                          worker_pool=dview, shard_size=1024)

save_to_disk(featurized_samples, featurized_samples_file)
Example 44
 def save_to_disk(self):
   """Save dataset to disk."""
   save_to_disk(
       self.metadata_df, self._get_metadata_filename())
Example 45
  def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None, ids=None,
                         compute_feature_statistics=True):
    out_X = "%s-X.joblib" % basename
    out_X_transformed = "%s-X-transformed.joblib" % basename
    out_X_sums = "%s-X_sums.joblib" % basename
    out_X_sum_squares = "%s-X_sum_squares.joblib" % basename
    out_X_n = "%s-X_n.joblib" % basename
    out_y = "%s-y.joblib" % basename
    out_y_transformed = "%s-y-transformed.joblib" % basename
    out_y_sums = "%s-y_sums.joblib" % basename
    out_y_sum_squares = "%s-y_sum_squares.joblib" % basename
    out_y_n = "%s-y_n.joblib" % basename
    out_w = "%s-w.joblib" % basename
    out_w_transformed = "%s-w-transformed.joblib" % basename
    out_ids = "%s-ids.joblib" % basename

    if X is not None:
      save_to_disk(X, os.path.join(data_dir, out_X))
      save_to_disk(X, os.path.join(data_dir, out_X_transformed))
      if compute_feature_statistics:
        X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
        save_to_disk(X_sums, os.path.join(data_dir, out_X_sums))
        save_to_disk(X_sum_squares, os.path.join(data_dir, out_X_sum_squares))
        save_to_disk(X_n, os.path.join(data_dir, out_X_n))
    if y is not None:
      save_to_disk(y, os.path.join(data_dir, out_y))
      save_to_disk(y, os.path.join(data_dir, out_y_transformed))
      y_sums, y_sum_squares, y_n = compute_sums_and_nb_sample(y, w)
      save_to_disk(y_sums, os.path.join(data_dir, out_y_sums))
      save_to_disk(y_sum_squares, os.path.join(data_dir, out_y_sum_squares))
      save_to_disk(y_n, os.path.join(data_dir, out_y_n))
    if w is not None:
      save_to_disk(w, os.path.join(data_dir, out_w))
      save_to_disk(w, os.path.join(data_dir, out_w_transformed))
    if ids is not None:
      save_to_disk(ids, os.path.join(data_dir, out_ids))
    return [basename, tasks, out_ids, out_X, out_X_transformed, out_y,
            out_y_transformed, out_w, out_w_transformed,
            out_X_sums, out_X_sum_squares, out_X_n,
            out_y_sums, out_y_sum_squares, out_y_n]
Example 46
if not os.path.exists(feature_dir):
    os.makedirs(feature_dir)

samples_dir = os.path.join(base_dir, "samples")
if not os.path.exists(samples_dir):
    os.makedirs(samples_dir)

from deepchem.featurizers.featurize import DataFeaturizer

featurizers = compound_featurizers + complex_featurizers
featurizer = DataFeaturizer(tasks=["label"],
                            smiles_field="smiles",
                            protein_pdb_field="protein_pdb",
                            ligand_pdb_field="ligand_pdb",
                            compound_featurizers=compound_featurizers,
                            complex_featurizers=complex_featurizers,
                            id_field="complex_id",
                            verbose=False)
from ipyparallel import Client
c = Client()
print("c.ids")
print(c.ids)
dview = c[:]
featurized_samples = featurizer.featurize(dataset_file,
                                          feature_dir,
                                          samples_dir,
                                          worker_pool=dview,
                                          shard_size=1024)

save_to_disk(featurized_samples, featurized_samples_file)
Example 47
    def featurize(self, input_file, feature_dir, samples_dir, shard_size=128):
        """Featurize provided file and write to specified location."""
        input_type = _get_input_type(input_file)

        log("Loading raw samples now.", self.verbose)
        raw_df = load_pandas_from_disk(input_file)
        fields = raw_df.keys()
        log("Loaded raw data frame from file.", self.verbose)

        def process_raw_sample_helper(row, fields, input_type):
            return self._process_raw_sample(input_type, row, fields)

        process_raw_sample_helper_partial = partial(process_raw_sample_helper,
                                                    fields=fields,
                                                    input_type=input_type)

        #processed_rows = raw_df.apply(process_raw_sample_helper_partial, axis=1)
        raw_df = raw_df.apply(process_raw_sample_helper_partial,
                              axis=1,
                              reduce=False)
        #raw_df = pd.DataFrame.from_records(processed_rows)

        nb_sample = raw_df.shape[0]
        interval_points = np.linspace(
            0, nb_sample, int(np.ceil(float(nb_sample) / shard_size)) + 1,
            dtype=int)
        shard_files = []
        for j in range(len(interval_points) - 1):
            log(
                "Sharding and standardizing into shard-%s / %s shards" %
                (str(j + 1), len(interval_points) - 1), self.verbose)
            raw_df_shard = raw_df.iloc[range(interval_points[j],
                                             interval_points[j + 1])]

            df = self._standardize_df(raw_df_shard)
            log("Aggregating User-Specified Features", self.verbose)
            self._add_user_specified_features(df)

            for compound_featurizer in self.compound_featurizers:
                log(
                    "Currently feauturizing feature_type: %s" %
                    compound_featurizer.__class__.__name__, self.verbose)
                self._featurize_compounds(df, compound_featurizer)

            for complex_featurizer in self.complex_featurizers:
                log(
                    "Currently feauturizing feature_type: %s" %
                    complex_featurizer.__class__.__name__, self.verbose)
                self._featurize_complexes(df, complex_featurizer)

            shard_out = os.path.join(feature_dir,
                                     "features_shard%d.joblib" % j)
            save_to_disk(df, shard_out)
            shard_files.append(shard_out)

        featurizers = self.compound_featurizers + self.complex_featurizers
        samples = FeaturizedSamples(samples_dir=samples_dir,
                                    featurizers=featurizers,
                                    dataset_files=shard_files,
                                    reload_data=False)

        return samples
Example 48
 def save_to_disk(self):
   """Save dataset to disk."""
   save_to_disk((self.tasks, self.metadata_df), self._get_metadata_filename())
Example 49
 def save(self, out_dir):
   """Saves sklearn model to disk using joblib."""
   super(SklearnModel, self).save(out_dir)
   save_to_disk(self.raw_model, self.get_model_filename(out_dir))
Example 50
 def save(self, out_dir):
   """Dispatcher function for saving."""
   params = {"model_params" : self.model_params,
             "task_types" : self.task_types,
             "model_class": self.__class__}
   save_to_disk(params, Model.get_params_filename(out_dir))
Example 51
 def save(self):
     """Saves sklearn model to disk using joblib."""
     save_to_disk(self.model_instance,
                  self.get_model_filename(self.model_dir))
Example 52
 def _set_compound_df(self, df):
   """Internal method used to replace compounds_df."""
   _check_validity(df)
   save_to_disk(df, self._get_compounds_filename())
   self.compounds_df = df
Example 53
 def _set_compound_df(self, df):
     """Internal method used to replace compounds_df."""
     _check_validity(df)
     save_to_disk(df, self._get_compounds_filename())
     self.compounds_df = df
Example 54
    def write_data_to_disk(data_dir,
                           basename,
                           tasks,
                           X=None,
                           y=None,
                           w=None,
                           ids=None,
                           compute_feature_statistics=True):
        out_X = "%s-X.joblib" % basename
        out_X_transformed = "%s-X-transformed.joblib" % basename
        out_X_sums = "%s-X_sums.joblib" % basename
        out_X_sum_squares = "%s-X_sum_squares.joblib" % basename
        out_X_n = "%s-X_n.joblib" % basename
        out_y = "%s-y.joblib" % basename
        out_y_transformed = "%s-y-transformed.joblib" % basename
        out_y_sums = "%s-y_sums.joblib" % basename
        out_y_sum_squares = "%s-y_sum_squares.joblib" % basename
        out_y_n = "%s-y_n.joblib" % basename
        out_w = "%s-w.joblib" % basename
        out_w_transformed = "%s-w-transformed.joblib" % basename
        out_ids = "%s-ids.joblib" % basename

        if X is not None:
            save_to_disk(X, os.path.join(data_dir, out_X))
            save_to_disk(X, os.path.join(data_dir, out_X_transformed))
            if compute_feature_statistics:
                X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
                save_to_disk(X_sums, os.path.join(data_dir, out_X_sums))
                save_to_disk(X_sum_squares,
                             os.path.join(data_dir, out_X_sum_squares))
                save_to_disk(X_n, os.path.join(data_dir, out_X_n))
        if y is not None:
            save_to_disk(y, os.path.join(data_dir, out_y))
            save_to_disk(y, os.path.join(data_dir, out_y_transformed))
            y_sums, y_sum_squares, y_n = compute_sums_and_nb_sample(y, w)
            save_to_disk(y_sums, os.path.join(data_dir, out_y_sums))
            save_to_disk(y_sum_squares,
                         os.path.join(data_dir, out_y_sum_squares))
            save_to_disk(y_n, os.path.join(data_dir, out_y_n))
        if w is not None:
            save_to_disk(w, os.path.join(data_dir, out_w))
            save_to_disk(w, os.path.join(data_dir, out_w_transformed))
        if ids is not None:
            save_to_disk(ids, os.path.join(data_dir, out_ids))
        return [
            basename, tasks, out_ids, out_X, out_X_transformed, out_y,
            out_y_transformed, out_w, out_w_transformed, out_X_sums,
            out_X_sum_squares, out_X_n, out_y_sums, out_y_sum_squares, out_y_n
        ]