def _transform_row(i, df, normalize_X, normalize_y, truncate_X, truncate_y,
                   log_X, log_y, X_means, X_stds, y_means, y_stds, trunc):
  """Transforms the data (X, y) in a single row.

  Applies optional normalization (with truncation of outliers) and log
  transforms, then writes X-transformed, y-transformed to disk.

  Parameters
  ----------
  i: int
    Positional row index into df.
  df: pd.DataFrame
    Metadata frame whose rows hold on-disk paths ('X', 'y',
    'X-transformed', 'y-transformed').
  trunc: float
    Symmetric clipping threshold applied after normalization.
  """
  def _apply(arr, normalize, truncate, log, means, stds):
    # Shared X/y pipeline; the original duplicated this logic verbatim.
    if normalize or log:
      if normalize:
        # Turns NaNs to zeros
        arr = np.nan_to_num((arr - means) / stds)
        if truncate:
          arr[arr > trunc] = trunc
          arr[arr < (-1.0 * trunc)] = -1.0 * trunc
      if log:
        arr = np.log(arr)
    return arr

  row = df.iloc[i]
  X = load_from_disk(row['X'])
  X = _apply(X, normalize_X, truncate_X, log_X, X_means, X_stds)
  save_to_disk(X, row['X-transformed'])
  y = load_from_disk(row['y'])
  y = _apply(y, normalize_y, truncate_y, log_y, y_means, y_stds)
  save_to_disk(y, row['y-transformed'])
def transform_row(self, i, df, data_dir):
  """Logarithmically transforms selected features/tasks in a single row.

  Applies log(x + 1) to the feature columns in self.features (for X) and
  the task columns in self.tasks (for y); a selection of None transforms
  the entire array.
  """
  row = df.iloc[i]
  if self.transform_X:
    X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
    if self.features is None:
      X = np.log(X + 1)
    else:
      # Transform only the selected columns; others keep their values.
      # (`range` replaces Python-2-only `xrange`; the no-op
      # `X[:, j] = X[:, j]` else-branch is dropped.)
      for j in range(len(X[0])):
        if j in self.features:
          X[:, j] = np.log(X[:, j] + 1)
    save_to_disk(X, os.path.join(data_dir, row['X-transformed']))
  if self.transform_y:
    y = load_from_disk(os.path.join(data_dir, row['y-transformed']))
    if self.tasks is None:
      y = np.log(y + 1)
    else:
      for j in range(len(y[0])):
        if j in self.tasks:
          y[:, j] = np.log(y[:, j] + 1)
    save_to_disk(y, os.path.join(data_dir, row['y-transformed']))
def transform_row(self, i, df, data_dir):
  """Normalizes the data (X, y, w, ...) in a single row.

  Standard z-scoring of X and y, plus a second-order correction applied to
  the gradient tasks (columns 1:) using the energy task (column 0).
  """
  row = df.iloc[i]
  if self.transform_X:
    X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
    X = np.nan_to_num((X - self.X_means) / self.X_stds)
    save_to_disk(X, os.path.join(data_dir, row['X-transformed']))
  if self.transform_y:
    y = load_from_disk(os.path.join(data_dir, row['y-transformed']))
    # transform tasks as normal
    y = np.nan_to_num((y - self.y_means) / self.y_stds)
    # add 2nd order correction term to gradients
    grad_var = 1 / self.y_stds[0] * (
        self.ydely_means - self.y_means[0] * self.y_means[1:])
    # BUG FIX: loop index renamed — it previously shadowed parameter `i`.
    for sample in range(y.shape[0]):
      y[sample, 1:] = y[sample, 1:] - grad_var * y[sample, 0] / self.y_stds[0]
    save_to_disk(y, os.path.join(data_dir, row['y-transformed']))
def create_dataset(shard_generator, data_dir=None, tasks=[], verbose=True):
  """Creates a new DiskDataset

  Parameters
  ----------
  shard_generator: Iterable
    An iterable (either a list or generator) that provides tuples of data
    (X, y, w, ids). Each tuple will be written to a separate shard on disk.
  data_dir: str
    Filename for data directory. Creates a temp directory if none specified.
  tasks: list
    List of tasks for this dataset.
  """
  if data_dir is None:
    data_dir = tempfile.mkdtemp()
  elif not os.path.exists(data_dir):
    os.makedirs(data_dir)
  start = time.time()
  # Write each generated shard to its own set of files on disk.
  metadata_rows = [
      DiskDataset.write_data_to_disk(data_dir, "shard-%d" % shard_num, tasks,
                                     X, y, w, ids)
      for shard_num, (X, y, w, ids) in enumerate(shard_generator)
  ]
  metadata_df = DiskDataset._construct_metadata(metadata_rows)
  save_to_disk((tasks, metadata_df), os.path.join(data_dir, "metadata.joblib"))
  elapsed = time.time() - start
  log("TIMING: dataset construction took %0.3f s" % elapsed, verbose)
  return DiskDataset(data_dir, verbose=verbose)
def transform_row(self, i, df, data_dir):
  """Logarithmically transforms selected features and tasks in one row.

  Applies log(x + 1) to the feature columns in self.features (X) and the
  task columns in self.tasks (y); a selection of None transforms the whole
  array.
  """
  row = df.iloc[i]
  if self.transform_X:
    X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
    num_features = len(X[0])
    if self.features is None:
      X = np.log(X + 1)
    else:
      # Transform only selected columns; the old no-op else-branch
      # (`X[:, j] = X[:, j]`) is removed.
      for j in range(num_features):
        if j in self.features:
          X[:, j] = np.log(X[:, j] + 1)
    save_to_disk(X, os.path.join(data_dir, row['X-transformed']))
  if self.transform_y:
    y = load_from_disk(os.path.join(data_dir, row['y-transformed']))
    num_tasks = len(y[0])
    if self.tasks is None:
      y = np.log(y + 1)
    else:
      for j in range(num_tasks):
        if j in self.tasks:
          y[:, j] = np.log(y[:, j] + 1)
    save_to_disk(y, os.path.join(data_dir, row['y-transformed']))
def featurize(self, input_file, feature_types, feature_dir, shard_size=128):
  """Featurize provided file and write to specified location.

  Loads the raw input, preprocesses each row, shards the dataframe into
  chunks of at most shard_size rows, featurizes each shard with every
  requested feature_type, and writes one joblib file per shard.

  Returns
  -------
  list of str: paths of the shard files written.
  """
  input_type = _get_input_type(input_file)
  print("Loading raw samples now.")
  raw_df = load_pandas_from_disk(input_file)
  fields = raw_df.keys()
  print("Loaded raw data frame from file.")

  def process_raw_sample_helper(row, fields, input_type):
    return self._process_raw_sample(input_type, row, fields)
  process_raw_sample_helper_partial = partial(process_raw_sample_helper,
                                              fields=fields,
                                              input_type=input_type)
  processed_rows = raw_df.apply(process_raw_sample_helper_partial, axis=1)
  print("finished processing rows")
  raw_df = pd.DataFrame.from_records(processed_rows)
  nb_sample = raw_df.shape[0]
  # BUG FIX: np.linspace requires an integer sample count; np.ceil returns a
  # float, which newer NumPy rejects — cast explicitly.
  num_shards = int(np.ceil(float(nb_sample) / shard_size))
  interval_points = np.linspace(0, nb_sample, num_shards + 1, dtype=int)
  shard_files = []
  for j in range(len(interval_points) - 1):
    print("Sharding and standardizing into shard-%s / %s shards"
          % (str(j + 1), len(interval_points) - 1))
    raw_df_shard = raw_df.iloc[range(interval_points[j],
                                     interval_points[j + 1])]
    df = self._standardize_df(raw_df_shard)
    for feature_type in feature_types:
      # (fixed typo "feauturizing" in the log message)
      print("Currently featurizing feature_type: %s" % feature_type)
      self._featurize_df(df, feature_type)
    shard_out = os.path.join(feature_dir, "features_shard%d.joblib" % j)
    save_to_disk(df, shard_out)
    shard_files.append(shard_out)
  return shard_files
def transform_row(self, i, df, data_dir):
  """Normalizes the data (X, y, w, ...) in a single row.

  Z-scores X and y, then applies a second-order correction to the gradient
  tasks (columns 1:) using the energy task (column 0).
  """
  row = df.iloc[i]
  if self.transform_X:
    X = load_from_disk(
        os.path.join(data_dir, row['X-transformed']))
    X = np.nan_to_num((X - self.X_means) / self.X_stds)
    save_to_disk(X, os.path.join(data_dir, row['X-transformed']))
  if self.transform_y:
    y = load_from_disk(os.path.join(data_dir, row['y-transformed']))
    # transform tasks as normal
    y = np.nan_to_num((y - self.y_means) / self.y_stds)
    # add 2nd order correction term to gradients
    grad_var = 1/self.y_stds[0]*(self.ydely_means-self.y_means[0]*self.y_means[1:])
    # BUG FIX: loop index renamed — it previously shadowed parameter `i`.
    for sample in range(y.shape[0]):
      y[sample, 1:] = y[sample, 1:] - grad_var*y[sample, 0]/self.y_stds[0]
    save_to_disk(y, os.path.join(data_dir, row['y-transformed']))
def __init__(self, data_dir=None, tasks=[], samples=None, featurizers=None,
             use_user_specified_features=False):
  """
  Turns featurized dataframes into numpy files, writes them & metadata to disk.
  """
  # NOTE(review): data_dir defaults to None but is passed straight to
  # os.path.exists, which would raise — presumably callers always supply
  # it; confirm.
  if not os.path.exists(data_dir):
    os.makedirs(data_dir)
  self.data_dir = data_dir
  # Feature column names are derived from the featurizer class names,
  # unless the caller opts into user-specified features.
  if featurizers is not None:
    feature_types = [
        featurizer.__class__.__name__ for featurizer in featurizers
    ]
  else:
    feature_types = None
  if use_user_specified_features:
    feature_types = ["user-specified-features"]
  if samples is not None and feature_types is not None:
    if not isinstance(feature_types, list):
      raise ValueError("feature_types must be a list or None.")
    # Bind the fixed arguments so each (df_file, df) pair can be written
    # with one call.
    write_dataset_single_partial = partial(write_dataset_single,
                                           data_dir=self.data_dir,
                                           feature_types=feature_types,
                                           tasks=tasks)
    metadata_rows = []
    # TODO(rbharath): Still a bit of information leakage.
    for df_file, df in zip(samples.dataset_files, samples.iterdataframes()):
      retval = write_dataset_single_partial((df_file, df))
      # Empty dataframes yield None and are skipped.
      if retval is not None:
        metadata_rows.append(retval)
    # TODO(rbharath): FeaturizedSamples should not be responsible for
    # X-transform, X_sums, etc. Move that stuff over to Dataset.
    self.metadata_df = pd.DataFrame(
        metadata_rows,
        columns=('df_file', 'task_names', 'ids', 'X', 'X-transformed', 'y',
                 'y-transformed', 'w', 'X_sums', 'X_sum_squares', 'X_n',
                 'y_sums', 'y_sum_squares', 'y_n'))
    save_to_disk(self.metadata_df, self._get_metadata_filename())
    # input/output transforms not specified yet, so
    # self.transforms = (input_transforms, output_transforms) =>
    self.transforms = ([], [])
    save_to_disk(self.transforms, self._get_transforms_filename())
  else:
    # No fresh samples supplied: reload a previously constructed dataset.
    if os.path.exists(self._get_metadata_filename()):
      self.metadata_df = load_from_disk(
          self._get_metadata_filename())
      self.transforms = load_from_disk(
          self._get_transforms_filename())
    else:
      raise ValueError("No metadata found.")
def _transform_row(i, df, normalize_X, normalize_y, truncate_X, truncate_y,
                   log_X, log_y, X_means, X_stds, y_means, y_stds, trunc):
  """
  Transforms the data (X, y, w,...) in a single row. Writes X-transformed,
  y-transformed to disk.
  """
  row = df.iloc[i]

  X = load_from_disk(row['X'])
  if normalize_X or log_X:
    if normalize_X:
      # Standardize; NaNs produced by zero stds become zeros.
      X = np.nan_to_num((X - X_means) / X_stds)
      if truncate_X:
        # Clip outliers symmetrically at +/- trunc.
        X = np.clip(X, -1.0 * trunc, trunc)
    if log_X:
      X = np.log(X)
  save_to_disk(X, row['X-transformed'])

  y = load_from_disk(row['y'])
  if normalize_y or log_y:
    if normalize_y:
      y = np.nan_to_num((y - y_means) / y_stds)
      if truncate_y:
        y = np.clip(y, -1.0 * trunc, trunc)
    if log_y:
      y = np.log(y)
  save_to_disk(y, row['y-transformed'])
def save(self):
  """Dispatcher function for saving."""
  # Bundle the state needed to reconstruct this model later.
  params = dict(model_params=self.model_params,
                task_types=self.task_types,
                model_class=self.__class__)
  save_to_disk(params, Model.get_params_filename(self.model_dir))
def featurize(self, input_file, feature_dir, samples_dir, shard_size=1024,
              worker_pool=None, reload=False):
  """Featurize provided file and write to specified location.

  Loads the raw input, preprocesses each row, shards the dataframe, and
  runs every compound/complex featurizer over each shard, writing one
  joblib file per shard. Returns a FeaturizedSamples wrapping the shards.
  """
  # If we are not to reload data, or data has not already been featurized.
  if not reload or not os.path.exists(feature_dir):
    if not os.path.exists(feature_dir):
      os.makedirs(feature_dir)
    input_type = _get_input_type(input_file)
    log("Loading raw samples now.", self.verbosity)
    raw_df = load_pandas_from_disk(input_file)
    fields = raw_df.keys()
    log("Loaded raw data frame from file.", self.verbosity)
    log("About to preprocess samples.", self.verbosity)
    def process_raw_sample_helper(row, fields, input_type):
      return self._process_raw_sample(input_type, row, fields)
    process_raw_sample_helper_partial = partial(process_raw_sample_helper,
                                                fields=fields,
                                                input_type=input_type)
    # NOTE(review): `reduce` kwarg was removed in modern pandas — confirm
    # the pinned pandas version supports it.
    raw_df = raw_df.apply(process_raw_sample_helper_partial, axis=1,
                          reduce=False)
    nb_sample = raw_df.shape[0]
    # NOTE(review): np.ceil returns a float; newer NumPy requires an int
    # `num` for linspace — confirm against the pinned NumPy version.
    interval_points = np.linspace(
        0, nb_sample, np.ceil(float(nb_sample)/shard_size)+1, dtype=int)
    shard_files = []
    for j in range(len(interval_points)-1):
      log("Sharding and standardizing into shard-%s / %s shards"
          % (str(j+1), len(interval_points)-1), self.verbosity)
      raw_df_shard = raw_df.iloc[range(interval_points[j],
                                       interval_points[j+1])]
      df = self._standardize_df(raw_df_shard)
      for compound_featurizer in self.compound_featurizers:
        log("Currently featurizing feature_type: %s"
            % compound_featurizer.__class__.__name__, self.verbosity)
        self._featurize_compounds(df, compound_featurizer,
                                  worker_pool=worker_pool)
      for complex_featurizer in self.complex_featurizers:
        log("Currently featurizing feature_type: %s"
            % complex_featurizer.__class__.__name__, self.verbosity)
        self._featurize_complexes(df, complex_featurizer,
                                  worker_pool=worker_pool)
      shard_out = os.path.join(feature_dir, "features_shard%d.joblib" % j)
      save_to_disk(df, shard_out)
      shard_files.append(shard_out)
  else:
    # Reload should automatically find required files
    shard_files = None
  featurizers = self.compound_featurizers + self.complex_featurizers
  samples = FeaturizedSamples(samples_dir=samples_dir,
                              featurizers=featurizers,
                              dataset_files=shard_files,
                              reload=reload, verbosity=self.verbosity)
  return samples
def transform_row(i, df):
  """Logarithmically transforms data in dataset.

  NOTE(review): this function references `self` but takes no `self`
  parameter — presumably it is defined as a closure inside a method where
  `self` is in scope; confirm before moving it.
  """
  row = df.iloc[i]
  if self.transform_X:
    X = load_from_disk(row['X-transformed'])
    X = np.log(X)
    save_to_disk(X, row['X-transformed'])
  if self.transform_y:
    y = load_from_disk(row['y-transformed'])
    y = np.log(y)
    save_to_disk(y, row['y-transformed'])
def featurize(self, input_file, feature_dir, samples_dir, shard_size=128):
  """Featurize provided file and write to specified location.

  Loads the raw input, preprocesses each row, shards the dataframe, adds
  user-specified features, then runs every compound/complex featurizer over
  each shard, saving one joblib file per shard. Returns a FeaturizedSamples
  wrapping the shard files.
  """
  input_type = _get_input_type(input_file)
  log("Loading raw samples now.", self.verbose)
  raw_df = load_pandas_from_disk(input_file)
  fields = raw_df.keys()
  log("Loaded raw data frame from file.", self.verbose)

  def process_raw_sample_helper(row, fields, input_type):
    return self._process_raw_sample(input_type, row, fields)
  process_raw_sample_helper_partial = partial(process_raw_sample_helper,
                                              fields=fields,
                                              input_type=input_type)
  # NOTE: `reduce` kwarg is legacy-pandas only.
  raw_df = raw_df.apply(process_raw_sample_helper_partial, axis=1,
                        reduce=False)
  nb_sample = raw_df.shape[0]
  # BUG FIX: np.linspace requires an integer sample count; cast the ceil.
  num_shards = int(np.ceil(float(nb_sample) / shard_size))
  interval_points = np.linspace(0, nb_sample, num_shards + 1, dtype=int)
  shard_files = []
  for j in range(len(interval_points) - 1):
    log("Sharding and standardizing into shard-%s / %s shards"
        % (str(j + 1), len(interval_points) - 1), self.verbose)
    raw_df_shard = raw_df.iloc[range(interval_points[j],
                                     interval_points[j + 1])]
    df = self._standardize_df(raw_df_shard)
    log("Aggregating User-Specified Features", self.verbose)
    self._add_user_specified_features(df)
    for compound_featurizer in self.compound_featurizers:
      # (fixed typo "feauturizing" in the log message)
      log("Currently featurizing feature_type: %s"
          % compound_featurizer.__class__.__name__, self.verbose)
      self._featurize_compounds(df, compound_featurizer)
    for complex_featurizer in self.complex_featurizers:
      log("Currently featurizing feature_type: %s"
          % complex_featurizer.__class__.__name__, self.verbose)
      self._featurize_complexes(df, complex_featurizer)
    shard_out = os.path.join(feature_dir, "features_shard%d.joblib" % j)
    save_to_disk(df, shard_out)
    shard_files.append(shard_out)
  featurizers = self.compound_featurizers + self.complex_featurizers
  samples = FeaturizedSamples(samples_dir=samples_dir,
                              featurizers=featurizers,
                              dataset_files=shard_files,
                              reload_data=False)
  return samples
def transform_row(self, i, df, data_dir):
  """Reweight the labels for this data."""
  row = df.iloc[i]
  y_path = os.path.join(data_dir, row['y-transformed'])
  w_path = os.path.join(data_dir, row['w-transformed'])
  y = load_from_disk(y_path)
  w = load_from_disk(w_path)
  w_balanced = np.zeros_like(w)
  for ind, task in enumerate(self.dataset.get_task_names()):
    labels = y[:, ind]
    active = w[:, ind] != 0
    # Assign per-class weights only where the original weight is nonzero.
    negatives = np.logical_and(labels == 0, active)
    positives = np.logical_and(labels == 1, active)
    w_balanced[negatives, ind] = self.weights[ind][0]
    w_balanced[positives, ind] = self.weights[ind][1]
  save_to_disk(w_balanced, w_path)
def transform_row(self, i, df, data_dir):
  """Reweight the labels for this data."""
  row = df.iloc[i]
  y = load_from_disk(os.path.join(data_dir, row['y-transformed']))
  w = load_from_disk(os.path.join(data_dir, row['w-transformed']))
  balanced = np.zeros_like(w)
  for task_index, _ in enumerate(self.dataset.get_task_names()):
    task_labels = y[:, task_index]
    task_weights = w[:, task_index]
    # Only rows with a nonzero weight participate in this task.
    neg_mask = np.logical_and(task_labels == 0, task_weights != 0)
    pos_mask = np.logical_and(task_labels == 1, task_weights != 0)
    balanced[neg_mask, task_index] = self.weights[task_index][0]
    balanced[pos_mask, task_index] = self.weights[task_index][1]
  save_to_disk(balanced, os.path.join(data_dir, row['w-transformed']))
def write_dataset_single(val, data_dir, feature_types):
  """Writes files for single row (X, y, w, X-transformed, ...) to disk."""
  df_file, df = val
  # TODO(rbharath): This is a hack. clean up.
  if not len(df):
    return None
  task_names = FeaturizedSamples.get_sorted_task_names(df)
  ids, X, y, w = _df_to_numpy(df, feature_types)
  X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
  y_sums, y_sum_squares, y_n = compute_sums_and_nb_sample(y, w)
  basename = os.path.splitext(os.path.basename(df_file))[0]

  def _path(suffix):
    # One joblib file per array kind, keyed by the dataframe basename.
    return os.path.join(data_dir, "%s-%s.joblib" % (basename, suffix))

  out_X = _path("X")
  out_X_transformed = _path("X-transformed")
  out_y = _path("y")
  out_y_transformed = _path("y-transformed")
  out_w = _path("w")
  out_ids = _path("ids")
  for data, path in ((X, out_X), (y, out_y), (w, out_w), (ids, out_ids)):
    save_to_disk(data, path)
  # TODO(rbharath): Should X be saved to out_X_transformed as well? Since
  # itershards expects to loop over X-transformed? (Ditto for y/w)
  return [df_file, task_names, out_ids, out_X, out_X_transformed,
          out_y, out_y_transformed, out_w, X_sums, X_sum_squares, X_n,
          y_sums, y_sum_squares, y_n]
def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None,
                       ids=None):
  """Persist whichever arrays are provided and return a metadata row.

  Arrays that are None are skipped and recorded as None in the returned
  row.
  """
  def _save(arr, suffix):
    # Save `arr` as "<basename>-<suffix>.joblib"; None means "absent".
    if arr is None:
      return None
    fname = "%s-%s.joblib" % (basename, suffix)
    save_to_disk(arr, os.path.join(data_dir, fname))
    return fname

  out_X = _save(X, "X")
  out_y = _save(y, "y")
  out_w = _save(w, "w")
  out_ids = _save(ids, "ids")
  # note that this corresponds to the _construct_metadata column order
  return [basename, tasks, out_ids, out_X, out_y, out_w]
def write_dataset_single(val, data_dir, feature_types, tasks):
  """Writes files for single row (X, y, w, X-transformed, ...) to disk."""
  df_file, df = val
  # TODO(rbharath): This is a hack. clean up.
  if not len(df):
    return None
  task_names = sorted(tasks)
  ids, X, y, w = _df_to_numpy(df, feature_types, tasks)
  X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
  y_sums, y_sum_squares, y_n = compute_sums_and_nb_sample(y, w)
  basename = os.path.splitext(os.path.basename(df_file))[0]
  # One output path per array kind, keyed by suffix.
  paths = {
      suffix: os.path.join(data_dir, "%s-%s.joblib" % (basename, suffix))
      for suffix in ("X", "X-transformed", "y", "y-transformed", "w", "ids")
  }
  save_to_disk(X, paths["X"])
  save_to_disk(y, paths["y"])
  save_to_disk(w, paths["w"])
  save_to_disk(ids, paths["ids"])
  # TODO(rbharath): Should X be saved to out_X_transformed as well? Since
  # itershards expects to loop over X-transformed? (Ditto for y/w)
  return [df_file, task_names, paths["ids"], paths["X"],
          paths["X-transformed"], paths["y"], paths["y-transformed"],
          paths["w"], X_sums, X_sum_squares, X_n,
          y_sums, y_sum_squares, y_n]
def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None,
                       ids=None):
  """Persist whichever arrays are provided; return their filenames.

  Arrays that are None are skipped and recorded as None.
  """
  def _save(arr, suffix):
    # Save `arr` under "<basename>-<suffix>.joblib"; None means "absent".
    if arr is None:
      return None
    fname = "%s-%s.joblib" % (basename, suffix)
    save_to_disk(arr, os.path.join(data_dir, fname))
    return fname

  out_X = _save(X, "X")
  out_y = _save(y, "y")
  out_w = _save(w, "w")
  out_ids = _save(ids, "ids")
  # note that this corresponds to the _construct_metadata column order
  return [out_ids, out_X, out_y, out_w]
def transform_row(self, i, df):
  """
  Randomly permute a Coulomb Matrix in a dataset

  Rebuilds each flattened upper-triangular matrix, randomizes its padding,
  and writes the result back in place.
  """
  row = df.iloc[i]
  if self.transform_X:
    X = load_from_disk(row['X-transformed'])
    # FIX: `range` replaces Python-2-only `xrange` (rest of file is Py3).
    for j in range(len(X)):
      cm = self.construct_cm_from_triu(X[j])
      X[j] = self.unpad_randomize_and_flatten(cm)
    save_to_disk(X, row['X-transformed'])
  if self.transform_y:
    print("y will not be transformed by CoulombRandomizationTransformer.")
def transform_row(self, i, df, data_dir):
  """
  Randomly permute a Coulomb Matrix in a dataset
  """
  row = df.iloc[i]
  if self.transform_X:
    path = os.path.join(data_dir, row['X-transformed'])
    X = load_from_disk(path)
    # Rebuild each matrix from its flattened triangle, then re-flatten it
    # with randomized padding.
    for idx in range(len(X)):
      matrix = self.construct_cm_from_triu(X[idx])
      X[idx] = self.unpad_randomize_and_flatten(matrix)
    save_to_disk(X, path)
  if self.transform_y:
    print("y will not be transformed by "
          "CoulombRandomizationTransformer.")
def transform_row(self, i, df, data_dir):
  """
  Clips outliers for the data (X, y, w, ...) in a single row.

  Values are clamped into [-self.max_val, self.max_val].
  """
  row = df.iloc[i]
  if self.transform_X:
    X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
    X[X > self.max_val] = self.max_val
    X[X < (-1.0 * self.max_val)] = -1.0 * self.max_val
    save_to_disk(X, os.path.join(data_dir, row['X-transformed']))
  if self.transform_y:
    y = load_from_disk(os.path.join(data_dir, row['y-transformed']))
    # BUG FIX: this branch referenced an undefined name `trunc`; clip with
    # the same self.max_val threshold used for X.
    y[y > self.max_val] = self.max_val
    y[y < (-1.0 * self.max_val)] = -1.0 * self.max_val
    save_to_disk(y, os.path.join(data_dir, row['y-transformed']))
def transform_row(self, i, df):
  """
  Normalizes the data (X, y, w, ...) in a single row.
  """
  row = df.iloc[i]

  def _zscore(path, means, stds):
    # Standardize the array at `path` in place; NaNs become zeros.
    arr = load_from_disk(path)
    save_to_disk(np.nan_to_num((arr - means) / stds), path)

  if self.transform_X:
    _zscore(row['X-transformed'], self.X_means, self.X_stds)
  if self.transform_y:
    _zscore(row['y-transformed'], self.y_means, self.y_stds)
def transform_row(self, i, df, data_dir):
  """
  Normalizes the data (X, y, w, ...) in a single row.
  """
  row = df.iloc[i]
  if self.transform_X:
    X_path = os.path.join(data_dir, row['X-transformed'])
    X = load_from_disk(X_path)
    # Standardize; NaNs (e.g. from zero stds) are replaced with zeros.
    standardized = (X - self.X_means) / self.X_stds
    save_to_disk(np.nan_to_num(standardized), X_path)
  if self.transform_y:
    y_path = os.path.join(data_dir, row['y-transformed'])
    y = load_from_disk(y_path)
    standardized = (y - self.y_means) / self.y_stds
    save_to_disk(np.nan_to_num(standardized), y_path)
def transform_row(self, i, df):
  """
  Clips outliers for the data (X, y, w, ...) in a single row.

  Values are clamped into [-self.max_val, self.max_val].
  """
  row = df.iloc[i]
  if self.transform_X:
    X = load_from_disk(row['X-transformed'])
    X[X > self.max_val] = self.max_val
    X[X < (-1.0*self.max_val)] = -1.0 * self.max_val
    save_to_disk(X, row['X-transformed'])
  if self.transform_y:
    y = load_from_disk(row['y-transformed'])
    # BUG FIX: this branch referenced an undefined name `trunc`; clip with
    # the same self.max_val threshold used for X.
    y[y > self.max_val] = self.max_val
    y[y < (-1.0*self.max_val)] = -1.0 * self.max_val
    save_to_disk(y, row['y-transformed'])
def __init__(self, data_dir, samples=None, feature_types=None):
  """
  Turns featurized dataframes into numpy files, writes them & metadata to disk.

  Parameters
  ----------
  data_dir: str
    Directory to hold the numpy files and metadata; created if absent.
  samples: object, optional
    Provides `dataset_files` and `itersamples()`. When omitted (or when
    feature_types is None), previously written metadata is reloaded from
    data_dir instead.
  feature_types: list or None
    Names of the feature columns to extract from each dataframe.
  """
  if not os.path.exists(data_dir):
    os.makedirs(data_dir)
  self.data_dir = data_dir
  if samples is not None and feature_types is not None:
    if not isinstance(feature_types, list):
      raise ValueError("feature_types must be a list or None.")
    # Bind fixed arguments so each (df_file, df) pair needs one call.
    write_dataset_single_partial = partial(
        write_dataset_single, data_dir=self.data_dir,
        feature_types=feature_types)
    metadata_rows = []
    # TODO(rbharath): Still a bit of information leakage.
    for df_file, df in zip(samples.dataset_files, samples.itersamples()):
      retval = write_dataset_single_partial((df_file, df))
      # Empty dataframes yield None and are skipped.
      if retval is not None:
        metadata_rows.append(retval)
    # TODO(rbharath): FeaturizedSamples should not be responsible for
    # X-transform, X_sums, etc. Move that stuff over to Dataset.
    self.metadata_df = pd.DataFrame(
        metadata_rows,
        columns=('df_file', 'task_names', 'ids', 'X', 'X-transformed', 'y',
                 'y-transformed', 'w', 'X_sums', 'X_sum_squares', 'X_n',
                 'y_sums', 'y_sum_squares', 'y_n'))
    save_to_disk(
        self.metadata_df, self._get_metadata_filename())
    # input/output transforms not specified yet, so
    # self.transforms = (input_transforms, output_transforms) =>
    self.transforms = ([], [])
    save_to_disk(
        self.transforms, self._get_transforms_filename())
  else:
    # No fresh samples supplied: reload a previously constructed dataset.
    if os.path.exists(self._get_metadata_filename()):
      self.metadata_df = load_from_disk(self._get_metadata_filename())
      self.transforms = load_from_disk(self._get_transforms_filename())
    else:
      raise ValueError("No metadata found.")
def transform(self, dataset, parallel=False):
  """Apply the parent transform, then standardize X across all shards.

  Loads every shard's transformed X, computes dataset-wide means and a
  scalar std, and rewrites each shard standardized.
  """
  super(CoulombBinarizationTransformer, self).transform(dataset,
                                                        parallel=parallel)
  df = dataset.metadata_df
  Xt = []
  for _, row in df.iterrows():
    X_t = load_from_disk(row['X-transformed'])
    Xt.append(np.array(X_t))
  X = np.vstack(Xt)
  X_means = X.mean(axis=0)
  # NOTE(review): .std() with no axis yields a single scalar rather than
  # per-feature stds — presumably intentional; confirm.
  X_stds = (X-X_means).std()
  for i, row in df.iterrows():
    # NOTE(review): indexing the Xt list with the dataframe index assumes
    # metadata_df carries a default 0..n-1 RangeIndex — confirm.
    X_t = (Xt[i]-X_means)/X_stds
    save_to_disk(X_t, row['X-transformed'])
def transform(self, dataset, parallel=False):
  """Apply the parent transform, then standardize X across all shards.

  Loads every shard's transformed X from dataset.data_dir, computes
  dataset-wide means and a scalar std, and rewrites each shard
  standardized.
  """
  super(CoulombBinarizationTransformer, self).transform(dataset,
                                                        parallel=parallel)
  df = dataset.metadata_df
  Xt = []
  for _, row in df.iterrows():
    X_t = load_from_disk(os.path.join(dataset.data_dir, row['X-transformed']))
    Xt.append(np.array(X_t))
  X = np.vstack(Xt)
  X_means = X.mean(axis=0)
  # NOTE(review): .std() with no axis yields a single scalar rather than
  # per-feature stds — presumably intentional; confirm.
  X_stds = (X-X_means).std()
  for i, row in df.iterrows():
    # NOTE(review): indexing the Xt list with the dataframe index assumes
    # metadata_df carries a default 0..n-1 RangeIndex — confirm.
    X_t = (Xt[i]-X_means)/X_stds
    save_to_disk(X_t, os.path.join(dataset.data_dir, row['X-transformed']))
def transform_row(self, i, df):
  """
  Binarizes data in dataset with sigmoid function

  Expands each feature column into a bank of tanh threshold responses
  spaced self.theta apart up to that column's recorded maximum.
  """
  row = df.iloc[i]
  X_bin = []
  if self.update_state:
    # Refresh cached per-feature maxima before the first transform.
    self.set_max(df)
    self.update_state = False
  if self.transform_X:
    X = load_from_disk(row['X-transformed'])
    # BUG FIX: loop variable renamed from `i`, which shadowed the row-index
    # parameter `i`.
    for col in range(X.shape[1]):
      for k in np.arange(0, self.feature_max[col]+self.theta, self.theta):
        X_bin += [np.tanh((X[:, col]-k)/self.theta)]
    X_bin = np.array(X_bin).T
    save_to_disk(X_bin, row['X-transformed'])
  if self.transform_y:
    print("y will not be transformed by CoulombBinarizationTransformer.")
def transform_row(self, i, df, data_dir):
  """
  Binarizes data in dataset with sigmoid function

  Expands each feature column into a bank of tanh threshold responses
  spaced self.theta apart up to that column's recorded maximum.
  """
  row = df.iloc[i]
  X_bin = []
  if self.update_state:
    # Refresh cached per-feature maxima before the first transform.
    self.set_max(df, data_dir)
    self.update_state = False
  if self.transform_X:
    X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
    # BUG FIX: loop variable renamed from `i`, which shadowed the row-index
    # parameter `i`.
    for col in range(X.shape[1]):
      for k in np.arange(0, self.feature_max[col]+self.theta, self.theta):
        X_bin += [np.tanh((X[:, col]-k)/self.theta)]
    X_bin = np.array(X_bin).T
    save_to_disk(X_bin, os.path.join(data_dir, row['X-transformed']))
  if self.transform_y:
    print("y will not be transformed by "
          "CoulombBinarizationTransformer.")
def transform(self, input_transforms, output_transforms, parallel=False):
  """
  Transforms all internally stored data.
  Adds X-transform, y-transform columns to metadata.

  input_transforms may contain "truncate", "normalize", "log" (applied to
  X); output_transforms may contain "normalize", "log" (applied to y).
  """
  (normalize_X, truncate_x, normalize_y, truncate_y, log_X,
   log_y) = (False, False, False, False, False, False)
  if "truncate" in input_transforms:
    truncate_x = True
  if "normalize" in input_transforms:
    normalize_X = True
  if "log" in input_transforms:
    log_X = True
  if "normalize" in output_transforms:
    normalize_y = True
  if "log" in output_transforms:
    log_y = True
  # NOTE(review): truncate_y is declared but never enabled — there is no
  # "truncate" check against output_transforms; confirm this is intended.
  # Store input_transforms/output_transforms so the dataset remembers its state.
  X_means, X_stds, y_means, y_stds = self._transform(normalize_X, normalize_y,
                                                     truncate_x, truncate_y,
                                                     log_X, log_y,
                                                     parallel=parallel)
  nrow = self.metadata_df.shape[0]
  # TODO(rbharath): These lines are puzzling. Better way to avoid storage
  # duplication here?
  self.metadata_df['X_means'] = [X_means for _ in range(nrow)]
  self.metadata_df['X_stds'] = [X_stds for _ in range(nrow)]
  self.metadata_df['y_means'] = [y_means for _ in range(nrow)]
  self.metadata_df['y_stds'] = [y_stds for _ in range(nrow)]
  save_to_disk(self.metadata_df, self._get_metadata_filename())
  self.transforms = (input_transforms, output_transforms)
  save_to_disk(self.transforms, self._get_transforms_filename())
def __init__(self, feature_dir, dataset_files=None, overwrite=True,
             reload_data=False):
  """
  Initialiize FeaturizedSamples

  If feature_dir does not exist, must specify dataset_files. Then
  feature_dir is created and populated. If feature_dir exists (created by
  previous call to FeaturizedSamples), then dataset_files cannot be
  specified.

  If overwrite is set and dataset_files is provided, will overwrite old
  dataset_files with new.
  """
  self.dataset_files = dataset_files
  if not os.path.exists(feature_dir):
    os.makedirs(feature_dir)
  self.feature_dir = feature_dir
  if os.path.exists(self._get_compounds_filename()) and reload_data:
    # Reuse the compound index cached by an earlier run.
    compounds_df = load_from_disk(self._get_compounds_filename())
  else:
    compounds_df = self._get_compounds()
    # compounds_df is not altered by any method after initialization, so it's
    # safe to keep a copy in memory and on disk.
    save_to_disk(compounds_df, self._get_compounds_filename())
  _check_validity(compounds_df)
  self.compounds_df = compounds_df
  if os.path.exists(self._get_dataset_paths_filename()):
    if dataset_files is not None:
      if overwrite:
        save_to_disk(dataset_files, self._get_dataset_paths_filename())
      else:
        raise ValueError("Can't change dataset_files already stored on disk")
    # The on-disk copy is authoritative.
    self.dataset_files = load_from_disk(self._get_dataset_paths_filename())
  else:
    save_to_disk(dataset_files, self._get_dataset_paths_filename())
def transform(self, input_transforms, output_transforms, parallel=False):
  """
  Transforms all internally stored data.
  Adds X-transform, y-transform columns to metadata.
  """
  # Decode the requested transforms into flags.
  normalize_X = "normalize" in input_transforms
  truncate_x = "truncate" in input_transforms
  log_X = "log" in input_transforms
  normalize_y = "normalize" in output_transforms
  log_y = "log" in output_transforms
  truncate_y = False
  X_means, X_stds, y_means, y_stds = self._transform(
      normalize_X, normalize_y, truncate_x, truncate_y, log_X, log_y,
      parallel=parallel)
  nrow = self.metadata_df.shape[0]
  # TODO(rbharath): These lines are puzzling. Better way to avoid storage
  # duplication here?
  self.metadata_df['X_means'] = [X_means] * nrow
  self.metadata_df['X_stds'] = [X_stds] * nrow
  self.metadata_df['y_means'] = [y_means] * nrow
  self.metadata_df['y_stds'] = [y_stds] * nrow
  save_to_disk(self.metadata_df, self._get_metadata_filename())
  # Remember the applied transforms so the dataset keeps its state.
  self.transforms = (input_transforms, output_transforms)
  save_to_disk(self.transforms, self._get_transforms_filename())
def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None,
                       ids=None):
  """Save any provided arrays under data_dir; return the metadata row.

  Note the returned filenames are always populated, whether or not the
  corresponding array was actually written.
  """
  names = {kind: "%s-%s.joblib" % (basename, kind)
           for kind in ("X", "y", "w", "ids")}
  for kind, arr in (("X", X), ("y", y), ("w", w), ("ids", ids)):
    if arr is not None:
      save_to_disk(arr, os.path.join(data_dir, names[kind]))
  return [basename, tasks, names["ids"], names["X"], names["y"], names["w"]]
def update_mean_and_std(df):
  """
  Compute per-feature sums and sums-of-squares of the transformed X/y
  tensors and persist them alongside each row (inputs for later mean/std
  computation).
  """
  # (Removed unused `X_transform`/`y_transform` accumulator lists.)
  for _, row in df.iterrows():
    Xt = load_from_disk(row['X-transformed'])
    save_to_disk(np.sum(Xt, axis=0), row['X_sums'])
    save_to_disk(np.sum(np.square(Xt), axis=0), row['X_sum_squares'])
  for _, row in df.iterrows():
    yt = load_from_disk(row['y-transformed'])
    save_to_disk(np.sum(yt, axis=0), row['y_sums'])
    save_to_disk(np.sum(np.square(yt), axis=0), row['y_sum_squares'])
def _update_mean_and_std(self, df, X_stats, y_stats):
  """
  Compute per-feature sums and sums-of-squares of the transformed X and/or
  y tensors and write them to disk (inputs for later mean/std computation).

  Parameters
  ----------
  X_stats, y_stats: bool
    Whether to recompute statistics for X and y respectively.
  """
  # (Removed unused `X_transform`/`y_transform` accumulator lists.)
  if X_stats:
    for _, row in df.iterrows():
      Xt = load_from_disk(os.path.join(self.data_dir, row['X-transformed']))
      save_to_disk(np.sum(Xt, axis=0),
                   os.path.join(self.data_dir, row['X_sums']))
      save_to_disk(np.sum(np.square(Xt), axis=0),
                   os.path.join(self.data_dir, row['X_sum_squares']))
  if y_stats:
    for _, row in df.iterrows():
      yt = load_from_disk(os.path.join(self.data_dir, row['y-transformed']))
      save_to_disk(np.sum(yt, axis=0),
                   os.path.join(self.data_dir, row['y_sums']))
      save_to_disk(np.sum(np.square(yt), axis=0),
                   os.path.join(self.data_dir, row['y_sum_squares']))
def __init__(self, samples_dir, featurizers, dataset_files=None,
             overwrite=True, reload_data=False):
  """
  Initialiize FeaturizedSamples

  If samples_dir does not exist, must specify dataset_files. Then
  samples_dir is created and populated. If samples_dir exists (created by
  previous call to FeaturizedSamples), then dataset_files cannot be
  specified.

  If overwrite is set and dataset_files is provided, will overwrite old
  dataset_files with new.
  """
  self.dataset_files = dataset_files
  # Feature columns: user-specified features plus one per featurizer class.
  self.feature_types = (
      ["user-specified-features"] +
      [featurizer.__class__.__name__ for featurizer in featurizers])
  self.featurizers = featurizers
  if not os.path.exists(samples_dir):
    os.makedirs(samples_dir)
  self.samples_dir = samples_dir
  if os.path.exists(self._get_compounds_filename()) and reload_data:
    # Reuse the compound index cached by an earlier run.
    compounds_df = load_from_disk(self._get_compounds_filename())
  else:
    compounds_df = self._get_compounds()
    # compounds_df is not altered by any method after initialization, so it's
    # safe to keep a copy in memory and on disk.
    save_to_disk(compounds_df, self._get_compounds_filename())
  _check_validity(compounds_df)
  self.compounds_df = compounds_df
  self.num_samples = len(compounds_df)
  if os.path.exists(self._get_dataset_paths_filename()):
    if dataset_files is not None:
      if overwrite:
        save_to_disk(dataset_files, self._get_dataset_paths_filename())
      else:
        raise ValueError(
            "Can't change dataset_files already stored on disk")
    # The on-disk copy is authoritative.
    self.dataset_files = load_from_disk(
        self._get_dataset_paths_filename())
  else:
    save_to_disk(dataset_files, self._get_dataset_paths_filename())
def __init__(self, samples_dir, featurizers, dataset_files=None,
             reload=False, verbosity=None):
  """Initialize FeaturizedSamples.

  If samples_dir does not exist, must specify dataset_files. Then
  samples_dir is created and populated.

  If samples_dir exists (created by previous call to FeaturizedSamples),
  then dataset_files cannot be specified.

  If reload is False and dataset_files is provided, will overwrite old
  dataset_files with new.
  """
  assert verbosity in [None, "low", "high"]
  self.verbosity = verbosity
  self.dataset_files = dataset_files
  # "user-specified-features" is always an available feature type, followed
  # by one type name per featurizer class.
  self.feature_types = (
      ["user-specified-features"] +
      [featurizer.__class__.__name__ for featurizer in featurizers])
  self.featurizers = featurizers
  if not os.path.exists(samples_dir):
    os.makedirs(samples_dir)
  self.samples_dir = samples_dir

  # Dataset paths already on disk: overwrite with the caller's list unless
  # reload was requested (in which case changing them is an error); then
  # always read back the stored list.
  if os.path.exists(self._get_dataset_paths_filename()):
    if dataset_files is not None:
      if not reload:
        save_to_disk(dataset_files, self._get_dataset_paths_filename())
      else:
        raise ValueError("Can't change dataset_files already stored on disk")
  else:
    save_to_disk(dataset_files, self._get_dataset_paths_filename())
  self.dataset_files = load_from_disk(self._get_dataset_paths_filename())

  # Reuse the cached compounds dataframe only when reload is requested;
  # otherwise recompute it and persist the fresh copy.
  if os.path.exists(self._get_compounds_filename()) and reload:
    compounds_df = load_from_disk(self._get_compounds_filename())
  else:
    compounds_df = self._get_compounds()
    # compounds_df is not altered by any method after initialization, so it's
    # safe to keep a copy in memory and on disk.
    save_to_disk(compounds_df, self._get_compounds_filename())
  _check_validity(compounds_df)
  self.compounds_df = compounds_df
  self.num_samples = len(compounds_df)
def transform_on_batch(self, X, y, w, batch_dataset):
  """Run this object's fit_transformers over a single-shard Dataset.

  The arrays X/y/w are first staged into batch_dataset's on-disk shard,
  every fit transformer is applied in order, and the (now transformed)
  arrays are read back from disk and returned.
  """
  metadata = batch_dataset.metadata_df
  # Stage the in-memory arrays into the dataset's shard files. The
  # save/load round-trip assumes the dataframe describes exactly one shard.
  for _, shard in metadata.iterrows():
    save_to_disk(X, shard['X-transformed'])
    save_to_disk(y, shard['y-transformed'])
    save_to_disk(w, shard['w'])
  # Apply each transformer in sequence to the staged dataset.
  for xform in self.fit_transformers:
    xform.transform(batch_dataset)
  # Read the transformed arrays back out (single shard => single row).
  for _, shard in metadata.iterrows():
    X = load_from_disk(shard['X-transformed'])
    y = load_from_disk(shard['y-transformed'])
    w = load_from_disk(shard['w'])
  return X, y, w
def save(self):
  """Persist this sklearn model to disk using joblib."""
  # Let the base Model class write its own metadata first.
  super(SklearnModel, self).save()
  model_path = self.get_model_filename(self.model_dir)
  save_to_disk(self.raw_model, model_path)
# Create (if needed) the on-disk directories for featurized data and samples.
feature_dir = os.path.join(base_dir, "features")
if not os.path.exists(feature_dir):
  os.makedirs(feature_dir)
samples_dir = os.path.join(base_dir, "samples")
if not os.path.exists(samples_dir):
  os.makedirs(samples_dir)

from deepchem.featurizers.featurize import DataFeaturizer
featurizers = compound_featurizers + complex_featurizers
# Featurize the single "label" task, pulling SMILES plus protein/ligand PDB
# blobs from each row of the input file; rows are keyed by "complex_id".
featurizer = DataFeaturizer(tasks=["label"],
                            smiles_field="smiles",
                            protein_pdb_field="protein_pdb",
                            ligand_pdb_field="ligand_pdb",
                            compound_featurizers=compound_featurizers,
                            complex_featurizers=complex_featurizers,
                            id_field="complex_id",
                            verbose=False)

# Fan the featurization out over an ipyparallel cluster; c.ids lists the
# engine ids the client can reach, and dview is a view over all of them.
from ipyparallel import Client
c = Client()
print("c.ids")
print(c.ids)
dview = c[:]
featurized_samples = featurizer.featurize(dataset_file, feature_dir,
                                          samples_dir, worker_pool=dview,
                                          shard_size=1024)
save_to_disk(featurized_samples, featurized_samples_file)
def save_to_disk(self):
  """Write this dataset's metadata dataframe out to disk."""
  metadata_filename = self._get_metadata_filename()
  # Inside this body, save_to_disk resolves to the module-level helper,
  # not this method (methods are not in the function's name lookup).
  save_to_disk(self.metadata_df, metadata_filename)
def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None,
                       ids=None, compute_feature_statistics=True):
  """Write one shard's arrays (X, y, w, ids) plus statistics to data_dir.

  Each provided array is saved twice: once raw and once as the initial
  "-transformed" copy (transformers later overwrite the latter in place).
  Sum/sum-of-squares/count statistics are saved for X when
  compute_feature_statistics is True, and for y whenever y is provided.

  Returns the metadata row for this shard: basename, tasks and the
  shard-relative filenames, in the fixed column order the dataset
  metadata dataframe expects.
  """
  def _path(fname):
    # All artifacts live directly under data_dir.
    return os.path.join(data_dir, fname)

  # Derive every on-disk filename for this shard up front.
  X_file = "%s-X.joblib" % basename
  X_transformed_file = "%s-X-transformed.joblib" % basename
  X_sums_file = "%s-X_sums.joblib" % basename
  X_sum_squares_file = "%s-X_sum_squares.joblib" % basename
  X_n_file = "%s-X_n.joblib" % basename
  y_file = "%s-y.joblib" % basename
  y_transformed_file = "%s-y-transformed.joblib" % basename
  y_sums_file = "%s-y_sums.joblib" % basename
  y_sum_squares_file = "%s-y_sum_squares.joblib" % basename
  y_n_file = "%s-y_n.joblib" % basename
  w_file = "%s-w.joblib" % basename
  w_transformed_file = "%s-w-transformed.joblib" % basename
  ids_file = "%s-ids.joblib" % basename

  if X is not None:
    save_to_disk(X, _path(X_file))
    save_to_disk(X, _path(X_transformed_file))
    if compute_feature_statistics:
      X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
      save_to_disk(X_sums, _path(X_sums_file))
      save_to_disk(X_sum_squares, _path(X_sum_squares_file))
      save_to_disk(X_n, _path(X_n_file))
  if y is not None:
    save_to_disk(y, _path(y_file))
    save_to_disk(y, _path(y_transformed_file))
    # NOTE(review): y statistics ignore compute_feature_statistics, unlike
    # the X branch — presumably intentional (label stats are always needed
    # for transforms), but worth confirming.
    y_sums, y_sum_squares, y_n = compute_sums_and_nb_sample(y, w)
    save_to_disk(y_sums, _path(y_sums_file))
    save_to_disk(y_sum_squares, _path(y_sum_squares_file))
    save_to_disk(y_n, _path(y_n_file))
  if w is not None:
    save_to_disk(w, _path(w_file))
    save_to_disk(w, _path(w_transformed_file))
  if ids is not None:
    save_to_disk(ids, _path(ids_file))
  return [basename, tasks, ids_file, X_file, X_transformed_file, y_file,
          y_transformed_file, w_file, w_transformed_file, X_sums_file,
          X_sum_squares_file, X_n_file, y_sums_file, y_sum_squares_file,
          y_n_file]
# Create (if needed) the on-disk directories for featurized data and samples.
if not os.path.exists(feature_dir):
  os.makedirs(feature_dir)
samples_dir = os.path.join(base_dir, "samples")
if not os.path.exists(samples_dir):
  os.makedirs(samples_dir)

from deepchem.featurizers.featurize import DataFeaturizer
featurizers = compound_featurizers + complex_featurizers
# Featurize the single "label" task, pulling SMILES plus protein/ligand PDB
# blobs from each row of the input file; rows are keyed by "complex_id".
featurizer = DataFeaturizer(tasks=["label"],
                            smiles_field="smiles",
                            protein_pdb_field="protein_pdb",
                            ligand_pdb_field="ligand_pdb",
                            compound_featurizers=compound_featurizers,
                            complex_featurizers=complex_featurizers,
                            id_field="complex_id",
                            verbose=False)

# Fan the featurization out over an ipyparallel cluster; c.ids lists the
# engine ids the client can reach, and dview is a view over all of them.
from ipyparallel import Client
c = Client()
print("c.ids")
print(c.ids)
dview = c[:]
featurized_samples = featurizer.featurize(dataset_file, feature_dir,
                                          samples_dir, worker_pool=dview,
                                          shard_size=1024)
save_to_disk(featurized_samples, featurized_samples_file)
def featurize(self, input_file, feature_dir, samples_dir, shard_size=128):
  """Featurize provided file and write to specified location.

  Loads input_file into a dataframe, preprocesses each raw sample, then
  works in shards of shard_size rows: each shard is standardized,
  user-specified features are aggregated, and every compound/complex
  featurizer is run over it. Each featurized shard is saved to
  feature_dir, and a FeaturizedSamples object wrapping the shard files
  is returned.
  """
  input_type = _get_input_type(input_file)

  log("Loading raw samples now.", self.verbose)
  raw_df = load_pandas_from_disk(input_file)
  fields = raw_df.keys()
  log("Loaded raw data frame from file.", self.verbose)

  def process_raw_sample_helper(row, fields, input_type):
    # Thin adapter so the per-row preprocessing can be applied via partial.
    return self._process_raw_sample(input_type, row, fields)
  process_raw_sample_helper_partial = partial(process_raw_sample_helper,
                                              fields=fields,
                                              input_type=input_type)
  # NOTE(review): the `reduce` keyword was deprecated and later removed from
  # pandas.DataFrame.apply; replace with result_type when upgrading pandas.
  raw_df = raw_df.apply(process_raw_sample_helper_partial, axis=1,
                        reduce=False)

  nb_sample = raw_df.shape[0]
  # np.linspace requires an integer sample count; np.ceil returns a float,
  # so cast explicitly (modern numpy raises TypeError otherwise).
  num_shards = int(np.ceil(float(nb_sample) / shard_size))
  interval_points = np.linspace(0, nb_sample, num_shards + 1, dtype=int)
  shard_files = []
  for j in range(len(interval_points) - 1):
    log("Sharding and standardizing into shard-%s / %s shards"
        % (str(j + 1), len(interval_points) - 1), self.verbose)
    raw_df_shard = raw_df.iloc[range(interval_points[j],
                                     interval_points[j + 1])]
    df = self._standardize_df(raw_df_shard)
    log("Aggregating User-Specified Features", self.verbose)
    self._add_user_specified_features(df)
    for compound_featurizer in self.compound_featurizers:
      log("Currently feauturizing feature_type: %s"
          % compound_featurizer.__class__.__name__, self.verbose)
      self._featurize_compounds(df, compound_featurizer)
    for complex_featurizer in self.complex_featurizers:
      log("Currently feauturizing feature_type: %s"
          % complex_featurizer.__class__.__name__, self.verbose)
      self._featurize_complexes(df, complex_featurizer)
    shard_out = os.path.join(feature_dir, "features_shard%d.joblib" % j)
    save_to_disk(df, shard_out)
    shard_files.append(shard_out)
  featurizers = self.compound_featurizers + self.complex_featurizers
  samples = FeaturizedSamples(samples_dir=samples_dir,
                              featurizers=featurizers,
                              dataset_files=shard_files,
                              reload_data=False)
  return samples
def save_to_disk(self):
  """Serialize this dataset's tasks and metadata dataframe to disk."""
  # The two pieces are stored together as a single tuple.
  payload = (self.tasks, self.metadata_df)
  save_to_disk(payload, self._get_metadata_filename())
def save(self, out_dir):
  """Persist this sklearn model under out_dir using joblib."""
  # Write base-class metadata first, then the wrapped estimator itself.
  super(SklearnModel, self).save(out_dir)
  model_path = self.get_model_filename(out_dir)
  save_to_disk(self.raw_model, model_path)
def save(self, out_dir):
  """Write this model's reconstruction parameters out under out_dir.

  Stores the constructor params, the task types, and the concrete model
  class so the model can later be re-instantiated from disk.
  """
  params = {}
  params["model_params"] = self.model_params
  params["task_types"] = self.task_types
  params["model_class"] = self.__class__
  save_to_disk(params, Model.get_params_filename(out_dir))
def save(self):
  """Persist the underlying sklearn estimator to disk using joblib."""
  model_path = self.get_model_filename(self.model_dir)
  save_to_disk(self.model_instance, model_path)
def _set_compound_df(self, df):
  """Validate df, persist it, then install it as self.compounds_df."""
  # Validate before touching disk or in-memory state.
  _check_validity(df)
  compounds_path = self._get_compounds_filename()
  save_to_disk(df, compounds_path)
  self.compounds_df = df
def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None,
                       ids=None, compute_feature_statistics=True):
  """Write one shard's arrays (X, y, w, ids) and statistics under data_dir.

  Each provided array is saved twice: once raw and once as the initial
  "-transformed" copy (transformers later overwrite the latter in place).
  Returns the metadata row for this shard: basename, tasks and the
  shard-relative filenames, in the fixed column order the dataset
  metadata dataframe expects.
  """
  # Filenames for every artifact this shard may produce.
  out_X = "%s-X.joblib" % basename
  out_X_transformed = "%s-X-transformed.joblib" % basename
  out_X_sums = "%s-X_sums.joblib" % basename
  out_X_sum_squares = "%s-X_sum_squares.joblib" % basename
  out_X_n = "%s-X_n.joblib" % basename
  out_y = "%s-y.joblib" % basename
  out_y_transformed = "%s-y-transformed.joblib" % basename
  out_y_sums = "%s-y_sums.joblib" % basename
  out_y_sum_squares = "%s-y_sum_squares.joblib" % basename
  out_y_n = "%s-y_n.joblib" % basename
  out_w = "%s-w.joblib" % basename
  out_w_transformed = "%s-w-transformed.joblib" % basename
  out_ids = "%s-ids.joblib" % basename

  if X is not None:
    save_to_disk(X, os.path.join(data_dir, out_X))
    save_to_disk(X, os.path.join(data_dir, out_X_transformed))
    if compute_feature_statistics:
      X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
      save_to_disk(X_sums, os.path.join(data_dir, out_X_sums))
      save_to_disk(X_sum_squares, os.path.join(data_dir, out_X_sum_squares))
      save_to_disk(X_n, os.path.join(data_dir, out_X_n))
  if y is not None:
    save_to_disk(y, os.path.join(data_dir, out_y))
    save_to_disk(y, os.path.join(data_dir, out_y_transformed))
    # NOTE(review): y statistics ignore compute_feature_statistics, unlike
    # the X branch — presumably intentional; verify against callers.
    y_sums, y_sum_squares, y_n = compute_sums_and_nb_sample(y, w)
    save_to_disk(y_sums, os.path.join(data_dir, out_y_sums))
    save_to_disk(y_sum_squares, os.path.join(data_dir, out_y_sum_squares))
    save_to_disk(y_n, os.path.join(data_dir, out_y_n))
  if w is not None:
    save_to_disk(w, os.path.join(data_dir, out_w))
    save_to_disk(w, os.path.join(data_dir, out_w_transformed))
  if ids is not None:
    save_to_disk(ids, os.path.join(data_dir, out_ids))
  return [
      basename, tasks, out_ids, out_X, out_X_transformed, out_y,
      out_y_transformed, out_w, out_w_transformed, out_X_sums,
      out_X_sum_squares, out_X_n, out_y_sums, out_y_sum_squares, out_y_n
  ]