Example #1
 def sparse_shuffle(self):
   """Shuffling that exploits data sparsity to shuffle large datasets.
   Only for 1-dimensional feature vectors (does not work for tensorial
   featurizations).
   """
   time1 = time.time()
   shard_size = self.get_shard_size()
   num_shards = self.get_number_shards()
   X_sparses, ys, ws, ids = [], [], [], []
   num_features = None
   for i in range(num_shards):
     (X_s, y_s, w_s, ids_s) = self.get_shard(i)
     if num_features is None:
       num_features = X_s.shape[1]
     X_sparse = sparsify_features(X_s)
     X_sparses, ys, ws, ids = (X_sparses + [X_sparse], ys + [y_s], ws + [w_s],
                                ids + [np.atleast_1d(np.squeeze(ids_s))])
   # Get full dataset in memory
   (X_sparse, y, w, ids) = (np.vstack(X_sparses), np.vstack(ys), np.vstack(ws),
                             np.concatenate(ids))
   # Shuffle in memory
   num_samples = len(X_sparse)
   permutation = np.random.permutation(num_samples)
   X_sparse, y, w, ids = (X_sparse[permutation], y[permutation],
                          w[permutation], ids[permutation])
   # Write shuffled shards out to disk
   for i in range(num_shards):
     start, stop = i * shard_size, (i + 1) * shard_size
     (X_sparse_s, y_s, w_s, ids_s) = (X_sparse[start:stop], y[start:stop],
                                      w[start:stop], ids[start:stop])
     X_s = densify_features(X_sparse_s, num_features)
     self.set_shard(i, X_s, y_s, w_s, ids_s)
   time2 = time.time()
   log("TIMING: sparse_shuffle took %0.3f s" % (time2 - time1), self.verbose)
Example #2
  @staticmethod
  def create_dataset(shard_generator, data_dir=None, tasks=[], verbose=True):
    """Creates a new DiskDataset
    Parameters
    ----------
    shard_generator: Iterable
      An iterable (either a list or generator) that provides tuples of data
      (X, y, w, ids). Each tuple will be written to a separate shard on disk.
    data_dir: str
      Filename for data directory. Creates a temp directory if none specified.
    tasks: list
      List of tasks for this dataset.
    """
    if data_dir is None:
      data_dir = tempfile.mkdtemp()
    elif not os.path.exists(data_dir):
      os.makedirs(data_dir)

    metadata_rows = []
    time1 = time.time()
    for shard_num, (X, y, w, ids) in enumerate(shard_generator):
      basename = "shard-%d" % shard_num
      metadata_rows.append(
          DiskDataset.write_data_to_disk(data_dir, basename, tasks, X, y, w,
                                         ids))
    metadata_df = DiskDataset._construct_metadata(metadata_rows)
    save_metadata(tasks, metadata_df, data_dir)
    time2 = time.time()
    log("TIMING: dataset construction took %0.3f s" % (time2 - time1), verbose)
    return DiskDataset(data_dir, verbose=verbose)
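A hypothetical usage sketch (assuming DiskDataset is importable, e.g. from deepchem.data; the task name and shapes are made up): any generator of (X, y, w, ids) tuples can be written out shard by shard.

import numpy as np

def toy_shards(num_shards=2, shard_size=4, num_features=8):
    # Each yielded tuple becomes one shard on disk.
    for i in range(num_shards):
        X = np.random.rand(shard_size, num_features)
        y = np.random.randint(2, size=(shard_size, 1))
        w = np.ones((shard_size, 1))
        ids = np.array(["sample-%d-%d" % (i, j) for j in range(shard_size)])
        yield X, y, w, ids

dataset = DiskDataset.create_dataset(toy_shards(), tasks=["activity"])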
Example #3
  def __init__(self, data_dir, verbose=True):
    """
    Turns featurized dataframes into numpy files, writes them & metadata to disk.
    """
    self.data_dir = data_dir
    self.verbose = verbose

    log("Loading dataset from disk.", self.verbose)
    self.tasks, self.metadata_df = self.load_metadata()
Example #4
    def featurize(self, input_files, data_dir=None, shard_size=8192):
        """Featurize provided files and write to specified location.
    
    For large datasets, automatically shards into smaller chunks
    for convenience.
    Parameters
    ----------
    input_files: list
      List of input filenames.
    data_dir: str
      (Optional) Directory to store featurized dataset.
    shard_size: int
      (Optional) Number of examples stored in each shard.
    """
        log("Loading raw samples now.", self.verbose)
        log("shard_size: %d" % shard_size, self.verbose)

        if not isinstance(input_files, list):
            input_files = [input_files]

        def shard_generator():
            for shard_num, shard in enumerate(
                    self.get_shards(input_files, shard_size)):
                time1 = time.time()
                X, valid_inds = self.featurize_shard(shard)
                ids = shard[self.id_field].values
                ids = ids[valid_inds]
                if len(self.tasks) > 0:
                    # Featurize task results iff they exist.
                    y, w = convert_df_to_numpy(shard, self.tasks,
                                               self.id_field)
                    # Filter out examples where featurization failed.
                    y, w = (y[valid_inds], w[valid_inds])
                    assert len(X) == len(ids) == len(y) == len(w)
                else:
                    # For prospective data where results are unknown, it makes
                    # no sense to have y values or weights.
                    y, w = (None, None)
                    assert len(X) == len(ids)

                time2 = time.time()
                log(
                    "TIMING: featurizing shard %d took %0.3f s" %
                    (shard_num, time2 - time1), self.verbose)
                yield X, y, w, ids

        return DiskDataset.create_dataset(shard_generator(),
                                          data_dir,
                                          self.tasks,
                                          verbose=self.verbose)
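A hypothetical end-to-end call, assuming the DeepChem-era loader API these snippets appear to come from (class and argument names are assumptions, as is the input file):

import deepchem as dc

featurizer = dc.feat.CircularFingerprint(size=1024)
loader = dc.data.CSVLoader(tasks=["activity"], smiles_field="smiles",
                           featurizer=featurizer)
# Rows are featurized in shards of 8192 and written to a temp directory.
dataset = loader.featurize(["assay_results.csv"], shard_size=8192)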
Example #5
def get_user_specified_features(df, featurizer, verbose=True):
    """Extract and merge user specified features. 
  Merge features included in dataset provided by user
  into final features dataframe
  Three types of featurization here:
    1) Molecule featurization
      -) Smiles string featurization
      -) Rdkit MOL featurization
    2) Complex featurization
      -) PDB files for interacting molecules.
    3) User specified featurizations.
  """
    time1 = time.time()
    df[featurizer.feature_fields] = df[featurizer.feature_fields].apply(
        pd.to_numeric)
    # df.as_matrix was removed from pandas; to_numpy is the supported equivalent.
    X_shard = df[featurizer.feature_fields].to_numpy()
    time2 = time.time()
    log("TIMING: user specified processing took %0.3f s" % (time2 - time1),
        verbose)
    return X_shard
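A small illustration (hypothetical: the featurizer stand-in exposes only the feature_fields attribute the function reads, and log is assumed to come from the surrounding module):

import pandas as pd

class UserFeaturizer:
    # Stand-in carrying only what get_user_specified_features uses.
    feature_fields = ["mw", "logp"]

df = pd.DataFrame({"smiles": ["CCO", "CCN"],
                   "mw": ["46.07", "45.08"],
                   "logp": ["-0.31", "-0.16"]})
X_shard = get_user_specified_features(df, UserFeaturizer())
# X_shard is a (2, 2) float array built from the user-supplied columns.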
Example #6
def featurize_mol_df(df, featurizer, field, verbose=True, log_every_N=1000):
    """Featurize individual compounds in dataframe.
  Featurizes .sdf files, so the 3-D structure should be preserved
  so we use the rdkit "mol" object created from .sdf instead of smiles
  string. Some featurizers such as CoulombMatrix also require a 3-D
  structure.  Featurizing from .sdf is currently the only way to
  perform CM feautization.
  """
    sample_elems = df[field].tolist()

    features = []
    for ind, mol in enumerate(sample_elems):
        if ind % log_every_N == 0:
            log("Featurizing sample %d" % ind, verbose)
        features.append(featurizer.featurize([mol]))
    valid_inds = np.array([1 if elt.size > 0 else 0 for elt in features],
                          dtype=bool)
    features = [
        elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
    ]
    return np.squeeze(np.array(features)), valid_inds
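Hypothetical usage with an .sdf file (the file name and featurizer choice are assumptions; CoulombMatrix-style featurizers need the 3-D coordinates carried by the mol objects):

import deepchem as dc
import pandas as pd
from rdkit import Chem

featurizer = dc.feat.CoulombMatrix(max_atoms=50)
mols = [m for m in Chem.SDMolSupplier("ligands.sdf") if m is not None]
df = pd.DataFrame({"mol": mols})
X, valid_inds = featurize_mol_df(df, featurizer, field="mol")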
Example #7
def featurize_smiles_np(arr, featurizer, log_every_N=1000, verbose=True):
    """Featurize individual compounds in a numpy array of SMILES strings.

    Given a featurizer that operates on individual chemical compounds
    or macromolecules, compute & add features for that compound to the
    features array.
    """
    features = []
    for ind, elem in enumerate(arr.tolist()):
        mol = Chem.MolFromSmiles(elem)
        if mol:
            new_order = rdmolfiles.CanonicalRankAtoms(mol)
            mol = rdmolops.RenumberAtoms(mol, new_order)
        if ind % log_every_N == 0:
            log("Featurizing sample %d" % ind, verbose)
        features.append(featurizer.featurize([mol]))

    valid_inds = np.array([1 if elt.size > 0 else 0 for elt in features],
                          dtype=bool)
    features = [
        elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
    ]
    features = np.squeeze(np.array(features))
    return features.reshape(-1)
Example #8
def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
    """Featurize individual compounds in dataframe.
  Given a featurizer that operates on individual chemical compounds 
  or macromolecules, compute & add features for that compound to the 
  features dataframe
  """
    sample_elems = df[field].tolist()

    features = []
    for ind, elem in enumerate(sample_elems):
        mol = Chem.MolFromSmiles(elem)
        if mol:
            new_order = rdmolfiles.CanonicalRankAtoms(mol)
            mol = rdmolops.RenumberAtoms(mol, new_order)
        if ind % log_every_N == 0:
            log("Featurizing sample %d" % ind, verbose)
        features.append(featurizer.featurize([mol]))
    valid_inds = np.array([1 if elt.size > 0 else 0 for elt in features],
                          dtype=bool)
    features = [
        elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
    ]
    return np.squeeze(np.array(features), axis=1), valid_inds
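Hypothetical usage, showing how valid_inds keeps ids aligned with the rows that featurized successfully (the featurizer choice and data are assumptions):

import deepchem as dc
import pandas as pd

featurizer = dc.feat.CircularFingerprint(size=1024)
df = pd.DataFrame({"smiles": ["CCO", "not-a-smiles", "c1ccccc1"],
                   "id": ["a", "b", "c"]})
X, valid_inds = featurize_smiles_df(df, featurizer, field="smiles")
ids = df["id"].values[valid_inds]  # drops rows whose featurization failed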
Example #9
 def featurize_shard(self, shard):
     """Featurizes a shard of an input dataframe."""
     log(
         "Currently featurizing feature_type: %s" %
         self.featurizer.__class__.__name__, self.verbose)
     return featurize_mol_df(shard, self.featurizer, field=self.mol_field)