Example #1
# Module-level imports assumed by this snippet (not shown in the excerpt).
import os

import deepchem
import deepchem as dc
from deepchem.data import DiskDataset


def load_tox21(featurizer='ECFP', split='index'):
    """Load Tox21 datasets, featurize, and return train/valid/test splits."""
    # Featurize Tox21 dataset
    print("About to featurize Tox21 dataset.")
    current_dir = os.path.dirname(os.path.realpath(__file__))

    dataset_file = os.path.join(current_dir, "../../datasets/tox21.csv.gz")
    data_dir = deepchem.utils.get_data_dir()

    tox21_tasks = [
        'NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
        'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'
    ]

    dataset_dir = os.path.join(data_dir, "tox21", featurizer, split)
    train, valid, test = os.path.join(dataset_dir, 'train'), os.path.join(
        dataset_dir, 'valid'), os.path.join(dataset_dir, 'test')
    if os.path.isdir(dataset_dir):
        train, valid, test = DiskDataset(data_dir=train), DiskDataset(
            data_dir=valid), DiskDataset(data_dir=test)
        transformers = [
            dc.trans.BalancingTransformer(transform_w=True, dataset=train)
        ]
        return tox21_tasks, (train, valid, test), transformers
    if featurizer == 'ECFP':
        featurizer_func = dc.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
        featurizer_func = dc.feat.ConvMolFeaturizer()
    elif featurizer == 'AdjMatrix':
        featurizer_func = dc.feat.AdjacencyFingerprint(num_atoms_feature=True)
    else:
        raise ValueError("Unsupported featurizer: %s" % featurizer)
    loader = dc.data.CSVLoader(tasks=tox21_tasks,
                               smiles_field="smiles",
                               featurizer=featurizer_func)
    dataset = loader.featurize(dataset_file, shard_size=8192)

    # Initialize transformers
    transformers = [
        dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
    ]

    print("About to transform data")
    for transformer in transformers:
        dataset = transformer.transform(dataset)

    splitters = {
        'index': dc.splits.IndexSplitter(),
        'random': dc.splits.RandomSplitter(),
        'scaffold': dc.splits.ScaffoldSplitter(),
        'butina': dc.splits.ButinaSplitter()
    }
    splitter = splitters[split]
    train, valid, test = splitter.train_valid_test_split(dataset,
                                                         train_dir=train,
                                                         valid_dir=valid,
                                                         test_dir=test)

    return tox21_tasks, (train, valid, test), transformers
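
A minimal usage sketch for the loader above; the featurizer/split values are examples, and the module-level imports shown at the top of the snippet are assumed:

# Hypothetical driver for load_tox21.
tasks, (train, valid, test), transformers = load_tox21(featurizer='ECFP',
                                                       split='scaffold')
print("Tasks:", tasks)
print("Split sizes:", len(train), len(valid), len(test))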
Example #2
  def _to_singletask(dataset, task_dirs):
    """Transforms a multitask dataset to a collection of singletask datasets."""
    tasks = dataset.get_task_names()
    assert len(tasks) == len(task_dirs)
    log("Splitting multitask dataset into singletask datasets", dataset.verbosity)
    task_metadata_rows = {task: [] for task in tasks}
    for shard_num, (X, y, w, ids) in enumerate(dataset.itershards()):
      log("Processing shard %d" % shard_num, dataset.verbosity)
      basename = "dataset-%d" % shard_num
      for task_num, task in enumerate(tasks):
        log("\tTask %s" % task, dataset.verbosity)
        w_task = w[:, task_num]
        y_task = y[:, task_num]

        # Extract those datapoints which are present for this task
        X_nonzero = X[w_task != 0]
        num_datapoints = X_nonzero.shape[0]
        y_nonzero = np.reshape(y_task[w_task != 0], (num_datapoints, 1))
        w_nonzero = np.reshape(w_task[w_task != 0], (num_datapoints, 1))
        ids_nonzero = ids[w_task != 0]

        if X_nonzero.size > 0: 
          task_metadata_rows[task].append(
            DiskDataset.write_data_to_disk(
                task_dirs[task_num], basename, [task],
                X_nonzero, y_nonzero, w_nonzero, ids_nonzero))
    
    task_datasets = [
        DiskDataset(data_dir=task_dirs[task_num],
                    metadata_rows=task_metadata_rows[task],
                    verbosity=dataset.verbosity)
        for (task_num, task) in enumerate(tasks)]
    return task_datasets
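
A sketch of how this helper might be driven, assuming a multitask DiskDataset bound to the name dataset and one scratch directory per task (the directory handling here is illustrative):

import tempfile

# Hypothetical driver: one output directory per task, then split.
task_dirs = [tempfile.mkdtemp() for _ in dataset.get_task_names()]
singletask_datasets = _to_singletask(dataset, task_dirs)
for task, task_dataset in zip(dataset.get_task_names(), singletask_datasets):
    print(task, task_dataset.data_dir)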
Example #3
  def load_dataset(
      self, name: str, reload: bool
  ) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
    """Load the dataset.

    Parameters
    ----------
    name: str
      the name of the dataset, used to identify the directory on disk
    reload: bool
      if True, the first call for a particular featurizer and splitter will cache
      the datasets to disk, and subsequent calls will reload the cached datasets.
    """
    # Build the path to the dataset on disk.

    featurizer_name = str(self.featurizer)
    splitter_name = 'None' if self.splitter is None else str(self.splitter)
    save_folder = os.path.join(self.save_dir, name + "-featurized",
                               featurizer_name, splitter_name)
    if len(self.transformers) > 0:
      transformer_name = '_'.join(
          t.get_directory_name() for t in self.transformers)
      save_folder = os.path.join(save_folder, transformer_name)

    # Try to reload cached datasets.

    if reload:
      if self.splitter is None:
        if os.path.exists(save_folder):
          transformers = dc.utils.data_utils.load_transformers(save_folder)
          return self.tasks, (DiskDataset(save_folder),), transformers
      else:
        loaded, all_dataset, transformers = dc.utils.data_utils.load_dataset_from_disk(
            save_folder)
        if all_dataset is not None:
          return self.tasks, all_dataset, transformers

    # Create the dataset

    logger.info("About to featurize %s dataset." % name)
    dataset = self.create_dataset()

    # Split and transform the dataset.

    if self.splitter is None:
      transformer_dataset: Dataset = dataset
    else:
      logger.info("About to split dataset with {} splitter.".format(
          self.splitter.__class__.__name__))
      train, valid, test = self.splitter.train_valid_test_split(dataset)
      transformer_dataset = train
    transformers = [
        t.create_transformer(transformer_dataset) for t in self.transformers
    ]
    logger.info("About to transform data.")
    if self.splitter is None:
      for transformer in transformers:
        dataset = transformer.transform(dataset)
      if reload and isinstance(dataset, DiskDataset):
        dataset.move(save_folder)
        dc.utils.data_utils.save_transformers(save_folder, transformers)
      return self.tasks, (dataset,), transformers

    for transformer in transformers:
      train = transformer.transform(train)
      valid = transformer.transform(valid)
      test = transformer.transform(test)
    if reload and isinstance(train, DiskDataset) and isinstance(
        valid, DiskDataset) and isinstance(test, DiskDataset):
      dc.utils.data_utils.save_dataset_to_disk(save_folder, train, valid, test,
                                               transformers)
    return self.tasks, (train, valid, test), transformers
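
This method sits behind DeepChem's MolNet loaders; in recent DeepChem versions a typical call through that public layer looks like the sketch below (the dataset choice and keyword values are examples):

import deepchem as dc

# dc.molnet.load_tox21 builds a loader object and calls load_dataset() internally.
tasks, (train, valid, test), transformers = dc.molnet.load_tox21(
    featurizer='ECFP', splitter='random', reload=True)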
Example #4
def load_gpcr(dataset_file,
              featurizer='ECFP',
              transformers=True,
              reload=True,
              sep='OnePositiveSplit',
              K=5,
              # The frac_* defaults below are assumptions; the original code
              # referenced frac_train/frac_valid/frac_test without defining them.
              frac_train=0.8,
              frac_valid=0.1,
              frac_test=0.1):
    #    data_dir=os.path.dirname(dataset_file)

    save_dir = os.path.join(
        os.path.dirname(dataset_file),
        '.'.join(os.path.basename(dataset_file).split('.')[:-1]), "ecfp",
        "split")
    train, valid, test = os.path.join(save_dir, 'train'), os.path.join(
        save_dir, 'valid'), os.path.join(save_dir, 'test')
    # Read the CSV header to recover the task (column) names.
    with open(dataset_file, "r") as fopen:
        header = fopen.readline()
    m = header.strip('\n').split(',')
    m.remove('SMILES')
    if os.path.isdir(save_dir):
        if reload:
            dataset, train_dataset, valid_dataset, test_dataset = DiskDataset(
                data_dir=save_dir), DiskDataset(data_dir=train), DiskDataset(
                    data_dir=valid), DiskDataset(data_dir=test)
            transformers = [
                deepchem.trans.NormalizationTransformer(transform_w=True,
                                                        dataset=train_dataset)
            ]
            all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
            return m, all_dataset, transformers
    if featurizer == 'ECFP':
        featurizer = deepchem.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
        featurizer = deepchem.feat.ConvMolFeaturizer()
    elif featurizer == 'Weave':
        featurizer = deepchem.feat.WeaveFeaturizer()
    elif featurizer == 'Raw':
        featurizer = deepchem.feat.RawFeaturizer()
    elif featurizer == 'AdjacencyConv':
        featurizer = deepchem.feat.AdjacencyFingerprint(max_n_atoms=150,
                                                        max_valence=6)
    elif featurizer == 'SelfDefine':
        # NOTE: feature_field is not defined in this function; it must be
        # supplied by the enclosing module for this branch to work.
        featurizer = deepchem.feat.UserDefinedFeaturizer(feature_field)
    loader = deepchem.data.CSVLoader(tasks=m,
                                     smiles_field="SMILES",
                                     featurizer=featurizer)
    dataset = loader.featurize(dataset_file,
                               data_dir=save_dir,
                               shard_size=8192)
    #    dataset = loader.featurize(dataset_file, shard_size=8192)
    # Initialize transformers
    if transformers:
        transformers = [
            deepchem.trans.NormalizationTransformer(transform_w=True,
                                                    dataset=dataset)
        ]
        for transformer in transformers:
            dataset = transformer.transform(dataset)
    splitters = {
        'index': deepchem.splits.IndexSplitter(),
        'random': deepchem.splits.RandomSplitter(),
        'random_stratified': deepchem.splits.RandomStratifiedSplitter(),
        'scaffold': deepchem.splits.ScaffoldSplitter(),
        'butina': deepchem.splits.ButinaSplitter(),
        'task': deepchem.splits.TaskSplitter(),
        'Harmonious_positive': Harmonious_positive(),
        'OnePositiveSplit': OnePositiveSplit()
    }
    splitter = splitters[sep]
    if sep == 'task':
        fold_datasets = splitter.k_fold_split(dataset, K)
        all_dataset = fold_datasets
    elif sep == 'Harmonious_positive':
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset)
        train_dataset = DiskDataset.from_numpy(train_dataset.X,
                                               train_dataset.y,
                                               train_dataset.w,
                                               train_dataset.ids,
                                               dataset.tasks,
                                               data_dir=train)
        valid_dataset = DiskDataset.from_numpy(valid_dataset.X,
                                               valid_dataset.y,
                                               valid_dataset.w,
                                               valid_dataset.ids,
                                               dataset.tasks,
                                               data_dir=valid)
        test_dataset = DiskDataset.from_numpy(test_dataset.X,
                                              test_dataset.y,
                                              test_dataset.w,
                                              test_dataset.ids,
                                              dataset.tasks,
                                              data_dir=test)
        all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
    # NOTE: this branch is unreachable as written, because the preceding
    # elif already matches sep == 'Harmonious_positive'.
    elif sep == 'Harmonious_positive' and K:
        #        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        #                                dataset,
        #                                frac_train=frac_train,
        #                                frac_valid=0,
        #                                frac_test=1- frac_train,
        #                                )
        #        train_dataset = DiskDataset.from_numpy(train_dataset.X,train_dataset.y,train_dataset.w,train_dataset.ids,
        #                                               dataset.tasks,data_dir=train)
        #        train_dataset.reshard(8192)
        #        test_dataset  = DiskDataset.from_numpy(test_dataset.X,test_dataset.y,test_dataset.w,test_dataset.ids,
        #                                               dataset.tasks,data_dir=test)
        #        test_dataset.reshard(8192)
        #        fold_dataset = splitter.k_fold_split(
        #                train_dataset, K, directories=[os.path.join(valid,str(i)) for i in range(K)],verbose=True)
        fold_dataset = splitter.k_fold_split(
            dataset,
            K,
            directories=[os.path.join(valid, str(i)) for i in range(K)],
            verbose=True)
        folds = []
        for i in range(K):
            print('merge fold dataset {}...'.format(i))
            train_fold = DiskDataset.merge(
                [fold_dataset[j] for j in range(K) if j != i],
                merge_dir=os.path.join(valid, str(i), 'train_fold'))
            test_fold = DiskDataset.merge([fold_dataset[i]],
                                          merge_dir=os.path.join(
                                              valid, str(i), 'valid_fold'))
            folds.append([train_fold, test_fold])
        all_dataset = (dataset, [], folds, [])
    else:
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset,
            train_dir=train,
            valid_dir=valid,
            test_dir=test,
            frac_train=frac_train,
            frac_valid=frac_valid,
            frac_test=frac_test)
        all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)

#    else:
#        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset,train_dir=train, valid_dir=valid, test_dir=test)
#        all_dataset = (dataset,train_dataset, valid_dataset, test_dataset)
#    if reload:
#        deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,transformers)
    return m, all_dataset, transformers
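
A minimal usage sketch, assuming a local CSV with a SMILES header column and the custom Harmonious_positive / OnePositiveSplit splitters importable in the same module (the file name and argument values are illustrative):

# Hypothetical call: featurize a GPCR activity table and split it into K task folds.
tasks, fold_datasets, transformers = load_gpcr("gpcr_activity.csv",
                                               featurizer='ECFP',
                                               sep='task',
                                               K=5)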
Example #5
def load_tf(samp_num=0,
            reload=True,
            split='random',
            frac_train_and_valid=0.9,
            data_time=10,
            data_num=1,
            data_dir='/home/hdd2/lifei/sam/script/tmm/data'):
    tf_tasks = ['values']
    dataset_file = (data_dir + "/fingerprint_" + str(samp_num) + '.csv')

    save_dir = os.path.join(
        os.path.dirname(dataset_file),
        '.'.join(os.path.basename(dataset_file).split('.')[:-1]))
    dataset_dir, test_dir = os.path.join(save_dir, 'dataset'), os.path.join(
        save_dir, 'test')

    if os.path.isdir(save_dir):
        if reload:
            dataset, train_dataset, valid_dataset, test_dataset = DiskDataset(
                data_dir=dataset_dir), DiskDataset(
                    data_dir=os.path.join(save_dir, (
                        'train_vaild_' +
                        str(data_num)), 'train')), DiskDataset(
                            data_dir=os.path.join(save_dir, (
                                'train_vaild_' +
                                str(data_num)), 'valid')), DiskDataset(
                                    data_dir=test_dir)
            all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
            return all_dataset
    else:
        print("About to featurize TF dataset.")
        # tf_descriptors (the user-defined feature columns) is expected to be
        # defined elsewhere in the module.
        featurizer = dc.feat.UserDefinedFeaturizer(tf_descriptors)
        loader = dc.data.UserCSVLoader(tasks=tf_tasks,
                                       id_field="compounds",
                                       featurizer=featurizer)
        if not os.path.exists(dataset_dir):
            os.makedirs(dataset_dir)
        dataset = loader.featurize(dataset_file,
                                   data_dir=dataset_dir,
                                   shard_size=8192)
        splitters = {
            'index': dc.splits.IndexSplitter(),
            'random': dc.splits.RandomSplitter(),
            'scaffold': dc.splits.ScaffoldSplitter()
        }
        splitter = splitters[split]
        if not os.path.exists(test_dir):
            os.makedirs(test_dir)
        train_valid, test_dataset = splitter.train_test_split(
            dataset, test_dir=test_dir, frac_train=frac_train_and_valid)
        test_df = pd.DataFrame(test_dataset.ids)
        test_df.to_csv(os.path.join(test_dir, 'test.csv'))
        for i in range(data_time):
            train_valid_dir = os.path.join(save_dir,
                                           ('train_vaild_' + str(i + 1)))
            train_dir, valid_dir = os.path.join(train_valid_dir,
                                                'train'), os.path.join(
                                                    train_valid_dir, 'valid')
            # Use a separate loop variable so the outer loop index i is not shadowed.
            for d in (train_dir, valid_dir):
                if not os.path.exists(d):
                    os.makedirs(d)
            train_dataset_t, valid_dataset_t = splitter.train_test_split(
                train_valid,
                train_dir=train_dir,
                test_dir=valid_dir,
                frac_train=8.0 / 9)
            train_df, valid_df = pd.DataFrame(
                train_dataset_t.ids), pd.DataFrame(valid_dataset_t.ids)
            train_df.to_csv(train_dir + '/train.csv')
            valid_df.to_csv(valid_dir + '/valid.csv')
        train_dataset, valid_dataset = DiskDataset(
            data_dir=os.path.join(save_dir, (
                'train_vaild_' + str(data_num)), 'train')), DiskDataset(
                    data_dir=os.path.join(save_dir, ('train_vaild_' +
                                                     str(data_num)), 'valid'))
        all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
        return all_dataset
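
A usage sketch, assuming the fingerprint CSV exists under the hard-coded data_dir and tf_descriptors is defined in the module (argument values are examples):

# Hypothetical call: load sample 0 and use the first resampled train/valid pair.
dataset, train_dataset, valid_dataset, test_dataset = load_tf(samp_num=0,
                                                              split='random',
                                                              data_num=1)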
Example #6
  def featurize(self, input_files, data_dir=None, shard_size=8192,
                num_shards_per_batch=24, worker_pool=None,
                logging=True, debug=False):
    """Featurize provided files and write to specified location."""
    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    log("Loading raw samples now.", self.verbosity)
    log("shard_size: %d" % shard_size, self.verbosity)
    log("num_shards_per_batch: %d" % num_shards_per_batch, self.verbosity)

    # Allow users to specify a single file for featurization
    if not isinstance(input_files, list):
      input_files = [input_files]

    if data_dir is not None:
      if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    else:
      data_dir = tempfile.mkdtemp()

    # Construct partial function to write datasets.
    if not len(input_files):
      return None
    input_type = get_input_type(input_files[0])

    if logging:
      mp.log_to_stderr()
    if worker_pool is None:
      if logging:
        worker_pool = LoggingPool(processes=1)
      else:
        worker_pool = mp.Pool(processes=1)
    log("Spawning workers now.", self.verbosity)
    metadata_rows = []
    def wrap_with_shard_metadata(iterator):
      for item in iterator:
        yield ((self, shard_size, input_type, data_dir), item)
    data_iterator = wrap_with_shard_metadata(
        enumerate(load_data(input_files, shard_size, self.verbosity)))
    # Turns out python map is terrible and exhausts the generator as given.
    # Solution seems to be to manually pull out N elements from the iterator,
    # then to map on only those N elements. BLECH. Python should do a better
    # job here.
    num_batches = 0
    ############################################################## TIMING
    time2 = time.time()
    log("TIMING: pre-map featurization took %0.3f s" % (time2-time1))
    ############################################################## TIMING
    while True:
      log("About to start processing next batch of shards", self.verbosity)
      ############################################################## TIMING
      time1 = time.time()
      ############################################################## TIMING
      iterator = itertools.islice(data_iterator, num_shards_per_batch)
      if not debug:
        batch_metadata = worker_pool.map(
            featurize_map_function, iterator)
      else:
        batch_metadata = []
        for elt in iterator:
          batch_metadata.append(featurize_map_function(elt))
      ############################################################## TIMING
      time2 = time.time()
      log("TIMING: map call on batch took %0.3f s" % (time2-time1),
           self.verbosity)
      ############################################################## TIMING
      if batch_metadata:
        metadata_rows.extend([elt for elt in batch_metadata if elt is not None])
        num_batches += 1
        log("Featurized %d datapoints\n"
            % (shard_size * num_shards_per_batch * num_batches), self.verbosity)
      else:
        break
    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING

    # TODO(rbharath): This whole bit with metadata_rows is an awkward way of
    # creating a Dataset. Is there a more elegant solutions?
    dataset = DiskDataset(data_dir=data_dir,
                          metadata_rows=metadata_rows,
                          reload=True, verbosity=self.verbosity)
    ############################################################## TIMING
    time2 = time.time()
    print("TIMING: dataset construction took %0.3f s" % (time2-time1),
          self.verbosity)
    ############################################################## TIMING
    return dataset
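
A sketch of how this legacy featurize() method might be invoked, assuming loader is an instance of an old-style DataLoader subclass already configured with a featurizer (file and directory paths are illustrative):

# Hypothetical driver for the legacy loader API above.
dataset = loader.featurize(["molecules.csv"],
                           data_dir="/tmp/featurized",
                           shard_size=4096,
                           num_shards_per_batch=8,
                           debug=True)  # debug=True featurizes shards serially in-process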