Example #1
0
def uncurated_objects(y=None):
    """Load the uncurated blood-pressure dataset for testing.

    Args:
        y (list of str, optional): Response column names. Defaults to
            ["VALUE_NUM"] when not given.

    Returns:
        tuple: (parameter namespace, ModelDataset object, uncurated DataFrame)
    """
    # Bind the default inside the body to avoid a shared mutable default
    # argument (the old `y=["VALUE_NUM"]` list persisted across calls).
    if y is None:
        y = ["VALUE_NUM"]
    params_from_ds = parse.wrapper(currentdir + '/config_uncurated_bp.json')
    params_from_ds.response_cols = y
    featurization = feat.create_featurization(params_from_ds)
    data = model_dataset.create_model_dataset(params_from_ds, featurization)
    uncurated_df = data.load_full_dataset()
    return params_from_ds, data, uncurated_df
Example #2
0
def datastore_objects(y=None):
    """Load, featurize and split the CaV1.2 datastore dataset for testing.

    Args:
        y (list of str, optional): Response column names. Defaults to
            ["PIC50"] when not given.

    Returns:
        tuple: (parameter namespace, ModelDataset object, loaded DataFrame)
    """
    # Bind the default inside the body to avoid a shared mutable default
    # argument (the old `y=["PIC50"]` list persisted across calls).
    if y is None:
        y = ["PIC50"]
    params_from_ds = parse.wrapper(currentdir +
                                   '/config_datastore_dset_cav12.json')
    params_from_ds.response_cols = y
    featurization = feat.create_featurization(params_from_ds)
    data = model_dataset.create_model_dataset(params_from_ds, featurization)
    dset_df = data.load_full_dataset()
    data.get_featurized_data()
    data.split_dataset()
    return params_from_ds, data, dset_df
Example #3
0
 def split_and_save_dataset(self, assay_params):
     """Split a dataset, save the split, and record its split_uuid.

     Args:
         assay_params (dict): Dataset metadata. Updated in place with
             'previously_split' and 'split_uuid' after the split is saved.

     Returns:
         None
     """
     self.get_dataset_metadata(assay_params)
     # TODO: check usage with defaults
     namespace_params = parse.wrapper(assay_params)
     # TODO: Don't want to recreate each time
     featurization = feat.create_featurization(namespace_params)
     data = model_datasets.create_model_dataset(namespace_params,
                                                featurization)
     data.get_featurized_data()
     data.split_dataset()
     data.save_split_dataset()
     # Record the split in the caller's metadata dict so later runs reuse it.
     assay_params['previously_split'] = True
     assay_params['split_uuid'] = data.split_uuid
Example #4
0
def test_super_transform_dataset():
    """Exercise ModelWrapper.transform_dataset on ECFP-featurized data.

    Checks that the transformed dataset is still a DeepChem DiskDataset,
    that the feature matrix is unchanged (ECFP is not a descriptor
    featurization), and that response values stay aligned with the ids.

    Dependencies:
    model_dataset.create_transformers
    """
    # Build a regression NN model wrapper over the featurized dataset.
    inp_params = parse.wrapper(general_params)
    featurization = feat.create_featurization(inp_params)
    data_obj_ecfp = model_dataset.create_model_dataset(inp_params,
                                                       featurization,
                                                       ds_client=None)
    df_delaney = data_obj_ecfp.load_full_dataset()
    data_obj_ecfp.get_dataset_tasks(df_delaney)
    data_obj_ecfp.check_task_columns(df_delaney)
    data_obj_ecfp.get_featurized_data()
    mdl = model_wrapper.create_model_wrapper(inp_params,
                                             data_obj_ecfp.featurization)
    mdl.setup_model_dirs()
    mdl.create_transformers(data_obj_ecfp)
    transformed = mdl.transform_dataset(data_obj_ecfp.dataset)

    original = data_obj_ecfp.dataset
    checks = [
        # the transformed dataset must still be a DeepChem DiskDataset
        isinstance(transformed, DD),
        # non-descriptor featurization: X values must be untouched
        (transformed.X == original.X).all(),
        # response values keep their length and stay aligned with the ids
        len(transformed.y) == len(original.y),
        len(transformed.y) == len(transformed.ids),
    ]
    assert all(checks)
Example #5
0
def delaney_objects(y=None,
                    featurizer="ecfp",
                    split_strategy="train_valid_test",
                    splitter="random",
                    split_uuid=None):
    """Load, featurize and split the Delaney solubility dataset for testing.

    Args:
        y (list of str, optional): Response column names. Defaults to
            ["measured log solubility in mols per litre"] when not given.
        featurizer (str): Featurizer type to use.
        split_strategy (str): 'train_valid_test' or 'k_fold_cv'.
        splitter (str): Splitter type (e.g. 'random', 'scaffold').
        split_uuid (str, optional): If given, reuse this previously saved
            split instead of generating a new one.

    Returns:
        tuple: (parameter namespace, ModelDataset object, Delaney DataFrame)
    """
    # Bind the default inside the body to avoid a shared mutable default
    # argument (the old list default persisted across calls).
    if y is None:
        y = ["measured log solubility in mols per litre"]
    delaney_inp_file = currentdir + '/config_delaney.json'
    inp_params = parse.wrapper(delaney_inp_file)
    inp_params.response_cols = y
    inp_params.featurizer = featurizer
    inp_params.split_strategy = split_strategy
    inp_params.splitter = splitter
    if split_uuid is not None:
        inp_params.previously_split = True
        inp_params.split_uuid = split_uuid
    featurization = feat.create_featurization(inp_params)
    mdl = model_dataset.create_model_dataset(inp_params,
                                             featurization,
                                             ds_client=None)
    delaney_df = mdl.load_full_dataset()
    mdl.get_featurized_data()
    mdl.split_dataset()
    return inp_params, mdl, delaney_df
Example #6
0
def moe_descriptors(datastore=False):
    """Load the MAOA dataset configured for MOE descriptor featurization.

    Args:
        datastore (bool): If True, use the datastore config file; otherwise
            use the local-file config.

    Returns:
        tuple: (parameter namespace, ModelDataset object, loaded DataFrame)
    """
    # The original tested `datastore == True` twice and kept two parallel
    # params variables; a single branch selecting the config file suffices.
    if datastore:
        config_file = "/config_MAOA_moe_descriptors_ds.json"
    else:
        config_file = "/config_MAOA_moe_descriptors.json"
    params_desc = parse.wrapper(currentdir + config_file)
    featurization = feat.create_featurization(params_desc)
    dataset_obj_for_desc = model_dataset.create_model_dataset(params_desc,
                                                              featurization,
                                                              ds_client=None)
    df = dataset_obj_for_desc.load_full_dataset()
    return params_desc, dataset_obj_for_desc, df
Example #7
0
    def split_and_save_dataset(self, assay_params):
        """Split the dataset described by assay_params, persist the split,
        and stamp the resulting split_uuid back into the metadata.

        Args:
            assay_params (dict): Dataset metadata; updated in place with
                'previously_split' and 'split_uuid'.

        Returns:
            None
        """
        self.get_dataset_metadata(assay_params)
        # TODO: check usage with defaults
        params_ns = parse.wrapper(assay_params)
        # TODO: avoid recreating the featurization on every call
        featurizer_obj = feat.create_featurization(params_ns)
        dataset_obj = model_datasets.create_model_dataset(
            params_ns, featurizer_obj)
        dataset_obj.get_featurized_data()
        dataset_obj.split_dataset()
        dataset_obj.save_split_dataset()
        # Mark the metadata so downstream steps reuse this saved split.
        assay_params['previously_split'] = True
        assay_params['split_uuid'] = dataset_obj.split_uuid
def test_split_dataset_kfold_scaffold_from_pipeline(caplog):
    """Test k-fold scaffold splitting driven through the pipeline.

    Checks correct type and length of the datasets for a 3-fold split with a
    scaffold splitter, uniqueness of all validation sets, absence of overlap
    within each fold, and agreement between the pipeline's stored folds and
    the splitter's returned folds.
    """
    mp = utils.delaney_pipeline(featurizer="ecfp",
                                split_strategy="k_fold_cv",
                                splitter="scaffold")
    mp.featurization = feat.create_featurization(mp.params)
    mp.data = model_datasets.create_model_dataset(mp.params, mp.featurization,
                                                  mp.ds_client)
    mp.data.get_featurized_data()
    mp.data.split_dataset()
    splitter_k_fold_scaffold = mp.data.splitting
    splitter_k_fold_scaffold.num_folds = 3
    nf = splitter_k_fold_scaffold.num_folds

    data_obj_k_fold_scaffold = mp.data

    data_obj_k_fold_scaffold.split_dataset()
    train_valid, test, train_valid_attr, test_attr = splitter_k_fold_scaffold.split_dataset(
        data_obj_k_fold_scaffold.dataset, data_obj_k_fold_scaffold.attr,
        data_obj_k_fold_scaffold.params.smiles_col)
    # assert the stored folds match the folds returned by split_dataset
    test_list = []
    for kfoldindex in range(0, nf):
        test_list.append(
            (data_obj_k_fold_scaffold.train_valid_dsets[kfoldindex][0].X ==
             train_valid[kfoldindex][0].X).all())
        test_list.append(
            (data_obj_k_fold_scaffold.train_valid_dsets[kfoldindex][1].X ==
             train_valid[kfoldindex][1].X).all())
        # BUGFIX: this check was a copy-paste duplicate of the validation-set
        # ids check below; it now checks the training-set ids as intended.
        test_list.append(
            (data_obj_k_fold_scaffold.train_valid_dsets[kfoldindex][0].ids ==
             train_valid[kfoldindex][0].ids).all())
        test_list.append(
            (data_obj_k_fold_scaffold.train_valid_dsets[kfoldindex][1].ids ==
             train_valid[kfoldindex][1].ids).all())
        test_list.append(train_valid_attr[kfoldindex][0].equals(
            data_obj_k_fold_scaffold.train_valid_attr[kfoldindex][0]))
        test_list.append(train_valid_attr[kfoldindex][1].equals(
            data_obj_k_fold_scaffold.train_valid_attr[kfoldindex][1]))
    assert all(test_list)
    # assert no overlap of the k-fold validation sets between each other
    test_list = []
    concat_valid = [x[1].ids.tolist() for x in train_valid]
    concat_valid = sum(concat_valid, [])
    test_list.append(len(concat_valid) == len(set(concat_valid)))

    assert all(test_list)
    tv_split = []
    test_list = []
    # asserting that each k-fold split has no internal overlap.
    for kfoldindex in range(0, nf):
        current_tv_split = train_valid[kfoldindex][0].ids.tolist(
        ) + train_valid[kfoldindex][1].ids.tolist()
        test_list.append(
            len(train_valid[kfoldindex][0].ids) == len(train_valid[kfoldindex]
                                                       [0].y))
        test_list.append(
            len(train_valid[kfoldindex][1].ids) == len(train_valid[kfoldindex]
                                                       [1].y))
        current_full_dataset = sum([current_tv_split, test.ids.tolist()], [])
        test_list.append(
            len(current_full_dataset) == len(set(current_full_dataset)))
        test_list.append(
            set(train_valid[kfoldindex][0].ids.tolist()) == set(
                train_valid_attr[kfoldindex][0].index.tolist()))
        test_list.append(
            set(train_valid[kfoldindex][1].ids.tolist()) == set(
                train_valid_attr[kfoldindex][1].index.tolist()))
        # checking length of the validation set (should be length of the kv set/num_folds +/- 1)
        len_valid = round(len(current_tv_split) / nf)
        test_list.append(
            len_valid - 1 <= len(train_valid[kfoldindex][1]) <= len_valid + 1)
        tv_split.append(current_tv_split)

    # asserting that all k-fold train valid sets are equivalent
    test_list.append(
        set.intersection(*[set(l) for l in tv_split]) == set(tv_split[0]))
    # asserting that the test and test_attrs have the same index:
    test_list.append(set(test.ids.tolist()) == set(test_attr.index.tolist()))
    test_list.append(len(test.y) == len(test.ids))
    assert all(test_list)
    def return_split_uuid(self,
                          dataset_key,
                          bucket=None,
                          splitter=None,
                          split_combo=None,
                          retry_time=60):
        """
        Loads a dataset, splits it, saves it, and returns the split_uuid.

        Args:
            dataset_key: key for dataset to split
            bucket: datastore-specific user group bucket
            splitter: Type of splitter to use to split the dataset
            split_combo: tuple of form (split_valid_frac, split_test_frac)
            retry_time: seconds to sleep between datastore retries

        Returns:
            The split_uuid of the saved split, or None if the datastore
            could not be reached after repeated retries.
        """
        # Fall back to pipeline-level parameters for anything not supplied.
        if bucket is None:
            bucket = self.params.bucket
        if splitter is None:
            splitter = self.params.splitter
        if split_combo is None:
            split_valid_frac = self.params.split_valid_frac
            split_test_frac = self.params.split_test_frac
        else:
            split_valid_frac = split_combo[0]
            split_test_frac = split_combo[1]
        retry = True
        i = 0
        # TODO: need to catch if dataset doesn't exist versus 500 failure
        while retry:
            try:
                metadata = dsf.get_keyval(dataset_key=dataset_key,
                                          bucket=bucket)
                retry = False
            except Exception as e:
                if i < 5:
                    print(
                        "Could not get metadata from datastore for dataset %s because of exception %s, sleeping..."
                        % (dataset_key, e))
                    time.sleep(retry_time)
                    i += 1
                else:
                    print(
                        "Could not get metadata from datastore for dataset %s because of exception %s, exiting"
                        % (dataset_key, e))
                    return None
        assay_params = {
            'dataset_key': dataset_key,
            'bucket': bucket,
            'splitter': splitter,
            'split_valid_frac': split_valid_frac,
            'split_test_frac': split_test_frac
        }
        # Need a featurizer type to split the dataset, but since we only care
        # about getting the split_uuid, it does not matter which one is used.
        if isinstance(self.params.featurizer, list):
            assay_params['featurizer'] = self.params.featurizer[0]
        else:
            assay_params['featurizer'] = self.params.featurizer
        if 'id_col' in metadata.keys():
            assay_params['id_col'] = metadata['id_col']
        # Later metadata keys take precedence: param < response_col < response_cols.
        if 'response_cols' not in assay_params or assay_params[
                'response_cols'] is None:
            if 'param' in metadata.keys():
                assay_params['response_cols'] = [metadata['param']]
            if 'response_col' in metadata.keys():
                assay_params['response_cols'] = [metadata['response_col']]
            if 'response_cols' in metadata.keys():
                assay_params['response_cols'] = metadata['response_cols']
        if 'smiles_col' in metadata.keys():
            assay_params['smiles_col'] = metadata['smiles_col']
        if 'class_name' in metadata.keys():
            assay_params['class_name'] = metadata['class_name']
        if 'class_number' in metadata.keys():
            assay_params['class_number'] = metadata['class_number']
        # BUGFIX: rstrip('.csv') strips any trailing '.', 'c', 's' or 'v'
        # characters (e.g. 'abc.csv' -> 'ab'); remove the extension suffix
        # explicitly instead.
        dataset_name = assay_params['dataset_key'].split('/')[-1]
        if dataset_name.endswith('.csv'):
            dataset_name = dataset_name[:-len('.csv')]
        assay_params['dataset_name'] = dataset_name
        assay_params['datastore'] = True
        assay_params[
            'previously_featurized'] = self.params.previously_featurized
        # Descriptor settings are optional pipeline params; skip if absent.
        try:
            assay_params['descriptor_key'] = self.params.descriptor_key
            assay_params['descriptor_bucket'] = self.params.descriptor_bucket
        except AttributeError:
            pass
        # TODO: check usage with defaults
        namespace_params = parse.wrapper(assay_params)
        # TODO: Don't want to recreate each time
        featurization = feat.create_featurization(namespace_params)
        data = model_datasets.create_model_dataset(namespace_params,
                                                   featurization)
        retry = True
        i = 0
        while retry:
            try:
                data.get_featurized_data()
                data.split_dataset()
                data.save_split_dataset()
                return data.split_uuid
            except Exception as e:
                if i < 5:
                    # Note: the original message was copy-pasted from the
                    # metadata loop; this one reflects the actual failure.
                    print(
                        "Could not featurize/split dataset %s because of exception %s, sleeping"
                        % (dataset_key, e))
                    time.sleep(retry_time)
                    i += 1
                else:
                    print(
                        "Could not save split dataset for dataset %s because of exception %s"
                        % (dataset_key, e))
                    return None
Example #10
0
def analyze_split(params,
                  id_col='compound_id',
                  smiles_col='rdkit_smiles',
                  active_col='active'):
    """
    Evaluate the AVE bias for the training/validation and training/test set splits of the given dataset.

    Also show the active frequencies in each subset and for the dataset as a whole.
    id_col, smiles_col and active_col are defaults to be used in case they aren't found in the dataset metadata; if found
    the metadata values are used instead.

    Args:
        params (argparse.Namespace): Pipeline parameters.

        id_col (str): Dataset column containing compound IDs.

        smiles_col (str): Dataset column containing SMILES strings.

        active_col (str): Dataset column containing binary classifications.

    Returns:
        :obj:`pandas.DataFrame`: Table of split subsets showing sizes, numbers and fractions of active compounds

    """
    dset_key = params.dataset_key
    bucket = params.bucket
    split_uuid = params.split_uuid

    ds_client = dsf.config_client()
    try:
        split_metadata = dsf.search_datasets_by_key_value('split_dataset_uuid',
                                                          split_uuid,
                                                          ds_client,
                                                          operator='in',
                                                          bucket=bucket)
        split_oid = split_metadata['dataset_oid'].values[0]
        split_df = dsf.retrieve_dataset_by_dataset_oid(split_oid,
                                                       client=ds_client)
    except Exception as e:
        print("Error when loading split file:\n%s" % str(e))
        raise

    try:
        dataset_df = dsf.retrieve_dataset_by_datasetkey(dset_key,
                                                        bucket,
                                                        client=ds_client)
        dataset_meta = dsf.retrieve_dataset_by_datasetkey(dset_key,
                                                          bucket,
                                                          client=ds_client,
                                                          return_metadata=True)
    except Exception as e:
        print("Error when loading dataset:\n%s" % str(e))
        raise
    # Prefer column names recorded in the dataset metadata over the defaults.
    kv_dict = dsf.get_key_val(dataset_meta['metadata'])
    id_col = kv_dict.get('id_col', id_col)
    smiles_col = kv_dict.get('smiles_col', smiles_col)
    active_col = kv_dict.get('response_col', active_col)

    try:
        print('Dataset has %d unique compound IDs' %
              len(set(dataset_df[id_col].values)))
        print('Split table has %d unique compound IDs' %
              len(set(split_df.cmpd_id.values)))

        dset_df = dataset_df.merge(split_df,
                                   how='inner',
                                   left_on=id_col,
                                   right_on='cmpd_id').drop('cmpd_id', axis=1)
    except Exception as e:
        print("Error when joining dataset with split dataset:\n%s" % str(e))
        raise

    featurization = feat.create_featurization(params)
    data = md.create_model_dataset(params, featurization, ds_client)
    data.get_featurized_data()
    feat_arr = data.dataset.X
    # TODO: impute missing values if necessary
    y = data.dataset.y.flatten()
    if len(set(y) - set([0, 1])) > 0:
        raise ValueError(
            'AVEMinSplitter only works on binary classification datasets')
    ids = data.dataset.ids
    active_ind = np.where(y == 1)[0]
    inactive_ind = np.where(y == 0)[0]
    active_feat = feat_arr[active_ind, :]
    inactive_feat = feat_arr[inactive_ind, :]
    num_active = len(active_ind)
    num_inactive = len(inactive_ind)
    active_ids = ids[active_ind]
    inactive_ids = ids[inactive_ind]
    active_id_ind = dict(zip(active_ids, range(len(active_ids))))
    inactive_id_ind = dict(zip(inactive_ids, range(len(inactive_ids))))
    if params.featurizer == 'ecfp':
        metric = 'jaccard'
    elif params.featurizer == 'graphconv':
        # BUGFIX: corrected typo 'dopesn't' in the error message
        raise ValueError(
            "ave_min splitter doesn't support graphconv features")
    else:
        metric = 'euclidean'

    # Calculate distance thresholds where nearest neighbor function should be evaluated
    if metric == 'jaccard':
        max_nn_dist = 1.0
    else:
        # Impute NaN features with the column mean before computing distances.
        nan_mat = np.isnan(feat_arr)
        nnan = np.sum(nan_mat)
        if nnan > 0:
            log.info('Input feature matrix has %d NaN elements' % nnan)
            not_nan = ~nan_mat
            for i in range(feat_arr.shape[1]):
                feat_arr[nan_mat[:, i], i] = np.mean(feat_arr[not_nan[:, i],
                                                              i])
        nn_dist = np.sort(squareform(pdist(feat_arr, metric)))[:, 1]
        med_nn_dist = np.median(nn_dist)
        max_nn_dist = 3.0 * med_nn_dist
    ndist = 100
    dist_thresh = np.linspace(0.0, max_nn_dist, ndist)

    # Compute distance matrices between subsets
    num_workers = 1
    aa_dist = _calc_dist_mat(active_feat, active_feat, metric, None,
                             num_workers)
    ii_dist = _calc_dist_mat(inactive_feat, inactive_feat, metric, None,
                             num_workers)
    ai_dist = _calc_dist_mat(active_feat, inactive_feat, metric, None,
                             num_workers)
    ia_dist = ai_dist.transpose()

    subsets = sorted(set(dset_df.subset.values))
    subset_active_ind = {}
    subset_inactive_ind = {}

    if 'train' in subsets:
        # this is a TVT split
        subsets = ['train', 'valid', 'test']
        for subset in subsets:
            subset_df = dset_df[dset_df.subset == subset]
            active_df = subset_df[subset_df[active_col] == 1]
            inactive_df = subset_df[subset_df[active_col] == 0]
            subset_active_ids = active_df[id_col].values
            subset_inactive_ids = inactive_df[id_col].values
            subset_active_ind[subset] = [
                active_id_ind[id] for id in subset_active_ids
            ]
            subset_inactive_ind[subset] = [
                inactive_id_ind[id] for id in subset_inactive_ids
            ]

        taI = subset_active_ind['train']
        tiI = subset_inactive_ind['train']
        print("Results for %s split with %s %s features:" %
              (params.splitter, params.descriptor_type, params.featurizer))
        for valid_set in ['valid', 'test']:
            vaI = subset_active_ind[valid_set]
            viI = subset_inactive_ind[valid_set]
            split_params = ((vaI, viI, taI, tiI), aa_dist, ii_dist, ai_dist,
                            ia_dist, dist_thresh)
            _plot_nn_dist_distr(split_params)
            bias = _plot_bias(split_params, niter=0)
            print("For train/%s split: AVE bias = %.5f" % (valid_set, bias))
    else:
        # TODO: deal with k-fold splits later
        print('k-fold CV splits not supported yet')
        return

    # Tabulate the fractions of actives in the full dataset and each subset
    subset_list = []
    size_list = []
    frac_list = []
    active_frac_list = []

    dset_size = data.dataset.X.shape[0]
    dset_active = sum(data.dataset.y)
    subset_list.append('full dataset')
    size_list.append(dset_size)
    frac_list.append(1.0)
    active_frac_list.append(dset_active / dset_size)

    for subset in subsets:
        active_size = len(subset_active_ind[subset])
        inactive_size = len(subset_inactive_ind[subset])
        subset_size = active_size + inactive_size
        active_frac = active_size / subset_size
        subset_list.append(subset)
        size_list.append(subset_size)
        frac_list.append(subset_size / dset_size)
        active_frac_list.append(active_frac)
    frac_df = pd.DataFrame(
        dict(subset=subset_list,
             size=size_list,
             fraction=frac_list,
             active_frac=active_frac_list))
    print('\nSplit subsets:')
    print(frac_df)

    return frac_df
Example #11
0
def test_super_create_transformers():
    """Exercise ModelWrapper.create_transformers for a regression NN model.

    Checks that a NormalizationTransformer is built for the response values,
    that no feature transformers are created for this featurization, that the
    transformer pickle is written to the model output directory, and that a
    classification wrapper gets no transformers at all.

    Dependencies:
    create_featurization
    create_model_dataset
    model_dataset.load_full_dataset
    model_dataset.get_dataset_tasks
    model_dataset.check_task_columns
    model_dataset.get_featurized_data
    Requires (self.params.prediction_type == 'regression' and self.params.transformers == True) or len(self.transformers) > 0

    Calls:
    self.featurization.create_feature_transformer
    dsf.upload_pickle_to_DS
    """
    # Build a regression NN model wrapper over the featurized dataset.
    inp_params = parse.wrapper(general_params)
    featurization = feat.create_featurization(inp_params)
    data_obj_ecfp = model_dataset.create_model_dataset(inp_params,
                                                       featurization,
                                                       ds_client=None)
    df_delaney = data_obj_ecfp.load_full_dataset()
    data_obj_ecfp.get_dataset_tasks(df_delaney)
    data_obj_ecfp.check_task_columns(df_delaney)
    data_obj_ecfp.get_featurized_data()
    mdl = model_wrapper.create_model_wrapper(inp_params,
                                             data_obj_ecfp.featurization)
    mdl.setup_model_dirs()

    checks = []
    # the wrapper should be configured for NN regression
    checks.append(mdl.params.prediction_type == 'regression')
    checks.append(mdl.params.model_type == 'NN')
    mdl.create_transformers(data_obj_ecfp)
    # a response NormalizationTransformer is created; no feature transformers
    checks.append(
        isinstance(mdl.transformers[0],
                   dc.trans.transformers.NormalizationTransformer))
    checks.append(mdl.transformers_x == [])
    # the transformer pickle is saved into the model output directory
    checks.append(
        os.path.isfile(os.path.join(mdl.output_dir, 'transformers.pkl')))

    # TODO: test proper saving of the transformer to the datastore

    # TODO: test when transformers is False:
    inp_params.prediction_type = 'classification'
    mdl = model_wrapper.create_model_wrapper(inp_params, featurization)
    checks.append(mdl.transformers == [])
    checks.append(mdl.transformers_x == [])
    assert all(checks)