def test_create_model_wrapper():
    """
        Args:
        params (Namespace) : Parameters passed to the model pipeline
                     featurizer (Featurization): Object managing the featurization of compounds
                                 ds_client (DatastoreClient): Interface to the file datastore

                                                  Returns:
                                                  model (pipeline.Model): Wrapper for DeepChem, sklearn or other model.

                                                              Raises:
ValueError: Only params.model_type = 'NN' or 'RF' is supported. 

Dependencies:
None

Calls:
DCNNModelWrapper, DCRFModelWrapper
    """
    inp_params = parse.wrapper(general_params)
    featurization = feat.create_featurization(inp_params)
    mdl = model_wrapper.create_model_wrapper(inp_params, featurization)
    mdl.setup_model_dirs()
    # testing for correct attribute initialization with model_type == "NN"
    test = []
    test.append(mdl.params.model_type == 'NN')
    test.append(isinstance(mdl.featurization, feat.DynamicFeaturization))
    test.append(mdl.output_dir == inp_params.output_dir)
    test.append(mdl.model_dir == inp_params.output_dir + '/' + 'model')
    test.append(mdl.best_model_dir == inp_params.output_dir + '/' +
                'best_model')
    test.append(mdl.baseline_model_dir == inp_params.output_dir + '/' +
                'baseline_epoch_model')
    test.append(mdl.transformers == [])
    test.append(mdl.transformers_x == [])
    test.append(isinstance(mdl, model_wrapper.DCNNModelWrapper))

    # testing for correct attribute initialization with model_type == "RF"
    temp_params = copy.deepcopy(inp_params)
    temp_params.model_type = 'RF'
    featurization = feat.create_featurization(temp_params)
    mdl_RF = model_wrapper.create_model_wrapper(temp_params, featurization)
    test.append(isinstance(mdl_RF, MP.model_wrapper.DCRFModelWrapper))
    test.append(mdl_RF.params.model_type == 'RF')

    # assertion for all tests
    assert all(test)

    # testing that ValueError is raised when model_type is not in ['NN', 'RF']
    with pytest.raises(ValueError):
        temp_params.model_type = 'wrong'
        model_wrapper.create_model_wrapper(temp_params, featurization)
def uncurated_objects(y=["VALUE_NUM"]):
    params_from_ds = parse.wrapper(currentdir + '/config_uncurated_bp.json')
    params_from_ds.response_cols = y
    featurization = feat.create_featurization(params_from_ds)
    data = model_dataset.create_model_dataset(params_from_ds, featurization)
    uncurated_df = data.load_full_dataset()
    return params_from_ds, data, uncurated_df
def datastore_objects(y=["PIC50"]):
    params_from_ds = parse.wrapper(currentdir +
                                   '/config_datastore_dset_cav12.json')
    params_from_ds.response_cols = y
    featurization = feat.create_featurization(params_from_ds)
    data = model_dataset.create_model_dataset(params_from_ds, featurization)
    dset_df = data.load_full_dataset()
    data.get_featurized_data()
    data.split_dataset()
    return params_from_ds, data, dset_df
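
# A minimal sketch of how a test might consume the helpers above (hypothetical
# test name; assumes the PIC50 response column configured in the JSON file and
# a non-empty dataset):
def test_datastore_objects_load():
    params_from_ds, data, dset_df = datastore_objects()
    assert params_from_ds.response_cols == ["PIC50"]
    assert len(dset_df) > 0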
def test_super_transform_dataset():
    """
    Args:
    dataset (DiskDataset): The DeepChem dataset to be transformed

    Returns:
    transformed_dataset: The transformed DeepChem dataset

    Raises:
    None

    Dependencies:
    model_dataset.create_transformers

    Calls:
    None

    """
    # set up for a model wrapper with regression and NN.
    inp_params = parse.wrapper(general_params)
    featurization = feat.create_featurization(inp_params)
    data_obj_ecfp = model_dataset.create_model_dataset(inp_params,
                                                       featurization,
                                                       ds_client=None)
    df_delaney = data_obj_ecfp.load_full_dataset()
    data_obj_ecfp.get_dataset_tasks(df_delaney)
    data_obj_ecfp.check_task_columns(df_delaney)
    data_obj_ecfp.get_featurized_data()
    mdl = model_wrapper.create_model_wrapper(inp_params,
                                             data_obj_ecfp.featurization)
    mdl.setup_model_dirs()
    mdl.create_transformers(data_obj_ecfp)
    dataset = mdl.transform_dataset(data_obj_ecfp.dataset)

    test = []
    # checking that the dataset is the correct type
    test.append(isinstance(dataset, DD))
    # since this is not descriptor featurization, the X values for the datasets should be the same
    test.append((dataset.X == data_obj_ecfp.dataset.X).all())
    # and the response values should be the same length:
    test.append(len(dataset.y) == len(data_obj_ecfp.dataset.y))
    test.append(len(dataset.y) == len(dataset.ids))
    assert all(test)
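
# A further check one could add here (hypothetical; assumes the default
# regression NormalizationTransformer z-scores y while leaving X untouched):
#     import numpy as np
#     assert not np.allclose(dataset.y, data_obj_ecfp.dataset.y)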
def delaney_objects(y=["measured log solubility in mols per litre"],
                    featurizer="ecfp",
                    split_strategy="train_valid_test",
                    splitter="random",
                    split_uuid=None):
    delaney_inp_file = currentdir + '/config_delaney.json'
    inp_params = parse.wrapper(delaney_inp_file)
    inp_params.response_cols = y
    inp_params.featurizer = featurizer
    inp_params.split_strategy = split_strategy
    inp_params.splitter = splitter
    if split_uuid is not None:
        inp_params.previously_split = True
        inp_params.split_uuid = split_uuid
    featurization = feat.create_featurization(inp_params)
    mdl = model_dataset.create_model_dataset(inp_params,
                                             featurization,
                                             ds_client=None)
    delaney_df = mdl.load_full_dataset()
    mdl.get_featurized_data()
    mdl.split_dataset()
    return inp_params, mdl, delaney_df
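
# A sketch of reusing a previously generated split via split_uuid (hypothetical
# helper; assumes split_dataset() populates the dataset object's split_uuid, as
# it does elsewhere in this file):
def delaney_objects_reusing_split():
    inp_params, mdl, delaney_df = delaney_objects(splitter="scaffold")
    # rebuild the same train/valid/test partition instead of re-splitting
    return delaney_objects(splitter="scaffold", split_uuid=mdl.split_uuid)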
def moe_descriptors(datastore=False):
    if datastore:
        params_desc = parse.wrapper(currentdir +
                                    "/config_MAOA_moe_descriptors_ds.json")
    else:
        params_desc = parse.wrapper(currentdir +
                                    "/config_MAOA_moe_descriptors.json")

#         if not os.path.isfile(params_file.dataset_key):
#             os.makedirs('pytest/config_MAOA_moe_descriptors/moe_descriptors', exist_ok=True)
#             copyfile(params_ds.dataset_key, params_file.dataset_key)
    featurization = feat.create_featurization(params_desc)
    dataset_obj_for_desc = model_dataset.create_model_dataset(params_desc,
                                                              featurization,
                                                              ds_client=None)
    df = dataset_obj_for_desc.load_full_dataset()
    return params_desc, dataset_obj_for_desc, df
    def split_and_save_dataset(self, assay_params):
        """
        Splits a given dataset, saves it, and sets the split_uuid in the metadata
        
        Args:
            assay_params: Dataset metadata

        Returns:
            None

        """
        self.get_dataset_metadata(assay_params)
        # TODO: check usage with defaults
        namespace_params = parse.wrapper(assay_params)
        # TODO: Don't want to recreate each time
        featurization = feat.create_featurization(namespace_params)
        data = model_datasets.create_model_dataset(namespace_params, featurization)
        data.get_featurized_data()
        data.split_dataset()
        data.save_split_dataset()
        assay_params['previously_split'] = True
        assay_params['split_uuid'] = data.split_uuid
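
    # Hypothetical usage from within the owning class: after the call,
    # assay_params carries the split information, e.g.
    #     self.split_and_save_dataset(assay_params)
    #     assert assay_params['previously_split'] and 'split_uuid' in assay_params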
def test_split_dataset_kfold_scaffold_from_pipeline(caplog):
    # Testing for correct type and length of dataset for k-fold splitting with a scaffold splitter.
    # Testing a 3-fold split first for uniqueness of all validation and training sets.

    #mp.model_wrapper = model_wrapper.create_model_wrapper(mp.params, mp.featurization, mp.ds_client)
    #mp.model_wrapper.setup_model_dirs()
    mp = utils.delaney_pipeline(featurizer="ecfp",
                                split_strategy="k_fold_cv",
                                splitter="scaffold")
    mp.featurization = feat.create_featurization(mp.params)
    mp.data = model_datasets.create_model_dataset(mp.params, mp.featurization,
                                                  mp.ds_client)
    mp.data.get_featurized_data()
    mp.data.split_dataset()
    splitter_k_fold_scaffold = mp.data.splitting
    splitter_k_fold_scaffold.num_folds = 3
    nf = splitter_k_fold_scaffold.num_folds

    #mp.model_wrapper.create_transformers(self.data)
    #mp.data.dataset = mp.model_wrapper.transform_dataset(self.data.dataset)
    data_obj_k_fold_scaffold = mp.data

    data_obj_k_fold_scaffold.split_dataset()
    train_valid, test, train_valid_attr, test_attr = splitter_k_fold_scaffold.split_dataset(
        data_obj_k_fold_scaffold.dataset, data_obj_k_fold_scaffold.attr,
        data_obj_k_fold_scaffold.params.smiles_col)
    # assert that the k-fold validation sets do not overlap one another
    test_list = []
    for kfoldindex in range(0, nf):
        test_list.append(
            (data_obj_k_fold_scaffold.train_valid_dsets[kfoldindex][0].X ==
             train_valid[kfoldindex][0].X).all())
        test_list.append(
            (data_obj_k_fold_scaffold.train_valid_dsets[kfoldindex][1].X ==
             train_valid[kfoldindex][1].X).all())
        test_list.append(
            (data_obj_k_fold_scaffold.train_valid_dsets[kfoldindex][1].ids ==
             train_valid[kfoldindex][1].ids).all())
        test_list.append(
            (data_obj_k_fold_scaffold.train_valid_dsets[kfoldindex][0].ids ==
             train_valid[kfoldindex][0].ids).all())
        test_list.append(train_valid_attr[kfoldindex][0].equals(
            data_obj_k_fold_scaffold.train_valid_attr[kfoldindex][0]))
        test_list.append(train_valid_attr[kfoldindex][1].equals(
            data_obj_k_fold_scaffold.train_valid_attr[kfoldindex][1]))
    assert all(test_list)
    test_list = []
    concat_valid = [x[1].ids.tolist() for x in train_valid]
    concat_valid = sum(concat_valid, [])
    test_list.append(len(concat_valid) == len(set(concat_valid)))

    assert all(test_list)
    tv_split = []
    test_list = []
    # asserting that each k-fold split has no internal overlap.
    for kfoldindex in range(0, nf):
        current_tv_split = train_valid[kfoldindex][0].ids.tolist(
        ) + train_valid[kfoldindex][1].ids.tolist()
        test_list.append(
            len(train_valid[kfoldindex][0].ids) == len(train_valid[kfoldindex]
                                                       [0].y))
        test_list.append(
            len(train_valid[kfoldindex][1].ids) == len(train_valid[kfoldindex]
                                                       [1].y))
        current_full_dataset = sum([current_tv_split, test.ids.tolist()], [])
        test_list.append(
            len(current_full_dataset) == len(set(current_full_dataset)))
        test_list.append(
            set(train_valid[kfoldindex][0].ids.tolist()) == set(
                train_valid_attr[kfoldindex][0].index.tolist()))
        test_list.append(
            set(train_valid[kfoldindex][1].ids.tolist()) == set(
                train_valid_attr[kfoldindex][1].index.tolist()))
        # checking the length of the validation set (should be the size of the train/valid pool / num_folds, +/- 1)
        len_valid = round(len(current_tv_split) / nf)
        test_list.append(
            len_valid - 1 <= len(train_valid[kfoldindex][1]) <= len_valid + 1)
        tv_split.append(current_tv_split)

    # asserting that all k-fold train/valid pools contain the same compounds
    test_list.append(
        set.intersection(*[set(l) for l in tv_split]) == set(tv_split[0]))
    # asserting that the test set and test_attr have the same index:
    test_list.append(set(test.ids.tolist()) == set(test_attr.index.tolist()))
    test_list.append(len(test.y) == len(test.ids))
    assert all(test_list)
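
# For example, with 3 folds over a train/valid pool of 902 compounds
# (hypothetical size), each validation fold should hold round(902 / 3) = 301
# compounds, plus or minus one.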
def get_dset_diversity(dset_key, ds_client, bucket='gsk_ml', feat_type='descriptors', dist_metric='cosine',
                       **metric_kwargs):
    """
    Load a dataset from the datastore, featurize it, and plot a clustered heatmap
    of its inter-compound distances.
    """
    log = logging.getLogger('ATOM')

    dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, ds_client)

    if feat_type == 'descriptors':
        params = parse.wrapper(dict(
            dataset_key=dset_key,
            bucket=bucket,
            descriptor_key='/ds/projdata/gsk_data/GSK_Descriptors/GSK_2D_3D_MOE_Descriptors_By_Variant_ID_With_Base_RDKit_SMILES.feather',
            descriptor_type='moe',
            featurizer='descriptors',
            system='twintron-blue',
            datastore=True,
            transformers=True))
    elif feat_type == 'ECFP':
        params = parse.wrapper(dict(
            dataset_key=dset_key,
            bucket=bucket,
            featurizer='ECFP',
            system='twintron-blue',
            datastore=True,
            ecfp_radius=2,
            ecfp_size=1024,
            transformers=True))
    else:
        log.error("Feature type %s not supported" % feat_type)
        return
    metadata = dsf.get_keyval(dataset_key=dset_key, bucket=bucket)
    if 'id_col' in metadata.keys():
        params.id_col = metadata['id_col']
    if 'param' in metadata.keys():
        params.response_cols = [metadata['param']]
    elif 'response_col' in metadata.keys():
        params.response_cols = [metadata['response_col']]
    elif 'response_cols' in metadata.keys():
        params.response_cols = metadata['response_cols']

    if 'smiles_col' in metadata.keys():
        params.smiles_col = metadata['smiles_col']

    if 'class_number' in metadata.keys():
        params.class_number = metadata['class_number']
    # removesuffix rather than rstrip: rstrip('.csv') strips any of the
    # characters '.', 'c', 's', 'v' from the right, not the literal suffix
    params.dataset_name = dset_key.split('/')[-1].removesuffix('.csv')

    log.warning("Featurizing data with %s featurizer" % feat_type)
    featurization = feat.create_featurization(params)
    model_dataset = md.MinimalDataset(params, featurization)
    model_dataset.get_featurized_data(dset_df)
    num_cmpds = model_dataset.dataset.X.shape[0]
    if num_cmpds > 50000:
        log.warning("Too many compounds to compute distance matrix: %d" % num_cmpds)
        return
    # plot_dataset_dist_distr(model_dataset.dataset, feat_type, dist_metric, params.response_cols, **metric_kwargs)
    dists = cd.calc_dist_diskdataset('descriptors', dist_metric, model_dataset.dataset, calc_type='all')
    from scipy.spatial.distance import squareform
    dists = squareform(dists)
    res_dir = '/ds/projdata/gsk_data/model_analysis/'
    plt_dir = '%s/Plots' % res_dir
    file_prefix = dset_key.split('/')[-1].removesuffix('.csv')
    mcs_linkage = linkage(dists, method='complete')
    pdf_path = '%s/%s_mcs_clustermap.pdf' % (plt_dir, file_prefix)
    pdf = PdfPages(pdf_path)
    g = sns.clustermap(dists, row_linkage=mcs_linkage, col_linkage=mcs_linkage, figsize=(12, 12), cmap='plasma')
    if plt_dir is not None:
        pdf.savefig(g.fig)
        pdf.close()
    return dists
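
# Hypothetical invocation (assumes a configured datastore client and write
# access to the plot directory hard-coded above):
#     dists = get_dset_diversity('path/to/dset.csv', dsf.config_client(),
#                                feat_type='ECFP', dist_metric='jaccard')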
    def return_split_uuid(self,
                          dataset_key,
                          bucket=None,
                          splitter=None,
                          split_combo=None,
                          retry_time=60):
        """
        Loads a dataset, splits it, saves it, and returns the split_uuid
        Args:
            dataset_key: key for dataset to split
            bucket: datastore-specific user group bucket
            splitter: Type of splitter to use to split the dataset
            split_combo: tuple of form (split_valid_frac, split_test_frac)

        Returns:

        """
        if bucket is None:
            bucket = self.params.bucket
        if splitter is None:
            splitter = self.params.splitter
        if split_combo is None:
            split_valid_frac = self.params.split_valid_frac
            split_test_frac = self.params.split_test_frac
        else:
            split_valid_frac = split_combo[0]
            split_test_frac = split_combo[1]
        retry = True
        i = 0
        #TODO: need to catch if dataset doesn't exist versus 500 failure
        while retry:
            try:
                metadata = dsf.get_keyval(dataset_key=dataset_key,
                                          bucket=bucket)
                retry = False
            except Exception as e:
                if i < 5:
                    print(
                        "Could not get metadata from datastore for dataset %s because of exception %s, sleeping..."
                        % (dataset_key, e))
                    time.sleep(retry_time)
                    i += 1
                else:
                    print(
                        "Could not get metadata from datastore for dataset %s because of exception %s, exiting"
                        % (dataset_key, e))
                    return None
        assay_params = {
            'dataset_key': dataset_key,
            'bucket': bucket,
            'splitter': splitter,
            'split_valid_frac': split_valid_frac,
            'split_test_frac': split_test_frac
        }
        # Need a featurizer type to split the dataset, but since we only care
        # about getting the split_uuid, it does not matter which featurizer is used
        if isinstance(self.params.featurizer, list):
            assay_params['featurizer'] = self.params.featurizer[0]
        else:
            assay_params['featurizer'] = self.params.featurizer
        if 'id_col' in metadata.keys():
            assay_params['id_col'] = metadata['id_col']
        if 'response_cols' not in assay_params or assay_params[
                'response_cols'] is None:
            if 'param' in metadata.keys():
                assay_params['response_cols'] = [metadata['param']]
            if 'response_col' in metadata.keys():
                assay_params['response_cols'] = [metadata['response_col']]
            if 'response_cols' in metadata.keys():
                assay_params['response_cols'] = metadata['response_cols']
        if 'smiles_col' in metadata.keys():
            assay_params['smiles_col'] = metadata['smiles_col']
        if 'class_name' in metadata.keys():
            assay_params['class_name'] = metadata['class_name']
        if 'class_number' in metadata.keys():
            assay_params['class_number'] = metadata['class_number']
        # removesuffix rather than rstrip: rstrip('.csv') strips characters, not the suffix
        assay_params['dataset_name'] = assay_params['dataset_key'].split(
            '/')[-1].removesuffix('.csv')
        assay_params['datastore'] = True
        assay_params[
            'previously_featurized'] = self.params.previously_featurized
        try:
            assay_params['descriptor_key'] = self.params.descriptor_key
            assay_params['descriptor_bucket'] = self.params.descriptor_bucket
        except AttributeError:
            # descriptor parameters are optional; skip them if not set
            pass
        #TODO: check usage with defaults
        namespace_params = parse.wrapper(assay_params)
        # TODO: Don't want to recreate each time
        featurization = feat.create_featurization(namespace_params)
        data = model_datasets.create_model_dataset(namespace_params,
                                                   featurization)
        retry = True
        i = 0
        while retry:
            try:
                data.get_featurized_data()
                data.split_dataset()
                data.save_split_dataset()
                return data.split_uuid
            except Exception as e:
                if i < 5:
                    print(
                        "Could not featurize and split dataset %s because of exception %s, sleeping"
                        % (dataset_key, e))
                    time.sleep(retry_time)
                    i += 1
                else:
                    print(
                        "Could not save split dataset for dataset %s because of exception %s"
                        % (dataset_key, e))
                    return None
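
    # Hypothetical usage from within the owning class, overriding the
    # configured fractions with a (split_valid_frac, split_test_frac) tuple:
    #     split_uuid = self.return_split_uuid(dataset_key, split_combo=(0.15, 0.15))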
def analyze_split(params,
                  id_col='compound_id',
                  smiles_col='rdkit_smiles',
                  active_col='active'):
    """
    Evaluate the AVE bias for the training/validation and training/test set splits of the given dataset.

    Also show the active frequencies in each subset and for the dataset as a whole.
    id_col, smiles_col and active_col are defaults to be used in case they aren't found in the
    dataset metadata; if they are found, the metadata values are used instead.

    Args:
        params (argparse.Namespace): Pipeline parameters.

        id_col (str): Dataset column containing compound IDs.

        smiles_col (str): Dataset column containing SMILES strings.

        active_col (str): Dataset column containing binary classifications.

    Returns:
        :obj:`pandas.DataFrame`: Table of split subsets showing sizes, numbers and fractions of active compounds

    """
    dset_key = params.dataset_key
    bucket = params.bucket
    split_uuid = params.split_uuid

    ds_client = dsf.config_client()
    try:
        split_metadata = dsf.search_datasets_by_key_value('split_dataset_uuid',
                                                          split_uuid,
                                                          ds_client,
                                                          operator='in',
                                                          bucket=bucket)
        split_oid = split_metadata['dataset_oid'].values[0]
        split_df = dsf.retrieve_dataset_by_dataset_oid(split_oid,
                                                       client=ds_client)
    except Exception as e:
        print("Error when loading split file:\n%s" % str(e))
        raise

    try:
        dataset_df = dsf.retrieve_dataset_by_datasetkey(dset_key,
                                                        bucket,
                                                        client=ds_client)
        dataset_meta = dsf.retrieve_dataset_by_datasetkey(dset_key,
                                                          bucket,
                                                          client=ds_client,
                                                          return_metadata=True)
    except Exception as e:
        print("Error when loading dataset:\n%s" % str(e))
        raise
    kv_dict = dsf.get_key_val(dataset_meta['metadata'])
    id_col = kv_dict.get('id_col', id_col)
    smiles_col = kv_dict.get('smiles_col', smiles_col)
    active_col = kv_dict.get('response_col', active_col)

    try:
        print('Dataset has %d unique compound IDs' %
              len(set(dataset_df[id_col].values)))
        print('Split table has %d unique compound IDs' %
              len(set(split_df.cmpd_id.values)))

        dset_df = dataset_df.merge(split_df,
                                   how='inner',
                                   left_on=id_col,
                                   right_on='cmpd_id').drop('cmpd_id', axis=1)
    except Exception as e:
        print("Error when joining dataset with split dataset:\n%s" % str(e))
        raise

    featurization = feat.create_featurization(params)
    data = md.create_model_dataset(params, featurization, ds_client)
    data.get_featurized_data()
    feat_arr = data.dataset.X
    # TODO: impute missing values if necessary
    y = data.dataset.y.flatten()
    if len(set(y) - set([0, 1])) > 0:
        raise ValueError(
            'AVEMinSplitter only works on binary classification datasets')
    ids = data.dataset.ids
    active_ind = np.where(y == 1)[0]
    inactive_ind = np.where(y == 0)[0]
    active_feat = feat_arr[active_ind, :]
    inactive_feat = feat_arr[inactive_ind, :]
    num_active = len(active_ind)
    num_inactive = len(inactive_ind)
    active_ids = ids[active_ind]
    inactive_ids = ids[inactive_ind]
    active_id_ind = dict(zip(active_ids, range(len(active_ids))))
    inactive_id_ind = dict(zip(inactive_ids, range(len(inactive_ids))))
    if params.featurizer == 'ecfp':
        metric = 'jaccard'
    elif params.featurizer == 'graphconv':
        raise ValueError(
            "ave_min splitter doesn't support graphconv features")
    else:
        metric = 'euclidean'

    # Calculate distance thresholds at which the nearest neighbor function should be evaluated
    if metric == 'jaccard':
        max_nn_dist = 1.0
    else:
        nan_mat = np.isnan(feat_arr)
        nnan = np.sum(nan_mat)
        if nnan > 0:
            log.info('Input feature matrix has %d NaN elements' % nnan)
            not_nan = ~nan_mat
            for i in range(feat_arr.shape[1]):
                feat_arr[nan_mat[:, i], i] = np.mean(feat_arr[not_nan[:, i],
                                                              i])
        nn_dist = np.sort(squareform(pdist(feat_arr, metric)))[:, 1]
        med_nn_dist = np.median(nn_dist)
        max_nn_dist = 3.0 * med_nn_dist
    ndist = 100
    dist_thresh = np.linspace(0.0, max_nn_dist, ndist)

    # Compute distance matrices between subsets
    num_workers = 1
    aa_dist = _calc_dist_mat(active_feat, active_feat, metric, None,
                             num_workers)
    ii_dist = _calc_dist_mat(inactive_feat, inactive_feat, metric, None,
                             num_workers)
    ai_dist = _calc_dist_mat(active_feat, inactive_feat, metric, None,
                             num_workers)
    ia_dist = ai_dist.transpose()

    subsets = sorted(set(dset_df.subset.values))
    subset_active_ind = {}
    subset_inactive_ind = {}

    if 'train' in subsets:
        # this is a TVT split
        subsets = ['train', 'valid', 'test']
        for subset in subsets:
            subset_df = dset_df[dset_df.subset == subset]
            active_df = subset_df[subset_df[active_col] == 1]
            inactive_df = subset_df[subset_df[active_col] == 0]
            subset_active_ids = active_df[id_col].values
            subset_inactive_ids = inactive_df[id_col].values
            subset_active_ind[subset] = [
                active_id_ind[id] for id in subset_active_ids
            ]
            subset_inactive_ind[subset] = [
                inactive_id_ind[id] for id in subset_inactive_ids
            ]

        taI = subset_active_ind['train']
        tiI = subset_inactive_ind['train']
        print("Results for %s split with %s %s features:" %
              (params.splitter, params.descriptor_type, params.featurizer))
        for valid_set in ['valid', 'test']:
            vaI = subset_active_ind[valid_set]
            viI = subset_inactive_ind[valid_set]
            split_params = ((vaI, viI, taI, tiI), aa_dist, ii_dist, ai_dist,
                            ia_dist, dist_thresh)
            _plot_nn_dist_distr(split_params)
            bias = _plot_bias(split_params, niter=0)
            print("For train/%s split: AVE bias = %.5f" % (valid_set, bias))
    else:
        # TODO: deal with k-fold splits later
        print('k-fold CV splits not supported yet')
        return

    # Tabulate the fractions of actives in the full dataset and each subset
    subset_list = []
    size_list = []
    frac_list = []
    active_frac_list = []

    dset_size = data.dataset.X.shape[0]
    dset_active = sum(data.dataset.y)
    subset_list.append('full dataset')
    size_list.append(dset_size)
    frac_list.append(1.0)
    active_frac_list.append(dset_active / dset_size)

    for subset in subsets:
        active_size = len(subset_active_ind[subset])
        inactive_size = len(subset_inactive_ind[subset])
        subset_size = active_size + inactive_size
        active_frac = active_size / subset_size
        subset_list.append(subset)
        size_list.append(subset_size)
        frac_list.append(subset_size / dset_size)
        active_frac_list.append(active_frac)
    frac_df = pd.DataFrame(
        dict(subset=subset_list,
             size=size_list,
             fraction=frac_list,
             active_frac=active_frac_list))
    print('\nSplit subsets:')
    print(frac_df)

    return frac_df
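
# Hypothetical invocation (assumes datastore access and an existing split;
# parameter values are placeholders):
#     params = parse.wrapper(dict(dataset_key='path/to/dset.csv', bucket='gsk_ml',
#                                 split_uuid='...', splitter='scaffold',
#                                 featurizer='ecfp', datastore=True))
#     frac_df = analyze_split(params)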
def test_train_NN_graphconv_scaffold_inputs():
    """

    Args:
    pipeline (ModelPipeline): The ModelPipeline instance for this model run.
    
    Dependencies:
    ModelPipeline creation
    featurization creation
    creation of model_wrapper
    mp.load_featurize_data

    Calls:
    create_perf_data
    perf_data.accumulate_preds
    perf_data.comput_perf_metrics
    data.combined_training-data()
    self._copy_model
    """
    # checking that the layers, dropouts, and learning rate are properly added to the deepchem graphconv model
    general_params['featurizer'] = 'graphconv'
    general_params['layer_sizes'] = '100,100,10'
    general_params['dropouts'] = '0.3,0.3,0.1'
    general_params['uncertainty'] = False
    inp_params = parse.wrapper(general_params)
    mp = MP.ModelPipeline(inp_params)
    mp.featurization = feat.create_featurization(inp_params)
    mp.model_wrapper = model_wrapper.create_model_wrapper(
        inp_params, mp.featurization, mp.ds_client)
    # asserting that the correct model is created with the correct layer sizes, dropouts, model_dir, and mode by default
    test1 = []

    test1.append(mp.model_wrapper.params.layer_sizes == [100, 100, 10])
    test1.append(mp.model_wrapper.params.dropouts == [0.3, 0.3, 0.1])
    # checking that parameters are properly passed to the deepchem model object
    test1.append(isinstance(mp.model_wrapper.model, GraphConvModel))
    test1.append(
        mp.model_wrapper.model.model_dir == mp.model_wrapper.model_dir)
    test1.append(
        [i.out_channel
         for i in mp.model_wrapper.model.model.graph_convs] == [100, 100])
    test1.append(
        [i.rate
         for i in mp.model_wrapper.model.model.dropouts] == [0.3, 0.3, 0.1])
    test1.append(mp.model_wrapper.model.mode == 'regression')
    test1.append(mp.model_wrapper.model.model.dense.units == 10)
    assert all(test1)

#***********************************************************************************
def test_super_get_train_valid_pred_results():
    """
    Args:
    perf_data: A PerfData object that stores the predicted values and metrics

    Returns:
    dict: A dictionary of the prediction results

    Raises:
    None

    Dependencies:
    create_perf_data

    Calls:
    perf_data.get_prediction_results()

    """
    pass

# should be tested in perf_data.get_prediction_results()
# should still be called to make sure that the function is callable
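
# A minimal sketch of what this stub (and the similar stubs below) could assert
# once implemented (hypothetical; assumes a trained model wrapper `mdl` and a
# populated `perf_data` object):
#     results = mdl.get_train_valid_pred_results(perf_data)
#     assert isinstance(results, dict)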

#***********************************************************************************
def test_super_get_test_perf_data():
    """
    Args:
    model_dir (str): Directory where the saved model is stored
    model_dataset (DiskDataset): Stores the current dataset and related methods

    Returns:
    perf_data: PerfData object containing the predicted values and metrics for the current test dataset

    Raises:
    None

    Dependencies:
    A model must be in model_dir
    model_dataset.test_dset must exist

    Calls:
    create_perf_data
    self.generate_predictions
    perf_data.accumulate_preds
    """
    pass
    # mostly tested in accumulate_preds, but should be called to ensure that predictions are properly generated

#***********************************************************************************
def test_super_get_test_pred_results():
    """
    Args:
    model_dir (str): Directory where the saved model is stored
    model_dataset (DiskDataset): Stores the current dataset and related methods

    Returns:
    dict: A dictionary containing the prediction values and metrics for the current dataset.

    Raises:
    None

    Dependencies:
    A model must be in model_dir
    model_dataset.test_dset must exist

    Calls:
    self.get_test_perf_data
    perf_data.get_prediction_results
    """
    pass
    # mostly tested in perf_data.get_prediction_results

#***********************************************************************************
def test_super_get_full_dataset_perf_data():
    """
    Args:
    model_dataset (DiskDataset): Stores the current dataset and related methods

    Returns:
    perf_data: PerfData object containing the predicted values and metrics for the current full dataset

    Raises:
    None

    Dependencies:
    A model must already be trained

    Calls:
    create_perf_data
    self.generate_predictions
    self.accumulate_preds
    """
    pass

#***********************************************************************************
def test_super_get_full_dataset_pred_results():
    """
    Args:
    model_dataset (DiskDataset): Stores the current dataset and related methods

    Returns:
    dict: A dictionary containing predicted values and metrics for the current full dataset

    Raises:
    None

    Dependencies:
    A model must already be trained.

    Calls:
    get_full_dataset_perf_data
    self.get_prediction_results()
    """
    pass
def test_super_create_transformers():
    """
    Args:
    model_dataset: The ModelDataset object that handles the current dataset

    Returns:
    self.transformers
    self.transformers_x
    self.params.transformer_key
    self.params.transformer_oid (if datastore)

    Raises:
    Exception when failing to save to the datastore

    Dependencies:
    create_featurization
    create_model_dataset
    model_dataset.load_full_dataset
    model_dataset.get_dataset_tasks
    model_dataset.check_task_columns
    model_dataset.get_featurized_data
    Requires (self.params.prediction_type == 'regression' and self.params.transformers == True) or len(self.transformers) > 0 

    Calls:
    self.featurization.create_feature_transformer
    dsf.upload_pickle_to_DS

    """
    # set up for a model wrapper with regression and NN.

    inp_params = parse.wrapper(general_params)
    featurization = feat.create_featurization(inp_params)
    data_obj_ecfp = model_dataset.create_model_dataset(inp_params,
                                                       featurization,
                                                       ds_client=None)
    df_delaney = data_obj_ecfp.load_full_dataset()
    data_obj_ecfp.get_dataset_tasks(df_delaney)
    data_obj_ecfp.check_task_columns(df_delaney)
    data_obj_ecfp.get_featurized_data()
    mdl = model_wrapper.create_model_wrapper(inp_params,
                                             data_obj_ecfp.featurization)
    mdl.setup_model_dirs()

    # testing correct model_wrapper build with regression and NN
    test = []
    test.append(mdl.params.prediction_type == 'regression')
    test.append(mdl.params.model_type == 'NN')
    mdl.create_transformers(data_obj_ecfp)
    test.append(
        isinstance(mdl.transformers[0],
                   dc.trans.transformers.NormalizationTransformer))
    test.append(mdl.transformers_x == [])
    # testing saving of the transformer to the correct location:
    transformer_path = os.path.join(mdl.output_dir, 'transformers.pkl')
    test.append(os.path.isfile(transformer_path))

    # TODO: test proper saving of the transformer to the datastore

    # testing that a classification model wrapper is initialized with empty transformer lists
    # TODO: test when transformers is False (see the sketch below)
    inp_params.prediction_type = 'classification'
    mdl = model_wrapper.create_model_wrapper(inp_params, featurization)
    test.append(mdl.transformers == [])
    test.append(mdl.transformers_x == [])
    assert all(test)
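
# A possible sketch for the TODO above (hypothetical test; assumes, per the
# "Requires" note in the docstring, that create_transformers leaves both
# transformer lists empty when params.transformers is False):
def test_super_create_transformers_disabled():
    params_no_trans = parse.wrapper(general_params)
    params_no_trans.transformers = False
    featurization = feat.create_featurization(params_no_trans)
    data = model_dataset.create_model_dataset(params_no_trans, featurization,
                                              ds_client=None)
    df = data.load_full_dataset()
    data.get_dataset_tasks(df)
    data.check_task_columns(df)
    data.get_featurized_data()
    mdl = model_wrapper.create_model_wrapper(params_no_trans, featurization)
    mdl.setup_model_dirs()
    mdl.create_transformers(data)
    assert mdl.transformers == [] and mdl.transformers_x == []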