def test_create_model_wrapper():
    """Test that create_model_wrapper builds the correct wrapper type.

    Verifies attribute initialization for model_type == 'NN' (directories,
    empty transformer lists, wrapper class) and model_type == 'RF', and that
    an unsupported model_type raises ValueError.

    Dependencies:
        parse.wrapper, feat.create_featurization

    Calls:
        model_wrapper.create_model_wrapper
    """
    inp_params = parse.wrapper(general_params)
    featurization = feat.create_featurization(inp_params)
    mdl = model_wrapper.create_model_wrapper(inp_params, featurization)
    mdl.setup_model_dirs()

    # testing for correct attribute initialization with model_type == "NN"
    test = []
    test.append(mdl.params.model_type == 'NN')
    test.append(isinstance(mdl.featurization, feat.DynamicFeaturization))
    test.append(mdl.output_dir == inp_params.output_dir)
    test.append(mdl.model_dir == inp_params.output_dir + '/' + 'model')
    test.append(mdl.best_model_dir == inp_params.output_dir + '/' + 'best_model')
    test.append(mdl.baseline_model_dir == inp_params.output_dir + '/' + 'baseline_epoch_model')
    test.append(mdl.transformers == [])
    test.append(mdl.transformers_x == [])
    test.append(isinstance(mdl, model_wrapper.DCNNModelWrapper))

    # testing for correct attribute initialization with model_type == "RF"
    temp_params = copy.deepcopy(inp_params)
    temp_params.model_type = 'RF'
    featurization = feat.create_featurization(temp_params)
    mdl_RF = model_wrapper.create_model_wrapper(temp_params, featurization)
    test.append(isinstance(mdl_RF, MP.model_wrapper.DCRFModelWrapper))
    test.append(mdl_RF.params.model_type == 'RF')

    # assertion for all tests
    assert all(test)

    # testing for Exception with model_type not in ['NN','RF'].
    # No need to bind the result -- the call is expected to raise.
    with pytest.raises(ValueError):
        temp_params.model_type = 'wrong'
        model_wrapper.create_model_wrapper(temp_params, featurization)
def uncurated_objects(y=None):
    """Build the params, dataset object, and raw dataframe for the uncurated dataset config.

    Args:
        y (list of str, optional): Response column names. Defaults to
            ["VALUE_NUM"]. A None sentinel is used instead of a list literal
            to avoid the shared-mutable-default-argument pitfall.

    Returns:
        tuple: (params Namespace, ModelDataset, uncurated pandas DataFrame)
    """
    if y is None:
        y = ["VALUE_NUM"]
    params_from_ds = parse.wrapper(currentdir + '/config_uncurated_bp.json')
    params_from_ds.response_cols = y
    featurization = feat.create_featurization(params_from_ds)
    data = model_dataset.create_model_dataset(params_from_ds, featurization)
    uncurated_df = data.load_full_dataset()
    return params_from_ds, data, uncurated_df
def datastore_objects(y=None):
    """Build params, featurized/split dataset object, and raw dataframe for the Cav1.2 datastore config.

    Args:
        y (list of str, optional): Response column names. Defaults to
            ["PIC50"]. A None sentinel avoids the shared-mutable-default
            pitfall of a list literal default.

    Returns:
        tuple: (params Namespace, ModelDataset, full dataset pandas DataFrame)
    """
    if y is None:
        y = ["PIC50"]
    params_from_ds = parse.wrapper(currentdir + '/config_datastore_dset_cav12.json')
    params_from_ds.response_cols = y
    featurization = feat.create_featurization(params_from_ds)
    data = model_dataset.create_model_dataset(params_from_ds, featurization)
    dset_df = data.load_full_dataset()
    data.get_featurized_data()
    data.split_dataset()
    return params_from_ds, data, dset_df
def split_and_save_dataset(self, assay_params): self.get_dataset_metadata(assay_params) #TODO: check usage with defaults namespace_params = parse.wrapper(assay_params) #TODO: Don't want to recreate each time featurization = feat.create_featurization(namespace_params) data = model_datasets.create_model_dataset(namespace_params, featurization) data.get_featurized_data() data.split_dataset() data.save_split_dataset() assay_params['previously_split'] = True assay_params['split_uuid'] = data.split_uuid
def test_super_transform_dataset():
    """Check ModelWrapper.transform_dataset on an ECFP-featurized dataset.

    The transformed dataset must be a DeepChem DiskDataset whose X matrix is
    unchanged (no descriptor featurization is involved) and whose y/ids arrays
    keep their lengths.
    """
    # set up a model wrapper with regression and NN
    inp_params = parse.wrapper(general_params)
    featurization = feat.create_featurization(inp_params)
    dset = model_dataset.create_model_dataset(inp_params, featurization, ds_client=None)
    raw_df = dset.load_full_dataset()
    dset.get_dataset_tasks(raw_df)
    dset.check_task_columns(raw_df)
    dset.get_featurized_data()

    wrapper = model_wrapper.create_model_wrapper(inp_params, dset.featurization)
    wrapper.setup_model_dirs()
    wrapper.create_transformers(dset)
    transformed = wrapper.transform_dataset(dset.dataset)

    checks = [
        # the dataset is the correct type
        isinstance(transformed, DD),
        # not descriptor featurization, so the X values should be the same
        (transformed.X == dset.dataset.X).all(),
        # response values keep their length
        len(transformed.y) == len(dset.dataset.y),
        # ids stay aligned with responses
        len(transformed.y) == len(transformed.ids),
    ]
    assert all(checks)
def delaney_objects(y=None,
                    featurizer="ecfp",
                    split_strategy="train_valid_test",
                    splitter="random",
                    split_uuid=None):
    """Build params, featurized/split dataset object, and raw dataframe for the Delaney config.

    Args:
        y (list of str, optional): Response column names. Defaults to
            ["measured log solubility in mols per litre"]. A None sentinel
            avoids a shared mutable default argument.
        featurizer (str): Featurizer name to set on the params.
        split_strategy (str): Split strategy to set on the params.
        splitter (str): Splitter type to set on the params.
        split_uuid (str, optional): If given, mark the dataset as previously
            split and reuse this split.

    Returns:
        tuple: (params Namespace, ModelDataset, Delaney pandas DataFrame)
    """
    if y is None:
        y = ["measured log solubility in mols per litre"]
    delaney_inp_file = currentdir + '/config_delaney.json'
    inp_params = parse.wrapper(delaney_inp_file)
    inp_params.response_cols = y
    inp_params.featurizer = featurizer
    inp_params.split_strategy = split_strategy
    inp_params.splitter = splitter
    if split_uuid is not None:
        inp_params.previously_split = True
        inp_params.split_uuid = split_uuid
    featurization = feat.create_featurization(inp_params)
    mdl = model_dataset.create_model_dataset(inp_params, featurization, ds_client=None)
    delaney_df = mdl.load_full_dataset()
    mdl.get_featurized_data()
    mdl.split_dataset()
    return inp_params, mdl, delaney_df
def moe_descriptors(datastore=False):
    """Load the MAOA MOE-descriptor dataset from either the datastore or a local config.

    Args:
        datastore (bool): If True, read the datastore config; otherwise the
            local-file config.

    Returns:
        tuple: (params Namespace, ModelDataset, full dataset pandas DataFrame)
    """
    # Branch on the flag once (idiomatic truth test, not '== True') and bind
    # the chosen params directly, instead of the original double branch.
    if datastore:
        params_desc = parse.wrapper(currentdir + "/config_MAOA_moe_descriptors_ds.json")
    else:
        params_desc = parse.wrapper(currentdir + "/config_MAOA_moe_descriptors.json")
    featurization = feat.create_featurization(params_desc)
    dataset_obj_for_desc = model_dataset.create_model_dataset(params_desc, featurization, ds_client=None)
    df = dataset_obj_for_desc.load_full_dataset()
    return params_desc, dataset_obj_for_desc, df
def split_and_save_dataset(self, assay_params):
    """Splits a given dataset, saves it, and sets the split_uuid in the metadata

    Args:
        assay_params (dict): Dataset metadata; updated in place with
            'previously_split' and 'split_uuid' so the split can be reused.

    Returns:
        None
    """
    self.get_dataset_metadata(assay_params)
    # TODO: check usage with defaults
    namespace_params = parse.wrapper(assay_params)
    # TODO: Don't want to recreate each time
    featurization = feat.create_featurization(namespace_params)
    data = model_datasets.create_model_dataset(namespace_params, featurization)
    data.get_featurized_data()
    data.split_dataset()
    data.save_split_dataset()
    # Record the split in the shared metadata dict so callers can reuse it
    assay_params['previously_split'] = True
    assay_params['split_uuid'] = data.split_uuid
def test_split_dataset_kfold_scaffold_from_pipeline(caplog):
    """Test k-fold scaffold splitting driven through the pipeline.

    Checks correct type and length of each fold for a 3-fold scaffold split,
    uniqueness of all validation sets, no internal overlap within each fold,
    and alignment between split datasets and their attribute tables.
    """
    # Testing for correct type and length of dataset for k-fold splitting with a scaffold splitter
    # Testing a 3-fold split first for uniqueness of all validation and training sets.
    #mp.model_wrapper = model_wrapper.create_model_wrapper(mp.params, mp.featurization, mp.ds_client)
    #mp.model_wrapper.setup_model_dirs()
    mp = utils.delaney_pipeline(featurizer="ecfp", split_strategy="k_fold_cv", splitter="scaffold")
    mp.featurization = feat.create_featurization(mp.params)
    mp.data = model_datasets.create_model_dataset(mp.params, mp.featurization, mp.ds_client)
    mp.data.get_featurized_data()
    mp.data.split_dataset()
    splitter_k_fold_scaffold = mp.data.splitting
    splitter_k_fold_scaffold.num_folds = 3
    nf = splitter_k_fold_scaffold.num_folds
    #mp.model_wrapper.create_transformers(self.data)
    #mp.data.dataset = mp.model_wrapper.transform_dataset(self.data.dataset)
    data_obj_k_fold_scaffold = mp.data
    # re-split with the updated fold count
    data_obj_k_fold_scaffold.split_dataset()
    train_valid, test, train_valid_attr, test_attr = splitter_k_fold_scaffold.split_dataset(
        data_obj_k_fold_scaffold.dataset, data_obj_k_fold_scaffold.attr,
        data_obj_k_fold_scaffold.params.smiles_col)
    # assert the splitter output matches what the dataset object stored
    test_list = []
    for kfoldindex in range(0, nf):
        test_list.append(
            (data_obj_k_fold_scaffold.train_valid_dsets[kfoldindex][0].X ==
             train_valid[kfoldindex][0].X).all())
        test_list.append(
            (data_obj_k_fold_scaffold.train_valid_dsets[kfoldindex][1].X ==
             train_valid[kfoldindex][1].X).all())
        test_list.append(
            (data_obj_k_fold_scaffold.train_valid_dsets[kfoldindex][1].ids ==
             train_valid[kfoldindex][1].ids).all())
        # NOTE(review): exact duplicate of the previous check -- possibly meant
        # to compare index [0] or the .y arrays instead; confirm intent.
        test_list.append(
            (data_obj_k_fold_scaffold.train_valid_dsets[kfoldindex][1].ids ==
             train_valid[kfoldindex][1].ids).all())
        test_list.append(train_valid_attr[kfoldindex][0].equals(
            data_obj_k_fold_scaffold.train_valid_attr[kfoldindex][0]))
        test_list.append(train_valid_attr[kfoldindex][1].equals(
            data_obj_k_fold_scaffold.train_valid_attr[kfoldindex][1]))
    assert all(test_list)
    # assert no overlap of the k-fold validation sets between each other
    test_list = []
    concat_valid = [x[1].ids.tolist() for x in train_valid]
    concat_valid = sum(concat_valid, [])
    test_list.append(len(concat_valid) == len(set(concat_valid)))
    assert all(test_list)
    tv_split = []
    test_list = []
    # asserting that each k-fold split has no internal overlap.
    for kfoldindex in range(0, nf):
        current_tv_split = train_valid[kfoldindex][0].ids.tolist(
        ) + train_valid[kfoldindex][1].ids.tolist()
        test_list.append(
            len(train_valid[kfoldindex][0].ids) == len(train_valid[kfoldindex]
                                                       [0].y))
        test_list.append(
            len(train_valid[kfoldindex][1].ids) == len(train_valid[kfoldindex]
                                                       [1].y))
        current_full_dataset = sum([current_tv_split, test.ids.tolist()], [])
        test_list.append(
            len(current_full_dataset) == len(set(current_full_dataset)))
        # split datasets and their attribute tables index the same compounds
        test_list.append(
            set(train_valid[kfoldindex][0].ids.tolist()) == set(
                train_valid_attr[kfoldindex][0].index.tolist()))
        test_list.append(
            set(train_valid[kfoldindex][1].ids.tolist()) == set(
                train_valid_attr[kfoldindex][1].index.tolist()))
        # checking length of the validation set (should be length of the kv set/num_folds +/- 1)
        len_valid = round(len(current_tv_split) / nf)
        test_list.append(
            len_valid - 1 <= len(train_valid[kfoldindex][1]) <= len_valid + 1)
        tv_split.append(current_tv_split)
    # asserting that all k-fold train+valid sets are equivalent
    test_list.append(
        set.intersection(*[set(l) for l in tv_split]) == set(tv_split[0]))
    # asserting that the test and test_attrs have the same index:
    test_list.append(set(test.ids.tolist()) == set(test_attr.index.tolist()))
    test_list.append(len(test.y) == len(test.ids))
    assert all(test_list)
def get_dset_diversity(dset_key, ds_client, bucket='gsk_ml', feat_type='descriptors', dist_metric='cosine', **metric_kwargs):
    """Load a dataset from the datastore, featurize it, and compute/plot its inter-compound distances.

    Args:
        dset_key (str): Datastore key of the dataset.
        ds_client: Datastore client object.
        bucket (str): Datastore bucket name.
        feat_type (str): Featurization type; 'descriptors' or 'ECFP'.
        dist_metric (str): Distance metric passed to the distance calculation.
        **metric_kwargs: Extra keyword arguments for the metric (currently
            only used by the commented-out distribution plot).

    Returns:
        numpy.ndarray or None: Square inter-compound distance matrix, or None
        if the feature type is unsupported or the dataset has more than
        50000 compounds.
    """

    def _basename_no_csv(key):
        # Strip a literal '.csv' suffix from the final path component.
        # The original used str.rstrip('.csv'), which wrongly strips any
        # trailing run of the characters '.', 'c', 's', 'v'.
        name = key.split('/')[-1]
        if name.endswith('.csv'):
            name = name[:-4]
        return name

    log = logging.getLogger('ATOM')
    dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, ds_client)
    if feat_type == 'descriptors':
        params = parse.wrapper(dict(
            dataset_key=dset_key,
            bucket=bucket,
            descriptor_key='/ds/projdata/gsk_data/GSK_Descriptors/GSK_2D_3D_MOE_Descriptors_By_Variant_ID_With_Base_RDKit_SMILES.feather',
            descriptor_type='moe',
            featurizer='descriptors',
            system='twintron-blue',
            datastore=True,
            transformers=True))
    elif feat_type == 'ECFP':
        params = parse.wrapper(dict(
            dataset_key=dset_key,
            bucket=bucket,
            featurizer='ECFP',
            system='twintron-blue',
            datastore=True,
            ecfp_radius=2,
            ecfp_size=1024,
            transformers=True))
    else:
        log.error("Feature type %s not supported" % feat_type)
        return
    # Fill in column names and class info from the dataset's datastore metadata
    metadata = dsf.get_keyval(dataset_key=dset_key, bucket=bucket)
    if 'id_col' in metadata.keys():
        params.id_col = metadata['id_col']
    if 'param' in metadata.keys():
        params.response_cols = [metadata['param']]
    elif 'response_col' in metadata.keys():
        params.response_cols = [metadata['response_col']]
    elif 'response_cols' in metadata.keys():
        params.response_cols = metadata['response_cols']
    if 'smiles_col' in metadata.keys():
        params.smiles_col = metadata['smiles_col']
    if 'class_number' in metadata.keys():
        params.class_number = metadata['class_number']
    params.dataset_name = _basename_no_csv(dset_key)
    log.warning("Featurizing data with %s featurizer" % feat_type)
    featurization = feat.create_featurization(params)
    min_dset = md.MinimalDataset(params, featurization)
    min_dset.get_featurized_data(dset_df)
    num_cmpds = min_dset.dataset.X.shape[0]
    if num_cmpds > 50000:
        log.warning("Too many compounds to compute distance matrix: %d" % num_cmpds)
        return
    # plot_dataset_dist_distr(min_dset.dataset, feat_type, dist_metric, params.response_cols, **metric_kwargs)
    dists = cd.calc_dist_diskdataset('descriptors', dist_metric, min_dset.dataset, calc_type='all')
    # Import the submodule explicitly: a bare 'import scipy' does not
    # guarantee scipy.spatial.distance has been loaded.
    from scipy.spatial.distance import squareform
    dists = squareform(dists)
    res_dir = '/ds/projdata/gsk_data/model_analysis/'
    plt_dir = '%s/Plots' % res_dir
    file_prefix = _basename_no_csv(dset_key)
    mcs_linkage = linkage(dists, method='complete')
    pdf_path = '%s/%s_mcs_clustermap.pdf' % (plt_dir, file_prefix)
    pdf = PdfPages(pdf_path)
    g = sns.clustermap(dists, row_linkage=mcs_linkage, col_linkage=mcs_linkage, figsize=(12, 12), cmap='plasma')
    # plt_dir is always set here, so save unconditionally (the original's
    # 'if plt_dir is not None' guard was always true)
    pdf.savefig(g.fig)
    pdf.close()
    return dists
def return_split_uuid(self, dataset_key, bucket=None, splitter=None, split_combo=None, retry_time=60):
    """Loads a dataset, splits it, saves it, and returns the split_uuid

    Args:
        dataset_key: key for dataset to split
        bucket: datastore-specific user group bucket
        splitter: Type of splitter to use to split the dataset
        split_combo: tuple of form (split_valid_frac, split_test_frac)
        retry_time: seconds to sleep between datastore retries

    Returns:
        str or None: split_uuid of the saved split, or None if the datastore
        could not be reached or the split could not be saved after retries.
    """
    # Fall back to the pipeline's own parameters for anything not supplied
    if bucket is None:
        bucket = self.params.bucket
    if splitter is None:
        splitter = self.params.splitter
    if split_combo is None:
        split_valid_frac = self.params.split_valid_frac
        split_test_frac = self.params.split_test_frac
    else:
        split_valid_frac = split_combo[0]
        split_test_frac = split_combo[1]
    retry = True
    i = 0
    # TODO: need to catch if dataset doesn't exist versus 500 failure
    while retry:
        try:
            metadata = dsf.get_keyval(dataset_key=dataset_key, bucket=bucket)
            retry = False
        except Exception as e:
            if i < 5:
                print(
                    "Could not get metadata from datastore for dataset %s because of exception %s, sleeping..."
                    % (dataset_key, e))
                time.sleep(retry_time)
                i += 1
            else:
                print(
                    "Could not get metadata from datastore for dataset %s because of exception %s, exiting"
                    % (dataset_key, e))
                return None
    assay_params = {
        'dataset_key': dataset_key,
        'bucket': bucket,
        'splitter': splitter,
        'split_valid_frac': split_valid_frac,
        'split_test_frac': split_test_frac
    }
    # Need a featurizer type to split dataset, but since we only care about
    # getting the split_uuid, it does not matter which featurizer you use
    if isinstance(self.params.featurizer, list):
        assay_params['featurizer'] = self.params.featurizer[0]
    else:
        assay_params['featurizer'] = self.params.featurizer
    if 'id_col' in metadata.keys():
        assay_params['id_col'] = metadata['id_col']
    if 'response_cols' not in assay_params or assay_params[
            'response_cols'] is None:
        # NOTE(review): these are sequential ifs, so when several keys are
        # present the LAST one ('response_cols') wins -- unlike the elif chain
        # used elsewhere in this file where 'param' wins. Confirm intent.
        if 'param' in metadata.keys():
            assay_params['response_cols'] = [metadata['param']]
        if 'response_col' in metadata.keys():
            assay_params['response_cols'] = [metadata['response_col']]
        if 'response_cols' in metadata.keys():
            assay_params['response_cols'] = metadata['response_cols']
    if 'smiles_col' in metadata.keys():
        assay_params['smiles_col'] = metadata['smiles_col']
    if 'class_name' in metadata.keys():
        assay_params['class_name'] = metadata['class_name']
    if 'class_number' in metadata.keys():
        assay_params['class_number'] = metadata['class_number']
    # Strip a literal '.csv' suffix; the original rstrip('.csv') wrongly
    # stripped any trailing run of the characters '.', 'c', 's', 'v'.
    dataset_name = assay_params['dataset_key'].split('/')[-1]
    if dataset_name.endswith('.csv'):
        dataset_name = dataset_name[:-4]
    assay_params['dataset_name'] = dataset_name
    assay_params['datastore'] = True
    assay_params[
        'previously_featurized'] = self.params.previously_featurized
    try:
        assay_params['descriptor_key'] = self.params.descriptor_key
        assay_params['descriptor_bucket'] = self.params.descriptor_bucket
    except AttributeError:
        # Descriptor settings are optional; skip them when params lacks them
        # (narrowed from a bare except that printed an empty line).
        pass
    # TODO: check usage with defaults
    namespace_params = parse.wrapper(assay_params)
    # TODO: Don't want to recreate each time
    featurization = feat.create_featurization(namespace_params)
    data = model_datasets.create_model_dataset(namespace_params, featurization)
    retry = True
    i = 0
    while retry:
        try:
            data.get_featurized_data()
            data.split_dataset()
            data.save_split_dataset()
            return data.split_uuid
        except Exception as e:
            if i < 5:
                print(
                    "Could not get metadata from datastore for dataset %s because of exception %s, sleeping"
                    % (dataset_key, e))
                time.sleep(retry_time)
                i += 1
            else:
                print(
                    "Could not save split dataset for dataset %s because of exception %s"
                    % (dataset_key, e))
                return None
def analyze_split(params, id_col='compound_id', smiles_col='rdkit_smiles', active_col='active'):
    """ Evaluate the AVE bias for the training/validation and training/test set splits of the given dataset.

    Also show the active frequencies in each subset and for the dataset as a
    whole. id_col, smiles_col and active_col are defaults to be used in case
    they aren't found in the dataset metadata; if found the metadata values
    are used instead.

    Args:
        params (argparse.Namespace): Pipeline parameters.
        id_col (str): Dataset column containing compound IDs.
        smiles_col (str): Dataset column containing SMILES strings.
        active_col (str): Dataset column containing binary classifications.

    Returns:
        :obj:`pandas.DataFrame`: Table of split subsets showing sizes, numbers
        and fractions of active compounds

    Raises:
        ValueError: If the dataset is not binary classification, or if the
            featurizer is 'graphconv' (unsupported by the ave_min splitter).
    """
    dset_key = params.dataset_key
    bucket = params.bucket
    split_uuid = params.split_uuid
    ds_client = dsf.config_client()
    # Load the split table from the datastore
    try:
        split_metadata = dsf.search_datasets_by_key_value('split_dataset_uuid', split_uuid, ds_client,
                                                          operator='in', bucket=bucket)
        split_oid = split_metadata['dataset_oid'].values[0]
        split_df = dsf.retrieve_dataset_by_dataset_oid(split_oid, client=ds_client)
    except Exception as e:
        print("Error when loading split file:\n%s" % str(e))
        raise
    # Load the dataset itself plus its metadata record
    try:
        dataset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, client=ds_client)
        dataset_meta = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, client=ds_client,
                                                          return_metadata=True)
    except Exception as e:
        print("Error when loading dataset:\n%s" % str(e))
        raise
    # Prefer column names recorded in the dataset metadata over the defaults
    kv_dict = dsf.get_key_val(dataset_meta['metadata'])
    id_col = kv_dict.get('id_col', id_col)
    smiles_col = kv_dict.get('smiles_col', smiles_col)
    active_col = kv_dict.get('response_col', active_col)
    try:
        print('Dataset has %d unique compound IDs' %
              len(set(dataset_df[id_col].values)))
        print('Split table has %d unique compound IDs' %
              len(set(split_df.cmpd_id.values)))
        dset_df = dataset_df.merge(split_df, how='inner', left_on=id_col,
                                   right_on='cmpd_id').drop('cmpd_id', axis=1)
    except Exception as e:
        print("Error when joining dataset with split dataset:\n%s" % str(e))
        raise
    featurization = feat.create_featurization(params)
    data = md.create_model_dataset(params, featurization, ds_client)
    data.get_featurized_data()
    feat_arr = data.dataset.X
    # TODO: impute missing values if necessary
    y = data.dataset.y.flatten()
    if len(set(y) - set([0, 1])) > 0:
        raise ValueError(
            'AVEMinSplitter only works on binary classification datasets')
    ids = data.dataset.ids
    active_ind = np.where(y == 1)[0]
    inactive_ind = np.where(y == 0)[0]
    active_feat = feat_arr[active_ind, :]
    inactive_feat = feat_arr[inactive_ind, :]
    active_ids = ids[active_ind]
    inactive_ids = ids[inactive_ind]
    # Map each compound ID to its row index within its activity class
    active_id_ind = dict(zip(active_ids, range(len(active_ids))))
    inactive_id_ind = dict(zip(inactive_ids, range(len(inactive_ids))))
    if params.featurizer == 'ecfp':
        metric = 'jaccard'
    elif params.featurizer == 'graphconv':
        # fixed typo in the original message ("dopesn't")
        raise ValueError(
            "ave_min splitter doesn't support graphconv features")
    else:
        metric = 'euclidean'
    # Calculate distance thresholds where the nearest neighbor function should be evaluated
    if metric == 'jaccard':
        max_nn_dist = 1.0
    else:
        nan_mat = np.isnan(feat_arr)
        nnan = np.sum(nan_mat)
        if nnan > 0:
            log.info('Input feature matrix has %d NaN elements' % nnan)
            # column-mean imputation of missing feature values
            not_nan = ~nan_mat
            for i in range(feat_arr.shape[1]):
                feat_arr[nan_mat[:, i], i] = np.mean(feat_arr[not_nan[:, i], i])
        # threshold range: 3x the median nearest-neighbor distance
        nn_dist = np.sort(squareform(pdist(feat_arr, metric)))[:, 1]
        med_nn_dist = np.median(nn_dist)
        max_nn_dist = 3.0 * med_nn_dist
    ndist = 100
    dist_thresh = np.linspace(0.0, max_nn_dist, ndist)
    # Compute distance matrices between subsets
    num_workers = 1
    aa_dist = _calc_dist_mat(active_feat, active_feat, metric, None, num_workers)
    ii_dist = _calc_dist_mat(inactive_feat, inactive_feat, metric, None, num_workers)
    ai_dist = _calc_dist_mat(active_feat, inactive_feat, metric, None, num_workers)
    ia_dist = ai_dist.transpose()
    subsets = sorted(set(dset_df.subset.values))
    subset_active_ind = {}
    subset_inactive_ind = {}
    if 'train' in subsets:
        # this is a TVT split
        subsets = ['train', 'valid', 'test']
        for subset in subsets:
            subset_df = dset_df[dset_df.subset == subset]
            active_df = subset_df[subset_df[active_col] == 1]
            inactive_df = subset_df[subset_df[active_col] == 0]
            subset_active_ids = active_df[id_col].values
            subset_inactive_ids = inactive_df[id_col].values
            # 'cid' instead of 'id' to avoid shadowing the builtin
            subset_active_ind[subset] = [
                active_id_ind[cid] for cid in subset_active_ids
            ]
            subset_inactive_ind[subset] = [
                inactive_id_ind[cid] for cid in subset_inactive_ids
            ]
        taI = subset_active_ind['train']
        tiI = subset_inactive_ind['train']
        print("Results for %s split with %s %s features:" %
              (params.splitter, params.descriptor_type, params.featurizer))
        for valid_set in ['valid', 'test']:
            vaI = subset_active_ind[valid_set]
            viI = subset_inactive_ind[valid_set]
            split_params = ((vaI, viI, taI, tiI), aa_dist, ii_dist, ai_dist,
                            ia_dist, dist_thresh)
            _plot_nn_dist_distr(split_params)
            bias = _plot_bias(split_params, niter=0)
            print("For train/%s split: AVE bias = %.5f" % (valid_set, bias))
    else:
        # TODO: deal with k-fold splits later
        print('k-fold CV splits not supported yet')
        return
    # Tabulate the fractions of actives in the full dataset and each subset
    subset_list = []
    size_list = []
    frac_list = []
    active_frac_list = []
    dset_size = data.dataset.X.shape[0]
    dset_active = sum(data.dataset.y)
    subset_list.append('full dataset')
    size_list.append(dset_size)
    frac_list.append(1.0)
    active_frac_list.append(dset_active / dset_size)
    for subset in subsets:
        active_size = len(subset_active_ind[subset])
        inactive_size = len(subset_inactive_ind[subset])
        subset_size = active_size + inactive_size
        active_frac = active_size / subset_size
        subset_list.append(subset)
        size_list.append(subset_size)
        frac_list.append(subset_size / dset_size)
        active_frac_list.append(active_frac)
    frac_df = pd.DataFrame(
        dict(subset=subset_list,
             size=size_list,
             fraction=frac_list,
             active_frac=active_frac_list))
    print('\nSplit subsets:')
    print(frac_df)
    return frac_df
def test_train_NN_graphconv_scaffold_inputs():
    """Test that graphconv NN hyperparameters reach the DeepChem model.

    Sets featurizer, layer_sizes, dropouts and uncertainty in general_params,
    builds a ModelPipeline and model wrapper, and checks that the layer sizes,
    dropout rates, model_dir and regression mode are propagated into the
    underlying GraphConvModel.

    Dependencies:
        ModelPipeline creation
        featurization creation
        creation of model_wrapper
    """
    # checking that the layers, dropouts, and learning rate are properly added
    # to the deepchem graphconv model.
    # NOTE(review): this mutates the module-level general_params dict, which
    # may affect tests that run afterwards.
    general_params['featurizer'] = 'graphconv'
    general_params['layer_sizes'] = '100,100,10'
    general_params['dropouts'] = '0.3,0.3,0.1'
    general_params['uncertainty'] = False
    inp_params = parse.wrapper(general_params)
    mp = MP.ModelPipeline(inp_params)
    mp.featurization = feat.create_featurization(inp_params)
    mp.model_wrapper = model_wrapper.create_model_wrapper(
        inp_params, mp.featurization, mp.ds_client)
    # asserting that the correct model is created with the correct layer sizes,
    # dropouts, model_dir, and mode by default
    test1 = []
    test1.append(mp.model_wrapper.params.layer_sizes == [100, 100, 10])
    test1.append(mp.model_wrapper.params.dropouts == [0.3, 0.3, 0.1])
    # checking that parameters are properly passed to the deepchem model object
    test1.append(isinstance(mp.model_wrapper.model, GraphConvModel))
    test1.append(
        mp.model_wrapper.model.model_dir == mp.model_wrapper.model_dir)
    test1.append(
        [i.out_channel for i in mp.model_wrapper.model.model.graph_convs] ==
        [100, 100])
    test1.append(
        [i.rate for i in mp.model_wrapper.model.model.dropouts] ==
        [0.3, 0.3, 0.1])
    test1.append(mp.model_wrapper.model.mode == 'regression')
    test1.append(mp.model_wrapper.model.model.dense.units == 10)
    assert all(test1)


#***********************************************************************************
def test_super_get_train_valid_pred_results():
    """Placeholder for testing ModelWrapper.get_train_valid_pred_results.

    Args:
        perf_data: A PerfData object that stores the predicted values and metrics

    Returns:
        dict: A dictionary of the prediction results

    Calls:
        perf_data.get_prediction_results()
    """
    pass
    # should be tested in perf_data.get_prediction_results()
    # should still be called to make sure that the function is callable


#***********************************************************************************
def test_super_get_test_perf_data():
    """Placeholder for testing ModelWrapper.get_test_perf_data.

    Args:
        model_dir (str): Directory where the saved model is stored
        model_dataset (DiskDataset): Stores the current dataset and related methods

    Returns:
        perf_data: PerfData object containing the predicted values and metrics
        for the current test dataset

    Dependencies:
        A model must be in model_dir
        model_dataset.test_dset must exist

    Calls:
        create_perf_data
        self.generate_predictions
        perf_data.accumulate_preds
    """
    pass
    # mostly tested in accumulate_preds, but should be tested to ensure that
    # the predictions are properly being called


#***********************************************************************************
def test_super_get_test_pred_results():
    """Placeholder for testing ModelWrapper.get_test_pred_results.

    Args:
        model_dir (str): Directory where the saved model is stored
        model_dataset (DiskDataset): Stores the current dataset and related methods

    Returns:
        dict: A dictionary containing the prediction values and metrics for
        the current dataset.

    Dependencies:
        A model must be in model_dir
        model_dataset.test_dset must exist

    Calls:
        self.get_test_perf_data
        perf_data.get_prediction_results
    """
    pass
    # mostly tested in perf_data.get_prediction_results


#***********************************************************************************
def test_super_get_full_dataset_perf_data():
    """Placeholder for testing ModelWrapper.get_full_dataset_perf_data.

    Args:
        model_dataset (DiskDataset): Stores the current dataset and related methods

    Returns:
        perf_data: PerfData object containing the predicted values and metrics
        for the current full dataset

    Dependencies:
        A model must already be trained

    Calls:
        create_perf_data
        self.generate_predictions
        self.accumulate_preds
    """
    pass


#***********************************************************************************
def test_super_get_full_dataset_pred_results():
    """Placeholder for testing ModelWrapper.get_full_dataset_pred_results.

    Args:
        model_dataset (DiskDataset): Stores the current dataset and related methods

    Returns:
        dict: A dictionary containing predicted values and metrics for the
        current full dataset

    Dependencies:
        A model was already be trained.

    Calls:
        get_full_dataset_perf_data
        self.get_prediction_results()
    """
    pass
def test_super_create_transformers():
    """Test ModelWrapper.create_transformers for a regression NN model.

    Verifies that a NormalizationTransformer is created for the response
    values, that no feature transformers are created for ECFP features, and
    that the fitted transformer is pickled to the model's output directory.
    Also checks that a classification wrapper starts with empty transformer
    lists.

    Dependencies:
        create_featurization
        create_model_dataset
        model_dataset.load_full_dataset
        model_dataset.get_dataset_tasks
        model_dataset.check_task_columns
        model_dataset.get_featurized_data

    Calls:
        self.featurization.create_feature_transformer
        dsf.upload_pickle_to_DS
    """
    # set up for a model wrapper with regression and NN.
    inp_params = parse.wrapper(general_params)
    featurization = feat.create_featurization(inp_params)
    data_obj_ecfp = model_dataset.create_model_dataset(inp_params,
                                                       featurization,
                                                       ds_client=None)
    df_delaney = data_obj_ecfp.load_full_dataset()
    data_obj_ecfp.get_dataset_tasks(df_delaney)
    data_obj_ecfp.check_task_columns(df_delaney)
    data_obj_ecfp.get_featurized_data()
    mdl = model_wrapper.create_model_wrapper(inp_params,
                                             data_obj_ecfp.featurization)
    mdl.setup_model_dirs()
    # testing correct model_wrapper build with regression and NN
    test = []
    test.append(mdl.params.prediction_type == 'regression')
    test.append(mdl.params.model_type == 'NN')
    mdl.create_transformers(data_obj_ecfp)
    # response values get a NormalizationTransformer for regression
    test.append(
        isinstance(mdl.transformers[0],
                   dc.trans.transformers.NormalizationTransformer))
    # ECFP features need no feature-space transformer
    test.append(mdl.transformers_x == [])
    # testing saving of transformer to correct location:
    transformer_path = os.path.join(mdl.output_dir, 'transformers.pkl')
    test.append(os.path.isfile(transformer_path))
    # TODO: test proper saving of the transformer to the datastore
    # TODO: test when transformers is False:
    inp_params.prediction_type = 'classification'
    mdl = model_wrapper.create_model_wrapper(inp_params, featurization)
    # classification wrappers start with empty transformer lists
    test.append(mdl.transformers == [])
    test.append(mdl.transformers_x == [])
    assert all(test)