def get_dataset_metadata(self, assay_params, retry_time=60):
    """Gather the required metadata for a dataset from the datastore.

    Fetches key/value metadata for assay_params['dataset_key'] and copies
    the recognized fields (id_col, response_cols, smiles_col, class_name,
    class_number, num_row) into assay_params in place, retrying transient
    datastore failures.

    Args:
        assay_params (dict): dataset parameters; must contain 'dataset_key'
            and 'bucket'. Updated in place.
        retry_time (int): seconds to sleep between datastore retries.

    Returns:
        None. Returns early when the datastore is disabled, and returns
        None without updating assay_params if metadata cannot be fetched
        after repeated retries.
    """
    if not self.params.datastore:
        return
    print(assay_params['dataset_key'])
    # TODO: need to distinguish "dataset doesn't exist" from a 500 failure
    # Up to 6 attempts total: 5 sleeps between failures, then give up.
    for attempt in range(6):
        try:
            metadata = dsf.get_keyval(
                dataset_key=assay_params['dataset_key'],
                bucket=assay_params['bucket'])
            break
        except Exception as e:
            if attempt < 5:
                print(
                    "Could not get metadata from datastore for dataset %s because of exception %s, sleeping..."
                    % (assay_params['dataset_key'], e))
                time.sleep(retry_time)
            else:
                print(
                    "Could not get metadata from datastore for dataset %s because of exception %s, exiting"
                    % (assay_params['dataset_key'], e))
                return None
    if 'id_col' in metadata:
        assay_params['id_col'] = metadata['id_col']
    if assay_params.get('response_cols') is None:
        # Deliberately sequential ifs: later keys take precedence
        # (response_cols > response_col > param).
        if 'param' in metadata:
            assay_params['response_cols'] = [metadata['param']]
        if 'response_col' in metadata:
            assay_params['response_cols'] = [metadata['response_col']]
        if 'response_cols' in metadata:
            assay_params['response_cols'] = metadata['response_cols']
    if 'smiles_col' in metadata:
        assay_params['smiles_col'] = metadata['smiles_col']
    if 'class_name' in metadata:
        assay_params['class_name'] = metadata['class_name']
    if 'class_number' in metadata:
        assay_params['class_number'] = metadata['class_number']
    if 'num_row' in metadata:
        self.num_rows[assay_params['dataset_key']] = metadata['num_row']
    # Strip only a literal '.csv' extension. The previous rstrip('.csv')
    # stripped a *character set*, eating any trailing '.', 'c', 's', 'v'
    # from the dataset name itself.
    name = assay_params['dataset_key'].split('/')[-1]
    if name.endswith('.csv'):
        name = name[:-4]
    assay_params['dataset_name'] = name
    assay_params['hyperparam_uuid'] = self.hyperparam_uuid
def get_dset_diversity(dset_key, ds_client, bucket='gsk_ml', feat_type='descriptors',
                       dist_metric='cosine', **metric_kwargs):
    """Load a dataset from the datastore, featurize it, and compute the
    matrix of inter-compound distances, rendering a clustermap to PDF.

    Args:
        dset_key (str): datastore key of the dataset to analyze.
        ds_client: datastore client handle passed to dsf.
        bucket (str): datastore bucket holding the dataset.
        feat_type (str): 'descriptors' (MOE) or 'ECFP'; anything else logs
            an error and returns None.
        dist_metric (str): distance metric name passed to the distance code.
        **metric_kwargs: extra keyword args for the distance metric
            (currently only forwarded to the commented-out plotting call).

    Returns:
        Square distance matrix (ndarray), or None when the feature type is
        unsupported or the dataset is too large (> 50000 compounds).
    """
    from scipy.spatial.distance import squareform

    log = logging.getLogger('ATOM')
    dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, ds_client)
    if feat_type == 'descriptors':
        params = parse.wrapper(dict(
            dataset_key=dset_key,
            bucket=bucket,
            descriptor_key='/ds/projdata/gsk_data/GSK_Descriptors/GSK_2D_3D_MOE_Descriptors_By_Variant_ID_With_Base_RDKit_SMILES.feather',
            descriptor_type='moe',
            featurizer='descriptors',
            system='twintron-blue',
            datastore=True,
            transformers=True))
    elif feat_type == 'ECFP':
        params = parse.wrapper(dict(
            dataset_key=dset_key,
            bucket=bucket,
            featurizer='ECFP',
            system='twintron-blue',
            datastore=True,
            ecfp_radius=2,
            ecfp_size=1024,
            transformers=True))
    else:
        log.error("Feature type %s not supported" % feat_type)
        return
    metadata = dsf.get_keyval(dataset_key=dset_key, bucket=bucket)
    if 'id_col' in metadata:
        params.id_col = metadata['id_col']
    # First match wins: param > response_col > response_cols.
    if 'param' in metadata:
        params.response_cols = [metadata['param']]
    elif 'response_col' in metadata:
        params.response_cols = [metadata['response_col']]
    elif 'response_cols' in metadata:
        params.response_cols = metadata['response_cols']
    if 'smiles_col' in metadata:
        params.smiles_col = metadata['smiles_col']
    if 'class_number' in metadata:
        params.class_number = metadata['class_number']
    # Strip only a literal '.csv' extension. The previous rstrip('.csv')
    # stripped a character set, eating trailing '.', 'c', 's', 'v' chars.
    file_prefix = dset_key.split('/')[-1]
    if file_prefix.endswith('.csv'):
        file_prefix = file_prefix[:-4]
    params.dataset_name = file_prefix
    log.warning("Featurizing data with %s featurizer" % feat_type)
    featurization = feat.create_featurization(params)
    model_dataset = md.MinimalDataset(params, featurization)
    model_dataset.get_featurized_data(dset_df)
    num_cmpds = model_dataset.dataset.X.shape[0]
    # Guard against quadratic blowup: the full distance matrix is O(n^2).
    if num_cmpds > 50000:
        log.warning("Too many compounds to compute distance matrix: %d" % num_cmpds)
        return
    # plot_dataset_dist_distr(model_dataset.dataset, feat_type, dist_metric,
    #                         params.response_cols, **metric_kwargs)
    dists = cd.calc_dist_diskdataset('descriptors', dist_metric,
                                     model_dataset.dataset, calc_type='all')
    # Expand the condensed distance vector to a full square matrix.
    dists = squareform(dists)
    res_dir = '/ds/projdata/gsk_data/model_analysis/'
    plt_dir = '%s/Plots' % res_dir
    mcs_linkage = linkage(dists, method='complete')
    pdf_path = '%s/%s_mcs_clustermap.pdf' % (plt_dir, file_prefix)
    pdf = PdfPages(pdf_path)
    g = sns.clustermap(dists, row_linkage=mcs_linkage, col_linkage=mcs_linkage,
                       figsize=(12, 12), cmap='plasma')
    # (The previous 'if plt_dir is not None' guard was dead code: plt_dir
    # is always a non-None string here.)
    pdf.savefig(g.fig)
    pdf.close()
    return dists
def return_split_uuid(self, dataset_key, bucket=None, splitter=None, split_combo=None, retry_time=60):
    """Load a dataset, split it, save the split, and return the split_uuid.

    Args:
        dataset_key: key for dataset to split
        bucket: datastore-specific user group bucket
            (defaults to self.params.bucket)
        splitter: type of splitter to use to split the dataset
            (defaults to self.params.splitter)
        split_combo: optional tuple (split_valid_frac, split_test_frac);
            defaults to the fractions on self.params
        retry_time: seconds to sleep between datastore retries

    Returns:
        The split_uuid of the saved split, or None if metadata retrieval
        or featurize/split/save fails after repeated retries.
    """
    if bucket is None:
        bucket = self.params.bucket
    if splitter is None:
        splitter = self.params.splitter
    if split_combo is None:
        split_valid_frac = self.params.split_valid_frac
        split_test_frac = self.params.split_test_frac
    else:
        split_valid_frac, split_test_frac = split_combo[0], split_combo[1]

    # TODO: need to distinguish "dataset doesn't exist" from a 500 failure
    # Up to 6 attempts total: 5 sleeps between failures, then give up.
    for attempt in range(6):
        try:
            metadata = dsf.get_keyval(dataset_key=dataset_key, bucket=bucket)
            break
        except Exception as e:
            if attempt < 5:
                print(
                    "Could not get metadata from datastore for dataset %s because of exception %s, sleeping..."
                    % (dataset_key, e))
                time.sleep(retry_time)
            else:
                print(
                    "Could not get metadata from datastore for dataset %s because of exception %s, exiting"
                    % (dataset_key, e))
                return None

    assay_params = {
        'dataset_key': dataset_key,
        'bucket': bucket,
        'splitter': splitter,
        'split_valid_frac': split_valid_frac,
        'split_test_frac': split_test_frac
    }
    # Need a featurizer type to split the dataset, but since we only care
    # about the split_uuid it does not matter which featurizer is used;
    # take the first one if a list was configured.
    if isinstance(self.params.featurizer, list):
        assay_params['featurizer'] = self.params.featurizer[0]
    else:
        assay_params['featurizer'] = self.params.featurizer
    if 'id_col' in metadata:
        assay_params['id_col'] = metadata['id_col']
    if assay_params.get('response_cols') is None:
        # Deliberately sequential ifs: later keys take precedence
        # (response_cols > response_col > param).
        if 'param' in metadata:
            assay_params['response_cols'] = [metadata['param']]
        if 'response_col' in metadata:
            assay_params['response_cols'] = [metadata['response_col']]
        if 'response_cols' in metadata:
            assay_params['response_cols'] = metadata['response_cols']
    if 'smiles_col' in metadata:
        assay_params['smiles_col'] = metadata['smiles_col']
    if 'class_name' in metadata:
        assay_params['class_name'] = metadata['class_name']
    if 'class_number' in metadata:
        assay_params['class_number'] = metadata['class_number']
    # Strip only a literal '.csv' extension. The previous rstrip('.csv')
    # stripped a character set, eating trailing '.', 'c', 's', 'v' chars.
    name = dataset_key.split('/')[-1]
    if name.endswith('.csv'):
        name = name[:-4]
    assay_params['dataset_name'] = name
    assay_params['datastore'] = True
    assay_params['previously_featurized'] = self.params.previously_featurized
    # Descriptor settings are optional on self.params; copy each one only
    # when present (replaces a bare 'except: print("")' that swallowed
    # every exception and printed a stray blank line).
    if hasattr(self.params, 'descriptor_key'):
        assay_params['descriptor_key'] = self.params.descriptor_key
    if hasattr(self.params, 'descriptor_bucket'):
        assay_params['descriptor_bucket'] = self.params.descriptor_bucket
    # TODO: check usage with defaults
    namespace_params = parse.wrapper(assay_params)
    # TODO: Don't want to recreate each time
    featurization = feat.create_featurization(namespace_params)
    data = model_datasets.create_model_dataset(namespace_params, featurization)

    for attempt in range(6):
        try:
            data.get_featurized_data()
            data.split_dataset()
            data.save_split_dataset()
            return data.split_uuid
        except Exception as e:
            if attempt < 5:
                # Previous message wrongly said "Could not get metadata";
                # this loop featurizes and splits, it does not fetch metadata.
                print(
                    "Could not featurize and split dataset %s because of exception %s, sleeping"
                    % (dataset_key, e))
                time.sleep(retry_time)
            else:
                print(
                    "Could not save split dataset for dataset %s because of exception %s"
                    % (dataset_key, e))
                return None