    def get_dataset_metadata(self, assay_params, retry_time=60):
        """
        Gathers the required metadata for a dataset and merges it into assay_params in place

        Args:
            assay_params: dict of dataset metadata; must contain 'dataset_key' and 'bucket'
            retry_time: number of seconds to sleep between datastore retries

        Returns:
            None; assay_params is updated in place
        """
        if not self.params.datastore:
            return
        print("Gathering metadata for dataset %s" % assay_params['dataset_key'])
        retry = True
        i = 0
        #TODO: need to catch if dataset doesn't exist versus 500 failure
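        # Retry the metadata fetch (up to 5 retries after the initial attempt,
        # sleeping retry_time seconds between attempts)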
        while retry:
            try:
                metadata = dsf.get_keyval(
                    dataset_key=assay_params['dataset_key'],
                    bucket=assay_params['bucket'])
                retry = False
            except Exception as e:
                if i < 5:
                    print(
                        "Could not get metadata from datastore for dataset %s because of exception %s, sleeping..."
                        % (assay_params['dataset_key'], e))
                    time.sleep(retry_time)
                    i += 1
                else:
                    print(
                        "Could not get metadata from datastore for dataset %s because of exception %s, exiting"
                        % (assay_params['dataset_key'], e))
                    return None
        if 'id_col' in metadata:
            assay_params['id_col'] = metadata['id_col']
        if assay_params.get('response_cols') is None:
            # Precedence: 'response_cols' > 'response_col' > 'param',
            # matching the original chain of overwriting ifs
            if 'response_cols' in metadata:
                assay_params['response_cols'] = metadata['response_cols']
            elif 'response_col' in metadata:
                assay_params['response_cols'] = [metadata['response_col']]
            elif 'param' in metadata:
                assay_params['response_cols'] = [metadata['param']]
        if 'smiles_col' in metadata:
            assay_params['smiles_col'] = metadata['smiles_col']
        if 'class_name' in metadata:
            assay_params['class_name'] = metadata['class_name']
        if 'class_number' in metadata:
            assay_params['class_number'] = metadata['class_number']
        if 'num_row' in metadata:
            self.num_rows[assay_params['dataset_key']] = metadata['num_row']
        # Use an explicit suffix check: rstrip('.csv') would strip any trailing
        # '.', 'c', 's', or 'v' characters, not the literal extension
        dataset_name = assay_params['dataset_key'].split('/')[-1]
        if dataset_name.endswith('.csv'):
            dataset_name = dataset_name[:-len('.csv')]
        assay_params['dataset_name'] = dataset_name
        assay_params['hyperparam_uuid'] = self.hyperparam_uuid
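        # A minimal usage sketch (hypothetical dataset_key and bucket values;
        # assumes a pipeline object whose params.datastore flag is set):
        #
        #   assay_params = {'dataset_key': 'gsk_data/some_assay.csv', 'bucket': 'gsk_ml'}
        #   self.get_dataset_metadata(assay_params)
        #   # assay_params is now augmented in place with id_col, response_cols,
        #   # smiles_col, dataset_name, hyperparam_uuid, etc.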
def get_dset_diversity(dset_key, ds_client, bucket='gsk_ml', feat_type='descriptors', dist_metric='cosine',
                       **metric_kwargs):
    """
    Load a dataset from the datastore, featurize it, and compute its inter-compound
    distances. Saves a clustered heatmap of the distance matrix to a PDF and
    returns the square distance matrix.
    """
    log = logging.getLogger('ATOM')

    dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, ds_client)

    if feat_type == 'descriptors':
        params = parse.wrapper(dict(
            dataset_key=dset_key,
            bucket=bucket,
            descriptor_key='/ds/projdata/gsk_data/GSK_Descriptors/GSK_2D_3D_MOE_Descriptors_By_Variant_ID_With_Base_RDKit_SMILES.feather',
            descriptor_type='moe',
            featurizer='descriptors',
            system='twintron-blue',
            datastore=True,
            transformers=True))
    elif feat_type == 'ECFP':
        params = parse.wrapper(dict(
            dataset_key=dset_key,
            bucket=bucket,
            featurizer='ECFP',
            system='twintron-blue',
            datastore=True,
            ecfp_radius=2,
            ecfp_size=1024,
            transformers=True))
    else:
        log.error("Feature type %s not supported" % feat_type)
        return
    metadata = dsf.get_keyval(dataset_key=dset_key, bucket=bucket)
    if 'id_col' in metadata:
        params.id_col = metadata['id_col']
    # Note: the precedence here ('param' > 'response_col' > 'response_cols')
    # is the reverse of the one used in get_dataset_metadata
    if 'param' in metadata:
        params.response_cols = [metadata['param']]
    elif 'response_col' in metadata:
        params.response_cols = [metadata['response_col']]
    elif 'response_cols' in metadata:
        params.response_cols = metadata['response_cols']

    if 'smiles_col' in metadata:
        params.smiles_col = metadata['smiles_col']

    if 'class_number' in metadata:
        params.class_number = metadata['class_number']
    # Use an explicit suffix check rather than rstrip('.csv')
    dset_name = dset_key.split('/')[-1]
    if dset_name.endswith('.csv'):
        dset_name = dset_name[:-len('.csv')]
    params.dataset_name = dset_name

     log.warning("Featurizing data with %s featurizer" % feat_type)
     featurization = feat.create_featurization(params)
     model_dataset = md.MinimalDataset(params, featurization)
     model_dataset.get_featurized_data(dset_df)
     num_cmpds = model_dataset.dataset.X.shape[0]
     if num_cmpds > 50000:
         log.warning("Too many compounds to compute distance matrix: %d" % num_cmpds)
         return
     # plot_dataset_dist_distr(model_dataset.dataset, feat_type, dist_metric, params.response_cols, **metric_kwargs)
    # Pass the requested feature type; hardcoding 'descriptors' here would be
    # incorrect when feat_type is 'ECFP'
    dists = cd.calc_dist_diskdataset(feat_type, dist_metric, model_dataset.dataset, calc_type='all')
    # linkage() expects the condensed distance vector, so cluster before
    # expanding to a square matrix for plotting
    mcs_linkage = linkage(dists, method='complete')
    from scipy.spatial.distance import squareform
    dists = squareform(dists)
    res_dir = '/ds/projdata/gsk_data/model_analysis/'
    plt_dir = '%s/Plots' % res_dir
    file_prefix = params.dataset_name
    pdf_path = '%s/%s_mcs_clustermap.pdf' % (plt_dir, file_prefix)
    pdf = PdfPages(pdf_path)
    g = sns.clustermap(dists, row_linkage=mcs_linkage, col_linkage=mcs_linkage, figsize=(12, 12), cmap='plasma')
    pdf.savefig(g.fig)
    pdf.close()
    return dists
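# A minimal usage sketch (hypothetical dataset key and metric; assumes a
# datastore client configured elsewhere, e.g. via dsf.config_client()):
#
#   ds_client = dsf.config_client()
#   dists = get_dset_diversity('gsk_data/some_assay.csv', ds_client,
#                              bucket='gsk_ml', feat_type='ECFP',
#                              dist_metric='tanimoto')
#   # dists is the square inter-compound distance matrix, or None if the
#   # dataset is too large or the feature type is unsupported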
    def return_split_uuid(self,
                          dataset_key,
                          bucket=None,
                          splitter=None,
                          split_combo=None,
                          retry_time=60):
        """
        Loads a dataset, splits it, saves it, and returns the split_uuid
        Args:
            dataset_key: key for dataset to split
            bucket: datastore-specific user group bucket
            splitter: Type of splitter to use to split the dataset
            split_combo: tuple of form (split_valid_frac, split_test_frac)

        Returns:

        """
        if bucket is None:
            bucket = self.params.bucket
        if splitter is None:
            splitter = self.params.splitter
        if split_combo is None:
            split_valid_frac = self.params.split_valid_frac
            split_test_frac = self.params.split_test_frac
        else:
            split_valid_frac = split_combo[0]
            split_test_frac = split_combo[1]
        retry = True
        i = 0
        #TODO: need to catch if dataset doesn't exist versus 500 failure
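        # Retry the metadata fetch (up to 5 retries after the initial attempt,
        # sleeping retry_time seconds between attempts)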
        while retry:
            try:
                metadata = dsf.get_keyval(dataset_key=dataset_key,
                                          bucket=bucket)
                retry = False
            except Exception as e:
                if i < 5:
                    print(
                        "Could not get metadata from datastore for dataset %s because of exception %s, sleeping..."
                        % (dataset_key, e))
                    time.sleep(retry_time)
                    i += 1
                else:
                    print(
                        "Could not get metadata from datastore for dataset %s because of exception %s, exiting"
                        % (dataset_key, e))
                    return None
        assay_params = {
            'dataset_key': dataset_key,
            'bucket': bucket,
            'splitter': splitter,
            'split_valid_frac': split_valid_frac,
            'split_test_frac': split_test_frac
        }
        # A featurizer type is needed to split the dataset, but since we only
        # care about the split_uuid, it does not matter which featurizer is used
        if isinstance(self.params.featurizer, list):
            assay_params['featurizer'] = self.params.featurizer[0]
        else:
            assay_params['featurizer'] = self.params.featurizer
        if 'id_col' in metadata:
            assay_params['id_col'] = metadata['id_col']
        if assay_params.get('response_cols') is None:
            # Precedence: 'response_cols' > 'response_col' > 'param',
            # matching the original chain of overwriting ifs
            if 'response_cols' in metadata:
                assay_params['response_cols'] = metadata['response_cols']
            elif 'response_col' in metadata:
                assay_params['response_cols'] = [metadata['response_col']]
            elif 'param' in metadata:
                assay_params['response_cols'] = [metadata['param']]
        if 'smiles_col' in metadata:
            assay_params['smiles_col'] = metadata['smiles_col']
        if 'class_name' in metadata:
            assay_params['class_name'] = metadata['class_name']
        if 'class_number' in metadata:
            assay_params['class_number'] = metadata['class_number']
        # Use an explicit suffix check rather than rstrip('.csv')
        dataset_name = assay_params['dataset_key'].split('/')[-1]
        if dataset_name.endswith('.csv'):
            dataset_name = dataset_name[:-len('.csv')]
        assay_params['dataset_name'] = dataset_name
        assay_params['datastore'] = True
        assay_params['previously_featurized'] = self.params.previously_featurized
        try:
            assay_params['descriptor_key'] = self.params.descriptor_key
            assay_params['descriptor_bucket'] = self.params.descriptor_bucket
        except AttributeError:
            # Descriptor parameters are optional; skip them if they are not set
            pass
        #TODO: check usage with defaults
        namespace_params = parse.wrapper(assay_params)
        # TODO: Don't want to recreate each time
        featurization = feat.create_featurization(namespace_params)
        data = model_datasets.create_model_dataset(namespace_params,
                                                   featurization)
        retry = True
        i = 0
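        # Featurize, split, and save with the same bounded-retry pattern used
        # for the metadata fetch above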
        while retry:
            try:
                data.get_featurized_data()
                data.split_dataset()
                data.save_split_dataset()
                return data.split_uuid
            except Exception as e:
                if i < 5:
                    print(
                        "Could not featurize and split dataset %s because of exception %s, sleeping..."
                        % (dataset_key, e))
                    time.sleep(retry_time)
                    i += 1
                else:
                    print(
                        "Could not save split dataset for dataset %s because of exception %s"
                        % (dataset_key, e))
                    return None
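        # A minimal usage sketch (hypothetical dataset_key; assumes self.params
        # supplies the default bucket, splitter, and split fractions):
        #
        #   split_uuid = self.return_split_uuid('gsk_data/some_assay.csv',
        #                                       split_combo=(0.15, 0.15))
        #   if split_uuid is None:
        #       print("Dataset could not be split and saved after retries")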