Example 1
def train_model_from_tracker(model_uuid, output_dir):
    """ Retrain a model saved in the model tracker, but save it to output_dir and don't insert it into the model tracker

    Args:
        model_uuid (str): unique identifier of the model in the model tracker

        output_dir (str): path to output directory

    Returns:
        the model pipeline object with trained model
    """

    if not mlmt_supported:
        logger.debug(
            "Model tracker not supported in your environment; can load models from filesystem only."
        )
        return None

    mlmt_client = dsf.initialize_model_tracker()

    collection_name = mt.get_model_collection_by_uuid(model_uuid,
                                                      mlmt_client=mlmt_client)

    # get metadata from tracker
    config = mt.get_metadata_by_uuid(model_uuid)

    # check if datastore dataset
    try:
        result = dsf.retrieve_dataset_by_datasetkey(
            config['training_dataset']['dataset_key'],
            bucket=config['training_dataset']['bucket'])
        if result is not None:
            config['datastore'] = True
    except Exception:
        # Not a datastore dataset; leave config['datastore'] unset
        pass
    # fix weird old parameters
    #if config[]
    # Parse parameters
    params = parse.wrapper(config)
    params.result_dir = output_dir
    # otherwise this will have the same uuid as the source model
    params.model_uuid = None
    # use the same split
    params.previously_split = True
    params.split_uuid = config['splitting_parameters']['split_uuid']
    # specify collection
    params.collection_name = collection_name

    logger.debug("model params %s" % str(params))

    # Create model pipeline
    model = mp.ModelPipeline(params)

    # Train model
    model.train_model()

    return model
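A minimal usage sketch (the UUID and output path below are placeholders; it assumes the model tracker and datastore clients used above are available in your environment):

# Hypothetical call: retrain an existing tracker model and save it locally.
retrained_pipeline = train_model_from_tracker(
    model_uuid='123e4567-e89b-12d3-a456-426614174000',   # placeholder UUID
    output_dir='/tmp/retrained_model')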
Example 2
def _get_descriptors(smiles_arr):
    """
    DEPRECATED. This function is guaranteed not to work, since it refers to datasets that no longer exist.
    """
    ds_client = dsf.config_client()

    full_feature_matrix_key = '/ds/projdata/gsk_data/GSK_datasets/eXP_Panel_Min_100_Cmpds/scaled_descriptors/' \
                              'subset_all_GSK_Compound_2D_3D_MOE_Descriptors_Scaled_With_Smiles_And_Inchi_HTR2A_5_' \
                              'HT2A_Human_Antagonist_HEK_Luminescence_f_PIC50.csv'
    full_feature_matrix = dsf.retrieve_dataset_by_datasetkey(full_feature_matrix_key, 'gskdata', ds_client)
    smiles_df = pd.DataFrame(smiles_arr)
    #df = full_feature_matrix.merge(
    #    smiles_df, how='inner', left_on='smiles', right_on=smiles_df.columns[0])
    df = full_feature_matrix.head(20)
    del full_feature_matrix
    descriptor_features = [x for x in df.columns.values.tolist() if x not in
                               ['compound_id', 'inchi_key', 'smiles', 'smiles_out',
                                'lost_frags', 'inchi_string', 'pxc50', 'rdkit_smiles',
                                'HTR2A_5_HT2A_Human_Antagonist_HEK_Luminescence_f_PIC50']]
    #TODO this probably doesn't work
    return df[descriptor_features]
def export_model(model_uuid, collection, model_dir, alt_bucket='CRADA'):
    """
    Export the metadata (parameters) and other files needed to recreate a model
    from the model tracker database to a gzipped tar archive.

    Args:
        model_uuid (str): Model unique identifier

        collection (str): Name of the collection holding the model in the database.

        model_dir (str): Path to directory where the model metadata and parameter files will be written. The directory will
        be created if it doesn't already exist. Subsequently, the directory contents will be packed into a gzipped tar archive
        named model_dir.tar.gz.

        alt_bucket (str): Alternate datastore bucket to search for model tarball and transformer objects.

    Returns:
        none
    """
    if not mlmt_supported:
        print(
            "Model tracker not supported in your environment; can load models from filesystem only."
        )
        return

    ds_client = dsf.config_client()
    metadata_dict = get_metadata_by_uuid(model_uuid,
                                         collection_name=collection)

    # Get the tarball containing the saved model from the datastore, and extract it into model_dir.
    if 'ModelMetadata' in metadata_dict:
        # Convert old style metadata
        metadata_dict = convert_metadata(metadata_dict)

    if 'model_parameters' in metadata_dict:
        model_parameters = metadata_dict['model_parameters']
    else:
        raise Exception("Bad metadata for model UUID %s" % model_uuid)

    os.makedirs(model_dir, exist_ok=True)

    model_params = parse.wrapper(metadata_dict)

    # Override selected model training parameters

    # Check that buckets where model tarball and transformers were saved still exist. If not, try alt_bucket.
    trans_bucket_differs = (model_params.transformer_bucket !=
                            model_params.model_bucket)
    model_bucket_meta = ds_client.ds_buckets.get_buckets(
        buckets=[model_params.model_bucket]).result()
    if len(model_bucket_meta) == 0:
        model_params.model_bucket = alt_bucket
    if trans_bucket_differs:
        trans_bucket_meta = ds_client.ds_buckets.get_buckets(
            buckets=[model_params.transformer_bucket]).result()
        if len(trans_bucket_meta) == 0:
            model_params.transformer_bucket = alt_bucket
    else:
        if len(model_bucket_meta) == 0:
            model_params.transformer_bucket = alt_bucket

    # Unpack the model state tarball into a subdirectory of the new archive
    model_dataset_key = 'model_%s_tarball' % model_uuid
    extract_dir = dsf.retrieve_dataset_by_datasetkey(model_dataset_key,
                                                     model_params.model_bucket,
                                                     client=ds_client,
                                                     return_metadata=False,
                                                     nrows=None,
                                                     print_metadata=False,
                                                     sep=False,
                                                     tarpath='%s/best_model' %
                                                     model_dir)

    # Download the transformers pickle file if there is one
    if trans.transformers_needed(model_params):
        try:
            if model_params.transformer_key is None:
                transformer_key = 'transformers_%s.pkl' % model_uuid
            else:
                transformer_key = model_params.transformer_key
            trans_fp = ds_client.open_bucket_dataset(
                model_params.transformer_bucket, transformer_key, mode='b')
            trans_data = trans_fp.read()
            trans_fp.close()
            trans_path = "%s/transformers.pkl" % model_dir
            trans_out = open(trans_path, mode='wb')
            trans_out.write(trans_data)
            trans_out.close()
            del model_parameters['transformer_oid']
            model_parameters['transformer_key'] = 'transformers.pkl'

        except:
            print(
                "Transformers expected but not found in datastore in bucket %s with key\n%s"
                % (model_params.transformer_bucket, transformer_key))
            raise

    # Save the metadata params
    model_parameters['save_results'] = False
    meta_path = "%s/model_metadata.json" % model_dir
    with open(meta_path, 'w') as meta_out:
        json.dump(metadata_dict, meta_out, indent=4)

    # Create a new tarball containing both the metadata and the parameters from the retrieved model tarball
    new_tarpath = "%s.tar.gz" % model_dir
    tarball = tarfile.open(new_tarpath, mode='w:gz')
    tarball.add(model_dir, arcname='.')
    tarball.close()
    print("Wrote model files to %s" % new_tarpath)
Example 4
def diversity_plots(dset_key, datastore=True, bucket='gsk_ml', title_prefix=None, ecfp_radius=4, out_dir=None, 
                    id_col='compound_id', smiles_col='rdkit_smiles', max_for_mcs=300):
    """
    Plot visualizations of diversity for an arbitrary table of compounds. At minimum, the file should contain
    columns for a compound ID and a SMILES string.
    """
    # Load table of compound names, IDs and SMILES strings
    if datastore:
        cmpd_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket)
    else:
        cmpd_df = pd.read_csv(dset_key, index_col=False)
    file_prefix = os.path.splitext(os.path.basename(dset_key))[0]
    if title_prefix is None:
        title_prefix = file_prefix.replace('_', ' ')
    compound_ids = cmpd_df[id_col].values
    smiles_strs = cmpd_df[smiles_col].values
    ncmpds = len(smiles_strs)
    print("Dataset contains %d compounds" % ncmpds)
    # Strip salts, canonicalize SMILES strings and create RDKit Mol objects
    print("Canonicalizing molecules...")
    base_mols = [struct_utils.base_mol_from_smiles(smiles) for smiles in smiles_strs]
    for i, mol in enumerate(base_mols):
        if mol is None:
            print('Unable to get base molecule for compound %d = %s' % (i, compound_ids[i]))
    base_smiles = [Chem.MolToSmiles(mol) for mol in base_mols if mol is not None]
    print("Done")

    # Generate ECFP fingerprints
    print("Computing fingerprints...")
    fps = [AllChem.GetMorganFingerprintAsBitVect(mol, ecfp_radius, 1024) for mol in base_mols if mol is not None]
    print("Done")

    if ncmpds <= max_for_mcs:
        # Get MCS distance matrix and draw a heatmap
        print("Computing MCS distance matrix...")
        mcs_dist = dm.mcs(base_mols)
        print("Done")
        cmpd1 = []
        cmpd2 = []
        dist = []
        ind1 = []
        ind2 = []
        for i in range(ncmpds-1):
            for j in range(i+1, ncmpds):
                cmpd1.append(compound_ids[i])
                cmpd2.append(compound_ids[j])
                dist.append(mcs_dist[i,j])
                ind1.append(i)
                ind2.append(j)
        dist_df = pd.DataFrame({'compound_1' : cmpd1, 'compound_2' : cmpd2, 'dist' : dist,
                                'i' : ind1, 'j' : ind2})
        dist_df = dist_df.sort_values(by='dist')
        print(dist_df.head(10))
        if out_dir is not None:
            dist_df.to_csv('%s/%s_mcs_dist_table.csv' % (out_dir, file_prefix), index=False)
            for k in range(10):
                mol_i = base_mols[dist_df.i.values[k]]
                mol_j = base_mols[dist_df.j.values[k]]
                img_file_i = '%s/%d_%s.png' % (out_dir, k, compound_ids[dist_df.i.values[k]])
                img_file_j = '%s/%d_%s.png' % (out_dir, k, compound_ids[dist_df.j.values[k]])
                Draw.MolToFile(mol_i, img_file_i, size=(500,500), fitImage=False)
                Draw.MolToFile(mol_j, img_file_j, size=(500,500), fitImage=False)
    
        mcs_linkage = linkage(mcs_dist, method='complete')
        mcs_df = pd.DataFrame(mcs_dist, columns=compound_ids, index=compound_ids)
        if out_dir is not None:
            pdf_path = '%s/%s_mcs_clustermap.pdf' % (out_dir, file_prefix)
            pdf = PdfPages(pdf_path)
        g = sns.clustermap(mcs_df, row_linkage=mcs_linkage, col_linkage=mcs_linkage, figsize=(12,12), cmap='plasma')
        if out_dir is not None:
            pdf.savefig(g.fig)
            pdf.close()
    
        # Draw a UMAP projection based on MCS distance
        mapper = umap.UMAP(n_neighbors=10, n_components=2, metric='precomputed', random_state=17)
        reps = mapper.fit_transform(mcs_dist)
        rep_df = pd.DataFrame.from_records(reps, columns=['x', 'y'])
        rep_df['compound_id'] = compound_ids
        if out_dir is not None:
            pdf_path = '%s/%s_mcs_umap_proj.pdf' % (out_dir, file_prefix)
            pdf = PdfPages(pdf_path)
        fig, ax = plt.subplots(figsize=(12,12))
        sns.scatterplot(x='x', y='y', data=rep_df, ax=ax)
        ax.set_title("%s, 2D projection based on MCS distance" % title_prefix)
        if out_dir is not None:
            pdf.savefig(fig)
            pdf.close()
            rep_df.to_csv('%s/%s_mcs_umap_proj.csv' % (out_dir, file_prefix), index=False)

    # Get Tanimoto distance matrix
    print("Computing Tanimoto distance matrix...")
    tani_dist = dm.tanimoto(fps)
    print("Done")
    # Draw a UMAP projection based on Tanimoto distance
    mapper = umap.UMAP(n_neighbors=10, n_components=2, metric='precomputed', random_state=17)
    reps = mapper.fit_transform(tani_dist)
    rep_df = pd.DataFrame.from_records(reps, columns=['x', 'y'])
    rep_df['compound_id'] = compound_ids
    if out_dir is not None:
        pdf_path = '%s/%s_tani_umap_proj.pdf' % (out_dir, file_prefix)
        pdf = PdfPages(pdf_path)
    fig, ax = plt.subplots(figsize=(12,12))
    sns.scatterplot(x='x', y='y', data=rep_df, ax=ax)
    ax.set_title("%s, 2D projection based on Tanimoto distance" % title_prefix)
    if out_dir is not None:
        pdf.savefig(fig)
        pdf.close()

    # Draw a cluster heatmap based on Tanimoto distance
    tani_linkage = linkage(tani_dist, method='complete')
    tani_df = pd.DataFrame(tani_dist, columns=compound_ids, index=compound_ids)
    if out_dir is not None:
        pdf_path = '%s/%s_tanimoto_clustermap.pdf' % (out_dir, file_prefix)
        pdf = PdfPages(pdf_path)
    g = sns.clustermap(tani_df, row_linkage=tani_linkage, col_linkage=tani_linkage, figsize=(12,12), cmap='plasma')
    if out_dir is not None:
        pdf.savefig(g.fig)
        pdf.close()
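A usage sketch reading a local CSV instead of the datastore (the file path, output directory and column names are placeholders):

# Hypothetical call: plot diversity for a local compound table and write the PDFs and CSVs to ./plots.
diversity_plots('my_compounds.csv', datastore=False, out_dir='./plots',
                id_col='compound_id', smiles_col='rdkit_smiles')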
Example 5
def get_dset_diversity(dset_key, ds_client, bucket='gsk_ml', feat_type='descriptors', dist_metric='cosine',
                       **metric_kwargs):
    """
    Load a dataset from the datastore, featurize it, and compute and plot its matrix of inter-compound
    distances.
    """
    log = logging.getLogger('ATOM')

    dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, ds_client)

    if feat_type == 'descriptors':
        params = parse.wrapper(dict(
            dataset_key=dset_key,
            bucket=bucket,
            descriptor_key='/ds/projdata/gsk_data/GSK_Descriptors/GSK_2D_3D_MOE_Descriptors_By_Variant_ID_With_Base_RDKit_SMILES.feather',
            descriptor_type='moe',
            featurizer='descriptors',
            system='twintron-blue',
            datastore=True,
            transformers=True))
    elif feat_type == 'ECFP':
        params = parse.wrapper(dict(
            dataset_key=dset_key,
            bucket=bucket,
            featurizer='ECFP',
            system='twintron-blue',
            datastore=True,
            ecfp_radius=2,
            ecfp_size=1024,
            transformers=True))
    else:
        log.error("Feature type %s not supported" % feat_type)
        return
    metadata = dsf.get_keyval(dataset_key=dset_key, bucket=bucket)
    if 'id_col' in metadata.keys():
        params.id_col = metadata['id_col']
    if 'param' in metadata.keys():
        params.response_cols = [metadata['param']]
    elif 'response_col' in metadata.keys():
        params.response_cols = [metadata['response_col']]
    elif 'response_cols' in metadata.keys():
        params.response_cols = metadata['response_cols']

    if 'smiles_col' in metadata.keys():
        params.smiles_col = metadata['smiles_col']

    if 'class_number' in metadata.keys():
        params.class_number = metadata['class_number']
    # Use replace() rather than rstrip('.csv'), which strips characters, not a suffix
    params.dataset_name = dset_key.split('/')[-1].replace('.csv', '')

    log.warning("Featurizing data with %s featurizer" % feat_type)
    featurization = feat.create_featurization(params)
    model_dataset = md.MinimalDataset(params, featurization)
    model_dataset.get_featurized_data(dset_df)
    num_cmpds = model_dataset.dataset.X.shape[0]
    if num_cmpds > 50000:
        log.warning("Too many compounds to compute distance matrix: %d" % num_cmpds)
        return
    # plot_dataset_dist_distr(model_dataset.dataset, feat_type, dist_metric, params.response_cols, **metric_kwargs)
    dists = cd.calc_dist_diskdataset('descriptors', dist_metric, model_dataset.dataset, calc_type='all')
    import scipy
    dists = scipy.spatial.distance.squareform(dists)
    res_dir = '/ds/projdata/gsk_data/model_analysis/'
    plt_dir = '%s/Plots' % res_dir
    file_prefix = dset_key.split('/')[-1].replace('.csv', '')
    mcs_linkage = linkage(dists, method='complete')
    pdf_path = '%s/%s_mcs_clustermap.pdf' % (plt_dir, file_prefix)
    pdf = PdfPages(pdf_path)
    g = sns.clustermap(dists, row_linkage=mcs_linkage, col_linkage=mcs_linkage, figsize=(12, 12), cmap='plasma')
    if plt_dir is not None:
        pdf.savefig(g.fig)
        pdf.close()
    return dists
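A usage sketch (the dataset key and bucket are placeholders; it assumes a configured datastore client):

# Hypothetical call: compute and plot the inter-compound distance matrix for one dataset.
ds_client = dsf.config_client()
dists = get_dset_diversity('my_project/my_dataset.csv', ds_client, bucket='gsk_ml',
                           feat_type='ECFP', dist_metric='cosine')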
Example 6
def liability_dset_diversity(bucket='gsk_ml', feat_type='descriptors', dist_metric='cosine', **metric_kwargs):
    """
    Load datasets from datastore, featurize them, and plot distributions of their inter-compound
    distances.
    """
    log = logging.getLogger('ATOM')
    ds_client = dsf.config_client()
    ds_table = dsf.search_datasets_by_key_value(key='param', value=['PIC50','PEC50'], operator='in', 
                                                bucket=bucket, client=ds_client)
    dset_keys = ds_table.dataset_key.values
    metadata = ds_table.metadata.values
    split = 'random'
    task_names = []
    num_cmpds = []
    for i, dset_key in enumerate(dset_keys):
        md_dict = dsf.metadata_to_dict(metadata[i])
        task_name = md_dict['task_name']
        num_cmpds = md_dict['CMPD_COUNT'][0]
        log.warning("Loading dataset for %s, %d compounds" % (task_name, num_cmpds))
        dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket, ds_client)
        dataset_dir = os.path.dirname(dset_key)
        dataset_file = os.path.basename(dset_key)
        if feat_type == 'descriptors':
            params = argparse.Namespace(dataset_dir=dataset_dir,
                            dataset_file=dataset_file,
                            y=task_name,
                            bucket=bucket,
                            descriptor_key='all_GSK_Compound_2D_3D_MOE_Descriptors_Scaled_With_Smiles_And_Inchi',
                            descriptor_type='MOE',
                            splitter=split,
                            id_col='compound_id',
                            smiles_col='rdkit_smiles',
                            featurizer='descriptors',
                            prediction_type='regression', 
                            system='twintron-blue',
                            datastore=True,
                            transformers=True)
        elif feat_type == 'ECFP':
            params = argparse.Namespace(dataset_dir=dataset_dir,
                            dataset_file=dataset_file,
                            y=task_name,
                            bucket=bucket,
                            splitter=split,
                            id_col='compound_id',
                            smiles_col='rdkit_smiles',
                            featurizer='ECFP',
                            prediction_type='regression', 
                            system='twintron-blue',
                            datastore=True,
                            ecfp_radius=2, ecfp_size=1024, 
                            transformers=True)
        else:
            log.error("Feature type %s not supported" % feat_type)
            return
        log.warning("Featurizing data with %s featurizer" % feat_type)
        model_dataset = md.MinimalDataset(params)
        model_dataset.get_featurized_data(dset_df)
        num_cmpds = model_dataset.dataset.X.shape[0]
        if num_cmpds > 50000:
            log.warning("Too many compounds to compute distance matrix: %d" % num_cmpds)
            continue
        plot_dataset_dist_distr(model_dataset.dataset, feat_type, dist_metric, task_name, **metric_kwargs)

    def get_shortlist_df(self, split_uuids=False, retry_time=60):
        """
        
        Args:
            split_uuids: Boolean value saying if you want just datasets returned or the split_uuids as well

        Returns:
            The list of dataset_keys, along with their accompanying bucket, split type, and split_uuid if split_uuids is True
        """
        if self.params.datastore:
            retry = True
            i = 0
            while retry:
                try:
                    df = dsf.retrieve_dataset_by_datasetkey(
                        self.params.shortlist_key, self.params.bucket)
                    retry = False
                except Exception as e:
                    if i < 5:
                        print(
                            "Could not retrieve shortlist %s because of exception %s, sleeping..."
                            % (self.params.shortlist_key, e))
                        time.sleep(retry_time)
                        i += 1
                    else:
                        print(
                            "Could not retrieve shortlist %s because of exception %s, exiting"
                            % (self.params.shortlist_key, e))
                        sys.exit(1)
        else:
            if not os.path.exists(self.params.shortlist_key):
                return None
            df = pd.read_csv(self.params.shortlist_key, index_col=False)
        if df is None:
            sys.exit(1)
        if len(df.columns) == 1:
            assays = df[df.columns[0]].values.tolist()
        else:
            if 'task_name' in df.columns:
                col_name = 'task_name'
            else:
                col_name = 'dataset_key'
            assays = df[col_name].values.tolist()
        if 'bucket' in df.columns:
            datasets = list(zip(assays, df.bucket.values.tolist()))
        elif 'bucket_name' in df.columns:
            datasets = list(zip(assays, df.bucket_name.values.tolist()))
        else:
            datasets = list(zip(assays, [self.params.bucket] * len(assays)))
        datasets = [(d[0].strip(), d[1].strip()) for d in datasets]
        if not split_uuids:
            return datasets
        if type(self.params.splitter) == str:
            splitters = [self.params.splitter]
        else:
            splitters = self.params.splitter
        assays = []
        for splitter in splitters:
            split_name = '%s_%d_%d' % (splitter, self.params.split_valid_frac *
                                       100, self.params.split_test_frac * 100)
            if split_name in df.columns:
                for i, row in df.iterrows():
                    assays.append((datasets[i][0], datasets[i][1], splitter,
                                   row[split_name]))
            else:
                for assay, bucket in datasets:
                    try:
                        # do we want to move this into loop so we ignore ones it failed for?
                        split_uuid = self.return_split_uuid(assay, bucket)
                        assays.append((assay, bucket, splitter, split_uuid))
                    except Exception as e:
                        print("Splitting failed for dataset %s, skipping..." %
                              assay)
                        print(e)
                        print(traceback.print_exc())
                        continue
        return assays
    def generate_split_shortlist(self, retry_time=60):
        """
        Processes a shortlist, generates splits for each dataset on the list, and uploads a new shortlist file with the
        split_uuids included. Splits are generated for the valid/test fraction combinations [0.1, 0.1], [0.1, 0.2] and
        [0.2, 0.2] with both the random and scaffold splitters.
        
        Returns:
            None
        """
        retry = True
        i = 0
        while retry:
            try:
                shortlist_metadata = dsf.retrieve_dataset_by_datasetkey(
                    bucket=self.params.bucket,
                    dataset_key=self.params.shortlist_key,
                    return_metadata=True)
                retry = False
            except Exception as e:
                if i < 5:
                    print(
                        "Could not retrieve shortlist %s from datastore because of exception %s, sleeping..."
                        % (self.params.shortlist_key, e))
                    time.sleep(retry_time)
                    i += 1
                else:
                    print(
                        "Could not retrieve shortlist %s from datastore because of exception %s, exiting"
                        % (self.params.shortlist_key, e))
                    return None

        datasets = self.get_shortlist_df()
        rows = []
        for assay, bucket in datasets:
            split_uuids = {'dataset_key': assay, 'bucket': bucket}
            for splitter in ['random', 'scaffold']:
                for split_combo in [[0.1, 0.1], [0.1, 0.2], [0.2, 0.2]]:
                    split_name = "%s_%d_%d" % (splitter, split_combo[0] * 100,
                                               split_combo[1] * 100)
                    try:
                        split_uuids[split_name] = self.return_split_uuid(
                            assay, bucket, splitter, split_combo)
                    except Exception as e:
                        print(e)
                        print("Splitting failed for dataset %s" % assay)
                        split_uuids[split_name] = None
                        continue
            rows.append(split_uuids)
        df = pd.DataFrame(rows)
        new_metadata = {}
        # Use replace() rather than strip('.csv'), which strips characters, not a suffix
        new_metadata['dataset_key'] = shortlist_metadata['dataset_key'].replace('.csv', '') + '_with_uuids.csv'
        new_metadata['has_uuids'] = True
        new_metadata['description'] = '%s, with UUIDs' % shortlist_metadata[
            'description']
        retry = True
        i = 0
        while retry:
            try:
                dsf.upload_df_to_DS(df,
                                    bucket=self.params.bucket,
                                    filename=new_metadata['dataset_key'],
                                    title=new_metadata['dataset_key'].replace(
                                        '_', ' '),
                                    description=new_metadata['description'],
                                    tags=[],
                                    key_values={},
                                    dataset_key=new_metadata['dataset_key'])
                retry = False
            except Exception as e:
                if i < 5:
                    print(
                        "Could not save new shortlist because of exception %s, sleeping..."
                        % e)
                    time.sleep(retry_time)
                    i += 1
                else:
                    #TODO: Add save to disk.
                    print(
                        "Could not save new shortlist because of exception %s, exiting"
                        % e)
                    retry = False
Example 9
# ksm: params.response_cols is required now by our code, though it's not marked as such in parameter_parser.py.
# Therefore, the following line will fail.
#(params_from_file_noy, dataset_obj_from_file_noy, df_delaney) = utils.delaney_objects(y = None)

# ksm: Creating the following dataset will fail now, because it doesn't actually contain the response columns.
#(params_from_file_wrongy, dataset_obj_from_file_wrongy, df_delaney) = utils.delaney_objects(y = ["not","a","task"])

delaney_from_disk = pd.read_csv("delaney-processed.csv")

if not datastore_is_down:
    (params_from_ds, dataset_obj_from_datastore,
     df_datastore) = utils.datastore_objects()
    #(params_from_ds_noy, dataset_obj_from_datastore_noy, df_datastore) = utils.datastore_objects(y = None)
    #(params_from_ds_wrongy, dataset_obj_from_datastore_wrongy, df_datastore) = utils.datastore_objects(y = ["not","a","task"])

    df_datastore = ds.retrieve_dataset_by_datasetkey(
        params_from_ds.dataset_key, params_from_ds.bucket)

DD = dc.data.datasets.DiskDataset


#***********************************************************************************
def test_create_model_dataset():
    """testing if classes are properly generated from the factory method. Asserting that the correct methods exist, and are callable. """

    (params_from_file, dataset_obj_from_file,
     df_delaney) = utils.delaney_objects()
    (params_from_file_scaffold, dataset_obj_from_file_scaffold,
     df_delaney) = utils.delaney_objects(split_strategy="train_valid_test",
                                         splitter="scaffold")

    test_list = []
Example 10
def analyze_split(params,
                  id_col='compound_id',
                  smiles_col='rdkit_smiles',
                  active_col='active'):
    """
    Evaluate the AVE bias for the training/validation and training/test set splits of the given dataset.

    Also show the active frequencies in each subset and for the dataset as a whole.
    The id_col, smiles_col and active_col arguments are defaults used when the corresponding values aren't found in the
    dataset metadata; if they are found, the metadata values are used instead.

    Args:
        params (argparse.Namespace): Pipeline parameters.

        id_col (str): Dataset column containing compound IDs.

        smiles_col (str): Dataset column containing SMILES strings.

        active_col (str): Dataset column containing binary classifications.

    Returns:
        :obj:`pandas.DataFrame`: Table of split subsets showing sizes, numbers and fractions of active compounds

    """
    dset_key = params.dataset_key
    bucket = params.bucket
    split_uuid = params.split_uuid

    ds_client = dsf.config_client()
    try:
        split_metadata = dsf.search_datasets_by_key_value('split_dataset_uuid',
                                                          split_uuid,
                                                          ds_client,
                                                          operator='in',
                                                          bucket=bucket)
        split_oid = split_metadata['dataset_oid'].values[0]
        split_df = dsf.retrieve_dataset_by_dataset_oid(split_oid,
                                                       client=ds_client)
    except Exception as e:
        print("Error when loading split file:\n%s" % str(e))
        raise

    try:
        dataset_df = dsf.retrieve_dataset_by_datasetkey(dset_key,
                                                        bucket,
                                                        client=ds_client)
        dataset_meta = dsf.retrieve_dataset_by_datasetkey(dset_key,
                                                          bucket,
                                                          client=ds_client,
                                                          return_metadata=True)
    except Exception as e:
        print("Error when loading dataset:\n%s" % str(e))
        raise
    kv_dict = dsf.get_key_val(dataset_meta['metadata'])
    id_col = kv_dict.get('id_col', id_col)
    smiles_col = kv_dict.get('smiles_col', smiles_col)
    active_col = kv_dict.get('response_col', active_col)

    try:
        print('Dataset has %d unique compound IDs' %
              len(set(dataset_df[id_col].values)))
        print('Split table has %d unique compound IDs' %
              len(set(split_df.cmpd_id.values)))

        dset_df = dataset_df.merge(split_df,
                                   how='inner',
                                   left_on=id_col,
                                   right_on='cmpd_id').drop('cmpd_id', axis=1)
    except Exception as e:
        print("Error when joining dataset with split dataset:\n%s" % str(e))
        raise

    featurization = feat.create_featurization(params)
    data = md.create_model_dataset(params, featurization, ds_client)
    data.get_featurized_data()
    feat_arr = data.dataset.X
    # TODO: impute missing values if necessary
    y = data.dataset.y.flatten()
    if len(set(y) - set([0, 1])) > 0:
        raise ValueError(
            'AVEMinSplitter only works on binary classification datasets')
    ids = data.dataset.ids
    active_ind = np.where(y == 1)[0]
    inactive_ind = np.where(y == 0)[0]
    active_feat = feat_arr[active_ind, :]
    inactive_feat = feat_arr[inactive_ind, :]
    num_active = len(active_ind)
    num_inactive = len(inactive_ind)
    active_ids = ids[active_ind]
    inactive_ids = ids[inactive_ind]
    active_id_ind = dict(zip(active_ids, range(len(active_ids))))
    inactive_id_ind = dict(zip(inactive_ids, range(len(inactive_ids))))
    if params.featurizer == 'ecfp':
        metric = 'jaccard'
    elif params.featurizer == 'graphconv':
        raise ValueError(
            "ave_min splitter dopesn't support graphconv features")
    else:
        metric = 'euclidean'

    # Calculate distance thresholds at which the nearest neighbor function should be evaluated
    if metric == 'jaccard':
        max_nn_dist = 1.0
    else:
        nan_mat = np.isnan(feat_arr)
        nnan = np.sum(nan_mat)
        if nnan > 0:
            log.info('Input feature matrix has %d NaN elements' % nnan)
            not_nan = ~nan_mat
            for i in range(feat_arr.shape[1]):
                feat_arr[nan_mat[:, i], i] = np.mean(feat_arr[not_nan[:, i],
                                                              i])
        nn_dist = np.sort(squareform(pdist(feat_arr, metric)))[:, 1]
        med_nn_dist = np.median(nn_dist)
        max_nn_dist = 3.0 * med_nn_dist
    ndist = 100
    dist_thresh = np.linspace(0.0, max_nn_dist, ndist)

    # Compute distance matrices between subsets
    num_workers = 1
    aa_dist = _calc_dist_mat(active_feat, active_feat, metric, None,
                             num_workers)
    ii_dist = _calc_dist_mat(inactive_feat, inactive_feat, metric, None,
                             num_workers)
    ai_dist = _calc_dist_mat(active_feat, inactive_feat, metric, None,
                             num_workers)
    ia_dist = ai_dist.transpose()

    subsets = sorted(set(dset_df.subset.values))
    subset_active_ind = {}
    subset_inactive_ind = {}

    if 'train' in subsets:
        # this is a TVT split
        subsets = ['train', 'valid', 'test']
        for subset in subsets:
            subset_df = dset_df[dset_df.subset == subset]
            active_df = subset_df[subset_df[active_col] == 1]
            inactive_df = subset_df[subset_df[active_col] == 0]
            subset_active_ids = active_df[id_col].values
            subset_inactive_ids = inactive_df[id_col].values
            subset_active_ind[subset] = [
                active_id_ind[id] for id in subset_active_ids
            ]
            subset_inactive_ind[subset] = [
                inactive_id_ind[id] for id in subset_inactive_ids
            ]

        taI = subset_active_ind['train']
        tiI = subset_inactive_ind['train']
        print("Results for %s split with %s %s features:" %
              (params.splitter, params.descriptor_type, params.featurizer))
        for valid_set in ['valid', 'test']:
            vaI = subset_active_ind[valid_set]
            viI = subset_inactive_ind[valid_set]
            split_params = ((vaI, viI, taI, tiI), aa_dist, ii_dist, ai_dist,
                            ia_dist, dist_thresh)
            _plot_nn_dist_distr(split_params)
            bias = _plot_bias(split_params, niter=0)
            print("For train/%s split: AVE bias = %.5f" % (valid_set, bias))
    else:
        # TODO: deal with k-fold splits later
        print('k-fold CV splits not supported yet')
        return

    # Tabulate the fractions of actives in the full dataset and each subset
    subset_list = []
    size_list = []
    frac_list = []
    active_frac_list = []

    dset_size = data.dataset.X.shape[0]
    dset_active = sum(data.dataset.y)
    subset_list.append('full dataset')
    size_list.append(dset_size)
    frac_list.append(1.0)
    active_frac_list.append(dset_active / dset_size)

    for subset in subsets:
        active_size = len(subset_active_ind[subset])
        inactive_size = len(subset_inactive_ind[subset])
        subset_size = active_size + inactive_size
        active_frac = active_size / subset_size
        subset_list.append(subset)
        size_list.append(subset_size)
        frac_list.append(subset_size / dset_size)
        active_frac_list.append(active_frac)
    frac_df = pd.DataFrame(
        dict(subset=subset_list,
             size=size_list,
             fraction=frac_list,
             active_frac=active_frac_list))
    print('\nSplit subsets:')
    print(frac_df)

    return frac_df
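A usage sketch (all values are placeholders; it assumes datastore access and that parse.wrapper accepts the keys shown, as in the examples above):

# Hypothetical call: build a parameter namespace and evaluate AVE bias for an existing split.
split_params = parse.wrapper(dict(
    dataset_key='my_project/my_classification_dataset.csv',   # placeholder dataset key
    bucket='gsk_ml',                                           # placeholder bucket
    split_uuid='abcd1234-0000-0000-0000-000000000000',         # placeholder split UUID
    splitter='scaffold',
    featurizer='ecfp',
    previously_split=True))
frac_df = analyze_split(split_params)
print(frac_df)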
Example 11
def diversity_plots(dset_key,
                    datastore=True,
                    bucket='public',
                    title_prefix=None,
                    ecfp_radius=4,
                    umap_file=None,
                    out_dir=None,
                    id_col='compound_id',
                    smiles_col='rdkit_smiles',
                    is_base_smiles=False,
                    response_col=None,
                    max_for_mcs=300):
    """
    Plot visualizations of diversity for an arbitrary table of compounds. At minimum, the file should contain
    columns for a compound ID and a SMILES string. Produces a clustered heatmap display of Tanimoto distances between
    compounds along with a 2D UMAP projection plot based on ECFP fingerprints, with points colored according to the response
    variable.

    Args:
        dset_key (str): Datastore key or filepath for dataset.

        datastore (bool): Whether to load dataset from datastore or from filesystem.

        bucket (str): Name of datastore bucket containing dataset.

        title_prefix (str): Prefix for plot titles.

        ecfp_radius (int): Radius for ECFP fingerprint calculation.

        umap_file (str, optional): Path to file to write UMAP coordinates to.

        out_dir (str, optional):  Output directory for plots and tables. If provided, plots will be output as PDF files rather
            than in the current notebook, and some additional CSV files will be generated.

        id_col (str): Column in dataset containing compound IDs.

        smiles_col (str): Column in dataset containing SMILES strings.

        is_base_smiles (bool): True if SMILES strings do not need to be salt-stripped and standardized.

        response_col (str): Column in dataset containing response values.

        max_for_mcs (int): Maximum dataset size for plots based on MCS distance. If the number of compounds is less than this
            value, an additional cluster heatmap and UMAP projection plot will be produced based on maximum common substructure
            distance.

    """
    # Load table of compound names, IDs and SMILES strings
    if datastore:
        cmpd_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket)
    else:
        cmpd_df = pd.read_csv(dset_key, index_col=False)
    cmpd_df = cmpd_df.drop_duplicates(subset=smiles_col)
    file_prefix = os.path.splitext(os.path.basename(dset_key))[0]
    if title_prefix is None:
        title_prefix = file_prefix.replace('_', ' ')
    compound_ids = cmpd_df[id_col].values
    smiles_strs = cmpd_df[smiles_col].values
    ncmpds = len(smiles_strs)
    # Strip salts, canonicalize SMILES strings and create RDKit Mol objects
    if is_base_smiles:
        base_mols = np.array([Chem.MolFromSmiles(s) for s in smiles_strs])
    else:
        print("Canonicalizing %d molecules..." % ncmpds)
        base_mols = np.array([
            struct_utils.base_mol_from_smiles(smiles) for smiles in smiles_strs
        ])
        for i, mol in enumerate(base_mols):
            if mol is None:
                print('Unable to get base molecule for compound %d = %s' %
                      (i, compound_ids[i]))
        print("Done")

    has_good_smiles = np.array([mol is not None for mol in base_mols])
    base_mols = base_mols[has_good_smiles]

    cmpd_df = cmpd_df[has_good_smiles]
    ncmpds = cmpd_df.shape[0]
    compound_ids = cmpd_df[id_col].values
    responses = None
    if response_col is not None:
        responses = cmpd_df[response_col].values
        uniq_responses = set(responses)
        if uniq_responses == set([0, 1]):
            response_type = 'binary'
            colorpal = {0: 'forestgreen', 1: 'red'}
        elif len(uniq_responses) <= 10:
            response_type = 'categorical'
            colorpal = sns.color_palette('husl', n_colors=len(uniq_responses))
        else:
            response_type = 'continuous'
            colorpal = sns.blend_palette(['red', 'green', 'blue'],
                                         12,
                                         as_cmap=True)

    # Generate ECFP fingerprints
    print("Computing fingerprints...")
    fps = [
        AllChem.GetMorganFingerprintAsBitVect(mol, ecfp_radius, 1024)
        for mol in base_mols if mol is not None
    ]
    print("Done")

    if ncmpds <= max_for_mcs:
        # Get MCS distance matrix and draw a heatmap
        print("Computing MCS distance matrix...")
        mcs_dist = dm.mcs(base_mols)
        print("Done")
        cmpd1 = []
        cmpd2 = []
        dist = []
        ind1 = []
        ind2 = []
        for i in range(ncmpds - 1):
            for j in range(i + 1, ncmpds):
                cmpd1.append(compound_ids[i])
                cmpd2.append(compound_ids[j])
                dist.append(mcs_dist[i, j])
                ind1.append(i)
                ind2.append(j)
        dist_df = pd.DataFrame({
            'compound_1': cmpd1,
            'compound_2': cmpd2,
            'dist': dist,
            'i': ind1,
            'j': ind2
        })
        dist_df = dist_df.sort_values(by='dist')
        print(dist_df.head(10))
        if out_dir is not None:
            dist_df.to_csv('%s/%s_mcs_dist_table.csv' % (out_dir, file_prefix),
                           index=False)
            for k in range(10):
                mol_i = base_mols[dist_df.i.values[k]]
                mol_j = base_mols[dist_df.j.values[k]]
                img_file_i = '%s/%d_%s.png' % (
                    out_dir, k, compound_ids[dist_df.i.values[k]])
                img_file_j = '%s/%d_%s.png' % (
                    out_dir, k, compound_ids[dist_df.j.values[k]])
                Draw.MolToFile(mol_i,
                               img_file_i,
                               size=(500, 500),
                               fitImage=False)
                Draw.MolToFile(mol_j,
                               img_file_j,
                               size=(500, 500),
                               fitImage=False)

        mcs_linkage = linkage(mcs_dist, method='complete')
        mcs_df = pd.DataFrame(mcs_dist,
                              columns=compound_ids,
                              index=compound_ids)
        if out_dir is not None:
            pdf_path = '%s/%s_mcs_clustermap.pdf' % (out_dir, file_prefix)
            pdf = PdfPages(pdf_path)
        g = sns.clustermap(mcs_df,
                           row_linkage=mcs_linkage,
                           col_linkage=mcs_linkage,
                           figsize=(12, 12),
                           cmap='plasma')
        if out_dir is not None:
            pdf.savefig(g.fig)
            pdf.close()

        # Draw a UMAP projection based on MCS distance
        mapper = umap.UMAP(n_neighbors=20,
                           min_dist=0.1,
                           n_components=2,
                           metric='precomputed',
                           random_state=17)
        reps = mapper.fit_transform(mcs_dist)
        rep_df = pd.DataFrame.from_records(reps, columns=['x', 'y'])
        rep_df['compound_id'] = compound_ids
        if out_dir is not None:
            pdf_path = '%s/%s_mcs_umap_proj.pdf' % (out_dir, file_prefix)
            pdf = PdfPages(pdf_path)
        fig, ax = plt.subplots(figsize=(12, 12))
        if responses is None:
            sns.scatterplot(x='x', y='y', data=rep_df, ax=ax)
        else:
            rep_df['response'] = responses
            sns.scatterplot(x='x',
                            y='y',
                            hue='response',
                            palette=colorpal,
                            data=rep_df,
                            ax=ax)
        ax.set_title("%s, 2D projection based on MCS distance" % title_prefix)
        if out_dir is not None:
            pdf.savefig(fig)
            pdf.close()
            rep_df.to_csv('%s/%s_mcs_umap_proj.csv' % (out_dir, file_prefix),
                          index=False)

    # Get Tanimoto distance matrix
    print("Computing Tanimoto distance matrix...")
    tani_dist = dm.tanimoto(fps)
    print("Done")
    # Draw a UMAP projection based on Tanimoto distance
    mapper = umap.UMAP(n_neighbors=20,
                       min_dist=0.1,
                       n_components=2,
                       metric='precomputed',
                       random_state=17)
    reps = mapper.fit_transform(tani_dist)
    rep_df = pd.DataFrame.from_records(reps, columns=['x', 'y'])
    rep_df['compound_id'] = compound_ids
    if responses is not None:
        rep_df['response'] = responses
    if umap_file is not None:
        rep_df.to_csv(umap_file, index=False)
        print("Wrote UMAP mapping to %s" % umap_file)
    if out_dir is not None:
        pdf_path = '%s/%s_tani_umap_proj.pdf' % (out_dir, file_prefix)
        pdf = PdfPages(pdf_path)
    fig, ax = plt.subplots(figsize=(12, 12))
    if responses is None:
        sns.scatterplot(x='x', y='y', data=rep_df, ax=ax)
    else:
        sns.scatterplot(x='x',
                        y='y',
                        hue='response',
                        palette=colorpal,
                        data=rep_df,
                        ax=ax)
    ax.set_title("%s, 2D projection based on Tanimoto distance" % title_prefix)
    if out_dir is not None:
        pdf.savefig(fig)
        pdf.close()

    # Draw a cluster heatmap based on Tanimoto distance
    tani_linkage = linkage(tani_dist, method='complete')
    tani_df = pd.DataFrame(tani_dist, columns=compound_ids, index=compound_ids)
    if out_dir is not None:
        pdf_path = '%s/%s_tanimoto_clustermap.pdf' % (out_dir, file_prefix)
        pdf = PdfPages(pdf_path)
    g = sns.clustermap(tani_df,
                       row_linkage=tani_linkage,
                       col_linkage=tani_linkage,
                       figsize=(12, 12),
                       cmap='plasma')
    if out_dir is not None:
        pdf.savefig(g.fig)
        pdf.close()
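A usage sketch for this fuller version, coloring the UMAP projection by a binary response column (the file path, output paths and column names are placeholders):

# Hypothetical call: load a local table, color points by an 'active' column, and save plots under ./plots.
diversity_plots('my_compounds.csv', datastore=False, out_dir='./plots',
                response_col='active', is_base_smiles=True,
                umap_file='./plots/my_compounds_umap.csv')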
Example 12
def _liability_dset_diversity(bucket='public',
                              feat_type='descriptors',
                              dist_metric='cosine',
                              **metric_kwargs):
    """
    Load datasets from datastore, featurize them, and plot distributions of their inter-compound
    distances.
    """
    log = logging.getLogger('ATOM')
    ds_client = dsf.config_client()
    ds_table = dsf.search_datasets_by_key_value(key='param',
                                                value=['PIC50', 'PEC50'],
                                                operator='in',
                                                bucket=bucket,
                                                client=ds_client)
    dset_keys = ds_table.dataset_key.values
    metadata = ds_table.metadata.values
    split = 'random'
    task_names = []
    num_cmpds = []
    for i, dset_key in enumerate(dset_keys):
        md_dict = dsf.metadata_to_dict(metadata[i])
        task_name = md_dict['task_name']
        num_cmpds = md_dict['CMPD_COUNT'][0]
        log.warning("Loading dataset for %s, %d compounds" %
                    (task_name, num_cmpds))
        dset_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket,
                                                     ds_client)
        dataset_dir = os.path.dirname(dset_key)
        dataset_file = os.path.basename(dset_key)
        if feat_type == 'descriptors':
            params = argparse.Namespace(
                dataset_dir=dataset_dir,
                dataset_file=dataset_file,
                y=task_name,
                bucket=bucket,
                descriptor_key=
                'all_GSK_Compound_2D_3D_MOE_Descriptors_Scaled_With_Smiles_And_Inchi',
                descriptor_type='MOE',
                splitter=split,
                id_col='compound_id',
                smiles_col='rdkit_smiles',
                featurizer='descriptors',
                prediction_type='regression',
                system='twintron-blue',
                datastore=True,
                transformers=True)
        elif feat_type == 'ECFP':
            params = argparse.Namespace(dataset_dir=dataset_dir,
                                        dataset_file=dataset_file,
                                        y=task_name,
                                        bucket=bucket,
                                        splitter=split,
                                        id_col='compound_id',
                                        smiles_col='rdkit_smiles',
                                        featurizer='ECFP',
                                        prediction_type='regression',
                                        system='twintron-blue',
                                        datastore=True,
                                        ecfp_radius=2,
                                        ecfp_size=1024,
                                        transformers=True)
        else:
            log.error("Feature type %s not supported" % feat_type)
            return
        log.warning("Featurizing data with %s featurizer" % feat_type)
        model_dataset = md.MinimalDataset(params)
        model_dataset.get_featurized_data(dset_df)
        num_cmpds = model_dataset.dataset.X.shape[0]
        if num_cmpds > 50000:
            log.warning("Too many compounds to compute distance matrix: %d" %
                        num_cmpds)
            continue
        plot_dataset_dist_distr(model_dataset.dataset, feat_type, dist_metric,
                                task_name, **metric_kwargs)
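A usage sketch (the bucket is a placeholder; it assumes datastore access, and any extra keyword arguments are passed through to the distance metric):

# Hypothetical call: plot inter-compound distance distributions for every PIC50/PEC50 dataset in a bucket.
_liability_dset_diversity(bucket='public', feat_type='ECFP', dist_metric='cosine')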