# Imports assumed for these snippets: pandas/NumPy plus the AMPL
# (atomsci-ampl) curation utilities used throughout.
import os
import zipfile

import numpy as np
import pandas as pd

from atomsci.ddm.utils import curate_data, struct_utils


def curate():
    """
    Curate dataset for model fitting
    """
    if (not os.path.isfile('delaney-processed_curated.csv')
            and not os.path.isfile('delaney-processed_curated_fit.csv')
            and not os.path.isfile('delaney-processed_curated_external.csv')):
        raw_df = pd.read_csv('delaney-processed.csv')

        # Generate base SMILES strings and InChI keys
        raw_df['rdkit_smiles'] = raw_df['smiles'].apply(
            curate_data.base_smiles_from_smiles)
        raw_df['inchi_key'] = raw_df['smiles'].apply(
            struct_utils.smiles_to_inchi_key)

        # Check for duplicate compounds based on the SMILES string.
        # Average the response value for duplicates and remove compounds
        # whose response values vary more than the allowed thresholds:
        #   tolerance = percentage an individual response value is allowed
        #               to differ from the average and still be included
        #   max_std   = maximum allowed standard deviation for the computed
        #               average response value
        tolerance = 10  # percentage
        column = 'measured log solubility in mols per litre'
        list_bad_duplicates = 'Yes'
        data = raw_df
        max_std = 100000  # essentially turned off in this example
        data['compound_id'] = data['inchi_key']
        curated_df = curate_data.average_and_remove_duplicates(
            column,
            tolerance,
            list_bad_duplicates,
            data,
            max_std,
            compound_id='compound_id',
            smiles_col='rdkit_smiles')

        # Add additional columns for other forms of prediction,
        # e.g. classification and multi-task
        mean = np.mean(curated_df[column])
        column_class = column + '_class'
        curated_df[column_class] = curated_df[column].apply(
            lambda x: 1.0 if x > mean else 0.0)

        # Make a copy of each column for multi-task models
        curated_df[column + '2'] = curated_df[column]
        curated_df[column_class + '2'] = curated_df[column_class]

        # Sanity check: the curated Delaney set should contain 1117 compounds
        assert (curated_df.shape[0] == 1117
                ), 'Error: Incorrect number of compounds'

        curated_df.to_csv('delaney-processed_curated.csv')

        # Split by reproducible index: 1000 compounds for fitting,
        # 117 held out as an external prediction set
        curated_df.tail(1000).to_csv('delaney-processed_curated_fit.csv')
        curated_df.head(117).to_csv('delaney-processed_curated_external.csv')

    assert (os.path.isfile('delaney-processed_curated.csv'))
    assert (os.path.isfile('delaney-processed_curated_fit.csv'))
    assert (os.path.isfile('delaney-processed_curated_external.csv'))
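
# --- Usage sketch (hypothetical, not part of the original example) ---
# After curate() runs, the reproducible splits can be reloaded for modeling.
# load_delaney_splits is an illustrative helper name, not an AMPL function.
def load_delaney_splits():
    fit_df = pd.read_csv('delaney-processed_curated_fit.csv')
    external_df = pd.read_csv('delaney-processed_curated_external.csv')
    # 1000 fit compounds + 117 external compounds = 1117 curated compounds
    assert fit_df.shape[0] + external_df.shape[0] == 1117
    return fit_df, external_df


# Example 2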
def curate():
    """
    Curate dataset for model fitting
    """
    with zipfile.ZipFile('ci8b00785_si_001.zip', 'r') as zip_ref:
        zip_ref.extractall('clearance')

    raw_df = pd.read_csv('clearance/SuppInfo/Dataset_chembl_clearcaco.txt',
                         sep=';', dtype='str')

    # Response variable: hlm_clearance[mL.min-1.g-1]

    # Replace decimal commas with decimal points
    raw_df['hlm_clearance[mL.min-1.g-1]'] = raw_df['hlm_clearance[mL.min-1.g-1]'].str.replace(',', '.')
    # Convert to floating point values
    raw_df['hlm_clearance[mL.min-1.g-1]'] = raw_df['hlm_clearance[mL.min-1.g-1]'].astype(float)

    hlmc_df = raw_df.rename(columns={'hlm_clearance[mL.min-1.g-1]': 'value'}, inplace=False)
    hlmc_df.rename(columns={'ID': 'compound_id'}, inplace=True)
    hlmc_df['rdkit_smiles'] = hlmc_df['Canonical_Smiles'].apply(struct_utils.base_smiles_from_smiles)
    col = ['compound_id', 'rdkit_smiles', 'value']
    hlmc_df = hlmc_df[col]
    # Drop rows with missing values
    hlmc_df = hlmc_df.dropna()
    print(hlmc_df.shape)

    assert(hlmc_df.shape == (5348, 3)), 'Error: Incorrect data size'

    tolerance = 10  # percentage
    column = 'value'
    list_bad_duplicates = 'Yes'
    data = hlmc_df
    max_std = 20
    curated_df = curate_data.average_and_remove_duplicates(
        column,
        tolerance,
        list_bad_duplicates,
        data,
        max_std,
        compound_id='compound_id',
        smiles_col='rdkit_smiles')

    data_filename = "hlm_clearance_curated.csv"

    nr, nc = curated_df.shape

    curated_df.to_csv(data_filename, index=False)

    # Split by reproducible index: a fit set and an external set for prediction
    curated_df.tail(5000).to_csv('hlm_clearance_curated_fit.csv')
    curated_df.head(348).to_csv('hlm_clearance_curated_external.csv')
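
# --- Usage sketch (hypothetical, not part of the original example) ---
# Reload the reproducible splits written above. The curated row count depends
# on how many replicates average_and_remove_duplicates collapses or rejects,
# so only upper bounds on the requested split sizes are checked here.
def load_hlm_splits():
    fit_df = pd.read_csv('hlm_clearance_curated_fit.csv')
    external_df = pd.read_csv('hlm_clearance_curated_external.csv')
    assert fit_df.shape[0] <= 5000 and external_df.shape[0] <= 348
    return fit_df, external_df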
# Example 3
    def combine_replicates(self,
                           data_frame,
                           ignore_compound_id,
                           tolerance=10,
                           max_std=1,
                           output_value_col=None,
                           label_actives=True,
                           active_thresh=None,
                           date_col=None):
        """
            Combine replicates by taking average and discarding molecules with high variation in the measured value

        Args:
            data_frame: target specific subset of data_frame
            ignore_compound_id: when combing replicates across data sources, we have to assume the same compound will have differnt IDs, so you must use the smiles to match across datasources.
            tolerance: percent variation between replicates tolerated
            max_std: maximum standard deviation between replicates tolerated
            output_value_col: Optional; the column name to use in the output data frame for the averaged data.
            label_actives: If True, generate an additional column 'active' indicating whether the mean value is above a threshold specified by active_thresh.
            active_thresh: The threshold to be used for labeling compounds as active or inactive.
                           If active_thresh is None (the default), the threshold used is the minimum reported value across all records
                           with left-censored values (i.e., those with '<' in the relation column).
            date_col: The input data frame column containing dates when the assay data was uploaded. If not None, the code will assign the earliest
                     date among replicates to the aggregate data record.
        Returns
            A data frame of compounds with averaged values and a dataframe with compounds that were rejected as having too much variation
        """

        column = self.value_col
        smiles_col = self.base_smiles_col
        compound_id = self.id_col
        if ignore_compound_id:
            compound_id = smiles_col
        relation_col = self.relation_col
        ##################################################
        ### Run a diagnostic to look for and report outliers;
        ### outliers are then removed.
        ### TODO: we might want to add an option to report outliers without removing them
        ##################################################
        ### Setting list_bad_duplicates to 'Yes' would be redundant since bad duplicates are saved to file now
        list_bad_duplicates = 'No'
        curated_df = curate_data.average_and_remove_duplicates(
            column,
            tolerance,
            list_bad_duplicates,
            data_frame,
            max_std,
            compound_id=compound_id,
            smiles_col=smiles_col)
        save_ids = curated_df[compound_id].unique().tolist()
        reject = data_frame[~(data_frame[compound_id].isin(save_ids))]
        keep_df = data_frame[data_frame[compound_id].isin(save_ids)]

        ###
        ### TODO: This hardcodes the column names and should use the id_col value to set the column name.
        ###       For now, the compound ID column needs to be called "compound_id"
        ###       and the relation column needs to be called "relation"!
        ###       Need to change this.
        data_frame = curate_data.aggregate_assay_data(
            keep_df,
            value_col=column,
            # Pass through this method's aggregation options instead of
            # hardcoding the defaults (date_col assumed supported here).
            output_value_col=output_value_col,
            label_actives=label_actives,
            active_thresh=active_thresh,
            id_col=compound_id,
            smiles_col=smiles_col,
            relation_col=relation_col,
            date_col=date_col)

        return data_frame, reject
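
# --- Illustration (standalone sketch, not part of the original class) ---
# A minimal stand-in for the replicate filter described in the docstring
# above: replicates pass when no value deviates from their mean by more than
# `tolerance` percent and their standard deviation stays within `max_std`.
# This mirrors the intent of curate_data.average_and_remove_duplicates; the
# actual AMPL implementation may differ in detail.
def replicates_within_tolerance(values, tolerance=10, max_std=1):
    values = np.asarray(values, dtype=float)
    mean = values.mean()
    if values.std() > max_std:
        return False
    return bool(np.all(np.abs(values - mean) <= np.abs(mean) * tolerance / 100.0))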