Ejemplo n.º 1
0
def load_process_DAVIS(path='./data',
                       binary=False,
                       convert_to_log=True,
                       threshold=30):
    """Download the DAVIS archive and flatten it into drug-target pair arrays.

    The archive is fetched with ``wget`` into ``path`` and extracted there.
    Three files are then read from the extracted ``DAVIS`` folder:
    ``affinity.txt`` (space-separated matrix, no header), ``SMILES.txt`` and
    ``target_seq.txt`` (JSON objects whose values are used in file order).
    Every (drug, target) combination becomes one entry, drugs varying slowest,
    with the label taken from ``affinity[drug_index, target_index]``.

    Args:
        path: Directory used for the download and the extraction. It is
            created when it does not exist.
        binary: When true, labels become 1 where the affinity is strictly
            below ``threshold`` and 0 otherwise.
        convert_to_log: When true (and ``binary`` is false), labels are passed
            through ``convert_y_unit(..., 'nM', 'p')``.
        threshold: Cut-off used for the binary labels.

    Returns:
        Tuple ``(SMILES, Target_seq, y)`` of three NumPy arrays with one entry
        per drug-target pair.

    Raises:
        IndexError: If the affinity matrix has fewer rows than drugs or fewer
            columns than targets.
    """
    print('Beginning Processing...')

    if not os.path.exists(path):
        # exist_ok closes the window between the check above and the creation.
        os.makedirs(path, exist_ok=True)

    url = 'https://drive.google.com/uc?export=download&id=14h-0YyHN8lxuc0KV3whsaSaA-4KSmiVN'
    saved_path = wget.download(url, path)

    print('Beginning to extract zip file...')
    # Named `archive` so the builtin `zip` stays usable inside this function.
    with ZipFile(saved_path, 'r') as archive:
        archive.extractall(path=path)

    data_dir = os.path.join(path, 'DAVIS')
    affinity = pd.read_csv(os.path.join(data_dir, 'affinity.txt'),
                           header=None,
                           sep=' ')

    with open(os.path.join(data_dir, 'target_seq.txt')) as f:
        target = list(json.load(f).values())

    with open(os.path.join(data_dir, 'SMILES.txt')) as f:
        drug = list(json.load(f).values())

    # Fetch the underlying array once instead of on every inner iteration.
    matrix = affinity.values

    # Cartesian product drug x target, drugs varying slowest.
    SMILES = [smiles for smiles in drug for _ in target]
    Target_seq = [seq for _ in drug for seq in target]
    y = [
        matrix[i, j] for i in range(len(drug)) for j in range(len(target))
    ]

    if binary:
        print(
            'Default binary threshold for the binding affinity scores are 30, you can adjust it by using the "threshold" parameter'
        )
        y = [1 if below else 0 for below in np.array(y) < threshold]
    elif convert_to_log:
        print('Default set to logspace (nM -> p) for easier regression')
        y = convert_y_unit(np.array(y), 'nM', 'p')

    print('Done!')
    return np.array(SMILES), np.array(Target_seq), np.array(y)
Ejemplo n.º 2
0
def process_BindingDB(path=None,
                      df=None,
                      y='Kd',
                      binary=False,
                      convert_to_log=True,
                      threshold=30):
    """Reduce a BindingDB table to (SMILES, target sequence, label) arrays.

    Rows are kept only when the target has exactly one protein chain, the
    ligand SMILES, the ligand InChI and the selected measurement are present,
    at least one of PubChem CID / UniProt ID is present, and the measurement
    is at most 10000000.0. Leading '>' / '<' qualifiers in the measurement are
    dropped before it is converted to float.

    Args:
        path: Tab-separated BindingDB file. Only read when ``df`` is None.
        df: Already-loaded BindingDB table; takes precedence over ``path``.
            It is not modified.
        y: Measurement used as label: 'Kd', 'IC50', 'Ki' or 'EC50' (selects
            the matching '<name> (nM)' column).
        binary: When true, labels become 1 where the measurement is strictly
            below ``threshold`` and 0 otherwise.
        convert_to_log: When true (and ``binary`` is false), labels are passed
            through ``convert_y_unit(..., 'nM', 'p')``.
        threshold: Cut-off used for the binary labels.

    Returns:
        Tuple ``(SMILES, target sequences, labels)`` of three arrays with one
        entry per retained row.

    Raises:
        ValueError: If ``y`` is not one of the supported measurements, or if
            neither ``df`` nor ``path`` is given.
    """
    label_columns = {
        'Kd': 'Kd (nM)',
        'IC50': 'IC50 (nM)',
        'Ki': 'Ki (nM)',
        'EC50': 'EC50 (nM)',
    }
    if y not in label_columns:
        # Previously this only printed the hint and then crashed with an
        # unrelated UnboundLocalError further down.
        raise ValueError('select Kd, Ki, IC50 or EC50')
    idx_str = label_columns[y]

    if df is not None:
        # NOTE(review): the directory creation is kept only for callers that
        # pass a working directory alongside df; confirm whether any rely on
        # it. It must be skipped for path=None, which os.path.exists rejects.
        if path is not None and not os.path.exists(path):
            os.makedirs(path)
        print('Loading Dataset from the pandas input...')
    else:
        if path is None:
            raise ValueError(
                'Provide either a path to the BindingDB file or a DataFrame via df.'
            )
        print('Loading Dataset from path...')
        # A missing file now surfaces as read_csv's own error instead of a
        # directory being created in its place first.
        try:
            # on_bad_lines replaces error_bad_lines (removed in pandas 2.0);
            # 'warn' matches the old skip-and-warn behaviour.
            df = pd.read_csv(path, sep='\t', on_bad_lines='warn')
        except TypeError:
            # pandas < 1.3 does not know on_bad_lines.
            df = pd.read_csv(path, sep='\t', error_bad_lines=False)

    print('Beginning Processing...')
    chains_col = 'Number of Protein Chains in Target (>1 implies a multichain complex)'
    keep = ((df[chains_col] == 1.0)
            & df['Ligand SMILES'].notnull()
            & df[idx_str].notnull())

    wanted_columns = [
        'BindingDB Reactant_set_id', 'Ligand InChI', 'Ligand SMILES',
        'PubChem CID', 'UniProt (SwissProt) Primary ID of Target Chain',
        'BindingDB Target Chain  Sequence', idx_str
    ]
    # .loc with a mask plus rename() yields a fresh frame, so the column
    # assignment below never writes into a view of the caller's data.
    df_want = df.loc[keep, wanted_columns].rename(
        columns={
            'BindingDB Reactant_set_id': 'ID',
            'Ligand SMILES': 'SMILES',
            'Ligand InChI': 'InChI',
            'PubChem CID': 'PubChem_ID',
            'UniProt (SwissProt) Primary ID of Target Chain': 'UniProt_ID',
            'BindingDB Target Chain  Sequence': 'Target Sequence',
            idx_str: 'Label'
        })

    # astype(str) lets the qualifier stripping work even when the column was
    # already parsed as numeric (the .str accessor needs string values).
    cleaned = df_want['Label'].astype(str)
    for qualifier in ('>', '<'):
        cleaned = cleaned.str.replace(qualifier, '', regex=False)
    df_want['Label'] = cleaned.astype(float)

    # have at least uniprot or pubchem ID
    has_identifier = (df_want['PubChem_ID'].notnull()
                      | df_want['UniProt_ID'].notnull())
    df_want = df_want[has_identifier
                      & df_want['InChI'].notnull()
                      & (df_want['Label'] <= 10000000.0)]
    print('There are ' + str(len(df_want)) + ' drug target pairs.')

    raw_labels = df_want['Label'].values
    if binary:
        print(
            'Default binary threshold for the binding affinity scores are 30, you can adjust it by using the "threshold" parameter'
        )
        labels = [1 if below else 0 for below in raw_labels < threshold]
    elif convert_to_log:
        print('Default set to logspace (nM -> p) for easier regression')
        labels = convert_y_unit(raw_labels, 'nM', 'p')
    else:
        labels = raw_labels

    return (df_want['SMILES'].values, df_want['Target Sequence'].values,
            np.array(labels))