Esempio n. 1
0
def build_clinical_trial_tables(output_dir):
    """
    Build the clinical trial and drug trial tables by querying the
    clinicaltrial.gov API. Queries are made by drug names from the drug
    synonyms table.

    @param output_dir: [`string`] The file path to the directory with all
                                    PharmacoDB tables
    @return: None
    """
    # Read the drug synonym table to get the list of drug names to query
    synonym_path = os.path.join(output_dir, 'drug_synonym.csv')
    drug_df = pd.read_csv(synonym_path)[['drug_id', 'drug_name']]

    # Fetch clinical trial records for every drug name, in parallel batches
    print('Getting clinical trials from clinicaltrials.gov...')
    studies_df = pd.concat(
        parallelize(list(drug_df['drug_name']),
                    get_clinical_trials_by_drug_names, 50))

    # The API returns every field wrapped in an array; explode each
    # object-typed column so one row holds one scalar value
    for col in studies_df.dtypes[studies_df.dtypes == 'object'].index.values:
        studies_df = studies_df.explode(col)

    # Drop the API rank and map API field names to PharmacoDB column names
    studies_df = studies_df.drop(columns='Rank').rename(columns={
        'OrgStudyId': 'clinical_trial_id',
        'NCTId': 'nct',
        'SeeAlsoLinkURL': 'link',
        'OverallStatus': 'status'
    })

    # Clinical trial table: one row per unique trial id
    clin_trial_df = (
        studies_df[['clinical_trial_id', 'nct', 'link', 'status']]
        .drop_duplicates('clinical_trial_id')
        .reset_index(drop=True))
    write_table(dt.Frame(clin_trial_df),
                'clinical_trial',
                output_dir,
                add_index=False)

    # Drug trial table: map each trial back to its drug_id via drug name
    drug_trial_df = (
        studies_df[['clinical_trial_id', 'drug_name']]
        .drop_duplicates()
        .merge(drug_df, on='drug_name')
        .drop(columns='drug_name'))
    write_table(dt.Frame(drug_trial_df),
                'drug_trial',
                output_dir,
                add_index=False)
def build_drug_synonym_df(drug_file, metadata_dir, output_dir):
    """
    Build the drug synonym table from the drug metadata annotations and
    the drug table already written to output_dir, then write it to disk.
    """
    # Load the metadata annotations and the previously written drug table
    drug_metadata = get_metadata(drug_file, metadata_dir)
    drug_df = pd.read_csv(os.path.join(output_dir, 'drug.csv'))

    # Keep only the metadata columns whose name mentions 'drugid'
    # Right now only FDA col is dropped, but may be more metadata in the future
    id_regex = re.compile('drugid')
    relevant_cols = [col for col in drug_metadata.columns
                     if id_regex.search(col)]
    drug_cols = drug_metadata[relevant_cols]

    # Melt the synonym columns into rows and join against the drug table
    drug_synonym_df = (
        melt_and_join(drug_cols, 'unique.drugid', drug_df)
        .rename(columns={'id': 'drug_id', 'value': 'drug_name'}))

    # Add blank col for dataset_id (TODO)
    drug_synonym_df['dataset_id'] = np.nan

    # Convert to datatable.Frame for fast write to disk
    return write_table(Frame(drug_synonym_df), 'drug_synonym', output_dir)
def build_tissue_synonym_df(tissue_file, metadata_dir, output_dir):
    """
    Build the tissue synonym table from the tissue metadata annotations and
    the tissue table (assumed to already exist in output_dir), then write it.
    """
    # Load the metadata annotations and the previously written tissue table
    tissue_metadata = get_metadata(tissue_file, metadata_dir)
    tissue_df = pd.read_csv(os.path.join(output_dir, 'tissue.csv'))

    # Keep only the metadata columns whose name mentions 'tissueid'
    id_regex = re.compile('tissueid')
    relevant_cols = [col for col in tissue_metadata.columns
                     if id_regex.search(col)]
    tissue_cols = tissue_metadata[relevant_cols]

    # Melt the synonym columns into rows and join against the tissue table
    tissue_synonym_df = (
        melt_and_join(tissue_cols, 'unique.tissueid', tissue_df)
        .rename(columns={'id': 'tissue_id', 'value': 'tissue_name'}))

    # Add blank col for dataset_id (TODO)
    tissue_synonym_df['dataset_id'] = np.nan

    # Convert to datatable.Frame for fast write to disk
    return write_table(Frame(tissue_synonym_df), 'tissue_synonym', output_dir)
def build_cell_synonym_df(cell_file, metadata_dir, output_dir):
    """
    Build the cell synonym table from the cell metadata annotations and
    the cell table already written to output_dir, then write it to disk.
    """
    # Load the metadata annotations and the previously written cell table
    cell_metadata = get_metadata(cell_file, metadata_dir)
    cell_df = pd.read_csv(os.path.join(output_dir, 'cell.csv'))

    # Keep only the metadata columns whose name mentions 'cellid'
    id_regex = re.compile('cellid')
    relevant_cols = [col for col in cell_metadata.columns
                     if id_regex.search(col)]
    cell_columns = cell_metadata[relevant_cols]

    # Melt the synonym columns into rows and join against the cell table
    cell_synonym_df = (
        melt_and_join(cell_columns, 'unique.cellid', cell_df)
        .rename(columns={'id': 'cell_id', 'value': 'cell_name'}))

    # Add blank col for dataset_id (TODO)
    cell_synonym_df['dataset_id'] = np.nan

    # Convert to datatable.Frame for fast write to disk
    return write_table(Frame(cell_synonym_df), 'cell_synonym', output_dir)
Esempio n. 5
0
def build_cellosaurus_df(cellosaurus_path, output_dir, cell_df):
    """
    Build cellosaurus table.

    @param cellosaurus_path: [`string`] Full file path to cellosaurus file
    @param output_dir: [`string`] The directory to write the cellosaurus table
    @param cell_df: [`datatable.Frame`] The cell table; should be renamed, keyed,
                                        and shouldn't have 'tissue_id' column
    @return: [`datatable.Frame`] The cellosaurus table
    """
    with open(cellosaurus_path) as f:
        lines = f.readlines()

    # Skip the 55-line file header; entries are separated by '//' lines
    entries = ''.join(lines[55:]).split('//\n')
    entry_list = [entry.split('\n') for entry in entries]
    # Each record line is '<TAG>   <value>'; keep only lines that split
    # into a (tag, value) pair
    entry_split_list = [[item.split('   ') for item in entry]
                        for entry in entry_list]
    entry_tuple_list = [[(item[0], item[1]) for item in entry if len(item) > 1]
                        for entry in entry_split_list]

    # Parse entries in parallel. The context manager guarantees the pool is
    # shut down even if a worker raises — the original only closed the pool
    # on the happy path, leaking worker processes on error.
    with Pool(cpu_count() - 1) as pool:
        dict_list = pool.map(build_defaultdict, entry_tuple_list)

    # Collapse repeated tags into a single '|||'-delimited string per key
    dict_list = [{key: '|||'.join(value)
                  for key, value in dict(item).items()} for item in dict_list]

    cellosaurus_df = pd.DataFrame(dict_list)
    cellosaurus_df.dropna(axis=1, how='all', inplace=True)

    # Drop AG and DT columns (age of donor, date)
    cellosaurus_df.drop(columns=['AG', 'DT'], inplace=True)

    # Lower-case all tag columns, rename id/ac, and add cell_id join column
    cellosaurus_df.rename(columns={col: col.lower()
                                   for col in cellosaurus_df.columns},
                          inplace=True)
    cellosaurus_df.rename(columns={
        'id': 'identifier',
        'ac': 'accession'
    },
                          inplace=True)
    cellosaurus_df['cell_id'] = cellosaurus_df['identifier']

    # Convert to datatable, join with cell_df, and keep only matched rows
    df = join_tables(dt.Frame(cellosaurus_df), cell_df, 'cell_id')
    df = df[dt.f.cell_id >= 1, :]
    df = df[:, [
        'cell_id', 'identifier', 'accession', 'as', 'sy', 'dr', 'rx', 'ww',
        'cc', 'st', 'di', 'ox', 'hi', 'oi', 'sx', 'ca'
    ]]
    df = write_table(df, 'cellosaurus', output_dir)
    return df
Esempio n. 6
0
def build_gene_drug_table(chembl_df, drugbank_df, target_df, output_dir):
    """
    Build a join table...

    @param chembl_df: [`pd.DataFrame`] The ChEMBL drug target table
    @param drugbank_df: [`pd.DataFrame`] The DrugBank drug target table
    @param target_df: [`datatable.Frame`] The target table, keyed
    @param output_dir: [`string`] The file path with all final PharmacoDB tables
    @return: [`datatable.Frame`] The gene_target table
    """
    # Stack the (target name, uniprot) pairs from both sources, de-duplicated
    gene_target_df = (
        pd.concat([chembl_df[['name', 'uniprot_id']].copy(),
                   drugbank_df[['name', 'uniprot_id']].copy()])
        .rename(columns={'name': 'target_id'})
        .drop_duplicates())

    # Map each unique UniProt accession to its ENSEMBL gene id(s)
    unique_uniprot = pd.Series(pd.unique(gene_target_df['uniprot_id']))
    mappings = pd.concat(
        parallelize(unique_uniprot, map_uniprot_to_ensembl, 1000))
    mappings.drop_duplicates(inplace=True)

    # Swap uniprot_id for the mapped gene id
    gene_target_df = (
        pd.merge(gene_target_df, mappings, on='uniprot_id')
        .drop(columns=['uniprot_id']))

    # Load and key the gene table from output_dir
    gene_file = os.path.join(output_dir, 'gene.csv')
    if not os.path.exists(gene_file):
        raise FileNotFoundError(f"There is no gene file in {output_dir}!")
    gene_df = rename_and_key(dt.fread(gene_file, sep=","), 'gene_id')

    # Join with the gene table, then with the target table
    gene_target_df = dt.Frame(gene_target_df)
    gene_target_df = join_tables(gene_target_df, gene_df, 'gene_id')
    gene_target_df = join_tables(gene_target_df, target_df, 'target_id')

    # Keep only rows that matched both joins, then drop duplicate rows
    gene_target_df = gene_target_df[(dt.f.target_id >= 1) &
                                    (dt.f.gene_id >= 1), :]
    gene_target_df = gene_target_df[0, :, dt.by(gene_target_df.names)]

    return write_table(gene_target_df,
                       'gene_target',
                       output_dir,
                       add_index=False)
Esempio n. 7
0
def build_compound_target_table(chembl_df, drugbank_df, target_df, output_dir,
                                compound_synonym_file):
    """
    Using data from the Drugbank and ChEMBL drug target files and 
    the target table, build the drug target table.

    @param chembl_df: [`dt.Frame`] The ChEMBL drug target table
    @param drugbank_df: [`dt.Frame`] The DrugBank drug target table
    @param target_df: [`datatable.Frame`] The target table, keyed
    @param output_dir: [`string`] The file path with all final PharmacoDB tables
    @param compound_synonym_file: [`string`] The file path to the compound synonym table
    @return: [`dt.Frame`] The drug target table
    """
    # Load compound synonym table from output_dir
    if not os.path.exists(compound_synonym_file):
        raise FileNotFoundError(
            f"The file {compound_synonym_file} doesn't exist!")
    drug_syn_df = dt.fread(compound_synonym_file)
    # Join drugbank df with drug table
    # Drop the columns not needed for the name -> compound_id mapping
    del drug_syn_df[:, ['dataset_id', 'id']]
    # Round-trip through Arrow to get a polars frame for the join.
    # NOTE(review): pl.DataFrame.drop_duplicates was renamed to .unique()
    # in newer polars releases — confirm the pinned polars version.
    drug_syn_df = pl.from_arrow(drug_syn_df.to_arrow()) \
        .drop_duplicates()
    drugbank_df = pl.from_arrow(
        drugbank_df[:, ['name', 'compound_name']].to_arrow())
    # Attach compound_id to each DrugBank target via the compound_name column
    drugbank_df = drugbank_df.join(drug_syn_df, on='compound_name')
    # Combine ChEMBL and Drugbank tables to make drug target table
    drug_target_df = pd.concat([
        chembl_df.to_pandas()[['name', 'compound_id']].copy(),
        drugbank_df.to_pandas()[['name', 'compound_id']].copy()
    ])
    drug_target_df.rename(columns={'name': 'target_id'}, inplace=True)
    drug_target_df.drop_duplicates(inplace=True)
    # Join with target table
    drug_target_df = dt.Frame(drug_target_df)
    drug_target_df = join_tables(drug_target_df, target_df, 'target_id')
    # Drop rows with no target_id, drop duplicates
    drug_target_df = drug_target_df[dt.f.target_id >= 1, :]
    # dt.by over all columns with row index 0 keeps one row per unique group
    drug_target_df = drug_target_df[0, :, dt.by(drug_target_df.names)]
    # One more Arrow round-trip through polars to drop rows with nulls
    drug_target_df = dt.Frame(
        pl.from_arrow(drug_target_df.to_arrow()) \
            .drop_nulls() \
            .to_arrow())
    drug_target_df = write_table(drug_target_df,
                                 'compound_target',
                                 output_dir,
                                 add_index=False)
    return drug_target_df
Esempio n. 8
0
def build_target_table(chembl_df, drugbank_df, output_dir):
    """
    Using data from the Drugbank and ChEMBL drug target files and
    the UniProt API, build the target table.

    @param chembl_df: [`dt.Frame`] The ChEMBL drug target table
    @param drugbank_df: [`dt.Frame`] The DrugBank drug target table
    @param output_dir: [`string`] The file path to write the final target table
    @return: [`dt.Frame`] The target table
    """
    # Stack target names from both sources; de-duplicate via pandas
    combined = dt.rbind([chembl_df['name'], drugbank_df['name']])
    unique_targets = combined.to_pandas()
    unique_targets.drop_duplicates(inplace=True)
    # Persist the table, then key it on target_id for downstream joins
    target_df = write_table(dt.Frame(unique_targets), 'target', output_dir)
    return rename_and_key(target_df, 'target_id')
Esempio n. 9
0
def build_drug_target_table(chembl_df, drugbank_df, target_df, output_dir):
    """
    Using data from the Drugbank and ChEMBL drug target files and
    the target table, build the drug target table.

    @param chembl_df: [`pd.DataFrame`] The ChEMBL drug target table
    @param drugbank_df: [`pd.DataFrame`] The DrugBank drug target table
    @param target_df: [`datatable.Frame`] The target table, keyed
    @param output_dir: [`string`] The file path with all final PharmacoDB tables
    @return: [`datatable.Frame`] The drug target table
    """
    # Load drug synonym table from output_dir
    drug_synonym_file = os.path.join(output_dir, 'drug_synonym.csv')
    if not os.path.exists(drug_synonym_file):
        raise FileNotFoundError(
            f"There is no drug synonym file in {output_dir}!")
    drug_syn_df = pd.read_csv(drug_synonym_file, dtype={'drug_id': 'int32'})

    # Attach drug_id to DrugBank rows via drug name
    # (TODO: are we really using drug name to map?)
    drugbank_df = pd.merge(drugbank_df, drug_syn_df, on='drug_name')
    # TODO: from 7521 down to only 122 rows :/

    # Stack the (target name, drug id) pairs from both sources
    drug_target_df = (
        pd.concat([chembl_df[['name', 'drug_id']].copy(),
                   drugbank_df[['name', 'drug_id']].copy()])
        .rename(columns={'name': 'target_id'})
        .drop_duplicates())

    # Join with target table
    drug_target_df = join_tables(dt.Frame(drug_target_df), target_df,
                                 'target_id')
    # Keep only rows that matched the join, then drop duplicate rows
    drug_target_df = drug_target_df[dt.f.target_id >= 1, :]
    drug_target_df = drug_target_df[0, :, dt.by(drug_target_df.names)]

    return write_table(drug_target_df,
                       'drug_target',
                       output_dir,
                       add_index=False)