Example #1
def build_target_tables(drugbank_file, chembl_file, output_dir,
                        compound_synonym_file):
    """
    Build the target and drug target tables using data from Drugbank
    and ChEMBL.

    @param drugbank_file: [`string`] The full file path to Drugbank targets
    @param chembl_file: [`string`] The full file path to ChEMBL targets
    @param output_dir: [`string`] The directory of all final PharmacoDB tables
    @param compound_synonym_file: [`string`] The file path to the compound synonym table
    @return: None
    """
    # Get Drugbank data
    if not os.path.exists(drugbank_file):
        raise FileNotFoundError(f"The file {drugbank_file} doesn't exist!")
    drugbank_df = dt.fread(drugbank_file)
    drugbank_df.names = {
        'polypeptide.external.identifiers.UniProtKB': 'uniprot_id',
        'drugName': 'compound_name'
    }

    # Get ChEMBL data
    if not os.path.exists(chembl_file):
        raise FileNotFoundError(f"The file {chembl_file} doesn't exist!")
    chembl_df = dt.fread(chembl_file)
    chembl_df.names = {'pref_name': 'name', 'accession': 'uniprot_id'}
    chembl_df.key = chembl_df.names[0]

    target_df = build_target_table(chembl_df, drugbank_df, output_dir)
    build_compound_target_table(chembl_df, drugbank_df, target_df, output_dir,
                                compound_synonym_file)
    build_gene_target_table(chembl_df, drugbank_df, target_df, output_dir)
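A minimal usage sketch for build_target_tables; the paths below are placeholders, and the build_* helpers it calls are assumed to be importable from the same module:

# Hypothetical invocation; all paths are placeholders.
build_target_tables(
    drugbank_file='rawdata/drugbank_targets.csv',
    chembl_file='rawdata/chembl_targets.csv',
    output_dir='latest',
    compound_synonym_file='latest/compound_synonym.jay'
)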
def chunk_append_to_table(file_path: str,
                          table: sqlalchemy.Table,
                          chunksize: int = 10000000) -> None:
    """
    Append to an existing database table after the highest value of the id
    column.
    """
    table_name = table.__table__.name
    logger.info(
        f'\tAppending data from {os.path.basename(file_path)} to {table_name}...'
    )
    # Get the highest index from the table so far
    with engine.connect() as con:
        max_index = con.execute(f'SELECT MAX(id) FROM {table_name}').first()[0]
    # Read in the raw table data and filter to the index of interest
    data_df = dt.fread(file_path)
    session = Session(bind=engine)
    logger.info(f'Inserting after table id: {max_index}')
    bulk_insert_chunk(data_df=data_df,
                      table=table,
                      session=session,
                      chunksize=chunksize,
                      start_index=(max_index - 1))
    logger.info(
        f'\tAppending data from {os.path.basename(file_path)} to {table_name}... DONE!\n'
    )
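A hedged usage sketch for chunk_append_to_table; it relies on module-level engine, Session, logger and bulk_insert_chunk objects, and despite the sqlalchemy.Table annotation the body accesses table.__table__, so an ORM-mapped class is assumed here (GeneCompoundDataset is hypothetical):

# Hypothetical call; GeneCompoundDataset is a declarative class mapped to an
# existing table with an integer `id` primary key.
chunk_append_to_table(
    file_path='latest/gene_compound_dataset.jay',
    table=GeneCompoundDataset,
    chunksize=1_000_000
)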
Example #3
def test_view_to_jay():
    # See issue #3099
    DT1 = dt.Frame([None] * 10)[:3, :]
    saved = DT1.to_jay()
    DT2 = dt.fread(saved)
    assert DT2.type == dt.Type.void
    assert_equals(DT1, DT2)
Example #4
def test_save_to_jay(tempfile_jay):
    src = [d(1, 1, 1), d(2001, 12, 13), d(2026, 5, 9), None, d(1956, 11, 11)]
    DT = dt.Frame(src)
    DT.to_jay(tempfile_jay)
    del DT
    DT2 = dt.fread(tempfile_jay)
    assert DT2.shape == (5, 1)
    assert DT2.type == dt.Type.date32
    assert DT2.to_list()[0] == src
def bulk_insert(file_path, table):
    """Batched INSERT statements via the ORM "bulk", using dictionaries."""
    logger.info(f'\tInserting data from {os.path.basename(file_path)}...')
    df = dt.fread(file_path).to_pandas()
    row_dict = create_records(df)
    session = Session(bind=engine)
    session.bulk_insert_mappings(table, row_dict)
    session.commit()
    logger.info(
        f'\tInserting data from {os.path.basename(file_path)}... DONE\n')
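create_records is not shown in this example; given that its output is fed straight into bulk_insert_mappings, a plausible sketch is simply (an assumption, not the original helper):

import pandas as pd

def create_records(df: pd.DataFrame) -> list:
    # Convert each row of the DataFrame into a plain dict, the format
    # expected by Session.bulk_insert_mappings.
    return df.to_dict(orient='records')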
Example #6
def load_from_jay(filename):
    DT = dt.fread(filename)
    frame = MetaFrame()
    frame.df = DT
    frame.data = DT.to_list()
    frame.names = list(DT.names)
    frame.types = [_ltype_to_pytype[lt] for lt in DT.ltypes]
    frame.nkeys = len(DT.key)
    print(f"{frame.name} = <loaded from {filename}>")
    return frame
Example #7
def build_gene_compound_tissue_dataset_df(
        gene_sig_dir: str,
        pset_name: str,
        ignore_psets: list = ['NCI60', 'PRISM']) -> pd.DataFrame:
    """
    @param gene_sig_dir: [`string`] The file path to the directory
        containing the gene signatures for each PSet
    @param pset_name: [`string`] The name of the PSet
    @param ignore_psets: [`list`] Names of PSets that have no gene signatures
        and should be skipped
    @return: [`DataFrame`] The gene_compound_tissue_dataset table for this
        PSet, containing all statistics
    """
    # Early return for PSets without gene signatures
    if pset_name in ignore_psets:
        return None
    # Get gene_sig_df from gene_sig_file
    try:
        gene_sig_df = dt.fread(
            os.path.join(gene_sig_dir, 'gene_compound_tissue_dataset.csv'),
            memory_limit=int(1e10)  # 10 GBs
        )
    except ValueError:
        return None
    # Filter to this PSet and convert to pandas for the column selection below
    gene_sig_df = gene_sig_df[f.dataset == pset_name, :].to_pandas()
    # Extract relevant columns
    # gene_compound_tissue_dataset = gctd
    gctd_df = gene_sig_df.loc[:, [
        'gene', 'compound', 'tissue', 'dataset', 'estimate', 'lower_analytic',
        'upper_analytic', 'lower_permutation', 'upper_permutation', 'n',
        'pvalue_analytic', 'pvalue_permutation', 'df', 'fdr_analytic',
        'fdr_permutation', 'significant_permutation', 'mDataType'
    ]]
    # Add missing columns
    gctd_df.loc[:, 'sens_stat'] = 'AAC'
    gctd_df.loc[:, 'permutation_done'] = 0
    gctd_df.loc[~gctd_df['fdr_permutation'].isna(), 'permutation_done'] = 1
    # Rename foreign key columns
    gctd_df.rename(columns={
        'gene': 'gene_id',
        'compound': 'compound_id',
        'tissue': 'tissue_id',
        'dataset': 'dataset_id'
    },
                   inplace=True)
    gctd_df['gene_id'] = gctd_df['gene_id'] \
        .apply(lambda x: re.sub(r'\..*$', '', x))
    # Reorder columns
    return gctd_df[[
        'gene_id', 'compound_id', 'dataset_id', 'tissue_id', 'estimate',
        'lower_analytic', 'upper_analytic', 'lower_permutation',
        'upper_permutation', 'n', 'pvalue_analytic', 'pvalue_permutation',
        'df', 'fdr_analytic', 'fdr_permutation', 'significant_permutation',
        'permutation_done', 'sens_stat', 'mDataType'
    ]]
Example #8
def test_time64_read_from_csv():
    DT = dt.fread("timestamp\n"
                  "2001-05-12T12:00:00\n"
                  "2013-06-24T14:00:01\n"
                  "2023-11-02T23:59:59.999999\n")
    assert DT.type == dt.Type.time64
    assert DT.shape == (3, 1)
    assert DT.to_list()[0] == [
        d(2001, 5, 12, 12),
        d(2013, 6, 24, 14, 0, 1),
        d(2023, 11, 2, 23, 59, 59, 999999)
    ]
def bulk_insert_chunk_from_file(file_path, table, chunksize=100000):
    """Batched INSERT statements via the ORM "bulk", using dictionaries."""
    logger.info(f'\tInserting data from {os.path.basename(file_path)}...')
    session = Session(bind=engine)
    data_df = dt.fread(file_path)
    bulk_insert_chunk(data_df=data_df,
                      table=table,
                      session=session,
                      chunksize=chunksize,
                      start_index=0)
    logger.info(
        f'\tInserting data from {os.path.basename(file_path)}... DONE!\n')
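The bulk_insert_chunk helper that this function and chunk_append_to_table delegate to is not part of this example; a minimal sketch of what it could look like, assuming the raw data carries an `id` column and only rows beyond start_index are inserted, chunk by chunk (an assumption, not the original code):

from datatable import dt, f

def bulk_insert_chunk(data_df, table, session, chunksize=100000, start_index=0):
    # Hypothetical reconstruction: keep only rows past start_index, then
    # insert them through the ORM in chunks to bound memory usage.
    data_df = data_df[f.id > start_index, :]
    nrows = data_df.nrows
    for start in range(0, nrows, chunksize):
        chunk = data_df[start:min(start + chunksize, nrows), :].to_pandas()
        session.bulk_insert_mappings(table, chunk.to_dict(orient='records'))
        session.commit()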
Example #10
def test_save_to_jay(tempfile_jay):
    src = [
        d(1901, 2, 3, 4, 5, 6),
        d(2001, 12, 13, 0, 30),
        d(2026, 5, 9, 12), None,
        d(1956, 11, 11, 11, 11, 11)
    ]
    DT = dt.Frame(src)
    DT.to_jay(tempfile_jay)
    del DT
    DT2 = dt.fread(tempfile_jay)
    assert DT2.shape == (5, 1)
    assert DT2.types == [dt.Type.time64]
    assert DT2.to_list()[0] == src
Example #11
def build_gene_target_table(chembl_df, drugbank_df, target_df, output_dir):
    """
    Build the gene_target join table linking genes to their targets.

    @param chembl_df: [`pd.DataFrame`] The ChEMBL drug target table
    @param drugbank_df: [`pd.DataFrame`] The DrugBank drug target table
    @param target_df: [`datatable.Frame`] The target table, keyed
    @param output_dir: [`string`] The file path with all final PharmacoDB tables
    @return: [`datatable.Frame`] The gene_target table
    """
    # Get target-uniprot mappings from ChEMBL and Drugbank tables
    gene_target_df = pd.concat([
        chembl_df.to_pandas()[['name', 'uniprot_id']],
        drugbank_df.to_pandas()[['name', 'uniprot_id']]
    ])
    gene_target_df.rename(columns={'name': 'target_id'}, inplace=True)
    gene_target_df.drop_duplicates(inplace=True)

    # Retrieve Uniprot-ENSEMBL gene ID mappings
    uniprot_ids = pd.Series(pd.unique(gene_target_df['uniprot_id']))
    uniprot_ensembl_mappings = pd.concat(
        parallelize(uniprot_ids, map_uniprot_to_ensembl, 1000))
    uniprot_ensembl_mappings.drop_duplicates(inplace=True)

    # Join gene_target table with gene table based on uniprot-ensembl mappings
    gene_target_df = pd.merge(gene_target_df,
                              uniprot_ensembl_mappings,
                              on='uniprot_id')
    gene_target_df.drop(columns=['uniprot_id'], inplace=True)

    # Load and key the gene table from output_dir
    gene_file = os.path.join(output_dir, 'gene.jay')
    if not os.path.exists(gene_file):
        raise FileNotFoundError(f"There is no gene file in {output_dir}!")
    gene_df = dt.fread(gene_file, sep=",")
    gene_df = rename_and_key(gene_df, 'gene_id')

    # Join the gene_target table with the gene and target tables
    gene_target_df = dt.Frame(gene_target_df)
    gene_target_df = join_tables(gene_target_df, gene_df, 'gene_id')
    gene_target_df = join_tables(gene_target_df, target_df, 'target_id')

    # Drop columns that didn't join and drop duplicates
    gene_target_df = gene_target_df[(dt.f.target_id >= 1) &
                                    (dt.f.gene_id >= 1), :]
    gene_target_df = gene_target_df[0, :, dt.by(gene_target_df.names)]

    gene_target_df.to_jay(os.path.join(output_dir, 'gene_target.jay'))
    return gene_target_df
Example #12
def test_with_stats(tempfile_jay):
    DT = dt.Frame([-1, 0, 1, None, 12, 3, None], stype='date32')
    # precompute stats so that they get stored in the Jay file
    assert DT.countna1() == 2
    assert DT.min1() == datetime.date(1969, 12, 31)
    assert DT.max1() == datetime.date(1970, 1, 13)
    DT.to_jay(tempfile_jay)
    del DT
    DT = dt.fread(tempfile_jay)
    # assert_equals() also checks frame integrity, including validity of stats
    assert_equals(DT,
        dt.Frame([-1, 0, 1, None, 12, 3, None], stype='date32'))
    assert DT.countna1() == 2
    assert DT.min1() == datetime.date(1969, 12, 31)
    assert DT.max1() == datetime.date(1970, 1, 13)
Example #13
def build_compound_target_table(chembl_df, drugbank_df, target_df, output_dir,
                                compound_synonym_file):
    """
    Using data from the Drugbank and ChEMBL drug target files and 
    the target table, build the drug target table.

    @param chembl_df: [`dt.Frame`] The ChEMBL drug target table
    @param drugbank_df: [`dt.Frame`] The DrugBank drug target table
    @param target_df: [`datatable.Frame`] The target table, keyed
    @param output_dir: [`string`] The file path with all final PharmacoDB tables
    @param compound_synonym_file: [`string`] The file path to the compound synonym table
    @return: [`dt.Frame`] The drug target table
    """
    # Load compound synonym table from output_dir
    if not os.path.exists(compound_synonym_file):
        raise FileNotFoundError(
            f"The file {compound_synonym_file} doesn't exist!")
    drug_syn_df = dt.fread(compound_synonym_file)
    # Join drugbank df with drug table
    del drug_syn_df[:, ['dataset_id', 'id']]
    drug_syn_df = pl.from_arrow(drug_syn_df.to_arrow()) \
        .drop_duplicates()
    drugbank_df = pl.from_arrow(
        drugbank_df[:, ['name', 'compound_name']].to_arrow())
    drugbank_df = drugbank_df.join(drug_syn_df, on='compound_name')
    # Combine ChEMBL and Drugbank tables to make drug target table
    drug_target_df = pd.concat([
        chembl_df.to_pandas()[['name', 'compound_id']].copy(),
        drugbank_df.to_pandas()[['name', 'compound_id']].copy()
    ])
    drug_target_df.rename(columns={'name': 'target_id'}, inplace=True)
    drug_target_df.drop_duplicates(inplace=True)
    # Join with target table
    drug_target_df = dt.Frame(drug_target_df)
    drug_target_df = join_tables(drug_target_df, target_df, 'target_id')
    # Drop rows with no target_id, drop duplicates
    drug_target_df = drug_target_df[dt.f.target_id >= 1, :]
    drug_target_df = drug_target_df[0, :, dt.by(drug_target_df.names)]
    drug_target_df = dt.Frame(
        pl.from_arrow(drug_target_df.to_arrow()) \
            .drop_nulls() \
            .to_arrow())
    drug_target_df = write_table(drug_target_df,
                                 'compound_target',
                                 output_dir,
                                 add_index=False)
    return drug_target_df
Example #14
def test_with_stats(tempfile_jay):
    src = [
        d(1901, 12, 13, 0, 11, 59),
        d(2001, 2, 17, 0, 30), None,
        d(2026, 5, 19, 12, 0, 1, 1111), None,
        d(1956, 11, 11, 11, 11, 11)
    ]
    DT = dt.Frame(src)
    # precompute stats so that they get stored in the Jay file
    assert DT.countna1() == 2
    assert DT.min1() == d(1901, 12, 13, 0, 11, 59)
    assert DT.max1() == d(2026, 5, 19, 12, 0, 1, 1111)
    DT.to_jay(tempfile_jay)
    DTnew = dt.fread(tempfile_jay)
    # assert_equals() also checks frame integrity, including validity of stats
    assert_equals(DTnew, DT)
    assert DTnew.countna1() == 2
    assert DTnew.min1() == d(1901, 12, 13, 0, 11, 59)
    assert DTnew.max1() == d(2026, 5, 19, 12, 0, 1, 1111)
Example #15
def combine_primary_tables(data_dir: str, output_dir: str,
                           compound_meta_file: str) -> dict:
    """
    Build all the primary tables, i.e., tables that require no joins,
    and return them in a dictionary.

    @param data_dir: [`string`] The file path to read the PSet tables
    @param output_dir: [`string`] The file path to write the final tables
    @param compound_meta_file: [`string`] The file path to the compound
        annotation file mapping compound names to PharmacoDB compound UIDs
    @return: [`dict(string: datatable.Frame)`] A dictionary of all the primary
        tables, with names as keys
    """
    # Load, concatenate, and write primary tables to disk
    tissue_df = load_table("tissue", data_dir)
    tissue_df = tissue_df[:, :, sort(f.name, na_position="last")]
    tissue_df = write_table(tissue_df, "tissue", output_dir)
    gene_df = load_join_write("gene", data_dir, output_dir)
    dataset_df = load_join_write("dataset", data_dir, output_dir)

    # Annotate compounds
    compound_df = load_table("compound", data_dir)
    compound_meta_df = dt.fread(compound_meta_file)
    compound_meta_df.names = {
        "unique.drugid": "name",
        "PharmacoDB.uid": "compound_uid"
    }

    # Add compound_uid to compound table and write to disk
    compound_meta_df.key = "name"
    compound_df = compound_df[:, :, dt.join(compound_meta_df)]
    compound_df = write_table(compound_df, "compound", output_dir)

    # Transform tables to be used for joins
    dfs = {}
    dfs["tissue"] = rename_and_key(tissue_df, "tissue_id")
    dfs["compound"] = rename_and_key(compound_df, "compound_id")
    dfs["gene"] = rename_and_key(gene_df, "gene_id")
    dfs["dataset"] = rename_and_key(dataset_df, "dataset_id")
    return dfs
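rename_and_key is used throughout these examples but is not shown; a plausible reconstruction of its behaviour, inferred from how the keyed frames are joined later (an assumption, not the original helper):

from datatable import dt

def rename_and_key(df: dt.Frame, join_col: str, key_col: str = 'name') -> dt.Frame:
    # Hypothetical: rename the `name` column to the foreign-key name used in
    # downstream joins, then key the frame (datatable requires the right-hand
    # frame of a join to be keyed).
    df.names = {key_col: join_col}
    df.key = join_col
    return df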
Example #16
# In[1]:


from datatable import dt, f
from operator import and_, or_, xor, eq
from functools import reduce
import re


# In[2]:


file_path = "https://github.com/samukweku/data_files/raw/master/msleep.txt"

DT = dt.fread(file_path)

DT.head(5)


# ### Filtering Rows Based on a Numeric Variable

# You can filter numeric variables based on their values. Commonly used operators include >, >=, <, <=, == and !=.
#
# Note that in datatable, filtering happens in the ``i`` section:

# In[3]:


DT[f.sleep_total > 18,  ["name", "sleep_total"]]
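
# Multiple conditions can also be combined in the ``i`` section. A small sketch
# using the `reduce` and `and_` imports from the first cell (column names come
# from the msleep data loaded above):

DT[(f.sleep_total > 15) & (f.bodywt < 1), ["name", "sleep_total", "bodywt"]]

# or, equivalently, folding a list of boolean f-expressions:

DT[reduce(and_, [f.sleep_total > 15, f.bodywt < 1]),
   ["name", "sleep_total", "bodywt"]]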

melt(DT).stypes


# In[35]:


melt(DT, id_vars=['first', 'last'])


# In[36]:


url = 'https://raw.githubusercontent.com/tidyverse/tidyr/main/data-raw/billboard.csv'
DT = dt.fread(url)
measure_vars = [name for name in DT.names if name.startswith('wk')]
melt(DT, measure_vars=measure_vars, variable_name='week')


# In[37]:


DT = dt.Frame([{'A': 'a', 'B': 1, 'C': 2},
 {'A': 'b', 'B': 3, 'C': 4},
 {'A': 'c', 'B': 5, 'C': 6}])

DT


# In[38]:
Example #18
def test_to_jay():
    DT1 = dt.Frame([None] * 13)
    saved = DT1.to_jay()
    DT2 = dt.fread(saved)
    assert DT2.type == dt.Type.void
    assert_equals(DT1, DT2)
# ## **.SD - Subset of Data**

# I will be using [Jose Morales](https://twitter.com/jmrlsz)'s excellent [post](https://rpubs.com/josemz/SDbf) to show how .SD's functionality can be replicated in Python's [datatable](https://datatable.readthedocs.io/en/latest/index.html).
#
#  Not all functions can be replicated; R [data.table](https://github.com/Rdatatable/data.table) has a whole lot more functions and features that are not yet implemented in [datatable](https://datatable.readthedocs.io/en/latest/index.html).
#
#
# The data file can be accessed [here](https://github.com/samukweku/data_files/raw/master/iris.csv)

# In[1]:

from datatable import dt, by, f, update, sort

# In[2]:

DT = dt.fread('https://github.com/samukweku/data_files/raw/master/iris.csv')
DT.head()

# ####  Number of unique observations per column

# In[3]:

# DT[, lapply(.SD, uniqueN)] --> Rdatatable

DT.nunique()

# #### Mean of all columns by `species`

# In[4]:

# DT[, lapply(.SD, mean), by = species] --> Rdatatable
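
# A sketch of a datatable equivalent, selecting the float columns by type so
# the grouping column is left out (assumes the iris column layout loaded above):

DT[:, dt.mean(f[float]), by('species')]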
Example #20
def build_cellosaurus_df(cellosaurus_path, output_dir):
    """
    Build cellosaurus table.

    @param cellosaurus_path: [`string`] Full file path to the cellosaurus file
    @param output_dir: [`string`] The directory to write the cellosaurus table;
        it must already contain the cell table ('cell.jay')
    @return: [`datatable.Frame`] The cellosaurus table
    """

    with open(cellosaurus_path) as f:
        file = [line for line in f]

    # Skip the file header lines that precede the first entry
    file = file[55:]
    entries = ''.join(file).split('//\n')
    entry_list = [entry.split('\n') for entry in entries]
    entry_split_list = [[item.split('   ') for item in entry]
                        for entry in entry_list]
    entry_tuple_list = [[(item[0], item[1]) for item in entry if len(item) > 1]
                        for entry in entry_split_list]

    pool = Pool(cpu_count() - 1)

    dict_list = pool.map(build_defaultdict, entry_tuple_list)
    dict_list = [dict(item) for item in dict_list]
    dict_list = [{key: '|||'.join(value)
                  for key, value in dct.items()} for dct in dict_list]

    cellosaurus_df = pd.DataFrame(dict_list)
    cellosaurus_df.dropna(axis=1, how='all', inplace=True)

    # Always close your pool or you will have a bunch of processes doing nothing
    pool.close()

    # Drop AG and DT columns (age of donor, date)
    cellosaurus_df.drop(columns=['AG', 'DT'], inplace=True)

    # Rename cols and add cell_id column
    rename_dict = {col: col.lower() for col in cellosaurus_df.columns}
    cellosaurus_df.rename(columns=rename_dict, inplace=True)
    cellosaurus_df.rename(columns={
        'id': 'identifier',
        'ac': 'accession'
    },
                          inplace=True)
    cellosaurus_df['cell_id'] = cellosaurus_df['identifier']

    # Load cell_df
    cell_path = os.path.join(output_dir, 'cell.jay')
    cell_df = rename_and_key(dt.fread(cell_path), 'cell_id')

    # Convert to datatable and join with cell_df
    cellosaurus_df = dt.Frame(cellosaurus_df)
    cellosaurus_df.key = 'cell_id'
    df = cell_df[:, :, join(dt.Frame(cellosaurus_df))]
    df = df[dt.f.id >= 1, :]
    df = df[:, [
        'cell_id', 'id', 'accession', 'as', 'sy', 'dr', 'rx', 'ww', 'cc', 'st',
        'di', 'ox', 'hi', 'oi', 'sx', 'ca'
    ]]
    df.names = {'cell_id': 'identifier', 'id': 'cell_id'}
    df = write_table(df, 'cellosaurus', output_dir)
    return df
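build_defaultdict is mapped over the parsed cellosaurus entries above but is not shown; a plausible sketch, inferred from how its output is later collapsed into '|||'-joined strings (an assumption, not the original helper):

from collections import defaultdict

def build_defaultdict(entry_tuples):
    # Hypothetical: collapse the (tag, value) pairs of a single cellosaurus
    # entry into tag -> list of values, since tags such as 'SY' or 'DR' can
    # appear on multiple lines.
    entry_dict = defaultdict(list)
    for tag, value in entry_tuples:
        entry_dict[tag].append(value)
    return entry_dict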
Example #21
def test_arr32_to_jay():
    DT = dt.Frame(W=[['ad', 'dfkvjn'], ['b b, f', None], ['r', 'w', 'dfvdf']])
    assert DT.type == dt.Type.arr32(dt.Type.str32)
    out = DT.to_jay()
    RES = dt.fread(out)
    assert_equals(RES, DT)
Example #22
def test_do_not_read_from_csv():
    assert dt.options.fread.parse_dates is True
    with dt.options.context(**{"fread.parse_dates": False}):
        DT = dt.fread("X\n1990-10-10\n2011-11-11\n2020-02-05\n")
        assert_equals(DT,
                      dt.Frame(X=["1990-10-10", "2011-11-11", "2020-02-05"]))
Example #23
def test_read_date32_from_csv():
    DT = dt.fread("a-b-c\n1999-01-01\n2010-11-11\n2020-12-31\n")
    assert_equals(DT, dt.Frame([d(1999,1,1), d(2010,11,11), d(2020,12,31)],
                               names=["a-b-c"]))
Example #24
def test_view_to_jay_2():
    DT1 = dt.Frame(A=[None, None], B=[None, "qoo"])[0, :]
    saved = DT1.to_jay()
    DT2 = dt.fread(saved)
    assert_equals(DT2, dt.Frame(A=[None], B=[None] / dt.str32))
Example #25
def build_gene_compound_dataset_df(gene_compound_dataset_file, gene_file,
                                   compound_file, dataset_file, output_dir,
                                   compound_names):
    """
    Build the gene_compound_dataset table.

    @param gene_compound_dataset_file: [`str`] Path to the gene signature .csv file.
    @param gene_file: [`str`] Path to the gene table .csv file.
    @param compound_file: [`str`] Path to the compound table .csv file.
    @param dataset_file: [`str`] Path to the dataset table .csv file.
    @param output_dir: [`str`] Path to write the output file to.
    @param compound_names: [`str`] Path to an optional .csv file mapping
        updated compound names to the dataset. This ensures that corrected
        compound annotations still make it into the database without the need
        to rerun all the gene signatures.

    @return: [`datatable.Frame`] Writes the 'gene_compound_dataset.jay' file to
        output_dir, then returns the table.
    """
    # -- Check the input files exist
    for fl in [
            gene_compound_dataset_file, gene_file, compound_file, dataset_file
    ]:
        if not os.path.exists(fl):
            raise FileNotFoundError(f'Could not find the {fl}')

    # -- Read in mapping tables
    gene_dt = fread(gene_file)
    compound_dt = fread(compound_file)
    dataset_dt = fread(dataset_file)

    # -- Read in gene_compound_tissue table
    gcd_dt = fread(gene_compound_dataset_file)

    # -- Fix names and assign missing columns
    gcd_dt.names = {
        'gene': 'gene_id',
        'compound': 'compound_id',
        'dataset': 'dataset_id',
        'lower': 'lower_analytic',
        'upper': 'upper_analytic',
        'pvalue': 'pvalue_analytic',
        'fdr': 'fdr_analytic'
    }
    del gcd_dt[:, ['significant', 'tissue']]

    # Determine missing columns and assign them, so we don't have to change code
    # when new columns are added
    gcd_table_columns = np.asarray(
        ('id', 'gene_id', 'compound_id', 'dataset_id', 'estimate',
         'lower_analytic', 'upper_analytic', 'lower_permutation',
         'upper_permutation', 'n', 'pvalue_analytic', 'pvalue_permutation',
         'df', 'fdr_analytic', 'fdr_permutation', 'significant_permutation',
         'permutation_done', 'sens_stat', 'mDataType'))
    gcd_missing_columns = np.setdiff1d(gcd_table_columns,
                                       np.asarray(gcd_dt.names))
    for col in gcd_missing_columns:
        gcd_dt[col] = None
    gcd_dt1 = gcd_dt[:, list(gcd_table_columns)]
    # Sanity check the columns are there
    if not np.all(gcd_table_columns == np.asarray(gcd_dt1.names)):
        raise ValueError('The build_gene_compound_dataset table '
                         'has missing columns!')

    gcd_dt1[:, update(sens_stat='AAC', permutation_done=0)]

    # -- Map to existing FK ids
    # gene id
    gcd_dt1.names = {'gene_id': 'gene_name'}
    gene_dt.names = {'id': 'gene_id', 'name': 'gene_name'}
    gene_dt.key = 'gene_name'
    # NOTE: the g object references the joined tables namespace
    gcd_dt1[:, update(gene_id=g.gene_id), join(gene_dt)]

    # make sure all genes mapped
    failed_genes = np.unique(gcd_dt1[dt.isna(f.gene_id), 'gene_name'] \
        .to_numpy().flatten())
    if len(failed_genes) > 0:
        warnings.warn(f'The genes: {failed_genes} did not map!')

    if (np.any(gcd_dt1[:, dt.isna(f.gene_id)].to_numpy())):
        warnings.warn('Some gene_ids in gene_compound_dataset are still NA! '
                      'Dropping the missing rows...')
        gcd_dt1 = gcd_dt1[~dt.isna(f.gene_id), :]

    # fix compound names
    ## FIXME:: Remove this when gene signatures are regenerated
    ## START patch
    fix_names_df = dt.fread(compound_names)
    fix_names_df[f.dataset == "GDSC_2020(v1-8.2)", update(dataset="GDSC_v1")]
    fix_names_df[f.dataset == "GDSC_2020(v2-8.2)", update(dataset="GDSC_v2")]
    fix_names_df.names = {
        "drugid": "compound_name",
        "unique.drugid": "compound_id",
        "dataset": "dataset_id"
    }
    fix_names_df.key = ["compound_name", "dataset_id"]
    gcd_dt1.names = {'compound_id': 'compound_name'}
    gcd_dt1[~dt.isna(g.compound_id),
            update(compound_name=g.compound_id),
            join(fix_names_df)]
    ## END patch

    # compound id
    compound_dt.names = {'id': 'compound_id', 'name': 'compound_name'}
    del compound_dt[:, 'compound_uid']
    compound_dt.key = 'compound_name'
    gcd_dt1[:, update(compound_id=g.compound_id), join(compound_dt)]

    if np.any(gcd_dt1[:, dt.isna(f.compound_id)].to_numpy()):
        warnings.warn("Some compound_ids in gene_compound_dataset are stll "
                      "NA! Dropping the missing rows...")
        gcd_dt1 = gcd_dt1[~dt.isna(f.compound_id)]

    # dataset id
    gcd_dt1.names = {'dataset_id': 'dataset_name'}
    dataset_dt.names = {'id': 'dataset_id', 'name': 'dataset_name'}
    dataset_dt.key = 'dataset_name'
    gcd_dt1[:, update(dataset_id=g.dataset_id), join(dataset_dt)]

    # -- Sort then assign the primary key column
    gcd_dt2 = gcd_dt1[:, list(gcd_table_columns),
                      sort('gene_id', 'compound_id', 'dataset_id', 'mDataType')]
    gcd_dt2[:, update(id=range(1, gcd_dt2.nrows + 1))]

    # Sanity check we didn't lose any rows
    if not gcd_dt.nrows == gcd_dt2.nrows:
        warnings.warn('The gene_compound_dataset table has lost some rows!')

    gcd_dt2.to_jay(os.path.join(output_dir, 'gene_compound_dataset.jay'))