def build_target_tables(drugbank_file, chembl_file, output_dir, compound_synonym_file): """ Build the target and drug target tables using data from Drugbank and ChEMBL. @param drugbank_file: [`string`] The full file path to Drugbank targets @param chembl_file: [`string`] The full file path to ChEMBL targets @param output_dir: [`string`] The directory of all final PharmacoDB tables @param compound_synonym_file: [`string`] The file path to the compound synonym table @return: None """ # Get Drugbank data if not os.path.exists(drugbank_file): raise FileNotFoundError(f"The file {drugbank_file} doesn't exist!") drugbank_df = dt.fread(drugbank_file) drugbank_df.names = { 'polypeptide.external.identifiers.UniProtKB': 'uniprot_id', 'drugName': 'compound_name' } # Get ChEMBL data if not os.path.exists(chembl_file): raise FileNotFoundError(f"The file {chembl_file} doesn't exist!") chembl_df = dt.fread(chembl_file) chembl_df.names = {'pref_name': 'name', 'accession': 'uniprot_id'} chembl_df.key = chembl_df.names[0] target_df = build_target_table(chembl_df, drugbank_df, output_dir) build_compound_target_table(chembl_df, drugbank_df, target_df, output_dir, compound_synonym_file) build_gene_target_table(chembl_df, drugbank_df, target_df, output_dir)
def chunk_append_to_table(file_path: str, table: sqlalchemy.Table, chunksize: int = 10000000) -> None: """ Append to an existing database table after the highest value of the id column. """ table_name = table.__table__.name logger.info( f'\tAppending data from {os.path.basename(file_path)} to {table_name}...' ) # Get the highest index from the table so far with engine.connect() as con: max_index = con.execute(f'SELECT MAX(id) FROM {table_name}').first()[0] # Read in the raw table data and filter to the index of interest data_df = dt.fread(file_path) session = Session(bind=engine) logger.info(f'Inserting after table id: {max_index}') bulk_insert_chunk(data_df=data_df, table=table, session=session, chunksize=chunksize, start_index=(max_index - 1)) logger.info( f'\tAppending data from {os.path.basename(file_path)} to {table_name}... DONE!\n' )
def test_view_to_jay(): # See issue #3099 DT1 = dt.Frame([None] * 10)[:3, :] saved = DT1.to_jay() DT2 = dt.fread(saved) assert DT2.type == dt.Type.void assert_equals(DT1, DT2)
def test_save_to_jay(tempfile_jay): src = [d(1, 1, 1), d(2001, 12, 13), d(2026, 5, 9), None, d(1956, 11, 11)] DT = dt.Frame(src) DT.to_jay(tempfile_jay) del DT DT2 = dt.fread(tempfile_jay) assert DT2.shape == (5, 1) assert DT2.type == dt.Type.date32 assert DT2.to_list()[0] == src
def bulk_insert(file_path, table): """Batched INSERT statements via the ORM "bulk", using dictionaries.""" logger.info(f'\tInserting data from {os.path.basename(file_path)}...') df = dt.fread(file_path).to_pandas() row_dict = create_records(df) session = Session(bind=engine) session.bulk_insert_mappings(table, row_dict) session.commit() logger.info( f'\tInserting data from {os.path.basename(file_path)}... DONE\n')
def load_from_jay(filename): DT = dt.fread(filename) frame = MetaFrame() frame.df = DT frame.data = DT.to_list() frame.names = list(DT.names) frame.types = [_ltype_to_pytype[lt] for lt in DT.ltypes] frame.nkeys = len(DT.key) print(f"{frame.name} = <loaded from {filename}>") return frame
def build_gene_compound_tissue_dataset_df( gene_sig_dir: str, pset_name: str, ignore_psets: list = ['NCI60', 'PRISM']) -> pd.DataFrame: """ @param gene_sig_dir: [`string`] The file path to the directory containing the gene signatures for each PSet @param pset_name: [`string`] The name of the PSet @return: [`DataFrame`] The gene_compounds table for this PSet, containing all stats (?) """ # Early return for PSets without gene signatures if pset_name in ignore_psets: return None # Get gene_sig_df from gene_sig_file try: gene_sig_df = dt.fread( os.path.join(gene_sig_dir, 'gene_compound_tissue_dataset.csv'), memory_limit=int(1e10) # 10 GBs ) except ValueError: return None gene_sig_df[f.dataset == pset_name, :] # Extract relevant columns # gene_compound_tissue_dataset = gctd gctd_df = gene_sig_df.loc[:, [ 'gene', 'compound', 'tissue', 'dataset', 'estimate', 'lower_analytic', 'upper_analytic', 'lower_permutation', 'upper_permutation', 'n', 'pvalue_analytic', 'pvalue_permutation', 'df', 'fdr_analytic', 'fdr_permutation', 'significant_permutation', 'mDataType' ]] # Add missing columns gctd_df.loc[:, 'sens_stat'] = 'AAC' gctd_df.loc[:, 'permutation_done'] = 0 gctd_df.loc[~gctd_df['fdr_permutation'].isna(), 'permutation_done'] = 1 # Rename foreign key columns gctd_df.rename(columns={ 'gene': 'gene_id', 'compound': 'compound_id', 'tissue': 'tissue_id', 'dataset': 'dataset_id' }, inplace=True) gctd_df['gene_id'] = gctd_df['gene_id'] \ .apply(lambda x: re.sub(r'\..*$', '', x)) # Reorder columns return gctd_df[[ 'gene_id', 'compound_id', 'dataset_id', 'tissue_id', 'estimate', 'lower_analytic', 'upper_analytic', 'lower_permutation', 'upper_permutation', 'n', 'pvalue_analytic', 'pvalue_permutation', 'df', 'fdr_analytic', 'fdr_permutation', 'significant_permutation', 'permutation_done', 'sens_stat', 'mDataType' ]]
def test_time64_read_from_csv(): DT = dt.fread("timestamp\n" "2001-05-12T12:00:00\n" "2013-06-24T14:00:01\n" "2023-11-02T23:59:59.999999\n") assert DT.type == dt.Type.time64 assert DT.shape == (3, 1) assert DT.to_list()[0] == [ d(2001, 5, 12, 12), d(2013, 6, 24, 14, 0, 1), d(2023, 11, 2, 23, 59, 59, 999999) ]
def bulk_insert_chunk_from_file(file_path, table, chunksize=100000): """Batched INSERT statements via the ORM "bulk", using dictionaries.""" logger.info(f'\tInserting data from {os.path.basename(file_path)}...') session = Session(bind=engine) data_df = dt.fread(file_path) bulk_insert_chunk(data_df=data_df, table=table, session=session, chunksize=chunksize, start_index=0) logger.info( f'\tInserting data from {os.path.basename(file_path)}... DONE!\n')
def test_save_to_jay(tempfile_jay): src = [ d(1901, 2, 3, 4, 5, 6), d(2001, 12, 13, 0, 30), d(2026, 5, 9, 12), None, d(1956, 11, 11, 11, 11, 11) ] DT = dt.Frame(src) DT.to_jay(tempfile_jay) del DT DT2 = dt.fread(tempfile_jay) assert DT2.shape == (5, 1) assert DT2.types == [dt.Type.time64] assert DT2.to_list()[0] == src
def build_gene_target_table(chembl_df, drugbank_df, target_df, output_dir): """ Build a join table... @param chembl_df: [`pd.DataFrame`] The ChEMBL drug target table @param drugbank_df: [`pd.DataFrame`] The DrugBank drug target table @param target_df: [`datatable.Frame`] The target table, keyed @param output_dir: [`string`] The file path with all final PharmacoDB tables @return: [`datatable.Frame`] The gene_target table """ # Get target-uniprot mappings from ChEMBL and Drugbank tables gene_target_df = pd.concat([ chembl_df.to_pandas()[['name', 'uniprot_id']], drugbank_df.to_pandas()[['name', 'uniprot_id']] ]) gene_target_df.rename(columns={'name': 'target_id'}, inplace=True) gene_target_df.drop_duplicates(inplace=True) # Retrieve Uniprot-ENSEMBL gene ID mappings uniprot_ids = pd.Series(pd.unique(gene_target_df['uniprot_id'])) uniprot_ensembl_mappings = pd.concat( parallelize(uniprot_ids, map_uniprot_to_ensembl, 1000)) uniprot_ensembl_mappings.drop_duplicates(inplace=True) # Join gene_target table with gene table based on uniprot-ensembl mappings gene_target_df = pd.merge(gene_target_df, uniprot_ensembl_mappings, on='uniprot_id') gene_target_df.drop(columns=['uniprot_id'], inplace=True) # Load and key the gene table from output_dir gene_file = os.path.join(output_dir, 'gene.jay') if not os.path.exists(gene_file): raise FileNotFoundError(f"There is no gene file in {output_dir}!") gene_df = dt.fread(gene_file, sep=",") gene_df = rename_and_key(gene_df, 'gene_id') # Join target table with gene table and target table gene_target_df = dt.Frame(gene_target_df) gene_target_df = join_tables(gene_target_df, gene_df, 'gene_id') gene_target_df = join_tables(gene_target_df, target_df, 'target_id') # Drop columns that didn't join and drop duplicates gene_target_df = gene_target_df[(dt.f.target_id >= 1) & (dt.f.gene_id >= 1), :] gene_target_df = gene_target_df[0, :, dt.by(gene_target_df.names)] gene_target_df.to_jay(os.path.join(output_dir, 'gene_target.jay')) return gene_target_df
def test_with_stats(tempfile_jay): DT = dt.Frame([-1, 0, 1, None, 12, 3, None], stype='date32') # precompute stats so that they get stored in the Jay file assert DT.countna1() == 2 assert DT.min1() == datetime.date(1969, 12, 31) assert DT.max1() == datetime.date(1970, 1, 13) DT.to_jay(tempfile_jay) del DT DT = dt.fread(tempfile_jay) # assert_equals() also checks frame integrity, including validity of stats assert_equals(DT, dt.Frame([-1, 0, 1, None, 12, 3, None], stype='date32')) assert DT.countna1() == 2 assert DT.min1() == datetime.date(1969, 12, 31) assert DT.max1() == datetime.date(1970, 1, 13)
def build_compound_target_table(chembl_df, drugbank_df, target_df, output_dir, compound_synonym_file): """ Using data from the Drugbank and ChEMBL drug target files and the target table, build the drug target table. @param chembl_df: [`dt.Frame`] The ChEMBL drug target table @param drugbank_df: [`dt.Frame`] The DrugBank drug target table @param target_df: [`datatable.Frame`] The target table, keyed @param output_dir: [`string`] The file path with all final PharmacoDB tables @param compound_synonym_file: [`string`] The file path to the compound synonym table @return: [`dt.Frame`] The drug target table """ # Load compound synonym table from output_dir if not os.path.exists(compound_synonym_file): raise FileNotFoundError( f"The file {compound_synonym_file} doesn't exist!") drug_syn_df = dt.fread(compound_synonym_file) # Join drugbank df with drug table del drug_syn_df[:, ['dataset_id', 'id']] drug_syn_df = pl.from_arrow(drug_syn_df.to_arrow()) \ .drop_duplicates() drugbank_df = pl.from_arrow( drugbank_df[:, ['name', 'compound_name']].to_arrow()) drugbank_df = drugbank_df.join(drug_syn_df, on='compound_name') # Combine ChEMBL and Drugbank tables to make drug target table drug_target_df = pd.concat([ chembl_df.to_pandas()[['name', 'compound_id']].copy(), drugbank_df.to_pandas()[['name', 'compound_id']].copy() ]) drug_target_df.rename(columns={'name': 'target_id'}, inplace=True) drug_target_df.drop_duplicates(inplace=True) # Join with target table drug_target_df = dt.Frame(drug_target_df) drug_target_df = join_tables(drug_target_df, target_df, 'target_id') # Drop rows with no target_id, drop duplicates drug_target_df = drug_target_df[dt.f.target_id >= 1, :] drug_target_df = drug_target_df[0, :, dt.by(drug_target_df.names)] drug_target_df = dt.Frame( pl.from_arrow(drug_target_df.to_arrow()) \ .drop_nulls() \ .to_arrow()) drug_target_df = write_table(drug_target_df, 'compound_target', output_dir, add_index=False) return drug_target_df
def test_with_stats(tempfile_jay): src = [ d(1901, 12, 13, 0, 11, 59), d(2001, 2, 17, 0, 30), None, d(2026, 5, 19, 12, 0, 1, 1111), None, d(1956, 11, 11, 11, 11, 11) ] DT = dt.Frame(src) # precompute stats so that they get stored in the Jay file assert DT.countna1() == 2 assert DT.min1() == d(1901, 12, 13, 0, 11, 59) assert DT.max1() == d(2026, 5, 19, 12, 0, 1, 1111) DT.to_jay(tempfile_jay) DTnew = dt.fread(tempfile_jay) # assert_equals() also checks frame integrity, including validity of stats assert_equals(DTnew, DT) assert DTnew.countna1() == 2 assert DTnew.min1() == d(1901, 12, 13, 0, 11, 59) assert DTnew.max1() == d(2026, 5, 19, 12, 0, 1, 1111)
def combine_primary_tables(data_dir: str, output_dir: str, compound_meta_file: str) -> dict: """ Build all the primary tables, i.e., tables that require no joins, and return them in a dictionary. @param data_dir: [`string`] The file path to read the PSet tables @param output_dir: [`string`] The file path to write the final tables @return: [`dict(string: datatable.Frame)`] A dictionary of all the primary tables, with names as keys """ # Load, concatenate, and write primary tables to disk tissue_df = load_table("tissue", data_dir) tissue_df = tissue_df[:, :, sort(f.name, na_position="last")] tissue_df = write_table(tissue_df, "tissue", output_dir) gene_df = load_join_write("gene", data_dir, output_dir) dataset_df = load_join_write("dataset", data_dir, output_dir) # Annotate compounds compound_df = load_table("compound", data_dir) compound_meta_df = dt.fread(compound_meta_file) compound_meta_df.names = { "unique.drugid": "name", "PharmacoDB.uid": "compound_uid" } # Add compound_uid to compound table and write to disk compound_meta_df.key = "name" compound_df = compound_df[:, :, dt.join(compound_meta_df)] compound_df = write_table(compound_df, "compound", output_dir) # Transform tables to be used for joins dfs = {} dfs["tissue"] = rename_and_key(tissue_df, "tissue_id") dfs["compound"] = rename_and_key(compound_df, "compound_id") dfs["gene"] = rename_and_key(gene_df, "gene_id") dfs["dataset"] = rename_and_key(dataset_df, "dataset_id") return dfs
# In[1]: from datatable import dt, f from operator import and_, or_, xor, eq from functools import reduce import re # In[2]: file_path = "https://github.com/samukweku/data_files/raw/master/msleep.txt" DT = dt.fread(file_path) DT.head(5) # ### Filtering Rows Based on a Numeric Variable # You can filter numeric variables based on their values. A number of commonly used operators include: >, >=, <, <=, == and !=. # # Note that in datatable, filtration occurs in the ``i`` section: # In[3]: DT[f.sleep_total > 18, ["name", "sleep_total"]]
melt(DT).stypes # In[35]: melt(DT, id_vars=['first', 'last']) # In[36]: url = 'https://raw.githubusercontent.com/tidyverse/tidyr/main/data-raw/billboard.csv' DT = dt.fread(url) measure_vars = [name for name in DT.names if name.startswith('wk')] melt(DT, measure_vars=measure_vars, variable_name='week') # In[37]: DT = dt.Frame([{'A': 'a', 'B': 1, 'C': 2}, {'A': 'b', 'B': 3, 'C': 4}, {'A': 'c', 'B': 5, 'C': 6}]) DT # In[38]:
def test_to_jay(): DT1 = dt.Frame([None] * 13) saved = DT1.to_jay() DT2 = dt.fread(saved) assert DT2.type == dt.Type.void assert_equals(DT1, DT2)
# ## **.SD - Subset of Data** # I will be using [Jose Morales](https://twitter.com/jmrlsz) excellent [post](https://rpubs.com/josemz/SDbf) to show how .SD's functionality can be replicated in python's [datatable](https://datatable.readthedocs.io/en/latest/index.html). # # Not all functions can be replicated; R [data.table](https://github.com/Rdatatable/data.table) has a whole lot more functions and features that are not yet implemented in [datatable](https://datatable.readthedocs.io/en/latest/index.html). # # # The data file can be accessed [here](https://github.com/samukweku/data_files/raw/master/iris.csv) # In[1]: from datatable import dt, by, f, update, sort # In[2]: DT = dt.fread('https://github.com/samukweku/data_files/raw/master/iris.csv') DT.head() # #### Number of unique observations per column # In[3]: # DT[, lapply(.SD, uniqueN)] --> Rdatatable DT.nunique() # #### Mean of all columns by `species` # In[4]: # DT[, lapply(.SD, mean), by = species] --> Rdatatable
def build_cellosaurus_df(cellosaurus_path, output_dir): """ Build cellosaurus table. @param cellosaurus_path: [`string`] Full file path to cellosaurus file @param output_dir: [`string`] The directory to write the cellosaurus table @param cell_df: [`datatable.Frame`] The cell table; should be renamed, keyed, and shouldn't have 'tissue_id' column @return: [`datatable.Frame`] The cellosaurus table """ with open(cellosaurus_path) as f: file = [line for line in f] file = file[55:] entries = ''.join(file).split('//\n') entry_list = [entry.split('\n') for entry in entries] entry_split_list = [[item.split(' ') for item in entry] for entry in entry_list] entry_tuple_list = [[(item[0], item[1]) for item in entry if len(item) > 1] for entry in entry_split_list] pool = Pool(cpu_count() - 1) dict_list = pool.map(build_defaultdict, entry_tuple_list) dict_list = [dict(item) for item in dict_list] dict_list = [{key: '|||'.join(value) for key, value in dct.items()} for dct in dict_list] cellosaurus_df = pd.DataFrame(dict_list) cellosaurus_df.dropna(axis=1, how='all', inplace=True) # Always close your pool or you will have a bunch of processes doing nothing pool.close() # Drop AG and DT columns (age of donor, date) cellosaurus_df.drop(columns=['AG', 'DT'], inplace=True) # Rename cols and add cell_id column rename_dict = {col: col.lower() for col in cellosaurus_df.columns} cellosaurus_df.rename(columns=rename_dict, inplace=True) cellosaurus_df.rename(columns={ 'id': 'identifier', 'ac': 'accession' }, inplace=True) cellosaurus_df['cell_id'] = cellosaurus_df['identifier'] # Load cell_df cell_path = os.path.join(output_dir, 'cell.jay') cell_df = rename_and_key(dt.fread(cell_path), 'cell_id') # Convert to datatable and join with cell_df cellosaurus_df = dt.Frame(cellosaurus_df) cellosaurus_df.key = 'cell_id' df = cell_df[:, :, join(dt.Frame(cellosaurus_df))] df = df[dt.f.id >= 1, :] df = df[:, [ 'cell_id', 'id', 'accession', 'as', 'sy', 'dr', 'rx', 'ww', 'cc', 'st', 'di', 'ox', 'hi', 'oi', 'sx', 'ca' ]] df.names = {'cell_id': 'identifier', 'id': 'cell_id'} df = write_table(df, 'cellosaurus', output_dir) return df
def test_arr32_to_jay(): DT = dt.Frame(W=[['ad', 'dfkvjn'], ['b b, f', None], ['r', 'w', 'dfvdf']]) assert DT.type == dt.Type.arr32(dt.Type.str32) out = DT.to_jay() RES = dt.fread(out) assert_equals(RES, DT)
def test_do_not_read_from_csv(): assert dt.options.fread.parse_dates is True with dt.options.context(**{"fread.parse_dates": False}): DT = dt.fread("X\n1990-10-10\n2011-11-11\n2020-02-05\n") assert_equals(DT, dt.Frame(X=["1990-10-10", "2011-11-11", "2020-02-05"]))
def test_read_date32_from_csv(): DT = dt.fread("a-b-c\n1999-01-01\n2010-11-11\n2020-12-31\n") assert_equals(DT, dt.Frame([d(1999,1,1), d(2010,11,11), d(2020,12,31)], names=["a-b-c"]))
def test_view_to_jay_2(): DT1 = dt.Frame(A=[None, None], B=[None, "qoo"])[0, :] saved = DT1.to_jay() DT2 = dt.fread(saved) assert_equals(DT2, dt.Frame(A=[None], B=[None] / dt.str32))
def build_gene_compound_dataset_df(gene_compound_dataset_file, gene_file, compound_file, dataset_file, output_dir, compound_names): """ Build gene_compound_dataset table (description?) @param gene_compound_dataset_file: [`str`] Path to the gene signature .csv file. @param gene_file: [`str`] Path to the gene table .csv file. @param compound_file: [`str`] Path to the compound table .csv file. @param dataset_file: [`str`] Path to the tissue table .csv file. @param output_dir: [`str`] Path to write the output file to. :param compound_name: [`str`] Path to an optional .csv file mapping updated compound names to the dataset. This is to ensure that corrected compound annotations still make it into the database without the need to rerun all the gene signatures @return [`datatable.Frame`] Writes the 'gene_compound_dataset.csv' file to output_dir the returns the table. """ # -- Check the input files exist for fl in [ gene_compound_dataset_file, gene_file, compound_file, dataset_file ]: if not os.path.exists(fl): raise FileNotFoundError(f'Could not find the {fl}') # -- Read in mapping tables gene_dt = fread(gene_file) compound_dt = fread(compound_file) dataset_dt = fread(dataset_file) # -- Read in gene_compound_tissue table gcd_dt = fread(gene_compound_dataset_file) # -- Fix names and assign missing columns gcd_dt.names = { 'gene': 'gene_id', 'compound': 'compound_id', 'dataset': 'dataset_id', 'lower': 'lower_analytic', 'upper': 'upper_analytic', 'pvalue': 'pvalue_analytic', 'fdr': 'fdr_analytic' } del gcd_dt[:, ['significant', 'tissue']] # Determine missing columns and assign them, so we don't have to change code #>when new columns are addeds gcd_table_columns = np.asarray( ('id', 'gene_id', 'compound_id', 'dataset_id', 'estimate', 'lower_analytic', 'upper_analytic', 'lower_permutation', 'upper_permutation', 'n', 'pvalue_analytic', 'pvalue_permutation', 'df', 'fdr_analytic', 'fdr_permutation', 'significant_permutation', 'permutation_done', 'sens_stat', 'mDataType')) gcd_missing_columns = np.setdiff1d(gcd_table_columns, np.asarray(gcd_dt.names)) for col in gcd_missing_columns: gcd_dt[col] = None gcd_dt1 = gcd_dt[:, list(gcd_table_columns)] # Sanity check the columns are there if not np.all(gcd_table_columns == np.asarray(gcd_dt1.names)): raise ValueError(f'The build_gene_compound_dataset table', ' has missing columns!') gcd_dt1[:, update(sens_stat='AAC', permutation_done=0)] # -- Map to existing FK ids # gene id gcd_dt1.names = {'gene_id': 'gene_name'} gene_dt.names = {'id': 'gene_id', 'name': 'gene_name'} gene_dt.key = 'gene_name' # NOTE: the g object references the joined tables namespace gcd_dt1[:, update(gene_id=g.gene_id), join(gene_dt)] # make sure all genes mapped failed_genes = np.unique(gcd_dt1[dt.isna(f.gene_id), 'gene_name'] \ .to_numpy().flatten()) if len(failed_genes) > 0: warnings.warn(f'The genes: {failed_genes} did not map!') if (np.any(gcd_dt1[:, dt.isna(f.gene_id)].to_numpy())): warnings.warn('Some gene_ids in gene_compound_dataset are still NA!' 'Dropping the missing rows...') gcd_dt1 = gcd_dt1[~dt.isna(f.gene_id), :] # fix compound names ## FIXME:: Remove this when gene signatures are regenerated ## START patch fix_names_df = dt.fread(compound_names) fix_names_df[f.dataset == "GDSC_2020(v1-8.2)", update(dataset="GDSC_v1")] fix_names_df[f.dataset == "GDSC_2020(v2-8.2)", update(dataset="GDSC_v2")] fix_names_df.names = { "drugid": "compound_name", "unique.drugid": "compound_id", "dataset": "dataset_id" } fix_names_df.key = ["compound_name", "dataset_id"] gcd_dt1.names = {'compound_id': 'compound_name'} gcd_dt1[~dt.isna(g.compound_id), update(compound_name=g.compound_id), join(fix_names_df)] ## END patch # compound id compound_dt.names = {'id': 'compound_id', 'name': 'compound_name'} del compound_dt[:, 'compound_uid'] compound_dt.key = 'compound_name' gcd_dt1[:, update(compound_id=g.compound_id), join(compound_dt)] if np.any(gcd_dt1[:, dt.isna(f.compound_id)].to_numpy()): warnings.warn("Some compound_ids in gene_compound_dataset are stll " "NA! Dropping the missing rows...") gcd_dt1 = gcd_dt1[~dt.isna(f.compound_id)] # dataset id gcd_dt1.names = {'dataset_id': 'dataset_name'} dataset_dt.names = {'id': 'dataset_id', 'name': 'dataset_name'} dataset_dt.key = 'dataset_name' gcd_dt1[:, update(dataset_id=g.dataset_id), join(dataset_dt)] # -- Sort then assign the primary key column gcd_dt2 = gcd_dt1[:, list(gcd_table_columns), sort('gene_id', 'compound_id', 'dataset_id', 'mDataType' )] gcd_dt2[:, update(id=range(1, gcd_dt2.nrows + 1))] # Sanity check we didn't lose any rows if not gcd_dt.nrows == gcd_dt2.nrows: warnings.warn('The gene_compound_dataset table has lost some rows!') gcd_dt2.to_jay(os.path.join(output_dir, 'gene_compound_dataset.jay'))