def test_groupby():
    DT = dt.Frame(A=[1, 1, 1, 2, 2, 2],
                  B=[d(2001, 7, 12, 0, 0, 0),
                     d(2005, 3, 14, 15, 9, 26),
                     None,
                     d(2007, 11, 2, 19, 7, 38),
                     d(1965, 6, 19, 2, 17, 7),
                     d(2004, 4, 18, 12, 3, 31)])
    RES = DT[:, {"count": dt.count(f.B),
                 "min": dt.min(f.B),
                 "max": dt.max(f.B),
                 "mean": dt.mean(f.B),
                 "first": dt.first(f.B),
                 "last": dt.last(f.B)},
             dt.by(f.A)]
    assert_equals(
        RES,
        dt.Frame(A=[1, 2],
                 count=[2, 3] / dt.int64,
                 min=[d(2001, 7, 12, 0, 0, 0), d(1965, 6, 19, 2, 17, 7)],
                 max=[d(2005, 3, 14, 15, 9, 26), d(2007, 11, 2, 19, 7, 38)],
                 mean=[d(2003, 5, 13, 19, 34, 43),
                       d(1992, 7, 13, 19, 9, 25, 333333)],
                 first=[d(2001, 7, 12, 0, 0, 0), d(2007, 11, 2, 19, 7, 38)],
                 last=[None, d(2004, 4, 18, 12, 3, 31)]))
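# These test excerpts assume the usual datatable test-suite aliases, most
# likely `from datetime import datetime as d`, `from datatable import f`, and
# an `assert_equals` helper that compares both values and column types.
# A note on the `values / type` idiom used throughout: dividing a list by a
# datatable type (e.g. `[2, 3] / dt.int64`) forces the resulting column to
# that exact type, so the expected frame matches on type as well as value.
# A minimal illustration:
DT = dt.Frame(n=[2, 3] / dt.int64)
assert DT.stypes == (dt.stype.int64,)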
def join_tables(df1: dt.Frame, df2: dt.Frame, join_col: str,
                delete_unjoined: bool = True) -> Optional[dt.Frame]:
    """
    Join df2 to df1 based on join_col (a left outer join).

    @param df1: [`datatable.Frame`] The datatable with the foreign key
    @param df2: [`datatable.Frame`] The join table (ex. tissue datatable)
    @param join_col: [`string`] The name of the column on which the tables
        will be joined (ex. "tissue_id")
    @param delete_unjoined: [`bool`] An optional parameter (default=True);
        set to False to keep rows in df1 that didn't join to any rows in df2
    @return: [`datatable.Frame`] The new, joined table, or None if join_col
        is missing from either datatable
    """
    if (join_col not in df1.names) or (join_col not in df2.names):
        logger.info(
            f"{join_col} is missing from one or both of the datatables "
            "passed! Make sure you have prepared df2 using rename_and_key().")
        return None
    df = df1[:, :, dt.join(df2)]
    # Check to see if any FKs are null
    if df[dt.isna(df[:, "id"]), :].nrows > 0:
        logger.info(f"The following {join_col}s failed to map:")
        unmatched = df[dt.isna(df[:, "id"]), join_col].copy()
        unmatched = unmatched[0, :, dt.by(join_col)]
        logger.info(unmatched)
        if delete_unjoined:
            logger.info(f"Rows with these {join_col}s will be deleted!")
            del df[dt.isna(df[:, "id"]), :]
    # Replace the original join column with the joined "id" column
    df.names = {join_col: "drop", "id": join_col}
    del df[:, "drop"]
    return df
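# A hedged usage sketch for join_tables(); the frames and column names below
# are illustrative, not from the source. The lookup table is assumed to have
# been prepared with rename_and_key(), i.e. it carries a surrogate "id"
# column and is keyed on the join column.
tissue_df = dt.Frame(id=[1, 2], tissue_id=['lung', 'liver'])
tissue_df.key = 'tissue_id'
sample_df = dt.Frame(sample=['s1', 's2', 's3'],
                     tissue_id=['lung', 'liver', 'brain'])
# 'brain' has no match, so it is logged and (with delete_unjoined=True)
# deleted; in the surviving rows, tissue_id now holds the numeric id (1, 2)
sample_df = join_tables(sample_df, tissue_df, 'tissue_id')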
def test_dt_nunique_with_by_for_ungrouped():
    DT = dt.Frame(G=[1, 1, 1, 2, 2, 2],
                  V=[None, None, None, None, 3, 5])
    EXP = dt.Frame(G=[1, 2],
                   V1=[0, 2] / dt.int64,
                   V2=[0, 1] / dt.int64)
    RES = DT[:, {"V1": dt.nunique(f.V),
                 "V2": dt.nunique(dt.mean(f.V))},
             dt.by(f.G)]
    assert_equals(EXP, RES)
def build_gene_target_table(chembl_df, drugbank_df, target_df, output_dir):
    """
    Build the gene_target join table from ChEMBL and DrugBank target-UniProt
    mappings and UniProt-Ensembl gene ID mappings.

    @param chembl_df: [`dt.Frame`] The ChEMBL drug target table
    @param drugbank_df: [`dt.Frame`] The DrugBank drug target table
    @param target_df: [`datatable.Frame`] The target table, keyed
    @param output_dir: [`string`] The file path with all final PharmacoDB tables
    @return: [`datatable.Frame`] The gene_target table
    """
    # Get target-uniprot mappings from ChEMBL and DrugBank tables
    gene_target_df = pd.concat([
        chembl_df.to_pandas()[['name', 'uniprot_id']],
        drugbank_df.to_pandas()[['name', 'uniprot_id']]
    ])
    gene_target_df.rename(columns={'name': 'target_id'}, inplace=True)
    gene_target_df.drop_duplicates(inplace=True)
    # Retrieve UniProt-Ensembl gene ID mappings
    uniprot_ids = pd.Series(pd.unique(gene_target_df['uniprot_id']))
    uniprot_ensembl_mappings = pd.concat(
        parallelize(uniprot_ids, map_uniprot_to_ensembl, 1000))
    uniprot_ensembl_mappings.drop_duplicates(inplace=True)
    # Map targets to Ensembl gene IDs via the UniProt-Ensembl mappings
    gene_target_df = pd.merge(gene_target_df, uniprot_ensembl_mappings,
                              on='uniprot_id')
    gene_target_df.drop(columns=['uniprot_id'], inplace=True)
    # Load and key the gene table from output_dir
    gene_file = os.path.join(output_dir, 'gene.jay')
    if not os.path.exists(gene_file):
        raise FileNotFoundError(f"There is no gene file in {output_dir}!")
    gene_df = dt.fread(gene_file, sep=",")
    gene_df = rename_and_key(gene_df, 'gene_id')
    # Join the gene_target table with the gene and target tables
    gene_target_df = dt.Frame(gene_target_df)
    gene_target_df = join_tables(gene_target_df, gene_df, 'gene_id')
    gene_target_df = join_tables(gene_target_df, target_df, 'target_id')
    # Drop rows that didn't join, then drop duplicates
    gene_target_df = gene_target_df[
        (dt.f.target_id >= 1) & (dt.f.gene_id >= 1), :]
    gene_target_df = gene_target_df[0, :, dt.by(gene_target_df.names)]
    gene_target_df.to_jay(os.path.join(output_dir, 'gene_target.jay'))
    return gene_target_df
def test_date32_in_groupby():
    DT = dt.Frame(A=[1, 2, 3] * 1000, B=list(range(3000)),
                  stypes={"B": "date32"})
    RES = DT[:, {"count": dt.count(f.B),
                 "min": dt.min(f.B),
                 "max": dt.max(f.B),
                 "first": dt.first(f.B),
                 "last": dt.last(f.B)},
             dt.by(f.A)]
    date32 = dt.stype.date32
    assert_equals(RES, dt.Frame(A=[1, 2, 3],
                                count=[1000] * 3 / dt.int64,
                                min=[0, 1, 2] / date32,
                                max=[2997, 2998, 2999] / date32,
                                first=[0, 1, 2] / date32,
                                last=[2997, 2998, 2999] / date32))
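# For reference, date32 stores days since the Unix epoch, so the integers in
# the expected frame above stand for real calendar dates. A hedged
# illustration of that semantics:
from datetime import date
DT = dt.Frame(x=[0, 1, 2] / dt.stype.date32)
assert DT[0, "x"] == date(1970, 1, 1)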
def build_compound_target_table(chembl_df, drugbank_df, target_df, output_dir,
                                compound_synonym_file):
    """
    Using data from the DrugBank and ChEMBL drug target files and the target
    table, build the compound target table.

    @param chembl_df: [`dt.Frame`] The ChEMBL drug target table
    @param drugbank_df: [`dt.Frame`] The DrugBank drug target table
    @param target_df: [`datatable.Frame`] The target table, keyed
    @param output_dir: [`string`] The file path with all final PharmacoDB tables
    @param compound_synonym_file: [`string`] The file path to the compound
        synonym table
    @return: [`dt.Frame`] The compound target table
    """
    # Load the compound synonym table from compound_synonym_file
    if not os.path.exists(compound_synonym_file):
        raise FileNotFoundError(
            f"The file {compound_synonym_file} doesn't exist!")
    drug_syn_df = dt.fread(compound_synonym_file)
    # Join the DrugBank table with the compound synonym table
    del drug_syn_df[:, ['dataset_id', 'id']]
    drug_syn_df = pl.from_arrow(drug_syn_df.to_arrow()) \
        .drop_duplicates()
    drugbank_df = pl.from_arrow(
        drugbank_df[:, ['name', 'compound_name']].to_arrow())
    drugbank_df = drugbank_df.join(drug_syn_df, on='compound_name')
    # Combine the ChEMBL and DrugBank tables to make the compound target table
    drug_target_df = pd.concat([
        chembl_df.to_pandas()[['name', 'compound_id']].copy(),
        drugbank_df.to_pandas()[['name', 'compound_id']].copy()
    ])
    drug_target_df.rename(columns={'name': 'target_id'}, inplace=True)
    drug_target_df.drop_duplicates(inplace=True)
    # Join with target table
    drug_target_df = dt.Frame(drug_target_df)
    drug_target_df = join_tables(drug_target_df, target_df, 'target_id')
    # Drop rows with no target_id, drop duplicates
    drug_target_df = drug_target_df[dt.f.target_id >= 1, :]
    drug_target_df = drug_target_df[0, :, dt.by(drug_target_df.names)]
    drug_target_df = dt.Frame(
        pl.from_arrow(drug_target_df.to_arrow()) \
            .drop_nulls() \
            .to_arrow())
    drug_target_df = write_table(drug_target_df, 'compound_target',
                                 output_dir, add_index=False)
    return drug_target_df
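# `write_table` is defined elsewhere in this codebase; a minimal sketch of
# what such a helper might do, assuming it optionally adds an integer
# surrogate key and serializes the frame to '<name>.jay' in output_dir (the
# signature and behavior here are assumptions, not the actual implementation):
def write_table(df: dt.Frame, name: str, output_dir: str,
                add_index: bool = True) -> dt.Frame:
    if add_index:
        df = df.copy()
        # hypothetical surrogate key: 1..nrows
        df['id'] = dt.Frame(range(1, df.nrows + 1))
    df.to_jay(os.path.join(output_dir, f'{name}.jay'))
    return df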
# ~ 2a ~
# Create rd3_<release>_subject
# Not much is needed. Most of the data comes from the PED and PHENOPACKET files
subjects = release[:, {
    'id': f.subjectID,
    'subjectID': f.samples_subject,
    'organisation': f.subject_organisation,
    'ERN': f.subject_ERN,
    'solved': f.subject_solved,
    # 'date_solved': f.subject_date_solved,  # optional: if available
    'matchMakerPermission': f.subject_matchMakerPermission,
    'recontact': f.subject_recontact,
    'patch': f.patch
}, dt.sort('id')][:, dt.first(f[1:]), dt.by(f.id)]

# recode solved status
subjects['solved'] = dt.Frame([
    recodeValue(mappings=solvedStatusMappings, value=d, label='Solved status')
    for d in subjects['solved'].to_list()[0]
])

# ~ 2b ~
# Create rd3_<release>_subjectinfo
# There isn't much to add at this point, as most of the data in this table
# comes from other sources or has never been collected. Add more column names
# here if required.
subjectInfo = subjects[:, (f.id, f.patch)]
subjectInfo['subjectID'] = subjectInfo['id']
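# `recodeValue` and `solvedStatusMappings` come from elsewhere in this
# codebase; a minimal sketch under the assumption that the mappings are a
# plain dict from raw to standardized values and that unmapped values are
# reported and passed through as None:
def recodeValue(mappings: dict, value, label: str = None):
    if value in mappings:
        return mappings[value]
    statusMsg(f"Unable to recode value '{value}' for '{label}'")
    return None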
def test_dt_count_na2():
    DT = dt.Frame(G=[1, 1, 1, 2, 2, 2],
                  V=[None, None, None, None, 3, 5])
    EXP = dt.Frame(G=[1, 2], V1=[3, 1], V2=[3, 0])
    RES = DT[:, [dt.countna(f.V), dt.countna(dt.mean(f.V))],
             dt.by(f.G)]
    assert EXP.to_list() == RES.to_list()
# them to step 1 and rerun.
#
# In the step below, pull the selected columns and select distinct cases only
# (subject-study identifiers are already built into the ID). Using this
# object, create the subject info table and subset by study.
#
# At this point, you don't have to worry about creating the subjectinfo table.
# That table will be built at import time.
#
# We only need to select new subjects.

# select columns of interest and unique rows
subjects = shipment[f.isNewSubject == True, :][
    :, dt.first(f[:]), dt.by(f.subjectID)
][:, {
    'id': f.subjectID,
    'subjectID': f.participant_subject,
    'patch': f.patch,
    'organisation': f.organisation,
    'ERN': f.ERN,
    'typeOfAnalysis': f.typeOfAnalysis
}]

# subset the subjects by group (i.e., type of analysis)
# NOTE: objects for the rd3_<release>_subjectinfo tables will be created
# at time of import.
subjectsByAnalysis = {'_nrows': {'_total': 0}}
for analysisType in dt.unique(subjects['typeOfAnalysis']).to_list()[0]:
    dataByAnalysisType = subjects[f.typeOfAnalysis == analysisType, :]
    subjectsByAnalysis[analysisType] = dataByAnalysisType
# collapse release
statusMsg('Collapsing emx-release....')
subjects['associatedRD3Releases'] = dt.Frame([
    flattenValueArray(
        array=subjects[
            f.subjectID == d, f.release
        ][f.release != None, :].to_list()[0]
    )
    for d in subjects[:, f.subjectID].to_list()[0]
])

# DISTINCT RECORDS ONLY
# Since all information has been flattened and repeated by subject, it is
# possible to select only the distinct records.
statusMsg('Complete! Selecting distinct records only....')
subjects = subjects[:, dt.first(f[:]), dt.by(f.subjectID)]

#//////////////////////////////////////////////////////////////////////////////

# ~ 2 ~
# RESHAPE SAMPLES
# Sample metadata will need to be processed a bit differently than subject
# metadata. The idea is to have all samples listed horizontally by subject.
# This means that for each subject there will be a column for all samples
# released in DF1, DF2, DF3, and so on. It was done this way so that
# references to other tables can be made.
statusMsg('Summarizing sample metadata....')

# recode subjectID --- extract the subject ID only (i.e., remove '_original', etc.)
samples.names = {'subject': 'subjectID'}
samples['subjectID'] = dt.Frame([
def test_groupby_void_multicolumn():
    # See issue #3104
    DT0 = dt.Frame(A=[None] * 5, B=range(5), C=['q'] * 5)
    DT1 = DT0[:, dt.count(), dt.by(f.A, f.B)]
    EXP = dt.Frame(A=[None] * 5, B=range(5), count=([1] * 5) / dt.int64)
    assert_equals(DT1, EXP)
def test_groupby_void_twice():
    # See issue #3108
    DT0 = dt.Frame([[None, None, None], [1, 2, 3]])
    DT1 = DT0[:, :, dt.by("C0")]
    DT2 = DT1[:, :, dt.by("C0")]
    assert_equals(DT2, DT0)
def test_groupby_void_reducer():
    DT = dt.Frame([None] * 5)[:, dt.count(), dt.by(0)]
    assert_equals(DT, dt.Frame(C0=[None], count=[5] / dt.int64))
def test_groupby_void_results():
    # See issue #3109
    DT0 = dt.Frame([[None] * 5, [0, 1, 1, 2, 3]])
    DT1 = DT0[:, :, dt.by("C0")]
    assert_equals(DT1, DT0)
def test_dt_nunique_with_by_for_grouped():
    DT = dt.Frame([1, None, 1, 2, None, None])
    EXP = dt.Frame(C0=[None, 1, 2], nunique=[0, 1, 1] / dt.int64)
    RES = DT[:, {"nunique": dt.nunique(f[0])}, dt.by(f[0])]
    assert_equals(EXP, RES)