Example #1
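# Note: the test snippets on this page assume the usual datatable preamble;
# a minimal version is sketched below (assert_equals is the frame-comparison
# helper from datatable's own test suite).
from datetime import datetime as d

import datatable as dt
from datatable import f
from tests import assert_equals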
def test_groupby():
    DT = dt.Frame(A=[1, 1, 1, 2, 2, 2],
                  B=[
                      d(2001, 7, 12, 0, 0, 0),
                      d(2005, 3, 14, 15, 9, 26), None,
                      d(2007, 11, 2, 19, 7, 38),
                      d(1965, 6, 19, 2, 17, 7),
                      d(2004, 4, 18, 12, 3, 31)
                  ])
    RES = DT[:, {
        "count": dt.count(f.B),
        "min": dt.min(f.B),
        "max": dt.max(f.B),
        "mean": dt.mean(f.B),
        "first": dt.first(f.B),
        "last": dt.last(f.B)
    },
             dt.by(f.A)]
    assert_equals(
        RES,
        dt.Frame(A=[1, 2],
                 count=[2, 3] / dt.int64,
                 min=[d(2001, 7, 12, 0, 0, 0),
                      d(1965, 6, 19, 2, 17, 7)],
                 max=[d(2005, 3, 14, 15, 9, 26),
                      d(2007, 11, 2, 19, 7, 38)],
                 mean=[
                     d(2003, 5, 13, 19, 34, 43),
                     d(1992, 7, 13, 19, 9, 25, 333333)
                 ],
                 first=[d(2001, 7, 12, 0, 0, 0),
                        d(2007, 11, 2, 19, 7, 38)],
                 last=[None, d(2004, 4, 18, 12, 3, 31)]))
Example #2
def join_tables(df1: dt.Frame,
                df2: dt.Frame,
                join_col: str,
                delete_unjoined: bool = True) -> dt.Frame:
    """
    Join df1 and df2 on join_col (a datatable left outer join; by default,
    df1 rows that fail to join are deleted).

    @param df1: [`datatable.Frame`] The datatable with the foreign key
    @param df2: [`datatable.Frame`] The join table (ex. tissue datatable)
    @param join_col: [`string`] The name of the column on which the tables
        will be joined (ex. "tissue_id")
    @param delete_unjoined: [`bool`] An optional parameter (default=True);
        set it to False to keep rows of df1 which didn't join to any rows
        in df2
    @return [`datatable.Frame`] The new, joined table, or None if join_col
        is missing from either table
    """
    if (join_col not in df1.names) or (join_col not in df2.names):
        logger.info(
            f"{join_col} is missing from one or both of the datatables "
            "passed! Make sure you have prepared df2 using rename_and_key().")
        return None
    df = df1[:, :, dt.join(df2)]
    # Check to see if any FKs are null
    if df[dt.isna(df[:, "id"]), :].nrows > 0:
        logger.info(f"The following {join_col}s failed to map:")
        unmatched = df[dt.isna(df[:, "id"]), join_col].copy()
        unmatched = unmatched[0, :, dt.by(join_col)]
        logger.info(unmatched)
        if delete_unjoined:
            logger.info(f"Rows with these {join_col}s will be deleted!")
            del df[dt.isna(df[:, "id"]), :]
    # Rename the join col and drop it
    df.names = {join_col: "drop", "id": join_col}
    del df[:, "drop"]
    return df
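# A minimal usage sketch for join_tables with hypothetical data; it assumes
# df2 was prepared with rename_and_key, i.e. keyed on the join column and
# carrying the surrogate key as "id".
tissue_df = dt.Frame(tissue_id=["breast", "lung"], id=[1, 2])
tissue_df.key = "tissue_id"
sample_df = dt.Frame(sample=["s1", "s2", "s3"],
                     tissue_id=["breast", "lung", "kidney"])
# "kidney" never joins, so its row is logged and (by default) deleted; in
# the result, tissue_id holds the integer ids taken from tissue_df.
joined = join_tables(sample_df, tissue_df, "tissue_id")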
Example #3
def test_dt_nunique_with_by_for_ungroupped():
    DT = dt.Frame(G=[1, 1, 1, 2, 2, 2], V=[None, None, None, None, 3, 5])
    EXP = dt.Frame(G=[1, 2], V1=[0, 2] / dt.int64, V2=[0, 1] / dt.int64)
    RES = DT[:, {
        "V1": dt.nunique(f.V),
        "V2": dt.nunique(dt.mean(f.V))
    }, dt.by(f.G)]
    assert_equals(EXP, RES)
Example #4
def build_gene_target_table(chembl_df, drugbank_df, target_df, output_dir):
    """
    Build a join table...

    @param chembl_df: [`dt.Frame`] The ChEMBL drug target table
    @param drugbank_df: [`dt.Frame`] The DrugBank drug target table
    @param target_df: [`datatable.Frame`] The target table, keyed
    @param output_dir: [`string`] The file path with all final PharmacoDB tables
    @return: [`datatable.Frame`] The gene_target table
    """
    # Get target-uniprot mappings from ChEMBL and Drugbank tables
    gene_target_df = pd.concat([
        chembl_df.to_pandas()[['name', 'uniprot_id']],
        drugbank_df.to_pandas()[['name', 'uniprot_id']]
    ])
    gene_target_df.rename(columns={'name': 'target_id'}, inplace=True)
    gene_target_df.drop_duplicates(inplace=True)

    # Retrieve Uniprot-ENSEMBL gene ID mappings
    uniprot_ids = pd.Series(pd.unique(gene_target_df['uniprot_id']))
    uniprot_ensembl_mappings = pd.concat(
        parallelize(uniprot_ids, map_uniprot_to_ensembl, 1000))
    uniprot_ensembl_mappings.drop_duplicates(inplace=True)

    # Join gene_target table with gene table based on uniprot-ensembl mappings
    gene_target_df = pd.merge(gene_target_df,
                              uniprot_ensembl_mappings,
                              on='uniprot_id')
    gene_target_df.drop(columns=['uniprot_id'], inplace=True)

    # Load and key the gene table from output_dir
    gene_file = os.path.join(output_dir, 'gene.jay')
    if not os.path.exists(gene_file):
        raise FileNotFoundError(f"There is no gene file in {output_dir}!")
    gene_df = dt.fread(gene_file, sep=",")
    gene_df = rename_and_key(gene_df, 'gene_id')

    # Join gene_target table with the gene and target tables
    gene_target_df = dt.Frame(gene_target_df)
    gene_target_df = join_tables(gene_target_df, gene_df, 'gene_id')
    gene_target_df = join_tables(gene_target_df, target_df, 'target_id')

    # Drop rows that didn't join and drop duplicates
    gene_target_df = gene_target_df[(dt.f.target_id >= 1) &
                                    (dt.f.gene_id >= 1), :]
    gene_target_df = gene_target_df[0, :, dt.by(gene_target_df.names)]

    gene_target_df.to_jay(os.path.join(output_dir, 'gene_target.jay'))
    return gene_target_df
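# rename_and_key itself is not shown on this page; a plausible sketch of its
# contract, inferred from the calls above (column names and the og_col
# default are assumptions):
def rename_and_key(df: dt.Frame, join_col: str, og_col: str = 'name') -> dt.Frame:
    df.names = {og_col: join_col}   # expose the natural key as the join column
    df = df[:, [join_col, 'id']]    # keep only the join column and surrogate id
    df.key = join_col               # key the frame so dt.join() can use it
    return df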
Example #5
def test_date32_in_groupby():
    DT = dt.Frame(A=[1, 2, 3]*1000, B=list(range(3000)), stypes={"B": "date32"})
    RES = DT[:, {"count": dt.count(f.B),
                 "min": dt.min(f.B),
                 "max": dt.max(f.B),
                 "first": dt.first(f.B),
                 "last": dt.last(f.B)},
            dt.by(f.A)]
    date32 = dt.stype.date32
    assert_equals(RES,
        dt.Frame(A=[1, 2, 3],
                 count = [1000] * 3 / dt.int64,
                 min = [0, 1, 2] / date32,
                 max = [2997, 2998, 2999] / date32,
                 first = [0, 1, 2] / date32,
                 last = [2997, 2998, 2999] / date32))
Example #6
def build_compound_target_table(chembl_df, drugbank_df, target_df, output_dir,
                                compound_synonym_file):
    """
    Using data from the DrugBank and ChEMBL drug target files and
    the target table, build the drug target table.

    @param chembl_df: [`dt.Frame`] The ChEMBL drug target table
    @param drugbank_df: [`dt.Frame`] The DrugBank drug target table
    @param target_df: [`datatable.Frame`] The target table, keyed
    @param output_dir: [`string`] The file path with all final PharmacoDB tables
    @param compound_synonym_file: [`string`] The file path to the compound synonym table
    @return: [`dt.Frame`] The drug target table
    """
    # Load compound synonym table from output_dir
    if not os.path.exists(compound_synonym_file):
        raise FileNotFoundError(
            f"The file {compound_synonym_file} doesn't exist!")
    drug_syn_df = dt.fread(compound_synonym_file)
    # Join drugbank df with drug table
    del drug_syn_df[:, ['dataset_id', 'id']]
    drug_syn_df = pl.from_arrow(drug_syn_df.to_arrow()) \
        .drop_duplicates()
    drugbank_df = pl.from_arrow(
        drugbank_df[:, ['name', 'compound_name']].to_arrow())
    drugbank_df = drugbank_df.join(drug_syn_df, on='compound_name')
    # Combine ChEMBL and Drugbank tables to make drug target table
    drug_target_df = pd.concat([
        chembl_df.to_pandas()[['name', 'compound_id']].copy(),
        drugbank_df.to_pandas()[['name', 'compound_id']].copy()
    ])
    drug_target_df.rename(columns={'name': 'target_id'}, inplace=True)
    drug_target_df.drop_duplicates(inplace=True)
    # Join with target table
    drug_target_df = dt.Frame(drug_target_df)
    drug_target_df = join_tables(drug_target_df, target_df, 'target_id')
    # Drop rows with no target_id, drop duplicates
    drug_target_df = drug_target_df[dt.f.target_id >= 1, :]
    drug_target_df = drug_target_df[0, :, dt.by(drug_target_df.names)]
    drug_target_df = dt.Frame(
        pl.from_arrow(drug_target_df.to_arrow()) \
            .drop_nulls() \
            .to_arrow())
    drug_target_df = write_table(drug_target_df,
                                 'compound_target',
                                 output_dir,
                                 add_index=False)
    return drug_target_df
Example #7
# ~ 2a ~
# Create rd3_<release>_subject
# Not much is needed. Most of the data comes from the PED and PHENOPACKET files
subjects = release[:, {
    'id': f.subjectID,
    'subjectID': f.samples_subject,
    'organisation': f.subject_organisation,
    'ERN': f.subject_ERN,
    'solved': f.subject_solved,
    # 'date_solved': f.subject_date_solved, # optional: if available
    'matchMakerPermission': f.subject_matchMakerPermission,
    'recontact': f.subject_recontact,
    'patch': f.patch
}, dt.sort('id')][:, dt.first(f[1:]), dt.by(f.id)]

# recode solved status
subjects['solved'] = dt.Frame([
    recodeValue(mappings=solvedStatusMappings, value=d, label='Solved status')
    for d in subjects['solved'].to_list()[0]
])
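# recodeValue comes from the surrounding RD3 tooling; a hypothetical sketch
# of its contract, assuming it looks `value` up in `mappings` and reports
# anything unmapped under the given label:
def recodeValue(mappings: dict, value, label: str):
    if value in mappings:
        return mappings[value]
    print(f'Unknown value in {label} recode: {value}')  # assumed reporting
    return None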

# ~ b ~
# Create rd3_<release>_subjectinfo
# There isn't much to add at this point as most of the data in this
# table comes from other sources or has never been collected. Add more column
# names here if required.

subjectInfo = subjects[:, (f.id, f.patch)]
subjectInfo['subjectID'] = subjectInfo['id']
Example #8
def test_dt_count_na2():
    DT = dt.Frame(G=[1, 1, 1, 2, 2, 2], V=[None, None, None, None, 3, 5])
    EXP = dt.Frame(G=[1, 2], V1=[3, 1], V2=[3, 0])
    RES = DT[:, [dt.countna(f.V), dt.countna(dt.mean(f.V))], dt.by(f.G)]
    assert EXP.to_list() == RES.to_list()
Example #9
# them to step 1 and rerun.
#
# In the step below, pull selected columns and select distinct cases only (
# subject-study identifiers are already built into the ID). Using this object
# create the subject info table and subset by study.
#
# At this point, you don't have to worry about creating the subjectinfo table.
# That table will be built at import time.
#
# We only need to select new subjects

# select columns of interest and unique rows
subjects = shipment[f.isNewSubject == True, :][
    :, dt.first(f[:]), dt.by(f.subjectID)
][:, {
    'id': f.subjectID,
    'subjectID': f.participant_subject,
    'patch': f.patch,
    'organisation': f.organisation,
    'ERN': f.ERN,
    'typeOfAnalysis': f.typeOfAnalysis
}]

# subset the subjects by group (i.e., type of analysis)
# NOTE: objects for the rd3_<release>_subjectinfo tables will be created
#       at time of import.
subjectsByAnalysis = {'_nrows': {'_total': 0}}
for analysisType in dt.unique(subjects['typeOfAnalysis']).to_list()[0]:
    dataByAnalysisType = subjects[f.typeOfAnalysis == analysisType, :]
    subjectsByAnalysis[analysisType] = dataByAnalysisType
Example #10
# collapse release
statusMsg('Collapsing emx-release....')
subjects['associatedRD3Releases'] = dt.Frame([
    flattenValueArray(
        array=subjects[f.subjectID==d, f.release][f.release != None, :].to_list()[0]
    )
    for d in subjects[:, f.subjectID].to_list()[0]
])
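# flattenValueArray is another external helper; a plausible sketch, assuming
# it deduplicates the values and joins them into a single delimited string:
def flattenValueArray(array: list) -> str:
    return ','.join(sorted({str(value) for value in array}))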

# DISTINCT RECORDS ONLY
# since all information has been flattened and repeated by subject, it is
# possible to select only the distinct records.
statusMsg('Complete! Selecting distinct records only....')

subjects = subjects[:, first(f[:]), dt.by(f.subjectID)]

#//////////////////////////////////////////////////////////////////////////////

# ~ 2 ~ 
# RESHAPE SAMPLES
# Sample metadata will need to be processed a bit differently than subject
# metadata. The idea is to have all samples listed horizontally by subject.
# This means that for each subject there will be a column for all samples
# released in DF1, DF2, DF3, and so on. It was done this way so that
# references to other tables can be made.
statusMsg('Summarizing sample metadata....')

# recode subjectID --- extract subject ID only (i.e., remove '_original', etc.)
samples.names = {'subject': 'subjectID'}
samples['subjectID'] = dt.Frame([
    # keep only the text before the first underscore (drops '_original', etc.)
    value.split('_')[0] if value else value
    for value in samples['subjectID'].to_list()[0]
])
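# A minimal sketch of the horizontal reshape described above, assuming the
# samples frame carries 'release' and 'sampleID' columns (both names are
# assumptions):
wide = dt.unique(samples['subjectID'])
for release in dt.unique(samples['release']).to_list()[0]:
    perRelease = [
        ','.join(
            v
            for v in samples[(f.subjectID == s) & (f.release == release),
                             'sampleID'].to_list()[0]
            if v is not None
        )
        for s in wide['subjectID'].to_list()[0]
    ]
    # one column of sample IDs per release, listed horizontally by subject
    wide[f'samples_{release}'] = dt.Frame(perRelease)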
Example #11
def test_groupby_void_multicolumn():
    # See issue #3104
    DT0 = dt.Frame(A=[None] * 5, B=range(5), C=['q'] * 5)
    DT1 = DT0[:, dt.count(), dt.by(f.A, f.B)]
    EXP = dt.Frame(A=[None] * 5, B=range(5), count=([1] * 5) / dt.int64)
    assert_equals(DT1, EXP)
Example #12
def test_groupby_void_twice():
    # See issue #3108
    DT0 = dt.Frame([[None, None, None], [1, 2, 3]])
    DT1 = DT0[:, :, dt.by("C0")]
    DT2 = DT1[:, :, dt.by("C0")]
    assert_equals(DT2, DT0)
Example #13
def test_groupby_void_reducer():
    DT = dt.Frame([None] * 5)[:, dt.count(), dt.by(0)]
    assert_equals(DT, dt.Frame(C0=[None], count=[5] / dt.int64))
Example #14
def test_groupby_void_results():
    # See issue #3109
    DT0 = dt.Frame([[None] * 5, [0, 1, 1, 2, 3]])
    DT1 = DT0[:, :, dt.by("C0")]
    assert_equals(DT1, DT0)
Example #15
def test_dt_nunique_with_by_for_groupped():
    DT = dt.Frame([1, None, 1, 2, None, None])
    EXP = dt.Frame(C0=[None, 1, 2], nunique=[0, 1, 1] / dt.int64)
    RES = DT[:, {"nunique": dt.nunique(f[0])}, dt.by(f[0])]
    assert_equals(EXP, RES)