Example #1
def main(argv):
    """Runs the experiment."""

    syn = synapseclient.Synapse()
    syn.login("grzadkow")
    cdata = VariantCohort(syn,
                          'TCGA-OV',
                          mut_genes=['TTN'],
                          mut_levels=('Gene', 'Form', 'Exon'),
                          cv_info={
                              'Prop': 0.8,
                              'Seed': argv[-1]
                          })
    cdata.train_expr_ = cdata.train_expr_.sort_index()

    prot_data = pd.read_csv(in_path + 'PNNL-causality-formatted.txt.zip',
                            sep='\t')
    prot_vec = prot_data.loc[prot_data['ID'] == 'TTN', :]
    prot_vec = prot_vec.loc[:, prot_vec.columns.isin(cdata.train_expr_.index)]
    prot_vec = prot_vec.dropna(axis=1)
    use_indx = cdata.train_expr_.index.isin(prot_vec.columns)

    base_cor = spearmanr(
        np.array(prot_vec)[0],
        np.array(cdata.train_expr_.loc[prot_vec.columns, 'TTN']))

    mtypes = [
        MuType({('Gene', 'TTN'): {
                    ('Form', 'Missense_Mutation'): None
                }}),
        MuType({('Gene', 'TTN'): {
                    ('Form', 'Nonsense_Mutation'): None
                }}),
    ]

    mut_list = [
        cdata.train_mut_.status(cdata.train_expr_.index, mtype)
        for mtype in mtypes
    ]

    clf = MKBMTL(path_keys={(((), ('controls-state-change-of', )), )})
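    # R presumably sets the number of latent features the multi-task model
    # learns; H_mat['mu'] below has one row per latent feature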
    clf.named_steps['fit'].R = 5
    clf.fit_coh(cohort=cdata, mtypes=mtypes)
    H_cor = [
        spearmanr(clf.named_steps['fit'].H_mat['mu'][i, use_indx],
                  np.array(prot_vec)[0])
        for i in range(clf.named_steps['fit'].R)
    ]

    print(clf.named_steps['fit'].bw_mat['mu'].round(2))
    print(clf.eval_coh(cohort=cdata, mtypes=mtypes))

    # saves classifier results to file
    out_file = out_path + argv[0] + '_' + argv[1] + '__run' + argv[-1] + '.p'
    print(out_file)
    out_data = {'H_cor': H_cor, 'base': base_cor}
    pickle.dump(out_data, open(out_file, 'wb'))
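
These main(argv) functions appear to be entry points for batch jobs; none of the excerpts include the dispatch code. A minimal sketch of how Example #1 might be invoked, assuming the argument order used above (two run labels followed by a cross-validation seed):

if __name__ == '__main__':
    import sys

    # argv[0] and argv[1] label the output file; argv[-1] seeds the
    # cross-validation split
    main(sys.argv[1:])
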
Example #2
def main(argv):
    """Runs the experiment."""

    # gets the directory where output will be saved and the name of the TCGA
    # cohort under consideration
    print(argv)
    out_dir = os.path.join(base_dir, 'output', argv[0], argv[1], argv[2])
    coh_lbl = 'TCGA-{}'.format(argv[0])

    # loads the expression data and gene mutation data for the given TCGA
    # cohort, with the training/testing cohort split defined by the
    # cross-validation id for this task
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = VariantCohort(cohort=coh_lbl,
                          mut_genes=[argv[1]],
                          mut_levels=('Gene', 'Form', 'Exon', 'Location',
                                      'Protein'),
                          syn=syn,
                          cv_seed=(int(argv[3]) + 3) * 17)

    base_mtype = MuType({('Gene', argv[1]): None})
    optim = PartitionOptim(cdata, base_mtype, eval(argv[2]),
                           ('Form', 'Exon', 'Location', 'Protein'))

    # traverse branches of the sub-type search tree until none remain
    while optim.traverse_branch():
        pass

    # saves classifier results to file
    out_file = os.path.join(out_dir, 'results', 'out__cv-{}.p'.format(argv[3]))
    pickle.dump(
        {
            'best': optim.best_mtypes,
            'hist': optim.mtype_scores,
            'pred': optim.pred_scores,
            'optim': optim.best_optim()
        }, open(out_file, 'wb'))
Example #3
def main():
    """Runs the experiment."""

    parser = argparse.ArgumentParser(
        description=("Test a classifier's ability to predict the presence "
                     "of a list of sub-types."))

    # positional command line arguments
    parser.add_argument('mtype_dir',
                        type=str,
                        help='the folder where sub-types are stored')
    parser.add_argument('cohort', type=str, help='a TCGA cohort')
    parser.add_argument('classif',
                        type=str,
                        help='a classifier in HetMan.predict.classifiers')
    parser.add_argument('base_gene',
                        type=str,
                        help='the gene to cross with respect to')

    parser.add_argument('cv_id',
                        type=int,
                        help='a random seed used for cross-validation')
    parser.add_argument('task_id',
                        type=int,
                        help='the subset of sub-types to assign to this task')

    parser.add_argument(
        '--tune_splits',
        type=int,
        default=8,
        help='how many training cohort splits to use for tuning')
    parser.add_argument(
        '--test_count',
        type=int,
        default=24,
        help='how many hyper-parameter values to test in each tuning split')
    parser.add_argument(
        '--parallel_jobs',
        type=int,
        default=12,
        help='how many parallel CPUs to allocate the tuning tests across')

    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help='turns on diagnostic messages')

    args = parser.parse_args()
    if args.verbose:
        print("Starting testing for directory\n{}\nwith "
              "cross-validation ID {} and task ID {} ...".format(
                  args.mtype_dir, args.cv_id, args.task_id))

    mtype_list = sorted(
        pickle.load(
            open(os.path.join(args.mtype_dir, 'tmp', 'mtype_list.p'), 'rb')))

    # loads the pipeline used for classifying variants, gets the mutated
    # genes for each variant under consideration
    mut_clf = eval(args.classif)
    use_genes = reduce(
        or_,
        [set(gn for gn, _ in mtype.subtype_list())
         for mtype in mtype_list]) | {args.base_gene}

    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = VariantCohort(cohort=args.cohort,
                          mut_genes=list(use_genes),
                          mut_levels=['Gene', 'Form_base', 'Exon', 'Protein'],
                          expr_source='Firehose',
                          data_dir=firehose_dir,
                          syn=syn,
                          cv_seed=(args.cv_id + 53) * 7,
                          cv_prop=2 / 3)

    base_mtype = MuType({('Gene', args.base_gene): None})
    base_train_samps = base_mtype.get_samples(cdata.train_mut)
    base_test_samps = base_mtype.get_samples(cdata.test_mut)

    if args.verbose:
        print("Loaded {} sub-types over {} genes which will be tested using "
              "classifier {} in cohort {} with {} samples.".format(
                  len(mtype_list), len(use_genes), args.classif, args.cohort,
                  len(cdata.samples)))

    out_acc = {mtype: {} for mtype in mtype_list}

    for i, mtype in enumerate(mtype_list):
        if (i % 10) == args.task_id:

            if args.verbose:
                print("Testing {} ...".format(mtype))

            ex_genes = set(gn for gn, _ in mtype.subtype_list())
            clf = mut_clf()

            cur_train_samps = mtype.get_samples(cdata.train_mut)
            cur_test_samps = mtype.get_samples(cdata.test_mut)

            clf.tune_coh(cdata,
                         mtype,
                         exclude_genes=ex_genes,
                         tune_splits=args.tune_splits,
                         test_count=args.test_count,
                         parallel_jobs=args.parallel_jobs)

            clf.fit_coh(cdata, mtype, exclude_genes=ex_genes)
            out_acc[mtype]['Base'] = clf.eval_coh(cdata,
                                                  mtype,
                                                  exclude_genes=ex_genes)

            if (len(cur_train_samps - base_train_samps) > 3
                    and len(cur_test_samps - base_test_samps) > 3):

                print("Null test {}".format(mtype))
                clf.tune_coh(cdata,
                             mtype,
                             exclude_genes=ex_genes,
                             tune_splits=args.tune_splits,
                             exclude_samps=base_train_samps,
                             test_count=args.test_count,
                             parallel_jobs=args.parallel_jobs)

                clf.fit_coh(cdata,
                            mtype,
                            exclude_genes=ex_genes,
                            exclude_samps=base_train_samps)
                out_acc[mtype]['Null'] = clf.eval_coh(
                    cdata,
                    mtype,
                    exclude_genes=ex_genes,
                    exclude_samps=base_test_samps)

            if (len(cur_train_samps & base_train_samps) > 3
                    and len(cur_test_samps & base_test_samps) > 3):

                print("Mut test {}".format(mtype))
                clf.tune_coh(cdata,
                             mtype,
                             exclude_genes=ex_genes,
                             tune_splits=args.tune_splits,
                             include_samps=base_train_samps,
                             test_count=args.test_count,
                             parallel_jobs=args.parallel_jobs)

                clf.fit_coh(cdata,
                            mtype,
                            exclude_genes=ex_genes,
                            include_samps=base_train_samps)
                out_acc[mtype]['Mut'] = clf.eval_coh(
                    cdata,
                    mtype,
                    exclude_genes=ex_genes,
                    include_samps=base_test_samps)

            if (len(cur_train_samps - base_train_samps) > 3
                    and len(cur_test_samps & base_test_samps) > 3):

                print("Null cross {}".format(mtype))
                clf.tune_coh(cdata,
                             mtype,
                             exclude_genes=ex_genes,
                             tune_splits=args.tune_splits,
                             exclude_samps=base_train_samps,
                             test_count=args.test_count,
                             parallel_jobs=args.parallel_jobs)

                clf.fit_coh(cdata,
                            mtype,
                            exclude_genes=ex_genes,
                            exclude_samps=base_train_samps)
                out_acc[mtype]['NullX'] = clf.eval_coh(
                    cdata,
                    mtype,
                    exclude_genes=ex_genes,
                    include_samps=base_test_samps)

            if (len(cur_train_samps & base_train_samps) > 3
                    and len(cur_test_samps - base_test_samps) > 3):

                print("Mut cross {}".format(mtype))
                clf.tune_coh(cdata,
                             mtype,
                             exclude_genes=ex_genes,
                             tune_splits=args.tune_splits,
                             include_samps=base_train_samps,
                             test_count=args.test_count,
                             parallel_jobs=args.parallel_jobs)

                clf.fit_coh(cdata,
                            mtype,
                            exclude_genes=ex_genes,
                            include_samps=base_train_samps)
                out_acc[mtype]['MutX'] = clf.eval_coh(
                    cdata,
                    mtype,
                    exclude_genes=ex_genes,
                    exclude_samps=base_test_samps)

        else:
            del out_acc[mtype]

    # saves the performance measurements for each variant to file
    out_file = os.path.join(
        args.mtype_dir, 'results',
        'out__cv-{}_task-{}.p'.format(args.cv_id, args.task_id))
    pickle.dump(
        {
            'Acc': out_acc,
            'Info': {
                'TuneSplits': args.tune_splits,
                'TestCount': args.test_count,
                'ParallelJobs': args.parallel_jobs
            }
        }, open(out_file, 'wb'))
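
In the loop above, the (i % 10) == args.task_id test spreads the sub-types round-robin across ten parallel tasks, with each task keeping scores only for its own share. A self-contained sketch of that partitioning pattern (function name hypothetical):

def shard(items, task_id, n_tasks=10):
    """Return the subset of items assigned to one round-robin task."""
    return [x for i, x in enumerate(items) if i % n_tasks == task_id]

# task 3 of 10 handles items 3, 13, 23, ...
assert shard(list(range(25)), 3) == [3, 13, 23]
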
Example #4
def main():
    """Runs the experiment."""

    parser = argparse.ArgumentParser(
        description='Set up the tour of sub-types to detect.'
        )
    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")

    # optional command line argument controlling the sample frequency
    # threshold a mutation sub-type must meet to be considered
    parser.add_argument('--freq_cutoff', type=float, default=0.02,
                        help='subtype sample frequency threshold')

    # optional command line argument controlling which mutation property
    # levels are used to define sub-types
    parser.add_argument('--mut_levels', type=str, default='Gene',
                        help='the mutation property levels to consider')

    # optional command line argument controlling verbosity
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    # parse the command line arguments, get the directory where found sub-types
    # will be saved for future use
    args = parser.parse_args()
    out_path = os.path.join(base_dir, 'setup', args.cohort)
    os.makedirs(out_path, exist_ok=True)
    use_lvls = args.mut_levels.split('__')

    # log into Synapse using locally-stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()
    
    cdata = MutationCohort(
        cohort=args.cohort, mut_genes=None, mut_levels=use_lvls,
        expr_source='Firehose', var_source='mc3', expr_dir=firehose_dir,
        cv_prop=1.0, samp_cutoff=args.freq_cutoff, syn=syn
        )

    if args.verbose:
        print("Found {} candidate genes with mutations in at least "
              "{:.1f}% of the samples in TCGA cohort {}.\nLooking for "
              "subtypes of these genes that are combinations of up to two "
              "mutations at annotation levels {} ...\n".format(
                  len(tuple(cdata.train_mut)), args.freq_cutoff * 100,
                  args.cohort, use_lvls
                )
             )
    
    min_samps = args.freq_cutoff * len(cdata.samples)
    if use_lvls == ['Gene']:

        use_mtypes = {MuType({('Gene', gn): None})
                      for gn, mut in cdata.train_mut
                      if len(mut) >= min_samps}

    elif use_lvls[0] == 'Gene':
        use_lvls = use_lvls[1:]

        use_mtypes = set()
        use_sampsets = set()
        mtype_sampsets = dict()

        for gn, mut in cdata.train_mut:
            cur_mtypes = {
                MuType({('Gene', gn): mtype})
                for mtype in mut.combtypes(comb_sizes=(1, 2),
                                           sub_levels=use_lvls,
                                           min_type_size=min_samps)
                }

            # finds the samples belonging to each enumerated sub-type that
            # hasn't already been found
            cur_sampsets = {
                mtype: frozenset(mtype.get_samples(cdata.train_mut))
                for mtype in cur_mtypes - use_mtypes}

            # removes the sub-types with so many mutated samples that there
            # are not enough negatively-labelled samples for classification
            mtype_sampsets.update({
                mtype: sampset for mtype, sampset in cur_sampsets.items()
                if len(sampset) <= (len(cdata.samples) - min_samps)
                })

        # ensures that when two sub-types have the same samples the one
        # further down the sort order gets removed
        sub_mtypes = sorted(list(mtype_sampsets))
        if args.verbose:
            print("Found {} new sub-types!\n".format(len(sub_mtypes)))

        for i, mtype in enumerate(sub_mtypes):
            if args.verbose and (i % 200) == 100:
                print("\nchecked {} sub-types\n".format(i))

            # remove each sub-type whose set of mutated samples is identical
            # to that of a sub-type that was already found
            if mtype_sampsets[mtype] in use_sampsets:
                if args.verbose:
                    print("Removing functionally duplicate "
                          "MuType {}".format(mtype))

            else:
                use_mtypes.update({mtype})
                use_sampsets.update({mtype_sampsets[mtype]})

    else:
        cur_mtypes = cdata.train_mut.combtypes(comb_sizes=(1, 2),
                                               sub_levels=use_lvls,
                                               min_type_size=min_samps)

        use_mtypes = set()
        use_sampsets = set()
        mtype_sampsets = dict()

        cur_sampsets = {mtype: frozenset(mtype.get_samples(cdata.train_mut))
                        for mtype in cur_mtypes - use_mtypes}

        # removes the sub-types with so many mutated samples that there
        # are not enough negatively-labelled samples for classification
        mtype_sampsets.update({
            mtype: sampset for mtype, sampset in cur_sampsets.items()
            if len(sampset) <= (len(cdata.samples) - min_samps)
            })

        # ensures that when two sub-types have the same samples the one
        # further down the sort order gets removed
        sub_mtypes = sorted(list(mtype_sampsets))
        if args.verbose:
            print("Found {} new sub-types!\n".format(len(sub_mtypes)))

        for i, mtype in enumerate(sub_mtypes):
            if args.verbose and (i % 200) == 100:
                print("\nchecked {} sub-types\n".format(i))

            # remove each sub-type whose set of mutated samples is identical
            # to that of a sub-type that was already found
            if mtype_sampsets[mtype] in use_sampsets:
                if args.verbose:
                    print("Removing functionally duplicate "
                          "MuType {}".format(mtype))

            else:
                use_mtypes.update({mtype})
                use_sampsets.update({mtype_sampsets[mtype]})

    if args.verbose:
        print("\nFound {} total sub-types!".format(len(use_mtypes)))

    # save the list of found non-duplicate sub-types to file
    pickle.dump(
        sorted(list(use_mtypes)),
        open(os.path.join(
            out_path, 'mtype_list__freq_{}__levels_{}.p'.format(
                args.freq_cutoff, args.mut_levels)
            ), 'wb')
        )

    pickle.dump({'Samps': cdata.samples},
                open(os.path.join(out_path, 'cohort_info.p'), 'wb'))

    with open(os.path.join(
            out_path,
            'mtype_count__freq_{}__levels_{}.txt'.format(
                args.freq_cutoff, args.mut_levels)), 'w') as fl:

        fl.write(str(len(use_mtypes)))
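
The duplicate-removal logic above is repeated in two branches; a sketch of how it might be factored into one helper, assuming each candidate sub-type is keyed to the frozenset of samples it mutates:

def dedup_mtypes(mtype_sampsets, verbose=False):
    """Keep one sub-type for each distinct set of mutated samples."""
    use_mtypes = set()
    use_sampsets = set()

    for i, mtype in enumerate(sorted(mtype_sampsets)):
        if verbose and (i % 200) == 100:
            print("\nchecked {} sub-types\n".format(i))

        if mtype_sampsets[mtype] in use_sampsets:
            if verbose:
                print("Removing functionally duplicate "
                      "MuType {}".format(mtype))
        else:
            use_mtypes.add(mtype)
            use_sampsets.add(mtype_sampsets[mtype])

    return use_mtypes
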
Example #5
def main(argv):
    """Runs the experiment."""
    syn = synapseclient.Synapse()
    syn.login()

    # load drug-mutation association data,
    # filter for pan-cancer associations
    drug_mut_assoc = pd.read_csv(base_dir +
                                 '/../../data/drugs/ioria/drug_anova.txt.gz',
                                 sep='\t',
                                 comment='#')

    if patient_cohs[argv[0]] in drug_mut_assoc.columns:
        drug_mut_assoc = drug_mut_assoc.loc[
            drug_mut_assoc[patient_cohs[argv[0]]] != 0, :]

    else:
        drug_mut_assoc = drug_mut_assoc.loc[drug_mut_assoc['PANCAN'] != 0, :]

    # categorize associations by mutation type
    pnt_indx = drug_mut_assoc['FEAT'].str.contains('_mut$')
    # TODO: determine how Iorio handled CNVs (they're currently ignored)
    cnv_indx = drug_mut_assoc['FEAT'].str.contains('^(?:loss|gain):')
    fus_indx = drug_mut_assoc['FEAT'].str.contains('_fusion$')

    # get list of genes affected by point mutations, load TCGA cohort
    # with corresponding set of mutations
    pnt_genes = list(
        set(x[0] for x in drug_mut_assoc['FEAT'][pnt_indx].str.split('_')))
    print(len(pnt_genes))

    # create a VariantCohort with expression only for genes that have point
    # mutations in the drug_mut_assoc dataframe; cv_prop, the cross-validation
    # proportion, is set to 1 so that all samples are used for training. The
    # cross-validation seed is given as the last argument of an HTCondor
    # submit script, and the cohort name as the first (it should match cohort
    # names as they appear in BMEG)
    tcga_var_coh = VariantCohort(syn,
                                 cohort="TCGA-{}".format(
                                     patient_cohs[argv[0]]),
                                 mut_genes=pnt_genes,
                                 mut_levels=['Gene', 'Type'],
                                 cv_seed=int(argv[-1]) + 1,
                                 cv_prop=1)

    tcga_back_cohs = {
        coh: VariantCohort(syn,
                           cohort=coh,
                           mut_genes=pnt_genes,
                           mut_levels=['Gene', 'Type'],
                           cv_seed=int(argv[-1]) + 1,
                           cv_prop=1)
        for coh in tcga_backcohs
    }

    # TODO: recall why frameshifts aren't considered below
    # get the list of point mutation types, and the drugs associated with at
    # least one of them
    pnt_mtypes = [
        MuType({('Gene', gn): {
                    ('Type', ('Frame', 'Point')): None
                }}) for gn in pnt_genes
    ]
    pnt_muts = {
        (gn + '_mut'): mtype
        for gn, mtype in zip(pnt_genes, pnt_mtypes)
        # TODO: the get_samples argument should be a MuTree...right?
        if len(mtype.get_samples(tcga_var_coh.train_mut)) >= 5
    }
    pnt_drugs = list(
        set(drug_mut_assoc['DRUG'][pnt_indx][
            drug_mut_assoc['FEAT'][pnt_indx].isin(pnt_muts.keys())]))
    pnt_drugs.sort()
    print(len(pnt_drugs))

    # stores predicted drug responses for cell lines and TCGA samples
    ccle_response = {}
    tcga_response = {}
    back_tcga_resp = {coh: {} for coh in tcga_backcohs}

    # stores the predicted drug response for the patient/organoid sample
    patient_response = pd.Series(float('nan'), index=pnt_drugs)

    # array that stores classifier performance on held-out cell lines
    clf_perf = pd.Series(float('nan'), index=pnt_drugs)

    # stores t-test p-values for mutation status vs. predicted
    # drug responses in the TCGA cohort
    tcga_ttest = pd.DataFrame(float('nan'),
                              index=pnt_drugs,
                              columns=pnt_muts.keys())

    # stores AUC scores for mutation status vs. drug response in TCGA
    tcga_auc = pd.DataFrame(float('nan'),
                            index=pnt_drugs,
                            columns=pnt_muts.keys())

    # loads patient (or patient-derived model (PDM)) RNAseq data
    patient_expr = pd.read_csv(patient_files[argv[0]], header=0, sep='\t')

    # get rid of the unnecessary info in gene_id, get Hugo symbols
    patient_expr['gene_id'] = [
        i.split('^')[1] for i in patient_expr['gene_id']
    ]
    annot_data = get_gencode()
    patient_expr['Symbol'] = [
        annot_data[gn]['gene_name'] if gn in annot_data else 'no_gene'
        for gn in patient_expr['gene_id']
    ]

    # ensure that there are no zeros in preparation for log normalization
    patient_expr.loc[:, 'FPKM'] = (
        patient_expr.loc[:, 'FPKM'] +
        min(patient_expr.loc[:, 'FPKM'][patient_expr.loc[:, 'FPKM'] > 0]) / 2)
    # log normalize the FPKM values
    patient_expr.loc[:, 'FPKM'] = np.log2(patient_expr.loc[:, 'FPKM'])

    # combine multiple entries of same gene symbol (use their mean)
    patient_expr = patient_expr.groupby(['Symbol'])['FPKM'].mean()
    patient_expr = pd.DataFrame(patient_expr)

    for drug in pnt_drugs:
        drug_clf = eval(argv[1])()
        cell_line_drug_coh = DrugCohort(cohort='ioria',
                                        drug_names=[drug],
                                        cv_seed=int(argv[-1]))
        drug_lbl = cell_line_drug_coh.train_resp.columns[0]
        print("Testing drug {} with alias {} ...".format(drug, drug_lbl))

        # TODO: 'Symbol' --> gene_id
        # get the genes shared by all of the datasets (TCGA, CCLE, the
        # patient/PDM RNA-seq, and each background TCGA cohort)
        use_genes = (set(tcga_var_coh.genes) & set(cell_line_drug_coh.genes)
                     & set(patient_expr.index)
                     & reduce(lambda x, y: x & y,
                              [coh.genes for coh in tcga_back_cohs.values()]))

        # filter patient (or PDM) RNAseq data to include only use_genes
        patient_expr_filt = patient_expr.loc[use_genes, :]

        # TODO: does patient_expr_filtered need to be transposed?

        # tunes and fits the classifier on the CCLE data, and evaluates its
        # performance on the held-out samples
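        # profile the tuning and fitting steps so their runtime cost can be
        # inspected via the pstats report printed below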
        pr = cProfile.Profile()
        pr.enable()
        drug_clf.tune_coh(cell_line_drug_coh,
                          pheno=drug_lbl,
                          tune_splits=4,
                          test_count=16,
                          include_genes=use_genes)
        drug_clf.fit_coh(cell_line_drug_coh,
                         pheno=drug_lbl,
                         include_genes=use_genes)
        pr.disable()
        s = io.StringIO()
        sortby = 'cumulative'
        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        ps.print_stats()
        print(s.getvalue())
        print(drug_clf)
        clf_perf[drug] = drug_clf.eval_coh(cell_line_drug_coh,
                                           pheno=drug_lbl,
                                           include_genes=use_genes)

        # predicts drug response for the patient or PDM, stores classifier
        # for later use
        ccle_response[drug] = pd.Series(
            drug_clf.predict_train(cell_line_drug_coh,
                                   include_genes=use_genes))
        tcga_response[drug] = pd.Series(
            drug_clf.predict_train(tcga_var_coh, include_genes=use_genes))

        for coh in tcga_backcohs:
            back_tcga_resp[coh][drug] = pd.Series(
                drug_clf.predict_train(tcga_back_cohs[coh],
                                       include_genes=use_genes))

        patient_response[drug] = drug_clf.predict(
            patient_expr_filt.transpose())[0]

        for gn, mtype in pnt_muts.items():
            print("Gene: {}, Drug: {}".format(gn, drug))
            # for each mutated gene, get the vector of mutation status
            # for the TCGA samples
            mut_stat = np.array(tcga_var_coh.train_pheno(mtype=mtype))

            # gets the classifier's predictions of drug response for the
            # TCGA cohort, and evaluate its concordance with mutation status
            tcga_ttest.loc[drug, gn] = -log10(
                ttest_ind(tcga_response[drug][mut_stat],
                          tcga_response[drug][~mut_stat],
                          equal_var=False)[1])
            tcga_auc.loc[drug, gn] = roc_auc_score(mut_stat,
                                                   tcga_response[drug])

    # save everything to file
    out_data = {
        'Performance': clf_perf,
        'CCLE_Response': ccle_response,
        'TCGA_Response': tcga_response,
        'back_TCGA_Response': back_tcga_resp,
        'Patient_Response': patient_response,
        'TCGA_ttest': tcga_ttest,
        'TCGA_AUC': tcga_auc
    }
    out_file = ('/home/users/grzadkow/compbio/bergamot/HetMan/experiments/'
                'drug_predictions/output/mat_' + argv[0] + '_' + argv[1] +
                '__run' + argv[-1] + '.p')
    pickle.dump(out_data, open(out_file, 'wb'))
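
As an aside on the FPKM preprocessing above: adding half of the smallest positive value before log-transforming keeps zero expression values finite. A minimal standalone illustration:

import numpy as np
import pandas as pd

fpkm = pd.Series([0., 1., 4.])
pseudo = fpkm[fpkm > 0].min() / 2      # half the smallest positive value
log_fpkm = np.log2(fpkm + pseudo)      # 0. maps to log2(0.5) == -1.0
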
Example #6
def main():
    """Runs the experiment."""

    parser = argparse.ArgumentParser(
        description=("Test a classifier's ability to create a mutation "
                     "signature for a gene that can be transferred from a "
                     "TCGA cohort to ICGC PACA-AU.")
        )

    parser.add_argument('classif', type=str,
                        help='a classifier in HetMan.predict.classifiers')
    parser.add_argument('mtypes', type=str,
                        help='a list of mutation types to test')

    parser.add_argument('cv_id', type=int,
                        help='a random seed used for cross-validation')
    parser.add_argument('task_id', type=int,
                        help=('the subset of TCGA cohorts and mutated genes '
                              'to assign to this task'))

    parser.add_argument(
        '--tune_splits', type=int, default=4,
        help='how many training cohort splits to use for tuning'
        )
    parser.add_argument(
        '--test_count', type=int, default=24,
        help='how many hyper-parameter values to test in each tuning split'
        )
    parser.add_argument(
        '--parallel_jobs', type=int, default=8,
        help='how many parallel CPUs to allocate the tuning tests across'
        )

    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    args = parser.parse_args()
    if args.verbose:
        print("Starting ICGC transfer test with classifier {} on mutation "
              "type list `{}` for cross-validation ID {} and "
              "task ID {} ...".format(args.classif, args.mtypes,
                                      args.cv_id, args.task_id))

    cohort_mtypes = sorted(pickle.load(
        open(os.path.join(base_dir, 'setup',
                          'cohort_{}.p'.format(args.mtypes)),
             'rb')))

    task_size = ceil(len(cohort_mtypes) / 6)
    cohort_mtypes = [x for i, x in enumerate(cohort_mtypes)
                     if i // task_size == args.task_id]

    use_cohorts = set(coh for coh, _ in cohort_mtypes)
    mut_clf = eval(args.classif)

    out_acc = {cohort: dict() for cohort in use_cohorts}
    out_par = {cohort: dict() for cohort in use_cohorts}

    cdata_icgc = ICGCcohort('PACA-AU', icgc_data_dir, mut_genes=None,
                            samp_cutoff=[1/12, 11/12], cv_prop=1.0)

    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/mgrzad"
                                "/input-data/synapse")
    syn.login()
    
    for cohort in use_cohorts:
        cur_mtypes = [mtype for coh, mtype in cohort_mtypes if coh == cohort]

        if args.mtypes == 'genes':
            cur_genes = cur_mtypes.copy()
            cur_mtypes = [MuType({('Gene', gn): None}) for gn in cur_genes]

        else:
            cur_genes = reduce(
                or_,
                [set(gn for gn, _ in mtype.subtype_list())
                 for mtype in cur_mtypes]
                )

        tcga_cdata = TCGAcohort(
            cohort=cohort, mut_genes=cur_genes,
            mut_levels=['Gene', 'Form_base'],
            expr_source='toil', expr_dir=toil_dir, var_source='mc3', syn=syn,
            collapse_txs=True, cv_prop=0.75, cv_seed=(args.cv_id - 37) * 101
            )

        if args.verbose:
            print("Loaded mutations for {} genes in cohort {} with "
                  "{} samples.".format(len(cur_genes), cohort,
                                       len(tcga_cdata.samples)))

        for mtype in cur_mtypes:
            if args.verbose:
                print("Testing {} in {} ...".format(mtype, cohort))

            clf = mut_clf()
            use_genes = ((cdata_icgc.genes & tcga_cdata.genes)
                         - set(gn for gn, _ in mtype.subtype_list()))

            clf.tune_coh(tcga_cdata, mtype, include_genes=use_genes,
                         tune_splits=args.tune_splits,
                         test_count=args.test_count,
                         parallel_jobs=args.parallel_jobs)
            out_par[cohort][mtype] = {par: clf.get_params()[par]
                                      for par, _ in clf.tune_priors}

            clf.fit_coh(tcga_cdata, mtype, include_genes=use_genes)
            out_acc[cohort][mtype] = clf.eval_coh(
                cdata_icgc, mtype, include_genes=use_genes,
                use_train=True
                )

    out_file = os.path.join(base_dir, 'output', args.classif, args.mtypes,
                            'out__cv-{}_task-{}.p'.format(
                                args.cv_id, args.task_id)
                            )

    pickle.dump({'Acc': out_acc, 'Par': out_par,
                 'Info': {'TuneSplits': args.tune_splits,
                          'TestCount': args.test_count,
                          'ParallelJobs': args.parallel_jobs}},
                open(out_file, 'wb'))
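
Unlike the round-robin split in Example #3, this script shards the (cohort, mutation type) pairs into contiguous blocks of size ceil(n / 6). A small sketch of the arithmetic (function name hypothetical):

from math import ceil

def block_shard(items, task_id, n_tasks=6):
    """Return the task_id-th contiguous block of items."""
    block = ceil(len(items) / n_tasks)
    return [x for i, x in enumerate(items) if i // block == task_id]

# ten items across six tasks fall into blocks of two
assert block_shard(list(range(10)), 0) == [0, 1]
assert block_shard(list(range(10)), 4) == [8, 9]
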
Example #7
def main():
    """Runs the experiment."""

    parser = argparse.ArgumentParser(
        description='Set up searching for sub-types to detect.')

    # positional command line arguments
    parser.add_argument('cohort', type=str, help='a TCGA cohort')
    parser.add_argument('classif',
                        type=str,
                        help='a classifier in HetMan.predict.classifiers')
    parser.add_argument('base_gene',
                        type=str,
                        help='a gene to cross with respect to')

    # optional command line arguments controlling the thresholds for which
    # individual mutations and how many genes' mutations are considered
    parser.add_argument('--freq_cutoff',
                        type=int,
                        default=20,
                        help='sub-type sample frequency threshold')
    parser.add_argument('--max_genes',
                        type=int,
                        default=200,
                        help='maximum number of mutated genes to consider')

    # optional command line arguments for what kinds of mutation sub-types to
    # look for in terms of properties and number of mutations to combine
    parser.add_argument(
        '--mut_levels',
        type=str,
        nargs='+',
        default=['Form_base', 'Exon', 'Protein'],
        help='the mutation property levels to consider in addition to `Gene`')
    parser.add_argument(
        '--comb_size',
        type=int,
        default=2,
        help='maximum number of individual mutations to combine '
             'when searching for mutation sub-types')

    # optional command line argument controlling verbosity
    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help='turns on diagnostic messages')

    # parse the command line arguments, get the directory where found sub-types
    # will be saved for future use
    args = parser.parse_args()
    out_path = os.path.join(base_dir, 'output', args.cohort, args.classif,
                            'cross', args.base_gene)

    if args.verbose:
        print("Looking for mutation sub-types in cohort {} composed of at "
              "most {} individual mutations with at least {} "
              "samples in total.\n".format(args.cohort, args.comb_size,
                                           args.freq_cutoff))

    # log into Synapse using locally-stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    # load the expression matrix for the given cohort from Broad Firehose,
    # load the MC3 variant call set from Synapse, find the mutations for the
    # samples that are in both datasets
    expr_data = get_expr_firehose(args.cohort, firehose_dir)
    mc3_data = get_variants_mc3(syn)
    expr_mc3 = mc3_data.loc[mc3_data['Sample'].isin(expr_data.index), :]

    # get the genes whose mutations appear in enough samples to pass the
    # frequency threshold
    gene_counts = expr_mc3.groupby(by='Gene').Sample.nunique()
    count_cutoff = int(args.freq_cutoff / args.comb_size)
    common_genes = set(gene_counts.index[gene_counts >= count_cutoff])

    if args.verbose:
        print("Found {} candidate genes with at least {} potential "
              "mutated samples.".format(len(common_genes), count_cutoff))

    if len(common_genes) >= args.max_genes:
        gene_counts = gene_counts[common_genes].sort_values(ascending=False)
        common_genes = set(gene_counts[:args.max_genes].index)

        if args.verbose:
            print("Too many genes found, culling list to {} genes which each "
                  "have at least {} mutated samples.".format(
                      args.max_genes, min(gene_counts[common_genes])))

    cdata = VariantCohort(cohort=args.cohort,
                          mut_genes=common_genes,
                          mut_levels=['Gene'] + args.mut_levels,
                          expr_source='Firehose',
                          data_dir=firehose_dir,
                          cv_prop=1.0,
                          syn=syn)

    base_mtype = MuType({('Gene', args.base_gene): None})
    base_samps = base_mtype.get_samples(cdata.train_mut)

    with_muts = deepcopy(cdata.train_mut).subtree(base_samps)
    without_muts = deepcopy(cdata.train_mut).subtree(cdata.samples -
                                                     base_samps)

    # initializes the set of found sub-types and the set of sample sets
    # each sub-type appears in
    use_mtypes = set()
    use_sampsets = set()

    search_level = 1
    break_status = False

    # while we have not reached the limit of sub-type enumeration and have
    # not run out of property level combinations to test...
    while (len(use_mtypes) < 10000 and not break_status
           and search_level <= 2**len(args.mut_levels)):

        # try a list of property level combinations and number of individual
        # variants to combine, where the complexity of the level combination
        # plus the variant count is held constant
        for lvl_combn, comb_size in zip(
                rev_powerset_slice(args.mut_levels, search_level),
                range(1, min(search_level + 1, args.comb_size + 1))):
            use_lvls = ['Gene'] + list(lvl_combn)

            if args.verbose:
                print("\nLooking for sub-types that are combinations "
                      "of {} mutation(s) at levels {}...\n".format(
                          comb_size, use_lvls))

            # enumerates the sub-types consisting of a combination of the given
            # number of individual mutations at the given property levels
            sub_mtypes = with_muts.combtypes(comb_sizes=(comb_size, ),
                                             sub_levels=use_lvls,
                                             min_type_size=int(
                                                 args.freq_cutoff / 2))
            sub_mtypes |= without_muts.combtypes(comb_sizes=(comb_size, ),
                                                 sub_levels=use_lvls,
                                                 min_type_size=int(
                                                     args.freq_cutoff / 2))

            # finds the samples belonging to each enumerated sub-type that
            # hasn't already been found
            mtype_sampsets = {
                mtype: frozenset(mtype.get_samples(cdata.train_mut))
                for mtype in sub_mtypes - use_mtypes
                if (mtype & base_mtype).is_empty()
            }

            # removes the sub-types with so many mutated samples that there
            # are not enough negatively-labelled samples for classification
            mtype_sampsets = {
                mtype: sampset
                for mtype, sampset in mtype_sampsets.items()
                if len(sampset) <= (len(cdata.samples) - args.freq_cutoff)
            }

            sub_mtypes = sorted(list(mtype_sampsets))
            if args.verbose:
                print("Found {} new sub-types!\n".format(len(sub_mtypes)))

            # if the list of remaining sub-types isn't too long...
            if len(sub_mtypes) < 8000:
                add_mtypes = set()

                for i, mtype in enumerate(sub_mtypes):
                    if args.verbose and (i % 200) == 100:
                        print("\nchecked {} sub-types\n".format(i))

                    # ...we remove each one whose set of mutated samples is
                    # identical to that of a sub-type that was already found
                    if mtype_sampsets[mtype] in use_sampsets:
                        if args.verbose:
                            print("Removing functionally duplicate MuType {}"\
                                    .format(mtype))

                    else:
                        add_mtypes.update({mtype})
                        use_sampsets.update({mtype_sampsets[mtype]})

                use_mtypes |= add_mtypes

            elif len(sub_mtypes) > 100000:
                break_status = True

        search_level += 1

    if args.verbose:
        print("\nFound {} total sub-types!".format(len(use_mtypes)))

    # save the list of found non-duplicate sub-types to file
    pickle.dump(sorted(list(use_mtypes)),
                open(os.path.join(out_path, 'tmp', 'mtype_list.p'), 'wb'))
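
rev_powerset_slice is not defined in any of these excerpts. Judging from the comment at its call site, the search schedule pairs progressively smaller level combinations with larger variant counts so that their combined complexity stays constant; a speculative sketch of one such schedule (not the repository's actual helper):

from itertools import combinations

def search_schedule(lvls, search_level, max_comb=2):
    """Yield (level combination, variant count) pairs whose combined
    complexity, len(combination) + count, always equals search_level + 1."""
    for comb_size in range(1, min(search_level, max_comb) + 1):
        lvl_size = search_level + 1 - comb_size

        if 1 <= lvl_size <= len(lvls):
            for lvl_combn in combinations(lvls, lvl_size):
                yield lvl_combn, comb_size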