def test_medium(self, muts, mtree, mtypes, mut_lvls):
    """Can we query sample sets for mid-complexity MuTypes?"""
    assert (mtypes[1].get_samples(mtree)
            == set(muts.Sample[muts.Form == 'Missense_Mutation']))
    assert (mtypes[2].get_samples(mtree)
            == set(muts.Sample[muts.Exon == '8/21']))
    assert (mtypes[7].get_samples(mtree)
            == set(muts.Sample[muts.Gene.isin(['TP53', 'KRAS', 'BRAF'])]))

    for (gn, frm), mut_data in muts.groupby(['Gene', 'Form']):
        assert (MuType({('Gene', gn): {('Form', frm): None}})
                .get_samples(mtree) == set(mut_data.Sample))

    for (gn1, gn2) in combn(set(muts.Gene), r=2):
        assert (MuType({('Gene', (gn1, gn2)): None}).get_samples(mtree)
                == set(muts.Sample[muts.Gene.isin([gn1, gn2])]))

    for (frm1, frm2) in combn(set(muts.Form), r=2):
        assert (MuType({('Form', (frm1, frm2)): None}).get_samples(mtree)
                == set(muts.Sample[muts.Form.isin([frm1, frm2])]))


def plot_task_characteristics(coef_df, auc_vals, pheno_dict, pred_df, args):
    fig, axarr = plt.subplots(figsize=(11, 18), nrows=3, ncols=1)
    coef_magns = coef_df.abs().mean(axis=1)

    for mtype, coef_list in coef_df.iterrows():
        use_clr = choose_subtype_colour(mtype)

        coef_grps = coef_list.groupby(level=0)
        mtype_coefs = np.array([coef_grps.nth(i) for i in range(40)])

        pcorr_val = np.mean([pearsonr(mtype_coefs[i], mtype_coefs[j])[0]
                             for i, j in combn(range(40), 2)])
        scorr_val = np.mean([spearmanr(mtype_coefs[i], mtype_coefs[j])[0]
                             for i, j in combn(range(40), 2)])

        for ax, val in zip(axarr, [pcorr_val, scorr_val, coef_magns[mtype]]):
            ax.scatter(auc_vals[mtype], val, facecolor=[use_clr],
                       s=751 * np.mean(pheno_dict[mtype]),
                       alpha=0.31, edgecolors='none')

    for ax in axarr:
        x_lims = ax.get_xlim()
        y_lims = [-ax.get_ylim()[1] / 91, ax.get_ylim()[1]]

        ax.plot(x_lims, [0, 0], color='black', linewidth=1.6, alpha=0.71)
        ax.plot([0.5, 0.5], [0, y_lims[1]],
                color='black', linewidth=1.4, linestyle=':', alpha=0.61)

        ax.tick_params(axis='both', which='major', labelsize=17)
        ax.grid(alpha=0.37, linewidth=0.9)
        ax.set_xlim(x_lims)
        ax.set_ylim(y_lims)

    axarr[-1].set_xlabel("Mean AUC Across CVs", size=23, weight='semibold')
    for ax, ylbl in zip(axarr, ["Mean Pearson Corr\nBetween CVs",
                                "Mean Spearman Corr\nBetween CVs",
                                "Mean Signature\nMagnitude"]):
        ax.set_ylabel(ylbl, size=23, weight='semibold')

    plt.savefig(os.path.join(
        plot_dir, '__'.join([args.expr_source, args.cohort]),
        "{}_task-characteristics_{}.svg".format(args.gene, args.classif)),
        bbox_inches='tight', format='svg')
    plt.close()


def test_invariants(self, mtypes):
    """Do binary operators preserve set theoretic invariants?"""
    for mtype in mtypes:
        assert mtype == (mtype & mtype)
        assert mtype == (mtype | mtype)
        assert (mtype - mtype).is_empty()

    for mtype1, mtype2 in combn(mtypes, 2):
        if mtype1.get_levels() == mtype2.get_levels():
            assert mtype1 | mtype2 == mtype2 | mtype1
            assert mtype1 & mtype2 == mtype2 & mtype1
            assert (mtype1 | mtype2).is_supertype(mtype1 & mtype2)
            assert mtype1 - mtype2 == mtype1 - (mtype1 & mtype2)
            assert mtype1 | mtype2 == ((mtype1 - mtype2)
                                       | (mtype2 - mtype1)
                                       | (mtype1 & mtype2))

        if mtype1.get_levels() <= mtype2.get_levels():
            if mtype1 == mtype2 or mtype1.is_supertype(mtype2):
                assert mtype2 == (mtype1 & mtype2)

        if mtype1.get_levels() >= mtype2.get_levels():
            if mtype1 == mtype2 or mtype2.is_supertype(mtype1):
                assert mtype2 == (mtype1 | mtype2)


def test_allkeys(self, mtree_tester):
    """Can we retrieve the mutation set key of the tree?"""
    muts, mtree, mut_lvls = mtree_tester.get_muts_mtree()
    lvl_sets = chain.from_iterable(combn(mut_lvls, r)
                                   for r in range(1, len(mut_lvls) + 1))

    for lvl_set in lvl_sets:
        lvl_key = {}

        for vals, _ in muts.groupby(lvl_set):
            cur_key = lvl_key
            if isinstance(vals, str):
                vals = (vals, )

            for i in range(len(lvl_set) - 1):
                if (lvl_set[i], vals[i]) not in cur_key:
                    cur_key.update({(lvl_set[i], vals[i]): {}})
                cur_key = cur_key[(lvl_set[i], vals[i])]

            cur_key.update({(lvl_set[-1], vals[-1]): None})

        assert mtree.allkey(lvl_set) == lvl_key


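# For reference, the key structure this test reconstructs by hand: an
# allkey() result is a nested dict keyed by (level, value) tuples, with
# None marking the leaf level. A sketch with toy values (not taken from
# any actual cohort):
#
#   {('Gene', 'TP53'): {('Form', 'Missense_Mutation'): None,
#                       ('Form', 'Nonsense_Mutation'): None}}

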
def test_print(self, mcombs):
    """Can we print MutCombs?"""
    for mcomb1, mcomb2 in combn(mcombs, 2):
        if mcomb1 == mcomb2:
            assert repr(mcomb1) == repr(mcomb2)
            assert str(mcomb1) == str(mcomb2)
        else:
            assert repr(mcomb1) != repr(mcomb2)


def test_order(self, mtypes):
    """Is a MutComb invariant to the ordering of its arguments?"""
    for mtype1, mtype2 in combn(mtypes, r=2):
        assert MutComb(mtype1, mtype2) == MutComb(mtype2, mtype1)

        for mtype3 in mtypes:
            assert (MutComb(mtype1, mtype2, not_mtype=mtype3)
                    == MutComb(mtype2, mtype1, not_mtype=mtype3))


def plot_similarity_scatter(simil_df, auc_list, pheno_dict, args):
    fig, ax = plt.subplots(figsize=(10, 6))

    mutex_pvals = []
    simil_vals = []
    auc_vals = []
    size_vals = []

    # use pairs of disjoint singleton subtypes, or pairs where exactly one
    # member is itself a combination of two subtypes (note the explicit
    # parentheses: the original `len(mtypes1) == 2 ^ len(mtypes2) == 2`
    # parsed as a chained comparison against `2 ^ len(mtypes2)`)
    test_combs = [
        (mtypes1, mtypes2)
        for mtypes1, mtypes2 in combn(auc_list[auc_list > 0.6].index, 2)
        if ((len(mtypes1) == 1 and len(mtypes2) == 1
             and (mtypes1[0] & mtypes2[0]).is_empty())
            or ((len(mtypes1) == 2) ^ (len(mtypes2) == 2)))
        ]

    for mtypes1, mtypes2 in test_combs:
        mutex_pvals += [-np.log10(fisher_exact(
            table=pd.crosstab(pheno_dict[mtypes1], pheno_dict[mtypes2]),
            alternative='less'
            )[1])]

        siml_adj1 = np.clip(auc_list[mtypes1] - 0.5, 0, 1) ** 2
        siml_adj2 = np.clip(auc_list[mtypes2] - 0.5, 0, 1) ** 2

        siml_val = siml_adj1 * simil_df.loc[[mtypes1], [mtypes2]].iloc[0, 0]
        siml_val += siml_adj2 * simil_df.loc[[mtypes2], [mtypes1]].iloc[0, 0]
        simil_vals += [siml_val / (siml_adj1 + siml_adj2)]

        auc_vals += [max(siml_adj1, siml_adj2)]
        size_vals += [sum(pheno_dict[mtypes1]) + sum(pheno_dict[mtypes2])]

    for mutex_pval, simil_val, auc_val, size_val in zip(
            mutex_pvals, simil_vals, auc_vals, size_vals):
        ax.scatter(mutex_pval, simil_val, marker='o',
                   s=size_val / 13, alpha=(auc_val - 0.01) ** 0.61)

    plt.xticks(size=11)
    plt.yticks(size=11)
    plt.xlabel("Mutual Exclusivity", size=23, weight='semibold')
    plt.ylabel("Inferred Similarity", size=23, weight='semibold')

    plt.savefig(os.path.join(
        plot_dir, '{}_{}'.format(args.cohort, args.gene),
        "simil-scatter__{}__samps_{}__{}.png".format(
            args.classif, args.samp_cutoff, args.mut_levels)),
        dpi=300, bbox_inches='tight')
    plt.close()


def plot_auc_comparisons(auc_df, aucs_df, pheno_dict, args):
    fig, axarr = plt.subplots(figsize=(13, 12), nrows=3, ncols=3)

    var_df = aucs_df.applymap(np.var).applymap(np.log10)
    auc_rng = np.percentile(auc_df.values, q=[0, 100])
    var_rng = np.percentile(var_df.values, q=[0, 100])

    for i, cis_lbl in enumerate(cis_lbls):
        axarr[i, i].axis('off')
        axarr[i, i].text(0.5, 0.5, cis_lbl, size=23, weight='semibold',
                         ha='center', va='center')

    for (i, cis_lbl1), (j, cis_lbl2) in combn(enumerate(cis_lbls), 2):
        for mtype in auc_df.index:
            mtype_size = 211 * np.mean(pheno_dict[mtype])

            axarr[i, j].scatter(
                auc_df.loc[mtype, cis_lbl1], auc_df.loc[mtype, cis_lbl2],
                marker='o', s=mtype_size, alpha=0.41, edgecolor='none')
            axarr[j, i].scatter(
                var_df.loc[mtype, cis_lbl2], var_df.loc[mtype, cis_lbl1],
                marker='o', s=mtype_size, alpha=0.41, edgecolor='none')

        axarr[i, j].plot([auc_rng[0], auc_rng[1]], [auc_rng[0], auc_rng[1]],
                         linewidth=1.3, linestyle='--',
                         color='#550000', alpha=0.53)
        axarr[j, i].plot([var_rng[0], var_rng[1]], [var_rng[0], var_rng[1]],
                         linewidth=1.3, linestyle='--',
                         color='#550000', alpha=0.53)

    fig.tight_layout(w_pad=1.9, h_pad=1.9)
    plt.savefig(os.path.join(
        plot_dir, "{}__{}__samps-{}".format(args.expr_source, args.cohort,
                                            args.samp_cutoff),
        "auc-comparisons_{}__{}.svg".format(args.mut_levels, args.classif)),
        bbox_inches='tight', format='svg')
    plt.close()


def test_print(self, mtypes):
    """Can we print MuTypes?"""
    for mtype in mtypes:
        assert isinstance(repr(mtype), str)
        assert isinstance(str(mtype), str)

    for mtype1, mtype2 in combn(mtypes, 2):
        if mtype1 == mtype2:
            assert str(mtype1) == str(mtype2)
        else:
            assert repr(mtype1) != repr(mtype2)
            assert str(mtype1) != str(mtype2)


def __new__(cls, *mtypes, not_mtype=None):
    if not all(isinstance(mtype, MuType) for mtype in mtypes):
        raise TypeError("A MutComb object must be a combination of MuTypes!")

    mtypes = list(mtypes)
    obj = super().__new__(cls)

    # removes overlap between the given mutations by discarding any
    # mutation that is a supertype of another given mutation
    for i, j in combn(range(len(mtypes)), r=2):
        if mtypes[i] is not None and mtypes[j] is not None:
            if mtypes[i].is_supertype(mtypes[j]):
                mtypes[i] = None
            elif mtypes[j].is_supertype(mtypes[i]):
                mtypes[j] = None

    # removes mutations that are covered by other given mutations
    mtypes = [mtype for mtype in mtypes if mtype is not None]
    intrx_mtype = reduce(and_, mtypes)

    if not_mtype is not None:
        mtypes = [mtype - not_mtype
                  if mtype.get_levels() == not_mtype.get_levels()
                  else mtype
                  for mtype in mtypes]

        if not_mtype.get_levels() == intrx_mtype.get_levels():
            not_mtype -= intrx_mtype
            if not_mtype.is_empty():
                not_mtype = None

    # removes mutations left empty after the exclusion is subtracted
    mtypes = [mtype for mtype in mtypes if not mtype.is_empty()]

    # if only one unique mutation was given, return that mutation...
    if mtypes:
        if len(mtypes) == 1 and not_mtype is None:
            return mtypes[0]

        # ...otherwise, return the combination of the given mutations
        else:
            obj.mtypes = frozenset(mtypes)
            obj.not_mtype = not_mtype
            return obj

    else:
        return MuType({})


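# A minimal usage sketch (hypothetical subtype values, assuming only the
# MuType key structure used elsewhere in this codebase): because __new__
# discards any given mutation that is a supertype of another, combining a
# gene-level type with one of its own subtypes collapses to the subtype.
#
#   tp53 = MuType({('Gene', 'TP53'): None})
#   tp53_mis = MuType({('Gene', 'TP53'):
#                      {('Form', 'Missense_Mutation'): None}})
#
#   assert MutComb(tp53, tp53_mis) == tp53_mis

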
def eliminate_with_other_tuples(cell, groups_with_cell, max_size=5):
    """Figure out if there are two other cells in this row, column or
    square that can each only be the same two items (e.g. two cells that
    are both (1,5) ). If so, remove those numbers from this cell's list.
    Extended to groups larger than two."""
    for cells in groups_with_cell:
        for size in range(max_size, 1, -1):
            # only consider *other* cells, so this cell can never be
            # emptied by subtracting its own candidates from itself
            for cell_comb in combn(
                    [c for c in cells if c is not cell and len(c) == size],
                    size):

                # the original reduced pairwise equality over the tuples,
                # which only works for groups of two; instead check that
                # every candidate set in the combination is identical
                if all(c == cell_comb[0] for c in cell_comb[1:]):
                    cell -= cell_comb[0]

                    if len(cell) == 1:
                        return


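# A worked example of the naked-pair case the docstring describes,
# assuming cells are mutable sets of candidate digits (toy values, not
# taken from any actual puzzle): two other cells in the row that can each
# only be {1, 5} force 1 and 5 out of this cell.
#
#   cell = {1, 4, 5, 7}
#   row = [cell, {1, 5}, {1, 5}, {2, 3, 6}]
#
#   eliminate_with_other_tuples(cell, [row])
#   assert cell == {4, 7}

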
def test_type(self, mtree_tester):
    """Is the Type mutation level correctly defined?"""
    muts, mtree, mut_lvls = mtree_tester.get_muts_mtree()

    # Type level should catch all samples
    for (gene, form), mut in muts.groupby(['Gene', 'Form']):
        mtype = MuType({('Gene', gene): {('Form', form): None}})
        assert mtype.get_samples(mtree) == set(mut['Sample'])

    # Type level categories should be mutually exclusive
    test_key = mtree.allkey(levels=('Type', 'Protein'))
    assert (set(val for _, val in test_key.keys())
            <= {'CNV', 'Point', 'Frame', 'Other'})

    for plist1, plist2 in combn(test_key.values(), 2):
        assert not (set(val for _, val in plist1.keys())
                    & set(val for _, val in plist2.keys()))


def test_subkeys(self, mtypes):
    """Can we get the subtype keys stored in a MuType?"""
    for mtype in mtypes:
        key_mtypes = [MuType(k) for k in mtype.subkeys()]
        assert len(set(key_mtypes)) == len(key_mtypes)
        assert reduce(or_, key_mtypes) == mtype

        if len(key_mtypes) > 1:
            assert reduce(and_, key_mtypes).is_empty()

    for mtype1, mtype2 in combn(mtypes, 2):
        if mtype1 == mtype2:
            keys1 = mtype1.subkeys()
            keys2 = mtype2.subkeys()

            assert len(keys1) == len(keys2)
            assert (sorted(MuType(k) for k in keys1)
                    == sorted(MuType(k) for k in keys2))


def test_comparison(self, mtypes):
    """Are rich comparison operators correctly implemented for MuTypes?"""
    for mtype in mtypes:
        assert mtype == mtype
        assert mtype <= mtype
        assert mtype >= mtype
        assert not mtype < mtype
        assert not mtype > mtype

    for mtype1, mtype2 in combn(mtypes, 2):
        assert (mtype1 <= mtype2) != (mtype1 > mtype2)

        if mtype1 < mtype2:
            assert mtype1 <= mtype2
            assert mtype1 != mtype2
        elif mtype1 > mtype2:
            assert mtype1 >= mtype2
            assert mtype1 != mtype2


def test_leaves(self, mtypes):
    """Can we get the leaf types stored in a MuType?"""
    for mtype in mtypes:
        leaf_mtypes = [MuType(k) for k in mtype.leaves()]
        assert all(len(leaf_mtype.leaves()) == 1
                   for leaf_mtype in leaf_mtypes)

        assert len(set(leaf_mtypes)) == len(leaf_mtypes)
        assert reduce(or_, leaf_mtypes, MuType({})) == mtype

        if len(leaf_mtypes) == 0:
            assert mtype.is_empty()
        elif len(leaf_mtypes) == 1:
            assert leaf_mtypes[0] == mtype
        else:
            assert reduce(and_, leaf_mtypes).is_empty()

    for mtype1, mtype2 in combn(mtypes, 2):
        assert ((sorted(MuType(k) for k in mtype1.leaves())
                 == sorted(MuType(k) for k in mtype2.leaves()))
                == (mtype1 == mtype2))


def plot_umap_clustering(trans_expr, type_data, cdata, args):
    fig, axarr = plt.subplots(figsize=(14, 13), nrows=4, ncols=4)

    trans_expr = trans_expr[:, :4]
    type_stat = np.array([
        type_data.SUBTYPE[type_data.index.get_loc(samp)]
        if samp in type_data.index else 'Not Available'
        for samp in cdata.train_data()[0].index
        ])

    type_clrs = sns.color_palette('bright', n_colors=len(set(type_stat)))

    lgnd_lbls = []
    lgnd_marks = []

    for sub_type, type_clr in zip(sorted(set(type_stat)), type_clrs):
        type_indx = type_stat == sub_type

        for i, j in combn(range(4), 2):
            axarr[i, j].plot(trans_expr[type_indx, i],
                             trans_expr[type_indx, j],
                             marker='o', linewidth=0, markersize=5,
                             alpha=0.23, mfc=type_clr, mec='none')

        lgnd_lbls += ["{} ({})".format(sub_type, np.sum(type_indx))]
        lgnd_marks += [Line2D([], [], marker='o', linestyle='None',
                              markersize=19, alpha=0.43,
                              markerfacecolor=type_clr,
                              markeredgecolor='none')]

    for i in range(4):
        axarr[i, i].axis('off')
        axarr[i, i].text(0.5, 0.5, "UMAP Component {}".format(i + 1),
                         size=17, weight='semibold',
                         ha='center', va='center')

    for i, j in combn(range(4), 2):
        axarr[i, j].set_xticklabels([])
        axarr[i, j].set_yticklabels([])
        axarr[j, i].set_xticklabels([])
        axarr[j, i].set_yticklabels([])

    plt.legend(lgnd_marks, lgnd_lbls,
               bbox_to_anchor=(0.5, 1 / 29), bbox_transform=fig.transFigure,
               frameon=False, fontsize=21, ncol=3, loc=9, handletextpad=0.3)

    fig.savefig(os.path.join(plot_dir, args.expr_source,
                             "{}__UMAP-clustering.svg".format(args.cohort)),
                bbox_inches='tight', format='svg')
    plt.close()


def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")
    parser.add_argument('samp_cutoff', type=int,
                        help="minimum number of mutated samples needed "
                             "to test a gene")
    parser.add_argument('--setup_dir', type=str, default=base_dir)

    # parse command line arguments, identify directory where intermediate
    # files are to be stored, load cohort expression and mutation data
    args = parser.parse_args()
    out_path = os.path.join(args.setup_dir, 'setup')
    cdata = get_cohort_data(args.cohort)

    # save cohort data to file for use by future tasks
    with open(os.path.join(out_path, "cohort-data.p"), 'wb') as cdata_fl:
        pickle.dump(cdata, cdata_fl)

    # find subsets of point mutations with enough affected samples for each
    # mutated gene in the cohort
    vars_list = reduce(
        or_,
        [{MuType({('Gene', gene): mtype})
          for mtype in muts['Point'].branchtypes(min_size=args.samp_cutoff)}
         for gene, muts in cdata.mtree
         if ('Scale', 'Point') in muts.allkey()],
        set()
        )

    # add copy number deletions for each gene if enough samples are affected
    vars_list |= {MuType({('Gene', gene): {('Copy', 'DeepDel'): None}})
                  for gene, muts in cdata.mtree
                  if (('Scale', 'Copy') in muts.allkey()
                      and ('Copy', 'DeepDel') in muts['Copy'].allkey()
                      and len(muts['Copy']['DeepDel']) >= args.samp_cutoff)}

    # add copy number amplifications for each gene
    vars_list |= {MuType({('Gene', gene): {('Copy', 'DeepGain'): None}})
                  for gene, muts in cdata.mtree
                  if (('Scale', 'Copy') in muts.allkey()
                      and ('Copy', 'DeepGain') in muts['Copy'].allkey()
                      and len(muts['Copy']['DeepGain']) >= args.samp_cutoff)}

    # add all point mutations as a single mutation type for each gene if it
    # contains more than one type of point mutation
    vars_list |= {MuType({('Gene', gene): {('Scale', 'Point'): None}})
                  for gene, muts in cdata.mtree
                  if (('Scale', 'Point') in muts.allkey()
                      and len(muts['Point'].allkey()) > 1
                      and len(muts['Point']) >= args.samp_cutoff)}

    # filter out mutations that do not have enough wild-type samples
    vars_list = {mtype for mtype in vars_list
                 if (len(mtype.get_samples(cdata.mtree))
                     <= (len(cdata.get_samples()) - args.samp_cutoff))}

    # remove mutations that are functionally equivalent to another mutation
    vars_list -= {mtype1 for mtype1, mtype2 in product(vars_list, repeat=2)
                  if (mtype1 != mtype2 and mtype1.is_supertype(mtype2)
                      and (mtype1.get_samples(cdata.mtree)
                           == mtype2.get_samples(cdata.mtree)))}

    # find the pairs of remaining mutations that do not have overlapping
    # definitions and have enough samples with exactly one mutation in
    # the pair
    samp_dict = {mtype: mtype.get_samples(cdata.mtree)
                 for mtype in vars_list}
    pairs_list = {
        tuple(sorted([mtype1, mtype2]))
        for (mtype1, samps1), (mtype2, samps2) in combn(samp_dict.items(), 2)
        if (len(samps1 - samps2) >= args.samp_cutoff
            and len(samps2 - samps1) >= args.samp_cutoff
            and (mtype1 & mtype2).is_empty())
        }

    # save the enumerated pairs to file along with a count of the pairs
    with open(os.path.join(out_path, "pairs-list.p"), 'wb') as f:
        pickle.dump(sorted(pairs_list), f)
    with open(os.path.join(out_path, "pairs-count.txt"), 'w') as fl:
        fl.write(str(len(pairs_list)))


def main():
    parser = argparse.ArgumentParser(
        "Set up the gene subtype expression effect cross-isolation "
        "experiment by enumerating the pairs of subtypes to be tested."
        )

    # create positional command line arguments
    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")
    parser.add_argument('gene', type=str, help="which gene to consider")
    parser.add_argument('mut_levels', type=str,
                        help="the mutation property levels to consider")

    # create optional command line arguments
    parser.add_argument('--samp_cutoff', type=int, default=25,
                        help="subtype sample frequency threshold")
    parser.add_argument('--verbose', '-v', action='store_true',
                        help="turns on diagnostic messages")

    # parse command line arguments, create directory where found subtypes
    # will be stored
    args = parser.parse_args()
    use_lvls = args.mut_levels.split('__')
    out_path = os.path.join(base_dir, 'setup', args.cohort, args.gene)
    os.makedirs(out_path, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(cohort=args.cohort, mut_genes=[args.gene],
                           mut_levels=use_lvls, expr_source='Firehose',
                           var_source='mc3', expr_dir=firehose_dir,
                           cv_prop=1.0, syn=syn)

    if args.verbose:
        print("Looking for combinations of subtypes of mutations in gene "
              "{} present in at least {} of the samples in TCGA cohort "
              "{} at annotation levels {}.\n".format(
                  args.gene, args.samp_cutoff, args.cohort, use_lvls))

    cross_mtypes = cdata.train_mut.find_unique_subtypes(
        max_types=100, max_combs=10, verbose=2,
        sub_levels=use_lvls, min_type_size=args.samp_cutoff)

    mtype_samps = {mtype: mtype.get_samples(cdata.train_mut)
                   for mtype in cross_mtypes}
    cross_mtypes = {mtype for mtype in cross_mtypes
                    if (len(mtype_samps[mtype])
                        <= (len(cdata.samples) - args.samp_cutoff))}

    if args.verbose:
        print("\nFound {} total sub-types to cross!".format(
            len(cross_mtypes)))

    use_pairs = {
        (mtype1, mtype2) for mtype1, mtype2 in combn(cross_mtypes, 2)
        if (len(mtype_samps[mtype1] - mtype_samps[mtype2])
            >= args.samp_cutoff
            and len(mtype_samps[mtype2] - mtype_samps[mtype1])
            >= args.samp_cutoff
            and (len(mtype_samps[mtype1] | mtype_samps[mtype2])
                 <= (len(cdata.samples) - args.samp_cutoff))
            and (mtype1 & mtype2).is_empty())
        }

    if args.verbose:
        print("\nFound {} non-overlapping sub-type pairs!".format(
            len(use_pairs)))

    # save the list of found non-duplicate sub-types to file
    pickle.dump(
        sorted(use_pairs),
        open(os.path.join(out_path,
                          'pairs_list__samps_{}__levels_{}.p'.format(
                              args.samp_cutoff, args.mut_levels)),
             'wb'))

    with open(os.path.join(out_path,
                           'pairs_count__samps_{}__levels_{}.txt'.format(
                               args.samp_cutoff, args.mut_levels)),
              'w') as fl:
        fl.write(str(len(use_pairs)))


def main():
    parser = argparse.ArgumentParser(
        "Set up the paired-gene subtype expression effect isolation "
        "experiment by enumerating the subtypes to be tested."
        )

    # create positional command line arguments
    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")
    parser.add_argument('mut_levels', type=str,
                        help="the mutation property levels to consider")
    parser.add_argument('genes', type=str, nargs='+',
                        help="a list of mutated genes")

    # create optional command line arguments
    parser.add_argument('--samp_cutoff', type=int, default=20,
                        help="subtype sample frequency threshold")
    parser.add_argument('--verbose', '-v', action='store_true',
                        help="turns on diagnostic messages")

    # parse command line arguments, create directory where found subtypes
    # will be stored
    args = parser.parse_args()
    use_lvls = args.mut_levels.split('__')
    out_path = os.path.join(base_dir, 'setup',
                            args.cohort, '_'.join(args.genes))
    os.makedirs(out_path, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = syn_root
    syn.login()

    cdata = MutationCohort(cohort=args.cohort, mut_genes=args.genes,
                           mut_levels=['Gene'] + use_lvls,
                           expr_source='Firehose', var_source='mc3',
                           copy_source='Firehose', annot_file=annot_file,
                           expr_dir=expr_dir, domain_dir=domain_dir,
                           cv_prop=1.0, syn=syn)

    iso_mtypes = set()
    for gene in args.genes:
        other_samps = reduce(
            or_, [cdata.train_mut[other_gn].get_samples()
                  for other_gn in set(args.genes) - {gene}])

        if args.verbose:
            print("Looking for combinations of subtypes of mutations in "
                  "gene {} present in at least {} of the samples in TCGA "
                  "cohort {} at annotation levels {}.\n".format(
                      gene, args.samp_cutoff, args.cohort, use_lvls))

        pnt_mtypes = cdata.train_mut[gene]['Point'].find_unique_subtypes(
            max_types=500, max_combs=2, verbose=2,
            sub_levels=use_lvls, min_type_size=args.samp_cutoff)

        # filter out the subtypes that appear in too many samples for there
        # to be a wild-type class of sufficient size for classification
        pnt_mtypes = {
            MuType({('Scale', 'Point'): mtype}) for mtype in pnt_mtypes
            if (len(mtype.get_samples(cdata.train_mut[gene]['Point']))
                <= (len(cdata.samples) - args.samp_cutoff))
            }
        pnt_mtypes |= {MuType({('Scale', 'Point'): None})}

        cna_mtypes = cdata.train_mut[gene]['Copy'].branchtypes(
            min_size=args.samp_cutoff)
        cna_mtypes |= {MuType({('Copy', ('HetGain', 'HomGain')): None})}
        cna_mtypes |= {MuType({('Copy', ('HetDel', 'HomDel')): None})}

        cna_mtypes = {
            MuType({('Scale', 'Copy'): mtype}) for mtype in cna_mtypes
            if (len(mtype.get_samples(cdata.train_mut[gene]['Copy']))
                <= (len(cdata.samples) - args.samp_cutoff))
            }

        all_mtype = MuType(cdata.train_mut[gene].allkey())
        use_mtypes = pnt_mtypes | cna_mtypes

        only_mtypes = {
            (MuType({('Gene', gene): mtype}), ) for mtype in use_mtypes
            if (len(mtype.get_samples(cdata.train_mut[gene])
                    - (all_mtype - mtype).get_samples(
                        cdata.train_mut[gene])
                    - other_samps)
                >= args.samp_cutoff)
            }

        comb_mtypes = {
            (MuType({('Gene', gene): mtype1}),
             MuType({('Gene', gene): mtype2}))
            for mtype1, mtype2 in combn(use_mtypes, 2)
            if ((mtype1 & mtype2).is_empty()
                and (len((mtype1.get_samples(cdata.train_mut[gene])
                          & mtype2.get_samples(cdata.train_mut[gene]))
                         - (mtype1.get_samples(cdata.train_mut[gene])
                            ^ mtype2.get_samples(cdata.train_mut[gene]))
                         - (all_mtype - mtype1 - mtype2).get_samples(
                             cdata.train_mut[gene])
                         - other_samps)
                     >= args.samp_cutoff))
            }

        iso_mtypes |= only_mtypes | comb_mtypes
        if args.verbose:
            print("\nFound {} exclusive sub-types and {} combination "
                  "sub-types to isolate!".format(
                      len(only_mtypes), len(comb_mtypes)))

    for cur_genes in chain.from_iterable(
            combn(args.genes, r) for r in range(1, len(args.genes))):
        gene_mtype = MuType({('Gene', cur_genes): None})
        rest_mtype = MuType({
            ('Gene', tuple(set(args.genes) - set(cur_genes))): None})

        if (args.samp_cutoff
                <= len(gene_mtype.get_samples(cdata.train_mut)
                       - rest_mtype.get_samples(cdata.train_mut))
                <= (len(cdata.samples) - args.samp_cutoff)):
            iso_mtypes |= {(gene_mtype, )}

    if args.verbose:
        print("\nFound {} total sub-types to isolate!".format(
            len(iso_mtypes)))

    # save the list of found non-duplicate sub-types to file
    pickle.dump(
        sorted(iso_mtypes),
        open(os.path.join(out_path,
                          'mtypes_list__samps_{}__levels_{}.p'.format(
                              args.samp_cutoff, args.mut_levels)),
             'wb'))

    with open(os.path.join(out_path,
                           'mtypes_count__samps_{}__levels_{}.txt'.format(
                               args.samp_cutoff, args.mut_levels)),
              'w') as fl:
        fl.write(str(len(iso_mtypes)))


def plot_tuning_mtype_grid(par_df, auc_df, use_clf, args, cdata):
    # hoisted out of the plotting loop below: re-importing and switching
    # matplotlib backends per-point has no effect after figures exist
    from circlify import circlify
    mpl.use('Agg')

    par_count = len(use_clf.tune_priors)
    fig, axarr = plt.subplots(figsize=(0.5 + 7 * par_count, 7 * par_count),
                              nrows=par_count, ncols=par_count)

    auc_vals = auc_df.quantile(q=0.25, axis=1)
    auc_clrs = auc_vals.apply(auc_cmap)
    size_vec = [461 * sum(cdata.train_pheno(mtype))
                / (len(cdata.get_samples()) * par_count)
                for mtype in auc_vals.index]

    for i, (par_name, tune_distr) in enumerate(use_clf.tune_priors):
        axarr[i, i].grid(False)

        if detect_log_distr(tune_distr):
            use_distr = [np.log10(par_val) for par_val in tune_distr]
            par_lbl = par_name + '\n(log-scale)'
        else:
            use_distr = tune_distr
            par_lbl = par_name

        distr_diff = np.mean(np.array(use_distr[1:])
                             - np.array(use_distr[:-1]))
        plt_min = use_distr[0] - distr_diff / 2
        plt_max = use_distr[-1] + distr_diff / 2

        axarr[i, i].set_xlim(plt_min, plt_max)
        axarr[i, i].set_ylim(plt_min, plt_max)
        axarr[i, i].text((plt_min + plt_max) / 2, (plt_min + plt_max) / 2,
                         par_lbl, ha='center', fontsize=28,
                         weight='semibold')

        for par_val in use_distr:
            axarr[i, i].axhline(y=par_val, color='#116611',
                                ls='--', linewidth=4.1, alpha=0.27)
            axarr[i, i].axvline(x=par_val, color='#116611',
                                ls='--', linewidth=4.1, alpha=0.27)

    for (i, (par_name1, tn_distr1)), (j, (par_name2, tn_distr2)) in combn(
            enumerate(use_clf.tune_priors), 2):

        if detect_log_distr(tn_distr1):
            use_distr1 = [np.log10(par_val) for par_val in tn_distr1]
            par_meds1 = np.log10(par_df[par_name1]).median(axis=1)
            par_means1 = np.log10(par_df[par_name1]).mean(axis=1)

            distr_diff = np.mean(np.log10(np.array(tn_distr1[1:]))
                                 - np.log10(np.array(tn_distr1[:-1])))
            plt_ymin = np.log10(tn_distr1[0]) - distr_diff / 2
            plt_ymax = np.log10(tn_distr1[-1]) + distr_diff / 2

        else:
            use_distr1 = tn_distr1
            par_meds1 = par_df[par_name1].median(axis=1)
            par_means1 = par_df[par_name1].mean(axis=1)

            distr_diff = np.mean(np.array(tn_distr1[1:])
                                 - np.array(tn_distr1[:-1]))
            plt_ymin = tn_distr1[0] - distr_diff / 2
            plt_ymax = tn_distr1[-1] + distr_diff / 2

        if detect_log_distr(tn_distr2):
            use_distr2 = [np.log10(par_val) for par_val in tn_distr2]
            par_meds2 = np.log10(par_df[par_name2]).median(axis=1)
            par_means2 = np.log10(par_df[par_name2]).mean(axis=1)

            distr_diff = np.mean(np.log10(np.array(tn_distr2[1:]))
                                 - np.log10(np.array(tn_distr2[:-1])))
            plt_xmin = np.log10(tn_distr2[0]) - distr_diff / 2
            plt_xmax = np.log10(tn_distr2[-1]) + distr_diff / 2

        else:
            use_distr2 = tn_distr2
            par_meds2 = par_df[par_name2].median(axis=1)
            par_means2 = par_df[par_name2].mean(axis=1)

            distr_diff = np.mean(np.array(tn_distr2[1:])
                                 - np.array(tn_distr2[:-1]))
            plt_xmin = tn_distr2[0] - distr_diff / 2
            plt_xmax = tn_distr2[-1] + distr_diff / 2

        par_meds1 = par_meds1[auc_clrs.index]
        par_meds2 = par_meds2[auc_clrs.index]
        y_adj = (plt_ymax - plt_ymin) / len(tn_distr1)
        x_adj = (plt_xmax - plt_xmin) / len(tn_distr2)
        plt_adj = (plt_xmax - plt_xmin) / (plt_ymax - plt_ymin)

        for med1, med2 in set(zip(par_meds1, par_meds2)):
            use_indx = (par_meds1 == med1) & (par_meds2 == med2)
            cnt_adj = use_indx.sum() ** 0.49

            use_sizes = [s for s, ix in zip(size_vec, use_indx) if ix]
            sort_indx = sorted(enumerate(use_sizes),
                               key=lambda x: x[1], reverse=True)

            for k, circ in enumerate(circlify([s for _, s in sort_indx])):
                axarr[i, j].scatter(
                    med2 + (1 / 23) * cnt_adj * circ.y * plt_adj,
                    med1 + (1 / 23) * cnt_adj * circ.x * plt_adj ** -1,
                    s=sort_indx[k][1],
                    c=auc_clrs[use_indx][sort_indx[k][0]],
                    alpha=0.36, edgecolor='black'
                    )

        par_means1 += np.random.normal(0, y_adj / 27, auc_df.shape[0])
        par_means2 += np.random.normal(0, x_adj / 27, auc_df.shape[0])

        axarr[j, i].scatter(par_means1[auc_clrs.index],
                            par_means2[auc_clrs.index],
                            s=size_vec, c=auc_clrs,
                            alpha=0.36, edgecolor='black')

        axarr[i, j].set_xlim(plt_xmin, plt_xmax)
        axarr[i, j].set_ylim(plt_ymin, plt_ymax)
        axarr[j, i].set_ylim(plt_xmin, plt_xmax)
        axarr[j, i].set_xlim(plt_ymin, plt_ymax)

        annot_placed = place_annot(par_meds2, par_meds1,
                                   size_vec=size_vec,
                                   annot_vec=auc_vals.index,
                                   x_range=plt_xmax - plt_xmin,
                                   y_range=plt_ymax - plt_ymin)
        for annot_x, annot_y, annot, halign in annot_placed:
            axarr[i, j].text(annot_x, annot_y, annot, size=11, ha=halign)

        for par_val1 in use_distr1:
            axarr[i, j].axhline(y=par_val1, color='#116611',
                                ls=':', linewidth=2.3, alpha=0.19)
            axarr[j, i].axvline(x=par_val1, color='#116611',
                                ls=':', linewidth=2.3, alpha=0.19)

        for par_val2 in use_distr2:
            axarr[i, j].axvline(x=par_val2, color='#116611',
                                ls=':', linewidth=2.3, alpha=0.19)
            axarr[j, i].axhline(y=par_val2, color='#116611',
                                ls=':', linewidth=2.3, alpha=0.19)

    plt.tight_layout()
    fig.savefig(os.path.join(
        plot_dir, args.expr_source,
        "{}__samps-{}".format(args.cohort, args.samp_cutoff),
        args.model_name.split('__')[0],
        "{}__tuning-mtype-grid.svg".format(args.model_name.split('__')[1])),
        bbox_inches='tight', format='svg')
    plt.close()


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('use_dir', type=str)
    parser.add_argument('--task_ids', type=int, nargs='+')
    args = parser.parse_args()

    # load the -omic datasets for this experiment's tumour cohort
    with bz2.BZ2File(os.path.join(args.use_dir, 'setup',
                                  "cohort-data.p.gz"), 'r') as f:
        cdata = pickle.load(f)
    with open(os.path.join(args.use_dir, 'setup', "muts-list.p"),
              'rb') as f:
        muts_list = pickle.load(f)

    # get list of output files from all parallelized jobs
    file_list = tuple(Path(args.use_dir, 'output').glob(
        "out__cv-*_task*.p"))
    file_dict = dict()

    # filter output files according to whether they came from one of the
    # parallelized tasks assigned to this gather task
    for out_fl in file_list:
        fl_info = out_fl.stem.split("out__")[1]
        out_task = int(fl_info.split("task-")[1])

        # gets the parallelized task id and learning cross-validation fold
        # each output file corresponds to
        if args.task_ids is None or out_task in args.task_ids:
            out_cv = int(fl_info.split("cv-")[1].split("_")[0])
            file_dict[out_fl] = out_task, out_cv

    # find the number of parallelized tasks used in this run of the pipeline
    assert (len(file_dict) % 40) == 0, "Missing output files detected!"
    task_count = get_task_count(args.use_dir)

    if args.task_ids is None:
        use_tasks = set(range(task_count))
        out_tag = ''
    else:
        use_tasks = set(args.task_ids)
        out_tag = "_{}".format('-'.join(
            [str(tsk) for tsk in sorted(use_tasks)]))

    # organize output files according to their cross-validation fold for
    # easier collation of output data across parallelized task ids
    file_sets = {
        cv_id: {out_fl for out_fl, (out_task, out_cv) in file_dict.items()
                if out_task in use_tasks and out_cv == cv_id}
        for cv_id in range(40)
        }

    # initialize object that will store raw experiment output data
    out_dfs = {k: {cis_lbl: [None for cv_id in range(40)]
                   for cis_lbl in cis_lbls}
               for k in ['Pred', 'Pars', 'Time', 'Acc']}
    out_clf = None
    out_tune = None

    random.seed(10301)
    random.shuffle(muts_list)
    use_muts = [mut for i, mut in enumerate(muts_list)
                if i % task_count in use_tasks]

    for cv_id, out_fls in file_sets.items():
        out_list = []

        for out_fl in out_fls:
            with open(out_fl, 'rb') as f:
                out_list += [pickle.load(f)]

        for out_dicts in out_list:
            if out_clf is None:
                out_clf = out_dicts['Clf']
            else:
                assert out_clf == out_dicts['Clf'], (
                    "Each experiment must be run with the same classifier!")

            if out_tune is None:
                out_tune = out_dicts['Clf'].tune_priors
            else:
                assert out_tune == out_dicts['Clf'].tune_priors, (
                    "Each experiment must be run with exactly "
                    "one set of tuning priors!")

        for k in out_dfs:
            for cis_lbl in cis_lbls:
                out_dfs[k][cis_lbl][cv_id] = pd.concat([
                    pd.DataFrame.from_dict(
                        {mtype: out_dict[cis_lbl]
                         for mtype, out_dict in out_dicts[k].items()},
                        orient='index')
                    for out_dicts in out_list
                    ])

                assert (sorted(out_dfs[k][cis_lbl][cv_id].index)
                        == sorted(use_muts)), (
                    "Mutations with predictions for c-v fold <{}> don't "
                    "match those enumerated during setup!".format(cv_id)
                    )

        # recover the cohort training/testing data split that was
        # used to generate the results in this file
        cdata_samps = sorted(cdata.get_samples())
        random.seed((cv_id // 4) * 7712 + 13)
        random.shuffle(cdata_samps)

        cdata.update_split(9073 + 97 * cv_id,
                           test_samps=cdata_samps[(cv_id % 4)::4])
        test_samps = cdata.get_test_samples()

        for cis_lbl in cis_lbls:
            out_dfs['Pred'][cis_lbl][cv_id].columns = test_samps

    pred_dfs = {cis_lbl: pd.concat(pred_mats, axis=1)
                for cis_lbl, pred_mats in out_dfs['Pred'].items()}

    for cis_lbl, pred_df in pred_dfs.items():
        assert all(smp in pred_df.columns
                   for smp in cdata.get_samples()), (
            "Missing mutation scores for some samples in the cohort!")
        assert (pred_df.columns.value_counts() == 10).all(), (
            "Inconsistent number of CV scores across cohort samples!")

    pars_dfs = {cis_lbl: pd.concat(pars_mats, axis=1)
                for cis_lbl, pars_mats in out_dfs['Pars'].items()}
    for cis_lbl, pars_df in pars_dfs.items():
        assert pars_df.shape[1] == (40 * len(out_clf.tune_priors)), (
            "Tuned parameter values missing for some CVs!")

    time_dfs = {cis_lbl: pd.concat(time_mats, axis=1)
                for cis_lbl, time_mats in out_dfs['Time'].items()}

    for cis_lbl, time_df in time_dfs.items():
        assert time_df.shape[1] == 80, (
            "Model fitting times missing for some CVs!")
        assert (time_df.applymap(len) == out_clf.test_count).values.all(), (
            "Model fitting times missing for some hyper-parameter values!")

    acc_dfs = {cis_lbl: pd.concat(acc_mats, axis=1)
               for cis_lbl, acc_mats in out_dfs['Acc'].items()}

    for cis_lbl, acc_df in acc_dfs.items():
        assert acc_df.shape[1] == 120, (
            "Algorithm tuning accuracies missing for some CVs!")
        assert (acc_df.applymap(len) == out_clf.test_count).values.all(), (
            "Algorithm tuning stats missing for some "
            "hyper-parameter values!")

    for cis_lbl, pred_df in pred_dfs.items():
        assert compare_muts(pred_df.index, use_muts), (
            "Mutations for which predictions were made do not match the "
            "list of mutations enumerated during setup!")

    for cis_lbl1, cis_lbl2 in combn(cis_lbls, 2):
        assert compare_muts(
            pred_dfs[cis_lbl1].index, pred_dfs[cis_lbl2].index,
            time_dfs[cis_lbl1].index, time_dfs[cis_lbl2].index), (
            "Mutations tested using cis-exclusion strategy {} do "
            "not match those tested using strategy {}!".format(
                cis_lbl1, cis_lbl2)
            )

    for cis_lbl1, cis_lbl2 in product(cis_lbls, repeat=2):
        assert compare_muts(
            pred_dfs[cis_lbl1].index, pars_dfs[cis_lbl2].index,
            time_dfs[cis_lbl1].index, acc_dfs[cis_lbl2].index), (
            "Mutations with predicted scores do not match those for "
            "which tuned hyper-parameter values are available!")

    pred_dfs = {
        cis_lbl: pd.DataFrame({
            mtype: pred_df.loc[mtype].groupby(
                level=0).apply(lambda x: x.values)
            for mtype in use_muts
            }).transpose()
        for cis_lbl, pred_df in pred_dfs.items()
        }

    for cis_lbl, pred_df in pred_dfs.items():
        assert (pred_df.applymap(len) == 10).values.all(), (
            "Incorrect number of testing CV scores for cis-exclusion "
            "label `{}`!".format(cis_lbl))

    with bz2.BZ2File(os.path.join(args.use_dir, 'merge',
                                  "out-pred{}.p.gz".format(out_tag)),
                     'w') as fl:
        pickle.dump(pred_dfs, fl, protocol=-1)

    with bz2.BZ2File(os.path.join(args.use_dir, 'merge',
                                  "out-tune{}.p.gz".format(out_tag)),
                     'w') as fl:
        pickle.dump([pars_dfs, time_dfs, acc_dfs, out_clf],
                    fl, protocol=-1)

    cdata.update_split(test_prop=0)
    train_samps = np.array(cdata.get_train_samples())
    pheno_dict = {mtype: np.array(cdata.train_pheno(mtype))
                  for mtype in use_muts}

    with bz2.BZ2File(os.path.join(args.use_dir, 'merge',
                                  "out-pheno{}.p.gz".format(out_tag)),
                     'w') as fl:
        pickle.dump(pheno_dict, fl, protocol=-1)

    # calculate AUCs using all of the predicted scores for each sample...
    auc_vals = {
        cis_lbl: {
            'all': pd.Series(dict(zip(
                use_muts,
                Parallel(n_jobs=12, prefer='threads', pre_dispatch=120)(
                    delayed(calc_auc)(
                        np.vstack(pred_df.loc[mtype][train_samps].values),
                        pheno_dict[mtype])
                    for mtype in use_muts
                    )
                ))),

            # ...and for each cross-validation run considered separately...
            'CV': pd.DataFrame.from_records(tuple(zip(
                cycle(use_muts),
                Parallel(n_jobs=12, prefer='threads', pre_dispatch=120)(
                    delayed(calc_auc)(
                        np.vstack(
                            pred_df.loc[mtype][train_samps].values
                            )[:, cv_id],
                        pheno_dict[mtype])
                    for cv_id in range(10) for mtype in use_muts
                    )
                ))).pivot_table(index=0, values=1,
                                aggfunc=list).iloc[:, 0],

            # ...and finally using the average of predicted scores for each
            # sample across CV runs
            'mean': pd.Series(dict(zip(
                use_muts,
                Parallel(n_jobs=12, prefer='threads', pre_dispatch=120)(
                    delayed(calc_auc)(
                        np.vstack(
                            pred_df.loc[mtype][train_samps].values
                            ).mean(axis=1),
                        pheno_dict[mtype])
                    for mtype in use_muts
                    )
                )))
            }
        for cis_lbl, pred_df in pred_dfs.items()
        }

    with bz2.BZ2File(os.path.join(args.use_dir, 'merge',
                                  "out-aucs{}.p.gz".format(out_tag)),
                     'w') as fl:
        pickle.dump(auc_vals, fl, protocol=-1)

    random.seed(7609)
    sub_inds = [random.choices([False, True], k=len(cdata.get_samples()))
                for _ in range(1000)]

    conf_dict = {
        cis_lbl: pd.DataFrame.from_records(tuple(zip(
            cycle(use_muts),
            Parallel(n_jobs=12, prefer='threads', pre_dispatch=120)(
                delayed(calc_auc)(
                    np.vstack(pred_df.loc[mtype][
                        train_samps[sub_indx]].values).mean(axis=1),
                    pheno_dict[mtype][sub_indx])
                for sub_indx in sub_inds for mtype in use_muts
                )
            ))).pivot_table(index=0, values=1, aggfunc=list).iloc[:, 0]
        for cis_lbl, pred_df in pred_dfs.items()
        }

    with bz2.BZ2File(os.path.join(args.use_dir, 'merge',
                                  "out-conf{}.p.gz".format(out_tag)),
                     'w') as fl:
        pickle.dump(conf_dict, fl, protocol=-1)


def plot_comb_distribution(out_data, args, cdata, use_mtypes,
                           use_which=None):
    if isinstance(use_which, tuple):
        if use_which[0] == 'freq':
            use_mtypes = sorted(
                use_mtypes,
                key=lambda x: len(x.get_samples(cdata.train_mut)),
                reverse=True
                )[:use_which[1]]

        which_lbl = '_' + '-'.join([str(x) for x in use_which])

    elif isinstance(use_which, set):
        use_mtypes = list(use_which)
        which_lbl = '_' + '-'.join([str(x) for x in use_which])

    else:
        # wrapped in list() so that use_mtypes.index() below works on a
        # plain pandas Index as well
        use_mtypes = list(out_data.index)
        which_lbl = ''

    fig, axarr = plt.subplots(nrows=len(use_mtypes), ncols=len(use_mtypes))
    fig.tight_layout(w_pad=-0.5, h_pad=-0.87)

    train_clrs = ['#672885', '#323E8A', '#C03344', '0.5']
    plot_fl = 'comb-infer_{}-{}{}.png'.format(args.cohort, args.classif,
                                              which_lbl)

    for mtype in use_mtypes:
        pi = use_mtypes.index(mtype)
        axarr[pi, pi].axis('off')

        axarr[pi, pi].add_patch(mpl.patches.Polygon(
            np.array([[-0.06, -0.06], [1.06, 1.06], [-0.06, 1.06]]),
            fill=True, facecolor='#C03344', alpha=0.17, clip_on=False))
        axarr[pi, pi].add_patch(mpl.patches.Polygon(
            np.array([[-0.06, -0.06], [1.06, 1.06], [1.06, -0.06]]),
            fill=True, facecolor='#323E8A', alpha=0.17, clip_on=False))

        axarr[pi, pi].text(x=0.5, y=0.55, s=mtype, size=33, weight='bold',
                           ha='center', va='center')
        axarr[pi, pi].text(
            x=0.5, y=0.37,
            s='{} mutated samples'.format(
                len(mtype.get_samples(cdata.train_mut))),
            size=17, ha='center', va='center'
            )

    for (i, mtype1), (j, mtype2) in combn(enumerate(use_mtypes), 2):
        plot_data = []
        pos_data = []

        if (mtype1, mtype2) in out_data.index:
            mtypes = (mtype1, mtype2)
            mtype_lbls = ['Mtype1', 'Mtype2']
        else:
            mtypes = (mtype2, mtype1)
            mtype_lbls = ['Mtype2', 'Mtype1']

        mtype1_pheno = np.array(cdata.train_pheno(mtype1))
        mtype2_pheno = np.array(cdata.train_pheno(mtype2))

        both_stat = mtype1_pheno & mtype2_pheno
        mtype1_stat = mtype1_pheno & ~mtype2_pheno
        mtype2_stat = ~mtype1_pheno & mtype2_pheno
        neith_stat = ~mtype1_pheno & ~mtype2_pheno

        for (w, train), (v, stat) in product(
                enumerate(['Both'] + mtype_lbls + ['Diff']),
                enumerate([both_stat, mtype1_stat,
                           mtype2_stat, neith_stat])):

            if not isinstance(out_data.loc[mtypes, train], float):
                pos_data += [w * 2.5 + v / 2]

                if np.sum(stat) < 3:
                    plot_data += [[]]
                else:
                    plot_data += [out_data.loc[mtypes, train][stat]]

        if plot_data:
            bplot = axarr[i, j].boxplot(
                x=plot_data, positions=pos_data, patch_artist=True,
                flierprops=dict(markersize=2),
                medianprops=dict(color='0.3', linewidth=3)
                )

            for patch, color in zip(bplot['boxes'], cycle(train_clrs)):
                patch.set_facecolor(color)
                patch.set_alpha(0.6)

        if (j - i) == 1:
            axarr[i, j].set_xticks([0.75, 3.25, 5.75, 8.25])
            axarr[i, j].set_xticklabels(
                ['M1 & M2', 'M1 - M2', 'M2 - M1', 'M1 ^ M2'],
                size=9, ha='center')
        else:
            axarr[i, j].xaxis.set_ticklabels([])

        if j == (len(use_mtypes) - 1):
            axarr[i, j].tick_params(axis='y', labelsize=15)
            axarr[i, j].yaxis.tick_right()
        else:
            axarr[i, j].yaxis.set_ticklabels([])

        axarr[i, j].set_xlim(-0.5, 9.5)
        axarr[i, j].set_ylim(-0.02, 1.02)
        axarr[i, j].grid(axis='x')
        axarr[i, j].grid(axis='y', which='major', linewidth=2, alpha=0.7)

        axarr[j, i].set_xlim(len(cdata.samples) * -0.01,
                             len(cdata.samples) * 1.01)
        axarr[j, i].grid(b=True, axis='x', which='major',
                         linewidth=2, alpha=0.7)

        if j == (len(use_mtypes) - 1):
            axarr[j, i].xaxis.set_major_locator(mpl.ticker.MaxNLocator(4))
            axarr[j, i].tick_params(axis='x', which='major', labelsize=13)
        else:
            axarr[j, i].xaxis.set_ticklabels([])

        axarr[j, i].grid(b=True, axis='x', which='minor',
                         linewidth=1, alpha=0.5)
        axarr[j, i].minorticks_on()
        axarr[j, i].grid(axis='y')
        axarr[j, i].yaxis.set_major_formatter(plt.NullFormatter())

        axarr[j, i].add_patch(mpl.patches.Rectangle(
            (np.sum(neith_stat) / 2, 0.51), np.sum(mtype1_stat), 0.1,
            fill=True, facecolor='#323E8A', alpha=0.6))
        axarr[j, i].add_patch(mpl.patches.Rectangle(
            (np.sum(mtype1_stat) + np.sum(neith_stat) / 2, 0.51),
            np.sum(both_stat), 0.1,
            fill=True, facecolor='#672885', alpha=0.6))
        axarr[j, i].add_patch(mpl.patches.Rectangle(
            (np.sum(mtype1_stat) + np.sum(neith_stat) / 2, 0.39),
            np.sum(both_stat), 0.1,
            fill=True, facecolor='#672885', alpha=0.6))
        axarr[j, i].add_patch(mpl.patches.Rectangle(
            (np.sum(mtype1_pheno) + np.sum(neith_stat) / 2, 0.39),
            np.sum(mtype2_stat), 0.1,
            fill=True, facecolor='#C03344', alpha=0.6))

        ovlp_test = fisher_exact(
            [[np.sum(both_stat), np.sum(mtype1_stat)],
             [np.sum(mtype2_stat), np.sum(neith_stat)]],
            alternative='two-sided')

        axarr[j, i].text(
            x=0, y=0.9,
            s='{: <8} log2 odds ratio\n$10^{{{: <8}}}$ pval'.format(
                str(round(log(ovlp_test[0], 2.0), 2)),
                str(round(log(ovlp_test[1], 10), 1))),
            size=13, ha='left', va='center'
            )

    fig_inch = len(use_mtypes) * 3.4
    fig.set_size_inches(fig_inch, fig_inch)

    plt.savefig(os.path.join(plot_dir, plot_fl),
                dpi=600, bbox_inches='tight')
    plt.close()


def best_optim(self):
    """Finds the best mutation partition using the tested sub-types."""

    # gets the list of the best sub-types we have found so far during
    # sub-type space traversal, initializes the partition optimizer,
    # creates the variables corresponding to which sub-types are chosen
    if self.best_mtypes:
        use_mtypes = list(self.best_mtypes)
    else:
        use_mtypes = list(self.mtype_scores.keys())

    memb = pulp.LpVariable.dicts('memb', use_mtypes,
                                 lowBound=0, upBound=1,
                                 cat=pulp.LpInteger)
    partit_mdl = pulp.LpProblem("Mutation Set Model", pulp.LpMaximize)

    # finds which of the sub-types each mutated sample harbours
    memb_mat = {
        mtype: self.cdata.train_mut.status(self.base_train_samps, mtype)
        for mtype in use_mtypes
        }

    # finds how well each mutated sample can be classified against only
    # non-mutated samples using a classifier for each of the sub-types
    perf_mat = {
        mtype: [memb * (self.mtype_scores[mtype]['Null'] - 0.5)
                for memb in memb_mat[mtype]]
        for mtype in use_mtypes
        }

    # adds the objective function of maximizing average AUC across samples
    partit_mdl += pulp.lpSum([
        sum([memb[mtype] * perf_mat[mtype][i] for mtype in use_mtypes])
        for i in range(len(self.base_train_samps))
        ])

    # adds the constraint that we have to pick a group of subsets of
    # mutations that are disjoint from one another
    for mtype1, mtype2 in combn(use_mtypes, 2):
        if not (mtype1 & mtype2).is_empty():
            partit_mdl += (
                pulp.lpSum([memb[mtype1], memb[mtype2]]) <= 1,
                "{} and {} intersect".format(repr(mtype1), repr(mtype2))
                )

    # solves for the optimal solution and parses the results
    partit_mdl.solve()
    optim_mtypes = {mtype for mtype in use_mtypes
                    if memb[mtype].varValue > 0}
    optim_auc = (pulp.value(partit_mdl.objective)
                 / len(self.base_train_samps)) + 0.5

    if self.verbose > 1:
        print("\n{}".format(pulp.LpStatus[partit_mdl.status]))
        print("\nResults of partition optimization given current "
              "best sub-types found during traversal, mean AUC: "
              "{:.4f}\t(mtype, AUC, #samples)".format(optim_auc))

        print('\n'.join(
            '\t{}:\n\t\t{:.4f}\t{}'.format(
                mtype, self.mtype_scores[mtype]['Null'],
                len(mtype.get_samples(self.cdata.train_mut)))
            for mtype in optim_mtypes
            ))

    return optim_mtypes


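# A self-contained toy sketch of the PuLP pattern used above (the item
# names, scores, and conflicts are made up, not part of this project):
# binary membership variables, a linear objective to maximize, and one
# pairwise exclusion constraint per conflicting pair.
#
#   import pulp
#
#   items = ['a', 'b', 'c']
#   score = {'a': 3.0, 'b': 2.0, 'c': 2.5}
#   conflicts = [('a', 'b')]    # 'a' and 'b' may not both be chosen
#
#   memb = pulp.LpVariable.dicts('memb', items, lowBound=0, upBound=1,
#                                cat=pulp.LpInteger)
#   mdl = pulp.LpProblem("Toy Partition", pulp.LpMaximize)
#   mdl += pulp.lpSum([memb[it] * score[it] for it in items])
#
#   for it1, it2 in conflicts:
#       mdl += pulp.lpSum([memb[it1], memb[it2]]) <= 1
#
#   mdl.solve()
#   chosen = {it for it in items if memb[it].varValue > 0}   # {'a', 'c'}

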
def plot_iso_comparisons(auc_dfs, pheno_dict, use_src, use_coh, args):
    fig, axarr = plt.subplots(figsize=(15, 15), nrows=3, ncols=3)

    base_aucs = {
        ex_lbl: auc_vals[[not isinstance(mtype, (Mcomb, ExMcomb))
                          for mtype in auc_vals.index]]
        for ex_lbl, auc_vals in auc_dfs.items()
        }

    base_mtypes = {tuple(sorted(auc_vals.index))
                   for auc_vals in base_aucs.values()}
    assert len(base_mtypes) == 1, ("Mismatching mutation types across "
                                   "isolation testing holdout modes!")
    base_mtypes = tuple(base_mtypes)[0]

    iso_aucs = {'All': base_aucs['All']}

    iso_aucs['Iso'] = auc_dfs['Iso'][[
        isinstance(mcomb, ExMcomb) and len(mcomb.mtypes) == 1
        and tuple(mcomb.mtypes)[0] in base_mtypes
        and not (mcomb.all_mtype & shal_mtype).is_empty()
        for mcomb in auc_dfs['Iso'].index
        ]]

    iso_aucs['IsoShal'] = auc_dfs['IsoShal'][[
        isinstance(mcomb, ExMcomb) and len(mcomb.mtypes) == 1
        and tuple(mcomb.mtypes)[0] in base_mtypes
        and (mcomb.all_mtype & shal_mtype).is_empty()
        for mcomb in auc_dfs['IsoShal'].index
        ]]

    assert not set(iso_aucs['Iso'].index & iso_aucs['IsoShal'].index)
    for ex_lbl in ('Iso', 'IsoShal'):
        iso_aucs[ex_lbl].index = [tuple(mcomb.mtypes)[0]
                                  for mcomb in iso_aucs[ex_lbl].index]

    plt_min = 0.83
    for (i, ex_lbl1), (j, ex_lbl2) in combn(enumerate(base_aucs.keys()), 2):
        for mtype, auc_val1 in base_aucs[ex_lbl1].iteritems():
            plt_min = min(plt_min, auc_val1 - 0.013,
                          base_aucs[ex_lbl2][mtype] - 0.013)

            mtype_sz = 503 * np.mean(pheno_dict[mtype])
            plt_clr = choose_subtype_colour(
                tuple(mtype.subtype_iter())[0][1])

            axarr[i, j].scatter(base_aucs[ex_lbl2][mtype], auc_val1,
                                c=[plt_clr], s=mtype_sz,
                                alpha=0.19, edgecolor='none')

        for mtype in set(iso_aucs[ex_lbl1].index
                         & iso_aucs[ex_lbl2].index):
            plt_x = iso_aucs[ex_lbl1][mtype]
            plt_y = iso_aucs[ex_lbl2][mtype]

            plt_min = min(plt_min, plt_x - 0.013, plt_y - 0.013)
            mtype_sz = 503 * np.mean(pheno_dict[mtype])
            plt_clr = choose_subtype_colour(
                tuple(mtype.subtype_iter())[0][1])

            axarr[j, i].scatter(plt_x, plt_y, c=[plt_clr], s=mtype_sz,
                                alpha=0.19, edgecolor='none')

    for i, j in permt(range(3), r=2):
        axarr[i, j].grid(alpha=0.53, linewidth=0.7)
        axarr[j, i].grid(alpha=0.53, linewidth=0.7)

        if j - i != 1 and i < 2:
            axarr[i, j].xaxis.set_major_formatter(plt.NullFormatter())
        else:
            axarr[i, j].xaxis.set_major_locator(
                plt.MaxNLocator(7, steps=[1, 2, 4]))

        if j - i != 1 and j > 0:
            axarr[i, j].yaxis.set_major_formatter(plt.NullFormatter())
        else:
            axarr[i, j].yaxis.set_major_locator(
                plt.MaxNLocator(7, steps=[1, 2, 4]))

        axarr[i, j].plot([plt_min, 1], [0.5, 0.5], color='black',
                         linewidth=1.3, linestyle=':', alpha=0.71)
        axarr[i, j].plot([0.5, 0.5], [plt_min, 1], color='black',
                         linewidth=1.3, linestyle=':', alpha=0.71)

        axarr[i, j].plot([plt_min, 1], [1, 1],
                         color='black', linewidth=1.7, alpha=0.89)
        axarr[i, j].plot([1, 1], [plt_min, 1],
                         color='black', linewidth=1.7, alpha=0.89)

        axarr[i, j].plot([plt_min, 0.997], [plt_min, 0.997],
                         color='#550000', linewidth=2.1,
                         linestyle='--', alpha=0.41)

        axarr[i, j].set_xlim([plt_min, 1 + (1 - plt_min) / 113])
        axarr[i, j].set_ylim([plt_min, 1 + (1 - plt_min) / 113])

    for i, (ex_lbl, auc_vals) in enumerate(base_aucs.items()):
        axarr[i, i].axis('off')
        axarr[i, i].text(0.5, 0.5, ex_lbl, size=37, fontweight='bold',
                         ha='center', va='center')

    plt.tight_layout(w_pad=1.7, h_pad=1.7)
    plt.savefig(os.path.join(
        plot_dir, args.gene,
        "{}__iso-comparisons_{}_{}.svg".format(use_coh, args.classif,
                                               use_src)),
        bbox_inches='tight', format='svg')
    plt.close()


def plot_score_symmetry(pred_dfs, pheno_dict, auc_dfs, cdata,
                        args, use_src, use_coh, siml_metric):
    fig, (iso_ax, ish_ax) = plt.subplots(figsize=(15, 8), nrows=1, ncols=2)

    use_mtree = tuple(cdata.mtrees.values())[0][args.gene]
    all_mtypes = {'Iso': MuType({('Gene', args.gene): use_mtree.allkey()})}
    all_mtypes['IsoShal'] = all_mtypes['Iso'] - MuType(
        {('Gene', args.gene): shal_mtype})

    all_phns = {ex_lbl: np.array(cdata.train_pheno(all_mtype))
                for ex_lbl, all_mtype in all_mtypes.items()}
    train_samps = cdata.get_train_samples()

    iso_combs = remove_pheno_dups({
        mut for mut, auc_val in auc_dfs['Iso'].iteritems()
        if (isinstance(mut, ExMcomb) and auc_val >= args.auc_cutoff
            and not (mut.all_mtype & shal_mtype).is_empty())
        }, pheno_dict)

    ish_combs = remove_pheno_dups({
        mut for mut, auc_val in auc_dfs['IsoShal'].iteritems()
        if (isinstance(mut, ExMcomb) and auc_val >= args.auc_cutoff
            and (mut.all_mtype & shal_mtype).is_empty()
            and all((mtp & shal_mtype).is_empty() for mtp in mut.mtypes))
        }, pheno_dict)

    pairs_dict = {
        ex_lbl: [
            (mcomb1, mcomb2) for mcomb1, mcomb2 in combn(use_combs, 2)
            if (all((mtp1 & mtp2).is_empty()
                    for mtp1, mtp2 in product(mcomb1.mtypes,
                                              mcomb2.mtypes))
                or not (pheno_dict[mcomb1] & pheno_dict[mcomb2]).any())
            ]
        for ex_lbl, use_combs in [('Iso', iso_combs),
                                  ('IsoShal', ish_combs)]
        }

    if args.verbose:
        for ex_lbl, use_combs in zip(['Iso', 'IsoShal'],
                                     [iso_combs, ish_combs]):
            pair_strs = ["\n#########\n"
                         "{}: {}({}) {} pairs from {} types".format(
                             use_coh, args.gene, ex_lbl,
                             len(pairs_dict[ex_lbl]), len(use_combs))]

            if pairs_dict[ex_lbl]:
                pair_strs += ['----------']
                pair_strs += [
                    '\txxxxx\t'.join([str(mcomb) for mcomb in pair])
                    for pair in pairs_dict[ex_lbl][
                        ::(len(pairs_dict[ex_lbl])
                           // (args.verbose * 7) + 1)]
                    ]

            print('\n'.join(pair_strs))

    combs_dict = {ex_lbl: set(reduce(add, use_pairs))
                  for ex_lbl, use_pairs in pairs_dict.items() if use_pairs}

    if not combs_dict:
        return None

    map_args = []
    ex_indx = []

    for ex_lbl, pair_combs in combs_dict.items():
        ex_indx += [(ex_lbl, mcombs) for mcombs in pairs_dict[ex_lbl]]
        use_preds = pred_dfs[ex_lbl].loc[pair_combs, train_samps]

        wt_vals = {mcomb: use_preds.loc[mcomb, ~all_phns[ex_lbl]]
                   for mcomb in pair_combs}
        mut_vals = {mcomb: use_preds.loc[mcomb, pheno_dict[mcomb]]
                    for mcomb in pair_combs}

        if siml_metric == 'mean':
            wt_means = {mcomb: vals.mean()
                        for mcomb, vals in wt_vals.items()}
            mut_means = {mcomb: vals.mean()
                         for mcomb, vals in mut_vals.items()}

            map_args += [
                (wt_vals[mcomb1], mut_vals[mcomb1],
                 use_preds.loc[mcomb1, pheno_dict[mcomb2]],
                 wt_means[mcomb1], mut_means[mcomb1], None)
                for mcombs in pairs_dict[ex_lbl]
                for mcomb1, mcomb2 in permt(mcombs)
                ]

        elif siml_metric == 'ks':
            base_dists = {
                mcomb: ks_2samp(wt_vals[mcomb], mut_vals[mcomb],
                                alternative='greater').statistic
                for mcomb in pair_combs
                }

            map_args += [
                (wt_vals[mcomb1], mut_vals[mcomb1],
                 use_preds.loc[mcomb1, pheno_dict[mcomb2]],
                 base_dists[mcomb1])
                for mcombs in pairs_dict[ex_lbl]
                for mcomb1, mcomb2 in permt(mcombs)
                ]

    if siml_metric == 'mean':
        chunk_size = int(len(map_args) / (41 * args.cores)) + 1
    elif siml_metric == 'ks':
        chunk_size = int(len(map_args) / (23 * args.cores)) + 1

    pool = mp.Pool(args.cores)
    siml_list = pool.starmap(siml_fxs[siml_metric], map_args, chunk_size)
    pool.close()
    siml_vals = dict(zip(ex_indx, zip(siml_list[::2], siml_list[1::2])))

    # TODO: scale by plot ranges or leave as is and thus make sizes
    #  relative to "true" plotting area?
    plt_lims = min(siml_list) - 0.19, max(siml_list) + 0.19
    size_mult = 18301 * len(map_args) ** (-5 / 13)
    clr_norm = colors.Normalize(vmin=-1, vmax=2)

    for ax, ex_lbl in zip([iso_ax, ish_ax], ['Iso', 'IsoShal']):
        ax.grid(alpha=0.47, linewidth=0.9)

        ax.plot(plt_lims, [0, 0],
                color='black', linewidth=1.37, linestyle=':', alpha=0.53)
        ax.plot([0, 0], plt_lims,
                color='black', linewidth=1.37, linestyle=':', alpha=0.53)

        ax.plot(plt_lims, plt_lims,
                color='#550000', linewidth=1.43, linestyle='--', alpha=0.41)

        for siml_val in [-1, 1, 2]:
            ax.plot(plt_lims, [siml_val] * 2,
                    color=simil_cmap(clr_norm(siml_val)),
                    linewidth=4.1, linestyle=':', alpha=0.37)
            ax.plot([siml_val] * 2, plt_lims,
                    color=simil_cmap(clr_norm(siml_val)),
                    linewidth=4.1, linestyle=':', alpha=0.37)

        plt_lctr = plt.MaxNLocator(7, steps=[1, 2, 5])
        ax.xaxis.set_major_locator(plt_lctr)
        ax.yaxis.set_major_locator(plt_lctr)

        for mcomb1, mcomb2 in pairs_dict[ex_lbl]:
            plt_sz = size_mult * (np.mean(pheno_dict[mcomb1])
                                  * np.mean(pheno_dict[mcomb2])) ** 0.5

            for i, (plt_half, mcomb) in enumerate(
                    zip(['left', 'right'], [mcomb1, mcomb2])):
                mrk_style = MarkerStyle('o', fillstyle=plt_half)

                plt_clr = choose_subtype_colour(
                    tuple(reduce(or_, mcomb.mtypes).subtype_iter())[0][1])

                ax.scatter(*siml_vals[ex_lbl, (mcomb1, mcomb2)],
                           s=plt_sz, facecolor=plt_clr, marker=mrk_style,
                           alpha=13 / 71, edgecolor='none')

        if ex_lbl == 'IsoShal':
            ax.text(1, 0, "AUC >= {:.2f}".format(args.auc_cutoff),
                    size=19, ha='right', va='bottom',
                    transform=ax.transAxes, fontstyle='italic')

    iso_ax.set_title(
        "Similarities Computed Treating\nShallow CNAs as Mutant\n",
        size=23, weight='bold')
    ish_ax.set_title(
        "Similarities Computed Treating\nShallow CNAs as Wild-Type\n",
        size=23, weight='bold')

    for ax in [iso_ax, ish_ax]:
        ax.set_xlim(*plt_lims)
        ax.set_ylim(*plt_lims)

    plt.tight_layout(w_pad=3.1)
    plt.savefig(os.path.join(
        plot_dir, args.gene,
        "{}__{}-siml-symmetry_{}_{}.svg".format(use_coh, siml_metric,
                                                args.classif, use_src)),
        bbox_inches='tight', format='svg')
    plt.close()


def plot_mutual_similarity(pred_df, pheno_dict, auc_vals, cdata,
                           args, use_coh, ex_lbl, siml_metric):
    use_mtree = tuple(cdata.mtrees.values())[0]
    use_combs = {mut for mut in auc_vals.index.tolist()
                 if isinstance(mut, ExMcomb) and len(mut.mtypes) == 1}

    if ex_lbl == 'Iso':
        use_combs = {mcomb for mcomb in use_combs
                     if not (mcomb.all_mtype & shal_mtype).is_empty()}

    elif ex_lbl == 'IsoShal':
        use_combs = {
            mcomb for mcomb in use_combs
            if ((mcomb.all_mtype & shal_mtype).is_empty()
                and all((mtype & shal_mtype).is_empty()
                        for mtype in mcomb.mtypes))
            }

    base_phns = {
        mcomb: (pheno_dict[Mcomb(*mcomb.mtypes)]
                if Mcomb(*mcomb.mtypes) in pheno_dict
                else np.array(cdata.get_pheno(Mcomb(*mcomb.mtypes))))
        for mcomb in use_combs
        }

    use_pairs = [
        (mcomb1, mcomb2) for mcomb1, mcomb2 in combn(use_combs, 2)
        if (set(mcomb1.label_iter()) == set(mcomb2.label_iter())
            and (all((mtype1 & mtype2).is_empty()
                     for mtype1, mtype2 in product(mcomb1.mtypes,
                                                   mcomb2.mtypes))
                 or not (pheno_dict[mcomb1]
                         & pheno_dict[mcomb2]).any()))
        ]

    use_pairs = remove_pair_dups(use_pairs, pheno_dict)
    pair_combs = set(reduce(add, use_pairs, tuple()))

    if args.verbose:
        print("{}({}): {} pairs containing {} unique mutation types were "
              "produced from {} possible types".format(
                  use_coh, ex_lbl, len(use_pairs),
                  len(pair_combs), len(use_combs)))

    if not use_pairs:
        return None

    fig, ax = plt.subplots(figsize=(13, 8))
    ax.grid(alpha=0.53, linewidth=0.53)

    train_samps = cdata.get_train_samples()
    use_preds = pred_df.loc[pair_combs, train_samps].applymap(np.mean)
    mutex_dict = {mcombs: None for mcombs in use_pairs}
    map_args = list()

    for mcomb1, mcomb2 in use_pairs:
        ovlp_odds, ovlp_pval = fisher_exact(table=pd.crosstab(
            base_phns[mcomb1], base_phns[mcomb2]))

        mutex_dict[mcomb1, mcomb2] = -np.log10(ovlp_pval)
        if ovlp_odds < 1:
            mutex_dict[mcomb1, mcomb2] *= -1

        all_mtype = reduce(
            or_, [MuType({('Gene', gene): use_mtree[gene].allkey()})
                  for gene in mcomb1.label_iter()])

        if ex_lbl == 'IsoShal':
            all_mtype -= MuType(
                {('Gene', tuple(mcomb1.label_iter())): shal_mtype})

        all_phn = np.array(cdata.train_pheno(all_mtype))
        wt_vals = {mcomb: use_preds.loc[mcomb, ~all_phn]
                   for mcomb in (mcomb1, mcomb2)}
        mut_vals = {mcomb: use_preds.loc[mcomb, pheno_dict[mcomb]]
                    for mcomb in (mcomb1, mcomb2)}

        map_args += [
            (wt_vals[mcomb1], mut_vals[mcomb1],
             use_preds.loc[mcomb1, pheno_dict[mcomb2]]),
            (wt_vals[mcomb2], mut_vals[mcomb2],
             use_preds.loc[mcomb2, pheno_dict[mcomb1]])
            ]

    pool = mp.Pool(args.cores)
    siml_list = pool.starmap(SIML_FXS[siml_metric], map_args, chunksize=1)
    pool.close()
    siml_vals = dict(zip(use_pairs,
                         zip(siml_list[::2], siml_list[1::2])))

    plot_df = pd.DataFrame({'Occur': pd.Series(mutex_dict),
                            'Simil': pd.Series(siml_vals).apply(np.mean)})

    plot_lims = plot_df.quantile(q=[0, 1])
    plot_diff = plot_lims.diff().iloc[1]
    plot_lims.Occur += plot_diff.Occur * np.array([-17., 4.3]) ** -1
    plot_lims.Simil += plot_diff.Simil * np.array([-17., 4.3]) ** -1
    plot_rngs = plot_lims.diff().iloc[1]

    plot_lims.Occur[0] = min(plot_lims.Occur[0],
                             -plot_rngs.Occur / 3.41, -1.07)
    plot_lims.Occur[1] = max(plot_lims.Occur[1],
                             plot_rngs.Occur / 3.41, 1.07)
    plot_lims.Simil[0] = min(plot_lims.Simil[0],
                             -plot_rngs.Simil / 2.23, -0.53)
    plot_lims.Simil[1] = max(plot_lims.Simil[1],
                             plot_rngs.Simil / 2.23, 0.53)

    plot_rngs = plot_lims.diff().iloc[1]
    size_mult = 20103 * len(map_args) ** (-3 / 7)

    for (mcomb1, mcomb2), (occur_val, simil_val) in plot_df.iterrows():
        plt_sz = size_mult * (pheno_dict[mcomb1].mean()
                              * pheno_dict[mcomb2].mean()) ** 0.5

        if (set(tuple(mcomb1.mtypes)[0].label_iter())
                == set(tuple(mcomb2.mtypes)[0].label_iter())):
            use_mrk = 'D'
        else:
            use_mrk = 'o'

        gene_stat = [args.gene in tuple(mcomb.mtypes)[0].label_iter()
                     for mcomb in (mcomb1, mcomb2)]

        plt_clrs = [
            choose_subtype_colour(
                tuple(tuple(mcomb.mtypes)[0].subtype_iter())[0][1])
            if gene_stat[i] else None
            for i, mcomb in enumerate((mcomb1, mcomb2))
            ]

        if gene_stat[0] ^ gene_stat[1] or plt_clrs[0] != plt_clrs[1]:
            for i, (plt_half, mcomb) in enumerate(
                    zip(['left', 'right'], [mcomb1, mcomb2])):
                if gene_stat[i]:
                    mrk_style = MarkerStyle(use_mrk, fillstyle=plt_half)
                    ax.scatter(occur_val, simil_val,
                               s=plt_sz, marker=mrk_style,
                               facecolor=plt_clrs[i], edgecolor='none',
                               alpha=13 / 79)

        else:
            if all(gene_stat):
                fc_clr = plt_clrs[0]
                eg_clr = 'none'
                lw = 0
            else:
                fc_clr = 'none'
                eg_clr = '0.31'
                lw = 1.9

            ax.scatter(occur_val, simil_val, s=plt_sz, marker=use_mrk,
                       facecolor=fc_clr, edgecolor=eg_clr, linewidth=lw,
                       alpha=13 / 79)

    x_plcs = plot_rngs.Occur / 97, plot_rngs.Occur / 23
    y_plc = plot_lims.Simil[1] - plot_rngs.Simil / 41

    ax.text(-x_plcs[0], y_plc, '\u2190',
            size=23, ha='right', va='center', weight='bold')
    ax.text(-x_plcs[1], y_plc, "significant exclusivity",
            size=13, ha='right', va='center')

    ax.text(x_plcs[0], y_plc, '\u2192',
            size=23, ha='left', va='center', weight='bold')
    ax.text(x_plcs[1], y_plc, "significant overlap",
            size=13, ha='left', va='center')

    x_plc = plot_lims.Occur[1] - plot_rngs.Occur / 17
    y_plcs = plot_rngs.Simil / 71, plot_rngs.Simil / 17

    ax.text(x_plc, -y_plcs[0], '\u2190',
            size=23, rotation=90, ha='center', va='top', weight='bold')
    ax.text(x_plc, -y_plcs[1], "opposite\ndownstream\neffects",
            size=13, ha='center', va='top')

    ax.text(x_plc, y_plcs[0], '\u2192',
            size=23, rotation=90, ha='center', va='bottom', weight='bold')
    ax.text(x_plc, y_plcs[1], "similar\ndownstream\neffects",
            size=13, ha='center', va='bottom')

    plt.xticks(size=11)
    plt.yticks(size=11)
    ax.axhline(0, color='black', linewidth=1.7, linestyle='--', alpha=0.41)
    ax.axvline(0, color='black', linewidth=1.7, linestyle='--', alpha=0.41)

    plt.xlabel("Genomic Co-occurence", size=23, weight='semibold')
    plt.ylabel("Transcriptomic Similarity", size=23, weight='semibold')
    ax.set_xlim(*plot_lims.Occur)
    ax.set_ylim(*plot_lims.Simil)

    plt.savefig(os.path.join(
        plot_dir, args.gene,
        "{}__{}__{}-mutual-simil_{}_{}.svg".format(
            use_coh, ex_lbl, siml_metric, args.classif, args.expr_source)),
        bbox_inches='tight', format='svg')
    plt.close()


def plot_symmetry_decomposition(pred_df, pheno_dict, auc_vals,
                                cdata, args, plt_gene, ex_lbl, siml_metric):
    use_mtree = tuple(cdata.mtrees.values())[0][plt_gene]
    use_combs = auc_vals.index.tolist()

    use_pairs = [
        (mcomb1, mcomb2) for mcomb1, mcomb2 in combn(use_combs, 2)
        if (all((mtp1 & mtp2).is_empty()
                for mtp1, mtp2 in product(mcomb1.mtypes, mcomb2.mtypes))
            or not (pheno_dict[mcomb1] & pheno_dict[mcomb2]).any())
        ]

    if not use_pairs:
        print("no suitable pairs found among {} possible "
              "mutations for: {}({}) !".format(len(use_combs),
                                               plt_gene, ex_lbl))
        return True

    if len(use_pairs) > PLOT_MAX:
        print("found {} suitable pairs for {}({}), only plotting "
              "the top {} by max AUC!".format(len(use_pairs),
                                              plt_gene, ex_lbl, PLOT_MAX))

        use_pairs = pd.Series({
            tuple(mcombs): max(auc_vals[mcomb] for mcomb in mcombs)
            for mcombs in use_pairs
            }).sort_values()[-(PLOT_MAX):].index.tolist()

    mcomb_clx = {mcomb: classify_mcomb(mcomb) for mcomb in use_combs}
    cls_counts = pd.Series(
        reduce(add, [[mcomb_clx[mcomb] for mcomb in use_pair]
                     for use_pair in use_pairs])).value_counts()

    if len(cls_counts) == 1:
        print("only one partition found, cannot plot decomposition "
              "for {}({}) !".format(plt_gene, ex_lbl))
        return True

    fig, axarr = plt.subplots(
        figsize=(1.5 + 3 * len(cls_counts), 1 + 3 * len(cls_counts)),
        nrows=1 + len(cls_counts), ncols=1 + len(cls_counts),
        gridspec_kw=dict(width_ratios=[1] + [2] * len(cls_counts),
                         height_ratios=[7] * len(cls_counts) + [2])
        )

    all_mtype = MuType({('Gene', plt_gene): use_mtree.allkey()})
    if ex_lbl == 'IsoShal':
        all_mtype -= MuType({('Gene', plt_gene): shal_mtype})

    pair_combs = set(reduce(add, use_pairs))
    train_samps = cdata.get_train_samples()
    use_preds = pred_df.loc[pair_combs, train_samps]
    all_phn = np.array(cdata.train_pheno(all_mtype))

    wt_vals = {mcomb: use_preds.loc[mcomb, ~all_phn]
               for mcomb in pair_combs}
    mut_vals = {mcomb: use_preds.loc[mcomb, pheno_dict[mcomb]]
                for mcomb in pair_combs}

    if siml_metric == 'mean':
        chunk_size = int(0.91 * len(use_pairs) / args.cores) + 1

        wt_means = {mcomb: vals.mean() for mcomb, vals in wt_vals.items()}
        mut_means = {mcomb: vals.mean() for mcomb, vals in mut_vals.items()}

        map_args = [(wt_vals[mcomb1], mut_vals[mcomb1],
                     use_preds.loc[mcomb1, pheno_dict[mcomb2]],
                     wt_means[mcomb1], mut_means[mcomb1], None)
                    for mcombs in use_pairs
                    for mcomb1, mcomb2 in permt(mcombs)]

    elif siml_metric == 'ks':
        chunk_size = int(0.91 * len(use_pairs) / args.cores) + 1

        base_dists = {mcomb: ks_2samp(wt_vals[mcomb], mut_vals[mcomb],
                                      alternative='greater').statistic
                      for mcomb in pair_combs}

        map_args = [(wt_vals[mcomb1], mut_vals[mcomb1],
                     use_preds.loc[mcomb1, pheno_dict[mcomb2]],
                     base_dists[mcomb1])
                    for mcombs in use_pairs
                    for mcomb1, mcomb2 in permt(mcombs)]

    pool = mp.Pool(args.cores)
    siml_list = pool.starmap(siml_fxs[siml_metric], map_args, chunk_size)
    pool.close()
    siml_vals = dict(zip(use_pairs, zip(siml_list[::2], siml_list[1::2])))

    size_mult = max(727 - math.log(len(use_pairs), 1 + 1 / 77), 31)
    PAIR_CLRS = ['#0DAAFF', '#FF8B00']
    acc_norm = colors.Normalize(vmin=args.auc_cutoff, vmax=auc_vals.max())
    acc_cmap = sns.cubehelix_palette(start=1.07, rot=1.31, gamma=0.83,
                                     light=0.19, dark=0.73,
                                     reverse=True, as_cmap=True)

    plt_sizes = {
        (mcomb1, mcomb2): size_mult * (np.mean(pheno_dict[mcomb1])
                                       * np.mean(pheno_dict[mcomb2])) ** 0.5
        for mcomb1, mcomb2 in use_pairs
        }

    for (i, cls1), (j, cls2) in combn(enumerate(cls_counts.index), 2):
        pair_count = len(plt_sizes)

        for (mcomb1, mcomb2), plt_sz in plt_sizes.items():
            if mcomb_clx[mcomb1] == cls2 and mcomb_clx[mcomb2] == cls1:
                use_clr, use_alpha = PAIR_CLRS[0], 1 / 6.1
            elif mcomb_clx[mcomb1] == cls1 and mcomb_clx[mcomb2] == cls2:
                use_clr, use_alpha = PAIR_CLRS[1], 1 / 6.1

            else:
                use_clr, use_alpha = '0.61', 1 / 17
                pair_count -= 1

            axarr[i, j + 1].scatter(*siml_vals[mcomb1, mcomb2],
                                    c=[use_clr], s=plt_sz,
                                    alpha=use_alpha, edgecolor='none')

            if use_clr in PAIR_CLRS:
                axarr[j, i + 1].scatter(
                    *siml_vals[mcomb1, mcomb2],
                    c=[acc_cmap(acc_norm(auc_vals[mcomb1]))],
                    s=plt_sz, alpha=use_alpha, edgecolor='none'
                    )

        if pair_count == 1:
            pair_lbl = "1 pair"
        else:
            pair_lbl = "{} pairs".format(pair_count)

        axarr[j, i + 1].text(0.01, 1, pair_lbl,
                             size=13, ha='left', va='bottom',
                             fontstyle='italic',
                             transform=axarr[j, i + 1].transAxes)
        axarr[i, j + 1].text(0.99, 1, "({})".format(pair_count),
                             size=13, ha='right', va='bottom',
                             fontstyle='italic',
                             transform=axarr[i, j + 1].transAxes)

    plt_lims = min(siml_list) - 0.07, max(siml_list) + 0.07
    plt_gap = (plt_lims[1] - plt_lims[0]) / 53
    clx_counts = pd.Series(mcomb_clx).value_counts()

    for i, (cls, cls_count) in enumerate(cls_counts.items()):
        axarr[-1, i + 1].text(0.5, 13 / 17, cls,
                              size=23, ha='center', va='top',
                              fontweight='semibold',
                              transform=axarr[-1, i + 1].transAxes)

        if clx_counts[cls] == 1:
            count_lbl = "1 subgrouping"
        else:
            count_lbl = "{} subgroupings".format(clx_counts[cls])

        axarr[-1, i + 1].text(0.5, -1 / 7, count_lbl,
                              size=19, ha='center', va='bottom',
                              fontstyle='italic',
                              transform=axarr[-1, i + 1].transAxes)

        for (mcomb1, mcomb2), plt_sz in plt_sizes.items():
            if mcomb_clx[mcomb1] == cls and mcomb_clx[mcomb2] == cls:
                use_clr, use_alpha = 'black', 0.37
            elif mcomb_clx[mcomb1] == cls:
                use_clr, use_alpha = PAIR_CLRS[0], 0.19
            elif mcomb_clx[mcomb2] == cls:
                use_clr, use_alpha = PAIR_CLRS[1], 0.19

            else:
                use_clr, use_alpha = '0.73', 1 / 6.1

            axarr[i, i + 1].scatter(*siml_vals[mcomb1, mcomb2],
                                    c=[use_clr], s=plt_sz,
                                    alpha=use_alpha, edgecolor='none')

        if cls_count == 1:
            cls_lbl = "1 total pair"
        else:
            cls_lbl = "{} total pairs".format(cls_count)

        axarr[i, i + 1].text(0.99, 1, cls_lbl,
                             size=13, ha='right', va='bottom',
                             fontstyle='italic',
                             transform=axarr[i, i + 1].transAxes)

        axarr[-2, i + 1].add_patch(
            ptchs.Rectangle((0.02, -0.23), 0.96, 0.061,
                            facecolor=PAIR_CLRS[0], alpha=0.61,
                            edgecolor='none',
                            transform=axarr[-2, i + 1].transAxes,
                            clip_on=False)
            )

    clr_ax = axarr[-2, 0].inset_axes(bounds=(1 / 3, -3 / 17, 4 / 7, 43 / 23),
                                     clip_on=False, in_layout=False)
    clr_bar = ColorbarBase(ax=clr_ax, cmap=acc_cmap, norm=acc_norm,
                           ticklocation='left')

    clr_ax.set_title("AUC", size=21, fontweight='bold')
    clr_ax.yaxis.set_major_locator(plt.MaxNLocator(7, steps=[1, 2, 4, 5]))

    tcks_loc = clr_ax.get_yticks().tolist()
    clr_ax.yaxis.set_major_locator(mpl.ticker.FixedLocator(tcks_loc))
    clr_bar.ax.set_yticklabels(
        [format(tick, '.2f').lstrip('0') for tick in tcks_loc],
        size=15, fontweight='semibold'
        )

    siml_norm = colors.Normalize(vmin=-1, vmax=2)
    plt_lctr = plt.MaxNLocator(5, steps=[1, 2, 5])

    for ax in axarr[:-1, 1:].flatten():
        ax.grid(alpha=0.47, linewidth=0.7)

        ax.plot(plt_lims, [0, 0],
                color='black', linewidth=0.83, linestyle=':', alpha=0.47)
        ax.plot([0, 0], plt_lims,
                color='black', linewidth=0.83, linestyle=':', alpha=0.47)
        ax.plot(plt_lims, plt_lims,
                color='#550000', linewidth=1.13, linestyle='--', alpha=0.37)

        for siml_val in [-1, 1, 2]:
            ax.plot(plt_lims, [siml_val] * 2,
                    color=simil_cmap(siml_norm(siml_val)),
                    linewidth=2.7, linestyle=':', alpha=0.31)
            ax.plot([siml_val] * 2, plt_lims,
                    color=simil_cmap(siml_norm(siml_val)),
                    linewidth=2.7, linestyle=':', alpha=0.31)

        ax.set_xlim(*plt_lims)
        ax.set_ylim(*plt_lims)
        ax.xaxis.set_major_locator(plt_lctr)
        ax.yaxis.set_major_locator(plt_lctr)

    for i in range(len(cls_counts)):
        for j in range(1, len(cls_counts) + 1):
            if i != (j - 1):
                axarr[i, j].set_xticklabels([])
                axarr[i, j].set_yticklabels([])
            else:
                axarr[i, j].tick_params(labelsize=11)

    for ax in axarr[:, 0].tolist() + axarr[-1, :].tolist():
        ax.axis('off')

    plt.tight_layout(w_pad=2 / 7, h_pad=2 / 7)
    plt.savefig(os.path.join(
        plot_dir, '__'.join([args.expr_source, args.cohort]),
        "{}_{}_{}-symm-decomposition_{}.svg".format(
            plt_gene, ex_lbl, siml_metric, args.classif)),
        bbox_inches='tight', format='svg')

    plt.close()
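# plot_symmetry_decomposition consumes its similarity functions through
# siml_fxs; a plausible sketch of the 'mean' variant is given below for
# reference. It rescales the mean classifier score of a second
# subgrouping's samples so the wild-type mean maps to 0 and the mutant
# mean maps to 1, mirroring the (wt_vals, mut_vals, other_vals, wt_mean,
# mut_mean, ...) tuples built above. The project's actual implementation
# may differ in detail; treat this as an assumption, not the source.
def mean_siml_sketch(wt_vals, mut_vals, other_vals,
                     wt_mean=None, mut_mean=None, prop=None):
    """Mean-based similarity: 0 at the wild-type mean, 1 at the mutant mean."""
    import numpy as np

    # precomputed means can be passed in to avoid repeated work across pairs
    if wt_mean is None:
        wt_mean = np.mean(wt_vals)
    if mut_mean is None:
        mut_mean = np.mean(mut_vals)

    return (np.mean(other_vals) - wt_mean) / (mut_mean - wt_mean)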
def main():
    """Runs the experiment."""

    parser = argparse.ArgumentParser(
        description=("Test a classifier's ability to predict the presence "
                     "of a list of sub-types."))

    # positional command line arguments
    parser.add_argument('mtype_dir', type=str,
                        help='the folder where sub-types are stored')
    parser.add_argument('cohort', type=str, help='a TCGA cohort')
    parser.add_argument('classif', type=str,
                        help='a classifier in HetMan.predict.classifiers')
    parser.add_argument('cv_id', type=int,
                        help='a random seed used for cross-validation')

    parser.add_argument(
        '--tune_splits', type=int, default=4,
        help='how many training cohort splits to use for tuning')
    parser.add_argument(
        '--test_count', type=int, default=16,
        help='how many hyper-parameter values to test in each tuning split')
    parser.add_argument(
        '--parallel_jobs', type=int, default=16,
        help='how many parallel CPUs to allocate the tuning tests across')

    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    # parse the command line arguments and show info about where the
    # input sub-types are stored and which subset of them will be tested
    args = parser.parse_args()
    mtype_list = sorted(pickle.load(
        open(os.path.join(args.mtype_dir, 'tmp', 'mtype_list.p'), 'rb')))

    # loads the pipeline used for classifying variants, gets the mutated
    # genes for each variant under consideration
    mut_clf = eval(args.classif)
    use_genes = reduce(
        or_, [set(gn for gn, _ in mtype.subtype_list())
              for mtype in mtype_list]
        )

    # loads the expression data and gene mutation data for the given TCGA
    # cohort, with the training/testing cohort split defined by the
    # cross-validation id for this task
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = VariantCohort(cohort=args.cohort, mut_genes=use_genes,
                          mut_levels=['Gene', 'Form_base', 'Exon', 'Protein'],
                          expr_source='Firehose', data_dir=firehose_dir,
                          syn=syn, cv_seed=(args.cv_id + 12) * 71,
                          cv_prop=1.0)

    infer_mats = {(mtype1, mtype2): {}
                  for mtype1, mtype2 in combn(mtype_list, 2)}

    for mtype1, mtype2 in combn(mtype_list, 2):
        print('{} and {}'.format(mtype1, mtype2))

        ex_genes = set(gn for gn, _ in mtype1.subtype_list())
        ex_genes |= set(gn for gn, _ in mtype2.subtype_list())
        clf = mut_clf()

        samps1 = mtype1.get_samples(cdata.train_mut)
        samps2 = mtype2.get_samples(cdata.train_mut)

        if len(samps1 | samps2) <= (len(cdata.samples) - 10):

            if 10 <= len(samps1 & samps2):
                clf.tune_coh(cdata, mtype1, exclude_genes=ex_genes,
                             exclude_samps=samps1 ^ samps2,
                             tune_splits=args.tune_splits,
                             test_count=args.test_count,
                             parallel_jobs=args.parallel_jobs)

                infer_mats[(mtype1, mtype2)]['Both'] = clf.infer_coh(
                    cdata, mtype1, exclude_genes=ex_genes,
                    force_test_samps=samps1 ^ samps2,
                    infer_splits=20, infer_folds=4,
                    parallel_jobs=args.parallel_jobs
                    )

            if 10 <= len(samps1 - samps2):
                clf.tune_coh(cdata, mtype1, exclude_genes=ex_genes,
                             exclude_samps=samps2,
                             tune_splits=args.tune_splits,
                             test_count=args.test_count,
                             parallel_jobs=args.parallel_jobs)

                infer_mats[(mtype1, mtype2)]['Mtype1'] = clf.infer_coh(
                    cdata, mtype1, exclude_genes=ex_genes,
                    force_test_samps=samps2,
                    infer_splits=20, infer_folds=4,
                    parallel_jobs=args.parallel_jobs
                    )

            if 10 <= len(samps2 - samps1):
                clf.tune_coh(cdata, mtype2, exclude_genes=ex_genes,
                             exclude_samps=samps1,
                             tune_splits=args.tune_splits,
                             test_count=args.test_count,
                             parallel_jobs=args.parallel_jobs)

                infer_mats[(mtype1, mtype2)]['Mtype2'] = clf.infer_coh(
                    cdata, mtype2, exclude_genes=ex_genes,
                    force_test_samps=samps1,
                    infer_splits=20, infer_folds=4,
                    parallel_jobs=args.parallel_jobs
                    )

            if (mtype1.get_levels() == mtype2.get_levels()
                    and 10 <= len(samps2 ^ samps1)):
                clf.tune_coh(cdata, mtype1 | mtype2, exclude_genes=ex_genes,
                             exclude_samps=samps1 & samps2,
                             tune_splits=args.tune_splits,
                             test_count=args.test_count,
                             parallel_jobs=args.parallel_jobs)

                infer_mats[(mtype1, mtype2)]['Diff'] = clf.infer_coh(
                    cdata, mtype1 | mtype2, exclude_genes=ex_genes,
                    force_test_samps=samps1 & samps2,
                    infer_splits=20, infer_folds=4,
                    parallel_jobs=args.parallel_jobs
                    )

    # saves the performance measurements for each variant to file
    out_file = os.path.join(args.mtype_dir, 'results',
                            'out__cv-{}.p'.format(args.cv_id))

    pickle.dump(
        {'Infer': infer_mats,
         'Info': {'TuneSplits': args.tune_splits,
                  'TestCount': args.test_count,
                  'ParallelJobs': args.parallel_jobs}},
        open(out_file, 'wb')
        )
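# A hedged sketch of how the per-CV pickles written by main() might be
# gathered downstream; the path layout and the 'Infer' key follow the
# out_file format above, but this collection helper is an assumption
# rather than code taken from the project.
def load_infer_results_sketch(mtype_dir, cv_ids=range(10)):
    """Collect the inference matrices saved by each cross-validation task."""
    import os
    import pickle

    results = dict()
    for cv_id in cv_ids:
        out_file = os.path.join(mtype_dir, 'results',
                                'out__cv-{}.p'.format(cv_id))

        # tasks that have not finished yet simply have no output file
        if os.path.exists(out_file):
            with open(out_file, 'rb') as f:
                results[cv_id] = pickle.load(f)['Infer']

    return results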
def main():
    parser = argparse.ArgumentParser(
        "Set up the copy number alteration expression effect isolation "
        "experiment by enumerating alteration score thresholds to be tested.")

    # create command line arguments
    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")
    parser.add_argument('gene', type=str, help="which gene to consider")
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    # parse command line arguments, create directories where found
    # thresholds and threshold counts will be stored
    args = parser.parse_args()
    os.makedirs(os.path.join(base_dir, 'setup', 'ctf_lists'), exist_ok=True)
    os.makedirs(os.path.join(base_dir, 'setup', 'ctf_counts'), exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    # load expression, variant call, and copy number alteration data for
    # the given TCGA cohort and mutated gene
    cdata = MutationCohort(cohort=args.cohort, mut_genes=[args.gene],
                           mut_levels=['Gene'], expr_source='Firehose',
                           var_source='mc3', expr_dir=firehose_dir,
                           copy_source='Firehose', copy_dir=copy_dir,
                           copy_discrete=False, cv_prop=1.0, syn=syn)

    ctf_list = []
    mut_stat = np.array(cdata.train_mut.status(cdata.copy_data.index))
    mut_pheno = np.array(cdata.train_pheno(
        MuType({('Gene', args.gene): None})))

    copy_vals = cdata.copy_data.loc[~mut_stat, args.gene]
    loss_vals = copy_vals[copy_vals < 0]
    gain_vals = copy_vals[copy_vals > 0]

    loss_step = 20 / len(loss_vals)
    loss_ctfs = np.unique(
        loss_vals.quantile(np.arange(loss_step, 1, loss_step)))

    gain_step = 20 / len(gain_vals)
    gain_ctfs = np.unique(
        gain_vals.quantile(np.arange(gain_step, 1, gain_step)))[::-1]

    for low_ctf, high_ctf in combn(loss_ctfs, 2):
        cna_stat = (~mut_pheno
                    & cdata.train_pheno({'Gene': args.gene, 'CNA': 'Loss',
                                         'Cutoff': low_ctf}))

        wt_stat = (~mut_pheno
                   & ~cdata.train_pheno({'Gene': args.gene, 'CNA': 'Range',
                                         'Cutoff': (low_ctf, high_ctf)})
                   & ~cdata.train_pheno({'Gene': args.gene, 'CNA': 'Gain',
                                         'Cutoff': -high_ctf}))

        if (np.sum(cna_stat) >= 20) & (np.sum(wt_stat) >= 20):
            ctf_list += [(low_ctf, high_ctf)]

    for high_ctf, low_ctf in combn(gain_ctfs, 2):
        cna_stat = (~mut_pheno
                    & cdata.train_pheno({'Gene': args.gene, 'CNA': 'Gain',
                                         'Cutoff': high_ctf}))

        wt_stat = (~mut_pheno
                   & ~cdata.train_pheno({'Gene': args.gene, 'CNA': 'Range',
                                         'Cutoff': (low_ctf, high_ctf)})
                   & ~cdata.train_pheno({'Gene': args.gene, 'CNA': 'Loss',
                                         'Cutoff': -low_ctf}))

        if (np.sum(cna_stat) >= 20) & (np.sum(wt_stat) >= 20):
            ctf_list += [(low_ctf, high_ctf)]

    # save the list of enumerated cutoff pairs, and the count of such
    # pairs, to file
    pickle.dump(sorted(ctf_list),
                open(os.path.join(base_dir, 'setup', 'ctf_lists',
                                  '{}_{}.p'.format(args.cohort, args.gene)),
                     'wb'))

    with open(os.path.join(base_dir, 'setup', 'ctf_counts',
                           '{}_{}.txt'.format(args.cohort, args.gene)),
              'w') as fl:
        fl.write(str(len(ctf_list)))
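# The quantile grid enumerated in main() above places one candidate cutoff
# per roughly twenty samples on each side of zero; this standalone sketch
# with synthetic copy number scores (illustrative values only, not project
# data) shows the loss-side grid that combn() then pairs into intervals.
def loss_ctf_grid_sketch():
    """Return ~len(loss_vals)/20 evenly spaced loss-score quantiles."""
    import numpy as np
    import pandas as pd

    copy_vals = pd.Series(np.random.default_rng(1).normal(0, 0.5, 1000))
    loss_vals = copy_vals[copy_vals < 0]

    # a step of 20/n yields about n/20 quantiles: one per ~20 loss samples
    loss_step = 20 / len(loss_vals)

    return np.unique(
        loss_vals.quantile(np.arange(loss_step, 1, loss_step)))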
def main():
    parser = argparse.ArgumentParser(
        "Set up the paired gene expression effect isolation experiment by "
        "enumerating the dyads of genes to be tested.")

    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")
    parser.add_argument('--samp_cutoff', type=int, default=40,
                        help='subtype sample frequency threshold')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    # parse command line arguments, create directory where found pairs
    # will be stored
    args = parser.parse_args()
    out_path = os.path.join(base_dir, 'setup', args.cohort)
    os.makedirs(out_path, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(cohort=args.cohort, mut_genes=None,
                           mut_levels=['Gene'], expr_source='Firehose',
                           var_source='mc3', expr_dir=firehose_dir,
                           samp_cutoff=args.samp_cutoff, cv_prop=1.0,
                           syn=syn)

    if args.verbose:
        print("Looking for pairs of mutated genes present in at least {} of "
              "the samples in TCGA cohort {} with {} total samples.".format(
                  args.samp_cutoff, args.cohort, len(cdata.samples)))

    gene_pairs = {
        (MuType({('Gene', gn1): None}), MuType({('Gene', gn2): None}))
        for (gn1, muts1), (gn2, muts2) in combn(cdata.train_mut, r=2)
        if (len(muts1 - muts2) >= args.samp_cutoff
            and len(muts2 - muts1) >= args.samp_cutoff
            and len(muts1 | muts2) <= (len(cdata.samples)
                                       - args.samp_cutoff))
        }

    if args.verbose:
        print("Found {} pairs of genes to isolate!".format(len(gene_pairs)))

    pickle.dump(
        sorted(gene_pairs),
        open(os.path.join(out_path,
                          'pairs_list__samps_{}.p'.format(args.samp_cutoff)),
             'wb')
        )

    with open(os.path.join(out_path,
                           'pairs_count__samps_{}.txt'.format(
                               args.samp_cutoff)),
              'w') as fl:
        fl.write(str(len(gene_pairs)))
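# A minimal sketch of the dyad filter used in main() above, restated with
# plain sample sets: each gene must carry at least samp_cutoff mutated
# samples that the other lacks, and enough samples must remain wild-type
# for both genes. The sets here stand in for the MuTree branches iterated
# over in combn(cdata.train_mut, r=2); the helper itself is illustrative.
def pair_is_testable_sketch(samps1, samps2, all_samps, samp_cutoff=40):
    """Check whether two genes' mutated sample sets form a testable dyad."""
    return (len(samps1 - samps2) >= samp_cutoff
            and len(samps2 - samps1) >= samp_cutoff
            and len(samps1 | samps2) <= (len(all_samps) - samp_cutoff))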