def ipa_results_to_wideform(res, plogalpha):
    """
    Convert the IPA results dictionary into a wideform pd.DataFrame.

    Owing to the potentially large number of comparisons, we can't use the Venn approach here, but there's no need.

    :param res: Dictionary keyed by comparison name. Each value is a pd.DataFrame indexed by pathway that has the
    columns '-logp', 'z', 'ratio' and 'n_gene'.
    :param plogalpha: Threshold applied to the '-logp' column. A pathway is flagged 'Y' for a comparison when its
    '-logp' value is greater than or equal to this, otherwise 'N'.
    :return: pd.DataFrame indexed by the sorted union of all pathways. The first column is 'n_gene_in_pathway',
    followed, for each comparison `k`, by the Y/N column `k` and the columns `k_-logp`, `k_z`, `k_ratio` and
    `k_n_gene` (NaN where the pathway is absent from that comparison).
    """
    de_all_pathways = sorted(setops.reduce_union(*[t.index for t in res.values()]))
    export_wideform = pd.DataFrame(index=de_all_pathways)
    stat_cols = ['-logp', 'z', 'ratio', 'n_gene']

    for k, v in res.items():
        this_yn = pd.Series('N', index=de_all_pathways)
        this_yn.loc[v.index[v['-logp'] >= plogalpha]] = 'Y'
        export_wideform.insert(export_wideform.shape[1], k, this_yn)

        # align this comparison to the full pathway list once, rather than once per column
        v_all = v.reindex(de_all_pathways)
        for col in stat_cols:
            export_wideform.insert(
                export_wideform.shape[1],
                "%s_%s" % (k, col),
                v_all[col]
            )

    # add n gene in pathway as single const column
    # Select the ratio / n_gene columns by their exact names. A substring match on the column labels would also
    # pick up the Y/N membership column of any comparison whose name happens to contain 'ratio' or 'n_gene'.
    rr = export_wideform.loc[:, ["%s_ratio" % k for k in res]]
    ng = export_wideform.loc[:, ["%s_n_gene" % k for k in res]]
    # n_gene / ratio is computed per comparison, then averaged across comparisons (NaNs skipped)
    n_gene_tot = (ng.astype(float).values / rr.astype(float)).mean(axis=1).round().astype(int)
    export_wideform.insert(0, 'n_gene_in_pathway', n_gene_tot)

    return export_wideform
def get_dm_associated_de(dmr_ids, de_res_full, dmr_res_full, dmr_id_to_ens, ens_to_dmr_id, ref_labels):
    """
    For the genes linked to the supplied DMRs, gather the DE logFC values and the mean DMR median change in each
    reference comparison.

    :param dmr_ids: Iterable of DMR IDs; used as keys into `dmr_id_to_ens`.
    :param de_res_full: Dictionary keyed by reference label. Values are pd.DataFrame objects with (at least) the
    columns 'Gene Symbol' and 'logFC'.
    :param dmr_res_full: Nested lookup, accessed as `dmr_res_full[ref_label][dmr_id]['median_change']`.
    :param dmr_id_to_ens: Lookup keyed by DMR ID. Each value must expose a `.values` attribute listing the linked
    genes (presumably a pd.Series - TODO confirm against the caller).
    :param ens_to_dmr_id: Lookup keyed by the index entries of the DE results, giving an iterable of DMR IDs.
    :param ref_labels: List of reference labels; used as keys into `de_res_full` and `dmr_res_full`.
    :return: Dictionary with two entries:
        'de': pd.DataFrame of logFC, one column per reference label, indexed by gene symbol (sorted).
        'dmr': pd.DataFrame of the mean 'median_change' over all DMRs linked to each gene, one column per reference
        label, indexed by the DE result index entries (sorted).
    NOTE(review): the two tables are sorted on different identifiers, so their row order is not guaranteed to match.
    """
    all_genes = setops.reduce_union(*[dmr_id_to_ens[i].values for i in dmr_ids])

    # restrict each DE table to the genes linked to the DMRs of interest
    this_de_full = {}
    for r in ref_labels:
        tt = de_res_full[r]
        this_de_full[r] = tt.loc[tt['Gene Symbol'].isin(all_genes)]

    common_ix = sorted(setops.reduce_intersection(*[t.index for t in this_de_full.values()]))

    # Any of the tables can supply the gene symbols for the common index.
    # dict views cannot be subscripted on Python 3, so take the first value via an iterator instead.
    first_de = next(iter(this_de_full.values()))
    common_gs = first_de.loc[common_ix, 'Gene Symbol']

    dmr_median_delta = {}
    for e in common_ix:
        dmr_median_delta[e] = dict(
            (r, np.mean([dmr_res_full[r][i]['median_change'] for i in ens_to_dmr_id[e]]))
            for r in ref_labels
        )
    dmr_median_delta = pd.DataFrame(dmr_median_delta).transpose().sort_index()

    this_logfc = pd.concat(
        [this_de_full[r].loc[common_ix, 'logFC'] for r in ref_labels],
        axis=1
    )
    this_logfc.columns = ref_labels
    this_logfc.index = common_gs
    de_logfc = this_logfc.sort_index()

    return {'de': de_logfc, 'dmr': dmr_median_delta}
def compute_cross_comparison_correction(res, samples, external_refs, set_type='pair_only'):
    """
    Compute the _correction_ list of features for the supplied results. These are the features that are EITHER
    present in every reference comparison but no cross-comparisons (set_type='ref_only')
    OR present in no reference comparison but all cross-comparisons (set_type='pair_only')
    :param res: Dictionary containing comparison results. Each comparison is keyed by the tuple (i, j), where i and j
    are the IDs of the two groups being compared. Values are iterables of unique feature identifiers (e.g. gene IDs,
    DMR cluster IDs).
    :param samples: The core sample list, without including external references.
    :param external_refs: A list of external reference sample names.
    :param set_type: See description.
    :return: Dictionary with the entries
        'specific_to_each_ref': for each external reference, the intersection over all samples of 'ref_diff_set'
        'specific_to_all_refs': the intersection of 'specific_to_each_ref' across all external references
        'venn_set': pd.DataFrame (samples x [samples + external_refs]); each cell holds the selected Venn partition
        of res[(i, i)] against res[(i, j)]
        'ref_diff_set': pd.DataFrame (samples x external_refs); each cell holds the sorted features in
        venn_set[i, ref] that are absent from every venn_set[i, sample]
    :raises AttributeError: If set_type is not one of the two supported values.
    """
    # Validate once, up front: the choice does not depend on the comparison and should fail before any work is
    # done (including when `samples` is empty).
    if set_type == 'pair_only':
        kset = '10'
    elif set_type == 'ref_only':
        kset = '01'
    else:
        raise AttributeError("set_type must be 'pair_only' or 'ref_only'.")

    members_rows = samples
    members_cols = members_rows + external_refs

    the_venn_set = pd.DataFrame(index=members_rows, columns=members_cols)
    for i in members_rows:
        p = res[(i, i)]
        for j in members_cols:
            r = res[(i, j)]
            # '10': features only in res[(i, i)]; '01': features only in res[(i, j)]
            x, _ = setops.venn_from_arrays(p, r)
            the_venn_set.loc[i, j] = x[kset]

    # For each reference, get the features that are pair only in that reference and not in any of the iNSC
    vs_diff = pd.DataFrame(index=members_rows, columns=external_refs)
    for i in members_rows:
        # the union over the core samples is the same for every reference, so compute it once per row
        union_all_else = setops.reduce_union(*the_venn_set.loc[i, members_rows])
        for j in external_refs:
            the_ref = the_venn_set.loc[i, j]
            vs_diff.loc[i, j] = sorted(set(the_ref).difference(union_all_else))

    # Intersection down the columns gives us a correction list for each reference
    vs_specific_to_ref = vs_diff.apply(lambda x: setops.reduce_intersection(*x))

    # Intersection across the references gives us a final list that need correcting
    vs_specific_to_all_refs = setops.reduce_intersection(*vs_specific_to_ref)

    # the dict literal must open on the same line as `return`, otherwise the function returns None
    return {
        'specific_to_each_ref': vs_specific_to_ref,
        'specific_to_all_refs': vs_specific_to_all_refs,
        'venn_set': the_venn_set,
        'ref_diff_set': vs_diff
    }
def set_permutation_test(data, n_iter=1000, parallel=True):
    """
    Permutation test on the sizes of the Venn partitions formed by a collection of sets.

    Each iteration calls `one_random_perm(set_sizes, N)`, where `set_sizes` gives the size of every input set and
    `N` is the size of their union, and records the simulated partition sizes it returns. The observed partition
    sizes are then compared with the simulated distributions.

    :param data: Dictionary keyed by set name. Values are iterables of members.
    :param n_iter: Number of random permutations to run.
    :param parallel: If True (default), run the permutations in a multiprocessing pool.
    :return: Dictionary with the entries
        'simulated_set_sizes': dict keyed by Venn partition, each value a list of simulated sizes
        'observed_set_sizes': the observed partition sizes, from setops.venn_from_arrays
        'p': dict keyed by Venn partition; two-sided value computed as 2 * min(t, 100 - t) / 100, where t is the
        percentile of the observed size within the simulated sizes
        'z': dict keyed by Venn partition; t - 50, i.e. the percentile offset from the median. Despite the name,
        this is not a standard score.
    """
    N = len(setops.reduce_union(*data.values()))
    set_sizes = collections.OrderedDict([(k, len(v)) for k, v in data.items()])

    if parallel:
        pool = mp.Pool()
        try:
            jobs = [
                pool.apply_async(one_random_perm, args=(set_sizes, N))
                for _ in range(n_iter)
            ]
            pool.close()
            pool.join()
            all_counts = [j.get() for j in jobs]
        finally:
            # make sure the workers are released even if submitting or collecting a job fails
            pool.terminate()
    else:
        all_counts = [one_random_perm(set_sizes, N) for _ in range(n_iter)]

    simulated_sizes = collections.defaultdict(list)
    for vc in all_counts:
        for k, v in vc.items():
            simulated_sizes[k].append(v)

    _, vc_true = setops.venn_from_arrays(*data.values())

    # to calculate the P value, we EITHER need to specify a single sided test OR decide how to compute a two-sided P
    # Some interesting discussions on this topic:
    # https://stats.stackexchange.com/questions/140107/p-value-in-a-two-tail-test-with-asymmetric-null-distribution
    # https://stats.stackexchange.com/questions/360864/2-tailed-permutation-tests-for-obviously-non-symmetric-data
    # https://stats.stackexchange.com/questions/34052/two-sided-permutation-test-vs-two-one-sided
    # However, a 'Z' value is easier to compute
    z = {}
    p = {}
    for k in simulated_sizes:
        obs = vc_true[k]
        t = stats.percentileofscore(simulated_sizes[k], obs)
        if t <= 50:
            p[k] = 2 * t / 100.
        else:
            p[k] = 2 * (1 - t / 100.)
        z[k] = t - 50.

    return {
        'simulated_set_sizes': simulated_sizes,
        'observed_set_sizes': vc_true,
        'p': p,
        'z': z
    }
def check_data_compat(self):
    """
    Verify that the comparison groups agree with whichever of the DMR results, DE results and methylation data
    have been set on this object. Nothing is checked if no comparison groups are defined.

    :raises ValueError: If a group is missing from the DMR or DE results, or if a group lists samples that are
    not columns of `mdat`.
    """
    groups = self.dmr_comparison_groups
    if groups is None:
        return

    dmr_res = self.dmr_res
    if dmr_res is not None:
        for name in groups:
            if name not in dmr_res:
                raise ValueError("Group %s is not in the DMR results" % name)

    de_res = self.de_res
    if de_res is not None:
        for name in groups:
            if name not in de_res:
                raise ValueError("Group %s is not in the DE results" % name)

    mdat = self.mdat
    if mdat is not None:
        for name, members in groups.items():
            # every sample named in any sub-group must be a column of the methylation data
            wanted = list(setops.reduce_union(*members.values()))
            n_found = len(mdat.columns.intersection(wanted))
            if n_found != len(wanted):
                raise ValueError(
                    "Group %s contains samples that are missing from mdat" % name)
def export_de_dmr_groups_for_ipa(de_fdr, de_logfc, groups, fn_out=None, pids=consts.PIDS):
    """
    Combine the group-specific DE logFC and FDR values into a single table suitable for loading into IPA.

    :param de_fdr: Output of `get_de_dmr_groups`
    :param de_logfc: Output of `get_de_dmr_groups`
    :param groups: Dictionary keyed by group name. Values are iterables of the patient IDs in that group.
    :param fn_out: If supplied, the table (in Excel format) will be written to this path
    :param pids: Patient IDs; these define the `<pid>_logFC` and `<pid>_FDR` columns of the output.
    :return: pd.DataFrame indexed by the sorted union of all genes in `de_logfc`, with one logFC and one FDR column
    per patient. Cells with no value for a gene / patient are left empty.
    """
    # Build the column list explicitly rather than with reduce(): the builtin is not available on Python 3, and
    # it raises on an empty list of patient IDs.
    columns = []
    for pid in pids:
        columns.extend(["%s_logFC" % pid, "%s_FDR" % pid])

    # export these for IPA analysis
    df_for_ipa = pd.DataFrame(
        index=sorted(setops.reduce_union(*[t.index for t in de_logfc.values()])),
        columns=columns
    )

    for grp in groups:
        for pid in groups[grp]:
            this_logfc = de_logfc[grp][pid].dropna()
            this_fdr = de_fdr[grp][pid].dropna()
            df_for_ipa.loc[this_logfc.index, "%s_logFC" % pid] = this_logfc
            df_for_ipa.loc[this_fdr.index, "%s_FDR" % pid] = this_fdr

    if fn_out is not None:
        df_for_ipa.to_excel(fn_out)

    return df_for_ipa
ls='--') ax.set_xlabel('Number of variants') ax.set_ylabel('Density') fig.tight_layout() fig.savefig(os.path.join(outdir, "permute_partial_counts_meth_assoc_hyper.png"), dpi=200) # track these partial matches down aa_hypo = [(setops.key_to_members(t, pids), vs[t]) for t in venn_sets_by_group['partial']['Hypo'] if len(setops.key_to_members(t, pids)) > 2] aa_hyper = [(setops.key_to_members(t, pids), vs[t]) for t in venn_sets_by_group['partial']['Hyper'] if len(setops.key_to_members(t, pids)) > 4] all_hypo = setops.reduce_union(*[t[1] for t in aa_hypo]) all_hyper = setops.reduce_union(*[t[1] for t in aa_hyper]) partial_hypo_recs = [] partial_hyper_recs = [] for pid_arr, arr in aa_hypo: for x in arr: the_search_list = dat_classified[pid_arr[0]]['GIC only'] the_search_list.extend([ t['GIC'] for t in dat_classified[pid_arr[0]]['GIC hom iNSC het'] ]) the_search_list.extend( [t['GIC'] for t in dat_classified[pid_arr[0]]['other']]) the_recs = [t for t in the_search_list if str(t) == x]
'Hypo': '#c70039', 'Hyper': '#3d3d6b', 'Discordant': 'b' } # Don't think we need this, but may be useful for a comparison? if False: dmr_by_member = [dmr_res_all[pid].keys() for pid in pids] venn_set, venn_ct = setops.venn_from_arrays(*dmr_by_member) venn_sets_by_group = setops.full_partial_unique_other_sets_from_groups(pids, groups) dmr_groups = {} for grp in groups: # generate bar chart showing number / pct in each direction (DM) this_sets = venn_sets_by_group['full'][grp] + venn_sets_by_group['partial'][grp] this_dmrs = sorted(setops.reduce_union(*[venn_set[k] for k in this_sets])) dmr_groups[grp] = this_dmrs # Rather than just looking at genes corresponding to group-specific DMRs, we make the requirements more # stringent. For each Venn set (e.g. 018, 054, 052 - hyper group), we require DE genes in the same patients. # Simplest approach is to use the joint_de_dmr dataframes, which have already been combined. # all relations tmp = get_de_dmr_groups(joint_de_dmr_s1, dmr_res_s1.clusters, groups) de_dmrs_all = tmp['de_dmr_groups'] de_dmr_de_fdr_all = tmp['de_FDR'] de_dmr_de_logfc_all = tmp['de_logFC'] de_dmr_dmr_median_delta_all = tmp['dmr_median_delta_m'] de_dmr_ipa_res_all = export_de_dmr_groups_for_ipa( de_dmr_de_fdr_all, de_dmr_de_logfc_all,
def _has_gene_relation(clusters, de_dmr, relation_filter):
    """Return True if cluster `de_dmr[0]` lists gene `de_dmr[1]` with any of the relations in `relation_filter`."""
    gene_rel_options = [(de_dmr[1], rel) for rel in relation_filter]
    return len(set(clusters[de_dmr[0]].genes).intersection(gene_rel_options)) > 0


def _sign_consistency(df, value_cols):
    """
    For each row of `df`, test whether all non-null entries in `value_cols` have the same sign.
    Rows without any non-null entry are left as NaN.
    """
    flags = []
    for _, row in df[list(value_cols)].iterrows():
        signs = np.sign(row.dropna().astype(float))
        if len(signs) == 0:
            flags.append(np.nan)
        else:
            flags.append((signs == signs.iloc[0]).all())
    return pd.Series(flags, index=df.index, dtype=object)


def get_de_dmr_groups(
    joint_de_dmr,
    clusters,
    groups,
    pids=consts.PIDS,
    relation_filter=None
):
    """
    Get group-specific DE/DMRs. These are defined as DEs that are consistent with the DMRs in a given selection of
    patients (from one to many) that are NOT shared across groups.
    :param joint_de_dmr: Dictionary keyed by patient ID. Values are pd.DataFrame objects indexed by
    (DMR cluster ID, gene) tuples, with (at least) the columns 'dmr_median_delta', 'de_logFC' and 'de_FDR'.
    :param clusters: Lookup keyed by DMR cluster ID. Each value has a `genes` attribute, which is an iterable of
    (gene, relation) tuples.
    :param groups: Dictionary, keyed by group name. Values are iterables giving patient IDs in each group.
    :param pids: List of patient IDs.
    :param relation_filter: If supplied, only retain DE/DMR pairs for which the cluster lists the gene with one of
    these relations. A single value or an iterable of values may be given.
    :return: Dictionary with the entries (each itself a dictionary keyed by group name)
        'dmr_median_delta_m': pd.DataFrame indexed by DMR cluster ID, columns are `pids` plus 'consistent'
        'de_logFC': pd.DataFrame indexed by gene, columns are `pids` plus 'consistent'
        'de_FDR': pd.DataFrame indexed by gene, columns are `pids` plus 'consistent' (copied from 'de_logFC')
        'de_dmr_groups': sorted list of the (DMR cluster ID, gene) tuples in the group
    The 'consistent' column is True when all non-null values in a row share the same sign.
    """
    venn_sets_by_group = setops.full_partial_unique_other_sets_from_groups(pids, groups)

    if relation_filter is not None:
        # A lone string must be wrapped too: on Python 3 strings define __iter__, so the hasattr() test alone
        # would let it through and the filter would then be applied character by character.
        try:
            string_types = basestring  # Python 2
        except NameError:
            string_types = str
        if isinstance(relation_filter, string_types) or not hasattr(relation_filter, '__iter__'):
            relation_filter = [relation_filter]

    de_dmr_groups = {}
    de_dmr_de_logfc = {}
    de_dmr_de_fdr = {}
    de_dmr_dmr_delta = {}

    if relation_filter is None:
        de_dmr_by_member = [joint_de_dmr[pid].index for pid in pids]
    else:
        de_dmr_by_member = [
            [t for t in joint_de_dmr[pid].index if _has_gene_relation(clusters, t, relation_filter)]
            for pid in pids
        ]

    venn_set, _ = setops.venn_from_arrays(*de_dmr_by_member)

    out_cols = pids + ['consistent']

    for grp in groups:
        this_sets = venn_sets_by_group['full'][grp] + venn_sets_by_group['partial'][grp]
        this_de_dmrs = sorted(setops.reduce_union(*[venn_set[k] for k in this_sets]))

        if relation_filter is not None:
            # look for any intersection here
            this_de_dmrs = [t for t in this_de_dmrs if _has_gene_relation(clusters, t, relation_filter)]

        de_dmr_groups[grp] = this_de_dmrs

        # get separate lists of DE genes and DMR IDs
        # DMRs is straightforward
        de_dmr_dmr_delta[grp] = pd.DataFrame(
            index=sorted(set([t[0] for t in this_de_dmrs])),
            columns=out_cols,
        )

        # DEs is trickier: some genes have mapped twice because I was so diligent in curating the original lists!
        this_de_genes = sorted(set([t[1] for t in this_de_dmrs]))
        this_de_ens = annotation_gene_to_ensembl.gene_to_ens(this_de_genes)
        # drop repeated values, keeping the first occurrence
        this_de_ens = this_de_ens[~this_de_ens.duplicated()]
        this_de_genes = this_de_ens.index

        de_dmr_de_logfc[grp] = pd.DataFrame(
            index=this_de_genes.tolist(),
            columns=out_cols,
        )
        de_dmr_de_fdr[grp] = pd.DataFrame(
            index=this_de_genes.tolist(),
            columns=out_cols,
        )

        # fill them in
        for k in this_sets:
            this_vs = [t for t in venn_set[k] if t[1] in this_de_genes]
            this_dmr_ids = [t[0] for t in this_vs]
            this_gene_ids = [t[1] for t in this_vs]
            # the Venn key is a string of '0' / '1' flags, one per patient (in the order of `pids`)
            this_pids = [pids[i] for i, t in enumerate(k) if t == '1']
            for pid in this_pids:
                the_dat = joint_de_dmr[pid]
                de_dmr_dmr_delta[grp].loc[this_dmr_ids, pid] = the_dat.loc[this_vs, 'dmr_median_delta'].values
                de_dmr_de_logfc[grp].loc[this_gene_ids, pid] = the_dat.loc[this_vs, 'de_logFC'].values
                de_dmr_de_fdr[grp].loc[this_gene_ids, pid] = the_dat.loc[this_vs, 'de_FDR'].values

        # Assign the consistency flags on the frames themselves. Rows yielded by iterrows() may be copies, in
        # which case writing to them has no effect on the underlying table.
        de_dmr_dmr_delta[grp]['consistent'] = _sign_consistency(de_dmr_dmr_delta[grp], pids)
        de_consistent = _sign_consistency(de_dmr_de_logfc[grp], pids)
        de_dmr_de_logfc[grp]['consistent'] = de_consistent
        de_dmr_de_fdr[grp]['consistent'] = de_consistent

    return {
        'dmr_median_delta_m': de_dmr_dmr_delta,
        'de_logFC': de_dmr_de_logfc,
        'de_FDR': de_dmr_de_fdr,
        'de_dmr_groups': de_dmr_groups
    }
def venn_set_to_dataframe(
        data,
        venn_set,
        set_labels,
        include_sets=None,
        full_data=None,
        logfc_col='logFC',
        fdr_col='FDR',
        run_sanity_check=False,
        add_null_set=False,
):
    """
    Given the input DE data and Venn sets, generate a wide format dataframe containing all the data, one column
    per patient and one row per gene.
    Optionally filter the sets to include only a subset.
    Optionally include non-significant results too.
    :param data: Dict containing DE results, keyed by the entries of set_labels. Each table must contain the
    columns named by `logfc_col` and `fdr_col`, plus a 'Direction' column.
    :param venn_set: Dictionary keyed by Venn set key (a string of '0' / '1' flags, one per entry of set_labels).
    Values are the genes in that set.
    :param set_labels: List of labels (e.g. patient IDs), in the same order as the flags of the Venn set keys.
    :param include_sets: If supplied, only the Venn sets with these keys are included.
    :param full_data: If supplied, this has the same format as `data`, but the lists are complete so that even non-
    significant results can be accessed.
    :param logfc_col: The name of the log fold change column in the input data. Also used to name columns in the df.
    :param fdr_col: The name of the FDR column in the input data. Also used to name columns in the df.
    :param run_sanity_check: (default: False) If True, run an additional sanity check at the end. This *should* be
    unnecessary. It's slow for larger numbers of members.
    :param add_null_set: (default: False) If True, append the genes that are in `full_data` but in none of the
    included Venn sets. Requires `full_data`.
    :return: pd.DataFrame with one row per gene. For each label there is a Y/N column, a logFC column and an FDR
    column, followed by a 'consistency' column: 'Y' if the 'Direction' agrees across all labels in which the gene
    is present, 'N' if not.
    :raises ValueError: If add_null_set is requested without full_data.
    :raises AttributeError: If the sanity check finds a gene in more than one Venn set.
    """
    if add_null_set and full_data is None:
        raise ValueError("Can only add_null_set if full_data is supplied.")

    if include_sets is not None:
        venn_set = dict([
            (k, v) for k, v in venn_set.items() if k in include_sets
        ])

    # precompute columns
    # (built with a plain loop: the reduce() builtin is not available on Python 3)
    cols = []
    for t in set_labels:
        cols.extend([t, "%s_%s" % (t, logfc_col), "%s_%s" % (t, fdr_col)])
    cols.append('consistency')

    res = []
    genes_seen = set()
    for k in venn_set:
        the_genes = venn_set[k]
        genes_seen.update(the_genes)

        # populate with individual patient results
        this_block = pd.DataFrame(index=the_genes, columns=cols)
        consistency_check = []
        for i, t in enumerate(k):
            pid = set_labels[i]
            this_logfc_col = "%s_%s" % (pid, logfc_col)
            this_fdr_col = "%s_%s" % (pid, fdr_col)
            if t == '1':
                this_block.loc[:, pid] = 'Y'
                this_block.loc[the_genes, this_logfc_col] = data[pid].loc[the_genes, logfc_col]
                this_block.loc[the_genes, this_fdr_col] = data[pid].loc[the_genes, fdr_col]
                cc = data[pid].loc[the_genes, 'Direction']
                cc.name = pid
                consistency_check.append(cc)
            else:
                this_block.loc[:, pid] = 'N'
                if full_data is not None:
                    # we can't guarantee there will be entries for all genes, as filtering removes some
                    # therefore find matches in advance and only fill in those rows
                    the_genes_present = pd.Index(the_genes).intersection(full_data[pid].index)
                    this_block.loc[the_genes_present, this_logfc_col] = \
                        full_data[pid].loc[the_genes_present, logfc_col]
                    this_block.loc[the_genes_present, this_fdr_col] = \
                        full_data[pid].loc[the_genes_present, fdr_col]

        # assess consistency of DE direction
        # object dtype, because the series holds 'Y' / 'N' strings (an untyped empty Series defaults to float)
        consist = pd.Series(index=the_genes, dtype=object)

        if len(consistency_check) > 0:
            consistency_check = pd.concat(consistency_check, axis=1)
            # a gene is consistent if every column matches the first one
            idx = consistency_check.apply(lambda col: col == consistency_check.iloc[:, 0]).all(axis=1)
            consist.loc[idx] = 'Y'
            consist.loc[~idx] = 'N'

        this_block.loc[:, 'consistency'] = consist
        res.append(this_block)

    # check: no genes should be in more than one data entry
    if run_sanity_check:
        set_keys = list(venn_set)
        # the intersection is symmetric, so it is enough to visit each pair once
        for i in range(len(set_keys)):
            for j in range(i + 1, len(set_keys)):
                bb = len(res[i].index.intersection(res[j].index))
                if bb > 0:
                    raise AttributeError(
                        "Identified %d genes that are in BOTH %s and %s" % (bb, set_keys[i], set_keys[j])
                    )

    if add_null_set:
        all_genes = setops.reduce_union(*[t.index for t in full_data.values()])
        add_genes = all_genes.difference(genes_seen)
        this_block = pd.DataFrame(index=add_genes, columns=cols)
        for pid in set_labels:
            # by definition, no samples are DE positive in the null set
            this_block.loc[:, pid] = 'N'
            the_genes_present = add_genes.intersection(full_data[pid].index)
            this_block.loc[the_genes_present, "%s_%s" % (pid, logfc_col)] = \
                full_data[pid].loc[the_genes_present, logfc_col]
            this_block.loc[the_genes_present, "%s_%s" % (pid, fdr_col)] = \
                full_data[pid].loc[the_genes_present, fdr_col]
        res.append(this_block)

    res = pd.concat(res, axis=0)

    # add gene symbols
    general.add_gene_symbols_to_ensembl_data(res)

    return res
sample_text = dat.columns rad = (np.array(zip(*[svd_res['feat_dat'][i + 1] for i in dims_pair])) ** 2).sum(axis=1) ** .5 to_annotate = rad > selection_radius p1 = generate_plotly_plot( svd_res, filename="pca_biplot_dims_%d-%d" % tuple(t + 1 for t in dims_pair), feature_size_scaling=size_scaling, feature_text=feature_text, sample_text=sample_text, sample_colours=sample_colours, sample_markers=sample_markers, feature_text_mask=~to_annotate ) # export lists for IPA ix_all = sorted(setops.reduce_union(*[t[0.99].index for t in selected_by_quantile_mean_logfc.values()])) ipa_mean_logfc = pd.DataFrame(index=ix_all) for k, v in selected_by_quantile_mean_logfc.items(): ipa_mean_logfc.insert(0, "pc_%s_q99_logFC" % '-'.join([str(t+1) for t in k]), v[0.99]) ipa_mean_logfc.insert(0, "pc_%s_q995_logFC" % '-'.join([str(t+1) for t in k]), v[0.995]) ipa_mean_logfc.to_excel(os.path.join(outdir, "for_ipa_mean_logfc.xlsx")) for k, v in selected_by_quantile_separate_logfc.items(): ix_all = setops.reduce_union(*[v[q].index for q in quantiles]) this = [] for q in quantiles: tt = v[q] tt.columns = ["%s_%d_logFC" % (p, int(q * 1000)) for p in pids] this.append(tt) pd.concat(this, axis=1, sort=True).to_excel(os.path.join(outdir, "for_ipa_separate_logfc_pc%s.xlsx" % '-'.join([str(t+1) for t in k])))
def plot_m_values(
        self,
        mdat,
        probe_locations,
        comparisons,
        colours='default',
        markers='default',
        zorder='default',
        alpha='default',
        size='default',
):
    """
    Scatter plot the M values of individual probes against their genomic coordinate, with one axis per comparison
    and one series per sample. Also updates the stored data range (`mdat_min`, `mdat_max`) and coordinate range
    (`coord_min`, `coord_max`) on this object.

    For each of the styling parameters (colours, markers, zorder, alpha, size) the input may be:
    'default': a different value is assigned to each group automatically
    None: one fixed value is used for every group (colour '0.5', marker 'o', zorder 20, alpha 0.6, size 20)
    a dictionary (anything with a `get` method): used as supplied, keyed by group name
    any other single value: used for every group.

    :param mdat: pd.DataFrame containing the data to plot. Columns are samples, rows are probes
    :param probe_locations: pd.Series containing the probe IDs to include and their genomic coordinates
    :param comparisons: Dictionary keyed by comparison (equivalent to row_names). Each entry is a dictionary keyed
    by group name (e.g. 'Disease' / 'Healthy') and with values giving the samples in that group. The sample names
    must be in the columns of `mdat`.
    :param colours: Colour to use for each group.
    :param markers: Marker to use for each group.
    :param zorder: Z order to use for each group. By default, these increase from 20 following the sorted group
    names.
    :param alpha: Alpha to use for each group. By default, these are spread between 0.4 and 0.6, increasing with
    zorder.
    :param size: Marker size to use for each group. By default, these increase from 20 with zorder.
    :return: None
    """
    all_groups = sorted(
        setops.reduce_union(*(t.keys() for t in comparisons.values())))
    n_groups = len(all_groups)

    def set_property(x, default, default_static):
        """Resolve one styling argument into a lookup keyed by group name."""
        if x == 'default':
            out = dict(zip(all_groups, default))
        elif x is None:
            out = dict([(k, default_static) for k in all_groups])
        elif not hasattr(x, 'get'):
            # single value supplied
            out = dict([(k, x) for k in all_groups])
        else:
            out = x
        return out

    colours = set_property(colours, common.get_best_cmap(n_groups), '0.5')
    markers = set_property(markers, common.get_best_marker_map(n_groups), 'o')
    zorder = set_property(zorder, range(20, 20 + n_groups), 20)

    # default alpha will be based on zorder
    a = sorted([(k, zorder[k]) for k in all_groups], key=lambda x: x[1])
    a_ix = dict([(t[0], i) for i, t in enumerate(a)])
    alpha_values = np.linspace(0.4, 0.6, n_groups)
    alpha_default = [alpha_values[a_ix[k]] for k in all_groups]
    # the static fallback must be numeric: matplotlib expects alpha to be a number or None, not a string
    alpha = set_property(alpha, alpha_default, 0.6)

    # default size will be based on zorder
    s_values = range(20, 20 + n_groups)
    s_default = [s_values[a_ix[k]] for k in all_groups]
    size = set_property(size, s_default, 20)

    # scatter plot individual probes
    # the y range always includes zero
    ymin = 0
    ymax = 0
    for nm in self.row_names:
        grp_dict = comparisons[nm]
        this_ax = self.m_axs[nm]
        for grp_nm, grp_samples in grp_dict.items():
            the_colour = colours.get(grp_nm)
            the_marker = markers.get(grp_nm)
            the_z = zorder.get(grp_nm)
            the_alpha = alpha.get(grp_nm)
            the_s = size.get(grp_nm)
            # one scatter call per sample; items() replaces iteritems(), which newer pandas no longer provides
            for _, x in mdat.loc[probe_locations.index, grp_samples].items():
                this_ax.scatter(
                    probe_locations,
                    x.values,
                    c=the_colour,
                    marker=the_marker,
                    zorder=the_z,
                    alpha=the_alpha,
                    s=the_s,
                    edgecolor='k',
                    linewidth=0.5
                )
                ymin = min(x.values.min(), ymin)
                ymax = max(x.values.max(), ymax)
        this_ax.set_ylabel(nm)

    self.mdat_min = ymin
    self.mdat_max = ymax

    # extend the stored coordinate range, or initialise it on first use
    if self.coord_max is None:
        self.coord_min = probe_locations.min()
        self.coord_max = probe_locations.max()
    else:
        self.coord_min = min(probe_locations.min(), self.coord_min)
        self.coord_max = max(probe_locations.max(), self.coord_max)
de_res_full_s1 = pickle.load(f) else: raise AttributeError( "Unable to load pre-computed DE results, expected at %s" % fn) de_res_s1 = dict([(k, v.loc[v.FDR < de_params['fdr']]) for k, v in de_res_full_s1.items()]) # get the joint table joint_de_dmr_s1 = rnaseq_methylationarray.compute_joint_de_dmr( dmr_res_s1, de_res_s1) # run the dgidb lookup against all genes # have to chunk this operation to avoid error all_genes = sorted( setops.reduce_union(*[t.gene.values for t in joint_de_dmr_s1.values()])) dgi_all = druggable_genome.dgidb_lookup_drug_gene_interactions(all_genes) # manually resolve a few known ambiguities ambig = {'ELTD1': 'ADGRL4', 'ODZ3': 'TENM3'} for k, v in ambig.items(): x = [t for t in dgi_all['ambiguous'][k] if t['geneName'] == v][0] dgi_all['interactions'][k] = x['interactions'] de_dmr_by_member = [joint_de_dmr_s1[pid].index for pid in pids] venn_set, venn_ct = setops.venn_from_arrays(*de_dmr_by_member) # define short and long list # long list ss = setops.specific_sets(pids)
# functional API - the python bindings are incomplete here? cy = CyRestClient() # reset the session (in case something is already loaded) cy.session.delete() # command API - the python bindings are much better cy_cmd = cyrest.cyclient() for pid in pids: # three networks to work with res_syn = res['%s_syngeneic' % pid] res_r1 = res['%s_h9' % pid] res_r2 = res['%s_gibco' % pid] all_pathways = setops.reduce_union( *[t.index for t in (res_syn, res_r1, res_r2)]) p_to_g = dict([(p, gmt[p]) for p in all_pathways]) # to get connectivity, we need to create the complementary dictionary (indexed by genes) g_to_p = {} for p in all_pathways: for g in p_to_g[p]: g_to_p.setdefault(g, []).append(p) # we're going to use passthrough mapping to customise the node colour # we'll define 3 colourmaps, with -log10(p) assigning the shade: # greyscale for syn. and ref. # reds for ref. only # blues for syn. only # colours are defined by HEX values? Add these to the nodes
for j, (k2, v2) in enumerate(v1.iteritems()): k = 0 ax = fig.add_subplot(gs_sub[j, k]) if j == 0: ax.set_title('Hypo') this_members = [v2.index[v2["median_delta_%s" % r] < 0] for r in esc_ref_names] set_labels = None if j == (len(v1) - 1): set_labels = esc_ref_names vd = venn.venn_diagram( *this_members, set_labels=set_labels, set_colors=set_colours_hypo, ax=ax, normalize_to=(len(setops.reduce_union(*this_members)) / set_size_base) ** 2 )[0] plt.setp(vd.patches, edgecolor='k') if vd.set_labels is not None: for lbl in vd.set_labels: xx, yy = lbl.get_position() lbl.set_position([xx * 3, yy]) k = 1 ax = fig.add_subplot(gs_sub[j, k]) if j == 0: ax.set_title('Hyper') this_members = [v2.index[v2["median_delta_%s" % r] > 0] for r in esc_ref_names] vd = venn.venn_diagram( *this_members, set_labels=set_labels,
# these_probes = cor.index[(cor.abs() > cross_corr_threshold) & (pval < alpha)] # myc_corr_probes.append(these_probes) pool.close() pool.join() for p in myc_probes: cor, pval = jobs[p].get(1e4) these_probes = cor.index[(cor.abs() > cross_corr_threshold) & (pval < alpha)] myc_corr_probes.append(these_probes) # out of interest, what is the overlap between these? (presumably quite high?) vs, vc = setops.venn_from_arrays(*myc_corr_probes) # union of probes keep_probes = setops.reduce_union(*myc_corr_probes) print "After comparing all data against each MYC probe, we are left with %d correlated probes" % len( keep_probes) genes_corr_with_myc = the_symbols.loc[keep_probes].dropna() print "These correspond to %d unique genes." % len( genes_corr_with_myc.unique()) # check the overlap with validated genes overlap = pd.Index(validated_genes).intersection( genes_corr_with_myc.unique()) if len(overlap) == len(validated_genes): print "Good news: all %d validated genes are in the genes shortlist." % len( validated_genes) else:
if len(diff_kegg): print "%d genes in the geneset mTOR (KEGG) are not in the data and will be removed: %s" % ( len(diff_kegg), ', '.join(diff_kegg.tolist()) ) for t in diff_kegg: mtor_geneset.remove(t) rna_list_hu['mTOR'] = mtor_geneset # export supplementary tables to_export = the_list_mo.copy() to_export.columns = ['Mouse BMDM', 'Mouse MG'] all_genes_in_set = setops.reduce_union(*the_list_hu.values()) # DEBUG: disable filtering genes - why would we need to? if False: # remove genes that have no appreciable expression level # >=10 samples must have FPKM >= 1 to_keep = ((rnaseq_dat > fpkm_cutoff).sum(axis=1) > fpkm_min_samples) | (rnaseq_dat.index.isin(all_genes_in_set)) print "Keeping %d / %d genes that are sufficiently abundant" % (to_keep.sum(), to_keep.size) rnaseq_dat = rnaseq_dat.loc[to_keep] # run ssGSEA rna_es = gsva.ssgsea(rnaseq_dat, rna_list_hu) ffpe_es = gsva.ssgsea(ffpe_dat, rna_list_hu) # scale using the Z transform # TODO: previous operation had axis=None
"iNSC%s" % insc_pid)]) # for each GIC line: get DE genes in syngeneic comparison but NOT in any cross-comparison n_syn_only = pd.DataFrame(index=pd.Index(pids, name='GIC'), columns=pd.Index(pids, name='iNSC')) syn_only = {} for gic_pid, insc_pid in itertools.product(pids, pids): # this_syn isn't necessarily syngeneic, but we're acting as if it were here this_syn = de_res_sign[("GBM%s" % gic_pid, "iNSC%s" % insc_pid)] others = [ de_res_sign[("GBM%s" % gic_pid, "iNSC%s" % p)] for p in pd.Index(pids).drop(insc_pid) ] # 'syn only' index this_so_ix = this_syn.index.difference( setops.reduce_union(*[t.index for t in others])) syn_only[("GBM%s" % gic_pid, "iNSC%s" % insc_pid)] = this_syn.loc[this_so_ix] n_syn_only.loc[gic_pid, insc_pid] = this_so_ix.size true_syn_only = dict([(p, syn_only[("GBM%s" % p, "iNSC%s" % p)]) for p in pids]) # export to list excel.pandas_to_excel(true_syn_only, os.path.join(outdir, "de_only_in_syngeneic.xlsx")) # export for IPA # we're going to run the true syngeneic (10) against non-syngeneic chosen to give the greatest number of DE genes # in practice, this means fixing the identity of the iNSC selected_insc = ['018', '030', '054', '052']
def __init__(self, loaders, intersection_only=True):
    """
    Class to combine multiple loader objects.
    Each loader represents a separate batch. Inputs can include multiple lane loaders.

    The constructor merges the data of all loaders into ``self.data``, rebuilds a combined ``self.meta`` that
    carries an extra batch column, and renames any sample whose name clashes with one seen in an earlier loader.

    :param loaders: Sequence of loader objects (``len()`` and ``loaders[0]`` are used, so a one-shot iterator
    will not work). The attributes read from each loader are ``meta``, ``meta_is_linked``, ``tax_id``,
    ``row_indexed``, ``batch_id``, ``data``, ``extra_df_attributes`` and, where present, ``input_files``.
    :param intersection_only: If True (default), reduce counts to the indices (e.g. genes) that are
    present in all loaders (inner join); otherwise an outer join is used. Only applied to row-indexed data.
    :raises ValueError: if fewer than two loaders are supplied.
    :raises AttributeError: if ``tax_id`` or ``row_indexed`` differ between the loaders.
    """
    self.logger = log.get_console_logger(self.__class__.__name__)

    if len(loaders) < 2:
        raise ValueError("Must supply 2 or more loaders to use a MultipleBatchLoader.")

    # we can only claim the meta data is linked here if all loaders have this property
    self.meta_is_linked = True
    for l in loaders:
        if not l.meta_is_linked:
            self.meta_is_linked = False

    # set the batch column name avoiding clashes
    # (if 'batch' is already a meta column, try batch_1, batch_2, ... until a free name is found)
    batch_col = 'batch'
    meta_cols = sorted(setops.reduce_union(*[t.meta.columns for t in loaders if t.meta is not None]))
    if batch_col in meta_cols:
        i = 1
        while batch_col in meta_cols:
            batch_col = "batch_%d" % i
            i += 1
    meta_cols += [batch_col]

    # check attributes that must match in all loaders
    if len(set([t.tax_id for t in loaders])) > 1:
        raise AttributeError(
            "The tax_id of the samples differ between loaders: %s" % ', '.join([str(t.tax_id) for t in loaders])
        )
    else:
        self.tax_id = loaders[0].tax_id

    if len(set([t.row_indexed for t in loaders])) > 1:
        raise AttributeError("row_indexed bool must be the same in all loaders")
    else:
        self.row_indexed = loaders[0].row_indexed

    extra_df_attributes = {}

    # row-indexed data are gathered per column and concatenated at the end;
    # otherwise the loaders' data are merged into one dict
    if self.row_indexed:
        row_indexed_dat_arr = {}
    else:
        dat = {}

    meta_values = []
    meta_index = []
    # template row, so that every meta row has an entry (None by default) for every column
    blank_meta_row = dict([(k, None) for k in meta_cols])

    # we may need to append a number to sample names
    sample_appendix = 0
    auto_batch = 1
    meta_auto_idx = 0
    samples_seen = set()

    for l in loaders:
        this_batch = l.batch_id
        if not hasattr(this_batch, '__iter__'):
            # non-iterable batch ID: broadcast it to every sample, numbering the batch automatically if it is None
            # NOTE(review): this dereferences l.meta.index, so it fails for a loader whose meta is None even
            # though later code guards against that case - confirm whether such loaders can reach here.
            if l.batch_id is None:
                this_batch = auto_batch
                auto_batch += 1
            this_batch = pd.Series(this_batch, index=l.meta.index)

        try:
            this_samples = l.input_files.index.tolist()
        except AttributeError:
            # occurs when we are loading a single file
            # FIXME: find a better catch - this is too general
            if hasattr(l, 'input_files'):
                # this occurs if l is a single file loader
                ## FIXME: single file loaders may contain multiple samples
                ## in that case, this doesn't spot name clashes!!
                # FIXME: here's a workaround for now: may not be bulletproof
                this_samples = [l.input_files]
                if len(this_samples) != len(l.meta.index):
                    this_samples = l.meta.index.tolist()
            else:
                # this occurs if l is a batch loader
                # FIXME: may not give us valid sample names?
                this_samples = l.meta.index.tolist()

        # get a copy of the data
        if self.row_indexed:
            this_dat = l.data.copy()
        else:
            this_dat = copy.copy(l.data)

        # get a copy of meta
        # (this_meta is only (re)bound when the loader has meta)
        if l.meta is not None:
            this_meta = l.meta.copy()

        # resolve any sample clashes in the data (NOT the meta data)
        clash_resolved = False
        new_names = []

        # each pass appends a fresh numeric suffix to the clashing names; repeat until nothing clashes
        while len(samples_seen.intersection(this_samples)) > 0:
            sample_appendix += 1
            # find the clash
            clashes = samples_seen.intersection(this_samples)
            self.logger.warning(
                "Found sample name clash(es): %s. Modifying names to avoid errors.",
                ', '.join(clashes)
            )
            for c in clashes:
                # record the (previous, new) pair before renaming in place
                new_names.append([
                    this_samples[this_samples.index(c)],
                    this_samples[this_samples.index(c)] + "_%d" % sample_appendix
                ])
                this_samples[this_samples.index(c)] += "_%d" % sample_appendix
            clash_resolved = True
        samples_seen.update(this_samples)

        if clash_resolved:
            # relabel metadata if linked
            if l.meta_is_linked:
                # reorder first to be sure it's the same as data
                this_meta = this_meta.loc[this_dat.columns]
                this_meta.index = this_samples

            # relabel the data
            if self.row_indexed:
                this_dat.columns = this_samples
            else:
                for prev, new in new_names:
                    this_dat[new] = this_dat.pop(prev)
            # relabel the batch IDs
            this_batch.index = this_samples

            # relabel any other DF data if present
            # NOTE(review): no copy is taken here, so the column labels are changed on the loader's own object.
            for fld in l.extra_df_attributes:
                x = getattr(l, fld)
                x.columns = this_samples

        # data
        if self.row_indexed:
            # for MultiIndex columns, split on the top level only
            if isinstance(this_dat.columns, pd.MultiIndex):
                col_list = this_dat.columns.levels[0].tolist()
            else:
                col_list = this_dat.columns.tolist()
            for c in col_list:
                row_indexed_dat_arr[c] = this_dat[[c]]
        else:
            dat.update(this_dat)

        # other df attributes
        # (first occurrence is copied; later occurrences are appended column-wise)
        for fld in l.extra_df_attributes:
            if fld not in extra_df_attributes:
                extra_df_attributes[fld] = getattr(l, fld).copy()
            else:
                extra_df_attributes[fld] = pd.concat((extra_df_attributes[fld], getattr(l, fld)), axis=1)

        # rebuild meta
        if l.meta is not None:
            for i in this_meta.index:
                this_row = dict(blank_meta_row)
                this_row.update(this_meta.loc[i].to_dict())
                this_row[batch_col] = this_batch[i]
                meta_values.append(this_row)
                # linked meta keeps the sample label as its index; otherwise a running integer is used
                if l.meta_is_linked:
                    meta_index.append(i)
                else:
                    meta_index.append(meta_auto_idx)
                    meta_auto_idx += 1
        else:
            # no meta available: create one row per data column holding only the batch ID
            for c in this_dat.columns:
                this_row = dict(blank_meta_row)
                this_row[batch_col] = this_batch[c]
                meta_values.append(this_row)
                meta_index.append(meta_auto_idx)
                meta_auto_idx += 1

    self.meta = pd.DataFrame(meta_values, index=meta_index, columns=meta_cols)

    if intersection_only:
        join = 'inner'
    else:
        join = 'outer'

    if self.row_indexed:
        # NOTE(review): the per-column frames are looked up by the labels in self.meta.index, which
        # presumably requires those labels to equal the data column names (i.e. linked meta) - confirm.
        dat = pd.concat(
            [row_indexed_dat_arr[k] for k in self.meta.index], axis=1, sort=True, join=join
        )

    self.data = dat
    self.batch_id = self.meta.loc[:, batch_col]

    # expose each combined extra attribute on this object and record its name
    self.extra_df_attributes = tuple()
    for fld in extra_df_attributes:
        setattr(self, fld, extra_df_attributes[fld])
        self.extra_df_attributes += (fld,)
indir = os.path.join(GIT_LFS_DATA_DIR, 'ipa_from_biplots') # single components for q in [99, 995]: ipa_pathways_single = {} for dim in range(1, 4): fn = os.path.join(indir, "%d_%d.txt" % (dim, q)) this = pd.read_csv(fn, sep='\t', skiprows=2, header=0, index_col=0) this.columns = ['-logp', 'ratio', 'z', 'genes'] # add ngenes column this.insert(3, 'n_gene', this.genes.str.split(',').apply(len)) this.index = [x.decode('utf-8') for x in this.index] ipa_pathways_single[dim] = this ipa_single = pd.DataFrame(index=sorted(setops.reduce_union(*[ipa_pathways_single[i].index for i in range(1, 4)]))) [ipa_single.insert(0, i, ipa_pathways_single[i]['-logp']) for i in range(1, 4)[::-1]] ipa_single.fillna(0., inplace=True) # drop rows with no significant results ipa_single = ipa_single.loc[(ipa_single > -np.log10(0.05)).sum(axis=1) > 0] p_order = ipa_single.sum(axis=1).sort_values(ascending=False).index fig = plt.figure(figsize=(7., 9.8)) ax = fig.add_subplot(111) sns.heatmap( ipa_single.loc[p_order], mask=ipa_single.loc[p_order] == 0, cmap='YlOrRd', linewidths=.2, linecolor='w', cbar_kws={"orientation": 'vertical', "shrink": 0.6}, ax=ax
svd_res, filename="pca_biplot_dims_%d-%d" % tuple(t + 1 for t in dims_pair), feature_size_scaling=size_scaling, feature_text=feature_text, sample_text=sample_text, sample_colours=sample_colours, sample_markers=sample_markers, feature_text_mask=~to_annotate, components=tuple(i + 1 for i in dims_pair), ) # export lists for IPA ix_all = sorted( setops.reduce_union( *[t.index for t in selected_by_quantile_mean_logfc.values()])) ipa_mean_logfc = pd.DataFrame(index=ix_all) for k, v in selected_by_quantile_mean_logfc.items(): ipa_mean_logfc.insert( 0, "pc_%s_logFC" % '-'.join([str(t + 1) for t in k]), v) ipa_mean_logfc.to_excel(os.path.join(outdir, "for_ipa_mean_logfc.xlsx")) # for separated data, combine single and paired PC for maximum efficiency for first_dim in dims: dims_pair = (first_dim, first_dim + 1) ix_all = setops.reduce_union(*[ selected_by_quantile_separate_logfc[k].index for k in [(first_dim, ), dims_pair] ]) this_df = pd.DataFrame(index=ix_all)
if __name__ == "__main__":
    """
    Here I'm trying to assemble a function that automates statistical testing of upset plot intersection sizes against
    a fixed-set-size uniform random null.
    """
    n_iter = 1000

    # Toy input sets. The ranges are materialised as lists explicitly: on Python 3 `range` objects cannot be
    # concatenated with `+`, whereas list(range(...)) gives the same result on Python 2 and Python 3.
    data = {
        'A': list(range(5)) + list(range(10, 16)),
        'B': list(range(0, 21, 2)),
        'C': list(range(1, 22, 2)),
        'D': list(range(0, 25, 4))
    }

    K = len(data)
    full = setops.reduce_union(*data.values())
    N = len(full)
    set_sizes = collections.OrderedDict([(k, len(v)) for k, v in data.items()])
    # n_intersections = int(sum([special.comb(K, i, exact=True) for i in range(1, K + 1)]))
    intersections = list(setops.binary_combinations(K))

    simulated_sizes = collections.defaultdict(list)

    # submit one asynchronous job per iteration, then wait for all of them to finish
    pool = mp.Pool()
    jobs = {}
    for i in range(n_iter):
        jobs[i] = pool.apply_async(one_random_perm, args=(set_sizes, N))

    pool.close()
    pool.join()

    # collect the result of each job
    for i, j in jobs.items():
        vc = j.get()
ax = fig.add_subplot(gs_sub[j, k]) if j == 0: ax.set_title('Hypo') this_members = [ v2.index[v2["median_delta_%s" % r] < 0] for r in esc_ref_names ] set_labels = None if j == (len(v1) - 1): set_labels = esc_ref_names vd = venn.venn_diagram( *this_members, set_labels=set_labels, set_colors=set_colours_hypo, ax=ax, normalize_to=(len(setops.reduce_union(*this_members)) / set_size_base)**2)[0] plt.setp(vd.patches, edgecolor='k') if vd.set_labels is not None: for lbl in vd.set_labels: xx, yy = lbl.get_position() lbl.set_position([xx * 3, yy]) k = 1 ax = fig.add_subplot(gs_sub[j, k]) if j == 0: ax.set_title('Hyper') this_members = [ v2.index[v2["median_delta_%s" % r] > 0] for r in esc_ref_names ] vd = venn.venn_diagram(
c5_gmt = gsea.read_gmt_file(msigdb_c5_fn) keep_pathways = c5_gmt.keys() res = collections.OrderedDict() res_full = collections.OrderedDict() for pid in pids: for c in comparison_names: fn = os.path.join(indir, "%s%s.csv" % (pid, c)) this = pd.read_csv(fn, sep='\t', header=0, index_col=0, usecols=[0, 3, 5, 7]) this.columns = ['n_gene', 'nes', 'fdr'] this = this.reindex(keep_pathways).dropna(how='all') res_full["%s_%s" % (pid, comparison_names[c])] = this.loc[this.fdr < alpha_relevant] res["%s_%s" % (pid, comparison_names[c])] = this.loc[this.fdr < alpha] pathways_sign = sorted(setops.reduce_union(*[t.index for t in res.values()])) pathways_rele = sorted(setops.reduce_union(*[t.index for t in res_full.values()])) excel.pandas_to_excel(res, os.path.join(outdir, "gsea_results_significant_by_patient.xlsx")) # use this list to export a second wideform Excel file with the top list of pathways for_export = pd.DataFrame(index=pathways_sign, columns=['n_gene']) nes_columns = [] fdr_columns = [] for k, v in res.items(): for_export.loc[v.index, 'n_gene'] = v.n_gene this_yn = pd.Series('N', index=pathways_sign) this_yn.loc[v.index] = 'Y' for_export.insert( for_export.shape[1], k,
# write the full S1 DE results to disk
with open(fn, 'wb') as fout:
    pickle.dump(de_res_full_s1, fout)
logger.info("Saved S1 DE results to %s", fn)

# extract only significant DE genes
de_res_s1 = {}
for k, v in de_res_full_s1.items():
    de_res_s1[k] = v.loc[v.FDR < de_params['fdr']]

# generate wide-form lists and save to Excel file
de_by_member = []
for pid in pids:
    de_by_member.append(de_res_s1[pid].index)
venn_set, venn_ct = setops.venn_from_arrays(*de_by_member)

# add null set manually from full DE results
# (the all-zeros key holds everything in the first patient's full results that is DE in no patient)
de_genes_all = setops.reduce_union(*venn_set.values())
k_null = '0' * len(pids)
null_members = de_res_full_s1[pids[0]].index.difference(de_genes_all)
venn_set[k_null] = list(null_members)
venn_ct[k_null] = len(venn_set[k_null])

de_data = setops.venn_set_to_wide_dataframe(
    de_res_s1,
    venn_set,
    pids,
    full_data=de_res_full_s1,
    cols_to_include=['logFC', 'FDR'],
    consistency_check_col='logFC',
    consistency_check_method='sign'
)

# add gene symbols back in
general.add_gene_symbols_to_ensembl_data(de_data)
de_data.to_excel(os.path.join(outdir, 'full_de.xlsx'))
# Per-patient containers: classified DMR tables, combined DE/DMR results and the genes found in both.
dmrs_classified = {}
dedmr_results = {}
both_genes = {}
for pid in pids:
    fn = os.path.join(dmr_indir, "iPSC%s_classified_dmrs.csv" % pid)
    # patients without a classified DMR file are skipped
    if os.path.exists(fn):
        this_dmr = pd.read_csv(fn, header=0, index_col=0)
        # the 'genes' column is converted with make_tuple (defined elsewhere)
        # presumably this parses a stringified tuple of gene names - TODO confirm
        this_dmr.loc[:, 'genes'] = this_dmr.genes.apply(make_tuple)
        dmrs_classified[pid] = this_dmr
        if "iPSC_%s_ours" % pid in ipsc_esc_fb:
            this_de_res = ipsc_esc_fb["iPSC_%s_ours" % pid]
            de_genes = this_de_res.loc[:, 'Gene Symbol'].dropna()
            dmr_genes = this_dmr.loc[:, 'genes'].values
            # union over all per-DMR gene collections (guarding against an empty table)
            dmr_genes = setops.reduce_union(*dmr_genes) if len(dmr_genes) else []
            # genes that are both DE and associated with a DMR
            both_genes[pid] = set(de_genes).intersection(dmr_genes)
            if len(both_genes[pid]):
                # DE
                the_de_res = this_de_res.loc[this_de_res['Gene Symbol'].isin(both_genes[pid])]
                the_de_res = the_de_res.loc[:, ~the_de_res.columns.str.contains('Direction')]
                the_de_res.set_index('Gene Symbol', inplace=True)  # TODO: may break if there are duplicates?

                # DMR
                # NOTE(review): the gene names are joined into a regex alternation and matched against the
                # stringified 'genes' entry, so this is a regex substring match. Names containing regex
                # metacharacters, or names that are substrings of other gene names, may select unintended rows.
                the_dmr_res = this_dmr.loc[this_dmr.genes.astype(str).str.contains('|'.join(both_genes[pid]))]
','.join(t) if hasattr(t, '__iter__') else '' for t in to_add.UCSC_RefGene_Group ]) new_dat[k] = df excel.pandas_to_excel( new_dat, os.path.join(outdir, fn.replace('.xlsx', '.annotated.xlsx'))) dmp_fn = os.path.join(indir, 'dmps_3021_swan.xlsx') dmps = pd.read_excel(dmp_fn, header=0, index_col=0, sheet_name=None) # combine all DMPs into a single wideform cols = reduce( lambda x, y: x + y, [['%s' % t, '%s_logFC' % t, '%s_FDR' % t] for t in dmps]) all_probes = setops.reduce_union( *[v.loc[v['adj.P.Val'] < 0.05].index for v in dmps.values()]) all_probes = all_probes.intersection(anno.index) dmps_all = pd.DataFrame(index=all_probes, columns=['CHR', 'coord', 'genes'] + cols) dmps_all.loc[:, 'CHR'] = anno.loc[dmps_all.index, 'CHR'] dmps_all.loc[:, 'coord'] = anno.loc[dmps_all.index, 'MAPINFO'] dmps_all.loc[:, 'genes'] = anno.loc[dmps_all.index, 'UCSC_RefGene_Name'] dmps_all.loc[:, dmps.keys()] = False for k, v in dmps.items(): this = v.loc[v['adj.P.Val'] < 0.05] this = this.loc[this.index.intersection(all_probes)] dmps_all.loc[this.index, k] = True dmps_all.loc[this.index, "%s_logFC" % k] = this['logFC'] dmps_all.loc[this.index, "%s_FDR" % k] = this['adj.P.Val']
def all_comparison_groups(self):
    """
    Get every comparison group name that appears in the DMR comparison groups.
    :return: Sorted list holding the union of the keys of each value in self.dmr_comparison_groups
    :raises ValueError: if self.dmr_comparison_groups is None
    """
    groups = self.dmr_comparison_groups
    if groups is None:
        raise ValueError("Must first call set_dmr_res.")

    # one collection of keys per entry, merged into a single set-like union before sorting
    key_collections = [member.keys() for member in groups.values()]
    return sorted(setops.reduce_union(*key_collections))
def plot_legend(self, figsize=None):
    """
    Generate a figure showing the interpretation of the various colours / markers.

    Two horizontal colour strips are drawn (one for DM, one for DE). If DMR comparison groups have been set,
    a third axis holding a custom marker legend (one entry per comparison group) is added on the right.

    :param figsize: Optional figure size, passed to plt.figure via the figure kwargs. If None, the default is
    (4, min(2, n_comparison_groups / 3)) when no DMR comparison groups are set and (5.5, 2) otherwise.
    :return: Dictionary with keys 'fig', 'gs', 'legend_objects', 'legend', 'heatmaps', 'dm_ax', 'de_ax' and
    'leg_ax'. 'legend_objects', 'legend' and 'leg_ax' are None when no DMR comparison groups are set.
    """
    the_fig_kws = dict(self.fig_kws)
    if self.dmr_comparison_groups is None:
        if figsize is None:
            height = min(2., self.n_comparison_groups / 3.)
            figsize = (4., height)
        the_fig_kws['figsize'] = figsize
        fig = plt.figure(**the_fig_kws)
        # no legend
        gs = plt.GridSpec(nrows=2, ncols=1)
        dm_ax = fig.add_subplot(gs[0])
        de_ax = fig.add_subplot(gs[1])
        leg_ax = None
    else:
        # only fall back to the default size if the caller did not request one
        # (previously a supplied figsize was silently overwritten in this branch)
        if figsize is None:
            figsize = (5.5, 2.)
        the_fig_kws['figsize'] = figsize
        fig = plt.figure(**the_fig_kws)
        gs = plt.GridSpec(nrows=2, ncols=2, width_ratios=[5, 1])
        dm_ax = fig.add_subplot(gs[0, 0])
        de_ax = fig.add_subplot(gs[1, 0])
        leg_ax = fig.add_subplot(gs[:, 1], frameon=False)
        # tick_params expects booleans here; the string 'off' is truthy and so does not reliably hide ticks
        leg_ax.tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False)
        leg_ax.grid(False)

    # NOTE: `or` means that a configured limit of 0 (or None) falls back to the default value
    de_vmin = self.de_vmin or -5
    de_vmax = self.de_vmax or 5
    dm_vmin = self.dm_vmin or -8
    dm_vmax = self.dm_vmax or 8

    for_heatmap = {
        'de': {
            'vmin': de_vmin,
            'vmax': de_vmax,
            'cmap': self.de_direction_colour,
            'ax': de_ax,
            'label': "DE log2(fold change)",
        },
        'dm': {
            'vmin': dm_vmin,
            'vmax': dm_vmax,
            'cmap': self.dm_direction_colour,
            'ax': dm_ax,
            'label': r"DM median $\Delta$M",
        },
    }

    heatmaps = {}
    for k, d in for_heatmap.items():
        if isinstance(d['cmap'], colors.LinearSegmentedColormap):
            the_cmap = d['cmap']
        else:
            # anything else is treated as a callable mapping a value to a colour: sample it at 256 values
            # between vmin and vmax and build a colormap from the resulting colours
            the_cmap = colors.LinearSegmentedColormap.from_list(
                k,
                [d['cmap'](t) for t in np.linspace(d['vmin'], d['vmax'], 256)],
                N=256
            )

        # draw a two-row strip spanning vmin to vmax, coloured by its own x value
        strip = np.linspace(d['vmin'], d['vmax'], 257)
        heatmaps[k] = d['ax'].pcolor(
            [strip] * 2,
            [np.zeros(257), np.ones(257)],
            [strip] * 2,
            cmap=the_cmap
        )
        d['ax'].yaxis.set_ticks([])
        d['ax'].set_xlabel(d['label'], fontsize=14)

    # custom legend (if we have the groups needed to plot it)
    leg = None
    hleg = None
    if self.dmr_comparison_groups is not None:
        all_groups = sorted(
            setops.reduce_union(
                *(t.keys() for t in self.dmr_comparison_groups.values())))
        # attributes shared by every legend entry; per-group values are overlaid below
        type_attrs = {
            'class': 'line',
            'linestyle': 'none',
            'markeredgecolor': 'k',
            'markeredgewidth': 1.,
            'markerfacecolor': 'none',
            'markersize': 20
        }
        leg_dict = {}
        for nm in all_groups:
            leg_dict[nm] = dict(type_attrs)
            leg_dict[nm]['markerfacecolor'] = self.colours.get(nm)
            leg_dict[nm]['marker'] = self.markers.get(nm)
            leg_dict[nm]['alpha'] = self.alpha.get(nm)
            leg_dict[nm]['markersize'] = self.size.get(nm)

        leg = common.add_custom_legend(leg_ax, leg_dict, loc='center', fontsize=14)
        hleg = leg_ax.get_legend()
        hleg.set_frame_on(False)

    gs.update(bottom=0.3, top=0.98, left=0.04, right=0.95, wspace=0.05, hspace=2.)

    return {
        'fig': fig,
        'gs': gs,
        'legend_objects': leg,
        'legend': hleg,
        'heatmaps': heatmaps,
        'dm_ax': dm_ax,
        'de_ax': de_ax,
        'leg_ax': leg_ax
    }
ipa_signatures = ipa.load_supported_signatures_from_raw(
    IPA_PATHWAY_DIR,
    "de_s2_{0}_{1}.txt",
    [pids, comparisons],
    pathways=ipa_res.index
)

cy_obj = cyto.CytoscapeSession()
nx_graphs = {}

# one network per patient:
for pid in pids:
    # for each comparison, keep only the pathways whose -logp meets the strict threshold
    this_ipa = []
    for c in comparisons:
        this_res = all_ipa[(pid, c)]
        this_ipa.append(this_res.loc[this_res['-logp'] >= log_alpha_strict])

    all_pathways = setops.reduce_union(*[t.index for t in this_ipa])

    # pathway -> union of the genes listed for it in any comparison
    p_to_g = {}
    for p in all_pathways:
        genes_per_comparison = []
        for t in this_ipa:
            if p in t.index:
                genes_per_comparison.append(t.loc[p, 'genes'].split(','))
            else:
                genes_per_comparison.append([])
        p_to_g[p] = setops.reduce_union(*genes_per_comparison)

    # to get connectivity, we need to create the complementary dictionary (indexed by genes)
    g_to_p = {}
    for p in all_pathways:
        for g in p_to_g[p]:
            if g not in g_to_p:
                g_to_p[g] = []
            g_to_p[g].append(p)

    # we're going to use passthrough mapping to customise the node colour