def run(json_file, kallisto_spec, eVIPP_predict, output_name):
    """Plot the overlap of selected gene sets, restricted to listed genes.

    Parameters
    ----------
    json_file : path to a JSON object mapping a set name to a list of genes.
    kallisto_spec : tab-separated file; its ``#gene_id`` column lists the
        genes that may appear in the plot.
    eVIPP_predict : tab-separated file; its ``Pathway`` column lists the set
        names to keep from ``json_file``.
    output_name : path the figure is saved to.

    Nothing is plotted or written when fewer than two gene sets remain after
    subsetting.
    """
    # Sets give O(1) membership tests; the original scanned a list for every
    # gene of every pathway.
    spec_genes = set(
        pd.read_csv(kallisto_spec, sep="\t", index_col="#gene_id").index.tolist()
    )
    pathways = set(
        pd.read_csv(eVIPP_predict, sep="\t", index_col="Pathway").index.tolist()
    )

    with open(json_file) as f:
        gene_set_dict = json.load(f)

    # Keep only the gene sets named in the prediction table.
    gene_set_dict = {k: v for k, v in gene_set_dict.items() if k in pathways}

    if len(gene_set_dict) <= 1:
        return

    spec_dict = {
        name: [gene for gene in genes if gene in spec_genes]
        for name, genes in gene_set_dict.items()
    }

    e = from_contents(spec_dict)
    upsetplot.plot(e, sort_by='cardinality',
                   sort_categories_by='cardinality', show_counts=True)
    plt.savefig(output_name)
    plt.clf()
def getLevels(R, L, k):
    """Grow itemsets level by level and show an UpSet plot for each level.

    ``R[n]`` holds the itemsets of level ``n`` and ``L[n]`` the matching
    collections that ``intersection`` is applied to; both lists are extended
    in place. Each level-``n`` itemset is combined with every level-1 entry
    that ``checkExists`` reports as absent, and the combination is kept when
    the intersection has at least ``k`` elements. The function returns
    ``None`` as soon as a level produces no candidates.
    """
    level = 1
    while True:
        candidates = []
        candidate_tids = []

        for i, itemset in enumerate(R[level]):
            for j, single in enumerate(R[1]):
                # Explicit comparison kept on purpose: it differs from
                # ``not ...`` if checkExists ever returns None.
                if checkExists(single, itemset) == False:  # Fix this to work with lists
                    shared = intersection(L[level][i], L[1][j])
                    if len(shared) >= k:
                        if level == 1:
                            merged = [itemset, single]
                        else:
                            merged = itemset + [single]
                        candidates.append(merged)
                        candidate_tids.append(shared)

        if not candidates:
            return

        nxt = level + 1
        R.append(candidates)
        L.append(candidate_tids)
        R[nxt], L[nxt] = checkDuplicates(R[nxt], L[nxt])

        sizes = [len(tids) for tids in L[nxt]]

        print("\nLevel ", nxt, "--> Number of itemsets = ", len(R[nxt]))
        print("\n")

        upset = from_memberships(R[nxt], data=sizes)
        plot(upset)
        pyplot.show()

        level = nxt
def make_UpSetPlot(GT ,bp_lists, vcf_names):
    """Plot how the breakpoints in ``GT`` are shared between callers.

    Parameters
    ----------
    GT : sequence of breakpoints to classify.
    bp_lists : one collection of breakpoints per VCF, in the same order as
        ``vcf_names``.
    vcf_names : names used as the set labels and in the output file name.

    The figure is saved as ``'UpSetPlot <names>'`` in the working directory
    and then shown.
    """
    # One boolean column per caller: is each GT breakpoint in that caller's
    # list? (The original tested an undefined name against ``bp_lists``.)
    bools = [[bp in bplist for bp in GT] for bplist in bp_lists]

    dic = {}
    for nr, vcf in enumerate(vcf_names):
        dic[vcf] = bools[nr]
    dic['breakpoints'] = GT

    df = pd.DataFrame(dic)
    cols = df.columns.difference(['breakpoints']).tolist()
    s = df.groupby(cols).size()

    plot(s, show_counts='%d', sort_by="cardinality")
    plt.title('Intersection breakpoints SV callers')

    # Save before show(): once show() returns the figure may already be
    # closed, and gcf() would then hand back a new, empty one.
    currentfig = plt.gcf()
    currentfig.savefig('UpSetPlot %s' % ' '.join(vcf_names))
    plt.show()
def upset_members(self, threshold=0, path=None, plot_upset=False, show_counts_bool=True, exclude_singletons_from_threshold=False, threshold_dual_cats=None, exclude_skids=None):
    """Split members into intersection categories and apply size thresholds.

    Parameters
    ----------
    threshold : minimum number of members a category needs to be selected.
    path : file-name prefix for the saved PDF (used only if ``plot_upset``).
    plot_upset : draw and save the UpSet plot of the selected members.
    show_counts_bool : annotate the plot with counts.
    exclude_singletons_from_threshold : if truthy, categories whose name
        contains no ``' and '`` are selected regardless of ``threshold``.
    threshold_dual_cats : optional separate minimum size for categories made
        of at most two cell types; only used together with
        ``exclude_singletons_from_threshold``.
    exclude_skids : optional collection of ids dropped before categorising.

    Returns
    -------
    tuple ``(cat_types, cats_selected, skids_excluded)``: every category,
    the categories that passed, and the ids in categories that did not.
    """
    contents = {celltype.get_name(): celltype.get_skids()
                for celltype in self.Celltypes}
    data = from_contents(contents)

    # ``is not None`` rather than ``!= None``: the latter is evaluated
    # element-wise on array-likes and cannot be used in an ``if``.
    if exclude_skids is not None:
        ind_dict = {skid: pos for pos, skid in enumerate(data.id.values)}
        inter = set(ind_dict).intersection(exclude_skids)
        indices = [ind_dict[x] for x in inter]
        data = data.iloc[np.setdiff1d(np.arange(len(data)), indices)]

    unique_indices = np.unique(data.index)
    cat_types = [
        Celltype(
            ' and '.join([data.index.names[i]
                          for i, value in enumerate(index) if value]),
            list(data.loc[index].id),
        )
        for index in unique_indices
    ]

    def _is_selected(cat):
        """Return True when ``cat`` passes the configured thresholds."""
        n_members = len(cat.get_skids())
        name = cat.get_name()
        if n_members >= threshold:
            return True
        if not exclude_singletons_from_threshold:
            return False
        # Categories with no intersection ('singletons') dodge the threshold.
        if ' and ' not in name:
            return True
        if threshold_dual_cats is None:
            return False
        # Names are joined with ' and ', so that separator is what has to be
        # counted; the original counted '+', which never restricted the
        # dual threshold to dual combinations.
        return n_members >= threshold_dual_cats and name.count(' and ') < 2

    # Plain list filtering instead of a numpy object array with a boolean
    # mask; the result is still a list of category objects.
    cats_selected = [cat for cat in cat_types if _is_selected(cat)]
    skids_selected = [skid for cat in cats_selected for skid in cat.get_skids()]

    # Keep only the rows of the selected ids, in their original row order.
    ind_dict = {skid: pos for pos, skid in enumerate(data.id.values)}
    inter = set(ind_dict).intersection(skids_selected)
    indices = sorted(ind_dict[x] for x in inter)
    data = data.iloc[indices]

    # Ids that are not plotted because their category missed the threshold.
    all_skids = [skid for cat in cat_types for skid in cat.get_skids()]
    skids_excluded = list(np.setdiff1d(all_skids, skids_selected))

    if plot_upset:
        if show_counts_bool:
            plot(data, sort_categories_by=None, show_counts='%d')
        else:
            plot(data, sort_categories_by=None)

        if threshold_dual_cats is None:
            plt.savefig(f'{path}_excluded{len(skids_excluded)}_threshold{threshold}.pdf', bbox_inches='tight')
        else:
            plt.savefig(f'{path}_excluded{len(skids_excluded)}_threshold{threshold}_dual-threshold{threshold_dual_cats}.pdf', bbox_inches='tight')

    return (cat_types, cats_selected, skids_excluded)
def start(self):
    """Run the pipeline: load, optionally FDR-correct, count, plot, save.

    Reads the file at ``self.decon_path`` with ``self.load_file``, keeps the
    columns whose name contains ``"pvalue"``, applies ``self.bh_correct``
    when ``self.calc_fdr`` is set, and saves one UpSet plot per entry of
    ``self.extensions`` under ``self.outdir``.
    """
    self.print_arguments()

    print("Loading data.")
    loaded = self.load_file(self.decon_path, nrows=None)
    pvalue_columns = [name for name in loaded.columns if "pvalue" in name]
    decon_df = loaded[pvalue_columns]

    if self.calc_fdr:
        print("Calculating FDR.")
        _, decon_df = self.bh_correct(decon_df)
        variable = "FDR"
    else:
        variable = "p-value"

    self.print_n_signif(df=decon_df, variable=variable)

    print("Preprocessing data.")
    data = self.parse_df(decon_df, self.alpha)
    counts = self.count(data)
    # Only non-empty intersections are plotted.
    counts = counts[counts > 0]
    print(counts)

    print("Creating plot.")
    up.plot(counts, sort_by='cardinality', show_counts=True)
    for extension in self.extensions:
        filename = "{}.{}".format(self.name, extension)
        plt.savefig(os.path.join(self.outdir, filename))
    plt.close()
def test_two_sets(set1, set2):
    """Plotting must not fail when some set has no items.

    Regression test for a bug where processing failed in that situation.
    """
    frame = pd.DataFrame({'val': [5, 7], 'set1': set1, 'set2': set2})
    values = frame.set_index(['set1', 'set2'])['val']
    fig = matplotlib.figure.Figure()
    plot(values, fig)
def test_dataframe_raises():
    """Passing a DataFrame to ``plot`` must raise the 'sum_over' ValueError."""
    records = {
        'val': [5, 7],
        'set1': [False, True],
        'set2': [True, True],
    }
    df = pd.DataFrame(records).set_index(['set1', 'set2'])
    fig = matplotlib.figure.Figure()
    with pytest.raises(ValueError, match='sum_over must be'):
        plot(df, fig)
def Upsetplotting(name_file,name_output,folder):
    """Plot the 12 most frequent combinations of six region flags.

    ``name_file`` is a tab-separated file with one header line. Fields 7 to
    12 (0-based) are read as the flags Upstream, UTR5, Exon, Intron, UTR3
    and Downstream; a field equal to ``"1"`` counts as True. The plot is
    written to ``<folder>/<name_output>_Upsetplot.pdf``.
    """
    region_names = ["Upstream", "UTR5", "Exon", "Intron", "UTR3", "Downstream"]
    first_field = 7
    flags = {name: [] for name in region_names}

    # ``with`` closes the file; the original left the handle open.
    with open(name_file) as data:
        next(data, None)  # skip the header line
        for line in data:
            # Drop the line ending first, otherwise the last field of a line
            # is "1\n" and never compares equal to "1".
            fields = line.rstrip('\r\n').split('\t')
            for offset, name in enumerate(region_names):
                flags[name].append(fields[first_field + offset] == "1")

    concat = pd.concat([pd.Series(flags[name]) for name in region_names],
                       axis=1, keys=region_names)
    result = concat.groupby(region_names).size()
    result = result.nlargest(12)

    plot(result, sort_by = "cardinality")
    pyplot.suptitle("Intersection size")
    pyplot.savefig(folder+"/"+name_output+"_Upsetplot.pdf")
def generate_upsetplot(rapport, names, min_alt, path):
    """Save an UpSet plot of which samples support each row.

    Each cell in the ``names`` columns is a ``'/'``-separated string; the
    integer before the first ``'/'`` is compared with ``min_alt``. Rows
    where every sample is below ``min_alt`` are dropped. The figure goes to
    ``<path>/upsetplot_<min_alt>.png``.

    NOTE(review): the count column is ``'SNV'``, so the index of ``rapport``
    is presumably named 'SNV' -- confirm against the caller.
    """
    def leading_count(values):
        # Integer in front of the first '/'.
        return values.str.split('/').str[0].astype(int)

    # Remove rows with no sample reaching min_alt.
    has_support = rapport[names].apply(
        lambda row: not (leading_count(row) < min_alt).all(), axis=1)
    rapport = rapport[has_support]

    upsetframe = rapport[names].reset_index()
    for sample in names:
        supported = leading_count(upsetframe[sample]) >= min_alt
        upsetframe.loc[supported, sample] = True
        upsetframe.loc[upsetframe[sample] != True, sample] = False

    sample_columns = [col for col in upsetframe.columns if col != 'SNV']
    per_combination = upsetframe.fillna(False).groupby(sample_columns).count()
    samples_count_series = per_combination['SNV']

    upsetplot.plot(samples_count_series, sort_by='cardinality')
    figure = plt.gcf()
    plt.title("Overlaps strict filtered SNVs", fontsize=15)
    plt.ylabel("SNV count")
    target = os.path.join(path, 'upsetplot_' + str(min_alt) + '.png')
    figure.savefig(target)
def plot_intersection(ins_dict, save_fig=False):
    """
    Draw an UpSet plot with one bar per non-empty combination of instruments.

    The value for a combination is what ``_count_intersection`` returns for
    the index sets of the instruments in it.

    Parameters
    ----------
    ins_dict: dictionary
        instrument name -> object with an ``index`` attribute
    save_fig: bool
        save as PDF named 'intersection_plot' in ``ut.out_folder`` instead
        of showing the figure
    """
    ins_names = list(ins_dict.keys())

    # Every combination of 1..len(ins_names) instruments, smallest first.
    list_comb = [
        list(combo)
        for size in range(1, len(ins_names) + 1)
        for combo in combinations(ins_names, size)
    ]

    list_uniquesubj = [
        [set(ins_dict[name].index) for name in combo] for combo in list_comb
    ]
    int_counts = [_count_intersection(sets) for sets in list_uniquesubj]

    inter_plot = from_memberships(list_comb, data=int_counts)
    plot(inter_plot, show_counts='%d', element_size=50,
         orientation='horizontal')

    if not save_fig:
        plt.show()
        return
    plt.savefig(os.path.join(ut.out_folder, 'intersection_plot'),
                format='pdf')
def test_dataframe_raises():
    """Passing a DataFrame to ``plot`` must raise the 'subset_size' ValueError."""
    records = {'val': [5, 7], 'set1': [False, True], 'set2': [True, True]}
    df = pd.DataFrame(records).set_index(['set1', 'set2'])
    fig = matplotlib.figure.Figure()
    with pytest.raises(ValueError, match='Please specify subset_size or '):
        plot(df, fig)
def plot_protein_upset(protein_dict):
    """Save an UpSet plot of ``protein_dict`` as SVG and PNG.

    ``protein_dict`` is passed to ``upsetplot.from_contents``. The figure is
    written to ``Protein_upset.svg`` and ``Protein_upset.png`` in the
    working directory.
    """
    plot_df = upsetplot.from_contents(protein_dict)
    upsetplot.plot(
        plot_df,
        sort_by='cardinality',
        subset_size='auto',
        facecolor='#21918cff',
    )
    plt.title("Distribution of Protein Overlap")
    for target in ("Protein_upset.svg", "Protein_upset.png"):
        plt.savefig(target)
def upsetplot(self, data, title, outdir, extension):
    """Save an UpSet plot of the non-zero counts derived from ``data``.

    ``self.count(data)`` supplies the counts. The figure title is ``title``
    with underscores replaced by spaces, and the file is written to
    ``<outdir>/<title>_upsetplot.<extension>``.
    """
    all_counts = self.count(data)
    nonzero_counts = all_counts[all_counts > 0]

    up.plot(nonzero_counts, sort_by='cardinality', show_counts=True)

    heading = '{}'.format(title.replace("_", " "))
    plt.suptitle(heading, fontsize=18, fontweight='bold')

    filename = "{}_upsetplot.{}".format(title, extension)
    plt.savefig(os.path.join(outdir, filename))
    plt.close()
def writeIntersectionPlot(inputIterators, iter):
    """Save an UpSet plot of which inputs each row of ``iter`` belongs to.

    For every element of ``inputIterators`` (read via its ``name`` and
    ``id``), a row counts as a member unless ``row.getMeta(id)`` equals
    ``CircRow.META_INDEX_CIRC_NOT_IN_DB``. The plot is written to
    ``./output/out.png``.
    """
    # Materialise once: the rows are scanned once per input, and a one-shot
    # iterator would be exhausted after the first scan.
    rows = list(iter)

    contents = {
        circIter.name: [
            row for row in rows
            if row.getMeta(circIter.id) != CircRow.META_INDEX_CIRC_NOT_IN_DB
        ]
        for circIter in inputIterators
    }

    df = from_contents(contents)
    plot(df, facecolor="red", sort_by="cardinality", show_counts='%d')
    pyplot.savefig('./output/out.png')
def build_diagrams(consolidated_data, graph_storage_path, condition):
    """Save one UpSet plot for the 'down' list and one for the 'up' list.

    ``unpack_consolidated_data`` yields ``(dn_list, doses_list, up_list)``.
    Each list is turned into plot data with ``build_graph_data`` using the
    index built from ``doses_list``. The files are named
    ``<condition>_down_.png`` and ``<condition>_up_.png`` and placed in
    ``graph_storage_path``.
    """
    dn_list, doses_list, up_list = unpack_consolidated_data(consolidated_data)
    index = build_graph_index(doses_list)

    # Pairing each list with its tag replaces the ``pos is 0`` identity test
    # on an int literal.
    for tag, data_list in (('_down_', dn_list), ('_up_', up_list)):
        ser = build_graph_data(data_list, index)
        plot(ser)
        pyplot.savefig(os.path.join(graph_storage_path,
                                    condition + tag + '.png'))
        # Close the figure so the next plot starts on a clean one instead of
        # drawing onto this one.
        pyplot.close()
def test_plot_smoke_test(kw):
    """``plot`` renders to a given figure and, without one, opens exactly one."""
    X = generate_data(n_samples=100)

    # Explicit figure: rendering to PNG must succeed.
    explicit_fig = matplotlib.figure.Figure()
    plot(X, explicit_fig, **kw)
    explicit_fig.savefig(io.BytesIO(), format='png')

    # Also check fig is optional
    figures_before = len(plt.get_fignums())
    plot(X, **kw)
    figures_after = len(plt.get_fignums())
    assert figures_after - figures_before == 1
    assert plt.gcf().axes
def plot_peptide_upset(self, save=False):
    """Draw an UpSet plot of ``self.peptide_dict``.

    With ``save`` set, the figure is also written to ``Peptide_upset.svg``
    and ``Peptide_upset.png`` in the working directory.
    """
    plot_df = upsetplot.from_contents(self.peptide_dict)
    upsetplot.plot(
        plot_df,
        sort_by='cardinality',
        subset_size='auto',
        facecolor='#21918cff',
    )
    plt.title("Distribution of Peptide Overlap")

    if not save:
        return
    for target in ("Peptide_upset.svg", "Peptide_upset.png"):
        plt.savefig(target)
def upset(index):
    """Print a summary for one root class and plot its label combinations.

    Uses the module-level ``sets``, ``clusters`` and ``unique_clusters``.
    Rows with a positive value in column ``index`` of ``sets`` are selected
    from ``clusters``. Nothing is plotted when there are more than 40
    distinct combinations or more than 20 distinct labels.
    """
    in_root_class = sets[:, index] > 0
    selection = clusters[np.where(in_root_class)]

    items, counts = np.unique(selection, return_counts=True)
    subset = from_memberships(items, counts)
    sub_classes = np.unique([label for labels in items for label in labels])

    print("Root Class: ", unique_clusters[index])
    print("# Papers: ", len(selection))
    print("# Labels: ", len(sub_classes))
    print("# Classes: ", len(items))

    if len(items) > 40 or len(sub_classes) > 20:
        print("Too many items")
        return
    plot(subset)
def create_plot(gnps_task, metadata_column, metadata_terms, intensity_threshold):
    """Render an UpSet plot of feature membership across metadata terms.

    Rows of the task table (from ``_get_task_df``) with ``featurearea``
    above ``intensity_threshold`` are grouped by ``featureid``. Each
    feature's membership is the set of its ``metadata_column`` values that
    also occur in ``metadata_terms``. The plot is saved as
    ``./output/<uuid>.svg``.

    Returns
    -------
    list containing one ``html.Img`` whose ``src`` is ``/plot/<uuid>``.
    """
    data_df = _get_task_df(gnps_task)
    metadata_terms = set(metadata_terms)

    threshold = float(intensity_threshold)
    data_df = data_df[data_df["featurearea"] > threshold]

    membership = []
    for _, group_df in data_df.groupby("featureid"):
        try:
            groups = set(group_df[metadata_column])
        except Exception:
            # Keep the original diagnostic, then let the error propagate.
            print("ERROR")
            raise
        membership.append(list(groups & metadata_terms))

    upset_data_df = from_memberships(membership)

    uuid_save = str(uuid.uuid4())
    try:
        plot(upset_data_df, subset_size="count", sort_by="cardinality",
             orientation="horizontal", show_counts=True)
        pyplot.savefig("./output/{}.svg".format(uuid_save))
    finally:
        # Always release the figure: this function is called repeatedly and
        # an open figure would be drawn onto again by the next call.
        pyplot.close()

    return [html.Img(src="/plot/{}".format(uuid_save))]
def plot_species_intersections(self, color, ignore_counts=0, orientation='horizontal'):
    """Plot the size of each species combination returned by the class.

    The first element returned by ``self.orthogroups_sets()`` maps a
    combination to a collection; the number of distinct entries in that
    collection is the plotted value. Category names are replaced with the
    short species names below, and only values strictly greater than
    ``ignore_counts`` are drawn.

    Returns whatever ``plot`` returns.
    """
    species_dict = {
        'P8084_finalAssembly': 'P.betacei',
        'P_cactorum_10300': 'P.cactorum',
        'P_infestans_RefSeq': 'P. infestans',
        'P_palmivora_LILI_trCDS': 'P.palmivora',
        'P_parasitica_INRA310': 'P.parasitica',
        'P_ramorum_Pr102': 'P.ramorum',
        'P_sojae_V3': 'P.sojae',
    }

    species_groups, _unused = self.orthogroups_sets()
    memberships = list(species_groups)
    data = [len(set(species_groups[combo])) for combo in memberships]

    structured_data = from_memberships(memberships, data=data)
    structured_data.index.names = [
        species_dict[old_name] for old_name in structured_data.index.names
    ]

    above_cutoff = structured_data > ignore_counts
    structured_data = structured_data[above_cutoff].copy()

    return plot(structured_data, orientation=orientation, show_counts=True,
                facecolor=color, element_size=40)
def main():
    """Legacy entry point: merge the input VCFs and save ``UpSetPlot.png``.

    Prints a deprecation notice to stderr, normalises each input with
    ``normalize_vcf``, merges them with ``survivor`` and plots the sets
    returned by ``make_sets``.
    """
    args = get_args()

    notice = ("Deprecation warning:\n"
              "This script is here for legacy purposes.\n"
              "You should use _surpyvor upset_ instead.")
    sys.stderr.write(notice)

    ignore_type = "-1" if args.ignore_type else "1"
    normalized = [normalize_vcf(s) for s in args.variants]

    combined_vcf = survivor(samples=normalized,
                            distance=args.distance,
                            ignore_type=ignore_type,
                            minlength=args.minlength,
                            save=args.store)

    # Fall back to the input paths when no names were given.
    set_names = args.names or args.variants
    upsets = make_sets(vcf=combined_vcf, names=set_names)

    plot(upsets, sort_by='cardinality')
    plt.savefig("UpSetPlot.png")
def generate_report(self, report_path=tempfile.gettempdir()):
    """Write the availability report for ``self.entries`` to ``report_path``.

    Produces three files in ``report_path`` (created if missing):

    * ``global_upset.png`` -- UpSet plot of the source combinations,
    * ``detailed_counts.md`` -- one row per entry, one column per source,
    * ``summary.md`` -- number of entries per source combination, labelled
      ``<source>`` or ``No <source>``.

    Finally prints the entries missing from all sources, and the entries
    present in all sources except ``Biii``.
    """
    os.makedirs(report_path, exist_ok=True)

    source_names = [source_class.SOURCE for source_class in SOURCE_CLASSES]

    rows = {
        entry.biotools_id: [source.is_available()
                            for source in entry.sources.values()]
        for entry in self.entries
    }
    df = pd.DataFrame.from_dict(rows, orient='index', columns=source_names)

    # Size of every availability combination; used for plot and summary.
    summary_df = df.groupby(source_names).size()

    plot(summary_df, show_counts=True)
    pyplot.savefig(os.path.join(report_path, 'global_upset.png'))

    # The tables contain non-ASCII marks, so the encoding is stated
    # explicitly rather than left to the platform default.
    with open(os.path.join(report_path, 'detailed_counts.md'), 'w',
              encoding='utf-8') as md_file:
        df.replace({
            True: '✓',
            False: '🗙'
        }).to_markdown(buf=md_file, tablefmt='github')

    with open(os.path.join(report_path, 'summary.md'), 'w',
              encoding='utf-8') as md_file:
        level_names = summary_df.index.names
        pretty_index = [
            tuple(
                'No ' + level_names[cell_idx] if cell == False
                else level_names[cell_idx]
                for cell_idx, cell in enumerate(idx_row)
            )
            for idx_row in summary_df.index
        ]
        pretty_summary = summary_df.copy()
        if pretty_index:
            # Replace the labels in place. The original called ``reindex``
            # and discarded the result, so the readable labels never made
            # it into the file.
            pretty_summary.index = pd.MultiIndex.from_tuples(pretty_index)
        pretty_summary.to_markdown(buf=md_file, tablefmt='github')

    print(df[(df["biotools"] == False) & (df["bioschemas"] == False) &
             (df["OEB"] == False) & (df["OEB Metrics"] == False) &
             (df["Debian"] == False) & (df["BioConda"] == False) &
             (df["BioContainers"] == False) & (df["Biii"] == False)])
    print(df[(df["biotools"] == True) & (df["bioschemas"] == True) &
             (df["OEB"] == True) & (df["OEB Metrics"] == True) &
             (df["Debian"] == True) & (df["BioConda"] == True) &
             (df["BioContainers"] == True) & (df["Biii"] == False)])
def test_matrix_plot_margins(x):
    """Non-regression test for large whitespace margins around the matrix.

    The bug appeared when the number of intersections was large. Each matrix
    column is expected to take up one unit on the x-axis.
    """
    axes = plot(x)
    lower, upper = axes['matrix'].get_xlim()
    assert len(x) - 1 == upper - lower
def test_matrix_plot_margins(x, orientation):
    """Non-regression test for large whitespace margins around the matrix.

    The bug appeared when the number of intersections was large. Each matrix
    column is expected to take up one unit along the axis the intersections
    run on: x for a horizontal plot, y otherwise.
    """
    axes = plot(x, orientation=orientation)
    matrix_axes = axes['matrix']
    if orientation == 'horizontal':
        lower, upper = matrix_axes.get_xlim()
    else:
        lower, upper = matrix_axes.get_ylim()
    assert len(x) - 1 == upper - lower
def start(self):
    """Run the pipeline: load, FDR-correct, count, plot, save.

    Reads the file at ``self.decon_path`` with ``self.load_file``, applies
    ``self.bh_correct``, and saves one ``eQTL_upsetplot.<extension>`` per
    entry of ``self.extensions`` under ``self.outdir``.
    """
    self.print_arguments()

    print("Loading data.")
    raw_df = self.load_file(self.decon_path, nrows=None)

    print("Calculating FDR.")
    # Only the second value returned by bh_correct is used.
    decon_fdr_df = self.bh_correct(raw_df)[1]

    print("Preprocessing data.")
    parsed = self.parse_df(decon_fdr_df, self.alpha)
    counts = self.count(parsed)
    counts = counts[counts > 0]
    print(counts)

    print("Creating plot.")
    up.plot(counts, sort_by='cardinality', show_counts=True)
    for extension in self.extensions:
        filename = "eQTL_upsetplot.{}".format(extension)
        plt.savefig(os.path.join(self.outdir, filename))
    plt.close()
def plotSetIntersections(df: DataFrame, labels: List[str], unique_id: str) -> None:
    """
    Show an UpSet plot of how often each combination of labels occurs.

    Args:
        df: dataframe holding the label columns and the id column
        labels: names of the label columns; their values are cast to bool
        unique_id: name of the column that is counted per label combination

    Returns:
        None
    """
    wanted_columns = labels + [unique_id]
    as_bool = df[wanted_columns].astype(bool)
    counts = as_bool.groupby(labels).count()[unique_id]

    upsetplot.plot(
        counts,
        subset_size="sum",
        show_counts="%d",
        sort_by="cardinality",
    )
    plt.suptitle("Multiple tags per comment")
    plt.show()
def create_upset(df, cols, **args):
    """Plot how many rows of ``df`` have each combination of ``cols`` set.

    A column counts as set in a row when its value differs from ``""``,
    ``False`` and ``0``. Combinations are listed in the order produced by
    ``powerset(cols)``, keeping only those that occur at least once. Extra
    keyword arguments are forwarded to ``usp.plot``.
    """
    ps = powerset(cols)

    # Tally rows by their tuple of per-column flags.
    counts_by_combo = collections.defaultdict(int)
    for _, row in df.iterrows():
        flags = tuple(
            row[c] != "" and row[c] != False and row[c] != 0 for c in cols
        )
        counts_by_combo[flags] += 1

    boolvecs = []
    counts = []
    for subset in ps:
        boolvec = makebool(subset, cols)
        occurrences = counts_by_combo[tuple(boolvec)]
        if occurrences > 0:
            boolvecs.append(boolvec)
            counts.append(occurrences)

    multiindex = pd.MultiIndex.from_tuples(boolvecs, names=cols)
    upsetdata = pd.Series(counts, index=multiindex)
    upsetdata = upsetdata.sort_values(ascending=False)
    usp.plot(upsetdata, sort_by='cardinality', **args)
def start(self):
    """Plot the overlap of gene names between cell-type groups.

    Reads sheet ``"Sheet2"`` of ``self.data_path``, maps ``CellType`` to a
    group through ``self.trans_dict``, collects the ``GeneName`` values of
    every group and saves ``markergenes_upsetplot.png`` in ``self.outdir``.
    """
    print("Loading data.")
    df = pd.read_excel(self.data_path, header=0, index_col=None,
                       sheet_name="Sheet2")
    print(df["CellType"].unique())
    df["group"] = df["CellType"].map(self.trans_dict)
    print(df)

    print("Preprocessing data.")
    data = {
        group: set(df.loc[df["group"] == group, "GeneName"].tolist())
        for group in df["group"].unique()
    }
    print(data)

    all_counts = self.count(data)
    counts = all_counts[all_counts > 0]
    print(counts)

    print("Creating plot.")
    up.plot(counts, sort_by='cardinality', show_counts=True)
    target = os.path.join(self.outdir, "markergenes_upsetplot.png")
    plt.savefig(target)
    plt.close()
def plot_upset(ax):
    """Draw the fixed TP53 / MDM4 patient-count UpSet plot.

    ``ax`` is accepted for interface compatibility but is not used; the plot
    is drawn through ``plot`` and labelled through ``plt``. The y-label is
    passed the module-level ``fontproperties`` object positionally.
    """
    data = np.array([795., 27., 182., 7.])
    example = from_memberships(
        [[' TP53 WT', ' MDM4 WT'],
         [' TP53 WT', ' MDM4 amp.'],
         [' TP53 mutant', ' MDM4 WT'],
         [' TP53 mutant', ' MDM4 amp.']],
        data=data)

    # The return value is deliberately not unpacked. ``plot`` returns a
    # mapping of axes, so unpacking it into four names bound the key strings
    # (not the axes) and only worked while the mapping had exactly four
    # entries. None of those names were used.
    plot(example, with_lines=True, show_counts=True, element_size=50)
    plt.ylabel('Number of patients', fontproperties)
def plot_graph(res, path):
    """Plot one UpSet figure per result type and store the plot data.

    ``res`` maps a result-type name to a dict of ``membership -> value``.
    A PNG per result type is written to ``<path>/figures/``. Afterwards
    ``<path>/data_upset.json`` receives ``[memberships, values]`` of the
    last result type processed (two empty lists when ``res`` is empty).
    """
    import os

    path_figures = f"{path}/figures"
    os.makedirs(path_figures, exist_ok=True)

    # Defined up front so an empty ``res`` still produces a valid data file
    # instead of failing on unbound names.
    liste_cats, data_out = [], []

    for typ_res, dic in res.items():
        liste_cats = sorted(dic.keys())
        data_out = [dic[cat] for cat in liste_cats]

        example = from_memberships(liste_cats, data=data_out)
        plot(example)
        pyplot.savefig(f"{path_figures}/{typ_res}.png")
        # Close the figure so the next result type is not drawn on top of
        # this one.
        pyplot.close()

    print(f"  figures stored in '{path_figures}/'")

    path_upset = f"{path}/data_upset.json"
    write_json_file(path_upset, [liste_cats, data_out])
    print(f"  output file in upset plot format stored in '{path_upset}'")