import pandas as pd
import pyranges as pr


def find_overlaps(event1, event2, combi):
    """Return True if any genomic bin is covered by events from both tools in combi."""
    complete_event1 = pd.DataFrame(event1)
    complete_event2 = pd.DataFrame(event2)
    thisdict = {
        combi[0]: complete_event1.to_dict(),
        combi[1]: complete_event2.to_dict()
    }
    try:
        grs = {
            n: pr.from_dict(s).drop_duplicate_positions(keep=False)
            for n, s in thisdict.items()
        }
    except ValueError:
        return False
    counts = pr.count_overlaps(grs)
    for _, row in counts.df[[combi[0], combi[1]]].iterrows():
        if row.sum() == 2:
            return True
    return False
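# Usage sketch for find_overlaps; the event dicts and tool names below are
# illustrative, not taken from a real unified.out file.
toy_event1 = {"Chromosome": ["chr1", "chr1"], "Start": [0, 100], "End": [50, 150]}
toy_event2 = {"Chromosome": ["chr1"], "Start": [40], "End": [120]}
# The 40-50 bin is covered by one event from each tool, so this should print True.
print(find_overlaps(toy_event1, toy_event2, ("toolA", "toolB")))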
def test_strand_vs_strand_same(grs, features):
    # grs and features are assumed to be pytest fixtures defined alongside this
    # test; assert_df_equal is assumed imported from the suite's test helpers.
    expected_result = pr.from_string("""Chromosome Start End Strand a b c
chr1 0 10 + 1 0 1
chr1 10 20 + 2 2 1
chr1 20 30 + 0 2 0
chr1 30 40 - 0 0 1""")

    res = pr.count_overlaps(grs, features, strandedness="same")
    res = res.apply(lambda df: df.astype({
        "a": np.int64,
        "b": np.int64,
        "c": np.int64
    }))
    res.print(merge_position=True)

    assert_df_equal(res.df, expected_result.df)
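# The features fixture can be read off expected_result: count_overlaps reports
# one row per feature interval, so features must be these four stranded
# windows. The grs fixture maps the names "a", "b" and "c" to PyRanges whose
# exact coordinates are not recoverable from the counts alone, so only
# features is sketched here.
import pytest


@pytest.fixture
def features():
    return pr.PyRanges(chromosomes="chr1",
                       starts=[0, 10, 20, 30],
                       ends=[10, 20, 30, 40],
                       strands=["+", "+", "+", "-"])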
from collections import defaultdict
import itertools as it

from pandas.errors import EmptyDataError


def make_plots(target_paths):
    # create_row, expand_coord and find_all_overlaps are project helpers
    # defined elsewhere in this module.
    unified = defaultdict(lambda: defaultdict(list))
    tools = [
        "asgal", "aspli", "eventpointer", "irfinder", "majiq", "sgseq",
        "spladder", "whippet"
    ]
    evs = []
    for file in target_paths:
        if "unified.out" in file:
            # if len(file.split("/")[-1].split(".")) > 2:
            #     tool = file.split("/")[-1].split(".")[1].split("_")[-2]
            # else:
            #     tool = file.split("/")[-1].split(".")[0].split("_")[-2]
            tool = tools[np.where(list(map(lambda x: x in file, tools)))[0][0]]
            if "_filtered" in tool:
                tool = tool.replace("_filtered", "")
            try:
                tmp = pd.read_csv(file, sep="\t")
            except EmptyDataError as e:
                print(f"This file returns the following error: {file}")
                print(e)
                continue
            tmp = tmp.dropna()
            for ev in tmp.event_type.unique():
                events = tmp[tmp['event_type'] == ev]
                org = events['chr'].copy(deep=False)
                events.loc[:, ['chr']] = list(map(lambda x: "chr" + str(x), org))
                events.columns = [
                    "Chromosome", "gene", "id", "strand", "event_type",
                    "count", "Start", "End"
                ]
                unified[ev][tool].append(events.to_dict('list'))
                if ev not in set(evs):
                    evs.append(ev)

    allcomb = dict()
    for ev, X in unified.items():
        if ev == 'MEE' or ev == 'MES':
            realcount = pd.DataFrame(columns=["Chromosome", "Start", "End"] + tools)
            for combi in it.combinations(X.keys(), 2):
                if len(combi) < 2:
                    # unreachable for pairwise combinations; kept for safety
                    df1 = pd.DataFrame(X[combi[0]][0]).reset_index()
                    row1 = create_row([combi[0]], tools)
                    for _ in range(df1.shape[0]):
                        realcount.loc[realcount.shape[0]] = ["chr", 0, 0] + row1
                    continue
                df1 = pd.DataFrame(X[combi[0]][0]).reset_index()
                df1['index'] = df1.index
                df2 = pd.DataFrame(X[combi[1]][0]).reset_index()
                df2['index'] = df2.index
                merged1 = expand_coord(df1)
                merged2 = expand_coord(df2)
                matched_index = []
                for mergin in merged1.keys():
                    mergin0_res = find_all_overlaps(mergin, merged1, merged2, combi)
                    if any(mergin0_res):
                        row1 = create_row([combi[0], combi[1]], tools)
                        # keep track of which event index was found to overlap,
                        # so it is not counted twice
                        matched_index.append(np.where(mergin0_res)[0][0])
                    else:
                        row1 = create_row([combi[0]], tools)
                    realcount.loc[realcount.shape[0]] = ["chr", 0, 0] + row1
                # add the events in tool 2 that have no overlap
                row2 = create_row([combi[1]], tools)
                for _ in range(len(merged2) - len(matched_index)):
                    realcount.loc[realcount.shape[0]] = ["chr", 0, 0] + row2
        else:
            grs = {
                n: pr.from_dict(s[0]).drop_duplicate_positions(keep=False)
                for n, s in X.items()
            }
            counts = pr.count_overlaps(grs)
            countdf = counts.df
            # check if any tools are missing from the count table
            missed = [
                tools[x]
                for x in np.where(np.isin(tools, countdf.columns) == False)[0]
            ]
            for x in missed:
                countdf[x] = 0
            realcount = countdf[["Chromosome", "Start", "End"] + tools]
        for row in realcount.itertuples():
            tmp = list(row[4:])
            tmp = [1 if x > 1 else x for x in tmp]
            binkey = ''.join([str(x) for x in tmp])
            if np.sum(tmp) == 0:
                continue
            if binkey not in allcomb:
                allcomb[binkey] = {e: 0 for e in evs}
            allcomb[binkey][ev] += 1

    forplot = pd.DataFrame(columns=tools + evs)
    for n, j in allcomb.items():
        thisrow = [bool(int(x)) for x in n] + list(j.values())
        forplot.loc[forplot.shape[0]] = thisrow
    forplot = forplot.set_index(tools)
    return forplot
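# Hypothetical driver for make_plots: the glob pattern, the "SE" event key and
# the use of the upsetplot package are assumptions for illustration, not part
# of this script. forplot's boolean tool index is the shape upsetplot expects.
from glob import glob

import matplotlib.pyplot as plt
from upsetplot import plot

forplot = make_plots(glob("results/**/*unified.out", recursive=True))
plot(forplot["SE"])  # tool-agreement UpSet plot for one event type
plt.savefig("unified_upset_SE.png")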
total_sizes = []
cds_sizes = []
exon_sizes = []
panel_prs = []
for panel in panels:
    print(panel)
    panel_pr = pr.PyRanges(genie.loc[(genie['SEQ_ASSAY_ID'] == panel) & genie['Chromosome'].isin(chromosomes),
                                     'Chromosome':'End_Position'].rename(columns={'Start_Position': 'Start',
                                                                                  'End_Position': 'End'})).merge()
    total_sizes.append(sum([i + 1 for i in panel_pr.lengths()]))
    cds_sizes.append(sum([i + 1 for i in panel_pr.intersect(gff_cds_pr).lengths()]))
    exon_sizes.append(sum([i + 1 for i in panel_pr.intersect(gff_exon_pr).lengths()]))
    panel_prs.append(panel_pr)

grs = {k: v for k, v in zip(['CDS', 'exon'] + list(panels), [gff_cds_pr, gff_exon_pr] + panel_prs)}
result = pr.count_overlaps(grs, pr.concat({'maf': maf_pr}.values()))
result = result.df
tcga_maf = pd.merge(tcga_maf, result.iloc[:, 3:], how='left', on='index')
panel_df['total'] = total_sizes
panel_df['cds'] = cds_sizes
panel_df['exon'] = exon_sizes

## get assumed size of the most common kit: https://bitbucket.org/cghub/cghub-capture-kit-info/src/master/BI/vendor/Agilent/whole_exome_agilent_1.1_refseq_plus_3_boosters.targetIntervals.bed
agilent_df = pd.read_csv(file_path / 'whole_exome_agilent_1.1_refseq_plus_3_boosters.targetIntervals.bed',
                         sep='\t', low_memory=False, header=None)
kit_pr = pr.PyRanges(agilent_df.rename(columns={0: 'Chromosome', 1: 'Start', 2: 'End'})).merge()
kit_total = sum([i + 1 for i in kit_pr.lengths()])
kit_cds = sum([i + 1 for i in kit_pr.intersect(gff_cds_pr).merge().lengths()])
kit_exon = sum([i + 1 for i in kit_pr.intersect(gff_exon_pr).merge().lengths()])
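# Minimal sketch of the count_overlaps pattern used above, on toy data; the
# panel names and coordinates are illustrative. count_overlaps returns one row
# per feature interval (here, each variant) plus one count column per key.
import pyranges as pr

toy_variants = pr.from_dict({"Chromosome": ["chr1"] * 3,
                             "Start": [5, 25, 45],
                             "End": [6, 26, 46]})
toy_panels = {
    "panelA": pr.from_dict({"Chromosome": ["chr1"], "Start": [0], "End": [30]}),
    "panelB": pr.from_dict({"Chromosome": ["chr1"], "Start": [20], "End": [50]}),
}
print(pr.count_overlaps(toy_panels, toy_variants).df)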
features = {
    k: annot[annot.Feature == k].drop()
    for k in ['CDS', 'five_prime_utr', 'three_prime_utr']
}
features['intron'] = introns.drop()

# high confidence regions
highc = pyranges.read_bed(snakemake.input.hc_bed)
# exome sequencing target regions
exometarg = pyranges.read_bed(snakemake.input.es_bed)

# load the variants into a pyranges object
prpm = vcf_to_pyranges(pysam.VariantFile(snakemake.input.vcf),
                       tmpfile=snakemake.output.tsv + '_tmp.bed')

# count overlaps to different feature types
prpm = pyranges.count_overlaps(features, prpm)

# annotation by majority vote
main_anno = np.argmax(
    prpm.as_df()[['CDS', 'five_prime_utr', 'three_prime_utr', 'intron']].values,
    axis=1)
d = {
    i: k
    for i, k in enumerate(['CDS', 'five_prime_utr', 'three_prime_utr', 'intron'])
}
main_anno = pd.Series(main_anno).map(d)
prpm.major_anno = main_anno

# annotate as intron only those regions that have no other annotations
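# Hedged sketch of the vcf_to_pyranges helper called above; its real
# implementation lives elsewhere in this workflow. Assumes a simple
# VCF -> temporary BED -> PyRanges round trip via pysam's 0-based
# record coordinates.
def vcf_to_pyranges(vcf, tmpfile):
    """Write variant positions to a temporary BED file and load it as PyRanges."""
    with open(tmpfile, 'w') as fh:
        for rec in vcf:  # pysam.VariantFile iterates over VariantRecords
            # rec.start/rec.stop are already 0-based, half-open
            fh.write(f"{rec.chrom}\t{rec.start}\t{rec.stop}\n")
    return pyranges.read_bed(tmpfile)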