Example #1
import pandas as pd
import pyranges as pr


def find_overlaps(event1, event2, combi):
    # Build one PyRanges per tool, dropping positions that occur more than
    # once within a tool.
    complete_event1 = pd.DataFrame(event1)
    complete_event2 = pd.DataFrame(event2)
    thisdict = {
        combi[0]: complete_event1.to_dict(),
        combi[1]: complete_event2.to_dict()
    }

    try:
        grs = {
            n: pr.from_dict(s).drop_duplicate_positions(keep=False)
            for n, s in thisdict.items()
        }
    except ValueError:
        return False

    # count_overlaps adds one count column per input PyRanges.
    counts = pr.count_overlaps(grs)
    countdf = counts.df

    # A region counts as matched when both tools' count columns sum to 2.
    for _, row in countdf[[combi[0], combi[1]]].iterrows():
        if row.sum() == 2:
            return True

    return False
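A minimal usage sketch, assuming two toy event tables and hypothetical tool names (neither appears in the original code). With the imports above, this prints True because the two intervals overlap on chr1:

event1 = {"Chromosome": ["chr1"], "Start": [100], "End": [200]}
event2 = {"Chromosome": ["chr1"], "Start": [150], "End": [250]}

print(find_overlaps(event1, event2, ("toolA", "toolB")))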
def test_strand_vs_strand_same(grs, features):
    # grs and features are pytest fixtures (see the sketch after this
    # test); assert_df_equal is a DataFrame-comparison helper from the
    # test suite, and np/pr are numpy/pyranges.

    expected_result = pr.from_string("""Chromosome Start End Strand a b c
chr1  0 10  + 1 0 1
chr1 10 20  + 2 2 1
chr1 20 30  + 0 2 0
chr1 30 40  - 0 0 1""")

    res = pr.count_overlaps(grs, features, strandedness="same")
    res = res.apply(lambda df: df.astype({
        "a": np.int64,
        "b": np.int64,
        "c": np.int64
    }))

    res.print(merge_position=True)

    assert_df_equal(res.df, expected_result.df)
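The fixtures are defined elsewhere in the test suite. One possible pair that is consistent with the expected table above (the originals may differ):

import pytest
import pyranges as pr


@pytest.fixture
def features():
    # The four stranded windows the counts are reported over.
    return pr.from_string("""Chromosome Start End Strand
chr1  0 10 +
chr1 10 20 +
chr1 20 30 +
chr1 30 40 -""")


@pytest.fixture
def grs():
    # Three named PyRanges; count_overlaps adds one column per key.
    a = pr.from_string("""Chromosome Start End Strand
chr1  5 15 +
chr1 12 18 +""")
    b = pr.from_string("""Chromosome Start End Strand
chr1 12 25 +
chr1 15 28 +""")
    c = pr.from_string("""Chromosome Start End Strand
chr1  5 18 +
chr1 32 38 -""")
    return {"a": a, "b": b, "c": c}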
Example #3
from collections import defaultdict
import itertools as it

import numpy as np
import pandas as pd
import pyranges as pr
from pandas.errors import EmptyDataError


def make_plots(target_paths):
    unified = defaultdict(lambda: defaultdict(list))
    tools = [
        "asgal", "aspli", "eventpointer", "irfinder", "majiq", "sgseq",
        "spladder", "whippet"
    ]
    evs = []
    for file in target_paths:
        if "unified.out" in file:
            # Identify the reporting tool from the file name.
            tool = next(t for t in tools if t in file)

            try:
                tmp = pd.read_csv(file, sep="\t")
            except EmptyDataError as e:
                print(f"This file returns the following error: {file}")
                print(e)
                continue

            tmp = tmp.dropna()
            for ev in tmp.event_type.unique():
                events = tmp[tmp['event_type'] == ev].copy()
                # Prefix chromosome names with "chr", then rename to the
                # column layout pyranges expects.
                events['chr'] = 'chr' + events['chr'].astype(str)
                events.columns = [
                    "Chromosome", "gene", "id", "strand", "event_type",
                    "count", "Start", "End"
                ]
                unified[ev][tool].append(events.to_dict('list'))
                if ev not in evs:
                    evs.append(ev)

    allcomb = dict()
    for ev, X in unified.items():

        if ev == 'MEE' or ev == 'MES':
            realcount = pd.DataFrame(
                columns=["Chromosome", "Start", "End"] + tools)
            # it.combinations() yields nothing when only one tool reported
            # this event type, so handle that case up front.
            if len(X) == 1:
                only = next(iter(X))
                df1 = pd.DataFrame(X[only][0]).reset_index()
                row1 = create_row([only], tools)
                for _ in range(df1.shape[0]):
                    realcount.loc[realcount.shape[0]] = ["chr", 0, 0] + row1

            for combi in it.combinations(X.keys(), 2):

                df1 = pd.DataFrame(X[combi[0]][0]).reset_index()
                df1['index'] = df1.index
                df2 = pd.DataFrame(X[combi[1]][0]).reset_index()
                df2['index'] = df2.index

                merged1 = expand_coord(df1)
                merged2 = expand_coord(df2)

                matched_index = []
                for mergin in merged1.keys():
                    mergin0_res = find_all_overlaps(mergin, merged1, merged2,
                                                    combi)

                    if any(mergin0_res):
                        row1 = create_row([combi[0], combi[1]], tools)
                        # Remember which event of tool 2 was matched so it
                        # is not counted twice below.
                        matched_index.append(np.where(mergin0_res)[0][0])
                    else:
                        row1 = create_row([combi[0]], tools)

                    realcount.loc[realcount.shape[0]] = ["chr", 0, 0] + row1

                # Add the events of tool 2 that have no overlap in tool 1.
                row2 = create_row([combi[1]], tools)
                for _ in range(len(merged2) - len(matched_index)):
                    realcount.loc[realcount.shape[0]] = ["chr", 0, 0] + row2

        else:
            grs = {
                n: pr.from_dict(s[0]).drop_duplicate_positions(keep=False)
                for n, s in X.items()
            }
            counts = pr.count_overlaps(grs)
            countdf = counts.df

            # Add zero-filled columns for tools that reported no events.
            for t in tools:
                if t not in countdf.columns:
                    countdf[t] = 0
            realcount = countdf[["Chromosome", "Start", "End"] + tools]

        for row in realcount.itertuples():
            # Clamp each tool's count to 0/1 and encode the combination of
            # tools as a binary key, e.g. "01100000".
            tmp = [min(x, 1) for x in row[4:]]
            if sum(tmp) == 0:
                continue
            binkey = ''.join(str(x) for x in tmp)
            if binkey not in allcomb:
                allcomb[binkey] = {e: 0 for e in evs}
            allcomb[binkey][ev] += 1

    forplot = pd.DataFrame(columns=tools + evs)

    for n, j in allcomb.items():
        thisrow = [bool(int(x)) for x in n] + list(j.values())
        forplot.loc[forplot.shape[0]] = thisrow

    forplot = forplot.set_index(tools)
    return forplot
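make_plots relies on three helpers that are not shown in this example: expand_coord, find_all_overlaps (which, judging from its use, returns a boolean vector marking which events of the second tool overlap a given event), and create_row. A minimal sketch of what create_row might look like, assuming it emits one 0/1 indicator per tool:

def create_row(found_tools, tools):
    # 1 for every tool that participates in this event, 0 otherwise.
    return [1 if t in found_tools else 0 for t in tools]

The returned forplot frame, with its boolean index over the tool columns, matches the membership format that upset-style plotting libraries such as upsetplot expect, e.g. upsetplot.plot(forplot[ev]).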
Example #4
import pandas as pd
import pyranges as pr

# genie, panels, chromosomes, gff_cds_pr, gff_exon_pr, maf_pr, tcga_maf,
# panel_df and file_path are all defined earlier in the original script.
total_sizes = []
cds_sizes = []
exon_sizes = []
panel_prs = []

for panel in panels:
    print(panel)
    # Restrict the GENIE targets to this panel and the allowed chromosomes,
    # rename to pyranges' Start/End convention, and merge the intervals.
    panel_pr = pr.PyRanges(
        genie.loc[(genie['SEQ_ASSAY_ID'] == panel)
                  & genie['Chromosome'].isin(chromosomes),
                  'Chromosome':'End_Position'].rename(
                      columns={'Start_Position': 'Start',
                               'End_Position': 'End'})).merge()
    # The +1 per interval converts half-open lengths to inclusive,
    # MAF-style lengths.
    total_sizes.append(sum(i + 1 for i in panel_pr.lengths()))
    cds_sizes.append(sum(i + 1 for i in panel_pr.intersect(gff_cds_pr).lengths()))
    exon_sizes.append(sum(i + 1 for i in panel_pr.intersect(gff_exon_pr).lengths()))
    panel_prs.append(panel_pr)


# One count column per panel, plus the CDS and exon annotations.
grs = dict(zip(['CDS', 'exon'] + list(panels),
               [gff_cds_pr, gff_exon_pr] + panel_prs))
result = pr.count_overlaps(grs, maf_pr)
result = result.df

# Keep only the count columns (everything after Chromosome/Start/End) and
# join them back onto the MAF table via its 'index' column.
tcga_maf = pd.merge(tcga_maf, result.iloc[:, 3:], how='left', on='index')


panel_df['total'] = total_sizes
panel_df['cds'] = cds_sizes
panel_df['exon'] = exon_sizes

# Get the assumed size of the most common capture kit:
# https://bitbucket.org/cghub/cghub-capture-kit-info/src/master/BI/vendor/Agilent/whole_exome_agilent_1.1_refseq_plus_3_boosters.targetIntervals.bed
agilent_df = pd.read_csv(file_path / 'whole_exome_agilent_1.1_refseq_plus_3_boosters.targetIntervals.bed', sep='\t', low_memory=False, header=None)
kit_pr = pr.PyRanges(agilent_df.rename(columns={0: 'Chromosome', 1: 'Start', 2: 'End'})).merge()
kit_total = sum([i + 1 for i in kit_pr.lengths()])
kit_cds = sum([i + 1 for i in kit_pr.intersect(gff_cds_pr).merge().lengths()])
kit_exon = sum([i + 1 for i in kit_pr.intersect(gff_exon_pr).merge().lengths()])
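For reference, pr.count_overlaps returns the intervals of its second argument extended with one count column per key of the first, which is why result.iloc[:, 3:] above keeps everything after the Chromosome/Start/End columns. A minimal self-contained sketch with toy coordinates:

import pyranges as pr

cds = pr.from_dict({"Chromosome": ["chr1"], "Start": [0], "End": [100]})
panel = pr.from_dict({"Chromosome": ["chr1"], "Start": [50], "End": [150]})
variants = pr.from_dict({"Chromosome": ["chr1", "chr1"],
                         "Start": [10, 120], "End": [11, 121]})

counts = pr.count_overlaps({"CDS": cds, "panel": panel}, variants)
# counts.df has the columns: Chromosome, Start, End, CDS, panel
print(counts.df)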
Example #5
import numpy as np
import pandas as pd
import pyranges
import pysam

# annot (the full annotation) and introns are PyRanges built earlier in the
# original script; snakemake is provided by the Snakemake runtime.
features = {
    k: annot[annot.Feature == k].drop()
    for k in ['CDS', 'five_prime_utr', 'three_prime_utr']
}
features['intron'] = introns.drop()

# high confidence regions
highc = pyranges.read_bed(snakemake.input.hc_bed)

# exome sequencing target regions
exometarg = pyranges.read_bed(snakemake.input.es_bed)

# load the variants into a pyranges object
prpm = vcf_to_pyranges(pysam.VariantFile(snakemake.input.vcf),
                       tmpfile=snakemake.output.tsv + '_tmp.bed')
# count overlaps to different feature types
prpm = pyranges.count_overlaps(features, prpm)

# annotation by majority vote
main_anno = np.argmax(
    prpm.as_df()[['CDS', 'five_prime_utr', 'three_prime_utr',
                  'intron']].values,
    axis=1)
# Map argmax column indices back to feature names.
d = dict(enumerate(['CDS', 'five_prime_utr', 'three_prime_utr', 'intron']))
main_anno = pd.Series(main_anno).map(d)
prpm.major_anno = main_anno

# annotate as intron only those regions that have no other annotations
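vcf_to_pyranges is a project-specific helper, not part of pyranges. A minimal sketch of what it might do, assuming it round-trips the variants through the temporary BED file whose path is passed in:

def vcf_to_pyranges(vcf, tmpfile):
    # Write each variant as a 0-based, half-open BED interval, then load
    # the file back as a PyRanges object.
    with open(tmpfile, 'w') as fh:
        for rec in vcf:
            fh.write(f"{rec.chrom}\t{rec.start}\t{rec.stop}\n")
    return pyranges.read_bed(tmpfile)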