Example no. 1
0
def frag_length_filter(genotypes, options):
    """Annotate each event in `genotypes` with fragment-length percentiles and
    a boolean "frag_length_passes" flag.

    For every sample/dataset, fragments supporting the event are collected and
    their implied lengths genotyped against the sample's precomputed
    fragment-length distributions; an event passes if any sample's genotyper
    accepts it.  Mutates `genotypes` in place and prints it when done.

    Parameters:
        genotypes: pandas DataFrame of events (needs chromx/x/chromy/y/
            orientation columns; rows are updated in place).
        options: project options object providing iter_10xdatasets() and
            sample_info().
    """
    # Search window around each breakend for supporting fragments.
    dist1 = -500
    dist2 = 5000

    # One fragment-length genotyper per sample, built from that sample's
    # precomputed fragment-length distributions.
    flgs = {}
    for sample, dataset in options.iter_10xdatasets():
        sample_info = options.sample_info(sample.name)
        dataset_info = sample_info[dataset.id]
        flgs[sample.name] = FragLengthGenotyper(dataset_info["frag_length_distributions"])

    for event in genotypes.itertuples():
        # Renamed from `frag_length_filter`, which shadowed this function.
        passes_filter = False

        for sample, dataset in options.iter_10xdatasets():
            fragsx, fragsy, merged = structuralvariants.get_supporting_fragments_new(
                options, sample, dataset,
                event.chromx, event.x, event.chromy, event.y, event.orientation,
                dist1, dist2)

            # Require a minimum number of supporting fragments before genotyping.
            if len(merged) < 5:
                continue

            lengths = calc_frag_lengths(event.x, event.y, event.orientation, merged)
            # An event passes if ANY sample's genotyper accepts it.
            passes_filter |= flgs[sample.name].genotype(lengths)

            genotypes.loc[event.Index, "{}_lengths_50".format(sample.name)] = numpy.percentile(lengths, 50)
            genotypes.loc[event.Index, "{}_lengths_90".format(sample.name)] = numpy.percentile(lengths, 90)

        genotypes.loc[event.Index, "frag_length_passes"] = passes_filter
    # print() form is valid in both Python 2 and 3 (the original used the
    # Python-2-only statement form).
    print(genotypes)
Example no. 2
0
def _barcodes_for_breakpoint(options, sample, dataset, nodex, nodey, dist1, dist2):
    """Return the set of barcodes supporting both sides of the breakpoint
    defined by `nodex`/`nodey`, or None when the two sides share none."""
    frags_x, frags_y, _merged = structuralvariants.get_supporting_fragments_new(
        options, sample, dataset, 
        nodex.chrom, nodex.position, nodey.chrom, nodey.position, 
        nodex.orientation+nodey.orientation, dist1, dist2)

    # Barcodes seen on each side of the breakpoint.
    shared = set(frags_x["bc"]) & set(frags_y["bc"])

    return shared if shared else None
Example no. 3
0
def get_shared_frags(options, sample, dataset, chromx, x, chromy, y,
                     orientation, dist1, dist2):
    """Return the fragments from each side of the breakpoint whose barcodes
    occur on BOTH sides, as a (shared_fragsx, shared_fragsy) pair."""
    frags_x, frags_y, _merged = structuralvariants.get_supporting_fragments_new(
        options, sample, dataset, chromx, x, chromy, y, orientation, dist1,
        dist2)

    # Barcodes present on both sides of the breakpoint.
    shared_bcs = set(frags_x["bc"]) & set(frags_y["bc"])

    return (frags_x.loc[frags_x["bc"].isin(shared_bcs)],
            frags_y.loc[frags_y["bc"].isin(shared_bcs)])
Example no. 4
0
    def _barcodes_for_breakpoint(self, chromx, x, orientationx, chromy, y,
                                 orientationy, dist1, dist2):
        """Return the set of barcodes supporting both sides of the given
        breakpoint, or None when the two sides share no barcodes."""
        # TODO: refactor to re-use same version as cluster_svs
        frags_x, frags_y, _merged = structuralvariants.get_supporting_fragments_new(
            self.options, self.sample, self.dataset, chromx, x, chromy, y,
            orientationx + orientationy, dist1, dist2)

        shared = set(frags_x["bc"]) & set(frags_y["bc"])
        return shared if shared else None
Example no. 5
0
    def run(self):
        """Build, per SV cluster, the union of barcodes supporting the
        cluster's events, and pickle the cluster -> barcodes map to
        the "sv_barcodes" output path.
        """
        # Search window around each breakend for supporting fragments.
        dist1 = -500
        dist2 = 5000
        outpath = self.outpaths(final=False)["sv_barcodes"]

        self.logger.log("loading...")
        events = self.load_events()

        barcodes_map = {}

        for cluster_id, cluster in events.groupby("cluster"):
            barcodes = set()

            for _, event in cluster.iterrows():
                _, _, merged_frags = \
                    structuralvariants.get_supporting_fragments_new(
                        self.options, self.sample, self.dataset,
                        event["chromx"], int(event["x"]),
                        event["chromy"], int(event["y"]),
                        event["orientation"], dist1, dist2,
                        min_reads_per_frag=0)

                cur_bcs = set(merged_frags["bc"])
                if len(cur_bcs) > MAX_BARCODES:
                    # Cap barcodes per event to keep downstream steps tractable.
                    # print() form works in both Python 2 and 3 (the original
                    # used the Python-2-only statement form).
                    print("TOO MANY BARCODES: sampling {} of {} for cluster {}".format(
                        MAX_BARCODES, len(cur_bcs), cluster_id))
                    # random.sample() requires a sequence (not a set) on
                    # Python 3.11+, so materialize the set first.
                    cur_bcs = random.sample(list(cur_bcs), MAX_BARCODES)

                if len(cur_bcs) == 0:
                    self.logger.log(
                        "WARNING: no barcodes found for event {} {}:{}::{}:{}{} dist1={} dist2={}"
                        .format(self.sample.name, event.chromx, event.x,
                                event.chromy, event.y, event.orientation,
                                dist1, dist2))
                barcodes.update(cur_bcs)

            barcodes_map[cluster_id] = barcodes

        # protocol=-1 is a binary pickle protocol, so the file must be opened
        # in binary mode; the original text-mode open(outpath, "w") corrupts
        # the output on Python 3.  `with` also guarantees the handle is closed.
        with open(outpath, "wb") as outfile:
            utilities.pickle.dump(barcodes_map, outfile, protocol=-1)
def quantify_breakpoint(chromx,
                        x,
                        chromy,
                        y,
                        orientation,
                        options,
                        good_bc_counts_by_dataset,
                        barcode_frequencies_by_dataset,
                        dist1,
                        dist2,
                        with_phasing=False):
    """Score a candidate breakpoint across all 10x datasets.

    For each sample/dataset, gathers the fragments supporting each side of the
    breakpoint, and from their barcode sets computes a Fisher exact p-value
    and a resampling p-value for barcode sharing.  When `with_phasing` is
    true, also tallies per-haplotype fragment counts from the merged
    fragments' hap_x/hap_y columns.

    Returns a pandas.Series with the breakpoint coordinates, per-sample
    `<sample>_total` / `<sample>_shared` / `<sample>_p_fisher` /
    `<sample>_p_resampling` columns, aggregate "shared"/"total" counts over
    the samples passing the significance thresholds, and "p_resampling" =
    the minimum per-sample resampling p-value.
    """
    cur_result = {}
    cur_result["chromx"] = chromx
    cur_result["new_x"] = x
    cur_result["chromy"] = chromy
    cur_result["new_y"] = y
    cur_result["orientation"] = orientation

    # Aggregate counts across samples that pass the thresholds below.
    cur_result["shared"] = 0
    cur_result["total"] = 0

    for sample, dataset in options.iter_10xdatasets():
        barcode_frequencies = barcode_frequencies_by_dataset[dataset.id]

        fragsx, fragsy, merged = structuralvariants.get_supporting_fragments_new(
            options,
            sample,
            dataset,
            chromx,
            x,
            chromy,
            y,
            orientation,
            dist1,
            dist2,
            with_phasing=with_phasing)

        # Barcode sets on each side of the breakpoint.
        bcx = set(fragsx["bc"])
        bcy = set(fragsy["bc"])

        common_barcodes = bcx.intersection(bcy)
        total_barcodes = bcx.union(bcy)
        cur_result["{}_total".format(sample.name)] = len(total_barcodes)

        # No shared barcodes: this sample contributes no evidence, so its
        # _shared/_p_fisher/_p_resampling columns are simply absent.
        if len(common_barcodes) < 1:
            continue

        good_bc_count = good_bc_counts_by_dataset[dataset.id]
        # 2x2 table: [shared, x-only; y-only, neither] over all good barcodes.
        contingency_table = numpy.array(
            [[len(common_barcodes), len(bcx - bcy)],
             [len(bcy - bcx), good_bc_count - len(total_barcodes)]])
        # One-sided test: is barcode sharing greater than expected by chance?
        p_fisher = scipy.stats.fisher_exact(contingency_table,
                                            alternative="greater")[1]
        p_resampling = structuralvariants.score_event(len(bcx),
                                                      len(bcy),
                                                      len(common_barcodes),
                                                      barcode_frequencies,
                                                      resamples=100)

        cur_result["{}_shared".format(sample.name)] = len(common_barcodes)

        cur_result["{}_p_fisher".format(sample.name)] = p_fisher
        cur_result["{}_p_resampling".format(sample.name)] = p_resampling

        if with_phasing:
            # hap code 1 is one haplotype; 0 and 2 are grouped as the other
            # — presumably 0 = unphased/other, 2 = second haplotype; confirm
            # against the producer of the hap_x/hap_y columns.
            cur_result["{}_x_hap0".format(
                sample.name)] = (merged["hap_x"].isin([0, 2])).sum()
            cur_result["{}_x_hap1".format(
                sample.name)] = (merged["hap_x"] == 1).sum()
            cur_result["{}_y_hap0".format(
                sample.name)] = (merged["hap_y"].isin([0, 2])).sum()
            cur_result["{}_y_hap1".format(
                sample.name)] = (merged["hap_y"] == 1).sum()

        # TODO: constants should be constant across steps!
        # Only samples passing both significance and sharing-fraction
        # thresholds contribute to the aggregate counts.
        if (p_resampling < 1e-4) and (
                len(common_barcodes) / float(len(total_barcodes)) > 0.10):
            cur_result["shared"] += len(common_barcodes)
            cur_result["total"] += len(total_barcodes)

    # Samples with no _p_resampling entry (skipped above) default to 1.0,
    # so the minimum reflects only samples with shared barcodes.
    cur_result["p_resampling"] = min(
        cur_result.get("{}_p_resampling".format(sample_name), 1.0)
        for sample_name in options.samples)

    return pandas.Series(cur_result)