Ejemplo n.º 1
0
def split(args):
    """Partition primary contigs into overlapping loci and group them
    into parallel chunk definitions."""
    assert (args.min_len >= 2 * args.bin_size)

    # The bam is consulted only for chromosome names and lengths.
    in_bam = tk_bam.create_bam_infile(args.possorted_bam)

    target_regions = None
    if args.targets is not None:
        with open(args.targets, 'r') as f:
            target_regions = tk_io.get_target_regions(f)

    primary_contigs = tk_reference.load_primary_contigs(args.reference_path)

    all_loci = []
    for chrom_name, chrom_size in zip(in_bam.references, in_bam.lengths):
        if chrom_name not in primary_contigs:
            continue
        # Overlap adjacent chunks by min_len so low-depth regions that
        # straddle a chunk boundary are not missed.
        all_loci.extend(
            generate_chrom_loci(target_regions,
                                chrom_name,
                                chrom_size,
                                PARALLEL_LOCUS_SIZE / 2,
                                overlap=args.min_len))
    in_bam.close()

    # Bundle the loci into balanced groups; one chunk per group.
    locus_sets = pack_loci(all_loci)

    return {'chunks': [{'loci': loci, '__mem_gb': 16} for loci in locus_sets]}
Ejemplo n.º 2
0
def do_blacklist_filtering(events, blacklist_map):
    """Check every event against each named blacklist BED file."""
    for bl_name, bl_list in blacklist_map.iteritems():
        with open(bl_list) as bl_handle:
            blacklist = tk_io.get_target_regions(bl_handle)
        for event in events:
            event.check_blacklist(blacklist, bl_name)
Ejemplo n.º 3
0
def main(args, outs):
    """Write per-window confident-region coverage fractions.

    For each locus in args.loci, tiles [start, end) with windows of
    args.window_size and writes one row per window:
    chrom, window start, window end, fraction of the window covered by
    the confident-regions BED.
    """
    args.coerce_strings()
    outs.coerce_strings()

    if args.confident_regions is None:
        confident_regions = None
    else:
        # Use a context manager; the original leaked this file handle.
        with open(args.confident_regions) as conf_file:
            confident_regions = tk_io.get_target_regions(conf_file)

    # Context manager guarantees the output is closed even on error.
    with open(outs.confident_windows, "w") as outfile:
        for (chrom, start, end) in (tk_io.get_locus_info(l)
                                    for l in args.loci):
            conf_regions = get_conf_regions(chrom, confident_regions)
            location = start
            while location < end:
                # Fraction of this window intersecting confident regions.
                region = tk_regions.Regions(regions=[(location, location +
                                                      args.window_size)])
                isect = region.intersect(conf_regions)
                size = isect.get_total_size()
                percent = tk_stats.robust_divide(float(size),
                                                 float(args.window_size))
                row = [chrom, location, location + args.window_size, percent]
                outfile.write("\t".join(map(str, row)) + "\n")
                location += args.window_size
Ejemplo n.º 4
0
def get_coverage_regions(args):
    """Return the high-coverage-excluded regions on the locus chromosome.

    Returns an empty Regions when the BED has no entries for the chromosome.
    """
    (chrom, start, stop) = tk_io.get_locus_info(args.locus)
    # Close the BED handle (was leaked), and test for a missing chromosome
    # BEFORE wrapping in Regions: the original checked `regions == None`
    # only after construction, so the fallback could never trigger.
    with open(args.high_coverage_excluded_bed) as bed_file:
        chrom_regions = tk_io.get_target_regions(bed_file).get(chrom)
    if chrom_regions is None:
        return Regions()
    return Regions(chrom_regions)
Ejemplo n.º 5
0
def annotate_bed_info(counts, bed_file):
    """Set c['in_bed'] on each count record: True iff its position lies
    inside a region of bed_file on its chromosome."""
    # Context manager closes the BED handle (the original leaked it).
    with open(bed_file) as f:
        regs = tk_io.get_target_regions(f)

    for c in counts:
        # dict.get + is-None test replaces the deprecated has_key().
        chrom_regions = regs.get(c['chrom'])
        c['in_bed'] = (chrom_regions is not None
                       and chrom_regions.contains_point(c['pos']))
Ejemplo n.º 6
0
def merge(bed1, bed2, bedOut):
    """Union the regions of two BED files and write the result to bedOut."""
    if not bed2:
        # Nothing to merge with; pass bed1 through unchanged.
        shutil.copyfile(bed1, bedOut)
        return

    with open(bed1) as handle:
        merged = tk_io.get_target_regions(handle)
    with open(bed2) as handle:
        extra = tk_io.get_target_regions(handle)

    # Fold every region of the second file into the first file's dict.
    for chrom, regions in extra.items():
        if chrom not in merged:
            merged[chrom] = tk_regions.Regions([])
        for start, end in regions:
            merged[chrom].add_region((start, end))

    writeOut(merged, bedOut)
Ejemplo n.º 7
0
def no_overlap(bed1, bed2, bedOut):
    """Write the regions of bed1 that overlap nothing in bed2 to bedOut."""
    if not bed2:
        # No exclusion file: bed1 passes through unchanged.
        shutil.copyfile(bed1, bedOut)
        return

    with open(bed1) as handle:
        keep = tk_io.get_target_regions(handle)
    with open(bed2) as handle:
        exclude = tk_io.get_target_regions(handle)

    result = {}
    for chrom, regions in keep.items():
        result.setdefault(chrom, tk_regions.Regions([]))
        excl = exclude.get(chrom)
        for start, end in regions:
            # Keep the region unless the exclusion set overlaps it.
            if excl is None or not excl.overlaps_region(start, end):
                result[chrom].add_region((start, end))

    writeOut(result, bedOut)
Ejemplo n.º 8
0
def intersect(bed1, bed2, bedOut):
    """Write the per-chromosome intersection of two BED files to bedOut."""
    if not bed2:
        # No second file: bed1 is the whole result.
        shutil.copyfile(bed1, bedOut)
        return

    with open(bed1) as handle:
        regions1 = tk_io.get_target_regions(handle)
    with open(bed2) as handle:
        regions2 = tk_io.get_target_regions(handle)

    # Only chromosomes present in both files can contribute.
    bed_dict_intersect = {
        chrom: regions1[chrom].intersect(regions2[chrom])
        for chrom in regions1 if chrom in regions2
    }

    writeOut(bed_dict_intersect, bedOut)
Ejemplo n.º 9
0
def main(args, outs):
    """Build a sparse peak x barcode count matrix from a fragments file.

    Counts, per (peak, barcode) pair, how many fragment endpoints (start
    and stop, counted independently) fall inside the peak, then saves the
    matrix as a CountMatrix h5 file. Sets raw_matrix to None when the
    fragments input or barcode list is missing/empty.
    """
    args.coerce_strings()
    outs.coerce_strings()

    outs.raw_matrix_mex = None
    if args.fragments is None:
        outs.raw_matrix = None
        return

    # Peaks twice: once as chrom-indexed regions for point lookups, once
    # as a "chrom:start-end" -> row-index map in BED file order.
    with open(args.peaks, 'r') as infile:
        full_peaks = tk_bio.get_target_regions(infile)
    with open(args.peaks, 'r') as pfile:
        peaks_dict = OrderedDict(
            ("{}:{}-{}".format(*peak.strip("\n").split("\t")), num)
            for num, peak in enumerate(pfile))

    # Barcode sequence -> column index, in whitelist file order.
    with open(args.barcodes, 'r') as barcode_file:
        barcodes_dict = OrderedDict(
            (bc.strip('\n'), num) for num, bc in enumerate(barcode_file))

    if len(barcodes_dict) == 0:
        outs.raw_matrix = None
        return

    # get matrix counts
    peak_bc_counts = Counter()
    for contig, start, stop, barcode, _ in open_fragment_file(args.fragments):
        if barcode not in barcodes_dict:
            continue
        # Both endpoints of a fragment contribute to the counts.
        for pos in (start, stop):
            if contig in full_peaks.keys():
                peak = full_peaks[contig].get_region_containing_point(pos)
                if peak is not None:
                    peak_bc_counts[barcodes_dict[barcode],
                                   peaks_dict['{}:{}-{}'.format(
                                       contig, peak[0], peak[1])]] += 1

    # Assemble (value, barcode-index, peak-index) triples; empty tuples
    # produce an all-zero matrix of the right shape.
    data, col, row = (), (), ()
    if len(peak_bc_counts) > 0:
        data, col, row = zip(*[(val, key[0], key[1])
                               for key, val in peak_bc_counts.iteritems()])
    sp_matrix = csc_matrix(
        coo_matrix((data, (row, col)),
                   shape=(len(peaks_dict), len(barcodes_dict)),
                   dtype=int))

    # save as a CountMatrix
    genomes = utils.generate_genome_tag(args.reference_path)
    peaks_def = atac_feature_ref.from_peaks_bed(args.peaks, genomes)
    raw_matrix = cr_matrix.CountMatrix(peaks_def, barcodes_dict.keys(),
                                       sp_matrix)
    raw_matrix.save_h5_file(outs.raw_matrix,
                            sw_version=martian.get_pipelines_version())
Ejemplo n.º 10
0
def subtract(bed1, bed2, bedOut):
    """Write bed1 minus the regions of bed2 to bedOut."""
    if not bed2:
        # Nothing to subtract; copy bed1 through.
        shutil.copyfile(bed1, bedOut)
        return

    with open(bed1) as handle:
        minuend = tk_io.get_target_regions(handle)
    with open(bed2) as handle:
        subtrahend = tk_io.get_target_regions(handle)

    result = {}
    for chrom, regions in minuend.items():
        result.setdefault(chrom, tk_regions.Regions([]))
        for start, end in regions:
            overlappings = []
            if chrom in subtrahend:
                overlappings = subtrahend[chrom].overlapping_regions(
                    start, end)
            # Keep the pieces of [start, end) left after removing overlaps.
            for piece in interval_subtract(start, end, overlappings):
                result[chrom].add_region(piece)

    writeOut(result, bedOut)
Ejemplo n.º 11
0
def split(args):
    """Build parallel chunk definitions covering every contig in the bam."""
    in_bam = tk_bam.create_bam_infile(args.possorted_bam)

    # Load pull-down targets, if any were supplied.
    target_regions = None
    if args.targets is not None:
        with open(args.targets, 'r') as f:
            target_regions = tk_io.get_target_regions(f)

    all_loci = []
    for chrom_name, chrom_size in zip(in_bam.references, in_bam.lengths):
        new_loci = generate_chrom_loci(target_regions, chrom_name,
                                       chrom_size, PARALLEL_LOCUS_SIZE)
        all_loci.extend(new_loci)
    in_bam.close()

    # Bundle the loci into balanced groups; one chunk per group.
    locus_sets = pack_loci(all_loci)

    return {'chunks': [{'loci': loci, '__mem_gb': 16} for loci in locus_sets]}
Ejemplo n.º 12
0
def main_report_basic(args, outs):
    """Compute basic mapping/quality statistics over one bam chunk.

    Saves the misc summary via its save() method and pickles the quality
    summaries to outs.qual_sms.
    """
    bam_in = pysam.Samfile(args.input, check_sq=False)
    targets_filename = args.targets_file
    references = bam_in.references

    if args.input_pos is not None:
        bam_in_pos = tk_bam.create_bam_infile(args.input_pos)
        n_mapped = bam_in_pos.mapped
        # True division before ceil: with two ints, the original
        # n_mapped / args.n_chunks truncated first, making math.ceil a
        # no-op and undercounting reads per chunk.
        n_chunk = math.ceil(float(n_mapped) / args.n_chunks)
        bam_in_pos.close()
    else:
        n_mapped = 0
        n_chunk = 0

    if targets_filename is None or targets_filename == '':
        target_regions = None
    else:
        # Context manager closes the targets file (was leaked).
        with open(targets_filename, 'r') as targets_file:
            target_regions = tk_io.get_target_regions(targets_file)

    if args.barcode_whitelist:
        barcode_whitelist = bc_utils.load_barcode_whitelist(
            args.barcode_whitelist)
    else:
        barcode_whitelist = None

    bam_slice = tk_bam.read_bam_chunk(bam_in,
                                      (args.chunk_start, args.chunk_end))

    # do basic counting
    misc_sm, qual_sms = compute_basic_stats(bam_slice,
                                            target_regions,
                                            n_chunk,
                                            references,
                                            barcode_whitelist)

    misc_sm.save(outs.misc_sm)
    with open(outs.qual_sms, 'wb') as out_handle:
        pickle.dump(qual_sms, out_handle)
Ejemplo n.º 13
0
def main_report_single_partition(args, outs):
    """Summarize fragments per barcode for one chunk of a bam file.

    Groups reads by barcode, summarizes each barcode's fragments, and
    writes sorted/indexed fragment and barcode HDF5 tables. Emits no
    fragments when inputs are missing or the genome is small (< 3 Mb).
    """
    # Bail out if there no valid barcodes
    if args.barcode_whitelist is None or args.input is None:
        outs.fragments = None
        return

    bam_in = tk_bam.create_bam_infile(args.input)

    if args.targets_file is None:
        target_regions = None
    else:
        # NOTE(review): this file handle is never explicitly closed.
        target_regions = tk_io.get_target_regions(open(args.targets_file))

    # Bail out if we're on a small genome
    if sum(bam_in.lengths) < 3e6:
        outs.fragments = None
        return

    bam_chunk = tk_bam.read_bam_chunk(bam_in,
                                      (args.chunk_start, args.chunk_end))
    # Skip reads without a barcode
    bam_chunk_filt = itertools.ifilter(read_has_barcode, bam_chunk)
    # groupby only groups adjacent reads: assumes same-barcode reads are
    # contiguous in the chunk -- TODO confirm upstream ordering.
    bc_read_iter = itertools.groupby(bam_chunk_filt,
                                     lambda x: tk_io.get_read_barcode(x))

    # Lazily summarize each barcode group, dropping empty summaries.
    bc_data = (summarize_barcode(bc, list(reads), args.read_link_distance,
                                 bam_in.references, target_regions)
               for (bc, reads) in bc_read_iter)
    bc_data_filt = (x for x in bc_data if x is not None)

    frag_tbl, bc_tbl = make_output_dataframes(bc_data_filt)
    if frag_tbl is not None:
        # Sort and index fragment table, so that we can combine the fragments files per-chromosome to reduce memory consumption
        frag_tbl.sort(['chrom', 'start_pos'], inplace=True)
        tenkit.hdf5.write_data_frame(outs.fragments, frag_tbl)
        tenkit.hdf5.create_tabix_index(outs.fragments, 'chrom', 'start_pos',
                                       'end_pos')
    if bc_tbl is not None:
        tenkit.hdf5.write_data_frame(outs.barcodes, bc_tbl)
Ejemplo n.º 14
0
def main(args, outs):
    """Curate blacklists/whitelists and ground-truth deletion subsets.

    Non-WGS mode: assembles the blacklist map (low-coverage, segdup, and
    1000G accessibility whitelists), derives filtered target regions for
    the homo- and het-deletion callers, and -- when a WGS deletions
    ground-truth file is supplied -- writes per-caller sensitivity (TIER=1
    events only) and PPV truth files restricted to the query regions.
    WGS mode: writes only the TIER=1 truth subsets and nulls the rest.
    """

    if not args.wgsmode:
        # blacklist curation
        if args.blacklist_map:
            # If we got an explicit blacklist, use it
            blacklist_map = args.blacklist_map
        else:
            blacklist_map = {}

        # We did not get an explicit blacklist -- in this case, combine the built-in segdup file and the homo_del_blacklist
        # Combine the homo_del_blacklist with the internal segdup file to get the filter set
        if not "LowCov" in blacklist_map:
            blacklist_map["LowCov"] = args.low_coverage_blacklist

        genome = tk_reference.get_genome(args.reference_path)
        if not "SEGDUP" in blacklist_map:
            cnv_segdups = tenkit.constants.find_sv_file(
                genome, "cnv_segdup_filter.bed")
            blacklist_map["SEGDUP"] = cnv_segdups

        if not "whitelist" in blacklist_map: blacklist_map["whitelist"] = {}
        if not "h**o" in blacklist_map["whitelist"]:
            pilot_accs = tenkit.constants.find_sv_file(
                genome, "20141020.pilot_mask.whole_genome.bed")
            blacklist_map["whitelist"]["h**o"] = pilot_accs
        if not "het" in blacklist_map["whitelist"]:
            strict_accs = tenkit.constants.find_sv_file(
                genome, "20141020.strict_mask.whole_genome.bed")
            blacklist_map["whitelist"]["het"] = strict_accs

        outs.blacklist_map = blacklist_map

        # generate filtered target regions for het del and h**o del callers
        ## h**o
        if "LowCov" in blacklist_map:
            bedtools.no_overlap(args.target_regions, blacklist_map["LowCov"],
                                outs.hom_del_query_region + "_tmp.bed")
        else:
            shutil.copyfile(args.target_regions,
                            outs.hom_del_query_region + "_tmp.bed")
        bedtools.overlap(outs.hom_del_query_region + "_tmp.bed",
                         blacklist_map["whitelist"]["h**o"],
                         outs.hom_del_query_region)
        #shutil.copyfile(outs.hom_del_query_region+"_tmp.bed", outs.hom_del_query_region)

        ## het
        bedtools.no_overlap(outs.hom_del_query_region + "_tmp.bed",
                            blacklist_map["SEGDUP"],
                            outs.het_del_query_region + "_tmp.bed")
        bedtools.overlap(outs.het_del_query_region + "_tmp.bed",
                         blacklist_map["whitelist"]["het"],
                         outs.het_del_query_region)

        #         ##
        #         with open(blacklist_map["whitelist"]["h**o"]) as f:
        #             accs_1000g_pilot = tk_io.get_target_regions(f)
        #         with open(blacklist_map["whitelist"]["het"]) as f:
        #             accs_1000g_strict = tk_io.get_target_regions(f)

        if args.wgs_deletions_gt:
            ## het evetns
            with open(outs.het_del_query_region) as f:
                bed_target = tk_io.get_target_regions(f)

            fhet_sen = open(outs.het_gt_sen, "w")
            fhet_ppv = open(outs.het_gt_ppv, "w")

            with open(args.wgs_deletions_gt) as f:
                for line in f:
                    if line[0] == "#": continue
                    infos = line.strip().split()
                    # GT record: chrom in col 0, start in col 1, end in col 5.
                    chrom, start, end = infos[0], int(infos[1]), int(infos[5])
                    if chrom in bed_target:
                        overlappings = bed_target[chrom].overlapping_regions(
                            start, end)
                        overlap_size = 0
                        for s, e in overlappings:
                            overlap_size += (min(e, end) - max(s, start))
                        # A het event counts if enough of it lies in the
                        # query regions (min_overlap bases total).
                        if overlap_size >= args.min_overlap:  #and \
                            #chrom in accs_1000g_strict and\
                            #accs_1000g_strict[chrom].overlaps_region(start, end):
                            record = "\t".join(
                                (infos[0], infos[1], infos[5])) + "\n"
                            if ("HET" in line) and ("TIER=1" in line):
                                fhet_sen.write(record)
                            fhet_ppv.write(record)

            ## h**o events
            with open(outs.hom_del_query_region) as f:
                bed_target = tk_io.get_target_regions(f)

            fhom_sen = open(outs.hom_gt_sen, "w")
            fhom_ppv = open(outs.hom_gt_ppv, "w")

            with open(args.wgs_deletions_gt) as f:
                for line in f:
                    if line[0] == "#": continue
                    infos = line.strip().split()
                    chrom, start, end = infos[0], int(infos[1]), int(infos[5])
                    if chrom in bed_target:
                        overlappings = bed_target[chrom].overlapping_regions(
                            start, end)
                        # A h**o event counts only if it fully contains at
                        # least one query region (1 bp slack on each side).
                        has_full_exon = False
                        for s, e in overlappings:
                            if start <= s + 1 and end >= e - 1:
                                print start, end, s, e
                                has_full_exon = True
                                break

                        if has_full_exon:  #and \
                            #chrom in accs_1000g_pilot and\
                            #accs_1000g_pilot[chrom].overlaps_region(start, end):
                            record = "\t".join(
                                (infos[0], infos[1], infos[5])) + "\n"
                            if ("HOM" in line) and ("TIER=1" in line):
                                fhom_sen.write(record)
                            fhom_ppv.write(record)

            fhet_sen.flush()
            fhet_sen.close()
            fhet_ppv.flush()
            fhet_ppv.close()
            fhom_sen.flush()
            fhom_sen.close()
            fhom_ppv.flush()
            fhom_ppv.close()

        else:
            outs.het_gt_sen = None
            outs.het_gt_ppv = None
            outs.hom_gt_sen = None
            outs.hom_gt_ppv = None

    else:
        outs.hom_gt_sen = None
        outs.hom_gt_ppv = None
        outs.het_del_query_region = None
        outs.hom_del_query_region = None
        outs.blacklist_map = None
        # NOTE(review): these files are opened (and created on disk) before
        # the wgs_deletions_gt check; when the else branch below runs they
        # are never closed and the outs are nulled afterwards -- verify.
        fhet_sen = open(outs.het_gt_sen, "w")
        fhet_ppv = open(outs.het_gt_ppv, "w")

        if args.wgs_deletions_gt:
            with open(args.wgs_deletions_gt) as f:
                for line in f:
                    if line[0] == "#": continue
                    infos = line.strip().split()
                    record = "\t".join((infos[0], infos[1], infos[5])) + "\n"
                    if "TIER=1" in line:
                        fhet_sen.write(record)
                    fhet_ppv.write(record)

            fhet_sen.flush()
            fhet_sen.close()
            fhet_ppv.flush()
            fhet_ppv.close()
        else:
            outs.het_gt_sen = None
            outs.het_gt_ppv = None
Ejemplo n.º 15
0
def do_homo_whiltelist_filtering(events, whitelist_file, whitelist_name):
    """Check every event against the regions of one whitelist BED file."""
    with open(whitelist_file) as wl_handle:
        whitelist = tk_io.get_target_regions(wl_handle)
    for event in events:
        event.check_whitelist(whitelist, whitelist_name)
Ejemplo n.º 16
0
def read_bed_file(bed):
    """Parse a BED file into a chrom-indexed dict of tk_regions.Regions."""
    with open(bed) as handle:
        return tk_io.get_target_regions(handle)
Ejemplo n.º 17
0
def main(args, outs):

    if args.fragments is None:
        outs.bc_cnv = None
        outs.bc_large_cnv = None
        return

    rust_env = os.environ.copy()
    rust_env["RUST_BACKTRACE"] = "1"
    final_blacklist = lr_gt.get_genomic_track(args.blacklist, "terminal_cnv",
                                              args.reference_path,
                                              "default_blacklist.bed")
    if final_blacklist is None:
        final_blacklist = args.possorted_bam + "_tmp"
        open(final_blacklist, 'w').close()

    if args.subcommand == "bc" and args.fragments:
        frag_csv = outs.bc_cnv + ".csv"
        bin_size, frag_version = convert_fragments_to_csv(
            args.fragments, frag_csv, args.bin_size, args.allow_bin_size_adj)
        cnv_args = [
            'hmm-bc-cnv', args.subcommand, frag_csv, args.possorted_bam,
            final_blacklist, outs.bc_cnv, "--fragver",
            str(frag_version), "--binsize",
            str(bin_size), "--probchange",
            str(args.status_change_penalty), "--minprob",
            str(args.min_prob)
        ]
    elif args.subcommand == "read":
        cnv_args = [
            'hmm-bc-cnv', args.subcommand, args.possorted_bam, final_blacklist,
            outs.bc_cnv, "--binsize",
            str(args.bin_size), "--probchange",
            str(args.status_change_penalty)
        ]
    elif args.subcommand == "asread":
        frag_csv = outs.bc_cnv + ".csv"
        bin_size, frag_version = convert_fragments_to_csv(
            args.fragments, frag_csv, args.bin_size, args.allow_bin_size_adj)
        cnv_args = [
            'hmm-bc-cnv', args.subcommand, frag_csv, args.possorted_bam,
            final_blacklist, outs.bc_cnv, "--fragver",
            str(frag_version), "--binsize",
            str(bin_size), "--probchange",
            str(args.status_change_penalty), "--minprob",
            str(args.min_prob)
        ]

    print cnv_args
    subprocess.check_call(cnv_args, env=rust_env)
    outs.final_bin_size = bin_size

    chroms = []
    starts1 = []
    end1 = []
    starts2 = []
    end2 = []
    info_strs = []
    quals = []

    primary_contigs = tk_reference.load_primary_contigs(args.reference_path)

    spikes = tk_io.get_target_regions(open(args.spikes))
    with open(outs.bc_cnv) as fin:
        for line in fin.readlines():
            if line.startswith('#') or line.startswith(
                    'browser') or line.startswith('track') or line.startswith(
                        '-browser') or line.startswith('-track'):
                continue
            infos = line.strip().split("\t")
            cp = int(infos[3])
            ch = infos[0]
            s = int(infos[1])
            e = int(infos[2])

            # Some basic filtering
            if primary_contigs and ch not in primary_contigs:
                continue

            if cp == 2 or (e - s) < args.minimal_cnv_size:
                continue

            if cp > 2:
                if ch not in spikes: continue
                overlaps = spikes[ch].overlapping_regions(
                    max(0, s - bin_size), e + bin_size)
                ln = len(overlaps)
                if ln > 0 and \
                    overlap(s-bin_size, s+bin_size, overlaps[0][0], overlaps[0][1]) and \
                    overlap(e-bin_size, e+bin_size, overlaps[ln-1][0], overlaps[ln-1][1]):
                    continue

            chroms.append(infos[0])
            starts1.append(s)
            end1.append(s + 1)
            starts2.append(e)
            end2.append(e + 1)
            pval = float(infos[4])
            #if pval > args.max_pval:
            #    continue
            if pval < 1e-100:
                qual = 1000
            else:
                qual = int(-log10(pval) * 10)
            quals.append(qual)
            if cp > 2:
                info_strs.append('TYPE=DUP;COPY=%d' % cp)
            elif cp < 2:
                info_strs.append('TYPE=DEL;COPY=%d' % cp)

    sv_df = tk_sv_io.create_sv_df(chroms,
                                  starts1,
                                  end1,
                                  chroms,
                                  starts2,
                                  end2,
                                  np.arange(len(chroms)),
                                  quals,
                                  info_strs=info_strs)
    tk_sv_io.write_sv_df_to_bedpe(sv_df, outs.bc_large_cnv)