Code example #1
    def __init__(self, in_bam_filename, bc_freq, bc_map, read_freq=None,
                 regions_file=None, extend=0):
        """
        in_bam_filename: BAM file
        bc_freq: numpy array with barcode frequencies
        bc_map: dict from barcodes to indices (in bc_freq and read_freq)
        read_freq: numpy array with read frequencies
        regions_file: BED file with target regions (or None). If provided, only
            reads from these regions will be counted (after extending these
            regions by "extend").
        extend: How much to extend target regions. Should be a non-negative
            integer.
        """
        self.bam = in_bam_filename
        self.bc_freq = bc_freq
        self.bc_map = bc_map
        self.read_freq = read_freq
        if regions_file is None:
            self.regions = None
        else:
            self.regions = {}
            extend = max(extend, 0)
            bed_iterator = tk_io.get_bed_iterator(regions_file)
            for chrom, start, stop in bed_iterator:
                start = max(0, start - extend)
                stop = stop + extend
                if chrom not in self.regions:
                    self.regions[chrom] = []
                self.regions[chrom].append((start, stop))

            # iteritems() is Python 2-only; items() works on both 2 and 3.
            for chrom, region_list in self.regions.items():
                self.regions[chrom] = tk_regions.Regions(region_list)
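
The only nontrivial work in this constructor is the region bookkeeping. A self-contained sketch of that logic with toy intervals, using a plain dict in place of the tenkit Regions wrapper (the intervals and extend value below are made up):

extend = 100
regions = {}
for chrom, start, stop in [('chr1', 150, 300), ('chr1', 900, 1200)]:
    start = max(0, start - extend)   # clamp at the chromosome start
    stop = stop + extend
    regions.setdefault(chrom, []).append((start, stop))
print(regions)   # {'chr1': [(50, 400), (800, 1300)]}
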
Code example #2
File: __init__.py  Project: umccr/longranger
def main(args, outs):
    vc_mode, variant_caller, precalled_file, gatk_path = tk_io.get_vc_mode(
        args.vc_precalled, args.variant_mode)
    locus = args.locus
    (chrom, start, stop) = tk_io.get_locus_info(locus)
    fasta_path = tk_reference.get_fasta(args.reference_path)

    bedfile = outs.default + ".bed"
    regions = Regions()
    if args.targets_file is not None:
        bed_iter = tk_io.get_bed_iterator(args.targets_file, args.locus)
        for (chrom, start, end) in bed_iter:
            regions.add_region((start, end))
    else:
        (chrom, start, stop) = tk_io.get_locus_info(args.locus)
        regions.add_region((start, stop))
    coverage_regions = None
    if vc_mode != "precalled" and args.high_coverage_excluded_bed is not None:
        coverage_regions = get_coverage_regions(args)
        regions = regions.intersect(coverage_regions)

    bed_length = 0
    with open(bedfile, 'w') as bed_writer:
        for region in regions.get_region_list():
            (start, end) = region
            bed_writer.write(chrom + "\t" + str(start) + "\t" + str(end) + "\n")
            bed_length += 1
    if vc_mode == "precalled" or vc_mode == "precalled_plus":
        outs.default = None
        precalled_vars_path = args.split_input
        vcf = tk_io.VariantFileReader(precalled_vars_path)
        with open(outs.precalled, "w") as file_write:
            output = tk_io.VariantFileWriter(
                file_write, template_file=open(precalled_vars_path))
            variant_iter = tk_io.get_variant_iterator_pos(
                vcf, bedfile, args.locus)
            for record in variant_iter:
                output.write_record(record)
    if not (vc_mode == "precalled"):
        outs.precalled = None
        primary_contigs = tk_reference.load_primary_contigs(
            args.reference_path)
        if bed_length > 0 and chrom in primary_contigs:
            vc.run_variant_caller(variant_caller, gatk_path, args.__mem_gb,
                                  fasta_path, args.input, outs.default,
                                  bedfile)
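
The Regions add/intersect pattern above can be exercised on its own. A minimal sketch, assuming Regions exposes add_region, intersect, and get_region_list as used in the example, and that intersect clips partial overlaps:

targets = Regions()
targets.add_region((100, 500))
targets.add_region((800, 1200))

acceptable = Regions()           # e.g. regions passing the coverage filter
acceptable.add_region((0, 1000))

# Keep only the target intervals (or the parts of them) inside acceptable.
for start, end in targets.intersect(acceptable).get_region_list():
    print(start, end)            # expected: 100 500, then 800 1000
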
Code example #3
def mask_profile(profile, bin_size, mask_bed_file, cutoff=0, keep_gt_cutoff=True):
    # Auxiliary function: NaN out the profile bin containing [start, end).
    # A chromosome missing from the profile makes .get() return None, which
    # fails in the subscript and is reported in the except branch.
    def mask_bin(masked_profile, chrom, start, end, bin_size):
        bin_index = int(math.floor(start / bin_size))
        try:
            masked_profile.get(chrom)[bin_index] = float("NaN")
        except Exception as e:
            print(chrom + ":" + str(start) + "-" + str(end) + " not found")
            print(e)
    masked_profile = copy.deepcopy(profile)
    # TODO: Implement
    # TODO: Test - the input dictionaries profile and mask must have identical keys
    # TODO: Test - for each key (chrName), the arrays profile[chrName] and mask[chrName]
    #       must have identical lengths
    mask_iterator = get_bed_iterator(bed_file_name=mask_bed_file, locus=None)
    for (chrom, start, end, value) in mask_iterator:
        # Mask bins on the wrong side of the cutoff: above it when
        # keep_gt_cutoff is False, at or below it when keep_gt_cutoff is True.
        if (value > cutoff) != keep_gt_cutoff:
            mask_bin(masked_profile, chrom, start, end, bin_size)
    return masked_profile
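
A toy end-to-end call, relying on mask_profile and its imports as defined above, and assuming profiles are dicts of per-chromosome numpy bin arrays and that get_bed_iterator yields the fourth BED column as a numeric value (as the four-way unpacking implies):

import numpy as np

bin_size = 1000
profile = {'chr1': np.arange(5, dtype=float)}   # 5 bins covering 0-4999

# Toy mask BED: one low-value and one high-value interval.
with open('mask.bed', 'w') as bed:
    bed.write('chr1\t2000\t3000\t0.1\n')
    bed.write('chr1\t4000\t5000\t0.9\n')

# keep_gt_cutoff=True keeps bins whose value exceeds the cutoff, so the
# 0.1 interval (bin 2) is masked and the 0.9 interval (bin 4) survives.
masked = mask_profile(profile, bin_size, 'mask.bed', cutoff=0.5,
                      keep_gt_cutoff=True)
print(masked['chr1'])   # [ 0.  1. nan  3.  4.]
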
Code example #4
def main(args, outs):
    args.coerce_strings()
    outs.coerce_strings()

    if args.coverage is None or args.bait_file is None:
        outs.bait_csv = None
        return

    f = h5py.File(args.coverage, 'r')
    has_basic = ('coverage_deduped' in f) and ('mapq30_coverage_deduped' in f)
    has_subsampling = (('coverage_subsampled' in f)
                       and ('coverage_deduped_subsampled' in f))
    f.close()
    if not has_basic:
        return

    fasta = tenkit.reference.open_reference(args.reference_path)

    df = p.DataFrame()
    coverage_reader = tenkit.hdf5.DataFrameReader(args.coverage)


    bed_iter = tk_io.get_bed_iterator(args.bait_file)
    for chrom, start, end in bed_iter:
        if has_subsampling:
            coverage = coverage_reader.query(
                [chrom, start, end],
                query_cols=['coverage_deduped', 'coverage_deduped_subsampled',
                            'coverage_subsampled', 'mapq30_coverage_deduped'],
                coords=False)
            mean_cov = coverage.mean()
            gc = get_gc(chrom, (start, end), fasta)
            df = df.append({
                'chrom': chrom,
                'start': start,
                'end': end,
                'tag': args.tag,
                'coverage_deduped': mean_cov['coverage_deduped'],
                'coverage_deduped_subsampled':
                    mean_cov['coverage_deduped_subsampled'],
                'coverage_subsampled': mean_cov['coverage_subsampled'],
                'mapq30_coverage_deduped': mean_cov['mapq30_coverage_deduped'],
                'gc': gc,
            }, ignore_index=True)
        else:
            coverage = coverage_reader.query(
                [chrom, start, end],
                query_cols=['coverage_deduped', 'mapq30_coverage_deduped'],
                coords=False)
            mean_cov = coverage.mean()
            gc = get_gc(chrom, (start, end), fasta)
            df = df.append({
                'chrom': chrom,
                'start': start,
                'end': end,
                'tag': args.tag,
                'coverage_deduped': mean_cov['coverage_deduped'],
                'mapq30_coverage_deduped': mean_cov['mapq30_coverage_deduped'],
                'gc': gc,
            }, ignore_index=True)

    df.to_csv(outs.bait_csv)
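
Note that DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the per-interval rows above would today be collected in a list and turned into a frame once. A minimal sketch of that pattern with placeholder coverage values (in the stage above they come from coverage_reader.query()):

import pandas as pd

rows = []
for chrom, start, end in [('chr1', 0, 1000), ('chr1', 1000, 2000)]:
    rows.append({'chrom': chrom, 'start': start, 'end': end,
                 'coverage_deduped': 30.2,          # placeholder
                 'mapq30_coverage_deduped': 28.9})  # placeholder
df = pd.DataFrame(rows)
df.to_csv('bait.csv')
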
Code example #5
def split(args):
    win = args.window_size
    input_bam = tk_bam.create_bam_infile(args.input)
    chroms = input_bam.references
    chrom_lengths = input_bam.lengths
    chrom_len_map = dict(zip(chroms, chrom_lengths))
    input_bam.close()

    max_mem_in_gb = 4  # Be a little conservative
    chunk_size = get_max_chunk(win, max_mem_in_gb)
    if args.restrict_locus is not None:
        locus_chrom, locus_start, locus_stop = tk_io.get_locus_info(
            args.restrict_locus)
        assert locus_chrom in chrom_len_map
        locus_start = max(0, locus_start)
        locus_stop = min(locus_stop, chrom_len_map[locus_chrom])

    primary_contigs = tk_reference.load_primary_contigs(args.reference_path)
    genome_size = np.sum([
        length for chrom, length in chrom_len_map.items()
        if chrom in primary_contigs
    ])

    prev_chrom = ''
    tot_bp = 0
    starts = []
    stops = []
    chunks = []

    # Genome-wide windows
    if args.restrict_locus is not None:
        chunks.append({
            'chrom': locus_chrom,
            'starts': [locus_start],
            'stops': [locus_stop],
            '__mem_gb': 8
        })
    else:
        for chrom, length in chrom_len_map.items():
            if (args.sex is not None and args.sex.lower() in ['f', 'female']
                    and chrom in ['Y', 'chrY']):
                continue
            if chrom not in primary_contigs:
                continue
            nchunks = int(np.ceil(length / float(chunk_size)))
            # Divide as evenly as possible the windows across the chunks
            # This also makes sure that all chunks except the last will
            # have sizes that are multiples of the window size.
            win_per_chunk = int(np.ceil(length / float(nchunks * win)))
            new_chunk_size = win_per_chunk * win
            for c in range(nchunks):
                chunk_start = c * new_chunk_size
                chunk_stop = min((c + 1) * new_chunk_size, length)
                chunks.append({
                    'chrom': chrom,
                    'starts': [chunk_start],
                    'stops': [chunk_stop],
                    '__mem_gb': 8
                })

    # Target-centered windows. If the targets (plus the extent) cover too much of the
    # genome, then skip these.
    if args.targets is not None and args.target_extend is not None:
        target_regions = []
        bed_iterator = tk_io.get_bed_iterator(args.targets)
        for chrom, start, stop in bed_iterator:
            if (args.sex is not None and args.sex.lower() in ['f', 'female']
                    and chrom in ['Y', 'chrY']):
                continue
            if chrom not in primary_contigs:
                continue
            stop = min(chrom_len_map[chrom], stop)
            if args.restrict_locus is None or (
                    chrom == locus_chrom
                    and overlaps(start, stop, locus_start, locus_stop)):
                target_regions.append((chrom, start, stop))

        target_regions = sort_and_merge(target_regions, args.target_extend)
        target_size = np.sum(
            [stop - start for _, start, stop in target_regions])

        if target_size / float(genome_size) < MIN_TARGET_FRAC:
            for (chrom, start, stop) in target_regions:
                # Flush the accumulated windows when moving to a new
                # chromosome or once the running chunk exceeds ~10 Mb.
                if (prev_chrom != chrom and prev_chrom != '') or tot_bp > 1e7:
                    chunks.append({
                        'chrom': str(prev_chrom),
                        'starts': starts,
                        'stops': stops,
                        '__mem_gb': 8
                    })
                    starts = []
                    stops = []
                    tot_bp = 0
                tot_bp += (stop - start)
                prev_chrom = chrom
                starts.append(start)
                stops.append(stop)

            if prev_chrom != '':
                chunks.append({
                    'chrom': str(prev_chrom),
                    'starts': starts,
                    'stops': stops,
                    '__mem_gb': 8
                })

    return {'chunks': chunks}
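
The even-chunking arithmetic in the genome-wide branch rewards a worked example. A standalone sketch with a chr1-sized contig, where the chunk size stands in for whatever get_max_chunk returns:

import numpy as np

length = 249250621        # roughly human chr1
win = 1000                # window size
chunk_size = 50000000     # stand-in for get_max_chunk(win, max_mem_in_gb)

nchunks = int(np.ceil(length / float(chunk_size)))           # -> 5
win_per_chunk = int(np.ceil(length / float(nchunks * win)))  # -> 49851
new_chunk_size = win_per_chunk * win                         # -> 49851000

for c in range(nchunks):
    chunk_start = c * new_chunk_size
    chunk_stop = min((c + 1) * new_chunk_size, length)
    # Every chunk except possibly the last spans a whole number of windows.
    print(chunk_start, chunk_stop)
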