Code Example #1
File: __init__.py Project: umccr/longranger
def main(args, outs):
    vc_mode, variant_caller, precalled_file, gatk_path = tk_io.get_vc_mode(
        args.vc_precalled, args.variant_mode)
    locus = args.locus
    (chrom, start, stop) = tk_io.get_locus_info(locus)
    fasta_path = tk_reference.get_fasta(args.reference_path)

    bedfile = outs.default + ".bed"
    regions = Regions()
    if args.targets_file is not None:
        for (chrom, start, end) in tk_io.get_bed_iterator(args.targets_file, args.locus):
            regions.add_region((start, end))
    else:
        (chrom, start, stop) = tk_io.get_locus_info(args.locus)
        regions.add_region((start, stop))
    coverage_regions = None
    if vc_mode != "precalled" and args.high_coverage_excluded_bed is not None:
        coverage_regions = get_coverage_regions(args)
        regions = regions.intersect(coverage_regions)

    bed_length = 0
    with open(bedfile, 'w') as bed_writer:
        for region in regions.get_region_list():
            (start, end) = region
            bed_writer.write(chrom + "\t" + str(start) + "\t" + str(end) +
                             "\n")
            bed_length += 1
    if vc_mode == "precalled" or vc_mode == "precalled_plus":
        outs.default = None
        precalled_vars_path = args.split_input
        vcf = tk_io.VariantFileReader(precalled_vars_path)
        with open(outs.precalled, "w") as file_write:
            output = tk_io.VariantFileWriter(
                file_write, template_file=open(precalled_vars_path))
            variant_iter = tk_io.get_variant_iterator_pos(
                vcf, bedfile, args.locus)
            for record in variant_iter:
                output.write_record(record)
    if vc_mode != "precalled":
        outs.precalled = None
        primary_contigs = tk_reference.load_primary_contigs(
            args.reference_path)
        if bed_length > 0 and chrom in primary_contigs:
            vc.run_variant_caller(variant_caller, gatk_path, args.__mem_gb,
                                  fasta_path, args.input, outs.default,
                                  bedfile)
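Every example on this page begins by unpacking a locus string via tk_io.get_locus_info. As a mental model only (not tenkit's actual implementation, whose locus syntax and validation may differ), a minimal parser for a hypothetical "chrom:start-stop" string could look like this:

# Hypothetical sketch; the real tk_io.get_locus_info may use a different
# separator or do extra validation.
def parse_locus(locus):
    chrom, _, span = locus.partition(':')
    start, _, stop = span.partition('-')
    return (str(chrom), int(start), int(stop))

# parse_locus("chr1:10000-20000") -> ("chr1", 10000, 20000)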
Code Example #2
def call_haploid(haplotype, bam, locus, reference_path, variant_caller,
                 gatk_path, mem_gb):
    bam_name = "hap" + str(haplotype) + ".bam"
    haploid_bam, _ = tenkit.bam.create_bam_outfile(bam_name,
                                                   None,
                                                   None,
                                                   template=bam)
    (chrom, start, stop) = tk_io.get_locus_info(locus)
    for read in bam.fetch(chrom, start, stop):
        readhap = dict(read.tags).get('HP')
        if readhap is not None and int(readhap) == haplotype:
            haploid_bam.write(read)
    haploid_bam.close()
    tk_bam.index(bam_name)
    tmp_vcf_name = "tmp_hap" + str(haplotype) + ".vcf"
    vcf_name = "hap" + str(haplotype) + ".vcf"

    fasta_path = tk_ref.get_fasta(reference_path)
    vc.run_variant_caller(variant_caller,
                          gatk_path,
                          mem_gb,
                          fasta_path,
                          bam_name,
                          tmp_vcf_name,
                          haploid_mode=True)

    longranger.variants.canonicalize(tmp_vcf_name, vcf_name)
    tenkit.tabix.index_vcf(vcf_name)
    bam_in = tk_bam.create_bam_infile(bam_name)
    return (vcf_name + ".gz", bam_in)
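A sketch of how call_haploid might be driven for both haplotypes, assuming reads are tagged HP=1 and HP=2 as the loop above implies (hypothetical usage, not taken from the source):

# Hypothetical driver: one VCF/BAM pair per haplotype.
vcf_hap1, bam_hap1 = call_haploid(1, bam, locus, reference_path,
                                  variant_caller, gatk_path, mem_gb)
vcf_hap2, bam_hap2 = call_haploid(2, bam, locus, reference_path,
                                  variant_caller, gatk_path, mem_gb)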
Code Example #3
File: __init__.py Project: umccr/longranger
def main(args, outs):
    if args.locus is None:
        outs.spikes = None
        return

    if args.mean is None or (not args.mean):
        outs.spikes = None
        return

    (chrom, start, stop) = tk_io.get_locus_info(args.locus)
    chrom = str(chrom)
    cov = tenkit.hdf5.read_data_frame_indexed(args.coverage,
                                              [(chrom, start, stop)],
                                              query_cols=['pos', 'coverage'])
    #cov.coverage = tk_stats.robust_divide(cov.coverage, args.mean)
    # args.mean should only be zero if something else went wrong upstream,
    # e.g. a missing args.cov_hist
    cov.coverage /= max(1.0, args.mean)
    cov = cov[cov.coverage > 10]
    spikes_pos = cov["pos"].values
    breaks = list(np.where(np.diff(spikes_pos) != 1)[0] + 1)

    if len(spikes_pos) > 0:
        # the same formulas handle the break-free case (one contiguous run);
        # keying on len(breaks) would silently drop such a run
        starts = [spikes_pos[b] for b in [0] + breaks]
        ends = [spikes_pos[b - 1] + 1 for b in breaks + [len(spikes_pos)]]
    else:
        starts = []
        ends = []

    with open(outs.spikes, "w") as fout:
        for s, e in zip(starts, ends):
            fout.write(chrom + "\t" + str(s) + "\t" + str(e) + "\n")
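The run-boundary arithmetic above is compact; a self-contained worked example (plain numpy) shows how np.diff splits sorted positions into contiguous runs:

import numpy as np

spikes_pos = np.array([5, 6, 7, 20, 21, 40])
breaks = list(np.where(np.diff(spikes_pos) != 1)[0] + 1)            # [3, 5]
starts = [spikes_pos[b] for b in [0] + breaks]                      # [5, 20, 40]
ends = [spikes_pos[b - 1] + 1 for b in breaks + [len(spikes_pos)]]  # [8, 22, 41]
# i.e. the half-open BED intervals [5, 8), [20, 22), [40, 41)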
Code Example #4
def main(args, outs):
    args.coerce_strings()
    outs.coerce_strings()

    if args.confident_regions is None:
        confident_regions = None
    else:
        confident_regions = tk_io.get_target_regions(
            open(args.confident_regions))

    outfile = open(outs.confident_windows, "w")
    for (chrom, start, end) in (tk_io.get_locus_info(l) for l in args.loci):
        conf_regions = get_conf_regions(chrom, confident_regions)
        location = start
        while location < end:
            region = tk_regions.Regions(regions=[(location, location + args.window_size)])
            isect = region.intersect(conf_regions)
            size = isect.get_total_size()
            percent = tk_stats.robust_divide(float(size),
                                             float(args.window_size))
            row = [chrom, location, location + args.window_size, percent]
            outfile.write("\t".join(map(str, row)) + "\n")
            location += args.window_size
    outfile.close()
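For a single window the confident fraction works out as follows (illustrative values, using the same tk_regions calls that appear above):

# A 1 kb window overlapping two confident intervals:
window = tk_regions.Regions(regions=[(1000, 2000)])
confident = tk_regions.Regions(regions=[(0, 1250), (1500, 3000)])
covered = window.intersect(confident).get_total_size()  # 250 + 500 = 750
percent = covered / 1000.0                              # 0.75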
Code Example #5
File: __init__.py Project: umccr/longranger
def main(args, outs):
    args.coerce_strings()
    outs.coerce_strings()
    input_vfr = tk_io.VariantFileReader(args.input)

    bc_mix_prob = args.bc_mix_prob
    min_var_hap_conf = args.min_var_hap_conf
    min_junction_hap_conf = args.min_junction_hap_conf
    hap_block_size = args.hap_block_size
    hap_block_buffer_size = args.hap_block_buffer_size
    max_reassign_rounds = args.max_reassign_rounds
    chrom, start, stop = tk_io.get_locus_info(args.locus)

    output_file = open(outs.default.strip('.gz'), 'w')
    fragment_output_file = open(outs.fragment_phasing.strip('.gz'), 'w')
    vc_mode, _, _, _ = tk_io.get_vc_mode(args.vc_precalled, args.vc_mode)

    # Add the component name and the version of the phasing code
    new_source = "10X/pipelines/stages/snpindels/phase_snpindels %s" % martian.get_pipelines_version(
    )
    new_filters = [
        ("10X_PHASING_INCONSISTENT",
         "Uses haplotype information from the fragments and the alleles to filter some variants that are not consistent with phasing."),
        ("10X_HOMOPOLYMER_UNPHASED_INSERTION",
         "Unphased insertions in homopolymer regions tend to be false positives"),
    ]
    new_formats = [
        ("PS", 1, "Integer", "ID of Phase Set for Variant"),
        ("PQ", 1, "Integer",
         "Phred QV indicating probability that this variant is incorrectly phased"),
        ("JQ", 1, "Integer",
         "Phred QV indicating probability of a phasing switch error in the gap prior to this variant"),
    ]
    vfw = tk_io.VariantFileWriter(output_file,
                                  template_file=open(args.input),
                                  new_source=new_source,
                                  new_format_fields=new_formats,
                                  new_filters=new_filters)
    if args.do_phasing:
        phaser = Phaser(input_vfr, args.fragments, chrom, start, stop,
                        bc_mix_prob, min_junction_hap_conf, min_var_hap_conf,
                        hap_block_buffer_size, hap_block_size,
                        max_reassign_rounds, vc_mode)
        phaser.call_haps(vfw, fragment_output_file)
    else:
        pass_variants(input_vfr,
                      vfw,
                      chrom,
                      start,
                      stop,
                      strip_phasing_info=True)
    output_file.close()
    fragment_output_file.close()

    tk_tabix.sort_unique_tabix_vcf(outs.default.strip('.gz'))
Code Example #6
File: __init__.py Project: umccr/longranger
def get_coverage_regions(args):
    (chrom, start, stop) = tk_io.get_locus_info(args.locus)
    # .get() returns None when the locus chromosome has no excluded regions
    region_list = tk_io.get_target_regions(
        open(args.high_coverage_excluded_bed)).get(chrom)
    if region_list is None:
        return Regions()
    return Regions(region_list)
Code Example #7
File: __init__.py Project: umccr/longranger
def main(args, outs):

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)

    for (chrom, start, stop) in (tk_io.get_locus_info(l) for l in args.loci):
        cov_df = get_hap_coverage(in_bam, args.phase_set_h5, chrom, start, stop, cov_quals=COV_QUALS)
        tk_hdf5.append_data_frame(outs.hap_coverage, cov_df)

    in_bam.close()
Code Example #8
File: __init__.py Project: umccr/longranger
def split(args):
    bam_in = tk_bam.create_bam_infile(args.input)

    if args.restrict_locus is None:
        chunk_defs = [{'chrom': chrom} for chrom in bam_in.references]
    else:
        chrom, _, _ = tk_io.get_locus_info(args.restrict_locus)
        chunk_defs = [{'chrom': chrom}]

    return {'chunks': chunk_defs}
Code Example #9
def main(args, outs):
    genome_fasta_path = cr_utils.get_reference_genome_fasta(args.reference_path)

    chrom, start, stop = tk_io.get_locus_info(args.locus)
    bed_path = martian.make_path('region.bed')
    with open(bed_path, 'w') as f:
        f.write(chrom+"\t"+str(start)+"\t"+str(stop)+"\n")

    freebayes_args = ['freebayes', '-f', genome_fasta_path, '-b', args.input, '-0', '-t', bed_path]

    with open(outs.output, 'w') as f:
        subprocess.check_call(freebayes_args, stdout=f)
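For reference, the freebayes flags used here are standard ones: -f sets the reference FASTA, -b the input BAM, -t restricts calling to the BED targets, and -0 enables freebayes' stringent "standard filters" preset for base and mapping quality.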
Code Example #10
def main(args, outs):
    args.coerce_strings()
    outs.coerce_strings()

    if not args.fragments:
        outs.rPM = None
        outs.rPMFiles = None
        outs.stat = None
        outs.fracPhased = None
        outs.covStat = None
        return

    tool = "molecular_count"
    cmd0 = tool
    if args.wgsmode:
        cmd0 += " --wgs"

    tmp_bed_file = outs.stat + "_tmp.bed"
    (chrom, start, end) = tk_io.get_locus_info(args.locus)

    nrows = 0
    with open(tmp_bed_file, "w") as fout:
        with open(args.queryRegions) as f:
            for l in f:
                items = l.split()
                reg_chrom = items[0]
                reg_start = int(items[1])
                reg_end = int(items[2])

                if chrom == reg_chrom and reg_start >= start and reg_end < end:
                    nrows += 1
                    fout.write(l)

    if nrows == 0:
        outs.rPM = None
        return

    # specify the target file
    cmd0 += " -b " + tmp_bed_file

    cmd = cmd0 + \
          " -f " + args.fragments + \
          " -p " + args.fragment_phasing + \
          " -m " + args.phased_possorted_bam + \
          " --rPM " + outs.rPM + " --stat " + outs.stat + \
          " --fracPhased " + outs.fracPhased + " --mapq "+str(args.mapq) + \
          " --overlap " + str(args.overlap) + " --covstat " + str(outs.covStat)

    print "Running cmd:"
    print cmd

    subprocess.check_call(cmd, shell=True)
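Assembled, the command resembles the following (file names illustrative, not from the source):

# molecular_count --wgs -b stat_tmp.bed -f fragments.h5 \
#     -p fragment_phasing.tsv.gz -m phased_possorted.bam \
#     --rPM out.rPM --stat out.stat --fracPhased out.fracPhased \
#     --mapq 30 --overlap 0 --covstat out.covStat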
Code Example #11
def main(args, outs):
    """For each slice produce a fasta file sampling reads from that slice. 
    We split our section of the genome into a bunch of 20kb chunks. For each
    chunk we sample an identical number of paired end reads. The name of each
    read encodes the true position that it was sampled from."""

    # Grab basic stats for the read lengths and quality scores
    stats_fp = open(args.basic_stats)
    stats = json.load(stats_fp)

    # Fix the random seed
    np.random.seed(0)

    # Info is a map we use everywhere to track the sampling parameters.
    # r1_len: the length of read1
    # r2_len: the length of read2
    # insert_size_map: a map of insert-size (as a string) to frequency
    # q_score_map: a map of quality score (as a string) to frequency

    info = {'r1_len': stats['r1_len'], 'r2_len': stats['r2_len']}

    info['q_score_map'] = {
        '30': stats['bc_q30_bases'],
        '20': stats['bc_q20_bases'] - stats['bc_q30_bases'],
        '0': stats['bc_tot_bases'] - stats['bc_q20_bases']
    }

    stats_is_fp = open(args.insert_sizes)
    info['insert_size_map'] = json.load(stats_is_fp)['60']

    # How many samples will we make from each window?
    samples = int(
        round(2.0 * args.target_coverage *
              (float(args.window_size) / (stats['r1_len'] + stats['r2_len']))))

    martian.log_info("Using %i samples per %i bin" %
                     (samples, args.window_size))
    output_path = martian.make_path("chnk.fasta")
    output = open(output_path, "w")

    ref = reference.open_reference(args.reference_path)
    # Loop over every window in every locus.
    for (chrom, start, end) in (tk_io.get_locus_info(l) for l in args.loci):
        cur = start
        while cur < end:
            # Sample |samples| reads from chrom:cur-chrom:cur+window_size and put
            # the results in the output file
            perbin(chrom, cur, ref, output, info, args.window_size, samples)
            cur += args.window_size
    outs.tmp = output_path
    outs.samples_per_bin = samples
    output.close()
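The sampling arithmetic is easiest to sanity-check with concrete numbers (illustrative values, not pipeline defaults):

# target_coverage = 30, window_size = 20000, r1_len + r2_len = 300:
# samples = round(2.0 * 30 * (20000 / 300.0)) = 4000
# i.e. 4000 read pairs are drawn per 20 kb bin.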
Code Example #12
File: chunk_utils.py Project: Shians/cellranger
def pack_loci(loci):
    packed_loci = []

    current_group = []
    current_size = 0
    for locus in loci:
        (chrom, start, end) = tk_io.get_locus_info(locus)
        current_group.append(locus)
        current_size += end - start

        if current_size >= 0.25 * PARALLEL_LOCUS_SIZE:
            packed_loci.append(current_group)
            current_group = []
            current_size = 0

    if len(current_group) > 0:
        packed_loci.append(current_group)

    return packed_loci
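A hypothetical run of pack_loci, assuming PARALLEL_LOCUS_SIZE is 4 Mb (so a group closes once it covers at least 1 Mb) and a "chrom:start-stop" locus format:

loci = ["chr1:0-600000", "chr1:600000-1200000", "chr2:0-300000"]
pack_loci(loci)
# -> [["chr1:0-600000", "chr1:600000-1200000"],  # 1.2 Mb >= 1 Mb, group closed
#     ["chr2:0-300000"]]                         # leftover group flushed at the end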
Code Example #13
File: chunk_utils.py Project: umccr/longranger
def validate_loci(bam, loci, whitelist):

    loci = sorted([tk_io.get_locus_info(l) for l in loci])

    good_chroms = []
    for (chrom, items) in groupby(loci, lambda x: x[0]):
        sorted_items = sorted(items, key=lambda x: x[1])
        last_end = 0
        for (_, s, e) in sorted_items:
            assert e - s > 0
            assert s == last_end
            last_end = e

        assert last_end == chrom_size(bam, chrom)
        good_chroms.append(chrom)

    if whitelist is None:
        assert set(good_chroms) == set(bam.references)
    else:
        assert set(good_chroms) == set(whitelist)
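In other words, the loci must tile each chromosome exactly, with no gaps or overlaps. For a toy 10 kb chromosome:

# chr1:0-4000 followed by chr1:4000-10000 passes (abutting, ends at 10000)
# chr1:0-4000 followed by chr1:5000-10000 fails the s == last_end assert (gap at 4000)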
Code Example #14
def merge_haploid(novel_vcf, putative_vcf, locus, output_filename, bam,
                  reference_pyfasta, args, add_gl=True):
    novel_variants = tenkit.bio_io.VariantFileReader(novel_vcf)
    putative_variants = tenkit.bio_io.VariantFileReader(putative_vcf)
    (chrom, start, stop) = tk_io.get_locus_info(locus)
    with open(output_filename, 'w') as output:
        output_vcf = tenkit.bio_io.VariantFileWriter(output, template_file=open(putative_vcf))
        pairs = pair_iter(novel_variants.record_getter(),
                          putative_variants.record_getter(fetch_chrom=chrom,
                                                          fetch_start=start,
                                                          fetch_end=stop))
        for (novel, putative) in pairs:
            if putative is not None and (tk_io.get_record_passes_filters(putative) or
                                         tk_io.get_record_filters(putative) != ['10X_QUAL_FILTER']):
                putative.INFO['HAPLOCALLED'] = 0
                output_vcf.write_record(putative)
            elif novel is not None:
                tk_io.set_record_phase_set(novel, get_phase_set(novel, bam))
                tk_io.set_record_phase_qual(novel, 25)
                tk_io.set_record_junction_qual(novel, 25)
                populate_fields(novel, bam, reference_pyfasta, args)
                novel.INFO['HAPLOCALLED'] = 1
                if add_gl:
                    tk_io.set_record_genotype_likelihoods(
                        novel, calculate_psuedo_genotype_likelihoods(novel))
                # freebayes in ploidy-1 mode emits some variants with '.' as the
                # QUAL; these are extremely low quality and not worth tracking
                if novel.QUAL is not None:
                    output_vcf.write_record(novel)
            else:
                putative.INFO['HAPLOCALLED'] = 0
                output_vcf.write_record(putative)
Code Example #15
File: __init__.py Project: umccr/longranger
def join(args, outs, chunk_defs, chunk_outs):
    args.coerce_strings()
    outs.coerce_strings()
    frag_files = [c.fragment_phasing.strip('.gz') for c in chunk_outs]
    vcf_files = [c.default.strip('.gz') for c in chunk_outs]
    loci = [tk_io.get_locus_info(c.locus) for c in chunk_defs]

    out_frag_base = outs.fragment_phasing.strip('.gz')
    out_vcf_base = outs.default.strip('.gz')

    # stitch phase blocks
    (stitched_chrom_vcfs, stitched_chrom_frags) = stitcher.multi_join_parallel(
        frag_files, vcf_files, loci, args.__threads)

    # combine the chromosome-level outputs
    combine_frags(out_frag_base, stitched_chrom_frags)
    tk_io.combine_vcfs(out_vcf_base, stitched_chrom_vcfs)
    if args.vc_precalled is not None:
        outs.vc_precalled = outs.default
    else:
        outs.vc_precalled = None

    # final indexing
    pysam.tabix_index(out_frag_base, seq_col=0, start_col=1, end_col=2)
Code Example #16
File: __init__.py Project: umccr/longranger
def main(args, outs):
    reader = tk_hdf5.DataFrameReader(args.hap_coverage)
    sel_cols = ['cov_q30_hap0', 'cov_q30_hap1', 'cov_q30_hap2']
    ext_cols = list(sel_cols)
    ext_cols.append('total_cov')

    out_loci = []
    summary_df = None
    for (chrom, start, stop) in (tk_io.get_locus_info(l) for l in args.loci):
        cov = reader.query((chrom, start, stop))
        cov['bin'] = np.array(cov['pos'] / args.bin_size, dtype=np.int)
        cov['total_cov'] = cov[sel_cols].sum(axis=1)
        mean_cov = np.mean(cov['total_cov'])
        summary_df = pd.concat([
            summary_df,
            pd.DataFrame({'chrom': chrom, 'start': start, 'stop': stop,
                          'mean_cov': mean_cov}, index=[0])
        ], ignore_index=True)
        # Remove very small phase sets. These tend to be single-SNP phase sets
        # and can result from erroneous SNPs.
        cov = cov.groupby('phase_set').filter(lambda x: len(x) > 1000)
        sum_df = cov.groupby(['bin', 'phase_set'])[ext_cols].mean().reset_index()
        sum_df['low'] = sum_df.total_cov < 0.8 * mean_cov
        sum_df['low_hap0'] = np.logical_and(
            sum_df.total_cov < mean_cov,
            sum_df.cov_q30_hap0 < 0.8 * sum_df.cov_q30_hap1)
        sum_df['low_hap1'] = np.logical_and(
            sum_df.total_cov < mean_cov,
            sum_df.cov_q30_hap1 < 0.8 * sum_df.cov_q30_hap0)

        if not sum_df.empty:
            any_low = np.logical_or(
                sum_df.low, np.logical_or(sum_df.low_hap1, sum_df.low_hap0))

            bins = np.array(sum_df['bin'])
            bins = np.concatenate([bins, [np.max(bins) + 1]])
            pos = 0
            # Get runs of 0s and 1s in any_low
            for bit, group in groupby(any_low):
                group_size = len(list(group))
                group_start = bins[pos] * args.bin_size
                group_stop = bins[pos + group_size] * args.bin_size
                region_len = group_stop - group_start
                if bit and region_len >= args.min_len:
                    out_loci.append((chrom,
                                     max(0, group_start - args.bin_size),
                                     group_start + args.bin_size,
                                     chrom,
                                     max(0, group_stop - args.bin_size),
                                     group_stop + args.bin_size))
                pos += group_size

    with open(outs.loci, 'w') as f:
        cPickle.dump(out_loci, f)

    summary_df.to_csv(outs.cov_summary, sep='\t', header=True, index=False)
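The itertools.groupby run-length idiom used above (and in example #21) behaves like this in isolation:

from itertools import groupby

any_low = [0, 0, 1, 1, 1, 0]
[(bit, len(list(group))) for bit, group in groupby(any_low)]
# -> [(0, 2), (1, 3), (0, 1)]: two normal bins, a three-bin low run, one normal bin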
Code Example #17
def split(args):
    win = args.window_size
    input_bam = tk_bam.create_bam_infile(args.input)
    chroms = input_bam.references
    chrom_lengths = input_bam.lengths
    chrom_len_map = {}
    for i, chrom in enumerate(chroms):
        chrom_len_map[chrom] = chrom_lengths[i]
    input_bam.close()

    max_mem_in_gb = 4  # Be a little conservative
    chunk_size = get_max_chunk(win, max_mem_in_gb)
    if args.restrict_locus is not None:
        locus_chrom, locus_start, locus_stop = tk_io.get_locus_info(args.restrict_locus)
        assert locus_chrom in chrom_len_map
        locus_start = max(0, locus_start)
        locus_stop = min(locus_stop, chrom_len_map[locus_chrom])

    primary_contigs = tk_reference.load_primary_contigs(args.reference_path)
    genome_size = np.sum([
        length for chrom, length in chrom_len_map.iteritems()
        if chrom in primary_contigs
    ])

    prev_chrom = ''
    tot_bp = 0
    starts = []
    stops = []
    chunks = []

    # Genome-wide windows
    if args.restrict_locus is not None:
        chunks.append({
            'chrom': locus_chrom,
            'starts': [locus_start],
            'stops': [locus_stop],
            '__mem_gb': 8
        })
    else:
        for chrom, length in chrom_len_map.iteritems():
            if args.sex is not None and args.sex.lower() in ['f', 'female'] \
                    and chrom in ['Y', 'chrY']:
                continue
            if chrom not in primary_contigs:
                continue
            nchunks = int(np.ceil(length / float(chunk_size)))
            # Divide as evenly as possible the windows across the chunks
            # This also makes sure that all chunks except the last will
            # have sizes that are multiples of the window size.
            win_per_chunk = int(np.ceil(length / float(nchunks * win)))
            new_chunk_size = win_per_chunk * win
            for c in range(nchunks):
                chunk_start = c * new_chunk_size
                chunk_stop = min((c + 1) * new_chunk_size, length)
                chunks.append({
                    'chrom': chrom,
                    'starts': [chunk_start],
                    'stops': [chunk_stop],
                    '__mem_gb': 8
                })

    # Target-centered windows. If the targets (plus the extent) cover too much of the
    # genome, then skip these.
    if args.targets is not None and args.target_extend is not None:
        target_regions = []
        bed_iterator = tk_io.get_bed_iterator(args.targets)
        for chrom, start, stop in bed_iterator:
            if args.sex is not None and args.sex.lower() in ['f', 'female'] \
                    and chrom in ['Y', 'chrY']:
                continue
            if chrom not in primary_contigs:
                continue
            stop = min(chrom_len_map[chrom], stop)
            if args.restrict_locus is None or (
                    chrom == locus_chrom
                    and overlaps(start, stop, locus_start, locus_stop)):
                target_regions.append((chrom, start, stop))

        target_regions = sort_and_merge(target_regions, args.target_extend)
        target_size = np.sum(
            [stop - start for _, start, stop in target_regions])

        if target_size / float(genome_size) < MIN_TARGET_FRAC:
            for (chrom, start, stop) in target_regions:
                if (prev_chrom != chrom and prev_chrom != '') or tot_bp > 1 * 1e7:
                    chunks.append({
                        'chrom': str(prev_chrom),
                        'starts': starts,
                        'stops': stops,
                        '__mem_gb': 8
                    })
                    starts = []
                    stops = []
                    tot_bp = 0
                # accumulate for every region, not only when a chunk is flushed
                tot_bp += (stop - start)
                prev_chrom = chrom
                starts.append(start)
                stops.append(stop)

            if prev_chrom != '':
                chunks.append({
                    'chrom': str(prev_chrom),
                    'starts': starts,
                    'stops': stops,
                    '__mem_gb': 8
                })

    return {'chunks': chunks}
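The even-chunking arithmetic can be checked with concrete numbers (illustrative, not pipeline defaults):

# length = 1050000 bp, chunk_size = 400000, win = 1000:
# nchunks        = ceil(1050000 / 400000.0)     = 3
# win_per_chunk  = ceil(1050000 / (3 * 1000.0)) = 350
# new_chunk_size = 350 * 1000                   = 350000
# chunks: [0, 350000), [350000, 700000), [700000, 1050000)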
Code Example #18
File: __init__.py Project: umccr/longranger
def main(args, outs):
    """ Outputs barcode file """
    args.coerce_strings()
    bam_in = tk_bam.create_bam_infile(args.input)
    unsorted_temp_name = martian.make_path(outs.contig_output +
                                           '_TEMPUNSORTED')
    sorted_temp_name = martian.make_path(outs.contig_output + '_TEMPSORTED')
    base_dir = os.path.dirname(outs.contig_output)
    unsorted_temp_file = open(unsorted_temp_name, 'w')
    contig_output_file = open(outs.contig_output, 'w')
    window_size = args.window_size

    chroms = bam_in.references

    # Output the raw poses
    unsorted_temp_file.write('\t'.join(['#CHROM', 'START', 'END', 'BC_SEQ']) +
                             '\n')
    if args.restrict_locus is None:
        bam_iter = bam_in.fetch(args.chrom)
    else:
        restrict_chrom, restrict_start, restrict_stop = tk_io.get_locus_info(
            args.restrict_locus)
        assert (args.chrom == restrict_chrom)
        bam_iter = bam_in.fetch(restrict_chrom, restrict_start, restrict_stop)

    for read in bam_iter:
        chrom = chroms[read.tid]
        start = read.pos
        end = read.aend

        if end is None:
            end = start + len(read.seq)

        bc = tk_io.get_read_barcode(read)

        if bc is not None:
            unsorted_temp_file.write('\t'.join([chrom, str(start), str(end), bc]) + '\n')

    # Sort the poses
    unsorted_temp_file.close()
    tk_tabix.sort_bc_loc_tabix(unsorted_temp_name,
                               sorted_temp_name,
                               temp_dir_name=base_dir)

    # Infer the contig locations
    # This header is written during join
    #contig_output_file.write('\t'.join(['#CHROM', 'START', 'END', 'BC_SEQ', 'NUM_READS']) + '\n')
    sorted_temp_file = open(sorted_temp_name, 'r')
    sorted_temp_file.readline()
    old_bc_seq = None
    bc_poses = []
    for line in sorted_temp_file:
        (chrom, start, end, bc_seq) = line.strip('\n').split('\t')
        start = int(start)
        end = int(end)

        if bc_seq != old_bc_seq:
            if old_bc_seq is not None:
                frags = infer_fragments(bc_poses, window_size)
                for (frag_chrom, frag_start, frag_end, num_reads) in frags:
                    contig_output_file.write('\t'.join([
                        frag_chrom,
                        str(frag_start - BUFFER),
                        str(frag_end + BUFFER), old_bc_seq,
                        str(num_reads)
                    ]) + '\n')
            bc_poses = []
        old_bc_seq = bc_seq
        bc_poses.append((chrom, start, end))

    # Output for the last barcode
    if old_bc_seq is not None:
        frags = infer_fragments(bc_poses, window_size)
        for (frag_chrom, frag_start, frag_end, num_reads) in frags:
            contig_output_file.write('\t'.join([
                frag_chrom,
                str(frag_start - BUFFER),
                str(frag_end + BUFFER), old_bc_seq,
                str(num_reads)
            ]) + '\n')

    sorted_temp_file.close()
    subprocess.check_call(['rm', sorted_temp_name])
    subprocess.check_call(['rm', unsorted_temp_name])
    contig_output_file.close()
Code Example #19
def main(args, outs):

    (chrom, start, stop) = tk_io.get_locus_info(args.locus)
    chrom = str(chrom)

    # further split each chunk into 10 kb regions
    regionSize = 10000
    regionStarts = range(start, stop, regionSize)
    regionEnds = [x for x in regionStarts[1:]]
    regionEnds.append(stop)

    # read the bam file
    samfile = pysam.Samfile(args.bam_infile, "rb")
    fouts = [
        open(outs.hp0.strip(".gz"), 'w'),
        open(outs.hp1.strip(".gz"), 'w'),
        open(outs.hp2.strip(".gz"), 'w')
    ]

    for rStart, rEnd in zip(regionStarts, regionEnds):
        print "\n", rStart, rEnd
        barcode_reads = {}
        barcode_hp = {}
        # initialize the coverage track
        coverage = n.zeros((3, rEnd - rStart))

        for r in samfile.fetch(chrom, rStart, rEnd):
            if not r.is_proper_pair: continue
            if r.is_duplicate or not r.is_paired or r.is_qcfail or \
                r.is_secondary or r.is_unmapped or r.mapq<20 or r.tid != r.rnext or \
                abs(r.pos - r.pnext)>5000:
                continue

        tags = dict(r.tags)
        if "MI" not in tags: continue
        mid = tags["MI"]
        hp = 0
        if "HP" in tags: hp = tags["HP"]
        if mid not in barcode_hp:
            barcode_hp[mid] = hp
            barcode_reads[mid] = tk_regions.Regions()
        barcode_reads[mid].add_region((r.pos, r.aend))

        # add the unique sequenced mol coverage
        for bc in barcode_reads.keys():
            regions = barcode_reads[bc]
            hp = barcode_hp[bc]
            for rgs in regions:
                rgs_start = max(rStart, rgs[0])
                rgs_end = min(rEnd, rgs[1])
                coverage[hp][rgs_start - rStart:(rgs_end - rStart)] += 1

        for hp in range(3):
            disc_cov = Disc(coverage[hp])
            #disc_cov = [ discretize(x) for x in coverage[hp] ]
            sel = (disc_cov[:-1] != disc_cov[1:])
            print sel.sum()
            pos = n.arange(len(disc_cov))
            # run boundaries sit at i + 1 wherever disc_cov[i] != disc_cov[i + 1]
            boundaries = n.append(0, [x + 1 for x in pos[sel]])
            boundaries = n.append(boundaries, len(disc_cov))
            print disc_cov.size, sel.sum(), boundaries.size
            #print coverage[hp][:10]
            #print disc_cov[:min(10,len(disc_cov))]
            for i in range(len(boundaries) - 1):
                fouts[hp].write(
                    "%s\t%d\t%d\t%d\n" %
                    (chrom, boundaries[i] + rStart, boundaries[i + 1] + rStart,
                     disc_cov[boundaries[i]]))
    #disc_cov = Disc(coverage) # discereitzed coverage

    for hp in range(3):
        fouts[hp].close()
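The boundary extraction on the discretized coverage track (shared with example #22) is a run-length encoding; a self-contained numpy illustration:

import numpy as np

disc_cov = np.array([0, 0, 2, 2, 2, 1])
sel = disc_cov[:-1] != disc_cov[1:]          # True where the next run starts
change_points = np.where(sel)[0] + 1         # [2, 5]
boundaries = np.concatenate(([0], change_points, [len(disc_cov)]))  # [0, 2, 5, 6]
# bedGraph rows: (start 0, end 2, value 0), (2, 5, value 2), (5, 6, value 1)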
Code Example #20
File: __init__.py Project: umccr/longranger
def split(args):
    input_bam = tk_bam.create_bam_infile(args.bam_file)

    if args.sex is None or args.sex.lower() in ['m', 'male']:
        remove_chroms = ['chrX', 'chrY', 'chrM', 'X', 'Y', 'MT', 'M']
    elif args.sex.lower() in ['f', 'female']:
        remove_chroms = ['chrY', 'chrM', 'Y', 'MT', 'M']
    else:
        martian.throw("Unrecognized sex: %s" % args.sex)

    primary_contigs = tenkit.reference.load_primary_contigs(
        args.reference_path)

    # estimate density of het snps and barcodes
    primary_contig_lengths = [
        (chrom, length)
        for (chrom, length) in zip(input_bam.references, input_bam.lengths)
        if chrom in primary_contigs
    ]
    (frac_het_snps, bcs_per_het_snp,
     het_rate) = smooth_sample_bcs_and_het_snps(args.input,
                                                primary_contig_lengths)
    martian.log_info(
        "Fraction of SNPs that are het: %f, BCs per SNP: %f, Hets per bp: %f" %
        (frac_het_snps, bcs_per_het_snp, het_rate))

    # Set up dynamic chunk sizes to make smaller chunks for highly het organisms
    het_rate = max(min(0.05, het_rate), 0.0001)
    parallel_locus_size = min(tenkit.constants.PARALLEL_LOCUS_SIZE,
                              tk_stats.robust_divide(60000, het_rate))

    if args.restrict_locus is None:
        loci = tk_bam.generate_tiling_windows(
            input_bam,
            parallel_locus_size,
            overlap=args.chunk_stitching_overlap)
    else:
        loci = [args.restrict_locus]

    chunks = []
    for (idx, locus) in enumerate(loci):
        (chrom, start, end) = tk_io.get_locus_info(locus)

        if args.fragments is None or chrom not in primary_contigs or chrom in remove_chroms:
            mem_gb = 3
            martian.log_info("Chunk %d: No phasing, requesting %d GB" %
                             (idx, mem_gb))
            chunk = {'locus': locus, '__mem_gb': mem_gb, 'do_phasing': False}
        else:
            est_het_snps = round(frac_het_snps *
                                 count_records(args.input, chrom, start, end))
            est_gb = np.ceil(0.5 + 400.0 / 1e9 * est_het_snps *
                             bcs_per_het_snp)  # empirical memory usage
            min_gb = 4
            if np.isnan(est_gb):
                mem_gb = min_gb
            else:
                mem_gb = max(min_gb, int(est_gb))
            martian.log_info(
                "Chunk %d: Estimated %f het SNPs, requesting %f GB" %
                (idx, est_het_snps, mem_gb))
            chunk = {'locus': locus, '__mem_gb': mem_gb, 'do_phasing': True}

        chunks.append(chunk)

    return {'chunks': chunks, 'join': {'__mem_gb': 16, '__threads': 4}}
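The empirical memory formula is easiest to read with numbers plugged in (illustrative values):

# est_het_snps = 50000, bcs_per_het_snp = 100:
# est_gb = ceil(0.5 + 400.0 / 1e9 * 50000 * 100) = ceil(2.5) = 3
# mem_gb = max(min_gb, int(est_gb)) = max(4, 3) = 4 GB for this chunk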
Code Example #21
File: __init__.py Project: umccr/longranger
def main(args, outs):

    if args.barcode_whitelist is None:
        # write empty dataframe
        tk_sv_io.write_sv_df_to_bedpe(None, outs.del_candidates)
        martian.log_info('Data seem un-barcoded. No deletion candidates will be computed.')
        return

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)

    del_loci = []
    for (chrom, start, stop) in (tk_io.get_locus_info(l) for l in args.loci):
        cov_df = get_hap_coverage(in_bam, None, chrom, start, stop, cov_quals=[30])
        best_path = get_candidate_del_loci(cov_df, transition_prob=args.transition_prob, het_read_prob=args.het_read_prob)

        # Get regions with good coverage for a het del (not too high, not too low)
        bad_cov = np.logical_or(cov_df['total_cov'] < MIN_COV,
                                cov_df['total_cov'] > MAX_COV)
        bad_regions = tk_regions.Regions([ (s,e) for (s,e) in group_bit_arr(bad_cov, start) if e-s > args.min_bad_region])

        # Group the states of the HMM and exclude bad regions
        pos = start
        out_loci = []
        for bit, group in groupby(best_path):
            group_size = len(list(group))
            group_start = pos
            group_stop = group_start + group_size
            if bit and group_size >= args.min_del_len and group_size <= args.max_del_len and \
               not bad_regions.overlapping_regions(group_start, group_stop):
                out_loci.append((chrom, group_start, group_stop))
            pos += group_size

        # Get regions that look like hom dels
        hom_del_loci = group_bit_arr(cov_df['total_cov'] < MIN_COV, start)
        out_loci.extend([(chrom, s, e) for (s, e) in hom_del_loci])
        out_loci = sorted(out_loci)

        # Now merge deletion candidates that are separated by short non-dels
        if out_loci:
            new_out_loci = []
            last_locus = out_loci[0]
            for locus in out_loci[1:]:
                if locus[1] - last_locus[2] > MIN_GAP:
                    new_out_loci.append(last_locus)
                    last_locus = locus
                else:
                    last_locus = (last_locus[0], min(locus[1], last_locus[1]), max(locus[2], last_locus[2]))
            new_out_loci.append(last_locus)

            del_loci.extend(new_out_loci)

    final_loci = [locus for locus in del_loci
                  if args.min_del_len <= locus[2] - locus[1] <= args.max_del_len]
    info_strs = ['TYPE=DEL' for _ in final_loci]
    in_bam.close()

    chroms = [locus[0] for locus in final_loci]
    starts1 = np.array([locus[1] for locus in final_loci], dtype=np.int)
    starts2 = np.array([locus[2] for locus in final_loci], dtype=np.int)
    sv_df = tk_sv_io.create_sv_df(chroms, starts1, starts1 + 1,
                                  chroms, starts2, starts2 + 1,
                                  np.arange(len(chroms)), 1, info_strs = info_strs)
    tk_sv_io.write_sv_df_to_bedpe(sv_df, outs.del_candidates)
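The MIN_GAP merge step behaves like this (illustrative, assuming MIN_GAP = 1000):

# candidates: (chr1, 5000, 6000), (chr1, 6200, 7000), (chr1, 9000, 9500)
# 6200 - 6000 = 200  <= MIN_GAP -> merge into (chr1, 5000, 7000)
# 9000 - 7000 = 2000 >  MIN_GAP -> keep (chr1, 9000, 9500) separate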
Code Example #22
def main(args, outs):

    (chrom, start, stop) = tk_io.get_locus_info(args.locus)
    chrom = str(chrom)

    dtypes = {
        '#chrom': "str",
        "frag_start": "int64",
        "frag_end": "int64",
        "h0": "float64",
        "h1": "float64"
    }
    frags = pd.read_csv(args.fragment_phasing, sep="\t", compression='gzip',\
       usecols=["#chrom","frag_start","frag_end","h0","h1"], dtype=dtypes)
    frags = frags[(frags["#chrom"] == chrom) & (frags["frag_end"] >= start) &
                  (frags["frag_start"] <= stop)]
    frags["hp"] = 0
    frags.loc[frags["h0"] >= 0.95, 'hp'] = 1
    frags.loc[frags["h1"] >= 0.95, 'hp'] = 2
    del frags["h0"]
    del frags["h1"]

    # further split each chunk into 10 kb regions
    regionSize = 10000
    regionStarts = range(start, stop, regionSize)
    regionEnds = [x for x in regionStarts[1:]]
    regionEnds.append(stop)

    # read the bam file
    samfile = pysam.Samfile(args.possorted_bam, "rb")
    fouts = [
        [
            open(outs.hp_read_0.strip(".bw") + ".bedGraph", 'w'),
            open(outs.hp_read_1.strip(".bw") + ".bedGraph", 'w'),
            open(outs.hp_read_2.strip(".bw") + ".bedGraph", 'w'),
            open(outs.hp_read_t.strip(".bw") + ".bedGraph", 'w'),
        ],
        [
            open(outs.hp_bc_0.strip(".bw") + ".bedGraph", 'w'),
            open(outs.hp_bc_1.strip(".bw") + ".bedGraph", 'w'),
            open(outs.hp_bc_2.strip(".bw") + ".bedGraph", 'w'),
            open(outs.hp_bc_t.strip(".bw") + ".bedGraph", 'w'),
        ],
    ]

    # convert h5 file to filtered csv file
    def filter_func(df):
        return (df["chrom"] == chrom) & (df["start_pos"] <= stop) & (df["end_pos"] >= start)

    # work with small region at a time to avoid large memory
    for rStart, rEnd in zip(regionStarts, regionEnds):
        frags2 = frags[(frags["frag_end"] >= rStart)
                       & (frags["frag_start"] <= rEnd)]
        coverage = [np.zeros((4, rEnd - rStart)), np.zeros((4, rEnd - rStart))]
        #bc_2_phase = {}
        print "\n", rStart, rEnd
        # initialize the coverage track

        ## read count
        for r in samfile.fetch(chrom, rStart, rEnd):
            if not r.is_proper_pair: continue
            if r.is_duplicate or (not r.is_paired) or r.is_qcfail or \
                r.is_secondary or r.is_unmapped or r.mapq<30 or r.tid != r.rnext or \
                abs(r.pos - r.pnext)>5000:
                continue

            tags = dict(r.tags)
            if "MI" not in tags: continue
            hp = 0
            if "HP" in tags:
                hp = tags["HP"]

            s = max(rStart, r.pos)
            e = min(rEnd, r.aend)
            if s >= e: continue
            #print max(rStart, r.pos), min(rEnd,r.aend)
            coverage[0][hp][(s - rStart):(e - rStart)] += 1
            coverage[0][3][(s - rStart):(e - rStart)] += 1
            #bc = tags["BX"]
            #if not bc in bc_2_phase:
            #    bc_2_phase[bc] = hp

        ## bc count
        for _, row in frags2.iterrows():
            s = max(rStart, row["frag_start"])
            e = min(rEnd, row["frag_end"])

            if s >= e: continue
            coverage[1][3][(s - rStart):(e - rStart)] += 1
            coverage[1][row["hp"]][(s - rStart):(e - rStart)] += 1

        # discretization and then print out in the bedGraph format
        for kind in range(2):  ## read and then bc counts
            for hp in range(4):
                disc_cov = Disc(coverage[kind][hp])
                #disc_cov = [ discretize(x) for x in coverage[hp] ]
                sel = (disc_cov[:-1] != disc_cov[1:])
                print sel.sum()
                pos = np.arange(len(disc_cov))
                boundaries = np.append(0, [x + 1 for x in pos[sel]])
                boundaries = np.append(boundaries, len(disc_cov))
                print disc_cov.size, sel.sum(), boundaries.size
                #print coverage[hp][:10]
                #print disc_cov[:min(10,len(disc_cov))]
                for i in range(len(boundaries) - 1):
                    fouts[kind][hp].write(
                        "%s\t%d\t%d\t%d\n" %
                        (chrom, boundaries[i] + rStart,
                         boundaries[i + 1] + rStart, disc_cov[boundaries[i]]))
    #disc_cov = Disc(coverage) # discereitzed coverage

    for kind in range(2):  ## read and then bc counts
        for hp in range(4):
            fouts[kind][hp].close()
Code Example #23
File: __init__.py Project: umccr/longranger
def main(args, outs):
    vc_mode, _, _, _ = tk_io.get_vc_mode(args.vc_precalled, args.vc_mode)

    (chrom, start, stop) = tk_io.get_locus_info(args.locus)
    chrom = str(chrom)

    if chrom in ['chrM', 'MT', 'M'] or (args.sex.lower() in ["f", "female"]
                                        and chrom in ["chrY", "Y"]):
        return

    fragment_barcode_info = pysam.Tabixfile(args.fragment_phasing)
    AH_0_BH_0 = (
        'AH_0_BH_0', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 0 which are on reads that have support for the allele which has been phased as haplotype 0'
    )
    AH_1_BH_1 = (
        'AH_1_BH_1', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 1 which are on reads that have support for the allele which has been phased as haplotype 1'
    )
    AH_0_BH_1 = (
        'AH_0_BH_1', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 0 which are on reads that have support for the allele which has been phased as haplotype 1'
    )
    AH_1_BH_0 = (
        'AH_1_BH_0', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 1 which are on reads that have support for the allele which has been phased as haplotype 0'
    )
    BX_HAP_OR = (
        'BX_HAP_OR', '1', 'Float',
        "Barcode aware haplotype filtering score (log odds ratio currently)")
    BARCODE_AWARE_FILTER = [(
        "BARCODE_AWARE_FILTER",
        "Uses haplotype information from the fragments and the alleles to filter some variants that are not consistent with haplotype (ie variants should have most of their allele haplotype 0 alleles coming from barcodes whose fragments are haplotype 0 etc)"
    )]
    extra_fields = [AH_0_BH_0, AH_1_BH_1, AH_0_BH_1, AH_1_BH_0, BX_HAP_OR]
    input_variants = tk_io.VariantFileReader(args.variants)
    with open(outs.default.strip(".gz"), 'w') as output_file:
        output_variants = tk_io.VariantFileWriter(
            output_file,
            template_file=open(args.variants, 'r'),
            new_info_fields=extra_fields,
            new_filters=BARCODE_AWARE_FILTER)
        variant_iterator = tk_io.get_variant_iterator_pos(
            input_variants, None, args.locus)
        for record in variant_iterator:
            sample = record.samples[0]
            ref = tk_io.get_record_ref(record)
            alt_alleles = tk_io.get_record_alt_alleles(record)

            if not tk_io.get_record_passes_filters(record):
                output_variants.write_record(record)
                continue
            if len(sample.gt_alleles) > 1:
                genotype_1 = int(sample.gt_alleles[0])
                genotype_2 = int(sample.gt_alleles[1])
                if genotype_1 == genotype_2:
                    output_variants.write_record(record)
                    continue  # homozygous, can't filter this way
            else:
                output_variants.write_record(record)
                continue  # single-allele (haploid) call, can't filter this way

            chrom = tk_io.get_record_chrom(record)
            if chrom != "chrM":
                variant_barcode_info = load_variant_barcode_phasing_info(
                    record, fragment_barcode_info)
                if not barcode_aware_filter(record, variant_barcode_info):
                    if record.FILTER is None:
                        record.FILTER = []
                    if tk_io.get_var_type(ref, alt_alleles[0]) == "S" and (
                            vc_mode == 'call' or
                            (vc_mode == "precalled_plus" and "TENX" in record.INFO)):
                        record.FILTER.append("BARCODE_AWARE_FILTER")
            output_variants.write_record(record)