Esempio n. 1
0
def connect_with_matepairs(bam_fpath, output_dirpath, err_fpath):
    """Collect mate-pair intervals from a BAM file, grouped by BED first column.

    The BAM is filtered with sambamba (keeping records matching
    ``proper_pair and not supplementary and not duplicate``), sorted with
    the ``-n`` sort rule, converted to BED via ``bam_to_bed`` (with
    ``bedpe=True, only_intervals=True``) and the resulting file is parsed.

    Args:
        bam_fpath: path to the input BAM file.
        output_dirpath: directory passed to ``bam_to_bed`` for its output.
        err_fpath: path of the file that receives the tools' stderr
            (opened in append mode).

    Returns:
        A ``defaultdict(list)`` mapping the first BED column to a list of
        ``(int(column 2), int(column 3))`` tuples, in file order.
    """
    bam_filtered_fpath = add_suffix(bam_fpath, 'filtered')
    sambamba_cmd = [sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                    '-F', 'proper_pair and not supplementary and not duplicate', bam_fpath]
    # Open both streams in context managers so the handles are closed even if
    # the subprocess call raises; previously they were leaked.
    with open(bam_filtered_fpath, 'w') as filtered_out:
        with open(err_fpath, 'a') as err_out:
            qutils.call_subprocess(sambamba_cmd, stdout=filtered_out, stderr=err_out, logger=logger)
    ## sort by read names
    bam_filtered_sorted_fpath = add_suffix(bam_filtered_fpath, 'sorted')
    sort_bam(bam_filtered_fpath, bam_filtered_sorted_fpath, err_fpath, logger, sort_rule='-n')
    bed_fpath = bam_to_bed(output_dirpath, 'matepairs', bam_filtered_sorted_fpath, err_fpath, logger, bedpe=True, only_intervals=True)
    matepair_regions = defaultdict(list)
    with open(bed_fpath) as bed:
        for l in bed:
            fs = l.split()
            matepair_regions[fs[0]].append((int(fs[1]), int(fs[2])))
    return matepair_regions
Esempio n. 2
0
def get_physical_coverage(output_dirpath, ref_name, bam_fpath, log_path, err_fpath, cov_fpath, chr_len_fpath):
    """Produce (or reuse) the raw physical-coverage file for a reference.

    Returns ``None`` when the bedtools ``bamToBed`` executable is not a file.
    Otherwise returns the path ``add_suffix(cov_fpath, 'raw')``; the file is
    regenerated only when it is not already a non-empty file.

    ``log_path`` is accepted but not used by this function.
    """
    if not isfile(bedtools_fpath('bamToBed')):
        logger.info('  Failed calculating physical coverage...')
        return None

    raw_cov_fpath = add_suffix(cov_fpath, 'raw')
    if is_non_empty_file(raw_cov_fpath):
        # A previous run already produced the result; reuse it.
        return raw_cov_fpath

    logger.info('  Calculating physical coverage...')

    # Step 1: keep properly mapped, unique, and non-duplicate read pairs only.
    filtered_bam = join(output_dirpath, ref_name + '.filtered.bam')
    sambamba_view(
        bam_fpath, filtered_bam, qconfig.max_threads, err_fpath, logger,
        filter_rule='proper_pair and not supplementary and not duplicate')

    # Step 2: sort by read names.
    sorted_bam = join(output_dirpath, ref_name + '.filtered.sorted.bam')
    sort_bam(filtered_bam, sorted_bam, err_fpath, logger, sort_rule='-n')

    # Step 3: convert to BED (bedpe mode) and compute the coverage file.
    paired_bed = bam_to_bed(output_dirpath, ref_name, sorted_bam, err_fpath, logger, bedpe=True)
    calculate_genome_cov(paired_bed, raw_cov_fpath, chr_len_fpath, err_fpath, logger)
    return raw_cov_fpath
Esempio n. 3
0
def get_physical_coverage(output_dirpath, ref_name, bam_fpath, log_path, err_fpath, cov_fpath, chr_len_fpath):
    """Return the path of the raw physical-coverage file, building it if needed.

    If the bedtools ``bamToBed`` executable is not a file, a message is logged
    and ``None`` is returned. If the raw coverage file (``cov_fpath`` with the
    ``raw`` suffix) is already a non-empty file, it is returned untouched.

    The ``log_path`` argument is not read inside this function.
    """
    bam_to_bed_tool = bedtools_fpath('bamToBed')
    if not isfile(bam_to_bed_tool):
        logger.info('  Failed calculating physical coverage...')
        return None

    raw_cov_fpath = add_suffix(cov_fpath, 'raw')
    needs_calculation = not is_non_empty_file(raw_cov_fpath)
    if needs_calculation:
        logger.info('  Calculating physical coverage...')

        ## keep properly mapped, unique, and non-duplicate read pairs only
        pair_filter = 'proper_pair and not supplementary and not duplicate'
        filtered_path = join(output_dirpath, ref_name + '.filtered.bam')
        sambamba_view(bam_fpath, filtered_path, qconfig.max_threads, err_fpath, logger,
                      filter_rule=pair_filter)

        ## sort by read names
        name_sorted_path = join(output_dirpath, ref_name + '.filtered.sorted.bam')
        sort_bam(filtered_path, name_sorted_path, err_fpath, logger, sort_rule='-n')

        ## BED conversion (bedpe mode) followed by the coverage calculation
        bedpe_path = bam_to_bed(output_dirpath, ref_name, name_sorted_path, err_fpath, logger, bedpe=True)
        calculate_genome_cov(bedpe_path, raw_cov_fpath, chr_len_fpath, err_fpath, logger)

    return raw_cov_fpath
Esempio n. 4
0
def get_joiners(ref_name, sam_fpath, bam_fpath, output_dirpath, err_fpath,
                using_reads):
    """Collect read intervals from a BAM file, grouped by BED first column.

    The BAM is filtered (dropping unmapped, supplementary and secondary
    alignments) and sorted with the ``-n`` sort rule; both intermediate files
    are reused when they already exist and are non-empty. The sorted BAM is
    converted to BED and parsed.

    When ``using_reads == 'mp'`` and ``calculate_insert_size`` yields a truthy
    insert size, only intervals whose absolute length lies within
    ``insert_size +/- std_dev`` are kept. Otherwise every interval is kept.

    Args:
        ref_name: reference name forwarded to ``calculate_insert_size``.
        sam_fpath: SAM file path forwarded to ``calculate_insert_size``.
        bam_fpath: path to the input BAM file.
        output_dirpath: directory used for BED output and insert-size data.
        err_fpath: file path forwarded to the helper tools for stderr.
        using_reads: reads label; ``'mp'`` enables bedpe conversion and
            insert-size filtering.

    Returns:
        A ``defaultdict(list)`` mapping the first BED column to a list of
        ``(int(column 2), int(column 3))`` tuples, in file order.
    """
    bam_filtered_fpath = add_suffix(bam_fpath, 'filtered')
    if not is_non_empty_file(bam_filtered_fpath):
        filter_rule = 'not unmapped and not supplementary and not secondary_alignment'
        sambamba_view(bam_fpath,
                      bam_filtered_fpath,
                      qconfig.max_threads,
                      err_fpath,
                      logger,
                      filter_rule=filter_rule)
    bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    if not is_non_empty_file(bam_sorted_fpath):
        sort_bam(bam_filtered_fpath,
                 bam_sorted_fpath,
                 err_fpath,
                 logger,
                 sort_rule='-n')
    is_mate_pairs = using_reads == 'mp'
    bed_fpath = bam_to_bed(output_dirpath,
                           using_reads,
                           bam_sorted_fpath,
                           err_fpath,
                           logger,
                           bedpe=is_mate_pairs)
    intervals = defaultdict(list)

    # Decide once, before the loop, whether insert-size filtering applies.
    # The bounds are computed only for a truthy insert size: previously they
    # were computed unconditionally, so a missing (None) insert size raised
    # TypeError before the per-line `insert_size` check could skip filtering.
    filter_by_insert_size = False
    min_is = max_is = None
    if is_mate_pairs:
        insert_size, std_dev = calculate_insert_size(sam_fpath,
                                                     output_dirpath,
                                                     ref_name,
                                                     reads_suffix='mp')
        if insert_size:
            min_is = insert_size - std_dev
            max_is = insert_size + std_dev
            filter_by_insert_size = True

    with open(bed_fpath) as bed:
        for l in bed:
            fs = l.split()
            start, end = int(fs[1]), int(fs[2])
            if filter_by_insert_size and not (min_is <= abs(end - start) <= max_is):
                continue  # interval length outside the accepted insert-size window
            intervals[fs[0]].append((start, end))
    return intervals
Esempio n. 5
0
def analyse_coverage(output_dirpath, fpath, chr_names, bam_fpath, stats_fpath, err_fpath, logger):
    """Append depth and per-threshold coverage statistics to ``stats_fpath``.

    The BAM file is converted to BED, a ``.genomecov`` file is produced with
    ``calculate_genome_cov(..., print_all_positions=False)``, and its rows
    whose first column is ``genome`` are aggregated:

    * average depth = sum of ``depth * fraction`` over those rows;
    * for each value in ``qconfig.coverage_thresholds``, the summed fraction
      of rows with ``depth >= threshold``.

    Nothing is returned; the results are appended to ``stats_fpath``.
    """
    filename = qutils.name_from_fpath(fpath)
    bed_fpath = bam_to_bed(output_dirpath, filename, bam_fpath, err_fpath, logger)
    chr_len_fpath = get_chr_len_fpath(fpath, chr_names)
    cov_fpath = join(output_dirpath, filename + '.genomecov')
    calculate_genome_cov(bed_fpath, cov_fpath, chr_len_fpath, err_fpath, logger, print_all_positions=False)

    mean_depth = 0
    covered_fraction = [0 for _ in qconfig.coverage_thresholds]
    with open(cov_fpath) as cov_file:
        for record in cov_file:
            # columns: genome; depth; number of bases; size of genome; fraction of bases with depth
            fields = record.split()
            depth = int(fields[1])
            fraction = float(fields[4])
            if fields[0] != 'genome':
                continue
            mean_depth += depth * fraction
            for idx, threshold in enumerate(qconfig.coverage_thresholds):
                if depth >= threshold:
                    covered_fraction[idx] += fraction

    report_lines = ['%s depth\n' % int(mean_depth)]
    for idx, threshold in enumerate(qconfig.coverage_thresholds):
        report_lines.append('%.2f coverage >= %sx\n' % (covered_fraction[idx] * 100, threshold))
    with open(stats_fpath, 'a') as out_f:
        out_f.write(''.join(report_lines))
Esempio n. 6
0
def analyse_coverage(output_dirpath, fpath, chr_names, bam_fpath, stats_fpath, err_fpath, logger):
    """Compute coverage statistics for a BAM file and append them to a stats file.

    Steps, as performed below: BAM -> BED (``bam_to_bed``), BED -> histogram
    file ``<name>.genomecov`` (``calculate_genome_cov`` with
    ``print_all_positions=False``), then aggregation of the rows labelled
    ``genome`` into an average depth and one cumulative fraction per entry of
    ``qconfig.coverage_thresholds``. The function returns ``None``.
    """
    filename = qutils.name_from_fpath(fpath)
    bed_fpath = bam_to_bed(output_dirpath, filename, bam_fpath, err_fpath, logger)
    chr_len_fpath = get_chr_len_fpath(fpath, chr_names)
    cov_fpath = join(output_dirpath, filename + '.genomecov')
    calculate_genome_cov(bed_fpath, cov_fpath, chr_len_fpath, err_fpath, logger, print_all_positions=False)

    depth_total = 0
    fraction_at_threshold = [0 for _unused in qconfig.coverage_thresholds]
    with open(cov_fpath) as histogram:
        for row in histogram:
            # genome; depth; number of bases; size of genome; fraction of bases with depth
            cols = row.split()
            depth, genome_fraction = int(cols[1]), float(cols[4])
            is_genome_row = cols[0] == 'genome'
            if is_genome_row:
                depth_total += depth * genome_fraction
                for pos, threshold in enumerate(qconfig.coverage_thresholds):
                    if not depth >= threshold:
                        continue
                    fraction_at_threshold[pos] += genome_fraction

    with open(stats_fpath, 'a') as stats_out:
        stats_out.write('%s depth\n' % int(depth_total))
        for pos, threshold in enumerate(qconfig.coverage_thresholds):
            percent_covered = fraction_at_threshold[pos] * 100
            stats_out.write('%.2f coverage >= %sx\n' % (percent_covered, threshold))