Code example #1
def build_snps_panel(bcbio_projs=None, bed_files=None, output_dir=None, genome=None):
    selected_snps_file = join(output_dir, 'snps.bed')
    if can_reuse(selected_snps_file, bed_files):
        return selected_snps_file

    work_dir = safe_mkdir(join(output_dir, 'work'))

    log.info('Intersecting BED files for projects.')
    all_bed_files = set()
    for proj in bcbio_projs or []:
        if proj.coverage_bed:
            log.info(proj.project_name + ': selecting ' + proj.coverage_bed)
            all_bed_files.add(proj.coverage_bed)
        else:
            all_bed_files.add(proj.call)
    all_bed_files |= set(bed_files or [])

    overlapped_bed = join(work_dir, 'merged_bed_files.bed')
    log.info(f'BED files: {all_bed_files}, merging and writing {overlapped_bed}')
    overlap_bed_files(all_bed_files, overlapped_bed)

    # Selecting SNPs from dbSNP
    dbsnp_file = get_dbsnp(genome)
    dbsnp_snps_file = join(work_dir, 'snps_in_merged_bed_files.bed')
    if not can_reuse(dbsnp_snps_file, [dbsnp_file, overlapped_bed]):
        cmdl = f'bedtools intersect -header -a {dbsnp_file} -b {overlapped_bed}'
        call_process.run(cmdl, dbsnp_snps_file)

    subset_bed_file = add_suffix(dbsnp_snps_file, 'subset')
    _make_snp_file(dbsnp_snps_file, genome, subset_bed_file)

    shutil.copyfile(subset_bed_file, selected_snps_file)
    return selected_snps_file
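Every example on this page gates recomputation behind can_reuse(output, deps). The helper itself is not shown here; a minimal sketch consistent with these call sites (including the cmp_f keyword in examples #6 and #12 and the silent flag in #12) might look like the following. This is an assumption inferred from usage, not the actual implementation:

import os

def can_reuse(fpath, cmp_f, silent=False):
    # Sketch: reuse fpath if it exists, is non-empty, and is not older
    # than any of the files it was built from.
    deps = cmp_f if isinstance(cmp_f, (list, tuple, set)) else [cmp_f]
    if not fpath or not os.path.isfile(fpath) or os.path.getsize(fpath) == 0:
        return False
    if any(d and os.path.getmtime(fpath) < os.path.getmtime(d) for d in deps):
        return False
    if not silent:
        print(fpath + ' exists and is up to date, reusing')
    return True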
Code example #2
def sambamba_depth(work_dir,
                   bed,
                   bam,
                   depth_thresholds=None,
                   output_fpath=None,
                   sample_name=None,
                   threads=1):
    if not bam:
        return None
    sample_name = sample_name or splitext_plus(basename(bam))[0]
    depth_thresholds = depth_thresholds or []

    if isinstance(bed, BedTool):
        bed = bed.saveas().fn
    if not output_fpath:
        output_fpath = join(
            work_dir,
            splitext_plus(basename(bed))[0] + '_' + sample_name +
            '_sambamba_depth.txt')

    if can_reuse(output_fpath, [bam, bed]):
        return output_fpath

    thresholds_str = ''.join(
        [' -T' + str(int(d)) for d in depth_thresholds if d is not None])
    cmdline = (
        'depth region -F "not duplicate and not failed_quality_control" '
        '-t {threads} -L {bed} {thresholds_str} {bam}').format(**locals())

    call_sambamba(cmdline, bam_fpath=bam, output_fpath=output_fpath)
    return output_fpath
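A call like the following (paths hypothetical) would write a per-region depth table named after the BED file and the sample, here work/panel_sample1_sambamba_depth.txt:

depth_file = sambamba_depth('work', 'panel.bed', 'sample1.bam',
                            depth_thresholds=[10, 50, 100])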
Code example #3
def batch_callable_bed(bam_files, output_bed_file, work_dir, genome_fasta_file, min_depth,
                       parall_view=None):
    """ Picking random 3 samples and getting a callable for them.
        Trade off between looping through all samples in a huge batch,
        and hitting an sample with outstanding coverage.
    """
    if can_reuse(output_bed_file, bam_files):
        return output_bed_file

    work_dir = safe_mkdir(join(work_dir, 'callable_work'))
    # random.seed(1234)  # seeding random for reproducibility
    # bam_files = random.sample(bam_files, min(len(bam_files), 3))

    if parall_view:
        callable_beds = parall_view.run(_calculate, [
            [bf, work_dir, genome_fasta_file, min_depth]
            for bf in bam_files])
    else:
        with parallel_view(len(bam_files), ParallelCfg(threads=len(bam_files)), work_dir) as parall_view:
            callable_beds = parall_view.run(_calculate, [
                [bf, work_dir, genome_fasta_file, min_depth]
                for bf in bam_files])

    good_overlap_sample_fraction = 0.8  # pick regions covered in at least 80% of the samples
    good_overlap_count = max(1, good_overlap_sample_fraction * len(callable_beds))
    info(f'Intersecting callable regions and picking good overlaps with >={good_overlap_count} '
         f'samples ({100 * good_overlap_sample_fraction}% of {len(callable_beds)})')
    with file_transaction(work_dir, output_bed_file) as tx:
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        intersection = pybedtools.BedTool() \
            .multi_intersect(i=callable_beds) \
            .filter(lambda r: len(r[4].split(',')) >= good_overlap_count)
        intersection.saveas(tx)
    info(f'Saved to {output_bed_file}')
    return output_bed_file
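The filter above relies on the bedtools multiinter output layout: column 4 holds the number of overlapping inputs, and column 5 the comma-separated list of their indices. For five callable BEDs, good_overlap_count is max(1, 0.8 * 5) = 4.0, so an illustrative line like this one is kept (its column-5 list has 4 entries):

chr1    861320  861395  4   1,2,3,5   1   1   1   0   1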
Code example #4
def annotate_target(cnf, target_bed):
    output_fpath = intermediate_fname(cnf, target_bed, 'ann')
    if not cnf.genome.bed_annotation_features:
        return target_bed
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    features_bed = verify_bed(
        cnf.genome.bed_annotation_features,
        is_critical=True,
        description='bed_annotation_features in system config')

    # annotate_bed_py = get_system_path(cnf, 'python', join('tools', 'bed_processing', 'annotate_bed.py'))
    # bedtools = get_system_path(cnf, 'bedtools')

    annotate_bed_py = which('annotate_bed.py')
    if not annotate_bed_py:
        critical(
            'Error: annotate_bed.py not found in PATH, please install TargQC.')

    cmdline = '{annotate_bed_py} {target_bed} --work-dir {cnf.work_dir} -g {cnf.genome.name} ' \
              '-o {output_fpath} --canonical'.format(**locals())
    # cmdline = '{annotate_bed_py} {target_bed} --work-dir {cnf.work_dir} --reference {features_bed} ' \
    #           '--genome {cnf.genome.name} --sys-cnf {cnf.sys_cnf} --run-cnf {cnf.run_cnf} ' \
    #           '-o {output_fpath}'.format(**locals())
    call(cnf, cmdline, output_fpath, stdout_to_outputfile=False)

    output_fpath = remove_comments(cnf, output_fpath)

    return output_fpath
Code example #5
File: bed_utils.py  Project: vladsaveliev/Utils
def cut(fpath, col_num, output_fpath=None):
    output_fpath = output_fpath or add_suffix(fpath, 'cut')
    if can_reuse(output_fpath, fpath):
        return output_fpath
    cmdline = 'cut -f' + ','.join(map(str, range(1, col_num + 1))) + ' ' + fpath + ' > ' + output_fpath
    call_process.run(cmdline, output_fpath=output_fpath)
    return output_fpath
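For col_num=4 and, say, fpath='regions.bed', the assembled shell command is (assuming add_suffix inserts the suffix before the extension):

cut -f1,2,3,4 regions.bed > regions.cut.bed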
Code example #6
def count_in_bam(work_dir,
                 bam,
                 query,
                 dedup=False,
                 bed=None,
                 use_grid=False,
                 sample_name=None,
                 target_name=None):
    if dedup:
        query += ' and not duplicate'
    name = 'num_' + (query.replace(' ', '_') or 'reads')
    if bed is not None and isinstance(bed, BedTool):
        bed = bed.saveas().fn
    if bed is not None:
        target_name = target_name or ('target_' + basename(bed))
        name += '_on_' + target_name

    sample_name = sample_name or basename(bam)
    output_fpath = join(work_dir, sample_name + '_' + name)

    if not can_reuse(output_fpath, cmp_f=bam):
        cmdline = 'view -c -F "{query}" {bam}'.format(**locals())
        if bed is not None:
            cmdline += ' -L ' + bed

        call_sambamba(cmdline,
                      bam_fpath=bam,
                      output_fpath=output_fpath,
                      command_name=name)

    with open(output_fpath) as f:
        return int(f.read().strip())
Code example #7
File: bed_utils.py  Project: pdiakumis/NGS_Utils
def sort_bed_gsort(input_bed_fpath,
                   output_bed_fpath=None,
                   work_dir=None,
                   fai_fpath=None,
                   genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either of fai_fpath or genome build name must be specified')

    with file_transaction(work_dir, output_bed_fpath) as tx:
        run('gsort {input_bed_fpath} {fai_fpath}'.format(**locals()),
            output_fpath=tx)

    return output_bed_fpath
Code example #8
File: genotype.py  Project: vladsaveliev/ClearUp
def _split_bed(bed_file, work_dir):
    """ Splits into autosomal and sex chromosomes
    """
    autosomal_bed = intermediate_fname(work_dir, bed_file, 'autosomal')
    sex_bed = intermediate_fname(work_dir, bed_file, 'sex')
    if not can_reuse(autosomal_bed, bed_file) or not can_reuse(
            sex_bed, bed_file):
        with open(bed_file) as f, open(autosomal_bed,
                                       'w') as a_f, open(sex_bed, 'w') as s_f:
            for l in f:
                chrom = l.split()[0]
                if is_sex_chrom(chrom):
                    s_f.write(l)
                else:
                    a_f.write(l)
    return autosomal_bed, sex_bed
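is_sex_chrom is not shown on this page; a minimal sketch consistent with this call site, offered as an assumption rather than the real helper:

def is_sex_chrom(chrom):
    # Treat X and Y (with or without a 'chr' prefix) as sex chromosomes
    return chrom.replace('chr', '') in ('X', 'Y')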
Code example #9
File: bed_utils.py  Project: pdiakumis/NGS_Utils
def cut(fpath, col_num, output_fpath=None):
    output_fpath = output_fpath or add_suffix(fpath, 'cut')
    if can_reuse(output_fpath, fpath):
        return output_fpath
    cmdline = 'cut -f' + ','.join(map(str, range(
        1, col_num + 1))) + ' ' + fpath + ' > ' + output_fpath
    call_process.run(cmdline, output_fpath=output_fpath)
    return output_fpath
Code example #10
def _overlap_bed_files(bed_files, work_dir, genome):
    from clearup.panel import overlap_bed_files

    fnames = [basename(splitext_plus(fp)[0]) for fp in bed_files]
    overlapped_file = join(work_dir, f'{"__".join(fnames)}.{genome}.bed')
    if not can_reuse(overlapped_file, bed_files):
        overlap_bed_files(bed_files, overlapped_file)
    return overlapped_file
Code example #11
def overlap_bed_files(bed_files, output_bed_file):
    if can_reuse(output_bed_file, bed_files):
        return output_bed_file
    if len(bed_files) == 1:
        shutil.copy(bed_files.pop(), output_bed_file)
        return output_bed_file
    cmdl = 'bedops --intersect' + ''.join([' <(sort-bed ' + bf + ')' for bf in bed_files])
    call_process.run(cmdl, output_bed_file)
    return output_bed_file
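Note the <(sort-bed ...) process substitutions: the assembled string only runs under bash, not plain /bin/sh, so call_process.run must hand the command to bash for this to work. For two inputs the expanded command is:

bedops --intersect <(sort-bed first.bed) <(sort-bed second.bed)

sort-bed is needed because bedops requires its inputs to be in sort-bed order.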
Code example #12
def index_bam(bam_fpath, sambamba=None, samtools=None):
    sambamba = sambamba or get_executable()
    indexed_bam = bam_fpath + '.bai'
    if not can_reuse(indexed_bam, cmp_f=bam_fpath, silent=True):
        cmdline = '{sambamba} index {bam_fpath}'.format(**locals())
        res = run(cmdline,
                  output_fpath=indexed_bam,
                  stdout_to_outputfile=False,
                  stdout_tx=False)
Code example #13
def sample_callable_bed(bam_file, output_bed_file, work_dir, genome_fasta_file, min_depth):
    """Retrieve callable regions for a sample subset by defined analysis regions.
    """
    callable_bed = _calculate(bam_file, work_dir, genome_fasta_file, min_depth)
    if not can_reuse(output_bed_file, callable_bed):
        with file_transaction(work_dir, output_bed_file) as tx_out_file:
            callable_regions = pybedtools.BedTool(callable_bed).filter(lambda x: x.name == 'CALLABLE')
            callable_regions.saveas(tx_out_file)
    return output_bed_file
Code example #14
File: bed_utils.py  Project: vladsaveliev/Utils
def intersect_bed(work_dir, bed1, bed2):
    bed1_fname, _ = splitext_plus(basename(bed1))
    bed2_fname, _ = splitext_plus(basename(bed2))
    output_fpath = join(work_dir, bed1_fname + '__' + bed2_fname + '.bed')
    if can_reuse(output_fpath, [bed1, bed2]):
        return output_fpath
    bedtools = which('bedtools')
    cmdline = '{bedtools} intersect -u -a {bed1} -b {bed2}'.format(**locals())
    call_process.run(cmdline, output_fpath=output_fpath, checks=[call_process.file_exists_check])
    return output_fpath
Code example #15
def load_bam_file(bam_file, bams_dir, snp_bed, sample_name):
    """ Slicing to fingerprints locations
    """
    sliced_bam_file = join(bams_dir, sample_name + '.bam')
    if not can_reuse(sliced_bam_file, [bam_file, snp_bed]):
        cmdl = 'view {bam_file} -L {snp_bed} -F "not duplicate" -f bam'.format(
            **locals())
        call_sambamba(cmdl, bam_fpath=bam_file, output_fpath=sliced_bam_file)
        # index_bam(sliced_bam_file)
    return sliced_bam_file
Code example #16
File: bed_utils.py  Project: pdiakumis/NGS_Utils
def sort_bed(input_bed_fpath,
             output_bed_fpath=None,
             work_dir=None,
             fai_fpath=None,
             chr_order=None,
             genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    regions = []

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical(
                'Either of chr_order, fai_fpath, or genome build name must be specified'
            )
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue

                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    start = int(fs[1])
                    end = int(fs[2])
                    other_fields = fs[3:]
                    order = chr_order.get(chrom, -1)
                    regions.append(
                        Region(chrom, start, end, other_fields, order))

                for region in sorted(regions, key=lambda r: r.get_key()):
                    fs = [region.chrom, str(region.start), str(region.end)]
                    fs.extend(region.other_fields)
                    out.write('\t'.join(fs) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' +
          output_bed_fpath)
    return output_bed_fpath
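Region and get_key are defined elsewhere; a minimal sketch of the record this sort assumes (chromosome rank first, then coordinates), given as an illustration rather than the actual class:

class Region:
    def __init__(self, chrom, start, end, other_fields, order):
        self.chrom, self.start, self.end = chrom, start, end
        self.other_fields = other_fields
        self.order = order  # chromosome rank from chr_order, -1 if unknown

    def get_key(self):
        return self.order, self.start, self.end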
Code example #17
File: bed_utils.py  Project: vladsaveliev/Utils
def bam_to_bed(bam_fpath, to_gzip=True):
    debug('Converting the BAM to BED to save some memory.')  # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + ('.bed.gz' if to_gzip else '.bed')
    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath
    bedtools = which('bedtools')
    gzip = which('gzip')
    cmdline = '{bedtools} bamtobed -i {bam_fpath}'.format(**locals())
    cmdline += ' | {gzip}'.format(**locals()) if to_gzip else ''
    call_process.run(cmdline, output_fpath=bam_bed_fpath)
    return bam_bed_fpath
Code example #18
def make_fingerprint(vcf_file,
                     work_dir=None,
                     label=None,
                     fp_size=20,
                     bed_file=None):
    log.info('Starting processing file ' + vcf_file)
    work_dir = work_dir or dirname(vcf_file)

    if label: print_name = label
    else: print_name = splitext_plus(basename(vcf_file))[0]
    print_name += '.print' + str(fp_size)
    print_name += '_dist' + str(Params.MIN_DIST)
    print_name += '_af' + str(Params.MIN_AF)
    if not Params.INTERREGION_PAIRS:
        print_name += '_skip_interregion_pairs'

    raw_print_file = join(work_dir, print_name)
    if can_reuse(raw_print_file, vcf_file):
        with open(raw_print_file) as f:
            raw = np.fromfile(f).reshape((len(index_by_key), fp_size))
    else:
        raw = _raw_fingerprint(vcf_file, fp_size=fp_size, bed_file=bed_file)
        with open(raw_print_file, 'w') as f:
            raw.tofile(f)
        log.info(f'Saved raw fingerprints into {raw_print_file}')

    norm_print_name = print_name
    if Params.NORMALIZE_DIST: norm_print_name += '_normdist'
    if Params.NORMALIZE_VAR: norm_print_name += '_normvar'

    norm_print_file = join(work_dir, norm_print_name)
    if can_reuse(norm_print_file, raw_print_file):
        with open(norm_print_file) as f:
            norm = np.fromfile(f).reshape((len(index_by_key), fp_size))
    else:
        norm = _normalize_fingerprint(raw)
        with open(norm_print_file, 'w') as f:
            norm.tofile(f)
        log.info(f'Saved normalised fingerprints into {norm_print_file}')

    return label, norm
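One caveat: numpy's tofile/fromfile operate on raw bytes, so opening the fingerprint files in binary mode ('rb'/'wb') is safer than the text-mode handles used above, e.g.:

with open(raw_print_file, 'rb') as f:
    raw = np.fromfile(f).reshape((len(index_by_key), fp_size))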
Code example #19
File: bed_utils.py  Project: pdiakumis/NGS_Utils
def intersect_bed(work_dir, bed1, bed2):
    bed1_fname, _ = splitext_plus(basename(bed1))
    bed2_fname, _ = splitext_plus(basename(bed2))
    output_fpath = join(work_dir, bed1_fname + '__' + bed2_fname + '.bed')
    if can_reuse(output_fpath, [bed1, bed2]):
        return output_fpath
    bedtools = which('bedtools')
    cmdline = '{bedtools} intersect -u -a {bed1} -b {bed2}'.format(**locals())
    call_process.run(cmdline,
                     output_fpath=output_fpath,
                     checks=[call_process.file_exists_check])
    return output_fpath
Code example #20
def _calculate(bam_file, work_dir, genome_fasta_file, min_depth):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    output_prefix = os.path.join(work_dir, bam_samplename(bam_file))
    callability_annotation_file = output_prefix + '.callable.bed'
    if not can_reuse(callability_annotation_file, bam_file):
        info(f'Calculating coverage at {bam_file}')
        run(f'goleft depth --q 1 --mincov {min_depth} --reference {genome_fasta_file} --ordered'
            f' --prefix {output_prefix} {bam_file}')

    callable_file = output_prefix + '.callable.CALLABLE.bed'
    if not can_reuse(callable_file, callability_annotation_file):
        with file_transaction(None, callable_file) as tx:
            pybedtools.BedTool(callability_annotation_file)\
                .filter(lambda x: x.name == 'CALLABLE')\
                .saveas(tx)

    return callable_file
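goleft depth writes <prefix>.callable.bed with a callability label in the name column, and the second step keeps only the CALLABLE intervals. Illustrative lines (coordinates made up):

chr1    10000   10468   NO_COVERAGE
chr1    10468   12000   CALLABLE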
Code example #21
File: bed_utils.py  Project: vladsaveliev/Utils
def clean_bed(bed_fpath, work_dir):
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')

    if not can_reuse(clean_fpath, bed_fpath):
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        bed = BedTool(bed_fpath)
        bed = bed.filter(lambda x: x.chrom and
                         not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))
        bed = bed.remove_invalid()
        with file_transaction(work_dir, clean_fpath) as tx_out_file:
            bed.saveas(tx_out_file)
        verify_bed(clean_fpath, is_critical=True)
        debug('Saved clean BED file into ' + clean_fpath)
    return clean_fpath
Code example #22
File: bed_utils.py  Project: pdiakumis/NGS_Utils
def bam_to_bed(bam_fpath, to_gzip=True):
    debug(
        'Converting the BAM to BED to save some memory.'
    )  # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + ('.bed.gz'
                                                   if to_gzip else '.bed')
    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath
    bedtools = which('bedtools')
    gzip = which('gzip')
    cmdline = '{bedtools} bamtobed -i {bam_fpath}'.format(**locals())
    cmdline += ' | {gzip}'.format(**locals()) if to_gzip else ''
    call_process.run(cmdline, output_fpath=bam_bed_fpath)
    return bam_bed_fpath
Code example #23
File: bed_utils.py  Project: pdiakumis/NGS_Utils
def clean_bed(bed_fpath, work_dir):
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')

    if not can_reuse(clean_fpath, bed_fpath):
        import pybedtools
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        bed = pybedtools.BedTool(bed_fpath)
        bed = bed.filter(lambda x: x.chrom and not any(
            x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))
        bed = bed.remove_invalid()
        with file_transaction(work_dir, clean_fpath) as tx_out_file:
            bed.saveas(tx_out_file)
        verify_bed(clean_fpath, is_critical=True)
        debug('Saved clean BED file into ' + clean_fpath)
    return clean_fpath
Code example #24
File: bed_utils.py  Project: vladsaveliev/Utils
def annotate_target(work_dir, target_bed, genome_build):
    output_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    bed_annotation = which('annotate_bed.py')
    if not bed_annotation:
        bed_annotation = which('bed_annotation')
        if not bed_annotation:
            critical('Error: bed_annotation not found in PATH, please install it with `conda install -c vladsaveliev bed_annotation`.')

    cmdline = '{bed_annotation} {target_bed} -g {genome_build} -o {output_fpath}'.format(**locals())
    run(cmdline, output_fpath, stdout_to_outputfile=False)
    output_fpath = clean_bed(output_fpath, work_dir)
    return output_fpath
Code example #25
def _slice_vcf_fn(work_dir, label, vcf_file, overlapped_bed):
    sliced_vcf_file = join(work_dir, label + '.sliced.vcf')
    if not can_reuse(sliced_vcf_file, [vcf_file]):
        run(f'bcftools view {vcf_file} --targets-file {overlapped_bed} -o {sliced_vcf_file}')

    # ann_vcf_file = join(work_dir, label + '.sliced.ann.vcf')
    # if not can_reuse(ann_vcf_file, [sliced_vcf_file]):
    #     vcf_header = join(work_dir, label + '.vcf_header')
    #     with open(vcf_header, 'w') as f:
    #         f.write('##INFO=<ID=CHROM,Number=1,Type=String,Description="Region chromosome">\n')
    #         f.write('##INFO=<ID=FROM,Number=1,Type=String,Description="Region start">\n')
    #         f.write('##INFO=<ID=TO,Number=1,Type=String,Description="Region end">\n')
    #     run(f'bcftools annotate -c CHROM,FROM,TO -a {overlapped_bed} {sliced_vcf_file} '
    #         f'-h {vcf_header} -o {ann_vcf_file}')

    return label, sliced_vcf_file
Code example #26
File: bed_utils.py  Project: vladsaveliev/Utils
def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, chr_order=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    regions = []

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical('Either of chr_order, fai_fpath, or genome build name must be specified')
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue

                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    start = int(fs[1])
                    end = int(fs[2])
                    other_fields = fs[3:]
                    order = chr_order.get(chrom, -1)
                    regions.append(Region(chrom, start, end, other_fields, order))

                for region in sorted(regions, key=lambda r: r.get_key()):
                    fs = [region.chrom, str(region.start), str(region.end)]
                    fs.extend(region.other_fields)
                    out.write('\t'.join(fs) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' + output_bed_fpath)
    return output_bed_fpath
Code example #27
File: bed_utils.py  Project: pdiakumis/NGS_Utils
def annotate_target(work_dir, target_bed, genome_build):
    output_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    bed_annotation = which('annotate_bed.py')
    if not bed_annotation:
        bed_annotation = which('bed_annotation')
        if not bed_annotation:
            critical(
                'Error: bed_annotation not found in PATH, please install it with '
                '`conda install -c vladsaveliev bed_annotation`.'
            )

    cmdline = '{bed_annotation} {target_bed} -g {genome_build} -o {output_fpath}'.format(
        **locals())
    run(cmdline, output_fpath, stdout_to_outputfile=False)
    output_fpath = clean_bed(output_fpath, work_dir)
    return output_fpath
Code example #28
File: bed_utils.py  Project: vladsaveliev/Utils
def sort_bed_gsort(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either of fai_fpath or genome build name must be specified')

    with file_transaction(work_dir, output_bed_fpath) as tx:
        run('gsort {input_bed_fpath} {fai_fpath}'.format(**locals()), output_fpath=tx)

    return output_bed_fpath
Code example #29
def phylo_tree_page(run_id):
    project_names = run_id.split(',')
    projects = [Project.query.filter_by(name=pn).first() for pn in project_names]
    if not projects:
        log.err('Projects ' + ', '.join(project_names) + ' not found in database')
        abort(404)
    color_by_proj = {p.name: PROJ_COLORS[i % len(PROJ_COLORS)] for i, p in enumerate(projects)}
    work_dirpath = safe_mkdir(join(config.DATA_DIR, '_AND_'.join(project_names)))
    merged_fasta_fpath = merge_fasta(projects, work_dirpath)

    prank_out = os.path.join(work_dirpath, os.path.splitext(os.path.basename(merged_fasta_fpath))[0])
    tree_fpath = prank_out + '.best.dnd'
    if not can_reuse(tree_fpath, merged_fasta_fpath):
        return render_template(
            'processing.html',
            projects=[{
                'name': p.name,
            } for i, p in enumerate(projects)],
            run_id=run_id,
            title='Processing ' + ', '.join(project_names),
        )

    log.debug('Prank results found, rendering tree!')
    tree = next(Phylo.parse(tree_fpath, 'newick'))
    seq_by_id = read_fasta(merged_fasta_fpath)
    tree_json = tree_to_json_for_d3(tree, seq_by_id, color_by_proj, run_id=run_id)

    all_samples_count = sum(len(p.samples.all()) for p in projects)
    return render_template(
        'tree.html',
        projects=[{
            'name': p.name,
            'color': color_by_proj[p.name],
        } for i, p in enumerate(projects)],
        title=', '.join(project_names),
        data=tree_json,
        tree_height=20 * all_samples_count,
        tree_width=5 * all_samples_count,
    )
Code example #30
File: sambamba.py  Project: vladsaveliev/Utils
def sambamba_depth(work_dir, bed, bam, depth_thresholds=None,
                   output_fpath=None, sample_name=None, threads=1):
    if not bam:
        return None
    sample_name = sample_name or splitext_plus(basename(bam))[0]
    depth_thresholds = depth_thresholds or []
    
    if isinstance(bed, BedTool):
        bed = bed.saveas().fn
    if not output_fpath:
        output_fpath = join(work_dir,
            splitext_plus(basename(bed))[0] + '_' + sample_name + '_sambamba_depth.txt')

    if can_reuse(output_fpath, [bam, bed]):
        return output_fpath

    thresholds_str = ''.join([' -T' + str(int(d)) for d in depth_thresholds if d is not None])
    cmdline = ('depth region -F "not duplicate and not failed_quality_control" '
               '-t {threads} -L {bed} {thresholds_str} {bam}').format(**locals())

    call_sambamba(cmdline, bam_fpath=bam, output_fpath=output_fpath)
    return output_fpath
Code example #31
File: sambamba.py  Project: vladsaveliev/Utils
def count_in_bam(work_dir, bam, query, dedup=False, bed=None, use_grid=False, sample_name=None, target_name=None):
    if dedup:
        query += ' and not duplicate'
    name = 'num_' + (query.replace(' ', '_') or 'reads')
    if bed is not None and isinstance(bed, BedTool):
        bed = bed.saveas().fn
    if bed is not None:
        target_name = target_name or ('target_' + basename(bed))
        name += '_on_' + target_name

    sample_name = sample_name or basename(bam)
    output_fpath = join(work_dir, sample_name + '_' + name)

    if not can_reuse(output_fpath, cmp_f=bam):
        cmdline = 'view -c -F "{query}" {bam}'.format(**locals())
        if bed is not None:
            cmdline += ' -L ' + bed

        call_sambamba(cmdline, bam_fpath=bam, output_fpath=output_fpath, command_name=name)

    with open(output_fpath) as f:
        return int(f.read().strip())
Code example #32
File: sambamba.py  Project: vladsaveliev/Utils
def index_bam(bam_fpath, sambamba=None, samtools=None):
    sambamba = sambamba or get_executable()
    indexed_bam = bam_fpath + '.bai'
    if not can_reuse(indexed_bam, cmp_f=bam_fpath, silent=True):
        cmdline = '{sambamba} index {bam_fpath}'.format(**locals())
        res = run(cmdline, output_fpath=indexed_bam, stdout_to_outputfile=False, stdout_tx=False)
Code example #33
def _make_snp_file(dbsnp_snps_file, genome_build, output_file,
                   autosomal_locations_limit=175, min_snp_amount=30):
    if can_reuse(output_file, dbsnp_snps_file):
        return output_file

    locs_by_gene = defaultdict(list)
    total_locs = 0
    for i, interval in enumerate(BedTool(dbsnp_snps_file)):
        if is_sex_chrom(interval.chrom):
            continue
        pos = int(interval.start) + 1
        annots = interval.name.split('|')
        # if len(annots) == 2:
        #     rsid, gene = interval.name.split('|')
        #     ref = interval[4]
        # else:
        rsid, gene, ref, alts = interval.name.split('|')
        loc = (interval.chrom, pos, rsid, gene, ref, alts)
        locs_by_gene[gene].append(loc)
        total_locs += 1

    random.seed(1234)  # seeding random for reproducibility

    # Selecting random genes
    gnames = random.sample(list(locs_by_gene), min(len(locs_by_gene), autosomal_locations_limit))
    locs_by_gene = {g: locs_by_gene[g] for g in gnames}
    # Selecting random SNPs in each gene
    # min_locs_per_gene = min(len(locs) for locs in locs_by_gene.values())
    # if pick_unclustered:
    #     locs_per_gene = min(autosomal_locations_limit / len(gnames), min_locs_per_gene)
    #     while locs_per_gene * len(gnames) < min_snp_amount:
    #         locs_per_gene = math.ceil(float(min_snp_amount) / len(gnames))
    #     selected_locs_by_gene = {g: random.sample(locs_by_gene[g], locs_per_gene) for g in gnames}
    #     selected_locs = [l for gene_locs in selected_locs_by_gene.values() for l in gene_locs]
    # else:
    all_locs = [l for gene_locs in locs_by_gene.values() for l in gene_locs]

    # Selecting unclustered SNPs within genes
    non_clustered_locs = []
    prev_pos = 0
    for (chrom, pos, rsid, gene, ref, alts) in all_locs:
        if 0 < pos - prev_pos < 500:
            continue
        else:
            prev_pos = pos
            non_clustered_locs.append((chrom, pos, rsid, gene, ref, alts))

    # Selecting random SNPs within the limit
    selected_locs = random.sample(non_clustered_locs, min(len(non_clustered_locs), autosomal_locations_limit))

    # Sorting final locations
    chrom_order = get_chrom_order(genome_build)
    selected_locs.sort(key=lambda a: (chrom_order.get(a[0], -1), a[1:]))

    log.debug('Selected the following autosomal SNPs:')
    for (chrom, pos, rsid, gene, ref, alts) in selected_locs:
        log.debug('  ' + chrom + ':' + str(pos) + '\t' + rsid + '\t' + gene + '\t' + ref + '>' + alts)

    with file_transaction(None, output_file) as tx:
        with open(tx, 'w') as out:
            for (chrom, pos, rsid, gene, ref, alts) in selected_locs:
                out.write('\t'.join([chrom, str(pos-1), str(pos), rsid + '|' + gene + '|' + ref + '|' + alts]) + '\n')
    return output_file
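Each selected SNP is written as a one-base BED record whose name column packs the annotation that downstream steps split on '|' (values illustrative):

chr2    1233    1234    rs123456|GENE1|A|C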
Code example #34
File: genotype.py  Project: vladsaveliev/ClearUp
def _vardict_pileup_sample(sample, work_dir, output_dir, genome_fasta_file,
                           snp_file):
    vardict_snp_vars = join(work_dir, sample.name + '_vars.txt')
    vcf_file = join(output_dir, sample.name + '.vcf')
    if can_reuse(vardict_snp_vars, [sample.bam, snp_file]) and can_reuse(
            vcf_file, vardict_snp_vars):
        return vcf_file

    vardict_exec = which('vardict')
    if not vardict_exec:
        critical(
            'Error: vardict is not in PATH. Please install it with `conda install -c bioconda vardict`'
        )
    vardict_bin_dir = dirname(vardict_exec)

    # Run VarDict
    index_bam(sample.bam)
    cmdl = '{vardict_exec} -G {genome_fasta_file} -N {sample.name} -b {sample.bam} -p -D {snp_file}'.format(
        **locals())
    call_process.run(cmdl, output_fpath=vardict_snp_vars)

    # Complex variants might have shifted start positions with respect to the rsid,
    # so we associate starts with rsids for further SNP identification
    ann_by_var = defaultdict(list)
    with open(vardict_snp_vars) as f:
        for l in f:
            fs = l.split('\t')
            ann, chrom, start = fs[1], fs[2], fs[3]
            ann_by_var[(chrom, start)] = ann

    info()
    info('Converting to VCF')
    work_vcf_file = join(work_dir, sample.name + '_vars.vcf')
    cmdl = (
        'cut -f-34 ' + vardict_snp_vars +
        ' | awk -F"\\t" -v OFS="\\t" \'{for (i=1;i<=NF;i++) { if ($i=="") $i="0" } print $0 }\''
        ' | ' + join(vardict_bin_dir, 'teststrandbias.R') +
        ' | ' + join(vardict_bin_dir, 'var2vcf_valid.pl') + ' -A -f 0.2')
    call_process.run(cmdl, output_fpath=work_vcf_file)

    # Fix non-call records with empty REF and ALT, and "NA" values assigned to INFO's SN and HICOV
    fixed_vcf_file = add_suffix(work_vcf_file, 'fixed')
    info('Fixing VCF for parsing, writing to ' + fixed_vcf_file)
    with open(work_vcf_file) as inp, open(fixed_vcf_file, 'w') as out_f:
        for l in inp:
            if l.startswith('#'):
                out_f.write(l)
            else:
                fs = l.split('\t')
                chrom, pos, _, ref, alt = fs[0], int(
                    fs[1]), fs[2], fs[3], fs[4]
                if alt in ['.', '']:
                    fs[4] = fs[3] = _get_fasta_ref(
                        genome_fasta_file, chrom,
                        pos)  # Reading the reference allele from fasta
                l = '\t'.join(fs)
                l = l.replace('=NA;', '=.;')
                l = l.replace('=;', '=.;')
                l = l.replace('TYPE=0', 'TYPE=REF')
                out_f.write(l)
    assert verify_file(fixed_vcf_file)

    info('Annotating VCF with gene names and rsIDs')
    ann_vcf_file = add_suffix(fixed_vcf_file, 'ann')
    with open(fixed_vcf_file) as f, open(ann_vcf_file, 'w') as out:
        vcf_reader = vcf.Reader(f)
        vcf_writer = vcf.Writer(out, vcf_reader)
        for rec in vcf_reader:
            ann = ann_by_var[(rec.CHROM, str(rec.POS))]
            rec.ID = ann.split('|')[0]
            rec.INFO['ANNOTATION'] = ann
            vcf_writer.write_record(rec)
    assert verify_file(ann_vcf_file), ann_vcf_file

    ann_hdr_vcf_file = add_suffix(ann_vcf_file, 'hdr')
    cmdl = 'bcftools annotate -h <(echo ' \
           '\'##INFO=<ID=ANNOTATION,Number=1,Type=String,Description="rsid|gene_name|ref|alts">\') ' + \
           bgzip_and_tabix(ann_vcf_file)
    call_process.run(cmdl, output_fpath=ann_hdr_vcf_file)

    debug('Renaming ' + ann_hdr_vcf_file + ' -> ' + vcf_file)
    os.rename(ann_hdr_vcf_file, vcf_file)
    return vcf_file