def build_snps_panel(bcbio_projs=None, bed_files=None, output_dir=None, genome=None):
    selected_snps_file = join(output_dir, 'snps.bed')
    if can_reuse(selected_snps_file, bed_files):
        return selected_snps_file

    work_dir = safe_mkdir(join(output_dir, 'work'))

    log.info('Intersecting BED files for projects.')
    all_bed_files = set()
    for proj in bcbio_projs or []:
        if proj.coverage_bed:
            log.info(proj.project_name + ': selecting ' + proj.coverage_bed)
            all_bed_files.add(proj.coverage_bed)
        else:
            all_bed_files.add(proj.call)
    all_bed_files |= set(bed_files or [])

    overlapped_bed = join(work_dir, 'merged_bed_files.bed')
    log.info(f'BED files: {all_bed_files}, merging, writing {overlapped_bed}')
    overlap_bed_files(all_bed_files, overlapped_bed)

    # Selecting SNPs from dbSNP
    dbsnp_file = get_dbsnp(genome)
    dbsnp_snps_file = join(work_dir, 'snps_in_merged_bed_files.bed')
    if not can_reuse(dbsnp_snps_file, [dbsnp_file, overlapped_bed]):
        cmdl = f'bedtools intersect -header -a {dbsnp_file} -b {overlapped_bed}'
        call_process.run(cmdl, dbsnp_snps_file)

    subset_bed_file = add_suffix(dbsnp_snps_file, 'subset')
    _make_snp_file(dbsnp_snps_file, genome, subset_bed_file)

    shutil.copyfile(subset_bed_file, selected_snps_file)
    return selected_snps_file

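# Hedged usage sketch for build_snps_panel: the BED paths, output directory,
# and genome build below are illustrative assumptions, and bcbio_projs is
# omitted since project objects are specific to this codebase.
def example_build_snps_panel():
    return build_snps_panel(bed_files=['panel_a.bed', 'panel_b.bed'],
                            output_dir='fingerprints', genome='hg19')
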
def sambamba_depth(work_dir, bed, bam, depth_thresholds=None, output_fpath=None,
                   sample_name=None, threads=1):
    if not bam:
        return None
    sample_name = sample_name or splitext_plus(basename(bam))[0]
    depth_thresholds = depth_thresholds or []
    if isinstance(bed, BedTool):
        bed = bed.saveas().fn
    if not output_fpath:
        output_fpath = join(work_dir, splitext_plus(basename(bed))[0] +
                            '_' + sample_name + '_sambamba_depth.txt')
    if can_reuse(output_fpath, [bam, bed]):
        return output_fpath

    thresholds_str = ''.join([' -T' + str(int(d)) for d in depth_thresholds
                              if d is not None])
    cmdline = ('depth region -F "not duplicate and not failed_quality_control" '
               '-t {threads} -L {bed} {thresholds_str} {bam}').format(**locals())
    call_sambamba(cmdline, bam_fpath=bam, output_fpath=output_fpath)
    return output_fpath

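# Hedged usage sketch for sambamba_depth: paths, sample name, and thresholds
# are illustrative assumptions. depth_thresholds=[1, 5, 10] expands to
# '-T1 -T5 -T10', asking sambamba to report, per region, the fraction of
# bases covered at least at each of those depths.
def example_sambamba_depth():
    return sambamba_depth(work_dir='work', bed='target.bed', bam='sample1.bam',
                          depth_thresholds=[1, 5, 10], sample_name='sample1',
                          threads=4)
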
def batch_callable_bed(bam_files, output_bed_file, work_dir, genome_fasta_file,
                       min_depth, parall_view=None):
    """ Calculates callable regions for a batch of samples. (A variant that
        picks 3 random samples is commented out below: a trade-off between
        looping through every sample in a huge batch and relying on a single
        sample with outlying coverage.)
    """
    if can_reuse(output_bed_file, bam_files):
        return output_bed_file

    work_dir = safe_mkdir(join(work_dir, 'callable_work'))
    # random.seed(1234)  # seeding random for reproducibility
    # bam_files = random.sample(bam_files, min(len(bam_files), 3))

    if parall_view:
        callable_beds = parall_view.run(_calculate, [
            [bf, work_dir, genome_fasta_file, min_depth] for bf in bam_files])
    else:
        with parallel_view(len(bam_files), ParallelCfg(threads=len(bam_files)),
                           work_dir) as parall_view:
            callable_beds = parall_view.run(_calculate, [
                [bf, work_dir, genome_fasta_file, min_depth] for bf in bam_files])

    good_overlap_sample_fraction = 0.8  # pick regions covered in 80% of samples
    good_overlap_count = max(1, good_overlap_sample_fraction * len(callable_beds))
    info(f'Intersecting callable regions and picking good overlaps with >={good_overlap_count} '
         f'samples ({100 * good_overlap_sample_fraction}% of {len(callable_beds)})')

    with file_transaction(work_dir, output_bed_file) as tx:
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        intersection = pybedtools.BedTool() \
            .multi_intersect(i=callable_beds) \
            .filter(lambda r: len(r[4].split(',')) >= good_overlap_count)
        intersection.saveas(tx)
    info(f'Saved to {output_bed_file}')
    return output_bed_file

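# Standalone restatement of the good-overlap filter above, with a worked
# example: field 5 of `bedtools multiinter` output is a comma-separated list
# of the inputs covering the region.
def _passes_overlap_cutoff(multiinter_field5, n_samples, fraction=0.8):
    return len(multiinter_field5.split(',')) >= max(1, fraction * n_samples)

# With 5 samples the cutoff is max(1, 0.8 * 5) = 4.0:
assert _passes_overlap_cutoff('1,2,4,5', 5)      # 4 samples >= 4.0: kept
assert not _passes_overlap_cutoff('1,3,5', 5)    # 3 samples < 4.0: filtered out
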
def annotate_target(cnf, target_bed):
    output_fpath = intermediate_fname(cnf, target_bed, 'ann')
    if not cnf.genome.bed_annotation_features:
        return output_fpath
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    features_bed = verify_bed(cnf.genome.bed_annotation_features, is_critical=True,
                              description='bed_annotation_features in system config')

    # annotate_bed_py = get_system_path(cnf, 'python', join('tools', 'bed_processing', 'annotate_bed.py'))
    # bedtools = get_system_path(cnf, 'bedtools')
    annotate_bed_py = which('annotate_bed.py')
    if not annotate_bed_py:
        critical('Error: annotate_bed.py not found in PATH, please install TargQC.')

    cmdline = '{annotate_bed_py} {target_bed} --work-dir {cnf.work_dir} -g {cnf.genome.name} ' \
              '-o {output_fpath} --canonical'.format(**locals())
    # cmdline = '{annotate_bed_py} {target_bed} --work-dir {cnf.work_dir} --reference {features_bed} ' \
    #           '--genome {cnf.genome.name} --sys-cnf {cnf.sys_cnf} --run-cnf {cnf.run_cnf} ' \
    #           '-o {output_fpath}'.format(**locals())
    call(cnf, cmdline, output_fpath, stdout_to_outputfile=False)
    output_fpath = remove_comments(cnf, output_fpath)

    return output_fpath

def cut(fpath, col_num, output_fpath=None):
    output_fpath = output_fpath or add_suffix(fpath, 'cut')
    if can_reuse(output_fpath, fpath):
        return output_fpath
    cmdline = ('cut -f' + ','.join(map(str, range(1, col_num + 1))) +
               ' ' + fpath + ' > ' + output_fpath)
    call_process.run(cmdline, output_fpath=output_fpath)
    return output_fpath

def count_in_bam(work_dir, bam, query, dedup=False, bed=None, use_grid=False,
                 sample_name=None, target_name=None):
    if dedup:
        query += ' and not duplicate'
    name = 'num_' + (query.replace(' ', '_') or 'reads')
    if bed is not None and isinstance(bed, BedTool):
        bed = bed.saveas().fn
    if bed is not None:
        target_name = target_name or ('target_' + basename(bed))
        name += '_on_' + target_name

    sample_name = sample_name or basename(bam)
    output_fpath = join(work_dir, sample_name + '_' + name)

    if not can_reuse(output_fpath, cmp_f=bam):
        cmdline = 'view -c -F "{query}" {bam}'.format(**locals())
        if bed is not None:
            cmdline += ' -L ' + bed
        call_sambamba(cmdline, bam_fpath=bam, output_fpath=output_fpath,
                      command_name=name)

    with open(output_fpath) as f:
        return int(f.read().strip())

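# Hedged usage sketch for count_in_bam: the BAM/BED paths are illustrative
# assumptions. The query uses sambamba filter syntax; with dedup=True this
# counts reads matching 'proper_pair and not duplicate' over the target.
def example_count_in_bam():
    return count_in_bam('work', 'sample1.bam', 'proper_pair',
                        dedup=True, bed='target.bed')
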
def sort_bed_gsort(input_bed_fpath, output_bed_fpath=None, work_dir=None,
                   fai_fpath=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either fai_fpath or a genome build name must be specified')

    with file_transaction(work_dir, output_bed_fpath) as tx:
        run('gsort {input_bed_fpath} {fai_fpath}'.format(**locals()), output_fpath=tx)

    return output_bed_fpath

def _split_bed(bed_file, work_dir):
    """ Splits a BED file into autosomal and sex-chromosome parts.
    """
    autosomal_bed = intermediate_fname(work_dir, bed_file, 'autosomal')
    sex_bed = intermediate_fname(work_dir, bed_file, 'sex')
    if not can_reuse(autosomal_bed, bed_file) or not can_reuse(sex_bed, bed_file):
        with open(bed_file) as f, \
                open(autosomal_bed, 'w') as a_f, \
                open(sex_bed, 'w') as s_f:
            for l in f:
                chrom = l.split()[0]
                if is_sex_chrom(chrom):
                    s_f.write(l)
                else:
                    a_f.write(l)
    return autosomal_bed, sex_bed

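# For reference, a minimal sketch of the is_sex_chrom() helper relied on above
# (the real implementation lives elsewhere in this codebase; this assumes it
# recognises X/Y with or without the 'chr' prefix):
def _is_sex_chrom_sketch(chrom):
    return chrom.replace('chr', '') in ('X', 'Y')
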
def _overlap_bed_files(bed_files, work_dir, genome):
    from clearup.panel import overlap_bed_files
    fnames = [basename(splitext_plus(fp)[0]) for fp in bed_files]
    overlapped_file = join(work_dir, f'{"__".join(fnames)}.{genome}.bed')
    if not can_reuse(overlapped_file, bed_files):
        overlap_bed_files(bed_files, overlapped_file)
    return overlapped_file

def overlap_bed_files(bed_files, output_bed_file):
    if can_reuse(output_bed_file, bed_files):
        return output_bed_file
    if len(bed_files) == 1:
        shutil.copy(bed_files.pop(), output_bed_file)
        return output_bed_file
    cmdl = 'bedops --intersect' + ''.join([' <(sort-bed ' + bf + ')' for bf in bed_files])
    call_process.run(cmdl, output_bed_file)
    return output_bed_file

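# Note: the command above uses bash process substitution, `<(sort-bed ...)`,
# so call_process.run() is assumed to execute commands through bash rather
# than /bin/sh. A hedged usage sketch with illustrative paths:
#
#     overlap_bed_files(['panel_a.bed', 'panel_b.bed'], 'overlap.bed')
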
def index_bam(bam_fpath, sambamba=None, samtools=None):
    sambamba = sambamba or get_executable()
    indexed_bam = bam_fpath + '.bai'
    if not can_reuse(indexed_bam, cmp_f=bam_fpath, silent=True):
        cmdline = '{sambamba} index {bam_fpath}'.format(**locals())
        run(cmdline, output_fpath=indexed_bam, stdout_to_outputfile=False,
            stdout_tx=False)

def sample_callable_bed(bam_file, output_bed_file, work_dir, genome_fasta_file, min_depth):
    """ Retrieves callable regions for a sample, subset by defined analysis regions.
    """
    callable_bed = _calculate(bam_file, work_dir, genome_fasta_file, min_depth)
    if not can_reuse(output_bed_file, callable_bed):
        with file_transaction(work_dir, output_bed_file) as tx_out_file:
            callable_regions = pybedtools.BedTool(callable_bed).filter(
                lambda x: x.name == 'CALLABLE')
            callable_regions.saveas(tx_out_file)
    return output_bed_file

def intersect_bed(work_dir, bed1, bed2):
    bed1_fname, _ = splitext_plus(basename(bed1))
    bed2_fname, _ = splitext_plus(basename(bed2))
    output_fpath = join(work_dir, bed1_fname + '__' + bed2_fname + '.bed')
    if can_reuse(output_fpath, [bed1, bed2]):
        return output_fpath
    bedtools = which('bedtools')
    cmdline = '{bedtools} intersect -u -a {bed1} -b {bed2}'.format(**locals())
    call_process.run(cmdline, output_fpath=output_fpath,
                     checks=[call_process.file_exists_check])
    return output_fpath

def load_bam_file(bam_file, bams_dir, snp_bed, sample_name):
    """ Slices the BAM file to fingerprint locations.
    """
    sliced_bam_file = join(bams_dir, sample_name + '.bam')
    if not can_reuse(sliced_bam_file, [bam_file, snp_bed]):
        cmdl = 'view {bam_file} -L {snp_bed} -F "not duplicate" -f bam'.format(**locals())
        call_sambamba(cmdl, bam_fpath=bam_file, output_fpath=sliced_bam_file)
    # index_bam(sliced_bam_file)
    return sliced_bam_file

def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None,
             fai_fpath=None, chr_order=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    regions = []

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical('Either chr_order, fai_fpath, or a genome build name must be specified')
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue

                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    start = int(fs[1])
                    end = int(fs[2])
                    other_fields = fs[3:]
                    order = chr_order.get(chrom, -1)
                    regions.append(Region(chrom, start, end, other_fields, order))

                for region in sorted(regions, key=lambda r: r.get_key()):
                    fs = [region.chrom, str(region.start), str(region.end)]
                    fs.extend(region.other_fields)
                    out.write('\t'.join(fs) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' + output_bed_fpath)
    return output_bed_fpath

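# Minimal sketch of the Region helper assumed above (the real class lives
# elsewhere in this codebase): judging by the constructor call and the sort,
# get_key() presumably ranks by chromosome order from the .fai, then by
# coordinates.
from collections import namedtuple

class RegionSketch(namedtuple('RegionSketch', 'chrom start end other_fields order')):
    def get_key(self):
        return self.order, self.start, self.end
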
def bam_to_bed(bam_fpath, to_gzip=True):
    debug('Converting the BAM to BED to save some memory.')
    # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + ('.bed.gz' if to_gzip else '.bed')
    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath
    bedtools = which('bedtools')
    gzip = which('gzip')
    cmdline = '{bedtools} bamtobed -i {bam_fpath}'.format(**locals())
    cmdline += ' | {gzip}'.format(**locals()) if to_gzip else ''
    call_process.run(cmdline, output_fpath=bam_bed_fpath)
    return bam_bed_fpath

def make_fingerprint(vcf_file, work_dir=None, label=None, fp_size=20, bed_file=None):
    log.info('Starting processing file ' + vcf_file)

    work_dir = work_dir or dirname(vcf_file)
    if label:
        print_name = label
    else:
        print_name = splitext_plus(basename(vcf_file))[0]
    print_name += '.print' + str(fp_size)
    print_name += '_dist' + str(Params.MIN_DIST)
    print_name += '_af' + str(Params.MIN_AF)
    if not Params.INTERREGION_PAIRS:
        print_name += '_skip_interregion_pairs'

    raw_print_file = join(work_dir, print_name)
    if can_reuse(raw_print_file, vcf_file):
        # Binary mode is required: the file holds raw float64 bytes written
        # by numpy.ndarray.tofile()
        with open(raw_print_file, 'rb') as f:
            raw = np.fromfile(f).reshape((len(index_by_key), fp_size))
    else:
        raw = _raw_fingerprint(vcf_file, fp_size=fp_size, bed_file=bed_file)
        with open(raw_print_file, 'wb') as f:
            raw.tofile(f)
        log.info(f'Saved raw fingerprints into {raw_print_file}')

    norm_print_name = print_name
    if Params.NORMALIZE_DIST:
        norm_print_name += '_normdist'
    if Params.NORMALIZE_VAR:
        norm_print_name += '_normvar'
    norm_print_file = join(work_dir, norm_print_name)
    if can_reuse(norm_print_file, raw_print_file):
        with open(norm_print_file, 'rb') as f:
            norm = np.fromfile(f).reshape((len(index_by_key), fp_size))
    else:
        norm = _normalize_fingerprint(raw)
        with open(norm_print_file, 'wb') as f:
            norm.tofile(f)
        log.info(f'Saved normalised fingerprints into {norm_print_file}')

    return label, norm

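# Round-trip sketch of the tofile()/fromfile() persistence used above. Binary
# mode matters: numpy writes raw float64 bytes with no shape information,
# hence the explicit reshape on load. The file name and shape are illustrative.
import numpy as np

def _fingerprint_roundtrip_example(n_keys=4, fp_size=20):
    raw = np.zeros((n_keys, fp_size))
    with open('example.print', 'wb') as f:
        raw.tofile(f)
    with open('example.print', 'rb') as f:
        return np.fromfile(f).reshape((n_keys, fp_size))
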
def _calculate(bam_file, work_dir, genome_fasta_file, min_depth):
    """ Calculate coverage in parallel using samtools depth through goleft.

        samtools depth removes duplicates and secondary reads from the counts:
            if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP))
                continue;
    """
    output_prefix = os.path.join(work_dir, bam_samplename(bam_file))
    callability_annotation_file = output_prefix + '.callable.bed'
    if not can_reuse(callability_annotation_file, bam_file):
        info(f'Calculating coverage at {bam_file}')
        run(f'goleft depth --q 1 --mincov {min_depth} --reference {genome_fasta_file} --ordered'
            f' --prefix {output_prefix} {bam_file}')

    callable_file = output_prefix + '.callable.CALLABLE.bed'
    if not can_reuse(callable_file, callability_annotation_file):
        with file_transaction(None, callable_file) as tx:
            pybedtools.BedTool(callability_annotation_file)\
                .filter(lambda x: x.name == 'CALLABLE')\
                .saveas(tx)

    return callable_file

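# goleft depth writes <prefix>.callable.bed with a label in the 4th column;
# an illustrative (assumed) excerpt, of which only the CALLABLE rows survive
# the filter above:
#
#     chr1    10000   10468   LOW_COVERAGE
#     chr1    10468   12000   CALLABLE
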
def clean_bed(bed_fpath, work_dir):
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')

    if not can_reuse(clean_fpath, bed_fpath):
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        bed = BedTool(bed_fpath)
        bed = bed.filter(lambda x: x.chrom and not any(
            x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))
        bed = bed.remove_invalid()
        with file_transaction(work_dir, clean_fpath) as tx_out_file:
            bed.saveas(tx_out_file)
        verify_bed(clean_fpath, is_critical=True)
        debug('Saved clean BED file into ' + clean_fpath)

    return clean_fpath

def annotate_target(work_dir, target_bed, genome_build):
    output_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    bed_annotation = which('annotate_bed.py')
    if not bed_annotation:
        bed_annotation = which('bed_annotation')
        if not bed_annotation:
            critical('Error: bed_annotation not found in PATH, please install with '
                     '`conda install -c vladsaveliev bed_annotation`.')

    cmdline = '{bed_annotation} {target_bed} -g {genome_build} -o {output_fpath}'.format(**locals())
    run(cmdline, output_fpath, stdout_to_outputfile=False)
    output_fpath = clean_bed(output_fpath, work_dir)
    return output_fpath

def _slice_vcf_fn(work_dir, label, vcf_file, overlapped_bed):
    sliced_vcf_file = join(work_dir, label + '.sliced.vcf')
    if not can_reuse(sliced_vcf_file, [vcf_file]):
        run(f'bcftools view {vcf_file} --targets-file {overlapped_bed} -o {sliced_vcf_file}')

    # ann_vcf_file = join(work_dir, label + '.sliced.ann.vcf')
    # if not can_reuse(ann_vcf_file, [sliced_vcf_file]):
    #     vcf_header = join(work_dir, label + '.vcf_header')
    #     with open(vcf_header, 'w') as f:
    #         f.write('##INFO=<ID=CHROM,Number=1,Type=String,Description="Region chromosome">\n')
    #         f.write('##INFO=<ID=FROM,Number=1,Type=String,Description="Region start">\n')
    #         f.write('##INFO=<ID=TO,Number=1,Type=String,Description="Region end">\n')
    #     run(f'bcftools annotate -c CHROM,FROM,TO -a {overlapped_bed} {sliced_vcf_file} '
    #         f'-h {vcf_header} -o {ann_vcf_file}')

    return label, sliced_vcf_file

def phylo_tree_page(run_id):
    project_names = run_id.split(',')
    projects = [Project.query.filter_by(name=pn).first() for pn in project_names]
    if not projects:
        log.err('Projects ' + ', '.join(project_names) + ' not found in database')
        abort(404)
    color_by_proj = {p.name: PROJ_COLORS[i % len(PROJ_COLORS)]
                     for i, p in enumerate(projects)}

    work_dirpath = safe_mkdir(join(config.DATA_DIR, '_AND_'.join(project_names)))
    merged_fasta_fpath = merge_fasta(projects, work_dirpath)

    prank_out = os.path.join(work_dirpath,
                             os.path.splitext(os.path.basename(merged_fasta_fpath))[0])
    tree_fpath = os.path.join(prank_out + '.best.dnd')
    if not can_reuse(tree_fpath, merged_fasta_fpath):
        return render_template(
            'processing.html',
            projects=[{'name': p.name} for p in projects],
            run_id=run_id,
            title='Processing ' + ', '.join(project_names),
        )

    log.debug('Prank results found, rendering tree!')
    tree = next(Phylo.parse(tree_fpath, 'newick'))
    seq_by_id = read_fasta(merged_fasta_fpath)
    tree_json = tree_to_json_for_d3(tree, seq_by_id, color_by_proj, run_id=run_id)

    all_samples_count = sum(len(p.samples.all()) for p in projects)
    return render_template(
        'tree.html',
        projects=[{'name': p.name, 'color': color_by_proj[p.name]} for p in projects],
        title=', '.join(project_names),
        data=tree_json,
        tree_height=20 * all_samples_count,
        tree_width=5 * all_samples_count,
    )

def _make_snp_file(dbsnp_snps_file, genome_build, output_file,
                   autosomal_locations_limit=175, min_snp_amount=30):
    if can_reuse(output_file, dbsnp_snps_file):
        return output_file

    locs_by_gene = defaultdict(list)
    total_locs = 0
    for i, interval in enumerate(BedTool(dbsnp_snps_file)):
        if is_sex_chrom(interval.chrom):
            continue
        pos = int(interval.start) + 1
        annots = interval.name.split('|')
        # if len(annots) == 2:
        #     rsid, gene = interval.name.split('|')
        #     ref = interval[4]
        # else:
        rsid, gene, ref, alts = interval.name.split('|')
        loc = (interval.chrom, pos, rsid, gene, ref, alts)
        locs_by_gene[gene].append(loc)
        total_locs += 1

    random.seed(1234)  # seeding random for reproducibility

    # Selecting random genes (list() is needed: random.sample does not accept
    # a dict keys view)
    gnames = random.sample(list(locs_by_gene.keys()),
                           min(len(locs_by_gene), autosomal_locations_limit))
    locs_by_gene = {g: locs_by_gene[g] for g in gnames}

    # Selecting random SNPs in each gene
    # min_locs_per_gene = min(len(locs) for locs in locs_by_gene.values())
    # if pick_unclustered:
    #     locs_per_gene = min(autosomal_locations_limit / len(gnames), min_locs_per_gene)
    #     while locs_per_gene * len(gnames) < min_snp_amount:
    #         locs_per_gene = math.ceil(float(min_snp_amount) / len(gnames))
    #     selected_locs_by_gene = {g: random.sample(locs_by_gene[g], locs_per_gene) for g in gnames}
    #     selected_locs = [l for gene_locs in selected_locs_by_gene.values() for l in gene_locs]
    # else:
    all_locs = [l for gene_locs in locs_by_gene.values() for l in gene_locs]

    # Selecting unclustered SNPs within genes (note: prev_pos is not reset
    # between chromosomes)
    non_clustered_locs = []
    prev_pos = 0
    for (chrom, pos, rsid, gene, ref, alts) in all_locs:
        if 0 < pos - prev_pos < 500:
            continue
        else:
            prev_pos = pos
            non_clustered_locs.append((chrom, pos, rsid, gene, ref, alts))

    # Selecting random SNPs within the limit
    selected_locs = random.sample(non_clustered_locs,
                                  min(len(non_clustered_locs), autosomal_locations_limit))

    # Sorting final locations
    chrom_order = get_chrom_order(genome_build)
    selected_locs.sort(key=lambda a: (chrom_order.get(a[0], -1), a[1:]))

    log.debug('Selected the following autosomal SNPs:')
    for (chrom, pos, rsid, gene, ref, alts) in selected_locs:
        # alts is already a comma-separated string parsed from the name field
        log.debug('  ' + chrom + ':' + str(pos) + '\t' + rsid + '\t' + gene +
                  '\t' + ref + '>' + alts)

    with file_transaction(None, output_file) as tx:
        with open(tx, 'w') as out:
            for (chrom, pos, rsid, gene, ref, alts) in selected_locs:
                out.write('\t'.join([chrom, str(pos - 1), str(pos),
                                     rsid + '|' + gene + '|' + ref + '|' + alts]) + '\n')

    return output_file

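# Worked example of the de-clustering pass above: with prev_pos starting at 0
# and same-chromosome positions [100, 350, 900, 1300], only 900 survives:
# 100 and 350 fall within 500 bp of the running anchor (initially 0), 900 is
# the first to clear the 500 bp gap, and 1300 is within 500 bp of 900.
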
def _vardict_pileup_sample(sample, work_dir, output_dir, genome_fasta_file, snp_file):
    vardict_snp_vars = join(work_dir, sample.name + '_vars.txt')
    vcf_file = join(output_dir, sample.name + '.vcf')
    if can_reuse(vardict_snp_vars, [sample.bam, snp_file]) and \
            can_reuse(vcf_file, vardict_snp_vars):
        return vcf_file

    vardict_exec = which('vardict')
    if not vardict_exec:
        critical('Error: vardict is not in PATH. Please install it with '
                 '`conda install -c bioconda vardict`')
    vardict_bin_dir = dirname(vardict_exec)

    # Run VarDict
    index_bam(sample.bam)
    cmdl = '{vardict_exec} -G {genome_fasta_file} -N {sample.name} -b {sample.bam} ' \
           '-p -D {snp_file}'.format(**locals())
    call_process.run(cmdl, output_fpath=vardict_snp_vars)

    # Complex variants might have shifted start positions with respect to the
    # rsid, so we are associating starts with rsids for further SNP identification
    ann_by_var = defaultdict(list)
    with open(vardict_snp_vars) as f:
        for l in f:
            fs = l.split('\t')
            ann, chrom, start = fs[1], fs[2], fs[3]
            ann_by_var[(chrom, start)] = ann

    info()
    info('Converting to VCF')
    work_vcf_file = join(work_dir, sample.name + '_vars.vcf')
    # The VarDict helper scripts are assumed to sit next to the vardict
    # executable (the original called join() with a single argument, which is
    # a no-op, and left vardict_bin_dir unused)
    cmdl = ('cut -f-34 ' + vardict_snp_vars +
            ' | awk -F"\\t" -v OFS="\\t" \'{for (i=1;i<=NF;i++) { if ($i=="") $i="0" } print $0 }\'' +
            ' | ' + join(vardict_bin_dir, 'teststrandbias.R') +
            ' | ' + join(vardict_bin_dir, 'var2vcf_valid.pl') + ' -A -f 0.2')
    call_process.run(cmdl, output_fpath=work_vcf_file)

    # Fix non-call records with empty REF and ALT, and "NA" values assigned
    # to INFO's SN and HICOV
    fixed_vcf_file = add_suffix(work_vcf_file, 'fixed')
    info('Fixing VCF for parsing, writing to ' + fixed_vcf_file)
    with open(work_vcf_file) as inp, open(fixed_vcf_file, 'w') as out_f:
        for l in inp:
            if l.startswith('#'):
                out_f.write(l)
            else:
                fs = l.split('\t')
                chrom, pos, _, ref, alt = fs[0], int(fs[1]), fs[2], fs[3], fs[4]
                if alt in ['.', '']:
                    # Reading the reference allele from fasta
                    fs[4] = fs[3] = _get_fasta_ref(genome_fasta_file, chrom, pos)
                    l = '\t'.join(fs)
                l = l.replace('=NA;', '=.;')
                l = l.replace('=;', '=.;')
                l = l.replace('TYPE=0', 'TYPE=REF')
                out_f.write(l)
    assert verify_file(fixed_vcf_file)

    info('Annotating VCF with gene names and rsIDs')
    ann_vcf_file = add_suffix(fixed_vcf_file, 'ann')
    with open(fixed_vcf_file) as f, open(ann_vcf_file, 'w') as out:
        vcf_reader = vcf.Reader(f)
        vcf_writer = vcf.Writer(out, vcf_reader)
        for rec in vcf_reader:
            ann = ann_by_var[(rec.CHROM, str(rec.POS))]
            rec.ID = ann.split('|')[0]
            rec.INFO['ANNOTATION'] = ann
            vcf_writer.write_record(rec)
    assert verify_file(ann_vcf_file), ann_vcf_file

    ann_hdr_vcf_file = add_suffix(ann_vcf_file, 'hdr')
    cmdl = 'bcftools annotate -h <(echo ' \
           '\'##INFO=<ID=ANNOTATION,Number=1,Type=String,Description="rsid|gene_name|ref|alts">\') ' + \
           bgzip_and_tabix(ann_vcf_file)
    call_process.run(cmdl, output_fpath=ann_hdr_vcf_file)

    debug('Renaming ' + ann_hdr_vcf_file + ' -> ' + vcf_file)
    os.rename(ann_hdr_vcf_file, vcf_file)
    return vcf_file

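# Illustration of the record fixes applied above (assumed raw VarDict output
# fragments, not taken from a real file):
#
#     '=NA;'  -> '=.;'    and    '=;' -> '=.;'    (missing INFO values)
#     'TYPE=0' -> 'TYPE=REF'                      (non-variant records)
#     empty ALT ('.' or '') -> REF and ALT both set to the fasta reference base
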