def batch_callable_bed(bam_files, output_bed_file, work_dir, genome_fasta_file, min_depth, parall_view=None):
    """ Picks 3 random samples and computes callable regions for them: a trade-off between
        looping through all samples in a huge batch and hitting a sample with outstanding
        coverage. (The random sampling is currently commented out, so all samples are used.)
    """
    if can_reuse(output_bed_file, bam_files):
        return output_bed_file

    work_dir = safe_mkdir(join(work_dir, 'callable_work'))

    # random.seed(1234)  # seeding random for reproducibility
    # bam_files = random.sample(bam_files, min(len(bam_files), 3))

    if parall_view:
        callable_beds = parall_view.run(_calculate, [
            [bf, work_dir, genome_fasta_file, min_depth] for bf in bam_files])
    else:
        with parallel_view(len(bam_files), ParallelCfg(threads=len(bam_files)), work_dir) as parall_view:
            callable_beds = parall_view.run(_calculate, [
                [bf, work_dir, genome_fasta_file, min_depth] for bf in bam_files])

    good_overlap_sample_fraction = 0.8  # we want to pick the regions that have coverage in 80% of samples
    good_overlap_count = max(1, good_overlap_sample_fraction * len(callable_beds))
    info(f'Intersecting callable regions and picking good overlaps with >={good_overlap_count} '
         f'samples ({100 * good_overlap_sample_fraction}% of {len(callable_beds)})')

    with file_transaction(work_dir, output_bed_file) as tx:
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        intersection = pybedtools.BedTool() \
            .multi_intersect(i=callable_beds) \
            .filter(lambda r: len(r[4].split(',')) >= good_overlap_count)
        intersection.saveas(tx)

    info(f'Saved to {output_bed_file}')
    return output_bed_file
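
# A minimal illustration of the filtering step in batch_callable_bed(), assuming the standard
# `bedtools multiinter` output layout (chrom, start, end, number of files, comma-separated list
# of files, then one 0/1 flag column per file). Column index 4 is the comma-separated list, so
# counting its items gives the number of samples whose callable regions cover the interval:
#
#   chrom  start  end  num  list             sampleA  sampleB  sampleC
#   chr1   100    200  2    sampleA,sampleB  1        1        0
#
# With 3 input samples and good_overlap_sample_fraction = 0.8, good_overlap_count is 2.4, so the
# interval above (covered by only 2 samples) would be dropped by the filter lambda.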

def main(bcbio_dir, bed, depth, threads=None, isdebug=True):
    snp_file = verify_file(bed)
    depth_cutoff = depth

    log.init(isdebug)

    try:
        import az
    except ImportError:
        parallel_cfg = ParallelCfg(threads=threads)
    else:
        sys_cfg = az.init_sys_cfg()
        parallel_cfg = ParallelCfg(
            scheduler=sys_cfg.get('scheduler'),
            queue=sys_cfg.get('queue'),
            resources=sys_cfg.get('resources'),
            threads=threads or sys_cfg.get('threads'),
            tag='clearup')

    log.info('Loading bcbio project from ' + bcbio_dir)
    log.info('-' * 70)
    proj = BcbioProject()
    proj.load_from_bcbio_dir(bcbio_dir, proc_name='clearup')
    log.info('Loaded ' + proj.final_dir)

    log_dir = safe_mkdir(join(proj.log_dir, 'clearup'))
    work_dir = safe_mkdir(join(proj.work_dir, 'clearup'))
    out_dir = safe_mkdir(join(proj.date_dir, 'clearup'))

    with parallel_view(len(proj.samples), parallel_cfg, log_dir) as parall_view:
        genotype(proj.samples, snp_file, parall_view, work_dir, out_dir, proj.genome_build)
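
# Hypothetical invocation sketch (the directory and BED path are made up): this entry point
# loads a bcbio project and genotypes its samples at the given SNP panel, falling back to a
# plain thread-based ParallelCfg when the `az` site config is not installed.
#
#   main(bcbio_dir='/projects/my_bcbio_run', bed='snps.bed', depth=DEPTH_CUTOFF, threads=8)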

def _add_project(bam_by_sample, project_name, bed_file=None, use_callable=False, data_dir='',
                 genome='hg19', min_depth=DEPTH_CUTOFF, depth_by_sample=None, reuse_files=False):
    fp_proj = Project.query.filter(Project.name == project_name).first()
    if fp_proj:
        fp_proj.delete(reuse_files=reuse_files)

    fp_proj = Project(
        name=project_name,
        data_dir=data_dir,
        genome=genome,
        bed_fpath=bed_file,
        min_depth=min_depth,
        used_callable=use_callable,
    )
    db.session.add(fp_proj)

    db_samples = []
    for sname, bam_file in bam_by_sample.items():
        db_samples.append(Sample(sname, fp_proj, bam_file))
    db.session.add_all(db_samples)

    work_dir = safe_mkdir(fp_proj.get_work_dir())

    do_ngb = False
    do_sex = False
    do_create_run = False
    if do_ngb or do_sex or do_create_run or use_callable:
        with parallel_view(len(bam_by_sample), parallel_cfg, work_dir) as p_view:
            if use_callable:
                log.info(f'Calculating callable regions for {project_name}.')
                genome_fasta_file = get_ref_fasta(genome)
                fp_proj.bed_fpath = batch_callable_bed(
                    bam_by_sample.values(), join(work_dir, 'callable_regions.bed'),
                    work_dir, genome_fasta_file, min_depth, parall_view=p_view)
                log.debug(f'Set bed file {fp_proj.bed_fpath}')

            if do_create_run:
                get_or_create_run([fp_proj], parall_view=p_view)

            if do_ngb:
                log.info('Exposing to NGB')
                _add_to_ngb(work_dir, project_name, bam_by_sample, genome, bed_file, p_view)

            if do_sex:
                log.info('Genotyping sex')
                sex_work_dir = safe_mkdir(join(work_dir, 'sex'))
                sexes = p_view.run(_sex_from_bam, [[
                    db_s.name,
                    bam_by_sample[db_s.name],
                    bed_file,
                    sex_work_dir,
                    genome,
                    depth_by_sample.get(db_s.name) if depth_by_sample else None,
                    [snp.depth for snp in db_s.snps.all()],
                ] for db_s in db_samples])
                for s, sex in zip(db_samples, sexes):
                    s.sex = sex

    db.session.commit()
    log.info()
    log.info('Done.')
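
# Hypothetical usage sketch (sample names and BAM paths are made up): _add_project() expects a
# mapping of sample name -> BAM path plus project metadata; when use_callable=True the project
# BED is replaced with callable regions derived from the BAMs via batch_callable_bed().
#
#   _add_project(
#       bam_by_sample={'sampleA': '/data/sampleA.bam', 'sampleB': '/data/sampleB.bam'},
#       project_name='my_project',
#       use_callable=True,
#       genome='hg19')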

def main(ctx, subdirs, output_dir, threads=1, fp_size=Params.L, isdebug=False):
    """ Generates a PNG image with a relatedness heatmap.
    """
    if not subdirs:
        ctx.fail('Provide at least one input directory.')

    datasets = _load_datasets(subdirs)

    title = ', '.join(d.name for d in datasets) + '\nL=' + str(fp_size)
    if not Params.NORMALIZE_VAR:    title += ', not norm by var'
    if not Params.NORMALIZE_DIST:   title += ', not norm by dist'
    if Params.SKIP_DAMAGE:          title += ', skipped damage'
    if Params.SKIP_REJECT:          title += ', skipped REJECT'
    if Params.SKIP_NOCALL:          title += ', skipped num called = 0'
    if Params.MIN_AF:               title += ', min AF=' + str(Params.MIN_AF)
    if Params.MIN_DIST:             title += ', min dist=' + str(Params.MIN_DIST)
    if Params.INTERREGION_PAIRS:
        title += ', used SNP pairs between regions'
    else:
        title += ', skipped SNP pairs between regions'

    run_id = '__'.join(d.name for d in datasets)
    run_dir = safe_mkdir(join((output_dir or join(code_dir, 'runs')), run_id))
    log.init(isdebug, join(run_dir, 'log.txt'), save_previous=True)
    work_dir = safe_mkdir(join(run_dir, 'work'))

    all_vcf_by_label = dict()
    bed_files_by_genome = defaultdict(set)
    for d in datasets:
        all_vcf_by_label.update(d.vcf_by_label)
        if d.bed_file:
            bed_files_by_genome[d.genome].add(d.bed_file)  # d.bed_file=None for WGS
    genome_by_label = dict()
    for d in datasets:
        for label in d.vcf_by_label:
            genome_by_label[label] = d.genome

    parallel_cfg = ParallelCfg(threads=threads)
    log.info(f'Starting using {parallel_cfg.threads} threads')
    with parallel_view(len(all_vcf_by_label), parallel_cfg, work_dir) as parall_view:
        overlap_bed_file_by_genome = dict()
        if bed_files_by_genome:
            overlap_bed_file_by_genome = _prep_bed(work_dir, bed_files_by_genome, overlap_bed_file_by_genome)

        log.info('Slicing VCFs to regions in BED files')
        out = parall_view.run(_slice_vcf_fn, [[
            work_dir, label, vcf, overlap_bed_file_by_genome.get(genome_by_label[label])
        ] for label, vcf in all_vcf_by_label.items()])
        all_vcf_by_label = dict(out)
        log.info()

        log.info('Calculating fingerprints for individual samples')
        out = parall_view.run(make_fingerprint, [[
            vcf, work_dir, label, fp_size, overlap_bed_file_by_genome.get(genome_by_label[label])
        ] for label, vcf in all_vcf_by_label.items()])
        print_label_pairs = dict(out)
        log.info()

    log.info('Comparing fingerprints pairwise')
    pairwise_dict = defaultdict(dict)
    for ((label1, print1), (label2, print2)) in it.combinations_with_replacement(print_label_pairs.items(), 2):
        dist, pvalue = compare(print1, print2)
        if dist:
            log.info(f' {label1} VS {label2}: {dist:.2f}, Pvalue={pvalue:.2f}')
        else:
            log.info(f' {label1} VS {label2}: failed to calculate')
            dist = float('NaN')
        pairwise_dict[label1][label2] = dist
        pairwise_dict[label2][label1] = dist

    log.info('Plotting comparison heatmap')
    plot_heatmap(pairwise_dict, run_dir, title)
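
# A small illustration of the pairwise loop above. itertools.combinations_with_replacement
# yields each unordered pair of labels exactly once, including the self-pairs, and the loop
# mirrors every distance, so pairwise_dict ends up as a symmetric matrix suitable for the
# heatmap. For three labels:
#
#   >>> import itertools as it
#   >>> list(it.combinations_with_replacement(['s1', 's2', 's3'], 2))
#   [('s1', 's1'), ('s1', 's2'), ('s1', 's3'), ('s2', 's2'), ('s2', 's3'), ('s3', 's3')]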

def create(projects, parall_view=None):
    run = Run()
    db.session.add(run)
    for p in projects:
        run.projects.append(p)
    db.session.commit()

    genomes = [p.genome for p in projects]
    if len(set(genomes)) > 1:
        log.critical('Error: multiple genomes in projects: ' + str(genomes))
    genome_build = genomes[0]

    snps_dir = safe_mkdir(join(run.work_dir_path(), 'snps'))
    run.snps_file = build_snps_panel(
        bed_files=[p.bed_fpath for p in projects if p.bed_fpath],
        output_dir=snps_dir, genome=genome_build)
    locations = extract_locations_from_file(run.snps_file)
    for loc in locations:
        db.session.add(loc)
    db.session.commit()

    log.info()
    log.info('Genotyping')
    samples = [s for p in projects for s in p.samples]
    snps_left_to_call_file = _get_snps_not_called(run.snps_file, samples)
    vcf_dir = safe_mkdir(join(run.work_dir_path(), 'vcf'))
    work_dir = safe_mkdir(join(vcf_dir, 'work'))
    bs = [BaseSample(s.long_name(), bam=s.bam) for s in samples]
    if parall_view:
        vcf_by_sample = genotype(bs, snps_left_to_call_file, parall_view,
                                 work_dir=work_dir, output_dir=vcf_dir, genome_build=genome_build)
    else:
        n_threads = parallel_cfg.threads
        if len(samples) < n_threads:  # vardict is running in 1 thread
            parallel_cfg.threads = len(samples)
        with parallel_view(len(samples), parallel_cfg, safe_mkdir(join(run.work_dir_path(), 'log'))) as view:
            vcf_by_sample = genotype(bs, snps_left_to_call_file, view,
                                     work_dir=work_dir, output_dir=vcf_dir, genome_build=genome_build)
        parallel_cfg.threads = n_threads

    # TODO: speed this up
    log.info('Loading called SNPs into the DB')
    for s in samples:
        recs = [r for r in VCF(vcf_by_sample[s.long_name()])]
        recs_by_rsid = defaultdict(list)
        for r in recs:
            recs_by_rsid[r.ID].append(r)
        for loc in locations:
            assert loc
            snp = s.snps.filter(SNP.rsid == loc.rsid).first()
            if not snp:
                snp = SNP(loc)
            build_snp_from_records(snp, recs_by_rsid[loc.rsid], s.project.min_depth)
            s.snps.append(snp)
            db.session.add(snp)

    log.info('Adding locations into the DB')
    run.locations.delete()
    for l in locations:
        run.locations.append(l)
    db.session.add(run)
    db.session.commit()
    log.info('Saved locations in the DB')

    log.info()
    log.info('Building tree')
    build_tree(run)

    log.info()
    log.info('Loading BAMs sliced to fingerprints')
    if parall_view:
        parall_view.run(load_bam_file, [[
            s.bam, safe_mkdir(join(run.work_dir_path(), 'bams')), run.snps_file, s.long_name()
        ] for s in samples])
    else:
        with parallel_view(len(samples), parallel_cfg, safe_mkdir(join(run.work_dir_path(), 'log'))) as view:
            view.run(load_bam_file, [[
                s.bam, safe_mkdir(join(run.work_dir_path(), 'bams')), run.snps_file, s.long_name()
            ] for s in samples])

    return run
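
# A note on the "Loading called SNPs into the DB" step above, assuming VCF() is the cyvcf2
# reader (records expose the ID column as `r.ID`, i.e. the rsID). Several records can share
# one rsID, hence a list per key; the grouping yields a mapping like the hypothetical one
# below, which build_snp_from_records() reduces to a single SNP call per panel location:
#
#   {'rs1234': [<Variant chr1:12345 A/G>], 'rs5678': [<Variant chr2:67890 C/T>]}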

def main(subdirs, output_dir, threads=1, fp_size=Params.L, isdebug=False):
    """ Generates a PNG image with a relatedness heatmap.
    """
    datasets = _load_datasets(subdirs)

    title = ', '.join(d.name for d in datasets) + '\nL=' + str(fp_size)
    if not Params.NORMALIZE_VAR:    title += ', not norm by var'
    if not Params.NORMALIZE_DIST:   title += ', not norm by dist'
    if Params.SKIP_DAMAGE:          title += ', skipped damage'
    if Params.SKIP_REJECT:          title += ', skipped REJECT'
    if Params.SKIP_NOCALL:          title += ', skipped num called = 0'
    if Params.MIN_AF:               title += ', min AF=' + str(Params.MIN_AF)
    if Params.MIN_DIST:             title += ', min dist=' + str(Params.MIN_DIST)
    if Params.INTERREGION_PAIRS:
        title += ', used SNP pairs between regions'
    else:
        title += ', skipped SNP pairs between regions'

    run_id = '__'.join(d.name for d in datasets)
    run_dir = safe_mkdir(join((output_dir or join(basedir, 'runs')), run_id))
    log.init(isdebug, join(run_dir, 'log.txt'), save_previous=True)
    work_dir = safe_mkdir(join(run_dir, 'work'))

    all_vcf_by_label = dict()
    bed_files_by_genome = defaultdict(set)
    for d in datasets:
        all_vcf_by_label.update(d.vcf_by_label)
        bed_files_by_genome[d.genome].add(d.bed_file)  # d.bed_file=None for WGS
    genome_by_label = dict()
    for d in datasets:
        for label in d.vcf_by_label:
            genome_by_label[label] = d.genome

    parallel_cfg = ParallelCfg(threads=threads)
    log.info(f'Starting using {parallel_cfg.threads} threads')

    overlap_bed_file_by_genome = dict()
    with parallel_view(len(all_vcf_by_label), parallel_cfg, work_dir) as parall_view:
        if bed_files_by_genome:
            log.info(f'Found BED files: {bed_files_by_genome}')
            for genome, bed_files in bed_files_by_genome.items():
                bed_files = [b for b in bed_files if b]
                log.info(f'Overlapping BED files for genome {genome}')
                overlap_bed_file_by_genome[genome] = _overlap_bed_files(bed_files, work_dir, genome) \
                    if bed_files else None

            # Pick the genome with the most BED files as "primary", lift the other genomes'
            # overlaps onto it, re-overlap everything there, then lift the result back.
            primary_genome = sorted(bed_files_by_genome.items(), key=lambda kv: len(kv[1]))[-1][0]
            lifted_bed_files = []
            for genome, overlap_bed_file in overlap_bed_file_by_genome.items():
                if overlap_bed_file and genome != primary_genome:
                    lifted_bed_file = lift_over(overlap_bed_file, genome, primary_genome)
                    lifted_bed_files.append(lifted_bed_file)
            if lifted_bed_files:
                primary_bed_files = [
                    b for b in lifted_bed_files + [overlap_bed_file_by_genome[primary_genome]] if b]
                overlap_bed_file_by_genome[primary_genome] = _overlap_bed_files(
                    primary_bed_files, work_dir, primary_genome)

                log.info('Lifting BED files back')
                for genome in overlap_bed_file_by_genome:
                    if genome != primary_genome:
                        overlap_bed_file_by_genome[genome] = lift_over(
                            overlap_bed_file_by_genome[primary_genome], primary_genome, genome)

            log.info()
            log.info('Sorting, bgzipping and tabixing BED files')
            for g, bed in overlap_bed_file_by_genome.items():
                overlap_bed_file_by_genome[g] = bgzip_and_tabix(sort_bed(bed, genome=g))
            log.info()

        log.info('Slicing VCFs to regions in BED files')
        out = parall_view.run(_slice_vcf_fn, [[
            work_dir, label, vcf, overlap_bed_file_by_genome.get(genome_by_label[label])
        ] for label, vcf in all_vcf_by_label.items()])
        all_vcf_by_label = dict(out)
        log.info()

        log.info('Calculating fingerprints for individual samples')
        out = parall_view.run(make_fingerprint, [[
            vcf, work_dir, label, fp_size, overlap_bed_file_by_genome[genome_by_label[label]]
        ] for label, vcf in all_vcf_by_label.items()])
        print_label_pairs = dict(out)
        log.info()

    log.info('Comparing fingerprints pairwise')
    pairwise_dict = defaultdict(dict)
    for ((label1, print1), (label2, print2)) in it.combinations_with_replacement(print_label_pairs.items(), 2):
        dist, pvalue = compare(print1, print2)
        if dist:
            log.info(f' {label1} VS {label2}: {dist:.2f}, Pvalue={pvalue:.2f}')
        else:
            log.info(f' {label1} VS {label2}: failed to calculate')
            dist = float('NaN')
        pairwise_dict[label1][label2] = dist
        pairwise_dict[label2][label1] = dist

    log.info('Plotting comparison heatmap')
    plot_heatmap(pairwise_dict, run_dir, title)