def main(bcbio_dir, bed, depth, threads=None, isdebug=True):
    """
    Loads a bcbio project and genotypes its samples at the SNPs from a BED file.

    :param bcbio_dir: path of the bcbio project directory to load
    :param bed: path to the SNPs BED file; passed through `verify_file`
    :param depth: accepted for interface compatibility; not used by this function
    :param threads: number of threads for the parallel view; when falsy and the
        optional `az` module is importable, the `threads` value of its system
        config is used instead
    :param isdebug: passed to `log.init`
    """
    # NOTE(review): `depth` used to be copied into an unused local; it is not
    # forwarded to `genotype` - confirm whether a depth cutoff should be applied.
    snp_file = verify_file(bed)

    log.init(isdebug)

    # The `az` package is optional: when present, its system config supplies
    # the scheduler settings; otherwise only the thread count is configured.
    try:
        import az
    except ImportError:
        parallel_cfg = ParallelCfg(threads=threads)
    else:
        sys_cfg = az.init_sys_cfg()
        parallel_cfg = ParallelCfg(
            scheduler=sys_cfg.get('scheduler'),
            queue=sys_cfg.get('queue'),
            resources=sys_cfg.get('resources'),
            threads=threads or sys_cfg.get('threads'),
            tag='clearup')

    log.info('Loading bcbio project from ' + bcbio_dir)
    log.info('-' * 70)
    proj = BcbioProject()
    proj.load_from_bcbio_dir(bcbio_dir, proc_name='clearup')
    log.info('Loaded ' + proj.final_dir)

    # All outputs go into dedicated "clearup" subdirectories of the project
    log_dir = safe_mkdir(join(proj.log_dir, 'clearup'))
    work_dir = safe_mkdir(join(proj.work_dir, 'clearup'))
    out_dir = safe_mkdir(join(proj.date_dir, 'clearup'))
    with parallel_view(len(proj.samples), parallel_cfg, log_dir) as parall_view:
        genotype(proj.samples, snp_file, parall_view, work_dir, out_dir, proj.genome_build)
def make_cluster_cmdl(log_dir, refdata, app_name, cluster_submit_cmd=None):
    """ Generates cluster command line parameters for snakemake

    :param log_dir: directory passed to the submit script; also receives
        `jobscript.sh` when a jobscript override is defined
    :param refdata: object providing `cluster_cmd` and `name`
    :param app_name: application name passed to the submit script
    :param cluster_submit_cmd: explicit submission command; when not given,
        `refdata.cluster_cmd` is used
    :return: string with the ` --cluster ...` (and optionally ` --jobscript ...`)
        options to append to a snakemake command line
    """
    if not cluster_submit_cmd and not refdata.cluster_cmd:
        logger.critical(f'Automatic cluster submission '
                        f'is not supported for the machine "{refdata.name}". '
                        f'Use explicit --cluster-cmd')
    if not cluster_submit_cmd:
        cluster_submit_cmd = refdata.cluster_cmd

    # Replacing the curly braces to avoid confusing snakemake formatter which for some reason triggers
    cluster_submit_cmd = cluster_submit_cmd.replace('{', '[').replace('}', ']')
    cluster_submitter = get_submit_script()
    timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')

    # NOTE(review): this import rebinds the `refdata` parameter, so the jobscript
    # below is read from `reference_data.api` rather than from the object passed
    # by the caller - confirm this is intended before moving or removing it.
    from reference_data import api as refdata

    cluster_cmdl = \
        f' --cluster "{cluster_submitter} {timestamp} {log_dir} {app_name} ' \
        f'\'{cluster_submit_cmd}\'"'

    # Also overriding jobscript?
    jobscript = refdata.cluster_jobscript
    if jobscript:
        safe_mkdir(log_dir)
        jobscript_file = join(log_dir, 'jobscript.sh')
        with open(jobscript_file, 'w') as f_out:
            # The "{path}" placeholder of the template is filled with the current PATH
            f_out.write(jobscript.replace('{path}', os.environ["PATH"]))
        cluster_cmdl += f' --jobscript "{jobscript_file}"'

    return cluster_cmdl
def _run_cmd(self, cmdl, input_paths, output_path, before_run_fn=None):
    """
    Runs the tested command line, unless only diffs were requested.

    Behaviour is controlled by `BaseTestCase.only_diff` / `BaseTestCase.reuse`
    and by environment variables: any variable whose upper-cased name contains
    both "TEST" and "DIFF" enables the diff-only mode, "TEST" and "REUSE"
    enables reuse of previous results, and the value of the first variable
    containing "TEST" and "OPTS" is appended to the command line.

    :param cmdl: command line string to execute
    :param input_paths: a path or a list of paths that must exist before the run
    :param output_path: output location; its parent directory is created
    :param before_run_fn: optional callable invoked right before the command
    """
    only_diff = BaseTestCase.only_diff or any(
        'TEST' in e.upper() and 'DIFF' in e.upper() for e in os.environ)
    reuse = BaseTestCase.reuse or any(
        'TEST' in e.upper() and 'REUSE' in e.upper() for e in os.environ)
    if only_diff:
        echo('TEST_DIFF_ONLY set: not actually running the program, '
             'only checking diffs with the previous results')
    if reuse:
        echo('TEST_REUSE set: running on top of the previous results')
    tools_opts = next((os.environ[e] for e in os.environ
                       if 'TEST' in e.upper() and 'OPTS' in e.upper()), '')

    if not only_diff:
        input_paths = [input_paths] if isinstance(input_paths, str) else input_paths
        for ip in input_paths:
            assert exists(ip), f'Data {ip} does not exist.'

        # Previous results are handed to `swap_output` unless they are to be reused
        if not reuse:
            swap_output(output_path)
        safe_mkdir(dirname(output_path))

        if before_run_fn:
            before_run_fn()
        echo('-' * 100)
        check_call(cmdl + ' ' + tools_opts)
        echo('-' * 100)
        echo('')
def run_prank(run_id):
    """
    Runs prank on the merged FASTA of the requested projects and streams its
    output to the client through 'running' events.

    :param run_id: comma-separated project names
    """
    project_names = run_id.split(',')
    projects = [Project.query.filter_by(name=pn).first() for pn in project_names]
    # `.first()` gives None for an unknown name, so the list itself is never
    # empty; the missing projects have to be detected element-wise.
    missing = [pn for pn, p in zip(project_names, projects) if p is None]
    if missing:
        log.err('Projects ' + ', '.join(missing) + ' not found in database')
        abort(404)

    work_dirpath = safe_mkdir(join(config.DATA_DIR, '_AND_'.join(project_names)))
    merged_fasta_fpath = merge_fasta(projects, work_dirpath)
    prank_out = os.path.join(work_dirpath, os.path.splitext(os.path.basename(merged_fasta_fpath))[0])
    cmdl = prank_bin + ' -d=' + merged_fasta_fpath + ' -o=' + prank_out + ' -showtree'
    log.debug('Starting prank ' + cmdl)

    # Text mode makes readline() return str, so the '' end-of-stream sentinel
    # below works on Python 3 as well as on Python 2.
    proc = subprocess.Popen(cmdl.split(), stderr=subprocess.STDOUT, stdout=subprocess.PIPE,
                            universal_newlines=True)
    try:
        for stdout_line in iter(proc.stdout.readline, ''):
            line = stdout_line.rstrip()
            print(line)
            emit('running', json.dumps({
                'finished': False,
                'lines': [line],
            }))
    finally:
        # Release the pipe and reap the child process
        proc.stdout.close()
        proc.wait()

    emit('running', json.dumps({
        'finished': True,
        'lines': [],
    }))
def batch_callable_bed(bam_files, output_bed_file, work_dir, genome_fasta_file, min_depth, parall_view=None):
    """ Builds a BED file of regions that are callable in most of the samples.

        Runs `_calculate` for every BAM file (through `parall_view` when given,
        otherwise through a temporary parallel view with one thread per BAM),
        then multi-intersects the results and keeps the regions reported for
        at least 80% of them (and never fewer than one).
        Returns `output_bed_file`; nothing is recomputed when `can_reuse`
        accepts the existing output.
    """
    if can_reuse(output_bed_file, bam_files):
        return output_bed_file

    work_dir = safe_mkdir(join(work_dir, 'callable_work'))

    # One argument list per BAM file, shared by both execution paths
    job_args = [[bam, work_dir, genome_fasta_file, min_depth] for bam in bam_files]
    if parall_view:
        callable_beds = parall_view.run(_calculate, job_args)
    else:
        own_cfg = ParallelCfg(threads=len(bam_files))
        with parallel_view(len(bam_files), own_cfg, work_dir) as own_view:
            callable_beds = own_view.run(_calculate, job_args)

    min_fraction = 0.8  # we want to pick those regions that have coverage at 80% of samples
    min_count = max(1, min_fraction * len(callable_beds))
    info(f'Intersecting callable regions and picking good overlaps with >={min_count} '
         f'samples ({100 * min_fraction}% of {len(callable_beds)})')

    def _has_good_overlap(region):
        # Field 4 is a comma-separated list; its length is the overlap count
        return len(region[4].split(',')) >= min_count

    with file_transaction(work_dir, output_bed_file) as tx:
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        multi = pybedtools.BedTool().multi_intersect(i=callable_beds)
        multi.filter(_has_good_overlap).saveas(tx)

    info(f'Saved to {output_bed_file}')
    return output_bed_file
def _set_date_dir(bcbio_cnf, final_dir, date_dir, create_dir=False, silent=False): if not date_dir: fc_date = bcbio_cnf.get('fc_date') fc_name = bcbio_cnf.get('fc_name') or 'project' if fc_date: # Date dirpath is from bcbio and named after fc_name, not our own project name date_dir = join(final_dir, fc_date + '_' + fc_name) if not create_dir and not verify_dir(date_dir, silent=True): critical('Error: no project directory of format {fc_date}_{fc_name} or {fc_name}_{fc_date}') else: if isdir(join(final_dir, 'project')): # bcbio-CWL? date_dir = join(final_dir, 'project') if not silent: info('Using the datestamp dir from bcbio-CWL: ' + date_dir) else: regexs = [fr'^\d\d\d\d-[01][0-9]-[0-3][0-9]_{fc_name}'] date_dirs = [join(final_dir, dirpath) for dirpath in listdir(final_dir) if any(re.match(regex, dirpath) for regex in regexs)] if len(date_dirs) == 0: raise NoDateStampsException('Error: no datestamp directory!') elif len(date_dirs) == 1: date_dir = date_dirs[0] else: dates = [(tuple(map(int, basename(d).split('_')[0].split('-'))), d) for d in date_dirs] newest_date, newest_dir = sorted(dates, reverse=True)[0] newest_dirs = [d_dir for d_dir in date_dirs if d_dir == newest_dir] if len(newest_dirs) > 1: raise MultipleDateStampsException(f'Error: multiple datestamp directory found, ' f'and can\'t select the most recent one because there are multiple latest dirs: {newest_dirs}') date_dir = newest_dirs[0] if not silent: info('Using the datestamp dir: ' + date_dir) if create_dir: safe_mkdir(date_dir) return date_dir
def get_ref_fasta(genome):
    """
    Returns the path to the FASTA file of `genome`.

    On an AZ machine (`is_az()`), a pre-installed FASTA under
    /ngs/reference_data is returned when it exists. Otherwise the genome is
    looked up among the genomepy installations in the genomes directory and
    installed from UCSC when it is not there yet.
    """
    if is_az():
        az_fasta = '/ngs/reference_data/genomes/Hsapiens/' + genome + '/seq/' + genome + '.fa'
        if isfile(az_fasta):
            logger.info('Found genome fasta at ' + az_fasta)
            return az_fasta

    # Use DATA_DIR/genomes when it already holds this genome, else its sibling
    if isdir(join(DATA_DIR, 'genomes', genome)):
        genomes_root = join(DATA_DIR, 'genomes')
    else:
        genomes_root = join(DATA_DIR, '..', 'genomes')
    genome_dir = safe_mkdir(genomes_root)

    if genome not in genomepy.list_installed_genomes(genome_dir):
        genome_rec = [rec for rec in genomepy.list_available_genomes() if rec[1] == genome]
        if not genome_rec:
            logger.critical('Error: genome ' + genome + ' is not available')
        else:
            genome_rec = genome_rec[0]

        logger.info('Downloading genome ' + genome + ' from ' + genome_rec[1] +
                    ' and installing into ' + genome_dir)
        genomepy.install_genome(genome, 'UCSC', genome_dir=genome_dir)

    return genomepy.Genome(genome, genome_dir=genome_dir).filename
def _set_date_dir(bcbio_cnf, final_dir, date_dir, create_dir=False, silent=False): if not date_dir: fc_date = bcbio_cnf.get('fc_date') fc_name = bcbio_cnf.get('fc_name') or 'project' if fc_date: # Date dirpath is from bcbio and named after fc_name, not our own project name date_dir = join(final_dir, fc_date + '_' + fc_name) if not create_dir and not verify_dir(date_dir, silent=True): critical('Error: no project directory of format {fc_date}_{fc_name} or {fc_name}_{fc_date}') else: if isdir(join(final_dir, 'project')): # bcbio-CWL? date_dir = join(final_dir, 'project') if not silent: info('Using the datestamp dir from bcbio-CWL: ' + date_dir) else: regexs = [fr'^\d\d\d\d-[01][0-9]-[0-3][0-9]_{fc_name}'] date_dirs = [join(final_dir, dirpath) for dirpath in listdir(final_dir) if any(re.match(regex, dirpath) for regex in regexs)] if len(date_dirs) == 0: raise NoDateStampsException('Error: no datestamp directory!') elif len(date_dirs) == 1: date_dir = date_dirs[0] else: dates = [(tuple(map(int, basename(d).split('_')[0].split('-'))), d) for d in date_dirs] newest_date, newest_dir = sorted(dates, reverse=True)[0] newest_dirs = [d_dir for d_dir in date_dirs if d_dir == newest_dir] if len(newest_dirs) > 1: raise MultipleDateStampsException(f'Error: multiple datestamp directory found, ' f'and can\'t select the most recent one because there are multiple latest dirs: {newest_dirs}') date_dir = newest_dirs[0] if not silent: info('Using the datestamp dir: ' + date_dir) if create_dir: safe_mkdir(date_dir) return date_dir
def _run_cmd(self, cmdl, input_paths, output_path, before_run_fn=None):
    """
    Runs the tested command line, unless only diffs were requested.

    Behaviour is controlled by `BaseTestCase.only_diff` / `BaseTestCase.reuse`
    and by environment variables: any variable whose upper-cased name contains
    both "TEST" and "DIFF" enables the diff-only mode, "TEST" and "REUSE"
    enables reuse of previous results, and the value of the first variable
    containing "TEST" and "OPTS" is appended to the command line.

    :param cmdl: command line string to execute
    :param input_paths: a path or a list of paths that must exist before the run
    :param output_path: output location; its parent directory is created
    :param before_run_fn: optional callable invoked right before the command
    """
    only_diff = BaseTestCase.only_diff or any(
        'TEST' in e.upper() and 'DIFF' in e.upper() for e in os.environ)
    reuse = BaseTestCase.reuse or any(
        'TEST' in e.upper() and 'REUSE' in e.upper() for e in os.environ)
    if only_diff:
        echo('TEST_DIFF_ONLY set: not actually running the program, '
             'only checking diffs with the previous results')
    if reuse:
        echo('TEST_REUSE set: running on top of the previous results')
    tools_opts = next((os.environ[e] for e in os.environ
                       if 'TEST' in e.upper() and 'OPTS' in e.upper()), '')

    if not only_diff:
        input_paths = [input_paths] if isinstance(input_paths, str) else input_paths
        for ip in input_paths:
            assert exists(ip), f'Data {ip} does not exist.'

        # Previous results are handed to `swap_output` unless they are to be reused
        if not reuse:
            swap_output(output_path)
        safe_mkdir(dirname(output_path))

        if before_run_fn:
            before_run_fn()
        echo('-' * 100)
        check_call(cmdl + ' ' + tools_opts)
        echo('-' * 100)
        echo('')
def set_up_dirs(proc_name, output_dir=None, work_dir=None, log_dir=None):
    """ Creates output_dir, work_dir, and sets up log

        Defaults: output_dir is `<cwd>/<proc_name>`, work_dir is
        `<output_dir>/work`, and the log goes to `<work_dir>/log`.
        Returns a tuple (output_dir, work_dir, log_fpath).
    """
    if not output_dir:
        output_dir = join(os.getcwd(), proc_name)
    output_dir = safe_mkdir(adjust_path(output_dir), 'output_dir')
    debug('Saving results into ' + output_dir)

    if not work_dir:
        work_dir = join(output_dir, 'work')
    work_dir = safe_mkdir(work_dir, 'working directory')
    info('Using work directory ' + work_dir)

    # The default log directory is created; an explicit one is used as given
    if not log_dir:
        log_dir = safe_mkdir(join(work_dir, 'log'))
    log_fpath = set_up_log(log_dir, proc_name + '.log')

    return output_dir, work_dir, log_fpath
def _genotype(run, samples, genome_build, parall_view):
    """
    Genotypes `samples` at the SNPs of `run` that `_get_snps_not_called`
    reports for them, writing VCFs under `<run work dir>/vcf`.
    Returns the result of `genotype` unchanged.
    """
    snps_file = _get_snps_not_called(run.snps_file, samples)

    vcf_dir = safe_mkdir(join(run.work_dir_path(), 'vcf'))
    work_dir = safe_mkdir(join(vcf_dir, 'work'))

    # `genotype` works on BaseSample objects carrying the name and the BAM
    base_samples = []
    for sample in samples:
        base_samples.append(BaseSample(sample.long_name(), bam=sample.bam))

    return genotype(base_samples, snps_file, parall_view,
                    work_dir=work_dir, output_dir=vcf_dir, genome_build=genome_build)
def set_final_dir(bcbio_cnf, config_dir, final_dir=None, create_dir=False):
    """
    Resolves the bcbio "final" directory.

    An explicit `final_dir` is returned as is. Otherwise the `upload: dir`
    entry of the bcbio config (relative to `config_dir`) is used, and when it
    is absent, the `final` directory next to `config_dir`. The resolved
    directory is optionally created and then verified.
    """
    if final_dir:
        return final_dir

    if 'upload' in bcbio_cnf and 'dir' in bcbio_cnf['upload']:
        upload_dirname = bcbio_cnf['upload']['dir']
        final_dir = adjust_path(join(config_dir, upload_dirname))
        if create_dir:
            safe_mkdir(final_dir)
        verify_dir(final_dir, 'upload directory specified in the bcbio config', is_critical=True)
        return final_dir

    # No upload dir in the config: fall back to <config_dir>/../final
    final_dir = abspath(join(config_dir, pardir, 'final'))
    if create_dir:
        safe_mkdir(final_dir)
    if not verify_dir(final_dir):
        critical('If final directory it is not named "final", please, specify it in the bcbio config.')
    return final_dir
def set_final_dir(bcbio_cnf, config_dir, final_dir=None, create_dir=False):
    """
    Resolves the bcbio "final" directory.

    An explicit `final_dir` is returned as is. Otherwise the `upload: dir`
    entry of the bcbio config (relative to `config_dir`) is used, and when it
    is absent, the `final` directory next to `config_dir`. The resolved
    directory is optionally created and then verified.
    """
    if final_dir:
        return final_dir

    if 'upload' in bcbio_cnf and 'dir' in bcbio_cnf['upload']:
        upload_dirname = bcbio_cnf['upload']['dir']
        final_dir = adjust_path(join(config_dir, upload_dirname))
        if create_dir:
            safe_mkdir(final_dir)
        verify_dir(final_dir, 'upload directory specified in the bcbio config', is_critical=True)
        return final_dir

    # No upload dir in the config: fall back to <config_dir>/../final
    final_dir = abspath(join(config_dir, pardir, 'final'))
    if create_dir:
        safe_mkdir(final_dir)
    if not verify_dir(final_dir):
        critical('If final directory it is not named "final", please, specify it in the bcbio config.')
    return final_dir
def set_up_dirs(proc_name, output_dir=None, work_dir=None, log_dir=None):
    """ Creates output_dir, work_dir, and sets up log

        Defaults: output_dir is `<cwd>/<proc_name>`, work_dir is
        `<output_dir>/work`, and the log goes to `<work_dir>/log`.
        Returns a tuple (output_dir, work_dir, log_fpath).
    """
    if not output_dir:
        output_dir = join(os.getcwd(), proc_name)
    output_dir = safe_mkdir(adjust_path(output_dir), 'output_dir')
    debug('Saving results into ' + output_dir)

    if not work_dir:
        work_dir = join(output_dir, 'work')
    work_dir = safe_mkdir(work_dir, 'working directory')
    info('Using work directory ' + work_dir)

    # The default log directory is created; an explicit one is used as given
    if not log_dir:
        log_dir = safe_mkdir(join(work_dir, 'log'))
    log_fpath = set_up_log(log_dir, proc_name + '.log')

    return output_dir, work_dir, log_fpath
def build_snps_panel(bcbio_projs=None, bed_files=None, output_dir=None, genome=None):
    """
    Builds `<output_dir>/snps.bed`: dbSNP records falling into the overlap of
    the BED files of all given projects and the extra `bed_files`, reduced by
    `_make_snp_file`. Returns the path of the resulting file; an existing file
    accepted by `can_reuse` is returned right away.
    """
    selected_snps_file = join(output_dir, 'snps.bed')
    if can_reuse(selected_snps_file, bed_files):
        return selected_snps_file

    work_dir = safe_mkdir(join(output_dir, 'work'))

    log.info('Intersecting BED files for projects.')
    all_bed_files = set()
    for proj in bcbio_projs or []:
        if not proj.coverage_bed:
            all_bed_files.add(proj.call)
            continue
        log.info(proj.project_name + ': selecting ' + proj.coverage_bed)
        all_bed_files.add(proj.coverage_bed)
    all_bed_files.update(bed_files or [])

    overlapped_bed = join(work_dir, 'merged_bed_files.bed')
    log.info(f'BED files: {all_bed_files}, mergin, writing {overlapped_bed}')
    overlap_bed_files(all_bed_files, overlapped_bed)

    # Selecting SNPs from dbSNP
    dbsnp_file = get_dbsnp(genome)
    dbsnp_snps_file = join(work_dir, 'snps_in_merged_bed_files.bed')
    needs_intersect = not can_reuse(dbsnp_snps_file, [dbsnp_file, overlapped_bed])
    if needs_intersect:
        call_process.run(
            f'bedtools intersect -header -a {dbsnp_file} -b {overlapped_bed}',
            dbsnp_snps_file)

    subset_bed_file = add_suffix(dbsnp_snps_file, 'subset')
    _make_snp_file(dbsnp_snps_file, genome, subset_bed_file)

    shutil.copyfile(subset_bed_file, selected_snps_file)
    return selected_snps_file
def main(input_bed, output_file,
         output_features=False, genome=None,
         only_canonical=False, short=False, extended=False, high_confidence=False,
         ambiguities_method=False, coding_only=False, collapse_exons=False,
         work_dir=False, is_debug=False):
    """ Annotating BED file based on reference features annotations.

        :param input_bed: path to the BED file to annotate
        :param output_file: path of the annotated BED file
        :param genome: genome build name; required
        :param short: mutually exclusive with `extended` and `output_features`
        :param work_dir: parent of the work directory; when not given, a
            temporary directory is created and removed at the end
        :param is_debug: passed to the logger and to `annotate`
        The remaining options are passed to `annotate` unchanged.
        :raises click.BadParameter: on a missing genome, conflicting options
            or a missing input file
    """
    logger.init(is_debug_=is_debug)

    # `param_hint` (not `param`) takes a plain name; `param` expects a click
    # Parameter object and a string there breaks the error formatting.
    if not genome:
        raise click.BadParameter('Error: please, specify genome build name with -g (e.g. `-g hg19`)',
                                 param_hint='genome')

    if short:
        if extended:
            raise click.BadParameter('--short and --extended can\'t be set both', param_hint='extended')
        if output_features:
            raise click.BadParameter('--short and --output-features can\'t be set both',
                                     param_hint='output_features')
    elif output_features or extended:
        extended = True
        short = False

    if not verify_file(input_bed):
        raise click.BadParameter(f'Usage: {__file__} Input_BED_file -g hg19 -o Annotated_BED_file [options]',
                                 param_hint='input_bed')
    input_bed = verify_file(input_bed, is_critical=True, description=f'Input BED file for {__file__}')

    # Remember whether the work directory is ours to clean up: `work_dir`
    # itself is always set after this point.
    is_temp_work_dir = not work_dir
    if work_dir:
        work_dir = join(adjust_path(work_dir), os.path.splitext(basename(input_bed))[0])
        safe_mkdir(work_dir)
        info(f'Created work directory {work_dir}')
    else:
        work_dir = mkdtemp('bed_annotate')
        debug(f'Created temporary work directory {work_dir}')

    input_bed = clean_bed(input_bed, work_dir)
    input_bed = verify_bed(input_bed, is_critical=True,
                           description=f'Input BED file for {__file__} after cleaning')

    output_file = adjust_path(output_file)

    output_file = annotate(
        input_bed, output_file, work_dir, genome=genome,
        only_canonical=only_canonical, short=short, extended=extended,
        high_confidence=high_confidence, collapse_exons=collapse_exons,
        output_features=output_features,
        ambiguities_method=ambiguities_method, coding_only=coding_only,
        is_debug=is_debug)

    if is_temp_work_dir:
        debug(f'Removing work directory {work_dir}')
        shutil.rmtree(work_dir)

    info(f'Done, saved to {output_file}')
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    """
    Determines the sample sex by comparing the coverage of chrY key regions
    with the average sample depth.

    :param work_dir: directory for intermediate files (pybedtools temp dir, male.bed)
    :param bam_fpath: path to the BAM file; required
    :param avg_depth: average depth of the sample; converted with `float`
    :param genome: genome build name, matched by substring against the keys
        of `chry_key_regions_by_genome`
    :param target_bed: optional capture target BED; when given, only the key
        regions overlapping the target are used
    :return: 'F', 'M', or None when the sex cannot be determined (no key
        regions for the genome, no overlap with the target, or the average
        depth is below AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX)
    """
    debug()
    debug('Determining sex')

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        # Restrict the key regions to the capture target
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
def parallel_view(n_samples, parallel_cfg, work_dir):
    """
    Generator that yields a parallel view created inside `work_dir` and stops
    the view once the generator is resumed or closed.

    The view is created with `work_dir` as the current directory; the
    previous current directory is restored right after, also when the
    creation fails.
    NOTE(review): written in the shape expected by `contextlib.contextmanager`;
    no decorator is visible here - confirm how it is wrapped.
    """
    prev_dir = os.getcwd()
    os.chdir(safe_mkdir(work_dir))
    try:
        view = get_parallel_view(n_samples, parallel_cfg)
    finally:
        # Never leave the process in work_dir, even if the view failed to start
        os.chdir(prev_dir)
    try:
        yield view
    finally:
        view.stop()
def parallel_view(n_samples, parallel_cfg, work_dir):
    """
    Generator that yields a parallel view created inside `work_dir` and stops
    the view once the generator is resumed or closed.

    The view is created with `work_dir` as the current directory; the
    previous current directory is restored right after, also when the
    creation fails.
    NOTE(review): written in the shape expected by `contextlib.contextmanager`;
    no decorator is visible here - confirm how it is wrapped.
    """
    prev_dir = os.getcwd()
    os.chdir(safe_mkdir(work_dir))
    try:
        view = get_parallel_view(n_samples, parallel_cfg)
    finally:
        # Never leave the process in work_dir, even if the view failed to start
        os.chdir(prev_dir)
    try:
        yield view
    finally:
        view.stop()
def reload_all_data():
    """
    Re-initialises the database and loads the hard-coded set of bcbio
    projects for the current environment: the test projects when `is_local()`,
    the /ngs/oncology projects when `is_us()`, nothing otherwise.
    """
    safe_mkdir(DATA_DIR)
    init_db()

    # (path, project name, extra keyword arguments for load_bcbio_project)
    local_projects = [
        ('tests/Dev_0261_newstyle', 'Dev_0261_newstyle', {}),
        ('tests/Dev_0261_newstyle_smallercopy', 'Dev_0261_newstyle_smallercopy', {}),
    ]
    us_projects = [
        ('/ngs/oncology/analysis/external/EXT_070_Plasma_Seq_Pilot/Resolution/bcbio/final',
         'EXT_070_Plasma_Seq_Pilot_Resolution', {'use_callable': True}),
        ('/ngs/oncology/analysis/external/EXT_070_Plasma_Seq_Pilot/Foundation/bcbio/final',
         'EXT_070_Plasma_Seq_Pilot_Foundation', {}),
        ('/ngs/oncology/analysis/external/EXT_070_Plasma_Seq_Pilot/Foundation/plasma/bcbio/final',
         'EXT_070_Plasma_Seq_Pilot_Foundation_plasma', {'use_callable': True}),
        ('/ngs/oncology/analysis/external/EXT_070_Plasma_Seq_Pilot/Foundation/tissue/OurType/bcbio_complete',
         'EXT_070_Plasma_Seq_Pilot_Foundation_tissue', {}),
        ('/ngs/oncology/analysis/external/EXT_070_Plasma_Seq_Pilot/PGDx/bcbio/final',
         'EXT_070_Plasma_Seq_Pilot_PGDx', {}),
        ('/ngs/oncology/analysis/dev/Dev_0327_MiSeq_SNP251/bcbio_preprint/final',
         'Dev_0327_MiSeq_SNP251_initial_preprint', {}),
        ('/ngs/oncology/analysis/dev/Dev_0320_HiSeq4000_PARPiResistant_Exome/bcbio/final',
         'Dev_0320_HiSeq4000_PARPiResistant_Exome', {}),
    ]

    if is_local():
        to_load = local_projects
    elif is_us():
        to_load = us_projects
    else:
        to_load = []

    for path, project_name, extra in to_load:
        load_bcbio_project(abspath(path), project_name, **extra)
def get_gender(genome, bam_fpath, bed_fpath, sample, avg_depth):
    """
    Determines the sex of `sample` when the genome has a Y chromosome
    ("Y" or "chrY") and stores its first letter, upper-cased, in
    `<sample.dirpath>/gender.txt`. Returns the value from `determine_sex`,
    or None when there is no Y chromosome.
    """
    chrom_names = [name for name, _ in ref.get_chrom_lengths(genome)]
    has_chr_y = any(y_name in chrom_names for y_name in ('Y', 'chrY'))
    if not has_chr_y:
        return None

    gender = determine_sex(sample.work_dir, bam_fpath, avg_depth, genome, bed_fpath)
    if gender:
        gender_fpath = join(safe_mkdir(sample.dirpath), 'gender.txt')
        with open(gender_fpath, 'w') as out:
            out.write(gender[0].upper())
    return gender
def _extract_qc_file(fp, new_mq_data_dir, final_dir, f_by_fp=None): """ Extracts QC file `fp` either by copying from `final_dir` (native bcbio), or from tar.gz file `tar_path` (CWL bcbio). Writes into a new file at new_mq_data_dir """ if fp.startswith('report/metrics/'): fp = fp.replace('report/metrics/', 'project/multiqc/') # for CWL _bcbio.txt files dst_fp = join(new_mq_data_dir, fp) fp_in_final = join(final_dir, fp) if isfile(fp_in_final): safe_mkdir(dirname(dst_fp)) shutil.copy2(fp_in_final, dst_fp) return dst_fp elif f_by_fp and fp in f_by_fp: safe_mkdir(dirname(dst_fp)) with open(dst_fp, 'wb') as out: out.write(f_by_fp[fp].read()) return dst_fp
def get_gender(genome, bam_fpath, bed_fpath, sample, avg_depth):
    """
    Determines the sex of `sample` when the genome has a Y chromosome
    ("Y" or "chrY") and stores its first letter, upper-cased, in
    `<sample.dirpath>/gender.txt`. Returns the value from `determine_sex`,
    or None when there is no Y chromosome.
    """
    chrom_names = [name for name, _ in ref.get_chrom_lengths(genome)]
    has_chr_y = any(y_name in chrom_names for y_name in ('Y', 'chrY'))
    if not has_chr_y:
        return None

    gender = determine_sex(sample.work_dir, bam_fpath, avg_depth, genome, bed_fpath)
    if gender:
        gender_fpath = join(safe_mkdir(sample.dirpath), 'gender.txt')
        with open(gender_fpath, 'w') as out:
            out.write(gender[0].upper())
    return gender
def phylo_tree_page(run_id):
    """
    Renders the phylogenetic tree page for a comma-separated list of projects.

    While the prank result (`<merged fasta basename>.best.dnd`) cannot be
    reused, the "processing" page is rendered instead of the tree.

    :param run_id: comma-separated project names
    """
    project_names = run_id.split(',')
    projects = [Project.query.filter_by(name=pn).first() for pn in project_names]
    # `.first()` gives None for an unknown name, so the list itself is never
    # empty; the missing projects have to be detected element-wise.
    missing = [pn for pn, p in zip(project_names, projects) if p is None]
    if missing:
        log.err('Projects ' + ', '.join(missing) + ' not found in database')
        abort(404)

    # Colors are assigned in order, cycling through PROJ_COLORS
    color_by_proj = {p.name: PROJ_COLORS[i % len(PROJ_COLORS)] for i, p in enumerate(projects)}
    work_dirpath = safe_mkdir(join(config.DATA_DIR, '_AND_'.join(project_names)))
    merged_fasta_fpath = merge_fasta(projects, work_dirpath)

    prank_out = os.path.join(work_dirpath, os.path.splitext(os.path.basename(merged_fasta_fpath))[0])
    tree_fpath = prank_out + '.best.dnd'
    if not can_reuse(tree_fpath, merged_fasta_fpath):
        return render_template(
            'processing.html',
            projects=[{
                'name': p.name,
            } for p in projects],
            run_id=run_id,
            title='Processing ' + ', '.join(project_names),
        )

    log.debug('Prank results found, rendering tree!')
    tree = next(Phylo.parse(tree_fpath, 'newick'))
    seq_by_id = read_fasta(merged_fasta_fpath)
    tree_json = tree_to_json_for_d3(tree, seq_by_id, color_by_proj, run_id=run_id)

    # The tree dimensions scale with the total number of samples
    all_samples_count = sum(len(p.samples.all()) for p in projects)
    return render_template(
        'tree.html',
        projects=[{
            'name': p.name,
            'color': color_by_proj[p.name],
        } for p in projects],
        title=', '.join(project_names),
        data=tree_json,
        tree_height=20 * all_samples_count,
        tree_width=5 * all_samples_count,
    )
def main(paths, output_dir, genome, depth):
    """
    Builds the SNPs panel from a mix of BED files and bcbio project
    directories given in `paths`; the result is written by `build_snps_panel`
    into `output_dir`. `depth` is not used here.
    """
    log.init(True)

    # Files are treated as BED files, directories as bcbio projects
    bed_files = [verify_file(p, is_critical=True) for p in paths if isfile(p)]
    project_dirs = [verify_dir(p, is_critical=True) for p in paths if isdir(p)]

    bcbio_projs = []
    for project_dir in project_dirs:
        proj = BcbioProject()
        proj.load_from_bcbio_dir(project_dir, proc_name='clearup')
        bcbio_projs.append(proj)

    build_snps_panel(bcbio_projs, bed_files, safe_mkdir(output_dir), genome)
def clean_bed(bed_fpath, work_dir):
    """
    Writes a cleaned copy of the BED file into `work_dir` and returns its path.
    Records with an empty chromosome or one starting with "#", " ", "track"
    or "browser" are dropped, as well as the records pybedtools finds invalid.
    An existing clean file accepted by `can_reuse` is returned as is.
    """
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if can_reuse(clean_fpath, bed_fpath):
        return clean_fpath

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    def _is_data_record(rec):
        # Header-like lines show up as records with a bogus chromosome name
        return rec.chrom and not rec.chrom.startswith(('#', ' ', 'track', 'browser'))

    cleaned = BedTool(bed_fpath).filter(_is_data_record).remove_invalid()
    with file_transaction(work_dir, clean_fpath) as tx_out_file:
        cleaned.saveas(tx_out_file)

    verify_bed(clean_fpath, is_critical=True)
    debug('Saved clean BED file into ' + clean_fpath)
    return clean_fpath
def clean_bed(bed_fpath, work_dir):
    """
    Writes a cleaned copy of the BED file into `work_dir` and returns its path.
    Records with an empty chromosome or one starting with "#", " ", "track"
    or "browser" are dropped, as well as the records pybedtools finds invalid.
    An existing clean file accepted by `can_reuse` is returned as is.
    """
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if can_reuse(clean_fpath, bed_fpath):
        return clean_fpath

    # pybedtools is only imported when the file actually has to be rebuilt
    import pybedtools
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    def _is_data_record(rec):
        # Header-like lines show up as records with a bogus chromosome name
        return rec.chrom and not rec.chrom.startswith(('#', ' ', 'track', 'browser'))

    cleaned = pybedtools.BedTool(bed_fpath).filter(_is_data_record).remove_invalid()
    with file_transaction(work_dir, clean_fpath) as tx_out_file:
        cleaned.saveas(tx_out_file)

    verify_bed(clean_fpath, is_critical=True)
    debug('Saved clean BED file into ' + clean_fpath)
    return clean_fpath
def _sex_from_bam(sname, bam_file, bed_file, work_dir, genome_build, avg_depth=None, snp_depths=None):
    """
    Determines the sample sex from its BAM file with `determine_sex`, working
    in `<work_dir>/<sname>`. When `avg_depth` is not given, the mean of
    `snp_depths` is used instead.
    """
    # Imported locally, as in the original implementation
    from os.path import join
    from ngs_utils.file_utils import safe_mkdir
    from ngs_utils.sex import determine_sex

    if avg_depth is None:
        if not snp_depths:
            log.critical('Error: avg_depth is NOT provided and no SNPs in sample ' + sname)
        avg_depth = sum(snp_depths) / len(snp_depths)

    sample_work_dir = safe_mkdir(join(work_dir, sname))
    return determine_sex(sample_work_dir, bam_file, avg_depth, genome_build, target_bed=bed_file)
def set_project_level_dirs(self, bcbio_cnf, config_dir, project_name=None, final_dir=None, date_dir=None,
                           create_dirs=False, proc_name='postproc'):
    """
    Sets the project-level paths on the instance: `final_dir`, `project_name`,
    `work_dir`, `date_dir`, `log_dir`, `postproc_log_dir`, `versions` and
    `programs`.

    :param bcbio_cnf: bcbio config, passed to `set_final_dir` and `_set_date_dir`
    :param config_dir: bcbio config directory, passed to `set_final_dir`
    :param project_name: passed to `_set_project_name` together with the final dir
    :param final_dir: explicit final directory, passed to `set_final_dir`
    :param date_dir: explicit datestamp directory, passed to `_set_date_dir`
    :param create_dirs: when true, create the final, work, datestamp and
        postproc log directories
    :param proc_name: name of the log subdirectory under `<date_dir>/log`
    """
    self.final_dir = self.set_final_dir(bcbio_cnf, config_dir, final_dir)
    if create_dirs:
        safe_mkdir(self.final_dir)

    self.project_name = self._set_project_name(self.final_dir, project_name)

    # The work directory is a sibling of the final directory
    self.work_dir = abspath(join(self.final_dir, pardir, 'work'))
    if create_dirs:
        safe_mkdir(self.work_dir)

    self.date_dir = self._set_date_dir(bcbio_cnf, self.final_dir, date_dir, create_dir=create_dirs,
                                       silent=self.silent)
    self.log_dir = join(self.date_dir, 'log')
    self.postproc_log_dir = join(self.log_dir, proc_name)
    if create_dirs:
        safe_mkdir(self.postproc_log_dir)

    # Looked up with silent=True: no message is logged when a file is absent
    self.versions = verify_file(join(self.date_dir, 'data_versions.txt'), silent=True)
    self.programs = verify_file(join(self.date_dir, 'programs.txt'), silent=True)
def set_project_level_dirs(self, bcbio_cnf, config_dir, project_name=None, final_dir=None, date_dir=None,
                           create_dirs=False, proc_name='postproc'):
    """
    Sets the project-level paths on the instance: `final_dir`, `project_name`,
    `work_dir`, `date_dir`, `log_dir`, `postproc_log_dir`, `var_dir`,
    `raw_var_dir`, `expression_dir`, `versions` and `programs`.

    :param bcbio_cnf: bcbio config, passed to `set_final_dir` and `_set_date_dir`
    :param config_dir: bcbio config directory, passed to `set_final_dir`
    :param project_name: passed to `_set_project_name` together with the final dir
    :param final_dir: explicit final directory, passed to `set_final_dir`
    :param date_dir: explicit datestamp directory, passed to `_set_date_dir`
    :param create_dirs: when true, create the final, work, datestamp and
        postproc log directories
    :param proc_name: name of the log subdirectory under `<date_dir>/log`
    """
    self.final_dir = self.set_final_dir(bcbio_cnf, config_dir, final_dir)
    if create_dirs:
        safe_mkdir(self.final_dir)

    self.project_name = self._set_project_name(self.final_dir, project_name)

    # The work directory is a sibling of the final directory
    self.work_dir = abspath(join(self.final_dir, pardir, 'work'))
    if create_dirs:
        safe_mkdir(self.work_dir)

    self.date_dir = self._set_date_dir(bcbio_cnf, self.final_dir, date_dir, create_dir=create_dirs,
                                       silent=self.silent)
    self.log_dir = join(self.date_dir, 'log')
    self.postproc_log_dir = join(self.log_dir, proc_name)
    if create_dirs:
        safe_mkdir(self.postproc_log_dir)

    # Subdirectory names come from the BcbioProject class attributes
    self.var_dir = join(self.date_dir, BcbioProject.var_dir)
    self.raw_var_dir = join(self.var_dir, 'raw')
    self.expression_dir = join(self.date_dir, BcbioProject.expression_dir)

    # Looked up with silent=True: no message is logged when a file is absent
    self.versions = verify_file(join(self.date_dir, 'data_versions.txt'), silent=True)
    self.programs = verify_file(join(self.date_dir, 'programs.txt'), silent=True)
def main(ctx, subdirs, output_dir, threads=1, fp_size=Params.L, isdebug=False):
    """ Generates a PNG image with a relatedness heatmap.

        Fingerprints are calculated for every VCF of the datasets found in
        `subdirs` (after slicing the VCFs to the BED regions, when datasets
        have BED files), compared pairwise, and plotted into the run
        directory under `output_dir`.
    """
    if not subdirs:
        ctx.fail('Provide at least on input directory.')

    datasets = _load_datasets(subdirs)

    # The plot title lists the datasets and the non-default parameters
    title_parts = [', '.join(d.name for d in datasets), '\nL=', str(fp_size), '']
    if not Params.NORMALIZE_VAR:
        title_parts.append(', not norm by var')
    if not Params.NORMALIZE_DIST:
        title_parts.append(', not norm by dist')
    if Params.SKIP_DAMAGE:
        title_parts.append(', skipped damage')
    if Params.SKIP_REJECT:
        title_parts.append(', skipped REJECT')
    if Params.SKIP_NOCALL:
        title_parts.append(', skipped num called = 0')
    if Params.MIN_AF:
        title_parts.append(', min AF=' + str(Params.MIN_AF))
    if Params.MIN_DIST:
        title_parts.append(', min dist=' + str(Params.MIN_DIST))
    title_parts.append(', used SNP pairs between regions' if Params.INTERREGION_PAIRS
                       else ', skiped SNP pairs between regions')
    title = ''.join(title_parts)

    run_id = '__'.join(d.name for d in datasets)
    runs_root = output_dir or join(code_dir, 'runs')
    run_dir = safe_mkdir(join(runs_root, run_id))
    log.init(isdebug, join(run_dir, 'log.txt'), save_previous=True)
    work_dir = safe_mkdir(join(run_dir, 'work'))

    all_vcf_by_label = dict()
    bed_files_by_genome = defaultdict(set)
    genome_by_label = dict()
    for dataset in datasets:
        all_vcf_by_label.update(dataset.vcf_by_label)
        if dataset.bed_file:  # d.bed_file=None for WGS
            bed_files_by_genome[dataset.genome].add(dataset.bed_file)
    for dataset in datasets:
        for label in dataset.vcf_by_label:
            genome_by_label[label] = dataset.genome

    parallel_cfg = ParallelCfg(threads=threads)
    log.info(f'Starting using {parallel_cfg.threads} threads')

    with parallel_view(len(all_vcf_by_label), parallel_cfg, work_dir) as parall_view:
        overlap_bed_file_by_genome = dict()
        if bed_files_by_genome:
            overlap_bed_file_by_genome = _prep_bed(work_dir, bed_files_by_genome, overlap_bed_file_by_genome)
            log.info('Slicing VCFs to regions in BED files')
            slice_jobs = [
                [work_dir, label, vcf, overlap_bed_file_by_genome.get(genome_by_label[label])]
                for label, vcf in all_vcf_by_label.items()
            ]
            all_vcf_by_label = dict(parall_view.run(_slice_vcf_fn, slice_jobs))
            log.info()

        log.info('Calculating fingerprints for individual samples')
        fingerprint_jobs = [
            [vcf, work_dir, label, fp_size, overlap_bed_file_by_genome.get(genome_by_label[label])]
            for label, vcf in all_vcf_by_label.items()
        ]
        print_label_pairs = dict(parall_view.run(make_fingerprint, fingerprint_jobs))
        log.info()

    log.info('Comparing fingerprints pairwise')
    pairwise_dict = defaultdict(dict)
    labelled_prints = print_label_pairs.items()
    for (label1, print1), (label2, print2) in it.combinations_with_replacement(labelled_prints, 2):
        dist, pvalue = compare(print1, print2)
        if not dist:
            log.info(f' {label1} VS {label2}: failed to calculate')
            dist = float('NaN')
        else:
            log.info(f' {label1} VS {label2}: {dist:.2f}, Pvalue={pvalue:.2f}')
        # The matrix is symmetric
        pairwise_dict[label1][label2] = dist
        pairwise_dict[label2][label1] = dist

    log.info('Plotting comparison heatmap')
    plot_heatmap(pairwise_dict, run_dir, title)
from pybedtools import BedTool
from ngs_utils.Sample import BaseSample
from ngs_utils.file_utils import safe_mkdir, can_reuse, file_transaction, verify_file
from ngs_utils.parallel import ParallelCfg, parallel_view
from ngs_utils import logger as log
from clearup.panel import build_snps_panel
from clearup.genotype import genotype, build_tree, build_snp_from_records
from clearup.utils import FASTA_ID_PROJECT_SEPARATOR, load_bam_file
from clearup import app, db, DATA_DIR, parallel_cfg

# import logging
# logging.basicConfig()
# logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

# Import-time side effect: the data directory is created when this module loads
safe_mkdir(DATA_DIR)


# Association table linking `run.id` to `project.name`
run_to_project_assoc_table = db.Table(
    'run_to_project_association', db.Model.metadata,
    db.Column('run_id', db.Integer, db.ForeignKey('run.id')),
    db.Column('project_name', db.String, db.ForeignKey('project.name')))


class Location(db.Model):
    """ Database model stored in the `location` table. """
    __tablename__ = 'location'
    id = db.Column(db.Integer, primary_key=True)  # primary key
    rsid = db.Column(db.String)  # presumably the dbSNP rs identifier - TODO confirm
    chrom = db.Column(db.String)
    pos = db.Column(db.Integer)
    gene = db.Column(db.String)
    ref = db.Column(db.String)  # presumably the reference allele - TODO confirm
def init_db():
    """
    Creates the data directory, binds the database to the app and recreates
    all tables from scratch.
    """
    safe_mkdir(DATA_DIR)
    db.init_app(app)
    # Destructive: existing tables are dropped before being created again
    db.drop_all()
    db.create_all()
def run_snakemake(snakefile, conf, jobs=None, output_dir=None, forcerun=None,
                  unlock=False, dryrun=False, target_rules=None, cluster=None,
                  cluster_cmd=None, log_dir=None, dag=None, report=None,
                  restart_times=None, cores=None):
    """ Writes `conf` to a YAML file, builds a snakemake command line and runs it.

    :param snakefile: path passed to `--snakefile`.
    :param conf: config dict; `conf['total_cores']` is set here (the dict is
        modified in place) and the dict is dumped to YAML for `--configfile`.
    :param jobs: value for `-j`.
    :param output_dir: value for `--directory`.
    :param forcerun: comma-separated rule names for `--forcerun`.
    :param unlock: run the same command with `--unlock` before the real run.
    :param dryrun: add `--dryrun`.
    :param target_rules: target rule(s), flattened and put right after `snakemake`.
    :param cluster: submit jobs to a cluster using the auto-generated command.
    :param cluster_cmd: explicit cluster submission command; wins over `cluster`.
    :param log_dir: where `.conf.yaml` and cluster logs go; required for
        cluster runs. Without it the config goes to a temporary file.
    :param dag: add `--dag`.
    :param report: value for `--report`.
    :param restart_times: value for `--restart-times`.
    :param cores: alias for `jobs`, used only when `jobs` is not given.

    Exits the process with status 1 if snakemake fails or is interrupted.
    """
    if jobs is None:
        jobs = cores
    conf['total_cores'] = jobs

    #########################
    #### Setting cluster ####
    #########################
    cluster_param = ''
    cluster_log_dir = ''
    if cluster or cluster_cmd:
        assert log_dir, 'For cluster run, must also specify log_dir'
        if cluster_cmd:
            cluster_param = f' --cluster "{cluster_cmd}"'
        else:
            cluster_log_dir = safe_mkdir(join(log_dir, 'cluster'))
            # make_cluster_cmdl takes (log_dir, refdata, app_name); the same
            # reference_data API module it imports itself is passed as refdata.
            from reference_data import api as refdata
            cluster_param = make_cluster_cmdl(cluster_log_dir, refdata, 'umccrise')

    ##########################
    #### Preparing config ####
    ##########################
    if log_dir:
        safe_mkdir(log_dir)
        conf_f = open(join(log_dir, '.conf.yaml'), 'w')
    else:
        # delete=False: snakemake reads the file by name after we close it.
        conf_f = tempfile.NamedTemporaryFile(mode='wt', delete=False)
    with conf_f:  # closes the handle even if dumping fails
        yaml.dump(conf, conf_f)

    ###############################
    #### Building command line ####
    ###############################
    if forcerun:
        forcerun = " ".join(forcerun.split(','))

    cmd = (
        f'snakemake '
        f'{" ".join(flatten([target_rules])) if target_rules else ""} '
        f'--snakefile {snakefile} '
        f'--printshellcmds '
        f'{"--dryrun " if dryrun else ""}'
        f'{"--dag " if dag else ""}'
        f'{f"--report {report} " if report else ""}'
        f'{f"--directory {output_dir} " if output_dir else ""}'
        f'{f"-j {jobs} " if jobs else ""}'
        f'--rerun-incomplete '
        f'{f"--restart-times {restart_times} " if restart_times else ""}'
        f'{cluster_param} '
        f'--configfile {conf_f.name} '
        f'{f"--forcerun {forcerun}" if forcerun else ""}'
    )

    def _open_cluster_logs(report_fn=None):
        # Makes cluster job logs world-readable and optionally points the user to them.
        if cluster_log_dir:
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
            if report_fn is not None:
                report_fn(f'Review cluster job logs in {cluster_log_dir}')

    #################
    #### Running ####
    #################
    if unlock:
        print('* Unlocking previous run... *')
        run_simple(cmd + ' --unlock')
        print('* Now rerunning *')
    try:
        run_simple(cmd)
    except subprocess.CalledProcessError:
        logger.error('--------')
        logger.error(f'Error: snakemake returned a non-zero status. Working directory: {output_dir}')
        _open_cluster_logs(logger.error)
        sys.exit(1)
    except KeyboardInterrupt:
        logger.error('--------')
        logger.error(f'Interrupted. Fixing logs permissions. Working directory: {output_dir}')
        _open_cluster_logs(logger.error)
        sys.exit(1)
    else:
        logger.info('--------')
        _open_cluster_logs()
        logger.info(f'Finished. Output directory: {output_dir}')
def get_total_bed_size(bed_fpath, work_dir=None):
    """ Returns the summed length of the regions of a BED file after merging.

    The file is merged first, so overlapping regions are not counted twice.
    If `work_dir` is given, pybedtools temporary files are placed in
    `<work_dir>/pybedtools_tmp`.
    """
    if work_dir:
        tmp_dir = safe_mkdir(join(work_dir, 'pybedtools_tmp'))
        pybedtools.set_tempdir(tmp_dir)

    merged_regions = BedTool(bed_fpath).merge()
    total_size = 0
    for region in merged_regions:
        total_size += len(region)
    return total_size
def _annotate(bed, ref_bed, chr_order, fai_fpath, work_dir, ori_col_num,
              high_confidence=False, reannotate=False, is_debug=False, **kwargs):
    """ Intersects `bed` with `ref_bed` and groups the overlaps per input region.

    :param bed: BedTool with the regions to annotate.
    :param ref_bed: BedTool with the reference features.
    :param chr_order: chromosome order, passed on to `_resolve_ambiguities`.
    :param fai_fpath: path to a genome index; used as the `g` argument of the
        intersection only when it has exactly 2 columns.
    :param work_dir: directory for pybedtools temporary files and, in debug
        mode, for the cached `intersection.bed`.
    :param ori_col_num: number of columns in the original input BED.
    :param high_confidence: take the gene name from the HUGO column instead
        of the GENE column.
    :param reannotate: when False, the first extra column of the input is
        treated as the gene name and overlaps with other genes are dropped.
    :param is_debug: load/save the intersection from/to `intersection.bed`.
    :param kwargs: passed on to `_resolve_ambiguities`.
    :return: whatever `_resolve_ambiguities` returns for the collected overlaps.
    """
    # if genome:
    #     genome_fpath = cut(fai_fpath, 2, output_fpath=intermediate_fname(work_dir, fai_fpath, 'cut2'))
    #     intersection = bed.intersect(ref_bed, sorted=True, wao=True, g='<(cut -f1,2 ' + fai_fpath + ')')
    #     intersection = bed.intersect(ref_bed, sorted=True, wao=True, genome=genome.split('-')[0])
    # else:

    intersection_bed = None
    intersection_fpath = None

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    if is_debug:
        # In debug mode a previously saved intersection is reused if present.
        intersection_fpath = join(work_dir, 'intersection.bed')
        if isfile(intersection_fpath):
            info('Loading from ' + intersection_fpath)
            intersection_bed = BedTool(intersection_fpath)
    if not intersection_bed:
        if count_bed_cols(fai_fpath) == 2:
            debug('Fai fields size is 2 ' + fai_fpath)
            intersection_bed = bed.intersect(ref_bed, wao=True, g=fai_fpath)
        else:
            debug('Fai fields is ' + str(count_bed_cols(fai_fpath)) + ', not 2')
            intersection_bed = bed.intersect(ref_bed, wao=True)
    if is_debug and not isfile(intersection_fpath):
        intersection_bed.saveas(intersection_fpath)
        debug('Saved intersection to ' + intersection_fpath)

    total_annotated = 0
    total_uniq_annotated = 0
    total_off_target = 0

    # Coordinates of regions that already got at least one overlap.
    met = set()

    # region -> gene -> transcript id -> list of (overlap fields, overlap size)
    overlaps_by_tx_by_gene_by_loc = OrderedDefaultDict(lambda: OrderedDefaultDict(lambda: defaultdict(list)))
    # off_targets = list()

    # Original columns + reference columns (without the last 4) + overlap size column.
    expected_fields_num = ori_col_num + len(ba.BedCols.cols[:-4]) + 1
    for i, intersection_fields in enumerate(intersection_bed):
        inters_fields_list = list(intersection_fields)
        if len(inters_fields_list) < expected_fields_num:
            critical(
                f'Cannot parse the reference BED file - unexpected number of lines '
                f'({len(inters_fields_list)} in {inters_fields_list} (less than {expected_fields_num})')

        # Leading columns come from the input region, the rest from the reference feature.
        a_chr, a_start, a_end = intersection_fields[:3]
        a_extra_columns = intersection_fields[3:ori_col_num]

        # Pad the reference part with None up to the full reference column count.
        overlap_fields = [None for _ in ba.BedCols.cols]
        overlap_fields[:len(intersection_fields[ori_col_num:])] = intersection_fields[ori_col_num:]
        keep_gene_column = not reannotate
        a_gene = None
        if keep_gene_column:
            a_gene = a_extra_columns[0]

        e_chr = overlap_fields[0]
        # The last column of the intersection is the overlap size.
        overlap_size = int(intersection_fields[-1])
        assert e_chr == '.' or a_chr == e_chr, f'Error on line {i}: chromosomes don\'t match ({a_chr} vs {e_chr}). Line: {intersection_fields}'

        # fs = [None for _ in ebl.BedCols.cols]
        # fs[:3] = [a_chr, a_start, a_end]
        reg = (a_chr, int(a_start), int(a_end), tuple(a_extra_columns))

        if e_chr == '.':
            # '.' in the reference chromosome column: the region overlaps no feature.
            total_off_target += 1
            # off_targets.append(fs)
            overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)

        else:
            # fs[3:-1] = db_feature_fields[3:-1]
            total_annotated += 1
            if (a_chr, a_start, a_end) not in met:
                total_uniq_annotated += 1
                met.add((a_chr, a_start, a_end))

            e_gene = overlap_fields[ba.BedCols.GENE] if not high_confidence else overlap_fields[ba.BedCols.HUGO]
            if keep_gene_column and e_gene != a_gene:
                # Keeping the original gene: an overlap with a different gene only
                # registers the region under its original gene, without transcripts.
                overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
            else:
                transcript_id = overlap_fields[ba.BedCols.ENSEMBL_ID]
                overlaps_by_tx_by_gene_by_loc[reg][e_gene][transcript_id].append((overlap_fields, overlap_size))

    info(' Total annotated regions: ' + str(total_annotated))
    info(' Total unique annotated regions: ' + str(total_uniq_annotated))
    info(' Total off target regions: ' + str(total_off_target))
    info('Resolving ambiguities...')
    annotated = _resolve_ambiguities(overlaps_by_tx_by_gene_by_loc, chr_order, **kwargs)

    return annotated
def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False, only_canonical=False,
             coding_only=False, short=False, extended=False, is_debug=False, **kwargs):
    """ Annotates a BED file with reference features and writes the result.

    :param input_bed_fpath: BED file to annotate; it is sorted before use.
    :param output_fpath: path of the annotated output file.
    :param work_dir: directory for intermediate and temporary files.
    :param genome: genome build used to fetch features, index and chromosome order.
    :param reannotate: ignore an existing gene column; forced on for 3-column input.
    :param high_confidence: keep only features passing `ba.high_confidence_filter`.
    :param only_canonical: keep only canonical transcripts.
    :param coding_only: keep only features passing `ba.protein_coding_filter`.
    :param short: write only the first 4 columns.
    :param extended: write all columns but the last one, plus explanatory
        `##` header lines; takes precedence over `short`.
    :param is_debug: save the intermediate BED files into `work_dir`.
    :param kwargs: passed on to `_annotate`.
    :return: `output_fpath`.
    """
    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        # f-string: must not fail with a TypeError when genome is None.
        critical(f'Genome {genome} is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir, chr_order=chr_order, genome=genome)

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    ori_bed = BedTool(input_bed_fpath)
    ori_col_num = ori_bed.field_count()
    # A 3-column BED has no gene column to keep, so it is always re-annotated.
    reannotate = reannotate or ori_col_num == 3

    if high_confidence:
        features_bed = features_bed.filter(ba.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ba.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(
        lambda x: x[ba.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])
    # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    # NOTE(review): high_confidence is passed as a literal False here rather than
    # forwarding the argument; the features are already filtered above. Confirm
    # this is intended.
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir, ori_col_num,
                          high_confidence=False, reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ba.BedCols.names[i] for i in ba.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    def _select_columns(values):
        # Picks the output columns; the same rule applies to the header and to every row.
        selected = values[:6]
        if short:
            selected = values[:4]
        if extended:
            selected = values[:-1]
        if add_ori_extra_fields:
            selected.append(values[-1])
        return selected

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            header = _select_columns(full_header)
            if extended:
                out.write('## ' + ba.BedCols.names[ba.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ba.BedCols.names[ba.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ba.BedCols.names[ba.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
                out.write('\t'.join(header) + '\n')
            for full_fields in annotated:
                fields = _select_columns(full_fields)
                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1

    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath
def create(projects, parall_view=None):
    """ Creates a run for `projects`: builds the SNP panel, genotypes samples, stores results.

    Steps, in order: register the run, build the SNP panel from the project
    BED files, store the panel locations, genotype the samples at the SNPs not
    called yet, load the called SNPs into the DB, attach the locations to the
    run, build the tree, and slice the BAMs to the panel.

    :param projects: projects to include; they must share one genome build.
    :param parall_view: existing parallel view to reuse. When not given,
        temporary views are opened using the module-level `parallel_cfg`.
    :return: the created run.
    """
    run = Run()
    db.session.add(run)
    for p in projects:
        run.projects.append(p)
    db.session.commit()

    genomes = [p.genome for p in projects]
    if len(set(genomes)) > 1:
        log.critical('Error: multiple genomes in projects: ' + str(genomes))
    genome_build = genomes[0]

    snps_dir = safe_mkdir(join(run.work_dir_path(), 'snps'))
    run.snps_file = build_snps_panel(
        bed_files=[p.bed_fpath for p in projects if p.bed_fpath],
        output_dir=snps_dir, genome=genome_build)
    locations = extract_locations_from_file(run.snps_file)
    for loc in locations:
        db.session.add(loc)
    db.session.commit()

    log.info()
    log.info('Genotyping')
    samples = [s for p in projects for s in p.samples]
    snps_left_to_call_file = _get_snps_not_called(run.snps_file, samples)

    vcf_dir = safe_mkdir(join(run.work_dir_path(), 'vcf'))
    work_dir = safe_mkdir(join(vcf_dir, 'work'))
    bs = [BaseSample(s.long_name(), bam=s.bam) for s in samples]

    if parall_view:
        vcf_by_sample = genotype(bs, snps_left_to_call_file, parall_view,
                                 work_dir=work_dir, output_dir=vcf_dir, genome_build=genome_build)
    else:
        n_threads = parallel_cfg.threads
        if len(samples) < n_threads:  # vardict is running in 1 thread
            parallel_cfg.threads = len(samples)
        try:
            with parallel_view(len(samples), parallel_cfg,
                               safe_mkdir(join(run.work_dir_path(), 'log'))) as view:
                vcf_by_sample = genotype(bs, snps_left_to_call_file, view,
                                         work_dir=work_dir, output_dir=vcf_dir, genome_build=genome_build)
        finally:
            # parallel_cfg is shared module state: restore it even if genotyping fails.
            parallel_cfg.threads = n_threads

    # TODO: speed this up
    log.info('Loading called SNPs into the DB')
    for s in samples:
        recs_by_rsid = defaultdict(list)
        for r in VCF(vcf_by_sample[s.long_name()]):
            recs_by_rsid[r.ID].append(r)
        for loc in locations:
            assert loc
            snp = s.snps.filter(SNP.rsid == loc.rsid).first()
            if not snp:
                # Only SNPs the sample does not have yet are built from the new calls.
                snp = SNP(loc)
                build_snp_from_records(snp, recs_by_rsid[loc.rsid], s.project.min_depth)
                s.snps.append(snp)
                db.session.add(snp)

    log.info('Adding locations into the DB')
    run.locations.delete()
    for l in locations:
        run.locations.append(l)
    db.session.add(run)
    db.session.commit()
    log.info('Saved locations in the DB')

    log.info()
    log.info('Building tree')
    build_tree(run)

    log.info()
    log.info('Loading BAMs sliced to fingerprints')
    bams_dir = safe_mkdir(join(run.work_dir_path(), 'bams'))
    bam_jobs = [[s.bam, bams_dir, run.snps_file, s.long_name()] for s in samples]
    if parall_view:
        parall_view.run(load_bam_file, bam_jobs)
    else:
        with parallel_view(len(samples), parallel_cfg,
                           safe_mkdir(join(run.work_dir_path(), 'log'))) as view:
            view.run(load_bam_file, bam_jobs)

    return run
def main(output_dir=None, tumor_bam=None, normal_bam=None, normal_name=None, tumor_name=None,
         genome=None, input_genomes_url=None, ref_fa=None, viruses_fa=None,
         repeat_masker_bed=None, breakend_pon=None, bp_pon=None, bp_hotspots=None,
         min_tumor_af=None, requested_cores=None, unlock=False, dryrun=False,
         maxcoverage=None, chunksize_mil=None, jvm_heap=None, externalaligner=None):
    """ Prepares the config for the GRIDSS Snakefile and runs it with snakemake.

    :param output_dir: results directory (default `gridss_results`); logs go to `<output_dir>/log`.
    :param tumor_bam: tumor BAM (required); `tumor_name` defaults to its base name
        without the `-ready` / `-sorted` suffixes.
    :param normal_bam: optional normal BAM; `normal_name` is derived the same way.
    :param genome: genome build name stored in the config.
    :param input_genomes_url: location passed to `reference_data` to find the genomes dir.
    :param ref_fa: reference FASTA; its `.fai` index (and `.bwt`, unless the
        external aligner is minimap2) must exist.
    :param viruses_fa: viral FASTA; its `.fai`, `.dict`, `.img` (and `.bwt`, unless
        minimap2) must exist.
    :param requested_cores: upper bound on the cores to use; the number is also
        capped by the machine and by 8.
    :param unlock: passed to `run_snakemake`.
    :param dryrun: passed to `run_snakemake`.

    The remaining arguments are copied into the config as-is when set.
    """
    conf = {}

    output_dir = output_dir or 'gridss_results'
    output_dir = safe_mkdir(abspath(output_dir))
    log_dir = safe_mkdir(join(output_dir, 'log'))
    logger.init(log_fpath_=join(log_dir, 'gridss.log'), save_previous=True)
    # Remove the completion marker of a previous run.
    if isfile(join(output_dir, 'work', 'all.done')):
        run_simple('rm ' + join(output_dir, 'work', 'all.done'))
    conf['output_dir'] = adjust_path(output_dir)

    tumor_name = tumor_name or splitext_plus(basename(tumor_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    if normal_bam:
        normal_name = normal_name or splitext_plus(basename(normal_bam))[0]\
            .replace('-ready', '').replace('-sorted', '')
        # No trailing comma here: the config value must be a path, not a 1-tuple.
        conf['normal_bam'] = verify_file(normal_bam, 'Normal BAM, -N option')
        conf['normal_name'] = normal_name
    conf['tumor_bam'] = verify_file(tumor_bam, 'Tumor BAM, -T option')
    conf['tumor_name'] = tumor_name

    try:
        machine_cores = len(os.sched_getaffinity(0))
    except (AttributeError, OSError):
        # sched_getaffinity is not available on every platform.
        machine_cores = 1
    cores = min(machine_cores, 8)
    if requested_cores:
        cores = min(cores, requested_cores)
    conf['cores'] = cores

    if maxcoverage:
        conf['maxcoverage'] = maxcoverage
    if chunksize_mil:
        conf['chunksize_mil'] = chunksize_mil
    if jvm_heap:
        conf['jvm_heap'] = jvm_heap
    if externalaligner:
        conf['externalaligner'] = externalaligner

    conf['genome'] = genome
    try:
        from reference_data import api as refdata
    except ImportError:
        # reference_data is optional; explicit FASTA paths can be used instead.
        pass
    else:
        # check reference_data can find the genomes dir, and error out if not
        genomes_dir = refdata.find_genomes_dir(input_genomes_url)
        if genomes_dir:
            conf['genomes_dir'] = genomes_dir

    needs_bwa_index = externalaligner != 'minimap2'

    if ref_fa:
        if needs_bwa_index and not verify_file(ref_fa + '.bwt'):
            logger.critical(f'Please, index {ref_fa} using'
                            f' bwa index {ref_fa}')
        if not verify_file(ref_fa + '.fai'):
            logger.critical(f'Please, index {ref_fa} using'
                            f' samtools faidx {ref_fa}')
        conf['ref_fa'] = ref_fa

    if viruses_fa:
        if needs_bwa_index and not verify_file(viruses_fa + '.bwt'):
            logger.critical(f'Please, index {viruses_fa} using: '
                            f' bwa index {viruses_fa}')
        if not verify_file(viruses_fa + '.fai'):
            logger.critical(f'Please, index {viruses_fa} using '
                            f' samtools faidx {viruses_fa}')
        dict_file = viruses_fa.replace('.fa', '.dict')
        if not verify_file(dict_file):
            logger.critical(f'Please, index {viruses_fa} using: '
                            f' samtools dict {viruses_fa} -o {dict_file}')
        img_file = viruses_fa + '.img'
        if not verify_file(img_file):
            logger.critical(
                f'Please, create an img file for {viruses_fa} using:\n'
                f' gatk BwaMemIndexImageCreator -I {viruses_fa} -O {img_file}'
            )
        conf['viruses_fa'] = verify_file(viruses_fa)

    if repeat_masker_bed:
        conf['repeat_masker_bed'] = repeat_masker_bed
    if breakend_pon:
        conf['breakend_pon'] = breakend_pon
    if bp_pon:
        conf['bp_pon'] = bp_pon
    if bp_hotspots:
        conf['bp_hotspots'] = bp_hotspots
    if min_tumor_af:
        conf['min_tumor_af'] = min_tumor_af

    # Look for the GRIDSS JAR in the current conda env first, then in the "hmf" env.
    py_path = sys.executable  # e.g. /miniconda/envs/umccrise_hmf/bin/python
    env_path = dirname(dirname(py_path))  # e.g. /miniconda/envs/umccrise_hmf
    # NOTE(review): when the JAR is in the current env, that env is recorded as
    # gridss_env (previously the name was unbound on this path); confirm the
    # Snakefile expects this.
    gridss_env = env_path
    found = glob.glob(join(env_path, 'share/gridss-*/gridss.jar'))
    if not found:
        gridss_env = secondary_conda_env('hmf', is_critical=False)
        if gridss_env:
            found = glob.glob(join(gridss_env, 'share/gridss-*/gridss.jar'))
    if not found:
        critical(
            'Cannot find gridss JAR. Make sure you ran `conda install -c bioconda gridss`'
        )
    conf['gridss_env'] = gridss_env
    conf['gridss_jar'] = found[0]

    # NOTE(review): the run_snakemake shown in this file names its job-count
    # parameter `jobs`; confirm that `cores` is accepted by the version in scope.
    run_snakemake(join(package_path(), 'gridss', 'Snakefile'), conf, cores=cores,
                  output_dir=output_dir, unlock=unlock, dryrun=dryrun)
def main(output_dir=None, normal_bam=None, tumor_bam=None, snv_vcf=None,
         normal_name=None, tumor_name=None, sample=None, genome=None,
         genomes_dir=None, gridss_ref_dir=None, ref_fa=None, threads=None, jvmheap=None):
    """ Runs the gridss-purple-linx.sh pipeline for a tumor/normal pair.

    :param output_dir: output directory (default `gridss`); the log goes to `<output_dir>/gridss.log`.
    :param normal_bam: normal BAM; `normal_name` defaults to its base name
        without the `-ready` / `-sorted` suffixes.
    :param tumor_bam: tumor BAM; `tumor_name` is derived the same way.
    :param snv_vcf: VCF passed to the script as `--snvvcf`.
    :param sample: sample name, defaults to `tumor_name`.
    :param genome: genome build; only `GRCh37` is accepted.
    :param genomes_dir: optional location passed to `refdata.find_genomes_dir`.
    :param gridss_ref_dir: reference dir for the script; looked up through `refdata` when not given.
    :param ref_fa: reference FASTA; looked up through `refdata` when not given.
    :param threads: exported to the script as `THREADS`.
    :param jvmheap: optional value for `--jvmheap`.
    :raises subprocess.SubprocessError: re-raised if the pipeline command fails.
    """
    gridss_linx_dir = abspath(join(package_path(), '..', 'gridss-purple-linx'))
    gridss_scripts_dir = abspath(join(package_path(), '..', 'gridss/scripts'))

    normal_name = normal_name or splitext_plus(basename(normal_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    tumor_name = tumor_name or splitext_plus(basename(tumor_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    sample = sample or tumor_name

    output_dir = safe_mkdir(abspath(output_dir or 'gridss'))
    logger.init(log_fpath_=join(output_dir, 'gridss.log'), save_previous=True)
    output_vcf = join(output_dir, f'{sample}-gridss-purple-linx.vcf')

    assert genome == 'GRCh37', 'Only GRCh37 is supported for GRIDSS yet'

    if genomes_dir:
        refdata.find_genomes_dir(genomes_dir)
    if not gridss_ref_dir:
        gridss_ref_dir = refdata.get_ref_file(genome, 'gridss_purple_linx_dir')
    if not ref_fa:
        # The lookup goes through refdata; ref_fa itself is empty on this path.
        ref_fa = refdata.get_ref_file(genome, 'fa')

    hmf_env_path = conda_utils.secondary_conda_env('hmf')

    def _first_jar(pattern):
        # First JAR matching `pattern` inside the hmf env; IndexError if none matches.
        return glob.glob(join(hmf_env_path, pattern))[0]

    gridss_jar = _first_jar('share/gridss-*/gridss.jar')
    amber_jar = _first_jar('share/hmftools-amber-*/amber.jar')
    cobalt_jar = _first_jar('share/hmftools-cobalt-*/cobalt.jar')
    purple_jar = _first_jar('share/hmftools-purple-*/purple.jar')
    linx_jar = _first_jar('share/hmftools-linx-*/sv-linx.jar')

    # Backslash-newline inside the literal is a continuation, so this is one shell line.
    cmd = f"""
PATH={hmf_env_path}/bin:$PATH \
THREADS={threads} \
GRIDSS_JAR={gridss_jar} \
AMBER_JAR={amber_jar} \
COBALT_JAR={cobalt_jar} \
PURPLE_JAR={purple_jar} \
LINX_JAR={linx_jar} \
bash -x {join(gridss_linx_dir, 'gridss-purple-linx.sh')} \
-n {normal_bam} \
-t {tumor_bam} \
-v {output_vcf} \
-s {sample} \
--normal_sample {normal_name} \
--tumour_sample {tumor_name} \
--snvvcf {snv_vcf} \
--ref_dir {gridss_ref_dir} \
--install_dir {gridss_scripts_dir} \
--reference {ref_fa} \
--output_dir {output_dir} \
{f"--jvmheap {jvmheap}" if jvmheap else ""}
""".strip()

    try:
        run_simple(cmd)
    except subprocess.SubprocessError:
        err('--------\n')
        err(f'Error running GRIDSS-PURPLE-LINX.\n')
        raise
def _add_project(bam_by_sample, project_name, bed_file=None, use_callable=False,
                 data_dir='', genome='hg19', min_depth=DEPTH_CUTOFF,
                 depth_by_sample=None, reuse_files=False):
    """ Registers a project and its samples in the DB, replacing a same-named project.

    :param bam_by_sample: mapping of sample name to BAM file.
    :param project_name: project name; an existing project with this name is deleted first.
    :param bed_file: BED file of the project.
    :param use_callable: compute callable regions from the BAMs and use them as the project BED.
    :param data_dir: stored on the project.
    :param genome: genome build name.
    :param min_depth: minimal depth stored on the project and used for callable regions.
    :param depth_by_sample: optional per-sample depth, used only by sex genotyping.
    :param reuse_files: passed to the deletion of the existing project.
    """
    existing = Project.query.filter(Project.name == project_name).first()
    if existing:
        existing.delete(reuse_files=reuse_files)

    fp_proj = Project(
        name=project_name,
        data_dir=data_dir,
        genome=genome,
        bed_fpath=bed_file,
        min_depth=min_depth,
        used_callable=use_callable,
    )
    db.session.add(fp_proj)

    db_samples = [Sample(sname, fp_proj, bam_file)
                  for sname, bam_file in bam_by_sample.items()]
    db.session.add_all(db_samples)

    work_dir = safe_mkdir(fp_proj.get_work_dir())

    # Optional steps, currently switched off in the code.
    do_ngb = False
    do_sex = False
    do_create_run = False

    needs_parallel_view = do_ngb or do_sex or do_create_run or use_callable
    if needs_parallel_view:
        with parallel_view(len(bam_by_sample), parallel_cfg, work_dir) as p_view:
            if use_callable:
                log.info(f'Calculating callable regions for {project_name}.')
                genome_fasta_file = get_ref_fasta(genome)
                callable_bed = join(work_dir, 'callable_regions.bed')
                fp_proj.bed_fpath = batch_callable_bed(
                    bam_by_sample.values(), callable_bed, work_dir,
                    genome_fasta_file, min_depth, parall_view=p_view)
                log.debug(f'Set bed file {fp_proj.bed_fpath}')

            if do_create_run:
                get_or_create_run([fp_proj], parall_view=p_view)

            if do_ngb:
                log.info('Exposing to NGB')
                _add_to_ngb(work_dir, project_name, bam_by_sample, genome, bed_file, p_view)

            if do_sex:
                log.info('Genotyping sex')
                sex_work_dir = safe_mkdir(join(work_dir, 'sex'))
                sex_jobs = []
                for db_s in db_samples:
                    known_depth = depth_by_sample.get(db_s.name) if depth_by_sample else None
                    sex_jobs.append([
                        db_s.name,
                        bam_by_sample[db_s.name],
                        bed_file,
                        sex_work_dir,
                        genome,
                        known_depth,
                        [snp.depth for snp in db_s.snps.all()],
                    ])
                sexes = p_view.run(_sex_from_bam, sex_jobs)
                for s, sex in zip(db_samples, sexes):
                    s.sex = sex

    db.session.commit()
    log.info()
    log.info('Done.')
def main(input_bed, output_file, output_features=False, genome=None,
         only_canonical=False, short=False, extended=False, high_confidence=False,
         ambiguities_method=False, coding_only=False, collapse_exons=False,
         work_dir=False, is_debug=False):
    """ Annotating BED file based on reference features annotations.

    :param input_bed: BED file to annotate.
    :param output_file: path of the annotated output.
    :param output_features: also report the features; implies `extended`.
    :param genome: genome build name (required).
    :param short: short output; cannot be combined with `extended` or `output_features`.
    :param extended: extended output.
    :param work_dir: parent directory for intermediate files; a sub-directory
        named after the input file is created in it. When not given, a
        temporary directory is used and removed at the end.
    :param is_debug: enables debug logging and is passed to `annotate`.
    :raises click.BadParameter: on a missing genome, conflicting output
        options, or a missing input file.

    The remaining options are passed to `annotate` unchanged.
    """
    logger.init(is_debug_=is_debug)

    # param_hint is click's argument for naming a parameter with a string;
    # `param` expects a Parameter object.
    if not genome:
        raise click.BadParameter(
            'Error: please, specify genome build name with -g (e.g. `-g hg19`)',
            param_hint='genome')

    if short:
        if extended:
            raise click.BadParameter(
                '--short and --extended can\'t be set both', param_hint='extended')
        if output_features:
            raise click.BadParameter(
                '--short and --output-features can\'t be set both',
                param_hint='output_features')
    elif output_features or extended:
        extended = True
        short = False

    if not verify_file(input_bed):
        raise click.BadParameter(
            f'Usage: {__file__} Input_BED_file -g hg19 -o Annotated_BED_file [options]',
            param_hint='input_bed')
    input_bed = verify_file(input_bed, is_critical=True,
                            description=f'Input BED file for {__file__}')

    # Remember whether the work dir is ours to clean up: work_dir is
    # overwritten below, so it cannot be tested for that later.
    is_temp_work_dir = not work_dir
    if work_dir:
        work_dir = join(adjust_path(work_dir), os.path.splitext(basename(input_bed))[0])
        safe_mkdir(work_dir)
        info(f'Created work directory {work_dir}')
    else:
        work_dir = mkdtemp('bed_annotate')
        debug(f'Created temporary work directory {work_dir}')

    input_bed = clean_bed(input_bed, work_dir)
    input_bed = verify_bed(
        input_bed, is_critical=True,
        description=f'Input BED file for {__file__} after cleaning')

    output_file = adjust_path(output_file)

    output_file = annotate(input_bed, output_file, work_dir,
                           genome=genome,
                           only_canonical=only_canonical,
                           short=short,
                           extended=extended,
                           high_confidence=high_confidence,
                           collapse_exons=collapse_exons,
                           output_features=output_features,
                           ambiguities_method=ambiguities_method,
                           coding_only=coding_only,
                           is_debug=is_debug)

    if is_temp_work_dir:
        debug(f'Removing work directory {work_dir}')
        shutil.rmtree(work_dir)

    info(f'Done, saved to {output_file}')
def work_dir_path(self):
    """ Returns the work directory `<DATA_DIR>/<id>` of this object, creating it if needed. """
    dir_name = str(self.id)
    dir_path = join(DATA_DIR, dir_name)
    return safe_mkdir(dir_path)
def main(prefix, output_bedpe, output_fasta=None, output_json=None, min_read_support=None,
         ensembl_release=None, peptide_flanking_len=None, debug=False, no_filtering=False,
         check_transcript=True, pizzly_ref_fa=None, reads=None, min_tpm=None):
    """ Converts pizzly fusion calls into a BEDPE file, optionally with peptides and expression.

    Reads `<prefix>.json` and `<prefix>.fusions.fasta`. Round 1 builds fusions
    with genomic coordinates and filters them; round 2 translates peptides,
    drops duplicates and low-expressed fusions, and writes the outputs.

    :param prefix: pizzly output prefix.
    :param output_bedpe: path of the BEDPE file to write.
    :param output_fasta: optional path for the FASTA records of fusions with a peptide.
    :param output_json: not used by this function.
    :param min_read_support: minimal fusion support; not applied when None.
    :param ensembl_release: Ensembl release; also stored in the global `ENSEMBL_RELEASE`.
    :param peptide_flanking_len: peptide flanking length; None or a negative
        value means no limit is passed to `make_peptide`.
    :param debug: passed to `logger.init`.
    :param no_filtering: when True, the support and TPM filters are skipped.
    :param check_transcript: skip fusions whose transcripts fail `_transcript_is_good`.
    :param pizzly_ref_fa: reference FASTA for re-quantification; together with
        `reads` it enables expression columns.
    :param reads: reads for re-quantification.
    :param min_tpm: minimal TPM; not applied when None.
    """
    # input_flat_fpath = prefix + '-flat.tsv'
    input_json_fpath = prefix + '.json'
    input_fasta = prefix + '.fusions.fasta'
    output_bedpe = abspath(output_bedpe)

    logger.init(debug)

    global ENSEMBL_RELEASE
    ENSEMBL_RELEASE = ensembl_release
    ebl = EnsemblRelease(ENSEMBL_RELEASE)

    # Thresholds left at None are not applied: comparing a number with None
    # raises a TypeError.
    apply_filters = no_filtering is not True
    if peptide_flanking_len is not None and peptide_flanking_len < 0:
        peptide_flanking_len = None

    # Reading filtered tsv
    # filt_fusions = set()
    # with open(input_flat_fpath) as f:
    #     for row in csv.DictReader(f, delimiter='\t'):
    #         filt_fusions.add((row['geneA.name'], row['geneB.name']))

    # Read json
    with open(input_json_fpath) as f:
        gene_events = list(json.load(f)['genes'])

    # Read fasta
    fasta_dict = SeqIO.index(input_fasta, 'fasta')

    # First round: genomic coordinates and fasta
    logger.info(
        f'Round 1: reading {len(gene_events)} gene-pairs events from pizzly JSON'
    )
    fusions = []
    # Event keys: {'geneA', 'geneB', 'paircount', 'splitcount', 'transcripts', 'readpairs'}
    for g_event in gene_events:
        gene_a, gene_b = g_event['geneA']['name'], g_event['geneB']['name']
        # logger.info(f'Processing event {gene_a}>>{gene_b}')

        met_fasta_keys = set()  # collecting to get rid of duplicate transcript events
        for t_event in g_event['transcripts']:
            fusion = Fusion.create_from_pizzly_event(ebl, t_event)

            if check_transcript:
                if not _transcript_is_good(fusion.side_5p.trx) or \
                        not _transcript_is_good(fusion.side_3p.trx):
                    # logger.info(f'Transcripts {fusion.side_5p.trx} and {fusion.side_3p.trx} didn\'t pass check')
                    continue

            if apply_filters and min_read_support is not None \
                    and fusion.support < min_read_support:
                continue

            calc_positions_ok = fusion.calc_genomic_positions()
            if not calc_positions_ok:
                continue

            # comparing our fasta to pizzly fasta
            fusion.fasta_rec = fasta_dict[t_event['fasta_record']]
            _check_fusion_fasta(fusion.fasta_rec, fusion)

            # duplicate fastas are not expected within one gene pair
            k = fusion.side_5p.trx.id, fusion.side_3p.trx.id, fusion.fasta
            assert k not in met_fasta_keys
            met_fasta_keys.add(k)
            fusions.append(fusion)

        # if not met_fasta_keys:
        #     logger.info('   Filtered all fusions for this gene pair.')
        if met_fasta_keys:
            logger.info(
                f'Keeping {len(met_fasta_keys)} fusion(s) for the event {gene_a}-{gene_b}'
            )

    if not fusions:
        logger.warn('Finished: no fusions passed filtering')

    # Calculate expression of fused transcripts
    expr_by_fusion = None
    if reads and fusions:
        # filtered fasta for re-calling expression
        work_dir = safe_mkdir(splitext(output_bedpe)[0] + '_quant')
        fasta_path = join(work_dir, 'fusions.fasta')
        fasta_recs = [f.fasta_rec for f in fusions]
        SeqIO.write(fasta_recs, fasta_path, 'fasta')

        if pizzly_ref_fa:
            expr_by_fusion = requanitify_pizzly(pizzly_ref_fa, fasta_path, work_dir, reads)
            # expr_by_fusion = {fusion-fasta-id -> {length eff_length est_counts tpm}}

    # Second round: peptides and expression
    logger.info()
    logger.info(
        f'Round 2: making peptides for {len(fusions)} events in '
        f'{len(set([(f.side_3p.trx.gene.name, f.side_5p.trx.gene.name) for f in fusions]))} genes pairs'
    )
    met_peptide_keys = set()  # collecting to get rid of duplicate peptides
    bedpe_entries = []
    peptide_fusions = []
    for fusion in fusions:
        if fusion.side_3p.trx.contains_start_codon:
            logger.info(
                f'Translating {fusion.side_5p.trx.gene.name}>>{fusion.side_3p.trx.gene.name} fusion: {fusion}'
            )
            fusion.make_peptide(peptide_flanking_len)
            if fusion.peptide:
                _verify_peptides(fusion.fasta_rec, fusion, peptide_flanking_len)

        # skipping duplicate peptides
        k = fusion.side_5p.trx.gene.name, fusion.side_3p.trx.gene.name, fusion.peptide
        if k in met_peptide_keys:
            logger.debug(f'Skipping peptide {k}: already added')
            continue
        met_peptide_keys.add(k)

        # writing bedpe
        entry = fusion.to_bedpe()

        # add expression
        if expr_by_fusion:
            entry.update(expr_by_fusion[fusion.fasta_rec.id])
            tpm = float(entry['tpm'])
            if apply_filters and min_tpm is not None and tpm < min_tpm:
                logger.debug(
                    f'Skipping peptide {entry}: TPM={tpm} is below {min_tpm}')
                continue

        if fusion.peptide:
            peptide_fusions.append(fusion)
        bedpe_entries.append(entry)

    # Writing bedpe
    with open(output_bedpe, 'w') as bedpe_fh:
        bedpe_header = [
            'chr 5p',
            'start 5p',
            'end 5p',
            'chr 3p',
            'start 3p',
            'end 3p',
            'name',
            'tier',
            'strand 5p',
            'strand 3p',
            'support',
            'is canon bndry',
            'inframe',
            'peptide',
            'fusion pos',
            'nt in the break',
            'transcripts',
            'is canon intron dinuc',
        ]
        if expr_by_fusion:
            # Expression columns are named after the keys of the first quantification record.
            bedpe_header.extend(list(expr_by_fusion.values())[0].keys())
        bedpe_writer = csv.DictWriter(bedpe_fh, fieldnames=bedpe_header, delimiter='\t')
        bedpe_writer.writeheader()
        for bedpe_entry in bedpe_entries:
            bedpe_writer.writerow(bedpe_entry)

    # _test_pvac(output_bedpe)

    # Write fasta
    if output_fasta:
        SeqIO.write([f.fasta_rec for f in peptide_fusions], output_fasta, 'fasta')

    logger.info()
    logger.info(
        f'Written {len(peptide_fusions)} fusions in '
        f'{len(set([(f.side_3p.trx.gene.name, f.side_5p.trx.gene.name) for f in peptide_fusions]))} '
        f'gene pairs good peptides bedpe: {output_bedpe}')