def sambamba_depth(work_dir, bed, bam, depth_thresholds=None, output_fpath=None,
                   sample_name=None, threads=1):
    """Run ``sambamba depth region`` on a BAM over BED regions.

    :param work_dir: directory used for the default output location
    :param bed: path to a BED file, or a ``BedTool`` (saved to a file first)
    :param bam: path to the BAM file; when falsy, nothing is run
    :param depth_thresholds: optional iterable of thresholds, each passed as
        ``-T<int>``; ``None`` entries are skipped
    :param output_fpath: explicit output path; derived from the BED and sample
        names when not given
    :param sample_name: sample label; defaults to the BAM base name without
        its extension
    :param threads: value passed to sambamba's ``-t`` option
    :return: the output file path, or ``None`` when ``bam`` is falsy
    """
    if not bam:
        return None

    if not sample_name:
        sample_name = splitext_plus(basename(bam))[0]
    if not depth_thresholds:
        depth_thresholds = []
    if isinstance(bed, BedTool):
        bed = bed.saveas().fn

    if not output_fpath:
        bed_stem = splitext_plus(basename(bed))[0]
        output_fpath = join(
            work_dir, bed_stem + '_' + sample_name + '_sambamba_depth.txt')

    # An up-to-date result from a previous run is returned as is.
    if can_reuse(output_fpath, [bam, bed]):
        return output_fpath

    threshold_flags = [
        ' -T' + str(int(depth))
        for depth in depth_thresholds
        if depth is not None
    ]
    thresholds_str = ''.join(threshold_flags)
    cmdline = (
        'depth region -F "not duplicate and not failed_quality_control" '
        f'-t {threads} -L {bed} {thresholds_str} {bam}'
    )
    call_sambamba(cmdline, bam_fpath=bam, output_fpath=output_fpath)
    return output_fpath
def intersect_bed(work_dir, bed1, bed2):
    """Write the ``bedtools intersect -u`` result of two BED files.

    The output is placed in ``work_dir`` and named
    ``<bed1 stem>__<bed2 stem>.bed``. If that file can be reused for the two
    inputs, bedtools is not run again.

    :return: path to the intersected BED file
    """
    stem_a, _ = splitext_plus(basename(bed1))
    stem_b, _ = splitext_plus(basename(bed2))
    output_fpath = join(work_dir, stem_a + '__' + stem_b + '.bed')

    if not can_reuse(output_fpath, [bed1, bed2]):
        bedtools = which('bedtools')
        cmdline = f'{bedtools} intersect -u -a {bed1} -b {bed2}'
        call_process.run(
            cmdline,
            output_fpath=output_fpath,
            checks=[call_process.file_exists_check],
        )
    return output_fpath
def intersect_bed(work_dir, bed1, bed2):
    """Intersect ``bed1`` with ``bed2`` using ``bedtools intersect -u``.

    The result goes to ``<work_dir>/<bed1 stem>__<bed2 stem>.bed``. An existing
    reusable result is returned without re-running bedtools.

    :return: path to the output BED file
    """
    first_stem, _ = splitext_plus(basename(bed1))
    second_stem, _ = splitext_plus(basename(bed2))
    output_name = first_stem + '__' + second_stem + '.bed'
    output_fpath = join(work_dir, output_name)

    if can_reuse(output_fpath, [bed1, bed2]):
        return output_fpath

    bedtools = which('bedtools')
    cmdline = '{} intersect -u -a {} -b {}'.format(bedtools, bed1, bed2)
    call_process.run(cmdline, output_fpath=output_fpath,
                     checks=[call_process.file_exists_check])
    return output_fpath
def _overlap_bed_files(bed_files, work_dir, genome):
    """Overlap several BED files into a single file inside ``work_dir``.

    The output name joins the extension-less base names of ``bed_files`` with
    ``__`` and appends ``.<genome>.bed``. The overlap is recomputed only when
    the existing output cannot be reused.

    :return: path to the overlapped BED file
    """
    from clearup.panel import overlap_bed_files

    stems = []
    for bed_fpath in bed_files:
        stems.append(basename(splitext_plus(bed_fpath)[0]))
    joined_stems = '__'.join(stems)
    overlapped_file = join(work_dir, f'{joined_stems}.{genome}.bed')

    if can_reuse(overlapped_file, bed_files):
        return overlapped_file
    overlap_bed_files(bed_files, overlapped_file)
    return overlapped_file
def bam_to_bed(bam_fpath, to_gzip=True):
    """Convert a BAM file to BED with ``bedtools bamtobed``.

    The BED file is written next to the BAM, with the BAM extension replaced
    by ``.bed.gz`` (output piped through gzip) or ``.bed`` when ``to_gzip`` is
    false. A reusable existing output is returned without re-running.

    :return: path to the BED file
    """
    debug('Converting the BAM to BED to save some memory.')
    # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bed_ext = '.bed.gz' if to_gzip else '.bed'
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + bed_ext
    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath

    bedtools = which('bedtools')
    gzip = which('gzip')
    cmdline = f'{bedtools} bamtobed -i {bam_fpath}'
    if to_gzip:
        cmdline += f' | {gzip}'
    call_process.run(cmdline, output_fpath=bam_bed_fpath)
    return bam_bed_fpath
def bam_to_bed_nocnf(bam_fpath, bedtools='bedtools', gzip='gzip'):
    """Convert a BAM file to a gzipped BED using a plain shell pipeline.

    Runs ``<bedtools> bamtobed -i <bam> | <gzip> > <bam stem>.bed.gz`` through
    ``os.system`` and then checks the output with ``verify_file``.

    :param bedtools: bedtools executable to put on the command line
    :param gzip: gzip executable to put on the command line
    :return: whatever ``verify_file`` returns for the output path
    """
    info('Converting the BAM to BED to save some memory.')
    # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + '.bed.gz'
    cmdline = f'{bedtools} bamtobed -i {bam_fpath} | {gzip} > {bam_bed_fpath}'
    info(cmdline)
    os.system(cmdline)

    bam_bed_fpath = verify_file(bam_bed_fpath)
    if not bam_bed_fpath:
        err('Error, result is non-existent or empty')
    else:
        info('Done, saved to ' + bam_bed_fpath)
    return bam_bed_fpath
def sambamba_depth(work_dir, bed, bam, depth_thresholds=None, output_fpath=None,
                   sample_name=None, threads=1):
    """Compute per-region depth for ``bam`` over ``bed`` with sambamba.

    Returns ``None`` immediately when ``bam`` is falsy. Otherwise builds a
    ``depth region`` command that filters out duplicate and QC-failed reads,
    adds one ``-T<int>`` flag per non-``None`` threshold, and hands it to
    ``call_sambamba``. A reusable existing output short-circuits the run.

    :param bed: BED file path or ``BedTool`` object (saved to disk first)
    :param output_fpath: optional output path; defaults to
        ``<work_dir>/<bed stem>_<sample_name>_sambamba_depth.txt``
    :param sample_name: defaults to the BAM base name without extension
    :return: path to the sambamba depth report, or ``None``
    """
    if not bam:
        return None

    sample_name = sample_name if sample_name else splitext_plus(basename(bam))[0]
    depth_thresholds = depth_thresholds if depth_thresholds else []

    if isinstance(bed, BedTool):
        # Persist the in-memory BedTool so sambamba can read it from a path.
        bed = bed.saveas().fn

    if not output_fpath:
        default_name = (splitext_plus(basename(bed))[0] + '_' + sample_name
                        + '_sambamba_depth.txt')
        output_fpath = join(work_dir, default_name)

    if can_reuse(output_fpath, [bam, bed]):
        return output_fpath

    thresholds_str = ''.join(
        ' -T' + str(int(value)) for value in depth_thresholds if value is not None)
    cmd_template = ('depth region -F "not duplicate and not failed_quality_control" '
                    '-t {threads} -L {bed} {thresholds_str} {bam}')
    cmdline = cmd_template.format(
        threads=threads, bed=bed, thresholds_str=thresholds_str, bam=bam)
    call_sambamba(cmdline, bam_fpath=bam, output_fpath=output_fpath)
    return output_fpath
def bam_to_bed(bam_fpath, to_gzip=True):
    """Produce a BED (optionally gzipped) version of a BAM file.

    The output path is the BAM path with its extension swapped for
    ``.bed.gz`` (``to_gzip`` true) or ``.bed``. When that file can be reused
    it is returned directly; otherwise ``bedtools bamtobed`` is run, piped
    through gzip if requested, via ``call_process.run``.

    :return: path to the BED file
    """
    debug('Converting the BAM to BED to save some memory.')
    # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_stem = splitext_plus(bam_fpath)[0]
    if to_gzip:
        bam_bed_fpath = bam_stem + '.bed.gz'
    else:
        bam_bed_fpath = bam_stem + '.bed'

    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath

    bedtools = which('bedtools')
    gzip = which('gzip')
    pipeline = ['{} bamtobed -i {}'.format(bedtools, bam_fpath)]
    if to_gzip:
        pipeline.append(' | {}'.format(gzip))
    cmdline = ''.join(pipeline)
    call_process.run(cmdline, output_fpath=bam_bed_fpath)
    return bam_bed_fpath
def bam_to_bed_nocnf(bam_fpath, bedtools='bedtools', gzip='gzip'):
    """Shell out to bedtools and gzip to turn a BAM into ``<stem>.bed.gz``.

    The command is logged, executed with ``os.system``, and the resulting
    file is passed through ``verify_file``; the outcome is logged either way.

    :param bedtools: name or path of the bedtools executable
    :param gzip: name or path of the gzip executable
    :return: the value returned by ``verify_file`` for the output file
    """
    info('Converting the BAM to BED to save some memory.')
    # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    target_fpath = splitext_plus(bam_fpath)[0] + '.bed.gz'
    cmdline = '{} bamtobed -i {} | {} > {}'.format(
        bedtools, bam_fpath, gzip, target_fpath)
    info(cmdline)
    os.system(cmdline)

    verified_fpath = verify_file(target_fpath)
    if verified_fpath:
        info('Done, saved to ' + verified_fpath)
    else:
        err('Error, result is non-existent or empty')
    return verified_fpath
def find_fastq_pairs(fpaths):
    """Group FastQ files into (left, right) pairs keyed by sample name.

    Only paths whose extension is ``.fq``, ``.fq.gz``, ``.fastq`` or
    ``.fastq.gz`` are considered. A base name ending in ``_1``/``_R1`` is a
    left mate and one ending in ``_2``/``_R2`` is a right mate. The sample
    name is the base name without that suffix, cut before a matching
    ``_S<digits>`` part, with ``-`` replaced by ``_``.

    Duplicated mates are reported through ``critical`` and samples missing a
    mate through ``err``.

    :return: dict mapping sample name to ``(left_path, right_path)``, holding
        only samples for which both mates were found
    """
    info('Finding FastQ pairs...')
    fastq_exts = ('.fq', '.fq.gz', '.fastq', '.fastq.gz')
    # (file name suffix, True for a left mate / False for a right mate)
    mate_suffixes = (('_1', True), ('_R1', True), ('_2', False), ('_R2', False))

    pairs_by_sample = {}
    for fpath in fpaths:
        fn, ext = splitext_plus(basename(fpath))
        if ext not in fastq_exts:
            continue

        sname = left_path = right_path = None
        for suffix, is_left in mate_suffixes:
            if not fn.endswith(suffix):
                continue
            sname = fn[:-len(suffix)]
            if is_left:
                left_path = fpath
            else:
                right_path = fpath

        if sname:
            lane_match = re.match(r'(.*)_S\d+', sname)
            if lane_match:
                sname = lane_match.group(1)
            sname = sname.replace('-', '_')
        else:
            sname = fn
            info('Cannot detect file for ' + sname)

        known_left, known_right = pairs_by_sample.get(sname, (None, None))
        if known_left and left_path:
            critical('Duplicated left FastQ files for ' + sname + ': ' + known_left + ' and ' + left_path)
        if known_right and right_path:
            critical('Duplicated right FastQ files for ' + sname + ': ' + known_right + ' and ' + right_path)
        pairs_by_sample[sname] = known_left or left_path, known_right or right_path

    complete_pairs = {}
    for sname, (left, right) in pairs_by_sample.items():
        if not left:
            err('ERROR: for sample ' + sname + ', left reads not found')
        if not right:
            err('ERROR: for sample ' + sname + ', right reads not found')
        if left and right:
            complete_pairs[sname] = left, right
    return complete_pairs
def _load_datasets(subdirs): vcf_by_project_by_genome = defaultdict(dict) # vcf_by_label = dict() # all_bed_files = [] # project_names = [] datasets = [] for subdir in subdirs: dataset = Dataset() if ':' in subdir: subdir, dataset.genome = subdir.split(':') else: dataset.genome = 'hg19' dir_path = subdir if glob(join(dir_path, '*.vcf.gz')): log.info(f'Found .vcf.gz files in directory {dir_path}') # Simple directory with VCF files and an optional BED file? dataset.name = subdir.replace('/', '__') if glob(join(dir_path, '*.bed')): dataset.bed_file = glob(join(dir_path, '*.bed'))[0] for vcf_fpath in glob(join(dir_path, '*.vcf.gz')): label = join(subdir, basename(splitext_plus(vcf_fpath)[0])).replace( '/', '__') dataset.vcf_by_label[label] = vcf_fpath else: log.info( f'Not found any .vcf.gz files in directory {dir_path}. Checking if that\'s a bcbio folder.' ) # Bcbio directory? bcbio_proj = BcbioProject() bcbio_proj.load_from_bcbio_dir(subdir, proc_name='clearup') dataset.name = bcbio_proj.project_name dataset.genome = bcbio_proj.genome_build for s in bcbio_proj.samples: vcf_file = s.find_raw_vcf() if vcf_file: dataset.vcf_by_label[bcbio_proj.project_name + '__' + s.name] = vcf_file if bcbio_proj.coverage_bed: dataset.bed_file = bcbio_proj.coverage_bed datasets.append(dataset) return datasets
def find_fastq_pairs(fpaths):
    """Pair up left/right FastQ files by sample name.

    Files are recognised by extension (``.fq``, ``.fq.gz``, ``.fastq``,
    ``.fastq.gz``). The mate is determined from the base-name suffix:
    ``_1``/``_R1`` for left reads, ``_2``/``_R2`` for right reads. The sample
    name is the base name without the suffix, truncated before a matching
    ``_S<digits>`` part, and with dashes turned into underscores. Files with
    no recognisable suffix are logged and keyed by their full base name.

    ``critical`` is called when a sample gets a second left or right file, and
    ``err`` when a sample ends up without one of its mates.

    :return: dict ``{sample_name: (left_fpath, right_fpath)}`` for the samples
        that have both mates
    """
    info('Finding FastQ pairs...')

    by_sample = dict()
    for fpath in fpaths:
        fn, ext = splitext_plus(basename(fpath))
        if ext not in ('.fq', '.fq.gz', '.fastq', '.fastq.gz'):
            continue

        sname = None
        mates = [None, None]  # slot 0: left reads, slot 1: right reads
        for suffix, slot in (('_1', 0), ('_R1', 0), ('_2', 1), ('_R2', 1)):
            if fn.endswith(suffix):
                sname = fn[:-len(suffix)]
                mates[slot] = fpath
        l_fpath, r_fpath = mates

        if not sname:
            sname = fn
            info('Cannot detect file for ' + sname)
        else:
            m = re.match(r'(.*)_S\d+', sname)
            if m:
                sname = m.group(1)
            sname = sname.replace('-', '_')

        seen_l, seen_r = by_sample.get(sname, (None, None))
        if seen_l and l_fpath:
            critical('Duplicated left FastQ files for ' + sname + ': ' + seen_l + ' and ' + l_fpath)
        if seen_r and r_fpath:
            critical('Duplicated right FastQ files for ' + sname + ': ' + seen_r + ' and ' + r_fpath)
        by_sample[sname] = (seen_l or l_fpath, seen_r or r_fpath)

    paired = dict()
    for sname, mates in by_sample.items():
        l, r = mates
        if not l:
            err('ERROR: for sample ' + sname + ', left reads not found')
        if not r:
            err('ERROR: for sample ' + sname + ', right reads not found')
        if l and r:
            paired[sname] = (l, r)
    return paired
def make_fingerprint(vcf_file, work_dir=None, label=None, fp_size=20, bed_file=None):
    """Compute the normalised fingerprint for a VCF file, caching on disk.

    Two cache files are kept in ``work_dir`` (default: the directory of
    ``vcf_file``). Both names start with ``label`` (or the VCF base name
    without extension) followed by the fingerprint size and the ``Params``
    settings in effect:

    * the raw fingerprint produced by ``_raw_fingerprint``;
    * the normalised fingerprint produced by ``_normalize_fingerprint``, whose
      name additionally records the enabled normalisation options.

    A cache file is reloaded instead of recomputed when ``can_reuse`` accepts
    it. Arrays are stored with ``ndarray.tofile`` and reloaded with
    ``np.fromfile``, then reshaped to ``(len(index_by_key), fp_size)``.

    NOTE(review): ``np.fromfile`` is called without ``dtype``, so the cached
    arrays are read back with numpy's default dtype; this presumably matches
    the dtype of the computed arrays - confirm.

    :param vcf_file: path to the input VCF
    :param work_dir: directory for cache files
    :param label: optional label; used as the cache name prefix and returned
        unchanged
    :param fp_size: fingerprint size, forwarded to ``_raw_fingerprint``
    :param bed_file: optional BED file, forwarded to ``_raw_fingerprint``
    :return: tuple ``(label, normalised_fingerprint)``
    """
    log.info('Starting processing file ' + vcf_file)
    work_dir = work_dir or dirname(vcf_file)

    print_name = label if label else splitext_plus(basename(vcf_file))[0]
    print_name += '.print' + str(fp_size)
    print_name += '_dist' + str(Params.MIN_DIST)
    print_name += '_af' + str(Params.MIN_AF)
    if not Params.INTERREGION_PAIRS:
        print_name += '_skip_interregion_pairs'

    raw_print_file = join(work_dir, print_name)
    if can_reuse(raw_print_file, vcf_file):
        # The dumps are raw binary, so the files are opened in binary mode.
        with open(raw_print_file, 'rb') as f:
            raw = np.fromfile(f).reshape((len(index_by_key), fp_size))
    else:
        raw = _raw_fingerprint(vcf_file, fp_size=fp_size, bed_file=bed_file)
        with open(raw_print_file, 'wb') as f:
            raw.tofile(f)
        log.info(f'Saved raw fingerprints into {raw_print_file}')

    norm_print_name = print_name
    if Params.NORMALIZE_DIST:
        norm_print_name += '_normdist'
    if Params.NORMALIZE_VAR:
        norm_print_name += '_normvar'

    norm_print_file = join(work_dir, norm_print_name)
    if can_reuse(norm_print_file, raw_print_file):
        with open(norm_print_file, 'rb') as f:
            norm = np.fromfile(f).reshape((len(index_by_key), fp_size))
    else:
        norm = _normalize_fingerprint(raw)
        with open(norm_print_file, 'wb') as f:
            norm.tofile(f)
        log.info(f'Saved normalised fingerprints into {norm_print_file}')

    return label, norm
def main(output_dir=None, tumor_bam=None, normal_bam=None, normal_name=None, tumor_name=None,
         genome=None, input_genomes_url=None, ref_fa=None, viruses_fa=None,
         repeat_masker_bed=None, breakend_pon=None, bp_pon=None, bp_hotspots=None,
         min_tumor_af=None, requested_cores=None, unlock=False, dryrun=False,
         maxcoverage=None, chunksize_mil=None, jvm_heap=None, externalaligner=None):
    """Assemble the GRIDSS workflow configuration and run it via Snakemake.

    The function prepares the output and log directories, removes a stale
    ``work/all.done`` marker, validates the input BAMs and reference indices,
    locates the GRIDSS JAR, and finally calls ``run_snakemake`` on the
    package's ``gridss/Snakefile`` with the collected ``conf`` dict.

    :param output_dir: results directory (default ``'gridss_results'``)
    :param tumor_bam: tumor BAM path; checked with ``verify_file``
    :param normal_bam: optional normal BAM path; checked with ``verify_file``
    :param normal_name: normal sample name; defaults to the BAM base name with
        ``-ready`` and ``-sorted`` removed
    :param tumor_name: tumor sample name; same default rule as ``normal_name``
    :param genome: genome build name, stored in the config as is
    :param input_genomes_url: passed to ``reference_data``'s
        ``find_genomes_dir`` when that package can be imported
    :param ref_fa: reference FASTA; its ``.fai`` index (and ``.bwt`` unless
        ``externalaligner`` is ``'minimap2'``) must exist
    :param viruses_fa: viral FASTA; its ``.fai``, ``.dict`` and ``.img``
        companions (and ``.bwt`` unless minimap2 is used) must exist
    :param requested_cores: upper bound on cores; the count is also capped at 8
        and at the number of CPUs available to the process
    :param unlock: forwarded to ``run_snakemake``
    :param dryrun: forwarded to ``run_snakemake``

    The remaining parameters (``repeat_masker_bed``, ``breakend_pon``,
    ``bp_pon``, ``bp_hotspots``, ``min_tumor_af``, ``maxcoverage``,
    ``chunksize_mil``, ``jvm_heap``, ``externalaligner``) are copied into the
    config under their own names when truthy.
    """
    conf = {}

    output_dir = output_dir or 'gridss_results'
    output_dir = safe_mkdir(abspath(output_dir))
    log_dir = safe_mkdir(join(output_dir, 'log'))
    logger.init(log_fpath_=join(log_dir, 'gridss.log'), save_previous=True)
    # Drop the completion marker left by a previous run.
    if isfile(join(output_dir, 'work', 'all.done')):
        run_simple('rm ' + join(output_dir, 'work', 'all.done'))
    conf['output_dir'] = adjust_path(output_dir)

    tumor_name = tumor_name or splitext_plus(basename(tumor_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    if normal_bam:
        normal_name = normal_name or splitext_plus(basename(normal_bam))[0]\
            .replace('-ready', '').replace('-sorted', '')
        # Store the path itself, like 'tumor_bam' below; a trailing comma
        # here used to wrap it in a 1-tuple.
        conf['normal_bam'] = verify_file(normal_bam, 'Normal BAM, -N option')
        conf['normal_name'] = normal_name
    conf['tumor_bam'] = verify_file(tumor_bam, 'Tumor BAM, -T option')
    conf['tumor_name'] = tumor_name

    try:
        machine_cores = len(os.sched_getaffinity(0))
    except (AttributeError, OSError):
        # sched_getaffinity is not available on every platform.
        machine_cores = 1
    cores = min(machine_cores, 8)
    if requested_cores:
        cores = min(cores, requested_cores)
    conf['cores'] = cores

    tuning_options = (
        ('maxcoverage', maxcoverage),
        ('chunksize_mil', chunksize_mil),
        ('jvm_heap', jvm_heap),
        ('externalaligner', externalaligner),
    )
    for key, value in tuning_options:
        if value:
            conf[key] = value

    conf['genome'] = genome
    try:
        from reference_data import api as refdata
    except ImportError:
        # reference_data is optional; without it no genomes dir is configured.
        pass
    else:
        # check reference_data can find the genomes dir, and error out if not
        genomes_dir = refdata.find_genomes_dir(input_genomes_url)
        if genomes_dir:
            conf['genomes_dir'] = genomes_dir

    if ref_fa:
        if not externalaligner == 'minimap2' and not verify_file(ref_fa + '.bwt'):
            log.critical(f'Please, index {ref_fa} using'
                         f' bwa index {ref_fa}')
        if not verify_file(ref_fa + '.fai'):
            log.critical(f'Please, index {ref_fa} using'
                         f' samtools faidx {ref_fa}')
        conf['ref_fa'] = ref_fa

    if viruses_fa:
        if not externalaligner == 'minimap2' and not verify_file(viruses_fa + '.bwt'):
            log.critical(f'Please, index {viruses_fa} using: '
                         f' bwa index {viruses_fa}')
        if not verify_file(viruses_fa + '.fai'):
            log.critical(f'Please, index {viruses_fa} using '
                         f' samtools faidx {viruses_fa}')
        dict_file = viruses_fa.replace('.fa', '.dict')
        if not verify_file(dict_file):
            log.critical(f'Please, index {viruses_fa} using: '
                         f' samtools dict {viruses_fa} -o {dict_file}')
        img_file = viruses_fa + '.img'
        if not verify_file(img_file):
            log.critical(
                f'Please, create an img file for {viruses_fa} using:\n'
                f' gatk BwaMemIndexImageCreator -I {viruses_fa} -O {img_file}'
            )
        conf['viruses_fa'] = verify_file(viruses_fa)

    resource_options = (
        ('repeat_masker_bed', repeat_masker_bed),
        ('breakend_pon', breakend_pon),
        ('bp_pon', bp_pon),
        ('bp_hotspots', bp_hotspots),
        ('min_tumor_af', min_tumor_af),
    )
    for key, value in resource_options:
        if value:
            conf[key] = value

    py_path = sys.executable  # e.g. /miniconda/envs/umccrise_hmf/bin/python
    env_path = dirname(dirname(py_path))  # e.g. /miniconda/envs/umccrise_hmf
    jar_pattern = 'share/gridss-*/gridss.jar'
    found = glob.glob(join(env_path, jar_pattern))
    if not found:
        # Not in the current environment: fall back to the secondary 'hmf' env.
        hmf_env_path = secondary_conda_env('hmf', is_critical=False)
        if hmf_env_path:
            found = glob.glob(join(hmf_env_path, jar_pattern))
        if not found:
            # Also covers a missing 'hmf' env, which previously went on to
            # index an empty list.
            critical(
                'Cannot find gridss JAR. Make sure you ran `conda install -c bioconda gridss`'
            )
        conf['gridss_env'] = hmf_env_path
    conf['gridss_jar'] = found[0]

    run_snakemake(join(package_path(), 'gridss', 'Snakefile'), conf, cores=cores,
                  output_dir=output_dir, unlock=unlock, dryrun=dryrun)
def main(output_dir=None, normal_bam=None, tumor_bam=None, snv_vcf=None, normal_name=None,
         tumor_name=None, sample=None, genome=None, genomes_dir=None, gridss_ref_dir=None,
         ref_fa=None, threads=None, jvmheap=None):
    """Run the ``gridss-purple-linx.sh`` pipeline for a tumor/normal pair.

    Resolves sample names, the output directory, reference locations and the
    tool JARs inside the secondary ``hmf`` conda environment, then builds one
    shell command (environment assignments followed by the script call) and
    executes it with ``run_simple``.

    :param output_dir: results directory (default ``'gridss'``)
    :param normal_bam: normal BAM path (``-n``)
    :param tumor_bam: tumor BAM path (``-t``)
    :param snv_vcf: SNV VCF passed as ``--snvvcf``
    :param normal_name: normal sample name; defaults to the BAM base name with
        ``-ready`` and ``-sorted`` removed
    :param tumor_name: tumor sample name; same default rule
    :param sample: sample label for ``-s`` and the output VCF name; defaults to
        ``tumor_name``
    :param genome: genome build; must be ``'GRCh37'``
    :param genomes_dir: if given, passed to ``refdata.find_genomes_dir``
    :param gridss_ref_dir: ``--ref_dir`` value; looked up through ``refdata``
        when not given
    :param ref_fa: ``--reference`` value; looked up through ``refdata`` when
        not given
    :param threads: exported to the script as ``THREADS``
    :param jvmheap: if truthy, passed as ``--jvmheap``
    :raises AssertionError: when ``genome`` is not ``'GRCh37'``
    :raises subprocess.SubprocessError: re-raised after logging when the
        pipeline command fails
    """
    gridss_linx_dir = abspath(join(package_path(), '..', 'gridss-purple-linx'))
    gridss_scripts_dir = abspath(join(package_path(), '..', 'gridss/scripts'))

    normal_name = normal_name or splitext_plus(basename(normal_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    tumor_name = tumor_name or splitext_plus(basename(tumor_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    sample = sample or tumor_name

    output_dir = safe_mkdir(abspath(output_dir or 'gridss'))
    logger.init(log_fpath_=join(output_dir, 'gridss.log'), save_previous=True)
    output_vcf = join(output_dir, f'{sample}-gridss-purple-linx.vcf')

    assert genome == 'GRCh37', 'Only GRCh37 is supported for GRIDSS yet'
    if genomes_dir:
        refdata.find_genomes_dir(genomes_dir)
    if not gridss_ref_dir:
        gridss_ref_dir = refdata.get_ref_file(genome, 'gridss_purple_linx_dir')
    if not ref_fa:
        # Look the FASTA up via refdata; the old code called get_ref_file on
        # the empty ref_fa value itself.
        ref_fa = refdata.get_ref_file(genome, 'fa')

    hmf_env_path = conda_utils.secondary_conda_env('hmf')

    def first_jar(pattern):
        # First match of `pattern` inside the hmf env; IndexError if none.
        return glob.glob(join(hmf_env_path, pattern))[0]

    gridss_jar = first_jar('share/gridss-*/gridss.jar')
    amber_jar = first_jar('share/hmftools-amber-*/amber.jar')
    cobalt_jar = first_jar('share/hmftools-cobalt-*/cobalt.jar')
    purple_jar = first_jar('share/hmftools-purple-*/purple.jar')
    linx_jar = first_jar('share/hmftools-linx-*/sv-linx.jar')

    # Environment assignments prefix the script call on one shell line.
    env_assignments = [
        f'PATH={hmf_env_path}/bin:$PATH',
        f'THREADS={threads}',
        f'GRIDSS_JAR={gridss_jar}',
        f'AMBER_JAR={amber_jar}',
        f'COBALT_JAR={cobalt_jar}',
        f'PURPLE_JAR={purple_jar}',
        f'LINX_JAR={linx_jar}',
    ]
    script_call = [
        'bash -x ' + join(gridss_linx_dir, 'gridss-purple-linx.sh'),
        f'-n {normal_bam}',
        f'-t {tumor_bam}',
        f'-v {output_vcf}',
        f'-s {sample}',
        f'--normal_sample {normal_name}',
        f'--tumour_sample {tumor_name}',
        f'--snvvcf {snv_vcf}',
        f'--ref_dir {gridss_ref_dir}',
        f'--install_dir {gridss_scripts_dir}',
        f'--reference {ref_fa}',
        f'--output_dir {output_dir}',
    ]
    if jvmheap:
        script_call.append(f'--jvmheap {jvmheap}')
    cmd = ' '.join(env_assignments + script_call)

    try:
        run_simple(cmd)
    except subprocess.SubprocessError:
        err('--------\n')
        err('Error running GRIDSS-PURPLE-LINX.\n')
        raise