def _proc_file(inp_f, out_f, ctx=None):
    # proc_line_fun is expected to be defined in the enclosing module scope
    max_bunch_size = 1000 * 1000
    written_lines = 0
    bunch = []

    for i, line in enumerate(inp_f):
        clean_line = line.replace('\n', '')
        if clean_line:
            if ctx:
                new_l = proc_line_fun(clean_line, i, ctx)
            else:
                new_l = proc_line_fun(clean_line, i)
            if new_l is not None:
                bunch.append(new_l + '\n')
                written_lines += 1
        else:
            bunch.append(line)
            written_lines += 1

        if len(bunch) >= max_bunch_size:
            out_f.writelines(bunch)
            debug('Written lines: ' + str(written_lines))
            bunch = []

    out_f.writelines(bunch)
    debug('Written lines: ' + str(written_lines))

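# A minimal sketch of the line-processing hook that _proc_file() looks up in
# the enclosing module scope. The body below is illustrative, not part of the
# original code: the hook gets the stripped line and its index (plus an
# optional ctx) and returns the rewritten line, or None to drop it.
def proc_line_fun(line, i, ctx=None):
    if line.startswith('#'):
        return None  # drop comment lines
    return line.upper()  # example transform
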
def convert_file(work_dir, input_fpath, convert_file_fn, suffix=None, output_fpath=None,
                 check_result=True, overwrite=False, reuse=True, ctx=None):
    assert output_fpath or suffix, str(output_fpath) + ' ' + str(suffix)
    output_fpath = output_fpath or intermediate_fname(work_dir, input_fpath, suf=suffix)
    if output_fpath.endswith('.gz'):
        debug('output_fpath is .gz, but writing to uncompressed.')
        output_fpath = splitext(output_fpath)[0]

    if not overwrite:
        if can_reuse(output_fpath, cmp_f=input_fpath):
            debug('Reusing ' + output_fpath)
            return output_fpath
        if can_reuse(output_fpath + '.gz', cmp_f=input_fpath):
            debug('Reusing ' + output_fpath + '.gz')
            return output_fpath

    if islink(output_fpath):
        os.unlink(output_fpath)

    debug('Writing to ' + output_fpath)
    with file_transaction(work_dir, output_fpath) as tx_fpath:
        with open_gzipsafe(input_fpath) as inp_f, open(tx_fpath, 'w') as out_f:
            if ctx:
                convert_file_fn(inp_f, out_f, ctx)
            else:
                convert_file_fn(inp_f, out_f)

    if suffix or output_fpath:
        debug('Saved to ' + output_fpath)

    verify_file(output_fpath, is_critical=check_result)
    return output_fpath

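# Hypothetical usage sketch for convert_file(): the callback receives open
# input and output file handles, so any stream transform fits. The work dir,
# input path, and the uppercasing callback are illustrative only.
def _upper_case(inp_f, out_f):
    for line in inp_f:
        out_f.write(line.upper())

# out_fpath = convert_file('/tmp/work', '/data/sample.vcf.gz', _upper_case, suffix='upper')
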
def __init__(self, work_dir, output_dir, fai_fpath, bed_fpath=None, padding=None,
             reannotate=False, genome=None, is_debug=False):
    self.bed = None
    self.original_bed_fpath = None
    self.bed_fpath = None          # with genomic features
    self.capture_bed_fpath = None  # w/o genomic features
    self.qualimap_bed_fpath = None
    self.padded_bed_fpath = None

    self.gene_keys_set = set()    # set of pairs (gene_name, chrom)
    self.gene_keys_list = list()  # list of pairs (gene_name, chrom)
    self.regions_num = None

    self.bases_num = None
    self.fraction = None

    if bed_fpath:
        debug('Using target BED file ' + bed_fpath)
        self.is_wgs = False
        verify_bed(bed_fpath, is_critical=True)
        self.original_bed_fpath = bed_fpath
        self._make_target_bed(bed_fpath, work_dir, output_dir, padding=padding,
                              is_debug=is_debug, fai_fpath=fai_fpath, genome=genome,
                              reannotate=reannotate)
    else:
        debug('No input BED. Assuming whole genome. For region-based reports, analysing RefSeq CDS.')
        self.is_wgs = True
        self._make_wgs_regions_file(work_dir, genome=genome)

def combined_regional_reports(work_dir, output_dir, samples):
    if not any(verify_file(s.targqc_region_tsv, silent=True) for s in samples):
        return None

    tsv_region_rep_fpath = join(output_dir, basename(samples[0].targqc_region_tsv))
    debug('Combining regional reports, writing to ' + tsv_region_rep_fpath)

    with file_transaction(work_dir, tsv_region_rep_fpath) as tx_tsv:
        with open(tx_tsv, 'w') as tsv_out:
            # sample_i = 0
            # for s in samples:
            #     if s.targqc_region_txt and verify_file(s.targqc_region_txt):
            #         with open(s.targqc_region_txt) as txt_in:
            #             for l in txt_in:
            #                 if l.startswith('#'):
            #                     if not l.startswith('##') and sample_i == 0:
            #                         txt_out.write('#Sample' + ' '*(max(len('#Sample'), len(s.name)) - len('#Sample')) + ' ' + l.replace('#Chr', 'Chr '))
            #                 else:
            #                     txt_out.write(s.name + ' '*(max(len('#Sample'), len(s.name)) - len(s.name)) + ' ' + l)
            #         sample_i += 1
            sample_i = 0
            for s in samples:
                if s.targqc_region_tsv and verify_file(s.targqc_region_tsv):
                    with open(s.targqc_region_tsv) as tsv_in:
                        for i, l in enumerate(tsv_in):
                            if i == 0:
                                if sample_i == 0:
                                    tsv_out.write('sample\t' + l)
                            else:
                                tsv_out.write(s.name + '\t' + l)
                    sample_i += 1

    return tsv_region_rep_fpath

def _make_wgs_regions_file(self, work_dir, genome=None):
    self.wgs_bed_fpath = join(work_dir, 'targqc_features_to_report.bed')
    if can_reuse(self.wgs_bed_fpath, ebl.ensembl_gtf_fpath(genome)):
        return self.wgs_bed_fpath

    chr_order = reference_data.get_chrom_order(genome or cfg.genome)
    r_by_tx_by_gene = OrderedDefaultDict(lambda: defaultdict(list))
    all_features = ebl.get_all_features(genome or cfg.genome, high_confidence=True)

    debug('Select best transcript to report')
    for r in all_features:
        if r[ebl.BedCols.FEATURE] != 'gene':
            gene = r[ebl.BedCols.GENE]
            tx = r[ebl.BedCols.ENSEMBL_ID]
            r_by_tx_by_gene[gene][tx].append(r.fields)

    with file_transaction(work_dir, self.wgs_bed_fpath) as tx:
        with open(tx, 'w') as out:
            for gname, r_by_tx in r_by_tx_by_gene.items():
                all_tx = (x for xx in r_by_tx.values() for x in xx
                          if x[ebl.BedCols.FEATURE] == 'transcript')
                tx_sorted_list = [x[ebl.BedCols.ENSEMBL_ID]
                                  for x in sorted(all_tx, key=tx_priority_sort_key)]
                if not tx_sorted_list:
                    continue
                tx_id = tx_sorted_list[0]
                for r in sorted(r_by_tx[tx_id], key=get_sort_key(chr_order)):
                    out.write('\t'.join(str(f) for f in r) + '\n')

    return self.wgs_bed_fpath

def main():
    options = [
        (['-g', '--genome'], dict(
            dest='genome',
            help='Genome build. Accepted values: ' + ', '.join(ebl.SUPPORTED_GENOMES),
        )),
    ]
    parser = OptionParser()
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        critical('Error: please, specify genome build name with -g (e.g. `-g hg19`)')
    genome = opts.genome

    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ebl.SUPPORTED_GENOMES))

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[ebl.BedCols.FEATURE] == 'CDS')
    features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))

    info('Saving CDS regions...')
    output_fpath = adjust_path(join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        features_bed.cut(range(6)).saveas(tx)
    info('Done, saved to ' + output_fpath)

def check_md5(work_dir, fpath, file_type, silent=False):
    md5_fpath = join(work_dir, file_type + '_md5.txt')
    new_md5 = md5(fpath)
    info('md5 of ' + fpath + ' is ' + str(new_md5))

    prev_md5 = None
    if isfile(md5_fpath):
        with open(md5_fpath) as f:
            prev_md5 = f.read()
    else:
        info('Previous md5 file ' + md5_fpath + ' does not exist')
    info('Checking previous md5 from ' + md5_fpath + ': ' + str(prev_md5))

    if prev_md5 == new_md5:
        if not silent:
            debug('Reusing previous ' + file_type.upper() + ' files.')
        return True
    else:
        if not silent:
            info('Pre-processing input ' + file_type.upper() + ' file')
        if prev_md5:
            if not silent:
                info('Prev ' + file_type.upper() + ' md5: ' + str(prev_md5))
                info('New ' + file_type.upper() + ' md5: ' + str(new_md5))

        with open(md5_fpath, 'w') as f:
            f.write(str(new_md5))

        return False

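# Hedged usage sketch: check_md5() returns True when the stored checksum
# matches the current file, letting callers skip re-processing. The paths and
# the downstream step are illustrative.
# if not check_md5('/tmp/work', '/data/sample_R1.fastq.gz', 'fastq'):
#     preprocess_fastq()  # hypothetical re-processing step
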
def get_all_features(genome, high_confidence=False, features=None, gene_names=None, only_canonical=False):
    _canon_filt = get_only_canonical_filter(genome) if only_canonical else None

    ori_genome = genome
    genome = genome.replace('GRCh37', 'hg19')
    genome = genome.replace('GRCh38', 'hg38')

    bed = _get_ensembl_file('ensembl.bed', genome)

    def _filter(x):
        if features:
            if x[BedCols.FEATURE] not in features:
                return False
        if gene_names:
            if x[BedCols.GENE] not in gene_names:
                return False
        if _canon_filt:
            if not _canon_filt(x):
                return False
        return True

    debug('Filtering BEDTool for: specific features, specific genes, canonical')
    bed = bed.filter(_filter)

    if ori_genome.startswith('GRCh'):
        def fix_chr(r):
            r.chrom = r.chrom.replace('chrM', 'MT').replace('chr', '')
            return r
        bed = bed.each(fix_chr)

    # selecting columns up to TX_OVERLAP_PERCENTAGE (to remove Hugo)
    def _select_cols(r):
        r = r[:len(BedCols.cols)-4]
        return r
    bed = bed.each(_select_cols)

    return bed

def sort_bed_gsort(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either fai_fpath or genome build name must be specified')

    with file_transaction(work_dir, output_bed_fpath) as tx:
        run('gsort {input_bed_fpath} {fai_fpath}'.format(**locals()), output_fpath=tx)

    return output_bed_fpath

def get_chrom_lengths(genome=None, fai_fpath=None):
    assert genome or fai_fpath, 'One of genome or fai_fpath should be not None: ' \
        'genome=' + str(genome) + ' fai_fpath=' + str(fai_fpath)

    if not fai_fpath:
        check_genome(genome)
        fai_fpath = get_fai(genome)
    else:
        fai_fpath = verify_file(fai_fpath, is_critical=True)
        if not fai_fpath.endswith('.fai') and not fai_fpath.endswith('.fa'):
            critical('Error: only .fai or .fa files are accepted.')

    chr_lengths = []

    if fai_fpath.endswith('.fa'):
        debug('Reading genome sequence (.fa) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            from Bio import SeqIO
            reference_records = SeqIO.parse(handle, 'fasta')
            for record in reference_records:
                chrom = record.id
                chr_lengths.append((chrom, len(record.seq)))
    else:
        debug('Reading genome index file (.fai) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            for line in handle:
                line = line.strip()
                if line:
                    chrom, length = line.split()[0], int(line.split()[1])
                    chr_lengths.append((chrom, length))

    return chr_lengths

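# Self-contained illustration of the .fai branch above: each line of a .fai
# index is whitespace-separated with the chromosome name first and its length
# second; the sample line here is made up.
example_fai_line = 'chr1\t248956422\t112\t70\t71'
example_chrom, example_len = example_fai_line.split()[0], int(example_fai_line.split()[1])
assert (example_chrom, example_len) == ('chr1', 248956422)
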
def make_region_reports(view, work_dir, samples, target, genome, depth_thresholds):
    bed_fpath = target.bed_fpath or target.wgs_bed_fpath

    if all(can_reuse(s.targqc_region_tsv, [s.bam, bed_fpath]) for s in samples):
        debug('All region reports exist, reusing')
        return [s.targqc_region_tsv for s in samples]

    info('Calculating coverage statistics for CDS and exon regions from RefSeq...')

    depth_thresholds_by_sample = dict()
    for s in samples:
        depth_thresholds_by_sample[s.name] = depth_thresholds

    debug()
    debug('Running sambamba...')
    sambamba_depth_output_fpaths = view.run(sambamba_depth,
        [[s.work_dir, bed_fpath, s.bam, depth_thresholds_by_sample[s.name], None, s.name]
         for s in samples])
    assert len(sambamba_depth_output_fpaths) == len(samples), \
        'Number of sambamba results = ' + str(len(sambamba_depth_output_fpaths)) + \
        ' which is less than the number of samples ' + str(len(samples))

    debug()
    debug('Parsing sambamba results and writing results...')
    view.run(_proc_sambamba_depth,
        [[sambamba_output_fpath, s.targqc_region_tsv, s.name, depth_thresholds_by_sample[s.name]]
         for sambamba_output_fpath, s in zip(sambamba_depth_output_fpaths, samples)])

    info('Done.')
    return [s.targqc_region_tsv for s in samples]

def safe_symlink_to(fpath, dst_dirpath):
    dst = join(dst_dirpath, basename(fpath))
    if not exists(dst):
        try:
            if os.lstat(dst):  # broken symlink
                os.remove(dst)
        except OSError:
            pass
        debug('Symlink ' + fpath + ' -> ' + dst)
        os.symlink(fpath, dst)
    return dst

def run_qualimap(work_dir, output_dir, output_fpaths, bam_fpath, genome, bed_fpath=None, threads=1):
    info('Analysing ' + bam_fpath)

    safe_mkdir(dirname(output_dir))
    safe_mkdir(output_dir)

    mem_cmdl = ''
    mem_m = get_qualimap_max_mem(bam_fpath)
    mem = str(int(mem_m)) + 'M'
    mem_cmdl = '--java-mem-size=' + mem

    cmdline = (find_executable() + ' bamqc --skip-duplicated -nt {threads} {mem_cmdl} -nr 5000 '
               '-bam {bam_fpath} -outdir {output_dir}')

    if genome.startswith('hg') or genome.startswith('GRCh'):
        cmdline += ' -gd HUMAN'
    if genome.startswith('mm'):
        cmdline += ' -gd MOUSE'

    if bed_fpath:
        cmdline += ' -gff {bed_fpath}'
        debug('Using amplicons/capture panel ' + bed_fpath)

    cmdline = cmdline.format(**locals())

    if not all(can_reuse(fp, [bam_fpath, bed_fpath] if bed_fpath else [bam_fpath])
               for fp in output_fpaths):
        for fp in output_fpaths:
            if isfile(fp):
                os.remove(fp)
        try:
            run(cmdline, env_vars=dict(DISPLAY=None))
        except subprocess.CalledProcessError as e:
            if 'The alignment file is unsorted.' in e.output:
                info()
                info('BAM file is unsorted; trying to sort and rerun QualiMap')
                sorted_bam_fpath = sort_bam(bam_fpath)
                cmdline = cmdline.replace(bam_fpath, sorted_bam_fpath)
                run(cmdline, env_vars=dict(DISPLAY=None))

    if not all(verify_file(fp, cmp_f=[bam_fpath, bed_fpath] if bed_fpath else [bam_fpath])
               for fp in output_fpaths):
        critical('Some of the QualiMap results were not generated')

    return output_dir

def get_merged_cds(genome):
    """
    Returns all CDS merged, used:
    - for TargQC general reports CDS coverage statistics for WGS
    - for Seq2C CNV calling when no capture BED available
    """
    bed = get_all_features(genome)
    debug('Filtering BEDTool for high confidence CDS and stop codons')
    return bed\
        .filter(lambda r: r.fields[BedCols.FEATURE] in ['CDS', 'stop_codon'])\
        .filter(high_confidence_filter)\
        .merge()

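# Hedged usage sketch: the merged CDS BedTool can stand in for a capture BED
# in WGS runs; the output path is illustrative.
# get_merged_cds('hg19').saveas('/tmp/work/cds_merged.bed')
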
def can_reuse(fpath, cmp_f, silent=False):
    do_reuse = os.environ.get('REUSE', '1')
    if do_reuse == '0':
        return False
    if not fpath or not isfile(fpath):
        return False
    elif verify_file(fpath, cmp_f=cmp_f, silent=True):
        if not silent:
            debug('Reusing ' + fpath)
        return True
    else:
        return False

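# Usage sketch: reuse is opt-out via the REUSE environment variable read at
# call time; paths here are illustrative.
# os.environ['REUSE'] = '0'  # force regeneration of all intermediates
# if can_reuse('/tmp/work/target.sorted.bed', cmp_f='/data/target.bed'):
#     return '/tmp/work/target.sorted.bed'  # up to date, skip recomputing
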
def get_parallel_view(n_samples, parallel_cfg):
    if parallel_cfg.scheduler and parallel_cfg.threads > 1:
        debug('Starting' + (' test' if not is_cluster() else '') + ' cluster (scheduler: ' +
              parallel_cfg.scheduler + ', queue: ' + parallel_cfg.queue + ') using ' +
              str(parallel_cfg.num_jobs(n_samples)) + ' nodes, ' +
              str(parallel_cfg.cores_per_job(n_samples)) + ' threads per sample')
        return ClusterView(n_samples, parallel_cfg)
    else:
        debug('Running locally using ' + str(parallel_cfg.num_jobs(n_samples)) + ' thread(s)')
        return ThreadedView(n_samples, parallel_cfg)

def clean_bed(bed_fpath, work_dir):
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')

    if not can_reuse(clean_fpath, bed_fpath):
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        bed = BedTool(bed_fpath)
        bed = bed.filter(lambda x: x.chrom and
                         not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))
        bed = bed.remove_invalid()
        with file_transaction(work_dir, clean_fpath) as tx_out_file:
            bed.saveas(tx_out_file)
        verify_bed(clean_fpath, is_critical=True)
        debug('Saved clean BED file into ' + clean_fpath)

    return clean_fpath

def bam_to_bed(bam_fpath, to_gzip=True):
    debug('Converting the BAM to BED to save some memory.')  # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + ('.bed.gz' if to_gzip else '.bed')
    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath
    bedtools = which('bedtools')
    gzip = which('gzip')
    cmdline = '{bedtools} bamtobed -i {bam_fpath}'.format(**locals())
    cmdline += ' | {gzip}'.format(**locals()) if to_gzip else ''
    call_process.run(cmdline, output_fpath=bam_bed_fpath)
    return bam_bed_fpath

def set_up_dirs(proc_name, output_dir=None, work_dir=None, log_dir=None):
    """ Creates output_dir, work_dir, and sets up log
    """
    output_dir = safe_mkdir(adjust_path(output_dir or join(os.getcwd(), proc_name)), 'output_dir')
    debug('Saving results into ' + output_dir)

    work_dir = safe_mkdir(work_dir or join(output_dir, 'work'), 'working directory')
    info('Using work directory ' + work_dir)

    log_fpath = set_up_log(log_dir or safe_mkdir(join(work_dir, 'log')), proc_name + '.log')

    return output_dir, work_dir, log_fpath

def run_multisample_qualimap(output_dir, work_dir, samples, targqc_full_report):
    """ 1. Generates Qualimap2 plots and puts them into plots_dirpath
        2. Adds records to targqc_full_report.plots
    """
    plots_dirpath = join(output_dir, 'plots')
    individual_report_fpaths = [s.qualimap_html_fpath for s in samples]
    if isdir(plots_dirpath) and not any(
            not can_reuse(join(plots_dirpath, f), individual_report_fpaths)
            for f in listdir(plots_dirpath) if not f.startswith('.')):
        debug('Qualimap multisample plots exist - ' + plots_dirpath + ', reusing...')
    else:
        # Qualimap2 run for multi-sample plots
        if len([s.qualimap_html_fpath for s in samples if s.qualimap_html_fpath]) > 0:
            if find_executable() is not None:  # and get_qualimap_type(find_executable()) == 'full':
                qualimap_output_dir = join(work_dir, 'qualimap_multi_bamqc')

                _correct_qualimap_genome_results(samples)
                _correct_qualimap_insert_size_histogram(samples)

                safe_mkdir(qualimap_output_dir)
                rows = []
                for sample in samples:
                    if sample.qualimap_html_fpath:
                        rows += [[sample.name, sample.qualimap_html_fpath]]

                data_fpath = write_tsv_rows(([], rows), join(qualimap_output_dir, 'qualimap_results_by_sample.tsv'))
                qualimap_plots_dirpath = join(qualimap_output_dir, 'images_multisampleBamQcReport')
                cmdline = find_executable() + ' multi-bamqc --data {data_fpath} -outdir {qualimap_output_dir}'.format(**locals())
                run(cmdline, env_vars=dict(DISPLAY=None),
                    checks=[lambda _1, _2: verify_dir(qualimap_output_dir)],
                    reuse=cfg.reuse_intermediate)

                if not verify_dir(qualimap_plots_dirpath):
                    warn('Warning: Qualimap for multi-sample analysis failed to finish. TargQC will not contain plots.')
                    return None
                else:
                    if exists(plots_dirpath):
                        shutil.rmtree(plots_dirpath)
                    shutil.move(qualimap_plots_dirpath, plots_dirpath)
            else:
                warn('Warning: Qualimap for multi-sample analysis was not found. TargQC will not contain plots.')
                return None

    targqc_full_report.plots = []
    for plot_fpath in listdir(plots_dirpath):
        plot_fpath = join(plots_dirpath, plot_fpath)
        if verify_file(plot_fpath) and plot_fpath.endswith('.png'):
            targqc_full_report.plots.append(relpath(plot_fpath, output_dir))

def get_gtf_db(gtf, in_memory=False):
    """ create a gffutils DB """
    db_file = gtf + '.db'
    if gtf.endswith('.gz'):
        db_file = gtf[:-3] + '.db'
    if file_exists(db_file):
        return gffutils.FeatureDB(db_file)
    db_file = ':memory:' if in_memory else db_file
    if in_memory or not file_exists(db_file):
        debug('GTF database does not exist, creating...')
        infer_extent = guess_infer_extent(gtf)
        db = gffutils.create_db(gtf, dbfn=db_file, infer_gene_extent=infer_extent)
        return db
    else:
        return gffutils.FeatureDB(db_file)

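# Hedged usage sketch: the returned gffutils.FeatureDB supports typed and
# region-limited queries; the GTF path and region below are illustrative.
# db = get_gtf_db('/refs/Homo_sapiens.GRCh37.75.gtf.gz')
# for exon in db.features_of_type('exon', limit=('1', 1000000, 2000000)):
#     print(exon.id, exon.start, exon.end)
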
def _make_qualimap_bed(self, work_dir):
    if self.is_wgs:
        return None

    self.qualimap_bed_fpath = intermediate_fname(work_dir, self.capture_bed_fpath, 'qualimap_ready')
    if can_reuse(self.qualimap_bed_fpath, self.capture_bed_fpath):
        return self.qualimap_bed_fpath

    debug('Merging and saving BED into required bed6 format for Qualimap')
    bed = self.bed.sort().merge()

    with file_transaction(work_dir, self.qualimap_bed_fpath) as tx:
        with open(tx, 'w') as out:
            for i, region in enumerate(x for x in bed):
                region = [x for x in list(region) if x]
                fillers = [str(i), "1.0", "+"]
                full = region + fillers[:6 - len(region)]
                out.write("\t".join(full) + "\n")

    verify_file(self.qualimap_bed_fpath, is_critical=True)
    return self.qualimap_bed_fpath

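# Illustration of the BED6 padding above, using made-up values: a 3-column
# interval gets a name (its index), a score, and a strand appended so that
# Qualimap accepts the file as BED6.
example_region = ['chr1', '100', '200']
example_fillers = ['0', '1.0', '+']
assert example_region + example_fillers[:6 - len(example_region)] == \
    ['chr1', '100', '200', '0', '1.0', '+']
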
def set_up_log(log_dir, log_fname):
    log_fpath = join(log_dir, log_fname)

    if file_exists(log_fpath):
        timestamp = datetime.datetime.fromtimestamp(os.stat(log_fpath).st_mtime)
        mv_log_fpath = log_fpath + '.' + timestamp.strftime('%Y-%m-%d_%H-%M-%S_' + str(random() * 1000))
        try:
            if isfile(mv_log_fpath):
                os.remove(mv_log_fpath)
            if not isfile(mv_log_fpath):
                os.rename(log_fpath, mv_log_fpath)
        except OSError:
            pass

    logger.set_log_path(log_fpath)
    debug('Logging to ' + log_fpath)
    debug()
    return log_fpath

def make_general_reports(view, samples, target, genome, depth_threshs, bed_padding,
                         num_pairs_by_sample=None, reuse=False, is_debug=False,
                         reannotate=False, fai_fpath=None):
    if all(all(can_reuse(fp, [s.bam, target.qualimap_bed_fpath] if target.bed else s.bam)
               for fp in _qualimap_outputs(s))
           for s in samples):
        debug('All QualiMap files for all samples exist and are newer than BAMs and BEDs, reusing')
    else:
        info('Running QualiMap...')
        view.run(runner.run_qualimap,
            [[s.work_dir, s.qualimap_dirpath, _qualimap_outputs(s), s.bam, genome,
              target.qualimap_bed_fpath, view.cores_per_job]
             for s in samples])

        for s in samples:
            for fp in _qualimap_outputs(s):
                verify_file(fp, is_critical=True)

    summary_reports = []

    for sample in samples:
        info('-'*70)
        info(sample.name)
        debug('-'*70)
        debug('Parsing QualiMap results...')
        depth_stats, reads_stats, indels_stats, target_stats = parse_qualimap_results(sample)

        _prep_report_data(sample, depth_stats, reads_stats, indels_stats, target_stats,
                          target, num_pairs_by_sample, genome, depth_threshs, fai_fpath=fai_fpath)

        r = _build_report(depth_stats, reads_stats, indels_stats, sample, target,
                          depth_threshs, bed_padding, sample_num=len(samples),
                          is_debug=is_debug, reannotate=reannotate)
        summary_reports.append(r)

    return summary_reports

def _get(relative_path, genome=None):
    """
    :param relative_path: relative path of the file inside the repository
    :param genome: genome name. Can contain a chromosome name after a dash, like hg19-chr20;
                   in case of BED, the returned BedTool will have an added filter.
    :return: BedTool object if it's a BED file, or filepath
    """
    chrom = None
    if genome:
        if '-chr' in genome:
            genome, chrom = genome.split('-')
        check_genome(genome)
        relative_path = relative_path.format(genome=genome)

    path = abspath(join(dirname(__file__), relative_path))
    if not isfile(path) and isfile(path + '.gz'):
        path += '.gz'

    if path.endswith('.bed') or path.endswith('.bed.gz'):
        if path.endswith('.bed.gz'):
            bedtools = which('bedtools')
            if not bedtools:
                critical('bedtools not found in PATH: ' + str(os.environ['PATH']))
            debug('BED is compressed, creating BedTool')
            bed = BedTool(path)
        else:
            debug('BED is uncompressed, creating BedTool')
            bed = BedTool(path)

        if chrom:
            debug('Filtering BEDTool for chrom ' + chrom)
            bed = bed.filter(lambda r: r.chrom == chrom)

        return bed
    else:
        return path

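# Hedged illustration of the '-chr' convention handled above: a genome value
# like 'hg19-chr20' splits into the build and a chromosome filter. The call
# below is a hypothetical example of requesting one chromosome's CDS BED.
# cds_chr20 = _get('{genome}/bed/CDS-canonical.bed', genome='hg19-chr20')
example_genome, example_chr = 'hg19-chr20'.split('-')
assert (example_genome, example_chr) == ('hg19', 'chr20')
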
def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None,
             chr_order=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    regions = []

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical('One of chr_order, fai_fpath, or genome build name must be specified')
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue

                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    start = int(fs[1])
                    end = int(fs[2])
                    other_fields = fs[3:]
                    order = chr_order.get(chrom, -1)
                    regions.append(Region(chrom, start, end, other_fields, order))

                for region in sorted(regions, key=lambda r: r.get_key()):
                    fs = [region.chrom, str(region.start), str(region.end)]
                    fs.extend(region.other_fields)
                    out.write('\t'.join(fs) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' + output_bed_fpath)
    return output_bed_fpath

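# Usage sketch: passing genome pulls chromosome order from that build's .fai;
# chr_order or fai_fpath can be given directly instead. Paths are illustrative.
# sorted_bed_fpath = sort_bed('/data/target.bed', work_dir='/tmp/work', genome='hg19')
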
def _make_target_bed(self, bed_fpath, work_dir, output_dir, is_debug,
                     padding=None, fai_fpath=None, genome=None, reannotate=False):
    clean_target_bed_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if not can_reuse(clean_target_bed_fpath, bed_fpath):
        debug()
        debug('Cleaning target BED file...')
        bed = BedTool(bed_fpath)
        if bed.field_count() > 4:
            bed = bed.cut(range(4))
        bed = bed\
            .filter(lambda x: x.chrom and not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))\
            .remove_invalid()
        with file_transaction(work_dir, clean_target_bed_fpath) as tx:
            bed.saveas(tx)
        debug('Saved to ' + clean_target_bed_fpath)
        verify_file(clean_target_bed_fpath, is_critical=True)

    sort_target_bed_fpath = intermediate_fname(work_dir, clean_target_bed_fpath, 'sorted')
    if not can_reuse(sort_target_bed_fpath, clean_target_bed_fpath):
        debug()
        debug('Sorting target BED file...')
        sort_target_bed_fpath = sort_bed(clean_target_bed_fpath,
                                         output_bed_fpath=sort_target_bed_fpath,
                                         fai_fpath=fai_fpath)
        debug('Saved to ' + sort_target_bed_fpath)
        verify_file(sort_target_bed_fpath, is_critical=True)

    if genome in ebl.SUPPORTED_GENOMES:
        ann_target_bed_fpath = intermediate_fname(work_dir, sort_target_bed_fpath, 'ann_plus_features')
        if not can_reuse(ann_target_bed_fpath, sort_target_bed_fpath):
            debug()
            if BedTool(sort_target_bed_fpath).field_count() == 3 or reannotate:
                debug('Annotating target BED file and collecting overlapping genome features')
                overlap_with_features(sort_target_bed_fpath, ann_target_bed_fpath, work_dir=work_dir,
                                      genome=genome, extended=True, reannotate=reannotate,
                                      only_canonical=True)
            else:
                debug('Overlapping with genomic features:')
                overlap_with_features(sort_target_bed_fpath, ann_target_bed_fpath, work_dir=work_dir,
                                      genome=genome, extended=True, only_canonical=True)
            debug('Saved to ' + ann_target_bed_fpath)
            verify_file(ann_target_bed_fpath, is_critical=True)
    else:
        ann_target_bed_fpath = sort_target_bed_fpath

    final_clean_target_bed_fpath = intermediate_fname(work_dir, ann_target_bed_fpath, 'clean')
    if not can_reuse(final_clean_target_bed_fpath, ann_target_bed_fpath):
        bed = BedTool(ann_target_bed_fpath).remove_invalid()
        with file_transaction(work_dir, final_clean_target_bed_fpath) as tx:
            bed.saveas(tx)
        verify_file(final_clean_target_bed_fpath, is_critical=True)

    self.bed_fpath = final_clean_target_bed_fpath
    self.bed = BedTool(self.bed_fpath)

    self.capture_bed_fpath = add_suffix(join(output_dir, basename(bed_fpath)), 'clean_sorted_ann')
    if not can_reuse(self.capture_bed_fpath, self.bed_fpath):
        with file_transaction(work_dir, self.capture_bed_fpath) as tx:
            self.get_capture_bed().saveas(tx)

    gene_key_set, gene_key_list = get_genes_from_bed(bed_fpath)
    self.gene_keys_set = gene_key_set
    self.gene_keys_list = gene_key_list
    self.regions_num = self.get_capture_bed().count()

    self._make_qualimap_bed(work_dir)
    if padding:
        self._make_padded_bed(work_dir, fai_fpath, padding)

def _annotate(bed, ref_bed, chr_order, fai_fpath, work_dir, ori_col_num,
              high_confidence=False, reannotate=False, is_debug=False, **kwargs):
    # if genome:
    #     genome_fpath = cut(fai_fpath, 2, output_fpath=intermediate_fname(work_dir, fai_fpath, 'cut2'))
    #     intersection = bed.intersect(ref_bed, sorted=True, wao=True, g='<(cut -f1,2 ' + fai_fpath + ')')
    #     intersection = bed.intersect(ref_bed, sorted=True, wao=True, genome=genome.split('-')[0])
    # else:

    intersection_bed = None
    intersection_fpath = None

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    if is_debug:
        intersection_fpath = join(work_dir, 'intersection.bed')
        if isfile(intersection_fpath):
            info('Loading from ' + intersection_fpath)
            intersection_bed = BedTool(intersection_fpath)
    if not intersection_bed:
        if count_bed_cols(fai_fpath) == 2:
            debug('Fai fields size is 2 ' + fai_fpath)
            intersection_bed = bed.intersect(ref_bed, wao=True, g=fai_fpath)
        else:
            debug('Fai fields is ' + str(count_bed_cols(fai_fpath)) + ', not 2')
            intersection_bed = bed.intersect(ref_bed, wao=True)
    if is_debug and not isfile(intersection_fpath):
        intersection_bed.saveas(intersection_fpath)
        debug('Saved intersection to ' + intersection_fpath)

    total_annotated = 0
    total_uniq_annotated = 0
    total_off_target = 0

    met = set()

    overlaps_by_tx_by_gene_by_loc = OrderedDefaultDict(lambda: OrderedDefaultDict(lambda: defaultdict(list)))
    # off_targets = list()

    expected_fields_num = ori_col_num + len(ebl.BedCols.cols[:-4]) + 1
    for i, intersection_fields in enumerate(intersection_bed):
        inters_fields_list = list(intersection_fields)
        if len(inters_fields_list) < expected_fields_num:
            critical(f'Cannot parse the reference BED file - unexpected number of fields '
                     f'({len(inters_fields_list)}) in {inters_fields_list} '
                     f'(expected at least {expected_fields_num})')

        a_chr, a_start, a_end = intersection_fields[:3]
        a_extra_columns = intersection_fields[3:ori_col_num]

        overlap_fields = [None for _ in ebl.BedCols.cols]
        overlap_fields[:len(intersection_fields[ori_col_num:-1])] = intersection_fields[ori_col_num:-1]
        keep_gene_column = not reannotate
        a_gene = None
        if keep_gene_column:
            a_gene = a_extra_columns[0]
        e_chr = overlap_fields[0]
        overlap_size = int(intersection_fields[-1])
        assert e_chr == '.' or a_chr == e_chr, \
            f'Error on line {i}: chromosomes don\'t match ({a_chr} vs {e_chr}). Line: {intersection_fields}'

        # fs = [None for _ in ebl.BedCols.cols]
        # fs[:3] = [a_chr, a_start, a_end]
        reg = (a_chr, int(a_start), int(a_end), tuple(a_extra_columns))

        if e_chr == '.':
            total_off_target += 1
            # off_targets.append(fs)
            overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
        else:
            # fs[3:-1] = db_feature_fields[3:-1]
            total_annotated += 1
            if (a_chr, a_start, a_end) not in met:
                total_uniq_annotated += 1
                met.add((a_chr, a_start, a_end))

            e_gene = overlap_fields[ebl.BedCols.GENE]
            if keep_gene_column and e_gene != a_gene:
                overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
            else:
                transcript_id = overlap_fields[ebl.BedCols.ENSEMBL_ID]
                overlaps_by_tx_by_gene_by_loc[reg][e_gene][transcript_id].append((overlap_fields, overlap_size))

    info('  Total annotated regions: ' + str(total_annotated))
    info('  Total unique annotated regions: ' + str(total_uniq_annotated))
    info('  Total off target regions: ' + str(total_off_target))
    info('Resolving ambiguities...')
    annotated = _resolve_ambiguities(overlaps_by_tx_by_gene_by_loc, chr_order, **kwargs)

    return annotated

def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False, only_canonical=False,
             coding_only=False, short=False, extended=False, is_debug=False, **kwargs):
    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ebl.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir, chr_order=chr_order, genome=genome)

    ori_bed = BedTool(input_bed_fpath)
    ori_col_num = ori_bed.field_count()
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))

    # if reannotate:
    #     bed = BedTool(input_bed_fpath).cut([0, 1, 2])
    #     keep_gene_column = False
    # else:
    #     if col_num > 4:
    #         bed = BedTool(input_bed_fpath).cut([0, 1, 2, 3])
    #     keep_gene_column = True

    # features_bed = features_bed.saveas()
    # cols = features_bed.field_count()
    # if cols < 12:
    #     features_bed = features_bed.each(lambda f: f + ['.']*(12-cols))
    if high_confidence:
        features_bed = features_bed.filter(ebl.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ebl.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x:
        x[ebl.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])
        # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir, ori_col_num,
                          high_confidence=False, reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ebl.BedCols.names[i] for i in ebl.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            header = full_header[:6]
            if short:
                header = full_header[:4]
            if extended:
                header = full_header[:-1]
            if add_ori_extra_fields:
                header.append(full_header[-1])

            if extended:
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
            out.write('\t'.join(header) + '\n')
            for full_fields in annotated:
                fields = full_fields[:6]
                if short:
                    fields = full_fields[:4]
                if extended:
                    fields = full_fields[:-1]
                if add_ori_extra_fields:
                    fields.append(full_fields[-1])
                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1

    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath

def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' +
              str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) + ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex

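# Worked example of the ratio rule above, with made-up depths and an assumed
# FEMALE_Y_COVERAGE_FACTOR of 10: at 100x average depth, 5x over the chrY key
# regions gives factor 100 / 5 = 20 > 10, so the call is 'F'; 60x over chrY
# gives factor 100 / 60 ~= 1.7, so the call is 'M'.
_example_factor = 10.0  # hypothetical FEMALE_Y_COVERAGE_FACTOR
assert (100.0 / 5.0) > _example_factor         # -> sex = 'F'
assert not ((100.0 / 60.0) > _example_factor)  # -> sex = 'M'
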
def proc_fastq(samples, parall_view, work_dir, bwa_prefix, downsample_to, num_pairs_by_sample=None, dedup=True):
    num_pairs_by_sample = num_pairs_by_sample or dict()
    if downsample_to:
        # Read pairs counts
        debug()
        if all(s.name in num_pairs_by_sample for s in samples):
            debug('Using read pairs counts extracted from FastQC reports')
        elif all(can_reuse(make_pair_counts_fpath(join(work_dir, s.name)), s.l_fpath) for s in samples):
            debug('Reusing pairs counts, reading from files')
            num_pairs_by_sample = {s.name: int(open(make_pair_counts_fpath(join(work_dir, s.name))).read().strip())
                                   for s in samples}
        else:
            info('Counting read pairs')
            num_pairs = parall_view.run(count_read_pairs,
                [[s.name, safe_mkdir(join(work_dir, s.name)), s.l_fpath] for s in samples])
            num_pairs_by_sample = {s.name: pairs_count for s, pairs_count in zip(samples, num_pairs)}

        # Downsampling
        debug()
        if all(can_reuse(make_downsampled_fpath(join(work_dir, s.name), s.l_fpath), s.l_fpath) and
               can_reuse(make_downsampled_fpath(join(work_dir, s.name), s.r_fpath), s.r_fpath)
               for s in samples):
            debug('Reusing downsampled FastQ')
            for s in samples:
                s.l_fpath = make_downsampled_fpath(join(work_dir, s.name), s.l_fpath)
                s.r_fpath = make_downsampled_fpath(join(work_dir, s.name), s.r_fpath)
        else:
            if isinstance(downsample_to, float):
                info('Downsampling FastQ to ' + str(float(downsample_to)) + ' fraction of reads')
            else:
                info('Downsampling FastQ to ' + str(int(downsample_to)) + ' read pairs')
            fastq_pairs = parall_view.run(downsample,
                [[join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, downsample_to,
                  num_pairs_by_sample.get(s.name)]
                 for s in samples])
            for s, (l_r, r_r) in zip(samples, fastq_pairs):
                s.l_fpath = l_r
                s.r_fpath = r_r
    else:
        info('Skipping downsampling')

    debug()
    if all(can_reuse(make_bam_fpath(join(work_dir, s.name)), [s.l_fpath, s.r_fpath]) for s in samples):
        debug('All downsampled BAMs exist, reusing')
        for s in samples:
            s.bam = make_bam_fpath(join(work_dir, s.name))
    else:
        bwa = which('bwa')
        if not bwa or not isfile(bwa):  # which() can return None, so check before isfile()
            critical('BWA not found under ' + str(bwa))
        smb = sambamba.get_executable()
        if not (bwa and smb):
            if not bwa:
                err('Error: bwa is required for the alignment pipeline')
            if not smb:
                err('Error: sambamba is required for the alignment pipeline')
            critical('Tools required for alignment not found')
        info('Aligning reads to the reference')
        bam_fpaths = parall_view.run(align,
            [[join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, bwa, smb, bwa_prefix,
              dedup, parall_view.cores_per_job]
             for s in samples])

        bam_fpaths = [verify_bam(b) for b in bam_fpaths]
        if len(bam_fpaths) < len(samples):
            critical('Some samples were not aligned successfully.')
        for bam, s in zip(bam_fpaths, samples):
            s.bam = bam

    return num_pairs_by_sample

def __init__(self, n_samples, parallel_cfg):
    BaseView.__init__(self, n_samples, parallel_cfg)
    from cluster_helper.cluster import ClusterView as CV
    self._view = CV(**parallel_cfg.get_cluster_params(n_samples))
    debug('Starting cluster with ' + str(self.num_jobs) + ' open nodes, ' +
          str(self.cores_per_job) + ' cores per node')

def run(self, fn, param_lists):
    debug('Starting multithreaded function ' + str(fn))
    assert self.n_samples == len(param_lists)
    return self._view(delayed(fn)(*params) for params in param_lists)

def _proc_sambamba_depth(sambamba_depth_output_fpath, output_fpath, sample_name, depth_thresholds):
    read_count_col = None
    mean_cov_col = None
    median_cov_col = None
    min_depth_col = None
    std_dev_col = None
    wn_20_pcnt_col = None

    if can_reuse(output_fpath, sambamba_depth_output_fpath):
        return output_fpath

    debug('Reading coverage statistics and writing regions to ' + output_fpath)

    def write_line(f, fields):
        f.write('\t'.join(fields) + '\n')

    with file_transaction(None, output_fpath) as tx:
        with open(sambamba_depth_output_fpath) as sambamba_depth_file, open(tx, 'w') as out:
            total_regions_count = 0
            for line in sambamba_depth_file:
                fs = line.strip('\n').split('\t')
                if line.startswith('#'):
                    fs = line.split('\t')
                    read_count_col = fs.index('readCount') + 1
                    mean_cov_col = fs.index('meanCoverage') + 1
                    # median_cov_col = fs.index('medianCoverage') if 'medianCoverage' in fs else None
                    # min_depth_col = fs.index('minDepth') if 'minDepth' in fs else None
                    # std_dev_col = fs.index('stdDev') if 'stdDev' in fs else None
                    # wn_20_pcnt_col = fs.index('percentWithin20PercentOfMedian') if 'percentWithin20PercentOfMedian' in fs else None
                    write_line(out, [
                        'chrom',
                        'start',
                        'end',
                        'size',
                        'gene',
                        'exon',
                        'strand',
                        'feature',
                        'biotype',
                        'transcript',
                        'trx_overlap',
                        'exome_overlap',
                        'cds_overlap',
                        # 'min_depth',
                        'avg_depth',
                        # 'median_depth',
                        # 'std_dev',
                        # 'within_20pct_of_median',
                    ] + ['at{}x'.format(ths) for ths in depth_thresholds])
                    continue

                chrom = fs[0]
                start, end = int(fs[1]), int(fs[2])
                region_size = end - start
                gene_name = fs[ebl.BedCols.GENE] if read_count_col != ebl.BedCols.GENE else '.'
                exon = fs[ebl.BedCols.EXON]
                strand = fs[ebl.BedCols.STRAND]
                feature = fs[ebl.BedCols.FEATURE]
                biotype = fs[ebl.BedCols.BIOTYPE]
                transcript = fs[ebl.BedCols.ENSEMBL_ID]
                transcript_overlap = fs[ebl.BedCols.TX_OVERLAP_PERCENTAGE]
                exome_overlap = fs[ebl.BedCols.EXON_OVERLAPS_PERCENTAGE]
                cds_overlap = fs[ebl.BedCols.CDS_OVERLAPS_PERCENTAGE]
                avg_depth = float(fs[mean_cov_col])
                # min_depth = int(fs[min_depth_col]) if min_depth_col is not None else '.'
                # std_dev = float(fs[std_dev_col]) if std_dev_col is not None else '.'
                # median_depth = int(fs[median_cov_col]) if median_cov_col is not None else '.'
                # rate_within_normal = float(fs[wn_20_pcnt_col]) if wn_20_pcnt_col is not None else '.'
                last_cov_col = max(mean_cov_col or 0, median_cov_col or 0, std_dev_col or 0, wn_20_pcnt_col or 0)
                rates_within_threshs = fs[last_cov_col+1:-1]

                write_line(out, [str(v) if v not in ['', None, '.'] else '.' for v in [
                    chrom,
                    start,
                    end,
                    region_size,
                    gene_name,
                    exon,
                    strand,
                    feature,
                    biotype,
                    transcript,
                    ((transcript_overlap + '%') if transcript_overlap not in ['', None, '.'] else '.'),
                    ((exome_overlap + '%') if exome_overlap not in ['', None, '.'] else '.'),
                    ((cds_overlap + '%') if cds_overlap not in ['', None, '.'] else '.'),
                    # min_depth,
                    avg_depth,
                    # median_depth,
                    # std_dev,
                    # rate_within_normal,
                ] + rates_within_threshs])

                total_regions_count += 1
                if total_regions_count > 0 and total_regions_count % 10000 == 0:
                    debug('  Processed {0:,} regions'.format(total_regions_count))
    debug('Total regions: ' + str(total_regions_count))
    return output_fpath

def main():
    description = 'Usage: ' + __file__ + ' hg19 [db.gtf]'

    options = [
        (['--debug'], dict(dest='debug', action='store_true', default=False)),
    ]
    parser = OptionParser(description=description)
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if len(args) == 0:
        parser.exit(1, 'Please provide genome name as the first argument')
    logger.is_debug = opts.debug

    genome_name = args[0]

    if len(args) > 1:
        gtf_fpath = args[1]
    else:
        gtf_fpath = ebl.ensembl_gtf_fpath(genome_name)
    if not isfile(gtf_fpath):
        if not gtf_fpath.endswith('.gz'):
            gtf_fpath += '.gz'
    gtf_fpath = verify_file(gtf_fpath)
    debug('Reading the GTF database')
    db = gtf.get_gtf_db(gtf_fpath)

    debug('Reading biomart data')
    features_by_ens_id = read_biomart(genome_name)

    chroms = [c for c, l in ref.get_chrom_lengths(genome_name)]

    output_fpath = join(dirname(__file__), genome_name, 'ensembl.bed')
    unsorted_output_fpath = add_suffix(output_fpath, 'unsorted')
    debug('Processing features, writing to ' + unsorted_output_fpath)

    def _get(_rec, _key):
        val = _rec.attributes.get(_key)
        if val is None:
            return None
        assert len(val) == 1, (_key, str(val))
        return val[0]

    num_tx_not_in_biomart = 0
    num_tx_diff_gene_in_biomart = 0

    with open(unsorted_output_fpath, 'w') as out:
        out.write('\t'.join(ebl.BedCols.names[i] for i in ebl.BedCols.cols[:-4]) + '\n')

        for rec in db.all_features(order_by=('seqid', 'start', 'end')):
            if rec.featuretype == 'gene':
                continue
            if rec.chrom not in chroms:
                continue
            if rec.end - rec.start < 0:
                continue

            tx_id = _get(rec, 'transcript_id')
            gname = _get(rec, 'gene_name')
            tx_biotype = _get(rec, 'transcript_biotype')
            if not tx_biotype:
                tx_biotype = _get(rec, 'gene_biotype')
            tsl = _get(rec, 'transcript_support_level')
            hugo_gene = None

            biomart_rec = features_by_ens_id.get(tx_id)
            if not biomart_rec:
                if rec.featuretype == 'transcript':
                    num_tx_not_in_biomart += 1
            else:
                bm_gname = biomart_rec['Associated Gene Name']
                bm_tx_biotype = biomart_rec['Transcript type']
                bm_tsl = biomart_rec.get('Transcript Support Level (TSL)')
                hugo_gene = biomart_rec['HGNC symbol']
                if bm_gname != gname:
                    if rec.featuretype == 'transcript':
                        num_tx_diff_gene_in_biomart += 1
                    continue
                tx_biotype = bm_tx_biotype
                tsl = bm_tsl.split()[0].replace('tsl', '') if bm_tsl else None

            fs = [None] * len(ebl.BedCols.cols[:-3])
            if not rec.chrom.startswith('chr'):
                rec.chrom = 'chr' + rec.chrom.replace('MT', 'M')
            fs[:6] = [rec.chrom,
                      str(rec.start - 1),
                      str(rec.end),
                      gname,
                      rec.attributes.get('exon_number', ['.'])[0],
                      rec.strand]
            fs[ebl.BedCols.FEATURE] = rec.featuretype or '.'
            fs[ebl.BedCols.BIOTYPE] = tx_biotype or '.'
            fs[ebl.BedCols.ENSEMBL_ID] = tx_id or '.'
            # fs[ebl.BedCols.REFSEQ_ID] = refseq_id or '.'
            # fs[ebl.BedCols.IS_CANONICAL] = 'canonical' if refseq_id in canonical_transcripts_ids else ''
            fs[ebl.BedCols.TSL] = tsl or '.'
            fs[ebl.BedCols.HUGO] = hugo_gene or '.'
            # fs[ebl.BedCols.names[ensembl.BedCols.GC]] = gc
            out.write('\t'.join(fs) + '\n')

    if num_tx_not_in_biomart:
        warn(str(num_tx_not_in_biomart) + ' transcripts not found in biomart')
    if num_tx_diff_gene_in_biomart:
        warn(str(num_tx_diff_gene_in_biomart) + ' transcripts have a different gene name in biomart')

    debug('Sorting results')
    sort_bed(unsorted_output_fpath, output_fpath, fai_fpath=ref.get_fai(genome_name), genome=genome_name)
    os.remove(unsorted_output_fpath)
    bgzip_and_tabix(output_fpath)