def read_biomart(genome_name):
    """Load biomart transcript records for a genome, keyed by transcript ID.

    The tab-separated biomart file located through ``ebl.biomart_fpath`` is
    read with ``csv.DictReader`` and every row is stored under the value of its
    'Transcript ID' column.

    When ``genome_name`` does not start with 'hg38', the hg38 biomart file is
    read as well: transcripts missing from the first table are added as-is, and
    transcripts already present get their 'Transcript Support Level (TSL)'
    value overwritten with the hg38 one.

    :param genome_name: genome build name passed to ``ebl.biomart_fpath``
    :return: dict mapping transcript ID -> biomart row; an empty dict (after a
        warning) if the genome's own biomart file fails ``verify_file``
    """
    tx_key = 'Transcript ID'
    tsl_key = 'Transcript Support Level (TSL)'

    primary_fpath = ebl.biomart_fpath(genome_name)
    if not verify_file(primary_fpath):
        warn('Warning: biomart file for genome ' + genome_name + ' not found, skip using the TSL values')
        return dict()

    with open(primary_fpath) as handle:
        records = {row[tx_key]: row for row in csv.DictReader(handle, delimiter='\t')}

    if genome_name.startswith('hg38'):
        return records

    # hg38 version has TSL, checking if we can populate some TSL from it
    hg38_fpath = ebl.biomart_fpath('hg38')
    if not verify_file(hg38_fpath):
        critical('Biomart for hg38 file not found, and needed for TSL values')
    with open(hg38_fpath) as handle:
        for row in csv.DictReader(handle, delimiter='\t'):
            known = records.get(row[tx_key])
            if known is None:
                records[row[tx_key]] = row
            else:
                known[tsl_key] = row[tsl_key]
    return records
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    """Guess the sample sex by comparing chrY key-region depth with average depth.

    The chrY key regions are looked up in ``chry_key_regions_by_genome`` by the
    first key that is a substring of ``genome``. If ``target_bed`` is given, the
    key regions are intersected with it and written to ``<work_dir>/male.bed``;
    otherwise the key regions are used directly (WGS case).

    :param work_dir: directory for intermediate files (pybedtools temp dir,
        ``male.bed``)
    :param bam_fpath: path to the BAM file; ``critical`` is called if it is empty
    :param avg_depth: average sample depth, converted with ``float()``
    :param genome: genome name used to select the chrY key regions
    :param target_bed: optional capture target BED
    :return: 'M', 'F', or None when the sex cannot be determined (no key
        regions for the genome, no overlap with the target, or the average
        depth is below ``AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX``)
    """
    debug()
    debug('Determining sex')

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    # The messages below used to contain "it\s", an invalid escape sequence that
    # printed a literal backslash; they now read "it's".
    if chry_mean_coverage == 0:
        debug("Y depth is 0 - it's female")
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's female")
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's male")
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
def tmpdir():
    """Yield a directory created by ``make_tmpdir`` and remove it afterwards.

    The directory is deleted with ``shutil.rmtree`` once the consumer is done,
    including when the consumer raises. A failed removal (``OSError``) is
    reported with a warning and not propagated.

    NOTE(review): this is a bare generator here; presumably it is wrapped by a
    context-manager decorator outside this block - confirm.
    """
    scratch_dir = make_tmpdir()
    try:
        yield scratch_dir
    finally:
        try:
            shutil.rmtree(scratch_dir)
        except OSError:
            warn('Warning: cannot clean up temporary dir ' + scratch_dir)
def workdir(cnf):
    """Yield the working directory for ``cnf``, creating a temporary one if unset.

    If ``cnf.work_dir`` is already set, it is checked with
    ``verify_dir(..., is_critical=True)`` and yielded; it is never removed.

    Otherwise a directory from ``make_tmpdir`` is stored in ``cnf.work_dir``,
    yielded, and removed afterwards. Removal now happens in a ``finally``
    clause, so the temporary directory is also cleaned up when the consumer
    raises (previously it was leaked in that case). A failed removal
    (``OSError``) is reported with a warning and not propagated.

    :param cnf: object with a readable and writable ``work_dir`` attribute
    """
    if cnf.work_dir:
        verify_dir(cnf.work_dir, is_critical=True)
        yield cnf.work_dir
    else:
        cnf.work_dir = make_tmpdir()
        try:
            yield cnf.work_dir
        finally:
            try:
                shutil.rmtree(cnf.work_dir)
            except OSError:
                warn('Warning: cannot clean up temporary dir ' + cnf.work_dir)
def run_multisample_qualimap(output_dir, work_dir, samples, targqc_full_report):
    """ 1. Generates Qualimap2 plots and put into plots_dirpath
        2. Adds records to targqc_full_report.plots

    Existing plots in ``<output_dir>/plots`` are kept when ``can_reuse``
    accepts every non-hidden file there against the per-sample Qualimap
    reports. Otherwise, if at least one sample has ``qualimap_html_fpath`` set,
    ``multi-bamqc`` is run in ``<work_dir>/qualimap_multi_bamqc`` and its
    ``images_multisampleBamQcReport`` directory replaces the plots directory.

    :param output_dir: directory that holds (or will hold) the ``plots`` subdir
    :param work_dir: directory for the intermediate Qualimap output
    :param samples: objects with ``name`` and ``qualimap_html_fpath`` attributes
    :param targqc_full_report: object whose ``plots`` attribute is set to the
        list of ``.png`` paths relative to ``output_dir``
    :return: always None. Returns early, leaving ``targqc_full_report.plots``
        untouched, when the Qualimap executable is not found or its plots
        directory is missing after the run.
    """
    plots_dirpath = join(output_dir, 'plots')
    individual_report_fpaths = [s.qualimap_html_fpath for s in samples]

    plots_reusable = isdir(plots_dirpath) and all(
        can_reuse(join(plots_dirpath, f), individual_report_fpaths)
        for f in listdir(plots_dirpath) if not f.startswith('.'))

    if plots_reusable:
        debug('Qualimap miltisample plots exist - ' + plots_dirpath + ', reusing...')

    # Qualimap2 run for multi-sample plots
    elif any(s.qualimap_html_fpath for s in samples):
        qualimap = find_executable()
        if qualimap is None:  # and get_qualimap_type(find_executable()) == 'full':
            warn('Warning: Qualimap for multi-sample analysis was not found. TargQC will not contain plots.')
            return None

        qualimap_output_dir = join(work_dir, 'qualimap_multi_bamqc')

        _correct_qualimap_genome_results(samples)
        _correct_qualimap_insert_size_histogram(samples)

        safe_mkdir(qualimap_output_dir)
        rows = [[s.name, s.qualimap_html_fpath] for s in samples if s.qualimap_html_fpath]

        data_fpath = write_tsv_rows(([], rows), join(qualimap_output_dir, 'qualimap_results_by_sample.tsv'))
        qualimap_plots_dirpath = join(qualimap_output_dir, 'images_multisampleBamQcReport')
        # Explicit format arguments instead of **locals(), so renaming a local
        # cannot silently break the command line.
        cmdline = qualimap + ' multi-bamqc --data {data_fpath} -outdir {qualimap_output_dir}'.format(
            data_fpath=data_fpath, qualimap_output_dir=qualimap_output_dir)
        run(cmdline, env_vars=dict(DISPLAY=None),
            checks=[lambda _1, _2: verify_dir(qualimap_output_dir)], reuse=cfg.reuse_intermediate)

        if not verify_dir(qualimap_plots_dirpath):
            warn('Warning: Qualimap for multi-sample analysis failed to finish. TargQC will not contain plots.')
            return None
        if exists(plots_dirpath):
            shutil.rmtree(plots_dirpath)
        shutil.move(qualimap_plots_dirpath, plots_dirpath)

    targqc_full_report.plots = []
    if not isdir(plots_dirpath):
        # Reached when no sample has a Qualimap report and no plots directory
        # exists yet; listdir() below would raise on the missing directory.
        debug('No Qualimap multi-sample plots found in ' + plots_dirpath)
        return None
    for plot_fname in listdir(plots_dirpath):
        plot_fpath = join(plots_dirpath, plot_fname)
        if verify_file(plot_fpath) and plot_fpath.endswith('.png'):
            targqc_full_report.plots.append(relpath(plot_fpath, output_dir))
def read_biomart(genome_name):
    """Load biomart transcript records for a genome, keyed by transcript ID.

    The tab-separated biomart file located through ``ebl.biomart_fpath`` is
    read with ``csv.DictReader`` and every row is stored under the value of its
    'Transcript ID' column.

    When ``genome_name`` does not start with 'hg38', the hg38 biomart file is
    read as well: transcripts missing from the first table are added as-is, and
    transcripts already present get their 'Transcript Support Level (TSL)'
    value overwritten with the hg38 one.

    :param genome_name: genome build name passed to ``ebl.biomart_fpath``
    :return: dict mapping transcript ID -> biomart row; an empty dict (after a
        warning) if the genome's own biomart file fails ``verify_file``
    """
    tx_key = 'Transcript ID'
    tsl_key = 'Transcript Support Level (TSL)'

    primary_fpath = ebl.biomart_fpath(genome_name)
    if not verify_file(primary_fpath):
        warn('Warning: biomart file for genome ' + genome_name + ' not found, skip using the TSL values')
        return dict()

    with open(primary_fpath) as handle:
        records = {row[tx_key]: row for row in csv.DictReader(handle, delimiter='\t')}

    if genome_name.startswith('hg38'):
        return records

    # hg38 version has TSL, checking if we can populate some TSL from it
    hg38_fpath = ebl.biomart_fpath('hg38')
    if not verify_file(hg38_fpath):
        critical('Biomart for hg38 file not found, and needed for TSL values')
    with open(hg38_fpath) as handle:
        for row in csv.DictReader(handle, delimiter='\t'):
            known = records.get(row[tx_key])
            if known is None:
                records[row[tx_key]] = row
            else:
                known[tsl_key] = row[tsl_key]
    return records
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    """Guess the sample sex by comparing chrY key-region depth with average depth.

    The chrY key regions are looked up in ``chry_key_regions_by_genome`` by the
    first key that is a substring of ``genome``. If ``target_bed`` is given, the
    key regions are intersected with it and written to ``<work_dir>/male.bed``;
    otherwise the key regions are used directly (WGS case).

    :param work_dir: directory for intermediate files (pybedtools temp dir,
        ``male.bed``)
    :param bam_fpath: path to the BAM file; ``critical`` is called if it is empty
    :param avg_depth: average sample depth, converted with ``float()``
    :param genome: genome name used to select the chrY key regions
    :param target_bed: optional capture target BED
    :return: 'M', 'F', or None when the sex cannot be determined (no key
        regions for the genome, no overlap with the target, or the average
        depth is below ``AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX``)
    """
    debug()
    debug('Determining sex')

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    # The messages below used to contain "it\s", an invalid escape sequence that
    # printed a literal backslash; they now read "it's".
    if chry_mean_coverage == 0:
        debug("Y depth is 0 - it's female")
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's female")
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's male")
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
def main():
    """Build ``ensembl.bed`` for a genome from an Ensembl GTF and biomart data.

    Command line: ``<script> GENOME [db.gtf] [--debug]``. When no GTF path is
    given, it is taken from ``ebl.ensembl_gtf_fpath``; if that path is not a
    file, a ``.gz`` suffix is tried.

    Every non-gene GTF feature on a chromosome known to
    ``ref.get_chrom_lengths`` is written as a tab-separated row to an
    intermediate "unsorted" file next to this script
    (``<script dir>/<genome>/``). When biomart has a record for the transcript,
    biotype, TSL and HGNC symbol come from it, and features whose biomart gene
    name differs from the GTF one are skipped. The file is then sorted into
    ``ensembl.bed``, the intermediate file is removed, and the result is passed
    to ``bgzip_and_tabix``.
    """
    # The usage text was previously a single triple-quoted literal in which
    # "' + __file__ + '" was plain text, so the script name never appeared.
    description = 'Usage: ' + __file__ + ' hg19 [db.gtf]'
    options = [
        (['--debug'], dict(dest='debug', action='store_true', default=False)),
    ]
    parser = OptionParser(description=description)
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()
    if not args:
        parser.exit(1, 'Please provide genome name as the first argument')
    logger.is_debug = opts.debug

    genome_name = args[0]
    if len(args) > 1:
        gtf_fpath = args[1]
    else:
        gtf_fpath = ebl.ensembl_gtf_fpath(genome_name)
    if not isfile(gtf_fpath):
        if not gtf_fpath.endswith('.gz'):
            gtf_fpath += '.gz'
    gtf_fpath = verify_file(gtf_fpath)

    debug('Reading the GTF database')
    db = gtf.get_gtf_db(gtf_fpath)

    debug('Reading biomart data')
    features_by_ens_id = read_biomart(genome_name)

    # A set, because membership is tested once per GTF feature.
    chroms = set(c for c, l in ref.get_chrom_lengths(genome_name))

    output_fpath = join(dirname(__file__), genome_name, 'ensembl.bed')
    unsorted_output_fpath = add_suffix(output_fpath, 'unsorted')
    debug('Processing features, writing to ' + unsorted_output_fpath)

    def _get(_rec, _key):
        # GTF attributes are lists; return the single value, or None if absent.
        val = _rec.attributes.get(_key)
        if val is None:
            return None
        assert len(val) == 1, (_key, str(val))
        return val[0]

    num_tx_not_in_biomart = 0
    num_tx_diff_gene_in_biomart = 0
    with open(unsorted_output_fpath, 'w') as out:
        out.write('\t'.join(ebl.BedCols.names[i] for i in ebl.BedCols.cols[:-4]) + '\n')

        for rec in db.all_features(order_by=('seqid', 'start', 'end')):
            if rec.featuretype == 'gene':
                continue
            if rec.chrom not in chroms:
                continue
            if rec.end - rec.start < 0:
                continue

            tx_id = _get(rec, 'transcript_id')
            gname = _get(rec, 'gene_name')
            tx_biotype = _get(rec, 'transcript_biotype')
            if not tx_biotype:
                tx_biotype = _get(rec, 'gene_biotype')
            tsl = _get(rec, 'transcript_support_level')
            hugo_gene = None

            biomart_rec = features_by_ens_id.get(tx_id)
            if not biomart_rec:
                if rec.featuretype == 'transcript':
                    num_tx_not_in_biomart += 1
            else:
                bm_gname = biomart_rec['Associated Gene Name']
                bm_tx_biotype = biomart_rec['Transcript type']
                bm_tsl = biomart_rec.get('Transcript Support Level (TSL)')
                hugo_gene = biomart_rec['HGNC symbol']
                if bm_gname != gname:
                    if rec.featuretype == 'transcript':
                        num_tx_diff_gene_in_biomart += 1
                    continue
                tx_biotype = bm_tx_biotype
                # Keep the first token of the biomart value, minus the 'tsl' prefix.
                tsl = bm_tsl.split()[0].replace('tsl', '') if bm_tsl else None

            fs = [None] * len(ebl.BedCols.cols[:-3])
            if not rec.chrom.startswith('chr'):
                rec.chrom = 'chr' + rec.chrom.replace('MT', 'M')
            # GTF starts are 1-based, hence start - 1. A missing gene name
            # becomes '.' like the other optional fields; a None here would
            # make the join below raise TypeError.
            fs[:6] = [rec.chrom,
                      str(rec.start - 1),
                      str(rec.end),
                      gname or '.',
                      rec.attributes.get('exon_number', ['.'])[0],
                      rec.strand]
            fs[ebl.BedCols.FEATURE] = rec.featuretype or '.'
            fs[ebl.BedCols.BIOTYPE] = tx_biotype or '.'
            fs[ebl.BedCols.ENSEMBL_ID] = tx_id or '.'
            # fs[ebl.BedCols.REFSEQ_ID] = refseq_id or '.'
            # fs[ebl.BedCols.IS_CANONICAL] = 'canonical' if refseq_id in canonical_transcripts_ids else ''
            fs[ebl.BedCols.TSL] = tsl or '.'
            fs[ebl.BedCols.HUGO] = hugo_gene or '.'
            # fs[ebl.BedCols.names[ensembl.BedCols.GC]] = gc
            out.write('\t'.join(fs) + '\n')

    if num_tx_not_in_biomart:
        warn(str(num_tx_not_in_biomart) + ' transcripts not found in biomart')
    if num_tx_diff_gene_in_biomart:
        warn(str(num_tx_diff_gene_in_biomart) + ' transcripts have a different gene name in biomart')

    debug('Sorting results')
    sort_bed(unsorted_output_fpath, output_fpath, fai_fpath=ref.get_fai(genome_name), genome=genome_name)
    os.remove(unsorted_output_fpath)
    bgzip_and_tabix(output_fpath)
def _log(msg, silent, is_critical):
    """Report ``msg`` through ``critical`` and/or ``warn`` depending on the flags.

    ``critical(msg)`` is called first when ``is_critical`` is true. Then, unless
    ``silent`` is true, ``warn(msg)`` is called.

    NOTE(review): ``critical`` presumably aborts, in which case ``warn`` is
    never reached on that path - confirm.
    """
    if is_critical:
        critical(msg)
    if silent:
        return
    warn(msg)
def main():
    """Build ``ensembl.bed`` for a genome from an Ensembl GTF and biomart data.

    Command line: ``<script> GENOME [db.gtf] [--debug]``. When no GTF path is
    given, it is taken from ``ebl.ensembl_gtf_fpath``; if that path is not a
    file, a ``.gz`` suffix is tried.

    Every non-gene GTF feature on a chromosome known to
    ``ref.get_chrom_lengths`` is written as a tab-separated row to an
    intermediate "unsorted" file next to this script
    (``<script dir>/<genome>/``). When biomart has a record for the transcript,
    biotype, TSL and HGNC symbol come from it, and features whose biomart gene
    name differs from the GTF one are skipped. The file is then sorted into
    ``ensembl.bed``, the intermediate file is removed, and the result is passed
    to ``bgzip_and_tabix``.
    """
    # The usage text was previously a single triple-quoted literal in which
    # "' + __file__ + '" was plain text, so the script name never appeared.
    description = 'Usage: ' + __file__ + ' hg19 [db.gtf]'
    options = [
        (['--debug'], dict(dest='debug', action='store_true', default=False)),
    ]
    parser = OptionParser(description=description)
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()
    if not args:
        parser.exit(1, 'Please provide genome name as the first argument')
    logger.is_debug = opts.debug

    genome_name = args[0]
    if len(args) > 1:
        gtf_fpath = args[1]
    else:
        gtf_fpath = ebl.ensembl_gtf_fpath(genome_name)
    if not isfile(gtf_fpath):
        if not gtf_fpath.endswith('.gz'):
            gtf_fpath += '.gz'
    gtf_fpath = verify_file(gtf_fpath)

    debug('Reading the GTF database')
    db = gtf.get_gtf_db(gtf_fpath)

    debug('Reading biomart data')
    features_by_ens_id = read_biomart(genome_name)

    # A set, because membership is tested once per GTF feature.
    chroms = set(c for c, l in ref.get_chrom_lengths(genome_name))

    output_fpath = join(dirname(__file__), genome_name, 'ensembl.bed')
    unsorted_output_fpath = add_suffix(output_fpath, 'unsorted')
    debug('Processing features, writing to ' + unsorted_output_fpath)

    def _get(_rec, _key):
        # GTF attributes are lists; return the single value, or None if absent.
        val = _rec.attributes.get(_key)
        if val is None:
            return None
        assert len(val) == 1, (_key, str(val))
        return val[0]

    num_tx_not_in_biomart = 0
    num_tx_diff_gene_in_biomart = 0
    with open(unsorted_output_fpath, 'w') as out:
        out.write('\t'.join(ebl.BedCols.names[i] for i in ebl.BedCols.cols[:-4]) + '\n')

        for rec in db.all_features(order_by=('seqid', 'start', 'end')):
            if rec.featuretype == 'gene':
                continue
            if rec.chrom not in chroms:
                continue
            if rec.end - rec.start < 0:
                continue

            tx_id = _get(rec, 'transcript_id')
            gname = _get(rec, 'gene_name')
            tx_biotype = _get(rec, 'transcript_biotype')
            if not tx_biotype:
                tx_biotype = _get(rec, 'gene_biotype')
            tsl = _get(rec, 'transcript_support_level')
            hugo_gene = None

            biomart_rec = features_by_ens_id.get(tx_id)
            if not biomart_rec:
                if rec.featuretype == 'transcript':
                    num_tx_not_in_biomart += 1
            else:
                bm_gname = biomart_rec['Associated Gene Name']
                bm_tx_biotype = biomart_rec['Transcript type']
                bm_tsl = biomart_rec.get('Transcript Support Level (TSL)')
                hugo_gene = biomart_rec['HGNC symbol']
                if bm_gname != gname:
                    if rec.featuretype == 'transcript':
                        num_tx_diff_gene_in_biomart += 1
                    continue
                tx_biotype = bm_tx_biotype
                # Keep the first token of the biomart value, minus the 'tsl' prefix.
                tsl = bm_tsl.split()[0].replace('tsl', '') if bm_tsl else None

            fs = [None] * len(ebl.BedCols.cols[:-3])
            if not rec.chrom.startswith('chr'):
                rec.chrom = 'chr' + rec.chrom.replace('MT', 'M')
            # GTF starts are 1-based, hence start - 1. A missing gene name
            # becomes '.' like the other optional fields; a None here would
            # make the join below raise TypeError.
            fs[:6] = [rec.chrom,
                      str(rec.start - 1),
                      str(rec.end),
                      gname or '.',
                      rec.attributes.get('exon_number', ['.'])[0],
                      rec.strand]
            fs[ebl.BedCols.FEATURE] = rec.featuretype or '.'
            fs[ebl.BedCols.BIOTYPE] = tx_biotype or '.'
            fs[ebl.BedCols.ENSEMBL_ID] = tx_id or '.'
            # fs[ebl.BedCols.REFSEQ_ID] = refseq_id or '.'
            # fs[ebl.BedCols.IS_CANONICAL] = 'canonical' if refseq_id in canonical_transcripts_ids else ''
            fs[ebl.BedCols.TSL] = tsl or '.'
            fs[ebl.BedCols.HUGO] = hugo_gene or '.'
            # fs[ebl.BedCols.names[ensembl.BedCols.GC]] = gc
            out.write('\t'.join(fs) + '\n')

    if num_tx_not_in_biomart:
        warn(str(num_tx_not_in_biomart) + ' transcripts not found in biomart')
    if num_tx_diff_gene_in_biomart:
        warn(str(num_tx_diff_gene_in_biomart) + ' transcripts have a different gene name in biomart')

    debug('Sorting results')
    sort_bed(unsorted_output_fpath, output_fpath, fai_fpath=ref.get_fai(genome_name), genome=genome_name)
    os.remove(unsorted_output_fpath)
    bgzip_and_tabix(output_fpath)
def run_multisample_qualimap(output_dir, work_dir, samples, targqc_full_report):
    """ 1. Generates Qualimap2 plots and put into plots_dirpath
        2. Adds records to targqc_full_report.plots

    Existing plots in ``<output_dir>/plots`` are kept when ``can_reuse``
    accepts every non-hidden file there against the per-sample Qualimap
    reports. Otherwise, if at least one sample has ``qualimap_html_fpath`` set,
    ``multi-bamqc`` is run in ``<work_dir>/qualimap_multi_bamqc`` and its
    ``images_multisampleBamQcReport`` directory replaces the plots directory.

    :param output_dir: directory that holds (or will hold) the ``plots`` subdir
    :param work_dir: directory for the intermediate Qualimap output
    :param samples: objects with ``name`` and ``qualimap_html_fpath`` attributes
    :param targqc_full_report: object whose ``plots`` attribute is set to the
        list of ``.png`` paths relative to ``output_dir``
    :return: always None. Returns early, leaving ``targqc_full_report.plots``
        untouched, when the Qualimap executable is not found or its plots
        directory is missing after the run.
    """
    plots_dirpath = join(output_dir, 'plots')
    individual_report_fpaths = [s.qualimap_html_fpath for s in samples]

    plots_reusable = isdir(plots_dirpath) and all(
        can_reuse(join(plots_dirpath, f), individual_report_fpaths)
        for f in listdir(plots_dirpath) if not f.startswith('.'))

    if plots_reusable:
        debug('Qualimap miltisample plots exist - ' + plots_dirpath + ', reusing...')

    # Qualimap2 run for multi-sample plots
    elif any(s.qualimap_html_fpath for s in samples):
        qualimap = find_executable()
        if qualimap is None:  # and get_qualimap_type(find_executable()) == 'full':
            warn('Warning: Qualimap for multi-sample analysis was not found. TargQC will not contain plots.')
            return None

        qualimap_output_dir = join(work_dir, 'qualimap_multi_bamqc')

        _correct_qualimap_genome_results(samples)
        _correct_qualimap_insert_size_histogram(samples)

        safe_mkdir(qualimap_output_dir)
        rows = [[s.name, s.qualimap_html_fpath] for s in samples if s.qualimap_html_fpath]

        data_fpath = write_tsv_rows(([], rows), join(qualimap_output_dir, 'qualimap_results_by_sample.tsv'))
        qualimap_plots_dirpath = join(qualimap_output_dir, 'images_multisampleBamQcReport')
        # Explicit format arguments instead of **locals(), so renaming a local
        # cannot silently break the command line.
        cmdline = qualimap + ' multi-bamqc --data {data_fpath} -outdir {qualimap_output_dir}'.format(
            data_fpath=data_fpath, qualimap_output_dir=qualimap_output_dir)
        run(cmdline, env_vars=dict(DISPLAY=None),
            checks=[lambda _1, _2: verify_dir(qualimap_output_dir)], reuse=cfg.reuse_intermediate)

        if not verify_dir(qualimap_plots_dirpath):
            warn('Warning: Qualimap for multi-sample analysis failed to finish. TargQC will not contain plots.')
            return None
        if exists(plots_dirpath):
            shutil.rmtree(plots_dirpath)
        shutil.move(qualimap_plots_dirpath, plots_dirpath)

    targqc_full_report.plots = []
    if not isdir(plots_dirpath):
        # Reached when no sample has a Qualimap report and no plots directory
        # exists yet; listdir() below would raise on the missing directory.
        debug('No Qualimap multi-sample plots found in ' + plots_dirpath)
        return None
    for plot_fname in listdir(plots_dirpath):
        plot_fpath = join(plots_dirpath, plot_fname)
        if verify_file(plot_fpath) and plot_fpath.endswith('.png'):
            targqc_full_report.plots.append(relpath(plot_fpath, output_dir))
def determine_sex(work_dir, bam_fpath, ave_depth, genome, target_bed=None):
    """Guess the sample sex by comparing chrY key-region depth with average depth.

    The chrY key regions are looked up in ``chry_key_regions_by_genome`` by the
    first key that is a substring of ``genome``. If ``target_bed`` is given, the
    key regions are intersected with it and merged; sex is only determined when
    the ``count()`` of the result is at least ``MALE_TARGET_REGIONS_FACTOR``
    times the ``count()`` of the original key regions.

    :param work_dir: directory passed to ``sambamba_depth``
    :param bam_fpath: path to the BAM file; ``critical`` is called if it is empty
    :param ave_depth: average sample depth, converted with ``float()``
    :param genome: genome name used to select the chrY key regions
    :param target_bed: optional capture target BED
    :return: 'M', 'F', or None when the sex cannot be determined (no key
        regions for the genome, too little overlap with the target, or the
        average depth is below ``AVE_DEPTH_THRESHOLD_TO_DETERMINE_SEX``)
    """
    info()
    info('Determining sex')

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = male_bed.count()
    info('Male region total size: ' + str(male_area_size))

    if target_bed:
        male_bed = BedTool(target_bed).intersect(male_bed).merge()
        target_male_area_size = male_bed.count()
        if target_male_area_size < male_area_size * MALE_TARGET_REGIONS_FACTOR:
            info('Target male region total size is ' + str(target_male_area_size) + ', which is less than the ' +
                 'checked male regions size * ' + str(MALE_TARGET_REGIONS_FACTOR) +
                 ' (' + str(male_area_size * MALE_TARGET_REGIONS_FACTOR) + ') - cannot determine sex')
            return None
        else:
            info('Target male region total size is ' + str(target_male_area_size) + ', which is higher than the ' +
                 'checked male regions size * ' + str(MALE_TARGET_REGIONS_FACTOR) +
                 ' (' + str(male_area_size * MALE_TARGET_REGIONS_FACTOR) + '). ' +
                 'Determining sex based on coverage in those regions.')
    else:
        info('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_cov_output_fpath = sambamba_depth(work_dir, male_bed, bam_fpath, [])
    chry_mean_coverage = get_mean_cov(chry_cov_output_fpath)
    info('Y key regions average depth: ' + str(chry_mean_coverage))
    ave_depth = float(ave_depth)
    info('Sample average depth: ' + str(ave_depth))
    if ave_depth < AVE_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        # Message typo fixed: "less then" -> "less than".
        info('Sample average depth is too low (less than ' + str(AVE_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
             ') - cannot determine sex')
        return None

    # The messages below used to contain "it\s", an invalid escape sequence that
    # printed a literal backslash; they now read "it's".
    if chry_mean_coverage == 0:
        info("Y depth is 0 - it's female")
        sex = 'F'
    else:
        factor = ave_depth / chry_mean_coverage
        info('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            info('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                 " times higher than Y depth - it's female")
            sex = 'F'
        else:
            info('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                 " times higher than Y depth - it's male")
            sex = 'M'
    info('Sex is ' + sex)
    info()
    return sex