def bgzip_and_tabix(cnf, vcf_fpath, tabix_parameters='', **kwargs):
    gzipped_fpath = vcf_fpath + '.gz'
    tbi_fpath = gzipped_fpath + '.tbi'

    if cnf.reuse_intermediate and \
            file_exists(gzipped_fpath) and \
            file_exists(tbi_fpath) and getctime(tbi_fpath) >= getctime(gzipped_fpath):
        info('Actual compressed VCF and index exist, reusing')
        return gzipped_fpath

    info('Compressing and tabixing VCF file, writing ' + gzipped_fpath + '(.tbi)')

    bgzip = get_system_path(cnf, 'bgzip')
    tabix = get_system_path(cnf, 'tabix')
    if not bgzip:
        err('Cannot index VCF because bgzip is not found in PATH or ' + cnf.sys_cnf)
    if not tabix:
        err('Cannot index VCF because tabix is not found in PATH or ' + cnf.sys_cnf)
    if not bgzip or not tabix:  # both tools are needed to compress and index
        return vcf_fpath

    retrying = False
    while True:
        if isfile(tbi_fpath):
            os.remove(tbi_fpath)

        if isfile(vcf_fpath):
            if isfile(gzipped_fpath):
                os.remove(gzipped_fpath)
            info('BGzipping VCF')
            cmdline = '{bgzip} {vcf_fpath}'.format(**locals())
            call(cnf, cmdline, None, **kwargs)
        else:
            if not verify_file(gzipped_fpath):
                err('Neither uncompressed ' + vcf_fpath + ' nor ' + gzipped_fpath + ' exist')
                return None

        info('Tabixing VCF')
        cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(**locals())
        # Only abort on error on the second (retry) attempt
        kwargs['exit_on_error'] = retrying
        call(cnf, cmdline, **kwargs)

        if isfile(gzipped_fpath + '.tbi'):
            break
        if retrying:
            critical('Cannot tabix ' + vcf_fpath)
        if not isfile(vcf_fpath):
            call(cnf, 'gunzip ' + gzipped_fpath, None)
        retrying = True

    return gzipped_fpath

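# Usage sketch for bgzip_and_tabix (the path below is hypothetical; `cnf` is the
# Config object used throughout this module): compress and index a VCF, falling
# back to the original uncompressed path if bgzip/tabix are not available.
#
#     gz_fpath = bgzip_and_tabix(cnf, '/path/to/sample.vcf')
#     if gz_fpath and gz_fpath.endswith('.gz'):
#         info('Compressed and indexed: ' + gz_fpath)
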
def proc_args(argv):
    group1_name = 'Resistant'
    group2_name = 'Sensitive'

    description = 'This script finds genes with mutations present in (almost) all samples of one group ' \
                  'and (almost) absent in the other group ' \
                  '(default group names: Resistant vs Sensitive). Input is PASS.txt files from bcbio-postproc.'
    parser = OptionParser(description=description)
    parser.add_option('-n', '--num-samples-limit', dest='ns', default=1, type=int,
                      help='For each reported gene: max number of samples WITHOUT the gene in group1, '
                           'max number of samples WITH the gene in group2')
    (opts, args) = parser.parse_args(argv)

    if len(args) == 0:
        critical('No PASS.txt files provided as input.')
    variants_fpaths = [fpath for fpath in args if file_exists(fpath)]
    return opts, [group1_name, group2_name], variants_fpaths

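# Usage sketch for proc_args (argument values are hypothetical):
#
#     opts, (group1, group2), variants_fpaths = proc_args(sys.argv[1:])
#     info('Comparing ' + group1 + ' vs ' + group2 + ' across ' +
#          str(len(variants_fpaths)) + ' PASS.txt files, ns=' + str(opts.ns))
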
def set_up_log(cnf, proc_name=None, project_name=None, project_fpath=None, output_dir=None):
    logger.proc_name = proc_name
    logger.project_name = project_name
    logger.project_fpath = project_fpath or output_dir
    logger.cnf_address = remove_quotes(cnf.email) if cnf.email else ''
    logger.smtp_host = cnf.smtp_host

    if cnf.log_dir:
        log_fname = (proc_name + '_' if proc_name else '') + \
                    (cnf.sample + '_' if cnf.sample else '') + 'log.txt'
        log_fpath = join(cnf.log_dir, log_fname)

        if file_exists(log_fpath):
            # Archive the previous log under a timestamped name
            timestamp = datetime.datetime.fromtimestamp(os.stat(log_fpath).st_mtime)
            mv_log_fpath = log_fpath + '.' + timestamp.strftime('%Y-%m-%d_%H-%M-%S')
            try:
                if isfile(mv_log_fpath):
                    os.remove(mv_log_fpath)
                if not isfile(mv_log_fpath):
                    os.rename(log_fpath, mv_log_fpath)
            except OSError:
                pass
        info('log_fpath: ' + log_fpath)
        info()

        logger.log_fpath = cnf.log = log_fpath

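# Usage sketch for set_up_log (proc/project names are hypothetical): called once
# near process start-up, after `cnf` is read, so that subsequent info()/err()
# calls are mirrored into a log file under cnf.log_dir.
#
#     set_up_log(cnf, proc_name='annotate', project_name='my_project',
#                output_dir=cnf.output_dir)
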
def check_system_resources(cnf, required=list(), optional=list()):
    to_exit = False

    for program in required:
        if not which(program):
            if cnf.resources is None:
                critical('No "resources" section in system config.')

            data = cnf.resources.get(program)
            if data is None:
                err(program + ' is required. Specify path in system config or in your environment.')
                to_exit = True
            else:
                if 'module' in data:
                    os.system('module load ' + data['module'])
                # if 'path' not in data:
                #     data['path'] = program
                elif 'path' in data:
                    data['path'] = adjust_system_path(data['path'])
                    if not isdir(data['path']) and not file_exists(data['path']):
                        err(data['path'] + ' does not exist.')
                        to_exit = True

    for program in optional:
        resources = cnf.get('resources')
        if not resources:
            break
        data = resources.get(program)
        if data is None:
            continue
        data['path'] = adjust_system_path(data['path'])
        if not isdir(data['path']) and not file_exists(data['path']):
            err(data['path'] + ' does not exist.')
            to_exit = True

    if to_exit:
        exit()

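# Usage sketch for check_system_resources (tool names are examples): verify
# required executables before starting the pipeline; the process exits if any
# required tool is missing from both PATH and the "resources" section of the
# system config.
#
#     check_system_resources(cnf, required=['bgzip', 'tabix', 'bcftools'],
#                            optional=['snpEff'])
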
def check_file_changed(cnf, new, in_work):
    if not file_exists(in_work):
        cnf['reuse_intermediate'] = False

    if cnf.get('reuse_intermediate'):
        if (basename(in_work) != basename(new) or
                md5_for_file(open(in_work, 'rb')) != md5_for_file(open_gzipsafe(new, 'rb'))):
            info('Input file %s changed, setting "reuse_intermediate" to False.' % str(new))
            cnf['reuse_intermediate'] = False

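# Usage sketch for check_file_changed (paths are hypothetical): compare a new
# input against the copy already in the work directory, disabling
# reuse_intermediate if the file name or content (MD5) differs.
#
#     check_file_changed(cnf, new=cnf.vcf, in_work=join(cnf.work_dir, 'input.vcf'))
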
def get_chr_len_fpath(cnf):
    chr_len_fpath = join(cnf.work_dir, 'chr_lengths.txt')
    if cnf.reuse_intermediate and file_exists(chr_len_fpath):
        info(chr_len_fpath + ' exists, reusing')
        return chr_len_fpath

    if not cnf.genome.seq:
        critical('There is no "seq" key in ' + cnf.sys_cnf + ' for the "' + cnf.genome.name + '" section')
        return None

    chr_lengths = get_chr_lengths_from_seq(adjust_path(cnf.genome.seq))
    with file_transaction(cnf.work_dir, chr_len_fpath) as tx:
        with open(tx, 'w') as handle:
            for chrom, length in chr_lengths:
                handle.write(chrom + '\t' + str(length) + '\n')
    return chr_len_fpath

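# Usage sketch for get_chr_len_fpath: build (or reuse) the tab-separated
# "chromosome<TAB>length" file derived from cnf.genome.seq; downstream tools
# can consume it as a genome file.
#
#     chr_len_fpath = get_chr_len_fpath(cnf)
#     if not chr_len_fpath:
#         critical('Cannot proceed without chromosome lengths')
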
def _extract_fields(cnf, vcf_fpath, samplename, main_sample_index=0):
    fname, _ = splitext_plus(basename(vcf_fpath))
    tsv_fpath = join(cnf.work_dir, fname + '.tsv')

    if cnf.get('reuse_intermediate'):
        if file_exists(tsv_fpath):
            info(tsv_fpath + ' exists, reusing')
            return tsv_fpath

    manual_tsv_fields = cnf.annotation['tsv_fields']
    if not manual_tsv_fields:
        return None

    all_fields = []
    basic_fields = []
    info_fields = []
    eff_fields = []
    gt_fields = []
    tumor_gt = 'GEN[' + str(main_sample_index) + '].'
    normal_gt = 'GEN[' + str(1 - main_sample_index) + '].'

    lines = []

    with open(vcf_fpath) as inp:
        reader = vcf.Reader(inp)

        info('TSV saver: Building field list')
        for f in [rec.keys()[0] for rec in manual_tsv_fields]:
            if f.startswith('GEN'):
                _f = f.split('.')[1]
                if len(reader.samples) > 0:
                    if _f in reader.formats:
                        gt_fields.append(_f)
                        all_fields.append(f.replace('GEN[*].', tumor_gt))
                        if len(reader.samples) > 1:
                            all_fields.append(f.replace('GEN[*].', normal_gt))
                    else:
                        warn('TSV Saver: Warning: ' + f + ' is not in VCF header FORMAT records')

            elif f in ['CHROM', 'POS', 'REF', 'ALT', 'ID', 'FILTER', 'QUAL']:
                all_fields.append(f)
                basic_fields.append(f)

            elif any(f.startswith(af) and af in reader.infos for af in ['EFF', 'ANN']):
                all_fields.append(f)
                eff_fields.append(f)

            else:
                if f in reader.infos:
                    info_fields.append(f)
                    all_fields.append(f)
                elif f == 'SAMPLE':
                    all_fields.append(f)
                else:
                    warn('TSV Saver: Warning: ' + f + ' is not in VCF header INFO records')

        info('TSV saver: Iterating over records...')
        for rec in reader:
            d = OrderedDict()  # fresh row per record, so no values leak between records
            for f in basic_fields:
                d[f] = getattr(rec, f)

            for f in info_fields:
                d[f] = rec.INFO[f] if f in rec.INFO else ''

            if 'SAMPLE' not in d:
                d['SAMPLE'] = samplename

            if eff_fields:
                # eff_fields[0][:3] is the INFO key itself: 'EFF' or 'ANN'
                eff = rec.INFO.get(eff_fields[0][:3])
                if not eff:
                    for f in eff_fields:
                        d[f] = ''
                else:
                    eff_fs = eff[0].split('|')
                    eff_d = dict()
                    for val, header in zip(eff_fs, ['ALLELE', 'EFFECT', 'IMPACT', 'GENE', 'GENEID',
                                                    'FEATURE', 'FEATUREID', 'BIOTYPE', 'RANK', 'HGVS_C',
                                                    'HGVS_P', 'CDNA_POSLEN', 'CDS_POSLEN', 'AA_POSLEN',
                                                    'DISTANCE', 'LOG']):
                        if 'POSLEN' in header:
                            eff_d[header.split('_')[0] + '_POS'] = val.split('/')[0] if val else ''
                            eff_d[header.split('_')[0] + '_LEN'] = val.split('/')[1] if val else ''
                        else:
                            eff_d[header] = val
                    # ANN=GA|3_prime_UTR_variant|MODIFIER|RPL22|RPL22|transcript|NM_000983.3|Coding|4/4|c.*173dupT|||||173|;
                    # Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID |
                    # Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length |
                    # AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO
                    for f in eff_fields:
                        d[f] = eff_d[f.split('.')[1]]

            if rec.FORMAT:
                for _f in gt_fields:
                    if _f in rec.FORMAT:
                        d[tumor_gt + _f] = rec.samples[main_sample_index][_f]
                        if len(rec.samples) > 1 - main_sample_index:
                            d[normal_gt + _f] = rec.samples[1 - main_sample_index][_f]
                        else:
                            d[normal_gt + _f] = ''
                    else:
                        d[tumor_gt + _f] = ''
                        d[normal_gt + _f] = ''

            fs = []
            for f in all_fields:
                v = d[f]
                fs.append(v if v != '.' else '')
            lines.append(fs)

    info('TSV saver: Adding GEN[*] fields both for sample and for matched normal...')
    field_map = dict()
    for rec in manual_tsv_fields:
        k = rec.keys()[0]
        v = rec.values()[0]
        if k.startswith('GEN[*].'):
            _f = k.split('.')[1]
            field_map[tumor_gt + _f] = v
            field_map[normal_gt + _f] = 'Matched_' + v
        else:
            field_map[k] = v

    info('TSV saver: Writing TSV to ' + tsv_fpath)
    with file_transaction(cnf.work_dir, tsv_fpath) as tx:
        with open(tx, 'w') as out:
            out.write('\t'.join(field_map[f] for f in all_fields) + '\n')
            for fs in lines:
                new_fs = []
                for f in fs:
                    if isinstance(f, list):
                        new_fs.append(','.join(map(str, f)))
                    elif f is None:
                        new_fs.append('')
                    else:
                        new_fs.append(str(f))
                out.write('\t'.join(new_fs) + '\n')

    info('TSV saver: saved ' + tsv_fpath)
    return tsv_fpath

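# Usage sketch for _extract_fields (sample name is hypothetical): flatten an
# annotated VCF into a TSV using the tsv_fields mapping from the annotation
# section of the run config; main_sample_index selects the tumor sample in
# tumor/normal pairs.
#
#     tsv_fpath = _extract_fields(cnf, anno_vcf_fpath, 'sample1', main_sample_index=0)
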
def run_annotators(cnf, vcf_fpath, bam_fpath):
    original_vcf = cnf.vcf

    db_section_by_name = OrderedDict(
        (dbname, cnf.annotation[dbname])
        for dbname in ['dbsnp', 'clinvar', 'cosmic', 'oncomine']
        if dbname in cnf.annotation
        and not cnf.annotation[dbname].get('skip-annotation'))

    # if not cnf.no_check:
    #     to_delete_id_ref = []
    #     if 'dbsnp' in db_section_by_name.keys():
    #         info('Removing IDs from dbsnp as rs*')
    #         to_delete_id_ref.append('rs')
    #     if 'cosmic' in db_section_by_name.keys():
    #         info('Removing IDs from dbsnp as COS*')
    #         to_delete_id_ref.append('COS')
    #
    #     def delete_ids(rec):  # deleting existing dbsnp and cosmic ID annotations
    #         if rec.ID:
    #             if isinstance(rec.ID, basestring):
    #                 if any(rec.ID.startswith(pref) for pref in to_delete_id_ref):
    #                     rec.ID = None
    #             else:
    #                 rec.ID = [id_ for id_ in rec.ID if not any(id_.startswith(pref) for pref in to_delete_id_ref)]
    #
    #         if not rec.FILTER:
    #             rec.FILTER = 'PASS'
    #
    #         return rec
    #
    #     info('Removing previous rs* and COS* IDs')
    #     vcf_fpath = iterate_vcf(cnf, vcf_fpath, delete_ids, suffix='delID')

    bcftools = get_system_path(cnf, 'bcftools')

    if not vcf_fpath.endswith('.gz') or not file_exists(vcf_fpath + '.tbi'):
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    cmdl = '{bcftools} annotate --remove ID {vcf_fpath}'
    res = call(cnf, cmdl.format(**locals()),
               output_fpath=add_suffix(rm_gz_ext(vcf_fpath), 'rmid'))
    if res:
        vcf_fpath = res
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    for dbname, dbconf in db_section_by_name.items() + cnf.annotation.get('custom_vcfs', dict()).items():
        step_greetings('Annotating using ' + dbname)
        annotations = ','.join('INFO/' + a for a in (dbconf.get('annotations') or []))
        if dbname in ('cosmic', 'dbsnp'):
            annotations += ',=ID'
        db_fpath = get_db_path(cnf, dbconf, dbname)
        if db_fpath:
            cmdl = '{bcftools} annotate -a ' + db_fpath + ' -c ' + annotations + ' {vcf_fpath}'
            res = call(cnf, cmdl.format(**locals()),
                       output_fpath=add_suffix(rm_gz_ext(vcf_fpath), dbname))
            if res:
                vcf_fpath = res
                vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    verify_vcf(vcf_fpath, is_critical=True)

    if 'dbnsfp' in cnf.annotation:
        res = _snpsift_db_nsfp(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    if 'snpeff' in cnf.annotation:
        res, summary_fpath, genes_fpath = _snpeff(cnf, vcf_fpath)
        if res:
            vcf_fpath = res
            verify_vcf(vcf_fpath, is_critical=True)
            final_summary_fpath = join(cnf.output_dir, basename(summary_fpath))
            final_genes_fpath = join(cnf.output_dir, basename(genes_fpath))
            if isfile(final_summary_fpath):
                os.remove(final_summary_fpath)
            if isfile(final_genes_fpath):
                os.remove(final_genes_fpath)
            if file_exists(summary_fpath):
                shutil.move(summary_fpath, final_summary_fpath)
            if file_exists(genes_fpath):
                shutil.move(genes_fpath, final_genes_fpath)

    if 'tracks' in cnf.annotation and cnf.annotation['tracks']:
        track_fpaths = []
        for track_name in cnf.annotation['tracks']:
            if isfile(track_name) and verify_file(track_name):
                track_fpaths.append(track_name)
            elif 'tracks' in cnf['genome'] and cnf['genome']['tracks'] and track_name in cnf['genome']['tracks']:
                track_fpath = cnf['genome']['tracks'][track_name]
                if verify_file(track_fpath):
                    track_fpaths.append(track_fpath)
        for track_fpath in track_fpaths:
            res = _tracks(cnf, track_fpath, vcf_fpath)
            if res:
                vcf_fpath = res

    step_greetings('Intersection with database VCFs...')
    if 'intersect_with' in cnf.annotation:
        for key, db_fpath in cnf.annotation['intersect_with'].items():
            res = intersect_vcf(cnf, input_fpath=vcf_fpath, db_fpath=db_fpath, key=key)
            if res:
                vcf_fpath = res

    if 'mongo' in cnf.annotation:
        res = _mongo(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    return vcf_fpath

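# Usage sketch for run_annotators (paths are hypothetical): run the full
# annotation chain (bcftools databases, SnpSift dbNSFP, snpEff, tracks,
# intersections) and get back the path of the final annotated VCF.
#
#     anno_vcf_fpath = run_annotators(cnf, vcf_fpath, bam_fpath)
#     if anno_vcf_fpath:
#         info('Annotation finished: ' + anno_vcf_fpath)
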