def _tracks(cnf, track_fpath, input_fpath):
    if not verify_file(track_fpath):
        return None

    field_name = splitext_plus(basename(track_fpath))[0]

    step_greetings('Intersecting with ' + field_name)

    output_fpath = intermediate_fname(cnf, input_fpath, field_name)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    toolpath = get_system_path(cnf, 'vcfannotate')
    if not toolpath:
        err('WARNING: Skipping annotation with tracks: vcfannotate '
            'executable not found. You probably need to specify the path in system_config, '
            'or load bcbio: . /group/ngs/bin/bcbio-prod.sh')
        return None

    # self.all_fields.append(field_name)

    cmdline = '{toolpath} -b {track_fpath} -k {field_name} {input_fpath}'.format(**locals())

    assert input_fpath
    output_fpath = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                   stdout_to_outputfile=True, overwrite=True)
    if not verify_vcf(output_fpath):
        err('Error: annotation with track ' + track_fpath + ' resulted in ' + str(output_fpath))
        return output_fpath

    # Set TRUE or FALSE for tracks
    def proc_line(line, i):
        if field_name in line:
            if not line.startswith('#'):
                fields = line.split('\t')
                info_line = fields[7]
                info_pairs = [attr.split('=') for attr in info_line.split(';')]
                info_pairs = [[pair[0], ('TRUE' if pair[1] else 'FALSE')]
                              if pair[0] == field_name and len(pair) > 1
                              else pair for pair in info_pairs]
                info_line = ';'.join('='.join(pair) if len(pair) == 2 else pair[0]
                                     for pair in info_pairs)
                fields = fields[:7] + [info_line] + fields[8:]
                return '\t'.join(fields)
        return line

    assert output_fpath
    output_fpath = iterate_file(cnf, output_fpath, proc_line, suffix='trk')
    return verify_vcf(output_fpath, is_critical=True)

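
# Illustrative, self-contained sketch of the TRUE/FALSE rewrite that proc_line
# above applies: vcfannotate tags matching records with '<field_name>=<value>'
# in INFO, and the value is collapsed to TRUE/FALSE. The VCF line and track
# name below are hypothetical examples, not taken from a real run.
def _example_track_flag_rewrite():
    field_name = 'cpgIslands'  # hypothetical track name
    line = 'chr1\t100\t.\tA\tG\t50\tPASS\tDP=10;cpgIslands=1\tGT\t0/1'
    fields = line.split('\t')
    pairs = [attr.split('=') for attr in fields[7].split(';')]
    pairs = [[p[0], 'TRUE' if p[1] else 'FALSE']
             if p[0] == field_name and len(p) > 1 else p
             for p in pairs]
    fields[7] = ';'.join('='.join(p) if len(p) == 2 else p[0] for p in pairs)
    return '\t'.join(fields)  # INFO becomes 'DP=10;cpgIslands=TRUE'
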
def _snpsift_db_nsfp(cnf, input_fpath):
    if 'dbnsfp' not in cnf.annotation or 'dbnsfp' not in cnf.genome:
        return None

    step_greetings('DB NSFP')

    output_fpath = intermediate_fname(cnf, input_fpath, 'db_nsfp')
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    executable = get_java_tool_cmdline(cnf, 'snpsift')

    db_path = cnf['genome']['dbnsfp']
    if not verify_file(db_path, 'DB NSFP file'):
        err('DB NSFP file is incorrect. Skipping.')
        return None

    annotations = cnf.annotation['dbnsfp'].get('annotations') or []

    # all_fields.extend(['dbNSFP_' + ann for ann in annotations])

    ann_line = ('-f ' + ','.join(annotations)) if annotations else ''

    cmdline = '{executable} dbnsfp {ann_line} -v -db {db_path} ' \
              '{input_fpath}'.format(**locals())
    if call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                       stdout_to_outputfile=True, exit_on_error=False, overwrite=True):
        return verify_vcf(output_fpath, is_critical=True)
    else:
        return None

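
# Minimal sketch (an assumption, for illustration only) of the config sections
# _snpsift_db_nsfp reads. The annotation names are common dbNSFP columns and
# the path is a placeholder:
_EXAMPLE_DBNSFP_CNF = {
    'annotation': {'dbnsfp': {'annotations': ['SIFT_score', 'Polyphen2_HVAR_pred']}},
    'genome': {'dbnsfp': '/path/to/dbNSFP.txt.gz'},
}
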
def combine_vcfs(cnf, vcf_fpath_by_sname, combined_vcf_fpath, additional_parameters=''):
    gatk = get_java_tool_cmdline(cnf, 'gatk')
    if not gatk:
        info('GATK is not found, skipping merging VCFs')
        return None

    cmdl = '{gatk} -T CombineVariants -R {cnf.genome.seq} {additional_parameters}'.format(**locals())
    for s_name, vcf_fpath in vcf_fpath_by_sname.items():
        if vcf_fpath:
            cmdl += ' --variant:' + s_name + ' ' + vcf_fpath
    if ' --variant:' not in cmdl:
        err('No VCFs to combine')
        return None

    if cnf.reuse_intermediate and isfile(combined_vcf_fpath + '.gz') \
            and verify_vcf(combined_vcf_fpath + '.gz'):
        info(combined_vcf_fpath + '.gz exists, reusing')
        return combined_vcf_fpath + '.gz'

    cmdl += ' -o ' + combined_vcf_fpath
    res = call(cnf, cmdl, output_fpath=combined_vcf_fpath,
               stdout_to_outputfile=False, exit_on_error=False)
    if res:
        info('Joined VCFs, saved into ' + combined_vcf_fpath)
        if isfile(combined_vcf_fpath + '.tx.idx'):
            try:
                os.remove(combined_vcf_fpath + '.tx.idx')
            except OSError:
                err(traceback.format_exc())
                info()
        return bgzip_and_tabix(cnf, combined_vcf_fpath)
    else:
        warn('Could not join VCFs')
        return None

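
# Hedged usage sketch for combine_vcfs: merging per-sample annotated VCFs into
# one multi-sample file. `cnf` comes from the calling pipeline; the sample
# names and paths below are illustrative placeholders.
def _example_combine_vcfs(cnf):
    vcf_by_sample = OrderedDict([
        ('sample1', '/path/to/sample1.anno.vcf.gz'),
        ('sample2', '/path/to/sample2.anno.vcf.gz'),
    ])
    # Returns the bgzipped combined VCF path, or None if GATK is missing
    # or no input VCFs were given.
    return combine_vcfs(cnf, vcf_by_sample, join(cnf.output_dir, 'combined.vcf'))
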
def add_annotation(cnf, input_fpath, key, value, number, type_, description):
    step_greetings('Adding annotation...')

    def proc_rec(rec):
        rec.INFO[key] = value
        return rec
    output_fpath = iterate_vcf(cnf, input_fpath, proc_rec)

    info('Adding header meta info...')

    def _add_format_header(l, i):
        if l.startswith('#CHROM'):
            ext_l = '##INFO=<ID={key},Number={number},Type={type_},Description="{desc}">\n'.format(
                key=key, number=number, type_=type_, desc=description)
            return ext_l + l
        return l
    output_fpath = iterate_file(cnf, output_fpath, _add_format_header)

    return verify_vcf(output_fpath, is_critical=True)

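
# Hedged usage sketch for add_annotation: stamping every record with the
# caller name. With these (illustrative) arguments the header gains the line
#   ##INFO=<ID=CALLER,Number=1,Type=String,Description="Variant caller">
# and each record's INFO gets CALLER=vardict.
def _example_add_caller_annotation(cnf, vcf_fpath):
    return add_annotation(cnf, vcf_fpath, 'CALLER', 'vardict',
                          number='1', type_='String', description='Variant caller')
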
def _annotate(cnf, samples):
    varannotate_cmdl = (get_script_cmdline(cnf, 'python', join('scripts', 'post', 'varannotate.py')) +
        ' --sys-cnf ' + cnf.sys_cnf +
        ' --run-cnf ' + cnf.run_cnf +
        ' --project-name ' + cnf.project_name +
        (' --reuse ' if cnf.reuse_intermediate else '') +
        ' --log-dir -' +
        ' --genome ' + cnf.genome.name +
        (' --no-check ' if cnf.no_check else '') +
        (' --qc' if cnf.qc else ' --no-qc') +
        ((' --caller ' + cnf.caller) if cnf.caller else ''))

    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0

    not_submitted_samples = samples
    while not_submitted_samples:
        jobs_to_wait = []
        submitted_samples = []
        reused_samples = []

        for sample in not_submitted_samples:
            if not sample.varannotate_dirpath:
                sample.varannotate_dirpath = join(sample.dirpath, source.varannotate_name)
            if not sample.anno_vcf_fpath:
                sample.anno_vcf_fpath = join(sample.varannotate_dirpath,
                                             add_suffix(basename(sample.vcf), 'anno'))
            output_fpath = sample.anno_vcf_fpath
            if not output_fpath.endswith('.gz'):
                output_fpath += '.gz'
            debug('Checking ' + output_fpath)
            if cnf.reuse_intermediate and isfile(output_fpath) and verify_vcf(output_fpath):
                info('Annotated results ' + output_fpath + ' exist, reusing.')
                reused_samples.append(sample)
                info()
                continue

            work_dir = join(cnf.work_dir, source.varannotate_name + '_' + sample.name)
            j = submit_job(cnf,
                cmdline=varannotate_cmdl +
                    ' --vcf ' + sample.vcf +
                    ' -o ' + sample.varannotate_dirpath +
                    ' -s ' + sample.name +
                    ' --work-dir ' + work_dir +
                    ' --output-file ' + output_fpath,
                job_name='VA_' + cnf.project_name + '_' + sample.name,
                output_fpath=output_fpath,
                stdout_to_outputfile=False,
                work_dir=work_dir)
            if not j.is_done:
                jobs_to_wait.append(j)
            submitted_samples.append(sample)

            if len(jobs_to_wait) >= cnf.threads:
                not_submitted_samples = [s for s in not_submitted_samples
                                         if s not in submitted_samples and s not in reused_samples]
                if not_submitted_samples:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting for them to finish '
                         'before submitting ' + str(len(not_submitted_samples)) + ' more')
                else:
                    info('Submitted the last ' + str(len(jobs_to_wait)) + ' jobs.')
                info()
                break

        info()
        info()
        info('-' * 70)

        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No annotation jobs to submit.')
        info('')
        info('-' * 70)
        info('Finished annotating ' + str(len(jobs_to_wait)) + ' jobs')

        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_vcf(j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed:
                if isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)
                else:
                    err('Job was done, but j.work_dir ' + j.work_dir + ' does not exist')

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [s for s in not_submitted_samples
                                 if s not in submitted_samples and s not in reused_samples]

    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))
    info()

def main(args):
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--vcf', '--var'], dict(
                dest='vcf',
                help='variants to filter')),
            (['--vcf2txt'], dict(
                dest='vcf2txt',
                help='variants in vcf2txt to filter')),
            (['--cohort-freqs'], dict(
                dest='cohort_freqs_fpath',
                help='frequencies of variants in a cohort')),
            (['--qc'], dict(
                dest='qc',
                action='store_true',
                default=True,
                help=SUPPRESS_HELP)),
            (['--no-qc'], dict(
                dest='qc',
                action='store_false',
                help=SUPPRESS_HELP)),
            (['--no-tsv'], dict(
                dest='tsv',
                action='store_false',
                default=True,
                help=SUPPRESS_HELP)),
        ],
        required_keys=['vcf'],
        file_keys=['vcf'],
        key_for_sample_name='vcf',
        proc_name=source.varfilter_name + '_post')

    check_system_resources(cnf, required=['perl'])
    check_genome_resources(cnf)

    if not cnf.output_file:
        cnf.output_file = join(cnf.output_dir, (cnf.caller or 'variants') + '.txt')

    safe_mkdir(dirname(cnf.output_file))
    safe_mkdir(cnf.output_dir)

    if cnf.vcf.endswith('.vcf.gz') or cnf.vcf.endswith('.vcf'):
        verify_vcf(cnf.vcf, is_critical=True)

    if not cnf.vcf2txt:
        vcf2txt_res_fpath = run_vcf2txt(cnf, {cnf.sample: cnf.vcf}, cnf.output_file)
        if not vcf2txt_res_fpath:
            critical('vcf2txt run returned non-0')
        info('Saved vcf2txt output to ' + vcf2txt_res_fpath)
    else:
        cnf.vcf2txt = verify_file(cnf.vcf2txt, is_critical=True)
        info('Input is vcf2txt output, grepping by sample name ' + cnf.sample)
        vcf2txt_res_fpath = cnf.output_file
        with file_transaction(cnf.work_dir, vcf2txt_res_fpath) as tx:
            with open(cnf.vcf2txt) as f, open(tx, 'w') as out:
                for i, l in enumerate(f):
                    if l.strip():
                        if i == 0:
                            out.write(l)
                        else:
                            if l.split('\t')[0] == cnf.sample:
                                out.write(l)
        info('Using vcf2txt from ' + vcf2txt_res_fpath)

    # if is_local():
    #     vardict2mut_pl = get_script_cmdline(cnf, 'perl', join('VarDict', 'vardict2mut.pl'))
    #     info('Running vardict2mut perl')
    #     res = run_vardict2mut(cnf, vcf2txt_res_fpath,
    #         add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix + '_perl'),
    #         vardict2mut_executable=vardict2mut_pl)
    #     if not res:
    #         critical('vardict2mut.pl run returned non-0')

    mut_fpath = run_vardict2mut(cnf, vcf2txt_res_fpath,
                                add_suffix(vcf2txt_res_fpath, variant_filtering.mut_pass_suffix))
    if not mut_fpath:
        err('vardict2mut failed')
    else:
        info('Saved passed mutations to ' + mut_fpath)

    var_s = source.VarSample(cnf.sample, cnf.output_dir)
    var_s.anno_vcf_fpath = cnf.vcf
    var_s.varfilter_dirpath = var_s.dirpath

    ungz_anno_vcf_fpath = var_s.anno_vcf_fpath if not var_s.anno_vcf_fpath.endswith('.gz') \
        else splitext(var_s.anno_vcf_fpath)[0]
    ungz_filt_vcf_fpath = join(cnf.output_dir, add_suffix(basename(ungz_anno_vcf_fpath), 'filt'))
    var_s.filt_vcf_fpath = ungz_filt_vcf_fpath + '.gz'

    var_s.variants_fpath = vcf2txt_res_fpath
    var_s.variants_pass_fpath = add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix)

    ungz_pass_filt_vcf_fpath = add_suffix(ungz_filt_vcf_fpath, 'pass')
    var_s.pass_filt_vcf_fpath = add_suffix(var_s.filt_vcf_fpath, 'pass')

    filt_vcf = write_vcf(cnf, var_s, cnf.output_dir, cnf.caller, vcf2txt_res_fpath, mut_fpath)
    index_vcf(cnf, var_s.name, filt_vcf, cnf.caller)
    index_vcf(cnf, var_s.name, ungz_pass_filt_vcf_fpath, cnf.caller)

    if cnf.qc:
        report = qc.make_report(cnf, var_s.pass_filt_vcf_fpath, var_s)
        qc_dirpath = join(cnf.output_dir, 'qc')
        safe_mkdir(qc_dirpath)
        qc.save_report(cnf, report, var_s, cnf.caller, qc_dirpath, source.varqc_after_name)
        info('Saved QC to ' + qc_dirpath + ' (' + report.html_fpath + ')')
        info('-' * 70)
        info()

    if not cnf['keep_intermediate']:
        shutil.rmtree(cnf['work_dir'])

    info()
    info('*' * 70)
    info('Done filtering ' + var_s.name)

def convert_vardict_txts_to_bcbio_vcfs(cnf, bs, sample, output_dir=None, pass_only=False):
    info('')
    info('Preparing data for ' + sample.name)
    anno_filt_vcf_fpath = sample.find_filt_vcf_by_callername(cnf.caller_name)
    if not anno_filt_vcf_fpath:
        return None, None

    if not output_dir:
        output_dir = cnf.output_dir or os.path.dirname(anno_filt_vcf_fpath)
    output_vcf_fpath = join(output_dir, sample.name + '-' + cnf.caller_name + filt_vcf_ending)
    pass_output_vcf_fpath = add_suffix(output_vcf_fpath, 'pass')
    if cnf.reuse_intermediate and verify_vcf(output_vcf_fpath + '.gz') \
            and verify_vcf(pass_output_vcf_fpath + '.gz'):
        info(output_vcf_fpath + '.gz and ' + pass_output_vcf_fpath + '.gz exist, reusing')
        return output_vcf_fpath + '.gz', pass_output_vcf_fpath + '.gz'

    info('Parsing PASS and REJECT mutations...')
    pass_mut_dict, reject_mut_dict, filter_values = get_mutation_dicts(cnf, bs, sample,
                                                                       pass_only=pass_only)
    sorted_mut_dict = combine_mutations(pass_mut_dict, reject_mut_dict)

    info('')
    info('Writing VCFs')
    vcf_reader = vcf.Reader(open_gzipsafe(anno_filt_vcf_fpath, 'r'))
    vcf_reader = add_keys_to_header(vcf_reader, filter_values)
    with file_transaction(cnf.work_dir, output_vcf_fpath) as filt_tx, \
            file_transaction(cnf.work_dir, pass_output_vcf_fpath) as pass_tx:
        vcf_writer = None
        if not pass_only:
            vcf_writer = vcf.Writer(open(filt_tx, 'w'), template=vcf_reader)
        vcf_pass_writer = vcf.Writer(open(pass_tx, 'w'), template=vcf_reader)
        for key, mut in sorted_mut_dict.items():
            record = get_record_from_vcf(vcf_reader, mut)
            if record:
                if key in pass_mut_dict:
                    record.FILTER = ['PASS']
                    if mut.reason:
                        record.INFO['Reason'] = mut.reason.replace(' ', '_')
                elif pass_only:
                    continue
                elif key in reject_mut_dict:
                    if not mut.reason:
                        continue
                    reject_reason_ids = [filter_descriptions_dict[reason]
                                         if reason in filter_descriptions_dict else reason
                                         for reason in mut.reason.split(' and ')]
                    record.FILTER = [';'.join(reject_reason_ids)]
                if mut.signif:
                    record.INFO['Signif'] = mut.signif
                if mut.status:
                    record.INFO['Status'] = mut.status
                if vcf_writer:
                    vcf_writer.write_record(record)
                if key in pass_mut_dict:
                    vcf_pass_writer.write_record(record)
            else:
                warn('No record was found in ' + anno_filt_vcf_fpath + ' for mutation ' + str(mut))

    output_gzipped_vcf_fpath = None
    if vcf_writer:
        vcf_writer.close()
        output_gzipped_vcf_fpath = bgzip_and_tabix(cnf, output_vcf_fpath)
        info('VCF file for vardict.txt is saved to ' + output_gzipped_vcf_fpath)
    vcf_pass_writer.close()
    output_gzipped_pass_vcf_fpath = bgzip_and_tabix(cnf, pass_output_vcf_fpath)
    info('VCF file for vardict.PASS.txt is saved to ' + output_gzipped_pass_vcf_fpath)
    return output_gzipped_vcf_fpath, output_gzipped_pass_vcf_fpath

def read_samples(args, caller_name=None):
    vcf_fpath_by_sample = OrderedDict()
    bad_vcf_fpaths = []
    info('Reading samples...')

    if len(args) == 1:
        first_fpath = args[0]
        if not first_fpath.endswith('.vcf') and not first_fpath.endswith('.vcf.gz'):
            # TODO: check ##fileformat=VCF ?
            info('First argument file name does not look like VCF, assuming TSV with file names')
            with open(first_fpath) as f:
                for i, l in enumerate(f):
                    fs = l.strip().split('\t')
                    if len(fs) != 2:
                        critical('Line ' + str(i) + ' has ' + str(len(fs)) +
                                 ' fields; expected 2 (sample and vcf_fpath)')
                    sn, vcf_fpath = fs
                    if not verify_file(vcf_fpath):
                        bad_vcf_fpaths.append(vcf_fpath)
                    vcf_fpath_by_sample[sn] = adjust_path(vcf_fpath)
            if bad_vcf_fpaths:
                critical('VCF files that cannot be found, are empty, or are not VCFs: ' +
                         ', '.join(bad_vcf_fpaths))
            info('Done reading ' + str(len(vcf_fpath_by_sample)) + ' samples')
            return vcf_fpath_by_sample

    for arg in args or [os.getcwd()]:
        vcf_fpath = verify_vcf(arg.split(',')[0])
        if not verify_file(vcf_fpath):
            bad_vcf_fpaths.append(vcf_fpath)
        if len(arg.split(',')) > 1:
            sn = arg.split(',')[1]
        else:
            sn = basename(splitext_plus(vcf_fpath)[0])
            if caller_name and sn.endswith('-' + caller_name):
                sn = sn[:-len(caller_name) - 1]
        info('  ' + sn)
        if sn in vcf_fpath_by_sample:
            if vcf_fpath_by_sample[sn] != vcf_fpath:
                warn('Duplicated record ' + sn + ', VCF file is different (existing: ' +
                     vcf_fpath_by_sample[sn] + ', new: ' + vcf_fpath + ')')
            else:
                warn('Duplicated record ' + sn + ', VCF file is the same: ' + vcf_fpath)
        else:
            vcf_fpath_by_sample[sn] = vcf_fpath

    if bad_vcf_fpaths:
        critical('VCF files that cannot be found, are empty, or are not VCFs: ' +
                 ', '.join(bad_vcf_fpaths))
    info('Done reading ' + str(len(vcf_fpath_by_sample)) + ' samples')

    # TODO: read sample names from VCF
    # def get_main_sample(self, main_sample_index=None):
    #     if len(self._sample_indexes) == 0:
    #         return None
    #     if main_sample_index is not None:
    #         return self.samples[main_sample_index]
    #     try:
    #         sample_index = [sname.lower() for sname in self._sample_indexes] \
    #             .index(self.sample_name_from_file.lower())
    #     except ValueError:
    #         return self.samples[0]
    #     else:
    #         return self.samples[sample_index]

    return vcf_fpath_by_sample

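
# For reference, the two input forms read_samples accepts: a single TSV file
# with 'sample_name<TAB>vcf_path' per line, or one or more
# 'vcf_path[,sample_name]' arguments. A hedged usage sketch with an
# illustrative path:
def _example_read_samples():
    # '/data/sample1-vardict.vcf.gz' -> sample name 'sample1-vardict' from the
    # basename, then the '-vardict' caller suffix is stripped -> 'sample1'
    return read_samples(['/data/sample1-vardict.vcf.gz'], caller_name='vardict')
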
def _snpeff(cnf, input_fpath):
    if 'snpeff' not in cnf.annotation or 'snpeff' not in cnf.genome:
        return None, None, None

    step_greetings('SnpEff')

    output_fpath = intermediate_fname(cnf, input_fpath, 'snpEff')
    stats_fpath = join(cnf.work_dir,
        cnf.sample + (('-' + cnf.caller) if cnf.caller else '') + '.snpEff_summary.csv')
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath, stats_fpath, splitext(stats_fpath)[0] + '.genes.txt'

    snpeff = get_java_tool_cmdline(cnf, 'snpeff')

    ref_name = cnf.genome.snpeff.reference or cnf.genome.name
    if ref_name.startswith('hg19') or ref_name.startswith('GRCh37'):
        ref_name = 'GRCh37.75'
    if ref_name.startswith('hg38'):
        ref_name = 'GRCh38.82'

    opts = ''
    if cnf.annotation.snpeff.cancer:
        opts += ' -cancer'

    assert cnf.transcripts_fpath, 'Transcripts for annotation must be specified!'
    verify_file(cnf.transcripts_fpath, 'Transcripts for snpEff -onlyTr', is_critical=True)
    opts += ' -onlyTr ' + cnf.transcripts_fpath + ' '

    db_path = adjust_system_path(cnf.genome.snpeff.data)
    if db_path:
        opts += ' -dataDir ' + db_path
    elif cnf.resources.snpeff.config:
        conf = get_system_path(cnf, cnf.resources.snpeff.config)
        if conf:
            opts += ' -c ' + conf + ' '
        else:
            err('Cannot find snpEff config file ' + str(cnf.resources.snpeff.config))

    if cnf.annotation.snpeff.extra_options:
        opts += ' ' + cnf.annotation.snpeff.extra_options

    if not cnf.no_check:
        info('Removing previous snpEff annotations...')
        res = remove_prev_eff_annotation(cnf, input_fpath)
        if not res:
            err('Could not remove previous snpEff annotations')
            return None, None, None
        input_fpath = res

    snpeff_type = get_snpeff_type(snpeff)
    if snpeff_type == 'old':
        opts += ' -stats ' + stats_fpath + ' -csvStats'
    else:
        opts += ' -csvStats ' + stats_fpath

    cmdline = '{snpeff} eff {opts} -noLog -i vcf -o vcf {ref_name} {input_fpath}'.format(**locals())

    res = None
    for i in range(1, 20):
        try:
            res = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                  exit_on_error=False, stdout_to_outputfile=True, overwrite=True)
        except OSError:
            import traceback, time
            err(traceback.format_exc())
            warn()
            info('Waiting 1 minute')
            time.sleep(60)
            info('Rerunning ' + str(i))
        else:
            break

    output_fpath = verify_vcf(output_fpath, is_critical=True)

    snpeff_summary_html_fpath = 'snpEff_summary.html'
    if isfile(snpeff_summary_html_fpath):
        info('SnpEff created ' + snpeff_summary_html_fpath + ' in the cwd, removing it...')
        try:
            os.remove(snpeff_summary_html_fpath)
        except OSError:
            pass

    if res:
        return output_fpath, stats_fpath, splitext(stats_fpath)[0] + '.genes.txt'
    else:
        return None, None, None

def _snpsift_annotate(cnf, vcf_conf, dbname, input_fpath):
    if not vcf_conf:
        err('No database for ' + dbname + ', skipping.')
        return None

    step_greetings('Annotating with ' + dbname)

    output_fpath = intermediate_fname(cnf, input_fpath, dbname)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    executable = get_java_tool_cmdline(cnf, 'snpsift')
    java = get_system_path(cnf, 'java')
    info('Java version:')
    call(cnf, java + ' -version')
    info()

    db_path = cnf['genome'].get(dbname)
    if not db_path:
        db_path = vcf_conf.get('path')
        if not db_path:
            err('Please provide a path to ' + dbname + ' in the "genomes" section of the '
                'system config. The config is: ' + str(cnf['genome']))
            return None
        verify_file(db_path, is_critical=True)

    annotations = vcf_conf.get('annotations')

    if not cnf.no_check:
        info('Removing previous annotations...')

        def delete_annos(rec):
            for anno in annotations:
                if anno in rec.INFO:
                    del rec.INFO[anno]
            return rec

        if annotations:
            input_fpath = iterate_vcf(cnf, input_fpath, delete_annos, suffix='d')

    anno_line = ''
    if annotations:
        anno_line = '-info ' + ','.join(annotations)

    cmdline = '{executable} annotate -v {anno_line} {db_path} {input_fpath}'.format(**locals())
    output_fpath = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                   stdout_to_outputfile=True, exit_on_error=False, overwrite=True)
    if not output_fpath:
        err('Error: snpsift annotation with ' + dbname + ' resulted in ' + str(output_fpath))
        return output_fpath
    verify_vcf(output_fpath, is_critical=True)

    # f = open(output_fpath)
    # l = f.readline()
    # if 'Cannot allocate memory' in l:
    #     f.close()
    #     f = open(output_fpath)
    #     contents = f.read()
    #     critical('SnpSift failed with memory issue:\n' + contents)
    #     f.close()
    #     return None

    if not cnf.no_check:
        info_pattern = re.compile(
            r'''\#\#INFO=<
                ID=(?P<id>[^,]+),\s*
                Number=(?P<number>-?\d+|\.|[AG]),\s*
                Type=(?P<type>Integer|Float|Flag|Character|String),\s*
                Description="(?P<desc>[^"]*)"
                >''', re.VERBOSE)

        def _fix_after_snpsift(line, i, ctx):
            if not line.startswith('#'):
                if not ctx['met_CHROM']:
                    return None
                line = line.replace(' ', '_')
                assert ' ' not in line
            # elif line.startswith('##INFO=<ID=om'):
            #     line = line.replace(' ', '')
            elif not ctx['met_CHROM'] and line.startswith('#CHROM'):
                ctx['met_CHROM'] = True
            elif line.startswith('##INFO'):
                m = info_pattern.match(line)
                if m:
                    line = '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
                        m.group('id'), m.group('number'), m.group('type'), m.group('desc'))
            return line

        output_fpath = iterate_file(cnf, output_fpath, _fix_after_snpsift, suffix='fx',
                                    ctx=dict(met_CHROM=False))
    return verify_vcf(output_fpath, is_critical=True)

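
# Self-contained sketch of the header repair that _fix_after_snpsift performs:
# SnpSift can leave stray whitespace inside ##INFO header lines, which the
# pattern normalizes. The regex below mirrors info_pattern above; the input
# line is a hypothetical example.
def _example_fix_info_header():
    import re
    pattern = re.compile(
        r'''\#\#INFO=<
            ID=(?P<id>[^,]+),\s*
            Number=(?P<number>-?\d+|\.|[AG]),\s*
            Type=(?P<type>Integer|Float|Flag|Character|String),\s*
            Description="(?P<desc>[^"]*)"
            >''', re.VERBOSE)
    line = '##INFO=<ID=dbNSFP_SIFT_pred,Number=A, Type=Character, Description="SIFT">'
    m = pattern.match(line)
    assert m
    # -> '##INFO=<ID=dbNSFP_SIFT_pred,Number=A,Type=Character,Description="SIFT">'
    return '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
        m.group('id'), m.group('number'), m.group('type'), m.group('desc'))
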
def run_annotators(cnf, vcf_fpath, bam_fpath):
    original_vcf = cnf.vcf

    db_section_by_name = OrderedDict(
        (dbname, cnf.annotation[dbname])
        for dbname in ['dbsnp', 'clinvar', 'cosmic', 'oncomine']
        if dbname in cnf.annotation
        and not cnf.annotation[dbname].get('skip-annotation'))

    # if not cnf.no_check:
    #     to_delete_id_ref = []
    #     if 'dbsnp' in db_section_by_name.keys():
    #         info('Removing IDs from dbsnp as rs*')
    #         to_delete_id_ref.append('rs')
    #     if 'cosmic' in db_section_by_name.keys():
    #         info('Removing IDs from dbsnp as COS*')
    #         to_delete_id_ref.append('COS')
    #
    #     def delete_ids(rec):  # deleting existing dbsnp and cosmic ID annotations
    #         if rec.ID:
    #             if isinstance(rec.ID, basestring):
    #                 if any(rec.ID.startswith(pref) for pref in to_delete_id_ref):
    #                     rec.ID = None
    #             else:
    #                 rec.ID = [id_ for id_ in rec.ID
    #                           if not any(id_.startswith(pref) for pref in to_delete_id_ref)]
    #
    #         if not rec.FILTER:
    #             rec.FILTER = 'PASS'
    #
    #         return rec
    #
    #     info('Removing previous rs* and COS* IDs')
    #     vcf_fpath = iterate_vcf(cnf, vcf_fpath, delete_ids, suffix='delID')

    bcftools = get_system_path(cnf, 'bcftools')

    if not vcf_fpath.endswith('.gz') or not file_exists(vcf_fpath + '.tbi'):
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    cmdl = '{bcftools} annotate --remove ID {vcf_fpath}'
    res = call(cnf, cmdl.format(**locals()),
               output_fpath=add_suffix(rm_gz_ext(vcf_fpath), 'rmid'))
    if res:
        vcf_fpath = res
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    for dbname, dbconf in db_section_by_name.items() + cnf.annotation.get('custom_vcfs', dict()).items():
        step_greetings('Annotating using ' + dbname)
        annotations = ','.join('INFO/' + a for a in dbconf.get('annotations'))
        if dbname in ('cosmic', 'dbsnp'):
            annotations += ',=ID'
        db_fpath = get_db_path(cnf, dbconf, dbname)
        if db_fpath:
            cmdl = '{bcftools} annotate -a ' + db_fpath + ' -c ' + annotations + ' {vcf_fpath}'
            res = call(cnf, cmdl.format(**locals()),
                       output_fpath=add_suffix(rm_gz_ext(vcf_fpath), dbname))
            if res:
                vcf_fpath = res
                vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    verify_vcf(vcf_fpath, is_critical=True)

    if 'dbnsfp' in cnf.annotation:
        res = _snpsift_db_nsfp(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    if 'snpeff' in cnf.annotation:
        res, summary_fpath, genes_fpath = _snpeff(cnf, vcf_fpath)
        if res:
            vcf_fpath = res
            verify_vcf(vcf_fpath, is_critical=True)
            final_summary_fpath = join(cnf.output_dir, basename(summary_fpath))
            final_genes_fpath = join(cnf.output_dir, basename(genes_fpath))
            if isfile(final_summary_fpath):
                os.remove(final_summary_fpath)
            if isfile(final_genes_fpath):
                os.remove(final_genes_fpath)
            if file_exists(summary_fpath):
                shutil.move(summary_fpath, final_summary_fpath)
            if file_exists(genes_fpath):
                shutil.move(genes_fpath, final_genes_fpath)

    if cnf.annotation.get('tracks'):
        track_fpaths = []
        for track_name in cnf.annotation['tracks']:
            if isfile(track_name) and verify_file(track_name):
                track_fpaths.append(track_name)
            elif 'tracks' in cnf['genome'] and cnf['genome']['tracks'] \
                    and track_name in cnf['genome']['tracks']:
                track_fpath = cnf['genome']['tracks'][track_name]
                if verify_file(track_fpath):
                    track_fpaths.append(track_fpath)
        for track_fpath in track_fpaths:
            res = _tracks(cnf, track_fpath, vcf_fpath)
            if res:
                vcf_fpath = res

    step_greetings('Intersection with database VCFs...')
    if 'intersect_with' in cnf.annotation:
        for key, db_fpath in cnf.annotation['intersect_with'].items():
            res = intersect_vcf(cnf, input_fpath=vcf_fpath, db_fpath=db_fpath, key=key)
            if res:
                vcf_fpath = res

    if 'mongo' in cnf.annotation:
        res = _mongo(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    return vcf_fpath