def igvtools_index(cnf, vcf_fpath):
    """Index a VCF file with igvtools.

    Resolves the ``igvtools`` tool through ``get_system_path``; when the
    resolved path is a ``.jar`` the Java command line is obtained from
    ``get_java_tool_cmdline`` instead.

    :param cnf: run configuration passed through to the helper functions.
    :param vcf_fpath: path to the VCF file to index.
    :return: ``vcf_fpath + '.idx'``, or ``None`` when igvtools cannot be
        located. The indexing command is run with ``exit_on_error=False`` and
        its result is not inspected, so the returned path is not a guarantee
        that the index file exists.
    """
    tool = get_system_path(cnf, 'igvtools')
    if not tool:
        err('Warning: no igvtools found, cannot index VCF.')
        return None

    if tool.endswith('.jar'):
        tool = get_java_tool_cmdline(cnf, 'igvtools')
        if tool is None:
            err('Warning: no jar igvtools found, cannot index VCF.')
            return None

    call(cnf, '{0} index {1}'.format(tool, vcf_fpath), exit_on_error=False)

    # Best-effort cleanup of the 'igv.log' file in the current directory.
    log_fname = 'igv.log'
    if exists(log_fname):
        try:
            os.remove(log_fname)
        except OSError:
            pass

    return vcf_fpath + '.idx'
def combine_vcfs(cnf, vcf_fpath_by_sname, combined_vcf_fpath, additional_parameters=''):
    """Merge per-sample VCFs into one file with GATK CombineVariants.

    :param cnf: run configuration; ``cnf.genome.seq`` is passed as ``-R`` and
        ``cnf.reuse_intermediate`` enables reuse of an existing result.
    :param vcf_fpath_by_sname: mapping of sample name to VCF path; entries
        with a falsy path are skipped.
    :param combined_vcf_fpath: path of the (uncompressed) combined VCF.
    :param additional_parameters: extra text inserted into the GATK command.
    :return: the reused ``combined_vcf_fpath + '.gz'``, or whatever
        ``bgzip_and_tabix`` returns for the freshly combined file, or ``None``
        when GATK is missing, there is nothing to combine, or the call fails.
    """
    gatk = get_java_tool_cmdline(cnf, 'gatk')
    if not gatk:
        info('GATK is not found, skipping merging VCFs')
        return None

    cmdl = '{0} -T CombineVariants -R {1} {2}'.format(
        gatk, cnf.genome.seq, additional_parameters)
    cmdl += ''.join(
        ' --variant:' + s_name + ' ' + vcf_fpath
        for s_name, vcf_fpath in vcf_fpath_by_sname.items()
        if vcf_fpath)

    # The presence of a variant argument is detected on the command text.
    if ' --variant:' not in cmdl:
        err('No VCFs to combine')
        return None

    gz_fpath = combined_vcf_fpath + '.gz'
    if cnf.reuse_intermediate and isfile(gz_fpath) and verify_vcf(gz_fpath):
        info(gz_fpath + ' exists, reusing')
        return gz_fpath

    cmdl += ' -o ' + combined_vcf_fpath
    res = call(cnf, cmdl, output_fpath=combined_vcf_fpath,
               stdout_to_outputfile=False, exit_on_error=False)
    if not res:
        warn('Could not join VCFs')
        return None

    info('Joined VCFs, saved into ' + combined_vcf_fpath)

    # Remove a leftover '.tx.idx' file if one is present; failure to delete
    # it is reported but does not abort.
    tx_idx_fpath = combined_vcf_fpath + '.tx.idx'
    if isfile(tx_idx_fpath):
        try:
            os.remove(tx_idx_fpath)
        except OSError:
            err(traceback.format_exc())
            info()

    return bgzip_and_tabix(cnf, combined_vcf_fpath)
def get_trasncripts_fpath(cnf):
    """Return the path to the transcripts list, dumping it from snpEff if needed.

    Resolution order:

    1. ``cnf.transcripts_fpath`` when it passes ``verify_file`` (if it exists
       but fails verification, the file is deleted);
    2. an already dumped ``snpeff_transcripts.txt`` in ``cnf.work_dir``;
    3. a fresh ``snpeff dump`` written to that same path.

    On success for cases 2 and 3 the path is also stored in
    ``cnf.transcripts_fpath``.

    :param cnf: run configuration.
    :return: the transcripts file path, or ``None`` if the dump call returns a
        falsy value.
    """
    configured_fpath = cnf.transcripts_fpath
    if configured_fpath:
        if verify_file(configured_fpath):
            return cnf.transcripts_fpath
        if isfile(configured_fpath):
            os.remove(configured_fpath)

    dump_fpath = join(cnf.work_dir, 'snpeff_transcripts.txt')
    if isfile(dump_fpath) and verify_file(dump_fpath):
        cnf.transcripts_fpath = dump_fpath
        return cnf.transcripts_fpath

    snpeff = get_java_tool_cmdline(cnf, 'snpeff')
    if not snpeff:
        critical('No snpeff or it is incorrect path in system config.')

    # NOTE(review): this value is concatenated as a string here, while _snpeff
    # reads cnf.genome.snpeff.data / .reference -- confirm the config shape.
    db_path = cnf['genome'].get('snpeff')
    data_dir_opt = (' -dataDir ' + db_path) if db_path else ''

    # Drop a stale dump that exists but did not pass verification above.
    if isfile(dump_fpath):
        os.remove(dump_fpath)

    cmdline = '{0} dump {1} -v -txt {2}'.format(
        snpeff, data_dir_opt, cnf.genome.name)
    if call(cnf, cmdline, output_fpath=dump_fpath):
        cnf.transcripts_fpath = dump_fpath
        return cnf.transcripts_fpath
    return None
def _snpsift_db_nsfp(cnf, input_fpath):
    """Annotate a VCF with dbNSFP fields through ``snpsift dbnsfp``.

    The step is skipped (returns ``None``) unless ``'dbnsfp'`` is present in
    both ``cnf.annotation`` and ``cnf.genome``.

    :param cnf: run configuration.
    :param input_fpath: path to the VCF to annotate.
    :return: path of the annotated VCF (the reused intermediate file, or the
        result of ``verify_vcf`` on the new output), or ``None`` when the step
        is skipped, SnpSift or the database is unavailable, or the command
        fails.
    """
    if 'dbnsfp' not in cnf.annotation or 'dbnsfp' not in cnf.genome:
        return None

    step_greetings('DB SNFP')

    output_fpath = intermediate_fname(cnf, input_fpath, 'db_nsfp')
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    executable = get_java_tool_cmdline(cnf, 'snpsift')
    if not executable:
        # Without this guard the command line below would start with 'None'.
        err('SnpSift is not found, skipping DB NSFP annotation.')
        return None

    db_path = cnf['genome']['dbnsfp']
    if not verify_file(db_path, 'DB NSFP file'):
        err('DB NSFP file is incorrect. Skipping.')
        return None

    annotations = cnf.annotation['dbnsfp'].get('annotations') or []
    ann_line = ('-f ' + ','.join(annotations)) if annotations else ''

    cmdline = '{0} dbnsfp {1} -v -db {2} {3}'.format(
        executable, ann_line, db_path, input_fpath)
    if call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                       stdout_to_outputfile=True, exit_on_error=False,
                       overwrite=True):
        return verify_vcf(output_fpath, is_critical=True)
    return None
def _mongo(cnf, input_fpath):
    """Annotate a VCF using the VCFStore.jar Mongo loader.

    The step greeting is printed before checking whether ``'mongo'`` is
    configured in ``cnf.annotation``.

    :param cnf: run configuration; ``cnf.project_name`` is passed as
        ``-project``.
    :param input_fpath: path to the VCF to annotate.
    :return: the output file path when ``call_subprocess`` returns a truthy
        value, otherwise ``None`` (also ``None`` when the step is not
        configured).
    """
    step_greetings('Annotating from Mongo')

    if 'mongo' not in cnf.annotation:
        return None

    jar_rel_path = join('ext_tools', 'mongo_loader', 'VCFStore.jar')
    executable = get_java_tool_cmdline(cnf, jar_rel_path)
    output_fpath = intermediate_fname(cnf, input_fpath, 'mongo')

    cmdline = (
        '{0} -module annotation -inputFile {1} '
        '-outputFile {2} -project {3} '
    ).format(executable, input_fpath, output_fpath, cnf.project_name)

    succeeded = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                stdout_to_outputfile=False,
                                exit_on_error=False)
    return output_fpath if succeeded else None
def _snpeff(cnf, input_fpath):
    """Annotate a VCF with snpEff.

    The step is skipped unless ``'snpeff'`` is present in both
    ``cnf.annotation`` and ``cnf.genome``.

    :param cnf: run configuration.
    :param input_fpath: path to the VCF to annotate.
    :return: a 3-tuple ``(annotated_vcf, stats_csv, genes_txt)``; all three
        are ``None`` when the step is skipped, snpEff is unavailable, previous
        annotations cannot be removed, or the snpEff call does not succeed.
    :raises AssertionError: if ``cnf.transcripts_fpath`` is not set.
    """
    import time
    import traceback

    if 'snpeff' not in cnf.annotation or 'snpeff' not in cnf.genome:
        return None, None, None

    step_greetings('SnpEff')

    output_fpath = intermediate_fname(cnf, input_fpath, 'snpEff')
    caller_suffix = ('-' + cnf.caller) if cnf.caller else ''
    stats_fpath = join(
        cnf.work_dir, cnf.sample + caller_suffix + '.snpEff_summary.csv')
    genes_fpath = splitext(stats_fpath)[0] + '.genes.txt'

    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath, stats_fpath, genes_fpath

    snpeff = get_java_tool_cmdline(cnf, 'snpeff')
    if not snpeff:
        # Without this guard the command line below would start with 'None'.
        err('snpEff is not found, cannot annotate.')
        return None, None, None

    # Map UCSC/GRC build names onto the snpEff database names.
    ref_name = cnf.genome.snpeff.reference or cnf.genome.name
    if ref_name.startswith(('hg19', 'GRCh37')):
        ref_name = 'GRCh37.75'
    elif ref_name.startswith('hg38'):
        ref_name = 'GRCh38.82'

    opts = ''
    if cnf.annotation.snpeff.cancer:
        opts += ' -cancer'

    assert cnf.transcripts_fpath, 'Transcript for annotation must be specified!'
    verify_file(cnf.transcripts_fpath, 'Transcripts for snpEff -onlyTr',
                is_critical=True)
    opts += ' -onlyTr ' + cnf.transcripts_fpath + ' '

    db_path = adjust_system_path(cnf.genome.snpeff.data)
    if db_path:
        opts += ' -dataDir ' + db_path
    elif cnf.resources.snpeff.config:
        conf = get_system_path(cnf, cnf.resources.snpeff.config)
        if conf:
            opts += ' -c ' + conf + ' '
        else:
            err('Cannot find snpEff config file ' +
                str(cnf.resources.snpeff.config))

    # NOTE(review): extra_options is read but nothing is appended, so the
    # configured value never reaches the command line. Kept as-is to avoid
    # changing the executed command -- confirm whether this is intended.
    if cnf.annotation.snpeff.extra_options:
        opts += ''

    if not cnf.no_check:
        info('Removing previous snpEff annotations...')
        cleaned_fpath = remove_prev_eff_annotation(cnf, input_fpath)
        if not cleaned_fpath:
            err('Could not remove preivous snpEff annotations')
            return None, None, None
        input_fpath = cleaned_fpath

    if get_snpeff_type(snpeff) == "old":
        opts += ' -stats ' + stats_fpath + ' -csvStats'
    else:
        opts += ' -csvStats ' + stats_fpath

    cmdline = '{0} eff {1} -noLog -i vcf -o vcf {2} {3}'.format(
        snpeff, opts, ref_name, input_fpath)

    # Start from a clean result so that a run in which every attempt raises
    # OSError is reported as a failure rather than picking up an unrelated
    # earlier value.
    res = None
    for i in range(1, 20):
        try:
            res = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                  exit_on_error=False,
                                  stdout_to_outputfile=True, overwrite=True)
        except OSError:
            err(traceback.format_exc())
            warn()
            info('Waiting 1 minute')
            time.sleep(60)
            info('Rerunning ' + str(i))
        else:
            break

    output_fpath = verify_vcf(output_fpath, is_critical=True)

    # Remove 'snpEff_summary.html' from the current working directory if present.
    snpeff_summary_html_fpath = 'snpEff_summary.html'
    if isfile(snpeff_summary_html_fpath):
        info('SnpEff created ' + snpeff_summary_html_fpath +
             ' in the cwd, removing it...')
        try:
            os.remove(snpeff_summary_html_fpath)
        except OSError:
            pass

    if res:
        return output_fpath, stats_fpath, genes_fpath
    return None, None, None
def _snpsift_annotate(cnf, vcf_conf, dbname, input_fpath):
    """Annotate a VCF from another VCF database through ``snpsift annotate``.

    Unless ``cnf.no_check`` is set, INFO fields listed in the database's
    ``annotations`` are first deleted from the input records, and the SnpSift
    output is post-processed: non-header lines seen before ``#CHROM`` are
    dropped, spaces in data lines are replaced with underscores, and matching
    ``##INFO`` header lines are rewritten in a normalised form.

    :param cnf: run configuration.
    :param vcf_conf: configuration mapping for this database (``path`` and
        ``annotations`` keys are read); a falsy value skips the step.
    :param dbname: database name, used for messages, the intermediate file
        name and the lookup in ``cnf['genome']``.
    :param input_fpath: path to the VCF to annotate.
    :return: the annotated VCF path as returned by ``verify_vcf``; ``None``
        when the step is skipped or no tool/database path is available; the
        falsy result of ``call_subprocess`` when the command fails.
    """
    if not vcf_conf:
        err('No database for ' + dbname + ', skipping.')
        return None

    step_greetings('Annotating with ' + dbname)

    output_fpath = intermediate_fname(cnf, input_fpath, dbname)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    executable = get_java_tool_cmdline(cnf, 'snpsift')
    if not executable:
        # Without this guard the command line below would start with 'None'.
        err('SnpSift is not found, cannot annotate with ' + dbname + '.')
        return None

    # The Java version is logged for diagnostics only; a missing java path
    # must not crash the step on string concatenation.
    java = get_system_path(cnf, 'java')
    if java:
        info('Java version:')
        call(cnf, java + ' -version')
        info()
    else:
        warn('Java is not found, cannot report its version.')

    db_path = cnf['genome'].get(dbname) or vcf_conf.get('path')
    if not db_path:
        err('Please, provide a path to ' + dbname +
            ' in the "genomes" section in the system config. The config is: ' +
            str(cnf['genome']))
        return None
    verify_file(db_path, is_critical=True)

    annotations = vcf_conf.get('annotations')

    if not cnf.no_check:
        info('Removing previous annotations...')

        def delete_annos(rec):
            # Drop INFO fields that are about to be re-annotated.
            for anno in annotations:
                if anno in rec.INFO:
                    del rec.INFO[anno]
            return rec

        if annotations:
            input_fpath = iterate_vcf(cnf, input_fpath, delete_annos, suffix='d')

    anno_line = ('-info ' + ','.join(annotations)) if annotations else ''

    cmdline = '{0} annotate -v {1} {2} {3}'.format(
        executable, anno_line, db_path, input_fpath)
    output_fpath = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                   stdout_to_outputfile=True,
                                   exit_on_error=False, overwrite=True)
    if not output_fpath:
        err('Error: snpsift resulted ' + str(output_fpath) + ' for ' + dbname)
        return output_fpath
    verify_vcf(output_fpath, is_critical=True)

    if not cnf.no_check:
        # Verbose pattern: whitespace between the tokens is ignored.
        info_pattern = re.compile(
            r'''\#\#INFO=<
                ID=(?P<id>[^,]+),\s*
                Number=(?P<number>-?\d+|\.|[AG]),\s*
                Type=(?P<type>Integer|Float|Flag|Character|String),\s*
                Description="(?P<desc>[^"]*)"
                >''', re.VERBOSE)

        def _fix_after_snpsift(line, i, ctx):
            if not line.startswith('#'):
                # Non-header lines before the #CHROM line are dropped.
                if not ctx['met_CHROM']:
                    return None
                line = line.replace(' ', '_')
            elif not ctx['met_CHROM'] and line.startswith('#CHROM'):
                ctx['met_CHROM'] = True
            elif line.startswith('##INFO'):
                m = info_pattern.match(line)
                if m:
                    line = '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
                        m.group('id'), m.group('number'),
                        m.group('type'), m.group('desc'))
            return line

        output_fpath = iterate_file(cnf, output_fpath, _fix_after_snpsift,
                                    suffix='fx', ctx=dict(met_CHROM=False))

    return verify_vcf(output_fpath, is_critical=True)