def leave_main_sample(cnf, vcf_fpath, samplename):
    index = get_sample_column_index(vcf_fpath, samplename)
    if index is None:
        return vcf_fpath

    # def _f1(rec):
    #     rec.samples = [sample_name]
    #     return rec
    #
    # info('Keeping SAMPLE only for the first sample (' + samplename + ')')
    # vcf_fpath = iterate_vcf(cnf, vcf_fpath, _f1, suffix=sample_name)
    # out_fpath = extract_sample(cnf, vcf_fpath, sample_name)
    # info()

    def _f(line, i):
        if line and (line.startswith('#CHROM') or line[0] != '#'):
            ts = line.split('\t')
            return '\t'.join(ts[:9] + [ts[9 + index]])
        return line

    vcf_fpath = iterate_file(cnf, vcf_fpath, _f, suffix='1sm')

    if not verify_file(vcf_fpath):
        err('Error: leave_main_sample did not generate an output file.')
        return None

    return vcf_fpath
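# A hypothetical example of what _f above does, assuming a two-sample VCF
# where samplename matches the second sample column (index == 1):
#
#   in:  #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sampleA sampleB
#   out: #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sampleB
#
# The first 9 columns are the fixed VCF columns; column 9 + index is the
# requested sample, and all other sample columns are dropped.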
def remove_prev_eff_annotation(cnf, input_fpath):
    fields_to_del = ['EFF', 'ANN']

    def proc_line(l, i):
        if l.startswith('##SnpEff'):
            return None

        elif any(f in l for f in fields_to_del):
            if l.startswith('##INFO='):
                try:
                    if l.split('=', 1)[1].split(',', 1)[0].split('=')[1] in fields_to_del:
                        return None
                except IndexError:
                    critical('Incorrect VCF at line: ' + l)

            elif not l.startswith('#'):
                fields = l.split('\t')
                info_line = fields[7]
                info_pairs = [attr.split('=') for attr in info_line.split(';')]
                info_pairs = filter(lambda pair: pair[0] not in fields_to_del, info_pairs)
                info_line = ';'.join('='.join(pair) if len(pair) == 2 and pair[0] not in fields_to_del
                                     else pair[0] for pair in info_pairs)
                fields = fields[:7] + [info_line] + fields[8:]
                return '\t'.join(fields)

        return l

    return iterate_file(cnf, input_fpath, proc_line, suffix='noEFF')
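# A hypothetical before/after for proc_line above on a record line
# (INFO pairs are split on '=' and any EFF/ANN pair is filtered out):
#
#   in  INFO: DP=100;EFF=DOWNSTREAM(...);AF=0.5
#   out INFO: DP=100;AF=0.5
#
# Header lines declaring the removed fields (##INFO=<ID=EFF,...> or
# ##INFO=<ID=ANN,...>) and all ##SnpEff lines are dropped entirely.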
def _tracks(cnf, track_fpath, input_fpath):
    if not verify_file(track_fpath):
        return None

    field_name = splitext_plus(basename(track_fpath))[0]

    step_greetings('Intersecting with ' + field_name)

    output_fpath = intermediate_fname(cnf, input_fpath, field_name)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    toolpath = get_system_path(cnf, 'vcfannotate')
    if not toolpath:
        err('WARNING: Skipping annotation with tracks: vcfannotate '
            'executable not found; you probably need to specify the path in system_config, '
            'or run "load bcbio": . /group/ngs/bin/bcbio-prod.sh')
        return None

    # self.all_fields.append(field_name)

    cmdline = '{toolpath} -b {track_fpath} -k {field_name} {input_fpath}'.format(**locals())

    assert input_fpath
    output_fpath = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                   stdout_to_outputfile=True, overwrite=True)
    if not verify_vcf(output_fpath):
        err('Error: annotating with tracks resulted in ' + str(output_fpath) + ' for ' + track_fpath)
        return output_fpath

    # Set TRUE or FALSE for tracks
    def proc_line(line, i):
        if field_name in line:
            if not line.startswith('#'):
                fields = line.split('\t')
                info_line = fields[7]
                info_pairs = [attr.split('=') for attr in info_line.split(';')]
                info_pairs = [[pair[0], ('TRUE' if pair[1] else 'FALSE')]
                              if pair[0] == field_name and len(pair) > 1
                              else pair for pair in info_pairs]
                info_line = ';'.join('='.join(pair) if len(pair) == 2 else pair[0]
                                     for pair in info_pairs)
                fields = fields[:7] + [info_line] + fields[8:]
                return '\t'.join(fields)
        return line

    assert output_fpath
    output_fpath = iterate_file(cnf, output_fpath, proc_line, suffix='trk')

    return verify_vcf(output_fpath, is_critical=True)
def remove_comments(cnf, bed_fpath):
    def f(l, i):
        if not l.startswith('#'):
            return l
        else:
            return None

    return iterate_file(cnf, bed_fpath, f, suffix='rmcmt')
def fix_vcf_sample_name(cnf, sample_name, vcf_fpath, output_fpath=None):
    output_fpath = output_fpath or intermediate_fname(cnf, vcf_fpath, 'sample')

    def fix_sample_name(l, i):
        if l.startswith('#CHROM'):
            fs = l.split('\t')
            fs[9] = sample_name
            l = '\t'.join(fs)
        elif not l.startswith('#'):
            fs = l.split('\t')
            kvs = fs[7].split(';')
            for j, kv in enumerate(kvs):  # 'j', so as not to shadow the line index 'i'
                if kv.startswith('SAMPLE='):
                    kvs[j] = 'SAMPLE=' + sample_name
            l = '\t'.join(fs[:7]) + '\t' + ';'.join(kvs) + '\t' + '\t'.join(fs[8:])
            # l = re.sub("(?<=SAMPLE=)[^;](?=;)", sample_name, l)
        return l

    fixed_vcf = iterate_file(cnf, vcf_fpath, fix_sample_name, output_fpath=output_fpath)
    return bgzip_and_tabix(cnf, fixed_vcf)
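# A hypothetical example of fix_sample_name above, renaming to 'TumorA':
#
#   header:  #CHROM ... FORMAT  old_name     ->  #CHROM ... FORMAT  TumorA
#   record:  ...;SAMPLE=old_name;DP=80;...   ->  ...;SAMPLE=TumorA;DP=80;...
#
# Only the 10th header column and the SAMPLE= key in INFO are touched;
# a single-sample VCF is assumed (fs[9] is the only sample column renamed).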
def _filter_malformed_fields(cnf, input_fpath):
    step_greetings('Correcting malformed fields...')

    def proc_rec(rec):
        for k, v in rec.INFO.items():
            if isinstance(v, list):
                if v[-1] == '.':
                    rec.INFO[k] = rec.INFO[k][:-1]
                if v[0] == '.':
                    rec.INFO[k] = rec.INFO[k][1:]
        return rec

    def proc_line(line, i):
        if line.startswith('#'):
            return line.replace("' \">", "'\">")  # For vcf-merge
        return line
        # else:
        #     if ',.' in line or '.,' in line:
        #         fields = line.split('\t')
        #         info_line = fields[7]
        #         info_pairs = [attr.split('=') for attr in info_line.split(';')]
        #         new_info_pairs = []
        #         for p in info_pairs:
        #             if len(p) == 2:
        #                 if p[1].endswith(',.'):
        #                     p[1] = p[1][:-2]
        #                 if p[1].startswith('.,'):
        #                     p[1] = p[1][2:]
        #                 new_info_pairs.append('='.join(p))
        #         info_line = ';'.join(new_info_pairs)
        #         fields = fields[:7] + [info_line] + fields[8:]
        #         return '\t'.join(fields)

    info('Correcting INFO fields...')
    output_fpath = iterate_vcf(cnf, input_fpath, proc_rec, suffix='corr')
    info('')

    info('Correcting headers for vcf-merge...')
    output_fpath = iterate_file(cnf, output_fpath, proc_line, suffix='corr_headr')

    return output_fpath
def filter_bed_with_gene_set(cnf, bed_fpath, gene_keys_set, suffix=None):
    def fn(l, i):
        if l:
            fs = l.split('\t')
            if len(fs) < 4:
                return None
            new_gns = []
            c = fs[0]
            for g in fs[3].split(','):
                if (g, c) in gene_keys_set:
                    new_gns.append(g)
            if new_gns:
                return l.replace(fs[3], ','.join(new_gns))

    return iterate_file(cnf, bed_fpath, fn, suffix=suffix or 'filt_genes', check_result=False)
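# A hypothetical example for fn above: gene_keys_set holds (gene, chrom)
# tuples, so with gene_keys_set = {('TP53', 'chr17')} the line
#
#   chr17   7571720   7590868   TP53,WRAP53
#
# keeps only the genes whose (name, chromosome) key is in the set:
#
#   chr17   7571720   7590868   TP53
#
# Lines with fewer than 4 columns, or with no surviving gene, are dropped
# (fn returns None for them).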
def add_annotation(cnf, input_fpath, key, value, number, type_, description):
    step_greetings('Adding annotation...')

    def proc_rec(rec):
        rec.INFO[key] = value
        return rec
    output_fpath = iterate_vcf(cnf, input_fpath, proc_rec)

    info('Adding header meta info...')

    def _add_info_header(l, i):  # it adds an ##INFO header line, not a ##FORMAT one
        if l.startswith('#CHROM'):
            ext_l = '##INFO=<ID={key},Number={number},Type={type_},Description="{desc}">\n'.format(
                key=key, number=number, type_=type_, desc=description)
            return ext_l + l
        return l
    output_fpath = iterate_file(cnf, output_fpath, _add_info_header)

    return verify_vcf(output_fpath, is_critical=True)
def remove_rejected(cnf, input_fpath, output_fpath=None):
    # if not input_fpath.endswith('.gz') or not file_exists(input_fpath + '.tbi'):
    #     input_fpath = bgzip_and_tabix(cnf, input_fpath)

    qual_threshold = _get_qual_threshold(input_fpath)
    info('VCF QUAL threshold is ' + str(qual_threshold))
    if qual_threshold > cnf.variant_filtering.min_q_mean:
        info('Requested QUAL threshold is ' + str(cnf.variant_filtering.min_q_mean) +
             ', which is lower than in the VCF, so keeping records with FILTER=q' + str(qual_threshold))

    def fn(l, i):
        if l.startswith('#'):
            return l
        fs = l.split('\t')
        if fs[6] == 'q' + str(qual_threshold) and qual_threshold > cnf.variant_filtering.min_q_mean:
            fs[6] = 'PASS'
        if fs[6] == 'PASS':
            return '\t'.join(fs)  # return the (possibly rescued) record with FILTER rewritten
        else:
            return None

    return iterate_file(cnf, input_fpath, fn, suffix='pass')
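# A worked example for fn above, assuming the VCF was filtered at q30
# (qual_threshold == 30) while the requested min_q_mean is 20:
#
#   FILTER=PASS  ->  kept as-is
#   FILTER=q30   ->  rescued: 30 > 20, so FILTER is rewritten to PASS
#   anything else non-PASS (e.g. a hypothetical REJECT)  ->  dropped
#
# The threshold values here are hypothetical, chosen only to illustrate
# the direction of the comparison.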
def prep_bed_for_seq2c(cnf, bed_fpath):
    info()
    info('Doing some Seq2C specific preparation of the BED file...')

    cols = count_bed_cols(bed_fpath)

    seq2c_bed = None
    if 8 > cols > 4:
        seq2c_bed = cut(cnf, bed_fpath, 4)
    elif cols > 8:
        seq2c_bed = cut(cnf, bed_fpath, 8)
    else:
        seq2c_bed = bed_fpath

    if cols >= 4:
        # Removing regions with no gene annotation
        def f(l, i):
            if l.split('\t')[3].strip() == '.':
                return None
            else:
                return l

        seq2c_bed = iterate_file(cnf, seq2c_bed, f, suffix='filt')

    info('Done: ' + seq2c_bed)
    return seq2c_bed
def prepare_beds(cnf, features_bed=None, target_bed=None, seq2c_bed=None):
    """Verifies and normalizes the input BED files (comments removed, columns cut,
    sorted, gene-annotated where needed), and returns the tuple
    (features_bed, features_no_genes_bed, target_bed, seq2c_bed)."""
    if features_bed is None and target_bed is None:
        warn('No input target BED, and no features BED in the system config specified. '
             'Not making detailed per-gene reports.')
        # return None, None, None, None

    if target_bed:
        target_bed = verify_bed(target_bed, is_critical=True)

    if seq2c_bed:
        seq2c_bed = verify_bed(seq2c_bed, is_critical=True)

    if features_bed:
        features_bed = verify_bed(features_bed, is_critical=True)

    # if features_bed and target_bed and abspath(features_bed) == abspath(target_bed):
    #     warn('Same file used for exons and amplicons: ' + features_bed)

    # Features
    features_no_genes_bed = None
    if features_bed:
        # info()
        # info('Merging regions within genes...')
        # exons_bed = group_and_merge_regions_by_gene(cnf, exons_bed, keep_genes=True)
        #
        # info()
        # info('Sorting exons by (chrom, gene name, start)')
        # exons_bed = sort_bed(cnf, exons_bed)

        info()
        info('Filtering the features bed file to have only non-gene and non-transcript records...')
        features_no_genes_bed = intermediate_fname(cnf, features_bed, 'no_genes')
        call(cnf, 'grep -vw Gene ' + features_bed + ' | grep -vw Transcript',
             output_fpath=features_no_genes_bed)

    ori_target_bed_path = target_bed
    if target_bed:
        info()
        info('Removing comments in target...')
        target_bed = remove_comments(cnf, target_bed)

        info()
        info('Cutting target down to the first 4 columns (cut -f1,2,3,4)...')
        target_bed = cut(cnf, target_bed, 4)

        info()
        info('Sorting target...')
        target_bed = sort_bed(cnf, target_bed)

        cols = count_bed_cols(target_bed)
        if cnf.reannotate or cols < 4:
            info()
            if not features_bed:
                critical(str(cols) + ' columns (less than 4), and no features to annotate regions '
                         '(please make sure you have set the "features" key in the corresponding genome section '
                         '(' + cnf.genome.name + ') in ' + cnf.sys_cnf + ')')
            info('cnf.reannotate is ' + str(cnf.reannotate) + ', and the number of columns in the target BED is ' +
                 str(cols) + '. Annotating target with the gene names from the "features" file ' +
                 features_bed + '...')
            target_bed = annotate_target(cnf, target_bed)

    def remove_no_anno(l, i):
        if l.split('\t')[3].strip() == '.':
            return None
        else:
            return l

    if (not seq2c_bed and target_bed) or (seq2c_bed and seq2c_bed == ori_target_bed_path):
        info('Seq2C bed: removing regions with no gene annotation')
        seq2c_bed = target_bed
        seq2c_bed = iterate_file(cnf, seq2c_bed, remove_no_anno, suffix='filt')

    elif seq2c_bed:
        info()
        info('Removing comments in seq2c bed...')
        seq2c_bed = remove_comments(cnf, seq2c_bed)
        info()
        info('Sorting seq2c bed...')
        seq2c_bed = sort_bed(cnf, seq2c_bed)

        cols = count_bed_cols(seq2c_bed)
        if cols < 4:
            info()
            info('The number of columns in the SV bed is ' + str(cols) +
                 '. Annotating amplicons with gene names...')
            seq2c_bed = annotate_target(cnf, seq2c_bed)
        elif 8 > cols > 4:
            seq2c_bed = cut(cnf, seq2c_bed, 4)
        elif cols > 8:
            seq2c_bed = cut(cnf, seq2c_bed, 8)

        info('Filtering out non-annotated entries in the seq2c bed')
        seq2c_bed = iterate_file(cnf, seq2c_bed, remove_no_anno, suffix='filt')

    else:
        seq2c_bed = verify_bed(cnf.genome.cds)

    if target_bed:
        info()
        # info('Merging amplicons...')
        # target_bed = group_and_merge_regions_by_gene(cnf, target_bed, keep_genes=False)

        info('Sorting target by (chrom, gene name, start)')
        target_bed = sort_bed(cnf, target_bed)

    return features_bed, features_no_genes_bed, target_bed, seq2c_bed
def _snpsift_annotate(cnf, vcf_conf, dbname, input_fpath):
    if not vcf_conf:
        err('No database for ' + dbname + ', skipping.')
        return None

    step_greetings('Annotating with ' + dbname)

    output_fpath = intermediate_fname(cnf, input_fpath, dbname)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    executable = get_java_tool_cmdline(cnf, 'snpsift')
    java = get_system_path(cnf, 'java')
    info('Java version:')
    call(cnf, java + ' -version')
    info()

    db_path = cnf['genome'].get(dbname)
    if not db_path:
        db_path = vcf_conf.get('path')
        if not db_path:
            err('Please provide a path to ' + dbname + ' in the "genomes" section of the system config. '
                'The config is: ' + str(cnf['genome']))
            return None
        verify_file(db_path, is_critical=True)

    annotations = vcf_conf.get('annotations')

    if not cnf.no_check:
        info('Removing previous annotations...')

        def delete_annos(rec):
            for anno in annotations:
                if anno in rec.INFO:
                    del rec.INFO[anno]
            return rec

        if annotations:
            input_fpath = iterate_vcf(cnf, input_fpath, delete_annos, suffix='d')

    anno_line = ''
    if annotations:
        anno_line = '-info ' + ','.join(annotations)

    cmdline = '{executable} annotate -v {anno_line} {db_path} {input_fpath}'.format(**locals())
    output_fpath = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                   stdout_to_outputfile=True, exit_on_error=False, overwrite=True)
    if not output_fpath:
        err('Error: SnpSift annotation resulted in ' + str(output_fpath) + ' for ' + dbname)
        return output_fpath
    verify_vcf(output_fpath, is_critical=True)

    # f = open(output_fpath)
    # l = f.readline()
    # if 'Cannot allocate memory' in l:
    #     f.close()
    #     f = open(output_fpath)
    #     contents = f.read()
    #     critical('SnpSift failed with memory issue:\n' + contents)
    #     f.close()
    #     return None

    if not cnf.no_check:
        info_pattern = re.compile(r'''\#\#INFO=<
            ID=(?P<id>[^,]+),\s*
            Number=(?P<number>-?\d+|\.|[AG]),\s*
            Type=(?P<type>Integer|Float|Flag|Character|String),\s*
            Description="(?P<desc>[^"]*)"
            >''', re.VERBOSE)

        def _fix_after_snpsift(line, i, ctx):
            if not line.startswith('#'):
                if not ctx['met_CHROM']:
                    return None
                line = line.replace(' ', '_')
                assert ' ' not in line

            # elif line.startswith('##INFO=<ID=om'):
            #     line = line.replace(' ', '')

            elif not ctx['met_CHROM'] and line.startswith('#CHROM'):
                ctx['met_CHROM'] = True

            elif line.startswith('##INFO'):
                m = info_pattern.match(line)
                if m:
                    line = '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
                        m.group('id'), m.group('number'), m.group('type'), m.group('desc'))

            return line

        output_fpath = iterate_file(cnf, output_fpath, _fix_after_snpsift,
                                    suffix='fx', ctx=dict(met_CHROM=False))

    return verify_vcf(output_fpath, is_critical=True)
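# A hypothetical illustration of the header cleanup in _fix_after_snpsift:
# SnpSift can emit ##INFO lines with stray whitespace between the fields, e.g.
#
#   ##INFO=<ID=dbNSFP_MutationTaster_pred,Number=., Type=String,Description="...">
#
# info_pattern tolerates that whitespace (the field name is hypothetical
# here), and the line is rewritten in the canonical form:
#
#   ##INFO=<ID=dbNSFP_MutationTaster_pred,Number=.,Type=String,Description="...">
#
# Record lines seen before #CHROM are dropped, and spaces inside records
# are replaced with underscores.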
def intersect_vcf(cnf, input_fpath, db_fpath, key):
    vcf_fpath = input_fpath
    db_fpath = verify_file(db_fpath)
    if not db_fpath:
        return None

    info('Intersecting with ' + db_fpath + ', writing key ' + str(key))
    info('Preparing db...')

    def _add_info_flag(l, i):
        if l.startswith('#'):
            return l
        fs = l.split('\t')
        info_col, ft_keys, ft_vals = fs[-3], fs[-2], fs[-1]
        ft_dict = dict(zip(ft_keys.split(':'), ft_vals.split(':')))
        for ann in ['DP', 'MQ']:
            val = ft_dict.get(ann, None)
            if val:
                # ft_keys[key.replace('.', '_') + '_' + ann] = val
                # del ft_keys[ann]
                info_col += ';' + key.replace('.', '_') + '_' + ann + '=' + val
        # ft_items = ft_dict.items()
        # ft_keys = [k for k, v in ft_items]
        # ft_vals = [v for k, v in ft_items]
        # return '\t'.join(fs[:-2]) + '\t' + ':'.join(ft_keys) + '\t' + ':'.join(ft_vals)
        return '\t'.join(fs[:-3]) + '\t' + info_col + '\t' + '\t'.join(fs[-2:])

    # rec.FORMAT[key.replace('.', '_') + '_DP'] = rec.genotype(key.split('.')[0])['DP']
    # rec.INFO[key.replace('.', '_') + '_MQ'] = rec.genotype(key.split('.')[0])['MQ']
    # return rec
    # db_fpath = iterate_vcf(cnf, db_fpath, _add_info_flag, suffix='INFO_FLAGS')
    db_fpath = iterate_file(cnf, db_fpath, _add_info_flag, suffix='INFO_FLAGS')

    info('Adding header meta info...')

    def _add_header(l, i):
        if l.startswith('#CHROM'):
            ext_l = ''
            for ann in ['DP', 'MQ']:
                ext_l += ('##INFO=<ID=' + key.replace('.', '_') + '_' + ann +
                          ',Number=1,Type=Integer,Description="description">\n')
            return ext_l + l
        return l

    db_fpath = iterate_file(cnf, db_fpath, _add_header, suffix='INFO_HEADER')

    # out_fpath = add_suffix(db_fpath, 'HEADERS')
    # if cnf.reuse_intermediate and verify_file(out_fpath, silent=True):
    #     info(out_fpath + ' exists, reusing')
    # else:
    #     reader = vcf_parser.Reader(open(db_fpath))
    #     for k in 'DP', 'MQ':
    #         k = k + '_' + key.replace('.', '_')
    #         reader.infos[k] = _Info(id=k, num=1, type='Integer', desc=k + ' ' + key)
    #
    #     with file_transaction(cnf.work_dir, out_fpath) as tx:
    #         recs = []
    #         cnt = 0
    #         with open(tx, 'w') as f:
    #             writer = vcf_parser.Writer(f, reader)
    #             while True:
    #                 cnt += 1
    #                 rec = next(reader, None)
    #                 if rec is None:
    #                     break
    #                 recs.append(rec)
    #                 if cnt % 1000000 == 0:
    #                     info('Written ' + str(cnt) + ' lines')
    #                     writer.write_records(recs)
    #                     recs = []
    #         writer.write_records(recs)
    # db_fpath = out_fpath

    db_fpath = bgzip_and_tabix(cnf, db_fpath)

    info('Annotating using this db...')
    vcf_conf = {
        'path': db_fpath,
        'annotations': [key.replace('.', '_') + '_DP',
                        key.replace('.', '_') + '_MQ'],
    }
    vcf_fpath = _snpsift_annotate(cnf, vcf_conf, key, vcf_fpath)

    info('Moving INFO to FORMAT...')

    def _move_info_to_format(l, i):
        if l.startswith('#'):
            return l
        fs = l.split('\t')
        info_dict = dict([kv.split('=') if '=' in kv else (kv, True)
                          for kv in fs[7].split(';')])
        ft_keys = fs[8].split(':')
        all_ft_vals = [ft_vals.split(':') for ft_vals in fs[9:]]
        ft_dicts = [OrderedDict(zip(ft_keys, ft_vals)) for ft_vals in all_ft_vals]
        for ann in ['DP', 'MQ']:
            k = key.replace('.', '_') + '_' + ann
            for ft_dict in ft_dicts:
                ft_dict[k] = info_dict.get(k, '.')
        all_ft_vals = []
        for ft_dict in ft_dicts:
            ft_items = ft_dict.items()
            ft_keys = [k for k, v in ft_items]
            all_ft_vals.append([v for k, v in ft_items])
        l = '\t'.join(fs[:8]) + '\t' + ':'.join(ft_keys)
        for ft_vals in all_ft_vals:
            l += '\t' + ':'.join(ft_vals)
        return l

    vcf_fpath = iterate_file(cnf, vcf_fpath, _move_info_to_format, suffix='FORMAT_FLAGS')

    info('Adding FORMAT header meta info...')

    def _add_format_header(l, i):
        if l.startswith('#CHROM'):
            ext_l = ''
            ext_l += ('##FORMAT=<ID=' + key.replace('.', '_') +
                      '_DP,Number=1,Type=Integer,Description="Number of high-quality bases">\n')
            ext_l += ('##FORMAT=<ID=' + key.replace('.', '_') +
                      '_MQ,Number=1,Type=Integer,Description="Average mapping quality">\n')
            return ext_l + l
        return l

    vcf_fpath = iterate_file(cnf, vcf_fpath, _add_format_header, suffix='FORMAT_HEADER')

    info()
    if vcf_fpath:
        info('Renaming ' + vcf_fpath + ' -> ' + input_fpath)
        os.rename(vcf_fpath, input_fpath)
    else:
        warn('Intersection with ' + key + ' didn\'t work')

    return input_fpath
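# A hypothetical example of the INFO -> FORMAT move in _move_info_to_format
# above, with key='db.name' (so the prefix is 'db_name'):
#
#   in:  ... db_name_DP=52;db_name_MQ=60  GT:AD                        0/1:10,12
#   out: ... db_name_DP=52;db_name_MQ=60  GT:AD:db_name_DP:db_name_MQ  0/1:10,12:52:60
#
# Each sample's FORMAT dict gains the two keys, with '.' substituted when
# the corresponding INFO value is absent; the INFO column itself is left
# unchanged.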