def flatten_vcf_data(fields, vcfdata, flatdata=None):
    """Flatten parsed VCF records into one row per (sample, position).

    Each record in ``vcfdata`` is a dict keyed by VCF column name.  Records
    on ``chr7`` are first converted with ``cftr.hg19_to_CFTR``; every record
    is then annotated in place (``POS``, ``ID``, ``hg19_coordinates``,
    ``hg19_ID``).  For every sample whose genotype is neither ``0/0`` nor
    ``./.`` a copy of the record is stored in
    ``flatdata[sample][poskey]`` with per-sample columns added.

    Args:
        fields: VCF column names; the first nine are the fixed columns and
            the remainder are treated as sample names.
        vcfdata: iterable of record dicts (mutated in place, see above).
        flatdata: optional mapping to add rows to.  It must create the inner
            dict on first access (e.g. ``defaultdict(dict)``).  When omitted
            a fresh ``defaultdict(dict)`` is created for this call; earlier
            versions shared one default instance between calls, so callers
            that want to accumulate across calls must pass the mapping
            explicitly.

    Returns:
        ``(flatdata, newfields)`` where ``newfields`` is the ordered list of
        output column names.  'ROI (Y/N)' and 'c./p.(AnnoVar)' are listed
        but not filled in here -- presumably a later step sets them.
    """
    if flatdata is None:
        flatdata = defaultdict(dict)
    infofields = list(fields[:9])
    samples = fields[9:]

    for d in vcfdata:
        if d['CHROM'] == 'chr7':
            (d['CHROM'], pos) = cftr.hg19_to_CFTR(d['CHROM'], d['POS'])
            d['POS'] = str(pos)
        poskey = create_pos_key(d['CHROM'], d['POS'])
        # Keep only identifiers that start with 'rs'; '-' when there are none.
        dbsnp = [rs for rs in d['ID'].split(';') if rs.startswith('rs')]
        d['hg19_coordinates'] = "{}:{}".format(
            *cftr.CFTR_to_hg19(d['CHROM'], d['POS']))
        d['ID'] = ';'.join(dbsnp) if dbsnp else '-'
        d['hg19_ID'] = '-'

        if not samples:
            # Nothing to emit for this record; skip the per-record parsing
            # below, which needs the FORMAT column.
            continue

        # FORMAT and INFO are per-record, so parse them once, not per sample.
        sfields = d['FORMAT'].split(':')
        # maxsplit=1 keeps INFO values that themselves contain '=' intact.
        idata = dict(v.split('=', 1) for v in d['INFO'].split(';')
                     if '=' in v)

        for sample in samples:
            sdata = dict(zip(sfields, d[sample].split(':')))
            if sdata['GT'] in ('0/0', './.'):
                continue

            sample_row = d.copy()
            sample_row['Sample ID'] = sample
            gtvals = [int(i) for i in sdata['GT'].split('/')]

            # Report only the ALT alleles this sample carries, ordered by
            # allele index so the output is deterministic (joining a set of
            # strings gave an arbitrary order).
            altchoices = [d['REF']] + d['ALT'].split(',')
            alt_indices = sorted(set(i for i in gtvals if i > 0))
            sample_row['ALT'] = ','.join(altchoices[i] for i in alt_indices)

            sample_row['hom/het'] = 'hom' if gtvals[0] == gtvals[1] else 'het'
            # The helper receives the sample data with its original GT.
            (sample_row['DP'], sample_row['AF']) = DPcalculate_AF(
                sdata, d['INFO'])
            sample_row['QD'] = idata.get('QD', '')
            sample_row['FS'] = idata.get('FS', '')

            # Renumber GT against the reduced ALT list written above.  Any
            # genotype containing the reference allele is '0/1', including
            # '1/0', which used to be mislabelled '1/2'.
            if 0 in gtvals:
                sdata['GT'] = '0/1'
            elif gtvals[0] == gtvals[1]:
                sdata['GT'] = '1/1'
            else:
                sdata['GT'] = '1/2'
            sample_row['format_values'] = ':'.join(sdata[f] for f in sfields)
            flatdata[sample][poskey] = sample_row

    newfields = (['Sample ID'] + infofields +
                 ['format_values', 'hom/het', 'ROI (Y/N)', 'c./p.(AnnoVar)',
                  'DP', 'AF', 'QD', 'FS', 'hg19_coordinates'])
    return (flatdata, newfields)
def get_annovar_data(annovar_files):
    """Load annovar annotations keyed by position.

    ``annovar_files`` must contain one path ending in ``.variant_function``
    and one ending in ``.exonic_variant_function`` (the first match of each
    is used).  Rows of the second file are attached to rows of the first via
    their leading ``line<N>`` column (1-based index into the non-blank lines
    of the first file).

    Args:
        annovar_files: list of file paths; a falsy value returns ``{}``.

    Returns:
        A ``defaultdict(dict)`` mapping ``create_pos_key(chrom, pos)`` to
        ``{annotation: label}``.  When a row has exonic data the second
        remaining column (trailing commas stripped) is the key and the first
        is the value; otherwise the variant_function row is used only if its
        second column contains ':'.  (Presumably these are the annovar
        gene/transcript annotation and the region/function label -- confirm
        against the annovar output format.)

    Exits the process with status 1 (after a message on stderr) when a
    required file is missing from the list or is not a regular file.
    """
    if not annovar_files:
        return {}
    sys.stderr.write("\nParsing annovar data {}\n".format(annovar_files))

    afiles = []
    for ext in ('.variant_function', '.exonic_variant_function'):
        files = [f for f in annovar_files if f.endswith(ext)]
        if not files:
            sys.stderr.write(" Could not find annovar {} file\n".format(ext))
            sys.exit(1)
        afiles.append(files[0])
        if not os.path.isfile(files[0]):
            sys.stderr.write(" Not a valid file: {}\n".format(files[0]))
            sys.exit(1)

    # Each entry is [variant_function_columns, exonic_columns, ...]; the
    # exonic part is appended below when present.
    sys.stderr.write(" Reading {}\n".format(afiles[0]))
    with open(afiles[0], 'r') as fh:
        data_rows = [[line.rstrip().split("\t")] for line in fh
                     if len(line) > 1]
    sys.stderr.write(" Read {} lines\n".format(len(data_rows)))

    sys.stderr.write(" Reading {}\n".format(afiles[1]))
    numdat = 0
    with open(afiles[1], 'r') as fh:
        for line in fh:
            # str.split never returns an empty list, so blank lines must be
            # detected on the raw text; otherwise int('') below would raise.
            if not line.strip():
                continue
            d = line.rstrip().split("\t")
            tag = d.pop(0)
            # First column looks like 'line<N>'; drop the literal prefix.
            linenum = int(tag[4:] if tag.startswith('line') else tag)
            data_rows[linenum - 1].append(d)
            numdat += 1
    sys.stderr.write(" Read {} lines\n".format(numdat))

    annovar_data = defaultdict(dict)
    for row in data_rows:
        # Rows with more than 10 columns take chrom/pos from columns 10-11,
        # shorter rows from columns 2-3.  NOTE(review): presumably the wide
        # rows carry the original VCF columns after the annovar ones.
        if len(row[0]) > 10:
            (chrom_hg19, pos_hg19) = row[0][10:12]
        else:
            (chrom_hg19, pos_hg19) = row[0][2:4]
        (chrom, pos) = cftr.hg19_to_CFTR(chrom_hg19, pos_hg19)
        poskey = create_pos_key(chrom, pos)
        if len(row) > 1:  # has exonic_variant_function data
            annovar_data[poskey][row[1][1].rstrip(',')] = row[1][0]
        elif ':' in row[0][1]:
            annovar_data[poskey][row[0][1]] = row[0][0]
    return annovar_data
def flatten_vcf_data(fields, vcfdata, flatdata=None):
    """Flatten parsed VCF records into one row per (sample, position).

    Each record in ``vcfdata`` is a dict keyed by VCF column name.  Records
    on ``chr7`` are first converted with ``cftr.hg19_to_CFTR``; every record
    is then annotated in place (``POS``, ``ID``, ``hg19_coordinates``,
    ``hg19_ID``).  For every sample whose genotype is neither ``0/0`` nor
    ``./.`` a copy of the record is stored in
    ``flatdata[sample][poskey]`` with per-sample columns added.

    Args:
        fields: VCF column names; the first nine are the fixed columns and
            the remainder are treated as sample names.
        vcfdata: iterable of record dicts (mutated in place, see above).
        flatdata: optional mapping to add rows to.  It must create the inner
            dict on first access (e.g. ``defaultdict(dict)``).  When omitted
            a fresh ``defaultdict(dict)`` is created for this call; earlier
            versions shared one default instance between calls, so callers
            that want to accumulate across calls must pass the mapping
            explicitly.

    Returns:
        ``(flatdata, newfields)`` where ``newfields`` is the ordered list of
        output column names.  'ROI (Y/N)' and 'c./p.(AnnoVar)' are listed
        but not filled in here -- presumably a later step sets them.
    """
    if flatdata is None:
        flatdata = defaultdict(dict)
    infofields = list(fields[:9])
    samples = fields[9:]

    for d in vcfdata:
        if d['CHROM'] == 'chr7':
            (d['CHROM'], pos) = cftr.hg19_to_CFTR(d['CHROM'], d['POS'])
            d['POS'] = str(pos)
        poskey = create_pos_key(d['CHROM'], d['POS'])
        # Keep only identifiers that start with 'rs'; '-' when there are none.
        dbsnp = [rs for rs in d['ID'].split(';') if rs.startswith('rs')]
        d['hg19_coordinates'] = "{}:{}".format(
            *cftr.CFTR_to_hg19(d['CHROM'], d['POS']))
        d['ID'] = ';'.join(dbsnp) if dbsnp else '-'
        d['hg19_ID'] = '-'

        if not samples:
            # Nothing to emit for this record; skip the per-record parsing
            # below, which needs the FORMAT column.
            continue

        # FORMAT is per-record, so split it once rather than per sample.
        sfields = d['FORMAT'].split(':')

        for sample in samples:
            sdata = dict(zip(sfields, d[sample].split(':')))
            if sdata['GT'] in ('0/0', './.'):
                continue

            # Built per sample on purpose: the dict is handed to external
            # helpers, so each row gets its own copy in case they modify it.
            # maxsplit=1 keeps INFO values that themselves contain '=' intact.
            idata = dict(v.split('=', 1) for v in d['INFO'].split(';')
                         if '=' in v)

            sample_row = d.copy()
            sample_row['Sample ID'] = sample
            gtvals = [int(i) for i in sdata['GT'].split('/')]
            sample_row['hom/het'] = 'hom' if gtvals[0] == gtvals[1] else 'het'
            (sample_row['alt_AF'], sample_row['DP']) = calculate_AF(idata)
            # NOTE(review): presumably fills the 'QA/AO' column listed in
            # newfields below -- confirm against add_avg_base_quality.
            add_avg_base_quality(sample_row, idata)
            sample_row['format_values'] = ':'.join(sdata[f] for f in sfields)
            flatdata[sample][poskey] = sample_row

    newfields = (['Sample ID'] + infofields +
                 ['format_values', 'hom/het', 'ROI (Y/N)', 'c./p.(AnnoVar)',
                  'DP', 'alt_AF', 'QA/AO', 'hg19_coordinates'])
    return (flatdata, newfields)