Ejemplo n.º 1
0
def flatten_vcf_data(fields, vcfdata, flatdata=None):
    """Flatten parsed VCF records into one row per (sample, position).

    Args:
        fields: List of VCF column names. The first nine are treated as the
            fixed columns and everything after them as sample names.
        vcfdata: Iterable of dicts, one per VCF record, keyed by column name.
            Each record is modified in place ('CHROM', 'POS', 'ID',
            'hg19_coordinates' and 'hg19_ID' are rewritten or added).
        flatdata: Optional mapping ``sample -> {poskey: row}`` to accumulate
            into. A new ``defaultdict(dict)`` is created when omitted.

    Returns:
        Tuple ``(flatdata, newfields)`` where ``newfields`` is the list of
        output column names.

    Samples whose genotype is '0/0' or './.' produce no row.
    """
    # A fresh container per call: the former ``defaultdict(dict)`` default was
    # built once at definition time and shared (and grown) by every call that
    # did not pass ``flatdata`` explicitly.
    if flatdata is None:
        flatdata = defaultdict(dict)
    infofields = fields[:9]
    samples = fields[9:]
    for d in vcfdata:
        if d['CHROM'] == 'chr7':
            (d['CHROM'], pos) = cftr.hg19_to_CFTR(d['CHROM'], d['POS'])
            d['POS'] = str(pos)
        poskey = create_pos_key(d['CHROM'], d['POS'])
        # Keep only the identifiers that start with 'rs'.
        dbsnp = [rs for rs in d['ID'].split(';') if rs.startswith('rs')]
        d['hg19_coordinates'] = "{}:{}".format(
            *cftr.CFTR_to_hg19(d['CHROM'], d['POS']))
        d['ID'] = ';'.join(dbsnp) if dbsnp else '-'
        d['hg19_ID'] = '-'
        if not samples:
            continue
        # FORMAT and INFO are per-record, so parse them once rather than once
        # per sample. Splitting on the first '=' only keeps INFO values that
        # themselves contain '=' from breaking the dict construction.
        sfields = d['FORMAT'].split(':')
        idata = dict(v.split('=', 1) for v in d['INFO'].split(';')
                     if '=' in v)
        for sample in samples:
            sdata = dict(zip(sfields, d[sample].split(':')))
            if sdata['GT'] in ('0/0', './.'):
                continue
            sample_row = d.copy()
            sample_row['Sample ID'] = sample
            gtvals = [int(i) for i in sdata['GT'].split('/')]
            altchoices = [d['REF']] + d['ALT'].split(',')
            # De-duplicate the called non-reference alleles while keeping
            # genotype order; joining a set made the column order vary
            # between runs for multi-allelic calls.
            altvals = []
            for i in gtvals:
                if i > 0 and altchoices[i] not in altvals:
                    altvals.append(altchoices[i])
            sample_row['ALT'] = ','.join(altvals)
            sample_row['hom/het'] = 'hom' if gtvals[0] == gtvals[1] else 'het'
            (sample_row['DP'], sample_row['AF']) = DPcalculate_AF(sdata,
                                                   d['INFO'])
            sample_row['QD'] = idata.get('QD', '')
            sample_row['FS'] = idata.get('FS', '')
            # Renumber the genotype relative to the alleles kept in this
            # row's ALT column (done after DPcalculate_AF has seen the
            # original GT).
            if gtvals[0] == 0:
                sdata['GT'] = '0/1'
            elif gtvals[0] == gtvals[1]:
                sdata['GT'] = '1/1'
            else:
                sdata['GT'] = '1/2'
            sample_row['format_values'] = ':'.join(sdata[f] for f in sfields)
            # setdefault works for both defaultdict and plain dict
            # accumulators supplied by the caller.
            flatdata.setdefault(sample, {})[poskey] = sample_row
    # NOTE(review): 'ROI (Y/N)' and 'c./p.(AnnoVar)' are listed here but not
    # populated in this function; presumably filled in by a later step.
    newfields = ['Sample ID'] + infofields + [
        'format_values', 'hom/het', 'ROI (Y/N)', 'c./p.(AnnoVar)',
        'DP', 'AF', 'QD', 'FS', 'hg19_coordinates',
    ]
    return (flatdata, newfields)
Ejemplo n.º 2
0
def get_annovar_data(annovar_files):
    """Load ANNOVAR annotations keyed by position.

    Args:
        annovar_files: Collection of file paths. It must contain one path
            ending in '.variant_function' and one ending in
            '.exonic_variant_function'.

    Returns:
        An empty dict when ``annovar_files`` is empty; otherwise a
        ``defaultdict(dict)`` mapping a position key to a dict built from
        two columns of the annotation rows.

    Exits the process with status 1 (after writing to stderr) when either
    file is missing from the list or is not a regular file.
    """
    if not annovar_files:
        return {}
    sys.stderr.write("\nParsing annovar data {}\n".format(annovar_files))
    exonic_ext = '.exonic_variant_function'
    afiles = []
    for ext in ('.variant_function', exonic_ext):
        # '.exonic_variant_function' also ends with '.variant_function', so
        # the plain lookup must exclude exonic files explicitly; otherwise
        # the file chosen depended on the order of ``annovar_files``.
        files = [f for f in annovar_files
                 if f.endswith(ext)
                 and (ext == exonic_ext or not f.endswith(exonic_ext))]
        if not files:
            sys.stderr.write("  Could not find annovar {} file\n".format(ext))
            sys.exit(1)
        afiles.append(files[0])
        if not os.path.isfile(files[0]):
            sys.stderr.write("  Not a valid file: {}\n".format(files[0]))
            sys.exit(1)
    sys.stderr.write("  Reading {}\n".format(afiles[0]))
    with open(afiles[0], 'r') as fh:
        # Each entry is a one-element list holding the variant_function
        # columns; exonic rows are appended to it below.
        data_rows = [[line.rstrip().split("\t")] for line in fh
                     if len(line) > 1]
    sys.stderr.write("    Read {} lines\n".format(len(data_rows)))
    sys.stderr.write("  Reading {}\n".format(afiles[1]))
    with open(afiles[1], 'r') as fh:
        numdat = 0
        for line in fh:
            # ''.split("\t") yields [''] (truthy), so blank lines have to be
            # detected on the text itself, not on the split result.
            if not line.strip():
                continue
            d = line.rstrip().split("\t")
            token = d.pop(0)
            # The first column is 'line<N>'; drop the literal prefix rather
            # than using lstrip, which strips a set of characters.
            if token.startswith('line'):
                token = token[len('line'):]
            linenum = int(token)
            # <N> is 1-based and indexes the rows read from the first file.
            data_rows[linenum - 1].append(d)
            numdat += 1
        sys.stderr.write("    Read {} lines\n".format(numdat))
    annovar_data = defaultdict(dict)
    for row in data_rows:
        # NOTE(review): rows with more than 10 columns take chrom/pos from
        # columns 10-11 instead of 2-3; presumably the original input columns
        # appended by ANNOVAR -- confirm against the ANNOVAR invocation.
        if len(row[0]) > 10:
            (chrom_hg19, pos_hg19) = row[0][10:12]
        else:
            (chrom_hg19, pos_hg19) = row[0][2:4]
        (chrom, pos) = cftr.hg19_to_CFTR(chrom_hg19, pos_hg19)
        poskey = create_pos_key(chrom, pos)
        if len(row) > 1:  # has exonic_variant_function data
            annovar_data[poskey][row[1][1].rstrip(',')] = row[1][0]
        elif ':' in row[0][1]:
            annovar_data[poskey][row[0][1]] = row[0][0]
    return annovar_data
Ejemplo n.º 3
0
def flatten_vcf_data(fields, vcfdata, flatdata=None):
    """Flatten parsed VCF records into one row per (sample, position).

    Args:
        fields: List of VCF column names. The first nine are treated as the
            fixed columns and everything after them as sample names.
        vcfdata: Iterable of dicts, one per VCF record, keyed by column name.
            Each record is modified in place ('CHROM', 'POS', 'ID',
            'hg19_coordinates' and 'hg19_ID' are rewritten or added).
        flatdata: Optional mapping ``sample -> {poskey: row}`` to accumulate
            into. A new ``defaultdict(dict)`` is created when omitted.

    Returns:
        Tuple ``(flatdata, newfields)`` where ``newfields`` is the list of
        output column names.

    Samples whose genotype is '0/0' or './.' produce no row.
    """
    # A fresh container per call: the former ``defaultdict(dict)`` default was
    # built once at definition time and shared (and grown) by every call that
    # did not pass ``flatdata`` explicitly.
    if flatdata is None:
        flatdata = defaultdict(dict)
    infofields = fields[:9]
    samples = fields[9:]
    # Extra columns inserted into the output header before
    # 'hg19_coordinates'; only 'QA/AO' is added (after the loop).
    ifields = []
    for d in vcfdata:
        if d['CHROM'] == 'chr7':
            (d['CHROM'], pos) = cftr.hg19_to_CFTR(d['CHROM'], d['POS'])
            d['POS'] = str(pos)
        poskey = create_pos_key(d['CHROM'], d['POS'])
        # Keep only the identifiers that start with 'rs'.
        dbsnp = [rs for rs in d['ID'].split(';') if rs.startswith('rs')]
        d['hg19_coordinates'] = "{}:{}".format(
            *cftr.CFTR_to_hg19(d['CHROM'], d['POS']))
        d['ID'] = ';'.join(dbsnp) if dbsnp else '-'
        d['hg19_ID'] = '-'
        if not samples:
            continue
        # FORMAT is per-record, so split it once rather than once per sample.
        sfields = d['FORMAT'].split(':')
        for sample in samples:
            sdata = dict(zip(sfields, d[sample].split(':')))
            if sdata['GT'] in ('0/0', './.'):
                continue
            # Parsed per sample on purpose: the dict is handed to helpers
            # defined elsewhere, so each row gets its own copy. Splitting on
            # the first '=' only keeps values containing '=' from breaking
            # the dict construction.
            idata = dict(v.split('=', 1) for v in d['INFO'].split(';')
                         if '=' in v)
            sample_row = d.copy()
            sample_row['Sample ID'] = sample
            gtvals = [int(i) for i in sdata['GT'].split('/')]
            sample_row['hom/het'] = 'hom' if gtvals[0] == gtvals[1] else 'het'
            (sample_row['alt_AF'], sample_row['DP']) = calculate_AF(idata)
            # NOTE(review): presumably this fills the 'QA/AO' column listed
            # in the header below -- confirm in add_avg_base_quality.
            add_avg_base_quality(sample_row, idata)
            sample_row['format_values'] = ':'.join(sdata[f] for f in sfields)
            # setdefault works for both defaultdict and plain dict
            # accumulators supplied by the caller.
            flatdata.setdefault(sample, {})[poskey] = sample_row
    ifields.extend(['QA/AO'])
    # NOTE(review): 'ROI (Y/N)' and 'c./p.(AnnoVar)' are listed here but not
    # populated in this function; presumably filled in by a later step.
    newfields = (['Sample ID'] + infofields
                 + ['format_values', 'hom/het', 'ROI (Y/N)',
                    'c./p.(AnnoVar)', 'DP', 'alt_AF']
                 + ifields + ['hg19_coordinates'])
    return (flatdata, newfields)