def get_variant_from_vcf_fields(vcf_fields, alt_allele_pos):
    """
    Get a basic variant from vcf_fields, for allele given by alt_allele_pos

    Args:
        vcf_fields: sequence of string column values from one VCF row.
            Indexes 0-4 are read as chrom, pos, id, ref and the
            comma-separated alt alleles.
        alt_allele_pos: index into the comma-split alt column that selects
            which alt allele to build the variant for.

    Returns:
        A Variant built from the minimal representation of (xpos, ref, alt),
        with the extras 'alt_allele_pos' and 'orig_alt_alleles' set, and
        vcf_id set when the id column is non-empty and not '.'.
        None if genomeloc.valid_pos rejects the (chrom, pos) pair.
    """
    raw_chrom = vcf_fields[0]
    # Only a leading 'chr' counts as an existing prefix; a substring test
    # would skip prefixing any contig that merely contains 'chr'.
    chrom = raw_chrom if raw_chrom.startswith('chr') else 'chr' + raw_chrom
    pos = int(vcf_fields[1])

    # if we can't get a genomic location, just ignore it and print a message
    # humans will ignore too
    # obviously need a better way to approach this
    if not genomeloc.valid_pos(chrom, pos):
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print("ERROR: could not figure out coordinates for %s:%d...maybe a nonstandard chromosome?" % (chrom, pos))
        return None

    ref = vcf_fields[3]
    orig_alt_alleles = vcf_fields[4].split(',')
    alt = orig_alt_alleles[alt_allele_pos]

    xpos = genomeloc.get_single_location(chrom, pos)
    xpos, ref, alt = get_minimal_representation(xpos, ref, alt)

    variant = Variant(xpos, ref, alt)
    # Keep the original multi-allelic context alongside the reduced allele.
    variant.set_extra('alt_allele_pos', alt_allele_pos)
    variant.set_extra('orig_alt_alleles', orig_alt_alleles)

    vcf_id = vcf_fields[2]
    if vcf_id and vcf_id != '.':
        variant.vcf_id = vcf_id

    return variant
def get_variant_from_vcf_fields(vcf_fields, alt_allele_pos):
    """
    Get a basic variant from vcf_fields, for allele given by alt_allele_pos

    Args:
        vcf_fields: sequence of string column values from one VCF row.
            Indexes 0-4 are read as chrom, pos, id, ref and the
            comma-separated alt alleles.
        alt_allele_pos: index into the comma-split alt column that selects
            which alt allele to build the variant for.

    Returns:
        A Variant built from the minimal representation of (xpos, ref, alt),
        with the extras 'alt_allele_pos' and 'orig_alt_alleles' set, and
        vcf_id set when the id column is non-empty and not '.'.
        None if genomeloc.valid_pos rejects the (chrom, pos) pair.
    """
    raw_chrom = vcf_fields[0]
    # Only a leading 'chr' counts as an existing prefix; a substring test
    # would skip prefixing any contig that merely contains 'chr'.
    chrom = raw_chrom if raw_chrom.startswith('chr') else 'chr' + raw_chrom
    pos = int(vcf_fields[1])

    # if we can't get a genomic location, just ignore it and print a message
    # humans will ignore too
    # obviously need a better way to approach this
    if not genomeloc.valid_pos(chrom, pos):
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print("ERROR: could not figure out coordinates for %s:%d...maybe a nonstandard chromosome?" % (chrom, pos))
        return None

    ref = vcf_fields[3]
    orig_alt_alleles = vcf_fields[4].split(',')
    alt = orig_alt_alleles[alt_allele_pos]

    xpos = genomeloc.get_single_location(chrom, pos)
    xpos, ref, alt = get_minimal_representation(xpos, ref, alt)

    variant = Variant(xpos, ref, alt)
    # Keep the original multi-allelic context alongside the reduced allele.
    variant.set_extra('alt_allele_pos', alt_allele_pos)
    variant.set_extra('orig_alt_alleles', orig_alt_alleles)

    vcf_id = vcf_fields[2]
    if vcf_id and vcf_id != '.':
        variant.vcf_id = vcf_id

    return variant
def get_exac_af(chrom, pos, ref, alt):
    """
    Look up ExAC allele frequencies for the variant (chrom, pos, ref, alt).

    Records are fetched from exac_vcf in a window around pos. Each alt allele
    of each record is reduced with get_minimal_representation and compared
    with the query's (xpos, ref, alt). If several alleles match, an error is
    printed and the last match is used.

    Returns:
        (global_af, pop_max_af, pop_max_population), or (None, None, None)
        when no ExAC allele matches. global_af is AC_Adj / AN_Adj (0 when
        AN_Adj is 0). pop_max_af is the largest per-population AC / AN among
        populations with AN > 0, and pop_max_population is that population's
        code. If no population has AN > 0 these stay at -1 and None.
    """
    populations = ['AMR', 'EAS', 'FIN', 'NFE', 'SAS', 'AFR']

    contig = chrom.replace("chr", "")
    query_xpos = genomeloc.get_single_location(chrom, pos)
    # Pad the search window by the combined length of both alleles.
    padding = len(ref) + len(alt)

    # NOTE(review): pos - padding can be negative for positions near the
    # start of a contig; confirm the reader tolerates that.
    hit_record = None
    hit_alt_index = None
    for record in exac_vcf.fetch(contig, pos - padding, pos + padding):
        record_xpos = genomeloc.get_xpos(record.CHROM, record.POS)
        for alt_index, record_alt in enumerate(record.ALT):
            min_xpos, min_ref, min_alt = get_minimal_representation(
                record_xpos, str(record.REF), str(record_alt))
            if min_xpos != query_xpos or min_ref != ref or min_alt != alt:
                continue
            if hit_record is not None:
                print("ERROR: multiple exac variants match the variant: %s %s %s %s" % (chrom, pos, ref, alt))
            # Later matches overwrite earlier ones.
            hit_record = record
            hit_alt_index = alt_index

    if hit_record is None:
        return None, None, None

    info = hit_record.INFO

    # Track the population with the highest allele frequency.
    pop_max_af = -1
    pop_max_population = None
    for population in populations:
        allele_number = info['AN_' + population]
        if not (allele_number > 0):
            continue
        frequency = info['AC_' + population][hit_alt_index] / float(allele_number)
        if frequency > pop_max_af:
            pop_max_af = frequency
            pop_max_population = population

    adjusted_allele_number = info['AN_Adj']
    if adjusted_allele_number == 0:
        # With no adjusted alleles called, the adjusted count must be zero too.
        assert float(info['AC_Adj'][hit_alt_index]) == 0
        global_af = 0
    else:
        global_af = float(info['AC_Adj'][hit_alt_index]) / float(adjusted_allele_number)

    return global_af, pop_max_af, pop_max_population
def get_exac_af(chrom, pos, ref, alt):
    """
    Look up ExAC allele frequencies for the variant (chrom, pos, ref, alt).

    Records are fetched from exac_vcf in a window around pos. Each alt allele
    of each record is reduced with get_minimal_representation and compared
    with the query's (xpos, ref, alt). If several alleles match, an error is
    printed and the last match is used.

    Returns:
        (global_af, pop_max_af, pop_max_population), or (None, None, None)
        when no ExAC allele matches. global_af is AC_Adj / AN_Adj (0 when
        AN_Adj is 0). pop_max_af is the largest per-population AC / AN among
        populations with AN > 0, and pop_max_population is that population's
        code. If no population has AN > 0 these stay at -1 and None.
    """
    populations = ['AMR', 'EAS', 'FIN', 'NFE', 'SAS', 'AFR']

    contig = chrom.replace("chr", "")
    query_xpos = genomeloc.get_single_location(chrom, pos)
    # Pad the search window by the combined length of both alleles.
    padding = len(ref) + len(alt)

    # NOTE(review): pos - padding can be negative for positions near the
    # start of a contig; confirm the reader tolerates that.
    hit_record = None
    hit_alt_index = None
    for record in exac_vcf.fetch(contig, pos - padding, pos + padding):
        record_xpos = genomeloc.get_xpos(record.CHROM, record.POS)
        for alt_index, record_alt in enumerate(record.ALT):
            min_xpos, min_ref, min_alt = get_minimal_representation(
                record_xpos, str(record.REF), str(record_alt))
            if min_xpos != query_xpos or min_ref != ref or min_alt != alt:
                continue
            if hit_record is not None:
                print("ERROR: multiple exac variants match the variant: %s %s %s %s" % (chrom, pos, ref, alt))
            # Later matches overwrite earlier ones.
            hit_record = record
            hit_alt_index = alt_index

    if hit_record is None:
        return None, None, None

    info = hit_record.INFO

    # Track the population with the highest allele frequency.
    pop_max_af = -1
    pop_max_population = None
    for population in populations:
        allele_number = info['AN_' + population]
        if not (allele_number > 0):
            continue
        frequency = info['AC_' + population][hit_alt_index] / float(allele_number)
        if frequency > pop_max_af:
            pop_max_af = frequency
            pop_max_population = population

    adjusted_allele_number = info['AN_Adj']
    if adjusted_allele_number == 0:
        # With no adjusted alleles called, the adjusted count must be zero too.
        assert float(info['AC_Adj'][hit_alt_index]) == 0
        global_af = 0
    else:
        global_af = float(info['AC_Adj'][hit_alt_index]) / float(adjusted_allele_number)

    return global_af, pop_max_af, pop_max_population