# Command-line script: for every variant in the VCF given as the first
# argument, print one tab-separated line with xpos, ref, alt and the value
# returned by get_aaf() for that variant.
import sys

from xbrowse.parsers.vcf_stuff import iterate_vcf_path
from xbrowse.utils import get_aaf

if __name__ == '__main__':
    if len(sys.argv) < 2:
        # fail with a readable message instead of an IndexError traceback
        sys.exit("usage: %s <vcf_path>" % sys.argv[0])

    for variant in iterate_vcf_path(sys.argv[1], genotypes=True):
        # a parenthesized single-argument print produces the same output under
        # Python 2 and is also valid Python 3 syntax
        print('\t'.join([
            str(variant.xpos),
            variant.ref,
            variant.alt,
            str(get_aaf(variant)),
        ]))
def load_population(self, population):
    """
    Extract the allele frequencies of one reference population from its data
    source and load them into the annotator.

    `population` is a dict describing the source. Every frequency is stored by
    calling self._add_population_frequency(xpos, ref, alt, population['slug'], freq).
    population['file_type'] selects the parser:

      'vcf'                    VCF with genotypes at population['file_path'];
                               the frequency is whatever get_aaf(variant) returns.
      'sites_vcf'              sites VCF; the frequency is read from the INFO key
                               population.get('vcf_info_key', 'AF'). When that key
                               contains "popmax" and population['name'] contains
                               "1000 Genomes", the maximum of the five per-population
                               *_AF fields is used instead.
      'esp_vcf_dir'            every file in population['dir_path'] is parsed with
                               get_variants_from_esp_file(); the stored value is
                               variant[population['counts_key']].
      'counts_file'            tab separated: chrom (without 'chr') / pos / ref / alt /
                               numerator / denominator; rows with a zero
                               denominator are skipped.
      'xbrowse_freq_file'      tab separated: xpos / ref / alt / freq.
      'tsv_file'               one header line, then tab separated:
                               chrom / pos / ref / alt / freq.
      'sites_vcf_with_counts'  sites VCF; freq = AC / AN read from the INFO keys
                               population['ac_info_key'] and population['an_info_key'].

    Paths ending in '.gz' or '.bgz' are opened with gzip. Any other file_type
    is silently ignored. Returns None.
    """

    def open_source(path):
        """Open `path` (gzip-aware); return (handle, file to call tell() on)."""
        if path.endswith(('.gz', '.bgz')):
            handle = gzip.open(path)
            # progress is measured on the underlying compressed file so that
            # tell() is comparable with the on-disk size given to the progress bar
            return handle, handle.fileobj
        handle = open(path)
        return handle, handle

    def info_float(extras, key, allele_idx):
        """Return the allele_idx-th comma separated value of INFO `key` as a
        float, or 0.0 when the variant does not carry that key."""
        raw = extras.get(key)
        if raw is None:
            return 0.0
        return float(raw.split(',')[allele_idx])

    file_type = population['file_type']

    if file_type == 'vcf':
        path = population['file_path']
        vcf_file, position_file = open_source(path)
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False):
                progress.update(position_file.tell())
                freq = get_aaf(variant)
                self._add_population_frequency(
                    variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            vcf_file.close()

    elif file_type == 'sites_vcf':
        path = population['file_path']
        vcf_file, position_file = open_source(path)
        try:
            meta_key = population.get('vcf_info_key', 'AF')
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading sites vcf: {}'.format(population['slug']))

            is_1kg_popmax = "popmax" in meta_key.lower() and (
                "1000 Genomes" in population["name"])
            if is_1kg_popmax:
                # 1000 Genomes has no popmax field of its own; it is derived
                # from the per-population fields declared in the VCF header as:
                ##INFO=<ID=EAS_AF,Number=A,Type=Float,Description="Allele frequency in the EAS populations calculated from AC and AN, in the range (0,1)">
                ##INFO=<ID=EUR_AF,Number=A,Type=Float,Description="Allele frequency in the EUR populations calculated from AC and AN, in the range (0,1)">
                ##INFO=<ID=AFR_AF,Number=A,Type=Float,Description="Allele frequency in the AFR populations calculated from AC and AN, in the range (0,1)">
                ##INFO=<ID=AMR_AF,Number=A,Type=Float,Description="Allele frequency in the AMR populations calculated from AC and AN, in the range (0,1)">
                ##INFO=<ID=SAS_AF,Number=A,Type=Float,Description="Allele frequency in the SAS populations calculated from AC and AN, in the range (0,1)">
                meta_fields = ["EAS_AF", "EUR_AF", "AFR_AF", "AMR_AF", "SAS_AF"]
            else:
                meta_fields = [meta_key]

            for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=meta_fields):
                progress.update(position_file.tell())
                allele_idx = variant.extras['alt_allele_pos']
                if is_1kg_popmax:
                    # the flag computed once above is used here; re-deriving it
                    # from a loop variable that shadowed meta_key made every
                    # variant after the first fall back to SAS_AF only
                    freq = max(
                        [0.0] + [info_float(variant.extras, field, allele_idx)
                                 for field in meta_fields])
                else:
                    freq = info_float(variant.extras, meta_key, allele_idx)
                self._add_population_frequency(
                    variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            vcf_file.close()

    #
    # Directory of per-chromosome VCFs that ESP publishes
    #
    elif file_type == 'esp_vcf_dir':
        for filename in os.listdir(population['dir_path']):
            file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
            esp_file = open(file_path)
            try:
                progress = get_progressbar(
                    os.path.getsize(file_path),
                    'Loading ESP file: {}'.format(filename))
                for variant in get_variants_from_esp_file(esp_file):
                    progress.update(esp_file.tell())
                    self._add_population_frequency(
                        variant['xpos'],
                        variant['ref'],
                        variant['alt'],
                        population['slug'],
                        variant[population['counts_key']])
            finally:
                esp_file.close()

    #
    # text file of allele counts, as Monkol has been using for the joint calling data
    #
    elif file_type == 'counts_file':
        path = population['file_path']
        counts_file, position_file = open_source(path)
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(position_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = 'chr' + fields[0]
                pos = int(fields[1])
                xpos = genomeloc.get_single_location(chrom, pos)
                ref = fields[2]
                alt = fields[3]
                if int(fields[5]) == 0:
                    # a zero denominator carries no frequency information
                    continue
                freq = float(fields[4]) / float(fields[5])
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            counts_file.close()

    # this is now the canonical allele frequency file -
    # tab separated file with xpos / ref / alt / freq
    elif file_type == 'xbrowse_freq_file':
        path = population['file_path']
        counts_file, position_file = open_source(path)
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(position_file.tell())
                fields = line.strip('\n').split('\t')
                xpos = int(fields[0])
                ref = fields[1]
                alt = fields[2]
                freq = float(fields[3])
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            counts_file.close()

    elif file_type == 'tsv_file':
        path = population['file_path']
        freq_file, position_file = open_source(path)
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading population: {}'.format(population['slug']))
            # the first line is a header, echoed for the operator and not parsed
            header = next(freq_file)
            print("Header: " + header)
            for line in freq_file:
                progress.update(position_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = fields[0]
                pos = int(fields[1])
                ref = fields[2]
                alt = fields[3]
                freq = float(fields[4])
                xpos = genomeloc.get_single_location(chrom, pos)
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            freq_file.close()

    elif file_type == 'sites_vcf_with_counts':
        path = population['file_path']
        vcf_file, position_file = open_source(path)
        try:
            ac_info_key = population['ac_info_key']
            an_info_key = population['an_info_key']
            # for popmax keys each allele may have a different AN value from a
            # different population; otherwise a single AN applies to the site
            an_per_allele = "popmax" in ac_info_key.lower()

            progress = get_progressbar(
                os.path.getsize(path),
                'Loading sites vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(
                    vcf_file, meta_fields=[ac_info_key, an_info_key]):
                progress.update(position_file.tell())

                alt_allele_pos = variant.extras['alt_allele_pos']

                # Unparseable records are reported and skipped (best effort),
                # so the broad catch is deliberate.
                try:
                    ac = int(
                        variant.extras.get(ac_info_key).split(',')
                        [alt_allele_pos].replace("NA", "0"))
                except Exception as e:
                    print("Couldn't parse AC value %s from %s: %s (%s)" % (
                        alt_allele_pos, ac_info_key, variant.extras, e))
                    continue

                try:
                    an_index = alt_allele_pos if an_per_allele else 0
                    an = int(
                        variant.extras.get(an_info_key).split(',')
                        [an_index].replace("NA", "0"))
                except Exception as e:
                    print("Couldn't parse AN value %s from %s: %s (%s)" % (
                        alt_allele_pos, an_info_key, variant.extras, e))
                    continue

                if an == 0:
                    freq = 0.0
                else:
                    freq = float(ac) / an
                self._add_population_frequency(
                    variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            # this branch previously never closed the file
            vcf_file.close()
def load_population(self, population):
    """
    Extract the allele frequencies of one reference population from its data
    source and load them into the annotator.

    `population` is a dict describing the source. Every frequency is stored by
    calling self._add_population_frequency(xpos, ref, alt, population['slug'], freq).
    population['file_type'] selects the parser:

      'vcf'                    VCF with genotypes at population['file_path'];
                               the frequency is whatever get_aaf(variant) returns.
      'sites_vcf'              sites VCF; the frequency is read from the INFO key
                               population.get('vcf_info_key', 'AF'). When that key
                               contains "popmax" and population['name'] contains
                               "1000 Genomes", the maximum of the five per-population
                               *_AF fields is used instead.
      'esp_vcf_dir'            every file in population['dir_path'] is parsed with
                               get_variants_from_esp_file(); the stored value is
                               variant[population['counts_key']].
      'counts_file'            tab separated: chrom (without 'chr') / pos / ref / alt /
                               numerator / denominator; rows with a zero
                               denominator are skipped.
      'xbrowse_freq_file'      tab separated: xpos / ref / alt / freq.
      'tsv_file'               one header line, then tab separated:
                               chrom / pos / ref / alt / freq.
      'sites_vcf_with_counts'  sites VCF; freq = AC / AN read from the INFO keys
                               population['ac_info_key'] and population['an_info_key'].

    Paths ending in '.gz' or '.bgz' are opened with gzip. Any other file_type
    is silently ignored. Returns None.
    """

    def open_source(path):
        """Open `path` (gzip-aware); return (handle, file to call tell() on)."""
        if path.endswith(('.gz', '.bgz')):
            handle = gzip.open(path)
            # progress is measured on the underlying compressed file so that
            # tell() is comparable with the on-disk size given to the progress bar
            return handle, handle.fileobj
        handle = open(path)
        return handle, handle

    def info_float(extras, key, allele_idx):
        """Return the allele_idx-th comma separated value of INFO `key` as a
        float, or 0.0 when the variant does not carry that key."""
        raw = extras.get(key)
        if raw is None:
            return 0.0
        return float(raw.split(',')[allele_idx])

    file_type = population['file_type']

    if file_type == 'vcf':
        path = population['file_path']
        vcf_file, position_file = open_source(path)
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False):
                progress.update(position_file.tell())
                freq = get_aaf(variant)
                self._add_population_frequency(
                    variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            vcf_file.close()

    elif file_type == 'sites_vcf':
        path = population['file_path']
        vcf_file, position_file = open_source(path)
        try:
            meta_key = population.get('vcf_info_key', 'AF')
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading sites vcf: {}'.format(population['slug']))

            is_1kg_popmax = "popmax" in meta_key.lower() and (
                "1000 Genomes" in population["name"])
            if is_1kg_popmax:
                # 1000 Genomes has no popmax field of its own; it is derived
                # from the per-population fields declared in the VCF header as:
                ##INFO=<ID=EAS_AF,Number=A,Type=Float,Description="Allele frequency in the EAS populations calculated from AC and AN, in the range (0,1)">
                ##INFO=<ID=EUR_AF,Number=A,Type=Float,Description="Allele frequency in the EUR populations calculated from AC and AN, in the range (0,1)">
                ##INFO=<ID=AFR_AF,Number=A,Type=Float,Description="Allele frequency in the AFR populations calculated from AC and AN, in the range (0,1)">
                ##INFO=<ID=AMR_AF,Number=A,Type=Float,Description="Allele frequency in the AMR populations calculated from AC and AN, in the range (0,1)">
                ##INFO=<ID=SAS_AF,Number=A,Type=Float,Description="Allele frequency in the SAS populations calculated from AC and AN, in the range (0,1)">
                meta_fields = ["EAS_AF", "EUR_AF", "AFR_AF", "AMR_AF", "SAS_AF"]
            else:
                meta_fields = [meta_key]

            for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=meta_fields):
                progress.update(position_file.tell())
                allele_idx = variant.extras['alt_allele_pos']
                if is_1kg_popmax:
                    # the flag computed once above is used here; re-deriving it
                    # from a loop variable that shadowed meta_key made every
                    # variant after the first fall back to SAS_AF only
                    freq = max(
                        [0.0] + [info_float(variant.extras, field, allele_idx)
                                 for field in meta_fields])
                else:
                    freq = info_float(variant.extras, meta_key, allele_idx)
                self._add_population_frequency(
                    variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            vcf_file.close()

    #
    # Directory of per-chromosome VCFs that ESP publishes
    #
    elif file_type == 'esp_vcf_dir':
        for filename in os.listdir(population['dir_path']):
            file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
            esp_file = open(file_path)
            try:
                progress = get_progressbar(
                    os.path.getsize(file_path),
                    'Loading ESP file: {}'.format(filename))
                for variant in get_variants_from_esp_file(esp_file):
                    progress.update(esp_file.tell())
                    self._add_population_frequency(
                        variant['xpos'],
                        variant['ref'],
                        variant['alt'],
                        population['slug'],
                        variant[population['counts_key']])
            finally:
                esp_file.close()

    #
    # text file of allele counts, as Monkol has been using for the joint calling data
    #
    elif file_type == 'counts_file':
        path = population['file_path']
        counts_file, position_file = open_source(path)
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(position_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = 'chr' + fields[0]
                pos = int(fields[1])
                xpos = genomeloc.get_single_location(chrom, pos)
                ref = fields[2]
                alt = fields[3]
                if int(fields[5]) == 0:
                    # a zero denominator carries no frequency information
                    continue
                freq = float(fields[4]) / float(fields[5])
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            counts_file.close()

    # this is now the canonical allele frequency file -
    # tab separated file with xpos / ref / alt / freq
    elif file_type == 'xbrowse_freq_file':
        path = population['file_path']
        counts_file, position_file = open_source(path)
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(position_file.tell())
                fields = line.strip('\n').split('\t')
                xpos = int(fields[0])
                ref = fields[1]
                alt = fields[2]
                freq = float(fields[3])
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            counts_file.close()

    elif file_type == 'tsv_file':
        path = population['file_path']
        freq_file, position_file = open_source(path)
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading population: {}'.format(population['slug']))
            # the first line is a header, echoed for the operator and not parsed
            header = next(freq_file)
            print("Header: " + header)
            for line in freq_file:
                progress.update(position_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = fields[0]
                pos = int(fields[1])
                ref = fields[2]
                alt = fields[3]
                freq = float(fields[4])
                xpos = genomeloc.get_single_location(chrom, pos)
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            freq_file.close()

    elif file_type == 'sites_vcf_with_counts':
        path = population['file_path']
        vcf_file, position_file = open_source(path)
        try:
            ac_info_key = population['ac_info_key']
            an_info_key = population['an_info_key']
            # for popmax keys each allele may have a different AN value from a
            # different population; otherwise a single AN applies to the site
            an_per_allele = "popmax" in ac_info_key.lower()

            progress = get_progressbar(
                os.path.getsize(path),
                'Loading sites vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(
                    vcf_file, meta_fields=[ac_info_key, an_info_key]):
                progress.update(position_file.tell())

                alt_allele_pos = variant.extras['alt_allele_pos']

                # Unparseable records are reported and skipped (best effort),
                # so the broad catch is deliberate.
                try:
                    ac = int(
                        variant.extras.get(ac_info_key).split(',')
                        [alt_allele_pos].replace("NA", "0"))
                except Exception as e:
                    print("Couldn't parse AC value %s from %s: %s (%s)" % (
                        alt_allele_pos, ac_info_key, variant.extras, e))
                    continue

                try:
                    an_index = alt_allele_pos if an_per_allele else 0
                    an = int(
                        variant.extras.get(an_info_key).split(',')
                        [an_index].replace("NA", "0"))
                except Exception as e:
                    print("Couldn't parse AN value %s from %s: %s (%s)" % (
                        alt_allele_pos, an_info_key, variant.extras, e))
                    continue

                if an == 0:
                    freq = 0.0
                else:
                    freq = float(ac) / an
                self._add_population_frequency(
                    variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            # this branch previously never closed the file
            vcf_file.close()
# Command-line script: open the (possibly compressed) VCF given as the first
# argument via compressed_file() and, for every variant, print one
# tab-separated line with xpos, ref, alt and the value returned by get_aaf().
import sys

from xbrowse.parsers.vcf_stuff import iterate_vcf
from xbrowse.utils import get_aaf, compressed_file

if __name__ == '__main__':
    if len(sys.argv) < 2:
        # fail with a readable message instead of an IndexError traceback
        sys.exit("usage: %s <vcf_path>" % sys.argv[0])

    vcf_file = compressed_file(sys.argv[1])
    for variant in iterate_vcf(vcf_file, genotypes=True):
        # a parenthesized single-argument print produces the same output under
        # Python 2 and is also valid Python 3 syntax
        print('\t'.join([
            str(variant.xpos),
            variant.ref,
            variant.alt,
            str(get_aaf(variant)),
        ]))
def load_population_to_annotator(self, population):
    """
    Extract the allele frequencies of one reference population from its data
    source and load them into the annotator.

    `population` is a dict describing the source. Every frequency is stored by
    calling self._add_population_frequency(xpos, ref, alt, population['slug'], freq).
    population['file_type'] selects the parser:

      'vcf'                VCF with genotypes at population['file_path'];
                           the frequency is whatever get_aaf(variant) returns.
      'sites_vcf'          sites VCF; the frequency is the INFO value stored
                           under population['vcf_info_key'] (0 when absent).
      'esp_vcf_dir'        every file in population['dir_path'] is parsed with
                           get_variants_from_esp_file(); the stored value is
                           variant[population['counts_key']].
      'counts_file'        tab separated: chrom (without 'chr') / pos / ref / alt /
                           numerator / denominator; rows with a zero
                           denominator are skipped.
      'xbrowse_freq_file'  tab separated: xpos / ref / alt / freq.

    Paths ending in '.gz' are opened with gzip. The running record index is
    printed every 10000 records. Any other file_type is silently ignored.
    Returns None.
    """

    def open_source(path):
        """Open `path`, transparently decompressing '.gz' files."""
        if path.endswith('.gz'):
            return gzip.open(path)
        return open(path)

    def report(index):
        """Print the record index every 10000 records as a progress indicator."""
        if index % 10000 == 0:
            print(index)

    file_type = population['file_type']

    if file_type == 'vcf':
        vcf_file = open_source(population['file_path'])
        try:
            variants = vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False)
            for i, variant in enumerate(variants):
                report(i)
                freq = get_aaf(variant)
                self._add_population_frequency(
                    variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            # input files were previously left open in every branch
            vcf_file.close()

    elif file_type == 'sites_vcf':
        vcf_file = open_source(population['file_path'])
        try:
            meta_key = population['vcf_info_key']
            variants = vcf_stuff.iterate_vcf(vcf_file, meta_fields=[meta_key])
            for i, variant in enumerate(variants):
                report(i)
                freq = float(variant.extras.get(meta_key, 0))
                self._add_population_frequency(
                    variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            vcf_file.close()

    #
    # Directory of per-chromosome VCFs that ESP publishes
    #
    elif file_type == 'esp_vcf_dir':
        for filename in os.listdir(population['dir_path']):
            print("Adding %s" % filename)
            file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
            esp_file = open(file_path)
            try:
                for i, variant in enumerate(get_variants_from_esp_file(esp_file)):
                    report(i)
                    self._add_population_frequency(
                        variant['xpos'],
                        variant['ref'],
                        variant['alt'],
                        population['slug'],
                        variant[population['counts_key']])
            finally:
                # one handle per file in the directory: close each before the next
                esp_file.close()

    #
    # text file of allele counts, as Monkol has been using for the joint calling data
    #
    elif file_type == 'counts_file':
        counts_file = open_source(population['file_path'])
        try:
            for i, line in enumerate(counts_file):
                report(i)
                fields = line.strip('\n').split('\t')
                chrom = 'chr' + fields[0]
                pos = int(fields[1])
                xpos = genomeloc.get_single_location(chrom, pos)
                ref = fields[2]
                alt = fields[3]
                if int(fields[5]) == 0:
                    # a zero denominator carries no frequency information
                    continue
                freq = float(fields[4]) / float(fields[5])
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            counts_file.close()

    # this is now the canonical allele frequency file -
    # tab separated file with xpos / ref / alt / freq
    elif file_type == 'xbrowse_freq_file':
        counts_file = open_source(population['file_path'])
        try:
            for i, line in enumerate(counts_file):
                report(i)
                fields = line.strip('\n').split('\t')
                xpos = int(fields[0])
                ref = fields[1]
                alt = fields[2]
                freq = float(fields[3])
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            counts_file.close()