Ejemplo n.º 1
0
import sys
from xbrowse.parsers.vcf_stuff import iterate_vcf_path
from xbrowse.utils import get_aaf


if __name__ == '__main__':

    for variant in iterate_vcf_path(sys.argv[1], genotypes=True):
        print '\t'.join([
            str(variant.xpos),
            variant.ref,
            variant.alt,
            str(get_aaf(variant)),
        ])
Ejemplo n.º 2
0
    def load_population(self, population):
        """
        Take a population and a data source; extract and load it into annotator
        Data source can be VCF file, VCF Counts file, or a counts dir (in the case of ESP data)
        """
        if population['file_type'] == 'vcf':
            if population['file_path'].endswith('.gz'):
                vcf_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file.fileobj
            else:
                vcf_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file
            progress = get_progressbar(
                size, 'Loading vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file,
                                                 genotypes=True,
                                                 genotype_meta=False):
                progress.update(progress_file.tell())
                freq = get_aaf(variant)
                self._add_population_frequency(variant.xpos, variant.ref,
                                               variant.alt, population['slug'],
                                               freq)
            vcf_file.close()

        elif population['file_type'] == 'sites_vcf':
            if population['file_path'].endswith('.gz'):
                vcf_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file.fileobj
            else:
                vcf_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file
            meta_key = population.get('vcf_info_key', 'AF')

            progress = get_progressbar(
                size, 'Loading sites vcf: {}'.format(population['slug']))
            is_1kg_popmax = "popmax" in meta_key.lower() and (
                "1000 Genomes" in population["name"])
            if is_1kg_popmax:
                meta_fields = [
                    "EAS_AF", "EUR_AF", "AFR_AF", "AMR_AF", "SAS_AF"
                ]
            else:
                meta_fields = [
                    meta_key,
                ]

            for variant in vcf_stuff.iterate_vcf(vcf_file,
                                                 meta_fields=meta_fields):
                progress.update(progress_file.tell())
                if "popmax" in meta_key.lower() and ("1000 Genomes"
                                                     in population["name"]):
                    allele_idx = variant.extras['alt_allele_pos']
                    freq = 0
                    for meta_key in meta_fields:
                        freq = max(
                            freq,
                            float(
                                variant.extras.get(meta_key,
                                                   0).split(',')[allele_idx]))

                    ##INFO=<ID=EAS_AF,Number=A,Type=Float,Description="Allele frequency in the EAS populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=EUR_AF,Number=A,Type=Float,Description="Allele frequency in the EUR populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=AFR_AF,Number=A,Type=Float,Description="Allele frequency in the AFR populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=AMR_AF,Number=A,Type=Float,Description="Allele frequency in the AMR populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=SAS_AF,Number=A,Type=Float,Description="Allele frequency in the SAS populations calculated from AC and AN, in the range (0,1)">
                else:
                    freq = float(
                        variant.extras.get(
                            meta_key,
                            0).split(',')[variant.extras['alt_allele_pos']])

                self._add_population_frequency(variant.xpos, variant.ref,
                                               variant.alt, population['slug'],
                                               freq)
            vcf_file.close()

        #
        # Directory of per-chromosome VCFs that ESP publishes
        #
        elif population['file_type'] == 'esp_vcf_dir':
            for filename in os.listdir(population['dir_path']):
                file_path = os.path.abspath(
                    os.path.join(population['dir_path'], filename))
                f = open(file_path)
                file_size = os.path.getsize(file_path)
                progress = get_progressbar(
                    file_size, 'Loading ESP file: {}'.format(filename))
                for variant in get_variants_from_esp_file(f):
                    progress.update(f.tell())
                    self._add_population_frequency(
                        variant['xpos'], variant['ref'], variant['alt'],
                        population['slug'], variant[population['counts_key']])
                f.close()
        #
        # text file of allele counts, as Monkol has been using for the joint calling data
        #
        elif population['file_type'] == 'counts_file':
            if population['file_path'].endswith('.gz'):
                counts_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = counts_file.fileobj
            else:
                counts_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = counts_file

            progress = get_progressbar(
                size, 'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = 'chr' + fields[0]
                pos = int(fields[1])
                xpos = genomeloc.get_single_location(chrom, pos)
                ref = fields[2]
                alt = fields[3]
                if int(fields[5]) == 0:
                    continue
                freq = float(fields[4]) / float(fields[5])
                self._add_population_frequency(xpos, ref, alt,
                                               population['slug'], freq)
            counts_file.close()

        # this is now the canonical allele frequency file -
        # tab separated file with xpos / ref / alt / freq
        elif population['file_type'] == 'xbrowse_freq_file':
            if population['file_path'].endswith('.gz'):
                counts_file = gzip.open(population['file_path'])
                progress_file = counts_file.fileobj
            else:
                counts_file = open(population['file_path'])
                progress_file = counts_file
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(
                size, 'Loading population: {}'.format(population['slug']))

            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                xpos = int(fields[0])
                ref = fields[1]
                alt = fields[2]
                freq = float(fields[3])
                self._add_population_frequency(xpos, ref, alt,
                                               population['slug'], freq)
            counts_file.close()

        elif population['file_type'] == 'tsv_file':
            if population['file_path'].endswith('.gz'):
                freq_file = gzip.open(population['file_path'])
                progress_file = freq_file.fileobj
            else:
                freq_file = open(population['file_path'])
                progress_file = freq_file
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(
                size, 'Loading population: {}'.format(population['slug']))
            header = next(freq_file)
            print("Header: " + header)
            for line in freq_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = fields[0]
                pos = int(fields[1])
                ref = fields[2]
                alt = fields[3]
                freq = float(fields[4])

                xpos = genomeloc.get_single_location(chrom, pos)
                self._add_population_frequency(xpos, ref, alt,
                                               population['slug'], freq)
            freq_file.close()

        elif population['file_type'] == 'sites_vcf_with_counts':
            if population['file_path'].endswith(
                    '.gz') or population['file_path'].endswith('.bgz'):
                vcf_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file.fileobj
            else:
                vcf_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file
            ac_info_key = population['ac_info_key']
            an_info_key = population['an_info_key']

            progress = get_progressbar(
                size, 'Loading sites vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(
                    vcf_file, meta_fields=[ac_info_key, an_info_key]):
                progress.update(progress_file.tell())

                alt_allele_pos = variant.extras['alt_allele_pos']
                try:
                    ac = int(
                        variant.extras.get(ac_info_key).split(',')
                        [alt_allele_pos].replace("NA", "0"))
                except Exception, e:
                    print(
                        "Couldn't parse AC value %s from %s: %s" %
                        (alt_allele_pos, ac_info_key, variant.extras), e)
                    continue

                try:
                    if "popmax" in ac_info_key.lower():
                        AN_index = alt_allele_pos  # each allele may have a different AN value from a different population
                    else:
                        AN_index = 0

                    an = int(
                        variant.extras.get(an_info_key).split(',')
                        [AN_index].replace("NA", "0"))
                except Exception, e:
                    print(
                        "Couldn't parse AN value %s from %s: %s" %
                        (alt_allele_pos, an_info_key, variant.extras), e)
                    continue

                if an == 0:
                    freq = 0.0
                else:
                    freq = float(ac) / an
                self._add_population_frequency(variant.xpos, variant.ref,
                                               variant.alt, population['slug'],
                                               freq)
    def load_population(self, population):
        """
        Take a population and a data source; extract and load it into annotator
        Data source can be VCF file, VCF Counts file, or a counts dir (in the case of ESP data)
        """
        if population['file_type'] == 'vcf':
            if population['file_path'].endswith('.gz'):
                vcf_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file.fileobj
            else:
                vcf_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file
            progress = get_progressbar(size, 'Loading vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False):
                progress.update(progress_file.tell())
                freq = get_aaf(variant)
                self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)
            vcf_file.close()

        elif population['file_type'] == 'sites_vcf':
            if population['file_path'].endswith('.gz'):
                vcf_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file.fileobj
            else:
                vcf_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file
            meta_key = population.get('vcf_info_key', 'AF')

            progress = get_progressbar(size, 'Loading sites vcf: {}'.format(population['slug']))
            is_1kg_popmax = "popmax" in meta_key.lower() and ("1000 Genomes" in population["name"])
            if is_1kg_popmax:
                meta_fields = ["EAS_AF", "EUR_AF", "AFR_AF", "AMR_AF", "SAS_AF"]
            else:
                meta_fields = [meta_key,]

            for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=meta_fields):
                progress.update(progress_file.tell())
                if "popmax" in meta_key.lower() and ("1000 Genomes" in population["name"]):
                    allele_idx = variant.extras['alt_allele_pos']
                    freq = 0
                    for meta_key in meta_fields:
                        freq = max(freq, float(variant.extras.get(meta_key, 0).split(',')[allele_idx]))

                    ##INFO=<ID=EAS_AF,Number=A,Type=Float,Description="Allele frequency in the EAS populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=EUR_AF,Number=A,Type=Float,Description="Allele frequency in the EUR populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=AFR_AF,Number=A,Type=Float,Description="Allele frequency in the AFR populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=AMR_AF,Number=A,Type=Float,Description="Allele frequency in the AMR populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=SAS_AF,Number=A,Type=Float,Description="Allele frequency in the SAS populations calculated from AC and AN, in the range (0,1)">
                else:
                    freq = float(variant.extras.get(meta_key, 0).split(',')[variant.extras['alt_allele_pos']])

                self._add_population_frequency(
                    variant.xpos,
                    variant.ref,
                    variant.alt,
                    population['slug'],
                    freq
                )
            vcf_file.close()

        #
        # Directory of per-chromosome VCFs that ESP publishes
        #
        elif population['file_type'] == 'esp_vcf_dir':
            for filename in os.listdir(population['dir_path']):
                file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
                f = open(file_path)
                file_size = os.path.getsize(file_path)
                progress = get_progressbar(file_size, 'Loading ESP file: {}'.format(filename))
                for variant in get_variants_from_esp_file(f):
                    progress.update(f.tell())
                    self._add_population_frequency(
                        variant['xpos'],
                        variant['ref'],
                        variant['alt'],
                        population['slug'],
                        variant[population['counts_key']]
                    )
                f.close()
        #
        # text file of allele counts, as Monkol has been using for the joint calling data
        #
        elif population['file_type'] == 'counts_file':
            if population['file_path'].endswith('.gz'):
                counts_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = counts_file.fileobj
            else:
                counts_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = counts_file

            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = 'chr' + fields[0]
                pos = int(fields[1])
                xpos = genomeloc.get_single_location(chrom, pos)
                ref = fields[2]
                alt = fields[3]
                if int(fields[5]) == 0:
                    continue
                freq = float(fields[4]) / float(fields[5])
                self._add_population_frequency(
                    xpos,
                    ref,
                    alt,
                    population['slug'],
                    freq
                )
            counts_file.close()

        # this is now the canonical allele frequency file -
        # tab separated file with xpos / ref / alt / freq
        elif population['file_type'] == 'xbrowse_freq_file':
            if population['file_path'].endswith('.gz'):
                counts_file = gzip.open(population['file_path'])
                progress_file = counts_file.fileobj
            else:
                counts_file = open(population['file_path'])
                progress_file = counts_file
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))

            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                xpos = int(fields[0])
                ref = fields[1]
                alt = fields[2]
                freq = float(fields[3])
                self._add_population_frequency(
                    xpos,
                    ref,
                    alt,
                    population['slug'],
                    freq
                )
            counts_file.close()

        elif population['file_type'] == 'tsv_file':
            if population['file_path'].endswith('.gz'):
                freq_file = gzip.open(population['file_path'])
                progress_file = freq_file.fileobj
            else:
                freq_file = open(population['file_path'])
                progress_file = freq_file
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))
            header = next(freq_file)
            print("Header: " + header)
            for line in freq_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = fields[0]
                pos = int(fields[1])
                ref = fields[2]
                alt = fields[3]
                freq = float(fields[4])

                xpos = genomeloc.get_single_location(chrom, pos)
                self._add_population_frequency(
                    xpos,
                    ref,
                    alt,
                    population['slug'],
                    freq
                )
            freq_file.close()

        elif population['file_type'] == 'sites_vcf_with_counts':
            if population['file_path'].endswith('.gz') or population['file_path'].endswith('.bgz'):
                vcf_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file.fileobj
            else:
                vcf_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file
            ac_info_key = population['ac_info_key']
            an_info_key = population['an_info_key']

            progress = get_progressbar(size, 'Loading sites vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=[ac_info_key, an_info_key]):
                progress.update(progress_file.tell())

                alt_allele_pos = variant.extras['alt_allele_pos']
                try:
                    ac = int(variant.extras.get(ac_info_key).split(',')[alt_allele_pos].replace("NA", "0"))
                except Exception, e:
                    print("Couldn't parse AC value %s from %s: %s" % (alt_allele_pos, ac_info_key, variant.extras), e)
                    continue

                try:
                    if "popmax" in ac_info_key.lower():
                        AN_index = alt_allele_pos  # each allele may have a different AN value from a different population
                    else:
                        AN_index = 0

                    an = int(variant.extras.get(an_info_key).split(',')[AN_index].replace("NA", "0"))
                except Exception, e:
                    print("Couldn't parse AN value %s from %s: %s" % (alt_allele_pos, an_info_key, variant.extras), e)
                    continue

                if an == 0:
                    freq = 0.0
                else:
                    freq = float(ac)/an
                self._add_population_frequency(
                    variant.xpos,
                    variant.ref,
                    variant.alt,
                    population['slug'],
                    freq
                )
Ejemplo n.º 4
0
import sys
from xbrowse.parsers.vcf_stuff import iterate_vcf
from xbrowse.utils import get_aaf, compressed_file

if __name__ == '__main__':

    vcf_file = compressed_file(sys.argv[1])
    for variant in iterate_vcf(vcf_file, genotypes=True):
        print '\t'.join([
            str(variant.xpos),
            variant.ref,
            variant.alt,
            str(get_aaf(variant)),
        ])
Ejemplo n.º 5
0
    def load_population_to_annotator(self, population):
        """
        Take a population and a data source; extract and load it into annotator
        Data source can be VCF file, VCF Counts file, or a counts dir (in the case of ESP data)
        """
        if population['file_type'] == 'vcf':
            if population['file_path'].endswith('.gz'):
                vcf_file = gzip.open(population['file_path'])
            else:
                vcf_file = open(population['file_path'])
            for i, variant in enumerate(vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False)):
                if i % 10000 == 0:
                    print i
                freq = get_aaf(variant)
                self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        elif population['file_type'] == 'sites_vcf':
            if population['file_path'].endswith('.gz'):
                vcf_file = gzip.open(population['file_path'])
            else:
                vcf_file = open(population['file_path'])
            meta_key = population['vcf_info_key']
            for i, variant in enumerate(vcf_stuff.iterate_vcf(vcf_file, meta_fields=[meta_key,])):
                if i % 10000 == 0:
                    print i
                freq = float(variant.extras.get(meta_key, 0))
                self._add_population_frequency(
                    variant.xpos,
                    variant.ref,
                    variant.alt,
                    population['slug'],
                    freq
                )

        #
        # Directory of per-chromosome VCFs that ESP publishes
        #
        elif population['file_type'] == 'esp_vcf_dir':
            for filename in os.listdir(population['dir_path']):
                print "Adding %s" % filename
                file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
                f = open(file_path)
                for i, variant in enumerate(get_variants_from_esp_file(f)):
                    if i % 10000 == 0:
                        print i
                    self._add_population_frequency(
                        variant['xpos'],
                        variant['ref'],
                        variant['alt'],
                        population['slug'],
                        variant[population['counts_key']]
                    )

        #
        # text file of allele counts, as Monkol has been using for the joint calling data
        #
        elif population['file_type'] == 'counts_file':
            if population['file_path'].endswith('.gz'):
                counts_file = gzip.open(population['file_path'])
            else:
                counts_file = open(population['file_path'])
            for i, line in enumerate(counts_file):
                if i % 10000 == 0:
                    print i
                fields = line.strip('\n').split('\t')
                chrom = 'chr' + fields[0]
                pos = int(fields[1])
                xpos = genomeloc.get_single_location(chrom, pos)
                ref = fields[2]
                alt = fields[3]
                if int(fields[5]) == 0:
                    continue
                freq = float(fields[4]) / float(fields[5])
                self._add_population_frequency(
                    xpos,
                    ref,
                    alt,
                    population['slug'],
                    freq
                )

        # this is now the canonical allele frequency file -
        # tab separated file with xpos / ref / alt / freq
        elif population['file_type'] == 'xbrowse_freq_file':
            if population['file_path'].endswith('.gz'):
                counts_file = gzip.open(population['file_path'])
            else:
                counts_file = open(population['file_path'])
            for i, line in enumerate(counts_file):
                if i % 10000 == 0:
                    print i
                fields = line.strip('\n').split('\t')
                xpos = int(fields[0])
                ref = fields[1]
                alt = fields[2]
                freq = float(fields[3])
                self._add_population_frequency(
                    xpos,
                    ref,
                    alt,
                    population['slug'],
                    freq
                )