Esempio n. 1
0
    def _load_additional_gene_info(self):

        gene_ids = self.get_all_gene_ids()
        size = len(gene_ids)
        progress = get_progressbar(size, 'Loading additional info about genes')
        for i, gene_id in enumerate(gene_ids):
            progress.update(i)

            # calculate coding size
            gene_structure = self.get_gene_structure(gene_id)
            coding_size = get_coding_size_from_gene_structure(gene_id, gene_structure)
            self._db.genes.update({'gene_id': gene_id}, {'$set': {'coding_size': coding_size}})

            # phenotypes
            if self.has_phenotype_data:
                phenotype_info = self.get_ensembl_rest_proxy().get_phenotype_info(gene_id)
            else:
                phenotype_info = {
                    'has_mendelian_phenotype': True,
                    'mim_id': "180901",
                    'mim_phenotypes': [],
                    'orphanet_phenotypes': [],
                }
            self._db.genes.update(
                {'gene_id': gene_id},
                {'$set': {'phenotype_info': phenotype_info}}
            )
Esempio n. 2
0
    def _load_additional_gene_info(self):

        gene_ids = self.get_all_gene_ids()
        size = len(gene_ids)
        progress = get_progressbar(size, 'Loading additional info about genes')
        for i, gene_id in enumerate(gene_ids):
            progress.update(i)

            # calculate coding size
            gene_structure = self.get_gene_structure(gene_id)
            coding_size = get_coding_size_from_gene_structure(
                gene_id, gene_structure)
            self._db.genes.update({'gene_id': gene_id},
                                  {'$set': {
                                      'coding_size': coding_size
                                  }})

            # phenotypes
            if self.has_phenotype_data:
                phenotype_info = self.get_ensembl_rest_proxy(
                ).get_phenotype_info(gene_id)
            else:
                phenotype_info = {
                    'has_mendelian_phenotype': False,
                    'mim_id': "",
                    'mim_phenotypes': [],
                    'orphanet_phenotypes': [],
                }
            self._db.genes.update({'gene_id': gene_id},
                                  {'$set': {
                                      'phenotype_info': phenotype_info
                                  }})
Esempio n. 3
0
    def _load_genes(self):

        self._db.drop_collection('genes')
        self._db.genes.ensure_index('gene_id')

        self._db.drop_collection('transcripts')
        self._db.transcripts.ensure_index('transcript_id')
        self._db.transcripts.ensure_index('gene_id')

        self._db.drop_collection('exons')
        self._db.exons.ensure_index('exon_id')
        self._db.exons.ensure_index('gene_id')

        gencode_file = gzip.open(self.settings_module.gencode_gtf_file)
        size = os.path.getsize(self.settings_module.gencode_gtf_file)
        progress = get_progressbar(size, 'Loading gene definitions from GTF')
        for datatype, obj in get_data_from_gencode_gtf(gencode_file):
            progress.update(gencode_file.fileobj.tell())

            if datatype == 'gene':
                gene_id = obj['gene_id']
                obj['symbol'] = obj['gene_name']

                obj['tags'] = {}

                # TODO
                #obj['coding_size'] = loading_utils.get_coding_size_from_gene_structure(obj)
                obj['coding_size'] = 0

                self._db.genes.insert(obj)

            if datatype == 'transcript':
                transcript_id = obj['transcript_id']
                obj['tags'] = {}
                self._db.transcripts.insert(obj)

            if datatype == 'exon':
                exon_id = obj['exon_id']
                transcript_id = obj['transcript_id']
                del obj['transcript_id']
                if self._db.exons.find_one({'exon_id': exon_id}):
                    self._db.exons.update({'exon_id': exon_id}, {'$push': {'transcripts': transcript_id}})
                else:
                    obj['transcripts'] = [transcript_id,]
                    obj['tags'] = {}
                    self._db.exons.insert(obj)

            if datatype == 'cds':
                exon_id = obj['exon_id']
                # this works because cds always comes after exon
                # this is obviously an inglorious hack - all the gtf parsing should be improved
                self._db.exons.update({'exon_id': exon_id}, {'$set': {
                    'cds_start': obj['start'],
                    'cds_stop': obj['stop'],
                    'cds_xstart': obj['xstart'],
                    'cds_xstop': obj['xstop'],
                }})
Esempio n. 4
0
    def _load_genes(self):

        self._db.drop_collection('genes')
        self._db.genes.ensure_index('gene_id')

        self._db.drop_collection('transcripts')
        self._db.transcripts.ensure_index('transcript_id')
        self._db.transcripts.ensure_index('gene_id')

        self._db.drop_collection('exons')
        self._db.exons.ensure_index('exon_id')
        self._db.exons.ensure_index('gene_id')

        gencode_file = gzip.open(self.settings_module.gencode_gtf_file)
        size = os.path.getsize(self.settings_module.gencode_gtf_file)
        progress = get_progressbar(size, 'Loading gene definitions from GTF')
        for datatype, obj in get_data_from_gencode_gtf(gencode_file):
            progress.update(gencode_file.fileobj.tell())

            if datatype == 'gene':
                gene_id = obj['gene_id']
                obj['symbol'] = obj['gene_name']

                obj['tags'] = {}

                # TODO
                #obj['coding_size'] = loading_utils.get_coding_size_from_gene_structure(obj)
                obj['coding_size'] = 0

                self._db.genes.insert(obj)

            if datatype == 'transcript':
                transcript_id = obj['transcript_id']
                obj['tags'] = {}
                self._db.transcripts.insert(obj)

            if datatype == 'exon':
                exon_id = obj['exon_id']
                transcript_id = obj['transcript_id']
                del obj['transcript_id']
                if self._db.exons.find_one({'exon_id': exon_id}):
                    self._db.exons.update({'exon_id': exon_id}, {'$push': {'transcripts': transcript_id}})
                else:
                    obj['transcripts'] = [transcript_id,]
                    obj['tags'] = {}
                    self._db.exons.insert(obj)

            if datatype == 'cds':
                exon_id = obj['exon_id']
                # this works because cds always comes after exon
                # this is obviously an inglorious hack - all the gtf parsing should be improved
                self._db.exons.update({'exon_id': exon_id}, {'$set': {
                    'cds_start': obj['start'],
                    'cds_stop': obj['stop'],
                    'cds_xstart': obj['xstart'],
                    'cds_xstop': obj['xstop'],
                }})
Esempio n. 5
0
def get_tissue_expression_values_by_gene(expression_file_name,
                                         samples_file_name):
    """
    Return iterator of (gene_id, expression array) tuples
    Expression array is:
    expressions: {
        tissue_type: [array of expression values]
    }

    expression_file (RPKM_GeneLevel_September.gct) is in gtex format; 
    samples file is just two columns: sample -> tissue type

    Command for getting samples file:
    awk -F"\t" '{ gsub(/ /,"_",$47); gsub(/-/,".",$1); print $1"\t"tolower($47) }' RNA-SeQC_metrics_September.tsv > gtex_samples.txt

    """

    # read samples file to get a map of sample_id -> tissue_type
    tissue_type_map = get_tissue_type_map(samples_file_name)

    expression_file = open(expression_file_name)

    size = os.path.getsize(expression_file_name)
    progress = get_progressbar(size, 'Loading GTeX data')
    for i, line in enumerate(expression_file):
        progress.update(expression_file.tell())
        line = line.strip('\n')
        if not line:
            break

        # first two lines are junk; third is the header
        if i < 2:
            continue
        if i == 2:
            # read header of expression file to get tissue type list
            # (used to link column to tissue type)
            # this wouldn't be necessary if samples file is in the same order as expression file,
            # but I don't wait to rely on that guarantee (mainly because they have a different # of fields)
            tissues_by_column = get_tissues_by_column(line, tissue_type_map)
            continue

        fields = line.split('\t')
        gene_id = fields[0].split('.')[0]

        yield (gene_id, get_expressions(line, tissues_by_column))
Esempio n. 6
0
def get_tissue_expression_values_by_gene(expression_file_name, samples_file_name):
    """
    Return iterator of (gene_id, expression array) tuples
    Expression array is:
    expressions: {
        tissue_type: [array of expression values]
    }

    expression_file (RPKM_GeneLevel_September.gct) is in gtex format; 
    samples file is just two columns: sample -> tissue type

    Command for getting samples file:
    awk -F"\t" '{ gsub(/ /,"_",$47); gsub(/-/,".",$1); print $1"\t"tolower($47) }' RNA-SeQC_metrics_September.tsv > gtex_samples.txt

    """

    # read samples file to get a map of sample_id -> tissue_type
    tissue_type_map = get_tissue_type_map(samples_file_name)

    expression_file = open(expression_file_name)

    size = os.path.getsize(expression_file_name)
    progress = get_progressbar(size, 'Loading GTeX data')
    for i, line in enumerate(expression_file):
        progress.update(expression_file.tell())
        line = line.strip('\n')
        if not line:
            break

        # first two lines are junk; third is the header
        if i < 2:
            continue
        if i == 2:
            # read header of expression file to get tissue type list
            # (used to link column to tissue type)
            # this wouldn't be necessary if samples file is in the same order as expression file,
            # but I don't wait to rely on that guarantee (mainly because they have a different # of fields)
            tissues_by_column = get_tissues_by_column(line, tissue_type_map)
            continue

        fields = line.split('\t')
        gene_id = fields[0].split('.')[0]

        yield (gene_id, get_expressions(line, tissues_by_column))
Esempio n. 7
0
    def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None):
        collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
        for collection in collections.values():
            collection.drop_indexes()
        indiv_id_list = [i for f in family_info_list for i in f['individuals']]

        vcf_file = compressed_file(vcf_file_path)
        size = os.path.getsize(vcf_file_path)
        progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))
        for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
            progress.update(vcf_file.tell_progress())
            annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
            for family in family_info_list:
                # TODO: can we move this inside the if relevant clause below?
                family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
                family_variant_dict = family_variant.toJSON()
                _add_index_fields_to_variant(family_variant_dict, annotation)
                if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                    collection = collections[family['family_id']]
                    collection.insert(family_variant_dict)
Esempio n. 8
0
    def load_population(self, population):
        """
        Take a population and a data source; extract and load it into annotator
        Data source can be VCF file, VCF Counts file, or a counts dir (in the case of ESP data)
        """
        if population['file_type'] == 'vcf':
            if population['file_path'].endswith('.gz'):
                vcf_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file.fileobj
            else:
                vcf_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file
            progress = get_progressbar(
                size, 'Loading vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file,
                                                 genotypes=True,
                                                 genotype_meta=False):
                progress.update(progress_file.tell())
                freq = get_aaf(variant)
                self._add_population_frequency(variant.xpos, variant.ref,
                                               variant.alt, population['slug'],
                                               freq)
            vcf_file.close()

        elif population['file_type'] == 'sites_vcf':
            if population['file_path'].endswith('.gz'):
                vcf_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file.fileobj
            else:
                vcf_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file
            meta_key = population.get('vcf_info_key', 'AF')

            progress = get_progressbar(
                size, 'Loading sites vcf: {}'.format(population['slug']))
            is_1kg_popmax = "popmax" in meta_key.lower() and (
                "1000 Genomes" in population["name"])
            if is_1kg_popmax:
                meta_fields = [
                    "EAS_AF", "EUR_AF", "AFR_AF", "AMR_AF", "SAS_AF"
                ]
            else:
                meta_fields = [
                    meta_key,
                ]

            for variant in vcf_stuff.iterate_vcf(vcf_file,
                                                 meta_fields=meta_fields):
                progress.update(progress_file.tell())
                if "popmax" in meta_key.lower() and ("1000 Genomes"
                                                     in population["name"]):
                    allele_idx = variant.extras['alt_allele_pos']
                    freq = 0
                    for meta_key in meta_fields:
                        freq = max(
                            freq,
                            float(
                                variant.extras.get(meta_key,
                                                   0).split(',')[allele_idx]))

                    ##INFO=<ID=EAS_AF,Number=A,Type=Float,Description="Allele frequency in the EAS populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=EUR_AF,Number=A,Type=Float,Description="Allele frequency in the EUR populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=AFR_AF,Number=A,Type=Float,Description="Allele frequency in the AFR populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=AMR_AF,Number=A,Type=Float,Description="Allele frequency in the AMR populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=SAS_AF,Number=A,Type=Float,Description="Allele frequency in the SAS populations calculated from AC and AN, in the range (0,1)">
                else:
                    freq = float(
                        variant.extras.get(
                            meta_key,
                            0).split(',')[variant.extras['alt_allele_pos']])

                self._add_population_frequency(variant.xpos, variant.ref,
                                               variant.alt, population['slug'],
                                               freq)
            vcf_file.close()

        #
        # Directory of per-chromosome VCFs that ESP publishes
        #
        elif population['file_type'] == 'esp_vcf_dir':
            for filename in os.listdir(population['dir_path']):
                file_path = os.path.abspath(
                    os.path.join(population['dir_path'], filename))
                f = open(file_path)
                file_size = os.path.getsize(file_path)
                progress = get_progressbar(
                    file_size, 'Loading ESP file: {}'.format(filename))
                for variant in get_variants_from_esp_file(f):
                    progress.update(f.tell())
                    self._add_population_frequency(
                        variant['xpos'], variant['ref'], variant['alt'],
                        population['slug'], variant[population['counts_key']])
                f.close()
        #
        # text file of allele counts, as Monkol has been using for the joint calling data
        #
        elif population['file_type'] == 'counts_file':
            if population['file_path'].endswith('.gz'):
                counts_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = counts_file.fileobj
            else:
                counts_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = counts_file

            progress = get_progressbar(
                size, 'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = 'chr' + fields[0]
                pos = int(fields[1])
                xpos = genomeloc.get_single_location(chrom, pos)
                ref = fields[2]
                alt = fields[3]
                if int(fields[5]) == 0:
                    continue
                freq = float(fields[4]) / float(fields[5])
                self._add_population_frequency(xpos, ref, alt,
                                               population['slug'], freq)
            counts_file.close()

        # this is now the canonical allele frequency file -
        # tab separated file with xpos / ref / alt / freq
        elif population['file_type'] == 'xbrowse_freq_file':
            if population['file_path'].endswith('.gz'):
                counts_file = gzip.open(population['file_path'])
                progress_file = counts_file.fileobj
            else:
                counts_file = open(population['file_path'])
                progress_file = counts_file
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(
                size, 'Loading population: {}'.format(population['slug']))

            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                xpos = int(fields[0])
                ref = fields[1]
                alt = fields[2]
                freq = float(fields[3])
                self._add_population_frequency(xpos, ref, alt,
                                               population['slug'], freq)
            counts_file.close()

        elif population['file_type'] == 'tsv_file':
            if population['file_path'].endswith('.gz'):
                freq_file = gzip.open(population['file_path'])
                progress_file = freq_file.fileobj
            else:
                freq_file = open(population['file_path'])
                progress_file = freq_file
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(
                size, 'Loading population: {}'.format(population['slug']))
            header = next(freq_file)
            print("Header: " + header)
            for line in freq_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = fields[0]
                pos = int(fields[1])
                ref = fields[2]
                alt = fields[3]
                freq = float(fields[4])

                xpos = genomeloc.get_single_location(chrom, pos)
                self._add_population_frequency(xpos, ref, alt,
                                               population['slug'], freq)
            freq_file.close()

        elif population['file_type'] == 'sites_vcf_with_counts':
            if population['file_path'].endswith(
                    '.gz') or population['file_path'].endswith('.bgz'):
                vcf_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file.fileobj
            else:
                vcf_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file
            ac_info_key = population['ac_info_key']
            an_info_key = population['an_info_key']

            progress = get_progressbar(
                size, 'Loading sites vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(
                    vcf_file, meta_fields=[ac_info_key, an_info_key]):
                progress.update(progress_file.tell())

                alt_allele_pos = variant.extras['alt_allele_pos']
                try:
                    ac = int(
                        variant.extras.get(ac_info_key).split(',')
                        [alt_allele_pos].replace("NA", "0"))
                except Exception, e:
                    print(
                        "Couldn't parse AC value %s from %s: %s" %
                        (alt_allele_pos, ac_info_key, variant.extras), e)
                    continue

                try:
                    if "popmax" in ac_info_key.lower():
                        AN_index = alt_allele_pos  # each allele may have a different AN value from a different population
                    else:
                        AN_index = 0

                    an = int(
                        variant.extras.get(an_info_key).split(',')
                        [AN_index].replace("NA", "0"))
                except Exception, e:
                    print(
                        "Couldn't parse AN value %s from %s: %s" %
                        (alt_allele_pos, an_info_key, variant.extras), e)
                    continue

                if an == 0:
                    freq = 0.0
                else:
                    freq = float(ac) / an
                self._add_population_frequency(variant.xpos, variant.ref,
                                               variant.alt, population['slug'],
                                               freq)
Esempio n. 9
0
    def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None):
        collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
        #for collection in collections.values():
        #    collection.drop_indexes()
        indiv_id_list = [i for f in family_info_list for i in f['individuals']]
        number_of_families = len(family_info_list)
        sys.stderr.write("Loading variants for %(number_of_families)d families %(family_info_list)s from %(vcf_file_path)s\n" % locals())

        for family in family_info_list:
            print("Indexing family: " + str(family))
            collection = collections[family['family_id']]
            collection.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

        # check whether some of the variants for this chromosome has been loaded already
        # if yes, start from the last loaded variant, and not from the beginning
        if "_chr" in vcf_file_path or ".chr" in vcf_file_path:
            # if the VCF files are split by chromosome (eg. for WGS projects), check within the chromosome
            vcf_file = compressed_file(vcf_file_path)
            variant = next(vcf_stuff.iterate_vcf(vcf_file, genotypes=False, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map))
            print(vcf_file_path + "  - chromsome: " + str(variant.chr))
            vcf_file.close()

            position_per_chrom = {}
            for chrom in range(1,24):
                position_per_chrom[chrom] = defaultdict(int)
                for family in family_info_list:     #variants = collections[family['family_id']].find().sort([('xpos',-1)]).limit(1)
                    variants = list(collections[family['family_id']].find({'$and': [{'xpos': { '$gte': chrom*1e9 }}, {'xpos': { '$lt': (chrom+1)*1e9}}] }).sort([('xpos',-1)]).limit(1))
                    if len(variants) > 0:
                        position_per_chrom[chrom][family['family_id']] = variants[0]['xpos'] - chrom*1e9
                    else:
                        position_per_chrom[chrom][family['family_id']] = 0

            for chrom in range(1,24):
                position_per_chrom[chrom] = min(position_per_chrom[chrom].values()) # get the smallest last-loaded variant position for this chromosome across all families

            chr_idx = int(variant.xpos/1e9)
            start_from_pos = int(position_per_chrom[chr_idx])

            print("Start from: %s - %s (%0.1f%% done)" % (chr_idx, start_from_pos, 100.*start_from_pos/CHROMOSOME_SIZES[variant.chr.replace("chr", "")]))
            tabix_file = pysam.TabixFile(vcf_file_path)
            vcf_iter = itertools.chain(tabix_file.header, tabix_file.fetch(variant.chr.replace("chr", ""), start_from_pos, int(2.5e8)))
        else:
            vcf_iter = vcf_file = compressed_file(vcf_file_path)
            # TODO handle case where it's one vcf file, not split by chromosome

        size = os.path.getsize(vcf_file_path)
        progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))

        def insert_all_variants_in_buffer(buff, collections_dict):
            for family_id in buff:
                if len(buff[family_id]) == 0:  # defensive programming
                    raise ValueError("%s has zero variants to insert. Should not be in buff." % family_id)

            while len(buff) > 0:
                # choose a random family for which to insert a variant from among families that still have variants to insert
                family_id = random.choice(buff.keys())

                # pop a variant off the list for this family, and insert it
                family_variant_dict_to_insert = buff[family_id].pop()
                c = collections_dict[family_id]
                c.insert(family_variant_dict_to_insert)

                if len(buff[family_id]) == 0:
                    del buff[family_id]  # if no more variants for this family, delete it

        vcf_rows_counter = 0
        variants_buffered_counter = 0
        family_id_to_variant_list = defaultdict(list)  # will accumulate variants to be inserted all at once
        for variant in vcf_stuff.iterate_vcf(vcf_iter, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
            if variant.alt == "*":
                #print("Skipping GATK 3.4 * alt allele: " + str(variant.unique_tuple()))
                continue

            try:
                annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
            except ValueError, e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                continue

            vcf_rows_counter += 1
            for family in family_info_list:
                # TODO: can we move this inside the if relevant clause below?
                try:
                    family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
                    family_variant_dict = family_variant.toJSON()
                    _add_index_fields_to_variant(family_variant_dict, annotation)
                    if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                        collection = collections[family['family_id']]
                        if not collection.find_one({'xpos': family_variant.xpos, 'ref': family_variant.ref, 'alt': family_variant.alt}):
                            family_id_to_variant_list[family['family_id']].append(family_variant_dict)
                            variants_buffered_counter += 1
                except Exception, e:
                    sys.stderr.write("ERROR: on variant %s, family: %s - %s\n" % (variant.toJSON(), family, e))
    def load_population(self, population):
        """
        Take a population and a data source; extract and load it into annotator
        Data source can be VCF file, VCF Counts file, or a counts dir (in the case of ESP data)
        """
        if population['file_type'] == 'vcf':
            if population['file_path'].endswith('.gz'):
                vcf_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file.fileobj
            else:
                vcf_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file
            progress = get_progressbar(size, 'Loading vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False):
                progress.update(progress_file.tell())
                freq = get_aaf(variant)
                self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)
            vcf_file.close()

        elif population['file_type'] == 'sites_vcf':
            if population['file_path'].endswith('.gz'):
                vcf_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file.fileobj
            else:
                vcf_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file
            meta_key = population.get('vcf_info_key', 'AF')

            progress = get_progressbar(size, 'Loading sites vcf: {}'.format(population['slug']))
            is_1kg_popmax = "popmax" in meta_key.lower() and ("1000 Genomes" in population["name"])
            if is_1kg_popmax:
                meta_fields = ["EAS_AF", "EUR_AF", "AFR_AF", "AMR_AF", "SAS_AF"]
            else:
                meta_fields = [meta_key,]

            for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=meta_fields):
                progress.update(progress_file.tell())
                if "popmax" in meta_key.lower() and ("1000 Genomes" in population["name"]):
                    allele_idx = variant.extras['alt_allele_pos']
                    freq = 0
                    for meta_key in meta_fields:
                        freq = max(freq, float(variant.extras.get(meta_key, 0).split(',')[allele_idx]))

                    ##INFO=<ID=EAS_AF,Number=A,Type=Float,Description="Allele frequency in the EAS populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=EUR_AF,Number=A,Type=Float,Description="Allele frequency in the EUR populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=AFR_AF,Number=A,Type=Float,Description="Allele frequency in the AFR populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=AMR_AF,Number=A,Type=Float,Description="Allele frequency in the AMR populations calculated from AC and AN, in the range (0,1)">
                    ##INFO=<ID=SAS_AF,Number=A,Type=Float,Description="Allele frequency in the SAS populations calculated from AC and AN, in the range (0,1)">
                else:
                    freq = float(variant.extras.get(meta_key, 0).split(',')[variant.extras['alt_allele_pos']])

                self._add_population_frequency(
                    variant.xpos,
                    variant.ref,
                    variant.alt,
                    population['slug'],
                    freq
                )
            vcf_file.close()

        #
        # Directory of per-chromosome VCFs that ESP publishes
        #
        elif population['file_type'] == 'esp_vcf_dir':
            for filename in os.listdir(population['dir_path']):
                file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
                f = open(file_path)
                file_size = os.path.getsize(file_path)
                progress = get_progressbar(file_size, 'Loading ESP file: {}'.format(filename))
                for variant in get_variants_from_esp_file(f):
                    progress.update(f.tell())
                    self._add_population_frequency(
                        variant['xpos'],
                        variant['ref'],
                        variant['alt'],
                        population['slug'],
                        variant[population['counts_key']]
                    )
                f.close()
        #
        # text file of allele counts, as Monkol has been using for the joint calling data
        #
        elif population['file_type'] == 'counts_file':
            if population['file_path'].endswith('.gz'):
                counts_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = counts_file.fileobj
            else:
                counts_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = counts_file

            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = 'chr' + fields[0]
                pos = int(fields[1])
                xpos = genomeloc.get_single_location(chrom, pos)
                ref = fields[2]
                alt = fields[3]
                if int(fields[5]) == 0:
                    continue
                freq = float(fields[4]) / float(fields[5])
                self._add_population_frequency(
                    xpos,
                    ref,
                    alt,
                    population['slug'],
                    freq
                )
            counts_file.close()

        # this is now the canonical allele frequency file -
        # tab separated file with xpos / ref / alt / freq
        elif population['file_type'] == 'xbrowse_freq_file':
            if population['file_path'].endswith('.gz'):
                counts_file = gzip.open(population['file_path'])
                progress_file = counts_file.fileobj
            else:
                counts_file = open(population['file_path'])
                progress_file = counts_file
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))

            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                xpos = int(fields[0])
                ref = fields[1]
                alt = fields[2]
                freq = float(fields[3])
                self._add_population_frequency(
                    xpos,
                    ref,
                    alt,
                    population['slug'],
                    freq
                )
            counts_file.close()

        elif population['file_type'] == 'tsv_file':
            if population['file_path'].endswith('.gz'):
                freq_file = gzip.open(population['file_path'])
                progress_file = freq_file.fileobj
            else:
                freq_file = open(population['file_path'])
                progress_file = freq_file
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))
            header = next(freq_file)
            print("Header: " + header)
            for line in freq_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = fields[0]
                pos = int(fields[1])
                ref = fields[2]
                alt = fields[3]
                freq = float(fields[4])

                xpos = genomeloc.get_single_location(chrom, pos)
                self._add_population_frequency(
                    xpos,
                    ref,
                    alt,
                    population['slug'],
                    freq
                )
            freq_file.close()

        elif population['file_type'] == 'sites_vcf_with_counts':
            if population['file_path'].endswith('.gz') or population['file_path'].endswith('.bgz'):
                vcf_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file.fileobj
            else:
                vcf_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file
            ac_info_key = population['ac_info_key']
            an_info_key = population['an_info_key']

            progress = get_progressbar(size, 'Loading sites vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=[ac_info_key, an_info_key]):
                progress.update(progress_file.tell())

                alt_allele_pos = variant.extras['alt_allele_pos']
                try:
                    ac = int(variant.extras.get(ac_info_key).split(',')[alt_allele_pos].replace("NA", "0"))
                except Exception, e:
                    print("Couldn't parse AC value %s from %s: %s" % (alt_allele_pos, ac_info_key, variant.extras), e)
                    continue

                try:
                    if "popmax" in ac_info_key.lower():
                        AN_index = alt_allele_pos  # each allele may have a different AN value from a different population
                    else:
                        AN_index = 0

                    an = int(variant.extras.get(an_info_key).split(',')[AN_index].replace("NA", "0"))
                except Exception, e:
                    print("Couldn't parse AN value %s from %s: %s" % (alt_allele_pos, an_info_key, variant.extras), e)
                    continue

                if an == 0:
                    freq = 0.0
                else:
                    freq = float(ac)/an
                self._add_population_frequency(
                    variant.xpos,
                    variant.ref,
                    variant.alt,
                    population['slug'],
                    freq
                )
Esempio n. 11
0
    def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None):
        collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
        #for collection in collections.values():
        #    collection.drop_indexes()
        indiv_id_list = [i for f in family_info_list for i in f['individuals']]

        number_of_families = len(family_info_list)
        sys.stderr.write("Loading variants for %(number_of_families)d families %(family_info_list)s from %(vcf_file_path)s\n" % locals())

        #for family in family_info_list:
        #    print("Indexing family: " + str(family))
        #    collection = collections[family['family_id']]
        #    collection.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

        vcf_file = compressed_file(vcf_file_path)
        size = os.path.getsize(vcf_file_path)
        progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))

        def insert_all_variants_in_buffer(buff, collections_dict):
            for family_id in buff:
                if len(buff[family_id]) == 0:  # defensive programming
                    raise ValueError("%s has zero variants to insert. Should not be in buff." % family_id)

            while len(buff) > 0:
                # choose a random family for which to insert a variant from among families that still have variants to insert
                family_id = random.choice(buff.keys())

                # pop a variant off the list for this family, and insert it
                family_variant_dict_to_insert = buff[family_id].pop()
                c = collections_dict[family_id]
                c.insert(family_variant_dict_to_insert)

                if len(buff[family_id]) == 0:
                    del buff[family_id]  # if no more variants for this family, delete it

        vcf_rows_counter = 0
        variants_buffered_counter = 0
        family_id_to_variant_list = defaultdict(list)  # will accumulate variants to be inserted all at once
        for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
            progress.update(vcf_file.tell_progress())
            try:
                annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
            except ValueError, e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                continue

            vcf_rows_counter += 1
            for family in family_info_list:
                # TODO: can we move this inside the if relevant clause below?
                family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
                family_variant_dict = family_variant.toJSON()
                _add_index_fields_to_variant(family_variant_dict, annotation)
                if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                    collection = collections[family['family_id']]
                    if not collection.find_one({'xpos': family_variant.xpos, 'ref': family_variant.ref, 'alt': family_variant.alt}):
                        family_id_to_variant_list[family['family_id']].append(family_variant_dict)
                        variants_buffered_counter += 1

            if variants_buffered_counter > 10000:
                print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S") + "-- inserting %d family-variants from %d vcf rows into %s families" % (variants_buffered_counter, vcf_rows_counter, len(family_id_to_variant_list)))

                insert_all_variants_in_buffer(family_id_to_variant_list, collections)

                assert len(family_id_to_variant_list) == 0
                vcf_rows_counter = 0
                variants_buffered_counter = 0
    def load_population(self, population):
        """
        Take a population and a data source; extract and load it into annotator
        Data source can be VCF file, VCF Counts file, or a counts dir (in the case of ESP data)
        """
        if population['file_type'] == 'vcf':
            if population['file_path'].endswith('.gz'):
                vcf_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file.fileobj
            else:
                vcf_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file
            progress = get_progressbar(size, 'Loading vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False):
                progress.update(progress_file.tell())
                freq = get_aaf(variant)
                self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)

        elif population['file_type'] == 'sites_vcf':
            if population['file_path'].endswith('.gz'):
                vcf_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file.fileobj
            else:
                vcf_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = vcf_file
            meta_key = population['vcf_info_key']

            progress = get_progressbar(size, 'Loading sites vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=[meta_key,]):
                progress.update(progress_file.tell())
                freq = float(variant.extras.get(meta_key, 0).split(',')[variant.extras['alt_allele_pos']])
                self._add_population_frequency(
                    variant.xpos,
                    variant.ref,
                    variant.alt,
                    population['slug'],
                    freq
                )

        #
        # Directory of per-chromosome VCFs that ESP publishes
        #
        elif population['file_type'] == 'esp_vcf_dir':
            for filename in os.listdir(population['dir_path']):
                file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
                f = open(file_path)
                file_size = os.path.getsize(file_path)
                progress = get_progressbar(file_size, 'Loading ESP file: {}'.format(filename))
                for variant in get_variants_from_esp_file(f):
                    progress.update(f.tell())
                    self._add_population_frequency(
                        variant['xpos'],
                        variant['ref'],
                        variant['alt'],
                        population['slug'],
                        variant[population['counts_key']]
                    )

        #
        # text file of allele counts, as Monkol has been using for the joint calling data
        #
        elif population['file_type'] == 'counts_file':
            if population['file_path'].endswith('.gz'):
                counts_file = gzip.open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = counts_file.fileobj
            else:
                counts_file = open(population['file_path'])
                size = os.path.getsize(population['file_path'])
                progress_file = counts_file

            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = 'chr' + fields[0]
                pos = int(fields[1])
                xpos = genomeloc.get_single_location(chrom, pos)
                ref = fields[2]
                alt = fields[3]
                if int(fields[5]) == 0:
                    continue
                freq = float(fields[4]) / float(fields[5])
                self._add_population_frequency(
                    xpos,
                    ref,
                    alt,
                    population['slug'],
                    freq
                )

        # this is now the canonical allele frequency file -
        # tab separated file with xpos / ref / alt / freq
        elif population['file_type'] == 'xbrowse_freq_file':
            if population['file_path'].endswith('.gz'):
                counts_file = gzip.open(population['file_path'])
                progress_file = counts_file.fileobj
            else:
                counts_file = open(population['file_path'])
                progress_file = counts_file
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))

            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                xpos = int(fields[0])
                ref = fields[1]
                alt = fields[2]
                freq = float(fields[3])
                self._add_population_frequency(
                    xpos,
                    ref,
                    alt,
                    population['slug'],
                    freq
                )