def _load_additional_gene_info(self):
    """
    Annotate every gene document with its coding size and phenotype info.

    Walks the ids returned by self.get_all_gene_ids() and, for each one,
    writes two fields to the matching document in the ``genes`` collection:
    ``coding_size`` (derived from the gene structure) and ``phenotype_info``
    (looked up through the Ensembl REST proxy when self.has_phenotype_data
    is set, otherwise a fixed fallback dict).
    """
    all_gene_ids = self.get_all_gene_ids()
    progress = get_progressbar(len(all_gene_ids), 'Loading additional info about genes')

    for index, gene_id in enumerate(all_gene_ids):
        progress.update(index)

        # coding size
        structure = self.get_gene_structure(gene_id)
        self._db.genes.update(
            {'gene_id': gene_id},
            {'$set': {'coding_size': get_coding_size_from_gene_structure(gene_id, structure)}},
        )

        # phenotypes
        if not self.has_phenotype_data:
            # NOTE(review): this fallback marks every gene as mendelian with a
            # fixed MIM id - looks like placeholder/test data; confirm it is intended.
            phenotype_info = {
                'has_mendelian_phenotype': True,
                'mim_id': "180901",
                'mim_phenotypes': [],
                'orphanet_phenotypes': [],
            }
        else:
            phenotype_info = self.get_ensembl_rest_proxy().get_phenotype_info(gene_id)

        self._db.genes.update(
            {'gene_id': gene_id},
            {'$set': {'phenotype_info': phenotype_info}},
        )
def _load_additional_gene_info(self):
    """
    Store the coding size and phenotype info of every gene on its document.

    For each id returned by self.get_all_gene_ids():

    * the coding size is computed from the gene structure;
    * the phenotype info is fetched through the Ensembl REST proxy, or set to
      an empty placeholder when self.has_phenotype_data is false.

    Both fields are written to the matching document in the ``genes``
    collection with a single ``$set``, so each gene costs one write
    round-trip instead of two.
    """
    gene_ids = self.get_all_gene_ids()
    progress = get_progressbar(len(gene_ids), 'Loading additional info about genes')

    for i, gene_id in enumerate(gene_ids):
        progress.update(i)

        # calculate coding size
        gene_structure = self.get_gene_structure(gene_id)
        coding_size = get_coding_size_from_gene_structure(gene_id, gene_structure)

        # phenotypes
        if self.has_phenotype_data:
            phenotype_info = self.get_ensembl_rest_proxy().get_phenotype_info(gene_id)
        else:
            phenotype_info = {
                'has_mendelian_phenotype': False,
                'mim_id': "",
                'mim_phenotypes': [],
                'orphanet_phenotypes': [],
            }

        self._db.genes.update(
            {'gene_id': gene_id},
            {'$set': {
                'coding_size': coding_size,
                'phenotype_info': phenotype_info,
            }},
        )
def _load_genes(self):
    """
    Rebuild the ``genes``, ``transcripts`` and ``exons`` collections from the
    gzipped GENCODE GTF named by self.settings_module.gencode_gtf_file.

    All three collections are dropped and re-indexed first. Records yielded
    by get_data_from_gencode_gtf() are then stored by type:

    * gene       - inserted with ``symbol`` copied from ``gene_name``, empty
                   ``tags`` and ``coding_size`` 0
    * transcript - inserted with empty ``tags``
    * exon       - one document per exon_id; each further transcript that
                   uses the exon is pushed onto its ``transcripts`` list
    * cds        - sets the cds_* coordinates on the already-loaded exon

    The GTF handle is closed even if loading fails part-way.
    """
    self._db.drop_collection('genes')
    self._db.genes.ensure_index('gene_id')

    self._db.drop_collection('transcripts')
    self._db.transcripts.ensure_index('transcript_id')
    self._db.transcripts.ensure_index('gene_id')

    self._db.drop_collection('exons')
    self._db.exons.ensure_index('exon_id')
    self._db.exons.ensure_index('gene_id')

    gencode_file = gzip.open(self.settings_module.gencode_gtf_file)
    try:
        size = os.path.getsize(self.settings_module.gencode_gtf_file)
        progress = get_progressbar(size, 'Loading gene definitions from GTF')
        for datatype, obj in get_data_from_gencode_gtf(gencode_file):
            # progress is measured in compressed bytes, hence the raw fileobj
            progress.update(gencode_file.fileobj.tell())

            if datatype == 'gene':
                obj['symbol'] = obj['gene_name']
                obj['tags'] = {}
                # TODO: compute the real coding size here; it is currently
                # filled in later by a separate pass
                obj['coding_size'] = 0
                self._db.genes.insert(obj)

            elif datatype == 'transcript':
                obj['tags'] = {}
                self._db.transcripts.insert(obj)

            elif datatype == 'exon':
                exon_id = obj['exon_id']
                # the exon document keeps a list of transcripts, not a single id
                transcript_id = obj.pop('transcript_id')
                if self._db.exons.find_one({'exon_id': exon_id}):
                    self._db.exons.update({'exon_id': exon_id}, {'$push': {'transcripts': transcript_id}})
                else:
                    obj['transcripts'] = [transcript_id]
                    obj['tags'] = {}
                    self._db.exons.insert(obj)

            elif datatype == 'cds':
                # this works because cds always comes after exon
                # this is obviously an inglorious hack - all the gtf parsing should be improved
                self._db.exons.update({'exon_id': obj['exon_id']}, {'$set': {
                    'cds_start': obj['start'],
                    'cds_stop': obj['stop'],
                    'cds_xstart': obj['xstart'],
                    'cds_xstop': obj['xstop'],
                }})
    finally:
        gencode_file.close()
def get_tissue_expression_values_by_gene(expression_file_name, samples_file_name):
    r"""
    Return iterator of (gene_id, expression array) tuples

    Expression array is:
    expressions: {
        tissue_type: [array of expression values]
    }

    expression_file (RPKM_GeneLevel_September.gct) is in gtex format;
    samples file is just two columns: sample -> tissue type

    Command for getting samples file:
    awk -F"\t" '{ gsub(/ /,"_",$47); gsub(/-/,".",$1); print $1"\t"tolower($47) }' RNA-SeQC_metrics_September.tsv > gtex_samples.txt

    The gene_id yielded is the first column of each data row with everything
    from the first '.' onwards removed. Iteration stops at the first empty
    line. The expression file is closed when the generator finishes or is
    discarded.
    """

    # read samples file to get a map of sample_id -> tissue_type
    tissue_type_map = get_tissue_type_map(samples_file_name)

    with open(expression_file_name) as expression_file:
        size = os.path.getsize(expression_file_name)
        progress = get_progressbar(size, 'Loading GTeX data')

        for i, line in enumerate(expression_file):
            progress.update(expression_file.tell())

            line = line.strip('\n')
            if not line:
                break

            # first two lines are junk; third is the header
            if i < 2:
                continue
            if i == 2:
                # read header of expression file to get tissue type list
                # (used to link column to tissue type)
                # this wouldn't be necessary if samples file is in the same order as expression file,
                # but I don't want to rely on that guarantee (mainly because they have a different # of fields)
                tissues_by_column = get_tissues_by_column(line, tissue_type_map)
                continue

            fields = line.split('\t')
            gene_id = fields[0].split('.')[0]

            yield (gene_id, get_expressions(line, tissues_by_column))
def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None):
    """
    Load the variants of one VCF into the per-family collections.

    family_info_list is a list of dicts; the keys read here are
    ``family_id``, ``coll_name`` (name of the family's collection in
    self._db) and ``individuals`` (ids used to restrict genotypes).

    Indexes on every family collection are dropped before loading. Each VCF
    row is annotated once, then for every family a genotype-restricted copy
    is made and - only if xbrowse_utils considers it relevant for that
    family's individuals - serialised, given its index fields and inserted.

    The VCF handle is closed even if loading fails part-way.
    """
    collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
    for collection in collections.values():
        collection.drop_indexes()

    indiv_id_list = [i for f in family_info_list for i in f['individuals']]

    vcf_file = compressed_file(vcf_file_path)
    try:
        size = os.path.getsize(vcf_file_path)
        progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))

        for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
            progress.update(vcf_file.tell_progress())
            annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)

            for family in family_info_list:
                # the restricted copy is needed for the relevance check itself...
                family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
                if not xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                    continue

                # ...but the document is only built for variants that get stored
                family_variant_dict = family_variant.toJSON()
                _add_index_fields_to_variant(family_variant_dict, annotation)
                collections[family['family_id']].insert(family_variant_dict)
    finally:
        vcf_file.close()
def load_population(self, population):
    """
    Take a population and a data source; extract and load it into annotator

    ``population`` is a dict; ``population['file_type']`` selects the parser
    and every frequency is stored through self._add_population_frequency
    under ``population['slug']``:

      vcf                   - VCF with genotypes; frequency from get_aaf()
      sites_vcf             - sites VCF; frequency read from the INFO field
                              named by ``vcf_info_key`` (default 'AF'). For a
                              1000 Genomes "popmax" key, the maximum over the
                              five super-population *_AF fields is used.
      esp_vcf_dir           - every file in ``dir_path`` is parsed with
                              get_variants_from_esp_file(); the stored value
                              is variant[population['counts_key']]
      counts_file           - tab-separated chrom / pos / ref / alt followed
                              by two count columns; stores column 5 divided
                              by column 6, skipping rows where column 6 is 0
      xbrowse_freq_file     - tab-separated xpos / ref / alt / freq
      tsv_file              - tab-separated chrom / pos / ref / alt / freq
                              with one header line
      sites_vcf_with_counts - sites VCF; stores AC / AN read from the INFO
                              fields ``ac_info_key`` / ``an_info_key``

    Any other file_type is silently ignored. Files ending in .gz or .bgz are
    read through gzip. Every file is closed even if loading fails part-way.
    """

    def open_data_file(path):
        # Returns (readable file, object whose tell() reports how many bytes
        # of the on-disk file have been consumed - used for the progress bar).
        if path.endswith(('.gz', '.bgz')):
            handle = gzip.open(path)
            return handle, handle.fileobj
        handle = open(path)
        return handle, handle

    def info_frequency(variant, info_key, allele_idx):
        # Frequency of one alt allele from a comma-separated INFO value;
        # a variant that lacks the field counts as frequency 0.
        raw_value = variant.extras.get(info_key)
        if raw_value is None:
            return 0.0
        return float(raw_value.split(',')[allele_idx])

    file_type = population['file_type']

    if file_type == 'vcf':
        vcf_file, progress_file = open_data_file(population['file_path'])
        try:
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(size, 'Loading vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False):
                progress.update(progress_file.tell())
                freq = get_aaf(variant)
                self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            vcf_file.close()

    elif file_type == 'sites_vcf':
        vcf_file, progress_file = open_data_file(population['file_path'])
        try:
            size = os.path.getsize(population['file_path'])
            meta_key = population.get('vcf_info_key', 'AF')
            progress = get_progressbar(size, 'Loading sites vcf: {}'.format(population['slug']))

            is_1kg_popmax = "popmax" in meta_key.lower() and ("1000 Genomes" in population["name"])
            if is_1kg_popmax:
                # 1000 Genomes publishes one Number=A allele-frequency INFO
                # field per super-population (EAS/EUR/AFR/AMR/SAS)
                meta_fields = ["EAS_AF", "EUR_AF", "AFR_AF", "AMR_AF", "SAS_AF"]
            else:
                meta_fields = [meta_key]

            for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=meta_fields):
                progress.update(progress_file.tell())
                allele_idx = variant.extras['alt_allele_pos']
                if is_1kg_popmax:
                    # the loop variable must not reuse the name meta_key, or
                    # every variant after the first would take the else branch
                    freq = max([0.0] + [info_frequency(variant, field, allele_idx) for field in meta_fields])
                else:
                    freq = info_frequency(variant, meta_key, allele_idx)
                self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            vcf_file.close()

    #
    # Directory of per-chromosome VCFs that ESP publishes
    #
    elif file_type == 'esp_vcf_dir':
        for filename in os.listdir(population['dir_path']):
            file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
            with open(file_path) as f:
                file_size = os.path.getsize(file_path)
                progress = get_progressbar(file_size, 'Loading ESP file: {}'.format(filename))
                for variant in get_variants_from_esp_file(f):
                    progress.update(f.tell())
                    self._add_population_frequency(
                        variant['xpos'],
                        variant['ref'],
                        variant['alt'],
                        population['slug'],
                        variant[population['counts_key']]
                    )

    #
    # text file of allele counts, as Monkol has been using for the joint calling data
    #
    elif file_type == 'counts_file':
        counts_file, progress_file = open_data_file(population['file_path'])
        try:
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = 'chr' + fields[0]
                pos = int(fields[1])
                xpos = genomeloc.get_single_location(chrom, pos)
                ref = fields[2]
                alt = fields[3]
                if int(fields[5]) == 0:
                    continue
                freq = float(fields[4]) / float(fields[5])
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            counts_file.close()

    # this is now the canonical allele frequency file -
    # tab separated file with xpos / ref / alt / freq
    elif file_type == 'xbrowse_freq_file':
        counts_file, progress_file = open_data_file(population['file_path'])
        try:
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                xpos = int(fields[0])
                ref = fields[1]
                alt = fields[2]
                freq = float(fields[3])
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            counts_file.close()

    elif file_type == 'tsv_file':
        freq_file, progress_file = open_data_file(population['file_path'])
        try:
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))
            header = next(freq_file)
            print("Header: " + header)
            for line in freq_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = fields[0]
                pos = int(fields[1])
                ref = fields[2]
                alt = fields[3]
                freq = float(fields[4])
                xpos = genomeloc.get_single_location(chrom, pos)
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            freq_file.close()

    elif file_type == 'sites_vcf_with_counts':
        vcf_file, progress_file = open_data_file(population['file_path'])
        try:
            size = os.path.getsize(population['file_path'])
            ac_info_key = population['ac_info_key']
            an_info_key = population['an_info_key']

            progress = get_progressbar(size, 'Loading sites vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=[ac_info_key, an_info_key]):
                progress.update(progress_file.tell())

                alt_allele_pos = variant.extras['alt_allele_pos']

                # rows whose counts cannot be parsed are reported and skipped
                try:
                    ac = int(variant.extras.get(ac_info_key).split(',')[alt_allele_pos].replace("NA", "0"))
                except Exception as e:
                    print("Couldn't parse AC value %s from %s: %s %s" % (alt_allele_pos, ac_info_key, variant.extras, e))
                    continue

                try:
                    if "popmax" in ac_info_key.lower():
                        AN_index = alt_allele_pos  # each allele may have a different AN value from a different population
                    else:
                        AN_index = 0
                    an = int(variant.extras.get(an_info_key).split(',')[AN_index].replace("NA", "0"))
                except Exception as e:
                    print("Couldn't parse AN value %s from %s: %s %s" % (alt_allele_pos, an_info_key, variant.extras, e))
                    continue

                if an == 0:
                    freq = 0.0
                else:
                    freq = float(ac) / an
                self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            vcf_file.close()
def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None):
    """
    Load the variants of one VCF into the per-family collections, resuming a
    previous partial load when the VCF is split by chromosome.

    family_info_list is a list of dicts; the keys read here are
    ``family_id``, ``coll_name`` (name of the family's collection in
    self._db) and ``individuals`` (ids used to restrict genotypes).

    Steps:

    1. Ensure an (xpos, ref, alt) index on every family collection.
    2. If the path contains "_chr" or ".chr", read the first variant to find
       the chromosome, look up the highest position already stored for that
       chromosome in each family collection, and start reading (via tabix)
       from the smallest of those positions. Otherwise read the whole file.
    3. For every variant that can be annotated, buffer a genotype-restricted
       copy for each family it is relevant to and that does not already hold
       it. The buffer is written out whenever it exceeds 10000 entries and
       once more after the last row.

    Variants whose alt allele is "*" are skipped. A failure while handling
    one family's copy of a variant is reported on stderr and does not stop
    the load. The VCF / tabix handle is closed on exit.
    """
    collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
    indiv_id_list = [i for f in family_info_list for i in f['individuals']]

    sys.stderr.write("Loading variants for %d families %s from %s\n" % (
        len(family_info_list), family_info_list, vcf_file_path))

    for family in family_info_list:
        print("Indexing family: " + str(family))
        collections[family['family_id']].ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

    # check whether some of the variants for this chromosome has been loaded already
    # if yes, start from the last loaded variant, and not from the beginning
    tabix_file = None
    vcf_file = None
    if "_chr" in vcf_file_path or ".chr" in vcf_file_path:
        # if the VCF files are split by chromosome (eg. for WGS projects), check within the chromosome
        peek_file = compressed_file(vcf_file_path)
        try:
            variant = next(vcf_stuff.iterate_vcf(peek_file, genotypes=False, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map))
            print(vcf_file_path + " - chromsome: " + str(variant.chr))
        finally:
            peek_file.close()

        # xpos encodes the chromosome index in the billions, the position below it
        chr_idx = int(variant.xpos/1e9)
        chrom_name = variant.chr.replace("chr", "")

        # only this file's chromosome matters, so only it is queried
        last_loaded_positions = []
        for family in family_info_list:
            newest = list(collections[family['family_id']].find({'$and': [
                {'xpos': {'$gte': chr_idx*1e9}},
                {'xpos': {'$lt': (chr_idx+1)*1e9}},
            ]}).sort([('xpos', -1)]).limit(1))
            if len(newest) > 0:
                last_loaded_positions.append(newest[0]['xpos'] - chr_idx*1e9)
            else:
                last_loaded_positions.append(0)

        # get the smallest last-loaded variant position for this chromosome across all families
        start_from_pos = int(min(last_loaded_positions))

        print("Start from: %s - %s (%0.1f%% done)" % (chr_idx, start_from_pos, 100.*start_from_pos/CHROMOSOME_SIZES[chrom_name]))
        tabix_file = pysam.TabixFile(vcf_file_path)
        vcf_iter = itertools.chain(tabix_file.header, tabix_file.fetch(chrom_name, start_from_pos, int(2.5e8)))
    else:
        # TODO handle resuming when it's one vcf file, not split by chromosome
        vcf_iter = vcf_file = compressed_file(vcf_file_path)

    size = os.path.getsize(vcf_file_path)
    progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))

    def insert_all_variants_in_buffer(buff, collections_dict):
        # Drains buff ({family_id: [variant dicts]}) into the collections,
        # one document at a time, picking the family at random for each insert.
        for family_id in buff:
            if len(buff[family_id]) == 0:  # defensive programming
                raise ValueError("%s has zero variants to insert. Should not be in buff." % family_id)

        while len(buff) > 0:
            # choose a random family for which to insert a variant from among families that still have variants to insert
            family_id = random.choice(list(buff))

            # pop a variant off the list for this family, and insert it
            family_variant_dict_to_insert = buff[family_id].pop()
            collections_dict[family_id].insert(family_variant_dict_to_insert)

            if len(buff[family_id]) == 0:
                del buff[family_id]  # if no more variants for this family, delete it

    vcf_rows_counter = 0
    variants_buffered_counter = 0
    family_id_to_variant_list = defaultdict(list)  # will accumulate variants to be inserted all at once
    try:
        for variant in vcf_stuff.iterate_vcf(vcf_iter, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
            if vcf_file is not None:
                # byte progress is only available when reading the file sequentially
                progress.update(vcf_file.tell_progress())

            if variant.alt == "*":
                # skip the GATK 3.4 "*" alt allele
                continue

            try:
                annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
            except ValueError as e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                continue

            vcf_rows_counter += 1
            for family in family_info_list:
                try:
                    family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
                    family_variant_dict = family_variant.toJSON()
                    _add_index_fields_to_variant(family_variant_dict, annotation)
                    if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                        collection = collections[family['family_id']]
                        # a resumed load may see rows that are already stored
                        if not collection.find_one({'xpos': family_variant.xpos, 'ref': family_variant.ref, 'alt': family_variant.alt}):
                            family_id_to_variant_list[family['family_id']].append(family_variant_dict)
                            variants_buffered_counter += 1
                except Exception as e:
                    sys.stderr.write("ERROR: on variant %s, family: %s - %s\n" % (variant.toJSON(), family, e))

            if variants_buffered_counter > 10000:
                sys.stderr.write("Inserting %d family-variants from %d vcf rows into %s families\n" % (
                    variants_buffered_counter, vcf_rows_counter, len(family_id_to_variant_list)))
                insert_all_variants_in_buffer(family_id_to_variant_list, collections)
                vcf_rows_counter = 0
                variants_buffered_counter = 0

        # write out whatever is still buffered after the last row
        if variants_buffered_counter > 0:
            insert_all_variants_in_buffer(family_id_to_variant_list, collections)
    finally:
        if tabix_file is not None:
            tabix_file.close()
        if vcf_file is not None:
            vcf_file.close()
def load_population(self, population):
    """
    Take a population and a data source; extract and load it into annotator

    ``population`` is a dict; ``population['file_type']`` selects the parser
    and every frequency is stored through self._add_population_frequency
    under ``population['slug']``:

      vcf                   - VCF with genotypes; frequency from get_aaf()
      sites_vcf             - sites VCF; frequency read from the INFO field
                              named by ``vcf_info_key`` (default 'AF'). For a
                              1000 Genomes "popmax" key, the maximum over the
                              five super-population *_AF fields is used.
      esp_vcf_dir           - every file in ``dir_path`` is parsed with
                              get_variants_from_esp_file(); the stored value
                              is variant[population['counts_key']]
      counts_file           - tab-separated chrom / pos / ref / alt followed
                              by two count columns; stores column 5 divided
                              by column 6, skipping rows where column 6 is 0
      xbrowse_freq_file     - tab-separated xpos / ref / alt / freq
      tsv_file              - tab-separated chrom / pos / ref / alt / freq
                              with one header line
      sites_vcf_with_counts - sites VCF; stores AC / AN read from the INFO
                              fields ``ac_info_key`` / ``an_info_key``

    Any other file_type is silently ignored. Files ending in .gz or .bgz are
    read through gzip. Every file is closed even if loading fails part-way.
    """

    def open_data_file(path):
        # Returns (readable file, object whose tell() reports how many bytes
        # of the on-disk file have been consumed - used for the progress bar).
        if path.endswith(('.gz', '.bgz')):
            handle = gzip.open(path)
            return handle, handle.fileobj
        handle = open(path)
        return handle, handle

    def info_frequency(variant, info_key, allele_idx):
        # Frequency of one alt allele from a comma-separated INFO value;
        # a variant that lacks the field counts as frequency 0.
        raw_value = variant.extras.get(info_key)
        if raw_value is None:
            return 0.0
        return float(raw_value.split(',')[allele_idx])

    file_type = population['file_type']

    if file_type == 'vcf':
        vcf_file, progress_file = open_data_file(population['file_path'])
        try:
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(size, 'Loading vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False):
                progress.update(progress_file.tell())
                freq = get_aaf(variant)
                self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            vcf_file.close()

    elif file_type == 'sites_vcf':
        vcf_file, progress_file = open_data_file(population['file_path'])
        try:
            size = os.path.getsize(population['file_path'])
            meta_key = population.get('vcf_info_key', 'AF')
            progress = get_progressbar(size, 'Loading sites vcf: {}'.format(population['slug']))

            is_1kg_popmax = "popmax" in meta_key.lower() and ("1000 Genomes" in population["name"])
            if is_1kg_popmax:
                # 1000 Genomes publishes one Number=A allele-frequency INFO
                # field per super-population (EAS/EUR/AFR/AMR/SAS)
                meta_fields = ["EAS_AF", "EUR_AF", "AFR_AF", "AMR_AF", "SAS_AF"]
            else:
                meta_fields = [meta_key]

            for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=meta_fields):
                progress.update(progress_file.tell())
                allele_idx = variant.extras['alt_allele_pos']
                if is_1kg_popmax:
                    # the loop variable must not reuse the name meta_key, or
                    # every variant after the first would take the else branch
                    freq = max([0.0] + [info_frequency(variant, field, allele_idx) for field in meta_fields])
                else:
                    freq = info_frequency(variant, meta_key, allele_idx)
                self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            vcf_file.close()

    #
    # Directory of per-chromosome VCFs that ESP publishes
    #
    elif file_type == 'esp_vcf_dir':
        for filename in os.listdir(population['dir_path']):
            file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
            with open(file_path) as f:
                file_size = os.path.getsize(file_path)
                progress = get_progressbar(file_size, 'Loading ESP file: {}'.format(filename))
                for variant in get_variants_from_esp_file(f):
                    progress.update(f.tell())
                    self._add_population_frequency(
                        variant['xpos'],
                        variant['ref'],
                        variant['alt'],
                        population['slug'],
                        variant[population['counts_key']]
                    )

    #
    # text file of allele counts, as Monkol has been using for the joint calling data
    #
    elif file_type == 'counts_file':
        counts_file, progress_file = open_data_file(population['file_path'])
        try:
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = 'chr' + fields[0]
                pos = int(fields[1])
                xpos = genomeloc.get_single_location(chrom, pos)
                ref = fields[2]
                alt = fields[3]
                if int(fields[5]) == 0:
                    continue
                freq = float(fields[4]) / float(fields[5])
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            counts_file.close()

    # this is now the canonical allele frequency file -
    # tab separated file with xpos / ref / alt / freq
    elif file_type == 'xbrowse_freq_file':
        counts_file, progress_file = open_data_file(population['file_path'])
        try:
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                xpos = int(fields[0])
                ref = fields[1]
                alt = fields[2]
                freq = float(fields[3])
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            counts_file.close()

    elif file_type == 'tsv_file':
        freq_file, progress_file = open_data_file(population['file_path'])
        try:
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))
            header = next(freq_file)
            print("Header: " + header)
            for line in freq_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = fields[0]
                pos = int(fields[1])
                ref = fields[2]
                alt = fields[3]
                freq = float(fields[4])
                xpos = genomeloc.get_single_location(chrom, pos)
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            freq_file.close()

    elif file_type == 'sites_vcf_with_counts':
        vcf_file, progress_file = open_data_file(population['file_path'])
        try:
            size = os.path.getsize(population['file_path'])
            ac_info_key = population['ac_info_key']
            an_info_key = population['an_info_key']

            progress = get_progressbar(size, 'Loading sites vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=[ac_info_key, an_info_key]):
                progress.update(progress_file.tell())

                alt_allele_pos = variant.extras['alt_allele_pos']

                # rows whose counts cannot be parsed are reported and skipped
                try:
                    ac = int(variant.extras.get(ac_info_key).split(',')[alt_allele_pos].replace("NA", "0"))
                except Exception as e:
                    print("Couldn't parse AC value %s from %s: %s %s" % (alt_allele_pos, ac_info_key, variant.extras, e))
                    continue

                try:
                    if "popmax" in ac_info_key.lower():
                        AN_index = alt_allele_pos  # each allele may have a different AN value from a different population
                    else:
                        AN_index = 0
                    an = int(variant.extras.get(an_info_key).split(',')[AN_index].replace("NA", "0"))
                except Exception as e:
                    print("Couldn't parse AN value %s from %s: %s %s" % (alt_allele_pos, an_info_key, variant.extras, e))
                    continue

                if an == 0:
                    freq = 0.0
                else:
                    freq = float(ac) / an
                self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            vcf_file.close()
def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None):
    """
    Load the variants of one VCF into the per-family collections.

    family_info_list is a list of dicts; the keys read here are
    ``family_id``, ``coll_name`` (name of the family's collection in
    self._db) and ``individuals`` (ids used to restrict genotypes).

    Every VCF row that can be annotated is copied once per family with
    genotypes restricted to that family. A copy is buffered when
    xbrowse_utils considers it relevant for the family and the family's
    collection does not already contain the same (xpos, ref, alt). The
    buffer is written out whenever it exceeds 10000 entries and once more
    after the last row, so the tail of the file is not lost.

    Rows for which the annotator raises ValueError are reported on stderr
    and skipped. The VCF handle is closed even if loading fails part-way.
    """
    collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
    indiv_id_list = [i for f in family_info_list for i in f['individuals']]

    sys.stderr.write("Loading variants for %d families %s from %s\n" % (
        len(family_info_list), family_info_list, vcf_file_path))

    def insert_all_variants_in_buffer(buff, collections_dict):
        # Drains buff ({family_id: [variant dicts]}) into the collections,
        # one document at a time, picking the family at random for each insert.
        for family_id in buff:
            if len(buff[family_id]) == 0:  # defensive programming
                raise ValueError("%s has zero variants to insert. Should not be in buff." % family_id)

        while len(buff) > 0:
            # choose a random family for which to insert a variant from among families that still have variants to insert
            family_id = random.choice(list(buff))

            # pop a variant off the list for this family, and insert it
            family_variant_dict_to_insert = buff[family_id].pop()
            collections_dict[family_id].insert(family_variant_dict_to_insert)

            if len(buff[family_id]) == 0:
                del buff[family_id]  # if no more variants for this family, delete it

    vcf_rows_counter = 0
    variants_buffered_counter = 0
    family_id_to_variant_list = defaultdict(list)  # will accumulate variants to be inserted all at once

    vcf_file = compressed_file(vcf_file_path)
    try:
        size = os.path.getsize(vcf_file_path)
        progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))

        for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
            progress.update(vcf_file.tell_progress())
            try:
                annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
            except ValueError as e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                continue

            vcf_rows_counter += 1
            for family in family_info_list:
                # the restricted copy is needed for the relevance check itself
                family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
                family_variant_dict = family_variant.toJSON()
                _add_index_fields_to_variant(family_variant_dict, annotation)
                if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                    collection = collections[family['family_id']]
                    # skip rows that an earlier (partial) load already stored
                    if not collection.find_one({'xpos': family_variant.xpos, 'ref': family_variant.ref, 'alt': family_variant.alt}):
                        family_id_to_variant_list[family['family_id']].append(family_variant_dict)
                        variants_buffered_counter += 1

            if variants_buffered_counter > 10000:
                print(datetime.now().strftime("%m/%d/%Y %H:%M:%S") + "-- inserting %d family-variants from %d vcf rows into %s families" % (variants_buffered_counter, vcf_rows_counter, len(family_id_to_variant_list)))

                insert_all_variants_in_buffer(family_id_to_variant_list, collections)

                assert len(family_id_to_variant_list) == 0
                vcf_rows_counter = 0
                variants_buffered_counter = 0

        # write out whatever is still buffered after the last row
        if variants_buffered_counter > 0:
            insert_all_variants_in_buffer(family_id_to_variant_list, collections)
    finally:
        vcf_file.close()
def load_population(self, population):
    """
    Take a population and a data source; extract and load it into annotator

    ``population`` is a dict; ``population['file_type']`` selects the parser
    and every frequency is stored through self._add_population_frequency
    under ``population['slug']``:

      vcf               - VCF with genotypes; frequency from get_aaf()
      sites_vcf         - sites VCF; frequency read from the INFO field named
                          by ``vcf_info_key``
      esp_vcf_dir       - every file in ``dir_path`` is parsed with
                          get_variants_from_esp_file(); the stored value is
                          variant[population['counts_key']]
      counts_file       - tab-separated chrom / pos / ref / alt followed by
                          two count columns; stores column 5 divided by
                          column 6, skipping rows where column 6 is 0
      xbrowse_freq_file - tab-separated xpos / ref / alt / freq

    Files ending in .gz are read through gzip. Every file is closed even if
    loading fails part-way.
    """

    def open_data_file(path):
        # Returns (readable file, object whose tell() reports how many bytes
        # of the on-disk file have been consumed - used for the progress bar).
        if path.endswith('.gz'):
            handle = gzip.open(path)
            return handle, handle.fileobj
        handle = open(path)
        return handle, handle

    if population['file_type'] == 'vcf':
        vcf_file, progress_file = open_data_file(population['file_path'])
        try:
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(size, 'Loading vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False):
                progress.update(progress_file.tell())
                freq = get_aaf(variant)
                self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            vcf_file.close()

    elif population['file_type'] == 'sites_vcf':
        vcf_file, progress_file = open_data_file(population['file_path'])
        try:
            size = os.path.getsize(population['file_path'])
            meta_key = population['vcf_info_key']
            progress = get_progressbar(size, 'Loading sites vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=[meta_key]):
                progress.update(progress_file.tell())
                raw_value = variant.extras.get(meta_key)
                if raw_value is None:
                    # a variant that lacks the INFO field counts as frequency 0
                    freq = 0.0
                else:
                    freq = float(raw_value.split(',')[variant.extras['alt_allele_pos']])
                self._add_population_frequency(
                    variant.xpos,
                    variant.ref,
                    variant.alt,
                    population['slug'],
                    freq
                )
        finally:
            vcf_file.close()

    #
    # Directory of per-chromosome VCFs that ESP publishes
    #
    elif population['file_type'] == 'esp_vcf_dir':
        for filename in os.listdir(population['dir_path']):
            file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
            with open(file_path) as f:
                file_size = os.path.getsize(file_path)
                progress = get_progressbar(file_size, 'Loading ESP file: {}'.format(filename))
                for variant in get_variants_from_esp_file(f):
                    progress.update(f.tell())
                    self._add_population_frequency(
                        variant['xpos'],
                        variant['ref'],
                        variant['alt'],
                        population['slug'],
                        variant[population['counts_key']]
                    )

    #
    # text file of allele counts, as Monkol has been using for the joint calling data
    #
    elif population['file_type'] == 'counts_file':
        counts_file, progress_file = open_data_file(population['file_path'])
        try:
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = 'chr' + fields[0]
                pos = int(fields[1])
                xpos = genomeloc.get_single_location(chrom, pos)
                ref = fields[2]
                alt = fields[3]
                if int(fields[5]) == 0:
                    continue
                freq = float(fields[4]) / float(fields[5])
                self._add_population_frequency(
                    xpos,
                    ref,
                    alt,
                    population['slug'],
                    freq
                )
        finally:
            counts_file.close()

    # this is now the canonical allele frequency file -
    # tab separated file with xpos / ref / alt / freq
    elif population['file_type'] == 'xbrowse_freq_file':
        counts_file, progress_file = open_data_file(population['file_path'])
        try:
            size = os.path.getsize(population['file_path'])
            progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                xpos = int(fields[0])
                ref = fields[1]
                alt = fields[2]
                freq = float(fields[3])
                self._add_population_frequency(
                    xpos,
                    ref,
                    alt,
                    population['slug'],
                    freq
                )
        finally:
            counts_file.close()