def add_vcf_file_to_annotator(self, vcf_file_path, force_all=False):
    """
    Add the variants in vcf_file_path to annotator
    Convenience wrapper around add_variants_to_annotator

    The VCF is streamed and its variant tuples are handed to
    add_variants_to_annotator in batches of 100000, followed by one final
    call with whatever is left over (possibly an empty list). After all
    batches have been submitted, the path is recorded in the annotator
    datastore's vcf_files collection with the current UTC time.

    Args:
        vcf_file_path: path of the VCF file; opened with compressed_file().
        force_all: forwarded to add_variants_to_annotator. When true, the
            "already annotated" lookup is skipped as well.

    Returns:
        None. Returns early (after printing a message) when force_all is
        false and vcf_files already has a record for vcf_file_path.
    """
    if not force_all and self.get_annotator_datastore().vcf_files.find_one(
            {'vcf_file_path': vcf_file_path}):
        print("VCF already annotated")
        return

    print("Scanning VCF file first...")
    vcf_file = compressed_file(vcf_file_path)
    try:
        variant_t_list = []
        for variant_t in vcf_stuff.iterate_tuples(vcf_file):
            variant_t_list.append(variant_t)
            if len(variant_t_list) == 100000:
                print("Adding another 100000 variants, through {}".format(
                    variant_t_list[-1][0]))
                self.add_variants_to_annotator(variant_t_list, force_all)
                variant_t_list = []
        # submit the last, partially filled batch
        self.add_variants_to_annotator(variant_t_list, force_all)
    finally:
        # release the file handle even when annotation fails part-way
        vcf_file.close()

    # only reached when every batch was submitted without an exception
    self.get_annotator_datastore().vcf_files.insert({
        'vcf_file_path': vcf_file_path,
        'date_added': datetime.datetime.utcnow()
    })
def add_vcf_file_to_annotator(self, vcf_file_path, force_all=False):
    """
    Add the variants in vcf_file_path to annotator
    Convenience wrapper around add_variants_to_annotator

    The VCF is streamed and its variant tuples are handed to
    add_variants_to_annotator in batches of 100000, followed by one final
    call with whatever is left over (possibly an empty list).

    Args:
        vcf_file_path: path of the VCF file; opened with compressed_file().
        force_all: forwarded unchanged to add_variants_to_annotator.

    Returns:
        None.
    """
    print("Scanning VCF file first...")
    vcf_file = compressed_file(vcf_file_path)
    try:
        variant_t_list = []
        for variant_t in vcf_stuff.iterate_tuples(vcf_file):
            variant_t_list.append(variant_t)
            if len(variant_t_list) == 100000:
                print("Adding another 100000 variants, through {}".format(
                    variant_t_list[-1][0]))
                self.add_variants_to_annotator(variant_t_list, force_all)
                variant_t_list = []
        # submit the last, partially filled batch
        self.add_variants_to_annotator(variant_t_list, force_all)
    finally:
        # release the file handle even when annotation fails part-way
        vcf_file.close()
def add_vcf_file_to_annotator(self, vcf_file_path, force_all=False):
    """
    Add the variants in vcf_file_path to annotator
    Convenience wrapper around add_variants_to_annotator

    The VCF is streamed and its variant tuples are handed to
    add_variants_to_annotator in batches of 100000, followed by one final
    call with whatever is left over (possibly an empty list). After all
    batches have been submitted, the path is recorded in self._db.vcf_files
    with the current UTC time.

    Args:
        vcf_file_path: path of the VCF file; opened with compressed_file().
        force_all: forwarded to add_variants_to_annotator. When true, the
            "already annotated" lookup is skipped as well.

    Returns:
        None. Returns early (after printing a message) when force_all is
        false and vcf_files already has a record for vcf_file_path.
    """
    if not force_all and self._db.vcf_files.find_one({'vcf_file_path': vcf_file_path}):
        print("VCF already annotated")
        return

    print("Scanning VCF file first...")
    vcf_file = compressed_file(vcf_file_path)
    try:
        variant_t_list = []
        for variant_t in vcf_stuff.iterate_tuples(vcf_file):
            variant_t_list.append(variant_t)
            if len(variant_t_list) == 100000:
                print("Adding another 100000 variants, through {}".format(variant_t_list[-1][0]))
                self.add_variants_to_annotator(variant_t_list, force_all)
                variant_t_list = []
        # submit the last, partially filled batch
        self.add_variants_to_annotator(variant_t_list, force_all)
    finally:
        # release the file handle even when annotation fails part-way
        vcf_file.close()

    # only reached when every batch was submitted without an exception
    self._db.vcf_files.insert({'vcf_file_path': vcf_file_path, 'date_added': datetime.datetime.utcnow()})
def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None):
    """
    Load one VCF file into the variant collections of a set of families.

    Every variant in the file is annotated once, then restricted to each
    family's individuals; the restricted variant is inserted into that
    family's collection only when is_variant_relevant_for_individuals
    accepts it. Indexes on all target collections are dropped before
    loading starts.

    Args:
        family_info_list: list of dicts; each is read for the keys
            'family_id', 'coll_name' and 'individuals'.
        vcf_file_path: path of the VCF file; opened with compressed_file().
        reference_populations: forwarded to the annotator as `populations`.
        vcf_id_map: forwarded unchanged to vcf_stuff.iterate_vcf.

    Returns:
        None.
    """
    collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
    for collection in collections.values():
        collection.drop_indexes()

    indiv_id_list = [i for f in family_info_list for i in f['individuals']]

    vcf_file = compressed_file(vcf_file_path)
    try:
        size = os.path.getsize(vcf_file_path)
        progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))

        for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
            progress.update(vcf_file.tell_progress())
            annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)

            for family in family_info_list:
                family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
                if not xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                    continue
                # serialise and index only the variants that are actually
                # stored; irrelevant ones used to be converted and discarded
                family_variant_dict = family_variant.toJSON()
                _add_index_fields_to_variant(family_variant_dict, annotation)
                collections[family['family_id']].insert(family_variant_dict)
    finally:
        # release the file handle even when loading fails part-way
        vcf_file.close()
def _add_vcf_file_for_family_set(self,
                                 family_info_list,
                                 vcf_file_path,
                                 reference_populations=None,
                                 vcf_id_map=None,
                                 start_from_chrom=None,
                                 end_with_chrom=None):
    """
    Buffer the variants of one VCF file for a set of family collections.

    Steps visible in this block:
      1. Map each family_id to its collection in self._db and ensure an
         (xpos, ref, alt) index on it.
      2. Pick the variant source:
         - path contains "_chr" or ".chr": resume a per-chromosome file via
           tabix from the smallest last-loaded position across families;
         - start_from_chrom / end_with_chrom given: chain tabix fetches for
           that inclusive chromosome range;
         - otherwise: read the whole file through compressed_file().
      3. Annotate every variant and, per family, buffer the family-restricted
         variant dict when it is relevant and not already in the collection.

    NOTE(review): in the lines shown here the buffer is filled but
    insert_all_variants_in_buffer is never called - presumably the flush
    lives in code that is not part of this excerpt; confirm.

    Args:
        family_info_list: list of dicts; each is read for the keys
            'family_id', 'coll_name' and 'individuals'.
        vcf_file_path: path of the VCF file.
        reference_populations: forwarded to the annotator as `populations`.
        vcf_id_map: forwarded unchanged to vcf_stuff.iterate_vcf.
        start_from_chrom: optional first chromosome to load ("chr" prefix
            and case are normalised before lookup).
        end_with_chrom: optional last chromosome to load (inclusive).
    """
    collections = {
        f['family_id']: self._db[f['coll_name']]
        for f in family_info_list
    }
    #for collection in collections.values():
    #    collection.drop_indexes()
    indiv_id_list = [i for f in family_info_list for i in f['individuals']]
    # number_of_families looks unused but is consumed through locals() below
    number_of_families = len(family_info_list)
    sys.stderr.write(
        "Loading variants for %(number_of_families)d families %(family_info_list)s from %(vcf_file_path)s\n"
        % locals())

    for family in family_info_list:
        print("Indexing family: " + str(family))
        collection = collections[family['family_id']]
        # the index backs the per-variant find_one() duplicate check further down
        collection.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

    # check whether some of the variants for this chromosome has been loaded already
    # if yes, start from the last loaded variant, and not from the beginning
    if "_chr" in vcf_file_path or ".chr" in vcf_file_path:
        # if the VCF files are split by chromosome (eg. for WGS projects), check within the chromosome
        vcf_file = compressed_file(vcf_file_path)
        # peek at the first record only to learn which chromosome this file holds
        variant = next(
            vcf_stuff.iterate_vcf(vcf_file,
                                  genotypes=False,
                                  indiv_id_list=indiv_id_list,
                                  vcf_id_map=vcf_id_map))
        print(vcf_file_path + " - chromsome: " + str(variant.chr))
        vcf_file.close()

        # xpos is treated as chrom_index * 1e9 + position (see the range
        # query and the subtraction below)
        position_per_chrom = {}
        for chrom in range(1, 24):
            position_per_chrom[chrom] = defaultdict(int)
            for family in family_info_list:
                #variants = collections[family['family_id']].find().sort([('xpos',-1)]).limit(1)
                # highest xpos already stored for this family on this chromosome
                variants = list(collections[family['family_id']].find({
                    '$and': [{
                        'xpos': {
                            '$gte': chrom * 1e9
                        }
                    }, {
                        'xpos': {
                            '$lt': (chrom + 1) * 1e9
                        }
                    }]
                }).sort([('xpos', -1)]).limit(1))
                if len(variants) > 0:
                    position_per_chrom[chrom][family[
                        'family_id']] = variants[0]['xpos'] - chrom * 1e9
                else:
                    position_per_chrom[chrom][family['family_id']] = 0

        for chrom in range(1, 24):
            position_per_chrom[chrom] = min(
                position_per_chrom[chrom].values()
            )  # get the smallest last-loaded variant position for this chromosome across all families

        chr_idx = int(variant.xpos / 1e9)
        # NOTE(review): only indices 1..23 were filled in above, so a file
        # whose first variant maps to any other index raises KeyError here,
        # and an empty family_info_list makes min() raise ValueError - confirm
        start_from_pos = int(position_per_chrom[chr_idx])
        print("Start from: %s - %s (%0.1f%% done)" %
              (chr_idx, start_from_pos, 100. * start_from_pos /
               CHROMOSOME_SIZES[variant.chr.replace("chr", "")]))
        tabix_file = pysam.TabixFile(vcf_file_path)
        # header lines first, then records from the resume position up to 2.5e8
        vcf_iter = itertools.chain(
            tabix_file.header,
            tabix_file.fetch(variant.chr.replace("chr", ""), start_from_pos,
                             int(2.5e8)))
    elif start_from_chrom or end_with_chrom:
        if start_from_chrom:
            print("Start chrom: chr%s" % start_from_chrom)
        if end_with_chrom:
            print("End chrom: chr%s" % end_with_chrom)

        chrom_list = list(map(str, range(1, 23))) + ['X', 'Y']
        chrom_list_start_index = 0
        if start_from_chrom:
            chrom_list_start_index = chrom_list.index(
                start_from_chrom.replace("chr", "").upper())

        # default end index is past the last element; the "+ 1" in the slice
        # below makes an explicit end_with_chrom inclusive
        chrom_list_end_index = len(chrom_list)
        if end_with_chrom:
            chrom_list_end_index = chrom_list.index(
                end_with_chrom.replace("chr", "").upper())

        tabix_file = pysam.TabixFile(vcf_file_path)
        vcf_iter = tabix_file.header
        for chrom in chrom_list[
                chrom_list_start_index:chrom_list_end_index + 1]:
            print("Will load chrom: " + chrom)
            try:
                vcf_iter = itertools.chain(vcf_iter, tabix_file.fetch(chrom))
            except ValueError as e:
                # presumably raised by fetch() for a chromosome missing from
                # the index - that chromosome is then skipped; confirm
                print("WARNING: " + str(e))
    else:
        vcf_iter = vcf_file = compressed_file(vcf_file_path)
        # TODO handle case where it's one vcf file, not split by chromosome

    # NOTE(review): size is only referenced by the commented-out progress bar
    size = os.path.getsize(vcf_file_path)
    #progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))

    def insert_all_variants_in_buffer(buff, collections_dict):
        # Drain buff ({family_id: [variant_dict, ...]}) into the matching
        # collections, one variant at a time, choosing the family at random
        # for each insert. buff is emptied in place.
        for family_id in buff:
            if len(buff[family_id]) == 0:  # defensive programming
                raise ValueError(
                    "%s has zero variants to insert. Should not be in buff."
                    % family_id)

        while len(buff) > 0:
            # choose a random family for which to insert a variant from among families that still have variants to insert
            # NOTE(review): random.choice needs a sequence, so this relies on
            # Python 2's list-returning dict.keys()
            family_id = random.choice(buff.keys())

            # pop a variant off the list for this family, and insert it
            family_variant_dict_to_insert = buff[family_id].pop()
            c = collections_dict[family_id]
            c.insert(family_variant_dict_to_insert)

            if len(buff[family_id]) == 0:
                del buff[
                    family_id]  # if no more variants for this family, delete it

    vcf_rows_counter = 0
    variants_buffered_counter = 0
    family_id_to_variant_list = defaultdict(
        list)  # will accumulate variants to be inserted all at once
    for variant in vcf_stuff.iterate_vcf(vcf_iter,
                                         genotypes=True,
                                         indiv_id_list=indiv_id_list,
                                         vcf_id_map=vcf_id_map):
        if variant.alt == "*":
            #print("Skipping GATK 3.4 * alt allele: " + str(variant.unique_tuple()))
            continue
        try:
            annotation = self._annotator.get_annotation(
                variant.xpos,
                variant.ref,
                variant.alt,
                populations=reference_populations)
        # variants the annotator rejects with ValueError are logged and skipped
        # (Python 2 "except X, e" syntax)
        except ValueError, e:
            sys.stderr.write("WARNING: " + str(e) + "\n")
            continue
        vcf_rows_counter += 1
        for family in family_info_list:
            # TODO: can we move this inside the if relevant clause below?
            try:
                family_variant = variant.make_copy(
                    restrict_to_genotypes=family['individuals'])
                family_variant_dict = family_variant.toJSON()
                _add_index_fields_to_variant(family_variant_dict, annotation)
                if xbrowse_utils.is_variant_relevant_for_individuals(
                        family_variant, family['individuals']):
                    collection = collections[family['family_id']]
                    # skip variants already stored for this family
                    if not collection.find_one({
                            'xpos': family_variant.xpos,
                            'ref': family_variant.ref,
                            'alt': family_variant.alt
                    }):
                        family_id_to_variant_list[family[
                            'family_id']].append(family_variant_dict)
                        variants_buffered_counter += 1
            # best-effort: a failure for one family/variant is logged and the
            # load continues
            except Exception, e:
                sys.stderr.write(
                    "ERROR: on variant %s, family: %s - %s\n" %
                    (variant.toJSON(), family, e))
import sys

from xbrowse.parsers.vcf_stuff import iterate_vcf
from xbrowse.utils import get_aaf, compressed_file


def main(vcf_file_path):
    """
    Print one tab-separated line per variant in a VCF file.

    Columns are: xpos, ref, alt and the value returned by get_aaf() for the
    variant.

    Args:
        vcf_file_path: path of the VCF file; opened with compressed_file().
    """
    vcf_file = compressed_file(vcf_file_path)
    try:
        for variant in iterate_vcf(vcf_file, genotypes=True):
            print('\t'.join([
                str(variant.xpos),
                variant.ref,
                variant.alt,
                str(get_aaf(variant)),
            ]))
    finally:
        # release the file handle even if parsing or printing fails
        vcf_file.close()


if __name__ == '__main__':
    main(sys.argv[1])
def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None):
    """
    Buffer the variants of one VCF file for a set of family collections.

    Steps visible in this block:
      1. Map each family_id to its collection in self._db and ensure an
         (xpos, ref, alt) index on it.
      2. Pick the variant source: when the path contains "_chr" or ".chr",
         resume a per-chromosome file via tabix from the smallest last-loaded
         position across families; otherwise read the whole file through
         compressed_file().
      3. Annotate every variant and, per family, buffer the family-restricted
         variant dict when it is relevant and not already in the collection.

    NOTE(review): in the lines shown here the buffer is filled but
    insert_all_variants_in_buffer is never called, and `progress` is created
    but never updated - presumably both happen in code that is not part of
    this excerpt; confirm.

    Args:
        family_info_list: list of dicts; each is read for the keys
            'family_id', 'coll_name' and 'individuals'.
        vcf_file_path: path of the VCF file.
        reference_populations: forwarded to the annotator as `populations`.
        vcf_id_map: forwarded unchanged to vcf_stuff.iterate_vcf.
    """
    collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
    #for collection in collections.values():
    #    collection.drop_indexes()
    indiv_id_list = [i for f in family_info_list for i in f['individuals']]
    # number_of_families looks unused but is consumed through locals() below
    number_of_families = len(family_info_list)
    sys.stderr.write("Loading variants for %(number_of_families)d families %(family_info_list)s from %(vcf_file_path)s\n" % locals())

    for family in family_info_list:
        print("Indexing family: " + str(family))
        collection = collections[family['family_id']]
        # the index backs the per-variant find_one() duplicate check further down
        collection.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

    # check whether some of the variants for this chromosome has been loaded already
    # if yes, start from the last loaded variant, and not from the beginning
    if "_chr" in vcf_file_path or ".chr" in vcf_file_path:
        # if the VCF files are split by chromosome (eg. for WGS projects), check within the chromosome
        vcf_file = compressed_file(vcf_file_path)
        # peek at the first record only to learn which chromosome this file holds
        variant = next(vcf_stuff.iterate_vcf(vcf_file, genotypes=False, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map))
        print(vcf_file_path + " - chromsome: " + str(variant.chr))
        vcf_file.close()

        # xpos is treated as chrom_index * 1e9 + position (see the range
        # query and the subtraction below)
        position_per_chrom = {}
        for chrom in range(1,24):
            position_per_chrom[chrom] = defaultdict(int)
            for family in family_info_list:
                #variants = collections[family['family_id']].find().sort([('xpos',-1)]).limit(1)
                # highest xpos already stored for this family on this chromosome
                variants = list(collections[family['family_id']].find({'$and': [{'xpos': { '$gte': chrom*1e9 }}, {'xpos': { '$lt': (chrom+1)*1e9}}] }).sort([('xpos',-1)]).limit(1))
                if len(variants) > 0:
                    position_per_chrom[chrom][family['family_id']] = variants[0]['xpos'] - chrom*1e9
                else:
                    position_per_chrom[chrom][family['family_id']] = 0

        for chrom in range(1,24):
            position_per_chrom[chrom] = min(position_per_chrom[chrom].values()) # get the smallest last-loaded variant position for this chromosome across all families

        chr_idx = int(variant.xpos/1e9)
        # NOTE(review): only indices 1..23 were filled in above, so a file
        # whose first variant maps to any other index raises KeyError here,
        # and an empty family_info_list makes min() raise ValueError - confirm
        start_from_pos = int(position_per_chrom[chr_idx])

        print("Start from: %s - %s (%0.1f%% done)" % (chr_idx, start_from_pos, 100.*start_from_pos/CHROMOSOME_SIZES[variant.chr.replace("chr", "")]))
        tabix_file = pysam.TabixFile(vcf_file_path)
        # header lines first, then records from the resume position up to 2.5e8
        vcf_iter = itertools.chain(tabix_file.header, tabix_file.fetch(variant.chr.replace("chr", ""), start_from_pos, int(2.5e8)))
    else:
        vcf_iter = vcf_file = compressed_file(vcf_file_path)
        # TODO handle case where it's one vcf file, not split by chromosome

    size = os.path.getsize(vcf_file_path)
    progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))

    def insert_all_variants_in_buffer(buff, collections_dict):
        # Drain buff ({family_id: [variant_dict, ...]}) into the matching
        # collections, one variant at a time, choosing the family at random
        # for each insert. buff is emptied in place.
        for family_id in buff:
            if len(buff[family_id]) == 0:  # defensive programming
                raise ValueError("%s has zero variants to insert. Should not be in buff." % family_id)

        while len(buff) > 0:
            # choose a random family for which to insert a variant from among families that still have variants to insert
            # NOTE(review): random.choice needs a sequence, so this relies on
            # Python 2's list-returning dict.keys()
            family_id = random.choice(buff.keys())

            # pop a variant off the list for this family, and insert it
            family_variant_dict_to_insert = buff[family_id].pop()
            c = collections_dict[family_id]
            c.insert(family_variant_dict_to_insert)

            if len(buff[family_id]) == 0:
                del buff[family_id]  # if no more variants for this family, delete it

    vcf_rows_counter = 0
    variants_buffered_counter = 0
    family_id_to_variant_list = defaultdict(list)  # will accumulate variants to be inserted all at once
    for variant in vcf_stuff.iterate_vcf(vcf_iter, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
        if variant.alt == "*":
            #print("Skipping GATK 3.4 * alt allele: " + str(variant.unique_tuple()))
            continue
        try:
            annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
        # variants the annotator rejects with ValueError are logged and skipped
        # (Python 2 "except X, e" syntax)
        except ValueError, e:
            sys.stderr.write("WARNING: " + str(e) + "\n")
            continue
        vcf_rows_counter += 1
        for family in family_info_list:
            # TODO: can we move this inside the if relevant clause below?
            try:
                family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
                family_variant_dict = family_variant.toJSON()
                _add_index_fields_to_variant(family_variant_dict, annotation)
                if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                    collection = collections[family['family_id']]
                    # skip variants already stored for this family
                    if not collection.find_one({'xpos': family_variant.xpos, 'ref': family_variant.ref, 'alt': family_variant.alt}):
                        family_id_to_variant_list[family['family_id']].append(family_variant_dict)
                        variants_buffered_counter += 1
            # best-effort: a failure for one family/variant is logged and the
            # load continues
            except Exception, e:
                sys.stderr.write("ERROR: on variant %s, family: %s - %s\n" % (variant.toJSON(), family, e))
def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None):
    """
    Load one VCF file into the variant collections of a set of families,
    inserting in buffered batches.

    Every variant is annotated once; for each family the family-restricted
    variant dict is buffered when it is relevant for the family's individuals
    and no document with the same (xpos, ref, alt) exists in the family's
    collection yet. Once more than 10000 family-variants are buffered, the
    buffer is drained into the collections and the counters are reset.

    NOTE(review): whatever is still buffered when the VCF ends is not
    flushed in the lines shown here - presumably that happens in code after
    this excerpt; confirm.

    Args:
        family_info_list: list of dicts; each is read for the keys
            'family_id', 'coll_name' and 'individuals'.
        vcf_file_path: path of the VCF file; opened with compressed_file().
        reference_populations: forwarded to the annotator as `populations`.
        vcf_id_map: forwarded unchanged to vcf_stuff.iterate_vcf.
    """
    collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
    #for collection in collections.values():
    #    collection.drop_indexes()
    indiv_id_list = [i for f in family_info_list for i in f['individuals']]
    # number_of_families looks unused but is consumed through locals() below
    number_of_families = len(family_info_list)
    sys.stderr.write("Loading variants for %(number_of_families)d families %(family_info_list)s from %(vcf_file_path)s\n" % locals())

    #for family in family_info_list:
    #    print("Indexing family: " + str(family))
    #    collection = collections[family['family_id']]
    #    collection.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

    vcf_file = compressed_file(vcf_file_path)
    size = os.path.getsize(vcf_file_path)
    progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))

    def insert_all_variants_in_buffer(buff, collections_dict):
        # Drain buff ({family_id: [variant_dict, ...]}) into the matching
        # collections, one variant at a time, choosing the family at random
        # for each insert. buff is emptied in place.
        for family_id in buff:
            if len(buff[family_id]) == 0:  # defensive programming
                raise ValueError("%s has zero variants to insert. Should not be in buff." % family_id)

        while len(buff) > 0:
            # choose a random family for which to insert a variant from among families that still have variants to insert
            # NOTE(review): random.choice needs a sequence, so this relies on
            # Python 2's list-returning dict.keys()
            family_id = random.choice(buff.keys())

            # pop a variant off the list for this family, and insert it
            family_variant_dict_to_insert = buff[family_id].pop()
            c = collections_dict[family_id]
            c.insert(family_variant_dict_to_insert)

            if len(buff[family_id]) == 0:
                del buff[family_id]  # if no more variants for this family, delete it

    vcf_rows_counter = 0
    variants_buffered_counter = 0
    family_id_to_variant_list = defaultdict(list)  # will accumulate variants to be inserted all at once
    for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
        progress.update(vcf_file.tell_progress())
        try:
            annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
        # variants the annotator rejects with ValueError are logged and skipped
        # (Python 2 "except X, e" syntax)
        except ValueError, e:
            sys.stderr.write("WARNING: " + str(e) + "\n")
            continue
        vcf_rows_counter += 1
        for family in family_info_list:
            # TODO: can we move this inside the if relevant clause below?
            family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
            family_variant_dict = family_variant.toJSON()
            _add_index_fields_to_variant(family_variant_dict, annotation)
            if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                collection = collections[family['family_id']]
                # skip variants already stored for this family
                if not collection.find_one({'xpos': family_variant.xpos, 'ref': family_variant.ref, 'alt': family_variant.alt}):
                    family_id_to_variant_list[family['family_id']].append(family_variant_dict)
                    variants_buffered_counter += 1

        # NOTE(review): the nesting of this flush check was reconstructed from
        # whitespace-collapsed text; it is placed once per VCF row here -
        # confirm against the original file
        if variants_buffered_counter > 10000:
            print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S") + "-- inserting %d family-variants from %d vcf rows into %s families" % (variants_buffered_counter, vcf_rows_counter, len(family_id_to_variant_list)))

            insert_all_variants_in_buffer(family_id_to_variant_list, collections)

            # the drain deletes every key, so the buffer must now be empty
            assert len(family_id_to_variant_list) == 0
            vcf_rows_counter = 0
            variants_buffered_counter = 0