Beispiel #1
0
    def _add_vcf_file_for_family(self, project_id, family_id, vcf_file_path, reference_populations=None):

        fam = self._db.families.find_one({'project_id': project_id, 'family_id': family_id})
        collection = self._db[fam['coll_name']]
        indiv_id_list = fam['individuals']

        for i, variant in enumerate(vcf_stuff.iterate_vcf_path(
            vcf_file_path,
            genotypes=True,
            indiv_id_list=indiv_id_list,
        )):
            if i % 10000 == 0:
                print i
            self._annotator.annotate_variant(variant, reference_populations)
            family_variant = variant.make_copy(restrict_to_genotypes=fam['individuals'])
            if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, fam['individuals']) is True:
                self._save_variant_to_collection(family_variant, collection)
Beispiel #2
0
 def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None):
     print "Adding variants from %s" % vcf_file_path
     collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
     for collection in collections.values():
         collection.drop_indexes()
     indiv_id_list = [i for f in family_info_list for i in f['individuals']]
     for i, variant in enumerate(vcf_stuff.iterate_vcf_path(
         vcf_file_path,
         genotypes=True,
         indiv_id_list=indiv_id_list,
     )):
         if i % 1000 == 0:
             print i
         self._annotator.annotate_variant(variant, reference_populations)
         for family in family_info_list:
             family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
             if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                 self._save_variant_to_collection(family_variant, collections[family['family_id']])
Beispiel #3
0
    def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None):
        collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
        for collection in collections.values():
            collection.drop_indexes()
        indiv_id_list = [i for f in family_info_list for i in f['individuals']]

        vcf_file = compressed_file(vcf_file_path)
        size = os.path.getsize(vcf_file_path)
        progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))
        for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
            progress.update(vcf_file.tell_progress())
            annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
            for family in family_info_list:
                # TODO: can we move this inside the if relevant clause below?
                family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
                family_variant_dict = family_variant.toJSON()
                _add_index_fields_to_variant(family_variant_dict, annotation)
                if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                    collection = collections[family['family_id']]
                    collection.insert(family_variant_dict)
Beispiel #4
0
    def _add_vcf_file_for_family_set(self,
                                     family_info_list,
                                     vcf_file_path,
                                     reference_populations=None,
                                     vcf_id_map=None,
                                     start_from_chrom=None,
                                     end_with_chrom=None):
        collections = {
            f['family_id']: self._db[f['coll_name']]
            for f in family_info_list
        }
        #for collection in collections.values():
        #    collection.drop_indexes()
        indiv_id_list = [i for f in family_info_list for i in f['individuals']]
        number_of_families = len(family_info_list)
        sys.stderr.write(
            "Loading variants for %(number_of_families)d families %(family_info_list)s from %(vcf_file_path)s\n"
            % locals())

        for family in family_info_list:
            print("Indexing family: " + str(family))
            collection = collections[family['family_id']]
            collection.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

        # check whether some of the variants for this chromosome has been loaded already
        # if yes, start from the last loaded variant, and not from the beginning
        if "_chr" in vcf_file_path or ".chr" in vcf_file_path:
            # if the VCF files are split by chromosome (eg. for WGS projects), check within the chromosome
            vcf_file = compressed_file(vcf_file_path)
            variant = next(
                vcf_stuff.iterate_vcf(vcf_file,
                                      genotypes=False,
                                      indiv_id_list=indiv_id_list,
                                      vcf_id_map=vcf_id_map))
            print(vcf_file_path + "  - chromsome: " + str(variant.chr))
            vcf_file.close()

            position_per_chrom = {}
            for chrom in range(1, 24):
                position_per_chrom[chrom] = defaultdict(int)
                for family in family_info_list:  #variants = collections[family['family_id']].find().sort([('xpos',-1)]).limit(1)
                    variants = list(collections[family['family_id']].find({
                        '$and': [{
                            'xpos': {
                                '$gte': chrom * 1e9
                            }
                        }, {
                            'xpos': {
                                '$lt': (chrom + 1) * 1e9
                            }
                        }]
                    }).sort([('xpos', -1)]).limit(1))
                    if len(variants) > 0:
                        position_per_chrom[chrom][family[
                            'family_id']] = variants[0]['xpos'] - chrom * 1e9
                    else:
                        position_per_chrom[chrom][family['family_id']] = 0

            for chrom in range(1, 24):
                position_per_chrom[chrom] = min(
                    position_per_chrom[chrom].values()
                )  # get the smallest last-loaded variant position for this chromosome across all families

            chr_idx = int(variant.xpos / 1e9)
            start_from_pos = int(position_per_chrom[chr_idx])

            print("Start from: %s - %s (%0.1f%% done)" %
                  (chr_idx, start_from_pos, 100. * start_from_pos /
                   CHROMOSOME_SIZES[variant.chr.replace("chr", "")]))
            tabix_file = pysam.TabixFile(vcf_file_path)
            vcf_iter = itertools.chain(
                tabix_file.header,
                tabix_file.fetch(variant.chr.replace("chr", ""),
                                 start_from_pos, int(2.5e8)))
        elif start_from_chrom or end_with_chrom:
            if start_from_chrom:
                print("Start chrom: chr%s" % start_from_chrom)
            if end_with_chrom:
                print("End chrom: chr%s" % end_with_chrom)

            chrom_list = list(map(str, range(1, 23))) + ['X', 'Y']
            chrom_list_start_index = 0
            if start_from_chrom:
                chrom_list_start_index = chrom_list.index(
                    start_from_chrom.replace("chr", "").upper())

            chrom_list_end_index = len(chrom_list)
            if end_with_chrom:
                chrom_list_end_index = chrom_list.index(
                    end_with_chrom.replace("chr", "").upper())

            tabix_file = pysam.TabixFile(vcf_file_path)
            vcf_iter = tabix_file.header
            for chrom in chrom_list[
                    chrom_list_start_index:chrom_list_end_index + 1]:
                print("Will load chrom: " + chrom)
                try:
                    vcf_iter = itertools.chain(vcf_iter,
                                               tabix_file.fetch(chrom))
                except ValueError as e:
                    print("WARNING: " + str(e))

        else:
            vcf_iter = vcf_file = compressed_file(vcf_file_path)
            # TODO handle case where it's one vcf file, not split by chromosome

        size = os.path.getsize(vcf_file_path)

        #progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))

        def insert_all_variants_in_buffer(buff, collections_dict):
            for family_id in buff:
                if len(buff[family_id]) == 0:  # defensive programming
                    raise ValueError(
                        "%s has zero variants to insert. Should not be in buff."
                        % family_id)

            while len(buff) > 0:
                # choose a random family for which to insert a variant from among families that still have variants to insert
                family_id = random.choice(buff.keys())

                # pop a variant off the list for this family, and insert it
                family_variant_dict_to_insert = buff[family_id].pop()
                c = collections_dict[family_id]
                c.insert(family_variant_dict_to_insert)

                if len(buff[family_id]) == 0:
                    del buff[
                        family_id]  # if no more variants for this family, delete it

        vcf_rows_counter = 0
        variants_buffered_counter = 0
        family_id_to_variant_list = defaultdict(
            list)  # will accumulate variants to be inserted all at once
        for variant in vcf_stuff.iterate_vcf(vcf_iter,
                                             genotypes=True,
                                             indiv_id_list=indiv_id_list,
                                             vcf_id_map=vcf_id_map):
            if variant.alt == "*":
                #print("Skipping GATK 3.4 * alt allele: " + str(variant.unique_tuple()))
                continue

            try:
                annotation = self._annotator.get_annotation(
                    variant.xpos,
                    variant.ref,
                    variant.alt,
                    populations=reference_populations)
            except ValueError, e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                continue

            vcf_rows_counter += 1
            for family in family_info_list:
                # TODO: can we move this inside the if relevant clause below?
                try:
                    family_variant = variant.make_copy(
                        restrict_to_genotypes=family['individuals'])
                    family_variant_dict = family_variant.toJSON()
                    _add_index_fields_to_variant(family_variant_dict,
                                                 annotation)
                    if xbrowse_utils.is_variant_relevant_for_individuals(
                            family_variant, family['individuals']):
                        collection = collections[family['family_id']]
                        if not collection.find_one({
                                'xpos': family_variant.xpos,
                                'ref': family_variant.ref,
                                'alt': family_variant.alt
                        }):
                            family_id_to_variant_list[family[
                                'family_id']].append(family_variant_dict)
                            variants_buffered_counter += 1
                except Exception, e:
                    sys.stderr.write(
                        "ERROR: on variant %s, family: %s - %s\n" %
                        (variant.toJSON(), family, e))
    def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None):
        collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
        #for collection in collections.values():
        #    collection.drop_indexes()
        indiv_id_list = [i for f in family_info_list for i in f['individuals']]
        number_of_families = len(family_info_list)
        sys.stderr.write("Loading variants for %(number_of_families)d families %(family_info_list)s from %(vcf_file_path)s\n" % locals())

        for family in family_info_list:
            print("Indexing family: " + str(family))
            collection = collections[family['family_id']]
            collection.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

        # check whether some of the variants for this chromosome has been loaded already
        # if yes, start from the last loaded variant, and not from the beginning
        if "_chr" in vcf_file_path or ".chr" in vcf_file_path:
            # if the VCF files are split by chromosome (eg. for WGS projects), check within the chromosome
            vcf_file = compressed_file(vcf_file_path)
            variant = next(vcf_stuff.iterate_vcf(vcf_file, genotypes=False, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map))
            print(vcf_file_path + "  - chromsome: " + str(variant.chr))
            vcf_file.close()

            position_per_chrom = {}
            for chrom in range(1,24):
                position_per_chrom[chrom] = defaultdict(int)
                for family in family_info_list:     #variants = collections[family['family_id']].find().sort([('xpos',-1)]).limit(1)
                    variants = list(collections[family['family_id']].find({'$and': [{'xpos': { '$gte': chrom*1e9 }}, {'xpos': { '$lt': (chrom+1)*1e9}}] }).sort([('xpos',-1)]).limit(1))
                    if len(variants) > 0:
                        position_per_chrom[chrom][family['family_id']] = variants[0]['xpos'] - chrom*1e9
                    else:
                        position_per_chrom[chrom][family['family_id']] = 0

            for chrom in range(1,24):
                position_per_chrom[chrom] = min(position_per_chrom[chrom].values()) # get the smallest last-loaded variant position for this chromosome across all families

            chr_idx = int(variant.xpos/1e9)
            start_from_pos = int(position_per_chrom[chr_idx])

            print("Start from: %s - %s (%0.1f%% done)" % (chr_idx, start_from_pos, 100.*start_from_pos/CHROMOSOME_SIZES[variant.chr.replace("chr", "")]))
            tabix_file = pysam.TabixFile(vcf_file_path)
            vcf_iter = itertools.chain(tabix_file.header, tabix_file.fetch(variant.chr.replace("chr", ""), start_from_pos, int(2.5e8)))
        else:
            vcf_iter = vcf_file = compressed_file(vcf_file_path)
            # TODO handle case where it's one vcf file, not split by chromosome

        size = os.path.getsize(vcf_file_path)
        progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))

        def insert_all_variants_in_buffer(buff, collections_dict):
            for family_id in buff:
                if len(buff[family_id]) == 0:  # defensive programming
                    raise ValueError("%s has zero variants to insert. Should not be in buff." % family_id)

            while len(buff) > 0:
                # choose a random family for which to insert a variant from among families that still have variants to insert
                family_id = random.choice(buff.keys())

                # pop a variant off the list for this family, and insert it
                family_variant_dict_to_insert = buff[family_id].pop()
                c = collections_dict[family_id]
                c.insert(family_variant_dict_to_insert)

                if len(buff[family_id]) == 0:
                    del buff[family_id]  # if no more variants for this family, delete it

        vcf_rows_counter = 0
        variants_buffered_counter = 0
        family_id_to_variant_list = defaultdict(list)  # will accumulate variants to be inserted all at once
        for variant in vcf_stuff.iterate_vcf(vcf_iter, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
            if variant.alt == "*":
                #print("Skipping GATK 3.4 * alt allele: " + str(variant.unique_tuple()))
                continue

            try:
                annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
            except ValueError, e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                continue

            vcf_rows_counter += 1
            for family in family_info_list:
                # TODO: can we move this inside the if relevant clause below?
                try:
                    family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
                    family_variant_dict = family_variant.toJSON()
                    _add_index_fields_to_variant(family_variant_dict, annotation)
                    if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                        collection = collections[family['family_id']]
                        if not collection.find_one({'xpos': family_variant.xpos, 'ref': family_variant.ref, 'alt': family_variant.alt}):
                            family_id_to_variant_list[family['family_id']].append(family_variant_dict)
                            variants_buffered_counter += 1
                except Exception, e:
                    sys.stderr.write("ERROR: on variant %s, family: %s - %s\n" % (variant.toJSON(), family, e))
Beispiel #6
0
    def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None):
        collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
        #for collection in collections.values():
        #    collection.drop_indexes()
        indiv_id_list = [i for f in family_info_list for i in f['individuals']]

        number_of_families = len(family_info_list)
        sys.stderr.write("Loading variants for %(number_of_families)d families %(family_info_list)s from %(vcf_file_path)s\n" % locals())

        #for family in family_info_list:
        #    print("Indexing family: " + str(family))
        #    collection = collections[family['family_id']]
        #    collection.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

        vcf_file = compressed_file(vcf_file_path)
        size = os.path.getsize(vcf_file_path)
        progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))

        def insert_all_variants_in_buffer(buff, collections_dict):
            for family_id in buff:
                if len(buff[family_id]) == 0:  # defensive programming
                    raise ValueError("%s has zero variants to insert. Should not be in buff." % family_id)

            while len(buff) > 0:
                # choose a random family for which to insert a variant from among families that still have variants to insert
                family_id = random.choice(buff.keys())

                # pop a variant off the list for this family, and insert it
                family_variant_dict_to_insert = buff[family_id].pop()
                c = collections_dict[family_id]
                c.insert(family_variant_dict_to_insert)

                if len(buff[family_id]) == 0:
                    del buff[family_id]  # if no more variants for this family, delete it

        vcf_rows_counter = 0
        variants_buffered_counter = 0
        family_id_to_variant_list = defaultdict(list)  # will accumulate variants to be inserted all at once
        for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
            progress.update(vcf_file.tell_progress())
            try:
                annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
            except ValueError, e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                continue

            vcf_rows_counter += 1
            for family in family_info_list:
                # TODO: can we move this inside the if relevant clause below?
                family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
                family_variant_dict = family_variant.toJSON()
                _add_index_fields_to_variant(family_variant_dict, annotation)
                if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                    collection = collections[family['family_id']]
                    if not collection.find_one({'xpos': family_variant.xpos, 'ref': family_variant.ref, 'alt': family_variant.alt}):
                        family_id_to_variant_list[family['family_id']].append(family_variant_dict)
                        variants_buffered_counter += 1

            if variants_buffered_counter > 10000:
                print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S") + "-- inserting %d family-variants from %d vcf rows into %s families" % (variants_buffered_counter, vcf_rows_counter, len(family_id_to_variant_list)))

                insert_all_variants_in_buffer(family_id_to_variant_list, collections)

                assert len(family_id_to_variant_list) == 0
                vcf_rows_counter = 0
                variants_buffered_counter = 0