def apply(self, variants=None, genotypes=None, db=None, limit=None, offset=0):
     """Run this genotype filter over a collection of variants.

     :param variants: a VariantsCollection or a QuerySet of variants.
         When None, a QuerySet over the whole *db* is used.
     :param genotypes: a list of genotypes arrays. When None, they are read
         from ``genotypes_service(db)``. Mostly useful to set in tests.
     :param db: database name. When None, it is read from ``variants.db``.
     :param limit: forwarded to ``variants_from_mask``.
     :param offset: forwarded to ``variants_from_mask``.
     :rtype: FilterResult
     """
     # Fill in whichever of *db* / *variants* the caller left out.
     if db is None:
         db = variants.db
     elif variants is None:
         variants = Variant.objects.using(db)

     # When the shortcut flag is set, return an empty result without scanning.
     if self.shortcut:
         return FilterResult(variants=VariantsCollection([]), ids=[], n_filtered=0)

     if genotypes is not None:
         assert len(genotypes) == len(variants)
     else:
         assert db is not None, "Either a db name or a genotypes array is required"
         genotypes = genotypes_service(db).genotypes

     # X-linked filters only look at chrX variants: take their ids from the
     # collection itself when we have one, otherwise from the genotypes service.
     if self.val != 'x_linked':
         chrx_ids = None
     elif isinstance(variants, VariantsCollection):
         chrx_ids = np.asarray(
             [var.variant_id for var in variants if var.chrom == 'chrX'],
             dtype=np.uint64)
     else:
         chrx_ids = genotypes_service(db).chrX

     passing = self.scan_genotypes(genotypes, sub_ids=chrx_ids, db=db)
     selected = self.variants_from_mask(variants, passing, db, limit, offset)
     return FilterResult(variants=selected, ids=passing, n_filtered=len(passing))
 def apply(self, variants=None, genotypes=None, db=None, limit=None, offset=0, sub_ids=None, parallel=True):
     """Apply this filter gene by gene, through ``scan_genotypes_compound``.

     :param variants: a collection of variants exposing a ``db`` attribute.
         When None and *db* is given, a QuerySet over the whole *db* is used.
     :param genotypes: a list of genotypes arrays. When None, they are read
         from ``genotypes_service(db)``.
     :param db: database name. When None, it is read from ``variants.db``.
     :param limit: forwarded to ``variants_from_mask``.
     :param offset: forwarded to ``variants_from_mask``.
     :param sub_ids: does nothing, just inheritance.
     :param parallel: forwarded to ``scan_genotypes_compound``.
     :rtype: FilterResult
     """
     if self.shortcut:
         return FilterResult(variants=VariantsCollection([]), ids=[], n_filtered=0)
     if variants is None and db is not None:
         variants = Variant.objects.using(db)
     elif db is None:
         db = variants.db
     if db is None:
         # No database to ask: build the per-gene batches from the variants.
         # A plain dict is used instead of itertools.groupby because groupby
         # only merges *consecutive* items: with variants not sorted by gene,
         # a later run of the same gene would overwrite the earlier one.
         ids_by_gene = {}
         for v in variants:
             ids_by_gene.setdefault(v.gene_symbol, []).append(v.variant_id)
         batches = {gene: np.array(ids, dtype=np.uint64)
                    for gene, ids in ids_by_gene.items()}
     else:
         gs = genotypes_service(db)
         batches = gs.variant_ids_batches_by_gene
     if genotypes is None:
         assert db is not None, "Either a db name or a genotypes array is required"
         genotypes = genotypes_service(db).genotypes
     else:
         assert len(genotypes) == len(variants)
     # The third returned value (pairs) is not needed here.
     passing, sources, _pairs = self.scan_genotypes_compound(genotypes, batches, parallel)
     variants = self.variants_from_mask(variants, passing, db, limit, offset)
     # Annotate each returned variant with its entry in *sources*.
     for v in variants:
         set_source(v, sources[v.variant_id])
     return FilterResult(
         variants=variants,
         ids=passing,
         n_filtered=len(passing),
     )
Example #3
0
    def ready(self):
        """Startup hook: register the variants dbs and warm up the caches.

        :return: 0 when everything was set up, 1 when the users db is not
            ready or a migration is being run, 2 when Redis is unreachable.
        """
        from varapp.common import manage_dbs, utils, db_utils
        from varapp.common.versioning import add_versions
        from varapp.stats.stats_service import stats_service
        from varapp.variants.genotypes_service import genotypes_service

        # This code also runs under manage.py (e.g. while the tables are being
        # generated), so first make sure the users db actually has tables.
        # One table is not enough: it could be django_migrations alone.
        user_db_ready = db_utils.connection_has_tables('default', 5)

        # manage.py must keep working without anything below.
        if not user_db_ready or "migrate" in sys.argv:
            logger.warning("(!) Users db is not ready or it is migration.")
            return 1

        # Copy what is in VariantsDb into settings.DATABASES. Nothing new is
        # inserted here: unlike deactivation, inserts are not idempotent and
        # this code may run several times. The returned connections are the
        # valid dbs, which must be registered to read vcf_header, stats etc.
        added_connections = manage_dbs.copy_VariantsDb_to_settings()

        # Redis backs the stats and genotypes caches; without it, stop here.
        if not utils.check_redis_connection():
            logger.warning(
                "(!) Could not connect to Redis. Make sure Redis is installed, "
                "is up and running (try `redis-cli ping`) "
                "and serves at 127.0.0.1:6379 (or whatever is defined in settings)."
            )
            return 2

        # Optionally fill the stats cache.
        if settings.WARMUP_STATS_CACHE:
            for dbname in added_connections:
                stats_service(dbname)

        # Optionally fill the genotypes cache.
        if settings.WARMUP_GENOTYPES_CACHE:
            for dbname in added_connections:
                genotypes_service(dbname)

        # Record in the *annotation* table the versions of all programs used,
        # i.e. Gemini, VEP, their dbs, etc.
        for dbname in added_connections:
            add_versions(dbname)

        return 0
Example #4
0
 def apply(self,
           variants=None,
           genotypes=None,
           db=None,
           limit=None,
           offset=0):
     """Run this genotype filter over a collection of variants.

     :param variants: a VariantsCollection or a QuerySet of variants.
         When None, a QuerySet over the whole *db* is used.
     :param genotypes: a list of genotypes arrays. When None, they are read
         from ``genotypes_service(db)``. Mostly useful to set in tests.
     :param db: database name. When None, it is read from ``variants.db``.
     :param limit: forwarded to ``variants_from_mask``.
     :param offset: forwarded to ``variants_from_mask``.
     :rtype: FilterResult
     """
     # Fill in whichever of *db* / *variants* the caller left out.
     if db is None:
         db = variants.db
     elif variants is None:
         variants = Variant.objects.using(db)

     # When the shortcut flag is set, return an empty result without scanning.
     if self.shortcut:
         empty = VariantsCollection([])
         return FilterResult(variants=empty, ids=[], n_filtered=0)

     if genotypes is not None:
         assert len(genotypes) == len(variants)
     else:
         assert db is not None, "Either a db name or a genotypes array is required"
         genotypes = genotypes_service(db).genotypes

     # X-linked filters only look at chrX variants: take their ids from the
     # collection itself when we have one, otherwise from the genotypes service.
     if self.val != 'x_linked':
         chrx_ids = None
     elif isinstance(variants, VariantsCollection):
         on_chrx = [var.variant_id for var in variants if var.chrom == 'chrX']
         chrx_ids = np.asarray(on_chrx, dtype=np.uint64)
     else:
         chrx_ids = genotypes_service(db).chrX

     passing = self.scan_genotypes(genotypes, sub_ids=chrx_ids, db=db)
     selected = self.variants_from_mask(variants, passing, db, limit, offset)
     return FilterResult(
         variants=selected,
         ids=passing,
         n_filtered=len(passing),
     )
Example #5
0
 def apply(self,
           variants=None,
           genotypes=None,
           db=None,
           limit=None,
           offset=0,
           sub_ids=None,
           parallel=True):
     """Apply this filter gene by gene, through ``scan_genotypes_compound``.

     :param variants: a collection of variants exposing a ``db`` attribute.
         When None and *db* is given, a QuerySet over the whole *db* is used.
     :param genotypes: a list of genotypes arrays. When None, they are read
         from ``genotypes_service(db)``.
     :param db: database name. When None, it is read from ``variants.db``.
     :param limit: forwarded to ``variants_from_mask``.
     :param offset: forwarded to ``variants_from_mask``.
     :param sub_ids: does nothing, just inheritance.
     :param parallel: forwarded to ``scan_genotypes_compound``.
     :rtype: FilterResult
     """
     if self.shortcut:
         return FilterResult(variants=VariantsCollection([]),
                             ids=[],
                             n_filtered=0)
     if variants is None and db is not None:
         variants = Variant.objects.using(db)
     elif db is None:
         db = variants.db
     if db is None:
         # No database to ask: build the per-gene batches from the variants.
         # A plain dict is used instead of itertools.groupby because groupby
         # only merges *consecutive* items: with variants not sorted by gene,
         # a later run of the same gene would overwrite the earlier one.
         ids_by_gene = {}
         for v in variants:
             ids_by_gene.setdefault(v.gene_symbol, []).append(v.variant_id)
         batches = {
             gene: np.array(ids, dtype=np.uint64)
             for gene, ids in ids_by_gene.items()
         }
     else:
         gs = genotypes_service(db)
         batches = gs.variant_ids_batches_by_gene
     if genotypes is None:
         assert db is not None, "Either a db name or a genotypes array is required"
         genotypes = genotypes_service(db).genotypes
     else:
         assert len(genotypes) == len(variants)
     # The third returned value (pairs) is not needed here.
     passing, sources, _pairs = self.scan_genotypes_compound(
         genotypes, batches, parallel)
     variants = self.variants_from_mask(variants, passing, db, limit,
                                        offset)
     # Annotate each returned variant with its entry in *sources*.
     for v in variants:
         set_source(v, sources[v.variant_id])
     return FilterResult(
         variants=variants,
         ids=passing,
         n_filtered=len(passing),
     )
 def scan_genotypes(self, genotypes, sub_ids=None, db=None):
     """Pass through all genotypes and return only the ids of those that pass the filter.

     :param genotypes: np.ndarray[uint64, dim=2]
     :param sub_ids: candidate variant ids to use instead of all ids 1..N.
     :param db: database name; only used to fetch the chrX ids from the
         genotypes service for an 'x_linked' filter when *sub_ids* is None.
     :rtype: np.ndarray[uint64]
     """
     if self.shortcut:
         # Keep the documented uint64 dtype for the empty result as well
         # (np.zeros defaults to float64).
         return np.zeros(0, dtype=np.uint64)
     N = len(genotypes)
     if sub_ids is not None:
         variant_ids = sub_ids
     elif self.val == 'x_linked' and db:
         variant_ids = genotypes_service(db).chrX
     else:
         # Variant ids are 1-based: 1..N. np.arange builds them directly,
         # without going through a Python range object.
         variant_ids = np.arange(1, N + 1, dtype=np.uint64)
     active_idx = np.asarray(self.ss.active_idx, dtype=np.uint16)
     conditions = self.conditions_vector
     is_and = self.merge_op == AND
     if len(conditions) == 0:
         # Nothing to test: every candidate id passes.
         passing = variant_ids
     else:
         passing = self.parallel_apply_bitwise(genotypes, variant_ids, conditions, active_idx, is_and)
     return passing
Example #7
0
 def scan_genotypes(self, genotypes, sub_ids=None, db=None):
     """Pass through all genotypes and return only the ids of those that pass the filter.

     :param genotypes: np.ndarray[uint64, dim=2]
     :param sub_ids: candidate variant ids to use instead of all ids 1..N.
     :param db: database name; only used to fetch the chrX ids from the
         genotypes service for an 'x_linked' filter when *sub_ids* is None.
     :rtype: np.ndarray[uint64]
     """
     if self.shortcut:
         # Keep the documented uint64 dtype for the empty result as well
         # (np.zeros defaults to float64).
         return np.zeros(0, dtype=np.uint64)
     N = len(genotypes)
     if sub_ids is not None:
         variant_ids = sub_ids
     elif self.val == 'x_linked' and db:
         variant_ids = genotypes_service(db).chrX
     else:
         # Variant ids are 1-based: 1..N. np.arange builds them directly,
         # without going through a Python range object.
         variant_ids = np.arange(1, N + 1, dtype=np.uint64)
     active_idx = np.asarray(self.ss.active_idx, dtype=np.uint16)
     conditions = self.conditions_vector
     is_and = self.merge_op == AND
     if len(conditions) == 0:
         # Nothing to test: every candidate id passes.
         passing = variant_ids
     else:
         passing = self.parallel_apply_bitwise(genotypes, variant_ids,
                                               conditions, active_idx,
                                               is_and)
     return passing
Example #8
0
    def ready(self):
        """Startup hook: wait for the users db, register the variants dbs and warm up the caches.

        :return: 0 when everything was set up, 1 when the users db has too few
            tables or a migration command is being run, 2 when Redis is
            unreachable.
        :raises ConnectionError: when the users db cannot be reached after all
            connection attempts.
        """
        from varapp.common import manage_dbs, utils, db_utils
        from varapp.common.versioning import add_versions
        from varapp.stats.stats_service import stats_service
        from varapp.variants.genotypes_service import genotypes_service

        max_attempts = 50
        retry_delay = 0.5  # seconds between two connection attempts

        # Check that there are tables in the users_db,
        # because this code is also run when manage.py is used,
        # for instance to generate the tables.
        # It needs more that 1 table, which could be only django_migrations.
        user_db_ready = False
        last_error = None
        for attempt in range(1, max_attempts + 1):
            try:
                user_db_ready = db_utils.connection_has_tables('default', 5)
            except django.db.utils.OperationalError as err:
                last_error = err
                if attempt == 1:
                    # Log the full error once, not on every retry.
                    logger.error(str(err))
                if attempt < max_attempts:
                    # Only announce (and wait for) a retry that will really
                    # happen; the message reports the actual delay.
                    logger.warning(
                        "(!) Could not connect to users db. Retrying in {} second(s)... ({})"
                        .format(retry_delay, attempt))
                    time.sleep(retry_delay)
            else:
                logger.info("Connected to MySQL")
                break
        else:
            # Every attempt failed: keep the original error as the cause.
            raise ConnectionError(
                "Could not connect to users db. Check your MySQL connection and settings. Error message: {}"
                .format(last_error)) from last_error

        # Manage.py must work without the following to execute.
        # NOTE(review): this branch is also taken for migration commands,
        # although the message only mentions missing tables.
        if not user_db_ready or "migrat" in ''.join(sys.argv):
            logger.warning("(!) Users db has no tables.")
            return 1

        # At startup, fill settings.DATABASES with what is in VariantsDb.
        # Do not add any new db here, as unlike deactivation, inserts
        # are not idempotent and this code could be executed several times.
        # Return the valid databases added to connections,
        # since they need to be in there to be read for vcf_header, stats etc.
        added_connections = manage_dbs.copy_VariantsDb_to_settings()

        # Check that the Redis service is running.
        # It is necessary for stats and genotypes cache.
        if not utils.check_redis_connection():
            logger.warning(
                "(!) Could not connect to Redis. Make sure Redis is installed, "
                "is up and running (try `redis-cli ping`) "
                "and serves at '{}').".format(
                    settings.CACHES['redis']['LOCATION']))
            return 2

        # Fill the stats cache
        if settings.WARMUP_STATS_CACHE:
            for dbname in added_connections:
                stats_service(dbname)

        # Fill the genotypes cache
        if settings.WARMUP_GENOTYPES_CACHE:
            for dbname in added_connections:
                genotypes_service(dbname)

        # Update the *annotation* table with versions of all programs used,
        # i.e. Gemini, VEP, their dbs, etc.
        for dbname in added_connections:
            add_versions(dbname)

        return 0
Example #9
0
    def apply(self,
              db=None,
              initqs=None,
              limit=None,
              offset=0,
              sort_by=None,
              reverse=False,
              batch_size=500):
        """Applies all filters in list to the database. Return a FilterResult with
         *limit* variants to expose.

        Variant filters are turned into Django conditions and applied in the
        database. The first genotype filter, if any, is evaluated on the arrays
        of the genotypes service, and both sets of variant ids are intersected.

        :param db: The alias of the database to query from; passed to
            ``Variant.objects.using`` and ``genotypes_service``.
        :param initqs: A QuerySet to be further filtered.
            Otherwise all entries of *db* will be fetched.
        :param limit: maximum number or variants to return.
        :param offset: number of variants to skip before returning *limit* of them.
        :param sort_by: (str) name of the field to sort by; ignored unless it
            is one of VARIANT_FIELDS.
        :param reverse: (bool) whether to reverse the ordering.
        :param batch_size: forwarded to ``extract_variants_from_ids_bin_array``.
        :rtype: FilterResult
        """
        is_sorted = sort_by and sort_by in VARIANT_FIELDS
        is_gen_filter = len(self.genotype_filters) > 0
        is_var_filter = len(self.variant_filters) > 0

        if initqs is None:
            initqs = Variant.objects.using(db)

        # Filter what can be filtered directly in the db
        conds = [f.django_condition() for f in self.variant_filters]
        conds = [x for x in conds if x]
        qs = initqs.filter(*conds)

        # Sort what can be sorted directly in the db
        if is_sorted:
            sort_key = '-' + sort_by if reverse else sort_by
            qs = qs.order_by(sort_key)
        else:
            qs = qs.order_by('chrom', 'start')  # trust Gemini for that

        # If no genotype filter, paginate from db and return the collection.
        # For the moment it never happens because there is always at least the 'active' gen filter.
        n_filtered = qs.count()
        if not is_gen_filter or n_filtered == 0:
            ids = np.asarray(list(qs.values_list('variant_id', flat=True)),
                             dtype=np.uint64)
            if limit is not None:
                qs = qs[offset:offset + limit]
            variants = namedtuples(qs)  # instead of list

        # If genotype filter, get indices from gen service, indices from
        # the variant filters (nothing is evaluated yet), and return the intersection.
        else:
            gf = self.genotype_filters[0]
            is_compound = gf.val == GENOTYPE_COMPOUND
            gs = genotypes_service(db=db)
            sources = {}
            pairs = []
            sql_indices = []
            bin_ids = np.zeros(0)
            if is_compound:
                gen_indices, sources, pairs = gf.scan_genotypes_compound(
                    genotypes=gs.genotypes,
                    batches=gs.variant_ids_batches_by_gene)
            elif gf.val == 'x_linked':
                gen_indices = gf.scan_genotypes(genotypes=gs.genotypes,
                                                sub_ids=gs.chrX)
            else:
                gen_indices = gf.scan_genotypes(
                    genotypes=gs.genotypes)  # type: np.ndarray
            # If nothing left, return
            if len(gen_indices) == 0:
                ids = np.zeros(0)
            # Find the variant ids that are present in both var filtered and gen filtered sets
            # NOTE(review): initqs was defaulted above, so `initqs is not None`
            # is always true here and the final `else` below is never taken.
            # Confirm whether the caller's original argument was meant.
            elif is_var_filter or is_sorted or initqs is not None:
                # Presumably gen_indices is sorted ascending, so its last item
                # is the largest id - TODO confirm.
                max_gen_index = gen_indices[-1]
                qs_indices = qs.values_list(
                    'variant_id',
                    flat=True).filter(variant_id__lte=max_gen_index)
                t1 = time()
                sql_indices = list(
                    qs_indices)  # qs is already ordered_by, so are sql_indices
                t2 = time()
                if DEBUG:
                    print("  Apply fc :: Instantiate sql indices:", t2 - t1)
                bin_sql = masking.pack(
                    masking.to_binary_array(sql_indices, max_gen_index))
                # No trailing comma here: it would wrap the packed array in a
                # 1-tuple instead of passing the array itself to binary_and.
                bin_gen = masking.pack(
                    masking.to_binary_array(gen_indices, max_gen_index))
                bin_ids = masking.unpack(masking.binary_and(bin_sql, bin_gen),
                                         max_gen_index)  # always sorted by id
                t3 = time()
                if DEBUG:
                    print("  Apply fc :: Sets intersection:", t3 - t2)
                # If compound, filter out those were after intersection, a gene has only one component left
                if is_compound:
                    bin_keep = np.zeros(max_gen_index, dtype=np.bool_)
                    # Ids are 1-based, array positions 0-based, hence the -1.
                    for a, b in pairs:
                        if bin_ids[a - 1] & bin_ids[b - 1]:
                            bin_keep[a - 1] = 1
                            bin_keep[b - 1] = 1
                    bin_ids = masking.binary_and(bin_ids, bin_keep)
                    t4 = time()
                    if DEBUG:
                        print("  Apply fc :: Compound pairs filtering:",
                              t4 - t3)
                ids = masking.to_indices(bin_ids) + 1
            # If the only filter is on genotypes and no need to sort, skip slow steps
            else:
                ids = gen_indices

            n_filtered = len(ids)
            # Extract the variants for the filtered ids from the inital QuerySet,
            # up to limit (i.e. up to ~300 variants to expose).
            # We need to pass `sql_indices` on top of `ids` because the latter is sorted,
            # and we want the top of the sorted QuerySet.
            variants = extract_variants_from_ids_bin_array(
                qs, bin_ids, sql_indices, limit, offset, batch_size, sources)

        return FilterResult(
            variants=VariantsCollection(variants, db=db),
            n_filtered=n_filtered,
            ids=np.asarray(ids, dtype=np.uint64),
        )
Example #10
0
    def apply(self, db=None, initqs=None, limit=None, offset=0, sort_by=None, reverse=False, batch_size=500):
        """Applies all filters in list to the database. Return a FilterResult with
         *limit* variants to expose.

        Variant filters are turned into Django conditions and applied in the
        database. The first genotype filter, if any, is evaluated on the arrays
        of the genotypes service, and both sets of variant ids are intersected.

        :param db: The alias of the database to query from; passed to
            ``Variant.objects.using`` and ``genotypes_service``.
        :param initqs: A QuerySet to be further filtered.
            Otherwise all entries of *db* will be fetched.
        :param limit: maximum number or variants to return.
        :param offset: number of variants to skip before returning *limit* of them.
        :param sort_by: (str) name of the field to sort by; ignored unless it
            is one of VARIANT_FIELDS.
        :param reverse: (bool) whether to reverse the ordering.
        :param batch_size: forwarded to ``extract_variants_from_ids_bin_array``.
        :rtype: FilterResult
        """
        is_sorted = sort_by and sort_by in VARIANT_FIELDS
        is_gen_filter = len(self.genotype_filters) > 0
        is_var_filter = len(self.variant_filters) > 0

        if initqs is None:
            initqs = Variant.objects.using(db)

        # Filter what can be filtered directly in the db
        conds = [f.django_condition() for f in self.variant_filters]
        conds = [x for x in conds if x]
        qs = initqs.filter(*conds)

        # Sort what can be sorted directly in the db
        if is_sorted:
            sort_key = '-'+sort_by if reverse else sort_by
            qs = qs.order_by(sort_key)
        else:
            qs = qs.order_by('chrom','start')  # trust Gemini for that

        # If no genotype filter, paginate from db and return the collection.
        # For the moment it never happens because there is always at least the 'active' gen filter.
        n_filtered = qs.count()
        if not is_gen_filter or n_filtered == 0:
            ids = np.asarray(list(qs.values_list('variant_id', flat=True)), dtype=np.uint64)
            if limit is not None:
                qs = qs[offset:offset+limit]
            variants = namedtuples(qs)  # instead of list

        # If genotype filter, get indices from gen service, indices from
        # the variant filters (nothing is evaluated yet), and return the intersection.
        else:
            gf = self.genotype_filters[0]
            is_compound = gf.val == GENOTYPE_COMPOUND
            gs = genotypes_service(db=db)
            sources = {}
            pairs = []
            sql_indices = []
            bin_ids = np.zeros(0)
            if is_compound:
                gen_indices, sources, pairs = gf.scan_genotypes_compound(
                    genotypes=gs.genotypes, batches=gs.variant_ids_batches_by_gene)
            elif gf.val == 'x_linked':
                gen_indices = gf.scan_genotypes(genotypes=gs.genotypes, sub_ids=gs.chrX)
            else:
                gen_indices = gf.scan_genotypes(genotypes=gs.genotypes) # type: np.ndarray
            # If nothing left, return
            if len(gen_indices) == 0:
                ids = np.zeros(0)
            # Find the variant ids that are present in both var filtered and gen filtered sets
            # NOTE(review): initqs was defaulted above, so `initqs is not None`
            # is always true here and the final `else` below is never taken.
            # Confirm whether the caller's original argument was meant.
            elif is_var_filter or is_sorted or initqs is not None:
                # Presumably gen_indices is sorted ascending, so its last item
                # is the largest id - TODO confirm.
                max_gen_index = gen_indices[-1]
                qs_indices = qs.values_list('variant_id', flat=True).filter(variant_id__lte=max_gen_index)
                t1 = time()
                sql_indices = list(qs_indices)  # qs is already ordered_by, so are sql_indices
                t2 = time()
                if DEBUG: print("  Apply fc :: Instantiate sql indices:", t2-t1)
                bin_sql = masking.pack(masking.to_binary_array(sql_indices, max_gen_index))
                # No trailing comma here: it would wrap the packed array in a
                # 1-tuple instead of passing the array itself to binary_and.
                bin_gen = masking.pack(masking.to_binary_array(gen_indices, max_gen_index))
                bin_ids = masking.unpack(masking.binary_and(bin_sql, bin_gen), max_gen_index)  # always sorted by id
                t3 = time()
                if DEBUG: print("  Apply fc :: Sets intersection:", t3-t2)
                # If compound, filter out those were after intersection, a gene has only one component left
                if is_compound:
                    bin_keep = np.zeros(max_gen_index, dtype=np.bool_)
                    # Ids are 1-based, array positions 0-based, hence the -1.
                    for a,b in pairs:
                        if bin_ids[a-1] & bin_ids[b-1]:
                            bin_keep[a-1] = 1
                            bin_keep[b-1] = 1
                    bin_ids = masking.binary_and(bin_ids, bin_keep)
                    t4 = time()
                    if DEBUG: print("  Apply fc :: Compound pairs filtering:", t4-t3)
                ids = masking.to_indices(bin_ids)+1
            # If the only filter is on genotypes and no need to sort, skip slow steps
            else:
                ids = gen_indices

            n_filtered = len(ids)
            # Extract the variants for the filtered ids from the inital QuerySet,
            # up to limit (i.e. up to ~300 variants to expose).
            # We need to pass `sql_indices` on top of `ids` because the latter is sorted,
            # and we want the top of the sorted QuerySet.
            variants = extract_variants_from_ids_bin_array(qs, bin_ids, sql_indices, limit, offset, batch_size, sources)

        return FilterResult(
            variants = VariantsCollection(variants, db=db),
            n_filtered = n_filtered,
            ids = np.asarray(ids, dtype=np.uint64),
        )
Example #11
0
    def ready(self):
        """Startup hook: wait for the users db, register the variants dbs and warm up the caches.

        :return: 0 when everything was set up, 1 when the users db has too few
            tables or a migration command is being run, 2 when Redis is
            unreachable.
        :raises ConnectionError: when the users db cannot be reached after all
            connection attempts.
        """
        from varapp.common import manage_dbs, utils, db_utils
        from varapp.common.versioning import add_versions
        from varapp.stats.stats_service import stats_service
        from varapp.variants.genotypes_service import genotypes_service

        max_attempts = 50
        retry_delay = 0.5  # seconds between two connection attempts

        # Check that there are tables in the users_db,
        # because this code is also run when manage.py is used,
        # for instance to generate the tables.
        # It needs more that 1 table, which could be only django_migrations.
        user_db_ready = False
        last_error = None
        for attempt in range(1, max_attempts + 1):
            try:
                user_db_ready = db_utils.connection_has_tables('default', 5)
            except django.db.utils.OperationalError as err:
                last_error = err
                if attempt == 1:
                    # Log the full error once, not on every retry.
                    logger.error(str(err))
                if attempt < max_attempts:
                    # Only announce (and wait for) a retry that will really
                    # happen; the message reports the actual delay.
                    logger.warning("(!) Could not connect to users db. Retrying in {} second(s)... ({})".format(retry_delay, attempt))
                    time.sleep(retry_delay)
            else:
                logger.info("Connected to MySQL")
                break
        else:
            # Every attempt failed: keep the original error as the cause.
            raise ConnectionError("Could not connect to users db. Check your MySQL connection and settings. Error message: {}".format(last_error)) from last_error

        # Manage.py must work without the following to execute.
        # NOTE(review): this branch is also taken for migration commands,
        # although the message only mentions missing tables.
        if not user_db_ready or "migrat" in ''.join(sys.argv):
            logger.warning("(!) Users db has no tables.")
            return 1

        # At startup, fill settings.DATABASES with what is in VariantsDb.
        # Do not add any new db here, as unlike deactivation, inserts
        # are not idempotent and this code could be executed several times.
        # Return the valid databases added to connections,
        # since they need to be in there to be read for vcf_header, stats etc.
        added_connections = manage_dbs.copy_VariantsDb_to_settings()

        # Check that the Redis service is running.
        # It is necessary for stats and genotypes cache.
        if not utils.check_redis_connection():
            logger.warning("(!) Could not connect to Redis. Make sure Redis is installed, "
                            "is up and running (try `redis-cli ping`) "
                            "and serves at '{}').".format(settings.CACHES['redis']['LOCATION']))
            return 2

        # Fill the stats cache
        if settings.WARMUP_STATS_CACHE:
            for dbname in added_connections:
                stats_service(dbname)

        # Fill the genotypes cache
        if settings.WARMUP_GENOTYPES_CACHE:
            for dbname in added_connections:
                genotypes_service(dbname)

        # Update the *annotation* table with versions of all programs used,
        # i.e. Gemini, VEP, their dbs, etc.
        for dbname in added_connections:
            add_versions(dbname)

        return 0