Esempio n. 1
0
def test_is_pathogenic_VEP97_conflicting(one_vep97_annotated_variant):
    """Check that the VEP97 fixture variant is reported as pathogenic."""
    ## GIVEN the variant provided by the one_vep97_annotated_variant fixture
    variant = one_vep97_annotated_variant

    ## WHEN evaluating the variant with is_pathogenic
    ## THEN the result is exactly True
    assert is_pathogenic(variant) is True
Esempio n. 2
0
def test_is_pathogenic_no_annotation(cyvcf2_variant):
    """Check that the untouched fixture variant is not reported as pathogenic."""
    ## GIVEN the fixture variant as is (no clinvar annotations are added here)
    variant = cyvcf2_variant

    ## WHEN evaluating the variant with is_pathogenic
    verdict = is_pathogenic(variant)

    ## THEN the variant is not classified as pathogenic
    assert verdict is False
Esempio n. 3
0
def test_is_pathogenic_classic_pathogenic(cyvcf2_variant):
    """Check that numerically coded clinvar annotations are reported as pathogenic."""
    ## GIVEN a variant carrying pipe separated clinvar annotations
    clinvar_info = {
        "CLNVID": "RCV000014440.17|RCV000014441.25|RCV000014442.25|RCV000014443.17|RCV000184011.1|RCV000188658.1",
        "CLNSIG": "5|4|3|2|1|0",
        "CLNREVSTAT": "conf|single|single|single|conf|conf",
    }
    for info_key, info_value in clinvar_info.items():
        cyvcf2_variant.INFO[info_key] = info_value

    ## WHEN evaluating the annotated variant with is_pathogenic
    verdict = is_pathogenic(cyvcf2_variant)

    ## THEN the variant is classified as pathogenic
    assert verdict is True
Esempio n. 4
0
def test_is_pathogenic_benign(cyvcf2_variant):
    """Check that a "Likely_benign" clinvar annotation is not reported as pathogenic."""
    ## GIVEN a variant whose clinvar significance is "Likely_benign"
    clinvar_info = (
        ("CLNVID", "265359"),
        ("CLNSIG", "Likely_benign"),
        ("CLNREVSTAT", "criteria_provided,_multiple_submitters,_no_conflicts"),
    )
    for info_key, info_value in clinvar_info:
        cyvcf2_variant.INFO[info_key] = info_value

    ## WHEN evaluating the annotated variant with is_pathogenic
    verdict = is_pathogenic(cyvcf2_variant)

    ## THEN the variant is not classified as pathogenic
    assert verdict is False
Esempio n. 5
0
    def _load_variants(
        self,
        variants,
        variant_type,
        case_obj,
        individual_positions,
        rank_threshold,
        institute_id,
        build=None,
        rank_results_header=None,
        vep_header=None,
        category="snv",
        sample_info=None,
    ):
        """Perform the loading of variants

        This is the function that loops over the variants, parses them and builds the
        variant objects so they are ready to be inserted into the database.

        A variant is loaded when it has no rank score, when its rank score is above
        ``rank_threshold``, when its chromosome name contains "MT" or when
        ``is_pathogenic`` flags it.

        Variants are inserted in bulks. Consecutive variants overlapping the same coding
        interval share a bulk, and compounds are updated for such a bulk before it is
        loaded. Consecutive variants outside the coding intervals share a bulk until it
        holds more than 10000 variants.

        Args:
            variants(iterable(cyvcf2.Variant))
            variant_type(str): ['clinical', 'research']
            case_obj(dict)
            individual_positions(dict): How individuals are positioned in vcf
            rank_threshold(int): Only load variants with a rank score > than this
            institute_id(str)
            build(str): Genome build, "37" is used when no build is given
            rank_results_header(list): Rank score categories
            vep_header(list)
            category(str): ['snv','sv','cancer','str']
            sample_info(dict): A dictionary with info about samples.
                               Strictly for cancer to tell which is tumor

        Returns:
            nr_inserted(int): Number of variants that passed the loading criteria
        """
        build = build or "37"
        genes = list(self.all_genes(build=build))
        gene_to_panels = self.gene_to_panels(case_obj)
        hgncid_to_gene = self.hgncid_to_gene(genes=genes, build=build)
        genomic_intervals = self.get_coding_intervals(genes=genes)
        # Fallback for chromosomes without coding intervals. It is only queried,
        # so one instance is shared instead of building a new tree for every variant
        no_intervals = IntervalTree()

        LOG.info("Start inserting %s %s variants into database", variant_type, category)
        start_insertion = datetime.now()
        start_five_thousand = datetime.now()
        # This is the number of parsed variants
        nr_variants = 0
        # This is the number of variants that meet the criteria and get inserted
        nr_inserted = 0

        nr_bulks = 0

        # We want to load batches of variants to reduce the number of network round trips
        bulk = {}
        current_region = None

        # Count from 1 so that nr_variants always equals the number of parsed variants,
        # also when the iterable holds a single variant
        for nr_variants, variant in enumerate(variants, 1):
            # All MT variants are loaded
            mt_variant = "MT" in variant.CHROM
            rank_score = parse_rank_score(variant.INFO.get("RankScore"), case_obj["_id"])
            pathogenic = is_pathogenic(variant)

            # Check if the variant should be loaded at all
            # if rank score is None means there are no rank scores annotated, all variants
            # will be loaded
            # Otherwise we load all variants above a rank score threshold
            # Except for MT and pathogenic variants, which are always loaded
            if (
                (rank_score is None)
                or (rank_score > rank_threshold)
                or mt_variant
                or pathogenic
            ):
                nr_inserted += 1
                # Parse the vcf variant
                parsed_variant = parse_variant(
                    variant=variant,
                    case=case_obj,
                    variant_type=variant_type,
                    rank_results_header=rank_results_header,
                    vep_header=vep_header,
                    individual_positions=individual_positions,
                    category=category,
                )

                # Build the variant object
                variant_obj = build_variant(
                    variant=parsed_variant,
                    institute_id=institute_id,
                    gene_to_panels=gene_to_panels,
                    hgncid_to_gene=hgncid_to_gene,
                    sample_info=sample_info,
                )

                # Check if the variant is in a genomic region
                var_chrom = variant_obj["chromosome"]
                var_start = variant_obj["position"]
                # We need to make sure that the interval has a length > 0
                var_end = variant_obj["end"] + 1
                var_id = variant_obj["_id"]
                # If the bulk should be loaded or not
                load = True
                new_region = None

                intervals = genomic_intervals.get(var_chrom, no_intervals)
                genomic_regions = intervals.overlap(var_start, var_end)

                # If the variant is in a coding region
                if genomic_regions:
                    # We know there is data here so get the interval id
                    new_region = genomic_regions.pop().data
                    # If the variant is in the same region as previous
                    # we add it to the same bulk
                    if new_region == current_region:
                        load = False

                # This is the case where the variant is intergenic
                else:
                    # If the previous variant was also intergenic we add the variant to the bulk
                    if not current_region:
                        load = False
                    # We need to have a max size of the bulk
                    if len(bulk) > 10000:
                        load = True

                # Load the collected bulk before starting a new one with this variant
                if load:
                    # If the variant bulk contains coding variants we want to update the compounds
                    if current_region:
                        self.update_compounds(bulk)
                    try:
                        # Load the variants
                        self.load_variant_bulk(list(bulk.values()))
                        nr_bulks += 1
                    except IntegrityError as error:
                        # Best effort: a failing bulk must not abort the whole load,
                        # but it should leave a trace in the log
                        LOG.warning("Could not load variant bulk: %s", error)
                    bulk = {}

                current_region = new_region
                bulk[var_id] = variant_obj

                if nr_inserted % 1000 == 0:
                    LOG.info("%s variants inserted", nr_inserted)

            # Report progress for every 5000 variants read, whether or not the
            # current variant was selected for loading
            if nr_variants % 5000 == 0:
                LOG.info("%s variants parsed", str(nr_variants))
                LOG.info(
                    "Time to parse variants: %s",
                    (datetime.now() - start_five_thousand),
                )
                start_five_thousand = datetime.now()

        # If the variants are in a coding region we update the compounds
        if current_region:
            self.update_compounds(bulk)

        # Load the final variant bulk
        self.load_variant_bulk(list(bulk.values()))
        nr_bulks += 1
        LOG.info(
            "All variants inserted, time to insert variants: %s",
            datetime.now() - start_insertion,
        )

        LOG.info("Nr variants parsed: %s", nr_variants)
        LOG.info("Nr variants inserted: %s", nr_inserted)
        LOG.debug("Nr bulks inserted: %s", nr_bulks)

        return nr_inserted