Ejemplo n.º 1
0
def test_parse_small_sv(one_sv_variant, case_obj):
    """Parse a single SV and check its category, sub category and position."""
    # WHEN parsing the structural variant
    result = parse_variant(one_sv_variant, case_obj)

    # THEN it is categorised as a structural variant
    assert result["category"] == "sv"
    # AND the sub category is the lower-cased SVTYPE of the INFO field
    assert result["sub_category"] == one_sv_variant.INFO["SVTYPE"].lower()
    # AND the position equals the position of the VCF record
    assert result["position"] == int(one_sv_variant.POS)
Ejemplo n.º 2
0
def test_parse_small_sv(one_sv_variant, case_obj):
    """Parse a single SV and check its category, sub category and position."""
    # WHEN parsing the structural variant
    result = parse_variant(one_sv_variant, case_obj)

    # THEN it is categorised as a structural variant
    assert result['category'] == 'sv'
    # AND the sub category is the lower-cased SVTYPE of the INFO field
    assert result['sub_category'] == one_sv_variant.INFO['SVTYPE'].lower()
    # AND the position equals the position of the VCF record
    assert result['position'] == int(one_sv_variant.POS)
Ejemplo n.º 3
0
def test_parse_with_header(one_variant, case_obj, rank_results_header):
    """Parse a variant together with the rank results header."""
    # WHEN parsing the variant with the rank results header supplied
    result = parse_variant(
        one_variant, case_obj, rank_results_header=rank_results_header
    )

    # THEN the chromosome is parsed
    assert result['chromosome'] == '1'
    # AND the rank result holds the score of the 'Consequence' category
    rank_result = result['rank_result']
    assert rank_result['Consequence'] == 1
Ejemplo n.º 4
0
def test_parse_minimal(one_variant, case_obj):
    """Parse one variant as 'clinical' and check position and category."""
    # WHEN parsing the variant as a clinical variant
    result = parse_variant(one_variant, case_obj, variant_type="clinical")

    # THEN the position equals the position of the VCF record
    assert result["position"] == int(one_variant.POS)
    # AND the category is 'snv'
    assert result["category"] == "snv"
Ejemplo n.º 5
0
def test_load_vep97_parsed_variant(one_vep97_annotated_variant,
                                   real_populated_database, case_obj):
    """Parse a VEP v97 annotated variant, load it and inspect the stored fields."""

    # GIVEN the CSQ entry fields that were used to annotate the variant
    csq_header = "Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|REFSEQ_MATCH|SOURCE|GIVEN_REF|USED_REF|BAM_EDIT|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|MES-NCSS_downstream_acceptor|MES-NCSS_downstream_donor|MES-NCSS_upstream_acceptor|MES-NCSS_upstream_donor|MES-SWA_acceptor_alt|MES-SWA_acceptor_diff|MES-SWA_acceptor_ref|MES-SWA_acceptor_ref_comp|MES-SWA_donor_alt|MES-SWA_donor_diff|MES-SWA_donor_ref|MES-SWA_donor_ref_comp|MaxEntScan_alt|MaxEntScan_diff|MaxEntScan_ref|LoFtool|ExACpLI|GERP++_NR|GERP++_RS|REVEL_rankscore|phastCons100way_vertebrate|phyloP100way_vertebrate|CLINVAR|CLINVAR_CLNSIG|CLINVAR_CLNVID|CLINVAR_CLNREVSTAT|genomic_superdups_frac_match"

    # The header is handed to the parser as upper-cased field names
    vep_fields = list(map(str.upper, csq_header.split("|")))

    # WHEN the variant is parsed with that header
    parsed = parse_variant(variant=one_vep97_annotated_variant,
                           vep_header=vep_fields,
                           case=case_obj)

    # GIVEN a database without any variants
    adapter = real_populated_database
    assert adapter.variant_collection.find_one() is None

    # WHEN loading the parsed variant into the database
    adapter.load_variant(variant_obj=parsed)

    # THEN the stored variant has a float revel score
    stored = adapter.variant_collection.find_one()
    assert isinstance(stored["revel_score"], float)

    # AND every conservation field is reported as not conserved
    for conservation in stored["conservation"].values():
        assert conservation == ["NotConserved"]

    # AND the first clinvar entry has the expected field types
    first_clnsig = stored["clnsig"][0]
    assert isinstance(first_clnsig["accession"], int)
    assert first_clnsig["value"] in REV_CLINSIG_MAP  # can be str or int
    assert isinstance(first_clnsig["revstat"], str)
Ejemplo n.º 6
0
def test_load_cancer_SV_variant(one_cancer_manta_SV_variant,
                                real_populated_database, cancer_case_obj):
    """Parse a cancer SV variant, load it and check the stored document."""

    # GIVEN a database that holds exactly one cancer case
    adapter = real_populated_database
    adapter.case_collection.insert_one(cancer_case_obj)
    cancer_cases = adapter.case_collection.find({"track": "cancer"})
    assert sum(1 for _ in cancer_cases) == 1

    # AND no variants
    assert adapter.variant_collection.find_one() is None

    # WHEN parsing the SV variant and loading it into the database
    parsed = parse_variant(variant=one_cancer_manta_SV_variant,
                           case=cancer_case_obj)
    adapter.load_variant(variant_obj=parsed)

    # THEN the stored variant is a clinical variant with coordinates
    stored = adapter.variant_collection.find_one()
    assert stored["variant_type"] == "clinical"
    assert stored["chromosome"]
    assert stored["position"]
    assert stored["end"]
    # AND the somatic score is stored as an integer
    assert isinstance(stored["somatic_score"], int)
Ejemplo n.º 7
0
def test_parse_minimal(one_variant, case_obj):
    """Parse one variant as 'clinical' and check position and category."""
    # WHEN parsing the variant as a clinical variant
    result = parse_variant(one_variant, case_obj, variant_type='clinical')

    # THEN the position equals the position of the VCF record
    assert result['position'] == int(one_variant.POS)
    # AND the category is 'snv'
    assert result['category'] == 'snv'
Ejemplo n.º 8
0
def test_parse_clinsig_vep97(one_vep97_annotated_variant,
                             real_populated_database, case_obj):
    """Check the types of the ClinVar fields of a loaded VEP97 variant."""

    # GIVEN the CSQ entry fields that were used to annotate the variant
    csq_header = "Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|REFSEQ_MATCH|SOURCE|GIVEN_REF|USED_REF|BAM_EDIT|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|MES-NCSS_downstream_acceptor|MES-NCSS_downstream_donor|MES-NCSS_upstream_acceptor|MES-NCSS_upstream_donor|MES-SWA_acceptor_alt|MES-SWA_acceptor_diff|MES-SWA_acceptor_ref|MES-SWA_acceptor_ref_comp|MES-SWA_donor_alt|MES-SWA_donor_diff|MES-SWA_donor_ref|MES-SWA_donor_ref_comp|MaxEntScan_alt|MaxEntScan_diff|MaxEntScan_ref|GERP++_NR|GERP++_RS|REVEL_rankscore|phastCons100way_vertebrate|phyloP100way_vertebrate|LoFtool|ExACpLI|CLINVAR|CLINVAR_CLNSIG|CLINVAR_CLNVID|CLINVAR_CLNREVSTAT|genomic_superdups_frac_match"

    # The header is handed to the parser as upper-cased field names
    vep_fields = list(map(str.upper, csq_header.split("|")))

    # WHEN parsed using the parse_variant method
    parsed = parse_variant(variant=one_vep97_annotated_variant,
                           vep_header=vep_fields,
                           case=case_obj)

    # GIVEN a database without any variants
    adapter = real_populated_database
    assert adapter.variant_collection.find_one() is None

    # WHEN loading the variant into the database
    adapter.load_variant(variant_obj=parsed)

    # THEN the stored variant has a non-empty first ClinVar entry
    stored = adapter.variant_collection.find_one()
    clnsig_entry = stored["clnsig"][0]
    assert clnsig_entry

    # ClinVar accession should be a numerical value
    assert isinstance(clnsig_entry["accession"], int)

    # Value field should be a string (i.e. pathogenic, benign,..)
    assert isinstance(clnsig_entry["value"], str)

    # Revstat field should also be a string (i.e. criteria_provided, ..)
    assert isinstance(clnsig_entry["revstat"], str)
Ejemplo n.º 9
0
def test_parse_with_header(one_variant, case_obj, rank_results_header):
    """Parse a variant together with the rank results header."""
    # WHEN parsing the variant with the rank results header supplied
    result = parse_variant(
        one_variant, case_obj, rank_results_header=rank_results_header
    )

    # THEN the chromosome is parsed
    assert result['chromosome'] == '1'
    # AND the rank result holds the score of the 'Consequence' category
    rank_result = result['rank_result']
    assert rank_result['Consequence'] == 1
Ejemplo n.º 10
0
def test_build_minimal(case_obj, cyvcf2_variant):
    """Build a variant object from a parsed variant with minimal information."""
    ## GIVEN a parsed variant with minimal information
    parsed = parse_variant(cyvcf2_variant, case_obj)
    assert "ids" in parsed

    ## WHEN building the variant object
    built = build_variant(parsed, INSTITUTE_ID)

    ## THEN the document id of the parsed variant is used as '_id'
    assert built["_id"] == parsed["ids"]["document_id"]
Ejemplo n.º 11
0
def test_parse_with_header(one_variant, case_obj, rank_results_header):
    """Parse a variant together with the rank results header."""
    # WHEN parsing the variant with the rank results header supplied
    result = parse_variant(
        one_variant, case_obj, rank_results_header=rank_results_header
    )

    # THEN the chromosome is parsed
    assert result["chromosome"] == "1"
    # AND the rank result holds the score of the 'Consequence' category
    rank_result = result["rank_result"]
    assert rank_result["Consequence"] == 1
Ejemplo n.º 12
0
def test_parse_cadd(variants, case_obj):
    """Check that the CADD score of the INFO field ends up in 'cadd_score'."""
    # GIVEN some variants
    for vcf_variant in variants:
        # Variants without a CADD annotation are not checked
        if "CADD" not in vcf_variant.INFO:
            continue
        # WHEN a score is present and the variant is parsed
        expected_score = float(vcf_variant.INFO["CADD"])
        parsed = parse_variant(vcf_variant, case_obj)
        # THEN the parsed cadd score equals the annotated one
        assert parsed["cadd_score"] == expected_score
Ejemplo n.º 13
0
def test_parse_cadd(variants, case_obj):
    """Check that the CADD score of the INFO field ends up in 'cadd_score'."""
    # GIVEN some variants
    for vcf_variant in variants:
        # Variants without a CADD annotation are not checked
        if 'CADD' not in vcf_variant.INFO:
            continue
        # WHEN a score is present and the variant is parsed
        expected_score = float(vcf_variant.INFO['CADD'])
        parsed = parse_variant(vcf_variant, case_obj)
        # THEN the parsed cadd score equals the annotated one
        assert parsed['cadd_score'] == expected_score
Ejemplo n.º 14
0
def parsed_sv_variants(request, sv_variants, case_obj):
    """Return a generator that lazily parses every variant in ``sv_variants``."""
    print('')
    # Map each sample name to its index among the samples of the VCF
    sample_index = {
        sample: idx for idx, sample in enumerate(sv_variants.samples)
    }

    return (
        parse_variant(sv, case_obj, individual_positions=sample_index)
        for sv in sv_variants
    )
Ejemplo n.º 15
0
def test_parse_small_str(one_str_variant, case_obj):
    """Parse a single STR variant and compare it with its INFO field."""
    # WHEN parsing the variant with category 'str'
    result = parse_variant(one_str_variant, case_obj, category="str")

    # THEN the category is kept
    assert result["category"] == "str"
    # AND the STR specific values are taken from the INFO field
    assert result["str_status"] == one_str_variant.INFO["STR_STATUS"]
    assert result["str_normal_max"] == one_str_variant.INFO["STR_NORMAL_MAX"]
    assert (
        result["str_pathologic_min"]
        == one_str_variant.INFO["STR_PATHOLOGIC_MIN"]
    )
    # AND the position equals the position of the VCF record
    assert result["position"] == int(one_str_variant.POS)
Ejemplo n.º 16
0
def test_parse_hmtvar(cyvcf2_variant, case_obj):
    """Check that the HmtVar INFO value is parsed into 'hmtvar_variant_id'."""

    # GIVEN a variant with a HmtVar key in the INFO field
    cyvcf2_variant.INFO["HmtVar"] = "39192"
    expected_id = int(cyvcf2_variant.INFO["HmtVar"])

    # WHEN the variant is parsed
    parsed = parse_variant(cyvcf2_variant, case_obj)

    # THEN the id is available as an integer
    assert parsed["hmtvar_variant_id"] == expected_id
Ejemplo n.º 17
0
def test_parse_many_svs(sv_variants, case_obj):
    """Check that every variant in ``sv_variants`` can be parsed.

    The test fails with the coordinates of the offending variant when
    ``parse_variant`` raises a ``VcfError``; otherwise the parsed chromosome
    must equal the chromosome of the VCF record.
    """

    for variant in sv_variants:
        try:
            parsed_variant = parse_variant(variant, case_obj)
        except VcfError as err:
            # Report through attributes only: this test reads the variant via
            # ``variant.CHROM``, so subscripting it here (as a dict) would
            # presumably raise a new error and hide the parse failure.
            raise AssertionError(
                "Could not parse variant {0}:{1}: {2}".format(
                    variant.CHROM, variant.POS, err))
        assert parsed_variant['chromosome'] == variant.CHROM
Ejemplo n.º 18
0
def test_parse_many_svs(sv_variants, case_obj):
    """Check that every variant in ``sv_variants`` can be parsed.

    The test fails with the coordinates of the offending variant when
    ``parse_variant`` raises a ``VcfError``; otherwise the parsed chromosome
    must equal the chromosome of the VCF record.
    """

    for variant in sv_variants:
        try:
            parsed_variant = parse_variant(variant, case_obj)
        except VcfError as err:
            # Report through attributes only: this test reads the variant via
            # ``variant.CHROM``, so subscripting it here (as a dict) would
            # presumably raise a new error and hide the parse failure.
            raise AssertionError(
                "Could not parse variant {0}:{1}: {2}".format(
                    variant.CHROM, variant.POS, err))
        assert parsed_variant['chromosome'] == variant.CHROM
Ejemplo n.º 19
0
def test_parse_many_strs(str_variants, case_obj):
    """Check that every variant in ``str_variants`` can be parsed as 'str'.

    The test fails with the coordinates of the offending variant when
    ``parse_variant`` raises a ``VcfError``; otherwise the parsed chromosome
    must equal the chromosome of the VCF record.
    """

    for variant in str_variants:
        try:
            parsed_variant = parse_variant(variant, case_obj, category="str")
        except VcfError as err:
            # Report through attributes only: this test reads the variant via
            # ``variant.CHROM``, so subscripting it here (as a dict) would
            # presumably raise a new error and hide the parse failure.
            raise AssertionError(
                "Could not parse variant {0}:{1}: {2}".format(
                    variant.CHROM, variant.POS, err))
        assert parsed_variant["chromosome"] == variant.CHROM
Ejemplo n.º 20
0
def test_parse_mitomapassociateddiseases(cyvcf2_variant, case_obj):
    """Check that MitomapAssociatedDiseases is parsed from the INFO field"""

    # GIVEN a variant with a MitomapAssociatedDiseases key in the INFO field
    cyvcf2_variant.INFO["MitomapAssociatedDiseases"] = "LHON"
    expected_diseases = cyvcf2_variant.INFO["MitomapAssociatedDiseases"]

    # WHEN the variant is parsed
    parsed = parse_variant(cyvcf2_variant, case_obj)

    # THEN the value is available under 'mitomap_associated_diseases'
    assert parsed["mitomap_associated_diseases"] == expected_diseases
Ejemplo n.º 21
0
def parsed_cancer_variant(request, cancer_variants, one_cancer_variant,
                          cancer_case_obj):
    """Return ``one_cancer_variant`` parsed for the cancer case."""
    # Map each sample name to its index among the samples of the VCF
    sample_index = {
        sample: idx for idx, sample in enumerate(cancer_variants.samples)
    }

    return parse_variant(
        one_cancer_variant,
        cancer_case_obj,
        individual_positions=sample_index,
    )
Ejemplo n.º 22
0
def test_parse_old_obs_archive_SV(case_obj, cyvcf2_variant):
    """Check that the old local observation count and frequency are parsed."""

    expected_obs = 22
    expected_freq = 0.3

    # GIVEN a VCF variant annotated with old local observation stats
    cyvcf2_variant.INFO["clinical_genomics_loqusObs"] = expected_obs
    cyvcf2_variant.INFO["clinical_genomics_loqusFrq"] = expected_freq

    # WHEN parsing the variant
    parsed = parse_variant(cyvcf2_variant, case_obj)

    # THEN both values are available on the parsed variant
    assert parsed["local_obs_old"] == expected_obs
    assert parsed["local_obs_old_freq"] == expected_freq
Ejemplo n.º 23
0
def test_parse_revel(cyvcf2_variant, case_obj):
    """Check that the REVEL score is parsed from the CSQ entry."""
    ## GIVEN a variant with a REVEL score in the CSQ entry
    csq_header = "ALLELE|CONSEQUENCE|REVEL_rankscore"
    # Two comma separated entries mimic a variant with several transcripts
    cyvcf2_variant.INFO["CSQ"] = "C|missense_variant|0.75,C|missense_variant|0.75"

    # The header is handed to the parser as upper-cased field names
    vep_fields = list(map(str.upper, csq_header.split("|")))

    # WHEN the variant is parsed
    parsed = parse_variant(
        variant=cyvcf2_variant, case=case_obj, vep_header=vep_fields
    )

    # THEN the REVEL score should be parsed correctly
    assert parsed["revel_score"] == 0.75
Ejemplo n.º 24
0
def test_build_minimal(case_obj):
    """Build a variant object from a hand-made variant with minimal information."""

    ## GIVEN a variant with minimal information
    class Cyvcf2Variant(object):
        """Stand-in for a VCF record that carries only the attributes set below."""

        def __init__(self):
            # Coordinates
            self.CHROM = '1'
            self.POS = 10
            self.end = 11
            # Alleles and variant type
            self.REF = 'A'
            self.ALT = ['C']
            self.var_type = 'snp'
            # Remaining record fields are left without information
            self.ID = '.'
            self.QUAL = None
            self.FILTER = None
            self.INFO = {}

    ## WHEN parsing the variant and building the variant object
    parsed = parse_variant(Cyvcf2Variant(), case_obj)
    assert 'ids' in parsed
    built = build_variant(parsed, INSTITUTE_ID)

    ## THEN the document id of the parsed variant is used as '_id'
    assert built['_id'] == parsed['ids']['document_id']
Ejemplo n.º 25
0
def test_build_minimal(case_obj):
    """Build a variant object from a hand-made variant with minimal information."""

    ## GIVEN a variant with minimal information
    class Cyvcf2Variant(object):
        """Stand-in for a VCF record that carries only the attributes set below."""

        def __init__(self):
            # Coordinates
            self.CHROM = '1'
            self.POS = 10
            self.end = 11
            # Alleles and variant type
            self.REF = 'A'
            self.ALT = ['C']
            self.var_type = 'snp'
            # Remaining record fields are left without information
            self.ID = '.'
            self.QUAL = None
            self.FILTER = None
            self.INFO = {}

    ## WHEN parsing the variant and building the variant object
    parsed = parse_variant(Cyvcf2Variant(), case_obj)
    assert 'ids' in parsed
    built = build_variant(parsed, INSTITUTE_ID)

    ## THEN the document id of the parsed variant is used as '_id'
    assert built['_id'] == parsed['ids']['document_id']
Ejemplo n.º 26
0
def test_build_minimal(case_obj):
    """Build a variant object from a hand-made variant with minimal information."""

    ## GIVEN a variant with minimal information
    class Cyvcf2Variant(object):
        """Stand-in for a VCF record that carries only the attributes set below."""

        def __init__(self):
            # Coordinates
            self.CHROM = "1"
            self.POS = 10
            self.end = 11
            # Alleles and variant type
            self.REF = "A"
            self.ALT = ["C"]
            self.var_type = "snp"
            # Remaining record fields are left without information
            self.ID = "."
            self.QUAL = None
            self.FILTER = None
            self.INFO = {}

    ## WHEN parsing the variant and building the variant object
    parsed = parse_variant(Cyvcf2Variant(), case_obj)
    assert "ids" in parsed
    built = build_variant(parsed, INSTITUTE_ID)

    ## THEN the document id of the parsed variant is used as '_id'
    assert built["_id"] == parsed["ids"]["document_id"]
Ejemplo n.º 27
0
def parsed_str_variant(request, one_str_variant, case_obj):
    """Return ``one_str_variant`` parsed with category 'str'."""
    print("")
    return parse_variant(one_str_variant, case_obj, category="str")
Ejemplo n.º 28
0
def parsed_sv_variant(request, one_sv_variant, case_obj):
    """Return ``one_sv_variant`` parsed for the case."""
    print('')
    return parse_variant(one_sv_variant, case_obj)
Ejemplo n.º 29
0
def test_parse_small_sv(one_sv_variant, case_obj):
    """Parse a single SV and check its category, sub category and position."""
    # WHEN parsing the structural variant
    result = parse_variant(one_sv_variant, case_obj)

    # THEN it is categorised as a structural variant
    assert result['category'] == 'sv'
    # AND the sub category is the lower-cased SVTYPE of the INFO field
    assert result['sub_category'] == one_sv_variant.INFO['SVTYPE'].lower()
    # AND the position equals the position of the VCF record
    assert result['position'] == int(one_sv_variant.POS)
Ejemplo n.º 30
0
def test_parse_many_snvs(variants, case_obj):
    """Check that every variant keeps its chromosome when parsed."""

    for vcf_variant in variants:
        # WHEN parsing the variant
        parsed = parse_variant(vcf_variant, case_obj)
        # THEN the parsed chromosome equals the one of the VCF record
        assert parsed['chromosome'] == vcf_variant.CHROM
Ejemplo n.º 31
0
def test_parse_customannotation(one_variant_customannotation, case_obj):
    """Check that custom annotations are parsed into key/value pairs."""
    expected_custom = [["key1", "val1"], ["key2", "val2"]]

    # WHEN parsing the variant that carries custom annotations
    parsed = parse_variant(one_variant_customannotation, case_obj)

    # THEN the annotations are returned as a list of [key, value] lists
    assert parsed["custom"] == expected_custom
Ejemplo n.º 32
0
def load_variants(adapter, variant_file, case_obj, variant_type='clinical',
                  category='snv', rank_threshold=5, chrom=None, start=None,
                  end=None):
    """Parse the variants of a VCF file and load them into the database.

    Without ``chrom`` only variants with a rank score above
    ``rank_threshold`` are parsed and loaded. With ``chrom`` every variant is
    parsed and the ones accepted by ``check_coordinates`` for the region are
    loaded, regardless of their rank score.

    If anything fails while loading and no region was given, the variants of
    the case with this ``variant_type`` are deleted before the error is
    re-raised.

        Args:
            adapter(MongoAdapter)
            variant_file(str): Path to variant file
            case_obj(Case)
            variant_type(str)
            category(str): 'snv' or 'sv'
            rank_threshold(int)
            chrom(str)
            start(int)
            end(int)

        Raises:
            IntegrityError: If the institute that owns the case does not
                            exist in the database
    """
    # NOTE(review): ``category`` is accepted but not used in this function.

    institute_obj = adapter.institute(institute_id=case_obj['owner'])

    if not institute_obj:
        raise IntegrityError("Institute {0} does not exist in"
                             " database.".format(case_obj['owner']))

    gene_to_panels = adapter.gene_to_panels()

    hgncid_to_gene = adapter.hgncid_to_gene()

    vcf_obj = VCF(variant_file)

    rank_results_header = parse_rank_results_header(vcf_obj)
    vep_header = parse_vep_header(vcf_obj)

    # This is a dictionary to tell where ind are in vcf
    individual_positions = {
        ind: position for position, ind in enumerate(vcf_obj.samples)
    }

    logger.info("Start inserting variants into database")
    start_insertion = datetime.now()
    start_five_thousand = datetime.now()
    # Number of variants read from the file so far (1-based count)
    nr_variants = 0
    # Number of variants that were inserted into the database
    nr_inserted = 0

    # A region is only used when a chromosome is given
    coordinates = None
    if chrom:
        coordinates = {
            'chrom': chrom,
            'start': start,
            'end': end
        }

    try:
        # Counting from 1 keeps the totals right for empty and one-variant files
        for nr_variants, variant in enumerate(vcf_obj, 1):
            rank_score = parse_rank_score(
                variant.INFO.get('RankScore'),
                case_obj['display_name']
            )

            # NOTE(review): if parse_rank_score can return None, this
            # comparison fails on Python 3 when no region is given - confirm.
            if coordinates or (rank_score > rank_threshold):
                parsed_variant = parse_variant(
                    variant=variant,
                    case=case_obj,
                    variant_type=variant_type,
                    rank_results_header=rank_results_header,
                    vep_header=vep_header,
                    individual_positions=individual_positions
                )
                # If there are coordinates, only variants inside the region
                # should be loaded
                add_variant = (not coordinates or
                               check_coordinates(parsed_variant, coordinates))

                if add_variant:
                    variant_obj = build_variant(
                        variant=parsed_variant,
                        institute_id=institute_obj['_id'],
                        gene_to_panels=gene_to_panels,
                        hgncid_to_gene=hgncid_to_gene,
                    )
                    try:
                        load_variant(adapter, variant_obj)
                    except IntegrityError as error:
                        # Best effort: a variant that can not be inserted is
                        # skipped, but the reason is kept in the debug log
                        logger.debug("Variant was not inserted: %s", error)
                    else:
                        nr_inserted += 1
                        # Report once for every 1000 inserted variants
                        if nr_inserted % 1000 == 0:
                            logger.info("%s variants inserted", nr_inserted)

                if nr_variants % 5000 == 0:
                    logger.info("%s variants parsed", nr_variants)
                    logger.info("Time to parse variants: %s ",
                                datetime.now() - start_five_thousand)
                    start_five_thousand = datetime.now()

    except Exception:
        # Do not leave a partially loaded case behind. When a region is
        # loaded the variants that were already in the database are kept.
        if not coordinates:
            logger.warning("Deleting inserted variants")
            delete_variants(adapter, case_obj, variant_type)
        raise

    logger.info("All variants inserted.")
    logger.info("Number of variants in file: {0}".format(nr_variants))
    logger.info("Number of variants inserted: {0}".format(nr_inserted))
    logger.info("Time to insert variants:{0}".format(datetime.now() - start_insertion))
Ejemplo n.º 33
0
def test_compounds_region(real_populated_database, case_obj,
                          variant_clinical_file):
    """Check that compounds are only annotated by update_case_compounds.

    All variants of the clinical VCF are parsed, built and inserted straight
    into the variant collection. At that point no compound may carry the
    'genes' or 'not_loaded' keys. Once update_case_compounds has been run,
    every compound of a variant whose genes are all present in the database
    must carry both keys.
    """
    adapter = real_populated_database
    variant_type = "clinical"
    category = "snv"

    ## GIVEN a database without any variants
    assert adapter.variant_collection.find_one() is None

    institute_id = adapter.institute_collection.find_one()["_id"]

    ## WHEN loading variants into the database without updating compound information
    vcf_obj = VCF(variant_clinical_file)
    rank_results_header = parse_rank_results_header(vcf_obj)
    vep_header = parse_vep_header(vcf_obj)

    # Map each sample name to its index among the samples of the VCF
    individual_positions = {
        ind: position for position, ind in enumerate(vcf_obj.samples)
    }

    variants = []
    for vcf_variant in vcf_obj:
        parsed = parse_variant(
            variant=vcf_variant,
            case=case_obj,
            variant_type=variant_type,
            rank_results_header=rank_results_header,
            vep_header=vep_header,
            individual_positions=individual_positions,
            category=category,
        )
        variants.append(
            build_variant(variant=parsed, institute_id=institute_id))

    # Load all variants
    adapter.variant_collection.insert_many(variants)

    print("Nr variants: {0}".format(len(variants)))

    ## THEN assert that the variants do not have updated compound information
    nr_compounds = 0
    for var in adapter.variant_collection.find():
        for comp in var.get("compounds") or []:
            assert "genes" not in comp
            assert "not_loaded" not in comp
            nr_compounds += 1

    assert nr_compounds > 0

    ## WHEN updating all compounds for a case
    adapter.update_case_compounds(case_obj)
    hgnc_ids = {gene["hgnc_id"] for gene in adapter.all_genes()}

    ## THEN assert that all compounds (within the gene definition) are updated
    nr_compounds = 0
    for var in adapter.variant_collection.find():
        # Variants with a gene that is missing from the database are skipped
        if any(hgnc_id not in hgnc_ids for hgnc_id in var["hgnc_ids"]):
            continue
        for comp in var.get("compounds") or []:
            nr_compounds += 1
            assert "genes" in comp
            assert "not_loaded" in comp
    assert nr_compounds > 0
Ejemplo n.º 34
0
    def _load_variants(self, variants, variant_type, case_obj, individual_positions, rank_threshold,
                       institute_id, build=None, rank_results_header=None, vep_header=None,
                       category='snv', sample_info = None):
        """Perform the loading of variants

        This is the function that loops over the variants, parse them and build the variant
        objects so they are ready to be inserted into the database.

        Variants are collected in bulks. A bulk is written when the next
        variant falls in another coding interval than the previous one, or
        when a run of intergenic variants has grown past 10000 variants.
        Bulks that belong to a coding interval get their compounds updated
        before they are written.

        Args:
            variants(iterable(cyvcf2.Variant))
            variant_type(str): ['clinical', 'research']
            case_obj(dict)
            individual_positions(dict): How individuals are positioned in vcf
            rank_threshold(int): Only load variants with a rank score > than this
            institute_id(str)
            build(str): Genome build, '37' is used if not given
            rank_results_header(list): Rank score categories
            vep_header(list)
            category(str): ['snv','sv','cancer','str']
            sample_info(dict): A dictionary with info about samples.
                               Strictly for cancer to tell which is tumor

        Returns:
            nr_inserted(int): Number of variants that met the loading criteria
        """
        build = build or '37'
        genes = list(self.all_genes(build=build))
        gene_to_panels = self.gene_to_panels(case_obj)
        hgncid_to_gene = self.hgncid_to_gene(genes=genes)
        genomic_intervals = self.get_coding_intervals(genes=genes)

        LOG.info("Start inserting {0} {1} variants into database".format(variant_type, category))
        start_insertion = datetime.now()
        start_five_thousand = datetime.now()
        # This is the number of variants read so far (1-based count)
        nr_variants = 0
        # This is the number of variants that meet the criteria and get inserted
        nr_inserted = 0

        nr_bulks = 0

        # We want to load batches of variants to reduce the number of network round trips
        bulk = {}
        current_region = None

        # Counting from 1 keeps the total right for inputs with a single variant
        for nr_variants, variant in enumerate(variants, 1):
            # All MT variants are loaded
            mt_variant = 'MT' in variant.CHROM
            rank_score = parse_rank_score(variant.INFO.get('RankScore'), case_obj['_id'])

            # Check if the variant should be loaded at all
            # if rank score is None means there are no rank scores annotated, all variants will be loaded
            # Otherwise we load all variants above a rank score threshold
            # Except for MT variants where we load all variants
            if (rank_score is None) or (rank_score > rank_threshold) or mt_variant:
                nr_inserted += 1
                # Parse the vcf variant
                parsed_variant = parse_variant(
                    variant=variant,
                    case=case_obj,
                    variant_type=variant_type,
                    rank_results_header=rank_results_header,
                    vep_header=vep_header,
                    individual_positions=individual_positions,
                    category=category,
                )

                # Build the variant object
                variant_obj = build_variant(
                    variant=parsed_variant,
                    institute_id=institute_id,
                    gene_to_panels=gene_to_panels,
                    hgncid_to_gene=hgncid_to_gene,
                    sample_info=sample_info
                )

                # Check if the variant is in a genomic region
                var_chrom = variant_obj['chromosome']
                var_start = variant_obj['position']
                # We need to make sure that the interval has a length > 0
                var_end = variant_obj['end'] + 1
                var_id = variant_obj['_id']
                # If the bulk should be loaded or not
                load = True
                new_region = None

                genomic_regions = genomic_intervals.get(var_chrom, IntervalTree()).search(var_start, var_end)

                # If the variant is in a coding region
                if genomic_regions:
                    # We know there is data here so get the interval id
                    new_region = genomic_regions.pop().data
                    # If the variant is in the same region as previous
                    # we add it to the same bulk
                    if new_region == current_region:
                        load = False

                # This is the case where the variant is intergenic
                else:
                    # If the previous variant was also intergenic we add the variant to the bulk
                    if not current_region:
                        load = False
                    # We need to have a max size of the bulk
                    if len(bulk) > 10000:
                        load = True
                # Load the variant object
                if load:
                    # If the variant bulk contains coding variants we want to update the compounds
                    if current_region:
                        self.update_compounds(bulk)
                    try:
                        # Load the variants
                        self.load_variant_bulk(list(bulk.values()))
                        nr_bulks += 1
                    except IntegrityError as error:
                        # Best effort: a bulk that can not be written is
                        # dropped, but the reason is kept in the debug log
                        LOG.debug("Variant bulk was not inserted: %s", error)
                    bulk = {}

                current_region = new_region
                bulk[var_id] = variant_obj

                if nr_variants % 5000 == 0:
                    LOG.info("%s variants parsed", str(nr_variants))
                    LOG.info("Time to parse variants: %s",
                                (datetime.now() - start_five_thousand))
                    start_five_thousand = datetime.now()

                # Report once for every 1000 variants that met the criteria
                if nr_inserted % 1000 == 0:
                    LOG.info("%s variants inserted", nr_inserted)
        # If the variants are in a coding region we update the compounds
        if current_region:
            self.update_compounds(bulk)

        # Load the final variant bulk
        self.load_variant_bulk(list(bulk.values()))
        nr_bulks += 1
        LOG.info("All variants inserted, time to insert variants: {0}".format(
            datetime.now() - start_insertion))

        LOG.info("Nr variants parsed: %s", nr_variants)
        LOG.info("Nr variants inserted: %s", nr_inserted)
        LOG.debug("Nr bulks inserted: %s", nr_bulks)

        return nr_inserted
Ejemplo n.º 35
0
    def _load_variants(
        self,
        variants,
        variant_type,
        case_obj,
        individual_positions,
        rank_threshold,
        institute_id,
        build=None,
        rank_results_header=None,
        vep_header=None,
        category="snv",
        sample_info=None,
    ):
        """Perform the loading of variants

        Loops over the variants, parses the ones that meet the loading criteria,
        builds variant objects from them and inserts those into the database in
        bulks. A bulk is flushed whenever the coding region changes (or when an
        intergenic bulk grows too large), and compounds are updated for bulks
        that belong to a coding region before they are inserted.

        Args:
            variants(iterable(cyvcf2.Variant))
            variant_type(str): ['clinical', 'research']
            case_obj(dict)
            individual_positions(dict): How individuals are positioned in vcf
            rank_threshold(int): Only load variants with a rank score > than this
            institute_id(str)
            build(str): Genome build, defaults to "37" when not given
            rank_results_header(list): Rank score categories
            vep_header(list)
            category(str): ['snv','sv','cancer','str']
            sample_info(dict): A dictionary with info about samples.
                               Strictly for cancer to tell which is tumor

        Returns:
            nr_inserted(int): Number of variants that met the loading criteria
        """
        build = build or "37"
        genes = list(self.all_genes(build=build))
        gene_to_panels = self.gene_to_panels(case_obj)
        hgncid_to_gene = self.hgncid_to_gene(genes=genes, build=build)
        genomic_intervals = self.get_coding_intervals(genes=genes)

        LOG.info("Start inserting %s %s variants into database", variant_type, category)
        start_insertion = datetime.now()
        start_five_thousand = datetime.now()
        # Number of variants read from the input (1-based, so it is a true count)
        nr_variants = 0
        # Number of variants that meet the criteria and get inserted
        nr_inserted = 0
        nr_bulks = 0

        # We want to load batches of variants to reduce the number of network round trips
        bulk = {}
        current_region = None
        # Shared fallback for chromosomes without coding intervals; it is only queried
        empty_intervals = IntervalTree()

        for nr_variants, variant in enumerate(variants, 1):
            # All MT variants are loaded
            mt_variant = "MT" in variant.CHROM
            rank_score = parse_rank_score(variant.INFO.get("RankScore"), case_obj["_id"])
            pathogenic = is_pathogenic(variant)

            # Check if the variant should be loaded at all.
            # A rank score of None means there are no rank scores annotated, then all
            # variants are loaded. Otherwise only variants above the rank score
            # threshold are loaded, except for MT and pathogenic variants that are
            # always loaded.
            if not (
                rank_score is None
                or rank_score > rank_threshold
                or mt_variant
                or pathogenic
            ):
                continue

            nr_inserted += 1
            # Parse the vcf variant
            parsed_variant = parse_variant(
                variant=variant,
                case=case_obj,
                variant_type=variant_type,
                rank_results_header=rank_results_header,
                vep_header=vep_header,
                individual_positions=individual_positions,
                category=category,
            )

            # Build the variant object
            variant_obj = build_variant(
                variant=parsed_variant,
                institute_id=institute_id,
                gene_to_panels=gene_to_panels,
                hgncid_to_gene=hgncid_to_gene,
                sample_info=sample_info,
            )

            # Check if the variant is in a genomic region
            var_chrom = variant_obj["chromosome"]
            var_start = variant_obj["position"]
            # We need to make sure that the interval has a length > 0
            var_end = variant_obj["end"] + 1
            var_id = variant_obj["_id"]
            # If the bulk should be loaded or not
            load = True
            new_region = None

            intervals = genomic_intervals.get(var_chrom, empty_intervals)
            genomic_regions = intervals.overlap(var_start, var_end)

            if genomic_regions:
                # The variant is in a coding region, get the interval id
                new_region = genomic_regions.pop().data
                # If the variant is in the same region as the previous one
                # we add it to the same bulk
                if new_region == current_region:
                    load = False
            else:
                # The variant is intergenic. If the previous variant was also
                # intergenic we add the variant to the same bulk
                if not current_region:
                    load = False
                # We need to have a max size of the bulk
                if len(bulk) > 10000:
                    load = True

            if load:
                # If the variant bulk contains coding variants we want to update the compounds
                if current_region:
                    self.update_compounds(bulk)
                try:
                    self.load_variant_bulk(list(bulk.values()))
                    nr_bulks += 1
                except IntegrityError as error:
                    # Keep loading the remaining variants, but leave a trace of the failure
                    LOG.warning("Integrity error when loading variant bulk: %s", error)
                bulk = {}

            current_region = new_region
            bulk[var_id] = variant_obj

            # NOTE: progress is only reported when the milestone variant itself is loaded
            if nr_variants % 5000 == 0:
                LOG.info("%s variants parsed", nr_variants)
                LOG.info(
                    "Time to parse variants: %s",
                    (datetime.now() - start_five_thousand),
                )
                start_five_thousand = datetime.now()

            if nr_inserted % 1000 == 0:
                LOG.info("%s variants inserted", nr_inserted)

        # If the variants are in a coding region we update the compounds
        if current_region:
            self.update_compounds(bulk)

        # Load the final variant bulk
        self.load_variant_bulk(list(bulk.values()))
        nr_bulks += 1
        LOG.info(
            "All variants inserted, time to insert variants: %s",
            datetime.now() - start_insertion,
        )

        LOG.info("Nr variants parsed: %s", nr_variants)
        LOG.info("Nr variants inserted: %s", nr_inserted)
        LOG.debug("Nr bulks inserted: %s", nr_bulks)

        return nr_inserted
Ejemplo n.º 36
0
def test_parse_many_snvs(variants, case_obj):
    """Each parsed variant reports the same chromosome as its source record."""
    for vcf_record in variants:
        # WHEN parsing the record with the default arguments
        parsed = parse_variant(vcf_record, case_obj)
        # THEN the chromosome is carried over unchanged
        assert parsed["chromosome"] == vcf_record.CHROM
def test_compounds_region(real_populated_database, case_obj, variant_clinical_file):
    """Check that ``update_case_compounds`` completes the compound information.

    The variants are parsed, built and inserted straight into the variant
    collection, so their compounds have neither a 'genes' nor a 'not_loaded'
    key. After ``adapter.update_case_compounds`` has been run, every compound
    of a variant whose genes are all present in the database is expected to
    carry both keys.
    """
    adapter = real_populated_database
    variant_type = 'clinical'
    category = 'snv'
    ## GIVEN a database without any variants
    # find_one() is used since Cursor.count() no longer exists in PyMongo 4
    assert adapter.variant_collection.find_one() is None

    institute_obj = adapter.institute_collection.find_one()
    institute_id = institute_obj['_id']

    ## WHEN loading variants into the database without updating compound information
    vcf_obj = VCF(variant_clinical_file)
    rank_results_header = parse_rank_results_header(vcf_obj)
    vep_header = parse_vep_header(vcf_obj)

    # Map each sample name to its column position in the vcf
    individual_positions = {ind: pos for pos, ind in enumerate(vcf_obj.samples)}

    variants = []
    for variant in vcf_obj:
        parsed_variant = parse_variant(
            variant=variant,
            case=case_obj,
            variant_type=variant_type,
            rank_results_header=rank_results_header,
            vep_header=vep_header,
            individual_positions=individual_positions,
            category=category,
        )

        variant_obj = build_variant(
            variant=parsed_variant,
            institute_id=institute_id,
        )
        variants.append(variant_obj)

    # Load all variants
    adapter.variant_collection.insert_many(variants)

    print("Nr variants: {0}".format(len(variants)))

    ## THEN assert that the variants do not have updated compound information
    nr_compounds = 0
    for var in adapter.variant_collection.find():
        for comp in var.get('compounds') or []:
            assert 'genes' not in comp
            assert 'not_loaded' not in comp
            nr_compounds += 1

    # Guard against the checks above passing vacuously
    assert nr_compounds > 0

    ## WHEN updating all compounds for a case
    adapter.update_case_compounds(case_obj)
    hgnc_ids = {gene['hgnc_id'] for gene in adapter.all_genes()}

    nr_compounds = 0
    ## THEN assert that all compounds (within the gene definition) are updated
    for var in adapter.variant_collection.find():
        # Skip variants that hit a gene which is not in the database
        if any(hgnc_id not in hgnc_ids for hgnc_id in var['hgnc_ids']):
            continue
        for comp in var.get('compounds') or []:
            nr_compounds += 1
            assert 'genes' in comp
            assert 'not_loaded' in comp
    assert nr_compounds > 0
Ejemplo n.º 38
0
def test_parse_minimal(one_variant, case_obj):
    """A variant parsed as clinical keeps its position and is categorised as snv."""
    # WHEN parsing the variant with only the variant type specified
    parsed = parse_variant(
        one_variant,
        case_obj,
        variant_type='clinical',
    )

    # THEN position and category are as expected
    assert parsed['position'] == int(one_variant.POS)
    assert parsed['category'] == 'snv'