def test_build_empty():
    ## GIVEN a variant with no information
    variant = {}
    ## WHEN building a variant_obj
    ## THEN a KeyError should be raised since mandatory fields are missing
    with pytest.raises(KeyError):
        build_variant(variant, INSTITUTE_ID)
def test_build_with_gene_info(parsed_variant):
    ## GIVEN information about a variant
    ## WHEN adding gene and transcript information and building variant
    transcript_info = {
        'functional_annotations': ['transcript_ablation'],
        'transcript_id': 'ENST00000249504',
        'hgnc_id': 5134,
        'sift_prediction': 'deleterious'
    }
    gene_info = {
        'transcripts': [transcript_info],
        'most_severe_transcript': transcript_info,
        'most_severe_consequence': 'transcript_ablation',
        'most_severe_sift': 'deleterious',
        'most_severe_polyphen': None,
        'hgnc_id': 5134,
        'region_annotation': 'exonic',
    }
    parsed_variant['genes'].append(gene_info)

    variant_obj = build_variant(parsed_variant, INSTITUTE_ID)

    ## THEN assert the information is added
    assert variant_obj['institute'] == INSTITUTE_ID
    assert len(variant_obj['genes']) == 1
def test_build_cadd_score(parsed_variants, institute_obj):
    ## GIVEN a set of parsed variants
    for index, variant in enumerate(parsed_variants):
        ## WHEN building variants that have a CADD score annotated
        if variant.get('cadd_score'):
            variant_obj = build_variant(variant, institute_obj)
            ## THEN assert the CADD score is kept on the variant object
            assert variant_obj['cadd_score'] == variant['cadd_score']
    # Make sure the fixture actually contained more than one variant
    assert index > 0
def test_build_minimal(case_obj):
    ## GIVEN a variant with minimal information
    class Cyvcf2Variant(object):
        def __init__(self):
            self.CHROM = '1'
            self.REF = 'A'
            self.ALT = ['C']
            self.POS = 10
            self.end = 11
            self.FILTER = None
            self.ID = '.'
            self.QUAL = None
            self.var_type = 'snp'
            self.INFO = {}

    variant = Cyvcf2Variant()

    parsed_variant = parse_variant(variant, case_obj)
    assert 'ids' in parsed_variant

    variant_obj = build_variant(parsed_variant, INSTITUTE_ID)
    assert variant_obj['_id'] == parsed_variant['ids']['document_id']
def test_build_with_hgnc_info(parsed_variant):
    ## GIVEN information about a variant
    ## WHEN adding gene and transcript information and building variant
    transcript_info = {
        'functional_annotations': ['transcript_ablation'],
        'transcript_id': 'ENST00000249504',
        'hgnc_id': 5134,
        'sift_prediction': 'deleterious'
    }
    gene_info = {
        'transcripts': [transcript_info],
        'most_severe_transcript': transcript_info,
        'most_severe_consequence': 'transcript_ablation',
        'most_severe_sift': 'deleterious',
        'most_severe_polyphen': None,
        'hgnc_id': 5134,
        'region_annotation': 'exonic',
    }
    parsed_variant['genes'].append(gene_info)

    transcript_1 = {
        'ensembl_transcript_id': 'ENST00000498438',
        'is_primary': False,
        'start': 176968944,
        'end': 176974482
    }
    transcript_2 = {
        'ensembl_transcript_id': 'ENST00000249504',
        'is_primary': True,
        'refseq_id': 'NM_021192',
        'start': 176972014,
        'end': 176974722,
    }
    hgnc_transcripts = [transcript_1, transcript_2]

    hgnc_gene = {
        'hgnc_id': 5134,
        'hgnc_symbol': 'HOXD11',
        'ensembl_id': 'ENSG00000128713',
        'chromosome': '2',
        'start': 176968944,
        'end': 176974722,
        'build': 37,
        'description': 'homeobox D11',
        'aliases': ['HOX4', 'HOXD11', 'HOX4F'],
        'entrez_id': 3237,
        'omim_ids': 142986,
        'pli_score': 0.0131898476206074,
        'primary_transcripts': ['NM_021192'],
        'ucsc_id': 'uc010fqx.4',
        'uniprot_ids': ['P31277'],
        'vega_id': 'OTTHUMG00000132510',
        'transcripts': hgnc_transcripts,
        'incomplete_penetrance': False,
        'ad': True,
        'ar': False,
        'xd': False,
        'xr': False,
        'x': False,
        'y': False,
        'transcripts_dict': {
            'ENST00000498438': transcript_1,
            'ENST00000249504': transcript_2,
        }
    }
    hgncid_to_gene = {5134: hgnc_gene}

    variant_obj = build_variant(parsed_variant, INSTITUTE_ID, hgncid_to_gene=hgncid_to_gene)

    ## THEN assert the information is added
    assert variant_obj['institute'] == INSTITUTE_ID
    assert variant_obj['genes'][0]['hgnc_id'] == 5134
    assert variant_obj['genes'][0]['hgnc_symbol'] == 'HOXD11'
    assert variant_obj['genes'][0]['inheritance'] == ['AD']
def test_build_sv_variants(parsed_sv_variants, institute_obj):
    for variant in parsed_sv_variants:
        variant_obj = build_variant(variant, institute_obj)
        assert variant_obj['chromosome'] == variant['chromosome']
        assert variant_obj['category'] == 'sv'
def test_build_sv_variant(parsed_sv_variant, institute_obj):
    variant_obj = build_variant(parsed_sv_variant, institute_obj)
    assert variant_obj['chromosome'] == parsed_sv_variant['chromosome']
    assert variant_obj['category'] == 'sv'
def _load_variants(
    self,
    variants,
    variant_type,
    case_obj,
    individual_positions,
    rank_threshold,
    institute_id,
    build=None,
    rank_results_header=None,
    vep_header=None,
    category="snv",
    sample_info=None,
):
    """Perform the loading of variants

    This is the function that loops over the variants, parses them and builds the
    variant objects so they are ready to be inserted into the database.

    Args:
        variants(iterable(cyvcf2.Variant))
        variant_type(str): ['clinical', 'research']
        case_obj(dict)
        individual_positions(dict): How individuals are positioned in the vcf
        rank_threshold(int): Only load variants with a rank score above this
        institute_id(str)
        build(str): Genome build
        rank_results_header(list): Rank score categories
        vep_header(list)
        category(str): ['snv','sv','cancer','str']
        sample_info(dict): A dictionary with info about samples.
                           Strictly for cancer to tell which is tumor

    Returns:
        nr_inserted(int)
    """
    build = build or "37"
    genes = [gene_obj for gene_obj in self.all_genes(build=build)]
    gene_to_panels = self.gene_to_panels(case_obj)
    hgncid_to_gene = self.hgncid_to_gene(genes=genes)
    genomic_intervals = self.get_coding_intervals(genes=genes)

    LOG.info(
        "Start inserting {0} {1} variants into database".format(
            variant_type, category
        )
    )
    start_insertion = datetime.now()
    start_five_thousand = datetime.now()
    # The number of parsed variants
    nr_variants = 0
    # The number of variants that meet the criteria and get inserted
    nr_inserted = 0
    # This is to keep track of blocks of inserted variants
    inserted = 1

    nr_bulks = 0

    # We want to load batches of variants to reduce the number of network round trips
    bulk = {}
    current_region = None

    for nr_variants, variant in enumerate(variants):
        # All MT variants are loaded
        mt_variant = "MT" in variant.CHROM
        rank_score = parse_rank_score(
            variant.INFO.get("RankScore"), case_obj["_id"]
        )
        pathogenic = is_pathogenic(variant)

        # Check if the variant should be loaded at all.
        # If rank score is None there are no rank scores annotated, so all variants are loaded.
        # Otherwise we only load variants above the rank score threshold,
        # except for MT and pathogenic variants, which are always loaded.
        if (
            (rank_score is None)
            or (rank_score > rank_threshold)
            or mt_variant
            or pathogenic
        ):
            nr_inserted += 1
            # Parse the vcf variant
            parsed_variant = parse_variant(
                variant=variant,
                case=case_obj,
                variant_type=variant_type,
                rank_results_header=rank_results_header,
                vep_header=vep_header,
                individual_positions=individual_positions,
                category=category,
            )

            # Build the variant object
            variant_obj = build_variant(
                variant=parsed_variant,
                institute_id=institute_id,
                gene_to_panels=gene_to_panels,
                hgncid_to_gene=hgncid_to_gene,
                sample_info=sample_info,
            )

            # Check if the variant is in a genomic region
            var_chrom = variant_obj["chromosome"]
            var_start = variant_obj["position"]
            # We need to make sure that the interval has a length > 0
            var_end = variant_obj["end"] + 1
            var_id = variant_obj["_id"]
            # Whether the bulk should be loaded or not
            load = True
            new_region = None

            intervals = genomic_intervals.get(var_chrom, IntervalTree())
            genomic_regions = intervals.overlap(var_start, var_end)

            # If the variant is in a coding region
            if genomic_regions:
                # We know there is data here so get the interval id
                new_region = genomic_regions.pop().data

                # If the variant is in the same region as the previous one
                # we add it to the same bulk
                if new_region == current_region:
                    load = False

            # This is the case where the variant is intergenic
            else:
                # If the previous variant was also intergenic we add the variant to the bulk
                if not current_region:
                    load = False
                # We need to have a max size of the bulk
                if len(bulk) > 10000:
                    load = True

            # Load the variant object
            if load:
                # If the variant bulk contains coding variants we want to update the compounds
                if current_region:
                    self.update_compounds(bulk)
                try:
                    # Load the variants
                    self.load_variant_bulk(list(bulk.values()))
                    nr_bulks += 1
                except IntegrityError as error:
                    pass
                bulk = {}

            current_region = new_region
            bulk[var_id] = variant_obj

            if nr_variants != 0 and nr_variants % 5000 == 0:
                LOG.info("%s variants parsed", str(nr_variants))
                LOG.info(
                    "Time to parse variants: %s",
                    (datetime.now() - start_five_thousand),
                )
                start_five_thousand = datetime.now()

            if (
                nr_inserted != 0
                and (nr_inserted * inserted) % (1000 * inserted) == 0
            ):
                LOG.info("%s variants inserted", nr_inserted)
                inserted += 1

    # If the variants are in a coding region we update the compounds
    if current_region:
        self.update_compounds(bulk)

    # Load the final variant bulk
    self.load_variant_bulk(list(bulk.values()))
    nr_bulks += 1
    LOG.info(
        "All variants inserted, time to insert variants: {0}".format(
            datetime.now() - start_insertion
        )
    )

    if nr_variants:
        nr_variants += 1
    LOG.info("Nr variants parsed: %s", nr_variants)
    LOG.info("Nr variants inserted: %s", nr_inserted)
    LOG.debug("Nr bulks inserted: %s", nr_bulks)

    return nr_inserted
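# The region-based batching in _load_variants above can be hard to follow inline.
# The snippet below is a minimal, self-contained sketch of the same idea using the
# intervaltree package the adapter relies on: consecutive variants stay in one bulk
# as long as they fall in the same coding interval, or as long as they are all
# intergenic, and the bulk is flushed whenever the region changes. The chromosome,
# intervals, positions and the region_for() helper are made up for illustration and
# are not part of the adapter; the real loop additionally force-flushes when the bulk
# exceeds 10000 variants and updates compounds for coding bulks before loading them.
from intervaltree import Interval, IntervalTree

coding_intervals = {
    '1': IntervalTree([
        Interval(100, 200, 'gene_A'),
        Interval(500, 800, 'gene_B'),
    ])
}

def region_for(chrom, start, end):
    """Return the region id a variant overlaps, or None if it is intergenic."""
    hits = coding_intervals.get(chrom, IntervalTree()).overlap(start, end + 1)
    return hits.pop().data if hits else None

current_region = None
bulk = []
for pos in (110, 150, 300, 550, 560):
    new_region = region_for('1', pos, pos)
    if new_region != current_region:
        # Region changed (coding -> other gene, coding -> intergenic, or the reverse):
        # flush the accumulated bulk before starting a new one
        print("flush bulk of {0} variants (region: {1})".format(len(bulk), current_region))
        bulk = []
    current_region = new_region
    bulk.append(pos)
print("final flush of {0} variants (region: {1})".format(len(bulk), current_region))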
def sv_variant_objs(request, parsed_sv_variants, institute_obj):
    """Get a generator with built SV variant objects"""
    print('')
    return (build_variant(variant, institute_obj) for variant in parsed_sv_variants)
def test_compounds_region(real_populated_database, case_obj, variant_clinical_file):
    """When loading variants, not all of them are loaded; only the ones with a rank
    score above a threshold. This implies that some compounds will have the status
    'not_loaded'=True. When loading all variants for a region, all variants should
    instead have status 'not_loaded'=False.
    """
    adapter = real_populated_database
    variant_type = 'clinical'
    category = 'snv'

    ## GIVEN a database without any variants
    assert adapter.variant_collection.find().count() == 0

    institute_obj = adapter.institute_collection.find_one()
    institute_id = institute_obj['_id']

    ## WHEN loading variants into the database without updating compound information
    vcf_obj = VCF(variant_clinical_file)

    rank_results_header = parse_rank_results_header(vcf_obj)
    vep_header = parse_vep_header(vcf_obj)

    individual_positions = {}
    for i, ind in enumerate(vcf_obj.samples):
        individual_positions[ind] = i

    variants = []
    for i, variant in enumerate(vcf_obj):
        parsed_variant = parse_variant(
            variant=variant,
            case=case_obj,
            variant_type='clinical',
            rank_results_header=rank_results_header,
            vep_header=vep_header,
            individual_positions=individual_positions,
            category='snv',
        )
        variant_obj = build_variant(
            variant=parsed_variant,
            institute_id=institute_id,
        )
        variants.append(variant_obj)

    # Load all variants
    adapter.variant_collection.insert_many(variants)
    print("Nr variants: {0}".format(len(variants)))

    ## THEN assert that the variants do not have updated compound information
    nr_compounds = 0
    for var in adapter.variant_collection.find():
        if not var.get('compounds'):
            continue
        for comp in var['compounds']:
            if 'genes' in comp:
                assert False
            if 'not_loaded' in comp:
                assert False
            nr_compounds += 1

    assert nr_compounds > 0

    ## WHEN updating all compounds for a case
    adapter.update_case_compounds(case_obj)

    hgnc_ids = set([gene['hgnc_id'] for gene in adapter.all_genes()])

    nr_compounds = 0
    ## THEN assert that all compounds (within the gene definition) are updated
    for var in adapter.variant_collection.find():
        cont = False
        for hgnc_id in var['hgnc_ids']:
            if hgnc_id not in hgnc_ids:
                cont = True
        if cont:
            continue
        if not var.get('compounds'):
            continue
        for comp in var['compounds']:
            nr_compounds += 1
            if not 'genes' in comp:
                # pp(var)
                assert False
            if not 'not_loaded' in comp:
                assert False

    assert nr_compounds > 0
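# The test above relies on adapter.update_case_compounds() to fill in the 'not_loaded'
# flag on each compound. As a rough, in-memory illustration of what that flag means
# (not the adapter's actual implementation), a compound partner is marked as not loaded
# when its variant id never made it into the database, e.g. because its rank score fell
# below the loading threshold. The ids and dicts below are made up for the example.
loaded_ids = {'var_1', 'var_2'}

example_variants = [
    {'_id': 'var_1', 'compounds': [{'variant': 'var_2'}, {'variant': 'var_9'}]},
    {'_id': 'var_2', 'compounds': [{'variant': 'var_1'}]},
]

for var in example_variants:
    for comp in var.get('compounds', []):
        # A compound partner that was never inserted is flagged so it can still be
        # displayed even though it is not in the database
        comp['not_loaded'] = comp['variant'] not in loaded_ids

assert example_variants[0]['compounds'][1]['not_loaded'] is True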
def test_build_variant(parsed_variant):
    variant_obj = build_variant(parsed_variant, INSTITUTE_ID)
    assert variant_obj['chromosome'] == parsed_variant['chromosome']
    assert variant_obj['category'] == 'snv'
    assert variant_obj['institute'] == INSTITUTE_ID
def _load_variants(self, variants, variant_type, case_obj, individual_positions,
                   rank_threshold, institute_id, build=None, rank_results_header=None,
                   vep_header=None, category='snv', sample_info=None):
    """Perform the loading of variants

    This is the function that loops over the variants, parses them and builds the
    variant objects so they are ready to be inserted into the database.
    """
    build = build or '37'
    genes = [gene_obj for gene_obj in self.all_genes(build=build)]
    gene_to_panels = self.gene_to_panels(case_obj)
    hgncid_to_gene = self.hgncid_to_gene(genes=genes)
    genomic_intervals = self.get_coding_intervals(genes=genes)

    LOG.info("Start inserting {0} {1} variants into database".format(variant_type, category))
    start_insertion = datetime.now()
    start_five_thousand = datetime.now()
    # The number of parsed variants
    nr_variants = 0
    # The number of variants that meet the criteria and get inserted
    nr_inserted = 0
    # This is to keep track of blocks of inserted variants
    inserted = 1

    nr_bulks = 0

    # We want to load batches of variants to reduce the number of network round trips
    bulk = {}
    current_region = None

    for nr_variants, variant in enumerate(variants):
        # All MT variants are loaded
        mt_variant = 'MT' in variant.CHROM
        rank_score = parse_rank_score(variant.INFO.get('RankScore'), case_obj['_id'])

        # Check if the variant should be loaded at all.
        # If rank score is None there are no rank scores annotated, so all variants are loaded.
        # Otherwise we only load variants above the rank score threshold,
        # except for MT variants, which are always loaded.
        if (rank_score is None) or (rank_score > rank_threshold) or mt_variant:
            nr_inserted += 1
            # Parse the vcf variant
            parsed_variant = parse_variant(
                variant=variant,
                case=case_obj,
                variant_type=variant_type,
                rank_results_header=rank_results_header,
                vep_header=vep_header,
                individual_positions=individual_positions,
                category=category,
            )

            # Build the variant object
            variant_obj = build_variant(
                variant=parsed_variant,
                institute_id=institute_id,
                gene_to_panels=gene_to_panels,
                hgncid_to_gene=hgncid_to_gene,
                sample_info=sample_info
            )

            # Check if the variant is in a genomic region
            var_chrom = variant_obj['chromosome']
            var_start = variant_obj['position']
            # We need to make sure that the interval has a length > 0
            var_end = variant_obj['end'] + 1
            var_id = variant_obj['_id']
            # Whether the bulk should be loaded or not
            load = True
            new_region = None

            genomic_regions = genomic_intervals.get(var_chrom, IntervalTree()).search(var_start, var_end)

            # If the variant is in a coding region
            if genomic_regions:
                # We know there is data here so get the interval id
                new_region = genomic_regions.pop().data

                # If the variant is in the same region as the previous one
                # we add it to the same bulk
                if new_region == current_region:
                    load = False

            # This is the case where the variant is intergenic
            else:
                # If the previous variant was also intergenic we add the variant to the bulk
                if not current_region:
                    load = False
                # We need to have a max size of the bulk
                if len(bulk) > 10000:
                    load = True

            # Load the variant object
            if load:
                # If the variant bulk contains coding variants we want to update the compounds
                if current_region:
                    self.update_compounds(bulk)
                try:
                    # Load the variants
                    self.load_variant_bulk(list(bulk.values()))
                    nr_bulks += 1
                except IntegrityError as error:
                    pass
                bulk = {}

            current_region = new_region
            bulk[var_id] = variant_obj

            if (nr_variants != 0 and nr_variants % 5000 == 0):
                LOG.info("%s variants parsed", str(nr_variants))
                LOG.info("Time to parse variants: %s", (datetime.now() - start_five_thousand))
                start_five_thousand = datetime.now()

            if (nr_inserted != 0 and (nr_inserted * inserted) % (1000 * inserted) == 0):
                LOG.info("%s variants inserted", nr_inserted)
                inserted += 1

    # If the variants are in a coding region we update the compounds
    if current_region:
        self.update_compounds(bulk)

    # Load the final variant bulk
    self.load_variant_bulk(list(bulk.values()))
    nr_bulks += 1
    LOG.info("All variants inserted, time to insert variants: {0}".format(
        datetime.now() - start_insertion))

    if nr_variants:
        nr_variants += 1
    LOG.info("Nr variants parsed: %s", nr_variants)
    LOG.info("Nr variants inserted: %s", nr_inserted)
    LOG.debug("Nr bulks inserted: %s", nr_bulks)

    return nr_inserted
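# This earlier revision of _load_variants differs from the one further up mainly in two
# ways: it calls IntervalTree.search() instead of IntervalTree.overlap(), and it does not
# force-load pathogenic variants. The intervaltree package replaced search() with
# overlap()/envelop() in its 3.x releases, so a small compatibility shim like the sketch
# below (not part of the codebase) can make the same lookup work against either API version.
def interval_hits(tree, start, end):
    """Return the intervals overlapping [start, end), for both old and new intervaltree APIs."""
    if hasattr(tree, 'overlap'):
        # intervaltree >= 3.0
        return tree.overlap(start, end)
    # intervaltree 2.x
    return tree.search(start, end)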