def _annotate_batch(adapter, batch):
    """Annotate one batch of variants in place using a single database lookup.

    Args:
        adapter(loqusdb.plugin.adapter)
        batch(dict): Maps variant id -> cyvcf2.Variant
    """
    # Nothing to look up for an empty batch, so skip the database round trip
    if not batch:
        return

    for var_obj in adapter.search_variants(list(batch)):
        var_id = var_obj["_id"]
        if var_id in batch:
            annotate_variant(batch[var_id], var_obj)


def annotate_snvs(adapter, vcf_obj):
    """Annotate all variants in a VCF

    Variants are collected into batches so that the database is queried once
    per batch instead of once per variant. A batch is flushed after every
    1000 variants read from the VCF; whatever remains is flushed at the end.

    NOTE: the batch is keyed on the variant id, so if the same id occurs more
    than once within a batch only the last of those variants is yielded.

    Args:
        adapter(loqusdb.plugin.adapter)
        vcf_obj(cyvcf2.VCF)

    Yields:
        variant(cyvcf2.Variant): Annotated variant
    """
    batch = {}

    for nr_variants, variant in enumerate(vcf_obj, 1):
        # Add the variant to current batch
        batch[get_variant_id(variant)] = variant

        # Every 1000 variants read we annotate and emit the batch
        if nr_variants % 1000 == 0:
            _annotate_batch(adapter, batch)
            yield from batch.values()
            batch = {}

    # Annotate and emit the variants left over after the last full batch
    _annotate_batch(adapter, batch)
    yield from batch.values()
def test_delete_variant(real_mongo_adapter, het_variant, case_obj):
    """Deleting a variant for one of its two cases keeps the variant document."""
    ## GIVEN a database with one variant that is observed twice
    database = real_mongo_adapter.db
    kept_case = case_obj["case_id"]
    stored_document = {
        "_id": get_variant_id(het_variant),
        "families": [kept_case, "2"],
        "observations": 2,
    }
    database.variant.insert_one(stored_document)

    variant_before = database.variant.find_one()
    assert variant_before["observations"] == 2

    ## WHEN deleting the variant for one case
    delete_variants(
        adapter=real_mongo_adapter,
        vcf_obj=[het_variant],
        case_obj=case_obj,
        case_id="2",
    )
    variant_after = database.variant.find_one()

    ## THEN assert that one case has been removed from 'families'
    assert variant_after["families"] == [kept_case]

    ## THEN assert that the observation count is decreased
    assert variant_after["observations"] == 1
def test_delete_variant(real_mongo_adapter, het_variant, case_obj):
    """Removing one of two cases from a variant leaves the other case in place."""
    ## GIVEN a database with one variant that is observed twice
    variant_db = real_mongo_adapter.db
    remaining_case = case_obj['case_id']
    seeded_variant = {
        '_id': get_variant_id(het_variant),
        'families': [remaining_case, '2'],
        'observations': 2,
    }
    variant_db.variant.insert_one(seeded_variant)

    seeded_result = variant_db.variant.find_one()
    assert seeded_result['observations'] == 2

    ## WHEN deleting the variant for one case
    delete_variants(
        adapter=real_mongo_adapter,
        vcf_obj=[het_variant],
        case_obj=case_obj,
        case_id='2',
    )
    updated_result = variant_db.variant.find_one()

    ## THEN assert that one case has been removed from 'families'
    assert updated_result['families'] == [remaining_case]

    ## THEN assert that the observation count is decreased
    assert updated_result['observations'] == 1
def build_profile_variant(variant):
    """Build a ProfileVariant from a VCF record

    A leading "chr", "CHR" or "Chr" is stripped from the chromosome name and
    only the first ALT allele is used.

    Args:
        variant (cyvcf2.Variant)

    Returns:
        variant (models.ProfileVariant)
    """
    raw_chrom = variant.CHROM
    has_prefix = raw_chrom.startswith(("chr", "CHR", "Chr"))
    chromosome = raw_chrom[3:] if has_prefix else raw_chrom

    position = int(variant.POS)
    identifier = get_variant_id(variant)
    ref_allele = variant.REF
    first_alt = variant.ALT[0]
    minor_allele_freq = get_maf(variant)

    return ProfileVariant(
        variant_id=identifier,
        chrom=chromosome,
        pos=position,
        ref=ref_allele,
        alt=first_alt,
        maf=minor_allele_freq,
        id_column=variant.ID,
    )
def insert_sv_variants(adapter: MongoAdapter, case_obj: Case) -> None:
    """Load the structural variants of a case, one database insert per VCF record.

    Every record in ``case_obj.vcf_sv_path`` is turned into a ``Variant`` with
    ``is_sv=True`` and with ``homozygote`` and ``hemizygote`` fixed to 0, then
    handed to ``adapter.add_structural_variant``.
    """
    sv_records = VCF(case_obj.vcf_sv_path, threads=settings.cyvcf_threads)
    for record in sv_records:
        record_id = get_variant_id(variant=record)
        ref_allele = record.REF
        # Only the first ALT allele is stored
        alt_allele = record.ALT[0]
        coords = get_coords(record)

        sv_document = Variant(
            variant_id=record_id,
            chrom=coords["chrom"],
            pos=coords["pos"],
            end=coords["end"],
            ref=ref_allele,
            alt=alt_allele,
            end_chrom=coords["end_chrom"],
            sv_type=coords["sv_type"],
            sv_len=coords["sv_length"],
            case_id=case_obj.case_id,
            homozygote=0,
            hemizygote=0,
            is_sv=True,
            id_column=record.ID,
        )
        adapter.add_structural_variant(
            variant=sv_document,
            max_window=settings.load_sv_window,
        )
def get_profiles(adapter: MongoAdapter, vcf_file: str) -> Dict[str, list]:
    """
    Build a genotype profile for every sample in a VCF file.

    For each profile variant returned by ``adapter.profile_variants()`` the
    region ``chrom:pos-(pos + 1)`` of the VCF is searched for a record whose
    variant id equals the profile variant's ``_id``. If one is found, each
    sample gets a two-allele genotype string: alt+alt for "hom_alt", ref+alt
    for "het" and ref+ref for any other genotype. If no matching record is
    found, every sample is given ref+ref for that variant.

    Returns a dictionary with one list of genotype strings per sample:
    {SAMPLE_ID : [gt1, gt2, ..., gtN]}, with one entry per profile variant
    in the order the adapter returned them.
    """
    vcf = VCF(vcf_file, threads=settings.cyvcf_threads)
    individuals = vcf.samples
    profiles = {individual: [] for individual in individuals}

    for profile_variant in adapter.profile_variants():
        ref = profile_variant["ref"]
        alt = profile_variant["alt"]
        pos = profile_variant["pos"]
        end = pos + 1
        chrom = profile_variant["chrom"]
        region = f"{chrom}:{pos}-{end}"

        hom_ref = f"{ref}{ref}"
        # Genotypes that are not listed here fall back to hom ref
        genotype_strings = {"hom_alt": f"{alt}{alt}", "het": f"{ref}{alt}"}

        # Find variants in region
        for variant in vcf(region):
            # Only a record with the same id, i.e. chrom_pos_ref_alt, counts
            if get_variant_id(variant) != profile_variant["_id"]:
                continue

            # Find genotype for each individual in vcf
            for i, individual in enumerate(individuals):
                genotype = GENOTYPE_MAP[variant.gt_types[i]]
                profiles[individual].append(genotype_strings.get(genotype, hom_ref))

            # Stop at the first matching record in the region
            break
        else:
            # If no call was found for variant, give all samples a hom ref genotype
            for individual in individuals:
                profiles[individual].append(hom_ref)

    return profiles
def annotate_snv(adpter, variant):
    """Annotate an SNV/INDEL variant

    Looks the variant up in the database by its variant id and passes the
    variant together with the database document to annotate_variant.

    Args:
        adpter(loqusdb.plugin.adapter): Database adapter. The parameter name
            is a misspelling of "adapter"; it is kept as is so that the
            signature of the function does not change.
        variant(cyvcf2.Variant)

    Returns:
        Whatever annotate_variant returns for the variant
    """
    variant_id = get_variant_id(variant)
    variant_obj = adpter.get_variant(variant={"_id": variant_id})

    return annotate_variant(variant, variant_obj)
def insert_snv_variants(adapter: MongoAdapter, case_obj: Case) -> None:
    """Collect one Variant per record of the case's SNV VCF and insert them in bulk.

    For every record the individuals of the case are inspected. Individuals
    whose genotype quality is below ``settings.load_gq_threshold``, or whose
    genotype is neither "het" nor "hom_alt", are ignored. Among the rest, a
    "hom_alt" genotype sets the homozygote flag, and a call on chromosome X or
    Y in an individual with ``sex == 1`` outside the region reported by
    ``check_par`` sets the hemizygote flag.
    """
    # NOTE(review): the indentation of this block was lost in the file; this
    # layout assumes one Variant is built per VCF record, after all
    # individuals have been inspected. Confirm against the original layout.
    documents = []
    for record in VCF(case_obj.vcf_path, threads=settings.cyvcf_threads):
        record_id = get_variant_id(variant=record)
        ref_allele = record.REF
        alt_allele = record.ALT[0]

        coords = get_coords(record)
        chrom, pos = coords["chrom"], coords["pos"]
        homozygote_flag = 0
        hemizygote_flag = 0

        for individual in case_obj.individuals:
            sample_index = individual["ind_index"]
            # Skip calls with too low genotype quality
            if int(record.gt_quals[sample_index]) < settings.load_gq_threshold:
                continue

            genotype = GENOTYPE_MAP[record.gt_types[sample_index]]
            if genotype in ("het", "hom_alt"):
                if genotype == "hom_alt":
                    homozygote_flag = 1

                on_sex_chromosome = chrom in ("X", "Y")
                if (
                    on_sex_chromosome
                    and individual["sex"] == 1
                    and not check_par(chrom, pos, genome_build=settings.genome_build)
                ):
                    hemizygote_flag = 1

        documents.append(
            Variant(
                variant_id=record_id,
                chrom=chrom,
                pos=pos,
                end=coords["end"],
                ref=ref_allele,
                alt=alt_allele,
                end_chrom=coords["end_chrom"],
                sv_type=coords["sv_type"],
                sv_len=coords["sv_length"],
                case_id=case_obj.case_id,
                homozygote=homozygote_flag,
                hemizygote=hemizygote_flag,
                is_sv=False,
                id_column=record.ID,
            )
        )

    adapter.add_variants(variants=documents)
def test_delete_variants(real_mongo_adapter, het_variant, case_obj):
    """Deleting a variant that is only seen in one case removes the document."""
    ## GIVEN a database with one variant
    database = real_mongo_adapter.db
    only_case = case_obj["case_id"]
    stored_document = {
        "_id": get_variant_id(het_variant),
        "families": [only_case],
        "observations": 1,
    }
    database.variant.insert_one(stored_document)

    variant_before = database.variant.find_one()
    assert variant_before["families"] == [only_case]

    ## WHEN deleting the variant
    delete_variants(
        adapter=real_mongo_adapter,
        vcf_obj=[het_variant],
        case_obj=case_obj,
    )
    variant_after = database.variant.find_one()

    ## THEN assert that the variant was not found
    assert variant_after is None
def test_delete_variants(real_mongo_adapter, het_variant, case_obj):
    """Deleting a variant that is only seen in one case removes the document."""
    ## GIVEN a database with one variant
    db = real_mongo_adapter.db
    case_id = case_obj['case_id']

    db.variant.insert_one({
        '_id': get_variant_id(het_variant),
        'families': [case_id],
        'observations': 1,
    })

    mongo_variant = db.variant.find_one()
    assert mongo_variant['families'] == [case_id]

    ## WHEN deleting the variant
    delete_variants(adapter=real_mongo_adapter, vcf_obj=[het_variant], case_obj=case_obj)

    mongo_variant = db.variant.find_one()

    ## THEN assert that the variant was not found
    # Identity check: find_one returning "no document" must be exactly None
    assert mongo_variant is None
def get_profiles(adapter, vcf_file):
    """Build a genotype profile for each sample in a vcf

    For every profile variant in the database the vcf is searched, in the
    region chrom:pos-(pos + 1), for a record with the same variant id. When
    one is found each sample gets alt+alt for "hom_alt", ref+alt for "het"
    and ref+ref for any other genotype. When none is found every sample gets
    ref+ref.

    Args:
        adapter(MongoAdapter): Adapter to mongodb
        vcf_file(str): Path to vcf file

    Returns:
        profiles (dict): Maps each sample id in the vcf to its list of
            genotype strings, one per profile variant.
    """
    vcf = get_file_handle(vcf_file)
    samples = vcf.samples
    profiles = {sample: [] for sample in samples}

    for db_variant in adapter.profile_variants():
        ref = db_variant["ref"]
        alt = db_variant["alt"]
        start = db_variant["pos"]
        stop = start + 1
        chrom = db_variant["chrom"]
        region = f"{chrom}:{start}-{stop}"

        # First record in the region whose id, i.e. chrom_pos_ref_alt, matches
        match = next(
            (
                record
                for record in vcf(region)
                if get_variant_id(record) == db_variant["_id"]
            ),
            None,
        )

        if match is None:
            # No call was found for the variant: all samples are hom ref
            for sample in samples:
                profiles[sample].append(f"{ref}{ref}")
            continue

        # Translate the genotype of every sample into an allele pair
        for index, sample in enumerate(samples):
            genotype = GENOTYPE_MAP[match.gt_types[index]]
            if genotype == "hom_alt":
                alleles = f"{alt}{alt}"
            elif genotype == "het":
                alleles = f"{ref}{alt}"
            else:
                alleles = f"{ref}{ref}"
            profiles[sample].append(alleles)

    return profiles
def check_vcf(vcf_path, expected_type="snv"):
    """Check if there are any problems with the vcf file

    The whole file is read once. It is rejected if it mixes SNVs and SVs or
    if the variant type found differs from expected_type. For SNV files it
    is also rejected if the same variant id occurs twice at one position or
    if positions decrease within a chromosome. Neither of those two checks
    is made for SV files.

    A vcf without variants leaves the variant type as None, so it fails the
    expected_type check.

    Args:
        vcf_path(str)
        expected_type(str): 'sv' or 'snv'

    Returns:
        vcf_info(dict): dict like
        {
            'nr_variants':<INT>,
            'variant_type': <STR> in ['snv', 'sv'],
            'individuals': <LIST> individual positions in file
        }

    Raises:
        VcfError: If any of the checks above fails
    """
    LOG.info("Check if vcf is on correct format...")

    vcf = VCF(vcf_path)
    individuals = vcf.samples
    variant_type = None

    previous_pos = None
    previous_chrom = None

    # Ids of the variants seen so far on the current position
    position_variants = set()

    nr_variants = 0
    for nr_variants, variant in enumerate(vcf, 1):
        # Check the type of variant
        current_type = "sv" if variant.var_type == "sv" else "snv"
        if not variant_type:
            variant_type = current_type

        # Vcf can not include both snvs and svs
        if variant_type != current_type:
            raise VcfError("Vcf includes a mix of snvs and svs")

        current_chrom = variant.CHROM
        current_pos = variant.POS

        if variant_type == "snv":
            # For SNVs we can create a proper variant id with chrom_pos_ref_alt
            variant_id = get_variant_id(variant)
        else:
            # SV:s get a simple id based on chromosome and position
            variant_id = f"{current_chrom}_{current_pos}"

        # First variant of the file or first variant on a new chromosome:
        # start tracking from here, there is nothing to compare against
        if not previous_chrom or current_chrom != previous_chrom:
            previous_chrom = current_chrom
            previous_pos = current_pos
            position_variants = {variant_id}
            continue

        if variant_type == "snv":
            if current_pos == previous_pos:
                # Check if variant is unique
                if variant_id in position_variants:
                    raise VcfError(f"Variant {variant_id} occurs several times in vcf")
                position_variants.add(variant_id)
            else:
                # Check if vcf is sorted
                if current_pos < previous_pos:
                    raise VcfError("Vcf is not sorted in a correct way")
                previous_pos = current_pos
                # Reset position_variants since we are on a new position
                position_variants = {variant_id}

    if variant_type != expected_type:
        raise VcfError(
            "VCF file does not only include {0}s, please check vcf {1}".format(
                expected_type.upper(), vcf_path
            )
        )

    LOG.info("Vcf file %s looks fine", vcf_path)
    LOG.info("Nr of variants in vcf: %s", nr_variants)
    LOG.info("Type of variants in vcf: %s", variant_type)

    return {
        "nr_variants": nr_variants,
        "variant_type": variant_type,
        "individuals": individuals,
    }