def load_variants(adapter, family_id, individuals, vcf, nr_variants=None,
                  skip_case_id=False, gq_treshold=None):
    """Load variants for a family into the database.

    Each variant from ``vcf`` is run through ``get_formated_variant``; every
    truthy result is handed to ``adapter.add_variant``.

    Args:
        adapter (loqusdb.plugins.Adapter): initialized plugin
        family_id (str): unique family identifier
        individuals (List[str]): list to match individuals
        vcf (iterable(dict)): An iterable variant dictionaries
        nr_variants (int): used as the length of the progress bar
        skip_case_id (bool): when true the variants are formatted with
            ``family_id=None`` instead of the given family id
        gq_treshold (int): threshold forwarded to ``get_formated_variant``
            (presumably a genotype-quality cut-off -- confirm against that
            helper). Defaults to 20 when ``None``.
    """
    # Only fall back to the default when no value was given; an explicit
    # threshold of 0 must be honoured rather than silently replaced by 20.
    if gq_treshold is None:
        gq_treshold = 20

    if skip_case_id:
        family_id = None

    # Loop over the variants in the vcf
    with click.progressbar(vcf, label="Inserting variants", length=nr_variants) as bar:
        for variant in bar:
            # Creates a variant that is ready to insert into the database
            formated_variant = get_formated_variant(
                variant=variant,
                individuals=individuals,
                family_id=family_id,
                gq_treshold=gq_treshold,
            )
            # Falsy results are skipped, nothing is inserted for them.
            if formated_variant:
                adapter.add_variant(variant=formated_variant)
def load_variants(adapter, family_id, individuals, vcf, nr_variants=None,
                  skip_case_id=False, gq_treshold=None):
    """Load variants for a family into the database.

    Every variant yielded by ``vcf`` is formatted with
    ``get_formated_variant`` and, when the result is truthy, stored through
    ``adapter.add_variant``.

    Args:
        adapter (loqusdb.plugins.Adapter): initialized plugin
        family_id (str): unique family identifier
        individuals (List[str]): list to match individuals
        vcf (iterable(dict)): An iterable variant dictionaries
        nr_variants (int): length passed to the progress bar
        skip_case_id (bool): whether to include the case id on variant level
            or not; when true ``family_id`` is replaced by ``None``
        gq_treshold (int): threshold passed on to ``get_formated_variant``
            (assumed to be a genotype-quality limit -- TODO confirm).
            ``None`` selects the default of 20.
    """
    # ``gq_treshold or 20`` would also discard a deliberate 0; test for None
    # explicitly so that 0 stays 0.
    if gq_treshold is None:
        gq_treshold = 20

    if skip_case_id:
        family_id = None

    # Loop over the variants in the vcf
    with click.progressbar(vcf, label="Inserting variants", length=nr_variants) as bar:
        for variant in bar:
            # Creates a variant that is ready to insert into the database
            formated_variant = get_formated_variant(
                variant=variant,
                individuals=individuals,
                family_id=family_id,
                gq_treshold=gq_treshold,
            )
            if not formated_variant:
                # Nothing to store for this record.
                continue
            adapter.add_variant(variant=formated_variant)
def test_format_variant_no_gq(variant_no_gq, individuals, case_id):
    """The ``variant_no_gq`` fixture must format to an empty dict."""
    result = get_formated_variant(
        variant=variant_no_gq,
        individuals=individuals,
        family_id=case_id,
    )

    assert result == {}
def test_format_variant_no_call(variant_no_call, individuals, case_id):
    """The ``variant_no_call`` fixture must format to an empty dict."""
    result = get_formated_variant(
        variant=variant_no_call,
        individuals=individuals,
        family_id=case_id,
    )

    assert result == {}
def test_format_variant_no_call(variant_no_call, individuals, case_id, ind_positions):
    """The ``variant_no_call`` fixture must format to an empty dict."""
    result = get_formated_variant(
        variant=variant_no_call,
        ind_positions=ind_positions,
        individuals=individuals,
        family_id=case_id,
    )

    assert result == {}
def test_format_variant_no_family_id(het_variant, individuals, ind_positions):
    """Formatting with ``family_id=None`` yields a variant without a family id.

    The ``het_variant`` fixture must still produce a truthy result, with both
    the homozygote and hemizygote counts at zero.
    """
    formated_variant = get_formated_variant(
        variant=het_variant,
        individuals=individuals,
        ind_positions=ind_positions,
        family_id=None,
    )

    assert formated_variant
    # Compare against the None singleton by identity, not equality.
    assert formated_variant.get('family_id') is None
    assert formated_variant['homozygote'] == 0
    assert formated_variant['hemizygote'] == 0
def test_format_hemizygote_variant(hem_variant, individuals, case_id, ind_positions):
    """The ``hem_variant`` fixture counts as one hemizygote, zero homozygotes."""
    result = get_formated_variant(
        variant=hem_variant,
        individuals=individuals,
        ind_positions=ind_positions,
        family_id=case_id,
    )

    assert result['homozygote'] == 0
    assert result['hemizygote'] == 1
def test_format_homozygote_variant(hom_variant, individuals, case_id):
    """The ``hom_variant`` fixture counts as one homozygote, zero hemizygotes."""
    result = get_formated_variant(
        variant=hom_variant,
        individuals=individuals,
        family_id=case_id,
    )

    assert result['homozygote'] == 1
    assert result['hemizygote'] == 0
def test_format_hemizygote_variant(hem_variant, individuals, case_id):
    """The ``hem_variant`` fixture counts as one hemizygote, zero homozygotes."""
    result = get_formated_variant(
        variant=hem_variant,
        individuals=individuals,
        family_id=case_id,
    )

    assert result['homozygote'] == 0
    assert result['hemizygote'] == 1
def test_format_variant_chr_prefix(variant_chr, individuals, ind_positions, case_id):
    """The stored ``chrom`` equals the variant's CHROM minus its first three characters."""
    result = get_formated_variant(
        variant=variant_chr,
        individuals=individuals,
        ind_positions=ind_positions,
        family_id=case_id,
        gq_treshold=20,
    )

    expected_chrom = variant_chr.CHROM[3:]
    assert result['chrom'] == expected_chrom
def test_format_variant_no_gq(variant_no_gq, individuals, ind_positions, case_id):
    """With a threshold of 20 the ``variant_no_gq`` fixture formats to an empty dict."""
    result = get_formated_variant(
        variant=variant_no_gq,
        individuals=individuals,
        ind_positions=ind_positions,
        family_id=case_id,
        gq_treshold=20,
    )

    assert result == {}
def test_format_variant_no_family_id(het_variant, individuals):
    """Formatting with ``family_id=None`` yields a variant without a family id.

    The ``het_variant`` fixture must still produce a truthy result, with both
    the homozygote and hemizygote counts at zero.
    """
    formated_variant = get_formated_variant(
        variant=het_variant,
        individuals=individuals,
        family_id=None,
    )

    assert formated_variant
    # Compare against the None singleton by identity, not equality.
    assert formated_variant.get('family_id') is None
    assert formated_variant['homozygote'] == 0
    assert formated_variant['hemizygote'] == 0
def test_format_variant_no_call():
    """A variant line whose only genotype is ``./.`` formats to an empty dict."""
    no_call_line = get_variant(genotypes=['./.'])
    header = get_header_line()
    affected = {'proband'}

    result = get_formated_variant(
        variant_line=no_call_line,
        header_line=header,
        affected_individuals=affected,
    )

    assert result == {}
def test_format_variant_no_header():
    """Formatting a variant line against an empty header must raise."""
    hom_line = get_variant(genotypes=['1/1'])
    empty_header = []
    affected = {'proband'}

    with pytest.raises(Exception):
        get_formated_variant(
            variant_line=hom_line,
            header_line=empty_header,
            affected_individuals=affected,
        )
def test_format_homozygote_variant():
    """A ``1/1`` genotype gives id ``1_10_A_T`` and a homozygote count of one."""
    hom_line = get_variant(genotypes=['1/1'])
    header = get_header_line()
    affected = {'proband'}

    result = get_formated_variant(
        variant_line=hom_line,
        header_line=header,
        affected_individuals=affected,
    )

    assert result['_id'] == '1_10_A_T'
    assert result['homozygote'] == 1
def delete_variants(adapter, vcf, ind_positions, family_id, individuals):
    """Delete variants for a case in the database

    Every variant in ``vcf`` is formatted with ``get_formated_variant``.
    Falsy results are skipped; all others are passed to
    ``adapter.delete_variant`` and counted. Progress is logged whenever the
    ``chrom`` value of the formatted variant changes.

    Args:
        adapter (loqusdb.plugins.Adapter)
        vcf (iterable(dict))
        ind_positions(dict)
        family_id (str)
        individuals: forwarded unchanged to ``get_formated_variant``

    Returns:
        nr_of_deleted (int): Number of deleted variants
    """
    nr_of_deleted = 0
    chrom_time = datetime.now()
    current_chrom = None

    for variant in vcf:
        formated_variant = get_formated_variant(
            variant=variant,
            ind_positions=ind_positions,
            individuals=individuals,
            family_id=family_id,
        )
        if not formated_variant:
            continue

        adapter.delete_variant(formated_variant)
        nr_of_deleted += 1

        new_chrom = formated_variant.get('chrom')
        if new_chrom == current_chrom:
            continue

        # Chromosome switch: report the one just finished (there is none
        # before the first variant), then restart the per-chromosome timer.
        if current_chrom:
            logger.info("Chromosome {0} done".format(current_chrom))
            logger.info("Time to delete chromosome {0}: {1}".format(
                current_chrom, datetime.now() - chrom_time))
        logger.info("Start deleting chromosome {0}".format(new_chrom))
        current_chrom = new_chrom
        chrom_time = datetime.now()

    return nr_of_deleted
def test_format_variant(het_variant, individuals, ind_positions, case_id):
    """Check every field of a formatted ``het_variant``.

    The id is CHROM, POS, REF and the first ALT joined by underscores; the
    individual fields mirror the variant's attributes, the family id is the
    case id and the homozygote count is zero.
    """
    result = get_formated_variant(
        variant=het_variant,
        individuals=individuals,
        ind_positions=ind_positions,
        family_id=case_id,
    )

    id_parts = (
        het_variant.CHROM,
        str(het_variant.POS),
        het_variant.REF,
        het_variant.ALT[0],
    )
    expected_id = '_'.join(id_parts)

    assert result
    assert result['_id'] == expected_id
    assert result['chrom'] == het_variant.CHROM
    assert result['pos'] == het_variant.POS
    assert result['ref'] == het_variant.REF
    assert result['alt'] == het_variant.ALT[0]
    assert result['family_id'] == case_id
    assert result['homozygote'] == 0
def delete_variants(adapter, variant_stream, family_id, affected_individuals):
    """Delete a case and its variants from the database.

    The case ``{'case_id': family_id}`` is removed through
    ``adapter.delete_case``. The stream is then read line by line: ``##``
    lines are ignored, the single-``#`` line supplies the header, and every
    other line is formatted with ``get_formated_variant`` and passed to
    ``adapter.delete_variant``.

    Args:
        adapter: object providing ``delete_case`` and ``delete_variant``
        variant_stream (iterable(str)): VCF lines
        family_id (str): case id of the case to delete
        affected_individuals: forwarded to ``get_formated_variant``

    Returns:
        nr_of_deleted (int): number of variants passed to
            ``adapter.delete_variant``
    """
    case = {'case_id': family_id}
    adapter.delete_case(case)

    header = []
    nr_of_deleted = 0

    for line in variant_stream:
        line = line.rstrip()

        if line.startswith('##'):
            continue
        if line.startswith('#'):
            header = line[1:].split()
            continue

        formated_variant = get_formated_variant(
            variant_line=line,
            header_line=header,
            affected_individuals=affected_individuals,
        )
        # An empty result means there is nothing to delete for this line, so
        # it must not reach the adapter or inflate the count.
        if formated_variant:
            adapter.delete_variant(formated_variant)
            nr_of_deleted += 1

    return nr_of_deleted
def test_format_variant(het_variant, individuals, case_id):
    """Check every field of a formatted dict-style ``het_variant``.

    The id is the CHROM, POS, REF and ALT entries joined by underscores; the
    position is compared as an int, the family id is the case id and the
    homozygote count is zero.
    """
    result = get_formated_variant(
        variant=het_variant,
        individuals=individuals,
        family_id=case_id,
    )

    expected_id = '_'.join(
        het_variant[key] for key in ('CHROM', 'POS', 'REF', 'ALT')
    )

    assert result
    assert result['_id'] == expected_id
    assert result['chrom'] == het_variant['CHROM']
    assert result['pos'] == int(het_variant['POS'])
    assert result['ref'] == het_variant['REF']
    assert result['alt'] == het_variant['ALT']
    assert result['family_id'] == case_id
    assert result['homozygote'] == 0
def delete_variants(adapter, vcf, family_id, individuals):
    """Delete variants for a case in the database

    Every variant in ``vcf`` is formatted with ``get_formated_variant``.
    Falsy results are skipped; all others are passed to
    ``adapter.delete_variant`` and counted. Progress is logged whenever the
    ``chrom`` value of the formatted variant changes.

    Args:
        adapter (loqusdb.plugins.Adapter)
        vcf (iterable(dict))
        family_id (str)
        individuals: forwarded unchanged to ``get_formated_variant``

    Returns:
        nr_of_deleted (int): Number of deleted variants
    """
    nr_of_deleted = 0
    chrom_time = datetime.now()
    current_chrom = None

    for variant in vcf:
        formated_variant = get_formated_variant(
            variant=variant,
            individuals=individuals,
            family_id=family_id,
        )
        if not formated_variant:
            continue

        adapter.delete_variant(formated_variant)
        nr_of_deleted += 1

        new_chrom = formated_variant.get("chrom")
        if new_chrom == current_chrom:
            continue

        # Chromosome switch: report the one just finished (there is none
        # before the first variant), then restart the per-chromosome timer.
        if current_chrom:
            logger.info("Chromosome {0} done".format(current_chrom))
            logger.info("Time to delete chromosome {0}: {1}".format(
                current_chrom, datetime.now() - chrom_time))
        logger.info("Start deleting chromosome {0}".format(new_chrom))
        current_chrom = new_chrom
        chrom_time = datetime.now()

    return nr_of_deleted
def load_variants(adapter, family_id, affected_individuals, variant_stream,
                  bulk_insert=False, vcf_path=None):
    """Load variants for a family into the database.

    The case is registered through ``adapter.add_case`` first. The stream is
    then read line by line: ``##`` lines are ignored, the single-``#`` line
    supplies the header, and every other line is formatted with
    ``get_formated_variant``. Truthy results are either inserted one by one
    or collected and flushed through ``adapter.add_bulk``.

    Args:
        adapter (loqusdb.plugins.MongoAdapter): initialized plugin
        family_id (str): unique family identifier
        affected_individuals (List[str]): list to match individuals
        variant_stream (sequence): stream of VCF lines
        bulk_insert (bool): whether to insert in bulk or one-by-one
        vcf_path (path): for storing in database
    """
    case = {'case_id': family_id, 'vcf_path': vcf_path}
    adapter.add_case(case)

    # This is the header line with mandatory vcf fields
    header = []

    nr_of_variants = 0
    nr_of_inserted = 0

    start_inserting = datetime.now()
    start_ten_thousand = datetime.now()

    # Variants waiting for the next bulk flush (only used with bulk_insert).
    variants = []

    for line in variant_stream:
        line = line.rstrip()

        if line.startswith('#'):
            if not line.startswith('##'):
                header = line[1:].split()
            continue

        nr_of_variants += 1
        formated_variant = get_formated_variant(
            variant_line=line,
            header_line=header,
            affected_individuals=affected_individuals,
        )

        if formated_variant:
            nr_of_inserted += 1
            if bulk_insert:
                variants.append(formated_variant)
            else:
                adapter.add_variant(variant=formated_variant)

        # Progress reporting and periodic flushing are keyed on the number
        # of processed lines, whether or not this line produced a variant.
        if nr_of_variants % 10000 == 0:
            logger.info("{0} of variants processed".format(nr_of_variants))
            logger.info("Time to insert last 10000: {0}".format(
                datetime.now()-start_ten_thousand))
            start_ten_thousand = datetime.now()

        # Never hand an empty batch to the adapter; bulk-write back ends
        # commonly reject an empty document list.
        if nr_of_variants % 100000 == 0 and bulk_insert and variants:
            adapter.add_bulk(variants)
            variants = []

    # Flush whatever is left after the last full batch.
    if bulk_insert and variants:
        adapter.add_bulk(variants)

    logger.info("Nr of variants in vcf: {0}".format(nr_of_variants))
    logger.info("Nr of variants inserted: {0}".format(nr_of_inserted))
    logger.info("Time to insert variants: {0}".format(datetime.now() - start_inserting))