def __init__(self, opts):
    """ initialise the class from the parsed options
    
    Args:
        opts: options object handed straight to set_definitions(). That call
            is relied upon to define the output_path, export_vcf, ID_mapper
            and known_genes_date attributes read below.
    """
    
    self.set_definitions(opts)
    
    # the reporter is configured purely from attributes read off self
    report_args = (self.output_path, self.export_vcf, self.ID_mapper,
        self.known_genes_date)
    self.report = Report(*report_args)
def test__save_tabular(self):
    ''' check that _save_tabular() works correctly
    
    A single de novo missense variant is written to a temporary file, and the
    file contents are compared against the expected header and data lines.
    '''
    
    # We only need a unique path inside the temp directory. Close the handle
    # straight away, so that it is not leaked for the lifetime of the test and
    # so the path can be reopened for writing on every platform. Since
    # delete=False, the file itself survives the close.
    temp = tempfile.NamedTemporaryFile(suffix='.txt', dir=self.temp_dir,
        delete=False)
    temp.close()
    
    # NOTE(review): the Report instance is not used by the rest of this test;
    # it is still constructed in case the constructor is relied upon - confirm
    report = Report(temp.name, None, None)
    
    var = (self.variants[0], ["single_variant"], ["Monoallelic"], ["TEST"])
    
    # the GQ value is reported in the tabular output, so define one
    var[0].child.format['GQ'] = 40
    
    _save_tabular(temp.name, [var], self.trio)
    
    with open(temp.name, 'r') as handle:
        lines = handle.readlines()
    
    expected = [
        'proband\tsex\tchrom\tposition\tgene\t'
        'mutation_ID\ttranscript\tconsequence\tref/alt_alleles\tMAX_MAF\t'
        'inheritance\ttrio_genotype\tmom_aff\tdad_aff\tresult\tpp_dnm\t'
        'exac_allele_count\tGQ\thas_parents\tcnv_length\n',
        'child\tF\tX\t150\tTEST\tNA\tNA\t'
        'missense_variant\tA/G\t0.0005\tMonoallelic\t1/0/0\t1\t1\t'
        'single_variant\t0.99\tNA\t40\tTrue\tNA\n'
    ]
    
    self.assertEqual(lines, expected)
def setUp(self):
    """ build the family, trio variant and Report shared by the tests
    """
    
    # a female proband, with both parents given an affected status of "1"
    proband_sex = "F"
    self.trio = self.create_family(proband_sex, "1", "1")
    
    # all three trio members are genotyped at the same chrX site; only the
    # child is heterozygous, and only the child carries extra INFO fields
    site = {'chrom': 'X', 'pos': 150}
    child = create_snv(proband_sex, "0/1",
        extra_info='HGNC=TEST;MAX_AF=0.0005', **site)
    mom = create_snv("F", "0/0", **site)
    dad = create_snv("M", "0/0", **site)
    
    self.variants = [TrioGenotypes('X', '150', child, mom, dad)]
    self.report = Report(None, None, None)
    
    # the population allele frequency tags are only set after the variants
    # have been created
    population_tags = ["AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
        "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF"]
    Info.set_populations(population_tags)
def __init__(self, population_tags=None, count=0, known_genes=None, date=None,
        regions=None, lof_sites=None, pp_filter=0.0, sum_x_lr2_file=None,
        output_path=None, export_vcf=None, debug_chrom=None, debug_pos=None):
    """ set up the filtering state, the reference datasets and the reporter
    
    Args:
        population_tags: list of population ID tags that could exist within
            the INFO field, or None.
        count: number of probands to analyse, used for tracking progress in
            the output logs.
        known_genes: path to a table of genes known to be associated with
            genetic disorders, or None.
        date: date of the known_genes file, or None if not using/unknown.
        regions: path to a table of regions for DECIPHER CNV syndromes.
        lof_sites: path to a json file of [chrom, position] coordinates in
            the genome, for modifying to a loss-of-function consequence if
            required. Can be None if unneeded.
        pp_filter: threshold from 0 to 1 for the pp_dnm value, candidate DNMs
            which fall below this value are filtered out.
        sum_x_lr2_file: file containing the sum of l2r values on the X
            chromosome for each person.
        output_path: path to write the output tab-separated file to.
        export_vcf: path to a file or folder to write VCFs to.
        debug_chrom: chromosome for debugging purposes.
        debug_pos: position for debugging variant filtering at.
    """
    
    # progress counters: "count" probands examined so far, out of "total"
    self.count = 0
    self.total = count
    
    # filtering parameters, stored as given
    self.pp_filter = pp_filter
    self.populations = population_tags
    self.debug_chrom, self.debug_pos = debug_chrom, debug_pos
    
    # open the reference datasets, these return None if the paths are None
    self.known_genes = open_known_genes(known_genes)
    self.cnv_regions = open_cnv_regions(regions)
    self.last_base = open_last_base_sites(lof_sites)
    
    # sum of mean log 2 ratios on X for each person, this returns an empty
    # dict if the path is None
    self.sum_x_lr2 = open_x_lr2_file(sum_x_lr2_file)
    
    reporter = Report(output_path, export_vcf, date)
    self.reporter = reporter
def setUp(self):
    """ build the family, trio variant and Report shared by the tests
    """
    
    # a female proband, with both parents given an affected status of "1"
    proband_sex = "F"
    self.trio = self.create_family(proband_sex, "1", "1")
    
    # the child is heterozygous, while both parents are homozygous reference
    proband = self.create_snv(proband_sex, "0/1")
    mother = self.create_snv("F", "0/0")
    father = self.create_snv("M", "0/0")
    
    # combine the individual variants into a single trio variant
    trio_variant = TrioGenotypes(proband)
    trio_variant.add_mother_variant(mother)
    trio_variant.add_father_variant(father)
    self.variants = [trio_variant]
    
    # the report has no output paths, it only needs to know the family
    self.report = Report(None, None, None, None)
    self.report.family = self.trio
class ClinicalFilter(LoadOptions):
    """ filters trios for candidate variants that might contribute to a
    probands disorder.
    """
    
    def __init__(self, opts):
        """ initialise the class from the parsed options
        
        Args:
            opts: options object passed straight to set_definitions(), which
                is relied upon to define the attributes used by this class.
        """
        
        self.set_definitions(opts)
        self.report = Report(self.output_path, self.export_vcf, self.ID_mapper,
            self.known_genes_date)
    
    def filter_trios(self):
        """ loads trio variants, and screens for candidate variants
        
        Families are processed in sorted family ID order. Every child in a
        family is visited, but only affected children are analysed. The
        process exits with status 0 once all the families have been examined.
        """
        
        self.vcf_loader = LoadVCFs(len(self.families), self.known_genes,
            self.debug_chrom, self.debug_pos)
        
        # load the trio paths into the current path setup
        for family_ID in sorted(self.families):
            self.family = self.families[family_ID]
            
            # some families have more than one child in the family, so run
            # through each child.
            self.family.set_child()
            while self.family.child is not None:
                if self.family.child.is_affected():
                    variants = self.vcf_loader.get_trio_variants(self.family,
                        self.pp_filter)
                    self.vcf_provenance = self.vcf_loader.get_trio_provenance()
                    self.analyse_trio(variants)
                
                # mark the child as examined whether or not it was analysed,
                # which is what lets the while loop move on
                self.family.set_child_examined()
        
        sys.exit(0)
    
    def analyse_trio(self, variants):
        """ identify candidate variants in exome data for a single trio.
        
        Takes variants that passed the initial filtering from VCF loading, and
        splits the variants into groups for each gene with variants. Then
        analyses variants in a single gene (so we can utilise the appropriate
        inheritance mechanisms for that gene), before running some
        post-inheritance filters, and exporting the data (if required).
        
        Args:
            variants: list of TrioGenotypes objects
        """
        
        # organise variants by gene, then find variants that fit
        # different inheritance models
        genes_dict = self.create_gene_dict(variants)
        
        found_vars = []
        for gene, gene_vars in genes_dict.items():
            found_vars += self.find_variants(gene_vars, gene)
        
        # remove any duplicate variants (which might occur due to CNVs being
        # checked against all the genes that they encompass)
        found_vars = self.exclude_duplicates(found_vars)
        
        # apply some final filters to the flagged variants
        post_filter = PostInheritanceFilter(found_vars, self.family,
            self.debug_chrom, self.debug_pos)
        found_vars = post_filter.filter_variants()
        
        # export the results to either tab-separated table or VCF format
        self.report.export_data(found_vars, self.family,
            self.vcf_loader.child_header, self.vcf_provenance)
    
    def create_gene_dict(self, variants):
        """ creates dictionary of variants indexed by gene
        
        Args:
            variants: list of TrioGenotypes objects
        
        Returns:
            dictionary of variant lists, indexed by HGNC symbols
        """
        
        genes = {}
        for var in variants:
            # variants (particularly CNVs) can span multiple genes, so we need
            # to check each gene separately, and then collapse duplicates later
            for gene in var.get_genes():
                genes.setdefault(gene, []).append(var)
        
        return genes
    
    def find_variants(self, variants, gene):
        """ finds variants that fit inheritance models
        
        Args:
            variants: list of TrioGenotype objects
            gene: gene ID as string, or None for intergenic variants
        
        Returns:
            list of (variant, [check types], [inheritances], [gene]) tuples
            for the variants that pass the inheritance checks
        
        Raises:
            ValueError: if the inheritance type reported by the first variant
                is neither autosomal nor one of the allosomal types.
        """
        
        # get the inheritance for the gene (monoallelic, biallelic, hemizygous
        # etc), but allow for times when we haven't specified a list of genes
        # to use
        gene_inh = None
        if self.known_genes is not None and gene in self.known_genes:
            gene_inh = self.known_genes[gene]["inh"]
        
        # If we are looking for variants in a set of known genes, and the gene
        # isn't part of that set, then we don't want to examine the variant for
        # that gene, UNLESS the variant is a CNV, since CNVs can be included
        # purely from size thresholds, regardless of which gene they overlap.
        if self.known_genes is not None and gene not in self.known_genes:
            variants = [x for x in variants if x.is_cnv()]
        
        # ignore intergenic variants
        if gene is None:
            for var in variants:
                if var.get_chrom() == self.debug_chrom and var.get_position() == self.debug_pos:
                    print(var, "lacks HGNC/gene symbol")
            return []
        
        # Now that we are examining a single gene, check that the consequences
        # for the gene are in the required functional categories.
        variants = [var for var in variants
            if var.child.is_lof(gene) or var.child.is_missense(gene)]
        
        if not variants:
            return []
        
        logging.debug("{} {} {} {}".format(self.family.child.get_id(), gene,
            variants, gene_inh))
        
        # all the variants are for the same gene, so the first variant is used
        # to pick the type of inheritance checks
        chrom_inheritance = variants[0].get_inheritance_type()
        
        if chrom_inheritance == "autosomal":
            finder = Autosomal(variants, self.family, self.known_genes, gene,
                self.cnv_regions)
        elif chrom_inheritance in ["XChrMale", "XChrFemale", "YChrMale"]:
            finder = Allosomal(variants, self.family, self.known_genes, gene,
                self.cnv_regions)
        else:
            # fail with a clear message, rather than an UnboundLocalError from
            # using an inheritance checker that was never defined
            raise ValueError("unknown inheritance type '{}' for gene {}".format(
                chrom_inheritance, gene))
        
        variants = finder.get_candidate_variants()
        
        # convert the check types and inheritances to fresh lists, and record
        # which gene the variant was flagged for
        return [(x[0], list(x[1]), list(x[2]), [gene]) for x in variants]
    
    def exclude_duplicates(self, variants):
        """ rejig variants included under multiple inheritance mechanisms
        
        Args:
            variants: list of candidate variants, as (variant, check_type,
                inheritance, hgnc) tuples
        
        Returns:
            list of (variant, check_type, inheritance, hgnc) tuples, with
            duplicates excluded, and originals modified to show all the
            mechanisms and genes.
        """
        
        unique_vars = {}
        for variant in variants:
            key = variant[0].child.get_key()
            if key not in unique_vars:
                unique_vars[key] = list(variant)
                continue
            
            merged = unique_vars[key]
            
            # append the check type, inheritance type and HGNC symbols that
            # are new to the first instance of the variant
            for pos in (1, 2, 3):
                merged[pos] += [x for x in variant[pos] if x not in merged[pos]]
        
        return [tuple(unique_vars[x]) for x in unique_vars]
class Filter(object):
    """ filters trios for candidate variants that might contribute to a
    probands disorder.
    """
    
    def __init__(self, population_tags=None, count=0, known_genes=None, date=None,
            regions=None, lof_sites=None, pp_filter=0.0, sum_x_lr2_file=None,
            output_path=None, export_vcf=None, debug_chrom=None, debug_pos=None):
        """ initialise the class object
        
        Args:
            population_tags: list of population ID tags, that could exist
                within the INFO field, or None.
            count: number of probands to analyse, helpful for tracking
                progress in output logs.
            known_genes: path to table of genes known to be associated with
                genetic disorders, or None.
            date: date of the known_genes file, or None if not using/unknown.
            regions: path to a table of regions for DECIPHER CNV syndromes.
            lof_sites: path to json file of [chrom, position] coordinates in
                genome, for modifying to a loss-of-function consequence if
                required. Can be None if unneeded.
            pp_filter: threshold from 0 to 1 for pp_dnm value to filter out
                candidate DNMs which fall below this value
            sum_x_lr2_file: File containing sum of l2r values on x chromosome
                for each person
            output_path: path to write output tab-separated file to
            export_vcf: path to file or folder to write VCFs to.
            debug_chrom: chromosome for debugging purposes.
            debug_pos: position for debugging variant filtering at.
        """
        
        self.pp_filter = pp_filter
        self.total = count
        self.count = 0
        
        self.populations = population_tags
        self.debug_chrom = debug_chrom
        self.debug_pos = debug_pos
        
        # open reference datasets, these return None if the paths are None
        self.known_genes = open_known_genes(known_genes)
        self.cnv_regions = open_cnv_regions(regions)
        self.last_base = open_last_base_sites(lof_sites)
        
        # open file containing sum of mean log 2 ratios on X, returns an empty
        # dict if path is None
        self.sum_x_lr2 = open_x_lr2_file(sum_x_lr2_file)
        
        self.reporter = Report(output_path, export_vcf, date)
    
    def filter_trio(self, family):
        """ loads trio variants, and screens for candidate variants
        
        Args:
            family: Family object. Each child is visited in turn, but only
                affected children are analysed and reported.
        """
        
        # some families have more than one child in the family, so run
        # through each child.
        family.set_child()
        while family.child is not None:
            if family.child.is_affected():
                self.count += 1
                logging.info("opening trio {} of {}".format(self.count, self.total))
                found_vars = self.analyse_trio(family)
                
                # export the results to either tab-separated table or VCF format
                self.reporter.export_data(found_vars, family)
            
            # mark the child as examined whether or not it was analysed, which
            # is what lets the while loop move on
            family.set_child_examined()
    
    def analyse_trio(self, family):
        """ identify candidate variants in exome data for a single trio.
        
        Loads the variants for the trio, and splits the variants into groups
        for each gene with variants. Then analyses variants in a single gene
        (so we can utilise the appropriate inheritance mechanisms for that
        gene), before running some post-inheritance filters.
        
        Args:
            family: Family object
        
        Returns:
            list of (TrioGenotype, [genes], [inheritances], [type]) tuples for
            variants that pass inheritance and post-inheritance checks.
        """
        
        variants = load_variants(family, self.pp_filter, self.populations,
            self.known_genes, self.last_base, self.sum_x_lr2, self.debug_chrom,
            self.debug_pos)
        
        # organise variants by gene, then find variants that fit different
        # inheritance models. We have to flatten the list of variant lists
        genes = self.create_gene_dict(variants)
        variants = [self.find_variants(genes[x], x, family) for x in genes]
        variants = [x for sublist in variants for x in sublist]
        
        # remove any duplicate variants (which might occur due to CNVs being
        # checked against all the genes that they encompass)
        variants = self.exclude_duplicates(variants)
        
        # apply some final filters to the flagged variants
        post_filter = PostInheritanceFilter(family, self.debug_chrom, self.debug_pos)
        
        return post_filter.filter_variants(variants)
    
    def create_gene_dict(self, variants):
        """ creates dictionary of variants indexed by gene
        
        Args:
            variants: list of TrioGenotypes objects
        
        Returns:
            dictionary of variant lists, indexed by gene ID
        """
        
        genes = {}
        for var in variants:
            # variants (particularly CNVs) can span multiple genes, so we need
            # to check each gene separately, and then collapse duplicates
            # later. get_genes() yields lists of genes, so unpack each list.
            for gene_list in var.get_genes():
                for gene in gene_list:
                    genes.setdefault(gene, []).append(var)
        
        return genes
    
    def find_variants(self, variants, gene, family):
        """ finds variants that fit inheritance models
        
        Args:
            variants: non-empty list of TrioGenotype objects
            gene: gene ID as string, or None for intergenic variants
            family: Family object for the trio being examined
        
        Returns:
            list of variants that pass inheritance checks
        
        Raises:
            ValueError: if the inheritance type reported by the first variant
                is neither autosomal nor one of the allosomal types.
        """
        
        # get the inheritance for the gene (monoallelic, biallelic, hemizygous
        # etc), but allow for times when we haven't specified a list of genes
        # to use
        known_gene = None
        gene_inh = None
        if self.known_genes is not None and gene in self.known_genes:
            known_gene = self.known_genes[gene]
            gene_inh = known_gene['inh']
        
        # all the variants are for the same gene, so the first variant is used
        # to pick the type of inheritance checks
        chrom_inheritance = variants[0].get_inheritance_type()
        
        # If we are looking for variants in a set of known genes, and the gene
        # isn't part of that set, then we don't want to examine the variant for
        # that gene, UNLESS the variant is a CNV, since CNVs can be included
        # purely from size thresholds, regardless of which gene they overlap.
        if self.known_genes is not None and gene not in self.known_genes:
            variants = [x for x in variants if x.is_cnv()]
        
        # ignore intergenic variants
        if gene is None:
            for var in variants:
                if var.get_chrom() == self.debug_chrom and var.get_position() == self.debug_pos:
                    print(var, "lacks HGNC/gene symbol")
            return []
        
        # Now that we are examining a single gene, check that the consequences
        # for the gene are in the required functional categories.
        variants = [var for var in variants if var.child.is_lof(gene) or
            var.child.is_missense(var.child.is_cnv(), gene)]
        
        if not variants:
            return []
        
        # Find a symbol for the gene to use in the log, from the first symbol
        # table that can resolve the gene. Fall back to the gene ID, so that
        # the logging call cannot hit an undefined name when none of the
        # tables know the gene (or there are no tables at all).
        symbol = gene
        for x in variants[0].child.info.symbols:
            try:
                symbol = x.get(gene, ['HGNC', 'SYMBOL', 'ENSG'])
                break
            except KeyError:
                continue
        
        logging.info("{}\t{}\tvariants: {}\trequired_mode: {}".format(
            family.child.get_id(), symbol, [str(x) for x in variants], gene_inh))
        
        if chrom_inheritance == "autosomal":
            finder = Autosomal(variants, family, known_gene, gene, self.cnv_regions)
        elif chrom_inheritance in ["XChrMale", "XChrFemale", "YChrMale"]:
            finder = Allosomal(variants, family, known_gene, gene, self.cnv_regions)
        else:
            # fail with a clear message, rather than an UnboundLocalError from
            # using an inheritance checker that was never defined
            raise ValueError("unknown inheritance type '{}' for gene {}".format(
                chrom_inheritance, gene))
        
        return finder.get_candidate_variants()
    
    def exclude_duplicates(self, variants):
        """ rejig variants included under multiple inheritance mechanisms
        
        The lists inside the input tuples are copied rather than modified, so
        the candidates passed in are left untouched.
        
        Args:
            variants: list of candidate variants, as (variant, check_type,
                inheritance, genes) tuples
        
        Returns:
            list of (variant, check_type, inheritance, genes) tuples, with
            duplicates excluded, and originals modified to show all the
            mechanisms and genes. The check types and inheritances of merged
            variants are sorted.
        """
        
        unique_vars = {}
        for variant in variants:
            key = variant[0].child.get_key()
            if key not in unique_vars:
                # copy the lists, so that merging later duplicates into this
                # entry cannot alter lists owned by the caller
                entry = list(variant)
                entry[1:4] = [list(x) for x in entry[1:4]]
                unique_vars[key] = entry
                continue
            
            merged = unique_vars[key]
            result, inh, genes = variant[1], variant[2], variant[3]
            
            # append the check type and inheritance type to the first
            # instance of the variant
            merged[1] = sorted(merged[1] + [x for x in result if x not in merged[1]])
            merged[2] = sorted(merged[2] + [x for x in inh if x not in merged[2]])
            
            # add the gene IDs that are unique to the current variant to the
            # merged variant
            merged[3] += [x for x in genes if x not in merged[3]]
        
        return [tuple(unique_vars[x]) for x in unique_vars]
class TestReportPy(unittest.TestCase):
    """ test the Report class
    """
    
    def setUp(self):
        """ define a family and trio variant, and start the Report class
        """
        
        # generate a test family
        child_gender = "F"
        mom_aff = "1"
        dad_aff = "1"
        
        self.trio = self.create_family(child_gender, mom_aff, dad_aff)
        
        # generate a test variant
        child_var = self.create_snv(child_gender, "0/1")
        mom_var = self.create_snv("F", "0/0")
        dad_var = self.create_snv("M", "0/0")
        
        var = TrioGenotypes(child_var)
        var.add_mother_variant(mom_var)
        var.add_father_variant(dad_var)
        self.variants = [var]
        
        self.report = Report(None, None, None, None)
        self.report.family = self.trio
    
    def create_snv(self, gender, genotype):
        """ create a default variant
        
        Args:
            gender: gender code passed to the variant's set_gender()
            genotype: genotype string for the sample, such as "0/1"
        
        Returns:
            SNV object at a fixed chrX site, with a missense consequence
        """
        
        chrom = "X"
        pos = "15000000"
        snp_id = "."
        ref = "A"
        alt = "G"
        qual = "50"
        filt = "PASS"
        
        # set up a SNV object, since SNV inherits VcfInfo
        var = SNV(chrom, pos, snp_id, ref, alt, filt)
        
        info = "HGNC=TEST;CQ=missense_variant;random_tag;EUR_AF=0.0005"
        format_keys = "GT:DP"
        sample_values = genotype + ":50"
        
        var.vcf_line = [chrom, pos, snp_id, ref, alt, qual, filt, info,
            format_keys, sample_values]
        
        var.add_info(info)
        var.add_format(format_keys, sample_values)
        var.set_gender(gender)
        var.set_genotype()
        
        return var
    
    def create_family(self, child_gender, mom_aff, dad_aff):
        """ create a default family, with optional gender and parental statuses
        
        Args:
            child_gender: gender code for the child
            mom_aff: affected status for the mother
            dad_aff: affected status for the father
        
        Returns:
            Family object with the child set as the current child
        """
        
        fam = Family("test")
        fam.add_child("child", "child_vcf", "2", child_gender)
        fam.add_mother("mother", "mother_vcf", mom_aff, "2")
        fam.add_father("father", "father_vcf", dad_aff, "1")
        fam.set_child()
        
        return fam
    
    def _vcf_header(self):
        """ get the initial VCF header lines used as input by the header tests
        """
        
        # the column header is tab-separated throughout. This previously ended
        # in "FORMAT\sample_id", where "\s" is an invalid escape sequence.
        return ["####fileformat=VCFv4.1\n",
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_id\n"]
    
    def _vcf_provenance(self):
        """ get (checksum, basename, date) provenances for proband, mother, father
        """
        
        return [("checksum", "proband.calls.date.vcf.gz", "2014-01-01"),
            ("checksum", "mother.calls.date.vcf.gz", "2014-01-02"),
            ("checksum", "father.calls.date.vcf.gz", "2014-01-03")]
    
    def _processed_header(self):
        """ get the header lines expected once the initial header is processed
        """
        
        return ["####fileformat=VCFv4.1\n",
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
            '##INFO=<ID=ClinicalFilterType,Number=.,Type=String,Description="The type of clinical filter that passed this variant.">\n',
            '##INFO=<ID=ClinicalFilterGeneInheritance,Number=.,Type=String,Description="The inheritance mode (Monoallelic, Biallelic etc) under which the variant was found.">\n',
            '##INFO=<ID=ClinicalFilterReportableHGNC,Number=.,Type=String,Description="The HGNC symbol which the variant was identified as being reportable for.">\n',
            '##FORMAT=<ID=INHERITANCE_GENOTYPE,Number=.,Type=String,Description="The 012 coded genotypes for a trio (child, mother, father).">\n',
            '##FORMAT=<ID=INHERITANCE,Number=.,Type=String,Description="The inheritance of the variant in the trio (biparental, paternal, maternal, deNovo).">\n',
            "##ClinicalFilterRunDate={0}\n".format(datetime.date.today()),
            "##ClinicalFilterVersion=XXX\n",
            "##ClinicalFilterHistory=single_variant,compound_het\n",
            "##UberVCF_proband_Id=proband\n",
            "##UberVCF_proband_Checksum=checksum\n",
            "##UberVCF_proband_Basename=proband.calls.date.vcf.gz\n",
            "##UberVCF_proband_Date=2014-01-01\n",
            "##UberVCF_maternal_Id=mother\n",
            "##UberVCF_maternal_Checksum=checksum\n",
            "##UberVCF_maternal_Basename=mother.calls.date.vcf.gz\n",
            "##UberVCF_maternal_Date=2014-01-02\n",
            "##UberVCF_paternal_Id=father\n",
            "##UberVCF_paternal_Checksum=checksum\n",
            "##UberVCF_paternal_Basename=father.calls.date.vcf.gz\n",
            "##UberVCF_paternal_Date=2014-01-03\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_id\n"]
    
    def test__get_provenance(self):
        """ check that _get_provenance() works correctly
        """
        
        prov = ["checksum", "sample.calls.date.vcf.gz", "2014-01-01"]
        member = "proband"
        
        self.assertEqual(self.report._get_provenance(prov, member),
            ["##UberVCF_proband_Id=sample\n",
            "##UberVCF_proband_Checksum=checksum\n",
            "##UberVCF_proband_Basename=sample.calls.date.vcf.gz\n",
            "##UberVCF_proband_Date=2014-01-01\n"])
    
    def test__get_vcf_export_path(self):
        """ check that _get_vcf_export_path() works correctly
        """
        
        # use a folder to place the VCF file in, which means we join the
        # proband ID to get a full path
        self.report.export_vcf = os.getcwd()
        self.assertEqual(self.report._get_vcf_export_path(),
            os.path.join(os.getcwd(), "child.vcf.gz"))
        
        # define an unusable directory, to raise an error
        self.report.export_vcf = os.getcwd() + "asjhfgasjhfg"
        self.assertRaises(ValueError, self.report._get_vcf_export_path)
        
        # define a specific path for a VCF file, which is returned directly
        self.report.export_vcf = os.path.join(os.getcwd(), "sample_id.vcf.gz")
        self.assertEqual(self.report._get_vcf_export_path(), self.report.export_vcf)
    
    def test__make_vcf_header(self):
        """ check that _make_vcf_header() works correctly
        """
        
        header = self._vcf_header()
        provenance = self._vcf_provenance()
        processed_header = self._processed_header()
        
        # check that the standard function returns the expected value. Note that
        # I haven't checked the output if self.known_genes_date is not None, nor
        # have I checked if the _clinicalFilterVersion is available
        self.assertEqual(self.report._make_vcf_header(header, provenance),
            processed_header)
    
    def test__get_parental_inheritance(self):
        """ check that _get_parental_inheritance() works correctly
        """
        
        var = self.variants[0]
        
        # check for the default genotypes
        self.assertEqual(self.report._get_parental_inheritance(var), "deNovo")
        
        # check when only the mother is non-ref
        var.mother.genotype = 1
        self.assertEqual(self.report._get_parental_inheritance(var), "maternal")
        
        # check when both parents are non-ref
        var.father.genotype = 1
        self.assertEqual(self.report._get_parental_inheritance(var), "biparental")
        
        # check when only the father is non-ref
        var.mother.genotype = 0
        self.assertEqual(self.report._get_parental_inheritance(var), "paternal")
        
        # check when the proband lacks parental information
        self.report.family.father = None
        self.report.family.mother = None
        self.assertEqual(self.report._get_parental_inheritance(var), "unknown")
    
    def test__get_vcf_lines(self):
        """ check that _get_vcf_lines() works correctly
        """
        
        header = self._vcf_header()
        provenance = self._vcf_provenance()
        
        # define what the header will become
        vcf_lines = self._processed_header()
        
        # define what the default variant vcf line will become
        line = ["X\t15000000\t.\tA\tG\t50\tPASS\tHGNC=TEST;CQ=missense_variant;random_tag;EUR_AF=0.0005;ClinicalFilterGeneInheritance=Monoallelic;ClinicalFilterType=single_variant;ClinicalFilterReportableHGNC=TEST\tGT:DP:INHERITANCE:INHERITANCE_GENOTYPE\t0/1:50:deNovo:1,0,0\n"]
        
        # check that a list of one variant produces the correct VCF output. Note
        # that we haven't checked against CNVs, which can change the
        # INHERITANCE_GENOTYPE flag, nor have we tested a larger list of variants
        var = (self.variants[0], ["single_variant"], ["Monoallelic"], ["TEST"])
        self.assertEqual(self.report._get_vcf_lines([var], header, provenance),
            vcf_lines + line)
    
    def test__get_output_line(self):
        """ check that _get_output_line() works correctly
        """
        
        var = (self.variants[0], ["single_variant"], ["Monoallelic"], ["TEST"])
        dad_aff = "0"
        mom_aff = "1"
        alt_id = "test_id"
        
        # check the output for the default variant
        expected = "child\ttest_id\tF\tX\t15000000\tTEST\tNA\tNA\tmissense_variant\tA/G\t0.0005\tMonoallelic\t1/0/0\t1\t0\tsingle_variant\tNA\tNA\n"
        self.assertEqual(self.report._get_output_line(var, dad_aff, mom_aff, alt_id), expected)
        
        # introduce additional info for the output line parsing, check the line
        # that is returned is expected
        var[0].child.info["PolyPhen"] = "probably_damaging(0.99)"
        var[0].child.info["SIFT"] = "deleterious(0)"
        var[0].child.info["ENST"] = "ENST00X"
        expected = "child\ttest_id\tF\tX\t15000000\tTEST\tNA\tENST00X\tmissense_variant,PolyPhen=probably_damaging(0.99),SIFT=deleterious(0)\tA/G\t0.0005\tMonoallelic\t1/0/0\t1\t0\tsingle_variant\tNA\tNA\n"
        self.assertEqual(self.report._get_output_line(var, dad_aff, mom_aff, alt_id), expected)
class ClinicalFilter(LoadOptions):
    """ filters trios for candidate variants that might contribute to a
    probands disorder.
    """
    
    def __init__(self, opts):
        """ initialise the class from the parsed options
        
        Args:
            opts: options object passed straight to set_definitions(), which
                is relied upon to define the attributes used by this class.
        """
        
        self.set_definitions(opts)
        self.report = Report(self.output_path, self.export_vcf, self.ID_mapper,
            self.known_genes_date)
    
    def filter_trios(self):
        """ loads trio variants, and screens for candidate variants
        
        Families are processed in sorted family ID order. Every child in a
        family is visited, but only affected children are analysed. The
        process exits with status 0 once all the families have been examined.
        """
        
        self.vcf_loader = LoadVCFs(len(self.families), self.known_genes,
            self.debug_chrom, self.debug_pos)
        
        # load the trio paths into the current path setup
        for family_ID in sorted(self.families):
            self.family = self.families[family_ID]
            
            # some families have more than one child in the family, so run
            # through each child.
            self.family.set_child()
            while self.family.child is not None:
                if self.family.child.is_affected():
                    variants = self.vcf_loader.get_trio_variants(self.family,
                        self.pp_filter)
                    self.vcf_provenance = self.vcf_loader.get_trio_provenance()
                    self.analyse_trio(variants)
                
                # mark the child as examined whether or not it was analysed,
                # which is what lets the while loop move on
                self.family.set_child_examined()
        
        sys.exit(0)
    
    def analyse_trio(self, variants):
        """ identify candidate variants in exome data for a single trio.
        
        Takes variants that passed the initial filtering from VCF loading, and
        splits the variants into groups for each gene with variants. Then
        analyses variants in a single gene (so we can utilise the appropriate
        inheritance mechanisms for that gene), before running some
        post-inheritance filters, and exporting the data (if required).
        
        Args:
            variants: list of TrioGenotypes objects
        """
        
        # organise variants by gene, then find variants that fit
        # different inheritance models
        genes_dict = self.create_gene_dict(variants)
        
        found_vars = []
        for gene, gene_vars in genes_dict.items():
            found_vars += self.find_variants(gene_vars, gene)
        
        # remove any duplicate variants (which might occur due to CNVs being
        # checked against all the genes that they encompass)
        found_vars = self.exclude_duplicates(found_vars)
        
        # apply some final filters to the flagged variants
        post_filter = PostInheritanceFilter(found_vars, self.family,
            self.debug_chrom, self.debug_pos)
        found_vars = post_filter.filter_variants()
        
        # export the results to either tab-separated table or VCF format
        self.report.export_data(found_vars, self.family,
            self.vcf_loader.child_header, self.vcf_provenance)
    
    def create_gene_dict(self, variants):
        """ creates dictionary of variants indexed by gene
        
        Args:
            variants: list of TrioGenotypes objects
        
        Returns:
            dictionary of variant lists, indexed by HGNC symbols
        """
        
        genes = {}
        for var in variants:
            # variants (particularly CNVs) can span multiple genes, so we need
            # to check each gene separately, and then collapse duplicates later
            for gene in var.get_genes():
                genes.setdefault(gene, []).append(var)
        
        return genes
    
    def find_variants(self, variants, gene):
        """ finds variants that fit inheritance models
        
        Args:
            variants: list of TrioGenotype objects
            gene: gene ID as string, or None for intergenic variants
        
        Returns:
            list of (variant, [check types], [inheritances], [gene]) tuples
            for the variants that pass the inheritance checks
        
        Raises:
            ValueError: if the inheritance type reported by the first variant
                is neither autosomal nor one of the allosomal types.
        """
        
        # get the inheritance for the gene (monoallelic, biallelic, hemizygous
        # etc), but allow for times when we haven't specified a list of genes
        # to use
        gene_inh = None
        if self.known_genes is not None and gene in self.known_genes:
            gene_inh = self.known_genes[gene]["inh"]
        
        # If we are looking for variants in a set of known genes, and the gene
        # isn't part of that set, then we don't want to examine the variant for
        # that gene, UNLESS the variant is a CNV, since CNVs can be included
        # purely from size thresholds, regardless of which gene they overlap.
        if self.known_genes is not None and gene not in self.known_genes:
            variants = [x for x in variants if x.is_cnv()]
        
        # ignore intergenic variants
        if gene is None:
            for var in variants:
                if var.get_chrom() == self.debug_chrom and var.get_position() == self.debug_pos:
                    print(var, "lacks HGNC/gene symbol")
            return []
        
        # Now that we are examining a single gene, check that the consequences
        # for the gene are in the required functional categories.
        variants = [var for var in variants
            if var.child.is_lof(gene) or var.child.is_missense(gene)]
        
        if not variants:
            return []
        
        logging.debug("{} {} {} {}".format(self.family.child.get_id(), gene,
            variants, gene_inh))
        
        # all the variants are for the same gene, so the first variant is used
        # to pick the type of inheritance checks
        chrom_inheritance = variants[0].get_inheritance_type()
        
        if chrom_inheritance == "autosomal":
            finder = Autosomal(variants, self.family, self.known_genes, gene,
                self.cnv_regions)
        elif chrom_inheritance in ["XChrMale", "XChrFemale", "YChrMale"]:
            finder = Allosomal(variants, self.family, self.known_genes, gene,
                self.cnv_regions)
        else:
            # fail with a clear message, rather than an UnboundLocalError from
            # using an inheritance checker that was never defined
            raise ValueError("unknown inheritance type '{}' for gene {}".format(
                chrom_inheritance, gene))
        
        variants = finder.get_candidate_variants()
        
        # convert the check types and inheritances to fresh lists, and record
        # which gene the variant was flagged for
        return [(x[0], list(x[1]), list(x[2]), [gene]) for x in variants]
    
    def exclude_duplicates(self, variants):
        """ rejig variants included under multiple inheritance mechanisms
        
        Args:
            variants: list of candidate variants, as (variant, check_type,
                inheritance, hgnc) tuples
        
        Returns:
            list of (variant, check_type, inheritance, hgnc) tuples, with
            duplicates excluded, and originals modified to show all the
            mechanisms and genes.
        """
        
        unique_vars = {}
        for variant in variants:
            key = variant[0].child.get_key()
            if key not in unique_vars:
                unique_vars[key] = list(variant)
                continue
            
            merged = unique_vars[key]
            
            # append the check type, inheritance type and HGNC symbols that
            # are new to the first instance of the variant
            for pos in (1, 2, 3):
                merged[pos] += [x for x in variant[pos] if x not in merged[pos]]
        
        return [tuple(unique_vars[x]) for x in unique_vars]
class TestReportPy(unittest.TestCase):
    """ test the Report class
    """

    # column header line shared by the input and the expected VCF headers.
    # Every column, including the sample ID, is tab-separated.
    _COLUMN_LINE = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_id\n"

    def setUp(self):
        """ define a family and variant, and start the Report class
        """

        # generate a test family
        child_gender = "F"
        mom_aff = "1"
        dad_aff = "1"

        self.trio = self.create_family(child_gender, mom_aff, dad_aff)

        # generate a test variant
        child_var = self.create_snv(child_gender, "0/1")
        mom_var = self.create_snv("F", "0/0")
        dad_var = self.create_snv("M", "0/0")

        var = TrioGenotypes(child_var)
        var.add_mother_variant(mom_var)
        var.add_father_variant(dad_var)
        self.variants = [var]

        self.report = Report(None, None, None, None)
        self.report.family = self.trio

    def create_snv(self, gender, genotype):
        """ create a default variant

        Args:
            gender: gender code, passed to the variant's set_gender().
            genotype: genotype string for the sample GT field, e.g. "0/1".

        Returns:
            SNV object on chrX, with its info, format, gender and genotype set.
        """

        chrom = "X"
        pos = "15000000"
        snp_id = "."
        ref = "A"
        alt = "G"
        qual = "50"
        filt = "PASS"

        # set up a SNV object, since SNV inherits VcfInfo
        var = SNV(chrom, pos, snp_id, ref, alt, filt)

        info = "HGNC=TEST;CQ=missense_variant;random_tag;EUR_AF=0.0005"
        format_keys = "GT:DP"
        sample_values = genotype + ":50"

        var.vcf_line = [chrom, pos, snp_id, ref, alt, qual, filt, info,
            format_keys, sample_values]

        var.add_info(info)
        var.add_format(format_keys, sample_values)
        var.set_gender(gender)
        var.set_genotype()

        return var

    def create_family(self, child_gender, mom_aff, dad_aff):
        """ create a default family, with optional gender and parental statuses

        Args:
            child_gender: gender code for the child.
            mom_aff: affected status code for the mother.
            dad_aff: affected status code for the father.

        Returns:
            Family object with a child, mother and father added.
        """

        fam = Family("test")
        fam.add_child("child", "child_vcf", "2", child_gender)
        fam.add_mother("mother", "mother_vcf", mom_aff, "2")
        fam.add_father("father", "father_vcf", dad_aff, "1")
        fam.set_child()

        return fam

    def _vcf_header(self):
        """ get the initial VCF header lines used as input by the header tests
        """

        # NOTE(review): the first line has four hashes, where the VCF format
        # uses "##fileformat". Kept as in the original fixture - confirm.
        return [
            "####fileformat=VCFv4.1\n",
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
            self._COLUMN_LINE,
        ]

    def _vcf_provenance(self):
        """ get the (checksum, basename, date) provenances for the trio
        """

        return [
            ("checksum", "proband.calls.date.vcf.gz", "2014-01-01"),
            ("checksum", "mother.calls.date.vcf.gz", "2014-01-02"),
            ("checksum", "father.calls.date.vcf.gz", "2014-01-03"),
        ]

    def _expected_vcf_header(self):
        """ get the header lines expected from _vcf_header() and _vcf_provenance()
        """

        return [
            "####fileformat=VCFv4.1\n",
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
            '##INFO=<ID=ClinicalFilterType,Number=.,Type=String,Description="The type of clinical filter that passed this variant.">\n',
            '##INFO=<ID=ClinicalFilterGeneInheritance,Number=.,Type=String,Description="The inheritance mode (Monoallelic, Biallelic etc) under which the variant was found.">\n',
            '##FORMAT=<ID=INHERITANCE_GENOTYPE,Number=.,Type=String,Description="The 012 coded genotypes for a trio (child, mother, father).">\n',
            '##FORMAT=<ID=INHERITANCE,Number=.,Type=String,Description="The inheritance of the variant in the trio (biparental, paternal, maternal, deNovo).">\n',
            "##ClinicalFilterRunDate={0}\n".format(datetime.date.today()),
            "##ClinicalFilterVersion=XXX\n",
            "##ClinicalFilterHistory=single_variant,compound_het\n",
            "##UberVCF_proband_Id=proband\n",
            "##UberVCF_proband_Checksum=checksum\n",
            "##UberVCF_proband_Basename=proband.calls.date.vcf.gz\n",
            "##UberVCF_proband_Date=2014-01-01\n",
            "##UberVCF_maternal_Id=mother\n",
            "##UberVCF_maternal_Checksum=checksum\n",
            "##UberVCF_maternal_Basename=mother.calls.date.vcf.gz\n",
            "##UberVCF_maternal_Date=2014-01-02\n",
            "##UberVCF_paternal_Id=father\n",
            "##UberVCF_paternal_Checksum=checksum\n",
            "##UberVCF_paternal_Basename=father.calls.date.vcf.gz\n",
            "##UberVCF_paternal_Date=2014-01-03\n",
            self._COLUMN_LINE,
        ]

    def test__get_provenance(self):
        """ check that _get_provenance() works correctly
        """

        prov = ["checksum", "sample.calls.date.vcf.gz", "2014-01-01"]
        member = "proband"

        self.assertEqual(self.report._get_provenance(prov, member),
            ["##UberVCF_proband_Id=sample\n",
            "##UberVCF_proband_Checksum=checksum\n",
            "##UberVCF_proband_Basename=sample.calls.date.vcf.gz\n",
            "##UberVCF_proband_Date=2014-01-01\n"])

    def test__get_vcf_export_path(self):
        """ check that _get_vcf_export_path() works correctly
        """

        # use a folder to place the VCF file in, which means we join the
        # proband ID to get a full path
        self.report.export_vcf = os.getcwd()
        self.assertEqual(self.report._get_vcf_export_path(),
            os.path.join(os.getcwd(), "child.vcf.gz"))

        # define an unusable directory, to raise an error
        self.report.export_vcf = os.getcwd() + "asjhfgasjhfg"
        self.assertRaises(ValueError, self.report._get_vcf_export_path)

        # define a specific path for a VCF file, which is returned directly
        self.report.export_vcf = os.path.join(os.getcwd(), "sample_id.vcf.gz")
        self.assertEqual(self.report._get_vcf_export_path(),
            self.report.export_vcf)

    def test__make_vcf_header(self):
        """ check that _make_vcf_header() works correctly
        """

        header = self._vcf_header()
        provenance = self._vcf_provenance()
        processed_header = self._expected_vcf_header()

        # check that the standard function returns the expected value. Note
        # that this does not check the output when self.known_genes_date is
        # not None, nor when the _clinicalFilterVersion is available
        self.assertEqual(self.report._make_vcf_header(header, provenance),
            processed_header)

    def test__get_parental_inheritance(self):
        """ check that _get_parental_inheritance() works correctly
        """

        var = self.variants[0]

        # check for the default genotypes
        self.assertEqual(self.report._get_parental_inheritance(var), "deNovo")

        # check when only the mother is non-ref
        var.mother.genotype = 1
        self.assertEqual(self.report._get_parental_inheritance(var), "maternal")

        # check when both parents are non-ref
        var.father.genotype = 1
        self.assertEqual(self.report._get_parental_inheritance(var), "biparental")

        # check when only the father is non-ref
        var.mother.genotype = 0
        self.assertEqual(self.report._get_parental_inheritance(var), "paternal")

        # check when the proband lacks parental information
        self.report.family.father = None
        self.report.family.mother = None
        self.assertEqual(self.report._get_parental_inheritance(var), "unknown")

    def test__get_vcf_lines(self):
        """ check that _get_vcf_lines() works correctly
        """

        header = self._vcf_header()
        provenance = self._vcf_provenance()

        # define what the header will become
        vcf_lines = self._expected_vcf_header()

        # define what the default variant vcf line will become
        line = ["X\t15000000\t.\tA\tG\t50\tPASS\tHGNC=TEST;CQ=missense_variant;random_tag;EUR_AF=0.0005;ClinicalFilterGeneInheritance=Monoallelic;ClinicalFilterType=single_variant\tGT:DP:INHERITANCE:INHERITANCE_GENOTYPE\t0/1:50:deNovo:1,0,0\n"]

        # check that a list of one variant produces the correct VCF output.
        # Note that we haven't checked against CNVs, which can change the
        # INHERITANCE_GENOTYPE flag, nor have we tested a larger list of
        # variants
        var = (self.variants[0], "single_variant", "Monoallelic")
        self.assertEqual(self.report._get_vcf_lines([var], header, provenance),
            vcf_lines + line)

    def test__get_output_line(self):
        """ check that _get_output_line() works correctly
        """

        var = (self.variants[0], "single_variant", "Monoallelic")
        dad_aff = "0"
        mom_aff = "1"
        alt_id = "test_id"

        # check the output for the default variant
        expected = "child\ttest_id\tF\tX\t15000000\tTEST\tNA\tNA\tmissense_variant\tA/G\t0.0005\tMonoallelic\t1/0/0\t1\t0\tsingle_variant\n"
        self.assertEqual(
            self.report._get_output_line(var, dad_aff, mom_aff, alt_id),
            expected)

        # introduce additional info for the output line parsing, check the
        # line that is returned is expected
        var[0].child.info["PolyPhen"] = "probably_damaging(0.99)"
        var[0].child.info["SIFT"] = "deleterious(0)"
        var[0].child.info["ENST"] = "ENST00X"
        expected = "child\ttest_id\tF\tX\t15000000\tTEST\tNA\tENST00X\tmissense_variant,PolyPhen=probably_damaging(0.99),SIFT=deleterious(0)\tA/G\t0.0005\tMonoallelic\t1/0/0\t1\t0\tsingle_variant\n"
        self.assertEqual(
            self.report._get_output_line(var, dad_aff, mom_aff, alt_id),
            expected)