def test_get_allele_frequencies(self):
    """
    The allele frequencies returned for the first variant of the VEP
    annotated test vcf must match the expected list of 17 floats.
    """
    vcf_path = 'VariantDatabase/tests/test_files/vcfs/vep_annotated_test_vcf.vcf'
    variants = vcf_parser.create_master_list(vcf_path, 'WS61594_14000835')
    transcript_data = variants[0]['transcript_data']

    expected = [
        0.3391,  # AF
        0.2859,  # AFR_AF
        0.4539,  # AMR_AF
        0.4274,  # EUR_AF
        0.253,   # EAS_AF
        0.3272,  # SAS_AF
        0.392,   # ExAC_AF
        0.4176,  # ExAC_Adj_AF
        0.3196,  # ExAC_AFR_AF
        0.43,    # ExAC_AMR_AF
        0.2793,  # ExAC_EAS_AF
        0.5536,  # ExAC_FIN_AF
        0.4469,  # ExAC_NFE_AF
        0.4538,  # ExAC_OTH_AF
        0.3747,  # ExAC_SAS_AF
        0.2942,  # AA_AF
        0.4278,  # EA_AF
    ]

    actual = vcf_parser.get_allele_frequencies(transcript_data)
    self.assertEqual(actual, expected)
def test_get_rs_number(self):
    """
    get_rs_number is checked against the first variant of the test vcf
    and against three hand-built transcript dictionaries: two populated
    'Existing_variation' values, one empty plus one populated value, and
    no 'Existing_variation' key at all (expected result is the string
    "None").
    """
    vcf_path = 'VariantDatabase/tests/test_files/vcfs/vep_annotated_test_vcf.vcf'
    variants = vcf_parser.create_master_list(vcf_path, 'WS61594_14000835')

    # (transcript data, expected rs number string), checked in this order.
    cases = [
        (
            variants[0]['transcript_data'],
            'rs3795269',
        ),
        (
            {
                'NM_002617.3': {'Existing_variation': 'gene1'},
                'NM_002614.3': {'Existing_variation': 'gene2'},
            },
            'gene1|gene2',
        ),
        (
            {
                'NM_002617.3': {'Existing_variation': ''},
                'NM_002614.3': {'Existing_variation': 'gene2'},
            },
            'gene2',
        ),
        (
            {
                'NM_002617.3': {'gene1': ''},
                'NM_002614.3': {'test': 'gene2'},
            },
            "None",
        ),
    ]

    for transcript_data, expected in cases:
        self.assertEqual(vcf_parser.get_rs_number(transcript_data), expected)
def test_get_max_af(self):
    """
    get_max_af must return 0.5536 for the first variant of the test vcf,
    the float value of a populated 'MAX_AF' string, 0.0 for an empty
    'MAX_AF' string, and raise KeyError when the 'MAX_AF' key is absent.
    """
    vcf_path = 'VariantDatabase/tests/test_files/vcfs/vep_annotated_test_vcf.vcf'
    variants = vcf_parser.create_master_list(vcf_path, 'WS61594_14000835')

    from_vcf = variants[0]['transcript_data']
    populated = {'NM_002617.3': {'MAX_AF': '0.01'}}
    empty_value = {'NM_002617.3': {'MAX_AF': ''}}
    missing_key = {'NM_002617.3': {'NO_MAX_AF': ''}}

    value_cases = (
        (from_vcf, 0.5536),
        (populated, 0.01),
        (empty_value, 0.0),
    )
    for transcript_data, expected in value_cases:
        self.assertEqual(vcf_parser.get_max_af(transcript_data), expected)

    self.assertRaises(KeyError, vcf_parser.get_max_af, missing_key)
def test_get_variant_genes_list(self):
    """
    get_variant_genes_list must return the (symbol, strand) pairs found in
    the transcript data, skipping transcripts whose 'SYMBOL' is empty.

    All three comparisons ignore ordering: the inputs are dictionaries, and
    the original test already compared two of the three results as sets.
    Sorting both sides (rather than converting to sets) ignores order but
    still fails if the function returns an unexpected duplicate.
    """
    master_list = vcf_parser.create_master_list(
        'VariantDatabase/tests/test_files/vcfs/vep_annotated_test_vcf.vcf',
        'WS61594_14000835')

    input1 = master_list[0]['transcript_data']
    output1 = [('PEX10', '-1'), ('RER1', '1')]

    input2 = {
        'NM_002617.3': {'SYMBOL': 'gene1', 'STRAND': '1'},
        'NM_002614.3': {'SYMBOL': 'gene2', 'STRAND': '1'},
    }
    output2 = [('gene1', '1'), ('gene2', '1')]

    # An empty SYMBOL is expected to be left out of the result.
    input3 = {
        'NM_002617.3': {'SYMBOL': '', 'STRAND': '1'},
        'NM_002614.3': {'SYMBOL': 'gene2', 'STRAND': '1'},
    }
    output3 = [('gene2', '1')]

    for transcript_data, expected in ((input1, output1),
                                      (input2, output2),
                                      (input3, output3)):
        self.assertEqual(
            sorted(vcf_parser.get_variant_genes_list(transcript_data)),
            sorted(expected))
def test_get_clin_sig(self):
    """
    get_clin_sig must return the string 'None' for the first variant of
    the test vcf, echo a populated 'CLIN_SIG' value, raise KeyError when
    the 'CLIN_SIG' key is absent, and return 'benign' for the sixth
    variant of the test vcf.
    """
    vcf_path = 'VariantDatabase/tests/test_files/vcfs/vep_annotated_test_vcf.vcf'
    variants = vcf_parser.create_master_list(vcf_path, 'WS61594_14000835')

    first_variant = variants[0]['transcript_data']
    sixth_variant = variants[5]['transcript_data']
    missing_key = {'NM_002617.3': {'NO_CLIN_SIG': 'test4'}}

    leading_cases = [
        (first_variant, 'None'),
        ({'NM_002617.3': {'CLIN_SIG': 'test2'}}, 'test2'),
        ({'NM_002617.3': {'CLIN_SIG': 'test3'}}, 'test3'),
    ]
    for transcript_data, expected in leading_cases:
        self.assertEqual(vcf_parser.get_clin_sig(transcript_data), expected)

    self.assertRaises(KeyError, vcf_parser.get_clin_sig, missing_key)

    self.assertEqual(vcf_parser.get_clin_sig(sixth_variant), 'benign')
# Variant model field names, in the order in which
# vcf_parser.get_allele_frequencies() returns the matching values.
_ALLELE_FREQUENCY_FIELDS = (
    'af', 'afr_af', 'amr_af', 'eur_af', 'eas_af', 'sas_af',
    'exac_af', 'exac_adj_af', 'exac_afr_af', 'exac_amr_af', 'exac_eas_af',
    'exac_fin_af', 'exac_nfe_af', 'exac_oth_af', 'exac_sas_af',
    'esp_aa_af', 'esp_ea_af',
)


def _find_sample_vcf(output_dir, sample_name):
    """
    Return the path of the single '<sample_name>*.vcf.gz' file found in a
    'vcfs*' sub-directory of output_dir.

    Raises CommandError if no file, or more than one file, matches.
    """
    import os

    # os.path.join so that output_dir works with or without a trailing slash.
    query = os.path.join(output_dir, "vcfs*", sample_name + "*.vcf.gz")
    matches = glob.glob(query)

    if not matches:
        raise CommandError("Could not find a vcf file for sample: " +
                           sample_name)

    if len(matches) > 1:
        raise CommandError("Found more than one file for sample: " +
                           sample_name)

    return matches[0]


def _get_or_create_transcript(transcript_key, annotation):
    """
    Return the Transcript model for transcript_key, creating it from the
    VEP annotation dictionary of that transcript if it is not in the DB yet.
    """
    # Annotations without a transcript id share one placeholder transcript.
    if transcript_key == "":
        transcript_model, _ = Transcript.objects.get_or_create(
            name='no_transcript', defaults={'canonical': False})
        return transcript_model

    try:
        return Transcript.objects.get(name=transcript_key)
    except Transcript.DoesNotExist:
        pass

    fields = {
        'name': transcript_key,
        'canonical': annotation['CANONICAL'] == 'YES',
    }

    # Only link a gene when VEP reported a symbol for this transcript.
    symbol = annotation['SYMBOL']
    if symbol != "":
        fields['gene'] = Gene.objects.get(name=symbol)

    transcript_model = Transcript(**fields)
    transcript_model.save()

    return transcript_model


def upload_sample_vcf(output_dir, sample_name):
    """
    Takes a VEP annotated vcf and uploads the variants contained within it.

    Input:

    output_dir = The directory containing the pipeline output. The vcf is
                 looked up as <output_dir>/vcfs*/<sample_name>*.vcf.gz
    sample_name = A string representing the unique sample name.

    Output:

    None - uploads and returns None.

    Raises:

    CommandError - if the sample is not in the DB, variants have already
                   been uploaded against it, the vcf cannot be found
                   unambiguously, the vcf fails validation or has no VEP
                   annotations, no user with pk=1 exists, or the vcf
                   cannot be parsed.

    NOTE(review): the rows are saved one by one without a transaction, so a
    failure part way through leaves a partial upload behind and the
    'already uploaded' check below will then refuse a retry. Consider
    wrapping the upload in a database transaction.
    """

    # Get the sample - if we cannot find it then raise an error.
    try:
        sample = Sample.objects.get(name=sample_name)
    except Sample.DoesNotExist:
        raise CommandError("Cannot find a sample in the DB with name " +
                           sample_name + ".")

    if VariantSample.objects.filter(sample=sample).exists():
        raise CommandError("Stuff already uploaded against this sample.")

    # Find VCF file
    vcf_file_path = _find_sample_vcf(output_dir, sample_name)

    # Validate vcf
    validation_report = vcf_parser.validate_input(vcf_file_path, sample_name)
    if not validation_report[0]:
        raise CommandError('Error opening vcf file: ' + validation_report[1])

    if not vcf_parser.vep_annotated(vcf_file_path):
        raise CommandError("No VEP annotations detected in vcf")

    # Check we have an admin user in the database for the next stage.
    # A superuser has to have been created.
    try:
        User.objects.get(pk=1)
    except User.DoesNotExist:
        raise CommandError(
            'Could not find an appropiate user to use for downstream data entry - please create an admin with pk=1'
        )

    # Try and parse the vcf using the vcf_parser. Exception (not a bare
    # except) so that KeyboardInterrupt/SystemExit are not swallowed.
    try:
        vcf_data = vcf_parser.create_master_list(vcf_file_path, sample_name)
    except Exception:
        raise CommandError('Could not process data using vcf_parser function')

    # Update sample information e.g. vcf location.
    sample.vcf_file = vcf_file_path
    sample.save()

    for variant in vcf_data:

        transcript_data = variant['transcript_data']

        chromosome = variant['chrom']
        pos = str(variant['pos'])
        ref = variant['reference']
        alt = variant['alt_alleles'][0]  # only the first alt allele is used

        hash_id = variant_utilities.get_variant_hash(chromosome, pos, ref, alt)

        gene_list = vcf_parser.get_variant_genes_list(transcript_data)
        rs_number = vcf_parser.get_rs_number(transcript_data)
        worst_consequence = Consequence.objects.get(
            name=vcf_parser.worst_consequence(transcript_data))
        max_af = vcf_parser.get_max_af(transcript_data)
        allele_frequencies = vcf_parser.get_allele_frequencies(transcript_data)
        clin_sig = vcf_parser.get_clin_sig(transcript_data)

        # Indexing (rather than zip) keeps the IndexError if the parser
        # ever returns fewer values than there are model fields.
        frequency_fields = {
            field: allele_frequencies[index]
            for index, field in enumerate(_ALLELE_FREQUENCY_FIELDS)
        }

        # Look for the variant in the database; if we have not seen it
        # before create a new one.
        try:
            new_variant = Variant.objects.get(variant_hash=hash_id)
        except Variant.DoesNotExist:
            new_variant = Variant(chromosome=chromosome,
                                  position=pos,
                                  ref=ref,
                                  alt=alt,
                                  variant_hash=hash_id,
                                  rs_number=rs_number,
                                  last_updated=timezone.now(),
                                  worst_consequence=worst_consequence,
                                  max_af=max_af,
                                  clinical_sig=clin_sig,
                                  **frequency_fields)
            new_variant.save()

        # Make sure every gene hit by the variant exists before the
        # transcripts (which link to genes by name) are created.
        for gene_entry in gene_list:
            Gene.objects.get_or_create(name=gene_entry[0])

        # Now create transcripts and the transcript/variant links.
        # NOTE(review): VariantTranscript rows are created even when the
        # variant already existed, so a variant shared between samples may
        # end up with repeated rows - confirm whether that is intended.
        # NOTE(review): 'CDS_position' is present in the annotation but is
        # not stored; confirm whether VariantTranscript should hold it.
        for transcript_key, annotation in transcript_data.items():

            transcript_model = _get_or_create_transcript(transcript_key,
                                                         annotation)

            variant_transcript = VariantTranscript(
                variant=new_variant,
                transcript=transcript_model,
                consequence=annotation['Consequence'],
                exon=annotation['EXON'],
                intron=annotation['INTRON'],
                hgvsc=annotation['HGVSc'],
                hgvsp=annotation['HGVSp'],
                codons=annotation['Codons'],
                cdna_position=annotation['cDNA_position'],
                protein_position=annotation['Protein_position'],
                amino_acids=annotation['Amino_acids'],
                picked=annotation['PICK'] == '1')
            variant_transcript.save()

        # Finally link the variant to this sample.
        new_variant_sample = VariantSample(
            variant=new_variant,
            sample=sample,
            genotype=variant['genotype'],
            caller=variant['Caller'],
            allele_depth=variant['allele_depth'],
            filter_status=variant['filter_status'],
            total_count_forward=variant['TCF'],
            total_count_reverse=variant['TCR'],
            vafs=":".join(str(x) for x in variant['VAFS']))
        new_variant_sample.save()

    return None
def test_worst_consequence(self):
    """
    worst_consequence is checked against the first variant of the test vcf
    and against ten hand-built transcript dictionaries, each pairing
    'Consequence' strings (some '&'-joined) with the single consequence
    the function is expected to report.
    """
    vcf_path = 'VariantDatabase/tests/test_files/vcfs/vep_annotated_test_vcf.vcf'
    variants = vcf_parser.create_master_list(vcf_path, 'WS61594_14000835')

    def two_transcripts(first, second):
        # Build transcript data for the two transcript ids used throughout.
        return {
            'NM_002617.3': {'Consequence': first},
            'NM_002614.3': {'Consequence': second},
        }

    # (transcript data, expected worst consequence), checked in this order.
    cases = [
        (variants[0]['transcript_data'], 'intron_variant'),
        (
            two_transcripts('transcript_ablation', 'splice_acceptor_variant'),
            'transcript_ablation',
        ),
        (
            {
                'NM_002617.3': {
                    'Consequence': 'transcript_ablation'
                },
                'NM_002614.3': {
                    'Consequence': 'splice_acceptor_variant&splice_donor_variant'
                },
                'NM_002684.3': {
                    'Consequence': 'inframe_deletion'
                },
            },
            'transcript_ablation',
        ),
        (
            two_transcripts('stop_gained', 'splice_acceptor_variant'),
            'splice_acceptor_variant',
        ),
        (
            two_transcripts('frameshift_variant', 'splice_acceptor_variant'),
            'splice_acceptor_variant',
        ),
        (
            two_transcripts(
                'intron_variant&missense_variant&non_coding_transcript_variant',
                'frameshift_variant'),
            'frameshift_variant',
        ),
        (
            two_transcripts('transcript_amplification', 'inframe_insertion'),
            'transcript_amplification',
        ),
        (
            two_transcripts(
                'inframe_insertion',
                'incomplete_terminal_codon_variant&incomplete_terminal_codon_variant'),
            'inframe_insertion',
        ),
        (
            two_transcripts('feature_elongation', 'intergenic_variant'),
            'feature_elongation',
        ),
        (
            two_transcripts('feature_truncation', 'intergenic_variant'),
            'feature_truncation',
        ),
        (
            two_transcripts('stop_lost', 'stop_lost'),
            'stop_lost',
        ),
    ]

    for transcript_data, expected in cases:
        self.assertEqual(vcf_parser.worst_consequence(transcript_data),
                         expected)
def test_create_master_list(self):
    """
    create_master_list on the VEP annotated test vcf: the first variant
    must carry the expected call fields and the full annotation dictionary
    for transcript NM_002617.3, and the list must hold 281 variants.
    """
    vcf_path = 'VariantDatabase/tests/test_files/vcfs/vep_annotated_test_vcf.vcf'
    variants = vcf_parser.create_master_list(vcf_path, 'WS61594_14000835')
    first = variants[0]

    # Top-level fields of the first variant, checked in this order.
    expected_fields = [
        ('reference', 'C'),
        ('genotype', '0/1'),
        ('alt_alleles', ('A', )),
        ('filter_status', '.'),
        ('hash_id',
         'd360384c2a1df84a02bc9b2f19ee584ed837d600081450beab17762532ce18ba'),
        ('allele_depth', '124:124'),
    ]
    for key, expected in expected_fields:
        self.assertEqual(first[key], expected)

    expected_annotation = {
        'MAX_AF_POPS': 'ExAC_FIN',
        'TSL': '',
        'APPRIS': '',
        'ExAC_AF': '0.392',
        'ExAC_NFE_AF': '0.4469',
        'AMR_AF': '0.4539',
        'SYMBOL': 'PEX10',
        'AFR_AF': '0.2859',
        'ExAC_EAS_AF': '0.2793',
        'Feature': 'NM_002617.3',
        'Codons': '',
        'MOTIF_NAME': '',
        'DOMAINS': '',
        'SIFT': '',
        'VARIANT_CLASS': 'SNV',
        'EA_AF': '0.4278',
        'CDS_position': '',
        'CCDS': '',
        'Allele': 'A',
        'PolyPhen': '',
        'AA_AF': '0.2942',
        'MOTIF_SCORE_CHANGE': '',
        'IMPACT': 'MODIFIER',
        'HGVSp': '',
        'ENSP': 'NP_002608.1',
        'MAX_AF': '0.5536',
        'INTRON': '4/5',
        'ExAC_AFR_AF': '0.3196',
        'Existing_variation': 'rs3795269',
        'HGVSc': 'NM_002617.3:c.776+33G>T',
        'MOTIF_POS': '',
        'HIGH_INF_POS': '',
        'ExAC_FIN_AF': '0.5536',
        'PICK': '',
        'GENE_PHENO': '',
        'ExAC_SAS_AF': '0.3747',
        'UNIPARC': '',
        'cDNA_position': '',
        'PUBMED': '',
        'EAS_AF': '0.253',
        'Feature_type': 'Transcript',
        'AF': '0.3391',
        'ExAC_Adj_AF': '0.4176',
        'ExAC_OTH_AF': '0.4538',
        'HGNC_ID': '',
        'SAS_AF': '0.3272',
        'SWISSPROT': '',
        'FLAGS': '',
        'Consequence': 'intron_variant',
        'Protein_position': '',
        'Gene': '5192',
        'STRAND': '-1',
        'EUR_AF': '0.4274',
        'DISTANCE': '',
        'PHENO': '',
        'SYMBOL_SOURCE': '',
        'Amino_acids': '',
        'ExAC_AMR_AF': '0.43',
        'TREMBL': '',
        'CLIN_SIG': '',
        'REFSEQ_MATCH': '',
        'HGVS_OFFSET': '',
        'BIOTYPE': 'protein_coding',
        'EXON': '',
        'SOMATIC': '',
        'CANONICAL': '',
    }
    self.assertEqual(first['transcript_data']['NM_002617.3'],
                     expected_annotation)

    self.assertEqual(len(variants), 281)