def test_parse_protein_mutations(self):
    """Check that protein mutations are pulled out of a VCF INFO string.

    The sample INFO value carries eight ANN entries: four
    ``synonymous_variant`` entries with a ``p.Ser3Ser`` field and four
    ``upstream_gene_variant`` entries whose protein field is empty.
    """
    info_field = (
        "QNAME=hCoV-19_Guangdong_2020XN4373-P0039_2020_EPI_ISL_413851_2020-01-30;QSTART=274;QSTRAND=+;ANN="
        "T|synonymous_variant|LOW|ORF1ab|GU280_gp01|transcript|GU280_gp01|"
        "protein_coding|1/2|c.9C>T|p.Ser3Ser|9/21291|9/21291|3/7096||,"
        "T|synonymous_variant|LOW|ORF1ab|GU280_gp01|transcript|YP_009725297.1|"
        "protein_coding|1/1|c.9C>T|p.Ser3Ser|9/540|9/540|3/179||WARNING_TRANSCRIPT_NO_STOP_CODON,"
        "T|synonymous_variant|LOW|ORF1ab|GU280_gp01|transcript|YP_009742608.1|"
        "protein_coding|1/1|c.9C>T|p.Ser3Ser|9/540|9/540|3/179||WARNING_TRANSCRIPT_NO_STOP_CODON,"
        "T|synonymous_variant|LOW|ORF1ab|GU280_gp01|transcript|GU280_gp01.2|"
        "protein_coding|1/1|c.9C>T|p.Ser3Ser|9/13218|9/13218|3/4405||,"
        "T|upstream_gene_variant|MODIFIER|ORF1ab|GU280_gp01|transcript|YP_009725298.1|"
        "protein_coding||c.-532C>T|||||532|WARNING_TRANSCRIPT_NO_START_CODON,"
        "T|upstream_gene_variant|MODIFIER|ORF1ab|GU280_gp01|transcript|YP_009742609.1|"
        "protein_coding||c.-532C>T|||||532|WARNING_TRANSCRIPT_NO_START_CODON,"
        "T|upstream_gene_variant|MODIFIER|ORF1ab|GU280_gp01|transcript|YP_009725299.1|"
        "protein_coding||c.-2446C>T|||||2446|WARNING_TRANSCRIPT_NO_START_CODON,"
        "T|upstream_gene_variant|MODIFIER|ORF1ab|GU280_gp01|transcript|YP_009742610.1|"
        "protein_coding||c.-2446C>T|||||2446|WARNING_TRANSCRIPT_NO_START_CODON"
    )

    parser = vcf_parser.VcfParser()
    protein_muts = parser._extract_protein_mutations(info_field)

    # Four mutations are expected; the first and last are spot-checked.
    expected_mutation = 'p.Ser3Ser'
    self.assertEqual(len(protein_muts), 4)
    self.assertEqual(protein_muts[0], expected_mutation)
    self.assertEqual(protein_muts[-1], expected_mutation)
def test_load_with_snpeff_data__protein(self):
    """Load TEST_VCF_FILE2, extract protein mutations and write them out.

    Only the shape of the loaded frame is asserted; the extraction and
    write steps pass as long as they do not raise.
    """
    parser = vcf_parser.VcfParser()
    parser.read_vcf_file(TEST_VCF_FILE2)

    expected_shape = (500, 10)
    self.assertEqual(parser.df_vcf.shape, expected_shape)

    # Extract with the strict check disabled and print a short preview.
    protein_muts = parser.get_protein_mutations(is_strict_check=False)
    preview = protein_muts[:5]
    print(preview)

    # The current timestamp is embedded in the output file name.
    timestamp = time.time()
    out_path = f'__out_{timestamp}__ut_mutations_vcf2-protein.txt'
    parser.write_mutations_to_file(protein_muts, out_path)
def test_load(self):
    """Load TEST_VCF_FILE1, extract its mutations and write them out.

    Only the shape of the loaded frame is asserted; the extraction and
    write steps pass as long as they do not raise.
    """
    parser = vcf_parser.VcfParser()
    parser.read_vcf_file(TEST_VCF_FILE1)

    expected_shape = (19756, 10)
    self.assertEqual(parser.df_vcf.shape, expected_shape)

    # Extract the mutations and print a short preview.
    mutations = parser.get_mutations()
    preview = mutations[:5]
    print(preview)

    # The current timestamp is embedded in the output file name.
    timestamp = time.time()
    out_path = f'__out_{timestamp}__ut_mutations_vcf1.txt'
    parser.write_mutations_to_file(mutations, out_path)
def get_mutations_from_file(vcf_file_name, check_protein_mutations=False, verbose=0):
    """Parse a VCF file and return the mutations found in it.

    Args:
        vcf_file_name: Path of the VCF file to read.
        check_protein_mutations: When true, mutations are obtained with
            ``get_protein_mutations(is_strict_check=False)``; otherwise
            with ``get_mutations()``.
        verbose: Forwarded unchanged to the parser calls.

    Returns:
        The list of mutations produced by the parser.

    Raises:
        FileNotFoundError: If ``vcf_file_name`` is not an existing file.
        Exception: If reading the file or extracting mutations fails, or
            if the parser returns something other than a list. The
            message is prefixed with ``ERROR while parsing vcf file:``
            and the original error is chained as ``__cause__``.
    """
    if not os.path.isfile(vcf_file_name):
        raise FileNotFoundError(f"Cannot find VCF file: {vcf_file_name}")

    # Parse VCF into sequence of mutations
    vcf_parser_obj = vcf_parser.VcfParser()
    try:
        vcf_parser_obj.read_vcf_file(vcf_file_name, verbose=verbose)
        if check_protein_mutations:
            vcf_mutations = vcf_parser_obj.get_protein_mutations(
                is_strict_check=False, verbose=verbose)
        else:
            vcf_mutations = vcf_parser_obj.get_mutations(verbose=verbose)
        # Explicit check rather than `assert`: asserts are stripped under
        # `python -O` and give an empty message when they do fire.
        if not isinstance(vcf_mutations, list):
            raise TypeError(
                f"expected a list of mutations, got {type(vcf_mutations).__name__}")
    except Exception as e:
        # Re-raise with additional context, keeping the original error
        # chained so its traceback is not lost.
        raise Exception(f'ERROR while parsing vcf file: {e}') from e

    return vcf_mutations
def test_convert_protein_mutations(self):
    """Check conversion of protein mutations from 3-letter to 1-letter codes.

    Covers two well-formed mutations, then three malformed inputs that
    must each raise ``ValueError`` with a specific message.
    """
    converter = vcf_parser.VcfParser()

    three_letter = ['p.Ser3Ser', 'p.Thr5262Ile']
    one_letter = converter.convert_protein_mutations_from_3_to_1_letters(three_letter)
    self.assertEqual(one_letter, ['S3S', 'T5262I'])

    # (malformed mutation, regex the ValueError message must match)
    malformed_cases = (
        ('x.Ser3Ser', 'Unexpected format'),       # bad first letter ('x')
        ('p.Xxx3Ser', 'Cannot recognize acid1'),  # non-existing first acid
        ('p.Ser3Xxx', 'Cannot recognize acid2'),  # non-existing second acid
    )
    for bad_mutation, message_pattern in malformed_cases:
        with self.assertRaisesRegex(ValueError, message_pattern):
            converter.convert_protein_mutations_from_3_to_1_letters([bad_mutation])