def test_create_vcf_from_sequences(self, temp_mock, ctx_tempfile_mock, ctx_tempdir_mock):
    """Check that create_vcf_from_sequences yields a 5-record VCF and cleans up.

    ``temp_mock.NamedTemporaryFile`` is wired to return a real temporary
    file created by this test, so the file is expected to still exist
    after the builder has run. The two ``ctx_*`` mocks get side effects
    built from ``temp_files`` / ``temp_folders``; the test expects exactly
    one entry in each list afterwards and expects neither path to exist
    any more.

    NOTE(review): the three mock arguments are presumably injected by
    ``patch`` decorators that are not visible here -- confirm.
    """
    builder = SNPFeatureBuilder()
    temp_vcf_file = tempfile.NamedTemporaryFile('w', delete=False)
    try:
        temp_files = []
        temp_folders = []
        ctx_tempfile_mock.side_effect = create_context_aware_tempfile_mock(temp_files)
        ctx_tempdir_mock.side_effect = create_context_aware_tempdir_mock(temp_folders)
        temp_mock.NamedTemporaryFile.return_value = temp_vcf_file

        fasta_filename = os.path.join(test_data(), 'file_with_SNPs.aln')
        # Keep the FASTA handle open for the whole check, as the original
        # test did, but guarantee it is closed even when an assertion fails.
        with open(fasta_filename, 'r') as fasta_file:
            builder.load_fasta_sequences(fasta_file)
            builder.create_vcf_from_sequences()

            # The intermediate file and folder must be gone again.
            self.assertEqual(len(temp_files), 1)
            self.assertFalse(os.path.isfile(temp_files[0]))
            self.assertEqual(len(temp_folders), 1)
            self.assertFalse(os.path.isdir(temp_folders[0]))

            # The VCF file itself must survive for later processing.
            self.assertTrue(os.path.isfile(temp_vcf_file.name))

            builder.vcf_input_file.seek(0)
            records = SNPSitesReader(builder.vcf_input_file)
            number_of_records = sum(1 for _ in records)
            self.assertEqual(number_of_records, 5)
    finally:
        # delete=False means nobody else removes this file; clean it up
        # whether or not the assertions above passed.
        temp_vcf_file.close()
        if os.path.exists(temp_vcf_file.name):
            os.remove(temp_vcf_file.name)
def test_next_with_GT(self):
    """Parse a file which is already in Genotype format.

    Reads the first record and checks which samples carry a genotype
    other than '0'.
    """
    vcf_filename = os.path.join(test_data(), 'file_with_SNPs_in_GT_format.aln.vcf')
    expected = ['3002_8_1', '3002_8_2', '3002_8_6', '4056_2_10',
                '4056_2_4', '4056_8_6', '5174_5_1', '5174_5_7',
                '5174_5_9', '5174_6_10', '5174_7_1', '5174_8_5']
    # The context manager closes the handle even if parsing raises; the
    # original left it open for the garbage collector.
    with open(vcf_filename, 'r') as vcf_file:
        reader = SNPSitesReader(vcf_file)
        record = reader.next()
        samples_with_alternative_bases = [
            sample.sample
            for sample in record.samples
            if sample.data.GT != '0'
        ]
    self.assertItemsEqual(samples_with_alternative_bases, expected)
def test_next(self):
    """Parse a file with Alternate Base info.

    File was created using snp-sites. Reads the first record and checks
    which samples end up with a genotype other than '0'.
    """
    vcf_filename = os.path.join(test_data(), 'file_with_SNPs.aln.vcf')
    expected = ['3002_8_1', '3002_8_2', '3002_8_6', '4056_2_10',
                '4056_2_4', '4056_8_6', '5174_5_1', '5174_5_7',
                '5174_5_9', '5174_6_10', '5174_7_1', '5174_8_5']
    # The context manager closes the handle even if parsing raises; the
    # original left it open for the garbage collector.
    with open(vcf_filename, 'r') as vcf_file:
        reader = SNPSitesReader(vcf_file)
        record = reader.next()
        samples_with_alternative_bases = [
            sample.sample
            for sample in record.samples
            if sample.data.GT != '0'
        ]
    self.assertItemsEqual(samples_with_alternative_bases, expected)
def test_ammend_line(self):
    """Check the line rewriting done by SNPSitesReader._amend_line.

    With reference base C and alternatives T,G, sample columns holding
    the bases ``G``, ``.`` and ``T`` are expected to become ``2``, ``0``
    and ``1``, with the two preceding columns rewritten from ``AB`` / ``.``
    to ``.`` / ``GT``. A line whose format column is already ``GT`` must
    come back unchanged.
    """
    snp_sites_old_bases = SNPSitesReader.__bases__
    SNPSitesReader.__bases__ = (MagicMock,)  # I don't want to test vcf.Reader
    try:
        reader = SNPSitesReader()
        reader._separator = "\t| +"

        # Tab separated input.
        line = "0\t1\t2\tC\tT,G\t5\t6\tAB\t.\tG\t.\tT"
        expected = "0\t1\t2\tC\tT,G\t5\t6\t.\tGT\t2\t0\t1"
        self.assertEqual(reader._amend_line(line), expected)

        # It would be elegant if it maintained the separator but this is
        # unlikely to cause big issues, hopefully
        line = "0 1 2 C T,G 5 6 AB . G . T"
        expected = "0\t1\t2\tC\tT,G\t5\t6\t.\tGT\t2\t0\t1"
        self.assertEqual(reader._amend_line(line), expected)

        # Mixed tab and space separators.
        line = "0 1 2 C T,G\t5\t6 AB . G . T"
        expected = "0\t1\t2\tC\tT,G\t5\t6\t.\tGT\t2\t0\t1"
        self.assertEqual(reader._amend_line(line), expected)

        # A line that is already in GT format is passed through as is.
        line = "0\t1\t2\tC\tT,G\t5\t6\tAB\tGT\t2\t0\t1"
        expected = "0\t1\t2\tC\tT,G\t5\t6\tAB\tGT\t2\t0\t1"
        self.assertEqual(reader._amend_line(line), expected)
    finally:
        # Always undo the class-level patch; otherwise one failing
        # assertion would leave SNPSitesReader broken for later tests.
        SNPSitesReader.__bases__ = snp_sites_old_bases
str(new_amino_acid)) consequence = Consequence(alternative_base, Consequence=consequence_type, Protein_position=position_in_protein, Amino_acids=amino_acid_change, STRAND=strand) consequences.append(consequence) return consequences if __name__ == '__main__': args = get_arguments() vcf_input_file, gff_input_file, fasta_input_file, vcf_output_file = get_file_handles( args) vcf_input_reader = SNPSitesReader(vcf_input_file) add_consequences_info_header(vcf_input_reader) add_GT_format_header(vcf_input_reader) remove_AB_info_header(vcf_input_reader) vcf_output_writer = vcf.Writer(vcf_output_file, vcf_input_reader) sequence = Bio.SeqIO.parse(fasta_input_file, 'fasta').next() feature_index = build_feature_index(gff_input_file) chromosome_name_in_vcf = '1' chromosome_name_in_gff = 'Salmonella_enterica_subsp_enterica_serovar_Typhi_str_CT18_v1|SC|contig000001' for record in vcf_input_reader: if record.CHROM == chromosome_name_in_vcf: matching_cds = get_matching_CDS(record, feature_index, chromosome_name_in_gff)