예제 #1
0
  def test_create_vcf_from_sequences(self, temp_mock, ctx_tempfile_mock,
                                     ctx_tempdir_mock):
    """Build a VCF from an aligned FASTA file and check what is left behind.

    The three mock arguments are supplied from outside this method
    (presumably by `mock.patch` decorators -- confirm against the decorators
    above the definition).  After the builder has run, the test expects:

    * exactly one path in each of the lists handed to the context-aware
      tempfile / tempdir mock factories, and neither path still existing;
    * the VCF temp file returned by the patched `NamedTemporaryFile` to still
      exist on disk;
    * 5 records when that VCF is read back through `SNPSitesReader`.
    """
    builder = SNPFeatureBuilder()

    # delete=False: the test itself is responsible for removing this file.
    temp_vcf_file = tempfile.NamedTemporaryFile('w', delete=False)
    try:
      temp_files = []
      temp_folders = []
      ctx_tempfile_mock.side_effect = create_context_aware_tempfile_mock(temp_files)
      ctx_tempdir_mock.side_effect = create_context_aware_tempdir_mock(temp_folders)
      temp_mock.NamedTemporaryFile.return_value = temp_vcf_file

      fasta_filename = os.path.join(test_data(), 'file_with_SNPs.aln')
      # Keep the FASTA handle open for the whole check, as the original did,
      # but guarantee it is closed even when an assertion fails.
      with open(fasta_filename, 'r') as fasta_file:
        builder.load_fasta_sequences(fasta_file)
        builder.create_vcf_from_sequences()

        self.assertEqual(len(temp_files), 1)
        self.assertFalse(os.path.isfile(temp_files[0]))
        self.assertEqual(len(temp_folders), 1)
        self.assertFalse(os.path.isdir(temp_folders[0]))
        self.assertTrue(os.path.isfile(temp_vcf_file.name))

        # Rewind before reading the generated VCF back.
        builder.vcf_input_file.seek(0)
        records = SNPSitesReader(builder.vcf_input_file)
        number_of_records = sum(1 for _ in records)

        self.assertEqual(number_of_records, 5)
    finally:
      # Runs on failure too, so a broken assertion no longer leaks the
      # delete=False temp file onto the filesystem.
      temp_vcf_file.close()
      os.remove(temp_vcf_file.name)
예제 #2
0
  def test_next_with_GT(self):
    """Parse a file which is already in Genotype (GT) format.

    Reads the first record and checks which samples carry a genotype other
    than '0'.
    """
    vcf_filename = os.path.join(test_data(), 'file_with_SNPs_in_GT_format.aln.vcf')

    # The context manager closes the VCF handle even if an assertion fails;
    # previously the file was never closed at all.
    with open(vcf_filename, 'r') as vcf_file:
      reader = SNPSitesReader(vcf_file)
      record = reader.next()

      samples_with_alternative_bases = [sample.sample
                                        for sample in record.samples
                                        if sample.data.GT != '0']

      expected = ['3002_8_1', '3002_8_2', '3002_8_6', '4056_2_10', '4056_2_4',
                  '4056_8_6', '5174_5_1', '5174_5_7', '5174_5_9', '5174_6_10',
                  '5174_7_1', '5174_8_5']

      # Order-insensitive comparison of the sample names.
      self.assertItemsEqual(samples_with_alternative_bases, expected)
예제 #3
0
  def test_next(self):
    """Parse a file with Alternate Base info.

    File was created using snp-sites.  Reads the first record and checks
    which samples end up with a genotype other than '0'.
    """
    vcf_filename = os.path.join(test_data(), 'file_with_SNPs.aln.vcf')

    # The context manager closes the VCF handle even if an assertion fails;
    # previously the file was never closed at all.
    with open(vcf_filename, 'r') as vcf_file:
      reader = SNPSitesReader(vcf_file)
      record = reader.next()

      samples_with_alternative_bases = [sample.sample
                                        for sample in record.samples
                                        if sample.data.GT != '0']

      expected = ['3002_8_1', '3002_8_2', '3002_8_6', '4056_2_10', '4056_2_4',
                  '4056_8_6', '5174_5_1', '5174_5_7', '5174_5_9', '5174_6_10',
                  '5174_7_1', '5174_8_5']

      # Order-insensitive comparison of the sample names.
      self.assertItemsEqual(samples_with_alternative_bases, expected)
예제 #4
0
  def test_ammend_line(self):
    """Check `_amend_line` on tab-, space- and mixed-separated input lines.

    For lines whose 8th/9th columns are "AB" / ".", the expected output has
    "." / "GT" in those columns and the per-sample bases replaced by numbers
    (here G -> 2, . -> 0, T -> 1 for ALT "T,G"), always tab-separated.  A line
    that already has "GT" in the 9th column is expected back unchanged.
    """
    snp_sites_old_bases = SNPSitesReader.__bases__
    SNPSitesReader.__bases__ = (MagicMock,) # I don't want to test vcf.Reader
    try:
      reader = SNPSitesReader()
      reader._separator = "\t| +"

      amended = "0\t1\t2\tC\tT,G\t5\t6\t.\tGT\t2\t0\t1"
      already_in_gt_format = "0\t1\t2\tC\tT,G\t5\t6\tAB\tGT\t2\t0\t1"

      cases = [
        # Tab-separated input.
        ("0\t1\t2\tC\tT,G\t5\t6\tAB\t.\tG\t.\tT", amended),
        # It would be elegant if it maintained the separator but this is
        # unlikely to cause big issues, hopefully
        ("0 1 2 C T,G 5 6 AB . G . T", amended),
        # Mixed runs of spaces and tabs.
        ("0  1 2    C T,G\t5\t6 AB . G . T", amended),
        # Already in GT format: must come back untouched.
        (already_in_gt_format, already_in_gt_format),
      ]

      for line, expected in cases:
        self.assertEqual(reader._amend_line(line), expected)
    finally:
      # Restore the real base classes even when an assertion fails, otherwise
      # every later test would see SNPSitesReader deriving from MagicMock.
      SNPSitesReader.__bases__ = snp_sites_old_bases
예제 #5
0
                                               str(new_amino_acid))
            consequence = Consequence(alternative_base,
                                      Consequence=consequence_type,
                                      Protein_position=position_in_protein,
                                      Amino_acids=amino_acid_change,
                                      STRAND=strand)
            consequences.append(consequence)
    return consequences


if __name__ == '__main__':
    args = get_arguments()
    vcf_input_file, gff_input_file, fasta_input_file, vcf_output_file = get_file_handles(
        args)

    vcf_input_reader = SNPSitesReader(vcf_input_file)
    add_consequences_info_header(vcf_input_reader)
    add_GT_format_header(vcf_input_reader)
    remove_AB_info_header(vcf_input_reader)
    vcf_output_writer = vcf.Writer(vcf_output_file, vcf_input_reader)
    sequence = Bio.SeqIO.parse(fasta_input_file, 'fasta').next()

    feature_index = build_feature_index(gff_input_file)

    chromosome_name_in_vcf = '1'
    chromosome_name_in_gff = 'Salmonella_enterica_subsp_enterica_serovar_Typhi_str_CT18_v1|SC|contig000001'

    for record in vcf_input_reader:
        if record.CHROM == chromosome_name_in_vcf:
            matching_cds = get_matching_CDS(record, feature_index,
                                            chromosome_name_in_gff)