def test_parse(self):
    """Check that parse_fasta yields (name, sequence) tuples in input order.

    Feeds a two-record FASTA document through
    ``snp_sites_extensions.parse_fasta`` and asserts on the first two
    items it produces.
    """
    input_fasta = StringIO("""\
>foo
AAAAAAAAAAAAAAAAAAA
>bar
GGGGGGGGGGGGGGGGGGG
""")
    sequences = snp_sites_extensions.parse_fasta(input_fasta)
    # The next() builtin is used instead of the Python-2-only .next() method;
    # it is equivalent on Python 2.6+ and also works on Python 3.
    self.assertEqual(next(sequences), ('foo', 'AAAAAAAAAAAAAAAAAAA'))
    self.assertEqual(next(sequences), ('bar', 'GGGGGGGGGGGGGGGGGGG'))
Exemple #2
0
    def test_parse(self):
        input_fasta = StringIO("""\
>foo
AAAAAAAAAAAAAAAAAAA
>bar
GGGGGGGGGGGGGGGGGGG
""")
        sequences = snp_sites_extensions.parse_fasta(input_fasta)
        self.assertEqual(sequences.next(), ('foo', 'AAAAAAAAAAAAAAAAAAA'))
        self.assertEqual(sequences.next(), ('bar', 'GGGGGGGGGGGGGGGGGGG'))
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
""" % reference_length)
  header_row = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"]
  header_row += sequence_names
  write_row(header_row, output_file)

# Buffer size in bytes (10 MiB), passed to argparse.FileType when opening
# the input file.
BUFFER_SIZE = 10*1024*1024

# Script entry point: read a FASTA file, collect SNPs of every sequence
# against the first sequence in the file, and write the header (and, further
# down, the rows) to the output file.
if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument('input', type=argparse.FileType('r', BUFFER_SIZE))
  # nargs='?' makes the positional optional so the default is actually
  # reachable.  The default is given as a path string: argparse runs string
  # defaults through `type`, so the file is only opened when the default is
  # used, instead of being created/truncated on every run.
  parser.add_argument('output', nargs='?', type=argparse.FileType('w'),
                      default='random.short.fa.vcf')
  args = parser.parse_args()

  sequences = snp_sites_extensions.parse_fasta(args.input)
  # The first record in the input is used as the reference sequence.
  try:
    ref_name, ref_seq = next(sequences)
  except StopIteration:
    # Empty input: exit with a usage error rather than a raw traceback.
    parser.error('input FASTA contains no sequences')
  snps = {}
  sequence_names = [ref_name]

  # Every remaining record is handed to update_snps together with the
  # reference; sequence_names and snps are passed in on each call.
  # NOTE(review): presumably update_snps fills both in place - confirm.
  for seq_name, seq_seq in sequences:
    snp_sites_extensions.update_snps(sequence_names, snps, bytearray(ref_seq),
                                     seq_name, bytearray(seq_seq))

  # Re-key the SNPs in ascending position order.
  snps = OrderedDict((posn, snps[posn]) for posn in sorted(snps))

  write_header(sequence_names, len(ref_seq), args.output)

  for row_idx, (posn, snp_in_posn) in enumerate(snps.items()):
    ref_base = ref_seq[posn]
Exemple #4
0
    ]
    header_row += sequence_names
    write_row(header_row, output_file)


# Buffer size in bytes (10 MiB), passed to argparse.FileType when opening
# the input file.
BUFFER_SIZE = 10 * 1024 * 1024

# Script entry point: read a FASTA file, collect SNPs of every sequence
# against the first sequence in the file, and write the header (and, further
# down, the rows) to the output file.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input', type=argparse.FileType('r', BUFFER_SIZE))
    # nargs='?' makes the positional optional so the default is actually
    # reachable.  The default is given as a path string: argparse runs
    # string defaults through `type`, so the file is only opened when the
    # default is used, instead of being created/truncated on every run.
    parser.add_argument('output',
                        nargs='?',
                        type=argparse.FileType('w'),
                        default='random.short.fa.vcf')
    args = parser.parse_args()

    sequences = snp_sites_extensions.parse_fasta(args.input)
    # The first record in the input is used as the reference sequence.
    try:
        ref_name, ref_seq = next(sequences)
    except StopIteration:
        # Empty input: exit with a usage error rather than a raw traceback.
        parser.error('input FASTA contains no sequences')
    snps = {}
    sequence_names = [ref_name]

    # Every remaining record is handed to update_snps together with the
    # reference; sequence_names and snps are passed in on each call.
    # NOTE(review): presumably update_snps fills both in place - confirm.
    for seq_name, seq_seq in sequences:
        snp_sites_extensions.update_snps(sequence_names, snps,
                                         bytearray(ref_seq), seq_name,
                                         bytearray(seq_seq))

    # Re-key the SNPs in ascending position order.
    snps = OrderedDict((posn, snps[posn]) for posn in sorted(snps))

    write_header(sequence_names, len(ref_seq), args.output)

    for row_idx, (posn, snp_in_posn) in enumerate(snps.items()):