def test_parse(self):
    """Check that parse_fasta produces (name, sequence) pairs in input order.

    A two-record FASTA document is passed to
    ``snp_sites_extensions.parse_fasta`` and the first two items it
    produces are compared against the ``foo`` and ``bar`` records.
    """
    foo_seq = 'AAAAAAAAAAAAAAAAAAA'
    bar_seq = 'GGGGGGGGGGGGGGGGGGG'

    # Each record is a ">name" header line followed by one sequence line;
    # the trailing empty item gives the document its final newline.
    fasta_lines = ['>foo', foo_seq, '>bar', bar_seq, '']
    input_fasta = StringIO('\n'.join(fasta_lines))

    sequences = snp_sites_extensions.parse_fasta(input_fasta)

    # Use the built-in next() rather than the Python-2-only ``.next()``
    # method so the test advances the iterator on Python 2.6+ and Python 3.
    self.assertEqual(next(sequences), ('foo', foo_seq))
    self.assertEqual(next(sequences), ('bar', bar_seq))
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> """ % reference_length) header_row = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] header_row += sequence_names write_row(header_row, output_file) BUFFER_SIZE = 10*1024*1024 if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('input', type=argparse.FileType('r', BUFFER_SIZE)) parser.add_argument('output', type=argparse.FileType('w'), default=open('random.short.fa.vcf', 'w')) args = parser.parse_args() sequences = snp_sites_extensions.parse_fasta(args.input) ref_name,ref_seq = sequences.next() snps = {} sequence_names = [] sequence_names.append(ref_name) for seq_name,seq_seq in sequences: snp_sites_extensions.update_snps(sequence_names, snps, bytearray(ref_seq), seq_name, bytearray(seq_seq)) snps = OrderedDict([(posn, snps[posn]) for posn in sorted(snps.keys())]) write_header(sequence_names, len(ref_seq), args.output) for row_idx, (posn, snp_in_posn) in enumerate(snps.items()): ref_base = ref_seq[posn]
] header_row += sequence_names write_row(header_row, output_file) BUFFER_SIZE = 10 * 1024 * 1024 if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('input', type=argparse.FileType('r', BUFFER_SIZE)) parser.add_argument('output', type=argparse.FileType('w'), default=open('random.short.fa.vcf', 'w')) args = parser.parse_args() sequences = snp_sites_extensions.parse_fasta(args.input) ref_name, ref_seq = sequences.next() snps = {} sequence_names = [] sequence_names.append(ref_name) for seq_name, seq_seq in sequences: snp_sites_extensions.update_snps(sequence_names, snps, bytearray(ref_seq), seq_name, bytearray(seq_seq)) snps = OrderedDict([(posn, snps[posn]) for posn in sorted(snps.keys())]) write_header(sequence_names, len(ref_seq), args.output) for row_idx, (posn, snp_in_posn) in enumerate(snps.items()):