forked from samfway/gene-coverage
/
translate_contig_annotations.py
executable file
·89 lines (77 loc) · 3.61 KB
/
translate_contig_annotations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python
import sys
import argparse
import lib.parse as pl
import lib.nucmer as nm
def interface():
    """ Define the command-line options and return the parsed arguments.

    Options:
      -g / --predicted-genes   (required) GFF/Genbank file of predicted genes
      -c / --alignment-coords  (required) NUCMER alignment coordinates
      -o / --output-file       output path, defaults to translated_genes.gff

    The returned namespace exposes them as predicted_genes,
    alignment_coords and output_file.
    """
    parser = argparse.ArgumentParser(
        description='Translate contig annotation coordinates to reference coordinates.')
    parser.add_argument(
        '-g', '--predicted-genes',
        required=True,
        help='GFF/Genbank file containing predicted genes for the contigs.')
    parser.add_argument(
        '-c', '--alignment-coords',
        required=True,
        help='NUCMER alignment coordinates.')
    parser.add_argument(
        '-o', '--output-file',
        type=str,
        default='translated_genes.gff',
        help='Output file (default: translated_genes.gff)')
    # parse_args() with no arguments reads the options from sys.argv.
    return parser.parse_args()
def translate_annotation(location, nucmer_record, reference_name=None):
    """ Map one contig annotation onto the reference through one alignment.

    location       -- object with .start, .end and .strand, in contig
                      coordinates
    nucmer_record  -- mapping with keys 'S1'/'E1' (used as the reference
                      start/end) and 'S2'/'E2' (used as the contig start/end)
    reference_name -- forwarded unchanged as the last argument of
                      pl.feature_location

    Returns a pl.feature_location for the part of the annotation that lies
    inside the aligned contig span, shifted into reference coordinates, or
    None when that part is shorter than 1.

    NOTE(review): a record with S2 > E2 always yields None here, because the
    clipped length becomes negative. If lib.nucmer passes reverse-strand hits
    through in that form, they are silently skipped -- confirm.
    """
    # All four coordinates are looked up (in this order) so a malformed record
    # fails right away; the reference end is not needed for the arithmetic.
    ref_begin, _ref_finish, aligned_begin, aligned_finish = [
        nucmer_record[key] for key in ('S1', 'E1', 'S2', 'E2')]
    feature_begin, feature_finish = location.start, location.end

    # Clip the annotation to the stretch of contig covered by the alignment.
    clipped_begin = max(aligned_begin, feature_begin)
    clipped_finish = min(aligned_finish, feature_finish)
    overlap = clipped_finish - clipped_begin
    if overlap < 1:
        return None

    # Shift by the offset between where the alignment starts on the
    # reference and where it starts on the contig.
    shifted_begin = ref_begin - aligned_begin + clipped_begin
    return pl.feature_location(shifted_begin,
                               shifted_begin + overlap,
                               location.strand,
                               reference_name)
def get_translated_annotations(predicted_genes_file, alignment_coords_file):
    """ Yield contig annotations re-expressed in reference coordinates.

    predicted_genes_file should be a GFF file containing annotations
    for a collection of contigs.

    alignment_coords_file should be a NUCMER alignment coords file, mapping
    those contigs to a reference genome.

    Every feature whose seq_id has alignment records is translated once per
    record; translations that come back as None are dropped. Features on
    contigs without any record are skipped. The result can be used to compare
    the genes found in contigs with those found in the reference genome.

    This is a generator: nothing is read until iteration starts.
    """
    records_by_contig = nm.get_nucmer_records_by_contig(alignment_coords_file)
    for feature in pl.get_feature_locations(predicted_genes_file):
        contig_id = feature.seq_id
        if contig_id not in records_by_contig:
            continue
        for alignment in records_by_contig[contig_id]:
            translated = translate_annotation(feature, alignment)
            if translated is None:
                continue
            yield translated
def create_translated_file(predicted_genes_file, alignment_coords_file, output_file,
                           reference_name='reference_genome'):
    """ Translate annotations and save them to output_file.

    predicted_genes_file  -- annotations for the contigs, passed to
                             get_translated_annotations
    alignment_coords_file -- NUCMER coords file, passed to
                             get_translated_annotations
    output_file           -- path to write; an existing file is overwritten
    reference_name        -- value written in the first column of every row

    Writes two '##' header lines followed by one tab-separated, nine-column
    row per translated annotation. Rows are numbered id_0, id_1, ... in the
    order the annotations are produced. The strand column is '+' when the
    annotation's strand is greater than 0 and '-' otherwise.

    NOTE(review): the last column is written as a bare 'id_N'; GFF3 readers
    may expect 'ID=id_N' there and '##gff-version 3' as the very first line.
    The output is left unchanged in case other tools depend on it -- confirm.
    """
    # The with-block guarantees the file is flushed and closed even when
    # reading or translating the inputs raises part-way through.
    with open(output_file, 'w') as output:
        output.write('##gff out for translated genes\n')
        output.write('##gff-version 3\n')
        translated = get_translated_annotations(predicted_genes_file,
                                                alignment_coords_file)
        for id_count, new_location in enumerate(translated):
            fields = [reference_name,
                      'translated',
                      'gene',
                      str(new_location.start),
                      str(new_location.end),
                      '.',
                      ('+' if new_location.strand > 0 else '-'),
                      '.',
                      'id_%d' % (id_count)]
            output.write('\t'.join(fields) + '\n')
# Script entry point: parse the command line, then write the translated file.
if __name__ == "__main__":
    args = interface()
    create_translated_file(predicted_genes_file=args.predicted_genes,
                           alignment_coords_file=args.alignment_coords,
                           output_file=args.output_file)