-
Notifications
You must be signed in to change notification settings - Fork 3
/
samplespecificdbgenerator.py
145 lines (132 loc) · 7.46 KB
/
samplespecificdbgenerator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# main program: samplespecificdbgenerator.py
__author__ = "anthony"
__date__ = "$Oct 27, 2015 2:41:36 PM$"
import sys
import os.path
import optparse
import refparse
import novelsplices
import variantcalls
from lxml import etree as et
# XML namespace URI of the UniProt schema; used as the default (un-prefixed)
# namespace of the database that this script builds.
HTML_NS = "http://uniprot.org/uniprot"
# XML Schema instance namespace, bound to the "xsi" prefix below.
XSI_NS = "http://www.w3.org/2001/XMLSchema-instance"
# Prefix -> namespace URI mapping handed to lxml as ``nsmap`` when the root
# element is created; the ``None`` key denotes the default namespace.
NAMESPACE_MAP = {
    None: HTML_NS,
    "xsi": XSI_NS,
}
# Namespace in "{uri}" form, ready to be prepended to a tag name
# (e.g. UP + 'uniprot').
UP = "{%s}" % HTML_NS
def _fail(message, code):
    """Write *message* to stderr and terminate the process with status *code*."""
    sys.stderr.write(message + "\n")
    sys.exit(code)


def _count_lines(path):
    """Return the number of lines in the file at *path*, closing it afterwards."""
    with open(path) as handle:
        return sum(1 for _ in handle)


def _report_progress(stream, label, index, total):
    """Write a '<label> <index> of <total>' progress line to *stream*."""
    stream.write("%s %d of %d\n" % (label, index, total))


def _build_option_parser():
    """Create the optparse parser describing every command-line option."""
    parser = optparse.OptionParser()
    # I/O
    parser.add_option('-x', '--reference_xml', dest='reference_xml', help='Reference protein UniProt-XML file. Sequence variant peptide entries are appended to this database to generate the ouptut UniProt-XML protein database.')
    parser.add_option('-p', '--protein_fasta', dest='protein_fasta', help='Reference protein FASTA file. Used to generate SAV peptide entries. If no UniProt-XML is specified, SAV and NSJ entries will be appended to this database to generate an output database. By default, this output will be a UniProt-XML protein database without PTM annotations. If --output-fasta is selected, the output will be a protein FASTA.')
    parser.add_option('-g', '--gene_model', dest='gene_model', default=None, help='GTF gene model file. Used to annotate NSJ peptide entries.')
    parser.add_option('-v', '--snpeff_vcf', dest='snpeff_vcf', help='SnpEff VCF file with HGVS annotations (else read from stdin).')
    parser.add_option('-b', '--splice_bed', dest='splice_bed', help='BED file (tophat junctions.bed) with sequence column added.')
    parser.add_option('-o', '--output', dest='output', help='Output file path. Outputs UniProt-XML format unless --output-fasta is selected.')
    parser.add_option('-z', '--output_fasta', dest='output_fasta', action='store_true', default=False, help='Output a FASTA-format database. Place path for output file after the --output flag.')
    # Peptide sequence construction
    parser.add_option('-l', '--leading_aa_num', dest='leading_aa_num', type='int', default=33, help='Leading number of AAs to output for SAV peptides. Default: 33.')
    parser.add_option('-t', '--trailing_aa_num', dest='trailing_aa_num', type='int', default=33, help='Trailing number of AAs to output for SAV peptides. Default: 33.')
    # Filtering parameters
    parser.add_option('-D', '--nsj_depth_cutoff', dest='nsj_depth_cutoff', type='int', default=0, help='Keep only NSJs found with above this depth (BED score field). Default: 0.')
    parser.add_option('-E', '--snv_depth_cutoff', dest='snv_depth_cutoff', type='int', default=0, help='Keep only SNVs found with above this depth (DP=# field). Default: 0.')
    parser.add_option('-M', '--minimum_length', dest='minimum_length', type='int', default=0, help='Keep only sequence variant peptides with greater than or equal to this length. Default: 0.')
    # Simple entry
    parser.add_option('-Q', '--bed_score_name', dest='bed_score_name', default="depth", help='Include in the NSJ ID line score_name:score. Default: "depth."')
    parser.add_option('-R', '--reference', dest='reference', default="None", help='Genome Reference Name for NSJ ID location. Automatically pulled from genome_build header in GTF if present.')
    return parser


def __main__():
    """Build a sample-specific protein database from the command-line inputs.

    Steps, in order: parse the options, load the protein FASTA, load the
    reference database (UniProt-XML if given, otherwise the protein FASTA),
    open the output, read the GTF gene model, feed every VCF line to
    ``variantcalls.parse_vcf_line`` and every BED line to
    ``novelsplices.translate_bed_line``, then write the resulting tree as
    UniProt-XML or, with ``--output_fasta``, as FASTA.

    Progress messages go to stdout when ``--output`` names a file, and to
    stderr when the database itself is written to stdout, so that the
    emitted database is never interleaved with progress text.

    Exit status on failure: 2 for protein FASTA, reference database, gene
    model or splice BED problems; 3 when the output file cannot be opened;
    1 when VCF processing fails.

    NOTE(review): --protein_fasta, --gene_model, --snpeff_vcf and
    --splice_bed are all effectively mandatory, because a missing path makes
    the corresponding step fail. The --snpeff_vcf help text mentions reading
    from stdin, but that has never been implemented here.
    """
    options, _ = _build_option_parser().parse_args()

    ##INPUTS##
    # Protein FASTA
    try:
        # This handle is deliberately left open: whether the value returned by
        # read_protein_fasta still reads from it is not visible from here
        # (TODO confirm, then close it).
        fasta_handle = open(os.path.abspath(options.protein_fasta), 'r')
        protein_fasta = refparse.read_protein_fasta(fasta_handle)
    except Exception as e:
        _fail("failed: %s" % e, 2)

    # Reference XML/FASTA
    if options.reference_xml is not None:
        try:
            with open(os.path.abspath(options.reference_xml), 'r') as ref_xml:
                # remove_blank_text is required for pretty additions
                xml_parser = et.XMLParser(remove_blank_text=True)
                db = et.parse(ref_xml, xml_parser)
            root = db.getroot()
            for entry in root:
                refparse.condense_xml_entry(entry)
        except Exception as e:
            _fail("Parsing and/or condensing reference xml failed: %s" % e, 2)
    elif options.protein_fasta is not None:
        try:
            with open(os.path.abspath(options.protein_fasta), 'r') as ref_fasta:
                root = et.Element(UP + 'uniprot', nsmap=NAMESPACE_MAP)
                db = et.ElementTree(root)
                refparse.read_fasta_to_xml(root, ref_fasta)
        except Exception as e:
            _fail("Parsing reference fasta failed: %s" % e, 2)
    else:
        # Without a reference there is no tree to append to; stop here instead
        # of failing later on an undefined name.
        _fail("failed: no reference database specified", 2)

    ##OUTPUT##
    if options.output is None:
        outFile = sys.stdout
    else:
        try:
            outFile = open(os.path.abspath(options.output), 'w')
        except Exception as e:
            _fail("Opening outfile failed: %s" % e, 3)
    # Keep progress text out of the database when the database goes to stdout.
    progress = sys.stderr if outFile is sys.stdout else sys.stdout

    # Process gene model
    try:
        gene_model_path = os.path.abspath(options.gene_model)
        linect = _count_lines(gene_model_path)
        geneModel = refparse.GeneModel()
        with open(gene_model_path, 'r') as gene_model_file:
            for i, gtf_line in enumerate(gene_model_file):
                if i % 20000 == 0:
                    _report_progress(progress, "gene_model line", i, linect)
                if gtf_line.startswith('#'):
                    # Header line: only a genome_build header carries data we
                    # use (its second whitespace-separated field).
                    if 'genome_build' in gtf_line:
                        options.reference = gtf_line.split()[1]
                    continue
                geneModel.new_entry(gtf_line)
    except Exception as e:
        _fail("Parsing gene model failed: %s" % e, 2)

    # Process VCF
    try:
        vcf_path = os.path.abspath(options.snpeff_vcf)
        linect = _count_lines(vcf_path)
        with open(vcf_path, 'r') as vcf_file:
            for i, vcf_line in enumerate(vcf_file):
                if i % 1000 == 0:
                    _report_progress(progress, "snpeff_vcf line", i, linect)
                variantcalls.parse_vcf_line(root, vcf_line, protein_fasta, options.snv_depth_cutoff, options.minimum_length, options.leading_aa_num, options.trailing_aa_num)
    except Exception as e:
        _fail("VCF processing failed: %s" % e, 1)

    # Process Splice BED
    try:
        bed_path = os.path.abspath(options.splice_bed)
        linect = _count_lines(bed_path)
        with open(bed_path, 'r') as bed_file:
            for i, bed_line in enumerate(bed_file):
                if i % 2000 == 0:
                    _report_progress(progress, "splice_bed line", i, linect)
                if bed_line.startswith('track'):
                    continue
                novelsplices.translate_bed_line(root, geneModel, bed_line, options.nsj_depth_cutoff, options.minimum_length, options.reference, options.bed_score_name)
    except Exception as e:
        _fail("Splice BED processing failed: %s" % e, 2)

    # Write the sample specific database to outfile
    if not options.output_fasta:
        db.write(outFile, pretty_print=True)
    else:
        entryct = len(root)
        for i, entry in enumerate(root):
            if i % 1000 == 0:
                _report_progress(progress, "writing entry", i, entryct)
            fasta_entry = refparse.xml_to_fasta(entry)
            if fasta_entry is None:
                continue
            outFile.write(fasta_entry[0] + '\n' + fasta_entry[1] + '\n')

    # Make sure everything reaches its destination; never close the
    # interpreter's own stdout.
    if outFile is sys.stdout:
        outFile.flush()
    else:
        outFile.close()
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    __main__()