-
Notifications
You must be signed in to change notification settings - Fork 0
/
bedGraphSplitter.py
executable file
·197 lines (168 loc) · 7.06 KB
/
bedGraphSplitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/bin/env python
#
# bedGraphSplitter.py: split data file into multiple files by column
# Copyright (C) University of Manchester 2011 Peter Briggs
#
########################################################################
#
# bedGraphSplitter.py
#
#########################################################################
"""bedGraphSplitter.py
Create bedGraph format files from one or more columns in an input
tab-delimited file.
The UCSC bedGraph format is described here:
http://genome.ucsc.edu/goldenPath/help/bedgraph.html
The input file must have the first three columns as 'chromosome', 'start'
and 'end', followed by an arbitrary number of data columns.
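One output file is written for each selected column; each file contains
the 'chromosome', 'start' and 'end' columns plus the data value from the
selected column. Note that the 'end' value written to the output files is
one less than the value in the input file.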
"""
#######################################################################
# Import modules
#######################################################################
import os
import sys
import logging
import optparse
# Make the local modules in the sibling "share" directory importable.
# The location is worked out relative to the directory holding the script
# being run (taken from sys.argv[0]), then normalised to an absolute path
# before being added to the end of the module search path.
SHARE_DIR = os.path.join(os.path.dirname(sys.argv[0]),'..','share')
SHARE_DIR = os.path.abspath(os.path.normpath(SHARE_DIR))
sys.path.append(SHARE_DIR)
from TabFile import TabFile
import version
__version__ = version.__version__
# Set default logging level and output
logging.basicConfig(format='%(levelname)s: %(message)s')
#######################################################################
# Classes
#######################################################################
# No classes defined
#######################################################################
# Functions
#######################################################################

def parse_selection(selection):
    """Split the value of the --select option into column specifiers.

    Arguments:
      selection: comma-separated string of column indices and/or column
        names, or None if the option was not supplied.

    Returns:
      List of the non-empty specifier strings, in the order given. The
      list is empty if 'selection' is None or contains no specifiers.
    """
    if selection is None:
        # str(None) would otherwise yield the bogus specifier 'None'
        return []
    return [field for field in str(selection).split(',') if field]


def resolve_column(col, ncolumns, header):
    """Translate a user-supplied column specifier to a zero-based index.

    Arguments:
      col: column specifier, either a column index counting from one
        (e.g. '4') or a column name.
      ncolumns: number of columns in the input data.
      header: list of column names (can be empty).

    Returns:
      Zero-based index of the requested column.

    Raises:
      ValueError: if an index lies outside 1..ncolumns, or if a name is
        not present in 'header'. The exception message describes the
        problem.
    """
    try:
        col0 = int(col) - 1
    except ValueError:
        # Not an integer, so treat it as a column name
        col0 = None
    if col0 is None:
        if col not in header:
            raise ValueError("Unable to find column '%s' in input file" % col)
        return header.index(col)
    if col0 < 0:
        # A negative index would silently address a column counted from
        # the end of the line, so reject it explicitly
        raise ValueError("Invalid column %s, column indices start from 1" % col)
    if col0 >= ncolumns:
        raise ValueError("Unable to find column %s, not enough columns in "
                         "input file" % col)
    return col0


def bedgraph_file_name(name):
    """Return the output file name built from 'name'.

    The '.bedGraph' extension is appended and any spaces are replaced
    by underscores.
    """
    return (str(name) + ".bedGraph").replace(' ','_')


def shifted_end_position(end):
    """Return 'end' minus one as a string.

    Returns None if 'end' cannot be converted to an integer (for example
    if it is None or is non-numeric text).
    """
    try:
        return str(int(end) - 1)
    except (TypeError, ValueError):
        # int() raises ValueError for non-numeric strings and TypeError
        # for values such as None
        return None

#######################################################################
# Main program
#######################################################################
if __name__ == "__main__":
    p = optparse.OptionParser(usage="%prog [options] <file>",
                              version="%prog "+__version__,
                              description=
                              "Generate bedGraph custom track files for display in UCSC "
                              "browser from genomic data taken from tab-delimited file "
                              "containing chromosome, start and end as the first three "
                              "columns of data. Use the --select option to pick one or more "
                              "columns, each of which will be output to a bedGraph format "
                              "file.")
    p.add_option('--select',action='store',dest='selection',default=None,
                 help="specify columns from input file as one or more column indices "
                 "(i.e. where 1 is the first column) or column names (if "
                 "--first-line-is-header is used). If multiple columns are selected "
                 "then separate them by commas, e.g. '4,6,7'. A bedGraph file will "
                 "be output for each of the selected columns specified.")
    p.add_option('--skip-first-line',action="store_true",dest="skip_first_line",
                 help="skip first line of input file")
    p.add_option('--first-line-is-header',action="store_true",dest="first_line_is_header",
                 help="take column names from first line of input file")
    p.add_option('--fix-chromosome',action="store_true",dest="fix_chromosome",
                 help="fix chromosome names in output file, by prepending 'chr' "
                 "if missing in the input")
    p.add_option('--bedGraph-header',action="store",dest="header",default=None,
                 help="specify text to use as the header for each output bedGraph "
                 "(default is not to write a header)")

    # Process the command line
    options,arguments = p.parse_args()

    # Input file: exactly one is required
    if not arguments:
        p.error("No input file supplied")
    elif len(arguments) > 1:
        p.error("Too many arguments: expected a single input file")
    filen = arguments[0]
    if not os.path.exists(filen):
        logging.error("Input file '%s' not found" % filen)
        sys.exit(1)

    # Report version
    p.print_version()

    # Initialise
    skip_first_line = options.skip_first_line
    first_line_is_header = options.first_line_is_header
    fix_chromosome = options.fix_chromosome
    bedgraph_header = options.header
    user_selected = parse_selection(options.selection)

    # Get the input data
    data = TabFile(filen,
                   skip_first_line=skip_first_line,
                   first_line_is_header=first_line_is_header)
    print("Read in %d lines" % len(data))
    if first_line_is_header:
        print("Header:")
        for col in data.header():
            print("\t%s" % col)

    # Output file
    output_root = os.path.splitext(os.path.basename(filen))[0]

    # Selected columns
    if not user_selected:
        print("No columns selected for output.")
        sys.exit()
    print("Selected columns = %s" % ' '.join(user_selected))

    # Assume user counts columns starting from one and adjust to count from zero
    # Also check that the requested column exists and set up file names based on
    # user input
    selected = []
    col_lookup = {}
    file_names = {}
    for col in user_selected:
        try:
            col0 = resolve_column(col,data.nColumns(),data.header())
        except ValueError as ex:
            logging.error("%s",ex)
            sys.exit(1)
        if col0 in col_lookup:
            # Same column requested again: a second handle on the same
            # output file would duplicate every row, so skip the repeat
            logging.warning("Column '%s' selected more than once, ignoring repeat" % col)
            continue
        # Column lookup
        col_lookup[col0] = col
        # Adjusted column names
        selected.append(col0)
        # File names
        if first_line_is_header:
            file_names[col0] = bedgraph_file_name(data.header()[col0])
        else:
            file_names[col0] = bedgraph_file_name(output_root+"_"+str(col))

    out_file = {}
    try:
        # Open output files
        print("Opening output files:")
        for col in selected:
            print("\t%s" % file_names[col])
            out_file[col] = open(file_names[col],'w')
            if bedgraph_header is not None:
                # Write bedGraph header
                out_file[col].write("%s\n" % bedgraph_header)

        # Fix chromosome?
        if fix_chromosome:
            print("Fixing chromosome names...")
            for line in data:
                if not str(line[0]).startswith('chr'):
                    line[0] = 'chr'+str(line[0])

        # Fix end positions (subtract 1 base)
        fix_end_position = True
        if fix_end_position:
            print("Fixing end positions...")
            for line in data:
                new_end = shifted_end_position(line[2])
                if new_end is None:
                    logging.warning("Unable to fix end position for L%d" % line.lineno())
                else:
                    line[2] = new_end

        # Write to each file
        print("Writing data...")
        for line in data:
            for col in selected:
                try:
                    out_file[col].write("%s\n" % line.subset(0,1,2,col))
                except IndexError:
                    logging.warning("Error outputting data for column '%s'" % col_lookup[col])
    finally:
        # Close output files, including when an error cut processing short
        for fp in out_file.values():
            fp.close()
    print("Finished")
    sys.exit()