# Initialise skip_first_line = options.skip_first_line first_line_is_header = options.first_line_is_header fix_chromosome = options.fix_chromosome bedgraph_header = options.header user_selected = str(options.selection).split(',') # Get the input data data = TabFile(filen, skip_first_line=skip_first_line, first_line_is_header=first_line_is_header) print "Read in %d lines" % len(data) if first_line_is_header: print "Header:" for col in data.header(): print "\t%s" % col # Output file output_root = os.path.splitext(os.path.basename(filen))[0] # Selected columns if len(user_selected) == 0: print "No columns selected for output." sys.exit() print "Selected columns = %s" % ' '.join(user_selected) # Assume user counts columns starting from one and adjust to count from zero # Also check that the requested column exists and set up file names based on # user input selected = [] col_lookup = {}
class MacsXLS:
    """Class for reading and manipulating XLS output from MACS

    Reads the XLS output file from the MACS peak caller and
    processes and stores the information for subsequent manipulation
    and output.

    To read in data from a MACS output file:

    >>> macs = MacsXLS("macs.xls")

    This reads in the data and prepends an additional 'order'
    column (a list of numbers from one to the number of data
    lines).

    To get the MACS version:

    >>> macs.macs_version
    '2.0.10'

    To access the 'header' information (as a Python list):

    >>> macs.header

    To see the column names (as a Python list):

    >>> macs.columns

    The data is stored as a TabFile object; to access the data
    use the 'data' property, e.g.

    >>> for line in macs.data:
    ...    print("Chr %s Start %s End %s" % (line['chr'],line['start'],line['end']))

    To sort the data on a particular column use the 'sort_on'
    method, e.g.

    >>> macs.sort_on('chr')

    (Note that the order column is always recalculated after
    sorting.)

    """

    def __init__(self,filen=None,fp=None,name=None):
        """Create a new MacsXLS instance

        Arguments:
          filen: name of the file to read the MACS output from.
            If None then fp argument must be supplied instead.
          fp: file-like object opened for reading. If None then
            filen argument must be supplied instead. If both filen
            and fp are supplied then fp will be used preferentially.
          name: optional name to store for the data. If None then
            the value from the '# name = ...' header line is used
            (if such a line is present).

        Raises:
          TypeError: if neither filen nor fp is supplied.
          Exception: if no MACS version line is found in the header,
            or if no column names line follows the header.

        """
        # Store data
        self.__filen = filen
        self.__name = name
        self.__macs_version = None
        self.__command_line = None
        self.__header = []
        self.__data = None
        # Open file, if necessary; only a handle opened here is
        # closed here (a caller-supplied fp is left open)
        opened_here = (fp is None)
        if opened_here:
            if filen is None:
                raise TypeError("MacsXLS requires either 'filen' or "
                                "'fp' to be supplied")
            fp = open(filen,'r')
        try:
            for line in fp:
                line = line.strip()
                if line.startswith('#') or line == '':
                    self._parse_header_line(line)
                else:
                    self._parse_data_line(line)
        finally:
            # Release the handle even if parsing failed part way
            if opened_here:
                fp.close()
        # Check that we actually got a version line
        if self.macs_version is None:
            raise Exception("Failed to extract MACS version, not a MACS "
                            "output file?")
        # Without a column names line there is no data table to
        # populate (and nothing for the properties to report)
        if self.__data is None:
            raise Exception("Failed to find column names, not a MACS "
                            "output file?")
        # Populate the 'order' column
        self.update_order()

    def _parse_header_line(self,line):
        """Store a header (comment or blank) line and extract metadata

        """
        self.__header.append(line)
        if line.startswith("# This file is generated by MACS version "):
            # Version is the ninth whitespace-delimited token
            self.__macs_version = line.split()[8]
        elif self.__name is None and line.startswith("# name = "):
            # Only use the name from the file if none was set
            self.__name = line[len("# name = "):]
        elif line.startswith("# Command line: "):
            self.__command_line = line[len("# Command line: "):]

    def _parse_data_line(self,line):
        """Store a non-header line as column names or as a data row

        """
        if self.__data is None:
            # First line of actual data should be the column names,
            # with an additional leading column called 'order'
            columns = line.split('\t')
            columns.insert(0,"order")
            self.__data = TabFile(column_names=columns)
        else:
            # Leading tab leaves the 'order' value empty for now
            self.__data.append(tabdata="\t%s" % line)

    @property
    def filen(self):
        """Return the source file name

        """
        return self.__filen

    @property
    def name(self):
        """Return the name property

        """
        return self.__name

    @property
    def macs_version(self):
        """Return the MACS version extracted from the file

        """
        return self.__macs_version

    @property
    def command_line(self):
        """Return the command line string extracted from the header

        This is the value associated with the "# Command line: ..."
        header line.

        Will be 'None' if no matching header line is found, else is
        the string following the ':'.

        """
        return self.__command_line

    @property
    def columns(self):
        """Return the column names for the MACS data

        Returns a list of the column names from the data
        extracted from the file.

        """
        return self.__data.header()

    @property
    def columns_as_xls_header(self):
        """Returns the column name list, with hash prepended

        """
        names = self.columns
        return ['#'+names[0]] + names[1:]

    @property
    def header(self):
        """Return the header data from the file

        Returns a list of lines comprising the header
        extracted from the file.

        """
        return self.__header

    @property
    def data(self):
        """Return the data from the file

        Returns a TabFile object comprising the data
        extracted from the file.

        """
        return self.__data

    @property
    def with_broad_option(self):
        """Returns True if MACS was run with --broad option

        If --broad wasn't detected then returns False.

        """
        if self.macs_version.startswith('1.'):
            # Not an option in MACS 1.*
            return False
        command_line = self.command_line
        if command_line is not None:
            # Was --broad specified in the command line?
            return '--broad' in command_line.split()
        # No command line? Check for 'abs_summit' column
        return 'abs_summit' not in self.columns

    def sort_on(self,column,reverse=True):
        """Sort data on specified column

        Sorts the data in-place, by the specified column.

        By default data is sorted in descending order; set
        'reverse' argument to False to sort values in ascending
        order instead

        Note that the 'order' column is automatically updated
        after each sorting operation.

        Arguments:
          column: name of the column to sort on
          reverse: if True (default) then sort in descending
            order (i.e. largest to smallest). Otherwise sort in
            ascending order.

        """
        # Sort the data
        self.__data.sort(lambda line: line[column],reverse=reverse)
        # Update the 'order' column
        self.update_order()

    def update_order(self):
        """Set/update the values in the 'order' column

        Numbers the data lines from one upwards, following their
        current order in the data.

        """
        for i in range(len(self.__data)):
            self.__data[i]['order'] = i+1