def __init__(self, gff_file, fp=None, gffdataline=GFFDataLine, format='gff'):
    """Create a new GFFFile instance populated from a GFF file

    Arguments:
      gff_file: name of the GFF file to read data from
      fp: (optional) file-like object to read from instead of
        opening 'gff_file' directly
      gffdataline: (optional) class to use for annotation data
        lines (defaults to GFFDataLine)
      format: (optional) format identifier stored for the file
        (defaults to 'gff')
    """
    # Storage for format info
    self._format = format
    self._version = None
    # Initialise empty TabFile
    # Fix: pass the caller-supplied 'gffdataline' class through to the
    # base class (previously GFFDataLine was hard-coded here, so a
    # custom line class was ignored for lines added via the TabFile
    # interface even though the iterator below honoured it)
    TabFile.__init__(self, None, fp=None, tab_data_line=gffdataline,
                     column_names=GFF_COLUMNS)
    # Populate by iterating over GFF file
    for line in GFFIterator(gff_file=gff_file, fp=fp,
                            gffdataline=gffdataline):
        if line.type == ANNOTATION:
            # Append to TabFile
            self.append(tabdataline=line)
        elif line.type == PRAGMA:
            # Try to extract relevant data, e.g. "##gff-version 3"
            pragma = str(line)[2:].split()
            if pragma[0] == 'gff-version':
                self._version = pragma[1]
def write(self, filen):
    """Write the GFF data to an output GFF

    Arguments:
      filen: name of file to write to
    """
    # Use a context manager so the output file is closed even if
    # TabFile.write raises (the original left the handle open on error)
    with open(filen, 'w') as fp:
        if self.format == 'gff':
            # Emit the GFF3 version pragma before the data
            fp.write("##gff-version 3\n")
        TabFile.write(self, fp=fp)
def write(self, filen):
    """Write the GFF data to an output GFF

    Arguments:
      filen: name of file to write to
    """
    # Use a context manager so the output file is closed even if
    # TabFile.write raises (the original left the handle open on error)
    with open(filen, "w") as fp:
        if self.format == "gff":
            # Emit the GFF3 version pragma before the data
            fp.write("##gff-version 3\n")
        TabFile.write(self, fp=fp)
def __init__(self, gff_file, fp=None, gffdataline=GFFDataLine, format="gff"):
    """Create a new GFFFile instance populated from a GFF file

    Arguments:
      gff_file: name of the GFF file to read data from
      fp: (optional) file-like object to read from instead of
        opening 'gff_file' directly
      gffdataline: (optional) class to use for annotation data
        lines (defaults to GFFDataLine)
      format: (optional) format identifier stored for the file
        (defaults to 'gff')
    """
    # Storage for format info
    self._format = format
    self._version = None
    # Initialise empty TabFile
    # Fix: pass the caller-supplied 'gffdataline' class through to the
    # base class (previously GFFDataLine was hard-coded here, so a
    # custom line class was ignored for lines added via the TabFile
    # interface even though the iterator below honoured it)
    TabFile.__init__(self, None, fp=None, tab_data_line=gffdataline,
                     column_names=GFF_COLUMNS)
    # Populate by iterating over GFF file
    for line in GFFIterator(gff_file=gff_file, fp=fp,
                            gffdataline=gffdataline):
        if line.type == ANNOTATION:
            # Append to TabFile
            self.append(tabdataline=line)
        elif line.type == PRAGMA:
            # Try to extract relevant data, e.g. "##gff-version 3"
            pragma = str(line)[2:].split()
            if pragma[0] == "gff-version":
                self._version = pragma[1]
def __init__(self,filen=None,fp=None,name=None): """Create a new MacsXLS instance Arguments: filen: name of the file to read the MACS output from. If None then fp argument must be supplied instead. fp: file-like object opened for reading. If None then filen argument must be supplied instead. If both filen and fp are supplied then fp will be used preferentially. """ # Store data self.__filen = filen self.__name = name self.__macs_version = None self.__command_line = None self.__header = [] self.__data = None # Open file, if necessary if fp is None: fp = open(filen,'r') else: filen = None # Iterate over header lines for line in fp: line = line.strip() if line.startswith('#') or line == '': # Header line self.__header.append(line) # Detect/extract data from header if line.startswith("# This file is generated by MACS version "): # Look for MACS version self.__macs_version = line.split()[8] elif self.__name is None and line.startswith("# name = "): # Look for 'name' if none set self.__name = line[len("# name = "):] elif line.startswith("# Command line: "): # Look for command line self.__command_line = line[16:] else: if self.__data is None: # First line of actual data should be the column names columns = line.split('\t') # Insert an additional column called 'order' columns.insert(0,"order") # Set up TabFile to handle actual data self.__data = TabFile(column_names=columns) else: # Assume it's actual data and store it self.__data.append(tabdata="\t%s" % line) # Close the file handle, if we opened it if filen is not None: fp.close() # Check that we actually got a version line if self.macs_version is None: raise Exception,"Failed to extract MACS version, not a MACS output file?" # Populate the 'order' column self.update_order()
def __init__(self,infile,column_names): """Create a new BedMaker instance Arguments: infile: tab-delimited data file to read initial data from column_names: names to assign to data columns read in from the file """ # Initialise base class TabFile.__init__(self,infile,column_names=column_names) # Check if first line is real data if len(self) > 0: if not self[0]['start'].isdigit() or \ not self[0]['stop'].isdigit(): print "First line of input file doesn't look like data, removing" del(self[0]) # Remove blank lines i = 0 while i < len(self): if not str(self[i]).strip(): del(self[i]) else: i += 1
# NOTE(review): the next two statements almost certainly sit inside a
# "file not found" check whose 'if' header precedes this chunk — confirm
# against the full script before relying on this reconstruction
logging.error("Input file '%s' not found" % filen)
sys.exit(1)
# Report version
p.print_version()
# Initialise option values from the parsed command line
skip_first_line = options.skip_first_line
first_line_is_header = options.first_line_is_header
fix_chromosome = options.fix_chromosome
bedgraph_header = options.header
# Comma-separated list of user-selected column names/indices
user_selected = str(options.selection).split(',')
# Get the input data
data = TabFile(filen,
               skip_first_line=skip_first_line,
               first_line_is_header=first_line_is_header)
print "Read in %d lines" % len(data)
if first_line_is_header:
    print "Header:"
    for col in data.header():
        print "\t%s" % col
# Output file root name taken from the input file name
output_root = os.path.splitext(os.path.basename(filen))[0]
# Selected columns: bail out if none were supplied
if len(user_selected) == 0:
    print "No columns selected for output."
    sys.exit()
print "Selected columns = %s" % ' '.join(user_selected)
# Build output file name: if not explicitly supplied on the command # line then use "XLS_<input_name>.xls" if len(sys.argv) == 3: xls_out = sys.argv[2] else: # MACS output file might already have an .xls extension # but we'll add an explicit .xls extension xls_out = "XLS_"+os.path.splitext(os.path.basename(macs_in))[0]+".xls" print "Input file: %s" % macs_in print "Output XLS: %s" % xls_out # Extract the header from the MACS and feed actual data to # TabFile object header = [] data = TabFile(column_names=['chr','start','end','length','summit','tags', '-10*log10(pvalue)','fold_enrichment','FDR(%)']) fp = open(macs_in,'r') for line in fp: if line.startswith('#') or line.strip() == '': # Header line header.append(line.strip()) else: # Data data.append(tabdata=line.strip()) fp.close() # Temporarily remove first line header_line = str(data[0]) del(data[0]) # Sort into order by fold_enrichment and then by -10*log10(pvalue) column
# line then use "XLS_<input_name>.xls" if len(args) == 2: xls_out = args[1] else: # MACS output file might already have an .xls extension # but we'll add an explicit .xls extension xls_out = "XLS_" + os.path.splitext( os.path.basename(macs_in))[0] + ".xls" print "Input file: %s" % macs_in print "Output XLS: %s" % xls_out # Extract the header from the MACS and feed actual data to # TabFile object header = [] data = TabFile(column_names=[ 'chr', 'start', 'end', 'length', 'summit', 'tags', '-10*log10(pvalue)', 'fold_enrichment', 'FDR(%)' ]) fp = open(macs_in, 'r') for line in fp: if line.startswith('#') or line.strip() == '': # Header line header.append(line.strip()) else: # Data data.append(tabdata=line.strip()) fp.close() # Temporarily remove first line header_line = str(data[0]) del (data[0])
def __init__(self):
    """Build the tracker main window

    Creates the tab widget (file/ROI/advanced/visual/meta tabs), the
    start/abort buttons and status labels, then wires up the
    Controller and Tracker and connects widget signals.
    """
    QtGui.QWidget.__init__(self)
    self.batch_tracking_enabled = False
    #main widget
    self.setObjectName(_fromUtf8("self"))
    self.resize(1000, 835)
    self.setMinimumSize(QtCore.QSize(450, 770))
    # main vertical layout
    self.vertLO_main = QtGui.QVBoxLayout(self)
    self.vertLO_main.setObjectName(_fromUtf8("vertLO_main"))
    # horizontal layout video + options
    self.hoLO_video_plus_options = QtGui.QHBoxLayout()
    self.hoLO_video_plus_options.setObjectName(
        _fromUtf8("hoLO_video_plus_options"))
    # graphical output label (currently disabled)
    # self.lbl_video_output_label = QtGui.QLabel(self)
    # self.lbl_video_output_label.setMinimumWidth((self.geometry().width()-22)/2)
    # self.lbl_video_output_label.setObjectName(_fromUtf8("lbl_videl_output_label"))
    # self.lbl_video_output_label.setAlignment(QtCore.Qt.AlignCenter)
    # self.hoLO_video_plus_options.addWidget(self.lbl_video_output_label)
    # options tab widget
    self.tab_widget_options = QtGui.QTabWidget(self)
    self.tab_widget_options.setObjectName(_fromUtf8("tab_widget_options"))
    # file tab
    self.tab_file = TabFile()
    self.tab_widget_options.addTab(self.tab_file, _fromUtf8(""))
    # roi tab
    self.tab_roi = TabRoi()
    self.tab_widget_options.addTab(self.tab_roi, _fromUtf8(""))
    # adv tab
    self.tab_adv = TabAdv()
    self.tab_widget_options.addTab(self.tab_adv, _fromUtf8(""))
    # visuals tab
    self.tab_visual = TabVisual()
    self.tab_widget_options.addTab(self.tab_visual, _fromUtf8(""))
    # meta tab
    self.tab_meta = TabMeta()
    self.tab_widget_options.addTab(self.tab_meta, _fromUtf8(""))
    # add options widget to horizontal layout
    self.hoLO_video_plus_options.addWidget(self.tab_widget_options)
    # add video_plus_options tab to main widget
    self.vertLO_main.addLayout(self.hoLO_video_plus_options)
    # horizontal layout bot buttons
    self.hoLO_bot_buttons = QtGui.QHBoxLayout()
    self.hoLO_bot_buttons.setObjectName(_fromUtf8("hoLO_bot_buttons"))
    # button start tracking
    self.btn_start_tracking = QtGui.QPushButton(self)
    self.btn_start_tracking.setMinimumSize(QtCore.QSize(0, 50))
    self.btn_start_tracking.setObjectName(_fromUtf8("btn_start_tracking"))
    self.btn_start_tracking.setDisabled(False)
    self.hoLO_bot_buttons.addWidget(self.btn_start_tracking)
    # vertical layout file label and progress label
    self.vert_lo_file_progress = QtGui.QVBoxLayout()
    # file label
    self.lbl_file = QtGui.QLabel()
    self.lbl_file.setObjectName(_fromUtf8("lbl_file"))
    self.lbl_file.setText(_fromUtf8("no file started"))
    self.vert_lo_file_progress.addWidget(self.lbl_file)
    # progress label
    self.lbl_progress = QtGui.QLabel()
    self.lbl_progress.setObjectName(_fromUtf8("lbl_progress"))
    self.lbl_progress.setText(_fromUtf8("Progress:"))
    self.vert_lo_file_progress.addWidget(self.lbl_progress)
    self.hoLO_bot_buttons.addLayout(self.vert_lo_file_progress)
    # button abort tracking (disabled until tracking starts)
    self.btn_abort_tracking = QtGui.QPushButton(self)
    self.btn_abort_tracking.setMinimumSize(QtCore.QSize(0, 50))
    self.btn_abort_tracking.setObjectName(_fromUtf8("btn_abort_tracking"))
    self.btn_abort_tracking.setDisabled(True)
    self.hoLO_bot_buttons.addWidget(self.btn_abort_tracking)
    # add button layout to main widget layout
    self.vertLO_main.addLayout(self.hoLO_bot_buttons)
    self.retranslate_ui(self)
    self.tab_widget_options.setCurrentIndex(0)
    QtCore.QMetaObject.connectSlotsByName(self)
    # Wire up controller and tracker after the widgets exist
    self.controller = Controller(self)
    self.connect_controller_to_tabs()
    self.tracker = Tracker(controller=self.controller)
    # self.tab_roi.populate(self.tracker.roim)
    self.controller.preset_options()
    self.connect_widgets()
    self.set_shortcuts()
class TrackerUserInterface(QtGui.QWidget):
    """Main window widget for the fish tracking tool

    Builds the tabbed options UI plus start/abort controls, owns the
    Controller and Tracker instances and connects widget signals to
    controller actions.
    """

    def __init__(self):
        """Build the main window and wire up controller/tracker"""
        QtGui.QWidget.__init__(self)
        self.batch_tracking_enabled = False
        #main widget
        self.setObjectName(_fromUtf8("self"))
        self.resize(1000, 835)
        self.setMinimumSize(QtCore.QSize(450, 770))
        # main vertical layout
        self.vertLO_main = QtGui.QVBoxLayout(self)
        self.vertLO_main.setObjectName(_fromUtf8("vertLO_main"))
        # horizontal layout video + options
        self.hoLO_video_plus_options = QtGui.QHBoxLayout()
        self.hoLO_video_plus_options.setObjectName(
            _fromUtf8("hoLO_video_plus_options"))
        # graphical output label (currently disabled)
        # self.lbl_video_output_label = QtGui.QLabel(self)
        # self.lbl_video_output_label.setMinimumWidth((self.geometry().width()-22)/2)
        # self.lbl_video_output_label.setObjectName(_fromUtf8("lbl_videl_output_label"))
        # self.lbl_video_output_label.setAlignment(QtCore.Qt.AlignCenter)
        # self.hoLO_video_plus_options.addWidget(self.lbl_video_output_label)
        # options tab widget
        self.tab_widget_options = QtGui.QTabWidget(self)
        self.tab_widget_options.setObjectName(_fromUtf8("tab_widget_options"))
        # file tab
        self.tab_file = TabFile()
        self.tab_widget_options.addTab(self.tab_file, _fromUtf8(""))
        # roi tab
        self.tab_roi = TabRoi()
        self.tab_widget_options.addTab(self.tab_roi, _fromUtf8(""))
        # adv tab
        self.tab_adv = TabAdv()
        self.tab_widget_options.addTab(self.tab_adv, _fromUtf8(""))
        # visuals tab
        self.tab_visual = TabVisual()
        self.tab_widget_options.addTab(self.tab_visual, _fromUtf8(""))
        # meta tab
        self.tab_meta = TabMeta()
        self.tab_widget_options.addTab(self.tab_meta, _fromUtf8(""))
        # add options widget to horizontal layout
        self.hoLO_video_plus_options.addWidget(self.tab_widget_options)
        # add video_plus_options tab to main widget
        self.vertLO_main.addLayout(self.hoLO_video_plus_options)
        # horizontal layout bot buttons
        self.hoLO_bot_buttons = QtGui.QHBoxLayout()
        self.hoLO_bot_buttons.setObjectName(_fromUtf8("hoLO_bot_buttons"))
        # button start tracking
        self.btn_start_tracking = QtGui.QPushButton(self)
        self.btn_start_tracking.setMinimumSize(QtCore.QSize(0, 50))
        self.btn_start_tracking.setObjectName(_fromUtf8("btn_start_tracking"))
        self.btn_start_tracking.setDisabled(False)
        self.hoLO_bot_buttons.addWidget(self.btn_start_tracking)
        # vertical layout file label and progress label
        self.vert_lo_file_progress = QtGui.QVBoxLayout()
        # file label
        self.lbl_file = QtGui.QLabel()
        self.lbl_file.setObjectName(_fromUtf8("lbl_file"))
        self.lbl_file.setText(_fromUtf8("no file started"))
        self.vert_lo_file_progress.addWidget(self.lbl_file)
        # progress label
        self.lbl_progress = QtGui.QLabel()
        self.lbl_progress.setObjectName(_fromUtf8("lbl_progress"))
        self.lbl_progress.setText(_fromUtf8("Progress:"))
        self.vert_lo_file_progress.addWidget(self.lbl_progress)
        self.hoLO_bot_buttons.addLayout(self.vert_lo_file_progress)
        # button abort tracking (disabled until tracking starts)
        self.btn_abort_tracking = QtGui.QPushButton(self)
        self.btn_abort_tracking.setMinimumSize(QtCore.QSize(0, 50))
        self.btn_abort_tracking.setObjectName(_fromUtf8("btn_abort_tracking"))
        self.btn_abort_tracking.setDisabled(True)
        self.hoLO_bot_buttons.addWidget(self.btn_abort_tracking)
        # add button layout to main widget layout
        self.vertLO_main.addLayout(self.hoLO_bot_buttons)
        self.retranslate_ui(self)
        self.tab_widget_options.setCurrentIndex(0)
        QtCore.QMetaObject.connectSlotsByName(self)
        # Wire up controller and tracker after the widgets exist
        self.controller = Controller(self)
        self.connect_controller_to_tabs()
        self.tracker = Tracker(controller=self.controller)
        # self.tab_roi.populate(self.tracker.roim)
        self.controller.preset_options()
        self.connect_widgets()
        self.set_shortcuts()

    def connect_controller_to_tabs(self):
        """Give tabs that need it a reference to the controller"""
        self.tab_roi.connect_to_controller(self.controller)
        self.tab_meta.connect_to_controller(self.controller)
        # TODO connect to other tabs

    def retranslate_ui(self, tracker_main_widget):
        """Set window title, tab names and button texts"""
        tracker_main_widget.setWindowTitle(
            _translate("tracker_main_widget",
                       "Tool For Tracking Fish - [TF]² Ver. 1.5 beta", None))
        self.tab_file.retranslate_tab_file()
        self.tab_roi.retranslate_tab_roi()
        self.tab_adv.retranslate_tab_adv()
        self.tab_visual.retranslate_tab_visual()
        self.tab_meta.retranslate_tab_meta()
        self.tab_widget_options.setTabText(
            self.tab_widget_options.indexOf(self.tab_file),
            _translate("tracker_main_widget", "File", None))
        self.tab_widget_options.setTabText(
            self.tab_widget_options.indexOf(self.tab_roi),
            _translate("tracker_main_widget", "ROI", None))
        self.tab_widget_options.setTabText(
            self.tab_widget_options.indexOf(self.tab_adv),
            _translate("tracker_main_widget", "Advanced", None))
        self.tab_widget_options.setTabText(
            self.tab_widget_options.indexOf(self.tab_visual),
            _translate("tracker_main_widget", "Visualization", None))
        self.tab_widget_options.setTabText(
            self.tab_widget_options.indexOf(self.tab_meta),
            _translate("tracker_main_widget", "Meta Data", None))
        self.btn_start_tracking.setText(
            _translate("tracker_main_widget", "Start Tracking", None))
        self.btn_abort_tracking.setText(
            _translate("tracker_main_widget", "Abort Tracking", None))

    def set_shortcuts(self):
        """Assign keyboard shortcuts and matching tooltips"""
        self.btn_start_tracking.setShortcut('Ctrl+s')
        self.btn_start_tracking.setToolTip("Strg + S")
        self.tab_file.btn_browse_file.setShortcut('Ctrl+f')
        self.tab_file.btn_browse_file.setToolTip("Strg + F")

    def center_ui(self, qApp):
        """Position the window relative to the screen centre"""
        # screen = QDesktopWidget().screenGeometry()
        screen = qApp.desktop().screenGeometry()
        gui_size = self.geometry()
        x_pos = (screen.width() - gui_size.width()) / 2
        # NOTE(review): gui height is subtracted twice here, which pushes
        # the window above centre — confirm whether this is intentional
        y_pos = (screen.height() - gui_size.height() - gui_size.height()) / 2
        self.move(x_pos, y_pos)

    def set_new_tracker(self, controller):
        """Reset ROI/meta tabs and create a fresh Tracker instance"""
        self.tab_roi.clear()
        self.tab_meta.clear_tabs()
        if self.batch_tracking_enabled:
            self.tracker = Tracker(controller=controller, batch_mode_on=True)
        else:
            self.tracker = Tracker(controller=controller)
        return

    def connect_widgets(self):
        """Connect tab widgets and buttons to controller actions"""
        self.tab_file.connect_widgets(self.controller)
        self.tab_roi.connect_widgets(self.controller)
        self.tab_adv.connect_widgets(self.controller)
        self.tab_visual.connect_widgets(self.controller)
        self.tab_meta.connect_widgets(self.controller)
        self.btn_start_tracking.clicked.connect(self.controller.start_tracking)
        self.btn_abort_tracking.clicked.connect(self.controller.abort_tracking)
def main():
    """Main program

    Entry point for the GFF cleaning utility: parses the command
    line, reads the input GFF, applies the requested cleaning and
    duplicate-resolution operations in order, and writes the result
    to the output GFF file.
    """
    # Set up logging format
    logging.basicConfig(format='%(levelname)s: %(message)s')
    p = optparse.OptionParser(usage="%prog [options] <file>.gff",
                              version="%prog "+__version__,
                              description=
                              "Utility to perform various 'cleaning' operations on a GFF file.")
    p.add_option('-o',action='store',dest='output_gff',default=None,
                 help="Name of output GFF file (default is '<file>_clean.gff')")
    p.add_option('--prepend',action='store',dest='prepend_str',default=None,
                 help="String to prepend to seqname in first column")
    p.add_option('--clean',action='store_true',dest='do_clean',
                 help="Perform all the 'cleaning' manipulations on the input data (equivalent "
                 "to specifying all of --clean-score, --clean-replace-attributes, "
                 "--clean-exclude-attributes and --clean-group-sgds)")
    p.add_option('--clean-score',action='store_true',dest='do_clean_score',
                 help="Replace 'Anc_*' and blanks in 'score' field with zeroes")
    p.add_option('--clean-replace-attributes',action='store_true',
                 dest='do_clean_replace_attributes',
                 help="Replace 'ID', 'Gene', 'Parent' and 'Name' attributes with the value "
                 "of the SGD attribute, if present")
    p.add_option('--clean-exclude-attributes',action='store_true',
                 dest='do_clean_exclude_attributes',
                 help="Remove the 'kaks', 'kaks2' and 'ncbi' attributes (to remove "
                 "arbitrary attributes, see the --remove-attribute=... option)")
    p.add_option('--clean-group-sgds',action='store_true',dest='do_clean_group_sgds',
                 help="Group features with the same SGD by adding unique numbers to the 'ID' "
                 "attributes; IDs will have the form 'CDS:<SGD>:<n>' (where n is a unique "
                 "number for a given SGD)")
    p.add_option('--report-duplicates',action='store_true',dest='report_duplicates',
                 help="Report duplicate SGD names and write list to <file>_duplicates.gff "
                 "with line numbers, chromosome, start coordinate and strand.")
    p.add_option('--resolve-duplicates',action='store',dest='mapping_file',default=None,
                 help="Resolve duplicate SGDs by matching against 'best' genes in the supplied "
                 "mapping file; other non-matching genes are discarded and written to "
                 "<file>_discarded.gff.")
    p.add_option('--discard-unresolved',action='store_true',dest='discard_unresolved',
                 help="Discard any unresolved duplicates, which are written to "
                 "<file>_unresolved.gff.")
    p.add_option('--insert-missing',action='store',dest='gene_file',default=None,
                 help="Insert genes from gene file with SGD names that don't appear in the "
                 "input GFF. If GENE_FILE is blank ('='s must still be present) then the mapping "
                 "file supplied with the --resolve-duplicates option will be used instead.")
    p.add_option('--add-exon-ids',action='store_true',dest='add_exon_ids',default=False,
                 help="For exon features without an ID attribute, construct and insert an "
                 "ID of the form 'exon:<Parent>:<n>' (where n is a unique number).")
    p.add_option('--add-missing-ids',action='store_true',dest='add_missing_ids',default=False,
                 help="For features without an ID attribute, construct and insert a "
                 "generated ID of the form '<feature>:<Parent>:<n>' (where n is a unique "
                 "number).")
    p.add_option('--no-percent-encoding',action='store_true',dest='no_encoding',default=False,
                 help="Convert encoded attributes to the correct characters in "
                 "the output GFF. WARNING this may result in a non-cannonical GFF that can't "
                 "be read correctly by this or other programs.")
    p.add_option('--remove-attribute',action='append',dest='rm_attr',
                 help="Remove attribute RM_ATTR from the list of attributes for all records "
                 "in the GFF file (can be specified multiple times)")
    p.add_option('--strict-attributes',action='store_true',dest='strict_attributes',
                 help="Remove attributes that don't conform to the KEY=VALUE format")
    p.add_option('--debug',action='store_true',dest='debug',
                 help="Print debugging information")
    # Process the command line
    options,arguments = p.parse_args()
    # Check for debugging
    if options.debug:
        # Turn on debugging output
        logging.getLogger().setLevel(logging.DEBUG)
    # Input files
    if len(arguments) != 1:
        p.error("input GFF file required")
    else:
        infile = arguments[0]
        if not os.path.exists(infile):
            p.error("Input file '%s' not found" % infile)
    # Report version
    p.print_version()
    # Set flags based on command line
    # String to prepend to first column
    prepend_str = options.prepend_str
    # Cleaning options
    if options.do_clean:
        # --clean switches on all cleaning operations at once
        # Update values in the "score" column
        clean_score = True
        # Clean up the "attributes" column
        clean_replace_attributes = True
        clean_exclude_attributes = True
        # Set ID field in "attributes" to group lines with matching SGDs
        group_SGDs = True
    else:
        # Set options based on user input
        clean_score = options.do_clean_score
        clean_replace_attributes = options.do_clean_replace_attributes
        clean_exclude_attributes = options.do_clean_exclude_attributes
        group_SGDs = options.do_clean_group_sgds
    # Report duplicate names
    report_duplicates = options.report_duplicates
    # Resolve duplicated genes using CDS file
    if options.mapping_file is not None:
        resolve_duplicates = True
        cdsfile = options.mapping_file
    else:
        resolve_duplicates = False
        cdsfile = None
    # Discard unresolved duplicates
    discard_unresolved = options.discard_unresolved
    # Insert missing genes
    if options.gene_file is not None:
        insert_missing = True
        if options.gene_file:
            genefile = options.gene_file
        else:
            # Blank value: fall back to the --resolve-duplicates mapping file
            genefile = cdsfile
    else:
        insert_missing = False
        genefile = None
    # Add an artificial exon ID attribute
    add_exon_ids = options.add_exon_ids
    # Add generated ID attributes
    add_missing_ids = options.add_missing_ids
    # Suppress encoding of attributes on output
    no_attribute_encoding = options.no_encoding
    # Remove attributes that don't conform to KEY=VALUE format
    strict_attributes = options.strict_attributes
    # Name for output files
    ##outext = os.path.splitext(os.path.basename(infile))[1]
    if not options.output_gff:
        outbase = os.path.splitext(os.path.basename(infile))[0]
        outfile = outbase+'_clean.gff'
    else:
        outbase = os.path.splitext(os.path.basename(options.output_gff))[0]
        outfile = options.output_gff
    print "Input : %s" % infile
    print "Output: %s" % outfile
    # Companion output files share the output base name
    dupfile = outbase+'_duplicates.txt'
    delfile = outbase+'_discarded.gff'
    unresfile = outbase+'_unresolved.gff'
    # Read in data from file
    gff_data = GFFFile(infile)
    # Prepend string to seqname column
    if prepend_str is not None:
        print "Prepending '%s' to values in 'seqname' column" % prepend_str
        for data in gff_data:
            data['seqname'] = prepend_str+str(data['seqname'])
    # Check/clean score column values
    if clean_score:
        print "Replacing 'Anc_*' and blanks with '0's in 'score' column"
        score_unexpected_values = []
        for data in gff_data:
            try:
                # Numerical value
                score = float(data['score'])
                if score != 0:
                    score_unexpected_values.append(data['score'])
            except ValueError:
                # String value
                if data['score'].strip() != '' and not data['score'].startswith('Anc_'):
                    score_unexpected_values.append(data['score'])
            # Replace "Anc_*" or blank values in "score" column with zero
            if data['score'].startswith('Anc_') or data['score'].strip() == '':
                data['score'] = '0'
        # Report unexpected values
        n = len(score_unexpected_values)
        if n > 0:
            logging.warning("%d 'score' values that are not '', 0 or 'Anc_*'" % n)
            logging.warning("Other values: %s" % score_unexpected_values)
    # Clean up the data in "attributes" column: replace keys
    if clean_replace_attributes:
        # Initialise mapping of keys from input to output in "attributes" column
        # where new values are required etc
        attributes_key_map = OrderedDictionary()
        attributes_key_map['ID'] = 'SGD'
        attributes_key_map['Gene'] = 'SGD'
        attributes_key_map['Parent'] = 'SGD'
        attributes_key_map['Name'] = 'SGD'
        attributes_dont_replace_with_empty_data = True
        print "Cleaning up attributes: replacing keys:"
        for key in attributes_key_map.keys():
            print "\t%s -> %s" % (key,attributes_key_map[key])
        if attributes_dont_replace_with_empty_data:
            print "(Replacement will be skipped if new data is missing/blank)"
        GFFUpdateAttributes(gff_data,attributes_key_map,[],
                            attributes_dont_replace_with_empty_data)
    # Clean up the data in "attributes" column: exclude keys
    if clean_exclude_attributes:
        # List of keys to exclude
        attributes_exclude_keys = ['kaks','kaks2','ncbi']
        print "Excluding keys:"
        for key in attributes_exclude_keys:
            print "\t%s" % key
        GFFUpdateAttributes(gff_data,{},attributes_exclude_keys,True)
    # Set the IDs for consecutive lines with matching SGD names, to indicate that
    # they're in the same gene
    if group_SGDs:
        print "Grouping SGDs by setting ID's for consecutive lines with the same SGD values"
        GFFGroupSGDs(gff_data)
    # Find duplicates in input file
    if report_duplicates or resolve_duplicates:
        duplicate_sgds = GFFGetDuplicateSGDs(gff_data)
    if report_duplicates:
        # Write to duplicates file
        print "Writing duplicate SGD names to %s" % dupfile
        fd = open(dupfile,'w')
        ndup = 0
        ngroups = 0
        for sgd in duplicate_sgds.keys():
            assert(len(duplicate_sgds[sgd]) > 1)
            ndup += 1
            fd.write("%s\t" % sgd)
            for data in duplicate_sgds[sgd]:
                # Write the line number, chromosome, start and strand data
                line = ';'.join(('L'+str(data.lineno()),
                                 str(data['seqname']),str(data['start']),str(data['end'])))
                fd.write("\t%s" % line)
            fd.write("\n")
            logging.debug("%s\t%s" % (sgd,duplicate_sgds[sgd]))
            # Count the non-trivial duplicate groups
            for group in GroupGeneSubsets(duplicate_sgds[sgd]):
                if len(group) > 1:
                    ngroups += 1
        if ndup == 0:
            fd.write("No duplicate SGDs\n")
        fd.close()
        print "%d duplicates found (of which %d are trivial)" % (ndup,ngroups)
    if resolve_duplicates:
        print "Resolving duplicate SGDs using data from %s" % cdsfile
        print "Discarded genes will be written to %s" % delfile
        # Get data on best gene mappings from CDS file
        # Format is tab-delimited, each line has:
        # orf chr start end strand
        mapping = TabFile(cdsfile,column_names=('name','chr','start','end','strand'))
        # Overlap margin
        overlap_margin = 1000
        # Perform resolution
        result = GFFResolveDuplicateSGDs(gff_data,mapping,duplicate_sgds,overlap_margin)
        #
        # Report the results
        #
        # Convenience variables for lists of unresolved, discarded etc duplicates
        resolved_sgds = result['resolved_sgds']
        unresolved_sgds_no_mapping_genes = result['unresolved_sgds_no_mapping_genes']
        unresolved_sgds_no_mapping_genes_after_filter = \
            result['unresolved_sgds_no_mapping_genes_after_filter']
        unresolved_sgds_no_overlaps = result['unresolved_sgds_no_overlaps']
        unresolved_sgds_multiple_matches = result['unresolved_sgds_multiple_matches']
        discard = result['discard']
        # Remaining unresolved cases
        if len(unresolved_sgds_no_mapping_genes) > 0:
            print "No mapping genes with same SGDs found in %s:" % cdsfile
            for sgd in unresolved_sgds_no_mapping_genes:
                print "\t%s" % sgd
            print
        if len(unresolved_sgds_no_mapping_genes_after_filter) > 0:
            print "No mapping genes with same chromosome and/or strand:"
            for sgd in unresolved_sgds_no_mapping_genes_after_filter:
                print "\t%s" % sgd
            print
        if len(unresolved_sgds_no_overlaps) > 0:
            print "No mapping genes with overlaps:"
            for sgd in unresolved_sgds_no_overlaps:
                print "\t%s" % sgd
            print
        if len(unresolved_sgds_multiple_matches) > 0:
            print "Multiple matching mapping genes:"
            for sgd in unresolved_sgds_multiple_matches:
                print "\t%s" % sgd
            print
        # Summary counts for each case
        print "Total number of duplicated indexes : %d" % len(duplicate_sgds.keys())
        print "Number of resolved duplicate SGDs : %d" % len(resolved_sgds)
        print "Unresolved duplicates:"
        print "* No mapping genes with same SGD : %d" % len(unresolved_sgds_no_mapping_genes)
        print "* No mapping genes with same chr/str : %d" % len(unresolved_sgds_no_mapping_genes_after_filter)
        print "* No mapping genes with overlap : %d" % len(unresolved_sgds_no_overlaps)
        print "* Multiple mapping genes match : %d" % len(unresolved_sgds_multiple_matches)
        # Remove discarded duplicates from the data
        print "Removing discarded duplicates and writing to %s" % delfile
        fd = open(delfile,'w')
        for discard_data in discard:
            try:
                ip = gff_data.indexByLineNumber(discard_data.lineno())
                del(gff_data[ip])
                fd.write("%s\n" % discard_data)
            except IndexError:
                logging.warning("Failed to delete line %d: not found" % discard_data.lineno())
        fd.close()
        # Remove unresolved duplicates if requested
        if discard_unresolved:
            print "Removing unresolved duplicates and writing to %s" % unresfile
            # Get list of unresolved SGDs
            all_unresolved = result['unresolved_sgds']
            # Get list of unresolved duplicates
            unresolved = []
            for data in gff_data:
                attributes = data['attributes']
                if 'SGD' in attributes:
                    if attributes['SGD'] in all_unresolved:
                        unresolved.append(data)
            # Discard them
            fu = open(unresfile,'w')
            for discard in unresolved:
                try:
                    ip = gff_data.indexByLineNumber(discard.lineno())
                    del(gff_data[ip])
                    fu.write("%s\n" % discard)
                except IndexError:
                    logging.warning("Failed to delete line %d: not found" % discard.lineno())
            fu.close()
    # Look for "missing" genes in mapping file
    if insert_missing:
        # Get name for file with gene list
        if genefile is None:
            genefile = cdsfile
        print "Inserting unmatched genes from %s" % genefile
        # Get gene data from CDS file
        # Format is tab-delimited, each line has:
        # orf chr start end strand
        mapping = TabFile(genefile,column_names=('name','chr','start','end','strand'))
        n_genes_before_insert = len(gff_data)
        gff_data = GFFInsertMissingGenes(gff_data,mapping)
        print "Inserted %d missing genes" % (len(gff_data) - n_genes_before_insert)
    # Construct and insert ID for exons
    if add_exon_ids:
        print "Inserting artificial IDs for exon records"
        gff_data = GFFAddExonIDs(gff_data)
    # Construct and insert missing ID attributes
    if add_missing_ids:
        print "Inserting generated IDs for records where IDs are missing"
        gff_data = GFFAddIDAttributes(gff_data)
    # Strip attributes requested for removal
    if options.rm_attr:
        print "Removing the following attributes from all records:"
        for attr in options.rm_attr:
            print "\t* %s" % attr
        GFFUpdateAttributes(gff_data,exclude_keys=options.rm_attr)
    # Remove attributes that don't conform to KEY=VALUE format
    if strict_attributes:
        print "Removing attributes that don't conform to KEY=VALUE format"
        GFFUpdateAttributes(gff_data,exclude_nokeys=True)
    # Suppress percent encoding of attributes
    if no_attribute_encoding:
        print "Converting encoded special characters in attribute data to non-encoded form"
        logging.warning("!!! Special characters will not be correctly encoded in the output !!!")
        logging.warning("!!! The resulting GFF may not be readable by this or other programs !!!")
        gff_data = GFFDecodeAttributes(gff_data)
    # Write to output file
    print "Writing output file %s" % outfile
    gff_data.write(outfile)
class MacsXLS:
    """Class for reading and manipulating XLS output from MACS

    Reads the XLS output file from the MACS peak caller and
    processes and stores the information for subsequent manipulation
    and output.

    To read in data from a MACS output file:

    >>> macs = MacsXLS("macs.xls")

    This reads in the data and prepends an additional 'order'
    column (a list of numbers from one to the number of data
    lines).

    To get the MACS version:

    >>> macs.macs_version
    2.0.10

    To access the 'header' information (as a Python list):

    >>> macs.header

    To see the column names (as a Python list):

    >>> macs.columns

    The data is stored as a TabFile object; to access the data
    use the 'data' property, e.g.

    >>> for line in macs.data:
    ...    print "Chr %s Start %s End" % (line['chr'],line['start'],line['end'])

    To sort the data on a particular column use the 'sort_on'
    method, e.g.

    >>> macs.sort_on('chr')

    (Note that the order column is always recalculated after
    sorting.)

    """

    def __init__(self,filen=None,fp=None,name=None):
        """Create a new MacsXLS instance

        Arguments:
          filen: name of the file to read the MACS output from.
            If None then fp argument must be supplied instead.
          fp: file-like object opened for reading. If None then
            filen argument must be supplied instead. If both filen
            and fp are supplied then fp will be used preferentially.
          name: (optional) value for the 'name' property; if None it
            is taken from the "# name = ..." header line when present.

        Raises:
          Exception: if no MACS version line is found in the header.
        """
        # Store data
        self.__filen = filen
        self.__name = name
        self.__macs_version = None
        self.__command_line = None
        self.__header = []
        self.__data = None
        # Open file, if necessary
        if fp is None:
            fp = open(filen,'r')
        else:
            # 'filen' is cleared so the close logic below knows we
            # did not open the handle ourselves
            filen = None
        # Iterate over header lines
        for line in fp:
            line = line.strip()
            if line.startswith('#') or line == '':
                # Header line
                self.__header.append(line)
                # Detect/extract data from header
                if line.startswith("# This file is generated by MACS version "):
                    # Look for MACS version (9th whitespace-delimited token)
                    self.__macs_version = line.split()[8]
                elif self.__name is None and line.startswith("# name = "):
                    # Look for 'name' if none set
                    self.__name = line[len("# name = "):]
                elif line.startswith("# Command line: "):
                    # Look for command line
                    self.__command_line = line[16:]
            else:
                if self.__data is None:
                    # First line of actual data should be the column names
                    columns = line.split('\t')
                    # Insert an additional column called 'order'
                    columns.insert(0,"order")
                    # Set up TabFile to handle actual data
                    self.__data = TabFile(column_names=columns)
                else:
                    # Assume it's actual data and store it; leading tab
                    # leaves the 'order' column blank until update_order runs
                    self.__data.append(tabdata="\t%s" % line)
        # Close the file handle, if we opened it
        if filen is not None:
            fp.close()
        # Check that we actually got a version line
        if self.macs_version is None:
            raise Exception,"Failed to extract MACS version, not a MACS output file?"
        # Populate the 'order' column
        self.update_order()

    @property
    def filen(self):
        """Return the source file name

        """
        return self.__filen

    @property
    def name(self):
        """Return the name property

        """
        return self.__name

    @property
    def macs_version(self):
        """Return the MACS version extracted from the file

        """
        return self.__macs_version

    @property
    def command_line(self):
        """Return the command line string extracted from the header

        This is the value associated with the "# Command line: ..."
        header line.

        Will be 'None' if no matching header line is found, else is
        the string following the ':'.

        """
        return self.__command_line

    @property
    def columns(self):
        """Return the column names for the MACS data

        Returns a list of the column names from the data
        extracted from the file.

        """
        return self.__data.header()

    @property
    def columns_as_xls_header(self):
        """Returns the column name list, with hash prepended

        """
        return ['#'+self.columns[0]] + self.columns[1:]

    @property
    def header(self):
        """Return the header data from the file

        Returns a list of lines comprising the header
        extracted from the file.

        """
        return self.__header

    @property
    def data(self):
        """Return the data from the file

        Returns a TabFile object comprising the data extracted
        from the file.

        """
        return self.__data

    @property
    def with_broad_option(self):
        """Returns True if MACS was run with --broad option

        If --broad wasn't detected then returns False.

        """
        if self.macs_version.startswith('1.'):
            # Not an option in MACS 1.*
            return False
        try:
            # Was --broad specified in the command line?
            return '--broad' in self.command_line.split()
        except AttributeError:
            # No command line? Check for 'abs_summit' column
            return 'abs_summit' not in self.columns

    def sort_on(self,column,reverse=True):
        """Sort data on specified column

        Sorts the data in-place, by the specified column.

        By default data is sorted in descending order; set
        'reverse' argument to False to sort values in ascending
        order instead

        Note that the 'order' column is automatically updated
        after each sorting operation.

        Arguments:
          column: name of the column to sort on
          reverse: if True (default) then sort in descending
            order (i.e. largest to smallest). Otherwise sort in
            ascending order.

        """
        # Sort the data
        self.__data.sort(lambda line: line[column],reverse=reverse)
        # Update the 'order' column
        self.update_order()

    def update_order(self):
        """Renumber the 'order' column from 1 to len(data)"""
        # Set/update values in 'order' column
        for i in range(0,len(self.__data)):
            self.__data[i]['order'] = i+1