Exemple #1
0
 def __init__(self,
              gff_file,
              fp=None,
              gffdataline=GFFDataLine,
              format='gff'):
     """Populate the object with annotation data read from a GFF file.

     Arguments:
       gff_file: name of the GFF file to read data from
       fp: (optional) file-like object to read from instead of
         opening 'gff_file' directly
       gffdataline: (optional) class used by GFFIterator to wrap each
         line read from the input (defaults to GFFDataLine)
       format: (optional) format label stored on the object
         (defaults to 'gff')
     """
     # Storage for format info
     self._format = format
     self._version = None
     # Initialise empty TabFile
     # NOTE(review): 'tab_data_line' is hard-coded to GFFDataLine here
     # rather than using the 'gffdataline' argument (which is only
     # passed to GFFIterator below) -- confirm this is intentional
     TabFile.__init__(self,
                      None,
                      fp=None,
                      tab_data_line=GFFDataLine,
                      column_names=GFF_COLUMNS)
     # Populate by iterating over GFF file
     for line in GFFIterator(gff_file=gff_file,
                             fp=fp,
                             gffdataline=gffdataline):
         if line.type == ANNOTATION:
             # Append to TabFile
             self.append(tabdataline=line)
         elif line.type == PRAGMA:
             # Try to extract relevant data
             # Pragma lines look like '##key value ...'; strip the
             # leading '##' and split on whitespace
             pragma = str(line)[2:].split()
             if pragma[0] == 'gff-version':
                 self._version = pragma[1]
Exemple #2
0
    def write(self, filen):
        """Write the GFF data to an output GFF file.

        Writes the '##gff-version 3' pragma (for 'gff' format) followed
        by the tabular annotation data.

        Arguments:
          filen: name of file to write to
        """
        # Use a context manager so the file handle is closed even if
        # TabFile.write raises part-way through (the original code
        # leaked the handle on error)
        with open(filen, 'w') as fp:
            if self.format == 'gff':
                fp.write("##gff-version 3\n")
            TabFile.write(self, fp=fp)
    def write(self, filen):
        """Write the GFF data to an output GFF file.

        Writes the '##gff-version 3' pragma (for 'gff' format) followed
        by the tabular annotation data.

        Arguments:
          filen: name of file to write to
        """
        # Context manager guarantees the handle is closed even when
        # TabFile.write raises (the explicit open/close pair didn't)
        with open(filen, "w") as fp:
            if self.format == "gff":
                fp.write("##gff-version 3\n")
            TabFile.write(self, fp=fp)
 def __init__(self, gff_file, fp=None, gffdataline=GFFDataLine, format="gff"):
     """Populate the object with annotation data read from a GFF file.

     Arguments:
       gff_file: name of the GFF file to read data from
       fp: (optional) file-like object to read from instead of
         opening 'gff_file' directly
       gffdataline: (optional) class used by GFFIterator to wrap each
         line read from the input (defaults to GFFDataLine)
       format: (optional) format label stored on the object
         (defaults to "gff")
     """
     # Storage for format info
     self._format = format
     self._version = None
     # Initialise empty TabFile
     # NOTE(review): 'tab_data_line' is hard-coded to GFFDataLine rather
     # than using the 'gffdataline' argument -- confirm this is intentional
     TabFile.__init__(self, None, fp=None, tab_data_line=GFFDataLine, column_names=GFF_COLUMNS)
     # Populate by iterating over GFF file
     for line in GFFIterator(gff_file=gff_file, fp=fp, gffdataline=gffdataline):
         if line.type == ANNOTATION:
             # Append to TabFile
             self.append(tabdataline=line)
         elif line.type == PRAGMA:
             # Try to extract relevant data
             # Pragma lines look like '##key value ...'; strip the
             # leading '##' and split on whitespace
             pragma = str(line)[2:].split()
             if pragma[0] == "gff-version":
                 self._version = pragma[1]
Exemple #5
0
    def __init__(self,filen=None,fp=None,name=None):
        """Create a new MacsXLS instance

        Arguments:
          filen: name of the file to read the MACS output from.
            If None then fp argument must be supplied instead.
          fp: file-like object opened for reading. If None then
            filen argument must be supplied instead. If both filen
            and fp are supplied then fp will be used preferentially.
          name: (optional) name to associate with the data; if None
            then the name is taken from the '# name = ' header line.

        Raises:
          Exception: if no MACS version line was found in the header
            (i.e. the input doesn't look like a MACS output file).
        """
        # Store data
        self.__filen = filen
        self.__name = name
        self.__macs_version = None
        self.__command_line = None
        self.__header = []
        self.__data = None
        # Open file, if necessary; 'filen' is cleared when the caller
        # supplied 'fp' so we know not to close a handle we don't own
        if fp is None:
            fp = open(filen,'r')
        else:
            filen = None
        try:
            # Iterate over header lines
            for line in fp:
                line = line.strip()
                if line.startswith('#') or line == '':
                    # Header line
                    self.__header.append(line)
                    # Detect/extract data from header
                    if line.startswith("# This file is generated by MACS version "):
                        # Look for MACS version
                        self.__macs_version = line.split()[8]
                    elif self.__name is None and line.startswith("# name = "):
                        # Look for 'name' if none set
                        self.__name = line[len("# name = "):]
                    elif line.startswith("# Command line: "):
                        # Look for command line (slice off the prefix;
                        # len(...) replaces the old magic number 16)
                        self.__command_line = line[len("# Command line: "):]
                else:
                    if self.__data is None:
                        # First line of actual data should be the column names
                        columns = line.split('\t')
                        # Insert an additional column called 'order'
                        columns.insert(0,"order")
                        # Set up TabFile to handle actual data
                        self.__data = TabFile(column_names=columns)
                    else:
                        # Assume it's actual data and store it
                        self.__data.append(tabdata="\t%s" % line)
        finally:
            # Close the file handle if we opened it ourselves -- done in
            # a 'finally' so the handle isn't leaked on a parse error
            if filen is not None:
                fp.close()
        # Check that we actually got a version line
        # (modern raise syntax; the old 'raise Exception,msg' form is a
        # syntax error under Python 3)
        if self.macs_version is None:
            raise Exception("Failed to extract MACS version, not a MACS output file?")
        # Populate the 'order' column
        self.update_order()
    def __init__(self,infile,column_names):
        """Create a new BedMaker instance

        Arguments:
          infile: tab-delimited data file to read initial data from
          column_names: names to assign to data columns read in from the file
        """
        # Initialise base class
        TabFile.__init__(self,infile,column_names=column_names)
        # Check if first line is real data: both 'start' and 'stop'
        # values must be purely numeric, otherwise assume it's a header
        if len(self) > 0:
            if not self[0]['start'].isdigit() or \
                    not self[0]['stop'].isdigit():
                # Parenthesised single-arg print works identically under
                # Python 2 and 3 (the old bare print statement is a
                # Python 3 syntax error)
                print("First line of input file doesn't look like data, removing")
                del self[0]
        # Remove blank lines; a 'while' loop is used because items are
        # deleted in place while scanning
        i = 0
        while i < len(self):
            if not str(self[i]).strip():
                del self[i]
            else:
                i += 1
        # Bail out if the input file does not exist
        logging.error("Input file '%s' not found" % filen)
        sys.exit(1)

    # Report version
    p.print_version()

    # Initialise
    # Copy command-line options into local flags for readability
    skip_first_line = options.skip_first_line
    first_line_is_header = options.first_line_is_header
    fix_chromosome = options.fix_chromosome
    bedgraph_header = options.header
    # Comma-separated column selection from the command line
    user_selected = str(options.selection).split(',')

    # Get the input data
    data = TabFile(filen,
                   skip_first_line=skip_first_line,
                   first_line_is_header=first_line_is_header)
    print "Read in %d lines" % len(data)
    if first_line_is_header:
        print "Header:"
        for col in data.header():
            print "\t%s" % col

    # Output file
    # Base name of the input file without its extension
    output_root = os.path.splitext(os.path.basename(filen))[0]

    # Selected columns
    if len(user_selected) == 0:
        print "No columns selected for output."
        sys.exit()
    print "Selected columns = %s" % ' '.join(user_selected)
Exemple #8
0
    # Build output file name: if not explicitly supplied on the command
    # line then use "XLS_<input_name>.xls"
    if len(sys.argv) == 3:
        xls_out = sys.argv[2]
    else:
        # MACS output file might already have an .xls extension
        # but we'll add an explicit .xls extension
        xls_out = "XLS_"+os.path.splitext(os.path.basename(macs_in))[0]+".xls"
    print "Input file: %s" % macs_in
    print "Output XLS: %s" % xls_out

    # Extract the header from the MACS and feed actual data to
    # TabFile object
    header = []
    data = TabFile(column_names=['chr','start','end','length','summit','tags',
                                 '-10*log10(pvalue)','fold_enrichment','FDR(%)'])
    fp = open(macs_in,'r')
    for line in fp:
        if line.startswith('#') or line.strip() == '':
            # Header line
            header.append(line.strip())
        else:
            # Data
            data.append(tabdata=line.strip())
    fp.close()

    # Temporarily remove first line
    # (presumably the column-name line of the MACS output; it is kept
    # aside as 'header_line' -- TODO confirm it is re-inserted later)
    header_line = str(data[0])
    del(data[0])

    # Sort into order by fold_enrichment and then by -10*log10(pvalue) column
Exemple #9
0
    # line then use "XLS_<input_name>.xls"
    if len(args) == 2:
        xls_out = args[1]
    else:
        # MACS output file might already have an .xls extension
        # but we'll add an explicit .xls extension
        xls_out = "XLS_" + os.path.splitext(
            os.path.basename(macs_in))[0] + ".xls"
    print "Input file: %s" % macs_in
    print "Output XLS: %s" % xls_out

    # Extract the header from the MACS and feed actual data to
    # TabFile object
    header = []
    data = TabFile(column_names=[
        'chr', 'start', 'end', 'length', 'summit', 'tags', '-10*log10(pvalue)',
        'fold_enrichment', 'FDR(%)'
    ])
    fp = open(macs_in, 'r')
    for line in fp:
        if line.startswith('#') or line.strip() == '':
            # Header line
            header.append(line.strip())
        else:
            # Data
            data.append(tabdata=line.strip())
    fp.close()

    # Temporarily remove first line
    # (presumably the column-name line of the MACS output; it is kept
    # aside as 'header_line' -- TODO confirm it is re-inserted later)
    header_line = str(data[0])
    del (data[0])
Exemple #10
0
    def __init__(self):
        """Build the tracker main widget.

        Constructs the options tab widget (file/ROI/advanced/visuals/meta
        tabs), the start/abort buttons with file and progress labels,
        then creates the Controller and Tracker and connects widgets,
        slots and keyboard shortcuts.
        """
        QtGui.QWidget.__init__(self)

        # Flag controlling whether batch tracking mode is used
        self.batch_tracking_enabled = False

        #main widget
        self.setObjectName(_fromUtf8("self"))
        self.resize(1000, 835)
        self.setMinimumSize(QtCore.QSize(450, 770))

        # main vertical layout
        self.vertLO_main = QtGui.QVBoxLayout(self)
        self.vertLO_main.setObjectName(_fromUtf8("vertLO_main"))
        # horizontal layout video + options
        self.hoLO_video_plus_options = QtGui.QHBoxLayout()
        self.hoLO_video_plus_options.setObjectName(
            _fromUtf8("hoLO_video_plus_options"))
        # graphical output label
        # self.lbl_video_output_label = QtGui.QLabel(self)
        # self.lbl_video_output_label.setMinimumWidth((self.geometry().width()-22)/2)
        # self.lbl_video_output_label.setObjectName(_fromUtf8("lbl_videl_output_label"))
        # self.lbl_video_output_label.setAlignment(QtCore.Qt.AlignCenter)
        # self.hoLO_video_plus_options.addWidget(self.lbl_video_output_label)

        # options tab widget
        self.tab_widget_options = QtGui.QTabWidget(self)
        self.tab_widget_options.setObjectName(_fromUtf8("tab_widget_options"))

        # file tab
        self.tab_file = TabFile()
        self.tab_widget_options.addTab(self.tab_file, _fromUtf8(""))

        # roi tab
        self.tab_roi = TabRoi()
        self.tab_widget_options.addTab(self.tab_roi, _fromUtf8(""))

        # adv tab
        self.tab_adv = TabAdv()
        self.tab_widget_options.addTab(self.tab_adv, _fromUtf8(""))

        # visuals tab
        self.tab_visual = TabVisual()
        self.tab_widget_options.addTab(self.tab_visual, _fromUtf8(""))

        # meta tab
        self.tab_meta = TabMeta()
        self.tab_widget_options.addTab(self.tab_meta, _fromUtf8(""))

        # add options widget to horizontal layout
        self.hoLO_video_plus_options.addWidget(self.tab_widget_options)

        # add video_plus_options tab to main widget
        self.vertLO_main.addLayout(self.hoLO_video_plus_options)

        # horizontal layout bot buttons
        self.hoLO_bot_buttons = QtGui.QHBoxLayout()
        self.hoLO_bot_buttons.setObjectName(_fromUtf8("hoLO_bot_buttons"))
        # button start tracking
        self.btn_start_tracking = QtGui.QPushButton(self)
        self.btn_start_tracking.setMinimumSize(QtCore.QSize(0, 50))
        self.btn_start_tracking.setObjectName(_fromUtf8("btn_start_tracking"))
        self.btn_start_tracking.setDisabled(False)
        self.hoLO_bot_buttons.addWidget(self.btn_start_tracking)
        # vertical layout file label and progress label
        self.vert_lo_file_progress = QtGui.QVBoxLayout()
        # file label showing which file is being tracked
        self.lbl_file = QtGui.QLabel()
        self.lbl_file.setObjectName(_fromUtf8("lbl_file"))
        self.lbl_file.setText(_fromUtf8("no file started"))
        self.vert_lo_file_progress.addWidget(self.lbl_file)
        # progress label
        self.lbl_progress = QtGui.QLabel()
        self.lbl_progress.setObjectName(_fromUtf8("lbl_progress"))
        self.lbl_progress.setText(_fromUtf8("Progress:"))
        self.vert_lo_file_progress.addWidget(self.lbl_progress)
        self.hoLO_bot_buttons.addLayout(self.vert_lo_file_progress)
        # button abort tracking
        self.btn_abort_tracking = QtGui.QPushButton(self)
        self.btn_abort_tracking.setMinimumSize(QtCore.QSize(0, 50))
        self.btn_abort_tracking.setObjectName(_fromUtf8("btn_abort_tracking"))
        # abort starts disabled; presumably enabled once tracking runs
        self.btn_abort_tracking.setDisabled(True)
        self.hoLO_bot_buttons.addWidget(self.btn_abort_tracking)
        # add button layout to main widget layout
        self.vertLO_main.addLayout(self.hoLO_bot_buttons)

        self.retranslate_ui(self)
        self.tab_widget_options.setCurrentIndex(0)
        QtCore.QMetaObject.connectSlotsByName(self)

        # Controller/tracker back-ends driving the GUI
        self.controller = Controller(self)
        self.connect_controller_to_tabs()
        self.tracker = Tracker(controller=self.controller)

        # self.tab_roi.populate(self.tracker.roim)

        self.controller.preset_options()
        self.connect_widgets()
        self.set_shortcuts()
Exemple #11
0
class TrackerUserInterface(QtGui.QWidget):
    """Main Qt widget for the fish-tracking tool.

    Hosts the options tab widget (file/ROI/advanced/visuals/meta),
    start/abort buttons with file and progress labels, and owns the
    Controller and Tracker back-end objects.
    """

    def __init__(self):
        """Build the widget, create Controller/Tracker and wire up
        widgets, slots and keyboard shortcuts."""
        QtGui.QWidget.__init__(self)

        # Flag controlling whether batch tracking mode is used
        self.batch_tracking_enabled = False

        #main widget
        self.setObjectName(_fromUtf8("self"))
        self.resize(1000, 835)
        self.setMinimumSize(QtCore.QSize(450, 770))

        # main vertical layout
        self.vertLO_main = QtGui.QVBoxLayout(self)
        self.vertLO_main.setObjectName(_fromUtf8("vertLO_main"))
        # horizontal layout video + options
        self.hoLO_video_plus_options = QtGui.QHBoxLayout()
        self.hoLO_video_plus_options.setObjectName(
            _fromUtf8("hoLO_video_plus_options"))
        # graphical output label
        # self.lbl_video_output_label = QtGui.QLabel(self)
        # self.lbl_video_output_label.setMinimumWidth((self.geometry().width()-22)/2)
        # self.lbl_video_output_label.setObjectName(_fromUtf8("lbl_videl_output_label"))
        # self.lbl_video_output_label.setAlignment(QtCore.Qt.AlignCenter)
        # self.hoLO_video_plus_options.addWidget(self.lbl_video_output_label)

        # options tab widget
        self.tab_widget_options = QtGui.QTabWidget(self)
        self.tab_widget_options.setObjectName(_fromUtf8("tab_widget_options"))

        # file tab
        self.tab_file = TabFile()
        self.tab_widget_options.addTab(self.tab_file, _fromUtf8(""))

        # roi tab
        self.tab_roi = TabRoi()
        self.tab_widget_options.addTab(self.tab_roi, _fromUtf8(""))

        # adv tab
        self.tab_adv = TabAdv()
        self.tab_widget_options.addTab(self.tab_adv, _fromUtf8(""))

        # visuals tab
        self.tab_visual = TabVisual()
        self.tab_widget_options.addTab(self.tab_visual, _fromUtf8(""))

        # meta tab
        self.tab_meta = TabMeta()
        self.tab_widget_options.addTab(self.tab_meta, _fromUtf8(""))

        # add options widget to horizontal layout
        self.hoLO_video_plus_options.addWidget(self.tab_widget_options)

        # add video_plus_options tab to main widget
        self.vertLO_main.addLayout(self.hoLO_video_plus_options)

        # horizontal layout bot buttons
        self.hoLO_bot_buttons = QtGui.QHBoxLayout()
        self.hoLO_bot_buttons.setObjectName(_fromUtf8("hoLO_bot_buttons"))
        # button start tracking
        self.btn_start_tracking = QtGui.QPushButton(self)
        self.btn_start_tracking.setMinimumSize(QtCore.QSize(0, 50))
        self.btn_start_tracking.setObjectName(_fromUtf8("btn_start_tracking"))
        self.btn_start_tracking.setDisabled(False)
        self.hoLO_bot_buttons.addWidget(self.btn_start_tracking)
        # vertical layout file label and progress label
        self.vert_lo_file_progress = QtGui.QVBoxLayout()
        # file label showing which file is being tracked
        self.lbl_file = QtGui.QLabel()
        self.lbl_file.setObjectName(_fromUtf8("lbl_file"))
        self.lbl_file.setText(_fromUtf8("no file started"))
        self.vert_lo_file_progress.addWidget(self.lbl_file)
        # progress label
        self.lbl_progress = QtGui.QLabel()
        self.lbl_progress.setObjectName(_fromUtf8("lbl_progress"))
        self.lbl_progress.setText(_fromUtf8("Progress:"))
        self.vert_lo_file_progress.addWidget(self.lbl_progress)
        self.hoLO_bot_buttons.addLayout(self.vert_lo_file_progress)
        # button abort tracking
        self.btn_abort_tracking = QtGui.QPushButton(self)
        self.btn_abort_tracking.setMinimumSize(QtCore.QSize(0, 50))
        self.btn_abort_tracking.setObjectName(_fromUtf8("btn_abort_tracking"))
        # abort starts disabled; presumably enabled once tracking runs
        self.btn_abort_tracking.setDisabled(True)
        self.hoLO_bot_buttons.addWidget(self.btn_abort_tracking)
        # add button layout to main widget layout
        self.vertLO_main.addLayout(self.hoLO_bot_buttons)

        self.retranslate_ui(self)
        self.tab_widget_options.setCurrentIndex(0)
        QtCore.QMetaObject.connectSlotsByName(self)

        # Controller/tracker back-ends driving the GUI
        self.controller = Controller(self)
        self.connect_controller_to_tabs()
        self.tracker = Tracker(controller=self.controller)

        # self.tab_roi.populate(self.tracker.roim)

        self.controller.preset_options()
        self.connect_widgets()
        self.set_shortcuts()

    def connect_controller_to_tabs(self):
        """Hand the controller to the tabs that need a back-reference."""
        self.tab_roi.connect_to_controller(self.controller)
        self.tab_meta.connect_to_controller(self.controller)
        # TODO connect to other tabs

    def retranslate_ui(self, tracker_main_widget):
        """Set the window title, tab labels and button captions."""
        tracker_main_widget.setWindowTitle(
            _translate("tracker_main_widget",
                       "Tool For Tracking Fish - [TF]² Ver. 1.5 beta", None))

        self.tab_file.retranslate_tab_file()
        self.tab_roi.retranslate_tab_roi()
        self.tab_adv.retranslate_tab_adv()
        self.tab_visual.retranslate_tab_visual()
        self.tab_meta.retranslate_tab_meta()

        self.tab_widget_options.setTabText(
            self.tab_widget_options.indexOf(self.tab_file),
            _translate("tracker_main_widget", "File", None))
        self.tab_widget_options.setTabText(
            self.tab_widget_options.indexOf(self.tab_roi),
            _translate("tracker_main_widget", "ROI", None))
        self.tab_widget_options.setTabText(
            self.tab_widget_options.indexOf(self.tab_adv),
            _translate("tracker_main_widget", "Advanced", None))
        self.tab_widget_options.setTabText(
            self.tab_widget_options.indexOf(self.tab_visual),
            _translate("tracker_main_widget", "Visualization", None))
        self.tab_widget_options.setTabText(
            self.tab_widget_options.indexOf(self.tab_meta),
            _translate("tracker_main_widget", "Meta Data", None))

        self.btn_start_tracking.setText(
            _translate("tracker_main_widget", "Start Tracking", None))
        self.btn_abort_tracking.setText(
            _translate("tracker_main_widget", "Abort Tracking", None))

    def set_shortcuts(self):
        """Assign keyboard shortcuts (tooltips use German 'Strg' = Ctrl)."""
        self.btn_start_tracking.setShortcut('Ctrl+s')
        self.btn_start_tracking.setToolTip("Strg + S")

        self.tab_file.btn_browse_file.setShortcut('Ctrl+f')
        self.tab_file.btn_browse_file.setToolTip("Strg + F")

    def center_ui(self, qApp):
        """Position the window relative to the screen centre.

        NOTE(review): gui_size.height() is subtracted twice in the
        y calculation, which places the window above centre -- confirm
        whether this offset is intentional.
        """
        # screen = QDesktopWidget().screenGeometry()
        screen = qApp.desktop().screenGeometry()
        gui_size = self.geometry()
        x_pos = (screen.width() - gui_size.width()) / 2
        y_pos = (screen.height() - gui_size.height() - gui_size.height()) / 2
        self.move(x_pos, y_pos)

    def set_new_tracker(self, controller):
        """Reset ROI/meta tabs and create a fresh Tracker.

        Arguments:
          controller: Controller instance to attach the new Tracker to
        """
        self.tab_roi.clear()
        self.tab_meta.clear_tabs()
        if self.batch_tracking_enabled:
            self.tracker = Tracker(controller=controller, batch_mode_on=True)
        else:
            self.tracker = Tracker(controller=controller)
        return

    def connect_widgets(self):
        """Connect tab widgets and the start/abort buttons to the controller."""
        self.tab_file.connect_widgets(self.controller)
        self.tab_roi.connect_widgets(self.controller)
        self.tab_adv.connect_widgets(self.controller)
        self.tab_visual.connect_widgets(self.controller)
        self.tab_meta.connect_widgets(self.controller)

        self.btn_start_tracking.clicked.connect(self.controller.start_tracking)
        self.btn_abort_tracking.clicked.connect(self.controller.abort_tracking)
Exemple #12
0
def main():
    """Main program
    """
    # Set up logging format
    logging.basicConfig(format='%(levelname)s: %(message)s')

    p = optparse.OptionParser(usage="%prog [options] <file>.gff",
                              version="%prog "+__version__,
                              description=
                              "Utility to perform various 'cleaning' operations on a GFF file.")
    p.add_option('-o',action='store',dest='output_gff',default=None,
                 help="Name of output GFF file (default is '<file>_clean.gff')")
    p.add_option('--prepend',action='store',dest='prepend_str',default=None,
                 help="String to prepend to seqname in first column")
    p.add_option('--clean',action='store_true',dest='do_clean',
                 help="Perform all the 'cleaning' manipulations on the input data (equivalent "
                 "to specifying all of --clean-score, --clean-replace-attributes, "
                 "--clean-exclude-attributes and --clean-group-sgds)")
    p.add_option('--clean-score',action='store_true',dest='do_clean_score',
                 help="Replace 'Anc_*' and blanks in 'score' field with zeroes")
    p.add_option('--clean-replace-attributes',action='store_true',
                 dest='do_clean_replace_attributes',
                 help="Replace 'ID', 'Gene', 'Parent' and 'Name' attributes with the value "
                 "of the SGD attribute, if present")
    p.add_option('--clean-exclude-attributes',action='store_true',
                 dest='do_clean_exclude_attributes',
                 help="Remove the 'kaks', 'kaks2' and 'ncbi' attributes (to remove "
                 "arbitrary attributes, see the --remove-attribute=... option)")
    p.add_option('--clean-group-sgds',action='store_true',dest='do_clean_group_sgds',
                 help="Group features with the same SGD by adding unique numbers to the 'ID' "
                 "attributes; IDs will have the form 'CDS:<SGD>:<n>' (where n is a unique "
                 "number for a given SGD)")
    p.add_option('--report-duplicates',action='store_true',dest='report_duplicates',
                 help="Report duplicate SGD names and write list to <file>_duplicates.gff "
                 "with line numbers, chromosome, start coordinate and strand.")
    p.add_option('--resolve-duplicates',action='store',dest='mapping_file',default=None,
                 help="Resolve duplicate SGDs by matching against 'best' genes in the supplied "
                 "mapping file; other non-matching genes are discarded and written to "
                 "<file>_discarded.gff.")
    p.add_option('--discard-unresolved',action='store_true',dest='discard_unresolved',
                 help="Discard any unresolved duplicates, which are written to "
                 "<file>_unresolved.gff.")
    p.add_option('--insert-missing',action='store',dest='gene_file',default=None,
                 help="Insert genes from gene file with SGD names that don't appear in the "
                 "input GFF. If GENE_FILE is blank ('='s must still be present) then the mapping "
                 "file supplied with the --resolve-duplicates option will be used instead.")
    p.add_option('--add-exon-ids',action='store_true',dest='add_exon_ids',default=False,
                 help="For exon features without an ID attribute, construct and insert an "
                 "ID of the form 'exon:<Parent>:<n>' (where n is a unique number).")
    p.add_option('--add-missing-ids',action='store_true',dest='add_missing_ids',default=False,
                 help="For features without an ID attribute, construct and insert a "
                 "generated ID of the form '<feature>:<Parent>:<n>' (where n is a unique "
                 "number).")
    p.add_option('--no-percent-encoding',action='store_true',dest='no_encoding',default=False,
                 help="Convert encoded attributes to the correct characters in "
                 "the output GFF. WARNING this may result in a non-cannonical GFF that can't "
                 "be read correctly by this or other programs.")
    p.add_option('--remove-attribute',action='append',dest='rm_attr',
                 help="Remove attribute RM_ATTR from the list of attributes for all records "
                 "in the GFF file (can be specified multiple times)")
    p.add_option('--strict-attributes',action='store_true',dest='strict_attributes',
                 help="Remove attributes that don't conform to the KEY=VALUE format")
    p.add_option('--debug',action='store_true',dest='debug',
                 help="Print debugging information")

    # Process the command line
    options,arguments = p.parse_args()

    # Check for debugging
    if options.debug:
        # Turn on debugging output
        logging.getLogger().setLevel(logging.DEBUG)

    # Input files
    if len(arguments) != 1:
        p.error("input GFF file required")
    else:
        infile = arguments[0]
        if not os.path.exists(infile):
            p.error("Input file '%s' not found" % infile)

    # Report version
    p.print_version()

    # Set flags based on command line

    # String to prepend to first column
    prepend_str = options.prepend_str
    # Cleaning options
    if options.do_clean:
        # Update values in the "score" column
        clean_score = True
        # Clean up the "attributes" column
        clean_replace_attributes = True
        clean_exclude_attributes = True
        # Set ID field in "attributes" to group lines with matching SGDs
        group_SGDs = True
    else:
        # Set options based on user input
        clean_score = options.do_clean_score
        clean_replace_attributes = options.do_clean_replace_attributes
        clean_exclude_attributes = options.do_clean_exclude_attributes
        group_SGDs = options.do_clean_group_sgds
    # Report duplicate names
    report_duplicates = options.report_duplicates
    # Resolve duplicated genes using CDS file
    if options.mapping_file is not None:
        resolve_duplicates = True
        cdsfile = options.mapping_file
    else:
        resolve_duplicates = False
        cdsfile = None
    # Discard unresolved duplicates
    discard_unresolved = options.discard_unresolved
    # Insert missing genes
    if options.gene_file is not None:
        insert_missing = True
        if options.gene_file:
            genefile = options.gene_file
        else:
            genefile = cdsfile
    else:
        insert_missing = False
        genefile = None
    # Add an artificial exon ID attribute
    add_exon_ids = options.add_exon_ids
    # Add generated ID attributes
    add_missing_ids = options.add_missing_ids
    # Suppress encoding of attributes on output
    no_attribute_encoding = options.no_encoding
    # Remove attributes that don't conform to KEY=VALUE format
    strict_attributes = options.strict_attributes

    # Name for output files
    ##outext = os.path.splitext(os.path.basename(infile))[1]
    if not options.output_gff:
        outbase = os.path.splitext(os.path.basename(infile))[0]
        outfile = outbase+'_clean.gff'
    else:
        outbase = os.path.splitext(os.path.basename(options.output_gff))[0]
        outfile = options.output_gff
    print "Input : %s" % infile
    print "Output: %s" % outfile
    dupfile = outbase+'_duplicates.txt'
    delfile = outbase+'_discarded.gff'
    unresfile = outbase+'_unresolved.gff'

    # Read in data from file
    gff_data = GFFFile(infile)

    # Prepend string to seqname column
    if prepend_str is not None:
        print "Prepending '%s' to values in 'seqname' column" % prepend_str
        for data in gff_data:
            data['seqname'] = prepend_str+str(data['seqname'])

    # Check/clean score column values
    if clean_score:
        print "Replacing 'Anc_*' and blanks with '0's in 'score' column"
        score_unexpected_values = []
        for data in gff_data:
            try:
                # Numerical value
                score = float(data['score'])
                if score != 0:
                    score_unexpected_values.append(data['score'])
            except ValueError:
                # String value
                if data['score'].strip() != '' and not data['score'].startswith('Anc_'):
                    score_unexpected_values.append(data['score'])
            # Replace "Anc_*" or blank values in "score" column with zero
            if data['score'].startswith('Anc_') or data['score'].strip() == '':
                data['score'] = '0'
        # Report unexpected values
        n = len(score_unexpected_values)
        if n > 0:
            logging.warning("%d 'score' values that are not '', 0 or 'Anc_*'" % n)
            logging.warning("Other values: %s" % score_unexpected_values)

    # Clean up the data in "attributes" column: replace keys
    if clean_replace_attributes:
        # Initialise mapping of keys from input to output in "attributes" column
        # where new values are required etc
        attributes_key_map = OrderedDictionary()
        attributes_key_map['ID'] = 'SGD'
        attributes_key_map['Gene'] = 'SGD'
        attributes_key_map['Parent'] = 'SGD'
        attributes_key_map['Name'] = 'SGD'
        attributes_dont_replace_with_empty_data = True
        print "Cleaning up attributes: replacing keys:"
        for key in attributes_key_map.keys():
            print "\t%s -> %s" % (key,attributes_key_map[key])
        if attributes_dont_replace_with_empty_data:
            print "(Replacement will be skipped if new data is missing/blank)"
        GFFUpdateAttributes(gff_data,attributes_key_map,[],
                            attributes_dont_replace_with_empty_data)

    # Clean up the data in "attributes" column: exclude keys
    if clean_exclude_attributes:
        # List of keys to exclude
        attributes_exclude_keys = ['kaks','kaks2','ncbi']
        print "Excluding keys:"
        for key in attributes_exclude_keys:
            print "\t%s" % key
        # NOTE(review): final True argument presumably switches on removal of
        # attributes without a KEY=VALUE form as well -- confirm against the
        # GFFUpdateAttributes signature
        GFFUpdateAttributes(gff_data,{},attributes_exclude_keys,True)

    # Set the IDs for consecutive lines with matching SGD names, to indicate that
    # they're in the same gene
    if group_SGDs:
        print "Grouping SGDs by setting ID's for consecutive lines with the same SGD values"
        GFFGroupSGDs(gff_data)

    # Find duplicates in input file
    # (duplicate_sgds maps each duplicated SGD name to the list of GFF data
    # lines carrying that name)
    if report_duplicates or resolve_duplicates:
        duplicate_sgds = GFFGetDuplicateSGDs(gff_data)

    if report_duplicates:
        # Write to duplicates file
        print "Writing duplicate SGD names to %s" % dupfile
        fd = open(dupfile,'w')
        ndup = 0
        ngroups = 0
        for sgd in duplicate_sgds.keys():
            # Sanity check: only genuinely duplicated SGDs should be present
            assert(len(duplicate_sgds[sgd]) > 1)
            ndup += 1
            fd.write("%s\t" % sgd)
            for data in duplicate_sgds[sgd]:
                # Write the line number, chromosome, start and end data
                line = ';'.join(('L'+str(data.lineno()),
                                 str(data['seqname']),str(data['start']),str(data['end'])))
                fd.write("\t%s" % line)
            fd.write("\n")
            logging.debug("%s\t%s" % (sgd,duplicate_sgds[sgd]))
            # Count SGDs whose duplicates split into more than one gene
            # subset group
            for group in GroupGeneSubsets(duplicate_sgds[sgd]):
                if len(group) > 1: ngroups += 1
        if ndup == 0:
            fd.write("No duplicate SGDs\n")
        fd.close()
        # NOTE(review): 'trivial' here reports ngroups (multi-line subsets);
        # check that the label matches the intended meaning
        print "%d duplicates found (of which %d are trivial)" % (ndup,ngroups)

    if resolve_duplicates:
        print "Resolving duplicate SGDs using data from %s" % cdsfile
        print "Discarded genes will be written to %s" % delfile
        # Get data on best gene mappings from CDS file
        # Format is tab-delimited, each line has:
        # orf      chr      start     end      strand
        mapping = TabFile(cdsfile,column_names=('name','chr','start','end','strand'))
        # Overlap margin (bases) allowed when matching genes to mapping data
        overlap_margin = 1000
        # Perform resolution
        result = GFFResolveDuplicateSGDs(gff_data,mapping,duplicate_sgds,overlap_margin)
        #
        # Report the results
        #
        # Convenience variables for lists of unresolved, discarded etc duplicates
        resolved_sgds = result['resolved_sgds']
        unresolved_sgds_no_mapping_genes = result['unresolved_sgds_no_mapping_genes']
        unresolved_sgds_no_mapping_genes_after_filter = \
            result['unresolved_sgds_no_mapping_genes_after_filter']
        unresolved_sgds_no_overlaps = result['unresolved_sgds_no_overlaps']
        unresolved_sgds_multiple_matches = result['unresolved_sgds_multiple_matches']
        discard = result['discard']
        # Remaining unresolved cases
        if len(unresolved_sgds_no_mapping_genes) > 0:
            print "No mapping genes with same SGDs found in %s:" % cdsfile
            for sgd in unresolved_sgds_no_mapping_genes:
                print "\t%s" % sgd
            print
        if len(unresolved_sgds_no_mapping_genes_after_filter) > 0:
            print "No mapping genes with same chromosome and/or strand:"
            for sgd in unresolved_sgds_no_mapping_genes_after_filter:
                print "\t%s" % sgd
            print
        if len(unresolved_sgds_no_overlaps) > 0:
            print "No mapping genes with overlaps:"
            for sgd in unresolved_sgds_no_overlaps:
                print "\t%s" % sgd
            print
        if len(unresolved_sgds_multiple_matches) > 0:
            print "Multiple matching mapping genes:"
            for sgd in unresolved_sgds_multiple_matches:
                print "\t%s" % sgd
            print
        # Summary counts for each case
        print "Total number of duplicated indexes   : %d" % len(duplicate_sgds.keys())
        print "Number of resolved duplicate SGDs    : %d" % len(resolved_sgds)
        print "Unresolved duplicates:"
        print "* No mapping genes with same SGD     : %d" % len(unresolved_sgds_no_mapping_genes)
        print "* No mapping genes with same chr/str : %d" % len(unresolved_sgds_no_mapping_genes_after_filter)
        print "* No mapping genes with overlap      : %d" % len(unresolved_sgds_no_overlaps)
        print "* Multiple mapping genes match       : %d" % len(unresolved_sgds_multiple_matches)

        # Remove discarded duplicates from the data
        print "Removing discarded duplicates and writing to %s" % delfile
        fd = open(delfile,'w')
        for discard_data in discard:
            try:
                ip = gff_data.indexByLineNumber(discard_data.lineno())
                del(gff_data[ip])
                fd.write("%s\n" % discard_data)
            except IndexError:
                # Line already gone (e.g. removed earlier); warn and carry on
                logging.warning("Failed to delete line %d: not found" % discard_data.lineno())
        fd.close()

        # Remove unresolved duplicates if requested
        if discard_unresolved:
            print "Removing unresolved duplicates and writing to %s" % unresfile
            # Get list of unresolved SGDs
            all_unresolved = result['unresolved_sgds']
            # Get list of unresolved duplicates
            unresolved = []
            for data in gff_data:
                attributes = data['attributes']
                if 'SGD' in attributes:
                    if attributes['SGD'] in all_unresolved:
                        unresolved.append(data)
            # Discard them
            # NOTE(review): the loop variable 'discard' shadows the 'discard'
            # list taken from result above; harmless here (that list is no
            # longer used) but worth renaming
            fu = open(unresfile,'w')
            for discard in unresolved:
                try:
                    ip = gff_data.indexByLineNumber(discard.lineno())
                    del(gff_data[ip])
                    fu.write("%s\n" % discard)
                except IndexError:
                    logging.warning("Failed to delete line %d: not found" % discard.lineno())
            fu.close()

    # Look for "missing" genes in mapping file
    if insert_missing:
        # Get name for file with gene list
        # (fall back to the CDS file if no explicit gene file was supplied)
        if genefile is None:
            genefile = cdsfile
        print "Inserting unmatched genes from %s" % genefile
        # Get gene data from CDS file
        # Format is tab-delimited, each line has:
        # orf      chr      start     end      strand
        mapping = TabFile(genefile,column_names=('name','chr','start','end','strand'))
        n_genes_before_insert = len(gff_data)
        gff_data = GFFInsertMissingGenes(gff_data,mapping)
        print "Inserted %d missing genes" % (len(gff_data) - n_genes_before_insert)

    # Construct and insert ID for exons
    if add_exon_ids:
        print "Inserting artificial IDs for exon records"
        gff_data = GFFAddExonIDs(gff_data)

    # Construct and insert missing ID attributes
    if add_missing_ids:
        print "Inserting generated IDs for records where IDs are missing"
        gff_data = GFFAddIDAttributes(gff_data)

    # Strip attributes requested for removal
    if options.rm_attr:
        print "Removing the following attributes from all records:"
        for attr in options.rm_attr:
            print "\t* %s" % attr
        GFFUpdateAttributes(gff_data,exclude_keys=options.rm_attr)

    # Remove attributes that don't conform to KEY=VALUE format
    if strict_attributes:
        print "Removing attributes that don't conform to KEY=VALUE format"
        GFFUpdateAttributes(gff_data,exclude_nokeys=True)

    # Suppress percent encoding of attributes
    if no_attribute_encoding:
        print "Converting encoded special characters in attribute data to non-encoded form"
        logging.warning("!!! Special characters will not be correctly encoded in the output  !!!")
        logging.warning("!!! The resulting GFF may not be readable by this or other programs !!!")
        gff_data = GFFDecodeAttributes(gff_data)

    # Write to output file
    print "Writing output file %s" % outfile
    gff_data.write(outfile)
Exemple #13
0
class MacsXLS:
    """Class for reading and manipulating XLS output from MACS

    Reads the XLS output file from the MACS peak caller and
    processes and stores the information for subsequent manipulation
    and output.

    To read in data from a MACS output file:

    >>> macs = MacsXLS("macs.xls")

    This reads in the data and prepends an additional 'order'
    column (a list of numbers from one to the number of data
    lines).

    To get the MACS version:

    >>> macs.macs_version
    2.0.10

    To access the 'header' information (as a Python list):

    >>> macs.header

    To see the column names (as a Python list):

    >>> macs.columns

    The data is stored as a TabFile object; to access the data
    use the 'data' property, e.g.

    >>> for line in macs.data:
    ...    print "Chr %s Start %s End" % (line['chr'],line['start'],line['end'])

    To sort the data on a particular column use the 'sort_on'
    method, e.g.

    >>> macs.sort_on('chr')

    (Note that the order column is always recalculated after
    sorting.)

    """

    def __init__(self,filen=None,fp=None,name=None):
        """Create a new MacsXLS instance

        Arguments:
          filen: name of the file to read the MACS output from.
            If None then fp argument must be supplied instead.
          fp: file-like object opened for reading. If None then
            filen argument must be supplied instead. If both filen
            and fp are supplied then fp will be used preferentially.
          name: optional name to associate with the data; if None
            then the name is taken from the "# name = ..." header
            line, when present.

        Raises:
          Exception: if no MACS version line could be found in the
            header (i.e. the input doesn't look like MACS output).

        """
        # Store data
        self.__filen = filen
        self.__name = name
        self.__macs_version = None
        self.__command_line = None
        self.__header = []
        self.__data = None
        # Open file, if necessary
        if fp is None:
            fp = open(filen,'r')
        else:
            # Caller owns the handle: clear filen so we don't close it below
            filen = None
        # Iterate over header lines
        for line in fp:
            line = line.strip()
            if line.startswith('#') or line == '':
                # Header line
                self.__header.append(line)
                # Detect/extract data from header
                if line.startswith("# This file is generated by MACS version "):
                    # Look for MACS version (9th whitespace-separated token)
                    self.__macs_version = line.split()[8]
                elif self.__name is None and line.startswith("# name = "):
                    # Look for 'name' if none set
                    self.__name = line[len("# name = "):]
                elif line.startswith("# Command line: "):
                    # Look for command line
                    self.__command_line = line[16:]
            else:
                if self.__data is None:
                    # First line of actual data should be the column names
                    columns = line.split('\t')
                    # Insert an additional column called 'order'
                    columns.insert(0,"order")
                    # Set up TabFile to handle actual data
                    self.__data = TabFile(column_names=columns)
                else:
                    # Assume it's actual data and store it
                    # (leading tab leaves the 'order' column empty for now)
                    self.__data.append(tabdata="\t%s" % line)
        # Close the file handle, if we opened it
        if filen is not None:
            fp.close()
        # Check that we actually got a version line
        # (use the call form of raise: 'raise Exception,msg' is a syntax
        # error under Python 3 and deprecated in Python 2)
        if self.macs_version is None:
            raise Exception("Failed to extract MACS version, not a MACS output file?")
        # Populate the 'order' column
        self.update_order()

    @property
    def filen(self):
        """Return the source file name

        Will be None if the data was read from a file-like object.

        """
        return self.__filen

    @property
    def name(self):
        """Return the name property

        Either the name supplied to the constructor, or the value
        extracted from the "# name = ..." header line (or None).

        """
        return self.__name

    @property
    def macs_version(self):
        """Return the MACS version extracted from the file

        """
        return self.__macs_version

    @property
    def command_line(self):
        """Return the command line string extracted from the header

        This is the value associated with the "# Command line: ..."
        header line.

        Will be 'None' if no matching header line is found, else is
        the string following the ':'.

        """
        return self.__command_line

    @property
    def columns(self):
        """Return the column names for the MACS data

        Returns a list of the column names from the data
        extracted from the file (including the prepended
        'order' column).

        """
        return self.__data.header()

    @property
    def columns_as_xls_header(self):
        """Returns the column name list, with hash prepended

        The '#' is prepended to the first column name only, as
        expected by XLS-style output.

        """
        return ['#'+self.columns[0]] + self.columns[1:]

    @property
    def header(self):
        """Return the header data from the file

        Returns a list of lines comprising the header
        extracted from the file.

        """
        return self.__header

    @property
    def data(self):
        """Return the data from the file

        Returns a TabFile object comprising the data
        extracted from the file.

        """
        return self.__data

    @property
    def with_broad_option(self):
        """Returns True if MACS was run with --broad option

        If --broad wasn't detected then returns False.

        """
        if self.macs_version.startswith('1.'):
            # Not an option in MACS 1.*
            return False
        try:
            # Was --broad specified in the command line?
            return '--broad' in self.command_line.split()
        except AttributeError:
            # No command line? Check for 'abs_summit' column
            # (absent when --broad was used)
            return 'abs_summit' not in self.columns

    def sort_on(self,column,reverse=True):
        """Sort data on specified column

        Sorts the data in-place, by the specified column.

        By default data is sorted in descending order; set
        'reverse' argument to False to sort values in ascending
        order instead

        Note that the 'order' column is automatically updated
        after each sorting operation.

        Arguments:
          column: name of the column to sort on
          reverse: if True (default) then sort in descending
            order (i.e. largest to smallest). Otherwise sort in
            ascending order.

        """
        # Sort the data
        self.__data.sort(lambda line: line[column],reverse=reverse)
        # Update the 'order' column
        self.update_order()

    def update_order(self):
        """Set/update the values in the 'order' column

        Renumbers the 'order' column from 1 up to the number of
        data lines, following the current line ordering.

        """
        for i in range(0,len(self.__data)):
            self.__data[i]['order'] = i+1