# Example #1 (score: 0)
    def __init__(self,f,multiline=False):
        """
        Create a new MetricsSummary instance

        Arguments:
          f (str): path to the 'metrics_summary.csv' file
          multiline (bool): if True then expect multiple lines
            of data (default is to expect a single line of
            data)
        """
        # Record whether multiple data lines are expected
        self._multiline = multiline
        # Load the file contents and break into lines
        with open(f,'rt') as fp:
            contents = fp.read()
        self._data = dict()
        lines = contents.strip().split('\n')
        # Single-line mode requires exactly header + one data line
        if not self._multiline and len(lines) != 2:
            raise Exception("%s: MetricsSummary expects 2 lines (or specify "
                            "multi-line mode)" % f)
        # First line supplies the column names
        TabFile.__init__(self,
                         column_names=self._tokenise(lines[0]),
                         delimiter=',')
        # Remaining lines hold the data
        for data_line in lines[1:]:
            self.append(data=self._tokenise(data_line))
 def add_per_lane_statistics(self):
     """
     Add a section with the per-lane statistics

     Reads the per-lane statistics file (if present) and adds a
     table with one row per line of the file; read-count columns
     are pretty-printed and an 'Assigned/unassigned' stacked bar
     image is appended to each row. Does nothing if the file is
     missing.
     """
     # Per-lane statistics
     if not os.path.exists(self._per_lane_stats_file):
         logger.debug("No per-lane statistics file found")
         return
     per_lane_stats = self.add_section("Per-lane statistics",
                                       name="per_lane_stats")
     stats = TabFile(self._per_lane_stats_file, first_line_is_header=True)
     tbl = Table(columns=stats.header())
     # Extra column to hold the stacked bar plot
     tbl.append_columns("Assigned/unassigned")
     for line in stats:
         n = tbl.add_row()
         for c in stats.header():
             # Humanise large read counts; copy other columns verbatim
             if c in ("Total reads", "Assigned reads", "Unassigned reads"):
                 value = pretty_print_reads(line[c])
             else:
                 value = line[c]
             tbl.set_value(n, c, value)
         # Stacked bar visualising assigned vs unassigned reads
         tbl.set_value(
             n, "Assigned/unassigned",
             Img(
                 ustackedbar(
                     (line["Assigned reads"], line["Unassigned reads"]),
                     length=100,
                     height=15,
                     colors=('red', 'white'),
                     inline=True)))
     per_lane_stats.add(tbl)
     self.add_to_toc("Per-lane statistics", per_lane_stats)
# Example #3 (score: 0)
    def __init__(self, filen=None, fp=None, name=None):
        """Create a new MacsXLS instance

        Arguments:
          filen: name of the file to read the MACS output from.
            If None then fp argument must be supplied instead.
          fp: file-like object opened for reading. If None then
            filen argument must be supplied instead. If both filen
            and fp are supplied then fp will be used preferentially.
          name: optional name to associate with the data; if None
            then the name is taken from the '# name = ' header
            line (if present).

        Raises:
          Exception: if no MACS version line is found in the
            header (i.e. the input is not MACS output).

        """
        # Store data
        self.__filen = filen
        self.__name = name
        self.__macs_version = None
        self.__command_line = None
        self.__header = []
        self.__data = None
        # Open file, if necessary
        if fp is None:
            fp = open(filen, 'r')
        else:
            filen = None
        try:
            # Iterate over lines, splitting header from data
            for line in fp:
                line = line.strip()
                if line.startswith('#') or line == '':
                    # Header line
                    self.__header.append(line)
                    # Detect/extract data from header
                    if line.startswith(
                            "# This file is generated by MACS version "):
                        # Look for MACS version
                        self.__macs_version = line.split()[8]
                    elif self.__name is None and line.startswith("# name = "):
                        # Look for 'name' if none set
                        self.__name = line[len("# name = "):]
                    elif line.startswith("# Command line: "):
                        # Look for command line
                        self.__command_line = line[16:]
                else:
                    if self.__data is None:
                        # First line of actual data should be the column names
                        columns = line.split('\t')
                        # Insert an additional column called 'order'
                        columns.insert(0, "order")
                        # Set up TabFile to handle actual data
                        self.__data = TabFile(column_names=columns)
                    else:
                        # Assume it's actual data and store it
                        self.__data.append(tabdata="\t%s" % line)
        finally:
            # Close the file handle, if we opened it (also ensures
            # the handle isn't leaked if parsing raises)
            if filen is not None:
                fp.close()
        # Check that we actually got a version line
        # NB Python 3-compatible raise (was Py2 'raise Exception, ...')
        if self.macs_version is None:
            raise Exception(
                "Failed to extract MACS version, not a MACS output file?")
        # Populate the 'order' column
        self.update_order()
# Example #4 (score: 0)
    def __init__(self, well_list_file):
        """
        Create a new ICell8WellList instance.

        Arguments:
          well_list_file (str): path to the well list
            file.
        """
        # Parse the tab-delimited well list file; the first
        # (uncommented) line supplies the column names
        self._data = TabFile(filen=well_list_file, first_line_is_header=True)
    def __init__(self,f):
        """
        Create a new AtacSummary instance

        Arguments:
          f (str): path to the 'summary.csv' file
        """
        # Load the comma-delimited summary file; the first line
        # supplies the column names
        TabFile.__init__(self,
                         filen=f,
                         first_line_is_header=True,
                         delimiter=',')
    def __init__(self, summary_file=None):
        """
        Create a new FastqcSummary instance

        Arguments:
          summary_file (str): optional path to a FastQC
            'summary.txt' file to load data from
        """
        # Columns are fixed by the FastQC summary format
        TabFile.__init__(self, column_names=("Status", "Module", "File"))
        if summary_file:
            summary_file = os.path.abspath(summary_file)
            # Load each line of the summary as a row of data
            with open(summary_file, "r") as fh:
                for entry in fh:
                    self.append(tabdata=entry.strip())
        self._summary_file = summary_file
    def __init__(self, screen_file):
        """
        Create a new FastqscreenData instance

        Arguments:
          screen_file (str): path to a fastq_screen output
            file to load data from

        Raises:
          Exception: if no data can be extracted from the file.
        """
        TabFile.__init__(self,
                         column_names=(
                             'Library',
                             '%Unmapped',
                             '%One_hit_one_library',
                             '%Multiple_hits_one_library',
                             '%One_hit_multiple_libraries',
                             '%Multiple_hits_multiple_libraries',
                         ))
        self._screen_file = os.path.abspath(screen_file)
        self._version = None
        self._no_hits = None
        # Read in data
        tabfile = None
        with open(self._screen_file, 'r') as fp:
            for line in fp:
                line = line.strip()
                if line.startswith('#Fastq_screen version:'):
                    # Extract the fastq_screen version
                    self._version = line.split()[2]
                    continue
                elif line.startswith('Library') or line.startswith('Genome'):
                    # Column header line: collect data via TabFile
                    tabfile = TabFile(column_names=line.split())
                    continue
                elif line.startswith('%Hit_no_libraries:') or \
                     line.startswith('%Hit_no_genomes:'):
                    # Percentage of reads with no hits
                    self._no_hits = float(line.split()[-1])
                    continue
                elif not line or \
                   line.startswith('#') or \
                   line.startswith('%'):
                    # Skip blanks, comments and other summary lines
                    continue
                if tabfile is None:
                    # Data line encountered before any column header
                    # (previously failed with an obscure NameError)
                    raise Exception("Unable to extract fastq_screen data "
                                    "from %s" % self._screen_file)
                tabfile.append(tabdata=line)
        # Check that a column header line was found
        if tabfile is None:
            raise Exception("Unable to extract fastq_screen data from %s" %
                            self._screen_file)
        # Handle different terminology for different versions
        if tabfile.header()[0] == 'Library':
            # 'Library'-style column headers
            library = 'Library'
            unmapped = '%Unmapped'
            one_hit_one_library = '%One_hit_one_library'
            multiple_hits_one_library = '%Multiple_hits_one_library'
            one_hit_multiple_libraries = '%One_hit_multiple_libraries'
            multiple_hits_multiple_libraries = '%Multiple_hits_multiple_libraries'
        elif tabfile.header()[0] == 'Genome':
            # 'Genome'-style column headers
            library = 'Genome'
            unmapped = '%Unmapped'
            one_hit_one_library = '%One_hit_one_genome'
            multiple_hits_one_library = '%Multiple_hits_one_genome'
            one_hit_multiple_libraries = '%One_hit_multiple_genomes'
            multiple_hits_multiple_libraries = '%Multiple_hits_multiple_genomes'
        else:
            # Unrecognised header (previously fell through to a
            # NameError when copying data below)
            raise Exception("Unrecognised column headers in %s" %
                            self._screen_file)
        # Copy data to main object
        for line in tabfile:
            data = [
                line[library], line[unmapped], line[one_hit_one_library],
                line[multiple_hits_one_library],
                line[one_hit_multiple_libraries],
                line[multiple_hits_multiple_libraries]
            ]
            self.append(data=data)
    def __init__(self,fastq_strand_out):
        """
        Create a new Fastqstrand instance

        Arguments:
          fastq_strand_out (str): path to an output file
            from the 'fastq_strand' utility
        """
        self._fastq_strand_out = os.path.abspath(fastq_strand_out)
        self._version = None
        self._genomes = AttributeDictionary()
        # Parse the output file into an intermediate TabFile
        tabfile = None
        with open(self._fastq_strand_out,'r') as fp:
            for raw in fp:
                raw = raw.strip()
                if raw.startswith('#fastq_strand version:'):
                    # Extract the fastq_strand version
                    self._version = raw.split()[2]
                elif raw.startswith('#Genome'):
                    # Column header line (leading '#' stripped)
                    tabfile = TabFile(column_names=raw[1:].split('\t'))
                else:
                    tabfile.append(tabdata=raw)
        # Check there is some data
        if tabfile is None:
            raise Exception("Unable to extract fastq_strand data from %s" %
                            self._fastq_strand_out)
        # Transfer the data into per-genome dictionaries
        for row in tabfile:
            stats = AttributeDictionary()
            self._genomes[row['Genome']] = stats
            stats['forward'] = row['1st forward']
            stats['reverse'] = row['2nd reverse']
            # Forward/reverse ratio: +inf when there are forward
            # but no reverse reads; None when there are neither
            if stats.reverse > 0.0:
                ratio = float(stats.forward)/float(stats.reverse)
            elif stats.forward > 0.0:
                ratio = float("+inf")
            else:
                ratio = None
            # Classify strandedness from the ratio
            if ratio is None:
                strandedness = "undetermined"
            elif ratio < 0.2:
                strandedness = "reverse"
            elif ratio > 5 or ratio == float("+inf"):
                strandedness = "forward"
            else:
                strandedness = "unstranded?"
            stats['ratio'] = ratio
            stats['strandedness'] = strandedness
# Example #9 (score: 0)
    def __init__(self, summary_file=None):
        """
        Create a new FastqcSummary instance

        Arguments:
          summary_file (str): optional path to a FastQC
            'summary.txt' file to load data from
        """
        # Columns are fixed by the FastQC summary format
        TabFile.__init__(self, column_names=(
            'Status',
            'Module',
            'File',
        ))
        if summary_file:
            summary_file = os.path.abspath(summary_file)
            # Load each line of the summary as a row of data
            with open(summary_file, 'r') as fh:
                for row in fh:
                    self.append(tabdata=row.strip())
        self._summary_file = summary_file
# Example #10 (score: 0)
    def __init__(self,filen=None,fp=None,name=None):
        """Create a new MacsXLS instance

        Arguments:
          filen: name of the file to read the MACS output from.
            If None then fp argument must be supplied instead.
          fp: file-like object opened for reading. If None then
            filen argument must be supplied instead. If both filen
            and fp are supplied then fp will be used preferentially.
          name: optional name to associate with the data; if None
            then the name is taken from the '# name = ' header
            line (if present).

        Raises:
          Exception: if no MACS version line is found in the
            header (i.e. the input is not MACS output).

        """
        # Store data
        self.__filen = filen
        self.__name = name
        self.__macs_version = None
        self.__command_line = None
        self.__header = []
        self.__data = None
        # Open file, if necessary
        if fp is None:
            fp = open(filen,'r')
        else:
            filen = None
        try:
            # Iterate over lines, splitting header from data
            for line in fp:
                line = line.strip()
                if line.startswith('#') or line == '':
                    # Header line
                    self.__header.append(line)
                    # Detect/extract data from header
                    if line.startswith("# This file is generated by MACS version "):
                        # Look for MACS version
                        self.__macs_version = line.split()[8]
                    elif self.__name is None and line.startswith("# name = "):
                        # Look for 'name' if none set
                        self.__name = line[len("# name = "):]
                    elif line.startswith("# Command line: "):
                        # Look for command line
                        self.__command_line = line[16:]
                else:
                    if self.__data is None:
                        # First line of actual data should be the column names
                        columns = line.split('\t')
                        # Insert an additional column called 'order'
                        columns.insert(0,"order")
                        # Set up TabFile to handle actual data
                        self.__data = TabFile(column_names=columns)
                    else:
                        # Assume it's actual data and store it
                        self.__data.append(tabdata="\t%s" % line)
        finally:
            # Close the file handle, if we opened it (also ensures
            # the handle isn't leaked if parsing raises)
            if filen is not None:
                fp.close()
        # Check that we actually got a version line
        # NB Python 3-compatible raise (was Py2 'raise Exception, ...')
        if self.macs_version is None:
            raise Exception(
                "Failed to extract MACS version, not a MACS output file?")
        # Populate the 'order' column
        self.update_order()
    def __init__(self,screen_file):
        """
        Create a new FastqscreenData instance

        Arguments:
          screen_file (str): path to a fastq_screen output
            file to load data from

        Raises:
          Exception: if no data can be extracted from the file.
        """
        TabFile.__init__(self,
                         column_names=('Library',
                                       '%Unmapped',
                                       '%One_hit_one_library',
                                       '%Multiple_hits_one_library',
                                       '%One_hit_multiple_libraries',
                                       '%Multiple_hits_multiple_libraries',))
        self._screen_file = os.path.abspath(screen_file)
        self._version = None
        self._no_hits = None
        # Read in data
        tabfile = None
        with open(self._screen_file,'r') as fp:
            for line in fp:
                line = line.strip()
                if line.startswith('#Fastq_screen version:'):
                    # Extract the fastq_screen version
                    self._version = line.split()[2]
                    continue
                elif line.startswith('Library') or line.startswith('Genome'):
                    # Column header line: collect data via TabFile
                    tabfile = TabFile(column_names=line.split())
                    continue
                elif line.startswith('%Hit_no_libraries:') or \
                     line.startswith('%Hit_no_genomes:'):
                    # Percentage of reads with no hits
                    self._no_hits = float(line.split()[-1])
                    continue
                elif not line or \
                   line.startswith('#') or \
                   line.startswith('%'):
                    # Skip blanks, comments and other summary lines
                    continue
                if tabfile is None:
                    # Data line encountered before any column header
                    # (previously failed with an obscure NameError)
                    raise Exception("Unable to extract fastq_screen data "
                                    "from %s" % self._screen_file)
                tabfile.append(tabdata=line)
        # Check that a column header line was found
        if tabfile is None:
            raise Exception("Unable to extract fastq_screen data from %s" %
                            self._screen_file)
        # Handle different terminology for different versions
        if tabfile.header()[0] == 'Library':
            # 'Library'-style column headers
            library = 'Library'
            unmapped = '%Unmapped'
            one_hit_one_library = '%One_hit_one_library'
            multiple_hits_one_library = '%Multiple_hits_one_library'
            one_hit_multiple_libraries = '%One_hit_multiple_libraries'
            multiple_hits_multiple_libraries = '%Multiple_hits_multiple_libraries'
        elif tabfile.header()[0] == 'Genome':
            # 'Genome'-style column headers
            library = 'Genome'
            unmapped = '%Unmapped'
            one_hit_one_library = '%One_hit_one_genome'
            multiple_hits_one_library = '%Multiple_hits_one_genome'
            one_hit_multiple_libraries = '%One_hit_multiple_genomes'
            multiple_hits_multiple_libraries = '%Multiple_hits_multiple_genomes'
        else:
            # Unrecognised header (previously fell through to a
            # NameError when copying data below)
            raise Exception("Unrecognised column headers in %s" %
                            self._screen_file)
        # Copy data to main object
        for line in tabfile:
            data = [line[library],
                    line[unmapped],
                    line[one_hit_one_library],
                    line[multiple_hits_one_library],
                    line[one_hit_multiple_libraries],
                    line[multiple_hits_multiple_libraries]]
            self.append(data=data)
    def __init__(self,screen_file):
        """
        Create a new FastqscreenData instance

        Arguments:
          screen_file (str): path to a fastq_screen output
            file to load data from

        Raises:
          Exception: if no data can be extracted from the file.
        """
        TabFile.__init__(self,
                         column_names=('Library',
                                       '%Unmapped',
                                       '%One_hit_one_library',
                                       '%Multiple_hits_one_library',
                                       '%One_hit_multiple_libraries',
                                       '%Multiple_hits_multiple_libraries',))
        self._screen_file = os.path.abspath(screen_file)
        self._version = None
        self._no_hits = None
        # Read in data
        tabfile = None
        with open(self._screen_file,'r') as fp:
            for line in fp:
                line = line.strip()
                if line.startswith('#Fastq_screen version:'):
                    # Extract the fastq_screen version
                    self._version = line.split()[2]
                    continue
                elif line.startswith('Library'):
                    # Column header line: collect data via TabFile
                    tabfile = TabFile(column_names=line.split())
                    continue
                elif line.startswith('%Hit_no_libraries:'):
                    # Percentage of reads with no hits
                    self._no_hits = float(line.split()[-1])
                    continue
                elif not line or \
                   line.startswith('#') or \
                   line.startswith('%'):
                    # Skip blanks, comments and other summary lines
                    continue
                if tabfile is None:
                    # Data line encountered before any column header
                    # (previously failed with an obscure NameError)
                    raise Exception("Unable to extract fastq_screen data "
                                    "from %s" % self._screen_file)
                tabfile.append(tabdata=line)
        # Check that a column header line was found
        if tabfile is None:
            raise Exception("Unable to extract fastq_screen data from %s" %
                            self._screen_file)
        # Move data to main object
        for line in tabfile:
            data = []
            for col in self.header():
                data.append(line[col])
            self.append(data=data)
class ICell8WellList:
    """
    Class representing an ICELL8 well list file

    The file is tab-delimited and consists of an uncommented header
    line which lists the fields ('Row','Col','Candidate',...),
    followed by lines of data.

    The key columns are 'Sample' (gives the cell type) and 'Barcode'
    (the inline barcode sequence).
    """
    def __init__(self,well_list_file):
        """
        Create a new ICell8WellList instance.

        Arguments:
          well_list_file (str): path to the well list
            file.
        """
        # Parse the well list; first line is the header
        self._data = TabFile(filen=well_list_file,
                             first_line_is_header=True)

    def barcodes(self):
        """
        Return a list of barcodes
        """
        return [row['Barcode'] for row in self._data]

    def samples(self):
        """
        Return a sorted list of unique sample names
        """
        return sorted({row['Sample'] for row in self._data})

    def sample(self,barcode):
        """
        Return sample (=cell type) corresponding to barcode

        Raises KeyError if the barcode cannot be located.
        """
        matches = self._data.lookup('Barcode',barcode)
        if not matches:
            raise KeyError("Failed to locate sample for '%s'" % barcode)
        return matches[0]['Sample']
# Example #14 (score: 0)
    def report_basic_stats(self,out_file=None,fp=None):
        """
        Report the 'basic' statistics

        For each FASTQ file, report the following information:

        - Project name
        - Sample name
        - FASTQ file name (without leading directory)
        - Size (human-readable)
        - Nreads (number of reads)
        - Paired_end ('Y' for paired-end, 'N' for single-end)

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
        """
        # Work out where the report should be written
        if fp is not None:
            output = fp
        elif out_file is not None:
            output = open(out_file,'w')
        else:
            output = sys.stdout
        # Assemble the subset of columns to report
        basic_stats = TabFile(column_names=('Project',
                                            'Sample',
                                            'Fastq',
                                            'Size',
                                            'Nreads',
                                            'Paired_end'))
        for line in self._stats:
            basic_stats.append(data=[line[col]
                                     for col in basic_stats.header()])
        basic_stats.write(fp=output,include_header=True)
        # Close the output stream only if we opened it here
        if fp is None and out_file is not None:
            output.close()
# Example #15 (score: 0)
    # Remove the working directory
    shutil.rmtree(working_dir)

    # Report the stats
    if args.stats_file is not None:
        # Output column names; the optional suffix distinguishes
        # columns from repeated runs appended to the same file
        stats_file = os.path.abspath(args.stats_file)
        nreads_col = "Nreads%s" % ('' if args.suffix is None else args.suffix)
        umis_col = "Distinct_UMIs%s" % ('' if args.suffix is None else
                                        args.suffix)
        if not (os.path.isfile(stats_file) and args.append):
            # Create new stats file
            if well_list is not None:
                # Initialise barcode and sample names from well list
                stats_data = TabFile(column_names=('Barcode', 'Sample'))
                for barcode in well_list.barcodes():
                    stats_data.append(data=(barcode,
                                            well_list.sample(barcode)))
            else:
                # Barcodes from collected data
                stats_data = TabFile(column_names=('Barcode', ))
                for barcode in stats.barcodes():
                    stats_data.append(data=(barcode, ))
        else:
            # Append to an existing file
            stats_data = TabFile(filen=stats_file, first_line_is_header=True)
        # Add new columns of data
        stats_data.appendColumn(nreads_col)
        stats_data.appendColumn(umis_col)
        # Populate columns
# Example #16 (score: 0)
class MacsXLS:
    """Class for reading and manipulating XLS output from MACS

    Reads the XLS output file from the MACS peak caller and
    processes and stores the information for subsequent manipulation
    and output.

    To read in data from a MACS output file:

    >>> macs = MacsXLS("macs.xls")

    This reads in the data and prepends an additional 'order'
    column (a list of numbers from one to the number of data
    lines).

    To get the MACS version:

    >>> macs.macs_version
    2.0.10

    To access the 'header' information (as a Python list):

    >>> macs.header

    To see the column names (as a Python list):

    >>> macs.columns

    The data is stored as a TabFile object; to access the data
    use the 'data' property, e.g.

    >>> for line in macs.data:
    ...    print("Chr %s Start %s End" % (line['chr'],line['start'],line['end']))

    To sort the data on a particular column use the 'sort_on'
    method, e.g.

    >>> macs.sort_on('chr')

    (Note that the order column is always recalculated after
    sorting.)

    """

    def __init__(self,filen=None,fp=None,name=None):
        """Create a new MacsXLS instance

        Arguments:
          filen: name of the file to read the MACS output from.
            If None then fp argument must be supplied instead.
          fp: file-like object opened for reading. If None then
            filen argument must be supplied instead. If both filen
            and fp are supplied then fp will be used preferentially.
          name: optional name to associate with the data; if None
            then the name is taken from the '# name = ' header
            line (if present).

        Raises:
          Exception: if no MACS version line is found in the
            header (i.e. the input is not MACS output).

        """
        # Store data
        self.__filen = filen
        self.__name = name
        self.__macs_version = None
        self.__command_line = None
        self.__header = []
        self.__data = None
        # Open file, if necessary
        if fp is None:
            fp = open(filen,'r')
        else:
            filen = None
        try:
            # Iterate over lines, splitting header from data
            for line in fp:
                line = line.strip()
                if line.startswith('#') or line == '':
                    # Header line
                    self.__header.append(line)
                    # Detect/extract data from header
                    if line.startswith("# This file is generated by MACS version "):
                        # Look for MACS version
                        self.__macs_version = line.split()[8]
                    elif self.__name is None and line.startswith("# name = "):
                        # Look for 'name' if none set
                        self.__name = line[len("# name = "):]
                    elif line.startswith("# Command line: "):
                        # Look for command line
                        self.__command_line = line[16:]
                else:
                    if self.__data is None:
                        # First line of actual data should be the column names
                        columns = line.split('\t')
                        # Insert an additional column called 'order'
                        columns.insert(0,"order")
                        # Set up TabFile to handle actual data
                        self.__data = TabFile(column_names=columns)
                    else:
                        # Assume it's actual data and store it
                        self.__data.append(tabdata="\t%s" % line)
        finally:
            # Close the file handle, if we opened it (also ensures
            # the handle isn't leaked if parsing raises)
            if filen is not None:
                fp.close()
        # Check that we actually got a version line
        # NB Python 3-compatible raise (was Py2 'raise Exception, ...')
        if self.macs_version is None:
            raise Exception(
                "Failed to extract MACS version, not a MACS output file?")
        # Populate the 'order' column
        self.update_order()

    @property
    def filen(self):
        """Return the source file name

        """
        return self.__filen

    @property
    def name(self):
        """Return the name property

        """
        return self.__name

    @property
    def macs_version(self):
        """Return the MACS version extracted from the file

        """
        return self.__macs_version

    @property
    def command_line(self):
        """Return the command line string extracted from the header

        This is the value associated with the "# Command line: ..."
        header line.

        Will be 'None' if no matching header line is found, else is
        the string following the ':'.

        """
        return self.__command_line

    @property
    def columns(self):
        """Return the column names for the MACS data

        Returns a list of the column names from the data
        extracted from the file.

        """
        return self.__data.header()

    @property
    def columns_as_xls_header(self):
        """Returns the column name list, with hash prepended

        """
        return ['#'+self.columns[0]] + self.columns[1:]

    @property
    def header(self):
        """Return the header data from the file

        Returns a list of lines comprising the header
        extracted from the file.

        """
        return self.__header

    @property
    def data(self):
        """Return the data from the file

        Returns a TabFile object comprising the data
        extracted from the file.

        """
        return self.__data

    @property
    def with_broad_option(self):
        """Returns True if MACS was run with --broad option

        If --broad wasn't detected then returns False.

        """
        if self.macs_version.startswith('1.'):
            # Not an option in MACS 1.*
            return False
        try:
            # Was --broad specified in the command line?
            return '--broad' in self.command_line.split()
        except AttributeError:
            # No command line? Check for 'abs_summit' column
            return 'abs_summit' not in self.columns

    def sort_on(self,column,reverse=True):
        """Sort data on specified column

        Sorts the data in-place, by the specified column.

        By default data is sorted in descending order; set
        'reverse' argument to False to sort values in ascending
        order instead

        Note that the 'order' column is automatically updated
        after each sorting operation.

        Arguments:
          column: name of the column to sort on
          reverse: if True (default) then sort in descending
            order (i.e. largest to smallest). Otherwise sort in
            ascending order.

        """
        # Sort the data
        self.__data.sort(lambda line: line[column],reverse=reverse)
        # Update the 'order' column
        self.update_order()

    def update_order(self):
        """Set/update the 'order' column to 1-based line numbers

        """
        for i in range(len(self.__data)):
            self.__data[i]['order'] = i+1
 def add_per_fastq_statistics(self):
     """
     Add a section with the per-Fastq statistics

     Reads the per-Fastq statistics file (if present) and adds
     one subsection per project, each holding a table with one
     row per Fastq. Fastqs with zero reads in any lane are
     flagged with a warning. Does nothing if the file is missing.
     """
     # Per fastq statistics
     if not os.path.exists(self._stats_file):
         logger.debug("No per-Fastq statistics file found")
         return
     per_file_stats = self.add_section("Per-file statistics by project",
                                       name="per_file_stats")
     project_toc_list = List()
     per_file_stats.add(project_toc_list)
     stats = TabFile(self._stats_file, first_line_is_header=True)
     projects = sorted(list(set([d['Project'] for d in stats])))
     # NOTE(review): any column whose name starts with 'L' is
     # treated as a lane column — confirm against the stats file
     lanes = [c for c in stats.header() if c.startswith('L')]
     sample = None
     for project in projects:
         # Get subset of lines for this project
         subset = sorted([d for d in stats if d['Project'] == project],
                         key=lambda l: split_sample_name(l['Sample']))
         # Determine which lanes this project appears in
         subset_lanes = []
         for l in lanes:
             for d in subset:
                 if d[l]:
                     subset_lanes.append(l)
                     break
         # Add a new section for this project
         s = per_file_stats.add_subsection("%s" % project,
                                           name="per_file_stats_%s" %
                                           project)
         # Check for problems (any Fastq with no reads in a lane)
         has_warnings = False
         for line in subset:
             nreads = [line[l] for l in subset_lanes if line[l] != '']
             if not nreads or min(nreads) == 0:
                 s.add(
                     self.warning("One or more Fastqs with zero read "
                                  "counts in one or more lanes"))
                 has_warnings = True
                 break
         # Add link to project from ToC
         link = Link("%s" % project, s)
         if not has_warnings:
             project_toc_list.add_item(link)
         else:
             project_toc_list.add_item(WarningIcon(), link)
             self.flag_warnings()
         # Build the table of data
         tbl = Table(columns=('Sample', 'Fastq', 'Size'))
         if subset_lanes:
             tbl.append_columns(*subset_lanes)
         tbl.append_columns('Barplot', 'Nreads')
         s.add(tbl)
         for line in subset:
             # Suppress repeated sample names on consecutive rows
             if sample == line['Sample']:
                 sname = "&nbsp;"
             else:
                 sample = line['Sample']
                 sname = sample
             data = {
                 'Sample':
                 sname,
                 'Fastq':
                 line['Fastq'],
                 'Size':
                 line['Size'],
                 'Nreads': (pretty_print_reads(line['Nreads'])
                            if line['Nreads'] != '' else '')
             }
             # Humanise the per-lane read counts
             for l in subset_lanes:
                 data[l] = (pretty_print_reads(line[l])
                            if line[l] != '' else '')
             nreads = [line[l] for l in subset_lanes if line[l] != '']
             if not nreads:
                 nreads = [
                     0,
                 ]
             if min(nreads) == 0:
                 # Add warning icon to Fastq with no reads in
                 # at least one lane
                 data['Fastq'] = Para(WarningIcon(), data['Fastq'])
             # Stacked bar visualising the per-lane read counts
             barplot = ustackedbar(nreads,
                                   length=100,
                                   height=10,
                                   colors=('grey', 'lightgrey'),
                                   bbox=True,
                                   inline=True)
             data['Barplot'] = Img(barplot)
             tbl.add_row(**data)
     # Add to table of contents
     self.add_to_toc("Per-file statistics by project", per_file_stats,
                     project_toc_list)
# Example #18 (score: 0)
def report_processing_qc(analysis_dir, html_file):
    """
    Generate HTML report for processing statistics

    Builds an HTML document with up to three sections, each one
    only included if the corresponding statistics file can be
    found: per-lane statistics, per-lane statistics broken down
    by sample, and per-file statistics grouped by project.

    Arguments:
      analysis_dir (AnalysisDir): analysis directory object;
        its 'params' attribute is consulted for the locations
        of the per-lane and per-file statistics files
      html_file (str): destination path and file name for
        HTML report
    """
    # Initialise the HTML report
    processing_qc = Document("Processing report for %s" %
                             os.path.basename(analysis_dir.analysis_dir))
    processing_qc.add_css_rule(css_rules.QC_REPORT_CSS_RULES)
    processing_qc.add_css_rule("table { font-size: 80%;\n"
                               "        font-family: sans-serif; }")
    processing_qc.add_css_rule("td { text-align: right; }")
    # Add table of contents
    toc = processing_qc.add_section("Contents", name="toc")
    toc_list = List()
    toc.add(toc_list)
    # Per-lane statistics
    per_lane_stats_file = analysis_dir.params.per_lane_stats_file
    if per_lane_stats_file is None:
        per_lane_stats_file = "per_lane_statistics.info"
    if os.path.exists(per_lane_stats_file):
        per_lane_stats = processing_qc.add_section("Per-lane statistics",
                                                   name="per_lane_stats")
        stats = TabFile(per_lane_stats_file, first_line_is_header=True)
        tbl = Table(columns=stats.header())
        tbl.append_columns("Assigned/unassigned")
        for line in stats:
            n = tbl.add_row()
            for c in stats.header():
                # Read counts are reformatted for readability
                if c in ("Total reads", "Assigned reads", "Unassigned reads"):
                    value = pretty_print_reads(line[c])
                else:
                    value = line[c]
                tbl.set_value(n, c, value)
            # Mini stacked barplot of assigned vs unassigned reads
            tbl.set_value(
                n, "Assigned/unassigned",
                Img(
                    ustackedbar(
                        (line["Assigned reads"], line["Unassigned reads"]),
                        length=100,
                        height=15,
                        colors=('red', 'white'),
                        inline=True)))
        per_lane_stats.add(tbl)
        toc_list.add_item(Link("Per-lane statistics", per_lane_stats))
    # Per lane by sample statistics
    per_lane_sample_stats_file = "per_lane_sample_stats.info"
    if os.path.exists(per_lane_sample_stats_file):
        per_lane_sample_stats = processing_qc.add_section(
            "Per-lane statistics by sample", name="per_lane_sample_stats")
        lane_toc_list = List()
        per_lane_sample_stats.add(lane_toc_list)
        # Store the data for each lane
        # (consistency fix: reuse the file name that was checked
        # above rather than repeating the hard-coded string)
        with open(per_lane_sample_stats_file) as stats:
            lane_data = []
            for line in stats:
                if line.startswith("Lane "):
                    lane = int(line.split(' ')[-1])
                    lane_data.append({
                        'lane': lane,
                        'total_reads': None,
                        'samples': []
                    })
                elif line.startswith("Total reads = "):
                    total_reads = int(line.split('=')[-1].strip())
                    lane_data[-1]['total_reads'] = total_reads
                elif line.startswith("- "):
                    # Sample lines look like:
                    # "- PROJECT/SAMPLE   NREADS   PERCENT"
                    pname = line.split()[1].split('/')[0]
                    sname = line.split()[1].split('/')[1]
                    nreads = int(line.split()[2])
                    percreads = line.split()[3]
                    lane_data[-1]['samples'].append({
                        'pname': pname,
                        'sname': sname,
                        'nreads': nreads,
                        'percreads': percreads
                    })
        # Create a section and table for each lane
        for data in lane_data:
            lane = data['lane']
            max_reads = max([d['nreads'] for d in data['samples']])
            total_reads = data['total_reads']
            s = per_lane_sample_stats.add_subsection(
                "Lane %d" % lane, name="per_lane_sample_stats_lane%d" % lane)
            lane_toc_list.add_item(Link("Lane %d" % lane, s))
            current_project = None
            tbl = Table(
                columns=('pname', 'sname', 'nreads', 'percreads', 'barplot'),
                pname='Project',
                sname='Sample',
                nreads='Nreads',
                percreads='%reads',
                barplot='',
            )
            s.add(tbl)
            for sample in data['samples']:
                pname = sample['pname']
                sname = sample['sname']
                nreads = sample['nreads']
                percreads = sample['percreads']
                # Only show the project name on its first row
                if pname == current_project:
                    pname = "&nbsp;"
                else:
                    current_project = pname
                # Barplot scaled against the biggest sample in the lane
                barplot = ustackedbar((nreads, max_reads - nreads),
                                      length=100,
                                      height=5,
                                      colors=('black', 'lightgrey'),
                                      bbox=False,
                                      inline=True)
                tbl.add_row(pname=pname,
                            sname=sname,
                            nreads=pretty_print_reads(nreads),
                            percreads=percreads,
                            barplot=Img(barplot))
            tbl.add_row(pname="Total reads for lane %d" % lane,
                        nreads=pretty_print_reads(total_reads))
        toc_list.add_item(
            Link("Per-lane statistics by sample", per_lane_sample_stats),
            lane_toc_list)
    # Per fastq statistics
    stats_file = "statistics_full.info"
    if not os.path.exists(stats_file):
        if analysis_dir.params.stats_file is not None:
            stats_file = analysis_dir.params.stats_file
        else:
            stats_file = "statistics.info"
    if os.path.exists(stats_file):
        per_file_stats = processing_qc.add_section(
            "Per-file statistics by project", name="per_file_stats")
        project_toc_list = List()
        per_file_stats.add(project_toc_list)
        stats = TabFile(stats_file, first_line_is_header=True)
        projects = sorted(list(set([d['Project'] for d in stats])))
        # Lane columns are named 'L<n>' in the stats file
        # NOTE: Python 2 semantics assumed — filter() must return a
        # list that can be re-iterated for every project below
        lanes = filter(lambda c: c.startswith('L'), stats.header())
        for project in projects:
            subset = filter(lambda d: d['Project'] == project, stats)
            # Only report lanes which have data for this project
            subset_lanes = filter(
                lambda l: reduce(lambda x, y: x or bool(y),
                                 [d[l] for d in subset], False), lanes)
            s = per_file_stats.add_subsection("%s" % project,
                                              name="per_file_stats_%s" %
                                              project)
            project_toc_list.add_item(Link("%s" % project, s))
            tbl = Table(columns=('Sample', 'Fastq', 'Size'))
            if subset_lanes:
                tbl.append_columns(*subset_lanes)
            tbl.append_columns('Barplot', 'Nreads')
            s.add(tbl)
            # Bug fix: reset the 'last seen sample' tracker for each
            # project, so that a sample name is never suppressed just
            # because the previous project ended with the same name
            sample = None
            for line in subset:
                # Only show the sample name on its first row
                if sample == line['Sample']:
                    sname = "&nbsp;"
                else:
                    sample = line['Sample']
                    sname = sample
                data = {
                    'Sample':
                    sname,
                    'Fastq':
                    line['Fastq'],
                    'Size':
                    line['Size'],
                    'Nreads': (pretty_print_reads(line['Nreads'])
                               if line['Nreads'] != '' else '')
                }
                for l in subset_lanes:
                    data[l] = (pretty_print_reads(line[l])
                               if line[l] != '' else '')
                barplot = ustackedbar(filter(lambda n: n != '',
                                             [line[l] for l in subset_lanes]),
                                      length=100,
                                      height=10,
                                      colors=('grey', 'lightgrey'),
                                      bbox=True,
                                      inline=True)
                data['Barplot'] = Img(barplot)
                tbl.add_row(**data)
        toc_list.add_item(
            Link("Per-file statistics by project", per_file_stats),
            project_toc_list)
    # Write the processing QC summary file
    processing_qc.write(html_file)
    # NOTE(review): orphaned script fragment — it relies on 'args' and
    # 'macs_in' being defined in an enclosing scope that is not shown
    # here, and uses Python 2 'print' statements
    # Build output file name: if not explicitly supplied on the command
    # line then use "XLS_<input_name>.xls"
    if len(args) == 2:
        xls_out = args[1]
    else:
        # MACS output file might already have an .xls extension
        # but we'll add an explicit .xls extension
        xls_out = "XLS_"+os.path.splitext(os.path.basename(macs_in))[0]+".xls"
    print "Input file: %s" % macs_in
    print "Output XLS: %s" % xls_out

    # Extract the header from the MACS and feed actual data to
    # TabFile object
    # Comment lines (starting '#') and blank lines are collected as
    # header text; everything else is treated as tab-delimited data
    header = []
    data = TabFile(column_names=['chr','start','end','length','summit','tags',
                                 '-10*log10(pvalue)','fold_enrichment','FDR(%)'])
    fp = open(macs_in,'r')
    for line in fp:
        if line.startswith('#') or line.strip() == '':
            # Header line
            header.append(line.strip())
        else:
            # Data
            data.append(tabdata=line.strip())
    fp.close()

    # Temporarily remove first line (kept in 'header_line') so the
    # remaining lines are pure data
    header_line = str(data[0])
    del(data[0])

    # Attempt to detect MACS version
Beispiel #20
0
    # NOTE(review): orphaned script fragment (near-duplicate of the one
    # above, with the first half of its opening comment lost) — relies
    # on 'args' and 'macs_in' from an enclosing scope not shown here,
    # and uses Python 2 'print' statements
    # line then use "XLS_<input_name>.xls"
    if len(args) == 2:
        xls_out = args[1]
    else:
        # MACS output file might already have an .xls extension
        # but we'll add an explicit .xls extension
        xls_out = "XLS_" + os.path.splitext(
            os.path.basename(macs_in))[0] + ".xls"
    print "Input file: %s" % macs_in
    print "Output XLS: %s" % xls_out

    # Extract the header from the MACS and feed actual data to
    # TabFile object
    # Comment lines (starting '#') and blank lines are collected as
    # header text; everything else is treated as tab-delimited data
    header = []
    data = TabFile(column_names=[
        'chr', 'start', 'end', 'length', 'summit', 'tags', '-10*log10(pvalue)',
        'fold_enrichment', 'FDR(%)'
    ])
    fp = open(macs_in, 'r')
    for line in fp:
        if line.startswith('#') or line.strip() == '':
            # Header line
            header.append(line.strip())
        else:
            # Data
            data.append(tabdata=line.strip())
    fp.close()

    # Temporarily remove first line (kept in 'header_line') so the
    # remaining lines are pure data
    header_line = str(data[0])
    del (data[0])
Beispiel #21
0
 def _get_data(self, filen=None):
     """
     Collect statistics for FASTQ outputs from an Illumina run

     Arguments:
       filen (str): optional, path to an existing statistics
         file; if supplied then its data are merged with the
         newly collected statistics
     """
     # Collect FASTQ files
     fastqstats = []
     for project in self._illumina_data.projects:
         for sample in project.samples:
             for fastq in sample.fastq:
                 fastqstats.append(
                     FastqStats(os.path.join(sample.dirn, fastq),
                                project.name, sample.name))
     # Gather same information for undetermined reads (if present)
     if self._illumina_data.undetermined is not None:
         for lane in self._illumina_data.undetermined.samples:
             for fastq in lane.fastq:
                 fastqstats.append(
                     FastqStats(os.path.join(lane.dirn, fastq),
                                self._illumina_data.undetermined.name,
                                lane.name))
     # Collect the data for each file
     if self._n_processors > 1:
         # Multiple cores
         pool = Pool(self._n_processors)
         results = pool.map(collect_fastq_data, fastqstats)
         pool.close()
         pool.join()
     else:
         # Single core
         # NOTE(review): Python 2 semantics assumed — 'results' (and
         # the filter() results below) must be real lists that can be
         # re-iterated and indexed
         results = map(collect_fastq_data, fastqstats)
     # Set up tabfile to hold pre-existing data
     if filen is not None:
         existing_stats = TabFile(filen, first_line_is_header=True)
     else:
         existing_stats = None
     # Set up class to hold all collected data
     self._stats = TabFile(column_names=('Project', 'Sample', 'Fastq',
                                         'Size', 'Nreads', 'Paired_end',
                                         'Read_number'))
     # Split result sets into R1 and R2
     results_r1 = filter(lambda f: f.read_number == 1, results)
     results_r2 = filter(lambda f: f.read_number == 2, results)
     # Determine which lanes are present and append
     # columns for each
     lanes = set()
     for fastq in results_r1:
         logger.debug("-- %s: lanes %s" %
                      (fastq.name, ','.join([str(l) for l in fastq.lanes])))
         for lane in fastq.lanes:
             lanes.add(lane)
     # Add lane numbers from pre-existing stats file
     # (lane columns are named 'L<n>')
     if existing_stats is not None:
         for c in existing_stats.header():
             if c.startswith('L'):
                 lanes.add(int(c[1:]))
     self._lanes = sorted(list(lanes))
     logger.debug("Lanes found: %s" %
                  ','.join([str(l) for l in self._lanes]))
     for lane in self._lanes:
         self._stats.appendColumn("L%s" % lane)
     # Copy pre-existing stats into new tabfile
     if existing_stats:
         for line in existing_stats:
             data = [
                 line['Project'], line['Sample'], line['Fastq'],
                 line['Size'], line['Nreads'], line['Paired_end'],
                 line['Read_number']
             ]
             for lane in lanes:
                 try:
                     data.append(line["L%s" % lane])
                 except:
                     # Lane column missing from the pre-existing
                     # file: leave the value blank
                     data.append('')
             self._stats.append(data=data)
     # Copy reads per lane from R1 FASTQs into R2
     for r2_fastq in results_r2:
         # Get corresponding R1 name
         logger.debug("-- Fastq R2: %s" % r2_fastq.name)
         r1_fastq_name = IlluminaFastq(r2_fastq.name)
         r1_fastq_name.read_number = 1
         r1_fastq_name = str(r1_fastq_name)
         logger.debug("--    -> R1: %s" % r1_fastq_name)
         # Locate corresponding data
         # (raises IndexError if no matching R1 FASTQ is found)
         r1_fastq = filter(lambda f: f.name.startswith(r1_fastq_name),
                           results_r1)[0]
         r2_fastq.reads_by_lane = dict(r1_fastq.reads_by_lane)
     # Write the data into the tabfile
     paired_end = ('Y' if self._illumina_data.paired_end else 'N')
     for fastq in results:
         # Check for existing entry
         existing_entry = False
         for line in self._stats:
             if (line['Project'] == fastq.project
                     and line['Sample'] == fastq.sample
                     and line['Fastq'] == fastq.name):
                 # Overwrite the existing entry
                 existing_entry = True
                 break
         # Write the data
         if not existing_entry:
             # Append new entry
             data = [
                 fastq.project, fastq.sample, fastq.name,
                 bcf_utils.format_file_size(fastq.fsize), fastq.nreads,
                 paired_end, fastq.read_number
             ]
             for lane in lanes:
                 try:
                     data.append(fastq.reads_by_lane[lane])
                 except:
                     # No reads recorded for this lane: leave blank
                     data.append('')
             self._stats.append(data=data)
         else:
             # Overwrite existing entry
             # ('line' is the matching row left over from the search
             # loop above; NOTE(review): "exisiting" typo in the log
             # message — fix separately)
             logging.warning("Overwriting exisiting entry for "
                             "%s/%s/%s" %
                             (fastq.project, fastq.sample, fastq.name))
             line['Size'] = bcf_utils.format_file_size(fastq.fsize)
             line['Nreads'] = fastq.nreads
             line['Paired_end'] = paired_end
             line['Read_number'] = fastq.read_number
             for lane in lanes:
                 lane_name = "L%d" % lane
                 try:
                     line[lane_name] = fastq.reads_by_lane[lane]
                 except:
                     line[lane_name] = ''
Beispiel #22
0
class FastqStatistics:
    """
    Class for collecting and reporting stats on Illumina FASTQs

    Given a directory with fastq(.gz) files arranged in the same
    structure as the output from bcl2fastq or bcl2fastq2,
    collects statistics for each file and provides methods for
    reporting different aspects.

    Example usage:

    >>> from IlluminaData import IlluminaData
    >>> data = IlluminaData('120117_BLAH_JSHJHXXX','bcl2fastq')
    >>> stats = FastqStatistics(data)
    >>> stats.report_basic_stats('basic_stats.out')

    """
    def __init__(self, illumina_data, n_processors=1, add_to=None):
        """
        Create a new FastqStatistics instance

        Arguments:
          illumina_data: populated IlluminaData object describing the
            run.
          n_processors: number of processors to use (if >1 then uses
            the multiprocessing library to run the statistics gathering
            using multiple cores).
          add_to: optional, add the data to that from an existing
            statistics file
        """
        self._illumina_data = illumina_data
        self._n_processors = n_processors
        # TabFile holding the collected statistics (set by _get_data)
        self._stats = None
        # NOTE(review): '_lane_names' appears to be unused; the
        # 'lane_names' property is computed from '_lanes' (set in
        # _get_data) instead — confirm before removing
        self._lane_names = []
        # Collect the statistics (optionally merging 'add_to' data)
        self._get_data(filen=add_to)

    def _get_data(self, filen=None):
        """
        Collect statistics for FASTQ outputs from an Illumina run

        Arguments:
          filen (str): optional, path to an existing statistics
            file; if supplied then its data are merged with the
            newly collected statistics
        """
        # Collect FASTQ files
        fastqstats = []
        for project in self._illumina_data.projects:
            for sample in project.samples:
                for fastq in sample.fastq:
                    fastqstats.append(
                        FastqStats(os.path.join(sample.dirn, fastq),
                                   project.name, sample.name))
        # Gather same information for undetermined reads (if present)
        if self._illumina_data.undetermined is not None:
            for lane in self._illumina_data.undetermined.samples:
                for fastq in lane.fastq:
                    fastqstats.append(
                        FastqStats(os.path.join(lane.dirn, fastq),
                                   self._illumina_data.undetermined.name,
                                   lane.name))
        # Collect the data for each file
        if self._n_processors > 1:
            # Multiple cores
            pool = Pool(self._n_processors)
            results = pool.map(collect_fastq_data, fastqstats)
            pool.close()
            pool.join()
        else:
            # Single core
            # NOTE(review): Python 2 semantics assumed — 'results' (and
            # the filter() results below) must be real lists that can
            # be re-iterated and indexed
            results = map(collect_fastq_data, fastqstats)
        # Set up tabfile to hold pre-existing data
        if filen is not None:
            existing_stats = TabFile(filen, first_line_is_header=True)
        else:
            existing_stats = None
        # Set up class to hold all collected data
        self._stats = TabFile(column_names=('Project', 'Sample', 'Fastq',
                                            'Size', 'Nreads', 'Paired_end',
                                            'Read_number'))
        # Split result sets into R1 and R2
        results_r1 = filter(lambda f: f.read_number == 1, results)
        results_r2 = filter(lambda f: f.read_number == 2, results)
        # Determine which lanes are present and append
        # columns for each
        lanes = set()
        for fastq in results_r1:
            logger.debug("-- %s: lanes %s" %
                         (fastq.name, ','.join([str(l) for l in fastq.lanes])))
            for lane in fastq.lanes:
                lanes.add(lane)
        # Add lane numbers from pre-existing stats file
        # (lane columns are named 'L<n>')
        if existing_stats is not None:
            for c in existing_stats.header():
                if c.startswith('L'):
                    lanes.add(int(c[1:]))
        self._lanes = sorted(list(lanes))
        logger.debug("Lanes found: %s" %
                     ','.join([str(l) for l in self._lanes]))
        for lane in self._lanes:
            self._stats.appendColumn("L%s" % lane)
        # Copy pre-existing stats into new tabfile
        if existing_stats:
            for line in existing_stats:
                data = [
                    line['Project'], line['Sample'], line['Fastq'],
                    line['Size'], line['Nreads'], line['Paired_end'],
                    line['Read_number']
                ]
                for lane in lanes:
                    try:
                        data.append(line["L%s" % lane])
                    except:
                        # Lane column missing from the pre-existing
                        # file: leave the value blank
                        data.append('')
                self._stats.append(data=data)
        # Copy reads per lane from R1 FASTQs into R2
        for r2_fastq in results_r2:
            # Get corresponding R1 name
            logger.debug("-- Fastq R2: %s" % r2_fastq.name)
            r1_fastq_name = IlluminaFastq(r2_fastq.name)
            r1_fastq_name.read_number = 1
            r1_fastq_name = str(r1_fastq_name)
            logger.debug("--    -> R1: %s" % r1_fastq_name)
            # Locate corresponding data
            # (raises IndexError if no matching R1 FASTQ is found)
            r1_fastq = filter(lambda f: f.name.startswith(r1_fastq_name),
                              results_r1)[0]
            r2_fastq.reads_by_lane = dict(r1_fastq.reads_by_lane)
        # Write the data into the tabfile
        paired_end = ('Y' if self._illumina_data.paired_end else 'N')
        for fastq in results:
            # Check for existing entry
            existing_entry = False
            for line in self._stats:
                if (line['Project'] == fastq.project
                        and line['Sample'] == fastq.sample
                        and line['Fastq'] == fastq.name):
                    # Overwrite the existing entry
                    existing_entry = True
                    break
            # Write the data
            if not existing_entry:
                # Append new entry
                data = [
                    fastq.project, fastq.sample, fastq.name,
                    bcf_utils.format_file_size(fastq.fsize), fastq.nreads,
                    paired_end, fastq.read_number
                ]
                for lane in lanes:
                    try:
                        data.append(fastq.reads_by_lane[lane])
                    except:
                        # No reads recorded for this lane: leave blank
                        data.append('')
                self._stats.append(data=data)
            else:
                # Overwrite existing entry
                # ('line' is the matching row left over from the
                # search loop above; NOTE(review): "exisiting" typo in
                # the log message — fix separately)
                logging.warning("Overwriting exisiting entry for "
                                "%s/%s/%s" %
                                (fastq.project, fastq.sample, fastq.name))
                line['Size'] = bcf_utils.format_file_size(fastq.fsize)
                line['Nreads'] = fastq.nreads
                line['Paired_end'] = paired_end
                line['Read_number'] = fastq.read_number
                for lane in lanes:
                    lane_name = "L%d" % lane
                    try:
                        line[lane_name] = fastq.reads_by_lane[lane]
                    except:
                        line[lane_name] = ''

    @property
    def lane_names(self):
        """
        Return list of lane names (e.g. ['L1','L2',...])
        """
        return [("L%d" % l) for l in self._lanes]

    @property
    def raw(self):
        """
        Return the 'raw' statistics TabFile instance
        """
        return self._stats

    def report_full_stats(self, out_file=None, fp=None):
        """
        Report all statistics gathered for all FASTQs

        Essentially a dump of all the data.

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
        """
        # Determine output stream
        if fp is None:
            if out_file is None:
                fpp = sys.stdout
            else:
                fpp = open(out_file, 'w')
        else:
            fpp = fp
        # Report
        self._stats.write(fp=fpp, include_header=True)
        # Close file (only if this method opened it)
        if fp is None and out_file is not None:
            fpp.close()

    def report_basic_stats(self, out_file=None, fp=None):
        """
        Report the 'basic' statistics

        For each FASTQ file, report the following information:

        - Project name
        - Sample name
        - FASTQ file name (without leading directory)
        - Size (human-readable)
        - Nreads (number of reads)
        - Paired_end ('Y' for paired-end, 'N' for single-end)

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
        """
        # Determine output stream
        if fp is None:
            if out_file is None:
                fpp = sys.stdout
            else:
                fpp = open(out_file, 'w')
        else:
            fpp = fp
        # Report
        # Copy just the 'basic' subset of columns from the raw data
        stats = TabFile(column_names=('Project', 'Sample', 'Fastq', 'Size',
                                      'Nreads', 'Paired_end'))
        for line in self._stats:
            data = [line[c] for c in stats.header()]
            stats.append(data=data)
        stats.write(fp=fpp, include_header=True)
        # Close file (only if this method opened it)
        if fp is None and out_file is not None:
            fpp.close()

    def report_per_lane_sample_stats(self, out_file=None, fp=None):
        """
        Report of reads per sample in each lane

        Reports the number of reads for each sample in each
        lane plus the total reads for each lane.

        Example output:

        Lane 1
        Total reads = 182851745
        - KatyDobbs/KD-K1      79888058        43.7%
        - KatyDobbs/KD-K3      97854292        53.5%
        - Undetermined_indices/lane1       5109395 2.8%
        ...

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
        """
        # Determine output stream
        if fp is None:
            if out_file is None:
                fpp = sys.stdout
            else:
                fpp = open(out_file, 'w')
        else:
            fpp = fp
        # Report
        lanes = self.lane_names
        for lane in lanes:
            # Strip the leading 'L' to get the lane number
            lane_number = int(lane[1:])
            # Only count R1 entries with a non-empty count for
            # this lane
            samples = filter(lambda x: x['Read_number'] == 1 and bool(x[lane]),
                             self._stats)
            try:
                total_reads = sum([int(s[lane]) for s in samples])
            except Exception as ex:
                # Identify which sample(s) had the bad value before
                # re-raising the original exception
                for s in samples:
                    try:
                        int(s[lane])
                    except ValueError:
                        logging.critical("Bad value for read count in "
                                         "lane %s sample %s: '%s'" %
                                         (lane, s['Sample'], s[lane]))
                raise ex
            fpp.write("\nLane %d\n" % lane_number)
            fpp.write("Total reads = %d\n" % total_reads)
            # NOTE(review): division below assumes total_reads > 0 for
            # any lane with samples — confirm upstream guarantees this
            for sample in samples:
                sample_name = "%s/%s" % (sample['Project'], sample['Sample'])
                nreads = float(sample[lane])
                fpp.write("- %s\t%d\t%.1f%%\n" %
                          (sample_name, nreads, nreads / total_reads * 100.0))
        # Close file (only if this method opened it)
        if fp is None and out_file is not None:
            fpp.close()

    def report_per_lane_summary_stats(self, out_file=None, fp=None):
        """
        Report summary of total and unassigned reads per-lane

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
        """
        # Determine output stream
        if fp is None:
            if out_file is None:
                fpp = sys.stdout
            else:
                fpp = open(out_file, 'w')
        else:
            fpp = fp
        # Set up TabFile to hold the data collected
        per_lane_stats = TabFile(column_names=('Lane', 'Total reads',
                                               'Assigned reads',
                                               'Unassigned reads', '%assigned',
                                               '%unassigned'))
        # Initialise counts for each lane
        assigned = {}
        unassigned = {}
        for lane in self.lane_names:
            assigned[lane] = 0
            unassigned[lane] = 0
        # Count assigned and unassigned (= undetermined) reads
        # (only R1, non-index-read FASTQs are counted)
        for line in filter(
                lambda x: x['Read_number'] == 1 and not IlluminaFastq(x[
                    'Fastq']).is_index_read, self._stats):
            if line['Project'] != 'Undetermined_indices':
                counts = assigned
            else:
                counts = unassigned
            for lane in self.lane_names:
                if line[lane]:
                    # KeyError branch is defensive: both dicts are
                    # pre-initialised for every lane above
                    try:
                        counts[lane] += line[lane]
                    except KeyError:
                        counts[lane] = line[lane]
        # Write out data for each lane
        for lane in self.lane_names:
            lane_number = int(lane[1:])
            assigned_reads = assigned[lane]
            try:
                unassigned_reads = unassigned[lane]
            except KeyError:
                # lane doesn't have any unassigned reads
                unassigned_reads = 0
            total_reads = assigned_reads + unassigned_reads
            if total_reads > 0:
                percent_assigned = float(assigned_reads)/ \
                                   float(total_reads)*100.0
                percent_unassigned = float(unassigned_reads)/ \
                                     float(total_reads)*100.0
            else:
                percent_assigned = 0.0
                percent_unassigned = 0.0
            per_lane_stats.append(data=("Lane %d" % lane_number, total_reads,
                                        assigned_reads, unassigned_reads,
                                        "%.2f" % percent_assigned,
                                        "%.2f" % percent_unassigned))
        # Write to file
        per_lane_stats.write(fp=fpp, include_header=True)
        # Close file (only if this method opened it)
        if fp is None and out_file is not None:
            fpp.close()
Beispiel #23
0
    def report_per_lane_summary_stats(self, out_file=None, fp=None):
        """
        Report summary of total and unassigned reads per-lane

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
        """
        # Work out where to write the report: an explicit stream
        # takes precedence, then a named file, then stdout
        if fp is not None:
            stream = fp
        elif out_file is not None:
            stream = open(out_file, 'w')
        else:
            stream = sys.stdout
        # Tabular container for the per-lane summary rows
        summary = TabFile(column_names=('Lane', 'Total reads',
                                        'Assigned reads',
                                        'Unassigned reads', '%assigned',
                                        '%unassigned'))
        # Zero-initialise the per-lane read tallies
        assigned = dict([(name, 0) for name in self.lane_names])
        unassigned = dict([(name, 0) for name in self.lane_names])
        # Tally reads, considering only R1 non-index-read FASTQs;
        # reads from 'Undetermined_indices' count as unassigned
        for entry in filter(
                lambda x: x['Read_number'] == 1 and not IlluminaFastq(x[
                    'Fastq']).is_index_read, self._stats):
            if entry['Project'] == 'Undetermined_indices':
                tally = unassigned
            else:
                tally = assigned
            for name in self.lane_names:
                if entry[name]:
                    # KeyError branch is defensive: tallies are
                    # pre-initialised for every lane above
                    try:
                        tally[name] += entry[name]
                    except KeyError:
                        tally[name] = entry[name]
        # Build one summary row per lane
        for name in self.lane_names:
            # Strip the leading 'L' to get the lane number
            lane_number = int(name[1:])
            n_assigned = assigned[name]
            try:
                n_unassigned = unassigned[name]
            except KeyError:
                # Lane has no unassigned reads recorded
                n_unassigned = 0
            n_total = n_assigned + n_unassigned
            if n_total > 0:
                pc_assigned = float(n_assigned)/float(n_total)*100.0
                pc_unassigned = float(n_unassigned)/float(n_total)*100.0
            else:
                pc_assigned = 0.0
                pc_unassigned = 0.0
            summary.append(data=("Lane %d" % lane_number, n_total,
                                 n_assigned, n_unassigned,
                                 "%.2f" % pc_assigned,
                                 "%.2f" % pc_unassigned))
        # Write out the summary table
        summary.write(fp=stream, include_header=True)
        # Only close a stream that this method itself opened
        if fp is None and out_file is not None:
            stream.close()
Beispiel #24
0
class MacsXLS:
    """Class for reading and manipulating XLS output from MACS

    Reads the XLS output file from the MACS peak caller and
    processes and stores the information for subsequent manipulation
    and output.

    To read in data from a MACS output file:

    >>> macs = MacsXLS("macs.xls")

    This reads in the data and prepends an additional 'order'
    column (a list of numbers from one to the number of data
    lines).

    To get the MACS version:

    >>> macs.macs_version
    2.0.10

    To access the 'header' information (as a Python list):

    >>> macs.header

    To see the column names (as a Python list):

    >>> macs.columns

    The data is stored as a TabFile object; to access the data
    use the 'data' property, e.g.

    >>> for line in macs.data:
    ...    print("Chr %s Start %s End" % (line['chr'],line['start'],line['end']))

    To sort the data on a particular column use the 'sort_on'
    method, e.g.

    >>> macs.sort_on('chr')

    (Note that the order column is always recalculated after
    sorting.)

    """

    def __init__(self,filen=None,fp=None,name=None):
        """Create a new MacsXLS instance

        Arguments:
          filen: name of the file to read the MACS output from.
            If None then fp argument must be supplied instead.
          fp: file-like object opened for reading. If None then
            filen argument must be supplied instead. If both filen
            and fp are supplied then fp will be used preferentially.
          name: optional name to associate with the data; if None
            then the name is taken from the '# name = ...' header
            line of the file (if present).

        Raises:
          Exception: if no MACS version line can be found in the
            header (i.e. the input doesn't look like MACS output).

        """
        # Store data
        self.__filen = filen
        self.__name = name
        self.__macs_version = None
        self.__command_line = None
        self.__header = []
        self.__data = None
        # Open file, if necessary
        if fp is None:
            fp = open(filen,'r')
        else:
            # Caller owns the supplied file handle; clearing 'filen'
            # ensures we don't close it on exit
            filen = None
        # Iterate over header lines
        for line in fp:
            line = line.strip()
            if line.startswith('#') or line == '':
                # Header line
                self.__header.append(line)
                # Detect/extract data from header
                if line.startswith("# This file is generated by MACS version "):
                    # Look for MACS version
                    self.__macs_version = line.split()[8]
                elif self.__name is None and line.startswith("# name = "):
                    # Look for 'name' if none set
                    self.__name = line[len("# name = "):]
                elif line.startswith("# Command line: "):
                    # Look for command line
                    self.__command_line = line[16:]
            else:
                if self.__data is None:
                    # First line of actual data should be the column names
                    columns = line.split('\t')
                    # Insert an additional column called 'order'
                    columns.insert(0,"order")
                    # Set up TabFile to handle actual data
                    self.__data = TabFile(column_names=columns)
                else:
                    # Assume it's actual data and store it
                    # (leading tab leaves the 'order' column blank
                    # until update_order is called)
                    self.__data.append(tabdata="\t%s" % line)
        # Close the file handle, if we opened it
        if filen is not None:
            fp.close()
        # Check that we actually got a version line
        # NB use call syntax for raise (the old 'raise Exception,msg'
        # form is a syntax error on Python 3)
        if self.macs_version is None:
            raise Exception("Failed to extract MACS version, "
                            "not a MACS output file?")
        # Populate the 'order' column
        self.update_order()

    @property
    def filen(self):
        """Return the source file name (or None if read from a stream)

        """
        return self.__filen

    @property
    def name(self):
        """Return the name property

        """
        return self.__name

    @property
    def macs_version(self):
        """Return the MACS version extracted from the file

        """
        return self.__macs_version

    @property
    def command_line(self):
        """Return the command line string extracted from the header

        This is the value associated with the "# Command line: ..."
        header line.

        Will be 'None' if no matching header line is found, else is
        the string following the ':'.

        """
        return self.__command_line

    @property
    def columns(self):
        """Return the column names for the MACS data

        Returns a list of the column names from the data
        extracted from the file (including the prepended
        'order' column).

        """
        return self.__data.header()

    @property
    def columns_as_xls_header(self):
        """Returns the column name list, with hash prepended

        """
        return ['#'+self.columns[0]] + self.columns[1:]

    @property
    def header(self):
        """Return the header data from the file

        Returns a list of lines comprising the header
        extracted from the file.

        """
        return self.__header

    @property
    def data(self):
        """Return the data from the file

        Returns a TabFile object comprising the data
        extracted from the file.

        """
        return self.__data

    @property
    def with_broad_option(self):
        """Returns True if MACS was run with --broad option

        If --broad wasn't detected then returns False.

        """
        if self.macs_version.startswith('1.'):
            # Not an option in MACS 1.*
            return False
        try:
            # Was --broad specified in the command line?
            return '--broad' in self.command_line.split()
        except AttributeError:
            # No command line? Check for 'abs_summit' column
            # (only present when --broad was NOT used)
            return 'abs_summit' not in self.columns

    def sort_on(self,column,reverse=True):
        """Sort data on specified column

        Sorts the data in-place, by the specified column.

        By default data is sorted in descending order; set
        'reverse' argument to False to sort values in ascending
        order instead
 
        Note that the 'order' column is automatically updated
        after each sorting operation.

        Arguments:
          column: name of the column to sort on
          reverse: if True (default) then sort in descending
            order (i.e. largest to smallest). Otherwise sort in
            ascending order.

        """
        # Sort the data
        self.__data.sort(lambda line: line[column],reverse=reverse)
        # Update the 'order' column
        self.update_order()

    def update_order(self):
        """Set/update values in the 'order' column

        Rewrites the 'order' column so it runs from 1 to the
        number of data lines, in the current line order.

        """
        for i in range(0,len(self.__data)):
            self.__data[i]['order'] = i+1
Beispiel #25
0
def report_processing_qc(analysis_dir, html_file):
    """
    Generate HTML report for processing statistics

    Builds an HTML document summarising the per-lane,
    per-lane-by-sample and per-file processing statistics for
    a sequencing run, flagging lanes/samples/Fastqs with zero
    or missing read counts, and writes the report to file.

    Arguments:
      analysis_dir (AutoProcess): AutoProcess instance for
        the directory to report the processing from
      html_file (str): destination path and file name for
        HTML report
    """
    # Initialise the HTML report
    processing_qc = Document("Processing report for %s" %
                             os.path.basename(analysis_dir.analysis_dir))
    processing_qc.add_css_rule(css_rules.QC_REPORT_CSS_RULES)
    processing_qc.add_css_rule("table { font-size: 80%;\n"
                               "        font-family: sans-serif; }")
    processing_qc.add_css_rule("td { text-align: right; }")
    # NB hex colour values need the leading '#' to be valid CSS
    processing_qc.add_css_rule("p.warning { padding: 5px;\n"
                               "            border: solid 1px red;\n"
                               "            background-color: #F5BCA9;\n"
                               "            color: red;\n"
                               "            font-weight: bold;\n"
                               "            border-radius: 10px;\n"
                               "            display: inline-block; }")
    processing_qc.add_css_rule(".warnings { padding: 2px;\n"
                               "            border: solid 3px red;\n"
                               "            background-color: #F5BCA9;\n"
                               "            color: red;\n"
                               "            font-weight: bold;\n"
                               "            margin: 10px;\n"
                               "            border-radius: 10px;\n"
                               "            display: inline-block; }")
    processing_qc.add_css_rule("img { vertical-align: middle; }")
    processing_qc.add_css_rule(".hide { display: none; }")
    # Add table of contents
    toc = processing_qc.add_section("Contents", name="toc")
    toc_list = List()
    toc.add(toc_list)
    # Add warnings section
    # This will be hidden if there are no issues
    status = True
    warnings = processing_qc.add_section(css_classes=("warnings", ))
    warnings.add(
        Para(WarningIcon(size=50),
             "There are issues with one or more lanes or samples"))
    # Per-lane statistics
    per_lane_stats_file = analysis_dir.params.per_lane_stats_file
    if per_lane_stats_file is None:
        per_lane_stats_file = "per_lane_statistics.info"
    per_lane_stats_file = get_absolute_file_path(
        per_lane_stats_file, base=analysis_dir.analysis_dir)
    if os.path.exists(per_lane_stats_file):
        per_lane_stats = processing_qc.add_section("Per-lane statistics",
                                                   name="per_lane_stats")
        stats = TabFile(per_lane_stats_file, first_line_is_header=True)
        tbl = Table(columns=stats.header())
        tbl.append_columns("Assigned/unassigned")
        for line in stats:
            n = tbl.add_row()
            for c in stats.header():
                if c in ("Total reads", "Assigned reads", "Unassigned reads"):
                    value = pretty_print_reads(line[c])
                else:
                    value = line[c]
                tbl.set_value(n, c, value)
            # Stacked bar showing assigned vs unassigned proportions
            tbl.set_value(
                n, "Assigned/unassigned",
                Img(
                    ustackedbar(
                        (line["Assigned reads"], line["Unassigned reads"]),
                        length=100,
                        height=15,
                        colors=('red', 'white'),
                        inline=True)))
        per_lane_stats.add(tbl)
        toc_list.add_item(Link("Per-lane statistics", per_lane_stats))
    # Per lane by sample statistics
    per_lane_sample_stats_file = get_absolute_file_path(
        "per_lane_sample_stats.info", base=analysis_dir.analysis_dir)
    if os.path.exists(per_lane_sample_stats_file):
        per_lane_sample_stats = processing_qc.add_section(
            "Per-lane statistics by sample", name="per_lane_sample_stats")
        lane_toc_list = List()
        per_lane_sample_stats.add(lane_toc_list)
        # Store the data for each lane
        lane_data = list()
        with open(per_lane_sample_stats_file, 'r') as stats:
            for line in stats:
                if line.startswith("Lane "):
                    # Start of a new lane's data
                    lane = int(line.split(' ')[-1])
                    lane_data.append({
                        'lane': lane,
                        'total_reads': None,
                        'samples': []
                    })
                elif line.startswith("Total reads = "):
                    total_reads = int(line.split('=')[-1].strip())
                    lane_data[-1]['total_reads'] = total_reads
                elif line.startswith("- "):
                    # Per-sample line: "- PROJECT/SAMPLE NREADS PERCENT"
                    pname = line.split()[1].split('/')[0]
                    sname = line.split()[1].split('/')[1]
                    nreads = int(line.split()[2])
                    percreads = line.split()[3]
                    lane_data[-1]['samples'].append({
                        'pname': pname,
                        'sname': sname,
                        'nreads': nreads,
                        'percreads': percreads
                    })
        # Create a section and table for each lane
        for data in lane_data:
            lane = data['lane']
            s = per_lane_sample_stats.add_subsection(
                "Lane %d" % lane, name="per_lane_sample_stats_lane%d" % lane)
            # Check for problems
            has_warnings = False
            if not data['samples']:
                # No samples reported
                s.add(
                    Para(WarningIcon(),
                         "No samples reported for this lane",
                         css_classes=('warning', )))
                has_warnings = True
            elif min([d['nreads'] for d in data['samples']]) == 0:
                # There are samples with no reads
                s.add(
                    Para(WarningIcon(),
                         "One or more samples with no reads",
                         css_classes=('warning', )))
                has_warnings = True
            # Add link to lane for lane ToC
            link = Link("Lane %d" % lane, s)
            if not has_warnings:
                lane_toc_list.add_item(link)
            else:
                lane_toc_list.add_item(WarningIcon(), link)
                status = False
            # Write out the data, if there is any
            if not data['samples']:
                continue
            max_reads = max([d['nreads'] for d in data['samples']])
            total_reads = data['total_reads']
            current_project = None
            tbl = Table(
                columns=('pname', 'sname', 'nreads', 'percreads', 'barplot'),
                pname='Project',
                sname='Sample',
                nreads='Nreads',
                percreads='%reads',
                barplot='',
            )
            s.add(tbl)
            # Sort the sample data into order of sample name
            # (NB 'sorted' makes a copy so the stored data are not
            # reordered; 'smp' avoids shadowing the section var 's')
            samples = sorted(data['samples'],
                             key=lambda smp: split_sample_name(smp['sname']))
            # Write the table
            for sample in samples:
                pname = sample['pname']
                sname = sample['sname']
                nreads = sample['nreads']
                percreads = sample['percreads']
                if pname == current_project:
                    # Same project as previous row: don't repeat name
                    pname = "&nbsp;"
                else:
                    current_project = pname
                barplot = ustackedbar((nreads, max_reads - nreads),
                                      length=100,
                                      height=5,
                                      colors=('black', 'lightgrey'),
                                      bbox=False,
                                      inline=True)
                if nreads == 0:
                    sname = Para(WarningIcon(), sname)
                tbl.add_row(pname=pname,
                            sname=sname,
                            nreads=pretty_print_reads(nreads),
                            percreads=percreads,
                            barplot=Img(barplot))
            tbl.add_row(pname="Total reads for lane %d" % lane,
                        nreads=pretty_print_reads(total_reads))
        # Add link to section from main ToC
        toc_list.add_item(
            Link("Per-lane statistics by sample", per_lane_sample_stats),
            lane_toc_list)
    # Per fastq statistics
    stats_file = get_absolute_file_path("statistics_full.info",
                                        base=analysis_dir.analysis_dir)
    if not os.path.exists(stats_file):
        if analysis_dir.params.stats_file is not None:
            stats_file = analysis_dir.params.stats_file
        else:
            stats_file = "statistics.info"
    stats_file = get_absolute_file_path(stats_file,
                                        base=analysis_dir.analysis_dir)
    if os.path.exists(stats_file):
        per_file_stats = processing_qc.add_section(
            "Per-file statistics by project", name="per_file_stats")
        project_toc_list = List()
        per_file_stats.add(project_toc_list)
        stats = TabFile(stats_file, first_line_is_header=True)
        projects = sorted(list(set([d['Project'] for d in stats])))
        # NB must be a list, not a 'filter' iterator: on Python 3
        # an iterator would be exhausted after the first project
        lanes = [c for c in stats.header() if c.startswith('L')]
        sample = None
        for project in projects:
            # Get subset of lines for this project
            subset = sorted([d for d in stats if d['Project'] == project],
                            key=lambda l: split_sample_name(l['Sample']))
            # Work out which lanes are included (i.e. have at
            # least one non-empty count in this project)
            subset_lanes = [l for l in lanes
                            if any(d[l] for d in subset)]
            # Add a new section for this project
            s = per_file_stats.add_subsection("%s" % project,
                                              name="per_file_stats_%s" %
                                              project)
            # Check for problems
            has_warnings = False
            for line in subset:
                nreads = [line[l] for l in subset_lanes if line[l] != '']
                if not nreads or min(nreads) == 0:
                    s.add(
                        Para(WarningIcon(), "One or more Fastqs with zero "
                             "read counts in one or more lanes",
                             css_classes=('warning', )))
                    has_warnings = True
                    break
            # Add link to project from ToC
            link = Link("%s" % project, s)
            if not has_warnings:
                project_toc_list.add_item(link)
            else:
                project_toc_list.add_item(WarningIcon(), link)
                status = False
            # Build the table of data
            tbl = Table(columns=('Sample', 'Fastq', 'Size'))
            if subset_lanes:
                tbl.append_columns(*subset_lanes)
            tbl.append_columns('Barplot', 'Nreads')
            s.add(tbl)
            for line in subset:
                if sample == line['Sample']:
                    # Same sample as previous row: don't repeat name
                    sname = "&nbsp;"
                else:
                    sample = line['Sample']
                    sname = sample
                data = {
                    'Sample': sname,
                    'Fastq': line['Fastq'],
                    'Size': line['Size'],
                    'Nreads': (pretty_print_reads(line['Nreads'])
                               if line['Nreads'] != '' else '')
                }
                for l in subset_lanes:
                    data[l] = (pretty_print_reads(line[l])
                               if line[l] != '' else '')
                # NB must be a list, not a 'filter' iterator: the
                # counts are truth-tested and consumed twice below
                nreads = [line[l] for l in subset_lanes if line[l] != '']
                if not nreads:
                    nreads = [0, ]
                if min(nreads) == 0:
                    # Add warning icon to Fastq with no reads in
                    # at least one lane
                    data['Fastq'] = Para(WarningIcon(), data['Fastq'])
                barplot = ustackedbar(nreads,
                                      length=100,
                                      height=10,
                                      colors=('grey', 'lightgrey'),
                                      bbox=True,
                                      inline=True)
                data['Barplot'] = Img(barplot)
                tbl.add_row(**data)
        toc_list.add_item(
            Link("Per-file statistics by project", per_file_stats),
            project_toc_list)
    # Set the visibility of the warning header
    if status:
        warnings.add_css_classes("hide")
    # Add a non-visible section that the publisher can
    # read to determine if there were problems
    s = processing_qc.add_section(name="status", css_classes=("hide", ))
    s.add("Status: %s" % ('OK' if status else 'WARNINGS', ))
    # Write the processing QC summary file
    processing_qc.write(html_file)