Example #1
    def sort(self, key=None):
        """Sort the bins in this array (in-place).

        Optional argument 'key' is one of:

            - a function that computes a sorting key from a CopyNumArray row
            - a string identifier for an existing data column
            - a list/array/iterable of precomputed keys equal in length to the
              number of rows in this CopyNumArray.

        By default, bins are sorted by chromosomal coordinates.
        """
        if key is None:
            # Sort by chrom, then by start position
            chrom_keys = list(map(core.sorter_chrom, self.data['chromosome']))
            order = numpy.lexsort((self.data['start'], chrom_keys))
        else:
            # Sort by the given key, using a stable sort algorithm
            if isinstance(key, str):  # Python 3: 'basestring' no longer exists
                keys = self.data[key]
            elif callable(key):
                keys = list(map(key, self.data))
            else:
                if not len(key) == len(self):
                    raise ValueError("Sort key, as an array, must have the "
                                     "same length as the CopyNumArray to sort "
                                     "(%d vs. %d)." % (len(key), len(self)))
                keys = key
            order = numpy.argsort(keys, kind='mergesort')
        self.data = self.data.take(order)
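
A minimal usage sketch of the three key types described in the docstring
(hypothetical; assumes `cnarr` is a CopyNumArray with 'chromosome', 'start',
'end', and 'log2' columns):

    cnarr.sort()                # default: by chromosome, then start position
    cnarr.sort(key='log2')      # by an existing data column
    cnarr.sort(key=lambda row: row['end'] - row['start'])  # by computed bin size
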
Example #2
def bedcov(bed_fname, bam_fname, min_mapq):
    """Calculate depth of all regions in a BED file via samtools (pysam) bedcov.

    i.e. mean pileup depth across each region.
    """
    # Count bases in each region; exclude low-MAPQ reads
    if min_mapq > 0:
        bedcov_args = ['-Q', str(min_mapq)]
    else:
        bedcov_args = []
    try:
        lines = pysam.bedcov(bed_fname, bam_fname, *bedcov_args)
    except pysam.SamtoolsError as exc:
        raise ValueError("Failed processing %r coverages in %r regions. PySAM error: %s"
                         % (bam_fname, bed_fname, exc))
    if not lines:
        raise ValueError("BED file %r sequence IDs don't match any in BAM file %r"
                         % (bed_fname, bam_fname))
    # Return an iterable...
    for line in lines:
        try:
            chrom, start_s, end_s, name, basecount_s = line.split('\t')
        except ValueError:  # wrong number of tab-separated fields
            raise RuntimeError("Bad line from bedcov:\n" + line)
        start, end, basecount = map(int, (start_s, end_s, basecount_s.strip()))
        span = end - start
        if span > 0:
            # Algebra from above
            count = basecount / READ_LEN
            mean_depth = basecount / span
        else:
            # User-supplied bins might be oddly constructed
            count = mean_depth = 0
        yield chrom, start, end, name, count, mean_depth
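
A hedged usage sketch of the generator above (file names are hypothetical;
assumes READ_LEN is defined at module level):

    for chrom, start, end, name, count, depth in bedcov('regions.bed',
                                                        'sample.bam',
                                                        min_mapq=30):
        print(chrom, start, end, name, count, depth)
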
Example #3
    def __eq__(self, other):
        if len(self.data) != len(other.data):
            return False
        # Element-wise comparison; all() is True for empty data and
        # short-circuits on the first mismatch (the original reduce() call
        # raised TypeError on empty data and needed a functools import)
        return all(x == y for x, y in zip(self.data, other.data))
Example #4
def read_fasta_index(fasta_fname):
    """Load a FASTA file's index.

    Returns a dict of:
        {seq_id: (length, offset, chars_per_line, bytes_per_line), ...}

    The index file contains, in one row per sequence, tab-separated columns:

        - sequence identifier
        - length
        - offset of the first sequence character in the file
        - number of characters per line
        - number of bytes per line (including the end-of-line character)

    With this information, we can easily compute the byte offset of the i-th
    character of a sequence in a file by looking at its index record. We skip to
    this byte offset in the file and from there, we can read the necessary
    sequence characters.

    See: http://trac.seqan.de/wiki/Tutorial/IndexedFastaIO
    """
    # Build a dict of keys -> offsets
    index = {}
    fai_fname = ensure_fasta_index(fasta_fname)
    with open(fai_fname) as faifile:
        for line in faifile:
            fields = line.rstrip().split('\t')
            seq_id = fields[0]
            assert seq_id not in index, "Duplicate ID: " + seq_id
            index[seq_id] = tuple(map(int, fields[1:]))
    return index
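
The byte-offset arithmetic described in the docstring, as a short sketch
(the index values here are hypothetical):

    length, offset, chars_per_line, bytes_per_line = 16569, 6, 70, 71
    i = 1000  # 0-based position of the desired character in the sequence
    # Whole lines before position i contribute bytes_per_line bytes each;
    # the remainder is the column within the current line.
    byte_offset = (offset
                   + (i // chars_per_line) * bytes_per_line
                   + (i % chars_per_line))
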
Example #5
def bedcov(bed_fname, bam_fname):
    """Calculate depth of all regions in a BED file via samtools (pysam) bedcov.

    i.e. mean pileup depth across each region.
    """
    # Count bases in each region; exclude 0-MAPQ reads
    try:
        lines = pysam.bedcov(bed_fname, bam_fname, '-Q', '1')
    except pysam.SamtoolsError as exc:
        raise ValueError("Failed processing %r coverages in %r regions. PySAM error: %s"
                         % (bam_fname, bed_fname, exc))
    if not lines:
        raise ValueError("BED file %r sequence IDs don't match any in BAM file %r"
                         % (bed_fname, bam_fname))
    # Return an iterable...
    for line in lines:
        try:
            chrom, start_s, end_s, name, basecount_s = line.split('\t')
        except ValueError:  # wrong number of tab-separated fields
            raise RuntimeError("Bad line from bedcov:\n" + line)
        start, end, basecount = map(int, (start_s, end_s, basecount_s.strip()))
        span = end - start
        if span > 0:
            # Algebra from above
            count = basecount / READ_LEN
            mean_depth = basecount / span
        else:
            # User-supplied bins might be oddly constructed
            count = mean_depth = 0
        yield chrom, start, end, name, count, mean_depth
Example #6
def reference2regions(reference, coord_only=False):
    """Extract iterables of target and antitarget regions from a reference.

    Like loading two BED files with ngfrills.parse_regions.
    """
    cna2rows = (_cna2coords if coord_only else _cna2regions)
    return map(cna2rows, _ref_split_targets(reference))
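
A hedged usage sketch, assuming `_ref_split_targets` yields the target and
antitarget portions of the reference in that order (`ref_cnarr` is
hypothetical):

    targets, antitargets = reference2regions(ref_cnarr)
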
Example #7
def get_max_abs_value(sample_data):
    """Return the largest absolute signal value, ignoring -1000 (missing)."""
    val = 0
    for sample in sample_data:
        for chrom_data in sample.values():
            # Skip the -1000 sentinel for missing values
            vals = [abs(data[2]) for data in chrom_data if data[2] != -1000]
            if vals:  # guard against all-missing chromosomes
                val = max(val, max(vals))
    return val
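
For context, `sample_data` here has the same layout that create_heatmap
(below) builds: one dict per sample, mapping chromosome to a list of
(start, end, value) tuples, with -1000 marking missing data. A toy example:

    sample_data = [{'chr1': [(0, 100, 0.5), (100, 200, -1000)]},
                   {'chr2': [(0, 50, -1.2)]}]
    get_max_abs_value(sample_data)  # -> 1.2
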
Example #8
def _sniff_xtra(header_line):
    colnames = tuple(map(str.strip, header_line.split('\t')))
    assert colnames[:5] == ('chromosome', 'start', 'end', 'gene', 'log2'), \
        colnames[:5]
    xtra = colnames[5:]
    assert all(x in ('gc', 'rmask', 'spread', 'weight', 'probes')
               for x in xtra), xtra
    return xtra
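
A hypothetical header line and the result of sniffing it:

    _sniff_xtra('chromosome\tstart\tend\tgene\tlog2\tgc\tweight\n')
    # -> ('gc', 'weight')
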
Example #9
    def parse_line(line):
        fields = line.rstrip().split('\t')
        coverage, chrom, start, end, gene = fields[1:6]
        outrow = [chrom, int(start), int(end), gene, float(coverage)]
        # Parse extra fields as numbers (common type: float)
        rest = list(map(float, fields[6:]))
        # `xtra` is a free variable: the extra column names from the header
        core.assert_equal("Number of extra columns parsed doesn't match "
                          "extra fields given",
                          **{"extra columns": len(rest),
                             "extra fields": len(xtra)})
        return tuple(outrow + rest)
Example #10
def read_class_color(path):
    # For categorical data, the user should provide a table mapping category
    # labels to RGB values, e.g.:
    #   positive 0.9 0.1 0.1
    #   negative 0.1 0.1 0.9
    cols = collections.OrderedDict()
    with open(path) as handle:  # Python 3: the file() builtin is gone
        for line in handle:
            toks = line.strip().split('\t')
            cols[toks[0]] = tuple(map(float, toks[1:]))
    return cols
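
A sketch of the expected input as a tab-separated file (contents
hypothetical):

    # colors.txt:
    #   positive<TAB>0.9<TAB>0.1<TAB>0.1
    #   negative<TAB>0.1<TAB>0.1<TAB>0.9
    read_class_color('colors.txt')
    # -> OrderedDict([('positive', (0.9, 0.1, 0.1)),
    #                 ('negative', (0.1, 0.1, 0.9))])
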
Example #11
def warn_bad_probes(probes):
    """Warn about target probes where coverage is poor.

    Prints a formatted table to stderr.
    """
    bad_probes = probes[mask_bad_probes(probes)]
    fg_index = (bad_probes['gene'] != 'Background')
    fg_bad_probes = bad_probes[fg_index]
    if len(fg_bad_probes) > 0:
        # ENH: print coverage and spread too
        bad_pct = 100 * len(fg_bad_probes) / sum(
            probes['gene'] != 'Background')
        echo("*WARNING*", len(fg_bad_probes), "targets",
             "(%.4f)" % bad_pct + '%', "failed filters:")
        gene_cols = max(map(len, fg_bad_probes['gene']))
        labels = list(map(row2label, fg_bad_probes))
        chrom_cols = max(map(len, labels))
        last_gene = None
        for label, probe in zip(labels, fg_bad_probes):
            if probe['gene'] == last_gene:
                gene = '  "'
            else:
                gene = probe['gene']
                last_gene = gene
            if 'rmask' in probes:
                print("  %s  %s  coverage=%.3f  spread=%.3f  rmask=%.3f" %
                      (gene.ljust(gene_cols), label.ljust(chrom_cols),
                       probe['coverage'], probe['spread'], probe['rmask']))
            else:
                print("  %s  %s  coverage=%.3f  spread=%.3f" %
                      (gene.ljust(gene_cols), label.ljust(chrom_cols),
                       probe['coverage'], probe['spread']))

    # Count the number of BG probes dropped, too (names are all "Background")
    bg_bad_probes = bad_probes[~fg_index]  # invert the boolean mask
    if len(bg_bad_probes) > 0:
        bad_pct = 100 * len(bg_bad_probes) / sum(
            probes['gene'] == 'Background')
        echo("Antitargets:", len(bg_bad_probes), "(%.4f)" % bad_pct + '%',
             "failed filters")
Example #12
def warn_bad_probes(probes):
    """Warn about target probes where coverage is poor.

    Prints a formatted table to stderr.
    """
    bad_probes = probes[fix.mask_bad_probes(probes)]
    fg_index = (bad_probes['gene'] != 'Background')
    fg_bad_probes = bad_probes[fg_index]
    if len(fg_bad_probes) > 0:
        bad_pct = 100 * len(fg_bad_probes) / sum(
            probes['gene'] != 'Background')
        logging.info("Targets: %d (%s) bins failed filters:",
                     len(fg_bad_probes), "%.4f" % bad_pct + '%')
        gene_cols = max(map(len, fg_bad_probes['gene']))
        labels = list(map(CNA.row2label, fg_bad_probes))
        chrom_cols = max(map(len, labels))
        last_gene = None
        for label, probe in zip(labels, fg_bad_probes):
            if probe['gene'] == last_gene:
                gene = '  "'
            else:
                gene = probe['gene']
                last_gene = gene
            if 'rmask' in probes:
                logging.info(
                    "  %s  %s  coverage=%.3f  spread=%.3f  rmask=%.3f",
                    gene.ljust(gene_cols), label.ljust(chrom_cols),
                    probe['log2'], probe['spread'], probe['rmask'])
            else:
                logging.info("  %s  %s  coverage=%.3f  spread=%.3f",
                             gene.ljust(gene_cols), label.ljust(chrom_cols),
                             probe['log2'], probe['spread'])

    # Count the number of BG probes dropped, too (names are all "Background")
    bg_bad_probes = bad_probes[~fg_index]
    if len(bg_bad_probes) > 0:
        bad_pct = 100 * len(bg_bad_probes) / sum(
            probes['gene'] == 'Background')
        logging.info("Antitargets: %d (%s) bins failed filters",
                     len(bg_bad_probes), "%.4f" % bad_pct + '%')
Example #13
def warn_bad_probes(probes):
    """Warn about target probes where coverage is poor.

    Prints a formatted table to stderr.
    """
    bad_probes = probes[fix.mask_bad_probes(probes)]
    fg_index = (bad_probes['gene'] != 'Background')
    fg_bad_probes = bad_probes[fg_index]
    if len(fg_bad_probes) > 0:
        bad_pct = 100 * len(fg_bad_probes) / sum(probes['gene'] != 'Background')
        logging.info("Targets: %d (%s) bins failed filters:",
                     len(fg_bad_probes), "%.4f" % bad_pct + '%')
        gene_cols = max(map(len, fg_bad_probes['gene']))
        labels = list(map(CNA.row2label, fg_bad_probes))
        chrom_cols = max(map(len, labels))
        last_gene = None
        for label, probe in zip(labels, fg_bad_probes):
            if probe.gene == last_gene:
                gene = '  "'
            else:
                gene = probe.gene
                last_gene = gene
            if 'rmask' in probes:
                logging.info("  %s  %s  coverage=%.3f  spread=%.3f  rmask=%.3f",
                             gene.ljust(gene_cols), label.ljust(chrom_cols),
                             probe.log2, probe.spread, probe.rmask)
            else:
                logging.info("  %s  %s  coverage=%.3f  spread=%.3f",
                             gene.ljust(gene_cols), label.ljust(chrom_cols),
                             probe.log2, probe.spread)

    # Count the number of BG probes dropped, too (names are all "Background")
    bg_bad_probes = bad_probes[~fg_index]
    if len(bg_bad_probes) > 0:
        bad_pct = 100 * len(bg_bad_probes) / sum(probes['gene'] == 'Background')
        logging.info("Antitargets: %d (%s) bins failed filters",
                     len(bg_bad_probes), "%.4f" % bad_pct + '%')
Example #14
def merge_rows(rows):
    """Combine equivalent rows of coverage data across multiple samples.

    Check that probe info matches across all samples, then merge the log2
    coverage values.

    Input: a list of individual rows corresponding to the same probes from
    different coverage files.
    Output: a list starting with the single common Probe object, followed by the
    log2 coverage values from each sample, in order.
    """
    probe_infos, coverages = zip(*map(row_to_probe_coverage, rows))
    probe_info = core.check_unique(probe_infos, "probe Name")
    combined_row = [probe_info] + list(coverages)
    return combined_row
Example #15
    def write(self, outfile=sys.stdout):
        """Write coverage data to a file or handle in tabular format.

        This is similar to BED or BedGraph format, but with extra columns.

        To combine multiple samples in one file and/or convert to another
        format, see the 'export' subcommand.
        """
        colnames = ['chromosome', 'start', 'end', 'gene', 'log2']
        colnames.extend(self._xtra)
        rows = (list(map(str, row)) for row in self.data)
        with ngfrills.safe_write(outfile) as handle:
            header = '\t'.join(colnames) + '\n'
            handle.write(header)
            handle.writelines('\t'.join(row) + '\n' for row in rows)
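
A minimal usage sketch (hypothetical; assumes `cnarr` is an instance with
the write method above, and the '.cnr' file name is illustrative):

    cnarr.write()                          # tab-separated table on stdout
    with open('sample.cnr', 'w') as handle:
        cnarr.write(handle)
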
Example #16
def _get_coords(filename):
    start_line = None
    end_line = None

    # A context manager ensures the file handle is closed promptly
    with open(filename) as alb:
        for line in alb:
            if line.startswith("["):
                if not start_line:
                    start_line = line  # rstrip not needed
                else:
                    end_line = line

    if end_line is None:  # sequence is too short
        return [(0, 0), (0, 0)]

    # returns [(start0, end0), (start1, end1)]
    return list(zip(*map(_alb_line2coords, [start_line, end_line])))
Example #17
    def squash_genes(self,
                     ignore=('-', 'CGH', '.'),
                     squash_background=False,
                     summary_stat=metrics.biweight_location):
        """Combine consecutive bins with the same targeted gene name.

        The `ignore` parameter lists bin names that should not be counted as
        genes to be output.

        Parameter `summary_stat` is a function that summarizes an array of
        coverage values to produce the "squashed" gene's coverage value. By
        default this is the biweight location, but you might want median, mean,
        max, min or something else in some cases.

        Optional columns, if present, are dropped.
        """
        def squash_rows(name, rows):
            """Combine multiple rows (for the same gene) into one row."""
            chrom = core.check_unique(rows['chromosome'], 'chromosome')
            start = rows[0]['start']
            end = rows[-1]['end']
            cvg = summary_stat(rows['coverage'])
            outrow = [chrom, start, end, name, cvg]
            # Handle extra fields
            # ENH - no coverage stat; do weighted average as appropriate
            for xfield in ('gc', 'rmask', 'spread', 'weight'):
                if xfield in self:
                    outrow.append(summary_stat(rows[xfield]))
            if 'probes' in self:
                outrow.append(sum(rows['probes']))
            return tuple(outrow)

        outrows = []
        for name, subarr in self.by_gene(ignore):
            if name == 'Background' and not squash_background:
                outrows.extend(map(tuple, subarr))
            else:
                outrows.append(squash_rows(name, subarr))
        return self.to_rows(outrows)
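
A hedged usage sketch: summarize each gene's bins with the median instead of
the default biweight location (`cnarr` is hypothetical):

    import numpy
    squashed = cnarr.squash_genes(summary_stat=numpy.median)
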
Example #18
def do_segmentation(cnarr,
                    method,
                    threshold=None,
                    variants=None,
                    skip_low=False,
                    skip_outliers=10,
                    save_dataframe=False,
                    rlibpath=None,
                    processes=1):
    """Infer copy number segments from the given coverage table."""
    # XXX parallel flasso segfaults in R when run on a single chromosome
    if processes == 1 or method == 'flasso':
        cna = _do_segmentation(cnarr, method, threshold, variants, skip_low,
                               skip_outliers, save_dataframe, rlibpath)
        if save_dataframe:
            cna, rstr = cna
            rstr = _to_str(rstr)

    else:
        with futures.ProcessPoolExecutor(processes) as pool:
            rets = list(
                pool.map(_ds, ((ca, method, threshold, variants, skip_low,
                                skip_outliers, save_dataframe, rlibpath)
                               for _, ca in cnarr.by_chromosome())))
        if save_dataframe:
            # rets is a list of (CNA, R dataframe string) -- unpack
            rets, r_dframe_strings = zip(*rets)
            # Strip the header line from all but the first dataframe, then combine
            r_dframe_strings = map(_to_str, r_dframe_strings)
            rstr = [next(r_dframe_strings)]
            rstr.extend(r[r.index('\n') + 1:] for r in r_dframe_strings)
            rstr = "".join(rstr)
        cna = cnarr.concat(rets)

    if save_dataframe:
        return cna, rstr
    return cna
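
A hedged usage sketch ('cbs' as a method name is an assumption; 'flasso' is
the only method named in the code above, and `cnarr` is hypothetical):

    segments = do_segmentation(cnarr, 'cbs', processes=4)
    # With save_dataframe=True, a 2-tuple is returned instead:
    segments, rstr = do_segmentation(cnarr, 'cbs', save_dataframe=True)
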
Example #19
def create_heatmap(filenames, index, colormap, crange, show_chromosome=None):
    """Plot signal for multiple samples as a heatmap.

    The signal can be a continuous or a categorical value. For numeric
    values, a continuous color scale maps values to colors; for categorical
    values, the user should provide a mapping of values to colors as input.
    """

    _fig = pyplot.figure(figsize=(12, 8))
    gs = matplotlib.gridspec.GridSpec(2, 1, height_ratios=[30, 1])
    axis = pyplot.subplot(gs[0])
    axis_aux = pyplot.subplot(gs[1])

    # List sample names on the y-axis
    axis.set_yticks([i + 0.5 for i in range(len(filenames))])
    axis.set_yticklabels(list(map(fbase, filenames)))
    axis.invert_yaxis()
    axis.set_ylabel("Samples")
    axis.set_facecolor('#DDDDDD')  # set_axis_bgcolor was removed in matplotlib 2.2

    # Group each file's probes/segments by chromosome
    sample_data = [collections.defaultdict(list) for _f in filenames]

    #-------------------------------------------------------------#
    # read in the signal value in each sample from the input files
    index = int(index)
    for i, fname in enumerate(filenames):
        with open(fname) as f_h:  # Python 3: the file() builtin is gone
            for line in f_h:
                toks = line.strip().split('\t')
                if toks[0].lower() in ['chrom', 'chromosome']:
                    # header line
                    continue
                if colormap is None:
                    # numerical data: convert the signal to floating point
                    sample_data[i][toks[0]].append(
                        (int(toks[1]), int(toks[2]), ifloat(toks[index])))
                else:
                    sample_data[i][toks[0]].append(
                        (int(toks[1]), int(toks[2]), toks[index]))
    #-------------------------------------------------------------#

    # Calculate the size (max endpoint value) of each chromosome
    chrom_sizes = {}
    for row in sample_data:
        for chrom, data in iteritems(row):
            max_posn = max(coord[1] for coord in data)
            chrom_sizes[chrom] = max(max_posn, chrom_sizes.get(chrom, 0))
    chrom_sizes = collections.OrderedDict(
        sorted(iteritems(chrom_sizes), key=sorter_chrom_at(0)))

    if colormap is not None:
        cvg2rgb = CVG2RGB(colormap=colormap)
    else:
        if crange is not None:
            vmax = float(crange)
        else:
            vmax = get_max_abs_value(sample_data)
        vmin = -vmax

        # This diverging matplotlib colormap suits data centered on zero,
        # e.g. copy number log2 ratios; any other matplotlib colormap could
        # be substituted.
        my_cmap = matplotlib.cm.seismic
        color_norm = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax)

        cvg2rgb = CVG2RGB(rcmap=my_cmap, color_norm=color_norm)

    def plot_rect(y_idx, x_start, x_end, cvg):
        """Draw a rectangle in the given coordinates and color."""
        x_coords = (x_start, x_start, x_end + 1, x_end + 1)
        y_coords = (y_idx, y_idx + 1, y_idx + 1, y_idx)
        if cvg in [-1000.0, 'NA']:
            # missing data, a shade of dark gray
            rgbcolor = (0.3, 0.3, 0.3)
        else:
            rgbcolor = cvg2rgb.get_color(cvg)
        axis.fill(x_coords, y_coords, color=rgbcolor)

    if show_chromosome:
        # Lay out only the selected chromosome
        chrom_offsets = {show_chromosome: 0.0}
        # Set x-axis the chromosomal positions (in Mb), title as the chromosome
        axis.set_xlim(0, chrom_sizes[show_chromosome] * MB)
        axis.set_title(show_chromosome)
        axis.set_xlabel("Position (Mb)")
        axis.tick_params(which='both', direction='out')
        axis.get_xaxis().tick_bottom()
        axis.get_yaxis().tick_left()
        # Plot the individual probe/segment coverages
        for i, row in enumerate(sample_data):
            for start, end, cvg in row[show_chromosome]:
                plot_rect(i, start * MB, end * MB, cvg)

    else:
        # Lay out chromosome dividers and x-axis labels
        # (Just enough padding to avoid overlap with the divider line)
        chrom_offsets = plot_x_dividers(axis, chrom_sizes, 1)
        # Plot the individual probe/segment coverages
        for i, row in enumerate(sample_data):
            for chrom, curr_offset in iteritems(chrom_offsets):
                for start, end, cvg in row[chrom]:
                    plot_rect(i, start + curr_offset, end + curr_offset, cvg)

    plot_y_dividers(axis, sample_data, color=(0.4, 0.4, 0.4))

    if colormap is not None:
        # Build a discrete colorbar from the label -> color mapping
        # (list() is needed in Python 3, where .items() returns a view;
        # enumerate also avoids shadowing the `index` argument)
        cols = list(cvg2rgb.label2color.items())
        cols = [(idx, label, color)
                for idx, (label, color) in enumerate(cols)]

        colors = [list(x[2]) for x in cols]
        labels = [x[1] for x in cols]
        cmap = matplotlib.colors.ListedColormap(colors)
        bounds = [col[0] + 0.5 for col in cols]
        bounds = [-0.5] + bounds
        norm = matplotlib.colors.BoundaryNorm(bounds, cmap.N)
        cb2 = matplotlib.colorbar.ColorbarBase(axis_aux, cmap=cmap,
                                               boundaries=bounds,
                                               norm=norm,
                                               ticks=bounds,
                                               orientation='horizontal')
        cb2.set_ticks([x + 0.5 for x in bounds])
        cb2.set_ticklabels(labels)

    else:
        cb1 = matplotlib.colorbar.ColorbarBase(axis_aux, cmap=my_cmap,
                                               norm=color_norm,
                                               orientation='horizontal')
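
A hedged usage sketch (file names and the column index are hypothetical;
index=4 selects the fifth tab-separated column of each input file):

    create_heatmap(['sample1.cnr', 'sample2.cnr'], index=4, colormap=None,
                   crange=2.0, show_chromosome='chr7')
    pyplot.show()
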
Example #20
def train(training_set, results, feature_fns, update_fn=None,
          max_iis_iterations=10000, iis_converge=1.0e-5,
          max_newton_iterations=100, newton_converge=1.0e-10):
    """Train a maximum entropy classifier, returns MaxEntropy object.

    Train a maximum entropy classifier on a training set.
    training_set is a list of observations.  results is a list of the
    class assignments for each observation.  feature_fns is a list of
    the features.  These are callback functions that take an
    observation and class and return a 1 or 0.  update_fn is a
    callback function that is called at each training iteration.  It is
    passed a MaxEntropy object that encapsulates the current state of
    the training.

    The maximum number of iterations and the convergence criterion for IIS
    are given by max_iis_iterations and iis_converge, respectively, while
    max_newton_iterations and newton_converge are the maximum number
    of iterations and the convergence criterion for Newton's method.
    """
    if not training_set:
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # Rename variables for convenience.
    xs, ys = training_set, results

    # Get a list of all the classes that need to be trained.
    classes = sorted(set(results))

    # Cache values for all features.
    features = [_eval_feature_fn(fn, training_set, classes)
                for fn in feature_fns]
    # Cache values for f#.
    f_sharp = _calc_f_sharp(len(training_set), len(classes), features)

    # Pre-calculate the empirical expectations of the features.
    e_empirical = _calc_empirical_expects(xs, ys, classes, features)

    # Now train the alpha parameters to weigh each feature.
    alphas = [0.0] * len(features)
    iters = 0
    while iters < max_iis_iterations:
        iters += 1  # was never incremented, so the loop could not time out
        nalphas = _train_iis(xs, classes, features, f_sharp,
                             alphas, e_empirical,
                             max_newton_iterations, newton_converge)
        # Total absolute change in the alphas (Python 3: the reduce builtin
        # moved to functools, so sum a generator instead)
        diff = sum(numpy.fabs(x - y) for x, y in zip(alphas, nalphas))
        alphas = nalphas

        me = MaxEntropy()
        me.alphas, me.classes, me.feature_fns = alphas, classes, feature_fns
        if update_fn is not None:
            update_fn(me)

        if diff < iis_converge:   # converged
            break
    else:
        raise RuntimeError("IIS did not converge")

    return me
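
A toy usage sketch (the feature functions and data are hypothetical; each
feature takes an (observation, class) pair and returns 1 or 0):

    def is_long(obs, klass):
        return 1 if (len(obs) > 3 and klass == 'long') else 0

    def is_short(obs, klass):
        return 1 if (len(obs) <= 3 and klass == 'short') else 0

    me = train(['aaaa', 'bb', 'ccccc', 'd'],
               ['long', 'short', 'long', 'short'],
               [is_long, is_short])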