Esempio n. 1
0
def summary_targets(dataset, targets_attr="targets", chunks_attr="chunks", maxc=30, maxt=20):
    """Provide summary statistics over the targets and chunks

    Parameters
    ----------
    dataset : `Dataset`
      Dataset to operate on
    targets_attr : str, optional
      Name of sample attributes of targets
    chunks_attr : str, optional
      Name of sample attributes of chunks -- independent groups of samples
    maxc : int
      Details involving chunks are provided only while the number of
      unique chunks is strictly below this value
    maxt : int
      Details involving targets are provided only while the number of
      unique targets is strictly below this value

    Returns
    -------
    str
      Concatenation of the table of counts of targets in each chunk (or a
      note that it was skipped), the per-target summary (if the number of
      targets is below `maxt`) and the per-chunk summary (if the number of
      chunks is below `maxc`).
    """
    # We better avoid bound function since if people only
    # imported Dataset without miscfx it would fail
    spcl = get_samples_per_chunk_target(dataset, targets_attr=targets_attr, chunks_attr=chunks_attr)
    # XXX couldn't they be unordered?
    ul = dataset.sa[targets_attr].unique.tolist()
    uc = dataset.sa[chunks_attr].unique.tolist()
    s = ""
    if len(ul) < maxt and len(uc) < maxc:
        s += "\nCounts of targets in each chunk:"
        # only in a reasonable case do printing
        # Rows of spcl are paired with chunks, its columns with targets.
        table = [["  %s\\%s" % (chunks_attr, targets_attr)] + ul]
        table += [[""] + ["---"] * len(ul)]
        for c, counts in zip(uc, spcl):
            table.append([str(c)] + counts.tolist())
        s += "\n" + table2string(table)
    else:
        s += "No details due to large number of targets or chunks. " "Increase maxc and maxt if desired"

    def format_cell(value):
        """Render a single table cell as text"""
        # Labels may be strings or None; neither can be rendered with
        # '%.3g', which accepts numbers only and raises TypeError otherwise.
        if value is None or isinstance(value, basestring):
            return "%s" % value
        return "%.3g" % value

    def cl_stats(axis, u, name1, name2):
        """Tabulate min/max/mean/std of `spcl` reduced along `axis`

        Produces one row per label in `u`; the last column counts the
        entries of `spcl` that are greater than zero along `axis`.
        """
        count_name = "#%s" % name2
        stats = {
            "min": np.min(spcl, axis=axis),
            "max": np.max(spcl, axis=axis),
            "mean": np.mean(spcl, axis=axis),
            "std": np.std(spcl, axis=axis),
            count_name: np.sum(spcl > 0, axis=axis),
        }
        entries = ["  " + name1, "mean", "std", "min", "max", count_name]
        table = [entries]
        for i, l in enumerate(u):
            # first column holds the label, the rest the i-th statistics
            row = [l] + [stats[k][i] for k in entries[1:]]
            table.append([format_cell(v) for v in row])
        return "\nSummary for %s across %s\n" % (name1, name2) + table2string(table)

    if len(ul) < maxt:
        s += cl_stats(0, ul, targets_attr, chunks_attr)
    if len(uc) < maxc:
        s += cl_stats(1, uc, chunks_attr, targets_attr)
    return s
Esempio n. 2
0
 def cl_stats(axis, u, name1, name2):
     """Tabulate min/max/mean/std of `spcl` reduced along `axis`

     `spcl` is not defined here; it is looked up in the enclosing scope.

     Parameters
     ----------
     axis : int
       Axis of `spcl` over which the statistics are computed
     u : sequence
       Labels, one per output row; the i-th label is paired with the
       i-th element of every statistic
     name1 : str
       Heading of the label column
     name2 : str
       Name used for the '#<name2>' column, which counts the entries of
       `spcl` greater than zero along `axis`

     Returns
     -------
     str
       A "Summary for <name1> across <name2>" line followed by the table
       rendered with `table2string`.
     """
     stats = {
         "min": np.min(spcl, axis=axis),
         "max": np.max(spcl, axis=axis),
         "mean": np.mean(spcl, axis=axis),
         "std": np.std(spcl, axis=axis),
         "#%s" % name2: np.sum(spcl > 0, axis=axis),
     }
     entries = ["  " + name1, "mean", "std", "min", "max", "#%s" % name2]
     table = [entries]
     for i, l in enumerate(u):
         d = {"  " + name1: l}
         d.update(dict([(k, stats[k][i]) for k in stats.keys()]))
         row = []
         for e in entries:
             value = d[e]
             # Strings and None (a possible label) cannot be rendered with
             # '%.3g' -- it raises TypeError for non-numbers -- so they
             # are printed as plain text instead.
             as_text = isinstance(value, basestring) or value is None
             row.append(("%s" if as_text else "%.3g") % value)
         table.append(row)
     return "\nSummary for %s across %s\n" % (name1, name2) + table2string(table)
Esempio n. 3
0
 def cl_stats(axis, u, name1, name2):
     """Summarize `spcl` along `axis` as a text table, one row per label

     `spcl` comes from the enclosing scope. For every label in `u` the
     table lists mean, std, min and max of `spcl` along `axis`, plus the
     number of entries greater than zero (column '#<name2>').

     Parameters
     ----------
     axis : int
       Axis of `spcl` to reduce over
     u : sequence
       Row labels; position i corresponds to element i of each statistic
     name1 : str
       Heading of the label column
     name2 : str
       Name of what is being counted in the last column

     Returns
     -------
     str
       Heading line plus the table formatted by `table2string`.
     """
     label_col = '  ' + name1
     count_col = '#%s' % name2
     stats = {'min': np.min(spcl, axis=axis),
              'max': np.max(spcl, axis=axis),
              'mean': np.mean(spcl, axis=axis),
              'std': np.std(spcl, axis=axis),
              count_col: np.sum(spcl>0, axis=axis)}
     entries = [label_col, 'mean', 'std', 'min', 'max', count_col]
     table = [ entries ]
     for i, l in enumerate(u):
         d = {label_col : l}
         d.update(dict([ (k, stats[k][i]) for k in stats.keys()]))
         # '%.3g' accepts numbers only; string labels and None labels
         # (which would otherwise raise TypeError) go through '%s'
         table.append( [ ('%.3g', '%s')[isinstance(d[e], basestring)
                                        or d[e] is None]
                         % d[e] for e in entries] )
     return '\nSummary for %s across %s\n' % (name1, name2) \
            + table2string(table)
Esempio n. 4
0
    def _compute(self):
        """Compute sequence statistics and cache their textual summary

        Reads `self._seq` and `self.order`. The computed values
        ('utargets', 'cbcounts', 'corrcoef', 'sumabscorr') are handed to
        `self.update`, and the human-readable report is stored in
        `self._str_stats`.
        """
        max_order = self.order
        sequence = list(self._seq)             # assure list
        n_samples = len(sequence)
        unique_targets = sorted(list(set(sequence)))
        n_targets = len(unique_targets)

        # Replace every target by its position among the sorted unique ones
        position_of = dict([(target, pos)
                            for pos, target in enumerate(unique_targets)])
        mapped = [position_of[item] for item in sequence]

        results = dict(utargets=unique_targets)

        # Counter-balance: counts[k, a, b] is the number of times target b
        # occurs k+1 positions after target a
        counts = np.zeros((max_order, n_targets, n_targets), dtype=int)
        for lag in xrange(max_order):
            step = lag + 1
            for before, after in zip(mapped[:-step], mapped[step:]):
                counts[lag, before, after] += 1
        results['cbcounts'] = counts

        # Lets compute relative counter-balancing
        # Ideally, npertarget[i]/ntargets should precede each target

        # Autocorrelation of the mapped sequence against each of its
        # circular shifts
        # ??? User pearsonsr with p may be?
        # ??? report high (anti)correlations?
        corr = np.array([
            np.corrcoef(mapped, mapped[shift:] + mapped[:shift])[0, 1]
            for shift in xrange(1, n_samples)])
        results['corrcoef'] = corr
        sumabscorr = np.sum(np.abs(corr))
        results['sumabscorr'] = sumabscorr
        self.update(results)

        # Assign textual summary
        # XXX move into a helper function and do on demand
        block = n_targets + 1       # columns per order: targets + separator
        table = [[""] * (1 + self.order * block) for _ in xrange(block)]
        table[0][0] = "Targets/Order"
        for row_idx, target in enumerate(unique_targets, 1):
            table[row_idx][0] = '%s:' % target
        for lag in xrange(max_order):
            first_col = 1 + lag * block
            table[0][first_col] = "O%d" % (lag + 1)
            separator_col = (lag + 1) * block
            for row in table:
                row[separator_col] = " | "
            matrix = counts[lag]
            # ??? there should be better way to get indexes
            rows, cols = np.where(~np.isnan(matrix))
            for r, c in zip(rows, cols):
                table[1 + r][first_col + c] = '%d' % matrix[r, c]

        headline = ("Sequence statistics for %d entries"
                    " from set %s\n" % (len(sequence), unique_targets))
        caption = "Counter-balance table for orders up to %d:\n" % max_order
        sout = headline + caption + table2string(table)
        if len(corr):
            corr_summary = (min(corr), max(corr), np.mean(corr), sumabscorr)
            sout += "Correlations: min=%.2g max=%.2g mean=%.2g sum(abs)=%.2g" \
                    % corr_summary
        self._str_stats = sout
Esempio n. 5
0
def summary_targets(dataset,
                    targets_attr='targets',
                    chunks_attr='chunks',
                    maxc=30,
                    maxt=20):
    """Build a textual summary of how samples spread over targets and chunks

    Parameters
    ----------
    dataset : `Dataset`
      Dataset to operate on
    targets_attr : str, optional
      Name of the sample attribute holding the targets
    chunks_attr : str, optional
      Name of the sample attribute holding the chunks -- independent
      groups of samples
    maxc : int
      Chunk details are reported only while the number of unique chunks
      stays below this value
    maxt : int
      Target details are reported only while the number of unique targets
      stays below this value

    Returns
    -------
    str
      The counts table (or a note that it was omitted) followed by the
      per-target and per-chunk summaries that fit within the limits.
    """
    # We better avoid bound function since if people only
    # imported Dataset without miscfx it would fail
    counts_matrix = get_samples_per_chunk_target(dataset,
                                                 targets_attr=targets_attr,
                                                 chunks_attr=chunks_attr)
    # XXX couldn't they be unordered?
    unique_targets = dataset.sa[targets_attr].unique.tolist()
    unique_chunks = dataset.sa[chunks_attr].unique.tolist()
    few_targets = len(unique_targets) < maxt
    few_chunks = len(unique_chunks) < maxc

    if few_targets and few_chunks:
        # only in a reasonable case do printing
        rows = [
            ['  %s\\%s' % (chunks_attr, targets_attr)] + unique_targets,
            [''] + ['---'] * len(unique_targets),
        ]
        for chunk, per_target in zip(unique_chunks, counts_matrix):
            rows.append([str(chunk)] + per_target.tolist())
        out = "\nCounts of targets in each chunk:" + '\n' + table2string(rows)
    else:
        out = "No details due to large number of targets or chunks. " \
              "Increase maxc and maxt if desired"

    def cl_stats(axis, labels, row_name, col_name):
        """Tabulate statistics of the counts along `axis`, one row per label
        """
        label_col = '  ' + row_name
        count_col = '#%s' % col_name
        stats = {
            'min': np.min(counts_matrix, axis=axis),
            'max': np.max(counts_matrix, axis=axis),
            'mean': np.mean(counts_matrix, axis=axis),
            'std': np.std(counts_matrix, axis=axis),
            count_col: np.sum(counts_matrix > 0, axis=axis)
        }
        header = [label_col, 'mean', 'std', 'min', 'max', count_col]
        table = [header]
        for idx, label in enumerate(labels):
            record = {label_col: label}
            for key in stats:
                record[key] = stats[key][idx]
            cells = []
            for column in header:
                value = record[column]
                # strings and None are printed verbatim, numbers compactly
                if isinstance(value, basestring) or value is None:
                    cells.append('%s' % value)
                else:
                    cells.append('%.3g' % value)
            table.append(cells)
        title = '\nSummary for %s across %s\n' % (row_name, col_name)
        return title + table2string(table)

    if few_targets:
        out += cl_stats(0, unique_targets, targets_attr, chunks_attr)
    if few_chunks:
        out += cl_stats(1, unique_chunks, chunks_attr, targets_attr)
    return out
Esempio n. 6
0
    def _compute(self):
        """Derive statistics of the stored sequence and render them as text

        Uses `self._seq` and `self.order` as inputs. Results ('utargets',
        'cbcounts', 'corrcoef', 'sumabscorr') are passed to `self.update`;
        the formatted report ends up in `self._str_stats`.
        """
        n_orders = self.order
        items = list(self._seq)  # assure list
        total = len(items)
        labels = sorted(list(set(items)))  # unique targets
        n_labels = len(labels)

        # Encode each element by the index of its label
        code_of = dict([(label, code) for code, label in enumerate(labels)])
        codes = [code_of[item] for item in items]

        stats = dict(utargets=labels)

        # Counter-balance: for each order, count which label follows which
        cbcounts = np.zeros((n_orders, n_labels, n_labels), dtype=int)
        for order_idx in xrange(n_orders):
            offset = order_idx + 1
            leading = codes[:-offset]
            trailing = codes[offset:]
            for src, dst in zip(leading, trailing):
                cbcounts[order_idx, src, dst] += 1
        stats["cbcounts"] = cbcounts

        # Lets compute relative counter-balancing
        # Ideally, npertarget[i]/ntargets should precede each target

        # Autocorrelation over all possible circular shifts
        coefficients = []
        for shift in xrange(1, total):
            rotated = codes[shift:] + codes[:shift]
            # ??? User pearsonsr with p may be?
            coefficients.append(np.corrcoef(codes, rotated)[0, 1])
            # ??? report high (anti)correlations?
        corr = np.array(coefficients)
        sumabscorr = np.sum(np.abs(corr))
        stats["corrcoef"] = corr
        stats["sumabscorr"] = sumabscorr
        self.update(stats)

        # Assign textual summary
        # XXX move into a helper function and do on demand
        width = n_labels + 1  # one column per label plus a separator
        grid = [[""] * (1 + self.order * width) for _ in xrange(width)]
        grid[0][0] = "Targets/Order"
        for pos, label in enumerate(labels):
            grid[pos + 1][0] = "%s:" % label
        for order_idx in xrange(n_orders):
            start = 1 + order_idx * width
            grid[0][start] = "O%d" % (order_idx + 1)
            stop = (order_idx + 1) * width
            for line in grid:
                line[stop] = " | "
            counts = cbcounts[order_idx]
            # ??? there should be better way to get indexes
            for r, c in zip(*np.where(~np.isnan(counts))):
                grid[1 + r][start + c] = "%d" % counts[r, c]

        report = "Sequence statistics for %d entries" " from set %s\n" % (total, labels)
        report = report + "Counter-balance table for orders up to %d:\n" % n_orders
        report = report + table2string(grid)
        if len(corr):
            report += "Correlations: min=%.2g max=%.2g mean=%.2g sum(abs)=%.2g" % (
                min(corr),
                max(corr),
                np.mean(corr),
                sumabscorr,
            )
        self._str_stats = report