Ejemplo n.º 1
0
def cog_info(candidates, sp2hits):
    """Summarize per-species coverage over a set of candidate cogs.

    The coverage of a species is its hit count divided by the number of
    candidates.

    Args:
        candidates: Sized collection of candidate cogs; only its length
            is used here.
        sp2hits: Mapping whose values are per-species hit counts.

    Returns:
        A ``(min_cov, max_cov, median_cov)`` tuple computed over the
        per-species coverages.
    """
    n_candidates = float(len(candidates))
    sp_coverages = [hits / n_candidates for hits in sp2hits.values()]
    min_cov = _min(sp_coverages)
    # The maximum must come from _max; using _min here made the reported
    # maximum always equal to the minimum.
    max_cov = _max(sp_coverages)
    median_cov = _median(sp_coverages)
    return min_cov, max_cov, median_cov
Ejemplo n.º 2
0
def cog_info(candidates, sp2hits):
    """Return min, max and median species coverage for the candidates.

    Each value in ``sp2hits`` is divided by ``len(candidates)`` to obtain
    that species' coverage.

    Args:
        candidates: Sized collection of candidate cogs (only ``len`` is
            taken).
        sp2hits: Mapping whose values are per-species hit counts.

    Returns:
        Tuple ``(min_cov, max_cov, median_cov)`` of the coverages.
    """
    total = float(len(candidates))
    sp_coverages = [hits / total for hits in sp2hits.values()]
    # max_cov was previously derived with _min, which duplicated min_cov.
    return (
        _min(sp_coverages),
        _max(sp_coverages),
        _median(sp_coverages),
    )
Ejemplo n.º 3
0
 def sort_cogs_by_sp_repr(c1, c2):
     """Comparator for two cogs, each an iterable of (species, seq) pairs.

     Ordering criteria, applied in turn until one differs:
       1. the smallest ``sp2cogs`` value among the cog's species, ascending
          (presumably sp2cogs counts the cogs each species appears in --
          confirm against the enclosing scope);
       2. the cog length, descending;
       3. the sorted cog contents, ascending.

     Relies on a ``cmp`` callable with Python 2 semantics being in scope.
     """
     lowest1 = _min([sp2cogs[sp] for sp, _ in c1])
     lowest2 = _min([sp2cogs[sp] for sp, _ in c2])

     by_repr = cmp(lowest1, lowest2)
     if by_repr != 0:
         return by_repr

     # Negated so that the longer cog sorts first.
     by_size = -1 * cmp(len(c1), len(c2))
     if by_size != 0:
         return by_size

     return cmp(sorted(c1), sorted(c2))
Ejemplo n.º 4
0
 def sort_cogs_by_sp_repr(c1, c2):
     """Compare two cogs (iterables of ``(species, seq)`` pairs).

     The cog whose species reach the lower minimum in ``sp2cogs`` sorts
     first; ties go to the longer cog, and remaining ties are settled by
     comparing the sorted members. ``sp2cogs`` and ``cmp`` are taken from
     the surrounding scope (``cmp`` is assumed to behave like the
     Python 2 builtin).
     """
     repr_a = _min([sp2cogs[species] for species, _unused in c1])
     repr_b = _min([sp2cogs[species] for species, _unused in c2])

     outcome = cmp(repr_a, repr_b)
     if outcome != 0:
         return outcome

     # Size comparison is inverted: bigger cogs come first.
     outcome = -1 * cmp(len(c1), len(c2))
     if outcome != 0:
         return outcome

     # Last resort: order by the sorted members themselves.
     return cmp(sorted(c1), sorted(c2))
Ejemplo n.º 5
0
    def _sort_cogs(cogs1, cogs2):
        """Comparator ranking two cog collections, larger values first.

        Each argument is indexable; element 1 holds the cogs and element 0
        (seed info) is ignored. The collections are compared by, in order:
        largest cog size, rounded mean cog size, and number of cogs. The
        first criterion that differs decides the result, negated so that
        the larger value sorts first. Relies on a ``cmp`` callable with
        Python 2 semantics being in scope.
        """
        first = cogs1[1]  # drop seed info
        second = cogs2[1]  # drop seed info
        sizes_a = [len(cog) for cog in first]
        sizes_b = [len(cog) for cog in second]

        largest_a = _max(sizes_a)
        _smallest_a = _min(sizes_a)  # evaluated as before; not a ranking criterion
        mean_a = round(_mean(sizes_a))

        largest_b = _max(sizes_b)
        _smallest_b = _min(sizes_b)  # evaluated as before; not a ranking criterion
        mean_b = round(_mean(sizes_b))

        # Criteria to maximize, most important first.
        criteria = (
            (largest_a, largest_b),
            (mean_a, mean_b),
            (len(first), len(second)),
        )
        for left, right in criteria:
            order = -1 * cmp(left, right)
            if order != 0:
                return order
        return order
Ejemplo n.º 6
0
    def _sort_cogs(cogs1, cogs2):
        """Rank two cog collections so that the "bigger" one sorts first.

        Only element 1 of each argument (the cogs) is inspected; element 0
        carries seed info and is discarded. Comparison keys, in priority
        order: maximum cog size, rounded mean cog size, number of cogs.
        ``cmp`` is expected to be available with Python 2 semantics.
        """
        group_a, group_b = cogs1[1], cogs2[1]  # discard seed info
        lengths_a = [len(cog) for cog in group_a]
        lengths_b = [len(cog) for cog in group_b]

        # The minimum is still evaluated, as in the original, although it
        # takes no part in the ranking below.
        top_a, _low_a, avg_a = (
            _max(lengths_a), _min(lengths_a), round(_mean(lengths_a)))
        top_b, _low_b, avg_b = (
            _max(lengths_b), _min(lengths_b), round(_mean(lengths_b)))

        # We want to maximize all these values, in the following order.
        ranked_pairs = [
            (top_a, top_b),
            (avg_a, avg_b),
            (len(group_a), len(group_b)),
        ]
        for mine, theirs in ranked_pairs:
            verdict = -1 * cmp(mine, theirs)
            if verdict != 0:
                return verdict
        return verdict
Ejemplo n.º 7
0
def get_cog_score(candidates, sp2hits, max_cogs, all_species):
    """Score a set of candidate cogs by size and species coverage.

    Args:
        candidates: Collection of cogs; each cog must support ``len``.
        sp2hits: Mapping from species to its hit count.
        max_cogs: Value used to normalise the number of candidates.
        all_species: Iterable of species; a species missing from
            ``sp2hits`` counts as zero hits.

    Returns:
        Tuple ``(score, min_cov, max_cov, median_cov, cov_std, cog_cov)``
        where ``score`` is the smallest of the normalised cog count, the
        mean per-cog species coverage and the worst species coverage.
    """
    n_candidates = float(len(candidates))
    n_hit_species = float(len(sp2hits))

    # NOTE(review): cog_cov divides by len(sp2hits) + 1 whereas cog_mean_cov
    # divides by len(sp2hits); kept exactly as found -- confirm the intent.
    cog_cov = _mean([len(cogs) for cogs in candidates]) / (n_hit_species + 1)
    # Average number of species in each cog, as a fraction of hit species.
    cog_mean_cov = _mean([len(cogs) / n_hit_species for cogs in candidates])

    sp_coverages = [sp2hits.get(sp, 0) / n_candidates for sp in all_species]

    nfactor = len(candidates) / float(max_cogs)  # number of cogs, normalised
    min_cov = _min(sp_coverages)  # coverage of the worst species
    # Must use _max: computing this with _min made max_cov mirror min_cov.
    max_cov = _max(sp_coverages)
    median_cov = _median(sp_coverages)
    cov_std = _std(sp_coverages)

    score = _min([nfactor, cog_mean_cov, min_cov])
    return score, min_cov, max_cov, median_cov, cov_std, cog_cov
Ejemplo n.º 8
0
        def sort_cogs_by_size(c1, c2):
            """Comparator: bigger cogs first, then least represented species.

            Cogs are iterables of ``(species, seq)`` pairs. Cogs of equal
            size are ordered by the lowest ``sp2cogs`` value found among
            their species (the cog holding the less represented species
            goes first). If that is also a tie, the sorted members are
            compared, i.e. ordering falls back to sequence name.
            """
            by_size = -1 * cmp(len(c1), len(c2))
            if by_size != 0:
                return by_size

            # Same size: look for the cog with the less represented species.
            rarest1 = _min([sp2cogs[sp] for sp, _ in c1])
            rarest2 = _min([sp2cogs[sp] for sp, _ in c2])
            by_repr = cmp(rarest1, rarest2)
            if by_repr != 0:
                return by_repr

            return cmp(sorted(c1), sorted(c2))
Ejemplo n.º 9
0
        def sort_cogs_by_size(c1, c2):
            """Order cogs by descending size.

            Tie-breakers, in order: the cog containing the less represented
            species (lowest ``sp2cogs`` value among its members' species)
            comes first; after that, cogs are ordered by their sorted
            ``(species, seq)`` members. ``sp2cogs`` and ``cmp`` come from
            the enclosing scope.
            """
            outcome = -1 * cmp(len(c1), len(c2))
            if outcome != 0:
                return outcome

            # Sizes match: compare the least represented species of each cog.
            least_a = _min([sp2cogs[species] for species, _unused in c1])
            least_b = _min([sp2cogs[species] for species, _unused in c2])
            outcome = cmp(least_a, least_b)
            if outcome != 0:
                return outcome

            # Still tied: fall back to the members themselves.
            return cmp(sorted(c1), sorted(c2))
Ejemplo n.º 10
0
def get_identity(fname):
    """Compute per-column identity statistics for the sequences in *fname*.

    For every column, identity is the count of the most frequent non-gap
    character divided by the number of non-gap characters. Columns that
    contain only gaps (``"-"``) are skipped.

    Args:
        fname: Passed straight to ``SeqGroup``; presumably the path of an
            alignment file -- confirm against ``SeqGroup``.

    Returns:
        Tuple ``(max, min, mean, std)`` of the per-column identities.

    Raises:
        IndexError: If the loaded group holds no sequences, or a sequence
            is shorter than the first one.
    """
    s = SeqGroup(fname)
    # Materialise once; the sequences are scanned again for every column.
    sequences = list(six.itervalues(s.id2seq))
    # The column count is the length of the first sequence. Calling len()
    # directly on the itervalues() iterator raises TypeError.
    seqlen = len(sequences[0])
    ident = []
    for i in range(seqlen):
        states = defaultdict(int)
        for seq in sequences:
            if seq[i] != "-":
                states[seq[i]] += 1
        values = list(states.values())
        if values:
            ident.append(float(max(values)) / sum(values))
    return (_max(ident), _min(ident), _mean(ident), _std(ident))
Ejemplo n.º 11
0
def get_cog_score(candidates, sp2hits, max_cogs, all_species):
    """Compute a quality score and coverage statistics for candidate cogs.

    Args:
        candidates: Collection of cogs; each cog must support ``len``.
        sp2hits: Mapping from species to its hit count.
        max_cogs: Value the number of candidates is divided by.
        all_species: Iterable of species to evaluate; species absent from
            ``sp2hits`` contribute a coverage of zero.

    Returns:
        Tuple ``(score, min_cov, max_cov, median_cov, cov_std, cog_cov)``.
        ``score`` is the minimum of the normalised cog count, the mean
        per-cog species coverage and the lowest species coverage.
    """
    cog_sizes = [len(cogs) for cogs in candidates]
    hit_species = len(sp2hits)

    # NOTE(review): the "+ 1" below is absent from cog_mean_cov's divisor;
    # both are preserved as found -- confirm which one is intended.
    cog_cov = _mean(cog_sizes) / float(hit_species + 1)
    # Mean number of species per cog, relative to the species with hits.
    cog_mean_cov = _mean([size / float(hit_species) for size in cog_sizes])

    sp_coverages = [
        sp2hits.get(sp, 0) / float(len(candidates)) for sp in all_species
    ]

    nfactor = len(candidates) / float(max_cogs)  # number of cogs, normalised
    min_cov = _min(sp_coverages)  # coverage of the worst species
    # Fixed: this was _min(sp_coverages), so max_cov duplicated min_cov.
    max_cov = _max(sp_coverages)
    median_cov = _median(sp_coverages)
    cov_std = _std(sp_coverages)

    score = _min([nfactor, cog_mean_cov, min_cov])
    return score, min_cov, max_cov, median_cov, cov_std, cog_cov
Ejemplo n.º 12
0
def get_identity(fname):
    """Return identity statistics over the columns of the sequences in *fname*.

    A column's identity is the share of its non-gap characters that equal
    the most common non-gap character. Columns made only of gaps (``"-"``)
    do not contribute.

    Args:
        fname: Handed to ``SeqGroup`` unchanged (presumably an alignment
            file path -- confirm against ``SeqGroup``).

    Returns:
        ``(max, min, mean, std)`` of the per-column identity values.

    Raises:
        IndexError: When there are no sequences, or when a sequence is
            shorter than the first one.
    """
    s = SeqGroup(fname)
    # Keep the sequences in a list so they can be rescanned per column.
    all_seqs = list(six.itervalues(s.id2seq))
    # len() cannot be applied to the itervalues() iterator (TypeError);
    # the number of columns is the length of the first sequence.
    seqlen = len(all_seqs[0])
    ident = []
    for col in range(seqlen):
        states = defaultdict(int)
        for seq in all_seqs:
            residue = seq[col]
            if residue != "-":
                states[residue] += 1
        counts = list(states.values())
        if counts:
            ident.append(float(max(counts)) / sum(counts))
    return (_max(ident), _min(ident), _mean(ident), _std(ident))
Ejemplo n.º 13
0
def get_seqs_identity(alg, seqs):
    """Return identity statistics of ``alg`` restricted to a set of sequences.

    For every column, identity is the count of the most frequent non-gap
    character divided by the number of non-gap characters among the
    selected sequences. Columns containing only gaps (``"-"``) are skipped.

    Args:
        alg: Object exposing ``get_seq(seq_id)`` that returns an indexable
            sequence.
        seqs: Sequence ids to include; must not be empty.

    Returns:
        Tuple ``(max, min, mean, std)`` of the per-column identities.

    Raises:
        IndexError: If ``seqs`` is empty, or a sequence is shorter than
            the first one.
    """
    # Fetch each sequence once instead of once per column.
    sequences = [alg.get_seq(seq_id) for seq_id in seqs]
    seqlen = len(sequences[0])
    ident = []
    for i in range(seqlen):
        states = defaultdict(int)
        for seq in sequences:
            if seq[i] != "-":
                states[seq[i]] += 1
        values = list(states.values())
        if values:
            ident.append(float(max(values)) / sum(values))
    return (_max(ident), _min(ident), _mean(ident), _std(ident))
Ejemplo n.º 14
0
def get_seqs_identity(alg, seqs):
    """Returns alg statistics regarding a set of sequences.

    Each column of the selected sequences is scored as the frequency of
    its most common non-gap character over all of its non-gap characters;
    columns holding nothing but gaps (``"-"``) are ignored.

    Args:
        alg: Object whose ``get_seq(seq_id)`` returns an indexable sequence.
        seqs: Non-empty collection of sequence ids to take into account.

    Returns:
        ``(max, min, mean, std)`` of the per-column identity values.

    Raises:
        IndexError: If ``seqs`` is empty, or a sequence is shorter than
            the first one.
    """
    # One lookup per sequence; the lookup does not depend on the column.
    selected = [alg.get_seq(seq_id) for seq_id in seqs]
    seqlen = len(selected[0])
    ident = []
    for col in range(seqlen):
        states = defaultdict(int)
        for seq in selected:
            residue = seq[col]
            if residue != "-":
                states[residue] += 1
        counts = list(states.values())
        if counts:
            ident.append(float(max(counts)) / sum(counts))
    return (_max(ident), _min(ident), _mean(ident), _std(ident))