def test_natural_cutoff(self):
    """Check ``sequt.natural_cutoff`` against fixed samples.

    Every sample is evaluated with the bounds ``lo=0.2`` and ``hi=0.5``
    and the returned cut-off is compared with the expected value using
    ``assertAlmostEqual``.
    """
    lo, hi = 0.2, 0.5
    # (input values, expected cut-off) pairs, checked in this order.
    samples = [
        ([0.28, 0.40, 0.50, 0.65], 0.28),
        ([0.1, 0.17, 0.21, 0.30, 0.51], 0.21),
        ([0.1, 0.17, 0.21, 0.28, 0.37, 0.45, 0.51], 0.2),
    ]
    for values, expected in samples:
        self.assertAlmostEqual(
            sequt.natural_cutoff(values, lo, hi), expected)
def test_natural_cutoff(self):
    """Verify the cut-off returned by ``sequt.natural_cutoff``.

    Three value lists are checked with the bounds ``lo=0.2`` and
    ``hi=0.5``; results are compared with ``assertAlmostEqual``.
    """
    lo, hi = 0.2, 0.5

    def expect(values, cutoff):
        # Single place where the function under test is invoked.
        self.assertAlmostEqual(sequt.natural_cutoff(values, lo, hi), cutoff)

    expect([0.28, 0.40, 0.50, 0.65], 0.28)
    expect([0.1, 0.17, 0.21, 0.30, 0.51], 0.21)
    expect([0.1, 0.17, 0.21, 0.28, 0.37, 0.45, 0.51], 0.2)
def cluster_by_jaccard(self, minlen=5):
    """
    Merge strings whose Jaccard distance stays under a per-row cut-off.

    For every entry of ``self.lass`` that is at least ``minlen`` long, a
    threshold is obtained from ``sequt.natural_cutoff`` applied to that
    entry's row of ``self.jaccarrays['distance']`` (with ``lo=0.2`` and
    ``hi=0.3``).  Every other entry that is also at least ``minlen`` long
    and whose distance to it does not exceed the threshold is merged with
    it through ``self.clusters.merge``.

    Entries shorter than ``minlen`` are never merged by this method.
    """
    strings = self.lass
    count = len(strings)
    distances = self.jaccarrays['distance']
    for row, anchor in enumerate(strings):
        if len(anchor) >= minlen:
            # The cut-off is recomputed for each row from its own distances.
            cutoff = sequt.natural_cutoff(distances[row], lo=0.2, hi=0.3)
            for col in xrange(count):
                other = strings[col]
                if (col != row and len(other) >= minlen
                        and distances[row, col] <= cutoff):
                    self.clusters.merge(anchor, other)
def link_across_length_levels(self):
    """
    Link across all length levels to create larger clusters.

    For each entry of ``self.lass`` the matching row of ``self.editdist``
    is divided, element by element, by the length of the shorter of the
    two strings involved.  The entry with the smallest resulting ratio is
    merged with the current one through ``self.clusters.merge`` when that
    ratio does not exceed the cut-off returned by
    ``sequt.natural_cutoff`` (with ``lo=0.2`` and ``hi=0.4``).
    """
    strings = self.lass
    count = len(strings)
    for row, anchor in enumerate(strings):
        raw = self.editdist[row]
        # 999 acts as a sentinel: the slot of the string itself is never
        # overwritten, so it cannot be selected as the minimum unless it
        # is the only entry.
        ratios = np.ones(count) * 999
        for col in xrange(count):
            if col != row:
                ratios[col] = 1.0 * raw[col] / min(len(anchor),
                                                   len(strings[col]))
        nearest = np.argmin(ratios)
        # The sentinel-padded vector is what the cut-off is derived from.
        cutoff = sequt.natural_cutoff(ratios, lo=0.2, hi=0.4)
        if ratios[nearest] <= cutoff:
            self.clusters.merge(anchor, strings[nearest])
def merge_within_length_level(self):
    """
    Find out small clusters within each length level.

    ``self.lass.lenbounds`` is iterated as ``length -> (lo, hi)`` pairs,
    where ``lo:hi`` is used as a half-open index range into ``self.lass``
    and ``self.editdist`` (presumably the entries having that length --
    confirm against the code that builds ``lenbounds``).  Levels holding
    fewer than two entries are skipped.

    Within a level, each row of edit distances is divided by ``length``
    and handed to ``sequt.natural_cutoff`` (with ``lo=0.2`` and
    ``hi=0.4``); every other entry of the level whose edit distance is at
    most ``cutoff * length`` is merged with the row's entry through
    ``self.clusters.merge``.
    """
    strings = self.lass
    for length, (start, stop) in strings.lenbounds.iteritems():
        if stop - start >= 2:
            for row in xrange(start, stop):
                # Cut-off chosen between 0.2 and 0.4 from this row's
                # length-normalised distances.
                ratios = self.editdist[row, start:stop] / float(length)
                cutoff = sequt.natural_cutoff(ratios, lo=0.2, hi=0.4)
                # Scale back so raw edit distances can be compared directly.
                limit = cutoff * length
                for col in xrange(start, stop):
                    if col == row:
                        continue
                    if self.editdist[row, col] <= limit:
                        self.clusters.merge(strings[row], strings[col])