def all_distances_correct(self):
    """
    Check that every two ratios of the cluster are distance-consistent.

    With (As, Bs) as returned by self.AB_list(), each pair of positions
    i < j must satisfy both
        fast_distance(As[i], As[j]) == fast_distance(Bs[i], Bs[j])
        fast_distance(As[i], Bs[i]) == fast_distance(As[j], Bs[j])

    Returns False as soon as one pair violates either equality, True
    otherwise (in particular when there are fewer than two ratios).
    """
    size = len(self)
    firsts, seconds = self.AB_list()
    for lo in xrange(size):
        for hi in xrange(lo + 1, size):
            same_across = (fast_distance(firsts[lo], firsts[hi]) ==
                           fast_distance(seconds[lo], seconds[hi]))
            # The within-ratio distances are only compared when the
            # across-ratio distances already agree.
            if same_across and (fast_distance(firsts[lo], seconds[lo]) ==
                                fast_distance(firsts[hi], seconds[hi])):
                continue
            return False
    return True
def discrepancies(self):
    """
    Count, for each ratio, the distance mismatches it takes part in.

    Every ratio (A, B) is compared with each later ratio (C, D) of the
    cluster; whenever fast_distance(A, C) differs from fast_distance(B, D)
    the counters of both ratios are incremented.

    Returns a collections.defaultdict(int) keyed by (A, B) tuples. Ratios
    that never take part in a mismatch have no entry.
    """
    counts = collections.defaultdict(int)
    for position, (first, second) in enumerate(self):
        for other_first, other_second in self[position + 1:]:
            d_firsts = fast_distance(first, other_first)
            d_seconds = fast_distance(second, other_second)
            if d_firsts == d_seconds:
                continue
            if __trace__:
                print >> sys.stderr, '# d(%s, %s) = %d != %d = d(%s, %s)...' % (
                    first, other_first, d_firsts, d_seconds,
                    second, other_second)
            counts[first, second] += 1
            counts[other_first, other_second] += 1
    return counts
def split_by_horizontal_distance(self, indistinguishables):
    """
    Split the cluster into subclusters of ratios sharing one A-to-B distance.

    Each ratio (A', B') of the cluster is expanded into every pair (A, B)
    with A in indistinguishables.all(A') and B in indistinguishables.all(B').
    The pairs are then grouped by fast_distance(A, B).

    Parameters:
        indistinguishables: object whose all(s) method returns an iterable
            of the strings to use in place of s.

    Yields:
        If __no_horizontal_splitting__ is set: the cluster itself, and
        nothing else.
        Otherwise: one StrCluster per distance group that holds at least
        __minimal_size__ pairs, in dictionary iteration order.
    """
    if __no_horizontal_splitting__:
        if __trace__:
            print >> sys.stderr, '# No horizontal splitting...'
        yield self
        # A bare return ends the generator. Raising StopIteration by hand
        # is turned into a RuntimeError under PEP 479.
        return
    if __trace__:
        print >> sys.stderr, '# hcluster = %s' % self
    subclusters = collections.defaultdict(list)
    for Aprime, Bprime in self:
        # Include all the possible equivalent strings.
        equivA = indistinguishables.all(Aprime)
        equivB = indistinguishables.all(Bprime)
        if __trace__:
            print >> sys.stderr, 'equivA = %s, equivB = %s' % (equivA, equivB)
        for A, B in itertools.product(equivA, equivB):
            # A common_substring(A, B) filter used to be applied here; it was
            # only a heuristic and may not be theoretically valid.
            dAB = fast_distance(A, B)
            if __trace__:
                # Report the pair that was measured, not the whole sets.
                print >> sys.stderr, 'd(%s, %s) = %d' % (A, B, dAB)
            subclusters[dAB].append((A, B))
    if __trace__:
        print >> sys.stderr, '# subclusters = %s' % subclusters
    # Split into subclusters by distance, then emit each one that is big
    # enough to make a valid cluster (at least __minimal_size__ ratios).
    for dAB, pairs in subclusters.items():
        if __minimal_size__ <= len(pairs):
            if __trace__:
                print >> sys.stderr, 'subclusters[%d] = %s' % (dAB, pairs)
            yield StrCluster(pairs)
def __eq__(self, other):
    """
    Testing for equality with ==.

    Two clusters compare equal when their attributes (see set_attributes)
    are equal and, for every ratio (A, B) of self and every ratio (C, D) of
    other, fast_distance(A, C) == fast_distance(B, D).

    Caution: this test is not exact. The attributes are computed from the
    first analogy (median strings) of each cluster only.

    >>> Cluster.fromFile('jouer : jouais :: trouver : trouvais') == Cluster.fromFile('chantais : chanter :: portais : porter :: regardais : regarder')
    False
    >>> Cluster.fromFile('a : abc :: d : dbc') == Cluster.fromFile('a : acb :: d : dcb')
    False
    >>> Cluster.fromFile('ab : aabb :: aaabbb : aaaabbbb') == Cluster.fromFile('ab : abab :: abab : ababab :: ababab : abababab')
    False
    """
    self.set_attributes()
    other.set_attributes()
    # Cheap rejection first: differing attributes settle the question.
    if not self.attributes == other.attributes:
        return False
    # Stops at the first cross pair whose two distances disagree.
    mismatch_found = any(
        not fast_distance(mine[0], theirs[0]) == fast_distance(mine[1], theirs[1])
        for mine in self
        for theirs in other)
    return not mismatch_found
def set_attributes(self):
    """
    Compute and cache the attributes of the cluster.

    The attributes are derived from the first ratio (A, B) of the cluster,
    read after self.normalize() has run, and stored in self.attributes as a
    named tuple with the fields:
        distance   -- fast_distance(A, B);
        left_diff  -- multiset of the symbols of A minus that of B;
        right_diff -- multiset of the symbols of B minus that of A.

    Nothing is done when self.attributes_set is already true.
    """
    if not self.attributes_set:
        self.normalize()
        record_type = collections.namedtuple(
            'Attributes', ['distance', 'left_diff', 'right_diff'])
        first_ratio = self[0]
        left, right = first_ratio[0], first_ratio[1]
        left_symbols = collections.Counter(left)
        right_symbols = collections.Counter(right)
        # Counter subtraction keeps only the symbols with a positive surplus.
        self.attributes = record_type(
            fast_distance(left, right),
            left_symbols - right_symbols,
            right_symbols - left_symbols)
        self.attributes_set = True
def fast_distance(word1, word2):
    """
    Return the distance between word1 and word2.

    Thin wrapper that hands both arguments, unchanged, to
    _fast_distance.fast_distance and returns its result.
    """
    distance = _fast_distance.fast_distance(word1, word2)
    return distance
###############################################################################

def read_argv():
    """
    Parse the command line.

    Returns:
        (options, args) as produced by optparse. options.verbose is True
        when -v/--verbose was given; args holds the positional arguments.
    """
    from optparse import OptionParser
    # The third '/'-separated field of __date__ goes in the copyright
    # notice (presumably the year -- confirm the __date__ format).
    this_version = 'v%s (c) %s %s' % (__version__, __date__.split('/')[2], __author__)
    this_description = __description__
    this_usage = '%prog stringA stringB\n'

    parser = OptionParser(version=this_version,
                          description=this_description,
                          usage=this_usage)
    parser.add_option('-v', '--verbose',
                      action='store_true', dest='verbose', default=False,
                      help='run in verbose mode')
    (options, args) = parser.parse_args()
    return options, args

###############################################################################

if __name__ == '__main__':
    options, args = read_argv()
    __verbose__ = options.verbose
    t1 = time.time()
    if len(args) != 2:
        # Diagnostics go to stderr so that stdout only carries the distance.
        sys.stderr.write('Error: two arguments required.\n')
        sys.exit(-1)
    # Parenthesised so the line parses the same with or without print_function.
    print(fast_distance(args[0], args[1]))
    if __verbose__:
        sys.stderr.write('# Processing time: %.2fs\n' % (time.time() - t1))