Esempio n. 1
0
 def all_distances_correct(self):
     """
     Tell whether every pair of positions in the cluster agrees on distances.

     With As and Bs the two lists returned by self.AB_list(), each pair of
     positions i < j has to satisfy both
         d(As[i], As[j]) == d(Bs[i], Bs[j])
         d(As[i], Bs[i]) == d(As[j], Bs[j])
     where d is fast_distance.

     Returns False as soon as one pair of positions fails either equality,
     True otherwise (including when there are fewer than two positions).
     """
     size = len(self)
     lefts, rights = self.AB_list()
     for first in xrange(size):
         for second in xrange(first + 1, size):
             # Distance inside the As must match the distance inside the Bs.
             columns_agree = fast_distance(lefts[first], lefts[second]) == \
                 fast_distance(rights[first], rights[second])
             if not columns_agree:
                 return False
             # Distance from A to B must be the same at both positions.
             rows_agree = fast_distance(lefts[first], rights[first]) == \
                 fast_distance(lefts[second], rights[second])
             if not rows_agree:
                 return False
     return True
Esempio n. 2
0
	def discrepancies(self):
		"""
		Count, for each pair of the cluster, how many later/earlier pairs it disagrees with.

		Two pairs (A, B) and (C, D) disagree when d(A, C) != d(B, D), with d
		being fast_distance.  Each disagreement adds one to the counter of
		both pairs involved.  Every unordered couple of pairs is examined once.

		Returns a collections.defaultdict(int) keyed by (A, B) tuples; a pair
		that never disagrees has no entry.
		"""
		counts = collections.defaultdict(int)
		for position, [A, B] in enumerate(self):
			# Only look at the pairs that come after the current one.
			for C, D in self[position + 1:]:
				dAC, dBD = fast_distance(A, C), fast_distance(B, D)
				if dAC == dBD:
					continue
				if __trace__:
					print >> sys.stderr, '# d(%s, %s) = %d != %d = d(%s, %s)...' % \
						(A, C, dAC, dBD, B, D)
				counts[A, B] += 1
				counts[C, D] += 1
		return counts
Esempio n. 3
0
	def split_by_horizontal_distance(self, indistinguishables):
		"""
		Generate the subclusters obtained by grouping pairs on their distance d(A, B).

		Every pair (A', B') of the cluster is first expanded into all the
		combinations (A, B) of indistinguishables.all(A') and
		indistinguishables.all(B').  The combinations are then grouped by
		fast_distance(A, B), and a StrCluster is yielded for every group
		holding at least __minimal_size__ pairs.

		When __no_horizontal_splitting__ is set, the cluster itself is
		yielded unchanged and nothing else is produced.

		indistinguishables: object whose all(string) method returns an
			iterable of strings (presumably the strings considered
			equivalent to its argument -- confirm against its definition).
		"""
		if __no_horizontal_splitting__:
			if __trace__: print >> sys.stderr, '# No horizontal splitting...'
			yield self
			# End the generator with a plain return: an explicit
			# 'raise StopIteration' turns into a RuntimeError under PEP 479.
			return
		if __trace__: print >> sys.stderr, '# hcluster = %s' % self
		subclusters = collections.defaultdict(list)
		for Aprime, Bprime in self:
			# Include all the possible equivalent strings.
			equivA = indistinguishables.all(Aprime)
			equivB = indistinguishables.all(Bprime)
			if __trace__: print >> sys.stderr, 'equivA = %s, equivB = %s' % (equivA, equivB)
			# NOTE: no common-substring filter is applied to (A, B); the original
			# author disabled it as a heuristic that may not be theoretically valid.
			for A, B in itertools.product(equivA, equivB):
				dAB = fast_distance(A, B)
				# Trace the pair actually measured, not the whole equivalence sets.
				if __trace__: print >> sys.stderr, 'd(%s, %s) = %d' % (A, B, dAB)
				subclusters[dAB].append((A, B))
		if __trace__: print >> sys.stderr, '# subclusters = %s' % subclusters
		# Split into subclusters by distance, then emit each one that is big enough.
		for dAB, pairs in subclusters.items():
			# A subcluster must reach the minimal size to make a valid cluster.
			if __minimal_size__ <= len(pairs):
				if __trace__: print >> sys.stderr, 'subclusters[%d] = %s' % (dAB, pairs)
				yield StrCluster(pairs)
Esempio n. 4
0
    def __eq__(self, other):
        """
        Testing for equality with ==.

        Two clusters compare equal when their attributes (as filled in by
        set_attributes()) are equal and, for every pair (A, B) of self and
        every pair (C, D) of other, d(A, C) == d(B, D) with d = fast_distance.

        Caution: this is not an exact equality test; the contents of the two
        clusters are never compared directly.

        >>> Cluster.fromFile('jouer : jouais :: trouver : trouvais') == Cluster.fromFile('chantais : chanter :: portais : porter :: regardais : regarder')
        False
        >>> Cluster.fromFile('a : abc :: d : dbc') == Cluster.fromFile('a : acb :: d : dcb')
        False
        >>> Cluster.fromFile('ab : aabb :: aaabbb : aaaabbbb') == Cluster.fromFile('ab : abab :: abab : ababab :: ababab : abababab')
        False

        """
        self.set_attributes()
        other.set_attributes()
        # Cheap rejection first: differing attributes settle the question.
        if not (self.attributes == other.attributes):
            return False
        # all() stops at the first mismatching couple of pairs.
        return all(
            fast_distance(mine[0], theirs[0]) == fast_distance(mine[1], theirs[1])
            for mine in self
            for theirs in other)
Esempio n. 5
0
    def set_attributes(self):
        """
        Compute the attributes of the cluster, once.

        The cluster is normalized first, then the attributes are derived from
        its first pair (A, B):
            1. distance:   fast_distance(A, B);
            2. left_diff:  multiset of the symbols of A minus that of B;
            3. right_diff: multiset of the symbols of B minus that of A.

        The result is stored in self.attributes as an 'Attributes' namedtuple
        and self.attributes_set is set to True, so that later calls return
        immediately without recomputing anything.
        """
        if not self.attributes_set:
            self.normalize()
            Attributes = collections.namedtuple(
                'Attributes', ['distance', 'left_diff', 'right_diff'])
            head = self[0]
            A, B = head[0], head[1]
            symbolsA = collections.Counter(A)
            symbolsB = collections.Counter(B)
            # Counter subtraction keeps only the positive counts.
            self.attributes = Attributes(
                distance=fast_distance(A, B),
                left_diff=symbolsA - symbolsB,
                right_diff=symbolsB - symbolsA)
            self.attributes_set = True
Esempio n. 6
0
def fast_distance(word1, word2):
    """
    Return the distance between word1 and word2.

    Thin wrapper that forwards both arguments, unchanged, to
    _fast_distance.fast_distance and returns its result.
    """
    distance = _fast_distance.fast_distance(word1, word2)
    return distance
Esempio n. 7
0
###############################################################################

def read_argv():
	"""
	Parse the command line of the script.

	Recognized option (besides the --version and --help that OptionParser provides):
		-v, --verbose	run in verbose mode (off by default).

	Returns the (options, args) pair produced by OptionParser.parse_args():
	options.verbose holds the flag, args the remaining positional arguments.
	The number of positional arguments is not checked here.
	"""
	from optparse import OptionParser

	# The version banner uses the third '/'-separated field of __date__
	# (presumably the year -- confirm against the module header).
	this_version = 'v%s (c) %s %s' % (__version__, __date__.split('/')[2], __author__)
	this_description = __description__
	this_usage = '''%prog  stringA  stringB
	'''

	parser = OptionParser(version=this_version, description=this_description, usage=this_usage)
	parser.add_option('-v', '--verbose',
						action='store_true', dest='verbose', default=False,
						help='run in verbose mode')

	(options, args) = parser.parse_args()
	return options, args

###############################################################################

if __name__ == '__main__':
	options, args = read_argv()
	__verbose__ = options.verbose
	t1 = time.time()
	if len(args) != 2:
		print 'Error: two arguments required.'
		sys.exit(-1)
	print fast_distance(args[0], args[1])
	if __verbose__: print >> sys.stderr, '# Processing time: %.2fs' % (time.time() - t1)