def extract_terms(filep, log=logging.getLogger()):
	'''
	Read all entry neighbours lists from the given file, returning a list
	containing one element per base entry. Each element is a pair: the
	base-entry string and a list of (neighbour, score) tuples.
	'''
	log = log.getChild('load')
	if log.isEnabledFor(logging.INFO): 
		log.info('Reading thesaurus \'%s\'.' % filep.name)
	terms = []
	base_entry_count = 0
	neighbour_count = 0
	for line in filep:
		# Fields are tab-separated: the base entry, then alternating
		# neighbour/score pairs.
		f = line.rstrip('\n').split('\t')
		base_entry = f[0]
		neighbours = [(f[i], float(f[i+1])) for i in xrange(1, len(f), 2)]
		terms.append( (base_entry, neighbours) )
		base_entry_count += 1
		neighbour_count += len(neighbours)
		if log.isEnabledFor(logging.INFO) and base_entry_count % 20000 == 0:
			log.info('Read %d entries with %d neighbours.' % (
				base_entry_count, neighbour_count))
	if log.isEnabledFor(logging.INFO):
		log.info('Completed: Read %d entries with %d neighbours.' % (
			base_entry_count, neighbour_count))
	filep.close()
	return terms
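# A minimal usage sketch (illustration only, not part of the original module).
# It shows the line format extract_terms expects: a base entry followed by
# tab-separated neighbour/score pairs. StringIO stands in for a real file; the
# .name attribute is set because extract_terms may log it.
def _example_extract_terms():
	import StringIO
	sample = 'cat\tdog\t0.81\tkitten\t0.78\nhouse\thome\t0.90\n'
	filep = StringIO.StringIO(sample)
	filep.name = '<sample>'
	terms = extract_terms(filep)
	# terms == [('cat', [('dog', 0.81), ('kitten', 0.78)]),
	#           ('house', [('home', 0.9)])]
	return terms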
def thesaurus_file_similarity(filep1, filep2, 
		measure=cosine, weighting=k_minus_rank, maxrank=None,
		log=logging.getLogger()):
	'''
	Read two thesaurus files, sort their entries by base entry, and compute
	the per-entry neighbours-list similarities between them, returning the
	(mean, standard deviation) pair produced by thesaurus_similarity.
	'''
	log = log.getChild('thesaurus_similarity')
	log.info('-----------------------------------------')
	
	# Load the thesauri files into neighbours lists
	neighs1 = extract_terms(filep1, log=log)
	neighs2 = extract_terms(filep2, log=log)
	
	log.info('-----------------------------------------')
	
	# Sort in ascending order of base entry (to allow merging)
	log.info('Sorting first neighbours list.')
	neighs1.sort()
	log.info('Sorting second neighbours list.')
	neighs2.sort()
		
	log.info('-----------------------------------------')
	
	# Calculate the similarities
	(mu, sigma) = thesaurus_similarity(neighs1, neighs2, 
		measure=measure, weighting=weighting, maxrank=maxrank, log=log)
	
	return (mu, sigma)
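# A minimal usage sketch (illustration only; the file names are hypothetical
# placeholders). Both file objects are consumed and closed by extract_terms.
def _example_thesaurus_file_similarity():
	filep1 = open('thesaurus-a.txt', 'r')
	filep2 = open('thesaurus-b.txt', 'r')
	mu, sigma = thesaurus_file_similarity(filep1, filep2)
	print 'mean similarity = %f (std. dev. = %f)' % (mu, sigma)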
def thesaurus_similarities(neighs1, neighs2, 
		measure=cosine, weighting=k_minus_rank, maxrank=0,
		log=logging.getLogger()):
	'''
	Produce a list of similarity tuples, where each tuple pairs a base entry
	with the similarity between its neighbours lists in the two thesauri. If a
	base entry exists in only one thesaurus, its neighbours list is compared
	against an empty list.
	
	Neighbours lists are assumed to have been previously sorted
	lexicographically in ascending order of base entry.
	'''
	log = log.getChild('compute')
	if log.isEnabledFor(logging.INFO):
		log.info('Calculating %s similarities between %s weighted neighbours lists.' % (
			measure.func_name, weighting.func_name))
	sims = []
	i, j = 0, 0
	
	# Merge the two sorted lists, scoring entries present in both thesauri and
	# entries present in only one of them.
	while i < len(neighs1) and j < len(neighs2):
		if neighs1[i][0] == neighs2[j][0]:
			if log.isEnabledFor(logging.DEBUG):	
				log.debug('entry: %s' % neighs1[i][0])			
			
			sim = neighbours_list_similarity(neighs1[i][1], neighs2[j][1],
				measure=measure, weighting=weighting, maxrank=maxrank)			
			sims.append( (neighs1[i][0], sim) )			
			i += 1
			j += 1
		elif neighs1[i][0] < neighs2[j][0]:
			sim = neighbours_list_similarity(neighs1[i][1], [],
				measure=measure, weighting=weighting, maxrank=maxrank)
			sims.append( (neighs1[i][0], sim) )	
			i += 1
		else: # neighs1[i][0] > neighs2[j][0]
			sim = neighbours_list_similarity([], neighs2[j][1],
				measure=measure, weighting=weighting, maxrank=maxrank)			
			sims.append( (neighs2[j][0], sim) )			
			j += 1
		if log.isEnabledFor(logging.INFO) and len(sims) % 1000 == 0:
			log.info('Calculated %d similarities. (%.1f%% complete)' % (
				len(sims), 100.0 * (i+j) / (len(neighs1)+len(neighs2))))
	# Score any remaining entries that appear in only one of the thesauri.
	while i < len(neighs1):
		sim = neighbours_list_similarity(neighs1[i][1], [],
			measure=measure, weighting=weighting, maxrank=maxrank)
		sims.append( (neighs1[i][0], sim) )	
		i += 1
	while j < len(neighs2):
		sim = neighbours_list_similarity([], neighs2[j][1],
			measure=measure, weighting=weighting, maxrank=maxrank)			
		sims.append( (neighs2[j][0], sim) )			
		j += 1
	if log.isEnabledFor(logging.INFO):
		log.info('Completed: Calculated %d similarities.' % len(sims))
	
	return sims
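# A minimal sketch of the merge behaviour (illustration only, not part of the
# original module). Both inputs are already sorted by base entry; 'ant' exists
# only in the first list, so it is scored against an empty neighbours list.
# The actual similarity values depend on the measure and weighting functions
# configured elsewhere in the module.
def _example_thesaurus_similarities():
	neighs1 = [('ant', [('bee', 0.4)]),
		('cat', [('dog', 0.8), ('kitten', 0.7)])]
	neighs2 = [('cat', [('dog', 0.9), ('lion', 0.5)])]
	return thesaurus_similarities(neighs1, neighs2)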