def thesaurus_file_similarity(filep1, filep2, measure=cosine, weighting=k_minus_rank,
                              maxrank=None, log=logging.getLogger()):
    '''
    Compare two thesaurus files and return a summary similarity pair.

    Both files are read with extract_terms(), each resulting list is sorted
    in place into ascending order (so the comparison can merge them), and
    the two lists are handed to thesaurus_similarity() along with the
    measure, weighting and maxrank options unchanged.

    Returns the 2-tuple (mu, sigma) produced by thesaurus_similarity().
    NOTE(review): the names suggest a mean and a standard deviation of the
    per-entry similarities -- confirm against thesaurus_similarity().
    '''
    log = log.getChild('thesaurus_similarity')
    separator = '-----------------------------------------'

    # Load the thesauri files into neighbours lists
    log.info(separator)
    first = extract_terms(filep1, log=log)
    second = extract_terms(filep2, log=log)

    # Sort in ascending order of base entry (to allow merging)
    log.info(separator)
    for message, neighbours in (('Sorting first neighbours list.', first),
                                ('Sorting second neighbours list.', second)):
        log.info(message)
        neighbours.sort()

    # Calculate the similarities
    log.info(separator)
    result_mu, result_sigma = thesaurus_similarity(
        first, second,
        measure=measure, weighting=weighting, maxrank=maxrank, log=log)
    return (result_mu, result_sigma)
def extract_terms(filep, log=logging.getLogger()):
    '''
    Read all entry neighbours lists from the given file, returning a list
    containing one element for each base entry. Each element is a pair: the
    base entry string, and a list of (neighbour, score) tuples.

    Each line is expected to hold tab-separated fields: the base entry
    followed by alternating neighbour strings and numeric scores. The line
    terminator is stripped before splitting, so an entry with no neighbours
    does not keep a trailing newline in its name. Completely blank lines
    are skipped rather than recorded as an entry.

    filep -- an open, iterable file-like object yielding text lines. It is
             always closed before this function returns, including when a
             malformed line raises.
    log   -- parent logger; messages are emitted on its 'load' child.

    Raises ValueError if a score field cannot be converted to float, and
    IndexError if a neighbour has no matching score field.
    '''
    log = log.getChild('load')
    # Fall back to the object itself for file-likes that have no name.
    log.info('Reading thesaurus \'%s\'.', getattr(filep, 'name', filep))

    terms = []
    base_entry_count = 0
    neighbour_count = 0
    try:
        for line in filep:
            # Drop only the line terminator; other whitespace is data.
            line = line.rstrip('\r\n')
            if not line:
                continue
            fields = line.split('\t')
            base_entry = fields[0]
            # Fields 1.. come in (neighbour, score) pairs.
            neighbours = [(fields[i], float(fields[i + 1]))
                          for i in range(1, len(fields), 2)]
            terms.append((base_entry, neighbours))
            base_entry_count += 1
            neighbour_count += len(neighbours)
            if base_entry_count % 20000 == 0:
                log.info('Read %d entries with %d neighbours.',
                         base_entry_count, neighbour_count)
    finally:
        # The original contract is that this function closes the file it
        # was handed; make sure that also happens on a parse error.
        filep.close()

    log.info('Completed: Read %d entries with %d neighbours.',
             base_entry_count, neighbour_count)
    return terms
def thesaurus_similarities(neighs1, neighs2, measure=cosine, weighting=k_minus_rank,
                           maxrank=0, log=logging.getLogger()):
    '''
    Produce a list of similarity tuples, where each tuple is a base entry
    paired with the similarity between its neighbours lists in the two
    thesauri.

    Every base entry that occurs in either input yields exactly one tuple.
    When an entry is present in only one thesaurus, its neighbours list is
    scored against an empty list for the other side.

    Both inputs are lists of (base_entry, neighbours) elements and are
    assumed to have been previously sorted in ascending order of base
    entry; they are walked in step like the merge phase of a merge sort.

    measure, weighting, maxrank -- passed unchanged to
        neighbours_list_similarity() for every comparison.
    log -- parent logger; messages are emitted on its 'compute' child.

    Returns the list of (base_entry, similarity) tuples in merge order.
    '''
    log = log.getChild('compute')
    if log.isEnabledFor(logging.INFO):
        # __name__ exists on functions in both Python 2 and 3; fall back to
        # repr() for callables (partials, instances) that have no name.
        log.info('Calculating %s similarities between %s weighted neighbours lists.' % (
            getattr(measure, '__name__', repr(measure)),
            getattr(weighting, '__name__', repr(weighting))))

    def score(list1, list2):
        # Single place where the shared scoring options are applied.
        return neighbours_list_similarity(list1, list2, measure=measure,
                                          weighting=weighting, maxrank=maxrank)

    len1, len2 = len(neighs1), len(neighs2)
    sims = []
    i, j = 0, 0
    while i < len1 and j < len2:
        entry1, entry2 = neighs1[i][0], neighs2[j][0]
        if entry1 == entry2:
            # Entry present in both thesauri: compare the two lists.
            if log.isEnabledFor(logging.DEBUG):
                log.debug('entry: %s' % entry1)
            sims.append((entry1, score(neighs1[i][1], neighs2[j][1])))
            i += 1
            j += 1
        elif entry1 < entry2:
            # Entry only in the first thesaurus.
            sims.append((entry1, score(neighs1[i][1], [])))
            i += 1
        else:
            # Entry only in the second thesaurus.
            sims.append((entry2, score([], neighs2[j][1])))
            j += 1
        if log.isEnabledFor(logging.INFO) and max(i, j) % 1000 == 0:
            # len1 + len2 is non-zero here: the loop only runs when both
            # inputs are non-empty.
            log.info('Calculated %d similarities. (%.1f%% complete)' % (
                len(sims), 100.0 * (i + j) / (len1 + len2)))

    # At most one of the inputs still has entries; none of them can match
    # anything on the other side.
    for item in neighs1[i:]:
        sims.append((item[0], score(item[1], [])))
    for item in neighs2[j:]:
        sims.append((item[0], score([], item[1])))

    if log.isEnabledFor(logging.INFO):
        log.info('Calculated %d similarities. (%.1f%% complete)' % (
            len(sims), 100.0))
    return sims