class ComputeSeriation(object):
    """Seriation algorithm.

    Re-order words to promote the legibility of multi-word phrases and reveal
    the clustering of related terms.

    As output, the algorithm produces a list of seriated terms and its
    'ranking' (i.e., the iteration in which a term was seriated).
    """

    DEFAULT_NUM_SERIATED_TERMS = 100

    def __init__(self, logging_level):
        """Configure the shared 'ComputeSeriation' logger at ``logging_level``."""
        self.logger = logging.getLogger('ComputeSeriation')
        self.logger.setLevel(logging_level)
        # logging.getLogger returns the same object for the same name, so only
        # attach a stderr handler once; otherwise every additional instance
        # would duplicate each log line.
        if not self.logger.handlers:
            self.logger.addHandler(logging.StreamHandler(sys.stderr))
        for handler in self.logger.handlers:
            handler.setLevel(logging_level)

    def execute(self, data_path, numSeriatedTerms=None):
        """Read saliency/similarity data, seriate the terms, write the result.

        data_path        -- location handed to the Saliency/Similarity/Seriation
                            API objects; must not be None.
        numSeriatedTerms -- maximum number of terms to seriate; defaults to
                            DEFAULT_NUM_SERIATED_TERMS when None.
        """
        assert data_path is not None
        if numSeriatedTerms is None:
            numSeriatedTerms = ComputeSeriation.DEFAULT_NUM_SERIATED_TERMS

        self.logger.info('--------------------------------------------------------------------------------')
        self.logger.info('Computing term seriation...')
        self.logger.info(' data_path = %s', data_path)
        self.logger.info(' number_of_seriated_terms = %d', numSeriatedTerms)

        self.logger.info('Connecting to data...')
        self.saliency = SaliencyAPI(data_path)
        self.similarity = SimilarityAPI(data_path)
        self.seriation = SeriationAPI(data_path)

        self.logger.info('Reading data from disk...')
        self.saliency.read()
        self.similarity.read()

        self.logger.info('Reshaping saliency data...')
        self.reshape()

        self.logger.info('Computing seriation...')
        self.compute(numSeriatedTerms)

        self.logger.info('Writing data to disk...')
        self.seriation.write()

        self.logger.info('--------------------------------------------------------------------------------')

    def reshape(self):
        """Index ``self.saliency.term_info`` into per-term lookup tables.

        Fills ``orderedTermList`` (terms in the order they appear in
        ``term_info``) and the dictionaries ``termSaliency``, ``termFreqs``,
        ``termDistinct``, ``termRank`` and ``termVisibility`` keyed by term.
        Also resets ``candidateSize`` to 100.
        """
        # Width of the rank window used by iterate_eff to filter candidates.
        self.candidateSize = 100
        self.orderedTermList = []
        self.termSaliency = {}
        self.termFreqs = {}
        self.termDistinct = {}
        self.termRank = {}
        self.termVisibility = {}

        for element in self.saliency.term_info:
            term = element['term']
            self.orderedTermList.append(term)
            self.termSaliency[term] = element['saliency']
            self.termFreqs[term] = element['frequency']
            self.termDistinct[term] = element['distinctiveness']
            self.termRank[term] = element['rank']
            self.termVisibility[term] = element['visibility']

    def compute(self, numSeriatedTerms):
        """Seriate up to ``numSeriatedTerms`` terms.

        Results are stored on ``self.seriation``: ``term_ordering`` is the
        seriated list and ``term_iter_index`` lists the terms in the order in
        which they were placed. The loop stops early when the candidates run
        out or when an iteration cannot place any term.
        """
        start_time = time.time()

        # Work on a copy so that seriating does not drain self.orderedTermList.
        candidateTerms = list(self.orderedTermList)
        self.seriation.term_ordering = []
        self.seriation.term_iter_index = []
        self.buffers = [0, 0]

        preBest = []
        postBest = []

        for iteration in range(numSeriatedTerms):
            if not candidateTerms:
                # Fewer terms available than requested: nothing left to place.
                break
            print("Iteration no.  %s" % (iteration,))

            # 0 is the sentinel for "no term has been placed yet".
            addedTerm = 0
            if self.seriation.term_iter_index:
                addedTerm = self.seriation.term_iter_index[-1]
            if iteration == 1:
                # The bounds can only be seeded once a first term exists.
                (preBest, postBest) = self.initBestEnergies(addedTerm, candidateTerms)
            (preBest, postBest, self.bestEnergies) = self.getBestEnergies(preBest, postBest, addedTerm)

            placed_before = len(self.seriation.term_iter_index)
            (candidateTerms, self.seriation.term_ordering,
             self.seriation.term_iter_index, self.buffers) = self.iterate_eff(
                candidateTerms, self.seriation.term_ordering,
                self.seriation.term_iter_index, self.buffers,
                self.bestEnergies, iteration)
            print("---------------")

            if len(self.seriation.term_iter_index) == placed_before:
                # Further iterations would see identical state, so stop.
                self.logger.warning(
                    'No eligible candidate term at iteration %d; stopping early.',
                    iteration)
                break

        seriation_time = time.time() - start_time
        self.logger.debug("seriation time: %s", seriation_time)

    #-------------------------------------------------------------------------------#
    # Helper Functions

    def _bond(self, first, second, default=0):
        """Return ``combined_g2[(first, second)]`` or ``default`` if absent."""
        table = self.similarity.combined_g2
        key = (first, second)
        return table[key] if key in table else default

    def initBestEnergies(self, firstTerm, candidateTerms):
        """Seed the per-candidate best scores against the first placed term.

        Returns ``(preBest, postBest)``: two lists of ``(candidate, score)``
        in candidate order, where the pre score is looked up under
        ``(candidate, firstTerm)`` and the post score under
        ``(firstTerm, candidate)``; missing pairs score 0.
        """
        preBest = []
        postBest = []
        for candidate in candidateTerms:
            preBest.append((candidate, self._bond(candidate, firstTerm)))
            postBest.append((candidate, self._bond(firstTerm, candidate)))
        return (preBest, postBest)

    def getBestEnergies(self, preBest, postBest, addedTerm):
        """Update the best scores with the newly added term and rank candidates.

        ``preBest`` and ``postBest`` are modified in place: scores are raised
        where the pair with ``addedTerm`` scores higher, and the entry for
        ``addedTerm`` itself is removed. Returns
        ``(preBest, postBest, bestEnergies)`` where ``bestEnergies`` is a list
        of ``(term, pre + post)`` sorted by descending sum. When
        ``addedTerm == 0`` (nothing placed yet) the lists are returned
        untouched with an empty ``bestEnergies``.
        """
        if addedTerm == 0:
            return (preBest, postBest, [])

        table = self.similarity.combined_g2
        term_order = [entry[0] for entry in preBest]

        # Compare every candidate's best scores against the newly added term.
        remove_index = -1
        for index, term in enumerate(term_order):
            if term == addedTerm:
                remove_index = index

            pre_key = (term, addedTerm)
            if pre_key in table and table[pre_key] > preBest[index][1]:
                preBest[index] = (term, table[pre_key])

            post_key = (addedTerm, term)
            if post_key in table and table[post_key] > postBest[index][1]:
                postBest[index] = (term, table[post_key])

        # The added term is no longer a candidate.
        if remove_index != -1:
            del preBest[remove_index]
            del postBest[remove_index]

        energyMax = [pre[1] + post[1] for pre, post in zip(preBest, postBest)]
        bestEnergies = zip([entry[0] for entry in preBest], energyMax)
        return (preBest, postBest, sorted(bestEnergies, key=itemgetter(1), reverse=True))

    def iterate_eff(self, candidateTerms, term_ordering, term_iter_index, buffers, bestEnergies, iteration_no):
        """Place the candidate/position pair with the largest energy change.

        Candidates are visited in ``bestEnergies`` order when it is non-empty,
        otherwise in ``candidateTerms`` order. Only candidates whose rank is at
        most ``len(term_ordering) + self.candidateSize`` are scored.

        ``candidateTerms``, ``term_ordering``, ``term_iter_index`` and
        ``buffers`` are modified in place and returned as a 4-tuple. If no
        candidate is eligible, all four are returned unchanged.

        ``buffers[i]`` is the combined_g2 score recorded for the adjacent pair
        ``(term_ordering[i - 1], term_ordering[i])``; both end slots stay 0.
        """
        maxEnergyChange = float('-inf')
        maxTerm = None
        maxPosition = 0

        if bestEnergies:
            bestEnergy_terms = [entry[0] for entry in bestEnergies]
        else:
            bestEnergy_terms = candidateTerms

        num_positions = len(term_ordering) + 1
        rank_limit = len(term_ordering) + self.candidateSize
        last_index = len(bestEnergy_terms) - 1

        breakout_counter = 0
        for candidate_index, candidate in enumerate(bestEnergy_terms):
            breakout_counter += 1
            if self.termRank[candidate] <= rank_limit:
                for position in range(num_positions):
                    current_energy_change = self.getEnergyChange(
                        candidate, position, term_ordering,
                        buffers[position], iteration_no)
                    if current_energy_change > maxEnergyChange:
                        maxEnergyChange = current_energy_change
                        maxTerm = candidate
                        maxPosition = position

            # Early termination: bestEnergies is sorted in descending order, so
            # once the best change found reaches the current candidate's bound
            # the remaining candidates are skipped.
            # NOTE(review): this check is applied once per candidate, after all
            # positions were scored, using the trailing buffer slot -- confirm
            # against the intended algorithm.
            if bestEnergies and candidate_index < last_index:
                bound = 2 * (bestEnergies[candidate_index][1] + buffers[num_positions - 1])
                if maxEnergyChange >= bound:
                    print("#-------- breaking out early ---------#")
                    print("candidates checked:  %s" % (breakout_counter,))
                    break

        if maxTerm is None:
            # No eligible candidate: leave everything untouched instead of
            # failing on candidateTerms.remove().
            return (candidateTerms, term_ordering, term_iter_index, buffers)

        print("change in energy:  %s" % (maxEnergyChange,))
        print("maxTerm:  %s" % (maxTerm,))
        print("maxPosition:  %s" % (maxPosition,))

        candidateTerms.remove(maxTerm)

        # Update buffers so they describe the ordering after the insertion.
        if not term_ordering:
            pass  # first term: no neighbours, buffers stay [0, 0]
        elif maxPosition >= len(term_ordering):
            # Appended at the end: new bond goes before the trailing slot.
            buffers.insert(len(buffers) - 1, self._bond(term_ordering[-1], maxTerm))
        elif maxPosition == 0:
            # Prepended: new bond goes right after the leading slot.
            buffers.insert(1, self._bond(maxTerm, term_ordering[0]))
        else:
            # Inserted in the middle: the old bond is replaced by two new ones.
            buffers[maxPosition] = self._bond(term_ordering[maxPosition - 1], maxTerm)
            buffers.insert(maxPosition + 1, self._bond(maxTerm, term_ordering[maxPosition]))

        # Update term ordering and ranking.
        if maxPosition >= len(term_ordering):
            term_ordering.append(maxTerm)
        else:
            term_ordering.insert(maxPosition, maxTerm)
        term_iter_index.append(maxTerm)

        return (candidateTerms, term_ordering, term_iter_index, buffers)

    def getEnergyChange(self, candidateTerm, position, term_list, currentBuffer, iteration_no):
        """Score inserting ``candidateTerm`` at ``position`` in ``term_list``.

        On the first iteration (``iteration_no == 0``) the score is
        ``0.001 * frequency * saliency`` of the term (0.0 for unknown terms).
        Afterwards it is ``2 * (prevBond + postBond - currentBuffer)`` where
        the bonds are the combined_g2 scores with the would-be left and right
        neighbours (0.0 when a neighbour or a score is missing).
        """
        # First iteration only.
        if iteration_no == 0:
            current_freq = self.termFreqs.get(candidateTerm, 0.0)
            current_saliency = self.termSaliency.get(candidateTerm, 0.0)
            return 0.001 * current_freq * current_saliency

        prevBond = 0.0
        postBond = 0.0

        # Bond with the previous term.
        if position > 0:
            prevBond = self._bond(term_list[position - 1], candidateTerm, 0.0)

        # Bond with the next term; test the same key that is read.
        if position < len(term_list):
            postBond = self._bond(candidateTerm, term_list[position], 0.0)

        return 2 * (prevBond + postBond - currentBuffer)
class ComputeSeriation(object):
    """Seriation algorithm.

    Re-order words to promote the legibility of multi-word phrases and reveal
    the clustering of related terms.

    As output, the algorithm produces a list of seriated terms and its
    'ranking' (i.e., the iteration in which a term was seriated).
    """

    DEFAULT_NUM_SERIATED_TERMS = 100

    def __init__(self, logging_level):
        """Configure the shared 'ComputeSeriation' logger at ``logging_level``."""
        self.logger = logging.getLogger('ComputeSeriation')
        self.logger.setLevel(logging_level)
        # logging.getLogger returns the same object for the same name, so only
        # attach a stderr handler once; otherwise every additional instance
        # would duplicate each log line.
        if not self.logger.handlers:
            self.logger.addHandler(logging.StreamHandler(sys.stderr))
        for handler in self.logger.handlers:
            handler.setLevel(logging_level)

    def execute(self, data_path, numSeriatedTerms=None):
        """Read saliency/similarity data, seriate the terms, write the result.

        data_path        -- location handed to the Saliency/Similarity/Seriation
                            API objects; must not be None.
        numSeriatedTerms -- maximum number of terms to seriate; defaults to
                            DEFAULT_NUM_SERIATED_TERMS when None.
        """
        assert data_path is not None
        if numSeriatedTerms is None:
            numSeriatedTerms = ComputeSeriation.DEFAULT_NUM_SERIATED_TERMS

        self.logger.info('--------------------------------------------------------------------------------')
        self.logger.info('Computing term seriation...')
        self.logger.info(' data_path = %s', data_path)
        self.logger.info(' number_of_seriated_terms = %d', numSeriatedTerms)

        self.logger.info('Connecting to data...')
        self.saliency = SaliencyAPI(data_path)
        self.similarity = SimilarityAPI(data_path)
        self.seriation = SeriationAPI(data_path)

        self.logger.info('Reading data from disk...')
        self.saliency.read()
        self.similarity.read()

        self.logger.info('Reshaping saliency data...')
        self.reshape()

        self.logger.info('Computing seriation...')
        self.compute(numSeriatedTerms)

        self.logger.info('Writing data to disk...')
        self.seriation.write()

        self.logger.info('--------------------------------------------------------------------------------')

    def reshape(self):
        """Index ``self.saliency.term_info`` into per-term lookup tables.

        Fills ``orderedTermList`` (terms in the order they appear in
        ``term_info``) and the dictionaries ``termSaliency``, ``termFreqs``,
        ``termDistinct``, ``termRank`` and ``termVisibility`` keyed by term.
        Also resets ``candidateSize`` to 100.
        """
        # Width of the rank window used by iterate_eff to filter candidates.
        self.candidateSize = 100
        self.orderedTermList = []
        self.termSaliency = {}
        self.termFreqs = {}
        self.termDistinct = {}
        self.termRank = {}
        self.termVisibility = {}

        for element in self.saliency.term_info:
            term = element['term']
            self.orderedTermList.append(term)
            self.termSaliency[term] = element['saliency']
            self.termFreqs[term] = element['frequency']
            self.termDistinct[term] = element['distinctiveness']
            self.termRank[term] = element['rank']
            self.termVisibility[term] = element['visibility']

    def compute(self, numSeriatedTerms):
        """Seriate up to ``numSeriatedTerms`` terms.

        Results are stored on ``self.seriation``: ``term_ordering`` is the
        seriated list and ``term_iter_index`` lists the terms in the order in
        which they were placed. The loop stops early when the candidates run
        out or when an iteration cannot place any term.
        """
        start_time = time.time()

        # Work on a copy so that seriating does not drain self.orderedTermList.
        candidateTerms = list(self.orderedTermList)
        self.seriation.term_ordering = []
        self.seriation.term_iter_index = []
        self.buffers = [0, 0]

        preBest = []
        postBest = []

        for iteration in range(numSeriatedTerms):
            if not candidateTerms:
                # Fewer terms available than requested: nothing left to place.
                break
            print("Iteration no.  %s" % (iteration,))

            # 0 is the sentinel for "no term has been placed yet".
            addedTerm = 0
            if self.seriation.term_iter_index:
                addedTerm = self.seriation.term_iter_index[-1]
            if iteration == 1:
                # The bounds can only be seeded once a first term exists.
                (preBest, postBest) = self.initBestEnergies(addedTerm, candidateTerms)
            (preBest, postBest, self.bestEnergies) = self.getBestEnergies(preBest, postBest, addedTerm)

            placed_before = len(self.seriation.term_iter_index)
            (candidateTerms, self.seriation.term_ordering,
             self.seriation.term_iter_index, self.buffers) = self.iterate_eff(
                candidateTerms, self.seriation.term_ordering,
                self.seriation.term_iter_index, self.buffers,
                self.bestEnergies, iteration)
            print("---------------")

            if len(self.seriation.term_iter_index) == placed_before:
                # Further iterations would see identical state, so stop.
                self.logger.warning(
                    'No eligible candidate term at iteration %d; stopping early.',
                    iteration)
                break

        seriation_time = time.time() - start_time
        self.logger.debug("seriation time: %s", seriation_time)

    #-------------------------------------------------------------------------------#
    # Helper Functions

    def _bond(self, first, second, default=0):
        """Return ``combined_g2[(first, second)]`` or ``default`` if absent."""
        table = self.similarity.combined_g2
        key = (first, second)
        return table[key] if key in table else default

    def initBestEnergies(self, firstTerm, candidateTerms):
        """Seed the per-candidate best scores against the first placed term.

        Returns ``(preBest, postBest)``: two lists of ``(candidate, score)``
        in candidate order, where the pre score is looked up under
        ``(candidate, firstTerm)`` and the post score under
        ``(firstTerm, candidate)``; missing pairs score 0.
        """
        preBest = []
        postBest = []
        for candidate in candidateTerms:
            preBest.append((candidate, self._bond(candidate, firstTerm)))
            postBest.append((candidate, self._bond(firstTerm, candidate)))
        return (preBest, postBest)

    def getBestEnergies(self, preBest, postBest, addedTerm):
        """Update the best scores with the newly added term and rank candidates.

        ``preBest`` and ``postBest`` are modified in place: scores are raised
        where the pair with ``addedTerm`` scores higher, and the entry for
        ``addedTerm`` itself is removed. Returns
        ``(preBest, postBest, bestEnergies)`` where ``bestEnergies`` is a list
        of ``(term, pre + post)`` sorted by descending sum. When
        ``addedTerm == 0`` (nothing placed yet) the lists are returned
        untouched with an empty ``bestEnergies``.
        """
        if addedTerm == 0:
            return (preBest, postBest, [])

        table = self.similarity.combined_g2
        term_order = [entry[0] for entry in preBest]

        # Compare every candidate's best scores against the newly added term.
        remove_index = -1
        for index, term in enumerate(term_order):
            if term == addedTerm:
                remove_index = index

            pre_key = (term, addedTerm)
            if pre_key in table and table[pre_key] > preBest[index][1]:
                preBest[index] = (term, table[pre_key])

            post_key = (addedTerm, term)
            if post_key in table and table[post_key] > postBest[index][1]:
                postBest[index] = (term, table[post_key])

        # The added term is no longer a candidate.
        if remove_index != -1:
            del preBest[remove_index]
            del postBest[remove_index]

        energyMax = [pre[1] + post[1] for pre, post in zip(preBest, postBest)]
        bestEnergies = zip([entry[0] for entry in preBest], energyMax)
        return (preBest, postBest, sorted(bestEnergies, key=itemgetter(1), reverse=True))

    def iterate_eff(self, candidateTerms, term_ordering, term_iter_index, buffers, bestEnergies, iteration_no):
        """Place the candidate/position pair with the largest energy change.

        Candidates are visited in ``bestEnergies`` order when it is non-empty,
        otherwise in ``candidateTerms`` order. Only candidates whose rank is at
        most ``len(term_ordering) + self.candidateSize`` are scored.

        ``candidateTerms``, ``term_ordering``, ``term_iter_index`` and
        ``buffers`` are modified in place and returned as a 4-tuple. If no
        candidate is eligible, all four are returned unchanged.

        ``buffers[i]`` is the combined_g2 score recorded for the adjacent pair
        ``(term_ordering[i - 1], term_ordering[i])``; both end slots stay 0.
        """
        # Start below any real score so that a round in which every insertion
        # scores zero or less still selects the best available term.
        maxEnergyChange = float('-inf')
        maxTerm = None
        maxPosition = 0

        if bestEnergies:
            bestEnergy_terms = [entry[0] for entry in bestEnergies]
        else:
            bestEnergy_terms = candidateTerms

        num_positions = len(term_ordering) + 1
        rank_limit = len(term_ordering) + self.candidateSize
        last_index = len(bestEnergy_terms) - 1

        breakout_counter = 0
        for candidate_index, candidate in enumerate(bestEnergy_terms):
            breakout_counter += 1
            if self.termRank[candidate] <= rank_limit:
                for position in range(num_positions):
                    current_energy_change = self.getEnergyChange(
                        candidate, position, term_ordering,
                        buffers[position], iteration_no)
                    if current_energy_change > maxEnergyChange:
                        maxEnergyChange = current_energy_change
                        maxTerm = candidate
                        maxPosition = position

            # Early termination: bestEnergies is sorted in descending order, so
            # once the best change found reaches the current candidate's bound
            # the remaining candidates are skipped.
            # NOTE(review): this check is applied once per candidate, after all
            # positions were scored, using the trailing buffer slot -- confirm
            # against the intended algorithm.
            if bestEnergies and candidate_index < last_index:
                bound = 2 * (bestEnergies[candidate_index][1] + buffers[num_positions - 1])
                if maxEnergyChange >= bound:
                    print("#-------- breaking out early ---------#")
                    print("candidates checked:  %s" % (breakout_counter,))
                    break

        if maxTerm is None:
            # No eligible candidate: leave everything untouched instead of
            # failing on candidateTerms.remove().
            return (candidateTerms, term_ordering, term_iter_index, buffers)

        print("change in energy:  %s" % (maxEnergyChange,))
        print("maxTerm:  %s" % (maxTerm,))
        print("maxPosition:  %s" % (maxPosition,))

        candidateTerms.remove(maxTerm)

        # Update buffers so they describe the ordering after the insertion.
        if not term_ordering:
            pass  # first term: no neighbours, buffers stay [0, 0]
        elif maxPosition >= len(term_ordering):
            # Appended at the end: new bond goes before the trailing slot.
            buffers.insert(len(buffers) - 1, self._bond(term_ordering[-1], maxTerm))
        elif maxPosition == 0:
            # Prepended: new bond goes right after the leading slot.
            buffers.insert(1, self._bond(maxTerm, term_ordering[0]))
        else:
            # Inserted in the middle: the old bond is replaced by two new ones.
            buffers[maxPosition] = self._bond(term_ordering[maxPosition - 1], maxTerm)
            buffers.insert(maxPosition + 1, self._bond(maxTerm, term_ordering[maxPosition]))

        # Update term ordering and ranking.
        if maxPosition >= len(term_ordering):
            term_ordering.append(maxTerm)
        else:
            term_ordering.insert(maxPosition, maxTerm)
        term_iter_index.append(maxTerm)

        return (candidateTerms, term_ordering, term_iter_index, buffers)

    def getEnergyChange(self, candidateTerm, position, term_list, currentBuffer, iteration_no):
        """Score inserting ``candidateTerm`` at ``position`` in ``term_list``.

        On the first iteration (``iteration_no == 0``) the score is
        ``0.001 * frequency * saliency`` of the term (0.0 for unknown terms).
        Afterwards it is ``2 * (prevBond + postBond - currentBuffer)`` where
        the bonds are the combined_g2 scores with the would-be left and right
        neighbours (0.0 when a neighbour or a score is missing).
        """
        # First iteration only.
        if iteration_no == 0:
            current_freq = self.termFreqs.get(candidateTerm, 0.0)
            current_saliency = self.termSaliency.get(candidateTerm, 0.0)
            return 0.001 * current_freq * current_saliency

        prevBond = 0.0
        postBond = 0.0

        # Bond with the previous term.
        if position > 0:
            prevBond = self._bond(term_list[position - 1], candidateTerm, 0.0)

        # Bond with the next term; test the same key that is read.
        if position < len(term_list):
            postBond = self._bond(candidateTerm, term_list[position], 0.0)

        return 2 * (prevBond + postBond - currentBuffer)