Example #1
    def execute(self, data_path, numSeriatedTerms=None):

        assert data_path is not None
        if numSeriatedTerms is None:
            numSeriatedTerms = ComputeSeriation.DEFAULT_NUM_SERIATED_TERMS

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Computing term seriation...')
        self.logger.info('    data_path = %s', data_path)
        self.logger.info('    number_of_seriated_terms = %d', numSeriatedTerms)

        self.logger.info('Connecting to data...')
        self.saliency = SaliencyAPI(data_path)
        self.similarity = SimilarityAPI(data_path)
        self.seriation = SeriationAPI(data_path)

        self.logger.info('Reading data from disk...')
        self.saliency.read()
        self.similarity.read()

        self.logger.info('Reshaping saliency data...')
        self.reshape()

        self.logger.info('Computing seriation...')
        self.compute(numSeriatedTerms)

        self.logger.info('Writing data to disk...')
        self.seriation.write()

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
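
The execute() method above assumes SaliencyAPI, SimilarityAPI, and SeriationAPI objects that expose read()/write() against data_path. A minimal stand-in for tracing the control flow (hypothetical; the field names are inferred from how the fuller examples below use the seriation object):

class SeriationAPI(object):
    """Hypothetical stub; the real class persists seriation results."""

    def __init__(self, data_path):
        self.data_path = data_path
        self.term_ordering = []    # seriated terms, in order
        self.term_iter_index = []  # iteration at which each term was placed

    def read(self):
        pass  # would load any previously written results from data_path

    def write(self):
        pass  # would persist term_ordering and term_iter_index to data_path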
Example #2
	def execute( self, data_path, numSeriatedTerms = None ):
		
		assert data_path is not None
		if numSeriatedTerms is None:
			numSeriatedTerms = ComputeSeriation.DEFAULT_NUM_SERIATED_TERMS
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Computing term seriation...'                                                      )
		self.logger.info( '    data_path = %s', data_path                                                    )
		self.logger.info( '    number_of_seriated_terms = %d', numSeriatedTerms                              )
		
		self.logger.info( 'Connecting to data...' )
		self.saliency = SaliencyAPI( data_path )
		self.similarity = SimilarityAPI( data_path )
		self.seriation = SeriationAPI( data_path )
		
		self.logger.info( 'Reading data from disk...' )
		self.saliency.read()
		self.similarity.read()
		
		self.logger.info( 'Reshaping saliency data...' )
		self.reshape()
		
		self.logger.info( 'Computing seriation...' )
		self.compute( numSeriatedTerms )
		
		self.logger.info( 'Writing data to disk...' )
		self.seriation.write()
		
		self.logger.info( '--------------------------------------------------------------------------------' )
class ComputeSimilarity( object ):
	"""
	Similarity measures.
	
	Compute term similarity based on co-occurrence and
	collocation likelihoods.
	"""
	
	DEFAULT_SLIDING_WINDOW_SIZE = 10
	MAX_FREQ = 100.0
	
	def __init__( self, logging_level ):
		self.logger = logging.getLogger( 'ComputeSimilarity' )
		self.logger.setLevel( logging_level )
		handler = logging.StreamHandler( sys.stderr )
		handler.setLevel( logging_level )
		self.logger.addHandler( handler )
	
	def execute( self, data_path, sliding_window_size = None ):
		
		assert data_path is not None
		if sliding_window_size is None:
			sliding_window_size = ComputeSimilarity.DEFAULT_SLIDING_WINDOW_SIZE
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Computing term similarity...'                                                     )
		self.logger.info( '    data_path = %s', data_path                                                    )
		self.logger.info( '    sliding_window_size = %d', sliding_window_size                                )
		
		self.logger.info( 'Connecting to data...' )
		self.tokens = TokensAPI( data_path )
		self.similarity = SimilarityAPI( data_path )
		
		self.logger.info( 'Reading data from disk...' )
		self.tokens.read()
		
		self.logger.info( 'Computing document co-occurrence...' )
		self.computeDocumentCooccurrence()
		
		self.logger.info( 'Computing sliding-window co-occurrence...' )
		self.computeSlidingWindowCooccurrence( sliding_window_size )
		
		self.logger.info( 'Counting total number of tokens, unigrams, and bigrams in the corpus...' )
		self.computeTokenCounts()
		
		self.logger.info( 'Computing document co-occurrence likelihood...' )
		self.similarity.document_g2 = self.getG2Stats( self.document_count, self.similarity.document_occurrence, self.similarity.document_cooccurrence )
		
		self.logger.info( 'Computing sliding-window co-occurrence likelihood...' )
		self.similarity.window_g2 = self.getG2Stats( self.window_count, self.similarity.window_occurrence, self.similarity.window_cooccurrence )
		
		self.logger.info( 'Computing collocation likelihood...' )
		self.similarity.collocation_g2 = self.getG2Stats( self.token_count, self.similarity.unigram_counts, self.similarity.bigram_counts )
		
		self.combineSimilarityMatrices()
		
		self.logger.info( 'Writing data to disk...' )
		self.similarity.write()
		
		self.logger.info( '--------------------------------------------------------------------------------' )
	
	def incrementCount( self, occurrence, key ):
		if key not in occurrence:
			occurrence[ key ] = 1
		else:
			occurrence[ key ] += 1
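
	# Note: incrementCount() is the manual equivalent of
	# occurrence[key] = occurrence.get(key, 0) + 1 (or a collections.Counter update).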
	
	def computeDocumentCooccurrence( self ):
		document_count = 0
		occurrence = {}
		cooccurrence = {}
		for docID, docTokens in self.tokens.data.iteritems():
			self.logger.debug( '    %s (%d tokens)', docID, len(docTokens) )
			tokenSet = frozenset(docTokens)
			document_count += 1
			for token in tokenSet:
				self.incrementCount( occurrence, token )
			for aToken in tokenSet:
				for bToken in tokenSet:
					if aToken < bToken:
						self.incrementCount( cooccurrence, (aToken, bToken) )
		
		self.document_count = document_count
		self.similarity.document_occurrence = occurrence
		self.similarity.document_cooccurrence = cooccurrence
	
	def computeSlidingWindowCooccurrence( self, sliding_window_size ):
		window_count = 0
		occurrence = {}
		cooccurrence = {}
		for docID, docTokens in self.tokens.data.iteritems():
			allWindowTokens = self.getSlidingWindowTokens( docTokens, sliding_window_size )
			self.logger.debug( '    %s (%d tokens, %d windows)', docID, len(docTokens), len(allWindowTokens) )
			for windowTokens in allWindowTokens:
				tokenSet = frozenset(windowTokens)
				window_count += 1
				for token in tokenSet:
					self.incrementCount( occurrence, token )
				for aToken in tokenSet:
					for bToken in tokenSet:
						if aToken < bToken:
							self.incrementCount( cooccurrence, (aToken, bToken) )
		
		self.window_count = window_count
		self.similarity.window_occurrence = occurrence
		self.similarity.window_cooccurrence = cooccurrence
	
	def getSlidingWindowTokens( self, tokens, sliding_window_size ):
		allWindows = []
		aIndex = 0 - sliding_window_size
		bIndex = len(tokens) + sliding_window_size
		for index in range( aIndex, bIndex ):
			a = max( 0           , index - sliding_window_size )
			b = min( len(tokens) , index + sliding_window_size )
			allWindows.append( tokens[a:b] )
		return allWindows
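
	# Worked example (hypothetical inputs): with tokens = ['a', 'b', 'c'] and
	# sliding_window_size = 1, the loop runs index = -1..3 and yields the windows
	# [], ['a'], ['a', 'b'], ['b', 'c'], ['c'] -- each window spans up to
	# 2 * sliding_window_size tokens, and empty edge windows are still counted
	# by computeSlidingWindowCooccurrence().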
	
	def computeTokenCounts( self ):
		token_count = sum( len(docTokens) for docTokens in self.tokens.data.itervalues() )
		
		unigram_counts = {}
		for docTokens in self.tokens.data.itervalues():
			for token in docTokens:
				self.incrementCount( unigram_counts, token )
		
		bigram_counts = {}
		for docTokens in self.tokens.data.itervalues():
			prevToken = None
			for currToken in docTokens:
				if prevToken is not None:
					self.incrementCount( bigram_counts, (prevToken, currToken) )
				prevToken = currToken
		
		self.token_count = token_count
		self.similarity.unigram_counts = unigram_counts
		self.similarity.bigram_counts = bigram_counts
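
	# getBinomial()/getG2() below compute a log-likelihood ratio score in the
	# spirit of Dunning's G^2 statistic: with a = B_given_A, b = B_given_notA,
	# c = any_given_A, d = any_given_notA, the expected counts are
	# E1 = c * (a + b) / (c + d) and E2 = d * (a + b) / (c + d), and the score
	# is 2 * (a * ln(a / E1) + b * ln(b / E2)); the code treats larger values
	# as stronger evidence of term association.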
	
	def getBinomial( self, B_given_A, any_given_A, B_given_notA, any_given_notA ):
		assert B_given_A >= 0
		assert B_given_notA >= 0
		assert any_given_A >= B_given_A
		assert any_given_notA >= B_given_notA
		
		a = float( B_given_A )
		b = float( B_given_notA )
		c = float( any_given_A )
		d = float( any_given_notA )
		E1 = c * ( a + b ) / ( c + d )
		E2 = d * ( a + b ) / ( c + d )
		
		g2a = 0
		g2b = 0
		if a > 0:
			g2a = a * math.log( a / E1 )
		if b > 0:
			g2b = b * math.log( b / E2 )
		return 2 * ( g2a + g2b )
	
	def getG2( self, freq_all, freq_ab, freq_a, freq_b ):
		assert freq_all >= freq_a
		assert freq_all >= freq_b
		assert freq_a >= freq_ab
		assert freq_b >= freq_ab
		assert freq_all >= 0
		assert freq_ab >= 0
		assert freq_a >= 0
		assert freq_b >= 0
		
		B_given_A = freq_ab
		B_given_notA = freq_b - freq_ab
		any_given_A = freq_a
		any_given_notA = freq_all - freq_a
		
		return self.getBinomial( B_given_A, any_given_A, B_given_notA, any_given_notA )
	
	def getG2Stats( self, max_count, occurrence, cooccurrence ):
		g2_stats = {}
		freq_all = max_count
		for ( firstToken, secondToken ) in cooccurrence:
			freq_a = occurrence[ firstToken ]
			freq_b = occurrence[ secondToken ]
			freq_ab = cooccurrence[ (firstToken, secondToken) ]
			
			# Keep only pairs whose term frequencies remain non-negligible when
			# rescaled to a MAX_FREQ-token corpus.
			scale = ComputeSimilarity.MAX_FREQ / freq_all
			rescaled_freq_a = freq_a * scale
			rescaled_freq_b = freq_b * scale
			if rescaled_freq_a > 1.0 and rescaled_freq_b > 1.0:
				g2_stats[ (firstToken, secondToken) ] = self.getG2( freq_all, freq_ab, freq_a, freq_b )
		return g2_stats
	
	def combineSimilarityMatrices( self ):
		self.logger.info( 'Combining similarity matrices...' )
		self.similarity.combined_g2 = {}
		
		keys_queued = []
		for key in self.similarity.document_g2:
			( firstToken, secondToken ) = key
			otherKey = ( secondToken, firstToken )
			keys_queued.append( key )
			keys_queued.append( otherKey )
		for key in self.similarity.window_g2:
			( firstToken, secondToken ) = key
			otherKey = ( secondToken, firstToken )
			keys_queued.append( key )
			keys_queued.append( otherKey )
		for key in self.similarity.collocation_g2:
			keys_queued.append( key )
		
		keys_processed = {}
		for key in keys_queued:
			keys_processed[ key ] = False
		
		for key in keys_queued:
			if not keys_processed[ key ]:
				keys_processed[ key ] = True
				
				( firstToken, secondToken ) = key
				if firstToken < secondToken:
					orderedKey = key
				else:
					orderedKey = ( secondToken, firstToken )
				score = 0.0
				if orderedKey in self.similarity.document_g2:
					score += self.similarity.document_g2[ orderedKey ]
				if orderedKey in self.similarity.window_g2:
					score += self.similarity.window_g2[ orderedKey ]
				if key in self.similarity.collocation_g2:
					score += self.similarity.collocation_g2[ key ]
				if score > 0.0:
					self.similarity.combined_g2[ key ] = score
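
A minimal usage sketch for the class above, assuming its module-level imports (logging, sys, math, operator.itemgetter) and the TokensAPI / SimilarityAPI data layer are available; the data path and window size are hypothetical:

import logging

calculator = ComputeSimilarity( logging.INFO )
calculator.execute( 'data/example_corpus', sliding_window_size = 20 )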
Example #5
class ComputeSeriation(object):
    """Seriation algorithm.

	Re-order words to promote the legibility of multi-word
	phrases and reveal the clustering of related terms.
	
	As output, the algorithm produces a list of seriated terms and their 'ranking'
	(i.e., the iteration in which a term was seriated).
	"""

    DEFAULT_NUM_SERIATED_TERMS = 100

    def __init__(self, logging_level):
        self.logger = logging.getLogger('ComputeSeriation')
        self.logger.setLevel(logging_level)
        handler = logging.StreamHandler(sys.stderr)
        handler.setLevel(logging_level)
        self.logger.addHandler(handler)

    def execute(self, data_path, numSeriatedTerms=None):

        assert data_path is not None
        if numSeriatedTerms is None:
            numSeriatedTerms = ComputeSeriation.DEFAULT_NUM_SERIATED_TERMS

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Computing term seriation...')
        self.logger.info('    data_path = %s', data_path)
        self.logger.info('    number_of_seriated_terms = %d', numSeriatedTerms)

        self.logger.info('Connecting to data...')
        self.saliency = SaliencyAPI(data_path)
        self.similarity = SimilarityAPI(data_path)
        self.seriation = SeriationAPI(data_path)

        self.logger.info('Reading data from disk...')
        self.saliency.read()
        self.similarity.read()

        self.logger.info('Reshaping saliency data...')
        self.reshape()

        self.logger.info('Computing seriation...')
        self.compute(numSeriatedTerms)

        self.logger.info('Writing data to disk...')
        self.seriation.write()

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )

    def reshape(self):
        self.candidateSize = 100
        self.orderedTermList = []
        self.termSaliency = {}
        self.termFreqs = {}
        self.termDistinct = {}
        self.termRank = {}
        self.termVisibility = {}
        for element in self.saliency.term_info:
            term = element['term']
            self.orderedTermList.append(term)
            self.termSaliency[term] = element['saliency']
            self.termFreqs[term] = element['frequency']
            self.termDistinct[term] = element['distinctiveness']
            self.termRank[term] = element['rank']
            self.termVisibility[term] = element['visibility']

    def compute(self, numSeriatedTerms):
        # TODO: Elicit from the user (1) the number of terms to output and
        # (2) a list of terms that must be included in the output; both could
        # be set in __init__ (i.e., read from a config file).

        # Seriate!
        start_time = time.time()
        candidateTerms = self.orderedTermList
        self.seriation.term_ordering = []
        self.seriation.term_iter_index = []
        self.buffers = [0, 0]

        preBest = []
        postBest = []

        for iteration in range(numSeriatedTerms):
            if not candidateTerms:
                break
            print "Iteration no. ", iteration

            addedTerm = 0
            if len(self.seriation.term_iter_index) > 0:
                addedTerm = self.seriation.term_iter_index[-1]
            if iteration == 1:
                (preBest,
                 postBest) = self.initBestEnergies(addedTerm, candidateTerms)
            (preBest, postBest,
             self.bestEnergies) = self.getBestEnergies(preBest, postBest,
                                                       addedTerm)
            (candidateTerms, self.seriation.term_ordering,
             self.seriation.term_iter_index, self.buffers) = self.iterate_eff(
                 candidateTerms, self.seriation.term_ordering,
                 self.seriation.term_iter_index, self.buffers,
                 self.bestEnergies, iteration)

            print "---------------"
        seriation_time = time.time() - start_time

        # Output consists of (1) a list of ordered terms, and (2) the iteration index in which a term was ordered
        #print "term_ordering: ", self.seriation.term_ordering
        #print "term_iter_index: ", self.seriation.term_iter_index   # Feel free to pick a less confusing variable name

        #print "similarity matrix generation time: ", compute_sim_time
        #print "seriation time: ", seriation_time
        self.logger.debug("seriation time: " + str(seriation_time))

#-------------------------------------------------------------------------------#
# Helper Functions

    def initBestEnergies(self, firstTerm, candidateTerms):

        preBest = []
        postBest = []
        for candidate in candidateTerms:
            pre_score = 0
            post_score = 0

            # preBest
            if (candidate, firstTerm) in self.similarity.combined_g2:
                pre_score = self.similarity.combined_g2[(candidate, firstTerm)]
            # postBest
            if (firstTerm, candidate) in self.similarity.combined_g2:
                post_score = self.similarity.combined_g2[(firstTerm,
                                                          candidate)]

            preBest.append((candidate, pre_score))
            postBest.append((candidate, post_score))

        return (preBest, postBest)
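
    # preBest/postBest hold, for each remaining candidate, the strongest
    # combined_g2 bond seen so far when the candidate is placed immediately
    # before (pre) or after (post) an already-seriated term; getBestEnergies()
    # refreshes them as terms are placed, and their sorted sums appear to serve
    # as the upper bounds that let iterate_eff() terminate early.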

    def getBestEnergies(self, preBest, postBest, addedTerm):
        if addedTerm == 0:
            return (preBest, postBest, [])

        term_order = [x[0] for x in preBest]
        # compare candidate terms' bests against newly added term
        remove_index = -1
        for existingIndex in range(len(preBest)):
            term = term_order[existingIndex]
            if term == addedTerm:
                remove_index = existingIndex

            # check pre energies
            if (term, addedTerm) in self.similarity.combined_g2:
                if self.similarity.combined_g2[(
                        term, addedTerm)] > preBest[existingIndex][1]:
                    preBest[existingIndex] = (
                        term, self.similarity.combined_g2[(term, addedTerm)])
            # check post energies
            if (addedTerm, term) in self.similarity.combined_g2:
                if self.similarity.combined_g2[(
                        addedTerm, term)] > postBest[existingIndex][1]:
                    postBest[existingIndex] = (
                        term, self.similarity.combined_g2[(addedTerm, term)])

        # remove the added term's preBest and postBest scores
        if remove_index != -1:
            del preBest[remove_index]
            del postBest[remove_index]

        #create and sort the bestEnergies list
        energyMax = [
            sum(pair)
            for pair in zip([x[1] for x in preBest], [y[1] for y in postBest])
        ]
        bestEnergies = zip([x[0] for x in preBest], energyMax)

        return (preBest, postBest,
                sorted(bestEnergies, key=itemgetter(1), reverse=True))

    def iterate_eff(self, candidateTerms, term_ordering, term_iter_index,
                    buffers, bestEnergies, iteration_no):
        maxEnergyChange = float('-inf')
        maxTerm = ""
        maxPosition = 0

        if len(bestEnergies) != 0:
            bestEnergy_terms = [x[0] for x in bestEnergies]
        else:
            bestEnergy_terms = candidateTerms

        breakout_counter = 0
        for candidate_index in range(len(bestEnergy_terms)):
            breakout_counter += 1
            candidate = bestEnergy_terms[candidate_index]
            for position in range(len(term_ordering) + 1):
                current_buffer = buffers[position]
                candidateRank = self.termRank[candidate]
                if candidateRank <= (len(term_ordering) + self.candidateSize):
                    current_energy_change = self.getEnergyChange(
                        candidate, position, term_ordering, current_buffer,
                        iteration_no)
                    if current_energy_change > maxEnergyChange:
                        maxEnergyChange = current_energy_change
                        maxTerm = candidate
                        maxPosition = position
            # check for early termination
            if candidate_index < len(bestEnergy_terms) - 1 and len(
                    bestEnergies) != 0:
                if maxEnergyChange >= (
                        2 *
                    (bestEnergies[candidate_index][1] + current_buffer)):
                    print "#-------- breaking out early ---------#"
                    print "candidates checked: ", breakout_counter
                    break

        print "change in energy: ", maxEnergyChange
        print "maxTerm: ", maxTerm
        print "maxPosition: ", maxPosition

        candidateTerms.remove(maxTerm)

        # update buffers
        buf_score = 0
        if len(term_ordering) == 0:
            buffers = buffers
        elif maxPosition >= len(term_ordering):
            if (term_ordering[-1], maxTerm) in self.similarity.combined_g2:
                buf_score = self.similarity.combined_g2[(term_ordering[-1],
                                                         maxTerm)]
            buffers.insert(len(buffers) - 1, buf_score)
        elif maxPosition == 0:
            if (maxTerm, term_ordering[0]) in self.similarity.combined_g2:
                buf_score = self.similarity.combined_g2[(maxTerm,
                                                         term_ordering[0])]
            buffers.insert(1, buf_score)
        else:
            if (term_ordering[maxPosition - 1],
                    maxTerm) in self.similarity.combined_g2:
                buf_score = self.similarity.combined_g2[(
                    term_ordering[maxPosition - 1], maxTerm)]
            buffers[maxPosition] = buf_score

            buf_score = 0
            if (maxTerm,
                    term_ordering[maxPosition]) in self.similarity.combined_g2:
                buf_score = self.similarity.combined_g2[(
                    maxTerm, term_ordering[maxPosition])]
            buffers.insert(maxPosition + 1, buf_score)

        # update term ordering and ranking
        if maxPosition >= len(term_ordering):
            term_ordering.append(maxTerm)
        else:
            term_ordering.insert(maxPosition, maxTerm)
        term_iter_index.append(maxTerm)

        return (candidateTerms, term_ordering, term_iter_index, buffers)
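
    # Note on buffers: buffers[p] caches the combined_g2 bond currently spanning
    # insertion position p (with zero sentinels at both ends), i.e. the bond that
    # would be broken by inserting a new term between term_ordering[p-1] and
    # term_ordering[p]; getEnergyChange() subtracts it when scoring a placement.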

    def getEnergyChange(self, candidateTerm, position, term_list,
                        currentBuffer, iteration_no):
        prevBond = 0.0
        postBond = 0.0

        # first iteration only
        if iteration_no == 0:
            current_freq = 0.0
            current_saliency = 0.0

            if candidateTerm in self.termFreqs:
                current_freq = self.termFreqs[candidateTerm]
            if candidateTerm in self.termSaliency:
                current_saliency = self.termSaliency[candidateTerm]
            return 0.001 * current_freq * current_saliency

        # get previous term
        if position > 0:
            prev_term = term_list[position - 1]
            if (prev_term, candidateTerm) in self.similarity.combined_g2:
                prevBond = self.similarity.combined_g2[(prev_term,
                                                        candidateTerm)]

        # get next term
        if position < len(term_list):
            next_term = term_list[position]
            if (next_term, candidateTerm) in self.similarity.combined_g2:
                postBond = self.similarity.combined_g2[(candidateTerm,
                                                        next_term)]

        return 2 * (prevBond + postBond - currentBuffer)
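
A minimal usage sketch for the class above, assuming the module's imports (logging, sys, time, operator.itemgetter) and the SaliencyAPI / SimilarityAPI / SeriationAPI data layer are available; the data path and term count are hypothetical. ComputeSimilarity would normally have been run against the same data_path first, so that similarity.combined_g2 is available to compute():

import logging

seriator = ComputeSeriation(logging.INFO)
seriator.execute('data/example_corpus', numSeriatedTerms=50)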
Example #6
class ComputeSeriation( object ):
	"""Seriation algorithm.

	Re-order words to promote the legibility of multi-word
	phrases and reveal the clustering of related terms.
	
	As output, the algorithm produces a list of seriated terms and their 'ranking'
	(i.e., the iteration in which a term was seriated).
	"""
	
	DEFAULT_NUM_SERIATED_TERMS = 100
	
	def __init__( self, logging_level ):
		self.logger = logging.getLogger( 'ComputeSeriation' )
		self.logger.setLevel( logging_level )
		handler = logging.StreamHandler( sys.stderr )
		handler.setLevel( logging_level )
		self.logger.addHandler( handler )
	
	def execute( self, data_path, numSeriatedTerms = None ):
		
		assert data_path is not None
		if numSeriatedTerms is None:
			numSeriatedTerms = ComputeSeriation.DEFAULT_NUM_SERIATED_TERMS
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Computing term seriation...'                                                      )
		self.logger.info( '    data_path = %s', data_path                                                    )
		self.logger.info( '    number_of_seriated_terms = %d', numSeriatedTerms                              )
		
		self.logger.info( 'Connecting to data...' )
		self.saliency = SaliencyAPI( data_path )
		self.similarity = SimilarityAPI( data_path )
		self.seriation = SeriationAPI( data_path )
		
		self.logger.info( 'Reading data from disk...' )
		self.saliency.read()
		self.similarity.read()
		
		self.logger.info( 'Reshaping saliency data...' )
		self.reshape()
		
		self.logger.info( 'Computing seriation...' )
		self.compute( numSeriatedTerms )
		
		self.logger.info( 'Writing data to disk...' )
		self.seriation.write()
		
		self.logger.info( '--------------------------------------------------------------------------------' )
	
	def reshape( self ):
		self.candidateSize = 100
		self.orderedTermList = []
		self.termSaliency = {}
		self.termFreqs = {}
		self.termDistinct = {}
		self.termRank = {}
		self.termVisibility = {}
		for element in self.saliency.term_info:
			term = element['term']
			self.orderedTermList.append( term )
			self.termSaliency[term] = element['saliency']
			self.termFreqs[term] = element['frequency']
			self.termDistinct[term] = element['distinctiveness']
			self.termRank[term] = element['rank']
			self.termVisibility[term] = element['visibility']
	
	def compute( self, numSeriatedTerms ):
		# TODO: Elicit from the user (1) the number of terms to output and
		# (2) a list of terms that must be included in the output; both could
		# be set in __init__ (i.e., read from a config file).
		
		# Seriate!
		start_time = time.time()
		candidateTerms = self.orderedTermList
		self.seriation.term_ordering = []
		self.seriation.term_iter_index = []
		self.buffers = [0,0]
		
		preBest = []
		postBest = []
		
		for iteration in range(numSeriatedTerms):
			print "Iteration no. ", iteration
			
			addedTerm = 0
			if len(self.seriation.term_iter_index) > 0:
				addedTerm = self.seriation.term_iter_index[-1]
			if iteration == 1:
				(preBest, postBest) = self.initBestEnergies(addedTerm, candidateTerms)
			(preBest, postBest, self.bestEnergies) = self.getBestEnergies(preBest, postBest, addedTerm)
			(candidateTerms, self.seriation.term_ordering, self.seriation.term_iter_index, self.buffers) = self.iterate_eff(candidateTerms, self.seriation.term_ordering, self.seriation.term_iter_index, self.buffers, self.bestEnergies, iteration)
			
			print "---------------"
		seriation_time = time.time() - start_time
		
		# Output consists of (1) a list of ordered terms, and (2) the iteration index in which a term was ordered
		#print "term_ordering: ", self.seriation.term_ordering
		#print "term_iter_index: ", self.seriation.term_iter_index   # Feel free to pick a less confusing variable name
		
		#print "similarity matrix generation time: ", compute_sim_time
		#print "seriation time: ", seriation_time
		self.logger.debug("seriation time: " +  str(seriation_time))

#-------------------------------------------------------------------------------#
# Helper Functions
	
	def initBestEnergies(self, firstTerm, candidateTerms):
		
		preBest = []
		postBest = []
		for candidate in candidateTerms:
			pre_score = 0
			post_score = 0
			
			# preBest
			if (candidate, firstTerm) in self.similarity.combined_g2:
				pre_score = self.similarity.combined_g2[(candidate, firstTerm)]
			# postBest
			if (firstTerm, candidate) in self.similarity.combined_g2:
				post_score = self.similarity.combined_g2[(firstTerm, candidate)]
			
			preBest.append((candidate, pre_score))
			postBest.append((candidate, post_score))
		
		return (preBest, postBest)
	
	def getBestEnergies(self, preBest, postBest, addedTerm):
		if addedTerm == 0:
			return (preBest, postBest, [])
		
		term_order = [x[0] for x in preBest]
		# compare candidate terms' bests against newly added term
		remove_index = -1
		for existingIndex in range(len(preBest)):
			term = term_order[existingIndex]
			if term == addedTerm:
				remove_index = existingIndex
			
			# check pre energies
			if (term, addedTerm) in self.similarity.combined_g2:
				if self.similarity.combined_g2[(term, addedTerm)] > preBest[existingIndex][1]:
					preBest[existingIndex] = (term, self.similarity.combined_g2[(term, addedTerm)])
			# check post energies
			if (addedTerm, term) in self.similarity.combined_g2:
				if self.similarity.combined_g2[(addedTerm, term)] > postBest[existingIndex][1]:
					postBest[existingIndex] = (term, self.similarity.combined_g2[(addedTerm, term)])
		
		# remove the added term's preBest and postBest scores
		if remove_index != -1:
			del preBest[remove_index]
			del postBest[remove_index]
		
		#create and sort the bestEnergies list
		energyMax = [sum(pair) for pair in zip([x[1] for x in preBest], [y[1] for y in postBest])]
		bestEnergies = zip([x[0] for x in preBest], energyMax)
		
		return (preBest, postBest, sorted(bestEnergies, key=itemgetter(1), reverse=True))
	
	def iterate_eff( self, candidateTerms, term_ordering, term_iter_index, buffers, bestEnergies, iteration_no ):
		maxEnergyChange = float('-inf')
		maxTerm = ""
		maxPosition = 0
		
		if len(bestEnergies) != 0:
			bestEnergy_terms = [x[0] for x in bestEnergies]
		else:
			bestEnergy_terms = candidateTerms
		
		breakout_counter = 0
		for candidate_index in range(len(bestEnergy_terms)):
			breakout_counter += 1
			candidate = bestEnergy_terms[candidate_index]
			for position in range(len(term_ordering)+1):
				current_buffer = buffers[position]
				candidateRank = self.termRank[candidate]
				if candidateRank <= (len(term_ordering) + self.candidateSize):
					current_energy_change = self.getEnergyChange(candidate, position, term_ordering, current_buffer, iteration_no)
					if current_energy_change > maxEnergyChange:
						maxEnergyChange = current_energy_change
						maxTerm = candidate
						maxPosition = position
			# check for early termination
			if candidate_index < len(bestEnergy_terms)-1 and len(bestEnergies) != 0:
				if maxEnergyChange >= (2*(bestEnergies[candidate_index][1] + current_buffer)):
					print "#-------- breaking out early ---------#"
					print "candidates checked: ", breakout_counter
					break
		
		print "change in energy: ", maxEnergyChange
		print "maxTerm: ", maxTerm
		print "maxPosition: ", maxPosition
		
		candidateTerms.remove(maxTerm)
		
		# update buffers
		buf_score = 0
		if len(term_ordering) == 0:
			buffers = buffers
		elif maxPosition >= len(term_ordering):
			if (term_ordering[-1], maxTerm) in self.similarity.combined_g2:
				buf_score = self.similarity.combined_g2[(term_ordering[-1], maxTerm)]
			buffers.insert(len(buffers)-1, buf_score)
		elif maxPosition == 0:
			if (maxTerm, term_ordering[0]) in self.similarity.combined_g2:
				buf_score = self.similarity.combined_g2[(maxTerm, term_ordering[0])]
			buffers.insert(1, buf_score)
		else:
			if (term_ordering[maxPosition-1], maxTerm) in self.similarity.combined_g2:
				buf_score = self.similarity.combined_g2[(term_ordering[maxPosition-1], maxTerm)]
			buffers[maxPosition] = buf_score
			
			buf_score = 0
			if (maxTerm, term_ordering[maxPosition]) in self.similarity.combined_g2:
				buf_score = self.similarity.combined_g2[(maxTerm, term_ordering[maxPosition])]
			buffers.insert(maxPosition+1, buf_score)
		
		# update term ordering and ranking
		if maxPosition >= len(term_ordering):
			term_ordering.append(maxTerm)
		else:
			term_ordering.insert(maxPosition, maxTerm)
		term_iter_index.append(maxTerm)
			
		
		return (candidateTerms, term_ordering, term_iter_index, buffers)
	
	def getEnergyChange(self, candidateTerm, position, term_list, currentBuffer, iteration_no):
		prevBond = 0.0
		postBond = 0.0
		
		# first iteration only
		if iteration_no == 0:
			current_freq = 0.0
			current_saliency = 0.0
			
			if candidateTerm in self.termFreqs:
				current_freq = self.termFreqs[candidateTerm]
			if candidateTerm in self.termSaliency:
				current_saliency = self.termSaliency[candidateTerm]
			return 0.001 * current_freq * current_saliency
		
		# get previous term
		if position > 0:
			prev_term = term_list[position-1]
			if (prev_term, candidateTerm) in self.similarity.combined_g2:
				prevBond = self.similarity.combined_g2[(prev_term, candidateTerm)]
		
		# get next term
		if position < len(term_list):
			next_term = term_list[position]
			if (next_term, candidateTerm) in self.similarity.combined_g2:
				postBond = self.similarity.combined_g2[(candidateTerm, next_term)]
		
		return 2*(prevBond + postBond - currentBuffer)