import itertools
import logging
import math
import sys

# TokensAPI and SimilarityAPI are assumed to be importable from this package;
# they handle reading the tokenized corpus and persisting the similarity data.

class ComputeSimilarity( object ):
	"""
	Similarity measures.
	
	Compute term similarity based on co-occurrence and
	collocation likelihoods.
	"""
	
	DEFAULT_SLIDING_WINDOW_SIZE = 10
	MAX_FREQ = 100.0
	
	def __init__( self, logging_level ):
		self.logger = logging.getLogger( 'ComputeSimilarity' )
		self.logger.setLevel( logging_level )
		handler = logging.StreamHandler( sys.stderr )
		handler.setLevel( logging_level )
		self.logger.addHandler( handler )
	
	def execute( self, data_path, sliding_window_size = None ):
		
		assert data_path is not None
		if sliding_window_size is None:
			sliding_window_size = ComputeSimilarity.DEFAULT_SLIDING_WINDOW_SIZE
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Computing term similarity...'                                                     )
		self.logger.info( '    data_path = %s', data_path                                                    )
		self.logger.info( '    sliding_window_size = %d', sliding_window_size                                )
		
		self.logger.info( 'Connecting to data...' )
		self.tokens = TokensAPI( data_path )
		self.similarity = SimilarityAPI( data_path )
		
		self.logger.info( 'Reading data from disk...' )
		self.tokens.read()
		
		self.logger.info( 'Computing document co-occurrence...' )
		self.computeDocumentCooccurrence()
		
		self.logger.info( 'Computing sliding-window co-occurrence...' )
		self.computeSlidingWindowCooccurrence( sliding_window_size )
		
		self.logger.info( 'Counting total number of tokens, unigrams, and bigrams in the corpus...' )
		self.computeTokenCounts()
		
		self.logger.info( 'Computing document co-occurrence likelihood...' )
		self.similarity.document_g2 = self.getG2Stats( self.document_count, self.similarity.document_occurrence, self.similarity.document_cooccurrence )
		
		self.logger.info( 'Computing sliding-window co-occurrence likelihood...' )
		self.similarity.window_g2 = self.getG2Stats( self.window_count, self.similarity.window_occurrence, self.similarity.window_cooccurrence )
		
		self.logger.info( 'Computing collocation likelihood...' )
		self.similarity.collocation_g2 = self.getG2Stats( self.token_count, self.similarity.unigram_counts, self.similarity.bigram_counts )
		
		self.combineSimilarityMatrices()
		
		self.logger.info( 'Writing data to disk...' )
		self.similarity.write()
		
		self.logger.info( '--------------------------------------------------------------------------------' )
	
	def incrementCount( self, occurrence, key ):
		occurrence[ key ] = occurrence.get( key, 0 ) + 1
	
	def computeDocumentCooccurrence( self ):
		document_count = 0
		occurrence = {}
		cooccurrence = {}
		for docID, docTokens in self.tokens.data.items():
			self.logger.debug( '    %s (%d tokens)', docID, len(docTokens) )
			tokenSet = frozenset(docTokens)
			document_count += 1
			for token in tokenSet:
				self.incrementCount( occurrence, token )
			# Count each unordered pair of distinct tokens once per document,
			# keyed (a, b) with a < b.
			for aToken, bToken in itertools.combinations( sorted(tokenSet), 2 ):
				self.incrementCount( cooccurrence, (aToken, bToken) )
		
		self.document_count = document_count
		self.similarity.document_occurrence = occurrence
		self.similarity.document_cooccurrence = cooccurrence
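	# A toy illustration (hypothetical data, not part of the pipeline): for
	# documents {'d1': ['x', 'y', 'x'], 'd2': ['y', 'z']}, document_count is 2,
	# occurrence is {'x': 1, 'y': 2, 'z': 1}, and cooccurrence is
	# {('x', 'y'): 1, ('y', 'z'): 1} -- duplicates within a document count once.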
	
	def computeSlidingWindowCooccurrence( self, sliding_window_size ):
		window_count = 0
		occurrence = {}
		cooccurrence = {}
		for docID, docTokens in self.tokens.data.items():
			allWindowTokens = self.getSlidingWindowTokens( docTokens, sliding_window_size )
			self.logger.debug( '    %s (%d tokens, %d windows)', docID, len(docTokens), len(allWindowTokens) )
			for windowTokens in allWindowTokens:
				tokenSet = frozenset(windowTokens)
				window_count += 1
				for token in tokenSet:
					self.incrementCount( occurrence, token )
				# Count each unordered pair of distinct tokens once per window,
				# keyed (a, b) with a < b.
				for aToken, bToken in itertools.combinations( sorted(tokenSet), 2 ):
					self.incrementCount( cooccurrence, (aToken, bToken) )
		
		self.window_count = window_count
		self.similarity.window_occurrence = occurrence
		self.similarity.window_cooccurrence = cooccurrence
	
	def getSlidingWindowTokens( self, tokens, sliding_window_size ):
		"""Return every window of up to 2 * sliding_window_size consecutive
		tokens, sliding one position at a time. Windows are truncated at both
		ends of the document, and the leading offset produces an empty window,
		which still counts toward the window total."""
		allWindows = []
		aIndex = 0 - sliding_window_size
		bIndex = len(tokens) + sliding_window_size
		for index in range( aIndex, bIndex ):
			a = max( 0           , index - sliding_window_size )
			b = min( len(tokens) , index + sliding_window_size )
			allWindows.append( tokens[a:b] )
		return allWindows
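	# For example (hypothetical input): getSlidingWindowTokens(['a','b','c'], 2)
	# returns [[], ['a'], ['a','b'], ['a','b','c'], ['a','b','c'], ['b','c'],
	# ['c']] -- seven windows, truncated at the edges, including one empty window.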
	
	def computeTokenCounts( self ):
		# Tally total tokens, per-term counts, and adjacent ordered pairs in a
		# single pass over the corpus.
		token_count = 0
		unigram_counts = {}
		bigram_counts = {}
		for docTokens in self.tokens.data.values():
			token_count += len(docTokens)
			prevToken = None
			for currToken in docTokens:
				self.incrementCount( unigram_counts, currToken )
				if prevToken is not None:
					self.incrementCount( bigram_counts, (prevToken, currToken) )
				prevToken = currToken
		
		self.token_count = token_count
		self.similarity.unigram_counts = unigram_counts
		self.similarity.bigram_counts = bigram_counts
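	# For instance (hypothetical input): a single document ['a', 'b', 'a'] gives
	# token_count 3, unigram_counts {'a': 2, 'b': 1}, and bigram_counts
	# {('a', 'b'): 1, ('b', 'a'): 1} -- bigrams are ordered adjacent pairs.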
	
	def getBinomial( self, B_given_A, any_given_A, B_given_notA, any_given_notA ):
		"""Log-likelihood ratio (G2) for a binomial split: how much more often
		B occurs in the presence of A than independence would predict."""
		assert B_given_A >= 0
		assert B_given_notA >= 0
		assert any_given_A >= B_given_A
		assert any_given_notA >= B_given_notA
		
		a = float( B_given_A )
		b = float( B_given_notA )
		c = float( any_given_A )
		d = float( any_given_notA )
		# Expected counts of B under independence, split proportionally to c and d.
		E1 = c * ( a + b ) / ( c + d )
		E2 = d * ( a + b ) / ( c + d )
		
		g2a = 0.0
		g2b = 0.0
		if a > 0:
			g2a = a * math.log( a / E1 )
		if b > 0:
			g2b = b * math.log( b / E2 )
		return 2 * ( g2a + g2b )
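	# In symbols, with a = B_given_A, b = B_given_notA, c = any_given_A, and
	# d = any_given_notA, this returns
	#     G2 = 2 * ( a * ln(a / E1) + b * ln(b / E2) ),
	# where E1 = c * (a + b) / (c + d) and E2 = d * (a + b) / (c + d).
	# Worked example (hypothetical numbers): getBinomial(20, 50, 40, 950)
	# gives E1 = 3, E2 = 57, and G2 = 2 * (20 * ln(20/3) + 40 * ln(40/57))
	# which is approximately 47.55.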
	
	def getG2( self, freq_all, freq_ab, freq_a, freq_b ):
		"""Map raw frequencies onto the binomial split: of the freq_a
		observations containing A, freq_ab also contain B; of the remaining
		freq_all - freq_a observations, freq_b - freq_ab contain B."""
		assert freq_all >= 0
		assert freq_ab >= 0
		assert freq_a >= 0
		assert freq_b >= 0
		assert freq_all >= freq_a
		assert freq_all >= freq_b
		assert freq_a >= freq_ab
		assert freq_b >= freq_ab
		
		B_given_A = freq_ab
		B_given_notA = freq_b - freq_ab
		any_given_A = freq_a
		any_given_notA = freq_all - freq_a
		
		return self.getBinomial( B_given_A, any_given_A, B_given_notA, any_given_notA )
	
	def getG2Stats( self, max_count, occurrence, cooccurrence ):
		g2_stats = {}
		freq_all = max_count
		for ( firstToken, secondToken ), freq_ab in cooccurrence.items():
			freq_a = occurrence[ firstToken ]
			freq_b = occurrence[ secondToken ]
			
			# Keep only pairs whose terms are frequent enough for a stable
			# estimate: after rescaling the corpus to MAX_FREQ observations,
			# each term must still occur more than once.
			scale = ComputeSimilarity.MAX_FREQ / freq_all
			if freq_a * scale > 1.0 and freq_b * scale > 1.0:
				g2_stats[ (firstToken, secondToken) ] = self.getG2( freq_all, freq_ab, freq_a, freq_b )
		return g2_stats
	
	def combineSimilarityMatrices( self ):
		self.logger.info( 'Combining similarity matrices...' )
		combined_g2 = {}
		
		# Document and window co-occurrence are symmetric, so score both
		# orderings of each pair; collocation is directional, so score only
		# the observed order.
		keys = set()
		for ( firstToken, secondToken ) in self.similarity.document_g2:
			keys.add( (firstToken, secondToken) )
			keys.add( (secondToken, firstToken) )
		for ( firstToken, secondToken ) in self.similarity.window_g2:
			keys.add( (firstToken, secondToken) )
			keys.add( (secondToken, firstToken) )
		keys.update( self.similarity.collocation_g2 )
		
		for key in keys:
			( firstToken, secondToken ) = key
			if firstToken < secondToken:
				orderedKey = key
			else:
				orderedKey = ( secondToken, firstToken )
			score = 0.0
			score += self.similarity.document_g2.get( orderedKey, 0.0 )
			score += self.similarity.window_g2.get( orderedKey, 0.0 )
			score += self.similarity.collocation_g2.get( key, 0.0 )
			if score > 0.0:
				combined_g2[ key ] = score
		
		self.similarity.combined_g2 = combined_g2
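
# A minimal usage sketch ('data/example' is a hypothetical path; TokensAPI
# must already find tokenized output there for execute() to succeed):
if __name__ == '__main__':
	computer = ComputeSimilarity( logging.INFO )
	computer.execute( 'data/example' )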