Beispiel #1
0
    def execute(self, data_path, numSeriatedTerms=None):
        """Run the term-seriation pipeline from input to output.

        Instantiates ``SaliencyAPI``, ``SimilarityAPI`` and ``SeriationAPI``
        on ``data_path``, reads the saliency and similarity objects, calls
        ``self.reshape()`` followed by ``self.compute(...)``, and finally
        calls ``write()`` on the seriation object. Every stage is announced
        through ``self.logger``.

        Args:
            data_path: Value handed to the three API constructors. Must not
                be None.
            numSeriatedTerms: Forwarded to ``self.compute``. When None,
                ``ComputeSeriation.DEFAULT_NUM_SERIATED_TERMS`` is used.

        Raises:
            AssertionError: If ``data_path`` is None (the check is an
                ``assert``, so it is skipped when Python runs with ``-O``).
        """
        assert data_path is not None
        term_count = (
            ComputeSeriation.DEFAULT_NUM_SERIATED_TERMS
            if numSeriatedTerms is None
            else numSeriatedTerms
        )

        log = self.logger
        rule = '--------------------------------------------------------------------------------'

        # Banner describing the run.
        log.info(rule)
        log.info('Computing term seriation...')
        log.info('    data_path = %s', data_path)
        log.info('    number_of_seriated_terms = %d', term_count)

        log.info('Connecting to data...')
        self.saliency = SaliencyAPI(data_path)
        self.similarity = SimilarityAPI(data_path)
        self.seriation = SeriationAPI(data_path)

        # Only the two inputs are read; the seriation object is written below.
        log.info('Reading data from disk...')
        for source in (self.saliency, self.similarity):
            source.read()

        log.info('Reshaping saliency data...')
        self.reshape()

        log.info('Computing seriation...')
        self.compute(term_count)

        log.info('Writing data to disk...')
        self.seriation.write()

        log.info(rule)
	def execute(self, data_path, sliding_window_size=None):
		"""Run the term-similarity pipeline from input to output.

		Instantiates ``TokensAPI`` and ``SimilarityAPI`` on ``data_path``,
		reads the tokens object, runs the co-occurrence and count steps,
		stores three ``getG2Stats`` results on ``self.similarity``, calls
		``self.combineSimilarityMatrices()`` and finally calls ``write()`` on
		the similarity object. Stages are announced through ``self.logger``.

		Args:
			data_path: Value handed to the two API constructors. Must not
				be None.
			sliding_window_size: Forwarded to
				``self.computeSlidingWindowCooccurrence``. When None,
				``ComputeSimilarity.DEFAULT_SLIDING_WINDOW_SIZE`` is used.

		Raises:
			AssertionError: If ``data_path`` is None (the check is an
				``assert``, so it is skipped when Python runs with ``-O``).
		"""
		assert data_path is not None
		window = (
			ComputeSimilarity.DEFAULT_SLIDING_WINDOW_SIZE
			if sliding_window_size is None
			else sliding_window_size
		)

		log = self.logger
		rule = '--------------------------------------------------------------------------------'

		# Banner describing the run.
		log.info(rule)
		log.info('Computing term similarity...')
		log.info('    data_path = %s', data_path)
		log.info('    sliding_window_size = %d', window)

		log.info('Connecting to data...')
		self.tokens = TokensAPI(data_path)
		self.similarity = SimilarityAPI(data_path)

		log.info('Reading data from disk...')
		self.tokens.read()

		log.info('Computing document co-occurrence...')
		self.computeDocumentCooccurrence()

		log.info('Computing sliding-window co-occurrence...')
		self.computeSlidingWindowCooccurrence(window)

		log.info('Counting total number of tokens, unigrams, and bigrams in the corpus...')
		self.computeTokenCounts()

		# Each G2 result is computed first and then attached to the
		# similarity object, one statistic at a time.
		log.info('Computing document co-occurrence likelihood...')
		document_g2 = self.getG2Stats(
			self.document_count,
			self.similarity.document_occurrence,
			self.similarity.document_cooccurrence,
		)
		self.similarity.document_g2 = document_g2

		log.info('Computing sliding-window co-occurrence likelihood...')
		window_g2 = self.getG2Stats(
			self.window_count,
			self.similarity.window_occurrence,
			self.similarity.window_cooccurrence,
		)
		self.similarity.window_g2 = window_g2

		log.info('Computing collocation likelihood...')
		collocation_g2 = self.getG2Stats(
			self.token_count,
			self.similarity.unigram_counts,
			self.similarity.bigram_counts,
		)
		self.similarity.collocation_g2 = collocation_g2

		self.combineSimilarityMatrices()

		log.info('Writing data to disk...')
		self.similarity.write()

		log.info(rule)