def execute( self, data_path, sliding_window_size = None ):
		
		assert data_path is not None
		if sliding_window_size is None:
			sliding_window_size = ComputeSimilarity.DEFAULT_SLIDING_WINDOW_SIZE
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Computing term similarity...'                                                     )
		self.logger.info( '    data_path = %s', data_path                                                    )
		self.logger.info( '    sliding_window_size = %d', sliding_window_size                                )
		
		self.logger.info( 'Connecting to data...' )
		self.tokens = TokensAPI( data_path )
		self.similarity = SimilarityAPI( data_path )
		
		self.logger.info( 'Reading data from disk...' )
		self.tokens.read()
		
		self.logger.info( 'Computing document co-occurrence...' )
		self.computeDocumentCooccurrence()
		
		self.logger.info( 'Computing sliding-window co-occurrence...' )
		self.computeSlidingWindowCooccurrence( sliding_window_size )
		
		self.logger.info( 'Counting total number of tokens, unigrams, and bigrams in the corpus...' )
		self.computeTokenCounts()
		
		self.logger.info( 'Computing document co-occurrence likelihood...' )
		self.similarity.document_g2 = self.getG2Stats( self.document_count, self.similarity.document_occurrence, self.similarity.document_cooccurrence )
		
		self.logger.info( 'Computing sliding-window co-occurrence likelihood...' )
		self.similarity.window_g2 = self.getG2Stats( self.window_count, self.similarity.window_occurrence, self.similarity.window_cooccurrence )
		
		self.logger.info( 'Computing collocation likelihood...' )
		self.similarity.collocation_g2 = self.getG2Stats( self.token_count, self.similarity.unigram_counts, self.similarity.bigram_counts )
		
		self.combineSimilarityMatrices()
		
		self.logger.info( 'Writing data to disk...' )
		self.similarity.write()
		
		self.logger.info( '--------------------------------------------------------------------------------' )
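The three getG2Stats calls above share one shape: a total unit count, a per-term occurrence map, and a pair co-occurrence map, yielding a G2 (Dunning log-likelihood ratio) score per term pair. The implementation is not shown in this snippet; the following is a minimal sketch under those assumed input shapes, with hypothetical names, not the project's actual code.

import math

def g2_stats(total_count, occurrence, cooccurrence):
    """Hypothetical G2 (log-likelihood ratio) scorer.

    Assumed shapes (not confirmed by the snippet above):
      total_count  -- number of documents, windows, or tokens
      occurrence   -- dict: term -> count of units containing it
      cooccurrence -- dict: (term1, term2) -> count of units containing both
    Returns a dict: (term1, term2) -> G2 score.
    """
    def g2_from_table(a, b, c, d):
        # a..d are the cells of the 2x2 contingency table for one term pair.
        n = a + b + c + d
        g2 = 0.0
        for observed, row, col in ((a, a + b, a + c), (b, a + b, b + d),
                                   (c, c + d, a + c), (d, c + d, b + d)):
            expected = row * col / n
            if observed > 0 and expected > 0:
                g2 += observed * math.log(observed / expected)
        return 2.0 * g2

    scores = {}
    for (x, y), both in cooccurrence.items():
        only_x = occurrence[x] - both
        only_y = occurrence[y] - both
        neither = total_count - both - only_x - only_y
        scores[(x, y)] = g2_from_table(both, only_x, only_y, neither)
    return scores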
Example 2
    def execute(self, corpus_format, corpus_path, data_path, tokenization):
        assert corpus_format is not None
        assert corpus_path is not None
        assert data_path is not None
        if tokenization is None:
            tokenization = Tokenize.DEFAULT_TOKENIZATION
        elif tokenization == 'unicode':
            tokenization = Tokenize.UNICODE_TOKENIZATION
        elif tokenization == 'whitespace':
            tokenization = Tokenize.WHITESPACE_TOKENIZATION
        elif tokenization == 'alpha':
            tokenization = Tokenize.ALPHA_TOKENIZATION
        elif tokenization == 'alphanumeric':
            tokenization = Tokenize.ALPHANUMERIC_TOKENIZATION

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Tokenizing source corpus...')
        self.logger.info('    corpus_path = %s (%s)', corpus_path,
                         corpus_format)
        self.logger.info('    data_path = %s', data_path)
        self.logger.info('    tokenization = %s', tokenization)

        self.logger.info('Connecting to data...')
        self.documents = DocumentsAPI(corpus_format, corpus_path)
        self.tokens = TokensAPI(data_path)

        self.logger.info('Reading from disk...')
        self.documents.read()

        self.logger.info('Tokenizing...')
        self.TokenizeDocuments(re.compile(tokenization, re.UNICODE))

        self.logger.info('Writing to disk...')
        self.tokens.write()

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
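TokenizeDocuments receives the compiled pattern and is expected to turn each document into a token list before tokens.write() persists it. A minimal sketch of that step, assuming documents map ids to raw text, the tokenization constants are regex strings (which the re.compile call implies), and tokens are lowercased; the function name and I/O shapes here are assumptions, not the project's actual code.

import re

def tokenize_documents(documents, pattern):
    """documents: dict doc_id -> raw text; returns dict doc_id -> token list."""
    tokens = {}
    for doc_id, text in documents.items():
        # Every non-overlapping match of the tokenization pattern becomes a token.
        tokens[doc_id] = [match.lower() for match in pattern.findall(text)]
    return tokens

# Hypothetical pattern in the spirit of the Tokenize.*_TOKENIZATION constants.
pattern = re.compile(r'\w+', re.UNICODE)
print(tokenize_documents({'doc1': 'Hello, tokenized world!'}, pattern))
# {'doc1': ['hello', 'tokenized', 'world']}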
Example 3
    def execute(self,
                corpus_format,
                corpus_path,
                data_path,
                tokenization=None):

        assert corpus_format is not None
        assert corpus_path is not None
        assert data_path is not None
        if tokenization is None:
            tokenization = Tokenize.DEFAULT_TOKENIZATION

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Tokenizing source corpus...')
        self.logger.info('    corpus_path = %s (%s)', corpus_path,
                         corpus_format)
        self.logger.info('    data_path = %s', data_path)
        self.logger.info('    tokenization = %s', tokenization)

        self.logger.info('Connecting to data...')
        self.documents = DocumentsAPI(corpus_format, corpus_path)
        self.tokens = TokensAPI(data_path)

        self.logger.info('Reading from disk...')
        self.documents.read()

        self.logger.info('Tokenizing...')
        self.TokenizeDocuments(re.compile(tokenization, re.UNICODE))

        self.logger.info('Writing to disk...')
        self.tokens.write()

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
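Both tokenization variants are driven the same way. A hypothetical call, assuming this execute() lives on the Tokenize class referenced by the Tokenize.DEFAULT_TOKENIZATION constant; the format string and paths below are placeholders, not values taken from the project:

# Placeholder arguments -- the corpus format, paths, and Tokenize class usage
# are assumptions for illustration only.
tokenizer = Tokenize()
tokenizer.execute('plaintext', 'corpus/documents.txt', 'data/my-run', tokenization=None)
# With tokenization=None, execute() falls back to Tokenize.DEFAULT_TOKENIZATION.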