def execute( self, data_path, sliding_window_size = None ):
    assert data_path is not None
    if sliding_window_size is None:
        sliding_window_size = ComputeSimilarity.DEFAULT_SLIDING_WINDOW_SIZE

    self.logger.info( '--------------------------------------------------------------------------------' )
    self.logger.info( 'Computing term similarity...' )
    self.logger.info( '    data_path = %s', data_path )
    self.logger.info( '    sliding_window_size = %d', sliding_window_size )

    self.logger.info( 'Connecting to data...' )
    self.tokens = TokensAPI( data_path )
    self.similarity = SimilarityAPI( data_path )

    self.logger.info( 'Reading data from disk...' )
    self.tokens.read()

    self.logger.info( 'Computing document co-occurrence...' )
    self.computeDocumentCooccurrence()

    self.logger.info( 'Computing sliding-window co-occurrence...' )
    self.computeSlidingWindowCooccurrence( sliding_window_size )

    self.logger.info( 'Counting total number of tokens, unigrams, and bigrams in the corpus...' )
    self.computeTokenCounts()

    self.logger.info( 'Computing document co-occurrence likelihood...' )
    self.similarity.document_g2 = self.getG2Stats( self.document_count, self.similarity.document_occurrence, self.similarity.document_cooccurrence )

    self.logger.info( 'Computing sliding-window co-occurrence likelihood...' )
    self.similarity.window_g2 = self.getG2Stats( self.window_count, self.similarity.window_occurrence, self.similarity.window_cooccurrence )

    self.logger.info( 'Computing collocation likelihood...' )
    self.similarity.collocation_g2 = self.getG2Stats( self.token_count, self.similarity.unigram_counts, self.similarity.bigram_counts )

    self.combineSimilarityMatrices()

    self.logger.info( 'Writing data to disk...' )
    self.similarity.write()
    self.logger.info( '--------------------------------------------------------------------------------' )
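
# A minimal sketch of the G2 (Dunning log-likelihood ratio) statistic that
# getG2Stats() computes above for document, sliding-window, and collocation
# counts. The data layout assumed here -- 'occurrence' mapping term -> count
# and 'cooccurrence' mapping (termA, termB) -> joint count -- is an
# illustrative assumption, not necessarily the real API's structures.
import math

def g2_stats_sketch( total_count, occurrence, cooccurrence ):
    def g2( k11, k12, k21, k22 ):
        # Compare the observed 2x2 contingency table against the counts
        # expected under independence: G2 = 2 * sum( O * ln( O / E ) ).
        n = k11 + k12 + k21 + k22
        row1, row2 = k11 + k12, k21 + k22
        col1, col2 = k11 + k21, k12 + k22
        score = 0.0
        for o, e in ( ( k11, row1 * col1 / n ),
                      ( k12, row1 * col2 / n ),
                      ( k21, row2 * col1 / n ),
                      ( k22, row2 * col2 / n ) ):
            if o > 0 and e > 0:
                score += o * math.log( o / e )
        return 2.0 * score

    stats = {}
    for ( a, b ), both in cooccurrence.items():
        only_a = occurrence[ a ] - both            # a occurs without b
        only_b = occurrence[ b ] - both            # b occurs without a
        neither = total_count - both - only_a - only_b
        stats[ ( a, b ) ] = g2( both, only_a, only_b, neither )
    return stats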
def execute(self, corpus_format, corpus_path, data_path, tokenization=None):
    assert corpus_format is not None
    assert corpus_path is not None
    assert data_path is not None
    # Map a user-facing tokenization name onto its regex pattern.
    if tokenization is None:
        tokenization = Tokenize.DEFAULT_TOKENIZATION
    elif tokenization == 'unicode':
        tokenization = Tokenize.UNICODE_TOKENIZATION
    elif tokenization == 'whitespace':
        tokenization = Tokenize.WHITESPACE_TOKENIZATION
    elif tokenization == 'alpha':
        tokenization = Tokenize.ALPHA_TOKENIZATION
    elif tokenization == 'alphanumeric':
        tokenization = Tokenize.ALPHANUMERIC_TOKENIZATION

    self.logger.info( '--------------------------------------------------------------------------------' )
    self.logger.info('Tokenizing source corpus...')
    self.logger.info('    corpus_path = %s (%s)', corpus_path, corpus_format)
    self.logger.info('    data_path = %s', data_path)
    self.logger.info('    tokenization = %s', tokenization)

    self.logger.info('Connecting to data...')
    self.documents = DocumentsAPI(corpus_format, corpus_path)
    self.tokens = TokensAPI(data_path)

    self.logger.info('Reading from disk...')
    self.documents.read()

    self.logger.info('Tokenizing...')
    self.TokenizeDocuments(re.compile(tokenization, re.UNICODE))

    self.logger.info('Writing to disk...')
    self.tokens.write()
    self.logger.info( '--------------------------------------------------------------------------------' )
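
# A minimal sketch of what the tokenization pattern constants and the
# TokenizeDocuments() step above might look like. The exact regexes and the
# documents layout (doc_id -> raw text) are assumptions for illustration,
# not the class's real definitions.
import re

UNICODE_TOKENIZATION      = r'[\w]+'         # letters, digits, underscore
WHITESPACE_TOKENIZATION   = r'[^ ]+'         # split on spaces only
ALPHA_TOKENIZATION        = r'[A-Za-z]+'     # letters only
ALPHANUMERIC_TOKENIZATION = r'[A-Za-z0-9]+'  # letters and digits

def tokenize_documents_sketch(documents, pattern):
    # documents: dict doc_id -> raw text; returns dict doc_id -> token list.
    return {doc_id: pattern.findall(text.lower())
            for doc_id, text in documents.items()}

tokens = tokenize_documents_sketch({'doc1': 'Hello, world 42!'},
                                   re.compile(ALPHA_TOKENIZATION, re.UNICODE))
# -> {'doc1': ['hello', 'world']}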
def execute(self, corpus_format, corpus_path, data_path, tokenization=None):
    assert corpus_format is not None
    assert corpus_path is not None
    assert data_path is not None
    if tokenization is None:
        tokenization = Tokenize.DEFAULT_TOKENIZATION

    self.logger.info( '--------------------------------------------------------------------------------' )
    self.logger.info('Tokenizing source corpus...')
    self.logger.info('    corpus_path = %s (%s)', corpus_path, corpus_format)
    self.logger.info('    data_path = %s', data_path)
    self.logger.info('    tokenization = %s', tokenization)

    self.logger.info('Connecting to data...')
    self.documents = DocumentsAPI(corpus_format, corpus_path)
    self.tokens = TokensAPI(data_path)

    self.logger.info('Reading from disk...')
    self.documents.read()

    self.logger.info('Tokenizing...')
    self.TokenizeDocuments(re.compile(tokenization, re.UNICODE))

    self.logger.info('Writing to disk...')
    self.tokens.write()
    self.logger.info( '--------------------------------------------------------------------------------' )
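
# A hedged sketch of a command-line driver for execute() above. The Tokenize
# constructor signature, the flag names, and the example paths are all
# illustrative assumptions, not the project's real entry point.
import argparse

def main():
    parser = argparse.ArgumentParser(description='Tokenize a source corpus.')
    parser.add_argument('corpus_format', help='format tag understood by DocumentsAPI')
    parser.add_argument('corpus_path', help='path to the source corpus')
    parser.add_argument('data_path', help='output directory for token data')
    parser.add_argument('--tokenization', default=None,
                        help='regex pattern; defaults to Tokenize.DEFAULT_TOKENIZATION')
    args = parser.parse_args()
    Tokenize().execute(args.corpus_format, args.corpus_path, args.data_path,
                       args.tokenization)

if __name__ == '__main__':
    main()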