# The code in this module relies on the standard-library logging, math, re, and
# sys modules and on the DocumentsAPI, TokensAPI, and SimilarityAPI helpers;
# their import statements are not shown here.

# Variant of Tokenize.execute that also accepts the preset names 'unicode',
# 'whitespace', 'alpha', and 'alphanumeric' and maps them onto the tokenization
# patterns defined on the Tokenize class.
def execute(self, corpus_format, corpus_path, data_path, tokenization):
    assert corpus_format is not None
    assert corpus_path is not None
    assert data_path is not None
    if tokenization is None:
        tokenization = Tokenize.DEFAULT_TOKENIZATION
    elif tokenization == 'unicode':
        tokenization = Tokenize.UNICODE_TOKENIZATION
    elif tokenization == 'whitespace':
        tokenization = Tokenize.WHITESPACE_TOKENIZATION
    elif tokenization == 'alpha':
        tokenization = Tokenize.ALPHA_TOKENIZATION
    elif tokenization == 'alphanumeric':
        tokenization = Tokenize.ALPHANUMERIC_TOKENIZATION

    self.logger.info('--------------------------------------------------------------------------------')
    self.logger.info('Tokenizing source corpus...')
    self.logger.info(' corpus_path = %s (%s)', corpus_path, corpus_format)
    self.logger.info(' data_path = %s', data_path)
    self.logger.info(' tokenization = %s', tokenization)

    self.logger.info('Connecting to data...')
    self.documents = DocumentsAPI(corpus_format, corpus_path)
    self.tokens = TokensAPI(data_path)

    self.logger.info('Reading from disk...')
    self.documents.read()

    self.logger.info('Tokenizing...')
    self.TokenizeDocuments(re.compile(tokenization, re.UNICODE))

    self.logger.info('Writing to disk...')
    self.tokens.write()

    self.logger.info('--------------------------------------------------------------------------------')
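# Illustrative sketch (not part of the original module): how the tokenization
# presets above behave on a sample sentence. The standalone 'presets' dict and
# the sample text are assumptions made purely for demonstration. The 'unicode'
# preset is omitted because its \p{...} classes are not supported by Python's
# built-in re module.
import re

presets = {
    'whitespace':   r'[^ ]+',
    'alphanumeric': r'[0-9A-Za-z_]*[A-Za-z_]+[0-9A-Za-z_]*',
    'alpha':        r'[A-Za-z_]+',
}
sample = 'Topic_7 covers 3 sub-topics.'
for name in ('whitespace', 'alphanumeric', 'alpha'):
    # Lower-casing mirrors what TokenizeDocument does to each match.
    print name, [t.lower() for t in re.findall(presets[name], sample, re.UNICODE)]
# whitespace   -> ['topic_7', 'covers', '3', 'sub-topics.']
# alphanumeric -> ['topic_7', 'covers', 'sub', 'topics']
# alpha        -> ['topic_', 'covers', 'sub', 'topics']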
class ComputeSimilarity( object ):
    """
    Similarity measures.

    Compute term similarity based on co-occurrence and collocation likelihoods.
    """

    DEFAULT_SLIDING_WINDOW_SIZE = 10
    MAX_FREQ = 100.0

    def __init__( self, logging_level ):
        self.logger = logging.getLogger( 'ComputeSimilarity' )
        self.logger.setLevel( logging_level )
        handler = logging.StreamHandler( sys.stderr )
        handler.setLevel( logging_level )
        self.logger.addHandler( handler )

    def execute( self, data_path, sliding_window_size = None ):
        assert data_path is not None
        if sliding_window_size is None:
            sliding_window_size = ComputeSimilarity.DEFAULT_SLIDING_WINDOW_SIZE

        self.logger.info( '--------------------------------------------------------------------------------' )
        self.logger.info( 'Computing term similarity...' )
        self.logger.info( ' data_path = %s', data_path )
        self.logger.info( ' sliding_window_size = %d', sliding_window_size )

        self.logger.info( 'Connecting to data...' )
        self.tokens = TokensAPI( data_path )
        self.similarity = SimilarityAPI( data_path )

        self.logger.info( 'Reading data from disk...' )
        self.tokens.read()

        self.logger.info( 'Computing document co-occurrence...' )
        self.computeDocumentCooccurrence()
        self.logger.info( 'Computing sliding-window co-occurrence...' )
        self.computeSlidingWindowCooccurrence( sliding_window_size )
        self.logger.info( 'Counting total number of tokens, unigrams, and bigrams in the corpus...' )
        self.computeTokenCounts()
        self.logger.info( 'Computing document co-occurrence likelihood...' )
        self.similarity.document_g2 = self.getG2Stats( self.document_count, self.similarity.document_occurrence, self.similarity.document_cooccurrence )
        self.logger.info( 'Computing sliding-window co-occurrence likelihood...' )
        self.similarity.window_g2 = self.getG2Stats( self.window_count, self.similarity.window_occurrence, self.similarity.window_cooccurrence )
        self.logger.info( 'Computing collocation likelihood...' )
        self.similarity.collocation_g2 = self.getG2Stats( self.token_count, self.similarity.unigram_counts, self.similarity.bigram_counts )
        self.combineSimilarityMatrices()

        self.logger.info( 'Writing data to disk...' )
        self.similarity.write()

        self.logger.info( '--------------------------------------------------------------------------------' )

    def incrementCount( self, occurrence, key ):
        if key not in occurrence:
            occurrence[ key ] = 1
        else:
            occurrence[ key ] += 1

    def computeDocumentCooccurrence( self ):
        # For every document, count which tokens occur and which pairs of
        # distinct tokens co-occur (pairs are keyed with tokens in sorted order).
        document_count = 0
        occurrence = {}
        cooccurrence = {}
        for docID, docTokens in self.tokens.data.iteritems():
            self.logger.debug( ' %s (%d tokens)', docID, len(docTokens) )
            tokenSet = frozenset( docTokens )
            document_count += 1
            for token in tokenSet:
                self.incrementCount( occurrence, token )
            for aToken in tokenSet:
                for bToken in tokenSet:
                    if aToken < bToken:
                        self.incrementCount( cooccurrence, (aToken, bToken) )
        self.document_count = document_count
        self.similarity.document_occurrence = occurrence
        self.similarity.document_cooccurrence = cooccurrence

    def computeSlidingWindowCooccurrence( self, sliding_window_size ):
        # Same counting as above, but over overlapping sliding windows of tokens
        # rather than over whole documents.
        window_count = 0
        occurrence = {}
        cooccurrence = {}
        for docID, docTokens in self.tokens.data.iteritems():
            allWindowTokens = self.getSlidingWindowTokens( docTokens, sliding_window_size )
            self.logger.debug( ' %s (%d tokens, %d windows)', docID, len(docTokens), len(allWindowTokens) )
            for windowTokens in allWindowTokens:
                tokenSet = frozenset( windowTokens )
                window_count += 1
                for token in tokenSet:
                    self.incrementCount( occurrence, token )
                for aToken in tokenSet:
                    for bToken in tokenSet:
                        if aToken < bToken:
                            self.incrementCount( cooccurrence, (aToken, bToken) )
        self.window_count = window_count
        self.similarity.window_occurrence = occurrence
        self.similarity.window_cooccurrence = cooccurrence

    def getSlidingWindowTokens( self, tokens, sliding_window_size ):
        # One window per index from -size to len(tokens)+size, clipped to the
        # token list; windows near the ends of a document are shorter.
        allWindows = []
        aIndex = 0 - sliding_window_size
        bIndex = len(tokens) + sliding_window_size
        for index in range( aIndex, bIndex ):
            a = max( 0, index - sliding_window_size )
            b = min( len(tokens), index + sliding_window_size )
            allWindows.append( tokens[a:b] )
        return allWindows

    def computeTokenCounts( self ):
        token_count = sum( len(docTokens) for docTokens in self.tokens.data.itervalues() )

        unigram_counts = {}
        for docTokens in self.tokens.data.itervalues():
            for token in docTokens:
                self.incrementCount( unigram_counts, token )

        bigram_counts = {}
        for docTokens in self.tokens.data.itervalues():
            prevToken = None
            for currToken in docTokens:
                if prevToken is not None:
                    self.incrementCount( bigram_counts, (prevToken, currToken) )
                prevToken = currToken

        self.token_count = token_count
        self.similarity.unigram_counts = unigram_counts
        self.similarity.bigram_counts = bigram_counts

    def getBinomial( self, B_given_A, any_given_A, B_given_notA, any_given_notA ):
        # G2 log-likelihood of the observed counts a (B given A) and b (B given
        # not-A) against the expected counts E1, E2 under a single shared rate.
        assert B_given_A >= 0
        assert B_given_notA >= 0
        assert any_given_A >= B_given_A
        assert any_given_notA >= B_given_notA

        a = float( B_given_A )
        b = float( B_given_notA )
        c = float( any_given_A )
        d = float( any_given_notA )
        E1 = c * ( a + b ) / ( c + d )
        E2 = d * ( a + b ) / ( c + d )

        g2a = 0
        g2b = 0
        if a > 0:
            g2a = a * math.log( a / E1 )
        if b > 0:
            g2b = b * math.log( b / E2 )
        return 2 * ( g2a + g2b )

    def getG2( self, freq_all, freq_ab, freq_a, freq_b ):
        assert freq_all >= freq_a
        assert freq_all >= freq_b
        assert freq_a >= freq_ab
        assert freq_b >= freq_ab
        assert freq_all >= 0
        assert freq_ab >= 0
        assert freq_a >= 0
        assert freq_b >= 0

        B_given_A = freq_ab
        B_given_notA = freq_b - freq_ab
        any_given_A = freq_a
        any_given_notA = freq_all - freq_a

        return self.getBinomial( B_given_A, any_given_A, B_given_notA, any_given_notA )

    def getG2Stats( self, max_count, occurrence, cooccurrence ):
        # Compute G2 for every co-occurring pair, skipping pairs whose
        # frequencies, rescaled so that the total count equals MAX_FREQ,
        # do not exceed 1.0.
        g2_stats = {}
        freq_all = max_count
        for ( firstToken, secondToken ) in cooccurrence:
            freq_a = occurrence[ firstToken ]
            freq_b = occurrence[ secondToken ]
            freq_ab = cooccurrence[ (firstToken, secondToken) ]

            scale = ComputeSimilarity.MAX_FREQ / freq_all
            rescaled_freq_all = freq_all * scale
            rescaled_freq_a = freq_a * scale
            rescaled_freq_b = freq_b * scale
            rescaled_freq_ab = freq_ab * scale
            if rescaled_freq_a > 1.0 and rescaled_freq_b > 1.0:
                g2_stats[ (firstToken, secondToken) ] = self.getG2( freq_all, freq_ab, freq_a, freq_b )
        return g2_stats

    def combineSimilarityMatrices( self ):
        # Sum the document, window, and collocation G2 scores into one combined
        # score per token pair; document and window scores are symmetric, while
        # collocation scores keep their bigram (directional) ordering.
        self.logger.info( 'Combining similarity matrices...' )
        self.similarity.combined_g2 = {}

        keys_queued = []
        for key in self.similarity.document_g2:
            ( firstToken, secondToken ) = key
            otherKey = ( secondToken, firstToken )
            keys_queued.append( key )
            keys_queued.append( otherKey )
        for key in self.similarity.window_g2:
            ( firstToken, secondToken ) = key
            otherKey = ( secondToken, firstToken )
            keys_queued.append( key )
            keys_queued.append( otherKey )
        for key in self.similarity.collocation_g2:
            keys_queued.append( key )

        keys_processed = {}
        for key in keys_queued:
            keys_processed[ key ] = False

        for key in keys_queued:
            if not keys_processed[ key ]:
                keys_processed[ key ] = True
                ( firstToken, secondToken ) = key
                if firstToken < secondToken:
                    orderedKey = key
                else:
                    orderedKey = ( secondToken, firstToken )
                score = 0.0
                if orderedKey in self.similarity.document_g2:
                    score += self.similarity.document_g2[ orderedKey ]
                if orderedKey in self.similarity.window_g2:
                    score += self.similarity.window_g2[ orderedKey ]
                if key in self.similarity.collocation_g2:
                    score += self.similarity.collocation_g2[ key ]
                if score > 0.0:
                    self.similarity.combined_g2[ key ] = score
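# Illustrative sketch (not part of the original module): the G2 computation used
# above, written as a standalone function with the same parameterization as
# ComputeSimilarity.getG2 / getBinomial. The counts in the two example calls are
# invented purely for demonstration.
import math

def g2( freq_all, freq_ab, freq_a, freq_b ):
    a = float( freq_ab )                # B observed together with A
    b = float( freq_b - freq_ab )       # B observed without A
    c = float( freq_a )                 # observations containing A
    d = float( freq_all - freq_a )      # observations not containing A
    E1 = c * ( a + b ) / ( c + d )      # expected a under a single shared rate
    E2 = d * ( a + b ) / ( c + d )      # expected b under a single shared rate
    g2a = a * math.log( a / E1 ) if a > 0 else 0.0
    g2b = b * math.log( b / E2 ) if b > 0 else 0.0
    return 2 * ( g2a + g2b )

# 1,000 sliding windows; 'topic' occurs in 80, 'model' in 60, both in 40:
print g2( 1000, 40, 80, 60 )    # large score: the pair co-occurs far more often than chance
# Same marginals, but the pair co-occurs only 5 times (about what independence predicts):
print g2( 1000, 5, 80, 60 )     # score near zero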
class Tokenize( object ):
    """
    Reads in the input corpus and writes it back out as a list of tokens per document.

    Currently supports only a corpus contained in a single file, with one document per line, in the format:
        doc_id<tab>document_content
    (two fields delimited by a tab). Support for multiple files, directories, and Lucene is being considered for future releases.
    """

    WHITESPACE_TOKENIZATION = r'[^ ]+'
    ALPHANUMERIC_TOKENIZATION = r'[0-9A-Za-z_]*[A-Za-z_]+[0-9A-Za-z_]*'
    ALPHA_TOKENIZATION = r'[A-Za-z_]+'
    # Note: the \p{...} classes below are not understood by Python's built-in
    # re module; they require a regex engine that supports Unicode property
    # classes (e.g. the third-party 'regex' package).
    UNICODE_TOKENIZATION = r'[\p{L}\p{M}]+'
    DEFAULT_TOKENIZATION = ALPHA_TOKENIZATION

    def __init__( self, logging_level ):
        self.logger = logging.getLogger( 'Tokenize' )
        self.logger.setLevel( logging_level )
        handler = logging.StreamHandler( sys.stderr )
        handler.setLevel( logging_level )
        self.logger.addHandler( handler )

    def execute( self, corpus_format, corpus_path, data_path, tokenization = None ):
        assert corpus_format is not None
        assert corpus_path is not None
        assert data_path is not None
        if tokenization is None:
            tokenization = Tokenize.DEFAULT_TOKENIZATION

        self.logger.info( '--------------------------------------------------------------------------------' )
        self.logger.info( 'Tokenizing source corpus...' )
        self.logger.info( ' corpus_path = %s (%s)', corpus_path, corpus_format )
        self.logger.info( ' data_path = %s', data_path )
        self.logger.info( ' tokenization = %s', tokenization )

        self.logger.info( 'Connecting to data...' )
        self.documents = DocumentsAPI( corpus_format, corpus_path )
        self.tokens = TokensAPI( data_path )

        self.logger.info( 'Reading from disk...' )
        self.documents.read()

        self.logger.info( 'Tokenizing...' )
        self.TokenizeDocuments( re.compile( tokenization, re.UNICODE ) )

        self.logger.info( 'Writing to disk...' )
        self.tokens.write()

        self.logger.info( '--------------------------------------------------------------------------------' )

    def TokenizeDocuments( self, tokenizer ):
        for docID, docContent in self.documents.data.iteritems():
            docTokens = self.TokenizeDocument( docContent, tokenizer )
            self.tokens.data[ docID ] = docTokens

    def TokenizeDocument( self, text, tokenizer ):
        # Lower-case every match of the tokenization pattern.
        tokens = []
        for token in re.findall( tokenizer, text ):
            tokens.append( token.lower() )
        return tokens
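# Illustrative sketch (not part of the original module): what the tokenization
# step produces for one corpus line in the doc_id<tab>document_content format
# described in the Tokenize docstring, using the default (alpha) pattern. The
# sample line and the inline tab-splitting are assumptions for demonstration
# only; in the class itself, reading the corpus is delegated to DocumentsAPI
# and writing the tokens to TokensAPI.
import re

line = 'd001\tThe 2 quick brown foxes jumped over the lazy dog!'
doc_id, doc_content = line.split( '\t', 1 )
tokenizer = re.compile( r'[A-Za-z_]+', re.UNICODE )   # Tokenize.ALPHA_TOKENIZATION
tokens = [ t.lower() for t in re.findall( tokenizer, doc_content ) ]
print doc_id, tokens
# d001 ['the', 'quick', 'brown', 'foxes', 'jumped', 'over', 'the', 'lazy', 'dog']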