Example #1
	def execute( self, corpus_format, corpus_path, data_path, tokenization = None ):
		
		assert corpus_format is not None
		assert corpus_path is not None
		assert data_path is not None
		if tokenization is None:
			tokenization = Tokenize.DEFAULT_TOKENIZATION
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Tokenizing source corpus...'                                                      )
		self.logger.info( '    corpus_path = %s (%s)', corpus_path, corpus_format                            )
		self.logger.info( '    data_path = %s', data_path                                                    )
		self.logger.info( '    tokenization = %s', tokenization                                              )
		
		self.logger.info( 'Connecting to data...' )
		self.documents = DocumentsAPI( corpus_format, corpus_path )
		self.tokens = TokensAPI( data_path )
		
		self.logger.info( 'Reading from disk...' )
		self.documents.read()
		
		self.logger.info( 'Tokenizing...' )
		self.TokenizeDocuments( re.compile( tokenization, re.UNICODE ) )
		
		self.logger.info( 'Writing to disk...' )
		self.tokens.write()
		
		self.logger.info( '--------------------------------------------------------------------------------' )
Example #2
    def execute(self, corpus_format, corpus_path, data_path, tokenization):
        assert corpus_format is not None
        assert corpus_path is not None
        assert data_path is not None
        if tokenization is None:
            tokenization = Tokenize.DEFAULT_TOKENIZATION
        elif tokenization == "unicode":
            tokenization = Tokenize.UNICODE_TOKENIZATION
        elif tokenization == "whitespace":
            tokenization = Tokenize.WHITESPACE_TOKENIZATION
        elif tokenization == "alpha":
            tokenization = Tokenize.ALPHA_TOKENIZATION
        elif tokenization == "alphanumeric":
            tokenization = Tokenize.ALPHANUMERIC_TOKENIZATION

        self.logger.info("--------------------------------------------------------------------------------")
        self.logger.info("Tokenizing source corpus...")
        self.logger.info("    corpus_path = %s (%s)", corpus_path, corpus_format)
        self.logger.info("    data_path = %s", data_path)
        self.logger.info("    tokenization = %s", tokenization)

        self.logger.info("Connecting to data...")
        self.documents = DocumentsAPI(corpus_format, corpus_path)
        self.tokens = TokensAPI(data_path)

        self.logger.info("Reading from disk...")
        self.documents.read()

        self.logger.info("Tokenizing...")
        self.TokenizeDocuments(re.compile(tokenization, re.UNICODE))

        self.logger.info("Writing to disk...")
        self.tokens.write()

        self.logger.info("--------------------------------------------------------------------------------")
Example #3
	def execute( self, data_path, sliding_window_size = None ):
		
		assert data_path is not None
		if sliding_window_size is None:
			sliding_window_size = ComputeSimilarity.DEFAULT_SLIDING_WINDOW_SIZE
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Computing term similarity...'                                                     )
		self.logger.info( '    data_path = %s', data_path                                                    )
		self.logger.info( '    sliding_window_size = %d', sliding_window_size                                )
		
		self.logger.info( 'Connecting to data...' )
		self.tokens = TokensAPI( data_path )
		self.similarity = SimilarityAPI( data_path )
		
		self.logger.info( 'Reading data from disk...' )
		self.tokens.read()
		
		self.logger.info( 'Computing document co-occurrence...' )
		self.computeDocumentCooccurrence()
		
		self.logger.info( 'Computing sliding-window co-occurrence...' )
		self.computeSlidingWindowCooccurrence( sliding_window_size )
		
		self.logger.info( 'Counting total number of tokens, unigrams, and bigrams in the corpus...' )
		self.computeTokenCounts()
		
		self.logger.info( 'Computing document co-occurrence likelihood...' )
		self.similarity.document_g2 = self.getG2Stats( self.document_count, self.similarity.document_occurrence, self.similarity.document_cooccurrence )
		
		self.logger.info( 'Computing sliding-window co-occurrence likelihood...' )
		self.similarity.window_g2 = self.getG2Stats( self.window_count, self.similarity.window_occurrence, self.similarity.window_cooccurrence )
		
		self.logger.info( 'Computing collocation likelihood...' )
		self.similarity.collocation_g2 = self.getG2Stats( self.token_count, self.similarity.unigram_counts, self.similarity.bigram_counts )
		
		self.combineSimilarityMatrices()
		
		self.logger.info( 'Writing data to disk...' )
		self.similarity.write()
		
		self.logger.info( '--------------------------------------------------------------------------------' )
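The sliding-window step in the pipeline above is easiest to see on a tiny document. The snippet below is a standalone re-implementation that mirrors getSlidingWindowTokens from Example #6 (the function name and the sample tokens are made up); with a window size of 2 it shows how each position yields a window clipped to the document boundaries, including the empty window at the left edge that the original also produces and counts.

def sliding_windows(tokens, window_size):
    # Mirrors ComputeSimilarity.getSlidingWindowTokens in Example #6.
    windows = []
    for index in range(-window_size, len(tokens) + window_size):
        a = max(0, index - window_size)
        b = min(len(tokens), index + window_size)
        windows.append(tokens[a:b])
    return windows

print(sliding_windows(['a', 'b', 'c', 'd'], 2))
# [[], ['a'], ['a', 'b'], ['a', 'b', 'c'], ['a', 'b', 'c', 'd'],
#  ['b', 'c', 'd'], ['c', 'd'], ['d']]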
Example #4
    def execute(self, corpus_format, corpus_path, data_path, tokenization):
        assert corpus_format is not None
        assert corpus_path is not None
        assert data_path is not None
        if tokenization is None:
            tokenization = Tokenize.DEFAULT_TOKENIZATION
        elif tokenization == 'unicode':
            tokenization = Tokenize.UNICODE_TOKENIZATION
        elif tokenization == 'whitespace':
            tokenization = Tokenize.WHITESPACE_TOKENIZATION
        elif tokenization == 'alpha':
            tokenization = Tokenize.ALPHA_TOKENIZATION
        elif tokenization == 'alphanumeric':
            tokenization = Tokenize.ALPHANUMERIC_TOKENIZATION

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Tokenizing source corpus...')
        self.logger.info('    corpus_path = %s (%s)', corpus_path,
                         corpus_format)
        self.logger.info('    data_path = %s', data_path)
        self.logger.info('    tokenization = %s', tokenization)

        self.logger.info('Connecting to data...')
        self.documents = DocumentsAPI(corpus_format, corpus_path)
        self.tokens = TokensAPI(data_path)

        self.logger.info('Reading from disk...')
        self.documents.read()

        self.logger.info('Tokenizing...')
        self.TokenizeDocuments(re.compile(tokenization, re.UNICODE))

        self.logger.info('Writing to disk...')
        self.tokens.write()

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
Example #5
    def execute(self,
                corpus_format,
                corpus_path,
                data_path,
                tokenization=None):

        assert corpus_format is not None
        assert corpus_path is not None
        assert data_path is not None
        if tokenization is None:
            tokenization = Tokenize.DEFAULT_TOKENIZATION

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Tokenizing source corpus...')
        self.logger.info('    corpus_path = %s (%s)', corpus_path,
                         corpus_format)
        self.logger.info('    data_path = %s', data_path)
        self.logger.info('    tokenization = %s', tokenization)

        self.logger.info('Connecting to data...')
        self.documents = DocumentsAPI(corpus_format, corpus_path)
        self.tokens = TokensAPI(data_path)

        self.logger.info('Reading from disk...')
        self.documents.read()

        self.logger.info('Tokenizing...')
        self.TokenizeDocuments(re.compile(tokenization, re.UNICODE))

        self.logger.info('Writing to disk...')
        self.tokens.write()

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
Example #6
class ComputeSimilarity( object ):
	"""
	Similarity measures.
	
	Compute term similarity based on co-occurrence and
	collocation likelihoods.
	"""
	
	DEFAULT_SLIDING_WINDOW_SIZE = 10
	MAX_FREQ = 100.0
	
	def __init__( self, logging_level ):
		self.logger = logging.getLogger( 'ComputeSimilarity' )
		self.logger.setLevel( logging_level )
		handler = logging.StreamHandler( sys.stderr )
		handler.setLevel( logging_level )
		self.logger.addHandler( handler )
	
	def execute( self, data_path, sliding_window_size = None ):
		
		assert data_path is not None
		if sliding_window_size is None:
			sliding_window_size = ComputeSimilarity.DEFAULT_SLIDING_WINDOW_SIZE
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Computing term similarity...'                                                     )
		self.logger.info( '    data_path = %s', data_path                                                    )
		self.logger.info( '    sliding_window_size = %d', sliding_window_size                                )
		
		self.logger.info( 'Connecting to data...' )
		self.tokens = TokensAPI( data_path )
		self.similarity = SimilarityAPI( data_path )
		
		self.logger.info( 'Reading data from disk...' )
		self.tokens.read()
		
		self.logger.info( 'Computing document co-occurrence...' )
		self.computeDocumentCooccurrence()
		
		self.logger.info( 'Computing sliding-window co-occurrence...' )
		self.computeSlidingWindowCooccurrence( sliding_window_size )
		
		self.logger.info( 'Counting total number of tokens, unigrams, and bigrams in the corpus...' )
		self.computeTokenCounts()
		
		self.logger.info( 'Computing document co-occurrence likelihood...' )
		self.similarity.document_g2 = self.getG2Stats( self.document_count, self.similarity.document_occurrence, self.similarity.document_cooccurrence )
		
		self.logger.info( 'Computing sliding-window co-occurrence likelihood...' )
		self.similarity.window_g2 = self.getG2Stats( self.window_count, self.similarity.window_occurrence, self.similarity.window_cooccurrence )
		
		self.logger.info( 'Computing collocation likelihood...' )
		self.similarity.collocation_g2 = self.getG2Stats( self.token_count, self.similarity.unigram_counts, self.similarity.bigram_counts )
		
		self.combineSimilarityMatrices()
		
		self.logger.info( 'Writing data to disk...' )
		self.similarity.write()
		
		self.logger.info( '--------------------------------------------------------------------------------' )
	
	def incrementCount( self, occurrence, key ):
		if key not in occurrence:
			occurrence[ key ] = 1
		else:
			occurrence[ key ] += 1
	
	def computeDocumentCooccurrence( self ):
		document_count = 0
		occurrence = {}
		cooccurrence = {}
		for docID, docTokens in self.tokens.data.iteritems():
			self.logger.debug( '    %s (%d tokens)', docID, len(docTokens) )
			tokenSet = frozenset(docTokens)
			document_count += 1
			for token in tokenSet:
				self.incrementCount( occurrence, token )
			for aToken in tokenSet:
				for bToken in tokenSet:
					if aToken < bToken:
						self.incrementCount( cooccurrence, (aToken, bToken) )
		
		self.document_count = document_count
		self.similarity.document_occurrence = occurrence
		self.similarity.document_cooccurrence = cooccurrence
	
	def computeSlidingWindowCooccurrence( self, sliding_window_size ):
		window_count = 0
		occurrence = {}
		cooccurrence = {}
		for docID, docTokens in self.tokens.data.iteritems():
			allWindowTokens = self.getSlidingWindowTokens( docTokens, sliding_window_size )
			self.logger.debug( '    %s (%d tokens, %d windows)', docID, len(docTokens), len(allWindowTokens) )
			for windowTokens in allWindowTokens:
				tokenSet = frozenset(windowTokens)
				window_count += 1
				for token in tokenSet:
					self.incrementCount( occurrence, token )
				for aToken in tokenSet:
					for bToken in tokenSet:
						if aToken < bToken:
							self.incrementCount( cooccurrence, (aToken, bToken) )
		
		self.window_count = window_count
		self.similarity.window_occurrence = occurrence
		self.similarity.window_cooccurrence = cooccurrence
	
	def getSlidingWindowTokens( self, tokens, sliding_window_size ):
		allWindows = []
		aIndex = 0 - sliding_window_size
		bIndex = len(tokens) + sliding_window_size
		for index in range( aIndex, bIndex ):
			a = max( 0           , index - sliding_window_size )
			b = min( len(tokens) , index + sliding_window_size )
			allWindows.append( tokens[a:b] )
		return allWindows
	
	def computeTokenCounts( self ):
		token_count = sum( len(docTokens) for docTokens in self.tokens.data.itervalues() )
		
		unigram_counts = {}
		for docTokens in self.tokens.data.itervalues():
			for token in docTokens:
				self.incrementCount( unigram_counts, token )
		
		bigram_counts = {}
		for docTokens in self.tokens.data.itervalues():
			prevToken = None
			for currToken in docTokens:
				if prevToken is not None:
					self.incrementCount( bigram_counts, (prevToken, currToken) )
				prevToken = currToken
		
		self.token_count = token_count
		self.similarity.unigram_counts = unigram_counts
		self.similarity.bigram_counts = bigram_counts
	
	def getBinomial( self, B_given_A, any_given_A, B_given_notA, any_given_notA ):
		assert B_given_A >= 0
		assert B_given_notA >= 0
		assert any_given_A >= B_given_A
		assert any_given_notA >= B_given_notA
		
		a = float( B_given_A )
		b = float( B_given_notA )
		c = float( any_given_A )
		d = float( any_given_notA )
		E1 = c * ( a + b ) / ( c + d )
		E2 = d * ( a + b ) / ( c + d )
		
		g2a = 0
		g2b = 0
		if a > 0:
			g2a = a * math.log( a / E1 )
		if b > 0:
			g2b = b * math.log( b / E2 )
		return 2 * ( g2a + g2b )
	
	def getG2( self, freq_all, freq_ab, freq_a, freq_b ):
		assert freq_all >= freq_a
		assert freq_all >= freq_b
		assert freq_a >= freq_ab
		assert freq_b >= freq_ab
		assert freq_all >= 0
		assert freq_ab >= 0
		assert freq_a >= 0
		assert freq_b >= 0
		
		B_given_A = freq_ab
		B_given_notA = freq_b - freq_ab
		any_given_A = freq_a
		any_given_notA = freq_all - freq_a
		
		return self.getBinomial( B_given_A, any_given_A, B_given_notA, any_given_notA )
	
	def getG2Stats( self, max_count, occurrence, cooccurrence ):
		g2_stats = {}
		freq_all = max_count
		for ( firstToken, secondToken ) in cooccurrence:
			freq_a = occurrence[ firstToken ]
			freq_b = occurrence[ secondToken ]
			freq_ab = cooccurrence[ (firstToken, secondToken) ]
			
			scale = ComputeSimilarity.MAX_FREQ / freq_all
			rescaled_freq_all = freq_all * scale
			rescaled_freq_a = freq_a * scale
			rescaled_freq_b = freq_b * scale
			rescaled_freq_ab = freq_ab * scale
			if rescaled_freq_a > 1.0 and rescaled_freq_b > 1.0:
				g2_stats[ (firstToken, secondToken) ] = self.getG2( freq_all, freq_ab, freq_a, freq_b )
		return g2_stats
	
	def combineSimilarityMatrices( self ):
		self.logger.info( 'Combining similarity matrices...' )
		self.similarity.combined_g2 = {}
		
		keys_queued = []
		for key in self.similarity.document_g2:
			( firstToken, secondToken ) = key
			otherKey = ( secondToken, firstToken )
			keys_queued.append( key )
			keys_queued.append( otherKey )
		for key in self.similarity.window_g2:
			( firstToken, secondToken ) = key
			otherKey = ( secondToken, firstToken )
			keys_queued.append( key )
			keys_queued.append( otherKey )
		for key in self.similarity.collocation_g2:
			keys_queued.append( key )
		
		keys_processed = {}
		for key in keys_queued:
			keys_processed[ key ] = False
		
		for key in keys_queued:
			if not keys_processed[ key ]:
				keys_processed[ key ] = True
				
				( firstToken, secondToken ) = key
				if firstToken < secondToken:
					orderedKey = key
				else:
					orderedKey = ( secondToken, firstToken )
				score = 0.0
				if orderedKey in self.similarity.document_g2:
					score += self.similarity.document_g2[ orderedKey ]
				if orderedKey in self.similarity.window_g2:
					score += self.similarity.window_g2[ orderedKey ]
				if key in self.similarity.collocation_g2:
					score += self.similarity.collocation_g2[ key ]
				if score > 0.0:
					self.similarity.combined_g2[ key ] = score
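To make the G2 statistic above concrete, here is a small standalone calculation with made-up counts. It mirrors getG2/getBinomial: the score is a log-likelihood ratio comparing how often term B occurs together with term A against how often B occurs elsewhere, so a pair that co-occurs far more than expected gets a large score, while a pair at exactly its expected rate scores 0.

import math

def g2(freq_all, freq_ab, freq_a, freq_b):
    # Mirrors ComputeSimilarity.getG2 / getBinomial above.
    a = float(freq_ab)               # B seen together with A
    b = float(freq_b - freq_ab)      # B seen without A
    c = float(freq_a)                # opportunities with A
    d = float(freq_all - freq_a)     # opportunities without A
    E1 = c * (a + b) / (c + d)       # B-with-A count expected under independence
    E2 = d * (a + b) / (c + d)
    g2a = a * math.log(a / E1) if a > 0 else 0.0
    g2b = b * math.log(b / E2) if b > 0 else 0.0
    return 2.0 * (g2a + g2b)

# Hypothetical counts: 1000 documents, term A in 50 of them, term B in 40.
print(g2(1000, 25, 50, 40))  # 25 joint occurrences vs. 2 expected -> ~98.4
print(g2(1000, 2, 50, 40))   # exactly the expected rate -> 0.0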
Example #7
class Tokenize( object ):

	"""
	Takes in the input corpus doc and writes it out as a list of tokens.
	
	Currently, supports only single document corpus with one document per line of format:
		doc_id<tab>document_content
	(Two fields delimited by tab.)
	
	Support for multiple files, directory(ies), and Lucene considered for future releases.
	"""
	
	WHITESPACE_TOKENIZATION = r'[^ ]+'
	ALPHANUMERIC_TOKENIZATION = r'[0-9A-Za-z_]*[A-Za-z_]+[0-9A-Za-z_]*'
	ALPHA_TOKENIZATION = r'[A-Za-z_]+'
	UNICODE_TOKENIZATION = r'[\p{L}\p{M}]+'
	DEFAULT_TOKENIZATION = ALPHA_TOKENIZATION
	
	def __init__( self, logging_level ):
		self.logger = logging.getLogger( 'Tokenize' )
		self.logger.setLevel( logging_level )
		handler = logging.StreamHandler( sys.stderr )
		handler.setLevel( logging_level )
		self.logger.addHandler( handler )
	
	def execute( self, corpus_format, corpus_path, data_path, tokenization = None ):
		
		assert corpus_format is not None
		assert corpus_path is not None
		assert data_path is not None
		if tokenization is None:
			tokenization = Tokenize.DEFAULT_TOKENIZATION
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Tokenizing source corpus...'                                                      )
		self.logger.info( '    corpus_path = %s (%s)', corpus_path, corpus_format                            )
		self.logger.info( '    data_path = %s', data_path                                                    )
		self.logger.info( '    tokenization = %s', tokenization                                              )
		
		self.logger.info( 'Connecting to data...' )
		self.documents = DocumentsAPI( corpus_format, corpus_path )
		self.tokens = TokensAPI( data_path )
		
		self.logger.info( 'Reading from disk...' )
		self.documents.read()
		
		self.logger.info( 'Tokenizing...' )
		self.TokenizeDocuments( re.compile( tokenization, re.UNICODE ) )
		
		self.logger.info( 'Writing to disk...' )
		self.tokens.write()
		
		self.logger.info( '--------------------------------------------------------------------------------' )
	
	def TokenizeDocuments( self, tokenizer ):
		for docID, docContent in self.documents.data.iteritems():
			docTokens = self.TokenizeDocument( docContent, tokenizer )
			self.tokens.data[ docID ] = docTokens
	
	def TokenizeDocument( self, text, tokenizer ):
		tokens = []
		for token in re.findall( tokenizer, text ):
			tokens.append( token.lower() )
		return tokens
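Since DocumentsAPI and TokensAPI are not shown in these examples, the sketch below reproduces the same end-to-end behavior in a self-contained form: read the doc_id<tab>content corpus described in the docstring, tokenize each document with the default pattern, and lower-case the tokens. The function name and the corpus path are placeholders, and unlike the class above it only returns the result instead of writing it back to disk.

import re

ALPHA_TOKENIZATION = r'[A-Za-z_]+'   # same as Tokenize.DEFAULT_TOKENIZATION

def tokenize_corpus(corpus_path, tokenization=ALPHA_TOKENIZATION):
    # One document per line: doc_id<TAB>document_content.
    tokenizer = re.compile(tokenization, re.UNICODE)
    tokens = {}
    with open(corpus_path, encoding='utf-8') as f:
        for line in f:
            doc_id, _, content = line.rstrip('\n').partition('\t')
            tokens[doc_id] = [t.lower() for t in tokenizer.findall(content)]
    return tokens

# tokens = tokenize_corpus('corpus.txt')   # hypothetical path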
Example #8
class Tokenize(object):
    """
	Takes in the input corpus doc and writes it out as a list of tokens.
	
	Currently, supports only single document corpus with one document per line of format:
		doc_id<tab>document_content
	(Two fields delimited by tab.)
	
	Support for multiple files, directory(ies), and Lucene considered for future releases.
	"""

    WHITESPACE_TOKENIZATION = r'[^ ]+'
    ALPHANUMERIC_TOKENIZATION = r'[0-9A-Za-z_]*[A-Za-z_]+[0-9A-Za-z_]*'
    ALPHA_TOKENIZATION = r'[A-Za-z_]+'
    UNICODE_TOKENIZATION = r'[\p{L}\p{M}]+'
    DEFAULT_TOKENIZATION = ALPHA_TOKENIZATION

    def __init__(self, logging_level):
        self.logger = logging.getLogger('Tokenize')
        self.logger.setLevel(logging_level)
        handler = logging.StreamHandler(sys.stderr)
        handler.setLevel(logging_level)
        self.logger.addHandler(handler)

    def execute(self,
                corpus_format,
                corpus_path,
                data_path,
                tokenization=None):

        assert corpus_format is not None
        assert corpus_path is not None
        assert data_path is not None
        if tokenization is None:
            tokenization = Tokenize.DEFAULT_TOKENIZATION

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Tokenizing source corpus...')
        self.logger.info('    corpus_path = %s (%s)', corpus_path,
                         corpus_format)
        self.logger.info('    data_path = %s', data_path)
        self.logger.info('    tokenization = %s', tokenization)

        self.logger.info('Connecting to data...')
        self.documents = DocumentsAPI(corpus_format, corpus_path)
        self.tokens = TokensAPI(data_path)

        self.logger.info('Reading from disk...')
        self.documents.read()

        self.logger.info('Tokenizing...')
        self.TokenizeDocuments(re.compile(tokenization, re.UNICODE))

        self.logger.info('Writing to disk...')
        self.tokens.write()

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )

    def TokenizeDocuments(self, tokenizer):
        for docID, docContent in self.documents.data.iteritems():
            docTokens = self.TokenizeDocument(docContent, tokenizer)
            self.tokens.data[docID] = docTokens

    def TokenizeDocument(self, text, tokenizer):
        tokens = []
        for token in re.findall(tokenizer, text):
            tokens.append(token.lower())
        return tokens
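The classes in these examples still use Python 2 idioms (iteritems, itervalues), so they run as written only under Python 2. Below is a rough sketch of how the two stages might be chained, assuming Tokenize and ComputeSimilarity are importable from this project and that DocumentsAPI / TokensAPI accept the placeholder corpus format and paths shown; adjust both to whatever those classes actually expect.

import logging

# Placeholder corpus format and paths; the real values depend on DocumentsAPI / TokensAPI.
tokenizer = Tokenize(logging.INFO)
tokenizer.execute('file', 'corpus.txt', 'data/', Tokenize.ALPHA_TOKENIZATION)

similarity = ComputeSimilarity(logging.INFO)
similarity.execute('data/', sliding_window_size=10)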