Example #1
	def execute( self, corpus_format, corpus_path, data_path, tokenization = None ):
		
		assert corpus_format is not None
		assert corpus_path is not None
		assert data_path is not None
		if tokenization is None:
			tokenization = Tokenize.DEFAULT_TOKENIZATION
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Tokenizing source corpus...'                                                      )
		self.logger.info( '    corpus_path = %s (%s)', corpus_path, corpus_format                            )
		self.logger.info( '    data_path = %s', data_path                                                    )
		self.logger.info( '    tokenization = %s', tokenization                                              )
		
		self.logger.info( 'Connecting to data...' )
		self.documents = DocumentsAPI( corpus_format, corpus_path )
		self.tokens = TokensAPI( data_path )
		
		self.logger.info( 'Reading from disk...' )
		self.documents.read()
		
		self.logger.info( 'Tokenizing...' )
		self.TokenizeDocuments( re.compile( tokenization, re.UNICODE ) )
		
		self.logger.info( 'Writing to disk...' )
		self.tokens.write()
		
		self.logger.info( '--------------------------------------------------------------------------------' )
Example #2
    def execute(self, corpus_format, corpus_path, data_path, tokenization):
        assert corpus_format is not None
        assert corpus_path is not None
        assert data_path is not None
        if tokenization is None:
            tokenization = Tokenize.DEFAULT_TOKENIZATION
        elif tokenization == "unicode":
            tokenization = Tokenize.UNICODE_TOKENIZATION
        elif tokenization == "whitespace":
            tokenization = Tokenize.WHITESPACE_TOKENIZATION
        elif tokenization == "alpha":
            tokenization = Tokenize.ALPHA_TOKENIZATION
        elif tokenization == "alphanumeric":
            tokenization = Tokenize.ALPHANUMERIC_TOKENIZATION

        self.logger.info("--------------------------------------------------------------------------------")
        self.logger.info("Tokenizing source corpus...")
        self.logger.info("    corpus_path = %s (%s)", corpus_path, corpus_format)
        self.logger.info("    data_path = %s", data_path)
        self.logger.info("    tokenization = %s", tokenization)

        self.logger.info("Connecting to data...")
        self.documents = DocumentsAPI(corpus_format, corpus_path)
        self.tokens = TokensAPI(data_path)

        self.logger.info("Reading from disk...")
        self.documents.read()

        self.logger.info("Tokenizing...")
        self.TokenizeDocuments(re.compile(tokenization, re.UNICODE))

        self.logger.info("Writing to disk...")
        self.tokens.write()

        self.logger.info("--------------------------------------------------------------------------------")
Example #3
    def execute(self, corpus_format, corpus_path, data_path, tokenization):
        assert corpus_format is not None
        assert corpus_path is not None
        assert data_path is not None
        if tokenization is None:
            tokenization = Tokenize.DEFAULT_TOKENIZATION
        elif tokenization == 'unicode':
            tokenization = Tokenize.UNICODE_TOKENIZATION
        elif tokenization == 'whitespace':
            tokenization = Tokenize.WHITESPACE_TOKENIZATION
        elif tokenization == 'alpha':
            tokenization = Tokenize.ALPHA_TOKENIZATION
        elif tokenization == 'alphanumeric':
            tokenization = Tokenize.ALPHANUMERIC_TOKENIZATION

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Tokenizing source corpus...')
        self.logger.info('    corpus_path = %s (%s)', corpus_path,
                         corpus_format)
        self.logger.info('    data_path = %s', data_path)
        self.logger.info('    tokenization = %s', tokenization)

        self.logger.info('Connecting to data...')
        self.documents = DocumentsAPI(corpus_format, corpus_path)
        self.tokens = TokensAPI(data_path)

        self.logger.info('Reading from disk...')
        self.documents.read()

        self.logger.info('Tokenizing...')
        self.TokenizeDocuments(re.compile(tokenization, re.UNICODE))

        self.logger.info('Writing to disk...')
        self.tokens.write()

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
Example #4
    def execute(self,
                corpus_format,
                corpus_path,
                data_path,
                tokenization=None):

        assert corpus_format is not None
        assert corpus_path is not None
        assert data_path is not None
        if tokenization is None:
            tokenization = Tokenize.DEFAULT_TOKENIZATION

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Tokenizing source corpus...')
        self.logger.info('    corpus_path = %s (%s)', corpus_path,
                         corpus_format)
        self.logger.info('    data_path = %s', data_path)
        self.logger.info('    tokenization = %s', tokenization)

        self.logger.info('Connecting to data...')
        self.documents = DocumentsAPI(corpus_format, corpus_path)
        self.tokens = TokensAPI(data_path)

        self.logger.info('Reading from disk...')
        self.documents.read()

        self.logger.info('Tokenizing...')
        self.TokenizeDocuments(re.compile(tokenization, re.UNICODE))

        self.logger.info('Writing to disk...')
        self.tokens.write()

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
Example #5
class Tokenize( object ):

	"""
	Takes in the input corpus doc and writes it out as a list of tokens.
	
	Currently, supports only single document corpus with one document per line of format:
		doc_id<tab>document_content
	(Two fields delimited by tab.)
	
	Support for multiple files, directory(ies), and Lucene considered for future releases.
	"""
	
	WHITESPACE_TOKENIZATION = r'[^ ]+'
	ALPHANUMERIC_TOKENIZATION = r'[0-9A-Za-z_]*[A-Za-z_]+[0-9A-Za-z_]*'
	ALPHA_TOKENIZATION = r'[A-Za-z_]+'
	UNICODE_TOKENIZATION = r'[\p{L}\p{M}]+'
	DEFAULT_TOKENIZATION = ALPHA_TOKENIZATION
	
	def __init__( self, logging_level ):
		self.logger = logging.getLogger( 'Tokenize' )
		self.logger.setLevel( logging_level )
		handler = logging.StreamHandler( sys.stderr )
		handler.setLevel( logging_level )
		self.logger.addHandler( handler )
	
	def execute( self, corpus_format, corpus_path, data_path, tokenization = None ):
		
		assert corpus_format is not None
		assert corpus_path is not None
		assert data_path is not None
		if tokenization is None:
			tokenization = Tokenize.DEFAULT_TOKENIZATION
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Tokenizing source corpus...'                                                      )
		self.logger.info( '    corpus_path = %s (%s)', corpus_path, corpus_format                            )
		self.logger.info( '    data_path = %s', data_path                                                    )
		self.logger.info( '    tokenization = %s', tokenization                                              )
		
		self.logger.info( 'Connecting to data...' )
		self.documents = DocumentsAPI( corpus_format, corpus_path )
		self.tokens = TokensAPI( data_path )
		
		self.logger.info( 'Reading from disk...' )
		self.documents.read()
		
		self.logger.info( 'Tokenizing...' )
		self.TokenizeDocuments( re.compile( tokenization, re.UNICODE ) )
		
		self.logger.info( 'Writing to disk...' )
		self.tokens.write()
		
		self.logger.info( '--------------------------------------------------------------------------------' )
	
	def TokenizeDocuments( self, tokenizer ):
		for docID, docContent in self.documents.data.iteritems():
			docTokens = self.TokenizeDocument( docContent, tokenizer )
			self.tokens.data[ docID ] = docTokens
	
	def TokenizeDocument( self, text, tokenizer ):
		tokens = []
		for token in re.findall( tokenizer, text ):
			tokens.append( token.lower() )
		return tokens
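
Examples #5 and #6 show the full class around the execute method. A minimal driver sketch, assuming the Tokenize class above, that DocumentsAPI and TokensAPI are importable from the same project, and that the corpus is the tab-delimited file described in the docstring; the corpus format identifier and the paths below are hypothetical placeholders:

import logging

# Hypothetical wiring: 'file', 'corpus.txt' and 'data/' are placeholders, and
# DocumentsAPI/TokensAPI are assumed to be provided by the surrounding project.
tokenizer = Tokenize(logging.INFO)
tokenizer.execute(
    corpus_format='file',      # format identifier understood by DocumentsAPI (assumed)
    corpus_path='corpus.txt',  # one "doc_id<tab>document_content" line per document
    data_path='data/',         # destination handed to TokensAPI
    tokenization=Tokenize.WHITESPACE_TOKENIZATION,
)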
Example #6
class Tokenize(object):
    """
	Takes in the input corpus doc and writes it out as a list of tokens.
	
	Currently, supports only single document corpus with one document per line of format:
		doc_id<tab>document_content
	(Two fields delimited by tab.)
	
	Support for multiple files, directory(ies), and Lucene considered for future releases.
	"""

    WHITESPACE_TOKENIZATION = r'[^ ]+'
    ALPHANUMERIC_TOKENIZATION = r'[0-9A-Za-z_]*[A-Za-z_]+[0-9A-Za-z_]*'
    ALPHA_TOKENIZATION = r'[A-Za-z_]+'
    UNICODE_TOKENIZATION = r'[\p{L}\p{M}]+'
    DEFAULT_TOKENIZATION = ALPHA_TOKENIZATION

    def __init__(self, logging_level):
        self.logger = logging.getLogger('Tokenize')
        self.logger.setLevel(logging_level)
        handler = logging.StreamHandler(sys.stderr)
        handler.setLevel(logging_level)
        self.logger.addHandler(handler)

    def execute(self,
                corpus_format,
                corpus_path,
                data_path,
                tokenization=None):

        assert corpus_format is not None
        assert corpus_path is not None
        assert data_path is not None
        if tokenization is None:
            tokenization = Tokenize.DEFAULT_TOKENIZATION

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Tokenizing source corpus...')
        self.logger.info('    corpus_path = %s (%s)', corpus_path,
                         corpus_format)
        self.logger.info('    data_path = %s', data_path)
        self.logger.info('    tokenization = %s', tokenization)

        self.logger.info('Connecting to data...')
        self.documents = DocumentsAPI(corpus_format, corpus_path)
        self.tokens = TokensAPI(data_path)

        self.logger.info('Reading from disk...')
        self.documents.read()

        self.logger.info('Tokenizing...')
        self.TokenizeDocuments(re.compile(tokenization, re.UNICODE))

        self.logger.info('Writing to disk...')
        self.tokens.write()

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )

    def TokenizeDocuments(self, tokenizer):
        for docID, docContent in self.documents.data.iteritems():
            docTokens = self.TokenizeDocument(docContent, tokenizer)
            self.tokens.data[docID] = docTokens

    def TokenizeDocument(self, text, tokenizer):
        tokens = []
        for token in re.findall(tokenizer, text):
            tokens.append(token.lower())
        return tokens
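
One caveat on the UNICODE_TOKENIZATION constant: Python's built-in re module does not understand the \p{L} and \p{M} property classes, so re.compile(tokenization, re.UNICODE) will not behave as intended for that option (older interpreters silently treat them as the literal characters p, {, L, }, M, while newer ones raise an error). A hedged sketch of two workarounds, one using the third-party regex package and one standard-library approximation:

import re
import regex  # third-party package (pip install regex); supports \p{...} classes

text = u'Caf\u00e9 na\u00efve r\u00e9sum\u00e9 123'

# The pattern as written in the class, compiled with `regex` instead of `re`.
print(regex.findall(r'[\p{L}\p{M}]+', text))
# ['Café', 'naïve', 'résumé']

# Rough standard-library stand-in: \w also matches digits and underscores,
# so it is an approximation, not an exact equivalent.
print(re.findall(r'\w+', text, re.UNICODE))
# ['Café', 'naïve', 'résumé', '123']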