import logging
import re
import sys

# DocumentsAPI and TokensAPI are provided elsewhere in this package.


class Tokenize(object):
    """
    Reads in the source corpus and writes it back out as per-document token lists.

    Currently supports only a single-file corpus with one document per line, in
    the format:

        doc_id<tab>document_content

    (two fields delimited by a tab). Support for multiple files, directories,
    and Lucene is being considered for future releases.
    """

    WHITESPACE_TOKENIZATION = r'[^ ]+'
    ALPHANUMERIC_TOKENIZATION = r'[0-9A-Za-z_]*[A-Za-z_]+[0-9A-Za-z_]*'
    ALPHA_TOKENIZATION = r'[A-Za-z_]+'
    # Note: the \p{L}/\p{M} property classes are not supported by the standard
    # library's re module; this pattern requires the third-party 'regex' module.
    UNICODE_TOKENIZATION = r'[\p{L}\p{M}]+'
    DEFAULT_TOKENIZATION = ALPHA_TOKENIZATION

    def __init__(self, logging_level):
        self.logger = logging.getLogger('Tokenize')
        self.logger.setLevel(logging_level)
        handler = logging.StreamHandler(sys.stderr)
        handler.setLevel(logging_level)
        self.logger.addHandler(handler)

    def execute(self, corpus_format, corpus_path, data_path, tokenization=None):
        assert corpus_format is not None
        assert corpus_path is not None
        assert data_path is not None

        # Map named tokenization options to their patterns; anything else is
        # treated as a regular expression and compiled as-is.
        if tokenization is None:
            tokenization = Tokenize.DEFAULT_TOKENIZATION
        elif tokenization == 'unicode':
            tokenization = Tokenize.UNICODE_TOKENIZATION
        elif tokenization == 'whitespace':
            tokenization = Tokenize.WHITESPACE_TOKENIZATION
        elif tokenization == 'alpha':
            tokenization = Tokenize.ALPHA_TOKENIZATION
        elif tokenization == 'alphanumeric':
            tokenization = Tokenize.ALPHANUMERIC_TOKENIZATION

        self.logger.info('--------------------------------------------------------------------------------')
        self.logger.info('Tokenizing source corpus...')
        self.logger.info('    corpus_path = %s (%s)', corpus_path, corpus_format)
        self.logger.info('      data_path = %s', data_path)
        self.logger.info('   tokenization = %s', tokenization)

        self.logger.info('Connecting to data...')
        self.documents = DocumentsAPI(corpus_format, corpus_path)
        self.tokens = TokensAPI(data_path)

        self.logger.info('Reading from disk...')
        self.documents.read()

        self.logger.info('Tokenizing...')
        self.TokenizeDocuments(re.compile(tokenization, re.UNICODE))

        self.logger.info('Writing to disk...')
        self.tokens.write()

        self.logger.info('--------------------------------------------------------------------------------')

    def TokenizeDocuments(self, tokenizer):
        # Tokenize every document and store the result keyed by document ID.
        for docID, docContent in self.documents.data.items():
            self.tokens.data[docID] = self.TokenizeDocument(docContent, tokenizer)

    def TokenizeDocument(self, text, tokenizer):
        # Lower-case every match of the compiled tokenization pattern.
        return [token.lower() for token in tokenizer.findall(text)]
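
# Illustrative sketch (not part of the original pipeline): shows what the
# tokenization patterns defined above extract from a single corpus record of
# the form "doc_id<TAB>document_content". The sample line is made up.
def _demo_tokenization_patterns():
    sample = 'doc42\tHello world_2 re-tokenize 3rd time'
    doc_id, content = sample.split('\t', 1)
    for name, pattern in [
        ('whitespace', Tokenize.WHITESPACE_TOKENIZATION),
        ('alphanumeric', Tokenize.ALPHANUMERIC_TOKENIZATION),
        ('alpha', Tokenize.ALPHA_TOKENIZATION),
    ]:
        # Same operation the class performs: findall plus lower-casing.
        tokens = [token.lower() for token in re.findall(pattern, content)]
        # e.g. the alpha pattern drops digits, so '3rd' comes out as 'rd'.
        print(doc_id, name, tokens)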
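
# Hedged usage sketch: how the class above might be driven end to end. The
# corpus_format label and both paths are placeholders, and DocumentsAPI /
# TokensAPI must be importable from the surrounding package for execute() to
# actually read and write anything; this is not the tool's official entry point.
def _example_run():
    tokenizer = Tokenize(logging.INFO)
    tokenizer.execute(
        corpus_format='file',            # placeholder label for DocumentsAPI
        corpus_path='data/corpus.txt',   # one "doc_id<TAB>content" per line
        data_path='data/tokens',         # where TokensAPI writes token lists
        tokenization='alpha',            # named option; None uses the default
    )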