Example #1
    def execute(self, corpus_format, corpus_path, data_path, tokenization):
        assert corpus_format is not None
        assert corpus_path is not None
        assert data_path is not None
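        # Map the requested scheme name to its regex pattern; None selects the default,
        # and unrecognized values are passed through to re.compile() unchanged.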
        if tokenization is None:
            tokenization = Tokenize.DEFAULT_TOKENIZATION
        elif tokenization == 'unicode':
            tokenization = Tokenize.UNICODE_TOKENIZATION
        elif tokenization == 'whitespace':
            tokenization = Tokenize.WHITESPACE_TOKENIZATION
        elif tokenization == 'alpha':
            tokenization = Tokenize.ALPHA_TOKENIZATION
        elif tokenization == 'alphanumeric':
            tokenization = Tokenize.ALPHANUMERIC_TOKENIZATION

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Tokenizing source corpus...')
        self.logger.info('    corpus_path = %s (%s)', corpus_path,
                         corpus_format)
        self.logger.info('    data_path = %s', data_path)
        self.logger.info('    tokenization = %s', tokenization)

        self.logger.info('Connecting to data...')
        self.documents = DocumentsAPI(corpus_format, corpus_path)
        self.tokens = TokensAPI(data_path)

        self.logger.info('Reading from disk...')
        self.documents.read()

        self.logger.info('Tokenizing...')
        self.TokenizeDocuments(re.compile(tokenization, re.UNICODE))

        self.logger.info('Writing to disk...')
        self.tokens.write()

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
Example #2
    def execute(self,
                corpus_format,
                corpus_path,
                data_path,
                tokenization=None):

        assert corpus_format is not None
        assert corpus_path is not None
        assert data_path is not None
        if tokenization is None:
            tokenization = Tokenize.DEFAULT_TOKENIZATION

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Tokenizing source corpus...')
        self.logger.info('    corpus_path = %s (%s)', corpus_path,
                         corpus_format)
        self.logger.info('    data_path = %s', data_path)
        self.logger.info('    tokenization = %s', tokenization)

        self.logger.info('Connecting to data...')
        self.documents = DocumentsAPI(corpus_format, corpus_path)
        self.tokens = TokensAPI(data_path)

        self.logger.info('Reading from disk...')
        self.documents.read()

        self.logger.info('Tokenizing...')
        self.TokenizeDocuments(re.compile(tokenization, re.UNICODE))

        self.logger.info('Writing to disk...')
        self.tokens.write()

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
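Both examples reduce to the same idea: pick a regex pattern for the requested tokenization scheme, compile it with re.UNICODE, and split every document into tokens with it. Below is a minimal self-contained sketch of that idea; the pattern strings and the tokenize_documents helper are stand-ins chosen for illustration, since the actual Tokenize.*_TOKENIZATION constants and the DocumentsAPI/TokensAPI classes are not shown in the source.

import re

# Stand-in pattern table; the real Tokenize.*_TOKENIZATION constants are not shown above.
TOKENIZATION_PATTERNS = {
    'unicode': r'\w+',           # Unicode word characters
    'whitespace': r'\S+',        # runs of non-whitespace
    'alpha': r'[^\W\d_]+',       # alphabetic runs only
    'alphanumeric': r'[^\W_]+',  # letters and digits
}
DEFAULT_SCHEME = 'unicode'

def tokenize_documents(documents, scheme=None):
    # Compile the chosen pattern once, then tokenize each document with it,
    # mirroring the TokenizeDocuments(re.compile(tokenization, re.UNICODE)) call above.
    pattern = re.compile(TOKENIZATION_PATTERNS[scheme or DEFAULT_SCHEME], re.UNICODE)
    return {doc_id: pattern.findall(text) for doc_id, text in documents.items()}

print(tokenize_documents({'doc1': 'Hola, mundo 42'}, scheme='alpha'))
# -> {'doc1': ['Hola', 'mundo']}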