Example #1
    def _generate_training_data(self, config):
        if 'data' in config and 'train_dir' in config['data']:
            train_dir = config['data']['train_dir']
        else:
            train_dir = 'train'
        data_path = os.path.join(self._corpus_dir, train_dir)
        num_samples = None
        summary = None
        metadata = None
        logger.info('Generating training data from %s', data_path)
        if 'data' in config and 'sample_dist' in config['data']:
            sample_dir = os.path.join(self._data_dir, 'sample')
            if not os.path.exists(sample_dir):
                os.mkdir(sample_dir)
            sample_path = os.path.join(sample_dir, train_dir)
            logger.info('Sampling training data to %s', sample_path)
            summary, metadata = sample(config['data']['sample'],
                                       config['data']['sample_dist'],
                                       data_path, sample_path,
                                       config['source'], config['target'])
            num_samples = sum(six.itervalues(summary['file']))
            if num_samples == 0:
                raise RuntimeError('data sampling generated 0 sentences')
            data_path = sample_path
        if 'tokenization' in config:
            tok_config = config['tokenization']
            src_tokenizer = tokenizer.build_tokenizer(
                tok_config['source'] if 'source' in tok_config else tok_config)
            tgt_tokenizer = tokenizer.build_tokenizer(
                tok_config['target'] if 'target' in tok_config else tok_config)
            tokenized_dir = os.path.join(self._data_dir, 'tokenized')
            if not os.path.exists(tokenized_dir):
                os.mkdir(tokenized_dir)
            tokenized_path = os.path.join(tokenized_dir, train_dir)
            logger.info('Tokenizing training data to %s', tokenized_path)
            tokenizer.tokenize_directory(data_path, tokenized_path,
                                         src_tokenizer, tgt_tokenizer,
                                         config['source'], config['target'])
            data_path = tokenized_path
        if not self._support_multi_training_files:
            merged_dir = os.path.join(self._data_dir, 'merged')
            if not os.path.exists(merged_dir):
                os.mkdir(merged_dir)
            merged_path = os.path.join(merged_dir, train_dir)
            logger.info('Merging training data to %s/train.{%s,%s}',
                        merged_path, config['source'], config['target'])
            data.merge_files_in_directory(data_path, merged_path,
                                          config['source'], config['target'])
            data_path = merged_path

        return data_path, num_samples, summary, metadata
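
The method is driven entirely by its configuration dictionary. The sketch below lists the keys read above, taken directly from the code; the concrete values (language suffixes, sample size, tokenization options) are illustrative placeholders only:

# Hypothetical configuration; keys mirror what _generate_training_data reads,
# values are placeholders for illustration.
config = {
    'source': 'en',        # source-side file suffix (cf. the train.{src,tgt} merge log)
    'target': 'fr',        # target-side file suffix
    'data': {
        'train_dir': 'train',    # subdirectory of the corpus directory
        'sample': 100000,        # forwarded to sample() together with...
        'sample_dist': [],       # ...the sampling distribution it expects
    },
    'tokenization': {
        'source': {},      # per-side options for tokenizer.build_tokenizer()
        'target': {},
    },
}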
Example #2
    def _generate_training_data(self, config):
        if 'data' in config and 'train_dir' in config['data']:
            train_dir = config['data']['train_dir']
        else:
            train_dir = 'train'
        data_path = os.path.join(self._corpus_dir, train_dir)
        num_samples = None
        summary = None
        metadata = None
        if 'data' in config and 'sample_dist' in config['data']:
            sample_dir = os.path.join(self._data_dir, 'sample')
            if not os.path.exists(sample_dir):
                os.mkdir(sample_dir)
            sample_path = os.path.join(sample_dir, train_dir)
            logger.info('Sampling training data to %s', sample_path)
            summary, metadata = sample(
                config['data']['sample'],
                config['data']['sample_dist'],
                data_path,
                sample_path,
                config['source'],
                config['target'])
            num_samples = sum(six.itervalues(summary['file']))
            data_path = sample_path
        if 'tokenization' in config:
            tok_config = config['tokenization']
            # Build a tokenizer only for the sides that define their own options.
            src_tokenizer = (tokenizer.build_tokenizer(tok_config['source'])
                             if 'source' in tok_config else None)
            tgt_tokenizer = (tokenizer.build_tokenizer(tok_config['target'])
                             if 'target' in tok_config else None)
            tokenized_dir = os.path.join(self._data_dir, 'tokenized')
            if not os.path.exists(tokenized_dir):
                os.mkdir(tokenized_dir)
            tokenized_path = os.path.join(tokenized_dir, train_dir)
            logger.info('Tokenizing training data to %s', tokenized_path)
            tokenizer.tokenize_directory(
                data_path,
                tokenized_path,
                src_tokenizer,
                tgt_tokenizer,
                config['source'],
                config['target'])
            data_path = tokenized_path

        return data_path, train_dir, num_samples, summary, metadata
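
Compared with Example #1, this variant builds each tokenizer only when a per-side 'source' or 'target' block exists, drops the merging step, and also returns train_dir, so a caller unpacks five values. A minimal sketch of such a call site, assuming it lives in another method of the same (unnamed) class:

        # Hypothetical call site; only the unpacking order comes from the
        # return statement above.
        data_path, train_dir, num_samples, summary, metadata = \
            self._generate_training_data(config)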
Example #3
def generate_preprocessed_data(config, corpus_dir, data_dir):

    # TODO : annotations
    # TODO : file-specific rules/extra

    # For backward compatibility with old relative path configurations.
    train_dir = 'train'
    if 'data' in config:
        if 'train_dir' in config['data']:
            train_dir = config['data']['train_dir']
    else:
        logger.warning('No \'data\' field in configuration, '
                       'default corpus directory and all corpora are used.')

    # Default data path.
    data_path = os.path.join(corpus_dir, train_dir)

    num_samples = None
    summary = None
    metadata = None

    # If some sampling OR preprocessing is applied, change directory.
    if 'data' in config or 'preprocess' in config:

        preprocess_dir = os.path.join(data_dir, 'preprocess')
        if not os.path.exists(preprocess_dir):
            os.mkdir(preprocess_dir)
        if not os.path.isdir(preprocess_dir):
            raise RuntimeError('%s is not a directory' % preprocess_dir)
        logger.info('Generating training data to %s', preprocess_dir)

        # Sample files and write information to a special file structure.
        all_files, summary, metadata = sample(config, data_path)

        num_samples = 0
        for f in all_files:
            lines_filtered = 0
            if f.lines_kept:

                # Default batch size is the whole sample size.
                batch_size = f.lines_kept
                if ('preprocess' in config
                        and 'batch_size' in config['preprocess']):
                    batch_size = config['preprocess']['batch_size']

                loader = prepoperator.FileLoader(f, batch_size)
                pipeline = prepoperator.PreprocessingPipeline()
                # TODO : Initialize FILE-SPECIFIC preprocessor pipeline
                # if 'preprocess' in config:
                # pipeline.add(buildPreprocessPipeline(config['preprocess']))
                # TODO : ultimately, tokenization should be part of the preprocess pipeline
                if 'tokenization' in config:
                    pipeline.add(prepoperator.Tokenizer(
                        config['tokenization']))
                writer = prepoperator.FileWriter(f, preprocess_dir)

                for tu_batch in loader():
                    tu_batch = pipeline(tu_batch)
                    writer(tu_batch)
                    lines_filtered += len(tu_batch)
                    # TODO : parallelization
                f.close_files()
                writer.close_files()

            if lines_filtered != f.lines_kept:
                num_samples += lines_filtered
                summary[f.base_name]["lines_filtered"] = lines_filtered
            else:
                num_samples += f.lines_kept
                summary[f.base_name]["lines_filtered"] = f.lines_kept

        data_path = preprocess_dir

    return data_path, train_dir, num_samples, summary, metadata
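
A minimal sketch of a direct call, assuming '/corpus' holds the raw training files under train_dir and '/data' is a writable working directory; both paths and the config contents are placeholders:

# Hypothetical call; argument order follows the signature above.
data_path, train_dir, num_samples, summary, metadata = generate_preprocessed_data(
    config, '/corpus', '/data')
# When sampling or preprocessing applies, data_path points to /data/preprocess.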
Example #4
def generate_preprocessed_data(config,
                               corpus_dir,
                               data_dir,
                               result='preprocess'):

    # TODO V2 : annotations
    # TODO V2 : file-specific rules/extra

    # For backward compatibility with old relative path configurations.
    train_dir = 'train'
    if 'data' in config:
        if 'train_dir' in config['data']:
            train_dir = config['data']['train_dir']
    else:
        logger.warning('No \'data\' field in configuration, '
                       'default corpus directory and all corpora are used.')

    # Default data path.
    data_path = os.path.join(corpus_dir, train_dir)

    num_samples = None
    summary = None
    metadata = None

    # If some sampling OR preprocessing is applied, change result directory.
    if 'data' in config or 'preprocess' in config:

        result_dir = os.path.join(data_dir, result)
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)
        if not os.path.isdir(result_dir):
            raise RuntimeError('%s is not a directory' % result_dir)
        logger.info('Generating data to %s', result_dir)

        # Sample files and write information to a special file structure.
        all_files, summary, metadata = sample(config, data_path)

        num_samples = 0
        consumer = prepoperator.make_consumer(config, result_dir, result)
        for f in all_files:
            lines_filtered = 0
            if f.lines_kept:

                # Default batch size is the whole sample size.
                batch_size = f.lines_kept
                if ('preprocess' in config
                        and 'batch_size' in config['preprocess']):
                    batch_size = config['preprocess']['batch_size']

                # Loader : load selected lines into batches.
                loader = prepoperator.FileLoader(f, batch_size)

                # Preprocessor : preprocess lines in batch.
                pipeline = prepoperator.PreprocessingPipeline()
                # TODO V2 : Initialize FILE-SPECIFIC preprocessor pipeline
                # if 'preprocess' in config:
                # pipeline.add(buildPreprocessPipeline(config['preprocess']))
                # TODO V2 : ultimately, tokenization should be part of the preprocess pipeline
                if 'tokenization' in config:
                    pipeline.add(prepoperator.Tokenizer(
                        config['tokenization']))

                # Consumer : action after preprocessing.
                # * write lines to file, if simple preprocessing.
                # * feed to subword learner, if building subword model.
                # * add words to vocabulary, if building vocabulary.
                consumer.open_files(f)

                for tu_batch in loader():
                    tu_batch = pipeline(tu_batch)
                    consumer(tu_batch)
                    lines_filtered += len(tu_batch)
                    # TODO V2 : parallelization
                f.close_files()

                consumer.close_files()

            consumer.finalize(config)

            if lines_filtered != f.lines_kept:
                num_samples += lines_filtered
                summary[f.base_name]["lines_filtered"] = lines_filtered
            else:
                num_samples += f.lines_kept
                summary[f.base_name]["lines_filtered"] = f.lines_kept

        data_path = result_dir

    return data_path, train_dir, num_samples, summary, metadata
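
Relative to Example #3, this variant routes the output through a consumer built by prepoperator.make_consumer and exposes the destination name through the result argument (result_dir = data_dir/result). A minimal sketch with placeholder paths, where the overridden result name is purely illustrative:

# Hypothetical call; 'prepared' is an arbitrary placeholder name, not a value
# documented by the snippet above.
data_path, train_dir, num_samples, summary, metadata = generate_preprocessed_data(
    config, '/corpus', '/data', result='prepared')
# data_path then points to /data/prepared when sampling or preprocessing applies.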