    def _merge_multi_training_files(self, data_path, train_dir, source, target):
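        """Merges the training files under `data_path` into a single
        `train.{source,target}` pair under the `merged` data directory
        and returns the merged path."""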
        merged_dir = os.path.join(self._data_dir, 'merged')
        if not os.path.exists(merged_dir):
            os.mkdir(merged_dir)
        merged_path = os.path.join(merged_dir, train_dir)
        logger.info('Merging training data to %s/train.{%s,%s}',
                    merged_path, source, target)
        data_util.merge_files_in_directory(data_path, merged_path, source, target)
        return merged_path
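
    # A sketch of how the helper above might be invoked from a training
    # routine; `config`, `data_path`, and `train_dir` are assumed to
    # match the snippet below:
    #
    #     merged_path = self._merge_multi_training_files(
    #         data_path, train_dir, config['source'], config['target'])
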
    def _generate_training_data(self, config):
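        """Builds the training data for one run.

        Starting from `corpus_dir/train_dir`, this optionally samples
        the corpus, optionally tokenizes it, and finally merges it into
        a single file pair when the framework cannot consume multiple
        training files. Returns the resulting data path together with
        the number of sampled sentences and the sampling summary and
        metadata (all None when sampling is disabled).
        """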
        train_dir = config.get('data', {}).get('train_dir', 'train')
        data_path = os.path.join(self._corpus_dir, train_dir)
        num_samples = None
        summary = None
        metadata = None
        logger.info('Generating training data from %s', data_path)
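
        # Optional sampling stage: when a sample distribution is
        # configured, write a sampled copy of the corpus and continue
        # from that copy instead of the raw files.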
        if 'data' in config and 'sample_dist' in config['data']:
            sample_dir = os.path.join(self._data_dir, 'sample')
            if not os.path.exists(sample_dir):
                os.mkdir(sample_dir)
            sample_path = os.path.join(sample_dir, train_dir)
            logger.info('Sampling training data to %s', sample_path)
            summary, metadata = sample(config['data']['sample'],
                                       config['data']['sample_dist'],
                                       data_path, sample_path,
                                       config['source'], config['target'])
            num_samples = sum(six.itervalues(summary['file']))
            if num_samples == 0:
                raise RuntimeError('data sampling generated 0 sentences')
            data_path = sample_path
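
        # Optional tokenization stage: each side may define its own
        # tokenizer options; otherwise the shared tokenization block is
        # used for both source and target.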
        if 'tokenization' in config:
            tok_config = config['tokenization']
            src_tokenizer = tokenizer.build_tokenizer(
                tok_config.get('source', tok_config))
            tgt_tokenizer = tokenizer.build_tokenizer(
                tok_config.get('target', tok_config))
            tokenized_dir = os.path.join(self._data_dir, 'tokenized')
            if not os.path.exists(tokenized_dir):
                os.mkdir(tokenized_dir)
            tokenized_path = os.path.join(tokenized_dir, train_dir)
            logger.info('Tokenizing training data to %s', tokenized_path)
            tokenizer.tokenize_directory(data_path, tokenized_path,
                                         src_tokenizer, tgt_tokenizer,
                                         config['source'], config['target'])
            data_path = tokenized_path
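
        # Merge stage: frameworks that cannot consume a directory of
        # training files are given a single train.{source,target} pair.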
        if not self._support_multi_training_files:
            merged_dir = os.path.join(self._data_dir, 'merged')
            if not os.path.exists(merged_dir):
                os.mkdir(merged_dir)
            merged_path = os.path.join(merged_dir, train_dir)
            logger.info('Merging training data to %s/train.{%s,%s}',
                        merged_path, config['source'], config['target'])
            data.merge_files_in_directory(data_path, merged_path,
                                          config['source'], config['target'])
            data_path = merged_path

        return data_path, num_samples, summary, metadata
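
# A minimal sketch of the configuration shape `_generate_training_data`
# reads. Only the keys are taken from the code above; every value below
# is an illustrative assumption, not the project's real schema.
example_config = {
    'source': 'en',                    # assumed language codes
    'target': 'fr',
    'data': {
        'train_dir': 'train',          # subdirectory under corpus_dir
        'sample': 100000,              # assumed: passed straight to sample()
        'sample_dist': [...],          # shape defined by the sample() helper
    },
    'tokenization': {                  # optional; per-side keys also optional
        'source': {'mode': 'aggressive'},   # assumed tokenizer options
        'target': {'mode': 'aggressive'},
    },
}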