def _generate_training_data(self, config):
    """Prepare the training corpus and return where the final files live.

    The pipeline has three optional stages, each writing into its own
    subdirectory of ``self._data_dir`` and advancing the working path:

    1. sampling (when ``config['data']['sample_dist']`` is set),
    2. tokenization (when ``config['tokenization']`` is set),
    3. merging into a single file pair (when the backend does not
       support multiple training files).

    Returns:
        A tuple ``(data_path, num_samples, summary, metadata)``.
        ``num_samples``, ``summary`` and ``metadata`` are ``None`` when
        no sampling distribution is configured.

    Raises:
        RuntimeError: if sampling selected 0 sentences.
    """
    # Resolve the corpus subdirectory, defaulting to 'train'.
    try:
        corpus_subdir = config['data']['train_dir']
    except KeyError:
        corpus_subdir = 'train'
    current_path = os.path.join(self._corpus_dir, corpus_subdir)
    num_samples = None
    summary = None
    metadata = None
    logger.info('Generating training data from %s', current_path)

    data_options = config.get('data', {})

    # Stage 1: sample the corpus according to the configured distribution.
    if 'sample_dist' in data_options:
        sample_root = os.path.join(self._data_dir, 'sample')
        if not os.path.exists(sample_root):
            os.mkdir(sample_root)
        sampled_path = os.path.join(sample_root, corpus_subdir)
        logger.info('Sampling training data to %s', sampled_path)
        summary, metadata = sample(
            data_options['sample'],
            data_options['sample_dist'],
            current_path,
            sampled_path,
            config['source'],
            config['target'])
        # Total sentence count across all sampled files.
        num_samples = sum(six.itervalues(summary['file']))
        # Fail fast: an empty sample would only break training later.
        if num_samples == 0:
            raise RuntimeError('data sampling generated 0 sentences')
        current_path = sampled_path

    # Stage 2: tokenize source and target sides. A side without its own
    # sub-configuration falls back to the shared tokenization config.
    if 'tokenization' in config:
        tok_config = config['tokenization']
        src_tokenizer = tokenizer.build_tokenizer(
            tok_config.get('source', tok_config))
        tgt_tokenizer = tokenizer.build_tokenizer(
            tok_config.get('target', tok_config))
        tokenized_root = os.path.join(self._data_dir, 'tokenized')
        if not os.path.exists(tokenized_root):
            os.mkdir(tokenized_root)
        tokenized_path = os.path.join(tokenized_root, corpus_subdir)
        logger.info('Tokenizing training data to %s', tokenized_path)
        tokenizer.tokenize_directory(
            current_path,
            tokenized_path,
            src_tokenizer,
            tgt_tokenizer,
            config['source'],
            config['target'])
        current_path = tokenized_path

    # Stage 3: collapse per-file corpora into one train.{src,tgt} pair
    # for backends that only accept a single training file.
    if not self._support_multi_training_files:
        merged_root = os.path.join(self._data_dir, 'merged')
        if not os.path.exists(merged_root):
            os.mkdir(merged_root)
        merged_path = os.path.join(merged_root, corpus_subdir)
        logger.info('Merging training data to %s/train.{%s,%s}',
                    merged_path, config['source'], config['target'])
        data.merge_files_in_directory(
            current_path, merged_path, config['source'], config['target'])
        current_path = merged_path

    return current_path, num_samples, summary, metadata
def _generate_training_data(self, config):
    """Prepare the training corpus (optional sampling, optional tokenization).

    The working path starts at ``self._corpus_dir/<train_dir>`` and is
    advanced through up to two stages, each writing into a dedicated
    subdirectory of ``self._data_dir``:

    1. sampling (when ``config['data']['sample_dist']`` is set),
    2. tokenization (when ``config['tokenization']`` is set).

    Args:
        config: run configuration dictionary; reads the keys ``data``
            (``train_dir``, ``sample``, ``sample_dist``), ``tokenization``
            (``source``, ``target``), ``source`` and ``target``.

    Returns:
        A tuple ``(data_path, train_dir, num_samples, summary, metadata)``.
        ``num_samples``, ``summary`` and ``metadata`` are ``None`` when
        no sampling distribution is configured.

    Raises:
        RuntimeError: if sampling selected 0 sentences.
    """
    if 'data' in config and 'train_dir' in config['data']:
        train_dir = config['data']['train_dir']
    else:
        train_dir = 'train'
    data_path = os.path.join(self._corpus_dir, train_dir)
    num_samples = None
    summary = None
    metadata = None
    if 'data' in config and 'sample_dist' in config['data']:
        sample_dir = os.path.join(self._data_dir, 'sample')
        if not os.path.exists(sample_dir):
            os.mkdir(sample_dir)
        sample_path = os.path.join(sample_dir, train_dir)
        logger.info('Sampling training data to %s', sample_path)
        summary, metadata = sample(
            config['data']['sample'],
            config['data']['sample_dist'],
            data_path,
            sample_path,
            config['source'],
            config['target'])
        # Total sentence count across all sampled files.
        num_samples = sum(six.itervalues(summary['file']))
        # Fail fast on an empty sample instead of letting training fail
        # later with an obscure error (matches the sibling implementation
        # of this method elsewhere in this file).
        if num_samples == 0:
            raise RuntimeError('data sampling generated 0 sentences')
        data_path = sample_path
    if 'tokenization' in config:
        tok_config = config['tokenization']
        # NOTE(review): when a side has no sub-configuration this
        # short-circuit leaves the tokenizer as False — presumably
        # tokenize_directory treats a falsy tokenizer as "pass through";
        # confirm against its implementation.
        src_tokenizer = 'source' in tok_config and tokenizer.build_tokenizer(tok_config['source'])
        tgt_tokenizer = 'target' in tok_config and tokenizer.build_tokenizer(tok_config['target'])
        tokenized_dir = os.path.join(self._data_dir, 'tokenized')
        if not os.path.exists(tokenized_dir):
            os.mkdir(tokenized_dir)
        tokenized_path = os.path.join(tokenized_dir, train_dir)
        logger.info('Tokenizing training data to %s', tokenized_path)
        tokenizer.tokenize_directory(
            data_path,
            tokenized_path,
            src_tokenizer,
            tgt_tokenizer,
            config['source'],
            config['target'])
        data_path = tokenized_path
    return data_path, train_dir, num_samples, summary, metadata