def sample_normalize(self, k_samples=1000, overwrite=False):
    """Estimate the mean and std of the features from the training set.

    Per-dimension sums and sums of squares are accumulated clip by clip,
    then turned into a mean and a standard deviation, stored on
    ``self.feats_mean`` / ``self.feats_std`` and written with ``np.savetxt``.

    Params:
        k_samples (int): Use this number of samples for estimation. A
            negative value uses every path in ``self.audio_paths``.
        overwrite (bool): Forwarded unchanged to ``self.featurize``.

    Raises:
        ValueError: if there is no audio path to estimate from.
    """
    # if k_samples is negative then it goes through total dataset
    if k_samples < 0:
        audio_paths = list(self.audio_paths)
    # using sample
    else:
        k_samples = min(k_samples, len(self.train_audio_paths))
        audio_paths = self.rng.sample(self.train_audio_paths, k_samples)
    if not audio_paths:
        raise ValueError(
            "sample_normalize needs at least one audio path to estimate from")

    feat_sum = None
    feat_squared_sum = None
    count = 0.0
    for audio_clip in audio_paths:
        feat = self.featurize(audio_clip=audio_clip, overwrite=overwrite)
        # Reduce every clip (including the first one) to a single row so the
        # result has the same (1, dim) shape no matter how many clips are used.
        clip_sum = np.sum(feat, axis=0, keepdims=True)
        clip_squared_sum = np.sum(np.square(feat), axis=0, keepdims=True)
        if feat_sum is None:
            feat_sum = clip_sum
            feat_squared_sum = clip_squared_sum
        else:
            feat_sum = feat_sum + clip_sum
            feat_squared_sum = feat_squared_sum + clip_squared_sum
        count += float(feat.shape[0])

    self.feats_mean = feat_sum / count
    # std = sqrt(E[x^2] - E[x]^2)
    self.feats_std = np.sqrt(
        feat_squared_sum / count - np.square(self.feats_mean))
    np.savetxt(
        generate_file_path(self.save_dir, self.model_name, 'feats_mean'),
        self.feats_mean)
    np.savetxt(
        generate_file_path(self.save_dir, self.model_name, 'feats_std'),
        self.feats_std)
def sample_normalize(self, k_samples=1000, overwrite=False):
    """Estimate the mean and std of the features from the training set.

    The selected audio paths are cut into contiguous chunks, one worker
    process per chunk runs ``self.preprocess_sample_normalize``, and the
    partial results the workers put into a shared dict (``'feat'``,
    ``'feat_squared'`` and ``'count'`` entries) are combined into
    ``self.feats_mean`` / ``self.feats_std``, which are also written with
    ``np.savetxt``.

    Params:
        k_samples (int): Use this number of samples for estimation. A
            negative value uses every path in ``self.audio_paths``.
        overwrite (bool): Forwarded unchanged to the worker.

    Raises:
        ValueError: if there is no audio path to estimate from.
    """
    log = LogUtil().getlogger()
    log.info("Calculating mean and std from samples")
    # if k_samples is negative then it goes through total dataset
    if k_samples < 0:
        audio_paths = self.audio_paths
    # using sample
    else:
        k_samples = min(k_samples, len(self.train_audio_paths))
        audio_paths = self.rng.sample(self.train_audio_paths, k_samples)
    if len(audio_paths) == 0:
        # Without this guard num_processes would be 0 and the chunk size
        # computation below would divide by zero.
        raise ValueError(
            "sample_normalize needs at least one audio path to estimate from")

    num_processes = min(len(audio_paths), cpu_count())
    split_size = int(
        math.ceil(float(len(audio_paths)) / float(num_processes)))
    audio_paths_split = [
        audio_paths[start:start + split_size]
        for start in range(0, len(audio_paths), split_size)
    ]

    manager = Manager()
    return_dict = manager.dict()
    jobs = []
    # Rounding the chunk size up can yield fewer chunks than num_processes
    # (e.g. 5 paths on 4 cores -> 3 chunks), so start one worker per chunk
    # instead of indexing the chunk list by process number.
    for thread_index, paths_chunk in enumerate(audio_paths_split):
        proc = Process(target=self.preprocess_sample_normalize,
                       args=(thread_index, paths_chunk, overwrite,
                             return_dict))
        jobs.append(proc)
        proc.start()
    for proc in jobs:
        proc.join()

    # Fetch the partial results once; each access to the proxy is a round
    # trip to the manager process.
    results = list(return_dict.values())
    feat = np.sum(np.vstack([item['feat'] for item in results]), axis=0)
    count = sum(item['count'] for item in results)
    feat_squared = np.sum(
        np.vstack([item['feat_squared'] for item in results]), axis=0)

    self.feats_mean = feat / float(count)
    # std = sqrt(E[x^2] - E[x]^2)
    self.feats_std = np.sqrt(feat_squared / float(count) -
                             np.square(self.feats_mean))
    np.savetxt(
        generate_file_path(self.save_dir, self.model_name, 'feats_mean'),
        self.feats_mean)
    np.savetxt(
        generate_file_path(self.save_dir, self.model_name, 'feats_std'),
        self.feats_std)
    log.info("End calculating mean and std from samples")
def sample_normalize(self, k_samples=1000, overwrite=False):
    """Estimate the mean and std of the features from the training set.

    Starts ``cpu_count()`` worker processes running
    ``self.preprocess_sample_normalize``, then combines the ``'feat'``,
    ``'feat_squared'`` and ``'count'`` entries the workers left in a shared
    dict into ``self.feats_mean`` / ``self.feats_std`` and saves both with
    ``np.savetxt``.

    Params:
        k_samples (int): Use this number of samples for estimation. A
            negative value uses every path in ``self.audio_paths``.
        overwrite (bool): Forwarded unchanged to the worker.
    """
    logger = LogUtil().getlogger()
    logger.info("Calculating mean and std from samples")

    if k_samples >= 0:
        # using sample
        k_samples = min(k_samples, len(self.train_audio_paths))
        audio_paths = self.rng.sample(self.train_audio_paths, k_samples)
    else:
        # a negative k_samples goes through the total dataset
        audio_paths = self.audio_paths

    # Keep the manager referenced for the whole function: the shared dict is
    # only a proxy to it.
    manager = Manager()
    shared_results = manager.dict()

    # NOTE(review): every worker is handed the complete path list together
    # with its index; presumably the worker picks its own share based on that
    # index - confirm in preprocess_sample_normalize.
    workers = []
    for worker_index in range(cpu_count()):
        worker = Process(target=self.preprocess_sample_normalize,
                         args=(worker_index, audio_paths, overwrite,
                               shared_results))
        workers.append(worker)
        worker.start()
    for worker in workers:
        worker.join()

    def stacked_total(key):
        # Column-wise sum over the partial arrays of all workers.
        partials = [entry[key] for entry in shared_results.values()]
        return np.sum(np.vstack(partials), axis=0)

    feat_total = stacked_total('feat')
    frame_total = sum([entry['count'] for entry in shared_results.values()])
    feat_squared_total = stacked_total('feat_squared')

    self.feats_mean = feat_total / float(frame_total)
    mean_of_squares = feat_squared_total / float(frame_total)
    self.feats_std = np.sqrt(mean_of_squares - np.square(self.feats_mean))

    mean_path = generate_file_path(self.save_dir, self.model_name,
                                   'feats_mean')
    np.savetxt(mean_path, self.feats_mean)
    std_path = generate_file_path(self.save_dir, self.model_name,
                                  'feats_std')
    np.savetxt(std_path, self.feats_std)
    logger.info("End calculating mean and std from samples")
def load_data(args):
    """Create the data iterators selected by ``common.mode`` in the config.

    * ``predict``: loads ``data.test_json`` and the saved feature mean/std.
    * ``train``: loads train and validation json and recomputes the feature
      mean/std via ``DataGenerator.sample_normalize``.
    * ``load``: loads train and validation json and the saved mean/std.

    ``arch.max_t_count`` and ``arch.max_label_length`` are written back into
    ``args.config`` before the architecture module's ``prepare_data`` runs.

    Returns:
        ``(data_loaded, args)`` in ``predict`` mode, otherwise
        ``(data_loaded, validation_loaded, args)``.

    Raises:
        Exception: if the mode is not train, predict or load.
        Warning: if ``batch_size`` is 1 while ``arch.is_batchnorm`` is set.
    """
    mode = args.config.get('common', 'mode')
    # Reject an unknown mode before any data is read from disk.
    if mode not in ('train', 'predict', 'load'):
        raise Exception(
            'Define mode in the cfg file first. train or predict or load can be the candidate for the mode.')
    is_predict = (mode == 'predict')

    batch_size = args.config.getint('common', 'batch_size')

    whcs = WHCS()
    whcs.width = args.config.getint('data', 'width')
    whcs.height = args.config.getint('data', 'height')
    whcs.channel = args.config.getint('data', 'channel')
    whcs.stride = args.config.getint('data', 'stride')
    save_dir = 'checkpoints'
    model_name = args.config.get('common', 'prefix')

    if is_predict:
        test_json = args.config.get('data', 'test_json')
        datagen = DataGenerator(save_dir=save_dir, model_name=model_name)
        datagen.load_train_data(test_json)
    else:
        data_json = args.config.get('data', 'train_json')
        val_json = args.config.get('data', 'val_json')
        datagen = DataGenerator(save_dir=save_dir, model_name=model_name)
        datagen.load_train_data(data_json)
        datagen.load_validation_data(val_json)

    if mode == 'train':
        normalize_target_k = args.config.getint('train', 'normalize_target_k')
        datagen.sample_normalize(normalize_target_k, True)
    else:
        # get feat_mean and feat_std to normalize dataset
        datagen.get_meta_from_file(
            np.loadtxt(generate_file_path(save_dir, model_name, 'feats_mean')),
            np.loadtxt(generate_file_path(save_dir, model_name, 'feats_std')))

    is_batchnorm = args.config.getboolean('arch', 'is_batchnorm')
    if batch_size == 1 and is_batchnorm:
        raise Warning('batch size 1 is too small for is_batchnorm')

    stats_partition = "test" if is_predict else "train"
    max_t_count = datagen.get_max_seq_length(partition=stats_partition)
    max_label_length = datagen.get_max_label_length(partition=stats_partition)

    args.config.set('arch', 'max_t_count', str(max_t_count))
    args.config.set('arch', 'max_label_length', str(max_label_length))
    from importlib import import_module
    prepare_data_template = import_module(args.config.get('arch', 'arch_file'))
    init_states = prepare_data_template.prepare_data(args)

    # sort file paths by its duration in ascending order to implement sortaGrad
    sort_by_duration = (mode == "train")
    shuffle = not sort_by_duration

    data_loaded = STTIter(partition="train", count=datagen.count,
                          datagen=datagen, batch_size=batch_size,
                          num_label=max_label_length, init_states=init_states,
                          seq_length=max_t_count, width=whcs.width,
                          height=whcs.height,
                          sort_by_duration=sort_by_duration, shuffle=shuffle)
    if is_predict:
        return data_loaded, args

    validation_loaded = STTIter(partition="validation",
                                count=datagen.val_count, datagen=datagen,
                                batch_size=batch_size,
                                num_label=max_label_length,
                                init_states=init_states,
                                seq_length=max_t_count, width=whcs.width,
                                height=whcs.height, sort_by_duration=True,
                                shuffle=False)
    return data_loaded, validation_loaded, args
def load_data(args):
    """Create the (optionally bucketed) data iterators for the configured mode.

    * ``train`` / ``load``: loads train and validation json, prepares the
      label set (regenerating the bi-graphemes dictionary when it is missing
      or ``train.overwrite_bi_graphemes_dictionary`` is set) and either
      recomputes or reads the feature mean/std.
    * ``predict``: loads ``data.test_json``, the label set and the saved
      feature mean/std.

    ``arch.n_classes``, ``arch.max_t_count`` and ``arch.max_label_length``
    are written back into ``args.config``.

    Returns:
        ``(data_loaded, validation_loaded, args)`` for train/load,
        ``(data_loaded, args)`` for predict.

    Raises:
        Exception: if the mode is not train, predict or load.
        Warning: if ``batch_size`` is 1 with batch norm in train/load mode.
    """
    mode = args.config.get('common', 'mode')
    if mode not in ['train', 'predict', 'load']:
        raise Exception(
            'mode must be the one of the followings - train,predict,load')
    is_training = mode in ('train', 'load')
    batch_size = args.config.getint('common', 'batch_size')

    whcs = WHCS()
    whcs.width = args.config.getint('data', 'width')
    whcs.height = args.config.getint('data', 'height')
    whcs.channel = args.config.getint('data', 'channel')
    whcs.stride = args.config.getint('data', 'stride')
    save_dir = 'checkpoints'
    model_name = args.config.get('common', 'prefix')
    is_bi_graphemes = args.config.getboolean('common', 'is_bi_graphemes')
    overwrite_meta_files = args.config.getboolean('train',
                                                  'overwrite_meta_files')
    overwrite_bi_graphemes_dictionary = args.config.getboolean(
        'train', 'overwrite_bi_graphemes_dictionary')
    max_duration = args.config.getfloat('data', 'max_duration')
    language = args.config.get('data', 'language')

    log = LogUtil().getlogger()
    labelUtil = LabelUtil.getInstance()

    if is_training:
        data_json = args.config.get('data', 'train_json')
        val_json = args.config.get('data', 'val_json')
        datagen = DataGenerator(save_dir=save_dir, model_name=model_name)
        datagen.load_train_data(data_json, max_duration=max_duration)
        datagen.load_validation_data(val_json, max_duration=max_duration)
        if is_bi_graphemes:
            if not os.path.isfile(
                    "resources/unicodemap_en_baidu_bi_graphemes.csv"
            ) or overwrite_bi_graphemes_dictionary:
                # The single-grapheme label set is loaded before the
                # bi-graphemes dictionary is generated.
                load_labelutil(labelUtil=labelUtil, is_bi_graphemes=False,
                               language=language)
                generate_bi_graphemes_dictionary(datagen.train_texts +
                                                 datagen.val_texts)
        load_labelutil(labelUtil=labelUtil, is_bi_graphemes=is_bi_graphemes,
                       language=language)
        args.config.set('arch', 'n_classes', str(labelUtil.get_count()))

        if mode == "train" and overwrite_meta_files:
            log.info("Generate mean and std from samples")
            normalize_target_k = args.config.getint('train',
                                                    'normalize_target_k')
            datagen.sample_normalize(normalize_target_k, True)
        else:
            if mode == "train":
                log.info("Read mean and std from meta files")
            # get feat_mean and feat_std to normalize dataset
            datagen.get_meta_from_file(
                np.loadtxt(
                    generate_file_path(save_dir, model_name, 'feats_mean')),
                np.loadtxt(
                    generate_file_path(save_dir, model_name, 'feats_std')))
    else:
        test_json = args.config.get('data', 'test_json')
        datagen = DataGenerator(save_dir=save_dir, model_name=model_name)
        datagen.load_train_data(test_json, max_duration=max_duration)
        # Use the configured language, as the train/load branch does,
        # instead of a hard-coded "en".
        load_labelutil(labelUtil, is_bi_graphemes, language=language)
        args.config.set('arch', 'n_classes', str(labelUtil.get_count()))
        datagen.get_meta_from_file(
            np.loadtxt(generate_file_path(save_dir, model_name, 'feats_mean')),
            np.loadtxt(generate_file_path(save_dir, model_name, 'feats_std')))

    is_batchnorm = args.config.getboolean('arch', 'is_batchnorm')
    if batch_size == 1 and is_batchnorm and is_training:
        raise Warning('batch size 1 is too small for is_batchnorm')

    stats_partition = "train" if is_training else "test"
    max_t_count = datagen.get_max_seq_length(partition=stats_partition)
    max_label_length = datagen.get_max_label_length(
        partition=stats_partition, is_bi_graphemes=is_bi_graphemes)

    args.config.set('arch', 'max_t_count', str(max_t_count))
    args.config.set('arch', 'max_label_length', str(max_label_length))
    from importlib import import_module
    prepare_data_template = import_module(args.config.get('arch', 'arch_file'))
    init_states = prepare_data_template.prepare_data(args)

    # sort file paths by its duration in ascending order to implement sortaGrad
    sort_by_duration = (mode == "train")
    is_bucketing = args.config.getboolean('arch', 'is_bucketing')
    save_feature_as_csvfile = args.config.getboolean(
        'train', 'save_feature_as_csvfile')

    # Arguments shared by the train and validation iterators.
    iter_kwargs = dict(datagen=datagen,
                       batch_size=batch_size,
                       num_label=max_label_length,
                       init_states=init_states,
                       seq_length=max_t_count,
                       width=whcs.width,
                       height=whcs.height,
                       is_bi_graphemes=is_bi_graphemes,
                       save_feature_as_csvfile=save_feature_as_csvfile)
    if is_bucketing:
        iter_class = BucketSTTIter
        iter_kwargs['buckets'] = json.loads(args.config.get('arch', 'buckets'))
    else:
        iter_class = STTIter

    data_loaded = iter_class(partition="train", count=datagen.count,
                             sort_by_duration=sort_by_duration, **iter_kwargs)
    if not is_training:
        return data_loaded, args

    validation_loaded = iter_class(partition="validation",
                                   count=datagen.val_count,
                                   sort_by_duration=False, **iter_kwargs)
    return data_loaded, validation_loaded, args
def load_data(args):
    """Create the data iterators selected by ``common.mode`` in the config.

    * ``predict``: loads ``data.test_json`` and the saved feature mean/std.
    * ``train`` / ``load``: loads the train json, loads the English label
      set (bi-graphemes or plain), writes ``arch.n_classes``, obtains the
      feature mean/std and finally loads the validation json.

    ``arch.max_t_count`` and ``arch.max_label_length`` are written back into
    ``args.config`` before the architecture module's ``prepare_data`` runs.

    Returns:
        ``(data_loaded, args)`` in ``predict`` mode, otherwise
        ``(data_loaded, validation_loaded, args)``.

    Raises:
        Exception: if the mode is not train, predict or load, if the
            language is not ``"en"``, or if the bi-graphemes map cannot be
            loaded.
        Warning: if ``batch_size`` is 1 while ``arch.is_batchnorm`` is set.
    """
    mode = args.config.get('common', 'mode')
    # Reject an unknown mode first; every later branch relies on it.
    if mode not in ('train', 'predict', 'load'):
        raise Exception(
            'Define mode in the cfg file first. train or predict or load can be the candidate for the mode.'
        )
    is_predict = (mode == 'predict')
    batch_size = args.config.getint('common', 'batch_size')

    whcs = WHCS()
    whcs.width = args.config.getint('data', 'width')
    whcs.height = args.config.getint('data', 'height')
    whcs.channel = args.config.getint('data', 'channel')
    whcs.stride = args.config.getint('data', 'stride')
    save_dir = 'checkpoints'
    model_name = args.config.get('common', 'prefix')
    is_bi_graphemes = args.config.getboolean('common', 'is_bi_graphemes')
    overwrite_meta_files = args.config.getboolean('train',
                                                  'overwrite_meta_files')

    if is_predict:
        test_json = args.config.get('data', 'test_json')
        datagen = DataGenerator(save_dir=save_dir, model_name=model_name)
        datagen.load_train_data(test_json)
        datagen.get_meta_from_file(
            np.loadtxt(generate_file_path(save_dir, model_name, 'feats_mean')),
            np.loadtxt(generate_file_path(save_dir, model_name, 'feats_std')))
    else:
        data_json = args.config.get('data', 'train_json')
        val_json = args.config.get('data', 'val_json')
        datagen = DataGenerator(save_dir=save_dir, model_name=model_name)
        datagen.load_train_data(data_json)

        language = args.config.get('data', 'language')
        if overwrite_meta_files and is_bi_graphemes:
            generate_bi_graphemes_dictionary(datagen.train_texts)

        labelUtil = LabelUtil.getInstance()
        if language != "en":
            raise Exception("Error: Language Type: %s" % language)
        if is_bi_graphemes:
            try:
                labelUtil.load_unicode_set(
                    "resources/unicodemap_en_baidu_bi_graphemes.csv")
            except Exception:
                # Deliberately not a bare ``except``: KeyboardInterrupt and
                # SystemExit must not be turned into this message.
                raise Exception(
                    "There is no resources/unicodemap_en_baidu_bi_graphemes.csv. Please set overwrite_meta_files at train section True"
                )
        else:
            labelUtil.load_unicode_set("resources/unicodemap_en_baidu.csv")
        args.config.set('arch', 'n_classes', str(labelUtil.get_count()))

        if mode == "train" and overwrite_meta_files:
            normalize_target_k = args.config.getint('train',
                                                    'normalize_target_k')
            datagen.sample_normalize(normalize_target_k, True)
        else:
            # get feat_mean and feat_std to normalize dataset
            datagen.get_meta_from_file(
                np.loadtxt(
                    generate_file_path(save_dir, model_name, 'feats_mean')),
                np.loadtxt(
                    generate_file_path(save_dir, model_name, 'feats_std')))
        datagen.load_validation_data(val_json)

    is_batchnorm = args.config.getboolean('arch', 'is_batchnorm')
    if batch_size == 1 and is_batchnorm:
        raise Warning('batch size 1 is too small for is_batchnorm')

    stats_partition = "test" if is_predict else "train"
    max_t_count = datagen.get_max_seq_length(partition=stats_partition)
    max_label_length = datagen.get_max_label_length(
        partition=stats_partition, is_bi_graphemes=is_bi_graphemes)

    args.config.set('arch', 'max_t_count', str(max_t_count))
    args.config.set('arch', 'max_label_length', str(max_label_length))
    from importlib import import_module
    prepare_data_template = import_module(args.config.get('arch', 'arch_file'))
    init_states = prepare_data_template.prepare_data(args)

    # sort file paths by its duration in ascending order to implement sortaGrad
    sort_by_duration = (mode == "train")

    data_loaded = STTIter(partition="train", count=datagen.count,
                          datagen=datagen, batch_size=batch_size,
                          num_label=max_label_length, init_states=init_states,
                          seq_length=max_t_count, width=whcs.width,
                          height=whcs.height,
                          sort_by_duration=sort_by_duration,
                          is_bi_graphemes=is_bi_graphemes)
    if is_predict:
        return data_loaded, args

    validation_loaded = STTIter(partition="validation",
                                count=datagen.val_count, datagen=datagen,
                                batch_size=batch_size,
                                num_label=max_label_length,
                                init_states=init_states,
                                seq_length=max_t_count, width=whcs.width,
                                height=whcs.height, sort_by_duration=False,
                                is_bi_graphemes=is_bi_graphemes)
    return data_loaded, validation_loaded, args
def __init__(self, args):
    """Set up the data generator, the bucketing module and the LM scorer.

    Reads the run settings from ``args.config``, loads the saved feature
    mean/std, writes ``arch.max_t_count``, ``arch.max_label_length`` and
    ``arch.n_classes`` back into the config, loads the checkpoint, binds a
    ``BucketingModule`` for inference on every configured bucket, and finally
    creates a language-model scorer (``swig_wrapper.Scorer`` when it can be
    imported, otherwise ``kenlm.Model(...).score``).
    """
    self.args = args
    cfg = self.args.config

    # settings from the common/arch sections
    self.mode = cfg.get('common', 'mode')
    self.contexts = parse_contexts(self.args)
    self.num_gpu = len(self.contexts)
    self.batch_size = cfg.getint('common', 'batch_size')
    self.is_batchnorm = cfg.getboolean('arch', 'is_batchnorm')
    self.is_bucketing = cfg.getboolean('arch', 'is_bucketing')

    # log current config
    self.config_logger = ConfigLogger(log)
    self.config_logger(args.config)

    # data generator with the stored normalisation statistics
    save_dir = 'checkpoints'
    model_name = cfg.get('common', 'prefix')
    max_freq = cfg.getint('data', 'max_freq')
    self.datagen = DataGenerator(save_dir=save_dir,
                                 model_name=model_name,
                                 max_freq=max_freq)
    mean_file = generate_file_path(save_dir, model_name, 'feats_mean')
    feats_mean = np.loadtxt(mean_file)
    std_file = generate_file_path(save_dir, model_name, 'feats_std')
    feats_std = np.loadtxt(std_file)
    self.datagen.get_meta_from_file(feats_mean, feats_std)

    # the last configured bucket is used as the default bucket key
    self.buckets = json.loads(cfg.get('arch', 'buckets'))
    largest_bucket = self.buckets[-1]
    cfg.set('arch', 'max_t_count', str(largest_bucket))
    cfg.set('arch', 'max_label_length', str(100))

    # label set -> number of output classes
    self.labelUtil = LabelUtil()
    is_bi_graphemes = cfg.getboolean('common', 'is_bi_graphemes')
    load_labelutil(self.labelUtil, is_bi_graphemes, language="zh")
    cfg.set('arch', 'n_classes', str(self.labelUtil.get_count()))
    self.max_t_count = cfg.getint('arch', 'max_t_count')

    # load model
    loaded = load_model(self.args)
    self.model_loaded, self.model_num_epoch, self.model_path = loaded
    checkpoint = mx.model.load_checkpoint(self.model_path,
                                          self.model_num_epoch)
    _symbol, self.arg_params, self.aux_params = checkpoint

    module = mx.mod.BucketingModule(sym_gen=self.model_loaded,
                                    context=self.contexts,
                                    default_bucket_key=largest_bucket)

    from importlib import import_module
    arch_module = import_module(cfg.get('arch', 'arch_file'))
    self.init_states = arch_module.prepare_data(self.args)
    self.width = cfg.getint('data', 'width')
    self.height = cfg.getint('data', 'height')

    default_shapes = [
        ('data',
         (self.batch_size, largest_bucket, self.width * self.height))
    ] + self.init_states
    module.bind(data_shapes=default_shapes, for_training=False)
    module.set_params(self.arg_params,
                      self.aux_params,
                      allow_extra=True,
                      allow_missing=True)

    # switch to every bucket once with its own data shapes
    for bucket_key in self.buckets:
        bucket_shapes = [
            ('data', (self.batch_size, bucket_key, self.width * self.height))
        ] + self.init_states
        module.switch_bucket(bucket_key=bucket_key, data_shapes=bucket_shapes)
    self.model = module

    # language-model scorer; an ImportError anywhere in this block falls
    # back to plain kenlm
    try:
        from swig_wrapper import Scorer
        vocab_list = [
            chars.encode("utf-8") for chars in self.labelUtil.byList
        ]
        log.info("vacab_list len is %d" % len(vocab_list))
        kenlm_path = cfg.get('common', 'kenlm')
        ext_scorer = Scorer(0.26, 0.1, kenlm_path, vocab_list)
        lm_char_based = ext_scorer.is_character_based()
        lm_max_order = ext_scorer.get_max_order()
        lm_dict_size = ext_scorer.get_dict_size()
        log.info("language model: "
                 "is_character_based = %d," % lm_char_based +
                 " max_order = %d," % lm_max_order +
                 " dict_size = %d" % lm_dict_size)
        self.scorer = ext_scorer
    except ImportError:
        import kenlm
        kenlm_model = kenlm.Model(cfg.get('common', 'kenlm'))
        self.scorer = kenlm_model.score
def load_data(args):
    """Create the data iterators selected by ``common.mode`` in the config.

    The English label set (bi-graphemes or plain) is loaded for every mode
    and ``arch.n_classes`` is written into the config. Then:

    * ``predict``: loads ``data.test_json`` and the saved feature mean/std.
    * ``train`` / ``load``: loads the train json, obtains the feature
      mean/std and loads the validation json.

    ``arch.max_t_count`` and ``arch.max_label_length`` are written back into
    ``args.config`` before the architecture module's ``prepare_data`` runs.

    Returns:
        ``(data_loaded, args)`` in ``predict`` mode, otherwise
        ``(data_loaded, validation_loaded, args)``.

    Raises:
        Exception: if the mode is not train, predict or load, if the
            language is not ``"en"``, or if the bi-graphemes map cannot be
            loaded.
        Warning: if ``batch_size`` is 1 while ``arch.is_batchnorm`` is set.
    """
    mode = args.config.get('common', 'mode')
    # Reject an unknown mode first; every later branch relies on it.
    if mode not in ('train', 'predict', 'load'):
        raise Exception(
            'Define mode in the cfg file first. train or predict or load can be the candidate for the mode.')
    is_predict = (mode == 'predict')
    batch_size = args.config.getint('common', 'batch_size')

    whcs = WHCS()
    whcs.width = args.config.getint('data', 'width')
    whcs.height = args.config.getint('data', 'height')
    whcs.channel = args.config.getint('data', 'channel')
    whcs.stride = args.config.getint('data', 'stride')
    save_dir = 'checkpoints'
    model_name = args.config.get('common', 'prefix')
    is_bi_graphemes = args.config.getboolean('common', 'is_bi_graphemes')
    overwrite_meta_files = args.config.getboolean('train',
                                                  'overwrite_meta_files')
    language = args.config.get('data', 'language')

    labelUtil = LabelUtil.getInstance()
    if language != "en":
        raise Exception("Error: Language Type: %s" % language)
    if is_bi_graphemes:
        try:
            labelUtil.load_unicode_set(
                "resources/unicodemap_en_baidu_bi_graphemes.csv")
        except Exception:
            # Deliberately not a bare ``except``: KeyboardInterrupt and
            # SystemExit must not be turned into this message.
            raise Exception(
                "There is no resources/unicodemap_en_baidu_bi_graphemes.csv. Please set overwrite_meta_files at train section True")
    else:
        labelUtil.load_unicode_set("resources/unicodemap_en_baidu.csv")
    args.config.set('arch', 'n_classes', str(labelUtil.get_count()))

    if is_predict:
        test_json = args.config.get('data', 'test_json')
        datagen = DataGenerator(save_dir=save_dir, model_name=model_name)
        datagen.load_train_data(test_json)
        datagen.get_meta_from_file(
            np.loadtxt(generate_file_path(save_dir, model_name, 'feats_mean')),
            np.loadtxt(generate_file_path(save_dir, model_name, 'feats_std')))
    else:
        data_json = args.config.get('data', 'train_json')
        val_json = args.config.get('data', 'val_json')
        datagen = DataGenerator(save_dir=save_dir, model_name=model_name)
        datagen.load_train_data(data_json)

        # NOTE(review): the bi-graphemes dictionary is generated after the
        # label set was already loaded above and nothing visible here reloads
        # it - confirm whether n_classes can be stale on the first run.
        if overwrite_meta_files and is_bi_graphemes:
            generate_bi_graphemes_dictionary(datagen.train_texts)
        args.config.set('arch', 'n_classes', str(labelUtil.get_count()))

        if mode == "train" and overwrite_meta_files:
            normalize_target_k = args.config.getint('train',
                                                    'normalize_target_k')
            datagen.sample_normalize(normalize_target_k, True)
        else:
            # get feat_mean and feat_std to normalize dataset
            datagen.get_meta_from_file(
                np.loadtxt(
                    generate_file_path(save_dir, model_name, 'feats_mean')),
                np.loadtxt(
                    generate_file_path(save_dir, model_name, 'feats_std')))
        datagen.load_validation_data(val_json)

    is_batchnorm = args.config.getboolean('arch', 'is_batchnorm')
    if batch_size == 1 and is_batchnorm:
        raise Warning('batch size 1 is too small for is_batchnorm')

    stats_partition = "test" if is_predict else "train"
    max_t_count = datagen.get_max_seq_length(partition=stats_partition)
    max_label_length = datagen.get_max_label_length(
        partition=stats_partition, is_bi_graphemes=is_bi_graphemes)

    args.config.set('arch', 'max_t_count', str(max_t_count))
    args.config.set('arch', 'max_label_length', str(max_label_length))
    from importlib import import_module
    prepare_data_template = import_module(args.config.get('arch', 'arch_file'))
    init_states = prepare_data_template.prepare_data(args)

    # sort file paths by its duration in ascending order to implement sortaGrad
    sort_by_duration = (mode == "train")

    data_loaded = STTIter(partition="train", count=datagen.count,
                          datagen=datagen, batch_size=batch_size,
                          num_label=max_label_length, init_states=init_states,
                          seq_length=max_t_count, width=whcs.width,
                          height=whcs.height,
                          sort_by_duration=sort_by_duration,
                          is_bi_graphemes=is_bi_graphemes)
    if is_predict:
        return data_loaded, args

    validation_loaded = STTIter(partition="validation",
                                count=datagen.val_count, datagen=datagen,
                                batch_size=batch_size,
                                num_label=max_label_length,
                                init_states=init_states,
                                seq_length=max_t_count, width=whcs.width,
                                height=whcs.height, sort_by_duration=False,
                                is_bi_graphemes=is_bi_graphemes)
    return data_loaded, validation_loaded, args
def load_data(args):
    """Create the (optionally bucketed) data iterators for the configured mode.

    * ``train`` / ``load``: loads train and validation json, prepares the
      label set (regenerating the bi-graphemes dictionary when it is missing
      or ``train.overwrite_bi_graphemes_dictionary`` is set) and either
      recomputes or reads the feature mean/std.
    * ``predict``: loads ``data.test_json``, the label set and the saved
      feature mean/std.

    ``arch.n_classes``, ``arch.max_t_count`` and ``arch.max_label_length``
    are written back into ``args.config``.

    Returns:
        ``(data_loaded, validation_loaded, args)`` for train/load,
        ``(data_loaded, args)`` for predict.

    Raises:
        Exception: if the mode is not train, predict or load.
        Warning: if ``batch_size`` is 1 with batch norm in train/load mode.
    """
    mode = args.config.get('common', 'mode')
    if mode not in ['train', 'predict', 'load']:
        raise Exception('mode must be the one of the followings - train,predict,load')
    batch_size = args.config.getint('common', 'batch_size')

    whcs = WHCS()
    whcs.width = args.config.getint('data', 'width')
    whcs.height = args.config.getint('data', 'height')
    whcs.channel = args.config.getint('data', 'channel')
    whcs.stride = args.config.getint('data', 'stride')
    save_dir = 'checkpoints'
    model_name = args.config.get('common', 'prefix')
    is_bi_graphemes = args.config.getboolean('common', 'is_bi_graphemes')
    overwrite_meta_files = args.config.getboolean('train', 'overwrite_meta_files')
    overwrite_bi_graphemes_dictionary = args.config.getboolean('train', 'overwrite_bi_graphemes_dictionary')
    max_duration = args.config.getfloat('data', 'max_duration')
    language = args.config.get('data', 'language')

    log = LogUtil().getlogger()
    labelUtil = LabelUtil.getInstance()

    def read_saved_stats():
        # get feat_mean and feat_std to normalize dataset
        datagen.get_meta_from_file(
            np.loadtxt(generate_file_path(save_dir, model_name, 'feats_mean')),
            np.loadtxt(generate_file_path(save_dir, model_name, 'feats_std')))

    if mode == 'predict':
        test_json = args.config.get('data', 'test_json')
        datagen = DataGenerator(save_dir=save_dir, model_name=model_name)
        datagen.load_train_data(test_json, max_duration=max_duration)
        # Use the configured language, as the train/load branch does,
        # instead of a hard-coded "en".
        load_labelutil(labelUtil, is_bi_graphemes, language=language)
        args.config.set('arch', 'n_classes', str(labelUtil.get_count()))
        read_saved_stats()
        stats_partition = "test"
    else:
        data_json = args.config.get('data', 'train_json')
        val_json = args.config.get('data', 'val_json')
        datagen = DataGenerator(save_dir=save_dir, model_name=model_name)
        datagen.load_train_data(data_json, max_duration=max_duration)
        datagen.load_validation_data(val_json, max_duration=max_duration)
        if is_bi_graphemes:
            if not os.path.isfile("resources/unicodemap_en_baidu_bi_graphemes.csv") or overwrite_bi_graphemes_dictionary:
                # The single-grapheme label set is loaded before the
                # bi-graphemes dictionary is generated.
                load_labelutil(labelUtil=labelUtil, is_bi_graphemes=False, language=language)
                generate_bi_graphemes_dictionary(datagen.train_texts + datagen.val_texts)
        load_labelutil(labelUtil=labelUtil, is_bi_graphemes=is_bi_graphemes, language=language)
        args.config.set('arch', 'n_classes', str(labelUtil.get_count()))

        if mode == "load":
            read_saved_stats()
        elif overwrite_meta_files:
            log.info("Generate mean and std from samples")
            normalize_target_k = args.config.getint('train', 'normalize_target_k')
            datagen.sample_normalize(normalize_target_k, True)
        else:
            log.info("Read mean and std from meta files")
            read_saved_stats()
        stats_partition = "train"

    is_batchnorm = args.config.getboolean('arch', 'is_batchnorm')
    if batch_size == 1 and is_batchnorm and mode != 'predict':
        raise Warning('batch size 1 is too small for is_batchnorm')

    max_t_count = datagen.get_max_seq_length(partition=stats_partition)
    max_label_length = \
        datagen.get_max_label_length(partition=stats_partition, is_bi_graphemes=is_bi_graphemes)

    args.config.set('arch', 'max_t_count', str(max_t_count))
    args.config.set('arch', 'max_label_length', str(max_label_length))
    from importlib import import_module
    prepare_data_template = import_module(args.config.get('arch', 'arch_file'))
    init_states = prepare_data_template.prepare_data(args)

    # sort file paths by its duration in ascending order to implement sortaGrad
    sort_by_duration = (mode == "train")
    is_bucketing = args.config.getboolean('arch', 'is_bucketing')
    save_feature_as_csvfile = args.config.getboolean('train', 'save_feature_as_csvfile')
    if is_bucketing:
        buckets = json.loads(args.config.get('arch', 'buckets'))

    def make_iter(partition, count, sort):
        # Both partitions share every argument except these three.
        shared = dict(partition=partition,
                      count=count,
                      datagen=datagen,
                      batch_size=batch_size,
                      num_label=max_label_length,
                      init_states=init_states,
                      seq_length=max_t_count,
                      width=whcs.width,
                      height=whcs.height,
                      sort_by_duration=sort,
                      is_bi_graphemes=is_bi_graphemes,
                      save_feature_as_csvfile=save_feature_as_csvfile)
        if is_bucketing:
            return BucketSTTIter(buckets=buckets, **shared)
        return STTIter(**shared)

    data_loaded = make_iter("train", datagen.count, sort_by_duration)
    if mode == 'predict':
        return data_loaded, args
    validation_loaded = make_iter("validation", datagen.val_count, False)
    return data_loaded, validation_loaded, args
def load_data(args, wav_file):
    """Build a one-utterance data iterator for a single wav file.

    A ``DataGenerator`` is populated by hand with ``wav_file`` as its only
    training entry, the stored ``feats_mean`` / ``feats_std`` files are
    loaded into it, and the ``arch`` section of ``args.config`` is updated
    with ``n_classes``, ``max_t_count`` and ``max_label_length``.

    Params:
        args: object exposing a ``config`` with ``get`` / ``getint`` /
            ``getboolean`` / ``getfloat`` / ``set`` accessors.
        wav_file: path of the wav file to wrap.

    Returns:
        tuple: ``(data_loaded, args)`` where ``data_loaded`` is a
        ``BucketSTTIter`` when ``arch.is_bucketing`` is set and an
        ``STTIter`` otherwise.

    Raises:
        Exception: if ``common.mode`` is not train, predict or load.
        Warning: if batch size is 1 with batchnorm in train/load mode.
    """
    cfg = args.config

    mode = cfg.get('common', 'mode')
    if mode not in ('train', 'predict', 'load'):
        raise Exception(
            'mode must be the one of the followings - train,predict,load')
    batch_size = cfg.getint('common', 'batch_size')

    # Input geometry, read from the [data] section in a fixed order.
    whcs = WHCS()
    for dim_name in ('width', 'height', 'channel', 'stride'):
        setattr(whcs, dim_name, cfg.getint('data', dim_name))

    save_dir = 'checkpoints'
    model_name = cfg.get('common', 'prefix')
    is_bi_graphemes = cfg.getboolean('common', 'is_bi_graphemes')

    # The next options are read but their values are not used below; the
    # reads are kept so the function touches the config exactly as before.
    cfg.getboolean('train', 'overwrite_meta_files')
    cfg.getboolean('train', 'overwrite_bi_graphemes_dictionary')
    cfg.getfloat('data', 'max_duration')
    max_freq = cfg.getint('data', 'max_freq')
    # NOTE(review): data.language is read here, yet the label util below is
    # always loaded with language="zh" - confirm this is intended.
    cfg.get('data', 'language')
    LogUtil().getlogger()
    labelUtil = LabelUtil()

    # Hand-build a generator holding just this one utterance.
    datagen = DataGenerator(save_dir=save_dir,
                            model_name=model_name,
                            max_freq=max_freq)
    datagen.train_audio_paths = [wav_file]
    datagen.train_durations = [get_duration_wave(wav_file)]
    # presumably a placeholder transcript - TODO confirm
    datagen.train_texts = ["1 1"]
    datagen.count = 1

    load_labelutil(labelUtil, is_bi_graphemes, language="zh")
    cfg.set('arch', 'n_classes', str(labelUtil.get_count()))

    mean_path = generate_file_path(save_dir, model_name, 'feats_mean')
    feats_mean = np.loadtxt(mean_path)
    std_path = generate_file_path(save_dir, model_name, 'feats_std')
    feats_std = np.loadtxt(std_path)
    datagen.get_meta_from_file(feats_mean, feats_std)

    is_batchnorm = cfg.getboolean('arch', 'is_batchnorm')
    if batch_size == 1 and is_batchnorm and mode in ('train', 'load'):
        raise Warning('batch size 1 is too small for is_batchnorm')

    # NOTE(review): only the train_* fields were filled in above, but the
    # maxima are queried for partition "test" - confirm DataGenerator
    # supports this.
    max_t_count = datagen.get_max_seq_length(partition="test")
    max_label_length = datagen.get_max_label_length(
        partition="test", is_bi_graphemes=is_bi_graphemes)
    cfg.set('arch', 'max_t_count', str(max_t_count))
    cfg.set('arch', 'max_label_length', str(max_label_length))

    # The architecture module named in the config supplies the init states.
    import importlib
    arch_module = importlib.import_module(cfg.get('arch', 'arch_file'))
    init_states = arch_module.prepare_data(args)

    sort_by_duration = (mode == "train")
    is_bucketing = cfg.getboolean('arch', 'is_bucketing')
    save_feature_as_csvfile = cfg.getboolean('train',
                                             'save_feature_as_csvfile')

    # Arguments shared by both iterator flavours.
    iter_kwargs = {
        'partition': "train",
        'count': datagen.count,
        'datagen': datagen,
        'batch_size': batch_size,
        'num_label': max_label_length,
        'init_states': init_states,
        'seq_length': max_t_count,
        'width': whcs.width,
        'height': whcs.height,
        'sort_by_duration': sort_by_duration,
        'is_bi_graphemes': is_bi_graphemes,
        'save_feature_as_csvfile': save_feature_as_csvfile,
    }
    if is_bucketing:
        iter_kwargs['buckets'] = json.loads(cfg.get('arch', 'buckets'))
        data_loaded = BucketSTTIter(**iter_kwargs)
    else:
        data_loaded = STTIter(**iter_kwargs)
    return data_loaded, args
def sample_normalize(self, k_samples=1000, overwrite=False,
                     noise_percent=0.4):
    """ Estimate the mean and std of the features from the training set

    Spectrogram features are extracted with ``spectrogram_from_file`` on a
    thread pool and folded into running per-dimension sums. The resulting
    statistics are stored on ``self.feats_mean`` / ``self.feats_std`` (each
    of shape ``(self.feat_dim,)``) and written to the ``feats_mean`` /
    ``feats_std`` files of this model.

    Params:
        k_samples (int): Use this number of samples for estimation. A
            negative value uses the whole training list, repeated 10 times.
        overwrite (bool): Forwarded to ``spectrogram_from_file``.
        noise_percent (float): Forwarded to ``spectrogram_from_file``.

    Raises:
        ValueError: if no feature frame could be accumulated (empty clip
            list or every clip failed), since the statistics would be NaN.
    """
    log = LogUtil().getlogger()
    log.info("Calculating mean and std from samples")
    # if k_samples is negative then it goes through total dataset
    if k_samples < 0:
        # NOTE(review): the list is repeated 10 times, presumably so that
        # several randomly-noised renderings of each clip contribute -
        # confirm against spectrogram_from_file.
        audio_paths = self.train_audio_paths * 10
    # using sample
    else:
        k_samples = min(k_samples, len(self.train_audio_paths))
        audio_paths = self.rng.sample(self.train_audio_paths, k_samples)

    feat_dim = self.feat_dim
    # Running column sums in float64; no per-clip array is kept around.
    feat_sum = np.zeros(feat_dim)
    feat_squared_sum = np.zeros(feat_dim)
    count = 0.0
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=cpu_count()) as executor:
        future_to_f = {
            executor.submit(spectrogram_from_file,
                            f,
                            overwrite=overwrite,
                            noise_percent=noise_percent): f
            for f in audio_paths
        }
        for future in concurrent.futures.as_completed(future_to_f):
            f = future_to_f[future]
            try:
                next_feat = np.asarray(future.result(), dtype=np.float64)
                # Same acceptance rule as stacking onto a (1, feat_dim)
                # row: anything else is reported as a failed clip.
                if next_feat.ndim != 2 or next_feat.shape[1] != feat_dim:
                    raise ValueError(
                        'expected features of shape (frames, %d), got %r'
                        % (feat_dim, next_feat.shape))
                feat_sum += next_feat.sum(axis=0)
                feat_squared_sum += np.square(next_feat).sum(axis=0)
                count += float(next_feat.shape[0])
            except Exception as exc:
                # Best effort: a clip that cannot be featurized is logged
                # and left out of the statistics.
                log.info('%r generated an exception: %s' % (f, exc))

    if count == 0:
        raise ValueError(
            'no feature frames were accumulated; cannot estimate '
            'feature mean and std')

    self.feats_mean = feat_sum / count
    # E[x^2] - E[x]^2 can come out slightly negative from rounding; clamp
    # so the square root never yields NaN.
    variance = feat_squared_sum / count - np.square(self.feats_mean)
    self.feats_std = np.sqrt(np.maximum(variance, 0.0))
    np.savetxt(
        generate_file_path(self.save_dir, self.model_name, 'feats_mean'),
        self.feats_mean)
    np.savetxt(
        generate_file_path(self.save_dir, self.model_name, 'feats_std'),
        self.feats_std)
    log.info("End calculating mean and std from samples")
def sample_normalize_fbank(self, k_samples=1000, overwrite=False,
                           noise_percent=0.4):
    """ Estimate the mean and std of the fbank features from the training set

    Features are extracted with ``fbank_from_file`` on a thread pool, each
    result is axis-swapped and reshaped to rows of ``3 * 41`` values, and
    the rows are folded into running per-dimension sums. The resulting
    statistics are stored on ``self.feats_mean`` / ``self.feats_std`` (each
    of shape ``(123,)``) and written to the ``feats_mean`` / ``feats_std``
    files of this model.

    Params:
        k_samples (int): Use this number of samples for estimation. A
            negative value uses the whole training list, repeated 10 times.
        overwrite (bool): Forwarded to ``fbank_from_file``.
        noise_percent (float): Forwarded to ``fbank_from_file``.

    Raises:
        ValueError: if no feature frame could be accumulated (empty clip
            list or every clip failed), since the statistics would be NaN.
    """
    log = LogUtil().getlogger()
    log.info("Calculating mean and std from samples")
    # if k_samples is negative then it goes through total dataset
    if k_samples < 0:
        # NOTE(review): the list is repeated 10 times, presumably so that
        # several randomly-noised renderings of each clip contribute -
        # confirm against fbank_from_file.
        audio_paths = self.train_audio_paths * 10
    # using sample
    else:
        k_samples = min(k_samples, len(self.train_audio_paths))
        audio_paths = self.rng.sample(self.train_audio_paths, k_samples)

    # NOTE(review): presumably 3 feature streams x 41 filterbank values -
    # confirm against fbank_from_file.
    feat_dim = 3 * 41
    # Running column sums in float64; no per-clip array is kept around.
    feat_sum = np.zeros(feat_dim)
    feat_squared_sum = np.zeros(feat_dim)
    count = 0.0
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=cpu_count()) as executor:
        future_to_f = {
            executor.submit(fbank_from_file,
                            f,
                            overwrite=overwrite,
                            noise_percent=noise_percent): f
            for f in audio_paths
        }
        for future in concurrent.futures.as_completed(future_to_f):
            f = future_to_f[future]
            try:
                next_feat = np.asarray(
                    future.result().swapaxes(0, 1).reshape(-1, feat_dim),
                    dtype=np.float64)
                feat_sum += next_feat.sum(axis=0)
                feat_squared_sum += np.square(next_feat).sum(axis=0)
                count += float(next_feat.shape[0])
            except Exception as exc:
                # Best effort: a clip that cannot be featurized is logged
                # and left out of the statistics.
                log.info('%r generated an exception: %s' % (f, exc))

    if count == 0:
        raise ValueError(
            'no feature frames were accumulated; cannot estimate '
            'feature mean and std')

    self.feats_mean = feat_sum / count
    # E[x^2] - E[x]^2 can come out slightly negative from rounding; clamp
    # so the square root never yields NaN.
    variance = feat_squared_sum / count - np.square(self.feats_mean)
    self.feats_std = np.sqrt(np.maximum(variance, 0.0))
    np.savetxt(
        generate_file_path(self.save_dir, self.model_name, 'feats_mean'),
        self.feats_mean)
    np.savetxt(
        generate_file_path(self.save_dir, self.model_name, 'feats_std'),
        self.feats_std)
    log.info("End calculating mean and std from samples")
def __init__(self):
    """Set up configuration, feature statistics, the network executor and
    the scorer-backed evaluation metric.

    The config file path is taken from ``sys.argv[1]``. The stored
    ``feats_mean`` / ``feats_std`` files are loaded into a
    ``DataGenerator``, a symbol file is saved for each configured bucket,
    the checkpoint is loaded and bound to a CPU executor, and an
    ``EvalSTTMetric`` is created with either a ``swig_wrapper.Scorer`` or,
    when that import fails, a ``kenlm`` model's ``score`` function.

    Raises:
        Exception: if no command-line argument was supplied.
    """
    if len(sys.argv) <= 1:
        raise Exception('cfg file path must be provided. ' +
                        'ex)python main.py --configfile examplecfg.cfg')
    self.args = parse_args(sys.argv[1])
    # set parameters from cfg file
    # give random seed
    self.random_seed = self.args.config.getint('common', 'random_seed')
    self.mx_random_seed = self.args.config.getint('common', 'mx_random_seed')
    # random seed for shuffling data list (-1 leaves numpy unseeded)
    if self.random_seed != -1:
        np.random.seed(self.random_seed)
    # set mx.random.seed to give seed for parameter initialization;
    # -1 falls back to a seed derived from the current time
    if self.mx_random_seed != -1:
        mx.random.seed(self.mx_random_seed)
    else:
        mx.random.seed(hash(datetime.now()))
    # set log file name
    self.log_filename = self.args.config.get('common', 'log_filename')
    self.log = LogUtil(filename=self.log_filename).getlogger()
    # set parameters from data section(common)
    self.mode = self.args.config.get('common', 'mode')
    save_dir = 'checkpoints'
    model_name = self.args.config.get('common', 'prefix')
    max_freq = self.args.config.getint('data', 'max_freq')
    self.datagen = DataGenerator(save_dir=save_dir, model_name=model_name, max_freq=max_freq)
    # load the previously saved feature mean/std into the generator
    self.datagen.get_meta_from_file(
        np.loadtxt(generate_file_path(save_dir, model_name, 'feats_mean')),
        np.loadtxt(generate_file_path(save_dir, model_name, 'feats_std')))
    self.buckets = json.loads(self.args.config.get('arch', 'buckets'))
    # contexts parsed from the args; num_gpu is simply their count
    self.contexts = parse_contexts(self.args)
    self.num_gpu = len(self.contexts)
    self.batch_size = self.args.config.getint('common', 'batch_size')
    # NOTE(review): no check that the GPU count divides the batch size is
    # performed here; only the two arch flags are read.
    self.is_batchnorm = self.args.config.getboolean('arch', 'is_batchnorm')
    self.is_bucketing = self.args.config.getboolean('arch', 'is_bucketing')
    # log current config
    self.config_logger = ConfigLogger(self.log)
    self.config_logger(self.args.config)
    # Fixed sequence length written into the config and reused for the
    # 'data' input shape and the input buffer below.
    default_bucket_key = 1600
    self.args.config.set('arch', 'max_t_count', str(default_bucket_key))
    # NOTE(review): 95 is a hard-coded max label length - confirm origin.
    self.args.config.set('arch', 'max_label_length', str(95))
    self.labelUtil = LabelUtil()
    is_bi_graphemes = self.args.config.getboolean('common', 'is_bi_graphemes')
    # language is fixed to "zh" here rather than read from the config
    load_labelutil(self.labelUtil, is_bi_graphemes, language="zh")
    self.args.config.set('arch', 'n_classes', str(self.labelUtil.get_count()))
    self.max_t_count = self.args.config.getint('arch', 'max_t_count')
    # self.load_optimizer_states = self.args.config.getboolean('load', 'load_optimizer_states')
    # load model
    self.model_loaded, self.model_num_epoch, self.model_path = load_model(
        self.args)
    # self.model = STTBucketingModule(
    #     sym_gen=self.model_loaded,
    #     default_bucket_key=default_bucket_key,
    #     context=self.contexts
    # )
    from importlib import import_module
    prepare_data_template = import_module(
        self.args.config.get('arch', 'arch_file'))
    init_states = prepare_data_template.prepare_data(self.args)
    width = self.args.config.getint('data', 'width')
    height = self.args.config.getint('data', 'height')
    # Save one symbol file per bucket. init_state_names is only assigned
    # inside this loop, so it is unbound later if self.buckets is empty.
    # NOTE(review): loop extent reconstructed from collapsed source - confirm.
    for bucket in self.buckets:
        net, init_state_names, ll = self.model_loaded(bucket)
        net.save('checkpoints/%s-symbol.json' % bucket)
    # NOTE(review): the ('label', (1, 18)) shape is hard-coded - confirm.
    input_shapes = dict([('data', (self.batch_size, default_bucket_key, width * height))] + init_states + [('label', (1, 18))])
    # self.executor = net.simple_bind(ctx=mx.cpu(), **input_shapes)
    # self.model.bind(data_shapes=[('data', (self.batch_size, default_bucket_key, width * height))] + init_states,
    #                 label_shapes=[
    #                     ('label', (self.batch_size, self.args.config.getint('arch', 'max_label_length')))],
    #                 for_training=True)
    symbol, self.arg_params, self.aux_params = mx.model.load_checkpoint(
        self.model_path, self.model_num_epoch)
    all_layers = symbol.get_internals()
    # NOTE(review): hard-coded internal output name, presumably specific to
    # one exported checkpoint - confirm before using another model.
    concat = all_layers['concat36457_output']
    sm = mx.sym.SoftmaxOutput(data=concat, name='softmax')
    self.executor = sm.simple_bind(ctx=mx.cpu(), **input_shapes)
    # self.model.set_params(self.arg_params, self.aux_params, allow_extra=True, allow_missing=True)
    # copy every loaded parameter the executor also declares as an argument
    for key in self.executor.arg_dict.keys():
        if key in self.arg_params:
            self.arg_params[key].copyto(self.executor.arg_dict[key])
    # Pair the sorted state names (without 'data') with the executor
    # outputs after the first one.
    init_state_names.remove('data')
    init_state_names.sort()
    self.states_dict = dict(
        zip(init_state_names, self.executor.outputs[1:]))
    self.input_arr = mx.nd.zeros(
        (self.batch_size, default_bucket_key, width * height))
    try:
        # Use the swig_wrapper scorer when it can be imported.
        from swig_wrapper import Scorer
        vocab_list = [
            chars.encode("utf-8") for chars in self.labelUtil.byList
        ]
        self.log.info("vacab_list len is %d" % len(vocab_list))
        # NOTE(review): 0.26 and 0.1 are presumably language-model
        # weighting parameters - confirm against the Scorer signature.
        _ext_scorer = Scorer(0.26, 0.1, self.args.config.get('common', 'kenlm'), vocab_list)
        lm_char_based = _ext_scorer.is_character_based()
        lm_max_order = _ext_scorer.get_max_order()
        lm_dict_size = _ext_scorer.get_dict_size()
        self.log.info("language model: "
                      "is_character_based = %d," % lm_char_based +
                      " max_order = %d," % lm_max_order +
                      " dict_size = %d" % lm_dict_size)
        self.eval_metric = EvalSTTMetric(batch_size=self.batch_size, num_gpu=self.num_gpu, is_logging=True, scorer=_ext_scorer)
    except ImportError:
        # Fall back to scoring with the kenlm model directly.
        import kenlm
        km = kenlm.Model(self.args.config.get('common', 'kenlm'))
        self.eval_metric = EvalSTTMetric(batch_size=self.batch_size, num_gpu=self.num_gpu, is_logging=True, scorer=km.score)