def __init__(self, params, datadict, is_eval=False):
    """Index transcripts, feature paths and optional per-speaker CMVN files.

    Args:
        params: Data configuration. ``spec_augment`` and ``vocab`` are
            always read; ``max_length`` is optional; ``spec_augment_config``
            is read only when SpecAugment is active.
        datadict: Mapping with ``text`` and ``feat`` lists of manifest
            paths, and optionally ``utt2spk`` and ``cmvn`` lists.
        is_eval: When true, SpecAugment is disabled and the configured
            ``max_length`` is ignored (1000 is used instead).

    Raises:
        AssertionError: If ``utt2spk`` is present in ``datadict`` but
            ``cmvn`` is not.
    """

    def read_pairs(paths):
        # Parse whitespace-separated "<key> <value>" files into one dict.
        # Blank lines are skipped; later files override earlier keys.
        table = {}
        for path in paths:
            # UTF-8 to match the text/feat manifests, so ids compare equal.
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    fields = line.split()
                    if not fields:
                        continue
                    key, value = fields
                    table[key] = value
        return table

    self.params = params
    self.datadict = datadict
    self.is_eval = is_eval

    # The configured length limit only applies outside evaluation.
    if 'max_length' in params and not is_eval:
        self.max_length = params['max_length']
    else:
        self.max_length = 1000

    self.apply_spec_augment = bool(params['spec_augment'] and not self.is_eval)
    if self.apply_spec_augment:
        logging.info('Apply SpecAugment!')
        self.spec_augment_config = params['spec_augment_config']
        logging.info('Config: %s' % ' '.join(
            [key + ':' + str(value)
             for key, value in self.spec_augment_config.items()]))

    self.unit2idx = load_vocab(params['vocab'])

    # utt_id -> encoded transcript; over-long transcripts are dropped.
    self.target_dict = {}
    for text_file in self.datadict['text']:
        with open(text_file, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.split()
                if not parts:
                    # A blank line would otherwise fail on parts[0].
                    continue
                tokens = self.encode(parts[1:])
                if len(tokens) > self.max_length:
                    continue
                self.target_dict[parts[0]] = tokens

    # Keep only features whose utterance survived the transcript filter.
    self.feat_list = []
    for feat_file in self.datadict['feat']:
        with open(feat_file, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue
                utt_id, feat_path = fields
                if utt_id in self.target_dict:
                    self.feat_list.append([utt_id, feat_path])

    # CMVN is enabled by the presence of an utt2spk mapping.
    self.apply_cmvn = 'utt2spk' in self.datadict
    if self.apply_cmvn:
        assert 'cmvn' in self.datadict
        self.utt2spk = read_pairs(self.datadict['utt2spk'])
        self.cmvn = read_pairs(self.datadict['cmvn'])
        logging.info('Apply CMVN!')
def main(args):
    """Recognize a single audio file with a trained Transformer model.

    Args:
        args: Parsed command-line namespace. Reads ``load_model``,
            ``config`` (only when the checkpoint carries no ``params``),
            ``batch_size``, ``ngpu``, ``beam_width``, ``max_len``,
            ``penalty``, ``lamda`` and ``file``.

    Raises:
        AssertionError: If the checkpoint has no ``params`` entry and
            ``args.config`` is not an existing file.
    """
    # Deserialize onto the CPU so a checkpoint saved on a GPU also loads on
    # a CPU-only host; the model is moved to the GPU below when requested.
    checkpoint = torch.load(args.load_model, map_location='cpu')

    if 'params' in checkpoint:
        params = checkpoint['params']
    else:
        assert os.path.isfile(args.config), 'please specify a configure file.'
        with open(args.config, 'r') as f:
            # safe_load: yaml.load() without a Loader is rejected by
            # PyYAML >= 6 and can build arbitrary objects on older versions.
            params = yaml.safe_load(f)

    # Decoding must be deterministic and free of augmentation.
    params['data']['shuffle'] = False
    params['data']['spec_augment'] = False
    params['data']['short_first'] = False
    params['data']['batch_size'] = args.batch_size

    model = Transformer(params['model'])
    model.load_state_dict(checkpoint['model'])
    print('Load pre-trained model from %s' % args.load_model)
    model.eval()
    if args.ngpu > 0:
        model.cuda()

    # Invert the vocabulary so predicted indices map back to units.
    char2unit = load_vocab(params['data']['vocab'])
    unit2char = {i: c for c, i in char2unit.items()}

    recognizer = TransformerRecognizer(
        model,
        unit2char=unit2char,
        beam_width=args.beam_width,
        max_len=args.max_len,
        penalty=args.penalty,
        lamda=args.lamda,
        ngpu=args.ngpu)

    # inputs_length: [len]
    inputs, inputs_length = calc_fbank(args.file, params['data'])
    if args.ngpu > 0:
        inputs = inputs.cuda()
        inputs_length = inputs_length.cuda()

    preds = recognizer.recognize(inputs, inputs_length)
    # Only the first hypothesis is shown, with unit separators removed.
    print('preds: {}'.format(preds[0].replace(' ', '')))
def __init__(self, params, datadict, is_eval=False):
    """Build the dataset from a single JSON manifest.

    Args:
        params: Data configuration. ``spec_augment`` and ``vocab`` are
            always read; ``spec_augment_config`` only when SpecAugment is
            active.
        datadict: Mapping whose ``json`` entry is the manifest path. The
            manifest's top-level ``utts`` object is loaded.
        is_eval: When true, SpecAugment is disabled.
    """
    self.params = params
    self.datadict = datadict
    self.is_eval = is_eval

    # SpecAugment is only ever active outside evaluation.
    augment = False
    if params['spec_augment'] and not self.is_eval:
        augment = True
    self.apply_spec_augment = augment

    if augment:
        logging.info('Apply SpecAugment!')
        self.spec_augment_config = params['spec_augment_config']
        settings = []
        for name, setting in self.spec_augment_config.items():
            settings.append(name + ':' + str(setting))
        logging.info('Config: %s' % ' '.join(settings))

    self.unit2idx = load_vocab(params['vocab'])

    with open(self.datadict['json'], 'r') as manifest:
        entries = json.load(manifest)['utts']
    # Stored as a list of (utt_id, info) pairs in manifest order.
    self.utts = list(entries.items())
def main(args):
    """Decode a dataset split and write the hypotheses to ``predict.txt``.

    The output directory is
    ``egs/<data name>/exp/<save name>/decode_<decode_set>[_<suffix>]``.
    Each output line is ``<utt_id> <prediction>``; predictions and
    references are also printed to stdout.

    Args:
        args: Parsed command-line namespace. Reads ``load_model``,
            ``config`` (only when the checkpoint carries no ``params``),
            ``batch_size``, ``decode_set``, ``suffix``, ``ngpu``,
            ``beam_width``, ``max_len``, ``penalty`` and ``lamda``.

    Raises:
        AssertionError: If the checkpoint has no ``params`` entry and
            ``args.config`` is not an existing file.
    """
    # Deserialize onto the CPU so a checkpoint saved on a GPU also loads on
    # a CPU-only host; the model is moved to the GPU below when requested.
    checkpoint = torch.load(args.load_model, map_location='cpu')

    if 'params' in checkpoint:
        params = checkpoint['params']
    else:
        assert os.path.isfile(args.config), 'please specify a configure file.'
        with open(args.config, 'r') as f:
            # safe_load: yaml.load() without a Loader is rejected by
            # PyYAML >= 6 and can build arbitrary objects on older versions.
            params = yaml.safe_load(f)

    # Decoding must be deterministic and free of augmentation.
    params['data']['shuffle'] = False
    params['data']['spec_augment'] = False
    params['data']['short_first'] = False
    params['data']['batch_size'] = args.batch_size

    expdir = os.path.join('egs', params['data']['name'], 'exp',
                          params['train']['save_name'])
    if args.suffix is None:
        decode_name = 'decode_%s' % args.decode_set
    else:
        decode_name = 'decode_%s_%s' % (args.decode_set, args.suffix)
    decode_dir = os.path.join(expdir, decode_name)
    os.makedirs(decode_dir, exist_ok=True)

    model = Transformer(params['model'])
    model.load_state_dict(checkpoint['model'])
    print('Load pre-trained model from %s' % args.load_model)
    model.eval()
    if args.ngpu > 0:
        model.cuda()

    # Invert the vocabulary so indices map back to units.
    char2unit = load_vocab(params['data']['vocab'])
    unit2char = {i: c for c, i in char2unit.items()}

    dataset = AudioDataset(params['data'], args.decode_set)
    data_loader = FeatureLoader(dataset)

    recognizer = TransformerRecognizer(
        model,
        unit2char=unit2char,
        beam_width=args.beam_width,
        max_len=args.max_len,
        penalty=args.penalty,
        lamda=args.lamda,
        ngpu=args.ngpu)

    totals = len(dataset)
    batch_size = params['data']['batch_size']

    # The context manager closes the file even if decoding fails midway.
    with open(os.path.join(decode_dir, 'predict.txt'), 'w') as writer:
        for step, (utt_id, batch) in enumerate(data_loader.loader):
            # Bind the inputs unconditionally so CPU decoding works too.
            inputs = batch['inputs']
            inputs_length = batch['inputs_length']
            if args.ngpu > 0:
                inputs = inputs.cuda()
                inputs_length = inputs_length.cuda()

            preds = recognizer.recognize(inputs, inputs_length)

            targets = batch['targets']
            targets_length = batch['targets_length']

            for b, pred in enumerate(preds):
                # Running utterance index across batches (for display only).
                n = step * batch_size + b
                # Index 0 is skipped -- presumably the start-of-sentence
                # token; confirm against the dataset's target layout.
                truth = ' '.join(
                    unit2char[i.item()]
                    for i in targets[b][1:targets_length[b]])
                print('[%d / %d ] %s - pred : %s' % (n, totals, utt_id[b], pred))
                print('[%d / %d ] %s - truth: %s' % (n, totals, utt_id[b], truth))
                writer.write(utt_id[b] + ' ' + pred + '\n')
def __init__(self, params, datadict, is_eval=False):
    """Set up a dataset whose features are extracted from audio online.

    Args:
        params: Data configuration. ``normalization`` and ``vocab`` are
            always read; ``spec_augment`` is read unless ``is_eval``.
            Optional keys: ``feature_extractor`` (default
            ``'torchaudio'``), ``global_cmvn`` (path prefix of
            ``.mean.npy`` / ``.std.npy`` files), ``spec_augment_config``,
            ``gaussian_noise`` and ``volume_perturb``.
        datadict: Mapping with ``text`` and ``feat`` lists of manifest
            paths.
        is_eval: When true, SpecAugment, Gaussian noise and volume
            perturbation are all disabled.

    Raises:
        AssertionError: If ``feature_extractor`` is not a supported name,
            or if there are more feature entries than transcripts.
    """
    self.params = params
    self.datadict = datadict
    self.is_eval = is_eval
    self.apply_spec_augment = params['spec_augment'] if not self.is_eval else False

    logger.info('[Online-Reader] Read the feature extracted online!')

    self.normalization = params['normalization']
    self.feature_extractor = params.get('feature_extractor', 'torchaudio')
    assert self.feature_extractor in ['torchaudio', 'python_speech_feature', 'ta', 'psf']
    logger.info('Utilize %s to extract feature from wav.' % self.feature_extractor)

    if self.normalization:
        logger.info('Apply Feature Normalization!')

    # NOTE(review): global CMVN is treated as independent of
    # `normalization`, so `apply_global_cmvn` is always defined -- confirm
    # this matches how the flag is consumed when items are fetched.
    self.apply_global_cmvn = 'global_cmvn' in params
    if self.apply_global_cmvn:
        self.global_mean = torch.from_numpy(np.load(params['global_cmvn'] + '.mean.npy'))
        self.global_std = torch.from_numpy(np.load(params['global_cmvn'] + '.std.npy'))
        logger.info('Load global mean and std vector from files')

    if self.apply_spec_augment and not self.is_eval:
        self.spec_augment_config = params['spec_augment_config']
        logger.info('Apply SpecAugment!')
        logger.info('Config: %s' % ' '.join(
            [key + ':' + str(value)
             for key, value in self.spec_augment_config.items()]))

    # Augmentations default to "off" and are only enabled for training.
    self.gaussian_noise = 0.0
    if 'gaussian_noise' in params and not self.is_eval:
        self.gaussian_noise = params['gaussian_noise']
        if self.gaussian_noise > 0.0:
            logger.info('Apply Gaussian Noise with std: %f.' % self.gaussian_noise)

    # NOTE: speed perturbation is not configured by this reader; its setup
    # was previously present only as commented-out code.

    self.apply_volume_perturb = False
    if 'volume_perturb' in params and not self.is_eval:
        self.apply_volume_perturb = params['volume_perturb']
        if self.apply_volume_perturb:
            logger.info('Apply Volume Perturb during the training!')

    self.unit2idx = load_vocab(params['vocab'])

    # utt_id -> label indices; out-of-vocabulary units map to UNK_TOKEN.
    self.targets_dict = {}
    for text_file in self.datadict['text']:
        with open(text_file, 'r', encoding='utf-8') as t:
            for line in t:
                parts = line.split()
                if not parts:
                    # A blank line would otherwise fail on parts[0].
                    continue
                self.targets_dict[parts[0]] = [
                    self.unit2idx[c] if c in self.unit2idx
                    else self.unit2idx[UNK_TOKEN]
                    for c in parts[1:]
                ]

    # [utt_id, path] pairs, in manifest order.
    self.file_list = []
    for feat_file in self.datadict['feat']:
        with open(feat_file, 'r', encoding='utf-8') as fid:
            for line in fid:
                fields = line.split()
                if not fields:
                    continue
                idx, path = fields
                self.file_list.append([idx, path])

    assert len(self.file_list) <= len(self.targets_dict)
def main(args):
    """Decode a dataset split, optionally with a Transformer language model.

    The output directory is
    ``egs/<data name>/exp/<save name>/decode_<decode_set>[_lm_lmw<w>][_<suffix>]``.
    Hypotheses go to ``predict.txt`` and references to ``reference.txt``;
    every line is ``<text> (<speaker>-<utt_id>)``. Both are also printed to
    stdout.

    Args:
        args: Parsed command-line namespace. Reads ``load_model``,
            ``config`` (only when the checkpoint carries no ``params``),
            ``batch_size``, ``decode_set``, ``suffix``,
            ``load_language_model``, ``lm_weight``, ``ngpu``,
            ``beam_width``, ``max_len``, ``penalty`` and ``lamda``.

    Raises:
        AssertionError: If the checkpoint has no ``params`` entry and
            ``args.config`` is not an existing file.
    """
    # Deserialize onto the CPU so a checkpoint saved on a GPU also loads on
    # a CPU-only host; the models are moved to the GPU below when requested.
    checkpoint = torch.load(args.load_model, map_location='cpu')

    if 'params' in checkpoint:
        params = checkpoint['params']
    else:
        assert os.path.isfile(args.config), 'please specify a configure file.'
        with open(args.config, 'r') as f:
            # safe_load: yaml.load() without a Loader is rejected by
            # PyYAML >= 6 and can build arbitrary objects on older versions.
            params = yaml.safe_load(f)

    # Decoding must be deterministic and free of augmentation.
    params['data']['shuffle'] = False
    params['data']['spec_augment'] = False
    params['data']['short_first'] = False
    params['data']['batch_size'] = args.batch_size

    expdir = os.path.join('egs', params['data']['name'], 'exp',
                          params['train']['save_name'])
    decoder_set_name = 'decode_%s' % args.decode_set
    if args.load_language_model is not None:
        decoder_set_name += '_lm_lmw%.2f' % args.lm_weight
    if args.suffix is not None:
        decoder_set_name += '_%s' % args.suffix
    decode_dir = os.path.join(expdir, decoder_set_name)
    os.makedirs(decode_dir, exist_ok=True)

    model = Transformer(params['model'])
    model.load_state_dict(checkpoint['model'])
    print('Load pre-trained model from %s' % args.load_model)
    model.eval()
    if args.ngpu > 0:
        model.cuda()

    lm = None
    if args.load_language_model is not None:
        lm_chkpt = torch.load(args.load_language_model, map_location='cpu')
        lm = TransformerLanguageModel(lm_chkpt['params']['model'])
        lm.load_state_dict(lm_chkpt['model'])
        lm.eval()
        if args.ngpu > 0:
            lm.cuda()
        print('Load pre-trained transformer language model from %s' %
              args.load_language_model)

    # Invert the vocabulary so indices map back to units.
    char2unit = load_vocab(params['data']['vocab'])
    unit2char = {i: c for c, i in char2unit.items()}

    data_loader = FeatureLoader(params, args.decode_set, is_eval=True)

    recognizer = TransformerRecognizer(
        model,
        lm=lm,
        lm_weight=args.lm_weight,
        unit2char=unit2char,
        beam_width=args.beam_width,
        max_len=args.max_len,
        penalty=args.penalty,
        lamda=args.lamda,
        ngpu=args.ngpu)

    totals = len(data_loader.dataset)
    batch_size = params['data']['batch_size']

    predict_path = os.path.join(decode_dir, 'predict.txt')
    reference_path = os.path.join(decode_dir, 'reference.txt')
    # The context manager closes both files even if decoding fails midway.
    with open(predict_path, 'w') as writer, open(reference_path, 'w') as writerRef:
        for step, (utt_id, batch) in enumerate(data_loader.loader):
            inputs = batch['inputs']
            inputs_length = batch['inputs_length']
            if args.ngpu > 0:
                inputs = inputs.cuda()
                inputs_length = inputs_length.cuda()

            preds = recognizer.recognize(inputs, inputs_length)

            targets = batch['targets']
            targets_length = batch['targets_length']

            for b, pred in enumerate(preds):
                # Running utterance index across batches (for display only).
                n = step * batch_size + b
                # Index 0 is skipped -- presumably the start-of-sentence
                # token; confirm against the dataset's target layout.
                truth = ' '.join(
                    unit2char[i.item()]
                    for i in targets[b][1:targets_length[b] + 1])
                print('[%d / %d ] %s - pred : %s' % (n, totals, utt_id[b], pred))
                print('[%d / %d ] %s - truth: %s' % (n, totals, utt_id[b], truth))

                # Hypothesis and reference share the same id suffix so the
                # two files can be aligned line by line.
                suffix = ' (' + _speaker_label(utt_id[b]) + '-' + utt_id[b] + ')'
                writer.write(pred + suffix + '\n')
                writerRef.write(truth + suffix + '\n')


def _speaker_label(utt_id):
    """Return the speaker label placed before ``-<utt_id>`` in output lines.

    Rules, in priority order: character 7 equal to ``'1'`` gives
    ``'S1000'``; equal to ``'2'`` gives ``'S2000'``; an id starting with
    ``'O'`` gives ``'O'``; otherwise characters 6..10 of the id are used.
    Slices are used instead of indexing so ids shorter than eight
    characters fall through to the later rules rather than raising
    IndexError.
    """
    marker = utt_id[7:8]
    if marker == '1':
        return 'S1000'
    if marker == '2':
        return 'S2000'
    if utt_id[:1] == 'O':
        return 'O'
    return utt_id[6:11]