def _load_speech_feat_map(ftype):
    """Load the utterance -> ark-location map for speech features ('train'/'val')."""
    path = '{}s2t/{}.feat_map.pkl'.format(conf['data_dir'], ftype)
    with open(path, 'rb') as f:
        return pickle.load(f)


def _load_video_feats(ftype):
    """Load the precomputed video feature matrix (one row per pair index)."""
    return np.load(
        '{}v2t/resnext101-action-avgpool-300h/{}.npy'.format(
            conf['data_dir'], ftype))


def getVectorizedPairBatch(pair_batch, modality, train=True):
    """Vectorize a batch of (input, target) pairs for the given modality.

    Args:
        pair_batch: list of (x, y) pairs; x is an utterance key for speech
            modalities, y is the paired target (discarded for 'ss'/'vv').
        modality: 's', 'ss', 'v', 'vv', 'sv' or 'ss-vv'.
        train: selects the 'train' vs 'val' feature files; for 'sv'/'ss-vv'
            it also controls whether the text target is kept.

    Returns:
        A list of vectorized pairs (2-element lists, or 3-element lists for
        'sv'/'ss-vv' in eval mode, where the text is appended).
    """
    ftype = 'train' if train else 'val'
    if modality == 's':
        feat_map = _load_speech_feat_map(ftype)
        return [[ki.read_mat(ku.get_ark_rep(x, feat_map)), y]
                for x, y in pair_batch]
    elif modality == 'ss':
        feat_map = _load_speech_feat_map(ftype)
        final_pair_batch = []
        for x, _ in pair_batch:
            e = ki.read_mat(ku.get_ark_rep(x, feat_map))
            final_pair_batch.append([e, e])
        return final_pair_batch
    elif modality == 'v':
        video_feats = _load_video_feats(ftype)
        return [[video_feats[idx], y]
                for idx, (_, y) in enumerate(pair_batch)]
    elif modality == 'vv':
        video_feats = _load_video_feats(ftype)
        final_pair_batch = []
        for idx, _ in enumerate(pair_batch):
            e = video_feats[idx]
            final_pair_batch.append([e, e])
        return final_pair_batch
    elif modality in ['sv', 'ss-vv']:
        # Speech + video; the text (t) is only kept in eval mode.
        feat_map = _load_speech_feat_map(ftype)
        video_feats = _load_video_feats(ftype)
        final_pair_batch = []
        for idx, (x, t) in enumerate(pair_batch):
            pair = [video_feats[idx], ki.read_mat(ku.get_ark_rep(x, feat_map))]
            if not train:
                pair.append(t)
            final_pair_batch.append(pair)
        return final_pair_batch
def __getitem__(self, idx):
    """Return data for the recording at *idx*.

    Returns:
        (okay_feats, okay_spkrs, ref_rttm_lines, newsegcols, rec) where
        okay_* contain only the utterances whose feature files could be
        read; newsegcols are the segments columns filtered to those rows.
    """
    rec = self.recs[idx]
    utts = self.rec2utt_dict[rec]
    spkrs = [self.utt2spk_dict[u] for u in utts]
    ref_rttm_lines = self.rttm_lines_from_rec(rec)
    segcols = self.segments_cols_from_rec(rec)
    okay_feats = []
    okay_spkrs = []
    okay_idx = []
    fpaths = [self.utt_fpath_dict[utt] for utt in utts]
    for i, fpath in enumerate(fpaths):
        try:
            feats = torch.FloatTensor(read_mat(fpath))
        except Exception:
            # was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; unreadable files are skipped
            print('Reading utterance {} failed'.format(utts[i]))
            continue
        okay_feats.append(feats)
        okay_spkrs.append(spkrs[i])
        okay_idx.append(i)
    # dtype=int keeps indexing valid even when no utterance was readable
    # (np.array([]) would default to float and break fancy indexing)
    okay_idx = np.array(okay_idx, dtype=int)
    newsegcols = [c[okay_idx] for c in segcols]
    return okay_feats, okay_spkrs, ref_rttm_lines, newsegcols, rec
def recognize(args):
    """Decode every utterance in args.recog_json with a Transformer ASR model.

    Builds the encoder/decoder from args, loads weights from args.model_path,
    runs recognition on CUDA and writes the n-best hypotheses as JSON to
    args.result_label.
    """
    # model
    char_list, sos_id, eos_id = process_dict(args.dict)
    vocab_size = len(char_list)
    encoder = Encoder(
        args.d_input * args.LFR_m,
        args.n_layers_enc,
        args.n_head,
        args.d_k,
        args.d_v,
        args.d_model,
        args.d_inner,
        dropout=args.dropout,
        pe_maxlen=args.pe_maxlen,
    )
    decoder = Decoder(
        sos_id,
        eos_id,
        vocab_size,
        args.d_word_vec,
        args.n_layers_dec,
        args.n_head,
        args.d_k,
        args.d_v,
        args.d_model,
        args.d_inner,
        dropout=args.dropout,
        tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
        pe_maxlen=args.pe_maxlen,
    )
    model = Transformer(encoder, decoder)
    model.load_state_dict(flow.load(args.model_path))
    device = flow.device("cuda")
    model.eval()
    model.to(device)
    LFR_m = args.LFR_m
    LFR_n = args.LFR_n
    # (a second, redundant process_dict() call was removed here)
    assert model.decoder.sos_id == sos_id and model.decoder.eos_id == eos_id

    # read json data
    with open(args.recog_json, "rb") as f:
        js = json.load(f)["utts"]

    # decode each utterance
    new_js = {}
    with flow.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            print("(%d/%d) decoding %s" % (idx, len(js.keys()), name), flush=True)
            # renamed from `input` to avoid shadowing the builtin
            feats = kaldi_io.read_mat(js[name]["input"][0]["feat"])
            feats = build_LFR_features(feats, LFR_m, LFR_n)
            feats = flow.tensor(feats).to(dtype=flow.float32)
            feats_length = flow.tensor([feats.size(0)], dtype=flow.int64)
            feats = feats.to(device)
            feats_length = feats_length.to(device)
            nbest_hyps = model.recognize(feats, feats_length, char_list, args)
            new_js[name] = add_results_to_json(js[name], nbest_hyps, char_list)

    with open(args.result_label, "wb") as f:
        f.write(json.dumps({"utts": new_js}, indent=4, sort_keys=True).encode("utf_8"))
def recognize(args):
    """Decode every utterance in args.recog_json with a saved Transformer
    model plus a KenLM language model, writing n-best results to
    args.result_label as JSON.
    """
    model, LFR_m, LFR_n = Transformer.load_model(args.model_path)
    print(model)
    model.eval()
    model.cuda()
    char_list, sos_id, eos_id = process_dict(args.dict)
    assert model.decoder.sos_id == sos_id and model.decoder.eos_id == eos_id

    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']

    # import Language Model
    lm_model = kenlm.Model(args.lm_path)

    # decode each utterance
    new_js = {}
    with torch.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            print('(%d/%d) decoding %s' % (idx, len(js.keys()), name),
                  flush=True)
            # renamed from `input` to avoid shadowing the builtin
            feats = kaldi_io.read_mat(js[name]['input'][0]['feat'])  # TxD
            feats = build_LFR_features(feats, LFR_m, LFR_n)
            feats = torch.from_numpy(feats).float()
            feats_length = torch.tensor([feats.size(0)], dtype=torch.int)
            feats = feats.cuda()
            feats_length = feats_length.cuda()
            nbest_hyps = model.recognize(feats, feats_length, char_list,
                                         lm_model, args)
            new_js[name] = add_results_to_json(js[name], nbest_hyps,
                                               char_list)

    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_js
            }, indent=4, sort_keys=True).encode('utf_8'))
def __getitem__(self, counter):
    """Translate the external counter through the shuffle mapping and load
    one (utt_id, features, label) sample; features gain a leading axis."""
    internal_idx = self.mapping[counter]
    utt_id = self.key_dic[internal_idx]
    feats = read_mat(self.ark_dic[internal_idx])
    X = np.expand_dims(feats, axis=0)
    y = self.label_dic[internal_idx]
    return utt_id, X, y
def load_utt(ark, utt, position):
    """Read the matrix for *utt* stored in *ark* at byte *position*.

    *position* points just past the utterance key, so we seek back over the
    key (plus one separator byte) and let kaldi_io re-read it, validating
    that the key on disk matches the expected one.

    Raises:
        AssertionError: if the key found at *position* is not *utt*.
    """
    with open(ark, 'rb') as f:
        f.seek(position - len(utt) - 1)
        ark_key = kaldi_io.read_key(f)
        # explicit raise instead of `assert`: input validation must not be
        # stripped when running with python -O
        if ark_key != utt:
            raise AssertionError(
                f'Keys does not match: `{ark_key}` and `{utt}`.')
        return kaldi_io.read_mat(f)
def __init__(self, feat_dir, fst_dir):
    """Build a chain-training dataset from Kaldi features and training FSTs.

    Reads feats.scp from *feat_dir* and fst_nor.1.scp from *fst_dir*,
    pairs each utterance's features with its FST graph, and sorts the
    resulting samples by feature length (for similar-length minibatching).
    On first use, generates utt2featlen.txt by reading every feature matrix.
    """
    super(ChainDataset, self).__init__()
    self.feat_dir = feat_dir
    self.fst_dir = fst_dir
    self.feat_scp = os.path.join(self.feat_dir, 'feats.scp')
    self.fst_scp = os.path.join(self.fst_dir, 'fst_nor.1.scp')
    self.feat_len_map = os.path.join(self.feat_dir, 'utt2featlen.txt')
    # One-time generation of the utt -> frame-count map; this reads every
    # feature matrix, so it is slow but only happens once.
    if not os.path.exists(self.feat_len_map):
        print(
            '{} does not exist, generating utt2featlen.txt (a map from utt_id'
            ' to feature length) for the first time... It is used to form a minibatch'
            ' with similar length in training.'.format(self.feat_len_map))
        with open(self.feat_len_map, 'w') as map_f:
            with open(self.feat_scp) as f:
                for i, line in tqdm(enumerate(f)):
                    utt_id, feat_ark = line.strip().split()
                    feat = kaldi_io.read_mat(feat_ark)
                    feat_len = feat.shape[0]
                    map_f.write('{} {}\n'.format(utt_id, feat_len))
    # Pairing
    self.samples = []  # list of dicts
    self.utt_ids = {}  # a dict that maps utt_ids(str) to id(int)
    self.samples_tmp = []
    # Pass 1: index the utterances and record their ark locations.
    with open(self.feat_scp) as f:
        for i, line in enumerate(f):
            utt_id, feat_ark = line.strip().split()
            self.utt_ids[utt_id] = i
            self.samples_tmp.append({'utt_id': utt_id, 'feat': feat_ark})
    # Pass 2: attach each utterance's frame count.
    with open(self.feat_len_map) as f:
        for i, line in enumerate(f):
            utt_id, feat_len = line.strip().split()
            id = self.utt_ids[utt_id]
            self.samples_tmp[id]['feat_len'] = int(feat_len)
    # we always cache all fsts into memory at once as its relatively small
    with open(self.fst_scp) as f:
        print("Loading training FSTs...")
        for i, line in tqdm(enumerate(f)):
            utt_id, fst_rxf = line.strip().split()
            if utt_id not in self.utt_ids:
                raise ValueError(
                    '{} has no corresponding feats'.format(utt_id))
            id = self.utt_ids[utt_id]
            # The rxfile is "<path>:<offset>"; the path is rebased onto
            # fst_dir (only its basename is kept) before reading the FST.
            filename, offset = self.parse_rxfile(fst_rxf)
            filename = filename.split('/')[-1]
            file_path = os.path.join(self.fst_dir, filename)
            fst = simplefst.StdVectorFst.read_ark(file_path, offset)
            graph = ChainGraph(fst)
            dict_tmp = self.samples_tmp[id]
            dict_tmp['graph'] = graph
            # NOTE(review): only utterances that have an FST end up in
            # self.samples; feats without an FST are silently dropped.
            self.samples.append(dict_tmp)
            #self.samples[id]['graph'] = graph
    # sort the samples by their feature length
    self.samples = sorted(self.samples, key=lambda sample: sample['feat_len'])
def load_data(trainning_triples):
    """Turn (key, src_rxfile, tgt_seq) triples into padded numpy batches.

    Args:
        trainning_triples: iterable of 3-tuples; element 1 is a kaldi
            rxfile for the source features, element 2 the target sequence.

    Returns:
        dict with keys 'key', 'src_seq', 'src_pad_mask', 'tgt_seq',
        'tgt_pad_mask'; all sequence entries are numpy arrays padded to
        the longest item in the batch.
    """
    keys = [triple[0] for triple in trainning_triples]
    # read each source feature matrix from its ark location
    src_seqs = [kaldi_io.read_mat(triple[1]) for triple in trainning_triples]
    tgt_seqs = [triple[2] for triple in trainning_triples]

    src_seqs, src_pad_mask = instances_handler.pad_to_longest(src_seqs)
    tgt_seqs, tgt_pad_mask = instances_handler.pad_to_longest(tgt_seqs)

    return {
        'key': keys,
        'src_seq': np.array(src_seqs),
        'src_pad_mask': np.array(src_pad_mask),
        'tgt_seq': np.array(tgt_seqs),
        'tgt_pad_mask': np.array(tgt_pad_mask)
    }
def compute_num_frames_from_feat_or_waveform(rxfile: str) -> int:
    """Return the number of feature frames for *rxfile*.

    Three source formats are supported, detected from the string:
      * "<path>.ark:<offset>"  -> a Kaldi feature matrix (frame count is
        its first dimension);
      * "...|"                 -> a shell command producing a waveform on
        stdout (frames computed from sample count at 25ms/10ms framing);
      * anything else          -> a raw waveform file read via soundfile.

    Raises:
        ImportError: if the needed optional dependency is missing.
        Exception: if the Kaldi feature matrix cannot be read.
    """
    if re.search(r"\.ark:\d+$", rxfile.strip()) is not None:  # from feats.scp
        if not has_kaldi_io:
            raise ImportError(
                "Please install kaldi_io with: pip install kaldi_io")
        try:
            feat = kaldi_io.read_mat(rxfile)
        except Exception as e:
            # chain the original error so the root cause is not lost
            raise Exception(
                "failed to read feature matrix {}.".format(rxfile)) from e
        assert feat is not None and isinstance(feat, np.ndarray)
        num_frames = feat.shape[0]
    elif re.search(r"\|$", rxfile.strip()) is not None:  # from a command
        source = BytesIO(run(rxfile[:-1], shell=True, stdout=PIPE).stdout)
        waveform, sample_rate = get_waveform(source, always_2d=True)
        num_frames = num_samples_to_num_frames(waveform.shape[1],
                                               sample_rate,
                                               frame_length=25.0,
                                               frame_shift=10.0)
    else:  # from a raw waveform file
        if not has_soundfile:
            raise ImportError(
                "Please install soundfile with: pip install soundfile")
        info = soundfile.info(rxfile)
        num_frames = num_samples_to_num_frames(info.frames,
                                               info.samplerate,
                                               frame_length=25.0,
                                               frame_shift=10.0)
    return num_frames
def load_inputs_and_targets(batch, token2idx, label_type, LFR_m, LFR_n):
    """Load acoustic features and token-id targets for a minibatch.

    Adapted from espnet/src/asr/asr_utils.py:load_inputs_and_targets.
    Samples with empty targets are dropped; the rest are sorted by
    decreasing input length. Returns (xs, ys).
    """
    xs = [kaldi_io.read_mat(b[1]['input'][0]['feat']) for b in batch]
    ys = [b[1]['output'][0][label_type].split() for b in batch]

    # optional low-frame-rate stacking / subsampling
    if LFR_m != 1 or LFR_n != 1:
        xs = [build_LFR_features(x, LFR_m, LFR_n) for x in xs]

    # keep only samples with a non-empty target, longest input first
    keep = [i for i in range(len(xs)) if len(ys[i]) > 0]
    keep.sort(key=lambda i: -len(xs[i]))
    if len(keep) != len(xs):
        print("warning: Target sequences include empty token")

    xs = [xs[i] for i in keep]
    ys = [
        np.fromiter((token2idx[tok] for tok in ys[i]), dtype=np.int64)
        for i in keep
    ]
    return xs, ys
def __init__(self, mu_path, ep_path, transform_dir, enroll_feats, test_feats,
             trial):
    """Set up an adapted-PLDA scorer.

    Loads between/within covariance estimates (S_mu, S_ep), enrollment and
    test vectors, applies an adaptation transform read from *transform_dir*
    and a PCA transform estimated from the enrollment vectors, and
    precomputes the quadratic-form matrices A and G used for scoring.
    """
    # between- and within-class covariance estimates
    self.S_mu = np.load(mu_path)
    self.S_ep = np.load(ep_path)
    self.enroll = {}
    for key, mat in kaldi_io.read_vec_flt_scp(enroll_feats):
        self.enroll[key] = mat
    self.test = {}
    for key, mat in kaldi_io.read_vec_flt_scp(test_feats):
        self.test[key] = mat
    self.len = len(self.test)
    # NOTE(review): scores is square in the number of *test* vectors even
    # though one axis presumably indexes enrollment — confirm the intended
    # shape against the scoring code.
    self.scores = np.zeros((self.len, self.len))
    self.trial_path = trial
    # re-read the enrollment vectors to estimate a PCA transform
    xvectors = []
    for _, mat in kaldi_io.read_vec_flt_scp(enroll_feats):
        xvectors.append(mat)
    xvectors = np.array(xvectors)
    transform = EstPca(xvectors, target_energy=0.1)
    adapt_transform = np.array(kaldi_io.read_mat(transform_dir))
    # NOTE(review): adapt_transform (loaded from transform_dir) is applied
    # below, yet self.adapt_transform stores the PCA transform instead —
    # looks like a possible mix-up; verify which one callers expect.
    self.adapt_transform = transform
    self.transform = transform
    # project the covariances through the adaptation transform, then into
    # the PCA space
    self.S_mu = np.dot(np.dot(adapt_transform, self.S_mu), adapt_transform.T)
    self.S_ep = np.dot(np.dot(adapt_transform, self.S_ep), adapt_transform.T)
    self.S_mu = np.dot(np.dot(transform.T, self.S_mu), transform)
    self.S_ep = np.dot(np.dot(transform.T, self.S_ep), transform)
    # precompute the PLDA scoring matrices (pinv tolerates singular inputs)
    F = np.linalg.pinv(self.S_ep)
    G = np.dot(
        np.dot(-np.linalg.pinv(2 * self.S_mu + self.S_ep), self.S_mu), F)
    self.A = np.linalg.pinv(self.S_mu + self.S_ep) - (F + G)
    self.G = G
    return
def func(utt_ids):
    """Read each utterance's feature matrix and concatenate them along time."""
    mats = [kaldi_io.read_mat(raw_src_datas[utt_id]) for utt_id in utt_ids]
    return np.concatenate(mats, axis=0)
def save_data_info_tar(tar_file_path, minibatch_info, all_data_info, fea_dim,
                       logger, downsampled=False):
    """Write each minibatch of features as a float16 .npy member of a tar.

    Args:
        tar_file_path: output tar file path.
        minibatch_info: per-minibatch metadata; [i][1] is the frame length.
        all_data_info: per-minibatch list of (rxfile, start, length) tuples.
        fea_dim: feature dimensionality (last axis of every matrix).
        logger: logger for progress messages.
        downsampled: if True, keep every second frame (starting at frame 1)
            and halve the stored lengths accordingly.
    """
    # context manager guarantees the tar is closed even if a read fails
    with tarfile.TarFile(tar_file_path, 'w') as tar_file:
        for i in range(all_data_info.shape[0]):
            logger.info('Writing minibatch: %d' % (i + 1))
            # integer division: these are frame counts used as array dims
            # (the old `/ 2` produced floats under Python 3 and np.zeros
            # rejects float dimensions)
            len_1 = minibatch_info[i][1] // 2 if downsampled else \
                minibatch_info[i][1]
            mat = np.zeros((len(all_data_info[i]), len_1, fea_dim),
                           dtype=np.float16)
            for j, read_info in enumerate(all_data_info[i]):
                m = kaldi_io.read_mat(read_info[0])
                len_2 = read_info[2] // 2 if downsampled else read_info[2]
                assert m.shape[1] == mat.shape[2] and len_2 == mat.shape[1]
                temp = m[read_info[1]:read_info[1] + read_info[2], :]
                if downsampled:
                    # start from frame 1 to work fine for both odd and even
                    # array size
                    temp = temp[1::2, :]
                assert temp.shape[0] == len_2
                mat[j, :, :] = temp.astype(dtype=np.float16)
            __add2tar_file(tar_file, mat, 'minibatch_' + str(i) + '.npy')
def make_kaldi_ds(ds_path, seq_len=400, evaluation=False, trials=None):
    """
    Make a SpeakerDataset from only the path of the kaldi dataset.
    This function will use the files 'feats.scp', 'utt2spk' 'spk2utt'
    present in ds_path to create the SpeakerDataset.

    Args:
        ds_path: a kaldi dataset directory (or a list of them to merge).
        seq_len: training chunk length passed through to SpeakerDataset.
        evaluation / trials: forwarded to SpeakerDataset unchanged.
    """
    if not isinstance(ds_path, list):
        ds_path = [ds_path]
    utt2spk, spk2utt, utt2path = {}, {}, {}
    for path in ds_path:  # index was unused; iterate directly
        utt2path.update(data_io.read_scp(path / 'feats.scp'))
        utt2spk.update(data_io.read_scp(path / 'utt2spk'))
        # can't do spk2utt.update(t_spk2utt) as update is not additive:
        # a speaker may appear in several dataset dirs, so extend instead
        t_spk2utt = data_io.load_one_tomany(path / 'spk2utt')
        for spk, utts in t_spk2utt.items():
            spk2utt.setdefault(spk, []).extend(utts)
    ds = SpeakerDataset(
        utt2path=utt2path,
        utt2spk=utt2spk,
        spk2utt=spk2utt,
        loading_method=lambda path: torch.FloatTensor(read_mat(path)),
        seq_len=seq_len,
        evaluation=evaluation,
        trials=trials,
    )
    return ds
def load_mfccs_from_numbatch_old(mega_dict, num_to_id_dict, data, device):
    """Load MFCC matrices (D x T) for a batch of numeric utterance ids.

    If all utterances have the same duration, returns a single stacked
    tensor. Otherwise groups utterances by duration (sorted ascending) and
    returns (list of per-duration tensors, sort_idx, unsort_idx) so callers
    can restore the original order.
    """
    data_mfcc = []
    durs = []
    for i, d in enumerate(data):
        fd = mega_dict[num_to_id_dict[int(d)]]
        data_mfcc_temp = kaldi_io.read_mat(fd).T  # (feat_dim, n_frames)
        data_mfcc.append(data_mfcc_temp)
        durs.append(data_mfcc_temp.shape[1])
    try:
        # succeeds only when all durations are equal (rectangular stack)
        tmparr = np.asarray(data_mfcc)
        data_mfcc = tmparr
    except Exception:
        # was a bare `except:`; ragged durations raise ValueError on modern
        # NumPy — fall back to an explicit object array
        tmparr = np.empty(len(data_mfcc), dtype=object)
        for i in range(len(data_mfcc)):
            tmparr[i] = data_mfcc[i]
        data_mfcc = tmparr
    if len(data_mfcc.shape) > 1:
        # uniform durations: one stacked (batch, feat_dim, n_frames) tensor
        tensor_X = torch.from_numpy(np.asarray(data_mfcc)).float().to(device)
        return tensor_X
    else:
        # mixed durations: sort by duration and split into equal-length runs
        sorted_durs, sort_idx = torch.sort(torch.tensor(durs))
        data_mfcc = data_mfcc[sort_idx]
        _, unsort_idx = torch.sort(sort_idx)
        uniq_durs, uniq_counts = torch.unique(sorted_durs, return_counts=True)
        split_sections = tuple(np.cumsum(uniq_counts)[:-1])
        data_mfcc = np.split(data_mfcc, split_sections)
        tensor_mfcc_list = [
            torch.from_numpy(np.asarray(list(mfcc))).float().to(device)
            for mfcc in data_mfcc
        ]
        return tensor_mfcc_list, sort_idx, unsort_idx
def extract_till_plda_embeddings(model, mega_mfcc_dict, data_loader,
                                 num_to_id_dict, device):
    """Extract PLDA embeddings for every utterance referenced by the loader.

    Returns a dict mapping utterance id -> numpy embedding.
    """
    # Collect unique utterance ids, preserving first-seen order. A set
    # replaces the original O(n) `not in list` scans, and one pass covers
    # both halves of each pair instead of two duplicated loops.
    utts = []
    seen = set()
    for x1, x2, l in data_loader:
        for half in (x1, x2):
            for x in half:
                utt = num_to_id_dict[int(x)]
                if utt not in seen:
                    seen.add(utt)
                    utts.append(utt)
    # Here we are forward passing each MFCC one by one, and this is very
    # slow. We need to figure out a way to group the mfccs of similar
    # durations, and extract embeddings together to improve speed and
    # efficiently utilize GPUs.
    model.eval()
    extracted_plda_embeddings = {}
    with torch.no_grad():
        for utt in utts:  # utts is already de-duplicated
            mfcc = kaldi_io.read_mat(mega_mfcc_dict[utt]).T
            mfcc_t = torch.from_numpy(mfcc[np.newaxis, :, :]).to(device)
            data_extracted_plda_embeddings = model.extract_plda_embeddings(
                mfcc_t)
            extracted_plda_embeddings[utt] = np.asarray(
                data_extracted_plda_embeddings[0].cpu())
    return extracted_plda_embeddings
def load_inputs_and_targets(batch):
    """Load acoustic features and integer token-id targets for a minibatch.

    Adapted from espnet/src/asr/asr_utils.py:load_inputs_and_targets.
    Each feature matrix is truncated to a multiple of 4 frames; samples
    with empty targets are dropped and the rest sorted by decreasing
    input length. Returns (xs, ys).
    """
    raw_feats = [kaldi_io.read_mat(b[1]['input']['feat']) for b in batch]
    ys = [b[1]['output']['tokenid'] for b in batch]
    # truncate every utterance to a frame count divisible by 4
    xs = [x[:x.shape[0] // 4 * 4] for x in raw_feats]

    # keep only samples with a non-empty target, longest input first
    keep = [i for i in range(len(xs)) if len(ys[i]) > 0]
    keep.sort(key=lambda i: -len(xs[i]))
    if len(keep) != len(xs):
        print("warning: Target sequences include empty tokenid")

    xs = [xs[i] for i in keep]
    ys = [
        np.fromiter((int(t) for t in ys[i]), dtype=np.int64) for i in keep
    ]
    return xs, ys
def get_item_test(self, idx):
    """Load one utterance's features and the label tensors requested by
    self.label_types. Returns (feats, label_dict)."""
    utt = self.utt_list[idx]
    feats = torch.FloatTensor(read_mat(self.utt_fpath_dict[utt]))
    speaker = self.utt_spkr_dict[utt]
    # one lazy constructor per supported label type; only the types listed
    # in self.label_types are materialized
    makers = {
        'speaker': lambda: torch.LongTensor([speaker]),
        'gender': lambda: torch.LongTensor([self.spk_gen_dict[speaker]]),
        'nationality':
            lambda: torch.LongTensor([self.spk_nat_dict[speaker]]),
        'age': lambda: torch.LongTensor([self.utt_age_class_dict[utt]]),
        'age_regression':
            lambda: torch.FloatTensor([self.utt_age_dict[utt]]),
    }
    label_dict = {
        name: make()
        for name, make in makers.items() if name in self.label_types
    }
    return feats, label_dict
def __getitem__(self, idx):
    """Return (utt_id, features, transcript_ids) for one utterance.

    Optionally appends delta features, normalizes per utterance, and adds
    Gaussian noise at train time.
    """
    utt_infos = self.data_list[idx]
    utt_id = utt_infos[0]
    utt_feats = kaldi_io.read_mat(utt_infos[1]['input'][0]['feat'])
    # add delta feats (each order computed from the previous one)
    if self.delta_feats_num > 0:
        utt_feats = [utt_feats]
        for i in range(self.delta_feats_num):
            delta_feats = self.delta(utt_feats[i], N=2)
            utt_feats.append(delta_feats)
        utt_feats = np.concatenate(utt_feats, axis=1)
    if self.normalized:
        # FLAG:should do in the train dataset with kaldi
        # NOTE(review): per-utterance CMVN; a constant feature dim would
        # divide by zero here
        utt_feats = (utt_feats - np.mean(utt_feats, axis=0)) / np.std(
            utt_feats, axis=0)
    # add gaussian noise every time __getitem__ is called
    # Note that In graves, etc SPEECH RECOGNITION WITH DEEP RECURRENT
    # NEURAL NETWORKS Weight noise was added once per training sequence,
    # rather than at every timestep
    if self.add_noise and self.dataset_type == 'train':
        utt_feats = np.add(utt_feats,
                           np.random.normal(0, 0.6, utt_feats.shape))
    # materialize the ids: a bare Py3 `map` is a one-shot iterator and
    # would be silently exhausted after a single use by the caller
    transcript_ids = list(
        map(int, utt_infos[1]['output'][0]['token_id'].strip().split()))
    return (utt_id, utt_feats, transcript_ids)
def __getitem__(self, idx):
    """Return (key, out[, label]): features with left/right context frames
    spliced into each row, with optional temporal subsampling.

    Each output row is [left ctx frames | centre frame | right ctx frames]
    flattened to window_size * feat_dim columns. When self._context_pad is
    set, the input is padded so every frame yields a row; otherwise edge
    frames without full context are dropped.
    """
    key, path, _ = self._index[idx]
    feat = kaldi_io.read_mat(path)
    shape = feat.shape
    # columns per output row: centre frame plus all context frames
    window_size = 1 + self._left_context + self._right_context
    # row count after subsampling; without padding, the frames that lack
    # full context (left+right of them) are excluded from the count
    out = np.zeros((int(
        math.ceil(
            float(shape[0] - (
                (self._left_context + self._right_context)
                if not self._context_pad else 0)) / self._sub_sample)),
        window_size * shape[1]))
    if self._left_context > 0 or self._right_context > 0:
        # NOTE(review): both pad amounts use self._right_context — confirm
        # the left side is intentionally padded by _right_context frames
        feat = np.pad(feat,
                      ((self._right_context if self._context_pad else 0,
                        self._right_context if self._context_pad else
                        (shape[0] - window_size) % self._sub_sample),
                       (0, 0)), 'edge')
    # centre frame goes into the middle slot of each spliced row
    out[:, self._left_context * shape[1]:(self._left_context + 1) * shape[1]] = \
        feat[self._left_context:feat.shape[0] - self._right_context:self._sub_sample]
    for i in range(self._left_context):  # left context
        out[:, shape[1] * i:shape[1] * (i + 1)] = \
            feat[i:feat.shape[0] - self._left_context - self._right_context + i:self._sub_sample, :]
    for i in range(self._right_context):  # right context
        out[:, shape[1] * (i + self._left_context + 1):shape[1] * (i + self._left_context + 2)] = \
            feat[self._left_context + 1 + i:feat.shape[0] - self._right_context + 1 + i:self._sub_sample, :]
    # labels are optional (e.g. absent at decode time)
    if len(self._labels) > 0:
        return key, out, self._labels[key]
    else:
        return key, out
def __getitem__(self, i):
    """Return utterance *i*'s features as a float tensor, applying
    SpecAugment (seeded per seed/epoch/index) when a config is set."""
    self.check_index(i)
    feat = kaldi_io.read_mat(self.rxfiles[i])
    config = self.specaugment_config
    if config:  # both None and "" disable augmentation
        # NOTE(review): eval() on the config string — acceptable only
        # because the config comes from trusted project settings, never
        # from user input
        with data_utils.numpy_seed(self.seed, self.epoch, i):
            feat = specaug(feat, **eval(config))
    return torch.from_numpy(feat).float()
def read_data(self):
    """Read every feature matrix into one contiguous buffer.

    self.data_offsets[i] is the row at which utterance i begins inside
    self.buffer (exclusive prefix sum of self.sizes).
    """
    self.data_offsets = np.append([0], np.cumsum(self.sizes)[:-1])
    self.buffer = np.empty((sum(self.sizes), self.feat_dim),
                           dtype=self.dtype)
    for offset, size, rxfile in zip(self.data_offsets, self.sizes,
                                    self.extended_filenames):
        self.buffer[offset:offset + size] = kaldi_io.read_mat(rxfile)
def __getitem__(self, i):
    """Return utterance *i*'s features as a float tensor, served from a
    read-ahead cache when prefetch() has been called.

    On a cache miss the cache is rebuilt starting from *i*'s position in
    self.ordered_indices, holding up to self.cache_size utterances, with
    SpecAugment applied (deterministically per seed/epoch/index) as the
    features are loaded.
    """
    self.check_index(i)
    if not self.prefetch_called:  # no caching
        feat = kaldi_io.read_mat(self.rxfiles[i])
        return torch.from_numpy(feat).float()
    if i not in self.cache_index:
        # cache miss: locate i in the prefetch order and refill the cache
        # from there
        assert (
            self.start_pos_for_next_cache < len(self.ordered_indices)
        ), "Position for next cache starting beyond the end of ordered_indices."
        try:
            pos_start = self.ordered_indices.index(
                i,
                self.start_pos_for_next_cache,
            )
        except ValueError:
            raise ValueError(
                "index {} not found in self.ordered_indices. Set "
                "self.ordered_prefetch to False, and/or call self.prefetch() "
                "with the full list of indices, and then try again.".
                format(i))
        pos_end = min(
            pos_start + self.cache_size,
            len(self.ordered_indices),
        )
        # with ordered prefetch the next refill resumes where this one
        # ends; otherwise every refill searches from the beginning
        self.start_pos_for_next_cache = pos_end if self.ordered_prefetch else 0
        # size the cache buffer to hold all cached utterances contiguously
        total_size = 0
        for idx in self.ordered_indices[pos_start:pos_end]:
            total_size += self.sizes[idx]
        self.cache = np.empty((total_size, self.feat_dim), dtype=self.dtype)
        ptx = 0
        self.cache_index.clear()
        for idx in self.ordered_indices[pos_start:pos_end]:
            self.cache_index[idx] = ptx  # row offset of idx inside cache
            length = self.sizes[idx]
            dst = self.cache[ptx:ptx + length]
            feat = kaldi_io.read_mat(self.rxfiles[idx])
            if self.specaugment_config is not None and self.specaugment_config != "":
                with data_utils.numpy_seed(self.seed, self.epoch, idx):
                    feat = specaug(feat, **eval(self.specaugment_config))
            np.copyto(dst, feat)
            ptx += length
    # serve from the cache; copy so the tensor survives the next refill
    ptx = self.cache_index[i]
    a = self.cache[ptx:ptx + self.sizes[i]].copy()
    return torch.from_numpy(a).float()
def __init__(
    self,
    utt_ids: List[str],
    rxfiles: List[str],
    utt2num_frames: Optional[List[int]] = None,
    feat_dim: Optional[
        int] = None,  # only relevant when reading from raw waveforms
    feature_type: Optional[
        str] = None,  # currently support fbank or mfcc; only relevant when reading from raw waveforms
    seed=1,
    feature_transforms_config: Optional[Dict[str, Any]] = None,
):
    """Audio dataset over Kaldi feats.scp entries, piped commands, or raw
    waveform files (format detected from the first rxfile).

    Utterance lengths come from utt2num_frames when given; otherwise they
    are computed from the audio in a thread pool.
    """
    super().__init__()
    assert len(utt_ids) == len(rxfiles)
    # np.float was removed in NumPy 1.24; it was an alias for the builtin
    # float (i.e. float64), so use the builtin directly.
    self.dtype = float
    self.utt_ids = utt_ids
    self.rxfiles = rxfiles
    self.size = len(utt_ids)  # number of utterances
    self.sizes = [
    ]  # length of each utterance in terms of the number of frames
    if utt2num_frames is not None and len(utt2num_frames) > 0:
        assert len(utt2num_frames) == self.size
        self.sizes = utt2num_frames

    first_rxfile = rxfiles[0]
    if re.search(r"\.ark:\d+$", first_rxfile.strip()) is not None:
        # from feats.scp
        self.input_format = "feat"
        self.feat_dim = kaldi_io.read_mat(first_rxfile).shape[
            1]  # feature dimension
    else:
        self.input_format = ("command" if re.search(
            r"\|$", first_rxfile.strip()) is not None else "wave")
        self.feat_dim = feat_dim
        self.feature_type = feature_type
        assert self.feat_dim is not None
        assert self.feature_type in ["fbank", "mfcc"]

    if len(self.sizes) == 0:
        logger.info("Computing number of frames from audios...")
        with ThreadPoolExecutor(max_workers=32) as ex:
            futures = []
            for rxfile in self.rxfiles:
                futures.append(
                    ex.submit(compute_num_frames_from_feat_or_waveform,
                              rxfile))
            for future in tqdm(futures, desc="Processing", leave=False):
                result = future.result()
                self.sizes.append(result)
    assert len(self.sizes) == self.size
    self.sizes = np.array(self.sizes, dtype=np.int32)
    self.feature_transforms = CompositeAudioFeatureTransform.from_config_dict(
        config=feature_transforms_config)
    self.seed = seed
    self.epoch = 1
def _load_cmvn(self, cmvn_file):
    """Parse Kaldi CMVN stats into per-dimension (means, stds).

    Row 0 holds per-dim sums with the frame count in the last column;
    row 1 holds per-dim sums of squares. Variances are floored at 1e-10
    to avoid zero stds.
    """
    stats = kaldi_io.read_mat(cmvn_file)
    assert stats.shape[0] == 2
    count = stats[0, -1]
    means = stats[0, :-1] / count
    variances = stats[1, :-1] / count - means ** 2
    stds = np.sqrt(np.maximum(1e-10, variances))
    return means, stds
def extract_xvectors(model, mega_mfcc_dict, device):
    """Extract an x-vector for every utterance in *mega_mfcc_dict*.

    Returns a dict mapping utterance id -> numpy embedding.
    """
    model.eval()
    xvectors = {}
    for utt, rxfile in mega_mfcc_dict.items():
        mfcc = kaldi_io.read_mat(rxfile).T  # transpose to (feat_dim, frames)
        batch = torch.from_numpy(mfcc[np.newaxis, :, :]).to(device)
        embedding = model.xvector_extractor.extract(batch)
        xvectors[utt] = embedding[0].cpu().detach().numpy()
    return xvectors
def main():
    """Plot a row slice of a matrix read from a Kaldi ark.

    argv: <ark_rxfile> <out_path> <row_start> <row_end>
    Saves the plot to <out_path>.png.
    """
    filename = sys.argv[1]
    out_path = sys.argv[2]
    row_start = int(sys.argv[3])
    row_end = int(sys.argv[4])
    # transpose, then keep only the requested row range
    emb = np.transpose(kaldi_io.read_mat(filename), (1, 0))[row_start:row_end, :]
    print(emb.shape)
    plt.matshow(emb, fignum=None)
    plt.savefig(out_path + '.png')
def read_all_data(feat_scp):
    """Read every feature matrix listed in *feat_scp* and stack them.

    Each scp line is "<utt_id> <ark_rxfile>". Returns one matrix with all
    utterances concatenated along the frame axis.
    """
    mats = []
    # context manager replaces open/readlines/close: the file is closed
    # even if a read fails, and lines are streamed rather than held at once
    with open(feat_scp, 'r') as f:
        for line in f:
            _, ark = line.split()
            mats.append(kaldi_io.read_mat(ark))
    return np.concatenate(mats, axis=0)
def read_mat_key(file, target_key):
    """read the matrix of the target key/utterance from a kaldi scp file

    Returns None when the key is not present.
    """
    fd = ko.open_or_fd(file)
    try:
        for line in fd:
            # split on whitespace with maxsplit=1: the previous
            # split(' ') left the trailing '\n' attached to the rxfile
            # and broke on lines with repeated spaces
            key, rxfile = line.decode().strip().split(None, 1)
            if key == target_key:
                return ko.read_mat(rxfile)
        return None
    finally:
        # only close the descriptor if we opened it ourselves
        if fd is not file:
            fd.close()
def __getitem__(self, index):
    """Generate samples: a random fixed-length chunk of utterance *index*
    together with its label array."""
    full_mat = kaldi_io.read_mat(self.rxfiles[index])
    n_frames = len(full_mat)
    assert n_frames >= self.seq_len
    # uniform random start; upper bound is inclusive of the last valid one
    start = np.random.randint(0, n_frames - self.seq_len + 1)
    chunk_mat = full_mat[start:start + self.seq_len, :]
    y = np.array(self.labels[index])
    return chunk_mat, y