def __init__(self, mu_path, ep_path, transform_dir, enroll_feats, test_feats, trial):
    self.S_mu = np.load(mu_path)
    self.S_ep = np.load(ep_path)
    self.enroll = {}
    for key, mat in kaldi_io.read_vec_flt_scp(enroll_feats):
        self.enroll[key] = mat
    self.test = {}
    for key, mat in kaldi_io.read_vec_flt_scp(test_feats):
        self.test[key] = mat
    self.len = len(self.test)
    self.scores = np.zeros((self.len, self.len))
    self.trial_path = trial
    # Estimate a PCA transform from the enrollment x-vectors.
    xvectors = np.array(list(self.enroll.values()))
    transform = EstPca(xvectors, target_energy=0.1)
    adapt_transform = np.array(kaldi_io.read_mat(transform_dir))
    self.adapt_transform = adapt_transform
    self.transform = transform
    # Apply the adaptation transform, then the PCA projection, to both
    # covariance matrices.
    self.S_mu = np.dot(np.dot(adapt_transform, self.S_mu), adapt_transform.T)
    self.S_ep = np.dot(np.dot(adapt_transform, self.S_ep), adapt_transform.T)
    self.S_mu = np.dot(np.dot(transform.T, self.S_mu), transform)
    self.S_ep = np.dot(np.dot(transform.T, self.S_ep), transform)
    # Precompute the scoring matrices.
    F = np.linalg.pinv(self.S_ep)
    G = np.dot(np.dot(-np.linalg.pinv(2 * self.S_mu + self.S_ep), self.S_mu), F)
    self.A = np.linalg.pinv(self.S_mu + self.S_ep) - (F + G)
    self.G = G

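# The method below is NOT part of the original class: a sketch of how the A
# and G matrices above are conventionally used. Under the two-covariance PLDA
# model (S_mu = between-speaker, S_ep = within-speaker covariance), the
# verification log-likelihood ratio is, up to an additive constant,
#     LLR(e, t) = 0.5 * e'Ae + 0.5 * t'At - e'Gt,
# with A and G exactly as computed in __init__. The method name is
# hypothetical; note the vectors must be projected the same way the
# covariances were.
def score_pair(self, enroll_key, test_key):
    # Combined projection: adaptation transform followed by PCA.
    M = np.dot(self.transform.T, self.adapt_transform)
    e = np.dot(M, self.enroll[enroll_key])
    t = np.dot(M, self.test[test_key])
    return (0.5 * e.dot(self.A).dot(e) + 0.5 * t.dot(self.A).dot(t)
            - e.dot(self.G).dot(t))
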
def pca(sFileTrai, sFileTest, iComponents):
    """Performs PCA.

    Keyword arguments:
    - sFileTrai: Path to the training ivector.scp file
    - sFileTest: Path to the testing ivector.scp file
    - iComponents: Number of PCA components to keep

    Returns:
    - vTraiPCA: PCA-transformed training data for all subject_id
    - vLTrai: Training labels for all subject_id
    - vTraiSubjectId: List of the subject_id for training
    - vTraiMeasurementId: List of the measurement_id for training
    - vTestPCA: PCA-transformed test data for all subject_id
    - vLTest: Test labels for all subject_id
    - vTestSubjectId: List of the subject_id for test
    - vTestMeasurementId: List of the measurement_id for test
    """
    dIvecTrai = {key: mat for key, mat in kaldi_io.read_vec_flt_scp(sFileTrai)}
    vTrai = pd.DataFrame(list(dIvecTrai.values()))
    # The last character of the key is the label.
    vLTrai = np.array([x[-1] for x in dIvecTrai.keys()])
    # Convert before constructing the PCA, since n_components must be numeric.
    if isinstance(iComponents, str):
        iComponents = int(iComponents)
    pca = PCA(n_components=iComponents, svd_solver='randomized', whiten=True)
    pca.fit(vTrai)
    vTraiPCA = pca.transform(vTrai)
    # FIXME: For REAL-PD we need more than -5 (a CIS-PD subject_id is 4 characters long)
    # FIXME: REAL-PD subject_ids are not only ints
    vTraiSubjectId = np.array([int(x[-5:-1]) for x in dIvecTrai.keys()])
    vTraiMeasurementId = np.array([x[-42:-6] for x in dIvecTrai.keys()])
    dIvecTest = {key: mat for key, mat in kaldi_io.read_vec_flt_scp(sFileTest)}
    vTest = np.array(list(dIvecTest.values()), dtype=float)
    vLTest = np.array([int(x[-1]) for x in dIvecTest.keys()])
    vTestSubjectId = np.array([int(x[-5:-1]) for x in dIvecTest.keys()])
    # An alternative way to extract the measurement_id via regex:
    #   r'(?<=trai_)[a-z\-0-9]+(?=[_])'
    vTestMeasurementId = np.array([x[-42:-6] for x in dIvecTest.keys()])
    vTestPCA = pca.transform(vTest)
    return (vTraiPCA, vLTrai, vTraiSubjectId, vTraiMeasurementId,
            vTestPCA, vLTest, vTestSubjectId, vTestMeasurementId)

def main():
    args = get_args()

    # Phone symbol table
    _, phone_int2sym = load_phone_symbol_table(args.phone_symbol_table)

    # Human expert scores
    score_of, phone_of = load_human_scores(args.human_scoring_json, floor=1)

    # Prepare training data
    train_data_of = {}
    for ph_key, feat in kaldi_io.read_vec_flt_scp(args.feature_scp):
        if ph_key not in score_of:
            print(f'Warning: no human score for {ph_key}')
            continue
        ph = int(feat[0])
        if phone_int2sym is not None and phone_int2sym[ph] != phone_of[ph_key]:
            print(f'Mismatch: {phone_int2sym[ph]} <--> {phone_of[ph_key]}')
            continue
        score = score_of[ph_key]
        train_data_of.setdefault(ph, []).append((score, feat[1:]))

    # Make the dataset more balanced
    train_data_of = add_more_negative_data(train_data_of)

    # Train the per-phone models in parallel
    with ProcessPoolExecutor(args.nj) as ex:
        future_to_model = [(ph, ex.submit(train_model_for_phone, pairs))
                           for ph, pairs in train_data_of.items()]
        model_of = {ph: future.result() for ph, future in future_to_model}

    # Write to file
    with open(args.model, 'wb') as f:
        pickle.dump(model_of, f)

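# `train_model_for_phone` is not shown in this excerpt. A plausible sketch,
# mirroring the per-phone training loop of the sequential variant later in
# this section (RandomForestRegressor plus balanced_sampling, both assumed
# to be in scope):
def train_model_for_phone(pairs):
    model = RandomForestRegressor()
    labels = np.array([label for label, _ in pairs]).reshape(-1, 1)
    feats = np.array([feat for _, feat in pairs])
    feats, labels = balanced_sampling(feats, labels)
    model.fit(feats, labels.ravel())
    return model
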
def extract_adv_voiced_feats(grads, vads, ori_feats, sigma):
    grads_data = {key: mat for key, mat in kaldi_io.read_mat_scp(grads)}
    vad_data = {key: vec for key, vec in kaldi_io.read_vec_flt_scp(vads)}
    ori_data = {key: mat for key, mat in kaldi_io.read_mat_scp(ori_feats)}
    num_vad = len(vad_data)
    num_ori = len(ori_data)
    trial_keys = list(grads_data.keys())
    assert num_vad == num_ori, \
        "Lengths do not match! (%d %d)" % (num_vad, num_ori)
    gen_mat = []
    for key in trial_keys:
        print('Processing utt %s.' % key)
        grads_mat = grads_data[key]
        # Strip the trial prefix to recover the test utterance id.
        testkey = key[26:]
        vad_vec = vad_data.get(testkey)
        ori_mat = ori_data.get(testkey)
        assert vad_vec is not None, 'No vad for %s %s' % (key, testkey)
        assert ori_mat is not None, 'No original feats for %s %s' % (key, testkey)
        # Perturb only the voiced frames: the gradient matrix has one row per
        # voiced frame, while the original features cover all frames.
        sen_mat = []
        k = 0
        for j in range(len(vad_vec)):
            if vad_vec[j] == 1.0:
                sen_mat.append(grads_mat[k] * sigma + ori_mat[j])
                k += 1
        gen_mat.append(np.stack(sen_mat, 0))
    return trial_keys, gen_mat

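# A small usage sketch (not from the original source): write the perturbed
# features back out as a Kaldi ark with kaldi_io. Function and file names
# are illustrative.
def write_adv_feats(out_ark, trial_keys, gen_mat):
    with kaldi_io.open_or_fd(out_ark, 'wb') as f:
        for key, mat in zip(trial_keys, gen_mat):
            kaldi_io.write_mat(f, mat, key)

# keys, mats = extract_adv_voiced_feats('grads.scp', 'vad.scp', 'feats.scp', 0.01)
# write_adv_feats('adv_feats.ark', keys, mats)
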
def read_embd_seg_info(param):
    embd_seg_dict = {}
    for seg_id, _ in kaldi_io.read_vec_flt_scp(param.embedding_scp):
        split_seg_info = seg_id.split('-')
        sess_id = split_seg_info[0]
        if len(split_seg_info) == 5:
            offset = nps(split_seg_info[1])
            start = round(offset + nps(split_seg_info[3]), 2)
            end = round(offset + nps(split_seg_info[4]), 2)
        elif len(split_seg_info) == 3:
            start = round(nps(split_seg_info[1]), 2)
            end = round(nps(split_seg_info[2]), 2)
        else:
            raise ValueError("Incorrect segments file format (segment id is wrong)")
        embd_seg_dict.setdefault(sess_id, []).append((start, end))
    return embd_seg_dict

def testFloatVectorReadWrite(self):
    """ Test read/write for float vectors. """
    # read,
    flt_vec = {k: v for k, v in kaldi_io.read_vec_flt_scp('tests/data/conf.scp')}  # scp,
    flt_vec2 = {k: v for k, v in kaldi_io.read_vec_flt_ark('tests/data/conf.ark')}  # binary-ark,
    flt_vec3 = {k: v for k, v in kaldi_io.read_vec_flt_ark('tests/data/conf_ascii.ark')}  # ascii-ark,
    # store,
    with kaldi_io.open_or_fd('tests/data_re-saved/conf.ark', 'wb') as f:
        for k, v in flt_vec.items():
            kaldi_io.write_vec_flt(f, v, k)
    # read and compare,
    for k, v in kaldi_io.read_vec_flt_ark('tests/data_re-saved/conf.ark'):
        self.assertTrue(np.array_equal(v, flt_vec[k]),
                        msg="flt. vector same after re-saving")

def get_train_set(feats, utt2spk):
    u2s = {}
    with open(utt2spk, 'r') as f:
        for line in f:
            s = line.split()
            if len(s) == 0:
                continue
            u2s[s[0]] = s[1]
    # Group the vectors by speaker.
    spk2vecs = {}
    for key, vec in kaldi_io.read_vec_flt_scp(feats):
        cur_speaker = u2s[key]
        spk2vecs.setdefault(cur_speaker, []).append(vec)
    training_set = []
    for spk, vecs in spk2vecs.items():
        # Cast out speakers with too few utterances.
        if len(vecs) < 3:
            continue
        training_set.append(np.array(vecs))
    return training_set

def read_impostor_vector(file_path):
    '''Read the impostor vectors from an scp file.'''
    feats = []
    for _, feat in kaldi_io.read_vec_flt_scp(file_path):
        feats.append(feat)
    return feats

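# Example usage (the path is illustrative, not from the original source):
# stack the impostor vectors into one cohort matrix for score normalization.
# impostor_mat = np.vstack(read_impostor_vector('exp/ivectors_impostor/ivector.scp'))
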
def __init__(self, data_path, uttid_list, class_num):
    data_dic = {k: m for k, m in kaldi_io.read_vec_flt_scp(data_path)}
    self.utt_list = list(data_dic.keys())
    self.data_list = list(data_dic.values())
    self.uttid_list = uttid_list
    self.class_num = class_num
    assert len(uttid_list) == len(self.data_list), \
        "The lengths of uttid_list and the data do not match!"

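# Not part of the original class: a sketch of the Dataset protocol methods
# this __init__ implies, assuming a torch.utils.data.Dataset subclass and
# that `uttid_list` holds one integer class id per utterance (both are
# assumptions; adapt to the real label format).
def __len__(self):
    return len(self.data_list)

def __getitem__(self, index):
    vec = np.asarray(self.data_list[index], dtype=np.float32)
    label = self.uttid_list[index]
    return vec, label
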
def load_ivectors(filename):
    '''Load the ivectors into a dictionary.

    The input argument must be an scp file (it is read via
    kaldi_io.read_vec_flt_scp).
    '''
    ivectors = {}
    for key, vec in kaldi_io.read_vec_flt_scp(filename):
        ivectors[key] = np.array(vec)
    return ivectors

def pca(sFileTrai, sFileTest, iComponents):
    """Performs PCA.

    Keyword arguments:
    - sFileTrai: Path to the training ivector.scp file
    - sFileTest: Path to the testing ivector.scp file
    - iComponents: Number of PCA components to keep

    Returns:
    - vTraiPCA: PCA-transformed training data for all subject_id
    - vLTrai: Training labels for all subject_id
    - vTraiSubjectId: List of the subject_id for training
    - vTestPCA: PCA-transformed test data for all subject_id
    - vLTest: Test labels for all subject_id
    - vTestSubjectId: List of the subject_id for test
    """
    dIvecTrai = {key: mat for key, mat in kaldi_io.read_vec_flt_scp(sFileTrai)}
    vTrai = pd.DataFrame(list(dIvecTrai.values()))
    # The last character of the key is the label.
    vLTrai = np.array([x[-1] for x in dIvecTrai.keys()])
    # FIXME: For REAL-PD we need more than -5 (a CIS-PD subject_id is 4 characters long)
    # FIXME: REAL-PD subject_ids are not only ints
    vTraiSubjectId = np.array([int(x[-5:-1]) for x in dIvecTrai.keys()])
    dIvecTest = {key: mat for key, mat in kaldi_io.read_vec_flt_scp(sFileTest)}
    vTest = np.array(list(dIvecTest.values()), dtype=float)
    vLTest = np.array([int(x[-1]) for x in dIvecTest.keys()])
    vTestSubjectId = np.array([int(x[-5:-1]) for x in dIvecTest.keys()])
    if isinstance(iComponents, str):
        iComponents = int(iComponents)
    pca = PCA(n_components=iComponents, svd_solver='randomized', whiten=True)
    pca.fit(vTrai)
    vTraiPCA = pca.transform(vTrai)
    vTestPCA = pca.transform(vTest)
    return vTraiPCA, vLTrai, vTraiSubjectId, vTestPCA, vLTest, vTestSubjectId

def load_ary_ark_from_scp(basedir, scpfile):
    # The scp entries hold paths relative to basedir, so chdir while reading.
    cwd = os.getcwd()
    os.chdir(basedir)
    keys = []
    vals = []
    for k, v in kaldi_io.read_vec_flt_scp(scpfile):
        keys.append(k)
        vals.append(v)
    os.chdir(cwd)
    return keys, vals

def __init__(self, data_path, uttid_list, class_num, k_times):
    data_dic = {k: m for k, m in kaldi_io.read_vec_flt_scp(data_path)}
    self.utt_list = list(data_dic.keys())
    self.data_list = list(data_dic.values())
    self.uttid_list = uttid_list
    self.class_num = class_num
    self.k_times = k_times
    self.triplet_list = self.make_triplet_list(class_num, k_times)

def read_target_vector(file_path, targets):
    '''Read only the vectors whose keys are listed in targets.'''
    spks = []
    feats = []
    for key, feat in kaldi_io.read_vec_flt_scp(file_path):
        if key in targets:
            spks.append(key)
            feats.append(feat)
    return spks, feats

def convert_to_npy(datatype, arkscppath, outputnpyfilepath, file_list_path):
    if not os.path.isdir(outputnpyfilepath):
        print('Creating directory where npy scores will be saved: {}'.format(
            outputnpyfilepath))
        os.makedirs(outputnpyfilepath)
    else:
        print("xvectors numpy path exists!")
    file_name = os.path.basename(arkscppath)
    ext = os.path.splitext(file_name)[1]
    if datatype == 'mat':  # for score files
        if ext == ".scp":
            d = {key: mat for key, mat in kaldi_io.read_mat_scp(arkscppath)}
        else:
            print("File type not correct. scp required.")
    elif datatype == 'vec':  # for embeddings
        if ext == ".scp":
            d = {key: mat for key, mat in kaldi_io.read_vec_flt_scp(arkscppath)}
        elif ext == ".ark":
            d = {key: mat for key, mat in kaldi_io.read_vec_flt_ark(arkscppath)}
        else:
            print("File type not correct. scp/ark required.")
    else:
        print("first argument should be mat/vec")

    file_list = open(file_list_path, 'r').readlines()
    file_count = 0
    # Group consecutive rows that share a file name and save each group as
    # one npy matrix.
    for count, (i, j) in enumerate(d.items()):
        if count == 0:
            # Start the first group with the first vector.
            system = j.reshape(1, -1)
            continue
        fn = file_list[file_count].rsplit()[0]
        if fn in i:
            system = np.vstack((system, j))
        else:
            print('filename:', fn)
            if not os.path.isfile(outputnpyfilepath + '/' + fn + '.npy'):
                np.save(outputnpyfilepath + '/' + fn + '.npy', system)
            file_count = file_count + 1
            system = j.reshape(1, -1)
    # last file
    print('filename:', fn)
    if not os.path.isfile(outputnpyfilepath + '/' + fn + '.npy'):
        np.save(outputnpyfilepath + '/' + fn + '.npy', system)

def get_embed_dict(scp):
    embedding_dict, feats_scp_segments_line_dict = {}, {}
    for seg_id, val in kaldi_io.read_vec_flt_scp(scp):
        sess_id = utt2spk_dict[seg_id]
        if sess_id not in embedding_dict:
            embedding_dict[sess_id] = [val]
            feats_scp_segments_line_dict[sess_id] = [seg_line_dict[seg_id]]
        else:
            embedding_dict[sess_id].append(val)
            feats_scp_segments_line_dict[sess_id].append(seg_line_dict[seg_id])
    return embedding_dict, feats_scp_segments_line_dict

def main():
    args = get_args()
    with open(args.model, 'rb') as f:
        model_of = pickle.load(f)
    with open(args.output, 'wt') as f:
        for ph_key, feat in kaldi_io.read_vec_flt_scp(args.feature_scp):
            # The first element of the feature vector is the phone id.
            ph = int(feat[0])
            feat = feat[1:].reshape(1, -1)
            score = model_of[ph].predict(feat).reshape(1)[0]
            score = round_score(score, 1)
            f.write(f'{ph_key}\t{score:.1f}\t{ph}\n')

def datalist_load(foldername):
    input_data = []
    input_label = []
    scpindex = 'ivector.scp'
    for key, mat in kaldi_io.read_vec_flt_scp(os.path.join(foldername, scpindex)):
        input_data.append(mat.tolist())
        input_label.append(key)
    return np.array(input_data, dtype=np.float32), input_label

def load_vec_ark_from_scp(basedir, scpfile):
    cwd = os.getcwd()
    os.chdir(basedir)
    keys = []
    vals = []
    for k, v in kaldi_io.read_vec_flt_scp(scpfile):
        keys.append(k)
        vals.append(v)
    # Stack the vectors to form one matrix
    mat = np.vstack(vals)
    os.chdir(cwd)
    return keys, mat

def ReadIvectors(self, ivectorfile):
    keys = []
    data = []
    for key, mat in kaldi_io.read_vec_flt_scp(ivectorfile):
        keys.append(key)
        data.append(mat.tolist())
    print('read %d ivectors in total' % len(keys))
    return keys, data

def read_features(self):
    features_folder = 'exp/xvector_nnet_1a/xvectors_test/'
    all_scps = sorted(
        glob.glob(self.kaldi_dir + '/' + features_folder + '/xvector.*.scp'))
    speaker_map = self.get_speakers()
    all_features = []
    all_labels = []
    for scp_file in all_scps:
        for key, mat in kaldi_io.read_vec_flt_scp(scp_file):
            # Strip the fixed-width utterance suffix (last 13 characters)
            # to recover the speaker key.
            speaker_name = speaker_map[key[:-13]]
            all_features.append(mat)
            all_labels.append(speaker_name)
    return np.asarray(all_features), np.asarray(all_labels)

def get_data(path1, path2):
    file_ = path1 + 'ivector.scp'
    features_1 = {k: m for k, m in read_vec_flt_scp(file_)}
    if path2 is not None:
        file_ = path2 + 'ivector.scp'
        features_2 = {k: m for k, m in read_vec_flt_scp(file_)}
    x, y = [], []
    for k, m in features_1.items():
        # The last '-' field of the key marks spoofed utterances:
        # label 0 for spoof, 1 otherwise.
        utt_type = k.split('-')[-1]
        y.append(0 if utt_type == 'spoof' else 1)
        if path2 is not None and k in features_2:
            x.append(np.concatenate([m, features_2[k]], 0))
        else:
            x.append(m)
    return np.asarray(x), np.asarray(y)

def main():
    args = get_args()

    # Phone symbol table
    _, phone_int2sym = load_phone_symbol_table(args.phone_symbol_table)

    # Human expert scores
    score_of, phone_of = load_human_scores(args.human_scoring_json, floor=1)

    # Prepare training data
    train_data_of = {}
    for ph_key, feat in kaldi_io.read_vec_flt_scp(args.feature_scp):
        if ph_key not in score_of:
            print(f'Warning: no human score for {ph_key}')
            continue
        # Read the phone id unconditionally so that `ph` is defined even
        # when no symbol table was given.
        ph = int(feat[0])
        if phone_int2sym is not None and phone_int2sym[ph] != phone_of[ph_key]:
            print(f'Mismatch: {phone_int2sym[ph]} <--> {phone_of[ph_key]}')
            continue
        score = score_of[ph_key]
        train_data_of.setdefault(ph, []).append((score, feat))

    # Train models
    model_of = {}
    for ph, pairs in train_data_of.items():
        model = RandomForestRegressor()
        labels = []
        feats = []
        for label, feat in pairs:
            labels.append(label)
            feats.append(feat[1:])
        labels = np.array(labels).reshape(-1, 1)
        feats = np.array(feats).reshape(-1, len(feats[0]))
        feats, labels = balanced_sampling(feats, labels)
        labels = labels.ravel()
        model.fit(feats, labels)
        model_of[ph] = model
        print(f'Model of phone {ph} trained.')

    # Write to file
    with open(args.model, 'wb') as f:
        pickle.dump(model_of, f)

def load_vector_scp(is_eval, apply_norm, scp_file, npz_file, utt2spk_file):
    '''Load a kaldi scp file and save the vectors to an npz file.'''
    assert os.path.splitext(scp_file)[1] == ".scp"
    print("Loading kaldi scp file...")
    utts = []
    vecs = []
    for k, v in kaldi_io.read_vec_flt_scp(scp_file):
        utts.append(k)
        vecs.append(v)
    assert len(utts) == len(vecs)
    if is_eval:
        print("Loading eval data...")
        utt2spk = eval_create_utt2spk_map(utt2spk_file)
    else:
        print("Loading training data...")
        utt2spk = create_utt2spk_map(utt2spk_file)
    vectors = []
    spker_label = []
    utt_label = []
    vec_dim = len(vecs[0])
    for i in range(len(utts)):
        if apply_norm:
            # Length-normalize so that each vector has norm sqrt(vec_dim).
            vec = np.array(vecs[i])
            norm = np.linalg.norm(vec)
            vectors.append(math.sqrt(vec_dim) * vec / norm)
        else:
            vectors.append(vecs[i])
        spker_label.append(utt2spk[utts[i]])
        utt_label.append(utts[i])
    if not os.path.exists(os.path.dirname(npz_file)):
        os.makedirs(os.path.dirname(npz_file))
    np.savez(npz_file, vectors=vectors, spker_label=spker_label,
             utt_label=utt_label)
    print("Converted {} to {}".format(scp_file, npz_file))

def main():
    args = get_args()
    with open(args.model, 'rb') as f:
        model_of = pickle.load(f)
    # Batch the features per phone so that each model predicts only once.
    feats_for_phone = {}
    idxs_for_phone = {}
    for ph_key, feat in kaldi_io.read_vec_flt_scp(args.feature_scp):
        ph = int(feat[0])
        feats_for_phone.setdefault(ph, []).append(feat[1:])
        idxs_for_phone.setdefault(ph, []).append(ph_key)
    with open(args.output, 'wt') as f:
        for ph in feats_for_phone:
            feats = np.array(feats_for_phone[ph])
            scores = model_of[ph].predict(feats)
            for ph_key, score in zip(idxs_for_phone[ph], list(scores)):
                score = round_score(score, 1)
                f.write(f'{ph_key}\t{score:.1f}\t{ph}\n')

def main():
    args = get_args()

    # Phone symbol table
    _, phone_int2sym = load_phone_symbol_table(args.phone_symbol_table)

    # Human expert scores
    score_of, phone_of = load_human_scores(args.human_scoring_json, floor=1)

    # Gather the features
    labels = []
    features = []
    for key, feat in kaldi_io.read_vec_flt_scp(args.feature_scp):
        if key not in score_of:
            print(f'Warning: no human score for {key}')
            continue
        ph = int(feat[0])
        if ph in range(args.min_phone_idx, args.max_phone_idx + 1):
            if phone_int2sym is not None and ph in phone_int2sym:
                ph = phone_int2sym[ph]
            labels.append(f'{ph}-{score_of[key]}')
            features.append(feat[1:])

    # Sampling
    sampled_pairs = random.sample(list(zip(features, labels)),
                                  min(args.samples, len(labels)))
    features, labels = list(zip(*sampled_pairs))

    # Draw the scatter plot
    label_counter = Counter(labels)
    colors = sns.color_palette("colorblind", len(label_counter))
    features = TSNE(n_components=2).fit_transform(features)
    sns_plot = sns.scatterplot(x=features[:, 0], y=features[:, 1],
                               hue=labels, legend='full', palette=colors)
    sns_plot.get_figure().savefig(args.output)

args = sys.argv
src_xvec_dir = args[1]
pool_xvec_dir = args[2]
scores_dir = args[3]

if not isdir(scores_dir):
    os.makedirs(scores_dir)

src_xvec_file = join(src_xvec_dir, 'spk_xvector.scp')
pool_xvec_file = join(pool_xvec_dir, 'spk_xvector.scp')

pool_xvectors = {}
c = 0
with open(pool_xvec_file) as f:
    for key, xvec in kaldi_io.read_vec_flt_scp(f):
        pool_xvectors[key] = xvec
        c += 1
print("Read", c, "pool xvectors")

with open(src_xvec_file) as f:
    for sspk, sxvec in kaldi_io.read_vec_flt_scp(f):
        print("Computing cosine measure for " + sspk)
        with open(join(scores_dir, 'affinity_' + sspk), 'w') as sf:
            for pspk, pxvec in pool_xvectors.items():
                # Compute the cosine distance between the source and pool
                # speakers, multiplied by -1 so that it behaves like an
                # affinity: lower values now indicate less affinity, as
                # with the original cosine similarity.
                dist = -1.0 * distance.cosine(sxvec, pxvec)
                # Output format assumed here: one "<pool_spk> <affinity>"
                # line per pool speaker.
                sf.write('%s %f\n' % (pspk, dist))

# (This fragment continues inside a loop over the lines `i` of a speaker
#  metadata file, one line per speaker: id, age, gender, accent.)
    spk = i.split()[0]
    if spk != 's5':
        # add prefix 'p'
        spk = 'p' + spk
    if sys.argv[1] == 'age':
        trait = int(i.split()[1])
    elif sys.argv[1] == 'gender':
        trait = i.split()[2]
    elif sys.argv[1] == 'accent':
        trait = i.split()[3]
    spk2trait[spk] = trait

print('speaker to trait is %s' % spk2trait)

tsne = TSNE(n_components=2, verbose=1)
X, y = [], []
for key, vec in read_vec_flt_scp(sys.argv[2]):
    X.append(vec)
    spk = key.split('-')[0]
    y.append(spk2trait[spk])
X, y = np.array(X), np.array(y)
print(len(y))
print(np.unique(y))

X_emb = tsne.fit_transform(X)  # t-SNE transformed

# For reproducibility of the results
np.random.seed(42)
N = int(sys.argv[3])
rndperm = np.random.permutation(X_emb.shape[0])

def get_cmap(n, name='hsv'):
    return plt.cm.get_cmap(name, n)

# get gender info
spk2gender = {}
with open(spk2gender_file) as f:
    for line in f.read().splitlines():
        sp = line.split()
        spkid = sp[0]
        gen = sp[1]
        spk2gender[spkid] = gen

X = []
spks = []
for key, mat in kaldi_io.read_vec_flt_scp(spk_xvector_file):
    spks.append(key)
    X.append(mat[np.newaxis])
X = np.concatenate(X)
print("X =", X.shape)

# Standardize the x-vectors before t-SNE.
mean_X = np.mean(X, axis=0)
std_X = np.std(X, axis=0)
X = (X - mean_X) / std_X

tsne = TSNE(n_components=2, init='random', random_state=42, perplexity=5)
Y = tsne.fit_transform(X)
nspk = Y.shape[0]

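# A plausible continuation (not in the original excerpt): scatter the t-SNE
# embedding colored by gender via the get_cmap helper above. It assumes the
# spk2gender file uses the usual Kaldi 'm'/'f' labels; the output file name
# is illustrative.
cmap = get_cmap(2)
gender_to_idx = {'m': 0, 'f': 1}
for i in range(nspk):
    color = cmap(gender_to_idx[spk2gender[spks[i]]])
    plt.scatter(Y[i, 0], Y[i, 1], color=color, s=10)
    plt.annotate(spks[i], (Y[i, 0], Y[i, 1]), fontsize=6)
plt.savefig('tsne_spk_xvector_by_gender.png')
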
import kaldi_io

if len(sys.argv) != 4:
    print("usage: %s ood-xvectors id-xvectors ood-transformed-dir" % sys.argv[0])
    print("The x-vectors should be normalized by the mean.")
    quit()

print("Perform CORAL transform")

ood_vec_file = sys.argv[1]
id_vec_file = sys.argv[2]
transform_dir = sys.argv[3]

ood_vec = []
ood_keys = []
for key, vec in kaldi_io.read_vec_flt_scp(ood_vec_file):
    ood_vec.append(vec)
    ood_keys.append(key)
ood_vec = np.array(ood_vec)

id_vec = []
for key, vec in kaldi_io.read_vec_flt_scp(id_vec_file):
    id_vec.append(vec)
id_vec = np.array(id_vec)

dim = id_vec.shape[1]

# Covariances of the out-of-domain (source) and in-domain (target) sets,
# regularized with an identity term.
Cs = (1.0 / ood_vec.shape[0]) * np.dot(ood_vec.T, ood_vec) + np.eye(dim)
Ct = (1.0 / id_vec.shape[0]) * np.dot(id_vec.T, id_vec) + np.eye(dim)

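# The steps below are NOT part of the original script: a minimal sketch of
# the remaining CORAL computation (whiten with Cs^-1/2, re-color with Ct^1/2,
# following Sun et al., 2016), with an illustrative output file name.
from os.path import join
from scipy import linalg

whiten = linalg.fractional_matrix_power(Cs, -0.5).real
recolor = linalg.fractional_matrix_power(Ct, 0.5).real
ood_transformed = np.dot(np.dot(ood_vec, whiten), recolor)

# Write the transformed x-vectors back out as a Kaldi ark.
with kaldi_io.open_or_fd(join(transform_dir, 'xvector_coral.ark'), 'wb') as f:
    for key, vec in zip(ood_keys, ood_transformed):
        kaldi_io.write_vec_flt(f, vec.astype(np.float32), key)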