Beispiel #1
0
    def __init__(self, mu_path, ep_path, transform_dir, enroll_feats,
                 test_feats, trial):
        """
        Scorer setup: load two covariance matrices, project them through a
        domain-adaptation transform and then a PCA estimated on the
        enrollment x-vectors, and precompute scoring matrices A and G.

        Arguments:
        - mu_path / ep_path: .npy files holding S_mu / S_ep covariance
          matrices (presumably between-/within-class -- TODO confirm)
        - transform_dir: Kaldi matrix file with the adaptation transform
        - enroll_feats / test_feats: scp files of per-utterance float vectors
        - trial: path to the trial list (stored only; not read here)
        """
        self.S_mu = np.load(mu_path)
        self.S_ep = np.load(ep_path)
        # Keyed enrollment and test vectors.
        self.enroll = {}
        for key, mat in kaldi_io.read_vec_flt_scp(enroll_feats):
            self.enroll[key] = mat
        self.test = {}
        for key, mat in kaldi_io.read_vec_flt_scp(test_feats):
            self.test[key] = mat
        self.len = len(self.test)
        # Square score matrix sized by the number of test utterances.
        self.scores = np.zeros((self.len, self.len))
        self.trial_path = trial

        # Estimate a PCA transform from the enrollment x-vectors.
        # (Re-reads the enrollment scp a second time.)
        xvectors = []
        for _, mat in kaldi_io.read_vec_flt_scp(enroll_feats):
            xvectors.append(mat)
        xvectors = np.array(xvectors)
        transform = EstPca(xvectors, target_energy=0.1)
        adapt_transform = np.array(kaldi_io.read_mat(transform_dir))
        # NOTE(review): self.adapt_transform is assigned the PCA transform,
        # not the adaptation matrix loaded just above -- looks unintended;
        # confirm against callers before changing.
        self.adapt_transform = transform
        self.transform = transform
        # Project both covariances: adaptation first, then PCA.
        self.S_mu = np.dot(np.dot(adapt_transform, self.S_mu),
                           adapt_transform.T)
        self.S_ep = np.dot(np.dot(adapt_transform, self.S_ep),
                           adapt_transform.T)
        self.S_mu = np.dot(np.dot(transform.T, self.S_mu), transform)
        self.S_ep = np.dot(np.dot(transform.T, self.S_ep), transform)
        # Precompute the scoring matrices from the projected covariances.
        F = np.linalg.pinv(self.S_ep)
        G = np.dot(
            np.dot(-np.linalg.pinv(2 * self.S_mu + self.S_ep), self.S_mu), F)
        self.A = np.linalg.pinv(self.S_mu + self.S_ep) - (F + G)
        self.G = G

        return
Beispiel #2
0
def pca(sFileTrai, sFileTest, iComponents):
    """
    Performs PCA on i-vectors read from Kaldi scp files.

    Keyword arguments:
    - sFileTrai: Path to training ivector.scp file
    - sFileTest: Path to testing ivector.scp file
    - iComponents: Number of components to keep (int, or str convertible
      to int)

    Returns:
    - vTraiPCA: Training data transformed by PCA for all subject_id
    - vLTrai: Training labels for all subject_id
    - vTraiSubjectId: List of the subject_id for training
    - vTraiMeasurementId: List of the measurement_id for training
    - vTestPCA: Testing data transformed by PCA for all subject_id
    - vLTest: Test labels for all subject_id
    - vTestSubjectId: List of the subject_id for test
    - vTestMeasurementId: List of the measurement_id for test
    """
    # Coerce BEFORE fitting: the previous version converted after PCA had
    # already been constructed, so a string n_components reached sklearn
    # unchanged and the conversion was dead code.
    if isinstance(iComponents, str):
        iComponents = int(iComponents)

    dIvecTrai = {key: mat for key, mat in kaldi_io.read_vec_flt_scp(sFileTrai)}
    vTrai = pd.DataFrame((list(dIvecTrai.values())))
    # Takes the last character in the filename as it is the label
    vLTrai = np.array([x[-1] for x in np.array(list(dIvecTrai.keys()))])

    pca = PCA(n_components=iComponents, svd_solver='randomized', whiten=True)
    pca.fit(vTrai)

    vTraiPCA = pca.transform(vTrai)

    # FIXME : For realPD, we need more than -5 (CIS-PD subject_id is 4 characters long)
    # FIXME REAL-PD it's not only int
    vTraiSubjectId = np.array(
        ([int(x[-5:-1]) for x in np.array(list(dIvecTrai.keys()))]))
    vTraiMeasurementId = np.array(
        [x[-42:-6] for x in np.array(list(dIvecTrai.keys()))])

    dIvecTest = {key: mat for key, mat in kaldi_io.read_vec_flt_scp(sFileTest)}
    vTest = np.array(list(dIvecTest.values()), dtype=float)
    vLTest = np.array([int(x[-1]) for x in np.array(list(dIvecTest.keys()))])
    vTestSubjectId = np.array(
        [int(x[-5:-1]) for x in np.array(list(dIvecTest.keys()))])
    vTestMeasurementId = np.array(
        [x[-42:-6] for x in np.array(list(dIvecTest.keys()))])

    vTestPCA = pca.transform(vTest)

    return vTraiPCA, vLTrai, vTraiSubjectId, vTraiMeasurementId, vTestPCA, vLTest, vTestSubjectId, vTestMeasurementId
Beispiel #3
0
def main():
    """Train one scoring model per phone from features and human scores."""
    args = get_args()

    # Phone symbol table
    _, phone_int2sym = load_phone_symbol_table(args.phone_symbol_table)

    # Human expert scores
    score_of, phone_of = load_human_scores(args.human_scoring_json, floor=1)

    # Group (score, feature) pairs by phone id; feat[0] carries the phone id
    train_data_of = {}
    for ph_key, feat in kaldi_io.read_vec_flt_scp(args.feature_scp):
        if ph_key not in score_of:
            print(f'Warning: no human score for {ph_key}')
            continue
        ph = int(feat[0])
        if phone_int2sym is not None and phone_int2sym[ph] != phone_of[ph_key]:
            print(f'Unmatch: {phone_int2sym[ph]} <--> {phone_of[ph_key]} ')
            continue
        train_data_of.setdefault(ph, []).append((score_of[ph_key], feat[1:]))

    # Balance the dataset with extra negative examples
    train_data_of = add_more_negative_data(train_data_of)

    # Train the per-phone models in parallel worker processes
    with ProcessPoolExecutor(args.nj) as ex:
        submitted = [(ph, ex.submit(train_model_for_phone, pairs))
                     for ph, pairs in train_data_of.items()]
        model_of = {ph: fut.result() for ph, fut in submitted}

    # Persist the trained models
    with open(args.model, 'wb') as f:
        pickle.dump(model_of, f)
def extract_adv_voiced_feats(grads, vads, ori_feats, sigma):
    """
    Inject scaled adversarial gradients into the voiced frames of the
    original features.

    Arguments:
    - grads: scp of gradient matrices (one row per voiced frame)
    - vads: scp of VAD vectors (1.0 marks a voiced frame)
    - ori_feats: scp of original feature matrices
    - sigma: scale factor applied to each gradient row

    Returns (trial_keys, gen_mat): the gradient keys and, for each, a
    stacked matrix of perturbed voiced frames.
    """
    grads_data = {key: mat for key, mat in kaldi_io.read_mat_scp(grads)}
    vad_data = {key: vec for key, vec in kaldi_io.read_vec_flt_scp(vads)}
    ori_data = {key: mat for key, mat in kaldi_io.read_mat_scp(ori_feats)}

    # (removed unused num_spoofed; len(dict) replaces len(dict.keys()))
    num_vad = len(vad_data)
    num_ori = len(ori_data)
    assert num_vad == num_ori, \
           "Length does not match! (%d %d)" % (num_vad, num_ori)

    trial_keys = list(grads_data.keys())

    gen_mat = []
    for key in trial_keys:
        print('Process %s utts.' % (key))
        grads_mat = grads_data.get(key)
        # The trial key embeds the test utterance id after a fixed-width
        # prefix. NOTE(review): the 26-char offset is assumed from the key
        # naming scheme -- confirm against the data preparation step.
        testkey = key[26:]
        vad_vec = vad_data.get(testkey)
        ori_mat = ori_data.get(testkey)
        assert vad_vec is not None, 'No vad for %s %s' % (key, testkey)
        assert ori_mat is not None, 'No original feats for %s %s' % (key,
                                                                     testkey)
        sen_mat = []
        k = 0  # row index into grads_mat: counts voiced frames seen so far
        for j in range(len(vad_vec)):
            if vad_vec[j] == 1.0:
                sen_mat.append(grads_mat[k] * sigma + ori_mat[j])
                k = k + 1

        gen_mat.append(np.stack(sen_mat, 0))

    return trial_keys, gen_mat
def read_embd_seg_info(param):
    """
    Parse segment ids from an embedding scp into per-session lists of
    (start, end) times.

    Segment ids are '-'-separated:
    - 5 fields: <sess>-<offset>-...-<start>-<end> (times relative to offset)
    - 3 fields: <sess>-<start>-<end>

    Returns a dict mapping session id -> list of (start, end) tuples.
    Raises ValueError for any other field count.
    """
    # (removed a stray `open(param.embedding_scp)` that leaked a file handle
    # and whose result was never used)
    embd_seg_dict = {}
    for embd_sess_line, val in kaldi_io.read_vec_flt_scp(param.embedding_scp):
        seg_id = embd_sess_line
        split_seg_info = seg_id.split('-')
        sess_id = split_seg_info[0]
        if len(split_seg_info) == 5:
            offset = nps(split_seg_info[1])
            start, end = round(offset + nps(split_seg_info[3]), 2), round(offset + nps(split_seg_info[4]), 2)
        elif len(split_seg_info) == 3:
            try:
                start, end = round(nps(split_seg_info[1]), 2), round(nps(split_seg_info[2]), 2)
            except (ValueError, TypeError):
                # Unparseable times: skip this segment. The previous bare
                # `except: pass` fell through and silently appended the
                # PREVIOUS segment's (start, end) for this session.
                continue
        else:
            raise ValueError("Incorrect segments file format (segment id is wrong) ")

        embd_seg_dict.setdefault(sess_id, []).append((start, end))

    return embd_seg_dict
    def testFloatVectorReadWrite(self):
        """
        Test read/write for float vectors.

        NOTE(review): the early ``return`` below disables everything after
        the initial scp read -- the re-save and compare steps never run.
        Presumably left in deliberately to skip the test; confirm intent
        before removing it.
        """
        # read,
        flt_vec = {
            k: v
            for k, v in kaldi_io.read_vec_flt_scp('tests/data/conf.scp')
        }  # scp,
        return

        flt_vec2 = {
            k: v
            for k, v in kaldi_io.read_vec_flt_ark('tests/data/conf.ark')
        }  # binary-ark,
        flt_vec3 = {
            k: v
            for k, v in kaldi_io.read_vec_flt_ark('tests/data/conf_ascii.ark')
        }  # ascii-ark,
        # store,
        with kaldi_io.open_or_fd('tests/data_re-saved/conf.ark', 'wb') as f:
            for k, v in flt_vec.items():
                kaldi_io.write_vec_flt(f, v, k)
        # read and compare,
        for k, v in kaldi_io.read_vec_flt_ark('tests/data_re-saved/conf.ark'):
            self.assertTrue(np.array_equal(v, flt_vec[k]),
                            msg="flt. vector same after re-saving")
Beispiel #7
0
def get_train_set(feats, utt2spk):
    """
    Group i-vectors by speaker for training.

    Arguments:
    - feats: scp file of per-utterance float vectors
    - utt2spk: text file with "<utt> <spk>" per line

    Returns a list of (num_utts, dim) arrays, one per speaker, skipping
    speakers with fewer than 3 utterances.
    """
    u2s = {}
    # `with` guarantees the mapping file is closed even on error
    with open(utt2spk, 'r') as f:
        for line in f:
            s = line.split()
            if len(s) == 0:
                continue
            u2s[s[0]] = s[1]

    spk2vecs = {}
    for key, mat in kaldi_io.read_vec_flt_scp(feats):
        # KeyError here means an utterance is missing from utt2spk
        spk2vecs.setdefault(u2s[key], []).append(mat)

    training_set = []
    for vecs in spk2vecs.values():
        # cast spks with too few utts
        if len(vecs) < 3:
            continue
        training_set.append(np.array(vecs))
    return training_set
def read_impostor_vector(file_path):
    '''read impostor vector from scp
    '''
    # Collect only the vectors; keys are discarded.
    return [vec for _key, vec in kaldi_io.read_vec_flt_scp(file_path)]
Beispiel #9
0
    def __init__(self, data_path, uttid_list, class_num):
        """Load utterance vectors from a Kaldi scp and keep labels alongside."""
        # dict keeps one entry per utterance id (last wins on duplicates)
        loaded = dict(kaldi_io.read_vec_flt_scp(data_path))
        self.utt_list = list(loaded.keys())
        self.data_list = list(loaded.values())

        self.uttid_list = uttid_list
        self.class_num = class_num

        assert len(uttid_list)==len(self.data_list), "The lengths of uttid_list and data unmatch!"
Beispiel #10
0
def load_ivectors(filename):
    '''
        Load the ivectors into a dictionary.
        Input argument may be an ark or scp file.
    '''
    return {key: np.array(vec)
            for key, vec in kaldi_io.read_vec_flt_scp(filename)}
Beispiel #11
0
def pca(sFileTrai, sFileTest, iComponents):
    """
    Performs PCA on i-vectors read from Kaldi scp files.

    Keyword arguments:
    - sFileTrai: path to the training ivector.scp file
    - sFileTest: path to the testing ivector.scp file
    - iComponents: number of components to keep

    Returns:
    - vTraiPCA: PCA-transformed training data
    - vLTrai: training labels (last character of each key)
    - vTraiSubjectId: List of the subject_id for training
    - vTestPCA: PCA-transformed testing data
    - vLTest: test labels
    - vTestSubjectId: List of the subject_id for test
    """
    dIvecTrai = dict(kaldi_io.read_vec_flt_scp(sFileTrai))
    vTrai = pd.DataFrame(list(dIvecTrai.values()))
    traiKeys = np.array(list(dIvecTrai.keys()))
    vLTrai = np.array([k[-1] for k in traiKeys])

    # FIXME : For realPD, we need more than -5 (CIS-PD subject_id is 4 characters long)
    # FIXME REAL-PD it's not only int
    vTraiSubjectId = np.array([int(k[-5:-1]) for k in traiKeys])

    dIvecTest = dict(kaldi_io.read_vec_flt_scp(sFileTest))
    vTest = np.array(list(dIvecTest.values()), dtype=float)
    testKeys = np.array(list(dIvecTest.keys()))
    vLTest = np.array([int(k[-1]) for k in testKeys])
    vTestSubjectId = np.array([int(k[-5:-1]) for k in testKeys])

    if isinstance(iComponents, str):
        iComponents = int(iComponents)

    pca = PCA(n_components=iComponents, svd_solver='randomized', whiten=True)
    pca.fit(vTrai)

    vTraiPCA = pca.transform(vTrai)
    vTestPCA = pca.transform(vTest)

    return vTraiPCA, vLTrai, vTraiSubjectId, vTestPCA, vLTest, vTestSubjectId
Beispiel #12
0
def load_ary_ark_from_scp(basedir, scpfile):
    """
    Read float vectors from an scp file located relative to *basedir*.

    Returns (keys, vals): parallel lists of utterance ids and vectors.
    """
    cwd = os.getcwd()
    os.chdir(basedir)
    try:
        keys = []
        vals = []
        for k, v in kaldi_io.read_vec_flt_scp(scpfile):
            keys.append(k)
            vals.append(v)
    finally:
        # Always restore the working directory, even if reading fails;
        # previously an exception left the process chdir'd into basedir.
        os.chdir(cwd)
    return keys, vals
Beispiel #13
0
    def __init__(self, data_path, uttid_list, class_num, k_times):
        """Load vectors from a Kaldi scp and build the triplet list."""
        # dict keeps one entry per utterance id (last wins on duplicates)
        loaded = dict(kaldi_io.read_vec_flt_scp(data_path))
        self.utt_list = list(loaded.keys())
        self.data_list = list(loaded.values())

        self.uttid_list = uttid_list
        self.class_num = class_num
        self.k_times = k_times

        self.triplet_list = self.make_triplet_list(class_num, k_times)
def read_target_vector(file_path, targets):
    ''' read from vectors in targets
    '''
    spks, feats = [], []
    for spk_key, vec in kaldi_io.read_vec_flt_scp(file_path):
        # keep only the speakers listed in `targets`
        if spk_key not in targets:
            continue
        spks.append(spk_key)
        feats.append(vec)
    return spks, feats
def convert_to_npy(datatype, arkscppath, outputnpyfilepath, file_list_path):
    """
    Convert kaldi ark/scp data to per-file .npy stacks.

    Consecutive entries are grouped: rows are appended to the current stack
    while each key contains the current file name from file_list_path; on
    the first mismatch the stack is saved as <name>.npy and a new stack
    starts with the next name. Requires the scp/ark order to match the
    file-list order.

    Arguments:
    - datatype: 'mat' for score matrices (scp only) or 'vec' for
      embeddings (scp or ark)
    - arkscppath: input .scp/.ark path
    - outputnpyfilepath: output directory for the .npy files
    - file_list_path: text file whose first token per line is a group name

    NOTE(review): if datatype is not 'mat'/'vec' or the extension is
    unexpected, the error branches only print -- `d` is never assigned and
    the loop below raises NameError. Confirm whether that should exit.
    """
    if not os.path.isdir(outputnpyfilepath):
        print('Creating directory where npy scores will be saved : {}'.format(
            outputnpyfilepath))
        os.makedirs(outputnpyfilepath)
    else:
        print("xvectors numpy path exists !")
        # exit()
    file_name = os.path.basename(arkscppath)
    ext = os.path.splitext(file_name)[1]
    if datatype == 'mat':
        #for score files
        if ext == ".scp":
            d = {key: mat for key, mat in kaldi_io.read_mat_scp(arkscppath)}
        else:
            print("File type not correct. scp required.")
    elif datatype == 'vec':
        #for embeddings
        if ext == ".scp":
            d = {
                key: mat
                for key, mat in kaldi_io.read_vec_flt_scp(arkscppath)
            }
        elif ext == ".ark":
            d = {
                key: mat
                for key, mat in kaldi_io.read_vec_flt_ark(arkscppath)
            }
        else:
            print("File type not correct. scp/ark required.")
    else:
        print("first argument should be mat/vec ")

    file_list = open(file_list_path, 'r').readlines()
    file_count = 0
    # Walk entries in order, stacking rows until the key stops matching the
    # current file name, then flush the stack to disk.
    for count, (i, j) in enumerate(d.items()):
        if count == 0:
            system = j.reshape(1, -1)
        # if count % 100 == 0:
        #     print("Done with {} files".format(count))
        fn = file_list[file_count].rsplit()[0]
        if fn in i:
            system = np.vstack((system, j))
        else:
            print('fielname:', fn)
            if not os.path.isfile(outputnpyfilepath + '/' + fn + '.npy'):
                np.save(outputnpyfilepath + '/' + fn + '.npy', system)
            file_count = file_count + 1
            system = j.reshape(1, -1)
    # last file
    print('fielname:', fn)
    if not os.path.isfile(outputnpyfilepath + '/' + fn + '.npy'):
        np.save(outputnpyfilepath + '/' + fn + '.npy', system)
def get_embed_dict(scp):
    """Group embeddings (and their feats-scp lines) by session id.

    Relies on module-level ``utt2spk_dict`` and ``seg_line_dict``.
    """
    embedding_dict = {}
    feats_scp_segments_line_dict = {}
    for seg_id, val in kaldi_io.read_vec_flt_scp(scp):
        sess_id = utt2spk_dict[seg_id]
        embedding_dict.setdefault(sess_id, []).append(val)
        feats_scp_segments_line_dict.setdefault(sess_id, []).append(
            seg_line_dict[seg_id])
    return embedding_dict, feats_scp_segments_line_dict
Beispiel #17
0
def main():
    """Score each phone feature with its trained model; write a TSV."""
    args = get_args()

    with open(args.model, 'rb') as f:
        model_of = pickle.load(f)

    with open(args.output, 'wt') as out:
        for ph_key, feat in kaldi_io.read_vec_flt_scp(args.feature_scp):
            # feat[0] is the phone id; the rest is the feature vector
            phone = int(feat[0])
            row = feat[1:].reshape(1, -1)
            raw = model_of[phone].predict(row).reshape(1)[0]
            out.write(f'{ph_key}\t{round_score(raw, 1):.1f}\t{phone}\n')
Beispiel #18
0
def datalist_load(foldername):
    """Load all i-vectors under *foldername* plus their utterance keys."""
    scp_path = os.path.join(foldername, 'ivector.scp')

    input_data = []
    input_label = []
    for utt_key, vec in kaldi_io.read_vec_flt_scp(scp_path):
        input_data.append(vec.tolist())
        input_label.append(utt_key)

    return np.array(input_data, dtype=np.float32), input_label
Beispiel #19
0
def load_vec_ark_from_scp(basedir, scpfile):
    """
    Read float vectors from an scp file located relative to *basedir*.

    Returns (keys, mat): utterance ids and one vertically stacked matrix.
    """
    cwd = os.getcwd()
    os.chdir(basedir)
    try:
        keys = []
        vals = []
        for k, v in kaldi_io.read_vec_flt_scp(scpfile):
            keys.append(k)
            vals.append(v)
    finally:
        # Always restore the working directory, even if reading fails;
        # previously an exception left the process chdir'd into basedir.
        os.chdir(cwd)

    # Stack vectors to form one matrix
    mat = np.vstack(vals)
    return keys, mat
 def ReadIvectors(self, ivectorfile):
     """Read every i-vector from an scp file; return (keys, data lists)."""
     keys = []
     data = []
     for key, vec in kaldi_io.read_vec_flt_scp(ivectorfile):
         keys.append(key)
         data.append(vec.tolist())
     print('totally %d ivectors' % (len(keys)))
     return keys, data
 def read_features(self):
     """Collect x-vectors and speaker labels from all test scp shards."""
     features_folder = 'exp/xvector_nnet_1a/xvectors_test/'
     all_scps = sorted(
         glob.glob(self.kaldi_dir + '/' + features_folder +
                   '/xvector.*.scp'))
     speaker_map = self.get_speakers()
     all_features = []
     all_labels = []
     for scp_file in all_scps:
         for key, vec in kaldi_io.read_vec_flt_scp(scp_file):
             # key carries a fixed-width suffix; strip it for the speaker id
             speaker_name = speaker_map[key[:-13]]
             print(key)
             all_features.append(vec)
             all_labels.append(speaker_name)
     return np.asarray(all_features), np.asarray(all_labels)
def get_data(path1, path2):
    """Load i-vectors (optionally concatenated from two dirs) with labels.

    The label is 0 when the key's last '-' field is 'spoof', else 1.
    """
    features_1 = dict(read_vec_flt_scp(path1 + 'ivector.scp'))

    features_2 = None
    if path2 is not None:
        features_2 = dict(read_vec_flt_scp(path2 + 'ivector.scp'))

    x, y = [], []
    for key, vec in features_1.items():
        utt_type = key.split('-')[-1]
        y.append(0 if utt_type == 'spoof' else 1)

        if features_2 is not None and key in features_2:
            x.append(np.concatenate([vec, features_2[key]], 0))
        else:
            x.append(vec)

    return np.asarray(x), np.asarray(y)
    def testFloatVectorReadWrite(self):
        """
        Test read/write for float vectors.

        NOTE(review): the early ``return`` below disables the re-save and
        compare steps; only the initial scp read runs. Presumably left in
        deliberately to skip the test -- confirm before removing.
        """
        # read,
        flt_vec = { k:v for k,v in kaldi_io.read_vec_flt_scp('tests/data/conf.scp') } # scp,
        return

        flt_vec2 = { k:v for k,v in kaldi_io.read_vec_flt_ark('tests/data/conf.ark') } # binary-ark,
        flt_vec3 = { k:v for k,v in kaldi_io.read_vec_flt_ark('tests/data/conf_ascii.ark') } # ascii-ark,
        # store,
        with kaldi_io.open_or_fd('tests/data_re-saved/conf.ark','wb') as f:
            for k,v in flt_vec.items(): kaldi_io.write_vec_flt(f, v, k)
        # read and compare,
        for k,v in kaldi_io.read_vec_flt_ark('tests/data_re-saved/conf.ark'):
            self.assertTrue(np.array_equal(v,flt_vec[k]), msg="flt. vector same after re-saving")
Beispiel #24
0
def main():
    """Train one RandomForest scoring model per phone and pickle them."""
    args = get_args()

    # Phone symbol table
    _, phone_int2sym = load_phone_symbol_table(args.phone_symbol_table)

    # Human expert scores
    score_of, phone_of = load_human_scores(args.human_scoring_json, floor=1)

    # Prepare training data: group (score, feature) pairs by phone id
    train_data_of = {}
    for ph_key, feat in kaldi_io.read_vec_flt_scp(args.feature_scp):
        if ph_key not in score_of:
            print(f'Warning: no human score for {ph_key}')
            continue
        # Hoisted out of the symbol-table check: previously `ph` was only
        # assigned when phone_int2sym was set, so a None table raised
        # NameError (or reused a stale value) at the append below.
        ph = int(feat[0])
        if phone_int2sym is not None:
            if phone_int2sym[ph] != phone_of[ph_key]:
                print(f'Unmatch: {phone_int2sym[ph]} <--> {phone_of[ph_key]} ')
                continue
        score = score_of[ph_key]
        train_data_of.setdefault(ph, []).append((score, feat))

    # Train models
    model_of = {}
    for ph, pairs in train_data_of.items():
        model = RandomForestRegressor()
        labels = []
        feats = []
        for label, feat in pairs:
            labels.append(label)
            feats.append(feat[1:])  # drop the leading phone id
        labels = np.array(labels).reshape(-1, 1)
        feats = np.array(feats).reshape(-1, len(feats[0]))
        feats, labels = balanced_sampling(feats, labels)
        labels = labels.ravel()
        model.fit(feats, labels)
        model_of[ph] = model
        print(f'Model of phone {ph} trained.')

    # Write to file
    with open(args.model, 'wb') as f:
        pickle.dump(model_of, f)
Beispiel #25
0
def load_vector_scp(is_eval, apply_norm, scp_file, npz_file, utt2spk_file):
    '''Load a kaldi scp of float vectors, optionally length-normalize them,
    and save vectors plus speaker/utterance labels to an .npz file.

    Arguments:
    - is_eval: use the eval-mode utt2spk mapping
    - apply_norm: scale each vector to norm sqrt(dim)
    - scp_file: input .scp of float vectors
    - npz_file: output .npz path
    - utt2spk_file: utterance -> speaker mapping file
    '''
    assert (os.path.splitext(scp_file)[1] == ".scp")

    print("Loading kaldi scp file...")
    utts = []
    vecs = []
    for k, v in kaldi_io.read_vec_flt_scp(scp_file):
        utts.append(k)
        vecs.append(v)

    assert (len(utts) == len(vecs))

    if is_eval:
        print("Loading eval data...")
        utt2spk = eval_create_utt2spk_map(utt2spk_file)
    else:
        print("Loading training data...")
        utt2spk = create_utt2spk_map(utt2spk_file)

    vectors = []
    spker_label = []
    utt_label = []
    vec_dim = len(vecs[0])
    for utt, vec in zip(utts, vecs):
        if apply_norm:
            # Scale to norm sqrt(dim) (Kaldi-style length normalization)
            vec = np.array(vec)
            norm = np.linalg.norm(vec)
            vectors.append(math.sqrt(vec_dim) * vec / norm)
        else:
            vectors.append(vec)
        spker_label.append(utt2spk[utt])
        utt_label.append(utt)

    # os.makedirs('') raised when npz_file had no directory component;
    # exist_ok also removes the race between the exists() check and creation.
    out_dir = os.path.dirname(npz_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    np.savez(npz_file,
             vectors=vectors,
             spker_label=spker_label,
             utt_label=utt_label)
    print("Convert {} to {} ".format(scp_file, npz_file))
Beispiel #26
0
def main():
    """Batch-score features per phone with pickled models; write a TSV."""
    args = get_args()

    with open(args.model, 'rb') as f:
        model_of = pickle.load(f)

    # Bucket features (and their keys) by phone id for batched prediction
    feats_for_phone = {}
    idxs_for_phone = {}
    for ph_key, feat in kaldi_io.read_vec_flt_scp(args.feature_scp):
        ph = int(feat[0])
        feats_for_phone.setdefault(ph, []).append(feat[1:])
        idxs_for_phone.setdefault(ph, []).append(ph_key)

    with open(args.output, 'wt') as out:
        for ph, feat_rows in feats_for_phone.items():
            scores = model_of[ph].predict(np.array(feat_rows))
            for ph_key, score in zip(idxs_for_phone[ph], list(scores)):
                out.write(f'{ph_key}\t{round_score(score, 1):.1f}\t{ph}\n')
def main():
    """t-SNE scatter plot of phone features, colored by phone-score label."""
    args = get_args()

    # Phone symbol table
    _, phone_int2sym = load_phone_symbol_table(args.phone_symbol_table)

    # Human expert scores
    score_of, phone_of = load_human_scores(args.human_scoring_json, floor=1)

    # Gather features whose phone id falls in the requested range
    labels = []
    features = []
    for key, feat in kaldi_io.read_vec_flt_scp(args.feature_scp):
        if key not in score_of:
            print(f'Warning: no human score for {key}')
            continue
        ph = int(feat[0])
        if ph in range(args.min_phone_idx, args.max_phone_idx + 1):
            if phone_int2sym is not None and ph in phone_int2sym:
                ph = phone_int2sym[ph]
            labels.append(f'{ph}-{score_of[key]}')
            features.append(feat[1:])

    # Down-sample to at most args.samples points
    sampled_pairs = random.sample(list(zip(features, labels)),
                                  min(args.samples, len(labels)))
    features, labels = list(zip(*sampled_pairs))

    # Draw scatters
    label_counter = Counter(labels)
    colors = sns.color_palette("colorblind", len(label_counter))
    features = TSNE(n_components=2).fit_transform(features)
    sns_plot = sns.scatterplot(x=features[:, 0],
                               y=features[:, 1],
                               hue=labels,
                               legend='full',
                               palette=colors)
    sns_plot.get_figure().savefig(args.output)
Beispiel #28
0
# Script: compute a (negated) cosine-distance affinity between each source
# speaker x-vector and every pool speaker x-vector.
args = sys.argv

# Command line: <src_xvec_dir> <pool_xvec_dir> <scores_dir>
src_xvec_dir = args[1]
pool_xvec_dir = args[2]
scores_dir = args[3]

if not isdir(scores_dir):
    os.makedirs(scores_dir)

src_xvec_file = join(src_xvec_dir, 'spk_xvector.scp')
pool_xvec_file = join(pool_xvec_dir, 'spk_xvector.scp')

# Read all pool speaker x-vectors into memory
pool_xvectors = {}
c = 0
with open(pool_xvec_file) as f:
    for key, xvec in kaldi_io.read_vec_flt_scp(f):
        #print key, mat.shape
        pool_xvectors[key] = xvec
        c += 1
print("Read ", c, "pool xvectors")

# One affinity file per source speaker
with open(src_xvec_file) as f:
    for sspk, sxvec in kaldi_io.read_vec_flt_scp(f):
        print("Computing cosine measure for " + sspk)
        with open(join(scores_dir, 'affinity_' + sspk), 'w') as sf:
            for pspk, pxvec in pool_xvectors.items():
                # compute cosine distance between src and pool spk
                # Multiplying by -1 to ensure compatibility with affinity
                # Now lower value will indicate less affinity as compared
                # to original cosine distance
                dist = -1.0 * distance.cosine(sxvec, pxvec)
                # NOTE(review): dist is computed but never written to sf;
                # the write line appears to be missing from this excerpt.
    spk = i.split()[0]
    if spk != 's5': # add prefix 'p'
        spk = 'p' + spk
    if sys.argv[1] == 'age':
        trait = int(i.split()[1])
    elif sys.argv[1] == 'gender':
        trait = i.split()[2]
    elif sys.argv[1] == 'accent':
        trait = i.split()[3]
    spk2trait[spk] = trait
# NOTE(review): spk2trait is filled by a loop whose header is missing from
# this excerpt; it maps speaker id -> age/gender/accent trait.
print('speaker to trait is %s' % spk2trait)

# 2-D t-SNE of the x-vectors in sys.argv[2], labeled by speaker trait
tsne = TSNE(n_components=2, verbose=1)
X, y = [], []
index = 0
for key,vec in read_vec_flt_scp(sys.argv[2]):
    X.append(vec)
    # speaker id is the first '-' field of the utterance key
    spk = key.split('-')[0]
    y.append(spk2trait[spk])
    #print(vec.shape)
    #y.append(index)
    index += 1
X, y = np.array(X), np.array(y)
print(len(y))
print(np.unique(y))
X_emb = tsne.fit_transform(X) # tsne transformed

# For reproducability of the results
# NOTE(review): seeding AFTER fit_transform does not make the t-SNE above
# reproducible -- it only fixes the permutation below.
np.random.seed(42)
N = int(sys.argv[3])
rndperm = np.random.permutation(X_emb.shape[0])
Beispiel #30
0
def get_cmap(n, name='hsv'):
    """Return a matplotlib colormap with *n* distinct colors."""
    cmap = plt.cm.get_cmap(name, n)
    return cmap

# get gender info: spk2gender_file has "<spkid> <gender>" per line
spk2gender = {}
with open(spk2gender_file) as f:
    for line in f.read().splitlines():
        sp = line.split()
        spkid = sp[0]
        gen = sp[1]
        spk2gender[spkid] = gen

# Load speaker x-vectors from the scp into one (n_spk, dim) matrix
X = []
spks = []
for key, mat in kaldi_io.read_vec_flt_scp(spk_xvector_file):
    #print(key, mat.shape)
    spks.append(key)
    X.append(mat[np.newaxis])

X = np.concatenate(X)
print("X = ", X.shape)
# Standardize each dimension before t-SNE
mean_X = np.mean(X, axis=0)
std_X = np.std(X, axis=0)
X = (X - mean_X) / std_X

# Fixed random_state for a reproducible embedding
tsne = TSNE(n_components=2, init='random', random_state=42,
                     perplexity=5)
Y = tsne.fit_transform(X)

nspk = Y.shape[0]
import kaldi_io

# Script: CORAL domain adaptation -- align out-of-domain x-vectors to the
# in-domain covariance. Only the covariance computation is visible here.
if len(sys.argv) != 4:
    print("usage: %s ood-xvectors id-xvectors ood-transformed-dir" %
          sys.argv[0])
    print("The x-vectors should be normalized by the mean.")
    quit()

print("Perform CORAL transform")
ood_vec_file = sys.argv[1]   # out-of-domain x-vectors (scp)
id_vec_file = sys.argv[2]    # in-domain x-vectors (scp)
transform_dir = sys.argv[3]  # output directory for transformed vectors

# Load out-of-domain vectors; keys are kept for writing results later
ood_vec = []
ood_keys = []
for key, vec in kaldi_io.read_vec_flt_scp(ood_vec_file):
    ood_vec.append(vec)
    ood_keys.append(key)
ood_vec = np.array(ood_vec)

# Load in-domain vectors (keys not needed)
id_vec = []
for key, vec in kaldi_io.read_vec_flt_scp(id_vec_file):
    id_vec.append(vec)
id_vec = np.array(id_vec)
dim = id_vec.shape[1]

# Covariance
# Second moment plus identity regularization; this equals the covariance
# only because the inputs are assumed mean-normalized (see usage message).
Cs = (1.0 / ood_vec.shape[0]) * np.dot(np.transpose(ood_vec),
                                       ood_vec) + np.eye(dim)
Ct = (1.0 / id_vec.shape[0]) * np.dot(np.transpose(id_vec),
                                      id_vec) + np.eye(dim)