Ejemplo n.º 1
0
def index_train(feat_filename, index_filename, index_type="IVF1024,Flat"):
    """Build a faiss index over the frames of an ark file and run a demo query.

    Reads up to 10000 entries from ``feat_filename`` via ``kaldi_io.readArk``,
    concatenates all frames along axis 0, trains an index created with
    ``faiss.index_factory`` and adds every frame to it. Finally the 50 nearest
    neighbours of frame 100 are searched and printed.

    Args:
        feat_filename: path handed to ``kaldi_io.readArk``.
        index_filename: accepted but not used anywhere in this function.
        index_type: faiss factory string; it is ASCII-encoded before being
            passed to ``faiss.index_factory``.

    Returns:
        None. All results are printed.
    """
    print('Loading feature data...')
    max_feats = 10000
    feats, utt_ids = kaldi_io.readArk(feat_filename, limit=max_feats)

    # Total number of frames over all loaded utterances.
    complete_feat_len = sum(feat.shape[0] for feat in feats)

    # Lookup tables from a global frame id to (utterance index, frame index
    # inside that utterance). They are built here but not read afterwards.
    utt_map = np.zeros(complete_feat_len, dtype=np.int32)
    pos_map = np.zeros(complete_feat_len, dtype=np.int32)

    offset = 0
    for utt_index, (feat, _utt_id) in enumerate(zip(feats, utt_ids)):
        n_frames = feat.shape[0]
        utt_map[offset:offset + n_frames] = utt_index
        pos_map[offset:offset + n_frames] = np.arange(n_frames)
        offset += n_frames

    complete_feats = np.concatenate(feats, axis=0)

    # Quick sanity dump of the stacked feature matrix.
    print('complete_feats:', complete_feats)

    print('complete_feats.min():', complete_feats.min())
    print('complete_feats.max():', complete_feats.max())
    print('complete_feats.sum()', complete_feats.sum())

    print('Going to index features of shape:', complete_feats.shape)

    feat_dim = complete_feats.shape[1]
    index = faiss.index_factory(feat_dim, index_type.encode("ascii"))

    index.train(complete_feats)
    index.add(complete_feats)

    print('is_trained:', index.is_trained)
    print('ntotal:', index.ntotal)

    print('Indexing finished.')

    # Demo query: use one indexed frame as a single-row query matrix.
    search_vec_index = 100
    search_vec = complete_feats[search_vec_index].reshape(1, -1)

    print("Shape of search_vec:", search_vec.shape)

    D, I = index.search(search_vec, 50)

    print("Neighboors of ", search_vec_index)

    for title, result in (("D:", D), ("I:", I)):
        print(title)
        print(result)
Ejemplo n.º 2
0
def visualize_kaldi_bin_feats(feat_filename,
                              max_frames,
                              num_feat=0,
                              phn_file='',
                              phn_offset=5,
                              wav_file='',
                              do_tsne=False):
    """Plot one feature matrix from an ark file, optionally with phone marks.

    Args:
        feat_filename: path handed to ``kaldi_io.readArk`` (limit 10000).
        max_frames: only the first ``max_frames`` rows are drawn.
        num_feat: index of the entry to visualize.
        phn_file: if non-empty, loaded with ``utils.loadPhnFile``; a dashed
            vertical line is drawn per position that falls before
            ``max_frames``.
        phn_offset: value subtracted from each line's x position.
        wav_file: accepted but not used in this function.
        do_tsne: if true, additionally plot a 2-D t-SNE of the selected
            feature matrix.
    """
    feats, utt_ids = kaldi_io.readArk(feat_filename, limit=10000)

    print([feat.shape for feat in feats], utt_ids)

    print('showing features for utt_id:', utt_ids[num_feat])

    selected = feats[num_feat]

    # Print min / max / sum reduced along axis 1 of the selected matrix.
    for caption, reducer in (('min vector:', np.min),
                             ('max vector:', np.max),
                             ('sum vector:', np.sum)):
        print(caption)
        print(reducer(selected, axis=1))

    print(selected.shape)

    # The matrix is always drawn; phone boundaries are an optional overlay.
    plt.matshow(selected[:max_frames].T)
    if phn_file != '':
        positions, _names = utils.loadPhnFile(phn_file)
        xpositions = []
        for pos in positions:
            # samples_per_frame is resolved from the enclosing module scope.
            frame = float(pos[1]) / samples_per_frame
            if frame < max_frames:
                xpositions.append(frame - phn_offset)
        for xc in xpositions:
            plt.axvline(x=xc, color='k', linestyle='--')
    plt.show()

    if not do_tsne:
        return

    plt.figure(1)

    print('Calculating TSNE:')
    model = TSNE(n_components=2, random_state=0)

    tsne_data = model.fit_transform(selected)
    plt.plot(tsne_data[:, 0], tsne_data[:, 1], '--')

    print('Now showing tsne plot:')
    plt.show()
Ejemplo n.º 3
0
def cluster_rnn_phn(n_clusters,
                    wav_files,
                    ark_file,
                    hopping_size,
                    window_size,
                    subsample,
                    n_jobs=4):
    """Load features from ``ark_file`` and sketch a TensorFlow segment mean.

    NOTE(review): this looks like an unfinished stub. Only ``ark_file`` is
    read; ``n_clusters``, ``wav_files``, ``hopping_size``, ``window_size``,
    ``subsample`` and ``n_jobs`` are not used, the names ``x`` and ``ids``
    are not defined inside this function, and nothing is returned.
    """

    # Load the feature matrices and their ids; neither is used below.
    feats, uttids = kaldi_io.readArk(ark_file)

    # NOTE(review): called without any arguments and the result is discarded.
    tf.segment_mean()
    #tf.

    # Segment mean built from two unsorted segment sums (sum / count), with
    # the number of segments fixed to 2. Adapted from
    # https://github.com/tensorflow/tensorflow/issues/7389
    # NOTE(review): `x` and `ids` must come from an outer scope - confirm.
    ones = tf.ones_like(x)
    count = tf.unsorted_segment_sum(ones, ids, 2)
    sums = tf.unsorted_segment_sum(x, ids, 2)
    mean = tf.divide(sums, count)
Ejemplo n.º 4
0
def load_feats_flat(ark_file, hopping_size=1.0):
    """Load an ark file and flatten all utterances into one frame matrix.

    Args:
        ark_file: path handed to ``kaldi_io.readArk``.
        hopping_size: factor applied to each frame index to obtain its
            position inside the utterance. New keyword argument; the
            original body referenced this name without defining it.

    Returns:
        A 4-tuple ``(feats, feats_flat, uttids_flat, pos_flat)``:
        ``feats`` is the list returned by ``readArk``; ``feats_flat`` is a
        ``(total_frames, dim)`` float array with all frames stacked in
        order; ``uttids_flat`` is a list holding the utterance id of each
        row of ``feats_flat``; ``pos_flat`` is a float array holding
        ``frame_index * hopping_size`` for each row.
    """
    feats, uttids = kaldi_io.readArk(ark_file)

    # Nothing to flatten: return empty containers instead of failing on
    # feats[0] below.
    if len(feats) == 0:
        return feats, np.zeros((0, 0)), [], np.zeros(0)

    # Preallocate the output arrays. Each feat is indexed as
    # (frames, dim), so the feature width is shape[1] of a whole matrix.
    inner_dim = feats[0].shape[1]
    sum_len = sum(feat.shape[0] for feat in feats)

    # np.zeros takes the shape as a single tuple argument.
    feats_flat = np.zeros((sum_len, inner_dim))
    pos_flat = np.zeros(sum_len)
    uttids_flat = []

    pos = 0
    for uttid, feat in zip(uttids, feats):
        n_frames = feat.shape[0]
        feats_flat[pos:pos + n_frames] = feat
        # Position of every frame inside its own utterance.
        pos_flat[pos:pos + n_frames] = (
            np.arange(n_frames, dtype=float) * hopping_size)
        # Repeat the utterance id once per frame.
        uttids_flat += [uttid] * n_frames
        pos += n_frames

    return feats, feats_flat, uttids_flat, pos_flat
Ejemplo n.º 5
0
def visualize_stats(feat_filename,
                    max_feats,
                    abs_feats=True,
                    reverse_sort=True):
    """Plot the per-dimension average of the features in an ark file.

    For every loaded entry the rows are averaged (after taking absolute
    values when ``abs_feats`` is true); those per-entry averages are then
    averaged again over all entries. The result is shown unsorted and
    sorted.

    Args:
        feat_filename: path handed to ``kaldi_io.readArk``.
        max_feats: passed to ``readArk`` as ``limit``.
        abs_feats: average absolute values instead of signed values.
        reverse_sort: show the sorted averages in descending order.
    """
    feats, utt_ids = kaldi_io.readArk(feat_filename, limit=max_feats)

    feats_len = len(feats)

    print("Loaded:" + str(feats_len) + "feats")

    per_entry_avgs = []
    for feat in feats:
        values = np.abs(feat) if abs_feats else feat
        entry_avg = np.sum(values, axis=0) / float(len(values))
        print(entry_avg.shape)
        per_entry_avgs.append(entry_avg)

    stacked = np.stack(per_entry_avgs, axis=0)
    print(stacked.shape)

    # Average of the per-entry averages.
    finalsum = np.sum(stacked, axis=0) / float(feats_len)

    ascending = np.sort(np.array(finalsum))
    finalsum_sorted = ascending[::-1] if reverse_sort else ascending

    print(finalsum)
    print(finalsum_sorted)

    plt.plot(finalsum_sorted)

    # One single-row heat map each for the unsorted and sorted averages.
    for fig_num, row in ((1, finalsum), (2, finalsum_sorted)):
        plt.figure(fig_num)
        plt.matshow([row])
    plt.show()
Ejemplo n.º 6
0
def get_vectors():
    """Return feature vectors from an ark file as a JSON response.

    Parameters are read from the POST form of the current Flask request:

    * ``feat_file`` - path to the feature file (required; the endpoint
      expects one of the paths returned by /list_avail_reps).
    * ``half_index`` - cut vectors at this position (optional, default -1
      = disabled). Only applied when averaging.
    * ``limit`` - max number of entries to load (optional, default -1).
    * ``average_utts`` - average the vectors of each utterance (optional,
      default true). The strings ``''`` and ``'false'`` (any letter case)
      disable averaging; any other value enables it.
    * ``stride`` - keep every n-th vector; only read when averaging is
      disabled (optional, default 1).
    * ``normalize`` - normalize vectors to unit length; enabled by the mere
      presence of the field.

    Returns:
        A ``Response`` with mimetype ``application/json``. On success the
        body is ``{'status': 'success', 'vectors': {utt_id: vector(s)}}``;
        on a missing ``feat_file`` or a non-integer numeric field it is
        ``{'status': 'fail', 'reason': ...}``.
    """
    form = flask.request.form

    def _fail(reason):
        # JSON error envelope shared by all failure paths of this handler.
        response_str = json.dumps({'status': 'fail', 'reason': reason})
        return Response(response_str, mimetype='application/json')

    def _int_field(name, default, missing_note):
        # Read an optional integer form field; log when falling back.
        if name in form:
            return int(form[name])
        print(missing_note)
        return default

    if 'feat_file' not in form:
        return _fail('You must supply a feat_file for /get_vectors')
    # NOTE(review): this path comes straight from the client and is opened
    # below without being checked against the /list_avail_reps whitelist.
    feat_filename = form['feat_file']

    try:
        half_index = _int_field(
            'half_index', -1,
            'POST /get_vectors called without half_index parameter, setting to default -1 (disable)'
        )
        limit = _int_field(
            'limit', -1,
            'POST /get_vectors called without limit parameter, setting to default -1 (disable)'
        )

        # Form values are strings, so the flag is parsed exactly once into a
        # real bool. Previously the string 'false' was truthy and still
        # selected the averaging branch.
        if 'average_utts' in form:
            raw_flag = str(form['average_utts']).strip().lower()
            average_utts = raw_flag not in ('', 'false')
        else:
            print(
                'POST /get_vectors called without average_utts parameter, setting to default true (enable)'
            )
            average_utts = True

        stride = 1
        if not average_utts:
            stride = _int_field(
                'stride', 1,
                'POST /get_vectors called with average_utts = False, but stride parameter is not set, setting it to the default value (1)'
            )
    except ValueError as err:
        return _fail('Invalid numeric parameter for /get_vectors: %s' % err)

    normalize = ('normalize' in form)

    feats, utt_ids = kaldi_io.readArk(feat_filename, limit=limit)

    feats_len = len(feats)

    assert (len(utt_ids) == len(feats))

    print("Loaded:" + str(feats_len) + " feats.")

    if average_utts:
        # One mean vector per utterance.
        feats = [feat.mean(0) for feat in feats]

        if half_index != -1:
            print('Cutting vectors at ', half_index,
                  'and normalize to unit length' if normalize else '')
            feats = [
                feat[:half_index] /
                (np.linalg.norm(feat[:half_index]) if normalize else 1.0)
                for feat in feats
            ]
        elif normalize:
            print('Normalize to unit length.')
            feats = [feat / np.linalg.norm(feat) for feat in feats]
    else:
        if stride != 1:
            feats = [feat[::stride] for feat in feats]

        if half_index != -1:
            print('Cutting vectors at ', half_index)
            print('Not yet supported')

        if normalize:
            # Row-wise normalization of every frame vector.
            feats = [(feat.T / np.linalg.norm(feat, axis=1)).T
                     for feat in feats]

    response_vec_dict = {
        utt_id: feat.tolist() for utt_id, feat in zip(utt_ids, feats)
    }

    response_str = json.dumps({
        'status': 'success',
        'vectors': response_vec_dict
    })

    return Response(response_str, mimetype='application/json')
Ejemplo n.º 7
0
def visualize_classes_tsne(feat_filename,
                           utt_2_class_filename,
                           half_index=-1,
                           normalize=True,
                           class_mean_vector=False):
    """Show a 2-D t-SNE scatter plot of utterance vectors coloured by class.

    Every entry of the ark file is reduced to its mean vector, optionally
    cut at ``half_index`` and normalized. Only classes with more than 100
    utterances are kept, and at most 1000 randomly sampled utterances are
    used per class.

    Args:
        feat_filename: path handed to ``kaldi_io.readArk`` (limit 25000).
        utt_2_class_filename: path handed to ``utils.loadUtt2Spk``; the
            result is used as a mapping from utterance id to class.
        half_index: keep only ``vector[:half_index]``; -1 disables cutting.
        normalize: scale every vector to unit length.
        class_mean_vector: plot one mean vector per class instead of one
            point per sampled utterance.

    Returns:
        None. If no class has more than 100 utterances a message is printed
        and nothing is plotted.
    """
    feats, utt_ids = kaldi_io.readArk(feat_filename, limit=25000)

    feats_len = len(feats)

    assert (len(utt_ids) == len(feats))

    print("Loaded:" + str(feats_len) + " feats.")

    feats = [feat.mean(0) for feat in feats]

    if half_index != -1:
        print('Cutting vectors at ', half_index,
              'and normalize to unit length' if normalize else '')
        feats = [
            feat[:half_index] /
            (np.linalg.norm(feat[:half_index]) if normalize else 1.0)
            for feat in feats
        ]
    elif normalize:
        print('Normalize to unit length.')
        feats = [feat / np.linalg.norm(feat) for feat in feats]

    utt_2_class = utils.loadUtt2Spk(utt_2_class_filename)

    # Single pass: remember the vector of every labelled utterance and group
    # the utterance ids by class (instead of rescanning all ids per class).
    dataset = {}
    class_2_utts = {}
    for feat, utt in zip(feats, utt_ids):
        if utt in utt_2_class:
            dataset[utt] = feat
            class_2_utts.setdefault(utt_2_class[utt], []).append(utt)

    myclass_2_samples = {}
    for myclass, class_utts in class_2_utts.items():
        if len(class_utts) > 100:
            myclass_2_samples[myclass] = random.sample(
                class_utts, min(1000, len(class_utts)))

    if not myclass_2_samples:
        # Previously this case crashed in max() on an empty list.
        print('No class with more than 100 utterances found, nothing to plot.')
        return

    feats_samples = []
    feats_samples_classes = []

    if class_mean_vector:
        for myclass, sampled_utts in myclass_2_samples.items():
            # np.vstack needs a real sequence; generators are rejected by
            # current NumPy releases.
            class_matrix = np.vstack([dataset[utt] for utt in sampled_utts])
            feats_samples.append(class_matrix.mean(0))
            feats_samples_classes.append(myclass)
    else:
        for myclass, sampled_utts in myclass_2_samples.items():
            feats_samples += [dataset[utt] for utt in sampled_utts]
            feats_samples_classes += [myclass] * len(sampled_utts)
            print('Added', len(sampled_utts), 'entries for', myclass)
            print([
                utt.replace('train-sample', 'train/sample') + '.mp3'
                for utt in sampled_utts
            ])

    class_2_num = {
        myclass: num for num, myclass in enumerate(myclass_2_samples)
    }

    print(class_2_num)

    feats_samples_classes_num = [
        class_2_num[myclass] for myclass in feats_samples_classes
    ]

    # Real number of classes (the old code printed the highest index).
    num_classes = len(class_2_num)
    print('Num classes=', num_classes)

    print(feats_samples)

    print('shape:', feats_samples[0].shape)

    print('Calculating TSNE:')

    model = TSNE(n_components=2, random_state=0, metric='euclidean')
    tsne_data = model.fit_transform(np.vstack(feats_samples))

    # One colour per class, spread over the first 90% of the colormap.
    colormap = plt.cm.gist_ncar  # alternatives: nipy_spectral, Set1, Paired
    colorst = colormap(np.linspace(0, 0.9, num_classes))

    cs = [colorst[class_num] for class_num in feats_samples_classes_num]

    plt.scatter(tsne_data[:, 0], tsne_data[:, 1], color=cs)
    # NOTE(review): the scatter call sets no label, so this legend has no
    # entries to show.
    plt.legend()

    print('Now showing tsne plot:')
    plt.show()
Ejemplo n.º 8
0
def convert(alignments, spectrograms):
    """Convert alignment and spectrogram data into per-utterance files.

    Input is read from ``../dat/speech_tokenizer`` and output is written to
    ``../dat/fast_load`` (both relative to this file's directory).

    Args:
        alignments: if true, write ``tags.json`` (from
            ``new_alignments/phones.txt``), one ``utterances/<id>.json`` per
            utterance (from ``new_alignments/merged_alignment.txt``) and
            ``utterances.json`` with the sorted utterance ids.
        spectrograms: if true, load
            ``TEDLIUM_fbank_train_cleaned/unnormalized.feats.ark`` with
            ``readArk`` and save one ``utterances/<id>.npy`` per entry.

    Raises:
        AssertionError: if a line of ``phones.txt`` does not carry its own
            line index, or a start/duration field of the alignment file does
            not have exactly three digits after the decimal point.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    input_dir = os.path.abspath(base_dir + '/../dat/speech_tokenizer')
    output_dir = os.path.abspath(base_dir + '/../dat/fast_load')
    utterances_output_dir = output_dir + '/utterances'
    os.makedirs(utterances_output_dir, exist_ok=True)

    if alignments:
        print('Storing tag information...', end='', flush=True)
        tags = []
        with open(input_dir + '/new_alignments/phones.txt',
                  'r') as file_handle:
            for i, line in enumerate(file_handle):
                tag, tag_id = line.strip('\n').split(' ')
                assert i == int(tag_id)
                # "<name>_<suffix>" is stored as (suffix, name); a tag
                # without an underscore is stored as (None, name).
                parts = tag.split('_')
                tags.append((parts[1] if len(parts) > 1 else None, parts[0]))
        with open(output_dir + '/tags.json', 'w') as file_handle:
            json.dump(tags, file_handle)
        print(' DONE')

        print('Converting alignments...', end='', flush=True)
        tag_dict = dict()
        with open(input_dir + '/new_alignments/merged_alignment.txt',
                  'r') as file_handle:
            for line in file_handle:
                # Get the data on one tag
                fields = line.strip('\n').split(' ')
                # Convert starts and durations from seconds to numbers of
                # frames without risking floating point errors: require
                # three decimals, drop the last one and remove the dot.
                assert len(re.sub(r'.*\.', '', fields[2])) == 3
                assert len(re.sub(r'.*\.', '', fields[3])) == 3
                utt_id = fields[0]
                start = int(fields[2][:-1].replace('.', ''))
                duration = int(fields[3][:-1].replace('.', ''))
                tag = int(fields[4])
                tag_dict.setdefault(utt_id, []).append((start, duration, tag))
        n_ids = len(tag_dict)
        print(' DONE')

        start_time = time()
        n_done = 0
        for n_done, (key, value) in enumerate(tag_dict.items(), start=1):
            progress.print_bar(n_done - 1, n_ids, 20,
                               'Storing alignment data... ┃', '┃')
            with open(utterances_output_dir + '/%s.json' % (key, ),
                      'w') as file_handle:
                json.dump(value, file_handle)
        # Final bar; skipped when there was nothing to store (the loop
        # variable used here previously was unbound in that case).
        if n_ids:
            progress.print_bar(n_done, n_ids, 20,
                               'Storing alignment data... ┃',
                               '┃ DONE %.4fs' % (time() - start_time))

        print('Storing sequence IDs...', end='', flush=True)
        with open(output_dir + '/utterances.json', 'w') as file_handle:
            # Sorted so the id list is identical regardless of input order.
            json.dump(sorted(tag_dict), file_handle, indent=4)
        print(' DONE')

    if spectrograms:
        # Convert spectrograms
        print('Loading spectrogram data (this may take some time)...',
              end='',
              flush=True)
        feats, ids = readArk(
            input_dir + '/TEDLIUM_fbank_train_cleaned/unnormalized.feats.ark')
        n_ids = len(ids)
        print(' DONE')

        start_time = time()
        n_done = 0
        for n_done, (feat, utt_id) in enumerate(zip(feats, ids), start=1):
            progress.print_bar(n_done - 1, n_ids, 20,
                               'Storing spectrogram data... ┃', '┃')
            np.save(utterances_output_dir + '/%s.npy' % (utt_id, ), feat)
        if n_ids:
            progress.print_bar(n_done, n_ids, 20,
                               'Storing spectrogram data... ┃',
                               '┃ DONE %.4fs' % (time() - start_time))
Ejemplo n.º 9
0
def cluster_speaker(ark_file,
                    cluster_algo='HDBSCAN',
                    half_index=-1,
                    dbscan_eps=0.0005,
                    dbscan_min_samples=3,
                    min_cluster_sizes_str="5",
                    min_samples_str="3",
                    utt_2_spk=None,
                    output_utt_2_spk=None,
                    fileset='dev',
                    tsne_viz=False,
                    n_jobs=4,
                    db_scan_range_search=False,
                    hdb_scan_range_search=False,
                    normalize=True,
                    do_save_result=True,
                    use_gpu=False):
    """Cluster per-utterance mean vectors and score them against speakers.

    Each entry of the ark file is reduced to its mean vector, optionally cut
    and normalized, then clustered with HDBSCAN or DBSCAN for every
    combination of ``min_cluster_sizes_str`` x ``min_samples_str``. When a
    ground-truth utt2spk file is given, ARI, V-measure and pairwise
    precision / recall / F1 are printed (and stored through ``save_result``
    when ``do_save_result`` is true).

    Args:
        ark_file: ark path; ``%set`` is replaced by ``fileset``. The third
            path component from the end is used as the result key.
        cluster_algo: ``'HDBSCAN'`` or ``'DBSCAN'``. ``'kmeans'`` and any
            other value print a message and exit the process.
        half_index: if not -1, only ``vector[half_index:]`` (the tail) is
            kept.
        dbscan_eps: ``eps`` for the DBSCAN estimator.
        dbscan_min_samples: ``min_samples`` for the DBSCAN estimator.
        min_cluster_sizes_str: comma-separated HDBSCAN ``min_cluster_size``
            values.
        min_samples_str: comma-separated HDBSCAN ``min_samples`` values.
        utt_2_spk: path of a ground-truth utt2spk file (``%set`` is
            replaced); ``None``, ``'none'`` or a blank string disable
            scoring.
        output_utt_2_spk: output path template (``%minclustersize``,
            ``%minsample``, ``%set``, ``%feat``); only written when a single
            parameter combination was run.
        fileset: data set name substituted for ``%set``.
        tsne_viz: show a t-SNE scatter plot of the vectors.
        n_jobs: accepted but not used in this function.
        db_scan_range_search: additionally grid-search DBSCAN ``eps`` /
            ``min_samples`` by ARI (requires ``utt_2_spk``).
        hdb_scan_range_search: accepted but not used in this function.
        normalize: scale every vector to unit length.
        do_save_result: forward parameters and scores to ``save_result``.
        use_gpu: accepted but not used in this function.

    Raises:
        ValueError: if ``db_scan_range_search`` is requested without
            ground-truth labels.
    """
    postfix = ''
    print('Loading feats now:')

    feats, uttids = kaldi_io.readArk(ark_file.replace('%set', fileset))

    print('feat[0] shape: ', feats[0].shape)

    print('Generating mean vector.')

    feats = np.vstack([feat.mean(0) for _utt, feat in zip(uttids, feats)])

    if half_index != -1:
        print('Cutting vectors at ', half_index,
              'and normalize to unit length' if normalize else '')
        feats = np.vstack([
            feat[half_index:] /
            (np.linalg.norm(feat[half_index:]) if normalize else 1.0)
            for feat in feats
        ])
    elif normalize:
        print('Normalize to unit length.')
        feats = np.vstack([feat / np.linalg.norm(feat) for feat in feats])

    print('Done. feats shape:', feats.shape)

    print('feats shape:', feats.shape)
    print('feat[0] shape: ', feats[0].shape)

    print('halfindex:', half_index)

    # Decide once whether ground truth is available. The loaded mapping is
    # kept under its own name: rebinding `utt_2_spk` to it made the later
    # `utt_2_spk.lower()` checks fail.
    has_ground_truth = (utt_2_spk is not None
                        and utt_2_spk.lower() != 'none'
                        and utt_2_spk.strip() != '')

    ground_truth_utt_2_spk, ground_truth_utt_2_spk_int = None, None

    if has_ground_truth:
        utt_2_spk_map = utils.loadUtt2Spk(utt_2_spk.replace('%set', fileset))

        ground_truth_utt_2_spk = [utt_2_spk_map[utt_id] for utt_id in uttids]

        le = preprocessing.LabelEncoder()
        le.fit(ground_truth_utt_2_spk)

        ground_truth_utt_2_spk_int = le.transform(ground_truth_utt_2_spk)

        print("Ground truth speaker classes available:")

        print(ground_truth_utt_2_spk_int)

    print('Now running DBSCAN clustering on', len(uttids), 'entries.')

    if db_scan_range_search:
        if not has_ground_truth:
            raise ValueError(
                'db_scan_range_search needs ground truth labels (utt_2_spk).')

        best_ari = 0.0
        best_ari_conf = {}

        eps_range = [x / 100.0 for x in range(1, 100)]
        min_samples_range = [1, 2, 3, 4, 5, 6, 7, 8, 10, 20, 30, 50, 100]

        ari_mat = np.zeros((len(eps_range), len(min_samples_range)))

        print('shape result mat:', ari_mat.shape)

        # Dedicated loop variables so the search does not overwrite the
        # dbscan_eps / dbscan_min_samples arguments used further down.
        for i_eps, search_eps in enumerate(eps_range):
            for i_min_samples, search_min_samples in enumerate(
                    min_samples_range):

                dbscan_algo = DBSCAN(eps=search_eps,
                                     min_samples=search_min_samples,
                                     metric=pairwise_pos_neg_dot_distance,
                                     n_jobs=1)
                search_labels = list(dbscan_algo.fit(feats).labels_)

                print('dbscan_eps', search_eps, 'dbscan_min_samples',
                      search_min_samples)
                print('num clusters:', len(set(search_labels)))

                ARI = metrics.adjusted_rand_score(ground_truth_utt_2_spk_int,
                                                  search_labels)

                ari_mat[i_eps][i_min_samples] = float(ARI)

                print('ARI:', ARI)

                if ARI > best_ari:
                    print('Found new best conf:', ARI)
                    best_ari_conf = {
                        'eps': search_eps,
                        'min_samples': search_min_samples
                    }
                    best_ari = ARI

        plt.matshow(ari_mat)
        plt.show()

        np.save(ark_file + '.dbrangescan_cluster_ARI' + postfix, ari_mat)

        print('bestARI:', best_ari)
        print('bestConf:', best_ari_conf)

    min_cluster_sizes = [int(x) for x in min_cluster_sizes_str.split(',')]
    min_samples = [int(x) for x in min_samples_str.split(',')]

    # Per-configuration results; filled only when ground truth is available.
    result_mat = np.zeros((len(min_cluster_sizes), len(min_samples)))
    result_mat_outliers = np.zeros_like(result_mat)
    result_mat_n = np.zeros_like(result_mat)

    best_pairwise_f1 = 0.0
    bestConf = {}

    number_format = "%.4f"

    # previous good config: min_cluster_size=5, min_samples=3
    for i, min_cluster_size in enumerate(min_cluster_sizes):
        for j, min_sample in enumerate(min_samples):

            feat_key = ark_file.split('/')[-3] + '_' + str(
                min_cluster_size) + '_' + str(min_sample)

            if do_save_result:
                save_result(feat_key, "cl_size", str(min_cluster_size))
                save_result(feat_key, "min_s", str(min_sample))

            # Build a fresh estimator per configuration. `cluster_algo` must
            # stay the algorithm name: rebinding it to the estimator made
            # every later configuration silently reuse the first one.
            if cluster_algo == 'HDBSCAN':
                print('Running HDBSCAN with min_cluster_size',
                      min_cluster_size, 'min_samples', min_sample)
                estimator = HDBSCAN(min_cluster_size=min_cluster_size,
                                    min_samples=min_sample,
                                    metric='euclidean',
                                    algorithm='best',
                                    core_dist_n_jobs=28)
            elif cluster_algo == 'DBSCAN':
                print('Running DBSCAN with dbscan_eps', dbscan_eps,
                      'dbscan_min_samples', dbscan_min_samples)
                estimator = DBSCAN(eps=dbscan_eps,
                                   min_samples=dbscan_min_samples,
                                   metric='euclidean',
                                   n_jobs=28)
            elif cluster_algo == 'kmeans':
                print(
                    'kmeans clustering not available for speaker clustering yet. Exiting.'
                )
                sys.exit(-1)
            else:
                print('cluster_algo:', cluster_algo, 'not supported. Exiting.')
                sys.exit(-1)

            clustering = estimator.fit(feats)
            clustering_labels = list(clustering.labels_)

            print('Num of clusters (as determined by density clustering):',
                  len(set(clustering_labels)))
            print(clustering_labels)

            sys.stdout.flush()

            # Give every outlier (label -1) its own negative label so each
            # one counts as a singleton cluster in the scores below.
            clustering_labels2 = []
            next_outlier_label = -1
            for elem in clustering_labels:
                if elem == -1:
                    clustering_labels2.append(next_outlier_label)
                    next_outlier_label -= 1
                else:
                    clustering_labels2.append(elem)

            num_outliers = (next_outlier_label + 1) * -1

            if not has_ground_truth:
                continue

            if do_save_result:
                save_result(feat_key, 'outliers_' + fileset,
                            str(num_outliers))
                save_result(feat_key, 'clusters_' + fileset,
                            str(len(set(clustering_labels))))

            print(
                'Number of outliers:', num_outliers, '(', number_format %
                (float(num_outliers) * 100.0 / float(len(uttids))), '%)')

            ARI2 = metrics.adjusted_rand_score(ground_truth_utt_2_spk_int,
                                               clustering_labels2)
            print('ARI score (each outlier its own cluster):',
                  number_format % ARI2)
            vmeasure2 = metrics.v_measure_score(ground_truth_utt_2_spk_int,
                                                clustering_labels2)
            print('NMI / V-measure (each outlier its own cluster):',
                  number_format % vmeasure2)

            if do_save_result:
                save_result(feat_key, 'ARI_' + fileset, number_format % ARI2)
                save_result(feat_key, 'NMI_' + fileset,
                            number_format % vmeasure2)

            print('Calculating pairwise recall:')

            # Condensed boolean vectors: True where a pair of utterances
            # shares the same label (integer labels differ by < 1).
            cluster_pairwise = pdist(
                np.asarray(clustering_labels2)[:, np.newaxis],
                metric='chebyshev') < 1
            groundtruth_pairwise = pdist(
                np.asarray(ground_truth_utt_2_spk_int)[:, np.newaxis],
                metric='chebyshev') < 1

            # Bitwise counting instead of scikit-learn's precision / recall
            # helpers, since the pair vector can be huge for large n.
            tp = float(
                np.sum(np.bitwise_and(groundtruth_pairwise,
                                      cluster_pairwise)))
            fp = float(
                np.sum(
                    np.bitwise_and(np.invert(groundtruth_pairwise),
                                   cluster_pairwise)))
            fn = float(
                np.sum(
                    np.bitwise_and(groundtruth_pairwise,
                                   np.invert(cluster_pairwise))))

            # Report 0.0 instead of dividing by zero when there are no
            # positive pairs.
            pairwise_precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
            pairwise_recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0

            pr_sum = pairwise_recall + pairwise_precision
            pairwise_f1 = (2.0 * pairwise_recall * pairwise_precision /
                           pr_sum) if pr_sum > 0 else 0.0

            print(
                'pairwise recall / precision / f1-score (each outlier its own cluster):',
                number_format % pairwise_recall,
                number_format % pairwise_precision,
                number_format % pairwise_f1)

            if do_save_result:
                save_result(feat_key, 'recall_' + fileset,
                            number_format % pairwise_recall)
                save_result(feat_key, 'prec_' + fileset,
                            number_format % pairwise_precision)
                save_result(feat_key, 'f1_' + fileset,
                            number_format % pairwise_f1)

            if pairwise_f1 > best_pairwise_f1:
                print('Found new best pairwise f1:', pairwise_f1)
                bestConf = {
                    'min_cluster_size': min_cluster_size,
                    'min_sample': min_sample,
                    'n': len(set(clustering_labels)),
                    'outliers': num_outliers
                }
                best_pairwise_f1 = pairwise_f1

            result_mat[i][j] = float(pairwise_f1)
            result_mat_outliers[i][j] = num_outliers
            result_mat_n[i][j] = len(set(clustering_labels))

            print('Clustering predicted classes:',
                  len(set(clustering_labels)))
            print('Ground truth classes',
                  len(set(ground_truth_utt_2_spk_int)))

    if tsne_viz:
        print('Calculating TSNE:')

        model = TSNE(n_components=2, random_state=0, metric='euclidean')
        tsne_data = model.fit_transform(feats)

        # Colour by ground-truth speaker when available, otherwise by the
        # cluster labels of the last configuration that was run.
        if has_ground_truth:
            color_labels = ground_truth_utt_2_spk_int
            num_speakers = max(ground_truth_utt_2_spk_int) + 1
        else:
            color_labels = clustering_labels
            num_speakers = len(set(clustering_labels))

        colormap = plt.cm.gist_ncar  # alternatives: nipy_spectral, Set1, Paired
        colorst = colormap(np.linspace(0, 0.9, num_speakers))

        cs = [colorst[color_labels[k]] for k in range(len(clustering_labels))]

        plt.scatter(tsne_data[:, 0], tsne_data[:, 1], color=cs)
        # NOTE(review): the scatter call sets no label, so this legend has
        # no entries to show.
        plt.legend()

        print('Now showing tsne plot:')
        plt.show()

    if output_utt_2_spk is not None and output_utt_2_spk.lower(
    ) != 'none' and output_utt_2_spk.strip() != '':
        if len(min_cluster_sizes) > 1 or len(min_samples) > 1:
            print(
                'Not saving clustering result, since we searched a full range. Rerun with a single min_cluster_size and min_samples parameter.'
            )
        else:
            output_utt_2_spk = output_utt_2_spk.replace(
                '%minclustersize', str(min_cluster_size))
            output_utt_2_spk = output_utt_2_spk.replace(
                '%minsample', str(min_sample))
            output_utt_2_spk = output_utt_2_spk.replace('%set', fileset)
            featstr = ark_file.split('/')[-3]
            featstr = featstr.replace(
                'featinput_unnormalized.feats.ark_dot_combine_tied_embs',
                'std_end_conf').replace('feats_', '')
            print('featstr:', featstr)
            output_utt_2_spk = output_utt_2_spk.replace('%feat', featstr)
            output_utt_2_spk += ('_l2norm' if normalize else '')
            print('Saving result to:', output_utt_2_spk)
            with open(output_utt_2_spk, 'w') as output_utt_2_spk_out:
                for utt, label in zip(uttids, clustering_labels2):
                    # Negative (outlier) labels get their '-' replaced by
                    # 'o' in the written speaker name.
                    output_utt_2_spk_out.write(utt +
                                               (' spk%07d' %
                                                label).replace('-', 'o') +
                                               '\n')
Ejemplo n.º 10
0
def corr_phn(ark_file,
             alignment_dir,
             fileset='train',
             limit=5000,
             cmudict_sort=True):
    """Plot a phone-by-feature score matrix for features stored in an ark file.

    Features are read with ``kaldi_io.readArk``, the phone table from
    ``alignment_dir + 'phones.txt'`` and the alignments from
    ``alignment_dir + 'all.ctm'``. For every phone id a score row is computed
    with ``get_phone_mutual_info`` (presumably the mutual information between
    each feature dimension and the phone -- confirm against that helper) and
    the stacked rows are displayed with ``matshow``.

    Args:
        ark_file: Path of the feature ark; every ``%set`` is replaced by
            ``fileset``.
        alignment_dir: Directory prefix of ``phones.txt`` and ``all.ctm``. It
            is concatenated directly with the file names, so it has to end
            with a path separator.
        fileset: Name substituted for ``%set`` in ``ark_file``.
        limit: Passed as ``limit`` to ``kaldi_io.readArk`` and
            ``load_all_ctm_pos``.
        cmudict_sort: If true, phone ids follow the order
            ``cmudict_silence + cmudict_vowels + cmudict_consonants``;
            otherwise the phones found in ``phones.txt`` are sorted
            alphabetically.

    Raises:
        KeyError: If ``phones.txt`` contains a phone that is missing from the
            chosen phone ordering.
    """
    cmudict_phones = cmudict_silence + cmudict_vowels + cmudict_consonants

    print('len cmudict arpabet:', len(cmudict_phones))

    feats, uttids = kaldi_io.readArk(ark_file.replace('%set', fileset),
                                     limit=limit)

    phones_txt = alignment_dir + 'phones.txt'
    num2phone_orig = load_phones_txt(phones_txt)

    print(num2phone_orig)

    phoneset = list(set(num2phone_orig.values()))

    # Diagnostics only: phones that are not vowels, and vowels that the
    # alignment phone table does not contain.
    not_vowels = [elem for elem in phoneset if elem not in cmudict_vowels]

    for elem in cmudict_vowels:
        if elem not in phoneset:
            print('vowel NA:', elem)

    print(not_vowels)

    print('Loaded num2phone_orig for: ', len(set(num2phone_orig.values())),
          'phones')

    # Choose the phone ordering that defines the row order of the plot.
    if cmudict_sort:
        ordered_phones = cmudict_phones
    else:
        ordered_phones = sorted(set(num2phone_orig.values()))

    num2phone = dict(enumerate(ordered_phones))
    phone2num = {phone: num for num, phone in enumerate(ordered_phones)}

    # Map the ids used in phones.txt onto the ids of the chosen ordering.
    num2phoneid = {i: phone2num[x] for i, x in num2phone_orig.items()}
    print(num2phoneid)

    ctm_file = alignment_dir + 'all.ctm'

    all_ctm_db = load_all_ctm_pos(ctm_file, num2phoneid, limit=limit)

    feat_dim = len(feats[0][0])

    print(feat_dim)

    # Hard-coded switches: one score row per phone (do_2d), computed in
    # worker processes (parallel).
    do_2d = True
    parallel = True

    phone_ids = sorted(num2phone.keys())

    if do_2d:
        if not parallel:
            phone_mutual_info_classifs = [
                get_phone_mutual_info(phone, feats, uttids, all_ctm_db)
                for phone in phone_ids
            ]
        else:
            get_phone_mutual_info_partial = partial(get_phone_mutual_info,
                                                    feats=feats,
                                                    uttids=uttids,
                                                    all_ctm_db=all_ctm_db)
            # The context manager shuts the worker processes down once the
            # blocking map() has returned; without it the pool leaks.
            with multiprocessing.Pool(28) as pool:
                phone_mutual_info_classifs = pool.map(
                    get_phone_mutual_info_partial, phone_ids)

        # Both paths yield one row per phone; stack them into a 2-D matrix so
        # the serial and the parallel path produce the same type.
        phone_mutual_info_classifs = np.vstack(phone_mutual_info_classifs)
    else:
        feat_vars = []
        phone_vars = []
        for feat, uttid in zip(feats, uttids):
            ctm = all_ctm_db[uttid]

            if len(ctm) > len(feat):
                print(uttid,
                      'warning, ctm is slightly longer than feature len:',
                      len(ctm), 'vs.', len(feat))
                ctm = ctm[:len(feat)]
            elif len(ctm) < len(feat):
                print(uttid,
                      'warning, ctm is slightly shorter than feature len:',
                      len(ctm), 'vs.', len(feat))

            # NOTE(review): a shorter ctm is only reported above and still
            # trips this assertion -- confirm whether feat should be cut too.
            assert (len(feat) == len(ctm))

            feat_vars += [feat]
            phone_vars += [ctm]

        feat_vars = np.vstack(feat_vars)
        phone_vars = np.hstack(phone_vars)

        print(feat_vars.shape)
        print(phone_vars.shape)

        phone_mutual_info_classifs = sklearn.feature_selection.mutual_info_classif(
            feat_vars, phone_vars)

        print(phone_mutual_info_classifs)

    print(plt.rcParams["figure.figsize"])
    fig = plt.figure(figsize=(11, 7))
    ax = fig.add_subplot(111)

    # Dampen the first row (phone id 0) before plotting.
    # NOTE(review): presumably so the first phone does not dominate the color
    # scale -- confirm.
    phone_mutual_info_classifs[0] *= 0.7

    ax.matshow(phone_mutual_info_classifs, aspect='auto')

    plt.yticks(phone_ids, [num2phone[x] for x in phone_ids])
    plt.show()
Ejemplo n.º 11
0
def tsne(ark_file,
         alignment_dir,
         fileset='train',
         limit=4000,
         subsample=1000,
         only_common_bigrams=True,
         cmudict_sort=True,
         normalize=True):
    """Show a 2-D t-SNE scatter plot of aligned feature vectors.

    Features are read with ``kaldi_io.readArk`` and the alignments from
    ``alignment_dir + 'all.ctm'``. For every utterance the feature rows at the
    aligned positions are collected together with their phone ids. Points are
    colored by class id: the rank of the phone n-gram among the ten most
    frequent ones when ``only_common_bigrams`` is set, the phone id otherwise.

    Args:
        ark_file: Path of the feature ark; every ``%set`` is replaced by
            ``fileset``.
        alignment_dir: Directory prefix of ``phones.txt`` and ``all.ctm``. It
            is concatenated directly with the file names, so it has to end
            with a path separator.
        fileset: Name substituted for ``%set`` in ``ark_file``.
        limit: Passed as ``limit`` to ``kaldi_io.readArk`` and
            ``load_all_ctm_pos``.
        subsample: Number of examples drawn at random (without replacement)
            before running t-SNE; ``-1`` disables subsampling. If fewer
            examples are available, all of them are used.
        only_common_bigrams: Keep only the examples whose n-gram (built with
            ``fivegrams``) is one of the ten most frequent n-grams.
        cmudict_sort: If true, phone ids follow the order
            ``cmudict_silence + cmudict_vowels + cmudict_consonants``;
            otherwise the phones found in ``phones.txt`` are sorted
            alphabetically.
        normalize: Divide every entry returned by ``readArk`` by its norm.
    """
    feats, uttids = kaldi_io.readArk(ark_file.replace('%set', fileset),
                                     limit=limit)

    if normalize:
        print('Normalize to unit length.')
        # NOTE(review): np.linalg.norm is called without an axis, so each
        # entry of feats is divided by one single norm. If entries are 2-D
        # (frames x dim) this scales a whole utterance, not each frame --
        # confirm that this is intended.
        feats = [feat / np.linalg.norm(feat) for feat in feats]

    phones_txt = alignment_dir + 'phones.txt'
    num2phone_orig = load_phones_txt(phones_txt)

    print(num2phone_orig)

    phoneset = list(set(num2phone_orig.values()))

    # Choose the phone ordering that defines the phone ids.
    if cmudict_sort:
        ordered_phones = cmudict_silence + cmudict_vowels + cmudict_consonants
    else:
        ordered_phones = sorted(set(num2phone_orig.values()))

    num2phone = dict(enumerate(ordered_phones))
    phone2num = {phone: num for num, phone in enumerate(ordered_phones)}

    # Map the ids used in phones.txt onto the ids of the chosen ordering.
    num2phoneid = {i: phone2num[x] for i, x in num2phone_orig.items()}
    print(num2phoneid)

    ctm_file = alignment_dir + 'all.ctm'

    all_ctm_db = load_all_ctm_pos(ctm_file, num2phoneid, limit=limit)

    phn_ngram_counts = Counter()

    ngram_func = fivegrams

    feat_vars = []
    phone_vars = []
    for feat, uttid in zip(feats, uttids):
        pos_array, phone_array = all_ctm_db[uttid]

        assert (len(pos_array) == len(phone_array))

        # Keep only the feature rows at the aligned positions.
        feat_vars += [feat[list(pos_array)]]
        phone_vars += [phone_array]

        phn_ngram_counts.update(ngram_func(phone_array))

    feat_vars = np.vstack(feat_vars)
    phone_vars = np.hstack(phone_vars)

    most_common_ngrams = []

    print('Most common phone ngrams:')
    for phone_ngram, count in phn_ngram_counts.most_common(10):
        print(phone_ngram, [num2phone[elem] for elem in phone_ngram], count)
        most_common_ngrams.append(phone_ngram)

    # Class id of an n-gram = its rank among the most common n-grams.
    most_common_bigrams_to_num = {
        ngram: rank for rank, ngram in enumerate(most_common_ngrams)
    }

    print('Loaded', len(phone_vars), 'phoneme examples')

    if only_common_bigrams:
        # Build the n-gram sequence once and reuse it for both the position
        # filter and the class labels.
        ngrams = list(ngram_func(phone_vars))
        select_pos = [
            i for i, elem in enumerate(ngrams)
            if elem in most_common_bigrams_to_num
        ]

        feat_vars = feat_vars[select_pos]
        phone_vars = np.asarray(
            [most_common_bigrams_to_num[ngrams[i]] for i in select_pos])

        print(phone_vars)
        print(feat_vars)

        assert (len(feat_vars) == len(phone_vars))

    if subsample != -1:
        # Sampling without replacement cannot draw more items than exist, so
        # clamp the request to the number of available examples.
        sample_size = min(subsample, len(feat_vars))
        feat_phone_vars_sampled_idx = np.random.choice(
            np.arange(len(feat_vars)), sample_size, replace=False)
        feat_vars = feat_vars[feat_phone_vars_sampled_idx]
        phone_vars = phone_vars[feat_phone_vars_sampled_idx]

        print('Subsampled to:', len(phone_vars), 'phoneme examples')

    model = TSNE(n_components=2, random_state=0, metric='cosine')
    tsne_data = model.fit_transform(feat_vars)

    # The color table is indexed by class id, so it has to cover every
    # possible id, not just the ids that survived filtering and subsampling.
    if only_common_bigrams:
        num_classes = len(most_common_ngrams)
    else:
        num_classes = len(phoneset)

    colormap = plt.cm.gist_ncar  #nipy_spectral, Set1,Paired
    colorst = colormap(np.linspace(0, 1.0, num_classes + 1))

    cs = [colorst[label] for label in phone_vars]

    # Poor man's legend: print every class id in its color.
    for i, color in enumerate(colorst):
        plt.text(30 + float(i) * 3, 30, str(i), color=color)

    plt.scatter(tsne_data[:, 0], tsne_data[:, 1], color=cs)

    plt.show()
Ejemplo n.º 12
0
def same_different_experiment(ark_file,
                              utt_2_spk,
                              half_index=-1,
                              normalize=False,
                              fileset='',
                              use_metric='cosine',
                              max_spks=-1,
                              random_seed=42):
    """Run a same/different speaker experiment on utterance mean vectors.

    Every utterance is reduced to the mean of its feature rows. Pairwise
    distances between these vectors are compared against the ground truth
    "same speaker" relation; the score distributions and the DET curve are
    plotted with pyannote and the equal error rate is printed and appended to
    ``samedifferent_results.csv``.

    Args:
        ark_file: Path of the feature ark; every ``%set`` is replaced by
            ``fileset``. Its third-to-last path component names the result.
        utt_2_spk: Path of the utterance-to-speaker file (``%set`` is
            replaced as well). Required: the ground truth is built from it.
        half_index: If not ``-1``, only the first ``half_index`` components of
            each mean vector are used.
        normalize: Scale each (possibly cut) vector to unit length.
        fileset: Name substituted for ``%set``; also part of the result key.
        use_metric: Metric name handed to ``pdist``.
        max_spks: If not ``-1``, restrict the experiment to a random subset
            of at most this many speakers.
        random_seed: Seed of the speaker selection.

    Raises:
        ValueError: If ``utt_2_spk`` is None, ``'none'`` or blank.
    """
    # Fail before the expensive feature loading: without speaker labels there
    # is no ground truth to compare the distances against.
    if (utt_2_spk is None or utt_2_spk.lower() == 'none'
            or utt_2_spk.strip() == ''):
        raise ValueError(
            'utt_2_spk is required to build the ground truth speaker labels.')

    results_file = 'samedifferent_results.csv'

    from pyannote.metrics.plot.binary_classification import plot_det_curve, plot_distributions

    print('Loading feats now:')

    feats, uttids = kaldi_io.readArk(ark_file.replace('%set', fileset))

    print('loaded: ' + str(len(feats)) + ' feats')
    print('feat[0] shape: ', feats[0].shape)

    print('Generating mean vector.')

    # One row per utterance: the mean over the first axis of its features.
    feats = np.vstack([feat.mean(0) for feat in feats])

    if half_index != -1:
        print('Cutting vectors at ', half_index,
              'and normalize to unit length' if normalize else '')
        feats = np.vstack([
            feat[:half_index] /
            (np.linalg.norm(feat[:half_index]) if normalize else 1.0)
            for feat in feats
        ])
    else:
        if normalize:
            print('Normalize to unit length.')
            feats = np.vstack([feat / np.linalg.norm(feat) for feat in feats])

    utt_2_spk = utils.loadUtt2Spk(utt_2_spk.replace('%set', fileset))

    if max_spks != -1:

        # Sort the speakers so that the seeded selection is reproducible.
        spk_set = sorted(set(utt_2_spk.values()))
        random.seed(random_seed)

        selected_spks = random.sample(spk_set, min(len(spk_set), max_spks))

        print('Selecting random subset of speakers with random seed',
              random_seed, ':', len(selected_spks), 'speakers')
        print(selected_spks)

        # Restrict the utt2spk mapping to the selected speakers; a set keeps
        # the per-utterance membership test constant time.
        selected_spk_set = set(selected_spks)
        utt_2_spk_new = {
            utt: spk for utt, spk in utt_2_spk.items()
            if spk in selected_spk_set
        }

        # Filter feats and uttids in lockstep.
        feats = [
            feat for feat, uttid in zip(feats, uttids)
            if uttid in utt_2_spk_new
        ]
        uttids = [uttid for uttid in uttids if uttid in utt_2_spk_new]

        print('Reduced feats to: ' + str(len(feats)) + ' feats')
        print('Reduced uttids to: ' + str(len(uttids)) + ' uttids')

        utt_2_spk = utt_2_spk_new
    else:
        print('Using all speakers:', len(set(utt_2_spk.values())))

    ground_truth_utt_2_spk = [utt_2_spk[utt_id] for utt_id in uttids]

    le = preprocessing.LabelEncoder()
    le.fit(ground_truth_utt_2_spk)

    ground_truth_utt_2_spk_int = le.transform(ground_truth_utt_2_spk)

    print("Ground truth speaker classes available:")

    print(ground_truth_utt_2_spk_int)

    print('Calculating', use_metric, 'distance matrix...')
    distances = pdist(feats, metric=use_metric)

    print('Calculating ground thruth distance matrix...')
    # Integer labels differ by at least 1 unless they are equal, so a
    # Chebyshev distance below 1 marks a same-speaker pair.
    y_true = pdist(np.asarray(ground_truth_utt_2_spk_int)[:, np.newaxis],
                   metric='chebyshev') < 1

    result_key = ark_file.split('/')[-3] + ('.' + fileset if fileset != '' else
                                            '') + '.' + use_metric
    prefix = 'plots/plot.' + ark_file.split('/')[-3] + '.' + use_metric + (
        '.' + fileset if fileset != '' else '') + '.seed_' + str(random_seed)

    plot_distributions(y_true,
                       distances,
                       prefix,
                       xlim=(0, 2),
                       ymax=3,
                       nbins=100)

    # Distances are negated because smaller distances mean "more similar".
    eer = plot_det_curve(y_true, -distances, prefix)

    print('EER = {eer:.2f}%'.format(eer=100 * eer))

    with open(results_file, 'a') as outfile:
        outfile.write(result_key + ' ' + '{eer:.2f}%'.format(eer=100 * eer) +
                      '\n')