Example #1
  def test_fit_concatenated_and_predict_single_label(self):
    """Train and test model while training data has single label.

    Training data have already been concatenated.
    """
    model_args, training_args, inference_args = uisrnn.parse_arguments()
    model_args.rnn_depth = 1
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 16
    training_args.learning_rate = 0.01
    training_args.train_iteration = 50
    inference_args.test_iteration = 1

    # generate fake training data, assume already concatenated
    train_sequence = np.random.rand(1000, model_args.observation_dim)
    train_cluster_id = np.array(['A'] * 1000)

    model = uisrnn.UISRNN(model_args)

    # training
    model.fit(train_sequence, train_cluster_id, training_args)

    # testing, where data has less variation than training
    test_sequence = np.random.rand(10, model_args.observation_dim) / 10.0
    predicted_label = model.predict(test_sequence, inference_args)
    self.assertListEqual([0] * 10, predicted_label)
Example #2
  def test_fit_list_and_predict_single_label(self):
    """Train and test model while training data has single label.

    Training data are not concatenated.
    """
    model_args, training_args, inference_args = uisrnn.parse_arguments()
    model_args.enable_cuda = False
    model_args.rnn_depth = 1
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 16
    training_args.learning_rate = 0.01
    training_args.train_iteration = 50
    inference_args.test_iteration = 1

    # generate fake training data, as a list
    train_sequences = [
        np.random.rand(100, model_args.observation_dim),
        np.random.rand(200, model_args.observation_dim),
        np.random.rand(300, model_args.observation_dim)]
    train_cluster_ids = [
        np.array(['A'] * 100),
        np.array(['A'] * 200),
        np.array(['A'] * 300),]

    model = uisrnn.UISRNN(model_args)

    # training
    model.fit(train_sequences, train_cluster_ids, training_args)

    # testing, where data has less variation than training
    test_sequence = np.random.rand(10, model_args.observation_dim) / 10.0
    predicted_label = model.predict(test_sequence, inference_args)
    self.assertListEqual([0] * 10, predicted_label)
Example #3
def main():
    """The main function."""
    # Retrieve arguments
    model_args, training_args, \
        inference_args, data_args = uisrnn.parse_arguments()

    # Run experiment
    diarization_experiment(model_args, training_args, inference_args,
                           data_args)
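
`diarization_experiment` itself is not included in this example. Below is a minimal, hypothetical sketch of what such a function could look like, following the fit/predict pattern used in the other examples on this page; the `.npz` file names and array keys are placeholders, not taken from the original code.

import numpy as np
import uisrnn

def diarization_experiment(model_args, training_args, inference_args, data_args):
    """Hypothetical sketch: train a UIS-RNN on toy data, then predict and score."""
    # Placeholder data files; replace with your own observation sequences.
    train_data = np.load('./data/toy_training_data.npz', allow_pickle=True)
    test_data = np.load('./data/toy_testing_data.npz', allow_pickle=True)
    train_sequence = train_data['train_sequence']
    train_cluster_id = train_data['train_cluster_id']
    test_sequences = test_data['test_sequences'].tolist()
    test_cluster_ids = test_data['test_cluster_ids'].tolist()

    model = uisrnn.UISRNN(model_args)
    model.fit(train_sequence, train_cluster_id, training_args)

    errors = []
    for test_sequence, test_cluster_id in zip(test_sequences, test_cluster_ids):
        predicted_label = model.predict(test_sequence, inference_args)
        accuracy = uisrnn.compute_sequence_match_accuracy(
            predicted_label, test_cluster_id)
        errors.append(1.0 - accuracy)
    print('Average test error:', sum(errors) / len(errors))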
Example #4
  def test_save_and_load(self):
    """Save model and load it."""
    model_args, _, _ = uisrnn.parse_arguments()
    model_args.observation_dim = 16
    model_args.transition_bias = 0.5
    model_args.sigma2 = 0.05
    model = uisrnn.UISRNN(model_args)
    temp_file_path = tempfile.mktemp()
    model.save(temp_file_path)
    model.load(temp_file_path)
    self.assertEqual(0.5, model.transition_bias)
Example #5
def main():
    """The main function."""
    model_args, training_args, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    model_args.rnn_depth = 1
    model_args.rnn_hidden_size = 512
    training_args.enforce_cluster_id_uniqueness = True
    training_args.batch_size = 30
    training_args.learning_rate = 1e-4
    training_args.train_iteration = 10
    training_args.num_permutations = 20
    # training_args.grad_max_norm = 5.0
    training_args.learning_rate_half_life = 1000
    diarization_experiment(model_args, training_args, inference_args)
Example #6
def diarize(segments, sr=16000, win_len=400, hop_len=160, embedding_per_sec=1.0, overlap_rate=0.1):
    logger.debug("[Speaker diarization] Initializing models")
    # Initialize ghostvlad
    toolkits.initialize_GPU(Expando({"gpu": ""}))
    ghostvlad_model = model.vggvox_resnet2d_icassp(input_dim=(257, None, 1),
                                                   num_class=5994,
                                                   mode="eval",
                                                   args=Expando({"net": "resnet34s",
                                                                 "loss": "softmax",
                                                                 "vlad_cluster": 8,
                                                                 "ghost_cluster": 2,
                                                                 "bottleneck_dim": 512,
                                                                 "aggregation_mode": "gvlad"}))
    ghostvlad_model.load_weights("ghostvlad/pretrained/weights.h5", by_name=True)

    # Initialize uisrnn
    sys.argv = sys.argv[:1]
    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnn_model = uisrnn.UISRNN(model_args)
    uisrnn_model.load("uisrnn/pretrained/saved_model.uisrnn_benchmark")

    logger.debug("[Speaker diarization] Calculating utterance features")
    utterances_spec = prepare_ghostvlad_data(segments, sr, win_len, hop_len, embedding_per_sec, overlap_rate)
    feats = []
    for spec in utterances_spec:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = ghostvlad_model.predict(spec)
        feats += [v]
    feats = np.array(feats)[:, 0, :].astype(float)

    logger.debug("[Speaker diarization] Clustering utterance features")
    labels = uisrnn_model.predict(feats, inference_args)

    logger.debug("[Speaker diarization] Tagging segments with speakers")
    embedding_duration = (1/embedding_per_sec) * (1.0 - overlap_rate)
    labels_count = len(labels)
    current = 0
    for segment in segments:
        begin_index = math.floor(current/embedding_duration)
        current += segment.end-segment.begin
        end_index = math.ceil(current/embedding_duration)
        segment_labels = [labels[index] for index in range(begin_index, min(end_index, labels_count))]
        if len(segment_labels) > 0:
            segment.speaker = max(segment_labels, key=segment_labels.count)
        else:
            segment.speaker = 999
    return segments
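
The `segments` argument above is only required to expose `begin` and `end` attributes (in seconds) and to accept a `speaker` attribute; in the original pipeline the segments presumably also carry whatever audio `prepare_ghostvlad_data` needs. A small, purely illustrative usage sketch with a hypothetical `Segment` class:

from dataclasses import dataclass

@dataclass
class Segment:
    begin: float            # segment start time in seconds
    end: float              # segment end time in seconds
    speaker: int = None     # filled in by diarize()

segments = [Segment(0.0, 2.5), Segment(2.5, 6.0), Segment(6.0, 9.0)]
segments = diarize(segments, sr=16000, embedding_per_sec=1.0, overlap_rate=0.1)
for seg in segments:
    print(f"{seg.begin:.1f}-{seg.end:.1f}s -> speaker {seg.speaker}")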
Example #7
  def test_fit_with_wrong_dim(self):
    """Training data has wrong dimension."""
    model_args, training_args, _ = uisrnn.parse_arguments()
    model_args.rnn_depth = 1
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 16
    training_args.learning_rate = 0.01
    training_args.train_iteration = 5

    # generate fake data
    train_sequence = np.random.rand(1000, 18)
    train_cluster_id = np.array(['A'] * 1000)

    model = uisrnn.UISRNN(model_args)

    # training
    with self.assertRaises(ValueError):
      model.fit(train_sequence, train_cluster_id, training_args)
Example #8
  def test_predict_with_wrong_dim(self):
    """Testing data has wrong dimension."""
    model_args, training_args, inference_args = uisrnn.parse_arguments()
    model_args.enable_cuda = False
    model_args.rnn_depth = 1
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 16
    training_args.learning_rate = 0.01
    training_args.train_iteration = 50

    # generate fake data
    train_sequence = np.random.rand(1000, model_args.observation_dim)
    train_cluster_id = np.array(['A'] * 1000)

    model = uisrnn.UISRNN(model_args)

    # training
    model.fit(train_sequence, train_cluster_id, training_args)

    # testing
    test_sequence = np.random.rand(10, 18)
    with self.assertRaises(ValueError):
      model.predict(test_sequence, inference_args)
Example #9
# from matplotlib import cm
# from time import sleep, perf_counter as timer
# from umap import UMAP
# import matplotlib.pyplot as plt

sys.path.append("Resemblyzer")
from resemblyzer import preprocess_wav, VoiceEncoder, sampling_rate  # noqa

# %%
# Load file
wav = preprocess_wav("Resemblyzer/audio_data/X2zqiX6yL3I.mp3")

# %%
# Audio features
encoder = VoiceEncoder("cpu")
_, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=5)

# %%
# Load UIS-RNN model
sys.argv = ['dummy']
model_args, training_args, inference_args = uisrnn.parse_arguments()
model = uisrnn.UISRNN(model_args)
model.load('uis-rnn/saved_model.uisrnn')

# %%
# Testing
test_sequence = cont_embeds.astype(float)
predictions = model.predict(test_sequence, inference_args)

# %%
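The notebook stops at the raw `predictions`. A minimal sketch (not part of the original notebook) of turning the frame-level labels into time-stamped speaker turns, assuming each entry of `wav_splits` is a Python slice into `wav` sampled at `sampling_rate`, which is what Resemblyzer's `embed_utterance` returns with `return_partials=True`:

# Group consecutive frames with the same predicted speaker into turns.
turns = []
current_label = predictions[0]
turn_start = wav_splits[0].start / sampling_rate
for label, split in zip(predictions, wav_splits):
    if label != current_label:
        turns.append((current_label, turn_start, split.start / sampling_rate))
        current_label = label
        turn_start = split.start / sampling_rate
turns.append((current_label, turn_start, wav_splits[-1].stop / sampling_rate))

for speaker, start, end in turns:
    print(f"speaker {speaker}: {start:.1f}s -> {end:.1f}s")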
Example #10
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)
    while True:
        print("Start speaking")
        myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=2)
        print(type(myrecording))
        sd.wait()  # Wait until recording is finished
        print("Finished recording")
        write('wavs/output12.wav', fs, myrecording)
        initial_time = datetime.now()
        main(r'wavs/output12.wav', embedding_per_second=1.2, overlap_rate=0.4)
        print("time taken for execution is: " +
              str(datetime.now() - initial_time))
        num = int(input("Enter 1 to continue and 0 to exit: "))
        if num == 0:
            break
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)

    #specs1,interval1 = load_data(r'wavs/REC20190716140159.wav', embedding_per_second=1.2, overlap_rate=0.4)
    #mapTable1,keys1 =genMap(interval1)
    mapTable, keys = genMap(intervals)
    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]


# =============================================================================
#     for spec1 in specs1:
#         spec1 = np.expand_dims(np.expand_dims(spec1, 0), -1)
#         v = network_eval.predict(spec1)
#         feats += [v]
# =============================================================================
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    #print(len(feats),'00000000')
    #predicted_label = uisrnnModel.predict(feats, inference_args)

    #silhoutte score
    # =============================================================================
    #     sli=[]
    #     fromsel=[]
    #     li=[]
    #     knum=[]
    #     for i in range(10):
    #         li=[]
    #         range_n_clusters = list (range(2,5))
    #         for n_clusters in range_n_clusters:
    #             clusterer = KMeans(n_clusters=n_clusters)
    #             preds = clusterer.fit_predict(feats)
    #             centers = clusterer.cluster_centers_
    #
    #             score = silhouette_score (feats, preds, metric='euclidean')
    #             print ("For n_clusters = {}, silhouette score is {})".format(n_clusters, score))
    #             li.append([n_clusters,score,clusterer,centers])
    #     # =============================================================================
    #     #     print([float(str(i[1])[:4]) for i in li])
    #     #     kvalue=(max([float(str(i[1])[:4]) for i in li]))
    #     #     for i in range(len(li)):
    #     #         if kvalue==float(str(li[i][1])[:4]):
    #     #             true_k=li[i][0]
    #     #             break
    #     # =============================================================================
    #         maxi=li[0][1]
    #         for i in range(1,len(li)):
    #             if li[i][1]-maxi>=0.005:
    #                 maxi=li[i][1]
    #         for i in li:
    #             if i[1]==maxi:
    #                 true_k=i[0]
    #     # =============================================================================
    #     #     maxi=max([i[1] for i in li])
    #     #     for i in li:
    #     #         if i[1]==maxi:
    #     #             true_k=i[0]
    #     # =============================================================================
    #         fromsel.append(li[true_k-2])
    #         print(true_k)
    #         knum.append(true_k)
    #     kval=(max(set(knum), key=knum.count))
    #     print(kval)
    # =============================================================================

    clusterer = SpectralClusterer(min_clusters=2,
                                  max_clusters=100,
                                  p_percentile=0.95,
                                  gaussian_blur_sigma=1)
    predicted_label = clusterer.predict(feats)

    # =============================================================================
    #     clusters = KMeans(n_clusters=40, init='k-means++', max_iter=100, n_init=1, random_state = 0)
    #     clusters.fit(feats)
    #     tsne = TSNEVisualizer()
    #     tsne.fit(feats, ["c{}".format(c) for c in clusters.labels_])
    #     tsne.poof()
    # =============================================================================

    global no_speakers
    no_speakers = len(set(predicted_label))
    #print(predicted_label,'**************************')
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to original wav (includes silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to original wav (includes silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e
    file1 = open("myfile.txt", "w+")
    for spk, timeDicts in speakerSlice.items():
        file1.write("person" + "\n")
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)
            filestart = s + " -- "
            fileend = e + "\n"
            file1.write(filestart)
            file1.write(fileend)
    file1.close()
Example #13
    def test_four_clusters(self):
        """Four clusters on vertices of a square."""
        label_to_center = {
            'A': np.array([0.0, 0.0]),
            'B': np.array([0.0, 1.0]),
            'C': np.array([1.0, 0.0]),
            'D': np.array([1.0, 1.0]),
        }

        # generate training data
        train_cluster_id = ['A'] * 400 + ['B'] * 300 + ['C'] * 200 + ['D'] * 100
        random.shuffle(train_cluster_id)
        train_sequence = _generate_random_sequence(train_cluster_id,
                                                   label_to_center,
                                                   sigma=0.01)
        train_sequences = [
            train_sequence[:100, :], train_sequence[100:300, :],
            train_sequence[300:600, :], train_sequence[600:, :]
        ]
        train_cluster_ids = [
            train_cluster_id[:100], train_cluster_id[100:300],
            train_cluster_id[300:600], train_cluster_id[600:]
        ]

        # generate testing data
        test_cluster_id = ['A'] * 10 + ['B'] * 20 + ['C'] * 30 + ['D'] * 40
        random.shuffle(test_cluster_id)
        test_sequence = _generate_random_sequence(test_cluster_id,
                                                  label_to_center,
                                                  sigma=0.01)

        # construct model
        model_args, training_args, inference_args = uisrnn.parse_arguments()
        model_args.enable_cuda = True  #for prince
        model_args.rnn_depth = 2
        model_args.rnn_hidden_size = 8
        model_args.observation_dim = 2
        model_args.verbosity = 3
        training_args.learning_rate = 0.01
        training_args.train_iteration = 200
        training_args.enforce_cluster_id_uniqueness = False
        inference_args.test_iteration = 2

        model = uisrnn.UISRNN(model_args)
        verbose = True
        if verbose:
            print("Training prints")
            print('TYPES(seq, id):', type(train_sequences),
                  type(train_cluster_ids))
            print('num sequences:', len(train_sequences))
            print('emb shape:', np.shape(train_sequences[0]))
            print('label shape:', np.shape(train_cluster_ids[0]))
            print('*' * 10, '\n\n')
        # run training, and save the model
        model.fit(train_sequences, train_cluster_ids, training_args)
        temp_file_path = tempfile.mktemp()
        model.save(temp_file_path)

        # run testing
        predicted_label = model.predict(test_sequence, inference_args)

        if verbose:
            print("Prediction prints")
            print(type(predicted_label))
            #print(len(predicted_label))
            print('*' * 10, '\n\n')
        # run evaluation
        model.logger.print(
            3, 'Asserting the equivalence between'
            '\nGround truth: {}\nPredicted: {}'.format(test_cluster_id,
                                                       predicted_label))
        accuracy = uisrnn.compute_sequence_match_accuracy(
            predicted_label, test_cluster_id)
        self.assertEqual(1.0, accuracy)

        # load new model
        loaded_model = uisrnn.UISRNN(model_args)
        loaded_model.load(temp_file_path)

        # run testing with loaded model
        predicted_label = loaded_model.predict(test_sequence, inference_args)

        # run evaluation with loaded model
        model.logger.print(
            3, 'Asserting the equivalence between'
            '\nGround truth: {}\nPredicted: {}'.format(test_cluster_id,
                                                       predicted_label))
        accuracy = uisrnn.compute_sequence_match_accuracy(
            predicted_label, test_cluster_id)
        self.assertEqual(1.0, accuracy)

        # keep training from loaded model on a subset of training data
        transition_bias_1 = model.transition_bias
        training_args.learning_rate = 0.001
        training_args.train_iteration = 50
        model.fit(train_sequence[:100, :], train_cluster_id[:100],
                  training_args)
        transition_bias_2 = model.transition_bias
        self.assertNotAlmostEqual(transition_bias_1, transition_bias_2)
        model.logger.print(
            3, 'Asserting transition_bias changed from {} to {}'.format(
                transition_bias_1, transition_bias_2))

        # run evaluation
        model.logger.print(
            3, 'Asserting the equivalence between'
            '\nGround truth: {}\nPredicted: {}'.format(test_cluster_id,
                                                       predicted_label))
        accuracy = uisrnn.compute_sequence_match_accuracy(
            predicted_label, test_cluster_id)
        self.assertEqual(1.0, accuracy)
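
`uisrnn.compute_sequence_match_accuracy` used in these assertions is permutation-invariant: the predicted integer labels are matched to the ground-truth letters by the best one-to-one mapping, which is why integer predictions can score 1.0 against letter labels. A tiny illustrative check:

# ['A', 'A', 'B'] vs [0, 0, 1]: different label sets, but a perfect mapping exists.
print(uisrnn.compute_sequence_match_accuracy(['A', 'A', 'B'], [0, 0, 1]))  # 1.0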
Example #14
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):
    start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
    # gpu configuration
    session = toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to original wav (includes silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    end_time = time.perf_counter()
    print("Total Time: %.2f s." % (end_time - start_time))

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()
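
For reference, `time_spec_rate` above is the stride of one speaker embedding in milliseconds. A quick worked check with the settings used elsewhere in these examples (embedding_per_second=1.2, overlap_rate=0.4):

embedding_per_second = 1.2
overlap_rate = 0.4
time_spec_rate = 1000 * (1.0 / embedding_per_second) * (1.0 - overlap_rate)
print(time_spec_rate)  # 500.0 ms between consecutive speaker embeddings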
Example #15
    def test_four_clusters(self):
        """Four clusters on vertices of a square."""
        label_to_center = {
            'A': np.array([0.0, 0.0]),
            'B': np.array([0.0, 1.0]),
            'C': np.array([1.0, 0.0]),
            'D': np.array([1.0, 1.0]),
        }

        # generate training data
        train_cluster_id = ['A'] * 400 + ['B'] * 300 + ['C'] * 200 + ['D'] * 100
        random.shuffle(train_cluster_id)
        train_sequence = _generate_random_sequence(train_cluster_id,
                                                   label_to_center,
                                                   sigma=0.01)

        # generate testing data
        test_cluster_id = ['A'] * 10 + ['B'] * 20 + ['C'] * 30 + ['D'] * 40
        random.shuffle(test_cluster_id)
        test_sequence = _generate_random_sequence(test_cluster_id,
                                                  label_to_center,
                                                  sigma=0.01)

        # construct model
        model_args, training_args, inference_args = uisrnn.parse_arguments()
        model_args.rnn_depth = 2
        model_args.rnn_hidden_size = 8
        model_args.observation_dim = 2
        model_args.verbosity = 3
        training_args.learning_rate = 0.01
        training_args.learning_rate_half_life = 50
        training_args.train_iteration = 200
        inference_args.test_iteration = 2

        model = uisrnn.UISRNN(model_args)

        # run training, and save the model
        model.fit(train_sequence, train_cluster_id, training_args)
        temp_file_path = tempfile.mktemp()
        model.save(temp_file_path)

        # run testing
        predicted_label = model.predict(test_sequence, inference_args)

        # run evaluation
        model.logger.print(
            3, 'Asserting the equivalence between'
            '\nGround truth: {}\nPredicted: {}'.format(test_cluster_id,
                                                       predicted_label))
        accuracy = uisrnn.compute_sequence_match_accuracy(
            predicted_label, test_cluster_id)
        self.assertEqual(1.0, accuracy)

        # load new model
        loaded_model = uisrnn.UISRNN(model_args)
        loaded_model.load(temp_file_path)

        # run testing with loaded model
        predicted_label = loaded_model.predict(test_sequence, inference_args)

        # run evaluation with loaded model
        model.logger.print(
            3, 'Asserting the equivalence between'
            '\nGround truth: {}\nPredicted: {}'.format(test_cluster_id,
                                                       predicted_label))
        accuracy = uisrnn.compute_sequence_match_accuracy(
            predicted_label, test_cluster_id)
        self.assertEqual(1.0, accuracy)

        # keep training from loaded model on a subset of training data
        transition_bias_1 = model.transition_bias
        training_args.learning_rate = 0.001
        training_args.train_iteration = 50
        model.fit(train_sequence[:100, :], train_cluster_id[:100],
                  training_args)
        transition_bias_2 = model.transition_bias
        self.assertNotAlmostEqual(transition_bias_1, transition_bias_2)
        model.logger.print(
            3, 'Asserting transition_bias changed from {} to {}'.format(
                transition_bias_1, transition_bias_2))

        # run evaluation
        model.logger.print(
            3, 'Asserting the equivalence between'
            '\nGround truth: {}\nPredicted: {}'.format(test_cluster_id,
                                                       predicted_label))
        accuracy = uisrnn.compute_sequence_match_accuracy(
            predicted_label, test_cluster_id)
        self.assertEqual(1.0, accuracy)
Example #16
def main(wav_path, check, embedding_per_second=1.0, overlap_rate=0.5):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)
    if check != '':
        specs1, interval1 = load_data(check,
                                      embedding_per_second=1.2,
                                      overlap_rate=0.4)
        mapTable1, keys1 = genMap(interval1)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]
    featss = np.array(feats)[:, 0, :].astype(float)
    predicted_label = uisrnnModel.predict(featss, inference_args)
    total_speaker = len(set(predicted_label))
    global no_speakers
    print("predicted_label: %s" % predicted_label)
    no_speakers = len(set(predicted_label))
    print('total no of speakers', no_speakers)
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    if check != '':
        for spec1 in specs1:
            spec1 = np.expand_dims(np.expand_dims(spec1, 0), -1)
            v = network_eval.predict(spec1)
            feats += [v]
        featss = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
        print("=====================")
        print(feats)
        print(featss)
        print("=====================")
        predicted_label2 = uisrnnModel.predict(featss, inference_args)
        check_speaker = len(set(predicted_label2))
        print("predicted_label2: %s" % predicted_label2)
        print('same Speaker' if total_speaker ==
              check_speaker else 'not the same speaker')
        print('speaker detected as ' +
              str(predicted_label2[-1]) if total_speaker ==
              check_speaker else '')
        speakerSlice2 = arrangeResult(predicted_label2, time_spec_rate)
        print("=============speakerSlice2===============")
        for spk, timeDicts in speakerSlice2.items():  # map times back to original wav (includes silence)
            for tid, timeDict in enumerate(timeDicts):
                s = 0
                e = 0
                for i, key in enumerate(keys):
                    if (s != 0 and e != 0):
                        break
                    if (s == 0 and key > timeDict['start']):
                        offset = timeDict['start'] - keys[i - 1]
                        s = mapTable[keys[i - 1]] + offset
                    if (e == 0 and key > timeDict['stop']):
                        offset = timeDict['stop'] - keys[i - 1]
                        e = mapTable[keys[i - 1]] + offset

                speakerSlice2[spk][tid]['start'] = s
                speakerSlice2[spk][tid]['stop'] = e

        for spk, timeDicts in speakerSlice2.items():
            print('========= ' + str(spk) + ' =========')
            for timeDict in timeDicts:
                s = timeDict['start']
                e = timeDict['stop']
                s = fmtTime(s)  # change point moves to the center of the slice
                e = fmtTime(e)
                print(s + ' ==> ' + e)
        print("=============speakerSlice2===============")
        #print(predicted_label,'**************************')
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to original wav (includes silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)
Example #17
def main():
    """The main function."""
    model_args, training_args, inference_args = uisrnn.parse_arguments()
    diarization_experiment(model_args, training_args, inference_args)
Example #18
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to original wav (includes silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    # for spk,timeDicts in speakerSlice.items():
    #     print('========= ' + str(spk) + ' =========')
    #     for timeDict in timeDicts:
    #         s = timeDict['start']
    #         e = timeDict['stop']
    #         s = fmtTime(s)  # change point moves to the center of the slice
    #         e = fmtTime(e)
    #         print(s+' ==> '+e)
    # p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    # p.draw()
    # p.plot.show()
    speech_r = speech_reg.Recognizer()
    sound = AudioSegment.from_wav(wav_path)
    for spk in speakerSlice.keys():
        print('========= ' + str(spk) + ' =========')
        for item_dict in speakerSlice[spk]:
            audio_seg = sound[item_dict['start']:item_dict['stop']]
            s = item_dict['start']
            e = item_dict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)
            item_dict.update({'content': audio_seg})
            filename = 'speaker' + str(spk) + '-' + str(
                item_dict['start'] / 1000) + '-' + str(
                    item_dict['stop'] / 1000) + '.wav'
            audio_seg.export(filename, format="wav")
            audio = speech_reg.AudioFile(filename)
            # words=speech_reg.AudioData(audio_seg,sample_rate=fs,sample_width=2)
            with audio as source:
                words = speech_r.record(source)
                try:
                    res = speech_r.recognize_google(words)
                except speech_reg.UnknownValueError:
                    try:
                        res = speech_r.recognize_sphinx(words)
                    except speech_reg.UnknownValueError:
                        res = ''
                item_dict.update({'content': res})
            print(res)

    return speakerSlice
Example #19
def main(wav_path,
         embedding_per_second=1.0,
         n_classes=5994,
         overlap_rate=0.5,
         plot_results=True):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    print('intervals', intervals, len(intervals))
    print('mapTable', mapTable, len(mapTable))
    print('keys', keys, len(keys))
    # print('mapTable, keys', mapTable, keys)
    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        # print('v',v.shape)
        #print('feats', feats.shape)

        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)
    print(feats.shape)
    print(inference_args)
    print('predicted_label', predicted_label)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    print('time_spec_rate', time_spec_rate)
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)
    print('speakerSlice', speakerSlice)
    for spk, timeDicts in speakerSlice.items():  # map times back to original wav (includes silence)
        print(spk, timeDicts)
        for tid, timeDict in enumerate(timeDicts):
            print(tid, timeDict)
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    print('offset', offset)
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

                print('i,s,e')
                print(i, s, e, tid, spk)
            print('>>>>>', i, s, e, tid, spk)
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    speaker_assignments = []
    for spk, timeDicts in speakerSlice.items():
        speaker = str(spk)
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            start = timeDict['start']
            end = timeDict['stop']
            start = fmtTime(
                start)  # change point moves to the center of the slice
            end = fmtTime(end)
            print(start + ' ==> ' + end)
            speaker_assignments.append((start, end, speaker, wav_path))

    if plot_results:
        p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
        p.draw()
        p.plot.show()

    return feats, predicted_label, intervals, speaker_assignments, time_spec_rate
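
A short, hypothetical usage sketch for the values returned by this `main`; the wav path here is a placeholder:

feats, labels, intervals, assignments, stride_ms = main(
    'example.wav', embedding_per_second=1.0, overlap_rate=0.5, plot_results=False)
for start, end, speaker, path in assignments:
    print(f"{path}: speaker {speaker} from {start} to {end}")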
Example #20
def main(wav_path,
         embedding_per_second=1.0,
         overlap_rate=0.5,
         exportFile=None,
         expectedSpeakers=2):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to original wav (includes silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e
    n_speakers = len(speakerSlice)
    print('N-SPeakers:', n_speakers)
    global speaker_final
    speaker_final = [pdb.empty()] * n_speakers
    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            diarization_try(wav_path, s / 1000, e / 1000, spk)
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    # Find the Top n Speakers
    speaker_final.sort(key=lambda speaker: speaker.duration_seconds,
                       reverse=True)
    speaker_final = speaker_final[0:expectedSpeakers]

    # Export the Files
    iso_wav_path = wav_path.split(".")[0]
    itr = 0
    while itr < len(speaker_final):
        write_path = exportFile + "_speaker" + str(itr) + ".wav"
        speaker_final[itr].export(write_path, format="wav")
        itr += 1

    del speaker_final
Example #21
def main(wav_path,
         embedding_per_second=1.0,
         overlap_rate=0.5,
         retain_audio_clip=False):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to original wav (includes silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            get_transcript(str(spk), s, e)

    result = print_transcipt()
    try:
        # Open the transcript file once, rather than re-opening it on every iteration.
        with open(os.path.join(dir_name, 'FinalTranscript.txt'), 'a') as file:
            for item in result:
                start = fmtTime(item[1])
                end = fmtTime(item[2])
                transcription = f"{start} ==> {end}: [Speaker : {item[0]}] : {item[3]}"
                print(transcription)
                file.write(transcription + "\n")
    except Exception as exp:
        print(f"Failed in main() while writing to file with exception {exp}")

    if not retain_audio_clip:
        shutil.rmtree(dir_name)
    else:
        print(
            f'Audio files of transcriptions can be found in {dir_name} folder')

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()

    return result