Example #1
def train_and_test(self, dataset, batch_size):
    self.batch_size = batch_size
    x_train, y_train = dataset.get_training_data()
    num_examples = dataset.get_num_training_examples()
    dropped = 0
    for idx in range(num_examples):
        if idx % 100 == 0:
            print('Training on {} out of {} examples'.format(
                idx, num_examples))
        mfcc_vec_seq = x_train[idx]
        phoneme_idx = y_train[idx]
        # drop sequences that are too short to fit an n-state HMM
        if len(mfcc_vec_seq) < self.n_states:
            print('dropped')
            dropped += 1
        else:
            # find the HMM for this phoneme and fit it with EM
            hmm = self.hmms[phoneme_idx]
            # count phoneme occurrences
            if phoneme_idx not in self.phonem_freq:
                self.phonem_freq[phoneme_idx] = 0
            self.phonem_freq[phoneme_idx] += 1
            hmm.fit(mfcc_vec_seq)
    # normalize phoneme counts into relative frequencies
    phonem_num = sum(self.phonem_freq.values())
    for key in self.phonem_freq:
        self.phonem_freq[key] /= float(phonem_num)
        print(self.phonem_freq[key])
        print(key)
    print('Done training, dropped {} out of {}'.format(
        dropped, num_examples))
    self.test_on_random_training_batch(dataset, self.batch_size)
    self.test(dataset)
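
This method assumes the surrounding class has already built self.hmms (one model per phoneme index) and an empty self.phonem_freq dict. A minimal constructor sketch, assuming hmmlearn's GaussianHMM as the per-phoneme model; the class name and parameters here are hypothetical, not from the source:

import hmmlearn.hmm

class PhonemeHMMRecognizer:
    def __init__(self, num_phonemes, n_states=3):
        self.n_states = n_states
        # one Gaussian HMM per phoneme index (hypothetical setup)
        self.hmms = {p: hmmlearn.hmm.GaussianHMM(n_components=n_states)
                     for p in range(num_phonemes)}
        # filled by train_and_test with relative phoneme frequencies
        self.phonem_freq = {}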
Example #2
def train(dataset):
    # Get all vectors in the datasets
    all_vectors = np.concatenate(
        [np.concatenate(v, axis=0) for k, v in dataset.items()], axis=0)
    print("vectors", all_vectors.shape)
    # Run K-Means algorithm to get clusters
    kmeans = clustering(all_vectors)
    print("centers", kmeans.cluster_centers_.shape)

    models = {}
    for cname in CLASS_NAMES:
        # convert all vectors to the cluster index
        # dataset['one'] = [O^1, ... O^R]
        # O^r = (c1, c2, ... ct, ... cT)
        # O^r size T x 1
        dataset[cname] = list(
            [kmeans.predict(v).reshape(-1, 1) for v in dataset[cname]])

        # define the model
        hmm = hmm_model()
        if 'test' not in cname:
            X = np.concatenate(dataset[cname])
            lengths = list([len(x) for x in dataset[cname]])
            print("training class", cname)
            print(X.shape, lengths, len(lengths))
            hmm.fit(X, lengths=lengths)
            models[cname] = hmm

    print("Training done")
    return models
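
The clustering and hmm_model helpers are not shown on this page. A minimal sketch of what they might look like, assuming scikit-learn's KMeans for vector quantization and an hmmlearn discrete HMM over the cluster indices (hmmlearn < 0.3 called it MultinomialHMM; newer versions call the same model CategoricalHMM); the parameter defaults are illustrative only:

import hmmlearn.hmm
from sklearn.cluster import KMeans

def clustering(all_vectors, n_clusters=20):
    # quantize feature frames into n_clusters codewords (default is a guess)
    return KMeans(n_clusters=n_clusters, random_state=0).fit(all_vectors)

def hmm_model(n_components=6):
    # discrete HMM over cluster indices
    return hmmlearn.hmm.MultinomialHMM(
        n_components=n_components, random_state=0, n_iter=1000)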
Example #3
def train_hmm_and_keep_track_of_log_likelihood(hmm, obs, n_iter=1, **kwargs):
    # run EM one iteration at a time so the total log-likelihood
    # can be recorded after every step
    hmm.n_iter = 1
    hmm.fit(obs)
    loglikelihoods = []
    for n in range(n_iter):
        hmm.n_iter = 1
        hmm.init_params = ''  # keep the current parameters; do not re-initialize
        hmm.fit(obs)
        loglikelihoods.append(sum(hmm.score(x) for x in obs))
    return loglikelihoods
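
This snippet targets the old scikit-learn-era HMM API, where fit() took a list of sequences and score() one sequence at a time. Roughly the same idea against the current hmmlearn API, where fit() takes one concatenated array plus per-sequence lengths, might look like this sketch:

import numpy as np

def loglik_per_em_iteration(hmm, seqs, n_iter=10):
    X = np.concatenate(seqs)
    lengths = [len(s) for s in seqs]
    hmm.n_iter = 1
    hmm.fit(X, lengths=lengths)        # one EM step with a fresh init
    loglikelihoods = []
    for _ in range(n_iter):
        hmm.init_params = ''           # warm-start from the current parameters
        hmm.fit(X, lengths=lengths)    # one more EM step
        loglikelihoods.append(hmm.score(X, lengths=lengths))
    return loglikelihoods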
Example #4
def test_ala2():
    # creates a 4-state HMM on the ALA2 data. Nothing fancy, just makes
    # sure the code runs without erroring out
    trajectories = AlanineDipeptide().get_cached().trajectories
    topology = trajectories[0].topology

    indices = topology.select('symbol C or symbol O or symbol N')
    featurizer = SuperposeFeaturizer(indices, trajectories[0][0])

    sequences = featurizer.transform(trajectories)
    hmm = GaussianHMM(n_states=4, n_init=3)
    hmm.fit(sequences)

    assert len(hmm.timescales_) == 3
    assert np.any(hmm.timescales_ > 50)
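
The timescales_ attribute comes from msmbuilder's GaussianHMM. Implied relaxation timescales are conventionally derived from the eigenvalues of the transition matrix, which is why a 4-state model is expected to yield 3 of them (the stationary eigenvalue 1 is dropped). A sketch of the standard formula, not taken from the source:

import numpy as np

def implied_timescales(transmat):
    # eigenvalues of a stochastic matrix, sorted descending; the first is 1
    eigvals = np.sort(np.linalg.eigvals(transmat).real)[::-1]
    # n_states - 1 finite timescales remain after dropping the stationary one
    return -1.0 / np.log(eigvals[1:])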
Example #5
def test_ala2():
    # creates a 4-state HMM on the ALA2 data. Nothing fancy, just makes
    # sure the code runs without erroring out
    trajectories = AlanineDipeptide().get_cached().trajectories
    topology = trajectories[0].topology

    indices = topology.select('symbol C or symbol O or symbol N')
    featurizer = SuperposeFeaturizer(indices, trajectories[0][0])

    sequences = featurizer.transform(trajectories)
    hmm = GaussianHMM(n_states=4, n_init=3, random_state=rs)  # rs: RNG defined elsewhere in the test module
    hmm.fit(sequences)

    assert len(hmm.timescales_) == 3
    assert np.any(hmm.timescales_ > 50)
Example #6
def test_pickle():
    """Test pickling an HMM"""
    trajectories = AlanineDipeptide().get_cached().trajectories
    topology = trajectories[0].topology
    indices = topology.select('symbol C or symbol O or symbol N')
    featurizer = SuperposeFeaturizer(indices, trajectories[0][0])
    sequences = featurizer.transform(trajectories)
    hmm = GaussianHMM(n_states=4, n_init=3, random_state=rs)
    hmm.fit(sequences)
    logprob, hidden = hmm.predict(sequences)

    with tempfile.TemporaryFile() as savefile:
        pickle.dump(hmm, savefile)
        savefile.seek(0, 0)  # rewind to the start before unpickling
        hmm2 = pickle.load(savefile)

    logprob2, hidden2 = hmm2.predict(sequences)
    assert (logprob == logprob2)
Example #7
def train(dataset):
    # Get all vectors in the datasets
    all_vectors = np.concatenate(
        [np.concatenate(v, axis=0) for k, v in dataset.items()], axis=0)
    print("vectors", all_vectors.shape)
    # Run K-Means algorithm to get clusters
    kmeans = clustering(all_vectors)
    print("centers", kmeans.cluster_centers_.shape)

    models = {}
    for cname in CLASS_NAMES:
        # convert all vectors to the cluster index
        # dataset['one'] = [O^1, ... O^R]
        # O^r = (c1, c2, ... ct, ... cT)
        # O^r size T x 1
        dataset[cname] = list(
            [kmeans.predict(v).reshape(-1, 1) for v in dataset[cname]])

        if cname == "benh_nhan":
            hmm = hmm_model(N_COMPONENT_BN, START_PROB_BN, TRANSMAT_PRIOR_BN)
        elif cname == "cua":
            hmm = hmm_model(N_COMPONENT_CUA, START_PROB_CUA,
                            TRANSMAT_PRIOR_CUA)
        elif cname == "khong":
            hmm = hmm_model(N_COMPONENT_KHONG, START_PROB_KHONG,
                            TRANSMAT_PRIOR_KHONG)
        elif cname == "nguoi":
            hmm = hmm_model(N_COMPONENT_NGUOI, START_PROB_NGUOI,
                            TRANSMAT_PRIOR_NGUOI)
        if 'test' not in cname:
            X = np.concatenate(dataset[cname])
            lengths = list([len(x) for x in dataset[cname]])
            print("training class", cname)
            print(X.shape, lengths, len(lengths))
            hmm.fit(X, lengths=lengths)
            models[cname] = hmm

    print("Training done")
    return models
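
The N_COMPONENT_*, START_PROB_* and TRANSMAT_PRIOR_* constants are defined elsewhere in the source. A hedged sketch of one such triple, mirroring the left-to-right pattern used throughout this page; the values are hypothetical:

import numpy as np

N_COMPONENT_CUA = 6  # illustrative value, not from the source
START_PROB_CUA = np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0])
# banded prior: stay with 0.7, advance with 0.3, absorbing final state
TRANSMAT_PRIOR_CUA = np.diag(np.full(6, 0.7)) + np.diag(np.full(5, 0.3), k=1)
TRANSMAT_PRIOR_CUA[-1, -1] = 1.0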
Example #8
def train(data_train, state_num):
    models = {}
    dataset = data_train.copy()
    for cname in dataset.keys():
        n = state_num[cname]
        # left-to-right prior: always start in the first state
        startprob = np.zeros(n)
        startprob[0] = 1
        # identity prior favors self-transitions
        transmat = np.diag(np.full(n, 1))

        hmm = hmmlearn.hmm.MultinomialHMM(
            n_components=n,
            random_state=0,
            n_iter=1000,
            verbose=False,
            startprob_prior=startprob,
            transmat_prior=transmat,
        )

        X = np.concatenate(dataset[cname])
        lengths = list([len(x) for x in dataset[cname]])
        hmm.fit(X, lengths=lengths)
        models[cname] = hmm
    return models
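
A hypothetical toy call, assuming an hmmlearn version in which MultinomialHMM consumes integer-coded observation columns (as the other examples on this page do); the class names and data here are made up:

import numpy as np

rng = np.random.default_rng(0)
data_train = {
    "khong": [rng.integers(0, 10, size=(40, 1)) for _ in range(5)],
    "nguoi": [rng.integers(0, 10, size=(35, 1)) for _ in range(5)],
}
models = train(data_train, state_num={"khong": 3, "nguoi": 3})
print({cname: type(m).__name__ for cname, m in models.items()})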
Example #9
                              params='mctw',
                              init_params='mst')
    hmm.startprob_ = np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
    hmm.transmat_ = np.array([
        [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
    ])

    X = np.concatenate(dataset_train[cname])
    lengths = list([len(x) for x in dataset_train[cname]])
    hmm.fit(X, lengths=lengths)
    models[cname] = hmm
print("Training done")

with open("gmm_hmm.pkl", "wb") as file:
    pickle.dump(models, file)
print("Saved!")

print("Testing")

#class_names = ['khong', 'toi', 'trong', 'amtinh',"test_toi","test_trong","test_khong","test_amtinh"]
for true_cname in class_names:
    true_predict = 0
    for O in dataset_test[true_cname]:
        # [truncated in source: the head of an hmm = hmm_model(...) call is
        #  missing here; its transition-matrix prior ended in rows of
        #  0.1, ..., 0.1, 0.5]
        if cname[:4] != 'test':
            X = np.concatenate(dataset[cname])
            lengths = list([len(x) for x in dataset[cname]])
            print("training class", cname)
            print(X.shape, lengths, len(lengths))
            hmm.fit(X, lengths=lengths)
            models[cname] = hmm
    print("Training done")

    print("Testing")
    for true_cname in class_names:
        for O in dataset[true_cname]:
            score = {
                cname: model.score(O, [len(O)])
                for cname, model in models.items() if cname[:4] != 'test'
            }
            print(true_cname, score)
def train(evaluate=False):
    ''' If evaluate is True, test data will be chosen randomly from the
        training data set. However, back up your self-recorded test data
        before using this evaluation mode.
    '''
    global trained
    trained = True

    class1 = ["toi", "test_toi"]
    class2 = ["mot", "test_mot"]
    class3 = ["trong", "test_trong"]
    class4 = ["thoigian", "test_thoigian"]
    class5 = ["chungta", "test_chungta"]

    class_names = []
    class_names.extend(class1)
    class_names.extend(class2)
    class_names.extend(class3)
    class_names.extend(class4)
    class_names.extend(class5)

    dataset = {}

    if evaluate is True:
        for cname in class_names:
            if cname[:4] == "test":
                cname_ = cname.split("_")[1]
                data_dir_src = os.path.join("data", cname_)
                data_dir_dst = os.path.join("data", cname)
                samples = random.sample([x for x in os.listdir(data_dir_src) if os.path.isfile(os.path.join(data_dir_src, x))], 10)
                [copyfile(os.path.join(data_dir_src, x), os.path.join(data_dir_dst, x)) for x in samples]
    for cname in class_names:
        print(f"Load {cname} dataset")
        dataset[cname] = get_class_data(os.path.join("data", cname))
    all_vectors = np.concatenate([np.concatenate(v, axis=0) for k, v in dataset.items()], axis=0)
    kmeans = clustering(all_vectors)
    kmeans_model_filename = 'kmeans.joblib'
    with open(kmeans_model_filename, 'wb') as f_kmeans:
        joblib.dump(kmeans, f_kmeans)
    print("centers", kmeans.cluster_centers_.shape)


    config = {
        'toi': {'n_components':5},
        'mot': {'n_components': 5},
        'trong': {'n_components': 5},
        'thoigian': {'n_components': 8}, # 10
        'chungta': {'n_components': 8},
        'test_toi': {'n_components': 5},
        'test_mot': {'n_components': 5},
        'test_trong': {'n_components': 5},
        'test_thoigian': {'n_components': 8}, # 10
        'test_chungta': {'n_components': 8},
        'demo': {'n_components': 3},
    }
    models = {}  # one trained HMM per class
    for cname in class_names:
        dataset[cname] = list([kmeans.predict(v).reshape(-1,1) for v in dataset[cname]])
        n_components = config[cname]['n_components']
        start_prob = np.zeros(n_components)
        start_prob[0] = 1.0
        transmat = np.zeros((n_components, n_components))
        for i in range((n_components - 1)):
            transmat[i][i] = 0.7
            transmat[i][i + 1] = 0.3
        transmat[n_components - 1][n_components - 1] = 1.0

        hmm = hmmlearn.hmm.MultinomialHMM(
            n_components=n_components, random_state=0, n_iter=1000, verbose=True,
            transmat_prior=transmat,
            startprob_prior=start_prob,
            init_params='ste',
            params='ste'
        )
        # hmm.startprob_ = start_prob
        # hmm.transmat_ = transmat

        if cname[:4] != 'test':
            X = np.concatenate(dataset[cname])
            lengths = list([len(x) for x in dataset[cname]])
            print("training class", cname)
            hmm.fit(X, lengths=lengths)
            models[cname] = hmm
    print("Training done")
    model_filename = 'finalized_model.joblib'
    with open(model_filename, 'wb') as f_hmm:
        joblib.dump(models, f_hmm)

    if evaluate is True:
        print("Testing")
        for true_cname in class_names:           
            if true_cname != 'demo':
                count = 0
                for O in dataset[true_cname]:
                    score = {cname : model.score(O, [len(O)]) for cname, model in models.items() if cname[:4] != 'test' and cname != 'demo' }
                    predict = max(score, key=score.get)
                    if predict == true_cname or (true_cname[:4] == 'test' and predict == true_cname.split('_')[1]):
                        count += 1
                print(f"true: {count}/{len(dataset[true_cname])}")

        for cname in class_names:
            if cname[:4] == "test":
                data_dir_dst = os.path.join("data", cname)
                samples = os.listdir(data_dir_dst)
                [os.remove(os.path.join(data_dir_dst, x)) for x in samples]
    return models
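
Once kmeans.joblib and finalized_model.joblib exist, classifying a single utterance reduces to quantizing its feature frames and taking the argmax of the per-class scores. A sketch, assuming mfcc_frames is a (T, n_features) array produced the same way as in get_class_data (which is not shown here):

import joblib

def classify(mfcc_frames):
    kmeans = joblib.load('kmeans.joblib')
    models = joblib.load('finalized_model.joblib')
    O = kmeans.predict(mfcc_frames).reshape(-1, 1)
    scores = {cname: model.score(O, [len(O)])
              for cname, model in models.items()}
    return max(scores, key=scores.get)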
def main():
    n_states = {
        'khong': 11,
        'vietnam': 24,
        'nguoi': 11,
        'benhvien': 24,
        'trong': 11
    }
    #Load dataset
    class_names = [
        f for f in os.listdir('data') if os.path.isdir(os.path.join('data', f))
    ]
    dataset = {}
    for cname in class_names:
        print(f"-->Load {cname} dataset")
        dataset[cname] = get_class_data(cname)

    # Get all vectors in the datasets
    all_vectors = np.concatenate(
        [np.concatenate(v, axis=0) for k, v in dataset.items()], axis=0)
    print("vectors", all_vectors.shape)
    # Run K-Means algorithm to get clusters
    kmeans = clustering(all_vectors, n_clusters=35)

    #Train model
    models = {}
    class_vectors = dataset.copy()
    for cname in class_names:
        # convert all vectors to the cluster index
        # dataset['one'] = [O^1, ... O^R]
        # O^r = (c1, c2, ... ct, ... cT)
        # O^r size T x 1
        class_vectors[cname] = list(
            [kmeans.predict(v).reshape(-1, 1) for v in class_vectors[cname]])
        if cname[:4] != 'test':
            hmm = hmmlearn.hmm.MultinomialHMM(
                n_components=n_states[cname],
                random_state=2020,
                n_iter=1000,
                verbose=False,
                init_params='e',
                params='te',
            )
            hmm.startprob_ = np.array(make_pi(n_states[cname]))
            hmm.transmat_ = np.array(make_A(n_states[cname]))
            X = np.concatenate(class_vectors[cname])
            lengths = list([len(x) for x in class_vectors[cname]])
            print("training class", cname)
            print(X.shape, lengths, len(lengths))
            hmm.fit(X, lengths=lengths)
            models[cname] = hmm
    print("<--Training done-->\n")
    print("-----Testing-----")
    print("Test in Datatrain")
    for test_cname in class_names:
        cnt = 0
        if test_cname[:4] != "test":
            for O in class_vectors[test_cname]:
                score = {
                    cname: model.score(O, [len(O)])
                    for cname, model in models.items() if cname[:4] != 'test'
                }
                max_value = max(v for k, v in score.items())
                #print("Max: ", max_value)
                for k, v in score.items():
                    if v == max_value:
                        if k == test_cname:
                            cnt += 1
                #print(test_cname, score)
            print(f"{test_cname} -- Score: ",
                  cnt / len(class_vectors[test_cname]))
    print()
    print("Test in Datatest")
    for test_cname in class_names:
        cnt = 0
        if test_cname[:4] == "test":
            for O in class_vectors[test_cname]:
                score = {
                    cname: model.score(O, [len(O)])
                    for cname, model in models.items() if cname[:4] != 'test'
                }
                max_value = max(v for k, v in score.items())
                #print("Max: ", max_value)
                for k, v in score.items():
                    if v == max_value:
                        predict = k
                        if predict.strip() == test_cname[5:].strip():
                            cnt += 1
                #print(test_cname, score)
            print(f"{test_cname} -- Score: ",
                  cnt / len(class_vectors[test_cname]))

    #Extract models parameters
    with open("Models_parameters.txt", "w") as f:
        for cname, model in models.items():
            f.write(f"Model_name : {cname}\n")
            f.write("Startprob matrix:\n")
            f.write(" ".join(map(str, model.startprob_)))
            f.write("\nTransition Matrix\n")
            f.write(" ".join(map(str, model.transmat_)))
            f.write("\nEmissionProb Matrix\n")
            f.write(" ".join(map(str, model.emissionprob_)))
            f.write("\n\n")
    print("Extracted models to Models_parameters.txt successfully")
    #Save models
    if "models" not in os.listdir():
        os.mkdir("models")
    #Kmeans
    with open(os.path.join("models", "kmeans.pkl"), "wb") as f:
        pickle.dump(kmeans, f)
    print("Saved Kmeans model to 'models/kmeans.pkl' successfully")
    #HMM
    with open(os.path.join("models", "models.pkl"), "wb") as f:
        pickle.dump(models, f)
    print(f"Saved HMMs model to 'models/models.pkl' successfully")