Example #1
0
def main():
    """
    First ARG: list of training files
    Second ARG: save name for model
    """
    file1 = sys.argv[1]
    outname = sys.argv[2]
    file_list = [f[0:-1] for f in open(file1, 'r')]
    models, transitions, priors = calc_transmat(file_list)
    hmm = GaussianHMM(
        transitions.shape[0],
        "full",
        #startprob=priors,
        n_iter=500,
        transmat=transitions,
        init_params='mcs',
        params='mcs',
    )
    feats, _ = load_feats_labels(file_list)
    feat, lab = load_feats_labels(file_list)
    #hmm.means_ = np.transpose(models['mean'])
    #hmm.covars_ = models['sigma']
    print 'Fitting'

    start = timeit.default_timer()
    hmm.fit([np.transpose(feat)])
    stop = timeit.default_timer()
    print 'Training Time: ' + str(stop - start)

    features, labels = load_feats_labels(['audio.arff'])
    _, seq = hmm.decode(np.transpose(features))
    #print filter(lambda(x,y): x==y, zip(labels, map(int2label, seq)))
    print len(filter(lambda (x, y): x == y, zip(labels, map(int2label, seq))))
    pickle.dump(hmm, open(outname, "wb"))
    plt.imshow(transitions, interpolation='nearest')
    plt.show()
Example #2
0
def main():
    """
    First ARG: list of training files
    Second ARG: save name for model
    """
    file1 = sys.argv[1]
    outname = sys.argv[2]
    file_list = [f[0:-1] for f in open(file1,'r')]
    models, transitions, priors = calc_transmat(file_list)
    hmm = GaussianHMM(
        transitions.shape[0],
        "full",
        #startprob=priors,
        n_iter=500,
        transmat=transitions,
        init_params='mcs',
        params='mcs',
    )
    feats, _ = load_feats_labels(file_list)
    feat, lab = load_feats_labels(file_list)
    #hmm.means_ = np.transpose(models['mean'])
    #hmm.covars_ = models['sigma']
    print 'Fitting'

    start = timeit.default_timer()
    hmm.fit([np.transpose(feat)])
    stop = timeit.default_timer()
    print 'Training Time: ' + str(stop - start)

    features, labels = load_feats_labels(['audio.arff'])
    _, seq = hmm.decode(np.transpose(features))
    #print filter(lambda(x,y): x==y, zip(labels, map(int2label, seq)))
    print len(filter(lambda(x,y): x==y, zip(labels, map(int2label, seq))))
    pickle.dump(hmm, open(outname, "wb"))
    plt.imshow(transitions, interpolation='nearest')
    plt.show()
     # NOTE(review): fragment of a larger function — n_states, covs,
     # pre_states, feats_as_list and list_of_patient_file_paths are
     # defined in an enclosing scope that is not visible here.
     for i in range(n_states):
         print 'checking if initial covs are pos-definite'
         # Cholesky factorization raises LinAlgError for any covariance
         # matrix that is not positive-definite, so this call doubles as
         # a validation step; eigenvalues are printed for inspection.
         np.linalg.cholesky(covs[i])
         print np.linalg.eigvals(covs[i])
 # Build initial transition and start-probability matrices from the states.
 tmat, smat = get_tmat_and_smat(pre_states, end=False, start=False)
 print tmat, smat
 # init_params='mc' lets fit() re-estimate only means and covariances,
 # keeping the supplied startprob/transmat as fixed initialization.
 model = GaussianHMM(n_components=n_states, n_iter=n_iter, covariance_type=cov_type, startprob=smat, transmat=tmat, init_params='mc')
 model.means_ = means
 model.covars_ = covs
 # Accumulate log-likelihoods of every sequence under the untrained model
 # (viterbi decode, forward score, and MAP decode respectively).
 sum_inital_ll = 0.0
 sum_initial_score = 0.0
 sum_initial_map = 0.0
 remove_idx = []
 for idx, feat_from_list in enumerate(feats_as_list):
     # Sequences with fewer samples than states are skipped and flagged
     # for removal below.
     if np.shape(feat_from_list)[0] > n_states:
         initial_ll, initial_best_seq = model.decode(feat_from_list)
         initial_map, initial_best_sep_map = model.decode(feat_from_list, algorithm='map')
         sum_initial_score += model.score(feat_from_list)
         sum_inital_ll += initial_ll
         sum_initial_map += initial_map
     else:
         remove_idx.append(idx)
         print 'too few samples in file', list_of_patient_file_paths[idx], np.shape(feat_from_list)
 print 'initial viterbi log-likelihood,', sum_inital_ll
 print 'initial score log-likelihood,', sum_initial_score
 print 'initial map log-likelihood', sum_initial_map
 # Delete from highest index to lowest so that earlier deletions do not
 # shift the positions of indices still pending removal.
 remove_idx.sort()
 remove_idx.reverse()
 print 'removing...', remove_idx
 for r in remove_idx:
     del feats_as_list[r]
            # NOTE(review): fragment from inside a per-file loop — file_path,
            # incident_time, means, covs, model, tmat and smat come from an
            # enclosing scope that is not visible here.
            t, last_index = overlapped_samples(file_path, incident_reported_time=int(incident_time), overlap=5, window=10, with_end=2)
            if t is None:
                print file_path, 'is bad'
            else:
                # Re-seed the model with the current running parameter
                # estimates before fitting on this file's samples.
                model.means_ = means
                model.covars_ = covs
                print 'shape intial', np.shape(covs)
                '''
                best_seq = model.decode(t)
                print 'intial,', best_seq
                print 'final means', model.means_
                print 'initial trans', tmat
                print 'initial startprobs', smat, sum(smat)
                '''
                model.fit([t])
                best_seq = model.decode(t)
                print 'file', file_path
                print 'final,', best_seq
                #print 'final means', model.means_
                #print 'final trans', model.transmat_
                #print 'final startprob', model.startprob_

                # Only carry the fitted parameters forward to the next file
                # when they are numerically valid (fit can diverge to NaN).
                if np.isnan(model.means_).any() == False and np.isnan(model.covars_).any() == False:
                    means = model.means_
                    # Rebuild a per-component array of covariance diagonals
                    # from the fitted covariance matrices, one row at a time.
                    covs = np.array([np.diag(model.covars_[0])])
                    for i in range(1, model.n_components):
                        covs = np.vstack((covs, [np.diag(model.covars_[i])]))
                    print 'shape after', np.shape(covs)
                    tmat = model.transmat_
        # NOTE(review): fragment of a larger function — covars, means, data,
        # basepath, stateNum, numState, repInx and saveobject come from an
        # enclosing scope that is not visible here.
        # Replace zero covariances with a tiny floor so the Gaussian
        # emission densities stay well-defined.
        covars[covars==0] = 1e-5
        
        # init_params deliberately omits 'm' and 'c': means/covars are set
        # manually below and must not be overwritten by fit()'s init step.
        model = GaussianHMM(numState, covariance_type="tied", n_iter=1000, init_params='abdefghijklnopqrstuvwxyzABDEFGHIJKLNOPQRSTUVWXYZ')
        model.means_ = means
        model.covars_ = covars
        
        print("Fitting model...")
        sys.stdout.flush()
        model.fit(data)

        print("Decoding states...")
        sys.stdout.flush()
        # do a loop over everything and record in one long array
        states = np.array([])
        score = 0
        for i in range(0, len(data)):
            # decode() returns (log-likelihood, state sequence); keep the
            # state sequence and accumulate per-sequence scores separately.
            hidden_states = model.decode(data[i])
            states = np.append(states, hidden_states[1])
            score = score + model.score(data[i])

        print("Saving data...")
        sys.stdout.flush()

        # save the states and LLH
        np.savetxt("data/substates/%s%d/%d/rep_%d_states.txt" % (basepath,stateNum,numState,repInx), states, fmt="%d")
        with open("data/substates/%s%d/%d/rep_%d_LLH.txt" % (basepath,stateNum,numState,repInx), 'w') as f:
        	f.write(str(score))
	
        saveobject(model, "data/substates/%s%d/%d/rep_%d.pk" % (basepath,stateNum,numState,repInx))    
        
        
Example #6
0
# Emission means for 5 hidden states, expressed in log1p(count) space.
means = np.array([[0.0, 0.0], [np.log1p(args.coverage), 0.0],
                  [0.0, np.log1p(args.coverage)],
                  [np.log1p(args.coverage / 2),
                   np.log1p(args.coverage / 2)],
                  [np.log1p(args.coverage),
                   np.log1p(args.coverage)]])
cv = 1.0
# Per-state diagonal covariances; the near-zero 0.01 entries pin the
# dimensions expected to carry no signal.
covars = np.array([[0.01, 0.01], [cv, 0.01], [0.01, cv], [cv / 2, cv / 2],
                   [cv, cv]])
# State labels; presumably len(ref_samples) == 2 so there are exactly
# 5 labels matching the 5 states above — TODO confirm against caller.
hidden = ["private"] + ref_samples + ["heterozygous", "pseudohet"]

hmm = GaussianHMM(n_components=len(means), random_state=rs)
# NOTE(review): _set_means/_set_covars are private APIs of the old
# sklearn.hmm module; modern hmmlearn assigns means_/covars_ directly.
hmm._set_means(means)
hmm._set_covars(covars)

## filter sites; compute observation sequence as log(1+count)
keep = np.logical_and((counts.max(1) < args.X_max * args.coverage),
                      (counts.sum(1) > -1.0))
counts = counts[keep, :]
obs = np.log1p(counts)
# One start coordinate per interval, filtered in lockstep with counts.
starts = np.array([start for start, end in ivls]).reshape((len(ivls), 1))
starts = starts[keep, :]

## run hmm
# decode() returns (log-likelihood, viterbi state sequence).
states = hmm.decode(obs)

## print result to stdout
for i in range(0, counts.shape[0]):
    print starts[i, 0], obs[i, 0], obs[i, 1], hidden[states[1][i]]
    # NOTE(review): orphaned plotting fragment — ax, start and end are
    # defined in an enclosing scope that is not visible here.
    ax.set_xticks(range(start,end+1),minor=True)
    ax.legend()
    ax.grid(True,which='both')
    plt.show()
##############################################################################
# Run HMM
# Observation matrix: target series stacked with selected exogenous features.
X_hmm = np.column_stack((y_train,X_train[['hour_of_day','weather','day_of_week']]))
#X_hmm = np.column_stack((y_train,X_train[['hour_of_day','weather']]))
#X_hmm = y_train
from sklearn.hmm import GaussianHMM
n_clusters = 9
#n_clusters = 17
model = GaussianHMM(n_clusters,covariance_type='diag',n_iter=1000)
# fit() expects a list of observation sequences.
model.fit([X_hmm])
hidden_states = model.predict(X_hmm)
viterbi_states = model.decode(X_hmm)
# Successive reassignments — only the final, datetime-based x-axis is kept;
# the earlier two are leftovers from experimentation.
x_ax = np.asarray(range(len(X_hmm)))
x_ax = X_train['hour_of_day'] + X_train['day_of_week']*24
#x_ax = X_train['hour_of_day']
x_ax = np.asarray([item.to_datetime() for item in X_train.index])
def plot_HMM(n_clusters,hidden_states,x_ax,y_ax):
    #PLOT HIDDEN STATES
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for i in xrange(n_clusters):
        print i
        idx = (hidden_states==i)
        if i<7:
            ax.plot(x_ax[idx],y_ax[idx],'o',label='%dth state'%i)
        elif i<14:
            ax.plot(x_ax[idx],y_ax[idx],'x',label='%dth state'%i)
means = np.array([	[  0.0, 0.0 ],
					[ np.log1p(args.coverage), 0.0 ],
					[ 0.0, np.log1p(args.coverage) ],
					[ np.log1p(args.coverage/2), np.log1p(args.coverage/2) ],
					[ np.log1p(args.coverage), np.log1p(args.coverage) ] ])
cv = 1.0
covars = np.array([ [ 0.01, 0.01 ],
					[ cv, 0.01 ],
					[ 0.01, cv ],
					[ cv/2, cv/2 ],
					[ cv, cv ] ])
hidden = [ "private" ] + ref_samples + [ "heterozygous","pseudohet" ]

hmm = GaussianHMM(n_components = len(means), random_state = rs)
hmm._set_means(means)
hmm._set_covars(covars)

## filter sites; compute observation sequence as log(1+count)
keep = np.logical_and((counts.max(1) < args.X_max*args.coverage), (counts.sum(1) > -1.0))
counts = counts[ keep,: ]
obs = np.log1p(counts)
starts = np.array([ start for start,end in ivls ]).reshape( (len(ivls), 1) )
starts = starts[ keep,: ]

## run hmm
states =  hmm.decode(obs)

## print result to stdout
for i in range(0, counts.shape[0]):
	print starts[i,0], obs[i,0], obs[i,1], hidden[ states[1][i] ]