import time

import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.neural_network import MLPClassifier


def run_cluster_nn(dataset,
                   cluster_method,
                   trgX,
                   trgY,
                   tstX,
                   tstY,
                   replace=False):
    n_cluster = best_cluster_count(dataset, cluster_method)
    if dataset == "chess":
        clf = MLPClassifier(hidden_layer_sizes=(50, 10), activation='relu')
    elif dataset == "fmnist":
        clf = MLPClassifier(hidden_layer_sizes=(50, ),
                            activation='relu',
                            tol=0.002)
    else:
        print("Error: dataset is ", dataset,
              " but must be either chess or fmnist")
        return
    if cluster_method == "km":
        cm = KMeans(n_clusters=n_cluster, random_state=0)
        cm.fit(trgX)
        trg_clusters = cm.transform(trgX)
        tst_clusters = cm.transform(tstX)
    elif cluster_method == "em":
        cm = GaussianMixture(n_components=n_cluster, random_state=0)
        cm.fit(trgX)
        trg_clusters = cm.predict_proba(trgX)
        tst_clusters = cm.predict_proba(tstX)
    else:
        print("Error: cluster_method is ", cluster_method,
              " but must be km or em")
        return

    # trg_clusters/tst_clusters hold distances to the cluster centers (k-means)
    # or cluster membership probabilities (EM)

    if replace:
        # Replace features
        trgX = trg_clusters
        tstX = tst_clusters
    else:
        # Append features
        trgX = np.concatenate((trgX, trg_clusters), axis=1)
        tstX = np.concatenate((tstX, tst_clusters), axis=1)

    start = time.time()
    clf.fit(trgX, trgY)
    end = time.time()
    elapsed = end - start
    train_score = clf.score(trgX, trgY)
    test_score = clf.score(tstX, tstY)
    print("For dataset: ", dataset, " got train_score: ", train_score,
          " and test_score: ", test_score, " in time", elapsed)
    if replace:
        tag = 'NN_' + cluster_method + '_replace'
    else:
        tag = 'NN_' + cluster_method
    write_best_results(tag, dataset, train_score, test_score, elapsed, clf)
    return
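A minimal, hypothetical driver for run_cluster_nn (the split arrays and load_chess_split are assumptions; best_cluster_count and write_best_results are the project helpers used above):

# Sketch only: trgX/tstX are assumed to be 2-D numpy feature arrays.
trgX, trgY, tstX, tstY = load_chess_split()                           # hypothetical data loader
run_cluster_nn("chess", "km", trgX, trgY, tstX, tstY)                 # append k-means distances
run_cluster_nn("chess", "em", trgX, trgY, tstX, tstY, replace=True)   # replace features with EM probabilities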
Example #2
test = pickle.load(open(TEST, 'rb'))
test_data = test['data']
X_train = np.array(x_label)
y_train = np.array(y_label)
#X_candidation=np.array(x_unlabel)
X_test = np.array(test_data)

clf = KMeans(n_clusters=10, random_state=0).fit(encoded_imgs_xlabel)
pred_class_kmeans = clf.predict(encoded_imgs_xunlabel)

clf = RandomForestClassifier(n_estimators=10).fit(encoded_imgs_xlabel, y_train)
pred_class_rf = clf.predict(encoded_imgs_xunlabel)

clf = KNeighborsClassifier(n_neighbors=3).fit(encoded_imgs_xlabel, y_train)
pred_class_knn = clf.predict(encoded_imgs_xunlabel)
knn_prob = clf.predict_proba(encoded_imgs_xunlabel)

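# The loop below keeps an unlabeled sample only when all three predictors agree.
# Note that it compares raw KMeans cluster indices with classifier labels, which
# only matches when cluster index i happens to correspond to class i.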
for i in xrange(45000):
    if pred_class_kmeans[i] == pred_class_rf[i] == pred_class_knn[i]:
        X_train = np.append(X_train, np.array([x_unlabel[i]]), axis=0)
        y_train = np.append(y_train, np.array([pred_class_kmeans[i]]), axis=0)

batch_size = 200
nb_classes = 10
nb_epoch = 1000
data_augmentation = True

if len(X_train) > 40000:
    batch_size = 300
elif len(X_train) > 30000:
    batch_size = 250
Example #3
            def dLdceta(S,Sinv,dlambdadceta,mu,mi,T,Prec,sigmafun,Noinst,Nonode,method_clus, diagonal): # Verified (x2)
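                # Clusters the sigmoid-squashed means, evaluates the gradient
                # dL/dceta only at the sample nearest each cluster center, and
                # propagates that value to all cluster members (evaluate) or
                # weights it by the soft memberships (evaluate2).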
                
                def find_nearest(array, value):
                    array = np.asarray(array)
                    idx = (np.abs(array - value)).argmin()
                    return idx,array[idx]
                
                def sigmoid(ceta): # Verified
                    Sigma = 1/(1 + np.exp(-ceta))
                    Sigma[Sigma>0.99999999] = 0.99999999
                    Sigma[Sigma<1e-10] = 1e-10
                    return Sigma
                
                def evaluate():
                    for ind in range(broj_klastera):
                        indeks, najblizi = find_nearest(mux,centri[ind])
                        indeks = np.unravel_index(indeks.astype(int),(Noinst,NodeNo))
                        i = indeks[0]
                        j = indeks[1]
                        dlambdadceta = Dlambdadceta(i, j, NodeNo, diagonal)
                        DLdceta[i,j] = -Trace(S[i,:,:],dlambdadceta)\
                            - 2*((T[i,:].T + mu[i,:].T.dot(Prec[i,:,:])).dot(S[i,:,:]).dot(dlambdadceta).dot(S[i,:,:])).dot(Sinv[i,:,:]).dot(mi[i,:]) +\
                            mi[i,:].T.dot(dlambdadceta).dot(mi[i,:]) + sigmafun[i,j]
                        DLdceta[predikcije==ind] = DLdceta[i,j]
                    return DLdceta
                
                def evaluate2(predikcije):
                    DLdcetax =np.zeros(broj_klastera)
                    for ind in range(broj_klastera):
                        indeks, najblizi = find_nearest(mux,centri[ind])
                        indeks = np.unravel_index(indeks.astype(int),(Noinst,NodeNo))
                        i = indeks[0]
                        j = indeks[1]
                        dlambdadceta = Dlambdadceta(i, j, NodeNo, diagonal)
                        DLdcetax[ind] = -Trace(S[i,:,:],dlambdadceta)\
                            - 2*((T[i,:].T + mu[i,:].T.dot(Prec[i,:,:])).dot(S[i,:,:]).dot(dlambdadceta).dot(S[i,:,:])).dot(Sinv[i,:,:]).dot(mi[i,:]) +\
                            mi[i,:].T.dot(dlambdadceta).dot(mi[i,:]) + sigmafun[i,j]
                    DLdceta = np.sum(DLdcetax*predikcije,1).reshape([Noinst,NodeNo])
                    return DLdceta                
                
                
                DLdceta = np.zeros([Noinst,NodeNo])
                mu[np.isnan(mu)] = np.random.rand(mu[np.isnan(mu)].shape[0])

                if method_clus == 'KMeans':
                    mux = mu.reshape([mu.size,1])
                    mux[mux==np.inf] = 1e10
                    mux[mux==-np.inf] = -1e10
                    mux = sigmoid(mux)
                    broj_klastera = clus_no
                    claster = KMeans(n_clusters = broj_klastera, random_state=0, n_init=1,tol = 1e-3)
                    claster.fit(mux)
                    centri = claster.cluster_centers_
                    predikcije = claster.predict(mux)
                    predikcije = predikcije.reshape([Noinst,NodeNo])
                    DLdceta = evaluate()
                    
                if method_clus == 'MiniBatchKMeans':
                    mux = mu.reshape([mu.size,1])
                    mux[mux==np.inf] = 1e10
                    mux[mux==-np.inf] = -1e10
                    mux = sigmoid(mux)
                    broj_klastera = clus_no
                    claster = MiniBatchKMeans(n_clusters = broj_klastera, random_state=0, batch_size = 100, n_init=1, tol = 1e-3)
                    claster.fit(mux)
                    centri = claster.cluster_centers_
                    predikcije = claster.predict(mux)
                    predikcije = predikcije.reshape([Noinst,NodeNo])
                    DLdceta = evaluate()
                
                if method_clus == 'MeanShift':
                    mux = mu.reshape([mu.size,1])
                    mux[mux==np.inf] = 1e10
                    mux[mux==-np.inf] = -1e10
                    mux = sigmoid(mux)
                    claster = MeanShift()
                    claster.fit(mux)
                    centri = claster.cluster_centers_
                    predikcije = claster.predict(mux)
                    predikcije = predikcije.reshape([Noinst,NodeNo])
                    DLdceta = evaluate()
                    
                if method_clus == 'GaussianMixture':
                    mux = mu.reshape([mu.size,1])
                    mux[mux==np.inf] = 1e10
                    mux[mux==-np.inf] = -1e10
                    mux = sigmoid(mux)
                    broj_klastera = clus_no
                    claster = GaussianMixture(n_components=broj_klastera, warm_start= True, random_state=0, init_params='random')
                    claster.fit(mux)
                    centri = claster.means_
                    predikcije = claster.predict(mux)
                    predikcije = predikcije.reshape([Noinst,NodeNo])
                    DLdceta = evaluate()
                
                if method_clus == 'GaussianMixtureProb':
                    mux = mu.reshape([mu.size,1])
                    mux[mux==np.inf] = 1e10
                    mux[mux==-np.inf] = -1e10
                    mux = sigmoid(mux)
                    broj_klastera = clus_no
                    claster = GaussianMixture(n_components=broj_klastera, warm_start= True, random_state=0)
                    claster.fit(mux)
                    centri = claster.means_
                    predikcije = claster.predict_proba(mux)
#                    predikcije = predikcije.reshape([Noinst,NodeNo])
                    DLdceta = evaluate2(predikcije)
                return -1*DLdceta

X_test = test_df.drop("is_click", axis=1)
Y_test = test_df["is_click"]

# Training and prediction
# KMeans has no predict_proba and its score() is not accuracy, so the ROC
# analysis below needs a classifier; LogisticRegression matches the variable
# names (logreg, acc_log) used in this snippet.
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log

# calculate the fpr and tpr for all thresholds of the classification
probs = logreg.predict_proba(X_train)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(Y_train, preds)
roc_auc = metrics.auc(fpr, tpr)

# method I: plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
def DeepInteract():
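    """Run 5-fold cross-validation over several models (KMeans on autoencoder
    features, AdaBoost, random forest, a DNN on raw concatenated features, and
    an SDA+DNN), printing acc/precision/sensitivity/specificity/MCC/AUC/AUPR
    for each fold and writing results to Excel workbooks and a text file."""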
    X, labels, labels2 = prepare_data(seperate=True)
    '''
    neg_tmp = [index for index,value in enumerate(labels) if value == 0]
    np.random.shuffle(neg_tmp)
    pos_tmp = [index for index,value in enumerate(labels) if value == 1]
    pos_X = X[pos_tmp]
    neg_X = X[neg_tmp[:len(pos_tmp)]]
    pos_labels = [labels[item] for item in pos_tmp]
    neg_labels = [labels[item] for item in neg_tmp[:len(pos_tmp)]]
    X_new = np.vstack((pos_X,neg_X))
    labels_new = pos_labels+neg_labels
    '''
    import pdb
    X_data1, X_data2 = transfer_array_format(X)  # X X_new
    print X_data1.shape, X_data2.shape
    y, encoder = preprocess_labels(labels)  # labels labels_new
    y2 = np.array(labels2)  # labels labels_new

    num = np.arange(len(y))
    np.random.shuffle(num)
    '''
    X_data1 = X_data1[num]
    X_data2 = X_data2[num]
    y = y[num]
    y2 = y2[num]
    '''
    num_cross_val = 5
    all_performance = []
    all_performance_rf = []
    all_performance_bef = []
    all_performance_DNN = []
    all_performance_SDADNN = []
    all_performance_blend = []
    all_labels = []
    all_prob = {}
    num_classifier = 3
    all_prob[0] = []
    all_prob[1] = []
    all_prob[2] = []
    all_prob[3] = []
    all_averrage = []
    for fold in range(num_cross_val):
        train1 = np.array(
            [x for i, x in enumerate(X_data1) if i % num_cross_val != fold])
        test1 = np.array(
            [x for i, x in enumerate(X_data1) if i % num_cross_val == fold])
        train2 = np.array(
            [x for i, x in enumerate(X_data2) if i % num_cross_val != fold])
        test2 = np.array(
            [x for i, x in enumerate(X_data2) if i % num_cross_val == fold])
        train_label = np.array(
            [x for i, x in enumerate(y) if i % num_cross_val != fold])
        test_label = np.array(
            [x for i, x in enumerate(y) if i % num_cross_val == fold])
        #pdb.set_trace()
        #train_label2 = np.array([x for i, x in enumerate(y2) if i % num_cross_val != fold])
        train_label2 = np.array(
            [x for i, x in enumerate(y2) if i % num_cross_val != fold])
        test_label2 = np.array(
            [x for i, x in enumerate(y2) if i % num_cross_val == fold])

        real_labels = []
        for val in test_label:
            if val[0] == 1:
                real_labels.append(0)
            else:
                real_labels.append(1)
        '''
	real_labels2 = []
        for val in test_label2:
            if val[0] == 1:
                real_labels2.append(0)
            else:
                real_labels2.append(1)
	'''
        train_label_new = []
        for val in train_label:
            if val[0] == 1:
                train_label_new.append(0)
            else:
                train_label_new.append(1)

        blend_train = np.zeros((
            train1.shape[0],
            num_classifier))  # Number of training data x Number of classifiers
        blend_test = np.zeros(
            (test1.shape[0],
             num_classifier))  # Number of testing data x Number of classifiers
        skf = list(StratifiedKFold(train_label_new, num_classifier))
        class_index = 0
        #prefilter_train, prefilter_test, prefilter_train_bef, prefilter_test_bef = autoencoder_two_subnetwork_fine_tuning(train1, train2, train_label, test1, test2, test_label)
        #prefilter_train_bef, prefilter_test_bef = autoencoder_two_subnetwork_fine_tuning(train1, train2, train_label, test1, test2, test_label)
        prefilter_train_bef, prefilter_test_bef = autoencoder_two_subnetwork_fine_tuning(
            X_data1, X_data2, train_label, test1, test2, test_label)
        #X_train1_tmp, X_test1_tmp, X_train2_tmp, X_test2_tmp, model = autoencoder_two_subnetwork_fine_tuning(train1, train2, train_label, test1, test2, test_label)
        #model = autoencoder_two_subnetwork_fine_tuning(train1, train2, train_label, test1, test2, test_label)
        #model = merge_seperate_network(train1, train2, train_label)
        #proba = model.predict_proba([test1, test2])[:1]

        real_labels = []
        for val in test_label:
            if val[0] == 1:
                real_labels.append(0)
            else:
                real_labels.append(1)

        all_labels = all_labels + real_labels
        all_data_labels = real_labels + train_label_new

        all_prefilter_data = np.vstack(
            (prefilter_test_bef, prefilter_train_bef))
        all_label2_data = np.vstack(
            (test_label2.reshape(test_label2.shape[0], 1),
             train_label2.reshape(train_label2.shape[0], 1)))
        #prefilter_train, new_scaler = preprocess_data(prefilter_train, stand =False)
        #prefilter_test, new_scaler = preprocess_data(prefilter_test, scaler = new_scaler, stand = False)
        true_data = np.hstack((train1[46529, :], train2[46529, :]))  # 61713
        #true_data = np.vstack((prefilter_train_bef[46529,:],prefilter_train_bef[64833,:])) # 61713
        #false_data = np.vstack((prefilter_train_bef[46528,:],prefilter_train_bef[64834,:]))
        false_data = np.hstack((train1[46528, :], train2[46529, :]))
        #pdb.set_trace()
        '''
        prefilter_train1 = xgb.DMatrix( prefilter_train, label=train_label_new)
        evallist  = [(prefilter_train1, 'train')]
        num_round = 10
        clf = xgb.train( plst, prefilter_train1, num_round, evallist )
        prefilter_test1 = xgb.DMatrix( prefilter_test)
        ae_y_pred_prob = clf.predict(prefilter_test1)
        '''
        '''
        tmp_aver = [0] * len(real_labels)
        print 'deep autoencoder'
        clf = RandomForestClassifier(n_estimators=50)
        clf.fit(prefilter_train_bef, train_label_new)
        ae_y_pred_prob = clf.predict_proba(prefilter_test_bef)[:,1]
        all_prob[class_index] = all_prob[class_index] + [val for val in ae_y_pred_prob]
        tmp_aver = [val1 + val2/3 for val1, val2 in zip(ae_y_pred_prob, tmp_aver)]
        proba = transfer_label_from_prob(ae_y_pred_prob)
        #pdb.set_trace()            
        acc, precision, sensitivity, specificity, MCC = calculate_performace(len(real_labels), proba,  real_labels)
	fpr, tpr, auc_thresholds = roc_curve(real_labels, ae_y_pred_prob)
	auc_score = auc(fpr, tpr)
	#scipy.io.savemat('deep',{'fpr':fpr,'tpr':tpr,'auc_score':auc_score})
	## AUPR score add 
        precision1, recall, pr_threshods = precision_recall_curve(real_labels, ae_y_pred_prob)
        aupr_score = auc(recall, precision1)
	#scipy.io.savemat('deep_aupr',{'recall':recall,'precision':precision1,'aupr_score':aupr_score})
        print acc, precision, sensitivity, specificity, MCC, auc_score, aupr_score
	all_performance.append([acc, precision, sensitivity, specificity, MCC, auc_score, aupr_score])
	'''
        print 'deep autoencoder without fine tuning'
        class_index = class_index + 1
        #clf = RandomForestClassifier(n_estimators=50)
        #pdb.set_trace()
        #clf = KMeans(n_clusters=2, random_state=0).fit(prefilter_train_bef)
        #clf = MiniBatchKMeans(n_clusters=2, init=np.vstack((false_data,true_data)),max_iter=1).fit(np.vstack((false_data,true_data)))
        #clf = KMeans(n_clusters=2, init=np.vstack((false_data,true_data)),max_iter=1).fit(np.vstack((false_data,true_data)))
        #clf.fit(prefilter_train_bef, train_label_new)
        #ae_y_pred_prob = clf.predict(prefilter_test_bef)#[:,1]
        pdb.set_trace()
        #prefilter_train_bef2 = np.hstack((all_prefilter_data,all_label2_data))
        prefilter_train_bef2 = np.hstack(
            (prefilter_train_bef, y2.reshape(y2.shape[0], 1)))
        prefilter_test_bef2 = np.hstack(
            (prefilter_test_bef, test_label2.reshape(
                (test_label2.shape[0], 1))))
        #ae_y_pred_prob = last_layer_autoencoder(prefilter_train_bef2,all_data_labels, activation = 'sigmoid', batch_size = 100, nb_epoch = 100, last_dim = 2)
        ae_y_pred_prob = last_layer_autoencoder(prefilter_train_bef2,
                                                all_data_labels,
                                                activation='sigmoid',
                                                batch_size=100,
                                                nb_epoch=100,
                                                last_dim=2)
        workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet('My')
        i_tmp = 0
        for line_i in range(ae_y_pred_prob.shape[0]):
            if round(ae_y_pred_prob[line_i, 1], 4) > 0.5:
                worksheet.write(i_tmp, 0, line_i)
                worksheet.write(i_tmp, 1, line_i / 104)
                worksheet.write(i_tmp, 2, round(ae_y_pred_prob[line_i, 1], 4))
                worksheet.write(i_tmp, 3, line_i % 104 + 1000)
                worksheet.write(i_tmp, 4, "Undirected")
                i_tmp = i_tmp + 1
        workbook.save('cluster_Workbook1.xls')

        workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet('My')
        i_tmp = 0
        for line_i in range(ae_y_pred_prob.shape[0]):
            if round(ae_y_pred_prob[line_i, 0], 4) > 0.5:
                worksheet.write(i_tmp, 0, line_i)
                worksheet.write(i_tmp, 1, line_i / 104)
                worksheet.write(i_tmp, 2, round(ae_y_pred_prob[line_i, 0], 4))
                worksheet.write(i_tmp, 3, line_i % 104 + 1000)
                worksheet.write(i_tmp, 4, "Undirected")
                i_tmp = i_tmp + 1
        workbook.save('cluster_Workbook2.xls')
        pdb.set_trace()
        clf = KMeans(n_clusters=2, random_state=0).fit(prefilter_train_bef2)
        #clf = KMeans(n_clusters=2, random_state=0).fit(all_prefilter_data)
        #ae_y_pred_prob = clf.predict(prefilter_train_bef2)#(prefilter_train_bef2)
        ae_y_pred_prob = clf.predict(prefilter_train_bef2)
        '''
	if ae_y_pred_prob[0][0] > ae_y_pred_prob[0][1]:
	    aha = 1
	else:
	    aha = 0
        '''
        #pdb.set_trace()
        proba = transfer_label_from_prob(ae_y_pred_prob)
        #pdb.set_trace()
        acc, precision, sensitivity, specificity, MCC = calculate_performace(
            len(all_data_labels), proba, all_data_labels)
        fpr, tpr, auc_thresholds = roc_curve(all_data_labels, ae_y_pred_prob)
        auc_score = auc(fpr, tpr)
        #scipy.io.savemat('deep_without',{'fpr':fpr,'tpr':tpr,'auc_score':auc_score})
        ## AUPR score add
        precision1, recall, pr_threshods = precision_recall_curve(
            all_data_labels, ae_y_pred_prob)
        aupr_score = auc(recall, precision1)
        #scipy.io.savemat('deep_without_aupr',{'recall':recall,'precision':precision1,'aupr_score':aupr_score})
        print acc, precision, sensitivity, specificity, MCC, auc_score, aupr_score
        all_performance_bef.append([
            acc, precision, sensitivity, specificity, MCC, auc_score,
            aupr_score
        ])

        print 'random forest using raw feature'
        class_index = class_index + 1
        prefilter_train = np.concatenate((train1, train2), axis=1)
        prefilter_test = np.concatenate((test1, test2), axis=1)

        #clf = RandomForestClassifier(n_estimators=50)
        clf = AdaBoostClassifier(n_estimators=50)
        #clf = DecisionTreeClassifier()
        clf.fit(prefilter_train_bef, train_label_new)
        ae_y_pred_prob = clf.predict_proba(prefilter_test_bef)[:, 1]
        all_prob[class_index] = all_prob[class_index] + [
            val for val in ae_y_pred_prob
        ]
        tmp_aver = [0] * len(real_labels)  # was initialized in the commented-out block above
        tmp_aver = [
            val1 + val2 / 3 for val1, val2 in zip(ae_y_pred_prob, tmp_aver)
        ]
        proba = transfer_label_from_prob(ae_y_pred_prob)

        acc, precision, sensitivity, specificity, MCC = calculate_performace(
            len(real_labels), proba, real_labels)
        fpr, tpr, auc_thresholds = roc_curve(real_labels, ae_y_pred_prob)
        auc_score = auc(fpr, tpr)
        scipy.io.savemat('raw', {
            'fpr': fpr,
            'tpr': tpr,
            'auc_score': auc_score
        })
        ## AUPR score add
        precision1, recall, pr_threshods = precision_recall_curve(
            real_labels, ae_y_pred_prob)
        aupr_score = auc(recall, precision1)
        #scipy.io.savemat('raw_aupr',{'recall':recall,'precision':precision1,'aupr_score':aupr_score})
        print acc, precision, sensitivity, specificity, MCC, auc_score, aupr_score
        all_performance_rf.append([
            acc, precision, sensitivity, specificity, MCC, auc_score,
            aupr_score
        ])
        ### Only RF
        clf = RandomForestClassifier(n_estimators=50)
        #clf = AdaBoostClassifier(n_estimators=50)
        #clf = DecisionTreeClassifier()
        clf.fit(prefilter_train_bef, train_label_new)
        ae_y_pred_prob = clf.predict_proba(prefilter_test_bef)[:, 1]
        #all_prob[class_index] = all_prob[class_index] + [val for val in ae_y_pred_prob]
        #tmp_aver = [val1 + val2/3 for val1, val2 in zip(ae_y_pred_prob, tmp_aver)]
        proba = transfer_label_from_prob(ae_y_pred_prob)

        acc, precision, sensitivity, specificity, MCC = calculate_performace(
            len(real_labels), proba, real_labels)
        fpr, tpr, auc_thresholds = roc_curve(real_labels, ae_y_pred_prob)
        auc_score = auc(fpr, tpr)
        #scipy.io.savemat('raw',{'fpr':fpr,'tpr':tpr,'auc_score':auc_score})
        ## AUPR score add
        precision1, recall, pr_threshods = precision_recall_curve(
            real_labels, ae_y_pred_prob)
        aupr_score = auc(recall, precision1)
        #scipy.io.savemat('raw_aupr',{'recall':recall,'precision':precision1,'aupr_score':aupr_score})
        print "RF :", acc, precision, sensitivity, specificity, MCC, auc_score, aupr_score

        ## DNN
        class_index = class_index + 1
        prefilter_train = np.concatenate((train1, train2), axis=1)
        prefilter_test = np.concatenate((test1, test2), axis=1)
        model_DNN = DNN()
        train_label_new_forDNN = np.array([[0, 1] if i == 1 else [1, 0]
                                           for i in train_label_new])
        model_DNN.fit(prefilter_train,
                      train_label_new_forDNN,
                      batch_size=200,
                      nb_epoch=20,
                      shuffle=True,
                      validation_split=0)
        proba = model_DNN.predict_classes(prefilter_test,
                                          batch_size=200,
                                          verbose=True)
        ae_y_pred_prob = model_DNN.predict_proba(prefilter_test,
                                                 batch_size=200,
                                                 verbose=True)
        acc, precision, sensitivity, specificity, MCC = calculate_performace(
            len(real_labels), proba, real_labels)
        fpr, tpr, auc_thresholds = roc_curve(real_labels, ae_y_pred_prob[:, 1])
        auc_score = auc(fpr, tpr)
        scipy.io.savemat('raw_DNN', {
            'fpr': fpr,
            'tpr': tpr,
            'auc_score': auc_score
        })
        ## AUPR score add
        precision1, recall, pr_threshods = precision_recall_curve(
            real_labels, ae_y_pred_prob[:, 1])
        aupr_score = auc(recall, precision1)
        print "RAW DNN:", acc, precision, sensitivity, specificity, MCC, auc_score, aupr_score
        all_performance_DNN.append([
            acc, precision, sensitivity, specificity, MCC, auc_score,
            aupr_score
        ])

        ## SDA + DNN
        class_index = class_index + 1
        model_DNN = DNN2()
        train_label_new_forDNN = np.array([[0, 1] if i == 1 else [1, 0]
                                           for i in train_label_new])
        model_DNN.fit(prefilter_train_bef,
                      train_label_new_forDNN,
                      batch_size=200,
                      nb_epoch=20,
                      shuffle=True,
                      validation_split=0)
        proba = model_DNN.predict_classes(prefilter_test_bef,
                                          batch_size=200,
                                          verbose=True)
        ae_y_pred_prob = model_DNN.predict_proba(prefilter_test_bef,
                                                 batch_size=200,
                                                 verbose=True)
        acc, precision, sensitivity, specificity, MCC = calculate_performace(
            len(real_labels), proba, real_labels)
        fpr, tpr, auc_thresholds = roc_curve(real_labels, ae_y_pred_prob[:, 1])
        auc_score = auc(fpr, tpr)
        scipy.io.savemat('SDA_DNN', {
            'fpr': fpr,
            'tpr': tpr,
            'auc_score': auc_score
        })
        ## AUPR score add
        precision1, recall, pr_threshods = precision_recall_curve(
            real_labels, ae_y_pred_prob[:, 1])
        aupr_score = auc(recall, precision1)
        print "SDADNN :", acc, precision, sensitivity, specificity, MCC, auc_score, aupr_score
        all_performance_SDADNN.append([
            acc, precision, sensitivity, specificity, MCC, auc_score,
            aupr_score
        ])
        pdb.set_trace()
    print 'mean performance of deep autoencoder'
    print np.mean(np.array(all_performance), axis=0)
    print '---' * 50
    print 'mean performance of deep autoencoder without fine tuning'
    print np.mean(np.array(all_performance_bef), axis=0)
    print '---' * 50
    print 'mean performance of ADA using raw feature'
    print np.mean(np.array(all_performance_rf), axis=0)
    print '---' * 50
    print 'mean performance of DNN using raw feature'
    print np.mean(np.array(all_performance_DNN), axis=0)
    print '---' * 50
    print 'mean performance of SDA DNN'
    print np.mean(np.array(all_performance_SDADNN), axis=0)
    #print 'mean performance of stacked ensembling'
    #print np.mean(np.array(all_performance_blend), axis=0)
    #print '---' * 50

    fileObject = open('resultListAUC_aupr_ADA5_inter2.txt', 'w')
    for i in all_performance:
        k = ' '.join([str(j) for j in i])
        fileObject.write(k + "\n")
    fileObject.write('\n')
    for i in all_performance_bef:
        k = ' '.join([str(j) for j in i])
        fileObject.write(k + "\n")
    fileObject.write('\n')
    for i in all_performance_rf:
        k = ' '.join([str(j) for j in i])
        fileObject.write(k + "\n")
    fileObject.write('\n')
    for i in all_performance_DNN:
        k = ' '.join([str(j) for j in i])
        fileObject.write(k + "\n")
    fileObject.write('\n')
    for i in all_performance_SDADNN:
        k = ' '.join([str(j) for j in i])
        fileObject.write(k + "\n")
    #for i in all_performance_blend:
    #    k=' '.join([str(j) for j in i])
    #    fileObject.write(k+"\n")

    fileObject.close()
def applyClustering(clusteringOptions, classifier, outputFolder):

  pca = 0
  kme = 0
  if classifier:
    print("reloading classifier")
    pca = classifier[0]
    model  = classifier[1]

  analyzeAllWellsAtTheSameTime   = clusteringOptions['analyzeAllWellsAtTheSameTime']
  pathToVideos                   = clusteringOptions['pathToVideos']
  nbCluster                      = clusteringOptions['nbCluster']
  if 'nbPcaComponents' in clusteringOptions:
    nbPcaComponents              = clusteringOptions['nbPcaComponents']
  else:
    nbPcaComponents              = 0
  nbFramesTakenIntoAccount       = clusteringOptions['nbFramesTakenIntoAccount']
  scaleGraphs                    = clusteringOptions['scaleGraphs']
  showFigures                    = clusteringOptions['showFigures']
  useFreqAmpAsym                 = clusteringOptions['useFreqAmpAsym']
  useAngles                      = clusteringOptions['useAngles']
  useAnglesSpeedHeadingDisp      = clusteringOptions['useAnglesSpeedHeadingDisp']
  useAnglesSpeedHeading          = clusteringOptions['useAnglesSpeedHeading']
  useAnglesSpeed                 = clusteringOptions['useAnglesSpeed']
  useAnglesHeading               = clusteringOptions['useAnglesHeading']
  useAnglesHeadingDisp           = clusteringOptions['useAnglesHeadingDisp']
  useFreqAmpAsymSpeedHeadingDisp = clusteringOptions['useFreqAmpAsymSpeedHeadingDisp']
  videoSaveFirstTenBouts         = clusteringOptions['videoSaveFirstTenBouts']
  nbVideosToSave                 = clusteringOptions['nbVideosToSave']
  resFolder                      = clusteringOptions['resFolder']
  nameOfFile                     = clusteringOptions['nameOfFile']
  globalParametersCalculations   = clusteringOptions['globalParametersCalculations']
  
  if 'modelUsedForClustering' in clusteringOptions:
    modelUsedForClustering = clusteringOptions['modelUsedForClustering']
  else:
    modelUsedForClustering = 'KMeans'

  instaTBF   = ['instaTBF'+str(i)  for i in range(1,nbFramesTakenIntoAccount+1)]
  instaAmp   = ['instaAmp'+str(i)  for i in range(1,nbFramesTakenIntoAccount+1)]
  instaAsym  = ['instaAsym'+str(i) for i in range(1,nbFramesTakenIntoAccount+1)]

  tailAngles = ['tailAngles'+str(i) for i in range(1,nbFramesTakenIntoAccount+1)]

  instaSpeed       = ['instaSpeed'       + str(i) for i in range(1,nbFramesTakenIntoAccount+1)]
  instaHeadingDiff = ['instaHeadingDiff' + str(i) for i in range(1,nbFramesTakenIntoAccount+1)]
  instaHorizDispl  = ['instaHorizDispl'  + str(i) for i in range(1,nbFramesTakenIntoAccount+1)]
  
  allInstas  = instaTBF + instaAmp + instaAsym

  allInstas2 = tailAngles + instaSpeed + instaHeadingDiff + instaHorizDispl

  xaxis    = [0, 30]
  freqAxis = [0, 0.5]
  ampAxis  = [0, 1.6]
  asymAxis = [0, 0.8]
  angAxis  = [0, 1.5]

  possibleColors = ['b', 'r', 'g', 'k']
  possibleColorsNames = ['blue', 'red', 'green', 'black']
  
  outputFolderResult = os.path.join(outputFolder, nameOfFile)
  
  if os.path.exists(outputFolderResult):
    shutil.rmtree(outputFolderResult)
  os.mkdir(outputFolderResult)

  infile = open(os.path.join(resFolder, nameOfFile),'rb')
  dfParam = pickle.load(infile)
  infile.close()

  nbConditions = len(np.unique(dfParam['Condition'].values))

  # Applying PCA
  if classifier == 0:
    print("creating pca object")
    if nbPcaComponents:
      pca = PCA(n_components = nbPcaComponents)
    else:
      pca = PCA()

  if useFreqAmpAsym:
    allInstaValues = dfParam[allInstas].values

  if useAngles:
    allInstaValues = dfParam[tailAngles].values

  if useAnglesSpeedHeadingDisp:
    allInstaValues = dfParam[tailAngles + instaSpeed + instaHeadingDiff + instaHorizDispl].values
    
  if useAnglesSpeedHeading:
    allInstaValues = dfParam[tailAngles + instaSpeed + instaHeadingDiff].values
    
  if useAnglesSpeed:
    allInstaValues = dfParam[tailAngles + instaSpeed].values

  if useAnglesHeading:
    allInstaValues = dfParam[tailAngles + instaHeadingDiff].values
    
  if useAnglesHeadingDisp:
    allInstaValues = dfParam[tailAngles + instaHeadingDiff + instaHorizDispl].values
    
  if useFreqAmpAsymSpeedHeadingDisp:
    allInstaValues = dfParam[allInstas + instaSpeed + instaHeadingDiff + instaHorizDispl].values

  if modelUsedForClustering == 'KMeans':
    scaler = StandardScaler()
    allInstaValues = scaler.fit_transform(allInstaValues)

  allInstaValuesLenBef = len(allInstaValues)
  dfParam = dfParam.drop([idx for idx, val in enumerate(~np.isnan(allInstaValues).any(axis=1)) if not(val)])
  allInstaValues = allInstaValues[~np.isnan(allInstaValues).any(axis=1)]
  allInstaValuesLenAft = len(allInstaValues)
  if allInstaValuesLenBef - allInstaValuesLenAft > 0:
    print(allInstaValuesLenBef - allInstaValuesLenAft, " bouts (out of ", allInstaValuesLenBef, " ) were deleted because they contained NaN values")
  else:
    print("all bouts were kept (no nan values)")

  if classifier == 0:
    print("creating pca transform and applying it on the data")
    pca_result = pca.fit_transform(allInstaValues)
  else:
    print("applying pca (reloaded)")
    pca_result = pca.transform(allInstaValues)
  
  ind = []
  for i in range(0,nbConditions):
    ind.append(dfParam.loc[(dfParam['Condition'] == i)].index.values)
    
  # KMeans clustering
  if classifier == 0:
    if modelUsedForClustering == 'KMeans':
      model = KMeans(n_clusters = nbCluster)
    elif modelUsedForClustering == 'GaussianMixture':
      model = GaussianMixture(n_components = nbCluster)
    else:
      model = KMeans(n_clusters = nbCluster)
    model.fit(pca_result)
    
  labels = model.predict(pca_result)
  if modelUsedForClustering == 'GaussianMixture':
    predictedProbas = model.predict_proba(pca_result)

  # Sorting labels
  nbLabels       = clusteringOptions['nbCluster']
  labels2        = np.zeros(len(labels))
  nbElemPerClass = np.zeros(nbLabels) 
  for i in range(0, nbLabels):
    nbElemPerClass[i] = labels.tolist().count(i)
  sortedIndices = (-nbElemPerClass).argsort()
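  # Relabel clusters in decreasing order of size, so that label 0 is the most
  # populated cluster, label 1 the next most populated, and so on.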
  for i in range(0, len(labels)):
    labels2[i] = np.where(sortedIndices==labels[i])[0][0]
  dfParam['classification'] = labels2
  
  if modelUsedForClustering == 'GaussianMixture':
    for j in range(0, nbLabels):
      probasClassJ = predictedProbas[:, sortedIndices[j]]
      dfParam['classProba' + str(j)] = probasClassJ

  # Calculating proportions of each condition in each class

  df2 = dfParam[['Condition','classification']]
  proportions = np.zeros((nbConditions, nbCluster))
  for idxCond, cond in enumerate(np.unique(dfParam['Condition'].values)):
    for classed in range(0, len(proportions[0])):
      proportions[idxCond, classed] = len(df2.loc[(df2['Condition'] == cond) & (df2['classification'] == classed)])

  for i in range(0, nbConditions):
    proportions[i, :] = proportions[i, :] / sum(proportions[i, :])
    
  outF = open(os.path.join(outputFolderResult, 'proportions.txt'), "w")
  labelX = ""
  for i in range(0, nbCluster):
    labelX = labelX + "Cluster " + str(i+1) + " : \n"
    for j, cond in enumerate(np.unique(dfParam['Condition'].values)):
      labelX = labelX + cond + ": " + str(round(proportions[j,i]*100*100)/100) + "%, "
      labelX = labelX + "\n"
    labelX = labelX + "\n"
  outF.write(labelX)
  outF.write("\n")
  outF.close()

  # Plotting each cluster one by one

  mostRepresentativeBout = np.zeros((nbConditions,nbCluster))

  # fig2 = matplotlib.pyplot.figure(figsize=(8.0, 5.0))

  fig, tabAx = plt.subplots(4, len(proportions[0]), figsize=(22.9, 8.8))

  for idxCond, cond in enumerate(np.unique(dfParam['Condition'].values)):
    for classed in range(0, len(proportions[0])):
      dfTemp = dfParam.loc[(dfParam['Condition'] == cond) & (dfParam['classification'] == classed)]
      instaTBFtab   = dfTemp[instaTBF]
      instaAmptab   = dfTemp[instaAmp]
      instaAsymtab  = dfTemp[instaAsym]
      tailAnglestab = dfTemp[tailAngles]
      color = possibleColors[idxCond]
      tabAx[0, classed].plot(instaTBFtab.median().values,  color, label=cond)
      tabAx[1, classed].plot(instaAmptab.median().values,  color)
      tabAx[2, classed].plot(instaAsymtab.median().values, color)
      tabAx[3, classed].plot(tailAnglestab.median().values, color)
      
      instaTBFmedian  = instaTBFtab.median().values
      instaAmpmedian  = instaAmptab.median().values
      instaAsymmedian = instaAsymtab.median().values
      
      dist = abs(instaTBFtab-instaTBFmedian).sum(axis=1)/abs(instaTBFmedian).sum() + abs(instaAmptab-instaAmpmedian).sum(axis=1)/abs(instaAmpmedian).sum() + abs(instaAsymtab-instaAsymmedian).sum(axis=1)/abs(instaAsymmedian).sum()
      if len(dist):
        idMinDist = dist.idxmin()
      else:
        idMinDist = -1
      mostRepresentativeBout[idxCond, classed] = idMinDist
  
  if scaleGraphs:
    for classed in range(0, len(proportions[0])):
      tabAx[0, classed].scatter(xaxis, freqAxis, None, 'w')
      tabAx[1, classed].scatter(xaxis, ampAxis, None, 'w')
      tabAx[2, classed].scatter(xaxis, asymAxis, None, 'w')
      tabAx[3, classed].scatter(xaxis, angAxis, None, 'w')
  tabAx[0, 0].legend()
  tabAx[0, 0].set_ylabel('Avg Insta Frequency')
  tabAx[1, 0].set_ylabel('Avg Insta Amplitude')
  tabAx[2, 0].set_ylabel('Avg Insta Asymmetry')
  tabAx[3, 0].set_ylabel('Avg Angle')
  for i in range(0, nbCluster):
    labelX = "Cluster " + str(i+1) + "\n"
    for j, condName in enumerate(np.unique(dfParam['Condition'].values)):
      labelX = labelX + "for " + condName + " :  " + str(round(proportions[j,i]*100*100)/100) + "%\n"
    tabAx[3, i].set_xlabel(labelX)
  plt.savefig(os.path.join(outputFolderResult, 'medianValuesUsedForClusteringForEachClusterAndCondition.png'))
  if showFigures:
    plt.show()

  # Plot most representative bout for each cluster
  fig, tabAx2 = plt.subplots(4, len(proportions[0]), figsize=(22.9, 8.8))
  for cond in range(0, len(proportions)):
    for classed in range(0, len(proportions[0])):
      idMinDist = mostRepresentativeBout[cond, classed]
      if idMinDist != -1 and not(np.isnan(idMinDist)):
        instaTBFtab   = dfParam.loc[idMinDist, instaTBF]
        instaAmptab   = dfParam.loc[idMinDist, instaAmp]
        instaAsymtab  = dfParam.loc[idMinDist, instaAsym]
        tailAnglestab = dfParam.loc[idMinDist, tailAngles]
        color = possibleColors[cond]
        tabAx2[0, classed].plot(instaTBFtab.values,  color)
        tabAx2[1, classed].plot(instaAmptab.values,  color)
        tabAx2[2, classed].plot(instaAsymtab.values, color)
        tabAx2[3, classed].plot(tailAnglestab.values, color)
  if scaleGraphs:
    for classed in range(0, len(proportions[0])):
      tabAx2[0, classed].scatter(xaxis, freqAxis, None, 'w')
      tabAx2[1, classed].scatter(xaxis, ampAxis, None, 'w')
      tabAx2[2, classed].scatter(xaxis, asymAxis, None, 'w')
      tabAx2[3, classed].scatter(xaxis, angAxis, None, 'w')
  tabAx2[0, 0].set_ylabel('Avg Insta Frequency')
  tabAx2[1, 0].set_ylabel('Avg Insta Amplitude')
  tabAx2[2, 0].set_ylabel('Avg Insta Asymmetry')
  tabAx2[3, 0].set_ylabel('Avg Angle')
  for i in range(0, nbCluster):
    labelX = "Most representative bout of cluster "+ str(i+1) + ":\n"
    for j, condName in enumerate(np.unique(dfParam['Condition'].values)):
      labelX = labelX + "for " + condName + " (in " + possibleColorsNames[j] + ")\n"
    tabAx2[3, i].set_xlabel(labelX)
  plt.savefig(os.path.join(outputFolderResult, 'mostRepresentativeBoutForEachClusterAndCondition.png'))
  if showFigures:
    plt.show()

  # Getting most representative sorted bouts
  sortedRepresentativeBouts = []
  for classed in range(0, len(proportions[0])):
    dfTemp = dfParam.loc[(dfParam['classification'] == classed)]
    instaTBFtab   = dfTemp[instaTBF]
    instaAmptab   = dfTemp[instaAmp]
    instaAsymtab  = dfTemp[instaAsym]
    tailAnglestab = dfTemp[tailAngles]
    
    instaTBFmedian  = instaTBFtab.median().values
    instaAmpmedian  = instaAmptab.median().values
    instaAsymmedian = instaAsymtab.median().values

    dist = abs(instaTBFtab-instaTBFmedian).sum(axis=1)/abs(instaTBFmedian).sum() + abs(instaAmptab-instaAmpmedian).sum(axis=1)/abs(instaAmpmedian).sum() + abs(instaAsymtab-instaAsymmedian).sum(axis=1)/abs(instaAsymmedian).sum()
    
    sortedRepresentativeBouts.append(dfParam.loc[dist.index.values[dist.values.argsort()], tailAngles])

  # Plot most representative bouts
  nbOfMostRepresentativeBoutsToPlot = 10000000000000
  for classed in range(0, len(proportions[0])):
    nb = len(sortedRepresentativeBouts[classed].index)
    if nb < nbOfMostRepresentativeBoutsToPlot:
      nbOfMostRepresentativeBoutsToPlot = nb
  if nbOfMostRepresentativeBoutsToPlot > 100:
    nbOfMostRepresentativeBoutsToPlot = 100
  fig, tabAx3 = plt.subplots(len(proportions[0]),1, figsize=(22.9, 8.8))
  for classed in range(0, len(proportions[0])):
    indices = sortedRepresentativeBouts[classed].index
    for j in range(0, nbOfMostRepresentativeBoutsToPlot):
      tailAnglestab = sortedRepresentativeBouts[classed].loc[indices[j]].values
      color = 'b'
      tabAx3[classed].plot(tailAnglestab, color)
  for i in range(0,len(proportions[0])):
    tabAx3[i].set_ylabel('Cluster '+str(i+1))
  tabAx3[len(proportions[0])-1].set_xlabel("Tail angle over time for the\n"+str(nbOfMostRepresentativeBoutsToPlot)+' most representative bouts for each cluster')
  plt.savefig(os.path.join(outputFolderResult, str(nbOfMostRepresentativeBoutsToPlot) + 'mostRepresentativeBoutsForEachCluster.png'))
  if showFigures:
    plt.show()

  # Plot most representative bouts - second plot
  fig, tabAx3 = plt.subplots(len(proportions[0]),1, figsize=(22.9, 8.8))
  for classed in range(0, len(proportions[0])):
    nbOfMostRepresentativeBoutsToPlot = 10000000000000
    nbOfMostRepresentativeBoutsToPlot = len(sortedRepresentativeBouts[classed].index)
    if nbOfMostRepresentativeBoutsToPlot > 100:
      nbOfMostRepresentativeBoutsToPlot = 100
    indices = sortedRepresentativeBouts[classed].index
    for j in range(0, nbOfMostRepresentativeBoutsToPlot):
      tailAnglestab = sortedRepresentativeBouts[classed].loc[indices[j]].values
      color = 'b'
      tabAx3[classed].plot(tailAnglestab, color)
      
  for i in range(0,len(proportions[0])):
    tabAx3[i].set_ylabel('Cluster '+str(i+1))
  tabAx3[len(proportions[0])-1].set_xlabel("Tail angle over time for the most representative bouts for each cluster")
  plt.savefig(os.path.join(outputFolderResult, 'mostRepresentativeBoutsForEachCluster.png'))
  if showFigures:
    plt.show()

  # Creating validation videos: Beginning, middle, and end. (10 movements each)
  if False:
    length = 150
    for boutCategory in range(0, nbCluster):
      print("boutCategory:",boutCategory)
      out = cv2.VideoWriter(os.path.join(outputFolderResult, 'cluster' + str(boutCategory) + '.avi'),cv2.VideoWriter_fourcc('M','J','P','G'), 10, (length,length))
      indices = sortedRepresentativeBouts[boutCategory].index
      print("total:",len(indices))
      r = [i for i in range(0, 10)] + [i for i in range(int(len(indices)/2)-10, int(len(indices)/2))] + [i for i in range(len(indices)-10, len(indices))]
      for num in r:
        print("num:",num)
        BoutStart = int(dfParam.loc[indices[num],'BoutStart'])
        BoutEnd   = int(dfParam.loc[indices[num],'BoutEnd'])
        Well_ID   = int(dfParam.loc[indices[num],'Well_ID']) - 1
        Trial_ID  = dfParam.loc[indices[num],'Trial_ID']
        out = outputValidationVideo(pathToVideos, Trial_ID, '.txt', Well_ID, 1, BoutStart, BoutEnd, out, length, analyzeAllWellsAtTheSameTime)
      out.release()
      
  # Creating validation videos: Beginning (10 movements each)
  if videoSaveFirstTenBouts:
    length = 150
    for boutCategory in range(0, nbCluster):
      print("boutCategory:",boutCategory+1)
      out = cv2.VideoWriter(os.path.join(outputFolderResult, 'cluster' + str(boutCategory+1) + '.avi'),cv2.VideoWriter_fourcc('M','J','P','G'), 10, (length,length))
      indices = sortedRepresentativeBouts[boutCategory].index
      nbTemp = len(indices)
      if nbTemp < nbVideosToSave:
        nbVideosToSave = nbTemp
      
      r = [i for i in range(0, nbVideosToSave)]
      for num in r:
        print("num:",num)
        BoutStart = int(dfParam.loc[indices[num],'BoutStart'])
        BoutEnd   = int(dfParam.loc[indices[num],'BoutEnd'])
        Well_ID   = int(dfParam.loc[indices[num],'Well_ID'])
        Trial_ID  = dfParam.loc[indices[num],'Trial_ID']
        out = outputValidationVideo(pathToVideos, Trial_ID, '.txt', Well_ID, 1, BoutStart, BoutEnd, out, length, analyzeAllWellsAtTheSameTime)
      out.release()
  
  # Looking into global parameters
  if globalParametersCalculations:
    globParam = ['BoutDuration','TotalDistance','Speed','NumberOfOscillations', 'meanTBF', 'maxAmplitude']
    fig, tabAx = plt.subplots(2, 3, figsize=(22.9, 8.8))
    for idx, parameter in enumerate(globParam):
      concatenatedValues = []
      for boutCategory in range(0, nbCluster):
        indices = sortedRepresentativeBouts[boutCategory].index
        values  = dfParam.loc[indices[:],parameter].values
        concatenatedValues.append(values)
      tabAx[int(idx/3), idx%3].set_title(parameter)
      tabAx[int(idx/3), idx%3].boxplot(concatenatedValues)
    plt.savefig(os.path.join(outputFolderResult, 'globalParametersforEachCluster.png'))
    if showFigures:
      plt.plot()
      plt.show()
  
  # Saves classifications
  dfParam[['Trial_ID','Well_ID','NumBout','classification']].to_csv(os.path.join(os.path.join(outputFolder, clusteringOptions['nameOfFile']), 'classifications.txt'))
  
  return [dfParam, [pca, model]]
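A minimal call sketch (the folder paths, file name, and option values below are assumptions; each key is read by applyClustering above, and the pickled file in resFolder must be a DataFrame with a 'Condition' column plus the instaTBF/instaAmp/instaAsym/tailAngles columns):

clusteringOptions = {
  'analyzeAllWellsAtTheSameTime': False,
  'pathToVideos': './videos',
  'nbCluster': 3,
  'nbFramesTakenIntoAccount': 28,
  'scaleGraphs': False,
  'showFigures': False,
  'useFreqAmpAsym': True,
  'useAngles': False,
  'useAnglesSpeedHeadingDisp': False,
  'useAnglesSpeedHeading': False,
  'useAnglesSpeed': False,
  'useAnglesHeading': False,
  'useAnglesHeadingDisp': False,
  'useFreqAmpAsymSpeedHeadingDisp': False,
  'videoSaveFirstTenBouts': False,
  'nbVideosToSave': 10,
  'resFolder': './results',
  'nameOfFile': 'bouts.pkl',
  'globalParametersCalculations': False,
  'modelUsedForClustering': 'KMeans',
}
# classifier=0 builds a new PCA + KMeans; pass the returned [pca, model] pair to reuse them.
dfParam, classifier = applyClustering(clusteringOptions, 0, './clusterOutput')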
Example #7
class ModalPolicy(object):
    """Clusters the input space and returns local policies.
    """

    def __init__(self, optimizer=None, reward_model=None, mode_classifier=KNeighborsClassifier,
                 mode_args=None):
        if reward_model is None:
            self.reward_model = GPRewardModel()
        else:
            self.reward_model = reward_model
        self.reward_model_fitted = False

        self.mode_classifier = mode_classifier
        if mode_args is None:
            self.mode_args = {'weights': 'distance'}
        else:
            self.mode_args = mode_args

        self.states = []
        self.actions = []
        self.rewards = []

        self.clusters = None
        self.clusters_init = False
        self.cluster_actions = []
        self.cluster_rewards = []
        self.active_clusters = []
        self.n_modes = 0

        self.sa_kde = KernelDensity()  # TODO

        if optimizer is None:
            self.optimizer = BFGSOptimizer(mode='max', num_restarts=3)
            self.optimizer.lower_bounds = -1
            self.optimizer.upper_bounds = 1  # TODO
        else:
            self.optimizer = optimizer

    def report_sample(self, s, a, r):
        x = np.hstack((s, a))
        # try:
        self.reward_model.report_sample(x, r)
        self.reward_model_fitted = True
        # except AttributeError:
            # self.reward_model_fitted = False

        self.states.append(s)
        self.actions.append(a)
        self.rewards.append(r)

    def get_action(self, s):
        s = np.asarray(s)
        if len(s.shape) < 2:
            s = s.reshape(1, -1)

        # TODO Support multiple queries?
        probs = self.clusters.predict_proba(s)
        ind = np.random.choice(self.active_clusters,
                               size=1,
                               p=np.atleast_1d(np.squeeze(probs)))
        a = [self.cluster_actions[i] for i in ind]

        return np.squeeze(a)

    def expected_reward(self, normalize=False):
        self.fit_reward_model()

        X = np.hstack((self.states, self.actions))
        r_pred, r_std = self.reward_model.predict(X, return_std=True)

        if normalize:
            logq = self.sa_kde.score_samples(X)
            logp = np.mean(logq)
            return importance_sample(x=r_pred, p_gen=logq, p_tar=logp,
                                     normalize=True, log_weight_lim=3)
        else:
            return np.mean(r_pred)

    def initialize_modes(self, n_modes, init_clusterer=None):
        if init_clusterer is None:
            self.clusters = KMeans(n_clusters=n_modes)
        else:
            self.clusters = init_clusterer

        self.n_modes = n_modes
        self.clusters.fit(X=self.states)
        self.cluster_actions = [None] * n_modes
        self.cluster_rewards = [None] * n_modes
        self.active_clusters = range(self.n_modes)
        self.optimize_mode_actions()

    def fit_reward_model(self):
        if self.reward_model_fitted:
            return

        # X = np.hstack((self.states, self.actions))
        # r = np.asarray(self.rewards).reshape(-1, 1)
        # self.reward_model.fit(X, r)
        # self.reward_model_fitted = True

    def optimize(self, n_iters, beta=1.0):
        for i in range(n_iters):
            self.optimize_mode_assignments()
            self.optimize_mode_actions(beta=beta)

    def optimize_mode_actions(self, beta=1.0):
        """Pick the best actions for the current mode assignments.
        """
        self.fit_reward_model()
        assignments = self.clusters.predict(X=self.states)
        present_modes = np.unique(assignments)
        probabilities = np.zeros((len(self.states), self.n_modes))

        try:
            probs = self.clusters.predict_proba(X=self.states)
            for i, k in zip(range(len(present_modes)), present_modes):
                probabilities[:, k] = probs[:, i]
        except AttributeError:
            for k in range(self.n_modes):
                probabilities[:, k] = assignments == k

        a_dim = len(self.actions[0])  # TODO HACK!
        states = np.asarray(self.states)

        for k in range(self.n_modes):

            print 'Optimizing action for mode %d...' % k

            # Start with the current mode action if exists, else init to zeros
            init_a = self.cluster_actions[k]
            if init_a is None:
                init_a = np.zeros(a_dim)

            potential_in = probabilities[:, k] > 0
            members = states[potential_in]
            member_probs = np.squeeze(probabilities[potential_in, k])

            if len(members) == 0:
                print 'Cluster %d empty!' % k
                self.cluster_actions[k] = np.random.uniform(-1, 1, a_dim)
                continue

            def obj(a):
                # TODO Settable criterion
                a = a.reshape(1, -1)
                A = np.tile(a, reps=(len(members), 1))
                X = np.hstack((members, A))
                r_pred, r_std = self.reward_model.predict(X, return_std=True)
                ucb = r_pred + beta * r_std
                return np.average(ucb, weights=member_probs)

            best_a, r_pred = self.optimizer.optimize(x_init=init_a, func=obj)
            self.cluster_actions[k] = best_a
            self.cluster_rewards[k] = r_pred

    def optimize_mode_assignments(self):
        """Pick the best assignments for states given the current actions
        for each mode.
        """

        # Predict rewards for all combinations of state, mode action
        N = len(self.states)
        s_dim = len(self.states[0])
        a_dim = len(self.actions[0])
        state_actions = np.empty((N * self.n_modes, s_dim + a_dim))
        state_actions[:, :s_dim] = np.repeat(self.states, self.n_modes, axis=0)
        state_actions[:, s_dim:] = np.tile(self.cluster_actions, reps=(N, 1))

        r_pred, r_std = self.reward_model.predict(state_actions,
                                                  return_std=True)
        # self.fit_reward_model()
        #r_pred = self.reward_model.predict(state_actions)

        r_pred = np.reshape(r_pred, (N, self.n_modes))

        new_assignments = np.argmax(r_pred, axis=1)

        self.active_clusters = np.unique(new_assignments)

        # NOTE Do we actually want to do this? 1 mode seems fine if state is irrelevant!
        # if len(self.active_clusters) == 1:
        # print 'Mode collapse! Re-initializing...'
        # self.initialize_modes(self.n_modes)
        # return

        # to_assign = []
        # for i in range(self.n_modes):
        #     if not np.any(new_assignments == i):
        #         print 'Randomly reassigning mode %d' % i
        #         to_assign.append(i)

        # ind = np.random.randint(0, len(new_assignments), size=len(to_assign))
        # for i, j in zip(ind, to_assign):
        #     new_assignments[i] = j

        # TODO Something with these new assignments?
        #self.clusters = SVC(decision_function_shape='ovr')
        #self.clusters.fit(X=self.states, y=new_assignments)

        if not self.clusters_init:
            self.clusters = self.mode_classifier(**self.mode_args)
            self.clusters_init = True
        self.clusters.fit(X=self.states,
                          y=new_assignments)
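A minimal usage sketch for ModalPolicy (GPRewardModel and BFGSOptimizer from the defaults above are assumed importable; the (state, action, reward) samples are hypothetical):

policy = ModalPolicy()                     # defaults: GPRewardModel + KNeighborsClassifier modes
for s, a, r in collected_transitions:      # hypothetical list of (state, action, reward) tuples
    policy.report_sample(s, a, r)
policy.initialize_modes(n_modes=3)         # k-means over the reported states
policy.optimize(n_iters=5, beta=1.0)       # alternate action / assignment optimization
action = policy.get_action(some_state)     # sample a local action for a new state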
Example #8
def get_cluster_masks(vectors, num_sources, binary_mask=True, algo=None):
    """
    Cluster the vectors using k-means with k=num_sources.  Use the cluster IDs
    to create num_sources T-F masks.

    Inputs:
        vectors: Numpy array of shape (Batch, Time, Frequency, Embedding).
                 Only the masks for the first batch are computed.
        num_sources: Integer number of sources to compute masks for
        binary_mask: If true, computes binary masks.  Otherwise computes the
                     soft masks.
        algo: sklearn-compatible clustering algorithm

    Returns:
         masks: Numpy array of shape (Time, Frequency, num_sources) containing
                the estimated binary mask for each of the num_sources sources.
    """

    if algo is None:
        algo = KMeans(n_clusters=num_sources, random_state=0)

    # Get the shape of the input
    shape = np.shape(vectors)

    # Preallocate mask array
    masks = np.zeros((shape[1] * shape[2], num_sources))

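    # For (Bayesian)GaussianMixture, reduce the embedding dimensionality with PCA
    # (to roughly a tenth of the embedding size) before fitting the mixture.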
    if algo.__class__.__name__ == 'BayesianGaussianMixture' or algo.__class__.__name__ == 'GaussianMixture':
        vectors = PCA(n_components=max(1, shape[3] // 10),
                      random_state=0).fit_transform(vectors[0].reshape(
                          (shape[1] * shape[2], shape[3])))

        algo.fit(vectors)

        # all_probs = algo.predict_proba(vectors[0].reshape((shape[1]*shape[2], shape[3])))
        all_probs = algo.predict_proba(vectors)

        if binary_mask:
            for i in range(all_probs.shape[0]):
                probs = all_probs[i]
                label = np.argmax(probs)
                masks[i, label] = 1
        else:
            for i in range(all_probs.shape[0]):
                probs = all_probs[i]
                masks[i, :] = probs / probs.sum()

        masks = masks.reshape((shape[1], shape[2], num_sources))

    else:
        # Do clustering
        algo.fit(vectors[0].reshape((shape[1] * shape[2], shape[3])))

        if binary_mask:
            # Use cluster IDs to construct masks
            labels = algo.labels_
            for i in range(labels.shape[0]):
                label = labels[i]
                masks[i, label] = 1

            masks = masks.reshape((shape[1], shape[2], num_sources))

        else:
            if algo.__class__.__name__ == 'KMeans':
                all_dists = algo.transform(vectors[0].reshape(
                    (shape[1] * shape[2], shape[3])))
                for i in range(all_dists.shape[0]):
                    dists = all_dists[i]
                    masks[i, :] = dists / dists.sum()

                masks = masks.reshape((shape[1], shape[2], num_sources))
            # # Get cluster centers
            # centers = algo.cluster_centers_
            # centers = centers.T
            # centers = np.expand_dims(centers, axis=0)
            # centers = np.expand_dims(centers, axis=0)

            # # Compute the masks using the cluster centers
            # masks = centers * np.expand_dims(vectors[0], axis=3)
            # # masks = np.sum(masks*1.5, axis=2)
            # masks = np.sum(masks, axis=2)
            # masks = softmax(masks)
            # # masks = 1/(1 + np.exp(-masks))

    return masks
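A small smoke-test sketch with random embeddings (the shapes are arbitrary; np is assumed imported as in the function above):

if __name__ == '__main__':
    dummy_vectors = np.random.randn(1, 100, 129, 40)   # (Batch, Time, Frequency, Embedding)
    masks = get_cluster_masks(dummy_vectors, num_sources=2, binary_mask=True)
    print(masks.shape)  # (100, 129, 2)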