def function(nhmmstates, nestimators, nhmmiter):

    nhmmstates = nhmmstates[0]
    nestimators = nestimators[0] * 100
    nhmmiter = nhmmiter[0] * 10
    nfolds = 5
    hmmcovtype = "full"

    print nhmmstates, nestimators, nhmmiter

    # Load the dataset
    static_train = np.load("/storage/hpc_anna/GMiC/Data/ECoGmixed/fourier/train_data.npy")
    dynamic_train = np.load("/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/train_data.npy")
    static_val = np.load("/storage/hpc_anna/GMiC/Data/ECoGmixed/fourier/test_data.npy")
    dynamic_val = np.load("/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/test_data.npy")
    labels_train = np.load("/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/train_labels.npy")
    labels_val = np.load("/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/test_labels.npy")

    # Merge train and test
    static_all = np.concatenate((static_train, static_val), axis=0)
    dynamic_all = np.concatenate((dynamic_train, dynamic_val), axis=0)
    labels_all = np.concatenate((labels_train, labels_val), axis=0)
    nsamples = static_all.shape[0]

    # prepare where to store the ratios
    ratios_all_hmm = np.empty(len(labels_all))

    # split indices into folds
    enrich_idx_list = np.array_split(range(nsamples), nfolds)

    # run CV
    for fid, enrich_idx in enumerate(enrich_idx_list):
        train_idx = list(set(range(nsamples)) - set(enrich_idx))

        # extract predictions using HMM on dynamic
        hmmcl = HMMClassifier()
        model_pos, model_neg = hmmcl.train(
            nhmmstates, nhmmiter, hmmcovtype, dynamic_all[train_idx], labels_all[train_idx]
        )
        ratios_all_hmm[enrich_idx] = hmmcl.pos_neg_ratios(model_pos, model_neg, dynamic_all[enrich_idx])

    # dataset for hybrid learning
    enriched_by_hmm = np.concatenate((static_all, np.matrix(ratios_all_hmm).T), axis=1)

    # (2.) k-fold cross validation to obtain accuracy
    val_idx_list = np.array_split(range(nsamples), nfolds)
    scores = []
    for fid, val_idx in enumerate(val_idx_list):
        train_idx = list(set(range(nsamples)) - set(val_idx))

        # Hybrid on features enriched by HMM (3)
        rf = RandomForestClassifier(n_estimators=nestimators)
        rf.fit(enriched_by_hmm[train_idx], labels_all[train_idx])
        scores.append(rf.score(enriched_by_hmm[val_idx], labels_all[val_idx]))

    print "Result: %.4f" % np.mean(scores)
    return -np.mean(scores)
    train_idx = list(set(range(nsamples)) - set(predict_idx))
    
    # extract predictions using RF on static
    print "    Extracting predictions on static data with RF..."
    rf = RandomForestClassifier(n_estimators=nestimators)
    rf.fit(static_all[train_idx], labels_all[train_idx])
    predictions_all_rf[predict_idx] = rf.predict_log_proba(static_all[predict_idx])
    predictions_all_rf[predictions_all_rf == -inf] = np.min(predictions_all_rf[predictions_all_rf != -inf])

    # extract predictions using HMM on dynamic
    print "    Extracting predictions on dynamic data with HMM..."
    hmmcl = HMMClassifier()
    model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype, 
                                       dynamic_all[train_idx], labels_all[train_idx])
    predictions_all_hmm[predict_idx] = hmmcl.predict_log_proba(model_pos, model_neg, dynamic_all[predict_idx])
    ratios_all_hmm[predict_idx] = hmmcl.pos_neg_ratios(model_pos, model_neg, dynamic_all[predict_idx])

    # extract predictions using LSTM on dynamic
    print "    Extracting predictions on dynamic data with LSTM..."
    lstmcl = LSTMClassifier(lstmsize, lstmdropout, lstmoptim, lstmnepochs)
    model_pos, model_neg = lstmcl.train(dynamic_all[train_idx], labels_all[train_idx])
    mse_pos, mse_neg = lstmcl.predict_mse(model_pos, model_neg, dynamic_all[predict_idx]) 
    predictions_all_lstm[predict_idx] = np.vstack((mse_pos, mse_neg)).T
    ratios_all_lstm[predict_idx] = lstmcl.pos_neg_ratios(model_pos, model_neg, dynamic_all[predict_idx])


#
# Prepare combined datasets for the future experiments
#

# datasets for ensemble learning
# split the training data into two halves
#   fh stands for first half
#   sh stands for second half
print "Splitting data in two halves..."
fh_data, fh_labels, sh_data, sh_labels = DataHandler.split(0.5, train_data, train_labels)

# train HMM on first 50% of the training set
print "Training HMM classifier..."
hmmcl = HMMClassifier()
model_pos, model_neg = hmmcl.train(NSTATES, NITERS, fh_data, fh_labels)

# feed second 50% of the training set into the HMM to obtain
# pos/neg ratio for every sequence in the second half of the training set
print "Extracting ratios based on the HMM model..."
sh_ratios = hmmcl.pos_neg_ratios(model_pos, model_neg, sh_data)
test_ratios = hmmcl.pos_neg_ratios(model_pos, model_neg, test_data)

# apply fourier transform on the second 50% of the training set
print "Fouriering the second half of the dataset..."
fourier_sh_data = Fourier.data_to_fourier(sh_data)
fourier_test_data = Fourier.data_to_fourier(test_data)

# augment fourier results of the second 50% train with the ratios thus producing an enriched dataset
print "Merging Fourier features and HMM-based ratios..."
enriched_sh_data = np.hstack((fourier_sh_data, sh_ratios.reshape(len(sh_ratios), 1)))
enriched_test_data = np.hstack((fourier_test_data, test_ratios.reshape(len(test_ratios), 1)))

# train RF on the enriched dataset
print "Training RF on the merged dataset..."
rf = RandomForestClassifier(n_estimators=500)
    ratios_all_lstm = np.empty(len(labels_all))

    # split indices into folds
    predict_idx_list = np.array_split(range(nsamples), nfolds)

    # run CV
    for fid, predict_idx in enumerate(predict_idx_list):
        print "Enrichment fold %d / %d" % (fid + 1, nfolds)
        train_idx = list(set(range(nsamples)) - set(predict_idx))

        # extract predictions using HMM on dynamic
        hmmcl = HMMClassifier()
        model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype,
                                           dynamic_all[train_idx],
                                           labels_all[train_idx])
        ratios_all_hmm[predict_idx] = hmmcl.pos_neg_ratios(
            model_pos, model_neg, dynamic_all[predict_idx])

        # extract predictions using LSTM on dynamic
        lstmcl = LSTMClassifier(lstmsize, lstmdropout, lstmoptim, lstmnepochs,
                                lstmbatchsize)
        model_pos, model_neg = lstmcl.train(dynamic_all[train_idx],
                                            labels_all[train_idx])
        mse_pos, mse_neg = lstmcl.predict_mse(model_pos, model_neg,
                                              dynamic_all[predict_idx])
        ratios_all_lstm[predict_idx] = lstmcl.pos_neg_ratios(
            model_pos, model_neg, dynamic_all[predict_idx])

    # datasets for hybrid learning
    enriched_by_hmm = np.concatenate((static_all, np.matrix(ratios_all_hmm).T),
                                     axis=1)
    enriched_by_lstm = np.concatenate(
Example #5
0
                          replace=False)
sh_idx = list(set(range(0, dynamic_train.shape[0])) - set(fh_idx))
fh_data = dynamic_train[fh_idx, :, :]
fh_labels = labels_train[fh_idx]
sh_data = dynamic_train[sh_idx, :, :]
sh_labels = labels_train[sh_idx]

# RF+HMM
print "Evaluating RF+HMM model:"

print "Training HMM classifier..."
hmmcl = HMMClassifier()
model_pos, model_neg = hmmcl.train(3, 10, fh_data, fh_labels)

print "Extracting ratios based on the HMM model..."
sh_ratios = hmmcl.pos_neg_ratios(model_pos, model_neg, sh_data)
val_ratios = hmmcl.pos_neg_ratios(model_pos, model_neg, dynamic_val)

print "Merging static features and HMM-based ratios..."
enriched_sh_data = np.hstack(
    (static_train[sh_idx, :], sh_ratios.reshape(len(sh_ratios), 1)))
enriched_val_data = np.hstack(
    (static_val, val_ratios.reshape(len(val_ratios), 1)))

print "Training RF on the merged dataset..."
rf = RandomForestClassifier(n_estimators=rf_estimators, n_jobs=-1)
rf.fit(enriched_sh_data, sh_labels)
print "RF+HMM with enriched features on validation set: %.4f" % rf.score(
    enriched_val_data, labels_val)

# RF+LSTM
# split the training data into two halves
#   fh stands for first half
#   sh stands for second half
print "Splitting data in two halves..."
fh_data, fh_labels, sh_data, sh_labels = DataHandler.split(
    0.5, train_data, train_labels)

# train HMM on first 50% of the training set
print "Training HMM classifier..."
hmmcl = HMMClassifier()
model_pos, model_neg = hmmcl.train(NSTATES, NITERS, fh_data, fh_labels)

# feed second 50% of the training set into the HMM to obtain
# pos/neg ratio for every sequence in the second half of the training set
print "Extracting ratios based on the HMM model..."
sh_ratios = hmmcl.pos_neg_ratios(model_pos, model_neg, sh_data)
test_ratios = hmmcl.pos_neg_ratios(model_pos, model_neg, test_data)

# apply fourier transform on the second 50% of the training set
print "Fouriering the second half of the dataset..."
fourier_sh_data = Fourier.data_to_fourier(sh_data)
fourier_test_data = Fourier.data_to_fourier(test_data)

# augment fourier results of the second 50% train with the ratios thus producing an enriched dataset
print "Merging Fourier features and HMM-based ratios..."
enriched_sh_data = np.hstack(
    (fourier_sh_data, sh_ratios.reshape(len(sh_ratios), 1)))
enriched_test_data = np.hstack(
    (fourier_test_data, test_ratios.reshape(len(test_ratios), 1)))

# train RF on the enriched dataset
predictions_all_hmm = np.empty((len(labels_all), 2))
predictions_all = np.empty((len(labels_all), ))

# split indices into folds
enrich_idx_list = np.array_split(range(nsamples), nfolds)

# run CV for enrichment
for fid, enrich_idx in enumerate(enrich_idx_list):
    print "Current fold is %d/%d" % (fid + 1, nfolds)
    train_idx = list(set(range(nsamples)) - set(enrich_idx))

    # extract predictions using HMM on dynamic
    hmmcl = HMMClassifier()
    model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype, 
                                       dynamic_all[train_idx], labels_all[train_idx])
    ratios_all_hmm[enrich_idx] = hmmcl.pos_neg_ratios(model_pos, model_neg, dynamic_all[enrich_idx])
    predictions_all_hmm[enrich_idx] = hmmcl.predict_log_proba(model_pos, model_neg, dynamic_all[enrich_idx])
    predictions_all[enrich_idx] = hmmcl.predict(hmmcl.tensor_to_list(dynamic_all[enrich_idx]), model_pos, model_neg)
 
# dataset for hybrid learning
dynamic_as_static = dynamic_all.reshape((dynamic_all.shape[0], dynamic_all.shape[1] * dynamic_all.shape[2]))
enriched_by_hmm = np.concatenate((dynamic_as_static, predictions_all_hmm), axis=1)


# k-fold cross validation to obtain accuracy
print '===> HMM on dynamic: %.4f' % hmmcl.accuracy(predictions_all, labels_all)

val_idx_list = np.array_split(range(nsamples), nfolds)
scores = []
for fid, val_idx in enumerate(val_idx_list):
    print "Current fold is %d/%d" % (fid + 1, nfolds)
Example #8
0
rf = RandomForestClassifier(n_estimators=nestimators)
rf.fit(trainA_static, trainA_labels)
predictions_trainB_rf = rf.predict_log_proba(trainB_static)
predictions_trainB_rf[predictions_trainB_rf == -inf] = np.min(
    predictions_trainB_rf[predictions_trainB_rf != -inf])
predictions_test_rf = rf.predict_log_proba(test_static)
predictions_test_rf[predictions_test_rf == -inf] = np.min(
    predictions_test_rf[predictions_test_rf != -inf])

# extract predictions using HMM on dynamic
hmmcl = HMMClassifier()
model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype,
                                   trainA_dynamic, trainA_labels)
predictions_trainB_hmm = hmmcl.predict_log_proba(model_pos, model_neg,
                                                 trainB_dynamic)
ratios_trainB_hmm = hmmcl.pos_neg_ratios(model_pos, model_neg, trainB_dynamic)
predictions_test_hmm = hmmcl.predict_log_proba(model_pos, model_neg,
                                               test_dynamic)
ratios_test_hmm = hmmcl.pos_neg_ratios(model_pos, model_neg, test_dynamic)

# extract predictions using LSTM on dynamic
lstmcl = LSTMClassifier(lstmsize, lstmdropout, lstmoptim, lstmnepochs,
                        lstmbatchsize)
model_pos, model_neg = lstmcl.train(trainA_dynamic, trainA_labels)
mse_pos, mse_neg = lstmcl.predict_mse(model_pos, model_neg, trainB_dynamic)
predictions_trainB_lstm = np.vstack((mse_pos, mse_neg)).T
ratios_trainB_lstm = lstmcl.pos_neg_ratios(model_pos, model_neg,
                                           trainB_dynamic)
mse_pos, mse_neg = lstmcl.predict_mse(model_pos, model_neg, test_dynamic)
predictions_test_lstm = np.vstack((mse_pos, mse_neg)).T
ratios_test_lstm = lstmcl.pos_neg_ratios(model_pos, model_neg, test_dynamic)
print ""
print "Evaluating joint model:"
print "Splitting data in two halves..."
fh_idx = np.random.choice(range(0, dynamic_train_data.shape[0]), size=np.round(dynamic_train_data.shape[0] * 0.5, 0), replace=False)
sh_idx = list(set(range(0, dynamic_train_data.shape[0])) - set(fh_idx))
fh_data = dynamic_train_data[fh_idx, :, :]
fh_labels = dynamic_train_labels[fh_idx]
sh_data = dynamic_train_data[sh_idx, :, :]
sh_labels = dynamic_train_labels[sh_idx]

print "Training HMM classifier..."
hmmcl = HMMClassifier()
model_pos, model_neg = hmmcl.train(3, 10, fh_data, fh_labels)

print "Extracting ratios based on the HMM model..."
sh_ratios = hmmcl.pos_neg_ratios(model_pos, model_neg, sh_data)
val_ratios = hmmcl.pos_neg_ratios(model_pos, model_neg, dynamic_val_data)

print "Merging static features and HMM-based ratios..."
enriched_sh_data = np.hstack((static_train_data[sh_idx, :], sh_ratios.reshape(len(sh_ratios), 1)))
enriched_val_data = np.hstack((static_val_data, val_ratios.reshape(len(val_ratios), 1)))

print "Training RF on the merged dataset..."
rf = RandomForestClassifier(n_estimators=100)
rf.fit(enriched_sh_data, sh_labels)

print np.sum(static_val_labels == dynamic_val_labels), len(static_val_labels)

print "RF+HMM with enriched features on validation set: %.4f" % rf.score(enriched_val_data, static_val_labels)

def bpic(nhmmstates, nestimators, nhmmiter):

    nhmmstates = nhmmstates[0]
    nestimators = nestimators[0] * 100
    nhmmiter = nhmmiter[0] * 10
    nfolds = 5
    hmmcovtype = 'full'

    print nhmmstates, nestimators, nhmmiter

    # Load the dataset
    static_train = np.load(
        '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/train_static.npy'
    )
    dynamic_train = np.load(
        '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/train_dynamic.npy'
    )
    static_val = np.load(
        '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/test_static.npy'
    )
    dynamic_val = np.load(
        '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/test_dynamic.npy'
    )
    labels_train = np.load(
        '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/train_labels.npy'
    )
    labels_val = np.load(
        '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/test_labels.npy'
    )

    # Merge train and test
    static_all = np.concatenate((static_train, static_val), axis=0)
    dynamic_all = np.concatenate((dynamic_train, dynamic_val), axis=0)
    labels_all = np.concatenate((labels_train, labels_val), axis=0)
    nsamples = static_all.shape[0]

    # prepare where to store the ratios
    ratios_all_hmm = np.empty(len(labels_all))

    # split indices into folds
    enrich_idx_list = np.array_split(range(nsamples), nfolds)

    # run CV
    for fid, enrich_idx in enumerate(enrich_idx_list):
        train_idx = list(set(range(nsamples)) - set(enrich_idx))

        # extract predictions using HMM on dynamic
        hmmcl = HMMClassifier()
        model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype,
                                           dynamic_all[train_idx],
                                           labels_all[train_idx])
        ratios_all_hmm[enrich_idx] = hmmcl.pos_neg_ratios(
            model_pos, model_neg, dynamic_all[enrich_idx])

    # dataset for hybrid learning
    enriched_by_hmm = np.concatenate((static_all, np.matrix(ratios_all_hmm).T),
                                     axis=1)

    # (2.) k-fold cross validation to obtain accuracy
    val_idx_list = np.array_split(range(nsamples), nfolds)
    scores = []
    for fid, val_idx in enumerate(val_idx_list):
        train_idx = list(set(range(nsamples)) - set(val_idx))

        # Hybrid on features enriched by HMM (3)
        rf = RandomForestClassifier(n_estimators=nestimators)
        rf.fit(enriched_by_hmm[train_idx], labels_all[train_idx])
        scores.append(rf.score(enriched_by_hmm[val_idx], labels_all[val_idx]))

    print 'Result: %.4f' % np.mean(scores)
    return -np.mean(scores)
#
print 'Training enrichment models...'

# extract predictions using RF on static
rf = RandomForestClassifier(n_estimators=nestimators)
rf.fit(trainA_static, trainA_labels)
predictions_trainB_rf = rf.predict_log_proba(trainB_static)
predictions_trainB_rf[predictions_trainB_rf == -inf] = np.min(predictions_trainB_rf[predictions_trainB_rf != -inf])
predictions_test_rf = rf.predict_log_proba(test_static)
predictions_test_rf[predictions_test_rf == -inf] = np.min(predictions_test_rf[predictions_test_rf != -inf])

# extract predictions using HMM on dynamic
hmmcl = HMMClassifier()
model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype, trainA_dynamic, trainA_labels)
predictions_trainB_hmm = hmmcl.predict_log_proba(model_pos, model_neg, trainB_dynamic)
ratios_trainB_hmm = hmmcl.pos_neg_ratios(model_pos, model_neg, trainB_dynamic)
predictions_test_hmm = hmmcl.predict_log_proba(model_pos, model_neg, test_dynamic)
ratios_test_hmm = hmmcl.pos_neg_ratios(model_pos, model_neg, test_dynamic)

# extract predictions using LSTM on dynamic
lstmcl = LSTMClassifier(lstmsize, lstmdropout, lstmoptim, lstmnepochs, lstmbatchsize)
model_pos, model_neg = lstmcl.train(trainA_dynamic, trainA_labels)
mse_pos, mse_neg = lstmcl.predict_mse(model_pos, model_neg, trainB_dynamic)
predictions_trainB_lstm = np.vstack((mse_pos, mse_neg)).T
ratios_trainB_lstm = lstmcl.pos_neg_ratios(model_pos, model_neg, trainB_dynamic)
mse_pos, mse_neg = lstmcl.predict_mse(model_pos, model_neg, test_dynamic)
predictions_test_lstm = np.vstack((mse_pos, mse_neg)).T
ratios_test_lstm = lstmcl.pos_neg_ratios(model_pos, model_neg, test_dynamic)


#
ratios_all_hmm = np.empty(len(labels_all))
predictions_all_hmm = np.empty((len(labels_all), 2))
predictions_all = np.empty((len(labels_all),))

# split indices into folds
enrich_idx_list = np.array_split(range(nsamples), nfolds)

# run CV for enrichment
for fid, enrich_idx in enumerate(enrich_idx_list):
    print "Current fold is %d/%d" % (fid + 1, nfolds)
    train_idx = list(set(range(nsamples)) - set(enrich_idx))

    # extract predictions using HMM on dynamic
    hmmcl = HMMClassifier()
    model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype, dynamic_all[train_idx], labels_all[train_idx])
    ratios_all_hmm[enrich_idx] = hmmcl.pos_neg_ratios(model_pos, model_neg, dynamic_all[enrich_idx])
    predictions_all_hmm[enrich_idx] = hmmcl.predict_log_proba(model_pos, model_neg, dynamic_all[enrich_idx])
    predictions_all[enrich_idx] = hmmcl.predict(hmmcl.tensor_to_list(dynamic_all[enrich_idx]), model_pos, model_neg)

# dataset for hybrid learning
dynamic_as_static = dynamic_all.reshape((dynamic_all.shape[0], dynamic_all.shape[1] * dynamic_all.shape[2]))
enriched_by_hmm = np.concatenate((dynamic_as_static, predictions_all_hmm), axis=1)


# k-fold cross validation to obtain accuracy
print "===> HMM on dynamic: %.4f" % hmmcl.accuracy(predictions_all, labels_all)

val_idx_list = np.array_split(range(nsamples), nfolds)
scores = []
for fid, val_idx in enumerate(val_idx_list):
    print "Current fold is %d/%d" % (fid + 1, nfolds)