def do_kfolds(x, y):
    """Run stratified 10-fold cross-validation and print averaged metrics.

    For every fold the features are standardised with the mean and standard
    deviation of that fold's training part, a new model is built and fitted
    through ``trainer``, and the held-out part is classified. The MCC,
    accuracy, F1 score, precision and recall returned by
    ``get_performance_vals`` are collected per fold; their mean and standard
    deviation over the folds are printed at the end.

    Args:
        x: Sequence of feature vectors; converted with ``np.array``.
        y: Sequence of class labels aligned with ``x``.

    Returns:
        Tuple ``(trained_M, mean, std)`` from the LAST fold only: the model
        fitted on that fold and the standardisation statistics of its
        training part (``std`` already includes the stabilising epsilon).
    """
    random.seed(seed)
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
    accuracies = []
    fscores = []
    precisions = []
    recalls = []
    mccs = []
    x = np.array(x)
    y = np.array(y)
    print("Preparing training and label data...OK")
    print("")

    # Added to the standard deviation so constant features do not divide by 0.
    eps = 10**-5

    for train, test in kfold.split(x, y):
        x_train, y_train = x[train], y[train]
        x_test, y_test = x[test], y[test]

        # Statistics come from the training part only so that nothing about
        # the held-out part leaks into the scaling.
        mean = np.mean(x_train, axis=0)
        std = np.std(x_train, axis=0) + eps

        # Out-of-place arithmetic: the in-place form (`-=`, `/=`) raises a
        # casting error when the feature array has an integer dtype.
        x_train = (x_train - mean) / std
        x_test = (x_test - mean) / std

        print("Training model on data...")
        M = trainer.build_sequential_model(rate=0.3, shape=x_train.shape[1])
        trained_M = trainer.fit_model_batch(M, x_train, y_train, num_epoch=2000)

        print("Classifying data...")
        s_classify = time.time()
        classes = np.array(trained_M.predict_classes(x_test)).ravel()
        e_classify = time.time()
        print("Classifying data...OK, took: " + str((e_classify - s_classify)))

        mcc, accuracy, fscore, precision, recall = get_performance_vals(y_test, classes)

        mccs.append(mcc)
        accuracies.append(accuracy)
        fscores.append(fscore)
        precisions.append(precision)
        recalls.append(recall)

    print("MCC: %.2f (+/- %.2f)" % (np.mean(mccs), np.std(mccs)))
    print("Accuracy: %.2f%% (+/- %.2f%%)" % (100*np.mean(accuracies), 100*np.std(accuracies)))
    print("F1 score: %.2f (+/- %.2f)" % (np.mean(fscores), np.std(fscores)))
    print("Precision: %.2f (+/- %.2f)" % (np.mean(precisions), np.std(precisions)))
    print("Recall: %.2f (+/- %.2f)" % (np.mean(recalls), np.std(recalls)))

    return trained_M, mean, std
def _numeric_matrix(dvecs):
    """Return the numerical vector of every non-``None`` descriptor vector."""
    return [
        FX.num_vector_from_descriptor_vector(dvec) for dvec in dvecs
        if dvec is not None
    ]


def _standardize_pair(x_train, x_test, eps=10**-5):
    """Standardise both sets with the training set's mean and std (+ eps)."""
    mean = np.mean(x_train, axis=0)
    std = np.std(x_train, axis=0) + eps
    # Out-of-place arithmetic so integer-typed arrays do not raise on `-=`.
    x_train = (np.asarray(x_train) - mean) / std
    x_test = (np.asarray(x_test) - mean) / std
    return x_train, x_test, mean, std


def _load_pickle(path):
    """Load one pickled object from ``path`` and close the file afterwards."""
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file even if dumping fails."""
    with open(path, 'w+b') as handle:
        pickle.dump(obj, handle)


def main(model=None):
    """Run the pipeline selected by the module-level ``choice`` setting.

    Modes, as implemented below:
        0: cross-validation on the training pickles via ``do_kfolds``.
        1: train on the training pickles, evaluate on the separate test
           pickles (``postest`` / ``negtest``).
        2: dropout-rate search via ``do_crossval``.
        3: stratified 80/20 train/test split of the training pickles.
        4: merge training and test pickles, then ``do_kfolds``.
        5: merge training and test pickles, then ``do_crossval``.
        6: load a saved model plus its mean/std pickles and classify the
           ``predict_*`` pickles.

    For every mode except 6 the standardisation statistics and the trained
    model (JSON architecture + HDF5 weights) are written to the current
    directory.

    Args:
        model: Unused; kept for backward compatibility with callers.
    """
    uses_test_set = choice in (1, 4, 5)

    if choice == 6:
        if already_extracted == 0:
            extract_descriptors_from_file_to_pickle(predict_pos_input_name,
                                                    predict_pos_name)
            extract_descriptors_from_file_to_pickle(predict_neg_input_name,
                                                    predict_neg_name)
        pos_dvec = IO.deserialize_descriptor_vector(predict_pos_name)
        neg_dvec = IO.deserialize_descriptor_vector(predict_neg_name)
    else:
        if already_extracted == 0:
            pos_samples = extract_descriptors_from_file_to_pickle(
                pos_input_name, pos_name)
            if use_random_small_sequence_negative == 0:
                extract_descriptors_from_file_to_pickle(
                    neg_input_name, neg_name, pos_samples)
            if uses_test_set:
                extract_descriptors_from_file_to_pickle("Insert_name", postest)
                extract_descriptors_from_file_to_pickle("insert_name", negtest)

        print("Deserializing descriptor vectors...")
        pos_dvec = IO.deserialize_descriptor_vector(pos_name)

        if use_random_small_sequence_negative != 0:
            neg_dvec = IO.deserialize_descriptor_vector(
                "neg_pipeline_complete_anticancer")  #same as neg_cytotoxic
            if len(neg_dvec) >= len(pos_dvec):
                # Truncate the negatives so both classes have the same size.
                neg_dvec = neg_dvec[:len(pos_dvec)]
            else:
                print(
                    "Set use_random_small_sequence_negative to zero, because that pickle file does not contain enough samples to maintain alanced classes! Use CTRL-C to quit!"
                )
                # Pause so the user can abort; pressing Enter continues.
                input()
        else:
            neg_dvec = IO.deserialize_descriptor_vector(neg_name)
            if len(neg_dvec) != len(pos_dvec):
                print(
                    "Warning! Class balance is no achieved! Increase negative dataset sampling! Use CTRL-C to quit!"
                )
                print("Negative dataset length: %d" % (len(neg_dvec)))
                print("Positive dataset length: %d" % (len(pos_dvec)))
                # Pause so the user can abort; pressing Enter continues.
                input()

        if uses_test_set:
            pos_dvec_test = IO.deserialize_descriptor_vector(postest)
            neg_dvec_test = IO.deserialize_descriptor_vector(negtest)

        print("Deserializing descriptor vectors...OK")
        print("")

    print("Extracting numerical vectors...")
    # maybe save these too separately

    #'''Choosing to train only with certain features'''

    #mask = {'T', 'P', 'G', 'D', 'Q', 'C', 'E', 'M', 'K'}
    #pos_dvec = [{key: dvec[key] for key in dvec.keys() & mask} for dvec in pos_dvec]
    #neg_dvec = [{key: dvec[key] for key in dvec.keys() & mask} for dvec in neg_dvec]

    pos_nmat = _numeric_matrix(pos_dvec)
    neg_nmat = _numeric_matrix(neg_dvec)

    # The test matrices are needed for choice 5 as well: they are consumed
    # below for choices 1, 4 AND 5, so building them only for 1 and 4 made
    # choice 5 fail with a NameError.
    if uses_test_set:
        pos_nmat_test = _numeric_matrix(pos_dvec_test)
        neg_nmat_test = _numeric_matrix(neg_dvec_test)

    print("Extracting numerical vectors...OK")
    print("")

    print("Preparing training and label data...")

    # Negatives (label 0) first, then positives (label 1); shuffling is left
    # to the splitters used later.
    x = neg_nmat + pos_nmat
    y = [0] * len(neg_nmat) + [1] * len(pos_nmat)

    if uses_test_set:
        x_test = neg_nmat_test + pos_nmat_test
        y_test = [0] * len(neg_nmat_test) + [1] * len(pos_nmat_test)

        if choice == 4 or choice == 5:
            # Modes 4 and 5 pool the test samples into the training data.
            x.extend(x_test)
            y.extend(y_test)
            if choice == 4:
                trained_M, mean, std = do_kfolds(x, y)
            if choice == 5:
                trained_M, mean, std = do_crossval(x, y)

    print("Preparing training and label data...OK")
    print("")

    if choice == 6:

        with open('./' + model_name + '.json', 'r') as json_file:
            loaded_model_json = json_file.read()
        loaded_model = model_from_json(loaded_model_json)
        # load weights into new model
        loaded_model.load_weights("./" + model_name + ".h5")
        print("Loaded model from disk")

        optim = Adam(lr=0.01, beta_1=0.95)

        loaded_model.compile(loss='binary_crossentropy',
                             optimizer=optim,
                             metrics=['accuracy'])

        # Statistics saved at training time; std already contains the epsilon.
        std = _load_pickle("./" + std_name + ".pickle")
        mean = _load_pickle("./" + mean_name + ".pickle")

        x = (np.asarray(x) - mean) / std

        result = loaded_model.predict(x)
        print("Probabilities:")
        print(result)
        classes = np.array(loaded_model.predict_classes(x)).ravel()
        print("Calsses:")
        print(classes)

        if known_classes == 1:

            get_performance_vals(y, classes)

    # 10-folds cross-validation
    if choice == 0:

        if old_dataset == 1:

            no_features = 114  #MODIFY THIS ACCORDINGLY

            # Drop every sample whose feature count differs from no_features.
            x = np.array([np.array(xi).T for xi in x])
            num = [
                i for i in range(x.shape[0])
                if x[i].shape[0] != no_features
            ]
            print(len(num))
            x = np.delete(x, num, 0)
            y = np.delete(y, num, 0)
            x = np.array([np.array(xi).T for xi in x
                          ])  #needed to be done again for some reason
            print(x.size)
            # reshape returns a new array; the result used to be discarded.
            x = x.reshape(x.shape[0], no_features)

        trained_M, mean, std = do_kfolds(x, y)

    if choice == 1:

        y_train = np.array(y)
        x_train, x_test, mean, std = _standardize_pair(np.array(x), x_test)

        print("Training model on data...")
        s_training = time.time()
        M = trainer.build_sequential_model(rate=0.3, shape=x_train.shape[1])
        trained_M = trainer.fit_model_batch(M,
                                            x_train,
                                            y_train,
                                            num_epoch=2000)
        e_training = time.time()
        print("Training model on data...OK, took: " +
              str((e_training - s_training)))

        print("Classifying data...")
        s_classify = time.time()
        classes = np.array(trained_M.predict_classes(x_test)).ravel()
        e_classify = time.time()
        print("Classifying data...OK, took: " + str((e_classify - s_classify)))

        get_performance_vals(y_test, classes)

    if choice == 2:

        trained_M, mean, std = do_crossval(x, y)

    if choice == 3:

        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.2,
                                                            train_size=0.8,
                                                            random_state=seed,
                                                            stratify=y)
        x_train, x_test, mean, std = _standardize_pair(x_train, x_test)

        print("Training model on data...")
        M = trainer.build_sequential_model(rate=0.3, shape=x_train.shape[1])
        trained_M = trainer.fit_model_batch(M,
                                            x_train,
                                            y_train,
                                            num_epoch=2000)

        print("Classifying data...")
        s_classify = time.time()
        classes = np.array(trained_M.predict_classes(x_test)).ravel()
        e_classify = time.time()
        print("Classifying data...OK, took: " + str((e_classify - s_classify)))

        get_performance_vals(y_test, classes)

    if choice != 6:

        # Persist the standardisation statistics next to the model so that
        # prediction mode (choice 6) can apply the same scaling.
        _dump_pickle(std, "./" + std_name + ".pickle")
        _dump_pickle(mean, "./" + mean_name + ".pickle")

        # serialize model to JSON
        model_json = trained_M.to_json()
        with open("./" + model_name + ".json", "w") as json_file:
            json_file.write(model_json)
        # serialize weights to HDF5
        trained_M.save_weights("./" + model_name + ".h5")
        print("Saved model to disk")
def do_crossval(x, y):
    """Pick a dropout rate on a validation split, then evaluate on a test split.

    The data is split (stratified) into 80% / 20% to obtain the test set, and
    the 80% part is split again into 75% / 25% for training and validation.
    All three parts are standardised with the training part's mean and
    standard deviation. One model is trained per candidate dropout rate and
    scored by MCC on the validation part; the rate with the highest MCC is
    then used to train a final model on the training part, which is evaluated
    on the test part through ``get_performance_vals``.

    Args:
        x: Sequence of feature vectors.
        y: Sequence of class labels aligned with ``x``.

    Returns:
        Tuple ``(trained_M, mean, std)``: the final model and the training
        part's standardisation statistics (``std`` already includes the
        stabilising epsilon).
    """
    mccs = []
    rate_arr = [0.2, 0.3, 0.4, 0.5, 0.6]  #dropout rate

    x, x_test, y, y_test = train_test_split(x,
                                            y,
                                            test_size=0.2,
                                            train_size=0.8,
                                            random_state=seed,
                                            stratify=y)
    x_train, x_validate, y_train, y_validate = train_test_split(
        x, y, test_size=0.25, train_size=0.75, random_state=seed, stratify=y)

    # Statistics come from the training part only; eps keeps constant
    # features from causing a division by zero.
    eps = 10**-5
    mean = np.mean(x_train, axis=0)
    std = np.std(x_train, axis=0) + eps

    # Out-of-place arithmetic: the in-place form (`-=`, `/=`) raises a
    # casting error when the splits are integer-typed arrays.
    x_train = (np.asarray(x_train) - mean) / std
    x_validate = (np.asarray(x_validate) - mean) / std
    x_test = (np.asarray(x_test) - mean) / std

    for rate in rate_arr:
        print("Training model on data...")
        s_training = time.time()
        M = trainer.build_sequential_model(rate=rate, shape=x_train.shape[1])
        trained_M = trainer.fit_model_batch(M,
                                            x_train,
                                            y_train,
                                            num_epoch=2000)
        e_training = time.time()
        print("Training model on data...OK, took: " +
              str((e_training - s_training)))

        print("Classifying data...")
        s_classify = time.time()
        classes = np.array(trained_M.predict_classes(x_validate)).ravel()
        e_classify = time.time()
        print("Classifying data...OK, took: " + str((e_classify - s_classify)))

        # Only the MCC (first returned value) drives the model selection.
        mcc = get_performance_vals(y_validate, classes)[0]
        mccs.append(mcc)

    best_rate = rate_arr[np.argmax(mccs)]
    print("Best dropout rate is %f" % (best_rate))

    print("Training model on data...")
    s_training = time.time()
    M = trainer.build_sequential_model(rate=best_rate, shape=x_train.shape[1])
    trained_M = trainer.fit_model_batch(M, x_train, y_train, num_epoch=2000)
    e_training = time.time()
    print("Training model on data...OK, took: " +
          str((e_training - s_training)))

    print("Classifying data...")
    s_classify = time.time()
    classes = np.array(trained_M.predict_classes(x_test)).ravel()
    e_classify = time.time()
    print("Classifying data...OK, took: " + str((e_classify - s_classify)))

    print("Best dropout rate is %f" % (best_rate))
    get_performance_vals(y_test, classes)

    return trained_M, mean, std