def testLogisticRegression():
    print("\n\n Testing Logistic Regression:")
    df = pd.read_csv("testData/ex2data1.txt", delimiter=',', header=None)
    df_copy = df.copy()
    df.columns = [0, 1, 2]
    print(df)
    print(df.describe())
    df_y = df[2]
    df, mu, sigma = featureScaling(df)
    X = df.values
    # reshape the labels into an (m, 1) column vector
    y = df_y.values.reshape(-1, 1)
    theta, cost, X = lg.logisticRegression(400, 0.1, X[:, 0:2], y[:, 0:1])
    plotLogisticRegression(df_copy.values, y, theta, cost)
     print("\nTest successful")
     print("<-------------------->")
     print("Theta:")
     print(theta)
     print("<-------------------->")
     print("Testing for:")
     print(
         "Student with a score of 45 in Exam 1 and a score of 85 in Exam 2")
     test = np.array([1, (45 - mu[0]) / sigma[0], (85 - mu[1]) / sigma[1]])
     predict_prob = lg.sigmoid(np.matmul(test, theta))
     #print(str(test) + " * " + str(theta))
     print("Probability that the Student passes: " + str(predict_prob) +
           " Expected Value: 0.775 +/- 0.002")
     print("<-------------------->")
    predict_prob_one = lg.sigmoid(np.matmul(X[12:13, :], theta))
    predict_prob_two = lg.sigmoid(np.matmul(X[43:44, :], theta))
    predict_prob_three = lg.sigmoid(np.matmul(X[21:22, :], theta))
    # .item() pulls the scalar out of each 1x1 result array
    print(str(predict_prob_one.item()) + " <-- Example 14 --> " + str(y[12, 0]))
    print(str(predict_prob_two.item()) + " <-- Example 45 --> " + str(y[43, 0]))
    print(str(predict_prob_three.item()) + " <-- Example 23 --> " + str(y[21, 0]))
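Example #1 leans on two helpers that the snippet doesn't show, featureScaling and lg.sigmoid. A minimal sketch of plausible implementations, assuming z-score standardization; the names and signatures are inferred from the call sites above, not confirmed by the source:

import numpy as np

def featureScaling(df):
    # z-score each column; also return the statistics so new inputs
    # (like the 45/85 query above) can be scaled the same way
    mu = df.mean()
    sigma = df.std()
    return (df - mu) / sigma, mu, sigma

def sigmoid(z):
    # logistic function: maps a linear score to a probability in (0, 1)
    return 1.0 / (1.0 + np.exp(-z))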
Example #2
    df['Standard Deviation'] = stdvA + stdvE
    df['Range'] = rangeA + rangeE
    df['Inner Quartile Range'] = iqrA + iqrE
    #df['Absolute Value Mean']= ((absSumA+absSumE))
    df['Output'] = outputSet
    df = df.sample(frac=1).reset_index(drop=True)
    print(df)
    return df

    #print(training_df)


df = create_prob_functions()
#linnearRegression=linnearRegression.LinnearRegression()
#linnearRegression.runModel(df)
# running the logistic regression model
logisticRegression = logisticRegression.logisticRegression()
logisticRegression.runModel(df)
"""
kMeans=kMeans.kMeans()
category=kMeans.runModel(df)

"""

#training_df=df.tail(20000)
#testing_df=df.tail(20000)
"""
mlpnn=mlpnn.MLPNN()
model=mlpnn.runModel(training_df, df)
"""
Example #3
convertToNum(wine_data)
convertToNum(cancer_data)
 
#wine_data = Stats.removeOutliers(wine_data)
#Stats.normalityOfFeatures(wine_data)
#ratioOnes = Stats.ratioOfOnes(wine_data.iloc[:,-1])
print("-------logistic regression--------------")
"""wineLR = lR.logisticRegression(wine_data.iloc[:,:-1], wine_data.iloc[:,-1], 0.1, 100, 0,5) 
start = time.time()
acc=wineLR.start()
end = time.time()
print("wine")
print(acc)
print(end-start)"""

cancerLR = lR.logisticRegression(cancer_data.iloc[:,:-1], cancer_data.iloc[:,-1], 0.1, 100, 0, 5)
start = time.time()
acc = cancerLR.start()
end = time.time()
print("cancer")
print(acc)
print(end-start)
#print("-----------------LDA------------------------------") 
print("---------linear discriminant analysis------------")
var = lDA.linearDiscriminantAnalysis(cancer_data.iloc[:,:-1])
var = lDA.linearDiscriminantAnalysis(wine_data.iloc[:,:-1])
wd = wine_data.iloc[:, 0:11]
x0 = wine_data.iloc[3:4, 0:11]
#(ans,inc,cor)=var.predict_A(wd,wine_data.iloc[:,-1])
start = time.time()
myl = var.predict_k_Log_odds(wd, wine_data.iloc[:, -1], 5)
Example #4
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from logisticRegression import logisticRegression

pathname = os.path.dirname(sys.argv[0])
os.chdir(pathname)

data = pd.read_csv("../ex2data1.txt", delimiter=",", header=None)
X = data.values[:, :2]
y = data.values[:, 2]
del data

clf = logisticRegression(lambd=0, tol=1e-6, verbose=True)
clf.fit(X, y)

y_pred = clf.predict(X)
print("train accuracy =", clf.accuracy(y_pred, y))

# add decision boundary
xx = np.linspace(X[:, 0].min(), X[:, 0].max())
yy = -clf.theta[1] / clf.theta[2] * xx - clf.theta[0] / clf.theta[2]

plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, alpha=0.8, cmap=plt.cm.Paired)

plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, alpha=0.8, cmap=plt.cm.Paired)
plt.plot(xx, yy, 'k-')
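The yy line above is the model's decision boundary: the set of inputs where the linear score is zero, i.e. the 0.5-probability contour of the sigmoid. Solving for the second feature gives the plotted line:

\theta_0 + \theta_1 x_1 + \theta_2 x_2 = 0 \quad\Longrightarrow\quad x_2 = -\frac{\theta_1}{\theta_2}\,x_1 - \frac{\theta_0}{\theta_2}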
Example #7
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from analyseData import analyseData
from plotDecisionBoundry import plotDecisionBoundry
from evaluateRegression import evaluateRegression
from kmeans import kmeans
from dbscan import dbscan
from agglomerative import agglomerative
from evaluateClustering import evaluateClustering
from vizualizeData import vizualizeData

#loading pre-processed data
X_train, X_test, yClass_train, yClass_test, yReg_train, yReg_test = processData()

yClass_lr = logisticRegression(X_train, X_test, yClass_train)
lrAc = accuracy_score(yClass_test, yClass_lr)
X_lr = X_test

#reloading data pre-processed with different parameters
X_train, X_test, yClass_train, yClass_test, yReg_train, yReg_test = processData(
    normalization='mms')

yClass_rf = randomForest(X_train, X_test, yClass_train)
rfAc = accuracy_score(yClass_test, yClass_rf)
X_rf = X_test

yClass_knn = kNN(X_train, X_test, yClass_train)
knnAc = accuracy_score(yClass_test, yClass_knn)
X_knn = X_test
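A natural follow-up, sketched here as an addition (not part of the original snippet), is to print the three test accuracies side by side; it uses only names defined above:

# quick side-by-side comparison of the three classifiers
for name, ac in [("logistic regression", lrAc),
                 ("random forest", rfAc),
                 ("kNN", knnAc)]:
    print(f"{name}: accuracy = {ac:.3f}")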
Example #8
def main():

    # -------------------------- First 1D function

    max_iter = 100
    x_histo, f_histo, grad_histo = optimize(fonc=_test_1, dfonc=_dtest_1,
                                            xinit=[2], eps=0.1, max_iter=100)
    x_histo2, f_histo2, grad_histo2 = optimize(
        fonc=_test_1, dfonc=_dtest_1, xinit=[2], eps=0.8, max_iter=100)

    _plot_2D_val_grad_f("xcos(x)\nen fonction du nombre d'itération,\n"
                        "les valeurs de f et du gradient de f",
                        x_histo, f_histo, grad_histo)

    _plot_2D_compare(title="xcos(x)\nla fonction f et la trajectoire de\n"
                     "l'optimisation(les valeurs successives de f(x))",
                     fonc=_test_1, dfonc=_dtest_1, xinit1=[2], xinit2=[2],
                     eps1=0.1, eps2=0.8)

    _plot_courbe(
        "xcos(x), curve (t, log||xt - x*||)",
        x_histo,
        x_histo2,
        max_iter)

    # -------------------------- Second 1D function

    x_histo, f_histo, grad_histo = optimize(fonc=_test_2, dfonc=_dtest_2,
                                            xinit=[2], eps=0.1)
    x_histo2, f_histo2, grad_histo2 = optimize(fonc=_test_2, dfonc=_dtest_2,
                                               xinit=[2], eps=0.8)

    _plot_2D_val_grad_f("-log(x)+x^2\nen fonction du nombre d'itération,\n"
                        "les valeurs de f et du gradient de f",
                        x_histo, f_histo, grad_histo)

    _plot_2D_compare(title="-log(x)+x^2\nla fonction f et la trajectoire de\n"
                     "l'optimisation(les valeurs successives de f(x))",
                     fonc=_test_2, dfonc=_dtest_2, xinit1=[2], xinit2=[2],
                     eps1=0.1, eps2=0.8)

    _plot_courbe(
        "-log(x)+x^2, curve (t, log||xt - x*||)",
        x_histo,
        x_histo2,
        max_iter)
    # -------------------------- 2d function Rosenbrock (or banana)

    x_histo, f_histo, grad_histo = optimize(fonc=_test_3, dfonc=_dtest_3,
                                            xinit=[0, 1], eps=0.1)

    _plot_2D_val_grad_f("Rosenbrock\nen fonction du nombre d'itération,\n"
                        "les valeurs de f et du gradient de f",
                        x_histo, f_histo, grad_histo)

    _plot_3D(x_histo, f_histo, grad_histo, _test_3)

    # --------------------------- Logistic Regression

    trainx, trainy = load_usps("USPS_train.txt")
    testx, testy = load_usps("USPS_test.txt")

    logisticReg = logisticRegression(loss_g=cost_f_g, max_iter=100,
                                     epsilon=0.1)
    # logisticReg.fit(trainx, trainy)

    # Weight matrices: 6 vs 9 and 1 vs 8
    fig, (ax1, ax2) = plt.subplots(ncols=2, sharex=True, sharey=True)
    plt.suptitle("Weight matrices")
    weight_matrix(6, 9, fig, logisticReg, ax1)
    weight_matrix(1, 8, fig, logisticReg, ax2)
    # plt.savefig("weight_matrix_qqlexs_LR")

    # Weight matrix: 6 vs All
    matrix_one_vs_all(6, logisticReg)

    # Error curves: 6 vs All
    error_curves(6)

    # --------------------------- Naïve Bayes

    clf_gaussian = GaussianNB()

    test_clf_on_usps(clf_gaussian, 6)
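main() relies on an optimize helper that the snippet does not include. A minimal gradient-descent sketch matching the call signature used above (fonc, dfonc, xinit, eps, max_iter); this is an assumption about the implementation, not the author's code:

import numpy as np

def optimize(fonc, dfonc, xinit, eps=0.1, max_iter=100):
    # plain gradient descent; records the iterate, value, and gradient at each step
    x = np.asarray(xinit, dtype=float)
    x_histo, f_histo, grad_histo = [], [], []
    for _ in range(max_iter):
        g = np.asarray(dfonc(x))
        x_histo.append(x.copy())
        f_histo.append(fonc(x))
        grad_histo.append(g)
        x = x - eps * g
    return np.array(x_histo), np.array(f_histo), np.array(grad_histo)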
Example #9
if __name__ == "__main__":

    # download the data from the internet if it doesn't exist
    if not os.path.exists("ex2data1.txt"):
        req = requests.get("https://raw.github.com/SaveTheRbtz/ml-class/master/ex2/ex2data1.txt")
        # req.content is a bytes object, so the file must be opened in binary mode
        with open("ex2data1.txt", "wb") as f:
            f.write(req.content)

    # load the text file and separate it into data and labels
    textfile = np.loadtxt("ex2data1.txt", delimiter=",")
    data = textfile[:, 0:2]
    label = textfile[:, 2]

    # fit logistic regression
    model = logisticRegression.logisticRegression()
    model.fit(data, label)

    # print information
    model.summary()

    # visualize information
    plt.clf()

    pos = label.astype(bool)
    neg = np.abs(label - 1.0).astype(bool)
    plt.plot(data[pos, 0], data[pos, 1], "ro")
    plt.plot(data[neg, 0], data[neg, 1], "bo")

    X, Y = np.meshgrid(np.arange(25, 105), np.arange(25, 105))
    result = np.zeros((0, 80))
Example #10
from logisticRegression import logisticRegression
from naiveBayesGaussian import naiveBayesGaussian
from Utils import datasets, plotCompareErrors

num_splits = 10
train_percent = [10, 25, 50, 75, 100]

LR_errors = logisticRegression(num_splits, train_percent)
NB_errors = naiveBayesGaussian(num_splits, train_percent)

names = ["LogisticReg", "NaiveBayes"]

for dataname in datasets.keys():
    error_list = []
    error_list.append(LR_errors[dataname])
    error_list.append(NB_errors[dataname])
    plotCompareErrors(names, dataname, error_list, train_percent)
Example #11
    # Compute Kendall tau correlation
    ## TODO: compute the Kendall tau correlation of the feature lists
    ## produced by the two feature selection measures

    #####################################################################

    # Select the top num_feat features
    X = X[:, index[:num_feat]]

X = np.transpose(X)

# Start measuring execution time
start = timeit.default_timer()

# Train the classifier
w = logisticRegression(X, Y)

# Print logistic regression learning execution time
stop = timeit.default_timer()
print('Running Time: ' + str(stop - start))

# Load test data and split data from class labels
data = np.loadtxt('test.csv', delimiter=',')
rY = data[:, 0]  # The real class labels
test = np.transpose(data[:, 1:data.shape[1]])

# Keep the features indicated by the feature selection task
if featureSelection:
    test = test[index[:num_feat], :]

# Perform predictions of the test data
Example #12
import sys
import os
sys.path.append(os.pardir)
import numpy as np
from dataset.mnist import load_mnist
import random
import logisticRegression

def generate_rand_data(origin_size, rand_size):
    # build a boolean mask that selects rand_size distinct random indices
    rand_array = np.zeros(origin_size, dtype=bool)
    ran_num = random.randint(0, origin_size - 1)

    for i in range(rand_size):
        # resample until we hit an index that isn't taken yet
        while rand_array[ran_num]:
            ran_num = random.randint(0, origin_size - 1)
        rand_array[ran_num] = True
    return rand_array

# Load the MNIST data set
(x_train, y_train), (x_test, y_test) = load_mnist(flatten=True, normalize=True)  # , one_hot_label=True)
num = np.unique(y_train, axis=0)  # the array of unique values in y
num = num.shape[0]  # the number of unique values in y
y_train = np.eye(num)[y_train]  # np.eye builds an identity matrix; row y of it is the one-hot vector for class y

LRmodel = logisticRegression.logisticRegression(x_train, y_train, 'multi')

# train on the training data
LRmodel.learn(learning_rate=0.001, epoch=10)
accuracy = LRmodel.predict(x_test, y_test)
print("accuracy:", accuracy, "  score:", int(accuracy*y_test.shape[0]), "/", y_test.shape[0])
Example #13
def _runLogRegression(designM, labels, alpha, regParam, iterations):
    theta = logisticRegression(designM, labels, alpha, regParam, iterations)
    a = accuracy(logHypo(theta, designM), labels)
    return theta, a
Example #14
# evaluation data (10% of the whole set)
Xte = myData.X[dtrNum:]
Yte = myData.Y[dtrNum:]
#-------------------

#-------------------
# 3. standardize the input data
xMean = np.mean(Xtr, axis=0)
xStd = np.std(Xtr, axis=0)
Xtr = (Xtr - xMean) / xStd
Xte = (Xte - xMean) / xStd
#-------------------

#-------------------
# 4. train and evaluate the logistic regression model
myModel = lr.logisticRegression(Xtr, Ytr)

trLoss = []
teLoss = []

for ite in range(1001):
    trLoss.append(myModel.CE(Xtr, Ytr))
    teLoss.append(myModel.CE(Xte, Yte))

    if ite % 100 == 0:
        print(f"iteration: {ite}")
        print(f"model parameters:\nw={myModel.w},\nb={myModel.b}")
        print(f"mean cross-entropy loss = {myModel.CE(Xte,Yte):.2f}")
        print(f"accuracy = {myModel.accuracy(Xte,Yte):.2f}")
        print("----------------")
Example #15
score_split = train_data.groupby(['score']).count()
score_split.reset_index(inplace=True)
score_split = pd.DataFrame(score_split, columns=['score', 'V2'])
score_split.rename(columns={'V2': 'count'}, inplace=True)

# data preprocessing
preprocess = data_preprocessing.preprocessing(train_data)
# one call instead of two; the method returns (numerical, categorical)
numerical_var, categorical_var = preprocess.split_numerical_categorical()
low_corr_var = preprocess.numerical_feature_selection(numerical_var)

################ Numerical data classifier ####################
# logistic regression classifier
score_LR = {}
for C in [0.1, 1, 10, 100, 1000]:
    regr = logisticRegression.logisticRegression(train_data, low_corr_var, C)
    score_LR[C] = regr.classifier()

score_LR_df = pd.DataFrame(list(score_LR.values()),
                           index=list(score_LR.keys()),
                           columns=['precision', 'recall', 'accuracy'])
score_LR_df[['precision0',
             'precision1']] = score_LR_df['precision'].apply(pd.Series)
score_LR_df[['recall0', 'recall1']] = score_LR_df['recall'].apply(pd.Series)
score_LR_df = pd.DataFrame(
    score_LR_df,
    columns=['precision0', 'precision1', 'recall0', 'recall1', 'accuracy'])

fig, ax = plt.subplots(figsize=(8, 5))
ax = sns.heatmap(score_LR_df)
Example #16
    class_num = np.unique(input, axis=0)  # the array of unique values in the labels
    class_num = class_num.shape[0]  # the number of unique classes
    # np.eye builds an identity matrix; row y of it is the one-hot vector for class y
    return np.eye(class_num)[input], class_num


# Load Iris Data set
iris = load_iris()

# Parsing the data sets
X = iris.data  # iris data input
y = iris.target  # iris target = label : 0, 1, 2
y_name = iris.target_names  # iris target name : Setosa, Versicolor, Virginica

# Divide the data into train and test sets with sklearn's built-in splitter.
# test_size: fraction of the data held out for testing
# shuffle: whether to shuffle; random_state: seed for the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/15, shuffle=True, random_state=int(time.time()))
y_train, class_num = one_hot_encoding(y_train)

LRmodel = logisticRegression.logisticRegression(X_train, y_train)

# train on the training data
LRmodel.learn(learning_rate=0.001, epoch=10)
accuracy = LRmodel.predict(X_test, y_test)
print("accuracy:", accuracy, "  score:", int(accuracy * y_test.shape[0]), "/",
      y_test.shape[0])
Example #17
import logisticRegression

# Load Mnist Data set
(x_train, y_train), (x_test, y_test) = load_mnist(flatten=True,
                                                  normalize=True,
                                                  one_hot_label=True)
class_num = np.unique(y_train,
                      axis=0)  # the array of unique one-hot rows in y
class_num = class_num.shape[0]  # the number of unique classes

LRmodel_arr, cost_arr = [], []
for i in range(class_num):
    print("\n***", i, "th Logistic Regression model ***")
    LRmodel_arr.append(
        logisticRegression.logisticRegression(x_train, y_train[:, i]))
    cost_arr.append(LRmodel_arr[i].learn(learning_rate=0.1, epoch=1))

for graph in cost_arr:
    plt.plot(graph[0], graph[1])
plt.title('Binary-Class Model\'s Loss Graph')
plt.xlabel('number of iteration')
plt.ylabel('cost')
plt.legend(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
           loc='upper right')
plt.tight_layout()
plt.show()

# train on the training data
i = 0
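The loop above trains one binary classifier per digit (one-vs-rest); prediction would score an input under all ten models and take the argmax. A hedged sketch; score_proba is a hypothetical method name, substitute whatever the logisticRegression class actually exposes:

import numpy as np

def predict_one_vs_rest(models, x):
    # score x under every binary model and return the most confident class
    scores = np.array([m.score_proba(x) for m in models])  # score_proba: hypothetical
    return int(np.argmax(scores))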
Example #19
from logisticRegression import logisticRegression
from sklearn.preprocessing import PolynomialFeatures

pathname = os.path.dirname(sys.argv[0])
os.chdir(pathname)

data = pd.read_csv("../ex2data2.txt", delimiter=",", header=None)
X = data.values[:, :2]
y = data.values[:, 2]
del data

poly = PolynomialFeatures(6, include_bias=False)
X_new = poly.fit_transform(X)
print("After polynomial feature transformation, now shape=", X_new.shape)

clf = logisticRegression(lambd=1, verbose=False, tol=1e-10)
clf.fit(X_new, y)
y_pred = clf.predict(X_new)
print("train accuracy = ", clf.accuracy(y_pred, y))

h = .02  # step size in the mesh

# create a mesh to plot in
x_min, x_max = X[:, 0].min() - .2, X[:, 0].max() + .2
y_min, y_max = X[:, 1].min() - .2, X[:, 1].max() + .2
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

Z = clf.predict(poly.fit_transform(np.c_[xx.ravel(), yy.ravel()]))

# Put the result into a color plot
Z = Z.reshape(xx.shape)
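The snippet stops right after reshaping Z; a plausible completion of the color plot, following the standard matplotlib mesh pattern (an assumption, not the original code):

plt.figure()
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.4)  # decision regions
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Paired)  # data points
plt.show()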
Example #20
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection
from sklearn.model_selection import train_test_split
from logisticRegression import logisticRegression
from sklearn import linear_model

data = pd.read_csv('../data/train.csv')
data = data.replace(np.nan, -1)
for col in ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']:
    col_max = data[col].max()
    col_min = data[col].min()
    data[col] = (data[col] - col_min) / (col_max - col_min)
data = data[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex', 'Survived']]
data['Sex_male'] = data['Sex'].apply(lambda x: 1 if x == 'male' else 0)
data['Sex_female'] = data['Sex'].apply(lambda x: 1 if x == 'female' else 0)
train, test = train_test_split(data, test_size=0.3, random_state=2018)
train_y = train[['Survived']]
train_x = train[[
    'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Sex_female'
]]
test_y = test[['Survived']]
test_x = test[[
    'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Sex_female'
]]

clf = logisticRegression(learning_rate=0.01,
                         epoch=100,
                         bacth_size=10,  # sic: matches the class's own (misspelled) parameter name
                         feature_size=7)

clf.fit(train_x, train_y, test_x, test_y)
Example #22
def crossValidateLogistic(X, Y, regType, kfold, numSamples):
    #%%
    import numpy as np
    from logisticRegression import logisticRegression
    import matplotlib.pyplot as plt
    from numpy import random as rng

    scale = np.sqrt((X**2).mean())
    mskOutNans = (np.sum(np.isnan(X), axis=1) + np.squeeze(np.isnan(Y))) < 1
    X = X[mskOutNans, :]
    Y = Y[mskOutNans]
    numObservations, numFeatures = X.shape
    # regularization grid: 0 plus a log-spaced sweep (flattened by hand;
    # compiler.ast's flatten no longer exists in Python 3)
    lvect = [0.] + list(10.0**np.arange(-5, 2, 0.5))

    l = np.zeros((len(lvect), 2))
    if regType == 'l2':
        l[:, 0] = lvect
        # l2-regularization
    elif regType == 'l1':
        l[:, 1] = lvect
        # l1-regularization
    else:
        lvect = [0., 0.]  # no regularization
        l = np.zeros((len(lvect), 2))  # keep l an array so l * scale and l.shape work below

    l = l * scale

    perClassErrorTest = np.nan + np.ones((numSamples, l.shape[0]))
    perClassErrorTrain = np.nan + np.ones((numSamples, l.shape[0]))

    for s in range(numSamples):
        ## %%%%%% shuffle trials to break any dependencies on the sequence of trials
        shfl = rng.permutation(np.arange(0, numObservations))
        Ys = Y[shfl]
        Xs = X[shfl, :]

        ## %%%%% divide the data into training and testing sets
        YTrain = Ys[np.arange(0, int((kfold - 1.) / kfold * numObservations))]
        YTest = Ys[np.arange(int((kfold - 1.) / kfold * numObservations),
                             numObservations)]

        XTrain = Xs[np.arange(0, int(
            (kfold - 1.) / kfold * numObservations)), :]
        XTest = Xs[np.arange(int(
            (kfold - 1.) / kfold * numObservations), numObservations), :]
        ## %%%%% loop over the possible regularization values
        for i in range(l.shape[0]):
            w, b, lps, perClassEr, cost, optParams = logisticRegression(
                np.reshape(XTrain, (np.prod(XTrain.shape)), order='F'), YTrain,
                l[i, :])
            perClassErrorTest[s, i] = optParams.perClassErFn(XTest, YTest)
            perClassErrorTrain[s, i] = optParams.perClassErFn(XTrain, YTrain)
        print('cross-validating: %.2f %% completed' %
              ((s + 1.) / (numSamples + 0.) * 100.))

    meanPerClassErrorTrain = np.mean(perClassErrorTrain, axis=0)
    semPerClassErrorTrain = np.std(perClassErrorTrain,
                                   axis=0) / np.sqrt(numSamples)

    meanPerClassErrorTest = np.mean(perClassErrorTest, axis=0)
    semPerClassErrorTest = np.std(perClassErrorTest,
                                  axis=0) / np.sqrt(numSamples)
    ix = np.argmin(meanPerClassErrorTest)
    l = l[meanPerClassErrorTest <=
          (meanPerClassErrorTest[ix] + semPerClassErrorTest[ix]), :]
    lbest = l[-1, :]
    # best regularization term based on minError+SE criteria
    ix = np.sum(l == lbest, 1) == 2
    ##%%%%%% plot cross-validation results
    plt.figure('cross validation')

    plt.fill_between(lvect,
                     meanPerClassErrorTrain - semPerClassErrorTrain,
                     meanPerClassErrorTrain + semPerClassErrorTrain,
                     alpha=0.5,
                     edgecolor='k',
                     facecolor='k')
    plt.fill_between(lvect,
                     meanPerClassErrorTest - semPerClassErrorTest,
                     meanPerClassErrorTest + semPerClassErrorTest,
                     alpha=0.5,
                     edgecolor='r',
                     facecolor='r')
    plt.plot(lvect, meanPerClassErrorTrain, 'k', label='training')
    plt.plot(lvect, meanPerClassErrorTest, 'r', label='validation')
    plt.plot(np.array(lvect)[ix], meanPerClassErrorTest[ix], 'bo')
    plt.xlim([lvect[1], lvect[-1]])
    plt.xscale('log')
    plt.xlabel('regularization parameter')
    plt.ylabel('classification error (%)')
    plt.legend()
    return lbest
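The selection step above implements a "minimum error plus one standard error" rule: among all regularization strengths whose mean validation error is within one standard error of the best, it keeps the largest (the last row of the filtered l):

\lambda^{*} = \max\left\{\lambda \;:\; \bar{e}_{\text{test}}(\lambda) \le \bar{e}_{\text{test}}(\lambda_{\min}) + \mathrm{SE}(\lambda_{\min})\right\}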