Example #1
import time

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from preprocess import transform, fill_missing
# logisticRegression and NaiveBayes are this example's own implementations,
# assumed importable from local modules


def main():
    # load training data
    filename_train = './data/train.csv'
    train_dataset = transform(filename_train)

    X = train_dataset['data']
    y = train_dataset['target']

    # fill in missing data (optional)
    X_full = fill_missing(X, 'mode', False)

    X_full_train, X_full_test, y_train, y_test = train_test_split(X_full, y, test_size=0.25, random_state=0)

    ### use the logistic regression
    print('Train the logistic regression classifier')
    """ your code here """
    lr_model = LogisticRegression()
    start_time = time.time()
    lr_model.fit(X_full_train,y_train)
    elapsed_time = time.time() - start_time
    y_predict = lr_model.predict(X_full_test)
    print('The accuracy of the sklearn lr classifier: %f, elapsed time: %.3f s'
          % (np.mean(y_test == y_predict), elapsed_time))
    clf = logisticRegression()
    start_time = time.time()
    clf.fit(X_full_train,y_train)
    elapsed_time = time.time() - start_time
    y_predict = clf.predict(X_full_test)
    print('The accuracy of my lr classifier: %f, elapsed time: %.3f s'
          % (np.mean(y_test == y_predict), elapsed_time))
    
    ### use the naive bayes
    print('Train the naive bayes classifier')
    """ your code here """
    nb_model = MultinomialNB()
    start_time = time.time()
    nb_model.fit(X_full_train, y_train)
    elapsed_time = time.time() - start_time
    y_predict = nb_model.predict(X_full_test)
    print('The accuracy of the sklearn nb classifier: %f, elapsed time: %.3f s'
          % (np.mean(y_test == y_predict), elapsed_time))
    clf = NaiveBayes()
    start_time = time.time()
    clf = clf.fit(X_full_train, y_train)
    elapsed_time = time.time() - start_time
    y_predict = clf.predict(X_full_test)
    print('The accuracy of my nb classifier: %f, elapsed time: %.3f s'
          % (np.mean(y_test == y_predict), elapsed_time))

    ## use the svm
    print('Train the SVM classifier')
    """ your code here """
    svm_model = svm.SVC(kernel='linear', C=1).fit(X_full_train, y_train)
    print('The accuracy of the sklearn SVM classifier: %f' % svm_model.score(X_full_test, y_test))

    ## use the random forest
    print('Train the random forest classifier')
    rf_model = RandomForestClassifier(n_estimators=500)
    rf_model.fit(X_full_train, y_train)
    print('The accuracy of the sklearn random forest classifier: %f' % rf_model.score(X_full_test, y_test))


    ## get predictions
    df = pd.read_csv('./data/test.csv')
    UserID = df.loc[:, 'UserID'].to_numpy()
    df = df.drop('UserID', axis=1)
    X_predict = df.to_numpy()
    # map each non-numeric column to integer codes, in the column order
    # that pd.get_dummies produces
    for n in range(df.shape[1]):
        if df.iloc[:, n].dtype != np.int64 and df.iloc[:, n].dtype != np.float64:
            for i, e in enumerate(pd.get_dummies(X_predict[:, n])):
                X_predict[:, n][X_predict[:, n] == e] = i
    X_full_predict = fill_missing(X_predict, 'mode', False)
    
    y_predict = lr_model.predict(X_full_predict)
    with open("./predictions/lr_predictions.csv", "w") as fo:
        fo.write("UserID,Happy\n")
        for i in range(y_predict.shape[0]):
            fo.write("%d,%d\n" % (UserID[i], y_predict[i]))

    y_predict = nb_model.predict(X_full_predict)
    with open("./predictions/nb_predictions.csv", "w") as fo:
        fo.write("UserID,Happy\n")
        for i in range(y_predict.shape[0]):
            fo.write("%d,%d\n" % (UserID[i], y_predict[i]))

    y_predict = svm_model.predict(X_full_predict)
    with open("./predictions/svm_predictions.csv", "w") as fo:
        fo.write("UserID,Happy\n")
        for i in range(y_predict.shape[0]):
            fo.write("%d,%d\n" % (UserID[i], y_predict[i]))

    y_predict = rf_model.predict(X_full_predict)
    with open("./predictions/rf_predictions.csv", "w") as fo:
        fo.write("UserID,Happy\n")
        for i in range(y_predict.shape[0]):
            fo.write("%d,%d\n" % (UserID[i], y_predict[i]))
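The four writer blocks above repeat one pattern; a small helper along these lines (hypothetical, not part of the original code) would collapse them:

def write_predictions(path, user_ids, y_pred):
    # hypothetical helper: write one 'UserID,Happy' CSV per model
    with open(path, 'w') as fo:
        fo.write('UserID,Happy\n')
        for uid, pred in zip(user_ids, y_pred):
            fo.write('%d,%d\n' % (uid, pred))

# usage: write_predictions('./predictions/lr_predictions.csv', UserID,
#                          lr_model.predict(X_full_predict))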
Example #2
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA

from preprocess import transform
from preprocess import fill_missing

## import data

filename_train = './data/train.csv'  ## path to your dataset

train_dataset = transform(filename_train)
X = train_dataset['data']
y = train_dataset['target']

# fill in missing data (optional)
dat, discard_row = fill_missing(X, 'most_frequent', False)
y = np.delete(y, discard_row)
# if your csv has no row or column titles, pass 'header=None' into read_csv
# and drop 'index_col=0' -- but your biplot will be clearer with row/col names

## perform PCA

n = dat.shape[1]

pca = PCA(n_components=n)
# n_components defaults to the number of columns in the imported data
# (i.e. the number of features), but can be set to any integer less than
# or equal to that value

pca.fit(dat)

## project data into PC space
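The snippet ends before the projection step it announces; with the pca fitted above, a minimal sketch of that step (not part of the original code) is:

scores = pca.transform(dat)  # rows of dat expressed in PC coordinates
print(pca.explained_variance_ratio_[:5])  # share of variance carried by the first PCs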
Example #3
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from preprocess import transform, fill_missing


def main():
    # load training data
    filename_train = "./data/train.csv"
    filename_test = "./data/test.csv"
    df = pd.read_csv(filename_test, header=0)
    X_pre_userId = df['UserID']
    X_pre_userId = X_pre_userId.to_numpy()
    train_dataset = transform(filename_train)
    test_dataset = transform(filename_test)

    X = train_dataset['data']
    y = train_dataset['target']
    X_pre = test_dataset['data']
    num_train = X.shape[0]
    # stack train and test rows so both are imputed consistently
    X = np.append(X, X_pre, 0)

    X_fill = fill_missing(X, 'most_frequent', False)
    # X_fill = fill_missing(X, 'most_frequent', True)
    X_pre_fill = X_fill[num_train:]
    X_fill = X_fill[0:num_train]

    X_train, X_test, y_train, y_test = train_test_split(X_fill,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=4)
    print(y_train.shape, y_test.shape)

    ### use the logistic regression
    print('Train the logistic regression classifier')
    """ your code here """
    lr_model = LogisticRegression(random_state=4)
    lr_model.fit(X_train, y_train)
    print(lr_model.score(X_test, y_test))
    lr_pre = lr_model.predict(X_pre_fill)
    with open('./predictions/lr_predictions.csv', 'w') as f:
        f.write('UserID,Happy\n')
        for i in range(lr_pre.shape[0]):
            f.write('%d,%s\n' % (X_pre_userId[i], lr_pre[i]))

    ### use the naive bayes
    print('Train the naive bayes classifier')
    """ your code here """
    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)
    print(nb_model.score(X_test, y_test))
    nb_pre = nb_model.predict(X_pre_fill)
    with open('./predictions/nb_predictions.csv', 'w') as f:
        f.write('UserID,Happy\n')
        for i in range(nb_pre.shape[0]):
            f.write('%d,%s\n' % (X_pre_userId[i], nb_pre[i]))

    ## use the svm
    print('Train the SVM classifier')
    """ your code here """
    svm_model = svm.SVC(kernel='linear', random_state=0)
    svm_model.fit(X_train, y_train)
    print(svm_model.score(X_test, y_test))
    svm_pre = svm_model.predict(X_pre_fill)
    with open('./predictions/svm_predictions.csv', 'w') as f:
        f.write('UserID,Happy\n')
        for i in range(svm_pre.shape[0]):
            f.write('%d,%s\n' % (X_pre_userId[i], svm_pre[i]))

    ## use the random forest
    print('Train the random forest classifier')
    """ your code here """
    rf_model = RandomForestClassifier(n_estimators=2600, random_state=4)
    rf_model = rf_model.fit(X_train, y_train)
    print(rf_model.score(X_test, y_test))
    rf_pre = rf_model.predict(X_pre_fill)
    with open('./predictions/rf_predictions.csv', 'w') as f:
        f.write('UserID,Happy\n')
        for i in range(rf_pre.shape[0]):
            f.write('%d,%s\n' % (X_pre_userId[i], rf_pre[i]))

    ## get predictions
    """ your code here """
Example #4
import time

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

from preprocess import transform, fill_missing
# LogitR, NaiveBayes, and cross_validation are this example's own
# implementations, assumed importable from local modules


def main():
    # load training data
    filename_train = './data/train.csv'
    train_dataset = transform(filename_train)
    X = train_dataset['data']
    y = train_dataset['target']

    # fill in missing data (optional)
    X_full, discard_row = fill_missing(X, 'most_frequent', True)
    y = np.delete(y,discard_row)
    
    
    n_samples, n_features = X_full.shape
    
    
    ### -------------------- use the logistic regression --------------------
    print('\n\nTrain the logistic regression classifier')
    train_X, train_y, valid_X, valid_y = cross_validation(0.08,X_full,y) #0.08
    # Sklearn package
    lr_model_time1 = time.time()
    lr_model = LogisticRegression()
    lr_model = lr_model.fit(train_X,train_y)
    lr_model_time2 = time.time()
    print("Sklearn LR validation score: {0}".format(lr_model.score(valid_X,valid_y)))
    print("Sklearn LR training time: %.3f s" % (lr_model_time2 - lr_model_time1))
    #print("Sklearn LR learnt coef:\n{0},\n{1}".format(lr_model.coef_[:,:5],lr_model.intercept_))
    
    
    # Self-implemented
    train_X, train_y, valid_X, valid_y = cross_validation(0.15,X_full,y) #0.15
    self_lr_time1 = time.time()
    self_lr = LogitR()
    self_lr = self_lr.fit(train_X,train_y)
    self_lr_time2 = time.time()
    print("Self LR validation score: {0}".format(self_lr.score(valid_X,valid_y)))
    print("Self LR training time: %.3f s" % (self_lr_time2 - self_lr_time1))
    #print("Self LR learnt coef:\n{0},\n{1}".format(self_lr.coef[:5],self_lr.intercept))
    ### -------------------- use the logistic regression --------------------
    
    
    
    ### -------------------- use the naive bayes --------------------
    # Sklearn package
    print('\n\nTrain the naive bayes classifier')
    train_X, train_y, valid_X, valid_y = cross_validation(0.1, X_full, y)  # Sklearn NB validation score: ~0.676
    nb_model_time1 = time.time()
    nb_model = BernoulliNB()
    nb_model.fit(train_X,train_y)
    nb_model_time2 = time.time()
    print("Sklearn NB validation score: {0}".format(nb_model.score(valid_X,valid_y)))
    print("SKlearn NB training time: %.3f s" % (nb_model_time2 - nb_model_time1))
    #sk_y_predict = nb_model.predict(X_full[1800:,1:n_features-1])
    
    
    
    # Self-implemented
    train_X, train_y, valid_X, valid_y = cross_validation(0.118, X_full, y)  # validation fraction tuned to 0.118 (Self NB score ~0.576)
    self_nb_time1 = time.time()
    self_nb = NaiveBayes()
    self_nb = self_nb.fit(train_X, train_y)
    self_nb_time2 = time.time()
    print("Self NB validation score: {0}".format(self_nb.score(valid_X, valid_y)))
    print("Self NB training time: %.3f s" % (self_nb_time2 - self_nb_time1))
    #self_y_predict = clf.predict(X_full[1800:,1:n_features-1])
    ### -------------------- use the naive bayes --------------------
    

    
    ### -------------------- use svm --------------------
    print('\n\nTrain the SVM classifier')
    # linear, poly, rbf, or precomputed (or self-defined)?
    train_X, train_y, valid_X, valid_y = cross_validation(0.17,X_full,y) #0.17
    svm_model_time1 = time.time()
    svm_model = svm.SVC(kernel="linear")
        # rbf score: 0.682; validation percentage: 0.113
        # sigmoid score: 0.577; validation percentage: 0.23
        # poly score: 0.685; validation percentage: 0.16
        # linear score: 0.701 validation percentage: 0.17
    svm_model.fit(train_X, train_y)
    svm_model_time2 = time.time()  # stop the clock before printing
    print("train_X:", train_X.shape)
    print("train_y:", train_y.shape)
    print("Sklearn SVM validation score: {0}".format(svm_model.score(valid_X,valid_y)))
    print("Sklearn SVM training time: %.3f s" % (svm_model_time2 - svm_model_time1))     
    ### -------------------- use svm --------------------
    
    
 
    ### -------------------- use random forest --------------------
    print('\n\nTrain the random forest classifier')
    train_X, train_y, valid_X, valid_y = cross_validation(0.151,X_full,y) # Sklearn RF validation score: 0.702 # i:  0.151
    rf_model_time1 = time.time()
    rf_model = RandomForestClassifier(n_estimators=29) # 29
    rf_model.fit(train_X,train_y)
    rf_model_time2 = time.time()
    print("Sklearn RF validation score: {0}".format(rf_model.score(valid_X,valid_y)))
    print("Sklearn RF training time: %.3f s" % (rf_model_time2 - rf_model_time1))
    ### -------------------- use random forest --------------------
      
    ## get predictions
    """ your code here """
Example #5
import numpy as np
from sklearn import svm

from preprocess import transform, fill_missing

income_id = 3
educate_id = 10
h = 0.2

filename_train = '../data/train.csv'
train_dataset = transform(filename_train)
X = train_dataset['data']
y = train_dataset['target']
'''
row_idx = X[:,0]
mat = X[:,income_id+1:educate_id+2:educate_id-income_id]
col_name = ['Income', 'EducationLevel']
X_show = pd.DataFrame(data=mat, index=row_idx, columns=col_name)
'''

X_full, discard_row = fill_missing(X, 'most_frequent', True)
#X_full = X_full[:,1:X_full.shape[1]-1]
X_full = X_full[:, income_id:educate_id + 1:educate_id - income_id]  # keep only the Income and EducationLevel columns
y_full = np.delete(y, discard_row)

#x_min, x_max = min(X_full[:,income_id])-1, max(X_full[:,income_id])+1
#y_min, y_max = min(X_full[:,educate_id])-1, max(X_full[:,educate_id])+1
x_min, x_max = min(X_full[:, 0]) - 1, max(X_full[:, 0]) + 1
y_min, y_max = min(X_full[:, 1]) - 1, max(X_full[:, 1]) + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

svm_model = svm.SVC(kernel="rbf")
#svm_model = svm.LinearSVC(C=1)
svm_model.fit(X_full, y_full)
print("SVM end")
Example #6
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA

from preprocess import fill_missing

# (the source snippet begins partway through mapping categorical values to
# integer codes; df, data, and n are defined earlier in the original file)
data[:, n][data[:, n] == 'Domestic Partners (w/kids)'] = 5

n = df.columns.get_loc("EducationLevel")
data[:, n][data[:, n] == 'Current K-12'] = 0
data[:, n][data[:, n] == 'High School Diploma'] = 1
data[:, n][data[:, n] == 'Current Undergraduate'] = 2
data[:, n][data[:, n] == "Associate's Degree"] = 3
data[:, n][data[:, n] == "Bachelor's Degree"] = 4
data[:, n][data[:, n] == "Master's Degree"] = 5
data[:, n][data[:, n] == 'Doctoral Degree'] = 6

for n in range(df.shape[1]):
    if df.iloc[:, n].dtype != np.int64 and df.iloc[:, n].dtype != np.float64:
        # map each remaining non-numeric column to integer codes, in the
        # column order that pd.get_dummies produces
        for i, e in enumerate(pd.get_dummies(data[:, n])):
            data[:, n][data[:, n] == e] = i

X_full = fill_missing(data, 'mode', False)
# mean-center each feature and scale by its range
mins = np.min(X_full, axis=0)
maxs = np.max(X_full, axis=0)
X_full = (X_full - np.mean(X_full, axis=0)) / (maxs - mins)

pca = PCA()
X_reduced = pca.fit_transform(X_full)

fig = plt.figure('PCA and biplot')
biplot(X_reduced, pca.components_, 1, 2, list(df))
plt.show()
fig.savefig('PCA and biplot.jpg')
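biplot is this example's own helper; assuming the call signature used above (scores, loadings, two 1-based PC indices, column labels), a minimal sketch might be:

import numpy as np
from matplotlib import pyplot as plt

def biplot(scores, loadings, pc_x, pc_y, labels):
    # hypothetical sketch: scatter the scores on two PCs and overlay
    # each feature's loading vector as an arrow
    plt.scatter(scores[:, pc_x - 1], scores[:, pc_y - 1], s=5, alpha=0.5)
    for j, name in enumerate(labels):
        lx, ly = loadings[pc_x - 1, j], loadings[pc_y - 1, j]
        plt.arrow(0, 0, lx, ly, color='r', width=0.002)
        plt.text(lx * 1.1, ly * 1.1, name, color='r', fontsize=8)
    plt.xlabel('PC%d' % pc_x)
    plt.ylabel('PC%d' % pc_y)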
Example #7
import numpy as np
import pandas as pd
from sklearn import svm

import preprocess

filename = '../data/train.csv'
dataset = preprocess.transform(filename)
#dataset = preprocess.fill_missing(dataset,strategy = 'most_frequent',isClassified = False)

X = dataset['data']
y = dataset['target']

# drop NaN
total = pd.concat([X, y], axis=1)
total = total.dropna()

#X = total.drop('Happy',1)
total = total[['Income', 'EducationLevel', 'Happy']].dropna()
total = preprocess.fill_missing(total,
                                strategy='most_frequent',
                                isClassified=False)

y = total['Happy']
X = total[['Income', 'EducationLevel']]
X = np.array(X)
y = np.array(y)

# train svm model
'''
svm_model = svm.SVC(kernel='rbf')
svm_model = svm_model.fit(X,y)
y_predict_svm = svm_model.predict(X)
'''

C = 1.0  # SVM regularization parameter
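The snippet stops right after defining C; reusing the rbf model from the commented block above, the training step would look something like this sketch:

svm_model = svm.SVC(kernel='rbf', C=C)
svm_model = svm_model.fit(X, y)
print('training accuracy: %f' % svm_model.score(X, y))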
Example #8
import numpy as np
from sklearn import svm

from preprocess import fill_missing

# (the source snippet begins partway through mapping Income brackets to
# integer codes; df, data, and y are defined earlier in the original file)
data[:, n][data[:, n] == '$50,000 - $74,999'] = 2
data[:, n][data[:, n] == '$75,000 - $100,000'] = 3
data[:, n][data[:, n] == '$100,001 - $150,000'] = 4
data[:, n][data[:, n] == 'over $150,000'] = 5

n = df.columns.get_loc("EducationLevel")
data[:, n][data[:, n] == 'Current K-12'] = 0
data[:, n][data[:, n] == 'High School Diploma'] = 1
data[:, n][data[:, n] == 'Current Undergraduate'] = 2
data[:, n][data[:, n] == "Associate's Degree"] = 3
data[:, n][data[:, n] == "Bachelor's Degree"] = 4
data[:, n][data[:, n] == "Master's Degree"] = 5
data[:, n][data[:, n] == 'Doctoral Degree'] = 6

X = data
X = fill_missing(X, 'mode', False)

h = .2  # step size in the mesh

# we create an instance of SVM and fit our data. We do not scale our
# data since we want to plot the support vectors
C = 1.0  # SVM regularization parameter
svc = svm.LinearSVC(C=C).fit(X, y)
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
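The closing comment promises a color plot; the conventional continuation (a sketch, assuming matplotlib and the usual contour recipe) is:

from matplotlib import pyplot as plt

Z = Z.reshape(xx.shape)  # back to the mesh's 2-D layout
plt.contourf(xx, yy, Z, alpha=0.4)
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k')
plt.title('LinearSVC decision regions')
plt.show()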