import time

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from preprocess import transform, fill_missing
# logisticRegression and NaiveBayes are this project's self-implemented
# classifiers; import them from wherever they live in your module layout.


def main():
    # load training data
    filename_train = './data/train.csv'
    train_dataset = transform(filename_train)
    X = train_dataset['data']
    y = train_dataset['target']

    # fill in missing data (optional)
    X_full = fill_missing(X, 'mode', False)
    X_full_train, X_full_test, y_train, y_test = train_test_split(
        X_full, y, test_size=0.25, random_state=0)

    ### use the logistic regression
    print('Train the logistic regression classifier')
    lr_model = LogisticRegression()
    start_time = time.time()
    lr_model.fit(X_full_train, y_train)
    elapsed_time = time.time() - start_time
    y_predict = lr_model.predict(X_full_test)
    print('The accuracy of the sklearn lr classifier: '
          + str(sum(y_test == y_predict) / y_test.shape[0])
          + ' elapsed time: ' + str(elapsed_time))

    clf = logisticRegression()  # self-implemented classifier
    start_time = time.time()
    clf.fit(X_full_train, y_train)
    elapsed_time = time.time() - start_time
    y_predict = clf.predict(X_full_test)
    print('The accuracy of my lr classifier: '
          + str(sum(y_test == y_predict) / y_test.shape[0])
          + ' elapsed time: ' + str(elapsed_time))

    ### use the naive bayes
    print('Train the naive bayes classifier')
    nb_model = MultinomialNB()
    start_time = time.time()
    nb_model.fit(X_full_train, y_train)
    elapsed_time = time.time() - start_time
    y_predict = nb_model.predict(X_full_test)
    print('The accuracy of the sklearn nb classifier: '
          + str(sum(y_test == y_predict) / y_test.shape[0])
          + ' elapsed time: ' + str(elapsed_time))

    clf = NaiveBayes()  # self-implemented classifier
    start_time = time.time()
    clf = clf.fit(X_full_train, y_train)
    elapsed_time = time.time() - start_time
    y_predict = clf.predict(X_full_test)
    print('The accuracy of my nb classifier: '
          + str(sum(y_test == y_predict) / y_test.shape[0])
          + ' elapsed time: ' + str(elapsed_time))

    ## use the svm
    print('Train the SVM classifier')
    svm_model = svm.SVC(kernel='linear', C=1).fit(X_full_train, y_train)
    print('The accuracy of the sklearn SVM classifier: %f'
          % svm_model.score(X_full_test, y_test))

    ## use the random forest
    print('Train the random forest classifier')
    rf_model = RandomForestClassifier(n_estimators=500)
    rf_model.fit(X_full_train, y_train)
    print('The accuracy of the sklearn random forest classifier: %f'
          % rf_model.score(X_full_test, y_test))

    ## get predictions
    df = pd.read_csv('./data/test.csv')
    UserID = df.loc[:, 'UserID'].values
    df = df.drop('UserID', axis=1)
    X_predict = df.values

    # encode the non-numeric columns as integer category codes
    for n in range(df.shape[1]):
        if df.iloc[:, n].dtypes != np.int64 and df.iloc[:, n].dtypes != np.float64:
            g = pd.get_dummies(X_predict[:, n])
            i = 0
            for e in list(g):
                X_predict[:, n][X_predict[:, n] == e] = i
                i = i + 1

    X_full_predict = fill_missing(X_predict, 'mode', False)

    # write one prediction file per model
    for name, model in [('lr', lr_model), ('nb', nb_model),
                        ('svm', svm_model), ('rf', rf_model)]:
        y_predict = model.predict(X_full_predict)
        with open('./predictions/%s_predictions.csv' % name, 'w') as fo:
            fo.write('UserID,Happy\n')
            for i in range(y_predict.shape[0]):
                fo.write('%d,%d\n' % (UserID[i], y_predict[i]))


if __name__ == '__main__':
    main()
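# `fill_missing` lives in this project's preprocess module and is not shown in
# this excerpt; the scripts here even unpack its return value differently. A
# minimal sketch consistent with the call above -- impute each column with its
# most frequent value -- where the isClassified branch and the sketch's name
# are our assumptions, inferred from how the other drivers use discard_row:
def fill_missing_sketch(X, strategy='most_frequent', isClassified=False):
    df = pd.DataFrame(X)
    if isClassified:
        # drop rows that contain missing values and report their positions,
        # so the caller can delete the matching entries of y
        discard_row = np.where(df.isnull().any(axis=1))[0]
        return df.dropna().values, discard_row
    # 'mode' / 'most_frequent': fill each column with its most common value
    # (assumes every column has at least one non-missing entry)
    filled = df.apply(lambda col: col.fillna(col.mode().iloc[0]))
    return filled.values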
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA

from preprocess import transform
from preprocess import fill_missing

## import data
filename_train = './data/train.csv'  ## path to your dataset
train_dataset = transform(filename_train)
X = train_dataset['data']
y = train_dataset['target']

# fill in missing data (optional)
dat, discard_row = fill_missing(X, 'most_frequent', False)
y = np.delete(y, discard_row)
# if your csv has no row or column titles, pass 'header=None' into read_csv
# and delete 'index_col=0' -- but your biplot will be clearer with row/col names

## perform PCA
n = dat.shape[1]
pca = PCA(n_components=n)
# n_components defaults to the number of columns in the imported data (i.e. the
# number of features), but it can be set to any integer up to that value
pca.fit(dat)

## project data into PC space
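# A minimal sketch of the projection step the heading above announces: map the
# imputed data into PC space with the fitted `pca` and plot how much variance
# the leading components explain (the plot styling is our choice, not the
# original's).
scores = pca.transform(dat)  # rows of `dat` expressed in PC coordinates
print('leading explained variance ratios:', pca.explained_variance_ratio_[:5])
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()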
def main(): # load training data filename_train = "./data/train.csv" filename_test = "./data/test.csv" df = pd.read_csv(filename_test, header=0) X_pre_userId = df['UserID'] X_pre_userId = X_pre_userId.as_matrix() train_dataset = transform(filename_train) test_dateset = transform(filename_test) X = train_dataset['data'] y = train_dataset['target'] X_pre = test_dateset['data'] num_train = X.shape[0] X = np.append(X, X_pre, 0) X_fill = fill_missing(X, 'most_frequent', False) # X_fill = fill_missing(X, 'most_frequent', True) X_pre_fill = X_fill[num_train::] X_fill = X_fill[0:num_train] X_train, X_test, y_train, y_test = train_test_split(X_fill, y, test_size=0.2, random_state=4) print(y_train.shape, y_test.shape) ### use the logistic regression print('Train the logistic regression classifier') """ your code here """ lr_model = LogisticRegression(random_state=4) lr_model.fit(X_train, y_train) print(lr_model.score(X_test, y_test)) lr_pre = lr_model.predict(X_pre_fill) file = open('./predictions/lr_predictions.csv', 'w') file.write('UserID,Happy\n') for temp in range(0, lr_pre.shape[0]): file.write('%d' % X_pre_userId[temp]) file.write(',') file.write(str(lr_pre[temp])) file.write('\n') ### use the naive bayes print('Train the naive bayes classifier') """ your code here """ nb_model = GaussianNB() nb_model.fit(X_train, y_train) print(nb_model.score(X_test, y_test)) nb_pre = nb_model.predict(X_pre_fill) file = open('./predictions/nb_predictions.csv', 'w') file.write('UserID,Happy\n') for temp in range(0, nb_pre.shape[0]): file.write('%d' % X_pre_userId[temp]) file.write(',') file.write(str(nb_pre[temp])) file.write('\n') ## use the svm print('Train the SVM classifier') """ your code here """ svm_model = svm.SVC(kernel='linear', random_state=0) svm_model.fit(X_train, y_train) print(svm_model.score(X_test, y_test)) svm_pre = svm_model.predict(X_pre_fill) file = open('./predictions/svm_predictions.csv', 'w') file.write('UserID,Happy\n') for temp in range(0, svm_pre.shape[0]): file.write('%d' % X_pre_userId[temp]) file.write(',') file.write(str(svm_pre[temp])) file.write('\n') ## use the random forest print('Train the random forest classifier') """ your code here """ rf_model = RandomForestClassifier(n_estimators=2600, random_state=4) rf_model = rf_model.fit(X_train, y_train) print(rf_model.score(X_test, y_test)) rf_pre = rf_model.predict(X_pre_fill) file = open('./predictions/rf_predictions.csv', 'w') file.write('UserID,Happy\n') for temp in range(0, rf_pre.shape[0]): file.write('%d' % X_pre_userId[temp]) file.write(',') file.write(str(rf_pre[temp])) file.write('\n') ## get predictions """ your code here """
import time

import numpy as np
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB

from preprocess import transform, fill_missing
# LogitR, NaiveBayes, and cross_validation are this project's own helpers;
# import them from wherever they live in your module layout.


def main():
    # load training data
    filename_train = './data/train.csv'
    train_dataset = transform(filename_train)
    X = train_dataset['data']
    y = train_dataset['target']

    # fill in missing data (optional)
    X_full, discard_row = fill_missing(X, 'most_frequent', True)
    y = np.delete(y, discard_row)
    n_samples, n_features = X_full.shape

    ### -------------------- use the logistic regression --------------------
    print('\n\nTrain the logistic regression classifier')

    # Sklearn package (validation fraction 0.08)
    train_X, train_y, valid_X, valid_y = cross_validation(0.08, X_full, y)
    lr_model_time1 = time.time()
    lr_model = LogisticRegression()
    lr_model = lr_model.fit(train_X, train_y)
    lr_model_time2 = time.time()
    print("Sklearn LR validation score: {0}".format(lr_model.score(valid_X, valid_y)))
    print("Sklearn LR training time: %.3f s" % (lr_model_time2 - lr_model_time1))
    #print("Sklearn LR learnt coef:\n{0},\n{1}".format(lr_model.coef_[:, :5], lr_model.intercept_))

    # Self-implemented (validation fraction 0.15)
    train_X, train_y, valid_X, valid_y = cross_validation(0.15, X_full, y)
    self_lr_time1 = time.time()
    self_lr = LogitR()
    self_lr = self_lr.fit(train_X, train_y)
    self_lr_time2 = time.time()
    print("Self LR validation score: {0}".format(self_lr.score(valid_X, valid_y)))
    print("Self LR training time: %.3f s" % (self_lr_time2 - self_lr_time1))
    #print("Self LR learnt coef:\n{0},\n{1}".format(self_lr.coef[:5], self_lr.intercept))
    ### -------------------- use the logistic regression --------------------

    ### -------------------- use the naive bayes --------------------
    print('\n\nTrain the naive bayes classifier')

    # Sklearn package (validation score ~0.676 at a 0.1 validation fraction)
    train_X, train_y, valid_X, valid_y = cross_validation(0.1, X_full, y)
    nb_model_time1 = time.time()
    nb_model = BernoulliNB()
    nb_model.fit(train_X, train_y)
    nb_model_time2 = time.time()
    print("Sklearn NB validation score: {0}".format(nb_model.score(valid_X, valid_y)))
    print("Sklearn NB training time: %.3f s" % (nb_model_time2 - nb_model_time1))
    #sk_y_predict = nb_model.predict(X_full[1800:, 1:n_features-1])

    # Self-implemented (validation score ~0.576 at a 0.118 validation fraction)
    train_X, train_y, valid_X, valid_y = cross_validation(0.118, X_full, y)
    self_nb_time1 = time.time()
    self_nb = NaiveBayes()
    self_nb = self_nb.fit(train_X, train_y)
    self_nb_time2 = time.time()
    print("Self NB validation score: {0}".format(self_nb.score(valid_X, valid_y)))
    print("Self NB training time: %.3f s" % (self_nb_time2 - self_nb_time1))
    #self_y_predict = self_nb.predict(X_full[1800:, 1:n_features-1])
    ### -------------------- use the naive bayes --------------------

    ### -------------------- use svm --------------------
    print('\n\nTrain the SVM classifier')
    # linear, poly, rbf, or precomputed (or self-defined)?
    train_X, train_y, valid_X, valid_y = cross_validation(0.17, X_full, y)
    svm_model_time1 = time.time()
    svm_model = svm.SVC(kernel="linear")
    # kernel scores at their best validation fractions:
    #   rbf     0.682 (fraction 0.113)
    #   sigmoid 0.577 (fraction 0.23)
    #   poly    0.685 (fraction 0.16)
    #   linear  0.701 (fraction 0.17)
    svm_model.fit(train_X, train_y)
    print("train_X:", train_X.shape)
    print("train_y:", train_y.shape)
    svm_model_time2 = time.time()
    print("Sklearn SVM validation score: {0}".format(svm_model.score(valid_X, valid_y)))
    print("Sklearn SVM training time: %.3f s" % (svm_model_time2 - svm_model_time1))
    ### -------------------- use svm --------------------

    ### -------------------- use random forest --------------------
    print('\n\nTrain the random forest classifier')

    # validation score ~0.702 at a 0.151 validation fraction
    train_X, train_y, valid_X, valid_y = cross_validation(0.151, X_full, y)
    rf_model_time1 = time.time()
    rf_model = RandomForestClassifier(n_estimators=29)
    rf_model.fit(train_X, train_y)
    rf_model_time2 = time.time()
    print("Sklearn RF validation score: {0}".format(rf_model.score(valid_X, valid_y)))
    print("Sklearn RF training time: %.3f s" % (rf_model_time2 - rf_model_time1))
    ### -------------------- use random forest --------------------

    ## get predictions
    """ your code here """


if __name__ == '__main__':
    main()
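# `cross_validation` is a project-local helper that this file never defines. A
# minimal sketch consistent with how it is called above -- a plain shuffled
# holdout split whose first argument is the validation fraction (assumed
# behavior; the name cross_validation_sketch is ours):
def cross_validation_sketch(valid_frac, X, y):
    n = X.shape[0]
    n_valid = int(n * valid_frac)
    idx = np.random.permutation(n)  # shuffle so the holdout is not just the tail
    valid_idx, train_idx = idx[:n_valid], idx[n_valid:]
    return X[train_idx], y[train_idx], X[valid_idx], y[valid_idx]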
import numpy as np
import pandas as pd
from sklearn import svm

from preprocess import transform, fill_missing

income_id = 3
educate_id = 10
h = 0.2  # mesh step size

filename_train = '../data/train.csv'
train_dataset = transform(filename_train)
X = train_dataset['data']
y = train_dataset['target']

'''
row_idx = X[:, 0]
mat = X[:, income_id+1:educate_id+2:educate_id-income_id]
col_name = ['Income', 'EducationLevel']
X_show = pd.DataFrame(data=mat, index=row_idx, columns=col_name)
'''

X_full, discard_row = fill_missing(X, 'most_frequent', True)
#X_full = X_full[:, 1:X_full.shape[1]-1]
# keep only the Income and EducationLevel columns
X_full = X_full[:, income_id:educate_id + 1:educate_id - income_id]
y_full = np.delete(y, discard_row)

# mesh bounds over the two retained features
#x_min, x_max = min(X_full[:, income_id]) - 1, max(X_full[:, income_id]) + 1
#y_min, y_max = min(X_full[:, educate_id]) - 1, max(X_full[:, educate_id]) + 1
x_min, x_max = min(X_full[:, 0]) - 1, max(X_full[:, 0]) + 1
y_min, y_max = min(X_full[:, 1]) - 1, max(X_full[:, 1]) + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

svm_model = svm.SVC(kernel="rbf")
#svm_model = svm.LinearSVC(C=1)
svm_model.fit(X_full, y_full)
print("SVM end")
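# The meshgrid built above is never used in this fragment. A minimal sketch of
# the decision-boundary plot it appears to set up (assumed continuation):
# classify every grid point and draw the regions with the training points on top.
from matplotlib import pyplot as plt

Z = svm_model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape).astype(float)
plt.contourf(xx, yy, Z, alpha=0.3)                           # decision regions
plt.scatter(X_full[:, 0], X_full[:, 1], c=y_full, edgecolors='k')
plt.xlabel('Income')
plt.ylabel('EducationLevel')
plt.title('SVM (rbf) decision boundary')
plt.show()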
data[:, n][data[:, n] == 'Domestic Partners (w/kids)'] = 5

n = df.columns.get_loc("EducationLevel")
data[:, n][data[:, n] == 'Current K-12'] = 0
data[:, n][data[:, n] == 'High School Diploma'] = 1
data[:, n][data[:, n] == 'Current Undergraduate'] = 2
data[:, n][data[:, n] == "Associate's Degree"] = 3
data[:, n][data[:, n] == "Bachelor's Degree"] = 4
data[:, n][data[:, n] == "Master's Degree"] = 5
data[:, n][data[:, n] == 'Doctoral Degree'] = 6

# encode the remaining non-numeric columns as integer category codes
for n in range(df.shape[1]):
    if df.iloc[:, n].dtypes != np.int64 and df.iloc[:, n].dtypes != np.float64:
        g = pd.get_dummies(data[:, n])
        i = 0
        for e in list(g):
            data[:, n][data[:, n] == e] = i
            i = i + 1

X_full = fill_missing(data, 'mode', False)

# min-max scale each centered feature before PCA
mins = np.min(X_full, axis=0)
maxs = np.max(X_full, axis=0)
X_full = (X_full - np.mean(X_full, axis=0)) / (maxs - mins)

pca = PCA()
X_reduced = pca.fit_transform(X_full)

fig = plt.figure('PCA and biplot')
biplot(X_reduced, pca.components_, 1, 2, list(df))
plt.show()
fig.savefig('PCA and biplot.jpg')
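# `biplot` is called above but never defined in this fragment. A minimal
# sketch of a conventional biplot consistent with that call signature --
# scores, loadings, two 1-indexed PC numbers, and feature labels; the exact
# behavior of the project's version is assumed, and the name biplot_sketch is ours:
def biplot_sketch(scores, loadings, pc_x, pc_y, labels):
    xs, ys = scores[:, pc_x - 1], scores[:, pc_y - 1]
    plt.scatter(xs, ys, s=5)  # samples in PC space
    for i, name in enumerate(labels):
        # rows of `loadings` are components; columns are the original features
        lx, ly = loadings[pc_x - 1, i], loadings[pc_y - 1, i]
        plt.arrow(0, 0, lx, ly, color='r', alpha=0.5)
        plt.text(lx * 1.1, ly * 1.1, name, color='r', fontsize=8)
    plt.xlabel('PC%d' % pc_x)
    plt.ylabel('PC%d' % pc_y)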
import numpy as np
import pandas as pd
from sklearn import svm

import preprocess

filename = '../data/train.csv'
dataset = preprocess.transform(filename)
#dataset = preprocess.fill_missing(dataset, strategy='most_frequent', isClassified=False)
X = dataset['data']
y = dataset['target']

# keep only the two features of interest and drop rows with NaN
total = pd.concat([X, y], axis=1)
total = total.dropna()
#X = total.drop('Happy', axis=1)
total = total[['Income', 'EducationLevel', 'Happy']].dropna()
total = preprocess.fill_missing(total, strategy='most_frequent', isClassified=False)
y = total['Happy']
X = total[['Income', 'EducationLevel']]
X = np.array(X)
y = np.array(y)

# train svm model
'''
svm_model = svm.SVC(kernel='rbf')
svm_model = svm_model.fit(X, y)
y_predict_svm = svm_model.predict(X)
'''
C = 1.0  # SVM regularization parameter
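# A minimal sketch of the kernel comparison this script appears to set up,
# following the standard scikit-learn pattern; the exact set of models and
# hyperparameters compared in the original is our assumption:
models = {
    'SVC (linear)': svm.SVC(kernel='linear', C=C),
    'LinearSVC': svm.LinearSVC(C=C),
    'SVC (rbf)': svm.SVC(kernel='rbf', gamma=0.7, C=C),
    'SVC (poly, degree 3)': svm.SVC(kernel='poly', degree=3, C=C),
}
for name, model in models.items():
    model.fit(X, y)
    print('%s training accuracy: %.3f' % (name, model.score(X, y)))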
data[:, n][data[:, n] == '$50,000 - $74,999'] = 2
data[:, n][data[:, n] == '$75,000 - $100,000'] = 3
data[:, n][data[:, n] == '$100,001 - $150,000'] = 4
data[:, n][data[:, n] == 'over $150,000'] = 5

n = df.columns.get_loc("EducationLevel")
data[:, n][data[:, n] == 'Current K-12'] = 0
data[:, n][data[:, n] == 'High School Diploma'] = 1
data[:, n][data[:, n] == 'Current Undergraduate'] = 2
data[:, n][data[:, n] == "Associate's Degree"] = 3
data[:, n][data[:, n] == "Bachelor's Degree"] = 4
data[:, n][data[:, n] == "Master's Degree"] = 5
data[:, n][data[:, n] == 'Doctoral Degree'] = 6

X = data
X = fill_missing(X, 'mode', False)

h = .2  # step size in the mesh

# we create an instance of SVM and fit our data; we do not scale the
# data because we want to plot the support vectors
C = 1.0  # SVM regularization parameter
svc = svm.LinearSVC(C=C).fit(X, y)

# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# classify every point of the mesh
Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
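# A minimal sketch of the color plot the comment above introduces, following
# the standard scikit-learn decision-surface recipe; the styling and the axis
# names (taken from the related two-feature scripts) are our assumptions:
from matplotlib import pyplot as plt

Z = Z.reshape(xx.shape).astype(float)  # cast in case labels are object dtype
plt.contourf(xx, yy, Z, alpha=0.8)                   # decision regions
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k')   # training points on top
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xlabel('Income')
plt.ylabel('EducationLevel')
plt.title('LinearSVC decision surface')
plt.show()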