def runBestRegressionModelKFoldwFS(dataSets=[], regModels=[], names=[]):
    """Select the best regressor per dataset via 10-fold CV after univariate feature selection."""
    myResults = {}
    for ds in dataSets:
        myData, myTrain, myVal = dataEncoding(ds, taskID='filesReg')
        # Keep the 5 features most correlated with the target.
        myTrain = skb(f_regression, k=5).fit_transform(myTrain, myVal)
        splits = kf(n_splits=10, shuffle=True, random_state=42)
        bestRMSE = float("inf")
        bestIndex = -1
        for count, reg in enumerate(regModels):
            reg.fit(myTrain, myVal)
            cvsScores = cvs(reg, myTrain, myVal, cv=splits,
                            scoring='neg_mean_squared_error')
            rmse = np.sqrt(-1 * cvsScores.mean())
            print(regsNames[names[count]], rmse)
            if rmse < bestRMSE:
                bestRMSE = rmse
                bestIndex = count
                bestScores = cvsScores
        # Report the winning model, not whichever model happened to be fitted last.
        best = regModels[bestIndex]
        print(filesReg[ds], regsNames[names[bestIndex]], bestRMSE)
        myResults[filesReg[ds]] = {1: regsNames[names[bestIndex]],
                                   2: best.intercept_, 3: best.coef_,
                                   4: np.exp(best.coef_), 5: bestScores,
                                   6: bestRMSE}
        print('\n')
    return myResults
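# The runners in this file rely on short aliases and on project-level helpers
# defined elsewhere (dataEncoding, regsNames, RegsCompNames, filesReg are not
# reconstructed here). A minimal sketch of the aliases, assuming they map to
# scikit-learn names as their call signatures suggest:
import numpy as np
from sklearn.model_selection import KFold as kf, cross_val_score as cvs
from sklearn.feature_selection import SelectKBest as skb, f_regression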
def runBestRegsCompKFold(dataSets=[], regModels=[], names=[]):
    """Compare regressors per dataset via 10-fold CV on min-max scaled numeric features."""
    myResults = {}
    for ds in dataSets:
        myData, myTrain, myVal = dataEncoding(ds, taskID='filesReg')
        # myTrain = skb(f_regression, k=3).fit_transform(myTrain, myVal)
        for name in myTrain.columns:
            if myTrain[name].dtype != 'O':
                myTrain[name] = pre.minmax_scale(myTrain[name].astype('float'))
        splits = kf(n_splits=10, shuffle=True, random_state=42)
        bestRMSE = float("inf")
        bestIndex = -1
        for count, reg in enumerate(regModels):
            reg.fit(myTrain, myVal)
            cvsScores = cvs(reg, myTrain, myVal, cv=splits,
                            scoring='neg_mean_squared_error')
            rmse = np.sqrt(-1 * cvsScores.mean())
            print(RegsCompNames[names[count]], rmse)
            if rmse < bestRMSE:
                bestRMSE = rmse
                bestIndex = count
                bestScores = cvsScores
        print(filesReg[ds], RegsCompNames[names[bestIndex]], bestRMSE)
        myResults[filesReg[ds]] = {1: RegsCompNames[names[bestIndex]],
                                   2: bestScores, 3: bestRMSE}
        print('\n')
    return myResults
def runBestRegressionModelKFoldPrintFolderErrors(dataSets=[], regModels=[], names=[]):
    """Like the CV runners above, but computes fold errors by hand and prints each residual vector."""
    myResults = {}
    for ds in dataSets:
        myData, myTrain, myVal = dataEncoding(ds, taskID='filesReg')
        splits = kf(n_splits=2, shuffle=True, random_state=42)
        bestRMSE = float("inf")
        bestIndex = -1
        for count, reg in enumerate(regModels):
            xval_err = 0
            for train, test in splits.split(myTrain):
                # .iloc replaces the deprecated .ix indexer.
                reg.fit(myTrain.iloc[train], myVal.iloc[train])
                p = reg.predict(myTrain.iloc[test])
                e = p - myVal.iloc[test]
                print(e)
                xval_err += np.dot(e, e)
            # Renamed from rmse_10cv: only 2 splits are used here.
            rmse_cv = np.sqrt(xval_err / len(myTrain))
            print(rmse_cv)
            input("Press any key")
            if rmse_cv < bestRMSE:
                bestRMSE = rmse_cv
                bestIndex = count
        best = regModels[bestIndex]
        print(filesReg[ds], regsNames[names[bestIndex]], bestRMSE)
        myResults[filesReg[ds]] = {1: regsNames[names[bestIndex]],
                                   2: best.intercept_, 3: best.coef_,
                                   4: np.exp(best.coef_), 5: bestRMSE}
        print('\n')
    return myResults
def main():
    # train, validation = train_test_split(train_full, test_size=0.2, random_state=None)
    train_full = pd.read_csv('D:/Sem1/INF552/Project/Data/Final1/train.csv',
                             converters={'Date Occurred': date_time_converter,
                                         'Time Occurred': format_time,
                                         'Crime Code': format_crime_code})
    print("Completed reading csv.....")
    train_full = train_full[np.isfinite(train_full['Date Occurred'])]
    # train_full = train_full[~np.isnan(train_full['Time Occurred'])]
    train_full = train_full[~np.isnan(train_full['Crime Code'])]
    print(len(train_full))
    X_full = train_full.iloc[:, [3, 4, 8]].values
    y_full = train_full.iloc[:, 5].values
    # Divide the training data into 10 train/dev pairs.
    # random_state is dropped because it only takes effect when shuffle=True.
    kfn = kf(n_splits=10, shuffle=False)
    kfn.get_n_splits(X_full)
    i = 0
    for train_index, dev_index in kfn.split(X_full):
        # print("train:", train_index, "dev:", dev_index)
        i += 1
        print("In iteration:", i)
        X_train, X_dev = X_full[train_index], X_full[dev_index]
        y_train, y_dev = y_full[train_index], y_full[dev_index]
        apply_models(X_train, y_train, X_dev, y_dev)
def runRidgeRegressiontoEstAlpha(dataSets=[]):
    from sklearn.linear_model import Ridge
    print('Ridge Regression')
    print('alpha\t RMSE_train\t RMSE_10cv\n')
    alpha = np.linspace(.01, 20, 50)
    for ds in dataSets:
        myData, myTrain, myVal = dataEncoding(ds, taskID='filesReg')
        # for name in myTrain.columns:
        #     if myTrain[name].dtype != 'O':
        #         myTrain[name] = pre.minmax_scale(myTrain[name].astype('float'))
        t_rmse = np.array([])
        cv_rmse = np.array([])
        for a in alpha:
            ridge = Ridge(fit_intercept=True, alpha=a)
            # Training error on the full set.
            ridge.fit(myTrain, myVal)
            p = ridge.predict(myTrain)
            err = p - myVal
            total_error = np.dot(err, err)
            rmse_train = np.sqrt(total_error / len(p))
            # 10-fold cross-validated error.
            splits = kf(n_splits=10, shuffle=True, random_state=42)
            xval_err = 0
            for train, test in splits.split(myTrain):
                # .iloc replaces the deprecated .ix indexer.
                ridge.fit(myTrain.iloc[train], myVal.iloc[train])
                p = ridge.predict(myTrain.iloc[test])
                e = p - myVal.iloc[test]
                xval_err += np.dot(e, e)
            rmse_10cv = np.sqrt(xval_err / len(myTrain))
            t_rmse = np.append(t_rmse, [rmse_train])
            cv_rmse = np.append(cv_rmse, [rmse_10cv])
            print('{:.3f}\t {:.4f}\t\t {:.4f}'.format(a, rmse_train, rmse_10cv))
        input("Press Any Key")
        plt.plot(alpha, t_rmse, label='RMSE-Train')
        plt.plot(alpha, cv_rmse, label='RMSE_CV')
        plt.legend(('RMSE-Train', 'RMSE_XCV'))
        plt.ylabel('RMSE')
        plt.xlabel('Alpha')
        plt.show()
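# The alpha sweep above can also be done with scikit-learn's built-in RidgeCV,
# which evaluates each alpha under cross-validation internally. A minimal
# sketch, assuming myTrain is a feature DataFrame and myVal the target Series:
import numpy as np
from sklearn.linear_model import RidgeCV

def estimate_alpha_with_ridgecv(myTrain, myVal):
    # Same grid as the manual loop; cv=10 mirrors the 10-fold estimate.
    model = RidgeCV(alphas=np.linspace(.01, 20, 50), cv=10)
    model.fit(myTrain, myVal)
    return model.alpha_  # best alpha under cross-validation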
def cross_validation_svm(self):
    """Run cross-validated grid search for the SVM classifier."""
    print("Searching for the best SVM Classifier parameters...")
    C_value = [0.5, 1, 1.5]
    kernel = ['linear', 'poly', 'rbf', 'sigmoid']
    param_grid = dict(C=C_value, kernel=kernel)
    model = SVC()
    # shuffle=True is required for random_state to take effect.
    kfold = kf(n_splits=self.num_folds, shuffle=True, random_state=self.seed)
    grid = gridsearchcv(estimator=model, param_grid=param_grid,
                        scoring=self.scoring, cv=kfold)
    grid_result = grid.fit(self.x_data_scale, self.data_y_train)
    self.C_svm = grid_result.best_params_['C']
    self.kernel = grid_result.best_params_['kernel']
    print("...Done")

def cross_validation_ada_boost(self):
    """Run cross-validated grid search for the AdaBoost classifier."""
    print("Searching for the best AdaBoost Classifier parameters...")
    n_estimators = [25, 50, 75]
    learning_rate_ada = [0.5, 1, 1.5]
    grid_param = dict(n_estimators=n_estimators, learning_rate=learning_rate_ada)
    model = AdaBoostClassifier()
    kfold = kf(n_splits=self.num_folds, shuffle=True, random_state=self.seed)
    grid = gridsearchcv(estimator=model, param_grid=grid_param,
                        scoring=self.scoring, cv=kfold)
    grid_result = grid.fit(self.x_data_scale, self.data_y_train)
    self.n_estimators = grid_result.best_params_['n_estimators']
    self.learning_rate_ada = grid_result.best_params_['learning_rate']
    print("...Done")

def cross_validation_knn(self):
    """Run cross-validated grid search for the K Nearest Neighbour classifier."""
    print("Searching for the best K Nearest Neighbour Classifier parameters...")
    neighbors = [1, 3, 5, 7, 9]
    grid_param = dict(n_neighbors=neighbors)
    model = KNeighborsClassifier()
    kfold = kf(n_splits=self.num_folds, shuffle=True, random_state=self.seed)
    grid = gridsearchcv(estimator=model, param_grid=grid_param,
                        scoring=self.scoring, cv=kfold)
    grid_result = grid.fit(self.x_data_scale, self.data_y_train)
    self.n_neighbors = grid_result.best_params_['n_neighbors']
    print("...Done")

def cross_validation_random_forest(self):
    """Run cross-validated grid search for the Random Forest classifier."""
    print("Searching for the best Random Forest Classifier parameters...")
    max_depth = [10, 50, 100]
    bootstrap = [True, False]
    grid_param = dict(max_depth=max_depth, bootstrap=bootstrap)
    model = RandomForestClassifier()
    kfold = kf(n_splits=self.num_folds, shuffle=True, random_state=self.seed)
    grid = gridsearchcv(estimator=model, param_grid=grid_param,
                        scoring=self.scoring, cv=kfold)
    grid_result = grid.fit(self.x_data_scale, self.data_y_train)
    self.max_depths = grid_result.best_params_['max_depth']
    self.bootstrap = grid_result.best_params_['bootstrap']
    print("...Done")

def cross_validation_nn(self):
    """Run cross-validated grid search for the Neural Network classifier."""
    print("Searching for the best Neural Network Classifier parameters...")
    solver = ['lbfgs', 'sgd', 'adam']
    learning_rate = ['constant', 'invscaling', 'adaptive']
    param_grid = dict(solver=solver, learning_rate=learning_rate)
    model = MLPClassifier()
    kfold = kf(n_splits=self.num_folds, shuffle=True, random_state=self.seed)
    grid = gridsearchcv(estimator=model, param_grid=param_grid,
                        scoring=self.scoring, cv=kfold)
    grid_result = grid.fit(self.x_data_scale, self.data_y_train)
    self.solver = grid_result.best_params_['solver']
    self.learning_rate = grid_result.best_params_['learning_rate']
    print("...Done")

def cross_validation_logistic_regression(self):
    """Run cross-validated grid search for the Logistic Regression classifier."""
    print("Searching for the best Logistic Regression Classifier parameters...")
    C = [1, 10, 50]
    tol = [0.005, 0.003, 0.001]
    grid_param = dict(C=C, tol=tol)
    model = LogisticRegression(solver='newton-cg', multi_class='multinomial')
    kfold = kf(n_splits=self.num_folds, shuffle=True, random_state=self.seed)
    grid = gridsearchcv(estimator=model, param_grid=grid_param,
                        scoring=self.scoring, cv=kfold)
    grid_result = grid.fit(self.x_data_scale, self.data_y_train)
    self.C = grid_result.best_params_['C']
    self.tol = grid_result.best_params_['tol']
    print("...Done")
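# The six cross_validation_* methods above share one skeleton: build a grid,
# run gridsearchcv over a KFold, and store best_params_. A standalone sketch
# of that shared pattern on toy data (the class attributes num_folds, seed,
# scoring, x_data_scale, data_y_train are replaced by local values here):
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
grid = GridSearchCV(estimator=SVC(),
                    param_grid={'C': [0.5, 1, 1.5], 'kernel': ['linear', 'rbf']},
                    scoring='accuracy', cv=kfold)
print(grid.fit(X, y).best_params_)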
def __init__(self, Cs=500, cv=10, sampler='skf', solver='liblinear', **kwargs):
    super(self.__class__, self).__init__()
    self.penalty = 'l1'
    self.solver = solver
    self.Cs = Cs
    self.sampler = sampler
    self.cv_folds = cv
    if self.sampler == 'skf':
        self.cv = skf(n_splits=self.cv_folds)
    elif self.sampler == 'sss':
        self.cv = sss(n_splits=self.cv_folds)
    elif self.sampler == 'kf':
        self.cv = kf(n_splits=self.cv_folds)
    elif self.sampler == 'ss':
        self.cv = ss(n_splits=self.cv_folds)
    else:
        # The codes below match the branches above ("kf", not "sk").
        raise Exception(
            'Selected sampler is not valid. Please choose "skf" for '
            'stratified K-fold or "sss" for stratified shuffle split; '
            '"kf" and "ss" select the respective non-stratified methods.')
    for k, v in kwargs.items():
        setattr(self, k, v)
    self.x = None
    self.y = None
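# The sampler switch above maps short codes to scikit-learn splitters. A
# minimal sketch of the assumed alias imports and of picking a splitter
# (the enclosing class itself is not reconstructed here):
from sklearn.model_selection import (StratifiedKFold as skf,
                                     StratifiedShuffleSplit as sss,
                                     KFold as kf, ShuffleSplit as ss)

samplers = {'skf': skf, 'sss': sss, 'kf': kf, 'ss': ss}
cv = samplers['skf'](n_splits=10)  # e.g. stratified 10-fold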
def kFoldValidatoin(self, XMap, Y, k=10, classNum=2):
    import copy
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    # Shuffle all feature matrices and labels with the same permutation.
    randomIndex = random.sample(range(len(Y)), len(Y))
    for clfName in XMap:
        XMap[clfName] = XMap[clfName][randomIndex]
    y = Y[randomIndex]
    cmTotal = np.zeros((classNum, classNum))
    # shuffle=True is required for random_state to take effect.
    index = kf(n_splits=k, shuffle=True, random_state=666).split(list(range(len(Y))))
    for trainIndex, testIndex in index:
        trainX, testX = {}, {}
        trainy = y[trainIndex]
        testy = y[testIndex]
        for clfName in XMap:
            # LDA can keep at most n_classes - 1 components, so cap the
            # original n_components=100 accordingly.
            featureProcessor = LinearDiscriminantAnalysis(
                n_components=min(100, classNum - 1))
            trainX[clfName] = XMap[clfName][trainIndex]
            testX[clfName] = XMap[clfName][testIndex]
            featureProcessor.fit(trainX[clfName], trainy)
            trainX[clfName] = featureProcessor.transform(trainX[clfName])
            testX[clfName] = featureProcessor.transform(testX[clfName])
        # Start each fold from fresh copies of the base and meta models.
        self.baseModels = copy.deepcopy(self.oriBaseModels)
        self.metaModel = copy.deepcopy(self.oriMetaModel)
        self.fit(trainX, trainy)
        print("Training set performance:")
        y_train = self.predict(trainX)
        cm = confusion_matrix(trainy, y_train)
        print(cm)
        y_pred = self.predict(testX)
        cm = confusion_matrix(testy, y_pred)
        cmTotal += np.array(cm)
        print("Test set performance:")
        print(cm)
        print(cmTotal)  # running total over folds
    return cmTotal
def Gaussian_Naive_Bayes(test_set):
    """Return classification accuracy (in percent) of predict_class_gnb on test_set."""
    errors = 0
    for i in range(0, test_set['class_label'].count()):
        class_value = predict_class_gnb(test_set.iloc[i])
        if class_value != test_set.iloc[i]['class_label']:
            errors += 1
    return (test_set['class_label'].count() - errors) * 100 / test_set['class_label'].count()


data = pd.read_csv('bank_data', header=None,
                   names=['f1', 'f2', 'f3', 'f4', 'class_label'])
fraction_list = [.01, .02, .05, .1, .625, 1]
# Keyword arguments spell out what the positional kf(3, True, 1) meant.
kfold = kf(n_splits=3, shuffle=True, random_state=1)
gnb_sum_kfold = {}
for i in fraction_list:
    gnb_sum_kfold[i] = 0.0
for tr_ind, te_ind in kfold.split(data):
    for fraction in fraction_list:
        gnb_sum_acc = 0
        for ii in range(0, 5):
            train_set = data.iloc[tr_ind].sample(frac=fraction)
            test_set = data.iloc[te_ind]
            frequency = train_set['class_label'].value_counts()
            distribution_mean = train_set.groupby('class_label').mean()
            distribution_variance = train_set.groupby('class_label').var()
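# predict_class_gnb is called above but not defined in this snippet. A
# hypothetical reconstruction of what it plausibly does, assuming the
# module-level `frequency`, `distribution_mean`, and `distribution_variance`
# computed in the loop above (not the original implementation):
import numpy as np

def predict_class_gnb(row, features=('f1', 'f2', 'f3', 'f4')):
    best_class, best_score = None, -np.inf
    for c in frequency.index:
        # Log prior plus the sum of per-feature Gaussian log-likelihoods.
        score = np.log(frequency[c] / frequency.sum())
        for f in features:
            mu = distribution_mean.loc[c, f]
            var = distribution_variance.loc[c, f]
            score += -0.5 * np.log(2 * np.pi * var) - (row[f] - mu) ** 2 / (2 * var)
        if score > best_score:
            best_class, best_score = c, score
    return best_class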
def testClassifier(featureCollectionName="", scale=True):
    def scala(x):
        # Binarize: 1 where a feature value is positive, else 0.
        res = []
        for i in range(len(x)):
            temp = []
            for n in x[i]:
                v = 1 if n > 0 else 0
                temp.append(v)
            res.append(temp)
        return np.array(res)

    print("Reading data")
    collection = db[featureCollectionName]
    data = collection.find({})  # Query Mongo for these features plus gender labels.
    data = list(data)  # [:500]
    df = pd.DataFrame(data)
    dfClean = df.drop(columns=['gender', '_id', 'uid'])
    X, Y = dfClean.values, df['gender']
    if scale:
        X = scala(X)
    print("Starting cross-validation")
    # shuffle=True is required for random_state to take effect.
    index = kf(n_splits=10, shuffle=True, random_state=666).split(list(range(len(Y))))
    cmTotal = np.zeros((2, 2))
    count = 0
    for trainIndex, testIndex in index:
        # clf = LogisticRegression(max_iter=50, solver='newton-cg', C=0.1)  # ~0.75 on term frequencies
        clf = LogisticRegression(max_iter=50, solver='newton-cg', C=0.1,
                                 class_weight={0: 0.5, 1: 0.7})  # ~0.75 on term frequencies
        # clf = DecisionTreeClassifier()
        # clf = RandomForestClassifier()
        # clf = MLPClassifier(hidden_layer_sizes=(200, 50))
        # clf = SVC(C=0.8)
        trainX = X[trainIndex]
        testX = X[testIndex]
        trainy = Y[trainIndex]
        testy = Y[testIndex]
        # from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
        # featureProcessor = LinearDiscriminantAnalysis(n_components=500)
        # featureProcessor.fit(trainX, trainy)
        # trainX = featureProcessor.transform(trainX)
        # testX = featureProcessor.transform(testX)
        clf.fit(trainX, trainy)
        count += 1
        print(count, "Training set performance:")
        y_train = clf.predict(trainX)
        cm = confusion_matrix(trainy, y_train)
        print(cm)
        y_pred = clf.predict(testX)
        cm = confusion_matrix(testy, y_pred)
        cmTotal += np.array(cm)
        print(count, "Test set performance:")
        print(cm)
        # Overwritten each fold; only the last fold's model survives.
        pickle.dump(clf, open('genderClassfier.pkl', 'wb'))
        # dataList = pickle.load(open('genderClassfier.pkl', 'rb'))
    print(cmTotal)
    print((cmTotal[0, 0] + cmTotal[1, 1]) / (sum(sum(cmTotal))))
    print("Recall:", cmTotal[1, 1] / (cmTotal[1, 0] + cmTotal[1, 1]),
          "Precision:", cmTotal[1, 1] / (cmTotal[0, 1] + cmTotal[1, 1]))
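# The hand-rolled scala() helper above binarizes features (1 where a value is
# positive). scikit-learn's Binarizer does the same in one vectorized call; a
# minimal equivalent sketch:
import numpy as np
from sklearn.preprocessing import Binarizer

X = np.array([[0.0, 2.5], [3.1, 0.0]])
X_bin = Binarizer(threshold=0.0).fit_transform(X)  # 1 where value > 0
print(X_bin)  # [[0. 1.] [1. 0.]]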
def testClassifierComplexFeatures():
    featureNames = ['lastLoginTime', 'registerTime', 'homeTeams', 'fans',
                    'theOrg', 'follow', 'location', 'uid', 'onlineTime',
                    'userNumCame', 'communityRPScore', 'HPLevel', 'gender']
    featureNames = set(featureNames)
    featureNamesMap = {}
    for name in featureNames:
        featureNamesMap[name] = 1
    featureNamesMap['_id'] = 0
    print("Reading data")
    # , db['bigram'], db['postagBigramFreq']]
    uids = db[userOriFeatureCollection].find({}, featureNamesMap)
    dataList = []
    count = 0
    progressCount = 0
    for line in uids:
        progressCount += 1
        print("Reading user", progressCount)
        # print(set(uid.keys()) & featureNames)
        if len(set(line.keys()) & featureNames) < 7 or 'gender' not in line:
            continue
        # if line['gender'] == 'm' and len(dataList) > 0 and dataList[-1]['gender'] == 'f':
        #     pass
        # Downsample males to roughly balance the classes.
        if line['gender'] == 'm' and random.uniform(0, 1) > 0.05:
            continue
        count += 1
        if 'communityRPScore' in line and line['communityRPScore'] is not None \
                and line['communityRPScore'] < 0:
            line['communityRPBad'] = 1
        if 'userName' in line:
            # Keep the name's length as the feature (the original assigned the
            # length to `line` itself, clobbering the whole record).
            line['userName'] = len(line['userName'])
        if 'location' in line:
            line['location'] = 1
        if 'follow' in line:
            line['follow'] = len(line['follow'])
        if 'fans' in line:
            line['fans'] = len(line['fans'])
        if 'theOrg' in line:
            if line['theOrg'] == '小黑屋住户':  # "banned user" group label
                line['theOrgBlock'] = 1
            line['theOrg'] = 1
        if 'homeTeams' in line:
            line['homeTeams'] = len(line['homeTeams'])
        if 'registerTime' in line:
            line['registerTime'] = time2timestamp(line['registerTime']) - \
                time2timestamp('2003-1-1')
            line['registerTime'] = int(line['registerTime'] / 3600)
        if 'lastLoginTime' in line:
            line['lastLoginTime'] = int(time.time()) - time2timestamp(line['lastLoginTime'])
            line['lastLoginTime'] = int(line['lastLoginTime'] / 3600)
        # if count == 200000:
        #     break
        dataList.append(line)
    df = pd.DataFrame(dataList)
    dfClean = df[list(featureNames)].drop(columns=['gender', 'uid']).fillna(0)
    X, Y = dfClean.values, df['gender']
    print(Y)
    print("Starting cross-validation")
    # shuffle=True is required for random_state to take effect.
    index = kf(n_splits=10, shuffle=True, random_state=666).split(list(range(len(Y))))
    cmTotal = np.zeros((2, 2))
    count = 0
    for trainIndex, testIndex in index:
        # clf = LogisticRegression(max_iter=1000, solver='lbfgs', C=100)  # ~0.75 on term frequencies
        # clf = DecisionTreeClassifier()
        # clf = RandomForestClassifier()
        clf = RandomForestClassifier(n_estimators=200, max_depth=6,
                                     random_state=666, n_jobs=6)
        # clf = MLPClassifier(hidden_layer_sizes=(50, 10), max_iter=500, learning_rate_init=0.01)
        # clf = SVC(C=0.9)
        trainX = X[trainIndex]
        testX = X[testIndex]
        trainy = Y[trainIndex]
        testy = Y[testIndex]
        # from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
        # featureProcessor = LinearDiscriminantAnalysis(n_components=150)
        # featureProcessor.fit(trainX, trainy)
        # trainX = featureProcessor.transform(trainX)
        # testX = featureProcessor.transform(testX)
        clf.fit(trainX, trainy)
        count += 1
        print(count, "Training set performance:")
        y_train = clf.predict(trainX)
        cm = confusion_matrix(trainy, y_train)
        print(cm)
        y_pred = clf.predict(testX)
        cm = confusion_matrix(testy, y_pred)
        cmTotal += np.array(cm)
        print(count, "Test set performance:")
        print(cm)
    print(cmTotal)
    print((cmTotal[0, 0] + cmTotal[1, 1]) / (sum(sum(cmTotal))))
classifier = LogisticRegression(random_state=seed, solver='newton-cg',
                                multi_class='multinomial')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_dev)
cm = confusion_matrix(y_dev, y_pred)
ac = accuracy_score(y_dev, y_pred)
print("********************************************************************************")
print(classification_report(y_dev, y_pred))
print(cm)
print(ac)
print("********************************************************************************")

# train, validation = train_test_split(train_full, test_size=0.2, random_state=None)
train_full = pd.read_csv('D:/Sem1/INF552/Project/Data/Final1/train.csv')
train_full = train_full[np.isfinite(train_full['Victim Age'])]
X_full = train_full.iloc[:, [11]].values
y_full = train_full.iloc[:, 8].values
# Divide the training data into 10 train/dev pairs.
# random_state is dropped because it only takes effect when shuffle=True.
kfn = kf(n_splits=10, shuffle=False)
kfn.get_n_splits(X_full)
for train_index, dev_index in kfn.split(X_full):
    # print("train:", train_index, "dev:", dev_index)
    X_train, X_dev = X_full[train_index], X_full[dev_index]
    y_train, y_dev = y_full[train_index], y_full[dev_index]
    apply_models(X_train, y_train, X_dev, y_dev)
from sklearn.naive_bayes import GaussianNB as gnb, BernoulliNB as bnb

with open('train.json') as data:
    train = json.load(data)
cuisine = []
ingredients = []
for i in train:
    cuisine.append(i["cuisine"])
    ingredients.extend(i["ingredients"])
singredients = list(set(ingredients))
# One-hot encode each recipe over the full ingredient vocabulary.
traind = []
d = {singredients[i]: i for i in range(len(singredients))}
for i in train:
    row = [0] * len(singredients)
    for j in i["ingredients"]:
        row[d[j]] = 1
    traind.append(row)
k_fold = kf(n_splits=3)
ga = cvs(gnb(), traind, cuisine, cv=k_fold, n_jobs=-1)
ba = cvs(bnb(), traind, cuisine, cv=k_fold, n_jobs=-1)
# Open in text mode ('w', not 'wb') since we write str, and use print() for Python 3.
f = open('2d', 'w')
s = "Gaussian accuracy is: " + str(np.mean(ga))
print(s)
f.write(s)
s = "Bernoulli accuracy is: " + str(np.mean(ba))
print(s)
f.write(s)
f.close()
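# The manual one-hot loop above can be replaced by scikit-learn's
# MultiLabelBinarizer, which builds the ingredient vocabulary and the 0/1
# matrix in one step. A minimal sketch, assuming `train` is the parsed JSON:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
traind_alt = mlb.fit_transform([i["ingredients"] for i in train])
# Columns follow mlb.classes_ rather than the ad-hoc dict order above.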
def testClassifierLSTM(featureCollectionName="", scale=True):
    def scala(x):
        # Binarize: 1 where a feature value is positive, else 0.
        res = []
        for i in range(len(x)):
            temp = []
            for n in x[i]:
                v = 1 if n > 0 else 0
                temp.append(v)
            res.append(temp)
        return np.array(res)

    print("Reading data")
    collection = db[featureCollectionName]
    data = collection.find({})  # Query Mongo for these features plus gender labels.
    maleSpecialWords = ['武器库', 'UFC', '硬邦邦的', '龟头', '前臂', '尼玛比']
    maleSpecialWords = set(maleSpecialWords)
    femaleSpecialWords = ['小女子', '小宝贝', "美少年", '萌图', '治愈系', '防晒霜', '萌系']
    femaleSpecialWords = set(femaleSpecialWords)
    data = list(data)  # []
    maleData, femaleData, otherData = [], [], []
    for line in data:
        gender = decideGenderBySpecialWords(line, maleSpecialWords, femaleSpecialWords)
        if gender == 'male':
            maleData.append(line)
        elif gender == 'female':
            femaleData.append(line)
        else:
            otherData.append(line)
    print("Genders inferred from special words; male and female counts:",
          len(maleData), len(femaleData))
    print("Users remaining:", len(otherData))
    df = pd.DataFrame(otherData)
    # Drop any feature column whose name contains a digit.
    dropFeatureNames = []
    import re
    for line in df.columns:
        if len(re.findall('[0-9]', line)) > 0:
            dropFeatureNames.append(line)
    # dfClean = df.drop(columns=['gender', '_id', 'uid'])
    dfClean = df.drop(columns=dropFeatureNames + ['gender', '_id', 'uid'])
    X, Y = dfClean.values, df['gender']
    if scale:
        X = scala(X)
    Y = np.array(Y).reshape(-1, 1)
    X = X[:, :]
    # Y = list(map(lambda x: [x], Y))
    # Y = np.array(Y).reshape(-1, 1)
    oneHotEncoder4Y = OneHotEncoder().fit(Y)
    print("Starting cross-validation")
    # shuffle=True is required for random_state to take effect.
    index = kf(n_splits=10, shuffle=True, random_state=666).split(list(range(len(Y))))
    cmTotal = np.zeros((2, 2))
    count = 0
    for trainIndex, testIndex in index:
        num_feature = len(X[0])
        clf = deepLearning.LSTMClassifier(2, num_feature, learning_rate=1e-3,
                                          layer_num=1, hidden_size=100,
                                          timestep_size=100)
        clf.initGraph(ifDecrLR=True)
        trainX = X[trainIndex]
        testX = X[testIndex]
        trainy = Y[trainIndex]
        testy = Y[testIndex]
        clf.initOneHotEncoder4Y(trainy)
        batch_ys = oneHotEncoder4Y.transform(trainy).todense().astype(np.float32)
        batch_xs = np.array(trainX).astype(np.float32)
        for i in range(50):
            print("Fold", count, ", training epoch", i, ".", len(trainy))
            stepsize = 100
            # Mini-batch training over the fold's training split.
            for j in range(0, len(trainy), stepsize):
                batch_ys = clf.oneHotEncode(trainy[j:j + stepsize, :])
                batch_xs = np.array(trainX[j:j + stepsize, :]).astype(np.float32)
                clf.fit(batch_xs, batch_ys)
            print("Learning rate:", clf.learning_rate, clf.global_step)
            batch_ys = clf.oneHotEncode(trainy)
            batch_xs = np.array(trainX).astype(np.float32)
            pred_train = clf.test(batch_xs, batch_ys)
            pred_train = list(map(lambda x: 1 if x[0] < x[1] else 0, pred_train))
            pred_train = np.array(pred_train).reshape(-1, 1)
            # print(len(trainy), len(pred_train))
            cm = confusion_matrix(trainy, pred_train)
            print("Training confusion matrix", cm)
        count += 1
        batch_ys = clf.oneHotEncode(testy)
        batch_xs = np.array(testX).astype(np.float32)
        print(count, "Test set performance:")
        y_pred = clf.test(batch_xs, batch_ys)
        y_pred = list(map(lambda x: 1 if x[0] < x[1] else 0, y_pred))
        y_pred = np.array(y_pred).reshape(-1, 1)
        cm = confusion_matrix(testy, y_pred)
        print("Test confusion matrix", cm)
        print("Recall:", cm[1, 1] / (cm[1, 0] + cm[1, 1]),
              "Precision:", cm[1, 1] / (cm[0, 1] + cm[1, 1]))
        break  # only the first fold is evaluated
def testClassifierComplexFeatures(featureNames=[], scale=True):
    def scala(x):
        # Binarize: 1 where a feature value is positive, else 0.
        res = []
        for i in range(len(x)):
            temp = []
            for n in x[i]:
                v = 1 if n > 0 else 0
                temp.append(v)
            res.append(temp)
        return np.array(res)

    print("Reading data")
    dataCursorList = [db[name] for name in featureNames]  # , db['bigram'], db['postagBigramFreq']]
    uids = dataCursorList[0].find({}, {'uid': 1})
    dataList = []
    count = 0
    for uid in uids:
        uid = uid['uid']
        count += 1
        # if count == 200:
        #     break
        print("Reading user", count)
        # Merge this user's records from every feature collection; skip the
        # user if any collection has no record for them.
        dataTemp = {}
        flag = 0
        for i in range(len(dataCursorList)):
            cursor = dataCursorList[i]
            line = cursor.find_one({'uid': uid})
            if line is None:
                flag = 1
                break
            dataTemp.update(line)
        if flag == 1:
            continue
        dataList.append(dataTemp)
    df = pd.DataFrame(dataList)
    dfClean = df.drop(columns=['gender', '_id', 'uid'])
    X, Y = dfClean.values, df['gender']
    X = scala(X)
    print("Starting cross-validation")
    # shuffle=True is required for random_state to take effect.
    index = kf(n_splits=10, shuffle=True, random_state=666).split(list(range(len(Y))))
    cmTotal = np.zeros((2, 2))
    count = 0
    for trainIndex, testIndex in index:
        # clf = LogisticRegression(max_iter=1000, solver='lbfgs', C=100)  # ~0.75 on term frequencies
        # clf = DecisionTreeClassifier()
        # clf = RandomForestClassifier()
        # clf = RandomForestClassifier(n_estimators=30, max_depth=6, random_state=666)
        # clf = MLPClassifier(hidden_layer_sizes=(200, 10))
        clf = SVC(C=0.9)
        trainX = X[trainIndex]
        testX = X[testIndex]
        trainy = Y[trainIndex]
        testy = Y[testIndex]
        # from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
        # featureProcessor = LinearDiscriminantAnalysis(n_components=150)
        # featureProcessor.fit(trainX, trainy)
        # trainX = featureProcessor.transform(trainX)
        # testX = featureProcessor.transform(testX)
        clf.fit(trainX, trainy)
        count += 1
        print(count, "Training set performance:")
        y_train = clf.predict(trainX)
        cm = confusion_matrix(trainy, y_train)
        print(cm)
        y_pred = clf.predict(testX)
        cm = confusion_matrix(testy, y_pred)
        cmTotal += np.array(cm)
        print(count, "Test set performance:")
        print(cm)
    print(cmTotal)
    print((cmTotal[0, 0] + cmTotal[1, 1]) / (sum(sum(cmTotal))))
def indexToyExample():
    dataDict = {
        'Age': list(np.random.uniform(low=30.0, high=79.0, size=1000)),
        'Sex': list(np.random.randint(2, size=1000)),
        'SBP': list(np.random.uniform(low=90.0, high=180.0, size=1000)),
        'Smoker': list(np.random.randint(2, size=1000)),
        'CHF': list(np.random.randint(2, size=1000))
    }
    myPD = pd.DataFrame(dataDict)
    predictors = ['Age', 'Sex', 'SBP']
    target = ['CHF']
    myTrain = myPD[predictors]
    newMyTrain = myPD[predictors]
    myVal = myPD[target]
    splits = kf(n_splits=10, shuffle=True, random_state=42)
    LR = Classifiers[0].fit(myTrain, myVal)
    cvsScores = cvs(LR, myTrain, myVal, cv=splits, scoring='neg_mean_squared_error')
    LR.predict(myTrain)
    meanSquareRootError = np.sqrt(-1 * cvsScores.mean())
    L1 = {1: LR.intercept_, 2: LR.coef_, 3: np.exp(LR.coef_),
          4: cvsScores, 5: meanSquareRootError}
    # Bin Age and SBP into ordinal risk categories.
    LS = []
    LY = []
    for index, row in myTrain.iterrows():
        if row['Age'] < 39:
            LS.append(1)
        elif row['Age'] <= 49:
            LS.append(2)
        elif row['Age'] <= 59:
            LS.append(3)
        elif row['Age'] <= 69:
            LS.append(4)
        elif row['Age'] <= 80:
            LS.append(5)
        if row['SBP'] < 120:
            LY.append(1)
        elif row['SBP'] < 129:
            LY.append(2)
        elif row['SBP'] < 139:
            LY.append(3)
        elif row['SBP'] < 159:
            LY.append(4)
        else:
            LY.append(5)
    newMyTrain['Age'] = LS
    newMyTrain['SBP'] = LY
    newMyPD = pd.DataFrame()
    newMyPD['Age'] = newMyTrain['Age']
    newMyPD['Sex'] = newMyTrain['Sex']
    newMyPD['SBP'] = newMyTrain['SBP']
    newMyPD['CHF'] = myVal
    Freq = {'Age': newMyPD.groupby(['Age', 'CHF']).size(),
            'Sex': newMyPD.groupby(['Sex', 'CHF']).size(),
            'SBP': newMyPD.groupby(['SBP', 'CHF']).size()}
    return myPD, myTrain, myVal, newMyPD, L1, Freq
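# The row-by-row binning loop above can be expressed with pd.cut. A hedged
# sketch for the SBP bins (right=False gives [low, high) intervals, matching
# the < comparisons; the Age boundaries in the loop mix < and <=, so an exact
# one-liner for Age would need slightly adjusted edges):
import numpy as np
import pandas as pd

sbp = pd.Series([95.0, 125.0, 150.0, 170.0])
sbp_bins = pd.cut(sbp, bins=[-np.inf, 120, 129, 139, 159, np.inf],
                  labels=[1, 2, 3, 4, 5], right=False)
print(list(sbp_bins))  # [1, 2, 4, 5]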