def main():
    np.random.seed(0)
    data = f.readData()
    train, validation, test = f.splitData(data.shape[0])
    C = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
    # C = 0.061, F-1 = 0.525
    trainRegularizationStrengthForl2(data, train, validation, C)
    # C = 0.175, F-1 = 0.526170798898
    trainRegularizationStrengthForl1(data, train, validation, C)
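
# A minimal sketch of what a regularization sweep such as
# trainRegularizationStrengthForl2 could look like, assuming scikit-learn's
# LogisticRegression and f1_score; the function name and body below are
# illustrative assumptions, not the project's actual helper.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

def sweep_l2_strength(X_train, y_train, X_val, y_val, Cs):
    """Fit one l2-penalized logistic regression per C; return the best C by validation F-1."""
    best_C, best_f1 = None, -np.inf
    for C in Cs:
        clf = LogisticRegression(C=C, penalty='l2', class_weight='balanced',
                                 max_iter=10000)
        clf.fit(X_train, y_train)
        f1 = f1_score(y_val, clf.predict(X_val))
        if f1 > best_f1:
            best_C, best_f1 = C, f1
    return best_C, best_f1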
def main():
    np.random.seed(0)
    data = f.readData()
    train, validation, test = f.splitData(data.shape[0])
    # trainNeuralNetworks(data, train, validation)
    alphas = np.arange(0.0001, 0.0015, 0.0001)
    # [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01]
    N = [75, 100]
    F_1 = np.zeros([len(alphas), len(N)])
    for i in range(len(alphas)):
        for j in range(len(N)):
            F_1[i, j] = trainNeuralNetworks(data, train, validation,
                                            N=N[j], alpha=alphas[i])
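
# A hedged sketch of a trainNeuralNetworks implementation, assuming
# scikit-learn's MLPClassifier and reading N as the hidden-layer size and
# alpha as the l2 penalty (which matches the grid above, but is an assumption).
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score

def train_neural_network(X_train, y_train, X_val, y_val, N=100, alpha=0.0001):
    """Fit a single-hidden-layer MLP and return its validation F-1."""
    clf = MLPClassifier(hidden_layer_sizes=(N,), alpha=alpha, max_iter=1000,
                        random_state=0)
    clf.fit(X_train, y_train)
    return f1_score(y_val, clf.predict(X_val))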
def main():
    np.random.seed(0)
    data = f.readData()
    train, validation, test = f.splitData(data.shape[0])
    # C = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
    # n_estimators = [5000, 10000, 50000, 100000, 500000]
    # n_estimators = np.arange(10, 200, 10)
    # n_estimators = np.repeat([100], 100)
    # number_of_trees = 100, average F-1 on 100 forests = 0.377228139802
    # trainRandomForest(data, train, validation, n_estimators, max_features=None)
    n_estimators = [100, 200, 500, 1000]
    # number_of_boosting_stages = 100, average F-1 on 100 boostings = 0.377228139802
    trainGradientBoosting(data, train, validation, n_estimators, max_features='auto')
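
# Sketch of what the gradient-boosting sweep could look like, assuming
# scikit-learn's GradientBoostingClassifier; the function below is an
# illustrative assumption, not the project's helper. (Recent scikit-learn
# removed max_features='auto'; 'sqrt', 'log2', or None are the current options.)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score

def sweep_boosting_stages(X_train, y_train, X_val, y_val, stage_counts,
                          max_features=None):
    """Fit one boosted ensemble per stage count and report validation F-1."""
    for n in stage_counts:
        clf = GradientBoostingClassifier(n_estimators=n, max_features=max_features,
                                         random_state=0)
        clf.fit(X_train, y_train)
        print("n_estimators = %d, F-1 = %f" % (n, f1_score(y_val, clf.predict(X_val))))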
def main():
    np.random.seed(0)
    data = f.readData()
    train, validation, test = f.splitData(data.shape[0])
    # C = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
    C = [0.000001, 0.000005, 0.00001, 0.00005, 0.0001, 0.0005]
    # C = [10, 50, 100, 500, 1000]
    # C = np.arange(1, 10, 1)
    # C = 137, F-1 = 0.541310541311
    trainSVMWithGaussianKernel(data, train, validation, C)
    trainSVMWithLinearKernel(data, train, validation, C)
    # SVM with Linear Kernel
    # l1, squared hinge, C = 50, F-1 = 0.525447042641
    # l2, hinge, C = 0.001, F-1 = 0.512968299712
    # l2, squared hinge, C = 1, F-1 = 0.524725274725
    trainSVMWithLinearKernel2(data, train, validation, C)
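
# Sketch of the RBF-kernel C-sweep, assuming scikit-learn's SVC (the RBF
# kernel is its default); an illustration of the technique, not the project's
# trainSVMWithGaussianKernel.
from sklearn.svm import SVC
from sklearn.metrics import f1_score

def sweep_rbf_svm(X_train, y_train, X_val, y_val, Cs):
    """Fit one RBF-kernel SVM per C and report validation F-1."""
    for C in Cs:
        clf = SVC(C=C, kernel='rbf', class_weight='balanced')
        clf.fit(X_train, y_train)
        print("C = %s, F-1 = %f" % (C, f1_score(y_val, clf.predict(X_val))))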
if fName == 'abalone.data':
    abalonePath = os.path.join(path, fName)
    abaloneNames = ['sex', 'length', 'diameter', 'height', 'wholeHt',
                    'shuckWt', 'visceraWt', 'shellWt', 'rings']
    data, features, classNum = prepData(abalonePath, abaloneNames, slice(-1), 'rings')
    classes = classNum.astype(str)
    classes[classNum <= 8] = '1-8'
    classes[classNum >= 11] = '11+'
else:
    sys.exit("No such data set.")

(xvData, xvLabel), xvFolds, pruningSet = splitData(data, classes)
fullErr, prnErr = crossValidate(xvData, xvLabel, pruningSet, xvFolds, printTree)
print("Full Tree - Mean Error: %f" % fullErr.mean())
print("Full Tree - St Dev Error: %f" % fullErr.std())
print("Pruned Tree - Mean Error: %f" % prnErr.mean())
print("Pruned Tree - St Dev Error: %f" % prnErr.std())

tr = TrainDTree(xvData, xvLabel)  # train DTree using full cross-val sample
tr.combineChildNodes()            # combine subtrees with homogeneous classes
if printTree:                     # print full tree
    print("\n===full tree===")
    print(tr)
PruneDTree(tr, pruningSet[0], pruningSet[1])  # prune with pruning set
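
# PruneDTree above prunes against a held-out pruning set. Below is a minimal
# sketch of the reduced-error-pruning idea it suggests, under an assumed node
# interface (a children list, plus a predict method that answers with the
# node's majority class once it has no children); an illustration of the
# technique, not the project's actual implementation.
def tree_error(root, X, y):
    """Misclassification count of the whole tree on the pruning set."""
    return sum(root.predict(x) != label for x, label in zip(X, y))

def reduced_error_prune(root, node, X_prune, y_prune):
    """Bottom-up: collapse a subtree to a leaf whenever doing so does not
    increase the tree's error on the held-out pruning set."""
    for child in list(node.children):
        reduced_error_prune(root, child, X_prune, y_prune)
    if not node.children:
        return
    error_before = tree_error(root, X_prune, y_prune)
    saved = node.children
    node.children = []                      # tentatively make the node a leaf
    if tree_error(root, X_prune, y_prune) > error_before:
        node.children = saved               # pruning hurt, so undo it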
def main():
    np.random.seed(0)
    data = f.readData()
    train, validation, test = f.splitData(data.shape[0])
    X_train, y_train = v.makeMatrix(data, train)
    X_test, y_test = v.makeMatrix(data, test)

    print("Logistic Regression")
    clf = LogisticRegression(C=0.061, class_weight='balanced', max_iter=10000,
                             solver='sag', n_jobs=-1)
    f1 = v.validate(data, X_train, y_train, X_test, y_test, clf)
    print("F-1 measure for Logistic Regression with l2 and C = %s is %s" % (0.061, f1))

    # note: recent scikit-learn versions require solver='liblinear' or 'saga'
    # for an l1 penalty
    clf = LogisticRegression(penalty='l1', C=0.175, class_weight='balanced',
                             max_iter=5000, n_jobs=-1)
    f1 = v.validate(data, X_train, y_train, X_test, y_test, clf)
    print("F-1 measure for Logistic Regression with l1 and C = %s is %s" % (0.175, f1))

    print("SVM")
    clf = SVC(C=137, class_weight='balanced')
    f1 = v.validate(data, X_train, y_train, X_test, y_test, clf)
    print("F-1 measure for SVM with RBF and C = %s is %s" % (137, f1))

    # l1, squared hinge, C = 50, F-1 = 0.525447042641
    # l2, hinge, C = 0.001, F-1 = 0.512968299712
    # l2, squared hinge, C = 1, F-1 = 0.524725274725
    C = 50
    loss = "squared_hinge"
    penalty = 'l1'
    clf = LinearSVC(C=C, loss=loss, penalty=penalty, class_weight='balanced', dual=False)
    f1 = v.validate(data, X_train, y_train, X_test, y_test, clf)
    print("F-1 measure for SVM with Linear Kernel, %s loss, %s penalty, and C = %s is %s"
          % (loss, penalty, C, f1))

    C = 0.001
    loss = "hinge"
    penalty = 'l2'
    clf = LinearSVC(C=C, loss=loss, penalty=penalty, class_weight='balanced', dual=True)
    f1 = v.validate(data, X_train, y_train, X_test, y_test, clf)
    print("F-1 measure for SVM with Linear Kernel, %s loss, %s penalty, and C = %s is %s"
          % (loss, penalty, C, f1))

    C = 1
    loss = "squared_hinge"
    penalty = 'l2'
    clf = LinearSVC(C=C, loss=loss, penalty=penalty, class_weight='balanced', dual=False)
    f1 = v.validate(data, X_train, y_train, X_test, y_test, clf)
    print("F-1 measure for SVM with Linear Kernel, %s loss, %s penalty, and C = %s is %s"
          % (loss, penalty, C, f1))
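
# A hedged guess at the shape of v.validate: fit the given classifier on the
# training matrix and score F-1 on the test matrix. The real helper also takes
# a `data` argument whose role is not visible here, so it is omitted.
from sklearn.metrics import f1_score

def validate_sketch(X_train, y_train, X_test, y_test, clf):
    """Fit clf and return its F-1 measure on the held-out test set."""
    clf.fit(X_train, y_train)
    return f1_score(y_test, clf.predict(X_test))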
TODO: Can a feature that is always 0 get high weights in the linear fit?
      -> Yes, very easily in fact.
      Does ridge regression change that? -> Yes, w[i] then becomes 0.
      To test this, either set a feature to 0 and train, or take a trained
      model, scale feature X down and set the weight for X very high - this
      should barely hurt performance if feature X is approximately 0.
"""

### data
data = F.loadPreparedData()

### norm each feature to 1 (makes the weights comparable - not necessary
### with shap, but nice to have)
data = data / data.max(axis=0)

## append sqrt(fare) to improve the distribution
data["sqrt(fare)"] = np.sqrt(data["fare"])

## append an (almost) zero data column
data["testZero"] = rnd.random(len(data)) * 0.00001

## train/test split
x_train, y_train, x_test, y_test = F.splitData(data)

### correlation
corr = data.corr()["survived"]
F.prettyPrint("Correlation",
              corr.sort_values(key=lambda x: abs(x), ascending=False))

DO_LIN_FIT = 0
DO_NN_FIT = 1
DO_NNLIN_FIT = 0

if DO_LIN_FIT:
    print("-----------------------------------------")
    print(" ######## LIN MODEL ########")
    print("-----------------------------------------")
    ### Linear Model
    linModel, w = F.linRegression(x_train, y_train, x_test, y_test)
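
# A small experiment backing the TODO above: for a feature that is identically
# 0, the unpenalized least-squares loss is flat in that weight, so any value
# (even a huge one) fits equally well, while the ridge penalty pins the weight
# to 0. Pure numpy/scikit-learn illustration, independent of the project data.
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
X[:, 2] = 0.0                                  # feature that is always 0
y = 2.0 * X[:, 0] - X[:, 1] + rng.normal(scale=0.1, size=200)

ols = LinearRegression().fit(X, y)
w = ols.coef_.copy()
w[2] = 1e6                                     # blow up the zero-feature weight
print("predictions unchanged:", np.allclose(X @ ols.coef_, X @ w))  # True

ridge = Ridge(alpha=1.0).fit(X, y)
print("ridge weight of the zero feature:", ridge.coef_[2])          # ~0.0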