def scr2(X, y):
    print("**************************************")
    print("*        Classification Model        *")
    print("* 1- GaussianNB                      *")
    print("* 2- KMeans                          *")
    print("* 3- Hierarchical Clustering         *")
    print("* 4- Support Vector Machine          *")
    print("* 5- Multiclass Classification       *")
    print("* 6- Tree Classification             *")
    print("* 7- Return Main                     *")
    print("**************************************")
    var2 = int(input('Enter the model No\t'))
    if var2 == 1:
        gau(X, y)
    elif var2 == 2:
        kmeans(X, y)
    elif var2 == 3:
        hiclus(X, y)
    elif var2 == 4:
        svm(X, y)
    elif var2 == 5:
        multcla(X, y)
    elif var2 == 6:
        trea(X, y)
    elif var2 == 7:
        scr3(X, y)
    print("ctrl-D to exit")
    scr2(X, y)
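# The menu above dispatches to helpers (gau, kmeans, hiclus, svm, multcla,
# trea, scr3) defined elsewhere. A minimal sketch of gau(), assuming
# scikit-learn's GaussianNB and a simple hold-out split (illustrative only,
# not the original implementation):
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

def gau(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    clf = GaussianNB().fit(X_train, y_train)
    print("GaussianNB accuracy:", clf.score(X_test, y_test))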
def main():
    df_preprocessed_data = process_data()
    neural_net(df_preprocessed_data)     # Done Setup
    decision_tree(df_preprocessed_data)  # Done Setup
    adaboost(df_preprocessed_data)       # Done Setup
    knn(df_preprocessed_data)            # Done Setup
    svm(df_preprocessed_data)            # Done Setup
def main():
    df_submission, df_train, df_test = process_data()
    neural_net(df_submission, df_train, df_test)     # Done Setup
    decision_tree(df_submission, df_train, df_test)  # Done Setup
    adaboost(df_submission, df_train, df_test)       # Done Setup
    knn(df_submission, df_train, df_test)            # Done Setup
    svm(df_submission, df_train, df_test)            # Done Setup
def main():
    datapath = '../data/'
    data = read_data(datapath)
    [X, y] = create_features(data)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=7641)
    nn(X_train, X_test, y_train, y_test)
    svm(X_train, X_test, y_train, y_test)
    dt(X_train, X_test, y_train, y_test)
    boost(X_train, X_test, y_train, y_test)
    knn(X_train, X_test, y_train, y_test)
def imageProcess():
    try:
        img = cv2.imread('cropped.jpg', 0)
        img = cv2.medianBlur(img, 3)
        cv2.imwrite('medianimg.jpg', img)
        os.system('sudo cp medianimg.jpg /var/www/html/medianimg.jpg')
        cimg = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        cv2.imwrite('grayimg.jpg', cimg)
        os.system('sudo cp grayimg.jpg /var/www/html/grayimg.jpg')
        eimg = cv2.Canny(cimg, 20, 50)
        cv2.imwrite('edgedimg.jpg', eimg)
        os.system('sudo cp edgedimg.jpg /var/www/html/edgedimg.jpg')
        output = img.copy()
        circles = cv2.HoughCircles(img, cv2.cv.CV_HOUGH_GRADIENT, 1.25, 45,
                                   param1=40, param2=26, minRadius=0,
                                   maxRadius=90)
        x1 = 0
        if circles is not None:
            # convert the (x, y) coordinates and radius of the circles to integers
            circles = np.round(circles[0, :]).astype("int")
            # loop over the (x, y) coordinates and radius of the circles
            for (x, y, r) in circles:
                x1 = x1 + 1
                diameter = (r * 2) / float(7.5)
                print(r)
                cv2.circle(output, (x, y), r, (0, 255, 0), 4)
                cv2.putText(output, str(diameter)[:4], (x - 10, y + 5), 1, 1,
                            (255, 255, 255), 2)
        print "severity: ", x1, " detected"
        # raises (and lands in the except below) when no circle was detected
        diam = (max(circles[:, 2]) * 2) / float(7.5)
        print "float: ", diam
        dArea = float(diam * diam * 3.1416 / 6)
        sArea = str(round(dArea, 2))
        maximum = str(round(diam, 2))
        area.set("Area: " + sArea + "mm^2")
        maxDiameter.set("Max Diameter: " + maximum + "mm")
        cv2.imwrite('output.jpg', output)
        os.system('sudo cp output.jpg /var/www/html/output.jpg')
        print circles[:, 2]
        svm(severity=x1, diameter=diam)
    except:
        result.set("SVM Result: ERROR: No Circle Detected")
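# Note: cv2.cv.CV_HOUGH_GRADIENT only exists in OpenCV 2.x; OpenCV 3+ renamed
# it to cv2.HOUGH_GRADIENT. A version-agnostic lookup, if this script ever has
# to run against either API (a sketch under that assumption):
import cv2

HOUGH_GRADIENT = getattr(cv2, 'HOUGH_GRADIENT', None) or cv2.cv.CV_HOUGH_GRADIENT
# e.g. circles = cv2.HoughCircles(img, HOUGH_GRADIENT, 1.25, 45,
#                                 param1=40, param2=26, minRadius=0, maxRadius=90)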
def start():
    data = pd.read_csv("projetData/red_wines.csv")
    data = clean(data)
    regressionLogistique(data)
    analyseDiscriminanteLineaire(data)
    analyseDiscriminanteQuadratique(data)
    svm(data)
    voisins(data)
    arbre(data)
    test_perceptron(data)
    return data
def runNTimes(times):
    for i in range(0, times):
        voting(evaluation_set)
        adaboost(evaluation_set)
        bagging(evaluation_set)
        stacking(evaluation_set)
        svm(evaluation_set)
        knn(evaluation_set)
        decisionTree(evaluation_set)
    calculateMeanStatistics(times)
def main():
    # input = "../Data/glass.data"
    input = "glass.data"
    headers = ["Id", "RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "type"]
    data = pd.read_csv(input, names=headers)
    data.drop(["Id"], axis=1, inplace=True)
    # if we'd like to plot all the results, set plot to true
    plot = False
    onePlot = True
    # list for each type of kernel
    kernels = ['linear', 'rbf', 'sigmoid', 'poly']
    if plot:
        onePlot = False
        timeVacc = pdf.PdfPages("Time_VS_Accuracy.pdf")
    # run each type of kernel with both 1v1 and 1vAll
    for k in kernels:
        ovoTime, ovoAccuracy = svm(data, kernel=k, classification='ovo',
                                   plot=plot, onePlot=onePlot)
        ovrTime, ovrAccuracy = svm(data, kernel=k, classification='ovr',
                                   plot=plot)
        onePlot = False
        print("*" * 300)
        if plot:
            # plot the time vs the accuracy and save to pdf
            fig = plotTimeVAccuracy(k, ovoTime, ovrTime, ovoAccuracy, ovrAccuracy)
            timeVacc.savefig(fig, bbox_inches='tight')
            plt.close(fig)
    if plot:
        timeVacc.close()
    # run each type of kernel with 1v1 where the classes are reweighted
    for k in kernels:
        ovoTime, ovoAccuracy = svm(data, kernel=k, classification='ovo',
                                   weighted=True, plot=plot)
def manualSVM():
    try:
        diameter = entDiam.get()
        amount = entAmount.get()
        diam = float(diameter)
        amount = int(amount)
        dArea = float(diam * diam * 3.1416 / 6)
        sArea = str(round(dArea, 2))
        maximum = str(round(diam, 2))
        area.set("Area: " + sArea + "mm^2")
        maxDiameter.set("Max Diameter: " + maximum + "mm")
        svm(severity=amount, diameter=diam)
    except:
        result.set("SVM Result: Input Error")
def evaluation(chromosome):
    global best_val
    global valores
    global a
    global b
    global c
    global d
    global e
    global f
    code_comp = chromosome.getCompiledCode()
    features = eval(code_comp)
    cfeatures = len(features)
    matrix_final = reducir(matrix, features)
    evaluated_data = svm(matrix_final)
    total = len(matrix[0]) - 1
    alfa = 0.5
    beta = 0.4
    gama = 0.1
    valor = ((alfa * (1 - evaluated_data[0])) + beta * (1 - evaluated_data[1])
             + gama * (cfeatures / total)) / 3.0
    if valor < best_val:
        best_val = valor
        valores = []
        valores.append(evaluated_data[0])
        valores.append(evaluated_data[1])
        valores.append(cfeatures)
    fitness.append(valor)
    print 'AUC: ', evaluated_data[0], 'ACC: ', evaluated_data[1]
    return valor
def run(file):
    data = pd.read_csv(file, encoding='latin-1')
    # filter stop words and punctuation, and ignore capitalization
    f = feature_extraction.text.CountVectorizer(stop_words='english')
    X = f.fit_transform(data["message"])
    # split the data into a train set and a test set
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, data['label'], test_size=0.33)
    # send the sets to the machine learning algorithms
    adaboost(X_train, X_test, y_train, y_test)
    svm(X_train, X_test, y_train, y_test)
    knn(X_train, X_test, y_train, y_test)
    decisionTree(X_train, X_test, y_train, y_test)
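# Several of these drivers pass the same four split arrays to an svm() helper
# whose definition is not shown. A minimal sketch of such a helper, assuming
# scikit-learn's SVC and a printed classification report (an illustrative
# stand-in, not the original implementation):
from sklearn.svm import SVC
from sklearn.metrics import classification_report

def svm(X_train, X_test, y_train, y_test):
    # Fit an RBF-kernel SVC on the training split and report test metrics.
    clf = SVC()
    clf.fit(X_train, y_train)
    print(classification_report(y_test, clf.predict(X_test)))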
def SVM_train_cross(train_x, train_y, validation, test, test_data):
    print("training data...")
    clf_pipe = make_pipeline(CountVectorizer(ngram_range=(1, 2)),
                             RandomUnderSampler(),
                             SVC(kernel='rbf', C=1))
    scores = cross_val_score(clf_pipe, train_x, train_y, cv=5)
    print("Model is fitted!")
    if validation:
        print(scores)
        print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
        y_pred = cross_val_predict(clf_pipe, train_x, train_y, cv=5)
        # Evaluation: classification report
        print("classification reports:", classification_report(train_y, y_pred))
        print("Finished!")
        # confusion matrix
        conf_mat = confusion_matrix(train_y, y_pred)
        print(conf_mat)
        plot_conf(conf_mat)
    if test:
        svm(test_data)
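# plot_conf() is not defined in this snippet. A plausible minimal version,
# assuming matplotlib (hypothetical, for illustration only):
import matplotlib.pyplot as plt

def plot_conf(conf_mat):
    # Render the confusion matrix as a heatmap with a colorbar.
    plt.matshow(conf_mat, cmap='Blues')
    plt.colorbar()
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.show()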
def sup_vec(X_train, y_train):
    # 'svm' is assumed to be an alias for sklearn.svm.SVC here
    pipe = Pipeline([('tfidf', TfidfVectorizer()),
                     ('svm', svm(random_state=42))])
    parameter = {"tfidf__ngram_range": [(1, 1), (1, 2), (1, 3)],
                 "svm__kernel": ["linear", "poly", "rbf", "sigmoid"],
                 "svm__C": np.logspace(-3, 3, 7)}
    model = GridSearchCV(pipe, param_grid=parameter, cv=5, verbose=1)
    model.fit(X_train, y_train)
    print(f"Best score: {model.best_score_} with {model.best_params_}")
    return model.best_score_, model.best_params_
def train(train_data_features, train_data_labels, classifier, hyperparameter):
    if classifier == 'lr':
        model = LogisticRegression(solver=str(hyperparameter),
                                   class_weight='balanced')
    if classifier == 'rf':
        model = RandomForestClassifier(n_estimators=int(hyperparameter),
                                       class_weight='balanced')
    if classifier == 'knn':
        model = KNeighborsClassifier(n_neighbors=int(hyperparameter))
    if classifier == 'svm':
        # 'svm' is assumed to be an alias for sklearn.svm.SVC
        model = svm(kernel=str(hyperparameter), class_weight='balanced')
    fit = model.fit(train_data_features, train_data_labels)
    return fit
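# Illustrative usage of train() with toy data, assuming the svm alias above
# resolves to sklearn.svm.SVC (the toy arrays are hypothetical):
import numpy as np

X_toy = np.random.rand(20, 3)
y_toy = np.random.randint(0, 2, size=20)
model = train(X_toy, y_toy, classifier='svm', hyperparameter='rbf')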
def main(scores):
    output = DataFrame(data=[])
    for i in scores.keys():
        # print(i)
        # print(scores.get(i))
        value = scores.get(i)
        if value >= 4:
            prediction = "Positive"
        elif 2 < value < 4:
            prediction = str(svm(i, data))
            # print(logit(i, data))
        elif 0 < value <= 2:
            prediction = str(logit(i, data, 0.8))
        elif value <= 0:
            prediction = "Negative"
        else:
            prediction = "Not calculated"
        print(str(i) + ' ' + prediction)
        row = Series(data=[i, value, prediction])
        output = concat([output, row], axis=1)
    output = output.T
    output.to_csv("consensus_predictions.txt", sep='\t')
def _init(self, X, Y):
    if self.bias:
        self.X = np.append(X, np.ones((np.shape(X)[0], 1)), axis=1)
    else:
        self.X = X
    self.Y = np.ravel(Y)
    self.l_index = np.ravel(self.Y != -1)
    self.u_index = ~self.l_index
    if self.estimator is None:
        self.estimator = [RandomForestClassifier(), LogisticRegression(), svm()]
    # Initialize the classifiers
    for est in self.estimator:
        indices = random.sample(
            list(compress(range(len(X)), self.l_index)),
            int(np.sum(self.l_index) * 1))
        est.fit(self.X[indices], self.Y[indices])
def main():
    X_train, X_test, y_train, y_test = split()
    # y_predicted = knn_clasify(X_train, X_test, y_train)
    # knn_efficiency(y_predicted, y_test)
    linear_regression(X_train, X_test, y_train, y_test)
    y_predicted = logistic_regression(X_train, X_test, y_train)
    logistic_regression_efficiency(y_predicted, y_test)
    y_predicted = svm(X_train, X_test, y_train)
    svm_efficiency(y_predicted, y_test)
    k = 1
    while k < 10:
        y_predicted = svm_kernal(X_train, X_test, y_train, k)
        svm_kernal_efficiency(y_predicted, y_test, k)
        k = k + 1
    y_predicted = svm_rbf(X_train, X_test, y_train)
    svm_rbf_efficiency(y_predicted, y_test)
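# The *_efficiency helpers are not shown; presumably each compares predictions
# against the held-out labels. A minimal sketch of svm_efficiency, assuming
# scikit-learn's accuracy metric (illustrative only):
from sklearn.metrics import accuracy_score

def svm_efficiency(y_predicted, y_test):
    # Report plain accuracy for the SVM predictions.
    print("SVM accuracy:", accuracy_score(y_test, y_predicted))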
def main():
    # keyword_id clicks conv cost date group hour imps match_type month monthday pos weekday
    data = np.genfromtxt("data.csv", delimiter=",", dtype=None)[1:]
    target = data[:, 2].astype(np.float)  # conv
    data = scipy.delete(data, 2, 1)
    skf = cross_validation.StratifiedKFold(target, n_folds=10)
    for train_index, test_index in skf:
        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = target[train_index], target[test_index]
        X_train, X_test = preprocess(X_train, X_test)
        # deduplicate the training rows via a set of tuples
        train_data = np.column_stack([X_train, y_train])
        train_data = np.array([np.array(x) for x in set(tuple(x) for x in train_data)])
        X_train = train_data[:, :-1]
        y_train = train_data[:, -1]
        # predicted_test = random_forest_regressor(X_train, y_train, X_test)
        predicted_test = svm(X_train, y_train, X_test)
        _, _, non_zero_clicks = get_non_zero_clicks(X_test, y_test)
        predicted_test[non_zero_clicks == False] = 0
        model_eval(y_test, predicted_test)
def main(argv):
    if FLAGS.dataset == 'toy':
        train_X, train_y, test_X, test_y, num_classes = get_toy_dataset()
    elif FLAGS.dataset == 'mnist':
        train_X, train_y, test_X, test_y, num_classes = get_mnist()
    train_pred = None
    if FLAGS.method == 'knn':
        pred = knn(train_X, train_y, test_X)
    elif FLAGS.method == 'svm':
        train_pred, pred = svm(train_X, train_y, test_X)
    elif FLAGS.method == 'tree':
        pred = tree(train_X, train_y, test_X)
    elif FLAGS.method == 'boosting':
        pred = boosting(train_X, train_y, test_X)
    elif FLAGS.method == 'nn':
        train_pred, pred = nn(train_X, train_y, test_X, num_classes)
    if train_pred is not None:
        print('Train Accuracy: %f' % compute_accuracy(train_pred, train_y))
    print('Accuracy: %f' % compute_accuracy(pred, test_y))
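# compute_accuracy() is not shown; a minimal sketch consistent with how it is
# called above (fraction of predictions matching the labels):
import numpy as np

def compute_accuracy(pred, labels):
    return np.mean(np.asarray(pred) == np.asarray(labels))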
def my_ensemble(X_train, X_test, y_train, y_test):
    y_pred = []
    votes = []
    X_train_pca, X_test_pca = preprocess_PCA(X_train, X_test, True, 100)
    votes.append(svm(X_train_pca, X_test_pca, y_train))
    votes.append(mog(X_train_pca, X_test_pca, y_train, 10))
    votes.append(ensemble(X_train, X_test, y_train))
    print "*** SVM ***"
    show_metrics(votes[0], y_test)
    print "*** mog ***"
    show_metrics(votes[1], y_test)
    print "*** ensemble ***"
    show_metrics(votes[2], y_test)
    print "*** my_ensemble ***"
    # majority vote: fall back to the SVM when mog and ensemble disagree
    for i in range(len(X_test)):
        if votes[1][i] == votes[2][i]:
            y_pred.append(votes[1][i])
        else:
            y_pred.append(votes[0][i])
    return y_pred
#%%
# ______________ Naive_Bayes _______________
sk = SklearnClassifier(MultinomialNB())
sk.train(train_feats)
acc_Naive_Bayes = accuracy(sk, test_feats)
print("Naive Bayes Accuracy: ", acc_Naive_Bayes)

# ______________ K-Neighbors _______________
sk_knn = SklearnClassifier(KNeighborsClassifier())
sk_knn.train(train_feats)
acc_knn = accuracy(sk_knn, test_feats)
print("K-NN Accuracy: ", acc_knn)

# ______________ Regression _______________
#%%
sk_reg = SklearnClassifier(LogisticRegression())
sk_reg.train(train_feats)
acc_reg = accuracy(sk_reg, test_feats)
print("Regression Accuracy: ", acc_reg)

# ______________ SVM _______________
#%%
sk_svm = SklearnClassifier(svm())
sk_svm.train(train_feats)
acc_svm = accuracy(sk_svm, test_feats)
print("SVM Accuracy: ", acc_svm)
#%%
def train_model(classifier, train_path, test_path, type_classification,
                train=True, validation=True, test=True, cross_validation=False):
    # collect train data
    print("reading train set...")
    if type_classification == "T":
        # read titles and their label
        train_x, train_y = collect_titles(train_path)
    elif type_classification == "TB":
        # read whole document
        train_x, train_y = collect_documents(train_path)
    elif type_classification == "TBW":
        # weighted title and body
        train_x, train_y = collect_weighted_doc(train_path)
    else:
        print("wrong argument")

    if test:
        print("loading test data...")
        test_data, reference = collect_test_documents(test_path)

    # split data
    if not cross_validation:
        print("splitting the train set...")
        train_data, validate_data, train_target, validate_target = train_test_split(
            train_x, train_y, test_size=0.4, random_state=0)

        # Naive Bayes classifier
        if classifier == "NB":
            # train data set
            if train:
                print("training data...")
                naive_bayes_train(train_data, train_target)
            # validate validation set
            if validation:
                print("evaluating data...")
                naive_bayes_evaluate(validate_data, validate_target)
            # test data
            if test:
                print("testing data...")
                naive_bayes(test_data, reference)
                print(r"results are written in: \Results\Prediction.xlsx")

        # SVM classifier
        if classifier == "SVM":
            # train data set
            if train:
                print("training data...")
                svm_train(train_data, train_target)
            # validate validation set
            if validation:
                print("evaluating data...")
                svm_evaluate(validate_data, validate_target)
            # test data
            if test:
                print("testing data...")
                svm(test_data, reference)
                print(r"results are written in: \Results\Prediction.xlsx")

        # Logistic regression
        if classifier == "LR":
            # train data set
            if train:
                print("training data...")
                train_logistic_regression(train_data, train_target)
            # validate validation set
            if validation:
                print("evaluating data...")
                validate_logistic_regression(validate_data, validate_target)
            # test data
            if test:
                print("testing data...")
                logistic_regression(test_data, reference)
                print(r"results are written in: \Results\Prediction.xlsx")

    # using cross validation
    else:
        if classifier == "NB":
            naive_bayse_cross(train_x, train_y, validation, test, test_data)
        if classifier == "SVM":
            SVM_train_cross(train_x, train_y, validation, test, test_data)
from sklearn import datasets
from xlrd import open_workbook


def svm():
    # iris = datasets.load_iris()
    # digits = datasets.load_digits()
    from sklearn import svm
    X = [[0, 0], [1, 1]]
    y = [0, 1]
    clf = svm.SVC()
    clf.fit(X, y)
    print clf
    print clf.predict([[2., 2.]])
    print clf
    book = open_workbook('Adomain_Substrate.xls')
    worksheet = book.sheet_by_name('Adomain_Substrate')
    print worksheet
    # SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
    #     gamma=0.0, kernel='rbf', max_iter=-1, probability=False,
    #     random_state=None, shrinking=True, tol=0.001, verbose=False)


if __name__ == "__main__":
    svm()
    # sys.exit(main())
def pca_svm(X_train, X_test, y_train, use_unlabelled=True, pca_components=150,
            fraction=2):
    X_train, X_test = preprocess_PCA(X_train, X_test, use_unlabelled,
                                     pca_components, fraction)
    return svm(X_train, X_test, y_train)
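# preprocess_PCA() is not shown. A minimal sketch, assuming scikit-learn's PCA
# fitted on the training rows; the use_unlabelled and fraction flags presumably
# control which rows the PCA is fitted on and are ignored in this sketch:
from sklearn.decomposition import PCA

def preprocess_PCA(X_train, X_test, use_unlabelled=True, pca_components=150,
                   fraction=2):
    pca = PCA(n_components=pca_components).fit(X_train)
    return pca.transform(X_train), pca.transform(X_test)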
def svm(xVal, yVal, xTest):
    # NOTE: this fragment originally began mid-way through the classifier's
    # constructor call; any arguments before max_iter were lost. Reconstructed
    # as an SVC (all remaining arguments are sklearn defaults).
    clf = SVC(max_iter=-1, probability=False, random_state=None,
              shrinking=True, tol=0.001, verbose=False)
    scaleX(xVal)
    scaleX(xTest)
    # xVal, xTest = pca2(xVal, xTest)
    # print xVal[0]
    # print yVal
    clf.fit(xVal, yVal)
    yTest2 = []
    ans = 0
    yTest = clf.predict(xVal)
    print 'Classification on Training Data'
    print yTest
    yTest2 = clf.predict(xTest)
    print 'Classification on Test Data'
    print yTest2
    return yTest2


trainData = loadDataSet('training.csv')
testData = loadDataSet('test.csv')
ytrainData = getY()
ans = svm(trainData, ytrainData, testData)
def repeatedCrossValidation(normal_set, anom_set, k, repetitions):
    k_fold = KFold(n_splits=k)  # Repeat the process K = 5 times
    original_set = normal_set + anom_set
    voting_statistics = []
    adaboost_statistics = []
    bagging_statistics = []
    stacking_statistics = []
    svm_statistics = []
    knn_statistics = []
    dt_statistics = []
    voting_time = []
    adaboost_time = []
    bagging_time = []
    stacking_time = []
    svm_time = []
    knn_time = []
    dt_time = []
    mlp_time = []
    technique_name = ['- Voting -', '- AdaBoost -', '- Bagging -',
                      '- Stacking -', '- SVM -', '- KNN -', '- DT -', '- MLP -']
    j = 0
    for i in range(repetitions):
        shuffle(original_set)
        for train_indices, test_indices in k_fold.split(original_set):
            train_set = []
            train_set_classification = []
            test_set = []
            normal_flows_in_evaluation_set = 0
            anom_flows_in_evaluation_set = 0
            # Build the training set and its corresponding classification
            for index in train_indices:
                train_set.append(original_set[index])
                if original_set[index] in anom_set:
                    train_set_classification.append(1)
                else:
                    train_set_classification.append(0)
            # Build the evaluation set
            for index in test_indices:
                test_set.append(original_set[index])
                # Count anomalous and normal flows in the evaluation set
                # (used for statistics)
                if original_set[index] in anom_set:
                    anom_flows_in_evaluation_set = anom_flows_in_evaluation_set + 1
                else:
                    normal_flows_in_evaluation_set = normal_flows_in_evaluation_set + 1
            # Training
            classifier_knn.fit(train_set, train_set_classification)
            classifier_svm.fit(train_set, train_set_classification)
            classifier_dt.fit(train_set, train_set_classification)
            classifier_mlp.fit(train_set, train_set_classification)
            # Evaluation
            start_time = time.time()
            predictions = voting(train_set, train_set_classification, test_set[0:20])
            voting_time.append(time.time() - start_time)

            start_time = time.time()
            predictions = adaboost(train_set, train_set_classification, test_set[0:20])
            adaboost_time.append(time.time() - start_time)

            start_time = time.time()
            predictions = bagging(train_set, train_set_classification, test_set[0:20])
            bagging_time.append(time.time() - start_time)

            start_time = time.time()
            predictions = stacking(train_set, train_set_classification, test_set[0:20])
            stacking_time.append(time.time() - start_time)

            start_time = time.time()
            predictions = svm(test_set[0:20])
            svm_time.append(time.time() - start_time)

            start_time = time.time()
            predictions = knn(test_set[0:20])
            knn_time.append(time.time() - start_time)

            start_time = time.time()
            predictions = decisionTree(test_set[0:20])
            dt_time.append(time.time() - start_time)

            start_time = time.time()
            predictions = neuralNetwork(test_set[0:1])
            mlp_time.append(time.time() - start_time)
    getTimeMeasurementsPerInstance(voting_time, adaboost_time, bagging_time,
                                   stacking_time, svm_time, knn_time, dt_time,
                                   mlp_time)
def main():
    # Read data
    df = pd.read_table('../titanic/train.csv', sep=",")
    # print(df.head())
    # --------------------------------------------------------------------
    # Store 'in sample' and 'out of sample' errors: arrays for result df
    E_in = []
    E_out = []
    Model_name = []
    Model_id = []
    iFeatures = []
    Features = []
    # --------------------------------------------------------------------
    # Preprocessing data: set all features
    df['Age'].fillna(0, inplace=True)
    df['Pclass'].fillna(0, inplace=True)
    df['Fare'].fillna(0., inplace=True)
    df['SibSp'].fillna(0., inplace=True)
    df['Parch'].fillna(0., inplace=True)
    df['Sex'].fillna('no', inplace=True)
    df['sex_'] = df['Sex'].map({'female': 0, 'male': 1, 'no': 2}).astype(int)
    df['Embarked'].fillna('N', inplace=True)
    df['embarked_'] = df['Embarked'].map(
        {'N': 0., 'C': 1., 'S': 2., 'Q': 3.}).astype(float)

    # --------------------------------------------------------------------
    # Slightly more advanced feature extraction
    df['Cabin'].fillna('no', inplace=True)

    def prep_cabin(row):
        res = 0
        if row.lower().find('a') >= 0:
            return 1.
        elif row.lower().find('b') >= 0:
            return 2.
        elif row.lower().find('c') >= 0:
            return 3.
        elif row.lower().find('d') >= 0:
            return 4.
        elif row.lower().find('e') >= 0:
            return 5.
        elif row.lower().find('f') >= 0:
            return 6.
        elif row.lower().find('g') >= 0:
            return 7.
        elif row.lower().find('h') >= 0:
            return 8.
        return res

    df['cabin_'] = df['Cabin'].apply(lambda r: prep_cabin(r))
    # print(df[['Cabin','cabin_']].head(20))

    df['Name'].fillna('no', inplace=True)

    def prep_name(row):
        res = 0
        if row.lower().find('miss.') >= 0:
            return 1.
        elif row.lower().find('mrs.') >= 0:
            return 2.
        elif row.lower().find('mr.') >= 0:
            return 3.
        elif row.lower().find('master') >= 0:
            return 4.
        return res

    df['name_'] = df['Name'].apply(lambda r: prep_name(r))
    # print(df[['Name','name_']].head(20))

    # --------------------------------------------------------------------
    # Metrics for knn: scale numeric features to [0, 1]
    # (MinMaxScaler expects a 2-D array, hence the reshape/ravel)
    from sklearn import preprocessing
    # scaler = preprocessing.StandardScaler().fit_transform(df['Fare'].values)
    scale = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(
        df['Fare'].astype(float).values.reshape(-1, 1)).ravel()
    df['fare_'] = pd.Series(scale)
    # print(df[['Fare','fare_']].head(20))

    scale = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(
        df['Age'].astype(float).values.reshape(-1, 1)).ravel()
    df['age_'] = pd.Series(scale)

    scale = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(
        df['Pclass'].astype(float).values.reshape(-1, 1)).ravel()
    df['pclass_'] = pd.Series(scale)

    scale = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(
        df['Parch'].astype(float).values.reshape(-1, 1)).ravel()
    df['parch_'] = pd.Series(scale)

    scale = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(
        df['SibSp'].astype(float).values.reshape(-1, 1)).ravel()
    df['sibsp_'] = pd.Series(scale)

    scale = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(
        df['embarked_'].astype(float).values.reshape(-1, 1)).ravel()
    df['embarked_'] = pd.Series(scale)

    scale = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(
        df['cabin_'].astype(float).values.reshape(-1, 1)).ravel()
    df['cabin_'] = pd.Series(scale)
    print(df[['Cabin', 'cabin_']].head(20))

    scale = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(
        df['name_'].astype(float).values.reshape(-1, 1)).ravel()
    df['name_'] = pd.Series(scale)
    # print(df[['Name','name_']].head(20))

    # --------------------------------------------------------------------
    # Feature set (earlier combinations left commented for reference):
    # feature_names = np.array(['sex_', 'Fare'])
    # feature_names = np.array(['sex_', 'Fare', 'Age', 'Parch', 'SibSp'])
    # feature_names = np.array(['sex_', 'fare_', 'Age', 'Parch', 'SibSp'])
    # feature_names = np.array(['sex_', 'fare_', 'age_', 'Pclass', 'Parch', 'SibSp'])
    # feature_names = np.array(['sex_', 'fare_', 'age_', 'pclass_', 'Parch', 'SibSp'])
    # feature_names = np.array(['sex_', 'fare_', 'age_', 'pclass_', 'parch_', 'SibSp'])
    # feature_names = np.array(['sex_', 'fare_', 'age_', 'pclass_', 'parch_', 'SibSp', 'embarked_'])
    # feature_names = np.array(['sex_', 'Fare', 'Age', 'Pclass', 'Parch', 'SibSp', 'embarked_'])
    # feature_names = np.array(['sex_', 'fare_', 'pclass_', 'parch_', 'sibsp_'])
    # feature_names = np.array(['sex_', 'fare_', 'pclass_'])
    # feature_names = np.array(['sex_', 'fare_'])
    # feature_names = np.array(['sex_', 'fare_', 'age_', 'pclass_', 'parch_', 'sibsp_', 'embarked_'])
    # feature_names = np.array(['sex_', 'fare_', 'age_', 'pclass_', 'parch_', 'sibsp_'])
    # feature_names = np.array(['sex_', 'fare_', 'pclass_', 'name_', 'cabin_'])
    feature_names = np.array(['sex_', 'fare_', 'age_', 'pclass_', 'name_',
                              'cabin_', 'parch_', 'sibsp_'])
    label_name = ['Survived']

    # Generate feature combinations
    print(feature_names)
    fc = []
    flen = len(feature_names)
    for fl in np.arange(flen) + 1:
        c = itertools.combinations(feature_names, fl)
        for s in c:
            # print(s)
            fc.append(np.array(s))
    Feature_list = fc
    lFl = len(fc)
    # for f in Feature_list:
    #     print(f)

    feat_c = 0
    # Run over all feature combinations on different classifiers
    for fn in Feature_list:
        # KNN model: run over several neighbor counts
        for nn in (1, 5, 10, 15):
            e_in, e_out = knn(df, label_name, fn, lFl, feat_c, n_neighbors=nn)
            Model_name.append('KNN k' + str(nn))
            E_in.append(e_in)
            E_out.append(e_out)
            iFeatures.append(feat_c)
            Features.append(fn)
        # DTree
        e_in, e_out = dtree(df, label_name, fn, lFl, feat_c)
        Model_name.append('DTree')
        E_in.append(e_in)
        E_out.append(e_out)
        iFeatures.append(feat_c)
        Features.append(fn)
        # SVM
        # e_in, e_out = svm(df, label_name, fn, lFl, feat_c, C=10.)
        e_in, e_out = svm(df, label_name, fn, lFl, feat_c, C=1000.)
        # e_in, e_out = svm(df, label_name, fn, lFl, feat_c, C=100000.)
        Model_name.append('SVM')
        E_in.append(e_in)
        E_out.append(e_out)
        iFeatures.append(feat_c)
        Features.append(fn)
        # RF
        e_in, e_out = rf(df, label_name, fn, lFl, feat_c, n_estimators=100)
        # e_in, e_out = rf(df, label_name, fn, lFl, feat_c, n_estimators=1000)
        Model_name.append('RF')
        E_in.append(e_in)
        E_out.append(e_out)
        iFeatures.append(feat_c)
        Features.append(fn)
        # http://scikit-learn.org/stable/modules/ensemble.html#adaboost
        # 200 estimators looked better
        e_in, e_out = adaboost(df, label_name, fn, lFl, feat_c, n_estimators=200)
        Model_name.append('adaboost')
        E_in.append(e_in)
        E_out.append(e_out)
        iFeatures.append(feat_c)
        Features.append(fn)
        feat_c += 1

    # --------------------------------------------------------------------
    # Fill results dataframe
    Model_id = np.arange(len(E_in))
    # modeldf = pd.DataFrame({'Name': Model_name, 'E_in': E_in, 'E_out': E_out,
    #                         'iFeatures': iFeatures}, index=Model_id)
    modeldf = pd.DataFrame({'Name': Model_name, 'E_in': E_in, 'E_out': E_out,
                            'Features': Features}, index=Model_id)
    # Sort by best performing models
    modeldf.sort_values(by=['E_out', 'E_in'], ascending=[True, True], inplace=True)
    # print(modeldf.head())
    modeldf.to_csv('result_brute_test.csv')

    # --------------------------------------------------------------------
    # Print out best performing models
    # nbest = 10
    # best_model = modeldf.ix[modeldf['E_out'].argsort().values[:nbest]]
    print(modeldf.head(20))
    # for i in best_model['iFeatures'].values:
    #     print(i, Feature_list[i])

    # --------------------------------------------------------------------
    # Plot performance vs models
    plt.rc('text', usetex=True)
    line_ein, = plt.plot(Model_id, np.array(E_in) * 100.,
                         label=r'$E_{in}$ ("in sample")')
    line_eout, = plt.plot(Model_id, np.array(E_out) * 100.,
                          label=r'$E_{out}$ ("out of sample")')
    plt.title('')
    plt.xlabel('Model Id')
    plt.ylabel('Error Rate (\%)')
    plt.legend(handles=[line_ein, line_eout])
def forecast():
    global selected_algorithm, selected_optimizer, image_name
    selected_algorithm = request.form.get('alist')
    selected_optimizer = request.form.get('olist')
    image_name = "default"
    chart_list = glob.glob("static/images/*.png")
    for chart in chart_list:
        os.remove(chart)
    selected_file = request.form.get('flist')
    if selected_file == "default":
        # selected_algorithm = " "
        filelist = os.listdir("D:\\uploads")
        return render_template('upload2.html', filenames=filelist,
                               image_name=image_name,
                               selected_algorithm=selected_algorithm,
                               selected_optimizer=selected_optimizer,
                               error=True)
    print(selected_file)
    file_path = "data\\" + selected_file
    print("file path - {}".format(file_path))
    energyData = pd.read_csv(file_path, header=0)

    def lstm(energyData):
        energyData.head()
        energyData = energyData.drop(['REGION'], axis=1)
        energyData = energyData.drop(['PERIODTYPE'], axis=1)
        energyData = energyData.drop(['RRP( Regional reference price)'], axis=1)
        # energyData = energyData.drop(['TOTALDEMAND'], axis=1)
        energyData = energyData.rename(
            columns={"RRP( Regional reference price)": "RRP"})
        energyData.tail()

        def univariate_data(dataset, start_index, end_index, history_size,
                            target_size):
            data = []
            labels = []
            start_index = start_index + history_size
            if end_index is None:
                end_index = len(dataset) - target_size
            for i in range(start_index, end_index):
                indices = range(i - history_size, i)
                # Reshape data from (history_size,) to (history_size, 1)
                data.append(np.reshape(dataset[indices], (history_size, 1)))
                labels.append(dataset[i + target_size])
            return np.array(data), np.array(labels)

        TRAIN_SPLIT = 1339
        tf.random.set_random_seed(13)
        uni_data = energyData['TOTALDEMAND']
        uni_data.index = energyData['SETTLEMENTDATE']
        uni_data.head()
        uni_data = uni_data.values
        uni_train_mean = uni_data[:TRAIN_SPLIT].mean()
        uni_train_std = uni_data[:TRAIN_SPLIT].std()
        uni_data = (uni_data - uni_train_mean) / uni_train_std
        univariate_past_history = 20
        univariate_future_target = 0
        x_train_uni, y_train_uni = univariate_data(uni_data, 0, TRAIN_SPLIT,
                                                   univariate_past_history,
                                                   univariate_future_target)
        x_val_uni, y_val_uni = univariate_data(uni_data, TRAIN_SPLIT, None,
                                               univariate_past_history,
                                               univariate_future_target)
        print(x_train_uni.shape[-2:])
        print('Single window of past history')
        print(x_train_uni[0])
        print('\n Target Demand to predict')
        print(y_train_uni[0])

        from keras.layers import LSTM
        simple_lstm_model = Sequential()
        simple_lstm_model.add(LSTM(50, input_shape=x_train_uni.shape[-2:]))
        simple_lstm_model.add(Dense(1))
        simple_lstm_model.compile(optimizer='adam', loss='mae')
        print(simple_lstm_model.predict(x_val_uni).shape)
        EVALUATION_INTERVAL = 200
        EPOCHS = 10
        simple_lstm_model.fit(x_train_uni, y_train_uni, epochs=EPOCHS,
                              steps_per_epoch=EVALUATION_INTERVAL,
                              validation_data=(x_val_uni, y_val_uni),
                              validation_steps=50)
        y_pred = simple_lstm_model.predict(x_val_uni)
        plt.plot(y_val_uni, color='red', label='Actual Energy Demand')
        plt.plot(y_pred, color='blue', label='Predicted Energy Demand')
        plt.title('Energy demand - Actual vs Predicted')
        plt.ylabel('Demand')
        plt.legend()
        val = random.randrange(1, 500)
        global image_name
        image_name = "chart" + str(val) + '.png'
        plt.savefig(os.path.join(os.getcwd(), 'static/images', image_name),
                    format='png')
        from sklearn.metrics import mean_squared_error
        from math import sqrt
        rms = sqrt(mean_squared_error(y_val_uni, y_pred))
        global RMSE
        RMSE = rms
        print(rms)

    def lstm2(energyData):
        energyData.head()
        energyData = energyData.drop(['REGION'], axis=1)
        energyData = energyData.drop(['PERIODTYPE'], axis=1)
        energyData = energyData.drop(['RRP( Regional reference price)'], axis=1)
        # energyData = energyData.drop(['TOTALDEMAND'], axis=1)
        energyData = energyData.rename(
            columns={"RRP( Regional reference price)": "RRP"})
        energyData.tail()

        def univariate_data(dataset, start_index, end_index, history_size,
                            target_size):
            data = []
            labels = []
            start_index = start_index + history_size
            if end_index is None:
                end_index = len(dataset) - target_size
            for i in range(start_index, end_index):
                indices = range(i - history_size, i)
                # Reshape data from (history_size,) to (history_size, 1)
                data.append(np.reshape(dataset[indices], (history_size, 1)))
                labels.append(dataset[i + target_size])
            return np.array(data), np.array(labels)

        TRAIN_SPLIT = 1339
        tf.random.set_random_seed(13)
        uni_data = energyData['TOTALDEMAND']
        uni_data.index = energyData['SETTLEMENTDATE']
        uni_data.head()
        uni_data = uni_data.values
        uni_train_mean = uni_data[:TRAIN_SPLIT].mean()
        uni_train_std = uni_data[:TRAIN_SPLIT].std()
        uni_data = (uni_data - uni_train_mean) / uni_train_std
        univariate_past_history = 20
        univariate_future_target = 0
        x_train_uni, y_train_uni = univariate_data(uni_data, 0, TRAIN_SPLIT,
                                                   univariate_past_history,
                                                   univariate_future_target)
        x_val_uni, y_val_uni = univariate_data(uni_data, TRAIN_SPLIT, None,
                                               univariate_past_history,
                                               univariate_future_target)
        print(x_train_uni.shape[-2:])
        print('Single window of past history')
        print(x_train_uni[0])
        print('\n Target Demand to predict')
        print(y_train_uni[0])

        from keras.layers import LSTM
        simple_lstm_model = Sequential()
        simple_lstm_model.add(LSTM(50, input_shape=x_train_uni.shape[-2:]))
        simple_lstm_model.add(Dense(1))
        # lstm2 differs from lstm only in using the user-selected optimizer
        simple_lstm_model.compile(optimizer=selected_optimizer, loss='mae')
        print(simple_lstm_model.predict(x_val_uni).shape)
        EVALUATION_INTERVAL = 200
        EPOCHS = 10
        simple_lstm_model.fit(x_train_uni, y_train_uni, epochs=EPOCHS,
                              steps_per_epoch=EVALUATION_INTERVAL,
                              validation_data=(x_val_uni, y_val_uni),
                              validation_steps=50)
        y_pred = simple_lstm_model.predict(x_val_uni)
        plt.plot(y_val_uni, color='red', label='Actual Energy Demand')
        plt.plot(y_pred, color='blue', label='Predicted Energy Demand')
        plt.title('Energy demand - Actual vs Predicted')
        plt.ylabel('Demand')
        plt.legend()
        val = random.randrange(1, 500)
        global image_name
        image_name = "chart" + str(val) + '.png'
        plt.savefig(os.path.join(os.getcwd(), 'static/images', image_name),
                    format='png')
        from sklearn.metrics import mean_squared_error
        from math import sqrt
        rms = sqrt(mean_squared_error(y_val_uni, y_pred))
        global RMSE
        RMSE = rms
        print(rms)

    def Gradient_booster(energyData):
        df = energyData[['SETTLEMENTDATE', 'TOTALDEMAND']]
        df['SETTLEMENTDATE'] = pd.to_datetime(df['SETTLEMENTDATE'], dayfirst=True)
        print(df.head())
        print(df.dtypes)
        from sklearn import preprocessing
        minmaxscaler = preprocessing.MinMaxScaler()
        array_y = np.array(df['TOTALDEMAND'])
        normalized_y = minmaxscaler.fit_transform(array_y.reshape(-1, 1))
        X = np.array(df['SETTLEMENTDATE'])
        X = X.astype(float)
        from sklearn.model_selection import train_test_split
        xTrain, xTest, yTrain, yTest = train_test_split(X, normalized_y,
                                                        test_size=0.2,
                                                        random_state=0)
        from sklearn.ensemble import GradientBoostingRegressor
        gbrt = GradientBoostingRegressor(n_estimators=150, learning_rate=0.02,
                                         subsample=.5, max_depth=8)
        gbrt.fit(xTrain.reshape(-1, 1), yTrain.reshape(-1, 1))
        y_pred = gbrt.predict(xTest.reshape(-1, 1))
        plt.figure(figsize=(9, 6))
        plt.plot(minmaxscaler.inverse_transform(yTest.reshape(-1, 1)),
                 color='red', label='Actual Energy Demand')
        plt.plot(minmaxscaler.inverse_transform(y_pred.reshape(-1, 1)),
                 color='blue', label='Predicted Energy Demand')
        plt.title('Energy demand - Actual vs Predicted')
        plt.ylabel('Demand')
        plt.legend()
        val = random.randrange(1, 500)
        global image_name
        image_name = "chart" + str(val) + '.png'
        plt.savefig(os.path.join(os.getcwd(), 'static/images', image_name),
                    format='png')
        from sklearn.metrics import mean_squared_error
        from math import sqrt
        Grms = sqrt(mean_squared_error(yTest, y_pred))
        print("Root Mean Square error : {}".format(Grms))
        global RMSE
        RMSE = Grms
        from sklearn.metrics import mean_absolute_error
        mean_absolute_error(yTest, y_pred)

    def decision_tree(energyData):
        df = energyData[['SETTLEMENTDATE', 'TOTALDEMAND']]
        df['SETTLEMENTDATE'] = pd.to_datetime(df['SETTLEMENTDATE'], dayfirst=True)
        from sklearn import preprocessing
        minmaxscaler = preprocessing.MinMaxScaler()
        array_y = np.array(df['TOTALDEMAND'])
        normalized_y = minmaxscaler.fit_transform(array_y.reshape(-1, 1))
        X = np.array(df['SETTLEMENTDATE'])
        X = X.astype(float)
        from sklearn.model_selection import train_test_split
        xTrain, xTest, yTrain, yTest = train_test_split(X, normalized_y,
                                                        test_size=0.2,
                                                        random_state=0)
        from sklearn.tree import DecisionTreeRegressor
        regressor = DecisionTreeRegressor(random_state=0, max_depth=75,
                                          min_samples_split=2,
                                          max_leaf_nodes=None)
        regressor.fit(xTrain.reshape(-1, 1), yTrain.reshape(-1, 1))
        y_predict = regressor.predict(xTest.reshape(-1, 1))
        plt.figure(figsize=(9, 6))
        plt.plot(minmaxscaler.inverse_transform(yTest.reshape(-1, 1)),
                 color='red', label='Actual Energy Demand')
        plt.plot(minmaxscaler.inverse_transform(y_predict.reshape(-1, 1)),
                 color='blue', label='Predicted Energy Demand')
        plt.title('Energy demand - Actual vs Predicted')
        plt.ylabel('Demand')
        plt.legend()
        val = random.randrange(1, 500)
        global image_name
        image_name = "chart" + str(val) + '.png'
        plt.savefig(os.path.join(os.getcwd(), 'static/images', image_name),
                    format='png')
        from sklearn.metrics import mean_squared_error
        from math import sqrt
        Drms = sqrt(mean_squared_error(yTest, y_predict))
        print("Root Mean Square Error : {}".format(Drms))
        global RMSE
        RMSE = Drms
        from sklearn.metrics import mean_absolute_error
        mean_absolute_error(yTest, y_predict)

    def svm(file_path):
        fontsize = 18
        data_file = pd.read_csv(file_path, parse_dates=True, index_col=1)
        data_file = data_file.drop(['REGION'], axis=1)
        data_file = data_file.drop(['PERIODTYPE'], axis=1)
        # Set weekends and holidays to 1, otherwise 0
        data_file['Atypical_Day'] = np.zeros(len(data_file['TOTALDEMAND']))
        # Weekends
        data_file['Atypical_Day'][(data_file.index.dayofweek == 5) |
                                  (data_file.index.dayofweek == 6)] = 1
        data_file.head(50)
        # Create new column for each hour of day; assign 1 if index.hour is
        # the corresponding hour of the column, 0 otherwise
        for i in range(0, 48):
            data_file[i] = np.zeros(len(data_file['TOTALDEMAND']))
            data_file[i][data_file.index.hour == i] = 1
        # Example 3am
        data_file[3][:6]
        # Add historic usage to each X vector
        # Set number of hours prediction is in advance
        n_hours_advance = 1
        # Set number of historic hours used
        n_hours_window = 6
        for k in range(n_hours_advance, n_hours_advance + n_hours_window):
            data_file['TOTALDEMAND_t-%i' % k] = np.zeros(len(data_file['TOTALDEMAND']))
        for i in range(n_hours_advance + n_hours_window, len(data_file['TOTALDEMAND'])):
            for j in range(n_hours_advance, n_hours_advance + n_hours_window):
                data_file['TOTALDEMAND_t-%i' % j][i] = data_file['TOTALDEMAND'][i - j]
        # Define training and testing periods
        train_start = '1-march-2019'
        train_end = '23-march-2019'
        test_start = '24-march-2019'
        test_end = '1-april-2019'
        # Split up into training and testing sets (still in Pandas dataframes)
        X_train_df = data_file[train_start:train_end]
        y_train_df = data_file['TOTALDEMAND'][train_start:train_end]
        X_test_df = data_file[test_start:test_end]
        y_test_df = data_file['TOTALDEMAND'][test_start:test_end]
        N_train = len(X_train_df[0])
        print('Number of observations in the training set: ', N_train)
        # Numpy arrays for sklearn
        X_train = np.array(X_train_df)
        X_test = np.array(X_test_df)
        y_train = np.array(y_train_df)
        y_test = np.array(y_test_df)
        from sklearn import preprocessing as pre
        from sklearn import svm
        scaler = pre.StandardScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        SVR_model = svm.SVR(kernel='rbf', C=100,
                            gamma=.001).fit(X_train_scaled, y_train)
        print('Testing R^2 =', round(SVR_model.score(X_test_scaled, y_test), 3))
        # Use SVR model to calculate predicted next-hour usage
        predict_y_array = SVR_model.predict(X_test_scaled)
        # Put it in a Pandas dataframe for ease of use
        predict_y = pd.DataFrame(predict_y_array, columns=['TOTALDEMAND'])
        predict_y.index = X_test_df.index
        # Plot daily total kWh over the testing period
        y_test_barplot_df = pd.DataFrame(y_test_df, columns=['TOTALDEMAND'])
        y_test_barplot_df['Predicted'] = predict_y['TOTALDEMAND']
        fig = plt.figure(figsize=[11, 7])
        ax = fig.add_subplot(111)
        y_test_barplot_df.plot(kind='line', ax=ax, color=['red', 'blue'])
        ax.grid(False)
        ax.set_ylabel('Electricity Demand (kWh)', fontsize=fontsize)
        ax.set_xlabel('')
        # Pandas/Matplotlib bar graphs convert the x axis to floats, so a hack
        # is needed to get datetimes back
        ax.set_xticklabels(
            [dt.strftime('%b %d') for dt in y_test_df.index.to_pydatetime()],
            rotation=0, fontsize=fontsize)
        plt.title('Energy demand - Actual vs Predicted')
        plt.legend(fontsize=fontsize)
        val = random.randrange(1, 500)
        global image_name
        image_name = "chart" + str(val) + '.png'
        plt.savefig(os.path.join(os.getcwd(), 'static/images', image_name),
                    format='png')
        from sklearn.metrics import mean_squared_error
        from math import sqrt
        SvmRms = sqrt(mean_squared_error(y_test_df, predict_y))
        global RMSE
        RMSE = SvmRms
        print("Root Mean Square Error : {}".format(SvmRms))

    if selected_algorithm == "LSTM":
        lstm(energyData)
    elif selected_algorithm == "Gradient_Booster":
        Gradient_booster(energyData)
    elif selected_algorithm == "Decision_Tree":
        decision_tree(energyData)
    elif selected_algorithm == "SVM":
        svm(file_path)
    elif selected_algorithm == "LSTM2":
        lstm2(energyData)
    return redirect('/')
            y_pred = clf.predict(x_test)
            # print y_pred
            f1.append(f1_score(y_test, y_pred, average='weighted'))
        f_average.append(sum(f1) / len(f1))
    print f_average
    C = [0.01, 0.1, 1, 10, 100]
    plt.title("SVM")
    plt.plot(C, f_average)
    plt.show()
    return f_average


f_average_svm = svm()


def gini():
    classifiers_DT = [DecisionTreeClassifier(criterion="gini", max_leaf_nodes=2),
                      DecisionTreeClassifier(criterion="gini", max_leaf_nodes=5),
                      DecisionTreeClassifier(criterion="gini", max_leaf_nodes=10),
                      DecisionTreeClassifier(criterion="gini", max_leaf_nodes=20)]
    DT_name = "Decision Tree"
    f1 = []
    f_average = []
    for name, clf in zip(DT_name, classifiers_DT):
        for train_index, test_index in kf.split(df):
            x_train, x_test = X[train_index], X[test_index]
            y_train, y_test = Y[train_index], Y[test_index]
            # Prepare the plot for this classifier
print("Accuracy Test Data:", metrics.accuracy_score(y_train, y_hat)) print("Accuracy Test Data:", metrics.accuracy_score(y_test, y_pred)) print("Precision:", metrics.precision_score(y_test, y_pred)) print(confusion_matrix(y_test, y_pred)) print(classification_report(y_test, y_pred)) y_pred_rf = svclassifier.predict_proba(X_test)[:, 1] fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_pred_rf) return fpr_rf, tpr_rf, thresholds_rf fpr_keras, tpr_keras, thresholds_keras = neural_net() auc_neural = auc(fpr_keras, tpr_keras) fpr_rf, tpr_rf, thresholds_rf = svm() auc_SVM = auc(fpr_rf, tpr_rf) def roc_cruve(): plt.figure(1) plt.plot([0, 1], [0, 1], 'k--') plt.plot(fpr_keras, tpr_keras, label='Neural Net (area = {:.3f})'.format(auc_neural)) plt.plot(fpr_rf, tpr_rf, label='SVM (area = {:.3f})'.format(auc_SVM)) plt.xlabel('False positive rate') plt.ylabel('True positive rate') plt.title('ROC curve') plt.legend(loc='best') plt.show()
    fig.suptitle('Predicted versus actual labels', fontsize=14, fontweight='bold')
    # Show the plot
    plt.show()


if __name__ == '__main__':
    # Preprocess data: shift the distribution of each attribute to have a mean
    # of zero and a standard deviation of one (unit variance)
    # from sklearn.preprocessing import scale
    # Apply `scale()` to the `digits` data
    data = scale(digits.data)

    # Split data into training and test sets
    # from sklearn.cross_validation import train_test_split
    X_train, X_test, y_train, y_test, images_train, images_test = train_test_split(
        data, digits.target, digits.images, test_size=0.25, random_state=42)

    # Use either k_means() or svm()
    svm()
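# The svm() dispatched above is not shown; a minimal sketch consistent with the
# digits workflow, assuming the split variables prepared under __main__ are in
# scope when it is called (illustrative only):
from sklearn.svm import SVC
from sklearn.metrics import classification_report

def svm():
    clf = SVC(gamma=0.001)
    clf.fit(X_train, y_train)
    print(classification_report(y_test, clf.predict(X_test)))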
def main(): print("Starting application..\n") traffic = normalize(filename_adj) print("1 - Display the entropy list") print("2 - Display the information gain list") print("3 - Display the chi-squared list") print("4 - Display the ReliefF list") selection = input("Enter your selection: ") #num_features = input("Enter the number of features to select: ") if (selection == 1): ent_list(traffic) elif (selection == 2): #sorted_gain_list = ig_list(traffic) sorted_gain_list = [ 'land', 'urgent', 'wrong_fragment', 'rerror_rate', 'srv_rerror_rate', 'count', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'duration', 'flag', 'serror_rate', 'srv_serror_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'diff_srv_rate', 'same_srv_rate', 'dst_host_same_srv_rate', 'srv_diff_host_rate', 'dst_host_diff_srv_rate', 'dst_host_srv_count', 'dst_host_srv_diff_host_rate', 'dst_host_count', 'protocol_type', 'srv_count', 'dst_host_same_src_port_rate', 'dst_bytes', 'service', 'src_bytes' ] print("Features selected using Information Gain.") elif (selection == 3): #sorted_chi2_list = chi2_list() print("Features selected using Chi-squared.") sorted_chi2_list = [ 'dst_host_same_srv_rate', 'count', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'wrong_fragment', 'dst_host_rerror_rate', 'diff_srv_rate', 'dst_host_srv_rerror_rate', 'dst_host_diff_srv_rate', 'serror_rate', 'srv_serror_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'flag', 'dst_host_same_src_port_rate', 'protocol_type', 'srv_diff_host_rate', 'dst_host_srv_diff_host_rate', 'dst_host_srv_count', 'service', 'land', 'urgent', 'dst_host_count', 'srv_count', 'duration', 'src_bytes', 'dst_bytes' ] elif (selection == 4): #X,Y = create_dataframe() #obj = reliefF() #sorted_reliefF_list = obj.fit_transform(X, Y) sorted_reliefF_list = [ 'srv_count', 'src_bytes', 'dst_bytes', 'dst_host_count', 'srv_diff_host_rate', 'dst_host_srv_diff_host_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_count', 'dst_host_diff_srv_rate', 'dst_host_same_srv_rate', 'same_srv_rate', 'diff_srv_rate', 'duration', 'dst_host_srv_rerror_rate', 'dst_host_rerror_rate', 'dst_host_srv_serror_rate', 'dst_host_serror_rate', 'service', 'serror_rate', 'srv_serror_rate', 'srv_rerror_rate', 'rerror_rate', 'flag', 'count', 'protocol_type', 'land', 'urgent', 'wrong_fragment' ] print("Features selected using reliefF.") else: print("Invalid selection") print("\n") print("*****************") print("Classification") print("*****************") print("1 - Naive Bayes ") print("2 - SVM") print("3 - Decision Tree") print("4 - Random Forest") clx_selection = input("Enter your selection: ") book = xlwt.Workbook(encoding="utf-8") sheet1 = book.add_sheet("Sheet 1") sheet1.write(0, 2, "Accuracy Values") for i in range(1, 2): num_features = i print("Number of features selected - ", i) if (selection == 2 and clx_selection == 1): print("Classifying using Naive Bayes...") accuracy = naive_bayes(sorted_gain_list, num_features) sheet1.write(i, 2, accuracy) elif (selection == 3 and clx_selection == 1): print("Classifying using Naive Bayes...") accuracy = naive_bayes(sorted_chi2_list, num_features) sheet1.write(i, 5, accuracy) elif (selection == 4 and clx_selection == 1): print("Classifying using Naive Bayes...") accuracy = naive_bayes(sorted_reliefF_list, num_features) sheet1.write(i, 2, accuracy) elif (selection == 2 and clx_selection == 2): print("Classifying using SVM...") accuracy = svm(sorted_gain_list, num_features) sheet1.write(i, 3, accuracy) elif (selection == 3 and clx_selection == 2): 
print("Classifying using SVM...") accuracy = svm(sorted_chi2_list, num_features) sheet1.write(i, 3, accuracy) elif (selection == 4 and clx_selection == 2): print("Classifying using SVM...") accuracy = svm(sorted_reliefF_list, num_features) sheet1.write(i, 3, accuracy) elif (selection == 2 and clx_selection == 3): print("Classifying using Decision Trees...") accuracy = decision_tree(sorted_gain_list, num_features) sheet1.write(i, 3, accuracy) elif (selection == 3 and clx_selection == 3): print("Classifying using Decision Trees...") accuracy = decision_tree(sorted_chi2_list, num_features) sheet1.write(i, 3, accuracy) elif (selection == 4 and clx_selection == 3): print("Classifying using Decision Trees...") accuracy = decision_tree(sorted_reliefF_list, num_features) sheet1.write(i, 3, accuracy) elif (selection == 2 and clx_selection == 4): print("Classifying using Random Forest...") accuracy = randomForest(sorted_gain_list, num_features) sheet1.write(i, 6, accuracy) elif (selection == 3 and clx_selection == 4): print("Classifying using Random Forest...") accuracy = randomForest(sorted_chi2_list, num_features) sheet1.write(i, 6, accuracy) elif (selection == 4 and clx_selection == 4): print("Classifying using Random Forest...") accuracy = randomForest(sorted_reliefF_list, num_features) sheet1.write(i, 6, accuracy) else: print("Invalid Selection") print("End") book.save("results.xls")
    return scores.mean()


def svm(X, y):
    from sklearn import svm
    scores = cross_val_score(svm.SVC(C=1, kernel='linear'), X, y, cv=5,
                             scoring='accuracy')
    # print(scores)
    return scores.mean()


if __name__ == '__main__':
    # load data
    data = read_csv('/Users/Bian/Desktop/mockData/StatiscalAnalysis.csv',
                    delimiter=",", skiprows=0)
    data = data.values  # as_matrix() was removed from pandas; .values is equivalent
    X = data[:, 2:]
    y = data[:, 0].astype(int)
    # dimension reduction via LDA
    lda = LDA(n_components=2)
    X = lda.fit_transform(X, y)
    # compute model accuracy
    print('Decision tree accuracy: {0}'.format(decisionTree(X, y)))
    print('KNN accuracy: {0}'.format(knn(X, y)))
    print('Logistic regression accuracy: {0}'.format(logistic(X, y)))
    print('SVM accuracy: {0}'.format(svm(X, y)))
# SVM
from sklearn import svm  # import the model library

svm_clf = svm.SVC()
parameters = {'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
              'degree': [1, 3, 5, 7],
              'gamma': ('scale', 'auto')}
clf = GridSearchCV(svm_clf, parameters)
clf.fit(x_train, y_train)
print(clf.best_params_)

# refit a model with the chosen parameters
# (GridSearchCV also exposes the refitted winner as clf.best_estimator_)
model = svm.SVC(gamma='scale', degree=1, kernel='linear', C=1)
model.fit(x_train, y_train)
# printing the results of evaluating the model over the testing set
print("test accuracy: {} ".format(model.score(x_test, y_test)))
# printing the results of evaluating the model over the training set
print("train accuracy: {} ".format(model.score(x_train, y_train)))
# 0.9801

# XGBoost
from xgboost import XGBClassifier

model = XGBClassifier()
parameters = {'max_depth': [2, 4, 6, 8, 10],
def main():
    st.title("ABC Corp..")
    st.title("Automated Machine Learning Web (POC)")
    file_name = '' + datetime.now().strftime("%d%b%Y_%H%M%S%f") + '.csv'
    data_file = './DataDump/file' + file_name
    file_bytes = st.file_uploader("Upload a file")
    data_load_state = st.text("Upload your data")
    try:
        if file_bytes is not None:
            with open(data_file, mode='w', newline='') as f:
                print(file_bytes.getvalue().strip('\r\n'), file=f)
            data_load_state.text("Upload....Done!")
            # dataDF = pd.read_csv(data_file)
    except FileNotFoundError:
        st.error('File not found.')
    st.header("Data Exploration")
    st.sidebar.header("Data Exploration")
    X = ""
    y = ""
    X_train = ''
    X_test = ''
    y_train = ''
    y_test = ''
    y_pred = ''

    @st.cache
    def load_data():
        data = pd.read_csv(data_file)
        # st.write(data.head())
        return data

    if st.sidebar.checkbox("Show Data HEAD or TAIL"):
        select_option = st.radio("Select option", ['HEAD', 'TAIL'])
        if select_option == 'HEAD':
            st.write(load_data().head())
        elif select_option == "TAIL":
            st.write(load_data().tail())
    if st.sidebar.checkbox("Show Full Data"):
        st.write(load_data())
        data_load_state.text("Loading data....Done!")
    if st.sidebar.checkbox("Data Info"):
        st.text("Data Shape")
        st.write(load_data().shape)
        st.text("Data Columns")
        st.write(load_data().columns)
        st.text("Data Type")
        st.write(load_data().dtypes)
        st.text("Count of NaN values")
        st.write(load_data().isnull().any().sum())
    st.markdown("Select Target Column")
    try:
        if file_bytes is not None:
            all_columns = load_data().columns
            dataDF = load_data()
            target = st.selectbox("Select", all_columns)
            if dataDF[target].dtype == "object":
                label_encoder = LabelEncoder()
                dataDF[target] = label_encoder.fit_transform(dataDF[target])
    except:
        st.markdown('File not found.')
    st.markdown("Auto Discard Columns")
    try:
        if file_bytes is not None:
            for column in dataDF:
                if dataDF[column].nunique() == dataDF.shape[0]:
                    dataDF.drop([column], axis=1, inplace=True)
            for column in dataDF:
                if 'name' in column.lower():
                    dataDF.drop([column], axis=1, inplace=True)
            st.text("Data Columns")
            st.write(dataDF.columns)
            st.text("Count of NaN values")
            st.write(dataDF.isnull().any().sum())
    except:
        st.markdown('File not found.')
    st.markdown("Preprocess Object Type Columns")
    try:
        if file_bytes is not None:
            obj_df = dataDF.select_dtypes(include=['object']).copy()
            dataDF = dataDF.select_dtypes(exclude=['object'])
            try:
                one_hot = pd.get_dummies(obj_df)  # , drop_first=True)
            except Exception as e:
                print("There has been an exception: ", e)
                one_hot = pd.DataFrame()
            dataDF = pd.concat([one_hot, dataDF], axis=1)
    except:
        st.markdown('File not found.')
    sc = StandardScaler()
    st.header("Split DataSet into Train and Test")
    st.sidebar.header("Split DataSet into Train and Test")
    st.markdown("Split")
    try:
        if file_bytes is not None:
            # print(dataDF.dtypes)
            X = dataDF.drop([target], axis=1)
            # X = X.apply(normalize)
            y = dataDF[target]
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=20)
    except:
        st.markdown('File not found.')
    st.markdown("Normalize Columns")
    try:
        if file_bytes is not None:
            from sklearn.preprocessing import MinMaxScaler
            norm = MinMaxScaler()
            X_train = norm.fit_transform(X_train)
            X_test = norm.transform(X_test)
            X = norm.transform(X)
    except:
        st.markdown('File not found.')
    if st.sidebar.checkbox("Show X_test,X_train,y_test,y_train"):
        st.write("X_train")
        st.write(X_train)
        st.write(X_train.shape)
        st.write("X_test")
        st.write(X_test)
        st.write(X_test.shape)
        st.write("y_train")
        st.write(y_train)
        st.write(y_train.shape)
        st.write("y_test")
        st.write(y_test)
        st.write(y_test.shape)
    def gradBoost(X, y):
        from sklearn.ensemble import GradientBoostingClassifier
        gradientBoosting = GradientBoostingClassifier()
        gradientBoosting.fit(X, y)
        return gradientBoosting

    def randForest(X, y):
        from sklearn.ensemble import RandomForestClassifier
        randomForest = RandomForestClassifier()
        randomForest.fit(X, y)
        return randomForest

    def svm(X, y):
        from sklearn import svm
        clf = svm.SVC()
        clf.fit(X, y)
        return clf

    def xgb(X, y):
        import xgboost as xgboost
        xg_reg = xgboost.XGBRegressor()
        xg_reg.fit(X, y)
        return xg_reg

    def linearReg(X, y):
        from sklearn.linear_model import LinearRegression
        lineReg = LinearRegression()
        lineReg.fit(X, y)
        return lineReg

    def lassoReg(X, y):
        from sklearn.linear_model import Lasso
        lasso = Lasso(alpha=0.01)
        lasso.fit(X, y)
        return lasso

    st.subheader("ML Algorithms")
    try:
        if file_bytes is not None:
            st.write("Available algorithms are:")
            st.write("Binary Classification: GB Classifier, RF Classifier, SVM")
            st.write("Regression: OLS, XGB, Lasso Regression")
            if dataDF[target].nunique() == 2:
                st.header("Using Binary Classification Algorithms")
                GB = gradBoost(X_train, y_train)
                st.write('Accuracy of Gradient Boosting classifier on test set: '
                         '{:.2f}'.format(GB.score(X_test, y_test)))
                RF = randForest(X_train, y_train)
                st.write('Accuracy of Random Forest classifier on test set: '
                         '{:.2f}'.format(RF.score(X_test, y_test)))
                SVM = svm(X_train, y_train)
                st.write('Accuracy of SVM classifier on test set: '
                         '{:.2f}'.format(SVM.score(X_test, y_test)))
            elif dataDF[target].nunique() / dataDF[target].count() < .1:
                st.header("Using Multi-Class Classification Algorithms")
                GB = gradBoost(X_train, y_train)
                st.write('Accuracy of Gradient Boosting classifier on test set: '
                         '{:.2f}'.format(GB.score(X_test, y_test)))
                st.write(classification_report(y_test, GB.predict(X_test)))
                RF = randForest(X_train, y_train)
                st.write('Accuracy of Random Forest classifier on test set: '
                         '{:.2f}'.format(RF.score(X_test, y_test)))
                st.write(classification_report(y_test, RF.predict(X_test)))
            else:
                st.header("Using Regression Algorithms")
                from sklearn.metrics import mean_squared_error, r2_score
                LReg = linearReg(X_train, y_train)
                st.write('R-squared value for Linear Regression predictor on test set: '
                         '{:.2f}%'.format(r2_score(y_test, LReg.predict(X_test))))
                XGB = xgb(X_train, y_train)
                st.write('R-squared value for eXtreme Gradient Boosting Regression '
                         'predictor on test set: '
                         '{:.2f}%'.format(r2_score(y_test, XGB.predict(X_test))))
                LassReg = lassoReg(X_train, y_train)
                st.write('R-squared value for Lasso Regression predictor on test set: '
                         '{:.2f}%'.format(r2_score(y_test, LassReg.predict(X_test))))
    except:
        st.markdown('File not found.')
    st.header("Run Prediction on Test Set")
    st.subheader("Select Desired Algorithm")
    try:
        if file_bytes is not None:
            if dataDF[target].nunique() == 2:
                selectML = st.selectbox("Select", ['Gradient Boosting classifier',
                                                   'Random Forest classifier',
                                                   'SVM classifier'])
                if selectML == 'Gradient Boosting classifier':
                    dML = GB
                elif selectML == 'Random Forest classifier':
                    dML = RF
                elif selectML == 'SVM classifier':
                    dML = SVM
            elif dataDF[target].nunique() / dataDF[target].count() < .1:
                selectML = st.selectbox("Select", ['Gradient Boosting classifier',
                                                   'Random Forest classifier'])
                if selectML == 'Gradient Boosting classifier':
                    dML = GB
                elif selectML == 'Random Forest classifier':
                    dML = RF
            else:
                selectML = st.selectbox("Select", [
                    'Linear Regression predictor',
                    'eXtreme Gradient Boosting Regression predictor',
                    'Lasso Regression predictor'])
                if selectML == 'Linear Regression predictor':
                    dML = LReg
                elif selectML == 'eXtreme Gradient Boosting Regression predictor':
                    dML = XGB
                elif selectML == 'Lasso Regression predictor':
                    dML = LassReg
            data_test = './DataDump/file' + datetime.now().strftime(
                "%d%b%Y_%H%M%S%f") + '.csv'
            file_test = st.file_uploader("Upload test file")
            try:
                if file_bytes is not None:
                    with open(data_test, mode='w', newline='') as f:
                        print(file_test.getvalue().strip('\r\n'), file=f)
                    data_load_state.text("Upload....Done!")
                    dataDF1 = pd.read_csv(data_test)
            except FileNotFoundError:
                st.error('File not found.')
    except:
        st.markdown('File not found.')
    st.subheader("PREDICT")
    try:
        if file_bytes is not None:
            for column in dataDF1:
                if dataDF1[column].nunique() == dataDF1.shape[0]:
                    dataDF1.drop([column], axis=1, inplace=True)
            for column in dataDF1:
                if 'name' in column.lower():
                    dataDF1.drop([column], axis=1, inplace=True)
            obj_df1 = dataDF1.select_dtypes(include=['object']).copy()
            dataDF1 = dataDF1.select_dtypes(exclude=['object'])
            try:
                one_hot1 = pd.get_dummies(obj_df1)  # , drop_first=True)
            except Exception as e:
                print("There has been an exception: ", e)
                one_hot1 = pd.DataFrame()
            dataDF1 = pd.concat([one_hot1, dataDF1], axis=1)
            X1 = dataDF1.drop([target], axis=1)
            y1 = dataDF1[target]
            X1 = norm.transform(X1)
            st.write('Accuracy of Selected Algorithm on test Dataset: '
                     '{:.2f}'.format(dML.score(X1, y1)))
    except:
        st.markdown('File not found.')
def main():
    data0 = pd.read_csv('data.csv')
    data1 = pd.read_csv('data1.csv')
    data2 = pd.read_csv('data2.csv')
    data3 = pd.read_csv('data3.csv')
    # frames = [data0, data1]
    # data = pd.concat(frames)
    # shuffle the data
    # print(list(data.columns))
    data0 = data0.sample(frac=1).reset_index(drop=True)
    data1 = data1.sample(frac=1).reset_index(drop=True)
    data2 = data2.sample(frac=1).reset_index(drop=True)
    data3 = data3.sample(frac=1).reset_index(drop=True)
    # Calculate length of 30% of the data for testing
    val_text = len(data0.index) * (30.0 / 100.0)
    val1_text = len(data1.index) * (30.0 / 100.0)
    val2_text = len(data2.index) * (30.0 / 100.0)
    val3_text = len(data3.index) * (30.0 / 100.0)
    # divide training and test data
    test_data0 = data0.tail(int(val_text)).reset_index(drop=True)
    training_data0 = data0.head(len(data0.index) - int(val_text))
    test_data1 = data1.tail(int(val1_text)).reset_index(drop=True)
    training_data1 = data1.head(len(data1.index) - int(val1_text))
    test_data2 = data2.tail(int(val2_text)).reset_index(drop=True)
    training_data2 = data2.head(len(data2.index) - int(val2_text))
    test_data3 = data3.tail(int(val3_text)).reset_index(drop=True)
    training_data3 = data3.head(len(data3.index) - int(val3_text))
    # test_data = data.tail(1).reset_index(drop=True)
    # training_data = data.head(1)
    # set labels for svm
    training_labels0 = training_data0['labels'].values
    test_labels0 = test_data0['labels'].values
    training_labels1 = training_data1['labels'].values
    test_labels1 = test_data1['labels'].values
    training_labels2 = training_data2['labels'].values
    test_labels2 = test_data2['labels'].values
    training_labels3 = training_data3['labels'].values
    test_labels3 = test_data3['labels'].values
    # labels for tf classifiers
    print("Reading Data")
    training_features_final0 = readData(training_data0)
    test_features_final0 = readData(test_data0)
    training_features_final1 = readData(training_data1)
    test_features_final1 = readData(test_data1)
    training_features_final2 = readData(training_data2)
    test_features_final2 = readData(test_data2)
    training_features_final3 = readData(training_data3)
    test_features_final3 = readData(test_data3)
    training_features_final = np.concatenate(
        (training_features_final0, training_features_final1,
         training_features_final2, training_features_final3), axis=0)
    test_features_final = np.concatenate(
        (test_features_final0, test_features_final1,
         test_features_final2, test_features_final3), axis=0)
    training_labels = np.concatenate(
        (training_labels0, training_labels1, training_labels2,
         training_labels3), axis=0)
    test_labels = np.concatenate(
        (test_labels0, test_labels1, test_labels2, test_labels3), axis=0)
    training_X = np.array(training_labels)
    test_X = np.array(test_labels)
    training_X = np.reshape(training_X, (len(training_X), 1))
    test_X = np.reshape(test_X, (len(test_X), 1))
    print("Reshaping Data")
    train_cnn = reshapeData(training_features_final)
    test_cnn = reshapeData(test_features_final)
    training_features_final = reshapeList(training_features_final)
    test_features_final = reshapeList(test_features_final)
    n_dim = training_features_final.shape[1]
    while True:
        print('Please choose any one of the options:')
        print('Press 1 for Support Vector Machine')
        print('Press 2 for Logistics Regression')
        print('Press 3 for Naive Bayes Classifier')
        print('Press 4 for Neural Network')
        print('Press 5 for Convolutional Neural Network')
        print('Press 6 to Exit')
        print('Press 7 for all')
        choice = int(raw_input())
        valid_choice = False
        if choice == 6:
            break
        if choice == 1 or choice == 7:
            # SVM classifier using sklearn
            valid_choice = True
            svm(training_features_final, training_labels,
                test_features_final, test_labels)
        if choice == 2 or choice == 7:
            # Logistic Regression using TensorFlow
            valid_choice = True
            learning_rate = 0.1
            training_epochs = 28
            X = tf.placeholder(tf.float32, [None, n_dim])
            Y = tf.placeholder(tf.float32, [None, 1])
            W = tf.Variable(tf.ones([n_dim, 2]))
            logReg(training_features_final, training_X, test_features_final,
                   test_X, learning_rate, training_epochs, X, Y, W)
        if choice == 4 or choice == 7:
            valid_choice = True
            # print("Initialising Neural Network")
            neuralNetKeras(training_features_final, training_X,
                           test_features_final, test_labels, n_dim)
        if choice == 5 or choice == 7:
            valid_choice = True
            # print("Initialising convolutional neural network")
            cnnKeras(train_cnn, training_X, test_cnn, test_X, n_dim)
        if choice == 3 or choice == 7:
            valid_choice = True
            # print("Initialising Naive Bayes")
            naiveBayes(training_features_final, training_labels,
                       test_features_final, test_labels)
        if not valid_choice:
            print('Wrong Choice, Please Choose Again')
        if choice == 7:
            break
    sys.exit()
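# naiveBayes(), like svm(), logReg(), neuralNetKeras() and cnnKeras() above, is
# defined elsewhere in the project. A minimal sketch, assuming scikit-learn's
# GaussianNB (illustrative only, not the original implementation):
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

def naiveBayes(train_features, train_labels, test_features, test_labels):
    clf = GaussianNB().fit(train_features, train_labels)
    acc = accuracy_score(test_labels, clf.predict(test_features))
    print('Naive Bayes accuracy: ' + str(acc))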