def start_at(iteration):
    """Run the select-features / train / evaluate cycle from ``iteration`` on.

    The pass whose number equals ``iteration`` calls
    ``feature_selection.select_features_at(iteration)``; every later pass
    calls ``feature_selection.select_features()``. Each pass then trains and
    saves the model, writes an evaluation for that pass, and prints its
    duration in minutes. A summary with the overall duration is printed at
    the end.

    Args:
        iteration: Number of the first pass to run. Passes are numbered up
            to and including 1000; if ``iteration`` is larger than that, no
            pass is run and only the summary is printed.
    """
    count_of_iterations = iteration - 1
    max_iterations = 1000
    all_start = time.time()

    # Compare with "<" rather than "not ==": a start value beyond
    # max_iterations + 1 can never hit equality and would loop forever.
    while count_of_iterations < max_iterations:
        start = time.time()
        count_of_iterations = count_of_iterations + 1
        print("Starting new iteration... Current:", str(count_of_iterations))

        # Only the first pass resumes feature selection at the given number.
        if count_of_iterations == iteration:
            feature_selection.select_features_at(iteration)
        else:
            feature_selection.select_features()

        # Train the model
        model_training.train_and_save_model()

        # Run the evaluation
        evaluation.write_evaluation(count_of_iterations, False)

        print('Iteration ', str(count_of_iterations), 'finished. Duration: ', str(round((time.time() - start) / 60, 2)), 'min')

    print("all", str(max_iterations), "iterations finished.")
    print("Overall duration:", str(round((time.time() - all_start) / 60, 2)), 'min')
def main(argv):
    """Train and evaluate the Naive Bayes phishing-URL predictor.

    Args:
        argv: Command-line style argument list.
            ``argv[1]`` is the training data file (required).
            ``argv[2]`` is the run mode, ``"random"`` (default) or ``"cv"``.
            ``argv[3]`` is the optional mode parameter: the training ratio
            for ``"random"`` (default 0.8) or the number of folds for
            ``"cv"`` (default 5).

    In ``"random"`` mode the data is split once, a model is trained, stored
    and its accuracy on the held-out part is printed. In ``"cv"`` mode the
    shuffled data is split into k folds and the accuracy averaged over the
    folds is printed. Any other mode prints a usage hint.
    """
    print("Phishing URL predictor - Naive Bayes approach")
    training_file = argv[1]
    option = "random"
    if len(argv) > 2:
        option = argv[2]

    if option == "random":
        # Random shuffle run
        split = 0.8
        if len(argv) == 4:
            split = float(argv[3])
        data = read_data(training_file)
        # Restrict the data to the features returned by fs.select_features;
        # remove this statement to train on every feature.
        data = filter_features(
            data,
            fs.select_features(data, {
                "method": "info_gain",
                "num_features": 14
            }))
        X_train, y_train, X_test, y_test = split_train_test(data, split)
        class_prob, class_feature_value_count = train(X_train, y_train)
        store(class_prob, class_feature_value_count)
        accuracy = predict(X_test, y_test, class_prob,
                           class_feature_value_count)
        print("\n")
        print("Random test-train split. Training ratio = " + str(split) + ".")
        print("\n")
        print("Accuracy = " + str(accuracy) + " %.")
        print("\n")
    elif option == "cv":
        k = 5
        # The fold count is the optional fourth argument. argv[2] holds the
        # mode string "cv" in this branch and cannot be parsed as an integer.
        if len(argv) == 4:
            k = int(argv[3])
        kf = KFold(n_splits=k)
        data = read_data(training_file)
        data = filter_features(
            data,
            fs.select_features(data, {
                "method": "info_gain",
                "num_features": 25
            }))
        np.random.shuffle(data)
        # The last column is the label; everything before it is a feature.
        X = data[:, :-1]
        y = data[:, -1]
        accuracy = 0.0
        for train_idx, test_idx in kf.split(data):
            class_prob, class_feature_value_count = train(
                X[train_idx], y[train_idx])
            accuracy += predict(X[test_idx], y[test_idx], class_prob,
                                class_feature_value_count)
        print("Cross-validated. Folds = " + str(k) + ".")
        print("Average Accuracy over different folds = " + str(accuracy / k))
    else:
        print("Illegal options set. Use either 'random' or 'cv'")
def main():
    """Entry point: make sure both feature files exist, then train.

    Feature extraction and feature selection are each skipped when their
    respective existence check reports that the file is already there.
    """
    # Feature extraction (skipped when the feature file exists)
    features_present = feature_file_exists()
    if not features_present:
        extract_features()

    # Feature selection (skipped when the selected-feature file exists)
    selection_present = select_feature_file_exists()
    if not selection_present:
        select_features()

    # Model training always runs
    train()
'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi' ] data_dict = removeNaN(data_dict) finance = ['salary'] #change finance to get more outliers data_dict = removeOutlier(data_dict, finance) data_dict, new_feature = create_new_feature(data_dict) features_list.append(new_feature) k = 1 index = [] k_list = [] i = 0 while k < 20: index.append(k) new_features_list = list(select_features(data_dict, features_list, k)) new_features_list.insert(0, 'poi') my_dataset = data_dict data = featureFormat(data_dict, new_features_list) labels, features = targetFeatureSplit(data) # Craeting, training and validationg GaussianNB classifier from sklearn.naive_bayes import GaussianNB clf = GaussianNB() from sklearn.cross_validation import train_test_split features_train, features_test, labels_train, labels_test = \ train_test_split(features, labels, test_size=0.3, random_state=42)
def main(): # Read the data from the text files begin = time.time() vocab, train_raw, test_raw = read.read_tweets("../training_set_tweets.txt", "../test_set_tweets.txt") print "Num of Train users:", len(train_raw), "Num of Test users:", len(test_raw) print "Read data:", time.time() - begin # Preprocess the data begin = time.time() vocab, bigrams, train_word, test_word, train_char, test_char = preprocessing.preprocess(train_raw, test_raw) print "Preprocessed the data", time.time() - begin return # Assign ids to words vocab_list = list(vocab) vocab_list.sort() begin = time.time() vocab_dict = {} for i in range(len(vocab_list)): vocab_dict[vocab_list[i]] = i print "Assigned ids to words:", time.time() - begin # Build train and test set num_full_feats = len(vocab_list) + 10 num_train_tweets = 0 num_test_tweets = 0 # num_train_tweets = np.count_nonzero(~np.isnan(train)) # num_test_tweets = np.count_nonzero(~np.isnan(test)) for author_id in train: num_train_tweets += len(train[author_id]) for author_id in test: num_test_tweets += len(test[author_id]) X_train = np.zeros((num_train_tweets, num_full_feats)) y_train = np.zeros(num_train_tweets) X_test = np.zeros((num_test_tweets, num_full_feats)) y_test = np.zeros(num_test_tweets) # Build train and test set num_full_feats = len(vocab_list) + 10 num_train_tweets = 0 num_test_tweets = 0 # num_train_tweets = np.count_nonzero(~np.isnan(train)) # num_test_tweets = np.count_nonzero(~np.isnan(test)) for author_id in train_word: num_train_tweets += len(train_word[author_id]) for author_id in test_word: num_test_tweets += len(test_word[author_id]) X_train = np.zeros((num_train_tweets, num_full_feats)) y_train = np.zeros(num_train_tweets) X_test = np.zeros((num_test_tweets, num_full_feats)) y_test = np.zeros(num_test_tweets) count = 0 for author_id in train_word: for tweet in train_word[author_id]: X_train[count, :] = features.get_full_feats(tweet, vocab_dict) y_train[count] = author_id count += 1 print count count = 0 for author_id 
in test_word: for tweet in test_word[author_id]: X_test[count, :] = features.get_full_feats(tweet, vocab_dict) y_test[count] = author_id count += 1 print count begin = time.time() feats = feature_selection.select_features(X_train, y_train, np.zeros(num_full_feats), 100, "dia") X_train = X_train[:, feats] X_test = X_test[:, feats] print "Features selected:", time.time() - begin begin = time.time() clf = model.train(X_train, y_train) acc, my_acc, preds, scores = model.test(clf, X_test, y_test) print 'time:', time.time()-begin, 'acc:', acc, 'my_acc:', my_acc print 'preds:', preds print 'scores:', scores print (preds == y_test)[:100] print np.count_nonzero(scores > 0) print np.count_nonzero(scores < 0)
# Output directory for plots; created if it does not exist yet.
folder_plots = 'plots/'
os.makedirs(folder_plots, exist_ok=True)

X = tensor_data
# Re-number the annotation rows from 0 (the old index is discarded).
y = annotations.reset_index(drop=True)

# Replace X with the output of extract_df_with_features.
# NOTE(review): presumably a DataFrame that carries a 'recordingID' column and
# the target column, since both are read/dropped just below - confirm.
X = extract_df_with_features(X, y, attributes, [target_class], data_folder)
# X = extract_basic_features(X, y, attributes)

y_target = y[target_class]
# Keep the recording ids aside, then remove the id and target columns from X.
X_ids = X['recordingID']
X = X.drop(['recordingID', target_class], axis=1)

# select the features with feature selection
selected_features = select_features(
    X, y_target, 0.01, attributes, data_folder)  # doesn't work with 10% = (0.1)
# Discard every selected feature that is not a column of X, so that the
# column lookup below cannot hit a missing name.
for f in selected_features:
    if not f in X.columns.values:
        selected_features = selected_features.drop(f)
X = X[selected_features]
print("Number of selected features:", len(selected_features))

# add duration as a feature
#X['duration'] = y['duration']
# X = X[['duration']]  # only duration as feature

# score = (1 - target value) divided by the duration, per row.
y.loc[:, 'score'] = (1 - y.loc[:, target_class]) / y.loc[:, 'duration']
# Only strictly positive scores are used for the fit; `norm.fit` returns the
# two parameters stored as mu and std.
# NOTE(review): `norm` is presumably scipy.stats.norm - confirm the import.
score_values_nozeros = y[y.score > 0].score.values
mu, std = norm.fit(score_values_nozeros)
score_values = y.score.values
import pandas as pd
from sklearn.model_selection import train_test_split
from feature_selection import select_features

# our libraries
import preprocessing
import random_forests
import ann

# Load the semicolon-separated dataset and run the project preprocessing.
dataset = pd.read_csv('data/student-mat.csv', delimiter=";")
dataset = preprocessing.preprocess(dataset)

# Split into attributes (x) and targets (y), then reduce x and bucketize y.
# NOTE(review): the meaning of the literal arguments 3, 16 and 2 is defined
# by the helper functions, not visible here - confirm in those modules.
x, y = preprocessing.split_attributes(dataset, 3)
x = select_features(x, 16)
y=preprocessing.bucketize_y(y,2)

# Frame holding the attribute columns joined with the target columns.
cols = list(x.columns)
cols.extend(y.columns)
new_data = pd.DataFrame(data=x.join(y), columns=cols)

# splitting train and test
# 20% of the rows are held out for testing (fixed random_state, so the split
# is reproducible); the label is the column at index 2 of y.
x_train,x_test,y_train,y_test = train_test_split(x,y.values[:,2],test_size=.2,random_state=0)

# Random forest run on the same split.
cm_rf = random_forests.classify(x_train,x_test,y_train,y_test)

# Neural Network
history = ann.build_and_train_net(x_train,y_train,x_test,y_test)
cm_ann = ann.test_classifier(x_test,y_test)
attributes = ['bvp', 'gsr', 'hrv', 'ibi', 'tmp'] folder_plots = 'plots/' os.makedirs(folder_plots, exist_ok=True) X = tensor_data y = annotations.reset_index(drop=True) X = extract_df_with_features(X, y, attributes, [target_class], data_folder) #X = extract_basic_features(X, y, attributes) y_target = y[target_class] X_ids = X['recordingID'] X = X.drop(['recordingID', target_class], axis=1) # select the features with feature selection selected_features = select_features(X, y_target, 0.025, attributes, data_folder) for f in selected_features: if not f in X.columns.values: selected_features = selected_features.drop(f) X = X[selected_features] # add duration as a feature # X.loc[:, 'duration'] = y.loc[:, 'duration'] #X = X[['duration']] y.loc[:, 'score'] = (1 - y.loc[:, target_class]) / y.loc[:, 'duration'] score_values_nozeros = y[y.score > 0].score.values mu, std = norm.fit(score_values_nozeros) score_values = y.score.values y.loc[:, 'score_normalized'] = gaussian(score_values, mu, std) y.loc[:, 'score_norm_binary'] = pd.cut(y.loc[:, 'score_normalized'],
imgpr.create_masks_and_nrrds(dataPath) # Feature extraction img2use = ["T2"] mask2use = ["M+"] paramsPath = "../Params.yaml" fextr.extract_features_from_all(dataPath, img2use, mask2use, paramsPath, selectionFeaturesPath, manualFeaturesPath) # Feature selection FSmethod = 'MRMR' FSparams = {'nFeatures': 15, 'internalFEMethod': 'MID', 'nBins': 4, 'discStrategy': 'kmeans'} selectedFeatures = fesel.select_features(FSmethod, FSparams, selectionFeaturesPath, manualFeaturesPath) print(f'Features selected by {FSmethod}:') print(selectedFeatures) # Prediction model MLmethod = 'RFreg' rfParams = {'n_estimators': [5, 10, 15, 25, 50, 75], 'max_depth': [None, 1, 3, 5, 10, 15], 'max_features': [0.33, 0.67, 1.0, 'sqrt']} scoring = 'r2' yTrueTest, yPredRegTest, yTrueVal, yPredRegVal, MLparams = mlpred.create_evaluate_model\ (MLmethod, rfParams, selectedFeatures, selectionFeaturesPath, manualFeaturesPath, paramSearchResultsPath, optimizeParams=True, scoringOptiMetric=scoring) # Write validation- and test- results to csv-file mlpred.write_results_to_csv(predResultsPath, selectionFeaturesPath, FSmethod, FSparams, selectedFeatures, MLmethod, MLparams, yTrueTest, yPredRegTest, yTrueVal, yPredRegVal)