def main():
    num_cluster = int(input("Enter number of clusters of data: "))
    num_attr = int(input("Enter number of attributes of the data: "))
    num_rows = [
        int(x) for x in input(
            "Enter the number of rows of each cluster, separated by spaces: ").split()
    ]
    mu = data.genRandomList(num_attr, 0, 10)
    sigma = data.genRandomList(num_attr, 0, 10)
    df = data.genrateDataVal(mu, sigma, num_attr, num_rows, num_cluster)
    data.splitData(df, 0.5)
    pre.removeOutliers()
    pre.pca()  # Print the variance ratio of the top 3 attributes of the PCA
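# data.genRandomList and data.genrateDataVal are project-local helpers not shown
# here. As a hedged, self-contained sketch (an assumption, not the project's
# actual code): each cluster presumably draws num_rows[k] samples from a normal
# distribution with the per-attribute mu/sigma generated above, roughly like so:
import numpy as np
import pandas as pd

def generate_gaussian_clusters(mu, sigma, num_attr, num_rows, num_cluster):
    # Stack one block of N(mu, sigma) rows per cluster and tag it with the cluster id.
    blocks = []
    for k in range(num_cluster):
        block = np.random.normal(loc=mu, scale=sigma, size=(num_rows[k], num_attr))
        labels = np.full((num_rows[k], 1), k)
        blocks.append(np.hstack([block, labels]))
    cols = ["attr_%d" % j for j in range(num_attr)] + ["cluster"]
    return pd.DataFrame(np.vstack(blocks), columns=cols)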
def makeOneFold(nb_folds):
    # Returns one fold from the cross-validation training set.
    # Note: has to create the whole cross-validation set (could be improved).
    data_trains, data_tests = pp.preprocessing_cross_valid(2012, 2014, nb_folds)
    rand_fold = random.randint(0, nb_folds - 1)  # Pick a random fold to test on
    np.random.shuffle(data_trains[rand_fold])    # Shuffle the training examples
    x_train = data_trains[rand_fold][:, 1:]
    y_train = data_trains[rand_fold][:, 0]
    x_test = data_tests[rand_fold][:, 1:]
    y_test = data_tests[rand_fold][:, 0]
    return x_train, y_train, x_test, y_test
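# A hedged usage sketch for makeOneFold, assuming pp.preprocessing_cross_valid
# returns NumPy arrays with the label in column 0 as above; the classifier
# choice here is only illustrative, not part of the original project:
from sklearn.linear_model import LogisticRegression

x_train, y_train, x_test, y_test = makeOneFold(nb_folds=9)
clf = LogisticRegression(max_iter=1000)
clf.fit(x_train, y_train)
print("Accuracy on the held-out fold:", clf.score(x_test, y_test))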
def main():
    place_weather = PreprocessData.PreprocessData()
    feature_columns = [
        'ObsTime', 'StnPres', 'SeaPres', 'Temperature', 'Td dew point', 'RH',
        'Precp', 'PrecpHour', 'SunShine', 'SunShineRate', 'VisbMean', 'EvapA',
        'Cloud Amount'
    ]
    weather_X = place_weather[feature_columns]
    weather_Y = place_weather['NextDayPrecp']
    train_X, test_X, train_Y, test_Y = train_test_split(
        weather_X, weather_Y.astype('int'), test_size=0.3)
    rainfall_classifier = CreateClassifier(train_X, train_Y)
    test_Y_predicted = rainfall_classifier.predict(test_X)
    accuracy = metrics.accuracy_score(test_Y, test_Y_predicted)
    VisualizingDecisionTree.Visualize(rainfall_classifier, feature_columns,
                                      weather_Y.to_string())
    print(accuracy)
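# CreateClassifier is project-local and not shown here; a minimal sketch of what
# such a factory could look like with scikit-learn (an assumption, consistent
# with the decision-tree visualisation call above; hyperparameters are hypothetical):
from sklearn.tree import DecisionTreeClassifier

def CreateClassifier(train_X, train_Y):
    # Fit a plain decision tree on the training split.
    clf = DecisionTreeClassifier(max_depth=5, random_state=0)
    clf.fit(train_X, train_Y)
    return clf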
def sequentialValidate(net, start=0.5, step=1, iterations=1000, learning_rate=0.01,
                       grad_decay=0.9, epsilon=0.000001, adadelta=False):
    # Cross-validation procedure for time series data.
    # Trains on the first 'start' fraction of examples and predicts the next one.
    # Adds 'step' examples to the training set and tests on the next example,
    # repeating until all the examples have been used.
    data = pp.preprocessing_final(2012, 2014, export=False)[0]
    x_data = data[:, 1:]
    y_data = data[:, 0]
    min_errs = []
    test_errs = []
    train_errs = []
    train_class_errs = []
    min_class_errs = []
    nb_examples = int(start * len(data))
    nb_runs = 0
    print(len(x_data[nb_examples]))
    while nb_examples < len(data):
        net.reset()
        temp = net.test(x_data[:nb_examples, :], y_data[:nb_examples], iterations,
                        learning_rate, grad_decay, epsilon, adadelta,
                        X_test=x_data[nb_examples:nb_examples + 20, :],
                        y_test=y_data[nb_examples:nb_examples + 20])
        min_errs.append(temp[0])
        test_errs.append(temp[1])
        train_errs.append(temp[2])
        train_class_errs.append(temp[3])
        min_class_errs.append(temp[4])
        nb_examples += step
        nb_runs += 1
    print("\n----------")
    print(net, "\tNb runs:", nb_runs)
    print("Avg min:", sum(min_errs) / nb_runs, "\t\t\t", min_errs)
    print("Avg final test:", sum(test_errs) / nb_runs, "\t\t\t", test_errs)
    print("Avg final train:", sum(train_errs) / nb_runs, "\t\t\t", train_errs)
    print("Avg final class:", sum(train_class_errs) / nb_runs, "\t\t\t", train_class_errs)
    print("Avg min class:", sum(min_class_errs) / nb_runs, "\t\t\t", min_class_errs)
# -*- coding: utf-8 -*-
"""
Created on Sun May 26 23:34:08 2019

@author: user
"""
import numpy as np
import matplotlib.pyplot as plt
import PreprocessData as pre
import pandas as pd

# PREPROCESS
solarRaw, windRaw, demandRaw, allRaw2016 = pre.importData()
solar, wind, agg, demand = pre.preprocessData(solarRaw, windRaw, demandRaw, allRaw2016)
df0 = pd.DataFrame(
    data={
        "solar": solar["0"],
        "wind": wind["0"],
        "agg": agg["0"],
        "demand": demand["0"],
        "solar25": solar["0"] * 0.25,
        "wind75": wind["0"] * 0.75
    })

#%% PROCESS
print("START monitoring design")

# initialize
titles = [
    "(a) Solar PV 100MWp Down", "(b) Wind 100MWp Down",
def clear(key_params=[]):
    X, y = ReadingFile.read_csv('TRAIN_CORPUS.csv')
    X = PreprocessData.prepare_data(X, mode='save', key_features=key_params)
    return X, y
X, y = clear()
svm_clf = SVC(kernel='linear')
svm_clf.fit(X, y)

# In[3]:
# test feature transformation
scores = []

X, y = ReadingFile.read_csv('TRAIN_CORPUS.csv')
scores.append(check_score(X, y, svm_clf))

X, y = ReadingFile.read_csv('TRAIN_CORPUS.csv')
scores.append(check_score(PreprocessData.prepare_features(X), y, svm_clf))

X, y = ReadingFile.read_csv('TRAIN_CORPUS.csv')
scores.append(
    check_score(PreprocessData.prepare_data(X, mode='save'), y, svm_clf))

scores

# In[6]:
# test feature importances
from sklearn.feature_selection import RFE

scores = []
for i in xrange(18):
def lambda_handler(event, context):
    store_id = int(event['data'])
    item = DynamoDB.GetDataFromDB(store_id)
    data = PreprocessData.PreprocessReceivedData(item)
    return data
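# A minimal local smoke test for the handler above, assuming the Lambda event
# carries the store id as a numeric string under 'data'; the context argument
# is unused, so None is enough here (the store id 42 is purely hypothetical):
if __name__ == '__main__':
    sample_event = {'data': '42'}
    print(lambda_handler(sample_event, None))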
import PreprocessData as pp
import TestRun
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt

# This file is used to test other machine learning algorithms
if __name__ == '__main__':
    data_trains, data_tests = pp.preprocessing_cross_valid(2012, 2014, 9)
    print("Tests")
    errs = []
    for i in range(9):
        x_train = data_trains[i][:, 1:]
        y_train = data_trains[i][:, 0]
        x_test = data_tests[i][:, 1:]
        y_test = data_tests[i][:, 0]

        # logistic regression
        reg = LogisticRegression()
        reg.fit(x_train, y_train)
        print("Accuracy:", reg.score(x_test, y_test))  # score() returns mean accuracy, not error

        # support vector machine
def crossValidate(net, nb_folds, iterations=1000, learning_rate=0.01, grad_decay=0.9,
                  epsilon=0.000001, adadelta=False):
    # Splits the data into nb_folds batches, using each batch as the testing set
    # in turn and the rest as the training set.
    ######## Need to fix: how to train on multiple years at once?
    data_trains, data_tests = pp.preprocessing_cross_valid(2012, 2014, nb_folds)
    for i in range(nb_folds):
        np.random.shuffle(data_trains[i])  # shuffles training examples
    min_errs = []
    test_errs = []
    train_errs = []
    nb_buckets = 5  # Could make this a parameter
    freq_probs_test = [0] * nb_buckets
    freq_wins_test = [0] * nb_buckets
    freq_probs_train = [0] * nb_buckets
    freq_wins_train = [0] * nb_buckets
    for i in range(nb_folds):
        print("--- Fold " + str(i + 1) + " ---")
        start = time.clock()
        net.reset()
        # Make test and training sets
        x_train = data_trains[i][:, 1:]
        y_train = data_trains[i][:, 0]
        x_test = data_tests[i][:, 1:]
        y_test = data_tests[i][:, 0]
        temp = net.test(x_train, y_train, iterations, learning_rate, grad_decay,
                        epsilon, adadelta, X_test=x_test, y_test=y_test)
        min_errs.append(temp[0])
        test_errs.append(temp[1])
        train_errs.append(temp[2])
        freqs = net.testProbBuckets(x_train, y_train, nb_buckets=nb_buckets,
                                    X_test=x_test, y_test=y_test)
        # Aggregates the prob buckets from each fold together
        freq_probs_test = list(map(add, freq_probs_test, freqs[0]))
        freq_wins_test = list(map(add, freq_wins_test, freqs[1]))
        freq_probs_train = list(map(add, freq_probs_train, freqs[2]))
        freq_wins_train = list(map(add, freq_wins_train, freqs[3]))
        print("Time:", time.clock() - start)
    print("\n----------")
    print(net, "\tNb folds:", nb_folds)
    print("Avg min:", sum(min_errs) / nb_folds, "\t\t\t", min_errs)
    print("Avg final test:", sum(test_errs) / nb_folds, "\t\t\t", test_errs)
    print("Avg final train:", sum(train_errs) / nb_folds, "\t\t\t", train_errs)
    probs_test = [freq_wins_test[i] / freq_probs_test[i] if freq_probs_test[i] != 0
                  else -1 for i in range(nb_buckets)]
    probs_train = [freq_wins_train[i] / freq_probs_train[i] if freq_probs_train[i] != 0
                   else -1 for i in range(nb_buckets)]
    print("Total freq test:")
    print(freq_probs_test)
    print(freq_wins_test)
    print(["{0:.2f}".format(x) for x in probs_test])
    print("Total freq train:")
    print(freq_probs_train)
    print(freq_wins_train)
    print(["{0:.2f}".format(x) for x in probs_train])
    # Returns the average minimum test error
    return sum(min_errs) / nb_folds
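# The per-fold bucket aggregation above is just an element-wise running sum via
# operator.add mapped over parallel lists; a tiny self-contained illustration of
# that pattern with made-up bucket counts:
from operator import add

totals = [0] * 5
fold1_counts = [3, 5, 2, 0, 1]
fold2_counts = [1, 4, 6, 2, 0]
totals = list(map(add, totals, fold1_counts))
totals = list(map(add, totals, fold2_counts))
print(totals)  # [4, 9, 8, 2, 1]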
def classify(class_prec, clf):
    import PreprocessData
    return clf.predict(PreprocessData.prepare_data([class_prec.get_features()]))[0]