def calcWaypoints(self):
    '''Given the waypoints, find their xy-coordinates.'''
    startLoc = dm.haversine(*self.waypoints['initial'])
    endLoc = dm.haversine(*self.waypoints['final'])
    return [endLoc[0] - startLoc[0], endLoc[1] - startLoc[1]]
def hyper_parameters_testing():
    data = flr.read_from_file("train_x.txt")
    data = dm.add_bias(data)
    data = dm.convert_sex_to_number(data, 1, 1, 0)
    data = np.array(data)
    labels = flr.read_from_file("train_y.txt")
    labels = np.array(labels)
    labels = dm.convert_to_float(labels)
    # Normalize the data into [-1, 1]
    min_range = np.ones(data.shape[1])
    min_range = np.multiply(-1, min_range)
    max_range = np.ones(data.shape[1])
    data = dm.min_max_normalization(data, min_range, max_range)
    # Set the properties we want to use
    ignore = np.ones(len(data[0]))
    # PA algorithm
    alg_pa = pa.Pa(data, 0.1, 100, ignore, 3)
    alg_pa.train(data, labels, 0.01, 26, 5)
    alg_pa.print_details()
    eta_list = []
    epocnum_list = []
    accuracy_list = []
def bprintPlaylist(self):
    self.listWidget.clear()
    li = DataManipulation.test(
        DataManipulation.truncatePlusArtist(self.so), self.ar,
        DataManipulation.truncatePlusArtist(self.al))
    for i in li:
        self.listWidget.addItem(i)
def create_scatter_with_stats():
    """
    Creates a scatter plot which tries to visualize correlation between
    capture rate, base egg steps, and base total of Pokémon. To use 3
    variables in a 2-dimensional plot, the colour channel is utilized to
    signify base total.
    """
    all_data = dm.load_data()
    data = dm.group_data_mean(all_data, "type")
    x_var = "capture_rate"
    y_var = "base_egg_steps"
    colour_by = "base_total"
    fig = go.Figure(
        data=go.Scatter(x=data[x_var].astype(int),
                        y=data[y_var].astype(int),
                        mode='markers',
                        marker=dict(size=16,
                                    color=data[colour_by],
                                    colorscale="Viridis",
                                    colorbar=dict(title="{}".format(colour_by)),
                                    showscale=True)))
    fig.update_layout(title="Correlation between {}, {}, and {}".format(
                          x_var, y_var, colour_by),
                      xaxis_title="{}".format(x_var),
                      yaxis_title="{}".format(y_var))
    fig.update_xaxes(range=(-5, 260))
    fig.update_yaxes(range=(-1000, 35000))
    save_plot(fig, "CorrelationPlot")
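# save_plot is not defined in this snippet. A minimal stand-in using plotly's
# own writer is sketched below -- an assumption for illustration, not the
# project's actual helper:
def save_plot(fig, name):
    fig.write_html(name + ".html")  # write an interactive HTML version of the figure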
def __init__(self):
    super(self.__class__, self).__init__()
    self.setupUi(self)
    self.msg = QMessageBox()
    self.btnSo = QtGui.QPushButton('Songs')
    self.btnAr = QtGui.QPushButton('Artists')
    self.btnAl = QtGui.QPushButton('Albums')
    self.msg.setText("Choose Field to get length of")
    self.msg.setWindowTitle("Get Length")
    self.msg.addButton(self.btnAl, QtGui.QMessageBox.YesRole)
    self.msg.addButton(self.btnSo, QtGui.QMessageBox.AcceptRole)
    self.msg.addButton(self.btnAr, QtGui.QMessageBox.NoRole)
    self.al = DataManipulation.getAlbums()
    self.ar = DataManipulation.getArtists()
    self.so = DataManipulation.getSongs()
    self.di = DataManipulation.getSongCountPerArtist(self.ar)
    self.btnPrint.clicked.connect(self.bprintPlaylist)
    self.btnGetSongs.clicked.connect(self.bgetSongs)
    self.btnGetArtists.clicked.connect(self.bgetArtists)
    self.btnGetAlbums.clicked.connect(self.bgetAlbums)
    self.btnFindDups.clicked.connect(self.bfindDups)
    self.btnNumSongs.clicked.connect(self.bnumSongs)
    self.btnTopA.clicked.connect(self.btopArtists)
    self.btnSingles.clicked.connect(self.bsingles)
    self.btnCreateSS.clicked.connect(self.bcreateSS)
    self.btnLen.clicked.connect(self.bgetLength)
def main():
    trainData = pd.read_csv('CrimeClassification/Dataset/train-2.csv')
    classesMap = dm.mapClasses(trainData)
    print(trainData.info())
    print(classesMap)
    cleanedTrainData, normalizationValues = dm.cleanTrainData(trainData, classesMap)
    print(cleanedTrainData.info())
    # astype() returns a copy, so cast first; shuffling the temporary copy
    # in place (as the original did) silently discards the shuffle.
    data = cleanedTrainData.values.astype(np.float64)
    np.random.shuffle(data)
    Ytrain = binarizeLabels(data[0:, 0])
    Xtrain = data[0:, 1:]
    model = trainModel(Xtrain, Ytrain)
    output = testProbaModel(model, Xtrain)
    testData = pd.read_csv('CrimeClassification/Dataset/test-2.csv')
    cleanedTestData = dm.cleanTestData(testData, normalizationValues)
    print(cleanedTestData.info())
    output = testProbaModel(model, cleanedTestData.values[:, 1:])
    result = np.c_[cleanedTestData.values[:, 0].astype(int), output]
    outputVec = sorted(classesMap, key=classesMap.__getitem__)
    outputVec.insert(0, 'Id')
    dataFrameResults = pd.DataFrame(result, columns=outputVec)
    dataFrameResults['Id'] = dataFrameResults['Id'].astype(int)
    dm.saveResults(dataFrameResults)
def bcreateSS(self):
    self.listWidget.clear()
    li = DataManipulation.prettyPrint(
        DataManipulation.truncatePlusArtist(self.so),
        DataManipulation.truncateArtists(self.ar, sortedSet=False),
        DataManipulation.truncatePlusArtist(self.al))
    for i in li:
        self.listWidget.addItem(i)
def load_specific_kinase(kinase, start_center):
    fin_extract = read_kinase('Kinase_Substrate_Dataset')
    # Select the specified kinase and organism
    fg_read = [x[2] for x in fin_extract if x[0] == kinase and x[1] == 'human']
    data_A = DataManipulation.list_to_formA(fg_read)
    data_A = DataManipulation.subset_formA(data_A, 0, start_center)
    return dict([('seq', fg_read), ('formA', data_A)])
def main(): trainData = pd.read_csv("CrimeClassification/Dataset/train-2.csv") trainData.info() classesMap = dm.mapClasses(trainData) cleanedTrainData = dm.cleanData(trainData, classesMap) cleanedTrainData.describe() heatMapXY(cleanedTrainData, "Global heatMap of crimes") heatMapPerCategory(cleanedTrainData, classesMap) histogramOfCategories(cleanedTrainData, classesMap) textHistogram(cleanedTrainData, classesMap)
def main():
    Data = dm.extractData()
    Vectors = dm.dataToVectors(Data[:2000])  # Read first 2000 lines
    TrainingSet = Vectors[:1500]  # First 1500 training data
    TestingSet = Vectors[1500:]   # Last 500 testing data
    # testRBF(TrainingSet, TestingSet)  # Test RBF network
    kNearestNeighbors(TrainingSet, TestingSet, 50)  # Test kNearestNeighbors
def exportProjections(newFile, projectionsFile='projections.csv', model_dir='edited'):
    projections = pd.read_csv(projectionsFile)
    predictions = runBatchPredict(projectionsFile, model_dir)
    # DataFrame.append was deprecated and removed in pandas 2.0;
    # collect the matching rows and concatenate once instead.
    rows = []
    for i in predictions:
        projections['Sal'] = i[1]
        rows.append(projections.loc[projections['Player'] == i[0]])
    newDF = pd.concat(rows)
    DM.export(newDF, newFile)
    print('Exported!')
def main():
    trainData = pd.read_csv('CrimeClassification/Dataset/train-2.csv')
    classesMap = dm.mapClasses(trainData)
    print(trainData.info())
    print(classesMap)
    cleanedTrainData, normalizationValues = dm.cleanTrainData(trainData, classesMap)
    print(cleanedTrainData.info())
    [Xtrain, Ytrain, Xtest, Ytest] = splitData(cleanedTrainData.values)
    model = trainModel(Xtrain, Ytrain)
    Ypred = testModel(model, Xtest)
    confMatrix = da.confusionMatrix(Ypred, Ytest)
    titleCM = da.orderClassesMapKeys(classesMap)
    da.plotConfusionMatrix(confMatrix, titleCM)
    print(da.f1Score(Ypred, Ytest))
def main(): trainData = pd.read_csv("CrimeClassification/Dataset/train-2.csv") classesMap = dm.mapClasses(trainData) print trainData.info() print (classesMap) cleanedTrainData = dm.cleanData(trainData, classesMap) testData = pd.read_csv("CrimeClassification/Dataset/test-2.csv") cleanedTestData = dm.cleanTestData(testData) print (cleanedTrainData.info()) model = trainModel(cleanedTrainData.values) result = np.c_[cleanedTestData.values[:, 0].astype(int), output] outputVec = sorted(classesMap, key=classesMap.__getitem__) outputVec.insert(0, "Id") dataFrameResults = pd.DataFrame(result, columns=outputVec) dataFrameResults["Id"] = dataFrameResults["Id"].astype(int) dm.saveResults(dataFrameResults)
def inputGPS(self):
    '''End user must manually specify starting and stopping GPS.'''
    # textInputApp = tk.Tk()
    # textInput = tk.simpledialog.askfloat('Input GPS', \
    #     'Enter Lat in Deg: ', parent = textInputApp, minvalue = -180, \
    #     maxvalue = 180)
    # self.halt(textInputApp)
    self.waypoints = {}
    # startLat = float(input('Enter Starting Latitude in Deg> '))
    # startLon = float(input('Enter Starting Longitude in Deg> '))
    startLat = 0
    startLon = 0
    self.waypoints['initial'] = [startLat, startLon]
    # endLat = float(input('Enter Ending Latitude in Deg> '))
    # endLon = float(input('Enter Ending Longitude in Deg> '))
    endLat = 0.0005
    endLon = 0.0005
    self.waypoints['final'] = [endLat, endLon]
    # Before we do anything, convert to radians
    for key in self.waypoints.keys():
        # map(dm.degToRad, self.waypoints[key])
        for i in range(len(self.waypoints[key])):
            self.waypoints[key][i] = dm.degToRad(self.waypoints[key][i])
    return None
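# dm.degToRad is not shown in this snippet; a minimal sketch of what it
# presumably does (a plain degree-to-radian conversion) -- an assumption,
# not the project's actual implementation:
import math

def degToRad(deg):
    """Convert an angle from degrees to radians."""
    return deg * math.pi / 180.0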
def updateCompass(self):
    '''Get new bearing data and update plot.'''
    # Clear the old compass point
    self.pole.cla()
    radii = [0, 1]
    # After many days of debugging, it was found that pyplot expects
    # theta coordinates in radians even though the default theta
    # coordinate axes display degrees. Good to know!
    thisAngle = dm.degToRad(self.bearing)
    angles = [thisAngle for i in range(2)]
    # Plot data
    self.pole.plot(angles, radii, color='red')
    # Formatting
    # self.pole.set_title('Bearing')
    self.pole.set_yticklabels([])
    self.pole.set_xticklabels(['E', 'NE', 'N', 'NW', 'W', 'SW', 'S', 'SE'])
    # Add data and flush
    self.polar.canvas.draw()
    self.polar.canvas.flush_events()
    return None
def updateTextBoxes(self):
    '''Update any text information in the gui.'''
    # Create text box for bearing and xy-coordinate
    msg = ''
    thisPos = dm.getLatestPositionData(self.updateCount)
    msg += 'X-pos: ' + str(round(thisPos[0], self.figs)) + '\n'
    msg += 'Y-pos: ' + str(round(thisPos[1], self.figs)) + '\n'
    msg += 'Bearing: ' + str(self.bearing) + '\n'
    # And embed this text box in the application
    self.bearingText = tk.Label(self.master, text=msg)
    self.bearingText.config(width=20)
    self.bearingText.config(font=('Consolas', 12))
    textColumn = 2
    textRow = 0
    self.bearingText.grid(column=textColumn, row=textRow, sticky='NW')
    # gps = ''
    # gps += 'Initial' + '\n'
    # gps += 'Lat: ' + str(round(self.waypoints['start'][0], self.figs))
    # gps += 'Lon: ' + str(round(self.waypoints['start'][1], self.figs))
    #
    # self.gpsText = tk.Label(self.master, text = gps)
    return None
def loadTestingData(date):
    """
    Loads the testing data from disk. Note, because of the potential for very
    large amounts of testing data, the testing data is returned as a numpy
    memmap.

    :param date: (string) Date in which the data was collected (YYYY_MMDD)
    :return: tuple containing the testing data (row ordered feature vectors),
        as well as the target labels (X, y). X has type np.memmap, and y has
        type np.array
    """
    dataDirectory = DATA_DIRECTORIES[date + "_ML"]
    testingDataPath = os.path.join(dataDirectory, TESTING_DATA_PATH)
    sampleCounts = DataManipulation.loadSampleCounts(date)
    testingSamples = sampleCounts["testing"]
    # Open the file read-only; labels live in column 0, features follow
    testingData = np.memmap(testingDataPath, mode='r', dtype=np.float32,
                            shape=(testingSamples, NUM_WAVELENGTHS + 1))
    X = testingData[:, 1:]
    y = testingData[:, 0]
    return X, y
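# Hypothetical usage; the date key is made up for illustration. Because X is
# backed by a read-only np.memmap, rows are paged in lazily rather than loaded
# into memory all at once:
X, y = loadTestingData("2019_0101")
print(X.shape, y.shape)  # (testingSamples, NUM_WAVELENGTHS), (testingSamples,)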
def regression_linear_regression(data_array, cont_dis, cls_rmv, sig, y_col, split_array):
    print('-----------------------------------')
    print('Using Linear Regression Imputation')
    print('-----------------------------------')
    imp = 'Linear Regression Imputation:'
    # Use linear regression for imputation
    d_a, stat_a, x, y, x_n, y_n, xr, rt, y_ar = DataManipulation.linear_regression_imputation(
        list(data_array), cont_dis, cls_rmv, sig, y_col)
    cod_r, n_cod_r, lse_r, n_lse_r, mse_r, n_mse_r = er_t(
        list(x), list(y), list(x_n), list(y_n), split_array)
    print('Unnormalized Data:')
    show_results(imp, cod_r, lse_r, mse_r)
    print('Normalized Data:')
    show_results(imp, n_cod_r, n_lse_r, n_mse_r)
    figure(1)
    title('Weight vs. Horse Power')
    plot(xr, rt, 'r--', label='weight vs regression')
    plot(xr, y_ar, 'o', label='c label')
    xlabel('Car Weight')
    ylabel('Horse Power')
    legend(['regression data', 'raw data'])
    show()
    return
def getLength(self, field):
    # The parameter was originally named `str`, shadowing the builtin;
    # renamed to `field` (it is passed positionally, so callers are unaffected).
    self.listWidget.clear()
    if field == "s":
        self.listWidget.addItem("%s" % len(self.so))
    elif field == "ar":
        self.listWidget.addItem(
            "%s" % len(DataManipulation.truncateArtists(self.ar)))
    elif field == "al":
        self.listWidget.addItem("%s" % len(sortedset(self.al)))
def main(argv):
    (train_x, train_y), (test_x, test_y) = DM.offLoad()
    train(train_x, train_y, test_x, test_y)
    # losses = kfoldCrossValidate(8)
    print("-" * 30)
    # print("Average loss for kfold validation: ", sum(losses) / len(losses))
    # Average loss for kfold validation: 3.50362616777 -- with 8 folds
    print("-" * 30)
def btopArtists(self):
    maxsize = 50
    numS = QtGui.QInputDialog.getInt(self, "Choose Number of Artists",
                                     "Display this number of Artists:",
                                     maxsize)
    self.listWidget.clear()
    li = DataManipulation.topArtists(numS[0], self.di)
    for i in li:
        self.listWidget.addItem(i)
def main():
    pd.set_option("display.precision", 3)
    trainData = pd.read_csv('CrimeClassification/Dataset/train01-tsc.csv')
    classesMap = dm.mapClasses(trainData)
    print(classesMap)
    cleanedTrainData = dm.cleanData(trainData, classesMap)
    testData = pd.read_csv('CrimeClassification/Dataset/test-tsc.csv')
    cleanedTestData = dm.cleanTestData(testData)
    print(cleanedTrainData.info())
    model = trainModel(cleanedTrainData.values)
    print(cleanedTestData.info())
    output = testProbaModel(model, cleanedTestData.values)
    result = np.c_[cleanedTestData.values[:, 0].astype(int), output]
    outputVec = sorted(classesMap, key=classesMap.__getitem__)
    outputVec.insert(0, 'Id')
    dataFrameResults = pd.DataFrame(result, columns=outputVec)
    dataFrameResults['Id'] = dataFrameResults['Id'].astype(int)
def bnumSongs(self):
    maxsize = 20
    numS = QtGui.QInputDialog.getInt(
        self, "Choose Song Limit",
        "Artists with more than this number of songs will be displayed:",
        maxsize)
    li = DataManipulation.numSongs(numS[0], self.di)
    self.listWidget.clear()
    for i in li:
        self.listWidget.addItem(i)
def saveTrainingData(date, X, y):
    """
    Saves a given matrix of training data as a np.memmap in the proper
    location for later use in training machine learning models.

    :param date: (string) Date in which the data was collected (YYYY_MMDD)
    :param X: (np.array) Array of training features (n_samples x n_features)
    :param y: (np.array) Array of labels for the training data (n_samples)
    :return: (None)
    """
    sampleCounts = DataManipulation.loadSampleCounts(date)
    dataDirectory = DATA_DIRECTORIES[date + "_ML"]
    trainingDataPath = os.path.join(dataDirectory, TRAINING_DATA_PATH)
    if not os.path.exists(trainingDataPath):
        # Open the file for the first time to write
        samples, features = X.shape
        trainingData = np.memmap(trainingDataPath, mode='w+', dtype=np.float32,
                                 shape=(samples, features + 1))
        trainingData[:, 0] = y
        trainingData[:, 1:] = X
        # Flush the data to disk and close the memmap
        del trainingData
    else:
        DataManipulation.updateTrainingData(date, X, y)
    # Update the sample counts file
    for index in y:
        labelString = INDEX_TO_LABEL[index]
        sampleCounts[labelString + "_training"] += 1
    sampleCounts["training"] += len(y)
    DataManipulation.updateSampleCounts(date, sampleCounts)
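# Note on the `del trainingData` idiom above: dropping the last reference to a
# writeable np.memmap flushes its dirty pages to disk. The same intent can be
# stated explicitly; a throwaway sketch (file name made up), not the author's code:
demo = np.memmap("example.dat", mode="w+", dtype=np.float32, shape=(4, 4))
demo[:] = 1.0
demo.flush()  # force dirty pages to disk
del demo      # release the mapping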
def load_fasta_background(filename, center='.'):
    fasta_list = FileIO.read_fasta(filename)
    result = fasta_to_chunks(fasta_list)
    if center != '.':
        result = filter_chunks(result, center)
    result = result[0:300000]  # Truncate because I don't have enough RAM
    formA = DataManipulation.list_to_formA(result)
    return dict([('seq', result), ('formA', formA)])
def regression_linear_regression_fs(data_array, cont_dis, cls_rmv, sig, y_col, split_array):
    print('-----------------------------------')
    print('Using Linear Regression Imputation with Forward Selection Dimension Reduction')
    print('-----------------------------------')
    attribute_labels = [
        'mpg',           # 0
        'Cylinders',     # 1
        'Displacement',  # 2
        'Horse Power',   # 3
        'Weight',        # 4
        'Acceleration',  # 5
        'Model Year',    # 6
        'Origin',        # 7
        'Car Type',      # 8
    ]
    imp = 'Linear Regression Imputation with Forward Selection:'
    d_a, stat_a, x, y, x_n, y_n, xr, rt, y_ar = DataManipulation.linear_regression_imputation(
        list(data_array), cont_dis, cls_rmv, sig, y_col)
    F, min_mse, cols_f = forward_selector_test(list(x), list(y), split_array[0])
    f_n, min_mse_n, cols_f_n = forward_selector_test(list(x_n), list(y_n), split_array[0])
    print('F is using ' + str(len(F[0]) - 1) + ' attributes')
    for i in range(len(cols_f)):
        print(attribute_labels[cols_f[i]])
    cod_r, n_cod_r, lse_r, n_lse_r, mse_r, n_mse_r = er_t(
        list(F), list(y), list(f_n), list(y_n), split_array)
    print('Unnormalized Data:')
    show_results(imp, cod_r, lse_r, mse_r)
    print('')
    print('Normalized Data:')
    show_results(imp, n_cod_r, n_lse_r, n_mse_r)
    figure(1)
    title('Weight vs. Horse Power')
    plot(xr, rt, 'r--', label='weight vs regression')
    plot(xr, y_ar, 'o', label='c label')
    xlabel('Car Weight')
    ylabel('Horse Power')
    legend(['regression data', 'raw data'])
    show()
    return
def final_format(data_file, labels_file, test_file):
    # Get the data and convert it into a form usable by the algorithms
    data = flr.read_from_file(data_file)
    data = dm.add_bias(data)
    data = dm.convert_sex_to_number(data, 1, 1, 0)
    data = np.array(data)
    # Normalize the data
    min_range = np.ones(data.shape[1])
    min_range = np.multiply(-1, min_range)
    max_range = np.ones(data.shape[1])
    data = dm.min_max_normalization(data, min_range, max_range)
    # Get the labels and convert them into a usable form
    labels = flr.read_from_file(labels_file)
    labels = np.array(labels)
    labels = dm.convert_to_float(labels)
    # Get the test data and convert it into a usable form
    test = flr.read_from_file(test_file)
    test = dm.add_bias(test)
    test = dm.convert_sex_to_number(test, 1, 1, 0)
    test = np.array(test)
    # Normalize the test data (the original sized these ranges from `data`;
    # the test set's own column count is what is meant here)
    min_range = np.ones(test.shape[1])
    min_range = np.multiply(-1, min_range)
    max_range = np.ones(test.shape[1])
    test = dm.min_max_normalization(test, min_range, max_range)
    # Set the properties we want the algorithms to use
    ignore = np.ones(len(data[0]))
    # Perceptron algorithm
    alg_perceptron = prtn.Perceptron(data, 0.1, 100, ignore, 3)
    alg_perceptron.train(data, labels, 0.01, 20)
    # SVM algorithm
    alg_svm = svm.Svm(data, 0.01, 100, ignore, 0.001, 3)
    alg_svm.lamda = 0.1
    alg_svm.train(data, labels, 0.1, 20)
    # PA algorithm
    alg_pa = pa.Pa(data, 0.1, 100, ignore, 3)
    alg_pa.train(data, labels, 0.01, 25)
    # Compare the algorithms on the test set
    for test_data in test:
        line_to_print = "perceptron: " + str(alg_perceptron.predict(test_data)) + ", "
        line_to_print += "svm: " + str(alg_svm.predict(test_data)) + ", "
        line_to_print += "pa: " + str(alg_pa.predict(test_data))
        print(line_to_print)
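# dm.min_max_normalization is not shown in these snippets. A minimal sketch of
# the usual column-wise rescaling into [min_range[j], max_range[j]] -- an
# assumption about its behavior, not the project's actual implementation:
def min_max_normalization(data, min_range, max_range):
    col_min = data.min(axis=0)
    col_max = data.max(axis=0)
    span = np.where(col_max > col_min, col_max - col_min, 1.0)  # avoid divide-by-zero
    return min_range + (data - col_min) * (max_range - min_range) / span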
def find_first(x_data, y_data, split):
    col_size = len(x_data[0])
    min_mse = [10000]
    min_col = [10000]
    best_col = [0]
    for col in range(1, col_size):
        x_column = list(DataManipulation.column_getter(x_data, col))
        m, b, x, y, yg, mse = reg_lin_regression_msr(x_column, y_data, split)
        if mse < min_mse[0]:
            min_col[0] = col
            min_mse[0] = mse
            best_col[0] = list(x_column)
    return list(min_col), list(min_mse), list(best_col)
def regression_discard(data_array, cont_dis, cols_rmv, sig, y_col, split_array):
    print('-----------------------------------')
    print('Using Discard Imputation')
    print('-----------------------------------')
    imp = 'Discard Imputation:'
    d_array, stat_a, x, y, x_n, y_n = DataManipulation.discard_imputation(
        list(data_array), cont_dis, cols_rmv, sig, y_col)
    cod_r, n_cod_r, lse_r, n_lse_r, mse_r, n_mse_r = er_t(
        list(x), list(y), list(x_n), list(y_n), split_array)
    print('Unnormalized Result: ')
    show_results(imp, cod_r, lse_r, mse_r)
    print('Normalized Result: ')
    show_results(imp, n_cod_r, n_lse_r, n_mse_r)
    return
def timeOut(self):
    passive = PassiveAccelerometer.arduino(self.xAvg, self.yAvg, self.zAvg)
    while self.sameCount < 50:
        time.sleep(0.080)
        vals = passive.readValues()
        self.data.append(vals)
        try:
            valAvg = (vals[0] + vals[1] + vals[2]) / 3
        except Exception:
            passive.readValues()
            continue  # skip this sample; valAvg is undefined
        # Count the reading as "unchanged" when it sits within +/-300 of the
        # previous average; the original or-of-abs comparison was always true,
        # and the previous value must be stored on self to be seen next pass.
        if abs(valAvg - self.valPrev) <= 300:
            self.sameCount += 1
        self.valPrev = valAvg
    manip = DataManipulation.simpleFunctions(self.data, self.weight, self.workout)
def regression_average_fs(data_array, cont_dis, cols_rmv, sig, y_col, split_array):
    print('-----------------------------------')
    print('Using Average Imputation with Forward Selection Dimension Reduction')
    print('-----------------------------------')
    attribute_labels = [
        'mpg',           # 0
        'Cylinders',     # 1
        'Displacement',  # 2
        'Horse Power',   # 3
        'Weight',        # 4
        'Acceleration',  # 5
        'Model Year',    # 6
        'Origin',        # 7
        'Car Type',      # 8
    ]
    imp = 'Average Imputation:'
    d_array, stat_a, x, y, x_n, y_n = DataManipulation.average_imputation(
        list(data_array), cont_dis, cols_rmv, sig, y_col)
    F, min_mse, cols_f = forward_selector_test(list(x), list(y), split_array[0])
    f_n, min_mse_n, cols_f_n = forward_selector_test(list(x_n), list(y_n), split_array[0])
    print('F is using ' + str(len(F[0]) - 1) + ' attributes')
    for i in range(len(cols_f)):
        print(attribute_labels[cols_f[i]])
    cod_r, n_cod_r, lse_r, n_lse_r, mse_r, n_mse_r = er_t(
        list(F), list(y), list(f_n), list(y_n), split_array)
    # show_results(imp, error, cod_result, lse_result, mse_result)
    print('Unnormalized Data:')
    show_results(imp, cod_r, lse_r, mse_r)
    print('')
    print('Normalized Data:')
    show_results(imp, n_cod_r, n_lse_r, n_mse_r)
    return
def kfoldCrossValidate(k):
    '''Cross-validate the model with k folds.'''
    df = shuffle(DM.clean(DM.offData))
    loss = []
    # sklearn >= 0.18 API: KFold takes n_splits and indices come from split();
    # the original used the pre-0.18 KFold(rows, n_folds=k) signature.
    kf = KFold(n_splits=k)
    for train_index, test_index in kf.split(df):
        train_set = df.iloc[train_index]
        test_set = df.iloc[test_index]
        train_x = train_set.drop(['R$'], axis=1)
        train_y = train_set['R$']
        test_x = test_set.drop(['R$'], axis=1)
        test_y = test_set['R$']
        l = train(train_x, train_y, test_x, test_y)
        loss.append(l)
    return loss
def collect_parameters2(x_d, y_d, split_a):
    w_list = list()
    tr_l = list()
    y_tr_l = list()
    val_l = list()
    y_val_l = list()
    for x in range(len(split_a)):
        tr, val, y_tr, y_val, rand = DataManipulation.dos_data_splitter(
            x_d, y_d, split_a[x])
        # get w from training data
        w_list.append(multi_linear_regressor(tr, y_tr))
        tr_l.append(tr)
        y_tr_l.append(y_tr)
        val_l.append(val)
        y_val_l.append(y_val)
    return w_list, tr_l, y_tr_l, val_l, y_val_l
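# multi_linear_regressor is not shown in these snippets; the name suggests
# ordinary least squares. A minimal numpy sketch under that assumption:
def multi_linear_regressor(X, y):
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    w, _, _, _ = np.linalg.lstsq(X, y, rcond=None)  # minimize ||Xw - y||^2
    return w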
def createQueriesDictionary(Data):
    InvertedIndex, Queries = dm.readInvertedIndex(), {}
    N = 537933  # Total Number of Queries
    for data in Data:
        for i in range(1, 3):
            qid, query = data[i], [data[i + 2]]
            if qid not in Queries:
                Queries.update({qid: query})
    for qid in Queries:
        Words = TextBlob(Queries[qid][0]).lower().words  # Dictionary word -> frequency
        Hashes, Weights = [], []
        try:
            maxf = max(Words.count(w) for w in Words)  # Max frequency of a term in the query
        except:
            continue  # Corrupted data
        for w in Words:
            Hashes.append(hashFunction(w, 64))
            f, n = Words.count(w) / maxf, len(InvertedIndex[w])  # f(t), n(t)
            idf = math.log(N / n) / math.log(N)  # IDF(t)
            Weights.append(f * idf)
        queryHash = HashQuery(Hashes, Weights)
        Queries[qid].append(queryHash)
    with open('Queries.txt', 'wb') as file:
        pickle.dump(Queries, file)
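# Worked example of the term weighting above (numbers are illustrative):
#   f(t)   = count(t in query) / max term count in the query
#   idf(t) = log(N / n(t)) / log(N)
#   weight(t) = f(t) * idf(t)
# With N = 537933 and a term appearing in n(t) = 1000 postings at maximal
# in-query frequency (f(t) = 1):
#   idf(t) = log(537933 / 1000) / log(537933) ≈ 6.288 / 13.196 ≈ 0.48
#   weight(t) ≈ 0.48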
def updatePlot(self):
    '''
    Get the new position data from file and add that point to the
    scatter plot.
    '''
    # Fetch and plot data
    newPos = dm.getLatestPositionData(self.updateCount)
    self.xData.append(newPos[0])
    self.yData.append(newPos[1])
    self.axes.plot(self.xData, self.yData, color='black')
    # Formatting
    self.axes.set_title('X and Y Position Relative to Starting Point')
    self.axes.set_xlabel('meters')
    self.axes.set_ylabel('meters')
    self.axes.legend(loc='lower right')
    # Add data to plot and flush
    self.fig.canvas.draw()
    self.fig.canvas.flush_events()
    return None
def modelPredict(predict_x, path='saved', expected=[0]):
    '''This function rebuilds a NN from a directory where it was saved in a
    training job. It then runs a prediction job based on the given inputs.'''
    # Build the feature columns
    ageCol = tf.feature_column.numeric_column(key='Age')
    atbatCol = tf.feature_column.numeric_column(key='AB')
    hitCol = tf.feature_column.numeric_column(key='H')
    runCol = tf.feature_column.numeric_column(key='R')
    rbiCol = tf.feature_column.numeric_column(key='RBI')
    hrCol = tf.feature_column.numeric_column(key='HR')
    sbCol = tf.feature_column.numeric_column(key='SB')
    # Define the feature columns in a list
    feature_columns = [
        # ageCol, atbatCol, hitCol, runCol, rbiCol, hrCol, sbCol,
        tf.feature_column.indicator_column(
            tf.feature_column.crossed_column(['H', 'AB'], hash_bucket_size=int(1e4))),
        # tf.feature_column.indicator_column(tf.feature_column.crossed_column(['HR', 'RBI', 'R'], hash_bucket_size=int(1e4))),
        # tf.feature_column.indicator_column(tf.feature_column.crossed_column(['H', 'AB', 'SB'], hash_bucket_size=int(1e4))),
        # tf.feature_column.indicator_column(tf.feature_column.crossed_column(['H', 'AB', 'HR', 'RBI', 'R', 'SB'], hash_bucket_size=int(1e4))),
        # tf.feature_column.indicator_column(tf.feature_column.crossed_column(['H', 'AB', 'HR', 'RBI', 'R'], hash_bucket_size=int(1e4))),
    ]
    # Configure checkpoints
    my_checkpointing_config = tf.estimator.RunConfig(
        save_checkpoints_secs=20,  # Save checkpoints every 20 secs.
        keep_checkpoint_max=10,    # Retain the 10 most recent checkpoints.
    )
    # Build the Estimator.
    # model = tf.estimator.LinearRegressor(
    model = tf.estimator.DNNRegressor(
        hidden_units=[31, 22, 15, 12],
        feature_columns=feature_columns,
        config=my_checkpointing_config,
        model_dir=path)
    predictions = model.predict(
        input_fn=lambda: DM.eval_input_fn(predict_x, labels=None, batch_size=100))
    template = '\nPrediction is "{}" , expected "{}"'
    ret = []
    for pred_dict, expec in zip(predictions, expected):
        # ret.append(pred_dict["predictions"])
        print(template.format(pred_dict["predictions"][0], expec))
        ret.append(pred_dict["predictions"][0])
    return ret
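# Hypothetical call; the feature values are made up for illustration and the
# keys follow the feature-column definitions above (assumes DM.eval_input_fn
# accepts a dict of feature lists):
predict_x = {'Age': [27], 'AB': [550], 'H': [160], 'R': [80],
             'RBI': [70], 'HR': [25], 'SB': [10]}
predicted = modelPredict(predict_x, path='saved', expected=[5.2])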
import os
import time
import pdb
import numpy as np
from importlib import reload  # Python 3; reload() was a builtin in Python 2

#################################
##### DataManipulation.py
#################################
import DataManipulation
reload(DataManipulation)
checksum = []

function = 'remove_x'
argument = np.array([[0., 1.], [1., 3.], [2., 1.]])
ideal_output = np.array([1., 3., 1.])
real_output = DataManipulation.remove_x(argument)
a = (ideal_output == real_output).all()
checksum.append(a)
print(function, a)

function = 'add_x'
argument = np.array([1., 3., 1.])
ideal_output = np.array([[0, 1.], [1, 3.], [2, 1.]])
real_output = DataManipulation.add_x(argument)
a = (ideal_output == real_output).all()
checksum.append(a)
print(function, a)

function = 'normalize'