def __init__(self):
    super().__init__()

    self.init_labels()
    self.init_textboxes()
    self.init_buttons()
    self.init_dropdowns()
    self.init_sublayouts()
    self.start_up()

    self.main_widget = QWidget()
    self.main_layout = QtWidgets.QGridLayout(self.main_widget)
    # sizeConstraint is a Qt property; assigning to the attribute does nothing,
    # so go through the setter instead.
    self.main_layout.setSizeConstraint(QtWidgets.QLayout.SetDefaultConstraint)

    self.main_layout.addLayout(self.panel_sublayout, 0, 1)
    self.main_layout.setColumnStretch(0, 1)

    self.canvas = self.init_graphs()
    self.main_layout.addWidget(self.canvas, 0, 0)

    self.main_widget.setLayout(self.main_layout)
    self.setCentralWidget(self.main_widget)

    self.setGeometry(50, 50, 1200, 700)
    self.setWindowTitle("Acconeer Exploration GUI")
    self.show()

    self.radar = data_processing.DataProcessing()
def __init__(self):
    self.dp = data_processing.DataProcessing()
    # word, POS, label
    #self.train_table = np.empty([0, 3])
    self.train_table = np.empty([0, 1])
    self.test_table = np.empty([0, 1])
def get_bioms(self):
    """
    Get the bioms whose prevalence falls in the middle 20-point band:
    present (count > 2) in more than 40% but less than 60% of the
    72 samples (to be or not to be).
    :return: the filtered ASV table as a DataFrame
    """
    #cwd = os.getcwd()
    #population_path = cwd + '/../Data/ASV_table.tsv'
    data = data_processing.DataProcessing()
    population_path = data.url_ASV
    pop_bioms = pd.read_csv(population_path, delimiter=r'\s+', encoding='utf-8')
    to_keep = []
    for i in pop_bioms.columns:
        # count the samples in which this biom is present (count > 2)
        c = 0
        for j in pop_bioms.get(i):
            if j > 2:
                c += 1
        if 0.6 > c / 72 > 0.4:
            to_keep.append(i)
    to_drop = [x for x in pop_bioms.columns if x not in to_keep]
    pop_bioms = pop_bioms.drop(to_drop, axis=1)
    return pop_bioms
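# For reference, a minimal self-contained sketch of the same mid-band
# prevalence filter on a made-up toy table. The count > 2 threshold and the
# 0.4-0.6 band come from get_bioms above; the data and column names here are
# invented.
import pandas as pd

toy = pd.DataFrame({
    "asv_a": [0, 1, 5, 9],   # count > 2 in 2 of 4 samples -> prevalence 0.50, kept
    "asv_b": [3, 4, 5, 6],   # prevalence 1.00, dropped
    "asv_c": [0, 0, 0, 3],   # prevalence 0.25, dropped
})
n_samples = len(toy)          # get_bioms hard-codes 72 for its dataset
prevalence = (toy > 2).sum() / n_samples
kept = toy.loc[:, (prevalence > 0.4) & (prevalence < 0.6)]
print(list(kept.columns))     # ['asv_a']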
def execute_data_processing(count, app_version, launch_time_list):
    data_image = data_processing.DataProcessing(count, app_version, launch_time_list)
    data_image.get_data_visualization_image()
    data_report = data_image.get_data_report()
    recent_image_path = data_image.get_recent_image_path()
    recent_log_path = data_image.get_recent_log_path()
    return data_report, recent_image_path, recent_log_path
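# A hypothetical call site for the helper above. The argument values are
# invented, and launch times are assumed to be in milliseconds.
report, image_path, log_path = execute_data_processing(
    count=10,
    app_version="1.2.3",
    launch_time_list=[812, 790, 805],
)
print(report)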
def test_graph_600k(self):
    """ Displays the graph for the 600k dataset's task 6 test parameters. """
    f = fm.FileManager(self.path_600)
    df = f.parse_json_dataframe()
    p = prc.DataProcessing(df)
    p.run_task_6(self.doc_id_600, self.user_id_600)
def test_format_time(self):
    """Tests that the time is formatted correctly.

    We test the milliseconds-to-d:h:m:s conversion done by this tool.
    test data: 1234567800 ms
    source tool: https://www.convert-me.com/en/convert/time/millisecond/millisecond-to-dhms.html?u=millisecond&v=1%2C234%2C567%2C800
    """
    f = fm.FileManager(file_path=self.path_100)
    df = f.parse_json_dataframe()
    p = prc.DataProcessing(df)
    # 1234567800 ms is 14 days 6 hours 56 minutes 7 seconds
    actual = p._format_time(1234567800)
    expected = "14d : 6h : 56m : 7s"
    self.assertEqual(actual, expected, "Should be %s" % expected)
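# The expected string can be checked by hand with plain divmod arithmetic.
# This is a standalone sketch of the conversion, not the _format_time
# implementation under test:
ms = 1234567800
seconds, _ = divmod(ms, 1000)        # 1234567 s (the 800 ms remainder is dropped)
minutes, s = divmod(seconds, 60)     # 20576 min, 7 s
hours, m = divmod(minutes, 60)       # 342 h, 56 min
d, h = divmod(hours, 24)             # 14 d, 6 h
print(f"{d}d : {h}h : {m}m : {s}s")  # -> 14d : 6h : 56m : 7s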
def test_also_likes_readers_100k(self):
    """ Test that the readers are as expected for a document and visitor in the 100k dataset. """
    f = fm.FileManager(file_path=self.path_100)
    df = f.parse_json_dataframe()
    p = prc.DataProcessing(df)
    # get_relevant_readers returns the set of readers for a document and user id
    set_readers = p.get_relevant_readers(self.doc_id_100, self.user_id_100)
    # We expect only 1 reader based on the given test data
    set_expected = {'4108dc09bfe11a0c'}
    self.assertEqual(set_readers, set_expected, "Should be %s" % set_expected)
def test_also_likes_readers_600k(self):
    """ Test that the readers are as expected for a document and visitor in the 600k dataset. """
    f = fm.FileManager(self.path_600)
    df = f.parse_json_dataframe()
    p = prc.DataProcessing(df)
    # get_relevant_readers returns the set of readers for a document and user id
    set_readers = p.get_relevant_readers(self.doc_id_600, self.user_id_600)
    # We expect the following 4 readers based on the given test data.
    set_expected = {
        '383508ea93fd2fd1',
        '3f64bccfd160557e',
        '1f891eb0b573e42c',
        '7134a88f8b201d31'
    }
    self.assertEqual(set_readers, set_expected, "Should be %s" % set_expected)
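# For context, the relevant-readers step can be expressed in a few lines of
# pandas. This is a hypothetical re-implementation, assuming the dataframe
# uses the field names 'subject_doc_id' and 'visitor_uuid'; the real method
# lives in data_processing.DataProcessing.
import pandas as pd

def relevant_readers(df: pd.DataFrame, doc_id: str, user_id: str) -> set:
    """Visitors who read doc_id, excluding the querying visitor."""
    readers = df.loc[df["subject_doc_id"] == doc_id, "visitor_uuid"]
    return set(readers) - {user_id}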
def read_data(self, pop_bool=True, metadata_bool=True):
    data = data_processing.DataProcessing()
    # Reading files into data frames
    #cwd = os.getcwd()
    population_path = data.url_ASV  #cwd + '/../Data/ASV_table.tsv'
    metadata_path = data.url_metadata  #cwd + '/../Data/Metadata_table.tsv'
    """
    df.columns (identifier)
    df.values (population size)
    population_size.shape -> (72, 14991)
    """
    population_size = pd.read_csv(population_path, delimiter=r'\s+', encoding='utf-8')
    if pop_bool:
        # keep only the bioms that are non-zero in every sample
        population_to_drop = [
            x for x in population_size.columns
            if population_size.get(x).min() == 0
        ]
        population_size = population_size.drop(population_to_drop, axis=1)
    """
    df.columns (properties)
    df.values (values)
    metadata.shape -> (71, 41)
    """
    metadata = pd.read_csv(metadata_path, delimiter=r'\s+', encoding='utf-8')
    # l = ["Latitude", "Longitude", "Altitude", "Area",
    if metadata_bool:
        l = ["Temperature", "Secchi", "O2", "CH4", "pH", "TIC", "SiO2", "KdPAR"]
        to_drop = [x for x in metadata.columns if x not in l]
        metadata = metadata.drop(to_drop, axis=1)
    return population_size, metadata
def select_file():
    """Lets the user select a file and checks if it is valid"""
    file_frame.filename = filedialog.askopenfilename(initialdir="/dataAnalysis/data",
                                                     title="Select Dataset",
                                                     filetypes=FILE_TYPES)
    if file_frame.filename:
        f = fm.FileManager(file_frame.filename)
        if f.check_file_format():  # Check if the selected file is of JSON format
            global df
            # Try loading the dataframe, else show a message box with the relevant error.
            df = f.parse_json_dataframe()
            if not df.empty:
                global dataset
                dataset = pr.DataProcessing(df)
                display_file_info(f)
            else:
                messagebox.showerror(
                    "Value Error",
                    "The JSON file you are trying to load didn't contain valid dictionaries. Please try again")
        else:
            # Display a message box in case the file is in an incorrect format
            messagebox.showerror(title="Bad file format",
                                 message="Please load JSON file only.")
def test_also_likes_documents_100k(self):
    """ Test that the documents are as expected for a document and visitor in the 100k dataset. """
    f = fm.FileManager(file_path=self.path_100)
    df = f.parse_json_dataframe()
    p = prc.DataProcessing(df)
    # get_relevant_readers returns the set of readers for a document and user id
    set_readers = p.get_relevant_readers(self.doc_id_100, self.user_id_100)
    # get the documents that these readers like
    set_docs = p.get_documents(set_readers)
    # We expect only 4 documents (for the single reader) based on the given test data
    set_expected = {
        '4108dc09bfe11a0c': {
            '100405170355-00000000ee4bfd24d2ff703b9147dd59',
            '100806162735-00000000115598650cb8b514246272b5',
            '100806172045-0000000081705fbea3553bd0d745b92f',
            '101122221951-00000000a695c340822e61891c8f14cf'
        }
    }
    self.assertEqual(set_docs, set_expected, "Should be %s" % set_expected)
# Python library imports
import tkinter as tk
from tkinter import filedialog
from tkinter import messagebox
from tkinter import simpledialog

# Local library imports
import file_manager as fm
import data_processing as pr

# Placeholder dataset until the user loads a file (see select_file)
df = None
dataset = pr.DataProcessing(df)

# Create a window on which all our widgets will be built
root = tk.Tk()
# Set the dimensions
root.geometry("940x600")
# Set the title of the window
root.title("Python Data Analysis App | Coursework 2")
# root.resizable(width=False, height=False)

# Define rows in the grid
root.grid_rowconfigure(0, weight=2)
root.grid_rowconfigure(1, weight=1)
root.grid_rowconfigure(2, weight=16)
root.grid_rowconfigure(3, weight=10)
# Define columns in the grid
root.grid_columnconfigure(0, weight=6)
root.grid_columnconfigure(1, weight=4)
def __init__(self):
    self.a = Q1.DataProcessing()
    # the first column does not need a '1' (bias term) added!
    standard = self.a.standardForm()  # compute once instead of three times
    self.traingSet = standard[0]
    self.testSet = standard[1]
    self.validationSet = standard[2]
# Mapping from class id to human-readable traffic-sign category
label_map = {
    1: "No entry",
    2: "No parking / waiting",
    3: "No turning",
    4: "Max Speed",
    5: "Other prohibition signs",
    6: "Warning",
    7: "Mandatory",
}
num_classes = 7
batch_size = args.batch_size

fdataset = tf.data.TFRecordDataset(TFRECORDS_FILE)
data_processor = data_processing.DataProcessing(400, 154)
label_encoder = m.LabelEncoder()

# Decode, shuffle, pad-batch, and encode the detection targets
dataset = fdataset.map(data_processor.preprocess_data)
dataset = dataset.shuffle(8 * batch_size)
dataset = dataset.padded_batch(
    batch_size,
    padding_values=(0.0, 1e-8, tf.cast(-1, tf.int64)),
    drop_remainder=True,
)
dataset = dataset.map(label_encoder.encode_batch, num_parallel_calls=autotune)
dataset = dataset.apply(tf.data.experimental.ignore_errors())
dataset = dataset.prefetch(autotune)

val_size = 500
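# A hypothetical smoke test for the pipeline above, assuming encode_batch
# yields (images, targets) pairs as in RetinaNet-style input pipelines:
for images, targets in dataset.take(1):
    print(images.shape)   # (batch_size, height, width, channels) after preprocessing
    print(targets.shape)  # encoded per-anchor regression/class targets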
def Run(self, *args):
    # getting data for the main problem
    dataproc = data_processing.DataProcessing()
    data = dataproc.GetMainData()
    # getting everything we need
    CDOM, CDOM_sorted, CDOM_diag_mesh, \
        ASV, ASV_ranged, \
        metadata, metadata_scaled, \
        X_ASV, y_CDOM = data

    # XGboost with scikit-learn - data with spatial component (BCC Bray distance by CDOM)
    # Molten meshgrid CDOM values for real data BCC Bray distances
    X_CDOM = CDOM.loc[:, ["CDOM.x1", "CDOM.x2"]]
    # Molten meshgrid CDOM values for generating predicted BCC Bray distances
    X_CDOM_diag_mesh = CDOM_diag_mesh.loc[:, ["CDOM.x1", "CDOM.x2"]]
    y_CDOM = CDOM.loc[:, "ASV.dist"]

    if self.paramData['type'] == 'ffnn_keras':
        # retrieving network data
        NNdata = self.PreProcessing()
        # printing the parameter file
        print(self.paramData)
        ''' Getting Network Architecture '''
        network = neural.NeuralNetwork(NNdata)
        # passing network architecture and creating the model
        model = network.BuildModel()
        # training model
        model, history = network.TrainModel(model, self.X_train, self.X_test,
                                            self.Y_train_onehot,
                                            self.Y_test_onehot)
                                            #self.X_norm, self.Y_onehot)
        test_loss, test_acc = model.evaluate(self.X_test, self.Y_test_onehot)
        print('Test accuracy:', test_acc)
        # Plotting results
        self.funcs.PlotResultsKeras(history, self.paramData['type'],
                                    self.paramData['OutputPath'],
                                    self.paramData['epochs'],
                                    self.paramData['Optimization'],
                                    self.paramData['BatchSize'])
    elif self.paramData['type'] == 'snn_keras':
        # retrieving network data
        NNdata = self.PreProcessing()
        ''' Getting Network Architecture '''
        network = neural.NeuralNetwork(NNdata)
        # passing network architecture and creating the model
        model = network.BuildModel()
        # training model
        model, history = network.TrainModel(model, self.pairs_train,
                                            self.Y_train_onehot,
                                            self.pairs_test,
                                            self.Y_test_onehot)
        # Plotting results
        self.funcs.PlotResultsKeras(history, self.paramData['type'],
                                    self.paramData['OutputPath'],
                                    self.paramData['epochs'],
                                    self.paramData['Optimization'],
                                    self.paramData['BatchSize'])
    elif self.paramData['type'] == 'tnn_keras':
        # retrieving network data
        NNdata = self.PreProcessing()
        ''' Getting Network Architecture '''
        network = neural.NeuralNetwork(NNdata)
        # passing network architecture and creating the model
        model = network.BuildModel()
        # training model
        model, history = network.TrainModel(model, self.triplets_train,
                                            self.Y_train_onehot,
                                            self.triplets_test,
                                            self.Y_test_onehot)
        # Plotting results
        self.funcs.PlotResultsKeras(history, self.paramData['type'],
                                    self.paramData['OutputPath'],
                                    self.paramData['epochs'],
                                    self.paramData['Optimization'],
                                    self.paramData['BatchSize'])
    elif self.paramData['type'] == 'ffnn_manual':
        # Neural network with multiple layers - regression - BCC Bray distances by CDOM - original data
        X_CDOM = CDOM.loc[:, ["CDOM.x1", "CDOM.x2"]].to_numpy()
        y_CDOM = CDOM.loc[:, "ASV.dist"].to_numpy()[:, np.newaxis]  #Original data
        '''
        NN_reg_original = neural.NeuralNetworkML(X_CDOM, y_CDOM, trainingShare=0.80,
                                                 n_hidden_layers=3,
                                                 n_hidden_neurons=[2000, 1000, 500],
                                                 n_categories=1,
                                                 epochs=10, batch_size=10,
                                                 eta=1e-8, lmbd=0, fixed_LR=False,
                                                 method="regression",
                                                 activation="sigmoid",
                                                 seed=self.paramData['RandomSeed'])
        '''
        # every hidden layer after the first gets half the neurons of the previous one
        n_hidden_neurons = []
        for layer in range(self.paramData['NHiddenLayers']):
            n_hidden_neurons.append(self.paramData['NHiddenNeurons'])
        for layer in range(1, self.paramData['NHiddenLayers'], 1):
            n_hidden_neurons[layer] = int(n_hidden_neurons[layer - 1] / 2)
        #print(n_hidden_neurons)
        NN_reg_original = neural.NeuralNetworkML(
            X_CDOM, y_CDOM,
            trainingShare=1 - self.paramData['TestSize'],
            n_hidden_layers=self.paramData['NHiddenLayers'],
            n_hidden_neurons=n_hidden_neurons,
            n_categories=1,
            epochs=self.paramData['epochs'],
            batch_size=self.paramData['BatchSize'],
            eta=self.paramData['alpha'],
            lmbd=0,
            fixed_LR=False,
            method="regression",
            activation="sigmoid",
            seed=self.paramData['RandomSeed'])
        NN_reg_original.train()
        # Plotting results
        self.funcs.PlotResultsManualFFNN(NN_reg_original, CDOM,
                                         self.paramData['type'],
                                         self.paramData['OutputPath'],
                                         self.paramData['epochs'],
                                         self.paramData['BatchSize'])
    elif self.paramData['type'] == 'xgb':
        X_train, X_test, y_train, y_test = train_test_split(
            X_CDOM, y_CDOM,
            train_size=1 - self.paramData['TestSize'],
            test_size=self.paramData['TestSize'],
            random_state=self.paramData['RandomSeed'])
        # initialising xgboosting
        xgboosting = xgb.XGBoosting()
        model = xgboosting.RunModel(X_train, X_test, y_train, y_test,
                                    X_CDOM, X_CDOM_diag_mesh, CDOM, CDOM_sorted,
                                    self.paramData['OutputPath'])
        # Get best model by test MSE
        XGboost_best_model_index = model.best_iteration
        XGboost_best_iteration = model.get_booster().best_ntree_limit
        MSE_per_epoch = model.evals_result()
        # make predictions for test data
        y_pred = model.predict(X_test, ntree_limit=XGboost_best_iteration)
        y_pred_train = model.predict(X_train)
        #predictions = [round(value) for value in y_pred]
        best_prediction = model.predict(X_CDOM, ntree_limit=XGboost_best_iteration)
        # CDOM_pred.shape: (2556,); CDOM_pred are the predicted BCC Bray distances for CDOM value pairs
        CDOM_pred = best_prediction.copy()
        CDOM_pred_fine_mesh = model.predict(X_CDOM_diag_mesh,
                                            ntree_limit=XGboost_best_iteration)
        '''
        y_pred, \
        y_pred_train, \
        MSE_per_epoch, \
        CDOM_pred, \
        CDOM_pred_fine_mesh, \
        XGboost_best_model_index = xgboosting.RunModel(X_train, X_test, y_train,
                                                       y_test, X_CDOM,
                                                       X_CDOM_diag_mesh, CDOM,
                                                       CDOM_sorted,
                                                       self.paramData['OutputPath'])
        '''
        # plotting 3d plots and mse for XGBoost
        self.funcs.PlotResultsXGBoost(CDOM, CDOM_sorted, X_CDOM_diag_mesh,
                                      CDOM_pred_fine_mesh, CDOM_pred,
                                      self.paramData['OutputPath'],
                                      y_pred, y_pred_train, MSE_per_epoch,
                                      y_train, y_test, XGboost_best_model_index)
    elif self.paramData['type'] == 'rf_main':
        rf = random_forest.RandomForest()  # Laurent
        population_size, metadata = rf.read_data(False, False)
        predictions, test_y, ML_ = rf.prepare_data(population_size, metadata,
                                                   self.paramData['TestSize'],
                                                   self.paramData['RandomSeed'])
        all_predictions = rf.predict_all_metadata(population_size, metadata, ML_)

        # we will compare the outcome with xgboost
        def MergeTable(var_list, metadata_variables):
            table = pd.DataFrame(np.concatenate((var_list), axis=1))
            table.columns = metadata_variables
            return table

        def PredictMetadata(ASV_table, metadata_variables, train_size,
                            test_size, seed):
            X_ASV = ASV_table
            X_ASV.columns = [''] * len(X_ASV.columns)
            X_ASV = X_ASV.to_numpy()
            metadata_list = []
            for i in metadata_variables:
                #y_CDOM = metadata.loc[:, i][:, np.newaxis]
                # split data into train and test sets
                y_meta = metadata.loc[:, i]  # Requires 1d array
                X_train, X_test, y_train, y_test = train_test_split(
                    X_ASV, y_meta,
                    train_size=train_size,
                    test_size=test_size,
                    random_state=seed)
                # fit model on training data
                model = XGBRegressor(objective='reg:squarederror')
                model.fit(X_train, y_train,
                          eval_set=[(X_train, y_train), (X_test, y_test)],
                          eval_metric='rmse',
                          early_stopping_rounds=100,
                          verbose=False)
                # Get best model by test MSE
                XGboost_best_model_index = model.best_iteration
                XGboost_best_iteration = model.get_booster().best_ntree_limit
                # make predictions for full dataset
                y_pred = model.predict(X_ASV, ntree_limit=XGboost_best_iteration)
                metadata_list.append(y_pred[:, np.newaxis])
            return MergeTable(metadata_list, metadata_variables)

        var_list = ["Latitude", "Longitude", "Altitude", "Area", "Depth",
                    "Temperature", "Secchi", "O2", "CH4", "pH", "TIC",
                    "SiO2", "KdPAR"]
        train_size = 1 - self.paramData['TestSize']
        test_size = self.paramData['TestSize']
        seed = self.paramData['RandomSeed']
        predicted_metadata = PredictMetadata(ASV, var_list, train_size,
                                             test_size, seed)
        with pd.option_context('display.max_rows', None,
                               'display.max_columns', None):
            # more options can be specified also
            print(predicted_metadata)
    elif self.paramData['type'] == 'rf_side':
        # retrieving network data
        NNdata = self.PreProcessing()
        rf = random_forest.RandomForest()
        seed = self.paramData['RandomSeed']
        clfs, scores_test, scores_train = rf.predict_t(self.X_train,
                                                       self.X_test,
                                                       self.y_train_l,
                                                       self.y_test_l, seed)
    elif self.paramData['type'] == 'all':
        ''' Neural Network '''
        # Neural network with multiple layers - regression - BCC Bray distances by CDOM - original data
        X_CDOM = CDOM.loc[:, ["CDOM.x1", "CDOM.x2"]].to_numpy()
        y_CDOM = CDOM.loc[:, "ASV.dist"].to_numpy()[:, np.newaxis]  #Original data
        n_hidden_neurons = []
        for layer in range(self.paramData['NHiddenLayers']):
            n_hidden_neurons.append(self.paramData['NHiddenNeurons'])
        for layer in range(1, self.paramData['NHiddenLayers'], 1):
            n_hidden_neurons[layer] = int(n_hidden_neurons[layer - 1] / 2)
        #print(n_hidden_neurons)
        NN_reg_original = neural.NeuralNetworkML(
            X_CDOM, y_CDOM,
            trainingShare=1 - self.paramData['TestSize'],
            n_hidden_layers=self.paramData['NHiddenLayers'],
            n_hidden_neurons=n_hidden_neurons,
            n_categories=1,
            epochs=self.paramData['epochs'],
            batch_size=self.paramData['BatchSize'],
            eta=self.paramData['alpha'],
            lmbd=0,
            fixed_LR=False,
            method="regression",
            activation="sigmoid",
            seed=self.paramData['RandomSeed'])
        NN_reg_original.train()
        x_mesh = np.log10(np.arange(min(CDOM.loc[:, "CDOM.x1"]),
                                    max(CDOM.loc[:, "CDOM.x2"]) + 0.01,
                                    0.01)) + 1
        y_mesh = x_mesh.copy()
        x_mesh, y_mesh = np.meshgrid(x_mesh, y_mesh)
        X_CDOM_mesh = self.funcs.pdCat(x_mesh.ravel()[:, np.newaxis],
                                       y_mesh.ravel()[:, np.newaxis]).to_numpy()
        best_prediction = NN_reg_original.model_prediction(
            X_CDOM_mesh,
            NN_reg_original.accuracy_list.index(
                min(NN_reg_original.accuracy_list)))
        x_mesh = np.arange(min(CDOM.loc[:, "CDOM.x1"]),
                           max(CDOM.loc[:, "CDOM.x2"]) + 0.01, 0.01)
        y_mesh = x_mesh.copy()
        x_mesh, y_mesh = np.meshgrid(x_mesh, y_mesh)
        ff_pred_original = best_prediction.copy()
        ff_pred_original = np.reshape(ff_pred_original, (363, 363))
        ff_pred_original[x_mesh - y_mesh == 0] = np.nan
        ff_pred_original[x_mesh > y_mesh] = np.nan

        ''' XGBoost part '''
        # Molten meshgrid CDOM values for real data BCC Bray distances
        X_CDOM = CDOM.loc[:, ["CDOM.x1", "CDOM.x2"]]
        # Molten meshgrid CDOM values for generating predicted BCC Bray distances
        X_CDOM_diag_mesh = CDOM_diag_mesh.loc[:, ["CDOM.x1", "CDOM.x2"]]
        y_CDOM = CDOM.loc[:, "ASV.dist"]
        X_train, X_test, y_train, y_test = train_test_split(
            X_CDOM, y_CDOM,
            train_size=1 - self.paramData['TestSize'],
            test_size=self.paramData['TestSize'],
            random_state=self.paramData['RandomSeed'])
        # initialising xgboosting
        xgboosting = xgb.XGBoosting()
        model = xgboosting.RunModel(X_train, X_test, y_train, y_test,
                                    X_CDOM, X_CDOM_diag_mesh, CDOM, CDOM_sorted,
                                    self.paramData['OutputPath'])
        # Get best model by test MSE
        XGboost_best_model_index = model.best_iteration
        XGboost_best_iteration = model.get_booster().best_ntree_limit
        MSE_per_epoch = model.evals_result()
        # make predictions for test data
        y_pred = model.predict(X_test, ntree_limit=XGboost_best_iteration)
        y_pred_train = model.predict(X_train)
        #predictions = [round(value) for value in y_pred]
        best_prediction = model.predict(X_CDOM, ntree_limit=XGboost_best_iteration)
        # CDOM_pred.shape: (2556,); CDOM_pred are the predicted BCC Bray distances for CDOM value pairs
        CDOM_pred = best_prediction.copy()
        CDOM_pred_fine_mesh = model.predict(X_CDOM_diag_mesh,
                                            ntree_limit=XGboost_best_iteration)

        ''' Simple OLS - generating design matrix out of data set etc. '''
        reg = regression.Regression()
        # The low number of points on the higher end of the gradient causes distortions for linear regression
        X_mesh = reg.GenerateMesh(0.21, 3.83, 0.21, 3.83, 0.01, 0.01,
                                  log_transform=True)
        X_mesh_degree_list = reg.DesignMatrixList(X_mesh[0], X_mesh[1], 12)[1:]
        X_degree_list = reg.DesignMatrixList(CDOM.loc[:, "CDOM.x1"],
                                             CDOM.loc[:, "CDOM.x2"], 12)[1:]
        X_degree_list_subset = []
        z = CDOM_pred  #XGboost-predicted values
        z = CDOM.loc[:, "ASV.dist"]  #Original data
        #ebv_no_resampling = reg.generate_error_bias_variance_without_resampling(X_degree_list, 1)
        #ebv_resampling = reg.generate_error_bias_variance_with_resampling(X_degree_list, 1, 100)
        #reg.ebv_by_model_complexity(ebv_resampling)
        #reg.training_vs_test(ebv_no_resampling)
        CDOM_pred_reg = X_mesh_degree_list[8] @ reg.beta_SVD(X_degree_list[8],
                                                             CDOM_pred)
        #print(pd.DataFrame(X_mesh_degree_list[1]))
        #print(CDOM_pred_reg)
        #with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        #    print(pd.DataFrame(CDOM_pred_reg))
        x_mesh_reg = np.arange(min(CDOM.loc[:, "CDOM.x1"]),
                               max(CDOM.loc[:, "CDOM.x2"]) + 0.01, 0.01)
        y_mesh_reg = x_mesh_reg.copy()
        x_mesh_reg, y_mesh_reg = np.meshgrid(x_mesh_reg, y_mesh_reg)
        X_CDOM_mesh = self.funcs.pdCat(x_mesh_reg.ravel()[:, np.newaxis],
                                       y_mesh_reg.ravel()[:, np.newaxis])
        #print(pd.DataFrame(X_CDOM_mesh))
        #print("CDOM_pred_reg.shape", CDOM_pred_reg.shape)
        z_CDOM_mesh_pred = np.reshape(CDOM_pred_reg,
                                      (x_mesh_reg.shape[0], x_mesh_reg.shape[0]))
        z_CDOM_mesh_pred[x_mesh_reg - y_mesh_reg == 0] = np.nan
        z_CDOM_mesh_pred[x_mesh_reg > y_mesh_reg] = np.nan

        ''' Neural Network with data from XGBoost '''
        X_CDOM = CDOM.loc[:, ["CDOM.x1", "CDOM.x2"]].to_numpy()
        y_CDOM = CDOM_pred[:, np.newaxis]  #Predicted data from XGboost
        n_hidden_neurons = []
        for layer in range(self.paramData['NHiddenLayers']):
            n_hidden_neurons.append(self.paramData['NHiddenNeurons'])
        for layer in range(1, self.paramData['NHiddenLayers'], 1):
            n_hidden_neurons[layer] = int(n_hidden_neurons[layer - 1] / 2)
        #print(n_hidden_neurons)
        NN_reg = neural.NeuralNetworkML(
            X_CDOM, y_CDOM,
            trainingShare=1 - self.paramData['TestSize'],
            n_hidden_layers=self.paramData['NHiddenLayers'],
            n_hidden_neurons=n_hidden_neurons,
            n_categories=1,
            epochs=self.paramData['epochs'],
            batch_size=self.paramData['BatchSize'],
            eta=self.paramData['alpha'],
            lmbd=0,
            fixed_LR=False,
            method="regression",
            activation="sigmoid",
            seed=self.paramData['RandomSeed'])
        NN_reg.train()
        test_predict = NN_reg.predict(NN_reg.XTest)
        print(NN_reg.accuracy_list)
        # Use log-transformed CDOM values for creating the design matrix, then plot on original values
        x_mesh = np.log10(np.arange(min(CDOM.loc[:, "CDOM.x1"]),
                                    max(CDOM.loc[:, "CDOM.x2"]) + 0.01,
                                    0.01)) + 1
        y_mesh = x_mesh.copy()
        x_mesh, y_mesh = np.meshgrid(x_mesh, y_mesh)
        X_CDOM_mesh = self.funcs.pdCat(x_mesh.ravel()[:, np.newaxis],
                                       y_mesh.ravel()[:, np.newaxis]).to_numpy()
        best_prediction = NN_reg.model_prediction(
            X_CDOM_mesh,
            NN_reg.accuracy_list.index(min(NN_reg.accuracy_list)))
        x_mesh = np.arange(min(CDOM.loc[:, "CDOM.x1"]),
                           max(CDOM.loc[:, "CDOM.x2"]) + 0.01, 0.01)
        y_mesh = x_mesh.copy()
        x_mesh, y_mesh = np.meshgrid(x_mesh, y_mesh)
        ff_pred = best_prediction.copy()
        ff_pred = np.reshape(ff_pred, (363, 363))
        ff_pred[x_mesh - y_mesh == 0] = np.nan
        ff_pred[x_mesh > y_mesh] = np.nan

        ''' Plotting 3d graphs for all data '''
        fontsize = 6
        # Compare raw data to XGboost, neural network predicted data and XGboost predicted data smoothed with neural network
        fig = plt.figure(figsize=plt.figaspect(0.5))
        ax = fig.add_subplot(2, 3, 1, projection='3d')
        ax.set_title("BCC Bray distances by sites' DOM", fontsize=fontsize)
        #plt.subplots_adjust(left=0, bottom=0, right=2, top=2, wspace=0, hspace=0)
        ax.view_init(elev=30.0, azim=300.0)
        surf = ax.plot_trisurf(CDOM.loc[:, "CDOM.x1"], CDOM.loc[:, "CDOM.x2"],
                               CDOM.loc[:, "ASV.dist"],
                               cmap='viridis', edgecolor='none')
        # Customize the z axis.
        ax.set_zlim(0.3, 1)
        ax.zaxis.set_major_locator(LinearLocator(10))
        ax.zaxis.set_major_formatter(FormatStrFormatter("%.02f"))
        ax.tick_params(labelsize=8)
        ax.set_zlabel(zlabel="Bray distance")
        ax.set_ylabel(ylabel="DOM site 2")
        ax.set_xlabel(xlabel="DOM site 1")

        # Set up the axes for the second plot
        ax = fig.add_subplot(2, 3, 2, projection='3d')
        #ax.set_title("XGboost-Predicted BCC Bray distances by sites' CDOM, dataset CDOM coordinates", fontsize=8)
        ax.set_title("XGboost-Predicted BCC \n Bray distances by sites' DOM",
                     fontsize=fontsize)
        ax.view_init(elev=30.0, azim=300.0)
        # Plot the surface.
        ax.plot_trisurf(CDOM.loc[:, "CDOM.x1"], CDOM.loc[:, "CDOM.x2"],
                        CDOM_pred,  #197109 datapoints
                        cmap='viridis', edgecolor='none')
        # Customize the z axis.
        z_range = (np.nanmax(CDOM_pred) - np.nanmin(CDOM_pred))
        ax.set_zlim(np.nanmin(CDOM_pred) - z_range, 1)
        ax.zaxis.set_major_locator(LinearLocator(10))
        ax.zaxis.set_major_formatter(FormatStrFormatter("%.02f"))
        ax.tick_params(labelsize=8)
        ax.set_zlabel(zlabel="Bray distance")
        ax.set_ylabel(ylabel="DOM site 2")
        ax.set_xlabel(xlabel="DOM site 1")

        # Set up the axes for the third plot
        ax = fig.add_subplot(2, 3, 3, projection='3d')
        #ax.set_title("OLS (SVD) regression-predicted BCC Bray distances by sites' CDOM, CDOM 0.01 step meshgrid", fontsize=6)
        ax.set_title("OLS (SVD) regression-predicted \n BCC Bray distances by sites' DOM",
                     fontsize=fontsize)
        ax.view_init(elev=30.0, azim=300.0)
        # Plot the surface.
        ax.plot_trisurf(x_mesh_reg.ravel(), y_mesh_reg.ravel(),
                        z_CDOM_mesh_pred.ravel(),  #197109 datapoints
                        cmap='viridis',
                        vmin=np.nanmin(z_CDOM_mesh_pred),
                        vmax=np.nanmax(z_CDOM_mesh_pred),
                        edgecolor='none')
        # Customize the z axis.
        z_range = (np.nanmax(z_CDOM_mesh_pred) - np.nanmin(z_CDOM_mesh_pred))
        ax.set_zlim(np.nanmin(z_CDOM_mesh_pred) - z_range, 1)
        ax.zaxis.set_major_locator(LinearLocator(10))
        ax.zaxis.set_major_formatter(FormatStrFormatter("%.02f"))
        ax.tick_params(labelsize=8)
        ax.set_zlabel(zlabel="Bray distance")
        ax.set_ylabel(ylabel="DOM site 2")
        ax.set_xlabel(xlabel="DOM site 1")

        # Set up the axes for the fourth plot
        ax = fig.add_subplot(2, 3, 4, projection='3d')
        #ax.set_title("NN-smoothed XGboost-predicted BCC Bray distances by sites' CDOM, CDOM 0.01 step meshgrid", fontsize=6)
        ax.set_title("NN-smoothed XGboost-predicted \n BCC Bray distances by sites' DOM",
                     fontsize=fontsize)
        ax.view_init(elev=30.0, azim=300.0)
        # Plot the surface.
        ax.plot_trisurf(x_mesh.ravel(), y_mesh.ravel(),
                        ff_pred.ravel(),  #197109 datapoints
                        cmap='viridis', edgecolor='none',
                        vmin=np.nanmin(ff_pred),
                        vmax=np.nanmax(ff_pred))
        # Customize the z axis.
        z_range = (np.nanmax(ff_pred) - np.nanmin(ff_pred))
        ax.set_zlim(np.nanmin(ff_pred) - z_range, 1)
        ax.zaxis.set_major_locator(LinearLocator(10))
        ax.zaxis.set_major_formatter(FormatStrFormatter("%.02f"))
        ax.tick_params(labelsize=8)
        ax.set_zlabel(zlabel="Bray distance")
        ax.set_ylabel(ylabel="DOM site 2")
        ax.set_xlabel(xlabel="DOM site 1")

        # Set up the axes for the fifth plot
        ax = fig.add_subplot(2, 3, 5, projection='3d')
        #ax.set_title("NN-predicted BCC Bray distances by sites' CDOM, CDOM 0.01 step meshgrid", fontsize=8)
        ax.set_title("NN-predicted BCC Bray \n distances by sites' DOM",
                     fontsize=fontsize)
        ax.view_init(elev=30.0, azim=300.0)
        # Plot the surface.
        ax.plot_trisurf(x_mesh.ravel(), y_mesh.ravel(),
                        ff_pred_original.ravel(),  #197109 datapoints
                        cmap='viridis', edgecolor='none',
                        vmin=np.nanmin(ff_pred_original),
                        vmax=np.nanmax(ff_pred_original))
        # Customize the z axis.
        z_range = (np.nanmax(ff_pred_original) - np.nanmin(ff_pred_original))
        ax.set_zlim(np.nanmin(ff_pred_original) - z_range, 1)
        ax.zaxis.set_major_locator(LinearLocator(10))
        ax.zaxis.set_major_formatter(FormatStrFormatter("%.02f"))
        ax.tick_params(labelsize=8)
        ax.set_zlabel(zlabel="Bray distance")
        ax.set_ylabel(ylabel="DOM site 2")
        ax.set_xlabel(xlabel="DOM site 1")

        # Set up the axes for the sixth plot
        ax = fig.add_subplot(2, 3, 6, projection='3d')
        #ax.set_title("XGboost-predicted BCC Bray distances by sites' CDOM, CDOM 0.01 step meshgrid", fontsize=8)
        ax.set_title("XGboost-predicted BCC Bray \n distances by sites' DOM",
                     fontsize=fontsize)
        ax.view_init(elev=30.0, azim=300.0)
        # Plot the surface.
        ax.plot_trisurf(X_CDOM_diag_mesh.loc[:, "CDOM.x1"],
                        X_CDOM_diag_mesh.loc[:, "CDOM.x2"],
                        CDOM_pred_fine_mesh,  #197109 datapoints
                        cmap='viridis', edgecolor='none')
        # Customize the z axis.
        z_range = (np.nanmax(CDOM_pred_fine_mesh) - np.nanmin(CDOM_pred_fine_mesh))
        ax.set_zlim(np.nanmin(CDOM_pred_fine_mesh) - z_range, 1)
        ax.zaxis.set_major_locator(LinearLocator(10))
        ax.zaxis.set_major_formatter(FormatStrFormatter("%.02f"))
        ax.tick_params(labelsize=8)
        ax.set_zlabel(zlabel="Bray distance")
        ax.set_ylabel(ylabel="DOM site 2")
        ax.set_xlabel(xlabel="DOM site 1")

        #filename = self.paramData['OutputPath']
        filename = self.paramData['OutputPath'] + '/' + 'everything_3d' + '.png'
        fig.savefig(filename)
        plt.show()
def test_also_likes_documents_600k(self):
    """ Test that the documents are as expected for a document and visitor in the 600k dataset. """
    f = fm.FileManager(self.path_600)
    df = f.parse_json_dataframe()
    p = prc.DataProcessing(df)
    # get_relevant_readers returns the set of readers for a document and user id
    set_readers = p.get_relevant_readers(self.doc_id_600, self.user_id_600)
    # get the documents that these readers like
    set_docs = p.get_documents(set_readers)
    # We expect documents for the following 4 readers based on the given test data.
    set_expected = {
        '1f891eb0b573e42c': {
            '130308221433-09f8d746cb5e46f79842433817ffa908',
            '130322204045-7e140c31b4df4b8da1b0d4a410620ad1',
            '130406004921-f9e3072c82364ccfba25da4bc8be3b04',
            '130412203635-288742d148524251b4ef59dfaa222008',
            '130412215325-b2802be64be04a86b8c67acede394982',
            '130517181940-3f89e9f4524d4e769c205ed6f1b0e7ae',
            '130601015527-c1e2993d8290975e7ef350f078134390',
            '130626002918-2e934fcf5642becffed4c4325fcfa6d8',
            '130813183014-f447fd9c4d6abcdfb20e8f0d925c63fd',
            '130828160643-3f7e01676f04a2f60d02f80fcbd702e1',
            '130829034400-ae346135ab80c636d6d7b4c0f7960c41',
            '130829155547-4da063e3c66df0bc6149aced2abc3720',
            '130930182254-898ec9d4d3724afb31b1168517d4228a',
            '131004224723-076660492fa2c66e5398e3dde8890d73',
            '131022215916-907a48e13645fa9a81860efd03e85352',
            '140207031738-eb742a5444c9b73df2d1ec9bff15dae9'
        },
        '383508ea93fd2fd1': {
            '130412203635-288742d148524251b4ef59dfaa222008',
            '130412215325-b2802be64be04a86b8c67acede394982',
            '130601015527-c1e2993d8290975e7ef350f078134390',
            '130828160643-3f7e01676f04a2f60d02f80fcbd702e1',
            '130930182254-898ec9d4d3724afb31b1168517d4228a',
            '131022215916-907a48e13645fa9a81860efd03e85352',
            '140207031738-eb742a5444c9b73df2d1ec9bff15dae9'
        },
        '3f64bccfd160557e': {
            '130406004921-f9e3072c82364ccfba25da4bc8be3b04',
            '130601015527-c1e2993d8290975e7ef350f078134390',
            '130626002918-2e934fcf5642becffed4c4325fcfa6d8',
            '130813183014-f447fd9c4d6abcdfb20e8f0d925c63fd',
            '130828160643-3f7e01676f04a2f60d02f80fcbd702e1',
            '130829034400-ae346135ab80c636d6d7b4c0f7960c41',
            '131030220741-ce78b0b193120c40fd3916fb616b63ce',
            '140207031738-eb742a5444c9b73df2d1ec9bff15dae9'
        },
        '7134a88f8b201d31': {
            '130308221433-09f8d746cb5e46f79842433817ffa908',
            '130322204045-7e140c31b4df4b8da1b0d4a410620ad1',
            '130626002918-2e934fcf5642becffed4c4325fcfa6d8',
            '130813183014-f447fd9c4d6abcdfb20e8f0d925c63fd',
            '130829155547-4da063e3c66df0bc6149aced2abc3720',
            '130930182254-898ec9d4d3724afb31b1168517d4228a',
            '131022215916-907a48e13645fa9a81860efd03e85352',
            '131030220741-ce78b0b193120c40fd3916fb616b63ce',
            '140207031738-eb742a5444c9b73df2d1ec9bff15dae9'
        }
    }
    self.assertEqual(set_docs, set_expected, "Should be %s" % set_expected)
def run(args):
    print("")  # Leave a gap
    print("Starting task %s" % args.task)
    # If the task value is 2a, then run
    if args.task == "2a":
        # Create file object
        f = fm.FileManager(args.file)
        # Create the dataframe
        df = f.parse_json_dataframe()
        # If the df was not empty, then run this
        if not df.empty:
            # Send it to data processing
            dataset = pr.DataProcessing(df)
            # Run the task for 2a.
            dataset.histogram_country(args.docid)
    # If the task value is 2b, then run
    elif args.task == "2b":
        f = fm.FileManager(args.file)
        df = f.parse_json_dataframe()
        if not df.empty:
            dataset = pr.DataProcessing(df)
            dataset.histogram_continent(args.docid)
    elif args.task == "3a":
        f = fm.FileManager(args.file)
        df = f.parse_json_dataframe()
        if not df.empty:
            dataset = pr.DataProcessing(df)
            dataset.histogram_browsers_a()
    elif args.task == "3b":
        f = fm.FileManager(args.file)
        df = f.parse_json_dataframe()
        if not df.empty:
            dataset = pr.DataProcessing(df)
            dataset.histogram_browsers_b()
    elif args.task == "4":
        f = fm.FileManager(args.file)
        df = f.parse_json_dataframe()
        if not df.empty:
            dataset = pr.DataProcessing(df)
            output = dataset.visitor_readtimes()
            print("Reader(s): | Total readtime(s): ")
            print("------------------------------------------------")
            for k, v in output.items():
                print('%s | %s' % (k, v))
    elif args.task == "5":
        f = fm.FileManager(args.file)
        df = f.parse_json_dataframe()
        if not df.empty:
            dataset = pr.DataProcessing(df)
            readers, output = dataset.run_task_5(args.docid, args.userid)
            print("Relevant readers for the document:")
            print("Reader(s) ")
            print("-----------")
            for reader in readers:
                print("%s |" % reader[-4:])
            print("")
            print("Top 10 most read (also-like) documents: ")
            print("Document(s) | Times Read")
            print("----------------------------")
            for documents, count in output.items():
                if documents[-4:] == args.docid[-4:]:
                    print("%s (*) | %s" % (documents[-4:], count))
                else:
                    print("%s | %s" % (documents[-4:], count))
            print("Where (*) is the input document.")
    elif args.task == "6":
        f = fm.FileManager(args.file)
        dataset = pr.DataProcessing(f.parse_json_dataframe())
        dataset.run_task_6(args.docid, args.userid)
    else:
        return "No conditions set"
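# A hypothetical argparse front end that would produce the args object this
# dispatcher expects. The attribute names task/file/docid/userid come from the
# code above; the flags and defaults are invented.
import argparse

parser = argparse.ArgumentParser(description="Data analysis task runner")
parser.add_argument("-t", "--task", required=True,
                    help="task id: 2a, 2b, 3a, 3b, 4, 5 or 6")
parser.add_argument("-f", "--file", required=True,
                    help="path to the JSON dataset")
parser.add_argument("-d", "--docid", default=None, help="subject document UUID")
parser.add_argument("-u", "--userid", default=None, help="visitor UUID")

if __name__ == "__main__":
    run(parser.parse_args())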
def __init__(self):
    self.a = Q1.DataProcessing()
    # sklearn may not need a 1 added as the first column!
    standard = self.a.standardForm()  # compute once instead of three times
    self.traingSet = standard[0]
    self.testSet = standard[1]
    self.validationSet = standard[2]