Example 1
    def __init__(self):
        super().__init__()

        self.init_labels()
        self.init_textboxes()
        self.init_buttons()
        self.init_dropdowns()
        self.init_sublayouts()
        self.start_up()

        self.main_widget = QWidget()
        self.main_layout = QtWidgets.QGridLayout(self.main_widget)
        self.main_layout.setSizeConstraint(QtWidgets.QLayout.SetDefaultConstraint)

        self.main_layout.addLayout(self.panel_sublayout, 0, 1)
        self.main_layout.setColumnStretch(0, 1)

        self.canvas = self.init_graphs()
        self.main_layout.addWidget(self.canvas, 0, 0)

        self.main_widget.setLayout(self.main_layout)
        self.setCentralWidget(self.main_widget)

        self.setGeometry(50, 50, 1200, 700)
        self.setWindowTitle("Acconeer Exploration GUI")
        self.show()

        self.radar = data_processing.DataProcessing()
Example 2
    def __init__(self):
        self.dp = data_processing.DataProcessing()

        # columns: word, POS, label (the 3-column layout is commented out below)
        #self.train_table = np.empty([0, 3])
        self.train_table = np.empty([0, 1])
        self.test_table = np.empty([0, 1])
Example 3
    def get_bioms(self):
        """
        Keep the biomes whose presence (count > 2) falls in the middle band:
        between 40% and 60% of the 72 samples.
        :return: DataFrame restricted to the kept columns
        """
        #cwd = os.getcwd()
        #population_path = cwd + '/../Data/ASV_table.tsv'
        data = data_processing.DataProcessing()
        population_path = data.url_ASV

        pop_bioms = pd.read_csv(population_path,
                                delimiter=r'\s+',
                                encoding='utf-8')

        to_keep = []
        for i in pop_bioms.columns:
            c = 0
            for j in pop_bioms.get(i):
                if j > 2:
                    c += 1
            if 0.6 > c / 72 > 0.4:
                to_keep.append(i)

        to_drop = [x for x in pop_bioms.columns if x not in to_keep]
        pop_bioms = pop_bioms.drop(to_drop, axis=1)
        return pop_bioms
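The nested counting loop above can be collapsed into a vectorized filter. A minimal equivalent sketch, assuming pop_bioms is the DataFrame loaded above, with 72 numeric rows (one per sample):

    frac_present = (pop_bioms > 2).sum(axis=0) / len(pop_bioms)
    pop_bioms = pop_bioms.loc[:, (frac_present > 0.4) & (frac_present < 0.6)]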
Example 4
def execute_data_processing(count, app_version, launch_time_list):
    data_image = data_processing.DataProcessing(count, app_version,
                                                launch_time_list)
    data_image.get_data_visualization_image()
    data_report = data_image.get_data_report()
    recent_image_path = data_image.get_recent_image_path()
    recent_log_path = data_image.get_recent_log_path()
    return data_report, recent_image_path, recent_log_path
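A minimal usage sketch for the helper above; the argument values are hypothetical:

    report, image_path, log_path = execute_data_processing(
        count=10, app_version="1.2.3", launch_time_list=[1.2, 0.9, 1.1])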
Example 5
 def test_graph_600k(self):
     """
     Displays the graph for the 600k dataset's task 6 test parameters.
     """
     f = fm.FileManager(self.path_600)
     df = f.parse_json_dataframe()
     p = prc.DataProcessing(df)
     p.run_task_6(self.doc_id_600, self.user_id_600)
Example 6
 def test_format_time(self):
     """Tests if the time is formatted correctly
     We test the seconds converted by this tool.
     test data: 1234567800
     source tool: https://www.convert-me.com/en/convert/time/millisecond/millisecond-to-dhms.html?u=millisecond&v=1%2C234%2C567%2C800
     """
     f = fm.FileManager(file_path=self.path_100)
     df = f.parse_json_dataframe()
     p = prc.DataProcessing(df)
     actual = p._format_time(
         1234567800)  # 1234567800ms is 14 days 6 hours 56 minutes 7 seconds
     expected = "14d : 6h : 56m : 7s"
     self.assertEqual(actual, expected, "Should be %s" % expected)
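For reference, a formatter with this behavior can be sketched with divmod; this is an illustrative stand-in, not the actual _format_time implementation:

    def format_ms(ms):
        seconds, _ = divmod(ms, 1000)           # drop the sub-second remainder
        minutes, seconds = divmod(seconds, 60)
        hours, minutes = divmod(minutes, 60)
        days, hours = divmod(hours, 24)
        return "%dd : %dh : %dm : %ds" % (days, hours, minutes, seconds)

    format_ms(1234567800)  # -> '14d : 6h : 56m : 7s'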
Example 7
 def test_also_likes_readers_100k(self):
     """
     Test that the set of readers is as expected for a given document and visitor in the 100k dataset.
     """
     f = fm.FileManager(file_path=self.path_100)
     df = f.parse_json_dataframe()
     p = prc.DataProcessing(df)
     # get_relevant_readers returns the set of readers for a document and user id
     set_readers = p.get_relevant_readers(self.doc_id_100, self.user_id_100)
     set_expected = {
         '4108dc09bfe11a0c'
     }  # We expect only 1 reader based on the given test data
     self.assertEqual(set_readers, set_expected,
                      "Should be %s" % set_expected)
Example 8
 def test_also_likes_readers_600k(self):
     """
     Test that the set of readers is as expected for a given document and visitor in the 600k dataset.
     """
     f = fm.FileManager(self.path_600)
     df = f.parse_json_dataframe()
     p = prc.DataProcessing(df)
     # get_relevant_readers returns the set of readers for a document and user id
     set_readers = p.get_relevant_readers(self.doc_id_600, self.user_id_600)
     # We expect the following 4 readers based on the given test data.
     set_expected = {
         '383508ea93fd2fd1', '3f64bccfd160557e', '1f891eb0b573e42c',
         '7134a88f8b201d31'
     }
     self.assertEqual(set_readers, set_expected,
                      "Should be %s" % set_expected)
Example 9
    def read_data(self, pop_bool=True, metadata_bool=True):
        data = data_processing.DataProcessing()
        # Reading file into data frame
        #cwd = os.getcwd()
        population_path = data.url_ASV  #cwd + '/../Data/ASV_table.tsv'
        metadata_path = data.url_metadata  #cwd + '/../Data/Metadata_table.tsv'
        """
        df.columns (identifier)
        df.values (population size)
        
        population_size.shape -> (72, 14991)
        """
        population_size = pd.read_csv(population_path,
                                      delimiter=r'\s+',
                                      encoding='utf-8')

        if pop_bool:
            # find the non-zero bioms
            population_to_drop = [
                x for x in population_size.columns
                if population_size.get(x).min() == 0
            ]
            population_size = population_size.drop(population_to_drop, axis=1)
        """
        df.columns (properties)
        df.values (values)
        
        metadata.shape -> (71, 41)
        """
        metadata = pd.read_csv(metadata_path,
                               delimiter=r'\s+',
                               encoding='utf-8')

        # l = ["Latitude", "Longitude", "Altitude", "Area",
        if metadata_bool:
            l = [
                "Temperature", "Secchi", "O2", "CH4", "pH", "TIC", "SiO2",
                "KdPAR"
            ]

            toDrop = [x for x in metadata.columns if x not in l]
            metadata = metadata.drop(toDrop, axis=1)

        return population_size, metadata
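The zero-column drop in read_data can also be written as a single pandas selection; a behavior-equivalent sketch, assuming population_size holds the non-negative counts loaded above:

    population_size = population_size.loc[:, population_size.min(axis=0) != 0]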
Example 10
def select_file():
    """Lets the user select a file and checks if it is valid"""
    file_frame.filename = filedialog.askopenfilename(initialdir="/dataAnalysis/data", title="Select Dataset",
                                                     filetypes=FILE_TYPES)
    if file_frame.filename:
        f = fm.FileManager(file_frame.filename)
        if f.check_file_format():
            # Check if file selected is of JSON format
            global df
            # Load the dataframe; if it is empty, show a message box with the relevant error.
            df = f.parse_json_dataframe()
            if not df.empty:
                global dataset
                dataset = pr.DataProcessing(df)
                display_file_info(f)
            else:
                messagebox.showerror("Value Error",
                                     "The JSON file you are trying to load didn't contain valid dictionaries. Please try again")
        else:
            # Display message box in case file is incorrect format
            messagebox.showerror(title="Bad file format", message="Please load JSON file only.")
Example 11
 def test_also_likes_documents_100k(self):
     """
     Test that the set of documents is as expected for a given document and visitor in the 100k dataset.
     """
     f = fm.FileManager(file_path=self.path_100)
     df = f.parse_json_dataframe()
     p = prc.DataProcessing(df)
     # get_relevant_readers returns the set of readers for a document and user id
     set_readers = p.get_relevant_readers(self.doc_id_100, self.user_id_100)
     # get the documents that these readers like
     set_docs = p.get_documents(set_readers)
     # We expect only 4 documents based on the given test data
     set_expected = {
         '4108dc09bfe11a0c': {
             '100405170355-00000000ee4bfd24d2ff703b9147dd59',
             '100806162735-00000000115598650cb8b514246272b5',
             '100806172045-0000000081705fbea3553bd0d745b92f',
             '101122221951-00000000a695c340822e61891c8f14cf'
         }
     }
     self.assertEqual(set_docs, set_expected, "Should be %s" % set_expected)
Example 12
# Python library imports
import tkinter as tk
from tkinter import filedialog
from tkinter import messagebox
from tkinter import simpledialog

# Local library imports
import file_manager as fm
import data_processing as pr

df = None
dataset = pr.DataProcessing(df)

# Create a window on which all our widgets will be built
root = tk.Tk()
# Set the dimensions
root.geometry("940x600")
# Set the title of the window
root.title("Python Data Analysis App | Coursework 2")
# root.resizable(width=False, height=False)

# Defining rows in the grid
root.grid_rowconfigure(0, weight=2)
root.grid_rowconfigure(1, weight=1)
root.grid_rowconfigure(2, weight=16)
root.grid_rowconfigure(3, weight=10)

# Define columns in the grid
root.grid_columnconfigure(0, weight=6)
root.grid_columnconfigure(1, weight=4)
Example 13
 def __init__(self):
     self.a = Q1.DataProcessing()
     # the first column does not need a column of 1s (the intercept) prepended
     self.traingSet = self.a.standardForm()[0]
     self.testSet = self.a.standardForm()[1]
     self.validationSet = self.a.standardForm()[2]
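For context, the "add '1'" in the comment above refers to prepending an intercept column of ones to a design matrix; an illustrative sketch with a toy matrix:

    import numpy as np

    X = np.arange(6.0).reshape(3, 2)                   # toy design matrix, 3 samples x 2 features
    X_bias = np.hstack([np.ones((X.shape[0], 1)), X])  # prepend the intercept column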
Example 14
label_map = {
    1: "No entry",
    2: "No parking / waiting",
    3: "No turning",
    4: "Max Speed",
    5: "Other prohibition signs",
    6: "Warning",
    7: "Mandatory",
}
num_classes = 7
batch_size = args.batch_size


fdataset = tf.data.TFRecordDataset(TFRECORDS_FILE)
data_processor = data_processing.DataProcessing(400, 154)
label_encoder = m.LabelEncoder()
dataset = fdataset.map(data_processor.preprocess_data)
dataset = dataset.shuffle(8 * batch_size)
dataset = dataset.padded_batch(
    batch_size,
    padding_values=(0.0, 1e-8, tf.cast(-1, tf.int64)),
    drop_remainder=True,
)
dataset = dataset.map(
    label_encoder.encode_batch, num_parallel_calls=autotune
)
dataset = dataset.apply(tf.data.experimental.ignore_errors())
dataset = dataset.prefetch(autotune)

val_size = 500
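The snippet stops right where val_size would presumably be used; a minimal sketch of one common way to carve out the split with tf.data (the split itself is not in the original, so this is an assumption):

    val_batches = val_size // batch_size
    val_dataset = dataset.take(val_batches)   # dataset is already batched, so take() counts batches
    train_dataset = dataset.skip(val_batches)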
Example 15
    def Run(self, *args):
        # getting data for main problem
        dataproc = data_processing.DataProcessing()
        data = dataproc.GetMainData()
        # getting everything we need
        CDOM, CDOM_sorted, CDOM_diag_mesh, \
        ASV, ASV_ranged, \
        metadata, metadata_scaled, \
        X_ASV, y_CDOM = data
        #XGboost with scikitlearn - data with spatial component (BCC Bray distance by CDOM)
        X_CDOM = CDOM.loc[:, [
            "CDOM.x1", "CDOM.x2"
        ]]  #Molten meshgrid CDOM values for real data BCC Bray distances
        X_CDOM_diag_mesh = CDOM_diag_mesh.loc[:, [
            "CDOM.x1", "CDOM.x2"
        ]]  #Molten meshgrid CDOM values for generating predicted BCC Bray distances
        y_CDOM = CDOM.loc[:, "ASV.dist"]

        if self.paramData['type'] == 'ffnn_keras':
            # retrieving network data
            NNdata = self.PreProcessing()
            # passing parameter file
            print(self.paramData)
            '''
            Getting Network Architecture
            '''
            network = neural.NeuralNetwork(NNdata)
            # passing network architecture and create the model
            model = network.BuildModel()
            # training model
            model, history = network.TrainModel(
                model, self.X_train, self.X_test, self.Y_train_onehot,
                self.Y_test_onehot)  #self.X_norm, self.Y_onehot)
            test_loss, test_acc = model.evaluate(self.X_test,
                                                 self.Y_test_onehot)
            print('Test accuracy:', test_acc)

            # Plotting results
            self.funcs.PlotResultsKeras(history, self.paramData['type'],
                                        self.paramData['OutputPath'],
                                        self.paramData['epochs'],
                                        self.paramData['Optimization'],
                                        self.paramData['BatchSize'])

        elif self.paramData['type'] == 'snn_keras':
            # retrieving network data
            NNdata = self.PreProcessing()
            '''
            Getting Network Architecture
            '''
            network = neural.NeuralNetwork(NNdata)
            # passing network architecture and create the model
            model = network.BuildModel()
            # training model
            model, history = network.TrainModel(model, self.pairs_train,
                                                self.Y_train_onehot,
                                                self.pairs_test,
                                                self.Y_test_onehot)
            # Plotting results
            self.funcs.PlotResultsKeras(history, self.paramData['type'],
                                        self.paramData['OutputPath'],
                                        self.paramData['epochs'],
                                        self.paramData['Optimization'],
                                        self.paramData['BatchSize'])
        elif self.paramData['type'] == 'tnn_keras':
            # retrieving network data
            NNdata = self.PreProcessing()
            '''
            Getting Network Architecture
            '''
            network = neural.NeuralNetwork(NNdata)
            # passing network architecture and create the model
            model = network.BuildModel()
            # training model
            model, history = network.TrainModel(model, self.triplets_train,
                                                self.Y_train_onehot,
                                                self.triplets_test,
                                                self.Y_test_onehot)
            # Plotting results
            self.funcs.PlotResultsKeras(history, self.paramData['type'],
                                        self.paramData['OutputPath'],
                                        self.paramData['epochs'],
                                        self.paramData['Optimization'],
                                        self.paramData['BatchSize'])

        elif self.paramData['type'] == 'ffnn_manual':
            # Neural network with multiple layers - regression - BCC Bray distances by CDOM - original data
            X_CDOM = CDOM.loc[:, ["CDOM.x1", "CDOM.x2"]].to_numpy()
            y_CDOM = CDOM.loc[:, "ASV.dist"].to_numpy(
            )[:, np.newaxis]  #Original data
            '''
            NN_reg_original = neural.NeuralNetworkML(X_CDOM, y_CDOM,
                                                     trainingShare=0.80,
                                                     n_hidden_layers=3,
                                                     n_hidden_neurons=[2000, 1000, 500],
                                                     n_categories=1,
                                                     epochs=10, batch_size=10,
                                                     eta=1e-8,
                                                     lmbd=0, fixed_LR=False,
                                                     method="regression",
                                                     activation="sigmoid",
                                                     seed = self.paramData['RandomSeed'])
            '''
            n_hidden_neurons = []
            for layer in range(self.paramData['NHiddenLayers']):
                n_hidden_neurons.append(self.paramData['NHiddenNeurons'])
            for layer in range(1, self.paramData['NHiddenLayers'], 1):
                n_hidden_neurons[layer] = int(n_hidden_neurons[layer - 1] / 2)
            #print(n_hidden_neurons)

            NN_reg_original = neural.NeuralNetworkML(
                X_CDOM,
                y_CDOM,
                trainingShare=1 - self.paramData['TestSize'],
                n_hidden_layers=self.paramData['NHiddenLayers'],
                n_hidden_neurons=n_hidden_neurons,
                n_categories=1,
                epochs=self.paramData['epochs'],
                batch_size=self.paramData['BatchSize'],
                eta=self.paramData['alpha'],
                lmbd=0,
                fixed_LR=False,
                method="regression",
                activation="sigmoid",
                seed=self.paramData['RandomSeed'])

            NN_reg_original.train()
            # Plotting results
            self.funcs.PlotResultsManualFFNN(NN_reg_original, CDOM,
                                             self.paramData['type'],
                                             self.paramData['OutputPath'],
                                             self.paramData['epochs'],
                                             self.paramData['BatchSize'])
        elif self.paramData['type'] == 'xgb':
            X_train, X_test, y_train, y_test = train_test_split(
                X_CDOM,
                y_CDOM,
                train_size=1 - self.paramData['TestSize'],
                test_size=self.paramData['TestSize'],
                random_state=self.paramData['RandomSeed'])
            # initialising xgboosting
            xgboosting = xgb.XGBoosting()
            model = xgboosting.RunModel(X_train, X_test, y_train, y_test,
                                        X_CDOM, X_CDOM_diag_mesh, CDOM,
                                        CDOM_sorted,
                                        self.paramData['OutputPath'])
            #Get best model by test MSE
            XGboost_best_model_index = model.best_iteration
            XGboost_best_iteration = model.get_booster().best_ntree_limit
            MSE_per_epoch = model.evals_result()

            # make predictions for test data
            y_pred = model.predict(X_test, ntree_limit=XGboost_best_iteration)
            y_pred_train = model.predict(X_train)
            #predictions = [round(value) for value in y_pred]

            best_prediction = model.predict(X_CDOM,
                                            ntree_limit=XGboost_best_iteration)
            CDOM_pred = best_prediction.copy(
            )  #CDOM_pred.shape: (2556,) CDOM_pred are the predicted BCC Bray distances for CDOM value pairs
            CDOM_pred_fine_mesh = model.predict(
                X_CDOM_diag_mesh, ntree_limit=XGboost_best_iteration)
            '''
            y_pred,\
            y_pred_train,\
            MSE_per_epoch,\
            CDOM_pred, \
            CDOM_pred_fine_mesh, \
            XGboost_best_model_index = xgboosting.RunModel(X_train, X_test,
                                                        y_train, y_test,
                                                        X_CDOM, X_CDOM_diag_mesh,
                                                        CDOM, CDOM_sorted,
                                                        self.paramData['OutputPath'])
            '''
            # plotting 3d plots and mse for XGBoost
            self.funcs.PlotResultsXGBoost(CDOM, CDOM_sorted, X_CDOM_diag_mesh,
                                          CDOM_pred_fine_mesh, CDOM_pred,
                                          self.paramData['OutputPath'], y_pred,
                                          y_pred_train, MSE_per_epoch, y_train,
                                          y_test, XGboost_best_model_index)
        elif self.paramData['type'] == 'rf_main':
            rf = random_forest.RandomForest()
            # Laurent
            population_size, metadata = rf.read_data(False, False)
            predictions, test_y, ML_ = rf.prepare_data(
                population_size, metadata, self.paramData['TestSize'],
                self.paramData['RandomSeed'])
            all_predictions = rf.predict_all_metadata(population_size,
                                                      metadata, ML_)

            # we will compare the outcome with xgboost
            def MergeTable(var_list, metadata_variables):
                table = pd.DataFrame(np.concatenate((var_list), axis=1))
                table.columns = metadata_variables
                return table

            def PredictMetadata(ASV_table, metadata_variables, train_size,
                                test_size, seed):
                X_ASV = ASV_table
                X_ASV.columns = [''] * len(X_ASV.columns)
                X_ASV = X_ASV.to_numpy()
                metadata_list = []
                for i in metadata_variables:
                    #y_CDOM = metadata.loc[:, i][:, np.newaxis]

                    # split data into train and test sets
                    y_meta = metadata.loc[:, i]  #Requires 1d array
                    X_train, X_test, y_train, y_test = train_test_split(
                        X_ASV,
                        y_meta,
                        train_size=train_size,
                        test_size=test_size,
                        random_state=seed)

                    # fit model on training data
                    model = XGBRegressor(objective='reg:squarederror')
                    model.fit(X_train,
                              y_train,
                              eval_set=[(X_train, y_train), (X_test, y_test)],
                              eval_metric='rmse',
                              early_stopping_rounds=100,
                              verbose=False)

                    #Get best model by test MSE
                    XGboost_best_model_index = model.best_iteration
                    XGboost_best_iteration = model.get_booster(
                    ).best_ntree_limit

                    # make predictions for full dataset
                    y_pred = model.predict(X_ASV,
                                           ntree_limit=XGboost_best_iteration)
                    metadata_list.append(y_pred[:, np.newaxis])
                return MergeTable(metadata_list, metadata_variables)

            var_list = [
                "Latitude", "Longitude", "Altitude", "Area", "Depth",
                "Temperature", "Secchi", "O2", "CH4", "pH", "TIC", "SiO2",
                "KdPAR"
            ]
            train_size = 1 - self.paramData['TestSize']
            test_size = self.paramData['TestSize']
            seed = self.paramData['RandomSeed']
            predicted_metadata = PredictMetadata(ASV, var_list, train_size,
                                                 test_size, seed)

            with pd.option_context('display.max_rows', None,
                                   'display.max_columns',
                                   None):  # more options can be specified also
                print(predicted_metadata)

        elif self.paramData['type'] == 'rf_side':
            # retrieving network data
            NNdata = self.PreProcessing()
            rf = random_forest.RandomForest()
            seed = self.paramData['RandomSeed']
            clfs, scores_test, scores_train = rf.predict_t(
                self.X_train, self.X_test, self.y_train_l, self.y_test_l, seed)

        elif self.paramData['type'] == 'all':
            '''
            Neural Network
            '''
            # Neural network with multiple layers - regression - BCC Bray distances by CDOM - original data
            X_CDOM = CDOM.loc[:, ["CDOM.x1", "CDOM.x2"]].to_numpy()
            y_CDOM = CDOM.loc[:, "ASV.dist"].to_numpy(
            )[:, np.newaxis]  #Original data

            n_hidden_neurons = []
            for layer in range(self.paramData['NHiddenLayers']):
                n_hidden_neurons.append(self.paramData['NHiddenNeurons'])
            for layer in range(1, self.paramData['NHiddenLayers'], 1):
                n_hidden_neurons[layer] = int(n_hidden_neurons[layer - 1] / 2)
            #print(n_hidden_neurons)

            NN_reg_original = neural.NeuralNetworkML(
                X_CDOM,
                y_CDOM,
                trainingShare=1 - self.paramData['TestSize'],
                n_hidden_layers=self.paramData['NHiddenLayers'],
                n_hidden_neurons=n_hidden_neurons,
                n_categories=1,
                epochs=self.paramData['epochs'],
                batch_size=self.paramData['BatchSize'],
                eta=self.paramData['alpha'],
                lmbd=0,
                fixed_LR=False,
                method="regression",
                activation="sigmoid",
                seed=self.paramData['RandomSeed'])

            NN_reg_original.train()

            x_mesh = np.log10(
                np.arange(min(CDOM.loc[:, "CDOM.x1"]),
                          max(CDOM.loc[:, "CDOM.x2"]) + 0.01, 0.01)) + 1
            y_mesh = x_mesh.copy()
            x_mesh, y_mesh = np.meshgrid(x_mesh, y_mesh)
            X_CDOM_mesh = self.funcs.pdCat(
                x_mesh.ravel()[:, np.newaxis],
                y_mesh.ravel()[:, np.newaxis]).to_numpy()
            best_prediction = NN_reg_original.model_prediction(
                X_CDOM_mesh,
                NN_reg_original.accuracy_list.index(
                    min(NN_reg_original.accuracy_list)))

            x_mesh = np.arange(min(CDOM.loc[:, "CDOM.x1"]),
                               max(CDOM.loc[:, "CDOM.x2"]) + 0.01, 0.01)
            y_mesh = x_mesh.copy()
            x_mesh, y_mesh = np.meshgrid(x_mesh, y_mesh)

            ff_pred_original = best_prediction.copy()
            ff_pred_original = np.reshape(ff_pred_original, (363, 363))
            ff_pred_original[x_mesh - y_mesh == 0] = np.nan
            ff_pred_original[x_mesh > y_mesh] = np.nan
            '''
            XGBoost part
            '''
            X_CDOM = CDOM.loc[:, [
                "CDOM.x1", "CDOM.x2"
            ]]  #Molten meshgrid CDOM values for real data BCC Bray distances
            X_CDOM_diag_mesh = CDOM_diag_mesh.loc[:, [
                "CDOM.x1", "CDOM.x2"
            ]]  #Molten meshgrid CDOM values for generating predicted BCC Bray distances
            y_CDOM = CDOM.loc[:, "ASV.dist"]

            X_train, X_test, y_train, y_test = train_test_split(
                X_CDOM,
                y_CDOM,
                train_size=1 - self.paramData['TestSize'],
                test_size=self.paramData['TestSize'],
                random_state=self.paramData['RandomSeed'])
            # initialising xgboosting
            xgboosting = xgb.XGBoosting()
            model = xgboosting.RunModel(X_train, X_test, y_train, y_test,
                                        X_CDOM, X_CDOM_diag_mesh, CDOM,
                                        CDOM_sorted,
                                        self.paramData['OutputPath'])

            #Get best model by test MSE
            XGboost_best_model_index = model.best_iteration
            XGboost_best_iteration = model.get_booster().best_ntree_limit
            MSE_per_epoch = model.evals_result()

            # make predictions for test data
            y_pred = model.predict(X_test, ntree_limit=XGboost_best_iteration)
            y_pred_train = model.predict(X_train)
            #predictions = [round(value) for value in y_pred]

            best_prediction = model.predict(X_CDOM,
                                            ntree_limit=XGboost_best_iteration)
            CDOM_pred = best_prediction.copy(
            )  #CDOM_pred.shape: (2556,) CDOM_pred are the predicted BCC Bray distances for CDOM value pairs
            CDOM_pred_fine_mesh = model.predict(
                X_CDOM_diag_mesh, ntree_limit=XGboost_best_iteration)
            '''
            Simple OLS - generating design matrix out of data set etc.
            '''
            reg = regression.Regression()
            X_mesh = reg.GenerateMesh(
                0.21, 3.83, 0.21, 3.83, 0.01, 0.01, log_transform=True
            )  # The low number of points on the higher end of the gradient causes distortions for linear regression
            X_mesh_degree_list = reg.DesignMatrixList(X_mesh[0], X_mesh[1],
                                                      12)[1:]
            X_degree_list = reg.DesignMatrixList(CDOM.loc[:, "CDOM.x1"],
                                                 CDOM.loc[:,
                                                          "CDOM.x2"], 12)[1:]
            X_degree_list_subset = []

            #z = CDOM_pred  #XGboost-predicted values (dead alternative; immediately overwritten below)
            z = CDOM.loc[:, "ASV.dist"]  #Original data
            #ebv_no_resampling = reg.generate_error_bias_variance_without_resampling(X_degree_list, 1)
            #ebv_resampling = reg.generate_error_bias_variance_with_resampling(X_degree_list, 1, 100)
            #reg.ebv_by_model_complexity(ebv_resampling)
            #reg.training_vs_test(ebv_no_resampling)

            CDOM_pred_reg = X_mesh_degree_list[8] @ reg.beta_SVD(
                X_degree_list[8], CDOM_pred)
            #print(pd.DataFrame(X_mesh_degree_list[1]))
            #print(CDOM_pred_reg)
            #with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
            #  print(pd.DataFrame(CDOM_pred_reg))

            x_mesh_reg = np.arange(min(CDOM.loc[:, "CDOM.x1"]),
                                   max(CDOM.loc[:, "CDOM.x2"]) + 0.01, 0.01)
            y_mesh_reg = x_mesh_reg.copy()
            x_mesh_reg, y_mesh_reg = np.meshgrid(x_mesh_reg, y_mesh_reg)
            X_CDOM_mesh = self.funcs.pdCat(x_mesh_reg.ravel()[:, np.newaxis],
                                           y_mesh_reg.ravel()[:, np.newaxis])
            #print(pd.DataFrame(X_CDOM_mesh))
            #print("CDOM_pred_reg.shape", CDOM_pred_reg.shape)
            z_CDOM_mesh_pred = np.reshape(
                CDOM_pred_reg, (x_mesh_reg.shape[0], x_mesh_reg.shape[0]))
            z_CDOM_mesh_pred[x_mesh_reg - y_mesh_reg == 0] = np.nan
            z_CDOM_mesh_pred[x_mesh_reg > y_mesh_reg] = np.nan
            '''
            Neural Network with data from XGBoost
            '''
            X_CDOM = CDOM.loc[:, ["CDOM.x1", "CDOM.x2"]].to_numpy()
            y_CDOM = CDOM_pred[:, np.newaxis]  #Predicted data from XGboost

            n_hidden_neurons = []
            for layer in range(self.paramData['NHiddenLayers']):
                n_hidden_neurons.append(self.paramData['NHiddenNeurons'])
            for layer in range(1, self.paramData['NHiddenLayers'], 1):
                n_hidden_neurons[layer] = int(n_hidden_neurons[layer - 1] / 2)
            #print(n_hidden_neurons)

            NN_reg = neural.NeuralNetworkML(
                X_CDOM,
                y_CDOM,
                trainingShare=1 - self.paramData['TestSize'],
                n_hidden_layers=self.paramData['NHiddenLayers'],
                n_hidden_neurons=n_hidden_neurons,
                n_categories=1,
                epochs=self.paramData['epochs'],
                batch_size=self.paramData['BatchSize'],
                eta=self.paramData['alpha'],
                lmbd=0,
                fixed_LR=False,
                method="regression",
                activation="sigmoid",
                seed=self.paramData['RandomSeed'])

            NN_reg.train()
            test_predict = NN_reg.predict(NN_reg.XTest)
            print(NN_reg.accuracy_list)

            #Use log-transformed CDOM values for creating design matrix, then plot on original values
            x_mesh = np.log10(
                np.arange(min(CDOM.loc[:, "CDOM.x1"]),
                          max(CDOM.loc[:, "CDOM.x2"]) + 0.01, 0.01)) + 1
            y_mesh = x_mesh.copy()
            x_mesh, y_mesh = np.meshgrid(x_mesh, y_mesh)
            X_CDOM_mesh = self.funcs.pdCat(
                x_mesh.ravel()[:, np.newaxis],
                y_mesh.ravel()[:, np.newaxis]).to_numpy()
            best_prediction = NN_reg.model_prediction(
                X_CDOM_mesh,
                NN_reg.accuracy_list.index(min(NN_reg.accuracy_list)))

            x_mesh = np.arange(min(CDOM.loc[:, "CDOM.x1"]),
                               max(CDOM.loc[:, "CDOM.x2"]) + 0.01, 0.01)
            y_mesh = x_mesh.copy()
            x_mesh, y_mesh = np.meshgrid(x_mesh, y_mesh)

            ff_pred = best_prediction.copy()
            ff_pred = np.reshape(ff_pred, (363, 363))
            ff_pred[x_mesh - y_mesh == 0] = np.nan
            ff_pred[x_mesh > y_mesh] = np.nan
            '''
            Plotting 3d graphs for all data
            '''
            fontsize = 6
            #Compare raw data to XGboost, neural network predicted data and XGboost predicted data smoothed with neural network
            fig = plt.figure(figsize=plt.figaspect(0.5))
            ax = fig.add_subplot(2, 3, 1, projection='3d')
            ax.set_title("BCC Bray distances by sites' DOM", fontsize=fontsize)
            #plt.subplots_adjust(left=0, bottom=0, right=2, top=2, wspace=0, hspace=0)
            ax.view_init(elev=30.0, azim=300.0)
            surf = ax.plot_trisurf(CDOM.loc[:, "CDOM.x1"],
                                   CDOM.loc[:, "CDOM.x2"],
                                   CDOM.loc[:, "ASV.dist"],
                                   cmap='viridis',
                                   edgecolor='none')
            # Customize the z axis.
            ax.set_zlim(0.3, 1)
            ax.zaxis.set_major_locator(LinearLocator(10))
            ax.zaxis.set_major_formatter(FormatStrFormatter("%.02f"))
            ax.tick_params(labelsize=8)
            ax.set_zlabel(zlabel="Bray distance")
            ax.set_ylabel(ylabel="DOM site 2")
            ax.set_xlabel(xlabel="DOM site 1")

            # Set up the axes for the second plot
            ax = fig.add_subplot(2, 3, 2, projection='3d')
            #ax.set_title("XGboost-Predicted BCC Bray distances by sites' CDOM, dataset CDOM coordinates", fontsize=8)
            ax.set_title(
                "XGboost-Predicted BCC \n Bray distances by sites' DOM",
                fontsize=fontsize)
            ax.view_init(elev=30.0, azim=300.0)

            # Plot the surface.
            ax.plot_trisurf(
                CDOM.loc[:, "CDOM.x1"],
                CDOM.loc[:, "CDOM.x2"],
                CDOM_pred,  #197109 datapoints
                cmap='viridis',
                edgecolor='none')

            # Customize the z axis.
            z_range = (np.nanmax(CDOM_pred) - np.nanmin(CDOM_pred))
            ax.set_zlim(np.nanmin(CDOM_pred) - z_range, 1)
            ax.zaxis.set_major_locator(LinearLocator(10))
            ax.zaxis.set_major_formatter(FormatStrFormatter("%.02f"))
            ax.tick_params(labelsize=8)
            ax.set_zlabel(zlabel="Bray distance")
            ax.set_ylabel(ylabel="DOM site 2")
            ax.set_xlabel(xlabel="DOM site 1")

            # Set up the axes for the third plot
            ax = fig.add_subplot(2, 3, 3, projection='3d')
            #ax.set_title("OLS (SVD) regression-predicted BCC Bray distances by sites' CDOM, CDOM 0.01 step meshgrid", fontsize=6)
            ax.set_title(
                "OLS (SVD) regression-predicted \n BCC Bray distances by sites' DOM",
                fontsize=fontsize)
            ax.view_init(elev=30.0, azim=300.0)

            # Plot the surface.
            ax.plot_trisurf(
                x_mesh_reg.ravel(),
                y_mesh_reg.ravel(),
                z_CDOM_mesh_pred.ravel(),
                cmap='viridis',  #197109 datapoints
                vmin=np.nanmin(z_CDOM_mesh_pred),
                vmax=np.nanmax(z_CDOM_mesh_pred),
                edgecolor='none')

            # Customize the z axis.
            z_range = (np.nanmax(z_CDOM_mesh_pred) -
                       np.nanmin(z_CDOM_mesh_pred))
            ax.set_zlim(np.nanmin(z_CDOM_mesh_pred) - z_range, 1)
            ax.zaxis.set_major_locator(LinearLocator(10))
            ax.zaxis.set_major_formatter(FormatStrFormatter("%.02f"))
            ax.tick_params(labelsize=8)
            ax.set_zlabel(zlabel="Bray distance")
            ax.set_ylabel(ylabel="DOM site 2")
            ax.set_xlabel(xlabel="DOM site 1")

            # Set up the axes for the fourth plot
            ax = fig.add_subplot(2, 3, 4, projection='3d')
            #ax.set_title("NN-smoothed XGboost-predicted BCC Bray distances by sites' CDOM, CDOM 0.01 step meshgrid", fontsize=6)
            ax.set_title(
                "NN-smoothed XGboost-predicted \n BCC Bray distances by sites' DOM",
                fontsize=fontsize)
            ax.view_init(elev=30.0, azim=300.0)

            # Plot the surface.
            ax.plot_trisurf(
                x_mesh.ravel(),
                y_mesh.ravel(),
                ff_pred.ravel(),  #197109 datapoints
                cmap='viridis',
                edgecolor='none',
                vmin=np.nanmin(ff_pred),
                vmax=np.nanmax(ff_pred))

            # Customize the z axis.
            z_range = (np.nanmax(ff_pred) - np.nanmin(ff_pred))
            ax.set_zlim(np.nanmin(ff_pred) - z_range, 1)
            ax.zaxis.set_major_locator(LinearLocator(10))
            ax.zaxis.set_major_formatter(FormatStrFormatter("%.02f"))
            ax.tick_params(labelsize=8)
            ax.set_zlabel(zlabel="Bray distance")
            ax.set_ylabel(ylabel="DOM site 2")
            ax.set_xlabel(xlabel="DOM site 1")

            # Set up the axes for the fifth plot
            ax = fig.add_subplot(2, 3, 5, projection='3d')
            #ax.set_title("NN-predicted BCC Bray distances by sites' CDOM, CDOM 0.01 step meshgrid", fontsize=8)
            ax.set_title("NN-predicted BCC Bray \n distances by sites' DOM",
                         fontsize=fontsize)
            ax.view_init(elev=30.0, azim=300.0)

            # Plot the surface.
            ax.plot_trisurf(
                x_mesh.ravel(),
                y_mesh.ravel(),
                ff_pred_original.ravel(),  #197109 datapoints
                cmap='viridis',
                edgecolor='none',
                vmin=np.nanmin(ff_pred_original),
                vmax=np.nanmax(ff_pred_original))

            # Customize the z axis.
            z_range = (np.nanmax(ff_pred_original) -
                       np.nanmin(ff_pred_original))
            ax.set_zlim(np.nanmin(ff_pred_original) - z_range, 1)
            ax.zaxis.set_major_locator(LinearLocator(10))
            ax.zaxis.set_major_formatter(FormatStrFormatter("%.02f"))
            ax.tick_params(labelsize=8)
            ax.set_zlabel(zlabel="Bray distance")
            ax.set_ylabel(ylabel="DOM site 2")
            ax.set_xlabel(xlabel="DOM site 1")

            # Set up the axes for the sixth plot
            ax = fig.add_subplot(2, 3, 6, projection='3d')
            #ax.set_title("XGboost-predicted BCC Bray distances by sites' CDOM, CDOM 0.01 step meshgrid", fontsize=8)
            ax.set_title(
                "XGboost-predicted BCC Bray \n distances by sites' DOM",
                fontsize=fontsize)
            ax.view_init(elev=30.0, azim=300.0)

            # Plot the surface.
            ax.plot_trisurf(
                X_CDOM_diag_mesh.loc[:, "CDOM.x1"],
                X_CDOM_diag_mesh.loc[:, "CDOM.x2"],
                CDOM_pred_fine_mesh,  #197109 datapoints
                cmap='viridis',
                edgecolor='none')

            # Customize the z axis.
            z_range = (np.nanmax(CDOM_pred_fine_mesh) -
                       np.nanmin(CDOM_pred_fine_mesh))
            ax.set_zlim(np.nanmin(CDOM_pred_fine_mesh) - z_range, 1)
            ax.zaxis.set_major_locator(LinearLocator(10))
            ax.zaxis.set_major_formatter(FormatStrFormatter("%.02f"))
            ax.tick_params(labelsize=8)
            ax.set_zlabel(zlabel="Bray distance")
            ax.set_ylabel(ylabel="DOM site 2")
            ax.set_xlabel(xlabel="DOM site 1")

            #filename = self.paramData['OutputPath']
            filename = self.paramData[
                'OutputPath'] + '/' + 'everything_3d' + '.png'
            fig.savefig(filename)

            plt.show()
Example 16
 def test_also_likes_documents_600k(self):
     """
     Test that the set of documents is as expected for a given document and visitor in the 600k dataset.
     """
     f = fm.FileManager(self.path_600)
     df = f.parse_json_dataframe()
     p = prc.DataProcessing(df)
     # get_relevant_readers returns the set of readers for a document and user id
     set_readers = p.get_relevant_readers(self.doc_id_600, self.user_id_600)
     # get the documents that these readers like
     set_docs = p.get_documents(set_readers)
     # We expect documents from the following 4 readers based on the given test data.
     set_expected = {
         '1f891eb0b573e42c': {
             '130308221433-09f8d746cb5e46f79842433817ffa908',
             '130322204045-7e140c31b4df4b8da1b0d4a410620ad1',
             '130406004921-f9e3072c82364ccfba25da4bc8be3b04',
             '130412203635-288742d148524251b4ef59dfaa222008',
             '130412215325-b2802be64be04a86b8c67acede394982',
             '130517181940-3f89e9f4524d4e769c205ed6f1b0e7ae',
             '130601015527-c1e2993d8290975e7ef350f078134390',
             '130626002918-2e934fcf5642becffed4c4325fcfa6d8',
             '130813183014-f447fd9c4d6abcdfb20e8f0d925c63fd',
             '130828160643-3f7e01676f04a2f60d02f80fcbd702e1',
             '130829034400-ae346135ab80c636d6d7b4c0f7960c41',
             '130829155547-4da063e3c66df0bc6149aced2abc3720',
             '130930182254-898ec9d4d3724afb31b1168517d4228a',
             '131004224723-076660492fa2c66e5398e3dde8890d73',
             '131022215916-907a48e13645fa9a81860efd03e85352',
             '140207031738-eb742a5444c9b73df2d1ec9bff15dae9'
         },
         '383508ea93fd2fd1': {
             '130412203635-288742d148524251b4ef59dfaa222008',
             '130412215325-b2802be64be04a86b8c67acede394982',
             '130601015527-c1e2993d8290975e7ef350f078134390',
             '130828160643-3f7e01676f04a2f60d02f80fcbd702e1',
             '130930182254-898ec9d4d3724afb31b1168517d4228a',
             '131022215916-907a48e13645fa9a81860efd03e85352',
             '140207031738-eb742a5444c9b73df2d1ec9bff15dae9'
         },
         '3f64bccfd160557e': {
             '130406004921-f9e3072c82364ccfba25da4bc8be3b04',
             '130601015527-c1e2993d8290975e7ef350f078134390',
             '130626002918-2e934fcf5642becffed4c4325fcfa6d8',
             '130813183014-f447fd9c4d6abcdfb20e8f0d925c63fd',
             '130828160643-3f7e01676f04a2f60d02f80fcbd702e1',
             '130829034400-ae346135ab80c636d6d7b4c0f7960c41',
             '131030220741-ce78b0b193120c40fd3916fb616b63ce',
             '140207031738-eb742a5444c9b73df2d1ec9bff15dae9'
         },
         '7134a88f8b201d31': {
             '130308221433-09f8d746cb5e46f79842433817ffa908',
             '130322204045-7e140c31b4df4b8da1b0d4a410620ad1',
             '130626002918-2e934fcf5642becffed4c4325fcfa6d8',
             '130813183014-f447fd9c4d6abcdfb20e8f0d925c63fd',
             '130829155547-4da063e3c66df0bc6149aced2abc3720',
             '130930182254-898ec9d4d3724afb31b1168517d4228a',
             '131022215916-907a48e13645fa9a81860efd03e85352',
             '131030220741-ce78b0b193120c40fd3916fb616b63ce',
             '140207031738-eb742a5444c9b73df2d1ec9bff15dae9'
         }
     }
     self.assertEqual(set_docs, set_expected, "Should be %s" % set_expected)
Example 17
def run(args):
    print("")  # Leave a gap
    print("Starting task %s" % args.task)
    # If the task value is 2a, then run
    if args.task == "2a":
        # Create file object
        f = fm.FileManager(args.file)
        # Create the dataframe
        df = f.parse_json_dataframe()
        # If the df was not empty, then run this
        if not df.empty:
            # Send it to dataprocessing,
            dataset = pr.DataProcessing(df)
            # Run the task for 2a.
            dataset.histogram_country(args.docid)
    # If the task value is 2b, then run
    elif args.task == "2b":
        f = fm.FileManager(args.file)
        df = f.parse_json_dataframe()
        if not df.empty:
            dataset = pr.DataProcessing(df)
            dataset.histogram_continent(args.docid)
    elif args.task == "3a":
        f = fm.FileManager(args.file)
        df = f.parse_json_dataframe()
        if not df.empty:
            dataset = pr.DataProcessing(df)
            dataset.histogram_browsers_a()
    elif args.task == "3b":
        f = fm.FileManager(args.file)
        df = f.parse_json_dataframe()
        if not df.empty:
            dataset = pr.DataProcessing(df)
            dataset.histogram_browsers_b()
    elif args.task == "4":
        f = fm.FileManager(args.file)
        df = f.parse_json_dataframe()
        if not df.empty:
            dataset = pr.DataProcessing(df)
            output = dataset.visitor_readtimes()
            print("Reader(s):        |  Total readtime(s): ")
            print("------------------------------------------------")
            for k, v in output.items():
                print('%s  |  %s' % (k, v))
    elif args.task == "5":
        f = fm.FileManager(args.file)
        df = f.parse_json_dataframe()
        if not df.empty:
            dataset = pr.DataProcessing(df)
            readers, output = dataset.run_task_5(args.docid, args.userid)
            print("Relevant readers for the document:")
            print("Reader(s)  ")
            print("-----------")
            for reader in readers:
                print("%s      |" % reader[-4:])
            print("")
            print("Top 10 most read (also-like) documents: ")
            print("Document(s)  |   Times Read")
            print("----------------------------")
            for documents, count in output.items():
                if documents[-4:] == args.docid[-4:]:
                    print("%s (*)     |   %s" % (documents[-4:], count))
                else:
                    print("%s         |   %s" % (documents[-4:], count))
            print("Where (*) is the input document.")
    elif args.task == "6":
        f = fm.FileManager(args.file)
        dataset = pr.DataProcessing(f.parse_json_dataframe())
        dataset.run_task_6(args.docid, args.userid)
    else:
        return "No conditions set"
Example 18
 def __init__(self):
     self.a = Q1.DataProcessing()
     # sklearn may not need a leading column of 1s (the intercept) added to the design matrix
     self.traingSet = self.a.standardForm()[0]
     self.testSet = self.a.standardForm()[1]
     self.validationSet = self.a.standardForm()[2]