def load_data(self):
        """Ask for a data directory, extract its simulations and refresh the UI.

        Steps performed:

        1. Show a directory chooser. If the user cancels, nothing else
           happens and the currently loaded data is left untouched.
        2. Run a ``DataExtractor`` over the chosen directory, using the
           current values of ``best_nest_var`` and ``max_sim_time_var``.
        3. Store the extracted data set on ``self.data_set`` and build a
           fresh ``DataPlotter`` for it on ``self.data_plot``.
        4. Report the two values returned by ``extract_data()`` in an
           info dialog.
        5. Refill the list box with one ``key=value, ...`` line per entry,
           using alternating background colours.
        6. Rebuild the "split on" option menu from the keys of the first
           entry, and enable the add button.

        When the extracted data set is empty the option menu is reset to
        the single ``'none'`` entry and the add button is not enabled,
        instead of failing with ``IndexError``.
        """
        directory_path = filedialog.askdirectory(
            initialdir=os.getcwd(),
            mustexist=True,
            title="Please select the data directory...")

        # The chooser yields an empty value when the dialog is dismissed;
        # there is nothing to extract in that case.
        if not directory_path:
            return

        extractor = DataExtractor(directory_path, self.best_nest_var.get(),
                                  self.max_sim_time_var.get())
        invalid_files, unfinished_sims = extractor.extract_data()

        self.data_set = extractor.data_set
        self.data_plot = DataPlotter(self.data_set)
        msg_string = "%s simulations had missing or blank files.\n" % invalid_files
        msg_string += "%s simulations exceeded than the maximum time and so were removed." % unfinished_sims
        messagebox.showinfo('Data Loaded', msg_string)

        # Drop the rows of any previously loaded data set.
        self.list_box.delete(0, tk.END)

        for row_number, data in enumerate(self.data_set, start=1):
            row_text = ", ".join(
                "%s=%s" % (key, value) for key, value in data.items())
            self.list_box.insert(tk.END, row_text)
            # Zebra striping: even rows darker, odd rows lighter.
            row_colour = '#e0e0e0' if row_number % 2 == 0 else '#f4f4f4'
            self.list_box.itemconfig(tk.END, bg=row_colour)

        # Updating the list of options to split the data by
        menu = self.split_options["menu"]
        menu.delete(0, "end")
        menu.add_command(label='none',
                         command=lambda: self.split_on_var.set('none'))

        # Without any entries there are no keys to offer and nothing to add.
        if len(self.data_set) == 0:
            return

        for string in self.data_set[0].keys():
            # Bind the current key as a default argument so each menu entry
            # sets its own value rather than the last one of the loop.
            menu.add_command(
                label=string,
                command=lambda option=string: self.split_on_var.set(option))

        self.add_button.config(state=tk.ACTIVE)
Beispiel #2
0
            ])))


# Script entry: load the raw training text, turn it into a DataFrame and run
# the analysis / classification steps defined elsewhere in this file.
logging.basicConfig(level='INFO')
logger = logging.getLogger("Logger")

data_extractor = DataExtractor(logger)
matrix_extractor = MatrixExtractor()

# Read the whole training file; the context manager guarantees the handle is
# closed even if reading fails (the previous version never closed it).
with open("train_data.txt", "r") as f:
    data = f.read()
# NOTE: with the root level set to INFO above, this DEBUG record is not
# emitted unless the logging level is lowered.
logger.debug(data)

columns = ['subject', 'content', 'category']
text_columns = ['subject', 'content']
extracted_data = data_extractor.extract_data(data, text_columns)

df = pd.DataFrame(extracted_data, columns=columns)

category_plot()
print_common_unigrams_bigrams()
# Pass True to rebalance the (training set) unbalanced data seen in the
# category histogram; False leaves the class distribution as is.
split_train_test(False)
run_naive_bayes()

print('\n**')
print('df sample:')
print('***')
print(df.sample())