def load_data(self):
    """Ask the user for a data directory, extract it and refresh the widgets.

    Steps, in order:

    1. Open a directory chooser starting at the current working directory.
       If the dialog is dismissed without a selection, nothing is changed.
    2. Build a ``DataExtractor`` from the chosen path and the current values
       of ``best_nest_var`` and ``max_sim_time_var``, run ``extract_data()``
       and store the resulting ``data_set`` plus a ``DataPlotter`` for it
       on ``self``.
    3. Show an info box reporting the two counts returned by
       ``extract_data()``.
    4. Refill ``list_box`` with one ``key=value, ...`` line per record,
       alternating the row background colour.
    5. Rebuild the "split on" option menu from the keys of the first record
       (always preceded by a ``none`` entry) and activate ``add_button``.

    When the extracted data set is empty, the option menu is reduced to the
    ``none`` entry and ``add_button`` is left in its current state.
    """
    directory_path = filedialog.askdirectory(
        initialdir=os.getcwd(),
        mustexist=True,
        title="Please select the data directory...")
    if not directory_path:
        # The chooser yields an empty value when cancelled; keep whatever
        # data is currently loaded instead of extracting from "nowhere".
        return

    extractor = DataExtractor(
        directory_path,
        self.best_nest_var.get(),
        self.max_sim_time_var.get())
    invalid_files, unfinished_sims = extractor.extract_data()
    self.data_set = extractor.data_set
    self.data_plot = DataPlotter(self.data_set)

    msg_string = "%s simulations had missing or blank files.\n" % invalid_files
    msg_string += "%s simulations exceeded than the maximum time and so were removed." % unfinished_sims
    messagebox.showinfo('Data Loaded', msg_string)

    # Replace the list box contents with the freshly loaded records.
    self.list_box.delete(0, tk.END)
    for grid_row, data in enumerate(self.data_set, start=1):
        raw_data_string = ", ".join(
            "%s=%s" % (key, value) for key, value in data.items())
        self.list_box.insert(tk.END, raw_data_string)
        # Zebra striping: even rows get the darker shade.
        row_colour = '#e0e0e0' if grid_row % 2 == 0 else '#f4f4f4'
        self.list_box.itemconfig(tk.END, bg=row_colour)

    # Updating the list of options to split the data by. The keys of the
    # first record are used; an empty data set simply offers no extra keys.
    first_record = next(iter(self.data_set), None)
    options = first_record.keys() if first_record is not None else ()
    menu = self.split_options["menu"]
    menu.delete(0, "end")
    menu.add_command(label='none',
                     command=lambda: self.split_on_var.set('none'))
    for string in options:
        # Bind the current key as a default argument so every menu entry
        # sets its own value rather than the last key of the loop.
        menu.add_command(
            label=string,
            command=lambda option=string: self.split_on_var.set(option))

    if first_record is not None:
        self.add_button.config(state=tk.ACTIVE)
]))) logging.basicConfig(level='INFO') logger = logging.getLogger("Logger") data_extractor = DataExtractor(logger) matrix_extractor = MatrixExtractor() f = open("train_data.txt", "r") data = f.read() logger.debug(data) columns = ['subject', 'content', 'category'] text_columns = ['subject', 'content'] extracted_data = data_extractor.extract_data(data, text_columns) df = pd.DataFrame(extracted_data, columns=columns) category_plot() print_common_unigrams_bigrams() split_train_test( False ) #True for rebalancing (training set) unbalanced data as seen in histogram run_naive_bayes() print('\n**') print('df sample:') print('***') print(df.sample())