def test_write_to_db():
    """Round-trip check: clean the raw CSVs, persist them, read them back.

    Loads and cleans the disaster-response data, writes it out via
    save_data, then reads the 'MessageClass' table back from the sqlite
    database and asserts the expected frame shape.
    """
    merged = load_data('disaster_messages.csv', 'disaster_categories.csv')
    cleaned = clean_data(merged)
    # NOTE(review): the full sqlite URL is passed to save_data here, while the
    # sibling test hands save_data a bare path and builds the URL only for the
    # engine — confirm which form save_data expects.
    db_url = 'sqlite:///DisasterResponsetest.db'
    save_data(cleaned, db_url)
    # load data from database (pandas accepts a database URI string here)
    roundtrip = pd.read_sql_table('MessageClass', db_url)
    assert roundtrip.shape == (26216, 40)
def test_save_data(self):
    """Does save_data persist the data in the database ?"""
    # Arrange: build a cleaned one-row frame from the unit-test fixtures.
    raw = load_data("unittest_disaster_messages.csv",
                    "unittest_disaster_categories.csv")
    cleaned = clean_data(raw)

    # Act: persist into the unit-test database.
    save_data(cleaned, self.UNIT_TEST_DB)

    # Assert: the 'Messages' table reads back non-empty with the
    # expected 1 row x 40 columns.
    engine = create_engine('sqlite:///' + self.UNIT_TEST_DB)
    stored = pd.read_sql_table('Messages', engine)
    self.assertIsNotNone(stored)
    self.assertEqual(stored.shape, (1, 40))
pca_data = pd.DataFrame(pca_data, columns=['comp' + str(i) for i in range(ncols)]) # Pull relevant data pca_meta = pd.DataFrame() pca_meta['explained_variance'] = pca.explained_variance_ pca_meta['explained_variance_ratio'] = pca.explained_variance_ratio_ for i, xcol in enumerate(pc.X_cols): pca_meta[xcol + "_weight"] = pca.components_[:, i] for col in pc.Y_cols: pca_data[col] = data[col] # Return transform and relevant data return pca, pca_meta, pca_data if __name__ == "__main__": # Read file filename = pc.check_args(sys.argv) _, data = pc.process_metadata(filename) # Get PCA results _, pca_meta, pca_data = run(data) # Will add tolerance opt later # Save to file meta_savefile = pc.new_filename(filename, "pca_metadata") data_savefile = pc.new_filename(filename, "pca_components") pc.save_data(pca_meta, meta_savefile) pc.save_data(pca_data, data_savefile)
#import modules import process_data import pandas as pd #load data from csv files using process_data.py methods data = process_data.load_data('disaster_messages.csv', 'disaster_categories.csv') #creating a separate clean dataset to train model using the process_data.py method data_clean = process_data.clean_data(data) #saving a sqlite db for models using the processed data and process_data.py methods process_data.save_data(data_clean, 'emergency') def custom_clean_data(df): """Clean categories and merge to messages Args: df => DataFrame of merged categories and messages csv files Returns: df => Dataframe of cleaned categories and dropped duplicateds """ categories = pd.Series(df.categories).str.split(';', expand=True) row = categories.loc[0] category_colnames = row.apply(lambda x: x[:-2]).values categories.columns = category_colnames for column in categories: # set each value to be the last character of the string categories[column] = categories[column].apply(lambda x: x[-1:]).values
#import modules import process_data import pandas as pd #load data from csv files using process_data.py methods data = process_data.load_data('disaster_messages.csv', 'disaster_categories.csv') #creating a separate clean dataset to train model using the process_data.py method data_clean = process_data.clean_data(data) #saving a sqlite db for models using the processed data and process_data.py methods process_data.save_data(data_clean, 'emergency') def custom_clean_data(df): """Clean categories and merge to messages Args: df => DataFrame of merged categories and messages csv files Returns: df => Dataframe of cleaned categories and dropped duplicateds """ categories = pd.Series(df.categories).str.split(';', expand=True) row = categories.loc[0] category_colnames = row.apply(lambda x: x[:-2]).values categories.columns = category_colnames for column in categories: # set each value to be the last character of the string categories[column] = categories[column].apply(lambda x: x[-1:]).values # convert column from string to numeric categories[column] = categories[column].astype(int)
sys.exit(-1) # Get algorithm name alg_name = argv[2] if alg_name not in algorithms: print("Invalid algorithm name: " + alg_name) sys.exit(-1) grid_search = True if len(argv) == 4 and argv[3] == "grid" else False return filename, alg_name, grid_search ### Main program if __name__ == "__main__": # Read file filename, alg_name, grid_search = check_ML_args(sys.argv) _, data = pc.process_metadata(filename) # Get ML results new_data, metrics = run(data, alg_name, grid_search) grid_tag = "gridsearch" if grid_search else "ML" data_savefile = pc.new_filename(filename, grid_tag + "_" + alg_name) file_tag = os.path.basename(filename).split("_")[0] metrics_savefile = pc.new_filename( f"{file_tag}_{grid_tag}_{alg_name}_metrics.yaml", truncate=False) pc.save_data(new_data, data_savefile) pc.save_data(metrics, metrics_savefile)