import pandas as pd

# load_data, clean_data and save_data live in process_data.py (see the other snippets)
from process_data import load_data, clean_data, save_data


def test_write_to_db():
    df1 = load_data('disaster_messages.csv', 'disaster_categories.csv')
    df2 = clean_data(df1)

    database_filepath = 'sqlite:///DisasterResponsetest.db'
    save_data(df2, database_filepath)

    # load the data back from the database and check that nothing was lost
    df3 = pd.read_sql_table('MessageClass', database_filepath)
    assert df3.shape == (26216, 40)
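# save_data itself is not shown in these snippets; the sketch below is a
# hypothetical stand-in, assuming it writes the cleaned frame to the
# 'MessageClass' table that the test reads back, via a SQLAlchemy engine.
from sqlalchemy import create_engine


def save_data(df, database_filepath):
    """Hypothetical sketch: persist the cleaned dataframe to SQLite.

    database_filepath is expected to be a SQLAlchemy URL such as
    'sqlite:///DisasterResponsetest.db', matching the test above.
    """
    engine = create_engine(database_filepath)
    # 'MessageClass' is the table name the test reads with pd.read_sql_table
    df.to_sql('MessageClass', engine, index=False, if_exists='replace')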
import re

import numpy as np
import pandas as pd


def clean_tokenize_datasets(dataset_path):
    """
    Argument:
        dataset_path: the folder storing the Amazon or Yelp dataset,
                      e.g. '../data/Amazon' or '../data/Yelp'
    Output:
        .csv files containing the cleaned and tokenized datasets
    """
    # build the full paths to the train/valid/test splits of the dataset
    is_amazon = re.search(r'Amazon', dataset_path)
    if is_amazon is not None:
        dataset_paths = [dataset_path + "/amazon.train.csv",
                         dataset_path + "/amazon.valid.csv",
                         dataset_path + "/amazon.test.csv"]
    else:
        dataset_paths = [dataset_path + "/yelp.train.csv",
                         dataset_path + "/yelp.valid.csv",
                         dataset_path + "/yelp.test.csv"]

    # read the three files, then clean and tokenize every review
    for file in dataset_paths:
        data = pd.read_csv(file, header=None, low_memory=False)
        # the first column is the label, the second one is the review text
        label_list = np.asarray(data.iloc[:, 0])
        context_list = np.asarray(data.iloc[:, 1])

        # clean and tokenize each review
        cleaned_context_list = []
        for context in context_list:
            cleaned_context_list.append(tokenize(clean_data(context)))

        # determine the output file name, then write the whole result to a .csv file
        if re.search(r'amazon', file) is not None:
            if re.search(r'train', file) is not None:
                output_file = "/amazon.train.cleaned.tokenized.csv"
            elif re.search(r'valid', file) is not None:
                output_file = "/amazon.valid.cleaned.tokenized.csv"
            else:
                output_file = "/amazon.test.cleaned.tokenized.csv"
        else:
            if re.search(r'train', file) is not None:
                output_file = "/yelp.train.cleaned.tokenized.csv"
            elif re.search(r'valid', file) is not None:
                output_file = "/yelp.valid.cleaned.tokenized.csv"
            else:
                output_file = "/yelp.test.cleaned.tokenized.csv"

        clean_token_df = pd.DataFrame()
        clean_token_df['label'] = label_list
        clean_token_df['cleaned_tokenized_review'] = cleaned_context_list
        clean_token_df.to_csv(dataset_path + output_file, encoding='utf-8', index=False)
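# clean_data and tokenize are called above but not defined in this snippet.
# The two helpers below are minimal, hypothetical stand-ins (regex cleanup and
# whitespace tokenization) so the function can run in isolation; the project's
# real helpers may differ.
import re


def clean_data(text):
    """Hypothetical cleaner: lowercase and strip non-alphanumeric characters."""
    text = str(text).lower()
    return re.sub(r'[^a-z0-9\s]', ' ', text)


def tokenize(text):
    """Hypothetical tokenizer: split the cleaned text on whitespace."""
    return text.split()


# Example call: writes amazon.{train,valid,test}.cleaned.tokenized.csv
# next to the input splits.
# clean_tokenize_datasets('../data/Amazon')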
def test_clean_data(self):
    """Does the clean_data function produce the desired result?"""
    # Arrange
    df_input = load_data("unittest_disaster_messages.csv",
                         "unittest_disaster_categories.csv")

    # Act
    df_result = clean_data(df_input)

    # Assert
    self.assertIsNotNone(df_result)
    self.assertEqual(df_result.shape, (1, 40))
    self.assertEqual(df_result['id'][0], 2)
    self.assertEqual(df_result['message'][0],
                     'Weather update - a cold front from Cuba that could pass over Haiti')
    self.assertEqual(df_result['original'][0],
                     'Un front froid se retrouve sur Cuba ce matin. Il pourrait traverser Haiti demain. Des averses de pluie isolee sont encore prevues sur notre region ce soi')
    self.assertEqual(df_result['genre'][0], 'direct')
    self.assertEqual(df_result['related'][0], 1)

    # every remaining category column of the single row should be 0
    zero_categories = [
        'request', 'offer', 'aid_related', 'medical_help', 'medical_products',
        'search_and_rescue', 'security', 'military', 'child_alone', 'water',
        'food', 'shelter', 'clothing', 'money', 'missing_people', 'refugees',
        'death', 'other_aid', 'infrastructure_related', 'transport',
        'buildings', 'electricity', 'tools', 'hospitals', 'shops',
        'aid_centers', 'other_infrastructure', 'weather_related', 'floods',
        'storm', 'fire', 'earthquake', 'cold', 'other_weather', 'direct_report',
    ]
    for category in zero_categories:
        self.assertEqual(df_result[category][0], 0)
def test_save_data(self):
    """Does save_data persist the data in the database?"""
    # Arrange
    df_input = load_data("unittest_disaster_messages.csv",
                         "unittest_disaster_categories.csv")
    df_result = clean_data(df_input)

    # Act
    save_data(df_result, self.UNIT_TEST_DB)

    # Assert: the cleaned data should be readable from the 'Messages' table
    engine = create_engine('sqlite:///' + self.UNIT_TEST_DB)
    df = pd.read_sql_table('Messages', engine)
    self.assertIsNotNone(df)
    self.assertEqual(df.shape, (1, 40))
def read_input_file(path, header, attributes, clean, map_label=False):
    data = pd.read_json(path)
    data = data.loc[:, attributes]

    # flatten any nested JSON columns listed in `header` into top-level columns
    if header is not None:
        for col_name in header:
            flattened_data = pd.json_normalize(data[col_name])
            header_names = flattened_data.columns
            for i in header_names:
                data[i] = flattened_data[i]
        data = data.drop(columns=header)

    if clean:
        data = prcs.clean_data(data, c.LABELS)
    if map_label:
        data = prcs.map_label(data, c.LABEL_MAP, c.LABELS)
    return data
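# A hedged usage sketch of read_input_file: the file name, attribute list, and
# nested column below are placeholders, and prcs / c are assumed to be the
# project's processing and constants modules.
reviews = read_input_file(
    path='reviews.json',                   # placeholder input file
    header=['user'],                       # nested JSON column(s) to flatten
    attributes=['user', 'text', 'label'],  # placeholder attribute list
    clean=True,
    map_label=True,
)
print(reviews.head())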
def test_clean_data():
    df1 = load_data('disaster_messages.csv', 'disaster_categories.csv')
    df2 = clean_data(df1)
    assert df2.shape == (26216, 40)
# import modules
import process_data
import pandas as pd

# load data from the csv files using the process_data.py methods
data = process_data.load_data('disaster_messages.csv', 'disaster_categories.csv')

# create a separate clean dataset to train the model
data_clean = process_data.clean_data(data)

# save a sqlite db for the models using the processed data
process_data.save_data(data_clean, 'emergency')


def custom_clean_data(df):
    """Clean the categories and merge them into the messages.

    Args:
        df => DataFrame of the merged categories and messages csv files
    Returns:
        df => DataFrame with cleaned category columns and dropped duplicates
    """
    categories = pd.Series(df.categories).str.split(';', expand=True)
    row = categories.loc[0]
    category_colnames = row.apply(lambda x: x[:-2]).values
    categories.columns = category_colnames
    for column in categories:
        # set each value to be the last character of the string
        categories[column] = categories[column].apply(lambda x: x[-1:]).values

    # remaining steps (assumed completion, following the docstring): cast the
    # category values to int, replace the raw 'categories' column with the
    # expanded columns, and drop duplicate rows
    categories = categories.astype(int)
    df = df.drop(columns=['categories'])
    df = pd.concat([df, categories], axis=1)
    return df.drop_duplicates()
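# A short usage sketch of custom_clean_data on the merged frame loaded above;
# the printed shape depends entirely on the input CSVs.
data_custom_clean = custom_clean_data(data)
print(data_custom_clean.shape)
print(data_custom_clean.head())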
def get_processed_dataset(dataset, sentiment_analysis):
    processed_dataset = clean_data(dataset) if sentiment_analysis else strip_data(dataset)
    return processed_dataset
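# A brief usage sketch: strip_data and clean_data are assumed to be defined
# elsewhere in the module, and the sample dataset is a placeholder.
sample = ["The battery life is great!", "Terrible customer service..."]
for_sentiment = get_processed_dataset(sample, sentiment_analysis=True)    # -> clean_data(sample)
stripped_only = get_processed_dataset(sample, sentiment_analysis=False)   # -> strip_data(sample)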
import process_data
import pandas as pd

df = process_data.load_data('./disaster_messages.csv', './disaster_categories.csv')
print(df.columns)

df = process_data.clean_data(df)
print(df)
from flask import Flask
from flask import render_template, request, jsonify
from plotly.graph_objs import Bar
from sqlalchemy import create_engine
import joblib

# nltk is needed for the tokenize() helper below
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# import the load and clean data functions
import sys
sys.path.insert(1, './data')
from process_data import load_data, clean_data

app = Flask(__name__)

# load and clean the data
df = load_data('data/disaster_messages.csv', 'data/disaster_categories.csv')
df = clean_data(df)


def tokenize(text):
    """Lemmatize, lowercase, and strip each token of the input text."""
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


# load model
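# The snippet stops at the '# load model' comment. Below is a hedged sketch of
# how the app might continue: loading a pickled classifier with joblib and
# serving a minimal index page. The model path, template name, and port are
# assumptions, not the project's actual values.
model = joblib.load("models/classifier.pkl")  # assumed path


@app.route('/')
@app.route('/index')
def index():
    # summarize the cleaned data and hand it to a landing-page template
    genre_counts = df.groupby('genre').count()['message']
    return render_template('master.html',
                           genre_names=list(genre_counts.index),
                           genre_counts=list(genre_counts.values))


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=3001, debug=True)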