Example #1
import pandas as pd

# load_data, clean_data and save_data live in the project's process_data
# module (see Example #4); the import path is assumed here
from process_data import load_data, clean_data, save_data


def test_write_to_db():
    df1 = load_data('disaster_messages.csv', 'disaster_categories.csv')
    df2 = clean_data(df1)
    database_filepath = 'sqlite:///DisasterResponsetest.db'
    save_data(df2, database_filepath)

    # load the saved table back from the database
    df3 = pd.read_sql_table('MessageClass', database_filepath)

    assert df3.shape == (26216, 40)
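This test and the next one exercise a save_data helper that isn't shown. A minimal sketch of what it presumably does, assuming it accepts a full SQLAlchemy URI and writes the MessageClass table this test reads back (the to_sql call and its if_exists policy are assumptions):

from sqlalchemy import create_engine


def save_data(df, database_filepath):
    # sketch: persist the cleaned DataFrame to SQLite, replacing any
    # existing table; signature and table name follow the test above
    engine = create_engine(database_filepath)
    df.to_sql('MessageClass', engine, index=False, if_exists='replace')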
Example #2
    def test_save_data(self):
        """Does save_data persist the data in the database ?"""
        # Arrange
        df_input = load_data("unittest_disaster_messages.csv", "unittest_disaster_categories.csv")
        df_result = clean_data(df_input)

        # Act
        save_data(df_result, self.UNIT_TEST_DB)
        # Assert
        engine = create_engine('sqlite:///' + self.UNIT_TEST_DB)
        df = pd.read_sql_table('Messages', engine)

        self.assertIsNotNone(df)
        self.assertEqual(df.shape, (1, 40))
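The method above belongs to a unittest.TestCase subclass that isn't shown; a minimal sketch of the scaffolding it assumes (the class name and fixture path are illustrative):

import os
import unittest


class ProcessDataTests(unittest.TestCase):
    # hypothetical fixture path; the real value isn't shown in the example
    UNIT_TEST_DB = 'unittest_DisasterResponse.db'

    def tearDown(self):
        # remove the test database so each run starts from a clean slate
        if os.path.exists(self.UNIT_TEST_DB):
            os.remove(self.UNIT_TEST_DB)


if __name__ == '__main__':
    unittest.main()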
Example #3
    pca_data = pd.DataFrame(pca_data,
                            columns=['comp' + str(i) for i in range(ncols)])

    # Pull relevant data
    pca_meta = pd.DataFrame()
    pca_meta['explained_variance'] = pca.explained_variance_
    pca_meta['explained_variance_ratio'] = pca.explained_variance_ratio_
    for i, xcol in enumerate(pc.X_cols):
        pca_meta[xcol + "_weight"] = pca.components_[:, i]

    for col in pc.Y_cols:
        pca_data[col] = data[col]

    # Return transform and relevant data
    return pca, pca_meta, pca_data


if __name__ == "__main__":
    # Read file
    filename = pc.check_args(sys.argv)
    _, data = pc.process_metadata(filename)

    # Get PCA results
    _, pca_meta, pca_data = run(data)  # Will add tolerance opt later

    # Save to file
    meta_savefile = pc.new_filename(filename, "pca_metadata")
    data_savefile = pc.new_filename(filename, "pca_components")
    pc.save_data(pca_meta, meta_savefile)
    pc.save_data(pca_data, data_savefile)
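The snippet shows only the tail of a run function; a plausible opening, assuming scikit-learn's PCA and that pc.X_cols lists the feature columns (both assumptions inferred from the usage above):

from sklearn.decomposition import PCA


def run(data):
    # sketch: fit a PCA on the feature columns named in pc.X_cols
    X = data[pc.X_cols].values       # feature matrix (pc.X_cols assumed)
    pca = PCA()                      # keep all components by default
    pca_data = pca.fit_transform(X)  # rows projected onto principal axes
    ncols = pca_data.shape[1]        # one output column per component
    # ... continues with the DataFrame assembly shown above ...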
Example #4
# import modules
import pandas as pd

import process_data

# load data from the csv files using the process_data.py helpers
data = process_data.load_data('disaster_messages.csv',
                              'disaster_categories.csv')
# create a separate cleaned dataset for model training
data_clean = process_data.clean_data(data)

# save a sqlite db for the models using the processed data
process_data.save_data(data_clean, 'emergency')


def custom_clean_data(df):
    """Clean categories and merge them back into the messages.

    Args:
        df => DataFrame of the merged categories and messages csv files

    Returns:
        df => DataFrame with cleaned category columns and duplicates dropped
    """
    # split the single semicolon-delimited categories column into columns
    categories = df.categories.str.split(';', expand=True)

    # derive column names from the first row (strip the trailing '-0'/'-1')
    row = categories.loc[0]
    category_colnames = row.apply(lambda x: x[:-2]).values
    categories.columns = category_colnames

    for column in categories:
        # set each value to be the last character of the string
        categories[column] = categories[column].apply(lambda x: x[-1:]).values

        # convert column from string to numeric
        categories[column] = categories[column].astype(int)

    # replace the original categories column with the expanded columns
    df = df.drop(columns=['categories'])
    df = pd.concat([df, categories], axis=1)

    # drop duplicate rows
    df = df.drop_duplicates()

    return df
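A short usage sketch for the function above, reusing the same CSV inputs (illustrative only):

# usage sketch: clean the merged data and inspect the result
df_raw = process_data.load_data('disaster_messages.csv',
                                'disaster_categories.csv')
df_clean = custom_clean_data(df_raw)
print(df_clean.shape)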
Example #5
        sys.exit(-1)

    # Get algorithm name
    alg_name = argv[2]
    if alg_name not in algorithms:
        print("Invalid algorithm name: " + alg_name)
        sys.exit(-1)

    grid_search = len(argv) == 4 and argv[3] == "grid"

    return filename, alg_name, grid_search


### Main program
if __name__ == "__main__":
    # Read file
    filename, alg_name, grid_search = check_ML_args(sys.argv)

    _, data = pc.process_metadata(filename)

    # Get ML results
    new_data, metrics = run(data, alg_name, grid_search)

    grid_tag = "gridsearch" if grid_search else "ML"
    data_savefile = pc.new_filename(filename, grid_tag + "_" + alg_name)
    file_tag = os.path.basename(filename).split("_")[0]
    metrics_savefile = pc.new_filename(
        f"{file_tag}_{grid_tag}_{alg_name}_metrics.yaml", truncate=False)
    pc.save_data(new_data, data_savefile)
    pc.save_data(metrics, metrics_savefile)
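Here the snippet starts partway through the argument check; a plausible opening for check_ML_args, assuming a usage message and an algorithms registry (the names and messages are assumptions):

import os
import sys

# hypothetical registry of supported algorithm names
algorithms = {"forest", "svm", "knn"}


def check_ML_args(argv):
    # sketch: validate the command line before the checks shown above
    if len(argv) not in (3, 4):
        print("Usage: python ML.py <datafile> <algorithm> [grid]")
        sys.exit(-1)

    filename = argv[1]
    if not os.path.isfile(filename):
        print("File not found: " + filename)
        sys.exit(-1)
    # ... continues with the algorithm-name check shown above ...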