def test_read_col(self):
    """Verify the Spark reader sees the same number of columns as pandas.

    Loads the same test fixture through ETLAmazon (gzipped JSON via Spark)
    and directly with pandas, then compares column counts.
    """
    etl = ea.ETLAmazon()
    # Spark path: read the gzipped fixture and bring it into pandas.
    spark_frame = etl.read_json(
        "thenextbestbook/etl/tests/data/test_file.json.gz")
    spark_as_pd = spark_frame.toPandas()
    # Ground truth: pandas reads the uncompressed copy of the same data.
    truth_frame = pd.read_json(
        "thenextbestbook/etl/tests/data/test_file.json")
    self.assertEqual(spark_as_pd.shape[1], truth_frame.shape[1])
def test_sql_cmd(self):
    """Test that a SQL query executes correctly against the dataset.

    Registers the test fixture as a global temp view, runs a COUNT(*)
    query through the ETL wrapper, and checks the expected row count.
    """
    etl = ea.ETLAmazon()
    runs = etl.read_json(
        "thenextbestbook/etl/tests/data/test_file.json.gz")
    # createOrReplace avoids TempTableAlreadyExistsException when the
    # suite runs more than once inside a single Spark session.
    runs.createOrReplaceGlobalTempView("runs")
    # NOTE: the original implicit string concatenation produced
    # "SELECT COUNT(*)FROM ..." — a space was missing at the join point.
    query_result = etl.sql_query(
        "SELECT COUNT(*) "
        "FROM global_temp.runs")
    query_result_pd = query_result.toPandas()
    # .iloc[0, 0] extracts the scalar directly; int(Series) is deprecated.
    self.assertEqual(int(query_result_pd.iloc[0, 0]), 263)
"""Script to run ETL on Amazon data."""
import etl_amazon as ea
import constants as ct

# Spin up the Spark session via the ETL wrapper.
etl = ea.ETLAmazon()

# Load the book reviews and the product metadata from their JSON sources.
books = etl.read_json(ct.AMAZON_BOOKS_JSON)
metadata = etl.read_json(ct.AMAZON_METADATA_JSON)

# Register both frames as global temp views so Spark SQL can join them.
books.createGlobalTempView("books")
metadata.createGlobalTempView("metadata")

# Attach book titles to reviews by joining on ASIN.
books_with_title = etl.get_title_on_asin()

# Persist the joined result as JSON (DataFrameWriter.json is the
# shorthand for .format('json').save(...)).
books_with_title.write.json(ct.AMAZON_REVIEWS_DESTINATION)