def test_41_load_incremental_should_return_0_if_no_new_records(self):
    test_covid_nyt_data_latest = os.path.join(sys.path[0], 'testdata/nyt_data_latest.csv')
    df_nyt_data, df_jh_data = extract_covid_data(test_covid_nyt_data_latest, self._test_covid_jh_data)
    df_transformed_latest = transform(df_nyt_data, df_jh_data)
    df_size = load_incremental(df_transformed_latest, self.conn)
    self.assertEqual(df_size, 0)
@classmethod
def setUpClass(cls):
    # Shared fixtures: sample extracts, the transformed frame, and a throwaway Postgres server.
    cls._test_covid_nyt_data = os.path.join(sys.path[0], 'testdata/nyt_data.csv')
    cls._test_covid_jh_data = os.path.join(sys.path[0], 'testdata/jh_data.csv')
    df_nyt_data, df_jh_data = extract_covid_data(cls._test_covid_nyt_data, cls._test_covid_jh_data)
    cls._df_transformed = transform(df_nyt_data, df_jh_data)
    # Spin up a temporary PostgreSQL instance and create the target schema.
    cls.postgresql = testing.postgresql.Postgresql()
    cls.conn = psycopg2.connect(**cls.postgresql.dsn())
    cursor = cls.conn.cursor()
    cursor.execute("CREATE SCHEMA covid")
    cursor.close()
    cls.conn.commit()
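# Not part of the original snippet: a minimal teardown sketch, assuming the
# class-level fixtures created in setUpClass above. It closes the shared
# connection and stops the throwaway testing.postgresql server so nothing is
# left running after the test run.
@classmethod
def tearDownClass(cls):
    cls.conn.close()
    cls.postgresql.stop()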
def load_data(event, context):
    try:
        # Read the source URLs from the YAML config and open a warehouse connection.
        data = yaml.load(open(CONFIG_FILE), Loader=yaml.BaseLoader)['data']
        url_covid_nyt_data = data['url_covid_nyt_data']
        url_covid_jh_data = data['url_covid_jh_data']
        dwh_conn = db_utils.get_dwh_conn('dwh')

        # Extract both sources, transform them, and load the result into the warehouse.
        df_nyt_data, df_jh_data = extract_covid_data(url_covid_nyt_data, url_covid_jh_data)
        df_transformed = transform(df_nyt_data, df_jh_data)
        load_to_dwh(df_transformed, dwh_conn)
    except (Exception, psycopg2.Error) as error:
        # Report the failure instead of letting the handler raise.
        notify_etl_status(False, str(error))
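# Not part of the original handler: a hypothetical sketch of what
# yaml.load(open(CONFIG_FILE), Loader=yaml.BaseLoader) is assumed to return.
# load_data() only reads the two URL keys under the top-level 'data' section,
# and BaseLoader yields every value as a plain string. The URLs below are
# placeholders, not the real source locations.
expected_config = {
    'data': {
        'url_covid_nyt_data': 'https://example.org/nyt/us.csv',
        'url_covid_jh_data': 'https://example.org/jh/recovered_global.csv',
    }
}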
def test_40_load_incremental(self):
    # Transform the newer extract and load it into the test warehouse.
    test_covid_nyt_data_latest = os.path.join(sys.path[0], 'testdata/nyt_data_latest.csv')
    df_nyt_data, df_jh_data = extract_covid_data(test_covid_nyt_data_latest, self._test_covid_jh_data)
    df_transformed_latest = transform(df_nyt_data, df_jh_data)
    load_to_dwh(df_transformed_latest, self.conn)

    df_result = db_utils.get_dwh_result_as_df(
        self.conn,
        "select * from covid.daily_stats ds order by ds.rep_date desc",
        "")

    # The warehouse should now hold 257 daily rows, newest first.
    exp_shape = (257, 4)
    exp_recent_record = [datetime.date(2020, 10, 4), 7444705, 209603, 2911699]
    exp_columns = ['rep_date', 'cases', 'deaths', 'recovered']
    self.assertTupleEqual(df_result.shape, exp_shape)
    self.assertListEqual(list(df_result.columns), exp_columns)
    self.assertListEqual(list(df_result.iloc[0]), exp_recent_record)
import logging

from config import configure_logging
from etl.extract import import_data
from etl.transform import transform
from etl.load import load

if __name__ == "__main__":
    configure_logging()
    logging.info("Launching ETL")

    dataset = None

    # Extract: import data from the remote repo
    dataset = import_data(remote=True)
    # Transform data
    dataset = transform(dataset)
    # Load: save data to disk
    result = load(dataset)

    logging.info(f"ETL finished: {result}")
import findspark
findspark.init()

from pyspark.sql import SparkSession

from etl.ingest import ingest_logs
from etl.transform import transform
from analytics.log_analytics import analysis

if __name__ == '__main__':
    spark = (SparkSession.builder
             .appName('Whitehouse Logs')
             .config('spark.master', 'local')
             .getOrCreate())
    print(spark.version)

    ingest_logs(spark)
    transform(spark)
    analysis(spark)
# Let's get to work...
downloaded_count = None
data = None
output = None

if run_cmds.find('e') > -1:
    # Extract files from the FTP location and download them
    downloaded_count = extract.extract(source=source, extract_storage=extract_storage)
    print 'Downloaded %s files' % downloaded_count

if run_cmds.find('t') > -1:
    # Transform the downloaded files
    data = transform.transform(extract_storage=extract_storage, db=db, csv_schema=csv_schema)
    print 'Processed and stored %s files in %s' % (len(data), db)

if run_cmds.find('l') > -1:
    # Load the processed content to Google Fusion Tables
    # (the credential prompts and upload call are redacted in the source)
    username = raw_input('Google user: '******'Google password: '******'Done'
def go():
    return transform(export())
def test_transform(self):
    # The transformed frame should have one row per reporting date and the four expected columns.
    df_transformed = transform(self._df_nyt_data, self._df_jh_data)

    exp_shape = (253, 4)
    self.assertTupleEqual(df_transformed.shape, exp_shape)

    exp_columns = ['date', 'cases', 'deaths', 'recovered']
    self.assertListEqual(list(df_transformed.columns), exp_columns)