def get_config(provider, config_path=None):
    """Return the configuration data for *provider*.

    Reads from *config_path* when one is supplied; otherwise falls back to
    a ``./config.json`` in the current working directory. When neither is
    available, an empty dict is returned.
    """
    if config_path:
        source = config_path
    elif pathlib.Path("./config.json").exists():
        source = "./config.json"
    else:
        # No configuration available anywhere — empty config.
        return {}
    return mds.ConfigFile(source, provider).dump()
def load_to_s3_pgdb(**kwargs):
    """Airflow Python operator: pull MDS data for one provider, archive the
    raw JSON to S3, and load it into the Postgres database.

    Expected ``kwargs`` (supplied by Airflow):
      params.company  -- provider name; selects the config entry and S3 paths
      execution_date  -- logical run timestamp (end of the query window)
      ts              -- run timestamp string used in the S3 object keys

    Returns True on success. Re-raises any S3 error other than a 404 on
    the shared config object.
    """
    # Fetch the shared provider config from the data-lake bucket.
    s3 = connect_aws_s3()
    try:
        s3.Bucket("city-of-los-angeles-data-lake").download_file(
            "dockless/config.json", "/tmp/config.json"
        )
    except botocore.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "404":
            print("The object does not exist.")
        else:
            raise
    company = kwargs["params"]["company"]
    config = mds.ConfigFile("/tmp/config.json", company)
    logging.info("Downloaded and parsed config from S3")
    # Fall back to MDS 0.3.2 when the config does not pin a version.
    version = getattr(config, "version", "0.3.2")
    logging.info(f"set company to {company}")
    logging.info(f"Referencing MDS @ {version}")
    client = mds.Client(company, config, version=version)
    execution_date = kwargs["execution_date"]
    # Test if the provider is Jump: shift the window back 25 hours because
    # their ETL is slow.
    if client.provider.provider_id == "c20e08cf-8488-46a6-a66c-5d8fb827f7e0":
        end_time = execution_date - timedelta(hours=25)
    else:
        end_time = execution_date
    # Both branches query a 12-hour window ending at end_time.
    start_time = end_time - timedelta(hours=12)
    # Query and archive status changes.
    status_changes = client.get_status_changes(
        end_time=end_time, start_time=start_time
    )
    obj = s3.Object(
        "city-of-los-angeles-data-lake",
        f"dockless/data/{company}/status_changes/{kwargs['ts']}.json",
    )
    obj.put(Body=json.dumps(status_changes))
    logging.info(f"Wrote {company} status changes to s3")
    # Query and archive trips.
    trips = client.get_trips(end_time=end_time, start_time=start_time)
    obj = s3.Object(
        "city-of-los-angeles-data-lake",
        f"dockless/data/{company}/trips/{kwargs['ts']}.json",
    )
    obj.put(Body=json.dumps(trips))
    logging.info(f"Wrote {company} trips to s3")
    logging.info("Connecting to DB")
    logging.info("Logging into postgres")
    db = mds.Database(uri=POSTGRES_URI, version=version)
    # Skip the DB load when a feed returned nothing.
    # (These messages were missing the f-prefix and logged the literal
    # text "{company}" — fixed.)
    if status_changes:
        logging.info(f"loading {company} status changes into DB")
        db.load_status_changes(
            source=status_changes,
            stage_first=5,
            before_load=normalize_status_changes,
        )
    else:
        logging.info(
            f"Warning: not loading status change data for {company} as no "
            "data was received"
        )
    if trips:
        logging.info(f"loading {company} trips into DB")
        db.load_trips(source=trips, stage_first=5, before_load=normalize_trips)
    else:
        logging.info(
            f"Warning: not loading trip data for {company} as no data was "
            "received"
        )
    return True
def load_to_s3(**kwargs):
    """Airflow Python operator: pull MDS data for one provider, archive the
    raw JSON to S3, and load it into Postgres via an Airflow connection.

    Expected ``kwargs`` (supplied by Airflow):
      params.company  -- provider name; selects the config entry and S3 paths
      execution_date  -- logical run timestamp (end of the query window)
      ts              -- run timestamp string used in the S3 object keys

    Returns True on success. Re-raises any S3 error other than a 404 on
    the shared config object.
    """
    # Fetch the shared provider config from the data-lake bucket.
    s3 = connect_aws_s3()
    try:
        s3.Bucket("city-of-los-angeles-data-lake").download_file(
            "dockless/config.json", "/tmp/config.json"
        )
    except botocore.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "404":
            print("The object does not exist.")
        else:
            raise
    company = kwargs["params"]["company"]
    config = mds.ConfigFile("/tmp/config.json", company)
    logging.info("Downloaded and parsed config from S3")
    # Fall back to MDS 0.3.2 when the config does not pin a version.
    version = getattr(config, "version", "0.3.2")
    logging.info(f"set company to {company}")
    logging.info(f"Referencing MDS @ {version}")
    client = mds.Client(company, config, version=version)
    end_time = kwargs["execution_date"]
    # Test if the provider is Jump: use a wider window because their ETL
    # is slow.
    if client.provider.provider_id == "c20e08cf-8488-46a6-a66c-5d8fb827f7e0":
        start_time = end_time - timedelta(hours=25)
    else:
        start_time = end_time - timedelta(hours=12)
    # Query and archive status changes.
    status_changes = client.get_status_changes(
        end_time=end_time, start_time=start_time
    )
    obj = s3.Object(
        "city-of-los-angeles-data-lake",
        f"dockless/data/{company}/status_changes/{kwargs['ts']}.json",
    )
    obj.put(Body=json.dumps(status_changes))
    logging.info(f"Wrote {company} status changes to s3")
    # Query and archive trips.
    trips = client.get_trips(end_time=end_time, start_time=start_time)
    obj = s3.Object(
        "city-of-los-angeles-data-lake",
        f"dockless/data/{company}/trips/{kwargs['ts']}.json",
    )
    obj.put(Body=json.dumps(trips))
    logging.info(f"Wrote {company} trips to s3")
    logging.info("Connecting to DB")
    # Build the DB URI from the Airflow connection; never log the password.
    user = pg_conn.login
    password = pg_conn.get_password()
    host = pg_conn.host
    dbname = pg_conn.schema
    logging.info(f"Logging into postgres://-----:----@{host}:5432/{dbname}")
    db = mds.Database(
        uri=f"postgres://{user}:{password}@{host}:5432/{dbname}",
        version=version,
    )
    # Skip the DB load when a feed returned nothing (matches the guard in
    # load_to_s3_pgdb). The messages below were also missing the f-prefix
    # and logged the literal text "{company}" — fixed.
    if status_changes:
        logging.info(f"loading {company} status changes into DB")
        db.load_status_changes(
            source=status_changes,
            stage_first=5,
            before_load=normalize_status_changes,
        )
    else:
        logging.info(
            f"Warning: not loading status change data for {company} as no "
            "data was received"
        )
    if trips:
        logging.info(f"loading {company} trips into DB")
        db.load_trips(source=trips, stage_first=5, before_load=normalize_trips)
    else:
        logging.info(
            f"Warning: not loading trip data for {company} as no data was "
            "received"
        )
    return True