def main(context: DurableOrchestrationContext):
    logging.info("Main ETL orchestrator has been triggered")

    default_retry_opts = RetryOptions(
        first_retry_interval_in_milliseconds=5_000,
        max_number_of_attempts=6
    )

    retry_twice_opts = RetryOptions(
        first_retry_interval_in_milliseconds=5_000,
        max_number_of_attempts=2
    )

    trigger_payload: str = context.get_input()
    logging.info(f"\tTrigger received: {trigger_payload}")

    trigger_data = loads(trigger_payload)

    environment = trigger_data.get("ENVIRONMENT", ENVIRONMENT)
    file_name: str = trigger_data['fileName']

    raw_timestamp = trigger_data.get('timestamp', context.current_utc_datetime.isoformat())[:26]
    logging.info(f"Process timestamp: {raw_timestamp}")

    file_date_raw, _ = raw_timestamp.split("T")
    file_date = datetime.strptime(file_date_raw, "%Y-%m-%d")

    now = context.current_utc_datetime
    now = datetime(
        year=file_date.year,
        month=file_date.month,
        day=file_date.day,
        hour=now.hour,
        minute=now.minute,
        second=now.second,
        microsecond=now.microsecond
    )

    # Prod registration is done through `register_file`.
    if not context.is_replaying and environment != "PRODUCTION":
        logging.info(f"Not replaying - registering '{file_name}'")
        register_file(filepath=file_name, timestamp=now, instance_id=context.instance_id)

    if not file_name.endswith("json"):
        context.set_custom_status(f"Identified as non-JSON: {file_name}.")

        _ = yield context.call_sub_orchestrator_with_retry(
            "chunk_etl_orchestrator",
            input_=dumps({
                "fileName": file_name,
                "environment": environment,
            }),
            retry_options=retry_twice_opts
        )

        return f"DONE: {trigger_data}"

    logging.info("Following the main data pathway.")

    # Determine whether or not the payload is for
    # processing legacy data.
    # NOTE: Legacy data do not get:
    #   - deployed to the database,
    #   - archived,
    #   - a new despatch token.
    is_legacy = trigger_data.get("legacy", False)
    logging.info(f"> Legacy mode: {is_legacy}")

    # Generate retrieve payload
    retrieve_payload = {
        'data_path': file_name,
        'timestamp': f"{raw_timestamp:0<26}",
        'legacy': is_legacy
    }

    logging.info(
        f'\tTrigger payload parsed - '
        f'processing "{retrieve_payload["data_path"]}" @ '
        f'"{retrieve_payload["timestamp"]}"'
    )

    # Read file and split into chunks by area type
    logging.info('\tStarting the process to retrieve new data')
    area_data_paths = yield context.call_activity_with_retry(
        "main_etl_retrieve_data",
        input_=retrieve_payload,
        retry_options=retry_twice_opts
    )
    logging.info('\tDOWNLOAD COMPLETE')
    context.set_custom_status("Data file has been parsed.")

    # Process chunks
    logging.info('Starting the main process')

    tasks = list()
    for data_path in area_data_paths:
        task = context.call_activity_with_retry(
            "main_etl_processor",
            input_=dict(
                data_path=data_path,
                timestamp=retrieve_payload['timestamp'] + "5Z",
                environment=environment
            ),
            retry_options=default_retry_opts
        )
        tasks.append(task)

    # Await processes
    etl_response = yield context.task_all(tasks)
    logging.info(f'>>> ALL MAIN ETL PROCESSES COMPLETE - length: {len(etl_response)}')
    context.set_custom_status("Main ETL processes are done. Creating box plot.")

    if is_legacy is True:
        context.set_custom_status("Legacy file detected.")
        return f"DONE: {context.current_utc_datetime}"

    _ = yield context.call_activity_with_retry(
        "chunk_etl_postprocessing",
        input_={
            "timestamp": now.isoformat(),
            "environment": environment,
            "category": "main"
        },
        retry_options=retry_twice_opts
    )

    context.set_custom_status("Deploying to the DB.")

    _ = yield context.call_sub_orchestrator_with_retry(
        "db_etl_orchestrator",
        input_=dumps({
            "datestamp": now.isoformat(),
            "environment": ENVIRONMENT,
            "main_data_path": file_name
        }),
        retry_options=retry_twice_opts
    )

    context.set_custom_status("Submitting main postprocessing tasks")

    _ = yield context.call_activity_with_retry(
        "main_etl_postprocessors",
        input_=dict(
            original_path=retrieve_payload['data_path'],
            timestamp=raw_timestamp,
            environment=environment
        ),
        retry_options=retry_twice_opts
    )
    logging.info("Done with latest main_etl_postprocessors.")

    # ====================================================================================

    tasks = list()

    # Retrieve scales
    context.set_custom_status("Requesting latest scale records.")
    area_types = ["nation", "region", "utla", "ltla", "msoa"]
    for area_type in area_types:
        task = context.call_activity_with_retry(
            "rate_scales_worker",
            retry_options=retry_twice_opts,
            input_={
                "type": "RETRIEVE",
                "timestamp": raw_timestamp,
                "area_type": area_type
            }
        )
        tasks.append(task)

    raw_scale_records = yield context.task_all(tasks)
    logging.info("Received latest scale records.")

    # ------------------------------------------------------------------------------------

    context.set_custom_status("Creating post deployment tasks")

    # Concatenate and archive processed data
    archive_response = context.call_activity_with_retry(
        "main_etl_archiver",
        input_=dict(
            results=etl_response,
            original_path=retrieve_payload['data_path'],
            timestamp=retrieve_payload['timestamp'] + "5Z",
            environment=environment
        ),
        retry_options=retry_twice_opts
    )
    logging.info("Created jobs for `main_etl_archiver`")

    # ....................................................................................
    # Pre-populate cache
    populate_cache = context.call_activity_with_retry(
        "cache_prepopulate",
        input_=dict(
            timestamp=raw_timestamp,
            environment=environment
        ),
        retry_options=retry_twice_opts
    )
    logging.info("Created jobs for `cache_prepopulate`")

    # ....................................................................................
    # Send daily report email
    daily_report = context.call_activity(
        "main_etl_daily_report",
        input_=dict(
            legacy=is_legacy,
            timestamp=raw_timestamp,
            environment=environment
        )
    )
    logging.info("Created jobs for `main_etl_daily_report`")

    # ....................................................................................
    tasks = [
        daily_report,
        archive_response,
        populate_cache
    ]

    # ....................................................................................
    # Generate rate scales
    for item in raw_scale_records:
        for record in item['records']:
            task = context.call_activity_with_retry(
                "rate_scales_worker",
                retry_options=retry_twice_opts,
                input_={
                    "type": "GENERATE",
                    "date": file_date_raw,
                    "timestamp": item["timestamp"],
                    "area_type": record['area_type'],
                    "area_code": record['area_code'],
                    "rate": record['rate'],
                    "percentiles": item['percentiles'],
                }
            )
            tasks.append(task)
    logging.info("Created jobs for `rate_scales_worker`")

    # ....................................................................................

    context.set_custom_status("Submitting post deployment tasks")
    _ = yield context.task_all(tasks)

    context.set_custom_status("ALL done.")
    return f"DONE: {trigger_data}"
def orchestrator_function(context: df.DurableOrchestrationContext):
    """This function provides a sample for activity triggers.

    Parameters
    ----------
    context: DurableOrchestrationContext
        This context has the past history and the durable orchestration API.

    Returns
    -------
    message
        Returns the result of the activity function return values.

    Yields
    ------
    call_activity: str
        Yields, depending on the `json_rule`, to wait on either all tasks to
        complete, or until one of the tasks completes.
    """
    message = []

    ret_bool = yield context.call_activity("ReturnBool", "1")
    message.append(f"ret_bool: {ret_bool} {type(ret_bool)}")

    # Not supported: return value from activity trigger "bytes" is not JSON serializable!
    # ret_bytes = yield context.call_activity("ReturnBytes", "1b2b3b")
    # message.append(f"ret_bytes : {ret_bytes} {type(ret_bytes)}")

    ret_dict_of_string = yield context.call_activity("ReturnDictOfString", "kv")
    message.append(
        f"ret_dict_of_string : {ret_dict_of_string} {type(ret_dict_of_string)}"
    )

    ret_dict_of_string_anno = yield context.call_activity(
        "ReturnDictOfStringWithAnnotation", "kv_anno")
    message.append(
        f"ret_dict_of_string_anno : {ret_dict_of_string_anno} {type(ret_dict_of_string_anno)}"
    )

    ret_float = yield context.call_activity("ReturnFloat", "123.0")
    message.append(f"ret_float : {ret_float} {type(ret_float)}")

    ret_int = yield context.call_activity("ReturnInt", "123")
    message.append(f"ret_int : {ret_int} {type(ret_int)}")

    ret_int_from_float = yield context.call_activity("ReturnIntFromFloat", 3.14)
    message.append(
        f"ret_int_from_float : {ret_int_from_float} {type(ret_int_from_float)}"
    )

    ret_list_of_float = yield context.call_activity("ReturnListOfFloat", "4.5")
    message.append(
        f"ret_list_of_float : {ret_list_of_float} {type(ret_list_of_float)}")

    ret_list_of_float_anno = yield context.call_activity(
        "ReturnListOfFloatWithAnnotation", "5.6")
    message.append(
        f"ret_list_of_float_anno : {ret_list_of_float_anno} {type(ret_list_of_float_anno)}"
    )

    # Not supported: return value from activity trigger "set" is not JSON serializable!
    # ret_set_of_int = yield context.call_activity("ReturnSetOfInt", 5)
    # message.append(f"ret_set_of_int : {ret_set_of_int} {type(ret_set_of_int)}")

    ret_string = yield context.call_activity("ReturnString", "simple_string")
    message.append(f"ret_string : {ret_string} {type(ret_string)}")

    return message
def orchestrator_function(context: df.DurableOrchestrationContext):
    result = yield context.call_activity('ActivityProcessECG', "")
    return [result]
def orchestrator_function(context: df.DurableOrchestrationContext):
    yield context.call_activity('Test-F1')
    result2 = yield context.call_activity('Test-F2')
    yield context.call_activity('Test-F3', result2)
def orchestrator_function(context: df.DurableOrchestrationContext):
    approve = yield context.call_activity("Approval", "Approved")
    reject = yield context.call_activity("Approval", "Rejected")
    return [approve, reject]
def main(context: DurableOrchestrationContext):
    logging.info("Chunk ETL orchestrator has been triggered")

    default_retry_opts = RetryOptions(
        first_retry_interval_in_milliseconds=5_000,
        max_number_of_attempts=6
    )

    retry_twice_opts = RetryOptions(
        first_retry_interval_in_milliseconds=5_000,
        max_number_of_attempts=2
    )

    trigger_payload: str = context.get_input()
    logging.info(f"\tTrigger received: {trigger_payload}")

    trigger_data = loads(trigger_payload)

    file_name = trigger_data["fileName"]
    metadata = parse_filepath(file_name)

    if metadata is None:
        # Path pattern does not conform to the defined pattern.
        context.set_custom_status("File name cannot be parsed. Process terminated.")
        return f"DONE: {trigger_data}"

    now = context.current_utc_datetime
    timestamp_raw = datetime.strptime(
        f'{metadata["timestamp"]}{now:%S}.{now:%f}',
        "%Y%m%d%H%M%S.%f"
    )
    timestamp = timestamp_raw.isoformat()

    main_path = trigger_data['fileName']
    if main_path.endswith("json"):
        process_name = "MAIN"
    else:
        process_name = category_label(metadata)

    msg = (
        f'Starting to upload pre-processed data: '
        f'{metadata["area_type"]}::{metadata["category"]}::{metadata["subcategory"]}'
    )

    if (metadata["area_type"], metadata["category"]) == ("MSOA", "vaccinations-by-vaccination-date"):
        logging.info(msg)
        context.set_custom_status(msg)
        process_name = "MSOA: VACCINATION - EVENT DATE"

        _ = yield context.call_activity_with_retry(
            "chunk_db_direct",
            input_={
                'data_path': file_name,
                'area_type': metadata["area_type"],
                'timestamp': timestamp,
                'process_name': process_name
            },
            retry_options=retry_twice_opts
        )

        logging.info(f"DONE: {msg}")
        context.set_custom_status(f"DONE: {msg}")

    elif (metadata["area_type"], metadata["category"]) == ("MSOA", "cases-by-specimen-date"):
        logging.info(msg)
        context.set_custom_status(msg)
        process_name = "MSOA"

        _ = yield context.call_sub_orchestrator_with_retry(
            "msoa_etl_orchestrator",
            input_=dumps({
                'data_path': file_name,
                'area_type': metadata["area_type"],
                'timestamp': timestamp,
                'process_name': process_name,
                'main_data_path': file_name
            }),
            retry_options=retry_twice_opts
        )

        logging.info(f"DONE: {msg}")
        context.set_custom_status(f"DONE: {msg}")

    else:
        # Read file and split into chunks
        # by area type / area code combination.
        logging.info('\tStarting the process to retrieve new data')
        context.set_custom_status("Parsing the payload")

        area_data_paths = yield context.call_activity_with_retry(
            "chunk_etl_retriever",
            input_={
                'path': file_name,
                'date': metadata["date"],
                'area_type': metadata["area_type"],
                'category': metadata["category"],
                'subcategory': metadata["subcategory"],
                'timestamp': timestamp
            },
            retry_options=retry_twice_opts
        )
        logging.info('\tDOWNLOAD COMPLETE')
        context.set_custom_status("Payload has been parsed")

        # Process chunks
        logging.info('Starting the main process')
        tasks = list()
        context.set_custom_status("Submitting main ETL processes")

        # Create ETL tasks based on the paths
        # returned by `chunk_etl_retriever`.
        for item in area_data_paths:
            data_path = item.pop("path")
            task = context.call_activity_with_retry(
                "chunk_etl_processor",
                input_=dict(
                    base=dict(
                        data_path=data_path,
                        timestamp=timestamp,
                        environment="PRODUCTION"
                    ),
                    **item
                ),
                retry_options=default_retry_opts
            )
            tasks.append(task)

        context.set_custom_status("Awaiting ETL processes")

        # Await processes
        etl_response = yield context.task_all(tasks)
        logging.info(
            f'>>> ALL MAIN ETL PROCESSES COMPLETE - length: {len(etl_response)}'
        )

        chunks_path = f"daily_chunks/{metadata['category']}/{metadata['date']}/"
        if metadata['subcategory']:
            chunks_path = (
                f"daily_chunks/{metadata['category']}/"
                f"{metadata['subcategory']}/{metadata['date']}/"
            )

        # Deploy processed data to the DB.
        context.set_custom_status(f"Deploying to the database: {chunks_path}")
        _ = yield context.call_sub_orchestrator_with_retry(
            "db_etl_orchestrator",
            input_=dumps({
                "datestamp": metadata['date'],
                "path": chunks_path,
                "environment": "PRODUCTION",
                "area_type": metadata['area_type'],
                "category": metadata['category'],
                "subcategory": metadata['subcategory'],
                "main_data_path": file_name
            }),
            retry_options=retry_twice_opts
        )

        context.set_custom_status("Postprocessing")
        _ = yield context.call_activity_with_retry(
            "chunk_etl_postprocessing",
            input_={
                "timestamp": timestamp,
                "environment": "PRODUCTION",
                "category": metadata['category'],
                "subcategory": metadata['subcategory'] if metadata['subcategory'] != "" else None,
                "area_type": metadata['area_type'] if metadata['area_type'] != "" else None
            },
            retry_options=retry_twice_opts
        )

        context.set_custom_status(
            "Deployment to the DB is complete, submitting postprocessing tasks."
        )

        settings_task = context.call_activity_with_retry(
            'db_etl_update_db',
            input_=dict(
                date=f"{timestamp_raw:%Y-%m-%d}",
                process_name=process_name,
                environment=trigger_data['environment']
            ),
            retry_options=retry_twice_opts
        )

        context.set_custom_status("Submitting main postprocessing tasks")
        post_processes = context.call_activity_with_retry(
            "main_etl_postprocessors",
            input_=dict(
                original_path=file_name,
                timestamp=timestamp,
                environment=trigger_data['environment']
            ),
            retry_options=retry_twice_opts
        )

        graphs_task = context.call_activity_with_retry(
            'db_etl_homepage_graphs',
            input_=dict(
                date=f"{timestamp_raw:%Y-%m-%d}",
                category=metadata['category'],
                subcategory=metadata['subcategory']
            ),
            retry_options=retry_twice_opts
        )

        _ = yield context.task_all([graphs_task, settings_task, post_processes])

        context.set_custom_status(
            "Metadata updated / graphs created / main postprocessing tasks complete. ALL DONE."
        )

    return f"DONE: {trigger_data}"
def orchestrator_function(context: df.DurableOrchestrationContext):
    x = yield context.call_activity("F1", None)
    y = yield context.call_activity("F2", x)
    z = yield context.call_activity("F3", y)
    result = yield context.call_activity("F4", z)
    return result
def orchestrator_function(context: df.DurableOrchestrationContext):
    result1 = yield context.call_activity('E1_SayHello', "Tokyo")
    result2 = yield context.call_activity('E1_SayHello', "Seattle")
    result3 = yield context.call_activity('E1_SayHello', "London")
    return [result1, result2, result3]
def orchestrator_function(context: df.DurableOrchestrationContext):
    uri = context.get_input()
    result = yield context.call_http('GET', uri=uri)
    return [result]
def main(context: DurableOrchestrationContext):
    logging.info("Despatch ops orchestrator has been triggered")

    trigger_payload: str = context.get_input()
    logging.info(f"\tTrigger received: {trigger_payload}")

    retry_options = RetryOptions(
        first_retry_interval_in_milliseconds=5_000,
        max_number_of_attempts=5
    )

    trigger_data = loads(trigger_payload)

    devices = [Device.desktop, Device.mobile]
    area_types = ["utla", "ltla", "msoa"]

    tasks = list()

    for area_type, device in product(area_types, devices):
        task = context.call_activity_with_retry(
            "despatch_ops_workers",
            retry_options=retry_options,
            input_={
                "handler": "map_geojson",
                "payload": {
                    "area_type": area_type,
                    "device": device,
                    "timestamp": trigger_data["timestamp"]
                }
            }
        )
        tasks.append(task)

    task = context.call_activity_with_retry(
        "despatch_ops_workers",
        retry_options=retry_options,
        input_={
            "handler": "vax_map_geojson",
            "payload": {"timestamp": trigger_data["timestamp"]}
        }
    )
    tasks.append(task)

    area_types = ["nation", "region", "utla", "ltla", "msoa"]
    for area_type in area_types:
        task = context.call_activity_with_retry(
            "despatch_ops_workers",
            retry_options=retry_options,
            input_={
                "handler": "map_percentiles",
                "payload": {
                    "area_type": area_type,
                    "timestamp": trigger_data["timestamp"]
                }
            }
        )
        tasks.append(task)

    task = context.call_activity_with_retry(
        "despatch_ops_workers",
        retry_options=retry_options,
        input_={
            "handler": "archive_dates",
            "payload": {
                "data_type": "MAIN",
                "timestamp": trigger_data["timestamp"]
            }
        }
    )
    tasks.append(task)

    task = context.call_activity_with_retry(
        "despatch_ops_workers",
        retry_options=retry_options,
        input_={
            "handler": "og_images",
            "payload": {"timestamp": trigger_data["timestamp"]}
        }
    )
    tasks.append(task)

    task = context.call_activity_with_retry(
        "despatch_ops_workers",
        retry_options=retry_options,
        input_={
            "handler": "sitemap",
            "payload": {"timestamp": trigger_data["timestamp"]}
        }
    )
    tasks.append(task)

    task = context.call_activity_with_retry(
        "despatch_ops_workers",
        retry_options=retry_options,
        input_={
            "handler": "landing_page_map",
            "payload": {"timestamp": trigger_data["timestamp"]}
        }
    )
    tasks.append(task)

    context.set_custom_status("All jobs created - submitting for execution.")
    _ = yield context.task_all(tasks)
    context.set_custom_status("All jobs complete - updating timestamps.")

    tasks = list()
    for item in ReleaseTimestamps:
        processor_fn = item["process"]
        value = processor_fn(trigger_data["releaseTimestamp"])

        task = context.call_activity_with_retry(
            "despatch_ops_release",
            retry_options=retry_options,
            input_={
                "path": item["path"],
                "container": item["container"],
                "value": value
            }
        )
        tasks.append(task)

    _ = yield context.task_all(tasks)
    context.set_custom_status("Timestamps updated - clearing Redis cache.")

    _ = yield context.call_sub_orchestrator_with_retry(
        'cache_buster_orchestrator',
        input_=dumps({
            "to": FLUSH_DESPATCH,
            "timestamp": trigger_data.get('timestamp')
        }),
        retry_options=retry_options
    )

    context.set_custom_status(f"ALL DONE: {trigger_payload}")
    return f"ALL DONE: {trigger_payload}"
def orchestrator_function(context: df.DurableOrchestrationContext):
    result = yield context.call_activity(
        'CalculateTurkeyBrineEquationAndRoastRecommendation', 7)
    return result
def orchestrator_function(context: df.DurableOrchestrationContext):
    logging.info("Orchestrator started")

    ## Make sure the mp4 is in the right container
    inputDict = json.loads(context._input)
    logging.info(f"inputDict: {inputDict}")
    # _container_ = inputDict['container']
    fileURL = inputDict['fileUrl']
    rowID = inputDict['RowID']

    try:
        # imagesAlreadyCreated = inputDict['imagesAlreadyCreated']
        MyFunctions.update_row_status(
            rowID=rowID,
            status=f'In Progress - {os.getenv("appName")}'
        )

        startUTCstr = datetime.strftime(context.current_utc_datetime,
                                        "%Y-%m-%d %H:%M:%S.%f")

        ## Get the AzureBlobVideos table from SQL, in dict form
        abv = MyFunctions.getAzureBlobVideos2()
        logging.info(f"AzureBlobVideos table retrieved, rows: {len(abv)}")

        ## If the video name is in the dict, extract the information
        try:
            videoName0 = inputDict['blob']

            ## If the last 11 characters (excluding '.mp4') follow '-YYYY-MM-DD',
            ## then remove them
            videoName = MyFunctions.cleanUpVidName(videoName0)

            ## Get the relevant sport and event name for the video (excluding '.mp4')
            (videoID, sport, event,
             endpointID, multipleVideoEvent,
             samplingProportion, audioTranscript,
             databaseID) = abv[videoName[:-4]]

            ## Convert databaseID to None if it has been left empty rather than NULL
            databaseID = None if databaseID == "" else databaseID

            for metric, value in [
                ("videoID", videoID),
                ("sport", sport),
                ("event", event),
                ("endpointID", endpointID),
                ("multipleVideoEvent", multipleVideoEvent),
                ("samplingProportion", samplingProportion),
                ("audioTranscript", audioTranscript),
                ("databaseID", databaseID)
            ]:
                logging.info(f"{metric}: {value}")

            ## Correct samplingProportion from nan to None if needed
            if pd.isna(samplingProportion):
                samplingProportion = None
                logging.info("samplingProportion changed")
                logging.info(f"samplingProportion: {samplingProportion}")
            else:
                logging.info("this is not True: pd.isna(samplingProportion)")

        except KeyError:
            videoID = None
            sport = None
            event = None
            endpointID = None
            multipleVideoEvent = None
            samplingProportion = None
            audioTranscript = None
            logging.info("Video not in AzureBlobVideos so relevant values assigned None")

        ## Make sure `videoName` has got a value, otherwise give it None
        try:
            videoName
        except NameError:
            videoName = None

        if (sport == 'baseball') & (not MyFunctions.is_uuid(inputDict['blob'])):
            ## Get the time to cut from, using the MLB API
            timeToCutUTC = yield context.call_activity(name='CallAPI',
                                                       input_=context._input)
            logging.info('timeToCutUTC acquired from API')
        else:
            ## Make timeToCutUTC a time far in the future (my 100th birthday)
            timeToCutUTC = "2095-03-13 00:00:00.00000"
            logging.info("Not baseball, so distant timeToCutUTC provided")

        ## Get list of frame numbers to convert to JPEGs, ending at `timeToCutUTC`
        ## Use a composite object
        ## - https://docs.microsoft.com/en-us/azure/azure-functions/durable/durable-functions-orchestrations?tabs=python#passing-multiple-parameters
        vidDets = namedtuple('VideoDetails',
                             ['blobDetails', 'timeToCutUTC', 'frameNumberList',
                              'sport', 'event', 'multipleVideoEvent',
                              'samplingProportion'])

        videoDetails = vidDets(blobDetails=context._input,
                               timeToCutUTC=timeToCutUTC,
                               frameNumberList=None,
                               sport=None,
                               event=None,
                               multipleVideoEvent=None,
                               samplingProportion=samplingProportion)
        logging.info("Initial videoDetails object created")

        listOfFrameNumbers = yield context.call_activity(
            name='ReturnFrameNumbers',
            input_=videoDetails)
        logging.info(f'List of {len(json.loads(listOfFrameNumbers))} frame numbers generated')

        # Create images from the list
        MP4toJPEGsoutput = yield context.call_activity(
            name='MP4toJPEGs',
            input_=vidDets(blobDetails=context._input,
                           timeToCutUTC=None,
                           frameNumberList=listOfFrameNumbers,
                           sport=sport,
                           event=event,
                           multipleVideoEvent=multipleVideoEvent,
                           samplingProportion=samplingProportion)
        )

        (imagesCreatedList, imagesCreatedCount,
         imageNames,
         outputContainer, outputBlobStorageAccount) = json.loads(MP4toJPEGsoutput)

        endUTCstr = datetime.strftime(context.current_utc_datetime,
                                      "%Y-%m-%d %H:%M:%S.%f")
        logging.info("Images generated!")

        ## If the AudioTranscript value is True, copy the video to audiotranscript-files
        if (audioTranscript == True) | (audioTranscript == 1):
            viResult = yield context.call_activity(
                "VideoIndex",
                {"fileURL": fileURL}
            )

        ## If an endpointID is provided in `AzureBlobVideos`, add a row to
        ## `ComputerVisionProccessingJobs` for each image
        if endpointID is not None:
            ## If the DatabaseID column is empty in AzureBlobVideos, follow the same
            ## and VERY SLOW old way of doing things
            if databaseID is None:
                logging.info("endpointID given but no databaseID")

                ## Create a composite object to use
                QueueDetails = namedtuple('QueueDetails', [
                    'endpointID', 'sport', 'event', 'blobDetails',
                    'frameNumberList', 'imagesCreatedList', 'imageNames'
                ])

                ocr_result = yield context.call_activity(
                    name="QueueProcessingJobs",
                    input_=QueueDetails(
                        endpointID=endpointID,
                        sport=sport,
                        event=event,
                        blobDetails=context._input,
                        frameNumberList=listOfFrameNumbers,
                        imagesCreatedList=imagesCreatedList,
                        imageNames=imageNames
                    )
                )
            else:
                logging.info("both endpointID and databaseID given")

                ocr_result = yield context.call_activity(
                    name='QueueOcrEvent',
                    input_={
                        'JobCreatedBy': 'FuturesVideoJPEGing',
                        'JobPriority': 10,
                        'ClientDatabaseId': databaseID,
                        'EndpointId': endpointID,
                        'Sport': sport,
                        'SportsEvent': event,
                        'NumberOfImages': len(json.loads(listOfFrameNumbers))
                    }
                )

        ## Add a line to SQL - using another composite object
        UploadDetails = namedtuple('UploadDetails',
                                   ['startUTC', 'endUTC', 'videoID', 'videoName',
                                    'event', 'outputContainer',
                                    'outputBlobStorageAccount', 'imagesCreatedCount'])

        wts_result = yield context.call_activity(
            name='WriteToSQL',
            input_=UploadDetails(
                startUTC=startUTCstr,
                endUTC=endUTCstr,
                videoID=videoID,
                videoName=videoName,
                event=event,
                outputContainer=outputContainer,
                outputBlobStorageAccount=outputBlobStorageAccount,
                imagesCreatedCount=imagesCreatedCount)
        )

        ## Update row status
        MyFunctions.update_row_status(
            rowID=rowID,
            status="Finished"
        )
        logging.info("row updated to `Finished`")

        return f"{ocr_result} & {wts_result}" if endpointID is not None else wts_result

    except Exception as error:
        ## Update row status
        MyFunctions.update_row_status(
            rowID=rowID,
            status="Error"
        )
        logging.info("row updated to `Error`")
        # logging.error(error)

        ## Raise error
        raise Exception("".join(
            traceback.TracebackException.from_exception(error).format()))
def orchestrator_function(context: df.DurableOrchestrationContext):
    # get creds from body
    creds = context.get_input()

    # start etl process with creds
    result1 = yield context.call_activity('RunETL', creds)
    return result1
def main(context: DurableOrchestrationContext):
    retry_twice_opts = RetryOptions(
        first_retry_interval_in_milliseconds=5_000,
        max_number_of_attempts=2
    )

    timestamp = context.current_utc_datetime
    trigger_payload = loads(context.get_input())
    logging.info(f"triggered with payload: {trigger_payload}")

    # ------------------------------------------------------------------------------------
    # Retrieve blob paths
    # ------------------------------------------------------------------------------------
    context.set_custom_status("Retrieving artefacts")
    logging.info("retrieving artefacts")

    task_artefacts = list()
    for task_manifest in housekeeping_tasks:
        logging.info(f"submitting '{task_manifest['label']}' to retriever")
        artefacts = context.call_activity_with_retry(
            "housekeeping_retriever",
            input_=RetrieverPayload(
                timestamp=timestamp.isoformat(),
                environment=trigger_payload['environment'],
                manifest=task_manifest
            ),
            retry_options=retry_twice_opts
        )
        task_artefacts.append(artefacts)

    logging.info("awaiting retriever tasks")
    retrieved_artefacts = yield context.task_all(task_artefacts)

    # ------------------------------------------------------------------------------------
    # Submit for archiving
    # ------------------------------------------------------------------------------------
    context.set_custom_status("Submitting candidates to the archiver")
    logging.info("submitting candidates to the archiver")

    archive_modes = [ProcessMode.ARCHIVE_AND_DISPOSE, ProcessMode.ARCHIVE_ONLY]

    activities = list()
    for task in chain(*retrieved_artefacts):
        logging.info(f"submitting '{task['manifest']['label']}' to archiver")

        if task["manifest"]["mode"] not in archive_modes:
            logging.info("-- not archived")
            continue

        activity = context.call_activity_with_retry(
            "housekeeping_archiver",
            input_=task,
            retry_options=retry_twice_opts
        )
        activities.append(activity)

    logging.info("awaiting archiver tasks")
    archived_artefacts = yield context.task_all(activities)

    # ------------------------------------------------------------------------------------
    # Dispose of archived blobs
    # ------------------------------------------------------------------------------------
    context.set_custom_status("Removing archived data")
    logging.info("removing archived data")

    disposable_only = filter(
        lambda t: t['manifest']['mode'] == ProcessMode.DISPOSE_ONLY,
        chain(*retrieved_artefacts)
    )

    disposal_modes = [
        ProcessMode.ARCHIVE_AND_DISPOSE,
        ProcessMode.DISPOSE_ONLY
    ]

    activities = list()
    for task in chain(archived_artefacts, disposable_only):
        logging.info(f"submitting '{task['manifest']['label']}' to disposer")

        if task["manifest"]["mode"] not in disposal_modes:
            logging.info("-- not disposed")
            continue

        activity = context.call_activity_with_retry(
            "housekeeping_disposer",
            input_=task,
            retry_options=retry_twice_opts
        )
        activities.append(activity)

    logging.info("awaiting disposer tasks")
    report = yield context.task_all(activities)

    # ------------------------------------------------------------------------------------
    context.set_custom_status(
        f"ALL DONE - processed {report['total_processed']} artefacts"
    )

    return f"DONE - {timestamp.isoformat()}"
def orchestrator_function(context: df.DurableOrchestrationContext):
    url = "https://fortesting.azurewebsites.net/api/Function1?code=caATd9U/wemV9vBy7ySFHfiEJCfQr0QZYzCdzGBHvIkWapwdjvjV1g=="
    result = yield context.call_activity('Activity', url)
    return [result]
def orchestrator_function(context: df.DurableOrchestrationContext):
    input = context.get_input()
    runs = input['runs']

    data = {'y': input['y'],
            'x': input['x'],
            'regimes': input['regimes'],
            'maxit': input['maxit'],
            'onesigma': input['onesigma'],
            'disp': input['disp'],
            'convergence': input['convergence']}

    # This section runs the optimizations in parallel.
    parallel_tasks = []
    for r in range(runs):
        data['run_number'] = r
        data_str = json.dumps(data)
        parallel_tasks.append(
            context.call_activity('hmarkov_em_solve', data_str))
    parallel_outputs = yield context.task_all(parallel_tasks)

    # # This section runs sequentially, and it's intended as a comparison
    # # to test the parallelization of the previous section.
    #
    # parallel_outputs = []
    # for r in range(runs):
    #     data['run_number'] = r
    #     data_str = json.dumps(data)
    #     o = yield context.call_activity('hmarkov_em_solve', data_str)
    #     parallel_outputs.append(o)

    outputs = [json.loads(po) for po in parallel_outputs]

    llf_all = np.asarray([o['llf'] for o in outputs], dtype=np.float64)
    llf_argmax = np.argmax(llf_all)
    llf_max = llf_all[llf_argmax]

    # Note we take the maximum of the likelihoods. This is because
    # we're using an iterative routine from James D. Hamilton. If we
    # were calling an optimizer, people would usually minimize the
    # negative log likelihood.
    res = outputs[llf_argmax]
    res['llf_all'] = llf_all.tolist()
    res['llf_max'] = llf_max

    iterations_all = np.asarray([o['iterations'] for o in outputs],
                                dtype=np.float64)
    res['iterations_all'] = iterations_all.tolist()
    res['iterations'] = iterations_all[llf_argmax]

    res['time_start'] = input['start_time']

    function_times = list()
    for o in outputs:
        function_times.append(
            {'run_number': o['run_number'],
             'start_time': o['start_time'],
             'end_time': o['end_time'],
             })
    res['function_times'] = function_times

    if input['disp']:
        logs = [o['log_info'] for o in outputs]
        res['logs_all'] = logs

    # Output paths to be saved to Azure blob.
    full_path = f"{input['account_url']}{input['base_container']}/{input['unique_id']}/"
    res['input_file'] = f'{full_path}inputs.json'
    res['output_file'] = f'{full_path}outputs.json'
    res['chart_file'] = f'{full_path}charts.png' if input['create_charts'] else None
    res['account_url'] = input['account_url']

    # Charts and t-statistics and email if optioned.
    parallel_tasks_2 = []
    if input['tstats']:
        parallel_tasks_2.append(
            context.call_activity('hmarkov_t_statistics', json.dumps(res)))
    if input['create_charts']:
        charts_str = json.dumps(
            {'res': res,
             'container_name': input['base_container'],
             'blob_name': input['unique_id']})
        parallel_tasks_2.append(
            context.call_activity('hmarkov_charts', charts_str))

    if len(parallel_tasks_2) > 0:
        parallel_outputs_2 = yield context.task_all(parallel_tasks_2)

    if input['tstats']:
        tstat_results = json.loads(parallel_outputs_2[0])
        res['t_statistics'] = tstat_results['t_statistics']
        res['t_statistics_time'] = tstat_results['t_statistics_time']
        res['hessian'] = tstat_results['hessian']

    # Save results to blob.
    out = json.dumps(res, indent='\t', sort_keys=True)
    save_params = json.dumps({
        'out': out,
        'account_url': input['account_url'],
        'container': input['base_container'],
        'blob': input['outputs_file_name']})
    yield context.call_activity('hmarkov_save_outputs', save_params)

    # After everything else is done, send an email if optioned.
    if input['notification_email_address'] is not None:
        email_str = json.dumps({'input_file': res['input_file'],
                                'output_file': res['output_file'],
                                'chart_file': res['chart_file'],
                                'time_total': res['time_solve'],
                                'email_address': input['notification_email_address']})
        yield context.call_activity('hmarkov_email', email_str)

    # Return 'None' as a string because the Durable Functions
    # extension seems to work well with strings.
    return 'None'
def orchestrator_function(context: df.DurableOrchestrationContext):
    # --- start transferring data to the google sheet service
    result5 = yield context.call_activity('google_spreadsheets_activity')
    return [result5]
def main(context: DurableOrchestrationContext):
    logging.info("DB ETL orchestrator has been triggered")

    retry_options = RetryOptions(
        first_retry_interval_in_milliseconds=5_000,
        max_number_of_attempts=5
    )

    trigger_payload: str = context.get_input()
    logging.info(f"\tTrigger received: {trigger_payload}")

    trigger_data = loads(trigger_payload)

    timestamp = trigger_data["datestamp"]
    datestamp = trigger_data["datestamp"].split("T")[0]

    if "path" in trigger_data:
        paths = [trigger_data["path"]]
    else:
        paths = [
            # f"daily_chunks/specimen_date_cases/by_age/{datestamp}/",
            # f"daily_chunks/deaths_28days_death_date/by_age/{datestamp}/",
            f"daily_chunks/main/{datestamp}/"
        ]

    category = trigger_data.get("category", "main")

    main_path = trigger_data['main_data_path']
    if main_path.endswith("json"):
        process_name = "MAIN"
    else:
        parsed_path = parse_filepath(main_path)
        process_name = category_label(parsed_path)

    tasks = list()

    if len(timestamp) > 10:
        now = datetime.fromisoformat(timestamp)
    else:
        file_date = datetime.strptime(datestamp, "%Y-%m-%d")
        now = context.current_utc_datetime
        now = datetime(
            year=file_date.year,
            month=file_date.month,
            day=file_date.day,
            hour=now.hour,
            minute=now.minute,
            second=now.second,
            microsecond=now.microsecond
        )

    release_id, timestamp = get_release_id(now, process_name)

    set_file_releaseid(filepath=trigger_data["main_data_path"], release_id=release_id)

    payload = {"timestamp": timestamp.isoformat()}

    for path in paths:
        with StorageClient(container="pipeline", path=path) as client:
            for file in client:
                payload.update({'file_path': file['name']})
                task = context.call_activity_with_retry(
                    "db_etl_upload",
                    retry_options=retry_options,
                    input_=payload
                )
                tasks.append(task)

    _ = yield context.task_all(tasks)
    context.set_custom_status("Upload to database is complete.")

    if category != "main":
        # Categories other than main may have DB level processes. These
        # need to be performed before stats and graphs are generated.
        # Processes for stats and graphs are therefore moved to the chunk
        # processor.
        context.set_custom_status(
            "Chunk deployment is done. Remaining processes are skipped.")
        return f"DONE: {trigger_payload}"

    settings_task = context.call_activity_with_retry(
        'db_etl_update_db',
        input_=dict(
            date=f"{now:%Y-%m-%d}",
            process_name=process_name,
            environment=trigger_data.get('environment', "PRODUCTION")
        ),
        retry_options=retry_options
    )

    graphs_task = context.call_activity_with_retry(
        'db_etl_homepage_graphs',
        input_=dict(
            date=f"{now:%Y-%m-%d}",
            category=category
        ),
        retry_options=retry_options
    )

    _ = yield context.task_all([settings_task, graphs_task])
    context.set_custom_status("Metadata updated / graphs created.")

    return f"DONE: {trigger_payload}"
def orchestrator_function(context: df.DurableOrchestrationContext):
    input_ = context.get_input()
    result1 = yield context.call_activity('Hello', input_)
    return result1