def main(context: DurableOrchestrationContext):
    logging.info(f"Main ETL orchestrator has been triggered")

    default_retry_opts = RetryOptions(
        first_retry_interval_in_milliseconds=5_000,
        max_number_of_attempts=6
    )

    retry_twice_opts = RetryOptions(
        first_retry_interval_in_milliseconds=5_000,
        max_number_of_attempts=2
    )

    trigger_payload: str = context.get_input()
    logging.info(f"\tTrigger received: {trigger_payload}")

    trigger_data = loads(trigger_payload)
    environment = trigger_data.get("ENVIRONMENT", ENVIRONMENT)

    file_name: str = trigger_data['fileName']

    raw_timestamp = trigger_data.get('timestamp', context.current_utc_datetime.isoformat())[:26]
    logging.info(f"Process timestamp: {raw_timestamp}")

    file_date_raw, _ = raw_timestamp.split("T")
    file_date = datetime.strptime(file_date_raw, "%Y-%m-%d")
    now = context.current_utc_datetime
    now = datetime(
        year=file_date.year,
        month=file_date.month,
        day=file_date.day,
        hour=now.hour,
        minute=now.minute,
        second=now.second,
        microsecond=now.microsecond
    )

    # Registration of production runs is handled separately (via `register_file`);
    # only non-production runs are registered here.
    if not context.is_replaying and environment != "PRODUCTION":
        logging.info(f"Not replaying - registering '{file_name}'")
        register_file(filepath=file_name, timestamp=now, instance_id=context.instance_id)

    if not file_name.endswith("json"):
        context.set_custom_status(f"Identified as non-JSON: {file_name}.")
        _ = yield context.call_sub_orchestrator_with_retry(
            "chunk_etl_orchestrator",
            input_=dumps({
                "fileName": file_name,
                "environment": environment,
            }),
            retry_options=retry_twice_opts
        )

        return f"DONE: {trigger_data}"

    logging.info("Following the main data pathway.")

    # Determine whether the payload is for processing legacy data.
    # NOTE: Legacy data do not get:
    #       - deployed to the database,
    #       - archived,
    #       - a new despatch token.
    is_legacy = trigger_data.get("legacy", False)
    logging.info(f"> Legacy mode: {is_legacy}")

    # Generate retrieve payload
    retrieve_payload = {
        'data_path': file_name,
        'timestamp':  f"{raw_timestamp:0<26}",
        'legacy': is_legacy
    }
    logging.info(
        f'\tTrigger payload parsed - '
        f'processing "{retrieve_payload["data_path"]}" @ '
        f'"{retrieve_payload["timestamp"]}"'
    )

    # Read file and split into chunks by area type
    logging.info('\tStarting the process to retrieve new data')
    area_data_paths = yield context.call_activity_with_retry(
        "main_etl_retrieve_data",
        input_=retrieve_payload,
        retry_options=retry_twice_opts
    )
    logging.info('\tDOWNLOAD COMPLETE')
    context.set_custom_status("Data file has been parsed.")

    # Process chunks
    logging.info('Starting the main process')

    tasks = list()
    for data_path in area_data_paths:
        task = context.call_activity_with_retry(
            "main_etl_processor",
            input_=dict(
                data_path=data_path,
                timestamp=retrieve_payload['timestamp'] + "5Z",
                environment=environment
            ),
            retry_options=default_retry_opts
        )
        tasks.append(task)

    # Await processes
    etl_response = yield context.task_all(tasks)
    logging.info(f'>>> ALL MAIN ETL PROCESSES COMPLETE - length: {len(etl_response)}')
    context.set_custom_status("Main ETL processes are done. Creating box plot.")

    if is_legacy is True:
        context.set_custom_status("Legacy file detected.")
        return f"DONE: {context.current_utc_datetime}"

    _ = yield context.call_activity_with_retry(
        "chunk_etl_postprocessing",
        input_={
            "timestamp": now.isoformat(),
            "environment": environment,
            "category": "main"
        },
        retry_options=retry_twice_opts
    )

    context.set_custom_status("Deploying to the DB.")

    _ = yield context.call_sub_orchestrator_with_retry(
        "db_etl_orchestrator",
        input_=dumps({
            "datestamp": now.isoformat(),
            "environment": ENVIRONMENT,
            "main_data_path": file_name
        }),
        retry_options=retry_twice_opts
    )

    context.set_custom_status("Submitting main postprocessing tasks")
    _ = yield context.call_activity_with_retry(
        "main_etl_postprocessors",
        input_=dict(
            original_path=retrieve_payload['data_path'],
            timestamp=raw_timestamp,
            environment=environment
        ),
        retry_options=retry_twice_opts
    )
    logging.info("Done with latest main_etl_postprocessors.")

    # ====================================================================================

    tasks = list()

    # Retrieve scales
    context.set_custom_status("Requesting latest scale records.")

    area_types = ["nation", "region", "utla", "ltla", "msoa"]

    for area_type in area_types:
        task = context.call_activity_with_retry(
            "rate_scales_worker",
            retry_options=retry_twice_opts,
            input_={
                "type": "RETRIEVE",
                "timestamp": raw_timestamp,
                "area_type": area_type
            }
        )
        tasks.append(task)

    raw_scale_records = yield context.task_all(tasks)
    logging.info("Received latest scale records.")

    # ------------------------------------------------------------------------------------

    context.set_custom_status("Creating post deployment tasks")

    # Concatenate and archive processed data
    archive_response = context.call_activity_with_retry(
        "main_etl_archiver",
        input_=dict(
            results=etl_response,
            original_path=retrieve_payload['data_path'],
            timestamp=retrieve_payload['timestamp'] + "5Z",
            environment=environment
        ),
        retry_options=retry_twice_opts
    )
    logging.info("Created jobs for `main_etl_archiver`")

    # ....................................................................................
    # Pre-populate cache

    populate_cache = context.call_activity_with_retry(
        "cache_prepopulate",
        input_=dict(
            timestamp=raw_timestamp,
            environment=environment
        ),
        retry_options=retry_twice_opts
    )
    logging.info("Created jobs for `cache_prepopulate`")

    # ....................................................................................

    # Send daily report email
    daily_report = context.call_activity(
        "main_etl_daily_report",
        input_=dict(
            legacy=is_legacy,
            timestamp=raw_timestamp,
            environment=environment
        )
    )
    logging.info("Created jobs for `main_etl_daily_report`")

    # ....................................................................................

    tasks = [
        daily_report,
        archive_response,
        populate_cache
    ]

    # ....................................................................................
    # Generate rate scales

    for item in raw_scale_records:
        for record in item['records']:
            task = context.call_activity_with_retry(
                "rate_scales_worker",
                retry_options=retry_twice_opts,
                input_={
                    "type": "GENERATE",
                    "date": file_date_raw,
                    "timestamp": item["timestamp"],
                    "area_type": record['area_type'],
                    "area_code": record['area_code'],
                    "rate": record['rate'],
                    "percentiles": item['percentiles'],
                }
            )
            tasks.append(task)

    logging.info("Created jobs for `rate_scales_worker`")
    # ....................................................................................

    context.set_custom_status("Submitting post deployment tasks")
    _ = yield context.task_all(tasks)
    context.set_custom_status("ALL done.")

    return f"DONE: {trigger_data}"
def orchestrator_function(context: df.DurableOrchestrationContext):
    """This function provides a sample for activity trigger

    Parameters
    ----------
    context: DurableOrchestrationContext
        This context has the past history and the durable orchestration API

    Returns
    -------
    message
        Returns the result of the activity function return values.

    Yields
    -------
    call_activity: str
        Yields, depending on the `json_rule`, to wait on either all
        tasks to complete, or until one of the tasks completes.
    """

    message = []

    ret_bool = yield context.call_activity("ReturnBool", "1")
    message.append(f"ret_bool: {ret_bool} {type(ret_bool)}")

    # Not supported: return value from activity trigger "bytes" is not json serializable!
    # ret_bytes = yield context.call_activity("ReturnBytes", "1b2b3b")
    # message.append(f"ret_bytes : {ret_bytes} {type(ret_bytes)}")

    ret_dict_of_string = yield context.call_activity("ReturnDictOfString",
                                                     "kv")
    message.append(
        f"ret_dict_of_string : {ret_dict_of_string} {type(ret_dict_of_string)}"
    )

    ret_dict_of_string_anno = yield context.call_activity(
        "ReturnDictOfStringWithAnnotation", "kv_anno")
    message.append(
        f"ret_dict_of_string_anno : {ret_dict_of_string_anno} {type(ret_dict_of_string_anno)}"
    )

    ret_float = yield context.call_activity("ReturnFloat", "123.0")
    message.append(f"ret_float : {ret_float} {type(ret_float)}")

    ret_int = yield context.call_activity("ReturnInt", "123")
    message.append(f"ret_int : {ret_int} {type(ret_int)}")

    ret_int_from_float = yield context.call_activity("ReturnIntFromFloat",
                                                     3.14)
    message.append(
        f"ret_int_from_float : {ret_int_from_float} {type(ret_int_from_float)}"
    )

    ret_list_of_float = yield context.call_activity("ReturnListOfFloat", "4.5")
    message.append(
        f"ret_list_of_float : {ret_list_of_float} {type(ret_list_of_float)}")

    ret_list_of_float_anno = yield context.call_activity(
        "ReturnListOfFloatWithAnnotation", "5.6")
    message.append(
        f"ret_list_of_float_anno : {ret_list_of_float_anno} {type(ret_list_of_float_anno)}"
    )

    # Not supported: return value from activity trigger "set" is not json serializable!
    # ret_set_of_int = yield context.call_activity("ReturnSetOfInt", 5)
    # message.append(f"ret_set_of_int : {ret_set_of_int} {type(ret_set_of_int)}")

    ret_string = yield context.call_activity("ReturnString", "simple_string")
    message.append(f"ret_string : {ret_string} {type(ret_string)}")

    return message
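# ------------------------------------------------------------------------------------
# Hedged sketch (the sample's activity code is not shown here): what an activity such
# as "ReturnListOfFloatWithAnnotation" might look like. The body is an assumption; the
# point it illustrates is that activity return values travel back to the orchestrator
# as JSON, which is why the "bytes" and "set" variants above are left commented out.
from typing import List


def main(value: str) -> List[float]:
    # The list is JSON-serialized by the Durable extension and deserialized again in
    # the orchestrator, so `ret_list_of_float_anno` above arrives as a list of floats.
    return [float(value), float(value) + 1.0]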
def orchestrator_function(context: df.DurableOrchestrationContext):
    result = yield context.call_activity('ActivityProcessECG', "")
    return [result]
def orchestrator_function(context: df.DurableOrchestrationContext):
    yield context.call_activity('Test-F1')
    result2 = yield context.call_activity('Test-F2')
    yield context.call_activity('Test-F3', result2)
Example no. 5
def orchestrator_function(context: df.DurableOrchestrationContext):
    outputs = []
    approve = yield context.call_activity("Approval", "Approved")
    reject = yield context.call_activity("Approval", "Rejected")

    return [approve, reject]
def main(context: DurableOrchestrationContext):
    logging.info(f"Chunk ETL orchestrator has been triggered")

    default_retry_opts = RetryOptions(
        first_retry_interval_in_milliseconds=5_000, max_number_of_attempts=6)

    retry_twice_opts = RetryOptions(first_retry_interval_in_milliseconds=5_000,
                                    max_number_of_attempts=2)

    trigger_payload: str = context.get_input()
    logging.info(f"\tTrigger received: {trigger_payload}")

    trigger_data = loads(trigger_payload)

    file_name = trigger_data["fileName"]

    metadata = parse_filepath(file_name)

    if metadata is None:
        # Path pattern does not conform
        # to the defined pattern.
        context.set_custom_status(
            "File name cannot be parsed. Process terminated.")
        return f"DONE: {trigger_data}"

    now = context.current_utc_datetime
    timestamp_raw = datetime.strptime(
        f'{metadata["timestamp"]}{now:%S}.{now:%f}', "%Y%m%d%H%M%S.%f")
    timestamp = timestamp_raw.isoformat()

    main_path = trigger_data['fileName']
    if main_path.endswith("json"):
        process_name = "MAIN"
    else:
        process_name = category_label(metadata)

    msg = (
        f'Starting to upload pre-processed data: '
        f'{metadata["area_type"]}::{metadata["category"]}::{metadata["subcategory"]}'
    )

    if (metadata["area_type"],
            metadata["category"]) == ("MSOA",
                                      "vaccinations-by-vaccination-date"):
        logging.info(msg)
        context.set_custom_status(msg)

        process_name = "MSOA: VACCINATION - EVENT DATE"

        _ = yield context.call_activity_with_retry(
            "chunk_db_direct",
            input_={
                'data_path': file_name,
                'area_type': metadata["area_type"],
                'timestamp': timestamp,
                'process_name': process_name
            },
            retry_options=retry_twice_opts)
        logging.info(f"DONE: {msg}")
        context.set_custom_status(f"DONE: {msg}")

    elif (metadata["area_type"],
          metadata["category"]) == ("MSOA", "cases-by-specimen-date"):
        logging.info(msg)
        context.set_custom_status(msg)

        process_name = "MSOA"

        _ = yield context.call_sub_orchestrator_with_retry(
            "msoa_etl_orchestrator",
            input_=dumps({
                'data_path': file_name,
                'area_type': metadata["area_type"],
                'timestamp': timestamp,
                'process_name': process_name,
                'main_data_path': file_name
            }),
            retry_options=retry_twice_opts)
        logging.info(f"DONE: {msg}")
        context.set_custom_status(f"DONE: {msg}")

    else:
        # Read file and split into chunks
        # by area type / area code combination.
        logging.info('\tStarting the process to retrieve new data')
        context.set_custom_status("Parsing the payload")
        area_data_paths = yield context.call_activity_with_retry(
            "chunk_etl_retriever",
            input_={
                'path': file_name,
                'date': metadata["date"],
                'area_type': metadata["area_type"],
                'category': metadata["category"],
                'subcategory': metadata["subcategory"],
                'timestamp': timestamp
            },
            retry_options=retry_twice_opts)
        logging.info('\tDOWNLOAD COMPLETE')
        context.set_custom_status("Payload has been parsed")

        # Process chunks
        logging.info('Starting the main process')

        tasks = list()
        context.set_custom_status("Submitting main ETL processes")

        # Create ETL tasks based on the paths
        # returned by `chunk_etl_retriever`.
        for item in area_data_paths:
            data_path = item.pop("path")
            task = context.call_activity_with_retry(
                "chunk_etl_processor",
                input_=dict(base=dict(data_path=data_path,
                                      timestamp=timestamp,
                                      environment="PRODUCTION"),
                            **item),
                retry_options=default_retry_opts)
            tasks.append(task)

        context.set_custom_status("Awaiting ETL processes")
        # Await processes
        etl_response = yield context.task_all(tasks)
        logging.info(
            f'>>> ALL MAIN ETL PROCESSES COMPLETE - length: {len(etl_response)}'
        )

        chunks_path = f"daily_chunks/{metadata['category']}/{metadata['date']}/"
        if metadata['subcategory']:
            chunks_path = f"daily_chunks/{metadata['category']}/{metadata['subcategory']}/{metadata['date']}/"

        # Deploy processed data to the DB.
        context.set_custom_status(f"Deploying to the database: {chunks_path}")

        _ = yield context.call_sub_orchestrator_with_retry(
            "db_etl_orchestrator",
            input_=dumps({
                "datestamp": metadata['date'],
                "path": chunks_path,
                "environment": "PRODUCTION",
                "area_type": metadata['area_type'],
                "category": metadata['category'],
                "subcategory": metadata['subcategory'],
                "main_data_path": file_name
            }),
            retry_options=retry_twice_opts)

    context.set_custom_status("Postprocessing")
    _ = yield context.call_activity_with_retry(
        "chunk_etl_postprocessing",
        input_={
            "timestamp": timestamp,
            "environment": "PRODUCTION",
            "category": metadata['category'],
            "subcategory": (metadata['subcategory']
                            if metadata['subcategory'] != "" else None),
            "area_type": (metadata['area_type']
                          if metadata['area_type'] != "" else None)
        },
        retry_options=retry_twice_opts)
    context.set_custom_status(
        "Deployment to the DB is complete, submitting postprocessing tasks.")

    settings_task = context.call_activity_with_retry(
        'db_etl_update_db',
        input_=dict(date=f"{timestamp_raw:%Y-%m-%d}",
                    process_name=process_name,
                    environment=trigger_data['environment']),
        retry_options=retry_twice_opts)

    context.set_custom_status("Submitting main postprocessing tasks")
    post_processes = context.call_activity_with_retry(
        "main_etl_postprocessors",
        input_=dict(original_path=file_name,
                    timestamp=timestamp,
                    environment=trigger_data['environment']),
        retry_options=retry_twice_opts)

    graphs_task = context.call_activity_with_retry(
        'db_etl_homepage_graphs',
        input_=dict(date=f"{timestamp_raw:%Y-%m-%d}",
                    category=metadata['category'],
                    subcategory=metadata['subcategory']),
        retry_options=retry_twice_opts)

    _ = yield context.task_all([graphs_task, settings_task, post_processes])

    context.set_custom_status(
        "Metadata updated / graphs created / main postprocessing tasks complete. ALL DONE."
    )

    return f"DONE: {trigger_data}"
Example no. 7
def orchestrator_function(context: df.DurableOrchestrationContext):
    x = yield context.call_activity("F1", None)
    y = yield context.call_activity("F2", x)
    z = yield context.call_activity("F3", y)
    result = yield context.call_activity("F4", z)
    return result
def orchestrator_function(context: df.DurableOrchestrationContext):
    result1 = yield context.call_activity('E1_SayHello', "Tokyo")
    result2 = yield context.call_activity('E1_SayHello', "Seattle")
    result3 = yield context.call_activity('E1_SayHello', "London")
    return [result1, result2, result3]
Example no. 9
def orchestrator_function(context: df.DurableOrchestrationContext):
    uri = context.get_input()
    result = yield context.call_http('GET', uri=uri)
    return [result, ]
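# ------------------------------------------------------------------------------------
# Hedged sketch (not taken from the examples above): `call_http` also accepts a body
# and headers, so the same pattern extends to POST requests with a JSON payload. The
# endpoint, header values and payload shape below are placeholders.
import json

import azure.durable_functions as df


def orchestrator_function(context: df.DurableOrchestrationContext):
    payload = context.get_input()
    response = yield context.call_http(
        "POST",
        uri="https://example.org/api/ingest",  # placeholder endpoint
        content=json.dumps(payload),
        headers={"Content-Type": "application/json"}
    )
    return [response]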
def main(context: DurableOrchestrationContext):
    logging.info(f"Despatch ops orchestrator has been triggered")

    trigger_payload: str = context.get_input()
    logging.info(f"\tTrigger received: {trigger_payload}")

    retry_options = RetryOptions(
        first_retry_interval_in_milliseconds=5_000,
        max_number_of_attempts=5
    )

    trigger_data = loads(trigger_payload)

    devices = [Device.desktop, Device.mobile]
    area_types = ["utla", "ltla", "msoa"]

    tasks = list()
    for area_type, device in product(area_types, devices):
        task = context.call_activity_with_retry(
            "despatch_ops_workers",
            retry_options=retry_options,
            input_={
                "handler": "map_geojson",
                "payload": {
                    "area_type": area_type,
                    "device": device,
                    "timestamp": trigger_data["timestamp"]
                }
            }
        )

        tasks.append(task)

    task = context.call_activity_with_retry(
        "despatch_ops_workers",
        retry_options=retry_options,
        input_={
            "handler": "vax_map_geojson",
            "payload": {"timestamp": trigger_data["timestamp"]}
        }
    )

    tasks.append(task)

    area_types = ["nation", "region", "utla", "ltla", "msoa"]
    for area_type in area_types:
        task = context.call_activity_with_retry(
            "despatch_ops_workers",
            retry_options=retry_options,
            input_={
                "handler": "map_percentiles",
                "payload": {
                    "area_type": area_type,
                    "timestamp": trigger_data["timestamp"]
                }
            }
        )

        tasks.append(task)

    task = context.call_activity_with_retry(
        "despatch_ops_workers",
        retry_options=retry_options,
        input_={
            "handler": "archive_dates",
            "payload": {
                "data_type": "MAIN",
                "timestamp": trigger_data["timestamp"]
            }}
    )
    tasks.append(task)

    task = context.call_activity_with_retry(
        "despatch_ops_workers",
        retry_options=retry_options,
        input_={
            "handler": "og_images",
            "payload": {"timestamp": trigger_data["timestamp"]}
        }
    )
    tasks.append(task)

    task = context.call_activity_with_retry(
        "despatch_ops_workers",
        retry_options=retry_options,
        input_={
            "handler": "sitemap",
            "payload": {"timestamp": trigger_data["timestamp"]}
        }
    )
    tasks.append(task)

    task = context.call_activity_with_retry(
        "despatch_ops_workers",
        retry_options=retry_options,
        input_={
            "handler": "landing_page_map",
            "payload": {"timestamp": trigger_data["timestamp"]}
        }
    )
    tasks.append(task)

    context.set_custom_status("All jobs created - submitting for execution.")
    _ = yield context.task_all(tasks)

    context.set_custom_status("All jobs complete - updating timestamps.")

    tasks = list()
    for item in ReleaseTimestamps:
        processor_fn = item["process"]
        value = processor_fn(trigger_data["releaseTimestamp"])

        task = context.call_activity_with_retry(
            "despatch_ops_release",
            retry_options=retry_options,
            input_={
                "path": item["path"],
                "container": item["container"],
                "value": value
            }
        )

        tasks.append(task)

    _ = yield context.task_all(tasks)
    context.set_custom_status("Timestamps updated - clearing Redis cache.")

    _ = yield context.call_sub_orchestrator_with_retry(
        'cache_buster_orchestrator',
        input_=dumps({
            "to": FLUSH_DESPATCH,
            "timestamp": trigger_data.get('timestamp')
        }),
        retry_options=retry_options
    )

    context.set_custom_status(f"ALL DONE: {trigger_payload}")

    return f"ALL DONE: {trigger_payload}"
Example no. 11
def orchestrator_function(context: df.DurableOrchestrationContext):
    result = yield context.call_activity(
        'CalculateTurkeyBrineEquationAndRoastRecommendation', 7)
    return result
Example no. 12
def orchestrator_function(context: df.DurableOrchestrationContext):
    logging.info("Orchestrator started")

    ## Make sure the mp4 is in the right container
    inputDict = json.loads(context._input)
    logging.info(f"inputDict: {inputDict}")
    # _container_ = inputDict['container']
    fileURL = inputDict['fileUrl']
    rowID = inputDict['RowID']
    try:
        # imagesAlreadyCreated = inputDict['imagesAlreadyCreated']
        MyFunctions.update_row_status(
            rowID=rowID,
            status=f'In Progress - {os.getenv("appName")}'
        )

        startUTCstr = datetime.strftime(context.current_utc_datetime,
                                        "%Y-%m-%d %H:%M:%S.%f")
        ## Get AzureBlobVideos table from SQL, in dict form
        abv = MyFunctions.getAzureBlobVideos2()
        logging.info(f"AzureBlobVideos table retrieved, rows: {len(abv)}")

        ## If the video name is in the dict, extract the information
        try:
            videoName0 = inputDict['blob']
            ## If last 11 characters (excluding '.mp4') follow '-YYYY-MM-DD'
            ##    then remove them
            videoName = MyFunctions.cleanUpVidName(videoName0)
            ## Get relevant sport and event name for the video (excluding '.mp4')
            (videoID,sport,event,
            endpointID,multipleVideoEvent,
            samplingProportion,audioTranscript,
            databaseID) = abv[videoName[:-4]]
            ## Convert databaseID to None if it has been left empty rather than NULL
            databaseID = None if databaseID == "" else databaseID
            for metric,value in [
                ("videoID",videoID),
                ("sport",sport),
                ("event",event),
                ("endpointID",endpointID),
                ("multipleVideoEvent",multipleVideoEvent),
                ("samplingProportion",samplingProportion),
                ("audioTranscript",audioTranscript),
                ('databaseID',databaseID)
            ]:
                logging.info(f"{metric}: {value}")
            ## Correct samplingProportion from nan to None if needed
            if pd.isna(samplingProportion):
                samplingProportion = None
                logging.info("samplingProportion changed")
                logging.info(f"samplingProportion: {samplingProportion}")
            else:
                logging.info("this is not True: pd.isna(samplingProportion)")
        except KeyError:
            videoID = None
            sport = None
            event = None
            endpointID = None
            multipleVideoEvent = None
            samplingProportion = None
            audioTranscript = None
            databaseID = None
            logging.info("Video not in AzureBlobVideos so relevant values assigned None")

        ## Make sure `videoName` has got a value, otherwise give it None
        try:
            videoName
        except NameError:
            videoName = None

        if (sport == 'baseball') & (not MyFunctions.is_uuid(inputDict['blob'])):
            ## Get time to cut from, using MLB API
            timeToCutUTC = yield context.call_activity(name='CallAPI',
                                                        input_=context._input)
            logging.info('timeToCutUTC acquired from API')
        else:
            ## Make timeToCutUTC a time far in the future (my 100th birthday)
            timeToCutUTC = "2095-03-13 00:00:00.00000"
            logging.info("Not baseball, so distant timeToCutUTC provided")

        ## Get list of frame numbers to convert to JPEGs, ending at `timeToCutUTC`
        ##    Use composite object
        ##     - https://docs.microsoft.com/en-us/azure/azure-functions/durable/durable-functions-orchestrations?tabs=python#passing-multiple-parameters
        vidDets = namedtuple('VideoDetails',
                            ['blobDetails',
                            'timeToCutUTC',
                            'frameNumberList',
                            'sport',
                            'event',
                            'multipleVideoEvent',
                            'samplingProportion'])
        videoDetails = vidDets(blobDetails=context._input,
                                timeToCutUTC=timeToCutUTC,
                                frameNumberList=None,
                                sport=None,
                                event=None,
                                multipleVideoEvent=None,
                                samplingProportion=samplingProportion)
        logging.info("Initial videoDetails object created")
        listOfFrameNumbers = yield context.call_activity(
                                        name='ReturnFrameNumbers',
                                        input_=videoDetails)
        logging.info(f'List of {len(json.loads(listOfFrameNumbers))} frame numbers generated')

        # Create images from list
        MP4toJPEGsoutput = yield context.call_activity(
                                        name='MP4toJPEGs',
                                        input_=vidDets(blobDetails=context._input,
                                                        timeToCutUTC=None,
                                                        frameNumberList=listOfFrameNumbers,
                                                        sport=sport,
                                                        event=event,
                                                        multipleVideoEvent=multipleVideoEvent,
                                                        samplingProportion=samplingProportion)
                                                )
        (imagesCreatedList,imagesCreatedCount,
            imageNames,
            outputContainer,outputBlobStorageAccount) = json.loads(MP4toJPEGsoutput)
        endUTCstr = datetime.strftime(context.current_utc_datetime,
                                        "%Y-%m-%d %H:%M:%S.%f")
        logging.info("Images generated!")

        ## If AudioTranscript value is True, copy the video to audiotranscript-files
        if audioTranscript in (True, 1):
            viResult = yield context.call_activity(
                "VideoIndex",
                {
                    "fileURL" : fileURL
                }
            )


        ## If endpointID provided in `AzureBlobVideos`, add row to `ComputerVisionProccessingJobs` for each image
        if endpointID is not None:
            ## If DatabaseID column empty in AzureBlobVideo, follow the same
            ##    and VERY SLOW old way of doing things
            if databaseID is None:
                logging.info("endpointID given but no databaseID")
                ## Create composite object to use
                QueueDetails = namedtuple('QueueDetails',
                                    [
                                        'endpointID',
                                        'sport',
                                        'event',
                                        'blobDetails',
                                        'frameNumberList',
                                        'imagesCreatedList',
                                        'imageNames'
                                    ])
                ocr_result = yield context.call_activity(
                    name="QueueProcessingJobs",
                    input_=QueueDetails(
                        endpointID=endpointID,
                        sport=sport,
                        event=event,
                        blobDetails=context._input,
                        frameNumberList=listOfFrameNumbers,
                        imagesCreatedList=imagesCreatedList,
                        imageNames=imageNames
                    )
                )
            else:
                logging.info("both endpointID and databaseID given")
                ocr_result = yield context.call_activity(
                    name='QueueOcrEvent',
                    input_={
                        'JobCreatedBy' : 'FuturesVideoJPEGing',
                        'JobPriority' : 10,
                        'ClientDatabaseId' : databaseID,
                        'EndpointId' : endpointID,
                        'Sport' : sport,
                        'SportsEvent' : event,
                        'NumberOfImages' : len(json.loads(listOfFrameNumbers))
                    }
                )


        ## Add line to SQL - using another composite object
        UploadDetails = namedtuple('UploadDetails',
                            ['startUTC',
                            'endUTC',
                            'videoID',
                            'videoName',
                            'event',
                            'outputContainer',
                            'outputBlobStorageAccount',
                            'imagesCreatedCount'])
        wts_result = yield context.call_activity(
                                        name='WriteToSQL',
                                        input_=UploadDetails(
                                                        startUTC=startUTCstr,
                                                        endUTC=endUTCstr,
                                                        videoID=videoID,
                                                        videoName=videoName,
                                                        event=event,
                                                        outputContainer=outputContainer,
                                                        outputBlobStorageAccount=outputBlobStorageAccount,
                                                        imagesCreatedCount=imagesCreatedCount)
                                                )

        ## Update row status
        MyFunctions.update_row_status(
            rowID=rowID,
            status="Finished"
        )
        logging.info("row updated to `Finished`")                                     

        return f"{ocr_result} & {wts_result}" if endpointID is not None else wts_result
    
    except Exception as error:
        ## Update row status
        MyFunctions.update_row_status(
            rowID=rowID,
            status="Error"
        )
        logging.info("row updated to `Error`")
        # logging.error(error)
        ## Raise error
        raise Exception("".join(traceback.TracebackException.from_exception(error).format()))
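# ------------------------------------------------------------------------------------
# Hedged sketch: `MyFunctions.cleanUpVidName` and `MyFunctions.is_uuid` are project
# helpers that are not shown here. Based on the comments above ("If last 11 characters
# (excluding '.mp4') follow '-YYYY-MM-DD' then remove them"), they might look roughly
# like this; treat the details as assumptions.
import re
from uuid import UUID


def cleanUpVidName(video_name: str) -> str:
    # Strip a trailing "-YYYY-MM-DD" date suffix that sits before the ".mp4" extension.
    match = re.fullmatch(r"(?P<stem>.+)-\d{4}-\d{2}-\d{2}\.mp4", video_name)
    if match:
        return f"{match['stem']}.mp4"
    return video_name


def is_uuid(value: str) -> bool:
    # Generic uploads are assumed to be named with a UUID; named event videos are not.
    stem = value.rsplit(".", 1)[0]
    try:
        UUID(stem)
        return True
    except ValueError:
        return False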
Example no. 13
def orchestrator_function(context: df.DurableOrchestrationContext):
    # get creds from body
    creds = context.get_input()
    # start etl process with creds
    result1 = yield context.call_activity('RunETL', creds)
    return result1
def main(context: DurableOrchestrationContext):
    retry_twice_opts = RetryOptions(first_retry_interval_in_milliseconds=5_000,
                                    max_number_of_attempts=2)

    timestamp = context.current_utc_datetime
    trigger_payload = loads(context.get_input())

    logging.info(f"triggered with payload: {trigger_payload}")

    # ------------------------------------------------------------------------------------
    # Retrieve blob paths
    # ------------------------------------------------------------------------------------
    context.set_custom_status("Retrieving artefacts")
    logging.info("retrieving artefacts")

    task_artefacts = list()

    for task_manifest in housekeeping_tasks:
        logging.info(f"submitting '{task_manifest['label']}' to retriever")

        artefacts = context.call_activity_with_retry(
            "housekeeping_retriever",
            input_=RetrieverPayload(timestamp=timestamp.isoformat(),
                                    environment=trigger_payload['environment'],
                                    manifest=task_manifest),
            retry_options=retry_twice_opts)

        task_artefacts.append(artefacts)

    logging.info("awaiting retriever tasks")
    retrieved_artefacts = yield context.task_all(task_artefacts)

    # ------------------------------------------------------------------------------------
    # Submit for archiving
    # ------------------------------------------------------------------------------------
    context.set_custom_status("Submitting candidates to the archiver")
    logging.info("submitting candidates to the archiver")

    archive_modes = [ProcessMode.ARCHIVE_AND_DISPOSE, ProcessMode.ARCHIVE_ONLY]
    activities = list()

    for task in chain(*retrieved_artefacts):
        logging.info(f"submitting '{task['manifest']['label']}' to archiver")

        if task["manifest"]["mode"] not in archive_modes:
            logging.info("-- not archived")
            continue

        activity = context.call_activity_with_retry(
            "housekeeping_archiver",
            input_=task,
            retry_options=retry_twice_opts)
        activities.append(activity)

    logging.info("awaiting archiver tasks")
    archived_artefacts = yield context.task_all(activities)

    # ------------------------------------------------------------------------------------
    # Dispose of archived blobs
    # ------------------------------------------------------------------------------------
    context.set_custom_status("Removing archived data")
    logging.info("removing archived data")

    disposable_only = filter(
        lambda t: t['manifest']['mode'] == ProcessMode.DISPOSE_ONLY,
        chain(*retrieved_artefacts))

    disposal_modes = [
        ProcessMode.ARCHIVE_AND_DISPOSE, ProcessMode.DISPOSE_ONLY
    ]
    activities = list()

    for task in chain(archived_artefacts, disposable_only):
        logging.info(f"submitting '{task['manifest']['label']}' to disposer")

        if task["manifest"]["mode"] not in disposal_modes:
            logging.info("-- not disposed")
            continue

        activity = context.call_activity_with_retry(
            "housekeeping_disposer",
            input_=task,
            retry_options=retry_twice_opts)
        activities.append(activity)

    logging.info("awaiting disposer tasks")
    report = yield context.task_all(activities)

    # ------------------------------------------------------------------------------------

    # `task_all` returns a list of results (one per disposer task), so report the
    # number of processed artefacts rather than indexing the list like a dict.
    context.set_custom_status(
        f"ALL DONE - processed {len(report)} artefacts")

    return f"DONE - {timestamp.isoformat()}"
Example no. 15
def orchestrator_function(context: df.DurableOrchestrationContext):
    url = "https://fortesting.azurewebsites.net/api/Function1?code=caATd9U/wemV9vBy7ySFHfiEJCfQr0QZYzCdzGBHvIkWapwdjvjV1g=="

    result = yield context.call_activity('Activity', url)
    return [result]
Example no. 16
def orchestrator_function(context: df.DurableOrchestrationContext):
    input = context.get_input()

    runs = input['runs']

    data = {'y': input['y'], 'x': input['x'], 'regimes': input['regimes'], 'maxit': input['maxit'],
            'onesigma': input['onesigma'], 'disp': input['disp'], 'convergence': input['convergence']}

    # This section runs the optimizations in parallel.
    parallel_tasks = []
    for r in range(runs):
        data['run_number'] = r
        data_str = json.dumps(data)

        parallel_tasks.append(
            context.call_activity('hmarkov_em_solve', data_str))

    parallel_outputs = yield context.task_all(parallel_tasks)

    # # This section runs sequentially, and it's intended as a comparison
    # # to test the
    # # parallelization of the previous section.
    #
    # parallel_outputs = []
    # for r in range(runs):
    #     data['run_number'] = r
    #     data_str = json.dumps(data)

    #     o = yield context.call_activity('hmarkov_em_solve', data_str)
    #     parallel_outputs.append(o)

    outputs = [json.loads(po) for po in parallel_outputs]

    llf_all = np.asarray([o['llf'] for o in outputs], dtype=np.float64)

    llf_argmax = np.argmax(llf_all)
    llf_max = llf_all[llf_argmax]

    # Note we take the maximum of the likelihoods. This is because
    # we're using an iterative routine from James D. Hamilton. If we
    # were calling an optimizer, people would usually minimize the
    # negative log likelihood.
    res = outputs[llf_argmax]
    res['llf_all'] = llf_all.tolist()
    res['llf_max'] = llf_max

    iterations_all = np.asarray([o['iterations']
                                 for o in outputs], dtype=np.float64)

    res['iterations_all'] = iterations_all.tolist()
    res['iterations'] = iterations_all[llf_argmax]

    res['time_start'] = input['start_time']

    function_times = list()
    for o in outputs:
        function_times.append(
            {'run_number': o['run_number'], 'start_time': o['start_time'], 'end_time': o['end_time'], })

    res['function_times'] = function_times

    if input['disp']:
        logs = [o['log_info'] for o in outputs]
        res['logs_all'] = logs

    # Output  paths to be saved to Azure blob.
    full_path = f"{input['account_url']}{input['base_container']}/{input['unique_id']}/"

    res['input_file'] = f'{full_path}inputs.json'
    res['output_file'] = f'{full_path}outputs.json'
    res['chart_file'] = f'{full_path}charts.png' if input['create_charts'] else None
    res['account_url'] = input['account_url']

    # Charts and t-statistics and email if optioned.
    parallel_tasks_2 = []
    if input['tstats']:
        parallel_tasks_2.append(
            context.call_activity('hmarkov_t_statistics', json.dumps(res)))

    if input['create_charts']:
        charts_str = json.dumps(
            {'res': res, 'container_name': input['base_container'], 'blob_name': input['unique_id']})

        parallel_tasks_2.append(
            context.call_activity('hmarkov_charts', charts_str))

    if len(parallel_tasks_2) > 0:
        parallel_outputs_2 = yield context.task_all(parallel_tasks_2)

    if input['tstats']:
        tstat_results = json.loads(parallel_outputs_2[0])

        res['t_statistics'] = tstat_results['t_statistics']
        res['t_statistics_time'] = tstat_results['t_statistics_time']
        res['hessian'] = tstat_results['hessian']

    # Save results to blob.
    out = json.dumps(res, indent='\t', sort_keys=True)

    save_params = json.dumps({
        'out': out, 'account_url': input['account_url'], 'container': input['base_container'], 'blob': input['outputs_file_name']})

    yield context.call_activity('hmarkov_save_outputs', save_params)

    # After everything else is done, send an email if optioned.
    if input['notification_email_address'] is not None:
        email_str = json.dumps({'input_file': res['input_file'], 'output_file': res['output_file'], 'chart_file': res['chart_file'],
                                'time_total': res['time_solve'], 'email_address': input['notification_email_address']})

        yield context.call_activity('hmarkov_email', email_str)

    # return None as a string because the Durable Function
    # extension seems to work well with strings.
    return 'None'
Example no. 17
def orchestrator_function(context: df.DurableOrchestrationContext):
    # --- start transferring data to the google sheet service
    result5 = yield context.call_activity('google_spreadsheets_activity')

    return [result5]
Example no. 18
def main(context: DurableOrchestrationContext):
    logging.info(f"DB ETL orchestrator has been triggered")

    retry_options = RetryOptions(first_retry_interval_in_milliseconds=5_000,
                                 max_number_of_attempts=5)

    trigger_payload: str = context.get_input()
    logging.info(f"\tTrigger received: {trigger_payload}")

    trigger_data = loads(trigger_payload)

    timestamp = trigger_data["datestamp"]
    datestamp = trigger_data["datestamp"].split("T")[0]

    if "path" in trigger_data:
        paths = [trigger_data["path"]]
    else:
        paths = [
            # f"daily_chunks/specimen_date_cases/by_age/{datestamp}/",
            # f"daily_chunks/deaths_28days_death_date/by_age/{datestamp}/",
            f"daily_chunks/main/{datestamp}/"
        ]

    category = trigger_data.get("category", "main")

    main_path = trigger_data['main_data_path']
    if main_path.endswith("json"):
        process_name = "MAIN"
    else:
        parsed_path = parse_filepath(main_path)
        process_name = category_label(parsed_path)

    tasks = list()

    if len(timestamp) > 10:
        now = datetime.fromisoformat(timestamp)
    else:
        file_date = datetime.strptime(datestamp, "%Y-%m-%d")
        now = context.current_utc_datetime
        now = datetime(year=file_date.year,
                       month=file_date.month,
                       day=file_date.day,
                       hour=now.hour,
                       minute=now.minute,
                       second=now.second,
                       microsecond=now.microsecond)

    release_id, timestamp = get_release_id(now, process_name)
    set_file_releaseid(filepath=trigger_data["main_data_path"],
                       release_id=release_id)

    payload = {"timestamp": timestamp.isoformat()}

    for path in paths:
        with StorageClient(container="pipeline", path=path) as client:
            for file in client:
                # Build a fresh payload per file so each upload task carries its
                # own `file_path` instead of a shared, mutated dict.
                task = context.call_activity_with_retry(
                    "db_etl_upload",
                    retry_options=retry_options,
                    input_={**payload, "file_path": file["name"]})
                tasks.append(task)

    _ = yield context.task_all(tasks)
    context.set_custom_status("Upload to database is complete.")

    if category != "main":
        # Categories other than main may have DB level processes. These
        # need to be performed before stats and graphs are generated.
        # Processes for stats and graphs are therefore moved to chunk
        # processor.
        context.set_custom_status(
            "Chunk deployment is done. Remaining processes are skipped.")
        return f"DONE: {trigger_payload}"

    settings_task = context.call_activity_with_retry(
        'db_etl_update_db',
        input_=dict(date=f"{now:%Y-%m-%d}",
                    process_name=process_name,
                    environment=trigger_data.get('environment', "PRODUCTION")),
        retry_options=retry_options)

    graphs_task = context.call_activity_with_retry('db_etl_homepage_graphs',
                                                   input_=dict(
                                                       date=f"{now:%Y-%m-%d}",
                                                       category=category),
                                                   retry_options=retry_options)

    _ = yield context.task_all([settings_task, graphs_task])

    context.set_custom_status("Metadata updated / graphs created.")

    return f"DONE: {trigger_payload}"
Example no. 19
def orchestrator_function(context: df.DurableOrchestrationContext):
    input_ = context.get_input()
    result1 = yield context.call_activity('Hello', input_)
    return result1