# Code example #1
def create_EMR_cluster(cluster_name, emr_version, subnet_ids):
    """Spin up an EMR cluster and block until it is available.

    The new cluster id is also written to the Airflow xcom sidecar file
    so downstream tasks can pull it.
    """
    new_cluster_id = EmrClusterController.create_cluster_job_execution(
        cluster_name, emr_version, subnet_ids)
    print("Waiting for Cluster: ", new_cluster_id)

    # Expose the cluster id to Airflow via the pod's xcom return file.
    with open("/airflow/xcom/return.json", "w") as xcom_file:
        json.dump({"clusterId": new_cluster_id}, xcom_file)

    return EmrClusterController.wait_for_cluster_creation(new_cluster_id)
# Code example #2
def configure_job(cluster_id, s3_jar_path):
    """Copy the job jar from S3 onto the cluster's master node."""
    copy_step = EmrClusterController.add_job_step(
        cluster_id,
        "Get-Jars",
        "command-runner.jar",
        ['aws', 's3', 'cp', s3_jar_path, "/home/hadoop/"],
    )
    EmrClusterController.wait_for_step_completion(cluster_id, copy_step)
    step_state = EmrClusterController.get_step_status(cluster_id, copy_step)
    if step_state == "COMPLETED":
        print("GET JAR FROM S3 COMPLETED SUCCESSFULLY")
    elif step_state == "FAILED":
        print("GET JAR FROM S3 FAILED")
        raise RuntimeError("Get Jar Failed During Execution: Reason documented in logs probably...?")
# Code example #3
def spark_submit(cluster_id, jar_path):
    """Run a spark-submit step for the given application jar."""
    submit_args = ['spark-submit', '--class', 'com.ricardo.farias.App', jar_path]
    submit_step = EmrClusterController.add_job_step(
        cluster_id, "Spark-Submit", "command-runner.jar", submit_args)
    EmrClusterController.wait_for_step_completion(cluster_id, submit_step)
    step_state = EmrClusterController.get_step_status(cluster_id, submit_step)
    if step_state == "COMPLETED":
        print("SPARK SUBMIT JOB COMPLETED SUCCESSFULLY")
    elif step_state == "FAILED":
        print("SPARK SUBMIT JOB FAILED")
        raise RuntimeError("Spark Job Failed During Execution: Reason documented in logs probably...?")
# Code example #4
def get_credentials(**kwargs):
    """Airflow task: copy AWS credentials from S3 onto the cluster.

    Pulls the cluster id from the 'create_cluster' task's xcom.
    """
    task_instance = kwargs['ti']
    cluster_id = task_instance.xcom_pull(task_ids='create_cluster')
    cp_args = ["aws", "s3", "cp",
               "s3://emr-configuration-scripts/credentials",
               "/home/hadoop/.aws/"]
    cred_step = EmrClusterController.add_job_step(
        cluster_id, "Get-Credentials", "command-runner.jar", cp_args)
    EmrClusterController.wait_for_step_completion(cluster_id, cred_step)
    step_state = EmrClusterController.get_step_status(cluster_id, cred_step)
    if step_state == "COMPLETED":
        print("GET CREDENTIALS FROM S3 COMPLETED SUCCESSFULLY")
    elif step_state == "FAILED":
        print("GET CREDENTIALS FROM S3 FAILED")
        raise RuntimeError("Get Credentials Failed During Execution: Reason documented in logs probably...?")
# Code example #5
def spark_submit(**kwargs):
    """Airflow task: spark-submit the staged application jar.

    Pulls the cluster id from the 'create_cluster' task's xcom.
    """
    task_instance = kwargs['ti']
    cluster_id = task_instance.xcom_pull(task_ids='create_cluster')
    submit_args = ['spark-submit', '--class', 'com.ricardo.farias.App',
                   "/home/hadoop/SparkPractice-assembly-0.1.jar"]
    submit_step = EmrClusterController.add_job_step(
        cluster_id, "Spark-Submit", "command-runner.jar", submit_args)
    EmrClusterController.wait_for_step_completion(cluster_id, submit_step)
    step_state = EmrClusterController.get_step_status(cluster_id, submit_step)
    if step_state == "COMPLETED":
        print("SPARK SUBMIT JOB COMPLETED SUCCESSFULLY")
    elif step_state == "FAILED":
        print("SPARK SUBMIT JOB FAILED")
        raise RuntimeError("Spark Job Failed During Execution: Reason documented in logs probably...?")
# Code example #6
def get_jar(**kwargs):
    """Airflow task: copy the application jar from S3 to the master node.

    Pulls the cluster id from the 'create_cluster' task's xcom.
    """
    task_instance = kwargs['ti']
    cluster_id = task_instance.xcom_pull(task_ids='create_cluster')
    cp_args = ['aws', 's3', 'cp',
               's3://emr-configuration-scripts/SparkPractice-assembly-0.1.jar',
               "/home/hadoop/"]
    jar_step = EmrClusterController.add_job_step(
        cluster_id, "Get-Jars", "command-runner.jar", cp_args)
    EmrClusterController.wait_for_step_completion(cluster_id, jar_step)
    step_state = EmrClusterController.get_step_status(cluster_id, jar_step)
    if step_state == "COMPLETED":
        print("GET JAR FROM S3 COMPLETED SUCCESSFULLY")
    elif step_state == "FAILED":
        print("GET JAR FROM S3 FAILED")
        raise RuntimeError("Get Jar Failed During Execution: Reason documented in logs probably...?")
# Code example #7
def _run_and_check_step(cluster_id, step_id, label, error_msg):
    """Wait for an EMR step to finish and report its terminal status.

    Prints "<label> COMPLETED SUCCESSFULLY" on success; on failure prints
    "<label> FAILED" and raises RuntimeError(error_msg). Any other terminal
    status is silently ignored (matches the original duplicated logic).
    """
    EmrClusterController.wait_for_step_completion(cluster_id, step_id)
    status = EmrClusterController.get_step_status(cluster_id, step_id)
    if status == "FAILED":
        print(label + " FAILED")
        raise RuntimeError(error_msg)
    elif status == "COMPLETED":
        print(label + " COMPLETED SUCCESSFULLY")


def configure_job(cluster_id, data_product):
    """Stage credentials and the data-product jar onto the cluster.

    Runs two EMR steps in sequence: copy AWS credentials from S3, then copy
    the jar selected by *data_product* ('citi_bike' or 'covid').

    Raises:
        RuntimeError: if either step fails, or *data_product* is unknown.
    """
    step_get_credentials = EmrClusterController.add_job_step(
        cluster_id, "Get-Credentials", "command-runner.jar",
        ["aws", "s3", "cp", "s3://art-emr-configuration-scripts/credentials",
         "/home/hadoop/.aws/"])
    _run_and_check_step(
        cluster_id, step_get_credentials, "GET CREDENTIALS FROM S3",
        "Get Credentials Failed During Execution: Reason documented in logs probably...?")

    # Select which assembly jar to stage for this data product.
    if data_product == 'citi_bike':
        s3_jar_path = 's3://art-emr-configuration-scripts/CitiBikeDataProduct-assembly-0.1.jar'
    elif data_product == 'covid':
        s3_jar_path = 's3://art-emr-configuration-scripts/SparkPractice-assembly-0.1.jar'
    else:
        raise RuntimeError("Invalid data_product Option")

    step_id = EmrClusterController.add_job_step(
        cluster_id, "Get-Jars", "command-runner.jar",
        ['aws', 's3', 'cp', s3_jar_path, "/home/hadoop/"])
    _run_and_check_step(
        cluster_id, step_id, "GET JAR FROM S3",
        "Get Jar Failed During Execution: Reason documented in logs probably...?")
# Code example #8
def spark_submit(cluster_id, data_product):
    """Submit the Spark application jar matching *data_product*.

    Raises:
        RuntimeError: for an unknown data_product, or if the step fails.
    """
    # Map each known data product to its staged assembly jar.
    jar_by_product = {
        'citi_bike': '/home/hadoop/CitiBikeDataProduct-assembly-0.1.jar',
        'covid': '/home/hadoop/SparkPractice-assembly-0.1.jar',
    }
    jar_path = jar_by_product.get(data_product)
    if jar_path is None:
        raise RuntimeError("Invalid data_product Option")

    submit_step = EmrClusterController.add_job_step(
        cluster_id, "Spark-Submit", "command-runner.jar",
        ['spark-submit', '--class', 'com.ricardo.farias.App', jar_path])
    EmrClusterController.wait_for_step_completion(cluster_id, submit_step)
    step_state = EmrClusterController.get_step_status(cluster_id, submit_step)
    if step_state == "COMPLETED":
        print("SPARK SUBMIT JOB COMPLETED SUCCESSFULLY")
    elif step_state == "FAILED":
        print("SPARK SUBMIT JOB FAILED")
        raise RuntimeError("Spark Job Failed During Execution: Reason documented in logs probably...?")
# Code example #9
def create_livy_session(**kwargs):
    """Airflow task: run a Scala script on the cluster through Livy.

    Creates a Spark session against the cluster master, submits the
    RddCreation script, tracks it to completion, then kills the session.
    """
    task_instance = kwargs['ti']
    cluster_id = task_instance.xcom_pull(task_ids='create_cluster')

    master_dns = EmrClusterController.get_cluster_dns(cluster_id)
    print(f"\n\n MASTER DNS: {master_dns}")

    create_headers = EmrClusterController.create_spark_session(master_dns)
    print(f"Create Spark Session: {create_headers}")
    session_url = EmrClusterController.wait_for_idle_session(master_dns, create_headers)

    statement = EmrClusterController.submit_statement(
        session_url, "./dags/spark/RddCreation.scala")
    print(f"Spark Command Response: {statement}")
    EmrClusterController.track_statement_progress(master_dns, statement.headers)

    # Always tear the Livy session down once the statement has finished.
    EmrClusterController.kill_spark_session(session_url)
# Code example #10
def create_emr_cluster(**kwargs):
    """Airflow task: launch the Livy EMR cluster and return its id (xcom)."""
    # kwargs is accepted for the Airflow callable signature but unused here.
    return EmrClusterController.create_cluster_job_execution(
        "Livy Cluster", "emr-5.30.0")
# Code example #11
def terminate_cluster(**kwargs):
    """Airflow task: terminate the cluster created by 'create_cluster'."""
    task_instance = kwargs['ti']
    target_cluster = task_instance.xcom_pull(task_ids='create_cluster')
    EmrClusterController.terminate_cluster(target_cluster)
# Code example #12
def wait_for_cluster(**kwargs):
    """Airflow task: block until the 'create_cluster' cluster is ready."""
    task_instance = kwargs['ti']
    pending_cluster = task_instance.xcom_pull(task_ids='create_cluster')
    EmrClusterController.wait_for_cluster_creation(pending_cluster)
# Code example #13
def terminate_cluster(cluster_id):
    """Tear down the EMR cluster identified by *cluster_id*."""
    EmrClusterController.terminate_cluster(cluster_id)