Example #1
def preparation(**kwargs):
    """Reset leftover Airflow Variables and provision the AWS resources the
    cluster needs: VPC/subnet lookup, master and slave security groups, an
    EC2 key pair, and the default EMR roles."""
    # Without this global setting, this DAG on EC2 server got the following error:
    #     UnboundLocalError: local variable 'VPC_ID' referenced before assignment
    global VPC_ID, SUBNET_ID, CLUSTER_NAME
    Variable.delete('cluster_id')
    Variable.delete('keypair_name')
    Variable.delete('master_sg_id')
    Variable.delete('slave_sg_id')
    Variable.delete('short_interests_dag_state')

    ec2, emr, iam = emrs.get_boto_clients(config['AWS']['REGION_NAME'],
                                          config=config)

    if VPC_ID == '':
        VPC_ID = emrs.get_first_available_vpc(ec2)

    if SUBNET_ID == '':
        SUBNET_ID = emrs.get_first_available_subnet(ec2, VPC_ID)

    master_sg_id = emrs.create_security_group(
        ec2, '{}SG'.format(CLUSTER_NAME),
        'Master SG for {}'.format(CLUSTER_NAME), VPC_ID)
    slave_sg_id = emrs.create_security_group(
        ec2, '{}SlaveSG'.format(CLUSTER_NAME),
        'Slave SG for {}'.format(CLUSTER_NAME), VPC_ID)

    Variable.set('master_sg_id', master_sg_id)
    Variable.set('slave_sg_id', slave_sg_id)

    keypair = emrs.create_key_pair(ec2, '{}_pem'.format(CLUSTER_NAME))
    Variable.set('keypair_name', keypair['KeyName'])

    emrs.create_default_roles(iam)
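These snippets rely on module-level imports and globals that are not shown. A minimal sketch of that setup, assuming a configparser-style config file (the file name and fallback values below are assumptions, not part of the original code):

# Sketch of the module-level setup the examples assume. The config file
# path and the fallback values are assumptions.
import configparser
import logging
import time

import boto3
from airflow.exceptions import AirflowException
from airflow.models import Variable

import emrs  # project helper module wrapping the boto3 and Spark-session calls

config = configparser.ConfigParser()
config.read('config.cfg')

# Empty strings mean "look up the first available VPC/subnet at runtime".
VPC_ID = config['AWS'].get('VPC_ID', fallback='')
SUBNET_ID = config['AWS'].get('SUBNET_ID', fallback='')
CLUSTER_NAME = config['AWS'].get('CLUSTER_NAME', fallback='etl-cluster')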
Example #2
def cleanup(**kwargs):
    """Delete the key pair and security groups created during preparation and
    clear the related Airflow Variables."""
    ec2, emr, iam = emrs.get_boto_clients(config['AWS']['REGION_NAME'],
                                          config=config)
    ec2.delete_key_pair(KeyName=Variable.get('keypair_name'))
    emrs.delete_security_group(ec2, Variable.get('master_sg_id'))
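    # Short pause between the two security group deletions so the first
    # delete can settle before the second one runs.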
    time.sleep(2)
    emrs.delete_security_group(ec2, Variable.get('slave_sg_id'))
    Variable.delete('cluster_id')
    Variable.delete('keypair_name')
    Variable.delete('master_sg_id')
    Variable.delete('slave_sg_id')
    Variable.delete('short_interests_dag_state')
Example #3
def submit_spark_job_from_file(**kwargs):
    """Submit a Spark job file through a Spark session on the EMR cluster,
    log its output, and fail the task if the cluster is gone, an upstream DAG
    failed, or the job logs contain '(FAIL)'."""
    ec2, emr, iam = emrs.get_boto_clients(config['AWS']['REGION_NAME'],
                                          config=config)
    if emrs.is_cluster_terminated(emr, Variable.get('cluster_id')):
        Variable.set('combine_dag_state', 'FAILED')
        raise AirflowException("Cluster has been terminated. Redo all DAGs.")

    if Variable.get(
            'short_interests_dag_state', None) == 'FAILED' or Variable.get(
                'prices_dag_state', None) == 'FAILED':
        Variable.set('combine_dag_state', 'FAILED')
        raise AirflowException(
            "Error in short_interests_dag or prices_dag. Redo all DAGs.")

    cluster_dns = emrs.get_cluster_dns(emr, Variable.get('cluster_id'))
    emrs.kill_all_inactive_spark_sessions(cluster_dns)
    session_headers = emrs.create_spark_session(cluster_dns)
    helperspath = kwargs.get('helperspath')
    commonpath = kwargs.get('commonpath')
    emrs.wait_for_spark(cluster_dns, session_headers)
    job_response_headers = emrs.submit_spark_job_from_file(
        cluster_dns,
        session_headers,
        kwargs['filepath'],
        args=kwargs['args'],
        commonpath=commonpath,
        helperspath=helperspath)

    final_status, logs = emrs.track_spark_job(cluster_dns,
                                              job_response_headers)
    emrs.kill_spark_session(cluster_dns, session_headers)
    for line in logs:
        logging.info(line)
        if '(FAIL)' in str(line):
            logging.error(line)
            raise AirflowException("ETL process fails.")

    if final_status in ['available', 'ok'] and 'on_complete' in kwargs:
        # Update CSV file's ACL to public-read
        if 's3a://' in config['App']['DB_HOST'] or 's3://' in config['App'][
                'DB_HOST']:
            bucket = config['App']['DB_HOST'].split('/')[-1]
            key = config['App']['TABLE_SHORT_ANALYSIS'][1:] + '.csv'
            (boto3.session.Session(
                region_name='us-east-1').resource('s3').Object(
                    bucket, key).Acl().put(ACL='public-read'))

        kwargs['on_complete']()
Example #4
def create_cluster(**kwargs):
    """Launch the EMR cluster using the stored security groups and key pair,
    then persist the new cluster id in an Airflow Variable."""
    logging.info("instance type is " +
                 config['AWS']['EMR_CORE_NODE_INSTANCE_TYPE'])
    ec2, emr, iam = emrs.get_boto_clients(config['AWS']['REGION_NAME'],
                                          config=config)
    emrs.wait_for_roles(iam)
    cluster_id = emrs.create_emr_cluster(
        emr,
        CLUSTER_NAME,
        Variable.get('master_sg_id'),
        Variable.get('slave_sg_id'),
        Variable.get('keypair_name'),
        SUBNET_ID,
        num_core_nodes=int(config['AWS']['EMR_NUM_CORE_NODES']),
        core_node_instance_type=config['AWS']['EMR_CORE_NODE_INSTANCE_TYPE'],
        release_label='emr-5.28.1')
    Variable.set('cluster_id', cluster_id)
Example #5
def submit_spark_job_from_file(**kwargs):
    """Variant used for the short-interests job: submits the Spark job file,
    tracks it with a longer polling interval, and records failures in the
    'short_interests_dag_state' Variable."""
    ec2, emr, iam = emrs.get_boto_clients(config['AWS']['REGION_NAME'],
                                          config=config)

    if emrs.is_cluster_terminated(emr, Variable.get('cluster_id', None)):
        Variable.set('short_interests_dag_state', 'FAILED')
        raise AirflowException("Cluster has been terminated. Redo all DAGs.")

    if Variable.get('prices_dag_state', None) == 'FAILED':
        Variable.set('short_interests_dag_state', 'FAILED')
        raise AirflowException("Error in prices_dag. Redo all DAGs.")

    cluster_dns = emrs.get_cluster_dns(emr, Variable.get('cluster_id'))
    emrs.kill_all_spark_sessions(cluster_dns)
    session_headers = emrs.create_spark_session(cluster_dns)
    helperspath = kwargs.get('helperspath')
    commonpath = kwargs.get('commonpath')
    emrs.wait_for_spark(cluster_dns, session_headers)
    job_response_headers = emrs.submit_spark_job_from_file(
        cluster_dns,
        session_headers,
        kwargs['filepath'],
        args=kwargs['args'],
        commonpath=commonpath,
        helperspath=helperspath)

    final_status, logs = emrs.track_spark_job(cluster_dns,
                                              job_response_headers,
                                              sleep_seconds=300)
    emrs.kill_spark_session(cluster_dns, session_headers)
    for line in logs:
        logging.info(line)
        if '(FAIL)' in str(line):
            logging.error(line)
            Variable.set('short_interests_dag_state', 'ERROR')
            raise AirflowException("ETL process fails.")

    if final_status in ['available', 'ok'] and 'on_complete' in kwargs:
        kwargs['on_complete']()
Example #6
def terminate_cluster(**kwargs):
    """Terminate the EMR cluster unless the 'keep_emr_cluster' Variable asks
    to keep it alive."""
    # Airflow Variables are stored as strings, so guard against the literal
    # string 'False' being treated as truthy.
    keep_cluster = Variable.get('keep_emr_cluster', default_var=False)
    if not keep_cluster or str(keep_cluster).lower() == 'false':
        ec2, emr, iam = emrs.get_boto_clients(config['AWS']['REGION_NAME'],
                                              config=config)
        emrs.delete_cluster(emr, Variable.get('cluster_id'))
def terminate_cluster(**kwargs):
    """Variant that always terminates the cluster, regardless of the
    'keep_emr_cluster' Variable."""
    ec2, emr, iam = emrs.get_boto_clients(config['AWS']['REGION_NAME'],
                                          config=config)
    emrs.delete_cluster(emr, Variable.get('cluster_id'))
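These functions are written as Airflow task callables: each takes **kwargs and reads its parameters from it. A minimal, hypothetical sketch of wiring them into a DAG with PythonOperator (Airflow 1.10-style API); the DAG id, task ids, file path, and args value are placeholders, not taken from the original project:

# Hypothetical DAG wiring for the callables above; all ids and paths are
# placeholders.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag = DAG('short_interests_dag', start_date=datetime(2020, 1, 1),
          schedule_interval=None)

prepare = PythonOperator(task_id='preparation', python_callable=preparation,
                         provide_context=True, dag=dag)
create = PythonOperator(task_id='create_cluster', python_callable=create_cluster,
                        provide_context=True, dag=dag)
etl = PythonOperator(task_id='submit_spark_job',
                     python_callable=submit_spark_job_from_file,
                     provide_context=True,
                     op_kwargs={'filepath': 'dags/etl/etl_job.py', 'args': []},
                     dag=dag)
terminate = PythonOperator(task_id='terminate_cluster',
                           python_callable=terminate_cluster,
                           provide_context=True, dag=dag)

prepare >> create >> etl >> terminate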