def preparation(**kwargs):
    # Without this global statement, running this DAG on an EC2 server raised:
    # UnboundLocalError: local variable 'VPC_ID' referenced before assignment
    global VPC_ID, SUBNET_ID, CLUSTER_NAME
    Variable.delete('cluster_id')
    Variable.delete('keypair_name')
    Variable.delete('master_sg_id')
    Variable.delete('slave_sg_id')
    Variable.delete('short_interests_dag_state')
    ec2, emr, iam = emrs.get_boto_clients(config['AWS']['REGION_NAME'],
                                          config=config)
    if VPC_ID == '':
        VPC_ID = emrs.get_first_available_vpc(ec2)
    if SUBNET_ID == '':
        SUBNET_ID = emrs.get_first_available_subnet(ec2, VPC_ID)
    master_sg_id = emrs.create_security_group(
        ec2, '{}SG'.format(CLUSTER_NAME),
        'Master SG for {}'.format(CLUSTER_NAME), VPC_ID)
    slave_sg_id = emrs.create_security_group(
        ec2, '{}SlaveSG'.format(CLUSTER_NAME),
        'Slave SG for {}'.format(CLUSTER_NAME), VPC_ID)
    Variable.set('master_sg_id', master_sg_id)
    Variable.set('slave_sg_id', slave_sg_id)
    keypair = emrs.create_key_pair(ec2, '{}_pem'.format(CLUSTER_NAME))
    Variable.set('keypair_name', keypair['KeyName'])
    emrs.create_default_roles(iam)
def cleanup(**kwargs):
    ec2, emr, iam = emrs.get_boto_clients(config['AWS']['REGION_NAME'],
                                          config=config)
    ec2.delete_key_pair(KeyName=Variable.get('keypair_name'))
    emrs.delete_security_group(ec2, Variable.get('master_sg_id'))
    time.sleep(2)
    emrs.delete_security_group(ec2, Variable.get('slave_sg_id'))
    Variable.delete('cluster_id')
    Variable.delete('keypair_name')
    Variable.delete('master_sg_id')
    Variable.delete('slave_sg_id')
    Variable.delete('short_interests_dag_state')
def submit_spark_job_from_file(**kwargs):
    ec2, emr, iam = emrs.get_boto_clients(config['AWS']['REGION_NAME'],
                                          config=config)
    if emrs.is_cluster_terminated(emr, Variable.get('cluster_id')):
        Variable.set('combine_dag_state', 'FAILED')
        raise AirflowException("Cluster has been terminated. Redo all DAGs.")
    if (Variable.get('short_interests_dag_state', None) == 'FAILED'
            or Variable.get('prices_dag_state', None) == 'FAILED'):
        Variable.set('combine_dag_state', 'FAILED')
        raise AirflowException(
            "Error in prices_dag or short_interests_dag. Redo all DAGs.")
    cluster_dns = emrs.get_cluster_dns(emr, Variable.get('cluster_id'))
    emrs.kill_all_inactive_spark_sessions(cluster_dns)
    session_headers = emrs.create_spark_session(cluster_dns)
    helperspath = None
    if 'helperspath' in kwargs:
        helperspath = kwargs['helperspath']
    commonpath = None
    if 'commonpath' in kwargs:
        commonpath = kwargs['commonpath']
    emrs.wait_for_spark(cluster_dns, session_headers)
    job_response_headers = emrs.submit_spark_job_from_file(
        cluster_dns,
        session_headers,
        kwargs['filepath'],
        args=kwargs['args'],
        commonpath=commonpath,
        helperspath=helperspath)
    final_status, logs = emrs.track_spark_job(cluster_dns,
                                              job_response_headers)
    emrs.kill_spark_session(cluster_dns, session_headers)
    for line in logs:
        logging.info(line)
        if '(FAIL)' in str(line):
            logging.error(line)
            raise AirflowException("ETL process fails.")
    if final_status in ['available', 'ok'] and 'on_complete' in kwargs:
        # Update the output CSV file's ACL to public-read
        if ('s3a://' in config['App']['DB_HOST']
                or 's3://' in config['App']['DB_HOST']):
            bucket = config['App']['DB_HOST'].split('/')[-1]
            key = config['App']['TABLE_SHORT_ANALYSIS'][1:] + '.csv'
            (boto3.session.Session(region_name='us-east-1')
                .resource('s3')
                .Object(bucket, key)
                .Acl()
                .put(ACL='public-read'))
        kwargs['on_complete']()
def create_cluster(**kwargs):
    logging.info("instance type is " +
                 config['AWS']['EMR_CORE_NODE_INSTANCE_TYPE'])
    ec2, emr, iam = emrs.get_boto_clients(config['AWS']['REGION_NAME'],
                                          config=config)
    emrs.wait_for_roles(iam)
    cluster_id = emrs.create_emr_cluster(
        emr,
        CLUSTER_NAME,
        Variable.get('master_sg_id'),
        Variable.get('slave_sg_id'),
        Variable.get('keypair_name'),
        SUBNET_ID,
        num_core_nodes=int(config['AWS']['EMR_NUM_CORE_NODES']),
        core_node_instance_type=config['AWS']['EMR_CORE_NODE_INSTANCE_TYPE'],
        release_label='emr-5.28.1')
    Variable.set('cluster_id', cluster_id)
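# ---------------------------------------------------------------------------
# Assumption: `config` behaves like a configparser.ConfigParser loaded from a
# project config file. The sketch below is a hypothetical minimal config
# containing only the keys referenced in this module; the values are
# placeholders, not the project's real settings.
# ---------------------------------------------------------------------------
import configparser

_EXAMPLE_CONFIG = """
[AWS]
REGION_NAME = us-east-1
EMR_NUM_CORE_NODES = 2
EMR_CORE_NODE_INSTANCE_TYPE = m5.xlarge

[App]
DB_HOST = s3a://example-bucket
TABLE_SHORT_ANALYSIS = /short_analysis
"""

def _load_example_config():
    # Parse the placeholder text the same way a real config file would be
    # read; lookups such as parser['AWS']['REGION_NAME'] then mirror the
    # config[...] accesses used throughout this module.
    parser = configparser.ConfigParser()
    parser.read_string(_EXAMPLE_CONFIG)
    return parser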
def submit_spark_job_from_file(**kwargs):
    ec2, emr, iam = emrs.get_boto_clients(config['AWS']['REGION_NAME'],
                                          config=config)
    if emrs.is_cluster_terminated(emr, Variable.get('cluster_id', None)):
        Variable.set('short_interests_dag_state', 'FAILED')
        raise AirflowException("Cluster has been terminated. Redo all DAGs.")
    if Variable.get('prices_dag_state', None) == 'FAILED':
        Variable.set('short_interests_dag_state', 'FAILED')
        raise AirflowException("Error in prices_dag. Redo all DAGs.")
    cluster_dns = emrs.get_cluster_dns(emr, Variable.get('cluster_id'))
    emrs.kill_all_spark_sessions(cluster_dns)
    session_headers = emrs.create_spark_session(cluster_dns)
    helperspath = None
    if 'helperspath' in kwargs:
        helperspath = kwargs['helperspath']
    commonpath = None
    if 'commonpath' in kwargs:
        commonpath = kwargs['commonpath']
    emrs.wait_for_spark(cluster_dns, session_headers)
    job_response_headers = emrs.submit_spark_job_from_file(
        cluster_dns,
        session_headers,
        kwargs['filepath'],
        args=kwargs['args'],
        commonpath=commonpath,
        helperspath=helperspath)
    final_status, logs = emrs.track_spark_job(cluster_dns,
                                              job_response_headers,
                                              sleep_seconds=300)
    emrs.kill_spark_session(cluster_dns, session_headers)
    for line in logs:
        logging.info(line)
        if '(FAIL)' in str(line):
            logging.error(line)
            # Use 'FAILED' (not 'ERROR') so the downstream combine DAG's
            # check against 'FAILED' actually detects this failure.
            Variable.set('short_interests_dag_state', 'FAILED')
            raise AirflowException("ETL process fails.")
    if final_status in ['available', 'ok'] and 'on_complete' in kwargs:
        kwargs['on_complete']()
def terminate_cluster(**kwargs):
    # Keep the cluster alive if the 'keep_emr_cluster' Airflow Variable is
    # set; Variable.get returns a string, so any non-empty value counts as
    # truthy here.
    keep_cluster = Variable.get('keep_emr_cluster', default_var=False)
    if not keep_cluster:
        ec2, emr, iam = emrs.get_boto_clients(config['AWS']['REGION_NAME'],
                                              config=config)
        emrs.delete_cluster(emr, Variable.get('cluster_id'))
def terminate_cluster(**kwargs):
    ec2, emr, iam = emrs.get_boto_clients(config['AWS']['REGION_NAME'],
                                          config=config)
    emrs.delete_cluster(emr, Variable.get('cluster_id'))
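# ---------------------------------------------------------------------------
# Illustrative wiring sketch (assumption, not part of the original DAGs):
# shows how the callables above could be attached to PythonOperator tasks.
# The dag_id 'example_emr_etl_dag', the schedule, and the op_kwargs values
# ('path/to/etl_job.py', empty args) are hypothetical placeholders; the real
# prices/short-interests/combine DAGs pass their own file paths and args.
# ---------------------------------------------------------------------------
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

example_dag = DAG('example_emr_etl_dag',
                  start_date=datetime(2020, 1, 1),
                  schedule_interval=None,
                  catchup=False)

prepare_task = PythonOperator(task_id='preparation',
                              python_callable=preparation,
                              dag=example_dag)

create_cluster_task = PythonOperator(task_id='create_cluster',
                                     python_callable=create_cluster,
                                     dag=example_dag)

etl_task = PythonOperator(
    task_id='submit_spark_job_from_file',
    python_callable=submit_spark_job_from_file,
    op_kwargs={'filepath': 'path/to/etl_job.py',  # hypothetical script path
               'args': {}},                       # hypothetical job arguments
    dag=example_dag)

terminate_cluster_task = PythonOperator(task_id='terminate_cluster',
                                        python_callable=terminate_cluster,
                                        dag=example_dag)

cleanup_task = PythonOperator(task_id='cleanup',
                              python_callable=cleanup,
                              dag=example_dag)

(prepare_task >> create_cluster_task >> etl_task
    >> terminate_cluster_task >> cleanup_task)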