def transform_geocam_sum(**kwargs):
    # ti is the Task Instance
    ti = kwargs['ti']
    # The cluster id was pushed to XCom by the create_cluster task.
    cluster_id = ti.xcom_pull(task_ids='create_cluster')
    cluster_dns = emr.get_cluster_dns(cluster_id)
    # Open an interactive Scala Spark session and wait until it is idle.
    headers = emr.create_spark_session(cluster_dns, 'spark')
    session_url = emr.wait_for_idle_session(cluster_dns, headers)
    # Submit the Scala script, block until it finishes, then clean up.
    statement_response = emr.submit_statement(
        session_url, '/root/airflow/dags/transform/extract_timeseries.scala')
    emr.track_statement_progress(cluster_dns, statement_response.headers)
    emr.kill_spark_session(session_url)
def transform_links_to_parquet(**kwargs):
    # ti is the Task Instance
    ti = kwargs['ti']
    cluster_id = ti.xcom_pull(task_ids='create_cluster')
    cluster_dns = emr.get_cluster_dns(cluster_id)
    headers = emr.create_spark_session(cluster_dns, 'spark')
    session_url = emr.wait_for_idle_session(cluster_dns, headers)
    statement_response = emr.submit_statement(
        session_url, '/airflow/dags/transform/links.scala')
    emr.track_statement_progress(cluster_dns, statement_response.headers)
    emr.kill_spark_session(session_url)
def generate_perfil_training(**kwargs):
    # ti is the Task Instance
    ti = kwargs['ti']
    cluster_id = ti.xcom_pull(task_ids='create_cluster')
    cluster_dns = emr.get_cluster_dns(cluster_id)
    headers = emr.create_spark_session(cluster_dns, 'pyspark')
    session_url = emr.wait_for_idle_session(cluster_dns, headers)
    statement_response = emr.submit_statement(
        session_url, 's3://hdata-belcorp/pyspark-files/Perfiles_Training.py')
    emr.track_statement_progress(cluster_dns, statement_response.headers)
    emr.kill_spark_session(session_url)
def submit_emr(**kwargs):
    """Submit a Spark job to EMR."""
    # ti is the Task Instance
    ti = kwargs['ti']
    # Optional arguments are prepended to the script by submit_statement.
    args = kwargs.get('pyspark_file_args', '')
    cluster_id = ti.xcom_pull(task_ids='create_cluster')
    cluster_dns = emr.get_cluster_dns(cluster_id)
    headers = emr.create_spark_session(cluster_dns, 'pyspark')
    session_url = emr.wait_for_idle_session(cluster_dns, headers)
    statement_response = emr.submit_statement(session_url, kwargs['file_path'], args)
    emr.track_statement_progress(cluster_dns, statement_response.headers)
    emr.kill_spark_session(session_url)
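A generic callable like submit_emr gets its file path and arguments from the operator rather than hardcoding them. A minimal sketch of how it might be wired into a DAG, assuming Airflow 1.x-style context passing; the DAG id, schedule, and script path below are illustrative only:

from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

# Hypothetical wiring; DAG id, schedule, and file path are placeholders.
dag = DAG('emr_transform', start_date=datetime(2021, 1, 1),
          schedule_interval='@daily')

transform = PythonOperator(
    task_id='submit_transform',
    python_callable=submit_emr,
    provide_context=True,  # exposes ti, execution_date, etc. via **kwargs (Airflow 1.x)
    op_kwargs={'file_path': '/airflow/dags/transform/links.scala'},
    dag=dag,
)

With provide_context=True, the op_kwargs entries are merged into the same **kwargs dictionary as the task context, which is why the callable can read both kwargs['ti'] and kwargs['file_path'].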
def spark_submit_to_emr(**kwargs):
    # ti is the Task Instance
    script_file = kwargs['params']['file']
    ti = kwargs['ti']
    cluster_id = ti.xcom_pull(task_ids='create_cluster')
    cluster_dns = emr.get_cluster_dns(cluster_id)
    headers = emr.create_spark_session(cluster_dns, 'pyspark')
    session_url = emr.wait_for_idle_session(cluster_dns, headers)
    # Parameterize the PySpark script with the execution year by
    # prepending a variable assignment to the submitted code.
    execution_date = kwargs['execution_date']
    year = execution_date.strftime('%Y')
    logging.info('processing for year %s', year)
    statement_response = emr.submit_statement(
        session_url, script_file, "year = '" + year + "'\n")
    emr.track_statement_progress(cluster_dns, statement_response.headers)
    emr.kill_spark_session(session_url)
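All five callables lean on an emr helper module that is not shown in this section. A minimal sketch of what it might look like, assuming it wraps the Apache Livy REST API on the EMR master node (Livy's default port is 8998); every function body here is an assumption reconstructed from the call sites above, not the author's actual implementation:

import json
import time

import boto3
import requests

JSON_HEADERS = {'Content-Type': 'application/json'}


def get_cluster_dns(cluster_id):
    # Look up the master node's public DNS for the given cluster.
    client = boto3.client('emr')
    response = client.describe_cluster(ClusterId=cluster_id)
    return response['Cluster']['MasterPublicDnsName']


def create_spark_session(master_dns, kind='spark'):
    # Ask Livy for a new interactive session: 'spark' runs Scala,
    # 'pyspark' runs Python. The Location response header points
    # at the new session.
    host = 'http://' + master_dns + ':8998'
    response = requests.post(host + '/sessions',
                             data=json.dumps({'kind': kind}),
                             headers=JSON_HEADERS)
    return response.headers


def wait_for_idle_session(master_dns, response_headers):
    # Poll the session until Livy reports it is idle and ready.
    session_url = 'http://' + master_dns + ':8998' + response_headers['Location']
    while True:
        state = requests.get(session_url, headers=JSON_HEADERS).json()['state']
        if state == 'idle':
            return session_url
        time.sleep(3)


def submit_statement(session_url, statement_path, args=''):
    # Read the script and submit it as a single Livy statement, with any
    # extra args prepended as code. Assumes the path is readable from the
    # Airflow worker's filesystem; an s3:// path would need boto3 instead.
    with open(statement_path) as f:
        code = args + f.read()
    return requests.post(session_url + '/statements',
                         data=json.dumps({'code': code}),
                         headers=JSON_HEADERS)


def track_statement_progress(master_dns, response_headers):
    # Poll the statement until it reaches a terminal state.
    statement_url = 'http://' + master_dns + ':8998' + response_headers['Location']
    while True:
        status = requests.get(statement_url, headers=JSON_HEADERS).json()
        if status['state'] in ('available', 'error', 'cancelled'):
            return status
        time.sleep(5)


def kill_spark_session(session_url):
    # Delete the Livy session to free the cluster's resources.
    requests.delete(session_url, headers=JSON_HEADERS)

The polling loops are what let each task block until its Spark work finishes, so downstream Airflow tasks only start once the statement has actually completed on the cluster.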