def data_proc_spark_operator(context): dt = datetime.datetime.strptime(context.solid_config["date"], "%Y-%m-%d") cluster_resource = DataprocResource(DATAPROC_CLUSTER_CONFIG) job_config = { "job": { "placement": { "clusterName": "gcp-data-platform" }, "reference": { "projectId": PROJECT_ID }, "sparkJob": { "args": [ "--gcs-input-bucket", INPUT_BUCKET, "--gcs-output-bucket", OUTPUT_BUCKET, "--date", dt.strftime("%Y-%m-%d"), ], "mainClass": "io.dagster.events.EventPipeline", "jarFileUris": [ "%s/events-assembly-%s.jar" % (DEPLOY_BUCKET_PREFIX, LATEST_JAR_HASH) ], }, }, "projectId": PROJECT_ID, "region": REGION, } job = cluster_resource.submit_job(job_config) job_id = job["reference"]["jobId"] cluster_resource.wait_for_job(job_id)
def delete_dataproc_cluster(_):
    DataprocResource(DATAPROC_CLUSTER_CONFIG).delete_cluster()
def create_dataproc_cluster(_):
    DataprocResource(DATAPROC_CLUSTER_CONFIG).create_cluster()
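The snippets above lean on a handful of module-level names that are defined elsewhere: datetime has to be imported, DataprocResource is the Dataproc client wrapper used throughout, and the project, region, bucket, and jar-hash constants say where the job reads and writes. Here is a minimal sketch of those definitions with hypothetical placeholder values; the cluster config shape in particular is an assumption, since the exact schema is whatever DataprocResource expects.

import datetime

# DataprocResource is assumed to be imported from wherever the project's Dataproc
# cluster/job helpers live; it is not defined in this snippet.

# Hypothetical placeholder values -- substitute your own project, region, and buckets.
PROJECT_ID = "my-gcp-project"
REGION = "us-west1"
INPUT_BUCKET = "gs://my-events-input"
OUTPUT_BUCKET = "gs://my-events-output"
DEPLOY_BUCKET_PREFIX = "gs://my-deploy-bucket/jars"
LATEST_JAR_HASH = "0123abcd"  # hash of the most recently deployed events-assembly jar

# Minimal cluster config; adjust to whatever schema DataprocResource actually takes.
DATAPROC_CLUSTER_CONFIG = {
    "projectId": PROJECT_ID,
    "region": REGION,
    "clusterName": "gcp-data-platform",
}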
def delete_dataproc_cluster(_):
    DataprocResource(cfg.dataproc_create_cluster_config).delete_cluster()
def data_proc_spark_operator(context):
    # The job request is now built once in shared config rather than inline here;
    # this step just submits it and waits for completion.
    cluster_resource = DataprocResource(cfg.dataproc_create_cluster_config)
    job = cluster_resource.submit_job(cfg.dataproc_pyspark_job_config)
    job_id = job["reference"]["jobId"]
    cluster_resource.wait_for_job(job_id)
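The refactored versions pull everything from a shared cfg object instead of module-level constants. The attribute names (dataproc_create_cluster_config, dataproc_pyspark_job_config) come from the snippets themselves; how cfg is actually built is not shown here, so the following is only a hedged sketch using a plain namespace and reusing the job dict from the earlier solid.

from types import SimpleNamespace

cfg = SimpleNamespace(
    # Same shape as DATAPROC_CLUSTER_CONFIG above; reused for create, submit, and delete.
    dataproc_create_cluster_config=DATAPROC_CLUSTER_CONFIG,
    # The job request previously assembled inline in data_proc_spark_operator.
    # The --date argument would presumably be filled in when this config is built.
    dataproc_pyspark_job_config={
        "job": {
            "placement": {"clusterName": "gcp-data-platform"},
            "reference": {"projectId": PROJECT_ID},
            "sparkJob": {
                "args": [
                    "--gcs-input-bucket",
                    INPUT_BUCKET,
                    "--gcs-output-bucket",
                    OUTPUT_BUCKET,
                ],
                "mainClass": "io.dagster.events.EventPipeline",
                "jarFileUris": [
                    "%s/events-assembly-%s.jar" % (DEPLOY_BUCKET_PREFIX, LATEST_JAR_HASH)
                ],
            },
        },
        "projectId": PROJECT_ID,
        "region": REGION,
    },
)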