Example #1
import datetime

# DataprocResource comes from the dagster_gcp package; PROJECT_ID, REGION,
# and the bucket/jar constants are assumed to be defined at module scope.
def data_proc_spark_operator(context):
    # Parse the target partition date from the solid's run config.
    dt = datetime.datetime.strptime(context.solid_config['date'], '%Y-%m-%d')

    cluster_resource = DataprocResource(DATAPROC_CLUSTER_CONFIG)
    job_config = {
        'job': {
            'placement': {'clusterName': 'gcp-data-platform'},
            'reference': {'projectId': PROJECT_ID},
            'sparkJob': {
                'args': [
                    '--gcs-input-bucket',
                    INPUT_BUCKET,
                    '--gcs-output-bucket',
                    OUTPUT_BUCKET,
                    '--date',
                    dt.strftime('%Y-%m-%d'),
                ],
                'mainClass': 'io.dagster.events.EventPipeline',
                'jarFileUris': [
                    '%s/events-assembly-%s.jar' % (DEPLOY_BUCKET_PREFIX, LATEST_JAR_HASH)
                ],
            },
        },
        'projectId': PROJECT_ID,
        'region': REGION,
    }
    # Submit the Spark job to the Dataproc cluster and block until it completes.
    job = cluster_resource.submit_job(job_config)
    job_id = job['reference']['jobId']
    cluster_resource.wait_for_job(job_id)
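
This solid reads the target date from context.solid_config, so the date must be supplied through run config when the pipeline is launched. Below is a minimal sketch of that wiring using the legacy (pre-1.0) Dagster @solid/@pipeline API; the config_schema, pipeline name, and stand-in body are illustrative assumptions, not part of the original example.

import datetime

from dagster import execute_pipeline, pipeline, solid

@solid(config_schema={'date': str})  # assumed schema backing context.solid_config['date']
def data_proc_spark_operator(context):
    # Stand-in body: just parse and log the configured date.
    dt = datetime.datetime.strptime(context.solid_config['date'], '%Y-%m-%d')
    context.log.info('would submit Spark job for %s' % dt.strftime('%Y-%m-%d'))

@pipeline
def events_pipeline():
    data_proc_spark_operator()

execute_pipeline(
    events_pipeline,
    run_config={
        'solids': {
            'data_proc_spark_operator': {'config': {'date': '2020-05-01'}},
        }
    },
)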
Example #2
def delete_dataproc_cluster(_):
    # Tear down the Dataproc cluster described by the module-level config.
    DataprocResource(DATAPROC_CLUSTER_CONFIG).delete_cluster()
Example #3
def create_dataproc_cluster(_):
    # Spin up the Dataproc cluster from the module-level config.
    DataprocResource(DATAPROC_CLUSTER_CONFIG).create_cluster()


def delete_dataproc_cluster(_):
    # Tear the cluster back down; here the config comes from a cfg module.
    DataprocResource(cfg.dataproc_create_cluster_config).delete_cluster()


def data_proc_spark_operator(context):
    # Submit the PySpark job defined in cfg and block until it finishes.
    cluster_resource = DataprocResource(cfg.dataproc_create_cluster_config)
    job = cluster_resource.submit_job(cfg.dataproc_pyspark_job_config)
    job_id = job["reference"]["jobId"]
    cluster_resource.wait_for_job(job_id)
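
Taken together, these three solids form a create/run/delete cluster lifecycle. One way to enforce that ordering in the legacy Dagster API is with Nothing-typed dependencies. The sketch below assumes DataprocResource and the cfg module are importable as in the example above; the pipeline name and the "start" input names are illustrative.

from dagster import InputDefinition, Nothing, OutputDefinition, pipeline, solid


@solid(output_defs=[OutputDefinition(Nothing)])
def create_dataproc_cluster(_):
    DataprocResource(cfg.dataproc_create_cluster_config).create_cluster()


@solid(
    input_defs=[InputDefinition("start", Nothing)],
    output_defs=[OutputDefinition(Nothing)],
)
def data_proc_spark_operator(_):
    cluster_resource = DataprocResource(cfg.dataproc_create_cluster_config)
    job = cluster_resource.submit_job(cfg.dataproc_pyspark_job_config)
    cluster_resource.wait_for_job(job["reference"]["jobId"])


@solid(input_defs=[InputDefinition("start", Nothing)])
def delete_dataproc_cluster(_):
    DataprocResource(cfg.dataproc_create_cluster_config).delete_cluster()


@pipeline
def dataproc_lifecycle():
    # Nothing-typed edges express pure ordering: create -> run -> delete.
    delete_dataproc_cluster(data_proc_spark_operator(create_dataproc_cluster()))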