Ejemplo n.º 1
0
            "Usage  : spark_gcs_integration.py <PROJECT_ID> <BUCKET_NAME> <PRIVATE_KEY> <PRIVATE_KEY_ID> <CLIENT_EMAIL>",
            file=sys.stderr)
        print(
            "Example: spark_gcs_integration.py ranga-gcp-spark-project ranga-spark-gcp-bkt ranga_private_key ranga_private_key_id [email protected]",
            file=sys.stderr)
        exit(-1)

    # Positional command-line arguments, in the order given by the usage text.
    # bucketName is not used by the statements in this section.
    projectId = sys.argv[1]
    bucketName = sys.argv[2]
    privateKey = sys.argv[3]
    privateKeyId = sys.argv[4]
    clientEmail = sys.argv[5]
    appName = "PySpark GCS Integration Example"

    # Creating the SparkSession object.
    # `appName` is a method of the builder, so the chain has to start from
    # `SparkSession.builder`; calling `SparkSession.appName(...)` raises
    # AttributeError.
    # NOTE(review): the previous chain also passed `.config(conf=conf)`, but
    # the only `conf` bound in this section is the Hadoop configuration below,
    # which is obtained from the session itself and is not a SparkConf. If a
    # SparkConf is prepared earlier in the file, pass it to the builder under
    # a name that is not rebound below.
    spark = SparkSession.builder.appName(appName).getOrCreate()

    # GCS settings
    # These are written to the Hadoop configuration of the running context
    # (reached through the non-public `_jsc` handle).
    conf = spark.sparkContext._jsc.hadoopConfiguration()
    conf.set("fs.gs.impl",
             "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    conf.set("fs.AbstractFileSystem.gs.impl",
             "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    # Service-account authentication using the values from the command line.
    conf.set("fs.gs.auth.service.account.enable", "true")
    conf.set("fs.gs.project.id", projectId)
    conf.set("fs.gs.auth.service.account.private.key", privateKey)
    conf.set("fs.gs.auth.service.account.private.key.id", privateKeyId)
    conf.set("fs.gs.auth.service.account.email", clientEmail)

    print("SparkSession Created successfully")