# Load existing model if provided
if args.existing_model_pkl:
    model = pkl.load(open(args.existing_model_pkl))
elif args.existing_model_proto:
    model = LOPQModel.load_proto(args.existing_model_proto)

args = validate_arguments(args, model)

# Build descriptive app name
get_step_name = lambda x: {STEP_COARSE: 'coarse', STEP_ROTATION: 'rotations', STEP_SUBQUANT: 'subquantizers'}.get(x, None)
steps_str = ', '.join(filter(lambda x: x is not None, map(get_step_name, sorted(args.steps))))
APP_NAME = 'LOPQ{V=%d,M=%d}; training %s' % (args.V, args.M, steps_str)

sc = SparkContext(appName=APP_NAME)

# Load UDF module if provided and load training data RDD
if args.data_udf:
    sc.addPyFile('hdfs://memex/user/skaraman/build-lopq-index/lopq/spark/memex_udf.py')
    sc.addPyFile('hdfs://memex/user/skaraman/build-lopq-index/lopq/spark/deepsentibanktf_udf.py')
    udf_module = __import__(args.data_udf, fromlist=['udf'])
    load_udf = udf_module.udf
    # NB: load data method splits vectors into 2 parts, after applying pca if model is provided
    data = load_data(sc, args, data_load_fn=load_udf)
else:
    # NB: load data method splits vectors into 2 parts, after applying pca if model is provided
    data = load_data(sc, args)

# Initialize parameters
Cs = Rs = mus = subs = None

# Get coarse quantizers
if STEP_COARSE in args.steps:
    Cs = train_coarse(sc, data, args.V, seed=args.seed)
jsc = intp.getJavaSparkContext()
jconf = intp.getSparkConf()
conf = SparkConf(_jvm=gateway.jvm, _jconf=jconf)
sc = SparkContext(jsc=jsc, gateway=gateway, conf=conf)
sqlc = HiveContext(sc, intp.sqlContext())
sqlContext = sqlc
spark = SparkSession(sc, intp.getSparkSession())

## add pyfiles passed on the command line to the SparkContext
try:
    pyfile = sys.argv[5]
    pyfiles = pyfile.split(',')
    for i in range(len(pyfiles)):
        if pyfiles[i] != "":
            sc.addPyFile(pyfiles[i])
except Exception as e:
    print("add pyfile error: " + pyfile)

class UDF(object):
    def __init__(self, intp, sqlc):
        self.intp = intp
        self.sqlc = sqlc

    def register(self, udfName, udf):
        self.sqlc.registerFunction(udfName, udf)

    def listUDFs(self):
        self.intp.listUDFs()

    def existsUDF(self, name):
        self.intp.existsUDF(name)

udf = UDF(intp, sqlc)
intp.onPythonScriptInitialized(os.getpid())
    jsc = gateway.jvm.org.apache.spark.api.java.JavaSparkContext(
        jcontext.sparkContext())
    sc = SparkContext(gateway=gateway, jsc=jsc, conf=spark_conf)
    context = SparkSession(sc, jcontext.spark())
else:
    customContext = job.build_context(gateway, jcontext, spark_conf)
    if customContext is not None:
        context = customContext
    else:
        exit_with_failure(
            "Expected JavaSparkContext, SQLContext "
            "or HiveContext but received %s" % repr(context_class), 2)

egg_path = os.environ.get("EGGPATH", None)
if egg_path and sc:
    try:
        sc.addPyFile(egg_path)
    except Exception as error:
        exit_with_failure(
            "Error while adding Python Egg to Spark Context: %s\n%s" %
            (repr(error), traceback.format_exc()), 5)

try:
    job_data = job.validate(context, None, job_config)
except Exception as error:
    exit_with_failure(
        "Error while calling 'validate': %s\n%s" %
        (repr(error), traceback.format_exc()), 3)

if isinstance(job_data, list) and \
        isinstance(job_data[0], ValidationProblem):
    entry_point.setValidationProblems([p.problem for p in job_data])
    exit_with_failure("Validation problems in job, exiting")
else:
import sys
import os

from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.types import *
import datetime
from awsglue import DynamicFrame

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages io.delta:delta-core_2.11:0.6.1 --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog"'

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
sc.addPyFile("s3://chen115y-jar-deltalake/delta-core_2.11-0.6.1.jar")
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

conf = spark.sparkContext._conf.setAll([
    ('spark.delta.logStore.class', 'org.apache.spark.sql.delta.storage.S3SingleDriverLogStore'),
    ('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem'),
    ('fs.s3.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem'),
    ('fs.s3n.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem'),
    ('fs.AbstractFileSystem.s3.impl', 'org.apache.hadoop.fs.s3a.S3A'),
    ('fs.AbstractFileSystem.s3n.impl', 'org.apache.hadoop.fs.s3a.S3A'),
    ('fs.AbstractFileSystem.s3a.impl', 'org.apache.hadoop.fs.s3a.S3A')
])
    jsc = gateway.jvm.org.apache.spark.api.java.JavaSparkContext(
        jcontext.sparkContext())
    sc = SparkContext(gateway=gateway, jsc=jsc, conf=spark_conf)
    context = SparkSession(sc, jcontext.spark())
else:
    customContext = job.build_context(gateway, jcontext, spark_conf)
    if customContext is not None:
        context = customContext
    else:
        exit_with_failure(
            "Expected JavaSparkContext, SQLContext "
            "or HiveContext but received %s" % repr(context_class), 2)

package_path = os.environ.get("PACKAGEPATH", None)
if package_path and sc:
    try:
        sc.addPyFile(package_path)
    except Exception as error:
        exit_with_failure(
            "Error while adding Python package to Spark Context: %s\n%s" %
            (repr(error), traceback.format_exc()), 5)

try:
    job_data = job.validate(context, None, job_config)
except Exception as error:
    exit_with_failure(
        "Error while calling 'validate': %s\n%s" %
        (repr(error), traceback.format_exc()), 3)

if isinstance(job_data, list) and \
        isinstance(job_data[0], ValidationProblem):
    entry_point.setValidationProblems([p.problem for p in job_data])
    exit_with_failure("Validation problems in job, exiting")
else:
from pyspark import SparkContext, SparkFiles
from pyspark.sql import SparkSession
import parse_csv as pycsv

sc = SparkContext()
# sqlCtx = SQLContext or HiveContext
# sqlCtx = SQLContext(sc)

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("Python load csv to DataFrame example") \
        .getOrCreate()

    # Ship parse_csv.py to the Spark cluster; otherwise spark-submit will report that the file cannot be found
    sc.addPyFile('/home/xuan/mystudy/pyspark/csvToDataFrame/parse_csv.py')

    # Read csv data via SparkContext and convert it to DataFrame
    # Load with RDD: CSV file without a header row
    plaintext_rdd = sc.textFile(
        "hdfs:///ubuntuxuan/MyData/Titanic/train_without_header.csv")
    dataframe = pycsv.csvToDataFrame(
        spark, plaintext_rdd,
        columns=["PassengerId", "Survived", "Pclass", "Name", "Sex", "Ag",
                 "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"],
        parseDate=False)
    dataframe.show()

    # CSV file with a header row; field types are inferred automatically
    plaintext_rdd = sc.textFile(
        "hdfs:///ubuntuxuan/MyData/Titanic/train_with_header.csv")
    dataframe = pycsv.csvToDataFrame(spark, plaintext_rdd, columns=None, parseDate=False)
    dataframe.show()