    model = None
    if args.existing_model_pkl:
        with open(args.existing_model_pkl, 'rb') as f:
            model = pkl.load(f)
    elif args.existing_model_proto:
        model = LOPQModel.load_proto(args.existing_model_proto)

    args = validate_arguments(args, model)

    # Build descriptive app name
    get_step_name = lambda x: {STEP_COARSE: 'coarse', STEP_ROTATION: 'rotations', STEP_SUBQUANT: 'subquantizers'}.get(x, None)
    steps_str = ', '.join(filter(lambda x: x is not None, map(get_step_name, sorted(args.steps))))
    APP_NAME = 'LOPQ{V=%d,M=%d}; training %s' % (args.V, args.M, steps_str)

    sc = SparkContext(appName=APP_NAME)

    # Load UDF module if provided and load training data RDD
    if args.data_udf:
        sc.addPyFile('hdfs://memex/user/skaraman/build-lopq-index/lopq/spark/memex_udf.py')
        sc.addPyFile('hdfs://memex/user/skaraman/build-lopq-index/lopq/spark/deepsentibanktf_udf.py')
        udf_module = __import__(args.data_udf, fromlist=['udf'])
        load_udf = udf_module.udf
        # NB: load data method splits vectors into 2 parts, after applying pca if model is provided
        data = load_data(sc, args, data_load_fn=load_udf)
    else:
        # NB: load data method splits vectors into 2 parts, after applying pca if model is provided
        data = load_data(sc, args)

    # Initialize parameters
    Cs = Rs = mus = subs = None

    # Get coarse quantizers
    if STEP_COARSE in args.steps:
        Cs = train_coarse(sc, data, args.V, seed=args.seed)
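
The data_udf branch above relies on a common PySpark pattern: ship a module with sc.addPyFile so it is importable on the driver and the executors, then import it by name. A minimal, self-contained sketch of that pattern follows; the module name, HDFS path, and udf function are hypothetical and not taken from the snippet above.

# Sketch of the addPyFile + dynamic import pattern (hypothetical module/path).
from pyspark import SparkContext

sc = SparkContext(appName='addPyFile-demo')
sc.addPyFile('hdfs:///tmp/my_udf.py')  # hypothetical location of my_udf.py

# Import the shipped module by name, as the snippet above does for args.data_udf.
udf_module = __import__('my_udf', fromlist=['udf'])

# Functions from the shipped module can be used in executor-side closures,
# because every executor can import my_udf after addPyFile.
rdd = sc.parallelize(['a', 'bb', 'ccc'])
print(rdd.map(udf_module.udf).collect())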
Example #2
jsc = intp.getJavaSparkContext()
jconf = intp.getSparkConf()
conf = SparkConf(_jvm=gateway.jvm, _jconf=jconf)
sc = SparkContext(jsc=jsc, gateway=gateway, conf=conf)
sqlc = HiveContext(sc, intp.sqlContext())
sqlContext = sqlc
spark = SparkSession(sc, intp.getSparkSession())

# Add user-supplied pyFiles (comma-separated list in argv[5]) to the SparkContext
try:
    pyfiles = sys.argv[5].split(',')
    for pyfile in pyfiles:
        if pyfile:
            sc.addPyFile(pyfile)
except Exception as e:
    print("add pyfile error: " + str(e))

class UDF(object):
    def __init__(self, intp, sqlc):
        self.intp = intp
        self.sqlc = sqlc
    def register(self, udfName, udf):
        self.sqlc.registerFunction(udfName, udf)
    def listUDFs(self):
        return self.intp.listUDFs()
    def existsUDF(self, name):
        return self.intp.existsUDF(name)
udf = UDF(intp, sqlc)
intp.onPythonScriptInitialized(os.getpid())
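
The UDF wrapper above only forwards to HiveContext.registerFunction and the interpreter helpers, so registering and calling a function looks like the following sketch; the function name, lambda, and table name are hypothetical.

# Hypothetical usage of the UDF wrapper defined above.
udf.register("str_len", lambda s: len(s) if s is not None else 0)

# The registered function is then callable from SQL through the same sqlContext;
# "people" is an assumed temporary table.
sqlContext.sql("SELECT name, str_len(name) AS name_len FROM people").show()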
        jsc = gateway.jvm.org.apache.spark.api.java.JavaSparkContext(
            jcontext.sparkContext())
        sc = SparkContext(gateway=gateway, jsc=jsc, conf=spark_conf)
        context = SparkSession(sc, jcontext.spark())
    else:
        customContext = job.build_context(gateway, jcontext, spark_conf)
        if customContext is not None:
            context = customContext
        else:
            exit_with_failure(
                "Expected JavaSparkContext, SQLContext "
                "or HiveContext but received %s" % repr(context_class), 2)

    egg_path = os.environ.get("EGGPATH", None)
    if egg_path and sc:
        try:
            sc.addPyFile(egg_path)
        except Exception as error:
            exit_with_failure(
                "Error while adding Python Egg to Spark Context: %s\n%s" %
                (repr(error), traceback.format_exc()), 5)
    try:
        job_data = job.validate(context, None, job_config)
    except Exception as error:
        exit_with_failure(
            "Error while calling 'validate': %s\n%s" %
            (repr(error), traceback.format_exc()), 3)
    if isinstance(job_data, list) and \
            isinstance(job_data[0], ValidationProblem):
        entry_point.setValidationProblems([p.problem for p in job_data])
        exit_with_failure("Validation problems in job, exiting")
    else:
Example #5
import sys
import os

from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.types import *
import datetime
from awsglue import DynamicFrame

os.environ[
    'PYSPARK_SUBMIT_ARGS'] = '--packages io.delta:delta-core_2.11:0.6.1 --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog"'
## @params: [JOB_NAME]

args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()

sc.addPyFile("s3://chen115y-jar-deltalake/delta-core_2.11-0.6.1.jar")
glueContext = GlueContext(sc)
spark = glueContext.spark_session

job = Job(glueContext)
job.init(args['JOB_NAME'], args)

conf = spark.sparkContext._conf.setAll([
    ('spark.delta.logStore.class',
     'org.apache.spark.sql.delta.storage.S3SingleDriverLogStore'),
    ('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem'),
    ('fs.s3.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem'),
    ('fs.s3n.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem'),
    ('fs.AbstractFileSystem.s3.impl', 'org.apache.hadoop.fs.s3a.S3A'),
    ('fs.AbstractFileSystem.s3n.impl', 'org.apache.hadoop.fs.s3a.S3A'),
    ('fs.AbstractFileSystem.s3a.impl', 'org.apache.hadoop.fs.s3a.S3A')
])
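
With the Delta Lake package on the classpath and the S3 log store configured as above, the Glue session can write and read Delta tables. A minimal sketch follows; the S3 path and the toy DataFrame are hypothetical.

# Hypothetical Delta table location; replace with a real bucket/prefix.
delta_path = "s3a://my-bucket/tables/events"

# Write a small DataFrame in Delta format...
df = spark.createDataFrame([(1, "open"), (2, "closed")], ["id", "status"])
df.write.format("delta").mode("overwrite").save(delta_path)

# ...and read it back through the same session.
spark.read.format("delta").load(delta_path).show()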
Example #6
        jsc = gateway.jvm.org.apache.spark.api.java.JavaSparkContext(
            jcontext.sparkContext())
        sc = SparkContext(gateway=gateway, jsc=jsc, conf=spark_conf)
        context = SparkSession(sc, jcontext.spark())
    else:
        customContext = job.build_context(gateway, jcontext, spark_conf)
        if customContext is not None:
            context = customContext
        else:
            exit_with_failure(
                "Expected JavaSparkContext, SQLContext "
                "or HiveContext but received %s" % repr(context_class), 2)

    package_path = os.environ.get("PACKAGEPATH", None)
    if package_path and sc:
        try:
            sc.addPyFile(package_path)
        except Exception as error:
            exit_with_failure(
                "Error while adding Python package to Spark Context: %s\n%s" %
                (repr(error), traceback.format_exc()), 5)
    try:
        job_data = job.validate(context, None, job_config)
    except Exception as error:
        exit_with_failure(
            "Error while calling 'validate': %s\n%s" %
            (repr(error), traceback.format_exc()), 3)
    if isinstance(job_data, list) and \
            isinstance(job_data[0], ValidationProblem):
        entry_point.setValidationProblems([p.problem for p in job_data])
        exit_with_failure("Validation problems in job, exiting")
    else:
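
The job-runner snippets above all drive a user job object through the same contract: build_context() may return a custom context (or None to use the runner's default), and validate() returns either the job's input data or a list of ValidationProblem objects whose .problem field is reported back. A purely illustrative job class matching those calls is sketched below; the class name, config key, run() method, and logic are hypothetical and not taken from any of the projects above.

# Illustrative job matching the runner contract visible above (hypothetical).
class WordCountJob(object):
    def build_context(self, gateway, jcontext, spark_conf):
        # Returning None lets the runner build its default context.
        return None

    def validate(self, context, runtime, config):
        # Return a list of ValidationProblem for bad config, job data otherwise.
        if "input_path" not in config:
            return [ValidationProblem("missing required config key: input_path")]
        return config["input_path"]

    def run(self, context, runtime, input_path):
        # `context` is the SparkSession built by the runner.
        return context.sparkContext.textFile(input_path) \
                      .flatMap(lambda line: line.split()) \
                      .count()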
from pyspark import SparkContext, SparkFiles
from pyspark.sql import SparkSession
import parse_csv as pycsv


sc = SparkContext()
# sqlCtx = SQLContext or HiveContext
# sqlCtx = SQLContext(sc)

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("Python load csv to DataFrame example") \
        .getOrCreate()

    # Upload parse_csv.py to the Spark cluster; otherwise spark-submit cannot find the file
    sc.addPyFile('/home/xuan/mystudy/pyspark/csvToDataFrame/parse_csv.py')

    # Read csv data via SparkContext and convert it to DataFrame
    # load with rdd
    # Without a header row: column names are supplied explicitly
    plaintext_rdd = sc.textFile(
        "hdfs:///ubuntuxuan/MyData/Titanic/train_without_header.csv")
    dataframe = pycsv.csvToDataFrame(
        spark, plaintext_rdd,
        columns=["PassengerId", "Survived", "Pclass", "Name", "Sex", "Age",
                 "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"],
        parseDate=False)
    dataframe.show()
    
    # With a header row: field types are inferred automatically
    plaintext_rdd = sc.textFile(
        "hdfs:///ubuntuxuan/MyData/Titanic/train_with_header.csv")
    dataframe = pycsv.csvToDataFrame(spark, plaintext_rdd, columns=None,
                                     parseDate=False)
    dataframe.show()
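
The last snippet imports SparkFiles but never uses it; for completeness, here is a minimal sketch of how a file shipped with sc.addFile is resolved on executors via SparkFiles.get. The file name and path are hypothetical.

# Ship a hypothetical side file to every executor.
sc.addFile("/home/xuan/mystudy/pyspark/csvToDataFrame/lookup.txt")

def first_line(_):
    # SparkFiles.get() returns the local path of the shipped file
    # on whichever executor runs this task.
    with open(SparkFiles.get("lookup.txt")) as f:
        return f.readline().strip()

print(sc.parallelize([0]).map(first_line).first())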