def init_spark_context(details=[]):
    global spark_context
    if spark_context:
        return
    build_type = yb_dist_tests.global_conf.build_type
    from pyspark import SparkContext
    # We sometimes fail tasks due to unsynchronized clocks, so we should tolerate a fair number of
    # retries.
    # https://stackoverflow.com/questions/26260006/are-failed-tasks-resubmitted-in-apache-spark
    # NOTE: we never retry failed tests to avoid hiding bugs. This failure tolerance mechanism
    # is just for the resilience of the test framework itself.
    SparkContext.setSystemProperty('spark.task.maxFailures', str(SPARK_TASK_MAX_FAILURES))
    spark_master_url = os.environ.get('YB_SPARK_MASTER_URL', DEFAULT_SPARK_MASTER_URL)
    details += [
        'user: {}'.format(getpass.getuser()),
        'build type: {}'.format(build_type)
    ]
    if 'BUILD_URL' in os.environ:
        details.append('URL: {}'.format(os.environ['BUILD_URL']))
    spark_context = SparkContext(spark_master_url, "YB tests ({})".format(', '.join(details)))
    spark_context.addPyFile(yb_dist_tests.__file__)

def init_spark_context(details=[]):
    global spark_context
    if spark_context:
        return
    build_type = yb_dist_tests.global_conf.build_type
    from pyspark import SparkContext
    # We sometimes fail tasks due to unsynchronized clocks, so we should tolerate a fair number of
    # retries.
    # https://stackoverflow.com/questions/26260006/are-failed-tasks-resubmitted-in-apache-spark
    # NOTE: we never retry failed tests to avoid hiding bugs. This failure tolerance mechanism
    # is just for the resilience of the test framework itself.
    SparkContext.setSystemProperty('spark.task.maxFailures', str(SPARK_TASK_MAX_FAILURES))
    if yb_dist_tests.global_conf.build_type == 'tsan':
        logging.info("Using a separate default Spark cluster for TSAN tests")
        default_spark_master_url = DEFAULT_SPARK_MASTER_URL_TSAN
    else:
        logging.info("Using the regular default Spark cluster for non-TSAN tests")
        default_spark_master_url = DEFAULT_SPARK_MASTER_URL
    spark_master_url = os.environ.get('YB_SPARK_MASTER_URL', default_spark_master_url)
    details += [
        'user: {}'.format(getpass.getuser()),
        'build type: {}'.format(build_type)
    ]
    if 'BUILD_URL' in os.environ:
        details.append('URL: {}'.format(os.environ['BUILD_URL']))
    spark_context = SparkContext(spark_master_url, "YB tests ({})".format(', '.join(details)))
    spark_context.addPyFile(yb_dist_tests.__file__)

def main(input, output, alpha, iters):
    SparkContext.setSystemProperty('spark.executor.memory', '3g')
    conf = SparkConf().setAppName("SparkPageRank")
    sc = SparkContext(conf=conf)

    lines = sc.textFile(input).filter(lambda x: len(x) and x[0] != '#')
    vertexes = lines.flatMap(lambda x: map(int, x.strip().split())) \
        .distinct().map(lambda x: [x, None])
    edges = lines.map(lambda x: map(int, x.strip().split())).groupByKey()
    t = vertexes.leftOuterJoin(edges).map(lambda x: (x[0], x[1][1]))
    pr = vertexes.map(lambda x: (x[0], 1 / N))
    hang_pr = t.join(pr).filter(lambda x: x[1][0] is None).map(
        lambda x: x[1][1]).sum()

    for i in range(iters):
        emits = t.join(pr).flatMap(lambda x: mapper(x[0], x[1][1], x[1][0]))
        pr = emits.groupByKey().map(
            lambda x: reducer(x[0], x[1], hang_pr, alpha)).cache()
        hang_pr = t.join(pr).filter(lambda x: x[1][0] is None).map(
            lambda x: x[1][1]).sum()

    pr = pr.sortBy(lambda x: x[1], False)
    pr.saveAsTextFile(output)

def init_spark(verbose_logging=False, show_progress=False):
    if not show_progress:
        SparkContext.setSystemProperty('spark.ui.showConsoleProgress', 'false')
    sc = SparkContext()
    sqlContext = HiveContext(sc)
    if verbose_logging:
        sc.setLogLevel(
            'INFO' if isinstance(verbose_logging, bool) else verbose_logging)
    return sc, sqlContext

def main(*argv):
    # Setup to read from s3
    aws_access_key = os.getenv('AWS_ACCESS_KEY_ID', 'default')
    aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY', 'default')
    conf = SparkConf().setAppName("taxishift1")
    SparkContext.setSystemProperty('spark.executor.memory', '5g')
    sc = SparkContext(conf=conf)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId", aws_access_key)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey", aws_secret_access_key)

    # Read from s3
    for i in range(1, 13):
        raw_data = sc.hadoopFile('s3a://ddrum-s3/trip_data/trip_data_' + str(i) + '.csv',
                                 'org.apache.hadoop.mapred.TextInputFormat',
                                 'org.apache.hadoop.io.Text',
                                 'org.apache.hadoop.io.LongWritable')

        # Call user defined function to map raw data into key-value pairs
        new_data = raw_data.map(lambda x: splitit(x[1]))

        # Combine data from multiple csv files
        if i < 2:
            total_data = new_data
        else:
            total_data = total_data.union(new_data)

    # Create total list of rides/shifts for each driver
    total_data = total_data.reduceByKey(lambda x, y: x + y)\
                           .mapValues(lambda x: mergeit(x))

    # Create key-value for each shift, and set shifts greater than 10 hours to 10 hours
    # (We want to know exactly when they go over)
    ungrouped_data = total_data.flatMap(lambda x: [(x[0], r) for r in x[1]])\
                               .mapValues(lambda x: x if x[1] - x[0] < 36000 else [x[0], x[0] + 36000])

    # Extract only 10 hour shift offenders
    offenders = ungrouped_data.filter(lambda x: x[1][1] - x[1][0] >= 36000)\
                              .map(lambda x: (x[1][1]))\
                              .collect()

    # Plot number of offenders for every 30 minutes
    offender_hist = plt.hist(offenders,
                             bins=range(int(min(offenders)), int(max(offenders)) + 1800, 1800))
    hist_min = int(min(offenders))
    hist_min += 900  # shift from the first bin edge to the bin midpoint

    # Save to file
    hist_csv = open('/home/ubuntu/offenders.csv', 'w')
    for i in offender_hist[0]:
        hist_csv.write(str(hist_min) + ',' + str(i) + '\n')
        hist_min += 1800
    hist_csv.close()
    return

def initialize_params(self, partitions=2100, cores=5, memory=11):
    conf = SparkConf()
    conf.set('spark.sql.shuffle.partitions', str(partitions))
    conf.set("spark.executor.cores", str(cores))
    SparkContext.setSystemProperty('spark.executor.memory', str(memory) + 'g')
    SparkContext.setSystemProperty('spark.driver.memory', str(memory) + 'g')
    self.sc = SparkContext(appName='mm_exp', conf=conf)
    self.sqlContext = pyspark.SQLContext(self.sc)

def main():
    if len(sys.argv) < 3:
        print("Usage: input <file> mapping <file>", file=sys.stderr)
        exit(-1)
    time1 = datetime.datetime.fromtimestamp(time.time())
    os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/CDH/lib/spark"
    usage = "usage: run_index_field_extractor.py [options]"
    # --subapp data_load --op0 source_db --op1 source_table --op2 target_db --op3 target_table
    # --op4 partitoins_column=date --op5 10
    global return_code, listOfPartitions, final_properties, sourceDB, sourceTable, targetDB, \
        targetTable, partitonColumn, partitonColumnDataType, numberOfPartitins, app, sub_app, \
        env, env_ver, group, common_properties, minPartition, maxPartition, start_line
    sourceFile, mappingFile, outputDir, lookupTable = arg_handle()
    # SparkContext.setSystemProperty(
    #     'spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    conf = SparkConf().setAppName("CrissIndexExtractor")
    sc = SparkContext(conf=conf)
    # sc = SparkContext(appName="IndexFieldExtractor")
    lines = sc.textFile(sourceFile, 1)
    # Filter header records
    headerList = lines.filter(lambda x: "HDR:" in x)
    # Filter data records
    dataFilter = lines.filter(lambda data: "DTL" in data)
    # dataFilter.cache()
    referenceMappedData = dataFilter.map(processDataLine(headerList.collect()))
    # get required index mapping of each record
    indexMappingLines = sc.textFile(mappingFile, 1)
    indexFieldList = list(
        itertools.chain.from_iterable(
            indexMappingLines.map(createIndexColumnList).collect()))
    # print("IndexmappingList ", indexFieldList)
    # group data by reference key
    referenceGroupedData = referenceMappedData.groupByKey().map(
        lambda x: processReferenceData(x[0], list(x[1]), indexFieldList))
    # print("referenceMappedData", referenceGroupedData.collect())
    referenceGroupedData.saveAsTextFile(outputDir)
    formidReferenceList = dataFilter.map(getReferenceFormidList)
    formidReferenceList.saveAsTextFile(lookupTable)
    # print(createTableStatement(indexFieldList))
    sc.stop()
    time2 = datetime.datetime.fromtimestamp(time.time())
    print("time taken", time2 - time1)
    sys.exit()

def getSpark(self):
    SparkContext.setSystemProperty("hive.metastore.uris", "thrift://localhost:9083")
    spark = SparkSession \
        .builder \
        .appName('example-pyspark-read-and-write-from-hive') \
        .config("spark.sql.warehouse.dir", "spark_warehouse") \
        .enableHiveSupport() \
        .getOrCreate()
    # spark.conf.get("spark.sql.hive.metastore.version")
    spark.sql("SET spark.sql.hive.metastore.version=2.3.2").show()
    # spark = SparkSession.builder.master("local").appName("Word Count").getOrCreate()
    return spark

def context_spark():
    conf = (SparkConf()
            .setMaster("local[4]")
            .set("spark.executor.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
            .set("spark.driver.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true"))
    sc = SparkContext(conf=conf)
    sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
    sql = SQLContext(sc)
    hadoopConf = sc._jsc.hadoopConfiguration()
    hadoopConf.set("fs.s3a.awsAccessKeyId", "---")
    hadoopConf.set("fs.s3a.awsSecretAccessKey", "---")
    hadoopConf.set("fs.s3a.endpoint", "s3.us-east-1.amazonaws.com")
    hadoopConf.set("com.amazonaws.services.s3a.enableV4", "true")
    hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    print(sc._conf.getAll())
    return sql

def create_spark_session(app_name="SparkApplication"):
    memory = '1g'
    pyspark_submit_args = ' --driver-memory ' + memory + ' pyspark-shell'
    os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args
    SparkContext.setSystemProperty('spark.executor.memory', '1g')
    # SparkContext.setSystemProperty('spark.driver.maxResultSize', '25g')
    spark_session = SparkSession.builder \
        .appName(app_name) \
        .master("local[*]") \
        .getOrCreate()
    spark_session.sparkContext.setLogLevel("WARN")
    return spark_session

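# A minimal usage sketch, assuming create_spark_session above and its imports
# (os, pyspark's SparkContext and SparkSession) are in scope; the app name and
# DataFrame contents are illustrative placeholders.
if __name__ == "__main__":
    spark = create_spark_session("example-app")
    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
    df.show()
    spark.stop()
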
def main(*argv):
    # Setup to read from s3
    aws_access_key = os.getenv('AWS_ACCESS_KEY_ID', 'default')
    aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY', 'default')
    conf = SparkConf().setAppName("taxishift2")
    SparkContext.setSystemProperty('spark.executor.memory', '5g')
    sc = SparkContext(conf=conf)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId", aws_access_key)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey", aws_secret_access_key)

    # Check for new data every five minutes
    ssc = StreamingContext(sc, 300)
    ssc.checkpoint('s3a://ddrum-s3/checkpoint/')

    # Update state with new data
    def updateFunc(currentTaxi, taxiState):
        if taxiState is None:
            taxiState = []
        # Create total list of rides/shifts for each driver
        return mergeit(currentTaxi + taxiState)

    # Read streaming from s3
    raw_data = ssc.textFileStream('s3a://ddrum-s3/trip_data/')

    # Call user defined function to map raw data into key-value pairs, and update state
    total_data = raw_data.map(lambda x: splitit(x))\
                         .updateStateByKey(updateFunc)

    # Create key-value for each shift, and return people with nine hour shifts or greater
    # within 30 minutes of zerohour
    warn_list = total_data.flatMap(lambda x: [(x[0], r) for r in x[1]])\
                          .filter(lambda x: x[1][1] - x[1][0] > 32400)\
                          .filter(lambda x: x[1][1] < (zerohour + 1800) and x[1][1] > zerohour)

    # Indicator warning
    warn_list.pprint()

    # Begin stream
    ssc.start()
    ssc.awaitTermination()
    return

def spark_write_to_db(source, table_name, db_user, db_psrwd, db_endpoint,
                      aws_access_key_id, aws_secret_access_key):
    """
    Dumb copy of source data (parquet) into DB table using spark.

    :params source (str) - s3 or local path to parquet file
    :params table_name (str) - name of table in planetpulse postgresql db
    :params db_user (str) - username for db access
    :params db_psrwd (str) - password for db access
    :params db_endpoint (str) - endpoint of planetpulse postgresql db
    """
    # Again, this is a work-around since we are running spark locally on EC2...
    # TODO - this is similar to the spark setup in intake/spark_etl.py - let's refactor to use
    # shared utils code... We should also be able to remove some of this when we move from
    # PythonOperator -> SparkSubmitOperator
    os.environ[
        'PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.hadoop:hadoop-common:3.0.0,org.apache.hadoop:hadoop-aws:3.0.0,org.apache.hadoop:hadoop-client:3.0.0 pyspark-shell'
    os.environ['AWS_ACCESS_KEY_ID'] = aws_access_key_id
    os.environ['AWS_SECRET_ACCESS_KEY'] = aws_secret_access_key

    jars_path = pkg_resources.resource_filename('intake.jars', 'postgresql-42.2.23.jar')
    conf = SparkConf().set('spark.jars', jars_path)
    sc = SparkContext(conf=conf)
    sc.setSystemProperty('com.amazonaws.services.s3.enableV4', 'true')
    hadoopConf = sc._jsc.hadoopConfiguration()
    hadoopConf.set("fs.s3a.awsAccessKeyId", aws_access_key_id)
    hadoopConf.set("fs.s3a.awsSecretAccessKey", aws_secret_access_key)
    hadoopConf.set("fs.s3a.endpoint", "s3.amazonaws.com")
    hadoopConf.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
    spark = SparkSession(sc)

    # Always overwrite data. We are always processing all data from
    # source, rather than just new data. No need to load current db data
    # and decide what to write.
    mode = 'overwrite'
    properties = {
        "user": db_user,
        "password": db_psrwd,
        "driver": "org.postgresql.Driver"
    }
    df = spark.read.parquet(source)
    df.write.jdbc(url=db_endpoint, table=table_name, mode=mode, properties=properties)

def main(*argv):
    # Setup to read from s3
    aws_access_key = os.getenv('AWS_ACCESS_KEY_ID', 'default')
    aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY', 'default')
    conf = SparkConf().setAppName("taxishift3")
    SparkContext.setSystemProperty('spark.executor.memory', '5g')
    sc = SparkContext(conf=conf)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId", aws_access_key)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey", aws_secret_access_key)

    # Read from s3
    for i in range(1, 13):
        raw_data = sc.hadoopFile('s3a://ddrum-s3/trip_data/trip_data_' + str(i) + '.csv',
                                 'org.apache.hadoop.mapred.TextInputFormat',
                                 'org.apache.hadoop.io.Text',
                                 'org.apache.hadoop.io.LongWritable')

        # Call user defined function to map raw data into key-value pairs
        new_data = raw_data.map(lambda x: splitit(x[1]))

        # Combine data from multiple csv files
        if i < 2:
            total_data = new_data
        else:
            total_data = total_data.union(new_data)

    # Create total list of pickup chains and pickups for each coordinate.
    # If there are more than three times as many pickups as pickup chains, this might be a taxi stand
    total_data = total_data.reduceByKey(lambda x, y: x + y)\
                           .mapValues(lambda x: mergeit(x))\
                           .filter(lambda x: x[1][1] / x[1][0] > 3)\
                           .collect()

    # Print results
    total_data.sort()
    for i in total_data:
        print(i)
    return

def spark_run_etl(source, output_path, aws_access_key_id='',
                  aws_secret_access_key='', local=False):
    """
    Run ETL from Source and output to Parquet

    :params source (str) - source from intake/sources
    :params output_path (str) - output path to s3 or local file system
    :params aws_access_key_id (str) - defaults to None
    :params aws_secret_access_key (str) - defaults to None
    :params local (bool) - Run pipeline for AWS or Local (s3 or local file system)
    """
    # This is a work-around since we are running spark locally on EC2.
    # If we were running on a hadoop cluster, we could bypass this...
    # Unfortunately, we are cheap and spend too much of our money on
    # NYC rent and street food...
    if not local:
        os.environ[
            'PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.hadoop:hadoop-common:3.0.0,org.apache.hadoop:hadoop-aws:3.0.0,org.apache.hadoop:hadoop-client:3.0.0 pyspark-shell'

        # Seems like we need to export env vars, too. Another
        # hacky workaround that will stick for now...
        os.environ['AWS_ACCESS_KEY_ID'] = aws_access_key_id
        os.environ['AWS_SECRET_ACCESS_KEY'] = aws_secret_access_key
        print(f"ACCESS KEY SECRET: {aws_secret_access_key}")

        sc = SparkContext()
        sc.setSystemProperty('com.amazonaws.services.s3.enableV4', 'true')
        hadoopConf = sc._jsc.hadoopConfiguration()
        hadoopConf.set("fs.s3a.awsAccessKeyId", aws_access_key_id)
        hadoopConf.set("fs.s3a.awsSecretAccessKey", aws_secret_access_key)
        hadoopConf.set("fs.s3a.endpoint", "s3.amazonaws.com")
        hadoopConf.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
    else:
        sc = SparkContext()

    spark = SparkSession(sc)
    print(f'Reading from {source}!')
    print(f'Writing to {output_path}')
    run_etl(source, output_path, spark=spark)

def init_spark_context(details=[]):
    global spark_context
    if spark_context:
        return
    build_type = yb_dist_tests.global_conf.build_type
    from pyspark import SparkContext
    # We sometimes fail tasks due to unsynchronized clocks, so we should tolerate a fair number of
    # retries.
    # https://stackoverflow.com/questions/26260006/are-failed-tasks-resubmitted-in-apache-spark
    # NOTE: we never retry failed tests to avoid hiding bugs. This failure tolerance mechanism
    # is just for the resilience of the test framework itself.
    SparkContext.setSystemProperty('spark.task.maxFailures', str(SPARK_TASK_MAX_FAILURES))
    spark_master_url = g_spark_master_url_override
    if spark_master_url is None:
        if is_macos():
            logging.info("This is macOS, using the macOS Spark cluster")
            spark_master_url = SPARK_URLS['macos']
        elif yb_dist_tests.global_conf.build_type in ['asan', 'tsan']:
            logging.info(
                "Using a separate Spark cluster for ASAN and TSAN tests")
            spark_master_url = SPARK_URLS['linux_asan_tsan']
        else:
            logging.info(
                "Using the regular Spark cluster for non-ASAN/TSAN tests")
            spark_master_url = SPARK_URLS['linux_default']
    logging.info("Spark master URL: %s", spark_master_url)
    spark_master_url = os.environ.get('YB_SPARK_MASTER_URL', spark_master_url)
    details += [
        'user: {}'.format(getpass.getuser()),
        'build type: {}'.format(build_type)
    ]
    if 'BUILD_URL' in os.environ:
        details.append('URL: {}'.format(os.environ['BUILD_URL']))
    spark_context = SparkContext(spark_master_url, "YB tests ({})".format(', '.join(details)))
    spark_context.addPyFile(yb_dist_tests.__file__)

def main(test_file, model_file, output_file):
    SparkContext.setSystemProperty('spark.executor.memory', '4g')
    SparkContext.setSystemProperty('spark.driver.memory', '4g')
    sc = SparkContext.getOrCreate()
    start = time.time()

    sc.broadcast(sc.textFile(model_file, 42).saveAsTextFile('task2_model'))
    model = sc.broadcast(sc.textFile('task2_model', 42).map(json.loads).collectAsMap())
    shutil.rmtree('task2_model')

    data = sc.textFile(test_file).map(json.loads)
    profiles = data.map(lambda d: predict(d, model)).filter(
        lambda d: d["sim"] >= 0.01).map(json.dumps)
    json_string = profiles.reduce(lambda x, y: x + "\n" + y)

    # write the collected predictions to a file
    with open(output_file, "w") as f:
        f.write(json_string)

    print("Duration:", time.time() - start)

def main(train_file, model_file, stopwords_file):
    SparkContext.setSystemProperty('spark.executor.memory', '4g')
    SparkContext.setSystemProperty('spark.driver.memory', '4g')
    sc = SparkContext.getOrCreate()
    start = time.time()

    stopwords = {s for s in sc.textFile(stopwords_file).collect()}
    reviews = (sc.textFile(train_file).map(
        json.loads).map(lambda d: tokenize(d, stopwords)).persist(
            StorageLevel(True, True, False, False)))
    n = reviews.count()

    # calculating number of documents the term appears in
    dfs = (reviews.flatMap(lambda d: d["tokens"]).map(
        lambda t: (t, 1)).reduceByKey(add).collectAsMap())
    idfs = {k: math.log(n / v) for k, v in dfs.items()}

    def add_key_prefix(rdd, prefix):
        return rdd.map(lambda x: ("{}_{}".format(prefix, x[0]), x[1]))

    business_profiles = build_profile(reviews, "business_id", idfs)
    user_profiles = (reviews.map(lambda d: (d["business_id"], d["user_id"])).
                     join(business_profiles).values().aggregateByKey({}, merge, merge))
    add_key_prefix(user_profiles, "u").union(
        add_key_prefix(business_profiles, "b")).map(json.dumps).saveAsTextFile('task2_model')

    with open(model_file, 'wb') as outfile:
        for filename in glob.glob('task2_model/part*'):
            with open(filename, 'rb') as readfile:
                shutil.copyfileobj(readfile, outfile)
    shutil.rmtree('task2_model')
    print("Duration:", time.time() - start)

def create_spark_sql_context(app_name):
    """
    Instantiates spark and sql contexts. If executed twice, it will return the first instance.

    :param app_name: name of the app to assign to the created spark context.
    :return:
    """
    # Initialize Spark and SQL context
    # set spark config and master
    conf = copy(spark_conf)
    conf.setMaster(spark_master).setAppName(app_name)
    # set PROFILE environment in executors
    conf.setExecutorEnv('PROFILE', os.environ.get('PROFILE'))
    # set spark system properties
    for k, v in spark_sys_properties.items():
        SparkContext.setSystemProperty(k, v)
    sc = SparkContext(conf=conf)
    if sc is None:
        raise SparkClusterException("Unable to instantiate SparkContext")
    # Adding spark_helpers.zip to SparkContext so that workers can load modules from spark_helpers
    # http://apache-spark-user-list.1001560.n3.nabble.com/Loading-Python-libraries-into-Spark-td7059.html
    tmp_dir = tempfile.mkdtemp()
    sc.addPyFile(
        shutil.make_archive(base_name='{}/spark_cluster_pkg'.format(tmp_dir),
                            format='zip',
                            root_dir=os.path.abspath(path_pkg)))
    sq = SQLContext(sc)
    if sq is None:
        raise SparkClusterException("Unable to instantiate SQLContext")
    return sc, sq

def setup_spark_connection():
    """
    Set up Spark connection
    @input None
    @output sqlconn (a HiveContext object)
    """
    # check Spark Expiration date
    SparkContext.setSystemProperty("hive.metastore.uris",
                                   "thrift://z9awsspsyn2m52.celgene.com:9083")
    # set up connection
    sparkconf = SparkConf().setAppName("upload_data")
    sc = SparkContext(conf=sparkconf)
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)
    sqlconn = HiveContext(sc)
    return sqlconn

def get_sc(row_id_str, sp_master, exe_memory, core_max):
    from pyspark import SparkContext
    from pyspark.sql import SQLContext
    from pyspark.mllib.util import MLUtils
    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.classification import SVMWithSGD, SVMModel
    from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionWithSGD, LogisticRegressionModel
    from pyspark.mllib.linalg import SparseVector
    from pyspark.mllib.evaluation import BinaryClassificationMetrics
    from pyspark.mllib.tree import DecisionTree
    from pyspark.mllib.clustering import KMeans, KMeansModel, GaussianMixture, GaussianMixtureModel
    from pyspark.mllib.linalg import Vectors

    SparkContext.setSystemProperty('spark.rdd.compress',
                                   config.get('spark', 'spark_rdd_compress'))
    SparkContext.setSystemProperty('spark.driver.maxResultSize',
                                   config.get('spark', 'spark_driver_maxResultSize'))
    SparkContext.setSystemProperty('spark.executor.memory', exe_memory)
    SparkContext.setSystemProperty('spark.cores.max', core_max)

    sc = SparkContext(sp_master, 'single_predict:' + row_id_str)
    return sc

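# A minimal usage sketch for get_sc, assuming the module-level `config` object it
# reads is set up elsewhere; the row id, master URL, and resource sizes below are
# illustrative placeholders.
# sc = get_sc('42', 'spark://spark-master:7077', '4g', '8')
# ... run MLlib training / prediction against sc ...
# sc.stop()
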
def main():
    # parse arguments
    print("INFO: creating parquet ...")
    args = read_args(sys.argv[1:])

    SparkContext.setSystemProperty('spark.rdd.compress',
                                   config.get('spark', 'spark_rdd_compress'))
    SparkContext.setSystemProperty(
        'spark.driver.maxResultSize',
        config.get('spark', 'spark_driver_maxResultSize'))
    SparkContext.setSystemProperty('spark.executor.memory', args.exe_memory)
    SparkContext.setSystemProperty('spark.cores.max', args.core_max)

    sc = SparkContext(args.sp_master, 'parquet_creator:' + str(args.row_id))
    sqlCtx = SQLContext(sc)

    # load json obj from file to srdd
    ifname = args.hd_master + os.path.join(args.src_dir, args.src_files)
    df = sqlCtx.read.json(ifname)

    out_fname = args.hd_master + os.path.join(args.src_dir, args.out_dir)
    print("INFO: out_dir=" + args.out_dir)

    # clean up existing hdfs file
    try:
        hdfs.rmr(out_fname)
    except:
        e = sys.exc_info()[0]
        print("WARNING: ", e)

    # convert dataframe format
    print("INFO: SQL=", args.sql_script)
    df.registerTempTable(args.tblname)
    df2 = sqlCtx.sql(args.sql_script)

    # save as parquet
    df2.write.parquet(out_fname)
    df2.printSchema()

from pyspark import SparkConf, SparkContext
SparkContext.setSystemProperty("hadoop.home.dir", "C:\\spark-1.5.1-bin-hadoop2.6\\")
import sys, pickle, math
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('random-forest')
sc = SparkContext(conf=conf)
input = sys.argv[1]


# Load and parse the data
def parsePoint(line):
    return LabeledPoint(float(line[1]), line[0])


train = sc.pickleFile(input + '/bow_train/part-00000')
test = sc.pickleFile(input + '/bow_test/part-00000')
parsedtrain = train.map(parsePoint).filter(lambda line: len(line.features) != 0 or len(line.label) != 0)
parsedtest = test.map(parsePoint).filter(lambda line: len(line.features) != 0 or len(line.label) != 0).cache()

model = GradientBoostedTrees.trainRegressor(parsedtrain, categoricalFeaturesInfo={}, numIterations=1)
predictions = model.predict(parsedtest.map(lambda x: x.features))
labelsAndPredictions = parsedtest.map(lambda lp: lp.label).zip(predictions)
val_err = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(parsedtest.count())
parsedtest.unpersist()
RMSE = math.sqrt(val_err)
print("Root Mean Squared Error Test= " + str(RMSE))

from math import radians, sin, cos, atan2
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator


def distance(long1, lat1, long2, lat2):
    radius = 6371
    diff_lat = radians(lat2 - lat1)
    diff_long = radians(long2 - long1)
    a = sin(diff_lat / 2)**2 + cos(lat1) * cos(lat2) * sin(diff_long / 2)**2
    c = 2 * atan2(a**0.5, (1 - a)**0.5)
    return radius * c


if __name__ == "__main__":
    SparkContext.setSystemProperty("spark.executor.memory", "12g")
    spark = SparkSession.builder.appName("RegressionTree").getOrCreate()

    # Load up data as dataframe
    data = spark.read.csv(
        "/Users/rafaelchen/Documents/MapReduce/hw2 decision tree/src/train.csv",
        header=True)

    # Data preprocessing
    data = data.withColumn(
        "pickup_longitude", data["pickup_longitude"].cast("float")).withColumn(
        "pickup_latitude", data["pickup_latitude"].cast("float")).withColumn(
        "dropoff_longitude", data["dropoff_longitude"].cast("float")).withColumn(
        "dropoff_latitude", data["dropoff_latitude"].cast("float")).withColumn(

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StringType, IntegerType, LongType
import couchdb
import requests
import json
import ast

SparkContext.setSystemProperty('spark.driver.memory', '1g')
SparkContext.setSystemProperty('spark.driver.cores', '3')
SparkContext.setSystemProperty('spark.executor.memory', '1g')
SparkContext.setSystemProperty('spark.executor.cores', '3')
SparkContext.setSystemProperty('spark.driver.memoryOverhead', '1g')
SparkContext.setSystemProperty('spark.storage.memoryFraction', '0.9')
SparkContext.setSystemProperty('spark.sql.codegen.wholeStage', 'false')

spark = SparkSession.builder.appName("Movies: DATA PREP")\
    .config("dfs.client.read.shortcircuit.skip.checksum", "true")\
    .getOrCreate()

print("##################################### LOADING MOVIES DATASET")
### LOAD MOVIES DATASET
movies_df = spark.read.csv('ml-latest/movies.csv', header=True, inferSchema=True) \
    .select('movieId', 'genres').rdd.flatMapValues(lambda x: x.split("|")) \
    .toDF(['IdMovie', 'genres'])

print("##################################### GATHERING DATA FROM COUCHDB")
response = requests.get(
    "http://SERVER:5984/moviesdetails/_all_docs?include_docs=true")
y = [response.text][0]

    # >>> excute_sql = job_day(date= job_date,module_sql= module_sql)
    # >>> print("excute_sql:",excute_sql)
    "excute_sql": " select * from db.table where day between '2019-03-01' and '2019-03-31' "
    """
    date_start = date.strftime('%Y-%m-%d')
    monthrange = calendar.monthrange(date.year, date.month)[1]
    date_end = (datetime.datetime(date.year, date.month, 1) +
                datetime.timedelta(monthrange - 1)).strftime('%Y-%m-%d')
    moudle_sql = moudle_sql.replace('__DAY1__', str(date_start)).replace(
        '__DAY2__', str(date_end))
    return moudle_sql


if __name__ == "__main__":
    SparkContext.setSystemProperty("hive.metastore.uris", "thrift://hdp-0:9083")
    sparksession = (SparkSession.builder.appName(
        'device_increase').enableHiveSupport().getOrCreate())

    # spark-submit passes a time-type argument for computing daily, weekly, or monthly stats: day, week, month
    parser = argparse.ArgumentParser()
    parser.add_argument('time_type')
    parser.add_argument('excute_day')  # add_argument() declares the command-line options the program accepts
    args = parser.parse_args()
    time_type = args.time_type
    excute_date = parse(str(args.excute_day))

    day_sql = """
        select t1.*

from pyspark.sql import Row
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import json
from pyspark.sql import SparkSession, SQLContext
import redis
from pyspark import SparkConf, SparkContext
from pyspark import SparkContext
from config.config import KAFKA_NODES

SparkContext.setSystemProperty("spark.cassandra.connection.host",
                               'ec2-54-85-200-216.compute-1.amazonaws.com')

"""
processPartition computes the spread among different exchanges
and returns the best exchange at the discrete timestamp
"""


def processPartition(partition, table, keyspace, sc):
    if partition.isEmpty():
        return
    else:
        spark = SparkSession(sc)

        def f(accum, x):
            if ('asks' in list(accum.keys()) and 'asks' in list(x.keys())):
                if ((float(accum['bids']) - float(accum['asks'])) <
                        (float(x['bids']) - float(x['asks']))):
                    return accum
                else:
                    return x

import time
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf
from nvludfs import *


@nvl("(long,long,long)->long", tupargs=True)
def dot_prod(a, b, c):
    return 3 * a + 2 * b + c


SparkContext.setSystemProperty("useNvl", "true")
SparkContext.setSystemProperty("offHeap", "true")
SparkContext.setSystemProperty("pythonNvl", "true")

conf = (SparkConf()
        .setMaster("local")
        .setAppName("udf_example")
        .set("spark.executor.memory", "2g"))
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

dot_udf = udf(dot_prod, LongType())
df = sqlContext.read.parquet("assembly/udf-test-s").cache()

times = []
for i in range(0, 11):
    t = time.time()
    df.withColumn("udf", dot_udf(df['a'], df['b'], df['c'])).selectExpr("sum(udf)").show()
    times.append(time.time() - t)

print("average time: " + str(sum(times[1:]) / 10.0))

# -*- coding: utf-8 -*-
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, HiveContext
from pyspark.sql import functions as F
import pyspark.sql.types as T

SparkContext.setSystemProperty(
    "hive.metastore.uris",
    "thrift://bdcsceprod-bdcsce-1.compute-590737110.oraclecloud.internal:9083"
)  # nn1:9083
# hive metastore dir: /apps/hive/warehouse

sparkSession = (SparkSession.builder.appName(
    'hive_connection').enableHiveSupport().getOrCreate())
sparkSession.sparkContext.setLogLevel("ERROR")
hive_command = sparkSession.sql('USE MEDIX')

# %% update the STAGE tables with respect to the current cycle
# CUP = sparkSession.read.option('header','true').csv('hdfs:///user/oracle/medix/auditoria/CUP/cup_medibutil_rev.csv')
# for x in list(range(1,16)):
#     CUP = CUP.withColumn('val'+str(x), CUP['val'+str(x)].cast('Int'))

# %% catalog
catalogo = sparkSession.sql('SELECT * FROM STAGE_AUDITORIA_CATALOGO')
rutas = sparkSession.sql('SELECT * FROM EXT_TABLE_AUDITORIA_RUTAS')
Ct = catalogo.alias('Ct')
catalogo = catalogo.join(rutas, Ct.clo_brick_num == rutas.ar_brick_num, how='left')\
    .select(F.col('clo_brick_num').alias('Brick_num'),
            F.col('clo_cdg_postal_num').alias('Cdg_postal'),
            F.col('clo_tipo_asent').alias('Tipo_asentamiento'),

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.ml.recommendation import ALSModel
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml import Pipeline
from pyspark.sql.functions import explode
from functools import reduce
from pyspark.sql import DataFrame

SparkContext.setSystemProperty('spark.driver.memory', '512m')
SparkContext.setSystemProperty('spark.driver.cores', '1')
SparkContext.setSystemProperty('spark.executor.memory', '2560m')  # 2560m
SparkContext.setSystemProperty('spark.executor.cores', '8')
# SparkContext.setSystemProperty('spark.executor.memoryOverhead', '1536m')
SparkContext.setSystemProperty("spark.scheduler.mode", "FAIR")
SparkContext.setSystemProperty('spark.memory.fraction', '0.8')
SparkContext.setSystemProperty('spark.memory.storageFraction', '0.1')
SparkContext.setSystemProperty("spark.default.parallelism", "256")
SparkContext.setSystemProperty("spark.num.executors", "1")
SparkContext.setSystemProperty("spark.local.dir", "/tmp")

conf = SparkConf().setAppName('MoviesRec: Preditcions')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
sc.setCheckpointDir('/ML/movies/checkpoint/')

df = sqlContext.read.load(path='/ML/movies/data/*',
                          format='com.databricks.spark.csv',
                          delimiter=',',

from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

# Initialize Spark
SparkContext.setSystemProperty("spark.executor.memory", "4g")
conf = SparkConf()
conf.set("spark.executor.instances", 20)
sc = SparkContext("yarn-client", "kdd99", conf=conf)
hc = HiveContext(sc)

kdd = hc.table("kdd99")
(trainData, testData) = kdd.randomSplit([0.7, 0.3], seed=42)
trainData.cache()
services = trainData.withColumnRenamed("service", "srvc").select("srvc").distinct()
# filter out any rows with a service not trained upon
testData = testData.join(services, testData.service == services.srvc)
testData.cache()

print("training set has " + str(trainData.count()) + " instances")
print("test set has " + str(testData.count()) + " instances")

# Build model
inx1 = StringIndexer(inputCol="protocol", outputCol="protocol-cat")
inx2 = StringIndexer(inputCol="service", outputCol="service-cat")
inx3 = StringIndexer(inputCol="flag", outputCol="flag-cat")
inx4 = StringIndexer(inputCol="is_anomaly", outputCol="label")
ohe2 = OneHotEncoder(inputCol="service-cat", outputCol="service-ohe")

    GatherStates()
    CalculateConditionScores()
    CalculateReadmissionAndDeathScore()
    CalculateFinalScores()
    SanityCheckScores()

    finalScores = sqlContext.sql("""
        SELECT st.State, s.ConditionScoreAverage, s.ReAdmissionAndDeathScore,
               (s.ConditionScoreAverage + s.ReAdmissionAndDeathScore) / 2 as FinalScore
        FROM score_data_tmp s
        JOIN states_tmp st on st.State = s.State
        ORDER BY FinalScore DESC
        """)
    finalScores.show(numberToShow, False)
    return


#
# Main body
#

# Set up spark environment, requesting a little more memory than the default
SparkContext.setSystemProperty('spark.executor.memory', '4g')
sc = SparkContext("local", "transformer app")
sqlContext = HiveContext(sc)

ShowTopStates(10)

# Students: A. Romriell, D. Wen, J. Pastor, J. Pollard
# MSAN 694 Project

from pyspark import SparkContext
SparkContext.setSystemProperty('spark.executor.memory', '45g')
sc = SparkContext("local", "arXiv")

from pyspark.mllib.clustering import PowerIterationClustering


def parse_meta(line):
    """Parse a metadata line of the form id|subj|dt|title into a dict."""
    pieces = line.strip().split("|")
    return {"id": pieces[0], "subj": pieces[1], "dt": pieces[2], "title": pieces[3]}


def get_paper_subj(d):
    """Return an (id, subj) pair from a parsed metadata record."""
    return (d["id"], d["subj"])


def parse_auth(line):
    """Parse an author line of the form id:author1|author2|... into a dict."""
    paper_id, the_authors = line.strip().split(":", 1)
    authors = the_authors.strip().split("|")
    return {"id": paper_id, "authors": authors}

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author [email protected]
import os
import sys

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")

from pyspark import SQLContext, HiveContext
from pyspark import SparkContext

import ta

if __name__ == "__main__":
    sc = SparkContext(appName="bintrade_candidate", master="yarn-client")
    sc.setSystemProperty("spark.driver.memory", "1g")
    sc.setSystemProperty("spark.executor.memory", "8g")
    sc.setSystemProperty("spark.executor.instances", "8")
    sc.setSystemProperty("spark.executor.cores", "4")

    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "32")
    sqlContext.sql("use fex")

    ta.run(sc, sqlContext, isHive=True)

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author [email protected]
import os
import sys

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")

from pyspark import SQLContext, HiveContext
from pyspark import SparkContext

import eod

if __name__ == "__main__":
    sc = SparkContext(appName="bintrade_candidate", master="yarn-client")
    sc.setSystemProperty("spark.driver.memory", "1g")
    sc.setSystemProperty("spark.executor.memory", "8g")
    sc.setSystemProperty("spark.executor.cores", "2")

    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "16")
    sqlContext.sql("use fex")

    eod.run(sc, sqlContext, isHive=True)

def sc_start(app):
    global sc
    SparkContext.setSystemProperty("spark.port.maxRetries", "100")
    SparkContext.setSystemProperty("spark.ui.enabled", "false")
    SparkContext.setSystemProperty("spark.task.cpus", "2")
    SparkContext.setSystemProperty("spark.driver.memory", "100g")
    SparkContext.setSystemProperty("spark.driver.maxResultSize", "20g")
    SparkContext.setSystemProperty("spark.driver.cores", "4")
    SparkContext.setSystemProperty("spark.executor.instances", "25")
    sc = SparkContext.getOrCreate()

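# A minimal usage sketch for sc_start, with placeholder values. Spark reads these
# system properties when the SparkContext is constructed, so they must be set (as
# above) before SparkContext.getOrCreate() is called to take effect.
# sc_start("example_app")
# print(sc.parallelize(range(10)).sum())
# sc.stop()
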
    client1.close()
    logger.add(
        'shopid={shopid} ,cid={cid} et_update_list get Subscription {c}.......'
        .format(shopid=shopid, cid=cid, c=len(filters)))

    merge_path = "/user/athena/{}/meb_attribute/{}".format(shopid, lsdate)
    if not util_hadoop.PathIsExit(merge_path):
        ls = util_hadoop.GetPath(merge_path.rsplit("/", 1)[0])
        lsdate = max(ls).rsplit('/', 1)[1]
        merge_path = "/user/athena/{}/meb_attribute/{}".format(shopid, lsdate)
        logger.add(
            'shopid={shopid} ,cid={cid} et_update_list get ta by last date {lsdate}.......'
            .format(shopid=shopid, cid=cid, lsdate=lsdate))

    SparkContext.setSystemProperty('spark.cores.max', '56')
    sc = SparkContext(appName="et_update_list_{}".format(shopid))

    def mapp(l):
        npt = l.split("\t")[12] if l.split("\t")[12] != 'cindy1' else 'S3'
        return {
            'memberid': l.split("\t")[1],
            'tag': l.split("\t")[4],
            'channel': [1, 0, 0],
            'npt': npt
        }

    member_ls = sc.textFile(merge_path, 30).filter(
        lambda l: l.split("\t")[0] == shopid and l.split("\t")[3] == "L7D" and l.split("\t")[1] in filters) \
        .map(lambda l: mapp(l)).collect()

    logger.add(

#!/usr/bin/env python
import matplotlib.pyplot as plot
import csv
from ast import literal_eval
from pyspark import SparkContext, SparkConf, StorageLevel
from operator import add

"""
--------------------------------------------------------
SPARK CONFIGURATION
Used only for standalone execution via bin/spark-submit
--------------------------------------------------------
"""
SparkContext.setSystemProperty("spark.executor.memory", "28g")
SparkContext.setSystemProperty("spark.default.parallelism", "500")
conf = (SparkConf()
        .setMaster("local")
        .setAppName("Uptime per machine")
        .set("spark.worker.memory", "28g")
        .set("spark.driver.memory", "28g")
        .set("spark.local.dir", "/Users/ksmuga/workspace/data/out"))
sc = SparkContext(conf=conf)

"""
--------------------------------------------------------
FIRST MAPPING TRANSFORMATION

    df = pd.DataFrame(names, columns=["name", "count"])
    data = go.Data([go.Bar(x=df["name"], y=df["count"])])
    layout = go.Layout(title='Top 10 names who liked my posts most')
    fig = go.Figure(data=data, layout=layout)
    py.offline.plot(
        fig,
        filename="/Users/sunling/MUM/BDT/project/BGFacebook/output/test/" +
        fpa_conf.user + "_top10names.html",
        auto_open=False)


if __name__ == "__main__":
    sc = SparkContext(appName="CS523FinalProject")
    sc.setLogLevel("ERROR")
    sc.setSystemProperty("hive.metastore.uris", "")
    ssc = StreamingContext(sc, 10)
    print("start reading data from kafka...")
    kvs = KafkaUtils.createDirectStream(
        ssc, [fpa_conf.topic], {"metadata.broker.list": fpa_conf.brokers})
    parsed = kvs.map(lambda v: json.loads(v[1])).flatMap(
        lambda post: post.values())
    if parsed is not None:
        print("start analyzing...")
        hc = getHiveContextInstance(sc)
        hc.sql("drop table if exists t_posts")
        posts = parsed.map(lambda r: (r['id'], r['message'], r['created_time'],
                                      r['likes'], r['comment_count'], r['like_names']))

# -*- coding: utf-8 -*-
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import HiveContext
import requests
import json

SparkContext.setSystemProperty('spark.executor.memory', '10g')
SparkContext.setSystemProperty("spark.executor.cores", '4')


class SparkHiveExample:

    def __init__(self):
        ## initialize spark session
        self.spark = SparkSession.builder.appName("Spark Hive example").enableHiveSupport().getOrCreate()

    def run(self):
        ## download with opendata API
        url = "http://data.coa.gov.tw/Service/OpenData/ODwsv/ODwsvTravelFood.aspx?"
        data = requests.get(url)

        ## convert from JSON to dataframe
        df = self.spark.createDataFrame(data.json())

        ## display schema
        df.printSchema()

        ## creates a temporary view using the DataFrame
        df.createOrReplaceTempView("travelfood")
