import os
from pyspark import SparkConf, SparkContext


def get_spark_context(workers="*", driver_memory=None, executor_memory=None):
    """
    Set up a local Spark context configured for use with SQL Server and AWS S3.
    """
    # We need some libraries (jars) to connect to SQL Server and S3, so add every
    # jar in this directory to the driver class path.
    jar_dir = r"C:\Jars"
    files = os.listdir(jar_dir)
    jars = [f for f in files if f.lower().endswith(".jar")]
    extra_class_path = ";".join([os.path.join(jar_dir, j) for j in jars])

    # Set up the Spark context.
    conf = SparkConf().setMaster(f"local[{workers}]") \
        .set("spark.driver.extraClassPath", extra_class_path) \
        .set("spark.executor.heartbeatInterval", "60s")
    if driver_memory:
        conf.set("spark.driver.memory", driver_memory)
    if executor_memory:
        conf.set("spark.executor.memory", executor_memory)
    spark_context = SparkContext(conf=conf)

    # Configure the S3 endpoint because our buckets are in London (eu-west-2).
    spark_context.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
    spark_context._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3.eu-west-2.amazonaws.com")
    return spark_context

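# A minimal usage sketch for get_spark_context (not part of the original snippet):
# the worker count, memory setting and S3 path below are placeholders, and it
# assumes the hadoop-aws and AWS SDK jars are present in C:\Jars.
sc = get_spark_context(workers=4, driver_memory="4g")
lines = sc.textFile("s3a://example-bucket/some/prefix/data.csv")
print(lines.take(5))
sc.stop()
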
import datetime as dt
from uuid import uuid4

from pyspark import SparkConf
from pyspark_cassandra import CassandraSparkContext


def run_driver(keyspace, table, cass_host):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", cass_host)
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print(pixels.first())

    # Count unique visitors. Note that the data returned by Cassandra is
    # dict-like: you can access partition keys, clustering keys and regular
    # columns by name. CQL collections (lists, sets and maps) are converted
    # to the corresponding Python types.
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],)) \
        .distinct()
    print("Visitors: {:,}".format(visitors.count()))

    # Insert some new pixels into the table
    pixels = (
        {
            "customer_id": "example.com",
            "url": "http://example.com/article1/",
            "hour": dt.datetime(2014, 1, 2, 1),
            "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
            "pixel_id": str(uuid4()),
            "data": {"visitor_id": "xyz"},
        },
    )
    # saveToCassandra is assumed to be provided by pyspark_cassandra elsewhere in the original module.
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print("Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table))

import datetime as dt
from uuid import uuid4

from pyspark import SparkConf
from pyspark_cassandra import CassandraSparkContext


def run_driver(keyspace, table):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", "127.0.0.1")
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print(pixels.first())

    # Count unique visitors. Note that the data returned by Cassandra is
    # dict-like: you can access partition keys, clustering keys and regular
    # columns by name. CQL collections (lists, sets and maps) are converted
    # to the corresponding Python types.
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],)) \
        .distinct()
    print("Visitors: {:,}".format(visitors.count()))

    # Insert some new pixels into the table
    pixels = (
        {
            "customer_id": "example.com",
            "url": "http://example.com/article1/",
            "hour": dt.datetime(2014, 1, 2, 1),
            "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
            "pixel_id": str(uuid4()),
            "data": {"visitor_id": "xyz"},
        },
    )
    # saveToCassandra is assumed to be provided by pyspark_cassandra elsewhere in the original module.
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print("Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table))

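# Hypothetical entry point for the sample driver above (not in the original
# snippet); the keyspace and table names are placeholders.
if __name__ == "__main__":
    run_driver("example_keyspace", "pixels")
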
import time

from pyspark import SparkConf, SparkContext


def initialize():
    global sc, spark, items, inputfile, buckets_user, buckets_business, partition, totalSize, t, mainThreshold
    print("Initializing...")
    t = time.time()
    candidateList = []
    frequentList = []

    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")

    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: line != columnName)
    buckets_user = items.groupByKey().mapValues(list).filter(
        lambda x: len(x[1]) > mainThreshold).mapPartitionsWithIndex(
        removeDuplicateEntriesAfter)
    print("Without duplicates done...")
    # withoutDuplicates = checkM.mapPartitionsWithIndex(
    #     removeDuplicateEntries).groupByKey().mapValues(list)

    if case == 1:
        # buckets_user = withoutDuplicates.mapPartitionsWithIndex(
        #     createBuckets).groupByKey().mapValues(list).filter(lambda x: len(x[1]) > mainThreshold)
        callSonPhase1(buckets_user)
        print("Initializing Phase 2...")
        finalFreq = buckets_user.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))
        # print(finalFreq.collect())
        finalOutput = finalFreq.collect()
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        # print(x)
        printingFreq(x)

    if case == 2:
        # NOTE: this branch relies on `withoutDuplicates`, which is only built in the
        # commented-out block above, so it will fail unless that step is restored.
        buckets_business = withoutDuplicates.mapPartitionsWithIndex(
            createBuckets_case2).groupByKey().mapValues(list)
        callSonPhase1(buckets_business)
        print("Initializing Phase 2...")
        finalFreq = buckets_business.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))
        # print(finalFreq.collect())
        finalOutput = finalFreq.collect()
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        # print(x)
        printingFreq(x)

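# Hypothetical driver code for initialize() above (not in the original snippet):
# the input path, case number and thresholds are placeholders and are assumed to
# be supplied elsewhere in the original script.
inputfile = "data/example.csv"
case = 1
mainThreshold = 0   # minimum bucket size kept when building baskets (illustrative)
threshold = 4       # support threshold applied in phase 2 (illustrative)
initialize()
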
import json

from pyspark import SparkConf, SparkContext


def initialize():
    global sc, spark, items, inputfile
    print("Initializing...")

    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")

    # Each line of the input file is a JSON document.
    jsonread = sc.textFile(inputfile)
    items = jsonread.map(json.loads)

from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession


def get_spark():
    conf = SparkConf()

    # Load a jar that provides extended string comparison functions such as
    # Jaro-Winkler, used by Splink.
    # No longer needed in Spark 3.0?
    # conf.set("spark.driver.extraClassPath", "jars/scala-udf-similarity-0.0.7.jar")
    conf.set("spark.jars", "jars/scala-udf-similarity-0.0.7.jar")
    conf.set("spark.jars.packages", "graphframes:graphframes:0.8.0-spark3.0-s_2.12")

    # WARNING: these config options are appropriate only if you're running Spark locally!
    conf.set("spark.driver.memory", "4g")
    conf.set("spark.sql.shuffle.partitions", "8")

    sc = SparkContext.getOrCreate(conf=conf)
    sc.setCheckpointDir("temp_graphframes/")
    spark = SparkSession(sc)

    # Register UDFs
    from pyspark.sql import types

    spark.udf.registerJavaFunction(
        "jaro_winkler_sim",
        "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
        types.DoubleType(),
    )
    spark.udf.registerJavaFunction(
        "Dmetaphone",
        "uk.gov.moj.dash.linkage.DoubleMetaphone",
        types.StringType(),
    )
    return spark

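# Hypothetical usage of the UDFs registered above (not in the original snippet);
# the DataFrame contents and column names are placeholders, and it assumes the
# similarity jar is actually available at the configured path.
spark = get_spark()
df = spark.createDataFrame(
    [("Robert", "Rupert"), ("Susan", "Suzanne")], ["name_l", "name_r"]
)
df.createOrReplaceTempView("names")
spark.sql(
    "SELECT name_l, name_r, jaro_winkler_sim(name_l, name_r) AS sim FROM names"
).show()
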
from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession


def get_spark():
    conf = SparkConf()

    # Load jars that provide extended string comparison functions such as
    # Jaro-Winkler, used by Splink.
    # No longer needed in Spark 3.0?
    # conf.set("spark.driver.extraClassPath", "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-similarity-0.0.7.jar")
    # conf.set("spark.driver.extraClassPath", "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-JaroWinkler-0.0.1.jar")

    # SET TO YOUR SPARK INSTALLATION. Note that calling conf.set("spark.jars", ...)
    # twice overwrites the first value, so both jars go in one comma-separated list.
    conf.set(
        "spark.jars",
        "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-similarity-0.0.7.jar,"
        "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-JaroWinkler-0.0.1.jar"
    )

    # WARNING: these config options are appropriate only if you're running Spark locally!
    conf.set("spark.driver.memory", "1g")
    conf.set("spark.sql.shuffle.partitions", "4")
    # conf.set("spark.sql.files.maxPartitionBytes", "536870912")
    # conf.set("spark.sql.files.maxPartitionBytes", "250000000")
    # conf.set("spark.sql.files.maxPartitionBytes", "134217728")

    sc = SparkContext.getOrCreate(conf=conf)
    sc.setCheckpointDir("temp_graphframes/")
    spark = SparkSession(sc)

    # Register UDFs
    from pyspark.sql import types
    '''
    spark.udf.registerJavaFunction(
        "jaro_winkler_sim",
        "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
        types.DoubleType(),
    )
    spark.udf.registerJavaFunction(
        "Dmetaphone", "uk.gov.moj.dash.linkage.DoubleMetaphone", types.StringType()
    )
    '''
    return spark

import pyspark
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext  # , HiveContext
# from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()
# conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf=conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testTogether")

# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#   SELECT url,date,site,cnts,cnt from cockpit2_allTogether
#   where `date` = d and site = giga and tags contains resort:android
#
#   after this query, every row has to be updated with a new value for cnts:
#
#   UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes
#               + fb_socialFacebookShares + fb_socialFacebookComments
#               + tw_socialTwitterShares + ga_socialGooglePlusShares
#               + gigya_socialComments


def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')

import sys

from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as f
from awsglue.dynamicframe import DynamicFrame
from awsglue.utils import getResolvedOptions
from awsglue.transforms import *
from awsglue.context import GlueContext
from awsglue.job import Job

args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME'])

conf = SparkConf()
conf.set("spark.sql.parquet.compression.codec", "snappy")
conf.set("spark.sql.parquet.writeLegacyFormat", "true")

# Pass the conf to the SparkContext so the parquet settings above take effect.
sc = SparkContext(conf=conf)
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

input_file_path = "s3://xxxxx"

df = spark.read.option("header", "true") \
    .csv(input_file_path)

import pyspark
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext  # , HiveContext
# from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()
# conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf=conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testIndexes")

# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#   SELECT url,date,site,cnts,cnt from cockpit2_allTogether
#   where `date` = d and site = giga and tags contains resort:android
#
#   after this query, every row has to be updated with a new value for cnts:
#
#   UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes
#               + fb_socialFacebookShares + fb_socialFacebookComments
#               + tw_socialTwitterShares + ga_socialGooglePlusShares
#               + gigya_socialComments


def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')

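# Hypothetical sketch of the date-range filter described in the comments above
# (not the original filterDateRage implementation): it assumes each Cassandra row
# exposes a datetime 'date' column and ignores the Europe/Berlin localisation that
# the original function appears to set up.
def filter_date_range_example(rows, _from, _to):
    # Keep only rows whose 'date' falls inside the inclusive range [_from, _to].
    return rows.filter(lambda row: _from <= row["date"] <= _to)

# e.g. the first ten days of October 2015:
# subset = filter_date_range_example(rdd, datetime(2015, 10, 1), datetime(2015, 10, 10))
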
TENSORFLOW_HADOOP = "preproc/data/tensorflow-hadoop-1.5.0.jar"

from IPython.display import display
import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT
from pyspark.context import SparkContext, SparkConf
from pyspark.sql.session import SparkSession

# SPARK_TEMP_FOLDER is assumed to be defined earlier in the original script.
conf = SparkConf().setMaster('local[*]') \
    .set('spark.executor.memory', '40g') \
    .set('spark.driver.memory', '200g') \
    .set("spark.local.dir", SPARK_TEMP_FOLDER)
conf.set("spark.jars", TENSORFLOW_HADOOP)
conf.set("spark.sql.files.maxPartitionBytes", 805306368)

sc = SparkContext(conf=conf)
spark = SparkSession(sc)

from pyspark.sql import Row
from pyspark.sql.types import ArrayType, BinaryType, DoubleType, LongType, StringType, StructField, StructType
from pyspark.sql.functions import col, when, log1p, udf

import numpy as np
import scipy.sparse
import math
import datetime
import time

from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType, StringType

conf = SparkConf()
conf.set('spark.driver.extraClassPath', 'jars/scala-udf-similarity-0.0.6.jar')
conf.set('spark.jars', 'jars/scala-udf-similarity-0.0.6.jar')

# Create the Spark context first, then wrap it in a SparkSession.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

udfs = [
    ('jaro_winkler_sim', 'JaroWinklerSimilarity', DoubleType()),
    ('jaccard_sim', 'JaccardSimilarity', DoubleType()),
    ('cosine_distance', 'CosineDistance', DoubleType()),
    ('Dmetaphone', 'DoubleMetaphone', StringType()),
    ('QgramTokeniser', 'QgramTokeniser', StringType()),
]

for a, b, c in udfs:
    spark.udf.registerJavaFunction(a, 'uk.gov.moj.dash.linkage.' + b, c)

import sys

from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.context import SparkContext, SparkConf

# creds is assumed to be loaded earlier (e.g. from a secrets store).
password = creds['password']
host = creds['host']

args = getResolvedOptions(sys.argv, ['JOB_NAME'])

partition_by_cols = ["year", "month"]
output_dir_path = "s3://bucket-name/orders_data_pyspark"

# Build the Spark conf before creating the context so these settings take effect.
conf = SparkConf()
conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
conf.set("spark.sql.parquet.compression.codec", "snappy")
conf.set("spark.sql.parquet.writeLegacyFormat", "true")

sc = SparkContext(conf=conf)
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

jdbc_url = "jdbc:mysql://" + host + ":3306/ecommerce_db"

sql_qry = """
(
    select *
        , year(order_purchase_timestamp) as year
        , month(order_purchase_timestamp) as month
    from orders
) as t
"""

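# Hypothetical continuation of the job above (not in the original snippet): read the
# query over JDBC and write the result to S3 partitioned by year/month. The MySQL
# driver class, the 'username' key in creds and the overwrite mode are assumptions.
df = spark.read.format("jdbc") \
    .option("url", jdbc_url) \
    .option("dbtable", sql_qry) \
    .option("user", creds["username"]) \
    .option("password", password) \
    .option("driver", "com.mysql.cj.jdbc.Driver") \
    .load()

df.write.mode("overwrite") \
    .partitionBy(partition_by_cols) \
    .parquet(output_dir_path)

job.commit()
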
EXPORT_ARG_TABLE_NAME = "TableName"
EXPORT_ARG_READ_PCT = "ReadPercentage"
EXPORT_ARG_PREFIX = "OutputPrefix"
EXPORT_ARG_FORMAT = "OutputFormat"

# The signature of a Glue python job is to run from __main__
if __name__ == '__main__':
    import sys

    from awsglue.utils import getResolvedOptions
    from pyspark.context import SparkContext, SparkConf
    from awsglue.context import GlueContext

    # Set up gzip compression for the job output.
    # TODO figure out why this isn't working
    conf = SparkConf()
    conf.set("spark.hadoop.mapred.output.compress", "true")
    conf.set("spark.hadoop.mapred.output.compression.codec",
             "org.apache.hadoop.io.compress.GzipCodec")
    conf.set("spark.hadoop.mapred.output.compression.type", "BLOCK")

    # Import arguments from the Glue invocation interface.
    args = getResolvedOptions(
        sys.argv,
        [
            'JOB_NAME',
            EXPORT_ARG_TABLE_NAME,
            EXPORT_ARG_READ_PCT,
            EXPORT_ARG_PREFIX,
            EXPORT_ARG_FORMAT
        ]
    )
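
    # Hypothetical continuation (not part of the original snippet): create the Glue
    # context and export the DynamoDB table to S3 using the parsed arguments. The
    # connection option names follow the AWS Glue DynamoDB connector; treat this as
    # an illustrative sketch rather than the original job body.
    sc = SparkContext.getOrCreate(conf=conf)
    glue_context = GlueContext(sc)

    table = glue_context.create_dynamic_frame.from_options(
        connection_type="dynamodb",
        connection_options={
            "dynamodb.input.tableName": args[EXPORT_ARG_TABLE_NAME],
            "dynamodb.throughput.read.percent": args[EXPORT_ARG_READ_PCT],
        },
    )

    glue_context.write_dynamic_frame.from_options(
        frame=table,
        connection_type="s3",
        connection_options={"path": args[EXPORT_ARG_PREFIX]},
        format=args[EXPORT_ARG_FORMAT],
    )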