Example #1
def get_spark_context(workers="*", driver_memory=None, executor_memory=None):
    """
    This function sets up a local Spark context, configured for use with SQL Server and AWS S3.
    """

    # Collect the JARs needed to connect to SQL Server and S3 and build the driver classpath.
    jar_dir = r"C:\Jars"

    files = os.listdir(jar_dir)

    jars = [f for f in files if f.lower().endswith(".jar")]

    extra_class_path = ";".join([os.path.join(jar_dir, j) for j in jars])

    # setup spark context
    conf = SparkConf().setMaster(f"local[{workers}]") \
        .set("spark.driver.extraClassPath", extra_class_path) \
        .set("spark.executor.heartbeatInterval", "60s")

    if driver_memory:
        conf.set("spark.driver.memory", driver_memory)

    if executor_memory:
        conf.set("spark.executor.memory", executor_memory)

    spark_context = SparkContext(conf=conf)

    # Our buckets are in London, so point S3A at the eu-west-2 endpoint and enable V4 signing.
    spark_context.setSystemProperty("com.amazonaws.services.s3.enableV4",
                                    "true")
    spark_context._jsc.hadoopConfiguration().set("fs.s3a.endpoint",
                                                 "s3.eu-west-2.amazonaws.com")

    return spark_context
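A brief usage sketch, not part of the original example: it assumes the JARs in C:\Jars include the S3A (hadoop-aws) and SQL Server JDBC drivers and that AWS credentials are available to the S3A connector; the bucket, server and table names below are made up.

from pyspark.sql import SparkSession

sc = get_spark_context(workers=4, driver_memory="4g", executor_memory="4g")
spark = SparkSession(sc)

# Read Parquet from S3 over s3a:// and append the result to a SQL Server table.
df = spark.read.parquet("s3a://example-bucket/some/prefix/")
df.write.jdbc(
    url="jdbc:sqlserver://example-host:1433;databaseName=example_db",
    table="dbo.example_table",
    mode="append",
    properties={"user": "spark_user", "password": "..."},
)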
Example #2
def run_driver(keyspace, table, cass_host):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", cass_host)
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print(pixels.first())

    # Count unique visitors. Each row returned by Cassandra is dict-like:
    # partition keys, clustering keys and regular columns are all accessible
    # by name, and CQL collections (lists, sets, maps) are converted to the
    # corresponding Python types.
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],))\
                .distinct()
    print("Visitors: {:,}".format(visitors.count()))

    # Insert some new pixels into the table
    pixels = ({
        "customer_id": "example.com",
        "url": "http://example.com/article1/",
        "hour": dt.datetime(2014, 1, 2, 1),
        "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
        "pixel_id": str(uuid4()),
        "data": {
            "visitor_id": "xyz"
        }
    }, )
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print("Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table))
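A hedged sketch of how a driver like this is typically launched; the argument handling and the spark-submit invocation are illustrative and not taken from the original sample, and the pyspark-cassandra package coordinates depend on the fork and version in use.

import sys
import datetime as dt
from uuid import uuid4

from pyspark import SparkConf
from pyspark_cassandra import CassandraSparkContext, saveToCassandra  # assumed module-level exports used by run_driver above

if __name__ == "__main__":
    # e.g. spark-submit --packages <pyspark-cassandra coordinates> driver.py pixels pixel_events 127.0.0.1
    keyspace, table, cass_host = sys.argv[1:4]
    run_driver(keyspace, table, cass_host)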
Example #3
def run_driver(keyspace, table):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", "127.0.0.1")
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print(pixels.first())

    # Count unique visitors. Each row returned by Cassandra is dict-like:
    # partition keys, clustering keys and regular columns are all accessible
    # by name, and CQL collections (lists, sets, maps) are converted to the
    # corresponding Python types.
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],))\
                .distinct()
    print("Visitors: {:,}".format(visitors.count()))

    # Insert some new pixels into the table
    pixels = (
        {
            "customer_id": "example.com",
            "url": "http://example.com/article1/",
            "hour": dt.datetime(2014, 1, 2, 1),
            "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
            "pixel_id": str(uuid4()),
            "data": {"visitor_id": "xyz"}
        },
    )
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print("Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table))
Example #4
def initialize():
    global sc, spark, items, inputfile, buckets_user, buckets_business, partition, totalSize, t, mainThreshold
    print("Initializing...")
    t = time.time()
    candidateList = []
    frequentList = []
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: (line) != columnName)

    buckets_user = items.groupByKey().mapValues(list).filter(
        lambda x: len(x[1]) > mainThreshold).mapPartitionsWithIndex(
            removeDuplicateEntriesAfter)
    print("Without duplicates done...")
    # withoutDuplicates = checkM.mapPartitionsWithIndex(
    #     removeDuplicateEntries).groupByKey().mapValues(list)

    if (case == 1):
        # buckets_user = withoutDuplicates.mapPartitionsWithIndex(
        #     createBuckets).groupByKey().mapValues(list).filter(lambda x: len(x[1]) > mainThreshold)

        callSonPhase1(buckets_user)
        print("Initializing Phase 2.....")
        finalFreq = buckets_user.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))

        # print((finalFreq.collect()))
        finalOutput = (finalFreq.collect())
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        # print(x)
        printingFreq(x)

        pass
    if (case == 2):
        buckets_business = withoutDuplicates.mapPartitionsWithIndex(
            createBuckets_case2).groupByKey().mapValues(list)
        callSonPhase1(buckets_business)
        print("Initializing Phase 2.....")
        finalFreq = buckets_business.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))

        # print((finalFreq.collect()))
        finalOutput = (finalFreq.collect())
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        # print(x)
        printingFreq(x)

        pass
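The two-phase structure above (callSonPhase1 proposes candidates per partition, phase2 recounts them globally and keeps only itemsets meeting the threshold) is the SON pattern. A minimal, self-contained sketch of that pattern restricted to frequent single items follows; the function and variable names are illustrative, not from the original script.

def son_frequent_singletons(baskets_rdd, support):
    """SON in two passes: local candidates per partition, then a global recount."""
    n = baskets_rdd.count()

    def phase1(partition):
        baskets = list(partition)
        if not baskets:
            return []
        # Scale the global support threshold down to this partition's share of the data.
        local_support = support * len(baskets) / n
        counts = {}
        for basket in baskets:
            for item in set(basket):
                counts[item] = counts.get(item, 0) + 1
        return [item for item, c in counts.items() if c >= local_support]

    # Phase 1: anything frequent in at least one partition becomes a candidate.
    candidates = set(baskets_rdd.mapPartitions(phase1).distinct().collect())

    def phase2(partition):
        counts = {}
        for basket in partition:
            for item in set(basket):
                if item in candidates:
                    counts[item] = counts.get(item, 0) + 1
        return counts.items()

    # Phase 2: recount the candidates over the full data set and keep the frequent ones.
    return (baskets_rdd.mapPartitions(phase2)
            .reduceByKey(lambda a, b: a + b)
            .filter(lambda kv: kv[1] >= support))

# usage (illustrative): son_frequent_singletons(sc.parallelize([[1, 2], [1, 3], [1, 2, 3]]), 2).collect()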
Example #5
def initialize():
    global sc, spark, items, inputfile
    print("Initializing...")
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    jsonread = sc.textFile(inputfile)
    items = jsonread.map(json.loads)
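A small usage sketch, not part of the original: inputfile is a global that must be set before initialize() is called, and both the path and the field name below are hypothetical.

inputfile = "review.json"  # hypothetical JSON-lines input

initialize()
n_distinct = items.map(lambda r: r.get("business_id")).distinct().count()
print("Distinct business_id values:", n_distinct)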
Example #6
def get_spark():
    conf = SparkConf()

    # Load in a jar that provides extended string comparison functions
    # such as Jaro Winkler (used by Splink).

    # No longer needed in spark 3.0?
    # conf.set("spark.driver.extraClassPath", "jars/scala-udf-similarity-0.0.7.jar")
    conf.set("spark.jars", "jars/scala-udf-similarity-0.0.7.jar")
    conf.set("spark.jars.packages",
             "graphframes:graphframes:0.8.0-spark3.0-s_2.12")

    # WARNING:
    # These config options are appropriate only if you're running Spark locally!!!
    conf.set("spark.driver.memory", "4g")
    conf.set("spark.sql.shuffle.partitions", "8")

    sc = SparkContext.getOrCreate(conf=conf)
    sc.setCheckpointDir("temp_graphframes/")
    spark = SparkSession(sc)

    # Register UDFs
    from pyspark.sql import types

    spark.udf.registerJavaFunction(
        "jaro_winkler_sim",
        "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
        types.DoubleType(),
    )
    spark.udf.registerJavaFunction("Dmetaphone",
                                   "uk.gov.moj.dash.linkage.DoubleMetaphone",
                                   types.StringType())
    return spark
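A short usage sketch, not part of the original: once registered, the Java UDFs can be called from Spark SQL. The sample rows below are made up, and the query assumes the similarity jar is actually on the classpath.

spark = get_spark()

df = spark.createDataFrame(
    [("MARTHA", "MARHTA"), ("JONES", "JOHNSON")],
    ["name_l", "name_r"],
)
df.createOrReplaceTempView("names")

spark.sql("""
    SELECT name_l,
           name_r,
           jaro_winkler_sim(name_l, name_r) AS jw_sim,
           Dmetaphone(name_l)               AS name_l_dm
    FROM names
""").show()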
Example #7
def get_spark():
    conf = SparkConf()

    # Load in a jar that provides extended string comparison functions
    # such as Jaro Winkler (used by Splink).

    # No longer needed in spark 3.0?
    #conf.set("spark.driver.extraClassPath", "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-similarity-0.0.7.jar")
    #conf.set("spark.driver.extraClassPath", "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-JaroWinkler-0.0.1.jar")

    # spark.jars takes a comma-separated list, so both jars go into one setting.
    conf.set(
        "spark.jars",
        ",".join([
            "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-similarity-0.0.7.jar",
            "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-JaroWinkler-0.0.1.jar",
        ]),
    )
    # SET TO YOUR SPARK INSTALLATION

    # WARNING:
    # These config options are appropriate only if you're running Spark locally!!!
    conf.set("spark.driver.memory", "1g")
    conf.set("spark.sql.shuffle.partitions", "4")

    #conf.set("spark.sql.files.maxPartitionBytes","536870912")
    #conf.set("spark.sql.files.maxPartitionBytes","250000000")
    #conf.set("spark.sql.files.maxPartitionBytes","134217728")

    sc = SparkContext.getOrCreate(conf=conf)
    sc.setCheckpointDir("temp_graphframes/")
    spark = SparkSession(sc)

    # Register UDFs
    from pyspark.sql import types
    '''
    spark.udf.registerJavaFunction(
        "jaro_winkler_sim",
        "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
        types.DoubleType(),
    )
    spark.udf.registerJavaFunction(
        "Dmetaphone", "uk.gov.moj.dash.linkage.DoubleMetaphone", types.StringType()
    )
    '''

    return spark
Example #8
import pyspark
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext #, HiveContext
#from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()

#conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf = conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testTogether")


# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#    SELECT url,date,site,cnts,cnt from cockpit2_allTogether where `date` = d and site = giga and tags contains resort:android
#
# after this query, every row has to be updated with a new value for cnts:
#
# UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes + fb_socialFacebookShares + fb_socialFacebookComments + tw_socialTwitterShares + ga_socialGooglePlusShares + gigya_socialComments

def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
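A hedged sketch of the update described in the comment above: filter the table RDD to the date/site/tag window, recompute cnts as the sum of the individual counters, and save the rows back. The counter and filter column names follow the comment; the choice of key columns, the date comparison, and the write-back call are assumptions.

from datetime import datetime

COUNTERS = ("ga_videoPlays", "sda_downloads", "fb_socialFacebookLikes",
            "fb_socialFacebookShares", "fb_socialFacebookComments",
            "tw_socialTwitterShares", "ga_socialGooglePlusShares",
            "gigya_socialComments")

def recompute_cnts(r):
    # Emit only the (assumed) key columns plus the recomputed counter; a partial
    # row is how a Cassandra UPDATE is expressed when saving an RDD back.
    return {
        "url": r["url"],
        "date": r["date"],
        "site": r["site"],
        "cnts": sum(r[c] for c in COUNTERS),  # assumes counters are present and numeric
    }

_from, _to = datetime(2015, 10, 1), datetime(2015, 10, 10)

updated = (rdd
           .filter(lambda r: _from <= r["date"] <= _to
                   and r["site"] == "giga"
                   and "resort:android" in r["tags"])
           .map(recompute_cnts))

# updated.saveToCassandra("el_test", "cockpit2_testTogether")  # write-back; API varies by pyspark_cassandra version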
Example #9
from pyspark.sql import SQLContext
from pyspark.sql import functions as f

from awsglue.dynamicframe import DynamicFrame
from awsglue.utils import getResolvedOptions
from awsglue.transforms import *

from awsglue.context import GlueContext
from awsglue.job import Job
import sys

args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME'])

conf = SparkConf()

conf.set("spark.sql.parquet.compression.codec", "snappy")
conf.set("spark.sql.parquet.writeLegacyFormat", "true")

sc = SparkContext(conf=conf)

glueContext = GlueContext(sc)

spark = glueContext.spark_session

job = Job(glueContext)

job.init(args['JOB_NAME'], args)

input_file_path = "s3://xxxxx"

df = spark.read.option("header","true")\
Example #10
import pyspark
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext #, HiveContext
#from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()

#conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf = conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testIndexes")


# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#    SELECT url,date,site,cnts,cnt from cockpit2_allTogether where `date` = d and site = giga and tags contains resort:android
#
# after this query, every row has to be updated with a new value for cnts:
#
# UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes + fb_socialFacebookShares + fb_socialFacebookComments + tw_socialTwitterShares + ga_socialGooglePlusShares + gigya_socialComments

def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
Example #11
TENSORFLOW_HADOOP = "preproc/data/tensorflow-hadoop-1.5.0.jar"

from IPython.display import display

import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT

from pyspark.context import SparkContext, SparkConf
from pyspark.sql.session import SparkSession

conf = (SparkConf()
        .setMaster('local[*]')
        .set('spark.executor.memory', '40g')
        .set('spark.driver.memory', '200g')
        .set("spark.local.dir", SPARK_TEMP_FOLDER))
conf.set("spark.jars", TENSORFLOW_HADOOP)
conf.set("spark.sql.files.maxPartitionBytes", 805306368)

sc = SparkContext(conf=conf)
spark = SparkSession(sc)

from pyspark.sql import Row
from pyspark.sql.types import ArrayType, BinaryType, DoubleType, LongType, StringType, StructField, StructType
from pyspark.sql.functions import col, when, log1p, udf

import numpy as np
import scipy.sparse

import math
import datetime
import time
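The ML-vector imports above suggest that feature columns are later assembled into vectors, though that step is not shown in this excerpt. A hedged sketch of a common pattern, building a SparseVector from index/value arrays with a UDF; all column names and values below are illustrative.

to_sparse = udf(lambda idx, vals, size: SparseVector(size, idx, vals), VectorUDT())

demo = spark.createDataFrame(
    [([0, 3], [1.0, 2.5], 5)],
    ["indices", "values", "size"],
)
demo.withColumn("features", to_sparse("indices", "values", "size")).show(truncate=False)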
Example #12
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType, StringType

conf = SparkConf()
conf.set('spark.driver.extraClassPath', 'jars/scala-udf-similarity-0.0.6.jar')
conf.set('spark.jars', 'jars/scala-udf-similarity-0.0.6.jar')

# Create the SparkContext first, then wrap it in a SparkSession.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)
udfs = [('jaro_winkler_sim', 'JaroWinklerSimilarity', DoubleType()),
        ('jaccard_sim', 'JaccardSimilarity', DoubleType()),
        ('cosine_distance', 'CosineDistance', DoubleType()),
        ('Dmetaphone', 'DoubleMetaphone', StringType()),
        ('QgramTokeniser', 'QgramTokeniser', StringType())]

for a, b, c in udfs:
    spark.udf.registerJavaFunction(a, 'uk.gov.moj.dash.linkage.' + b, c)
Example #13
password = creds['password']
host = creds['host']

args = getResolvedOptions(sys.argv, ['JOB_NAME'])
partition_by_cols = ["year", "month"]
output_dir_path = "s3://bucket-name/orders_data_pyspark"

# Build the Spark configuration before creating the context so that the
# Parquet and partition-overwrite settings below actually take effect.
conf = SparkConf()
conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
conf.set("spark.sql.parquet.compression.codec", "snappy")
conf.set("spark.sql.parquet.writeLegacyFormat", "true")

sc = SparkContext(conf=conf)
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

jdbc_url = "jdbc:mysql://"+host+":3306/ecommerce_db"

sql_qry = """
    (
        SELECT *,
               year(order_purchase_timestamp)  AS year,
               month(order_purchase_timestamp) AS month
        FROM orders
    ) AS t
"""
Example #14
EXPORT_ARG_TABLE_NAME = "TableName"
EXPORT_ARG_READ_PCT = "ReadPercentage"
EXPORT_ARG_PREFIX = "OutputPrefix"
EXPORT_ARG_FORMAT = "OutputFormat"

# signature of a glue python job is to run from __main__
if __name__ == '__main__':
    import sys
    from awsglue.utils import getResolvedOptions
    from pyspark.context import SparkContext, SparkConf
    from awsglue.context import GlueContext

    # set up gzip compression for the job output
    # TODO figure out why this isn't working
    conf = SparkConf()
    conf.set("spark.hadoop.mapred.output.compress", "true")
    conf.set("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec")
    conf.set("spark.hadoop.mapred.output.compression.type", "BLOCK")

    # import arguments from the Glue invocation interface
    args = getResolvedOptions(
        sys.argv,
        [
            'JOB_NAME',
            EXPORT_ARG_TABLE_NAME,
            EXPORT_ARG_READ_PCT,
            EXPORT_ARG_PREFIX,
            EXPORT_ARG_FORMAT
        ]
    )
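    # Hedged sketch of what typically follows (not shown in the excerpt): read the
    # DynamoDB table with Glue's DynamoDB connector and write it to the S3 prefix
    # in the requested format. The connection option names follow the AWS Glue
    # documentation; everything else here is illustrative.
    sc = SparkContext.getOrCreate(conf=conf)
    glue_context = GlueContext(sc)

    dyf = glue_context.create_dynamic_frame.from_options(
        connection_type="dynamodb",
        connection_options={
            "dynamodb.input.tableName": args[EXPORT_ARG_TABLE_NAME],
            "dynamodb.throughput.read.percent": args[EXPORT_ARG_READ_PCT],
        },
    )

    glue_context.write_dynamic_frame.from_options(
        frame=dyf,
        connection_type="s3",
        connection_options={"path": args[EXPORT_ARG_PREFIX]},
        format=args[EXPORT_ARG_FORMAT],
    )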