def get_spark():
    conf = (SparkConf().setAppName("read_pigstorage").set(
        "spark.authenticate.secret", "thisisasecret"))
    return SparkContext(conf=conf)
Example #2
from pyspark import SparkConf, SparkContext

#we're going to find out how many words there are in any book/text file using Spark

#set master to local when you're running on your own machine
conf = SparkConf().setMaster("local").setAppName("WordCount")
#initialize SparkContext, the entry point to Spark Core
sc = SparkContext(conf = conf)

#reading the book.txt file into an RDD
input = sc.textFile("C:/Users/siraj/github/Spark-programs/data/book.txt")
#flatMap transforms the corpus of text into words; we're splitting it word by word
words = input.flatMap(lambda x: x.split())
#countByValue returns the number of occurrences of each word
wordCounts = words.countByValue()

#iterate through the result to get each word and its corresponding number of occurrences
for word, count in wordCounts.items():
    cleanWord = word.encode('ascii', 'ignore')
    if (cleanWord):
        print(cleanWord.decode() + " " + str(count))
Example #3
                          'calculate time:%s' % str(time.time() - start_time))
        send_msg_to_kafka(producer, result)
    producer.close()
    '''
    with topic_out.get_sync_producer() as producer:
        for query in msgs:
            # result = BM25_cores(query, Lave, bi)
            tmp = '%s' % query
            producer.produce(str(tmp).encode('utf-8'))
    '''


if __name__ == '__main__':
    conf = SparkConf().setMaster("spark://cdh-master-slave1:7077").set(
        "spark.executor.memory",
        "5G").set("spark.driver.memory",
                  "3G").set("spark.executor.cores",
                            "2").set("spark.cores.max", "6")
    # sc = SparkContext(conf=conf)

    # conf = SparkConf().setAppName("bm25")
    sc = SparkContext(conf=conf)
    # sqlContext = SQLContext(sc)
    ssc = StreamingContext(sc, 0.5)
    mongo_client = pymongo.MongoClient('mongodb://192.168.10.219:49019/')
    bi = mongo_client.lawbot.bm25_inverted
    be = mongo_client.lawbot.bm25_extra
    # sc.broadcast(bi)

    extra_data = be.find_one()
    total_word = extra_data.get('total_word')
Example #4
#!/usr/bin/env python
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark import SparkConf
conf = SparkConf().setMaster("local").setAppName("My app")
sc = SparkContext(conf = conf)
lines = sc.textFile("ch01.py")
inputRDD = lines.filter(lambda x:"sc" in x)
for line in inputRDD.take(10):
    print(line)

lines = sc.parallelize(["hello world", "hi"])
words = lines.flatMap(lambda line:line.split(" "))
print(words.first())

data = sc.parallelize([1,2,3,4,1,3])
print(data.reduce(lambda x, y: x + y))

def printall(rdd):
    print("----------")
    for r in rdd.collect():
        print(r)
printall(data.distinct())
for d in data.distinct().collect():
    print(d)

d = sc.parallelize(["1, hello", "2, hi", "3, how are you"])
for _ in d.map(lambda x: (x.split(",")[0], x)).collect():
    print(_)
Example #5
    hashh = ""
    i = 0
    while i < len(maximum):

        if i == (len(maximum) - 1):
            hashh = hashh + str(maximum[i][0])
        else:
            hashh = hashh + str(maximum[i][0]) + ","
        i = i + 1
    if hashh != "":
        print("%s" % (hashh))


wind_size = int(sys.argv[1])
batch_duration = int(sys.argv[2])
conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)

ssc = StreamingContext(sc, batch_duration)
ssc.checkpoint("~/checkpoint_BIGDATA")

dataStream = ssc.socketTextStream("localhost", 9009)

tweet = dataStream.map(lambda w: (w.split(';')[7]))

hashtag = tweet.flatMap(lambda w: (w.split(',')))
hasht = hashtag.map(lambda w: (w, 1))
counts = hasht.filter(lambda x: x[0] != '')

totalcount = counts.reduceByKeyAndWindow(
Example #6
    try:
        firstRow=tweet_rdd.first()
        tweet_rdd=tweet_rdd.filter(lambda row:row != firstRow)

        if not tweet_rdd.isEmpty():
            sqlContext.createDataFrame(tweet_rdd, schema).write \
                                                         .format("org.apache.spark.sql.cassandra") \
                                                         .mode('append') \
                                                         .options(table="sentiment", keyspace="w251twitter") \
                                                         .save()
    except ValueError:
        print("The RDD was empty...continuing...")

if __name__ == "__main__":
    sparkConf = SparkConf().setAppName("TwitterSentimentAnalysis") \
        .set("spark.cassandra.connection.host", "cassandra1, cassandra2, cassandra3")

    sc = SparkContext(conf=sparkConf)
    session = SparkSession(sc)
    sqlContext = SQLContext(sc)
    ssc = StreamingContext(sc, 2)
    brokers, topic = sys.argv[1:]

    kvs = setup_kafka_stream()

    nlp = StanfordCoreNLP('http://localhost:9000')

    tweets = kvs.filter(lambda x: x is not None).filter(lambda x: x != '').map(lambda x: json.loads(x[1]))
    tweets.count().map(lambda x: 'Tweets in this batch: %s' % x).pprint()

    sentiment_stream = tweets.map(lambda tweet: get_tweet_sentiment(tweet)).filter(lambda x: x is not None)
Example #7
from pyspark import SparkContext, SparkConf
import numpy as np
import time

#=============SETUP SPARK==================
local = False

if local:
    spark = SparkContext("local[*]")
    spark.setLogLevel("ALL")

else:
    import os
    master = os.environ["SPARK_MASTER"]
    master = "spark://{}:7077".format(master)
    conf = SparkConf().setAppName("SpotTrawl").setMaster(master)
    spark = SparkContext(conf=conf)

#===========DEFINE SAMPLING FUNCTION=======
numSamples = 10**7


def sample(p):
    x, y = np.random.random(), np.random.random()
    return 1 if x * x + y * y < 1 else 0


#==========TAKE SAMPLES======================
count = spark.parallelize(range(0, numSamples)).map(sample) \
             .reduce(lambda a, b: a + b)
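
# The snippet is cut off here; a minimal sketch of the usual final step, which
# is not part of the original: the standard Monte Carlo estimate is
# pi ~= 4 * (samples inside the quarter circle) / (total samples).
piEstimate = 4.0 * count / numSamples
print("Pi is roughly %f" % piEstimate)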
Example #8
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("customerSpendings")
sc = SparkContext(conf=conf)


def getCustomerAndSpendings(line):
    row = line.split(',')
    custId = int(row[0])
    amount = float(row[2])
    return (custId, amount)


lines = sc.textFile("./data/customer-orders.csv")
parsedLines = lines.map(getCustomerAndSpendings)

totalAmounts = parsedLines.reduceByKey(lambda v1, v2: v1 + v2)
reversedTotalAmounts = totalAmounts.map(lambda entry: (entry[1], entry[0]))
sortedTotalAmounts = reversedTotalAmounts.sortByKey(
    False)  #false gives descending order

results = sortedTotalAmounts.collect()

print("\nTotal spendings by each customer: \n")
for amount, customer in results:
    print("customer-" + str(customer) + "\t {:.2f}$".format(amount))
Example #9
        print(s_list[0][0],
              s_list[1][0],
              s_list[2][0],
              s_list[3][0],
              s_list[4][0],
              sep=",")


def func2(line):
    hashtag = line.split(";")[7]
    if (',' in hashtag):
        return hashtag.split(",")
    return [hashtag]


conf1 = SparkConf()
conf1.setAppName("BigData")
sc1 = SparkContext(conf=conf1)

sscp = StreamingContext(sc1, int(sys.argv[2]))
sscp.checkpoint("/checkpoint_BIGDATA")

dataStream1 = sscp.socketTextStream("localhost", 9009)

hashtags = dataStream1.window(int(sys.argv[1]), 1).flatMap(func2).map(
    lambda h: (h, 1)).reduceByKey(lambda x, y: int(x) + int(y))

hashtags.foreachRDD(func)

sscp.start()
sscp.awaitTermination(60)
Example #10
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_F_CI_CLASSIFY_RESULT_TEMP').setMaster(
    sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

#dates needed for the processing below
etl_date = sys.argv[1]
#ETL date
V_DT = etl_date
#previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
#first day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
#last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
Example #11
    ldaModel = LDA.train(corpus, k=topic_num)
    return ldaModel


if __name__ == '__main__':
    if len(sys.argv) != 6:
        print("Usage: %s <input> <model_path> <stopfile> topic_num name" % sys.argv[0], file=sys.stderr)
        exit(1)

    input_path = sys.argv[1]
    model_path = sys.argv[2]
    stopfile = sys.argv[3]
    topic_num = int(sys.argv[4])
    appname = sys.argv[5]

    conf = SparkConf().setAppName(appname)
    sc = SparkContext(conf=conf)

    rdd_lines = sc.textFile(input_path)
    parsed_data = segment(rdd_lines)

    if stopfile:
        parsed_data = filter_stopword(parsed_data, stopfile).cache()

    tf_data = vectorize(sc, parsed_data)
    ldaModel = lda(tf_data, topic_num)
    
    print("finish train model...")
    t = ldaModel.describeTopics(5)
    print(t)
Example #12

def write_into_redis(s):
    redis_client = redis.StrictRedis(
        host='ec2-52-40-47-83.us-west-2.compute.amazonaws.com',
        port=6379,
        db=0,
        password='')
    pipe = redis_client.pipeline()
    for i in s:
        redis_client.delete(*i[0])
        pipe.lpush(i[0], *i[1])
    pipe.execute()


appName = 'Similarity_APP'
master = 'spark://ec2-50-112-193-115.us-west-2.compute.amazonaws.com:7077'
conf = SparkConf().setAppName(appName).setMaster(master)
sc = SparkContext(conf=conf)
start_time = time.time()
list_1 = get_data_from_influx()

print("--- %s seconds ---" % (time.time() - start_time))
rdd = sc.parallelize(list_1)

tupls = rdd.map(split_string)

buckets = tupls.reduceByKey(lambda a, b: a + b)
write_into_redis.count = 0
buckets.foreachPartition(write_into_redis)
Example #13
# course-word-count.py
# August 3, 2020
#
# Solution script provided by course.
# Count the number of occurrences of each word in a text file.
# Using the 'Book' text file.

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster('local').setAppName('word-count')
sc = SparkContext(conf=conf)

lines = sc.textFile('Book')
words = lines.flatMap(lambda x: x.split())
wordCounts = words.countByValue()

for word, count in wordCounts.items():
    cleanWord = word.encode('ascii', 'ignore')
    if cleanWord:
        print(cleanWord.decode(), count)

#print(wordCounts)
Example #14
    map_count = fix_date.map(lambda x: ((x[0], x[1]), 1))
    map_count = map_count.reduceByKey(lambda x, y: x + y)
    return map_count


if __name__ == '__main__':
    # Get input/output files from user
    parser = argparse.ArgumentParser()
    parser.add_argument('commits', help='File to load commit data from')
    parser.add_argument('repos', help='File to load repository data from')
    #parser.add_argument('output', help='Directory to save DStream results to')
    args = parser.parse_args()

    # Setup Spark
    conf = SparkConf().setAppName("timezone")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    setup_table(sc, sqlContext, args.commits, args.repos)

    print("-" * 15 + " OUTPUT " + "-" * 15)
    langs = {
        "Python", "Java", "JavaScript", "Ruby", "SQL", "C#", "C++", "nodejs",
        "PHP", "C", "objective-c"
    }

    out = timezone(sc, sqlContext, langs)
    out.saveAsTextFile("/user/renukan2/timezone_github")

    print("-" * 30)
Example #15
'''
Created on Jun 10, 2017

@author: SathishParthasarathy
'''

from pyspark import SparkConf, SparkContext
from hdfs3 import HDFileSystem
if __name__ == '__main__':
    conf = SparkConf().setAppName("Word Count - Python")
    spark = SparkContext(conf=conf)
    hdfs = HDFileSystem('hadoop.master.com', port=9000)
    if not hdfs.exists("/user/psathishcs/Output/Books/Science_Python"):
        text_file = spark.textFile(
            "hdfs://hadoop.master.com:9000/user/psathishcs/Input/Books/The_Outline_of_Science.txt"
        )
        words = text_file.flatMap(lambda line: line.split())
        wordCounts = words.map(lambda word: (word, 1)).reduceByKey(
            lambda a, b: a + b)
        wordCounts.saveAsTextFile(
            "hdfs://hadoop.master.com:9000/user/psathishcs/Output/Books/Science_Python"
        )
Example #16
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("PopularMovies")
sc = SparkContext(conf=conf)

lines = sc.textFile("c:///SparkCourse/ml-100k/u.data")
movies = lines.map(lambda x: (int(x.split()[1]), 1))
movieCounts = movies.reduceByKey(lambda x, y: x + y)

flipped = movieCounts.map(lambda xy: (xy[1], xy[0]))
sortedMovies = flipped.sortByKey()

results = sortedMovies.collect()

for result in results:
    print(result)
Example #17
from pyspark.sql import SparkSession
import sys
import csv
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import Row
from pyspark import SparkContext, SparkConf
import pyspark.sql.functions as func
file=sys.argv[1]
n=sys.argv[2]
s=sys.argv[3]
c=sys.argv[4]
conf = SparkConf().setAppName('Kia_bigdata_lab').setMaster('local')
sc = SparkContext(conf=conf)
spark=SparkSession.builder.appName("lab3").getOrCreate()
rd=sc.textFile(file).map(lambda x: Row((x.split(','))[0],x.split(',')[1:]))
df=rd.toDF(["items","plant"]).withColumn("id",monotonically_increasing_id())
df=df[["id","items","plant"]]

fpGrowth = FPGrowth(itemsCol="plant", minSupport=float(s), minConfidence=float(c))
model = fpGrowth.fit(df)

# Display frequent itemsets.
ml=model.freqItemsets
ml.orderBy([func.size("items"), "freq"], ascending=[0,0]).show(int(n))

# Display generated association rules.
#ml=model.associationRules.show(10)
#ml.orderBy([func.size("antecedent")],"confidence",ascending=[0,0]).show(int(n))
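
# The commented-out lines above would not run as written (show() returns None,
# so ml would not be a DataFrame); a minimal working variant of the same idea,
# not part of the original, is sketched below:
rules = model.associationRules
rules.orderBy([func.size("antecedent"), "confidence"], ascending=[0, 0]).show(int(n))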
Example #18
import sys
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, Rating


def loadMovieNames():
    movieNames = {}
    with open("ml-100k/u.item", encoding='ascii', errors="ignore") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames


conf = SparkConf().setMaster("local[*]").setAppName("MovieRecommendationsALS")
sc = SparkContext(conf=conf)
sc.setCheckpointDir('checkpoint')

print("\nLoading movie names...")
nameDict = loadMovieNames()

data = sc.textFile("ml-100k/u.data")

ratings = data.map(lambda l: l.split()).map(
    lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

# Build the recommendation model using Alternating Least Squares
print("\nTraining recommendation model...")
rank = 10
# Lowered numIterations to ensure it works on lower-end systems
numIterations = 6
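
# The example is truncated here; training would presumably continue with a call
# like the following (a sketch based on MLlib's ALS API, not the original code):
model = ALS.train(ratings, rank, numIterations)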
Example #19
def run():
    conf = SparkConf()
    #conf.set('spark.shuffle.blockTransferService', 'nio')
    conf.set('spark.files.fetchTimeout', '180')
    conf.set('spark.files.overwrite', 'yes')
    conf.set('spark.akka.timeout', '180')
    #conf.set('spark.task.maxFailures', '30000')
    conf.set('spark.akka.frameSize', '500')
    conf.set('spark.network.timeout', '180')

    myClassifierOnevsOne = pickle.load(open('myClassifierOnevsOne.p', 'rb'))

    dataSetMaker = DataSetMakerV2(n=200000)

    feed = FeedNewsFromGoogleFinance()

    def sendRecord(rdd):
        print('new try...')
        if (not rdd.isEmpty()):
            newsRDD = dataSetMaker.processKeepNews(rdd)
            res = newsRDD.map(
                lambda x: (x[0], myClassifierOnevsOne.predict(x[1].features)))
            print('for each result...')
            for result in res.collect():
                symbole = result[0].symbole
                r = requests.put('http://wtun.mooo.com:5000',
                                 data={
                                     'jdata':
                                     NewsPrediction(result[0],
                                                    str(result[1])).json(),
                                     'symbole':
                                     symbole,
                                     'label':
                                     str(result[1])
                                 })
                print('send ok')
                print('receive %s' % str(r.text))
        else:
            print('empty!')

    sc = SparkContext(conf=conf)

    symbolesRDD = sc.parallelize([('NASDAQ:GOOGL', ['GOOG', 'GOOGL',
                                                    'GOOGLE']),
                                  ('NASDAQ:NVDA', ['NVIDIA']),
                                  ('VTX:SCMN', ['SWISSCOM'])])
    taskdt = 600
    running = True
    oldNewsRDD = None
    firstTime = True
    intersectRDD = None
    dataDirectory = 'hdfs://157.26.83.52/user/wdroz/stream2'
    cpt = 0
    while (running):
        today = datetime.datetime.now()
        yesterday = today - datetime.timedelta(days=1)
        tomorrow = today + datetime.timedelta(days=1)
        newsRDD = symbolesRDD.flatMap(
            lambda x: feed.lookingAt(x[0], yesterday, tomorrow, x[1]))
        if (firstTime):
            firstTime = False
            intersectRDD = newsRDD
        else:
            try:
                intersectRDD = oldNewsRDD.intersection(newsRDD)
            except:
                pass  # empty rdd

        oldNewsRDD = newsRDD

        try:
            sendRecord(intersectRDD)
            intersectRDD.saveAsPickleFile(
                dataDirectory + '/' +
                datetime.datetime.now().strftime('%Y-%m-%d--') + str(cpt))
            cpt += 1
        except:
            pass  # empty rdd

        time.sleep(taskdt)

    running = False  # TODO remove it
Example #20
        "com.databricks.spark.csv").option("header",
                                           "true").save('locked_data.csv')
    byDateUnlocked.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save('unlocked_data.csv')

    # 5 over_18
    byDate18 = df10.filter('over_18 == true').select(
        to_date(df10.created_utc.cast('timestamp')).alias('date'),
        df10.Positive,
        df10.Negative).groupBy('date').avg('Positive', 'Negative')
    byDate18.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save('over18_data.csv')

    # final 4
    dff4 = df10.groupBy('title').agg(
        avg('Positive').alias('avgPos'),
        avg('Negative').alias('avgNeg'))
    dff4.orderBy('avgPos', ascending=0).limit(10).show(truncate=False)
    dff4.orderBy('avgNeg', ascending=0).limit(10).show(truncate=False)


if __name__ == "__main__":
    conf = SparkConf().setAppName("CS143 Project 2B")
    conf = conf.setMaster("local[*]")
    # conf = (conf.set('spark.executor.memory', '4G').set('spark.driver.memory', '4G'))
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    sc.addPyFile("cleantext.py")
    main(sqlContext)
Example #21
from pyspark import SparkContext, SparkConf
import collections


def parseLine(line):
    cells = line.split(',')
    return (int(cells[0]), float(cells[2]))


conf = SparkConf().setMaster("local").setAppName("CustomerOrders")
sc = SparkContext(conf=conf)

rdd = sc.textFile("customer-orders.csv")
custOrders = rdd.map(parseLine)
custAmounts = custOrders.reduceByKey(lambda x, y: x + y).map(lambda x:
                                                             (x[1], x[0]))
custAmountsSorted = custAmounts.sortByKey()
results = custAmountsSorted.collect()

for result in results:
    print(str(result[1]) + ": {:.2f}".format(result[0]))
'''
Output:

45: 3309.38
79: 3790.57
96: 3924.23
23: 4042.65
99: 4172.29
...
...
Example #22
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

conf = SparkConf().setAppName('WriteAPIs').setMaster('local')
sc = SparkContext(conf=conf)
ss = SparkSession.builder.appName('WriteAPIs').master('local').getOrCreate()

#-----------------------------------------------------------------------------------------------
# Writing to files - RDD
#-----------------------------------------------------------------------------------------------
# car_file = '/Users/soumyadeepdey/HDD_Soumyadeep/TECHNICAL/Training/Intellipaat/PySparkCodes/sampledata/car_sales_data.csv'
# output_file = '/Users/soumyadeepdey/HDD_Soumyadeep/TECHNICAL/Training/Intellipaat/IntellipaatSpark/OutputFile/car_sales_data_out'

# num_partitions = 16
# rdd = sc.textFile(car_file)
# rdd = sc.textFile(car_file, num_partitions)
# print('Total no of partitions: ',rdd.getNumPartitions())

# rdd1 = rdd.map(lambda x: (x.split(',')[3],x.split(',')[5],x.split(',')[11]))
# rdd1.saveAsTextFile(output_file)
# rdd1.coalesce(1).saveAsTextFile(output_file)
# rdd1.repartition(1).saveAsTextFile(output_file)

# rdd2 = sc.textFile("/Users/soumyadeepdey/HDD_Soumyadeep/TECHNICAL/Training/Intellipaat/IntellipaatSpark/OutputFile/car_sales_data_out/part-*")
# print(rdd2.getNumPartitions())

# print(rdd1.count())
# print(rdd2.count())
#
# for i in rdd1.take(5):
#     print(i)
Example #23
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark import SparkConf
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

from src.serial import NaiveDBSCAN, MatrixDBSCAN
from src.utils import DataLoader, Evaluation, timeit
from src.settings import UNKNOWN, NOISE

import numpy as np

# broadcast variable
b_dataset = None
b_eps = None
b_min_pts = None


def load_data_label(path):
    pts = sc.textFile(path).map(lambda x: x.strip().split()[:-1]).map(
        lambda x: tuple([float(i) for i in x]))
    return pts.collect()


def load_data(path):
    pts = sc.textFile(path).map(lambda x: x.strip().split()).map(
        lambda x: tuple([float(i) for i in x]))
    return pts.collect()
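
# A minimal sketch, not part of the original, of how the broadcast placeholders
# above might be populated before a distributed run; the path and parameter
# values here are hypothetical:
b_dataset = sc.broadcast(load_data('data/points.txt'))
b_eps = sc.broadcast(0.5)
b_min_pts = sc.broadcast(5)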

Example #24
sys.path.append('..')
import settings

logger = logging.getLogger()
logger.setLevel(settings.COSINE_SIMILARITY['LOG_LEVEL'])

try:
    from pyspark import SparkConf, SparkContext
except ImportError as e:
    logging.error("Can not import Spark Modules: %s", e)
    sys.exit(1)

logging.info("Successfully imported Spark Modules")

conf = SparkConf().setMaster("local").setAppName("AggregatingMotionDeviceData")
sc = SparkContext(conf=conf)

# Script specific configurations
MINUTE_WINDOW = settings.COSINE_SIMILARITY['MINUTES_PER_WINDOW'] * 60 * 1000
BASE_TIME = settings.COSINE_SIMILARITY['BASE_TIME']
MAX_TIME = settings.COSINE_SIMILARITY['MAX_TIME']
INPUT_DIR = settings.COSINE_SIMILARITY['INPUT_DIR']
OUTPUT_DIR = '../front_end/motion_split_files_' + str(settings.COSINE_SIMILARITY['MINUTES_PER_WINDOW']) \
    + '_mins_window/'
if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)


# Actual Code
def parse_line(line):
Example #25
    sum_xx = sum_yy = sum_xy = 0
    for ratingX, ratingY in ratingPairs:
        sum_xx += ratingX * ratingX
        sum_yy += ratingY * ratingY
        sum_xy += ratingX * ratingY
        numPairs += 1
    numerator = sum_xy
    denominator = sqrt(sum_xx) * sqrt(sum_yy)
    score = (numerator / (float(denominator))) if (denominator) else 0
    return (score, numPairs)

#use Spark's built-in cluster manager to treat every core of this laptop as a node
print("\nLoading movie names...")

#build a SparkContext and create ratings: [user_ID, (movieID, rating)]
data = SparkContext(conf = SparkConf()).textFile("source/ratings.dat")
ratings = data.map(lambda l: l.split()).map(lambda l: (int(l[0]), (int(l[1]), float(l[2]))))

# Emit every movie rated together by the same user.
# Self-join to find every combination.
joinedRatings = ratings.join(ratings)  #[_user_ID, ((movieID1, rating1), (movieID2, rating2))]

# Filter out duplicate pairs. filterDuplicates is a function that returns True or False
uniqueJoinedRatings = joinedRatings.filter(filterDuplicates)

# Now key by movie pairs: [(movie1, movie2), (rating1, rating2)]
moviePairs = uniqueJoinedRatings.map(makePairs)

# We now have (movie1, movie2) => (rating1, rating2)
# Now collect all ratings for each movie pair and compute similarity
moviePairRatings = moviePairs.groupByKey()  #[(movie1, movie2), ((rating1, rating2), (rating1, rating2) ...)]
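
# The listing stops here; presumably the truncated function at the top of this
# example (the cosine-similarity tail) is applied to every movie pair next.
# A sketch, assuming that function is named computeCosineSimilarity (hypothetical):
moviePairSimilarities = moviePairRatings.mapValues(computeCosineSimilarity).cache()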
Example #26
import findspark
from pyspark import SparkContext, SparkConf

findspark.init(python_path='/Users/khwu/.virtualenvs/spark/bin/python3')

if __name__ == '__main__':
    conf = SparkConf().setAppName('join').setMaster('local[*]')
    sc = SparkContext(conf=conf)
    sc.setLogLevel('ERROR')
    ages = sc.parallelize([("Tom", 29), ("John", 22)]).persist()
    addresses = sc.parallelize([("James", "USA"), ("John", "UK")]).persist()

    ages.join(addresses) \
        .coalesce(1) \
        .saveAsTextFile('../../out/age_address_join.text')

    ages.leftOuterJoin(addresses) \
        .coalesce(1) \
        .saveAsTextFile('../../out/age_address_left_out_join.text')

    ages.rightOuterJoin(addresses) \
        .coalesce(1) \
        .saveAsTextFile('../../out/age_address_right_out_join.text')

    ages.fullOuterJoin(addresses) \
        .coalesce(1) \
        .saveAsTextFile('../../out/age_address_full_out_join.text')
Example #27

def vec_sum(x, y):
    return [(x[i] + y[i]) for i in range(len(x))]


def generalized_error(y):
    key = y[0][0]
    value = [z[1] for z in y]
    gen_error = functools.reduce(vec_sum, value, [0, 0])
    gen_error = [y / num_points for y in gen_error]

    return key, gen_error


from pyspark import SparkConf, SparkContext
if len(sys.argv) != 2:
    print('Usage: ' + sys.argv[0] + ' <out>')
    sys.exit(1)
outputloc = sys.argv[1]

conf = SparkConf().setAppName('sim')
sc = SparkContext(conf=conf)

keys = sc.parallelize(par)
data = keys.map(get_data)
data = data.flatMap(lambda x: x)
error = data.map(classify)
gen_error = error.reduceByKey(generalized_error)
gen_error.saveAsTextFile(outputloc)
sc.stop()
Example #28
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_RPT_SUN_INFO_DETAIL').setMaster(sys.argv[2])
sc = SparkContext(conf = conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

#dates needed for the processing below
etl_date = sys.argv[1]
#ETL date
V_DT = etl_date
#previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
#first day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
#last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
#date in 10-character form (YYYY-MM-DD)
V_DT10 = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8]))).strftime("%Y-%m-%d")
V_STEP = 0
Example #29
def init_spark():
    conf = SparkConf().setAppName("Music").setMaster("local")
    return SparkContext(conf=conf)
Example #30
# coding=utf-8

from pyspark import SparkContext, SparkConf
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('Naive Bayes').setMaster('local[2]')
sc = SparkContext(conf=conf)

# load and parse data file
data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')

# split data approximately into training and test
training, test = data.randomSplit([0.6, 0.4])

# train a naive bayes model
model = NaiveBayes.train(training, 1.0)

# make prediction and test accuracy
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(
    lambda vp: vp[0] == vp[1]).count() / test.count()
print('model accuracy: {}'.format(accuracy))

# save and load model
output_dir = '../model/myNaiveBayesModel'
# MLUtils.rmtree(output_dir, ignore_errors=True)
model.save(sc, output_dir)
sameModel = NaiveBayesModel.load(sc, output_dir)
predictionAndLabel = test.map(lambda p:
                              (sameModel.predict(p.features), p.label))
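
# Presumably (not shown in the truncated snippet) the reloaded model's accuracy
# is then checked the same way:
accuracy = 1.0 * predictionAndLabel.filter(
    lambda vp: vp[0] == vp[1]).count() / test.count()
print('reloaded model accuracy: {}'.format(accuracy))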