def get_spark():
    conf = (SparkConf().setAppName("read_pigstorage").set(
        "spark.authenticate.secret", "thisisasecret"))
    return SparkContext(conf=conf)
Example #2
from pyspark import SparkConf, SparkContext

#we're going to find out how many words there are in any book/text file using Spark

#set master to local when you're running on your own machine
conf = SparkConf().setMaster("local").setAppName("WordCount")
#initialize SparkContext, the entry point to Spark Core
sc = SparkContext(conf = conf)

#reading the book.txt file into an RDD
input = sc.textFile("C:/Users/siraj/github/Spark-programs/data/book.txt")
#flatMap transforms the corpus of text into words; we're splitting it word by word
words = input.flatMap(lambda x: x.split())
#countByValue returns the number of occurrences of each word
wordCounts = words.countByValue()

#iterate through the result to get each word and its corresponding number of occurrences
for word, count in wordCounts.items():
    cleanWord = word.encode('ascii', 'ignore')
    if (cleanWord):
        print(cleanWord.decode() + " " + str(count))
Example #3
                          'calculate time:%s' % str(time.time() - start_time))
        send_msg_to_kafka(producer, result)
    producer.close()
    '''
    with topic_out.get_sync_producer() as producer:
        for query in msgs:
            # result = BM25_cores(query, Lave, bi)
            tmp = '%s' % query
            producer.produce(str(tmp).encode('utf-8'))
    '''


if __name__ == '__main__':
    conf = SparkConf().setMaster("spark://cdh-master-slave1:7077").set(
        "spark.executor.memory",
        "5G").set("spark.driver.memory",
                  "3G").set("spark.executor.cores",
                            "2").set("spark.cores.max", "6")
    # sc = SparkContext(conf=conf)

    # conf = SparkConf().setAppName("bm25")
    sc = SparkContext(conf=conf)
    # sqlContext = SQLContext(sc)
    ssc = StreamingContext(sc, 0.5)
    mongo_client = pymongo.MongoClient('mongodb://192.168.10.219:49019/')
    bi = mongo_client.lawbot.bm25_inverted
    be = mongo_client.lawbot.bm25_extra
    # sc.broadcast(bi)

    extra_data = be.find_one()
    total_word = extra_data.get('total_word')
Example #4
#!/usr/bin/env python
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark import SparkConf
conf = SparkConf().setMaster("local").setAppName("My app")
sc = SparkContext(conf = conf)
lines = sc.textFile("ch01.py")
inputRDD = lines.filter(lambda x:"sc" in x)
for line in inputRDD.take(10):
    print(line)

lines = sc.parallelize(["hello world", "hi"])
words = lines.flatMap(lambda line:line.split(" "))
print(words.first())

data = sc.parallelize([1,2,3,4,1,3])
print(data.reduce(lambda x, y: x + y))

def printall(rdd):
    print("----------")
    for r in rdd.collect():
        print(r)
printall(data.distinct())
for d in data.distinct().collect():
    print(d)

d = sc.parallelize(["1, hello", "2, hi", "3, how are you"])
for _ in d.map(lambda x: (x.split(",")[0], x)).collect():
    print(_)
Example #5
    hashh = ""
    i = 0
    while i < len(maximum):

        if i == (len(maximum) - 1):
            hashh = hashh + str(maximum[i][0])
        else:
            hashh = hashh + str(maximum[i][0]) + ","
        i = i + 1
    if hashh != "":
        print("%s" % (hashh))


wind_size = int(sys.argv[1])
batch_duration = int(sys.argv[2])
conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)

ssc = StreamingContext(sc, batch_duration)
ssc.checkpoint("~/checkpoint_BIGDATA")

dataStream = ssc.socketTextStream("localhost", 9009)

tweet = dataStream.map(lambda w: (w.split(';')[7]))

hashtag = tweet.flatMap(lambda w: (w.split(',')))
hasht = hashtag.map(lambda w: (w, 1))
counts = hasht.filter(lambda x: x[0] != '')

totalcount = counts.reduceByKeyAndWindow(
Example #6
    try:
        firstRow=tweet_rdd.first()
        tweet_rdd=tweet_rdd.filter(lambda row:row != firstRow)

        if not tweet_rdd.isEmpty():
            sqlContext.createDataFrame(tweet_rdd, schema).write \
                                                         .format("org.apache.spark.sql.cassandra") \
                                                         .mode('append') \
                                                         .options(table="sentiment", keyspace="w251twitter") \
                                                         .save()
    except ValueError:
        print("The RDD was empty...continuing...")

if __name__ == "__main__":
    sparkConf = SparkConf().setAppName("TwitterSentimentAnalysis") \
        .set("spark.cassandra.connection.host", "cassandra1, cassandra2, cassandra3")

    sc = SparkContext(conf=sparkConf)
    session = SparkSession(sc)
    sqlContext = SQLContext(sc)
    ssc = StreamingContext(sc, 2)
    brokers, topic = sys.argv[1:]

    kvs = setup_kafka_stream()

    nlp = StanfordCoreNLP('http://localhost:9000')

    tweets = kvs.filter(lambda x: x is not None).filter(lambda x: x != '').map(lambda x: json.loads(x[1]))
    tweets.count().map(lambda x: 'Tweets in this batch: %s' % x).pprint()

    sentiment_stream = tweets.map(lambda tweet: get_tweet_sentiment(tweet)).filter(lambda x: x is not None)
Example #7
from pyspark import SparkContext, SparkConf
import numpy as np
import time

#=============SETUP SPARK==================
local = False

if local:
    spark = SparkContext("local[*]")
    spark.setLogLevel("ALL")

else:
    import os
    master = os.environ["SPARK_MASTER"]
    master = "spark://{}:7077".format(master)
    conf = SparkConf().setAppName("SpotTrawl").setMaster(master)
    spark = SparkContext(conf=conf)

#===========DEFINE SAMPLING FUNCTION=======
numSamples = 10**7


def sample(p):
    x, y = np.random.random(), np.random.random()
    return 1 if x * x + y * y < 1 else 0


#==========TAKE SAMPLES======================
count = spark.parallelize(range(0, numSamples)).map(sample) \
             .reduce(lambda a, b: a + b)
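
# The snippet is cut off here; a minimal sketch of the usual final step, which
# is not part of the original: the standard Monte Carlo estimate is
# pi ~= 4 * (samples inside the quarter circle) / (total samples).
piEstimate = 4.0 * count / numSamples
print("Pi is roughly %f" % piEstimate)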
Example #8
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("customerSpendings")
sc = SparkContext(conf=conf)


def getCustomerAndSpendings(line):
    row = line.split(',')
    custId = int(row[0])
    amount = float(row[2])
    return (custId, amount)


lines = sc.textFile("./data/customer-orders.csv")
parsedLines = lines.map(getCustomerAndSpendings)

totalAmounts = parsedLines.reduceByKey(lambda v1, v2: v1 + v2)
reversedTotalAmounts = totalAmounts.map(lambda entry: (entry[1], entry[0]))
sortedTotalAmounts = reversedTotalAmounts.sortByKey(
    False)  #false gives descending order

results = sortedTotalAmounts.collect()

print("\nTotal spendings by each customer: \n")
for amount, customer in results:
    print("customer-" + str(customer) + "\t {:.2f}$".format(amount))
Example #9
        print(s_list[0][0],
              s_list[1][0],
              s_list[2][0],
              s_list[3][0],
              s_list[4][0],
              sep=",")


def func2(line):
    hashtag = line.split(";")[7]
    if (',' in hashtag):
        return hashtag.split(",")
    return [hashtag]


conf1 = SparkConf()
conf1.setAppName("BigData")
sc1 = SparkContext(conf=conf1)

sscp = StreamingContext(sc1, int(sys.argv[2]))
sscp.checkpoint("/checkpoint_BIGDATA")

dataStream1 = sscp.socketTextStream("localhost", 9009)

hashtags = dataStream1.window(int(sys.argv[1]), 1).flatMap(func2).map(
    lambda h: (h, 1)).reduceByKey(lambda x, y: int(x) + int(y))

hashtags.foreachRDD(func)

sscp.start()
sscp.awaitTermination(60)
Example #10
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_F_CI_CLASSIFY_RESULT_TEMP').setMaster(
    sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

#dates needed for the processing below
etl_date = sys.argv[1]
#ETL date
V_DT = etl_date
#previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
#first day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
#last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
Example #11
    ldaModel = LDA.train(corpus, k=topic_num)
    return ldaModel


if __name__ == '__main__':
    if len(sys.argv) != 6:
        print("Usage: %s <input> <model_path> <stopfile> topic_num name" % sys.argv[0], file=sys.stderr)
        exit(1)

    input_path = sys.argv[1]
    model_path = sys.argv[2]
    stopfile = sys.argv[3]
    topic_num = int(sys.argv[4])
    appname = sys.argv[5]

    conf = SparkConf().setAppName(appname)
    sc = SparkContext(conf=conf)

    rdd_lines = sc.textFile(input_path)
    parsed_data = segment(rdd_lines)

    if stopfile:
        parsed_data = filter_stopword(parsed_data, stopfile).cache()

    tf_data = vectorize(sc, parsed_data)
    ldaModel = lda(tf_data, topic_num)
    
    print("finish train model...")
    t = ldaModel.describeTopics(5)
    print(t)
Example #12

def write_into_redis(s):
    redis_client = redis.StrictRedis(
        host='ec2-52-40-47-83.us-west-2.compute.amazonaws.com',
        port=6379,
        db=0,
        password='')
    pipe = redis_client.pipeline()
    for i in s:
        redis_client.delete(*i[0])
        pipe.lpush(i[0], *i[1])
    pipe.execute()


appName = 'Similarity_APP'
master = 'spark://ec2-50-112-193-115.us-west-2.compute.amazonaws.com:7077'
conf = SparkConf().setAppName(appName).setMaster(master)
sc = SparkContext(conf=conf)
start_time = time.time()
list_1 = get_data_from_influx()

print("--- %s seconds ---" % (time.time() - start_time))
rdd = sc.parallelize(list_1)

tupls = rdd.map(split_string)

buckets = tupls.reduceByKey(lambda a, b: a + b)
write_into_redis.count = 0
buckets.foreachPartition(write_into_redis)
Example #13
# course-word-count.py
# August 3, 2020
#
# Solution script provided by course.
# Count the number of occurrences of each word in a text file.
# Using the 'Book' text file.

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster('local').setAppName('word-count')
sc = SparkContext(conf=conf)

lines = sc.textFile('Book')
words = lines.flatMap(lambda x: x.split())
wordCounts = words.countByValue()

for word, count in wordCounts.items():
    cleanWord = word.encode('ascii', 'ignore')
    if cleanWord:
        print(cleanWord.decode(), count)

#print(wordCounts)
Example #14
    map_count = fix_date.map(lambda x: ((x[0], x[1]), 1))
    map_count = map_count.reduceByKey(lambda x, y: x + y)
    return map_count


if __name__ == '__main__':
    # Get input/output files from user
    parser = argparse.ArgumentParser()
    parser.add_argument('commits', help='File to load commit data from')
    parser.add_argument('repos', help='File to load repository data from')
    #parser.add_argument('output', help='Directory to save DStream results to')
    args = parser.parse_args()

    # Setup Spark
    conf = SparkConf().setAppName("timezone")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    setup_table(sc, sqlContext, args.commits, args.repos)

    print("-" * 15 + " OUTPUT " + "-" * 15)
    langs = {
        "Python", "Java", "JavaScript", "Ruby", "SQL", "C#", "C++", "nodejs",
        "PHP", "C", "objective-c"
    }

    out = timezone(sc, sqlContext, langs)
    out.saveAsTextFile("/user/renukan2/timezone_github")

    print("-" * 30)
Example #15
'''
Created on Jun 10, 2017

@author: SathishParthasarathy
'''

from pyspark import SparkConf, SparkContext
from hdfs3 import HDFileSystem
if __name__ == '__main__':
    conf = SparkConf().setAppName("Word Count - Python")
    spark = SparkContext(conf=conf)
    hdfs = HDFileSystem('hadoop.master.com', port=9000)
    if not hdfs.exists("/user/psathishcs/Output/Books/Science_Python"):
        text_file = spark.textFile(
            "hdfs://hadoop.master.com:9000/user/psathishcs/Input/Books/The_Outline_of_Science.txt"
        )
        words = text_file.flatMap(lambda line: line.split())
        wordCounts = words.map(lambda word: (word, 1)).reduceByKey(
            lambda a, b: a + b)
        wordCounts.saveAsTextFile(
            "hdfs://hadoop.master.com:9000/user/psathishcs/Output/Books/Science_Python"
        )
Example #16
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("PopularMovies")
sc = SparkContext(conf=conf)

lines = sc.textFile("c:///SparkCourse/ml-100k/u.data")
movies = lines.map(lambda x: (int(x.split()[1]), 1))
movieCounts = movies.reduceByKey(lambda x, y: x + y)

flipped = movieCounts.map(lambda xy: (xy[1], xy[0]))
sortedMovies = flipped.sortByKey()

results = sortedMovies.collect()

for result in results:
    print(result)
Example #17
from pyspark.sql import SparkSession
import sys
import csv
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import Row
from pyspark import SparkContext, SparkConf
import pyspark.sql.functions as func
file=sys.argv[1]
n=sys.argv[2]
s=sys.argv[3]
c=sys.argv[4]
conf = SparkConf().setAppName('Kia_bigdata_lab').setMaster('local')
sc = SparkContext(conf=conf)
spark=SparkSession.builder.appName("lab3").getOrCreate()
rd=sc.textFile(file).map(lambda x: Row((x.split(','))[0],x.split(',')[1:]))
df=rd.toDF(["items","plant"]).withColumn("id",monotonically_increasing_id())
df=df[["id","items","plant"]]

fpGrowth = FPGrowth(itemsCol="plant", minSupport=float(s), minConfidence=float(c))
model = fpGrowth.fit(df)

# Display frequent itemsets.
ml=model.freqItemsets
ml.orderBy([func.size("items"), "freq"], ascending=[0,0]).show(int(n))

# Display generated association rules.
#ml=model.associationRules.show(10)
#ml.orderBy([func.size("antecedent")],"confidence",ascending=[0,0]).show(int(n))
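
# The commented-out lines above would not run as written (show() returns None,
# so ml would not be a DataFrame); a minimal working variant of the same idea,
# not part of the original, is sketched below:
rules = model.associationRules
rules.orderBy([func.size("antecedent"), "confidence"], ascending=[0, 0]).show(int(n))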
Example #18
import sys
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, Rating


def loadMovieNames():
    movieNames = {}
    with open("ml-100k/u.item", encoding='ascii', errors="ignore") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames


conf = SparkConf().setMaster("local[*]").setAppName("MovieRecommendationsALS")
sc = SparkContext(conf=conf)
sc.setCheckpointDir('checkpoint')

print("\nLoading movie names...")
nameDict = loadMovieNames()

data = sc.textFile("ml-100k/u.data")

ratings = data.map(lambda l: l.split()).map(
    lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

# Build the recommendation model using Alternating Least Squares
print("\nTraining recommendation model...")
rank = 10
# Lowered numIterations to ensure it works on lower-end systems
numIterations = 6
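
# The example is truncated here; training would presumably continue with a call
# like the following (a sketch based on MLlib's ALS API, not the original code):
model = ALS.train(ratings, rank, numIterations)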
Example #19
def run():
    conf = SparkConf()
    #conf.set('spark.shuffle.blockTransferService', 'nio')
    conf.set('spark.files.fetchTimeout', '180')
    conf.set('spark.files.overwrite', 'yes')
    conf.set('spark.akka.timeout', '180')
    #conf.set('spark.task.maxFailures', '30000')
    conf.set('spark.akka.frameSize', '500')
    conf.set('spark.network.timeout', '180')

    myClassifierOnevsOne = pickle.load(open('myClassifierOnevsOne.p', 'rb'))

    dataSetMaker = DataSetMakerV2(n=200000)

    feed = FeedNewsFromGoogleFinance()

    def sendRecord(rdd):
        print('new try...')
        if (not rdd.isEmpty()):
            newsRDD = dataSetMaker.processKeepNews(rdd)
            res = newsRDD.map(
                lambda x: (x[0], myClassifierOnevsOne.predict(x[1].features)))
            print('for each result...')
            for result in res.collect():
                symbole = result[0].symbole
                r = requests.put('http://wtun.mooo.com:5000',
                                 data={
                                     'jdata':
                                     NewsPrediction(result[0],
                                                    str(result[1])).json(),
                                     'symbole':
                                     symbole,
                                     'label':
                                     str(result[1])
                                 })
                print('send ok')
                print('receive %s' % str(r.text))
        else:
            print('empty!')

    sc = SparkContext(conf=conf)

    symbolesRDD = sc.parallelize([('NASDAQ:GOOGL', ['GOOG', 'GOOGL',
                                                    'GOOGLE']),
                                  ('NASDAQ:NVDA', ['NVIDIA']),
                                  ('VTX:SCMN', ['SWISSCOM'])])
    taskdt = 600
    running = True
    oldNewsRDD = None
    firstTime = True
    intersectRDD = None
    dataDirectory = 'hdfs://157.26.83.52/user/wdroz/stream2'
    cpt = 0
    while (running):
        today = datetime.datetime.now()
        yesterday = today - datetime.timedelta(days=1)
        tomorrow = today + datetime.timedelta(days=1)
        newsRDD = symbolesRDD.flatMap(
            lambda x: feed.lookingAt(x[0], yesterday, tomorrow, x[1]))
        if (firstTime):
            firstTime = False
            intersectRDD = newsRDD
        else:
            try:
                intersectRDD = oldNewsRDD.intersection(newsRDD)
            except:
                pass  # empty rdd

        oldNewsRDD = newsRDD

        try:
            sendRecord(intersectRDD)
            intersectRDD.saveAsPickleFile(
                dataDirectory + '/' +
                datetime.datetime.now().strftime('%Y-%m-%d--') + str(cpt))
            cpt += 1
        except:
            pass  # empty rdd

        time.sleep(taskdt)

    running = False  # TODO remove it
Example #20
        "com.databricks.spark.csv").option("header",
                                           "true").save('locked_data.csv')
    byDateUnlocked.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save('unlocked_data.csv')

    # 5 over_18
    byDate18 = df10.filter('over_18 == true').select(
        to_date(df10.created_utc.cast('timestamp')).alias('date'),
        df10.Positive,
        df10.Negative).groupBy('date').avg('Positive', 'Negative')
    byDate18.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save('over18_data.csv')

    # final 4
    dff4 = df10.groupBy('title').agg(
        avg('Positive').alias('avgPos'),
        avg('Negative').alias('avgNeg'))
    dff4.orderBy('avgPos', ascending=0).limit(10).show(truncate=False)
    dff4.orderBy('avgNeg', ascending=0).limit(10).show(truncate=False)


if __name__ == "__main__":
    conf = SparkConf().setAppName("CS143 Project 2B")
    conf = conf.setMaster("local[*]")
    # conf = (conf.set('spark.executor.memory', '4G').set('spark.driver.memory', '4G'))
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    sc.addPyFile("cleantext.py")
    main(sqlContext)
Example #21
from pyspark import SparkContext, SparkConf
import collections


def parseLine(line):
    cells = line.split(',')
    return (int(cells[0]), float(cells[2]))


conf = SparkConf().setMaster("local").setAppName("CustomerOrders")
sc = SparkContext(conf=conf)

rdd = sc.textFile("customer-orders.csv")
custOrders = rdd.map(parseLine)
custAmounts = custOrders.reduceByKey(lambda x, y: x + y).map(lambda x:
                                                             (x[1], x[0]))
custAmountsSorted = custAmounts.sortByKey()
results = custAmountsSorted.collect()

for result in results:
    print(str(result[1]) + ": {:.2f}".format(result[0]))
'''
Output:

45: 3309.38
79: 3790.57
96: 3924.23
23: 4042.65
99: 4172.29
...
...
Example #22
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

conf = SparkConf().setAppName('WriteAPIs').setMaster('local')
sc = SparkContext(conf=conf)
ss = SparkSession.builder.appName('WriteAPIs').master('local').getOrCreate()

#-----------------------------------------------------------------------------------------------
# Writing to files - RDD
#-----------------------------------------------------------------------------------------------
# car_file = '/Users/soumyadeepdey/HDD_Soumyadeep/TECHNICAL/Training/Intellipaat/PySparkCodes/sampledata/car_sales_data.csv'
# output_file = '/Users/soumyadeepdey/HDD_Soumyadeep/TECHNICAL/Training/Intellipaat/IntellipaatSpark/OutputFile/car_sales_data_out'

# num_partitions = 16
# rdd = sc.textFile(car_file)
# rdd = sc.textFile(car_file, num_partitions)
# print('Total no of partitions: ',rdd.getNumPartitions())

# rdd1 = rdd.map(lambda x: (x.split(',')[3],x.split(',')[5],x.split(',')[11]))
# rdd1.saveAsTextFile(output_file)
# rdd1.coalesce(1).saveAsTextFile(output_file)
# rdd1.repartition(1).saveAsTextFile(output_file)

# rdd2 = sc.textFile("/Users/soumyadeepdey/HDD_Soumyadeep/TECHNICAL/Training/Intellipaat/IntellipaatSpark/OutputFile/car_sales_data_out/part-*")
# print(rdd2.getNumPartitions())

# print(rdd1.count())
# print(rdd2.count())
#
# for i in rdd1.take(5):
#     print(i)
Example #23
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark import SparkConf
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

from src.serial import NaiveDBSCAN, MatrixDBSCAN
from src.utils import DataLoader, Evaluation, timeit
from src.settings import UNKNOWN, NOISE

import numpy as np

# broadcast variable
b_dataset = None
b_eps = None
b_min_pts = None


def load_data_label(path):
    pts = sc.textFile(path).map(lambda x: x.strip().split()[:-1]).map(
        lambda x: tuple([float(i) for i in x]))
    return pts.collect()


def load_data(path):
    pts = sc.textFile(path).map(lambda x: x.strip().split()).map(
        lambda x: tuple([float(i) for i in x]))
    return pts.collect()
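
# A minimal sketch, not part of the original, of how the broadcast placeholders
# above might be populated before a distributed run; the path and parameter
# values here are hypothetical:
b_dataset = sc.broadcast(load_data('data/points.txt'))
b_eps = sc.broadcast(0.5)
b_min_pts = sc.broadcast(5)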

Example #24
sys.path.append('..')
import settings

logger = logging.getLogger()
logger.setLevel(settings.COSINE_SIMILARITY['LOG_LEVEL'])

try:
    from pyspark import SparkConf, SparkContext
except ImportError as e:
    logging.error("Can not import Spark Modules: %s", e)
    sys.exit(1)

logging.info("Successfully imported Spark Modules")

conf = SparkConf().setMaster("local").setAppName("AggregatingMotionDeviceData")
sc = SparkContext(conf=conf)

# Script specific configurations
MINUTE_WINDOW = settings.COSINE_SIMILARITY['MINUTES_PER_WINDOW'] * 60 * 1000
BASE_TIME = settings.COSINE_SIMILARITY['BASE_TIME']
MAX_TIME = settings.COSINE_SIMILARITY['MAX_TIME']
INPUT_DIR = settings.COSINE_SIMILARITY['INPUT_DIR']
OUTPUT_DIR = '../front_end/motion_split_files_' + str(settings.COSINE_SIMILARITY['MINUTES_PER_WINDOW']) \
    + '_mins_window/'
if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)


# Actual Code
def parse_line(line):
Example #25
    sum_xx = sum_yy = sum_xy = 0
    for ratingX, ratingY in ratingPairs:
        sum_xx += ratingX * ratingX
        sum_yy += ratingY * ratingY
        sum_xy += ratingX * ratingY
        numPairs += 1
    numerator = sum_xy
    denominator = sqrt(sum_xx) * sqrt(sum_yy)
    score = (numerator / (float(denominator))) if (denominator) else 0
    return (score, numPairs)

#use Spark's built-in cluster manager to treat every core of this laptop as a node
print("\nLoading movie names...")

#build a SparkContext and create ratings: [user_ID, (movieID, rating)]
data = SparkContext(conf = SparkConf()).textFile("source/ratings.dat")
ratings = data.map(lambda l: l.split()).map(lambda l: (int(l[0]), (int(l[1]), float(l[2]))))

# Emit every movie rated together by the same user.
# Self-join to find every combination.
joinedRatings = ratings.join(ratings)  #[_user_ID, ((movieID1, rating1), (movieID2, rating2))]

# Filter out duplicate pairs. filterDuplicates is a function that returns True or False
uniqueJoinedRatings = joinedRatings.filter(filterDuplicates)

# Now key by movie pairs: [(movie1, movie2), (rating1, rating2)]
moviePairs = uniqueJoinedRatings.map(makePairs)

# We now have (movie1, movie2) => (rating1, rating2)
# Now collect all ratings for each movie pair and compute similarity
moviePairRatings = moviePairs.groupByKey()  #[(movie1, movie2), ((rating1, rating2), (rating1, rating2) ...)]
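
# The listing stops here; presumably the truncated function at the top of this
# example (the cosine-similarity tail) is applied to every movie pair next.
# A sketch, assuming that function is named computeCosineSimilarity (hypothetical):
moviePairSimilarities = moviePairRatings.mapValues(computeCosineSimilarity).cache()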
Example #26
import findspark
from pyspark import SparkContext, SparkConf

findspark.init(python_path='/Users/khwu/.virtualenvs/spark/bin/python3')

if __name__ == '__main__':
    conf = SparkConf().setAppName('join').setMaster('local[*]')
    sc = SparkContext(conf=conf)
    sc.setLogLevel('ERROR')
    ages = sc.parallelize([("Tom", 29), ("John", 22)]).persist()
    addresses = sc.parallelize([("James", "USA"), ("John", "UK")]).persist()

    ages.join(addresses) \
        .coalesce(1) \
        .saveAsTextFile('../../out/age_address_join.text')

    ages.leftOuterJoin(addresses) \
        .coalesce(1) \
        .saveAsTextFile('../../out/age_address_left_out_join.text')

    ages.rightOuterJoin(addresses) \
        .coalesce(1) \
        .saveAsTextFile('../../out/age_address_right_out_join.text')

    ages.fullOuterJoin(addresses) \
        .coalesce(1) \
        .saveAsTextFile('../../out/age_address_full_out_join.text')
Example #27

def vec_sum(x, y):
    return [(x[i] + y[i]) for i in range(len(x))]


def generalized_error(y):
    key = y[0][0]
    value = [z[1] for z in y]
    gen_error = functools.reduce(vec_sum, value, [0, 0])
    gen_error = [y / num_points for y in gen_error]

    return key, gen_error


from pyspark import SparkConf, SparkContext
if len(sys.argv) != 2:
    print('Usage: ' + sys.argv[0] + ' <out>')
    sys.exit(1)
outputloc = sys.argv[1]

conf = SparkConf().setAppName('sim')
sc = SparkContext(conf=conf)

keys = sc.parallelize(par)
data = keys.map(get_data)
data = data.flatMap(lambda x: x)
error = data.map(classify)
gen_error = error.reduceByKey(generalized_error)
gen_error.saveAsTextFile(outputloc)
sc.stop()
Example #28
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_RPT_SUN_INFO_DETAIL').setMaster(sys.argv[2])
sc = SparkContext(conf = conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

#dates needed for the processing below
etl_date = sys.argv[1]
#ETL date
V_DT = etl_date
#previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
#first day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
#last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
#date in 10-character form (YYYY-MM-DD)
V_DT10 = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8]))).strftime("%Y-%m-%d")
V_STEP = 0
Example #29
def init_spark():
    conf = SparkConf().setAppName("Music").setMaster("local")
    return SparkContext(conf=conf)
Example #30
# coding=utf-8

from pyspark import SparkContext, SparkConf
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('Naive Bayes').setMaster('local[2]')
sc = SparkContext(conf=conf)

# load and parse data file
data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')

# split data approximately into training and test
training, test = data.randomSplit([0.6, 0.4])

# train a naive bayes model
model = NaiveBayes.train(training, 1.0)

# make prediction and test accuracy
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(
    lambda vp: vp[0] == vp[1]).count() / test.count()
print('model accuracy: {}'.format(accuracy))

# save and load model
output_dir = '../model/myNaiveBayesModel'
# MLUtils.rmtree(output_dir, ignore_errors=True)
model.save(sc, output_dir)
sameModel = NaiveBayesModel.load(sc, output_dir)
predictionAndLabel = test.map(lambda p:
                              (sameModel.predict(p.features), p.label))
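
# Presumably (not shown in the truncated snippet) the reloaded model's accuracy
# is then checked the same way:
accuracy = 1.0 * predictionAndLabel.filter(
    lambda vp: vp[0] == vp[1]).count() / test.count()
print('reloaded model accuracy: {}'.format(accuracy))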