from pyspark import SparkConf, SparkContext


def init_spark_context():
    conf = SparkConf().setAll([("spark.app.name", "Spark_Processor"), ("spark.redis.port", "6379"),
                               ("spark.jars", "spark-redis-branch-2.4/target/spark-redis_2.11-2.5.0-SNAPSHOT-jar-with-dependencies.jar")])
    sc = SparkContext(conf=conf)
    return sc
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import RFormula

if __name__ == "__main__":

    # We can use Spark in either local mode or cluster mode. Below is the configuration for local mode.
    sc = SparkContext("local", "Hello World")
    sc.setLogLevel('ERROR')

    # Start a Spark session from the context
    spark = SparkSession(sc)

    dataset = spark.createDataFrame(
        [(7, "US", 18, 1.0),
        (8, "CA", 12, 0.0),
        (9, "NZ", 15, 0.0)],
        ["id", "country", "hour", "clicked"])

    formula = RFormula(
        formula="clicked ~ country + hour",
        featuresCol="features",
        labelCol="label")

    output = formula.fit(dataset).transform(dataset)
    output.select("features", "label").show()

    sc.stop()
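    # Cluster-mode counterpart of the local-mode setup above -- a minimal,
    # hypothetical sketch. The master URL and executor memory below are
    # placeholders; in practice they are usually passed via spark-submit
    # rather than hard-coded.
    from pyspark import SparkConf
    cluster_conf = (SparkConf()
                    .setAppName("Hello World")
                    .setMaster("spark://master-host:7077")  # hypothetical master URL
                    .set("spark.executor.memory", "2g"))
    # sc = SparkContext(conf=cluster_conf)  # would create the context on the cluster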

def process_rdd(rdd):
    # Tail of the per-RDD handler from the original example; insert_table and
    # the table-creation code come from the elided part of the snippet.
    try:
        word_counts_df = rdd.toDF(["word", "count"])
        if not globals()['table_created']:
            # create the target table here (elided in the original snippet)
            globals()['table_created'] = True

        # insert data into table
        insert_table(word_counts_df.toPandas())

    except Exception as e:
        print('Error:', e)


if __name__ == "__main__":

    global table_created
    table_created = False

    # create spark context with the above configuration
    sc = SparkContext(appName='TwitterStream')
    sc.setLogLevel('ERROR')

    # create the Streaming Context from the above spark context with interval size 2 seconds
    ssc = StreamingContext(sc, 2)

    # read data from port
    with open('config.yaml', 'r') as stream:
        details = yaml.safe_load(stream)

    lines = ssc.socketTextStream(details['host'], details['port'])

    # split each tweet into words
    words = lines.flatMap(lambda line: line.split(' '))

    # do processing for each RDD generated in each interval
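    # Hedged completion of the truncated snippet: count the words in every
    # batch, hand each resulting RDD to the process_rdd handler sketched above,
    # then start the streaming context. The names below follow that
    # reconstruction, not the elided original code.
    word_counts = words.map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)
    word_counts.foreachRDD(process_rdd)

    ssc.start()
    ssc.awaitTermination()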
Example #4
from pyspark import SparkConf, SparkContext


def spark_context(master):
    conf = SparkConf().setAppName('zhangxinyun-spark').setMaster(master)
    sc = SparkContext(conf=conf)
    return sc
Example #5
import sys
import datetime

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

#Normally in Spark you'd use a Window. 
#We cannot do this with our stream because it simply returns the last 150 trades
def isWithin30Sec(time):
    return datetime.datetime.strptime(time, '%Y-%m-%d %H:%M:%S') >= \
        datetime.datetime.now() - datetime.timedelta(seconds=30)

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: btcmonitor.py <hostname> <port>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="BTCPriceMonitor")
    ssc = StreamingContext(sc, 1)

    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
   
    prices = lines.filter(lambda line: len(line.split(',')) > 5) \
                  .filter(lambda line: isWithin30Sec(line.split(',')[5]))

    sums = prices.map(lambda line: (line.split(',')[0], float(line.split(',')[1]))).reduceByKey(lambda a,b: a+b)
    counts = prices.map(lambda line: (line.split(',')[0], 1)).reduceByKey(lambda a,b: a+b)
    sums = sums.join(counts)
    avg = sums.map(lambda k: (k[0], k[1][0]/k[1][1]))


    avg.pprint()
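    # For comparison: the 30-second cut-off expressed with a sliding window on
    # the DStream itself (the approach the comment above says is not usable for
    # this replaying source). A hedged sketch reusing the `lines` stream:
    windowed = lines.filter(lambda line: len(line.split(',')) > 5) \
                    .map(lambda line: (line.split(',')[0], (float(line.split(',')[1]), 1))) \
                    .window(30, 1)
    windowed_avg = windowed.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])) \
                           .map(lambda kv: (kv[0], kv[1][0] / kv[1][1]))
    windowed_avg.pprint()

    # The original snippet stops before starting the context; a streaming job
    # also needs the two calls below to actually run.
    ssc.start()
    ssc.awaitTermination()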
 
Example #6
#    Spark
from pyspark import SparkContext

#    Spark Streaming
from pyspark.streaming import StreamingContext

#    Kafka
from pyspark.streaming.kafka import KafkaUtils

#    json parsing
import json

sc = SparkContext(appName="PythonSparkStreamingKafka_RM_01")
sc.setLogLevel("WARN")

ssc = StreamingContext(sc, 60)

kafkaStream = KafkaUtils.createStream(ssc, "192.168.0.9:9092",
                                      "spark-streaming", {"jsontest1": 1})

parsed = kafkaStream.map(lambda v: json.loads(v[1]))

# print a few parsed records from each batch
parsed.pprint()

ssc.start()
ssc.awaitTermination()
Example #7
from pyspark import SparkContext


def createCombiner(kw):
    return set([kw])


def mergeValue(kw_set, kw):
    kw_set.update([kw])
    return kw_set


def mergeCombiners(set0, set1):
    set0.update(set1)
    return set0


sc = SparkContext(appName="platform")
data = sc.textFile("/commit/regist/daichang/yixing.phonecheck.2017-02-08").map(lambda a: f(a)).filter(
    lambda a: a is not None).cache()

previous = sc.textFile("/user/lel/results/yixin/previous/*").map(lambda a: a.split("\t")).collectAsMap()
previous_b = sc.broadcast(previous)
previous_bv = previous_b.value

othersRDD = data.filter(lambda a: a[1] not in '拉卡拉') \
    .combineByKey(lambda a: createCombiner(a), lambda a, b: mergeValue(a, b), lambda a, b: mergeCombiners(a, b)) \
    .map(lambda a: distinct_pre(a, previous_bv)).filter(lambda a: a is not None)
othersRDD.coalesce(1).map(lambda a: a[0] + "\t" + ','.join(list(set(a[1])))).saveAsTextFile(
    "/user/lel/results/yixin/except_lakala20170213")

others_dis = othersRDD.collectAsMap()
others_dis_b = sc.broadcast(others_dis)
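
# Hedged illustration of the three combineByKey callbacks defined above on a
# toy RDD: every key ends up with the set of its values.
toy = sc.parallelize([("a", "x"), ("a", "y"), ("b", "x")])
print(toy.combineByKey(createCombiner, mergeValue, mergeCombiners).collect())
# expected output along the lines of: [('a', {'x', 'y'}), ('b', {'x'})]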
import os
from pathlib import Path

STREAM_IN = 'stream-IN'
STREAM_OUT = 'stream-OUT'

# We first delete all files from the STREAM_IN folder
# before starting spark streaming.
# This way, all files are new
print("Deleting existing files in %s ..." % STREAM_IN)
p = Path('.') / STREAM_IN
for f in p.glob("*.ordtmp"):
    os.remove(f)
print("... done")

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

sc = SparkContext("local[*]", "CountAndVolumePerBatch")
sc.setLogLevel(
    "WARN")  #Make sure warnings and errors observed by spark are printed.

ssc = StreamingContext(sc, 5)  #generate a mini-batch every 5 seconds
filestream = ssc.textFileStream(
    STREAM_IN)  #monitor new files in folder stream-IN


def parseOrder(line):
    '''parses a single line in the orders file'''
    s = line.split(",")
    try:
        if s[6] != "B" and s[6] != "S":
            raise Exception('Wrong format')
        return [{
Example #9
import sys
import json

from pyspark import SparkContext


def get_adid_terms(line):
    entry = json.loads(line.strip())
    ad_id = entry['adId']
    adid_terms = []
    #print entry['keyWords']
    #use hashmap to dedupe adid_term
    for term in entry['keyWords']:
        val = str(ad_id) + "_" + term
        adid_terms.append(val)
    return adid_terms


def generate_json(items):
    result = {}
    result['term'] = items[0]
    result['doc_freq'] = items[1]
    return json.dumps(result)


if __name__ == "__main__":
    adfile = sys.argv[1]  #raw ads data
    sc = SparkContext(appName="DF_Features")
    # distinct() dedupes the (adId, term) pairs:
    # [1111_makeup, 2311_makeup, 2311_makeup, 987_makeup, 433_cosmetic, 867_cosmetic]
    #   => [1111_makeup, 2311_makeup, 987_makeup, 433_cosmetic, 867_cosmetic]
    # then each pair becomes (term, 1): (makeup, 1), (makeup, 1), (makeup, 1), (cosmetic, 1), (cosmetic, 1)
    data = sc.textFile(adfile).flatMap(lambda line: get_adid_terms(
        line)).distinct().map(lambda w: (get_term(w), 1)).reduceByKey(
            lambda v1, v2: v1 + v2).map(generate_json)
    data.saveAsTextFile("/Users/jiayangan/project/SearchAds/data/log/DF13")
    sc.stop()

import numpy as np

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint, StreamingLinearRegressionWithSGD


def transform_training_row_into_lp(row):
    features = Vectors.dense(row["x"])
    label = row["label"]
    return LabeledPoint(label, features)


def transform_test_row(row):
    return Vectors.dense(row["x"])


if __name__ == "__main__":
    # Create a local SparkContext and a StreamingContext whose second
    # argument is the batch duration (5 seconds here)
    sc = SparkContext("local[1]", "Streaming Linear Regression")
    ssc = StreamingContext(sc, 5)

    directKafkaStream = KafkaUtils.createDirectStream(ssc,
                                                      ["trendy-topic"],
                                                      {"metadata.broker.list": "localhost:9092"})


    model = StreamingLinearRegressionWithSGD()
    model.setInitialWeights(np.random.rand(NUM_FEATURES))

    numStream = directKafkaStream.flatMap(extract_data_rows_from_json)

    trainingStream = numStream.filter(lambda row: row["known"]).map(transform_training_row_into_lp)
    testStream = numStream.filter(lambda row: not row["known"]).map(transform_test_row)
Example #11
from pyspark import SparkContext

sc = SparkContext("local[2]", "First Spark App")
data = sc.textFile("data/UserPurchaseHistory.csv").map(lambda line: line.split(",")).map(lambda record: (record[0], record[1], record[2]))

numPurchases = data.count()
uniqueUsers = data.map(lambda record: record[0]).distinct().count()
totalRevenue = data.map(lambda record: float(record[2])).sum()
products = data.map(lambda record: (record[0], 1.0)).reduceByKey(lambda a, b: a+b).collect()
mostPopular = sorted(products, key=lambda x: x[1], reverse=True)[0]

print "Total pruchase: %d" % numPurchases
print "Unique users: %d" % uniqueUsers
print "Total revenue: %2.2f" % totalRevenue
print "Most popular product: %s with %d purchases" % (mostPopular[0], mostPopular[1])
Example #12
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.fpm import FPGrowth
sc = SparkContext('local')
spark = SparkSession(sc)

txt = sc.textFile("./output/*/*")
temp_var = txt.map(lambda k: (0, list(set(k.split(" ")))))
df = temp_var.toDF(["id", "words"])

fpGrowth = FPGrowth(itemsCol="words", minSupport=0.1, minConfidence=0.1)
model = fpGrowth.fit(df)

# Display frequent itemsets.
model.freqItemsets.show()

# Display generated association rules.
model.associationRules.show()

# transform examines the input items against all the association rules and summarize the
# consequents as prediction
model.transform(df).show()
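
# Hedged follow-up: scoring a new, hypothetical basket against the mined rules;
# items that appear as rule consequents and are not already in the basket come
# back in the `prediction` column. The item names here are placeholders.
new_baskets = spark.createDataFrame([(0, ["word_a", "word_b"])], ["id", "words"])
model.transform(new_baskets).show(truncate=False)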
Example #13
        #filename is the last integer value in filename text
        filename = int(file.split("/")[-1])
        tuples = ()
        for word in word_set:
            if word != "":
                tuples += ((word, [filename]), )
        return tuples
    except:
        print('Filename is not an integer.')


if __name__ == "__main__":

    #create spark context
    spark_context = SparkContext(appName="inverted_index_search",
                                 conf=SparkConf().set("spark.driver.host",
                                                      "localhost"))
    #turn inputs into RDDs
    inputRDDs = spark_context.wholeTextFiles("../input/*")
    #tokenize texts
    #set the words as keys and filenames as values
    processed = inputRDDs.map(tokenize).map(filename_value).flatMap(
        lambda x: x)
    #reduce the previous rdd so that word keys now have a list of all the filenames that contain that word
    reduced = processed.reduceByKey(lambda x, y: x + y)
    #assign an index to every key value pair in the previous rdd
    zipped = reduced.zipWithIndex()
    #create a dictionary with words as keys and indices as values
    dictionary = zipped.map(lambda x: (x[0][0], x[1])).collectAsMap()

    #output dictionary
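    # Hedged sketch of the output step the snippet cuts off before: persist the
    # word -> index map as JSON (the file name is hypothetical).
    import json
    with open("inverted_index.json", "w") as out:
        json.dump(dictionary, out)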
Example #14
from pyspark import SparkContext

if __name__ == '__main__':
    sc = SparkContext('local', 'wordcount')
    lines = sc.textFile('/home/wangheng/Desktop/spark_test_data.txt', 1)
    words = lines.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda x: (x, 1))
    count = pairs.reduceByKey(lambda x, y: x + y)
    for x in count.collect():
        print(x[0], " appear ", x[1], " times!")
Example #15

import json

from pyspark import SparkContext
from pyspark.sql.session import SparkSession


def map_ORCID(x):
    result = []
    author = dict(fullname='',
                  identifiers=[
                      dict(scheme='ORCID',
                           value="https://orcid.org/" + x['orcid'],
                           provenance='ORCID')
                  ],
                  affiliations=[],
                  given=x.get('firstname', ''),
                  family=x.get('lastname', ''))
    fullname = "%s %s" % (author['given'], author['family'])
    author['fullname'] = fullname.strip()
    for item in x['publications']:
        if item['doi'] is not None and len(item['doi']) > 0:
            result.append((item['doi'].lower(), [author]))
    return result


if __name__ == '__main__':
    sc = SparkContext(appName='generateORCIDDataFrame')
    spark = SparkSession(sc)
    sc.textFile('/data/orciddump.txt').map(
        json.loads).flatMap(map_ORCID).reduceByKey(lambda a, b: a + b).map(
            lambda x: dict(doi=x[0], authors=x[1])).toDF(
                get_schema()).write.save("/data/ORCID.parquet",
                                         format="parquet")
    # .saveAsTextFile(path="/data/ORCID_df",compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
Example #16
#
# (c) Copyright 2016 Hewlett Packard Enterprise Development LP
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#

from __future__ import print_function
from pyspark import SparkConf
from pyspark import SparkContext

if __name__ == "__main__":

    my_spark_conf = SparkConf().setAppName("Quicksum")
    spark_context = SparkContext(conf=my_spark_conf)
    data = [1, 2, 3, 4, 5]
    distData = spark_context.parallelize(data)
    total = distData.reduce(lambda a, b: a + b)
    print("Total is %s" % total)
    spark_context.stop()
Example #17
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from kafka import KafkaProducer
import json
from json import dumps

if __name__ == "__main__":

    def to_kafka(rdd):
        cnt = rdd.count()
        if cnt > 0:
            data = rdd.take(1)
            producer.send('spark_event_out', value=data)

    sc = SparkContext(appName="StreamingKafkaEventAggregator")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, 10)

    print('Listening to topic')
    data = 'Processed a batch now'

    producer = KafkaProducer(
        bootstrap_servers=['sandbox-hdp.hortonworks.com:6667'],
        value_serializer=lambda x: dumps(x).encode('utf-8'))

    kafkaStream = KafkaUtils.createStream(ssc, 'localhost:2181',
                                          'spark-streaming',
                                          {'spark_event': 1})
    parsed = kafkaStream.map(lambda v: json.loads(v[1]))
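
    # Hedged completion of the truncated snippet: forward each parsed batch to
    # Kafka through the handler defined above, then start the streaming context.
    parsed.foreachRDD(to_kafka)

    ssc.start()
    ssc.awaitTermination()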
"""SimpleApp.py"""
from pyspark import SparkContext

logFile = "file:///usr/local/spark/README.md"
sc = SparkContext("local", "Simple App")
logData = sc.textFile(logFile).cache()

numAs = logData.filter(lambda s: 'a' in s).count()
numBs = logData.filter(lambda s: 'b' in s).count()

print("Lines with a: %i, lines with b: %i" % (numAs, numBs))

sc.stop()
Example #19
#    Spark (the streaming/Kafka/json imports from the other snippets are not used here)
from pyspark import SparkContext

sc = SparkContext(appName="spark2")
sc.setLogLevel("WARN")

airVelocityKMPH = [12, 13, 15, 12, 11, 12, 11]
parVelocityKMPH = sc.parallelize(airVelocityKMPH, 2)

countValue = parVelocityKMPH.count()

sumValue = parVelocityKMPH.sum()

meanValue = parVelocityKMPH.mean()

varianceValue = parVelocityKMPH.variance()

sampleVarianceValue = parVelocityKMPH.sampleVariance()

stdevValue = parVelocityKMPH.stdev()

sampleStdevValue = parVelocityKMPH.sampleStdev()

parVelocityKMPH.stats().asDict()
Example #20
import sys
from pyspark import SparkContext
import numpy as np
from sklearn.svm import SVC
from sklearn import preprocessing

output_path = sys.argv[1]
input_train = sys.argv[2]
input_all = sys.argv[3]

sc = SparkContext(appName="train")
rdd_train = sc.textFile(input_train)
rdd_all = sc.textFile(input_all)


# common func =========================
def splitx(raw):
    items = raw.split(' ')
    mtr_x = []
    mtr_y = []
    mtr_t = []
    for v in items[1].split(';'):
        tmp = v.split(',')
        if len(tmp) != 3:
            continue
        mtr_x.append(float(tmp[0]))
        mtr_y.append(float(tmp[1]))
        mtr_t.append(float(tmp[2]))
    gtmp = items[2].split(',')
    goal = [float(gtmp[0]), float(gtmp[1])]
    if len(items) == 4:
Example #21
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function
import numpy as np
from pyspark import SparkContext
# $example on$
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="CorrelationsExample")  # SparkContext

    # $example on$
    seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])  # a series
    # seriesY must have the same number of partitions and cardinality as seriesX
    seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])

    # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    # If a method is not specified, Pearson's method will be used by default.
    print("Correlation is: " +
          str(Statistics.corr(seriesX, seriesY, method="pearson")))
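
    # The alternative method mentioned in the comment above, for comparison
    # (Spearman rank correlation on the same two series):
    print("Spearman correlation is: " +
          str(Statistics.corr(seriesX, seriesY, method="spearman")))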

    data = sc.parallelize([
        np.array([1.0, 10.0, 100.0]),
        np.array([2.0, 20.0, 200.0]),
        np.array([5.0, 33.0, 366.0])
Example #22
from pyspark.sql import SQLContext
from pyspark import SparkContext
import re
# other required imports here

if __name__ == "__main__":
    # create Spark context with necessary configuration
    spark = SparkContext("local", "Stock Returns")

    # read json data from the newdata directory
    # df = SQLContext(spark).read.option("multiLine", True) \
    # .option("mode", "PERMISSIVE").json("./newsdata")
    schema = (
        'date STRING, open FLOAT, high FLOAT, low FLOAT, close FLOAT, volume INT, ticker STRING'
    )

    df = SQLContext(spark).read.csv('stock_prices.csv',
                                    schema=schema,
                                    header=False)
    # df.show(2)
    # lines = df.select("date","open","close")
    # sim = df.withColumn("percent", (df("close") - df("open"))*100/df("open"))
    sim = df.withColumn("return",
                        (df["close"] - df["open"]) * 100 / df["open"])
    # sim.groupBy('date').avg('return').show()
    # sim.select("date","return").groupBy("date").avg()
    x = sim.groupBy("date").avg("return")
    x.collect()
    # sim=sim.select('date','return')
    # df.groupBy(df.date).avg(df.close - df.open).show()
    # vals = lines.map(lambda row: row[2]-row[1])
    # Tail of the run_model() handler from the original example; collecting the
    # batch into `input` and updating `features` happen in the elided code above.
    interval.add(1)
    events.add(len(input))
    # If the model predicts True, warn the user in red
    if model.predict(features):
        print('\033[31m%s.%03dZ: Interval %d: Attention needed (%d sensor events in interval)\033[0m'
              % (strftime("%Y-%m-%dT%H:%M:%S", gmtime()), (time() * 1000) % 1000, interval.value, len(input)))
    else:
        print('%s.%03dZ: Interval %d: Everything is OK (%d sensor events in interval)'
              % (strftime("%Y-%m-%dT%H:%M:%S", gmtime()), (time() * 1000) % 1000, interval.value, len(input)))
    if last_batch:
        ssc.stop()

# Initialize features to <number of sensors>-length array, filled with neutral initial sensor value
features = np.zeros(n_sensors)
features.fill(0.5) 

# Initialize streaming for specified reporting interval
sc = SparkContext(appName="iotstream_lr_mqtt")
interval = sc.accumulator(0)
empty_intervals = sc.accumulator(0)
events  = sc.accumulator(0)
ssc = StreamingContext(sc, reporting_interval)
sensor_stream = MQTTUtils.createStream(ssc, mqtt_URL, mqtt_topic)

# Load pre-computed model
model = LogisticRegressionModel.load(sc, modelname)

# Run model on each batch
#sensor_stream.pprint(10)
sensor_stream.foreachRDD(run_model)

# Start reading streaming data
ssc.start()
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# the local[*] master must allow a parallelism greater than 1
sc = SparkContext("local[2]", "streaming")
# set the batch interval for each computation
ssc = StreamingContext(sc, 5)
lines = ssc.textFileStream('file:///home/zj/logs')
words = lines.flatMap(lambda l: l.split())
wordsPair = words.map(lambda x: (x, 1))
wordscount = wordsPair.reduceByKey(lambda a, b: a + b)
wordscount.pprint()

ssc.start()
ssc.awaitTermination()

Example #25
from __future__ import print_function
import sys
from pyspark import SparkContext
from csv import reader
from operator import add

if __name__ == "__main__":
    sc = SparkContext()
    lines1 = sc.textFile(sys.argv[1], 1)
    lines1 = lines1.mapPartitions(lambda x: reader(x))
    vehicles = lines1.map(lambda x: ((x[14], x[16]), 1)).reduceByKey(add)
    vehicles = vehicles.sortBy(lambda x: x[0][0]).sortBy(lambda x: x[1], False)
    res = sc.parallelize(vehicles.take(20))
    res = res.map(
        lambda x: "{0:s}, {1:s}\t{2:d}".format(x[0][0], x[0][1], x[1]))
    res.saveAsTextFile("task6.out")
    sc.stop()
Example #26

create_users_table_stmt = """CREATE TABLE IF NOT EXISTS users(
                                id INT PRIMARY KEY,
                                username VARCHAR(50) NOT NULL,
                                firstname VARCHAR(100),
                                lastname VARCHAR(100),
                                picture TEXT
                                );
					"""
add_user_stmt = """INSERT IGNORE INTO users(id, username, firstname, lastname, picture)
		VALUES (%s,%s,%s,%s, %s);"""

if __name__ == "__main__":

    sc = SparkContext(appName="venmoApp-userinfo-mysql")

    data_location = get_url(sys.argv)

    if data_location is None:
        logging.error("not a valid data location.\nExiting the program")
        sys.exit(0)
    logging.info("Processing:" + data_location)

    data_rdd = sc.textFile(data_location)

    parsed_users = data_rdd.flatMap(parse_user_info).\
                filter(lambda data: data is not None)

    table_created = sql_create_table(create_users_table_stmt)
Example #27
    surrogateCoeff = 0.1  # denotes the cluster ratio in the surrogate model
    tabuMaxLength = 10
    tabuMaxIter = 100
    maxNumCandidate = 10


    core_num = int(sys.argv[1])
    conf = SparkConf().setMaster("spark://noah007:7077") \
        .setAppName("SPC-POSM-PSO") \
        .set("spark.submit.deployMode", "client") \
        .set("spark.cores.max", core_num) \
        .set("spark.executor.cores", "10") \
        .set("spark.executor.memory", "20g") \
        .set("spark.driver.memory", "40g")

    sc = SparkContext(conf=conf)


    '''
        experiment for accuracy on different dataset
    '''
    '''
    instanceSet = ['nuoxi2G']  # , 'nuoxi3G', 'huawei2G', 'huawei3G']
    '''
    '''
    instanceSet = [i for i in range(60)]
    aveAns, aveRuntime, aveConverGen = [], [], []

    for i in instanceSet:
        print i, 'th instance ...'
        # po is data that contains information about PROVIDERS and CUSTOMERS
Example #28
import sys
from pyspark import SparkContext
from itertools import combinations
import time
import random

time_start = time.time()

sc = SparkContext("local[*]", "Assignment 3 LSH Task 1")

dataFile = sc.textFile(sys.argv[1])
# dataFile = sc.textFile(sys.argv[1])
# num_chunk = 2  # 4
#     lines = sc.textFile(sys.argv[1], num_chunk)
# use this statement when run so slow
# dataFile = dataFile.repartition(2)
#
# remove duplicate
dataFile1 = dataFile.map(lambda x: x.split(","))
header = dataFile1.first()
dataFile1 = dataFile1.filter(lambda x: x != header)

unique_user_id = dataFile1.map(lambda row: row[0]).distinct().collect()
unique_user_id.sort()

# index_user_map = {}
user_index_map = {}
user_index = 0
for user_item in unique_user_id:
    # index_user_map[i] = user_item
    user_index_map[user_item] = user_index
Example #29
import sys
from pyspark import SparkContext


# given the list of neighbors for a page and that page's rank, calculate
# what that page contributes to the rank of its neighbors
def computeContribs(neighbors, rank):
    for neighbor in neighbors:
        yield (neighbor, rank / len(neighbors))
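

# Quick sanity check of the helper above: a page with rank 1.0 and two
# outgoing links contributes 0.5 to each neighbor.
assert list(computeContribs(["page1", "page4"], 1.0)) == [("page1", 0.5), ("page4", 0.5)]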


# read in a file of page links (format: url1 url2)
linkfile = "pagelinks.txt"
sc = SparkContext(appName="pagerank")
links = sc.textFile(linkfile).map(lambda line: line.split()).map(
    lambda pages: (pages[0], pages[1])).distinct().groupByKey().persist(
    )  # filter out duplicates
# groupByKey => adjacency list: (page3, [page1, page4]), (page4, [page1, page2])

# set initial page ranks to 1.0
ranks = links.map(lambda page_neighbors: (page_neighbors[0], 1.0))

# number of iterations
n = 10
d = 0.85
# for n iterations, calculate new page ranks based on neighbor contributions
for x in range(n):
    contribs = links.join(ranks).flatMap(
        lambda pr: computeContribs(pr[1][0], pr[1][1]))

    # page1, 0.5
from pyspark import SparkConf, SparkContext

# initialization of Spark
# master tells Spark where the application runs (a local machine or a cluster)
sc = SparkContext(master='local[2]')
print(sc)

# Spark version
print(sc.version)
# Python version
print(sc.pythonVer)
# master URL in use
print(sc.master)
print(str(sc.sparkHome))
print(str(sc.sparkUser()))
#
print(sc.appName) # Return application name
print(sc.applicationId) # Retrieve application ID
print(sc.defaultParallelism) # Return default level of parallelism
print(sc.defaultMinPartitions)


config = (SparkConf().
          setMaster("local").
          setAppName("myapp").
          set("spark.executor.memory", "1g"))

# getting the configuration
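# Hedged sketch of what "getting the configuration" typically looks like with
# the SparkConf / SparkContext API (the original snippet is cut off here):
print(config.get("spark.executor.memory"))  # read back a single setting
print(config.toDebugString())               # all settings as one printable string
print(sc.getConf().getAll())                # settings of the running context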