Example #1
def main():

    # pic_plot()   # commented out: pic_plot is not defined in this snippet, and the
    # return       # early return below it would skip the ALS example that follows

    sc = SparkContext()
    data = sc.textFile('/home/z/Documents/python/EE627_HW8/re_u.data')

    pdata = sc.parallelize(data.take(100000))
    ratings = pdata.map(lambda l: l.split(','))\
            .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

    #    pdb.set_trace()

    sc.setCheckpointDir('target')  # need to add this!!!

    rank = 20
    numIter = 30
    model = ALS.train(ratings, rank, numIter)
    testdata = ratings.map(lambda p: (p[0], p[1]))
    predictions = model.predictAll(testdata).map(lambda r:
                                                 ((r[0], r[1]), r[2]))
    ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(
        predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    print("Mean Squared Error = " + str(MSE))
def main():
    args = parseArgs()
    sc = SparkContext(args.master, appName='Alternating least squares')

    if not args.verbose:
        sc.setLogLevel("ERROR")

    sc.setCheckpointDir('checkpoint/')

    folds = readFolds(args.data, args.folds, sc)
    cross_val_rmses = []
    for k in range(len(folds)):
        train, test = createTrainTestData(folds, k, args.N)
        print "Initiating fold %d with %d train samples and %d test samples" % (
            k, train.count(), train.count())

        start = time()
        model = ALS.train(train,
                          args.d,
                          iterations=args.iter,
                          lambda_=args.reg)
        testRMSE = testModel(model, test)
        now = time() - start
        print "Fold: %d\tTime: %f\tTestRMSE: %f" % (k, now, testRMSE)

        cross_val_rmses.append(testRMSE)
        train.unpersist()
        test.unpersist()

    print "%d-fold cross validation error is: %f " % (args.folds,
                                                      np.mean(cross_val_rmses))
Example #3
def spark_context(request):
    """ fixture for creating a spark context
    Args:
    request: pytest.FixtureRequest object
    """
    conf = (SparkConf().setMaster("local[2]").setAppName("SparkTest"))
    sc = SparkContext(conf=conf)
    sc.setCheckpointDir('checkpoint')  # checkpointing guards against StackOverflowError from long lineages
    request.addfinalizer(lambda: sc.stop())

    quiet_py4j()
    return sc
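
# A minimal usage sketch (hypothetical test, not from the original listing), assuming the
# function above is registered as a pytest fixture (e.g. with @pytest.fixture):
def test_parallelize_count(spark_context):
    assert spark_context.parallelize(range(10)).count() == 10
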
def start():
    sc = SparkContext(appName='NetworkWordCount')
    # one time unit is 1
    sc.setCheckpointDir('/tmp/spark')
    ssc = StreamingContext(sc, TIME_UNIT)
    lines = ssc.socketTextStream("10.5.24.137", 9999)
    words = lines.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    # window length: 3*TIME_UNIT, slide interval: 2*TIME_UNIT
    wordCounts = pairs.reduceByKeyAndWindow(lambda x, y: x + y, 3*TIME_UNIT, 2*TIME_UNIT)
    print wordCounts
    wordCounts.pprint()
    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
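
# A minimal driver guard (assumed; the original snippet does not show how start() is invoked):
if __name__ == '__main__':
    start()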
Example #5
def setup_context():
    global SPARK_CONTEXT
    global SQL_CONTEXT

    config = config_pyspark_submit_args()

    SPARK_CONTEXT = SparkContext(conf=config)
    SQL_CONTEXT = SQLContext(SPARK_CONTEXT)

    logging.getLogger('py4j').setLevel(logging.ERROR)

    SPARK_CONTEXT.setLogLevel("ERROR")

    SPARK_CONTEXT.setCheckpointDir(f"{DATA_PATH}/checkpoint/")
def start():
    sc = SparkContext(appName='NetworkWordCount')
    ssc = StreamingContext(sc, 1)
    # a checkpoint directory must be set (updateStateByKey requires checkpointing)
    sc.setCheckpointDir('/tmp')
    lines = ssc.socketTextStream("10.5.24.137", 9999)
    words = lines.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))

    def update_count(new_value, total_value):
        return sum(new_value, total_value or 0)

    # use updateStateByKey for stateful per-key counting (the count for each key keeps accumulating)
    total_count = pairs.updateStateByKey(updateFunc=update_count)
    total_count.pprint()
    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
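
# Equivalent, more explicit form of update_count above (illustrative only, not in the original):
# def update_count(new_values, running_total):
#     return sum(new_values) + (running_total or 0)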
Example #7
def main():
    # Configure Spark
    if not os.path.isdir("checkpoints"):
        os.mkdir("checkpoints")
    conf = SparkConf().setMaster('local').setAppName('connected components')
    sc = SparkContext(conf=conf)
    sqlcontext = SQLContext(sc)
    sc.setCheckpointDir("checkpoints")

    # The directory for the file
    filename = "q1.txt"

    # Get data in proper format
    data = getData(sc, filename)
    edges = get_edges(data, sqlcontext)
    vertices = get_vertices(data, sqlcontext)
    graph = GraphFrame(vertices, edges)
    connected_components(graph=graph)
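
# Note: GraphFrames' default connectedComponents() algorithm relies on RDD checkpointing,
# which is why a checkpoint directory is set above; without one the call fails at runtime.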
Example #8
def run_tree_join(ACCESS_KEY, SECRET_KEY, REDIS_SERVER, REDIS_PORT,
                  CHECKPOINT_REMOTE_DIR):

    sc = SparkContext(appName='TreeJoin')

    sc.setCheckpointDir(CHECKPOINT_REMOTE_DIR)

    ss = SparkSession(sc).builder.getOrCreate()
    # ss.conf.set("spark.sql.shuffle.partitions", 4)

    # file_download_path = 's3a://heyyall/test_folder'
    # file_download_path = 's3a://heyyall/big_test'
    # file_download_path = 'RC_2011-01_my_slice_2'
    file_download_path = 's3a://heyyall/reddit_data/RC_2011-01'

    reddit_schema = StructType([
        StructField('archived', BooleanType()),
        StructField('author', StringType()),
        StructField('author_flair_css_class', StringType()),
        StructField('author_flair_text', StringType()),
        StructField('body', StringType()),
        StructField('controversiality', IntegerType()),
        StructField('created_utc', StringType()),
        StructField('distinguished', StringType()),
        StructField('downs', IntegerType()),
        StructField('edited', StringType()),
        StructField('gilded', IntegerType()),
        StructField('id', StringType()),
        StructField('link_id', StringType()),
        StructField('name', StringType()),
        StructField('parent_id', StringType()),
        StructField('retrieved_on', LongType()),
        StructField('score', IntegerType()),
        StructField('score_hidden', BooleanType()),
        StructField('subreddit', StringType()),
        StructField('subreddit_id', StringType()),
        StructField('ups', IntegerType())
    ])

    clean_data = get_clean_data(ss, file_download_path, reddit_schema)
    joined_links = link_join(clean_data)
    joined_links = joined_links.repartition('match_group')
    write_to_redis(joined_links, REDIS_SERVER, REDIS_PORT)
Example #9
def process(name):
    CLOUDSQL_INSTANCE_IP = ''   #(database server IP)
    CLOUDSQL_DB_NAME = 'recommendation_spark'
    CLOUDSQL_USER = '******'
    CLOUDSQL_PWD  = 'tiger'  # CE
    
    conf = SparkConf().setAppName("train_model")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    USER_ID=name
    jdbcDriver = 'com.mysql.jdbc.Driver'
    jdbcUrl    = 'jdbc:mysql://%s:3306/%s?user=%s&password=%s' % (CLOUDSQL_INSTANCE_IP, CLOUDSQL_DB_NAME, CLOUDSQL_USER, CLOUDSQL_PWD)
    
    # checkpointing helps prevent stack overflow errors
    sc.setCheckpointDir('checkpoint/')
    
    # Read the ratings and accommodations data from Cloud SQL
    dfRates = sqlContext.read.format('jdbc').options(driver=jdbcDriver, url=jdbcUrl, dbtable='Rating', useSSL='false').load()
    dfAccos = sqlContext.read.format('jdbc').options(driver=jdbcDriver, url=jdbcUrl, dbtable='Accommodation', useSSL='false').load()
    print("read ...")
    
    # train the model
    model = ALS.train(dfRates.rdd, 20, 20) # tuning number
    print("trained ...")
    
    # use this model to predict what the user would rate accommodations that she has not rated
    allPredictions = None
    dfUserRatings = dfRates.filter(dfRates.userId == USER_ID).rdd.map(lambda r: r.accoId).collect()
    rddPotential  = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
    pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))
    predictions = model.predictAll(pairsPotential).map(lambda p: (str(p[0]), str(p[1]), float(p[2])))
    predictions = predictions.takeOrdered(5, key=lambda x: -x[2]) # top 5
    print("predicted for user={0}".format(USER_ID))
    if allPredictions is None:
        allPredictions = predictions
    else:
        allPredictions.extend(predictions)
    
    # write them
    schema = StructType([StructField("userId", StringType(), True), StructField("accoId", StringType(), True), StructField("prediction", FloatType(), True)])
    dfToSave = sqlContext.createDataFrame(allPredictions, schema)
    dfToSave.write.jdbc(url=jdbcUrl, table='Recommendation', mode='overwrite')
def main():
    args = parseArgs()
    sc = SparkContext(args.master, appName='Alternating least squares')
    sess = SparkSession(sc)

    if not args.verbose:
        sc.setLogLevel("ERROR")

    sc.setCheckpointDir('checkpoint/')

    folds = readFolds(args.data, args.folds, sc, sess)
    cross_val_rmses = []
    for k in range(len(folds)):
        train, test = createTrainTestData(folds, k, args.N)
        print "Initiating fold %d with %d train samples and %d test samples" % (
            k, train.count(), train.count())

        start = time()
        als = ALS(maxIter=args.iter,
                  regParam=args.reg,
                  userCol="userId",
                  itemCol="itemId",
                  ratingCol="rating",
                  coldStartStrategy='drop')
        model = als.fit(train)
        predictions = model.transform(test)
        predictions.show()
        evaluator = RegressionEvaluator(metricName="rmse",
                                        labelCol="rating",
                                        predictionCol="prediction")
        testRMSE = evaluator.evaluate(predictions)

        now = time() - start
        print "Fold: %d\tTime: %f\tTestRMSE: %f" % (k, now, testRMSE)

        cross_val_rmses.append(testRMSE)
        train.unpersist()
        test.unpersist()

    print "%d-fold cross validation error is: %f " % (args.folds,
                                                      np.mean(cross_val_rmses))
def main():

    sc = SparkContext()
    dataDir = '/home/z/Documents/python/EE627_project/data/data_in_matrixForm/'
    matrix_in_name = dataDir + 'user_track.txt'
    test_name = dataDir + 'testTrack_hierarchy.txt'
    t = strftime('%Y%m%d%H%M', gmtime())
    title = 'mf_track_estimated' + t + '.txt'
    output_file = dataDir + title
    f_out = open(output_file, 'w')

    data = sc.textFile(matrix_in_name)
    ratings = data.map(lambda l: l.split('|'))\
            .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

    sc.setCheckpointDir('target')  # need to add this!!!

    rank = 150
    numIter = 30

    model = ALS.train(ratings, rank, numIter)
    # Save and load model
    model.save(sc, "target/tmp/album_rank150num30")
Example #12

def get_most_liked_courses(n):
    """Get the top n courses with the most likes and ratings"""
    input = {
        'sort_mode': 'interesting',
        'count': n
    }
    return m.Course.search(params=input)[0]


def save_recommendations_to_mongo():
    log.info('Saving recommendations to database...')
    for user in m.User.objects:
        try:
            user.recommended_courses = engine.recommend_user(
                str(user.id),
                _PARAMS['num_courses'])
            user.save()
        except Exception as e:
            log.error(e)

if __name__ == '__main__':
    mongoengine.connect(c.MONGO_DB_RMC)
    sc = SparkContext()
    sc.setCheckpointDir('data/recommendation/checkpoint/')
    engine = RecommendationEngine(sc)
    engine.train()
    engine.load_data()
    save_recommendations_to_mongo()
import sys
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, Rating

def loadMovieNames():
    movieNames = {}
    with open("ml-100k/u.ITEM") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1].decode('ascii', 'ignore')
    return movieNames

conf = SparkConf().setMaster("local[*]").setAppName("MovieRecommendationsALS")
sc = SparkContext(conf = conf)
sc.setCheckpointDir('checkpoint')

print "\nLoading movie names..."
nameDict = loadMovieNames()
# umadeup: data created on top of u.data (3 rows)
data = sc.textFile("C:/Users/seeth_000/UdemySpark/ml-100k/umadeup.data")

ratings = data.map(lambda l: l.split()).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

# Build the recommendation model using Alternating Least Squares
print "\nTraining recommendation model..."
rank = 10
# Lowered numIterations to ensure it works on lower-end systems
numIterations = 6
model = ALS.train(ratings, rank, numIterations)

userID = int(sys.argv[1])
Example #14
from pyspark import SQLContext, SparkContext
from graphframes.examples import Graphs
from graphframes import *
import random
import sys

sc = SparkContext()
sqlContext = SQLContext(sc)
sc.setCheckpointDir("/home/shaanzie/sparkchecks/")

v = sqlContext.createDataFrame([("a", "Alice", 34), ("b", "Bob", 36),
                                ("c", "Charlie", 30), ("d", "David", 29),
                                ("e", "Esther", 32), ("f", "Fanny", 36),
                                ("g", "Gabby", 60)], ["id", "name", "age"])
# Edge DataFrame
e = sqlContext.createDataFrame([("a", "b", "friend"), ("b", "c", "follow"),
                                ("c", "b", "follow"), ("f", "c", "follow"),
                                ("e", "f", "follow"), ("e", "d", "friend"),
                                ("d", "a", "friend"), ("a", "e", "friend")],
                               ["src", "dst", "relationship"])
# Create a GraphFrame
g = GraphFrame(v, e)

result = g.connectedComponents()
result.select("id", "component").orderBy("component").show()
Example #15
SparkContext.setSystemProperty('spark.executor.memory', '2560m')  #2560m
SparkContext.setSystemProperty('spark.executor.cores', '8')

#SparkContext.setSystemProperty('spark.executor.memoryOverhead', '1536m')
SparkContext.setSystemProperty("spark.scheduler.mode", "FAIR")
SparkContext.setSystemProperty('spark.memory.fraction', '0.8')
SparkContext.setSystemProperty('spark.memory.storageFraction', '0.1')
SparkContext.setSystemProperty("spark.default.parallelism", "256")
SparkContext.setSystemProperty("spark.num.executors", "1")
SparkContext.setSystemProperty("spark.local.dir", "/tmp")

conf = SparkConf().setAppName('MoviesRec: Predictions')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

sc.setCheckpointDir('/ML/movies/checkpoint/')

df = sqlContext.read.load(path='/ML/movies/data/*',
                          format='com.databricks.spark.csv',
                          delimiter=',',
                          inferSchema='true',
                          header="true").cache()

df = df.drop('timestamp')
oldColumns = df.schema.names
newColumns = ["userId", "itemId", "rating"]
df = reduce(
    lambda df, idx: df.withColumnRenamed(oldColumns[idx], newColumns[idx]),
    range(len(oldColumns)), df)

df = df.withColumn("userId", df["userId"].cast("string"))
Example #16
import time
import os
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from utils import *

sc = SparkContext('local', 'test')
sc.setLogLevel("ERROR")
sc.setCheckpointDir("/tmp")  # for stable state
ssc = StreamingContext(sc, 0.01)

rddQ = []
for filename in os.listdir("data/split"):
    rddQ.append(sc.textFile("data/split/" + filename))
# rddQ.append(sc.textFile("data/split/aa"))

result = []


def update_result(rdd):
    global result
    result = rdd.top(10)


# processing
dstream = ssc.queueStream(rddQ)
dstream = sclean(dstream)
dstream = scount(dstream)
dstream\
  .map(lambda x: (x[1],x[0]))\
  .foreachRDD(lambda rdd: update_result(rdd))
Example #17
def load_spark_context():
    conf = SparkConf().setMaster("local[*]").setAppName(
        "MovieRecommendationsALS")
    sc = SparkContext(conf=conf)
    sc.setCheckpointDir('checkpoint')
    return sc
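
# A minimal usage sketch (assumed caller, not part of the original snippet):
# sc = load_spark_context()
# ...build and train a model here...
# sc.stop()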
'''
This exercise demonstrates the effect checkpointing can have on an iterative routine.
'''
import sys
from pyspark import SparkConf, SparkContext
sc = SparkContext()
sc.setCheckpointDir("file:///Users/isang-geon/tmp/checkpointdir")
rddofints = sc.parallelize([1,2,3,4,5,6,7,8,9,10])
try:
    # this will build a very long lineage for rddofints
    for i in range(1000):
        rddofints = rddofints.map(lambda x: x+1)
        if i % 10 == 0:
            print("Looped " + str(i) + " times")
            rddofints.checkpoint()
            rddofints.count()
except Exception as e:
    print("Exception: " + str(e))
    print("RDD Debug String: ")
    print(rddofints.toDebugString())
    sys.exit()
print("RDD Debug String: ")
print(rddofints.toDebugString())
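
# Optional check (not in the original exercise): confirm that the last checkpoint took
# effect and truncated the lineage reported by toDebugString().
print("Checkpointed: " + str(rddofints.isCheckpointed()))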
    inters_dst_dst =  DataFrame.intersect(df1.select("dst"), df2.select("dst"))

    inters_dst_dst_list = inters_dst_dst.select("dst").rdd.map(lambda row : row.dst).collect()

    query_list = []
    query_list += split_query(inters_dst_dst_list, 'src', 'or', 'dst')
    query_list += split_query(inters_dst_dst_list, 'dst', 'or', 'dst')

    inters_e = reduce(DataFrame.unionAll, [e.filter(query) for query in query_list])

    inters_v = inters_dst_dst.select(col("dst").alias("id"))
    return inters_v, inters_e


sc = SparkContext()
sc.setCheckpointDir("/user/pnda/checkpoint")
sqlContext = sql.SQLContext(sc)

#load vertices
v = sqlContext.read.parquet("/user/pnda/result/vertex")

#load edges
e = sqlContext.read.parquet("/user/pnda/result/edge")

eg = e.groupBy("src").count().sort("count", ascending=False).head(2)
user_1 = eg[0].src
user_2 = eg[1].src

df1 = get_n_level_connection(user_1, 3, True)
df2 = get_n_level_connection(user_2, 3, True)
Example #20
# In[15]:

import math
import pyspark.sql.functions as psf
from pyspark.sql.types import DoubleType
dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())
s = data.alias("i").join(data.alias("j"), psf.col("i.id") < psf.col("j.id"))\
      .select(
          psf.col("i.id").alias("src"),
          psf.col("j.id").alias("dst"),
          dot_udf("i.norm", "j.norm").alias("relationship"))\
      .sort("src", "dst")

# In[ ]:

#run in the spark shell
v = featurized_data.select("id", "features")
e = s.filter("relationship > 0.8")
from graphframes import *
g = GraphFrame(v, e)
g.vertices.show()
g.edges.show()
results = g.pageRank(resetProbability=0.15, maxIter=10)  #pagerank
results.vertices.select("id", "pagerank").show()
results.edges.select("src", "dst", "weight").show()
results = g.triangleCount()  #triangelCount
results.select("id", "count")
sc.setCheckpointDir("_checkpoint")
results = g.connectedComponents()  #connectedComponents
results.show()
Example #21
    location = "hdfs"
    try:
        if "avatar" in platform.node():
            location = "local"
    except:
        pass
    try:
        if "avatar" in socket.gethostname():
            location = "local"
    except:
        pass
    print "### location %s" % location

    sc = SparkContext(appName="CRF")
    sc.setCheckpointDir("/tmp")
    year = 2015
    mode = sys.argv[1]
    tag = sys.argv[2]
    month = int(sys.argv[3])
    day = int(sys.argv[4])
    hour = int(sys.argv[5])
    partNum = None
    try:
        partNum = int(sys.argv[6])
    except:
        pass
    limit = None
    try:
        limit = int(sys.argv[7])
    except:
from pyspark.sql.types import *
import pendulum
import pandas
import requests

conf = SparkConf().setAppName("PySparkStreaming") \
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")\
        .set("spark.default.parallelism", 6) \
        .set("spark.speculation", "true") \
        .set("spark.speculation.interval", "1s") \
        .set("spark.streaming.kafka.maxRatePerPartition", 300) \
        .set("spark.sql.autoBroadcastJoinThreshold", -1)

sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")
sc.setCheckpointDir("/tmp/spark-streaming")

spark = SparkSession \
        .builder \
        .appName("sparkSQL_car_gps") \
        .getOrCreate()
spark.sql("SET spark.sql.shuffle.partitions=6")

grid_schema = StructType([
    StructField("grid", IntegerType(), False),
    StructField("block", IntegerType(), False)
])


def grid_block(lng_x, lat_y):
    if not (121.34 <= lng_x < 121.68 and 24.92 <= lat_y < 25.2):
Example #23
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")
sys.path.append(local_path)

from pyspark import SQLContext, SparkConf, HiveContext
from pyspark import SparkContext

from ml import diff_feature_cls, diff_train_cls_pos


def run(sc, sql_context, is_hive):
    diff_feature_cls.main(sc, sql_context, is_hive=True)
    diff_train_cls_pos.main(sc, sql_context, is_hive=True)


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "32g")

    sc = SparkContext(appName="bintrade_candidate",
                      master="yarn-client",
                      conf=conf)
    sc.setCheckpointDir("checkpoint/")
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "32")

    sqlContext.sql("use fex")

    run(sc, sqlContext, is_hive=True)
Example #24
    testRDD = sc.textFile(testFile)

    header = inputRDD.first()  #extract header
    inputRDD = inputRDD.filter(lambda row: row != header)

    header2 = testRDD.first()  #extract header
    testRDD = testRDD.filter(lambda row: row != header2)

    inputRDD = inputRDD.map(lambda line: line.split(',')).map(
        lambda x: ((int(x[0]), int(x[1])), float(x[2])))
    testRDD = testRDD.map(lambda line: line.split(',')).map(
        lambda x: ((int(x[0]), int(x[1])), 1))

    input1 = inputRDD.subtractByKey(testRDD)
    input = input1.map(lambda x: Rating(x[0][0], x[0][1], x[1]))
    sc.setCheckpointDir('/tmp')

    rank = 8
    numIterations = 10
    lmbda = 0.1
    numBlocks = 16
    nonnegative = True
    model = ALS.train(input,
                      rank,
                      numIterations,
                      lmbda,
                      nonnegative=True,
                      seed=42)

    testRDD = testRDD.map(lambda x: (x[0][0], x[0][1])).distinct()
    predictions = model.predictAll(testRDD).map(
def computeEntityTSData():

    ##### Spark Functions
    # input: json_line
    # [(entity: {date, count})]
    def deriveEntityToDate(json_line):
        entity_dates = []

        json_obj = json.loads(json_line)
        created_string = float(json_obj['created_utc'])
        created_date = datetime.fromtimestamp(created_string).date()
        annotations = json_obj['entity_texts']
        for annotation in annotations:
            date_count = {}
            date_count[created_date] = 1
            entity_dates.append((annotation, date_count))
        return entity_dates

    def combineDateCounts(date_counts1, date_counts2):
        date_counts = date_counts1
        for date in date_counts2:
            if date in date_counts:
                date_counts[date] += date_counts2[date]
            else:
                date_counts[date] = date_counts2[date]
        return date_counts


    ###### Execution code
    conf = SparkConf().setAppName("NER Diffusion - Exploratory Plots")
    conf.set("spark.python.worker.memory","10g")
    conf.set("spark.driver.memory","15g")
    conf.set("spark.executor.memory","10g")
    conf.set("spark.default.parallelism", "12")
    conf.set("spark.mesos.coarse", "true")
    conf.set("spark.driver.maxResultSize", "10g")
    # Added the core limit to avoid resource allocation overruns
    conf.set("spark.cores.max", "5")
    conf.setMaster("mesos://zk://scc-culture-slave9.lancs.ac.uk:2181/mesos")
    conf.set("spark.executor.uri", "hdfs://scc-culture-mind.lancs.ac.uk/lib/spark-1.3.0-bin-hadoop2.4.tgz")
    conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory")

    sc = SparkContext(conf=conf)
    sc.setCheckpointDir("hdfs://scc-culture-mind.lancs.ac.uk/data/checkpointing")

    # use sample directory for testing
    # distFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/user/derczynskil/RC_2015-01")
    distFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated")
    # Point to local file until data has finished uploading to HDFS
    # distFile = sc.textFile("/home/derczynskil/annotated/")
    distFile.cache()

    # Step 1: Derive the time-sensitive map of when entities appeared
    print("----Loading entity time-series")
    entity_citation_dates = distFile\
        .flatMap(deriveEntityToDate)\
        .reduceByKey(combineDateCounts)
    entity_citation_dates.cache()
    # print(entity_citation_dates.collect())

    print("----Deriving the count of entity citations")
    entity_citation_counts = entity_citation_dates\
        .map(lambda x: (x[0], len(x[1])))\
        .map(lambda x: (x[1], x[0]))\
        .sortByKey(False)\
        .map(lambda x: (x[1], x[0]))\
        .collect()

    # Write to local disk
    print("------Writing the output to a file")
    outputString = ""
    for (entity, count) in entity_citation_counts:
        outputString += str(entity.encode('utf-8')).replace("'", "") + "\t" + str(count) + "\n"
    # print(outputString)
    outputFile = open("data/entity_mention_frequencies.csv", "w")
    outputFile.write(outputString)
    outputFile.close()

    # Write the time-series output to local disk
    print("------Writing the ts output to a file")
    outputString = ""
    for (entity, date_to_count) in entity_citation_dates.collect():
        outputString += str(entity.encode('utf-8')).replace("'", "")
        for date in date_to_count:
            outputString += "\t" + str(date) + "|" + str(date_to_count[date])
        outputString += "\n"
    # print(outputString)
    outputFile = open("data/entity_mention_ts.csv", "w")
    outputFile.write(outputString)
    outputFile.close()

    # stop the Spark context from running
    sc.stop()
Example #26
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from utils import *
from pyspark import SparkContext, SparkConf

file_path = "./data.csv"
checkpoint_dir = "./CheckpointDir/"
conf = SparkConf().setAppName("Car Price Prediction").setMaster("local[*]")
sc = SparkContext(conf=conf)
print(sc.getConf().getAll())
sc.setCheckpointDir(checkpoint_dir)
spark = SQLContext(sc)

data = spark.read.csv(path=file_path,
                      header=True,
                      quote='"',
                      sep=",",
                      inferSchema=True)
data_test, data_train = data.randomSplit(weights=[0.3, 0.7], seed=10)

get_indexer_input = get_indexer_input(data)


def model_training(data_train, indexer_input):
    x_cols = list(
        set(data_train.columns) - set(indexer_input.keys() + ["Price"]))
    str_ind_cols = ['indexed_' + column for column in indexer_input.keys()]
    indexers = indexer_input.values()
    pipeline_tr = Pipeline(stages=indexers)
    data_tr = pipeline_tr.fit(data_train).transform(data_train)
def computeGlobalCascadeIsomorphicDistribution():
    ###### Spark Tranformation Functions
    # Returns: [(reply_id, orig_post_id)] - for flatMap - to avoid issue of 0 cardinality of the list
    def deriveReplyMap(json_line):
        reply_tuples = []
        json_obj = json.loads(json_line)
        if 'parent_id' in json_obj:
            orig_post_id = json_obj['parent_id']
            reply_post_id = json_obj['name']
            reply_tuples.append((reply_post_id, orig_post_id))
        return reply_tuples

    def combineReplies(replies1, replies2):
        replies = replies1 + replies2
        return replies

    def deriveEntityToPosts(json_line):
        # Get the broadcast maps that need to be used
        # orig_replies_map = orig_replies_map_broadcast.value
        reply_orig_map = reply_orig_map_broadcast.value
        orig_replies_map = orig_replies_map_broadcast.value

        entity_posts = []
        json_obj = json.loads(json_line)
        post_id = json_obj['name']
        entities = json_obj['entity_texts']

        # get the replies to this post
        for entity in entities:
            try:
                # ensure that the post appears in a chain - in order to filter out singleton citations
                if post_id in orig_replies_map or post_id in reply_orig_map:
                # if len(orig_replies_map.lookup(post_id)) > 0 \
                #         or len(reply_orig_map.lookup(post_id)) > 0:    # New code to use rdd lookup to save RDD partitioning
                    entity_posts.append((str(entity), [str(post_id)]))
            except:
                pass
        return entity_posts


    ###### Graph Isomorphism Functions

    # Input: (entity,[chains]) - tuple
    # output: (entity, [connected_chain]) - tuple
    def induce_type1_cascades(tuple):
        entity = tuple[0]
        chains = tuple[1]
        # entity_posts = tuple[1]['posts']

        # Get the entity posts
        entity_citation_posts = entity_posts_map_broadcast.value
        entity_posts = entity_citation_posts[entity]

        # log the connected chains of the entity
        connected_chains = []
        for chain in chains:
            # new chain formed by filtering the chain with only entity citing posts
            # Filter the chain down to include only posts citing the entity
            new_chain = set()
            for chain_edge in chain:
                source = chain_edge.split("->")[0]
                target = chain_edge.split("->")[1]
                if source in entity_posts and target in entity_posts:
                    new_chain.add(chain_edge)

            # too inefficient, better to convert to matrix form and then run this the graph
            # 1. Induce maps between node label and ids
            node_to_index = {}
            index_to_node = {}
            index = -1
            for chain_edge in new_chain:
                source = chain_edge.split("->")[0]
                target = chain_edge.split("->")[1]

                if source not in node_to_index:
                    index += 1
                    node_to_index[source] = index
                    index_to_node[index] = source

                if target not in node_to_index:
                    index += 1
                    node_to_index[target] = index
                    index_to_node[index] = target


            # 2. Populate the nd matrix
            dim = len(node_to_index)
            if dim > 1:
                M = np.zeros(shape=(dim, dim))
                for chain_edge in new_chain:
                    source = chain_edge.split("->")[0]
                    source_index = node_to_index[source]
                    target = chain_edge.split("->")[1]
                    target_index = node_to_index[target]
                    M[source_index, target_index] = 1

                # 3. Induce the connected components from the matrix
                # print(str(dim))
                # print(str(M))
                Msp = csgraph.csgraph_from_dense(M, null_value=0)
                n_components, labels = csgraph.connected_components(Msp, directed=True)
                # print("Number of connected components = " + str(n_components))
                # print("Components labels = " + str(labels))
                # get the components and their chains
                for i in range(0, n_components):
                    # print("Component: " + str(i))
                    component_chain = []
                    # get the nodes in that component
                    # print(labels)
                    c_nodes = [j for j in range(len(labels)) if labels.item(j) == i]
                    # print(c_nodes)
                    # Only log the component if more than two nodes are in in
                    if len(c_nodes) > 1:
                        # build the canonical edges
                        for source_id in c_nodes:
                            for target_id in c_nodes:
                                if int(M[(source_id, target_id)]) == 1:
                                    component_chain.append(str(source_id) + "->" + str(target_id))
                        if len(component_chain) > 0:
                            connected_chains.append(component_chain)

        canonical_chains = connected_chains
        print("Canonical Chains:")
        print(canonical_chains)


        # return back to the function the mapping between the entity and the connected chains
        return (entity, canonical_chains)

    # Input: (entity, [post])
    # Output: [(entity, {cascades: [cascade]})]
    def computePerEntityCascadesAndPosts(tuple):
        entity = tuple[0]
        posts = tuple[1]

        # Get the broadcast maps that need to be used
        orig_replies_map = orig_replies_map_broadcast.value
        reply_orig_map = reply_orig_map_broadcast.value

        # Get the cascade graphs for each entity
        entity_chains = []
        chain_posts = set()

        # get the chain that each post was in
        for post in posts:
            # Do this to speed up computation by ensuring that we haven't already recorded the post in another chain
            if post not in chain_posts:
                chain_posts.add(post)
                chain = []
                to_process = []

                # Starting from the seed post in a possible chain
                # get the replies to the post - down the chain
                if post in orig_replies_map:
                    replies = orig_replies_map[post]
                    to_process += replies
                    for reply in replies:
                        chain.append(reply + "->" + post)
                # Get the post that the post replied to
                if post in reply_orig_map:
                    orig_post = reply_orig_map[post]
                    to_process.append(orig_post)
                    chain.append(post + "->" + orig_post)

                # Go through each post that is to be processed
                while len(to_process) > 0:
                    to_process_post = to_process.pop()
                    # log that the post has been processed in an already found chain
                    chain_posts.add(to_process_post)
                    # get the replies to this post
                    if to_process_post in orig_replies_map:
                        replies = orig_replies_map[to_process_post]
                        to_process += replies
                        for reply in replies:
                            chain.append(reply + "->" + to_process_post)
                    # get the post that this post relied to
                    if to_process_post in reply_orig_map:
                        orig_post = reply_orig_map[to_process_post]
                        to_process.append(orig_post)
                        chain.append(to_process_post + "->" + orig_post)

                # log the chain for the entity
                entity_chains.append(chain)

        # Return the entity chains
        return (entity, entity_chains)

    ###### Execution code
    conf = SparkConf().setAppName("NER Diffusion - Cascade Pattern Mining")
    conf.set("spark.python.worker.memory","10g")
    conf.set("spark.driver.memory","15g")
    conf.set("spark.executor.memory","10g")
    conf.set("spark.default.parallelism", "12")
    conf.set("spark.mesos.coarse", "true")
    conf.set("spark.driver.maxResultSize", "10g")
    conf.set("spark.cores.max", "15")
    conf.setMaster("mesos://zk://scc-culture-slave9.lancs.ac.uk:2181/mesos")
    conf.set("spark.executor.uri", "hdfs://scc-culture-mind.lancs.ac.uk/lib/spark-1.3.0-bin-hadoop2.4.tgz")
    conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory")

    sc = SparkContext(conf=conf)
    sc.setCheckpointDir("hdfs://scc-culture-mind.lancs.ac.uk/data/checkpointing")

    # use sample directory for testing
    annotationFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated-sample")
    # annotationFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated")
    annotationFile.cache()
    # thinnedFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/thinned-json")
    thinnedFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/thinned-json-sample")
    thinnedFile.cache()

    # Load the reply graphs from the thinnedFile
    print("Loading replies map")
    reply_map_rdd = thinnedFile\
        .flatMap(deriveReplyMap)

    # Collect as a map and broadcast this to the cluster
    reply_orig_map = reply_map_rdd\
        .collectAsMap()
    print("Reply Orig Map Size = " + str(len(reply_orig_map)))
    # print(reply_orig_map)
    reply_orig_map_broadcast = sc.broadcast(reply_orig_map)

    # # get the: {orig, [reply]} dictionary
    orig_replies_map = reply_map_rdd\
        .map(lambda x: (x[1], [x[0]]))\
        .reduceByKey(combineReplies)\
        .collectAsMap()
    print("Orig Replies Map Size = " + str(len(orig_replies_map)))
    orig_replies_map_broadcast = sc.broadcast(orig_replies_map)


    # Load the entity to post map
    # input: json_line of annotations of each post
    # output: [(entity, [post])] of posts where entities appeared
    print("Loading entity to posts rdd")
    entity_posts_rdd = annotationFile\
        .flatMap(deriveEntityToPosts)\
        .reduceByKey(lambda p1, p2: p1 + p2)
    entity_posts_rdd_map = entity_posts_rdd.collectAsMap()
    print("Entity to posts map size = " + str(len(entity_posts_rdd_map)))
    entity_posts_map_broadcast = sc.broadcast(entity_posts_rdd_map)
    # print("Entity to posts Map Size = " + str(entity_posts_rdd.count()))
    # entity_posts_rdd_sample = sc.parallelize(entity_posts_rdd.take(100))
    # # print(entity_posts_rdd_map)
    #
    print("Computing entity cascades")
    # Output: (entity, {"cascades": entity_chains, "posts": posts})
    entity_cascades_rdd = entity_posts_rdd\
        .map(computePerEntityCascadesAndPosts)
    # #     # .reduceByKey(lambda chain1, chain2: chain1 + chain2)
    entity_cascades_rdd_map = entity_cascades_rdd.collect()
    print("Entity cascades Map Size = " + str(len(entity_cascades_rdd_map)))
    #
    print("Inducing distribution of cascade shapes - Global")
    # Induce global distributiion
    # Returns: [(entity, [chain])]
    canonical_cascade_patterns = entity_cascades_rdd\
        .map(induce_type1_cascades)
    # # canonical_cascade_patterns_distribution_map = canonical_cascade_patterns\
    # #     .take(10)
    # # print("Entity connected cacades Map Size = " + str(len(canonical_cascade_patterns_distribution_map)))
    # # print(canonical_cascade_patterns_distribution_map)

    print("Top Patterns:")
    canonical_cascade_patterns_distribution = canonical_cascade_patterns\
        .flatMap(lambda x: x[1] if len(x) == 2 else ["null"])\
        .map(lambda x: (str(x), 1))\
        .filter(lambda x: "null" not in x[0])\
        .reduceByKey(lambda count1, count2: count1 + count2)\
        .map(lambda x: (x[1], x[0]))\
        .sortByKey(False)\
        .map(lambda x: (x[1], x[0]))
    top_patterns = canonical_cascade_patterns_distribution.collect()
    # print(top_patterns)
    #
    # Write the patterns to local disk for local isomorphism computation
    outputString = ""
    for (pattern, freq) in top_patterns:
        outputString += str(pattern) + "\t" + str(freq) + "\n"
    outputFile = open("data/cascade_shapes.tsv", "w")
    outputFile.write(outputString)
    outputFile.close()

    # stop the Spark context from running
    sc.stop()
Example #28
import sys
import json
from operator import add
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark import SQLContext, SparkContext, SparkConf
reload(sys)
sys.setdefaultencoding('utf-8')

conf = SparkConf().setAppName("chencheng's task").setMaster("spark://anti-spam-spark-001.yz.momo.com:8081,anti-spam-spark-002.yz.momo.com:8081")
sc = SparkContext(conf=conf)
sc.setCheckpointDir("hdfs://antispam/user/hadoop/output/chencheng/checkpoint")

user_artist_data1 = sc.textFile("hdfs://antispam/user/hadoop/output/chencheng/crux/data/female/2016031[0-9]18/")
user_artist_data2 = sc.textFile("hdfs://antispam/user/hadoop/output/chencheng/crux/data/female/2016032[0-3]18/")
#user_artist_data2 = sc.textFile("hdfs://antispam/user/hadoop/output/chencheng/crux/data/male/2016032[0-1]18/")

user_artist_data= user_artist_data1.union(user_artist_data2)

ratings = user_artist_data.map(lambda x: json.loads(x))\
        .filter(lambda x: x[0][0] and x[0][1])\
        .map(lambda x: Rating(int(x[0][0]), int(x[0][1]), float(x[1])))
ratings.checkpoint()
ratings.cache()

rank = 30
numIterations = 25

ALS.checkpointInterval = 2
model = ALS.train(ratings, rank, numIterations,lambda_=0.03,nonnegative=True)

model.save(sc,"hdfs://antispam/user/hadoop/output/chencheng/model/als_female_parameters/30/als_female_0310-23_003_")
    const=True,
    default=False,
    help=
    'Include extra (non-restaurant businesses) rows for training the model.')

parser.add_argument('--withTrainTestSplit',
                    required=False,
                    type=str2bool,
                    nargs='?',
                    const=True,
                    default=False)
args = vars(parser.parse_args())

conf = SparkConf().setAppName("eda").setMaster("local[*]")
sc = SparkContext(conf=conf)
sc.setCheckpointDir("checkpoint_dir/")

outFile_name = 'businesses'

start = timer()

business_json_df = (
    pd.read_json('yelp_dataset/yelp_academic_dataset_business.json',
                 lines=True)
    # Set the index to business_id; this will be used for joining later.
    .set_index('business_id').dropna(subset=['attributes', 'categories']))
# Filter out non-restaurant businesses.
mask = (business_json_df['categories'].str.contains(r"Food|Restaurant|Bar",
                                                    case=False)
        | business_json_df['name'].str.contains("Restaurant|Cuisine",
                                                case=False))
Example #30
        if error < min_error:
            min_error = error
            best_rank = rank
            best_lambda = lambda_i

print 'The best model was trained with rank %s, lambda %f' % (best_rank, best_lambda)

# Test 
model = ALS.train(training_RDD, best_rank, seed=seed, iterations=iterations,
                      lambda_=best_lambda)
predictions = model.predictAll(test_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = test_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())

print 'For testing data the RMSE is %s' % (error)
'''
# Using the complete dataset to build the final model; re-do the above
# Load the complete dataset file
complete_ratings_file = os.path.join('./datasets', 'ml-latest', 'ratings.csv')
complete_ratings_raw_data = sc.textFile(complete_ratings_file)
complete_ratings_raw_data_header = complete_ratings_raw_data.take(1)[0]
# Parse
complete_ratings_data = complete_ratings_raw_data.filter(lambda line: line!=complete_ratings_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (int(tokens[0]),int(tokens[1]),float(tokens[2]))).cache()
print "There are %s recommendations in the complete dataset" % (complete_ratings_data.count())

# to avoid stackover flow
sc.setCheckpointDir('checkpoint/')

training_RDD, test_RDD = complete_ratings_data.randomSplit([7, 3], seed=0L)
complete_model = ALS.train(training_RDD, best_rank, seed=seed, 
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.storagelevel import StorageLevel

import sys
reload(sys)
sys.setdefaultencoding('utf8')

print("------ CONFIG -------")

# CONF CLUSTER
conf=SparkConf()\
.set('spark.network.timeout','5000000s')\
.set('spark.executor.heartbeatInterval','4500000s')

sc = SparkContext(conf=conf)
sc.setCheckpointDir("checkpointdir2")
sqlContext = SQLContext(sc)

print("------ reading data -------")
data = sqlContext.read.parquet("data.parquet")

print(sc.uiWebUrl)
print("------ train test split --------")
(trainingData, testData) = data.randomSplit([0.7, 0.3])

print("--------- Indexing categorical values for OHE------------")

codeIndexer = StringIndexer(inputCol='code',
                            outputCol='code_index').setHandleInvalid("keep")
lieuIndexer = StringIndexer(inputCol='lieu',
                            outputCol='lieu_index').setHandleInvalid("keep")
import findspark  # this needs to be the first import
findspark.init()

import networkx as nx
from pyspark import SparkConf
from pyspark import SparkContext
from snpp.cores.lowrank import partition_graph


conf = (SparkConf().setMaster("local[2]").setAppName("SparkTest"))
sc = SparkContext(conf=conf)
sc.setCheckpointDir('checkpoint')  # checkpointing guards against StackOverflowError from long lineages

train_graph_path = 'data/{}/train_graph.pkl'.format('slashdot')
g = nx.read_gpickle(train_graph_path)
partition_graph(g, k=40, sc=sc,
                lambda_=0.1,
                iterations=20,
                seed=123456)

sc.stop()
    ##### Main Execution Code
    conf = SparkConf().setAppName("Subreddit extraction")
    conf.set("spark.python.worker.memory","10g")
    conf.set("spark.driver.memory","15g")
    conf.set("spark.executor.memory","10g")
    conf.set("spark.default.parallelism", "12")
    conf.set("spark.mesos.coarse", "true")
    conf.set("spark.driver.maxResultSize", "10g")
    # Added the core limit to avoid resource allocation overruns
    conf.set("spark.cores.max", "10")
    conf.setMaster("mesos://zk://scc-culture-slave4.lancs.ac.uk:2181/mesos")
    conf.set("spark.executor.uri", "hdfs://scc-culture-mind.lancs.ac.uk/lib/spark-1.3.0-bin-hadoop2.4.tgz")
    conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory")

    sc = SparkContext(conf=conf)
    sc.setCheckpointDir("hdfs://scc-culture-mind.lancs.ac.uk/data/checkpointing")

    # get the HDFS url of the dataset
    dataset = "reddit"
    hdfsUrl = inlocation

    # broadcast the name of the dataset to the cluster
    print("----Broadcasting the name of the dataset being processed")
    datasetName = sc.broadcast(dataset)

    # run a map-reduce job to first compile the RDD for the dataset loaded from the file
    print("-----Dataset file: " + hdfsUrl)
    rawPostsFile = sc.textFile(hdfsUrl, minPartitions=12)

    # clean the posts and write them into HDFS from their respective paritions
    print("Writing to HDFS")
Example #34
            structfields.append(
                StructField(column_names[i], StringType(), True))
        elif column_types[i] == "date":
            structfields.append(StructField(column_names[i], DateType(), True))
        else:
            raise ValueError("column type undefined: " + column_types[i])

    schema = StructType(structfields)

    print("...SCHEMA OF " + name + " IS " + str(schema))

    # Read in files
    location_file = open(location_file_name, "r")
    locations = location_file.read().splitlines()

    sc.setCheckpointDir(locations[2] + "/" + name)
    rdd = sc.textFile(csv_file, use_unicode=False)
    rdd.checkpoint()
    rdd2 = rdd.map(lambda line: parse(column_types, line))
    df = sqlContext.createDataFrame(rdd2, schema)

    # Convert to Parquet
    start = time.time()
    df.write.parquet(locations[3] + name)
    end = time.time()

    # Record time and before/after file sizes
    output_file_name = './output_' + compression_scheme + '_' + dict_encoding + '/' + name + '_' + compression_scheme + '_' + dict_encoding + '.txt'
    if os.path.exists(output_file_name):
        append_write = 'a'  # append if already exists
    else:
Example #35
def computeRmse(model, evalSet):
    evalSetUserProduct = evalSet.map(lambda x: (x[0], x[1]))
    predictions = model.predictAll(evalSetUserProduct).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = evalSet.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    validationRmse = math.sqrt(ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    return validationRmse
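
# A minimal usage sketch (hypothetical data split, not from the original listing):
# trainSet, evalSet = ratings.randomSplit([0.8, 0.2], seed=42)
# model = ALS.train(trainSet, rank=10, iterations=10)
# print(computeRmse(model, evalSet))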


if __name__ == "__main__":
    conf = SparkConf() \
      .setAppName("YelpReviewALS") \
      .set("spark.executor.memory", "1g")
    sc = SparkContext('local', conf=conf)

    reviewRDD = sc.textFile("../../../data/review_large.txt")
    sc.setCheckpointDir("checkpoints/")

    if len(sys.argv) < 2:
        printUsage()
        exit(1)

    if sys.argv[1] == '-e':
        evalModel = True
    else:
        evalModel = False

    if sys.argv[1] == '-c':
        runValidation = True
    else:
        runValidation = False
Example #36
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 12 14:12:44 2017

@author: Administrator
"""
from pyspark import SparkContext, SparkConf
from pyspark.mllib.recommendation import ALS
from math import sqrt
from operator import add

conf = SparkConf().setAppName("MovieRecommendation").set(
    "spark.executor.memory", "4g")
sc = SparkContext(conf=conf)
# set a checkpoint dir so that a large number of iterations does not error out;
# I don't know the exact reason, it took a long search to find this solution
sc.setCheckpointDir("D:/WorkSpace/Spyder/checkpoint")


# parse the user rating data
def HandleRating(line):
    user = line.strip().split(",")
    return int(user[3]) % 10, (int(user[0]), int(user[1]), float(user[2]))


# parse the movie data, returning [(movie id, movie title)]
def HandleMovie(line):
    movie = line.strip().split(",")
    return int(movie[0]), movie[1]


# parse the movie data, returning [(movie id, (genres))]; mainly to solve the cold-start
# problem of recommending movies by genre
Example #37
def train():
    conf = SparkConf() \
        .setAppName("project") \
        .setMaster("local[*]") \
        .set("spark.driver.memory","4g")
    sc = SparkContext(conf=conf)

    # check model dir
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    # rename
    raw_data = sc.textFile(train_file).map(json.loads).persist(
        StorageLevel.MEMORY_AND_DISK)
    u_table1 = raw_data.map(lambda x: x['user_id']).distinct().collect()
    u_set1 = set(u_table1)
    b_table1 = raw_data.map(lambda x: x['business_id']).distinct().collect()
    b_set1 = set(b_table1)

    user_avg = support.getAvg(user_avg_file)
    business_avg = support.getAvg(business_avg_file)
    u_set2 = set(user_avg.keys())
    b_set2 = set(business_avg.keys())

    b_table3 = sc.textFile(business_json).map(
        json.loads).map(lambda x: x['business_id']).collect()
    b_set3 = set(b_table3)

    u_table = list(u_set1.union(u_set2))
    b_table = list(b_set1.union(b_set2).union(b_set3))
    u_d = {u_table[i]: i for i in range(len(u_table))}
    b_d = {b_table[i]: i for i in range(len(b_table))}

    # augmentation
    business_avg = support.getAvg(business_avg_file)
    n_b_avg = {b_d[k]: business_avg[k] for k in business_avg}

    # get stopwords
    stopwords = sc.textFile(stopwords_file).collect()

    b_profile = sc.textFile(business_json) \
        .map(json.loads) \
        .map(lambda x: (x['business_id'], x['categories'])) \
        .map(lambda x: (b_d[x[0]], x[1])) \
        .mapValues(lambda v: processCategories(v, stopwords)) \
        .collectAsMap()
    b_list = list(sorted(b_profile.keys()))
    b_length = len(b_profile)
    jaccard_sim = sc.parallelize(b_list) \
        .flatMap(lambda x: getJS(x, b_profile, b_list)) \
        .reduceByKey(lambda x, y: x + y) \
        .mapValues(lambda vs: {k: v for k, v in vs}) \
        .collect()

    agm_data = raw_data.map(lambda r: (r['user_id'], r['business_id'], r['stars'])) \
        .map(lambda x: (u_d[x[0]], b_d[x[1]], x[2])) \
        .map(lambda x: (x[0], [(x[1], x[2])])) \
        .reduceByKey(lambda x, y: x + y) \
        .mapValues(lambda vs: processValues(vs, jaccard_sim, n_b_avg)) \
        .flatMap(lambda x: [(x[0], b, star) for b, star in x[1]]) \
        .persist(StorageLevel.MEMORY_AND_DISK)

    # als
    agm_train = agm_data.map(lambda x:
                             ((u_table[x[0]], b_table[x[1]]), x[2])).collect()
    support.writeDownRenameTable(agm_train, agm_train_file)

    lonely_user = agm_data.map(lambda x: (x[0], 1)) \
        .reduceByKey(lambda x, y: x + y) \
        .filter(lambda x: x[1] < LONELY_USER_THRESHOLD) \
        .map(lambda x: x[0]) \
        .collect()
    lonely_business = agm_data.map(lambda x: (x[1], 1)) \
        .reduceByKey(lambda x, y: x + y) \
        .filter(lambda x: x[1] < LONELY_BUSINESS_THRESHOLD) \
        .map(lambda x: x[0]) \
        .collect()

    stars_data = agm_data.filter(lambda x: x[0] not in lonely_user and x[1] not in lonely_business) \
        .map(lambda x: Rating(x[0], x[1], x[2])).persist(StorageLevel.MEMORY_AND_DISK)
    sc.setCheckpointDir(checkpoint_file)
    ALS.checkpointInterval = 2
    modelRDD = ALS.train(ratings=stars_data,
                         rank=1,
                         iterations=70,
                         lambda_=0.01,
                         nonnegative=True)
    saveAlsModel(modelRDD, u_table, b_table, als_model_file)
def computeExposureGraphForTop500Entities():

    ###### Spark Tranformation Functions
    # Returns: [(reply_id, orig_post_id)] - for flatMap - to avoid issue of 0 cardinality of the list
    def deriveReplyMap(json_line):
        reply_tuples = []
        json_obj = json.loads(json_line)
        if 'parent_id' in json_obj:
            orig_post_id = str(json_obj['parent_id'].encode("utf-8"))
            reply_post_id = str(json_obj['name'].encode("utf-8"))
            reply_tuples.append((reply_post_id, orig_post_id))
        return reply_tuples

    def combineReplies(replies1, replies2):
        replies = replies1 + replies2
        return replies

    def derivePostDetails(json_line):
        # get the entity posts
        # entity_posts_map = entity_posts_map_broadcast.value
        # entity_posts = []
        # for entity in entity_posts_map:
        #     entity_posts += entity_posts_map[entity]
        # entity_posts_set = set(entity_posts)
        # for entity in entity_posts_map:
        #     entity_posts_map[entity]
        # get the set of posts that cite the top-500 entities
        # entity_posts_set = [entity_posts_map[entity] for entity in entity_posts_map]

        # need to log: postid, userid, time of post
        json_obj = json.loads(json_line)
        post_id = str(json_obj['name'].encode("utf-8"))
        # if post_id in entity_posts_set:
        created_string = float(json_obj['created_utc'])
        created_date = datetime.fromtimestamp(created_string).date()
        user_id = str(json_obj['author'].encode("utf-8"))

        post_dict = {'user_id': user_id,
                     'post_id': post_id,
                     'created_date': created_date}
        return (post_id, post_dict)
        # else:
        #     return ("null", {})

    def deriveEntityToPosts(json_line):
        # Get the broadcast maps that need to be used
        orig_replies_map = orig_replies_map_broadcast.value
        reply_orig_map = reply_orig_map_broadcast.value
        top_500_entities = set(top_500_entities_broadcast.value)

        entity_posts = []
        json_obj = json.loads(json_line)
        post_id = json_obj['name']
        entities = json_obj['entity_texts']
        for entity in entities:
            # ensure that the entities is one that we want to process
            if entity in top_500_entities:
                try:
                    # ensure that the post appears in a chain - in order to filter out singleton citations
                    if post_id in orig_replies_map or post_id in reply_orig_map:
                        entity_posts.append((str(entity), [str(post_id)]))
                except:
                    pass
        return entity_posts


    # input: tuple x where x[0] = entity name, x[1] = entity posts
    def derive_per_entity_exposure_distribution(x):
        entity = x[0]
        posts = x[1]

        # get the posts for this entity from the broadcast variable
        # entity_posts_list = entity_posts_map_broadcast.value[entity]
        entity_posts_list = posts
        post_details_map = post_details_broadcast.value

        # time order the posts of the entity
        print("Generating time-ordered posts and users citing the entity")
        date_to_posts = {}
        post_users = []
        for post in entity_posts_list:
            # check that we have details for the post and that it was made by an existing user
            if post in post_details_map and '[deleted]' not in post_details_map[post]['user_id']:
                post_user = post_details_map[post]['user_id']
                post_users.append(post_user)
                print(post_user)
                post_date = post_details_map[post]['created_date']
                # get the timestamp
                if post_date in date_to_posts:
                    date_to_posts[post_date] += [post]
                else:
                    date_to_posts[post_date] = [post]

        # get the user specific replies
        entity_users = set(post_users)
        print("Collecting the user specific time series interaction graph")
        reply_orig_map = reply_orig_map_broadcast.value
        # Get the: {user, {date, interacted_user}} map
        user_to_interaction_dates = {}
        for reply_post_id in reply_orig_map:
            # Check that we have post details for both the reply and the original post before any lookups
            if reply_post_id in post_details_map and reply_orig_map[reply_post_id] in post_details_map:
                user_id = post_details_map[reply_post_id]['user_id']
                if user_id in entity_users:
                    interaction_date = post_details_map[reply_post_id]['created_date']
                    interacted_user = str(post_details_map[reply_orig_map[reply_post_id]]['user_id'])
                    if '[deleted]' not in interacted_user:
                        if user_id in user_to_interaction_dates:
                            interaction_dates = user_to_interaction_dates[user_id]
                            if interaction_date in interaction_dates:
                                interaction_dates[interaction_date] += [interacted_user]
                            else:
                                interaction_dates[interaction_date] = [interacted_user]
                            user_to_interaction_dates[user_id] = interaction_dates
                        else:
                            user_to_interaction_dates[user_id] = {interaction_date: [interacted_user]}



        # Go through and derive the point of activation of each user
        activation_points = {}
        activated_users = set()
        for user in entity_users:
            # get times when they interacted with people
            # dict: {date, [user_id]}
            if user in user_to_interaction_dates:
                ts_interactions = user_to_interaction_dates[user]

                # go through the posts of the entity each day
                for date in sorted(date_to_posts):
                    # only do the computation if the user is not activated already
                    if user not in activated_users:
                        # Go through each post on the sorted dates
                        date_posts = date_to_posts[date]
                        for post in date_posts:
                            # is the post by the user? if so, it is the user's first citation of the entity and they become activated
                            if post_details_map[post]['user_id'] == user:
                                # get how many times the user was exposed to the entity before adopting it
                                prior_post_authors = []
                                for prior_date in sorted(date_to_posts):
                                    if prior_date < date:
                                        # get all the users who authored posts before this one
                                        for prior_post in date_to_posts[prior_date]:
                                            prior_post_authors.append(post_details_map[prior_post]['user_id'])

                                # get all of the posts that were authored by people that the person had replied to in the past
                                prior_users = set()
                                for ts_date in sorted(ts_interactions):
                                    if ts_date < date:
                                        for prior_user in ts_interactions[ts_date]:
                                            prior_users.add(prior_user)
                                exposure_count = 0
                                # count how many times the user was exposed to the entity by authors
                                # they had interacted with beforehand (see the toy illustration after this function)
                                for prior_author in prior_post_authors:
                                    if prior_author in prior_users:
                                        exposure_count += 1

                                # log the exposure count
                                if exposure_count in activation_points:
                                    activation_points[exposure_count] += 1
                                else:
                                    activation_points[exposure_count] = 1

                                # Exit the posts dates loop
                                activated_users.add(user)
                                break

        # Return the mapping between the entity and the activation point distribution
        return (entity, activation_points)
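    # Toy illustration (made-up values, not part of the pipeline) of the exposure count computed
    # above: a user activates on the day of their first citing post, and their exposure count is
    # the number of earlier citing posts authored by people they had previously replied to.
    _toy_prior_post_authors = ['userA', 'userB', 'userC']  # authors of earlier posts citing the entity
    _toy_prior_users = {'userA', 'userB'}                  # users this user had replied to before activating
    _toy_exposure_count = sum(1 for author in _toy_prior_post_authors
                              if author in _toy_prior_users)  # counts 2 exposures here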


    ###### Execution code
    conf = SparkConf().setAppName("NER Diffusion - Exposure Dynamics")
    conf.set("spark.python.worker.memory","10g")
    conf.set("spark.driver.memory","15g")
    conf.set("spark.executor.memory","10g")
    conf.set("spark.default.parallelism", "12")
    conf.set("spark.mesos.coarse", "true")
    conf.set("spark.driver.maxResultSize", "10g")
    conf.set("spark.cores.max", "15")
    conf.setMaster("mesos://zk://scc-culture-slave9.lancs.ac.uk:2181/mesos")
    conf.set("spark.executor.uri", "hdfs://scc-culture-mind.lancs.ac.uk/lib/spark-1.3.0-bin-hadoop2.4.tgz")
    conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory")

    sc = SparkContext(conf=conf)
    sc.setCheckpointDir("hdfs://scc-culture-mind.lancs.ac.uk/data/checkpointing")

    # use sample directory for testing
    # annotationFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated-sample")
    annotationFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated")
    annotationFile.cache()
    thinnedFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/thinned-json")
    # thinnedFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/thinned-json-sample")
    thinnedFile.cache()

    # Top 500 entities file
    top500EntitiesFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/entities/top500_entities.csv")
    top500Entities = top500EntitiesFile.map(lambda x: str(x.encode("utf-8"))).collect()
    print("Top Entities Loaded. Total = " + str(len(top500Entities)))
    top_500_entities_broadcast = sc.broadcast(top500Entities)
    # print(str(len(top500Entities)))
    # print(top500Entities)

    # Load the reply graphs from the thinnedFile
    reply_map_rdd = thinnedFile\
        .flatMap(deriveReplyMap)
    reply_map_rdd.cache()

    reply_orig_map = reply_map_rdd\
        .collectAsMap()
    print("Reply Orig Map Size = " + str(len(reply_orig_map)))
    # print(reply_orig_map)
    # TODO: Convert this to HBase (or a join-based approach; a sketch follows this snippet),
    # as collecting and broadcasting the full map fails with a Java heap space memory error
    reply_orig_map_broadcast = sc.broadcast(reply_orig_map)

    # get the: {orig, [reply]} dictionary
    orig_replies_map = reply_map_rdd\
        .map(lambda x: (x[1], [x[0]]))\
        .reduceByKey(combineReplies)\
        .collectAsMap()
    print("Orig Replies Map Size = " + str(len(orig_replies_map)))
    orig_replies_map_broadcast = sc.broadcast(orig_replies_map)

    # Load the entity to post map - restricted to the top-500 entities
    # input: json_line of annotations of each post
    # output: [(entity, [post])] of posts where entities appeared
    entity_posts_rdd = annotationFile\
        .flatMap(deriveEntityToPosts)\
        .reduceByKey(lambda p1, p2: p1 + p2)
    entity_posts_rdd_map = entity_posts_rdd.collectAsMap()
    # entity_posts_map_broadcast = sc.broadcast(entity_posts_rdd_map)
    print("Entity to posts Map Size = " + str(len(entity_posts_rdd_map)))
    # print(entity_posts_rdd_map)

    # Get the post details - restricted to the top-500 entities for now
    post_details = thinnedFile\
        .map(derivePostDetails)\
        .filter(lambda x: x[0] != "null")\
        .collectAsMap()
    post_details_broadcast = sc.broadcast(post_details)
    print("Post Details Size = " + str(len(post_details)))
    # print(post_details)

    # Compute the exposure curves for each entity in the dataset
    entity_exposure_curves_rdd = entity_posts_rdd\
        .map(derive_per_entity_exposure_distribution)
    entity_exposure_curves_rdd_distribution = entity_exposure_curves_rdd\
        .collect()
    # print(entity_exposure_curves_rdd_distribution)

    print("Entity Exposure Curves Distribution Size = " + str(len(entity_exposure_curves_rdd_distribution)))
    output_string = ""
    for (entity, dist_dictionary) in entity_exposure_curves_rdd_distribution:
        output_string += str(entity)
        for exposure_count in dist_dictionary:
            output_string += "\t" + str(exposure_count) + ", " + str(dist_dictionary[exposure_count])
        output_string += "\n"
    print(output_string)
    outputFile = open("data/exposure_curves.tsv", "w")
    outputFile.write(output_string)
    outputFile.close()

    # stop the Spark context from running
    sc.stop()
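The TODO above notes that collecting and broadcasting the full reply map fails with a Java heap space error. Below is a minimal join-based sketch of an alternative that keeps the data distributed instead of pulling it onto the driver: it reuses thinnedFile, reply_map_rdd and derivePostDetails from the snippet above, assumes deriveReplyMap emits (reply_id, orig_id) pairs (as the collectAsMap above suggests), and the intermediate RDD names are made up here.

post_details_rdd = thinnedFile.map(derivePostDetails)        # (post_id, details_dict)
reply_with_details = reply_map_rdd.join(post_details_rdd)    # (reply_id, (orig_id, reply_details))
orig_keyed = reply_with_details.map(lambda x: (x[1][0], (x[0], x[1][1])))
# (orig_id, ((reply_id, reply_details), orig_details)) - both sides resolved without a broadcast map
reply_pairs_with_details = orig_keyed.join(post_details_rdd)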
Beispiel #39
0
    #ss = SparkSession.builder \
    # .master("local") \
    # .config("spark.some.config.option", "some-value") \
    # .getOrCreate()
    ss = SQLContext(sc)
    thres = int(sys.argv[1])
    data = sc.textFile(
        sys.argv[2]).mapPartitionsWithIndex(lambda idx, row: islice(
            row, 1, None) if idx == 0 else row).map(lambda x: x.split(','))
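    # How the chain below builds the graph edges (assuming each CSV row is "user_id,item_id"):
    # b holds (item, user) pairs, so b.join(b) pairs up users who rated the same item; the
    # aggregateByKey collects the shared items for each user pair, and the filter keeps only
    # pairs with at least `thres` distinct items in common, which become the (src, dst) edges.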

    b = data.map(lambda x: (x[1], x[0])).repartition(15).cache()
    edges = b.join(b).filter(lambda x: x[1][0] != x[1][1]).map(
        lambda x: (x[1], x[0])).aggregateByKey(
            [], lambda x, y: x + [y], lambda x, y: x + y).filter(
                lambda x: len(set(x[1])) >= thres).map(lambda x: x[0])
    sc.setCheckpointDir("s/")
    vertices = ss.createDataFrame(
        edges.flatMap(lambda x: list(x)).distinct().map(lambda x: [x]), ["id"])
    g_edge = ss.createDataFrame(edges, ["src", "dst"])
    print(vertices.count())
    print(g_edge.count())
    g = GraphFrame(vertices, g_edge)

    ans = g.labelPropagation(
        maxIter=5).rdd.map(lambda x: (x[1], x[0])).aggregateByKey(
            [], lambda x, y: x + [y],
            lambda x, y: x + y).map(lambda x: sorted(x[1])).sortBy(
                lambda x: (len(x), x[0])).collect()

    with open(sys.argv[3], "w") as file:
        for i in ans:
            # the original loop body is truncated here; writing one community per line is an assumed format
            file.write(", ".join(str(uid) for uid in i) + "\n")
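The next snippet begins mid-function: ssc and filestream are used without being defined in what survives here. Below is a hedged sketch of the kind of setup it presumably relies on, assuming an existing SparkContext sc; the batch interval, topic name and broker address are placeholders, not taken from the original.

from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

ssc = StreamingContext(sc, 5)  # 5-second batches (placeholder interval)
filestream = KafkaUtils.createDirectStream(
    ssc, ["orders"], {"metadata.broker.list": "localhost:9092"})  # placeholder topic and broker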
    # filestream is a Kafka direct DStream (see the createDirectStream() sketch above) carrying order data from the topic
    print('starting streaming:' + str(datetime.now()))

    stockvol = filestream.map(lambda x: x[1]).flatMap(
        lambda x: [line for line in x.splitlines()]).flatMap(parseOrder).map(
            lambda o: ((o['symbol'], o['type']), o['amount']))
    # process the DStream: take the value of each Kafka (key, value) message,
    # split it into lines, parse each order, and select the columns needed to
    # build ((symbol, type), amount) tuples ready for aggregation by key

    noofrecords = filestream.map(lambda x: x[1]).flatMap(
        lambda x: [line for line in x.splitlines()]).flatMap(parseOrder).map(
            lambda o: ((o['symbol'], o['type']), o['amount']))
    noofrecords.foreachRDD(saveOfunc)
    # keep track of the number of records received from Kafka, for logging purposes

    stockvol_window = stockvol.window(10, 10)
    stockvol_aggr = stockvol_window.reduceByKey(add)
    # create a windowed DStream (10-second window, 10-second slide) and aggregate volumes per key
    # (an equivalent reduceByKeyAndWindow form is sketched after this snippet)

    stockvol_highest = stockvol_aggr.transform(volhigh)
    stockvol_highest.foreachRDD(savefunc)
    # transform the windowed DStream to find the stock with the highest trade volume
    # in each window batch and save it to HDFS

    sc.setCheckpointDir("hdfs:///user/cloudera/checkpoint/")
    ssc.start()

    ssc.awaitTermination()
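A hedged aside on the windowing step above: the explicit window() followed by reduceByKey() can be collapsed into a single reduceByKeyAndWindow() call. A minimal sketch reusing stockvol, volhigh, savefunc and add from the snippet above:

stockvol_aggr = stockvol.reduceByKeyAndWindow(add, None, 10, 10)  # 10-second window and slide
stockvol_highest = stockvol_aggr.transform(volhigh)
stockvol_highest.foreachRDD(savefunc)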
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, Rating


def loadMovieNames():
    movieNames = {}
    with open("ml-100k/u.ITEM") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1].decode('ascii', 'ignore')
    return movieNames


conf = SparkConf().setMaster("local[*]").setAppName("MovieRecommendationsALS")
sc = SparkContext(conf=conf)
sc.setCheckpointDir('checkpoint')

print("\nLoading movie names...")
nameDict = loadMovieNames()
# umadeup.data: a small made-up dataset (3 rows) derived from u.data
data = sc.textFile("C:/SparkCourse/ml-100k/umadeup.data")

ratings = data.map(lambda l: l.split()).map(
    lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

# Build the recommendation model using Alternating Least Squares
print("\nTraining recommendation model...")
rank = 10
# Lowered numIterations to ensure it works on lower-end systems
numIterations = 6
model = ALS.train(ratings, rank, numIterations)
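The example ends right after training. A minimal follow-on sketch of querying the trained model, reusing model and nameDict from above; the user ID is made up for illustration.

userID = 1  # made-up user ID for illustration
print("\nTop 10 recommendations for user %d:" % userID)
for rec in model.recommendProducts(userID, 10):
    # each rec is a Rating(user, product, rating)
    print(nameDict.get(rec.product, "unknown") + " score: " + str(rec.rating))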
def init_spark_context():
    conf = SparkConf().setAppName("MovieRatings").set("spark.executor.memory", "4g")
    sc = SparkContext(conf=conf)
    sc.setCheckpointDir('/tmp/checkpoint/')
    return sc