def initialize():
    global sc, spark, items1, items2, inputfile1, inputfile2
    print("Initializing...")
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    jsonread1 = sc.textFile(inputfile1)
    items1 = jsonread1.map(json.loads)
    jsonread2 = sc.textFile(inputfile2)
    items2 = jsonread2.map(json.loads)
def main(args):

    sc = SparkContext(appName="PGM")

    graph1 = sc.textFile(args.IN[0]).map(line_to_edge)
    graph2 = sc.textFile(args.IN[1]).map(line_to_edge)

    graph_name = args.IN[0]

    seed_num = args.sn
    PARTS = args.PARTS

    G1 = deep_copy(graph1, PARTS)
    G2 = deep_copy(graph2, PARTS)

    IsBucket = ""
    matchtype = ""

    if args.inseeds:
        seeds = sc.textFile(args.inseeds).map(line_to_edge)
        matchtype = "_seeded_"
        ETA = 0

    else:
        matchtype = "_seedless_"
        start = time()
        seeds = dinoise.seed_generator(sc, G1, G2, seed_num, PARTS)
        stop = time()
        ETA = round(float(stop - start) / 60, 4)
        stats = evaluate_output(graph_name + matchtype + IsBucket, G1, G2,
                                seeds, "seeds_log.csv", ETA, PARTS)

    if not args.bucketing:

        start = time()
        res = dinoise.distributed_noisy_seeds(sc, G1, G2, seeds, PARTS)
        stop = time()

    else:

        start = time()
        res = dinoise_w_bucketing.distributed_noisy_seeds(
            sc, G1, G2, seeds, PARTS)
        IsBucket = "_bucket_"
        stop = time()

    ETB = round(float(stop - start) / 60, 4)
    stats = evaluate_output(graph_name + matchtype + IsBucket, G1, G2, res,
                            "results_log.csv", ETB, PARTS)

    sc.stop()
    def set_tweets_data(self, path):
        sc = SparkContext.getOrCreate()
        # Read the data and split each line into columns.
        self.tweets = sc.textFile(path).map(
            lambda line: line[1:-1].split('","'))
        # Keep only the selected columns.
        self.tweets = self.tweets.map(lambda row: (row[1], row[10], row[24]))
class InvertedDictionary:
    def __init__(self, fileConfig):
        self.sc = SparkContext()
        self.sqlContext = HiveContext(self.sc)
        self.invertedDict = {}
        self.conn = boto.connect_s3(host='s3.amazonaws.com')
        self.bucket = fileConfig['s3_bucket']
        self.folder = fileConfig['s3_folder']

    def buildInvertedDict(self):
        bucket = self.conn.get_bucket(self.bucket)
        for idx, file in enumerate(bucket.list(self.folder, '/')):
            if idx == 0:
                continue
            fileId = file.key.split('/')[1]
            rdd = self.sc.textFile('s3a://{}/{}/{}'.format(
                self.bucket, self.folder, fileId))
            output = rdd.flatMap(lambda words: [re.sub(r'[^a-z\']', '', word) for word in words.lower().split()])\
                  .map(lambda word: (word, fileId))

            for row in output.collect():
                word, docId = str(row[0]), int(row[1])
                if word not in self.invertedDict:
                    self.invertedDict[word] = [docId]
                else:
                    if docId in self.invertedDict[word]:
                        pass
                    else:
                        self.invertedDict[word].append(docId)
def initialize():
    global sc, spark, items, inputfile, buckets_user, buckets_business, partition, totalSize, t, mainThreshold
    print("Initializing...")
    t = time.time()
    candidateList = []
    frequentList = []
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: (line) != columnName)

    buckets_user = items.groupByKey().mapValues(list).filter(
        lambda x: len(x[1]) > mainThreshold).mapPartitionsWithIndex(
            removeDuplicateEntriesAfter)
    print("Without Duplicates DOne..")
    # withoutDuplicates = checkM.mapPartitionsWithIndex(
    #     removeDuplicateEntries).groupByKey().mapValues(list)

    if (case == 1):
        # buckets_user = withoutDuplicates.mapPartitionsWithIndex(
        #     createBuckets).groupByKey().mapValues(list).filter(lambda x: len(x[1]) > mainThreshold)

        callSonPhase1(buckets_user)
        print("Initializing Phase 2.....")
        finalFreq = buckets_user.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))

        # print((finalFreq.collect()))
        finalOutput = (finalFreq.collect())
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        # print(x)
        printingFreq(x)

        pass
    if (case == 2):
        buckets_business = withoutDuplicates.mapPartitionsWithIndex(
            createBuckets_case2).groupByKey().mapValues(list)
        callSonPhase1(buckets_business)
        print("Initializing Phase 2.....")
        finalFreq = buckets_business.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))

        # print((finalFreq.collect()))
        finalOutput = (finalFreq.collect())
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        # print(x)
        printingFreq(x)

        pass
def main():
    parser = argparse.ArgumentParser(description="Find Dependency inclusions")
    parser.add_argument('--path', type=str)
    parser.add_argument('--cores', type=str)
    args = parser.parse_args()

    sc = SparkContext(appName="DDM")
    sc.getConf().set("spark.executor.cores", args.cores)
    sc.getConf().set("spark.driver.cores", args.cores)
    sc.getConf().set("spark.worker.cores", args.cores)
    sc.getConf().set("spark.deploy.defaultCores", args.cores)
    sc.getConf().set("spark.driver.memory", "15g")
    global number_of_columns
    data = []
    file_headers = []
    for file in os.listdir(args.path):
        if file.endswith(".csv"):
            rdd = sc.textFile(os.path.join(args.path, file)).map(lambda line: line[1:-1].split("\";\""))

            file_data = rdd.collect()
            file_header = file_data[0]
            del file_data[0]
            file_data = [(number_of_columns, x) for x in file_data]
            data += file_data
            file_headers += file_header
            number_of_columns = number_of_columns + len(file_header)

    header_dummies = list(range(0, number_of_columns))
    rdd = sc.parallelize(data)
    values_as_key = rdd.flatMap(lambda el: list(zip(el[1], range(el[0], el[0] + len(el[1])))))
    unique_values = values_as_key.map(lambda x: (x[0], x[1])).groupByKey().mapValues(set)
    unique_values = unique_values.map(lambda x: (tuple(x[1]), 0)).reduceByKey(sum_func)
    matrix_per_key = unique_values.map(lambda x: make_candidate_matrix(x[0]))
    result_matrix = matrix_per_key.reduce(lambda x, y: matrix_and(x, y))

    assert len(result_matrix) == number_of_columns

    output = []
    for i in range(0, number_of_columns):
        assert len(result_matrix[i]) == number_of_columns
        output.append([])

    for i in range(0, len(result_matrix)):
        for j in range(0, len(result_matrix[i])):
            if i != j and result_matrix[i][j]:
                output[j].append(file_headers[i])

    for i in range(0, len(output)):
        row = output[i]
        if len(row) != 0:
            output_string = str(row[0])
            for j in range(1, len(row)):
                output_string += (", " + str(row[j]))
            print(str(file_headers[i]) + " < " + output_string)

    sc.stop()
def main():
    sc = SparkContext(appName='TextSimillarity')
    sqlcont = SQLContext(sc)
    rdd = sc.textFile("test.csv")
    header = rdd.first()
    newrdd = rdd.filter(lambda x: x != header)\
        .map(lambda x: x.split(','))\
        .map(lambda x: Row(description_x=x[1], description_y=x[2]))
    new_df = sqlcont.createDataFrame(newrdd)
    calculate_simillarity(new_df)
def initialize():
    global items, inputfile, sc, filterThreshold, t, totalEdges, cost_dict, strict_totalNodes, adjacency_listMain
    t = time.time()
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    # sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    # print(columnName)
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: (line) != columnName)

    # Getting user and their business count
    user_business = items.groupByKey().mapValues(set).collect()
    tuple_edge_list = []

    for i in range(0, len(user_business) - 1):
        for j in range(i + 1, len(user_business)):
            inter = user_business[i][1] & user_business[j][1]
            if len(inter) >= filterThreshold:
                tuple_edge_list.append(
                    (str(user_business[i][0]), str(user_business[j][0])))
                tuple_edge_list.append(
                    (str(user_business[j][0]), str(user_business[i][0])))

    totalEdges = float(len(tuple_edge_list) / 2)
    adjacency_list = sc.parallelize(tuple_edge_list).groupByKey().mapValues(
        list).collectAsMap()
    adjacency_listMain = copy.deepcopy(adjacency_list)
    totalNodes = list(adjacency_list.keys())

    # ------------------------Newly added line------------------------
    strict_totalNodes = copy.deepcopy(totalNodes)
    # print(len(totalNodes))

    # ----------------------Part 1---------------------
    bfs(totalNodes, adjacency_list)
    print("Writing Betweenness to File....")

    # Converting into sorted List Initial Betweenness
    list_val = list(cost_dict.items())

    list_val.sort(key=lambda x: (-x[1], x[0]))
    writeToFile(list_val)
    totalNodes = copy.deepcopy(strict_totalNodes)
    # print(len(totalNodes))
    # ----------------------Part 2----------------------
    print("Creating Partitions....")
    create_components(list_val, adjacency_listMain, totalNodes, totalEdges)
    # ---------------------EoC---------------------------

    print("Duration: " + str(time.time() - t))
def main(args):

    sc = SparkContext(appName="PGM")

    graph1 = sc.textFile(args.IN[0]).map(line_to_edge)
    graph2 = sc.textFile(args.IN[1]).map(line_to_edge)

    graph_name = args.IN[0]

    seed_num = args.sn
    PARTS = args.PARTS

    G1 = deep_copy(graph1, PARTS)
    G2 = deep_copy(graph2, PARTS)

    if args.inseeds:
        seeds = sc.textFile(args.inseeds).map(line_to_edge)
        matchtype = "seeded"
        ETA = 0

    else:
        matchtype = "seedless"
        start = time()
        seeds = seed_generator(sc, G1, G2, seed_num, PARTS)
        stop = time()
        ETA = round(float(stop - start) / 60, 4)
        seeds2 = seeds.map(lambda pair: str(pair[0]) + " " + str(pair[1]))
        seeds2.coalesce(1).saveAsTextFile(args.OUT + "bucketing_segen_seeds")

    start = time()
    res = distributed_noisy_seeds(sc, G1, G2, seeds, PARTS)
    stop = time()
    ETB = round(float(stop - start) / 60, 4)
    res2 = res.map(lambda pair: str(pair[0]) + " " + str(pair[1]))
    res2.coalesce(1).saveAsTextFile(args.OUT + matchtype +
                                    "_bucketing_matching")
    print("\nSeGen   time :" + str(ETA) + " min ")
    print("DiNoiSe time :" + str(ETB) + " min\n")
    sc.stop()
    return [ETA, ETB]
def load():
    """
    This function should prepare local configuration and initiate a SparkContext.
    Then a file must be read and the String RDD must be transformed into an Article RDD
    :return: an RDD of Articles

    :see: Hint: use article.parseLine for the transformation
    """
    logger.info("Prepare spark contest and load data")
    conf = SparkConf().setMaster('local').setAppName("Ranking App")\
        .set("spark.executor.memory", "3g").set("spark.driver.memory", "3g").set("spark.python.worker.memory", "3g") \
        .set("spark.driver.maxResultsSize", 0)
    sc = SparkContext(conf=conf)
    return sc.textFile("data/wikipedia.dat").map(lambda x: parse_line(x))
    def select(self,
               col_names=None,
               filepath=None,
               sampler=None,
               sampling_rate=1,
               task="classfication"):
        sc = SparkContext(conf=SparkConf().setAppName("select"))

        # Columns Selection
        if col_names is None:
            print("============No Column Names Specified============")
            data_rdd = sc.textFile(filepath).map(self.parse_xy)
        else:
            header_rdd = sc.textFile(filepath + "/header").map(
                self.parse_header)
            header_list = sum(header_rdd.collect(), [])
            #print("header list: ", header_list)

            columns_ind = [header_list.index(col)
                           for col in col_names]  # [Y, X1, X2, ...]
            X_col_ind = columns_ind[1:]
            Y_col_ind = columns_ind[0]
            #print("columns_index: ", columns_ind)

            # How do we pass arguments to the map function? This should be factored out separately.
            data_rdd = sc.textFile(filepath + "/data").map(lambda vec: ([
                [float(x) for x in vec.split(',')][i] for i in X_col_ind
            ], [float(x) for x in vec.split(',')][Y_col_ind]))

        if sampler is None:
            print("============No Sampling Method Specified============")
        elif sampler == "random":
            data_rdd = data_rdd.sample(False, sampling_rate)  # note: sample() is approximate, not exact

        print("data: ", data_rdd.collect()[0])
        print("len(data): ", len(data_rdd.collect()))
def get_top_3():
    """
    统计每个网址访问的uid的前三个人及他访问的次数
    :return:
    """

    spark = SparkContext("local", "get_top3")
    data = spark.textFile("data.txt")

    data.map(lambda line:(line.split(" ")[2],line.split(" ")[3])).\
        groupByKey()\
        .flatMap(lambda info:get_site_uid_cnt(info))\
        .groupByKey()\
        .map(lambda info:get_top3_uid_and_cnt(info))\
        .foreach(print)
def run():
    sc = SparkContext("local", "WordCount")  #初始化配置
    filename_hdfs = "/sparklearn/data/小王子.txt"
    filename_linux = "file:///media/sl/D/java/idea/a2019/sparklearn/data/小王子.txt"
    filename_win = "D:\\java\\idea\\a2019\\sparklearn\\data\\小王子.txt"
    data = sc.textFile(filename_linux)  # read the UTF-8 encoded file
    stopwords_linux = "/media/sl/D/java/idea/a2019/sparklearn/data/stopwords-master/百度停用词表.txt"
    stopwords_win = r'D:\java\idea\a2019\sparklearn\data\stopwords-master\百度停用词表.txt'

    with open(stopwords_linux, 'r', encoding='utf-8') as f:
        x = f.readlines()
    stop = [i.replace('\n', '') for i in x]
    stop.extend([
        ',', '的', '我', '他', '', '。', ' ', '\n', '?', ';', ':', '-', '(', ')',
        '!', '1909', '1920', '325', 'B612', 'II', 'III', 'IV', 'V', 'VI', '—',
        '‘', '’', '“', '”', '…', '、'
    ])  # stop words: punctuation and similar tokens
    data=data.flatMap(lambda line: jieba.cut(line,cut_all=False)).filter(lambda w: w not in stop).\
        map(lambda w:(w,1)).reduceByKey(lambda w0,w1:w0+w1).sortBy(lambda x:x[1],ascending=False)
    print(data.take(100))
class MainApp(object):
    def __init__(self):
        pass
    
    def init(self):
        os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local[10]")
        conf.setAppName("PySparkShell")
        conf.set("spark.executor.memory", "2g")
        conf.set("spark.driver.memory", "1g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)        

    def loadData(self):
        category_list = self.sc.textFile("/Users/abhinavrungta/Desktop/uf-study/snc/github/SNC-WEB/src/yahoo/ydata-ymovies-user-movie-ratings-train-v1_0.txt").map(lambda line: (int(line.split(',')[0]), int(line.split(',')[1]), float(line.split(',')[2]), long(line.split(',')[3])))
        category_schema = StructType([
            StructField("userid", IntegerType(), True),
            StructField("movieid", IntegerType(), True),
            StructField("rating", FloatType(), True),
            StructField("time", LongType(), True)
        ])
        category_list = self.sqlContext.createDataFrame(category_list, category_schema)
        category_list.registerTempTable("data")
        movie_list = self.sqlContext.sql("SELECT movieid, COUNT(movieid) AS ct FROM data GROUP BY movieid")
        movie_list.registerTempTable("movie")
        movieid = movie_list.sort(movie_list.ct.desc()).first().movieid
        # movieid = category_list.first().movieid
        category_list = self.sqlContext.sql("SELECT * FROM data WHERE movieid = {0}".format(movieid))
        category_list.registerTempTable("data")
        user_list = self.sqlContext.sql("SELECT DISTINCT userid FROM data LIMIT 50")
        print(user_list.count())
        user_list.show()
        user_list.registerTempTable("users")
        category_list = self.sqlContext.sql("SELECT d.userid AS userid, d.movieid AS movieid, d.rating AS rating, d.time AS time FROM data d, users u WHERE d.userid = u.userid").repartition(1)
        #category_list = self.sqlContext.createDataFrame(category_list, category_schema)
        category_list = category_list.map(lambda line: str(line.userid) + "," + str(line.movieid) + "," + str(line.rating) + "," + str(line.time))
        category_list = category_list.repartition(1)
        category_list.saveAsTextFile("data.txt")
def main():
    from pyspark.context import SparkContext
    from operator import add

    # Clean up previous results that can cause failure
    if os.path.isdir(os.path.join(os.getcwd(), "wc_out")):
        shutil.rmtree(os.path.join(os.getcwd(), "wc_out"))

    #print(os.environ.get("SPARK_HOME"))
    sc = SparkContext(appName="HelloWorld")
    f = sc.textFile("test_word_count.txt")
    wc = f.flatMap(lambda x: x.split(' ')).map(lambda x: (x, 1)).reduceByKey(
        add)
    wc.saveAsTextFile("wc_out")

    # Check if results files exist and raise error if they do not.
    if os.path.isdir(os.path.join(os.getcwd(), "wc_out")):
        print("Test succeeded, results files are present for word count.")
    else:
        # print("Test failed, no results files were found for word count.")
        raise ValueError("test_word_count.py has Failed.")
import sys
import pyspark
from pyspark.context import SparkContext
from pyspark import SparkConf

# Parse the input parameters
input_file_name = sys.argv[1]
number_k_for_topK = int(sys.argv[2])
print(input_file_name)
print(str(number_k_for_topK))

# Prepare the Spark context
conf = SparkConf().setMaster("local") \
                  .setAppName("Word Count Spark") \
                  .set("spark.executor.memory", "4g") \
                  .set("spark.executor.instances", 1)
sc = SparkContext(conf=conf)
book = sc.textFile(input_file_name)

# WordCount
words_counted = book.flatMap(lambda line: line.split(" ")) \
                    .map(lambda word:(word,1)) \
                    .reduceByKey(lambda x,y:x+y )

print(words_counted)

# Output the top-K most frequent words
topk = words_counted.sortBy(lambda keyvalue: -keyvalue[1]).take(
    number_k_for_topK)
print(topk)
                out_s += "\n"
                output_mark += 1
    out_s = out_s.strip("\n")
    return out_s


if __name__ == "__main__":

    time1 = time.time()

    sc = SparkContext('local[*]', 'inf553_hw2_2')  # initiate
    sc.setLogLevel("OFF")

    T = int(sys.argv[1])
    S = int(sys.argv[2])
    input_file = sc.textFile(sys.argv[3])  # readfile
    output_file = sys.argv[4]

    data = input_file.map(lambda x: x.split(',')).filter(
        lambda x: x[0] != "user_id")
    # create basket
    baskets = data.groupByKey().map(lambda x: (x[0], list(x[1]))).filter(
        lambda x: len(x[1]) > T).persist(StorageLevel(True, True, False,
                                                      False))

    # baskets = baskets.coalesce(4, True).persist(StorageLevel(True, True, False, False))

    N = baskets.count()

    # Pass 1
    # Pass 1 Map
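    # The example is truncated here; a minimal sketch (an assumption, not the original
    # solution) of the SON Pass 1 map step for single items: each partition counts items
    # locally and keeps those meeting the support S scaled down to the partition's share.
    def pass1_map(partition):
        part = [basket for (_, basket) in partition]
        local_threshold = S * len(part) / float(N)
        counts = {}
        for basket in part:
            for item in set(basket):
                counts[item] = counts.get(item, 0) + 1
        return [item for item, c in counts.items() if c >= local_threshold]

    candidate_singletons = baskets.mapPartitions(pass1_map).distinct().collect()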
from __future__ import print_function

import sys
from operator import add

from pyspark.sql import SparkSession

from pyspark.context import SparkContext, SparkConf

if __name__ == "__main__":
    config = SparkConf().setAppName("wordCount").setMaster("local")
    sc = SparkContext(conf=config)
    lines = sc.textFile("./src/main/python/wordCount/hello.txt")
    words = lines.flatMap(lambda line: line.split(" "))
    wordCountMap = words.map(lambda word: (word, 1))
    # count = wordCountMap.reduceByKey(lambda preCount, count: preCount + count)
    # output = count.collect()
    # print(output)

    # use countByKey instead
    count = wordCountMap.countByKey()
    print(count)
                        default="mnist_model")
    parser.add_argument("--export_dir",
                        help="path to export saved_model",
                        default="mnist_export")
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")

    args = parser.parse_args()
    print("args:", args)

    # create RDD of input data
    def parse(ln):
        vec = [int(x) for x in ln.split(',')]
        return (vec[1:], vec[0])

    images_labels = sc.textFile(args.images_labels).map(parse)

    cluster = TFCluster.run(sc,
                            main_fun,
                            args,
                            args.cluster_size,
                            num_ps=0,
                            tensorboard=args.tensorboard,
                            input_mode=TFCluster.InputMode.SPARK,
                            master_node='chief')
    # Note: need to feed extra data to ensure that each worker receives sufficient data to complete epochs
    # to compensate for variability in partition sizes and spark scheduling
    cluster.train(images_labels, args.epochs)
    cluster.shutdown()
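    # One way the note above is sometimes handled (an assumption, not part of this example):
    # pad the input RDD with a small resampled slice so no partition runs out of records early.
    # extra = images_labels.sample(True, 0.05)   # roughly 5% extra records, sampled with replacement
    # cluster.train(images_labels.union(extra), args.epochs)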
from pyspark.context import SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.sql.types import DoubleType
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
#Setup
#spark = SparkSession.builder.master("spark://master:7077").appName("adult").config("spark.some.config.option", "akki").getOrCreate().enableHiveSupport()
sc = SparkContext()
sqlContext = SQLContext(sc)

# Load and parse the data file into an RDD of LabeledPoint.
data = sc.textFile('C:/Users/akshaykumar.kore/Downloads/data/adults.csv').map(
    lambda line: line.split(","))
#data=sc.read.csv("C:/Users/akshaykumar.kore/Downloads/data/adult1.csv", header=True, mode="DROPMALFORMED")

data = data.toDF()

data = data.na.fill(0)

categoricalColumns = ["_2", "_4", "_6", "_7", "_8", "_9", "_10", "_14"]
stages = []  # stages in our Pipeline
for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=categoricalCol + "Index")
    # Use OneHotEncoder to convert categorical variables into binary SparseVectors
    encoder = OneHotEncoder(inputCol=categoricalCol + "Index",
                            outputCol=categoricalCol + "classVec")
    with zipfile.ZipFile(small_dataset_path, "r") as z:
        z.extractall(datasets_path)

if not os.path.exists(complete_dataset_path):
    with zipfile.ZipFile(complete_dataset_path, "r") as z:
        z.extractall(datasets_path)

small_ratings_file = os.path.join(datasets_path, 'ml-latest-small',
                                  'ratings.csv')

# initiate a SparkContext
from pyspark.context import SparkContext
sc = SparkContext('local', 'movie-recommender-engine')

# give small_ratings_file in input to the sc
small_ratings_raw_data = sc.textFile(small_ratings_file)
small_ratings_raw_data_header = small_ratings_raw_data.take(1)[0]

# Parse the Raw data into a new RDD - Ratings.
small_ratings_data = small_ratings_raw_data.filter(lambda line: line!=small_ratings_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (tokens[0],tokens[1],tokens[2])).cache()

small_movies_file = os.path.join(datasets_path, 'ml-latest-small',
                                 'movies.csv')

small_movies_raw_data = sc.textFile(small_movies_file)
small_movies_raw_data_header = small_movies_raw_data.take(1)[0]

# Parse the Raw data into a new RDD - Movies.
small_movies_data = small_movies_raw_data.filter(lambda line: line!=small_movies_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (tokens[0],tokens[1])).cache()
import sys
import time
import math
import utils
from pyspark.context import SparkContext

if (len(sys.argv) > 1):
	hdfs_file_path = "/user/lsde02/data/%s/*.gz" % sys.argv[1]
else:
	hdfs_file_path = "/user/lsde02/data/1901/*.gz"
hdfs_results_path = "/user/lsde02/results/"
start_time = time.strftime("%Y-%m-%d-%H-%M-%S")

sc = SparkContext()
context = sc.textFile(hdfs_file_path)
stations = context.flatMap(lambda x: [utils.extract(record) for record in x.splitlines()])
stations = stations.filter(lambda x: 'longitude' in x[1] and 'latitude' in x[1])
stations.persist()

# Do computations on month level
month_data = stations.map(lambda x:((x[0][0], x[0][1], x[0][3]), (x[1]['temp'], x[1]['wind-speed'], x[1]['sky-condition'], x[1]['visibility'], \
				x[1]['wind-direction'])))
# Accumulate, per key, a running (sum, count) for each measurement plus the summed
# sin/cos of the wind direction so the mean direction can be recovered with atan2.
month_data = month_data.combineByKey(
				lambda value: (value[0], 1, value[1], 1, value[2], 1, value[3], 1,
					math.sin(value[4]*math.pi/180.), math.cos(value[4]*math.pi/180.)),
				lambda x, value: (x[0] + value[0], x[1] + 1, x[2] + value[1], x[3] + 1, x[4] + value[2], x[5] + 1,
					x[6] + value[3], x[7] + 1, x[8] + math.sin(value[4]*math.pi/180.), x[9] + math.cos(value[4]*math.pi/180.)),
				lambda x, y: (x[0]+y[0], x[1]+y[1], x[2]+y[2], x[3]+y[3], x[4]+y[4], x[5]+y[5], x[6]+y[6], x[7]+y[7], x[8]+y[8],
					x[9]+y[9]))
month_data = month_data.map(lambda kv: (kv[0], (kv[1][0]/kv[1][1], kv[1][2]/kv[1][3], kv[1][4]/kv[1][5], kv[1][6]/kv[1][7],
					math.atan2(kv[1][8], kv[1][9]))))
month_data = month_data.coalesce(1, True)
month_data.saveAsTextFile("%s%s-%s" % (hdfs_results_path, start_time, 'all'))
def run(host, database, collection, start_time=None, end_time=None, center=None, degree=None):
    response = tangelo.empty_response()

    # Bail with error if any of the required arguments is missing.
    missing = [name for name, value in zip(["start_time", "end_time", "center", "degree"], [start_time, end_time, center, degree]) if value is None]
    if len(missing) > 0:
        response["error"] = "missing required arguments: %s" % (", ".join(missing))
        return response

    # Cast the arguments to the right types.
    #
    # The degree is the degree of separation between the center element and the
    # retrieved nodes - an integer.
    try:
        degree = int(degree)
    except ValueError:
        response["error"] = "argument 'degree' must be an integer"
        return response

    # The start time is the number of milliseconds since the epoch (which is how
    # JavaScript dates are constructed, and therefore how dates are stored in
    # MongoDB) - an integer.
    try:
        start_time = datetime.datetime.strptime(start_time, "%Y-%m-%d")
    except ValueError:
        response["error"] = "argument 'start_time' must be in YYYY-MM-DD format"
        return response

    # The end time is another date - an integer.
    try:
        end_time = datetime.datetime.strptime(end_time, "%Y-%m-%d")
    except ValueError:
        response["error"] = "argument 'end_time' must be in YYYY-MM-DD format"
        return response

    # Get a handle to the database collection.
    if SparkContext._active_spark_context is None:
        sc = SparkContext('spark://impaladev.darpa.mil:7077', 'Enron Emailers')
    else:
        sc = SparkContext._active_spark_context

    enronData = sc.textFile('hdfs://localhost:8020/user/bigdata/pgill/enron/email_graph_fixed.txt').map(lambda line: line.split('\t')).cache()
            
    def withinTimespan(record):
        recordDate = datetime.datetime.strptime(record[2], "%Y-%m-%d")
        return recordDate >= start_time and recordDate < end_time
    
    def emptyRecords(record):
        return record[0] != "" and record[1] != ""
        
    def orderRecord(record):
        if record[1] < record[0]:
            record[0], record[1] = record[1], record[0]
        return record

    enronSpan = enronData.filter(withinTimespan).filter(emptyRecords).map(orderRecord).map(lambda rec: (rec[0], rec[1])).distinct().cache()
    
    # Start a set of all interlocutors we're interested in - that includes the
    # center emailer.
    talkers = set([center])

    # Also start a table of distances from the center.
    distance = {center: 0}

    current_talkers = list(talkers)
    all_results = []
    for i in range(degree):
        
        def emailsInvolved(record):
            return any(keyword in record for keyword in current_talkers)
        
        results = enronSpan.filter(emailsInvolved).collect()

        # Collect the names.
        current_talkers = list(itertools.chain(*map(lambda x: [x[1], x[0]], results)))
        current_talkers = list(set(current_talkers))
        talkers = talkers.union(current_talkers)

        # Compute updates to everyone's distance from center.
        for t in current_talkers:
            if t not in distance:
                distance[t] = i+1

        # save the cursor.
        all_results.append(results)

    # Construct a canonical graph structure from the set of talkers and the list
    # of emails.
    #
    # Start with an index map of the talkers.
    talkers = list(talkers)
    talker_index = {name: index for (index, name) in enumerate(talkers)}

    # Create a chained iterable from all the rewound partial results.
    all_results = itertools.chain(*all_results)

    # Create a list of graph edges suitable for use by D3 - replace each record
    # in the data with one that carries an index into the emailers list.
    edges = []
    ident = 0
    for result in all_results:
        source = result[0]
        target = result[1]
        ident += 1

        rec = { "source": talker_index[source],
                "target": talker_index[target],
                "id": str(ident) }

        edges.append(rec)

    talkers = [{"email": n, "distance": distance[n]} for n in talkers]

    # Stuff the graph data into the response object, and return it.
    response["result"] = { "nodes": talkers,
                           "edges": edges }
    return response
if args.format == "tfr":
  images = sc.newAPIHadoopFile(args.images, "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
                              keyClass="org.apache.hadoop.io.BytesWritable",
                              valueClass="org.apache.hadoop.io.NullWritable")
  def toNumpy(bytestr):
    example = tf.train.Example()
    example.ParseFromString(bytestr)
    features = example.features.feature
    image = numpy.array(features['image'].int64_list.value)
    label = numpy.array(features['label'].int64_list.value)
    return (image, label)
  dataRDD = images.map(lambda x: toNumpy(str(x[0])))
else:
  if args.format == "csv":
    images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')])
    labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])
  else: # args.format == "pickle":
    images = sc.pickleFile(args.images)
    labels = sc.pickleFile(args.labels)
  print("zipping images and labels")
  dataRDD = images.zip(labels)

cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK)
if args.mode == "train":
  cluster.train(dataRDD, args.epochs)
else:
  labelRDD = cluster.inference(dataRDD)
  labelRDD.saveAsTextFile(args.output)
cluster.shutdown()
import os
import sys
from pprint import pprint
from operator import add
import pyspark
from pyspark.context import SparkContext

sc = SparkContext()
file = "SampleData3.txt"

wordcounts = sc.textFile(file) \
        .map(lambda l: (l.split(" ")[0], [x for x in l.split(" ")[1:] if ("gene_" in x or "disease_" in x)])) \
        .flatMap(lambda kv: kv[1]) \
        .map(lambda x: (x, 1)) \
        .reduceByKey(lambda x, y: x + y) \
        .map(lambda x: (x[1], x[0])) \
        .sortByKey(False)


    else:
        try:
            id1 = int(data[0].strip())
            id2 = int(data[1].strip())
            return (id1, id2)
        except:
            return (-1, "error")

filePath = '/home/piyush/datasets/audioscrobbler/'

conf = SparkConf().setAppName("audio_scrobbler")
sc = SparkContext(conf=conf)

# parse raw user artist data
userArtistDataFile = filePath + 'user_artist_data.txt'
rawUserArtistData = sc.textFile(userArtistDataFile)

# parse Artist data file
artistDataFile = filePath + 'artist_data.txt'
rawArtistData = sc.textFile(artistDataFile)
artistById = rawArtistData.map(parseArtistByIdData).filter(lambda kv: kv[0] != -1)

# parse artist alias file
artistAliasDataFile = filePath + 'artist_alias.txt'
rawArtistAliasData = sc.textFile(artistAliasDataFile)
artistAlias = rawArtistAliasData.map(parseArtistAliasData).filter(lambda kv: kv[0] != -1).collectAsMap()

# broadcast variable
bArtistAlias = sc.broadcast(artistAlias)
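# A minimal sketch of how the broadcast alias map is typically applied (an assumption, not
# part of the original snippet): rewrite each play record's artist id to its canonical id.
# Lines of user_artist_data.txt are assumed to be "userID artistID playCount".
def canonicalize(line):
    tokens = line.split(' ')
    user_id, artist_id, count = int(tokens[0]), int(tokens[1]), int(tokens[2])
    final_artist_id = bArtistAlias.value.get(artist_id, artist_id)
    return (user_id, final_artist_id, count)

trainData = rawUserArtistData.map(canonicalize).cache()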

class TestRDDFunctions(PySparkTestCase):

    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550
        self.sc.stop()
        self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034
        rdd1 = self.sc.parallelize([1, 2])
        rdd2 = self.sc.parallelize([3, 4])
        cart = rdd1.cartesian(rdd2)
        result = cart.map(lambda (x, y): x + y).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601
        data = self.sc.parallelize(["Hello", "World!"])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsPickleFile(tempFile.name)
        pickled_file = self.sc.pickleFile(tempFile.name)
        pickled_file.map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Regression test for
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        a = self.sc.textFile(path)
        result = a.cartesian(a).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025
        tempFile = tempfile.NamedTemporaryFile(delete=False)
        tempFile.write("Hello World!")
        tempFile.close()
        data = self.sc.textFile(tempFile.name)
        filtered_data = data.filter(lambda x: True)
        self.assertEqual(1, filtered_data.count())
        os.unlink(tempFile.name)
        self.assertRaises(Exception, lambda: filtered_data.count())

    def testAggregateByKey(self):
        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def seqOp(x, y):
            x.add(y)
            return x

        def combOp(x, y):
            x |= y
            return x

        sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect())
        self.assertEqual(3, len(sets))
        self.assertEqual(set([1]), sets[1])
        self.assertEqual(set([2]), sets[3])
        self.assertEqual(set([1, 3]), sets[5])

    def test_itemgetter(self):
        rdd = self.sc.parallelize([range(10)])
        from operator import itemgetter
        self.assertEqual([1], rdd.map(itemgetter(1)).collect())
        self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect())

    def test_namedtuple_in_rdd(self):
        from collections import namedtuple
        Person = namedtuple("Person", "id firstName lastName")
        jon = Person(1, "Jon", "Doe")
        jane = Person(2, "Jane", "Doe")
        theDoes = self.sc.parallelize([jon, jane])
        self.assertEquals([jon, jane], theDoes.collect())

    def test_large_broadcast(self):
        N = 100000
        data = [[float(i) for i in range(300)] for i in range(N)]
        bdata = self.sc.broadcast(data)  # 270MB
        m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum()
        self.assertEquals(N, m)
    # read data as CSV for Dataframe analysis
    # /Volumes/work/data/kaggle/ssi.csv

    # read data normally
    """

    sqlContext = SQLContext(sc)
    df = sqlContext.read.format('com.databricks.spark.csv').options(header='false').load(BASE_DATA_PATH + '/ssi.csv')
    # summarize(df)
    print df.show()

    #points = df.map(lambda row: LabeledPoint(input[row.C4],[float(row.C0),float(row.C1),float(row.C2),float(row.C3)]))

    values using Dataframe
    Final weights: [-137.221167143,12.555647803,53.629362055,109.314252441]
    Final intercept: 0.0
    """

    points = sc.textFile(BASE_DATA_PATH + "/ssi.csv").map(parsePoint)
    model = LogisticRegressionWithSGD.train(points, 10)
    print("Final weights: " + str(model.weights))
    print("Final intercept: " + str(model.intercept))

    """
    Final weights: [-137.221167143,12.555647803,53.629362055,109.314252441]
    Final intercept: 0.0
    """

    sc.stop()
import os
import sys
import platform

import py4j

import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext, HiveContext
from pyspark.storagelevel import StorageLevel

# init - create SparkContext on Azure HdInsight
# on Azure HdInsight the master defaults to yarn
sc = SparkContext(appName="wc")

print "sys.argv[1]: ", sys.argv[1] 

wc = sc.textFile(sys.argv[1]) \
  .map( lambda x: x.replace(',',' ').replace('.',' ').replace('-',' ').lower()) \
  .flatMap(lambda x: x.split(" ")) \
  .map(lambda x: (x, 1)) \
  .reduceByKey(lambda x, y: x + y)

print wc.collect()
import py4j
import pyspark
from pyspark.context import SparkContext

sc = SparkContext()
# Control our logLevel. This overrides any user-defined log settings.
# Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
sc.setLogLevel("FATAL")

text_file = sc.textFile(spark_home + "/README.md")
word_counts = text_file \
    .flatMap(lambda line: line.split()) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b)
print word_counts.collect()
from pyspark.context import SparkContext
from pyspark import SparkConf
from collections import OrderedDict
import sys
import json

conf = SparkConf()
conf.setMaster("local[*]")
sc = SparkContext(conf = conf)
path = sys.argv[1]

output = []
top_20_cust = []
top_20_business = []

review_file = sc.textFile(path)
rdd_json = review_file.map(json.loads).map(lambda x : (x['user_id'],x['business_id'],x['useful'],x['text'],x['stars']))

useful = rdd_json.filter(lambda x: x[2] > 0).count()

stars = rdd_json.filter(lambda x: x[4] == 5.0).count()

longest_review = rdd_json.map(lambda x: len(x[3])).max()

rdd_cust = rdd_json.map(lambda x: (x[0],1))\
				   .reduceByKey(lambda x,y: x + y)\
				   .sortBy(lambda x: (-x[1],x[0]))

rdd_business = rdd_json.map(lambda x: (x[1],1))\
                       .reduceByKey(lambda x,y : x + y)\
                       .sortBy(lambda x: (-x[1],x[0]))
		directories += str(i)
		if i < int(sys.argv[2]):
			directories += ","
	directories += "}"
	hdfs_file_path = "/user/lsde02/data/%s/*.gz" % directories
	forced_partitions = (int(sys.argv[2])+1-int(sys.argv[1]))*12
else:
	hdfs_file_path = "/user/lsde02/data/*/*.gz"
	forced_partitions = 1500

hdfs_results_path = "/user/lsde02/results/"
start_time = time.strftime("%Y-%m-%d-%H-%M-%S")
print "Started processing: %s" % hdfs_file_path

sc = SparkContext()
context = sc.textFile(hdfs_file_path, forced_partitions)
stations = context.flatMap(lambda x: [utils.extract(record) for record in x.splitlines()])
#stations = stations.filter(lambda x: 'fixed-weather-station' in x[1] or )

# Do computations on month level
month_data = stations.map(lambda x:((x[0][0], x[0][1], x[0][3]), (utils.get_attribute(x[1], 'temp'), utils.get_attribute(x[1], 'windspeed'), \
			utils.get_attribute(x[1], 'sky-condition'), utils.get_attribute(x[1], 'visibility'), utils.get_attribute(x[1], 'wind-direction'), \
			utils.get_attribute(x[1], 'latitude'), utils.get_attribute(x[1], 'longitude'))))
month_data = month_data.combineByKey(lambda value: (value[0] if value[0] != None else 0, 1 if value[0] != None else 0,\
					value[1] if value[1] != None else 0, 1 if value[1] != None else 0, \
					value[2] if value[2] != None else 0, 1 if value[2] != None else 0, \
					value[3] if value[3] != None else 0, 1 if value[3] != None else 0, \
					math.sin(value[4]*math.pi/180.0) if value[4] != None else 0, \
					math.cos(value[4]*math.pi/180.0) if value[4] != None else 0, \
					value[0]*value[0] if value[0] != None else 0, \
					value[1]*value[1] if value[1] != None else 0, \
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.rdd import RDD
from pyspark.files import SparkFiles
from pyspark.storagelevel import StorageLevel
from pyspark.accumulators import Accumulator, AccumulatorParam
from pyspark.broadcast import Broadcast
from pyspark.serializers import MarshalSerializer, PickleSerializer
from pyspark.status import *
from pyspark.profiler import Profiler, BasicProfiler


conf = SparkConf().setAppName("hotelhodsrrequest_parsing")
sc = SparkContext(conf = conf)
textFile = sc.textFile("/user/hive/warehouse/ehotel.db/hotelhodsrrequest/year=2016/month=01/day=*/*.gz")
#filter by PCC
PCCArray = textFile.map(lambda line: line.split("|")).filter(lambda line: line[4] == 'B7ZB')
  
  
#GroupBy Date
shopperdaycount = PCCArray.map(lambda line: (line[0].split(" ")[0], 1)).reduceByKey(lambda a, b: a+b)
shopperdaycount.saveAsTextFile('/user/sg952655/Totalshopperdaycount/')
shopperdaycounttxt = sc.textFile("/user/sg952655/Totalshopperdaycount/*")
preview0 = shopperdaycounttxt.collect()
    
  
#Duplicate record count with sessionid, transactionid, propertycode as unique key
filteredduplicaterecordcount = PCCArray.map(lambda line: (line[1]+" "+line[2]+" "+line[6], 1)).reduceByKey(lambda a, b: a+b).filter(lambda line: line[1] > 1)
Orderedfilteredduplicaterecordcount = filteredduplicaterecordcount.map(lambda line: (line[1], line[0]))
Orderedfilteredduplicaterecordcount.saveAsTextFile('/user/sg952655/TotalOrderedfilteredduplicaterecordcount/')
Orderedfilteredduplicaterecordcounttxt = sc.textFile("/user/sg952655/TotalOrderedfilteredduplicaterecordcount/*")
    print('\nRunning example of classification using GradientBoostedTrees\n')
    testClassification(trainingData, testData)

    print('\nRunning example of regression using GradientBoostedTrees\n')
    testRegression(trainingData, testData)

    sc.stop()





from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint
train = sc.textFile("train.csv")
# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[-1], values[:-1])

#data = sc.textFile("data/mllib/sample_svm_data.txt")
parsedData = train.map(parsePoint)

# Build the model
model = SVMWithSGD.train(parsedData, iterations=100)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))
    # calculate predictions on train data & calculate Mean Squared Error & R2 score
    y_pred = lm.predict(features)
    print("Mean Squared Error = ",
          sklearn.metrics.mean_squared_error(labels, y_pred))
    print("R2 score = ", sklearn.metrics.r2_score(labels, y_pred))
    # calculate predictions on test data & calculate Mean Squared Error & R2 score
    y_pred_test = lm.predict(testfeatures)
    print("Mean Squared Error = ",
          sklearn.metrics.mean_squared_error(testlabels, y_pred_test))
    print("R2 score = ", sklearn.metrics.r2_score(testlabels, y_pred_test))
    # plot graph of prediction vs ground truth
    plt.scatter(labels, y_pred, color='black')
    plt.xlabel('Ground Truth')
    plt.ylabel('Prediction')
    plt.show()
    plt.savefig('result.png')


# load the prepared dataset
data = sc.textFile("data/testFeatLabs.csv")
printStat = [
    'Below High School Education level', 'High School Education level',
    'Some College Education level',
    'Bachelors Degree and above Education Level'
]

# run Lasso regression to predict the value for each class
for ind in range(4):
    print('\nTraining for', printStat[ind], ':')
    parsedData = data.filter(filterRows).map(lambda x: parsePoint(x, ind))
    trainAndTest(parsedData)
'''
Created on Oct 30, 2015

@author: dyerke
'''
from pyspark.context import SparkContext
from pyspark.conf import SparkConf

if __name__ == '__main__':
    m_hostname= "dyerke-Inspiron-7537"
    #
    conf= SparkConf()
    conf.setAppName("MyTestApp")
    conf.setMaster("spark://" + m_hostname + ":7077")
    conf.setSparkHome("/usr/local/spark")
    conf.set("spark.driver.host", m_hostname)
    logFile = "/usr/local/spark/README.md"  # Should be some file on your system
    #
    sc= SparkContext(conf=conf)
    logData= sc.textFile(logFile).cache()
    #
    countAs= logData.filter(lambda x: 'a' in x).count()
    countBs= logData.filter(lambda x: 'b' in x).count()
    #
    print("Lines with a: %i, lines with b: %i" % (countAs, countBs))
    sc.stop()
from pyspark.context import SparkContext
from collections import OrderedDict  #preserve the order of json file output
import sys
import time
import json
import pyspark

file_path1 = sys.argv[1]
file_path2 = sys.argv[2]
output_path1 = sys.argv[3]
output_path2 = sys.argv[4]

sc = SparkContext("local[*]")
sc.setLogLevel("ERROR")
tf1 = sc.textFile(file_path1)
tf2 = sc.textFile(file_path2)

data1 = tf1.map(lambda x:
                (json.loads(x)["business_id"], json.loads(x)["stars"]))
data2 = tf2.map(
    lambda x: (json.loads(x)["business_id"], json.loads(x)["state"])).persist(
        pyspark.StorageLevel.MEMORY_AND_DISK_2)

statecount = data2.map(lambda x: x[1]).distinct().count()

RDD = data1.join(data2).map(lambda x: (x[1][1], x[1][0])).persist(
    pyspark.StorageLevel.MEMORY_AND_DISK_2)
st = RDD.mapValues(lambda x: (x, 1)).reduceByKey(lambda x, y:
                                                 (x[0] + y[0], x[1] + y[1]))
task1 = st.mapValues(lambda x: x[0] / x[1]).sortByKey(False)
task1f = task1.top(statecount, key=lambda x: x[1])
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

from sparkpackage.sales_dto import SalesDTO


def print_lines(line):
    print line.product_name


config = SparkConf()
config.setAppName("CSVReaderJOB")
config.setMaster("local[*]")

context = SparkContext(conf=config)

textFileRDD = context.textFile(
    '/home/dharshekthvel/ac/code/scalatrainingintellij/data/sales.csv')

# Broadcast
# amazon_product = context.broadcast(SalesDTO("AMAZON_PRODUCT"))
# mappedRDD = textFileRDD.map(lambda x : amazon_product.value)
# mappedRDD.foreach(lambda x : print_lines(x))

# Accumulator

line_counter = context.accumulator(0)
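# A minimal usage sketch (an assumption, mirroring the commented-out broadcast example
# above): executors bump the accumulator once per sales line, the driver reads the total.
textFileRDD.foreach(lambda x: line_counter.add(1))
print line_counter.value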
import pyspark
from pyspark.context import SparkContext
from pyspark import SparkConf

conf = SparkConf()
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

# Load the adjacency list file
AdjList1 = sc.textFile("/home/rob/Assignment4/02AdjacencyList.txt")
print AdjList1.collect()

AdjList2 = AdjList1.map(
    lambda line: line)  # 1. Replace the lambda function with yours
AdjList3 = AdjList2.map(
    lambda x: x)  # 2. Replace the lambda function with yours
AdjList3.persist()
print AdjList3.collect()

nNumOfNodes = AdjList3.count()
print "Total Number of nodes"
print nNumOfNodes

# Initialize each page's rank; since we use mapValues, the resulting RDD will have the same partitioner as links
print "Initialization"
PageRankValues = AdjList3.mapValues(
    lambda v: v)  # 3. Replace the lambda function with yours
print PageRankValues.collect()

# Run 30 iterations
print "Run 30 Iterations"
class TestRDDFunctions(PySparkTestCase):

    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550
        self.sc.stop()
        self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_save_as_textfile_with_utf8(self):
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x.encode("utf-8")])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034
        rdd1 = self.sc.parallelize([1, 2])
        rdd2 = self.sc.parallelize([3, 4])
        cart = rdd1.cartesian(rdd2)
        result = cart.map(lambda (x, y): x + y).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601
        data = self.sc.parallelize(["Hello", "World!"])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsPickleFile(tempFile.name)
        pickled_file = self.sc.pickleFile(tempFile.name)
        pickled_file.map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Regression test for
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        a = self.sc.textFile(path)
        result = a.cartesian(a).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025
        tempFile = tempfile.NamedTemporaryFile(delete=False)
        tempFile.write("Hello World!")
        tempFile.close()
        data = self.sc.textFile(tempFile.name)
        filtered_data = data.filter(lambda x: True)
        self.assertEqual(1, filtered_data.count())
        os.unlink(tempFile.name)
        self.assertRaises(Exception, lambda: filtered_data.count())

    def testAggregateByKey(self):
        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def seqOp(x, y):
            x.add(y)
            return x

        def combOp(x, y):
            x |= y
            return x

        sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect())
        self.assertEqual(3, len(sets))
        self.assertEqual(set([1]), sets[1])
        self.assertEqual(set([2]), sets[3])
        self.assertEqual(set([1, 3]), sets[5])

    def test_itemgetter(self):
        rdd = self.sc.parallelize([range(10)])
        from operator import itemgetter
        self.assertEqual([1], rdd.map(itemgetter(1)).collect())
        self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect())

    def test_namedtuple_in_rdd(self):
        from collections import namedtuple
        Person = namedtuple("Person", "id firstName lastName")
        jon = Person(1, "Jon", "Doe")
        jane = Person(2, "Jane", "Doe")
        theDoes = self.sc.parallelize([jon, jane])
        self.assertEquals([jon, jane], theDoes.collect())

    def test_large_broadcast(self):
        N = 100000
        data = [[float(i) for i in range(300)] for i in range(N)]
        bdata = self.sc.broadcast(data)  # 270MB
        m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum()
        self.assertEquals(N, m)

    def test_zip_with_different_serializers(self):
        a = self.sc.parallelize(range(5))
        b = self.sc.parallelize(range(100, 105))
        self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
        a = a._reserialize(BatchedSerializer(PickleSerializer(), 2))
        b = b._reserialize(MarshalSerializer())
        self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])

    def test_zip_with_different_number_of_items(self):
        a = self.sc.parallelize(range(5), 2)
        # different number of partitions
        b = self.sc.parallelize(range(100, 106), 3)
        self.assertRaises(ValueError, lambda: a.zip(b))
        # different number of batched items in JVM
        b = self.sc.parallelize(range(100, 104), 2)
        self.assertRaises(Exception, lambda: a.zip(b).count())
        # different number of items in one pair
        b = self.sc.parallelize(range(100, 106), 2)
        self.assertRaises(Exception, lambda: a.zip(b).count())
        # same total number of items, but different distributions
        a = self.sc.parallelize([2, 3], 2).flatMap(range)
        b = self.sc.parallelize([3, 2], 2).flatMap(range)
        self.assertEquals(a.count(), b.count())
        self.assertRaises(Exception, lambda: a.zip(b).count())

    def test_histogram(self):
        # empty
        rdd = self.sc.parallelize([])
        self.assertEquals([0], rdd.histogram([0, 10])[1])
        self.assertEquals([0, 0], rdd.histogram([0, 4, 10])[1])
        self.assertRaises(ValueError, lambda: rdd.histogram(1))

        # out of range
        rdd = self.sc.parallelize([10.01, -0.01])
        self.assertEquals([0], rdd.histogram([0, 10])[1])
        self.assertEquals([0, 0], rdd.histogram((0, 4, 10))[1])

        # in range with one bucket
        rdd = self.sc.parallelize(range(1, 5))
        self.assertEquals([4], rdd.histogram([0, 10])[1])
        self.assertEquals([3, 1], rdd.histogram([0, 4, 10])[1])

        # in range with one bucket exact match
        self.assertEquals([4], rdd.histogram([1, 4])[1])

        # out of range with two buckets
        rdd = self.sc.parallelize([10.01, -0.01])
        self.assertEquals([0, 0], rdd.histogram([0, 5, 10])[1])

        # out of range with two uneven buckets
        rdd = self.sc.parallelize([10.01, -0.01])
        self.assertEquals([0, 0], rdd.histogram([0, 4, 10])[1])

        # in range with two buckets
        rdd = self.sc.parallelize([1, 2, 3, 5, 6])
        self.assertEquals([3, 2], rdd.histogram([0, 5, 10])[1])

        # in range with two bucket and None
        rdd = self.sc.parallelize([1, 2, 3, 5, 6, None, float('nan')])
        self.assertEquals([3, 2], rdd.histogram([0, 5, 10])[1])

        # in range with two uneven buckets
        rdd = self.sc.parallelize([1, 2, 3, 5, 6])
        self.assertEquals([3, 2], rdd.histogram([0, 5, 11])[1])

        # mixed range with two uneven buckets
        rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01])
        self.assertEquals([4, 3], rdd.histogram([0, 5, 11])[1])

        # mixed range with four uneven buckets
        rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1])
        self.assertEquals([4, 2, 1, 3], rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1])

        # mixed range with uneven buckets and NaN
        rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0,
                                   199.0, 200.0, 200.1, None, float('nan')])
        self.assertEquals([4, 2, 1, 3], rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1])

        # out of range with infinite buckets
        rdd = self.sc.parallelize([10.01, -0.01, float('nan'), float("inf")])
        self.assertEquals([1, 2], rdd.histogram([float('-inf'), 0, float('inf')])[1])

        # invalid buckets
        self.assertRaises(ValueError, lambda: rdd.histogram([]))
        self.assertRaises(ValueError, lambda: rdd.histogram([1]))
        self.assertRaises(ValueError, lambda: rdd.histogram(0))
        self.assertRaises(TypeError, lambda: rdd.histogram({}))

        # without buckets
        rdd = self.sc.parallelize(range(1, 5))
        self.assertEquals(([1, 4], [4]), rdd.histogram(1))

        # without buckets single element
        rdd = self.sc.parallelize([1])
        self.assertEquals(([1, 1], [1]), rdd.histogram(1))

        # without buckets, no value range (all elements equal)
        rdd = self.sc.parallelize([1] * 4)
        self.assertEquals(([1, 1], [4]), rdd.histogram(1))

        # without buckets basic two
        rdd = self.sc.parallelize(range(1, 5))
        self.assertEquals(([1, 2.5, 4], [2, 2]), rdd.histogram(2))

        # without buckets with more requested than elements
        rdd = self.sc.parallelize([1, 2])
        buckets = [1 + 0.2 * i for i in range(6)]
        hist = [1, 0, 0, 0, 1]
        self.assertEquals((buckets, hist), rdd.histogram(5))

        # invalid RDDs
        rdd = self.sc.parallelize([1, float('inf')])
        self.assertRaises(ValueError, lambda: rdd.histogram(2))
        rdd = self.sc.parallelize([float('nan')])
        self.assertRaises(ValueError, lambda: rdd.histogram(2))

        # string
        rdd = self.sc.parallelize(["ab", "ac", "b", "bd", "ef"], 2)
        self.assertEquals([2, 2], rdd.histogram(["a", "b", "c"])[1])
        self.assertEquals((["ab", "ef"], [5]), rdd.histogram(1))
        self.assertRaises(TypeError, lambda: rdd.histogram(2))

        # mixed RDD
        rdd = self.sc.parallelize([1, 4, "ab", "ac", "b"], 2)
        self.assertEquals([1, 1], rdd.histogram([0, 4, 10])[1])
        self.assertEquals([2, 1], rdd.histogram(["a", "b", "c"])[1])
        self.assertEquals(([1, "b"], [5]), rdd.histogram(1))
        self.assertRaises(TypeError, lambda: rdd.histogram(2))
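
A minimal standalone sketch, outside the test suite above and assuming an active SparkContext sc: when two RDDs cannot be zip()-ed because their partition counts differ (as the zip tests show), keying both sides by position and joining recovers the same pairing.

a = sc.parallelize(range(5), 2)
b = sc.parallelize(range(100, 105), 3)   # different partition count, so a.zip(b) would raise
pairs = (a.zipWithIndex().map(lambda kv: (kv[1], kv[0]))
          .join(b.zipWithIndex().map(lambda kv: (kv[1], kv[0])))
          .sortByKey()
          .values())
print(pairs.collect())   # [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)]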
Exemple #41
0
class TestRDDFunctions(PySparkTestCase):

    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550
        self.sc.stop()
        self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034
        rdd1 = self.sc.parallelize([1, 2])
        rdd2 = self.sc.parallelize([3, 4])
        cart = rdd1.cartesian(rdd2)
        result = cart.map(lambda (x, y): x + y).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601
        data = self.sc.parallelize(["Hello", "World!"])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsPickleFile(tempFile.name)
        pickled_file = self.sc.pickleFile(tempFile.name)
        pickled_file.map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Regression test for
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        a = self.sc.textFile(path)
        result = a.cartesian(a).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025
        tempFile = tempfile.NamedTemporaryFile(delete=False)
        tempFile.write("Hello World!")
        tempFile.close()
        data = self.sc.textFile(tempFile.name)
        filtered_data = data.filter(lambda x: True)
        self.assertEqual(1, filtered_data.count())
        os.unlink(tempFile.name)
        self.assertRaises(Exception, lambda: filtered_data.count())

    def testAggregateByKey(self):
        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def seqOp(x, y):
            x.add(y)
            return x

        def combOp(x, y):
            x |= y
            return x

        sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect())
        self.assertEqual(3, len(sets))
        self.assertEqual(set([1]), sets[1])
        self.assertEqual(set([2]), sets[3])
        self.assertEqual(set([1, 3]), sets[5])
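
As a follow-on sketch outside the test class, assuming a plain SparkContext sc: the same aggregateByKey pattern can compute per-key averages by carrying a (sum, count) accumulator instead of building sets.

data = sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)
sum_count = data.aggregateByKey(
    (0, 0),
    lambda acc, v: (acc[0] + v, acc[1] + 1),        # fold one value into a partition-local accumulator
    lambda a, b: (a[0] + b[0], a[1] + b[1]))        # merge accumulators across partitions
averages = sum_count.mapValues(lambda sc_pair: float(sc_pair[0]) / sc_pair[1])
print(sorted(averages.collect()))   # [(1, 1.0), (3, 2.0), (5, 2.0)]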
Exemple #42
0
trg_node_fields = ["trg_" + field for field in node_info_fields]

# Init Spark Context as running in local mode
sc = SparkContext("local")
# Create a basic Spark Session
spark = SparkSession \
 .builder \
 .appName(app_name) \
 .getOrCreate()
# Specify properties of fields,
# including field name and related data type
log_fields = src_node_fields + transc_info_fields + trg_node_fields + item_info_fields

# ------------------------------------------
# Pipeline of the Workflow

# Load the raw data from the local file system
# and split each row on the configured delimiter
source = sc.textFile(input_file_name) \
 .map(lambda x: x.split(delimiter))

# DataFrame for logistics data
log_df = spark.createDataFrame(source, log_fields)

log_df.groupBy("src_area_city").count().write.csv("src_area_city")
log_df.groupBy("src_industry_lv1").count().write.csv("src_industry_lv1")
log_df.groupBy("src_industry_lv3").count().write.csv("src_industry_lv3")

log_df.groupBy("trg_area_city").count().write.csv("trg_area_city")
log_df.groupBy("trg_industry_lv1").count().write.csv("trg_industry_lv1")
log_df.groupBy("trg_industry_lv3").count().write.csv("trg_industry_lv3")
Exemple #43
0
    sc = SparkContext(conf=conf)

    # read data as CSV for Dataframe analysis
    # /Volumes/work/data/kaggle/ssi.csv

    # read data normally
    '''

    sqlContext = SQLContext(sc)
    df = sqlContext.read.format('com.databricks.spark.csv').options(header='false').load(BASE_DATA_PATH + '/ssi.csv')
    # summarize(df)
    print df.show()

    #points = df.map(lambda row: LabeledPoint(input[row.C4],[float(row.C0),float(row.C1),float(row.C2),float(row.C3)]))

    Values obtained using the DataFrame approach:
    Final weights: [-137.221167143,12.555647803,53.629362055,109.314252441]
    Final intercept: 0.0
    '''

    points = sc.textFile(BASE_DATA_PATH + '/ssi.csv').map(parsePoint)
    model = LogisticRegressionWithSGD.train(points, 10)
    print("Final weights: " + str(model.weights))
    print("Final intercept: " + str(model.intercept))
    '''
    Final weights: [-137.221167143,12.555647803,53.629362055,109.314252441]
    Final intercept: 0.0
    '''

    sc.stop()
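
The parsePoint helper referenced above is not shown in this snippet. A plausible minimal version, assuming each CSV row holds numeric features with an already-encoded numeric label in the last column (the commented-out DataFrame line suggests the real code maps a categorical label through a dictionary), might look like:

from pyspark.mllib.regression import LabeledPoint

def parsePoint(line):
    # Hypothetical sketch of the undefined helper, not the author's code:
    # numeric features followed by the label in the last field.
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[-1], values[:-1])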
Exemple #44
0
sc = SparkContext(conf=conf)

gateway         = sc._gateway
sym             = gateway.jvm.com.sml.shell

# Find the access keys for EC2.
awsAccessKeyId = os.environ['AWS_ACCESS_KEY']
awsSecretAccessKey = os.environ['AWS_SECRET_KEY']
# print("awsAccessKeyId=" + awsAccessKeyId)
# print("awsSecretAccessKey=" + awsSecretAccessKey)

sc._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", awsAccessKeyId)
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", awsSecretAccessKey)

myrdd  = sc.textFile('s3a://sml-oregon/datasets/susy/SUSYmini.csv')
# Convert pyspark RDD to JavaRDD
# _to_java_object_rdd
myJavaRdd = myrdd._jrdd

# The first line of the CSV file holds the attribute names
attributeNames = myrdd.first().split(",")
# The attribute types have to be given explicitly
attributeTypes = ["B"]+["C"]*(len(attributeNames)-1)

# The Redis host address; if left empty, the project is not persisted (it is not persisted here).
# sym.SymShellConfig.set("RedisHost","charm")
# sym.SymShellConfig.set("RedisPort",6379)

# 1) Create the Project here
projectName     = "susyExampleInPython"
Exemple #45
0
if args.format == "tfr":  # HDFS==>numpy array
  images = sc.newAPIHadoopFile(args.images, "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
                              keyClass="org.apache.hadoop.io.BytesWritable",
                              valueClass="org.apache.hadoop.io.NullWritable")
  def toNumpy(bytestr):
    example = tf.train.Example()
    example.ParseFromString(bytestr)
    features = example.features.feature
    image = numpy.array(features['image'].int64_list.value)
    label = numpy.array(features['label'].int64_list.value)
    return (image, label)
  dataRDD = images.map(lambda x: toNumpy(str(x[0])))
else:
  if args.format == "csv": # HDFS==>numpy array
    images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')])
    labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])
  else: # args.format == "pickle":  # HDFS==>numpy array
    images = sc.pickleFile(args.images)
    labels = sc.pickleFile(args.labels)

  print("zipping images and labels")
  # print(type(labels))
  # print(labels.count())
  dataRDD = images.zip(labels) # image+label

#cluster = TFCluster.reserve(sc, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK)
#cluster.start(mnist_dist.map_fun, args)
cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK)
if args.mode == "train" or args.mode == "retrain":
  cluster.train(dataRDD, args.epochs)
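
The snippet stops at the training branch. A rough sketch of the full dispatch, assuming TensorFlowOnSpark's TFCluster API and an args.output argument for the prediction path, might read:

if args.mode == "train" or args.mode == "retrain":
  cluster.train(dataRDD, args.epochs)
else:
  labelRDD = cluster.inference(dataRDD)   # predictions come back as an RDD
  labelRDD.saveAsTextFile(args.output)    # args.output is an assumed argument name
cluster.shutdown()                        # release the reserved executors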
Exemple #46
0
import sys
from pyspark.context import SparkContext

sc = SparkContext(...)
lines = sc.textFile(sys.argv[2],1)

counts = lines.flatMap(lambda x: x.split(' ')) \
              .map(lambda x: (x, 1)) \
              .reduceByKey(lambda x, y: x + y)

for (word, count) in counts.collect():
    print("%s:%i" % (word, count))