Example no. 1
import findspark
findspark.init()
import pyspark
from pyspark import SQLContext
from pyspark import SparkContext

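# Note: setSystemProperty only affects SparkContexts created afterwards, so the
# property calls below must run before the SparkContext is constructed.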
SparkContext.setSystemProperty('spark.cleaner.periodicGC.interval', '2')
SparkContext.setSystemProperty('spark.executor.memory', '2400m')
SparkContext.setSystemProperty('spark.driver.cores', '2')
SparkContext.setSystemProperty('spark.driver.memory', '2g')
SparkContext.setSystemProperty("spark.driver.maxResultSize", "2g")

sc = pyspark.SparkContext(master='spark://192.168.11.239:7077',
                          appName='pipeline_tests')
sqlContext = SQLContext(sc)

from pyspark.sql.types import StringType
from datetime import datetime
import pyspark.sql.functions as F  #avoid conflicts with regular python functions
from pyspark.sql.functions import udf
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.feature import PCA, StandardScaler, MinMaxScaler, MaxAbsScaler, Normalizer
import numpy as np
import time
from pipeline_tester import PipelineTester

######CLEAN
df = sqlContext.read.csv("/datasets/crimes.csv", header='true')
# Define date derivatives (the original snippet is truncated here; the completion
# below is a plausible sketch that assumes the Chicago crimes "Date" column format)
df = (df.withColumn("Date", F.to_timestamp("Date", "MM/dd/yyyy hh:mm:ss a"))
        .withColumn("Year", F.year("Date"))
        .withColumn("Month", F.month("Date"))
        .withColumn("DayOfWeek", F.dayofweek("Date")))
Example no. 2
#! /usr/bin/python3

import pyspark

sc = pyspark.SparkContext("local", "storagelevel app")
rdd1 = sc.parallelize([1, 2])
rdd1.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
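# MEMORY_AND_DISK keeps cached partitions in memory and spills them to disk when
# they do not fit; getStorageLevel() below reports the level that was actually set.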
print(rdd1.getStorageLevel())
Example no. 3
import pyspark
conf = pyspark.SparkConf()\
    .setMaster("local[*]")\
    .set("spark.driver.memory","8g")\
    .set("spark.executor.memory","8g")\
    .set("spark.debug.maxToStringFields", 10000)\
    .set('spark.sql.debug.maxToStringFields', 2000)\
    .set("spark.jars","/Users/yeeun/Apache/spark-2.4.4-bin-hadoop2.7/jars/spark-redis-2.4.0-jar-with-dependencies.jar")

sparkContext = pyspark.SparkContext(conf=conf)

from pyspark.sql import SparkSession
spark = SparkSession.builder\
    .appName("pyspark_python")\
    .config("spark.redis.host", "localhost")\
    .config("spark.redis.port", "6379")\
    .getOrCreate()

from pyspark.streaming import StreamingContext
from pyspark import StorageLevel
from itertools import chain
import redis, json, time

myRedis = redis.Redis(host='127.0.0.1', port=6379, db=0)

IP = "127.0.0.1"
Port = 5559

# tweet hashtag Trend analysis

schema = ['text', 'is_quote_status', 'entities.hashtags.text as hashtag']
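# A minimal sketch (not part of the original snippet) of how the pieces above could be
# wired together: read JSON tweets from the TCP socket, count hashtags per micro-batch
# and accumulate the counts in Redis. The tweet field layout follows `schema` above,
# and the Redis hash name "hashtag_trends" is an assumption.
ssc = StreamingContext(sparkContext, 10)  # 10-second micro-batches
tweets = ssc.socketTextStream(IP, Port)

def update_trends(rdd):
    # collect the per-batch counts on the driver, then update Redis from the driver
    counts = (rdd.map(json.loads)
                 .filter(lambda t: not t.get("is_quote_status", False))
                 .flatMap(lambda t: [h["text"] for h in t.get("entities", {}).get("hashtags", [])])
                 .map(lambda tag: (tag, 1))
                 .reduceByKey(lambda a, b: a + b)
                 .collect())
    for tag, n in counts:
        myRedis.hincrby("hashtag_trends", tag, n)

tweets.foreachRDD(update_trends)
# ssc.start(); ssc.awaitTermination()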
Example no. 4
import pyspark
sc = pyspark.SparkContext(appName="test_pyspark")
import random
NUM_SAMPLES = 1000
def inside(p):
    x, y = random.random(), random.random()
    return x*x + y*y < 1
count = sc.parallelize(range(0, NUM_SAMPLES)).filter(inside).count()
pi = 4 * count / NUM_SAMPLES
print()
print(pi)
print()
print("Pyspark is working")
print()
Example no. 5
from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as fn
from pyspark.sql.functions import col
import pyspark
import os
import subprocess
import math
from pyspark.sql.types import StructType, StringType, IntegerType

private_ip = 'namenode_ip'

private_ip = private_ip.replace('.', '-')

context = pyspark.SparkContext('local[*]')
sesh = SparkSession(context)

df_mongo = sesh.read.json(
    'hdfs://ip-{}.ec2.internal:9000/user/ubuntu/metadata/metadata.json'.format(
        private_ip))

# drop these columns from metadata
df_mongo = df_mongo.drop('id')\
                   .drop('_id')\
                   .drop('brand')\
                   .drop('categories')\
                   .drop('description')\
                   .drop('related')\
                   .drop('salesRank')\
                   .drop('title')\
                   .drop('imUrl')\
                   .dropna()
Example no. 6
######################################################
## Assignment 1 - Word Count Using PySpark          ##
## Author       - Akshay Patel                      ##
## Usage        - Copy paste all code in pyspark    ##
##                until graph is rendered in window ##
######################################################


import matplotlib.pyplot as plt
from string import punctuation
from operator import add
import numpy as np
import re
import time
import pyspark

sc = pyspark.SparkContext(appName="App")

################ Word Count Application On Shakespeare's File #################
def wordCount(wordListRDD):
    """
    Creates a pair RDD with word counts from an RDD of words.
    
    Args:
        wordListRDD (RDD of str): An RDD consisting of words.
    Returns:
        RDD of (str, int): An RDD consisting of (word, count) tuples.
    """
    wordPairs = wordListRDD.map(lambda w: (w, 1))
    wordCounts = wordPairs.reduceByKey(add)
    
    return wordCounts
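# Usage sketch (not part of the original assignment code): tokenise a local text file,
# strip punctuation, and print the five most frequent words. The file name
# "shakespeare.txt" is an assumption.
shakespeareRDD = (sc.textFile("shakespeare.txt")
                    .flatMap(lambda line: line.lower().split())
                    .map(lambda w: w.strip(punctuation))
                    .filter(lambda w: w != ""))
print(wordCount(shakespeareRDD).takeOrdered(5, key=lambda kv: -kv[1]))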
Example no. 7
import findspark
findspark.init()

import pyspark
import random
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql import SQLContext

from hdfs import InsecureClient
from datetime import date, timedelta
import subprocess

yesterday = date.today() - timedelta(1)
date = yesterday.strftime('%Y%m%d')

sc = pyspark.SparkContext(appName="Citibike_aggregation_" + date)

csvDf = sqlContext.read.format("csv").option("header", "true").option(
    "inferschema", "true").option("mode", "DROPMALFORMED").load(
        'hdfs://ubuntuclient2.psudata.dev:8020/user/hdfs/citibike/raw_data/' +
        date + "*.csv")
pd_df = csvDf.toPandas()
with client_hdfs.write(
        '/user/hdfs/citibike/daily_aggregated_data/aggregated_' + date +
        '.csv',
        encoding='utf-8') as writer:
    pd_df.to_csv(writer, index=False)
mkdir_command = 'hdfs dfs -mkdir /user/hdfs/citibike/daily_backup_data/' + date
cmd_mkdir = mkdir_command.split()
subprocess.check_output(cmd_mkdir)
Example no. 8
import sys
import csv
import math
import pyspark
sc = pyspark.SparkContext("local")

rd = sys.argv[1]
td = sys.argv[2]

ratings = sc.textFile(rd)
ratings = ratings.map(lambda x: x.split(",")).map(lambda x: (x[0],
                                                             (x[1], x[2])))

test = sc.textFile(td)
test=test.map(lambda x: x.split(",")).filter(lambda x: x[0]!='userId')\
         .map(lambda x: (x[0],x[1]))

raw=ratings.filter(lambda x: x[0]!='userId')\
        .map(lambda x: ((x[0],x[1][0]),x[1][1]))

test_label = test.map(lambda x: ((x[0], x[1]), 'test'))
train_test = raw.leftOuterJoin(test_label)

# train data
# (userid,(movieid,rating))
rating=train_test.filter(lambda x: x[1][1]!='test')\
   .map(lambda x: (x[0][0],(x[0][1],x[1][0])))

# test data
# (userid,(movieid,rating))
test_rating=train_test.filter(lambda x: x[1][1]=='test')\
   .map(lambda x: (x[0][0],(x[0][1],x[1][0])))
Example no. 9
'''
initSession.py

    establishes a unified spark context and session
    to be used in subsequent scripts.

'''

import pyspark
from pyspark.sql import SparkSession

sc = pyspark.SparkContext('local[*]', "temp")
spark = SparkSession.builder.master("local[*]").appName("temp").getOrCreate()
Example no. 10
import sys
import math

import numpy as np
import pyspark
from scipy import stats


def main(argv):
    sc = pyspark.SparkContext()
    # Two SDG Goals
    goal = [
        "Proportion of students at the end of lower secondary education achieving at least a minimum proficiency level in reading, both sexes (%)",
        "Proportion of students at the end of lower secondary education achieving at least a minimum proficiency level in mathematics, both sexes (%)"
    ]
    # Two different Y label files for two SDG goals
    files = ['hdfs:/data/SDG4_data.csv', 'hdfs:/data/SDG4_data_Math.csv']

    for i in range(2):

        rdd_SDG_data = sc.textFile(files[i])

        # Filter the data down to a particular goal
        def filter_goal(x):
            split_x = x.split(',')
            gol = goal[i]
            if (((split_x[1]+",").replace('"','')+split_x[2].replace('"',''))== gol)\
            and split_x[-4] != "" and split_x[-3] !="" :
                return True
            else:
                return False

        # Map each row to (country, (year, value))
        def map_cntry_year(x):
            split_x = x.split(',')
            return (split_x[-7].replace('"', ''),
                    (split_x[-4].replace('"', ''), split_x[-3]))

        # Keep only the countries that have data for both 2015 and 2018
        def map_year_2015_2018(x):
            yr_2015, yr_2018 = 0.0, 0.0
            if x[1][0][0] == "2015":
                yr_2015 = float(x[1][0][1])
            else:
                yr_2018 = float(x[1][0][1])
            if x[1][1][0] == "2015":
                yr_2015 = float(x[1][1][1])
            else:
                yr_2018 = float(x[1][1][1])

            return (x[0], (yr_2015, yr_2018))


        mapped_sdg_data = rdd_SDG_data.filter(lambda x:filter_goal(x)).map(lambda x:map_cntry_year(x))\
              .groupByKey().map(lambda x : (x[0], list(x[1]))).filter(lambda x:len(x[1])==2)\
             .map(lambda x: map_year_2015_2018(x))

        # List all countries that have data points; there are 57 in total
        list_cntry_code = mapped_sdg_data.map(lambda x: x[0]).collect()
        brdcast_cntry = sc.broadcast(list_cntry_code)

        # Feedback variable files for year 2018 of students
        rdd_fb_data_2018 = sc.textFile('hdfs:/data/cy07_msu_stu_qqq.csv')

        # Feedback variable files for year 2015 of students
        rdd_fb_data_2015 = sc.textFile('hdfs:/data/cy6_ms_cmb_stu_qqq.csv')

        # map (k,v) as (country,[list of Feedback variables])
        def map_cnt_STFB(x):
            l = x.split(',')
            # 298 are total features
            return (l[1], (l[14:18] + l[20:41] + l[44:52] + l[55:320]))

        # List all the Feedback variable names; there are 298 features in total, used for hypothesis testing
        brdcast_FB_var = sc.broadcast(
            rdd_fb_data_2018.map(lambda x: map_cnt_STFB(x)).map(
                lambda x: x[1]).take(1))

        # print(len((brdcast_FB_var.value)[0]))

        # There are multiple entries per country for each Feedback variable.
        # Fill missing values with the mean; the output is (country, (Feedback_var name, value)).
        def take_avg_per_cntry_FB_var(x):
            N = len(x[1])
            dict_count_empty = {}
            dict_count_val = {}
            for j in range(N):
                l = x[1][j]
                for ele in range(len(l)):
                    if not l[ele]:
                        if ele in dict_count_empty.keys():
                            dict_count_empty[ele] += 1.0
                        else:
                            dict_count_empty[ele] = 1.0
                    else:
                        if ele in dict_count_val.keys():
                            dict_count_val[ele] += float(l[ele])
                        else:
                            dict_count_val[ele] = float(l[ele])

            FB_var_list = (brdcast_FB_var.value)[0]
            ans = []

            for i in dict_count_val.keys():
                if i not in dict_count_empty.keys():
                    ans.append((x[0], (FB_var_list[i], dict_count_val[i] / N)))
                else:
                    ans.append((x[0], (FB_var_list[i], (dict_count_val[i] + (
                        (dict_count_val[i] /
                         (N - dict_count_empty[i])) * dict_count_empty[i])) /
                                       N)))

            return ans

        # Filter Feedback variable data for countries for which we have SDG goal value
        def filter_country(x):
            cntry_list = brdcast_cntry.value
            l = x.split(',')
            if l[1] in cntry_list:
                return True
            else:
                return False


        filter_FB_2018 = rdd_fb_data_2018.filter(lambda x: filter_country(x)).map(lambda x:map_cnt_STFB(x)).groupByKey()\
             .map(lambda x : (x[0], list(x[1])))\
             .flatMap(lambda x:take_avg_per_cntry_FB_var(x))

        filter_FB_2015 = rdd_fb_data_2015.filter(lambda x: filter_country(x)).map(lambda x:map_cnt_STFB(x)).groupByKey()\
             .map(lambda x : (x[0], list(x[1])))\
             .flatMap(lambda x:take_avg_per_cntry_FB_var(x))

        # Map (k,v) pair as (Feedback_varname, (yr, its value))
        def merge_2018(k):
            yr_2018 = k[1][0]
            FB_var = k[1][1]
            return (FB_var[0], (yr_2018[1], FB_var[1]))

        # Map (k,v) pair as (Feedback_varname, (yr, its value))
        def merge_2015(k):
            yr_2015 = k[1][0]
            FB_var = k[1][1]
            return (FB_var[0], (yr_2015[0], FB_var[1]))

        # Merge all SDG goal output with Feedback variable data
        merge_year_2018_data = mapped_sdg_data.join(filter_FB_2018).map(
            lambda k: merge_2018(k))
        merge_year_2015_data = mapped_sdg_data.join(filter_FB_2015).map(
            lambda k: merge_2015(k))

        # Merged all years data with key value as Feedback variable name and in output, (Feedback var, [(goalval, Feedback_var value) for each country])
        total_merge_data = merge_year_2015_data.join(merge_year_2018_data).groupByKey()\
             .map(lambda x : (x[0], list(x[1])))

        # Standardize the data to calculate a beta value for each Feedback variable
        def standard_data_fun(x):
            key = x[0]
            val = x[1]
            ans = []
            N = len(val)
            arrx = []
            arry = []
            for i in range(N):
                arrx.append(val[i][0][1])
                arrx.append(val[i][1][1])
                arry.append(val[i][0][0])
                arry.append(val[i][1][0])
            x_mean = np.mean(arrx)
            y_mean = np.mean(arry)
            x_std = np.std(arrx)
            y_std = np.std(arry)

            ans = []
            tot = len(arrx)
            beta_val = 0.0
            for i in range(tot):
                beta_val += ((arry[i] - y_mean) / y_std) * (
                    (arrx[i] - x_mean) / x_std)
                ans.append(
                    ((arry[i] - y_mean) / y_std, (arrx[i] - x_mean) / x_std))
            return ((key, beta_val / (tot - 1)), ans)

        def calc_beta_val(x):
            total = 0.0
            N = len(x[1])
            for i in range(len(x[1])):
                total += x[1][i][0] * x[1][i][1]

            return (x[0], total / (N - 1))

        standardize_data = total_merge_data.map(lambda x: standard_data_fun(x))

        # Separate the 20 most and 20 least correlated Feedback variables
        brdcast_top_20_feature = sc.broadcast(
            standardize_data.sortBy(lambda x: -x[0][1]).map(
                lambda x: x[0][0]).take(20))
        brdcast_bottom_20_feature = sc.broadcast(
            standardize_data.sortBy(lambda x: x[0][1]).map(
                lambda x: x[0][0]).take(20))

        # print(top_20_feature)
        # print("#########################################")
        # print(bottom_20_feature)

        def calc_p_value(x):
            dof = len(x[1]) - 2
            rss = 0.0
            for i in range(len(x[1])):
                val = x[1][i][1] - (x[0][1] * x[1][i][0])
                rss += val * val
            s_square = rss / dof
            denominator = dof + 2
            plt_beta = stats.t.cdf(
                x[0][1] / math.sqrt((s_square / denominator)), dof)
            if plt_beta < 0.5:
                return ((x[0][0]), (x[0][1], 2 * plt_beta * 1000))
            else:
                return ((x[0][0]), (x[0][1], (1 - plt_beta) * 2 * 1000))

        pvalue_20_pos_cor = standardize_data.filter(lambda x: x[0][
            0] in brdcast_top_20_feature.value).map(lambda x: calc_p_value(x))
        pvalue_20_neg_cor = standardize_data.filter(
            lambda x: x[0][0] in brdcast_bottom_20_feature.value).map(
                lambda x: calc_p_value(x))

        print(" For goal ##", goal[i])
        print("#########################################")
        print()
        print("Top 20 most important feature ")
        print()
        print(pvalue_20_pos_cor.collect())
        print()
        print("#########################################")
        print()
        print("Bottom 20 most important feature ")
        print()
        print(pvalue_20_neg_cor.collect())
        print()
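

# Entry point (not in the original snippet) so the script can be run directly.
if __name__ == "__main__":
    main(sys.argv)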
Example no. 11
    for i in range(number_of_batches):
        for j in range((i+1), number_of_batches):
            combinations.append((i,j))
else:
    combinations = generate_combinations(number_of_batches, int(args.combinations))
print(combinations)
combGraph = nx.Graph()
for e in combinations:
    combGraph.add_edge(e[0],e[1])
print("len(combinations)",len(combinations))
details["combinations"] = combinations
details["number_of_combinations"] = len(combinations)
partition_size = len(combinations)
# In[13]:
app_name = dataset_name + "_" + str(number_of_batches) + "_" + str(len(combinations))
sc = pyspark.SparkContext(appName=app_name)

sc.addPyFile("node2vec.py")
# In[83]:

train_edge_true = (sc.textFile(input_path + "train_edges_true.txt")
                     .map(lambda x: (int(x.split(' ')[0]), int(x.split(' ')[1])))
                     .map(lambda x: (min(x[0], x[1]), max(x[0], x[1]), 1)))
train_edge_false = (sc.textFile(input_path + "train_edges_false.txt")
                      .map(lambda x: (int(x.split(' ')[0]), int(x.split(' ')[1])))
                      .map(lambda x: (min(x[0], x[1]), max(x[0], x[1]), 0)))
train_edges = sc.union([train_edge_true, train_edge_false])
del train_edge_true
del train_edge_false

# In[84]:

nodes = train_edges.flatMap(lambda x: [x[0],x[1]]).distinct().persist()
nodes_count = nodes.count()  # count() avoids collecting every node to the driver
Example no. 12
import pyspark as ps

sc = ps.SparkContext("local")
import sys

case_num = int(sys.argv[1])
ratings_path = sys.argv[2]
users_path = sys.argv[3]
support = int(sys.argv[4])


# A-Priori Algorithm
def Get_Candidates(data, threshold):
    local_candidate = dict()
    candidate_set = set()
    data_sets = list()
    items_set = dict()
    for i in data:
        data_sets.append(set(i))

    for i in data_sets:
        for j in i:
            if j in items_set: items_set[j] += 1
            else: items_set[j] = 1

    for i in items_set.keys():
        if items_set[i] >= threshold:
            candidate_set.add(frozenset([i]))

    k = 2
    local_candidate[k - 1] = candidate_set
Example no. 13
import pyspark
import pyspark.streaming as pyspark_streaming
import pyspark.streaming.kafka as pyspark_kafka

import scapy.all as scapy

# -----------------------------------------------------------------------------
# Main program
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    #
    # Setup
    #

    #-- define spark usual and streaming contexts
    cont_0 = pyspark.SparkContext(appName="pkt_dissector")
    cont_0.setLogLevel("ERROR")
    s_cont_0 = pyspark_streaming.StreamingContext(cont_0, 5)

    #-- kafka integration (notice, that we receive packets as a bytes struct)
    brokers = "192.168.122.71:9092,192.168.122.72:9092,192.168.122.73:9092"
    kafka_dstream = pyspark_kafka.KafkaUtils.createDirectStream(
        s_cont_0, ["test1"], {"metadata.broker.list": brokers},
        valueDecoder=lambda x: bytes(x))

    #
    # Lazy evaluation rules
    #
    #-- Kafka message comes as a 2-tuple: (key, value). The code below will
    #-- select the actual message (i.e. packet) and dissects it.
    pkts = kafka_dstream.map(lambda x: scapy.Ether(x[1]))
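
    #
    # Start streaming (not shown in the original snippet)
    #
    #-- print a sample of dissected packets per batch and block until terminated
    pkts.pprint()
    s_cont_0.start()
    s_cont_0.awaitTermination()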
Example no. 14
File: ETP.py Project: azataiot/Oraz
# Installing some required python packages and models
print(
    "\n**INFO** : Now installing some required Python packages and models\n"
)

from pyspark.ml.linalg import Vectors  # on Spark 2.x+, the ml (not mllib) Vectors work with the DataFrame API
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
from pyspark.sql import SQLContext
print("Done!")
import pyspark
sqlContext = pyspark.SQLContext(pyspark.SparkContext())

print(
    "\n**INFO** : Prepare training data from a list of (label, features) tuples.\n"
)
# print("Done!")
training = sqlContext.createDataFrame([(1.0, Vectors.dense([0.0, 1.1, 0.1])),
                                       (0.0, Vectors.dense([2.0, 1.0, -1.0])),
                                       (0.0, Vectors.dense([2.0, 1.3, 1.0])),
                                       (1.0, Vectors.dense([0.0, 1.2, -0.5]))],
                                      ["label", "features"])
print("Done!")

print(
    "\n**INFO** : Create a LogisticRegression instance. This instance is an Estimator.\n"
)
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
Example no. 15
#import findspark
#findspark.init()
import pyspark

sc = pyspark.SparkContext("local", "MyApp")
rdd = sc.parallelize(range(5))

letters = rdd.map(lambda x: [abet for abet in 'alpha'])

# reduce() already returns a plain Python list on the driver, so calling
# .collect() on its result was a bug
answer = letters.reduce(lambda x, y: x + y)

print(answer)
Example no. 16
import operator
import pyspark

# def main():
# '''Program entry point'''

#Intialize a spark context
with pyspark.SparkContext("local", "PySparkWordCounts") as sc:
    #Get a RDD containing lines from this script file
    lines = sc.textFile(r"C:\Govi\Python\jaffa.txt")
    #Split each line into words and assign a frequency of 1 to each word
    words = lines.flatMap(lambda line: line.split(" ")).map(lambda word:
                                                            (word, 1))
    #count the frequency for words
    counts = words.reduceByKey(operator.add)
    #Sort the counts in descending order based on the word frequency
    sorted_counts = counts.sortBy(lambda x: x[1], False)
    #Get an iterator over the counts to print a word and its frequency
    for word, count in sorted_counts.toLocalIterator():
        print(u"{} --> {}".format(word, count))

# if __name__ == "__main__":
#     main()
Example no. 17
#######################################################################
# Word count example in pyspark
#######################################################################

import pyspark
sc = pyspark.SparkContext(appName = 'wordcount')

################################################################
# Read in the data
################################################################
lines = ["Dorothy lived in the midst of the great Kansas prairies,",
         "with Uncle Henry, who was a farmer, and Aunt Em, who was", 
         "the farmer's wife. Their house was small, for the lumber", 
         "to build it had to be carried by wagon many miles.  There",
         "were four walls, a floor and a roof, which made one room;"
         "and this room contained a rusty looking cookstove, a",
         "cupboard for the dishes, a table, three or four chairs,"
         "and the beds."]

rdd = sc.parallelize(lines)

# TO DO: look at the first 3 lines
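# One possible answer (not part of the original skeleton):
print(rdd.take(3))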


################################################################
# Pre-processing the data
################################################################



# TO DO: transform the RDD into a new one (also called 'rdd')
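

# One possible answer (a sketch, not the original solution): lower-case each line,
# strip punctuation, and split into words, re-using the name 'rdd'.
import string
rdd = (rdd.map(lambda line: line.lower())
          .map(lambda line: line.translate(str.maketrans("", "", string.punctuation)))
          .flatMap(lambda line: line.split()))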
Example no. 18
    all_args['features'] = arguments.features
    all_args['id'] = arguments.id
    all_args['labels'] = arguments.labels
    # dtu_cluster_path = 'file:///home/micsas/workspace/distributions/dist_workflow'
    # local_path = "file:/home/svanhmic/workspace/DABAI/Workflows/dist_workflow"
    # visma_cluster_path = 'file:/home/ml/deployments/workflows'
    py_files = [
        '/shared.zip', '/examples.zip', '/cleaning.zip', '/classification.zip',
        '/semisupervised.zip'
    ]

    spark_conf = pyspark.SparkConf(loadDefaults=False)
    (spark_conf.set('spark.executor.cores',
                    4).set('spark.executor.memory',
                           '1G').set('spark.executors', 2))
    sc = pyspark.SparkContext(appName=arguments.job_name)
    job_module = importlib.import_module('{:s}'.format(arguments.job_name))
    # sc = pyspark.SparkContext(
    #     appName=arguments.job_name, pyFiles=[arguments.cluster_path+py_file for py_file in py_files], conf=spark_conf)
    # job_module = importlib.import_module('{:s}'.format(arguments.job_name))
    try:
        data_frame = job_module.run(sc, **all_args)
        # data_frame.printSchema()
        # data_frame.show()
        rdd = data_frame.toJSON(
        )  # .saveAsTextFile('hdfs:///tmp/cleaning.txt')
        js = rdd.collect()
        # print(js)
        if arguments.job_name == 'cleaning':
            print("""{"cluster":[""" + ','.join(js) + """]}""")
        elif arguments.job_name == 'classification':
Example no. 19
from pyspark.sql.types import *




import os
import sys
import numpy as np

import pyspark as ps

dlds = '/Users/travis.howe/Downloads/'

# to run this in the command line, simply python galvanize_individual.py
# in the spark shell, i don't need to initialize SparkContext (done automatically)
conf = ps.SparkConf().setMaster('local[4]').setAppName('My App')
sc = ps.SparkContext(conf=conf)


# file_rdd = sc.textFile('../sample_data_files/cookie_data.txt')
rdd1 = sc.parallelize(range(5))
rdd1.saveAsTextFile(dlds + 'thing.txt')



sys.exit()

print(rdd1.reduce(lambda x, y: x + y))



# start with page 67, aggregate
"""Data pipeline for extracting basic metrics"""
import sys
import random
import ast
import re

# this code allows you to run the following code locally
import findspark
findspark.init()
import pyspark  # noqa: E402
from pyspark import SparkContext  # noqa: E402

sc = pyspark.SparkContext(appName="hw")
# the code is run on the Cavium cluster of UMICH ARC-TS
# Some libraries are pre-loaded as part of the cluster configuration
#   e.g. SparkContext
# If you are to run this code on a local machine,
# pay attention to those libraries: https://arc-ts.umich.edu/cavium/user-guide/
# or contact the authors

random.seed(42)

sys.setrecursionlimit(999999)


def inject_one_depth(node):
    res = 1
    for subnode in node['children']:
        inject_one_depth(subnode)  # inplace
        res = max(res, subnode['replies_depth'])
    node['replies_depth'] = res
Example no. 21
import datetime
import glob as gb
import re

import py4j
import pyspark as py
from pyspark.mllib.regression import LabeledPoint

# Helper assumed from the original (truncated) script: clean raw review text by
# stripping HTML line breaks and non-word characters, then tokenize.
def getdata(data):
    data = data.replace("<br />", "")
    data = re.sub(r"[^\w]", " ", data)
    return data.split()

def getLabelPoint(inputList):
    return LabeledPoint(inputList[1],inputList[0])

if __name__=="__main__":
    #model =
    py4j.java_gateway.launch_gateway
    finalDict={}
    details = {}
    if len(gb.glob("./diction*.pkl")) == 0:

        configuration = py.SparkConf()                              # setting the Spark Configuration
        sContext = py.SparkContext(conf=configuration)              # setting the Spark context
        print("Default parallelism:", sContext.defaultParallelism)
        print ("Data preprocessing start time:", datetime.datetime.now().time())
        traindataPos = sContext.parallelize(gb.glob("/home/vyassu/Downloads/Telegram Desktop/aclImdb/train/pos/*.txt"))
        posData = traindataPos.flatMap(getdata)

        testdataPos = sContext.parallelize(gb.glob("/home/vyassu/Downloads/Telegram Desktop/aclImdb/test/pos/*.txt"))
        postestData = testdataPos.flatMap(getdata)

        newposData = traindataPos + testdataPos

        traindataNeg = sContext.parallelize(gb.glob("/home/vyassu/Downloads/Telegram Desktop/aclImdb/train/neg/*.txt"))
        negData = traindataNeg.flatMap(getdata)

        testdataNeg = sContext.parallelize(gb.glob("/home/vyassu/Downloads/Telegram Desktop/aclImdb/test/neg/*.txt"))
        negtestData = testdataNeg.flatMap(getdata)
Example no. 22
def main(base_path):
    APP_NAME = "train_spark_mllib_model.py"

    # Create the environment if there is no SparkSession
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf
    from tabulate import tabulate  # used for the report tables below

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Add the scheduled departure/arrival hour of day
    #
    from pyspark.sql.functions import hour
    features_with_hour = features.withColumn("CRSDepHourOfDay",
                                             hour(features.CRSDepTime))
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay", hour(features.CRSArrTime))
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime",
                              "CRSArrHourOfDay").show()

    #
    # Check whether any features have null values before using Spark ML
    #
    null_counts = [
        (column,
         features_with_hour.where(features_with_hour[column].isNull()).count())
        for column in features_with_hour.columns
    ]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Configure the bucketizer model
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Import feature tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn the categorical fields into indexes
    string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
    for column in string_columns:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine the continuous numeric fields with the indexes of the nominal fields into a single feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfYear", "CRSDepHourOfDay",
        "CRSArrHourOfDay"
    ]
    index_columns = [column + "_index" for column in string_columns]

    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross-validate, train, and evaluate the classification model: repeated test/train splits scored on 4 metrics
    #

    from collections import defaultdict
    scores = defaultdict(list)
    feature_importances = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    for i in range(1, split_count + 1):
        print("\nRun {} out of {} of test/train splits in cross validation...".
              format(
                  i,
                  split_count,
              ))

        # Split into test/train data
        training_data, test_data = final_vectorized_features.randomSplit(
            [0.8, 0.2])

        # Instantiate and fit a random forest classifier on the full data
        from pyspark.ml.classification import RandomForestClassifier
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4896,
        )
        model = rfc.fit(training_data)

        # Save the new model in place of the old one
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
            base_path)
        model.write().overwrite().save(model_output_path)

        # Evaluate the model against the test data
        predictions = model.transform(test_data)

        # Evaluate the results of this test/train split for each metric
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name)
            score = evaluator.evaluate(predictions)

            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

        #
        # Collect feature importances
        #
        feature_names = vector_assembler.getInputCols()
        feature_importance_list = model.featureImportances
        for feature_name, feature_importance in zip(feature_names,
                                                    feature_importance_list):
            feature_importances[feature_name].append(feature_importance)

    #
    # Compute the average and standard deviation for each metric and print them as a table
    #
    import numpy as np
    score_averages = defaultdict(float)

    # Compute the table data
    average_stds = []  # ha
    for metric_name in metric_names:
        metric_scores = scores[metric_name]

        average_accuracy = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_accuracy

        std_accuracy = np.std(metric_scores)

        average_stds.append((metric_name, average_accuracy, std_accuracy))

    # Print the table
    print("\nExperiment Log")
    print("--------------")
    print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

    #
    # Persist the scores to a score log that is kept between runs
    #
    import pickle

    # Load the score log or initialize an empty one
    try:
        score_log_filename = "{}/models/score_log.pickle".format(base_path)
        score_log = pickle.load(open(score_log_filename, "rb"))
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []

    # Compute the entry for the score log
    score_log_entry = {
        metric_name: score_averages[metric_name]
        for metric_name in metric_names
    }

    # Compute and display the change in score for each metric
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    experiment_report = []
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        experiment_report.append((metric_name, run_delta))

    print("\nExperiment Report")
    print("-----------------")
    print(tabulate(experiment_report, headers=["Metric", "Score"]))

    # Append the average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for the next run
    pickle.dump(score_log, open(score_log_filename, "wb"))

    #
    # Analyze and report changes in feature importance
    #

    # Compute the average for each feature
    feature_importance_entry = defaultdict(float)
    for feature_name, value_list in feature_importances.items():
        average_importance = sum(value_list) / len(value_list)
        feature_importance_entry[feature_name] = average_importance

    # Sort the feature importances in descending order and print them
    import operator
    sorted_feature_importances = sorted(feature_importance_entry.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)

    print("\nFeature Importances")
    print("-------------------")
    print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

    #
    # Compare this run's feature importances with those of the previous run
    #

    # Load the feature importance log or initialize an empty one
    try:
        feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
        feature_log = pickle.load(open(feature_log_filename, "rb"))
        if not isinstance(feature_log, list):
            feature_log = []
    except IOError:
        feature_log = []

    # Compute and display the change for each feature
    try:
        last_feature_log = feature_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_feature_log = defaultdict(float)
        for feature_name, importance in feature_importance_entry.items():
            last_feature_log[feature_name] = importance

    # Compute the deltas
    feature_deltas = {}
    for feature_name in feature_importances.keys():
        run_delta = feature_importance_entry[feature_name] - last_feature_log[
            feature_name]
        feature_deltas[feature_name] = run_delta

    # Sort the feature deltas so that the largest changes come first
    import operator
    sorted_feature_deltas = sorted(feature_deltas.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)

    # Display the sorted feature deltas
    print("\nFeature Importance Delta Report")
    print("-------------------------------")
    print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

    # Append the current average importances to the log
    feature_log.append(feature_importance_entry)

    # Persist the log for the next run
    pickle.dump(feature_log, open(feature_log_filename, "wb"))
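

# Entry point (not shown in the original snippet): the base path for data and
# models is expected as the first command-line argument.
if __name__ == "__main__":
    import sys
    main(sys.argv[1])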
Example no. 23
import sys, json, time, collections, operator, math, pickle
import pyspark

# hyper-parameter
rareword_threshold = 0.000001

if __name__ == "__main__":

    start_time = time.time()

    # parse commandline argument
    train_file_path = sys.argv[1]  # input
    model_file_path = sys.argv[2]  # output
    stopwords_path = sys.argv[3]

    conf = pyspark.SparkConf().setAppName("Task2Train").setMaster("local[*]")
    sc = pyspark.SparkContext(conf=conf)
    sc.setLogLevel("ERROR")

    stopwords = set(sc.textFile(stopwords_path).collect())
    stopwords_broadcast = sc.broadcast(stopwords)

    def only_keep_letter_space(text):
        whitelist = set('abcdefghijklmnopqrstuvwxyz ')
        return "".join(filter(whitelist.__contains__, text.lower()))

    rawRDD = sc.textFile(train_file_path) \
        .map(lambda x: json.loads(x)) \
        .map(lambda x: ((x["user_id"], x["business_id"]), x["text"])) \
        .mapValues(lambda x: only_keep_letter_space(x)) \
        .mapValues(lambda x: list(filter(lambda x: x not in stopwords_broadcast.value, x.split())))
Example no. 24
# The raw iris.data file has two small problems:
# 1. an empty line at the bottom.
# 2. the labels are in the form of strings. We want an integer.

import os
import pyspark
# Schema and TransformProcess come from the pydatavec package used by the original
# example; download_file is assumed to be a helper defined earlier in that script.
from pydatavec import Schema, TransformProcess

# Download dataset (if not already downloaded)
filename = "iris.data"
temp_filename = filename + '_temp'
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

if not os.path.isfile(filename):
    if os.path.isfile(temp_filename):
        os.remove(temp_filename)
    download_file(url, temp_filename)
    os.rename(temp_filename, filename)

# We use pyspark to filter empty lines
sc = pyspark.SparkContext(master='local[*]', appName='iris')
data = sc.textFile('iris.data')
filtered_data = data.filter(lambda x: len(x) > 0)

# Define Input Schema
input_schema = Schema()
input_schema.add_double_column('Sepal length')
input_schema.add_double_column('Sepal width')
input_schema.add_double_column('Petal length')
input_schema.add_double_column('Petal width')
input_schema.add_categorical_column(
    "Species", ["Iris-setosa", "Iris-versicolor", "Iris-virginica"])

# Define Transform Process
tp = TransformProcess(input_schema)
tp.categorical_to_integer("Species")
Example no. 25
#!/usr/bin/env python

import pyspark
import tarfile
from io import BytesIO
import sys


def extractFiles(incoming_bytes):
    tar = tarfile.open(fileobj=BytesIO(incoming_bytes), mode="r:gz")
    return [tar.extractfile(x).read() for x in tar if x.isfile()]


if len(sys.argv) != 3:
    raise Exception("Exactly 2 arguments are required: <inputUri> <outputUri>")

sc = pyspark.SparkContext(appName="ana")

gzfiles = sc.binaryFiles(sys.argv[1])
values = gzfiles.flatMap(lambda x : extractFiles(x[1])).filter(lambda x: len(x)>1).map(lambda x: x.decode("latin-1"))
words = values.flatMap(lambda line: line.split(" ")).filter(lambda x: (len(x) > 1) & x.isalpha())
anagrams = words.map(lambda word: (''.join(sorted(word.lower())), set([word.lower()]))).reduceByKey(lambda a,b: a.union(b) ).filter(lambda x: len(x[1]) > 1).map(lambda x: x[1])

anagrams.saveAsTextFile(sys.argv[2])

sc.stop()
Example no. 26
# Module-level setup assumed by the snippet below: pyspark/SparkConf imports and a
# `logger`, which the original script is expected to configure elsewhere.
import logging
import pyspark
from pyspark import SparkConf

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():

    # Note: It is important to import the libraries needed within the function
    #      so Spark does not attempt serializing the libraries to all the workers,
    #      otherwise it could fail during Serialization/Deserialization
    #      using the pickle methods.

    from mxinfer import load_images
    from mxinfer import predict

    from utils import get_args
    from utils import get_s3client
    from utils import fetch_s3_keys
    from utils import download_objects
    from utils import upload_file

    args = get_args()
    logger.info('received arguments:{}'.format(args))

    conf = SparkConf().setAppName(
        "Distributed Inference using MXNet and Spark")

    # we will set the number of cores per executor to 1 to force Spark to create
    # only one task per executor since MXNet efficiently uses all the cpus on the
    # system for inference
    conf.set('spark.executor.cores', '1')

    sc = pyspark.SparkContext(conf=conf)
    logger.info("Spark Context created")

    s3_client = get_s3client(args['access_key'], args['secret_key'])

    keys = fetch_s3_keys(args['bucket'], args['prefix'], s3_client)

    # filter out only png images.
    # you can also choose to check the content-type headers by doing
    # a head call against each S3-Key

    keys = list(filter(lambda x: x.endswith('.png'), keys))  # materialize so len()/extend() below work

    # number of keys
    n_keys = len(keys)
    if n_keys < args['batch']:
        args['batch'] = n_keys

    n_partitions = n_keys // args['batch']

    logger.info('number of keys from s3: {}'.format(n_keys))

    # if keys cannot be divided by args['batch'] .
    if (n_partitions * args['batch'] != n_keys):
        keys.extend(keys[:args['batch'] -
                         (n_keys - n_partitions * args['batch'])])

    logger.debug('Keys:{}'.format(keys))

    n_partitions = len(keys) // args['batch']
    logger.info("number of keys:{}, n_partitions:{}".format(
        len(keys), n_partitions))

    # we will create partitions of args['batch']
    rdd = sc.parallelize(keys, numSlices=n_partitions)
    logger.info('created rdd with {} partitions'.format(
        rdd.getNumPartitions()))

    sc.broadcast(args['bucket'])

    rdd = rdd.mapPartitions(lambda k: download_objects(args['bucket'], k))

    rdd = rdd.mapPartitions(load_images)

    sc.broadcast(args)
    rdd = rdd.mapPartitions(lambda imgs: predict(imgs, args))

    output = rdd.collect()

    # drop the extra keys that we added to fill the last batch

    keys = keys[:n_keys]
    output = output[:n_keys]

    logger.info("predictions: {}".format(output))

    if args['output_s3_key'] and args['output_s3_bucket']:
        with open('/tmp/' + args['output_s3_key'], 'w+') as f:
            for k, o in zip(keys, output):
                f.write("Key %s: Prediction: %s\n" % (k, o))
        upload_file(args['output_s3_bucket'], args['output_s3_key'],
                    '/tmp/' + args['output_s3_key'], s3_client)
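

# Entry point (not shown in the original snippet).
if __name__ == '__main__':
    main()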
Example no. 27
#!/usr/bin/env python

import pyspark
import sys

inputUri = 'gs://dataproc-ed3c3d29-fb10-47bb-aca7-dcc358c68973-us-central1/input/map.txt'
outputUri = 'gs://dataproc-ed3c3d29-fb10-47bb-aca7-dcc358c68973-us-central1/output'

grid = []


def parseRDD(line):
    parsedLine = list(line)
    row = []
    for i in range(len(parsedLine)):
        row.append(0)
    grid.append(row)
    return parsedLine


sc = pyspark.SparkContext()
lines = sc.textFile(inputUri)
words = lines.flatMap(parseRDD)
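# Note: `grid` is still empty at this point; flatMap is lazy, and on a cluster the
# appends inside parseRDD happen in the executor processes, not in this driver.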
print(grid)
wordCounts = words.map(lambda word: (word, 1)).reduceByKey(
    lambda count1, count2: count1 + count2)
wordCounts.saveAsTextFile(outputUri)
Example no. 28
import pyspark
import sys

print("Hello, world!")

master_url = sys.argv[1] if len(sys.argv) >= 2 else "local"
sc = pyspark.SparkContext(master_url)

msg = "Hello, pyspark!"
rdd = sc.parallelize(list(msg))
print(''.join(rdd.collect()))
Example no. 29
import pyspark
sc = pyspark.SparkContext(appName="myAppName")

quijote = sc.textFile("quijote.txt")

palabrasrdd = quijote.flatMap(lambda x: x.split(" "))
# Count how many words contain the string Quijote
filtro1 = palabrasrdd.filter(lambda l: "Quijote" in l).count()
# Count how many words contain the string Sancho
filtro2 = palabrasrdd.filter(lambda l: "Sancho" in l).count()
# Count how many words contain the string Rocinante
filtro3 = palabrasrdd.filter(lambda l: "Rocinante" in l).count()

print("Quijote: " + str(filtro1))
print("Sancho: " + str(filtro2))
print("Rocinante: " + str(filtro3))
Example no. 30
    def py_sc(self):
        return pyspark.SparkContext(master='local[*]', appName='test')