Code example #1
def main():
    #spark_conf = SparkConf().setAppName("Text Preprocesser").set("spark.cores.max", "30")

    global sc
    #sc = SparkContext(conf=spark_conf)
    sc_conf = SparkConf()
    sc_conf.set("spark.redis.host",
                "ec2-52-73-233-196.compute-1.amazonaws.com")
    sc_conf.set("spark.redis.port", "6379")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    # sc.addFile(os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/lib/util.py")
    # sc.addFile(os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/config/config.py")

    global sql_context
    sql_context = SQLContext(sc)

    start_time = time.time()
    preprocess_files("dataignition-tech-xml-parq", "posts.parquet")
    end_time = time.time()
    print(
        colored(
            "Preprocessing run time (seconds): {0}".format(end_time -
                                                           start_time),
            "magenta"))
Code example #2
def main():
    spark_conf = SparkConf().setAppName("Spark Custom MinHashLSH").set(
        "spark.cores.max", "30")

    global sc
    sc = SparkContext(conf=spark_conf)
    sc.setLogLevel("ERROR")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/lib/min_hash.py")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/lib/locality_sensitive_hash.py")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/lib/util.py")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/config/config.py")

    global sql_context
    sql_context = SQLContext(sc)

    start_time = time.time()
    run_minhash_lsh()
    end_time = time.time()
    print(
        colored(
            "Spark Custom MinHashLSH run time (seconds): {0} seconds".format(
                end_time - start_time), "magenta"))
Code example #3
def main():
    #spark_conf = SparkConf().setAppName("Text Preprocesser").set("spark.cores.max", "30")

    global sc
    #sc = SparkContext(conf=spark_conf)
    sc_conf = SparkConf()
    sc_conf.set("spark.redis.host", config.REDIS_SERVER)
    sc_conf.set("spark.redis.port", "6379")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/lib/util.py")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/config/config.py")

    global sql_context
    sql_context = SQLContext(sc)

    start_time = time.time()
    preprocess_files(config.S3_BUCKET, config.S3_FOLDER_EXTRACTED)
    end_time = time.time()
    print(
        colored(
            "Preprocessing run time (seconds): {0}".format(end_time -
                                                           start_time),
            "magenta"))
Code example #4
File: preprocess.py  Project: aspk/askedagain
def main():
    spark_conf = SparkConf().setAppName("Text Preprocesser").set(
        "spark.cores.max", "30")

    global sc
    sc = SparkContext(conf=spark_conf)
    sc.setLogLevel("ERROR")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/lib/util.py")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/config/config.py")

    global sql_context
    sql_context = SQLContext(sc)

    start_time = time.time()
    preprocess_all()
    end_time = time.time()
    print(
        colored(
            "Preprocessing run time (seconds): {0}".format(end_time -
                                                           start_time),
            "magenta"))
Code example #5
def initialize():
    global sc, spark, items, inputfile, buckets_user, buckets_business, partition, totalSize, t, mainThreshold
    print("Initializing...")
    t = time.time()
    candidateList = []
    frequentList = []
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: (line) != columnName)

    buckets_user = items.groupByKey().mapValues(list).filter(
        lambda x: len(x[1]) > mainThreshold).mapPartitionsWithIndex(
            removeDuplicateEntriesAfter)
    print("Without Duplicates DOne..")
    # withoutDuplicates = checkM.mapPartitionsWithIndex(
    #     removeDuplicateEntries).groupByKey().mapValues(list)

    if (case == 1):
        # buckets_user = withoutDuplicates.mapPartitionsWithIndex(
        #     createBuckets).groupByKey().mapValues(list).filter(lambda x: len(x[1]) > mainThreshold)

        callSonPhase1(buckets_user)
        print("Initializing Phase 2.....")
        finalFreq = buckets_user.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))

        # print((finalFreq.collect()))
        finalOutput = (finalFreq.collect())
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        # print(x)
        printingFreq(x)

        pass
    if (case == 2):
        buckets_business = withoutDuplicates.mapPartitionsWithIndex(
            createBuckets_case2).groupByKey().mapValues(list)
        callSonPhase1(buckets_business)
        print("Initializing Phase 2.....")
        finalFreq = buckets_business.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))

        # print((finalFreq.collect()))
        finalOutput = (finalFreq.collect())
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        # print(x)
        printingFreq(x)

        pass
Code example #6
def main():
    '''
    some standard spark functions examples
    '''

    # Starting a spark session
    sc = SparkContext()
    sc.setLogLevel("OFF")
    spark = SparkSession.builder.master("local").getOrCreate()

    # Creating a dataframe from data
    l = [(1, 'a', 'b', 'c', 'd'), (1, 'a', 'b', 'c', 'd')]
    df0 = spark.createDataFrame(l, ['col1', 'col2', 'col3', 'col4', 'col5'])

    # Creating a dataframe using rdd
    l = [(2, 'f', 'g'), (2, 'f', 'g')]
    rdd = sc.parallelize(l)
    schema = StructType([
        StructField("col6", IntegerType(), True),
        StructField("col7", StringType(), True),
        StructField("col8", StringType(), True)
    ])
    df1 = spark.createDataFrame(rdd, schema)

    # Joining both df0 and df1
    indexedDf0 = add_column_index(df0)
    indexedDf1 = add_column_index(df1)
    df2 = indexedDf0.join(indexedDf1, indexedDf1.idx == indexedDf0.idx,
                          'inner').drop("idx")
    df2.write.csv("/tmp/file.csv",
                  mode='overwrite',
                  header=True,
                  nullValue='NA',
                  quoteAll=False)

    # Read a CSV file into a dataframe (multiLine=True to avoid splitting records on '\n')
    df = spark.read.csv("/tmp/file.csv",
                        header=True,
                        quote='"',
                        escape='"',
                        multiLine=True)

    # Print the Schema
    df.printSchema()

    # Count the number of rows
    print('Number of rows: {}'.format(df.count()))

    # Show columns
    print('Columns:  {}'.format(df.columns))

    # Display the data
    df.show()

    # Count Total Number of records in csv files in a given path
    path = '/tmp'
    count_total(spark, path)
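
Code example #6 relies on two helpers, add_column_index and count_total, that are not part of the snippet. The following is only a sketch of what they might look like (an assumption, not the original project's code): add_column_index is assumed to append a sequential idx column so the two DataFrames can be joined row by row, and count_total is assumed to simply count rows across all CSV files under a path.

from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window


def add_column_index(df):
    # Hypothetical helper: add a sequential 'idx' column for a positional join.
    # Ordering the window by monotonically_increasing_id keeps each DataFrame's row order.
    w = Window.orderBy(monotonically_increasing_id())
    return df.withColumn("idx", row_number().over(w))


def count_total(spark, path):
    # Hypothetical helper: count records across all CSV files under `path`.
    total = spark.read.csv(path, header=True).count()
    print('Total number of records under {}: {}'.format(path, total))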
Code example #7
def initialize():
    global items, inputfile, sc, filterThreshold, t, totalEdges, cost_dict, strict_totalNodes, adjacency_listMain
    t = time.time()
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    # sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    # print(columnName)
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: (line) != columnName)

    # Getting user and their business count
    user_business = items.groupByKey().mapValues(set).collect()
    tuple_edge_list = []

    for i in range(0, len(user_business) - 1):
        for j in range(i + 1, len(user_business)):
            inter = user_business[i][1] & user_business[j][1]
            if len(inter) >= filterThreshold:
                tuple_edge_list.append(
                    (str(user_business[i][0]), str(user_business[j][0])))
                tuple_edge_list.append(
                    (str(user_business[j][0]), str(user_business[i][0])))

    totalEdges = float(len(tuple_edge_list) / 2)
    adjacency_list = sc.parallelize(tuple_edge_list).groupByKey().mapValues(
        list).collectAsMap()
    adjacency_listMain = copy.deepcopy(adjacency_list)
    totalNodes = list(adjacency_list.keys())

    # ------------------------Newly added line------------------------
    strict_totalNodes = copy.deepcopy(totalNodes)
    # print(len(totalNodes))

    # ----------------------Part 1---------------------
    bfs(totalNodes, adjacency_list)
    print("Writing Betweenness to File....")

    # Converting into sorted List Initial Betweenness
    list_val = list(cost_dict.items())

    list_val.sort(key=lambda x: (-x[1], x[0]))
    writeToFile(list_val)
    totalNodes = copy.deepcopy(strict_totalNodes)
    # print(len(totalNodes))
    # ----------------------Part 2----------------------
    print("Creating Partitions....")
    create_components(list_val, adjacency_listMain, totalNodes, totalEdges)
    # ---------------------EoC---------------------------

    print("Duration: " + str(time.time() - t))
Code example #8
def initialize():
    global sc, spark, items, inputfile
    print("Initializing...")
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    jsonread = sc.textFile(inputfile)
    items = jsonread.map(json.loads)
Code example #9
def main():
    spark_conf = SparkConf().setAppName("Spark Custom MinHashLSH").set("spark.cores.max", "30")

    global sc
    global sql_context    

    sc = SparkContext(conf=spark_conf)
    sc.setLogLevel("ERROR")
    sql_context = SQLContext(sc)
    sc.addFile(os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/lib/util.py")
    sc.addFile(os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/config/config.py")


    start_time = time.time()
    similarity_scores_df = compare_text()

    config = configparser.ConfigParser()
    config.read('../config/db_properties.ini')
    similarity_scores_df.write.jdbc(
        config['postgres']['url'],
        config['postgres']['table'],
        mode='overwrite',
        properties={
            'user': config['postgres']['user'],
            'password': config['postgres']['password']
        })

    end_time = time.time()
    print(colored("Spark MinHash run time (seconds): {0} seconds".format(end_time - start_time), "magenta"))
Code example #10
# import pandas as pd
import numpy as np
import datetime
import pickle as pkl
# import xgboost
import os
import sys
import json
import time

# PySpark imports required by the code below
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SparkSession

kafka_topic = 'from-pubsub'
zk = '10.138.0.3:2181'
app_name = 'from-pubsub'  # Can be some other name
sc = SparkContext(appName="KafkaPubsub")
ssc = StreamingContext(sc, 30)
sc.setLogLevel("FATAL")

kafkaStream = KafkaUtils.createStream(ssc, zk, app_name, {kafka_topic: 1})


def getSparkSessionInstance(sparkConf):
    if ("sparkSessionSingletonInstance" not in globals()):
        globals()["sparkSessionSingletonInstance"] = SparkSession \
            .builder \
            .config(conf=sparkConf) \
            .getOrCreate()
    return globals()["sparkSessionSingletonInstance"]


#Dictionary of mapping between number and label, we got this from training code
A = {'label': ['NY', 'K', 'Q', 'BX', 'R']}
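
The example ends before the stream is actually consumed. A common way to use getSparkSessionInstance is inside foreachRDD, turning each micro-batch into a DataFrame; the sketch below assumes the Kafka payloads are JSON strings (the real message format is not shown) and only counts records per batch.

def process_batch(time, rdd):
    # Sketch only: assumes each Kafka record value is a JSON string.
    if rdd.isEmpty():
        return
    spark = getSparkSessionInstance(rdd.context.getConf())
    batch_df = spark.read.json(rdd.map(lambda kv: kv[1]))
    print("Batch at {}: {} records".format(time, batch_df.count()))


kafkaStream.foreachRDD(process_batch)
ssc.start()
ssc.awaitTermination()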
Code example #11
File: softmax.py  Project: haixiaoxuan/code-python
    return ([float(i) for i in row.asDict()[feature_alias].split(",")], one_hot_label.tolist())

def extract_label_species(train_df,label_name):
    # Extract the distinct label values (number of classes)
    label_type = [i.asDict()[label_name] for i in train_df.select(label_name).distinct().collect()]
    type_count = len(label_type)
    return label_type,type_count


# Parse command-line arguments
parser = create_arg_parser()
args = parser.parse_args()

# Get the number of executors configured for this application from the Spark conf
sc = SparkContext(conf=SparkConf().setAppName(args.app_name))
sc.setLogLevel("WARN")
hiveContext = HiveContext(sc)
executors = sc._conf.get("spark.executor.instances")
num_executors = int(executors) if executors is not None else 1

# Number of parameter servers to use
num_ps = 1

# If the TensorFlow cluster size was not given on the command line,
# fall back to the number of executors configured in Spark
if args.cluster_size is None:
    args.cluster_size = num_executors

print("args:", args)
print("{0} ===== Start".format(datetime.now().isoformat()))

label_name = args.label_name
Code example #12
import logging
import sys

root = logging.getLogger()
root.setLevel(logging.DEBUG)

handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
root.addHandler(handler)
root.info("check")


PySpark =>

from awsglue.context import GlueContext
from pyspark.context import SparkContext

sc = SparkContext()
sc.setLogLevel('DEBUG')
glueContext = GlueContext(sc)
logger = glueContext.get_logger()
logger.info('Hello Glue')

SparkScala => 

import com.amazonaws.services.glue.log.GlueLogger

object GlueApp {
  def main(sysArgs: Array[String]) {
Code example #13
    files_to_load = []
    for path, subdirs, files in os.walk(data_dir):
        for f_name in files:
            if file_type == 'csv':
                if re.match(r'.*?\.csv$', f_name):
                    files_to_load.append(os.path.join(path, f_name))
            elif file_type in ('orc', 'parquet'):
                regex_string = rf'.*?\.{file_type}$'
                if re.match(regex_string, f_name):
                    files_to_load.append(os.path.join(path, f_name))
    return files_to_load


CONFIG = ConfigContext()
SPARK_CONTEXT = SparkContext('local')
SPARK_CONTEXT.setLogLevel(CONFIG.log_level.upper())
SPARK = SparkSession(SPARK_CONTEXT)  # pylint: disable=undefined-variable


def main():
    '''
    Main function
    '''
    files_to_load = read_files(CONFIG.data_dir, CONFIG.file_type)
    desc_string = 'Spark session can be accessed using "SPARK"'
    data_frame = ''
    if files_to_load:
        try:
            if CONFIG.file_type == 'csv':
                data_frame = SPARK.read.csv(files_to_load,
                                            header=True,
Code example #14
def main():

    POSTGRES_URL = 'jdbc:postgresql://10.0.0.12:5432/postgres'

    # Configure spark SQL
    conf = (SparkConf()\
            .setAppName("Process")\
            .set("spark.executor.instances", "4")\
            .set("spark.driver.memory", "50g")\
            .set("spark.executor.memory", "6g"))

    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")
    sqlContext = SQLContext(sc)
    spark = SparkSession.builder.appName('lead time predictor').getOrCreate()

    #create a list of cities available in S3
    city_list = get_city_list('hodabnb')

    for city in city_list:

        start = time.time()

        #create a list of calendar files available for each city
        file_list = get_object_list(city, 'calendar.csv', 'hodabnb')

        #for each city fetch all the calendar files from s3
        dfs = []
        for file_name in file_list:
            scrape_date = file_name.split('_')[1]
            path = 's3n://hodabnb/' + file_name
            dfs.append(spark.read.format('com.databricks.spark.csv')\
                                    .options(header='true', inferSchema='true')\
                                    .load(path)\
                                    .select('Listing_id', 'date' , 'available')\
                                    .withColumn("scrape_date",lit(scrape_date).cast(DateType())))

        #merge all and process as one
        df_all = reduce(DataFrame.unionAll, dfs)
        df_all = df_all.withColumn("date", df_all["date"].cast(DateType()))
        df_all = df_all.withColumn("listing_id",
                                   df_all["listing_id"].cast(IntegerType()))
        df_all = df_all.withColumn(
            'lead_time',
            when(df_all['available'] == 't',
                 datediff(df_all['date'],
                          df_all['scrape_date'])).otherwise(999))
        df_all = df_all.drop('scrape_date', 'available')
        df_all = df_all.dropDuplicates()
        df_all = df_all.groupBy('date', 'listing_id').agg({'lead_time': 'min'})
        df_city = df_all.withColumn("city", lit(city))

        #write to DB
        df_city.write.format("jdbc")\
                .option("url", POSTGRES_URL) \
                .option("dbtable", "leadtime_history") \
                .option("user", "postgres") \
                .option("password", "postgres") \
                .option("driver", "org.postgresql.Driver")\
                .mode("append")\
                .save()

        end = time.time()

        print("finished job for %s in %s sec" % (city, (end - start)))
Code example #15
File: HttpParser.py  Project: RSkr/LogService
from pyspark.sql.functions import regexp_extract
from pyspark.sql.functions import col
from pyspark.sql.functions import sum as spark_sum
from pyspark.sql.functions import udf
from pyspark.sql import functions as F

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3"
java8_location = '/usr/lib/jvm/java-8-oracle'  # Set your own
os.environ['JAVA_HOME'] = java8_location

sc = SparkContext()
sc.setLogLevel(logLevel="ERROR")
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

class HttpParser:
    def __init__(self, filepath, range_days, test_mode=False):
        self.status_code = 404
        self.amount_to_list = 10
        self.logs_df = []
        self.logs_df_len = 0
        self.status_freq_df = []
        self.status_freq_df_len = 0
        self.status_404 = 0

        self.testMode = test_mode
        self.filepath = filepath
Code example #16
from __future__ import print_function
from pyspark.conf import SparkConf
from pyspark.context import SparkContext




config = SparkConf()
config.setAppName("SPARK_WORD_COUNT_JOB")
config.setMaster("local[*]")

sc = SparkContext(conf=config)
sc.setLogLevel("info")
text_file_rdd = sc.textFile("/home/dharshekthvel/history_1.txt")
flat_mapped_rdd=text_file_rdd.flatMap(lambda each: each.split(' '))
mapped_rdd = flat_mapped_rdd.map(lambda each: (each,1))
mapped_rdd.reduceByKey(lambda x,y: x+y)\
    .foreach(print)
Code example #17
def initialize():
    global sc, spark, inputfile, t, items, validationfile, dictUid, dictBid, list_unaccounted, dict_code_uid, dict_code_bid, t, case
    t = time.time()
    sc_conf = SparkConf()
    sc_conf.setAppName("Task2")
    sc_conf.setMaster('local[*]')
    # sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: (line) != columnName)

    # ------------Reading evaluation data-----------
    csvread2 = sc.textFile(validationfile)
    columnName2 = csvread2.first().split(',')
    validationData = csvread2.map(lambda line: line.split(",")).filter(
        lambda line: (line) != columnName2)

    # calling case 3:
    if case == 3:
        implement_case3(items, validationData)
        print("Duration: " + str(time.time() - t))
        return
    # calling case 2:
    if case == 2:
        implement_case2(items, validationData)
        print("Duration: " + str(time.time() - t))
        return
    # Ending case 2

    # ------------PreProcessing data for training the mode-----------
    if case == 1:
        bid_uid = items.map(lambda u: (u[0], u[1]))

        keys = list(set(bid_uid.keys().collect()))
        values = list(set(bid_uid.values().collect()))

        dictUid = dict(zip(keys, range(0, len(keys))))
        dictBid = dict(zip(values, range(0, len(values))))
        for k, v in dictUid.items():
            dict_code_uid[v] = k

        for k, v in dictBid.items():
            dict_code_bid[v] = k

        ratings = items.map(lambda l: Rating(int(dictUid[l[0]]),
                                             int(dictBid[l[1]]), float(l[2])))

        # Training the model on train data
        rank = 2
        lambd = 0.5
        numIterations = 10
        model = ALS.train(ratings, rank, numIterations, lambd)

        print("Total entries in validation data: " +
              str(len(validationData.collect())))
        # ----------------------Creating a map with integer values for users and business on validation test set-----------------

        test_on_validation = validationData.map(lambda p: mapData(p))
        #
        validationRating = test_on_validation.filter(
            lambda p: (p[0] == 1)).map(lambda r: (r[1][0], r[1][1], r[1][2]))

        accountedPairs = test_on_validation.filter(lambda p: (p[0] == 1)).map(
            lambda r: (r[1][0], r[1][1]))

        UnaccountedPairs = test_on_validation.filter(lambda p: p[0] == 0).map(
            lambda r: ((r[1][0], r[1][1]), 2.75))

        # print("Accounted Pairs: "+str(len(accountedPairs.collect())))

        # print("Unaccounted Pairs: "+str(len(UnaccountedPairs.collect())))
        # print(test_on_validation.count())
        # print("Unaccounted Pairs: "+str(len(list_unaccounted)))

        # ----------------------Evaluate the model on training data----------------------
        # testdata = ratings.map(lambda p: (p[0], p[1]))
        # predictions = model.predictAll(testdata).map(
        #     lambda r: ((r[0], r[1]), r[2]))
        # ratesAndPreds = ratings.map(lambda r: (
        #     (r[0], r[1]), r[2])).join(predictions)
        # MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()

        # # import validation data

        # print("Mean Squared Error = " + str(MSE))

        # ----------------------Evaluate the model on testing data----------------------
        predictions = model.predictAll(accountedPairs).map(
            lambda r: ((r[0], r[1]), r[2]))
        # print(len(predictions.collect()))
        finalpred = predictions.union(UnaccountedPairs)
        # print(len(finalpred.collect()))
        # return
        # ratesAndPreds = validationRating.map(lambda r: (
        #     (r[0], r[1]), r[2])).join(predictions)
        ratesAndPreds = validationRating.map(
            lambda r: ((r[0], r[1]), r[2])).join(finalpred)
        MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
        writeToFile(finalpred)
        rmse = math.sqrt(MSE)
        print("Root Mean Squared Error = " + str(rmse))
    print("Duration: " + str(time.time() - t))
Code example #18
import pyspark.sql.types as T
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, FloatType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
from pyspark.ml.linalg import DenseVector
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from helper_1 import *
from multiprocessing import Process

sc = SparkContext()
spark = SparkSession(sc)
sc.setLogLevel('FATAL')

#Creating Schema for the data to be loaded
schema = StructType([
    StructField('Age', IntegerType(), nullable=False),
    StructField('workclass', StringType(), nullable=False),
    StructField('fnlwgt', FloatType(), nullable=False),
    StructField('education', StringType(), nullable=False),
    StructField('education-num', FloatType(), nullable=False),
    StructField('marital', StringType(), nullable=False),
    StructField('occupation', StringType(), nullable=False),
    StructField('relationship', StringType(), nullable=False),
    StructField('race', StringType(), nullable=False),
    StructField('sex', StringType(), nullable=False),
    StructField('capital-gain', FloatType(), nullable=False),
    StructField('capital-loss', FloatType(), nullable=False),
Code example #19
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job


args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()

## Set the Glue Logging level to Debug
sc.setLogLevel("DEBUG")

glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
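
The Glue boilerplate above stops right after job initialization. A typical continuation, shown only as a sketch (the database and table names are placeholders, not from the original script), reads a catalog table, logs its size through the Glue logger, and commits the job:

logger = glueContext.get_logger()

# Placeholder catalog entries -- substitute a real database and table.
dyf = glueContext.create_dynamic_frame.from_catalog(
    database="my_database", table_name="my_table")
logger.info("Loaded {} records".format(dyf.count()))

job.commit()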
Code example #20
import pyspark
from pyspark.context import SparkContext
from pyspark import SparkConf

conf = SparkConf()
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

# Load the adjacency list file
AdjList1 = sc.textFile("/home/rob/Assignment4/02AdjacencyList.txt")
print(AdjList1.collect())

AdjList2 = AdjList1.map(
    lambda line: line)  # 1. Replace the lambda function with yours
AdjList3 = AdjList2.map(
    lambda x: x)  # 2. Replace the lambda function with yours
AdjList3.persist()
print(AdjList3.collect())

nNumOfNodes = AdjList3.count()
print("Total Number of nodes")
print(nNumOfNodes)

# Initialize each page's rank; since we use mapValues, the resulting RDD will have the same partitioner as links
print("Initialization")
PageRankValues = AdjList3.mapValues(
    lambda v: v)  # 3. Replace the lambda function with yours
print(PageRankValues.collect())

# Run 30 iterations
print("Run 30 Iterations")
Code example #21
# from pyspark.sql.session import SparkSession
import base64
import sys

from pyspark.context import SparkConf, SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext('local')

# conf = (SparkConf()
#         .set("spark.debug.maxToStringFields", "140")
#         .set("spark.driver.memory", "15g")
#         .set('spark.executor.memory', '4g')
#         .set('spark.sql.codegen.fallback','true')
#         .set('spark.sql.codegen.wholeStage','false')
#         .set('spark.driver.maxResultSize', '10g'))

# sc.stop()
# sc = SparkContext(conf=conf)
sc.setLogLevel('OFF')
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE, SIG_DFL)
# sc = SparkContext('local')
spark = SparkSession(sc)
spark.sparkContext._conf.setAll([("spark.debug.maxToStringFields", "140"),
                                 ("spark.driver.memory", "15g"),
                                 ('spark.executor.cores', '4'),
                                 ('spark.cores.max', '4'),
                                 ('spark.executor.memory', '4g'),
                                 ('spark.sql.codegen.fallback', 'true'),
                                 ('spark.driver.maxResultSize', '10g')])
# %matplotlib inline
b64test = base64.b64decode(sys.argv[1])
print(b64test.decode())
sys.exit()
Code example #22
import os

import py4j
import pyspark
from pyspark.context import SparkContext

# spark_home is not defined in the original snippet; assume the standard SPARK_HOME env var
spark_home = os.environ["SPARK_HOME"]

sc = SparkContext()
# Control our logLevel. This overrides any user-defined log settings.
# Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
sc.setLogLevel("FATAL")

text_file = sc.textFile(spark_home + "/README.md")
word_counts = text_file \
    .flatMap(lambda line: line.split()) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b)
print(word_counts.collect())
Code example #23
def main():
    # Instantiate SparkConf and sent extraJavaOptions to both executors and drivers
    spark_conf = (SparkConf().set(
        'spark.executor.extraJavaOptions',
        '-Dcom.amazonaws.services.s3.enableV4=true').set(
            'spark.driver.extraJavaOptions',
            '-Dcom.amazonaws.services.s3.enableV4=true'))

    # Instantiate SparkContext based on SparkConf
    sc = SparkContext(conf=spark_conf)

    # Set enableV4 property to access S3 input data
    sc.setSystemProperty('com.amazonaws.services.s3.enableV4', 'true')

    # Create new Hadoop Configuration
    hadoopConf = sc._jsc.hadoopConfiguration()

    # Set Hadoop configuration K-V
    if is_not_blank(AWS_ACCESS_KEY_ID):
        hadoopConf.set('fs.s3a.awsAccessKeyId', AWS_ACCESS_KEY_ID)
    if is_not_blank(AWS_SECRET_ACCESS_KEY):
        hadoopConf.set('fs.s3a.awsSecretAccessKey', AWS_SECRET_ACCESS_KEY)
    hadoopConf.set('com.amazonaws.services.s3a.enableV4', 'true')
    hadoopConf.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')

    # Create SparkSession from SparkContext
    spark_session = (
        SparkSession(sc).builder.appName('ComplaintClassificator').config(
            conf=spark_conf).getOrCreate())

    # Timestamp of start
    start_timestamp = dt.now()

    # Instantiate SparkContext
    sc = spark_session.sparkContext

    # Instantiate SQLContext
    sql_ctx = SQLContext(sc)

    # Set log level to 'WARN'
    sc.setLogLevel('WARN')

    # Set up log4j logging
    log4j_logger = sc._jvm.org.apache.log4j
    logger = log4j_logger.LogManager.getLogger(__name__)

    # Create schema as a StructType of StructField(s)
    schema = StructType([
        StructField('ReceivedDate', StringType(), True),
        StructField('Product', StringType(), True),
        StructField('Subproduct', StringType(), True),
        StructField('Issue', StringType(), True),
        StructField('Subissue', StringType(), True),
        StructField('ConsumerComplaintNarrative', StringType(), True),
        StructField('CompanyPublicResponse', StringType(), True),
        StructField('CompanyName', StringType(), True),
        StructField('State', StringType(), True),
        StructField('ZipCode', IntegerType(), True),
        StructField('Tags', StringType(), True),
        StructField('IsConsumerConsent', StringType(), True),
        StructField('SubmittedVia', StringType(), True),
        StructField('SentDate', StringType(), True),
        StructField('CompanyResponseToConsument', StringType(), True),
        StructField('IsTimelyResponse', StringType(), True),
        StructField('IsConsumerDisputed', StringType(), True),
        StructField('ComplaintId', IntegerType(), True)
    ])

    logger.warn("Starting preprocessing and data cleansing...")

    # Read Consumer_Complaints.csv file and apply schema
    complaint_df = (spark_session.read.format('csv').option(
        'header',
        'true').option('delimiter', ',').option('mode', 'FAILFAST').option(
            'parserLib', 'univocity').option('escape', '"').option(
                'multiLine', 'true').option('inferSchema', 'false').schema(
                    schema).load(CONSUMER_COMPLAINTS).alias('complaint_df'))

    logger.warn("Explaining complaint_df...")
    complaint_df.explain()

    logger.warn("complaint_df has %d records, %d columns." %
                (complaint_df.count(), len(complaint_df.columns)))
    logger.warn("Printing schema of complaint_df: ")
    complaint_df.printSchema()

    # Register cleanse_files function as an UDF (UserDefinedFunction)
    udf_cleansed_field = udf(cleanse_field, StringType())

    # Provide a lambda function to format date-type field to 'YYYY-MM-DD' pattern
    change_data_format = udf(lambda x: dt.strptime(x, '%m/%d/%Y'), DateType())

    # Do some clean-up activities
    cleansed_df = (complaint_df.withColumn(
        'Issue', udf_cleansed_field(
            complaint_df['ConsumerComplaintNarrative'])).withColumn(
                'ReceivedDate',
                change_data_format(complaint_df['ReceivedDate'])))

    logger.warn("Explaining cleansed_df...")
    cleansed_df.explain()

    logger.warn("cleansed_init_df has %d records, %d columns." %
                (cleansed_df.count(), len(cleansed_df.columns)))
    logger.warn("Printing schema of cleansed_df: ")
    cleansed_df.printSchema()

    # Reduce a number of fields and filter non-null values out on consumer complaint narratives
    final_complaints_df = (cleansed_df.where(
        cleansed_df['ConsumerComplaintNarrative'].isNotNull()).select(
            'ComplaintId', 'ReceivedDate', 'State', 'Product',
            'ConsumerComplaintNarrative',
            'Issue').orderBy(cleansed_df['ReceivedDate']))

    final_complaints_df.registerTempTable("final_complaints_df")

    # Check random ConsumerComplaintNarrative as well as Issue content
    sql_ctx.sql(""" SELECT RowNum, ConsumerComplaintNarrative, Issue FROM
                    (SELECT ROW_NUMBER() OVER (PARTITION BY State ORDER BY ReceivedDate DESC) AS RowNum,
                        ConsumerComplaintNarrative,
                        Issue,
                        ReceivedDate,
                        State
                    FROM final_complaints_df) fc
                    WHERE RowNum = 1
                    LIMIT 10
                    """).show()

    logger.warn("Explaining final_complaints_df...")
    final_complaints_df.explain()

    logger.warn(
        "final_complaints has %d records, %d columns." %
        (final_complaints_df.count(), len(final_complaints_df.columns)))
    logger.warn("Printing schema of final_complaints_df: ")
    final_complaints_df.printSchema()

    # Read states json provider as a states_df DataFrame abstraction
    states_df = (spark_session.read.json(AMERICAN_STATES,
                                         multiLine=True).alias('states_df'))

    logger.warn("Explaining states_df...")
    states_df.explain()

    logger.warn("states_df has %d records, %d columns." %
                (states_df.count(), len(states_df.columns)))
    logger.warn("Printing schema of states_df: ")
    states_df.printSchema()

    # List of fields to drop (not needed for the further processing)
    drop_list = ['state', 'abbreviation']

    # Join complaints data with American states, apply id field and drop unnecessary fields
    joined_df = (final_complaints_df.join(
        broadcast(states_df),
        col('complaint_df.State') == col('states_df.abbreviation'),
        "left").withColumnRenamed('ConsumerComplaintNarrative',
                                  'ConsumerComplaint').withColumn(
                                      'RowNoIndex',
                                      monotonically_increasing_id()).select(
                                          'Product', 'ConsumerComplaint',
                                          'name').drop(*drop_list))

    joined_df.registerTempTable("joined_df")

    # Check random FullStateName content
    sql_ctx.sql(
        """ SELECT RowNum, Product, ConsumerComplaint, FullStateName FROM
                        (SELECT ROW_NUMBER() OVER (PARTITION BY Product ORDER BY ConsumerComplaint DESC) AS RowNum,
                            Product,
                            ConsumerComplaint,
                            name AS FullStateName
                        FROM joined_df) jd
                        WHERE RowNum = 1
                        LIMIT 10
                        """).show()

    logger.warn("Explaining joined_df...")
    joined_df.explain()

    logger.warn("joined_df has %d records, %d columns." %
                (joined_df.count(), len(joined_df.columns)))
    logger.warn("Printing schema of joined_df: ")
    joined_df.printSchema()

    # Check unique labels of Product attribute before replace
    joined_df.select('Product').distinct().show()

    # Replace redundant labels from Product field
    renamed_df = (joined_df.withColumn(
        'Product',
        regexp_replace(
            'Product',
            "Credit reporting, credit repair services, or other personal consumer reports",
            "Credit reporting, repair, or other")
    ).withColumn(
        'Product',
        regexp_replace("Product", "Virtual currency",
                       "Money transfer, virtual currency, or money service")
    ).withColumn(
        'Product',
        regexp_replace(
            "Product", "Money transfer",
            "Money transfer, virtual currency, or money service")).withColumn(
                'Product',
                regexp_replace(
                    "Product", "Payday loan",
                    "Payday loan, title loan, or personal loan")).withColumn(
                        'Product',
                        regexp_replace(
                            "Product", "Credit reporting",
                            "Credit reporting, repair, or other")).withColumn(
                                'Product',
                                regexp_replace(
                                    "Product", "Prepaid card",
                                    "Credit card or prepaid card")).withColumn(
                                        'Product',
                                        regexp_replace(
                                            "Product", "Credit card",
                                            "Credit card or prepaid card")))

    renamed_df.registerTempTable("renamed_df")

    # Check how many unique labels (classes) there are
    sql_ctx.sql(""" SELECT DISTINCT Product FROM renamed_df """).show()

    # Check how many times each class occurs in the corpus
    sql_ctx.sql(""" SELECT Product, count(*) 
    FROM renamed_df GROUP BY Product 
    ORDER BY count(*) DESC""").show(50, False)

    logger.warn("Explaining renamed_df...")
    renamed_df.explain()

    # Check unique labels of Product attribute after replace
    renamed_df.select('Product').distinct().show()

    # Check amount of unique labels of Product attribute after replace
    logger.warn(str(renamed_df.select('Product').distinct().count()))

    logger.warn("Starting feature extraction...")

    # Tokenize consumer complaints sentences
    tokenizer = Tokenizer(inputCol='ConsumerComplaint', outputCol='Words')

    # Remove stop words
    remover = StopWordsRemover(inputCol='Words', outputCol='FilteredWords')

    # num_features = 700
    hashing_tf = HashingTF(inputCol='FilteredWords', outputCol='RawFeatures')

    # minDocFreq: minimum number of documents in which a term should appear for filtering
    idf = IDF(inputCol='RawFeatures', outputCol='features')

    # Instantiate StringIndexer
    product_indexer = StringIndexer(inputCol='Product', outputCol='label')

    # Create a pipeline from previously defined feature extraction stages
    pipeline = Pipeline(
        stages=[tokenizer, remover, hashing_tf, idf, product_indexer])

    # Fit renamed_df to the pipeline
    pipeline_fit = pipeline.fit(renamed_df)

    # Transform pipeline_fit
    data_set = pipeline_fit.transform(renamed_df)

    # Randomly slice the data into training and test datasets with requested ratio
    (training_data, test_data) = data_set.randomSplit([0.7, 0.3], seed=100)

    # Cache training_data
    training_data.cache()

    logger.warn("Starting Naive-Bayes...")

    # Naive-Bayes
    nb = NaiveBayes(labelCol='label',
                    featuresCol='features',
                    modelType='multinomial')

    # Create a model without Cross Validation
    nb_model = nb.fit(training_data)

    # Make predictions on model without Cross Validation
    predictions = nb_model.transform(test_data)

    print("NaiveBayes without CV model type: ", nb.getModelType())
    print("NaiveBayes without CV smoothing factor: ", str(nb.getSmoothing()))

    # NB without CV metrics
    nb_metrics_rdd = MulticlassMetrics(predictions['label', 'prediction'].rdd)

    # NB stats by each class (label)
    labels = predictions.rdd.map(lambda cols: cols.label).distinct().collect()

    logger.warn("Printing NB stats...")

    for label in sorted(labels):
        try:
            print("Class %s precision = %s" %
                  (label, nb_metrics_rdd.precision(label)))
            print("Class %s recall = %s" %
                  (label, nb_metrics_rdd.recall(label)))
            print("Class %s F1 Measure = %s" %
                  (label, nb_metrics_rdd.fMeasure(label, beta=1.0)))
        except Py4JJavaError:
            pass

    # Weighted stats
    print("Weighted recall = %s" % nb_metrics_rdd.weightedRecall)
    print("Weighted precision = %s" % nb_metrics_rdd.weightedPrecision)
    print("Weighted F(1) Score = %s" % nb_metrics_rdd.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" %
          nb_metrics_rdd.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" %
          nb_metrics_rdd.weightedFalsePositiveRate)

    # Show 10 results of predictions that haven't been predicted successfully
    predictions.filter(predictions['prediction'] != predictions['label']) \
        .select("Product", "ConsumerComplaint", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False) \
        .show(n=10, truncate=20)

    # Show 10 results of predictions that have been predicted successfully
    predictions.filter(predictions['prediction'] == predictions['label']) \
        .select("Product", "ConsumerComplaint", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False) \
        .show(n=10, truncate=20)

    # Instantiate an evaluation of predictions without Cross Validation
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction")

    # Evaluate best model without an use of Cross Validation
    accuracy_without_cv = evaluator.evaluate(predictions)

    print("Naive-Bayes accuracy without Cross Validation = %s (metric)" %
          str(nb_metrics_rdd.accuracy))

    logger.warn("Starting Cross Validation...")

    # Instantiate ParamGridBuilder for the Cross Validation purpose
    nbp_params_grid = (ParamGridBuilder().addGrid(
        nb.smoothing,
        [0.8, 0.9, 1.0]).addGrid(hashing_tf.numFeatures,
                                 [700, 720]).addGrid(idf.minDocFreq,
                                                     [3, 4, 5]).build())

    # Instantiate the Evaluator of the model
    nb_evaluator = MulticlassClassificationEvaluator(
        labelCol='label', predictionCol='prediction')

    # Instantiate 5-fold CrossValidator
    nb_cv = CrossValidator(estimator=nb,
                           estimatorParamMaps=nbp_params_grid,
                           evaluator=nb_evaluator,
                           numFolds=5)

    # Create a model with Cross Validation
    nb_cv_model = nb_cv.fit(training_data)

    # Make predictions on model with Cross Validation
    cv_predictions = nb_cv_model.transform(training_data)

    # Evaluate best model with an use of Cross Validation
    accuracy_with_cv = nb_evaluator.evaluate(cv_predictions)

    print("Naive-Bayes accuracy with Cross Validation:", str(accuracy_with_cv))

    print(
        "Improvement for the best fitted model (NB with CV) in regard of NB: ",
        str(accuracy_with_cv - nb_metrics_rdd.accuracy))

    # NB with CV metrics
    nb_with_cv_metrics_rdd = MulticlassMetrics(
        cv_predictions['label', 'prediction'].rdd)

    # NB with CV stats by each class (label)
    labels = cv_predictions.rdd.map(lambda att: att.label).distinct().collect()

    logger.warn("Printing NB stats...")

    for label in sorted(labels):
        try:
            print("Class %s precision = %s" %
                  (label, nb_with_cv_metrics_rdd.precision(label)))
            print("Class %s recall = %s" %
                  (label, nb_with_cv_metrics_rdd.recall(label)))
            print("Class %s F1 Measure = %s" %
                  (label, nb_with_cv_metrics_rdd.fMeasure(label, beta=1.0)))
        except Py4JJavaError:
            pass

    # Print weighted stats
    print("Weighted recall = %s" % nb_with_cv_metrics_rdd.weightedRecall)
    print("Weighted precision = %s" % nb_with_cv_metrics_rdd.weightedPrecision)
    print("Weighted F(1) Score = %s" %
          nb_with_cv_metrics_rdd.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" %
          nb_with_cv_metrics_rdd.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" %
          nb_with_cv_metrics_rdd.weightedFalsePositiveRate)

    # Show 10 results of cv_predictions that have been predicted successfully
    (cv_predictions.filter(
        cv_predictions['prediction'] == cv_predictions['label']).select(
            'Product', 'ConsumerComplaint', 'probability', 'label',
            'prediction').orderBy('probability',
                                  ascending=False).show(n=10, truncate=20))

    # Show 10 results of cv_predictions that haven't been predicted successfully
    (cv_predictions.filter(
        cv_predictions['prediction'] != cv_predictions['label']).select(
            'Product', 'ConsumerComplaint', 'probability', 'label',
            'prediction').orderBy('probability',
                                  ascending=False).show(n=10, truncate=20))

    # Timestamp of end
    end_timestamp = dt.now()

    # Print elapsed time
    print("Elapsed time: %s" % str(end_timestamp - start_timestamp))

    # Stop SparkSession
    spark_session.stop()
Code example #24
def initialize():
    global sc, spark, items, inputfile, t, m, gidDict, bids, hashedList, n, b, r, candidateTuple, listvala, listvalb
    print("Initializing...")

    t = time.time()
    candidateList = []
    frequentList = []
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    # sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: (line) != columnName)
    #  column name is userid, businessid, starts

    userids = list(set(sorted(items.keys().collect())))
    k = 0
    for user in userids:
        if (user not in gidDict):
            gidDict[user] = k
            k = k + 1
    # print(k)

    bids = list(set(sorted(items.values().collect())))
    # bids = copy.copy(sorted(bids))
    # print(len(bids))
    m = len(userids)
    listvala = random.sample(range(1, m), n)
    listvalb = random.sample(range(1, m), n)
    bid_uid = items.map(lambda x: ((x[1], x[0]), 1)).reduceByKey(
        lambda x, y: x + y).map(lambda x: (x[0])).groupByKey().mapValues(list)

    bid_uid_hashed = bid_uid.map(lambda x: initialHash(x))

    dict_uniques = {}
    for each in bid_uid.collect():
        dict_uniques[each[0]] = set(each[1])

    bid_uid_hashed2 = bid_uid_hashed.map(lambda x: hashing(x))
    # print(bid_uid_hashed2.first())

    # creating signature matrix column per business IDs
    start = 0
    end = r
    tempSim = []
    finalList = []
    hashedListSet = bid_uid_hashed2.collect()
    length = len(hashedListSet)
    c = 1
    print("Finding similar pairs...")

    dictionEvery = {}

    while (end <= n):
        tempDict = []
        for each in hashedListSet:
            templist = sorted(each[1][start:end])
            tempDict.append((tuple(templist), each[0]))
            # tempDict.append((tuple(each[1][start:end]), each[0]))
        dictionEvery[c] = tempDict
        c = c + 1
        start = end
        end = end + r
    dictionaryCheck = {}

    # for i in range(1, b+1):
    #     dictionaryCheck = {}
    #     for i in range(0, )

    length = len(dictionEvery[1])
    candidateset = []
    candidateTuple = []
    print("Working on Bands 1 to 40 ")
    for i in range(1, b + 1):
        justBid = []
        dictionBand = dictionEvery[i]
        # print("Working on Band: "+str(i))
        mapper = sc.parallelize(dictionBand).groupByKey().mapValues(
            list).filter(lambda x: (len(x[1]) > 1))

        justBid = mapper.map(lambda c: c[1]).collect()
        candidateTuple.append(justBid)
        # print(justBid)

    # print(len(candidateTuple[0]))
    # print(len(candidateTuple[1]))
    # print((candidateTuple[1]))

    candidateset = (candidateTuple)  # it was list(set(candidateTuple))

    candidatepairs = []
    count = 0
    for each in candidateset:
        for e in each:
            l1 = list(combinations(sorted(e), 2))
            candidatepairs.extend(l1)

    candPairSet = []  # set()

    candPairSet = (candidatepairs)  # it was list(set(candidatepairs))
    lines = []
    print("Finding final Jaccard Simmilarity")
    finalPairs = []
    for each in candPairSet:
        set1 = dict_uniques[each[0]]
        set2 = dict_uniques[each[1]]
        inter = set1 & set2
        # print(len(inter), len(set1), len(set2))
        jaccard = (float(len(inter))) / (float(len(set1.union(set2))))
        # print(jaccard)
        if (jaccard >= 0.5):
            # print(jaccard)
            lines.append([each[0], each[1], jaccard])
            finalPairs.append(each)

    # print(len(list(set(finalPairs))))
    # print(len((finalPairs)))
    answer = writeToFile(lines)
    # calculatingPreRec(lines)
    print("Total Items Printed: " + str(answer))
    print("Duration: " + str(time.time() - t))
Code example #25
from pyspark.sql.types import StringType, StructType, StructField
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split, max
sc = SparkContext('local')
sc.setLogLevel("OFF")
spark = SparkSession(sc)
# Path to our 20 JSON files
inputPath = "hdfs://localhost:9000/stream/"
#inputPath = "./stream/"
# Explicitly set schema
schema = StructType([
    StructField("ID", StringType(), True),
    StructField("Lang", StringType(), True),
    StructField("Date", StringType(), True),
    StructField("Source", StringType(), True),
    StructField("Len", StringType(), True),
    StructField("Likes", StringType(), True),
    StructField("RTs", StringType(), True),
    StructField("Hashtags", StringType(), True),
    StructField("UserMentionNames", StringType(), True),
    StructField("UserMentionID", StringType(), True),
    StructField("name", StringType(), True),
    StructField("Place", StringType(), True),
    StructField("Followers", StringType(), True),
    StructField("Friends", StringType(), True)
])

inputDF = spark.readStream.schema(schema).option("delimiter", ";").option(
    "maxFilesPerTrigger", 1).csv(inputPath)
Code example #26
    shifts = running_word_prices.transform(to_shifts)

    # Print the results
    shifts.foreachRDD(print_shifts)
            
if __name__ == "__main__":

    if len(sys.argv) >= 2 and sys.argv[1] == "test":
        # Run the tests
        del sys.argv[1]

        conf = SparkConf().set("spark.default.parallelism", 1)

        sc = SparkContext(appName='unit_test', conf=conf)

        sc.setLogLevel("WARN")

        sc.setCheckpointDir("/tmp")

        unittest.main()

        sc.stop()

    else:
        # Run the main()
        sc = SparkContext(appName="BoostWords")

        sc.setLogLevel("WARN")

        ssc = StreamingContext(sc, 5)