def __init__(self, sparkContext, jsparkSession=None):
        """Creates a new SnappySession.
        """
        self._sc = sparkContext
        self._jsc = self._sc._jsc
        self._jvm = self._sc._jvm
        SparkSession.__init__(self, sparkContext)
        if jsparkSession is None:
            jsparkSession = self._jvm.SnappySession(self._jsc.sc())

        self._jsparkSession = jsparkSession
Example #2
import re
import sys
import datetime
# configure spark variables
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.session import SparkSession

sc = SparkContext()
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

# load up other dependencies
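# command-line arguments: resource-manager log, application log, and the
# inclusive application-id range (start, end) to keep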

manager_log = sys.argv[1]
application_log = sys.argv[2]
start = sys.argv[3]
end = sys.argv[4]

# lines = sc.textFile(manager_log)
# app_lines = sc.textFile(application_log)
#
# app_ids_filter = app_lines.filter(lambda x: re.search(r'1580812675067_\d+', x))\
#                  .map(lambda x: re.search(r'1580812675067_\d+', x).group(0))\
#                  .filter(lambda x: (int(re.search(r'1580812675067_(\d+)', x).group(1)) <= int(end)) & (int(re.search(r'1580812675067_(\d+)', x).group(1)) >= int(start)))
#
# app_ids_filter_list = app_ids_filter.distinct().collect()
#
# regex_ids_filter = re.compile("|".join(re.escape(app_id) for app_id in app_ids_filter_list))
#
# filtered_lines = lines.filter(lambda x: regex_ids_filter.search(x)).cache()
Example #3
#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import datetime
import sys

reload(sys)
sys.setdefaultencoding('utf-8')
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import Row

sc = SparkContext('local')
spark = SparkSession(sc)
print "begin to map input"
train_set = sc.textFile("gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/user_url_combine_data_v2/part-*.csv").map( \
    lambda row: row.split(",", 4)).map(lambda p: Row(label=int(p[0]), ts=p[1], uid=int(p[2]), urlid=int(p[3]), urlinfo=p[4], forcount=1))

print train_set.take(5)
print "finish map input"

# get url show click

train_set_d = spark.createDataFrame(train_set)

train_set_d.createOrReplaceTempView("train_set")

sql_query = """
SELECT urlid, label, forcount
FROM train_set
"""
Example #4
OUTPUT_BUCKET_FOLDER = "/outbrain/preprocessed/"
DATA_BUCKET_FOLDER = "/outbrain/orig/"
SPARK_TEMP_FOLDER = "/outbrain/spark-temp/"

from pyspark.sql.types import IntegerType, StringType, StructType, StructField 
import pyspark.sql.functions as F

from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col

conf = SparkConf().setMaster('local[*]').set('spark.executor.memory', '40g').set('spark.driver.memory', '200g').set("spark.local.dir", SPARK_TEMP_FOLDER)

sc = SparkContext(conf=conf)
spark = SparkSession(sc)

print('Loading data...')

events_schema = StructType(
                    [StructField("display_id", IntegerType(), True),
                    StructField("uuid_event", StringType(), True),                    
                    StructField("document_id_event", IntegerType(), True),
                    StructField("timestamp_event", IntegerType(), True),
                    StructField("platform_event", IntegerType(), True),
                    StructField("geo_location_event", StringType(), True)]
                    )

events_df = spark.read.schema(events_schema) \
  .options(header='true', inferschema='false', nullValue='\\N') \
  .csv(DATA_BUCKET_FOLDER + "events.csv")
Example #5
# In[6]:


from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import VectorAssembler


# Creating spark context and starting a session

# In[7]:


sc = SparkContext.getOrCreate()
spark = SparkSession(sc)


# Reading the data

# In[8]:


lines = sc.textFile(r"F:\Docs\Big data\Assignment\Assignmnet 4\Dataset\pumsb.dat")


# creating a 2d list from the data read. We are skipping the first attribute.

# In[48]:
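# A minimal sketch of the step described above (assumed, since the original cell
# is cut off): split each line into integers and drop the first attribute.
data_2d = lines.map(lambda line: [int(v) for v in line.split()][1:])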

Example #6
from pyspark.sql.types import StringType, StructType, StructField
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split, max
sc = SparkContext('local')
sc.setLogLevel("OFF")
spark = SparkSession(sc)
# Path to our 20 JSON files
inputPath = "hdfs://localhost:9000/stream/"
#inputPath = "./stream/"
# Explicitly set schema
schema = StructType([
    StructField("ID", StringType(), True),
    StructField("Lang", StringType(), True),
    StructField("Date", StringType(), True),
    StructField("Source", StringType(), True),
    StructField("Len", StringType(), True),
    StructField("Likes", StringType(), True),
    StructField("RTs", StringType(), True),
    StructField("Hashtags", StringType(), True),
    StructField("UserMentionNames", StringType(), True),
    StructField("UserMentionID", StringType(), True),
    StructField("name", StringType(), True),
    StructField("Place", StringType(), True),
    StructField("Followers", StringType(), True),
    StructField("Friends", StringType(), True)
])

inputDF = spark.readStream.schema(schema).option("delimiter", ";").option(
    "maxFilesPerTrigger", 1).csv(inputPath)
Example #7
import re
import sys
import os
from pyspark import SparkContext
from collections import defaultdict
from operator import add
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import mean
from pyspark.sql.types import *
# sc.stop()

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

input_1 = sys.argv[1]
input_2 = sys.argv[2]
input_3 = sys.argv[3]

output_ = sys.argv[4]
f = open(output_,'w')

line1 = sc.textFile(input_1)
line2 = sc.textFile(input_2)
line3 = sc.textFile(input_3)

#-------------------------------------------------------------
ds1 = line1.map(lambda x: x.split(",")).collect()
ds1 = sc.parallelize(ds1)
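# note: collect() followed by parallelize() round-trips the data through the
# driver; the mapped RDD could be used directly instead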
sch1 = StructType([StructField("Drinker", StringType(), True),
                                    StructField("Beer", StringType(), True)])
df1 = spark.createDataFrame(ds1, sch1)
Example #8
                
            try:
                house_number_ticket = cleanTuple(tuple((row[23].split('-'))))
                year = int(row[4][-4:])
            except:
                continue
            if house_number_ticket and borough_ticket and street_name_ticket and (year in range(2015,2020)):
                yield(year, house_number_ticket[0], house_number_ticket[1], borough_ticket, street_name_ticket, house_number_ticket[1]%2 == 0)

if __name__=='__main__':
    t = time.localtime()
    current_time = time.strftime("%H:%M:%S", t)
    print("***START***")
    print(current_time)
    sc = SparkContext()
    spark = SparkSession(sc)

    tickets = sc.textFile('hdfs:///tmp/bdm/nyc_parking_violation/')
    #loading parking tickets and creating dataframe
    parking_ticket_clean = tickets.mapPartitionsWithIndex(processTickets)
    parking_tickets_df = spark.createDataFrame(parking_ticket_clean, ('year','house_number_1','house_number_2' ,'boro','street_name','even_flag'))
    
    #loading centerline segments with name and label
    centerlines = sc.textFile('hdfs:///tmp/bdm/nyc_cscl.csv')
    centerline_all = centerlines.mapPartitionsWithIndex(processCenterline)

    #get full list of centerline physicalID and create dataframe 
    centerline_full_id_only = centerlines.mapPartitionsWithIndex(getPhysicalID).distinct()
    centerline_base = spark.createDataFrame(centerline_full_id_only, ('ID','dummy'))

    #stacking centerline name + label but only keep the distinct values, save into a dataframe
Example #9
def InitSpark():
    # Creating spark context and starting a session
    sc = SparkContext.getOrCreate()
    spark = SparkSession(sc)
    return spark, sc

def get_month(date):
    try:
        return int(date.split('/')[-2])
    except:
        return 0


if __name__ == '__main__':
    conf = SparkConf().set("spark.master", 'spark://10.190.2.112:7077').set('spark.app.name', 'task_14307110005') \
        .set('spark.default.parallelism', '15').set('spark.executor.cores', '2').set('spark.executor.memory', '8g') \
        .set('spark.executor.instances', '3')
    sc = SparkContext(conf=conf)
    sc.setLogLevel('WARN')
    spark = SparkSession(sc)

    data = sc.textFile('hdfs://10.190.2.112/data/data_dump.txt')
    data = data.map(lambda x: x.split('\t'))
    data = data.map(lambda line: (line[0], get_month(line[8]), line[11]))

    schema = StructType([
        StructField('uid', StringType(), False),
        StructField('month', StringType()),
        StructField('city', StringType(), True)
    ])
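    # note: get_month() returns an int while the schema declares 'month' as
    # StringType; IntegerType would match the mapped values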

    table = spark.createDataFrame(data, schema)
    table.createOrReplaceTempView('Table')

    top10city = spark.sql(
Example #11
 def __init__(self, sparkContext, jhiveContext=None):
     if jhiveContext is None:
         sparkSession = SparkSession.withHiveSupport(sparkContext)
     else:
         sparkSession = SparkSession(sparkContext, jhiveContext.sparkSession())
     SQLContext.__init__(self, sparkContext, sparkSession, jhiveContext)
Example #12
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext('local')
spark = SparkSession(sc)

lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]),
                                     movieId=int(p[1]),
                                     rating=float(p[2]),
                                     timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
als = ALS(maxIter=5,
          regParam=0.01,
          implicitPrefs=True,
          userCol="userId",
          itemCol="movieId",
          ratingCol="rating")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")  # completion assumed; original is truncated here
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))
Example #13
def test_list_of_structs(spark: SparkSession, tmp_path: Path):
    test_dir = str(tmp_path)
    schema = StructType([
        StructField("id", IntegerType(), False),
        StructField(
            "anno",
            ArrayType(
                StructType([
                    StructField("label_id", IntegerType(), False),
                    StructField("label", StringType(), False),
                    StructField("bbox", ArrayType(IntegerType()), False),
                ])),
            False,
        ),
    ])
    df = spark.createDataFrame(
        [
            {
                "id":
                1,
                "anno": [
                    {
                        "label": "cat",
                        "label_id": 1,
                        "bbox": [1, 2, 3, 4]
                    },
                    {
                        "label": "dog",
                        "label_id": 2,
                        "bbox": [10, 23]
                    },
                ],
            },
            {
                "id":
                2,
                "anno": [
                    {
                        "label": "bug",
                        "label_id": 3,
                        "bbox": [100, 200]
                    },
                    {
                        "label": "aaa",
                        "label_id": 4,
                        "bbox": [-1, -2, -3]
                    },
                ],
            },
        ],
        schema=schema,
    )
    df.repartition(1).write.mode("overwrite").format("rikai").save(test_dir)

    records = _read_parquets(test_dir)
    for expect, actual in zip(
        [
            {
                "id":
                1,
                "anno": [
                    {
                        "label": "cat",
                        "label_id": 1,
                        "bbox": np.array([1, 2, 3, 4], dtype=np.int32),
                    },
                    {
                        "label": "dog",
                        "label_id": 2,
                        "bbox": np.array([10, 23], dtype=np.int32),
                    },
                ],
            },
            {
                "id":
                2,
                "anno": [
                    {
                        "label": "bug",
                        "label_id": 3,
                        "bbox": np.array([100, 200], dtype=np.int32),
                    },
                    {
                        "label": "aaa",
                        "label_id": 4,
                        "bbox": np.array([-1, -2, -3], dtype=np.int32),
                    },
                ],
            },
        ],
            records,
    ):
        assert expect["id"] == actual["id"]
        assert len(expect["anno"]) == len(actual["anno"])
        assert np.array_equal(expect["anno"][0]["bbox"],
                              actual["anno"][0]["bbox"])
Example #14
# coding=utf-8

from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql import Window, functions

if __name__ == '__main__':
    conf = SparkConf().set("spark.master", 'spark://10.190.2.112:7077').set('spark.app.name', 'task_14307110005') \
        .set('spark.default.parallelism', '15').set('spark.executor.cores', '2').set('spark.executor.memory', '8g') \
        .set('spark.executor.instances', '3')
    sc = SparkContext(conf=conf)
    sc.setLogLevel('WARN')
    spark = SparkSession(sc)

    data = sc.textFile('hdfs://10.190.2.112/data/data_dump.txt')
    data = data.map(lambda x: x.split('\t'))
    data = data.map(lambda line: (line[0], line[2], line[11]))

    schema = StructType([
        StructField('uid', StringType(), False),
        StructField('name', StringType()),
        StructField('city', StringType(), True)
    ])

    table = spark.createDataFrame(data, schema)
    table.createOrReplaceTempView('Table')
    spark.sql('''
    select * from
            (select city,name,nb_name,rank() over(partition by city order by nb_name desc) as rk
            from
Example #15
#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import datetime
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
sc = SparkContext('local')
spark = SparkSession(sc)
print "begin to map input"
train_set = sc.textFile("gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/user_url_combine_data_v2/part-00000-eab41fe7-4a1c-46e5-b995-6beba43de164-c000.csv").map( \
    lambda row: row.split(",", 4)).map(lambda p: Row(label=int(p[0]), ts=p[1], uid=int(p[2]), urlid=int(p[3]), urlinfo=p[4]))

print train_set.take(5)
print "finish map input"

# get url show click

train_set_d = spark.createDataFrame(train_set)

train_set_d.createOrReplaceTempView("train_set")

sql_query = """
SELECT uid, urlid, label
FROM train_set
"""
Example #16
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import functions as F
from normalize import get_min_max
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import UserDefinedFunction
import json
import os
from pyspark import SparkConf, SparkContext

conf = SparkConf().set('spark.executor.memory',
                       '4g').set('spark.driver.memory',
                                 '8g')  # needs more memory
print(conf.toDebugString())
sc = SparkContext(appName='Clustering', conf=conf)
spark = SparkSession(sc)

# Loads data.
df = spark.read.option("header", "true").csv("/user/root/data/*.csv")
df_notnull = df.filter(
    F.col("lon").isNotNull() & F.col("lat").isNotNull()
    & F.col('P1').isNotNull() & F.col('timestamp').isNotNull())
df = df_notnull
df_timestamp = df.withColumn('timestamp', df['timestamp'].substr(1, 7))
df = df_timestamp
timestamp = df.collect()[0][5]

features = ['P1', 'lon', 'lat']
vector_assembler = VectorAssembler(inputCols=features, outputCol="features")

# Cast feature columns to double
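# Sketch of the cast described above (assumed; the original lines are truncated):
for feature in features:
    df = df.withColumn(feature, df[feature].cast(DoubleType()))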
Example #17
import pandas as pd
import json
from pandas import DataFrame
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

#Create Pandas Dataframe
companies_data = pd.read_json('test.json')
dataFrame = pd.DataFrame(companies_data)
print(dataFrame.T)

#manipulate Pandas dataframe and write back to json file

change_companies_data = dataFrame.loc['exchange_code']= 123456
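# chained assignment: change_companies_data and the new/updated 'exchange_code' row both receive 123456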
json_rewrite = dataFrame.to_json('test.json')
dataFrameTranspose = dataFrame.T

#Convert pandas dataframe to spark dataframe
spark_dataframe = spark.createDataFrame(dataFrameTranspose.astype(str))
spark_dataframe.show()

#Convert Spark dataframe to pandas dataframe 
spark_to_panda_dataframe = spark_dataframe.toPandas()
print(spark_to_panda_dataframe)



Example #18
    import numpy as np
    x, y = np.array(x), np.array(y)
    xm = np.mean(x)
    ym = np.mean(y)
    # least-squares slope: numerator is Sxy, denominator is Sxx
    s_xx = sum((x - xm)**2)
    s_xy = sum((y - ym) * (x - xm))
    coef = s_xy / s_xx

    return coef


if __name__ == "__main__":
    start_time = time.time()
    output = sys.argv[1]
    sc = SparkContext()
    spark = SparkSession(sc)

    street1 = sc.textFile(
        'hdfs:///tmp/bdm/nyc_cscl.csv').mapPartitionsWithIndex(processStreet)
    violation = sc.textFile('hdfs:///tmp/bdm/nyc_parking_violation/'
                            ).mapPartitionsWithIndex(processViolation)

    viola = spark.createDataFrame(
        violation, ('year', 'street', 'boro', 'house_number', 'is_left'))
    stre = spark.createDataFrame(
        street1, ('physicalID', 'street', 'boro', 'low', 'high', 'is_left'))
    stre = stre.distinct()
    filtering = [
        viola.boro == stre.boro,
        viola.street == stre.street, viola.is_left == stre.is_left,
        (viola.house_number >= stre.low) & (viola.house_number <= stre.high)
    ]
Example #19
def process(i):
    if ',' in str(i):  # assumption: the original condition was lost; quote fields containing the delimiter
        return "\"{}\"".format(i)
    else:
        return str(i)


def to_csv(rdd):
    li = map(process, rdd)
    return ','.join(li)


if __name__ == "__main__":
    start_time = time.time()
    output = sys.argv[1]

    sc = SparkContext()
    spark = SparkSession(sc)

    centerline = sc.textFile('hdfs:///tmp/bdm/nyc_cscl.csv')

    rdd_cl = centerline.mapPartitionsWithIndex(processCenterline)

    violations = sc.textFile('hdfs:///tmp/bdm/nyc_parking_violation/')

    rdd_v = violations.mapPartitionsWithIndex(processViolation)

    v = spark.createDataFrame(rdd_v,
                              ('year', 'house', 'street', 'boro', 'is_left'))

    cl = spark.createDataFrame(
        rdd_cl, ('pysicalID', 'street', 'boro', 'low', 'high', 'is_left'))
Example #20
def main():
    sc = SparkContext("local", "dataframe app")
    sc.setLogLevel("ERROR")
    spark = SparkSession(sc)

    #load the retail dataset
    retail_data = spark.read.option("inferSchema", "true").option(
        "header", "true"
    ).option("timestampFormat", "dd/M/yyyy H:mm").csv(
        "/Users/faizan/Documents/Masters/2nd_Semester/Big_Data/Tutorial/Tutorials/Tutorial02/online-retail-dataset.csv"
    )
    retail_data.show()

    #Question 1
    #How many orders did customers perform at which hour?

    # a) SQL
    retail_data.createOrReplaceTempView("retailTable")

    result = spark.sql("""
    SELECT hour(InvoiceDate) as InvoiceHour, count(distinct InvoiceNo) as NoInvoices
    FROM retailTable
    GROUP BY InvoiceHour
    ORDER BY InvoiceHour
    """)
    result.show()

    # b) Spark
    result = retail_data.selectExpr(
        "hour(InvoiceDate) as InvoiceHour",
        "InvoiceNo").distinct().groupBy("InvoiceHour").agg(
            f.expr("count(InvoiceNo) as NoInvoices")).orderBy("InvoiceHour")
    result.show()

    #Question 2
    #How frequently was each product bought in the different countries?

    # a) SQL
    df_selection = retail_data.selectExpr("Country", "StockCode", "Quantity")
    df_nonull = df_selection.na.replace(
        [""], ["UNKNOWN"], "StockCode").na.replace([""], ["UNKNOWN"],
                                                   "Country").na.drop("any")
    df_nonull.createOrReplaceTempView("retailNoNull")

    result = spark.sql("""
    SELECT Country, StockCode, sum(Quantity) as Quantity
    FROM retailNoNull
    GROUP BY Country, StockCode
    GROUPING SETS ((Country, StockCode), (Country), (StockCode), ())
    ORDER BY Country, StockCode
    """)
    result.show()

    # b) Spark
    result = df_nonull.cube("Country", "StockCode").agg(
        f.sum("Quantity").alias("Quantity")).orderBy(f.col("Country"),
                                                     f.col("StockCode"))
    result.show()

    result.coalesce(1).write.format("csv").option("header", "true").save(
        "/Users/faizan/Documents/Masters/2nd_Semester/Big_Data/Tutorial/Tutorials/Tutorial03/frequencies"
    )
    sc.stop()
Example #21
        intp.saveDFToCsv(
            df._jdf, path, hasheader, isOverwrite,
            MapConverter().convert(option, gateway._gateway_client))
    else:
        print(str(df))


java_import(gateway.jvm, "scala.Tuple2")

jsc = intp.getJavaSparkContext()
jconf = intp.getSparkConf()
conf = SparkConf(_jvm=gateway.jvm, _jconf=jconf)
sc = SparkContext(jsc=jsc, gateway=gateway, conf=conf)
sqlc = HiveContext(sc, intp.sqlContext())
sqlContext = sqlc
spark = SparkSession(sc, intp.getSparkSession())

##add pyfiles
try:
    pyfile = sys.argv[5]
    pyfiles = pyfile.split(',')
    for i in range(len(pyfiles)):
        if "" != pyfiles[i]:
            sc.addPyFile(pyfiles[i])
except Exception as e:
    print("add pyfile error: " + pyfile)


class UDF(object):
    def __init__(self, intp, sqlc):
        self.intp = intp
Example #22
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier


# COMMAND ----------

# MAGIC %md ##### The command below starts a Spark session when the file is run in Oracle BDE. In Databricks, keep IS_SPARK_SUBMIT_CLI set to False (its default); when running the file in Oracle BDE, set it to True.

# COMMAND ----------

# This flag selects between the spark-submit (Oracle BDE) and Databricks notebook environments.
IS_SPARK_SUBMIT_CLI = False
if IS_SPARK_SUBMIT_CLI:
    sc = SparkContext.getOrCreate()
    spark = SparkSession(sc)

# COMMAND ----------

# MAGIC %md ### Load Source Data
# MAGIC The data for this project is provided as a CSV file containing details of advertisement. The data includes specific characteristics (or *features*) for each ad, as well as a *label* column indicating whether the ad was clicked or not.
# MAGIC 
# MAGIC You will load this data into a DataFrame and display it.

# COMMAND ----------

# MAGIC %md #####Reading all necessary csv file

# COMMAND ----------

if IS_SPARK_SUBMIT_CLI:
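    # Hypothetical continuation (the original cell is cut off here): read the
    # advertisement CSV into a DataFrame and display it, as described above.
    ad_data = spark.read.csv('advertising.csv', inferSchema=True, header=True)  # file name assumed
    ad_data.show()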
Example #23
def main(sc):
    spark = SparkSession(sc)
    sqlContext = SQLContext(sc)

    years = ['2015', '2016', '2017', '2018', '2019']
    def parseCSV(idx, part):
        if idx==0:
            next(part)
        for p in csv.reader(part):
            if p[23].isalpha() or p[24] == '' or p[21] == '' or p[23] == '' or p[4][-4:] not in years:
                continue
            if '-' in p[23]:
                yield(p[23].split('-')[0], p[23].split('-')[1], p[24].lower(), p[21], p[4][-4:])
            else:
                yield(p[23], '', p[24].lower(), p[21], p[4][-4:])

    rows = sc.textFile('/data/share/bdm/nyc_parking_violation/*.csv', use_unicode=True).mapPartitionsWithIndex(parseCSV)

    df = sqlContext.createDataFrame(rows, ('House Number', 'HN Compound', 'Street Name', 'County', 'Date'))

    map_NY = (col("County")=='NY')|\
            (col("County")=='MAN')|\
            (col("County")=='MH')|\
            (col("County")=='MN')|\
            (col("County")=='NEWY')|\
            (col("County")=='NEW Y')

    map_BX = (col("County")=='BRONX')|\
            (col("County")=='BX')

    map_BK = (col("County")=='BK')|\
            (col("County")=='K')|\
            (col("County")=='KING')|\
            (col("County")=='KINGS')

    map_QN = (col("County")=='Q')|\
            (col("County")=='QN')|\
            (col("County")=='QNS')|\
            (col("County")=='QU')|\
            (col("County")=='QUEEN')

    map_R = (col("County")=='R')|\
            (col("County")=='RICHMOND')

    df = df.withColumn("County", when(map_NY, '1')
                                .when(map_BX, '2')
                                .when(map_BK, '3')
                                .when(map_QN, '4')
                                .when(map_R, '5')
                                .otherwise('')).where(col('County')!='')
    df = df.withColumn("House Number", df["House Number"].cast('int'))
    df = df.withColumn("HN Compound", df["HN Compound"].cast('int'))

    def parseCL(idx, part):
        if idx==0:
            next(part)
        for p in csv.reader(part):
            LL_HN = p[2]
            LL_HNC = ''
            LH_HN = p[3]
            LH_HNC = ''
            if '-' in p[2] and '-' in p[3]:
                LL_HN = p[2].split('-')[0]
                LL_HNC = p[2].split('-')[1]
                LH_HN = p[3].split('-')[0]
                LH_HNC = p[3].split('-')[1]

            RL_HN = p[4]
            RL_HNC = ''
            RH_HN = p[5]
            RH_HNC = ''
            if '-' in p[4] and '-' in p[5]:
                RL_HN = p[4].split('-')[0]
                RL_HNC = p[4].split('-')[1]
                RH_HN = p[5].split('-')[0]
                RH_HNC = p[5].split('-')[1]
            yield(p[0], p[28].lower(), p[29].lower(), p[13], LL_HN, LL_HNC, LH_HN, LH_HNC, RL_HN, RL_HNC, RH_HN, RH_HNC)

    rows = sc.textFile('/data/share/bdm/nyc_cscl.csv', use_unicode=True).mapPartitionsWithIndex(parseCL)

    centerline = sqlContext.createDataFrame(rows, ('ID', 'full street', 'st label', 'borocode', 'LL_HN', 'LL_HNC', 'LH_HN', 'LH_HNC', 'RL_HN', 'RL_HNC', 'RH_HN', 'RH_HNC'))
    centerline = centerline.withColumn("LL_HN", centerline["LL_HN"].cast('int'))
    centerline = centerline.withColumn("LH_HN", centerline["LH_HN"].cast('int'))
    centerline = centerline.withColumn("RL_HN", centerline["RL_HN"].cast('int'))
    centerline = centerline.withColumn("RH_HN", centerline["RH_HN"].cast('int'))
    centerline = centerline.withColumn("LL_HNC", centerline["LL_HNC"].cast('int'))
    centerline = centerline.withColumn("LH_HNC", centerline["LH_HNC"].cast('int'))
    centerline = centerline.withColumn("RL_HNC", centerline["RL_HNC"].cast('int'))
    centerline = centerline.withColumn("RH_HNC", centerline["RH_HNC"].cast('int'))
    print('Data loaded')
    cond1 = (df['Street Name'] == centerline['full street'])
    cond2 = (df['Street Name'] == centerline['st label'])
    cond3 = (df['County'] == centerline['borocode'])
    cond4 = (df['House Number'] % 2 == 1)
    cond5 = (df['House Number'] >= centerline['LL_HN']) & (df['House Number'] <= centerline['LH_HN'])
    cond6 = (df['House Number'] % 2 == 0)
    cond7 = (df['House Number'] >= centerline['RL_HN']) & (df['House Number'] <= centerline['RH_HN'])
    cond8 = cond4 & cond5
    cond9 = cond6 & cond7

    hnc_cond1 = (df['HN Compound'].isNotNull())
    hnc_cond2 = (df['HN Compound'].isNull())
    hnc_cond3 = ((df['HN Compound'] >= centerline['LL_HNC']) & (df['HN Compound'] <= centerline['LH_HNC']))
    hnc_cond4 = ((df['HN Compound'] >= centerline['RL_HNC']) & (df['HN Compound'] <= centerline['RH_HNC']))


    cond10 = (hnc_cond2 & (cond8|cond9))
    cond11 = (hnc_cond1 & (cond8|cond9) & (hnc_cond3|hnc_cond4))

    joined = df.join(centerline, ((cond1|cond2) & cond3 & (cond10|cond11)), "inner")
    joined = joined.select(col('ID'), col('Date'))
    count_df = joined.groupBy(['ID', 'Date']).pivot('Date').count().drop('Date')
    print('Table pivoted')
    allID = centerline.select(col('ID')).dropDuplicates()
    result = allID.join(count_df, on=["ID"], how='outer').na.fill(0)

    marksColumns = [col('2015'), col('2016'), col('2017'), col('2018'), col('2019')]
    diff_x = [-2, -1, 0, 1, 2]

    average_func = sum(x for x in marksColumns)/len(marksColumns)
    result = result.withColumn("avg", average_func)
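    # the divisor 10 below is the sum of squared offsets in diff_x (4+1+0+1+4), i.e. the OLS denominator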
    ols_func = sum(diff*(y - col('avg')) for diff, y in zip(diff_x, marksColumns))/10
    coef = result.withColumn("OLS_COEF", ols_func).drop('avg')

    coef.rdd.map(writeToCSV).saveAsTextFile(sys.argv[1])
Example #24
#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import datetime
import sys

reload(sys)
sys.setdefaultencoding('utf-8')
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import array

sc = SparkContext('local')
spark = SparkSession(sc)
print "begin to map input"

fieldSchema = StructType([
    StructField("label", IntegerType(), True),
    StructField("pdef", DoubleType(), True),
    StructField("pbeau", DoubleType(), True),
    StructField("pnum", IntegerType(), True),
    StructField("s_term", StringType(), True),
    StructField("sumclick", LongType(), True),
    StructField("sumshow", LongType(), True),
    StructField("ts", LongType(), True),
    StructField("uid", LongType(), True),
    StructField("urlid", LongType(), True)
])
Example #25
#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import datetime
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
sc = SparkContext('local')
spark = SparkSession(sc)
fieldSchema = StructType([
    StructField("ctr", DoubleType(), True),
    StructField("label", IntegerType(), True),
    StructField("pdef", DoubleType(), True),
    StructField("pbeau", DoubleType(), True),
    StructField("pnum", IntegerType(), True),
    StructField("s_term_score", DoubleType(), True),
    StructField("sumclick", LongType(), True),
    StructField("sumshow", LongType(), True),
    StructField("uid", LongType(), True)
])
print "begin to map input"
train_set = spark.read.csv(
    "gs://dataproc-1228d533-ffe2-4747-a056-8cd396c3db5f-asia-southeast1/data/picfeed/train_feature_compose_new/part-*",
    schema=fieldSchema)
Example #26
import findspark
findspark.init()

from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext('local')
spark = SparkSession(sc)

df = spark.createDataFrame([
    (
        7,
        Vectors.dense([0.0, 0.0, 18.0, 1.0]),
        1.0,
    ),
    (
        8,
        Vectors.dense([0.0, 1.0, 12.0, 0.0]),
        0.0,
    ),
    (
        9,
        Vectors.dense([1.0, 0.0, 15.0, 0.1]),
        0.0,
    ),
], ["id", "features", "clicked"])

selector = ChiSqSelector(numTopFeatures=1,
                         featuresCol="features",
                         outputCol="selected",
                         labelCol="clicked")  # completion assumed; original is truncated here
result = selector.fit(df).transform(df)
result.show()
Example #27
#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import datetime
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import array
sc = SparkContext('local')
spark = SparkSession(sc)
print "begin to map input"
fieldSchema = StructType([
    StructField("label", IntegerType(), True),
    StructField("pdef", DoubleType(), True),
    StructField("pbeau", DoubleType(), True),
    StructField("pnum", IntegerType(), True),
    StructField("s_term", StringType(), True),
    StructField("sumclick", LongType(), True),
    StructField("sumshow", LongType(), True),
    StructField("ts", LongType(), True),
    StructField("uid", LongType(), True),
    StructField("urlid", LongType(), True),
    StructField("user_s_term", StringType(), True)
])
train_set_join_user_model = spark.read.csv(
    "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/train_set_join_user_model/part-00000-59d90ec7-6a27-4356-901d-ea40b3333c49-c000.csv",
    schema=fieldSchema)
Example #28
# Samuel Tribe - 201318996 - [email protected]
from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as F
from pyspark.sql.session import SparkSession
from pyspark.sql.types import DateType
conf = SparkConf().setAppName("covid19").setMaster("local")
spark = SparkSession(SparkContext(conf=conf))
csvPath = r"C:\spark\COMP336-Coursework-1\data\covid19.csv"
covidDF = spark.read.csv(csvPath,header=True,inferSchema=True)
covidDF = covidDF.withColumn("date", F.col("date").cast(DateType()))
print("covid19.csv read as Dataframe with header=True")
covidDF.show()
print("Schema for dataframe")
covidDF.printSchema()
print("Filtering out NULL values from dataframe")
covidDF = covidDF.filter(covidDF.continent.isNotNull() & covidDF.location.isNotNull() & covidDF.date.isNotNull() & covidDF.total_cases.isNotNull() & covidDF.new_cases.isNotNull() & covidDF.total_deaths.isNotNull() & covidDF.new_deaths.isNotNull())
covidDF.show()
print("Highest deaths per country")
covidDF.groupBy(['location']).agg(F.max(covidDF.total_deaths)).show()
print("max and min function results on total_cases")
covidDF.groupBy(['location']).agg(F.max(covidDF.total_cases).alias('total_cases_max'), F.min(covidDF.total_cases).alias('total_cases_min')).show()
Example #29
import findspark
findspark.init()

from pyspark.ml.linalg import  Vectors
from pyspark.ml.stat import Correlation
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext('local')
spark = SparkSession(sc)



data = [(Vectors.sparse(4, [(0,1.0), (3,-2.0)]),),
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
        (Vectors.sparse(4, [(0, 9.0) , (3, 1.0)]),)]

df = spark.createDataFrame(data, ["features"])

r1 = Correlation.corr(df, "features").head()
print("Pearson correlation matrrix:\n" + str(r1[0]))

r2 = Correlation.corr(df, "features", "spearman").head()
print("Spearman correlation matrrix:\n" + str(r2[0]))
Example #30
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType

from pyspark.streaming import StreamingContext

sc = SparkContext('local[2]', 'timeS')
ssc = StreamingContext(sc, 1)

spark = SparkSession(sc)

data_file = ssc.socketTextStream("localhost", 9999)

from pyspark.sql.window import Window
from pyspark.sql import functions as func
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series
import pyspark.sql as sparksql
import numpy as np
import time
from pprint import pprint
days = lambda i: i * 86400

data_file = "./export.csv"
raw_data = sc.textFile(data_file)
csv_data = raw_data.map(lambda x: x.split(","))
#csv_data.toDF().show()
Example #32
#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import datetime
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
sc = SparkContext('local')
spark = SparkSession(sc)
print "begin to map input"
train_set = sc.textFile("gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/user_url_combine_data_v2/part-00000-eab41fe7-4a1c-46e5-b995-6beba43de164-c000.csv").map(lambda row: row.split(",", 4)).map(lambda p: Row(uid=p[0], urlid=p[1], ts=p[2], label=p[3], urlinfo=p[4]))
print "finish to map input"
print train_set.take(2)

train_set_d = spark.createDataFrame(train_set)

train_set_d.createOrReplaceTempView("train_set")

print "start select"
sql_query = """
SELECT train_set.uid, train_set.urlinfo
FROM train_set
WHERE train_set.label > 0
"""
train_set_urlinfo = spark.sql(sql_query)
print train_set_urlinfo.take(2)

def process_uinfo(line):