def __init__(self, sparkContext, jsparkSession=None):
        """Creates a new SnappySession.
        """
        self._sc = sparkContext
        self._jsc = self._sc._jsc
        self._jvm = self._sc._jvm
        SparkSession.__init__(self, sparkContext)
        if jsparkSession is None:
            jsparkSession = self._jvm.SnappySession(self._jsc.sc())

        self._jsparkSession = jsparkSession
Example #2
import re
import sys
import datetime
# configure spark variables
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.session import SparkSession

sc = SparkContext()
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

# load up other dependencies
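# command-line arguments: resource-manager log, application log, and the
# inclusive application-id range (start, end) to keep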

manager_log = sys.argv[1]
application_log = sys.argv[2]
start = sys.argv[3]
end = sys.argv[4]

# lines = sc.textFile(manager_log)
# app_lines = sc.textFile(application_log)
#
# app_ids_filter = app_lines.filter(lambda x: re.search(r'1580812675067_\d+', x))\
#                  .map(lambda x: re.search(r'1580812675067_\d+', x).group(0))\
#                  .filter(lambda x: (int(re.search(r'1580812675067_(\d+)', x).group(1)) <= int(end)) & (int(re.search(r'1580812675067_(\d+)', x).group(1)) >= int(start)))
#
# app_ids_filter_list = app_ids_filter.distinct().collect()
#
# regex_ids_filter = re.compile("|".join(re.escape(app_id) for app_id in app_ids_filter_list))
#
# filtered_lines = lines.filter(lambda x: regex_ids_filter.search(x)).cache()
Example #3
#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import datetime
import sys

reload(sys)
sys.setdefaultencoding('utf-8')
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import Row

sc = SparkContext('local')
spark = SparkSession(sc)
print "begin to map input"
train_set = sc.textFile("gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/user_url_combine_data_v2/part-*.csv").map( \
    lambda row: row.split(",", 4)).map(lambda p: Row(label=int(p[0]), ts=p[1], uid=int(p[2]), urlid=int(p[3]), urlinfo=p[4], forcount=1))

print train_set.take(5)
print "finish map input"

# get url show click

train_set_d = spark.createDataFrame(train_set)

train_set_d.createOrReplaceTempView("train_set")

sql_query = """
SELECT urlid, label, forcount
FROM train_set
"""
Example #4
OUTPUT_BUCKET_FOLDER = "/outbrain/preprocessed/"
DATA_BUCKET_FOLDER = "/outbrain/orig/"
SPARK_TEMP_FOLDER = "/outbrain/spark-temp/"

from pyspark.sql.types import IntegerType, StringType, StructType, StructField 
import pyspark.sql.functions as F

from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col

conf = SparkConf().setMaster('local[*]').set('spark.executor.memory', '40g').set('spark.driver.memory', '200g').set("spark.local.dir", SPARK_TEMP_FOLDER)

sc = SparkContext(conf=conf)
spark = SparkSession(sc)

print('Loading data...')

events_schema = StructType(
                    [StructField("display_id", IntegerType(), True),
                    StructField("uuid_event", StringType(), True),                    
                    StructField("document_id_event", IntegerType(), True),
                    StructField("timestamp_event", IntegerType(), True),
                    StructField("platform_event", IntegerType(), True),
                    StructField("geo_location_event", StringType(), True)]
                    )

events_df = spark.read.schema(events_schema) \
  .options(header='true', inferschema='false', nullValue='\\N') \
  .csv(DATA_BUCKET_FOLDER + "events.csv")
Example #5
# In[6]:


from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import VectorAssembler


# Creating spark context and starting a session

# In[7]:


sc = SparkContext.getOrCreate()
spark = SparkSession(sc)


# Reading the data

# In[8]:


lines = sc.textFile(r"F:\Docs\Big data\Assignment\Assignmnet 4\Dataset\pumsb.dat")


# creating a 2d list from the data read. We are skipping the first attribute.

# In[48]:
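# A minimal sketch of the step described above (assumed, since the original cell
# is cut off): split each line into integers and drop the first attribute.
data_2d = lines.map(lambda line: [int(v) for v in line.split()][1:])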

Example #6
from pyspark.sql.types import StringType, StructType, StructField
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split, max
sc = SparkContext('local')
sc.setLogLevel("OFF")
spark = SparkSession(sc)
# Path to our 20 JSON files
inputPath = "hdfs://localhost:9000/stream/"
#inputPath = "./stream/"
# Explicitly set schema
schema = StructType([
    StructField("ID", StringType(), True),
    StructField("Lang", StringType(), True),
    StructField("Date", StringType(), True),
    StructField("Source", StringType(), True),
    StructField("Len", StringType(), True),
    StructField("Likes", StringType(), True),
    StructField("RTs", StringType(), True),
    StructField("Hashtags", StringType(), True),
    StructField("UserMentionNames", StringType(), True),
    StructField("UserMentionID", StringType(), True),
    StructField("name", StringType(), True),
    StructField("Place", StringType(), True),
    StructField("Followers", StringType(), True),
    StructField("Friends", StringType(), True)
])

inputDF = spark.readStream.schema(schema).option("delimiter", ";").option(
    "maxFilesPerTrigger", 1).csv(inputPath)
Example #7
import re
import sys
import os
from pyspark import SparkContext
from collections import defaultdict
from operator import add
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import mean
from pyspark.sql.types import *
# sc.stop()

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

input_1 = sys.argv[1]
input_2 = sys.argv[2]
input_3 = sys.argv[3]

output_ = sys.argv[4]
f = open(output_,'w')

line1 = sc.textFile(input_1)
line2 = sc.textFile(input_2)
line3 = sc.textFile(input_3)

#-------------------------------------------------------------
ds1 = line1.map(lambda x: x.split(",")).collect()
ds1 = sc.parallelize(ds1)
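# note: collect() followed by parallelize() round-trips the data through the
# driver; the mapped RDD could be used directly instead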
sch1 = StructType([StructField("Drinker", StringType(), True),
                                    StructField("Beer", StringType(), True)])
df1 = spark.createDataFrame(ds1, sch1)
Example #8
                
            try:
                house_number_ticket = cleanTuple(tuple((row[23].split('-'))))
                year = int(row[4][-4:])
            except:
                continue
            if house_number_ticket and borough_ticket and street_name_ticket and (year in range(2015,2020)):
                yield(year, house_number_ticket[0], house_number_ticket[1], borough_ticket, street_name_ticket, house_number_ticket[1]%2 == 0)

if __name__=='__main__':
    t = time.localtime()
    current_time = time.strftime("%H:%M:%S", t)
    print("***START***")
    print(current_time)
    sc = SparkContext()
    spark = SparkSession(sc)

    tickets = sc.textFile('hdfs:///tmp/bdm/nyc_parking_violation/')
    #loading parking tickets and creating dataframe
    parking_ticket_clean = tickets.mapPartitionsWithIndex(processTickets)
    parking_tickets_df = spark.createDataFrame(parking_ticket_clean, ('year','house_number_1','house_number_2' ,'boro','street_name','even_flag'))
    
    #loading centerline segments with name and label
    centerlines = sc.textFile('hdfs:///tmp/bdm/nyc_cscl.csv')
    centerline_all = centerlines.mapPartitionsWithIndex(processCenterline)

    #get full list of centerline physicalID and create dataframe 
    centerline_full_id_only = centerlines.mapPartitionsWithIndex(getPhysicalID).distinct()
    centerline_base = spark.createDataFrame(centerline_full_id_only, ('ID','dummy'))

    #stacking centerline name + label but only keep the distinct values, save into a dataframe
Example #9
def InitSpark():
    # Creating spark context and starting a session
    sc = SparkContext.getOrCreate()
    spark = SparkSession(sc)
    return spark, sc

def get_month(date):
    try:
        return int(date.split('/')[-2])
    except:
        return 0


if __name__ == '__main__':
    conf = SparkConf().set("spark.master", 'spark://10.190.2.112:7077').set('spark.app.name', 'task_14307110005') \
        .set('spark.default.parallelism', '15').set('spark.executor.cores', '2').set('spark.executor.memory', '8g') \
        .set('spark.executor.instances', '3')
    sc = SparkContext(conf=conf)
    sc.setLogLevel('WARN')
    spark = SparkSession(sc)

    data = sc.textFile('hdfs://10.190.2.112/data/data_dump.txt')
    data = data.map(lambda x: x.split('\t'))
    data = data.map(lambda line: (line[0], get_month(line[8]), line[11]))

    schema = StructType([
        StructField('uid', StringType(), False),
        StructField('month', StringType()),
        StructField('city', StringType(), True)
    ])
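    # note: get_month() returns an int while the schema declares 'month' as
    # StringType; IntegerType would match the mapped values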

    table = spark.createDataFrame(data, schema)
    table.createOrReplaceTempView('Table')

    top10city = spark.sql(
Example #11
 def __init__(self, sparkContext, jhiveContext=None):
     if jhiveContext is None:
         sparkSession = SparkSession.withHiveSupport(sparkContext)
     else:
         sparkSession = SparkSession(sparkContext, jhiveContext.sparkSession())
     SQLContext.__init__(self, sparkContext, sparkSession, jhiveContext)
Example #12
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext('local')
spark = SparkSession(sc)

lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]),
                                     movieId=int(p[1]),
                                     rating=float(p[2]),
                                     timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
als = ALS(maxIter=5,
          regParam=0.01,
          implicitPrefs=True,
          userCol="userId",
          itemCol="movieId",
          ratingCol="rating")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")  # completion assumed; original is truncated here
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))
Example #13
def test_list_of_structs(spark: SparkSession, tmp_path: Path):
    test_dir = str(tmp_path)
    schema = StructType([
        StructField("id", IntegerType(), False),
        StructField(
            "anno",
            ArrayType(
                StructType([
                    StructField("label_id", IntegerType(), False),
                    StructField("label", StringType(), False),
                    StructField("bbox", ArrayType(IntegerType()), False),
                ])),
            False,
        ),
    ])
    df = spark.createDataFrame(
        [
            {
                "id":
                1,
                "anno": [
                    {
                        "label": "cat",
                        "label_id": 1,
                        "bbox": [1, 2, 3, 4]
                    },
                    {
                        "label": "dog",
                        "label_id": 2,
                        "bbox": [10, 23]
                    },
                ],
            },
            {
                "id":
                2,
                "anno": [
                    {
                        "label": "bug",
                        "label_id": 3,
                        "bbox": [100, 200]
                    },
                    {
                        "label": "aaa",
                        "label_id": 4,
                        "bbox": [-1, -2, -3]
                    },
                ],
            },
        ],
        schema=schema,
    )
    df.repartition(1).write.mode("overwrite").format("rikai").save(test_dir)

    records = _read_parquets(test_dir)
    for expect, actual in zip(
        [
            {
                "id":
                1,
                "anno": [
                    {
                        "label": "cat",
                        "label_id": 1,
                        "bbox": np.array([1, 2, 3, 4], dtype=np.int32),
                    },
                    {
                        "label": "dog",
                        "label_id": 2,
                        "bbox": np.array([10, 23], dtype=np.int32),
                    },
                ],
            },
            {
                "id":
                2,
                "anno": [
                    {
                        "label": "bug",
                        "label_id": 3,
                        "bbox": np.array([100, 200], dtype=np.int32),
                    },
                    {
                        "label": "aaa",
                        "label_id": 4,
                        "bbox": np.array([-1, -2, -3], dtype=np.int32),
                    },
                ],
            },
        ],
            records,
    ):
        assert expect["id"] == actual["id"]
        assert len(expect["anno"]) == len(actual["anno"])
        assert np.array_equal(expect["anno"][0]["bbox"],
                              actual["anno"][0]["bbox"])
Example #14
# coding=utf-8

from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql import Window, functions

if __name__ == '__main__':
    conf = SparkConf().set("spark.master", 'spark://10.190.2.112:7077').set('spark.app.name', 'task_14307110005') \
        .set('spark.default.parallelism', '15').set('spark.executor.cores', '2').set('spark.executor.memory', '8g') \
        .set('spark.executor.instances', '3')
    sc = SparkContext(conf=conf)
    sc.setLogLevel('WARN')
    spark = SparkSession(sc)

    data = sc.textFile('hdfs://10.190.2.112/data/data_dump.txt')
    data = data.map(lambda x: x.split('\t'))
    data = data.map(lambda line: (line[0], line[2], line[11]))

    schema = StructType([
        StructField('uid', StringType(), False),
        StructField('name', StringType()),
        StructField('city', StringType(), True)
    ])

    table = spark.createDataFrame(data, schema)
    table.createOrReplaceTempView('Table')
    spark.sql('''
    select * from
            (select city,name,nb_name,rank() over(partition by city order by nb_name desc) as rk
            from
Example #15
#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import datetime
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
sc = SparkContext('local')
spark = SparkSession(sc)
print "begin to map input"
train_set = sc.textFile("gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/user_url_combine_data_v2/part-00000-eab41fe7-4a1c-46e5-b995-6beba43de164-c000.csv").map( \
    lambda row: row.split(",", 4)).map(lambda p: Row(label=int(p[0]), ts=p[1], uid=int(p[2]), urlid=int(p[3]), urlinfo=p[4]))

print train_set.take(5)
print "finish map input"

# get url show click

train_set_d = spark.createDataFrame(train_set)

train_set_d.createOrReplaceTempView("train_set")

sql_query = """
SELECT uid, urlid, label
FROM train_set
"""
Example #16
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import functions as F
from normalize import get_min_max
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import UserDefinedFunction
import json
import os
from pyspark import SparkConf, SparkContext

conf = SparkConf().set('spark.executor.memory',
                       '4g').set('spark.driver.memory',
                                 '8g')  # needs more memory
print(conf.toDebugString())
sc = SparkContext(appName='Clustering', conf=conf)
spark = SparkSession(sc)

# Loads data.
df = spark.read.option("header", "true").csv("/user/root/data/*.csv")
df_notnull = df.filter(
    F.col("lon").isNotNull() & F.col("lat").isNotNull()
    & F.col('P1').isNotNull() & F.col('timestamp').isNotNull())
df = df_notnull
df_timestamp = df.withColumn('timestamp', df['timestamp'].substr(1, 7))
df = df_timestamp
timestamp = df.collect()[0][5]

features = ['P1', 'lon', 'lat']
vector_assembler = VectorAssembler(inputCols=features, outputCol="features")

# Cast feature columns to double
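# Sketch of the cast described above (assumed; the original lines are truncated):
for feature in features:
    df = df.withColumn(feature, df[feature].cast(DoubleType()))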
Example #17
import pandas as pd
import json
from pandas import DataFrame
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

#Create Pandas Dataframe
companies_data = pd.read_json('test.json')
dataFrame = pd.DataFrame(companies_data)
print(dataFrame.T)

#manipulate Pandas dataframe and write back to json file

change_companies_data = dataFrame.loc['exchange_code']= 123456
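# chained assignment: change_companies_data and the new/updated 'exchange_code' row both receive 123456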
json_rewrite = dataFrame.to_json('test.json')
dataFrameTranspose = dataFrame.T

#Convert pandas dataframe to spark dataframe
spark_dataframe = spark.createDataFrame(dataFrameTranspose.astype(str))
spark_dataframe.show()

#Convert Spark dataframe to pandas dataframe 
spark_to_panda_dataframe = spark_dataframe.toPandas()
print(spark_to_panda_dataframe)



Example #18
    import numpy as np
    x, y = np.array(x), np.array(y)
    xm = np.mean(x)
    ym = np.mean(y)
    # least-squares slope: numerator is Sxy, denominator is Sxx
    s_xx = sum((x - xm)**2)
    s_xy = sum((y - ym) * (x - xm))
    coef = s_xy / s_xx

    return coef


if __name__ == "__main__":
    start_time = time.time()
    output = sys.argv[1]
    sc = SparkContext()
    spark = SparkSession(sc)

    street1 = sc.textFile(
        'hdfs:///tmp/bdm/nyc_cscl.csv').mapPartitionsWithIndex(processStreet)
    violation = sc.textFile('hdfs:///tmp/bdm/nyc_parking_violation/'
                            ).mapPartitionsWithIndex(processViolation)

    viola = spark.createDataFrame(
        violation, ('year', 'street', 'boro', 'house_number', 'is_left'))
    stre = spark.createDataFrame(
        street1, ('physicalID', 'street', 'boro', 'low', 'high', 'is_left'))
    stre = stre.distinct()
    filtering = [
        viola.boro == stre.boro,
        viola.street == stre.street, viola.is_left == stre.is_left,
        (viola.house_number >= stre.low) & (viola.house_number <= stre.high)
    ]
Example #19
def process(i):
    if ',' in str(i):  # assumption: the original condition was lost; quote fields containing the delimiter
        return "\"{}\"".format(i)
    else:
        return str(i)


def to_csv(rdd):
    li = map(process, rdd)
    return ','.join(li)


if __name__ == "__main__":
    start_time = time.time()
    output = sys.argv[1]

    sc = SparkContext()
    spark = SparkSession(sc)

    centerline = sc.textFile('hdfs:///tmp/bdm/nyc_cscl.csv')

    rdd_cl = centerline.mapPartitionsWithIndex(processCenterline)

    violations = sc.textFile('hdfs:///tmp/bdm/nyc_parking_violation/')

    rdd_v = violations.mapPartitionsWithIndex(processViolation)

    v = spark.createDataFrame(rdd_v,
                              ('year', 'house', 'street', 'boro', 'is_left'))

    cl = spark.createDataFrame(
        rdd_cl, ('pysicalID', 'street', 'boro', 'low', 'high', 'is_left'))
Example #20
def main():
    sc = SparkContext("local", "dataframe app")
    sc.setLogLevel("ERROR")
    spark = SparkSession(sc)

    #load the retail dataset
    retail_data = spark.read.option("inferSchema", "true").option(
        "header", "true"
    ).option("timestampFormat", "dd/M/yyyy H:mm").csv(
        "/Users/faizan/Documents/Masters/2nd_Semester/Big_Data/Tutorial/Tutorials/Tutorial02/online-retail-dataset.csv"
    )
    retail_data.show()

    #Question 1
    #How many orders did customers perform at which hour?

    # a) SQL
    retail_data.createOrReplaceTempView("retailTable")

    result = spark.sql("""
    SELECT hour(InvoiceDate) as InvoiceHour, count(distinct InvoiceNo) as NoInvoices
    FROM retailTable
    GROUP BY InvoiceHour
    ORDER BY InvoiceHour
    """)
    result.show()

    # b) Spark
    result = retail_data.selectExpr(
        "hour(InvoiceDate) as InvoiceHour",
        "InvoiceNo").distinct().groupBy("InvoiceHour").agg(
            f.expr("count(InvoiceNo) as NoInvoices")).orderBy("InvoiceHour")
    result.show()

    #Question 2
    #How frequently was each product bought in the different countries?

    # a) SQL
    df_selection = retail_data.selectExpr("Country", "StockCode", "Quantity")
    df_nonull = df_selection.na.replace(
        [""], ["UNKNOWN"], "StockCode").na.replace([""], ["UNKNOWN"],
                                                   "Country").na.drop("any")
    df_nonull.createOrReplaceTempView("retailNoNull")

    result = spark.sql("""
    SELECT Country, StockCode, sum(Quantity) as Quantity
    FROM retailNoNull
    GROUP BY Country, StockCode
    GROUPING SETS ((Country, StockCode), (Country), (StockCode), ())
    ORDER BY Country, StockCode
    """)
    result.show()

    # b) Spark
    result = df_nonull.cube("Country", "StockCode").agg(
        f.sum("Quantity").alias("Quantity")).orderBy(f.col("Country"),
                                                     f.col("StockCode"))
    result.show()

    result.coalesce(1).write.format("csv").option("header", "true").save(
        "/Users/faizan/Documents/Masters/2nd_Semester/Big_Data/Tutorial/Tutorials/Tutorial03/frequencies"
    )
    sc.stop()
Example #21
        intp.saveDFToCsv(
            df._jdf, path, hasheader, isOverwrite,
            MapConverter().convert(option, gateway._gateway_client))
    else:
        print(str(df))


java_import(gateway.jvm, "scala.Tuple2")

jsc = intp.getJavaSparkContext()
jconf = intp.getSparkConf()
conf = SparkConf(_jvm=gateway.jvm, _jconf=jconf)
sc = SparkContext(jsc=jsc, gateway=gateway, conf=conf)
sqlc = HiveContext(sc, intp.sqlContext())
sqlContext = sqlc
spark = SparkSession(sc, intp.getSparkSession())

##add pyfiles
try:
    pyfile = sys.argv[5]
    pyfiles = pyfile.split(',')
    for i in range(len(pyfiles)):
        if "" != pyfiles[i]:
            sc.addPyFile(pyfiles[i])
except Exception as e:
    print("add pyfile error: " + pyfile)


class UDF(object):
    def __init__(self, intp, sqlc):
        self.intp = intp
Example #22
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier


# COMMAND ----------

# MAGIC %md ##### The command below starts a Spark session when the file is run in Oracle BDE. In Databricks, keep IS_SPARK_SUBMIT_CLI set to False (its default); when running the file in Oracle BDE, set it to True.

# COMMAND ----------

# This flag selects between the spark-submit (Oracle BDE) and Databricks notebook environments.
IS_SPARK_SUBMIT_CLI = False
if IS_SPARK_SUBMIT_CLI:
    sc = SparkContext.getOrCreate()
    spark = SparkSession(sc)

# COMMAND ----------

# MAGIC %md ### Load Source Data
# MAGIC The data for this project is provided as a CSV file containing details of advertisement. The data includes specific characteristics (or *features*) for each ad, as well as a *label* column indicating whether the ad was clicked or not.
# MAGIC 
# MAGIC You will load this data into a DataFrame and display it.

# COMMAND ----------

# MAGIC %md #####Reading all necessary csv file

# COMMAND ----------

if IS_SPARK_SUBMIT_CLI:
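    # Hypothetical continuation (the original cell is cut off here): read the
    # advertisement CSV into a DataFrame and display it, as described above.
    ad_data = spark.read.csv('advertising.csv', inferSchema=True, header=True)  # file name assumed
    ad_data.show()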
Example #23
def main(sc):
    spark = SparkSession(sc)
    sqlContext = SQLContext(sc)

    years = ['2015', '2016', '2017', '2018', '2019']
    def parseCSV(idx, part):
        if idx==0:
            next(part)
        for p in csv.reader(part):
            if p[23].isalpha() or p[24] == '' or p[21] == '' or p[23] == '' or p[4][-4:] not in years:
                continue
            if '-' in p[23]:
                yield(p[23].split('-')[0], p[23].split('-')[1], p[24].lower(), p[21], p[4][-4:])
            else:
                yield(p[23], '', p[24].lower(), p[21], p[4][-4:])

    rows = sc.textFile('/data/share/bdm/nyc_parking_violation/*.csv', use_unicode=True).mapPartitionsWithIndex(parseCSV)

    df = sqlContext.createDataFrame(rows, ('House Number', 'HN Compound', 'Street Name', 'County', 'Date'))

    map_NY = (col("County")=='NY')|\
            (col("County")=='MAN')|\
            (col("County")=='MH')|\
            (col("County")=='MN')|\
            (col("County")=='NEWY')|\
            (col("County")=='NEW Y')

    map_BX = (col("County")=='BRONX')|\
            (col("County")=='BX')

    map_BK = (col("County")=='BK')|\
            (col("County")=='K')|\
            (col("County")=='KING')|\
            (col("County")=='KINGS')

    map_QN = (col("County")=='Q')|\
            (col("County")=='QN')|\
            (col("County")=='QNS')|\
            (col("County")=='QU')|\
            (col("County")=='QUEEN')

    map_R = (col("County")=='R')|\
            (col("County")=='RICHMOND')

    df = df.withColumn("County", when(map_NY, '1')
                                .when(map_BX, '2')
                                .when(map_BK, '3')
                                .when(map_QN, '4')
                                .when(map_R, '5')
                                .otherwise('')).where(col('County')!='')
    df = df.withColumn("House Number", df["House Number"].cast('int'))
    df = df.withColumn("HN Compound", df["HN Compound"].cast('int'))

    def parseCL(idx, part):
        if idx==0:
            next(part)
        for p in csv.reader(part):
            LL_HN = p[2]
            LL_HNC = ''
            LH_HN = p[3]
            LH_HNC = ''
            if '-' in p[2] and '-' in p[3]:
                LL_HN = p[2].split('-')[0]
                LL_HNC = p[2].split('-')[1]
                LH_HN = p[3].split('-')[0]
                LH_HNC = p[3].split('-')[1]

            RL_HN = p[4]
            RL_HNC = ''
            RH_HN = p[5]
            RH_HNC = ''
            if '-' in p[4] and '-' in p[5]:
                RL_HN = p[4].split('-')[0]
                RL_HNC = p[4].split('-')[1]
                RH_HN = p[5].split('-')[0]
                RH_HNC = p[5].split('-')[1]
            yield(p[0], p[28].lower(), p[29].lower(), p[13], LL_HN, LL_HNC, LH_HN, LH_HNC, RL_HN, RL_HNC, RH_HN, RH_HNC)

    rows = sc.textFile('/data/share/bdm/nyc_cscl.csv', use_unicode=True).mapPartitionsWithIndex(parseCL)

    centerline = sqlContext.createDataFrame(rows, ('ID', 'full street', 'st label', 'borocode', 'LL_HN', 'LL_HNC', 'LH_HN', 'LH_HNC', 'RL_HN', 'RL_HNC', 'RH_HN', 'RH_HNC'))
    centerline = centerline.withColumn("LL_HN", centerline["LL_HN"].cast('int'))
    centerline = centerline.withColumn("LH_HN", centerline["LH_HN"].cast('int'))
    centerline = centerline.withColumn("RL_HN", centerline["RL_HN"].cast('int'))
    centerline = centerline.withColumn("RH_HN", centerline["RH_HN"].cast('int'))
    centerline = centerline.withColumn("LL_HNC", centerline["LL_HNC"].cast('int'))
    centerline = centerline.withColumn("LH_HNC", centerline["LH_HNC"].cast('int'))
    centerline = centerline.withColumn("RL_HNC", centerline["RL_HNC"].cast('int'))
    centerline = centerline.withColumn("RH_HNC", centerline["RH_HNC"].cast('int'))
    print('Data loaded')
    cond1 = (df['Street Name'] == centerline['full street'])
    cond2 = (df['Street Name'] == centerline['st label'])
    cond3 = (df['County'] == centerline['borocode'])
    cond4 = (df['House Number'] % 2 == 1)
    cond5 = (df['House Number'] >= centerline['LL_HN']) & (df['House Number'] <= centerline['LH_HN'])
    cond6 = (df['House Number'] % 2 == 0)
    cond7 = (df['House Number'] >= centerline['RL_HN']) & (df['House Number'] <= centerline['RH_HN'])
    cond8 = cond4 & cond5
    cond9 = cond6 & cond7

    hnc_cond1 = (df['HN Compound'].isNotNull())
    hnc_cond2 = (df['HN Compound'].isNull())
    hnc_cond3 = ((df['HN Compound'] >= centerline['LL_HNC']) & (df['HN Compound'] <= centerline['LH_HNC']))
    hnc_cond4 = ((df['HN Compound'] >= centerline['RL_HNC']) & (df['HN Compound'] <= centerline['RH_HNC']))


    cond10 = (hnc_cond2 & (cond8|cond9))
    cond11 = (hnc_cond1 & (cond8|cond9) & (hnc_cond3|hnc_cond4))

    joined = df.join(centerline, ((cond1|cond2) & cond3 & (cond10|cond11)), "inner")
    joined = joined.select(col('ID'), col('Date'))
    count_df = joined.groupBy(['ID', 'Date']).pivot('Date').count().drop('Date')
    print('Table pivoted')
    allID = centerline.select(col('ID')).dropDuplicates()
    result = allID.join(count_df, on=["ID"], how='outer').na.fill(0)

    marksColumns = [col('2015'), col('2016'), col('2017'), col('2018'), col('2019')]
    diff_x = [-2, -1, 0, 1, 2]

    average_func = sum(x for x in marksColumns)/len(marksColumns)
    result = result.withColumn("avg", average_func)
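    # the divisor 10 below is the sum of squared offsets in diff_x (4+1+0+1+4), i.e. the OLS denominator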
    ols_func = sum(diff*(y - col('avg')) for diff, y in zip(diff_x, marksColumns))/10
    coef = result.withColumn("OLS_COEF", ols_func).drop('avg')

    coef.rdd.map(writeToCSV).saveAsTextFile(sys.argv[1])
Example #24
#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import datetime
import sys

reload(sys)
sys.setdefaultencoding('utf-8')
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import array

sc = SparkContext('local')
spark = SparkSession(sc)
print "begin to map input"

fieldSchema = StructType([
    StructField("label", IntegerType(), True),
    StructField("pdef", DoubleType(), True),
    StructField("pbeau", DoubleType(), True),
    StructField("pnum", IntegerType(), True),
    StructField("s_term", StringType(), True),
    StructField("sumclick", LongType(), True),
    StructField("sumshow", LongType(), True),
    StructField("ts", LongType(), True),
    StructField("uid", LongType(), True),
    StructField("urlid", LongType(), True)
])
Example #25
#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import datetime
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
sc = SparkContext('local')
spark = SparkSession(sc)
fieldSchema = StructType([
    StructField("ctr", DoubleType(), True),
    StructField("label", IntegerType(), True),
    StructField("pdef", DoubleType(), True),
    StructField("pbeau", DoubleType(), True),
    StructField("pnum", IntegerType(), True),
    StructField("s_term_score", DoubleType(), True),
    StructField("sumclick", LongType(), True),
    StructField("sumshow", LongType(), True),
    StructField("uid", LongType(), True)
])
print "begin to map input"
train_set = spark.read.csv(
    "gs://dataproc-1228d533-ffe2-4747-a056-8cd396c3db5f-asia-southeast1/data/picfeed/train_feature_compose_new/part-*",
    schema=fieldSchema)
Example #26
import findspark
findspark.init()

from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext('local')
spark = SparkSession(sc)

df = spark.createDataFrame([
    (
        7,
        Vectors.dense([0.0, 0.0, 18.0, 1.0]),
        1.0,
    ),
    (
        8,
        Vectors.dense([0.0, 1.0, 12.0, 0.0]),
        0.0,
    ),
    (
        9,
        Vectors.dense([1.0, 0.0, 15.0, 0.1]),
        0.0,
    ),
], ["id", "features", "clicked"])

selector = ChiSqSelector(numTopFeatures=1,
                         featuresCol="features",
                         outputCol="selected",
                         labelCol="clicked")  # completion assumed; original is truncated here
result = selector.fit(df).transform(df)
result.show()
Example #27
#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import datetime
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import array
sc = SparkContext('local')
spark = SparkSession(sc)
print "begin to map input"
fieldSchema = StructType([
    StructField("label", IntegerType(), True),
    StructField("pdef", DoubleType(), True),
    StructField("pbeau", DoubleType(), True),
    StructField("pnum", IntegerType(), True),
    StructField("s_term", StringType(), True),
    StructField("sumclick", LongType(), True),
    StructField("sumshow", LongType(), True),
    StructField("ts", LongType(), True),
    StructField("uid", LongType(), True),
    StructField("urlid", LongType(), True),
    StructField("user_s_term", StringType(), True)
])
train_set_join_user_model = spark.read.csv(
    "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/train_set_join_user_model/part-00000-59d90ec7-6a27-4356-901d-ea40b3333c49-c000.csv",
    schema=fieldSchema)
Example #28
# Samuel Tribe - 201318996 - [email protected]
from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as F
from pyspark.sql.session import SparkSession
from pyspark.sql.types import DateType
conf = SparkConf().setAppName("covid19").setMaster("local")
spark = SparkSession(SparkContext(conf=conf))
csvPath = r"C:\spark\COMP336-Coursework-1\data\covid19.csv"
covidDF = spark.read.csv(csvPath,header=True,inferSchema=True)
covidDF = covidDF.withColumn("date", F.col("date").cast(DateType()))
print("covid19.csv read as Dataframe with header=True")
covidDF.show()
print("Schema for dataframe")
covidDF.printSchema()
print("Filtering out NULL values from dataframe")
covidDF = covidDF.filter(covidDF.continent.isNotNull() & covidDF.location.isNotNull() & covidDF.date.isNotNull() & covidDF.total_cases.isNotNull() & covidDF.new_cases.isNotNull() & covidDF.total_deaths.isNotNull() & covidDF.new_deaths.isNotNull())
covidDF.show()
print("Highest deaths per country")
covidDF.groupBy(['location']).agg(F.max(covidDF.total_deaths)).show()
print("max and min function results on total_cases")
covidDF.groupBy(['location']).agg(F.max(covidDF.total_cases).alias('total_cases_max'), F.min(covidDF.total_cases).alias('total_cases_min')).show()
Example #29
import findspark
findspark.init()

from pyspark.ml.linalg import  Vectors
from pyspark.ml.stat import Correlation
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext('local')
spark = SparkSession(sc)



data = [(Vectors.sparse(4, [(0,1.0), (3,-2.0)]),),
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
        (Vectors.sparse(4, [(0, 9.0) , (3, 1.0)]),)]

df = spark.createDataFrame(data, ["features"])

r1 = Correlation.corr(df, "features").head()
print("Pearson correlation matrrix:\n" + str(r1[0]))

r2 = Correlation.corr(df, "features", "spearman").head()
print("Spearman correlation matrrix:\n" + str(r2[0]))
Example #30
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType

from pyspark.streaming import StreamingContext

sc = SparkContext('local[2]', 'timeS')
ssc = StreamingContext(sc, 1)

spark = SparkSession(sc)

data_file = ssc.socketTextStream("localhost", 9999)

from pyspark.sql.window import Window
from pyspark.sql import functions as func
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series
import pyspark.sql as sparksql
import numpy as np
import time
from pprint import pprint
days = lambda i: i * 86400

data_file = "./export.csv"
raw_data = sc.textFile(data_file)
csv_data = raw_data.map(lambda x: x.split(","))
#csv_data.toDF().show()
Example #32
#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import datetime
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
sc = SparkContext('local')
spark = SparkSession(sc)
print "begin to map input"
train_set = sc.textFile("gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/user_url_combine_data_v2/part-00000-eab41fe7-4a1c-46e5-b995-6beba43de164-c000.csv").map(lambda row: row.split(",", 4)).map(lambda p: Row(uid=p[0], urlid=p[1], ts=p[2], label=p[3], urlinfo=p[4]))
print "finish to map input"
print train_set.take(2)

train_set_d = spark.createDataFrame(train_set)

train_set_d.createOrReplaceTempView("train_set")

print "start select"
sql_query = """
SELECT train_set.uid, train_set.urlinfo
FROM train_set
WHERE train_set.label > 0
"""
train_set_urlinfo = spark.sql(sql_query)
print train_set_urlinfo.take(2)

def process_uinfo(line):