def __init__(self, sparkContext, jsparkSession=None):
    """Creates a new SnappySession."""
    self._sc = sparkContext
    self._jsc = self._sc._jsc
    self._jvm = self._sc._jvm
    SparkSession.__init__(self, sparkContext)
    if jsparkSession is None:
        jsparkSession = self._jvm.SnappySession(self._jsc.sc())
    self._jsparkSession = jsparkSession
import re
import sys
import datetime

# configure spark variables
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.session import SparkSession

sc = SparkContext()
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

# load up other dependencies
manager_log = sys.argv[1]
application_log = sys.argv[2]
start = sys.argv[3]
end = sys.argv[4]

# lines = sc.textFile(manager_log)
# app_lines = sc.textFile(application_log)
#
# app_ids_filter = app_lines.filter(lambda x: re.search(r'1580812675067_\d+', x))\
#     .map(lambda x: re.search(r'1580812675067_\d+', x).group(0))\
#     .filter(lambda x: (int(re.search(r'1580812675067_(\d+)', x).group(1)) <= int(end)) & (int(re.search(r'1580812675067_(\d+)', x).group(1)) >= int(start)))
#
# app_ids_filter_list = app_ids_filter.distinct().collect()
#
# regex_ids_filter = re.compile("|".join(re.escape(app_id) for app_id in app_ids_filter_list))
#
# filtered_lines = lines.filter(lambda x: regex_ids_filter.search(x)).cache()
#!/usr/bin/python # -*- coding: utf-8 -*- from datetime import datetime import sys reload(sys) sys.setdefaultencoding('utf-8') import pyspark from pyspark.context import SparkContext from pyspark.sql.session import SparkSession from pyspark.sql import Row sc = SparkContext('local') spark = SparkSession(sc) print "begin to map input" train_set = sc.textFile("gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/user_url_combine_data_v2/part-*.csv").map( \ lambda row: row.split(",", 4)).map(lambda p: Row(label=int(p[0]), ts=p[1], uid=int(p[2]), urlid=int(p[3]), urlinfo=p[4], forcount=1)) print train_set.take(5) print "finish map input" # get url show click train_set_d = spark.createDataFrame(train_set) train_set_d.createOrReplaceTempView("train_set") sql_query = """ SELECT urlid, label, forcount FROM train_set """
OUTPUT_BUCKET_FOLDER = "/outbrain/preprocessed/"
DATA_BUCKET_FOLDER = "/outbrain/orig/"
SPARK_TEMP_FOLDER = "/outbrain/spark-temp/"

from pyspark.sql.types import IntegerType, StringType, StructType, StructField
import pyspark.sql.functions as F
from pyspark.context import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col

conf = SparkConf().setMaster('local[*]') \
    .set('spark.executor.memory', '40g') \
    .set('spark.driver.memory', '200g') \
    .set("spark.local.dir", SPARK_TEMP_FOLDER)
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

print('Loading data...')

events_schema = StructType([
    StructField("display_id", IntegerType(), True),
    StructField("uuid_event", StringType(), True),
    StructField("document_id_event", IntegerType(), True),
    StructField("timestamp_event", IntegerType(), True),
    StructField("platform_event", IntegerType(), True),
    StructField("geo_location_event", StringType(), True)
])

events_df = spark.read.schema(events_schema) \
    .options(header='true', inferschema='false', nullValue='\\N') \
    .csv(DATA_BUCKET_FOLDER + "events.csv") \
# In[6]:

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import VectorAssembler

# Creating spark context and starting a session

# In[7]:

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

# Reading the data

# In[8]:

lines = sc.textFile(r"F:\Docs\Big data\Assignment\Assignmnet 4\Dataset\pumsb.dat")

# creating a 2d list from the data read. We are skipping the first attribute.

# In[48]:
from pyspark.sql.types import StringType, StructType, StructField
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split, max

sc = SparkContext('local')
sc.setLogLevel("OFF")
spark = SparkSession(sc)

# Path to our 20 JSON files
inputPath = "hdfs://localhost:9000/stream/"
#inputPath = "./stream/"

# Explicitly set schema
schema = StructType([
    StructField("ID", StringType(), True),
    StructField("Lang", StringType(), True),
    StructField("Date", StringType(), True),
    StructField("Source", StringType(), True),
    StructField("Len", StringType(), True),
    StructField("Likes", StringType(), True),
    StructField("RTs", StringType(), True),
    StructField("Hashtags", StringType(), True),
    StructField("UserMentionNames", StringType(), True),
    StructField("UserMentionID", StringType(), True),
    StructField("name", StringType(), True),
    StructField("Place", StringType(), True),
    StructField("Followers", StringType(), True),
    StructField("Friends", StringType(), True)
])

# Read the files as a stream, one file per trigger (duplicate delimiter option removed)
inputDF = spark.readStream.schema(schema) \
    .option("delimiter", ";") \
    .option("maxFilesPerTrigger", 1) \
    .csv(inputPath)
import re
import sys
import os
from pyspark import SparkContext
from collections import defaultdict
from operator import add
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import mean
from pyspark.sql.types import *

# sc.stop()
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

input_1 = sys.argv[1]
input_2 = sys.argv[2]
input_3 = sys.argv[3]
output_ = sys.argv[4]
f = open(output_, 'w')

line1 = sc.textFile(input_1)
line2 = sc.textFile(input_2)
line3 = sc.textFile(input_3)

#-------------------------------------------------------------
ds1 = line1.map(lambda x: x.split(",")).collect()
ds1 = sc.parallelize(ds1)
sch1 = StructType([StructField("Drinker", StringType(), True),
                   StructField("Beer", StringType(), True)])
df1 = spark.createDataFrame(ds1, sch1)
        try:
            house_number_ticket = cleanTuple(tuple((row[23].split('-'))))
            year = int(row[4][-4:])
        except:
            continue
        if house_number_ticket and borough_ticket and street_name_ticket and (year in range(2015, 2020)):
            yield (year, house_number_ticket[0], house_number_ticket[1], borough_ticket,
                   street_name_ticket, house_number_ticket[1] % 2 == 0)


if __name__ == '__main__':
    t = time.localtime()
    current_time = time.strftime("%H:%M:%S", t)
    print("***START***")
    print(current_time)

    sc = SparkContext()
    spark = SparkSession(sc)

    tickets = sc.textFile('hdfs:///tmp/bdm/nyc_parking_violation/')

    # loading parking tickets and creating dataframe
    parking_ticket_clean = tickets.mapPartitionsWithIndex(processTickets)
    parking_tickets_df = spark.createDataFrame(
        parking_ticket_clean,
        ('year', 'house_number_1', 'house_number_2', 'boro', 'street_name', 'even_flag'))

    # loading centerline segments with name and label
    centerlines = sc.textFile('hdfs:///tmp/bdm/nyc_cscl.csv')
    centerline_all = centerlines.mapPartitionsWithIndex(processCenterline)

    # get full list of centerline physicalID and create dataframe
    centerline_full_id_only = centerlines.mapPartitionsWithIndex(getPhysicalID).distinct()
    centerline_base = spark.createDataFrame(centerline_full_id_only, ('ID', 'dummy'))

    # stacking centerline name + label but only keep the distinct values, save into a dataframe
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession


def InitSpark():
    # Creating spark context and starting a session
    sc = SparkContext.getOrCreate()
    spark = SparkSession(sc)
    return spark, sc
def get_month(date):
    try:
        return int(date.split('/')[-2])
    except:
        return 0


if __name__ == '__main__':
    # Spark configuration keys use dots: spark.executor.cores, spark.executor.memory, spark.executor.instances
    conf = SparkConf().set("spark.master", 'spark://10.190.2.112:7077') \
        .set('spark.app.name', 'task_14307110005') \
        .set('spark.default.parallelism', '15') \
        .set('spark.executor.cores', '2') \
        .set('spark.executor.memory', '8g') \
        .set('spark.executor.instances', '3')
    sc = SparkContext(conf=conf)
    sc.setLogLevel('WARN')
    spark = SparkSession(sc)

    data = sc.textFile('hdfs://10.190.2.112/data/data_dump.txt')
    data = data.map(lambda x: x.split('\t'))
    # cast the month to str so it matches the StringType declared in the schema below
    data = data.map(lambda line: (line[0], str(get_month(line[8])), line[11]))

    schema = StructType([
        StructField('uid', StringType(), False),
        StructField('month', StringType()),
        StructField('city', StringType(), True)
    ])
    table = spark.createDataFrame(data, schema)
    table.createOrReplaceTempView('Table')
    top10city = spark.sql(
def __init__(self, sparkContext, jhiveContext=None):
    if jhiveContext is None:
        sparkSession = SparkSession.withHiveSupport(sparkContext)
    else:
        sparkSession = SparkSession(sparkContext, jhiveContext.sparkSession())
    SQLContext.__init__(self, sparkContext, sparkSession, jhiveContext)
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext('local')
spark = SparkSession(sc)

lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
als = ALS(maxIter=5, regParam=0.01, implicitPrefs=True,
          userCol="userId", itemCol="movieId", ratingCol="rating")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
def test_list_of_structs(spark: SparkSession, tmp_path: Path):
    test_dir = str(tmp_path)
    schema = StructType([
        StructField("id", IntegerType(), False),
        StructField(
            "anno",
            ArrayType(
                StructType([
                    StructField("label_id", IntegerType(), False),
                    StructField("label", StringType(), False),
                    StructField("bbox", ArrayType(IntegerType()), False),
                ])),
            False,
        ),
    ])

    df = spark.createDataFrame(
        [
            {
                "id": 1,
                "anno": [
                    {"label": "cat", "label_id": 1, "bbox": [1, 2, 3, 4]},
                    {"label": "dog", "label_id": 2, "bbox": [10, 23]},
                ],
            },
            {
                "id": 2,
                "anno": [
                    {"label": "bug", "label_id": 3, "bbox": [100, 200]},
                    {"label": "aaa", "label_id": 4, "bbox": [-1, -2, -3]},
                ],
            },
        ],
        schema=schema,
    )
    df.repartition(1).write.mode("overwrite").format("rikai").save(test_dir)

    records = _read_parquets(test_dir)
    for expect, actual in zip(
        [
            {
                "id": 1,
                "anno": [
                    {
                        "label": "cat",
                        "label_id": 1,
                        "bbox": np.array([1, 2, 3, 4], dtype=np.int32),
                    },
                    {
                        "label": "dog",
                        "label_id": 2,
                        "bbox": np.array([10, 23], dtype=np.int32),
                    },
                ],
            },
            {
                "id": 2,
                "anno": [
                    {
                        "label": "bug",
                        "label_id": 3,
                        "bbox": np.array([100, 200], dtype=np.int32),
                    },
                    {
                        "label": "aaa",
                        "label_id": 4,
                        "bbox": np.array([-1, -2, -3], dtype=np.int32),
                    },
                ],
            },
        ],
        records,
    ):
        assert expect["id"] == actual["id"]
        assert len(expect["anno"]) == len(actual["anno"])
        assert np.array_equal(expect["anno"][0]["bbox"], actual["anno"][0]["bbox"])
# coding=utf-8
from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql import Window, functions

if __name__ == '__main__':
    # Spark configuration keys use dots: spark.executor.cores, spark.executor.memory, spark.executor.instances
    conf = SparkConf().set("spark.master", 'spark://10.190.2.112:7077') \
        .set('spark.app.name', 'task_14307110005') \
        .set('spark.default.parallelism', '15') \
        .set('spark.executor.cores', '2') \
        .set('spark.executor.memory', '8g') \
        .set('spark.executor.instances', '3')
    sc = SparkContext(conf=conf)
    sc.setLogLevel('WARN')
    spark = SparkSession(sc)

    data = sc.textFile('hdfs://10.190.2.112/data/data_dump.txt')
    data = data.map(lambda x: x.split('\t'))
    data = data.map(lambda line: (line[0], line[2], line[11]))

    schema = StructType([
        StructField('uid', StringType(), False),
        StructField('name', StringType()),
        StructField('city', StringType(), True)
    ])
    table = spark.createDataFrame(data, schema)
    table.createOrReplaceTempView('Table')

    spark.sql('''
        select * from
        (select city, name, nb_name, rank() over(partition by city order by nb_name desc) as rk from
#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import datetime
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

sc = SparkContext('local')
spark = SparkSession(sc)

print "begin to map input"
train_set = sc.textFile("gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/user_url_combine_data_v2/part-00000-eab41fe7-4a1c-46e5-b995-6beba43de164-c000.csv") \
    .map(lambda row: row.split(",", 4)) \
    .map(lambda p: Row(label=int(p[0]), ts=p[1], uid=int(p[2]), urlid=int(p[3]), urlinfo=p[4]))
print train_set.take(5)
print "finish map input"

# get url show click
train_set_d = spark.createDataFrame(train_set)
train_set_d.createOrReplaceTempView("train_set")
sql_query = """
SELECT uid, urlid, label
FROM train_set
"""
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import functions as F
from normalize import get_min_max
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import UserDefinedFunction
import json
import os
from pyspark import SparkConf, SparkContext

conf = SparkConf().set('spark.executor.memory', '4g').set('spark.driver.memory', '8g')  # needs more memory
print(conf.toDebugString())
sc = SparkContext(appName='Clustering', conf=conf).getOrCreate()
spark = SparkSession(sc)

# Loads data.
df = spark.read.option("header", "true").csv("/user/root/data/*.csv")
df_notnull = df.filter(
    F.col("lon").isNotNull() & F.col("lat").isNotNull() &
    F.col('P1').isNotNull() & F.col('timestamp').isNotNull())
df = df_notnull

df_timestamp = df.withColumn('timestamp', df['timestamp'].substr(1, 7))
df = df_timestamp
timestamp = df.collect()[0][5]

features = ['P1', 'lon', 'lat']
vector_assembler = VectorAssembler(inputCols=features, outputCol="features")

# Cast feature columns to double
import pandas as pd
import json
from pandas import DataFrame
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

# Create Pandas dataframe
companies_data = pd.read_json('test.json')
dataFrame = pd.DataFrame(companies_data)
print(dataFrame.T)

# Manipulate Pandas dataframe and write back to json file
change_companies_data = dataFrame.loc['exchange_code'] = 123456
json_rewrite = dataFrame.to_json('test.json')
dataFrameTranspose = dataFrame.T

# Convert Pandas dataframe to Spark dataframe
spark_dataframe = spark.createDataFrame(dataFrameTranspose.astype(str))
spark_dataframe.show()

# Convert Spark dataframe to Pandas dataframe
spark_to_panda_dataframe = spark_dataframe.toPandas()
print(spark_to_panda_dataframe)
    import numpy as np
    x, y = np.array(x), np.array(y)
    xm = np.mean(x)
    ym = np.mean(y)
    # OLS slope: sum((x - xm) * (y - ym)) / sum((x - xm) ** 2)
    numer = sum((x - xm) ** 2)
    denomi = sum((y - ym) * (x - xm))
    coef = denomi / numer
    return coef


if __name__ == "__main__":
    start_time = time.time()
    output = sys.argv[1]

    sc = SparkContext()
    spark = SparkSession(sc)

    street1 = sc.textFile('hdfs:///tmp/bdm/nyc_cscl.csv').mapPartitionsWithIndex(processStreet)
    violation = sc.textFile('hdfs:///tmp/bdm/nyc_parking_violation/').mapPartitionsWithIndex(processViolation)

    viola = spark.createDataFrame(
        violation, ('year', 'street', 'boro', 'house_number', 'is_left'))
    stre = spark.createDataFrame(
        street1, ('physicalID', 'street', 'boro', 'low', 'high', 'is_left'))
    stre = stre.distinct()

    filtering = [
        viola.boro == stre.boro,
        viola.street == stre.street,
        viola.is_left == stre.is_left,
        (viola.house_number >= stre.low) & (viola.house_number <= stre.high)
        return "\"{}\"".format(i)
    else:
        return str(i)


def to_csv(rdd):
    li = map(process, rdd)
    return ','.join(li)


if __name__ == "__main__":
    start_time = time.time()
    output = sys.argv[1]
    sc = SparkContext()
    spark = SparkSession(sc)

    centerline = sc.textFile('hdfs:///tmp/bdm/nyc_cscl.csv')
    rdd_cl = centerline.mapPartitionsWithIndex(processCenterline)
    violations = sc.textFile('hdfs:///tmp/bdm/nyc_parking_violation/')
    rdd_v = violations.mapPartitionsWithIndex(processViolation)

    v = spark.createDataFrame(rdd_v, ('year', 'house', 'street', 'boro', 'is_left'))
    cl = spark.createDataFrame(
        rdd_cl, ('pysicalID', 'street', 'boro', 'low', 'high', 'is_left'))
def main():
    sc = SparkContext("local", "dataframe app")
    sc.setLogLevel("ERROR")
    spark = SparkSession(sc)

    # load the retail dataset
    retail_data = spark.read.option("inferSchema", "true").option(
        "header", "true"
    ).option("timestampFormat", "dd/M/yyyy H:mm").csv(
        "/Users/faizan/Documents/Masters/2nd_Semester/Big_Data/Tutorial/Tutorials/Tutorial02/online-retail-dataset.csv"
    )
    retail_data.show()

    # Question 1
    # How many orders did customers perform at which hour?

    # a) SQL
    retail_data.createOrReplaceTempView("retailTable")
    result = spark.sql("""
        SELECT hour(InvoiceDate) as InvoiceHour, count(distinct InvoiceNo) as NoInvoices
        FROM retailTable
        GROUP BY InvoiceHour
        ORDER BY InvoiceHour
    """)
    result.show()

    # b) Spark
    result = retail_data.selectExpr(
        "hour(InvoiceDate) as InvoiceHour",
        "InvoiceNo").distinct().groupBy("InvoiceHour").agg(
            f.expr("count(InvoiceNo) as NoInvoices")).orderBy("InvoiceHour")
    result.show()

    # Question 2
    # How frequently was each product bought in the different countries?

    # a) SQL
    df_selection = retail_data.selectExpr("Country", "StockCode", "Quantity")
    df_nonull = df_selection.na.replace(
        [""], ["UNKNOWN"], "StockCode").na.replace([""], ["UNKNOWN"], "Country").na.drop("any")
    df_nonull.createOrReplaceTempView("retailNoNull")
    result = spark.sql("""
        SELECT Country, StockCode, sum(Quantity) as Quantity
        FROM retailNoNull
        GROUP BY Country, StockCode GROUPING SETS ((Country, StockCode), (Country), (StockCode), ())
        ORDER BY Country, StockCode
    """)
    result.show()

    # b) Spark
    result = df_nonull.cube("Country", "StockCode").agg(
        f.sum("Quantity").alias("Quantity")).orderBy(f.col("Country"), f.col("StockCode"))
    result.show()

    result.coalesce(1).write.format("csv").option("header", "true").save(
        "/Users/faizan/Documents/Masters/2nd_Semester/Big_Data/Tutorial/Tutorials/Tutorial03/frequencies"
    )

    sc.stop()
        intp.saveDFToCsv(df._jdf, path, hasheader, isOverwrite,
                         MapConverter().convert(option, gateway._gateway_client))
    else:
        print(str(df))


java_import(gateway.jvm, "scala.Tuple2")

jsc = intp.getJavaSparkContext()
jconf = intp.getSparkConf()
conf = SparkConf(_jvm=gateway.jvm, _jconf=jconf)
sc = SparkContext(jsc=jsc, gateway=gateway, conf=conf)
sqlc = HiveContext(sc, intp.sqlContext())
sqlContext = sqlc
spark = SparkSession(sc, intp.getSparkSession())

# add pyfiles
try:
    pyfile = sys.argv[5]
    pyfiles = pyfile.split(',')
    for i in range(len(pyfiles)):
        if "" != pyfiles[i]:
            sc.addPyFile(pyfiles[i])
except Exception as e:
    print("add pyfile error: " + pyfile)


class UDF(object):
    def __init__(self, intp, sqlc):
        self.intp = intp
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier

# COMMAND ----------

# MAGIC %md #####The command below starts the Spark session when the file is run in Oracle BDE. In Databricks, keep this cell as False by default; when you run the file in Oracle BDE, set it to True.

# COMMAND ----------

#This
IS_SPARK_SUBMIT_CLI = False
if IS_SPARK_SUBMIT_CLI:
    sc = SparkContext.getOrCreate()
    spark = SparkSession(sc)

# COMMAND ----------

# MAGIC %md ### Load Source Data
# MAGIC The data for this project is provided as a CSV file containing details of advertisements. The data includes specific characteristics (or *features*) for each ad, as well as a *label* column indicating whether the ad was clicked or not.
# MAGIC
# MAGIC You will load this data into a DataFrame and display it.

# COMMAND ----------

# MAGIC %md #####Reading all necessary csv files

# COMMAND ----------

if IS_SPARK_SUBMIT_CLI:
def main(sc):
    spark = SparkSession(sc)
    sqlContext = SQLContext(sc)
    years = ['2015', '2016', '2017', '2018', '2019']

    def parseCSV(idx, part):
        if idx == 0:
            next(part)
        for p in csv.reader(part):
            if p[23].isalpha() or p[24] == '' or p[21] == '' or p[23] == '' or p[4][-4:] not in years:
                continue
            if '-' in p[23]:
                yield (p[23].split('-')[0], p[23].split('-')[1], p[24].lower(), p[21], p[4][-4:])
            else:
                yield (p[23], '', p[24].lower(), p[21], p[4][-4:])

    rows = sc.textFile('/data/share/bdm/nyc_parking_violation/*.csv',
                       use_unicode=True).mapPartitionsWithIndex(parseCSV)
    df = sqlContext.createDataFrame(
        rows, ('House Number', 'HN Compound', 'Street Name', 'County', 'Date'))

    map_NY = (col("County") == 'NY') | \
             (col("County") == 'MAN') | \
             (col("County") == 'MH') | \
             (col("County") == 'MN') | \
             (col("County") == 'NEWY') | \
             (col("County") == 'NEW Y')
    map_BX = (col("County") == 'BRONX') | \
             (col("County") == 'BX')
    map_BK = (col("County") == 'BK') | \
             (col("County") == 'K') | \
             (col("County") == 'KING') | \
             (col("County") == 'KINGS')
    map_QN = (col("County") == 'Q') | \
             (col("County") == 'QN') | \
             (col("County") == 'QNS') | \
             (col("County") == 'QU') | \
             (col("County") == 'QUEEN')
    map_R = (col("County") == 'R') | \
            (col("County") == 'RICHMOND')

    df = df.withColumn("County", when(map_NY, '1')
                       .when(map_BX, '2')
                       .when(map_BK, '3')
                       .when(map_QN, '4')
                       .when(map_R, '5')
                       .otherwise('')).where(col('County') != '')
    df = df.withColumn("House Number", df["House Number"].cast('int'))
    df = df.withColumn("HN Compound", df["HN Compound"].cast('int'))

    def parseCL(idx, part):
        if idx == 0:
            next(part)
        for p in csv.reader(part):
            LL_HN = p[2]
            LL_HNC = ''
            LH_HN = p[3]
            LH_HNC = ''
            if '-' in p[2] and '-' in p[3]:
                LL_HN = p[2].split('-')[0]
                LL_HNC = p[2].split('-')[1]
                LH_HN = p[3].split('-')[0]
                LH_HNC = p[3].split('-')[1]
            RL_HN = p[4]
            RL_HNC = ''
            RH_HN = p[5]
            RH_HNC = ''
            if '-' in p[4] and '-' in p[5]:
                RL_HN = p[4].split('-')[0]
                RL_HNC = p[4].split('-')[1]
                RH_HN = p[5].split('-')[0]
                RH_HNC = p[5].split('-')[1]
            yield (p[0], p[28].lower(), p[29].lower(), p[13],
                   LL_HN, LL_HNC, LH_HN, LH_HNC, RL_HN, RL_HNC, RH_HN, RH_HNC)

    rows = sc.textFile('/data/share/bdm/nyc_cscl.csv',
                       use_unicode=True).mapPartitionsWithIndex(parseCL)
    centerline = sqlContext.createDataFrame(
        rows, ('ID', 'full street', 'st label', 'borocode',
               'LL_HN', 'LL_HNC', 'LH_HN', 'LH_HNC', 'RL_HN', 'RL_HNC', 'RH_HN', 'RH_HNC'))
    centerline = centerline.withColumn("LL_HN", centerline["LL_HN"].cast('int'))
    centerline = centerline.withColumn("LH_HN", centerline["LH_HN"].cast('int'))
    centerline = centerline.withColumn("RL_HN", centerline["RL_HN"].cast('int'))
    centerline = centerline.withColumn("RH_HN", centerline["RH_HN"].cast('int'))
    centerline = centerline.withColumn("LL_HNC", centerline["LL_HNC"].cast('int'))
    centerline = centerline.withColumn("LH_HNC", centerline["LH_HNC"].cast('int'))
    centerline = centerline.withColumn("RL_HNC", centerline["RL_HNC"].cast('int'))
    centerline = centerline.withColumn("RH_HNC", centerline["RH_HNC"].cast('int'))
    print('Data loaded')

    cond1 = (df['Street Name'] == centerline['full street'])
    cond2 = (df['Street Name'] == centerline['st label'])
    cond3 = (df['County'] == centerline['borocode'])
    cond4 = (df['House Number'] % 2 == 1)
    cond5 = (df['House Number'] >= centerline['LL_HN']) & (df['House Number'] <= centerline['LH_HN'])
    cond6 = (df['House Number'] % 2 == 0)
    cond7 = (df['House Number'] >= centerline['RL_HN']) & (df['House Number'] <= centerline['RH_HN'])
    cond8 = cond4 & cond5
    cond9 = cond6 & cond7
    hnc_cond1 = (df['HN Compound'].isNotNull())
    hnc_cond2 = (df['HN Compound'].isNull())
    hnc_cond3 = ((df['HN Compound'] >= centerline['LL_HNC']) & (df['HN Compound'] <= centerline['LH_HNC']))
    hnc_cond4 = ((df['HN Compound'] >= centerline['RL_HNC']) & (df['HN Compound'] <= centerline['RH_HNC']))
    cond10 = (hnc_cond2 & (cond8 | cond9))
    cond11 = (hnc_cond1 & (cond8 | cond9) & (hnc_cond3 | hnc_cond4))

    joined = df.join(centerline, ((cond1 | cond2) & cond3 & (cond10 | cond11)), "inner")
    joined = joined.select(col('ID'), col('Date'))
    count_df = joined.groupBy(['ID', 'Date']).pivot('Date').count().drop('Date')
    print('Table pivoted')

    allID = centerline.select(col('ID')).dropDuplicates()
    result = allID.join(count_df, on=["ID"], how='outer').na.fill(0)

    marksColumns = [col('2015'), col('2016'), col('2017'), col('2018'), col('2019')]
    diff_x = [-2, -1, 0, 1, 2]
    average_func = sum(x for x in marksColumns) / len(marksColumns)
    result = result.withColumn("avg", average_func)
    # OLS slope over the years centered at 2017: sum(diff * (y - avg)) / sum(diff ** 2), and sum(diff ** 2) = 10
    ols_func = sum(diff * (y - col('avg')) for diff, y in zip(diff_x, marksColumns)) / 10
    coef = result.withColumn("OLS_COEF", ols_func).drop('avg')

    coef.rdd.map(writeToCSV).saveAsTextFile(sys.argv[1])
#!/usr/bin/python # -*- coding: utf-8 -*- from datetime import datetime import sys reload(sys) sys.setdefaultencoding('utf-8') import pyspark from pyspark.context import SparkContext from pyspark.sql.session import SparkSession from pyspark.sql import Row from pyspark.sql.types import * from pyspark.sql.functions import array sc = SparkContext('local') spark = SparkSession(sc) print "begin to map input" fieldSchema = StructType([ StructField("label", IntegerType(), True), StructField("pdef", DoubleType(), True), StructField("pbeau", DoubleType(), True), StructField("pnum", IntegerType(), True), StructField("s_term", StringType(), True), StructField("sumclick", LongType(), True), StructField("sumshow", LongType(), True), StructField("ts", LongType(), True), StructField("uid", LongType(), True), StructField("urlid", LongType(), True) ])
#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import datetime
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

sc = SparkContext('local')
spark = SparkSession(sc)

fieldSchema = StructType([
    StructField("ctr", DoubleType(), True),
    StructField("label", IntegerType(), True),
    StructField("pdef", DoubleType(), True),
    StructField("pbeau", DoubleType(), True),
    StructField("pnum", IntegerType(), True),
    StructField("s_term_score", DoubleType(), True),
    StructField("sumclick", LongType(), True),
    StructField("sumshow", LongType(), True),
    StructField("uid", LongType(), True)
])

print "begin to map input"
train_set = spark.read.csv(
    "gs://dataproc-1228d533-ffe2-4747-a056-8cd396c3db5f-asia-southeast1/data/picfeed/train_feature_compose_new/part-*",
    schema=fieldSchema)
import findspark
findspark.init()

from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext('local')
spark = SparkSession(sc)

df = spark.createDataFrame([
    (7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0,),
    (8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0,),
    (9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0,),
], ["id", "features", "clicked"])

selector = ChiSqSelector(numTopFeatures=1, featuresCol="features",
                         outputCol="selected",
#!/usr/bin/python # -*- coding: utf-8 -*- from datetime import datetime import sys reload(sys) sys.setdefaultencoding('utf-8') import pyspark from pyspark.context import SparkContext from pyspark.sql.session import SparkSession from pyspark.sql import Row from pyspark.sql.types import * from pyspark.sql.functions import array sc = SparkContext('local') spark = SparkSession(sc) print "begin to map input" fieldSchema = StructType([ StructField("label", IntegerType(), True), StructField("pdef", DoubleType(), True), StructField("pbeau", DoubleType(), True), StructField("pnum", IntegerType(), True), StructField("s_term", StringType(), True), StructField("sumclick", LongType(), True), StructField("sumshow", LongType(), True), StructField("ts", LongType(), True), StructField("uid", LongType(), True), StructField("urlid", LongType(), True), StructField("user_s_term", StringType(), True) ]) train_set_join_user_model = spark.read.csv( "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/train_set_join_user_model/part-00000-59d90ec7-6a27-4356-901d-ea40b3333c49-c000.csv", schema=fieldSchema)
# Samuel Tribe - 201318996 - [email protected]
from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as F
from pyspark.sql.session import SparkSession
from pyspark.sql.types import DateType

conf = SparkConf().setAppName("covid19").setMaster("local")
spark = SparkSession(SparkContext(conf=conf))

csvPath = r"C:\spark\COMP336-Coursework-1\data\covid19.csv"
covidDF = spark.read.csv(csvPath, header=True, inferSchema=True)
covidDF = covidDF.withColumn("date", F.col("date").cast(DateType()))
print("covid19.csv read as Dataframe with header=True")
covidDF.show()

print("Schema for dataframe")
covidDF.printSchema()

print("Filtering out NULL values from dataframe")
covidDF = covidDF.filter(covidDF.continent.isNotNull() & covidDF.location.isNotNull() &
                         covidDF.date.isNotNull() & covidDF.total_cases.isNotNull() &
                         covidDF.new_cases.isNotNull() & covidDF.total_deaths.isNotNull() &
                         covidDF.new_deaths.isNotNull())
covidDF.show()

print("Highest deaths per country")
covidDF.groupBy(['location']).agg(F.max(covidDF.total_deaths)).show()

print("max and min function results on total_cases")
covidDF.groupBy(['location']).agg(F.max(covidDF.total_cases).alias('total_cases_max'),
                                  F.min(covidDF.total_cases).alias('total_cases_min')).show()
import findspark
findspark.init()

from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext('local')
spark = SparkSession(sc)

data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
        (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
df = spark.createDataFrame(data, ["features"])

r1 = Correlation.corr(df, "features").head()
print("Pearson correlation matrix:\n" + str(r1[0]))

r2 = Correlation.corr(df, "features", "spearman").head()
print("Spearman correlation matrix:\n" + str(r2[0]))
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType
from pyspark.streaming import StreamingContext

sc = SparkContext('local[2]', 'timeS')
ssc = StreamingContext(sc, 1)
spark = SparkSession(sc)
data_file = ssc.socketTextStream("localhost", 9999)

from pyspark.sql.window import Window
from pyspark.sql import functions as func
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series
import pyspark.sql as sparksql
import numpy as np
import time
from pprint import pprint

days = lambda i: i * 86400

# Note: this reassigns data_file from the socket stream above to a local CSV path
data_file = "./export.csv"
raw_data = sc.textFile(data_file)
csv_data = raw_data.map(lambda x: x.split(","))
#csv_data.toDF().show()
#!/usr/bin/python # -*- coding: utf-8 -*- from datetime import datetime import sys reload(sys) sys.setdefaultencoding('utf-8') import pyspark from pyspark.context import SparkContext from pyspark.sql.session import SparkSession from pyspark.sql import Row sc = SparkContext('local') spark = SparkSession(sc) print "begin to map input" train_set = sc.textFile("gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/user_url_combine_data_v2/part-00000-eab41fe7-4a1c-46e5-b995-6beba43de164-c000.csv").map(lambda row: row.split(",", 4)).map(lambda p: Row(uid=p[0], urlid=p[1], ts=p[2], label=p[3], urlinfo=p[4])) print "finish to map input" print train_set.take(2) train_set_d = spark.createDataFrame(train_set) train_set_d.createOrReplaceTempView("train_set") print "start select" sql_query = """ SELECT train_set.uid, train_set.urlinfo FROM train_set WHERE train_set.label > 0 """ train_set_urlinfo = spark.sql(sql_query) print train_set_urlinfo.take(2) def process_uinfo(line):