'''
a. Tokenizer
b. StopWordsRemover
c. n-gram
d. Binarizer
e. PCA
f. PolynomialExpansion
g. StringIndexer

@author: kasho
'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

conf = BaseConfUtils()
spark = conf.createSparkSession("Examples of All Feature Transformers")

###############################################################################################################
# Tokenizer :
# Tokenization is the process of taking text (such as a sentence) and breaking it into individual terms
# (usually words). A simple Tokenizer class provides this functionality.
# RegexTokenizer allows more advanced tokenization based on regular expression (regex) matching. By default,
# the parameter "pattern" (regex, default: "\\s+") is used as the delimiter to split the input text.
# Alternatively, users can set the parameter "gaps" to false, indicating that the regex "pattern" denotes
# "tokens" rather than splitting gaps, and find all matching occurrences as the tokenization result.
###############################################################################################################
sentenceDataFrame = spark.createDataFrame([
    (0, "Hi I heard about Spark"),
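# -----------------------------------------------------------------------------------------------------------
# A minimal sketch of how the Tokenizer and RegexTokenizer imported above are typically applied. The small
# DataFrame below and its column names ("id", "sentence") are illustrative assumptions, not the original code.
sketchDF = spark.createDataFrame([
    (0, "Hi I heard about Spark"),
    (1, "Logistic,regression,models,are,neat")
], ["id", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")            # split on whitespace
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words",
                                pattern="\\W")                           # split on non-word characters
countTokens = udf(lambda words: len(words), IntegerType())               # number of tokens per row

tokenizer.transform(sketchDF) \
    .select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words"))) \
    .show(truncate=False)

regexTokenizer.transform(sketchDF) \
    .select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words"))) \
    .show(truncate=False)
# -----------------------------------------------------------------------------------------------------------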
'''
Created on 16-Jun-2020

This file streams a folder and reads newly generated files using PySpark Streaming.

@author: kasho
'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils

conf = BaseConfUtils()
sparkContxt = conf.createSparkContext("Streaming Local File System")
scc = conf.createStreamingContext(sparkContxt)

if __name__ == "__main__":
    # Monitor this directory; only files created after the stream starts are picked up.
    lines = scc.textFileStream("D:/Study_Document/GIT/OneStopPySpark/temp/")
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda x: (x, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    scc.start()
    scc.awaitTermination()
'''
Created on 09-Jun-2020

Basic operations on DataFrames

@author: kasho
'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils

conf = BaseConfUtils()
sparkSess = conf.createSparkSession("DataFrameBasics")

df = sparkSess.read.json(
    "D:/Study_Document/GIT/OneStopPySpark/resources/people.json")
df.show()
'''
+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+
'''
df.printSchema()
'''
root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
'''
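# -----------------------------------------------------------------------------------------------------------
# A short sketch of a few common DataFrame operations on df, added for illustration; these calls follow the
# standard DataFrame API and are not taken from the original script.
df.select("name").show()              # project a single column
df.filter(df["age"] > 21).show()      # keep rows with age > 21
df.groupBy("age").count().show()      # count people per age
# -----------------------------------------------------------------------------------------------------------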
'''
Create a Spark program to read the airport data from in/airports.text, find all the airports whose latitude
is bigger than 40, then output the airport's name and the airport's latitude to out/airports_by_latitude.text.

Each row of the input file contains the following columns:
Airport ID, Name of airport, Main city served by airport, Country where airport is located,
IATA/FAA code, ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format

Sample output:
"St Anthony", 51.391944
"Tofino", 49.082222
'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from com.pyspark.poc.utils.CommanUtils import CommanUtils

conf = BaseConfUtils()
sc = conf.createSparkContext("AirportsLatitude")


def splitByComma(line: str):
    splits = CommanUtils.COMMA_DELIMITER.split(line)
    return "{}, {}".format(splits[1], splits[6])


if __name__ == "__main__":
    airportsLatitude = sc.textFile(
        "D:/Study_Document/GIT/OneStopPySpark/resources/airports.txt")
    filteredAirports = airportsLatitude.filter(
        lambda lines: float(CommanUtils.COMMA_DELIMITER.split(lines)[6]) > 40)
    airportsNameAndLatitude = filteredAirports.map(splitByComma)
    airportsNameAndLatitude.saveAsTextFile(
'''
9) Outcome: Class variable (0 or 1)

@author: kasho
'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
import numpy as np
from pyspark.sql.functions import when
from pyspark.ml.feature import Imputer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

conf = BaseConfUtils()
sparkSess = conf.createSparkSession("Diabetes Data Classification")

raw_data = sparkSess.read.csv(
    'D:/Study_Document/GIT/OneStopPySpark/resources/diabetes.csv',
    inferSchema=True, header=True)
print(raw_data.columns)
raw_data.show(5, truncate=False)
'''
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI |DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|6          |148    |72           |35           |0      |33.6|0.627                   |50 |1      |
'''
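# -----------------------------------------------------------------------------------------------------------
# A minimal sketch of how the stages imported above are typically chained for this dataset; the feature
# handling, split ratio and seed are illustrative assumptions, not the original code.
feature_cols = [c for c in raw_data.columns if c != "Outcome"]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
lr = LogisticRegression(labelCol="Outcome", featuresCol="scaled_features")

assembled = assembler.transform(raw_data)
scaled = scaler.fit(assembled).transform(assembled)

train, test = scaled.randomSplit([0.7, 0.3], seed=42)
predictions = lr.fit(train).transform(test)

evaluator = BinaryClassificationEvaluator(labelCol="Outcome")   # areaUnderROC by default
print("Test AUC:", evaluator.evaluate(predictions))
# Imputer and ChiSqSelector (also imported above) would slot into the same fit/transform pattern,
# before assembling and after scaling respectively.
# -----------------------------------------------------------------------------------------------------------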
'''
It is an important tool for doing statistics. Most databases support window functions, and Spark has
supported them since version 1.4. Spark window functions have the following traits:
a. perform a calculation over a group of rows, called the Frame
b. a frame corresponds to the current row
c. return a new value for each row via an aggregate/window function
d. can use SQL grammar or the DataFrame API
(A usage sketch follows the sample data below.)

@author: kasho
'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.sql.functions import *

conf = BaseConfUtils()
sparkContxt = conf.createSparkContext("Windows Fun")
sqlContxt = conf.createSQLContext(sparkContxt)


@udf("long")
def median_udf(s):
    index = int(len(s) / 2)
    return s[index]


if __name__ == "__main__":
    empsalary_data = [("sales", 1, "Alice", 5000, ["game", "ski"]),
                      ("personnel", 2, "Olivia", 3900, ["game", "ski"]),
                      ("sales", 3, "Ella", 4800, ["skate", "ski"]),
                      ("sales", 4, "Ebba", 4800, ["game", "ski"])]
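    # ---------------------------------------------------------------------------------------------------------
    # A minimal sketch of a window aggregation over the data above, added for illustration; the column names
    # (depName, empNo, name, salary, hobby) are assumptions, not the original schema.
    empsalary = sqlContxt.createDataFrame(
        empsalary_data, ["depName", "empNo", "name", "salary", "hobby"])

    # Frame = all rows of the same department (traits a/b); each row gets a new value (trait c).
    byDep = Window.partitionBy("depName")
    empsalary.withColumn("avg_dep_salary", avg("salary").over(byDep)) \
             .withColumn("salary_rank", rank().over(byDep.orderBy(desc("salary")))) \
             .show()
    # ---------------------------------------------------------------------------------------------------------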
'''
Created on 23-Apr-2020

@author: kasho
'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils

conf = BaseConfUtils()

'''
Fetch records from MySQL over JDBC.
'''
sparkSess = conf.createSparkSession("Testing")

if __name__ == '__main__':
    # Fetching records using SQLContext:
    # source_df = sqlContext.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/sakila") \
    #     .option("driver", "com.mysql.jdbc.Driver").option("dbtable", "actor") \
    #     .option("user", "root").option("password", "ayyappasai").load()
    # source_df.show()

    sourceDF = sparkSess.read \
        .format("jdbc").option("url", "jdbc:mysql://localhost:3306/ONESTOP_SPARK_DB") \
        .option("driver", "com.mysql.jdbc.Driver").option("dbtable", "Student_Marks") \
        .option("user", "root").option("password", "ayyappasai").load()
    sourceDF.show()
'''
Created on 18-July-2020

Requirement: Read a JSON file and list the count of #TAG words together with the user name.

@author: Ashok Kumar
'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.sql.functions import explode

conf = BaseConfUtils()
spark = conf.createSparkSession("HashTagCount")


def iteration(user):
    print(user)


def fetchRowData(row):
    # Pull the user name and split the tweet text into words.
    user_name = row["USER_NAME"]
    text = row["TEXT"]
    words = text.split(" ")
    return (user_name, words)


def filterMyHashTag(row):
    # Keep only the words that contain a hashtag.
    l = list()
    key = row[0]
    for word in row[1]:
        if "#" in word:
            l.append(word)
    rowval = (key, l)
    return rowval
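# -----------------------------------------------------------------------------------------------------------
# A minimal sketch of how the helpers above can be wired into the hashtag count; the JSON path and the
# (USER_NAME, TEXT) schema are assumptions based on the field names used in fetchRowData.
if __name__ == "__main__":
    tweetsDF = spark.read.json(
        "D:/Study_Document/GIT/OneStopPySpark/resources/tweets.json")     # path assumed

    hashTagCounts = (tweetsDF.rdd
                     .map(fetchRowData)                                   # (user, [words])
                     .map(filterMyHashTag)                                # (user, [#tags])
                     .flatMap(lambda kv: [((kv[0], tag), 1) for tag in kv[1]])
                     .reduceByKey(lambda a, b: a + b))

    for (user, tag), count in hashTagCounts.collect():
        print(user, tag, count)
# -----------------------------------------------------------------------------------------------------------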
'''
Created on 25-Apr-2020

Write data to a MySQL DB.

@author: kasho
'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.sql.types import Row, StructType, StructField, IntegerType, \
    DoubleType, StringType, FloatType
from pyspark.sql.session import SparkSession

conf = BaseConfUtils()
spark = conf.createSparkSession("Write to MySQL")


def createRow(row):
    # Split one CSV line and keep only the columns we want to persist.
    eachRow = row.split(",")
    row = Row(eachRow[0], eachRow[1], eachRow[2],
              eachRow[3], eachRow[6], eachRow[7],
              eachRow[8], eachRow[9], eachRow[10],
              eachRow[12], eachRow[13], eachRow[14])
    # row = Row(id=eachRow[0], name=eachRow[1], nationality=eachRow[2],
    #           city=eachRow[3], gender=eachRow[6], age=eachRow[7],
    #           english_grade=eachRow[8], math_grade=eachRow[9], sciences_grade=eachRow[10],
    #           portfolio_rating=eachRow[12], coverletter_rating=eachRow[13], refletter_rating=eachRow[14])
    print(row)
    return row
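# -----------------------------------------------------------------------------------------------------------
# A minimal sketch of reading a CSV, building Rows with createRow and writing them over JDBC. The input path,
# column list and target table are assumptions (the column names mirror the commented-out variant of
# createRow above, and the JDBC options mirror the read-side options used elsewhere in this repo).
if __name__ == "__main__":
    lines = spark.sparkContext.textFile(
        "D:/Study_Document/GIT/OneStopPySpark/resources/students.csv")    # path assumed
    rows = lines.map(createRow)

    columns = ["id", "name", "nationality", "city", "gender", "age",
               "english_grade", "math_grade", "sciences_grade",
               "portfolio_rating", "coverletter_rating", "refletter_rating"]
    studentsDF = spark.createDataFrame(rows, columns)

    studentsDF.write.format("jdbc") \
        .option("url", "jdbc:mysql://localhost:3306/ONESTOP_SPARK_DB") \
        .option("driver", "com.mysql.jdbc.Driver") \
        .option("dbtable", "Student_Marks") \
        .option("user", "root").option("password", "ayyappasai") \
        .mode("append") \
        .save()
# -----------------------------------------------------------------------------------------------------------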
'''
Create a Spark program to read the first 100 prime numbers from in/prime_nums.text and print the sum of
those numbers to the console.

Each row of the input file contains 10 prime numbers separated by spaces.

@author: Ashok Kumar
'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils

conf = BaseConfUtils()
sc = conf.createSparkContext("SumOfNumbersSolution")

if __name__ == "__main__":
    primeNumbersFile = sc.textFile(
        "D:/Study_Document/GIT/OneStopPySpark/resources/prime_nums.text")
    onlyNumber = primeNumbersFile.flatMap(lambda line: line.split("\t"))
    validNumbers = onlyNumber.filter(lambda number: number)   # drop empty strings
    intNumbers = validNumbers.map(lambda number: int(number))
    print("Sum is: {}".format(intNumbers.reduce(lambda x, y: x + y)))

    print("=================== Execution Completed ===================")
'''
Created on 12-Jun-2020

This file will connect with Twitter to read tweets.

@author: kasho
'''
import time

from pyspark.sql import Row
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.conf import SparkConf

conf = BaseConfUtils()
sc = conf.createSparkContext("PySpark Word Count Example")
sqlContxt = conf.createSQLContext(sc)


def filterEmptyLines(line):
    if len(line) > 0:
        return line


if __name__ == "__main__":
    words = sc.textFile("D:/Study_Document/pycharm-workspace/PySparkPOC/resources/wordCount.txt").flatMap(
        lambda line: line.split("\n"))
    words = words.filter(filterEmptyLines)
    line = words.map(lambda p: Row(name=p))
    df = sqlContxt.createDataFrame(line)
'''
Below are the feature extractors supported in PySpark:
a. TF-IDF
b. Word2Vec
c. CountVectorizer
d. FeatureHasher

@author: kasho
'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import FeatureHasher

conf = BaseConfUtils()
sparkSess = conf.createSparkSession("Examples of All Feature Extractions")

###############################################################################################################
# TF-IDF :
# Term frequency-inverse document frequency (TF-IDF) is a feature vectorization method widely used in
# text mining to reflect the importance of a term to a document in the corpus. Denote a term by t,
# a document by d, and the corpus by D.
#
# In MLlib, we separate TF and IDF to make them flexible:
# a. TF: Both HashingTF and CountVectorizer can be used to generate the term frequency vectors.
#    HashingTF is a Transformer which takes sets of terms and converts those sets into fixed-length
#    feature vectors. In text processing, a "set of terms" might be a bag of words. HashingTF utilizes
#    the hashing trick.
#
#    CountVectorizer converts text documents to vectors of term counts. Refer to CountVectorizer
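# -----------------------------------------------------------------------------------------------------------
# A minimal TF-IDF sketch using the classes imported above (Tokenizer -> HashingTF -> IDF); the sample
# sentences, labels and column names are illustrative assumptions, not the original code.
sentenceData = sparkSess.createDataFrame([
    (0.0, "Hi I heard about Spark"),
    (0.0, "I wish Java could use case classes"),
    (1.0, "Logistic regression models are neat")
], ["label", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)            # term-frequency vectors

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)                         # learns document frequencies
idfModel.transform(featurizedData).select("label", "features").show(truncate=False)
# -----------------------------------------------------------------------------------------------------------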
'''
Account_Manager: Binary; 0 = no manager, 1 = account manager assigned
Years: Total years as a customer
Num_sites: Number of websites that use the service
Onboard_date: Date that the name of the latest contact was onboarded
Location: Client HQ address
Company: Name of client company

@author: kasho
'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

conf = BaseConfUtils()
sparkSess = conf.createSparkSession("customer churn Model")

if __name__ == "__main__":
    data = sparkSess.read.csv(
        'D:/Study_Document/GIT/OneStopPySpark/resources/customer_churn.csv',
        inferSchema=True, header=True)
    data.printSchema()
    data.show(5)
    '''
    +----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
    |           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
    +----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
    |Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
    |   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
    '''
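    # ---------------------------------------------------------------------------------------------------------
    # A minimal sketch of assembling features and fitting the churn model with the classes imported above;
    # the feature selection, split ratio and seed are illustrative assumptions. Column names come from the
    # schema shown above.
    assembler = VectorAssembler(
        inputCols=["Age", "Total_Purchase", "Account_Manager", "Years", "Num_Sites"],
        outputCol="features")
    assembled = assembler.transform(data).select("features", "Churn")

    train, test = assembled.randomSplit([0.7, 0.3], seed=42)
    lr = LogisticRegression(labelCol="Churn", featuresCol="features")
    churn_model = lr.fit(train)

    predictions = churn_model.transform(test)
    evaluator = BinaryClassificationEvaluator(labelCol="Churn")   # areaUnderROC by default
    print("Test AUC:", evaluator.evaluate(predictions))
    # ---------------------------------------------------------------------------------------------------------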
'''
Create a Spark program to read the airport data from in/airports.text, find all the airports which are
located in the United States, and output the airport's name and the city's name to out/airports_in_usa.text.

Each row of the input file contains the following columns:
Airport ID, Name of airport, Main city served by airport, Country where airport is located,
IATA/FAA code, ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format

Sample output:
"Putnam County Airport", "Greencastle"
"Dowagiac Municipal Airport", "Dowagiac"

@author: Ashok Kumar
'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from com.pyspark.poc.utils.CommanUtils import CommanUtils

conf = BaseConfUtils()
sc = conf.createSparkContext("AirportList")


def splitByComma(line: str):
    splits = CommanUtils.COMMA_DELIMITER.split(line)
    return "{}, {}".format(splits[1], splits[2])


if __name__ == "__main__":
    airports = sc.textFile(
        "D:/Study_Document/GIT/OneStopPySpark/resources/airports.txt")
    airportsInUSA = airports.filter(
        lambda lines: CommanUtils.COMMA_DELIMITER.split(lines)[3] == "\"United States\"")
    airportsNameAndCityNames = airportsInUSA.map(splitByComma)
    airportsNameAndCityNames.saveAsTextFile(
'''
Created on 05-Sept-2020

The "in/nasa_19950701.tsv" file contains 10000 log lines from one of NASA's Apache servers for July 1st, 1995.
The "in/nasa_19950801.tsv" file contains 10000 log lines for August 1st, 1995.
Create a Spark program to generate a new RDD which contains the log lines from both July 1st and August 1st,
take a 0.1 sample of those log lines and save it to the "out/sample_nasa_logs.tsv" file.

Keep in mind that the original log files contain the following header line:
host    logname    time    method    url    response    bytes
Make sure the header lines are removed in the resulting RDD.
'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils

conf = BaseConfUtils()
sc = conf.createSparkContext("UnionSampleLog")


def isNotHeader(line: str):
    return not (line.startswith("host") and "bytes" in line)


if __name__ == "__main__":
    julyFirstLogs = sc.textFile(
        "D:/Study_Document/GIT/OneStopPySpark/resources/nasa_19950701.tsv")
    augustFirstLogs = sc.textFile(
        "D:/Study_Document/GIT/OneStopPySpark/resources/nasa_19950801.tsv")
    aggregatedLogs = julyFirstLogs.union(augustFirstLogs)
    cleanLogHeader = aggregatedLogs.filter(isNotHeader)
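    # ---------------------------------------------------------------------------------------------------------
    # A minimal sketch of the remaining steps described in the docstring (0.1 sample, then save); the sample
    # call without a seed and the output path under the repo's out/ folder are assumptions.
    sampleLogs = cleanLogHeader.sample(withReplacement=False, fraction=0.1)
    sampleLogs.saveAsTextFile(
        "D:/Study_Document/GIT/OneStopPySpark/out/sample_nasa_logs.tsv")
    print("Execution Completed")
    # ---------------------------------------------------------------------------------------------------------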
'''
Pivot Spark DataFrame:
Spark SQL provides a pivot function to rotate data from one column into multiple columns. It is an
aggregation in which the values of one of the grouping columns are transposed into individual columns
with distinct data. To get the total amount exported to each country for each product, we group by
Product, pivot by Country, and sum Amount (see the sketch after the sample output below).

@author: kasho
'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.sql import Row

conf = BaseConfUtils()
sparkContxt = conf.createSparkContext("Pivoting Fun")
sqlContxt = conf.createSQLContext(sparkContxt)

rdd = sparkContxt.parallelize([("Banana", 1000, "USA"), ("Carrots", 1500, "USA"), ("Beans", 1600, "USA"),
                               ("Orange", 2000, "USA"), ("Orange", 2000, "USA"), ("Banana", 400, "China"),
                               ("Carrots", 1200, "China"), ("Beans", 1500, "China"), ("Orange", 4000, "China"),
                               ("Banana", 2000, "Canada"), ("Carrots", 2000, "Canada"), ("Beans", 2000, "Mexico")])
df_data = sqlContxt.createDataFrame(rdd, ["Product", "Amount", "Country"])
df_data.show()
'''
+-------+------+-------+
|Product|Amount|Country|
+-------+------+-------+
| Banana|  1000|    USA|
|Carrots|  1500|    USA|
'''
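# -----------------------------------------------------------------------------------------------------------
# The group-by / pivot / sum described in the docstring above, shown as a short illustrative sketch using the
# standard DataFrame API.
pivotDF = df_data.groupBy("Product").pivot("Country").sum("Amount")
pivotDF.show(truncate=False)   # one column per distinct Country holding the summed Amount
# -----------------------------------------------------------------------------------------------------------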
'''
Create a Spark program to generate a new RDD which contains the hosts which are accessed on BOTH days.
Save the resulting RDD to the "out/nasa_logs_same_hosts.csv" file.

Example output:
vagrant.vf.mmc.com
www-a1.proxy.aol.com
.....

Keep in mind that the original log files contain the following header line:
host    logname    time    method    url    response    bytes
Make sure the header lines are removed in the resulting RDD.
'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils

conf = BaseConfUtils()
sc = conf.createSparkContext("SameHostsIntersection")

if __name__ == "__main__":
    julyFirstLogs = sc.textFile(
        "D:/Study_Document/GIT/OneStopPySpark/resources/nasa_19950701.tsv")
    augustFirstLogs = sc.textFile(
        "D:/Study_Document/GIT/OneStopPySpark/resources/nasa_19950801.tsv")

    julyFirstHost = julyFirstLogs.map(lambda line: line.split("\t")[0])
    augustFirstHost = augustFirstLogs.map(lambda line: line.split("\t")[0])

    intersectionData = julyFirstHost.intersection(augustFirstHost)
    cleanHostInter = intersectionData.filter(lambda host: host != "host")   # drop the header's host field
    cleanHostInter.saveAsTextFile(
        "D:/Study_Document/GIT/OneStopPySpark/out/nasa_logs_same_hosts.csv")

    print("Execution Completed")
'''
Created on 12-Jun-2020

This file reads Twitter lines from the port opened in TweetRead.py; after reading, each line is stored
in the local file system (i.e. the Windows machine).

@author: kasho
'''
import time

from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.sql import Row

conf = BaseConfUtils()
sparkContxt = conf.createSparkContext("Twitter Streaming")
ssc = conf.createStreamingContext(sparkContxt)
sqlContxt = conf.createSQLContext(sparkContxt)


def processData(lines):
    # Split the batch into lines, drop empties, wrap each line in a Row and append it to a text file.
    words = lines.flatMap(lambda line: line.split("\n"))
    words = words.filter(filterEmptyLines)
    line = words.map(lambda p: Row(name=p))
    df = sqlContxt.createDataFrame(line)
    # output = line.collect()
    df.coalesce(1).write.format("text").mode("append").save(
        "D:/Study_Document/pycharm-workspace/PySparkPOC/resources/TwitterRead")


def filterEmptyLines(line):
    if len(line) > 0:
        return line
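# -----------------------------------------------------------------------------------------------------------
# A minimal sketch of wiring the socket stream to processData above; the host and port are assumptions
# (they depend on TweetRead.py, which is not shown here).
if __name__ == "__main__":
    tweets = ssc.socketTextStream("localhost", 5555)                      # port assumed
    tweets.foreachRDD(lambda rdd: None if rdd.isEmpty() else processData(rdd))

    ssc.start()
    ssc.awaitTermination()
# -----------------------------------------------------------------------------------------------------------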