Code Example #1
    a. Tokenizer
    b. StopWordsRemover
    c. n-gram
    d. Binarizer
    e. PCA
    f. PolynomialExpansion
    g. StringIndexer
@author: kasho
'''

from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

conf = BaseConfUtils()
spark = conf.createSparkSession("Examples of All Feature Transformers")

###############################################################################################################
# Tokenizer :
#   Tokenization is the process of taking text (such as a sentence) and breaking it into individual terms
#   (usually words). A simple Tokenizer class provides this functionality.
#   RegexTokenizer allows more advanced tokenization based on regular expression (regex) matching. By default,
#   the parameter “pattern” (regex, default: "\\s+") is used as delimiters to split the input text.
#   Alternatively, users can set parameter “gaps” to false indicating the regex “pattern” denotes “tokens”
#   rather than splitting gaps, and find all matching occurrences as the tokenization result.
#
###############################################################################################################
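
# A hedged, runnable sketch of the two tokenizers described above (illustrative
# sentences only; the file's own example DataFrame follows below):
demoDF = spark.createDataFrame([
    (0, "Hi I heard about Spark"),
    (1, "Logistic,regression,models,are,neat")
], ["id", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
countTokens = udf(lambda words: len(words), IntegerType())

tokenizer.transform(demoDF).select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
regexTokenizer.transform(demoDF).select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)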

sentenceDataFrame = spark.createDataFrame([\
                    (0, "Hi I heard about Spark"),\
Code Example #2
'''
Created on 16-Jun-2020

This file will monitor a folder and read newly generated files using PySpark Streaming.

@author: kasho
'''


from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils

conf = BaseConfUtils()
sparkContxt = conf.createSparkContext("Streaming Local File System")
scc = conf.createStreamingContext(sparkContxt)


if __name__ == "__main__":
    lines = scc.textFileStream("D:/Study_Document/GIT/OneStopPySpark/temp/")  # directory monitored for newly created files
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda x: (x, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()
    scc.start()
    scc.awaitTermination()
Code Example #3
'''
Created on 09-Jun-2020

Basic operations on DataFrames

@author: kasho
'''

from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils

conf = BaseConfUtils()
sparkSess = conf.createSparkSession("DataFrameBasics")

df = sparkSess.read.json(
    "D:/Study_Document/GIT/OneStopPySpark/resources/people.json")
df.show()
'''
+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

'''

df.printSchema()
'''
root
 |-- age: long (nullable = true)
Code Example #4
    Create a Spark program to read the airport data from in/airports.text, find all the airports whose latitude is greater than 40.
    Then output the airport's name and the airport's latitude to out/airports_by_latitude.text.

    Each row of the input file contains the following columns:
        Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code, ICAO Code,
        Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format

    Sample output:
    "St Anthony", 51.391944
    "Tofino", 49.082222

'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from com.pyspark.poc.utils.CommanUtils import CommanUtils

conf = BaseConfUtils()
sc = conf.createSparkContext("AirportsLatitude")


def splitByComma(line: str):
    splits = CommanUtils.COMMA_DELIMITER.split(line)
    return "{}, {}".format(splits[1], splits[6])


if __name__ == "__main__":
    airportsLatitude = sc.textFile(
        "D:/Study_Document/GIT/OneStopPySpark/resources/airports.txt")
    filteredAirports = airportsLatitude.filter(
        lambda lines: float(CommanUtils.COMMA_DELIMITER.split(lines)[6]) > 40)
    airportsNameAndLatitude = filteredAirports.map(splitByComma)
    airportsNameAndLatitude.saveAsTextFile(
Code Example #5
        9) Outcome: Class variable (0 or 1)

@author: kasho
'''

from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
import numpy as np
from pyspark.sql.functions import when
from pyspark.ml.feature import Imputer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

conf = BaseConfUtils()
sparkSess = conf.createSparkSession("Diabates data Classification")

raw_data = sparkSess.read.csv(
    'D:/Study_Document/GIT/OneStopPySpark/resources/diabetes.csv',
    inferSchema=True,
    header=True)
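
# A hedged sketch of how the imported stages might be wired together (an assumption,
# not necessarily this file's original continuation; in practice it would follow the
# data inspection below): treat zero readings as missing, impute them, assemble and
# scale the features, then fit a logistic regression on the Outcome label.
zero_as_missing = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
cleaned = raw_data
for c in zero_as_missing:
    # a zero reading is treated as a missing value here
    cleaned = cleaned.withColumn(c, when(cleaned[c] == 0, np.nan).otherwise(cleaned[c]))
imputed = ["imp_" + c for c in zero_as_missing]
cleaned = Imputer(inputCols=zero_as_missing, outputCols=imputed).fit(cleaned).transform(cleaned)
feature_cols = imputed + ["Pregnancies", "DiabetesPedigreeFunction", "Age"]
assembled = VectorAssembler(inputCols=feature_cols, outputCol="raw_features").transform(cleaned)
scaled = StandardScaler(inputCol="raw_features", outputCol="features").fit(assembled).transform(assembled)
lrModel = LogisticRegression(featuresCol="features", labelCol="Outcome").fit(scaled)
print("Training AUC: {}".format(
    BinaryClassificationEvaluator(labelCol="Outcome").evaluate(lrModel.transform(scaled))))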

print(raw_data.columns)

raw_data.show(5, truncate=False)
'''
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI |DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|6          |148    |72           |35           |0      |33.6|0.627                   |50 |1      |
Code Example #6
    It is an important tool for doing statistics. Most databases support window functions.
    Spark has supported window functions since version 1.4. Spark window functions have the following traits:
        a. they perform a calculation over a group of rows, called the frame
        b. the frame corresponds to the current row
        c. they return a new value for each row via an aggregate/window function
        d. they can be used through SQL grammar or the DataFrame API
    (An illustrative sketch follows the context setup below.)

@author: kasho
'''

from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.sql.functions import *

conf = BaseConfUtils()
sparkContxt = conf.createSparkContext("Windows Fun")
sqlContxt = conf.createSQLContext(sparkContxt)
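
# A hedged, illustrative sketch of a window function (tiny inline dataset; not the
# original file's code): rank salaries within each department over a Window frame.
demoDF = sqlContxt.createDataFrame(
    [("sales", "Alice", 5000), ("sales", "Ella", 4800), ("personnel", "Olivia", 3900)],
    ["depName", "empName", "salary"])
winSpec = Window.partitionBy("depName").orderBy(desc("salary"))
demoDF.withColumn("salary_rank", rank().over(winSpec)) \
    .withColumn("dep_avg", avg("salary").over(Window.partitionBy("depName"))) \
    .show()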


@udf("long")
def median_udf(s):
    # approximate median: return the middle element of the (assumed ordered) list
    index = int(len(s) / 2)
    return s[index]


if __name__ == "__main__":
    empsalary_data = [("sales", 1, "Alice", 5000, ["game", "ski"]),
                      ("personnel", 2, "Olivia", 3900, ["game", "ski"]),
                      ("sales", 3, "Ella", 4800, ["skate", "ski"]),
                      ("sales", 4, "Ebba", 4800, ["game", "ski"]),
Code Example #7
'''
Created on 23-Apr-2020

@author: kasho
'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils

conf = BaseConfUtils()
'''
Fetch records using SQLContext.
'''
sparkSess = conf.createSparkSession("Testing")

if __name__ == '__main__':
    # Fetching records using SQLContext
    # source_df = sqlContext.read.format("jdbc").option("url","jdbc:mysql://localhost:3306/sakila").option("driver", "com.mysql.jdbc.Driver").option("dbtable", "actor").option("user", "root").option("password", "ayyappasai").load()
    # source_df.show()

    sourceDF = sparkSess.read \
        .format("jdbc").option("url", "jdbc:mysql://localhost:3306/ONESTOP_SPARK_DB") \
        .option("driver", "com.mysql.jdbc.Driver").option("dbtable", "Student_Marks") \
        .option("user", "root").option("password", "ayyappasai").load()
    sourceDF.show()
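
    # A hedged, equivalent form using DataFrameReader.jdbc() with the same connection
    # details (an illustrative alternative, not part of the original file):
    altDF = sparkSess.read.jdbc(
        url="jdbc:mysql://localhost:3306/ONESTOP_SPARK_DB",
        table="Student_Marks",
        properties={"user": "root",
                    "password": "ayyappasai",
                    "driver": "com.mysql.jdbc.Driver"})
    altDF.show()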

Code Example #8
'''
Created on 18-July-2020
Requirement:
    Read a JSON file and list the count of hashtag (#) words along with the user name of each.

@author: Ashok Kumar
'''

from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.sql.functions import explode

conf = BaseConfUtils()
spark = conf.createSparkSession("HashTagCount")

def iteration(user):
    print(user)

def fetchRowData(row):
    user_name=row["USER_NAME"]
    text=row["TEXT"]
    words=text.split(" ")
    return (user_name,words)

def filterMyHashTag(row):
    l = list()
    key=row[0]
    for word in row[1]:
        if "#" in word:
            l.append(word)

    rowval=(key,l)
Code Example #9
'''
Created on 25-Apr-2020

Write data to MySQL DB

@author: kasho
'''
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.sql.types import Row, StructType, StructField, IntegerType, \
    DoubleType, StringType, FloatType
from pyspark.sql.session import SparkSession

conf = BaseConfUtils()
spark = conf.createSparkSession("Write to MySQL")


def createRow(row):
    eachRow = row.split(",")
    row = Row(eachRow[0], eachRow[1], eachRow[2],
              eachRow[3], eachRow[6], eachRow[7],
              eachRow[8], eachRow[9], eachRow[10],
              eachRow[12], eachRow[13], eachRow[14])

    # row = Row(id=eachRow[0], name=eachRow[1], nationality=eachRow[2], \
    #           city=eachRow[3], gender=eachRow[6], age=eachRow[7], \
    #           english_grade=eachRow[8], math_grade=eachRow[9], sciences_grade=eachRow[10], \
    #           portfolio_rating=eachRow[12], coverletter_rating=eachRow[13], refletter_rating=eachRow[14])
    print(row)
    return row
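
# A hedged sketch of the write path (the CSV path and target table below are
# illustrative assumptions; the column names come from the commented Row(...) call
# above, and the JDBC connection mirrors the repo's other MySQL example):
studentColumns = ["id", "name", "nationality", "city", "gender", "age",
                  "english_grade", "math_grade", "sciences_grade",
                  "portfolio_rating", "coverletter_rating", "refletter_rating"]
studentsRdd = spark.sparkContext.textFile(
    "D:/Study_Document/GIT/OneStopPySpark/resources/student-dataset.csv")
studentsDF = spark.createDataFrame(studentsRdd.map(createRow), studentColumns)
studentsDF.write.format("jdbc") \
    .option("url", "jdbc:mysql://localhost:3306/ONESTOP_SPARK_DB") \
    .option("driver", "com.mysql.jdbc.Driver") \
    .option("dbtable", "Student_Details") \
    .option("user", "root").option("password", "ayyappasai") \
    .mode("append").save()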

Code Example #10
'''
    Create a Spark program to read the first 100 prime numbers from in/prime_nums.text
    and print the sum of those numbers to the console.
    Each row of the input file contains 10 prime numbers separated by spaces.

@author: Ashok Kumar
'''

from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils

conf = BaseConfUtils()
sc = conf.createSparkContext("SumOfNumbersSolution")

if __name__ == "__main__":
    primeNumbersFile = sc.textFile(
        "D:/Study_Document/GIT/OneStopPySpark/resources/prime_nums.text")
    onlyNumber = primeNumbersFile.flatMap(lambda line: line.split())  # split on any whitespace (spaces or tabs)

    validNumbers = onlyNumber.filter(lambda number: number)

    intNumbers = validNumbers.map(lambda number: int(number))

    print("Sum is: {}".format(intNumbers.reduce(lambda x, y: x + y)))

    print("=================== Execution Completed ===================")
Code Example #11
'''
Created on 12-Jun-2020

This file will connect with Twitter to read tweets

@author: kasho
'''
import time

from pyspark.sql import Row

from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.conf import SparkConf

conf = BaseConfUtils()
sc = conf.createSparkContext("PySpark Word Count Exmaple")
sqlContxt = conf.createSQLContext(sc)

def filterEmptyLines(line):
    if len(line) > 0:
        return  line


if __name__ == "__main__":


    words = sc.textFile("D:/Study_Document/pycharm-workspace/PySparkPOC/resources/wordCount.txt").flatMap(
        lambda line: line.split("\n"))
    words = words.filter(filterEmptyLines)
    line = words.map(lambda p: Row(name=p))
    df = sqlContxt.createDataFrame(line)
Code Example #12
    Below are the feature extraction methods supported in PySpark:
    a. TF-IDF
    b. Word2Vec
    c. CountVectorizer
    d. FeatureHasher
@author: kasho
'''

from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import FeatureHasher

conf = BaseConfUtils()
sparkSess = conf.createSparkSession("Examples of All Feature Extractions")

###############################################################################################################
# TF-IDF :
#   Term frequency-inverse document frequency (TF-IDF) is a feature vectorization method widely used in
#   text mining to reflect the importance of a term to a document in the corpus. Denote a term by t,
#   a document by d, and the corpus by D.
#
#   In MLlib, we separate TF and IDF to make them flexible:
#   a.  TF: Both HashingTF and CountVectorizer can be used to generate the term frequency vectors.
#           HashingTF is a Transformer which takes sets of terms and converts those sets into fixed-length
#           feature vectors. In text processing, a “set of terms” might be a bag of words. HashingTF utilizes
#           the hashing trick.
#
#           CountVectorizer converts text documents to vectors of term counts. Refer to CountVectorizer
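
# A minimal, runnable TF-IDF sketch of the flow described above (illustrative
# sentences; HashingTF produces the term-frequency vectors, then IDF rescales them):
sentenceData = sparkSess.createDataFrame([
    (0.0, "Hi I heard about Spark"),
    (0.0, "I wish Java could use case classes"),
    (1.0, "Logistic regression models are neat")
], ["label", "sentence"])

wordsData = Tokenizer(inputCol="sentence", outputCol="words").transform(sentenceData)
featurizedData = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20).transform(wordsData)
idfModel = IDF(inputCol="rawFeatures", outputCol="features").fit(featurizedData)
idfModel.transform(featurizedData).select("label", "features").show(truncate=False)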
Code Example #13
        Account_Manager: Binary 0=No manager, 1= Account manager assigned
        Years: Total years as a customer
        Num_sites: Number of websites that use the service.
        Onboard_date: Date that the name of the latest contact was onboarded
        Location: Client HQ Address
        Company: Name of Client Company

@author: kasho
'''

from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

conf = BaseConfUtils()
sparkSess = conf.createSparkSession("customer churn Model")

if __name__ == "__main__":
    data = sparkSess.read.csv(
        'D:/Study_Document/GIT/OneStopPySpark/resources/customer_churn.csv',
        inferSchema=True,
        header=True)
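
    # A hedged sketch of the downstream model (an assumed wiring of the imported stages,
    # not necessarily this file's original continuation; in practice it would follow the
    # schema/data inspection below): assemble the numeric columns, fit a logistic
    # regression on the Churn label, and report the test AUC.
    assembler = VectorAssembler(
        inputCols=["Age", "Total_Purchase", "Account_Manager", "Years", "Num_Sites"],
        outputCol="features")
    assembled = assembler.transform(data).select("features", "Churn")
    train, test = assembled.randomSplit([0.7, 0.3], seed=42)
    lrModel = LogisticRegression(featuresCol="features", labelCol="Churn").fit(train)
    evaluator = BinaryClassificationEvaluator(labelCol="Churn")
    print("Test AUC: {}".format(evaluator.evaluate(lrModel.transform(test))))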
    data.printSchema()
    data.show(5)
    '''
    +----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
    |           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
    +----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
    |Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
    |   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
Code Example #14
    Create a Spark program to read the airport data from in/airports.text, find all the airports which are located in the United States
    and output the airport's name and the city's name to out/airports_in_usa.text. Each row of the input file contains the following columns:
    Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code, ICAO Code, Latitude,
    Longitude, Altitude, Timezone, DST, Timezone in Olson format

    Sample output:
    "Putnam County Airport", "Greencastle"
    "Dowagiac Municipal Airport", "Dowagiac"

@author: Ashok Kumar
'''

from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from com.pyspark.poc.utils.CommanUtils import CommanUtils

conf = BaseConfUtils()
sc = conf.createSparkContext("AirportList")


def splitByComma(line: str):
    splits = CommanUtils.COMMA_DELIMITER.split(line)
    return "{}, {}".format(splits[1], splits[2])


if __name__ == "__main__":
    airports = sc.textFile(
        "D:/Study_Document/GIT/OneStopPySpark/resources/airports.txt")
    airportsInUSA = airports.filter(lambda lines: CommanUtils.COMMA_DELIMITER.
                                    split(lines)[3] == "\"United States\"")
    airportsNameAndCityNames = airportsInUSA.map(splitByComma)
    airportsNameAndCityNames.saveAsTextFile(
Code Example #15
Created on 05-Sept-2020

    "in/nasa_19950701.tsv" file contains 10000 log lines from one of NASA's apache server for July 1st, 1995.
    "in/nasa_19950801.tsv" file contains 10000 log lines for August 1st, 1995

    Create a Spark program to generate a new RDD which contains the log lines from both July 1st and August 1st,
    take a 0.1 sample of those log lines and save it to the "out/sample_nasa_logs.tsv" file. Keep in mind that the original
    log files contain the following header line:
    host    logname    time    method    url    response    bytes
    Make sure the header lines are removed in the resulting RDD.

'''

from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils

conf = BaseConfUtils()
sc = conf.createSparkContext("UnionSampleLog")


def isNotHeader(line: str):
    return not (line.startswith("host") and "bytes" in line)


if __name__ == "__main__":
    julyFirstLogs = sc.textFile(
        "D:/Study_Document/GIT/OneStopPySpark/resources/nasa_19950701.tsv")
    augustFirstLogs = sc.textFile(
        "D:/Study_Document/GIT/OneStopPySpark/resources/nasa_19950801.tsv")

    aggregatedLogs = julyFirstLogs.union(augustFirstLogs)
    cleanLogHeader = aggregatedLogs.filter(isNotHeader)
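
    # A hedged sketch of the remaining steps from the requirement above (assumed
    # continuation; the output path follows the repo's usual "out/" pattern):
    sampledLogs = cleanLogHeader.sample(withReplacement=False, fraction=0.1)
    sampledLogs.saveAsTextFile(
        "D:/Study_Document/GIT/OneStopPySpark/out/sample_nasa_logs.tsv")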
Code Example #16
Pivot Spark DataFrame:
    Spark SQL provides a pivot function to rotate data from
        one column into multiple columns. It is an aggregation where
        the values of one of the grouping columns are transposed into
        individual columns with distinct data. To get the total amount
        exported to each country for each product, we group by Product,
        pivot by Country, and sum Amount (illustrated right after
        df_data is built below).

@author: kasho
'''

from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.sql import Row

conf = BaseConfUtils()
sparkContxt = conf.createSparkContext("Pivoting Fun")
sqlContxt = conf.createSQLContext(sparkContxt)
rdd = sparkContxt.parallelize([("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"),
      ("Orange",2000,"USA"),("Orange",2000,"USA"),("Banana",400,"China"),
      ("Carrots",1200,"China"),("Beans",1500,"China"),("Orange",4000,"China"),
      ("Banana",2000,"Canada"),("Carrots",2000,"Canada"),("Beans",2000,"Mexico")])
df_data = sqlContxt.createDataFrame(rdd, ["Product","Amount","Country"])
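
# A hedged illustration of the pivot described above, in the standard
# groupBy().pivot().sum() form (not necessarily the original file's exact code):
# pivotDF has one row per Product and one summed-Amount column per Country;
# call pivotDF.show() to display it.
pivotDF = df_data.groupBy("Product").pivot("Country").sum("Amount")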
df_data.show()

'''
+-------+------+-------+
|Product|Amount|Country|
+-------+------+-------+
| Banana|  1000|    USA|
|Carrots|  1500|    USA|
Code Example #17
    Create a Spark program to generate a new RDD which contains the hosts that were accessed on BOTH days.
    Save the resulting RDD to "out/nasa_logs_same_hosts.csv" file.

    Example output:
    vagrant.vf.mmc.com
    www-a1.proxy.aol.com
    .....
    Keep in mind that the original log files contain the following header line:
    host    logname    time    method    url    response    bytes
    Make sure the header lines are removed in the resulting RDD.

'''

from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils


conf = BaseConfUtils()
sc = conf.createSparkContext("SameHostsIntersection")

if __name__ == "__main__":
    julyFirstLogs = sc.textFile("D:/Study_Document/GIT/OneStopPySpark/resources/nasa_19950701.tsv")
    augustFirstLogs = sc.textFile("D:/Study_Document/GIT/OneStopPySpark/resources/nasa_19950801.tsv")

    julyFirstHost = julyFirstLogs.map(lambda line: line.split("\t")[0])
    augustFirstHost = augustFirstLogs.map(lambda line: line.split("\t")[0])

    intersectionData = julyFirstHost.intersection(augustFirstHost)
    cleanHostInter = intersectionData.filter(lambda host: host != "host")
    cleanHostInter.saveAsTextFile("D:/Study_Document/GIT/OneStopPySpark/out/nasa_logs_same_hosts.csv")

    print("Execution Completed")
Code Example #18
'''
Created on 12-Jun-2020

This file will read tweet lines from the socket port opened in TweetRead.py;
    after reading, each line is stored on the local (Windows) file system.

@author: kasho
'''
import time
from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.sql import Row

conf = BaseConfUtils()
sparkContxt = conf.createSparkContext("Twitter Streaming")
ssc = conf.createStreamingContext(sparkContxt)
sqlContxt = conf.createSQLContext(sparkContxt)


def processData(lines):
    words = lines.flatMap(lambda line: line.split("\n"))

    words = words.filter(filterEmptyLines)
    line = words.map(lambda p: Row(name=p))
    df = sqlContxt.createDataFrame(line)
    # output = line.collect()
    df.coalesce(1).write.format("text").mode("append").save(
        "D:/Study_Document/pycharm-workspace/PySparkPOC/resources/TwitterRead")


def filterEmptyLines(line):
    if len(line) > 0: