Example #1
from pyspark.sql import SparkSession


def read_csv(spark: SparkSession, path: str, schema=None):
    """Load CSV files from the source directory into a DataFrame."""
    df = spark.read.csv(path, header=True, schema=schema)

    if not schema:
        return df

    # recreate the dataframe with the correct schema
    # this will ensure non-nullable columns do not contain null values
    return spark.createDataFrame(df.rdd, schema, verifySchema=True)
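# A minimal usage sketch (not part of the original snippet): the path and the
# column names below are assumptions, chosen only to show how an explicit
# schema with a non-nullable field triggers verification.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.getOrCreate()
orders_schema = StructType([
    StructField("order_id", IntegerType(), False),  # non-nullable
    StructField("customer", StringType(), True),
])
# Rows with a null order_id now fail verification instead of loading silently.
orders = read_csv(spark, "/data/orders/", schema=orders_schema)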
Example #2
from pyspark.sql import SparkSession
from pyspark.sql.types import DataType


def read_parquet(spark: SparkSession,
                 path: str,
                 schema: DataType = None,
                 merge_schema=True):
    """Read a directory of parquet files into a DataFrame."""

    # initially all the columns will be nullable
    df = spark.read.option("mergeSchema",
                           str(merge_schema).lower()).parquet(path)

    if not schema:
        # return the dataframe without validating the schema
        return df

    # recreate the dataframe with the correct schema
    # this will ensure non-nullable columns do not contain null values
    return spark.createDataFrame(df.rdd, schema, verifySchema=True)
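# A minimal usage sketch (the path is an assumption): with merge_schema=True
# the reader unions the columns of parquet files written with evolving
# schemas, which is what the mergeSchema option above controls.
spark = SparkSession.builder.getOrCreate()
events = read_parquet(spark, "/data/events/", merge_schema=True)
events.printSchema()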
Example #3
import pyspark
from pyspark.sql.types import StructType, StructField, StringType


def get_channel_mapping(spark: pyspark.sql.SparkSession) -> pyspark.sql.DataFrame:
    """
    Creates the channel mapping dataframe from the hard-coded values

    Parameters
    ----------
    spark : pyspark.sql.SparkSession
        Spark session used to initialize variables and get data from Hive

    Returns
    -------
    pyspark.sql.DataFrame
        PySpark dataframe with channel mapping data
    """

    channel_mapping = spark.createDataFrame(
        [
            ("01", "Distribution Channel 01"),
            ("10", "Other"),
            ("11", "DSD Bis Intercompany"),
            ("12", "DSD Pizza Intercomp"),
            ("20", """Warehouse/Exports"""),
            ("30", "Foodservice"),
            ("40", "DSD Pizza"),
            ("45", "DSD"),
            ("50", "KFI"),
            ("55", "Plant Ingredient"),
            ("60", "Imports"),
            ("65", "Bulk FS - Specialty"),
        ],
        StructType([
            StructField("bic_zdistr_ch", StringType(), True),
            StructField("channel_desc", StringType(), True),
        ]),
    )

    return channel_mapping
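# A minimal usage sketch (not from the original snippet): the "sales" rows and
# columns are assumptions, used only to show the mapping being joined in on
# the bic_zdistr_ch key.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
channel_mapping = get_channel_mapping(spark)
sales = spark.createDataFrame(
    [("01", 100.0), ("40", 250.0)], ["bic_zdistr_ch", "amount"])
# Enrich each sales row with its human-readable channel description.
sales.join(channel_mapping, on="bic_zdistr_ch", how="left").show()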
Example #4
# Scaffolding for the API

from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import PipelineModel
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df1 = spark.createDataFrame([
    (1353, 1347),
], ['user', 'summa'])

va = VectorAssembler(inputCols=['user', 'summa'], outputCol="features")

modelka = KMeansModel.load('./models/clusters.model')

result = modelka.transform(
    va.transform(df1)).select('prediction').take(1)[0][0]

# Check whether the user is new.
# If it is an existing user, pull the data from Mongo;
# otherwise spin up a mini-instance.
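# A minimal sketch of the branch described above (not part of the original
# snippet): the Mongo connection string and the "app"/"users" database and
# collection names are assumptions.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
users = client["app"]["users"]

existing = users.find_one({"user": 1353})
if existing is not None:
    # Existing user: reuse the profile already stored in Mongo.
    payload = existing
else:
    # New user: record the predicted cluster and spin up a mini-instance
    # (the instance-start call itself is left out here).
    payload = {"user": 1353, "cluster": result, "instance": "mini"}
    users.insert_one(payload)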
Example #5
"GDELT dataset found here: https://aws.amazon.com/public-datasets/gdelt/
# Column headers found here: http://gdeltproject.org/data/lookups/CSV.header.dailyupdates.txt

# Load RDD
from pyspark.sql import SparkSession
from pyspark import SparkContext ,SparkConf
#spark = SparkSession.builder.appName("gdelt").getOrCreate()
conf = SparkConf().setAppName("gdelt")#.setMaster(master)
sc = SparkContext(conf=conf)
lines = sc.textFile("s3a://gdelt-open-data/events/2018*") # Loads 73,385,698 records from 2016
# Split lines into columns; change split() argument depending on deliminiter e.g. '\t'
parts = lines.map(lambda l: l.split('\t'))
# Convert RDD into DataFrame
from urllib import urlopen
html = urlopen("http://gdeltproject.org/data/lookups/CSV.header.dailyupdates.txt").read().rstrip()
columns = html.split('\t')
df = sc.createDataFrame(parts, columns)
df.printSchema
sc.stop()
Example #6

from datetime import datetime as dt

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("appName").master("local").getOrCreate()

df = spark.createDataFrame([{
    'a': 10,
    'date': dt(2017, 1, 1)
}, {
    'a': 23,
    'date': dt(2018, 1, 1, 1)
}])

df.show()
Example #7
# pyspark.ml's DataFrame-based Word2Vec is used so that transform() yields
# (text, vector) rows that the loop below can unpack.
from pyspark.ml.feature import Word2Vec
from pyspark.sql import SparkSession
import nltk
import os
stop_words = nltk.corpus.stopwords.words('english')
stop_words += ['?', '.', '!', ',']

spark = SparkSession.builder.master("local").appName("Word2Vec").config(
    "spark.app.id", "Word2Vec").getOrCreate()
# Tokenise each tweet and wrap it in a single-column row.
tweets = open(os.getcwd() + "/Trump.txt").read().splitlines()
documentDF = spark.createDataFrame([(t.split(" "),) for t in tweets], ["text"])
word2vec = Word2Vec(inputCol="text", outputCol="vector")
model = word2vec.fit(documentDF)
result = model.transform(documentDF)
for row in result.collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))
Example #8
agents
agents.count()

# Filter French agents
fr_agents = agents.filter(agents.country_name == "France")
fr_agents
fr_agents.count()
agent = fr_agents.first()
agent
print(agent.country_name, agent.id)

# Several operations
agents.filter(agents.country_name == "France").filter(
    agents.latitude < 0).count()
agents.filter((agents.country_name == "France")
              & (agents.latitude < 0)).count()
agents.limit(5).show()

# Create a view (Spark-SQL)
agents.createTempView("agents_table")  # or createOrReplaceTempView()
spark.sql("SELECT * FROM agents_table ORDER BY id DESC LIMIT 10").show()

# Cache the DataFrame in memory and convert it to an RDD
agents.persist()
agents.rdd.filter(lambda row: row.country_name == "France").count()

# Convert an RDD of Rows into a DataFrame
from pyspark.sql import Row
rdd = sc.parallelize([Row(name="Alice"), Row(name="Bob")])
spark.createDataFrame(rdd)
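# Equivalent DataFrame-API form of the SQL query above (same result as the
# agents_table query, without going through the temporary view):
agents.orderBy(agents.id.desc()).limit(10).show()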
Example #9
# The department1..4 and employee1..4 Rows are defined in an earlier part of
# the original example; employees carry firstName, lastName, email and salary.
from pyspark.sql import Row

departmentWithEmployees1 = Row(department=department1,
                               employees=[employee1, employee2])
departmentWithEmployees2 = Row(department=department2,
                               employees=[employee3, employee4])
departmentWithEmployees3 = Row(department=department3,
                               employees=[employee1, employee4])
departmentWithEmployees4 = Row(department=department4,
                               employees=[employee2, employee3])

print(department1)
print(departmentWithEmployees1.employees[0].email)

departmentsWithEmployeesSeq1 = [
    departmentWithEmployees1, departmentWithEmployees2
]
df1 = spark.createDataFrame(departmentsWithEmployeesSeq1)

departmentsWithEmployeesSeq2 = [
    departmentWithEmployees3, departmentWithEmployees4
]
df2 = spark.createDataFrame(departmentsWithEmployeesSeq2)

unionDF = df1.unionAll(df2)

#dbutils.fs.rm("/tmp/databricks-df-example.parquet", True)
unionDF.write.parquet("databricks-df-example.parquet")

# Explode the nested employees array so each employee becomes its own row,
# then flatten the struct fields.
from pyspark.sql.functions import explode

explodeDF = unionDF.select(explode("employees").alias("e"))
flattenDF = explodeDF.selectExpr("e.firstName", "e.lastName", "e.email",
                                 "e.salary")

flattenDF.show()
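# Reading the parquet written above back in (a quick check, not part of the
# original example):
parquetDF = spark.read.parquet("databricks-df-example.parquet")
parquetDF.count()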
Example #10
#    Spark
from pyspark import SparkContext
#    Spark Streaming
from pyspark.streaming import StreamingContext
#    Kafka
from pyspark.streaming.kafka import KafkaUtils
#    json parsing
import json
from pyspark.sql.types import StringType
from pyspark.sql import SparkSession

sc = SparkContext(appName="spark1")
sc.setLogLevel("WARN")
# createDataFrame lives on SparkSession, not SparkContext
spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(["10", "11", "13"], "string").toDF("age")

df2 = spark.createDataFrame(["10", "11", "13"], StringType()).toDF("age")
rolledUpDF.where("Country IS NULL").show()
rolledUpDF.where("Date IS NULL").show()

#cubes
dfNoNull.cube("Date", "Country").agg(_sum(col("Quantity")))\
  .select("Date", "Country", "sum(Quantity)").orderBy("Date").show()
###################
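# A minimal, self-contained sketch of how a rolledUpDF like the one referenced
# above could be built; the sample rows and the Date/Country/Quantity columns
# are assumptions based on the code above.
retail = spark.createDataFrame(
    [("2011-12-01", "France", 10), ("2011-12-01", "Germany", 5)],
    ["Date", "Country", "Quantity"])
dfNoNullSketch = retail.na.drop("all")
rolledUpSketch = dfNoNullSketch.rollup("Date", "Country")\
    .agg(_sum("Quantity").alias("total_quantity"))\
    .orderBy("Date")
rolledUpSketch.show()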



## rdd and dataframes
a = spark.range(10).rdd
b = spark.range(10).toDF("id").rdd.map(lambda row: row[0])
a.take(4)

sqldf = spark.createDataFrame(ratings, "string")  # ratings is the RDD built at the end of this snippet

## using spark dataframes
df = spark.read.csv('KCLT.csv', header=True)
df1 = spark.read.load('KCLT.csv', format='com.databricks.spark.csv', header='true', inferSchema='true')
type(df)
df.describe().show()
df.dtypes
df.count()
# change the data type of the date column
df = df.withColumn('date',df.date.cast('timestamp'))

## using spark rdd
# sc = SparkContext('local', 'example')  # the SparkContext was already created above
rdd = sc.textFile('KCLT.csv').map(lambda line: line.split(",")[1])
ratings = sc.textFile('../Python_Projects/ml-100k/u.data').map(lambda line: line.split()[2])