Example #1
def _get_schema(header, schema):
    if schema is None or len(schema) == 0:
        # Use header to generate schema
        if header is None or len(header) == 0:
            return None
        elif len(header) > 4:
            warnings.warn(WARNING_MOVIE_LENS_HEADER)
            header = header[:4]

        schema = StructType()
        try:
            schema.add(StructField(header[0], IntegerType())).add(
                StructField(header[1], IntegerType())
            ).add(StructField(header[2], FloatType())).add(
                StructField(header[3], LongType())
            )
        except IndexError:
            pass
    else:
        if header is not None:
            warnings.warn(WARNING_HAVE_SCHEMA_AND_HEADER)

        if len(schema) > 4:
            warnings.warn(WARNING_MOVIE_LENS_HEADER)
            schema = schema[:4]

    return schema
Example #2
def get_spark_schema(header=DEFAULT_HEADER):
    ## create schema
    schema = StructType()
    ## do label + ints
    n_ints = 14
    for i in range(n_ints):
        schema.add(StructField(header[i], IntegerType()))
    ## do categoricals
    for i in range(26):
        schema.add(StructField(header[i + n_ints], StringType()))
    return schema
Example #3
def test_rmse():
    # TODO: revise so that it takes the user's inputs instead of hardcoded values

    movies_schema = None
    ratings_schema = None

    # load the schemas
    with open("movielens_20m_movies_schema.json", "r") as json_schema_file:
        movies_schema = StructType.fromJson(json.load(json_schema_file))

    with open("movielens_20m_ratings_schema.json", "r") as json_schema_file:
        ratings_schema = StructType.fromJson(json.load(json_schema_file))

    # create a hdfs directory
    os.system("hdfs dfs -mkdir datasets")

    # load the json file into the hdfs directory
    os.system("hdfs dfs -put movielens_10m_ratings.json.gz datasets/movielens_10m_ratings.json.gz")

    # create a DataFrame based on the content of the json file
    ratingsDF = scsingleton.sqlCtx.read.json("hdfs://localhost:9000/datasets/movielens_10m_ratings.json.gz", schema=ratings_schema)
    # explicitly repartition RDD after loading so that more tasks can run on it in parallel
    # by default, defaultMinPartitions == defaultParallelism == estimated # of cores across all of the machines in your cluster
    ratingsDF = ratingsDF.repartition(scsingleton.sc.defaultParallelism * 3)    

    # parse ratings DataFrame into an RDD of [(userId, itemId, rating)]
    ratingsRDD = ratingsDF.map(lambda row: (row.user_id, row.movie_id, row.rating))
    ratingsRDD.cache()

    # split data into train (60%), test (40%)
    # TODO: add validation in the future? train (60%), validation (20%), test(20%)?
    trainingRDD, testRDD = ratingsRDD.randomSplit([0.6, 0.4])
    trainingRDD.cache()
    testRDD.cache()

    # run training algorithm to build the model
    # without validation
    with Timer() as t:
        model = ALS.train(trainingRDD, rank=3)
    print "ALS.train(trainingRDD, rank=3): %s seconds" % t.secs

    # make a prediction
    with Timer() as t:
        testPredRDD = model.predictAll( testRDD.map( lambda x: (x[0], x[1]) ) ).cache()
    print "testPredRDD: %s seconds" % t.secs

    # calculate RMSE
    with Timer() as t:
        testRmse = pm.calculate_rmse_using_rdd(testRDD, testPredRDD)
    print "testRmse: %s seconds" % t.secs
    print "testRmse", testRmse

    return
Example #4
 def __init__(self, predictionAndObservations):
     sc = predictionAndObservations.ctx
     sql_ctx = SQLContext.getOrCreate(sc)
     numCol = len(predictionAndObservations.first())
     schema = StructType([
         StructField("prediction", DoubleType(), nullable=False),
         StructField("observation", DoubleType(), nullable=False)])
     if numCol == 3:
         schema.add("weight", DoubleType(), False)
     df = sql_ctx.createDataFrame(predictionAndObservations, schema=schema)
     java_class = sc._jvm.org.apache.spark.mllib.evaluation.RegressionMetrics
     java_model = java_class(df._jdf)
     super(RegressionMetrics, self).__init__(java_model)
Example #5
 def __init__(self, predAndLabelsWithOptWeight):
     sc = predAndLabelsWithOptWeight.ctx
     sql_ctx = SQLContext.getOrCreate(sc)
     numCol = len(predAndLabelsWithOptWeight.first())
     schema = StructType([
         StructField("prediction", DoubleType(), nullable=False),
         StructField("label", DoubleType(), nullable=False)])
     if numCol == 3:
         schema.add("weight", DoubleType(), False)
     df = sql_ctx.createDataFrame(predAndLabelsWithOptWeight, schema)
     java_class = sc._jvm.org.apache.spark.mllib.evaluation.MulticlassMetrics
     java_model = java_class(df._jdf)
     super(MulticlassMetrics, self).__init__(java_model)
Example #6
 def __init__(self, scoreAndLabels):
     sc = scoreAndLabels.ctx
     sql_ctx = SQLContext.getOrCreate(sc)
     numCol = len(scoreAndLabels.first())
     schema = StructType([
         StructField("score", DoubleType(), nullable=False),
         StructField("label", DoubleType(), nullable=False)])
     if numCol == 3:
         schema.add("weight", DoubleType(), False)
     df = sql_ctx.createDataFrame(scoreAndLabels, schema=schema)
     java_class = sc._jvm.org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
     java_model = java_class(df._jdf)
     super(BinaryClassificationMetrics, self).__init__(java_model)
Example #7
def get_twitter_schema(json_file_name):
    # Load the JSON schema definition and build a StructType from it
    with open(json_file_name) as json_schema_file:
        schema_dict = json.load(json_schema_file)
    schema_struct = StructType.fromJson(schema_dict)
    return schema_struct
Example #8
            "<slide duration> must be less than or equal to <window duration>",
            file=sys.stderr)
    windowDuration = '{} seconds'.format(windowSize)
    slideDuration = '{} seconds'.format(slideSize)
    monitoring_dir = sys.argv[1]

    spark = SparkSession\
        .builder\
        .appName("InteractionCount")\
        .config("spark.eventLog.enabled","true")\
        .config("spark.eventLog.dir","hdfs://10.254.0.33:8020/user/ubuntu/applicationHistory")\
        .master("spark://10.254.0.33:7077")\
        .getOrCreate()

    userSchema = StructType()\
        .add("userA", "string")\
        .add("userB", "string")\
        .add("timestamp", "timestamp")\
        .add("interaction", "string")
    csvDF = spark\
        .readStream\
        .schema(userSchema)\
        .csv(monitoring_dir)

    interactions = csvDF.select(csvDF['interaction'], csvDF['timestamp'])

    windowedCounts = interactions.groupBy(
        window(interactions.timestamp, windowDuration, slideDuration),
        interactions.interaction).count()

    query = windowedCounts\
        .writeStream\
        .outputMode('complete')\
Example #9
def remove_duplicate_encounters(ds,
                                owner_name='user',
                                transmitter_name='participant_identifier',
                                start_time_name='start_time',
                                end_time_name='end_time',
                                centroid_id_name='centroid_id',
                                distance_threshold=12):

    schema = StructType([
        StructField('timestamp', TimestampType()),
        StructField('localtime', TimestampType()),
        StructField('start_time', DoubleType()),
        StructField('end_time', DoubleType()),
        StructField('user', StringType()),
        StructField('participant_identifier', StringType()),
        StructField('version', IntegerType()),
        StructField('os', StringType()),
        StructField('latitude', DoubleType()),
        StructField('distances', ArrayType(DoubleType())),
        StructField('longitude', DoubleType()),
        StructField('average_count', DoubleType()),
        StructField('centroid_longitude', DoubleType()),
        StructField('centroid_latitude', DoubleType()),
        StructField('centroid_id', IntegerType()),
        StructField('centroid_area', DoubleType()),
        StructField('distance_mean', DoubleType()),
        StructField('distance_std', DoubleType()),
        StructField('distance_count', DoubleType())
    ])

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def remove_duplicates(data):
        data['distance_mean'] = data['distances'].apply(lambda a: np.mean(a))
        data['distance_std'] = data['distances'].apply(lambda a: np.std(a))
        data['distance_count'] = data['distances'].apply(
            lambda a: len(np.array(a)[np.array(a) < distance_threshold]))
        if data.shape[0] < 2:
            return data
        if len(
                np.intersect1d(data[owner_name].values,
                               data[transmitter_name].values)) == 0:
            return data
        data = data.sort_values('distance_mean').reset_index(drop=True)
        not_visited = []
        for i, row in data.iterrows():
            if i in not_visited:
                continue
            temp_df = data[
                data.participant_identifier.isin(
                    [row[owner_name], row[transmitter_name]])
                & data.user.isin([row[owner_name], row[transmitter_name]])]
            if temp_df.shape[0] == 0:
                continue
            else:
                indexes = [u for u in list(temp_df.index.values) if u != i]
                not_visited += indexes
        data = data[~data.index.isin(not_visited)]
        return data

    ds = ds.withColumn(start_time_name, F.col(start_time_name).cast('double')) \
        .withColumn(end_time_name, F.col(end_time_name).cast('double'))
    data = ds.groupBy([centroid_id_name, 'version']).apply(remove_duplicates)
    data = data.withColumn(start_time_name, F.col(start_time_name).cast('timestamp')) \
        .withColumn(end_time_name, F.col(end_time_name).cast('timestamp'))
    data = data.withColumn('covid', F.lit(0))
    return DataStream(data=data, metadata=Metadata())
Example #10
def test_load_spark_df(size, num_samples, num_movies, title_example,
                       genres_example):
    """Test MovieLens dataset load into pySpark.DataFrame
    """
    spark = start_or_get_spark("MovieLensLoaderTesting")

    # Check that the function loads the correct dataset
    df = movielens.load_spark_df(spark, size=size)
    assert df.count() == num_samples
    assert len(df.columns) == 4

    # Test that it handles headers with a different number of columns
    header = ["a"]
    df = movielens.load_spark_df(spark, header=header)
    assert len(df.columns) == len(header)

    header = ["a", "b", "c", "d", "e"]
    with pytest.warns(Warning):
        df = movielens.load_spark_df(spark, header=header)
        assert len(df.columns) == 4

    # Test title load
    df = movielens.load_spark_df(spark, size=size, title_col="Title")
    assert len(df.columns) == 5
    # Movie 1 is Toy Story
    title = df.filter(
        col(DEFAULT_ITEM_COL) == 1).select("Title").limit(2).collect()
    assert title[0][0] == title[1][0]
    assert title[0][0] == title_example

    # Test genres load
    df = movielens.load_spark_df(spark, size=size, genres_col="Genres")
    assert len(df.columns) == 5
    # Movie 1 is Toy Story
    genres = df.filter(
        col(DEFAULT_ITEM_COL) == 1).select("Genres").limit(2).collect()
    assert genres[0][0] == genres[1][0]
    assert genres[0][0] == genres_example

    # Test movie data load (not rating data)
    df = movielens.load_spark_df(spark,
                                 size=size,
                                 header=None,
                                 title_col="Title",
                                 genres_col="Genres")
    assert df.count() == num_movies
    assert len(df.columns) == 3

    # Test that it handles a wrong size argument
    with pytest.raises(ValueError):
        movielens.load_spark_df(spark, size='10k')
    # Test that it handles a wrong cache path argument
    with pytest.raises(ValueError):
        movielens.load_spark_df(spark, local_cache_path='.')

    # Test that the schema is used when both schema and header are provided
    header = ["1", "2"]
    schema = StructType([StructField("u", IntegerType())])
    with pytest.warns(Warning):
        df = movielens.load_spark_df(spark, header=header, schema=schema)
        assert len(df.columns) == len(schema)
Example #11
mama mia mama mia
first thing we need to do is create the training data
-check if we already have trained data
-clean the data
then we need to pull it through the algorithm
-gather the training data
-gather the input sentences
-run the analysis
'''

the_trained_file_exist = path.isfile(trained_data)
if the_trained_file_exist:
    trained_data = spark.read.csv(trained_data, header=False)
else:
    the_schema = StructType([
        StructField("classified", StringType()),
        StructField("content", StringType())
    ])
    raw_data = spark.read.csv(training_data, header=False, schema=the_schema)
    raw_data.printSchema()
    raw_data.show(5)
    #clean up data...
    raw_data = raw_data.withColumn('content', lower(col('content')))
    raw_data = raw_data.withColumn('content', split(col('content'), ' '))
    '''
    from there, we just need to get rid of all the special chars and
    stuff so we can just have words.
    '''
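    # One possible way to do that (a sketch, not from the original source):
    # apply a regexp_replace between the lower() and split() steps above, e.g.
    #   raw_data = raw_data.withColumn('content',
    #                                  regexp_replace(col('content'), '[^a-z ]', ''))
    # assuming regexp_replace is imported from pyspark.sql.functions.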
    raw_data.show(5)
    spam_clean_data = raw_data.filter(raw_data.classified == 'spam')
    ham_clean_data = raw_data.filter(raw_data.classified == 'ham')
    ham_clean_data.show(5)
Example #12
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
spark = SparkSession.builder.appName("tweets").getOrCreate()
spark.sparkContext.setLogLevel('WARN')
userSchema = StructType()\
.add("id", "integer")\
.add("lang", "string")\
.add("date", "string")\
.add("source", "string")\
.add("len", "integer")\
.add("likes", "integer")\
.add("RT", "integer")\
.add("hashtags", "string")\
.add("usernames", "string")\
.add("userid", "string")\
.add("name", "string")\
.add("place", "string")\
.add("followers", "integer")\
.add("friends", "integer")

dfCSV = spark.readStream.option("sep", ";").option(
    "header", "false").schema(userSchema).csv("/stream")
dfCSV.createOrReplaceTempView("tweets")

q2 = spark.sql(
    "select name,max(followers/friends) as FRRatio from tweets group by name order by FRRatio desc limit 5"
)
query = q2.writeStream.outputMode("complete").format("console").start()
query.awaitTermination(60)
query.stop()
Example #13
  ('spark.app.name', 'nyctaxi'), \
  ('spark.cores.max', '16'), \
  ('spark.driver.memory','124g')])
sc = SparkSession.builder.config(conf=conf).getOrCreate()

#Need to define schema for NYC taxi data.
schema = StructType([ \
    StructField('VendorID', IntegerType(), True), \
    StructField("tpep_pickup_datetime",StringType(),True), \
    StructField("tpep_dropoff_datetime",StringType(),True), \
    StructField("passenger_count", IntegerType(), True), \
    StructField("trip_distance", DoubleType(), True), \
    StructField('RateCodeID', IntegerType(), True), \
    StructField('store_and_fwd_flag', StringType(), True), \
    StructField("PULocationID",IntegerType(),True), \
    StructField("DOLocationID", IntegerType(), True), \
    StructField("payment_type", IntegerType(), True), \
    StructField("fare_amount", DoubleType(), True), \
    StructField("extra", DoubleType(), True), \
    StructField("mta_tax", DoubleType(), True), \
    StructField("tip_amount", DoubleType(), True), \
    StructField("tolls_amount", DoubleType(), True), \
    StructField("improvement_surcharge", DoubleType(), True), \
    StructField("total_amount", DoubleType(), True), \
    StructField("congestion_surcharge", DoubleType(), True)
                     ])
df = sc.read.format("csv").options(header='True').schema(schema).load(
    "../../dan606/nyctaxi/trip data/yellow_tripdata_2019-11.csv")
df.printSchema()
#df = sqlContext.createDataFrame(df.head(500000), df.schema) #not a random sample

df.count()
Example #14
sqlContext.cacheTable('Ratings')
tt = time() - t0
print "Data is loaded in %s seconds" % round(tt,3)

rank = 8
seed = 5
iterations = 10
regularization_parameter = 0.1

t0 = time()
print "Training the ALS model..."
model = ALS.train(dfRates.rdd.map(lambda r: (int(r[0]), int(r[1]), r[2])).cache(), rank=rank, seed=seed,
                  iterations=iterations, lambda_=regularization_parameter)
tt = time() - t0
print "ALS model built!"
print "New model trained in %s seconds" % round(tt,3)

predictions = model.recommendProductsForUsers(10) \
    .flatMap(lambda pair: pair[1]) \
    .map(lambda rating: (rating.user, rating.product, rating.rating))

schema = StructType([StructField("userId", StringType(), True), StructField("movieId", StringType(), True),
                     StructField("prediction", FloatType(), True)])

dfToSave = sqlContext.createDataFrame(predictions, schema)
#dfToSave.write.jdbc(url=jdbcUrl, table=TABLE_RECOMMENDATIONS)
t0 = time()
dfToSave.write.option('driver', 'org.postgresql.Driver').jdbc(jdbcUrl, TABLE_RECOMMENDATIONS, mode='overwrite')
tt = time() - t0

print "Recommendation saved to DB in %s seconds" % round(tt,3)
Example #15
    def _transform(self, df):
        import copy
        from pyspark.sql.types import StructField, StructType
        from pyspark.ml.linalg import VectorUDT

        model_pre_predict = self.getModel()
        deserialize = deserialize_fn()
        serialize = serialize_fn()
        serialized_model = serialize(model_pre_predict)

        input_shapes = self.getInputShapes()
        label_cols = self.getLabelColumns()
        output_cols = self.getOutputCols()
        feature_cols = self.getFeatureColumns()
        metadata = self._get_metadata()

        final_output_cols = util.get_output_cols(df.schema, output_cols)

        def predict(rows):
            from pyspark import Row
            from pyspark.ml.linalg import DenseVector, SparseVector

            model = deserialize(serialized_model)
            # Perform predictions.
            for row in rows:
                fields = row.asDict().copy()

                # Note: if the col is SparseVector, torch.tensor(col) correctly converts it to a
                # dense torch tensor.
                data = [
                    torch.tensor([row[col]]).reshape(shape)
                    for col, shape in zip(feature_cols, input_shapes)
                ]

                with torch.no_grad():
                    preds = model(*data)

                if not isinstance(preds, list) and not isinstance(
                        preds, tuple):
                    preds = [preds]

                for label_col, output_col, pred in zip(label_cols, output_cols,
                                                       preds):
                    meta = metadata[label_col]
                    col_type = meta['spark_data_type']
                    # dtype for dense and spark tensor is always np.float64
                    if col_type == DenseVector:
                        shape = np.prod(pred.shape)
                        flattened_pred = pred.reshape(shape, )
                        field = DenseVector(flattened_pred)
                    elif col_type == SparseVector:
                        shape = meta['shape']
                        flattened_pred = pred.reshape(shape, )
                        nonzero_indices = flattened_pred.nonzero()[0]
                        field = SparseVector(shape, nonzero_indices,
                                             flattened_pred[nonzero_indices])
                    elif pred.shape.numel() == 1:
                        # If the column is scalar type, int, float, etc.
                        value = pred.item()
                        python_type = util.spark_scalar_to_python_type(
                            col_type)
                        if issubclass(python_type, numbers.Integral):
                            value = round(value)
                        field = python_type(value)
                    else:
                        field = DenseVector(pred.reshape(-1))

                    fields[output_col] = field

                values = [fields[col] for col in final_output_cols]

                yield Row(*values)

        spark0 = SparkSession._instantiatedSession

        final_output_fields = []

        # copy input schema
        for field in df.schema.fields:
            final_output_fields.append(copy.deepcopy(field))

        # append output schema
        override_fields = df.limit(1).rdd.mapPartitions(
            predict).toDF().schema.fields[-len(output_cols):]
        for name, override, label in zip(output_cols, override_fields,
                                         label_cols):
            # default data type as label type
            data_type = metadata[label]['spark_data_type']()

            if type(override.dataType) == VectorUDT:
                # Override output to vector. This is mainly for torch's classification loss
                # where label is a scalar but model output is a vector.
                data_type = VectorUDT()
            final_output_fields.append(
                StructField(name=name, dataType=data_type, nullable=True))

        final_output_schema = StructType(final_output_fields)

        pred_rdd = df.rdd.mapPartitions(predict)

        # Use the schema from previous section to construct the final DF with prediction
        return spark0.createDataFrame(pred_rdd, schema=final_output_schema)
Example #16
    def run(
        self,
        predicted_data_file_path: str,
        passed_paper_exam_data_file_path: str,
        send_scout_mail_data_file_path: str,
        aodrno_to_wjoid_data_file_path: str,
        applied_data_file_path: str,
        item_data_file_path: str
    ) -> bool:
        """execute."""
        # check parameter
        self.__param_check(
            predicted_data_file_path,
            passed_paper_exam_data_file_path,
            send_scout_mail_data_file_path,
            aodrno_to_wjoid_data_file_path,
            applied_data_file_path,
            item_data_file_path
        )

        # make spark context
        spark = SparkSession\
            .builder\
            .appName('aggregate_data')\
            .config('spark.sql.crossJoin.enabled', 'true')\
            .config('spark.debug.maxToStringFields', 500)\
            .getOrCreate()
        sqlContext = SQLContext(sparkContext=spark.sparkContext, sparkSession=spark)

        # load predicted data
        custom_schema = StructType([
            StructField('predicted_wjoid', StringType(), True),
            StructField('predicted_wsfid', StringType(), True),
            StructField('predicted_score', FloatType(), True),
            StructField('predicted_aodrno', StringType(), True),
            StructField('predicted_wsfno', StringType(), True)
        ])
        predicted_df = sqlContext\
            .read\
            .format('csv')\
            .options(header='true')\
            .load(predicted_data_file_path, schema=custom_schema)\
            .na\
            .fill(0.0)

        # load aodrno to wjoid mapping data
        custom_schema = StructType([
            StructField('mapping_aodrno', StringType(), True),
            StructField('mapping_aodrid', StringType(), True),
            StructField('mapping_wjoid', StringType(), True)
        ])
        mapping_df = sqlContext\
            .read\
            .format('csv')\
            .options(header='true')\
            .load(aodrno_to_wjoid_data_file_path, schema=custom_schema)\
            .na\
            .fill(0.0)
        # join
        df = predicted_df.join(
            mapping_df,
            predicted_df['predicted_aodrno'] == mapping_df['mapping_aodrno'],
            'left_outer'
        )
        # delete unnecessary variables
        del(predicted_df)
        del(mapping_df)

        # load passed data
        custom_schema = StructType([
            StructField('passed_astfid', StringType(), True),
            StructField('passed_aodrid', StringType(), True)
        ])
        passed_df = sqlContext\
            .read\
            .format('csv')\
            .options(header='true')\
            .load(passed_paper_exam_data_file_path, schema=custom_schema)\
            .na\
            .fill(0.0)

        # join
        df = df.join(
            passed_df,
            (
                (df['mapping_aodrid'] == passed_df['passed_aodrid'])
                & (df['predicted_wsfid'] == passed_df['passed_astfid'])
            ),
            'left_outer'
        )
        # delete unnecessary variables
        del(passed_df)

        # load send scout mail data
        custom_schema = StructType([
            StructField('scout_mail_wsfid', StringType(), True),
            StructField('scout_mail_wjoid', StringType(), True)
        ])
        scout_mail_df = sqlContext\
            .read\
            .format('csv')\
            .options(header='true')\
            .load(send_scout_mail_data_file_path, schema=custom_schema)\
            .na\
            .fill(0.0)
        # join
        df = df.join(
            scout_mail_df,
            (
                (df['mapping_wjoid'] == scout_mail_df['scout_mail_wjoid'])
                & (df['predicted_wsfid'] == scout_mail_df['scout_mail_wsfid'])
            ),
            'left_outer'
        )
        # delete unnecessary variables
        del(scout_mail_df)

        # load applied data
        custom_schema = StructType([
            StructField('applied_astfid', StringType(), True),
            StructField('applied_aodrid', StringType(), True)
        ])
        applied_df = sqlContext\
            .read\
            .format('csv')\
            .options(header='true')\
            .load(applied_data_file_path, schema=custom_schema)\
            .na\
            .fill(0.0)
        # join
        df = df.join(
            applied_df,
            (
                (df['mapping_aodrid'] == applied_df['applied_aodrid'])
                & (df['predicted_wsfid'] == applied_df['applied_astfid'])
            ),
            'left_outer'
        )
        # delete unnecessary variables
        del(applied_df)

        # load item data
        items_df = sqlContext\
            .read\
            .format('csv')\
            .options(header='true')\
            .load(item_data_file_path)\
            .na\
            .fill(0.0)
        # join
        df = df.join(
            items_df,
            df['predicted_aodrno'] == items_df['i_aodrno'],
            'left_outer'
        )
        # delete unnecessary variables
        del(items_df)

        df.cache()

        # make udf
        f_column_check = udf(lambda l_1, l_2: self.__column_check(l_1, l_2), StringType())
        f_make_area_cd = udf(lambda l: self.__make_area_cd(l), StringType())

        # passed check
        df = df.withColumn(
            'passed_paper_exam',
            f_column_check(df['passed_astfid'], df['passed_aodrid'])
        )
        df = df.withColumn(
            'send_scout_mail',
            f_column_check(df['scout_mail_wsfid'], df['scout_mail_wjoid'])
        )
        df = df.withColumn(
            'applied',
            f_column_check(df['applied_astfid'], df['applied_aodrid'])
        )

        # area code
        df = df.withColumn('area_cd', f_make_area_cd(df['i_atdfkid']))

        df.cache()

        # aggregate

        # all data
        data = {}
        d = self.__calc_distribution(df)
        for k in d.keys():
            data[k] = [d[k]]

        # send scout mail
        tmp_df = df.where(df['send_scout_mail'] == '1')
        d = self.__calc_distribution(tmp_df)
        for k in d.keys():
            data[k].append(d[k])

        # send scout mail -> applied
        tmp_df = df.where(
            (df['send_scout_mail'] == '1') & (df['applied'] == '1')
        )
        d = self.__calc_distribution(tmp_df)
        for k in d.keys():
            data[k].append(d[k])

        # send scout mail -> passed paper exam
        tmp_df = df.where(
            (df['send_scout_mail'] == '1') & (df['passed_paper_exam'] == '1')
        )
        d = self.__calc_distribution(tmp_df)
        for k in d.keys():
            data[k].append(d[k])

        # not send scout mail -> applied
        tmp_df = df.where(
            (df['send_scout_mail'] != '1') & (df['applied'] == '1')
        )
        d = self.__calc_distribution(tmp_df)
        for k in d.keys():
            data[k].append(d[k])

        # not send scout mail -> passed paper exam
        tmp_df = df.where(
            (df['send_scout_mail'] != '1') & (df['passed_paper_exam'] == '1')
        )
        d = self.__calc_distribution(tmp_df)
        for k in d.keys():
            data[k].append(d[k])

        print('all')
        for k, v in sorted(data.items()):
            if int(v[2]) == 0:
                div_1 = 0
            else:
                div_1 = int(v[3]) / int(v[2])
            if int(v[4]) == 0:
                div_2 = 0
            else:
                div_2 = int(v[5]) / int(v[4])
            t = [v[0], v[1], v[2], v[3], div_1, v[4], v[5], div_2]

            print('%s\t%s' % (k, '\t'.join(map(str, t))))

        # count by each item
        for column in ['i_assyulid', 'area_cd']:
            print('')
            print(column)

            if column == 'i_assyulid':
                master = ['%02d' % (n) for n in range(1, 16)]
            elif column == 'area_cd':
                master = ['2', '3', '4', '5', '6', '7', '8', '0']
            else:
                master = []

            data = defaultdict(list)
            for m in master:
                tmp_data = {}
                column_df = df.where(df[column] == m)

                # send scout mail -> applied
                tmp_df = column_df.where(
                    (column_df['send_scout_mail'] == '1') & (column_df['applied'] == '1')
                )
                d = self.__calc_distribution(tmp_df)
                for k in d.keys():
                    tmp_data[k] = [d[k]]

                # send scout mail -> passed paper exam
                tmp_df = column_df.where(
                    (column_df['send_scout_mail'] == '1') & (column_df['passed_paper_exam'] == '1')
                )
                d = self.__calc_distribution(tmp_df)
                for k in d.keys():
                    tmp_data[k].append(d[k])

                # not send scout mail -> applied
                tmp_df = column_df.where(
                    (column_df['send_scout_mail'] != '1') & (column_df['applied'] == '1')
                )
                d = self.__calc_distribution(tmp_df)
                for k in d.keys():
                    tmp_data[k].append(d[k])

                # not send scout mail -> passed paper exam
                tmp_df = column_df.where(
                    (column_df['send_scout_mail'] != '1') & (column_df['passed_paper_exam'] == '1')
                )
                d = self.__calc_distribution(tmp_df)
                for k in d.keys():
                    tmp_data[k].append(d[k])

                for k, v in tmp_data.items():
                    if int(v[0]) == 0:
                        div_1 = 0
                    else:
                        div_1 = int(v[1]) / int(v[0])
                    if int(v[2]) == 0:
                        div_2 = 0
                    else:
                        div_2 = int(v[3]) / int(v[2])

                    data[k].append(v[0])
                    data[k].append(v[1])
                    data[k].append(div_1)
                    data[k].append(v[2])
                    data[k].append(v[3])
                    data[k].append(div_2)

            for k, v in sorted(data.items()):
                print('%s\t%s' % (k, '\t'.join(map(str, v))))

        # delete unnecessary variables
        del(column_df)

        # job with experience
        data = defaultdict(list)
        i = 1
        while i <= 2:
            if i == 1:
                must_assyulid_df = df.where(df['i_must_assyulid'].isNotNull())
            elif i == 2:
                must_assyulid_df = df.where(df['i_must_assyulid'].isNull())
            else:
                break

            # send scout mail -> applied
            tmp_df = must_assyulid_df.where(
                (must_assyulid_df['send_scout_mail'] == '1')
                & (must_assyulid_df['applied'] == '1')
            )
            d = self.__calc_distribution(tmp_df)
            for k in d.keys():
                data[k].append(d[k])

            # send scout mail -> passed paper exam
            tmp_df = must_assyulid_df.where(
                (must_assyulid_df['send_scout_mail'] == '1')
                & (must_assyulid_df['passed_paper_exam'] == '1')
            )
            d = self.__calc_distribution(tmp_df)
            for k in d.keys():
                data[k].append(d[k])

            # not send scout mail -> applied
            tmp_df = must_assyulid_df.where(
                (must_assyulid_df['send_scout_mail'] != '1')
                & (must_assyulid_df['applied'] == '1')
            )
            d = self.__calc_distribution(tmp_df)
            for k in d.keys():
                data[k].append(d[k])

            # not send scout mail -> passed paper exam
            tmp_df = must_assyulid_df.where(
                (must_assyulid_df['send_scout_mail'] != '1')
                & (must_assyulid_df['passed_paper_exam'] == '1')
            )
            d = self.__calc_distribution(tmp_df)
            for k in d.keys():
                data[k].append(d[k])

            i += 1

        print('')
        print('job with experience')
        for k, v in sorted(data.items()):
            if int(v[0]) == 0:
                div_1 = 0
            else:
                div_1 = int(v[1]) / int(v[0])
            if int(v[2]) == 0:
                div_2 = 0
            else:
                div_2 = int(v[3]) / int(v[2])

            if int(v[4]) == 0:
                div_3 = 0
            else:
                div_3 = int(v[5]) / int(v[4])
            if int(v[6]) == 0:
                div_4 = 0
            else:
                div_4 = int(v[7]) / int(v[6])

            t = [v[0], v[1], div_1, v[2], v[3], div_2, v[4], v[5], div_3, v[6], v[7], div_4]

            print('%s\t%s' % (k, '\t'.join(map(str, t))))

        # delete unnecessary variables
        del(must_assyulid_df)

        return True
Example #17
    ArrayType,
    BooleanType,
    DoubleType,
    LongType,
    StringType,
    StructField,
    StructType,
)

HN_ITEMS_SCHEMA = StructType([
    StructField("id", LongType()),
    StructField("parent", DoubleType()),
    StructField("time", LongType()),
    StructField("type", StringType()),
    StructField("by", StringType()),
    StructField("text", StringType()),
    StructField("kids", ArrayType(LongType())),
    StructField("dead", BooleanType()),
    StructField("score", DoubleType()),
    StructField("title", StringType()),
    StructField("descendants", DoubleType()),
    StructField("url", StringType()),
])

ITEM_FIELD_NAMES = [field.name for field in HN_ITEMS_SCHEMA.fields]


@asset(
    io_manager_key="parquet_io_manager",
    required_resource_keys={"hn_client"},
    description=
    "Items from the Hacker News API: each is a story or a comment on a story.",
Example #18
def main(iso_date, base_path):

    APP_NAME = "make_predictions.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # Load each and every model in the pipeline
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load the departure delay bucketizer
    departure_bucketizer_path = "{}/models/departure_bucketizer.bin".format(
        base_path)
    departure_bucketizer = Bucketizer.load(departure_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml import PipelineModel

    string_vectorizer_pipeline_models = {}
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "FlightNum", "DepDelayBucket"
    ]:
        string_pipeline_model_path = "{}/models/string_indexer_pipeline_model_{}.bin".format(
            base_path, column)
        string_pipeline_model = PipelineModel.load(string_pipeline_model_path)
        string_vectorizer_pipeline_models[column] = string_pipeline_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the final assembler
    final_assembler_path = "{}/models/final_vector_assembler.bin".format(
        base_path)
    final_assembler = VectorAssembler.load(final_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Run the requests through the transformations from training
    #

    # Get today and tomorrow's dates as iso strings to scope query
    today_dt = iso8601.parse_date(iso_date)
    rounded_today = today_dt.date()
    iso_today = rounded_today.isoformat()

    # Build the day's input path: a date based primary key directory structure
    today_input_path = "{}/data/prediction_tasks_daily.json/{}".format(
        base_path, iso_today)

    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField

    schema = StructType([
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Timestamp", TimestampType(), True),
    ])

    prediction_requests = spark.read.json(today_input_path, schema=schema)
    prediction_requests.show()

    # Bucketize the departure and arrival delays for classification
    ml_bucketized_features = departure_bucketizer.transform(
        prediction_requests)

    # Check the buckets
    ml_bucketized_features.select("DepDelay", "DepDelayBucket").show()

    # Vectorize string fields with the corresponding pipeline for that column
    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "FlightNum", "DepDelayBucket"
    ]:
        string_pipeline_path = "{}/models/string_indexer_pipeline_{}.bin".format(
            base_path, column)
        string_pipeline_model = string_vectorizer_pipeline_models[column]
        ml_bucketized_features = string_pipeline_model.transform(
            ml_bucketized_features)
        ml_bucketized_features = ml_bucketized_features.drop(column + "_index")

    # Vectorize numeric columns
    ml_bucketized_features = vector_assembler.transform(ml_bucketized_features)

    # Drop the original numeric columns
    numeric_columns = ["DepDelay", "Distance"]

    # Combine various features into one feature vector, 'features'
    final_vectorized_features = final_assembler.transform(
        ml_bucketized_features)
    final_vectorized_features.show()

    # Drop the individual vector columns
    feature_columns = [
        "Carrier_vec", "DayOfMonth_vec", "DayOfWeek_vec", "DayOfYear_vec",
        "Origin_vec", "Dest_vec", "FlightNum_vec", "DepDelayBucket_vec",
        "NumericFeatures_vec"
    ]
    for column in feature_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop(
        "rawPrediction").drop("probability")

    # Inspect the output
    final_predictions.show()

    # Build the day's output path: a date based primary key directory structure
    today_output_path = "{}/data/prediction_results_daily.json/{}".format(
        base_path, iso_today)

    # Save the output to its daily bucket
    final_predictions.repartition(1).write.mode("overwrite").json(
        today_output_path)
Example #19
schema = StructType() \
    .add("conversationTitle", "string") \
    .add("tapeNumber", "string") \
    .add("conversationNumber", "string") \
    .add("identifier", "string") \
    .add("startDateTime", "string") \
    .add("endDateTime", "string") \
    .add("startDate", "string") \
    .add("startTime", "string") \
    .add("endDate", "string") \
    .add("endTime", "string") \
    .add("dateCertainty", "string") \
    .add("timeCertainty", "string") \
    .add("participants", "string") \
    .add("description", "string") \
    .add("locationCode", "string") \
    .add("recordingDevice", "string") \
    .add("latitudeEstimated", "string") \
    .add("longitudeEstimated", "string") \
    .add("collection", "string") \
    .add("collectionURL", "string") \
    .add("chronCode", "string") \
    .add("chronRelease", "string") \
    .add("chronReleaseDate", "string") \
    .add("aogpRelease", "string") \
    .add("aogpSegment", "string") \
    .add("digitalAccess", "string") \
    .add("physicalAccess", "string") \
    .add("contactEmail", "string") \
    .add("lastModified", "string") \
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, to_json, col, unbase64, base64, split, expr
from pyspark.sql.types import StructField, StructType, StringType, BooleanType, ArrayType, DateType, FloatType

# TO-DO: create a StructType for the Kafka redis-server topic which has all changes made to Redis - before Spark 3.0.0, schema inference is not automatic
redisMessageSchema = StructType([
    StructField('key', StringType()),
    StructField('value', StringType()),
    StructField('expiredType', StringType()),
    StructField('expiredValue', StringType()),
    StructField('existType', StringType()),
    StructField('ch', StringType()),
    StructField('incr', BooleanType()),
    StructField(
        'zSetEntries',
        ArrayType(
            StructType([
                StructField('element', StringType()),
                StructField('score', StringType())
            ])))
])

# TO-DO: create a StructType for the Customer JSON that comes from Redis - before Spark 3.0.0, schema inference is not automatic
customerJSONSchema = StructType([
    StructField('customerName', StringType()),
    StructField('email', StringType()),
    StructField('phone', StringType()),
    StructField('birthDay', StringType())
])

# TO-DO: create a StructType for the Kafka stedi-events topic which has the Customer Risk JSON that comes from Redis - before Spark 3.0.0, schema inference is not automatic
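# A minimal sketch of what such a schema might look like (the field names below
# are assumptions, not taken from the original source):
# customerRiskSchema = StructType([
#     StructField('customer', StringType()),
#     StructField('score', FloatType())
# ])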
Example #21
def write_schema(src_name):
    """
    define outgoing file's schema
    """
    return {
        'yellow':
            StructType([
                StructField('VendorID', StringType(), True),
                StructField('tpep_pickup_datetime', TimestampType(), True),
                StructField('tpep_dropoff_datetime', TimestampType(), True),
                StructField('passenger_count', IntegerType(), True),
                StructField('trip_distance', DecimalType(10, 4), True),
                StructField('RatecodeID', StringType(), True),
                StructField('store_and_fwd_flag', StringType(), True),
                StructField('PULocationID', StringType(), True),
                StructField('DOLocationID', StringType(), True),
                StructField('payment_type', StringType(), True),
                StructField('fare_amount', DecimalType(10, 2), True),
                StructField('extra', DecimalType(10, 2), True),
                StructField('mta_tax', DecimalType(10, 2), True),
                StructField('tip_amount', DecimalType(10, 2), True),
                StructField('tolls_amount', DecimalType(10, 2), True),
                StructField('improvement_surcharge', DecimalType(10, 2), True),
                StructField('total_amount', DecimalType(10, 2), True),
                StructField('congestion_surcharge', DecimalType(10, 2), True),
                StructField('file_name', StringType(), True),
                StructField('update_date', TimestampType(), True),
                StructField('dt', StringType(), True)
            ]),
        'green':
            StructType([
                StructField('VendorID', StringType(), True),
                StructField('lpep_pickup_datetime', TimestampType(), True),
                StructField('lpep_dropoff_datetime', TimestampType(), True),
                StructField('store_and_fwd_flag', StringType(), True),
                StructField('RatecodeID', StringType(), True),
                StructField('PULocationID', StringType(), True),
                StructField('DOLocationID', StringType(), True),
                StructField('passenger_count', IntegerType(), True),
                StructField('trip_distance', DecimalType(10, 4), True),
                StructField('fare_amount', DecimalType(10, 2), True),
                StructField('extra', DecimalType(10, 2), True),
                StructField('mta_tax', DecimalType(10, 2), True),
                StructField('tip_amount', DecimalType(10, 2), True),
                StructField('tolls_amount', DecimalType(10, 2), True),
                StructField('ehail_fee', DecimalType(10, 2), True),
                StructField('improvement_surcharge', DecimalType(10, 2), True),
                StructField('total_amount', DecimalType(10, 2), True),
                StructField('payment_type', StringType(), True),
                StructField('trip_type', StringType(), True),
                StructField('congestion_surcharge', DecimalType(10, 2), True),
                StructField('file_name', StringType(), True),
                StructField('update_date', TimestampType(), True),
                StructField('dt', StringType(), True)
            ]),
        'corrupted':
            StructType([
                StructField('corrupted', StringType(), True),
                StructField('file_name', StringType(), True),
                StructField('update_date', TimestampType(), True)
            ])
    }[src_name]
Example #22
sc = SparkContext('local',"task2")

#rdd1 = sc.textFile("shape_stat.csv")
#rdd2 = sc.textFile("shape.csv")
rdd1 = sc.textFile(data2)
rdd2 = sc.textFile(data1)

mappedRdd1 = rdd1.map(lambda x: x.split(','))
mappedRdd2 = rdd2.map(lambda x: x.split(','))


spark = SparkSession(sc)

header=mappedRdd1.first()
fields = [StructField(field_name, StringType(), True) for field_name in header]
schema = StructType(fields)
frdd1=mappedRdd1.filter(lambda row:row != header)
df1=spark.createDataFrame(frdd1,schema=schema)


header=mappedRdd2.first()
fields = [StructField(field_name, StringType(), True) for field_name in header]
schema = StructType(fields)
frdd2=mappedRdd2.filter(lambda row:row != header)
df2=spark.createDataFrame(frdd2,schema=schema)

df1=df1[df1.word==word]
df2=df2[df2.word==word]

df1=df1.repartitionByRange(100,'key_id')
df2=df2.repartitionByRange(100,'key_id')
Example #23
#https://medium.com/@nitingupta.bciit/apache-spark-streaming-from-text-file-fa40155b7a21
#https://hackersandslackers.com/structured-streaming-in-pyspark/

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType
spark = SparkSession.builder.appName("Streaming").config('').getOrCreate()

schema_1 = StructType([
    StructField('Athlete', StringType(), True),
    StructField('Age', IntegerType(), True),
    StructField('Country', StringType(), True),
    StructField('Year', IntegerType(), True),
    StructField('Closing Ceremony Date', TimestampType(), True),
    StructField('Sport', StringType(), True),
    StructField('Medal and Number', StringType(), True),
])

activityDataSample = 'folder_path'

# don't use maxFilesPerTrigger=1 in production
streamingDF = (spark.readStream.schema(schema_1).option(
    "maxFilesPerTrigger", 1).csv(activityDataSample))


streamingDF = streamingDF.filter((streamingDF.Country != 'null') & (streamingDF.Country != 'Null') \
                                 & (streamingDF.Country != ''))
streamingActionCountsDF = streamingDF.groupBy('Country').count()

print(streamingActionCountsDF.isStreaming)

#As we are testing it in standalone
Example #24
schema = StructType([
    StructField(
        "results",
        ArrayType(
            StructType([
                StructField(
                    "user",
                    StructType([
                        StructField("gender", StringType()),
                        StructField(
                            "name",
                            StructType([
                                StructField("title", StringType()),
                                StructField("first", StringType()),
                                StructField("last", StringType())
                            ])),
                        StructField(
                            "location",
                            StructType([
                                StructField("street", StringType()),
                                StructField("city", StringType()),
                                StructField("state", StringType()),
                                StructField("zip", StringType())
                            ])),
                        StructField("email", StringType()),
                        StructField("username", StringType()),
                        StructField("password", StringType()),
                        StructField("salt", StringType()),
                        StructField("md5", StringType()),
                        StructField("sha1", StringType()),
                        StructField("sha256", StringType()),
                        StructField("registered", StringType()),
                        StructField("dob", StringType()),
                        StructField("phone", StringType()),
                        StructField("cell", StringType()),
                        StructField(
                            "picture",
                            StructType([
                                StructField("large", StringType()),
                                StructField("medium", StringType()),
                                StructField("thumbnail", StringType())
                            ]))
                    ]))
            ]))),
    StructField("nationality", StringType()),
    StructField("seed", StringType()),
    StructField("version", StringType())
])
Example #25
    def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True):
        """
        Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.

        When ``schema`` is a list of column names, the type of each column
        will be inferred from ``data``.

        When ``schema`` is ``None``, it will try to infer the schema (column names and types)
        from ``data``, which should be an RDD of :class:`Row`,
        or :class:`namedtuple`, or :class:`dict`.

        When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match
        the real data, or an exception will be thrown at runtime. If the given schema is not
        :class:`pyspark.sql.types.StructType`, it will be wrapped into a
        :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value".
        Each record will also be wrapped into a tuple, which can be converted to a row later.

        If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
        rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

        :param data: an RDD of any kind of SQL data representation (e.g. row, tuple, int, boolean,
            etc.), or :class:`list`, or :class:`pandas.DataFrame`.
        :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of
            column names, default is ``None``.  The data type string format equals to
            :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can
            omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use
            ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use
            ``int`` as a short name for ``IntegerType``.
        :param samplingRatio: the sample ratio of rows used for inferring
        :param verifySchema: verify data types of every row against schema.
        :return: :class:`DataFrame`

        .. versionchanged:: 2.1
           Added verifySchema.

        >>> l = [('Alice', 1)]
        >>> spark.createDataFrame(l).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> spark.createDataFrame(l, ['name', 'age']).collect()
        [Row(name=u'Alice', age=1)]

        >>> d = [{'name': 'Alice', 'age': 1}]
        >>> spark.createDataFrame(d).collect()
        [Row(age=1, name=u'Alice')]

        >>> rdd = sc.parallelize(l)
        >>> spark.createDataFrame(rdd).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> df = spark.createDataFrame(rdd, ['name', 'age'])
        >>> df.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql import Row
        >>> Person = Row('name', 'age')
        >>> person = rdd.map(lambda r: Person(*r))
        >>> df2 = spark.createDataFrame(person)
        >>> df2.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql.types import *
        >>> schema = StructType([
        ...    StructField("name", StringType(), True),
        ...    StructField("age", IntegerType(), True)])
        >>> df3 = spark.createDataFrame(rdd, schema)
        >>> df3.collect()
        [Row(name=u'Alice', age=1)]

        >>> spark.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
        [Row(name=u'Alice', age=1)]
        >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
        [Row(0=1, 1=2)]

        >>> spark.createDataFrame(rdd, "a: string, b: int").collect()
        [Row(a=u'Alice', b=1)]
        >>> rdd = rdd.map(lambda row: row[1])
        >>> spark.createDataFrame(rdd, "int").collect()
        [Row(value=1)]
        >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        Py4JJavaError: ...
        """
        if isinstance(data, DataFrame):
            raise TypeError("data is already a DataFrame")

        if isinstance(schema, basestring):
            schema = _parse_datatype_string(schema)
        elif isinstance(schema, (list, tuple)):
            # Must re-encode any unicode strings to be consistent with StructField names
            schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema]

        try:
            import pandas
            has_pandas = True
        except Exception:
            has_pandas = False
        if has_pandas and isinstance(data, pandas.DataFrame):
            from pyspark.sql.utils import require_minimum_pandas_version
            require_minimum_pandas_version()

            if self.conf.get("spark.sql.execution.pandas.respectSessionTimeZone").lower() \
               == "true":
                timezone = self.conf.get("spark.sql.session.timeZone")
            else:
                timezone = None

            # If no schema supplied by user then get the names of columns only
            if schema is None:
                schema = [str(x) if not isinstance(x, basestring) else
                          (x.encode('utf-8') if not isinstance(x, str) else x)
                          for x in data.columns]

            if self.conf.get("spark.sql.execution.arrow.enabled", "false").lower() == "true" \
                    and len(data) > 0:
                try:
                    return self._create_from_pandas_with_arrow(data, schema, timezone)
                except Exception as e:
                    warnings.warn("Arrow will not be used in createDataFrame: %s" % str(e))
                    # Fallback to create DataFrame without arrow if raise some exception
            data = self._convert_from_pandas(data, schema, timezone)

        if isinstance(schema, StructType):
            verify_func = _make_type_verifier(schema) if verifySchema else lambda _: True

            def prepare(obj):
                verify_func(obj)
                return obj
        elif isinstance(schema, DataType):
            dataType = schema
            schema = StructType().add("value", schema)

            verify_func = _make_type_verifier(
                dataType, name="field value") if verifySchema else lambda _: True

            def prepare(obj):
                verify_func(obj)
                return obj,
        else:
            prepare = lambda obj: obj

        if isinstance(data, RDD):
            rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
        else:
            rdd, schema = self._createFromLocal(map(prepare, data), schema)
        jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
        jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

spark = SparkSession.builder.appName("total-amount-spent").getOrCreate()

schema = StructType([
    StructField("customerID", IntegerType(), True),
    StructField("itemID", IntegerType(), True),
    StructField("amountSpent", FloatType(), True)
])

df = spark.read.schema(schema).csv("file:///SparkCourse/customer-orders.csv")
df.printSchema()

customers = df.select("customerID", "amountSpent")
amountSpent = customers.groupBy("customerID").sum("amountSpent")
amountSpentRounded = amountSpent \
    .withColumn("amountSpent", func.round(func.col("sum(amountSpent)"), 2)) \
    .select("customerID", "amountSpent") \
    .sort("amountSpent")

results = amountSpentRounded.collect()

for result in results:
    print(result[0], result[1])

spark.stop()
    @F.udf(returnType=IntegerType())
    def boro_to_borocode(x):
        '''Convert boro string to borocode'''
        return {"NY": 1, "MN": 1, "BX": 2, "BRONX": 2, \
            "BK": 3, "K": 33, "KINGS": 3, "KING": 3, "BKLYN": 4, \
            "Q": 4, "QUEEN": 4, "QN": 4, "QNS": 4, "QU": 4, \
            "ST": 5, "SI": 5, "R": 5, "RICHMOND": 5}.get(x, None)

    @F.udf(returnType=StringType())
    def trim_street(x):
        '''Remove repeating whitespaces'''
        return ' '.join(x.upper().split())

    count_schema = StructType([StructField('PHYSICALID', IntegerType(), True),\
        StructField('year', IntegerType(), True),\
        StructField('count', IntegerType(), True)])

    # get the violation data into the dataframe
    # filter for null values  and make sure home address has the right format
    # extract year from issued date and make sure the year is in the 2015 to 2019 range
    # convert boro to borocode and only make sure nyc boroughs go through by using the null filter
    # trim streetname of any extra white spaces and collect violations based on a common streetname and borough.
    # The violations collection row is called `violations`
    df_violations = csv_df(sqlContext, os.path.join(sys.argv[1] if len(sys.argv) > 1 else "nyc_parking_violation", "*.csv"))\
        .select("Issue Date", "Street Name", "House Number", "Violation County").dropna()\
        .filter(F.col("House Number").rlike("^(\d+)((-(\d+))*)$"))\
        .withColumn("year", date_to_year(F.col("Issue Date")))\
        .filter("2015 <= year and year <= 2019")\
        .withColumn("BOROCODE", boro_to_borocode(F.col("Violation County")))\
        .dropna(subset=["BOROCODE"])
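    # NOTE: the original chained expression is truncated at this point in the
    # excerpt. Based on the comments above, a hypothetical continuation might
    # trim the street names and collect the violations per (street, borough)
    # pair; the aggregation below is an assumption, not the original code.
    df_violations = df_violations\
        .withColumn("Street Name", trim_street(F.col("Street Name")))\
        .groupBy("Street Name", "BOROCODE")\
        .agg(F.collect_list(F.struct("House Number", "year")).alias("violations"))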
import unittest
from pyspark.sql import SparkSession
from pipeline import pipeline
from pyspark.sql.types import StructType, StructField, StringType

message_schema = StructType([StructField("name", StringType())])


class Test(unittest.TestCase):
    def test_pipeline(self) -> None:
        spark = _spark()
        data = spark.readStream.json(path="test_data/", schema=message_schema)

        stream = pipeline.process(data)

        stream.processAllAvailable()
        stream.stop()

        self.assertEqual(
            2,
            spark.sql("SELECT COUNT(1) AS count FROM example").first()
            ["count"])
        self.assertEqual(
            1,
            spark.sql("SELECT COUNT(1) AS count FROM example WHERE name = 'a'"
                      ).first()["count"])
        self.assertEqual(
            1,
            spark.sql("SELECT COUNT(1) AS count FROM example WHERE name = 'b'"
                      ).first()["count"])
    secret = open(app_secrets_path)
    app_secret = yaml.load(secret, Loader=yaml.FullLoader)

    # Setup spark to use s3
    hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.s3a.access.key", app_secret["s3_conf"]["access_key"])
    hadoop_conf.set("fs.s3a.secret.key",
                    app_secret["s3_conf"]["secret_access_key"])

    print(
        "\nCreating dataframe ingestion CSV file using 'SparkSession.read.format()'"
    )

    fin_schema = StructType() \
        .add("id", IntegerType(), True) \
        .add("has_debt", BooleanType(), True) \
        .add("has_financial_dependents", BooleanType(), True) \
        .add("has_student_loans", BooleanType(), True) \
        .add("income", DoubleType(), True)

    fin_df = spark.read \
        .option("header", "false") \
        .option("delimiter", ",") \
        .format("csv") \
        .schema(fin_schema) \
        .load("s3a://" + app_conf["s3_conf"]["s3_bucket"] + "/finances.csv")

    fin_df.printSchema()
    fin_df.show()

    print(
        "Creating dataframe ingestion CSV file using 'SparkSession.read.csv()',"
Beispiel #30
0
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, max, struct, explode
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

spark = SparkSession \
    .builder \
    .appName("BigData") \
    .getOrCreate()

schema = StructType([
    StructField("id", StringType(), True),
    StructField("language", StringType(), True),
    StructField("date", StringType(), True),
    StructField("source", StringType(), True),
    StructField("len", StringType(), True),
    StructField("likes", StringType(), True),
    StructField("RTs", StringType(), True),
    StructField("Hashtags", StringType(), True),
    StructField("usermentionnames", StringType(), True),
    StructField("usermentionid", StringType(), True),
    StructField("name", StringType(), True),
    StructField("place", StringType(), True),
    StructField("followers", DoubleType(), True),
    StructField("friends", DoubleType(), True),
])
lines = spark \
    .readStream \
    .option("sep", ";") \
    .schema(schema) \
    .csv("hdfs://localhost:8080/stream")

tags = lines.select("Hashtags")
h = tags.select(explode(split("Hashtags", ",")).alias("Hashtags"))
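The streaming example above stops right after exploding the Hashtags column. A hedged sketch of how such a job might be finished, assuming the goal is a running hashtag count written to the console, is:

# Count each hashtag and stream the running totals to a console sink.
# This continuation is an assumption; the original snippet ends above.
counts = h.groupBy("Hashtags").count()

query = counts.writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()

query.awaitTermination()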
Beispiel #31
0
    def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True):
        """
        Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.

        When ``schema`` is a list of column names, the type of each column
        will be inferred from ``data``.

        When ``schema`` is ``None``, it will try to infer the schema (column names and types)
        from ``data``, which should be an RDD of :class:`Row`,
        or :class:`namedtuple`, or :class:`dict`.

        When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match
        the real data, or an exception will be thrown at runtime. If the given schema is not
        :class:`pyspark.sql.types.StructType`, it will be wrapped into a
        :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value".
        Each record will also be wrapped into a tuple, which can be converted to a row later.

        If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
        rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

        :param data: an RDD of any kind of SQL data representation (e.g. row, tuple, int, boolean,
            etc.), or :class:`list`, or :class:`pandas.DataFrame`.
        :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of
            column names, default is ``None``.  The data type string format equals to
            :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can
            omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use
            ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use
            ``int`` as a short name for ``IntegerType``.
        :param samplingRatio: the sample ratio of rows used for inferring
        :param verifySchema: verify data types of every row against schema.
        :return: :class:`DataFrame`

        .. versionchanged:: 2.1
           Added verifySchema.

        .. note:: Usage with spark.sql.execution.arrow.enabled=True is experimental.

        >>> l = [('Alice', 1)]
        >>> spark.createDataFrame(l).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> spark.createDataFrame(l, ['name', 'age']).collect()
        [Row(name=u'Alice', age=1)]

        >>> d = [{'name': 'Alice', 'age': 1}]
        >>> spark.createDataFrame(d).collect()
        [Row(age=1, name=u'Alice')]

        >>> rdd = sc.parallelize(l)
        >>> spark.createDataFrame(rdd).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> df = spark.createDataFrame(rdd, ['name', 'age'])
        >>> df.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql import Row
        >>> Person = Row('name', 'age')
        >>> person = rdd.map(lambda r: Person(*r))
        >>> df2 = spark.createDataFrame(person)
        >>> df2.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql.types import *
        >>> schema = StructType([
        ...    StructField("name", StringType(), True),
        ...    StructField("age", IntegerType(), True)])
        >>> df3 = spark.createDataFrame(rdd, schema)
        >>> df3.collect()
        [Row(name=u'Alice', age=1)]

        >>> spark.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
        [Row(name=u'Alice', age=1)]
        >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
        [Row(0=1, 1=2)]

        >>> spark.createDataFrame(rdd, "a: string, b: int").collect()
        [Row(a=u'Alice', b=1)]
        >>> rdd = rdd.map(lambda row: row[1])
        >>> spark.createDataFrame(rdd, "int").collect()
        [Row(value=1)]
        >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        Py4JJavaError: ...
        """
        SparkSession._activeSession = self
        self._jvm.SparkSession.setActiveSession(self._jsparkSession)
        if isinstance(data, DataFrame):
            raise TypeError("data is already a DataFrame")

        if isinstance(schema, basestring):
            schema = _parse_datatype_string(schema)
        elif isinstance(schema, (list, tuple)):
            # Must re-encode any unicode strings to be consistent with StructField names
            schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema]

        try:
            import pandas
            has_pandas = True
        except Exception:
            has_pandas = False
        if has_pandas and isinstance(data, pandas.DataFrame):
            from pyspark.sql.utils import require_minimum_pandas_version
            require_minimum_pandas_version()

            if self._wrapped._conf.pandasRespectSessionTimeZone():
                timezone = self._wrapped._conf.sessionLocalTimeZone()
            else:
                timezone = None

            # If no schema supplied by user then get the names of columns only
            if schema is None:
                schema = [str(x) if not isinstance(x, basestring) else
                          (x.encode('utf-8') if not isinstance(x, str) else x)
                          for x in data.columns]

            if self._wrapped._conf.arrowEnabled() and len(data) > 0:
                try:
                    return self._create_from_pandas_with_arrow(data, schema, timezone)
                except Exception as e:
                    from pyspark.util import _exception_message

                    if self._wrapped._conf.arrowFallbackEnabled():
                        msg = (
                            "createDataFrame attempted Arrow optimization because "
                            "'spark.sql.execution.arrow.enabled' is set to true; however, "
                            "failed by the reason below:\n  %s\n"
                            "Attempting non-optimization as "
                            "'spark.sql.execution.arrow.fallback.enabled' is set to "
                            "true." % _exception_message(e))
                        warnings.warn(msg)
                    else:
                        msg = (
                            "createDataFrame attempted Arrow optimization because "
                            "'spark.sql.execution.arrow.enabled' is set to true, but has reached "
                            "the error below and will not continue because automatic fallback "
                            "with 'spark.sql.execution.arrow.fallback.enabled' has been set to "
                            "false.\n  %s" % _exception_message(e))
                        warnings.warn(msg)
                        raise
            data = self._convert_from_pandas(data, schema, timezone)

        if isinstance(schema, StructType):
            verify_func = _make_type_verifier(schema) if verifySchema else lambda _: True

            def prepare(obj):
                verify_func(obj)
                return obj
        elif isinstance(schema, DataType):
            dataType = schema
            schema = StructType().add("value", schema)

            verify_func = _make_type_verifier(
                dataType, name="field value") if verifySchema else lambda _: True

            def prepare(obj):
                verify_func(obj)
                return obj,
        else:
            prepare = lambda obj: obj

        if isinstance(data, RDD):
            rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
        else:
            rdd, schema = self._createFromLocal(map(prepare, data), schema)
        jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
        jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df
Beispiel #32
0
def bluetooth_encounter(data,
                        st: datetime,
                        et: datetime,
                        distance_threshold=12,
                        n_rows_threshold=8,
                        time_threshold=10 * 60,
                        ltime=True):
    """

    :param ds: Input Datastream
    :param st: Start Time the time window in UTC
    :param et: End Time of time window in UTC
    :param distance_threshold: Threshold on mean distance per encounter
    :param n_rows_threshold: No of rows per group/encounter
    :param time_threshold: Minimum Duration of time per encounter
    :param epsilon: A simple threshold
    :param count_threshold: Threshold on count
    :return: A Sparse representation of the Bluetooth Encounter
    """
    schema = StructType([
        StructField('timestamp', TimestampType()),
        StructField('localtime', TimestampType()),
        StructField('start_time', TimestampType()),
        StructField('end_time', TimestampType()),
        StructField('user', StringType()),
        StructField('version', IntegerType()),
        StructField('latitude', DoubleType()),
        StructField('distances', ArrayType(DoubleType())),
        StructField('longitude', DoubleType()),
        StructField('average_count', DoubleType()),
        StructField('major', LongType()),
        StructField('minor', LongType())
    ])

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def get_encounters(data):
        if data.shape[0] < n_rows_threshold:
            return pd.DataFrame([],
                                columns=[
                                    'user', 'major', 'minor', 'start_time',
                                    'end_time', 'version', 'distances',
                                    'timestamp', 'localtime', 'latitude',
                                    'longitude', 'average_count'
                                ])
        data = data.sort_values('time').reset_index(drop=True)
        data_filtered = data[data.distance_estimate < distance_threshold]
        #         data_filtered = data
        if data_filtered.shape[0] < n_rows_threshold or data_filtered[
                'time'].max() - data_filtered['time'].min() < time_threshold:
            return pd.DataFrame([],
                                columns=[
                                    'user', 'major', 'minor', 'start_time',
                                    'end_time', 'version', 'distances',
                                    'timestamp', 'localtime', 'latitude',
                                    'longitude', 'average_count'
                                ])
        else:
            data_all = []
            #             data = data_filtered
            k = 0
            i = data.shape[0]
            c = 'localtime'
            data_all.append([
                data_filtered['user'].iloc[k], data['major'].iloc[k],
                data['minor'].iloc[k], data[c].iloc[k], data[c].iloc[i - 1],
                data['version'].iloc[k],
                data['distance_estimate'].iloc[k:i].values,
                data['timestamp'].iloc[int(
                    (i + k) / 2)], data['localtime'].iloc[int((i + k) / 2)],
                np.mean(data['latitude'].values[k:i]),
                np.mean(data['longitude'].values[k:i]),
                np.mean(data['count'].values[k:i])
            ])
            return pd.DataFrame(data_all,
                                columns=[
                                    'user', 'major', 'minor', 'start_time',
                                    'end_time', 'version', 'distances',
                                    'timestamp', 'localtime', 'latitude',
                                    'longitude', 'average_count'
                                ])

    #     print(st,et)
    data = data.withColumn('time', F.col('timestamp').cast('double'))
    #     data.show(100,False)
    #     print('--'*40)
    if ltime:
        data_filtered = data.filter((data.localtime >= F.lit(st))
                                    & (data.localtime < F.lit(et)))
    else:
        data_filtered = data.filter((data.timestamp >= F.lit(st))
                                    & (data.timestamp < F.lit(et)))
    #     data.show(100,False)
    #     print(data_filtered.count(),'filtered data count')
    data_filtered = data_filtered.filter(data_filtered.longitude != 200)
    data_result = data_filtered.groupBy(['user', 'major', 'minor',
                                         'version']).apply(get_encounters)
    #     data_filtered.sort('timestamp').show(1000,False)
    print(data_result.count(), 'encounter count')
    #     data_result.show(5,False)
    return DataStream(data=data_result, metadata=Metadata())
Beispiel #33
0
    json_obj = json.loads(array_str)
    for item in json_obj:
        yield item["a"], item["b"]


if __name__ == "__main__":

    spark = SparkSession \
        .builder \
        .appName("compute_stats") \
        .getOrCreate()

    csv_schema = StructType([
        StructField('InvoiceNo', StringType(), nullable=False),
        StructField('Customer_ID', StringType(), nullable=False),
        StructField('Date', StringType(), nullable=False),
        StructField('Planet', StringType(), nullable=False),
        StructField('Purchase', StringType(), nullable=False)
    ])

    # json_schema = ArrayType(StructType( [ StructField('ItemNo', StringType(), nullable=False),
    #                                      StructField('UnitPrice', StringType(), nullable=False),
    #                                      StructField('Description', StringType(), nullable=False),
    #                                      StructField('Quantity', StringType(), nullable=False) ] ) )

    json_schema = StructType([
        StructField(
            "My_array",
            ArrayType(
                StructType([
                    StructField('ItemNo', StringType(), nullable=True),
Beispiel #34
0
CONTINENTS_FILE_PATH = '/opt/SparkDatasets/geography/continents.csv'
COUNTRIES_FILE_PATH  = '/opt/SparkDatasets/geography/countries.csv'
CITIES_FILE_PATH     = '/opt/SparkDatasets/geography/cities.csv'

CONTINENT_STRUCTURE = \
    [ ( 'continent_id'  , 'integer' )
    , ( 'continent_name', 'string'  ) ]
COUNTRY_STRUCTURE = \
    [ ( 'country_id'  , 'integer' )
    , ( 'continent_id', 'integer' )
    , ( 'country_name', 'string'  ) ]
CITY_STRUCTURE = \
    [ ( 'city_id'   , 'integer' )
    , ( 'country_id', 'integer' )
    , ( 'city_name' , 'string'  ) ]

CONTINENT_SCHEMA = StructType.fromJson( generate_schema_dict(CONTINENT_STRUCTURE) )
COUNTRY_SCHEMA   = StructType.fromJson( generate_schema_dict(COUNTRY_STRUCTURE)   )
CITY_SCHEMA      = StructType.fromJson( generate_schema_dict(CITY_STRUCTURE)      )

spark = SparkSession.builder.getOrCreate()

continents_df = generate_dataframe( spark, CONTINENT_SCHEMA, CONTINENTS_FILE_PATH )
countries_df  = generate_dataframe( spark, COUNTRY_SCHEMA  , COUNTRIES_FILE_PATH  )
cities_df     = generate_dataframe( spark, CITY_SCHEMA     , CITIES_FILE_PATH     )

continents_df.registerTempTable('continents')
countries_df.registerTempTable('countries')
cities_df.registerTempTable('cities')

print(continents_df.count())
print(countries_df.count())
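The helpers `generate_schema_dict` and `generate_dataframe` used above are not part of this excerpt. A minimal sketch of how they might be written, assuming the (name, type) pairs above, the JSON layout expected by `StructType.fromJson`, and headerless CSV files, is:

def generate_schema_dict(structure):
    # Hypothetical helper: turn a list of (column_name, simple_type_name) pairs
    # into the dict layout that StructType.fromJson expects.
    return {
        "type": "struct",
        "fields": [
            {"name": name, "type": dtype, "nullable": True, "metadata": {}}
            for name, dtype in structure
        ],
    }


def generate_dataframe(spark, schema, file_path):
    # Hypothetical helper: read a headerless CSV file with the given schema.
    return spark.read.csv(file_path, schema=schema, header=False)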
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

dataDictionary = [
        ('James',{'hair':'black','eye':'brown'}),
        ('Michael',{'hair':'brown','eye':None}),
        ('Robert',{'hair':'red','eye':'black'}),
        ('Washington',{'hair':'grey','eye':'grey'}),
        ('Jefferson',{'hair':'brown','eye':''})
        ]

# Using StructType schema
from pyspark.sql.types import StructField, StructType, StringType, MapType
schema = StructType([
    StructField('name', StringType(), True),
    StructField('properties', MapType(StringType(),StringType()),True)
])
df = spark.createDataFrame(data=dataDictionary, schema = schema)
df.printSchema()
df.show(truncate=False)

df3 = df.rdd.map(lambda x: (x.name, x.properties["hair"], x.properties["eye"])) \
    .toDF(["name", "hair", "eye"])
df3.printSchema()
df3.show()

df.withColumn("hair",df.properties.getItem("hair")) \
  .withColumn("eye",df.properties.getItem("eye")) \
  .drop("properties") \
  .show()
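One more standard way to flatten a MapType column (not shown in the snippet above) is to explode the map into key/value rows:

from pyspark.sql.functions import explode

# explode() on a map column produces one row per map entry,
# with the entry split into `key` and `value` columns.
df.select(df.name, explode(df.properties)).show(truncate=False)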
# MAGIC Complex outputs are helpful when you need to return multiple values from your UDF. The UDF design pattern involves returning a single column to drill down into, to pull out the desired data.

# COMMAND ----------

# MAGIC %md-sandbox
# MAGIC Start by determining the desired output.  This will look like a schema with a high level `StructType` with numerous `StructFields`.
# MAGIC 
# MAGIC <img alt="Side Note" title="Side Note" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.05em; transform:rotate(15deg)" src="https://files.training.databricks.com/static/images/icon-note.webp"/> For a refresher on this, see the lesson **Applying Schemas to JSON Data** in <a href="https://academy.databricks.com/collections/frontpage/products/etl-part-1-data-extraction/" target="_blank">ETL Part 1 course from Databricks Academy</a>.

# COMMAND ----------

from pyspark.sql.types import DoubleType, StructType, StructField

mathOperationsSchema = StructType([
  StructField("sum", DoubleType(), True), 
  StructField("multiplication", DoubleType(), True), 
  StructField("division", DoubleType(), True) 
])

# COMMAND ----------

# MAGIC %md
# MAGIC Create a function that returns a tuple of your desired output.

# COMMAND ----------

def manual_math(x, y):
  return (float(x + y), float(x * y), x / float(y))

manual_math(1, 2)
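The notebook presumably goes on to wrap `manual_math` as a UDF with the complex return type; the snippet below is only a sketch of that step, where `df`, `x`, and `y` are hypothetical names for an input DataFrame and two numeric columns:

from pyspark.sql.functions import udf, col

# Register the Python function as a UDF whose return type is the struct schema above.
manualMathUDF = udf(manual_math, mathOperationsSchema)

# Hypothetical usage: `df` is assumed to have numeric columns `x` and `y`.
# The UDF returns a single struct column that can be drilled into with dot notation.
results = (df
  .withColumn("math", manualMathUDF(col("x"), col("y")))
  .select("math.sum", "math.multiplication", "math.division"))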
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, to_json, col, unbase64, base64, split, expr
from pyspark.sql.types import StructField, StructType, StringType, BooleanType, ArrayType, DateType

# this is a manually created schema - before Spark 3.0.0, schema inference is not automatic

redisMessageSchema = StructType(
    [
        StructField("key", StringType()),
        StructField("value", StringType()),
        StructField("expiredType", StringType()),
        StructField("expiredValue", StringType()),
        StructField("existType", StringType()),
        StructField("ch", StringType()),
        StructField("incr", BooleanType()),
        StructField("zSetEntries", ArrayType(
            StructType([
                StructField("element", StringType()),
                StructField("score", StringType())
            ])
        ))
    ]
)

# this is a manually created schema - before Spark 3.0.0, schema inference is not automatic
# since we are not using the date or the amount in sql calculations, we are going
# to cast them as strings
# {"accountNumber":"703934969","location":"spain"}
customerLocationSchema = StructType([
    StructField("accountNumber", StringType()),
Beispiel #38
0
    def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
        to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
        data types will be used to coerce the data in Pandas to Arrow conversion.
        """
        from distutils.version import LooseVersion
        from pyspark.serializers import ArrowStreamPandasSerializer
        from pyspark.sql.types import from_arrow_type, to_arrow_type, TimestampType
        from pyspark.sql.utils import require_minimum_pandas_version, \
            require_minimum_pyarrow_version

        require_minimum_pandas_version()
        require_minimum_pyarrow_version()

        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
        import pyarrow as pa

        # Create the Spark schema from list of names passed in with Arrow types
        if isinstance(schema, (list, tuple)):
            if LooseVersion(pa.__version__) < LooseVersion("0.12.0"):
                temp_batch = pa.RecordBatch.from_pandas(pdf[0:100], preserve_index=False)
                arrow_schema = temp_batch.schema
            else:
                arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False)
            struct = StructType()
            for name, field in zip(schema, arrow_schema):
                struct.add(name, from_arrow_type(field.type), nullable=field.nullable)
            schema = struct

        # Determine arrow types to coerce data when creating batches
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
        else:
            # Any timestamps must be coerced to be compatible with Spark
            arrow_types = [to_arrow_type(TimestampType())
                           if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                           for t in pdf.dtypes]

        # Slice the DataFrame to be batched
        step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
        pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))

        # Create list of Arrow (columns, type) for serializer dump_stream
        arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]
                      for pdf_slice in pdf_slices]

        jsqlContext = self._wrapped._jsqlContext

        safecheck = self._wrapped._conf.arrowSafeTypeConversion()
        col_by_name = True  # col by name only applies to StructType columns, can't happen here
        ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name)

        def reader_func(temp_filename):
            return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

        def create_RDD_server():
            return self._jvm.ArrowRDDServer(jsqlContext)

        # Create Spark DataFrame from Arrow stream file, using one batch per partition
        jrdd = self._sc._serialize_to_jvm(arrow_data, ser, reader_func, create_RDD_server)
        jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df
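A hedged usage sketch for the Arrow path implemented above: with `spark.sql.execution.arrow.enabled` set to true (the key referenced in this code) and a compatible pyarrow installed, `createDataFrame` on a `pandas.DataFrame` is routed through `_create_from_pandas_with_arrow`:

import pandas as pd

# Assumes an existing SparkSession `spark` and an installed, compatible pyarrow.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

pdf = pd.DataFrame({"name": ["Alice", "Bob"], "age": [1, 2]})
df = spark.createDataFrame(pdf)  # takes the Arrow path when possible
df.show()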
Beispiel #39
0
def load_data():
    """Load the data from the different files and join the togheter
    """

    users_schema = StructType([
        StructField('timestamp', TimestampType(), nullable=False),
        StructField('userId', IntegerType(), False),
        StructField('nick', StringType(), False),
        StructField('twitter', StringType(), False),
        StructField('dob', DateType(), False),
        StructField('country', StringType(), False)
    ])

    users = spark.read.load('data/users.csv',
                            format='csv',
                            schema=users_schema,
                            header=True)

    users = users.withColumn(
        'age',
        F.datediff(F.to_date(F.lit('2016-06-16'), 'yyyy-MM-dd'), users.dob) /
        365)

    ages = users.select(['userId', 'age'])

    buy_click = spark.read.load('data/buy-clicks.csv',
                                format='csv',
                                inferSchema=True,
                                header=True,
                                timestampFormat="yyyy-MM-dd HH:mm:ss")

    revenue_by_session_by_user = buy_click.groupBy([
        'userId', 'userSessionId'
    ]).agg(F.sum('price').alias('revenue')).select(['userId', 'revenue'])
    revenues = revenue_by_session_by_user.groupBy('userId').agg(
        F.mean('revenue').alias('avg_buy'),
        F.min('revenue').alias('min_buy'),
        F.max('revenue').alias('max_buy'))

    game_click = spark.read.load('data/game-clicks.csv',
                                 format='csv',
                                 inferSchema=True,
                                 header=True,
                                 timestampFormat="yyyy-MM-dd HH:mm:ss")

    avg_ishit = game_click.groupBy(['userId'
                                    ]).agg(F.mean('isHit').alias('avg_isHit'))

    user_session = spark.read.load('data/user-session.csv',
                                   format='csv',
                                   inferSchema=True,
                                   header=True,
                                   timestampFormat="yyyy-MM-dd HH:mm:ss")

    team = spark.read.load('data/team.csv',
                           format='csv',
                           inferSchema=True,
                           header=True,
                           timestampFormat="yyyy-MM-dd HH:mm:ss")

    strengths = team.join(user_session, on='teamId',
                          how='inner').select(['userId',
                                               'strength']).dropDuplicates()

    data = ages.join(revenues, on='userId',
                     how='inner').join(avg_ishit, on='userId',
                                       how='inner').join(strengths,
                                                         on='userId',
                                                         how='left').na.fill(0)

    return data
Beispiel #40
0
    def createDataFrame(self, data, schema=None, samplingRatio=None):
        """
        Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.

        When ``schema`` is a list of column names, the type of each column
        will be inferred from ``data``.

        When ``schema`` is ``None``, it will try to infer the schema (column names and types)
        from ``data``, which should be an RDD of :class:`Row`,
        or :class:`namedtuple`, or :class:`dict`.

        When ``schema`` is :class:`DataType` or a datatype string, it must match the real data, or
        an exception will be thrown at runtime. If the given schema is not StructType, it will be
        wrapped into a StructType as its only field, and the field name will be "value". Each record
        will also be wrapped into a tuple, which can be converted to a row later.

        If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
        rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

        :param data: an RDD of any kind of SQL data representation (e.g. row, tuple, int, boolean,
            etc.), or :class:`list`, or :class:`pandas.DataFrame`.
        :param schema: a :class:`DataType` or a datatype string or a list of column names, default
            is None.  The data type string format equals to `DataType.simpleString`, except that
            top level struct type can omit the `struct<>` and atomic types use `typeName()` as
            their format, e.g. use `byte` instead of `tinyint` for ByteType. We can also use `int`
            as a short name for IntegerType.
        :param samplingRatio: the sample ratio of rows used for inferring
        :return: :class:`DataFrame`

        .. versionchanged:: 2.0
           The schema parameter can be a DataType or a datatype string after 2.0. If it's not a
           StructType, it will be wrapped into a StructType and each record will also be wrapped
           into a tuple.

        >>> l = [('Alice', 1)]
        >>> spark.createDataFrame(l).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> spark.createDataFrame(l, ['name', 'age']).collect()
        [Row(name=u'Alice', age=1)]

        >>> d = [{'name': 'Alice', 'age': 1}]
        >>> spark.createDataFrame(d).collect()
        [Row(age=1, name=u'Alice')]

        >>> rdd = sc.parallelize(l)
        >>> spark.createDataFrame(rdd).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> df = spark.createDataFrame(rdd, ['name', 'age'])
        >>> df.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql import Row
        >>> Person = Row('name', 'age')
        >>> person = rdd.map(lambda r: Person(*r))
        >>> df2 = spark.createDataFrame(person)
        >>> df2.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql.types import *
        >>> schema = StructType([
        ...    StructField("name", StringType(), True),
        ...    StructField("age", IntegerType(), True)])
        >>> df3 = spark.createDataFrame(rdd, schema)
        >>> df3.collect()
        [Row(name=u'Alice', age=1)]

        >>> spark.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
        [Row(name=u'Alice', age=1)]
        >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
        [Row(0=1, 1=2)]

        >>> spark.createDataFrame(rdd, "a: string, b: int").collect()
        [Row(a=u'Alice', b=1)]
        >>> rdd = rdd.map(lambda row: row[1])
        >>> spark.createDataFrame(rdd, "int").collect()
        [Row(value=1)]
        >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        Py4JJavaError: ...
        """
        if isinstance(data, DataFrame):
            raise TypeError("data is already a DataFrame")

        if isinstance(schema, basestring):
            schema = _parse_datatype_string(schema)

        try:
            import pandas
            has_pandas = True
        except Exception:
            has_pandas = False
        if has_pandas and isinstance(data, pandas.DataFrame):
            if schema is None:
                schema = [str(x) for x in data.columns]
            data = [r.tolist() for r in data.to_records(index=False)]

        if isinstance(schema, StructType):
            def prepare(obj):
                _verify_type(obj, schema)
                return obj
        elif isinstance(schema, DataType):
            datatype = schema

            def prepare(obj):
                _verify_type(obj, datatype)
                return (obj, )
            schema = StructType().add("value", datatype)
        else:
            prepare = lambda obj: obj

        if isinstance(data, RDD):
            rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
        else:
            rdd, schema = self._createFromLocal(map(prepare, data), schema)
        jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
        jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df
Beispiel #41
0
                      "bool": "boolean",
                      "boolean": "boolean",
                      "struct": "struct",
                      "array": "array",
                      "date": "date",
                      "datetime": "datetime",
                      "byte": "byte",
                      "short": "short",
                      "binary": "binary",
                      "null": "null",
                      "vector": "vector",
                      "timestamp": "datetime"
                      }
SPARK_DTYPES_DICT = {"string": StringType, "int": IntegerType, "float": FloatType,
                     "double": DoubleType, "boolean": BooleanType, "struct": StructType, "array": ArrayType,
                     "bigint": LongType, "date": DateType, "byte": ByteType, "short": ShortType,
                     "datetime": TimestampType, "binary": BinaryType, "null": NullType, "vector": VectorUDT
                     }
SPARK_DTYPES_DICT_OBJECTS = \
    {"string": StringType(), "int": IntegerType(), "float": FloatType(),
     "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()),
     "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(),
     "datetime": TimestampType(), "binary": BinaryType(), "null": NullType()
     }
PROFILER_COLUMN_TYPES = {"categorical", "numeric", "date", "null", "array", "binary"}
PYTHON_TO_PROFILER = {"string": "categorical", "boolean": "categorical", "int": "numeric", "decimal": "numeric",
                      "date": "date", "array": "array", "binaty": "binary", "null": "null"}
SPARK_DTYPES_TO_PROFILER = {"int": ["smallint", "tinyint", "bigint", "int"], "decimal": ["float", "double"],
                            "string": "string", "date": {"date", "timestamp"}, "boolean": "boolean", "binary": "binary",
                            "array": "array", "object": "object", "null": "null", "missing": "missing"}