Code Example #1
# MAGIC 
# MAGIC Complex outputs are helpful when you need to return multiple values from your UDF. The design pattern is to have the UDF return a single struct column, then drill down into it to pull out the desired fields.

# COMMAND ----------

# MAGIC %md-sandbox
# MAGIC Start by determining the desired output. This takes the form of a schema: a top-level `StructType` containing one `StructField` per returned value.
# MAGIC 
# MAGIC <img alt="Side Note" title="Side Note" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.05em; transform:rotate(15deg)" src="https://files.training.databricks.com/static/images/icon-note.webp"/> For a refresher on this, see the lesson **Applying Schemas to JSON Data** in ETL Part 1 module

# COMMAND ----------

from pyspark.sql.types import FloatType, StructType, StructField

mathOperationsSchema = StructType([
  StructField("sum", FloatType(), True), 
  StructField("multiplication", FloatType(), True), 
  StructField("division", FloatType(), True) 
])

# COMMAND ----------

# MAGIC %md
# MAGIC Create a function that returns a tuple matching the schema you just defined.

# COMMAND ----------

def manual_math(x, y):
  return (float(x + y), float(x * y), x / float(y))

manual_math(1, 2)
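
# COMMAND ----------

# MAGIC %md
# MAGIC The original snippet ends here. A hedged sketch of the next step — registering the function as a UDF with the schema above and drilling into the resulting struct column (the DataFrame `df` and its columns `x` and `y` are hypothetical):

# COMMAND ----------

from pyspark.sql.functions import udf, col

manualMathUDF = udf(manual_math, mathOperationsSchema)

# Hypothetical DataFrame with two numeric columns
df = spark.range(1, 4).withColumnRenamed("id", "x").withColumn("y", col("x") + 1)

# The UDF returns one struct column; select into it to pull out the individual fields
df.select(manualMathUDF("x", "y").alias("ops")) \
  .select("ops.sum", "ops.multiplication", "ops.division") \
  .show()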
Code Example #2
def test_select_subset_of_columns_as_entity_primary_keys(
    spark: SparkSession,
    composite_entity_schema: StructType,
    customer_feature_schema: StructType,
):
    entity_data = [
        (1001, 8001, datetime(year=2020, month=9, day=2)),
        (2001, 8002, datetime(year=2020, month=9, day=2)),
    ]
    entity_df = spark.createDataFrame(
        spark.sparkContext.parallelize(entity_data), composite_entity_schema)

    feature_table_data = [
        (
            1001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=2),
            100.0,
        ),
        (
            2001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            400.0,
        ),
    ]
    feature_table_df = spark.createDataFrame(
        spark.sparkContext.parallelize(feature_table_data),
        customer_feature_schema)
    feature_table = FeatureTable(
        name="transactions",
        features=[Field("daily_transactions", "double")],
        entities=[Field("customer_id", "int32")],
        max_age=86400,
    )
    feature_table_df = filter_feature_table_by_time_range(
        feature_table_df,
        feature_table,
        "event_timestamp",
        entity_df,
        "event_timestamp",
    )
    joined_df = as_of_join(
        entity_df,
        "event_timestamp",
        feature_table_df,
        feature_table,
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("transactions__daily_transactions", FloatType()),
    ])
    expected_joined_data = [
        (
            1001,
            8001,
            datetime(year=2020, month=9, day=2),
            100.0,
        ),
        (
            2001,
            8002,
            datetime(year=2020, month=9, day=2),
            400.0,
        ),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data),
        expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
Code Example #3
def test_implicit_type_conversion(spark: SparkSession, ):
    test_data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")
    entity_source = {
        "file": {
            "format": {
                "json_class": "CSVFormat"
            },
            "path":
            f"file://{path.join(test_data_dir,  'single_customer.csv')}",
            "event_timestamp_column": "event_timestamp",
            "options": {
                "inferSchema": "true",
                "header": "true"
            },
        }
    }
    transaction_source = {
        "file": {
            "format": {
                "json_class": "CSVFormat"
            },
            "path": f"file://{path.join(test_data_dir,  'transactions.csv')}",
            "event_timestamp_column": "event_timestamp",
            "created_timestamp_column": "created_timestamp",
            "options": {
                "inferSchema": "true",
                "header": "true"
            },
        }
    }
    transaction_table = {
        "name": "transactions",
        "entities": [{
            "name": "customer_id",
            "type": "int32"
        }],
        "features": [{
            "name": "daily_transactions",
            "type": "float"
        }],
        "max_age": 86400,
    }

    joined_df = retrieve_historical_features(
        spark,
        entity_source,
        [transaction_source],
        [transaction_table],
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("transactions__daily_transactions", FloatType()),
    ])

    expected_joined_data = [
        (
            1001,
            datetime(year=2020, month=9, day=2),
            100.0,
        ),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data),
        expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
Code Example #4
for col in gdelt.columns[1:]:
    gdelt = gdelt.withColumn(col, F.lower(F.col(col)))

print('Getting only GDELT FRA Event')
gdelt = gdelt.where('Actor1Code RLIKE "^fra.*"')
gdelt = gdelt.where('Actor2Code RLIKE "^fra.*"')

# Taking top 10 actors here

print('Normalizing Date')
date_normalizer = partial(utils.normalize_date,
                          starting_date='20170101',
                          ending_date='20190101',
                          date_format='%Y%m%d')
pudf_normalizer = F.pandas_udf(lambda ds: ds.apply(date_normalizer),
                               FloatType(), F.PandasUDFType.SCALAR)
# date_normalizer = F.udf(lambda d: normalize_date(d, '20170101', '20190101', '%Y%m%d'), FloatType())
gdelt = gdelt.withColumn('date',
                         pudf_normalizer(gdelt.SQLDATE)).drop('SQLDATE')

print('GDELT to dummies')
dummies_var = []  # accumulator for the dummy-column expressions (assumed; the initialization is not shown in the original snippet)
for col in cols_to_take[1:]:
    categories = gdelt.select(col).distinct().rdd.flatMap(
        lambda x: x).collect()
    dummies_var.append([
        F.when(F.col(col) == category,
               1).otherwise(0).alias(col.lower() + '_' + category)
        for category in categories
    ])
gdelt = gdelt.select('date', *list(chain.from_iterable(dummies_var)))
Code Example #5
import pyspark
from pyspark import SparkContext
from pyspark.sql.types import FloatType


spark = pyspark.sql.SparkSession.builder \
    .master("local") \
    .appName("movies") \
    .getOrCreate()

df = spark.read.csv(path="./comparison.csv", header=True)

df = df.withColumn("similarity", df["similarity"].cast(FloatType()))\
    .orderBy("similarity", ascending=[0]) \
    .collect()

sc = SparkContext.getOrCreate()
rdd = sc.parallelize(df)


def seq_op(acc, row):
    similarity = row["similarity"]

    if row["first"] in acc:
        acc[row["first"]].append((row["second"], similarity))
    else:
        acc[row["first"]] = [(row["second"], similarity)]

    if row["second"] in acc:
        acc[row["second"]].append((row["first"], similarity))
    else:
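

# (Hedged sketch, not part of the original snippet:) the companion combine function
# and the aggregate call that seq_op appears to be written for.
def comb_op(acc1, acc2):
    # merge two partial {movie: [(other_movie, similarity), ...]} dictionaries
    for key, pairs in acc2.items():
        acc1.setdefault(key, []).extend(pairs)
    return acc1


similarities_by_movie = rdd.aggregate({}, seq_op, comb_op)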
Code Example #6
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

spark = (
    SparkSession.builder.appName("TotalSpentByCustomer")
    .master("local[*]")
    .getOrCreate()
)

# Create schema when reading customer-orders
customerOrderSchema = StructType(
    [
        StructField("cust_id", IntegerType(), True),
        StructField("item_id", IntegerType(), True),
        StructField("amount_spent", FloatType(), True),
    ]
)

# Load up the data into spark dataset
customersDF = spark.read.schema(customerOrderSchema).csv("customer-orders.csv")

totalByCustomer = customersDF.groupBy("cust_id").agg(
    func.round(func.sum("amount_spent"), 2).alias("total_spent")
)

totalByCustomerSorted = totalByCustomer.sort("total_spent")

totalByCustomerSorted.show(totalByCustomerSorted.count())

spark.stop()
Code Example #7
    # put the whole pipeline together with cross-validation
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=3)

    # Split the data into train and test
    splits = joinedDF.randomSplit([0.8, 0.2], 1234)
    train = splits[0]
    test = splits[1]

    # Run cross-validation, and choose the best set of parameters.
    cvModel = crossval.fit(train)

    prediction = cvModel.transform(test)

    evaluator.evaluate(prediction)

    # need a function to extract the positive-class probability (second element of the probability vector)
    secondelement = udf(lambda v: float(v[1]), FloatType())

    selected = prediction.withColumn(
        "prob_of_cancel", secondelement(prediction["probability"])).select(
            "ID", "prob_of_cancel", "prediction", 'label')

    selected.coalesce(1).write.csv(
        path=
        "s3n://leesa.east2.training/predictions/flight_cancellation_predictions.csv",
        mode='overwrite',
        header=True)
Code Example #8
    "float": FloatType,
    "double": DoubleType,
    "boolean": BooleanType,
    "struct": StructType,
    "array": ArrayType,
    "bigint": LongType,
    "date": DateType,
    "byte": ByteType,
    "short": ShortType,
    "datetime": TimestampType,
    "binary": BinaryType,
    "null": NullType
}

SPARK_DTYPES_DICT_OBJECTS = \
    {"string": StringType(), "int": IntegerType(), "float": FloatType(),
     "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()),
     "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(),
     "datetime": TimestampType(), "binary": BinaryType(), "null": NullType()
     }

# Profiler
PROFILER_TYPES = {
    "int", "float", "string", "bool", "date", "null", "array", "double"
}
PROFILER_LEGEND_TYPES = {
    "string": "ABC",
    "int": "#",
    "integer": "#",
    "float": "##.#",
    "double": "##.#",
Code Example #9
spark = SparkSession.builder.master("local").appName("Return").getOrCreate()
df = spark.read.csv(
    r'D:\LiBao\data_20200904\NSM_GlobalSelect_Nasdaq\NSM-2016-01-05-TAS-Data-1-of-1-a1.csv',
    header=True)
df_global_market = spark.read.csv(
    r'D:\LiBao\data_20200904\NMS-2\NMS-2016-01-05-TAS-Data-1-of-1-a1.csv',
    header=True)
df_capital_market = spark.read.csv(
    r'D:\LiBao\data_20200904\NAQ\NAQ-2016-01-05-TAS-Data-1-of-1-a1.csv',
    header=True)
df = df.union(df_global_market).union(df_capital_market)
# rename the first column and sequence number
df = df.withColumnRenamed('#RIC', 'Ticker')
df = df.withColumnRenamed('Seq. No.', 'SeqNo')
# change the type of the data
df = df.withColumn('Price', df['Price'].cast(FloatType()))
df = df.withColumn('Volume', df['Volume'].cast(FloatType()))
df = df.withColumn('SeqNo', df['SeqNo'].cast(IntegerType()))
# select the trade entry
trade_df = df.where("Type=='Trade'")
# convert trading time into hours and minutes
trade_df = trade_df.withColumn('Hour', hour(trade_df['Exch Time']))
trade_df = trade_df.withColumn('Minute', minute(trade_df['Exch Time']))
# subtract 870 from the minutes-since-midnight value to get the minute indicator
trade_df = trade_df.withColumn(
    'MinuteIndicator', trade_df['Hour'] * 60 + trade_df['Minute'] - 870)
# keep data during trading hours with trading volume greater than 0, and drop rows with missing trade prices
trade_df = trade_df.\
    filter((trade_df['MinuteIndicator'] >= 0) & (trade_df['MinuteIndicator'] <= 390) & (trade_df['Volume']>0)).\
    dropna(subset=('Price'))
# delete rows whose ticker contains '![/'; these rows are minute-by-minute summaries rather than real trades
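# (Hedged sketch; the original snippet is truncated here.) One way to drop those summary rows:
trade_df = trade_df.filter(~trade_df['Ticker'].contains('![/'))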
Code Example #10
                # dropoff lat
                float(chunk_row[4]),
                # dropoff long
                float(chunk_row[5]))
        except ValueError:
            pass


# parse the datasets into row tuples
yellow_rows = yellow.mapPartitions(parse_yellow)
citi_rows = citi.mapPartitions(parse_citi)

# define dataframe schemas
yellow_schema = StructType([
    StructField('dropoff_time', TimestampType(), True),
    StructField('dropoff_lat', FloatType(), True),
    StructField('dropoff_lng', FloatType(), True)
])

citi_schema = StructType([
    StructField('station_id', IntegerType(), True),
    StructField('ride_id', StringType(), True),
    StructField('start_time', TimestampType(), True)
])

# instantiate the dataframes
yellow_df = sqlContext.createDataFrame(yellow_rows, yellow_schema)
citi_df = sqlContext.createDataFrame(citi_rows, citi_schema)


# filtering function to check whether a taxi dropoff location is within 0.25 miles of a citibike station
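
# (Hedged sketch; the original filtering function is not shown.) A haversine-based
# check that could back such a filter — the 0.25-mile threshold comes from the comment above:
from math import radians, sin, cos, asin, sqrt


def within_quarter_mile(lat1, lng1, lat2, lng2):
    # great-circle distance in miles between two (lat, lng) points
    lat1, lng1, lat2, lng2 = map(radians, (lat1, lng1, lat2, lng2))
    a = sin((lat2 - lat1) / 2) ** 2 + cos(lat1) * cos(lat2) * sin((lng2 - lng1) / 2) ** 2
    return 3956.0 * 2 * asin(sqrt(a)) <= 0.25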
Code Example #11
# train the model
model = ALS.train(
    dfRates.rdd, 20,
    20)  # you could tune these numbers, but these are reasonable choices
print("trained ...")

# use this model to predict what the user would rate accommodations that she has not rated
allPredictions = None
for USER_ID in range(0, 100):
    dfUserRatings = dfRates.filter(
        dfRates.userId == USER_ID).rdd.map(lambda r: r.accoId).collect()
    rddPotential = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
    pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))
    predictions = model.predictAll(pairsPotential).map(
        lambda p: (str(p[0]), str(p[1]), float(p[2])))
    predictions = predictions.takeOrdered(5, key=lambda x: -x[2])  # top 5
    print("predicted for user={0}".format(USER_ID))
    if allPredictions is None:
        allPredictions = predictions
    else:
        allPredictions.extend(predictions)

# write them
schema = StructType([
    StructField("userId", StringType(), True),
    StructField("accoId", StringType(), True),
    StructField("prediction", FloatType(), True)
])
dfToSave = sqlContext.createDataFrame(allPredictions, schema)
dfToSave.write.jdbc(url=jdbcUrl, table='Recommendation', mode='overwrite')
Code Example #12
    def test_as_spark_type(self):
        type_mapper = {
            # binary
            np.character: BinaryType(),
            np.bytes_: BinaryType(),
            np.string_: BinaryType(),
            bytes: BinaryType(),
            # integer
            np.int8: ByteType(),
            np.byte: ByteType(),
            np.int16: ShortType(),
            np.int32: IntegerType(),
            np.int64: LongType(),
            np.int: LongType(),
            int: LongType(),
            # floating
            np.float32: FloatType(),
            np.float: DoubleType(),
            np.float64: DoubleType(),
            float: DoubleType(),
            # string
            np.str: StringType(),
            np.unicode_: StringType(),
            str: StringType(),
            # bool
            np.bool: BooleanType(),
            bool: BooleanType(),
            # datetime
            np.datetime64: TimestampType(),
            datetime.datetime: TimestampType(),
            # DateType
            datetime.date: DateType(),
            # DecimalType
            decimal.Decimal: DecimalType(38, 18),
            # ArrayType
            np.ndarray: ArrayType(StringType()),
            List[bytes]: ArrayType(BinaryType()),
            List[np.character]: ArrayType(BinaryType()),
            List[np.bytes_]: ArrayType(BinaryType()),
            List[np.string_]: ArrayType(BinaryType()),
            List[bool]: ArrayType(BooleanType()),
            List[np.bool]: ArrayType(BooleanType()),
            List[datetime.date]: ArrayType(DateType()),
            List[np.int8]: ArrayType(ByteType()),
            List[np.byte]: ArrayType(ByteType()),
            List[decimal.Decimal]: ArrayType(DecimalType(38, 18)),
            List[float]: ArrayType(DoubleType()),
            List[np.float]: ArrayType(DoubleType()),
            List[np.float64]: ArrayType(DoubleType()),
            List[np.float32]: ArrayType(FloatType()),
            List[np.int32]: ArrayType(IntegerType()),
            List[int]: ArrayType(LongType()),
            List[np.int]: ArrayType(LongType()),
            List[np.int64]: ArrayType(LongType()),
            List[np.int16]: ArrayType(ShortType()),
            List[str]: ArrayType(StringType()),
            List[np.unicode_]: ArrayType(StringType()),
            List[datetime.datetime]: ArrayType(TimestampType()),
            List[np.datetime64]: ArrayType(TimestampType()),
        }

        for numpy_or_python_type, spark_type in type_mapper.items():
            self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)

        with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
            as_spark_type(np.dtype("uint64"))
Code Example #13
def classify_spark(training, testing, target_domains, target_domains_dict):

    # Adjust
    target_domains_dict["_other"] = len(target_domains)
    target_domains.append("_other")

    feature_list = [c for c in training.columns if c.startswith("_")]

    assembler = VectorAssembler(inputCols=feature_list,
                                outputCol="features",
                                handleInvalid="skip")

    str2idx = udf(lambda s: float(target_domains_dict[s]), FloatType())
    idx2str = udf(lambda f: target_domains[int(f)], StringType())

    training = assembler.transform(training)
    testing = assembler.transform(testing)
    training = training.withColumn("label_idx", str2idx("label"))
    testing = testing.withColumn("label_idx", str2idx("label"))

    bins = np.zeros(len(target_domains))
    freqs = { row["label_idx"]: row["count"] for row in training.select("label_idx")\
                                                           .groupBy("label_idx").count().collect() }
    for i in freqs:
        bins[int(i)] = freqs[i]
    class_weights = np.sum(bins) / (len(bins) * bins)
    idx2cw = udf(lambda f: float(class_weights[int(f)]), FloatType())
    training = training.withColumn("weigth", idx2cw("label_idx"))

    #model = pyspark.ml.classification.DecisionTreeClassifier(labelCol="label_idx",
    #                                    featuresCol="features", predictionCol="prediction_idx")
    model = pyspark.ml.classification.LogisticRegression(
        labelCol="label_idx",
        weightCol="weigth",
        featuresCol="features",
        predictionCol="prediction_idx")

    model_fit = model.fit(training)

    training_predictions = model_fit.transform(training)
    testing_predictions = model_fit.transform(testing)

    training_predictions = training_predictions.withColumn(
        "prediction", idx2str("prediction_idx"))
    testing_predictions = testing_predictions.withColumn(
        "prediction", idx2str("prediction_idx"))

    labels_training = training_predictions.select("label").toPandas().values
    labels_test = testing_predictions.select("label").toPandas().values

    pred_training = training_predictions.select("prediction").toPandas().values
    pred_test = testing_predictions.select("prediction").toPandas().values

    training_report = classification_report(labels_training,
                                            pred_training,
                                            output_dict=True)
    testing_report = classification_report(labels_test,
                                           pred_test,
                                           output_dict=True)

    return model_fit, training_report, testing_report
Code Example #14
df = sc.read.parquet('../data/userdata1.parquet')
print(df)

# Handle duplicate values
print(df.drop_duplicates().count())

# Handling missing data
print(df.fillna(0).show())
print(df.dropna().show())

# fill missing values in specific columns
print(df.fillna({'cc':'6767119071901597' }).show())

# Changing data type in the DF
df1 = df.withColumn("salary",  df["salary"].cast(FloatType()))
print(df1.show())
print(df1.printSchema())

# replace null values with mean salary
print(F.avg(df1.salary))
# df1 = df1.fillna()
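# (Hedged sketch, not in the original:) compute the mean salary and fill nulls with it
mean_salary = df1.select(F.avg('salary')).first()[0]
df1 = df1.fillna({'salary': mean_salary})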

# replace empty strings in the cc column with '0' and cast the column to integer
df = df.withColumn("cc", F.when(df.cc != '', df.cc).otherwise('0'))
df = df.withColumn("cc", df.cc.cast(IntegerType()))
print(df.printSchema())

# replace empty String literal with something
df = df.withColumn("birthdate",F.when(df.birthdate != '',df.birthdate).otherwise("05/05/2020"))
df = df.withColumn("birthdate", F.to_date(df.birthdate,'mm/dd/yyyy'))
Code Example #15
File: test_spark.py Project: mymistakes/horovod
    def test_prepare_data_compress_sparse(self):
        util.clear_training_cache()

        expected_metadata = \
            {
                'float': {
                    'spark_data_type': FloatType,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.NOCHANGE,
                    'max_size': 1,
                    'shape': 1
                },
                'dense': {
                    'spark_data_type': DenseVector,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.ARRAY,
                    'max_size': 2,
                    'shape': 2
                },
                'sparse': {
                    'spark_data_type': SparseVector,
                    'is_sparse_vector_only': True,
                    'intermediate_format': constants.CUSTOM_SPARSE,
                    'max_size': 1,
                    'shape': 2
                },
                'mixed': {
                    'spark_data_type': DenseVector,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.ARRAY,
                    'max_size': 2,
                    'shape': 2
                },
            }

        with mock.patch('horovod.spark.common.util._get_metadata',
                        side_effect=util._get_metadata) as mock_get_metadata:
            with spark_session('test_prepare_data') as spark:
                data = [[
                    0.0,
                    DenseVector([1.0, 1.0]),
                    SparseVector(2, {1: 1.0}),
                    DenseVector([1.0, 1.0])
                ], [
                    1.0,
                    DenseVector([1.0, 1.0]),
                    SparseVector(2, {1: 1.0}),
                    SparseVector(2, {1: 1.0})
                ]]

                schema = StructType([
                    StructField('float', FloatType()),
                    StructField('dense', VectorUDT()),
                    StructField('sparse', VectorUDT()),
                    StructField('mixed', VectorUDT())
                ])

                df = create_test_data_from_schema(spark, data, schema)

                with local_store() as store:
                    with util.prepare_data(num_processes=2,
                                           store=store,
                                           df=df,
                                           feature_columns=['dense', 'sparse', 'mixed'],
                                           label_columns=['float'],
                                           compress_sparse=True) as dataset_idx:
                        mock_get_metadata.assert_called()
                        assert dataset_idx == 0

                        train_rows, val_rows, metadata, avg_row_size = util.get_dataset_properties(dataset_idx)
                        self.assertDictEqual(metadata, expected_metadata)
Code Example #16
"""

"""

from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, FloatType, IntegerType

spark = SparkSession.builder.appName('CustomerAmount').getOrCreate()

schema = StructType([
    StructField('user_id', IntegerType(), True),
    StructField('order_id', IntegerType(), True),
    StructField('value', FloatType(), True)
])

df = spark.read.csv('datasets/customer-orders.csv', schema=schema)
df.describe().show()
df.show(5)
# +-------+--------+-----+
# |user_id|order_id|value|
# +-------+--------+-----+
# |     44|    8602|37.19|
# |     35|    5368|65.89|
# |      2|    3391|40.64|
# |     47|    6694|14.98|
# |     29|     680|13.08|
# +-------+--------+-----+

df_users = df.groupby('user_id')\
    .agg(F.sum('value').alias('total'))\
Code Example #17
 
X21_test = scalerX21.transform(X21)
pred21 = model_d21.predict(X21_test)
prediction21 = scalery21.inverse_transform(pred21)

df21['prediction'] = prediction21


df_new = pd.concat([df8,df9,df10,df11,df12,df13,df14,df15,df16,df17,df18,df19,df20,df21])

df_new['ml_score'] = (df_new['prediction']/30)*(31 - df_new['day_num']) + df_new['usage_till_date']

df_new['ma_score'] = df_new['first7day_avg']*(31 - df_new['day_num']) + df_new['usage_till_date']


p_schema = StructType([
    StructField('concat_agmnt_no', StringType(), True),
    StructField('billing_start_date', StringType(), True),
    StructField('billing_end_date', StringType(), True),
    StructField('day_num', IntegerType(), True),
    StructField('usage_till_date', FloatType(), True),
    StructField('first7day_avg', FloatType(), True),
    StructField('last_1bc_usg', FloatType(), True),
    StructField('ml_score', FloatType(), True),
    StructField('ma_score', FloatType(), True)
])
 

df1 = df_new[['concat_agmnt_no',
'billing_start_date',
'billing_end_date',
'day_num',
'usage_till_date',
'first7day_avg',
'last_1bc_usg',
'ml_score',
'ma_score']]
 
df2 = sqlContext.createDataFrame(df1, p_schema)
 
df2.registerTempTable("test_temp_df") 
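# Note (not in the original): registerTempTable has been deprecated since Spark 2.0;
# the equivalent modern call would be df2.createOrReplaceTempView("test_temp_df").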
Code Example #18
File: oss_data_fetcher.py Project: t-triobox/ReAgent
def select_relevant_columns(df,
                            discrete_action: bool = True,
                            include_possible_actions: bool = True):
    """Select all the relevant columns and perform type conversions."""
    if not discrete_action and include_possible_actions:
        raise NotImplementedError(
            "currently we don't support include_possible_actions")

    select_col_list = [
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("reward").cast(FloatType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("state_features").cast(ArrayType(FloatType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("state_features_presence").cast(ArrayType(BooleanType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("next_state_features").cast(ArrayType(FloatType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("next_state_features_presence").cast(ArrayType(BooleanType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("not_terminal").cast(BooleanType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("action_probability").cast(FloatType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("mdp_id").cast(LongType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("sequence_number").cast(LongType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("step").cast(LongType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("time_diff").cast(LongType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("metrics").cast(ArrayType(FloatType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("metrics_presence").cast(ArrayType(BooleanType())),
    ]

    if discrete_action:
        select_col_list += [
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("action").cast(LongType()),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("next_action").cast(LongType()),
        ]
    else:
        select_col_list += [
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("action").cast(ArrayType(FloatType())),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("next_action").cast(ArrayType(FloatType())),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("action_presence").cast(ArrayType(BooleanType())),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("next_action_presence").cast(ArrayType(BooleanType())),
        ]

    if include_possible_actions:
        select_col_list += [
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("possible_actions_mask").cast(ArrayType(LongType())),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("possible_next_actions_mask").cast(ArrayType(LongType())),
        ]

    return df.select(*select_col_list)
Code Example #19
    "int": "int",
    "float": "float",
    "double": "double",
    "bool": "boolean",
    "boolean": "boolean",
    "struct": "struct",
    "array": "array",
    "date": "date",
    "long": "long"
    # "vector": "vector"
}

SPARK_DTYPES_DICT_OBJECTS = {
    "string": StringType(),
    "int": IntegerType(),
    "float": FloatType(),
    "double": DoubleType(),
    "boolean": BooleanType(),
    "struct": StructType(),
    "array": ArrayType,
    "date": DateType()
}

SPARK_DTYPES_DICT = {
    "string": StringType,
    "int": IntegerType,
    "float": FloatType,
    "double": DoubleType,
    "boolean": BooleanType,
    "struct": StructType,
    "array": ArrayType,
Code Example #20
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, IntegerType, FloatType, StructType, Row, StructField


def prediction(height, width, nChannels, data):
    array = np.ndarray(shape=(height, width, nChannels),
                       dtype=np.uint8,
                       buffer=data,
                       strides=(width * nChannels, nChannels, 1))
    out_scores, out_boxes, out_classes = predict_util(array)
    return Row('classes', 'scores')(out_classes.tolist(), out_scores.tolist())


schema = StructType([
    StructField("classes", ArrayType(IntegerType()), False),
    StructField("scores", ArrayType(FloatType()), False)
])

prediction_udf = udf(prediction, schema)

# COMMAND ----------

# MAGIC %md
# MAGIC # Assert prediction `classes` on `test.jpg`

# COMMAND ----------

display(
    images_df.where(
        "image.origin='dbfs:/mnt/roy/object-detection/images/test.jpg'").
    withColumn(
Code Example #21
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

spark = SparkSession.builder.appName("MinTemperatures").getOrCreate()

schema = StructType([ \
                     StructField("stationID", StringType(), True), \
                     StructField("date", IntegerType(), True), \
                     StructField("measure_type", StringType(), True), \
                     StructField("temperature", FloatType(), True) \
                    ])

# Read the file as a dataframe
df = spark.read.schema(schema).csv("data/1800.csv")
df.printSchema()

# Filter out all but TMIN entries
minTemps = df.filter(df.measure_type == "TMIN")

# Select only stationID and temperature
stationTemps = minTemps.select("stationID", "temperature")

# Aggregate to find minimum temperature for every station
minTempsByStation = stationTemps.groupBy("stationID").min("temperature")
minTempsByStation.show()

# Convert temperature to fahrenheit and sort the dataset
minTempsByStationF = minTempsByStation.withColumn("temperature",
                                                  func.round(func.col("min(temperature)") * 0.1 * (9.0 / 5.0) + 32.0, 2))\
                                                  .select("stationID", "temperature").sort("temperature")
Code Example #22
    new_test_data = dict()
    for company in test_data:
        company_data = test_data[company]
        test_df = vector_assembler.transform(company_data)
        test_df = scalar.transform(test_df)

        test_df = test_df.select(['scaledFeatures', TARGET])
        new_test_data[company] = test_df

    return train_df, new_test_data


error_pct_udf = udf(
    lambda arr: (float(abs(arr[0] - arr[1])) * float(100)) / float(arr[0]),
    FloatType())


def train_and_pred(train, test_data, tech_only=False):
    # train the linear regression model
    lr_model = LinearRegression(featuresCol='scaledFeatures',
                                labelCol=TARGET,
                                maxIter=300,
                                regParam=1,
                                elasticNetParam=1).fit(train)
    print('Coefficients: {}'.format(str(lr_model.coefficients)))
    print('Intercept: {}'.format(str(lr_model.intercept)))

    # summarize the training
    trainingSummary = lr_model.summary
    print('Training r2 = {}'.format(float(trainingSummary.r2)))
Code Example #23
_SMALLINT_TYPE = __short_type.simpleString()

__int_type = IntegerType()
_INT_TYPE = __int_type.simpleString()
assert _INT_TYPE == int.__name__
assert __int_type.typeName().startswith(_INT_TYPE)

__long_type = LongType()
_BIGINT_TYPE = __long_type.simpleString()
assert __long_type.typeName() == 'long'

_INT_TYPES = \
    [_TINYINT_TYPE, _SMALLINT_TYPE,
     _INT_TYPE, _BIGINT_TYPE]

__float_type = FloatType()
_FLOAT_TYPE = __float_type.simpleString()
assert _FLOAT_TYPE == __float_type.typeName()

__double_type = DoubleType()
_DOUBLE_TYPE = __double_type.simpleString()
assert _DOUBLE_TYPE == __double_type.typeName()

_FLOAT_TYPES = [_FLOAT_TYPE, _DOUBLE_TYPE]

_NUM_TYPES = _INT_TYPES + _FLOAT_TYPES

_POSSIBLE_CAT_TYPES = [_BOOL_TYPE, _STR_TYPE] + _NUM_TYPES
_POSSIBLE_FEATURE_TYPES = _POSSIBLE_CAT_TYPES + _NUM_TYPES

__date_type = DateType()
Code Example #24
def align_diff_frames(resolve_func,
                      this,
                      that,
                      fillna=True,
                      how="full",
                      preserve_order_column=False):
    """
    This method aligns two different DataFrames with a given `func`. Columns are resolved and
    handled within the given `func`.
    To use this, `compute.ops_on_diff_frames` should be True, for now.

    :param resolve_func: Takes aligned (joined) DataFrame, the column of the current DataFrame, and
        the column of another DataFrame. It returns an iterable that produces Series.

        >>> from databricks.koalas.config import set_option, reset_option
        >>>
        >>> set_option("compute.ops_on_diff_frames", True)
        >>>
        >>> kdf1 = ks.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]})
        >>> kdf2 = ks.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]})
        >>>
        >>> def func(kdf, this_column_labels, that_column_labels):
        ...    kdf  # conceptually this is A + B.
        ...
        ...    # Within this function, Series from A or B can be performed against `kdf`.
        ...    this_label = this_column_labels[0]  # this is ('a',) from kdf1.
        ...    that_label = that_column_labels[0]  # this is ('a',) from kdf2.
        ...    new_series = (kdf[this_label] - kdf[that_label]).rename(str(this_label))
        ...
        ...    # This new series will be placed in new DataFrame.
        ...    yield (new_series, this_label)
        >>>
        >>>
        >>> align_diff_frames(func, kdf1, kdf2).sort_index()
           a
        0  0
        1  0
        2  0
        3  0
        4  0
        5  0
        6  0
        7  0
        8  0
        >>> reset_option("compute.ops_on_diff_frames")

    :param this: a DataFrame to align
    :param that: another DataFrame to align
    :param fillna: If True, it fills missing values in non-common columns in both `this` and `that`.
        Otherwise, it returns as are.
    :param how: join way. In addition, it affects how `resolve_func` resolves the column conflict.
        - full: `resolve_func` should resolve only common columns from 'this' and 'that' DataFrames.
            For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` and
            'that_columns' in this function are B, C and B, C.
        - left: `resolve_func` should resolve columns including that columns.
            For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` is
            B, C but `that_columns` are B, C, D.
        - inner: Same as 'full' mode; however, internally performs inner join instead.
    :return: Aligned DataFrame
    """
    assert how == "full" or how == "left" or how == "inner"

    this_column_labels = this._internal.column_labels
    that_column_labels = that._internal.column_labels
    common_column_labels = set(this_column_labels).intersection(
        that_column_labels)

    # 1. Perform the join given two dataframes.
    combined = combine_frames(this,
                              that,
                              how=how,
                              preserve_order_column=preserve_order_column)

    # 2. Apply the given function to transform the columns in a batch and keep the new columns.
    combined_column_labels = combined._internal.column_labels

    that_columns_to_apply = []
    this_columns_to_apply = []
    additional_that_columns = []
    columns_to_keep = []
    column_labels_to_keep = []

    for combined_label in combined_column_labels:
        for common_label in common_column_labels:
            if combined_label == tuple(["this", *common_label]):
                this_columns_to_apply.append(combined_label)
                break
            elif combined_label == tuple(["that", *common_label]):
                that_columns_to_apply.append(combined_label)
                break
        else:
            if how == "left" and combined_label in [
                    tuple(["that", *label]) for label in that_column_labels
            ]:
                # In this case, we will drop `that_columns` in `columns_to_keep` but passes
                # it later to `func`. `func` should resolve it.
                # Note that adding this into a separate list (`additional_that_columns`)
                # is intentional so that `this_columns` and `that_columns` can be paired.
                additional_that_columns.append(combined_label)
            elif fillna:
                columns_to_keep.append(
                    F.lit(None).cast(FloatType()).alias(str(combined_label)))
                column_labels_to_keep.append(combined_label)
            else:
                columns_to_keep.append(
                    combined._internal.spark_column_for(combined_label))
                column_labels_to_keep.append(combined_label)

    that_columns_to_apply += additional_that_columns

    # Should extract columns to apply and do it in a batch in case
    # it adds new columns for example.
    if len(this_columns_to_apply) > 0 or len(that_columns_to_apply) > 0:
        kser_set, column_labels_applied = zip(*resolve_func(
            combined, this_columns_to_apply, that_columns_to_apply))
        columns_applied = [c.spark.column for c in kser_set]
        column_labels_applied = list(column_labels_applied)
    else:
        columns_applied = []
        column_labels_applied = []

    applied = combined[columns_applied + columns_to_keep]
    applied.columns = pd.MultiIndex.from_tuples(column_labels_applied +
                                                column_labels_to_keep,
                                                names=combined.columns.names)

    # 3. Restore the names back and deduplicate columns.
    this_labels = OrderedDict()
    # Add columns in an order of its original frame.
    for this_label in this_column_labels:
        for new_label in applied._internal.column_labels:
            if new_label[1:] not in this_labels and this_label == new_label[1:]:
                this_labels[new_label[1:]] = new_label

    # After that, we will add the rest columns.
    other_labels = OrderedDict()
    for new_label in applied._internal.column_labels:
        if new_label[1:] not in this_labels:
            other_labels[new_label[1:]] = new_label

    kdf = applied[list(this_labels.values()) + list(other_labels.values())]
    kdf.columns = kdf.columns.droplevel()
    return kdf
Code Example #25
def test_join_with_composite_entity(
    spark: SparkSession,
    composite_entity_schema: StructType,
    rating_feature_schema: StructType,
):
    entity_data = [
        (1001, 8001, datetime(year=2020, month=9, day=1)),
        (1001, 8002, datetime(year=2020, month=9, day=3)),
        (1001, 8003, datetime(year=2020, month=9, day=1)),
        (2001, 8001, datetime(year=2020, month=9, day=2)),
    ]
    entity_df = spark.createDataFrame(
        spark.sparkContext.parallelize(entity_data), composite_entity_schema)

    feature_table_data = [
        (
            1001,
            8001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            3.0,
            5.0,
        ),
        (
            1001,
            8002,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            4.0,
            3.0,
        ),
        (
            2001,
            8001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            4.0,
            4.5,
        ),
    ]
    feature_table_df = spark.createDataFrame(
        spark.sparkContext.parallelize(feature_table_data),
        rating_feature_schema,
    )
    feature_table = FeatureTable(
        name="ratings",
        features=[
            Field("customer_rating", "double"),
            Field("driver_rating", "double")
        ],
        entities=[Field("customer_id", "int32"),
                  Field("driver_id", "int32")],
        max_age=86400,
    )
    feature_table_df = filter_feature_table_by_time_range(
        feature_table_df,
        feature_table,
        "event_timestamp",
        entity_df,
        "event_timestamp",
    )
    joined_df = as_of_join(
        entity_df,
        "event_timestamp",
        feature_table_df,
        feature_table,
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("ratings__customer_rating", FloatType()),
        StructField("ratings__driver_rating", FloatType()),
    ])
    expected_joined_data = [
        (
            1001,
            8001,
            datetime(year=2020, month=9, day=1),
            3.0,
            5.0,
        ),
        (1001, 8002, datetime(year=2020, month=9, day=3), None, None),
        (1001, 8003, datetime(year=2020, month=9, day=1), None, None),
        (
            2001,
            8001,
            datetime(year=2020, month=9, day=2),
            4.0,
            4.5,
        ),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data),
        expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
Code Example #26
# Write a config file to create the Spark Job

"""

from pyspark.sql import SparkSession, SQLContext, HiveContext
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, FloatType, LongType
from pyspark.sql.functions import *

# When id is loaded as IntegerType, the schema load returns null
# Load id as LongType instead and the schema load works

data_schema_acquisition = [
    StructField("id", LongType(), False),
    StructField("channel", StringType(), False),
    StructField("seller", StringType(), False),
    StructField("interest_rate", FloatType(), False),
    StructField("balance", LongType(), False),
    StructField("loan_term", IntegerType(), False),
    StructField("origination_date", StringType(), False),
    StructField("first_payment_date", StringType(), False),
    StructField("ltv", IntegerType(), False),
    StructField("cltv", IntegerType(), False),
    StructField("borrower_count", IntegerType(), False),
    StructField("dti", IntegerType(), False),
    StructField("borrower_credit_score", IntegerType(), False),
    StructField("first_time_homebuyer", StringType(), False),
    StructField("loan_purpose", StringType(), False),
    StructField("property_type", StringType(), False),
    StructField("unit_count", IntegerType(), False),
    StructField("occupancy_status", StringType(), False),
    StructField("property_state", StringType(), False),
Code Example #27
def test_multiple_join(
    spark: SparkSession,
    composite_entity_schema: StructType,
    customer_feature_schema: StructType,
    driver_feature_schema: StructType,
):

    entity_data = [
        (1001, 8001, datetime(year=2020, month=9, day=2)),
        (1001, 8002, datetime(year=2020, month=9, day=2)),
        (2001, 8002, datetime(year=2020, month=9, day=3)),
    ]
    entity_df = spark.createDataFrame(
        spark.sparkContext.parallelize(entity_data), composite_entity_schema)

    customer_table_data = [
        (
            1001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            100.0,
        ),
        (
            2001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            200.0,
        ),
    ]
    customer_table_df = spark.createDataFrame(
        spark.sparkContext.parallelize(customer_table_data),
        customer_feature_schema)
    customer_table = FeatureTable(
        name="transactions",
        features=[Field("daily_transactions", "double")],
        entities=[Field("customer_id", "int32")],
        max_age=86400,
    )
    customer_table_df = filter_feature_table_by_time_range(
        customer_table_df,
        customer_table,
        "event_timestamp",
        entity_df,
        "event_timestamp",
    )

    driver_table_data = [
        (
            8001,
            datetime(year=2020, month=8, day=31),
            datetime(year=2020, month=8, day=31),
            200,
        ),
        (
            8001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            300,
        ),
        (
            8002,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            600,
        ),
        (
            8002,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=2),
            500,
        ),
    ]
    driver_table_df = spark.createDataFrame(
        spark.sparkContext.parallelize(driver_table_data),
        driver_feature_schema)

    driver_table = FeatureTable(
        name="bookings",
        features=[Field("completed_bookings", "int32")],
        entities=[Field("driver_id", "int32")],
        max_age=7 * 86400,
    )
    driver_table_df = filter_feature_table_by_time_range(
        driver_table_df,
        driver_table,
        "event_timestamp",
        entity_df,
        "event_timestamp",
    )
    joined_df = join_entity_to_feature_tables(
        entity_df,
        "event_timestamp",
        [customer_table_df, driver_table_df],
        [customer_table, driver_table],
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("transactions__daily_transactions", FloatType()),
        StructField("bookings__completed_bookings", IntegerType()),
    ])

    expected_joined_data = [
        (
            1001,
            8001,
            datetime(year=2020, month=9, day=2),
            100.0,
            300,
        ),
        (
            1001,
            8002,
            datetime(year=2020, month=9, day=2),
            100.0,
            500,
        ),
        (
            2001,
            8002,
            datetime(year=2020, month=9, day=3),
            None,
            500,
        ),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data),
        expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
Code Example #28


li = [23, 34, 56]  # list of elements
df.filter(df['column_name'].isin(li))  # checking if the column matches any element of the list

df.filter(df['column_name'].isin(li) == False)  # selecting rows where the column does not match any element of the list


import numpy as np
import pyspark.sql.functions as func

def median(values_list):
    med = np.median(values_list)
    return float(med)
udf_median = func.udf(median, FloatType())

df_grouped = df.groupby(['a', 'd']).agg(udf_median(func.collect_list(func.col('c'))).alias('median'))
df_grouped.show()

#Extract a column from a dataframe to a list
sea_lists=[row[0] for row in dataframe_with_sea.collect()]


#Round off a column
from pyspark.sql.functions import pow, lit, col
from pyspark.sql.types import LongType
num_places = 3
m = pow(lit(10), num_places).cast(LongType())
df = sc.parallelize([(0.6643, ), (0.6446, )]).toDF(["x"])
df.withColumn("trunc", (col("x") * m).cast(LongType()) / m)
Code Example #29
# COMMAND ----------

ratings_df.show()

# COMMAND ----------

movie_ratings=ratings_df.drop('timestamp')

# COMMAND ----------

# Data type convert
from pyspark.sql.types import IntegerType, FloatType
movie_ratings = movie_ratings.withColumn("userId", movie_ratings["userId"].cast(IntegerType()))
movie_ratings = movie_ratings.withColumn("movieId", movie_ratings["movieId"].cast(IntegerType()))
movie_ratings = movie_ratings.withColumn("rating", movie_ratings["rating"].cast(FloatType()))

# COMMAND ----------

movie_ratings.show()

# COMMAND ----------

# MAGIC %md 
# MAGIC ### ALS Model Selection and Evaluation
# MAGIC 
# MAGIC With the ALS model, we can use a grid search to find the optimal hyperparameters.

# COMMAND ----------

# import package
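
# COMMAND ----------

# MAGIC %md
# MAGIC The notebook is truncated here; a hedged sketch of the grid search described above (parameter values are illustrative, not the original author's):

# COMMAND ----------

from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", nonnegative=True)

param_grid = (ParamGridBuilder()
              .addGrid(als.rank, [10, 20])
              .addGrid(als.regParam, [0.05, 0.1])
              .build())

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid,
                    evaluator=evaluator, numFolds=3)

# Fit on the cleaned ratings and keep the best model found by the grid search
cv_model = cv.fit(movie_ratings)
best_model = cv_model.bestModel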
Code Example #30
def fe_DaysSinceRegistration(df):
    days_active = udf(lambda x, y: float((x - y) / 86400000.0),
                      FloatType())
    df = df.withColumn("DaysSinceRegistration",
                       days_active("ts", "registration"))
    return df
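
# (Hedged usage note, not in the original:) "ts" and "registration" are assumed to be epoch
# timestamps in milliseconds, so dividing by 86,400,000 converts the difference to days, e.g.:
# df = fe_DaysSinceRegistration(df)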