def recordstream(df, epoch_id):
    # Split the value column from the Spark DF to extract the timestamp and parse it as a datetime
    split_col = F.split(df.value, ',')
    df = df.withColumn(
        'TimeStamp',
        F.to_timestamp(F.regexp_replace(split_col.getItem(0), '"', ''),
                       'yyyy-MM-dd HH:mm:ss'))
    df = df.withColumn('RT_Temp', split_col.getItem(1).cast(tp.DoubleType()))
    df = df.withColumn(
        'Nu_Temp',
        F.regexp_replace(split_col.getItem(2), '"', '').cast(tp.DoubleType()))
    df = df.drop('value')
    # Saving input stream to master data set
    dfw = df.selectExpr('TimeStamp as ts', 'RT_Temp', 'Nu_Temp')
    dfw.write.saveAsTable(name='tsa.turbine_master',
                          format='hive',
                          mode='append')
    dfp = df.select('TimeStamp', 'RT_Temp', 'Nu_Temp')

    if len(dfp.take(1)) != 0:
        # print('Calling Predictions & Model path is',g_model)
        df_final = fitVar(2, dfp, g_model)
        df_final.show(5)
        df_final.write.saveAsTable(name='tsa.batch_predictions',
                                   format='hive',
                                   mode='append')
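The (df, epoch_id) signature above is that of a Structured Streaming foreachBatch callback. A minimal wiring sketch, assuming a Kafka source and that broker and source_topic are configured alongside g_model in the driver script (the names are illustrative):

# Hypothetical driver wiring; broker and source_topic are assumed config values.
raw_stream = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", broker) \
    .option("subscribe", source_topic) \
    .load() \
    .selectExpr("CAST(value AS STRING)")

query = raw_stream.writeStream \
    .foreachBatch(recordstream) \
    .start()
query.awaitTermination()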
Example #2
def preprocessing(spark: SparkSession, pppath: Path, datadir: str):
    print("--- preprocessing -----------------------")

    schema = T.StructType([
        T.StructField('year', T.IntegerType(), True),
        T.StructField('month', T.IntegerType(), True),
        T.StructField('dn', T.IntegerType(), True),
        T.StructField('wday', T.IntegerType(), True),
        T.StructField('snap', T.IntegerType(), True),
        T.StructField('dept_id', T.StringType(), True),
        T.StructField('item_id', T.StringType(), True),
        T.StructField('store_id', T.StringType(), True),
        T.StructField('sales', T.DoubleType(), True),
        T.StructField('flag_ram', T.IntegerType(), True),
        T.StructField('Sales_Pred', T.DoubleType(), True)
    ])

    csv_path = str(Path(datadir, "Sales5_Ab2011_InklPred.csv"))
    print(f"--- Reading: '{csv_path}'")

    sales5: DataFrame = spark.read.csv(csv_path, header='true', schema=schema) \
        .withColumn("label", F.col('sales'))

    ppdf = prepro(sales5)
    print(f"--- Writing: '{pppath}'")
    ppdf.write \
        .format("parquet") \
        .mode("overwrite") \
        .save(str(pppath))
Example #3
def process_song_data(spark, input_bucket, output_data):
    """ Reads the songs dataset, transforms it, creating artists and
    songs table in parquet files"""

    song_data = get_files_paths_s3(input_bucket, "song_data")
    # song_data = get_local_song_data()

    # specify schema to improve read speed
    song_schema = T.StructType()\
        .add("num_songs", T.IntegerType())\
        .add("artist_id", T.StringType())\
        .add("artist_latitude", T.DoubleType())\
        .add("artist_longitude", T.DoubleType())\
        .add("artist_location", T.StringType())\
        .add("artist_name", T.StringType())\
        .add("song_id", T.StringType())\
        .add("title", T.StringType())\
        .add("duration", T.DoubleType())\
        .add("year", T.IntegerType())

    df = spark.read.schema(song_schema).json(song_data)
    songs_table = df.select(
        ["song_id", "title", "artist_id", "year", "duration"])
    songs_table.write.partitionBy("year", "artist_id")\
        .parquet("{}/songs_table.parquet".format(output_data),
                mode="overwrite")
    # Since the data is song-based, there can be duplicated artists
    artists_table = df.selectExpr(["artist_id", "artist_name as name",
                                "artist_location as location",
                                "artist_latitude as latitude",
                                "artist_longitude as longitude"])\
                        .dropDuplicates(["artist_id"])

    artists_table.write.parquet("{}/artists_table.parquet".format(output_data),
                                mode="overwrite")
Example #4
def import_twitter_data(spark_session, tweets_file_path):
    """Imports the twitter data and returns resulting DataFrame.
    
    Args:
        spark_session    --    An active SparkSession.
        tweets_file_path    --    A file path.
    """

    tweets_schema = types.StructType([
        types.StructField('id', types.LongType()),
        types.StructField('timestamp', types.LongType(), nullable=False),
        types.StructField('postalCode', types.StringType()),
        types.StructField('lon', types.DoubleType(), nullable=False),
        types.StructField('lat', types.DoubleType(), nullable=False),
        types.StructField('tweet', types.StringType(), nullable=False),
        types.StructField('user_id', types.LongType()),
        types.StructField('application', types.StringType()),
        types.StructField('source', types.StringType())
    ])

    tweets_df = spark_session.read.csv(tweets_file_path,
                                       escape='"',
                                       header='true',
                                       schema=tweets_schema,
                                       mode='DROPMALFORMED')

    tweets_df = tweets_df.select(['timestamp', 'lon', 'lat', 'tweet'])
    return tweets_df
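A short usage sketch, assuming an active SparkSession and an illustrative file path:

from pyspark.sql import SparkSession

# Hypothetical usage; any CSV matching tweets_schema will do.
spark = SparkSession.builder.appName("twitter-import").getOrCreate()
tweets_df = import_twitter_data(spark, "data/tweets.csv")
tweets_df.show(5)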
Example #5
def predict(df, epoch_id):
    split_col = F.split(df.value, ',')
    # df = df.withColumn('TimeStamp', F.to_timestamp(F.regexp_replace(split_col.getItem(0), '"', ''),
    #                                                'yyyy-mm-dd HH:mm:ss.SSS'))
    df = df.withColumn(
        'TimeStamp',
        F.regexp_replace(split_col.getItem(0), '"',
                         '').cast(tp.TimestampType()))
    df = df.withColumn('RT_Temp', split_col.getItem(1).cast(tp.DoubleType()))
    df = df.withColumn(
        'Nu_Temp',
        F.regexp_replace(split_col.getItem(2), '"', '').cast(tp.DoubleType()))
    df = df.drop('value')
    dfw = df.select('TimeStamp', 'RT_Temp', 'Nu_Temp')

    if len(dfw.take(1)) != 0:
        df_final = fitVar(2, dfw)
        # Converting selective columns from prediction dataframe to single column dataframe value
        df_final = df_final.withColumn('value', (F.concat(
            col("TS"), lit(","), col("RT_Temp"), lit(","),
            col("RT_Temp_Predict"), lit(","), col("Nu_Temp"), lit(","),
            col("Nu_Temp_Predict"), lit(","),
            col("RMSE_Score"))).cast(tp.StringType()))
        ds = df_final.select('value')
        # ds.show(5)
        # Sending each row of dataframe on Kafka message
        print('Now sending Message on Kafka topic', sink_topic)
        ds.selectExpr("CAST(value AS STRING)")\
            .write\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker)\
            .option("topic", sink_topic)\
            .save()
Example #6
def spark():

    try:
        from pyspark import SparkContext, SparkConf
        from pyspark.sql import SparkSession
        from pyspark.sql import types

        conf = SparkConf()

        conf.set("spark.jars.ivy", "/home/jovyan/.ivy2/")
        conf.set("spark.driver.extraClassPath",
                 "jars/scala-udf-similarity-0.0.7.jar")
        conf.set("spark.jars", "jars/scala-udf-similarity-0.0.7.jar")
        conf.set("spark.driver.memory", "4g")
        conf.set("spark.sql.shuffle.partitions", "12")

        sc = SparkContext.getOrCreate(conf=conf)

        spark = SparkSession(sc)

        udfs = [
            ("jaro_winkler_sim", "JaroWinklerSimilarity", types.DoubleType()),
            ("jaccard_sim", "JaccardSimilarity", types.DoubleType()),
            ("cosine_distance", "CosineDistance", types.DoubleType()),
            ("Dmetaphone", "DoubleMetaphone", types.StringType()),
            ("QgramTokeniser", "QgramTokeniser", types.StringType()),
            ("Q3gramTokeniser", "Q3gramTokeniser", types.StringType()),
            ("Q4gramTokeniser", "Q4gramTokeniser", types.StringType()),
            ("Q5gramTokeniser", "Q5gramTokeniser", types.StringType()),
            ("DmetaphoneAlt", "DoubleMetaphoneAlt", types.StringType()),
        ]

        for a, b, c in udfs:
            spark.udf.registerJavaFunction(a, "uk.gov.moj.dash.linkage." + b,
                                           c)

        rt = types.ArrayType(
            types.StructType([
                types.StructField("_1", types.StringType()),
                types.StructField("_2", types.StringType()),
            ]))

        spark.udf.registerJavaFunction(
            name="DualArrayExplode",
            javaClassName="uk.gov.moj.dash.linkage.DualArrayExplode",
            returnType=rt,
        )
        SPARK_EXISTS = True

    except Exception:
        SPARK_EXISTS = False

    if SPARK_EXISTS:
        print("Spark exists, running spark tests")
        yield spark
    else:
        spark = None
        logger.error("Spark not available")
        print("Spark not available")
        yield spark
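The generator above is presumably registered as a pytest fixture (the @pytest.fixture decorator is not shown). A hedged sketch of a test consuming it:

import pytest

# Hypothetical test; assumes the generator is exposed as a fixture named "spark".
def test_jaro_winkler_udf(spark):
    if spark is None:
        pytest.skip("Spark not available")
    sim = spark.sql("SELECT jaro_winkler_sim('MARTHA', 'MARHTA') AS sim").collect()[0].sim
    assert 0.0 <= sim <= 1.0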
Example #7
def main(link, key):
    crime_schema = schema_def()
    # API request is made to retrieve data from Vancouver Open Data
    urllib.request.urlretrieve(link, "Vancouver.zip")
    compressed_file = zipfile.ZipFile('Vancouver.zip')
    csv_file = compressed_file.open('crimedata_csv_all_years.csv')

    pd_crimes = pd.read_csv(csv_file)

    # Creation of Spark DataFrame
    df_crime_init = spark.createDataFrame(pd_crimes,
                                          schema=crime_schema).cache()
    # Tagging City to Vancouver
    df_crime_init = df_crime_init.withColumn("city",
                                             functions.lit("Vancouver"))
    # UDF to apply UUID for entire dataframe
    genuuid = functions.udf(lambda: str(uuid.uuid4()))
    df_crime_init = df_crime_init.withColumn("uuid", genuuid()).cache()

    # Change NaN values to 0 for numeric columns and filter out rows that have no latitude and longitude
    df_crime_init = df_crime_init.na.fill(0)
    df_crime = df_crime_init.where((df_crime_init["X"] > 0)
                                   | (df_crime_init["Y"] > 0))

    # Conversion of UTM coordinates to latitude and longitude
    utm_udf_x = functions.udf(
        lambda x, y: utm.to_latlon(x, y, 10, 'U')[0].item(),
        types.DoubleType())
    utm_udf_y = functions.udf(
        lambda x, y: utm.to_latlon(x, y, 10, 'U')[1].item(),
        types.DoubleType())
    df_crime = df_crime.withColumn(
        'lat', utm_udf_x(functions.col('X'), functions.col('Y')))
    df_crime = df_crime.withColumn(
        'long', utm_udf_y(functions.col('X'), functions.col('Y')))

    # Create a new dataframe for records that do not have coordinates; we need these for further study
    df_crime_nan = df_crime_init.where(df_crime_init["X"] == 0.0)
    df_crime_nan = df_crime_nan.withColumn("lat", functions.lit(0.0))
    df_crime_nan = df_crime_nan.withColumn("long", functions.lit(0.0))
    # Union of both dataframes
    df_crime_full = df_crime.union(df_crime_nan)

    # Calling Cassandra DB crime_type to tag subtypes to one common type
    crimepred = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='crime_type', keyspace=key).load()
    crimepred.createOrReplaceTempView("crimetype")
    df_crime_full.createOrReplaceTempView("crime")
    df_table = spark.sql(
        "select c.city,t.type as crime_type,c.uuid,c.subtype as crimesub_type,c.hour,c.lat,c.long,c.month,c.neighbourhood,c.year from crime c left join crimetype t on c.subtype = t.sub_type"
    )
    df_table = df_table.withColumn('count', functions.lit(1))

    df_table.where(df_table["crime_type"].isNull()).show()
    #df_crime_nan.show()
    print(df_table.count())

    # Loading of data into base_table
    df_table.write.format("org.apache.spark.sql.cassandra").options(
        table='base_table', keyspace=key).mode('overwrite').save()
Example #8
    def setUp(self):

        sc = SparkContext.getOrCreate()
        sql_context = SQLContext(sc)
        struct_feat = [T.StructField('f1', T.FloatType())]
        struct_lab = [T.StructField('l1', T.StringType())]

        self.default_param_dict = {
            'algorithm': 'LogisticRegression',
            'elasticNetParam': (0.0, 0.5),
            'fitIntercept': True,
            'labelCol': 'label',
            'maxIter': (100, 150),
            'predictionCol': 'prediction',
            'probabilityCol': 'probability',
            'rawPredictionCol': 'rawPrediction',
            'regParam': (0.0, 0.5),
            'threshold': (0.0, 0.5),
            'tol': (1e-06, 0.01)
        }

        self.default_features = [
            T.StructField('AarsVaerk_1', T.DoubleType(), True),
            T.StructField('AarsVaerk_2', T.DoubleType(), True),
            T.StructField('AarsVaerk_3', T.DoubleType(), True)
        ]
        self.default_standard = True

        self.workflow = ExecuteWorkflowClassification(self.default_param_dict,
                                                      self.default_standard,
                                                      self.default_features)
Example #9
def main():
    schema = types.StructType([
        types.StructField('station', types.StringType(), True),
        types.StructField('date', types.StringType(), True),
        types.StructField('element', types.StringType(), True),
        types.StructField('value1', types.DoubleType(), True),
        types.StructField('mflag', types.StringType(), True),
        types.StructField('qflag', types.StringType(), True),
    ])
    df = spark.read.csv(inputs, schema)

    getRange = functions.udf(get_dif, types.DoubleType())

    df_by_date = df.select('station', 'date', 'value1')\
        .where((df.element == 'TMAX') | (df.element == 'TMIN'))\
        .groupBy('station', 'date') \
        .agg(functions.collect_list('value1').alias('range'))\
        .withColumn('range', getRange('range'))
    df_by_date = df_by_date.where(df_by_date.range > 1).sort('date',
                                                             ascending=True)
    df_max = df_by_date.groupBy('date').max('range').select(
        'date',
        functions.col('max(range)').alias('range'))
    joined_df = df_max.join(df_by_date, ["date", "range"], 'inner')
    joined_df = joined_df.select('date', 'station', 'range')
    joined_df.show()
    if not os.path.exists(output):
        os.makedirs(output)
    joined_df.write.csv(output, sep=' ', mode='overwrite')
Example #10
def sensor_schema():
    sen_schema = types.StructType([
        types.StructField('timestamp', types.StringType()),
        types.StructField('X', types.DoubleType()),
        types.StructField('Y', types.DoubleType()),
        types.StructField('Z', types.DoubleType()),
    ])
    return sen_schema
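A brief usage sketch for the schema above, assuming an active SparkSession and an illustrative CSV path:

# Hypothetical usage; the path and header option are assumptions.
sensor_df = spark.read.csv("data/sensor_readings.csv",
                           schema=sensor_schema(),
                           header=True)
sensor_df.show(5)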
Example #11
def clean_input(dataframe, start, end):
    input_columns = [
        "client_id",
        "timestamp",
        "is_default_browser",
        "search_counts",
        "country",
        "profile_creation_date",
        "channel",
        "os",
        "hours",
    ]
    columns = {col: F.col(col) for col in input_columns}

    # normalize countries against a whitelist
    columns["country"] = (
        F.when(F.col("country").isin(countries), F.col("country"))
        .otherwise("Other")
        .alias("country")
    )

    # clean operating system based on CEP naming scheme
    pattern = {
        "Windows": ["Windows%", "WINNT%"],
        "Mac": ["Darwin%"],
        "Linux": ["%Linux%", "%BSD%", "%SunOS%"],
    }
    columns["os"] = column_like("os", pattern, "Other")

    # rename normalized channel to channel
    columns["channel"] = F.col("normalized_channel")

    # convert profile creation date into seconds (day -> seconds)
    columns["profile_creation_date"] = (
        F.when(F.col("profile_creation_date") >= 0,
               F.col("profile_creation_date") * seconds_per_day)
        .otherwise(0.0)
        .cast(types.DoubleType())
    )

    # generate hours of usage from subsession length (seconds -> hours)
    columns["hours"] = (
        F.when((F.col("subsession_length") >= 0) &
               (F.col("subsession_length") < 180 * seconds_per_day),
               F.col("subsession_length") / seconds_per_hour)
        .otherwise(0.0)
        .cast(types.DoubleType())
    )

    # clean the dataset
    clean = (
        dataframe
        .where(F.col("submission_date_s3") >= start)
        .where(F.col("submission_date_s3") < end)
        .select([expr.alias(name) for name, expr in columns.items()])
    )

    return clean
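clean_input leans on module-level helpers that are not shown here: countries, seconds_per_day, seconds_per_hour, and column_like. A hedged sketch of what column_like might look like, assuming it labels rows by the first matching SQL LIKE pattern group and falls back to a default:

from functools import reduce
from pyspark.sql import functions as F

# Assumed helper; the real implementation may differ.
def column_like(column, patterns, default):
    """Label rows of `column` by LIKE-pattern group, defaulting to `default`."""
    labelled = F.lit(default)
    for label, likes in patterns.items():
        matched = reduce(lambda a, b: a | b, [F.col(column).like(p) for p in likes])
        labelled = F.when(matched, label).otherwise(labelled)
    return labelled.alias(column)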
Example #12
def get_schema():
    return t.StructType([
        t.StructField("ReportAsOfEOD", t.StringType(), True),
        t.StructField("LoanID", t.StringType(), True),
        t.StructField("Date", t.StringType(), True),
        t.StructField("PrincipalRepayment", t.DoubleType(), True),
        t.StructField("InterestRepayment", t.DoubleType(), True),
        t.StructField("LateFeesRepayment", t.DoubleType(), True),
    ])
Example #13
def get_schema():
    return TableSchema(
        [
            t.StructField("UserName", t.StringType(), True),
            t.StructField("Loans", t.LongType(), False),
            t.StructField("TotalInterestRepayment", t.DoubleType(), True),
            t.StructField("TotalLateFeesRepayment", t.DoubleType(), True),
        ],
        primary_key="UserName",
    )
Example #14
    def process_columns(self, data_frame: DataFrame) -> DataFrame:
        return (
            data_frame
            .withColumn(
                "conversion_rate_multiplier",
                F.col("conversion_rate_multiplier").substr(1, 8).cast(T.IntegerType()) +
                F.col("conversion_rate_multiplier").substr(9, 7).cast(T.DoubleType()) * .0000001)
            .withColumn(
                "conversion_rate_divisor",
                F.col("conversion_rate_divisor").substr(1, 8).cast(T.IntegerType()) +
                F.col("conversion_rate_divisor").substr(9, 7).cast(T.DoubleType()) * .0000001)
            .drop("value"))
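The two conversion-rate fields are packed as 15-character strings: the first 8 characters hold the integer part and the last 7 a fraction scaled by 1e-7, which the substr/cast arithmetic above reassembles. A quick check of that decoding on made-up data, assuming an active SparkSession:

from pyspark.sql import functions as F, types as T

# Hypothetical one-row DataFrame: integer part "00000001", fraction "5000000" -> 1.5
demo = spark.createDataFrame([("000000015000000",)], ["conversion_rate_multiplier"])
demo.withColumn(
    "decoded",
    F.col("conversion_rate_multiplier").substr(1, 8).cast(T.IntegerType()) +
    F.col("conversion_rate_multiplier").substr(9, 7).cast(T.DoubleType()) * .0000001
).show()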
Example #15
def getSchema():  # noqa: N802
    return t.StructType(
        [
            t.StructField("COUNTYFP", t.IntegerType(), True),
            t.StructField("NEVER", t.DoubleType(), True),
            t.StructField("RARELY", t.DoubleType(), True),
            t.StructField("SOMETIMES", t.DoubleType(), True),
            t.StructField("FREQUENTLY", t.DoubleType(), True),
            t.StructField("ALWAYS", t.DoubleType(), True),
            t.StructField("INSERT_TS", t.TimestampType(), True),
        ]
    )
Example #16
def pivot(trades, prices):
    """
    Pivot and fill the columns on the event id so that each row contains a
    column for each id + column combination where the value is the most recent
    non-null value for that id. For example, given the above input tables the
    expected output is:

    +---+-------------+-----+-----+-----+--------+------+------+--------+-----------+------+------+--------+-----------+
    | id|    timestamp|  bid|  ask|price|quantity|10_bid|10_ask|10_price|10_quantity|20_bid|20_ask|20_price|20_quantity|
    +---+-------------+-----+-----+-----+--------+------+------+--------+-----------+------+------+--------+-----------+
    | 10|1546300799000| 37.5|37.51| null|    null|  37.5| 37.51|    null|       null|  null|  null|    null|       null|
    | 10|1546300800000| null| null| 37.5|   100.0|  37.5| 37.51|    37.5|      100.0|  null|  null|    null|       null|
    | 10|1546300801000| null| null|37.51|   100.0|  37.5| 37.51|   37.51|      100.0|  null|  null|    null|       null|
    | 10|1546300802000|37.51|37.52| null|    null| 37.51| 37.52|   37.51|      100.0|  null|  null|    null|       null|
    | 20|1546300804000| null| null|12.67|   300.0| 37.51| 37.52|   37.51|      100.0|  null|  null|   12.67|      300.0|
    | 10|1546300806000| 37.5|37.51| null|    null|  37.5| 37.51|   37.51|      100.0|  null|  null|   12.67|      300.0|
    | 10|1546300807000| null| null| 37.5|   200.0|  37.5| 37.51|    37.5|      200.0|  null|  null|   12.67|      300.0|
    +---+-------------+-----+-----+-----+--------+------+------+--------+-----------+------+------+--------+-----------+

    :param trades: DataFrame of trade events
    :param prices: DataFrame of price events
    :return: A DataFrame of the combined events and pivoted columns.
    """
    trades_prices = trades. \
        join(prices, ['id', 'timestamp'], 'outer'). \
        select('id', 'timestamp', 'bid', 'ask', 'price', 'quantity'). \
        orderBy(asc("timestamp"))
    unique_ids = trades_prices.select('id').distinct().collect()
    result = None
    for row in unique_ids:
        id = str(row.id)
        dyn_columns = trades_prices. \
            withColumn("bid", when(col("id") != row.id, lit(None).cast(T.DoubleType())).otherwise(lit(col('bid')).cast(T.DoubleType()))).\
            withColumn("ask", when(col("id") != row.id, lit(None).cast(T.DoubleType())).otherwise(lit(col('ask')).cast(T.DoubleType()))).\
            withColumn("price", when(col("id") != row.id, lit(None).cast(T.DoubleType())).otherwise(lit(col('price')).cast(T.DoubleType()))).\
            withColumn("quantity", when(col("id") != row.id, lit(None).cast(T.DoubleType())).otherwise(lit(col('quantity')).cast(T.DoubleType()))).\
            withColumn(id+"_id", when(col("id") == row.id, lit(id).cast(T.IntegerType())).otherwise(lit(id).cast(T.IntegerType()))).\
            withColumn(id + "_bid", func.last('bid', True).over(
            Window.partitionBy(id+"_id").orderBy('timestamp').rowsBetween(-sys.maxsize, 0))). \
            withColumn(id + "_ask", func.last('ask', True).over(
            Window.partitionBy(id+"_id").orderBy('timestamp').rowsBetween(-sys.maxsize, 0))). \
            withColumn(id + "_price", func.last('price', True).over(
            Window.partitionBy(id+"_id").orderBy('timestamp').rowsBetween(-sys.maxsize, 0))). \
            withColumn(id + "_quantity", func.last('quantity', True).over(
            Window.partitionBy(id+"_id").orderBy('timestamp').rowsBetween(-sys.maxsize, 0))).\
            drop('bid', 'ask', 'price', 'quantity', id + "_id")
        if result is None:
            result = trades_prices.join(dyn_columns, ['id', 'timestamp'], how='outer')
        else:
            result = result.join(dyn_columns, ['id', 'timestamp'], how='outer')

    return result.orderBy('timestamp')
Example #17
def get_schema():
    return TableSchema(
        [
            t.StructField("ReportAsOfEOD", t.DateType(), True),
            t.StructField("LoanID", t.StringType(), True),
            t.StructField("Date", t.DateType(), True),
            t.StructField("PrincipalRepayment", t.DoubleType(), True),
            t.StructField("InterestRepayment", t.DoubleType(), True),
            t.StructField("LateFeesRepayment", t.DoubleType(), True),
        ],
        primary_key=["LoanID", "Date"],
        # partition_by = "Date" #---takes a very long time
    )
Example #18
def prepro(spark: SparkSession, datadir: Path, nam: str):
    def pp(s5: DataFrame) -> DataFrame:
        stages = []
        catvars = ['dept_id', 'item_id', 'store_id', 'wday']
        for v in catvars:
            stages += [StringIndexer(inputCol=v, outputCol=f"i{v}")]
        stages += [
            OneHotEncoderEstimator(inputCols=[f"i{v}" for v in catvars],
                                   outputCols=[f"v{v}" for v in catvars])
        ]
        stages += [
            VectorAssembler(inputCols=[
                'vwday', 'vitem_id', 'vdept_id', 'vstore_id', 'flag_ram',
                'snap', 'dn', 'month', 'year'
            ],
                            outputCol='features')
        ]

        pip: Pipeline = Pipeline(stages=stages)
        pipm = pip.fit(s5)
        df: DataFrame = pipm.transform(s5)
        return df.drop('idept_id', 'iitem_id', 'istore_id', 'iwday',
                       'vdept_id', 'vitem_id', 'vstore_id', 'vwday')

    print("--- preprocessing -----------------------")

    schema = stype.StructType([
        stype.StructField('year', stype.IntegerType(), True),
        stype.StructField('month', stype.IntegerType(), True),
        stype.StructField('dn', stype.IntegerType(), True),
        stype.StructField('wday', stype.IntegerType(), True),
        stype.StructField('snap', stype.IntegerType(), True),
        stype.StructField('dept_id', stype.StringType(), True),
        stype.StructField('item_id', stype.StringType(), True),
        stype.StructField('store_id', stype.StringType(), True),
        stype.StructField('sales', stype.DoubleType(), True),
        stype.StructField('flag_ram', stype.IntegerType(), True),
        stype.StructField('Sales_Pred', stype.DoubleType(), True),
    ])

    csv_path = datadir / "Sales5_Ab2011_InklPred.csv"
    print(f"--- Reading: '{csv_path}'")

    sales5: DataFrame = spark.read.csv(str(csv_path), header='true', schema=schema) \
        .withColumn("label", sfunc.col('sales'))

    ppdf = pp(sales5)
    print(f"--- Writing: '{nam}'")

    hlp.writeToDatadirParquet(ppdf, nam)
Example #19
def load_prices(spark):
    data = [
        (10, 1546300799000, 37.50, 37.51),
        (10, 1546300802000, 37.51, 37.52),
        (10, 1546300806000, 37.50, 37.51),
    ]
    schema = T.StructType([
        T.StructField("id", T.LongType()),
        T.StructField("timestamp", T.LongType()),
        T.StructField("bid", T.DoubleType()),
        T.StructField("ask", T.DoubleType()),
    ])

    return spark.createDataFrame(data, schema)
Example #20
def load_trades(spark):
    data = [
        (10, 1546300800000, 37.50, 100.000),
        (10, 1546300801000, 37.51, 100.000),
        (20, 1546300804000, 12.67, 300.000),
        (10, 1546300807000, 37.50, 200.000),
    ]
    schema = T.StructType([
        T.StructField("id", T.LongType()),
        T.StructField("timestamp", T.LongType()),
        T.StructField("price", T.DoubleType()),
        T.StructField("quantity", T.DoubleType()),
    ])

    return spark.createDataFrame(data, schema)
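load_prices and load_trades build the input tables referenced in the pivot docstring of Example #16, so they can be fed to it directly. A short usage sketch, assuming an active SparkSession:

# Hypothetical driver code tying the two loaders to pivot().
trades = load_trades(spark)
prices = load_prices(spark)
pivot(trades, prices).show(truncate=False)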
Example #21
def weighted_predict(df, epoch_id):
    split_col = F.split(df.value, ',')
    # df = df.withColumn('TimeStamp', F.to_timestamp(F.regexp_replace(split_col.getItem(0), '"', ''),
    #                                                'yyyy-mm-dd HH:mm:ss.SSS'))
    df = df.withColumn(
        'TimeStamp',
        F.regexp_replace(split_col.getItem(0), '"',
                         '').cast(tp.TimestampType()))
    df = df.withColumn('RT_Temp', split_col.getItem(1).cast(tp.DoubleType()))
    df = df.withColumn('RT_Temp_Predict',
                       split_col.getItem(2).cast(tp.DoubleType()))
    df = df.withColumn('Nu_Temp', split_col.getItem(3).cast(tp.DoubleType()))
    df = df.withColumn('Nu_Temp_Predict',
                       split_col.getItem(4).cast(tp.DoubleType()))
    df = df.withColumn(
        'RMSE_Score',
        F.regexp_replace(split_col.getItem(5), '"', '').cast(tp.DoubleType()))
    df = df.drop('value')
    # df.show()
    sp_df = df.select('TimeStamp','RT_Temp','RT_Temp_Predict','Nu_Temp','Nu_Temp_Predict','RMSE_Score')\
        .where("topic='{}'".format(str(sp_topic)))
    bt_df = df.select('TimeStamp','RT_Temp','RT_Temp_Predict','Nu_Temp','Nu_Temp_Predict','RMSE_Score')\
        .where("topic='{}'".format(str(bl_topic)))
    # print("Speed Layer Predictions....")
    # sp_df.show(5)
    # print("Batch Layer Predictions....")
    # bt_df.show(5)

    df_final = (sp_df.alias('sp').join(
        bt_df.alias('bt'),
        on=sp_df['TimeStamp'] == bt_df['TimeStamp'],
        how='inner'
    ).selectExpr(
        'sp.TimeStamp as TS', 'round(sp.RT_Temp,3) as RT_Temp',
        'round(sp.RT_Temp_Predict,3) as Speed_RT_Temp',
        'round(bt.RT_Temp_Predict,3) as Batch_RT_Temp',
        'round(({}*sp.RT_Temp_Predict + {}*bt.RT_Temp_Predict),3) as Wt_RT_Temp'
        .format(str(s_wt), str(b_wt)), 'round(sp.Nu_Temp,3) as Nu_Temp',
        'round(sp.Nu_Temp_Predict,3) as Speed_Nu_Temp',
        'round(bt.Nu_Temp_Predict,3) as Batch_Nu_Temp',
        'round(({}*sp.Nu_Temp_Predict + {}*bt.Nu_Temp_Predict),3) as Wt_Nu_Temp'
        .format(str(s_wt), str(b_wt)), 'round(sp.RMSE_Score,3) as Speed_RMSE',
        'round(bt.RMSE_Score,3) as Batch_RMSE'))
    df_final.show(5)
    # df = spark.sql("select * FROM default.turbine")
    df_final.write.saveAsTable(name='tsa.serving_predictions',
                               format='hive',
                               mode='append')
Example #22
    def verification(self, candDF, threshold):
        """
            Input: $candDF is the output DataFrame from the 'filtering' function.
                   $threshold is a float value between (0, 1]

            Output: Return a new DataFrame $resultDF that represents the ER result.
                    It has five columns: id1, joinKey1, id2, joinKey2, jaccard

            Comments: There are two differences between $candDF and $resultDF
                      (1) $resultDF adds a new column, called jaccard, which stores the jaccard similarity
                          between $joinKey1 and $joinKey2
                      (2) $resultDF removes the rows whose jaccard similarity is smaller than $threshold
        """
        def get_jaccard_similarity(set_1, set_2):
            set_1 = set(set_1)
            set_2 = set(set_2)
            return len(set_1 & set_2) / float(len(set_1 | set_2))

        calculate_jaccard = functions.udf(get_jaccard_similarity,
                                          types.DoubleType())
        candDF = candDF.withColumn(
            'jaccard', calculate_jaccard(candDF['joinKey1'],
                                         candDF['joinKey2']))
        candDF = candDF.filter(candDF.jaccard >= threshold)
        return candDF
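A usage sketch for verification, assuming an instance of the enclosing class (here called er), an active SparkSession, and candidate pairs whose join keys are token lists:

# Hypothetical candidate pair; joinKey columns hold token lists.
cand_df = spark.createDataFrame(
    [(1, ["apple", "ipad", "32gb"], 7, ["apple", "ipad", "64gb"])],
    ["id1", "joinKey1", "id2", "joinKey2"])
result_df = er.verification(cand_df, 0.5)
result_df.show()  # jaccard = 2/4 = 0.5, so this row survives the threshold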
Example #23
def _generate_select_expression_for_extended_string_to_double(
        source_column, name):
    """
    More robust conversion from StringType to DoubleType.
    Is able to additionally handle (compared to implicit Spark conversion):

    * Preceding whitespace
    * Trailing whitespace
    * Preceding and trailing whitespace
    * Underscores as thousand separators

    Hint
    ----
    Please have a look at the tests to get a better feeling for how it behaves under
    tests/unit/transformer/test_mapper_custom_data_types.py::TestExtendedStringConversions and
    tests/data/test_fixtures/mapper_custom_data_types_fixtures.py

    Example
    -------
    >>> from spooq.transformer import Mapper
    >>>
    >>> input_df.head(3)
    [Row(input_string="  21474838464.70 "),
     Row(input_string="Hello"),
     Row(input_string="21_474_838_464.70")]
    >>> mapping = [("output_value", "input_string", "extended_string_to_double")]
    >>> output_df = Mapper(mapping).transform(input_df)
    >>> output_df.head(3)
    [Row(input_string=21474838464.7),
     Row(input_string=None),
     Row(input_string=21474838464.70)]
    """
    return F.regexp_replace(F.trim(source_column), "_",
                            "").cast(T.DoubleType()).alias(name)
Example #24
    def test_fit_model_multiclass(self):
        model = create_mnist_model()
        optimizer = tf.keras.optimizers.Adadelta(1.0)
        loss = tf.keras.losses.categorical_crossentropy

        for num_cores in [2, constants.TOTAL_BUFFER_MEMORY_CAP_GIB + 1]:
            with spark_session('test_fit_model_multiclass', cores=num_cores) as spark:
                df = create_mnist_data(spark)

                with local_store() as store:
                    keras_estimator = hvd.KerasEstimator(
                        num_proc=num_cores,
                        store=store,
                        model=model,
                        optimizer=optimizer,
                        loss=loss,
                        metrics=['accuracy'],
                        feature_cols=['features'],
                        label_cols=['label_vec'],
                        batch_size=2,
                        epochs=2,
                        verbose=2)

                    keras_model = keras_estimator.fit(df).setOutputCols(['label_prob'])
                    pred_df = keras_model.transform(df)

                    argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType())
                    pred_df = pred_df.withColumn('label_pred', argmax(pred_df.label_prob))

                    preds = pred_df.collect()
                    assert len(preds) == df.count()

                    row = preds[0]
                    label_prob = row.label_prob.toArray().tolist()
                    assert label_prob[int(row.label_pred)] == max(label_prob)
Example #25
def extract_embedding(spark, glove_model_path, output_folder):

    glove = Glove.load(glove_model_path)

    dictionary_schema = T.StructType([
        T.StructField('index', T.IntegerType(), True),
        T.StructField('standard_concept_id', T.IntegerType(), True)
    ])

    dictionary_df = spark.createDataFrame([
        Row(index=k, standard_concept_id=int(v))
        for k, v in glove.inverse_dictionary.items()
    ], dictionary_schema)

    vector_schema = T.StructType([
        T.StructField('index', T.IntegerType(), True),
        T.StructField('vector', T.ArrayType(T.DoubleType()), True)
    ])

    vector_df = spark.createDataFrame([
        Row(index=idx, vector=vector.tolist())
        for idx, vector in enumerate(glove.word_vectors)
    ], vector_schema)

    dictionary_df.join(vector_df, 'index').select(
        'standard_concept_id',
        'vector').write.mode('overwrite').parquet(output_folder)
Example #26
    def get_coefficients(
        split_urls_and_word_frequency_orders: DataFrame,
        s: float,
        additional_weight_function: Callable[[int], float] = lambda e: 1
    ) -> DataFrame:
        """

        :param split_urls_and_word_frequency_orders: A DataFrame of split URLs and word frequency orders with columns:
                                                     id, url, split_url, word_frequency_orders.
        :param s: s parameter of Zipf distribution.
        :param additional_weight_function: additional weight function applied on top of the Zipf weight
                                           for each word vector.
        :return: A DataFrame of split URLs and coefficient of each term with columns: id, url, split_url, coefficients
        """
        def calculate_coefficients(word_frequency_orders):
            coefficients = []
            for i in range(len(word_frequency_orders)):
                coefficients.append(
                    additional_weight_function(i) *
                    URLVectorCalculator.get_zipf_coefficient(
                        word_frequency_orders[i], s))
            return coefficients

        get_coefficients_udf = F.udf(calculate_coefficients,
                                     T.ArrayType(T.DoubleType()))
        split_urls_and_coefficients = split_urls_and_word_frequency_orders \
            .select("id",
                    "url",
                    "split_url",
                    get_coefficients_udf("word_frequency_orders").alias("coefficients"))
        return split_urls_and_coefficients
Example #27
    def sum_word_vectors(
            urls_and_weighted_word_vectors: DataFrame) -> DataFrame:
        """
        Sums weighted word vectors and their corresponding coefficients for each URL.

        :param urls_and_weighted_word_vectors: A DataFrame of URLs and weighted word vectors with columns: id, url, pos,
                                               word, weighted_word_vector, coefficient.
        :return: A DataFrame of URLs and their corresponding sum of word vectors and sum of coefficients with columns:
                 id, url, split_url, coefficients, summed_vectors, summed_coefficients.
        """

        word_array_sorter_udf = F.udf(
            URLVectorCalculator.sort_list_of_2_tuples_by_0th_item,
            T.ArrayType(T.StringType()))
        coefficient_array_sorter_udf = F.udf(
            URLVectorCalculator.sort_list_of_2_tuples_by_0th_item,
            T.ArrayType(T.DoubleType()))

        vector_size = len(
            urls_and_weighted_word_vectors.select(
                'weighted_word_vector').first()[0])
        return urls_and_weighted_word_vectors \
            .groupBy("id", "url") \
            .agg(F.collect_list(F.struct("pos", "word")).alias("positions_and_words"),
                 F.collect_list(F.struct("pos", "coefficient")).alias("positions_and_coefficients"),
                 F.sum("coefficient").alias("summed_coefficients"),
                 F.array(*[F.sum(F.col("weighted_word_vector")[i])
                           for i in range(vector_size)]).alias("summed_vectors")) \
            .select("id", "url", "summed_coefficients", "summed_vectors",
                    word_array_sorter_udf("positions_and_words").alias("split_url"),
                    coefficient_array_sorter_udf("positions_and_coefficients").alias("coefficients"))
Example #28
def cond_pandas(pyData):
    groupby_columns = ['grp', 'subgrp']
    agg_columns = ['mean_of_C', 'max_of_D', 'cond_var_of_E', 'cond_var_of_E2']
    df = spark.createDataFrame(pyData)
    postAggSchema = DataTypes.StructType(
        [x for x in DataPointSchema.fields if x.name in groupby_columns] + [
            DataTypes.StructField(name, DataTypes.DoubleType(), False)
            for name in agg_columns
        ])
    #
    @pandas_udf(postAggSchema, PandasUDFType.GROUPED_MAP)
    def inner_agg_method(dfPartition):
        group_key = dfPartition['grp'].iloc[0]
        subgroup_key = dfPartition['subgrp'].iloc[0]
        C = dfPartition['C']
        D = dfPartition['D']
        posE = dfPartition[dfPartition.E < 0]['E']
        return pd.DataFrame([[
            group_key,
            subgroup_key,
            C.mean(),
            D.max(),
            posE.var(),
            posE \
                .agg(lambda E: \
                    ((E * E).sum() -
                    E.sum()**2/E.count())/(E.count()-1)) \
                .mean(),
            ]], columns=groupby_columns + agg_columns)

    #
    aggregates = df \
        .groupby(df.grp, df.subgrp).apply(inner_agg_method) \
        .orderBy('grp', 'subgrp')
    return aggregates, None
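cond_var_of_E2 is computed with the sample-variance identity var(E) = (sum(E^2) - (sum E)^2 / n) / (n - 1) over the rows with E < 0, so it should agree with the pandas .var() used for cond_var_of_E. A quick stand-alone check on made-up data:

import pandas as pd

# Hypothetical values standing in for the filtered E column.
E = pd.Series([-1.0, -2.0, -4.0])
manual = ((E * E).sum() - E.sum() ** 2 / E.count()) / (E.count() - 1)
assert abs(manual - E.var()) < 1e-12  # both equal the sample variance (7/3)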
Example #29
def calc(df):
    
    ## function to calculate the approximating function and its derivative
    def foo(x,y):

        y_arr = np.array(y)
        gy = g(y_arr)
        gp = gprime(y_arr)
        x_arr = np.array(x)
        res = np.outer(gy,x_arr)
        return([res.flatten().tolist(), gp.tolist()])

    udf_foo = f.udf(foo, t.ArrayType(t.ArrayType(t.DoubleType())))



    df2 = df.withColumn("vals", udf_foo("features","Y"))

    df2 = df2.select("id", f.col("vals").getItem(0).alias("gy"), f.col("vals").getItem(1).alias("gy_"))
    GY_ = np.array(df2.agg(f.array([f.sum(f.col("gy")[i]) 
                                for i in range(n_comp**2)])).collect()[0][0]).reshape(n_comp,n_comp)/num_rows

    GY_AVG_V  = np.array(df2.agg(f.array([f.avg(f.col("gy_")[i]) 
                                  for i in range(n_comp)])).collect()[0][0]).reshape(n_comp,1)*V

    return(GY_, GY_AVG_V)
Example #30
def get_spark():
    conf = SparkConf()

    # Load in a jar that provides extended string comparison functions such as
    # Jaro-Winkler (used by Splink).

    # No longer needed in spark 3.0?
    # conf.set("spark.driver.extraClassPath", "jars/scala-udf-similarity-0.0.7.jar")
    conf.set("spark.jars", "jars/scala-udf-similarity-0.0.7.jar")
    conf.set("spark.jars.packages",
             "graphframes:graphframes:0.8.0-spark3.0-s_2.12")

    # WARNING:
    # These config options are appropriate only if you're running Spark locally!!!
    conf.set("spark.driver.memory", "4g")
    conf.set("spark.sql.shuffle.partitions", "8")

    sc = SparkContext.getOrCreate(conf=conf)
    sc.setCheckpointDir("temp_graphframes/")
    spark = SparkSession(sc)

    # Register UDFs
    from pyspark.sql import types

    spark.udf.registerJavaFunction(
        "jaro_winkler_sim",
        "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
        types.DoubleType(),
    )
    spark.udf.registerJavaFunction("Dmetaphone",
                                   "uk.gov.moj.dash.linkage.DoubleMetaphone",
                                   types.StringType())
    return spark
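Once get_spark() has registered the Java UDFs, they are callable from Spark SQL. A short usage sketch, assuming the similarity jar is actually present at the configured path:

# Hypothetical usage of the registered UDFs.
spark = get_spark()
spark.sql(
    "SELECT jaro_winkler_sim('MARTHA', 'MARHTA') AS sim, "
    "Dmetaphone('Smith') AS dm"
).show()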