Code Example #1
"""
Created on Thu Mar 15 19:07:02 2018

@author: zhcao
"""

from pyspark.sql import SparkSession
import pyspark.sql.types as typ
import pyspark.ml.feature as ft
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline

if __name__ == "__main__":

    spark = SparkSession.builder.appName("XiGuaKMeans").getOrCreate()

    labels = [('Num', typ.IntegerType()), ('VIB1', typ.FloatType()),
              ('VIB2', typ.FloatType())]

    schema = typ.StructType(
        [typ.StructField(e[0], e[1], False) for e in labels])

    data = spark.read.csv(
        "file:///home/hadoop/zhaco/workspace/spark_test/watermelon.csv",
        header=True,
        schema=schema)

    data.createOrReplaceTempView("data_clu")
    data.printSchema()
    data.cache()
    data.show()
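    # A minimal sketch of how the clustering could continue, assuming the two
    # vibration columns are the features; k=3 is an arbitrary choice, not from the source.
    assembler = ft.VectorAssembler(inputCols=["VIB1", "VIB2"], outputCol="features")
    kmeans = KMeans(k=3, featuresCol="features", predictionCol="cluster")
    pipeline = Pipeline(stages=[assembler, kmeans])
    model = pipeline.fit(data)
    model.transform(data).select("Num", "VIB1", "VIB2", "cluster").show()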
Code Example #2
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext, HiveContext
import pyspark.sql.types as typ

if __name__ == "__main__":

    conf = SparkConf().setMaster("local[*]").setAppName("Test7_1")

    sc = SparkContext(conf=conf)

    sqlContext = SQLContext(sc)

    hiveContext = HiveContext(sc)

    labels = [('INFANT_ALIVE_AT_REPORT', typ.StringType()),
              ('BIRTH_YEAR', typ.IntegerType()),
              ('BIRTH_MONTH', typ.IntegerType()),
              ('BIRTH_PLACE', typ.StringType()),
              ('MOTHER_AGE_YEARS', typ.IntegerType()),
              ('MOTHER_RACE_6CODE', typ.StringType()),
              ('MOTHER_EDUCATION', typ.StringType()),
              ('FATHER_COMBINED_AGE', typ.IntegerType()),
              ('FATHER_EDUCATION', typ.StringType()),
              ('MONTH_PRECARE_RECODE', typ.StringType()),
              ('CIG_BEFORE', typ.IntegerType()),
              ('CIG_1_TRI', typ.IntegerType()), ('CIG_2_TRI',
                                                 typ.IntegerType()),
              ('CIG_3_TRI', typ.IntegerType()),
              ('MOTHER_HEIGHT_IN', typ.IntegerType()),
              ('MOTHER_BMI_RECODE', typ.IntegerType()),
              ('MOTHER_PRE_WEIGHT', typ.IntegerType()),
Code Example #3
    types.StructField('2008',types.DoubleType()),
    types.StructField('2009',types.DoubleType()),
    types.StructField('2010',types.DoubleType()),
    types.StructField('2011',types.DoubleType()),
    types.StructField('2012',types.DoubleType()),
    types.StructField('2013',types.DoubleType()),
    types.StructField('2014',types.DoubleType()),
    types.StructField('2015',types.DoubleType()),
    types.StructField('2016',types.DoubleType()),
    types.StructField('2017',types.DoubleType()),
    types.StructField('2018',types.DoubleType()),
    types.StructField('2019',types.DoubleType()),
    types.StructField('2020',types.DoubleType()),
    types.StructField('2021',types.DoubleType()),
    types.StructField('2022',types.DoubleType()),
    types.StructField('Estimates Start After',types.IntegerType())    
])

FDindex_schema = types.StructType([
    types.StructField('ifs', types.IntegerType()),
    types.StructField('code', types.StringType()),
    types.StructField('country', types.StringType()),
    types.StructField('imf_region', types.StringType()),
    types.StructField('imf_income', types.StringType()),
    types.StructField('year', types.IntegerType()),
    types.StructField('FD', types.DoubleType()),
    types.StructField('FI', types.DoubleType()),
    types.StructField('FM', types.DoubleType()),
    types.StructField('FID',types.DoubleType()),
    types.StructField('FIA',types.DoubleType()),
    types.StructField('FIE', types.DoubleType()),
Code Example #4
import sys
from pyspark.sql import SparkSession, functions, types
import pandas as pd

spark = SparkSession.builder.appName('wikipedia popular').getOrCreate()

assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.1'  # make sure we have Spark 2.1+

schema = types.StructType([
    types.StructField('language', types.StringType(), False),
    types.StructField('page', types.StringType(), False),
    types.StructField('views', types.IntegerType(), False),
    types.StructField('bytes', types.IntegerType(), False),
])


def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]

    def split(filename):
        file = filename.rsplit('/', 1)[1]
        date = file.split("-", 1)[1]
        name = date[:-7]
        return name

    path_to_hour = functions.udf(split, returnType=types.StringType())
    data = spark.read.csv(in_directory, sep=" ", schema=schema).withColumn(
        'filename', path_to_hour(functions.input_file_name()))
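    # A minimal sketch of an assumed continuation: keep only English pages and find
    # the most-viewed page per extracted hour; the filtering rules are assumptions.
    en_pages = data.filter(data['language'] == 'en')
    max_views = en_pages.groupBy('filename').agg(
        functions.max('views').alias('views'))
    most_viewed = en_pages.join(max_views, on=['filename', 'views'])
    most_viewed.sort('filename', 'page').write.csv(out_directory, mode='overwrite')


if __name__ == '__main__':
    main()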
Code Example #5
File: conftest.py  Project: vishalbelsare/ibis
def get_common_spark_testing_client(data_directory, connect):
    spark = (SparkSession.builder.config('spark.default.parallelism',
                                         4).config('spark.driver.bindAddress',
                                                   '127.0.0.1').getOrCreate())
    _spark_testing_client = connect(spark)
    s = _spark_testing_client._session
    num_partitions = 4

    df_functional_alltypes = (
        s.read.csv(
            path=str(data_directory / 'functional_alltypes.csv'),
            schema=pt.StructType([
                pt.StructField('index', pt.IntegerType(), True),
                pt.StructField('Unnamed: 0', pt.IntegerType(), True),
                pt.StructField('id', pt.IntegerType(), True),
                # cast below, Spark can't read 0/1 as bool
                pt.StructField('bool_col', pt.ByteType(), True),
                pt.StructField('tinyint_col', pt.ByteType(), True),
                pt.StructField('smallint_col', pt.ShortType(), True),
                pt.StructField('int_col', pt.IntegerType(), True),
                pt.StructField('bigint_col', pt.LongType(), True),
                pt.StructField('float_col', pt.FloatType(), True),
                pt.StructField('double_col', pt.DoubleType(), True),
                pt.StructField('date_string_col', pt.StringType(), True),
                pt.StructField('string_col', pt.StringType(), True),
                pt.StructField('timestamp_col', pt.TimestampType(), True),
                pt.StructField('year', pt.IntegerType(), True),
                pt.StructField('month', pt.IntegerType(), True),
            ]),
            mode='FAILFAST',
            header=True,
        ).repartition(num_partitions).sort('index'))

    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean"))
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = (s.read.csv(
        path=str(data_directory / 'batting.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('stint', pt.IntegerType(), True),
            pt.StructField('teamID', pt.StringType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('G', pt.IntegerType(), True),
            pt.StructField('AB', pt.DoubleType(), True),
            pt.StructField('R', pt.DoubleType(), True),
            pt.StructField('H', pt.DoubleType(), True),
            pt.StructField('X2B', pt.DoubleType(), True),
            pt.StructField('X3B', pt.DoubleType(), True),
            pt.StructField('HR', pt.DoubleType(), True),
            pt.StructField('RBI', pt.DoubleType(), True),
            pt.StructField('SB', pt.DoubleType(), True),
            pt.StructField('CS', pt.DoubleType(), True),
            pt.StructField('BB', pt.DoubleType(), True),
            pt.StructField('SO', pt.DoubleType(), True),
            pt.StructField('IBB', pt.DoubleType(), True),
            pt.StructField('HBP', pt.DoubleType(), True),
            pt.StructField('SH', pt.DoubleType(), True),
            pt.StructField('SF', pt.DoubleType(), True),
            pt.StructField('GIDP', pt.DoubleType(), True),
        ]),
        header=True,
    ).repartition(num_partitions).sort('playerID'))
    df_batting.createOrReplaceTempView('batting')

    df_awards_players = (s.read.csv(
        path=str(data_directory / 'awards_players.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('awardID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('tie', pt.StringType(), True),
            pt.StructField('notes', pt.StringType(), True),
        ]),
        header=True,
    ).repartition(num_partitions).sort('playerID'))
    df_awards_players.createOrReplaceTempView('awards_players')

    df_simple = s.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = s.createDataFrame([((1, 2, 'a'), )], ['struct_col'])
    df_struct.createOrReplaceTempView('struct')

    df_nested_types = s.createDataFrame(
        [([1, 2], [[3, 4], [5, 6]], {
            'a': [[2, 4], [3, 5]]
        })],
        [
            'list_of_ints',
            'list_of_list_of_ints',
            'map_string_list_of_list_of_ints',
        ],
    )
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = s.createDataFrame([({
        (1, 3): [[2, 4], [3, 5]]
    }, )], ['map_tuple_list_of_list_of_ints'])
    df_complicated.createOrReplaceTempView('complicated')

    df_udf = s.createDataFrame(
        [('a', 1, 4.0, 'a'), ('b', 2, 5.0, 'a'), ('c', 3, 6.0, 'b')],
        ['a', 'b', 'c', 'key'],
    )
    df_udf.createOrReplaceTempView('udf')

    df_udf_nan = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(10, dtype=float),
            'b': [3.0, np.NaN] * 5,
            'key': list('ddeefffggh'),
        }))
    df_udf_nan.createOrReplaceTempView('udf_nan')

    df_udf_null = s.createDataFrame(
        [(float(i), None if i % 2 else 3.0, 'ddeefffggh'[i])
         for i in range(10)],
        ['a', 'b', 'key'],
    )
    df_udf_null.createOrReplaceTempView('udf_null')

    df_udf_random = s.createDataFrame(
        pd.DataFrame({
            'a':
            np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'b':
            np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'key':
            list('ddeefff'),
        }))
    df_udf_random.createOrReplaceTempView('udf_random')

    return _spark_testing_client
Code Example #6
def main():
    get_pred_error = functions.udf(get_sq_error, types.FloatType())
    convert_to_int = functions.udf(lambda value: int(value), types.IntegerType())
    get_day_of_week = functions.udf(lambda timestamp: datetime.strptime(timestamp, '%Y-%m-%d').weekday(), types.StringType())
    get_month = functions.udf(lambda timestamp: datetime.strptime(timestamp, '%Y-%m-%d').month,
                                    types.StringType())
    training_data = spark.read.csv('train_rating.txt', header=True)
    test_data = spark.read.csv('test_rating.txt', header=True)

    for column in training_data.columns:
        if(column != 'date'):
            training_data = training_data.withColumn(column, convert_to_int(training_data[column]))
        else:
            training_data = training_data.withColumn('dow', get_day_of_week(training_data[column]))
            training_data = training_data.withColumn('month', get_month(training_data[column]))

    for column in test_data.columns:
        if (column != 'date'):
            test_data = test_data.withColumn(column, convert_to_int(test_data[column]))
        else:
            test_data = test_data.withColumn('dow', get_day_of_week(test_data[column]))
            test_data = test_data.withColumn('month', get_month(test_data[column]))

    training_data = training_data.drop('date')
    test_data = test_data.drop('date')


    discrete_columns = ['dow', 'month']
    string_indexer = [StringIndexer(inputCol='{}'.format(column), outputCol='{}_ind'.format(column)) for column in discrete_columns]
    hot_encoders = [OneHotEncoder(inputCol='{}_ind'.format(column), outputCol='{}_he'.format(column)) for column in discrete_columns]
    vector_assembler = VectorAssembler(inputCols=["user_id", "business_id", "dow_he", "month_he"], outputCol="features")

    rf = RandomForestClassifier(numTrees=25, maxDepth=10, labelCol="rating", seed=42)


    models = [
        ('Rand-forest', Pipeline(stages=string_indexer + hot_encoders + [vector_assembler, rf]))
    ]

    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol='rating')

    # split data into training and testing
    train, test = training_data.randomSplit([0.8, 0.2])
    train = train.cache()
    test = test.cache()

    for label, pipeline in models:

        model = pipeline.fit(train)
        predictions = model.transform(test)
        predictions = predictions.withColumn('sq_error', get_pred_error(predictions['rating'], predictions['prediction']))
        rmse_score = predictions.groupBy().avg('sq_error').head()[0]
        # calculate a score
        score = evaluator.evaluate(predictions)
        print(label, rmse_score ** 0.5)

    #Uncomment when you are satisfied with training
    # final_pred = model.transform(test_data)
    # final_pred = final_pred.withColumnRenamed('prediction', 'rating')
    # final_pred.select("test_id","rating").toPandas().to_csv('submission.csv', sep=',', encoding='utf-8',index=False)
    return
Code Example #7
Zhukun Luo
Jiangxi University of Finance and Economics
'''
# import the Spark SQL packages
from pyspark.sql import SparkSession
from pyspark.sql import types as T
# create a Spark session
spark = (
    SparkSession.builder.master("local[4]")  # use 4 local threads
    .appName("Exploring Joins").config("spark.some.config.option",
                                       "some-value").getOrCreate())
sc = spark.sparkContext

# define a schema for the data
schema = T.StructType([  # column definitions
    T.StructField("user_id", T.IntegerType(), False),
    T.StructField("name", T.StringType(), True),
    T.StructField("sex", T.StringType(), True),
    T.StructField("age", T.IntegerType(), True),
])

data = [  # sample data
    (1, "ming Li", "male", 13),
    (2, "fang Zhang", "female", 12),
    (2, "hong Wang", "female", 1),
]

user_df = spark.createDataFrame(  # create a DataFrame
    data=data, schema=schema)

user_df.toPandas()
Code Example #8
        select('id', 'connections_count')

    # Aggregate the total connection count per superhero id.
    superhero_connections_by_id_df = superhero_connections_df.groupBy('id'). \
        agg(func.sum('connections_count').alias('connections_freq'))

    superhero_connections_by_id_df.show()

    # Find min frequency in superhero connections.
    # first() converts the result to a Row object
    min_frequency = superhero_connections_by_id_df.agg(func.min(func.col('connections_freq')).alias('min_freq')). \
        first()['min_freq']

    # Load the dataset of superhero names.
    superhero_names_schema = types.StructType([
        types.StructField("id", types.IntegerType(), False),
        types.StructField("name", types.StringType(), False)
    ])
    superhero_names_df = spark.read.schema(superhero_names_schema).csv(
        'dataset/Marvel+Names', sep=' ').cache()

    # Get the superheroes whose connection frequency equals the minimum.
    # First filter, then join the two dataframes.
    most_obscure_superheros = superhero_connections_by_id_df.filter(func.col('connections_freq') == min_frequency).\
        join(superhero_names_df, on='id')

    most_obscure_superheros.show()

    spark.stop()
Code Example #9
def process_immigration_data(spark, input_data, output_data, date_string):
    # Get execution_date
    execution_date = stringToDatetimeYYYYMMDD(date_string)
    # Extract the last two digits from the year
    year = datetimeToYearShort(execution_date)
    # Extract the short month name, e.g. 'apr'
    month = datetimeToMonthShort(execution_date)
    # Extract the day of month
    day = datetimeToDayClasic(execution_date)
    path = input_data + "i94_{0}{1}_sub.sas7bdat".format(month.lower(), year)
    df_immigration_data = spark.read.format(
        'com.github.saurfang.sas.spark').load(path)
    # df_immigration_data = df_immigration_data.filter('date="2016-04-06"')
    # df_immigration_data = spark.read.options(delimiter=",", header=True, encoding="UTF-8").csv(
    #     input_data)
    # trim spaces for arrdate
    df_immigration_data = trimStrings(df_immigration_data, ['arrdate'])
    # Set arrdate as integer because it is a timestamp
    df_immigration_data = df_immigration_data.withColumn(
        "arrdate", df_immigration_data.arrdate.cast("integer"))
    # transform the timestamp into a date
    df_immigration_data = df_immigration_data.withColumn(
        "date",
        udf_date_timedelta(df_immigration_data.arrdate).cast("date"))
    # create the day column
    df_immigration_data = df_immigration_data.withColumn(
        'day', datetimeToDay(df_immigration_data.date))
    #filter by day column
    df_immigration_data.createOrReplaceTempView("immigration_data")
    df_immigration_data = spark.sql(
        """SELECT arrdate,day,date,admnum,i94cit,i94res,
    i94bir,i94port,i94mode,
    i94addr,i94visa,gender,i94yr,i94mon 
    FROM immigration_data 
    WHERE day={0}""".format(day))

    # trim spaces for the rest of the columns
    df_immigration_data = trimStrings(df_immigration_data, [
        'admnum', 'i94cit', 'i94res', 'i94bir', 'i94port', 'i94mode',
        'i94addr', 'i94visa', 'gender'
    ])
    ### Set the rest of columns for time table
    # transform the timestamp into a datetime object to be able to get the hour
    df_immigration_data = df_immigration_data.withColumn(
        "datetime", get_datetime(df_immigration_data.arrdate))
    df_immigration_data = df_immigration_data.withColumnRenamed(
        'i94yr', 'year')
    df_immigration_data = df_immigration_data.withColumn(
        'hour', datetimeToHour(df_immigration_data.datetime))
    df_immigration_data = df_immigration_data.withColumn(
        'week', datetimeToWeek(df_immigration_data.date))
    df_immigration_data = df_immigration_data.withColumn(
        'day', datetimeToDay(df_immigration_data.date))
    df_immigration_data = df_immigration_data.withColumnRenamed(
        'i94mon', 'month')
    df_immigration_data = df_immigration_data.withColumn(
        'weekday', datetimeToWeekDay(df_immigration_data.date))

    df_immigration_data = df_immigration_data['admnum', 'arrdate', 'year',
                                              'month', 'day', 'weekday',
                                              'hour', 'week', 'i94cit',
                                              'i94res', 'i94bir', 'i94port',
                                              'i94mode', 'i94addr', 'i94visa',
                                              'gender']
    # ensure the columns are delivered with the right data types
    df_immigration_data = df_immigration_data \
        .withColumn("admnum", castAbsInt_udf(F.col("admnum"))) \
        .withColumn("arrdate", castInt_udf(F.col("arrdate"))) \
        .withColumn("year", F.col("year").cast(T.IntegerType())) \
        .withColumn("month", F.col("month").cast(T.IntegerType())) \
        .withColumn("day", F.col("day").cast(T.IntegerType())) \
        .withColumn("weekday", F.col("weekday").cast(T.StringType())) \
        .withColumn("hour", F.col("hour").cast(T.IntegerType())) \
        .withColumn("week", F.col("week").cast(T.IntegerType())) \
        .withColumn("i94cit", castInt_udf(F.col("i94cit"))) \
        .withColumn("i94res", castInt_udf(F.col("i94res"))) \
        .withColumn("i94bir", castInt_udf(F.col("i94bir"))) \
        .withColumn("i94port", F.col("i94port").cast(T.StringType())) \
        .withColumn("i94mode", castInt_udf(F.col("i94mode"))) \
        .withColumn("i94addr", F.col("i94addr").cast(T.StringType())) \
        .withColumn("i94visa", castInt_udf(F.col("i94visa"))) \
        .withColumn("gender", F.col("gender").cast(T.StringType()))
    # filter immigrants without admission number
    df_immigration_data = df_immigration_data.na.drop(subset=["admnum"])
    df_immigration_data.write.mode("overwrite")\
        .parquet(
        output_data + "immigration")
Code Example #10
    # is the input JSON?
    "es.input.json": "yes"
}

## is there a field in the mapping that should be used to specify the ES document ID
# "es.mapping.id": "id"
# Define Training Set Structure
tweetSchema = tp.StructType([
    # Todo use proper timestamp
    tp.StructField(name='timestamp', dataType=tp.LongType(), nullable=True),
    tp.StructField(name='tweet', dataType=tp.StringType(), nullable=True)
])

schema = tp.StructType([
    tp.StructField(name='id', dataType=tp.StringType(), nullable=True),
    tp.StructField(name='subjective', dataType=tp.IntegerType(),
                   nullable=True),
    tp.StructField(name='positive', dataType=tp.IntegerType(), nullable=True),
    tp.StructField(name='negative', dataType=tp.IntegerType(), nullable=True),
    tp.StructField(name='ironic', dataType=tp.IntegerType(), nullable=True),
    tp.StructField(name='lpositive', dataType=tp.IntegerType(), nullable=True),
    tp.StructField(name='lnegative', dataType=tp.IntegerType(), nullable=True),
    tp.StructField(name='top', dataType=tp.IntegerType(), nullable=True),
    tp.StructField(name='tweet', dataType=tp.StringType(), nullable=True)
])

# Create Spark Context
sc = SparkContext(appName="Tweet")
spark = SparkSession(sc)

sc.setLogLevel("WARN")
Code Example #11
import datetime as dt

spark = get_spark_context(__name__)

df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "prices")
    .option("startingOffsets", "earliest")
    .load()
    .selectExpr(
        "CAST(key AS STRING)", 
        "CAST(value AS STRING)",
        "timestamp",
    )
    .select(
        psf.col("key").cast(pst.IntegerType()).alias("match_id"),
        psf.col("value").alias("body"),
        "timestamp",
        psf.to_date(psf.col("timestamp")).alias("arrival_date")
    )
    .coalesce(1)
    .writeStream.format("json")
    .outputMode("append")
    .option("checkpointLocation", f"./_checkpoints/raw")
    .partitionBy("arrival_date")
    .trigger(processingTime='1 minute')
    .start("./raw/prices")
)

spark.streams.awaitAnyTermination()
Code Example #12
    def data_describe(self):
        sqlContext = SQLContext(self.sc)
        print('start to read data after explore_spark_step1_cross:')
        rootPath=self.parser.get("hdfs_path", "hdfs_data_path")
        print('start to read actLog_test_single_cross')
        test_file_path = rootPath + 'actLog_test_single_cross'
        actLog_test_rdd = self.sc.pickleFile(test_file_path)
        # check that the labels below match the data
        labels=[  ('duration_time',typ.IntegerType()),
                ('device',typ.IntegerType()),
                ('music_id',typ.IntegerType()),
                ('item_city',typ.IntegerType()),
                ('author_id',typ.IntegerType()),
                ('item_id',typ.IntegerType()),
                ('user_city',typ.IntegerType()),
                ('uid',typ.IntegerType()),
                ('channel',typ.IntegerType()),
                ('finish',typ.IntegerType()),
                ('like',typ.IntegerType()),
                ('time_day',typ.IntegerType()),
                ('item_pub_month',typ.IntegerType()),
                ('item_pub_day',typ.LongType()),
                ('item_pub_hour',typ.IntegerType()),
                ('item_pub_minute',typ.IntegerType()),
                ('uid_count_bin',typ.IntegerType()),
                ('user_city_count_bin',typ.IntegerType()),
                ('user_city_count_ratio',typ.DoubleType()),
                ('item_id_count_bin',typ.IntegerType()),
                ('item_id_count_ratio',typ.DoubleType()),
                ('author_id_count_bin',typ.IntegerType()),
                ('author_id_count_ratio',typ.DoubleType()),
                ('item_city_count_bin',typ.IntegerType()),
                ('item_city_count_ratio',typ.DoubleType()),
                ('music_id_count_bin',typ.IntegerType()),
                ('music_id_count_ratio',typ.DoubleType()),
                ('device_count_bin',typ.IntegerType()),
                ('device_count_ratio',typ.DoubleType()),
                ('uid_author_id_count_bin',typ.IntegerType()),
                ('uid_author_id_count_ratio',typ.DoubleType()),
                 ('uid_item_city_count_bin',typ.IntegerType()),
                ('uid_item_city_count_ratio',typ.DoubleType()),
                ('uid_channel_count_bin',typ.IntegerType()),
                ('uid_channel_count_ratio',typ.DoubleType()),
                ('uid_music_id_count_bin',typ.IntegerType()),
                ('uid_music_id_count_ratio',typ.DoubleType()),
                ('uid_device_count_bin',typ.IntegerType()),
                ('uid_device_count_ratio',typ.DoubleType()),
                ('author_id_channel_count_bin',typ.IntegerType()),
                ('author_id_channel_count_ratio',typ.DoubleType()),
                ('author_id_user_city_count_bin',typ.IntegerType()),
                ('author_id_user_city_count_ratio',typ.DoubleType()),
                ('author_id_item_city_count_bin',typ.IntegerType()),
                ('author_id_item_city_count_ratio',typ.DoubleType()),
                ('author_id_music_id_count_bin',typ.IntegerType()),
                ('author_id_music_id_count_ratio',typ.DoubleType()),
                ('uid_channel_device_count_bin',typ.IntegerType()),  # renamed to uid_channel_device
                ('uid_channel_device_count_ratio',typ.DoubleType()),  # renamed to uid_channel_device
                ('author_id_item_city_music_id_count_bin',typ.IntegerType()),
                ('author_id_item_city_music_id_count_ratio',typ.DoubleType()),
            ]
        actionLogSchema=typ.StructType([typ.StructField(e[0],e[1],True) for e in labels])

        df_actLog_test = sqlContext.createDataFrame(actLog_test_rdd,actionLogSchema)
        # df_actLog_test.show(1,truncate=False)

        print('start to read actLog_train_single_cross')
        train_file_path = rootPath + 'actLog_train_single_cross'
        actLog_train_rdd = self.sc.pickleFile(train_file_path)
        df_actLog_train = sqlContext.createDataFrame(actLog_train_rdd,actionLogSchema)
        # df_actLog_train.show(1,truncate=False)


        return df_actLog_train, df_actLog_test
Code Example #13
    def data_explore(self,df_train,df_test):

        sqlContext = SQLContext(self.sc)

        print("对item_pub_hour进行离散化")
        def hourBin(x):
            if x>=23 or x <=2:
                return 1
            elif 3<=x<8:
                return 2
            elif 8<=x<12:
                return 3
            else:
                return 4

        converHourBin=udf(lambda x :hourBin(x), typ.IntegerType())
        df_train = df_train.withColumn("item_pub_hour", converHourBin(df_train.item_pub_hour))
        df_test = df_test.withColumn("item_pub_hour", converHourBin(df_test.item_pub_hour))

        print("----1、计算统计特征:用户特征和item特征之间的条件概率---------")
        feats_list = []

        condition = ['uid']
        authors = ['music_id','item_pub_hour']  #'author_id', 'item_city', 'channel',
        feats_list.extend([[u_col, a_col] for u_col in condition for a_col in authors])
        df_tmp=df_train.select(condition)
        df2=df_tmp.groupby(condition).count().withColumnRenamed('count',condition[0]+'_count')
        # df2.show(1,truncate=False) # ['uid','uid_count']
        df2.cache()
        # df_train=df_train.join(df2,condition,'left')
        # df_train.show(1,truncate=False)
        # cannot resolve '`uid_count`' given input columns: [time, user_city, like, author_id, uid, device, music_id, finish, duration_time, channel, item_city, item_id]
        # del df2
        # gc.collect()
        for feature_group in feats_list:
            print(feature_group+[feature_group[0]+'_count'])   #+[feature_group[0]+'_count']
            df1=df_train.select(feature_group).groupby(feature_group).count()
            # df1.show(1,truncate=False)   # in theory there are still only 3 columns here; uid_count is not included
            df1=df1.join(df2,condition,'left')
            df1.show(1,truncate=False)   #|uid|item_pub_hour|count|uid_count
            df1=df1.withColumn(feature_group[1]+'_'+feature_group[0]+"_condition_ratio",fn.col('count')/fn.col(feature_group[0]+'_count'))
            df1=df1.drop('count').drop(feature_group[0]+'_count')
            df1.show(1,truncate=False)
            print(df_train.columns)
            print(df1.columns)
            df_train=df_train.join(df1,feature_group,"left")   #|uid|item_pub_hour|item_pub_hour_uid_condition_ratio
            df_train.show(1,truncate=False)
            df_test=df_test.join(df1,feature_group,"left").na.fill({feature_group[1]+'_'+feature_group[0]+"_condition_ratio":0})  # fill missing values in this column
            df_test.show(1,truncate=False)



        feats_list = []
        condition = ['item_id']
        authors = ['user_city', 'channel']
        feats_list.extend([[u_col, a_col] for u_col in condition for a_col in authors])

        df_tmp=df_train.select(condition)
        df2=df_tmp.groupby(condition).count().withColumnRenamed('count',condition[0]+'_count')
        # df2.show(1,truncate=False) # ['uid','uid_count']
        df2.cache()
        # df_train=df_train.join(df2,condition,'left')
        # df_train.show(1,truncate=False)
        # cannot resolve '`uid_count`' given input columns: [time, user_city, like, author_id, uid, device, music_id, finish, duration_time, channel, item_city, item_id]
        # del df2
        # gc.collect()
        for feature_group in feats_list:
            print(feature_group+[feature_group[0]+'_count'])   #+[feature_group[0]+'_count']
            df1=df_train.select(feature_group).groupby(feature_group).count()
            # df1.show(1,truncate=False)   # in theory there are still only 3 columns here; uid_count is not included
            df1=df1.join(df2,condition,'left')
            df1.show(1,truncate=False)
            df1=df1.withColumn(feature_group[1]+'_'+feature_group[0]+"_condition_ratio",fn.col('count')/fn.col(feature_group[0]+'_count'))
            df1=df1.drop('count').drop(feature_group[0]+'_count')
            # df1.show(5)
            df_train=df_train.join(df1,feature_group,"left")
            df_train.show(1,truncate=False)
            df_test=df_test.join(df1,feature_group,"left").na.fill({feature_group[1]+'_'+feature_group[0]+"_condition_ratio":0})  # fill missing values in this column
            df_test.show(1,truncate=False)


        df_train=df_train.drop('uid_count').drop('item_id_count')
        df_train.printSchema()
        df_test.printSchema()

        print('-------5. save the preprocessing results-------')
        test_file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'actLog_test_step3_try'
        os.system("hadoop fs -rm -r {}".format(test_file_path))
        df_test.rdd.map(tuple).saveAsPickleFile(test_file_path)

        del df_test
        gc.collect()

        train_file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'actLog_train_step3_try'
        os.system("hadoop fs -rm -r {}".format(train_file_path))  #os.system(command) 其参数含义如下所示: command 要执行的命令
        df_train.rdd.map(tuple).saveAsPickleFile(train_file_path)
Code Example #14
from pyspark.sql import SparkSession, functions as F, types as T
from pyspark import StorageLevel as S

from datetime import datetime as dt
import os

# ## Connect to the Spark cluster
spark = SparkSession\
  .builder\
  .appName("ProcessDoubleClick")\
  .getOrCreate()

# # Data Import
# Read in raw data
impressionFields = [
    T.StructField('advertiserID', T.IntegerType(), False),
    T.StructField('domain', T.StringType(), False),
    T.StructField('viewable', T.BooleanType(), False),
    T.StructField('city', T.StringType(), False),
    T.StructField('mobileDevice', T.StringType(), False),
    T.StructField('country', T.StringType(), False),
    T.StructField('sellerPrice', T.IntegerType(), False),
    T.StructField('userID', T.IntegerType(), False),
    T.StructField('impressionID', T.IntegerType(), False),
    T.StructField('postalCode', T.StringType(), False),
    T.StructField('carrier', T.StringType(), False),
    T.StructField('eventType', T.StringType(), False),
    T.StructField('lineItemID', T.IntegerType(), False),
    T.StructField('time', T.TimestampType(), False),
    T.StructField('duration', T.IntegerType(), False),
    T.StructField('browser', T.StringType(), False),
Code Example #15
def lookup(mapping):
    # Return a UDF that maps each value to its integer index in the given list.
    def fn(v):
        return mapping.index(v)
    return F.udf(fn, returnType=T.IntegerType())
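A hypothetical usage sketch, assuming an active SparkSession named spark and the usual F alias for pyspark.sql.functions; the colour vocabulary is invented for illustration:

colours = ['red', 'green', 'blue']
df = spark.createDataFrame([('red',), ('blue',)], ['colour'])
df = df.withColumn('colour_id', lookup(colours)(F.col('colour')))
df.show()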
Code Example #16
File: ingest.py  Project: dengxixi/pipline
from pyspark.sql import SparkSession
from pyspark.sql import types as T

from pyspark import SparkContext


def parse(line):
    items = line.split(",")
    return (int(items[0]), items[1], int(items[2]))


sc = SparkContext()

spark = SparkSession.builder.master("local").config(
    "spark.some.config.option", "some-value").getOrCreate()

schema = T.StructType([
    T.StructField("class", T.IntegerType(), True),
    T.StructField("name", T.StringType(), True),
    T.StructField("score", T.IntegerType(), True),
])

rdd = sc.textFile('/Users/zdeng-ext/school/test.csv').map(parse)
df = spark.createDataFrame(rdd, schema)

df.write.format('com.databricks.spark.avro').save("people_avro")
Code Example #17
File: temp_range.py  Project: kacy12/bigdata
import sys
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
from pyspark.sql import SparkSession, functions, types
spark = SparkSession.builder.appName('temp_range_dataframe').getOrCreate()
assert spark.version >= '2.4'  # make sure we have Spark 2.4+

observation_schema = types.StructType([
    types.StructField('station', types.StringType()),
    types.StructField('date', types.StringType()),
    types.StructField('observation', types.StringType()),
    types.StructField('value', types.IntegerType()),
    types.StructField('mflag', types.StringType()),
    types.StructField('qflag', types.StringType()),
    types.StructField('sflag', types.StringType()),
    types.StructField('obstime', types.StringType()),
])


def main(inputs, output):

    weather = spark.read.csv(inputs, schema=observation_schema)
    filter_weather = weather.filter(weather.qflag.isNull()).cache()

    Max_weather = filter_weather.filter(
        weather.observation == ('TMAX')).withColumn('tmax', weather.value)
    Max_temperature = Max_weather.select('station', 'date', 'tmax')
    Max_temperature.show(10)

    Min_weather = filter_weather.filter(
        weather.observation == ('TMIN')).withColumn('tmin', weather.value)
    Min_temperature = Min_weather.select('station', 'date', 'tmin')
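    # A minimal sketch of an assumed continuation: join the max/min frames on station
    # and date, compute the daily range, and keep the station(s) with the largest range.
    joined = Max_temperature.join(Min_temperature, on=['station', 'date'])
    t_range = joined.withColumn('range', joined['tmax'] - joined['tmin'])
    max_range = t_range.groupBy('date').agg(functions.max('range').alias('range'))
    result = t_range.join(max_range, on=['date', 'range']).select('date', 'station', 'range')
    result.sort('date').write.csv(output, mode='overwrite')


if __name__ == '__main__':
    inputs = sys.argv[1]
    output = sys.argv[2]
    main(inputs, output)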
Code Example #18
                config = tf.ConfigProto(device_count={'GPU': 0})
                config.inter_op_parallelism_threads = 1
                config.intra_op_parallelism_threads = 1
                K.set_session(tf.Session(config=config))

            # Restore from checkpoint.
            model = deserialize_model(model_bytes, tf.keras.models.load_model)

            # Perform predictions.
            for row in rows:
                fields = row.asDict().copy()
                # Convert from log domain to real Sales numbers.
                log_sales = model.predict_on_batch([[row[col]]
                                                    for col in all_cols])[0]
                # Add 'Sales' column with prediction results.
                fields['Sales'] = math.exp(log_sales)
                yield Row(**fields)

        return fn

    # Submit a Spark job to do inference. Horovod framework is not involved here.
    pred_df = spark.read.parquet('%s/test_df.parquet' % args.data_dir) \
        .rdd.mapPartitions(predict_fn(best_model_bytes)).toDF()
    submission_df = pred_df.select(pred_df.Id.cast(T.IntegerType()),
                                   pred_df.Sales).toPandas()
    submission_df.sort_values(by=['Id']).to_csv(args.local_submission_csv,
                                                index=False)
    print('Saved predictions to %s' % args.local_submission_csv)

    spark.stop()
Code Example #19
def _generate_select_expression_for_extended_string_to_int(source_column, name):
    """
    More robust conversion from StringType to IntegerType.
    Is able to additionally handle (compared to implicit Spark conversion):

        * Preceding whitespace
        * Trailing whitespace
        * Preceding and trailing whitespace
        * underscores as thousand separators

    Hint
    ----
    Please have a look at the tests to get a better feeling for how it behaves under
    tests/unit/transformer/test_mapper_custom_data_types.py::TestExtendedStringConversions and
    tests/data/test_fixtures/mapper_custom_data_types_fixtures.py

    Example
    -------
    >>> from spooq2.transformer import Mapper
    >>>
    >>> input_df.head(3)
    [Row(input_string="  123456 "),
     Row(input_string="Hello"),
     Row(input_string="123_456")]
    >>> mapping = [("output_value", "input_string", "extended_string_to_int")]
    >>> output_df = Mapper(mapping).transform(input_df)
    >>> output_df.head(3)
    [Row(input_string=123456),
     Row(input_string=None),
     Row(input_string=123456)]
    """
    return _generate_select_expression_for_extended_string_to_long(source_column, name).cast(T.IntegerType())
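For reference, a minimal self-contained sketch of the kind of expression the delegated _generate_select_expression_for_extended_string_to_long helper could build (trim whitespace, drop underscore separators, then cast); the actual spooq2 implementation may differ:

from pyspark.sql import functions as F, types as T

def extended_string_to_int_sketch(source_column, name):
    # Trim surrounding whitespace and drop underscore thousand separators, then cast;
    # strings that are still not numeric become NULL, matching the docstring above.
    cleaned = F.regexp_replace(F.trim(source_column), "_", "")
    return cleaned.cast(T.LongType()).cast(T.IntegerType()).alias(name)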
Code Example #20
    def prepare_df(df):
        num_rows = df.count()

        # Expand dates.
        df = expand_date(df)

        df = df \
            .withColumn('Open', df.Open != '0') \
            .withColumn('Promo', df.Promo != '0') \
            .withColumn('StateHoliday', df.StateHoliday != '0') \
            .withColumn('SchoolHoliday', df.SchoolHoliday != '0')

        # Merge in store information.
        store = store_csv.join(store_states_csv, 'Store')
        df = df.join(store, 'Store')

        # Merge in Google Trend information.
        google_trend_all = prepare_google_trend()
        df = df.join(google_trend_all,
                     ['State', 'Year', 'Week']).select(df['*'],
                                                       google_trend_all.trend)

        # Merge in Google Trend for whole Germany.
        google_trend_de = google_trend_all[google_trend_all.file ==
                                           'Rossmann_DE']
        google_trend_de = google_trend_de.withColumnRenamed(
            'trend', 'trend_de')
        df = df.join(google_trend_de,
                     ['Year', 'Week']).select(df['*'],
                                              google_trend_de.trend_de)

        # Merge in weather.
        weather = weather_csv.join(
            state_names_csv, weather_csv.file == state_names_csv.StateName)
        df = df.join(weather, ['State', 'Date'])

        # Fix null values.
        df = df \
            .withColumn('CompetitionOpenSinceYear', F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900))) \
            .withColumn('CompetitionOpenSinceMonth', F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1))) \
            .withColumn('Promo2SinceYear', F.coalesce(df.Promo2SinceYear, F.lit(1900))) \
            .withColumn('Promo2SinceWeek', F.coalesce(df.Promo2SinceWeek, F.lit(1)))

        # Days & months competition was open, cap to 2 years.
        df = df.withColumn(
            'CompetitionOpenSince',
            F.to_date(
                F.format_string('%s-%s-15', df.CompetitionOpenSinceYear,
                                df.CompetitionOpenSinceMonth)))
        df = df.withColumn(
            'CompetitionDaysOpen',
            F.when(
                df.CompetitionOpenSinceYear > 1900,
                F.greatest(
                    F.lit(0),
                    F.least(
                        F.lit(360 * 2),
                        F.datediff(df.Date,
                                   df.CompetitionOpenSince)))).otherwise(0))
        df = df.withColumn('CompetitionMonthsOpen',
                           (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

        # Days & weeks of promotion, cap to 25 weeks.
        df = df.withColumn(
            'Promo2Since',
            F.expr(
                'date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)'
            ))
        df = df.withColumn(
            'Promo2Days',
            F.when(
                df.Promo2SinceYear > 1900,
                F.greatest(
                    F.lit(0),
                    F.least(F.lit(25 * 7),
                            F.datediff(df.Date,
                                       df.Promo2Since)))).otherwise(0))
        df = df.withColumn('Promo2Weeks',
                           (df.Promo2Days / 7).cast(T.IntegerType()))

        # Check that we did not lose any rows through inner joins.
        assert num_rows == df.count(), 'lost rows in joins'
        return df
Code Example #21
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')  # don't fail when on headless server
import matplotlib.pyplot as plt

from pyspark.sql import SparkSession, types

colour_schema = types.StructType([
    types.StructField('R', types.IntegerType(), False),
    types.StructField('G', types.IntegerType(), False),
    types.StructField('B', types.IntegerType(), False),
    types.StructField('word', types.StringType(), False),
    types.StructField('confidence', types.StringType(), False),
])


def rgb2lab_query(table_name='__THIS__',
                  passthrough_columns=None,
                  input_bytes=True,
                  r='R',
                  g='G',
                  b='B',
                  out_l='labL',
                  out_a='labA',
                  out_b='labB'):
    """
    Build SQL query to convert RGB colours to LAB colours.

    table_name: name of the input table to query from. Must be '__THIS__' if being used in a SQLTransformer.
    passthrough_columns: list of column names that should be preserved and selected into the resulting table.
Code Example #22
from pyspark.sql import SparkSession, types
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

def call_split(line):
    arr=line.split('|')
    #print("size: ",len(arr))
    return [int(arr[0]) if arr[0] else None,int(arr[1]) if arr[1] else None,arr[2],int(arr[3]) if arr[3] else None,int(arr[4]) if arr[4] else None,int(arr[5]) if arr[5] else None]

def savetheresult( rdd ):
    if not rdd.isEmpty():
        nrdd = rdd.map(call_split)
        df = sparkSess.createDataFrame(nrdd,schema)
        #df.show()
        #df.coalesce(2).write.mode("append").parquet("s3://bigdata-4/parquet/")
        df.coalesce(2).write.mode("append").parquet("s3://bigdata-4/post_history/")
        
sparkSess = SparkSession.builder.appName('post_history_table').getOrCreate()
sc = sparkSess.sparkContext
schema = types.StructType([
        types.StructField('unnamed', types.IntegerType()),
        types.StructField('id', types.IntegerType()),
        types.StructField('creation_date', types.StringType()),
        types.StructField('post_id', types.IntegerType()),
        types.StructField('post_history_type_id', types.IntegerType()),
        types.StructField('user_id', types.IntegerType())])
ssc = StreamingContext(sc, 1)
kvs = KafkaUtils.createDirectStream(ssc, ["post_history"], {"bootstrap.servers": "127.0.0.1:9092"})
lines = kvs.map(lambda x: x[1])
#lines.pprint()
lines2=lines.foreachRDD(savetheresult)
ssc.start()
ssc.awaitTermination()


Code Example #23
#get data
data = sqlContext.read.parquet(
    "CTU-Flows_main/Flows.parquet/_yyyymd={}".format(sys.argv[1]))
#data = sqlContext.read.parquet(sys.argv[1])

df = data.dropDuplicates()

#filter TCP flows with dstIP inside the university range 80-83 mask 22 and srcIP outside of it
df = data.filter(data.Proto == "tcp").filter(
    data.DstAddr.startswith("147.32.8")).filter(
        ~data.SrcAddr.startswith("147.32.8")).select("DstAddr", "Dport",
                                                     "State", "StartTime",
                                                     "SrcAddr")

#select day from timestamp and convert cast numbers to int
df = df.withColumn('Dport', df["Dport"].cast(T.IntegerType()))
df = df.withColumn(
    'timestamp',
    unix_timestamp('StartTime',
                   'yyyy/MM/dd hh:mm:ss.SSSSSS').cast(T.TimestampType()))
#add column for day
df = df.withColumn(
    'day',
    unix_timestamp('StartTime', 'yyyy/MM/dd').cast(T.TimestampType()))
df = df.filter(col('Dport').isNotNull())

#get flows from attackers
attackers = df.select('SrcAddr', 'DstAddr', 'Dport', "day").distinct().groupBy(
    'SrcAddr', "day").agg(
        F.countDistinct('DstAddr').alias('addrCount'),
        F.countDistinct('Dport').alias('portCount')).filter(
Code Example #24
import pyspark.sql.types as typ
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('spark0').getOrCreate()
import pyspark.ml.feature as ft
import pyspark.ml.classification as cl
import pyspark.ml.regression as reg
import pyspark.ml.clustering as clu
from pyspark.ml import Pipeline, PipelineModel
import pyspark.ml.evaluation as ev
import pyspark.ml.tuning as tune
# define the data schema
labels = [
    ('INFANT_ALIVE_AT_REPORT', typ.IntegerType()),
    ('BIRTH_PLACE', typ.StringType()),
    ('MOTHER_AGE_YEARS', typ.IntegerType()),
    ('FATHER_COMBINED_AGE', typ.IntegerType()),
    ('CIG_BEFORE', typ.IntegerType()),
    ('CIG_1_TRI', typ.IntegerType()),
    ('CIG_2_TRI', typ.IntegerType()),
    ('CIG_3_TRI', typ.IntegerType()),
    ('MOTHER_HEIGHT_IN', typ.IntegerType()),
    ('MOTHER_PRE_WEIGHT', typ.IntegerType()),
    ('MOTHER_DELIVERY_WEIGHT', typ.IntegerType()),
    ('MOTHER_WEIGHT_GAIN', typ.IntegerType()),
    ('DIABETES_PRE', typ.IntegerType()),
    ('DIABETES_GEST', typ.IntegerType()),
    ('HYP_TENS_PRE', typ.IntegerType()),
    ('HYP_TENS_GEST', typ.IntegerType()),
    ('PREV_BIRTH_PRETERM', typ.IntegerType())]

schema=typ.StructType([typ.StructField(e[0],e[1],False) for e in labels])
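# A minimal sketch of an assumed continuation: read the births data with the schema above
# and fit a simple logistic-regression pipeline on the numeric columns. The file path,
# split ratio, and model choice are assumptions, not part of the original example.
births = spark.read.csv('births_transformed.csv.gz', header=True, schema=schema)
numeric_cols = [name for name, dtype in labels
                if isinstance(dtype, typ.IntegerType) and name != 'INFANT_ALIVE_AT_REPORT']
assembler = ft.VectorAssembler(inputCols=numeric_cols, outputCol='features')
logistic = cl.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT', featuresCol='features')
pipeline = Pipeline(stages=[assembler, logistic])
births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)
model = pipeline.fit(births_train)
evaluator = ev.BinaryClassificationEvaluator(labelCol='INFANT_ALIVE_AT_REPORT')
print(evaluator.evaluate(model.transform(births_test)))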
Code Example #25
])

simple_df = spark.createDataFrame(simple_rdd)                       # optionally can give column names
#simple_df.printSchema()                                             # will print inferred schema. date is wrongly shown as string






#### Specify schema : faster

schema = [
      ('Date', types.DateType())
    , ('Name', types.StringType())
    , ('Age',  types.IntegerType())
    , ('Weight', types.IntegerType())
    , ('Location', types.StringType())
]

schema = types.StructType([types.StructField(e[0],e[1], False) for e in schema])

simple_df_schema = spark.createDataFrame(
      simple_rdd
        .map(lambda row:
             [dt.datetime.strptime(row[0], '%Y-%m-%d')] + row[1:]
            )
    , schema=schema
)

#simple_df_schema = spark.createDataFrame(simple_rdd,schema=schema)
Code Example #26
import sys
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('first Spark app').getOrCreate()

assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.1'  # make sure we have Spark 2.1+

schema = types.StructType([
    types.StructField('id', types.IntegerType(), False),
    types.StructField('x', types.FloatType(), False),
    types.StructField('y', types.FloatType(), False),
    types.StructField('z', types.FloatType(), False),
])


def main(in_directory, out_directory):
    # Read the data from the JSON files
    xyz = spark.read.json(in_directory, schema=schema)
    #xyz.show(); return

    # Create a DF with what we need: x, (soon y,) and id%10 which we'll aggregate by.
    with_bins = xyz.select(
        xyz['x'],
        xyz['y'],
        # TODO: also the y values
        (xyz['id'] % 10).alias('bin'),
    )
    #with_bins.show(); #return

    # Aggregate by the bin number.
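    # A minimal sketch of an assumed continuation: average x and y within each bin
    # and write the result; the choice of aggregate is an assumption.
    grouped = with_bins.groupBy('bin').agg(
        functions.avg('x').alias('avg_x'),
        functions.avg('y').alias('avg_y'))
    grouped.sort('bin').write.csv(out_directory, mode='overwrite')


if __name__ == '__main__':
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]
    main(in_directory, out_directory)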
Code Example #27
File: od-builder.py  Project: eubr-bigsea/btr-spark
def buildODMatrix(buste_data, datapath, filepath):

    clean_buste_data = buste_data.na.drop(subset=[
        "date", "route", "busCode", "tripNum", "stopPointId", "timestamp",
        "shapeLon", "shapeLat"
    ])

    filtered_boardings = clean_buste_data.na.drop(
        subset=['cardNum', 'cardTimestamp']).dropDuplicates(
            ['cardNum', 'date', 'cardTimestamp'])

    multiple_boardings = filtered_boardings.groupby('cardNum').count().filter(F.col('count') > 1) \
    .select(F.col("cardNum").alias("cardNum1"), F.col("count").alias("count1"))

    clean_boardings = filtered_boardings.join(
        multiple_boardings,
        filtered_boardings.cardNum == multiple_boardings.cardNum1, 'leftsemi')

    boarding_data = clean_boardings.withColumn('boarding_id',
                                               F.monotonically_increasing_id())

    user_boarding_w = Window.partitionBy(boarding_data.cardNum,
                                         boarding_data.date).orderBy(
                                             boarding_data.cardTimestamp)

    od_matrix_ids = boarding_data.select(
        F.col('cardNum'), F.col('boarding_id'),
        F.lead('boarding_id',
               default=-1).over(user_boarding_w).alias('next_boarding_id'),
        F.first(
            'boarding_id',
            True).over(user_boarding_w).alias('first_boarding')).withColumn(
                'next_boarding_id',
                F.when(
                    F.col('next_boarding_id') == -1,
                    F.col('first_boarding')).otherwise(
                        F.col('next_boarding_id'))).drop('first_boarding')

    origin_matrix = boarding_data.select(
        F.col("route").alias("o_route"),
        F.col("busCode").alias("o_bus_code"),
        F.col("date").alias("o_date"),
        F.col("tripNum").alias("o_tripNum"),
        F.col("cardTimestamp").alias("o_timestamp"),
        F.col("shapeId").alias("o_shape_id"),
        F.col("shapeSequence").alias("o_shape_seq"),
        F.col("shapeLat").alias("o_shape_lat"),
        F.col("shapeLon").alias("o_shape_lon"),
        F.col("stopPointId").alias("o_stop_id"),
        F.col("boarding_id").alias("o_boarding_id"))

    next_origin_matrix = boarding_data.select(
        F.col("route").alias("next_o_route"),
        F.col("busCode").alias("next_o_bus_code"),
        F.col("date").alias("next_o_date"),
        F.col("tripNum").alias("next_o_tripNum"),
        F.col("cardTimestamp").alias("next_o_timestamp"),
        F.col("shapeId").alias("next_o_shape_id"),
        F.col("shapeSequence").alias("next_o_shape_seq"),
        F.col("shapeLat").alias("next_o_shape_lat"),
        F.col("shapeLon").alias("next_o_shape_lon"),
        F.col("stopPointId").alias("next_o_stop_id"),
        F.col("boarding_id").alias("next_o_boarding_id"))



    user_trips_data = origin_matrix.join(od_matrix_ids, origin_matrix.o_boarding_id == od_matrix_ids.boarding_id, 'inner') \
    .join(next_origin_matrix, od_matrix_ids.next_boarding_id == next_origin_matrix.next_o_boarding_id, 'inner') \
    .drop('boarding_id').drop('next_boarding_id') \
    .withColumn('o_unixtimestamp',F.unix_timestamp(F.col('o_timestamp'), 'HH:mm:ss')) \
    .withColumn('next_o_unixtimestamp',F.unix_timestamp(F.col('next_o_timestamp'), 'HH:mm:ss')) \
    .withColumn('leg_duration',F.when(F.col('next_o_unixtimestamp') > F.col('o_unixtimestamp'), \
    ((F.col('next_o_unixtimestamp') - F.col('o_unixtimestamp'))/60.0)).otherwise(-1)) \
    .orderBy(['cardNum','o_date','o_timestamp'])
    # .withColumn('o_date',F.from_unixtime(F.unix_timestamp(F.col('o_date'),'yyyy-MM-dd'), 'yyyy-MM-dd'))\
    # .withColumn('next_o_date',F.from_unixtime(F.unix_timestamp(F.col('next_o_date'),'yyyy-MM-dd'), 'yyyy-MM-dd')) \

    bus_trip_data = clean_buste_data.orderBy(['route','busCode','tripNum','timestamp']) \
    .dropDuplicates(['route','busCode','tripNum','stopPointId']) \
    .drop('cardNum') \
    .withColumn('id',F.monotonically_increasing_id()) \
    .withColumn('route', F.col('route').cast(T.IntegerType())) \
    .withColumnRenamed('','cardNum')

    cond = [
        bus_trip_data.route == user_trips_data.o_route,
        bus_trip_data.busCode == user_trips_data.o_bus_code,
        bus_trip_data.date == user_trips_data.o_date,
        bus_trip_data.tripNum == user_trips_data.o_tripNum
    ]

    w = Window().partitionBy(
        ['cardNum', 'date', 'route', 'busCode', 'tripNum']).orderBy('dist')

    filtered_od_matrix = bus_trip_data.join(user_trips_data, cond, 'left_outer') \
    .withColumn('dist',dist(F.col('shapeLat'),F.col('shapeLon'),F.col('next_o_shape_lat'),F.col('next_o_shape_lon'))) \
    .filter('timestamp > o_timestamp') \
    .withColumn('rn', F.row_number().over(w)) \
    .where(F.col('rn') == 1) \
    .filter('dist <= 1.0') \
    .filter(user_trips_data.cardNum.isNotNull())

    trips_origins = filtered_od_matrix \
    .select(['o_date','o_route','o_bus_code','o_tripNum','o_stop_id','o_timestamp']) \
    .groupBy(['o_date','o_route','o_bus_code','o_tripNum','o_stop_id']) \
    .count() \
    .withColumnRenamed('count','boarding_cnt') \
    .withColumnRenamed('o_date','date') \
    .withColumnRenamed('o_route','route') \
    .withColumnRenamed('o_bus_code','busCode') \
    .withColumnRenamed('o_tripNum','tripNum') \
    .withColumnRenamed('o_stop_id','stopPointId')

    trips_destinations = filtered_od_matrix \
    .select(['date','route','busCode','tripNum','stopPointId','timestamp']) \
    .groupBy(['date','route','busCode','tripNum','stopPointId']) \
    .count() \
    .withColumnRenamed('count','alighting_cnt')

    trips_origins.write.csv(path=datapath + 'od/trips_origins/' + filepath,
                            header=True,
                            mode='overwrite')
    trips_destinations.write.csv(path=datapath + 'od/trips_destinations/' +
                                 filepath,
                                 header=True,
                                 mode='overwrite')

    trips_o = sqlContext.read.csv(datapath + 'od/trips_origins/' + filepath,
                                  header=True,
                                  inferSchema=True,
                                  nullValue="-")
    trips_d = sqlContext.read.csv(datapath + 'od/trips_destinations/' +
                                  filepath,
                                  header=True,
                                  inferSchema=True,
                                  nullValue="-")

    trips_passengers = trips_o.join(
        trips_d,
        on=['date', 'route', 'busCode', 'tripNum', 'stopPointId'],
        how='outer')

    trips_window = Window.partitionBy(['date', 'route', 'busCode',
                                       'tripNum']).orderBy('timestamp')

    od_matrix_route_boarding = filtered_od_matrix.groupby(['route']).count() \
    .withColumnRenamed('count','odmatrix_boarding')

    od_matrix_route_prop = bus_trip_data.groupby(['route']).count() \
    .withColumnRenamed('count','overall_boarding') \
    .join(od_matrix_route_boarding, 'route','left_outer') \
    .withColumn('extrap_factor',F.when(((F.col('odmatrix_boarding') == 0) | (F.col('odmatrix_boarding').isNull())), 0.0) \
    .otherwise(F.col('overall_boarding').cast('float')/F.col('odmatrix_boarding')))

    buste_crowdedness_extrapolated = bus_trip_data.join(trips_passengers, on=['date','route','busCode','tripNum','stopPointId'], how='left_outer') \
    .withColumn('crowd_bal', F.col('boarding_cnt') - F.col('alighting_cnt')) \
    .withColumn('num_pass',F.sum('crowd_bal').over(trips_window)) \
    .drop('numPassengers','gps_timestamp','gps_timestamp_in_secs') \
    .orderBy(['date','route','busCode','tripNum','timestamp']) \
    .join(od_matrix_route_prop, 'route', 'left') \
    .drop('overall_boarding','odmatrix_boarding') \
    .withColumn('ext_num_pass', F.col('num_pass')*F.col('extrap_factor'))

    return buste_crowdedness_extrapolated
Code Example #28
                                     batch_size=args.batch_size,
                                     epochs=args.epochs,
                                     verbose=2)

keras_model = keras_estimator.fit(train_df).setOutputCols(['Sales'])

history = keras_model.getHistory()
best_val_rmspe = min(history['val_exp_rmspe'])
print('Best RMSPE: %f' % best_val_rmspe)

# Save the trained model.
keras_model.save(args.local_checkpoint_file)
print('Written checkpoint to %s' % args.local_checkpoint_file)

# ================ #
# FINAL PREDICTION #
# ================ #

print('================')
print('Final prediction')
print('================')

pred_df = keras_model.transform(test_df)
# Convert from log domain to real Sales numbers
pred_df = pred_df.withColumn('Sales', F.exp(pred_df.Sales))
submission_df = pred_df.select(pred_df.Id.cast(T.IntegerType()), pred_df.Sales).toPandas()
submission_df.sort_values(by=['Id']).to_csv(args.local_submission_csv, index=False)
print('Saved predictions to %s' % args.local_submission_csv)

spark.stop()
Code Example #29
import sys
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('weather ETL').getOrCreate()

assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.1'  # make sure we have Spark 2.1+

observation_schema = types.StructType([
    types.StructField('station', types.StringType(), False),
    types.StructField('date', types.StringType(), False),
    types.StructField('observation', types.StringType(), False),
    types.StructField('value', types.IntegerType(), False),
    types.StructField('mflag', types.StringType(), False),
    types.StructField('qflag', types.StringType(), False),
    types.StructField('sflag', types.StringType(), False),
    types.StructField('obstime', types.StringType(), False),
])


def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]

    weather = spark.read.csv(in_directory, schema=observation_schema)
    # weather.show(); return
    qflagNull = weather.filter(weather['qflag'].isNull())
    # qflagNull.show(); return
    caStation = qflagNull.filter(
        functions.substring(qflagNull.station, 1, 2) == 'CA')
    # caStation.show(); return
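    # A minimal sketch of an assumed continuation: keep the TMAX observations, convert
    # tenths of a degree to degrees Celsius, and write the result; the output format
    # and column name are assumptions.
    tmax = caStation.filter(caStation['observation'] == 'TMAX')
    cleaned = tmax.select(
        tmax['station'],
        tmax['date'],
        (tmax['value'] / 10).alias('tmax'))
    cleaned.write.json(out_directory, compression='gzip', mode='overwrite')


if __name__ == '__main__':
    main()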
Code Example #30
import sys, os
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
from pyspark.sql import SparkSession, functions, types, Row
from pyspark import SparkConf, SparkContext
app_name = "NCAA Basketball"
spark = SparkSession.builder.appName(app_name).getOrCreate()
assert spark.version >= '2.3'  # make sure we have Spark 2.3+
spark.sparkContext.setLogLevel('WARN')


# Function that maps the period to minutes remaining
@functions.udf(returnType=types.IntegerType())
def period_mins_left(period):
    if period == '1st Period':
        return 30
    elif period == '2nd Period':
        return 20
    elif period == '3rd Period':
        return 10
    elif period == '4th Period':
        return 0
    elif period == '1st Half':
        return 20
    elif period == '2nd Half':
        return 0
    else:
        return 0


from resources import play_by_play_schema
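A hypothetical usage sketch of the UDF above, assuming play_by_play_schema is a StructType that contains a 'period' column and that the CSV path is made up for illustration:

plays = spark.read.csv('plays.csv', schema=play_by_play_schema, header=True)
plays = plays.withColumn('mins_left', period_mins_left(plays['period']))
plays.select('period', 'mins_left').show(5)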