Code example #1
def tfunc(t,rdd,rddb):
  # texts
  try:
    #----- texts
    if topic=="TT_raw":
      rowRdd = rdd.map(lambda w: Row(id=w['id'],author=w['user_screen_name'],\
      body=w['body'], created_utc=str(int(w['timestamp_ms'])/1000), \
      pharmatags=w['pharmatags'],conditiontags=w['conditiontags'], symptomtags=w['symptomtags']))
    else:  # this is reddit
      rowRdd = rdd.map(lambda w: Row(id=w['id'],author=w['author'],\
      body=w['body'], created_utc=w['created_utc'], \
      pharmatags=w['pharmatags'],conditiontags=w['conditiontags'], symptomtags=w['symptomtags']))

    texts = getSqlContextInstance(rdd.context).createDataFrame(rowRdd) 
    texts.registerTempTable("texts")
    texts = texts.select(texts.id,from_unixtime(texts.created_utc).alias('created_utc'),texts.author,texts.body, explode(texts.pharmatags).alias('pharmatag'), texts.conditiontags, texts.symptomtags)
    # return texts.rdd

    #----- bids
    rowRdd2= rddb.map(lambda w: Row(price=w['price'], pharmatag=w['pharmatags']))
    bids = getSqlContextInstance(rddb.context).createDataFrame(rowRdd2) 
    bids.registerTempTable("bids")
    getSqlContextInstance(rdd.context).cacheTable('bids')
    bids = bids.select(bids.price,bids.pharmatag)
    
    # #---- texts ids joined with pharma bids, java webservice already sorted by price
    idbids = bids.join(texts,texts.pharmatag==bids.pharmatag,'inner').select(texts.id,texts.author, texts.created_utc, texts.body, texts.conditiontags, texts.symptomtags, bids.pharmatag,bids.price).limit(1)
    idbids.registerTempTable("idbids")
    idbids.show()
    return idbids.rdd


    # #-----texts id & bids, find min
    # DEPRECATED: we just return the top match, since the Java service has already sorted by price
    # idsbidsmin = getSqlContextInstance(rddb.context).sql("SELECT id, author, created_utc, body, pharmatag, conditiontags, symptomtags, max(price) as price FROM idbids GROUP BY id,author, created_utc, body, conditiontags, symptomtags, pharmatag ")
    # idsbidsmin.registerTempTable("idsbidsmin") # dataframe
    # idsbidsmin.show()
    # return idsbidsmin.rdd

  except Exception:
    # swallow errors for this micro-batch and move on
    pass
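The snippet above relies on a getSqlContextInstance helper that is not shown. It is presumably the lazily-instantiated singleton SQLContext pattern from the Spark Streaming programming guide; a minimal sketch under that assumption:

from pyspark.sql import SQLContext

def getSqlContextInstance(sparkContext):
    # Create a single SQLContext per JVM the first time it is needed and
    # reuse it across micro-batches.
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']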
Code example #2
config = ConfigParser.ConfigParser()
config.read('config.ini')
kuduMaster = config.get('hadoop','kudu_masters')
kuduPort = config.get('hadoop','kudu_port')

# ### Create a Spark Session
spark = SparkSession.builder.appName("Sensor Analytics").getOrCreate()
sc = spark.sparkContext
sqc = SQLContext(sc)

# ## Analyze Maintenance Costs
# We start our analysis by visualizing the distribution of maintenance costs
rawMaintCosts = sqc.read.format('org.apache.kudu.spark.kudu')\
    .option('kudu.master', kuduMaster)\
    .option('kudu.table', 'impala::sensors.maintenance').load()\
    .withColumn('day', F.to_date(F.from_unixtime('maint_date')))\
    .withColumn('month', F.date_format(F.from_unixtime('maint_date'), 'yyyy-MMM'))\
    .orderBy('maint_date')
maintCosts = rawMaintCosts.toPandas()

# ### Summary Statistics on Maintenance Costs
maintCosts.describe()

# ### Boxplot of Monthly Maintenance Costs
sb.set(style="ticks", palette="muted", color_codes=True)
sb.boxplot(x="cost", y="month", data=maintCosts, whis=np.inf, color='r')
sb.despine(trim=True)

# ### Pairplot Comparing Maintenance Cost and Duration
sb.pairplot(maintCosts, hue="type", vars=['cost','duration'])
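For reference, this snippet appears to assume roughly the following imports (the F, sb, and np aliases match the usage above; the exact set is an assumption, since the original header is not shown):

import ConfigParser  # Python 2; on Python 3 this module is `configparser`
import numpy as np
import seaborn as sb
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import functions as F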
Code example #3
    def process_data(self):
        ##############################################################################
        # DECLARE VARIABLES
        ##############################################################################

        dt_range = self.study_dates("2020-07-30")
        dt = dt_range
        s1_bucket_name = 'b6-8f-fc-09-0f-db-50-3f-gpsdata'
        s1_initial_bucket_depth = 'cuebiq/daily-feed/US/'
        s1_bucket_output = 'cuebiq/daily-feed-reduced/US/'
        s2_bucket_name = 'b6-8f-fc-09-0f-db-50-3f-gpsdata'
        s2_initial_bucket_depth = 'cuebiq/daily-feed-reduced/US/'
        s2_bucket_output = 'cuebiq/processed-data/US/micro-clusters/'
        anchor_dist = 430
        time_thresh = 28800
        part_num = 9

        gps_schema = StructType([
            StructField("utc_timestamp", IntegerType(), True),
            StructField("device_id", StringType(), True),
            StructField("os", IntegerType(), True),
            StructField("latitude", FloatType(), True),
            StructField("longitude", FloatType(), True),
            StructField("accuracy", IntegerType(), True),
            StructField("tz_offset", IntegerType(), True)
        ])

        s2_gps_schema = StructType([
            StructField("utc_timestamp", IntegerType(), True),
            StructField("device_id", StringType(), True),
            StructField("os", IntegerType(), True),
            StructField("latitude", FloatType(), True),
            StructField("longitude", FloatType(), True),
            StructField("accuracy", IntegerType(), True),
            StructField("tz_offset", IntegerType(), True),
            StructField("row_number", IntegerType(), True)
        ])

        ##############################################################################
        # WINDOWS
        ##############################################################################
        w = Window().partitionBy('device_id').orderBy('utc_timestamp')
        l = Window().partitionBy('device_id',
                                 'lin_grp').orderBy('utc_timestamp')
        w2 = Window().partitionBy('device_id').orderBy('row_number')

        ##############################################################################
        # BEGIN DAILY ITERATION
        ##############################################################################

        print("Reading in files for {}".format(str(dt['study_dt'])[:10]))
        print("s3://{}/{}[{}|{}|{}]/*.gz".format(s1_bucket_name,
                                                 s1_initial_bucket_depth,
                                                 dt['s3_before'],
                                                 dt['s3_study_dt'],
                                                 dt['s3_after']))
        print("")

        #################################################################################################
        # START STEP 1
        #################################################################################################
        df1 = dataFrameReader \
            .options(header = 'false', delimiter = '\t', codec = 'gzip') \
            .schema(gps_schema) \
            .format("csv") \
            .load("/opt/spark/sample_data/daily-feed/US/2020729*/*.csv.gz")
        #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth +  dt['s3_before'] +"/*.gz") # the day before

        df2 = dataFrameReader \
            .options(header = 'false', delimiter = '\t', codec = 'gzip') \
            .schema(gps_schema) \
            .format("csv") \
            .load("/opt/spark/sample_data/daily-feed/US/2020730*/*.csv.gz")
        #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth +  dt['s3_study_dt'] +"/*.gz") # actual study date

        df3 = dataFrameReader \
            .options(header = 'false', delimiter = '\t', codec = 'gzip') \
            .schema(gps_schema) \
            .format("csv") \
            .load("/opt/spark/sample_data/daily-feed/US/2020731*/*.csv.gz")
        #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth +  dt['s3_after'] +"/*.gz") # the day after

        # Union data from three inputs into 1 dataframe
        df = df1.union(df2).union(df3) \
            .repartition(part_num, 'device_id')

        del df1
        del df2
        del df3

        ##############################################################################
        # FILTER INITIAL JUNK RECORDS
        # Removes duplicated records (based on time and id), poor accuracy, bad coordinates, and timestamps outside of study range
        ##############################################################################
        df = df.na.drop(subset=['latitude','longitude','tz_offset','accuracy']) \
                    .filter(((df['accuracy'] >= 5) & (df['accuracy'] <= 65)) \
                            & ((~(df['latitude'] == 0)) | ~(df['longitude'] == 0)) \
                            & (df['utc_timestamp'] + df['tz_offset']) \
                                    .between(dt['utc_study_dt'], dt['utc_after'])) \
                    .dropDuplicates(['utc_timestamp','device_id'])

        ##############################################################################
        # EXCESSIVE SPEED REMOVAL
        ##############################################################################
        df = df.withColumn('dist_to',distance(df['latitude'], df['longitude'], lead(df['latitude'],1).over(w), \
                            lead(df['longitude'],1).over(w))) \
            .withColumn('sec_to', (lead(df['utc_timestamp'], 1).over(w) - df['utc_timestamp'])) \
            .withColumn('speed_to', rate_of_speed(col('dist_to'), col('sec_to'),'hour')) \
            .withColumn('dist_from', lag(col('dist_to'), 1).over(w)) \
            .withColumn('sec_from', lag(col('sec_to'), 1).over(w)) \
            .withColumn('speed_from', lag(col('speed_to'), 1).over(w)) \
            .filter(((col('dist_to').isNull()) | (col('dist_from').isNull())) \
                        | ((((col('speed_from') + col('speed_to')) / 2) <= 90) | ((col('dist_to') >= 150) | (col('dist_from') >= 150))) \
                        & ((col('speed_from') < 600) & (col('speed_to') < 600)) \
                        & ((col('speed_from') < 20) | (col('speed_to') < 20))) \
            .select('utc_timestamp', 'device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset')

        ##############################################################################
        # LINEAR TRAVEL PING REMOVAL
        # Break pings out into groups of 4 to measure the linear distance
        ##############################################################################
        # Assign a record number, linear grouping, and lead distance
        df = df.withColumn('RecordNum',row_number().over(w)) \
            .withColumn('lin_grp', py.ceil(row_number().over(w) / 4)) \
            .withColumn('dist_to', distance(df['latitude'], df['longitude'], \
                lead(df['latitude'],1).over(l), lead(df['longitude'],1).over(l),'meters'))

        # Create aggregated table for linear groupings
        expr = [py.min(col('utc_timestamp')).alias('min_utc_timestamp'),py.max(col('utc_timestamp')).alias('max_utc_timestamp'), \
            py.count(col('utc_timestamp')).alias('cnt'),py.sum(col('dist_to')).alias('sum_dist'),py.min(col('dist_to')).alias('min_dist')]

        dfl_grp = df.groupBy('device_id', 'lin_grp').agg(*expr)

        dfl_grp.createOrReplaceTempView('dfl_grp')
        df.createOrReplaceTempView('dfl')

        # Grab just the first and last records in each linear grouping and append aggregated info
        dfls = spark.sql(
            "SELECT a.utc_timestamp, a.device_id, a.os, a.latitude, a.longitude, a.accuracy, a.tz_offset, \
                    a.lin_grp, b.sum_dist, b.min_dist, b.cnt \
                    FROM dfl as a INNER JOIN dfl_grp as b \
                    ON a.device_id = b.device_id \
                    AND a.lin_grp = b.lin_grp \
                    AND a.utc_timestamp = b.min_utc_timestamp \
                    UNION ALL \
                    SELECT a.utc_timestamp, a.device_id, a.os, a.latitude, a.longitude, a.accuracy, a.tz_offset, \
                    a.lin_grp, b.sum_dist, b.min_dist, b.cnt \
                    FROM dfl as a INNER JOIN dfl_grp as b \
                    ON a.device_id = b.device_id \
                    AND a.lin_grp = b.lin_grp \
                    AND a.utc_timestamp = b.max_utc_timestamp")

        # Measure the distance between first and last in each linear grouping and compare to sum distance of all points
        # Only keep groups that meet criteria for being straight-line
        df_j = dfls.withColumn('strt_dist', distance(dfls['latitude'],dfls['longitude'], \
                    lead(dfls['latitude'],1).over(l), \
                    lead(dfls['longitude'],1).over(l), 'meters')) \
                .withColumn('lin',col('strt_dist') / dfls['sum_dist']) \
                .na.drop(subset=['strt_dist']) \
                .filter((dfls['min_dist'] > 0)  \
                    & (col('strt_dist').between(150, 2000)) \
                    & (dfls['cnt'] == 4) \
                    & (col('lin') >= .99825)) \
                .select('device_id','lin_grp', 'lin')

        # Outer join the main dataframe to the linear groups to filter out non-linear pings
        df = df.join(df_j, ['device_id','lin_grp'], how='left_outer') \
            .filter(col('lin').isNull()) \
            .select('utc_timestamp','device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset')

        del dfl_grp
        del dfls
        del df_j

        #######################################
        # CHAIN
        # Calculating the dynamic chain threshold to find proximate ping relationships
        #######################################
        df = df.withColumn('chain_dist', ((((df['accuracy'] + lead(df['accuracy'],1).over(w)) - 10) * (230 / 120) + 200))) \
            .withColumn('chain', when((distance(df['latitude'], df['longitude'], \
                            lead(df['latitude'],1).over(w), lead(df['longitude'], 1).over(w),'feet')) <= col('chain_dist'), 1)
                            .when((distance(df['latitude'], df['longitude'], \
                            lag(df['latitude'],1).over(w), lag(df['longitude'], 1).over(w),'feet')) <= lag(col('chain_dist'), 1).over(w), 1)) \
            .filter(col('chain') == 1) \
            .withColumn('row_number', row_number().over(w)) \
            .select('utc_timestamp','device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset','row_number') \
            .persist()

        df \
            .repartition(100,'device_id').sortWithinPartitions('device_id','row_number') \
            .write \
            .csv(path="/opt/spark/sample_data/daily-feed-reduced/"+dt['s3_study_dt'], mode="append", compression="gzip", sep=",")
        #.csv(path="s3://" + s1_bucket_name + '/' + s1_bucket_output + dt['s3_study_dt'], mode="append", compression="gzip", sep=",")

        ##############################################################################################
        # START STEP 2
        ##############################################################################################

        print('Begin micro-clustering')

        # INITIALIZE ANCHOR TABLE - Create initial anchor start points based on row number = 1 and distance threshold
        self.df_dist = df.withColumn('tz_timestamp', df['utc_timestamp'] + df['tz_offset']) \
                        .withColumn('anchor', when(df['row_number'] == 1, col('tz_timestamp')) \
                                .when(distance(df['latitude'], df['longitude'], \
                                                lag(df['latitude'],1).over(w2),lag(df['longitude'],1).over(w2),'feet') \
                                            >= anchor_dist, col('tz_timestamp')) \
                                .when(col('tz_timestamp') - lag(col('tz_timestamp'),1).over(w2) >= time_thresh, col('tz_timestamp'))) \
                        .select('tz_timestamp','device_id','os','latitude','longitude','accuracy','row_number','anchor') \
                        .repartition(part_num, 'device_id') \
                        .persist()

        print('df_dist starting count = {}'.format(
            self.df_dist.count()))  # Materialize table for caching

        df.unpersist()
        del df

        #####################################################################################################
        # ITERATE THROUGH DATAFRAME ANCHOR PROCESS - iterations are broken out to speed up checkpointing
        # Checkpointing is used to chop off the physical plans of the dataframes that grow with each iteration
        ######################################################################################################
        df_anchor1 = self.anchor_func(3, 3)
        df_anchor2 = self.anchor_func(5, 5)
        df_anchor3 = self.anchor_func(12, 6)
        df_anchor4 = self.anchor_func(20, 5)
        df_anchor5 = self.anchor_func(30, 5)
        df_anchor6 = self.anchor_func(50, 5)
        df_anchor7 = self.anchor_func(80, 5, 1000000)
        df_anchor8 = self.anchor_func(1000, 5, 1000000)

        ##################################################################################################
        # Collect remaining pings to driver for Python analysis
        print('collect remaining pings')
        anchor_list = self.df_dist.rdd.map(lambda row: {'timestamp':row[0], 'device_id':row[1], 'latitude':row[3], \
                                                'longitude':row[4], 'anchor':row[7]}).collect()

        # Sort elements in list by device_id and timestamp
        anchor_list.sort(key=operator.itemgetter('device_id', 'timestamp'))

        # Python analysis on driver of final remaining pings
        print('iterate through remaining pings on driver')
        anchor_dr = []

        for r in anchor_list:
            if r['anchor'] is not None:
                anchor_dr.append(r)

            else:
                if anchor_dr[-1]['device_id'] == r['device_id']:
                    if distance_dr(r['latitude'],r['longitude'], \
                                anchor_dr[-1]['latitude'], \
                                anchor_dr[-1]['longitude'], 'feet') <= anchor_dist \
                                and (r['timestamp'] - anchor_dr[-1]['timestamp']) < time_thresh:
                        anchor_dr.append({'timestamp':r['timestamp'], 'device_id':r['device_id'], \
                                        'latitude':anchor_dr[-1]['latitude'], 'longitude':anchor_dr[-1]['longitude'], \
                                        'anchor':anchor_dr[-1]['anchor']})

                    else:
                        r['anchor'] = r['timestamp']
                        anchor_dr.append(r)

        # Condense result table for dataframe distribution
        print('generate driver anchor table')
        new_anchor = []
        for r in anchor_dr:
            new_anchor.append([r['timestamp'], r['device_id'], r['anchor']])

        # Bring driver results back into a distributed dataframe and join results
        print('disperse driver anchor table back to cluster')
        new_anchor_schema = StructType([
            StructField('tz_timestamp', IntegerType(), True),
            StructField('device_id', StringType(), True),
            StructField('anchor', IntegerType(), True)
        ])

        df_anchor_dr = spark.createDataFrame(new_anchor,new_anchor_schema) \
                        .repartition(part_num, 'device_id')

        # Join remaining anchors to main analysis table
        self.df_dist = self.df_dist.select('tz_timestamp','device_id','os','latitude','longitude', \
                                'accuracy','row_number') \
                            .join(df_anchor_dr,['tz_timestamp','device_id'])

        # Union all anchor tables together and sort
        print('finalizing anchor results into central table')
        df_anchors_fnl = df_anchor1.union(df_anchor2).union(df_anchor3).union(df_anchor4).union(df_anchor5) \
                            .union(df_anchor6).union(df_anchor7).union(df_anchor8).union(self.df_dist) \
                            .repartition(part_num,'device_id') \
                            .persist()

        self.df_dist.unpersist()

        #######################################################################################
        # Calculate centroids
        #######################################################################################
        print('start calculating centroids')
        # Get max accuracy value for each micro-cluster and filter clusters with fewer than 2 pings
        df_anchor_grp = df_anchors_fnl.groupBy('device_id','anchor').agg(*[py.max(col('accuracy')).alias('max_accuracy'), \
                                                                        py.count(col('tz_timestamp')).alias('cnt')]) \
                                    .withColumn('max_acc_1', col('max_accuracy') + 1) \
                                    .filter(col('cnt') > 1) \
                                    .select('device_id','anchor','max_acc_1','cnt')

        # Calculate the nominator for each micro-cluster
        df_anchors_fnl = df_anchors_fnl.join(df_anchor_grp, ['device_id','anchor']) \
                                        .withColumn('nom',col('max_acc_1') - col('accuracy'))

        df_denom = df_anchors_fnl.groupBy(
            'device_id', 'anchor').agg(*[py.sum(col('nom')).alias('denom')])


        df_anchors_fnl = df_anchors_fnl.join(df_denom, ['device_id','anchor']) \
                            .withColumn('weight', df_anchors_fnl['nom'] / df_denom['denom']) \
                            .withColumn('lat', df_anchors_fnl['latitude'] * col('weight')) \
                            .withColumn('lon', df_anchors_fnl['longitude'] * col('weight'))


        expr = [py.sum(col('lat')).alias('new_latitude'), py.sum(col('lon')).alias('new_longitude'), \
                    py.avg(col('latitude')).alias('avg_latitude'), py.avg(col('longitude')).alias('avg_longitude'), \
                    py.count(col('tz_timestamp')).alias('cluster_png_cnt'), py.first(col('os')).alias('os'), \
                    py.min(col('tz_timestamp')).alias('start_timestamp'), py.max(col('tz_timestamp')).alias('end_timestamp'), \
                    py.avg(col('accuracy')).alias('avg_accuracy')]

        df_micro = df_anchors_fnl.groupBy('device_id','anchor').agg(*expr) \
                                .withColumn('fnl_lat', (col('new_latitude') * (3/4)) + (col('avg_latitude') * (1/4))) \
                                .withColumn('fnl_lon', (col('new_longitude') * (3/4)) + (col('avg_longitude') * (1/4))) \
                                .withColumn('geohash9', geohash_udf_9(col('fnl_lat'), col('fnl_lon'))) \
                                .withColumn('dwell_seconds', col('end_timestamp') - col('start_timestamp')) \
                                .withColumn('start_tm', py.from_unixtime(col('start_timestamp'))) \
                                .withColumn('end_tm', py.from_unixtime(col('end_timestamp'))) \
                                .filter(col('dwell_seconds') > 1) \
                                .select('device_id','os','start_tm','end_tm', \
                                        'dwell_seconds','cluster_png_cnt', col('fnl_lat').alias('latitude'), \
                                        col('fnl_lon').alias('longitude'), 'geohash9', 'avg_accuracy')


        df_micro \
                .repartition(100,'device_id').sortWithinPartitions('device_id','start_tm') \
                .write \
                .csv(path="/opt/spark/sample_data/processed-data/" + dt['s3_study_dt'], mode="append", compression="gzip", sep=",")
        #.csv(path="s3://" + s2_bucket_name + '/' + s2_bucket_output + dt['s3_study_dt'], mode="append", compression="gzip", sep=",")

        df_anchors_fnl.unpersist()

        return
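process_data calls several helpers defined elsewhere in the project (distance, rate_of_speed, distance_dr, geohash_udf_9, anchor_func, study_dates). Purely as an illustration of the call signature used above, a haversine-based distance helper that builds a Column expression and takes an optional unit argument might look like the sketch below; the project's real implementation may differ.

from pyspark.sql import functions as py  # matches the `py` alias used above

def distance(lat1, lon1, lat2, lon2, unit='miles'):
    # Great-circle (haversine) distance between two coordinate columns.
    # `unit` is a plain Python string, so the returned Column is built per call.
    radius = {'miles': 3958.8, 'meters': 6371000.0, 'feet': 20902231.0}[unit]
    dlat = py.radians(lat2 - lat1)
    dlon = py.radians(lon2 - lon1)
    a = py.sin(dlat / 2) ** 2 + \
        py.cos(py.radians(lat1)) * py.cos(py.radians(lat2)) * py.sin(dlon / 2) ** 2
    return 2 * radius * py.asin(py.sqrt(a))

distance_dr would be the plain-Python equivalent (math.sin/math.cos on floats instead of Column functions) used for the driver-side loop.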
Code example #4
def process_log_data(spark, input_data, output_data):
    '''
    Get the files from the log folders and compose a DataFrame.
    Create the users, time and songplays tables with
    the desired columns and format.

    Parameters:
        spark (object): Previously created Spark session.
        input_data (string): Key for AWS S3 objects to read.
        output_data (string): Key for AWS S3 objects to save.

    Returns:
        None
    '''
    # get filepath to log data file
    log_data = input_data + 'log_data'

    # read log data file
    # smaller data to test: s3a://{}:{}@udacity-dend/log_data/2018/11/2018-11-12*.json
    df = spark.read.json("s3a://{}:{}@udacity-dend/log_data/*/*/*.json"\
                      .format(os.environ['AWS_ACCESS_KEY_ID'],os.environ['AWS_SECRET_ACCESS_KEY']))

    # filter by actions for song plays
    df = df.filter(df['page'] == 'NextSong')

    # extract columns for users table
    users_columns = ['userId', 'firstName', 'lastName', 'gender', 'level']
    users_table = df.select(*users_columns).dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + '/users', mode='overwrite')

    # create datetime column from original timestamp column
    df = df.withColumn('datetime', from_unixtime(col('ts') / 1000))

    # extract columns to create time table
    df_time = df.select('datetime').dropDuplicates()
    time_table = df_time.withColumnRenamed('datetime', 'start_time')\
                         .orderBy('start_time', ascending=True)\
                         .withColumn('hour', hour(col('start_time')))\
                         .withColumn('day', dayofmonth(col('start_time')))\
                         .withColumn('week', weekofyear(col('start_time')))\
                         .withColumn('month', month(col('start_time')))\
                         .withColumn('year', year(col('start_time')))\
                         .withColumn('weekday', dayofweek(col('start_time')))

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + '/time',
                             mode='overwrite',
                             partitionBy=['year', 'month'])

    # read in song data to use for songplays table
    basePath = output_data + '/songs/'
    song_df = spark.read.option("basePath",
                                basePath).parquet(output_data + '/songs/*')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, df.song == song_df.title, how='left')
    songplays_table = songplays_table.drop('song', 'artist', 'title', 'year',
                                           'duration')

    columns_name = [
        'start_time', 'user_id', 'level', 'session_id', 'location',
        'user_agent', 'song_id', 'artist_id'
    ]
    songplays_table = songplays_table.toDF(*columns_name)
    songplays_table = songplays_table.withColumn('month', month(col('start_time')))\
                                     .withColumn('year', year(col('start_time')))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + '/songplays',
                                  mode='overwrite',
                                  partitionBy=['year', 'month'])
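A hedged example of how this function is usually driven from the script's entry point; the bucket paths below are placeholders, not the project's real ones:

from pyspark.sql import SparkSession

def create_spark_session():
    # Minimal session; the real script presumably also configures the hadoop-aws package.
    return SparkSession.builder.appName('sparkify-etl').getOrCreate()

def main():
    spark = create_spark_session()
    input_data = 's3a://udacity-dend/'        # source bucket (placeholder)
    output_data = 's3a://my-output-bucket/'   # destination bucket (placeholder, hypothetical)
    process_log_data(spark, input_data, output_data)

if __name__ == '__main__':
    main()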
Code example #5
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")
    # get dynamic frame source
    dyf_ds_results = glueContext.create_dynamic_frame.from_catalog(database='dts-odin_ncsbasic', table_name='results')

    dyf_ds_results = dyf_ds_results.resolveChoice(specs=[('_key', 'cast:long')])
    # try:
    #     df_flag = spark.read.parquet("s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_ncsb.parquet")
    #     read_from_index = df_flag.collect()[0]['flag']
    #     print('read from index: ', read_from_index)
    #     dyf_ds_results = Filter.apply(frame=dyf_ds_results,
    #                                    f=lambda x: x["_key"] > read_from_index)
    # except:
    #     print('read flag file error ')
    #
    # dyf_ds_results = dyf_ds_results.select_fields(
    #     ['_key', '_id', 'userid', 'time_begin', 'time_end', 'timecreated']).rename_field(
    #     '_id', 'id')

    dy_cache = dyf_ds_results.toDF()
    dy_cache = dy_cache.cache()
    dyf_ds_results = DynamicFrame.fromDF(dy_cache, glueContext, 'dyf_ds_results')

    # read the flag checkpoint from S3
    print('dyf_ds_results::schema')
    dyf_ds_results.printSchema()
    dyf_ds_results.show(5)

    if (dyf_ds_results.count() > 0):
        #--------------------------------------------------------------------------------------------------------------#
        dyf_student_contact_email = glueContext.create_dynamic_frame.from_catalog(database='tig_advisor',
                                                                                 table_name='student_contact_email')

        dyf_student_contact_email = dyf_student_contact_email.select_fields(['email', 'contact_id'])
        dyf_student_contact_email = Filter.apply(frame=dyf_student_contact_email,
                                                 f=lambda x: x["email"] is not None
                                                             and x["email"] != '')
        df_student_contact_email = dyf_student_contact_email.toDF()
        df_student_contact_email = df_student_contact_email.dropDuplicates(['contact_id', 'email'])
        dyf_student_contact_email = DynamicFrame.fromDF(df_student_contact_email, glueContext,
                                                        "dyf_student_contact_email")
        # -------------------------------------------------------------------------------------------------------------#

        # -------------------------------------------------------------------------------------------------------------#
        dyf_users = glueContext.create_dynamic_frame.from_catalog(database='dts-odin_ncsbasic', table_name='users')
        dyf_users = dyf_users.select_fields(['_id', 'email'])
        # -------------------------------------------------------------------------------------------------------------#

        # -------------------------------------------------------------------------------------------------------------#
        dyf_ds_results_nscb = Filter.apply(frame=dyf_ds_results, f=lambda x: x["time_begin"] is not None and x["time_begin"] != ''
                                                                and x["time_end"] is not None and x["time_end"] != ''
                                                                and x["time_begin"] < x["time_end"]
                                                                and x["timecreated"] is not None and x["timecreated"] != '')
        # -------------------------------------------------------------------------------------------------------------#

        # -------------------------------------------------------------------------------------------------------------#
        # ds_df_results = ds_results.toDF()
        # ds_df_results = ds_df_results.where('time_begin IS NOT NULL AND time_end IS NOT NULL')
        # ds_results_nscb = DynamicFrame.fromDF(ds_df_results, glueContext, 'ds_results_nscb')

        # map NCSB history records to contact_id
        join_ncsb1 = Join.apply(dyf_ds_results_nscb, dyf_users, 'userid', '_id')
        join_ncsb2 = Join.apply(join_ncsb1, dyf_student_contact_email, 'email', 'email')

        print('join_ncsb2::schema')
        join_ncsb2.printSchema()
        join_ncsb2.show(5)

        # convert data
        join_ncsb2 = Filter.apply(frame=join_ncsb2, f=lambda x: x["contact_id"] is not None)

        data_df_ncsb = join_ncsb2.toDF()
        data_df_ncsb = data_df_ncsb.withColumn('sogio', (data_df_ncsb.time_end - data_df_ncsb.time_begin) / 3600)
        data_df_ncsb = data_df_ncsb.withColumn("giovao", from_unixtime(data_df_ncsb.time_begin))
        data_df_ncsb = data_df_ncsb.withColumn("ngay_tao", from_unixtime(data_df_ncsb.timecreated))
        data_df_ncsb = data_df_ncsb.withColumn('id_time', from_unixtime(data_df_ncsb.time_begin, "yyyyMMdd"))
        # data_df_ncsb = data_df_ncsb.where("contact_id IS NOT NULL")
        data_df_ncsb = data_df_ncsb.where("sogio > 0.0")

        data_df_ncsb = data_df_ncsb.groupby('contact_id', 'id_time').agg(f.sum('sogio').alias("tong_so_gio"),
                                                                           f.count('contact_id'))
        data_df_ncsb = data_df_ncsb.dropDuplicates(['contact_id', 'id_time'])
        data_ncsb = DynamicFrame.fromDF(data_df_ncsb, glueContext, 'data_ncsb')

        data_ncsb = data_ncsb.resolveChoice(specs=[('tong_so_gio', 'cast:float')])
        # -------------------------------------------------------------------------------------------------------------#

        # -------------------------------------------------------------------------------------------------------------#
        # compute the "fact_hieusuathoctap" (learning performance) table
        # df_hieusuathoctap = dropnullfields1.toDF()

        print ('data_ncsb::data_ncsb::data_ncsb::printSchema------------------')
        data_ncsb.printSchema()

        # print ('data_ncsb::data_ncsb::data_ncsb::show------------------')
        data_ncsb.show(10)

        print('data_ncsb::number: ', data_ncsb.count())

        # compute the number of study sessions and study hours per student for day id_time

        applymapping2 = ApplyMapping.apply(frame=data_ncsb,
                                           mappings=[("contact_id", "string", "contact_id", "string"),
                                                     ("id_time", 'string', 'id_time', 'bigint'),
                                                     ("count(contact_id)", 'long', 'soca', 'int'),
                                                     ("tong_so_gio", 'float', 'sogio', 'float')])

        resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols",
                                             transformation_ctx="resolvechoice2")
        dropnullfields2 = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields2")

        datasink2 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields2,
                                                                   catalog_connection="glue_redshift",
                                                                   connection_options={"dbtable": "temp_staging_lich_su_tu_hoc_ncsb_v2",
                                                                                       "database": "dts_odin",
                                                                                       "postactions": """INSERT into mapping_changed_status_student(user_id, change_status_date_id, to_status_id, measure1, measure2)
                                                                                                            SELECT um.user_id, hwb.id_time, 52, hwb.soca, hwb.sogio
                                                                                                            FROM temp_staging_lich_su_tu_hoc_ncsb_v2 hwb
                                                                                                            LEFT JOIN user_map um
                                                                                                                 ON um.source_type = 1
                                                                                                                 AND um.source_id = hwb.contact_id
                                                                                                            WHERE um.user_id is not null;
                                                                                                             DROP TABLE IF EXISTS public.temp_staging_lich_su_tu_hoc_ncsb_v2"""
                                                                                       },
                                                                   redshift_tmp_dir="s3n://dts-odin/temp/tu-hoc/ncsb_2",
                                                                   transformation_ctx="datasink4")


        df_datasource = dyf_ds_results.toDF()
        flag = df_datasource.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        df.write.parquet("s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_ncsb.parquet", mode="overwrite")

        dy_cache.unpersist()
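This job (and the similar one in code example #12) assumes the usual AWS Glue PySpark boilerplate, roughly:

from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import Filter, Join, ApplyMapping, ResolveChoice, DropNullFields
from pyspark.context import SparkContext
from pyspark.sql import functions as f
from pyspark.sql.functions import from_unixtime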
Code example #6
    StructField('work_or_load', IntegerType(), True), \
    StructField('plug_id', IntegerType(), True), \
    StructField('household_id', IntegerType(), True), \
    StructField('house_id', IntegerType(), True)]
schema = StructType(fields)

# load the csv file into the 'df' dataframe
df = spark.read.load("sample-00.csv",
                     format="csv",
                     sep=",",
                     schema=schema,
                     header="false")

# convert the 'ts' column from a unix timestamp to the yyyy-MM-dd HH:mm format,
# which will be used when grouping the data
df = df.withColumn('ts', from_unixtime('ts', "yyyy-MM-dd HH:mm"))

# filter on the 'work_or_load' column to keep only the rows that are load
df = df.filter(df.work_or_load == 1)

# group by house ('house_id'), room ('household_id'), plug ('plug_id') and a
# one-hour window ('window('ts', "1 hour")'), then aggregate by computing the
# average of the value column.
# Given that work (W) = power (P) * time interval (delta_t), this operation
# computes W in Wh by multiplying the average power over one hour by the
# 1-hour interval. Finally, the data is sorted.
#
# NOTE: since the time interval is one hour, the multiplication does not
# actually need to be performed.
df = df.groupBy('house_id', 'household_id', 'plug_id', window('ts', "1 hour")) \
    .agg({"value": "avg"})
Code example #7
    })

# if you have multiple saves below, this prevents reloading the data every time
pw_df.cache()

#join on the grid to get the feeder and tx for each outage
pw_df = pw_df.join(grid, on='site_id', how='inner')

# We should mark every row with the number of unique sensors reporting in +-5 days so we know the denominator for SAIDI/SAIFI
pw_distinct_core_id = pw_df.select("time", "core_id", "feeder_id", "tx")
pw_distinct_core_id_by_feeder = pw_distinct_core_id.groupBy(
    F.window("time", '10 days', '1 day'),
    "feeder_id").agg(F.countDistinct("core_id"))
pw_distinct_core_id_by_feeder = pw_distinct_core_id_by_feeder.withColumn(
    "window_mid_point",
    F.from_unixtime((F.unix_timestamp(col("window.start")) +
                     F.unix_timestamp(col("window.end"))) / 2))
pw_distinct_core_id_by_feeder = pw_distinct_core_id_by_feeder.select(
    "feeder_id",
    col("count(DISTINCT core_id)").alias("sensors_reporting"),
    "window_mid_point")

pw_distinct_core_id_by_tx = pw_distinct_core_id.groupBy(
    F.window("time", '10 days', '1 day'), "tx").agg(F.countDistinct("core_id"))
pw_distinct_core_id_by_tx = pw_distinct_core_id_by_tx.withColumn(
    "window_mid_point",
    F.from_unixtime((F.unix_timestamp(col("window.start")) +
                     F.unix_timestamp(col("window.end"))) / 2))
pw_distinct_core_id_by_tx = pw_distinct_core_id_by_tx.select(
    "tx",
    col("count(DISTINCT core_id)").alias("sensors_reporting"),
    "window_mid_point")
Code example #8
def main():
    "Main function"
    optmgr  = OptionParser()
    opts = optmgr.parser.parse_args()

    # setup spark/sql context to be used for communication with HDFS
    sc = SparkContext(appName="phedex_br")
    if not opts.yarn:
        sc.setLogLevel("ERROR")
    sqlContext = HiveContext(sc)

    schema_def = schema()

    # read given file(s) into RDD
    if opts.fname:
        pdf = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(opts.fname, schema = schema_def)
    elif opts.basedir:
        fromdate, todate = defDates(opts.fromdate, opts.todate)
        files = getFileList(opts.basedir, fromdate, todate)
        msg = "Between dates %s and %s found %d directories" % (fromdate, todate, len(files))
        print(msg)

        if not files:
            return
        pdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(file_path, schema = schema_def) \
                        for file_path in files])
    else:
        raise ValueError("File or directory not specified. Specify fname or basedir parameters.")

    # parsing additional data (to given data adding: group name, node kind, acquisition era, data tier, now date)
    groupdic, nodedic = getJoinDic()
    acquisition_era_reg = r"^/[^/]*/([^/^-]*)-[^/]*/[^/]*$"	
    data_tier_reg = r"^/[^/]*/[^/^-]*-[^/]*/([^/]*)$"
    groupf = udf(lambda x: groupdic[x], StringType())
    nodef = udf(lambda x: nodedic[x], StringType())

    ndf = pdf.withColumn("br_user_group", groupf(pdf.br_user_group_id)) \
         .withColumn("node_kind", nodef(pdf.node_id)) \
         .withColumn("now", from_unixtime(pdf.now_sec, "YYYY-MM-dd")) \
         .withColumn("acquisition_era", when(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1) == "",\
                    lit("null")).otherwise(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1))) \
        .withColumn("data_tier", when(regexp_extract(pdf.dataset_name, data_tier_reg, 1) == "",\
                    lit("null")).otherwise(regexp_extract(pdf.dataset_name, data_tier_reg, 1)))

    # print dataframe schema
    if opts.verbose:
        ndf.show()
        print("pdf data type", type(ndf))
        ndf.printSchema()

    # process aggregation parameters
    keys = [key.lower().strip() for key in opts.keys.split(',')]
    results = [result.lower().strip() for result in opts.results.split(',')]
    aggregations = [agg.strip() for agg in opts.aggregations.split(',')]
    order = [orde.strip() for orde in opts.order.split(',')] if opts.order else []
    asc = [asce.strip() for asce in opts.asc.split(',')] if opts.order else []
    filtc, filtv = opts.filt.split(":") if opts.filt else (None,None)

    validateAggregationParams(keys, results, aggregations, order, filtc)

    if filtc and filtv:
        ndf = ndf.filter(getattr(ndf, filtc) == filtv)

    # if delta aggregation is used
    if DELTA in aggregations:
        validateDeltaParam(opts.interval, results)			
        result = results[0]

        #1 for all dates generate interval group dictionary
        datedic = generateDateDict(fromdate, todate, opts.interval)
        boundic = generateBoundDict(datedic)
        max_interval = max(datedic.values())

        interval_group = udf(lambda x: datedic[x], IntegerType())
        interval_start = udf(lambda x: boundic[x][0], StringType())		
        interval_end = udf(lambda x: boundic[x][1], StringType())

        #2 group data by block, node, interval and last result in the interval
        ndf = ndf.select(ndf.block_name, ndf.node_name, ndf.now, getattr(ndf, result))
        idf = ndf.withColumn("interval_group", interval_group(ndf.now))
        win = Window.partitionBy(idf.block_name, idf.node_name, idf.interval_group).orderBy(idf.now.desc())	
        idf = idf.withColumn("row_number", rowNumber().over(win))
        rdf = idf.where((idf.row_number == 1) & (idf.interval_group != 0))\
                 .withColumn(result, when(idf.now == interval_end(idf.interval_group), getattr(idf, result)).otherwise(lit(0)))
        rdf = rdf.select(rdf.block_name, rdf.node_name, rdf.interval_group, getattr(rdf, result))
        rdf.cache()

        #3 create intervals that do not exist but have a negative delta
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        adf = rdf.withColumn("interval_group_aft", lead(rdf.interval_group, 1, 0).over(win))
        hdf = adf.filter(((adf.interval_group + 1) != adf.interval_group_aft) & (adf.interval_group != max_interval))\
                 .withColumn("interval_group", adf.interval_group + 1)\
                 .withColumn(result, lit(0))\
                 .drop(adf.interval_group_aft)

        #4 join data frames
        idf = rdf.unionAll(hdf)
		
        #5 join every interval with the previous interval
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        fdf = idf.withColumn("delta", getattr(idf, result) - lag(getattr(idf, result), 1, 0).over(win))

        #6 calculate delta_plus and delta_minus columns and aggregate by date and node
        ddf = fdf.withColumn("delta_plus", when(fdf.delta > 0, fdf.delta).otherwise(0)) \
                .withColumn("delta_minus", when(fdf.delta < 0, fdf.delta).otherwise(0))

        aggres = ddf.groupBy(ddf.node_name, ddf.interval_group).agg(sum(ddf.delta_plus).alias("delta_plus"),\
                                                                    sum(ddf.delta_minus).alias("delta_minus"))

        aggres = aggres.select(aggres.node_name, interval_end(aggres.interval_group).alias("date"), aggres.delta_plus, aggres.delta_minus)
		
    else:	
        resAgg_dic = zipResultAgg(results, aggregations)
        order, asc = formOrdAsc(order, asc, resAgg_dic)

        # perform aggregation
        if order:
            aggres = ndf.groupBy(keys).agg(resAgg_dic).orderBy(order, ascending=asc)
        else:
            aggres = ndf.groupBy(keys).agg(resAgg_dic)

    # output results
    if opts.fout:
        fout_header = formFileHeader(opts.fout)
        if opts.header:
            aggres.write.format('com.databricks.spark.csv').options(header = 'true').save(fout_header)
        else:
            aggres.write.format('com.databricks.spark.csv').save(fout_header)
    else:
        aggres.show(50)
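For intuition, the delta branch above (#1-#6) reduces, per block/node, to summing the positive and the negative day-over-day changes of the chosen result column. A toy, non-Spark illustration of that arithmetic:

# Toy values of `result` for one block/node across consecutive intervals.
values = [10, 7, 12, 12, 5]
deltas = [b - a for a, b in zip(values, values[1:])]  # [-3, 5, 0, -7]
delta_plus = sum(d for d in deltas if d > 0)          # 5
delta_minus = sum(d for d in deltas if d < 0)         # -10
print(delta_plus, delta_minus)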
Code example #9
def preprocess_data(input, output):
    """Based on preprocess_data.ipynb."""
    print('input=%s, output=%s' % (input, output))

    sc = SparkContext.getOrCreate()
    sql_sc = SQLContext(sc)

    schema = StructType([
        StructField('VendorID', IntegerType(), True),
        StructField('tpep_pickup_datetime', TimestampType(), True),
        StructField('tpep_dropoff_datetime', TimestampType(), True),
        StructField('passenger_count', IntegerType(), True),
        StructField('trip_distance', DoubleType(), True),
        StructField('pickup_longitude', DoubleType(), True),
        StructField('pickup_latitude', DoubleType(), True),
        StructField('RateCodeID', IntegerType(), True),
        StructField('store_and_fwd_flag', StringType(), True),
        StructField('dropoff_longitude', DoubleType(), True),
        StructField('dropoff_latitude', DoubleType(), True),
        StructField('payment_type', IntegerType(), True),
        StructField('fare_amount', DoubleType(), True),
        StructField('extra', DoubleType(), True),
        StructField('mta_tax', DoubleType(), True),
        StructField('tip_amount', DoubleType(), True),
        StructField('tolls_amount', DoubleType(), True),
        StructField('improvement_surcharge', DoubleType(), True),
        StructField('total_amount', DoubleType(), True),
    ])

    raw_sdf = sql_sc.read.csv(input,
                              header=True,
                              schema=schema,
                              timestampFormat='yyyy-MM-dd HH:mm:ss')

    # Convert timestamp from EST to UTC.
    clean_sdf = raw_sdf.withColumn(
        'tpep_pickup_timestamp_ms',
        unix_timestamp(raw_sdf['tpep_pickup_datetime']) * 1000 +
        5 * 60 * 60 * 1000)
    clean_sdf = clean_sdf.withColumn(
        'tpep_dropoff_timestamp_ms',
        unix_timestamp(raw_sdf['tpep_dropoff_datetime']) * 1000 +
        5 * 60 * 60 * 1000)

    # Only consider the first 2 days of data.
    end_timestamp = pd.Timestamp('2015-03-03 00:00:00').tz_localize(
        'Etc/GMT+5')
    filtered_sdf = clean_sdf.filter('tpep_dropoff_timestamp_ms <= %d' %
                                    int(end_timestamp.value / 1e6))

    all_events_rdd = filtered_sdf.rdd.flatMap(create_events)

    all_events_sdf = sql_sc.createDataFrame(all_events_rdd)

    all_events2_sdf = all_events_sdf.withColumn(
        'timestamp_str', from_unixtime(all_events_sdf['timestamp'] / 1000))

    # Sort all events so streaming_data_generator.py can read events in time order.
    sorted_sdf = all_events2_sdf.orderBy('timestamp')

    sorted_sdf.write.mode('overwrite').format('json').save(output)
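create_events is defined elsewhere in the original notebook; judging from the flatMap and the later sort by timestamp, it presumably expands each trip row into separate pickup and dropoff events. A minimal sketch under that assumption (the exact fields emitted are a guess):

def create_events(row):
    # Emit one pickup event and one dropoff event per taxi trip.
    return [
        {'timestamp': row['tpep_pickup_timestamp_ms'], 'event_type': 'pickup',
         'latitude': row['pickup_latitude'], 'longitude': row['pickup_longitude']},
        {'timestamp': row['tpep_dropoff_timestamp_ms'], 'event_type': 'dropoff',
         'latitude': row['dropoff_latitude'], 'longitude': row['dropoff_longitude']},
    ]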
Code example #10
import shutil
#init_notebook_mode(connected=True)
spark = SparkSession.builder.appName(
    "Python Spark SQL basic example").getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
df = sqlContext.read.load('/test_dev/mba-code/dataset.csv',
                          format='csv',
                          header='true',
                          inferSchema='true',
                          encoding='UTF-8')
df.select(
    "InvoiceNo", "StockCode", "Description", "Quantity", "InvoiceDate",
    "InvoiceDateWS",
    date_format(from_unixtime(unix_timestamp('InvoiceDateWS', 'MM/dd/yyyy')),
                'EEEE').alias('weekday'), "CustomerID",
    "Country").write.save("Invoices.parquet", format="parquet")
parquetFile = spark.read.parquet("Invoices.parquet")
# Parquet files can also be used to create a temporary view and then used in SQL statements.
parquetFile.createOrReplaceTempView("parquetFile")
DescriptionGrp = spark.sql(
    "SELECT distinct InvoiceNo,StockCode FROM parquetFile group by InvoiceNo,StockCode"
)
#print(DescriptionGrp.rdd.take(2))
minSupport = 0.05 * DescriptionGrp.rdd.count()
apr_tem = DescriptionGrp.rdd.map(lambda x: (x[0], list([x[1]]))).reduceByKey(
    lambda x, y: x + y)
schema = StructType([
    StructField("id", StringType(), True),
    StructField("items", ArrayType(StringType()), True)
Code example #11
File: FA6.py  Project: Ruddy-Harnur/Final-Challenge
from pyspark.sql.types import FloatType
#import statsmodels.formula.api as smf

# from pyspark.sql.functions import regexp_replace, col
# from pyspark.ml.regression import LinearRegression
# from sklearn.linear_model import LinearRegression
from pyspark.sql.functions import broadcast

from pyspark.sql.functions import *
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from itertools import chain

if __name__=='__main__':
    sc = SparkContext()
    spark = SparkSession(sc)
    pv = spark.read.csv('hdfs:///tmp/bdm/nyc_parking_violation/', header = True,inferSchema = True)
    pv = pv.select('Issue Date', 'Violation County', 'Street Name', 'House Number')
    pv = pv.withColumn('Date', from_unixtime(unix_timestamp('Issue Date', 'MM/dd/yyyy')))
    pv = pv.withColumn('Year',f.year(pv['Date']))
    pv = pv.filter(pv["Year"] >= (2015)) \
       .filter(pv["Year"] <= (2019))
    pv = pv.na.drop()
    pv = pv.withColumn('street name',f.lower(pv['Street Name']))
    
    
    borough_dict = {'NY':1, 'MAN':1, 'MH':1, 'NEWY':1, 'NEW':1, 'Y':1,
                'BX':2, 'BRONX':2,
                'K':3, 'BK':3, 'KING':3, 'KINGS':3,
                'Q':4, 'QN':4, 'QNS':4, 'QU':4, 'QUEEN':4,
                'R':5, 'RICHMOND':5}
    mapping_expr = create_map([lit(x) for x in chain(*borough_dict.items())])
    pv = pv.withColumn("BOROCODE", mapping_expr.getItem(col("Violation County")))
    pv = pv.withColumn("HN_int",(f.regexp_replace("House Number", "-", "")))
Code example #12
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")
    # get dynamic frame source

    #------------------------------------------------------------------------------------------------------------------#
    dyf_native_talk = glueContext.create_dynamic_frame.from_catalog(
        database='native_talk', table_name='native_talk_history_log_api')

    dyf_native_talk = dyf_native_talk.resolveChoice(specs=[('_key',
                                                            'cast:long')])

    try:
        df_flag = spark.read.parquet(
            "s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_native_talk_thanh_cong.parquet"
        )
        read_from_index = df_flag.collect()[0]['flag']
        print('read from index: ', read_from_index)
        dyf_native_talk = Filter.apply(frame=dyf_native_talk,
                                       f=lambda x: x["_key"] > read_from_index)
    except:
        print('read flag file error ')

    dyf_native_talk = dyf_native_talk.select_fields([
        '_key', 'learning_date', 'speaking_dialog_score', 'username',
        'updated_time'
    ])

    dy_cache = dyf_native_talk.toDF()
    dy_cache = dy_cache.cache()
    dyf_native_talk = DynamicFrame.fromDF(dy_cache, glueContext,
                                          'dyf_native_talk')

    print('dy_cache------------')
    dy_cache.printSchema()
    print('dy_cache: ', dy_cache.count())
    dy_cache.show(2)

    #------------------------------------------------------------------------------------------------------------------#

    if (dyf_native_talk.count() > 0):

        #---------------------------------------------------------datasource0-----------------------------------------------------#
        dyf_native_talk = Filter.apply(
            frame=dyf_native_talk,
            f=lambda x: x["username"] is not None and x["username"] != '' and
            x["speaking_dialog_score"] is not None and x[
                "speaking_dialog_score"] > 0 and x[
                    "learning_date"] is not None and x["learning_date"] != '')
        # ----------------------------------datasource1---------------------------------------------------------------------------#
        if (dyf_native_talk.count() > 0):
            dyf_nt_account_mapping = glueContext.create_dynamic_frame.from_catalog(
                database='native_talk',
                table_name='native_talk_account_mapping')

            dyf_nt_account_mapping = dyf_nt_account_mapping.select_fields(
                ['contact_id',
                 'username']).rename_field('username', 'nativetalk_user')
            dy_cache_2 = dyf_nt_account_mapping.toDF()
            dy_cache_2 = dy_cache_2.cache()
            dyf_nt_account_mapping = DynamicFrame.fromDF(
                dy_cache_2, glueContext, 'dyf_nt_account_mapping')

            dyf_nt_account_mapping = Filter.apply(
                frame=dyf_nt_account_mapping,
                f=lambda x: x["nativetalk_user"] is not None and x[
                    "nativetalk_user"] != '')
            # ----------------------------------datasource1---------------------------------------------------------------------------#

            # -------------------------------------------------------------------------------------------------------------#
            join = Join.apply(dyf_native_talk, dyf_nt_account_mapping,
                              'username', 'nativetalk_user')
            if (join.count() > 0):
                df_nativetalk = join.toDF()
                df_nativetalk = df_nativetalk.withColumn(
                    'sogio', f.lit(0.083333))  # 5 minutes
                df_nativetalk = df_nativetalk.withColumn(
                    'id_time',
                    from_unixtime(
                        unix_timestamp(df_nativetalk.learning_date,
                                       "yyyy-MM-dd"), "yyyyMMdd"))
                df_nativetalk = df_nativetalk.where("contact_id IS NOT NULL")

                data_nativetalk = DynamicFrame.fromDF(df_nativetalk,
                                                      glueContext,
                                                      'data_nativetalk')
                data_nativetalk = data_nativetalk.resolveChoice(
                    specs=[('sogio', 'cast:float')])
                # -------------------------------------------------------------------------------------------------------------#
                print('data_nativetalk----------')
                data_nativetalk.printSchema()

                # compute the "fact_hieusuathoctap" (learning performance) table
                df_hieusuathoctap = data_nativetalk.toDF()
                # compute the number of study sessions and study hours per student for day id_time
                df_hieusuathoctap = df_hieusuathoctap.groupby(
                    'contact_id', 'id_time').agg(f.sum('sogio'),
                                                 f.count('contact_id'))

                df_hieusuathoctap = df_hieusuathoctap.withColumn(
                    'tu_hoc_type_id', f.lit(400))
                data_hieusuathoctap = DynamicFrame.fromDF(
                    df_hieusuathoctap, glueContext, 'data_hieusuathoctap')
                data_hieusuathoctap = data_hieusuathoctap.resolveChoice(
                    specs=[('sum(sogio)', 'cast:double')])

                print(
                    'data_hieusuathoctap::data_hieusuathoctap::data_hieusuathoctap------------------------------------------'
                )
                data_hieusuathoctap.printSchema()

                applymapping2 = ApplyMapping.apply(
                    frame=data_hieusuathoctap,
                    mappings=[("contact_id", "string", "contact_id", "string"),
                              ("id_time", 'string', 'id_time', 'bigint'),
                              ("count(contact_id)", 'long', 'soca', 'int'),
                              ("sum(sogio)", 'double', 'sogio', 'double'),
                              ("tu_hoc_type_id", 'int', "tu_hoc_type_id",
                               "int")])

                resolvechoice2 = ResolveChoice.apply(
                    frame=applymapping2,
                    choice="make_cols",
                    transformation_ctx="resolvechoice2")
                dropnullfields2 = DropNullFields.apply(
                    frame=resolvechoice2, transformation_ctx="dropnullfields2")

                print('dropnullfields2 number: ', dropnullfields2.count())

                datasink2 = glueContext.write_dynamic_frame.from_jdbc_conf(
                    frame=dropnullfields2,
                    catalog_connection="glue_redshift",
                    connection_options={
                        "dbtable":
                        "temp_staging_lich_su_tu_hoc_native_talk___",
                        "database":
                        "dts_odin",
                        "postactions":
                        """INSERT into mapping_changed_status_student(user_id, change_status_date_id, to_status_id, measure1, measure2)
                                                                                                                            SELECT um.user_id, hwb.id_time, 56, hwb.soca, round(hwb.sogio, 4)
                                                                                                                            FROM temp_staging_lich_su_tu_hoc_native_talk___ hwb
                                                                                                                            LEFT JOIN user_map um
                                                                                                                                ON um.source_type = 1
                                                                                                                                AND um.source_id = hwb.contact_id;
                                                                                                                            DROP TABLE IF EXISTS public.temp_staging_lich_su_tu_hoc_native_talk___
                                                                                                                            """
                    },
                    redshift_tmp_dir="s3n://dts-odin/temp/tu-hoc/hwb/",
                    transformation_ctx="datasink2")

                df_datasource = dyf_native_talk.toDF()
                flag = df_datasource.agg({"_key": "max"}).collect()[0][0]
                flag_data = [flag]
                df = spark.createDataFrame(flag_data, "long").toDF('flag')
                df.write.parquet(
                    "s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_native_talk_thanh_cong.parquet",
                    mode="overwrite")
                dy_cache.unpersist()
                dy_cache_2.unpersist()
コード例 #13
0
ファイル: dataframe_1.py プロジェクト: msukmanowsky/drpyspark
from __future__ import print_function
import pyspark
from pyspark.sql import functions as F
import drpyspark


drpyspark.enable_debug_output()
with pyspark.SparkContext() as sc:
    sqlContext = pyspark.sql.SQLContext(sc)
    logs = sc.parallelize([
        {'timestamp': 1470663000, 'url': 'http://example.com/', 'ip': '192.168.1.1'},
        {'timestamp': 1470663163, 'url': 'http://example.com/', 'ip': '192.168.1.1'},
        {'timestamp': 1470663277, 'url': 'http://example.com/article1', 'ip': '192.168.1.2'},
        {'timestamp': 1470663277, 'url': 'http://example.com/article2', 'ip': '192.168.1.2'},
        {'timestamp': 1470663277, 'url': 'http://example.com/article3', 'ip': '192.168.1.2'},
    ])
    logs = logs.map(lambda l: pyspark.sql.Row(**l))
    logs = (sqlContext.createDataFrame(logs)
            .withColumn('timestamp', F.to_date(F.from_unixtime('timestamp')))
            .withColumn('minute', F.date_format('timestamp', "yyyy-MM-dd'T'HH")))
    (logs
     .groupBy(['minute', 'url'])
     .count()
     .show())
コード例 #14
0
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    student_id_unavailable = '0'
    package_endtime_unavailable = 99999999999L
    package_starttime_unavailable = 0L
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'

    package_endtime = 'package_endtime'
    package_starttime = 'package_starttime'
    student_level_code = 'student_level_code'
    student_status_code = 'student_status_code'

    EXPIRED = 'EXPIRED'
    CANCELLED = 'CANCELLED'  # referenced in the filter below; assumed to mirror EXPIRED

    dyf_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_enduser_used_product_history")
    dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.select_fields(
        [
            '_key', 'contact_id', 'used_product_id', 'status_old',
            'status_new', 'status_description', 'timecreated'
        ])
    # .rename_field('contact_id', 'contactid')

    dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.resolveChoice(
        specs=[('_key', 'cast:long')])
    # try:
    #     df_flag = spark.read.parquet("s3://dtsodin/flag/flag_trang_thai_tai_khoan_expired_lan_n.parquet")
    #     max_key = df_flag.collect()[0]['flag']
    #     print("max_key:  ", max_key)
    #     # Only take records with _key greater than the saved max_key; do not load the full table
    #     dyf_tpe_enduser_used_product_history = Filter.apply(frame=dyf_tpe_enduser_used_product_history, f=lambda x: x["_key"] > max_key)
    # except:
    #     print('read flag file error ')
    print dyf_tpe_enduser_used_product_history.count()
    if dyf_tpe_enduser_used_product_history.count() > 0:
        try:
            dyf_tpe_invoice_product_details = glueContext.create_dynamic_frame.from_catalog(
                database="tig_market",
                table_name="tpe_invoice_product_details")
            dyf_tpe_invoice_product_details = dyf_tpe_invoice_product_details.select_fields(
                ['id', 'cat_code'])

            dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
                database="tig_advisor", table_name="student_contact")
            dyf_student_contact = dyf_student_contact.select_fields(
                ['contact_id',
                 'student_id']).rename_field('contact_id', 'contactid')

            ##################### Join and Filter data
            df_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.toDF(
            )
            df_tpe_used_product_history_step1 = df_tpe_enduser_used_product_history.groupby('contact_id',
                                                                                            'used_product_id').agg(
                f.max("timecreated").alias("max_timecreated")) \
                .withColumnRenamed("contact_id", "contact_id_temp")
            print df_tpe_used_product_history_step1.count()
            df_tpe_used_product_history_step1.show(20)

            df_tpe_used_product_history_step2 = df_tpe_used_product_history_step1.groupby(
                'contact_id_temp').agg(
                    f.max("max_timecreated").alias("max_timecreated"),
                    f.count("used_product_id").alias("count_used_product_id"))
            print df_tpe_used_product_history_step2.count()
            df_tpe_used_product_history_step2.show(20)
            print "EEEEEEEEEEEEEEEEEEEEEEEEE"

            dyf_tpe_used_product_history = DynamicFrame.fromDF(
                df_tpe_used_product_history_step2, glueContext,
                "dyf_tpe_used_product_history")
            dyf_part_one = Filter.apply(
                frame=dyf_tpe_used_product_history,
                f=lambda x: x["count_used_product_id"] > 1)

            # dyf_part_two = Filter.apply(frame=df_tpe_enduser_used_product_history,
            #                             f=lambda x: x["used_product_id"] > 1)
            df_part_one = dyf_part_one.toDF()
            # join back to the full history to keep, per contact, the row(s) with the
            # latest timecreated (df_part_one only carries contact_id_temp,
            # max_timecreated and count_used_product_id)
            df_part_one = df_part_one.join(
                df_tpe_enduser_used_product_history,
                (df_part_one.contact_id_temp
                 == df_tpe_enduser_used_product_history.contact_id)
                & (df_part_one.max_timecreated
                   == df_tpe_enduser_used_product_history.timecreated))

            dyf_part_one = DynamicFrame.fromDF(df_part_one, glueContext,
                                               "dyf_part_one")
            dyf_part_one = dyf_part_one.select_fields([
                'contact_id', 'used_product_id', 'status_old', 'status_new',
                'status_description', 'timecreated'
            ])

            dyf_join_part_one_product_details = Join.apply(
                dyf_part_one, dyf_tpe_invoice_product_details,
                'used_product_id', 'id')

            dyf_join_part_one_product_details.printSchema()
            print "total 01: ", dyf_join_part_one_product_details.count()
            dyf_join_part_one_product_details.toDF().show(2)

            dyf_join_part_one_contact = Join.apply(
                dyf_join_part_one_product_details, dyf_student_contact,
                'contact_id', 'contactid')
            dyf_join_part_one_contact = dyf_join_part_one_contact \
                .select_fields(['contact_id', 'student_id', 'status_new', 'status_description', 'timecreated'])

            dyf_join_part_one_contact.printSchema()
            print "total 02: ", dyf_join_part_one_contact.count()
            dyf_join_part_one_contact.toDF().show(2)
            # df_join_part_one = dyf_join_part_one_contact.toDF()

            ######################################
            ######## START cancelled
            dyf_join_cancelled_status = Filter.apply(
                frame=dyf_join_part_one_contact,
                f=lambda x: x["status_new"] == CANCELLED)
            print "dyf_join_cancelled_status ", dyf_join_cancelled_status.count(
            )
            dyf_join_cancelled_status.toDF().show(2)
            df_join_cancelled_status = dyf_join_cancelled_status.toDF()

            df_join_cancelled_status = df_join_cancelled_status \
                .withColumn("change_status_date_id",
                            from_unixtime(df_join_cancelled_status.timecreated, 'yyyyMMdd').cast("long")) \
                .withColumn("from_status_id", f.lit(None).cast("long")) \
                .withColumn("to_status_id", f.lit(214).cast("long")) \
                .withColumn("measure1", f.lit(None).cast("long")) \
                .withColumn("measure2", f.lit(None).cast("long")) \
                .withColumn("description", df_join_cancelled_status.status_description) \
                .withColumn("timestamp1", f.lit(None).cast("long"))
            df_join_cancelled_status.show(3)
            dyf_join_cancelled_status = DynamicFrame.fromDF(
                df_join_cancelled_status, glueContext,
                "dyf_join_cancelled_status")

            dyf_join_cancelled_status = dyf_join_cancelled_status \
                .select_fields(['contact_id', 'student_id', 'change_status_date_id', 'from_status_id',
                                'to_status_id', 'measure1', 'measure2', 'description', 'timestamp1'])
            dyf_join_cancelled_status.printSchema()
            df_join_cancelled_status = dyf_join_cancelled_status.toDF()
            ####### END

            ######## START expired
            dyf_join_expired_status = Filter.apply(
                frame=dyf_join_part_one_contact,
                f=lambda x: x["status_new"] == EXPIRED)
            print "dyf_join_expired_status ", dyf_join_expired_status.count()
            dyf_join_expired_status.toDF().show(2)
            df_join_expired_status = dyf_join_expired_status.toDF()

            df_join_expired_status = df_join_expired_status \
                .withColumn("change_status_date_id",
                            from_unixtime(df_join_expired_status.timecreated, 'yyyyMMdd').cast("long")) \
                .withColumn("from_status_id", f.lit(None).cast("long")) \
                .withColumn("to_status_id", f.lit(215).cast("long")) \
                .withColumn("measure1", f.lit(None).cast("long")) \
                .withColumn("measure2", f.lit(None).cast("long")) \
                .withColumn("description", df_join_expired_status.status_description) \
                .withColumn("timestamp1", f.lit(None).cast("long"))
            df_join_expired_status.show(3)
            dyf_join_expired_status = DynamicFrame.fromDF(
                df_join_expired_status, glueContext, "dyf_join_expired_status")

            dyf_join_expired_status = dyf_join_expired_status \
                .select_fields(['contact_id', 'student_id', 'change_status_date_id', 'from_status_id',
                                'to_status_id', 'measure1', 'measure2', 'description', 'timestamp1'])
            dyf_join_expired_status.printSchema()
            df_join_expired_status = dyf_join_expired_status.toDF()
            ####### END

            df_join_expired_status = df_join_expired_status.withColumn(
                "user_id",
                f.lit(None).cast("long"))

            dyf_join_status = DynamicFrame.fromDF(df_join_expired_status,
                                                  glueContext,
                                                  "dyf_join_status")

            applymapping1 = ApplyMapping.apply(
                frame=dyf_join_status,
                mappings=[("student_id", "string", "student_id", "long"),
                          ("user_id", "long", "user_id", "long"),
                          ("change_status_date_id", "long",
                           "change_status_date_id", "long"),
                          ("from_status_id", "long", "from_status_id", "long"),
                          ("to_status_id", "long", "to_status_id", "long"),
                          ("measure1", "long", "measure1", "double"),
                          ("measure2", "long", "measure2", "double"),
                          ("description", "string", "description", "string"),
                          ("timestamp1", "long", "timestamp1", "long"),
                          ("contact_id", "string", "contact_id", "string")])

            resolvechoice1 = ResolveChoice.apply(
                frame=applymapping1,
                choice="make_cols",
                transformation_ctx="resolvechoice1")
            dropnullfields1 = DropNullFields.apply(
                frame=resolvechoice1, transformation_ctx="dropnullfields1")
            print resolvechoice1.count()
            resolvechoice1.printSchema()
            resolvechoice1.show(5)
            print('START WRITE TO REDSHIFT -------------------------')
            datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields1,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable": "mapping_changed_status_student_temp",
                    "database": "dts_odin"
                },
                redshift_tmp_dir=
                "s3a://dtsodin/temp/mapping_changed_status_student_temp/",
                transformation_ctx="datasink1")

            print('START WRITE TO S3-------------------------')
            # datasink6 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields1, connection_type="s3",
            #                                                          connection_options={
            #                                                              "path": "s3://dtsodin/student_behavior/student_behavior/",
            #                                                              "partitionKeys": ["behavior_id"]},
            #                                                          format="parquet",
            #                                                          transformation_ctx="datasink6")
            print('END WRITE TO S3-------------------------')

            df_temp = dyf_tpe_enduser_used_product_history.toDF()
            flag = df_temp.agg({"_key": "max"}).collect()[0][0]

            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            # overwrite the _key flag to S3
            df.write.parquet(
                "s3a://dtsodin/flag/flag_trang_thai_tai_khoan_expired_lan_n.parquet",
                mode="overwrite")
        except Exception as e:
            print "Something was wrong ", e
コード例 #15
0
        # Some data points were not registered with a correct dropoff latitude and longitude
        # and show up as zero
        df2 = df1[(df1['dropoff_longitude'] < 0)\
              & (df1['dropoff_latitude'] > 0)\
              &(df1['pickup_longitude'] != df1['dropoff_longitude'])\
              & (df1['pickup_latitude'] != df1['dropoff_latitude'])]

        # Add a time duration for each taxi ride.
        # For machine learning, the hour-of-day and day-of-week integers are
        # important features for the algorithm to learn from one month's worth
        # of data points.
        time_duration = unix_timestamp("tpep_dropoff_datetime",format = time_format)\
                      - unix_timestamp("tpep_pickup_datetime", format = time_format)
        df3 = df2.withColumn("time_duration",time_duration)\
                 .withColumn("hour",hour(df2.tpep_pickup_datetime))\
                 .withColumn("dayOfWeek",from_unixtime(unix_timestamp\
                            (df1.tpep_pickup_datetime,time_format),"uuuuu").cast("Integer"))
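        # Hedged note on the "uuuuu" pattern above: with Spark's legacy SimpleDateFormat
        # parser, 'u' is the day-of-week number (1 = Monday), zero-padded here to five
        # digits before the Integer cast. On newer Spark versions, a clearer sketch
        # (assuming dayofweek and to_timestamp are imported from pyspark.sql.functions) is:
        # df3 = df3.withColumn(
        #     "dayOfWeek",
        #     dayofweek(to_timestamp(df2.tpep_pickup_datetime, time_format)))
        # (note dayofweek() numbers days 1 = Sunday .. 7 = Saturday).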

        # A taxi will not drive more than 500 miles for a single ride,
        # and a ride takes more than 10 seconds even if it covers just 10 metres.
        df4 = df3[(df3['trip_distance'] < 500)\
                  & (df3['time_duration'] > 10)]

        # Remove negative cost fields
        df5 = df4[(df4['fare_amount'] > 0)\
                  & (df4['extra'] >= 0)\
                  & (df4['mta_tax'] >= 0)\
                  & (df4['tip_amount'] >= 0)\
                  & (df4['tolls_amount'] >= 0)\
                  & (df4['improvement_surcharge'] >= 0)\
                  & (df4['total_amount'] > 0)]
コード例 #16
0
def process_log_data(spark, input_data, output_data):
    
    """
    This function takes the log data from Udacity's S3 input file and processes it. This is done by 
    extracting the user, time and songplay tables and then loading it back to the S3 buckegt I've created in AWS.
   
    Parameters:
            spark       : Spark Session
            input_data  : The S3 bucket location of song_data, think 'input'
            output_data : The S3 bucket location of the song_data, think 'ouput'
    """ 
    
    #Using print statement to understand where in spark statement we are
    print("\n Taking in log data as variable from S3's input location....")
    # get full filepath to song data file
    #log_data = input_data + 'log_data/*/*/*.json'
    #utilizing exact folder set of data set to speed up execution in WorkSpace (please use commented out log_data variable above to run full etl with wildcards)
    log_data = input_data + 'log_data/2018/11/*.json'
    
    
    #Using print statement to understand where in spark statement we are
    print("\n Defining log Schema....")
    log_schema = Struct([SFld("artist", Str()), SFld("auth", Str()),
                         SFld("firstName", Str()), SFld("gender", Str()),
                         SFld("itemInSession", Lng()), SFld("lastName", Str()),
                         SFld("length", Dbl()), SFld("level", Str()),
                         SFld("location", Str()), SFld("method", Str()),
                         SFld("page", Str()), SFld("registration", Dbl()),
                         SFld("sessionId", Lng()), SFld("song", Str()),
                         SFld("status", Str()), SFld("ts", Str()),
                         SFld("userAgent", Str()), SFld("userId", Str())])
    
    
    #Using print statement to understand where in spark statement we are
    print("\n Reading log data JSON files from S3's input location....")
    # read log data file
    df = spark.read.json(log_data, schema = log_schema, mode='PERMISSIVE', columnNameOfCorruptRecord='corruptRecord').drop_duplicates()
    
    
    #Using print statement to understand where in spark statement we are
    print("\n Filtering page by NextSong....")
    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong').drop_duplicates()

          
    #Using print statement to understand where in spark statement we are
    print("\n Creating select statement for users data creation....")     
    # extract columns for users table    
    users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level').where(df.userId.isNotNull()).drop_duplicates()
    
          
    #Using print statement to understand where in spark statement we are
    print("\n Writing parquet file for users table....")
    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data + 'users_table/')
          
          
    #Using print statement to understand where in spark statement we are
    print("\n Creating timeStamp variable....")
    # create timestamp column from original timestamp column
    df = df.withColumn("timestamp", to_timestamp(from_unixtime(col("ts") / 1000)))
      
    
    #Using print statement to understand where in spark statement we are
    print("\n Creating select statement for time data creation....")      
    # extract columns to create time table
    time_table = ( df.select("timestamp").withColumn("hour", hour("timestamp")).withColumn("day", dayofmonth("timestamp")) \
                    .withColumn("week", weekofyear("timestamp")).withColumn("weekday", dayofweek("timestamp")).withColumn("weekdayName", date_format("timestamp", "E")) \
                    .withColumn("month", month("timestamp")).withColumn("year", year("timestamp")).drop_duplicates()
                 )
    
    
    #Using print statement to understand where in spark statement we are
    print("\n Writing parquet file for time table and partitioned by year and month....")        
    # write time table to parquet files partitioned by year and month
    time_table.write.mode('overwrite').partitionBy('year', 'month').parquet(output_data + 'time_table/')

          
    #Using print statement to understand where in spark statement we are
    print("\n Reading song data JSON files from S3's input location....")      
    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + 'songs_table/')

          
    #Using print statement to understand where in spark statement we are
    print("\n Creating select statement for song play data creation....")       
    # extract columns from joined song and log datasets to create songplays table 
    songplays_table = df.withColumn('songplayId', F.monotonically_increasing_id()).join(song_df, song_df.title == df.song) \
                        .select('songplayId', col('timestamp').alias('start_time'), col('userId'),
                         'level', 'song_id', 'artist_id', col('sessionId'), 'location', col('userAgent'))
    
    
    songplays_table = songplays_table.join(time_table, songplays_table.start_time == time_table.timestamp, how="inner")\
                                     .select("songplayId", songplays_table.start_time, "userId", "level", "song_id", "artist_id", "sessionId", "location", "userAgent", "month", "year").drop_duplicates()

    
    
          
    #Using print statement to understand where in spark statement we are
    print("\n Writing parquet file for song paly table and partitioned by year and month....")       
    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode('overwrite').partitionBy("year", "month").parquet(output_data + 'songplays_table/')
コード例 #17
0
def get_learnig_info(start_year_month_id, end_year_month_id, start_date, end_date):
    push_down_predicate_v = "((behavior_id == \"" + BEHAVIOR_ID_LS + "\" " \
                            + " or behavior_id == \"" + BEHAVIOR_ID_SC + "\" " \
                            + " or behavior_id == \"" + BEHAVIOR_ID_LT + "\" " \
                            + " or behavior_id == \"" + BEHAVIOR_ID_VOXY + "\" " \
                            + " or behavior_id == \"" + BEHAVIOR_ID_HW + "\" " \
                            + " or behavior_id == \"" + BEHAVIOR_ID_NCSB + "\" " \
                            + " or behavior_id == \"" + BEHAVIOR_ID_NT + "\") " \
                            + " and  year_month_id >= \"" + str(start_year_month_id) + "\" " \
                            + " and  year_month_id <= \"" + str(end_year_month_id) + "\")"
    dyf_sb_student_behavior = connectGlue(database="olap_student_behavior", table_name="sb_student_behavior",
                                          select_fields=["student_behavior_id", "contact_id", "student_behavior_date"],
                                          push_down_predicate=push_down_predicate_v
                                          )

    df_sb_student_behavior = dyf_sb_student_behavior.toDF()
    df_sb_student_behavior = df_sb_student_behavior.drop_duplicates(["student_behavior_id"])
    df_sb_student_behavior = df_sb_student_behavior.select("student_behavior_id", "contact_id",
                                                           f.from_unixtime("student_behavior_date",
                                                                           format="yyyyMMdd").cast("long").alias(
                                                               "date_id"))

    dyf_sb_student_learning = connectGlue(database="olap_student_behavior", table_name="sb_student_learning",
                                          select_fields=["student_behavior_id", "behavior_id", "duration",
                                                         "role_in_class"],
                                          push_down_predicate=push_down_predicate_v
                                          ).rename_field("student_behavior_id", "student_behavior_id_learning")

    dyf_sb_student_learning = dyf_sb_student_learning.resolveChoice(specs=[("behavior_id", "cast:int")])

    dyf_sb_student_learning = Filter.apply(frame=dyf_sb_student_learning,
                                           f=lambda x: (x["behavior_id"] > 12 and x["duration"] > 59)
                                                       or (x["behavior_id"] < 13 and x["duration"] >= 2100))

    df_sb_student_learning = dyf_sb_student_learning.toDF()

    join = df_sb_student_behavior.join(df_sb_student_learning,
                                       df_sb_student_behavior["student_behavior_id"] == df_sb_student_learning[
                                           "student_behavior_id_learning"])

    join = join.drop("student_behavior_id", "student_behavior_id_learning")

    join = join.groupby("contact_id", "date_id", "behavior_id", "role_in_class").agg(f.count("duration").alias("total"))
    join = join.select(
        "contact_id", "date_id", f.struct("behavior_id", "total", "role_in_class").alias("type_role_and_total")
    )

    df_group_by = join.groupBy("contact_id", "date_id") \
        .agg(f.collect_list("type_role_and_total").alias("l_type_role_and_total"))
    join_total = df_group_by.select(
        "contact_id", "date_id",
        get_final_total("l_type_role_and_total").alias("list_total")
    )

    df_latest = join_total.select(
        "contact_id", "date_id",
        f.col("list_total").getItem("total_ls").alias("total_ls"),
        f.col("list_total").getItem("total_sc").alias("total_sc"),
        f.col("list_total").getItem("total_voxy").alias("total_voxy"),
        f.col("list_total").getItem("total_hw").alias("total_hw"),
        f.col("list_total").getItem("total_nt").alias("total_nt"),
        f.col("list_total").getItem("total_ncsb").alias("total_ncsb"),
        f.col("list_total").getItem("total_audit").alias("total_audit"),
        f.col("list_total").getItem("total_lt").alias("total_lt")
    )

    df_lo = get_lo(start_date, end_date)
    df_latest = df_latest.join(df_lo, (df_lo["contact_id_lo"] == df_latest["contact_id"])
                               & (df_lo["created_date_id"] == df_latest["date_id"]), "outer")

    df_latest = df_latest.fillna(0)

    df_latest = df_latest.select("total_lt", "total_voxy", "total_hw", "total_nt", "total_ncsb", "total_audit",
                                 "total_ls", "total_sc", "total_starter_ait", "total_starter_aip", "total_micro",
                                 check_value(df_latest.contact_id, df_latest.contact_id_lo).alias("contact_id"),
                                 check_date(df_latest.created_date_id, df_latest.date_id).alias("date_id"))

    return df_latest
コード例 #18
0
from pyspark.sql.types import IntegerType
from pyspark.sql.types import ArrayType
from pyspark.sql import Row
import operator
from pyspark.sql.window import Window

conf = SparkConf().setAppName("Ex2").setMaster("local")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

df = sc.textFile(
    '/host/HieldshiemMasters/Semester1/DistributedDataAnalytics/Exercises/Ex9_Solution/ml-10M100K/tags.dat'
).map(lambda x: x.split("::"))

df = df.toDF(['UserID', 'MovieID', 'Tag', 'Timestamp'])
df_Update = df.withColumn('time_datestring', func.from_unixtime('timestamp'))
df_Update = df_Update.withColumn(
    'time_date', to_timestamp(df_Update.time_datestring,
                              'yyyy-MM-dd HH:mm:ss'))
#print(df_Update)
#df_Update.show()

#===== get all the time stamps for each user ========================
#test=df_Update.groupBy(['UserID'])
new = df_Update.groupBy(['UserID']).agg(collect_list("time_date"))
#test.show()
#==========sort time stamps for each user===========================
#func=udf(lambda x:sorted(x.tolist()))


def sorter(l):
コード例 #19
0
def process_log_data(spark, input_data, output_data):
    '''
    load and process log json files
    input data is the log directory     
    output data is the output directory for star-schema tables (can be a S3 or HDFS bucket)
    input logs files should be stored in a tree hierarchy : <input_data>/<year>/<month>
    '''
    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data", "*", "*")

    # read log data file
    df = spark.read.json(log_data)
    print("EXTRACT USERS")
    # filter by actions for song plays
    df = df.filter("page == 'NextSong' ")
    # extract columns for users table
    users_table = df.select(col("userId").cast("long").alias("user_id"),
                          col("firstName").alias("first_name"),
                          col("lastName").alias("last_name"),
                          "gender",
                          "level"
                          )\
                .distinct()\
                .orderBy("user_id")
    # write users table to parquet files
    out_users = os.path.join(output_data, "USERS")
    users_table.write.mode("overwrite").parquet(out_users)

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: int(x / 1000.), LongType())
    df = df.withColumn("timestamp", get_timestamp("ts"))
    spark.udf.register("get_timestamp", get_timestamp)
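    # A UDF-free sketch of the same millisecond-to-second conversion (assuming, as above,
    # that `ts` holds epoch milliseconds): built-in column arithmetic avoids Python UDF
    # overhead. Kept in a throwaway variable so the pipeline below is unchanged.
    df_ts_builtin = df.withColumn("timestamp", (col("ts") / 1000).cast(LongType()))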

    # create datetime column from original timestamp column
    #get_datetime = udf()
    df = df.withColumn("datetime", from_unixtime("timestamp")).withColumn(
        "hour",
        hour("datetime")).withColumn("day", dayofmonth("datetime")).withColumn(
            "week", weekofyear("datetime")).withColumn(
                "month",
                month("datetime")).withColumn("year",
                                              year("datetime")).withColumn(
                                                  "weekday",
                                                  dayofweek("datetime"))
    # extract columns to create time table
    time_table = df.select("ts", "hour", "day", "week", "month", "year",
                           "weekday").distinct()

    # write time table to parquet files partitioned by year and month
    out_time = os.path.join(output_data, "TIMESTAMPS")
    time_table.write.partitionBy("year",
                                 "month").mode("overwrite").parquet(out_time)

    # read in song data to use for songplays table
    song_db = os.path.join(output_data, "SONGS")
    song_df = spark.read.parquet(song_db)

    df.createOrReplaceTempView("lg")
    song_df.createOrReplaceTempView("sg")
    # extract columns from joined song and log datasets to create songplays table
    songplays_table = spark.sql("""
    SELECT lg.ts AS start_time,
        lg.year AS year,
        lg.month AS month,
        lg.userId AS user_id,
        lg.level,
        sg.song_id,
        sg.artist_id,
        lg.sessionId AS session_id,
        lg.location,
        lg.userAgent AS user_agent    
    FROM lg
    JOIN sg ON sg.title = lg.song
    """)

    songplays_table = songplays_table.withColumn("songplay_id",
                                                 monotonically_increasing_id())
    rearrange_col = songplays_table.schema.names[:]
    rearrange_col.insert(0, "songplay_id")
    rearrange_col.pop()
    songplays_table = songplays_table.select(*rearrange_col)

    # write songplays table to parquet files partitioned by year and month
    out_songplay = os.path.join(output_data, "SONGPLAYS")
    songplays_table.write.partitionBy(
        "year", "month").mode("overwrite").parquet(out_songplay)
コード例 #20
0
spark = SparkSession.builder.appName('nyansa').getOrCreate()

#Read text file in a data frame
df1 = spark.read.option("header", "false") \
    .option("delimiter", ",") \
    .option("inferSchema", "true") \
    .csv(sys.argv[1])

#put data in appropriate columns
split_col = F.split(df1['_c0'], "\\|")
df1 = df1.withColumn('to_time_stamp', split_col.getItem(0))
df1 = df1.withColumn('url', split_col.getItem(1))

#convert to date
df1 = df1.withColumn('date', F.from_unixtime('to_time_stamp','MM/dd/yyyy')).withColumn('date', F.to_date('date','MM/dd/yyyy'))

#group by date and url, count urls
df_grouped= df1.groupby('date','url').count() \
               .orderBy(["date", "count"], ascending=[1, 0]) \
               .withColumn('date', F.date_format('date','MM/dd/yyyy'))\
               .withColumnRenamed('count','counts')

l  = df_grouped.collect()

#print as required
def print_result(list_data):    
    prev_date = 0
    for each_row_val in list_data:
        if prev_date != each_row_val.date:
            sys.stdout.write(each_row_val.date + " GMT"+ '\n')
コード例 #21
0
execfile('__pyfiles__/load.py')
# execfile('src/load.py')

from pyspark.sql.types import *
from pyspark import SQLContext
import json

if __name__ == "__main__":

    _, df = load_data(sc, sample=None)

    # df.show()

    df = df.withColumn(
        'created',
        func.from_unixtime(df['created_utc'],
                           'yyyy-MM-dd HH:mm:ss.SS').cast(DateType()))

    df.registerTempTable("comments")

    daily_metrics = spark.sql("""
    SELECT
    
        *,
    
        AVG(count_of_comments) OVER (
            ORDER BY created
            RANGE BETWEEN 30 PRECEDING AND 30 FOLLOWING
        ) AS count_of_comments_60d_avg,

        AVG(count_of_users) OVER (
            ORDER BY created
コード例 #22
0
#                    StructField("review/profileName", StringType(), nullable=False),
#                    StructField("review/score", FloatType(), nullable=False),
#                    StructField("review/summary", StringType(), nullable=False),
#                    StructField("review/text", StringType(), nullable=False),
#                    StructField("review/time", LongType(), nullable=False),
#                    StructField("review/userId", StringType(), nullable=False)
#                    ])
# ```

df = spark.read.json("movies/movies.json")

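# review/helpfulness is an "agreed/reviewed" fraction string such as "3/5";
# split it into its two integer components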
split_col = split(df['review/helpfulness'], '/')
df = df.withColumn('helpfulness_agreed', split_col.getItem(0).cast("int"))
df = df.withColumn('helpfulness_reviewed', split_col.getItem(1).cast("int"))
df = df.withColumn('score', df['review/score'].cast("float"))
df = df.withColumn('reviewed_at', from_unixtime(df['review/time']))

df2 = df.selectExpr("`product/productId` as product_id",
                    "`review/profileName` as profile_name",
                    "`review/summary` as summary", "`review/text` as text",
                    "`review/userId` as user_id", "score",
                    "helpfulness_agreed", "helpfulness_reviewed",
                    "reviewed_at")
df2.show()

df2.write.saveAsTable('amazon_movie_reviews',
                      format="parquet",
                      mode='overwrite')

## Original schema of json for meta.json
# ```
コード例 #23
0
    LANDING_DB_NAME, LANDING_DB_TABLE, transformation_ctx="orders")

ordersDF = orders.toDF()

ordersDF1 = ordersDF.select("invoicedate", "stockcode", "quantity",
                            "storelocation")

ordersDF2 = ordersDF1.withColumnRenamed(
    "stockcode",
    "item_id").withColumnRenamed("quantity", "demand").withColumnRenamed(
        "storelocation", "location").withColumnRenamed("invoicedate",
                                                       "timestamp")

# parse invoicedate strings as dd/MM/yyyy HH:mm:ss (capital MM = month, HH = 24-hour clock)
# and re-emit them as yyyy-MM-dd HH:mm:ss
ordersDF3 = ordersDF2.withColumn(
    'timestamp',
    F.from_unixtime(F.unix_timestamp('timestamp', 'dd/MM/yyyy HH:mm:ss'),
                    'yyyy-MM-dd HH:mm:ss'))

ordersDF4 = ordersDF3.repartition(1)

ordersDF4.write.csv("s3://" + PROCESSED_BUCKET + "/orders/raw")

productsDF1 = ordersDF.select("stockcode", "description", "unitprice")

productsDF2 = productsDF1.withColumnRenamed("stockcode", "item_id")

productsDF3 = productsDF2.repartition(1)

productsDF3.write.csv("s3://" + PROCESSED_BUCKET + "/products/raw")

client = boto3.client('s3')
コード例 #24
0
spark = SparkSession.builder.getOrCreate()

tv_sessions = spark.read.parquet(data_catalog['foo  '])

tv_sessions = (
    tv_sessions.filter((tv_sessions.year == 2018)
                       & (tv_sessions.month >= 1)
                       & (tv_sessions.mediaType == 'series-videos')
                       & (tv_sessions.reach60Srd == 1)).filter(
                           tv_sessions.source.isin('box', 'corder')).select(
                               'customerNumber', 'sessionRecordStartTime',
                               'programGenreLevel', 'programSeriesName',
                               'programSeasonEpisode').withColumn(
                                   "date",
                                   fun.from_unixtime(
                                       fun.unix_timestamp(
                                           tv_sessions.sessionRecordStartTime),
                                       "yyyy-MM-dd")))

# tv_sessions.sessionRecordStartTime is a _timestamp_ type.
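# A minimal equivalent sketch: because sessionRecordStartTime is already a timestamp,
# fun.date_format (or fun.to_date) yields the same "yyyy-MM-dd" string without the
# unix_timestamp round trip above. Kept in a separate variable so the pipeline is unchanged.
tv_sessions_date_alt = tv_sessions.withColumn(
    "date_alt", fun.date_format(tv_sessions.sessionRecordStartTime, "yyyy-MM-dd"))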

import pyspark.sql.functions as sparkfun


def substring_f(startpos, lengte):
    return sparkfun.udf(
        lambda kolom: kolom[startpos - 1:startpos - 1 + lengte])


tv_sessions = (tv_sessions.withColumn(
    "kijkmaand",
    substring_f(startpos=6, lengte=2)(tv_sessions.date)).withColumn(
コード例 #25
0
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    mdl_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_enduser_used_product_history")
    mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.select_fields(
        [
            '_key', 'id', 'used_product_id', 'contact_id', 'status_new',
            'status_old', 'timecreated'
        ])

    mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.resolveChoice(
        specs=[('_key', 'cast:long')])

    df_flag = spark.read.parquet("s3a://dts-odin/flag/flag_LS_S0.parquet")

    max_key = df_flag.collect()[0]['flag']

    mdl_tpe_enduser_used_product_history = Filter.apply(
        frame=mdl_tpe_enduser_used_product_history,
        f=lambda x: x["_key"] > max_key)

    if (mdl_tpe_enduser_used_product_history.count() > 0):
        mdl_tpe_enduser_used_product_history = Filter.apply(
            frame=mdl_tpe_enduser_used_product_history,
            f=lambda x: x["contact_id"] is not None and x[
                "used_product_id"] is not None and x[
                    "status_old"] is None and x["status_new"] == 'DEACTIVED')

        mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.resolveChoice(
            specs=[('timecreated', 'cast:long')])

        df_mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.toDF(
        )

        # df_mdl_tpe_enduser_used_product_history = df_mdl_tpe_enduser_used_product_history.groupby('contact_id', 'used_product_id')

        df_mdl_tpe_enduser_used_product_history = df_mdl_tpe_enduser_used_product_history.withColumn(
            'ngay_kich_hoat',
            from_unixtime(
                df_mdl_tpe_enduser_used_product_history['timecreated'],
                "yyyyMMdd"))

        df_mdl_tpe_enduser_used_product_history = df_mdl_tpe_enduser_used_product_history.withColumn(
            'timestemp',
            df_mdl_tpe_enduser_used_product_history['timecreated'] *
            f.lit(1000))
        # df_mdl_tpe_enduser_used_product_history = df_mdl_tpe_enduser_used_product_history.select('used_product_id',
        #                                                                                    'contact_id',
        #                                                                                    'ngay_kich_hoat',
        #                                                                                    'id').withColumnRenamed(
        #     'used_product_id', 'id_product_buy')
        data_mdl_tpe_enduser_used_product_history = DynamicFrame.fromDF(
            df_mdl_tpe_enduser_used_product_history, glueContext, "datasource")

        data_mdl_tpe_enduser_used_product_history = data_mdl_tpe_enduser_used_product_history.resolveChoice(
            specs=[('timestemp', 'cast:long')])

        applymapping1 = ApplyMapping.apply(
            frame=data_mdl_tpe_enduser_used_product_history,
            mappings=[("used_product_id", "string", "used_product_id",
                       "string"),
                      ("contact_id", "string", "contact_id", "string"),
                      ("ngay_kich_hoat", "string", "ngay_kich_hoat", "int"),
                      ("id", "string", "id", "string"),
                      ("timestemp", "long", "timestamp", "timestamp")])

        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping1,
            choice="make_cols",
            transformation_ctx="resolvechoice2")

        dropnullfields3 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields3")

        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields3,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable":
                "temp_ls_trang_thai_s0_1",
                "database":
                "dts_odin",
                "postactions":
                """  INSERT INTO mapping_changed_status_student (user_id, change_status_date_id, to_status_id, timestamp1, measure1)
                                                                                            SELECT um.user_id, tltta.ngay_kich_hoat, 101, tltta.timestamp, 1
                                                                                            FROM temp_ls_trang_thai_s0_1 tltta 
                                                                                              INNER JOIN user_map um on um.source_type = 1 and um.source_id = tltta.contact_id; DROP TABLE IF EXISTS temp_ls_trang_thai_s0_1;"""
            },
            redshift_tmp_dir="s3n://datashine-dwh/temp1/",
            transformation_ctx="datasink4")
        # write the flag
        # get the max _key from the data source
        datasourceTmp = mdl_tpe_enduser_used_product_history.toDF()
        flag = datasourceTmp.agg({"_key": "max"}).collect()[0][0]

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')

        # overwrite the _key flag to S3
        df.write.parquet(
            "s3a://datashine-dev-redshift-backup/flag/flag_LS_S0.parquet",
            mode="overwrite")
コード例 #26
0
def process_log_data(spark, input_data, output_data):
    '''
    Process the log data from the file(s) specified in the parameters.
    
    Args:
        spark: the spark session
        input_data: 
        output_data:
    
    Returns:
        modeled data from logs and songs json files that are written to parquet files back on S3
    '''
    # get filepath to log data file
    log_data = input_data + "log_data/*/*"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.select(
        col('userId').alias('user_id'),
        col('firstName').alias('first_name'),
        col('lastName').alias('last_name'),
        col('gender').alias('gender'),
        col('level').alias('level')).distinct()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users.parquet", mode="overwrite")

    # create timestamp column from original timestamp column
    df = df.withColumn(
        'timestamp',
        f.to_timestamp(
            f.from_unixtime((col('ts') / 1000),
                            'yyyy-MM-dd HH:mm:ss.SSS')).cast("Timestamp"))

    # create datetime column from original timestamp column
    df = df.withColumn('ts_datetime',
                       f.from_unixtime(col('ts') / 1000).cast('timestamp'))

    # extract columns to create time table
    time_table = df.withColumn("hour", hour(col("timestamp"))) \
          .withColumn("day", dayofmonth(col("timestamp"))) \
          .withColumn("week", weekofyear(col("timestamp"))) \
          .withColumn("month", month(col("timestamp"))) \
          .withColumn("year", year(col("timestamp"))) \
          .withColumn("weekday", datetime.datetime(col("timestamp")).weekday()) \
          .select(
            col("timestamp").alias("start_time"),
            col("hour"),
            col("day"),
            col("week"),
            col("month"),
            col("year"),
            col("weekday")
          )

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + "time.parquet", mode="overwrite")

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + "songs.parquet")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.withColumn(
        'songplay_id', f.monotonically_increasing_id()).join(
            song_df, song_df.title == df.song).select(
                'songplay_id',
                col('timestamp').alias('start_time'),
                col('userId').alias('user_id'), 'level', 'song_id',
                'artist_id',
                col('sessionId').alias('session_id'), 'location',
                col('userAgent').alias('user_agent'))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + "songplays.parquet",
                                  mode="overwrite")
コード例 #27
0
        .format("csv") \
        .load("stream")

    ############### Writing the raw stream to memory #################
    df.writeStream \
        .queryName("Row stream") \
        .format("parquet") \
        .option("path", os.path.join(os.getcwd(), 'sink', 'sink_stream_raw')) \
        .option("checkpointLocation", os.path.join(os.getcwd(), 'checkpoint', 'checkpoint_stream_raw')) \
        .start()

    df = df.drop(df["id_node"])

    # Add latitude and longitude to the dataframe and cast the timestamp into the TimestampType
    df_modified = df.withColumn("timestamp_modified",
                        F.from_unixtime(df["timestamp"] /1000, format='yyyy-MM-dd HH:mm:ss').cast(TimestampType()))\
        .withColumn("location",
                    query_udf(df["latitude"],df["longitude"]))

    df_modified = df_modified.withColumn(
        "timestamp", F.regexp_extract(df["timestamp"], ".{3}$", 0))
    df_modified = df_modified.withColumn(
        "timestamp_millisecond",
        F.concat(df_modified["timestamp_modified"], F.lit('.'),
                 df_modified["timestamp"]).cast(TimestampType()))

    # Splitting the column into different columns using Spark's split function
    split_col = F.split(df_modified["location"], ',')
    df_modified = df_modified.withColumn("name", split_col.getItem(0))\
        .withColumn("highway", split_col.getItem(1))\
        .withColumn("lanes", split_col.getItem(2))\
コード例 #28
0
            'timestamp': 1470663000,
            'url': 'http://example.com/',
            'ip': '192.168.1.1'
        },
        {
            'timestamp': 1470663163,
            'url': 'http://example.com/',
            'ip': '192.168.1.1'
        },
        {
            'timestamp': 1470663277,
            'url': 'http://example.com/article1',
            'ip': '192.168.1.2'
        },
        {
            'timestamp': 1470663277,
            'url': 'http://example.com/article2',
            'ip': '192.168.1.2'
        },
        {
            'timestamp': 1470663277,
            'url': 'http://example.com/article3',
            'ip': '192.168.1.2'
        },
    ])
    logs = logs.map(lambda l: pyspark.sql.Row(**l))
    logs = (sqlContext.createDataFrame(logs).withColumn(
        'timestamp', F.to_date(F.from_unixtime('timestamp'))).withColumn(
            'minute', F.date_format('timestamp', "yyyy-MM-dd'T'HH")))
    (logs.groupBy(['minute', 'url']).count().show())
コード例 #29
0
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = '{0}/log_data/*/*/*.json'.format(input_data)

    # read log data file
    df = spark.read.json(log_data)
    df.createOrReplaceTempView('logs')

    # filter by actions for song plays
    df = spark.sql('''
    select *
    from logs
    where page = 'NextSong'
    ''')

    # extract columns for users table
    users_table = spark.sql('''
    select
        cast(e.userid as int) as user_id,
        e.firstname,
        e.lastname,
        e.gender,
        e.level
    from logs e
    join (
        select max(ts) as ts, userid
        from logs
        where page = 'NextSong'
        group by userid
    ) last_event on last_event.userid = e.userid and last_event.ts = e.ts
    ''')

    # write users table to parquet files
    output_users_path = '{0}/users/'.format(output_data)
    users_table.write.parquet(output_users_path, mode='overwrite')

    # create timestamp column from original timestamp column
    df = df.withColumn('start_time', F.from_unixtime(F.col('ts') / 1000))

    # create datetime column from original timestamp column
    time_table = df.select('ts', 'start_time') \
                   .withColumn('year', F.year('start_time')) \
                   .withColumn('month', F.month('start_time')) \
                   .withColumn('week', F.weekofyear('start_time')) \
                   .withColumn('weekday', F.dayofweek('start_time')) \
                   .withColumn('day', F.dayofyear('start_time')) \
                   .withColumn('hour', F.hour('start_time')).dropDuplicates()

    # write time table to parquet files partitioned by year and month
    output_times_path = '{0}/times/'.format(output_data)
    time_table.write.parquet(output_times_path,
                             mode='overwrite',
                             partitionBy=['year', 'month'])

    # read in song data to use for songplays table
    song_data = '{0}/song_data/*/*/*/*.json'.format(input_data)
    song_df = spark.read.json(song_data)
    song_df.createOrReplaceTempView('songs')

    # extract columns from joined song and log datasets to create songplays table
    time_table.createOrReplaceTempView('times')

    songplays_table = spark.sql('''
    select distinct
        t.start_time,
        cast(e.userid as int) as user_id,
        e.level,
        s.song_id,
        s.artist_id,
        cast(e.sessionid as int) as session_id,
        e.location as location_id,
        e.useragent as user_agent,
        t.year,
        t.month
    from logs e
    join songs s 
        on e.song = s.title 
        and e.artist = s.artist_name
    join times t
        on t.ts = e.ts
    where e.page = 'NextSong'
    ''')

    # write songplays table to parquet files partitioned by year and month
    output_songplays_path = '{0}/songplays/'.format(output_data)
    songplays_table.write.parquet(output_songplays_path,
                                  mode='overwrite',
                                  partitionBy=['year', 'month'])
コード例 #30
0
now = int(time.time())

data = []

# Building a df with a sequence of chronological timestamps
for i in range(0, 1000):
    data = data + [(i, now)]
    now = now + (random.randint(1, 3) + 1)

df = spark.createDataFrame(data, schema)
df.show()
df.printSchema()

# Turning the timestamps into the Timestamp datatype
# (from_unixtime defaults to the 'yyyy-MM-dd HH:mm:ss' format)
df = df.withColumn('date', F.from_unixtime(df.original_ts).cast('timestamp'))
df.show(truncate=False)
df.printSchema()

# Turning back the timestamps to epoch
df = df.withColumn('epoch', F.unix_timestamp(df.date))
df.show(truncate=False)
df.printSchema()

# Collecting the result and printing out
timeRows = [row for row in df.collect()]

for row in timeRows:
    print("{} : {} ({})".format(row[0], row[1], row[2]))

spark.stop()
コード例 #31
0
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")
    # get dynamic frame source
    dyf_3cx_advisor_call = glueContext.create_dynamic_frame.from_catalog(
        database='callcenter', table_name='advisorcall')

    dyf_3cx_advisor_call = dyf_3cx_advisor_call.resolveChoice(
        specs=[('_key', 'cast:long')])
    # print schema and select fields
    print('original schema')
    dyf_3cx_advisor_call.printSchema()

    try:
        df_flag = spark.read.parquet(
            "s3a://dtsodin/flag/student_status/temp_ls_rating_3cx_v1.parquet")
        read_from_index = df_flag.collect()[0]['flag']
        print('read from index: ', read_from_index)
        dyf_3cx_advisor_call = Filter.apply(
            frame=dyf_3cx_advisor_call,
            f=lambda x: x["_key"] > read_from_index)
    except:
        print('read flag file error ')
    print('the number of new contacts: ', dyf_3cx_advisor_call.count())

    dyf_3cx_advisor_call = dyf_3cx_advisor_call.select_fields([
        '_key', 'device', 'hanguptvts', 'status', 'phonenumber', 'rating',
        'calldate'
    ])
    # .rename_field('statuss', 'status')

    dy_source_3cx_cache = dyf_3cx_advisor_call.toDF()
    dy_source_3cx_cache = dy_source_3cx_cache.dropDuplicates(['_key'])
    dy_source_3cx_cache = dy_source_3cx_cache.cache()
    dyf_3cx_advisor_call = DynamicFrame.fromDF(dy_source_3cx_cache,
                                               glueContext,
                                               'dyf_3cx_advisor_call')

    if (dyf_3cx_advisor_call.count() > 0):
        dyf_3cx_advisor_call = Filter.apply(
            frame=dyf_3cx_advisor_call,
            f=lambda x: x["device"] == '3CX' and x["status"] == 'ANSWER' and x[
                "hanguptvts"] == 1 and x["phonenumber"] is not None and x[
                    "phonenumber"] != '' and x["calldate"] is not None and x[
                        "calldate"] != '' and x["rating"] is not None and x[
                            "rating"] > 0 and x["rating"] < 6)

        print('dyf_3cx_advisor_call::corrcect')
        print('dyf_3cx_advisor_call number', dyf_3cx_advisor_call.count())
        if (dyf_3cx_advisor_call.count() > 0):

            dyf_3cx_advisor_call = dyf_3cx_advisor_call.resolveChoice(
                specs=[('phonenumber', 'cast:string')])
            dyf_3cx_advisor_call.printSchema()
            #convert data
            df_advisor_call = dyf_3cx_advisor_call.toDF()
            df_advisor_call = df_advisor_call.withColumn(
                'id_time',
                from_unixtime(
                    unix_timestamp(df_advisor_call.calldate,
                                   "yyyy-MM-dd HH:mm:ss"), "yyyyMMdd"))

            df_advisor_call = df_advisor_call.groupby(
                'phonenumber', 'id_time',
                'rating').agg(f.count('_key').alias("so_lan"))

            df_advisor_call = df_advisor_call.withColumn(
                'phonenumber_correct',
                f.concat(f.lit('0'), df_advisor_call.phonenumber))

            df_advisor_call = df_advisor_call.withColumn(
                'rating_status',
                f.lit(60) + df_advisor_call.rating)

            dyf_3cx_advisor_call_rating_number = DynamicFrame.fromDF(
                df_advisor_call, glueContext,
                'dyf_3cx_advisor_call_rating_number')

            dyf_3cx_advisor_call_rating_number = dyf_3cx_advisor_call_rating_number.resolveChoice(
                specs=[('so_lan', 'cast:int')])

            print('dyf_3cx_advisor_call::after::group::schema')
            dyf_3cx_advisor_call_rating_number.printSchema()
            dyf_3cx_advisor_call_rating_number.show(10)
            print('dyf_3cx_advisor_call after::group: ',
                  dyf_3cx_advisor_call_rating_number.count())

            dyf_ad_contact_phone = glueContext.create_dynamic_frame.from_catalog(
                database='tig_advisor', table_name='student_contact_phone')

            dyf_ad_contact_phone = dyf_ad_contact_phone.select_fields(
                ['phone', 'contact_id'])

            dyf_ad_contact_phone = Filter.apply(
                frame=dyf_ad_contact_phone,
                f=lambda x: x["phone"] is not None and x["phone"] != '' and x[
                    "contact_id"] is not None and x["contact_id"] != '')

            print('dyf_ad_contact_phone::schema')
            dyf_ad_contact_phone.printSchema()

            #-----------------------------------------------------------------------------------------------------------#
            join = Join.apply(dyf_3cx_advisor_call_rating_number,
                              dyf_ad_contact_phone, 'phonenumber_correct',
                              'phone')

            print('join::schema------------')
            join.printSchema()
            join.show(2)
            print('join: ', join.count())

            # choose output fields (mapping)
            applymapping1 = ApplyMapping.apply(
                frame=join,
                mappings=[("contact_id", "string", "contact_id", "string"),
                          ("id_time", "string", "id_time", "bigint"),
                          ("phone", "string", "phone", "string"),
                          ("rating_status", "int", "rating_status", "int"),
                          ("rating", "int", "rating", "int"),
                          ("so_lan", "int", "so_lan", "int")])

            resolvechoice2 = ResolveChoice.apply(
                frame=applymapping1,
                choice="make_cols",
                transformation_ctx="resolvechoice2")
            dropnullfields3 = DropNullFields.apply(
                frame=resolvechoice2, transformation_ctx="dropnullfields3")

            print('dropnullfields3::printSchema')
            dropnullfields3.printSchema()
            dropnullfields3.show(2)

            # write data to Redshift
            datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields3,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable":
                    "temp_ls_rating_3cx_v1",
                    "database":
                    "dts_odin",
                    "postactions":
                    """
                    INSERT INTO mapping_changed_status_student(contact_id, change_status_date_id, user_id, to_status_id, measure1)
                    SELECT t3cx.contact_id, t3cx.id_time, um.user_id, t3cx.rating_status, t3cx.so_lan
                    FROM temp_ls_rating_3cx_v1 t3cx
                    LEFT JOIN user_map um
                        ON um.source_type = 1
                        AND um.source_id = t3cx.contact_id
                    WHERE len(t3cx.contact_id) < 33
                    ;
                    DROP TABLE IF EXISTS public.temp_ls_rating_3cx_v1
                    """
                },
                redshift_tmp_dir="s3n://dts-odin/temp/temp_ls_rating_3cx_v1",
                transformation_ctx="datasink4")
            df_datasource = dyf_3cx_advisor_call.toDF()
            flag = df_datasource.agg({"_key": "max"}).collect()[0][0]
            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            df.write.parquet(
                "s3a://dtsodin/flag/student_status/temp_ls_rating_3cx_v1.parquet",
                mode="overwrite")
            dy_source_3cx_cache.unpersist()
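
The job above relies on a small "flag" parquet file as an incremental-load checkpoint: the maximum processed `_key` is saved after each run and used to filter the next batch. A minimal sketch of that pattern follows; the local flag path and the in-memory source DataFrame are assumptions for illustration (the real job reads a Glue catalog table and writes the flag to S3).

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName('flag-checkpoint-demo').getOrCreate()
flag_path = '/tmp/flag_checkpoint_demo.parquet'  # hypothetical; the job uses an s3a:// path

df_source = spark.createDataFrame([(1,), (2,), (3,)], ['_key'])  # stand-in for the source table

# 1) read the previous high-water mark, if any, and keep only newer rows
try:
    last_key = spark.read.parquet(flag_path).collect()[0]['flag']
    df_new = df_source.where(F.col('_key') > last_key)
except Exception:
    df_new = df_source  # first run: no flag file yet

# ... process df_new ...

# 2) persist the new high-water mark for the next run
max_key = df_new.agg(F.max('_key')).collect()[0][0]
if max_key is not None:
    spark.createDataFrame([(max_key,)], ['flag']).write.parquet(flag_path, mode='overwrite')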
Code example #32
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # ----------------------------------------------DYF-----------------------------------------------------------------#
    dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="student_contact")

    dyf_student_contact = dyf_student_contact.select_fields(
        ['contact_id', 'student_id'])
    #-----------------------------------------DYF-----------------------------------------------------------------------#

    dyf_ghinhan_hp = glueContext.create_dynamic_frame.from_catalog(
        database="poss", table_name="ghinhan_hp")

    dyf_ghinhan_hp = dyf_ghinhan_hp.select_fields(
        ['ngay_thanhtoan', 'khoa_hoc_makh',
         'trang_thai']).rename_field('trang_thai', 'trang_thai_gnhp')

    dyf_ghinhan_hp = Filter.apply(frame=dyf_ghinhan_hp,
                                  f=lambda x: x["trang_thai_gnhp"] == True)
    # -----------------------------------------DYF-----------------------------------------------------------------------#

    dyf_khoa_hoc = glueContext.create_dynamic_frame.from_catalog(
        database="poss", table_name="khoa_hoc")

    dyf_khoa_hoc = dyf_khoa_hoc.select_fields(['makh', 'mahv',
                                               'trang_thai']).rename_field(
                                                   'trang_thai',
                                                   'trang_thai_kh')

    dyf_khoa_hoc = Filter.apply(frame=dyf_khoa_hoc,
                                f=lambda x: x["trang_thai_kh"] == True)
    # -----------------------------------------DYF-----------------------------------------------------------------------#

    dyf_hoc_vien = glueContext.create_dynamic_frame.from_catalog(
        database="poss", table_name="hoc_vien")

    dyf_hoc_vien = dyf_hoc_vien.select_fields([
        'mahv', 'crm_id', 'trang_thai'
    ]).rename_field('mahv', 'mahv_hv').rename_field('trang_thai',
                                                    'trang_thai_hv')

    dyf_hoc_vien = Filter.apply(frame=dyf_hoc_vien,
                                f=lambda x: x["trang_thai_hv"] == True)
    #-------------------------------------------------------------------------------------------------------------------#

    df_student_contact_1 = dyf_student_contact.toDF()
    df_student_contact_1 = df_student_contact_1.drop_duplicates()  # drop_duplicates() is not in-place; reassign the result
    df_student_contact = df_student_contact_1.groupby(
        'contact_id', 'student_id').agg(
            f.count('contact_id').alias("contact_id_after_count"))
    dyf_student_contact = DynamicFrame.fromDF(df_student_contact, glueContext,
                                              "dyf_student_contact")
    dyf_student_contact = Filter.apply(
        frame=dyf_student_contact, f=lambda x: x["contact_id_after_count"] > 1)

    df_student_contact = dyf_student_contact.toDF()
    df_student_contact = df_student_contact.drop_duplicates()  # reassign; drop_duplicates() returns a new DataFrame
    df_student_contact.cache()
    df_student_contact.printSchema()
    df_student_contact.show(2)
    print('df_student_contact count::', df_student_contact.count())

    df_ghinhan_hp = dyf_ghinhan_hp.toDF()
    df_khoa_hoc = dyf_khoa_hoc.toDF()
    df_hoc_vien = dyf_hoc_vien.toDF()

    #------------------------------------------___JOIN___---------------------------------------------------------------#

    df_join = df_ghinhan_hp.join(
        df_khoa_hoc, df_ghinhan_hp.khoa_hoc_makh == df_khoa_hoc.makh)
    df_join.printSchema()
    print('df_join count::', df_join.count())

    df_join1 = df_join.join(df_hoc_vien, df_join.mahv == df_hoc_vien.mahv_hv)
    df_join1.printSchema()
    print('df_join1 count::', df_join1.count())

    df_join2 = df_join1.join(df_student_contact,
                             df_join1.crm_id == df_student_contact.contact_id)

    df_join2 = df_join2.withColumn(
        'change_status_date_id',
        from_unixtime(unix_timestamp(df_join2.ngay_thanhtoan, "yyyy-MM-dd"),
                      "yyyyMMdd"))
    df_join2 = df_join2.drop_duplicates()  # reassign; drop_duplicates() returns a new DataFrame
    df_join2.printSchema()
    df_join2.show(2)
    print('df_join2 count::', df_join2.count())

    # df_join2.printSchema()
    # print('df_join2 count::', df_join2.count())

    #-----------------------------------_____choose_name_field______----------------------------------------------------#
    to_status_id = 201
    df_result = df_join2.select('student_id', 'change_status_date_id',
                                f.lit(to_status_id).alias('to_status_id'),
                                'contact_id')

    df_result.printSchema()
    df_result.show(3)
    df_result = df_result.drop_duplicates()
    df_result.cache()
    print('count df_result::', df_result.count())
    dyf_result = DynamicFrame.fromDF(df_result, glueContext, "dyf_result")
    dyf_result = Filter.apply(
        frame=dyf_result,
        f=lambda x: x["student_id"] is not None and x[
            "change_status_date_id"] is not None and x[
                "to_status_id"] is not None and x["contact_id"] is not None)

    apply_output = ApplyMapping.apply(
        frame=dyf_result,
        mappings=[
            ("student_id", "string", "student_id", "long"),
            # ("user_id", "long", "user_id", "long"),
            ("change_status_date_id", "string", "change_status_date_id", "long"
             ),
            # ("from_status_id", "long", "from_status_id", "long"),
            ("to_status_id", "long", "to_status_id", "long"),
            # ("measure1", "double", "measure1", "double"),
            # ("measure2", "double", "measure2", "double"),
            # ("description", "string", "description", "string"),
            # ("timestamp1", "string", "timestamp1", "string"),
            ("contact_id", "string", "contact_id", "string"),

            # ("teacher_id", "long", "teacher_id", "long"),
            # ("contact_id1", "string", "contact_id1", "string"),
            # ("measure1_int", "int", "measure1_int", "int"),
            # ("measure2_int", "int", "measure2_int", "int"),
            # ("contact_id_str", "string", "contact_id_str", "string"),
            # ("lc", "string", "lc", "string"),
            # ("student_id_string", "string", "student_id_string", "string")
        ])
    df_apply_output = apply_output.toDF()
    df_apply_output = df_apply_output.drop_duplicates()  # reassign; drop_duplicates() returns a new DataFrame
    print('df_apply_output.count', df_apply_output.count())
    dyf_apply_output = DynamicFrame.fromDF(df_apply_output, glueContext,
                                           "dyf_apply_output")

    resolve_choice = ResolveChoice.apply(frame=dyf_apply_output,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")

    dropnullfields = DropNullFields.apply(frame=resolve_choice,
                                          transformation_ctx="dropnullfields")

    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dropnullfields,
        catalog_connection="glue_redshift",
        connection_options={
            "dbtable": "mapping_changed_status_student_v1",
            "database": "dts_odin"
        },
        redshift_tmp_dir="s3n://datashine-dwh/temp1/",
        transformation_ctx="datasink4")

    df_result.unpersist()
    df_student_contact.unpersist()
    print(
        '------------------------>___complete__________------------------------------>'
    )
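
The yyyyMMdd date ids used by both jobs above come from the same from_unixtime(unix_timestamp(...)) idiom. A minimal sketch, assuming a string date column named `ngay_thanhtoan` with illustrative values:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName('date-id-demo').getOrCreate()

df = spark.createDataFrame([('2019-03-01',), ('2019-03-15',)], ['ngay_thanhtoan'])

# parse the date string to epoch seconds, then re-format it as a yyyyMMdd id
df = df.withColumn(
    'change_status_date_id',
    F.from_unixtime(F.unix_timestamp('ngay_thanhtoan', 'yyyy-MM-dd'), 'yyyyMMdd'))
df.show()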
Code example #33
# NOTE: the original snippet is truncated at the top. The imports and the first line of
# the UDF definition are reconstructed here from how the names are used below; the output
# path variables referenced at the end are defined in the omitted part of the script.
import sys

import pygeohash
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

# UDF that buckets a (lat, long) pair into a precision-6 geohash cell
calcualte_the_geohash_udf = F.udf(
    lambda x, y: pygeohash.encode(float(x), float(y), precision=6),
    StringType())

spark = SparkSession.builder.appName('supply-to-demand-app').getOrCreate()

#'hdfs:///grab_data/user_values_staging/'

driver_messages_hdfs_path = sys.argv[1]
user_messages_hdfs_path = sys.argv[2]

driver_msgs_df = spark.read.parquet(driver_messages_hdfs_path).withColumn(
    "geo_hash", calcualte_the_geohash_udf("lat", "long")).select(
        "geo_hash",
        F.col("timestamp").cast('timestamp').alias('time')).select(
            "geo_hash",
            F.from_unixtime(F.unix_timestamp('time', 'yyyy-MM-dd HH:mm:ss'),
                            'yyyy-MM-dd HH:mm').alias('date_time')).groupBy(
                                "geo_hash", "date_time").agg(
                                    F.count("*").alias("supply_count"))
user_msgs_df = spark.read.parquet(user_messages_hdfs_path).withColumn(
    "geo_hash", calcualte_the_geohash_udf("lat", "long")).select(
        "geo_hash",
        F.col("timestamp").cast('timestamp').alias('time')).select(
            "geo_hash",
            F.from_unixtime(F.unix_timestamp('time', 'yyyy-MM-dd HH:mm:ss'),
                            'yyyy-MM-dd HH:mm').alias('date_time')).groupBy(
                                "geo_hash", "date_time").agg(
                                    F.count("*").alias("demand_count"))

driver_msgs_df.write.format('parquet').mode('append').save(
    driver_messages_for_batch_processing_hdfs_path)
user_msgs_df.write.format('parquet').mode('append').save(
    user_messages_for_batch_processing_hdfs_path)  # assumed name, mirroring the driver path variable; the original line is truncated here
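
A minimal local sketch of the same geohash-plus-minute bucketing, using an in-memory DataFrame instead of the HDFS parquet inputs so the aggregation logic can be tested without a cluster; the coordinates and timestamps are illustrative only.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
import pygeohash

spark = SparkSession.builder.appName('geohash-bucketing-demo').getOrCreate()

geohash_udf = F.udf(lambda lat, lon: pygeohash.encode(float(lat), float(lon), precision=6),
                    StringType())

msgs = spark.createDataFrame(
    [('1.3521', '103.8198', '2019-06-01 08:00:10'),
     ('1.3522', '103.8199', '2019-06-01 08:00:45'),
     ('1.2800', '103.8500', '2019-06-01 08:01:05')],
    ['lat', 'long', 'timestamp'])

supply = (msgs
          .withColumn('geo_hash', geohash_udf('lat', 'long'))
          .withColumn('date_time',
                      F.from_unixtime(F.unix_timestamp('timestamp', 'yyyy-MM-dd HH:mm:ss'),
                                      'yyyy-MM-dd HH:mm'))
          .groupBy('geo_hash', 'date_time')
          .agg(F.count('*').alias('supply_count')))
supply.show(truncate=False)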
Code example #34
    def rdd_to_recordstore(rdd_transform_context_rdd):

        if rdd_transform_context_rdd.isEmpty():
            MonMetricsKafkaProcessor.log_debug(
                "rdd_to_recordstore: nothing to process...")
        else:

            sql_context = SQLContext(rdd_transform_context_rdd.context)
            data_driven_specs_repo = DataDrivenSpecsRepoFactory.\
                get_data_driven_specs_repo()
            pre_transform_specs_df = data_driven_specs_repo.\
                get_data_driven_specs(
                    sql_context=sql_context,
                    data_driven_spec_type=DataDrivenSpecsRepo.
                    pre_transform_specs_type)

            #
            # extract second column containing raw metric data
            #
            raw_mon_metrics = rdd_transform_context_rdd.map(
                lambda nt: nt.rdd_info[1])

            #
            # convert raw metric data rdd to dataframe rdd
            #
            raw_mon_metrics_df = \
                MonMetricUtils.create_mon_metrics_df_from_json_rdd(
                    sql_context,
                    raw_mon_metrics)

            #
            # filter out unwanted metrics and keep metrics we are interested in
            #
            cond = [
                raw_mon_metrics_df.metric.name ==
                pre_transform_specs_df.event_type]
            filtered_metrics_df = raw_mon_metrics_df.join(
                pre_transform_specs_df, cond)

            #
            # validate filtered metrics to check that required fields
            # are present and not empty.
            # In order to apply the filter function, the dataframe had to be
            # converted to a plain rdd. After validation the rdd is converted
            # back to a dataframe
            #
            # FIXME: find a way to apply filter function on dataframe rdd data
            validated_mon_metrics_rdd = filtered_metrics_df.rdd.filter(
                MonMetricsKafkaProcessor._validate_raw_mon_metrics)
            validated_mon_metrics_df = sql_context.createDataFrame(
                validated_mon_metrics_rdd, filtered_metrics_df.schema)

            #
            # record generator
            # generate a new intermediate metric record when a given metric's
            # metric_id_list in the pre_transform_specs table has several
            # intermediate metrics defined.
            # intermediate metrics are a convenient way to process an
            # (aggregated) metric in multiple ways, by making a copy
            # of the source data for each processing path
            #
            gen_mon_metrics_df = validated_mon_metrics_df.select(
                validated_mon_metrics_df.meta,
                validated_mon_metrics_df.metric,
                validated_mon_metrics_df.event_processing_params,
                validated_mon_metrics_df.event_type,
                explode(validated_mon_metrics_df.metric_id_list).alias(
                    "this_metric_id"),
                validated_mon_metrics_df.service_id)

            #
            # transform metrics data to record_store format
            # record store format is the common format which will serve as
            # source to aggregation processing.
            # converting the metric to common standard format helps in writing
            # generic aggregation routines driven by configuration parameters
            #  and can be reused
            #
            record_store_df = gen_mon_metrics_df.select(
                (gen_mon_metrics_df.metric.timestamp / 1000).alias(
                    "event_timestamp_unix"),
                from_unixtime(
                    gen_mon_metrics_df.metric.timestamp / 1000).alias(
                    "event_timestamp_string"),
                gen_mon_metrics_df.event_type.alias("event_type"),
                gen_mon_metrics_df.event_type.alias("event_quantity_name"),
                (gen_mon_metrics_df.metric.value / 1.0).alias(
                    "event_quantity"),
                when(gen_mon_metrics_df.metric.dimensions.state != '',
                     gen_mon_metrics_df.metric.dimensions.state).otherwise(
                    'NA').alias("event_status"),
                lit('1.0').alias('event_version'),
                lit('metrics').alias("record_type"),

                # resource_uuid
                when(gen_mon_metrics_df.metric.dimensions.instanceId != '',
                     gen_mon_metrics_df.metric.dimensions.instanceId).when(
                    gen_mon_metrics_df.metric.dimensions.resource_id != '',
                    gen_mon_metrics_df.metric.dimensions.resource_id).
                otherwise('NA').alias("resource_uuid"),

                when(gen_mon_metrics_df.metric.dimensions.tenantId != '',
                     gen_mon_metrics_df.metric.dimensions.tenantId).when(
                    gen_mon_metrics_df.metric.dimensions.tenant_id != '',
                    gen_mon_metrics_df.metric.dimensions.tenant_id).when(
                    gen_mon_metrics_df.metric.dimensions.project_id != '',
                    gen_mon_metrics_df.metric.dimensions.project_id).otherwise(
                    'NA').alias("tenant_id"),

                when(gen_mon_metrics_df.metric.dimensions.mount != '',
                     gen_mon_metrics_df.metric.dimensions.mount).otherwise(
                    'NA').alias("mount"),

                when(gen_mon_metrics_df.metric.dimensions.device != '',
                     gen_mon_metrics_df.metric.dimensions.device).otherwise(
                    'NA').alias("device"),

                when(gen_mon_metrics_df.meta.userId != '',
                     gen_mon_metrics_df.meta.userId).otherwise('NA').alias(
                    "user_id"),

                when(gen_mon_metrics_df.meta.region != '',
                     gen_mon_metrics_df.meta.region).when(
                    gen_mon_metrics_df.event_processing_params
                    .set_default_region_to != '',
                    gen_mon_metrics_df.event_processing_params
                    .set_default_region_to).otherwise(
                    'NA').alias("region"),

                when(gen_mon_metrics_df.meta.zone != '',
                     gen_mon_metrics_df.meta.zone).when(
                    gen_mon_metrics_df.event_processing_params
                    .set_default_zone_to != '',
                    gen_mon_metrics_df.event_processing_params
                    .set_default_zone_to).otherwise(
                    'NA').alias("zone"),

                when(gen_mon_metrics_df.metric.dimensions.hostname != '',
                     gen_mon_metrics_df.metric.dimensions.hostname).when(
                    gen_mon_metrics_df.metric.value_meta.host != '',
                    gen_mon_metrics_df.metric.value_meta.host).otherwise(
                    'NA').alias("host"),

                when(gen_mon_metrics_df.service_id != '',
                     gen_mon_metrics_df.service_id).otherwise(
                    'NA').alias("service_group"),

                when(gen_mon_metrics_df.service_id != '',
                     gen_mon_metrics_df.service_id).otherwise(
                    'NA').alias("service_id"),

                from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000,
                              'yyyy-MM-dd').alias("event_date"),
                from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000,
                              'HH').alias("event_hour"),
                from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000,
                              'mm').alias("event_minute"),
                from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000,
                              'ss').alias("event_second"),
                gen_mon_metrics_df.this_metric_id.alias("metric_group"),
                gen_mon_metrics_df.this_metric_id.alias("metric_id"))

            #
            # get transform context
            #
            rdd_transform_context = rdd_transform_context_rdd.first()
            transform_context = rdd_transform_context.transform_context_info

            #
            # cache record store rdd
            #
            if cfg.CONF.service.enable_record_store_df_cache:
                storage_level_prop = \
                    cfg.CONF.service.record_store_df_cache_storage_level
                storage_level = StorageUtils.get_storage_level(
                    storage_level_prop)
                record_store_df.persist(storage_level)

            #
            # start processing metrics available in record_store data
            #
            MonMetricsKafkaProcessor.process_metrics(transform_context,
                                                     record_store_df)

            # remove df from cache
            if cfg.CONF.service.enable_record_store_df_cache:
                record_store_df.unpersist()

            #
            # extract kafka offsets and batch processing time
            # stored in transform_context and save offsets
            #
            offsets = transform_context.offset_info

            # batch time
            batch_time_info = \
                transform_context.batch_time_info

            MonMetricsKafkaProcessor.save_kafka_offsets(
                offsets, rdd_transform_context_rdd.context.appName,
                batch_time_info)

            # call pre hourly processor, if its time to run
            if (cfg.CONF.stage_processors.pre_hourly_processor_enabled
                    is True and PreHourlyProcessor.is_time_to_run(
                        batch_time_info)):
                PreHourlyProcessor.run_processor(
                    record_store_df.rdd.context,
                    batch_time_info)
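
The record-store projection above leans heavily on the when(...).when(...).otherwise('NA') idiom to fall back across sparsely populated dimension fields. A minimal sketch of that pattern, with illustrative column names and data:

from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col

spark = SparkSession.builder.appName('dimension-default-demo').getOrCreate()

df = spark.createDataFrame(
    [('tenant-a', ''), ('', 'proj-b'), ('', '')],
    ['tenantId', 'project_id'])

# take tenantId if set, else project_id if set, else 'NA'
df = df.select(
    when(col('tenantId') != '', col('tenantId'))
    .when(col('project_id') != '', col('project_id'))
    .otherwise('NA').alias('tenant_id'))
df.show()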
Code example #35
File: stage1_1.py  Project: noelleli/documentation
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
import pandas as pd


if __name__ == "__main__":
    appName = "stage-1"
    sparkmaster = open("/root/spark-ec2/cluster-url").read().strip()
    conf = SparkConf().setMaster(sparkmaster).setAppName(appName)
    sc = SparkContext(conf = conf)
    sqlContext = SQLContext(sc)
    data_input = "s3n://make-emr-data/input/weblog/*"
    df = sqlContext.read.json(data_input)           
    dfview = df[df['data_type'] == "MODULE_VIEW"]
    postviews = dfview.select("payload.post_id", "payload.time_stamp", "payload.author").withColumnRenamed("post_id", "postid")
    cat_input = "s3n://make-emr-data/input/webprop/*"
    df2 = sqlContext.read.json(cat_input)
    dfcat = df2[df2['data_type'] == "TEXT"]
    payload2 = dfcat.select("payload.post_id", "payload.publish_time_stamp")
    postcat = payload2.distinct()
    cond = [postviews.postid == postcat.post_id]
    dfjoin = postviews.join(postcat, cond, "left_outer")
    dfdatetime = dfjoin.withColumn('datetime', F.from_unixtime(dfjoin['time_stamp'], format = "yyyy-MM-dd"))
    dffinal = dfdatetime.withColumn('pub_date', F.from_unixtime(dfdatetime['publish_time_stamp'], format = "yyyy-MM-dd"))
    sqlContext.registerDataFrameAsTable(dffinal, "dftable")
    dfgroupby = sqlContext.sql("select count(postid) as viewcounts, pub_date, author, datetime, post_id from dftable group by datetime, pub_date, author, post_id")
    data_output = "s3n://make-emr-data/output/"
    dfgroupby.write.mode("overwrite").json(data_output)
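
For reference, a minimal self-contained sketch showing that the SQL aggregation above can also be expressed with the DataFrame API; the tiny in-memory DataFrame stands in for `dffinal`, and its column names and sample values follow the snippet but are otherwise illustrative.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName('groupby-api-demo').getOrCreate()

dffinal_demo = spark.createDataFrame(
    [('p1', 'alice', '2016-05-01', '2016-04-20', 'p1'),
     ('p1', 'alice', '2016-05-01', '2016-04-20', 'p1'),
     ('p2', 'bob',   '2016-05-02', '2016-04-25', 'p2')],
    ['postid', 'author', 'datetime', 'pub_date', 'post_id'])

# same grouping keys and count as the SQL statement above
viewcounts = (dffinal_demo
              .groupBy('datetime', 'pub_date', 'author', 'post_id')
              .agg(F.count('postid').alias('viewcounts')))
viewcounts.show()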