Example #1
def filter_traces_contained(df, dt1, dt2, parameters=None):
    """Gets traces that are contained in the given interval
    """

    if parameters is None:
        parameters = {}
    timestamp_key = parameters.get(PARAMETER_CONSTANT_TIMESTAMP_KEY, DEFAULT_TIMESTAMP_KEY)
    case_id_glue = parameters.get(PARAMETER_CONSTANT_CASEID_KEY, CASE_CONCEPT_NAME)
    dt1 = get_dt_from_string(dt1)
    dt2 = get_dt_from_string(dt2)
    df_converted = importer.convert_timestamp_to_utc_in_df(
        df, timest_columns={timestamp_key})
    df_ordered = df_converted.orderBy(case_id_glue, timestamp_key)
    w = Window().partitionBy(case_id_glue).orderBy(timestamp_key)
    w2 = Window().partitionBy(case_id_glue).orderBy(F.desc(timestamp_key))
    stacked = df_ordered.withColumn(timestamp_key + "_last",
                                    F.max(df_ordered[timestamp_key]).over(w2))
    stacked = stacked.withColumn(timestamp_key + "_first",
                                 F.min(stacked[timestamp_key]).over(w))
    stacked = stacked.filter(stacked[timestamp_key + "_first"] > dt1)
    stacked = stacked.filter(stacked[timestamp_key + "_last"] < dt2)
    stacked_dropped = stacked.drop(timestamp_key + "_last",
                                   timestamp_key + "_first")

    return stacked_dropped
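The containment test above boils down to comparing each case's first and last event timestamp against the interval bounds. A minimal, self-contained sketch of the same idea (the column names, values and local SparkSession below are made up for illustration):

from pyspark.sql import SparkSession, functions as F, Window

spark = SparkSession.builder.master("local[1]").appName("containment-demo").getOrCreate()

events = spark.createDataFrame(
    [("c1", "2021-01-02"), ("c1", "2021-01-05"),
     ("c2", "2020-12-30"), ("c2", "2021-01-03")],
    ["case_id", "ts"],
).withColumn("ts", F.col("ts").cast("timestamp"))

w = Window.partitionBy("case_id")
# keep only cases whose first AND last event fall inside the interval
contained = (events
             .withColumn("first_ts", F.min("ts").over(w))
             .withColumn("last_ts", F.max("ts").over(w))
             .filter((F.col("first_ts") > F.lit("2021-01-01").cast("timestamp")) &
                     (F.col("last_ts") < F.lit("2021-01-31").cast("timestamp")))
             .drop("first_ts", "last_ts"))
contained.show()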
Example #2
def profiler(table):
    print('PROFILING TABLE: ' + str(table))
    table.unpersist()
    table.cache()
    for coll in table.columns:
        print('CHECKING MAX LENGTH')
        try:
            max_length = table.rdd.map(lambda x: len(str(x[coll]))).reduce(
                lambda x, y: x if x > y else y)
        except Exception as e:
            continue
        print('CHECKING MIN LENGTH')
        try:
            min_length = table.rdd.map(lambda x: len(str(x[coll]))).reduce(
                lambda x, y: x if x < y else y)
        except Exception as e:
            continue
        print('MAX LENGTH: ' + str(max_length) + ' MIN LENGTH: ' +
              str(min_length))
        print('GROUP BY ON COLUMN: ' + str(coll))
        groupBy = table.groupBy(coll).agg(count(coll).alias('c')).orderBy(
            col('c').desc())
        groupBy.show(10, 1000)
        if groupBy.count() > 1:
            print('THE COLUMN HAS MULTIPLE DISTINCT VALUES')
        else:
            print('THE COLUMN HAS A SINGLE DISTINCT VALUE')
        print('SAMPLE ROW PER DISTINCT VALUE OF COLUMN: ' + str(coll))
        table.withColumn(
            'row_num',
            row_number().over(
                Window().partitionBy(coll).orderBy(coll))).filter(
                    col('row_num') == 1).show(10, 100)
    table.unpersist()
    print('FINISHED PROCESSING TABLE: ' + str(table))
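Assuming the surrounding module imports count, col, row_number and Window from pyspark.sql (as the body requires), a throwaway run of the profiler might look like this; the session and data are hypothetical:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("profiler-demo").getOrCreate()
toy = spark.createDataFrame([("a", 1), ("b", 2), ("b", 3)], ["k", "v"])
# prints min/max string length, value frequencies and a sample row per distinct value for 'k' and 'v'
profiler(toy)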
Example #3
def main():
    print (f"""Getting average yearly prices per region for all""")
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    spark = s.setSparkConfBQ(spark)
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nStarted at");uf.println(lst)
    wSpecY = Window().partitionBy(F.date_format('Date',"yyyy"), 'regionname')
    house_df = s.loadTableFromBQ(spark,config['GCPVariables']['sourceDataset'],config['GCPVariables']['sourceTable'])
    house_df.printSchema()
    house_df.show(2, False)

    print(f"""\nAnnual House prices per regions in GBP""")
    # Work out yearly average prices
    df2 = (house_df
           .select(F.date_format('Date', 'yyyy').cast("Integer").alias('year'),
                   'regionname',
                   round(F.avg('averageprice').over(wSpecY)).alias('AVGPricePerYear'),
                   round(F.avg('flatprice').over(wSpecY)).alias('AVGFlatPricePerYear'),
                   round(F.avg('TerracedPrice').over(wSpecY)).alias('AVGTerracedPricePerYear'),
                   round(F.avg('SemiDetachedPrice').over(wSpecY)).alias('AVGSemiDetachedPricePerYear'),
                   round(F.avg('DetachedPrice').over(wSpecY)).alias('AVGDetachedPricePerYear'))
           .distinct()
           .orderBy('year', ascending=True))
    df2.show(20,False)
    s.writeTableToBQ(df2,"overwrite",config['GCPVariables']['targetDataset'],config['GCPVariables']['yearlyAveragePricesAllTable'])
    print(f"""created {config['GCPVariables']['yearlyAveragePricesAllTable']}""")
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nFinished at");uf.println(lst)
Example #4
def load_powietrze(keys_space_name="json",
                   table_name="powietrze",
                   time_frame=None,
                   spark=None,
                   agg=None,
                   time_frames="5 minutes",
                   time_update="1 minute"):

    # Load the data

    powietrze_temp, sc = load_table.load_and_get_table_df(
        keys_space_name, table_name, time_frame, spark)

    # Add variables that describe the time in detail and drop columns not used for prediction

    powietrze = powietrze_preprocessing(powietrze_temp, agg, time_frames,
                                        time_update)

    powietrze.sort("name", "timestamp").show(200)
    # Create the target variable

    w = Window().partitionBy("name").orderBy("timestamp")
    dane = powietrze.withColumn("target",
                                lead("target_temp", 4).over(w)).na.drop()

    #dane.sort("name", "timestamp").show(200)
    #print(dane.schema)

    return dane, sc
Example #5
def overall_prediction_grouping(csv, prediction_dataset=None):
    """
    Grouping dataset by date
    :param csv: -- dataframe: containing all the data
    :param prediction_dataset: -- dataframe: containing previous prediction
    :return: -- dataframe: grouped
    """
    grouped = csv.groupby('Date').agg({'Date': 'count'})
    grouped_with_date = grouped.withColumn('Date',
                                           change_to_date_func(col('Date')))
    window_row = Window().orderBy('Date')
    grouped_indexed = grouped_with_date.withColumn(
        'id',
        row_number().over(window_row))

    if prediction_dataset:
        grouped_with_cols = grouped_indexed.select('Date', 'id',
                                                   'count(Date)').withColumn(
                                                       'id',
                                                       to_vector(col('id')))

        prediction_dataset_with_cols = prediction_dataset.select(
            'Date', 'id',
            'prediction').withColumnRenamed('prediction', 'count(Date)')
        return grouped_with_cols.union(prediction_dataset_with_cols)
    else:
        return grouped_indexed.withColumn('id', to_vector(col('id')))
Example #6
 def _add_outliers(dataframe, **kwargs):
     """
     Calculate a boundary for which a data point will be considered an outlier [bool]
     The boundary is the mean plus "stddev" (number of standard derivations) * the standard derivation
     Uses pyspark's Window function to partition over the special predictions and thereby count number of data 
     points in each cluster, their number of outliers and the outlier percentage 
     
     :param dataframe: 
     :param kwargs: 
         prediction_col can be set in the function call, else it will search for the column name 'predictionCol'
         distance_col can be set in the function call, else it will search for the column name 'distance'
         no_stddev (number of standard derivations) can be set in the function call, else default sat to 2
     :return: dataframe with added 'is_outlier' bool column
     """
     assert kwargs.get('distance_col', 'distance') in dataframe.columns, 'Distances have not been computed!'
     prediction_col = F.col(kwargs.get('prediction_col', 'prediction'))
     distance_col = F.col(kwargs.get('distance_col', 'distance'))
     no_stddev = kwargs.get('no_stddev', 2.0)
     window_outlier = Window().partitionBy(prediction_col)
     computed_boundary = (F.mean(distance_col).over(window_outlier)
                          + no_stddev * F.stddev_pop(distance_col).over(window_outlier)
                          )
     return (dataframe
             .withColumn(colName='computed_boundary',
                         col=computed_boundary)
             .withColumn(colName='is_outlier',
                         col=F.when(distance_col > computed_boundary, 1).otherwise(0))
             )
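Since _add_outliers is a method fragment, the boundary logic is easiest to exercise inline on a toy frame; the prediction/distance columns and the factor of 2 below are illustrative assumptions:

from pyspark.sql import SparkSession, functions as F, Window

spark = SparkSession.builder.master("local[1]").appName("outlier-demo").getOrCreate()
pts = spark.createDataFrame(
    [(0, 1.0), (0, 1.2), (0, 9.0), (1, 2.0), (1, 2.1)],
    ["prediction", "distance"],
)
w = Window.partitionBy("prediction")
# mean + 2 * population stddev of the distance, computed per cluster
boundary = F.mean("distance").over(w) + 2.0 * F.stddev_pop("distance").over(w)
(pts
 .withColumn("computed_boundary", boundary)
 .withColumn("is_outlier", F.when(F.col("distance") > boundary, 1).otherwise(0))
 .show())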
Example #7
def merge_tracks_sources(tracks1, tracks2):
    # 1. Union overlapping columns
    overlapping_columns = [c for c in tracks1.columns if c in tracks2.columns]
    print("overlapping_columns", overlapping_columns)
    tracks1_project = tracks1.select(overlapping_columns)
    tracks2_project = tracks2.select(overlapping_columns)
    tracks = tracks1_project.union(tracks2_project)  # .distinct()
    print("tracks", tracks.count(), tracks1.count(), tracks2.count())
    tracks.show(10, truncate=10)
    show_distinct(tracks, "id", count_only=True)

    # 2. Remove duplicate tracks
    w = Window().partitionBy("id").orderBy("name")
    # some tracks appear in both data sources, but their records are not identical, and we select the first record for each id
    tracks = tracks.select('*',
                           row_number().over(w).alias("rank")).where(
                               "rank = 1").select(overlapping_columns)

    # 3. Append extra columns from both data sources
    tracks = tracks.join(
        tracks1.select(
            'id',
            *[c for c in tracks1.columns if c not in overlapping_columns]),
        'id', 'left')
    tracks = tracks.join(
        tracks2.select(
            'id',
            *[c for c in tracks2.columns if c not in overlapping_columns]),
        'id', 'left')
    # TODO(etl): remove duplicates due to redundant records in the right-side table of left outer join
    return tracks
Example #8
def load_velib(keys_space_name="json",
               table_name="velib",
               time_frame=None,
               spark=None,
               agg=None,
               time_frames="5 minutes",
               time_update="1 minute"):

    # Load the data

    velib_temp, sc = load_table.load_and_get_table_df(keys_space_name,
                                                      table_name, time_frame,
                                                      spark)

    # Add variables that describe the time in detail and drop columns not used for prediction

    velib = velib_preprocessing(velib_temp, agg, time_frames, time_update)

    velib.sort("station_id", "timestamp").show(300)

    # Create the target variable

    w = Window().partitionBy("station_id").orderBy("timestamp")
    dane = velib.withColumn("target",
                            lead("num_bikes_available",
                                 240).over(w)).na.drop()

    #dane.sort("station_id", "timestamp").show(300)
    #print(dane.dtypes)

    return dane, sc
Example #9
    def chain_pings(self):
        print(
            "\n_______________________________________________\nCHAINING PINGS\n\n"
        )

        w = Window().partitionBy('device_id',
                                 'study_dt').orderBy('utc_timestamp')
        init_cnt = self.df.count()

        self.df = self.df.withColumn('chain_dist', ((((self.df['accuracy'] + lead(self.df['accuracy'],1).over(w)) - 10) * (230 / 120) + 200))) \
                .withColumn('chain', when((distance(self.df['latitude'], self.df['longitude'], \
                           lead(self.df['latitude'],1).over(w), lead(self.df['longitude'], 1).over(w),'feet')) <= col('chain_dist'), 1) \
                          .when((distance(self.df['latitude'], self.df['longitude'], \
                         lag(self.df['latitude'],1).over(w), lag(self.df['longitude'], 1).over(w),'feet')) <= lag(col('chain_dist'), 1).over(w), 1).otherwise(0))

        self.unchain_df = self.df.filter(self.df['chain'] == 0) \
                                    .drop('chain_dist','chain')

        self.df = self.df.filter(self.df['chain'] == 1) \
                                    .drop('chain_dist','chain')

        unchain_cnt = self.unchain_df.cache().count()
        chain_cnt = self.df.cache().count()



        tbl_data = [['Initial count', init_cnt, 0, 0, 'Count of pings prior to analyzing spatial relationships'], \
                    ['Chained count', chain_cnt, init_cnt - chain_cnt, ((init_cnt - chain_cnt) / float(init_cnt)) * 100, \
                      'Count of pings that have spatially proximate neighbors to consider for clustering']]

        # Display filter table
        print(tabulate(tbl_data, floatfmt=".2f", headers=['Phase', 'Ping Count', 'Removed Pings', \
                                                          'Percent Reduction', 'Description']))
Example #10
def load_urzedy(keys_space_name="json",
                table_name="urzedy",
                time_frame=None,
                spark=None,
                agg=None,
                time_frames="5 minutes",
                time_update="1 minute"):

    # Load the data

    urzedy_temp, sc = load_table.load_and_get_table_df(keys_space_name,
                                                       table_name, time_frame,
                                                       spark)

    urzedy = urzedy_preprocessing(urzedy_temp, agg, time_frames, time_update)

    urzedy.sort("idgrupy", "timestamp").show(300)

    # Create the target variable

    w = Window().partitionBy("idgrupy").orderBy("timestamp")
    dane = urzedy.withColumn("target",
                             lead("liczbaklwkolejce", 240).over(w)).na.drop()

    #dane.sort("idgrupy", "timestamp").show(300)
    #print(dane.dtypes)

    return dane, sc
Example #11
 def findTopMovies(self, userDataDF, itemDF):
     windowPerMovie = Window.partitionBy("item_id")
     windowPerMovieSortRating = Window().orderBy(F.desc("sumOfRating"))
     topMoviesDF=userDataDF.filter("timestamp is not null").join(itemDF,userDataDF.item_id==itemDF.movieid,"inner").\
         select("item_id","movietitle",F.sum("rating").over(windowPerMovie).alias("sumOfRating")).\
         select("*").distinct().select("*",F.row_number().over(windowPerMovieSortRating).alias("rnk"))
     return topMoviesDF
Example #12
    def _create_index(self, index, make_index):
        '''Handles index creation logic based on user input'''

        if index is None:
            # Case 1: user wanted to make index but did not specify column name
            assert not make_index, "Must specify an index name if make_index is True"
            # Case 2: make_index not specified but no index supplied, use first column
            logger.warning(("Using first column as index. ",
                            "To change this, specify the index parameter"))
            index = self.df.columns[0]
        elif make_index and index in self.df.columns:
            # Case 3: user wanted to make index but column already exists
            raise RuntimeError(
                "Cannot make index: index column already present")
        elif index not in self.df.columns:
            if not make_index:
                # Case 4: user names index, it is not in df. does not specify
                # make_index.  Make new index column and warn
                logger.warning(
                    "index %s not found in dataframe, creating new "
                    "integer column", index)
            # Case 5: make_index with no errors or warnings
            # (Case 4 also uses this code path)

            rank_window = Window().orderBy(col(self.df.columns[0]))
            new_add_col = row_number().over(rank_window)

            self.df = self.df.withColumn(index, new_add_col - 1)

        # Case 6: user specified index, which is already in df. No action needed.
        self.index = index
Example #13
def get_top_10_coinstalls(addons_expanded_day):
    def str_map_to_dict(m):
        result = {}
        for i in m:
            k, v = i.split("=")
            result[k] = v
        return result

    def format_row(row):
        return Row(
            addon_id=row.addon_id,
            top_10_coinstalls=str_map_to_dict(row.top_10_coinstalls),
        )

    w = Window().partitionBy("addon_id").orderBy(F.col("count").desc())
    d = (
        addons_expanded_day.join(
            addons_expanded_day.filter("is_system=false").withColumnRenamed(
                "addon_id", "coaddon"),
            on="client_id",
        ).groupby("client_id", "addon_id", "coaddon").count().withColumn(
            "rn", (F.row_number().over(w) - F.lit(1)))  # start at 0
        .filter("rn BETWEEN 1 and 10"
                )  # ignore 0th addon (where coaddon==addon_id)
        .groupby("addon_id").agg(
            F.collect_list(F.concat(F.col("rn"), F.lit("="), "coaddon")).alias(
                "top_10_coinstalls")).rdd.map(format_row).toDF())

    return d
Example #14
def build_dictionary_table(df, *column, index_column="id"):
    dictionary = df.select(*column).distinct().sort(*column)
    #for c in column:
    #    dictionary = dictionary.where("{0} is not null".format(c))
    w = Window().orderBy(*column)
    dictionary = dictionary.withColumn(index_column, row_number().over(w))
    return dictionary
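Assuming row_number and Window are imported in the surrounding module (the body uses both), a hypothetical call could map distinct (country, city) pairs to surrogate ids:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("dictionary-demo").getOrCreate()
places = spark.createDataFrame(
    [("PL", "Warsaw"), ("PL", "Krakow"), ("PL", "Warsaw"), ("DE", "Berlin")],
    ["country", "city"],
)
# three distinct pairs -> place_id 1..3
build_dictionary_table(places, "country", "city", index_column="place_id").show()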
Example #15
def calculate_average_distance(vehicles_evts_df, op_prd_evts_df):
    """
    calculate average distance in an operating period of all vehicles and
    per vehicle as well.
    :param vehicles_evts_df: vehicle events DF.
    :param op_prd_evts_df: operating periods DF.
    :return: DF with col `distance`
    """
    # so that we could join both data frames.
    vehicles_evts_df = vehicles_evts_df.withColumn('key', F.lit(1))
    op_prd_evts_df = op_prd_evts_df.withColumn('key', F.lit(1))

    df_merge = vehicles_evts_df.join(op_prd_evts_df, on='key',
                                     how='left').drop('key')
    df_merge = df_merge \
        .withColumn('lng', F.toRadians('lng')) \
        .withColumn('lat', F.toRadians('lat'))

    w = Window().partitionBy('op_prd_id', 'vehicle_id').orderBy("at")

    df = df_merge.withColumn(
        'distance',
        calculate_distance('lng', 'lat',
                           F.lag('lng', 1).over(w),
                           F.lag('lat', 1).over(w))).alias('distance')

    df = df.withColumn(
        'distance',
        F.when(F.isnull(df['distance']),
               0).otherwise(df['distance'])).alias('distance')

    return df
Example #16
def shift_1(df,shift_count=1):
    from pyspark.sql.functions import col,lag
    from pyspark.sql.window import Window
    
    w_1 = Window().partitionBy().orderBy(col('date'))
    # previous-row value of 'sales', shift_count rows back
    df = df.withColumn('shift_1', lag('sales', shift_count).over(w_1))
  
    return df
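A hypothetical call on a tiny sales frame; 'date' and 'sales' are the column names the function expects, everything else is made up:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("shift-demo").getOrCreate()
sales = spark.createDataFrame(
    [("2021-01-01", 10.0), ("2021-01-02", 12.0), ("2021-01-03", 9.0)],
    ["date", "sales"],
)
shift_1(sales).show()                 # 'shift_1' holds the previous row's sales
shift_1(sales, shift_count=2).show()  # same column name, two rows back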
Example #17
def Breiman(df, label_column, column_names):
    '''This function calculates the average of a given column conditional on the value of another column'''
    for col in column_names:
        print(col)
        w = Window().partitionBy(col)
        df = df.withColumn(col + "B", avg(label_column).over(w))
        df = df.drop(col)  # keep only the encoded average column
    return df
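Assuming avg and Window are imported in the surrounding module, a toy call (hypothetical 'cat'/'label' columns) replaces each categorical column with the within-category mean of the label:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("breiman-demo").getOrCreate()
toy = spark.createDataFrame(
    [("a", 1.0), ("a", 3.0), ("b", 2.0)],
    ["cat", "label"],
)
Breiman(toy, "label", ["cat"]).show()   # adds 'catB' = avg(label) per 'cat' and drops 'cat'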
Example #18
 def markDistinct(self, dataFrame):
     w = Window().partitionBy('Key').orderBy(Functions.lit('A'))
     localDf = dataFrame.withColumn('IsDistinctKey',
                                    Functions.row_number().over(w))
     localDf = localDf.withColumn(
         'IsDistinctKey',
         Functions.when(localDf.IsDistinctKey == 1, '0').otherwise('1'))
     return localDf
Example #19
def add_lead_lag(df, variable):
    for month in range(1,37,3):
        w = Window().partitionBy(col("GVKEY")).orderBy(col("GVKEY_year_mth"))
        first_new_col = "forward_"+str(month)+"_month_"+str(variable)
        second_new_col = "past_"+str(month)+"_month_"+str(variable)
        df = df.withColumn(first_new_col, lag(col(variable),-month,None).over(w)) \
        .withColumn(second_new_col, lag(col(variable),month,None).over(w))
    return df
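Assuming col and lag are imported in the surrounding module, a hypothetical monthly panel keyed by GVKEY and a sortable GVKEY_year_mth string could be shifted like this:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("leadlag-demo").getOrCreate()
panel = spark.createDataFrame(
    [("001", "2020-01", 1.0), ("001", "2020-02", 1.1), ("001", "2020-03", 1.2)],
    ["GVKEY", "GVKEY_year_mth", "ret"],
)
# adds forward_/past_ columns for offsets 1, 4, 7, ..., 34 months
add_lead_lag(panel, "ret").show(truncate=False)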
Example #20
def extract_nested_json(source_dataframe, table, col_table_name, col_nested_data, cols_to_add_hash):
    df2 = source_dataframe.filter('lower(' + col_table_name + ') == "' + table + '"')
    # Collect the union of (lower-cased) keys that appear in the nested JSON column
    df2_1_col = (df2.select(col_nested_data).rdd
                 .map(lambda p1: is_json(p1[col_nested_data]))
                 .map(lambda g2: [k.lower() for k in g2.keys()])
                 .reduce(lambda h77, h78: list(set(h77 + h78))))
    # Normalise every record to that key set, stringifying values and filling missing keys with ""
    df2_1 = (df2.select(col_nested_data).rdd
             .map(lambda p1: is_json(p1[col_nested_data]))
             .map(lambda g2: {k.lower(): str(v) for k, v in g2.items()})
             .map(lambda g4: {k29: g4.get(k29, "") for k29 in df2_1_col}))
    df3_2 = df2_1.map(lambda v: Row(**v)).toDF()
    # Attach a synthetic row index to both frames so they can be zipped back together
    df2 = df2.withColumn("columnindex", row_number().over(Window().partitionBy(lit("A")).orderBy(lit('A'))))
    df3_2 = df3_2.withColumn("columnindex", row_number().over(Window().partitionBy(lit("A")).orderBy(lit('A'))))
    final = df2.join(df3_2, df2.columnindex == df3_2.columnindex, 'inner').drop(df3_2.columnindex)
    final = final.drop('columnindex')
    sha_columns = df3_2.columns
    sha_columns.remove("columnindex")
    if isinstance(cols_to_add_hash, list):
        sha_columns.extend(cols_to_add_hash)
    else:
        sha_columns.append("eventtype")
    final = final.withColumn("sha_key", sha2(concat_ws("||", *sha_columns), 256))
    return final
Example #21
def get_top_addon_names(addons_expanded):
    w = Window().partitionBy("addon_id").orderBy(F.col("n").desc())
    cnts = addons_expanded.groupby("addon_id", "name").agg(
        F.countDistinct("client_id").alias("n"))
    addon_names = (cnts.withColumn(
        "rn",
        F.row_number().over(w)).where(F.col("rn") == 1).select(
            "addon_id", "name"))
    return addon_names
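Assuming F and Window are imported as in the other snippets, a toy addons_expanded frame (hypothetical columns) shows how the most frequent display name per addon_id is picked:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("addon-names-demo").getOrCreate()
addons_expanded = spark.createDataFrame(
    [("c1", "a1", "uBlock"), ("c2", "a1", "uBlock"), ("c3", "a1", "ublock"),
     ("c1", "a2", "Dark Mode")],
    ["client_id", "addon_id", "name"],
)
get_top_addon_names(addons_expanded).show()   # one (addon_id, name) row per addon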
Example #22
  def __baseWindow(self):
    # add all sort keys - time is first, unique sequence number breaks the tie

    ptntl_sort_keys = [self.ts_col, self.sequence_col]
    sort_keys = [f.col(col_name).cast("long") for col_name in ptntl_sort_keys if col_name != '']

    w = Window().orderBy(sort_keys)
    if self.partitionCols:
      w = w.partitionBy([f.col(elem) for elem in self.partitionCols])
    return w
Example #23
def cbind(df1, df2):
    df1 = df1.withColumn('const', F.lit(1))
    df2 = df2.withColumn('const', F.lit(1))

    w = Window().partitionBy().orderBy('const')
    df1 = df1.withColumn("row_id", F.rank().over(w))
    df2 = df2.withColumn("row_id", F.row_number().over(w))
    cbind_df = df1.join(df2, on=["row_id"]).sort("row_id").drop("row_id")

    return cbind_df.drop('const')
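A hypothetical call pairing two equally long frames by position (F and Window imports assumed as above); the single-partition window means this is only sensible for small frames:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("cbind-demo").getOrCreate()
left = spark.createDataFrame([(1,), (2,), (3,)], ["a"])
right = spark.createDataFrame([("x",), ("y",), ("z",)], ["b"])
cbind(left, right).show()   # columns 'a' and 'b' side by side, matched by row position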
Example #24
    def inner(df):
        from pyspark.sql.functions import lag, col, datediff

        w = Window().partitionBy().orderBy(col(colName))
        df1 = (df.select(colName,
                         lag(colName).over(w).alias("lagged_col")).na.drop())
        (df1.withColumn('diff_col', datediff(
            df1[colName], df1['lagged_col'])).sort('diff_col',
                                                   ascending=False).show())
        return df
Example #25
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = ' '

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter((f.col('page') == 'NextSong'))

    # extract columns for users table
    users_table = df.selectExpr("userId as user_id", "firstName as first_name",
                                " lastName as  last_name", "gender", "level")

    # write users table to parquet files
    users_table.write.parquet("Users_Table")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime(
        '%Y-%m-%d %H:%M:%S'))
    df = df.withColumn("timestamp", get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(
        lambda x: datetime.fromtimestamp(x / 1000).strftime('%Y-%m-%d'))
    df = df.withColumn("datetime", get_datetime(df.ts))

    # create weekday column from original timestamp column
    get_weekday = udf(
        lambda x: datetime.fromtimestamp(x / 1000).strftime('%A'))
    df = df.withColumn("weekday", get_weekday(df.ts))

    # extract columns to create time table
    time_table = df.select("timestamp",
                           hour("timestamp").alias('hour'),
                           dayofmonth("timestamp").alias('day'),
                           weekofyear("timestamp").alias('weekofyear'),
                           month("timestamp").alias('month'),
                           year("timestamp").alias('year'), "weekday")

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy(['year', 'month']).parquet("time_table")

    # read in song data to use for songplays table
    song_df = spark.read.json(" ")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = song_df.alias('a').join(df.alias('b'),(col('b.song') == col('a.title')) & (col('b.artist') == col('a.artist_name') ) )\
.selectExpr("ts as start_time", "userId as user_id", "level", "song_id", "artist_id", "sessionId as session_id", "location", "useragent as user_agent").distinct()
    w = Window().orderBy(lit('A'))
    songplays_table = songplays_table.withColumn("songplay_id",
                                                 row_number().over(w))
    # write songplays table to parquet files partitioned by year and month
    songplays_table = songplays_table.withColumn("year", year(get_timestamp(col("start_time")))) \
                                     .withColumn("month", month(get_timestamp(col("start_time"))))
    songplays_table.write.partitionBy(['year',
                                       'month']).parquet("songplays_table")
Example #26
def initParameters():
    read_df = readSourceData()
    transform_df = transformData()

    spark_session = SparkSession.builder \
        .master('local[1]') \
        .appName(ctest['common']['appName']) \
        .getOrCreate()
    wSpecY = Window().partitionBy(F.date_format('datetaken', "yyyy"),
                                  'regionname')
    return [spark_session, wSpecY]
Example #27
    def process_stream(self, rdd):
        """
        Args rdd: rdd
        :rtype: None
        """
        def detect_anomaly(sensor_readings, running_avg, std_dev):
            """
            Args:
                sensor_readings: List(float)
                running_avg: float
                std_dev: float
            :rtype: int
            """
            anomalies = []
            for x, (i, y) in zip(sensor_readings, enumerate(running_avg)):
                upper_limit = running_avg[i - 1] + 3 * std_dev
                lower_limit = running_avg[i - 1] - 3 * std_dev
                if (x > upper_limit) or (x < lower_limit):
                    anomalies.append(x)
            return len(anomalies)

        if rdd.isEmpty():

            print("RDD is empty")
        else:
            df = rdd.toDF().cache()
            w = (Window().partitionBy(col("id")).rowsBetween(-1, 1))
            df = df.withColumn('rolling_average', F.avg("val").over(w))
            agg_df = df.groupBy(['id']).agg(
                F.collect_list("val").alias("sensor_reading"),
                first("ts").cast('timestamp').alias("start_ts"),
                last("ts").cast('timestamp').alias("end_ts"),
                F.round(F.stddev("val"), 3).alias("std_temp"),
                F.collect_list("rolling_average").alias("rol_avg"))
            agg_df.show()
            anomaly_udf = udf(detect_anomaly, IntegerType())
            processed_df = agg_df.withColumn(
                "num_anomaly",
                anomaly_udf("sensor_reading", "rol_avg",
                            "std_temp")).sort(desc("num_anomaly"))
            final_df = processed_df.withColumn(
                "anomaly",
                F.when(F.col("num_anomaly") > 1, True).otherwise(False))
            final_df = final_df.select("id", "start_ts", "end_ts", "std_temp",
                                       "num_anomaly", "anomaly")
            try:
                connector = pgConnector.PostgresConnector(
                    "ec2-3-94-71-208.compute-1.amazonaws.com", "datanodedb",
                    "datanode", "password")
                connector.write(final_df, "anomaly_window_tbl", "append")

            except Exception as e:
                print(e)
                pass
Example #28
def generate_window(order_col, rowrange=None, partitions=None):
    window = Window().orderBy(order_col)

    # for future reference: if we want to use partitions
    # if partitions is not None:
    #     window = window.partitionBy(partitions)

    if rowrange is not None:
        window = window.rowsBetween(*rowrange)

    return window
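A hypothetical trailing window over a toy series; rowrange is a (start, end) pair passed straight to rowsBetween, and Window is assumed imported where generate_window lives:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").appName("window-demo").getOrCreate()
series = spark.createDataFrame([(i, float(i)) for i in range(6)], ["t", "v"])
w = generate_window("t", rowrange=(-2, 0))              # current row and the two before it
series.withColumn("trailing_avg", F.avg("v").over(w)).show()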
Example #29
def cvm_formatting_cdw(
        recs,
        model_master_name,
        model_name: str,
        model_type: str,
        start_date: str,
        time_prd_val: int = -7,
        env: str = 'TST',

):
    model_master = oracle_cdw_read(f'SELECT MASTER_ID FROM (SELECT MASTER_ID FROM {model_master_name} '
                                   f'ORDER BY master_id DESC) WHERE ROWNUM = 1',
                                   database = 'CDWCMMO',
                                   env = env,
                                   db_type = 'Oracle').toPandas()
    master_id = model_master['MASTER_ID'][0] + 1
    start_date, end_date = date_period(time_prd_val, start_date)
    w = Window().orderBy('sku_X', 'cvm_rank')
    output = recs. \
        withColumn('MASTER_ID', lit(master_id)). \
        withColumn('RECORD_ID', row_number().over(w)). \
        withColumn('TIME_PRD_VAL', lit(time_prd_val)). \
        withColumn('MODEL_TYPE', lit(model_type)). \
        withColumn('MODEL_NAME', lit(model_name)). \
        withColumn('START_DATE', lit(start_date)). \
        withColumn('END_DATE', lit(end_date))

    # Order the columns
    output = output. \
        select('MASTER_ID',
               'RECORD_ID',
               'TIME_PRD_VAL',
               'MODEL_TYPE',
               'MODEL_NAME',
               'SKU_X',
               'COUPON_X',
               'SKU_Y',
               'COUPON_Y',
               'BASKET_COUNT_XY',
               'BASKET_COUNT_X',
               'BASKET_COUNT_Y',
               'SKU_BASKET_COUNT_X',
               'SKU_BASKET_COUNT_Y',
               'SKU_SALES_X',
               'SKU_SALES_Y',
               'CONFIDENCE',
               'CVM_RANK',
               'START_DATE',
               'END_DATE',

               )

    return master_id, output
Example #30
def main(bike_inputs, taxi_inputs, year):
    # read df from csv
    bike_trips_df = spark.read.csv('tripsl.csv', header=True)
    taxi_trips_df = spark.read.csv('taxi.csv', header=True)
    #filter on given year
    bike_trips_df = bike_trips_df.withColumn(
        'date', bike_trips_df['starttime'].cast('date'))
    bike_trips_df = bike_trips_df.withColumn(
        'year', functions.year(bike_trips_df['date']))
    bike_trips_df = bike_trips_df.filter(bike_trips_df['year'] == 2016)

    #calculate the average distance travelled by bikes this year
    bike_dist_df = bike_trips_df.groupby([
        'start_station_name', 'end_station_name', 'start_station_latitude',
        'start_station_longitude', 'end_station_latitude',
        'end_station_longitude'
    ]).count().withColumnRenamed('start_station_name',
                                 'startStationName').withColumnRenamed(
                                     'end_station_name', 'endStationName')

    #calculate distance
    w = Window().partitionBy(['start_station_name',
                              'end_station_name']).orderBy(
                                  ['start_station_name', 'end_station_name'])
    bike_dist_df = bike_dist_df.withColumn(
        "dist",
        dist("start_station_longitude", "start_station_latitude",
             'end_station_longitude', 'end_station_latitude').cast('decimal'))

    #calculate 50 percentile distance
    average_distance_km = bike_dist_df.orderBy('dist').selectExpr(
        'percentile_approx(dist, 0.5)').collect()[0][0]

    #convert km to miles
    average_distance = float(average_distance_km) * 0.621371
    #filter out any taxi trips larger than this distance
    new_taxi_trips_df = taxi_trips_df.filter(
        taxi_trips_df['trip_distance'] < average_distance).filter(
            taxi_trips_df['pickup_latitude'].isNotNull()).filter(
                taxi_trips_df['pickup_longitude'].isNotNull()).filter(
                    taxi_trips_df['dropoff_latitude'].isNotNull()).filter(
                        taxi_trips_df['dropoff_longitude'].isNotNull())
    #calculate the velocity of each trip in miles/s
    new_taxi_trips_df = new_taxi_trips_df.withColumn(
        'velocity', new_taxi_trips_df['trip_distance'] /
        new_taxi_trips_df['travel_time']).orderBy('velocity')
    traffic_velocity = new_taxi_trips_df.selectExpr(
        'percentile_approx(velocity, 0.2)').collect()[0][0]
    new_taxi_trips_df = new_taxi_trips_df.filter(
        new_taxi_trips_df['velocity'] <= traffic_velocity)
    new_taxi_trips_df.repartition(1).toPandas().to_csv(
        'data/q3_data/taxi_traffic.csv', header=True)