def chain_pings(self):
    """Split pings into 'chained' (have a spatially proximate neighbor) and
    'unchained' (isolated) sets.

    Side effects: replaces self.df with the chained pings, stores the
    isolated pings in self.unchain_df (both cached), and prints a summary
    table via tabulate.
    """
    print( "\n_______________________________________________\nCHAINING PINGS\n\n" )
    # Neighbors are defined per device per study day, in timestamp order.
    w = Window().partitionBy('device_id', 'study_dt').orderBy('utc_timestamp')
    init_cnt = self.df.count()
    # chain_dist: allowed distance (feet) to the next ping, derived from the
    # combined GPS accuracy of the two pings.
    # NOTE(review): the constants (10, 230/120, 200) look empirically tuned;
    # their provenance is not visible here — confirm before changing.
    self.df = self.df.withColumn('chain_dist', ((((self.df['accuracy'] + lead(self.df['accuracy'],1).over(w)) - 10) * (230 / 120) + 200))) \
        .withColumn('chain',
                    # chained if close enough to the NEXT ping ...
                    when((distance(self.df['latitude'], self.df['longitude'],
                                   lead(self.df['latitude'],1).over(w),
                                   lead(self.df['longitude'], 1).over(w), 'feet')) <= col('chain_dist'), 1)
                    # ... or to the PREVIOUS ping, judged by the previous
                    # row's own threshold.
                    .when((distance(self.df['latitude'], self.df['longitude'],
                                    lag(self.df['latitude'],1).over(w),
                                    lag(self.df['longitude'], 1).over(w), 'feet')) <= lag(col('chain_dist'), 1).over(w), 1)
                    .otherwise(0))
    # Isolated pings: no close neighbor on either side.
    self.unchain_df = self.df.filter(self.df['chain'] == 0) \
        .drop('chain_dist', 'chain')
    # Chained pings continue through the clustering pipeline.
    self.df = self.df.filter(self.df['chain'] == 1) \
        .drop('chain_dist', 'chain')
    unchain_cnt = self.unchain_df.cache().count()
    chain_cnt = self.df.cache().count()
    tbl_data = [['Initial count', init_cnt, 0, 0,
                 'Count of pings prior to analyzing spatial relationships'],
                ['Chained count', chain_cnt, init_cnt - chain_cnt,
                 ((init_cnt - chain_cnt) / float(init_cnt)) * 100,
                 'Count of pings that have spatially proximate neighbors to consider for clustering']]
    # Display filter table
    print(tabulate(tbl_data, floatfmt=".2f",
                   headers=['Phase', 'Ping Count', 'Removed Pings',
                            'Percent Reduction', 'Description']))
def do_it(spark):
    """Generate the test frame and attach the inclusive-window aggregates
    plus the lead/lag/row-number columns exercised by this test."""
    windowed = [
        ('inc_count_1', f.count('*').over(inclusiveWindowSpec)),
        ('inc_count_c', f.count('c').over(inclusiveWindowSpec)),
        ('inc_max_c', f.max('c').over(inclusiveWindowSpec)),
        ('inc_min_c', f.min('c').over(inclusiveWindowSpec)),
        ('lead_5_c', f.lead('c', 5).over(baseWindowSpec)),
        ('lead_def_c', f.lead('c', 2, defaultVal).over(baseWindowSpec)),
        ('lag_1_c', f.lag('c', 1).over(baseWindowSpec)),
        ('lag_def_c', f.lag('c', 4, defaultVal).over(baseWindowSpec)),
        ('row_num', f.row_number().over(baseWindowSpec)),
    ]
    result = gen_df(spark, data_gen, length=2048)
    for col_name, expr in windowed:
        result = result.withColumn(col_name, expr)
    return result
def read(self, iDF):
    """Pair up consecutive input lines so each even-indexed line ('seqID')
    carries the line that follows it ('seq').

    :param iDF: DataFrame whose rows are raw text lines in file order.
    :return: DataFrame with columns (seqID, seq, "+", quality).
    """
    # Attach a stable ordinal so the original file order survives the
    # RDD -> DataFrame round-trip.
    plain_df_idx = iDF.rdd\
        .zipWithIndex().toDF(["row","idx"])\
        .orderBy(asc("idx"))\
        .coalesce(10)
    # NOTE(review): an unpartitioned window pulls all rows into one task.
    Windowspec = Window.orderBy("idx")
    # NOTE(review): the 'count=' keyword of F.lead exists only in very old
    # PySpark releases (modern signature is lead(col, offset, default));
    # lead(..., count=0) is the row itself — confirm the target version.
    oDF = plain_df_idx\
        .withColumn("seq", F.lead("row",count=1).over(Windowspec))\
        .withColumn("seqID", F.lead("row",count=0).over(Windowspec))
    # NOTE(review): "+" and "quality" are selected but never created above —
    # presumably produced upstream (FASTQ-style input); verify they exist.
    parsedDF = oDF.filter(F.col("idx") % 2 == 0).select(
        "seqID", "seq", "+", "quality")
    return parsedDF
def generate_dataset_m(df_assembled, CONFIG_PREPROCESS):
    """Build the 'memory' (M) dataset: per cell, assemble one day of feature
    vectors, then the same time slot across DAYS_TO_MEMORY days.

    :param df_assembled: DataFrame with columns 'dt', 'CELL_NUM', 'features'.
    :param CONFIG_PREPROCESS: config exposing INPUT_M_SIZE, ITEMS_PER_DAY
        and DAYS_TO_MEMORY.
    :return: DataFrame ['dt', 'CELL_NUM', 'features'].
    """
    m_window_spec = Window.partitionBy('CELL_NUM').orderBy('dt')
    m_feat_cols = ['day0_features0']
    m_days_cols = ['day0_features']
    # Row number within each cell; used later to trim the series tail.
    shuffled_with_cell = df_assembled.withColumn(
        'seq', row_number().over(m_window_spec)).cache()
    input_m = shuffled_with_cell.withColumnRenamed('features', m_feat_cols[0])
    # Preserve VectorUDT metadata so downstream ML stages still treat the
    # generated columns as vectors.
    vector_udt_metadata = input_m.schema[input_m.schema.fields.index(
        input_m.schema[m_feat_cols[0]])].metadata
    # Generate 1 day data (5min * 10 data)
    for i in range(1, CONFIG_PREPROCESS.INPUT_M_SIZE):
        # i-th future observation within the same cell.
        n_features = lead(col(m_feat_cols[0]), i).over(m_window_spec)
        col_name = 'day{}_features{}'.format(0, i)
        input_m = input_m.withColumn(
            col_name, n_features.alias(col_name, metadata=vector_udt_metadata))
        m_feat_cols.append('day{}_features{}'.format(0, i))
    input_m = input_m.dropna()
    input_m = VectorAssembler().setInputCols(m_feat_cols).setOutputCol(
        m_days_cols[0]).transform(input_m)
    vector_udt_metadata = input_m.schema[-1].metadata
    # for DAYS_TO_MEMORY(7) days memory in same time zone
    for i in range(1, CONFIG_PREPROCESS.DAYS_TO_MEMORY):
        # Same slot, i days later (ITEMS_PER_DAY rows per day).
        n_features = lead(col('day0_features'),
                          int(CONFIG_PREPROCESS.ITEMS_PER_DAY * i)).over(m_window_spec)
        col_name = 'day{}_features'.format(i)
        input_m = input_m.withColumn(
            col_name, n_features.alias(col_name, metadata=vector_udt_metadata))
        m_days_cols.append('day{}_features'.format(i))
    m_tail_skip_size = CONFIG_PREPROCESS.ITEMS_PER_DAY  # rows to skip, for 1 Day (X & Y)
    # Drop tail rows where lead() ran past the end of the series.
    # NOTE(review): 'max' here must be pyspark.sql.functions.max (shadowing
    # the builtin at import time) — confirm the module's imports.
    inbound = when(
        input_m['seq'] <= (max(input_m['seq']).over(
            m_window_spec.rangeBetween(-sys.maxsize, sys.maxsize)) -
                           m_tail_skip_size), 1).otherwise(0)
    input_m = input_m.dropna().withColumn('inbound', inbound).filter('inbound == 1')
    input_m = VectorAssembler().setInputCols(
        m_days_cols).setOutputCol('features').transform(input_m).select([
            'dt', 'CELL_NUM', 'features'
        ])  # assemble DAYS_TO_MEMORY days columns into one ('features')
    return input_m
def Sparkseeds(dict, i, k, hashDF, sc):
    """K-mer seed matching: hash every k-mer of sequence *i*, join against the
    genome hash table, and keep hits belonging to runs of nearby hits.

    :param dict: mapping of sequence index -> sequence string.
        NOTE(review): shadows the builtin 'dict'; renaming would change the
        signature for keyword callers, so it is left as-is.
    :param i: key of the sequence to process.
    :param k: k-mer length.
    :param hashDF: DataFrame of genome k-mer hashes (ID_GEN, POS_GEN).
    :param sc: SparkContext.
    :return: DataFrame (NUM_SEQ, ID_SEQ, POS_SEQ, POS_GEN) of retained hits.
    """
    # (sequence id, k-mer hash, position in sequence) for every k-mer.
    word = [(i, HashTable.hash_djb2(dict[i][j:j + k]), j)
            for j in range(0, len(dict[i]) - k)]
    rddW = sc.parallelize(word)
    schemaWordDF = rddW.map(
        lambda x: Row(NUM_SEQ=x[0], ID_SEQ=x[1], POS_SEQ=x[2]))
    # NOTE(review): relies on a module-level 'sqlContext'.
    df = sqlContext.createDataFrame(schemaWordDF)
    # Keep only k-mers that occur in the genome hash table.
    reDF = df.join(hashDF, df.ID_SEQ == hashDF.ID_GEN, how='inner')
    reDF = reDF.orderBy(reDF.POS_SEQ).select(reDF.NUM_SEQ, reDF.ID_SEQ,
                                             reDF.POS_SEQ, reDF.POS_GEN)
    my_window = Window.partitionBy(reDF.NUM_SEQ).orderBy(reDF.POS_SEQ)
    # dist: gap to the previous hit (0 for the first hit of a sequence).
    reDF = reDF.withColumn("prev_value", F.lag(reDF.POS_SEQ).over(my_window))
    reDF = reDF.withColumn(
        "dist",
        F.when(F.isnull(reDF.POS_SEQ - reDF.prev_value),
               0).otherwise(reDF.POS_SEQ - reDF.prev_value))
    reDF = reDF.select(reDF.NUM_SEQ, reDF.ID_SEQ, reDF.POS_SEQ, reDF.dist,
                       reDF.POS_GEN)
    # dist0: the NEXT row's gap; an isolated hit has large gaps on both sides.
    reDF = reDF.withColumn("dist0", F.lead(reDF.dist).over(my_window))
    # Eliminate hits that are >= 50 away from both neighbors.
    elDF = reDF.filter(((reDF.dist == 0) | (reDF.dist >= 50))
                       & ((reDF.dist0.isNull()) | (reDF.dist0 >= 50)))
    reDF = reDF.subtract(elDF)
    reDF = reDF.orderBy(reDF.POS_SEQ).select(reDF.NUM_SEQ, reDF.ID_SEQ,
                                             reDF.POS_SEQ, reDF.POS_GEN)
    #pos = function(reDF)
    return reDF
def load_urzedy(keys_space_name="json",
                table_name="urzedy",
                time_frame=None,
                spark=None,
                agg=None,
                time_frames="5 minutes",
                time_update="1 minute"):
    """Load the 'urzedy' table, preprocess it and attach the prediction
    target (queue length 240 steps ahead per group)."""
    # Fetch raw rows from the source table.
    raw_df, sc = load_table.load_and_get_table_df(keys_space_name, table_name,
                                                  time_frame, spark)
    prepared = urzedy_preprocessing(raw_df, agg, time_frames, time_update)
    prepared.sort("idgrupy", "timestamp").show(300)
    # Target = future queue length within each group's own timeline; rows
    # without a future value (series tail) are dropped.
    per_group = Window().partitionBy("idgrupy").orderBy("timestamp")
    labelled = prepared.withColumn(
        "target", lead("liczbaklwkolejce", 240).over(per_group)).na.drop()
    return labelled, sc
def load_powietrze(keys_space_name="json",
                   table_name="powietrze",
                   time_frame=None,
                   spark=None,
                   agg=None,
                   time_frames="5 minutes",
                   time_update="1 minute"):
    """Load the 'powietrze' (air quality) table, preprocess it and attach
    the prediction target (temperature 4 steps ahead per station name)."""
    raw_df, sc = load_table.load_and_get_table_df(
        keys_space_name, table_name, time_frame, spark)
    # Add detailed time descriptors and drop columns unused for prediction.
    prepared = powietrze_preprocessing(raw_df, agg, time_frames, time_update)
    prepared.sort("name", "timestamp").show(200)
    # Target = future temperature within each station's own timeline; rows
    # without a future value are dropped.
    per_station = Window().partitionBy("name").orderBy("timestamp")
    labelled = prepared.withColumn(
        "target", lead("target_temp", 4).over(per_station)).na.drop()
    return labelled, sc
def get_step_duration(step_df,
                      check_col,
                      key_cols,
                      offset=0,
                      duration_col="duration",
                      timestamp_col="timestamp",
                      unit="min"):
    """Compute, per key, each step's duration as the gap between this row's
    timestamp and the next row's timestamp (via get_up_time).

    :param step_df: input DataFrame; must contain a 'time' ordering column.
    :param check_col: column forwarded to get_up_time for validation.
    :param key_cols: partitioning key columns.
    :param offset: constant offset forwarded to get_up_time.
    :param duration_col: name of the output duration column.
    :param timestamp_col: event-time column the duration is computed from.
    :param unit: time unit forwarded to get_up_time (default minutes).
    :return: DataFrame with [time, timestamp_col, *key_cols, check_col, duration_col].
    """
    # rowsBetween(1, 1) is exactly the frame lead(..., 1) requires, so the
    # explicit frame is accepted by Spark's window resolution.
    window = Window.partitionBy(key_cols).orderBy("time").rowsBetween(1, 1)
    df = step_df.withColumn(
        duration_col,
        # get_up_time(current_ts, next_ts, check, offset, unit); next_ts is
        # null on the last row of each partition.
        get_up_time(func.col(timestamp_col),
                    func.lead(timestamp_col, 1, None).over(window),
                    func.col(check_col), func.lit(offset), func.lit(unit)))
    required_cols = []
    required_cols.extend(["time", timestamp_col])
    required_cols.extend(key_cols)
    required_cols.extend([check_col, duration_col])
    return df.select(required_cols)
def __generate_target_fill(self, df: DataFrame, partition_cols: List[str],
                           ts_col: str, target_col: str) -> DataFrame:
    """
    Create columns for previous and next value for a specific target column

    :param df: input DataFrame
    :param partition_cols: partition column names
    :param ts_col: timestamp column name
    :param target_col: target column name
    """
    return (df.withColumn(
        f"previous_{target_col}",
        # Last non-null value at or before this row (forward-fill source).
        last(df[target_col], ignorenulls=True).over(
            Window.partitionBy(
                *partition_cols).orderBy(ts_col).rowsBetween(
                    Window.unboundedPreceding, 0)),
    )  # Handle if subsequent value is null: scan in reverse timestamp order
       # so 'last non-null' becomes the first non-null value at/after this row.
            .withColumn(
                f"next_null_{target_col}",
                last(df[target_col], ignorenulls=True).over(
                    Window.partitionBy(*partition_cols).orderBy(
                        col(ts_col).desc()).rowsBetween(
                            Window.unboundedPreceding, 0)),
            ).withColumn(
                f"next_{target_col}",
                # Immediate next value (may itself be null).
                lead(df[target_col]).over(
                    Window.partitionBy(*partition_cols).orderBy(ts_col)),
            ))
def get_session_stats(data_in, name):
    """Per-user session-length statistics (mean/std/sum) derived from
    song-play events; output column names are suffixed with *name*."""
    # Limit to song plays only.
    plays = data_in.filter(data_in['page'] == 'NextSong')
    # A song's play time is the gap until the next play in the same session.
    by_session = ssw.Window.partitionBy(
        plays['userId'], plays['sessionId']).orderBy('ts')
    plays = plays.withColumn('nextPlayStart',
                             ssf.lead(plays['ts']).over(by_session))
    plays = plays.withColumn('playTime',
                             plays['nextPlayStart'] - plays['ts'])
    # Total play time per session.
    session_totals = plays.groupby('userId', 'sessionId').agg(  #pylint: disable=no-member
        ssf.sum('playTime').alias('sessionLength'))
    # Session-length statistics per user.
    return session_totals.groupby('userId').agg(  #pylint: disable=no-member
        ssf.mean('sessionLength').alias(f'lengthMean{name}'),
        ssf.stddev('sessionLength').alias(f'lengthStd{name}'),
        ssf.sum('sessionLength').alias(f'lengthSum{name}'))
def load_velib(keys_space_name="json",
               table_name="velib",
               time_frame=None,
               spark=None,
               agg=None,
               time_frames="5 minutes",
               time_update="1 minute"):
    """Load the 'velib' (bike share) table, preprocess it and attach the
    prediction target (bikes available 240 steps ahead per station)."""
    raw_df, sc = load_table.load_and_get_table_df(keys_space_name, table_name,
                                                  time_frame, spark)
    # Add detailed time descriptors and drop columns unused for prediction.
    prepared = velib_preprocessing(raw_df, agg, time_frames, time_update)
    prepared.sort("station_id", "timestamp").show(300)
    # Target = future bike availability within each station's own timeline;
    # rows without a future value (series tail) are dropped.
    per_station = Window().partitionBy("station_id").orderBy("timestamp")
    labelled = prepared.withColumn(
        "target", lead("num_bikes_available", 240).over(per_station)).na.drop()
    return labelled, sc
def add_distance_column(dfs, order_column='timestamp'):
    """Add 'distance_prev': the haversine distance (units of EARTH_RADIUS)
    from each user's previous GPS point, ordered by *order_column*.
    The first point of each user gets distance_prev = 0.
    """
    # Radians lat/lon
    dfs = dfs.withColumn('latitude2', F.radians('latitude')).withColumn(
        'longitude2', F.radians('longitude'))
    # Groups GPS locations into chucks. A chunk is formed by groups of points
    # that are distant no more than roam_dist
    w = Window.partitionBy(['userID']).orderBy(order_column)
    dfs = dfs.withColumn('next_lat', F.lead('latitude2', 1).over(w))
    dfs = dfs.withColumn('next_lon', F.lead('longitude2', 1).over(w))
    # Haversine distance:
    #   d = 2R * asin(sqrt(sin^2(dphi/2) + cos(phi1)*cos(phi2)*sin^2(dlambda/2)))
    dfs = dfs.withColumn('distance_next', EARTH_RADIUS * 2 * F.asin(F.sqrt(
        F.pow(F.sin((col('next_lat') - col('latitude2')) / 2.0), 2) +
        F.cos('latitude2') * F.cos('next_lat') * F.pow(
            F.sin((col('next_lon') - col('longitude2')) / 2.0), 2))))
    # distance_prev is distance_next shifted one row forward (default 0 for
    # the first row); all temporary columns are dropped.
    dfs = dfs.withColumn('distance_prev',
                         F.lag('distance_next', default=0).over(w)).drop(
        'latitude2').drop('longitude2').drop('next_lon').drop('next_lat').drop('distance_next')
    return dfs
def _add_time_diff(self, df):
    """Deduplicate on the grouping key + time column and add DIFF_COL: the
    absolute gap in seconds to the next timestamp within the merge window."""
    deduped = df.dropDuplicates(self.group_by_columns + [self.time_column])
    # Epoch seconds of the following row (null on the last row) ...
    next_ts = F.unix_timestamp(
        F.lead(self.time_column).over(self.merge_window))
    # ... minus epoch seconds of the current row.
    current_ts = F.unix_timestamp(F.col(self.time_column))
    return deduped.withColumn(DIFF_COL, F.abs(next_ts - current_ts))
def next_same(df, field, partition_field='resume_id', sort_field='-work_index'):
    """Return rows whose next row (within the partition, in sort order) has
    the same value of *field*."""
    window_spec = Window.partitionBy(partition_field).orderBy(sort(sort_field))
    lead_col = f"next_{field}"
    with_next = df.withColumn(lead_col, F.lead(df[field], 1).over(window_spec))
    # Keep rows matching their successor; nulls can never satisfy the
    # equality, but the explicit null filter mirrors the original intent.
    same_as_next = with_next.filter(
        with_next[field] == with_next[lead_col]).filter(
            with_next[lead_col].isNotNull())
    return same_as_next.drop(lead_col)
def get_change_status(df):
    """Calculates the change status and change date for each record
    e.g (New, Change, No Change, Deleted)

    :param df: snapshots with 'employee_number', 'snapshot_date', 'row_hash'.
    :return df: input plus 'change_status' and 'changed_status_date'.
    """
    # Per-employee timeline, oldest first. NOTE: with an orderBy and the
    # default frame, min()/max() are RUNNING aggregates — running min over
    # ascending dates equals the partition min everywhere, and the running
    # max equals the partition max exactly on the last row, which is the
    # only row where it is consulted (the Deleted check).
    employeeWindowSpec = Window.partitionBy("employee_number").orderBy(
        F.col("snapshot_date"))
    # Single global window used to find the latest snapshot overall.
    constWindowSpec = Window.partitionBy(F.lit(1)).orderBy(
        F.col("snapshot_date").desc())
    helper_columns = [
        "min_snapshot", "row_hash", "max_snapshot", "prev_row_hash",
        "next_row_hash", "global_max_shapshot"
    ]
    add_change_status_df = df \
        .withColumn("min_snapshot", F.min("snapshot_date").over(employeeWindowSpec)) \
        .withColumn("max_snapshot", F.max("snapshot_date").over(employeeWindowSpec)) \
        .withColumn("prev_row_hash", F.lag("row_hash").over(employeeWindowSpec)) \
        .withColumn("next_row_hash", F.lead("row_hash").over(employeeWindowSpec)) \
        .withColumn("global_max_shapshot", F.max("snapshot_date").over(constWindowSpec)) \
        .withColumn("change_status",
                    # First snapshot for the employee -> New.
                    F.when(F.col("min_snapshot") == F.col("snapshot_date"), "New")
                    # Employee's last snapshot but not the global latest -> Deleted.
                    .when(F.col("next_row_hash").isNull() & (F.col("max_snapshot") != F.col("global_max_shapshot")), "Deleted")
                    .when(F.col("prev_row_hash") != F.col("row_hash"), "Changed")
                    .when(F.col("prev_row_hash") == F.col("row_hash"), "No Change")
                    .otherwise("Unknown")
                    )
    # Gaps-and-islands: row_number over all snapshots minus row_number over
    # snapshots with the same hash labels each run of identical hashes.
    rowNum1Window = Window.partitionBy(["employee_number"]).orderBy(
        F.col("snapshot_date").desc())
    rowNum2Window = Window.partitionBy(["employee_number", "row_hash"]).orderBy(
        F.col("snapshot_date").desc())
    # NOTE(review): gapGrpWindow partitions by gap_grp ALONE — gap_grp values
    # repeat across employees, so min(snapshot_date) may mix employees;
    # verify whether employee_number should be part of this partition.
    gapGrpWindow = Window.partitionBy(["gap_grp"
                                       ]).orderBy(F.col("snapshot_date"))
    helper_columns.extend(["gap_row_num_1", "gap_row_num_2", "gap_grp"])
    add_change_date_df = add_change_status_df \
        .withColumn("gap_row_num_1", F.row_number().over(rowNum1Window)) \
        .withColumn("gap_row_num_2", F.row_number().over(rowNum2Window)) \
        .withColumn("gap_grp", F.col("gap_row_num_1") - F.col("gap_row_num_2")) \
        .withColumn("changed_status_date", F.min("snapshot_date").over(gapGrpWindow)) \
        .withColumn("changed_status_date",
                    # A deletion's change date is the deletion snapshot itself.
                    F.when(F.col("change_status") == "Deleted", F.col("snapshot_date"))
                    .otherwise(F.col("changed_status_date"))) \
        .drop(*helper_columns)
    return add_change_date_df
def mr(self, dataframe):
    """Moving-range chart statistics: the ranges, their mean, and the
    lower/upper control limits (D3/D4 factors for n=2)."""
    ordered = dataframe.withColumn("num", f.monotonically_increasing_id()) \
        .withColumn("next",
                    f.lead(dataframe[0], 1, 0).over(window.Window.orderBy("num"))) \
        .drop("num")
    # |x_i - x_{i+1}|; the last element pairs with the default 0, so drop it.
    R = [float(v) for v in
         ordered.rdd.map(lambda row: abs(row[0] - row[1])).collect()][:-1]
    Rbar = np.mean(R)
    lclr = D3[2] * Rbar
    uclr = D4[2] * Rbar
    return R, Rbar, lclr, uclr
def do_it(spark):
    """Exercise window aggregations; a STRUCT-typed 'c' column skips the
    RANGE-style MIN()/MAX() and typed LEAD()/LAG() defaults."""
    shared_cols = {
        'inc_count_1': f.count('*').over(inclusive_window_spec),
        'inc_count_c': f.count('c').over(inclusive_window_spec),
        'lead_5_c': f.lead('c', 5).over(base_window_spec),
        'lag_1_c': f.lag('c', 1).over(base_window_spec),
        'row_num': f.row_number().over(base_window_spec),
    }
    df = gen_df(spark, data_gen, length=2048)
    for name, expr in shared_cols.items():
        df = df.withColumn(name, expr)
    if isinstance(c_gen, StructGen):
        # The MIN()/MAX() aggregations amount to a RANGE query, which is not
        # currently supported on STRUCT columns. LEAD()/LAG() defaults also
        # cannot be specified for STRUCTs: `[ 10, 3.14159, "foobar" ]` isn't
        # recognized as a valid STRUCT scalar.
        return df.withColumn('lead_def_c', f.lead('c', 2, None).over(base_window_spec)) \
            .withColumn('lag_def_c', f.lag('c', 4, None).over(base_window_spec))
    default_val = gen_scalar_value(c_gen, force_no_nulls=False)
    return df.withColumn('inc_max_c', f.max('c').over(inclusive_window_spec)) \
        .withColumn('inc_min_c', f.min('c').over(inclusive_window_spec)) \
        .withColumn('lead_def_c', f.lead('c', 2, default_val).over(base_window_spec)) \
        .withColumn('lag_def_c', f.lag('c', 4, default_val).over(base_window_spec))
def create_vars(df, cells):
    """Parse timestamps, join cell/region metadata and derive the lag/lead
    and calendar columns used by downstream indicators.

    Relies on module-level 'user_window' (presumably per-msisdn ordering —
    confirm) and 'missing_value_code'.
    """
    # Loading variables
    df = df.withColumn("call_datetime",
                       to_timestamp("call_datetime", "dd/MM/yyyy HH:mm:ss"))
    #get call_date from call_datetime
    df = df.withColumn('call_date', df.call_datetime.cast('date'))
    # Recreate analysis variables
    df = df.join(cells, df.location_id == cells.cell_id, how = 'left').drop('cell_id')\
        .orderBy('msisdn', 'call_datetime')\
        .withColumn('region_lag', F.lag('region').over(user_window))\
        .withColumn('region_lead', F.lead('region').over(user_window))\
        .withColumn('call_datetime_lag', F.lag('call_datetime').over(user_window))\
        .withColumn('call_datetime_lead', F.lead('call_datetime').over(user_window))\
        .withColumn('hour_of_day', F.hour('call_datetime').cast('byte'))\
        .withColumn('hour', F.date_trunc('hour', F.col('call_datetime')))\
        .withColumn('week', F.date_trunc('week', F.col('call_datetime')))\
        .withColumn('month', F.date_trunc('month', F.col('call_datetime')))\
        .withColumn('constant', F.lit(1).cast('byte'))\
        .withColumn('day', F.date_trunc('day', F.col('call_datetime')))\
        .na.fill({'region': missing_value_code,
                  'region_lag': missing_value_code,
                  'region_lead': missing_value_code})
    return df
def i(self, dataframe):
    """Individuals chart statistics: the observations, their mean, and the
    3-sigma control limits estimated from the mean moving range (d2, n=2)."""
    paired = dataframe.withColumn("num", f.monotonically_increasing_id()) \
        .withColumn("next",
                    f.lead(dataframe[0], 1, 0).over(window.Window.orderBy("num"))) \
        .drop("num")
    # |x_i - x_{i+1}|; the last element pairs with the default 0, so drop it.
    R = [float(v) for v in
         paired.rdd.map(lambda row: abs(row[0] - row[1])).collect()][:-1]
    Rbar = np.mean(R)
    X = dataframe.rdd.map(lambda row: float(row[0])).collect()
    Xbar = np.mean(X)
    sigma_span = 3 * (Rbar / d2[2])
    return X, Xbar, Xbar - sigma_span, Xbar + sigma_span
def preVCF(hdfs, spark):
    """Read a VCF file into a DataFrame, tag its columns with the sample
    name, and expand multi-base REF records (indels) into one row per base.

    :param hdfs: path of the VCF file.
    :param spark: active SparkSession.
    :return: union of the unexpanded rows and the expanded indel rows.
    """
    vcf = spark.sparkContext.textFile(hdfs)
    # The header line starting with #CHROM carries the column names.
    col_name = vcf.filter(lambda x: x.startswith("#CHROM")).first().split("\t")
    # Data rows: anything not starting with '##'; GT is stripped from FORMAT
    # and reference-block INFO ("END=...") is nulled out.
    vcf_data = vcf.filter(lambda x : re.match("[^#][^#]", x))\
        .map(lambda x : x.split("\t"))\
        .toDF(col_name)\
        .withColumn("POS", F.col("POS").cast(IntegerType()))\
        .withColumn("FORMAT", F.array_remove(F.split(F.col("FORMAT"), ":"), "GT"))\
        .withColumn("INFO", when(F.col("INFO").startswith("END="), None).otherwise(F.col("INFO")))
    # The last column is the (single) sample column.
    sample_name = vcf_data.columns[-1]
    vcf_data = vcf_data.drop("QUAL", "FILTER", sample_name)
    # Wrap per-sample columns in arrays and suffix them with the sample name
    # so several samples can later be merged side by side.
    for index in range(len(vcf_data.columns)):
        compared_arr = ["#CHROM", "POS", "REF"]
        if vcf_data.columns[index] not in compared_arr:
            vcf_data = vcf_data.withColumn(vcf_data.columns[index],
                                           F.array(vcf_data.columns[index]))
            vcf_data = vcf_data.withColumnRenamed(
                vcf_data.columns[index],
                vcf_data.columns[index] + "_" + sample_name)
    vcf_data = vcf_data.withColumnRenamed("REF", "REF_" + sample_name)
    vcf_data = vcf_data.withColumn("count", F.length(vcf_data.columns[3]))
    # window & split col parameter
    sample_ref = vcf_data.columns[3]
    indel_window = Window.partitionBy("#CHROM").orderBy("POS")
    split_col = F.split("REF_temp", '_')
    ###
    # For multi-base REFs, 'count' becomes the number of extra positions
    # covered before the next record on the chromosome starts.
    vcf_data = vcf_data.withColumn(
        "count",
        when(F.col("count") >= 2,
             F.lead("POS", 1).over(indel_window) - F.col("POS") - F.lit(1))
        .otherwise(F.lit(0)))
    # NOTE(review): not_indel keeps ALL rows (indel anchors included); the
    # expanded per-base rows are appended on top — confirm this is intended.
    not_indel = vcf_data.drop("count").withColumn(sample_ref, F.array(F.col(sample_ref)))
    # Melt each indel into "BASE_OFFSET" strings (ref_melt), explode them,
    # then split back into the base and its absolute position; ID becomes
    # "." and ALT becomes "*, <NON_REF>" for the generated rows.
    indel = vcf_data.filter(F.col("count") >= 1)\
        .withColumn(sample_ref, ref_melt(F.col(sample_ref), F.col("count"))).drop("count")\
        .withColumn(sample_ref, explode(F.col(sample_ref))).withColumnRenamed(sample_ref, "REF_temp")\
        .withColumn(sample_ref, F.array(split_col.getItem(0))).withColumn('POS_var', split_col.getItem(1))\
        .drop(F.col("REF_temp")).withColumn("POS", (F.col("POS") + F.col("POS_var")).cast(IntegerType()))\
        .drop(F.col("POS_var"))\
        .withColumn(vcf_data.columns[2], F.array(F.lit(".")))\
        .withColumn(vcf_data.columns[4], F.array(F.lit("*, <NON_REF>")))
    return not_indel.unionByName(indel)
def __generate_time_series_fill(self, df: DataFrame,
                                partition_cols: List[str],
                                ts_col: str) -> DataFrame:
    """
    Create additional timeseries columns for previous and next timestamps

    :param df: input DataFrame
    :param partition_cols: partition column names
    :param ts_col: timestamp column name
    """
    # NOTE(review): "previous_timestamp" is the row's OWN timestamp — no
    # lag() is applied. Presumably each row acts as the 'previous' anchor
    # for interpolation; confirm against the interpolation logic.
    return df.withColumn(
        "previous_timestamp",
        col(ts_col),
    ).withColumn(
        "next_timestamp",
        # Timestamp of the following row within the same partition.
        lead(df[ts_col]).over(
            Window.partitionBy(*partition_cols).orderBy(ts_col)),
    )
def get_dmi_realtime_value(spark_df, cal_trend_ratio):
    """Compute real-time DMI values.

    Translated from stored procedure cd_11_sec.dbo.compute_mkt_adx_v2,
    deliberately kept unmodified.

    :param spark_df: quote data containing open/high/low/close (Spark DataFrame)
    :param cal_trend_ratio: fraction of trailing data kept for signal calculation
    :return: DataFrame with sec_cd, var_cl, mkt_cl, pub_dt, pdi, mdi
    """
    dmi_period = 30  # period is 30, not the traditional 14
    # For the last row x/y/z are null, so the UDF yields NaN there.
    udf_max = func.udf(
        lambda x, y, z: float(
            max(max(abs(x), abs(y)), abs(z))
            if x is not None and y is not None and z is not None else np.nan),
        FloatType())
    udf_get_ema = func.udf(lambda x: float(get_ema(x, dmi_period)), FloatType())
    # di_plus / di_minus / tr differ from the classic formulas: they use the
    # NEXT day minus the current day (reason unknown; mirrors
    # compute_mkt_adx_v2).
    w = Window.partitionBy('sec_cd').orderBy('pub_dt')
    spark_df = spark_df.withColumn('di_plus', func.lead('high', 1).over(w) - func.col('high'))\
        .withColumn('di_minus', func.col('low') - func.lead('low', 1).over(w))\
        .withColumn('tr', udf_max(
            (func.lead('high', 1).over(w) - func.lead('low', 1).over(w)),
            (func.lead('high', 1).over(w) - func.col('close')),
            (func.col('close') - func.lead('low', 1).over(w))))
    # Replace NaN with null.
    spark_df = spark_df.replace(float('nan'), None)
    # Correct di_plus / di_minus; sm_di_plus / sm_di_minus / sm_tr are the
    # EMA-smoothed versions. pdi / mdi take the previous day's value
    # (reason unknown; mirrors the stored procedure).
    spark_df = spark_df\
        .withColumn('di_plus', func.when(
            (func.col('di_plus') > func.col('di_minus')) & (func.col('di_plus') > 0),
            func.col('di_plus')).otherwise(0))\
        .withColumn('di_minus', func.when(
            (func.col('di_minus') > func.col('di_plus')) & (func.col('di_minus') > 0),
            func.col('di_minus')).otherwise(0))\
        .withColumn('sm_di_plus', udf_get_ema(func.collect_list('di_plus').over(w)))\
        .withColumn('sm_di_minus', udf_get_ema(func.collect_list('di_minus').over(w)))\
        .withColumn('sm_tr', udf_get_ema(func.collect_list('tr').over(w)))\
        .withColumn('pdi', func.col('sm_di_plus')/func.col('sm_tr')*100)\
        .withColumn('mdi', func.col('sm_di_minus')/func.col('sm_tr')*100)\
        .withColumn('pdi', func.lag('pdi', 1).over(w))\
        .withColumn('mdi', func.lag('mdi', 1).over(w))
    # Keep only the trailing (1 - cal_trend_ratio) share of each security's
    # rows (e.g. the last 40%).
    w = Window.partitionBy('sec_cd')
    spark_df = spark_df.withColumn('sec_length', func.count('pub_dt').over(w))
    w = Window.partitionBy('sec_cd').orderBy('pub_dt')
    # A running count over the ordered window doubles as a row number.
    spark_df = spark_df.withColumn('row_no', func.count('pub_dt').over(w))
    spark_df = spark_df.filter(
        spark_df.row_no >= spark_df.sec_length * cal_trend_ratio)
    spark_df = spark_df.select('sec_cd', 'var_cl', 'mkt_cl', 'pub_dt', 'pdi',
                               'mdi')
    return spark_df
def transform(self, df):
    """Clean up visit events and add each page view's next page URL.

    Assumptions after initial data analysis:
    1) rows with a null visitorId are dropped, since the journey is
       per-visitor;
    2) adjacent duplicate page URLs are collapsed before assigning the
       next page url: pages (a -> a -> b -> a) => (a -> b -> a).
    """
    # Drop null visitors; repartition so per-visitor sorts stay local.
    cleaned = df.na.drop(subset=['visitorId']).repartition("visitorId")
    # Cast timestamp to long before ordering on it.
    typed = cleaned.select("id",
                           cleaned['timestamp'].cast('Long').alias('timestamp'),
                           "type", "visitorId", "pageURL")
    window_spec = Window.partitionBy("visitorId").orderBy("timestamp")
    # Remove rows repeating the immediately preceding URL.
    deduped = typed.withColumn(
        "prevUrl", F.lag("pageURL").over(window_spec)).filter(
            "pageURL != prevUrl or prevUrl is null")
    enriched = deduped.withColumn("nextPageUrl",
                                  F.lead("pageURL").over(window_spec))
    return enriched.select("id", "timestamp", "type", "visitorId", "pageUrl",
                           "nextPageUrl")
def add_columns_lead(df, list_of_tuples, window):
    """Add, for every (source, target) pair, a column holding the next value
    of the source column over *window*.

    :param df: Spark DataFrame
    :param list_of_tuples: Ex:
        [
            ("old_column_1", "new_column_1"),
            ("old_column_2", "new_column_2"),
            ("old_column_3", "new_column_3")
        ]
    :param window: Spark Window to iterate over
    :return: Spark DataFrame with new columns
    """
    result = df
    for source_col, target_col in list_of_tuples:
        result = result.withColumn(target_col, lead(source_col).over(window))
    return result
def add_prev_or_next(df, old_field, new_field, partition_field='resume_id', sort_field='-work_index'):
    """Attach the previous or the next value of *old_field* to every row.

    The direction is inferred from *new_field*: a name containing 'prev'
    takes the previous row's value, one containing 'next' takes the
    following row's value. Rows whose new value is null or equal to the
    old value are dropped.

    :param df: input DataFrame.
    :param old_field: column to shift.
    :param new_field: name of the new column; must contain 'prev' or 'next'.
    :param partition_field: partitioning column for the window.
    :param sort_field: ordering spec passed to the module's sort() helper.
    :return: filtered DataFrame including *new_field*.
    :raises ValueError: if *new_field* names neither direction.
    """
    window_spec = Window.partitionBy(partition_field).orderBy(sort(sort_field))
    if 'prev' in new_field:
        # Take the previous row's value.
        df = df.withColumn(new_field, F.lag(df[old_field], 1).over(window_spec))
    elif 'next' in new_field:
        # Take the following row's value.
        df = df.withColumn(new_field, F.lead(df[old_field], 1).over(window_spec))
    else:
        # Previously this silently fell through and the filters below crashed
        # on a missing column; fail fast with a clear message instead.
        raise ValueError(
            "new_field must contain 'prev' or 'next': %r" % (new_field,))
    # Drop unchanged values and (explicitly) nulls.
    df = df.filter(df[old_field] != df[new_field]).filter(
        df[new_field].isNotNull())
    # df = df.filter(df[new_field].isNotNull())
    return df
def get_dmi_realtime_signal(spark_df, cal_trend_ratio):
    """Compute real-time DMI pdi/mdi columns.

    NOTE(review): this is a near-duplicate of get_dmi_realtime_value in this
    file; consider consolidating.

    :param spark_df: quote data containing open/high/low/close (Spark DataFrame)
    :param cal_trend_ratio: fraction of trailing data kept for signal calculation
    :return: DataFrame with sec_cd, var_cl, mkt_cl, pub_dt, pdi, mdi
    """
    dmi_period = 30  # period is 30, not the traditional 14
    # For the last row x/y/z are null, so the UDF yields NaN there.
    udf_max = func.udf(
        lambda x, y, z: float(
            max(max(abs(x), abs(y)), abs(z))
            if x is not None and y is not None and z is not None else np.nan),
        FloatType())
    udf_get_ema = func.udf(lambda x: float(get_ema(x, dmi_period)), FloatType())
    # di_plus / di_minus / tr translated from the stored procedure; they
    # differ from the classic formulas (see compute_mkt_adx_v2).
    w = Window.partitionBy('sec_cd').orderBy('pub_dt')
    spark_df = spark_df.withColumn('di_plus', func.lead('high', 1).over(w) - func.col('high'))\
        .withColumn('di_minus', func.col('low') - func.lead('low', 1).over(w))\
        .withColumn('tr', udf_max(
            (func.lead('high', 1).over(w) - func.lead('low', 1).over(w)),
            (func.lead('high', 1).over(w) - func.col('close')),
            (func.col('close') - func.lead('low', 1).over(w))))
    # Replace NaN with null.
    spark_df = spark_df.replace(float('nan'), None)
    # Correct di_plus / di_minus; sm_di_plus / sm_di_minus / sm_tr are the
    # EMA-smoothed versions. pdi / mdi take the previous day's value
    # (reason unknown; mirrors the stored procedure).
    spark_df = spark_df\
        .withColumn('di_plus', func.when(
            (func.col('di_plus') > func.col('di_minus')) & (func.col('di_plus') > 0),
            func.col('di_plus')).otherwise(0))\
        .withColumn('di_minus', func.when(
            (func.col('di_minus') > func.col('di_plus')) & (func.col('di_minus') > 0),
            func.col('di_minus')).otherwise(0))\
        .withColumn('sm_di_plus', udf_get_ema(func.collect_list('di_plus').over(w)))\
        .withColumn('sm_di_minus', udf_get_ema(func.collect_list('di_minus').over(w)))\
        .withColumn('sm_tr', udf_get_ema(func.collect_list('tr').over(w)))\
        .withColumn('pdi', func.col('sm_di_plus')/func.col('sm_tr')*100)\
        .withColumn('mdi', func.col('sm_di_minus')/func.col('sm_tr')*100)\
        .withColumn('pdi', func.lag('pdi', 1).over(w))\
        .withColumn('mdi', func.lag('mdi', 1).over(w))
    # Keep only the trailing share of each security's rows (e.g. last 40%).
    w = Window.partitionBy('sec_cd')
    spark_df = spark_df.withColumn('sec_length', func.count('pub_dt').over(w))
    w = Window.partitionBy('sec_cd').orderBy('pub_dt')
    # A running count over the ordered window doubles as a row number.
    spark_df = spark_df.withColumn('row_no', func.count('pub_dt').over(w))
    spark_df = spark_df.filter(
        spark_df.row_no >= spark_df.sec_length * cal_trend_ratio)
    spark_df = spark_df.select('sec_cd', 'var_cl', 'mkt_cl', 'pub_dt', 'pdi',
                               'mdi')
    return spark_df
def json_without_trailing_comma(json_as_text: DataFrame) -> DataFrame:
    """Remove trailing commas from a dataframe representing a json file read
    as text.

    Args:
        json_as_text (DataFrame): the input json as dataframe of strings

    Returns:
        :DataFrame: a dataframe of strings representing a valid json without
        trailing commas
    """
    # NOTE: an unpartitioned window funnels all rows through one task; the
    # file is processed line-by-line, so this matches the original design.
    index_window = Window.partitionBy().orderBy("id")
    # Index every line so each one can see its successor.
    indexed = json_as_text.rdd.zipWithIndex().toDF(["line", "id"])
    with_next = indexed.withColumn(
        "next_line", functions.lead("line").over(index_window))
    # remove_trailing_comma inspects (line, next_line) pairs.
    return (with_next.rdd
            .map(remove_trailing_comma)
            .toDF()
            .select("line.value"))
def prepare_interpolated_updates_dataframe(spark: SparkSession, silver_df: DataFrame) -> DataFrame:
    """Build update rows for negative heartrates: each bad reading becomes
    the average of its neighbours, or the single existing neighbour at the
    edges of a device's timeline."""
    by_date = Window.partitionBy("device_id").orderBy("p_eventdate")
    neighboured = silver_df.select(
        "*",
        lag(col("heartrate")).over(by_date).alias("prev_amt"),
        lead(col("heartrate")).over(by_date).alias("next_amt"),
    )
    # Only negative readings need repair.
    bad_rows = neighboured.where(col("heartrate") < 0)
    interpolated = when(col("prev_amt").isNull(), col("next_amt")).otherwise(
        when(col("next_amt").isNull(), col("prev_amt")).otherwise(
            (col("prev_amt") + col("next_amt")) / 2))
    repaired = bad_rows.withColumn("heartrate", interpolated)
    return repaired.select(
        "device_id",
        "heartrate",
        "eventtime",
        "name",
        "p_eventdate",
    )
def generate_dataset_x_y(df_assembled, CONFIG_PREPROCESS):
    """Build the X input (INPUT_X_SIZE consecutive feature vectors per row)
    and the matching Y frame, skipping the head rows consumed by the memory
    (M) dataset.

    :param df_assembled: DataFrame with 'dt', 'CELL_NUM', 'features'.
    :param CONFIG_PREPROCESS: config exposing DAYS_TO_MEMORY, ITEMS_PER_DAY,
        INPUT_X_SIZE and INPUT_Y_SIZE.
    :return: (input_x, input_y) DataFrames.
    """
    x_window_spec = Window.partitionBy('CELL_NUM').orderBy('dt')
    # Head rows reserved for the memory dataset.
    x_head_skip_size = CONFIG_PREPROCESS.DAYS_TO_MEMORY * CONFIG_PREPROCESS.ITEMS_PER_DAY
    shuffled_with_cell = df_assembled.withColumn(
        'seq', row_number().over(x_window_spec)).cache()
    input_x = shuffled_with_cell.filter('seq > ' + str(x_head_skip_size))
    x_feat_cols = ['features0']
    input_x = input_x.withColumnRenamed('features', x_feat_cols[0])
    # Preserve VectorUDT metadata for the generated columns.
    vector_udt_metadata = input_x.schema[input_x.schema.fields.index(
        input_x.schema[x_feat_cols[0]])].metadata
    for i in range(1, CONFIG_PREPROCESS.INPUT_X_SIZE):
        # i-th future observation within the same cell.
        n_features = lead(col(x_feat_cols[0]), i).over(x_window_spec)
        col_name = 'features' + str(i)
        input_x = input_x.withColumn(
            'features' + str(i),
            n_features.alias(col_name, metadata=vector_udt_metadata))
        x_feat_cols.append('features{}'.format(i))
    # Drop tail rows whose lead() windows run past the end of the series.
    x_tail_skip_size = CONFIG_PREPROCESS.INPUT_X_SIZE + CONFIG_PREPROCESS.INPUT_Y_SIZE - 1
    # NOTE(review): 'max' here must be pyspark.sql.functions.max (shadowing
    # the builtin at import time) — confirm the module's imports.
    inbound = when(
        input_x['seq'] <= (max(input_x['seq']).over(
            x_window_spec.rangeBetween(-sys.maxsize, sys.maxsize)) -
                           x_tail_skip_size), 1).otherwise(0)
    input_x = input_x.withColumn('inbound', inbound).filter('inbound == 1')
    input_x = VectorAssembler().setInputCols(x_feat_cols).setOutputCol(
        'features').transform(input_x).select(['dt', 'CELL_NUM', 'features'])
    # rows to skip, for M & X
    y_head_skip_size = CONFIG_PREPROCESS.DAYS_TO_MEMORY * CONFIG_PREPROCESS.ITEMS_PER_DAY + CONFIG_PREPROCESS.INPUT_X_SIZE
    input_y = shuffled_with_cell.filter('seq > ' + str(y_head_skip_size)).drop('seq')
    return input_x, input_y
def main():
    """Label daily stock prices for ML training: compute the N-days-ahead
    price, volatility-scaled upper/lower bands, and a {-1, 0, 1} label, then
    either write the labelled prices or generate chart images."""
    parser = argparse.ArgumentParser()
    # Read Arguments
    parser.add_argument("--std", "-s", help="Number of STDEVs for labelling")
    parser.add_argument("--window", "-w", help="Window size")
    parser.add_argument("--ahead", "-a", help="Days ahead to check")
    parser.add_argument("--target", "-t", help="Target column to be used")
    parser.add_argument("--bucket", "-b", help="GS bucket name")
    parser.add_argument("--input", "-i", help="Path to input data within bucket")
    parser.add_argument("--images", "-k", help="Flag to indicate to generate images")
    parser.add_argument("--volume", "-v", help="Plot Volume?")
    parser.add_argument("--sma", "-m", help="Plot SMA?")
    parser.add_argument("--rsi", "-r", help="Plot RSI?")
    parser.add_argument("--figsize", "-fig", help="Figure Size")
    parser.add_argument("--offset", "-f", help="Offset")
    parser.add_argument("--dpi", "-d", help="DPI")
    parser.add_argument("--stocks", "-c", help="Stocks")
    # Init Context and Session
    spark = SparkSession.builder.appName('Preprocessing').getOrCreate()
    # read arguments from the command line
    args = parser.parse_args()
    # Set parameters for execution (broadcast so executors share them).
    stdevs = spark.sparkContext.broadcast(float(args.std) if args.std else 1.0)
    days_ahead = spark.sparkContext.broadcast(int(args.ahead) if args.ahead else 10)
    target = spark.sparkContext.broadcast(args.target if args.target else 'ADJ_CLOSE')
    figsize = spark.sparkContext.broadcast(int(args.figsize) if args.figsize else 1)
    volume = spark.sparkContext.broadcast(args.volume if args.volume else True)
    sma = spark.sparkContext.broadcast(args.sma if args.sma else True)
    rsi = spark.sparkContext.broadcast(args.rsi if args.rsi else False)
    window_size = spark.sparkContext.broadcast(int(args.window) if args.window else 21)
    window_offset = spark.sparkContext.broadcast(int(args.offset) if args.offset else 5)
    dpi = spark.sparkContext.broadcast(int(args.dpi) if args.dpi else 72)
    stocks = args.stocks if args.stocks else 'sp500.csv'
    # NOTE(review): when --images is passed, create_images is a STRING, so
    # the 'create_images == True' comparison below is always False and the
    # image branch never runs — likely a bug; confirm intended usage.
    create_images = args.images if args.images else False
    # Initialize logger
    log = Log4j(spark)
    # Log used parameters
    log.log("Std. Devs: " + str(stdevs.value))
    log.log("Window size: " + str(window_size.value))
    log.log("Window offset: " + str(window_offset.value))
    log.log("Days ahead: " + str(days_ahead.value))
    log.log("Column to be used: " + str(target.value))
    log.log("Plot volume: " + str(volume.value))
    log.log("Figure Size: " + str(figsize.value))
    log.log("DPI: " + str(dpi.value))
    log.log("Create Images: " + str(create_images))
    # Read source dataframe from GS Bucket
    # This file contains all the prices we are going to use
    inputpath = 'gs://' + args.bucket + '/' + args.input
    df = spark.read.csv(inputpath, header=True, inferSchema=True)
    log.log("Num Partitions: " + str(df.rdd.getNumPartitions()))
    ## LABEL CALCULATION
    # Calculate pct daily change
    df = df.withColumn('PREV_DAY', lag(df[target.value]).over(Window.partitionBy("SYMBOL").orderBy("DATE")))
    df = df.withColumn('CHANGE', 100 * (df[target.value] - df['PREV_DAY']) / df[target.value])
    # Get N number of days ahead
    # NOTE(review): the 'count=' keyword of lead() only exists in very old
    # PySpark releases (modern signature uses 'offset') — confirm version.
    df = df.withColumn('AHEAD', lead(df[target.value], count=days_ahead.value).over(Window.partitionBy("SYMBOL").orderBy("DATE")))
    # Calculate Annual HVOl over a trailing window of window_size rows.
    windowSpec = Window.partitionBy("SYMBOL").orderBy(col('DATE').cast('long')).rowsBetween(-window_size.value, 0)
    # stddev(c * sqrt(252)) == sqrt(252) * stddev(c): annualisation factor.
    df = df.withColumn('HVOL', stddev(col(target.value) * np.sqrt(252)).over(windowSpec))
    # Calculate Upper and Lower limits scaled to the days_ahead horizon.
    df = df.withColumn('UPPER_LIMIT', col(target.value) + col(target.value) * ((col('HVOL') * np.sqrt(days_ahead.value) / (np.sqrt(252))) * stdevs.value / 100))
    df = df.withColumn('LOWER_LIMIT', col(target.value) - col(target.value) * ((col('HVOL') * np.sqrt(days_ahead.value) / (np.sqrt(252))) * stdevs.value / 100))
    # Finally, calculate the label: 1 above band, 0 inside, -1 below.
    df = df.withColumn('LABEL', (when(col('AHEAD') > col('UPPER_LIMIT'), 1)
                                 .when(col('AHEAD') >= col('LOWER_LIMIT'), 0)
                                 .otherwise(-1)))
    # Build path to save file
    data_label = datetime.now().strftime("%Y%m%d_%H%M%S") + '_W' + str(window_size.value) + '_A' + str(days_ahead.value) + '_O' + str(window_offset.value) + '_S' + str(stdevs.value)
    outputpath = 'gs://' + args.bucket + '/data/' + data_label
    log.log("Writting file to GCS: " + outputpath)
    # Check if create images flag is active
    if create_images == True:
        # Generate image data for CNN
        log.log("Generating Images")
        images(spark, log, 'gs://' + args.bucket + '/data/' + stocks,
               outputpath, df, figsize, window_size, volume, window_offset,
               dpi, target, sma, rsi)
    else:
        # Save prices with label for LSTM
        log.log("Save prices with label")
        df.repartition(1).write.csv((outputpath + "/labels"), header=True, mode='overwrite')
def main():
    """Entry point: read PhEDEx CSV dump(s) into Spark, enrich them, and run
    the requested aggregation (including the special DELTA aggregation over
    date intervals), writing the result to CSV or showing it on stdout.

    NOTE: this module is Python 2 / Spark 1.x code (`print msg`, HiveContext,
    rowNumber(), registerTempTable).
    """
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()

    # setup spark/sql context to be used for communication with HDFS
    sc = SparkContext(appName="phedex_br")
    if not opts.yarn:
        # quiet local runs; on YARN the default log level is kept
        sc.setLogLevel("ERROR")
    sqlContext = HiveContext(sc)
    schema_def = schema()

    # read given file(s) into RDD: either an explicit file name, or a base
    # directory scanned for date-stamped subdirectories
    if opts.fname:
        pdf = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(opts.fname, schema = schema_def)
    elif opts.basedir:
        fromdate, todate = defDates(opts.fromdate, opts.todate)
        files = getFileList(opts.basedir, fromdate, todate)
        msg = "Between dates %s and %s found %d directories" % (fromdate, todate, len(files))
        print msg
        if not files:
            return
        # one DataFrame per directory, unioned into a single frame
        pdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(file_path, schema = schema_def) \
                        for file_path in files])
    else:
        raise ValueError("File or directory not specified. Specify fname or basedir parameters.")

    # parsing additional data (to given data adding: group name, node kind,
    # acquisition era, data tier, now date)
    groupdic, nodedic = getJoinDic()
    # acquisition era = token between first '/' and the first '-' of the
    # second path component of the dataset name; data tier = last component
    acquisition_era_reg = r"^/[^/]*/([^/^-]*)-[^/]*/[^/]*$"
    data_tier_reg = r"^/[^/]*/[^/^-]*-[^/]*/([^/]*)$"
    groupf = udf(lambda x: groupdic[x], StringType())
    nodef = udf(lambda x: nodedic[x], StringType())

    ndf = pdf.withColumn("br_user_group", groupf(pdf.br_user_group_id)) \
         .withColumn("node_kind", nodef(pdf.node_id)) \
         .withColumn("now", from_unixtime(pdf.now_sec, "YYYY-MM-dd")) \
         .withColumn("acquisition_era", when(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1) == "",\
                    lit("null")).otherwise(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1))) \
        .withColumn("data_tier", when(regexp_extract(pdf.dataset_name, data_tier_reg, 1) == "",\
                    lit("null")).otherwise(regexp_extract(pdf.dataset_name, data_tier_reg, 1)))

    # print dataframe schema
    if opts.verbose:
        ndf.show()
        print("pdf data type", type(ndf))
        ndf.printSchema()

    # process aggregation parameters (all option strings are comma-separated)
    keys = [key.lower().strip() for key in opts.keys.split(',')]
    results = [result.lower().strip() for result in opts.results.split(',')]
    aggregations = [agg.strip() for agg in opts.aggregations.split(',')]
    order = [orde.strip() for orde in opts.order.split(',')] if opts.order else []
    asc = [asce.strip() for asce in opts.asc.split(',')] if opts.order else []
    filtc, filtv = opts.filt.split(":") if opts.filt else (None,None)

    validateAggregationParams(keys, results, aggregations, order, filtc)

    if filtc and filtv:
        ndf = ndf.filter(getattr(ndf, filtc) == filtv)

    # if delta aggregation is used
    if DELTA in aggregations:
        validateDeltaParam(opts.interval, results)
        result = results[0]

        # NOTE(review): fromdate/todate are only bound in the `opts.basedir`
        # branch above; reaching this point via `opts.fname` would raise
        # NameError — confirm delta aggregation is only used with basedir.

        #1 for all dates generate interval group dictionary
        datedic = generateDateDict(fromdate, todate, opts.interval)
        boundic = generateBoundDict(datedic)
        max_interval = max(datedic.values())

        # per-row UDFs mapping a date to its interval index and boundaries
        interval_group = udf(lambda x: datedic[x], IntegerType())
        interval_start = udf(lambda x: boundic[x][0], StringType())
        interval_end = udf(lambda x: boundic[x][1], StringType())

        #2 group data by block, node, interval and last result in the interval
        ndf = ndf.select(ndf.block_name, ndf.node_name, ndf.now, getattr(ndf, result))
        idf = ndf.withColumn("interval_group", interval_group(ndf.now))
        # latest row per (block, node, interval): order desc and keep row 1
        win = Window.partitionBy(idf.block_name, idf.node_name, idf.interval_group).orderBy(idf.now.desc())
        idf = idf.withColumn("row_number", rowNumber().over(win))
        # keep the value only if the row falls exactly on the interval end,
        # otherwise zero it (interval_group 0 is discarded entirely)
        rdf = idf.where((idf.row_number == 1) & (idf.interval_group != 0))\
                 .withColumn(result, when(idf.now == interval_end(idf.interval_group), getattr(idf, result)).otherwise(lit(0)))
        rdf = rdf.select(rdf.block_name, rdf.node_name, rdf.interval_group, getattr(rdf, result))
        rdf.cache()

        #3 create intervals that not exist but has minus delta
        # (gap-filling: synthesize a zero-valued row right after each gap so
        # the later lag() sees the drop)
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        adf = rdf.withColumn("interval_group_aft", lead(rdf.interval_group, 1, 0).over(win))
        hdf = adf.filter(((adf.interval_group + 1) != adf.interval_group_aft) & (adf.interval_group != max_interval))\
                 .withColumn("interval_group", adf.interval_group + 1)\
                 .withColumn(result, lit(0))\
                 .drop(adf.interval_group_aft)

        #4 join data frames
        idf = rdf.unionAll(hdf)

        #3 join every interval with previous interval
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        fdf = idf.withColumn("delta", getattr(idf, result) - lag(getattr(idf, result), 1, 0).over(win))

        #5 calculate delta_plus and delta_minus columns and aggregate by date and node
        ddf =fdf.withColumn("delta_plus", when(fdf.delta > 0, fdf.delta).otherwise(0)) \
                .withColumn("delta_minus", when(fdf.delta < 0, fdf.delta).otherwise(0))

        aggres = ddf.groupBy(ddf.node_name, ddf.interval_group).agg(sum(ddf.delta_plus).alias("delta_plus"),\
                                                                    sum(ddf.delta_minus).alias("delta_minus"))

        aggres = aggres.select(aggres.node_name, interval_end(aggres.interval_group).alias("date"), aggres.delta_plus, aggres.delta_minus)

    else:
        # ordinary (non-delta) aggregation path
        resAgg_dic = zipResultAgg(results, aggregations)
        order, asc = formOrdAsc(order, asc, resAgg_dic)

        # perform aggregation
        if order:
            aggres = ndf.groupBy(keys).agg(resAgg_dic).orderBy(order, ascending=asc)
        else:
            aggres = ndf.groupBy(keys).agg(resAgg_dic)

    # output results: CSV file if --fout was given, otherwise show on stdout
    if opts.fout:
        fout_header = formFileHeader(opts.fout)
        if opts.header:
            aggres.write.format('com.databricks.spark.csv').options(header = 'true').save(fout_header)
        else:
            aggres.write.format('com.databricks.spark.csv').save(fout_header)
    else:
        aggres.show(50)
def processData(sc,hc,fs,con,incFileName,inThresh,outThresh,progTag):
    """Process one daily WiFi-probe increment file into flow/indoor outputs.

    Uses the increment file to calculate:
      - flow (one record per client per day), stored as file
      - indoor (one record per client per day), stored as file
      - indoor_for_delete (every stale indoor record), stored in HBase via Phoenix
      - indoor detail (every indoor record), stored in HBase and as file

    Parameters (types inferred from usage — confirm against callers):
      sc          -- SparkContext (used for textFile and JVM FileUtil/Path access)
      hc          -- HiveContext for SQL over temp tables
      fs          -- Hadoop FileSystem JVM object
      con         -- Hadoop Configuration JVM object (passed to FileUtil)
      incFileName -- HDFS path of the increment file, e.g.
                     hdfs://.../data/t_cf_inc/100033/t_cf_20161028.txt
      inThresh    -- minimum seconds of presence to count as "indoor" (e.g. 10)
      outThresh   -- gap in seconds that splits two visits (e.g. 300)
      progTag     -- tag to keep per-process tmp folders distinct

    Output locations:
      destIndoorFile      : /data/indoor/entityid/year/id_date.json  (reports)
      destFlowFile        : /data/flow/entityid/year/fl_date.json    (reports)
      rec_destIndoorfile  : /data/rec_indoor/entityid/year/id_date.json
                            (mirror of the HBase records)
    """
    destIndoorFile=get_str_indoorFileName(incFileName)
    #hdfs://namenode.navroomhdp.com:8020/data/indoor/100033/2016/id_20161028.txt
    rec_destIndoorfile=destIndoorFile.replace("/indoor/","/rec_indoor/")
    #hdfs://namenode.navroomhdp.com:8020/data/rec_indoor/101762/2016/id_20161011.txt
    destFlowFile =destIndoorFile.replace("/indoor/","/flow/").replace("id_","fl_")
    #hdfs://namenode.navroomhdp.com:8020/data/flow/101762/2016/fl_20161011.txt

    # per-process scratch folders (progTag keeps concurrent runs apart)
    tmp_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/indoor"+str(progTag)
    tmp_destFlowFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/flow"+str(progTag)
    tmp_rec_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/rec_indoor"+str(progTag)

    EntityID=int(get_str_entityID(incFileName))  #101762
    histFileName=get_str_histFileName(incFileName)
    #processed file will be placed here:
    #hdfs://namenode.navroomhdp.com:8020/data/t_cf/101762/t_cf_20161011.txt

    if fs.exists(sc._jvm.Path(histFileName)):
        # A history file exists: merge it with the increment so today's run
        # sees all records. The merged result replaces the increment file.
        tmpFileName=get_str_tmpFileName(histFileName)
        #tmpFileName = hdfs://namenode.navroomhdp.com:8020/data/tmp/101762/t_cf_20161011.txt
        tmpFolderName=tmpFileName.rsplit('/',1)[0]+"tmp"
        #tmpFolderName = hdfs://namenode.navroomhdp.com:8020/data/tmp/101762tmp
        # copy hist file to temp folder as .../hist and destroy the hist file
        sc._jvm.FileUtil.copy(fs,sc._jvm.Path(histFileName),fs,sc._jvm.Path(tmpFolderName+"/hist"),True,True,con)
        # copy inc file to temp folder as .../inc and destroy the inc file
        sc._jvm.FileUtil.copy(fs,sc._jvm.Path(incFileName),fs,sc._jvm.Path(tmpFolderName+"/inc"),True,True,con)
        # copymerge the 2 files (inc and hist) into one file
        sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmpFolderName),fs,sc._jvm.Path(tmpFileName),True,con,None)
        sc._jvm.FileUtil.copy(fs,sc._jvm.Path(tmpFileName),fs,sc._jvm.Path(incFileName),True,True,con)

    unixFirtDayofMonth = get_int_firstDayUnixDate(incFileName)
    # e.g. 1475251200 -> 20161001 as a unix date
    startUnixTime=get_int_fileNameUnixDate(incFileName)
    # unix timestamp of the file's day (start of day)

    # raw rows: clientmac,stime,flag — dedup into temp table t_cf
    rows_t_cf=sc.textFile(incFileName).map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], stime=p[1],flag=p[2]))
    HiveContext.createDataFrame(hc,rows_t_cf).registerTempTable("t_cf_inc_tmp")
    hc.sql("select distinct clientmac,stime,flag from t_cf_inc_tmp").registerTempTable("t_cf")

    # per client: each ping with its previous/next ping time (flag=1 only)
    df=hc.sql("select distinct ClientMac,stime ,lag(stime) over (partition by ClientMac order by stime) as lag_time ,lead(stime) over (partition by ClientMac order by stime) as lead_time from t_cf where flag=1")
    # gap to previous ping; nulls (first/last ping) become -1
    df1=df.withColumn("diff" , df["stime"]-df["lag_time"]).na.fill(-1)
    # keep visit boundaries: big gap before, or first/last ping of the client
    df1.filter((df1.diff>=outThresh)|(df1.lag_time ==-1)|( df1.lead_time==-1)).registerTempTable("df2")
    # clamp diff: keep only small positive gaps, else 0
    df2=hc.sql("select ClientMac,stime,lag_time,lead_time,case when (diff < "+ str(outThresh) +" and diff>0) then diff ELSE 0 end as diff from df2")
    df3=df2.withColumn("lag_time1",df2.lag_time+df2.diff).drop( "lag_time")
    # pair each visit start with the following boundary time (visit end)
    df3.withColumn("lag_time2",func.lead("lag_time1").over(Window.partitionBy("clientMac"))).registerTempTable("df3")
    df4=hc.sql("select ClientMac,cast(stime as int) as ETime ,cast(lag_time2 as int) as LTime,cast((lag_time2- stime) as int) as Seconds from df3").na.fill(-1)
    # keep visits within today, long enough to count as indoor
    df5=df4.filter((df4.LTime>0)&(df4.Seconds>=inThresh)&(df4.ETime>startUnixTime)&(df4.ETime<(startUnixTime+86400))).withColumn("ENTITYID",lit(EntityID))
    #86400 is seconds in one day
    df5.registerTempTable("df5")
    #DF5 will be saved to hbase as indoor details (rec_destIndoorfile);
    #df6 and df7 will be used for stats calculation
    df6=hc.sql("select ClientMac,ETime, LTime, Seconds ,unix_timestamp(date_sub(from_unixtime(etime),0),'yyyy-MM-dd') as utoday from df5")
    df6.registerTempTable("df6_indoor")
    # daily per-client rollup: first entry, last leave, total seconds
    df7=hc.sql("select ClientMac,min(etime) as etime,max(ltime) as ltime,sum(Seconds) as seconds,utoday from df6_indoor group by ClientMac,utoday")
    df_current_result=df7.withColumn("ENTITYID",lit(EntityID)).withColumn('UFIRSTDAY',lit(unixFirtDayofMonth))

    # flow = every client seen today (any flag), first and last sighting
    flow_sql= "select ClientMac,min(stime) as etime,max(stime) as ltime from t_cf where stime >"+str(startUnixTime) + " and stime <"+str(startUnixTime+86400)+" group by clientmac"
    hc.sql(flow_sql).registerTempTable("df_flow_tmp")
    df_flow=hc.sql("select ClientMac,etime,ltime,unix_timestamp(date_sub(from_unixtime(etime),0),'yyyy-MM-dd') as utoday from df_flow_tmp").withColumn("ENTITYID",lit(EntityID)).withColumn('UFIRSTDAY',lit(unixFirtDayofMonth))
    #df_flow.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_FLOW_TODAY") .option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save()
    #df_flow.saveAsTable("T_FLOW")

    if len(df5.head(1))==1:
        #df5 is not empty — better than df5.rdd.isEmpty
        tmp_rec_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/rec_indoor"+str(progTag)
        df5.select('clientmac','entityid','etime','ltime','seconds').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_rec_destIndoorFolder)
        #df5.write.mode('overwrite').json(tmp_rec_destIndoorFolder)
        df5.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_INDOOR") .option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save()
        if fs.exists(sc._jvm.Path(rec_destIndoorfile)):
            #the old indoor folder exists: generate df_delete_pk so Phoenix can delete invalid rows
            rows_rec_indoor=sc.textFile(rec_destIndoorfile).map(lambda r: r.split(",")).map(lambda p: Row(clientmac=str(p[0]), entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),seconds=int(p[4])))
            HiveContext.createDataFrame(hc,rows_rec_indoor).registerTempTable("df_old_indoor")
            # primary-key columns only; rows in the old mirror but not in
            # today's result are stale and must be deleted from HBase
            df_old_indoor_pk=hc.sql("select ClientMac,ENTITYID,ETime from df_old_indoor")
            df_current_result_pk=hc.sql("select ClientMac,ENTITYID,ETime from df5")
            df_delete_pk = df_old_indoor_pk.subtract(df_current_result_pk)
            if len(df_delete_pk.head(1))==1:
                df_delete_pk.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_INDOOR_FOR_DELETE").option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save()
    else:
        # sentinel: skip the final copyMerge for this output
        tmp_rec_destIndoorFolder="NONE"

    if len(df_flow.head(1))==1:
        tmp_destFlowFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/flow"+str(progTag)
        df_flow.select('clientmac','entityid','etime','ltime','utoday','ufirstday').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_destFlowFolder)
        #df_flow.write.mode('overwrite').json(tmp_destFlowFolder)
    else:
        tmp_destFlowFolder="NONE"

    if len(df_current_result.head(1))==1:
        tmp_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/indoor"+str(progTag)
        df_current_result.select('clientmac','entityid','etime','ltime','seconds','utoday','ufirstday').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_destIndoorFolder)
        #df_current_result.write.mode('overwrite').json(tmp_destIndoorFolder)
    else:
        tmp_destIndoorFolder="NONE"

    # archive the (merged) increment as the new history file
    sc._jvm.FileUtil.copy(fs,sc._jvm.Path(incFileName),fs,sc._jvm.Path(histFileName),True,True,con)

    # delete is a must if file already exists otherwise copymerge will fail
    if fs.exists(sc._jvm.Path(destIndoorFile)):
        fs.delete(sc._jvm.Path(destIndoorFile))
    if fs.exists(sc._jvm.Path(destFlowFile)):
        fs.delete(sc._jvm.Path(destFlowFile))
    if fs.exists(sc._jvm.Path(rec_destIndoorfile)):
        fs.delete(sc._jvm.Path(rec_destIndoorfile))

    # merge each non-empty tmp folder into its single destination file
    if tmp_destIndoorFolder!="NONE":
        sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_destIndoorFolder),fs,sc._jvm.Path(destIndoorFile),True,con,None)
        #destIndoorFile=get_str_indoorFileName(incFileName)
    if tmp_destFlowFolder!="NONE":
        sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_destFlowFolder),fs,sc._jvm.Path(destFlowFile),True,con,None)
    if tmp_rec_destIndoorFolder!="NONE":
        sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_rec_destIndoorFolder),fs,sc._jvm.Path(rec_destIndoorfile),True,con,None)