Code example #1
    def chain_pings(self):
        print(
            "\n_______________________________________________\nCHAINING PINGS\n\n"
        )

        w = Window().partitionBy('device_id',
                                 'study_dt').orderBy('utc_timestamp')
        init_cnt = self.df.count()

        self.df = self.df \
            .withColumn('chain_dist',
                        (self.df['accuracy'] + lead(self.df['accuracy'], 1).over(w) - 10) * (230 / 120) + 200) \
            .withColumn('chain',
                        when(distance(self.df['latitude'], self.df['longitude'],
                                      lead(self.df['latitude'], 1).over(w),
                                      lead(self.df['longitude'], 1).over(w), 'feet') <= col('chain_dist'), 1)
                        .when(distance(self.df['latitude'], self.df['longitude'],
                                       lag(self.df['latitude'], 1).over(w),
                                       lag(self.df['longitude'], 1).over(w), 'feet') <= lag(col('chain_dist'), 1).over(w), 1)
                        .otherwise(0))

        self.unchain_df = self.df.filter(self.df['chain'] == 0) \
                                    .drop('chain_dist','chain')

        self.df = self.df.filter(self.df['chain'] == 1) \
                                    .drop('chain_dist','chain')

        unchain_cnt = self.unchain_df.cache().count()
        chain_cnt = self.df.cache().count()



        tbl_data = [['Initial count', init_cnt, 0, 0, 'Count of pings prior to analyzing spatial relationships'], \
                    ['Chained count', chain_cnt, init_cnt - chain_cnt, ((init_cnt - chain_cnt) / float(init_cnt)) * 100, \
                      'Count of pings that have spatially proximate neighbors to consider for clustering']]

        # Display filter table
        print(tabulate(tbl_data, floatfmt=".2f", headers=['Phase', 'Ping Count', 'Removed Pings', \
                                                          'Percent Reduction', 'Description']))
Code example #2
 def do_it(spark):
     return gen_df(spark, data_gen, length=2048) \
             .withColumn('inc_count_1', f.count('*').over(inclusiveWindowSpec)) \
             .withColumn('inc_count_c', f.count('c').over(inclusiveWindowSpec)) \
             .withColumn('inc_max_c', f.max('c').over(inclusiveWindowSpec)) \
             .withColumn('inc_min_c', f.min('c').over(inclusiveWindowSpec)) \
             .withColumn('lead_5_c', f.lead('c', 5).over(baseWindowSpec)) \
             .withColumn('lead_def_c', f.lead('c', 2, defaultVal).over(baseWindowSpec)) \
             .withColumn('lag_1_c', f.lag('c', 1).over(baseWindowSpec)) \
             .withColumn('lag_def_c', f.lag('c', 4, defaultVal).over(baseWindowSpec)) \
             .withColumn('row_num', f.row_number().over(baseWindowSpec))
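
This test snippet assumes gen_df, data_gen, defaultVal and the two window specs are defined in the surrounding test harness. As a minimal, hedged sketch, window specs that would make it runnable might look like the following (the partition and ordering columns 'a' and 'b' are guesses):

from pyspark.sql import Window

baseWindowSpec = Window.partitionBy('a').orderBy('b')
inclusiveWindowSpec = baseWindowSpec.rowsBetween(Window.unboundedPreceding, Window.currentRow)  # running, "inclusive" frame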
Code example #3
    def read(self, iDF):
        plain_df_idx = iDF.rdd\
                    .zipWithIndex().toDF(["row","idx"])\
                    .orderBy(asc("idx"))\
                    .coalesce(10)

        Windowspec = Window.orderBy("idx")
        oDF = plain_df_idx\
                .withColumn("seq", F.lead("row",count=1).over(Windowspec))\
                .withColumn("seqID", F.lead("row",count=0).over(Windowspec))

        parsedDF = oDF.filter(F.col("idx") % 2 == 0).select(
            "seqID", "seq", "+", "quality")
        return parsedDF
Code example #4
def generate_dataset_m(df_assembled, CONFIG_PREPROCESS):
    m_window_spec = Window.partitionBy('CELL_NUM').orderBy('dt')
    m_feat_cols = ['day0_features0']
    m_days_cols = ['day0_features']

    shuffled_with_cell = df_assembled.withColumn(
        'seq',
        row_number().over(m_window_spec)).cache()
    input_m = shuffled_with_cell.withColumnRenamed('features', m_feat_cols[0])
    vector_udt_metadata = input_m.schema[input_m.schema.fields.index(
        input_m.schema[m_feat_cols[0]])].metadata
    # Generate 1 day data (5min * 10 data)
    for i in range(1, CONFIG_PREPROCESS.INPUT_M_SIZE):
        n_features = lead(col(m_feat_cols[0]), i).over(m_window_spec)
        col_name = 'day{}_features{}'.format(0, i)
        input_m = input_m.withColumn(
            col_name, n_features.alias(col_name, metadata=vector_udt_metadata))
        m_feat_cols.append('day{}_features{}'.format(0, i))

    input_m = input_m.dropna()

    input_m = VectorAssembler().setInputCols(m_feat_cols).setOutputCol(
        m_days_cols[0]).transform(input_m)
    vector_udt_metadata = input_m.schema[-1].metadata

    # for DAYS_TO_MEMORY(7) days memory in same time zone
    for i in range(1, CONFIG_PREPROCESS.DAYS_TO_MEMORY):
        n_features = lead(col('day0_features'),
                          int(CONFIG_PREPROCESS.ITEMS_PER_DAY *
                              i)).over(m_window_spec)
        col_name = 'day{}_features'.format(i)
        input_m = input_m.withColumn(
            col_name, n_features.alias(col_name, metadata=vector_udt_metadata))
        m_days_cols.append('day{}_features'.format(i))

    m_tail_skip_size = CONFIG_PREPROCESS.ITEMS_PER_DAY  # rows to skip, for 1 Day (X & Y)
    inbound = when(
        input_m['seq'] <= (max(input_m['seq']).over(
            m_window_spec.rangeBetween(-sys.maxsize, sys.maxsize)) -
                           m_tail_skip_size), 1).otherwise(0)
    input_m = input_m.dropna().withColumn('inbound',
                                          inbound).filter('inbound == 1')
    input_m = VectorAssembler().setInputCols(
        m_days_cols).setOutputCol('features').transform(input_m).select([
            'dt', 'CELL_NUM', 'features'
        ])  # assemble DAYS_TO_MEMORY days columns into one ('features')

    return input_m
Code example #5
def Sparkseeds(dict, i, k, hashDF, sc):
    word = [(i, HashTable.hash_djb2(dict[i][j:j + k]), j)
            for j in range(0,
                           len(dict[i]) - k)]
    rddW = sc.parallelize(word)
    schemaWordDF = rddW.map(
        lambda x: Row(NUM_SEQ=x[0], ID_SEQ=x[1], POS_SEQ=x[2]))
    df = sqlContext.createDataFrame(schemaWordDF)
    reDF = df.join(hashDF, df.ID_SEQ == hashDF.ID_GEN, how='inner')
    reDF = reDF.orderBy(reDF.POS_SEQ).select(reDF.NUM_SEQ, reDF.ID_SEQ,
                                             reDF.POS_SEQ, reDF.POS_GEN)
    my_window = Window.partitionBy(reDF.NUM_SEQ).orderBy(reDF.POS_SEQ)
    reDF = reDF.withColumn("prev_value", F.lag(reDF.POS_SEQ).over(my_window))
    reDF = reDF.withColumn(
        "dist",
        F.when(F.isnull(reDF.POS_SEQ - reDF.prev_value),
               0).otherwise(reDF.POS_SEQ - reDF.prev_value))
    reDF = reDF.select(reDF.NUM_SEQ, reDF.ID_SEQ, reDF.POS_SEQ, reDF.dist,
                       reDF.POS_GEN)
    reDF = reDF.withColumn("dist0", F.lead(reDF.dist).over(my_window))
    elDF = reDF.filter(((reDF.dist == 0) | (reDF.dist >= 50))
                       & ((reDF.dist0.isNull()) | (reDF.dist0 >= 50)))
    reDF = reDF.subtract(elDF)
    reDF = reDF.orderBy(reDF.POS_SEQ).select(reDF.NUM_SEQ, reDF.ID_SEQ,
                                             reDF.POS_SEQ, reDF.POS_GEN)

    #pos = function(reDF)

    return reDF
Code example #6
def load_urzedy(keys_space_name="json",
                table_name="urzedy",
                time_frame=None,
                spark=None,
                agg=None,
                time_frames="5 minutes",
                time_update="1 minute"):

    # Load the data

    urzedy_temp, sc = load_table.load_and_get_table_df(keys_space_name,
                                                       table_name, time_frame,
                                                       spark)

    urzedy = urzedy_preprocessing(urzedy_temp, agg, time_frames, time_update)

    urzedy.sort("idgrupy", "timestamp").show(300)

    # Create the target variable

    w = Window().partitionBy("idgrupy").orderBy("timestamp")
    dane = urzedy.withColumn("target",
                             lead("liczbaklwkolejce", 240).over(w)).na.drop()

    #dane.sort("idgrupy", "timestamp").show(300)
    #print(dane.dtypes)

    return dane, sc
Code example #7
def load_powietrze(keys_space_name="json",
                   table_name="powietrze",
                   time_frame=None,
                   spark=None,
                   agg=None,
                   time_frames="5 minutes",
                   time_update="1 minute"):

    # Load the data

    powietrze_temp, sc = load_table.load_and_get_table_df(
        keys_space_name, table_name, time_frame, spark)

    # Add variables that precisely describe the time and drop columns not used for prediction

    powietrze = powietrze_preprocessing(powietrze_temp, agg, time_frames,
                                        time_update)

    powietrze.sort("name", "timestamp").show(200)
    # Create the target variable

    w = Window().partitionBy("name").orderBy("timestamp")
    dane = powietrze.withColumn("target",
                                lead("target_temp", 4).over(w)).na.drop()

    #dane.sort("name", "timestamp").show(200)
    #print(dane.schema)

    return dane, sc
Code example #8
def get_step_duration(step_df,
                      check_col,
                      key_cols,
                      offset=0,
                      duration_col="duration",
                      timestamp_col="timestamp",
                      unit="min"):
    """

    :param step_df:
    :param key_cols:
    :param offset:
    :param duration_col:
    :return:
    """
    window = Window.partitionBy(key_cols).orderBy("time").rowsBetween(1, 1)
    df = step_df.withColumn(
        duration_col,
        get_up_time(func.col(timestamp_col),
                    func.lead(timestamp_col, 1, None).over(window),
                    func.col(check_col), func.lit(offset), func.lit(unit)))
    required_cols = []
    required_cols.extend(["time", timestamp_col])
    required_cols.extend(key_cols)
    required_cols.extend([check_col, duration_col])
    return df.select(required_cols)
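
get_step_duration delegates the actual arithmetic to a get_up_time UDF that is not shown above. A hedged sketch of a compatible UDF follows; the semantics of check_col and offset are guesses, so treat this as an illustration rather than the original implementation:

from pyspark.sql import functions as func
from pyspark.sql.types import DoubleType

@func.udf(returnType=DoubleType())
def get_up_time(ts, next_ts, check_value, offset, unit):
    # Duration from this row's timestamp to the next one, plus an offset,
    # returned in minutes or seconds; None when there is no next row.
    if ts is None or next_ts is None or check_value is None:
        return None
    seconds = (next_ts - ts).total_seconds() + offset
    return seconds / 60.0 if unit == "min" else seconds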
Code example #9
File: interpol.py  Project: databrickslabs/tempo
    def __generate_target_fill(self, df: DataFrame, partition_cols: List[str],
                               ts_col: str, target_col: str) -> DataFrame:
        """
        Create columns for previous and next value for a specific target column

        :param df: input DataFrame
        :param partition_cols: partition column names
        :param ts_col: timestamp column name
        :param target_col: target column name
        """
        return (df.withColumn(
            f"previous_{target_col}",
            last(df[target_col], ignorenulls=True).over(
                Window.partitionBy(
                    *partition_cols).orderBy(ts_col).rowsBetween(
                        Window.unboundedPreceding, 0)),
        )
                # Handle if subsequent value is null
                .withColumn(
                    f"next_null_{target_col}",
                    last(df[target_col], ignorenulls=True).over(
                        Window.partitionBy(*partition_cols).orderBy(
                            col(ts_col).desc()).rowsBetween(
                                Window.unboundedPreceding, 0)),
                ).withColumn(
                    f"next_{target_col}",
                    lead(df[target_col]).over(
                        Window.partitionBy(*partition_cols).orderBy(ts_col)),
                ))
Code example #10
def get_session_stats(data_in, name):
    # Limit to song plays
    session_vars = data_in.filter(data_in['page'] == 'NextSong')

    # Get play time for each song
    session_window = ssw.Window.partitionBy(
        session_vars['userId'], session_vars['sessionId']).orderBy('ts')
    session_vars = session_vars.withColumn(
        'nextPlayStart',
        ssf.lead(session_vars['ts']).over(session_window))

    session_vars = session_vars.withColumn(
        'playTime', session_vars['nextPlayStart'] - session_vars['ts'])

    # Get play time for each session
    session_vars = session_vars.groupby('userId', 'sessionId').agg(
        #pylint: disable=no-member
        ssf.sum('playTime').alias('sessionLength'))

    # Get play time stats for each user
    session_vars = session_vars.groupby('userId').agg(
        #pylint: disable=no-member
        ssf.mean('sessionLength').alias(f'lengthMean{name}'),
        ssf.stddev('sessionLength').alias(f'lengthStd{name}'),
        ssf.sum('sessionLength').alias(f'lengthSum{name}'))

    return session_vars
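
A hedged usage sketch for get_session_stats; events_df, the ts cutoff and the suffix strings below are illustrative assumptions rather than part of the original code:

all_stats = get_session_stats(events_df, 'All')                                            # stats over the whole log
recent_stats = get_session_stats(events_df.filter(events_df['ts'] > cutoff_ts), 'Recent')  # stats over a recent slice
features = all_stats.join(recent_stats, on='userId', how='left')                           # per-user feature table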
Code example #11
def load_velib(keys_space_name="json",
               table_name="velib",
               time_frame=None,
               spark=None,
               agg=None,
               time_frames="5 minutes",
               time_update="1 minute"):

    # Load the data

    velib_temp, sc = load_table.load_and_get_table_df(keys_space_name,
                                                      table_name, time_frame,
                                                      spark)

    # Add variables that precisely describe the time and drop columns not used for prediction

    velib = velib_preprocessing(velib_temp, agg, time_frames, time_update)

    velib.sort("station_id", "timestamp").show(300)

    # Create the target variable

    w = Window().partitionBy("station_id").orderBy("timestamp")
    dane = velib.withColumn("target",
                            lead("num_bikes_available",
                                 240).over(w)).na.drop()

    #dane.sort("station_id", "timestamp").show(300)
    #print(dane.dtypes)

    return dane, sc
Code example #12
def add_distance_column(dfs, order_column='timestamp'):
    # Radians lat/lon
    dfs = dfs.withColumn('latitude2', F.radians('latitude')).withColumn(
        'longitude2', F.radians('longitude'))

    # Groups GPS locations into chunks. A chunk is formed by groups of points that are no more than roam_dist apart
    w = Window.partitionBy(['userID']).orderBy(order_column)
    dfs = dfs.withColumn('next_lat', F.lead('latitude2', 1).over(w))
    dfs = dfs.withColumn('next_lon', F.lead('longitude2', 1).over(w))

    # Haversine distance
    dfs = dfs.withColumn('distance_next', EARTH_RADIUS * 2 * F.asin(F.sqrt(
        F.pow(F.sin((col('next_lat') - col('latitude2')) / 2.0), 2) + F.cos('latitude2') * F.cos('next_lat') * F.pow(
            F.sin((col('next_lon') - col('longitude2')) / 2.0), 2))))
    dfs = dfs.withColumn('distance_prev', F.lag('distance_next', default=0).over(w)).drop(
        'latitude2').drop('longitude2').drop('next_lon').drop('next_lat').drop('distance_next')
    return dfs
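
add_distance_column assumes an EARTH_RADIUS constant plus the usual imports, none of which are shown. A minimal setup sketch, with the radius value and input columns as assumptions, could be:

from pyspark.sql import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import col

EARTH_RADIUS = 6371  # mean Earth radius in km, so distance_prev is returned in km

# gps_df is assumed to have userID, latitude, longitude and timestamp columns
dfs = add_distance_column(gps_df, order_column='timestamp')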
Code example #13
 def _add_time_diff(self, df):
     dedup_cols = self.group_by_columns + [self.time_column]
     df = df.dropDuplicates(dedup_cols)
     df = df.withColumn(
         DIFF_COL,
         F.abs(
             F.unix_timestamp(
                 F.lead(self.time_column).over(self.merge_window)) -
             F.unix_timestamp(F.col(self.time_column))))
     return df
Code example #14
def next_same(df,
              field,
              partition_field='resume_id',
              sort_field='-work_index'):
    """获取 下一个值与当前行该字段的值相等的数据 """
    window_spec = Window.partitionBy(partition_field).orderBy(sort(sort_field))
    new_field = f"next_{field}"
    df = df.withColumn(new_field, F.lead(df[field], 1).over(window_spec))
    # Filter out rows where the values differ or the next value is null
    df = df.filter(df[field] == df[new_field]).filter(
        df[new_field].isNotNull())
    df = df.drop(new_field)
    return df
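
next_same (and add_prev_or_next in code example #25) rely on a sort() helper that is not shown. A plausible sketch, assuming a leading '-' marks descending order:

from pyspark.sql import functions as F

def sort(field):
    # '-work_index' -> order by work_index descending, 'work_index' -> ascending
    return F.desc(field[1:]) if field.startswith('-') else F.asc(field)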
Code example #15
def get_change_status(df):
    """Calculates the change status and change date for each record e.g (New, Change, No Change, Deleted)
    :param df:
    :return df: 
    """

    employeeWindowSpec = Window.partitionBy("employee_number").orderBy(
        F.col("snapshot_date"))
    constWindowSpec = Window.partitionBy(F.lit(1)).orderBy(
        F.col("snapshot_date").desc())

    helper_columns = [
        "min_snapshot", "row_hash", "max_snapshot", "prev_row_hash",
        "next_row_hash", "global_max_shapshot"
    ]

    add_change_status_df = df \
        .withColumn("min_snapshot", F.min("snapshot_date").over(employeeWindowSpec)) \
        .withColumn("max_snapshot", F.max("snapshot_date").over(employeeWindowSpec)) \
        .withColumn("prev_row_hash", F.lag("row_hash").over(employeeWindowSpec)) \
        .withColumn("next_row_hash", F.lead("row_hash").over(employeeWindowSpec)) \
        .withColumn("global_max_shapshot", F.max("snapshot_date").over(constWindowSpec)) \
        .withColumn("change_status",
                F.when(F.col("min_snapshot") == F.col("snapshot_date"), "New") \
                .when(F.col("next_row_hash").isNull() & (F.col("max_snapshot") != F.col("global_max_shapshot"))  , "Deleted") \
                .when(F.col("prev_row_hash") != F.col("row_hash"), "Changed") \
                .when(F.col("prev_row_hash") == F.col("row_hash"), "No Change") \
                .otherwise("Unknown") \
               )

    rowNum1Window = Window.partitionBy(["employee_number"]).orderBy(
        F.col("snapshot_date").desc())
    rowNum2Window = Window.partitionBy(["employee_number",
                                        "row_hash"]).orderBy(
                                            F.col("snapshot_date").desc())
    gapGrpWindow = Window.partitionBy(["gap_grp"
                                       ]).orderBy(F.col("snapshot_date"))

    helper_columns.extend(["gap_row_num_1", "gap_row_num_2", "gap_grp"])

    add_change_date_df = add_change_status_df \
        .withColumn("gap_row_num_1", F.row_number().over(rowNum1Window)) \
        .withColumn("gap_row_num_2", F.row_number().over(rowNum2Window)) \
        .withColumn("gap_grp", F.col("gap_row_num_1") - F.col("gap_row_num_2")) \
        .withColumn("changed_status_date",F.min("snapshot_date").over(gapGrpWindow)) \
        .withColumn("changed_status_date", \
                    F.when(F.col("change_status") == "Deleted",F.col("snapshot_date")) \
                    .otherwise(F.col("changed_status_date"))) \
        .drop(*helper_columns)

    return add_change_date_df
Code example #16
    def mr(self, dataframe):

        tmp = dataframe.withColumn("num", f.monotonically_increasing_id()) \
            .withColumn("next", f.lead(dataframe[0], 1, 0) \
                        .over(window.Window.orderBy("num"))).drop("num")

        R = tmp.rdd.map(lambda x: abs(x[0] - x[1])).map(
            lambda x: float(x)).collect()[:-1]
        Rbar = np.mean(R)

        lclr = D3[2] * Rbar
        uclr = D4[2] * Rbar

        return R, Rbar, lclr, uclr
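
mr() (and i() in code example #19) index into control-chart constant tables D3, D4 and d2 that are defined elsewhere. For a moving range of subgroup size 2 the standard Shewhart constants are given below; whether the originals are dicts or lists keyed differently is an assumption:

d2 = {2: 1.128}   # mean relative range for n = 2
D3 = {2: 0.0}     # lower control-limit factor for the R chart, n = 2
D4 = {2: 3.267}   # upper control-limit factor for the R chart, n = 2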
Code example #17
    def do_it(spark):
        df = gen_df(spark, data_gen, length=2048) \
            .withColumn('inc_count_1', f.count('*').over(inclusive_window_spec)) \
            .withColumn('inc_count_c', f.count('c').over(inclusive_window_spec)) \
            .withColumn('lead_5_c', f.lead('c', 5).over(base_window_spec)) \
            .withColumn('lag_1_c', f.lag('c', 1).over(base_window_spec)) \
            .withColumn('row_num', f.row_number().over(base_window_spec))

        if isinstance(c_gen, StructGen):
            """
            The MIN()/MAX() aggregations amount to a RANGE query. These are not
            currently supported on STRUCT columns.
            Also, LEAD()/LAG() defaults cannot currently be specified for STRUCT
            columns. `[ 10, 3.14159, "foobar" ]` isn't recognized as a valid STRUCT scalar.
            """
            return df.withColumn('lead_def_c', f.lead('c', 2, None).over(base_window_spec)) \
                     .withColumn('lag_def_c', f.lag('c', 4, None).over(base_window_spec))
        else:
            default_val = gen_scalar_value(c_gen, force_no_nulls=False)
            return df.withColumn('inc_max_c', f.max('c').over(inclusive_window_spec)) \
                     .withColumn('inc_min_c', f.min('c').over(inclusive_window_spec)) \
                     .withColumn('lead_def_c', f.lead('c', 2, default_val).over(base_window_spec)) \
                     .withColumn('lag_def_c', f.lag('c', 4, default_val).over(base_window_spec))
Code example #18
def create_vars(df, cells):
    # Loading variables
    df = df.withColumn("call_datetime", to_timestamp("call_datetime","dd/MM/yyyy HH:mm:ss"))
    #get call_date from call_datetime
    df = df.withColumn('call_date', df.call_datetime.cast('date'))
    
    # Recreate analysis variables
    df = df.join(cells, df.location_id == cells.cell_id, how = 'left').drop('cell_id')\
      .orderBy('msisdn', 'call_datetime')\
      .withColumn('region_lag', F.lag('region').over(user_window))\
      .withColumn('region_lead', F.lead('region').over(user_window))\
      .withColumn('call_datetime_lag', F.lag('call_datetime').over(user_window))\
      .withColumn('call_datetime_lead', F.lead('call_datetime').over(user_window))\
      .withColumn('hour_of_day', F.hour('call_datetime').cast('byte'))\
      .withColumn('hour', F.date_trunc('hour', F.col('call_datetime')))\
      .withColumn('week', F.date_trunc('week', F.col('call_datetime')))\
      .withColumn('month', F.date_trunc('month', F.col('call_datetime')))\
      .withColumn('constant', F.lit(1).cast('byte'))\
      .withColumn('day', F.date_trunc('day', F.col('call_datetime')))\
      .na.fill({'region' : missing_value_code ,
                'region_lag' : missing_value_code ,
                'region_lead' : missing_value_code })    

    return df
Code example #19
    def i(self, dataframe):
        tmp = dataframe.withColumn("num", f.monotonically_increasing_id()) \
            .withColumn("next", f.lead(dataframe[0], 1, 0) \
                        .over(window.Window.orderBy("num"))).drop("num")

        R = tmp.rdd.map(lambda x: abs(x[0] - x[1])).map(
            lambda x: float(x)).collect()[:-1]
        Rbar = np.mean(R)

        X = dataframe.rdd.map(lambda x: float(x[0])).collect()
        Xbar = np.mean(X)

        lcl = Xbar - 3 * (Rbar / d2[2])
        ucl = Xbar + 3 * (Rbar / d2[2])

        return X, Xbar, lcl, ucl
Code example #20
File: function.py  Project: amustaf5/SparkVCFtools
def preVCF(hdfs, spark):
    vcf = spark.sparkContext.textFile(hdfs)
    col_name = vcf.filter(lambda x: x.startswith("#CHROM")).first().split("\t")
    vcf_data = vcf.filter(lambda x : re.match("[^#][^#]", x))\
                       .map(lambda x : x.split("\t"))\
                       .toDF(col_name)\
                       .withColumn("POS", F.col("POS").cast(IntegerType()))\
                       .withColumn("FORMAT", F.array_remove(F.split(F.col("FORMAT"), ":"), "GT"))\
                       .withColumn("INFO", when(F.col("INFO").startswith("END="), None).otherwise(F.col("INFO")))

    sample_name = vcf_data.columns[-1]
    vcf_data = vcf_data.drop("QUAL", "FILTER", sample_name)

    for index in range(len(vcf_data.columns)):
        compared_arr = ["#CHROM", "POS", "REF"]
        if vcf_data.columns[index] not in compared_arr:
            vcf_data = vcf_data.withColumn(vcf_data.columns[index],
                                           F.array(vcf_data.columns[index]))
            vcf_data = vcf_data.withColumnRenamed(
                vcf_data.columns[index],
                vcf_data.columns[index] + "_" + sample_name)
    vcf_data = vcf_data.withColumnRenamed("REF", "REF_" + sample_name)
    vcf_data = vcf_data.withColumn("count", F.length(vcf_data.columns[3]))

    # window & split col parameter
    sample_ref = vcf_data.columns[3]
    indel_window = Window.partitionBy("#CHROM").orderBy("POS")
    split_col = F.split("REF_temp", '_')
    ###

    vcf_data = vcf_data.withColumn("count", when(F.col("count") >= 2, F.lead("POS", 1).over(indel_window) - F.col("POS") - F.lit(1))\
                                           .otherwise(F.lit(0)))

    not_indel = vcf_data.drop("count").withColumn(sample_ref,
                                                  F.array(F.col(sample_ref)))
    indel = vcf_data.filter(F.col("count") >= 1)\
                .withColumn(sample_ref, ref_melt(F.col(sample_ref), F.col("count"))).drop("count")\
                .withColumn(sample_ref, explode(F.col(sample_ref))).withColumnRenamed(sample_ref, "REF_temp")\
                .withColumn(sample_ref, F.array(split_col.getItem(0))).withColumn('POS_var', split_col.getItem(1))\
                .drop(F.col("REF_temp")).withColumn("POS", (F.col("POS") + F.col("POS_var")).cast(IntegerType()))\
                .drop(F.col("POS_var"))\
                .withColumn(vcf_data.columns[2], F.array(F.lit(".")))\
                .withColumn(vcf_data.columns[4], F.array(F.lit("*, <NON_REF>")))

    return not_indel.unionByName(indel)
Code example #21
File: interpol.py  Project: databrickslabs/tempo
    def __generate_time_series_fill(self, df: DataFrame,
                                    partition_cols: List[str],
                                    ts_col: str) -> DataFrame:
        """
        Create additional timeseries columns for previous and next timestamps

        :param df: input DataFrame
        :param partition_cols: partition column names
        :param ts_col: timestamp column name
        """
        return df.withColumn(
            "previous_timestamp",
            col(ts_col),
        ).withColumn(
            "next_timestamp",
            lead(df[ts_col]).over(
                Window.partitionBy(*partition_cols).orderBy(ts_col)),
        )
Code example #22
def get_dmi_realtime_value(spark_df, cal_trend_ratio):
    """
    Compute the DMI values in real time; translated from the stored procedure
    cd_11_sec.dbo.compute_mkt_adx_v2 without modification.
    :param spark_df: Spark DataFrame of market data containing open, high, low and close
    :param cal_trend_ratio: fraction of the data to keep for computing the signal
    :return:
    """
    dmi_period = 30  # period of 30 rather than the conventional 14
    # x, y and z are null for the last row
    udf_max = func.udf(
        lambda x, y, z: float(
            max(max(abs(x), abs(y)), abs(z))
            if x is not None and y is not None and z is not None else np.nan),
        FloatType())
    udf_get_ema = func.udf(lambda x: float(get_ema(x, dmi_period)),
                           FloatType())
    # di_plus, di_minus and tr differ from the conventional algorithm: they use the next day minus the current day (reason unclear); see compute_mkt_adx_v2
    w = Window.partitionBy('sec_cd').orderBy('pub_dt')
    spark_df = spark_df.withColumn('di_plus', func.lead('high', 1).over(w) - func.col('high'))\
        .withColumn('di_minus', func.col('low') - func.lead('low', 1).over(w))\
        .withColumn('tr', udf_max(
                            (func.lead('high', 1).over(w) - func.lead('low', 1).over(w)),
                            (func.lead('high', 1).over(w) - func.col('close')),
                            (func.col('close') - func.lead('low', 1).over(w))))
    # replace NaN with null
    spark_df = spark_df.replace(float('nan'), None)
    # Adjust di_plus and di_minus; sm_di_plus, sm_di_minus and sm_tr are weighted averages of di_plus, di_minus and tr
    # pdi and mdi take the previous day's values here (reason unclear)
    spark_df = spark_df\
        .withColumn('di_plus', func.when(
        (func.col('di_plus') > func.col('di_minus')) & (func.col('di_plus') > 0),
        func.col('di_plus')).otherwise(0))\
        .withColumn('di_minus', func.when(
        (func.col('di_minus') > func.col('di_plus')) & (func.col('di_minus') > 0),
        func.col('di_minus')).otherwise(0))\
        .withColumn('sm_di_plus', udf_get_ema(func.collect_list('di_plus').over(w)))\
        .withColumn('sm_di_minus', udf_get_ema(func.collect_list('di_minus').over(w)))\
        .withColumn('sm_tr', udf_get_ema(func.collect_list('tr').over(w)))\
        .withColumn('pdi', func.col('sm_di_plus')/func.col('sm_tr')*100)\
        .withColumn('mdi', func.col('sm_di_minus')/func.col('sm_tr')*100)\
        .withColumn('pdi', func.lag('pdi', 1).over(w))\
        .withColumn('mdi', func.lag('mdi', 1).over(w))
    # keep the last 40% of the data
    w = Window.partitionBy('sec_cd')
    spark_df = spark_df.withColumn('sec_length', func.count('pub_dt').over(w))
    w = Window.partitionBy('sec_cd').orderBy('pub_dt')
    spark_df = spark_df.withColumn('row_no', func.count('pub_dt').over(w))
    spark_df = spark_df.filter(
        spark_df.row_no >= spark_df.sec_length * cal_trend_ratio)
    spark_df = spark_df.select('sec_cd', 'var_cl', 'mkt_cl', 'pub_dt', 'pdi',
                               'mdi')
    return spark_df
Code example #23
 def transform(self, df):
     """Cleanup and add new field for next url
     Assumptions after initial data analysis -
                   1) since this is about user visit journey drop all null visitorids
                   2) remove the row if there is any repeated page url in the sequence
                   before assigning next page url
                    pages(a -> a -> b -> a) => (a->b->a)
     """
     # drop null visitors and repartition by visitor to perform multiple sorting on visitor id
     df = df.na.drop(subset=['visitorId']).repartition("visitorId")
     # change datatype of timestamp to long before applying sorting
     df = df.select("id", df['timestamp'].cast('Long').alias('timestamp'), "type", "visitorId","pageURL")
     # Remove duplicate url entries per visitor in adjacent rows
     window_spec = Window.partitionBy("visitorId").orderBy("timestamp")
     df = df.withColumn("prevUrl", F.lag("pageURL").over(window_spec)).filter("pageURL != prevUrl or prevUrl is null")
     # add next page url and display timestamp in readable format
     df = df.withColumn("nextPageUrl",F.lead("pageURL").over(window_spec))
     return(df.select("id", "timestamp", "type" , "visitorId", "pageUrl", "nextPageUrl"))
Code example #24
def add_columns_lead(df, list_of_tuples, window):
    """
    :param df: Spark DataFrame
    :param list_of_tuples:
    Ex:
    [
        ("old_column_1", "new_column_1"),
        ("old_column_2", "new_column_2"),
        ("old_column_3", "new_column_3")
    ]
    :param window: Spark Window to iterate over
    :return: Spark DataFrame with new columns
    """

    for (old_col, new_col) in list_of_tuples:
        df = df.withColumn(new_col, lead(old_col).over(window))

    return df
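
A usage sketch for add_columns_lead; the column names and window definition are illustrative assumptions:

from pyspark.sql import Window

w = Window.partitionBy("user_id").orderBy("event_time")
df = add_columns_lead(df, [("page", "next_page"),
                           ("event_time", "next_event_time")], w)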
Code example #25
def add_prev_or_next(df,
                     old_field,
                     new_field,
                     partition_field='resume_id',
                     sort_field='-work_index'):
    """为某一字段的每一行添加其前一个或后一个值"""
    window_spec = Window.partitionBy(partition_field).orderBy(sort(sort_field))
    if 'prev' in new_field:
        # take the previous row's value
        df = df.withColumn(new_field,
                           F.lag(df[old_field], 1).over(window_spec))
    elif 'next' in new_field:
        # take the next row's value
        df = df.withColumn(new_field,
                           F.lead(df[old_field], 1).over(window_spec))
    # Filter out rows where the values are equal or the new field is null
    df = df.filter(df[old_field] != df[new_field]).filter(
        df[new_field].isNotNull())
    # df = df.filter(df[new_field].isNotNull())

    return df
Code example #26
def get_dmi_realtime_signal(spark_df, cal_trend_ratio):
    dmi_period = 30  # period of 30 rather than the conventional 14
    # x, y and z are null for the last row
    udf_max = func.udf(
        lambda x, y, z: float(
            max(max(abs(x), abs(y)), abs(z))
            if x is not None and y is not None and z is not None else np.nan),
        FloatType())
    udf_get_ema = func.udf(lambda x: float(get_ema(x, dmi_period)),
                           FloatType())
    # di_plus, di_minus and tr are translated directly from the stored procedure and differ from the conventional algorithm; see compute_mkt_adx_v2
    w = Window.partitionBy('sec_cd').orderBy('pub_dt')
    spark_df = spark_df.withColumn('di_plus', func.lead('high', 1).over(w) - func.col('high'))\
        .withColumn('di_minus', func.col('low') - func.lead('low', 1).over(w))\
        .withColumn('tr', udf_max(
                            (func.lead('high', 1).over(w) - func.lead('low', 1).over(w)),
                            (func.lead('high', 1).over(w) - func.col('close')),
                            (func.col('close') - func.lead('low', 1).over(w))))
    # replace NaN with null
    spark_df = spark_df.replace(float('nan'), None)
    # Adjust di_plus and di_minus; sm_di_plus, sm_di_minus and sm_tr are weighted averages of di_plus, di_minus and tr
    # pdi and mdi take the previous day's values here (reason unclear)
    spark_df = spark_df\
        .withColumn('di_plus', func.when(
        (func.col('di_plus') > func.col('di_minus')) & (func.col('di_plus') > 0),
        func.col('di_plus')).otherwise(0))\
        .withColumn('di_minus', func.when(
        (func.col('di_minus') > func.col('di_plus')) & (func.col('di_minus') > 0),
        func.col('di_minus')).otherwise(0))\
        .withColumn('sm_di_plus', udf_get_ema(func.collect_list('di_plus').over(w)))\
        .withColumn('sm_di_minus', udf_get_ema(func.collect_list('di_minus').over(w)))\
        .withColumn('sm_tr', udf_get_ema(func.collect_list('tr').over(w)))\
        .withColumn('pdi', func.col('sm_di_plus')/func.col('sm_tr')*100)\
        .withColumn('mdi', func.col('sm_di_minus')/func.col('sm_tr')*100)\
        .withColumn('pdi', func.lag('pdi', 1).over(w))\
        .withColumn('mdi', func.lag('mdi', 1).over(w))
    # keep the last 40% of the data
    w = Window.partitionBy('sec_cd')
    spark_df = spark_df.withColumn('sec_length', func.count('pub_dt').over(w))
    w = Window.partitionBy('sec_cd').orderBy('pub_dt')
    spark_df = spark_df.withColumn('row_no', func.count('pub_dt').over(w))
    spark_df = spark_df.filter(
        spark_df.row_no >= spark_df.sec_length * cal_trend_ratio)
    spark_df = spark_df.select('sec_cd', 'var_cl', 'mkt_cl', 'pub_dt', 'pdi',
                               'mdi')
    return spark_df
Code example #27
def json_without_trailing_comma(json_as_text: DataFrame) -> DataFrame:
    """Remove trailing commas from a dataframe representing a json file read as text.
    Args:
      json_as_text (DataFrame): the input json as dataframe of strings
    Returns:
      :DataFrame: a dataframe of strings representing a valid json without trailing commas
    """
    index_window = Window.partitionBy().orderBy("id")
    columns = ["line", "id"]

    dataframe_with_next_record = json_as_text \
        .rdd \
        .zipWithIndex() \
        .toDF(columns) \
        .withColumn("next_line", functions.lead("line").over(index_window))

    dataframe_with_valid_lines = dataframe_with_next_record \
        .rdd \
        .map(remove_trailing_comma) \
        .toDF() \
        .select("line.value")

    return dataframe_with_valid_lines
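
json_without_trailing_comma maps each record through a remove_trailing_comma function that is not shown. A plausible sketch, assuming each record carries the current and next text lines as structs with a value field (as produced by spark.read.text) and that a trailing comma should only be dropped when the next line closes an object or array:

from pyspark.sql import Row

def remove_trailing_comma(record):
    text = record.line.value
    next_text = record.next_line.value if record.next_line is not None else ""
    if text.rstrip().endswith(",") and next_text.lstrip()[:1] in ("]", "}"):
        text = text.rstrip()[:-1]
    return Row(line=Row(value=text))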
Code example #28
def prepare_interpolated_updates_dataframe(spark: SparkSession,
                                           silver_df: DataFrame) -> DataFrame:
    dateWindow = Window.partitionBy("device_id").orderBy("p_eventdate")

    lag_lead_silver_df = silver_df.select(
        "*",
        lag(col("heartrate")).over(dateWindow).alias("prev_amt"),
        lead(col("heartrate")).over(dateWindow).alias("next_amt"),
    )
    updates = lag_lead_silver_df.where(col("heartrate") < 0)
    updates = updates.withColumn(
        "heartrate",
        when(col("prev_amt").isNull(), col("next_amt")).otherwise(
            when(col("next_amt").isNull(), col("prev_amt")).otherwise(
                (col("prev_amt") + col("next_amt")) / 2)),
    )
    return updates.select(
        "device_id",
        "heartrate",
        "eventtime",
        "name",
        "p_eventdate",
    )
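
A minimal driver for prepare_interpolated_updates_dataframe, assuming a silver table that contains the columns the function selects (the table name is hypothetical):

silver_df = spark.table("heartrate_silver")   # hypothetical table name
updates_df = prepare_interpolated_updates_dataframe(spark, silver_df)
updates_df.show(5)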
Code example #29
def generate_dataset_x_y(df_assembled, CONFIG_PREPROCESS):
    x_window_spec = Window.partitionBy('CELL_NUM').orderBy('dt')

    x_head_skip_size = CONFIG_PREPROCESS.DAYS_TO_MEMORY * CONFIG_PREPROCESS.ITEMS_PER_DAY
    shuffled_with_cell = df_assembled.withColumn(
        'seq',
        row_number().over(x_window_spec)).cache()
    input_x = shuffled_with_cell.filter('seq > ' + str(x_head_skip_size))

    x_feat_cols = ['features0']
    input_x = input_x.withColumnRenamed('features', x_feat_cols[0])
    vector_udt_metadata = input_x.schema[input_x.schema.fields.index(
        input_x.schema[x_feat_cols[0]])].metadata

    for i in range(1, CONFIG_PREPROCESS.INPUT_X_SIZE):
        n_features = lead(col(x_feat_cols[0]), i).over(x_window_spec)
        col_name = 'features' + str(i)
        input_x = input_x.withColumn(
            'features' + str(i),
            n_features.alias(col_name, metadata=vector_udt_metadata))
        x_feat_cols.append('features{}'.format(i))

    x_tail_skip_size = CONFIG_PREPROCESS.INPUT_X_SIZE + CONFIG_PREPROCESS.INPUT_Y_SIZE - 1
    inbound = when(
        input_x['seq'] <= (max(input_x['seq']).over(
            x_window_spec.rangeBetween(-sys.maxsize, sys.maxsize)) -
                           x_tail_skip_size), 1).otherwise(0)
    input_x = input_x.withColumn('inbound', inbound).filter('inbound == 1')
    input_x = VectorAssembler().setInputCols(x_feat_cols).setOutputCol(
        'features').transform(input_x).select(['dt', 'CELL_NUM', 'features'])

    y_head_skip_size = CONFIG_PREPROCESS.DAYS_TO_MEMORY * CONFIG_PREPROCESS.ITEMS_PER_DAY + CONFIG_PREPROCESS.INPUT_X_SIZE  # rows to skip, for M & X
    input_y = shuffled_with_cell.filter('seq > ' +
                                        str(y_head_skip_size)).drop('seq')

    return input_x, input_y
Code example #30
def main():

    parser = argparse.ArgumentParser()
    # Read Arguments
    parser.add_argument("--std", "-s", help="Number of STDEVs for labelling")
    parser.add_argument("--window", "-w", help="Window size")
    parser.add_argument("--ahead", "-a", help="Days ahead to check")
    parser.add_argument("--target", "-t", help="Target column to be used")
    parser.add_argument("--bucket", "-b", help="GS bucket name")
    parser.add_argument("--input", "-i", help="Path to input data within bucket")
    parser.add_argument("--images", "-k", help="Flag to indicate to generate images")

    parser.add_argument("--volume", "-v", help="Plot Volume?")
    parser.add_argument("--sma", "-m", help="Plot SMA?")
    parser.add_argument("--rsi", "-r", help="Plot RSI?")
    parser.add_argument("--figsize", "-fig", help="Figure Size")
    parser.add_argument("--offset", "-f", help="Offset")
    parser.add_argument("--dpi", "-d", help="DPI")
    parser.add_argument("--stocks", "-c", help="Stocks")

    # Init Context and Session
    spark = SparkSession.builder.appName('Preprocessing').getOrCreate()

    # read arguments from the command line
    args = parser.parse_args()

    # Set parameters for execution
    stdevs = spark.sparkContext.broadcast(float(args.std) if args.std else 1.0)
    days_ahead = spark.sparkContext.broadcast(int(args.ahead) if args.ahead else 10)
    target = spark.sparkContext.broadcast(args.target if args.target else 'ADJ_CLOSE')
    figsize = spark.sparkContext.broadcast(int(args.figsize) if args.figsize else 1)
    volume = spark.sparkContext.broadcast(args.volume if args.volume else True)
    sma = spark.sparkContext.broadcast(args.sma if args.sma else True)
    rsi = spark.sparkContext.broadcast(args.rsi if args.rsi else False)
    window_size = spark.sparkContext.broadcast(int(args.window) if args.window else 21)
    window_offset = spark.sparkContext.broadcast(int(args.offset) if args.offset else 5)
    dpi = spark.sparkContext.broadcast(int(args.dpi) if args.dpi else 72)
    stocks = args.stocks if args.stocks else 'sp500.csv'
    create_images = args.images if args.images else False

    # Initialize logger
    log = Log4j(spark)

    # Log used parameters
    log.log("Std. Devs: " + str(stdevs.value))
    log.log("Window size: " + str(window_size.value))
    log.log("Window offset: " + str(window_offset.value))
    log.log("Days ahead: " + str(days_ahead.value))
    log.log("Column to be used: " + str(target.value))
    log.log("Plot volume: " + str(volume.value))
    log.log("Figure Size: " + str(figsize.value))
    log.log("DPI: " + str(dpi.value))
    log.log("Create Images: " + str(create_images))


    # Read source dataframe from GS Bucket
    # This file contains all the prices we are going to use
    inputpath = 'gs://' + args.bucket + '/' + args.input
    df = spark.read.csv(inputpath, header=True, inferSchema=True)

    log.log("Num Partitions: " + str(df.rdd.getNumPartitions()))

    ## LABEL CALCULATION

    # Calculate pct daily change
    df = df.withColumn('PREV_DAY', lag(df[target.value]).over(Window.partitionBy("SYMBOL").orderBy("DATE")))
    df = df.withColumn('CHANGE', 100 * (df[target.value] - df['PREV_DAY']) / df[target.value])

    # Get N number of days ahead
    df = df.withColumn('AHEAD', lead(df[target.value], days_ahead.value).over(Window.partitionBy("SYMBOL").orderBy("DATE")))

    # Calculate Annual HVOl
    windowSpec = Window.partitionBy("SYMBOL").orderBy(col('DATE').cast('long')).rowsBetween(-window_size.value, 0)
    df = df.withColumn('HVOL', stddev(col(target.value) * np.sqrt(252)).over(windowSpec))

    # Calculate Upper and Lower limits
    df = df.withColumn('UPPER_LIMIT',
            col(target.value) + col(target.value) * ((col('HVOL') * np.sqrt(days_ahead.value) / (np.sqrt(252))) * stdevs.value / 100))
    df = df.withColumn('LOWER_LIMIT',
            col(target.value) - col(target.value) * ((col('HVOL') * np.sqrt(days_ahead.value) / (np.sqrt(252))) * stdevs.value / 100))

    # Finally, calculate the label
    df = df.withColumn('LABEL',
            (when(col('AHEAD') >  col('UPPER_LIMIT'), 1)
            .when(col('AHEAD') >= col('LOWER_LIMIT'), 0)
            .otherwise(-1)))

    # Build path to save file
    data_label = datetime.now().strftime("%Y%m%d_%H%M%S") + '_W' + str(window_size.value) + '_A' + str(days_ahead.value) + '_O' + str(window_offset.value) + '_S' + str(stdevs.value)
    outputpath = 'gs://' + args.bucket + '/data/' + data_label
    log.log("Writting file to GCS: " + outputpath)

    # Check if create images flag is active
    if create_images == True:
        # Generate image data for CNN
        log.log("Generating Images")
        images(spark, log, 'gs://' + args.bucket + '/data/' + stocks, outputpath, df, figsize, window_size, volume, window_offset, dpi, target, sma, rsi)
    else:
        # Save prices with label for LSTM
        log.log("Save prices with label")
        df.repartition(1).write.csv((outputpath + "/labels"), header=True, mode='overwrite')
Code example #31
def main():
    "Main function"
    optmgr  = OptionParser()
    opts = optmgr.parser.parse_args()

    # setup spark/sql context to be used for communication with HDFS
    sc = SparkContext(appName="phedex_br")
    if not opts.yarn:
        sc.setLogLevel("ERROR")
    sqlContext = HiveContext(sc)

    schema_def = schema()

    # read given file(s) into RDD
    if opts.fname:
        pdf = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(opts.fname, schema = schema_def)
    elif opts.basedir:
        fromdate, todate = defDates(opts.fromdate, opts.todate)
        files = getFileList(opts.basedir, fromdate, todate)
        msg = "Between dates %s and %s found %d directories" % (fromdate, todate, len(files))
        print msg

        if not files:
            return
        pdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(file_path, schema = schema_def) \
                        for file_path in files])
    else:
        raise ValueError("File or directory not specified. Specify fname or basedir parameters.")

    # parsing additional data (to given data adding: group name, node kind, acquisition era, data tier, now date)
    groupdic, nodedic = getJoinDic()
    acquisition_era_reg = r"^/[^/]*/([^/^-]*)-[^/]*/[^/]*$"	
    data_tier_reg = r"^/[^/]*/[^/^-]*-[^/]*/([^/]*)$"
    groupf = udf(lambda x: groupdic[x], StringType())
    nodef = udf(lambda x: nodedic[x], StringType())

    ndf = pdf.withColumn("br_user_group", groupf(pdf.br_user_group_id)) \
         .withColumn("node_kind", nodef(pdf.node_id)) \
         .withColumn("now", from_unixtime(pdf.now_sec, "YYYY-MM-dd")) \
         .withColumn("acquisition_era", when(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1) == "",\
                    lit("null")).otherwise(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1))) \
        .withColumn("data_tier", when(regexp_extract(pdf.dataset_name, data_tier_reg, 1) == "",\
                    lit("null")).otherwise(regexp_extract(pdf.dataset_name, data_tier_reg, 1)))

	# print dataframe schema
    if opts.verbose:
        ndf.show()
        print("pdf data type", type(ndf))
        ndf.printSchema()

    # process aggregation parameters
    keys = [key.lower().strip() for key in opts.keys.split(',')]
    results = [result.lower().strip() for result in opts.results.split(',')]
    aggregations = [agg.strip() for agg in opts.aggregations.split(',')]
    order = [orde.strip() for orde in opts.order.split(',')] if opts.order else []
    asc = [asce.strip() for asce in opts.asc.split(',')] if opts.order else []
    filtc, filtv = opts.filt.split(":") if opts.filt else (None,None)

    validateAggregationParams(keys, results, aggregations, order, filtc)

    if filtc and filtv:
        ndf = ndf.filter(getattr(ndf, filtc) == filtv)

    # if delta aggregation is used
    if DELTA in aggregations:
        validateDeltaParam(opts.interval, results)			
        result = results[0]

        #1 for all dates generate interval group dictionary
        datedic = generateDateDict(fromdate, todate, opts.interval)
        boundic = generateBoundDict(datedic)
        max_interval = max(datedic.values())

        interval_group = udf(lambda x: datedic[x], IntegerType())
        interval_start = udf(lambda x: boundic[x][0], StringType())		
        interval_end = udf(lambda x: boundic[x][1], StringType())

        #2 group data by block, node, interval and last result in the interval
        ndf = ndf.select(ndf.block_name, ndf.node_name, ndf.now, getattr(ndf, result))
        idf = ndf.withColumn("interval_group", interval_group(ndf.now))
        win = Window.partitionBy(idf.block_name, idf.node_name, idf.interval_group).orderBy(idf.now.desc())	
        idf = idf.withColumn("row_number", rowNumber().over(win))
        rdf = idf.where((idf.row_number == 1) & (idf.interval_group != 0))\
                 .withColumn(result, when(idf.now == interval_end(idf.interval_group), getattr(idf, result)).otherwise(lit(0)))
        rdf = rdf.select(rdf.block_name, rdf.node_name, rdf.interval_group, getattr(rdf, result))
        rdf.cache()

        #3 create intervals that do not exist but have a negative delta
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        adf = rdf.withColumn("interval_group_aft", lead(rdf.interval_group, 1, 0).over(win))
        hdf = adf.filter(((adf.interval_group + 1) != adf.interval_group_aft) & (adf.interval_group != max_interval))\
                 .withColumn("interval_group", adf.interval_group + 1)\
                 .withColumn(result, lit(0))\
                 .drop(adf.interval_group_aft)

        #4 join data frames
        idf = rdf.unionAll(hdf)
		
        #5 join every interval with the previous interval
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        fdf = idf.withColumn("delta", getattr(idf, result) - lag(getattr(idf, result), 1, 0).over(win))

        #6 calculate delta_plus and delta_minus columns and aggregate by date and node
        ddf =fdf.withColumn("delta_plus", when(fdf.delta > 0, fdf.delta).otherwise(0)) \
                .withColumn("delta_minus", when(fdf.delta < 0, fdf.delta).otherwise(0))

        aggres = ddf.groupBy(ddf.node_name, ddf.interval_group).agg(sum(ddf.delta_plus).alias("delta_plus"),\
                                                                    sum(ddf.delta_minus).alias("delta_minus"))

        aggres = aggres.select(aggres.node_name, interval_end(aggres.interval_group).alias("date"), aggres.delta_plus, aggres.delta_minus)
		
    else:	
        resAgg_dic = zipResultAgg(results, aggregations)
        order, asc = formOrdAsc(order, asc, resAgg_dic)

        # perform aggregation
        if order:
            aggres = ndf.groupBy(keys).agg(resAgg_dic).orderBy(order, ascending=asc)
        else:
            aggres = ndf.groupBy(keys).agg(resAgg_dic)

    # output results
    if opts.fout:
        fout_header = formFileHeader(opts.fout)
        if opts.header:
            aggres.write.format('com.databricks.spark.csv').options(header = 'true').save(fout_header)
        else:
            aggres.write.format('com.databricks.spark.csv').save(fout_header)
    else:
        aggres.show(50)
Code example #32
File: main_inc.py  Project: dalinqin/src
    def processData(sc,hc,fs,con,incFileName,inThresh,outThresh,progTag):
        #incFileName="hdfs://namenode.navroomhdp.com:8020/data/t_cf_inc/100033/t_cf_20161028.txt"
        
        #inThresh=10
        #outThresh=300
        
        #**************************************
        #
        #this procedure will use the inc file to calculate
        #flow(everyday one record) store as file
        #indoor(everyday one record) store as file
        #indoor_for_delete(every indoor records) store in hbase
        #indoor detail(every indoor records) store in hbase and as file
        #
        #destIndoorFile     : /data/indoor/entityid/year/id_date.json      used to generate report
        #destFlowFile       : /data/flow/entityid/year/fl_date.json        used to generate report
        #rec_destIndoorfile : /data/rec_indoor/entityid/year/id_date.json  this folder is mirror of hbase records
        #
        #
        #**************************************

        destIndoorFile=get_str_indoorFileName(incFileName)
        #hdfs://namenode.navroomhdp.com:8020/data/indoor/100033/2016/id_20161028.txt
        rec_destIndoorfile=destIndoorFile.replace("/indoor/","/rec_indoor/")
        #hdfs://namenode.navroomhdp.com:8020/data/rec_indoor/101762/2016/id_20161011.txt
        destFlowFile  =destIndoorFile.replace("/indoor/","/flow/").replace("id_","fl_")
        #hdfs://namenode.navroomhdp.com:8020/data/flow/101762/2016/fl_20161011.txt
        tmp_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/indoor"+str(progTag)
        tmp_destFlowFolder   = "hdfs://namenode.navroomhdp.com:8020/data/tmp/flow"+str(progTag)
        tmp_rec_destIndoorFolder   = "hdfs://namenode.navroomhdp.com:8020/data/tmp/rec_indoor"+str(progTag)
        EntityID=int(get_str_entityID(incFileName))
        #101762
        histFileName=get_str_histFileName(incFileName) #processed file will be place here
        #hdfs://namenode.navroomhdp.com:8020/data/t_cf/101762/t_cf_20161011.txt
        if fs.exists(sc._jvm.Path(histFileName)):
            tmpFileName=get_str_tmpFileName(histFileName)
            #tmpFileName = hdfs://namenode.navroomhdp.com:8020/data/tmp/101762/t_cf_20161011.txt
            tmpFolderName=tmpFileName.rsplit('/',1)[0]+"tmp"
            #tmpFolderName=hdfs://namenode.navroomhdp.com:8020/data/tmp/101762tmp
            #copy hist file to temp folder and name it as hdfs://namenode.navroomhdp.com:8020/data/tmp/101762tmp/hist and distroy the hist file
            sc._jvm.FileUtil.copy(fs,sc._jvm.Path(histFileName),fs,sc._jvm.Path(tmpFolderName+"/hist"),True,True,con) 
            #copy inc file to temp folder and name it as hdfs://namenode.navroomhdp.com:8020/data/tmp/101762tmp/inc and  destroy the inc file
            sc._jvm.FileUtil.copy(fs,sc._jvm.Path(incFileName),fs,sc._jvm.Path(tmpFolderName+"/inc"),True,True,con)
            #copymerge the 2 files (inc and hist) into one file 
            sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmpFolderName),fs,sc._jvm.Path(tmpFileName),True,con,None)
            sc._jvm.FileUtil.copy(fs,sc._jvm.Path(tmpFileName),fs,sc._jvm.Path(incFileName),True,True,con)

        unixFirtDayofMonth = get_int_firstDayUnixDate(incFileName)
        # firtDayofMonth= 1475251200 it is 20161001 unixdate
        startUnixTime=get_int_fileNameUnixDate(incFileName) #1456808400 this is today's unix datetime
 
        rows_t_cf=sc.textFile(incFileName).map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], stime=p[1],flag=p[2]))
        HiveContext.createDataFrame(hc,rows_t_cf).registerTempTable("t_cf_inc_tmp")
        hc.sql("select distinct clientmac,stime,flag from t_cf_inc_tmp").registerTempTable("t_cf")
        
        df=hc.sql("select distinct ClientMac,stime ,lag(stime) over (partition by ClientMac order by stime) as lag_time ,lead(stime) over (partition by ClientMac order by stime) as lead_time from t_cf where flag=1")
        df1=df.withColumn("diff" , df["stime"]-df["lag_time"]).na.fill(-1)
        df1.filter((df1.diff>=outThresh)|(df1.lag_time ==-1)|( df1.lead_time==-1)).registerTempTable("df2")
        df2=hc.sql("select ClientMac,stime,lag_time,lead_time,case when (diff < "+ str(outThresh) +" and diff>0) then diff ELSE 0 end as diff from df2")
        df3=df2.withColumn("lag_time1",df2.lag_time+df2.diff).drop( "lag_time")
        df3.withColumn("lag_time2",func.lead("lag_time1").over(Window.partitionBy("clientMac"))).registerTempTable("df3")
        
        df4=hc.sql("select ClientMac,cast(stime as int) as ETime ,cast(lag_time2 as int) as LTime,cast((lag_time2- stime) as int) as Seconds from df3").na.fill(-1)
        df5=df4.filter((df4.LTime>0)&(df4.Seconds>=inThresh)&(df4.ETime>startUnixTime)&(df4.ETime<(startUnixTime+86400))).withColumn("ENTITYID",lit(EntityID)) #86400 is the number of seconds in one day
        df5.registerTempTable("df5")
        #DF5 will be saved to hbase as indoor details (rec_destIndoorfile); df6 and df7 will be used for stats calculation
        
        df6=hc.sql("select ClientMac,ETime, LTime, Seconds ,unix_timestamp(date_sub(from_unixtime(etime),0),'yyyy-MM-dd') as utoday from df5")
        df6.registerTempTable("df6_indoor")
        df7=hc.sql("select ClientMac,min(etime) as etime,max(ltime) as ltime,sum(Seconds) as seconds,utoday from df6_indoor group by ClientMac,utoday")
        df_current_result=df7.withColumn("ENTITYID",lit(EntityID)).withColumn('UFIRSTDAY',lit(unixFirtDayofMonth))

        flow_sql=  "select ClientMac,min(stime) as etime,max(stime) as ltime from t_cf where stime >"+str(startUnixTime) + " and stime <"+str(startUnixTime+86400)+" group by clientmac"
        hc.sql(flow_sql).registerTempTable("df_flow_tmp")
        df_flow=hc.sql("select ClientMac,etime,ltime,unix_timestamp(date_sub(from_unixtime(etime),0),'yyyy-MM-dd') as utoday from df_flow_tmp").withColumn("ENTITYID",lit(EntityID)).withColumn('UFIRSTDAY',lit(unixFirtDayofMonth))
       
        #df_flow.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_FLOW_TODAY") .option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save()
        #df_flow.saveAsTable("T_FLOW")
        if len(df5.head(1))==1:  # df5 is not empty (cheaper check than df5.rdd.isEmpty)
            tmp_rec_destIndoorFolder   = "hdfs://namenode.navroomhdp.com:8020/data/tmp/rec_indoor"+str(progTag)
            df5.select('clientmac','entityid','etime','ltime','seconds').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_rec_destIndoorFolder)            
            #df5.write.mode('overwrite').json(tmp_rec_destIndoorFolder)
            df5.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_INDOOR") .option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save()
            if fs.exists(sc._jvm.Path(rec_destIndoorfile)):  #the old indoor folder exists,will generate df_delete_pk for phoenix to delete invalid rows
                rows_rec_indoor=sc.textFile(rec_destIndoorfile).map(lambda r: r.split(",")).map(lambda p: Row(clientmac=str(p[0]), entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),seconds=int(p[4])))
                HiveContext.createDataFrame(hc,rows_rec_indoor).registerTempTable("df_old_indoor")
                df_old_indoor_pk=hc.sql("select ClientMac,ENTITYID,ETime from df_old_indoor") 
                df_current_result_pk=hc.sql("select ClientMac,ENTITYID,ETime from df5") 
                df_delete_pk = df_old_indoor_pk.subtract(df_current_result_pk)
                if len(df_delete_pk.head(1))==1:
                    df_delete_pk.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_INDOOR_FOR_DELETE").option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save()
        else:
            tmp_rec_destIndoorFolder="NONE"
            
        if len(df_flow.head(1))==1:
            tmp_destFlowFolder   = "hdfs://namenode.navroomhdp.com:8020/data/tmp/flow"+str(progTag)
            df_flow.select('clientmac','entityid','etime','ltime','utoday','ufirstday').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_destFlowFolder)
            #df_flow.write.mode('overwrite').json(tmp_destFlowFolder)
        else:
            tmp_destFlowFolder="NONE"
            
        if len(df_current_result.head(1))==1:
            tmp_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/indoor"+str(progTag)
            df_current_result.select('clientmac','entityid','etime','ltime','seconds','utoday','ufirstday').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_destIndoorFolder)
            #df_current_result.write.mode('overwrite').json(tmp_destIndoorFolder)
        else:
            tmp_destIndoorFolder="NONE"
        
        sc._jvm.FileUtil.copy(fs,sc._jvm.Path(incFileName),fs,sc._jvm.Path(histFileName),True,True,con) 

        if fs.exists(sc._jvm.Path(destIndoorFile)):
            fs.delete(sc._jvm.Path(destIndoorFile))
        if fs.exists(sc._jvm.Path(destFlowFile)):
            fs.delete(sc._jvm.Path(destFlowFile))
        if fs.exists(sc._jvm.Path(rec_destIndoorfile)):
            fs.delete(sc._jvm.Path(rec_destIndoorfile))        
        #deleting is required if the file already exists, otherwise copyMerge will fail
        
        if tmp_destIndoorFolder!="NONE":
            sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_destIndoorFolder),fs,sc._jvm.Path(destIndoorFile),True,con,None)
            #destIndoorFile=get_str_indoorFileName(incFileName)
        if tmp_destFlowFolder!="NONE":
            sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_destFlowFolder),fs,sc._jvm.Path(destFlowFile),True,con,None)
        if tmp_rec_destIndoorFolder!="NONE":
            sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_rec_destIndoorFolder),fs,sc._jvm.Path(rec_destIndoorfile),True,con,None)