def get_undirected_features(paired_interactions, paired_interactions_articles):
    features_all = paired_interactions \
        .withColumn('pair', f.array_sort(f.array(col('event_user_id'), col('event_user_id_r')))) \
        .drop_duplicates(subset=['pair']) \
        .select('pair', 'num_common_pages')
    features_articles = paired_interactions_articles \
        .withColumn('pair', f.array_sort(f.array(col('event_user_id'), col('event_user_id_r')))) \
        .drop_duplicates(subset=['pair']) \
        .select('pair', 'num_common_articles')  # ,'mean_concentration_ratio')

    undirected = features_all.join(features_articles, on='pair')
    return undirected
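A minimal usage sketch; the sample DataFrames, values, and SparkSession setup below are illustrative assumptions, not part of the original example. Sorting each id pair with array_sort lets the mirrored (a, b)/(b, a) rows collapse to a single undirected pair.

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()

paired_interactions = spark.createDataFrame(
    [(1, 2, 5), (2, 1, 5)],  # the (2, 1) row is the mirrored duplicate
    ['event_user_id', 'event_user_id_r', 'num_common_pages'])
paired_interactions_articles = spark.createDataFrame(
    [(1, 2, 3), (2, 1, 3)],
    ['event_user_id', 'event_user_id_r', 'num_common_articles'])

# Expected result: a single row ([1, 2], 5, 3)
get_undirected_features(paired_interactions, paired_interactions_articles).show()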
Example 2
def column_revalue(vcf):
    # TODO: the INFO value still needs to be revised
    name_list = ["ID", "REF", "ALT", "INFO", "FORMAT"]
    for name in name_list:
        if name == "FORMAT":
            vcf = vcf.withColumn(
                name, F.array_sort(F.array_distinct(F.flatten(F.col(name)))))
            vcf = vcf.withColumn(
                name, F.concat(F.lit("GT:"), F.array_join(F.col(name), ":")))
        else:
            vcf = vcf.withColumn(name, F.array_max(F.col(name)))
    return vcf
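A hypothetical call with invented VCF-like input, to show what the function does: FORMAT is flattened, de-duplicated, sorted, and joined into a "GT:"-prefixed string, while the other listed columns keep only their array_max value.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

vcf = spark.createDataFrame(
    [(["rs1"], ["A"], ["T"], ["DP=10"], [["DP", "GQ"], ["GQ", "AD"]])],
    ["ID", "REF", "ALT", "INFO", "FORMAT"])

# FORMAT becomes "GT:AD:DP:GQ"; ID/REF/ALT/INFO keep their maximum element.
column_revalue(vcf).show(truncate=False)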
Example 3
def unique_values(df, cols):
    from functools import reduce
    
    counts = df.groupBy(
        F.lit(True).alias("drop_me")
    ).agg(
        *[F.array_sort(F.collect_set(F.col(c))).alias(c) for c in cols]
    ).drop("drop_me").cache()
    
    result = reduce(
        lambda l, r: l.unionAll(r),
        [counts.select(F.lit(c).alias("field"), F.col(c).alias("unique_vals"))
         for c in counts.columns]).collect()
    counts.unpersist()
    
    return {r[0]: r[1] for r in result}
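A small sketch of how unique_values might be called; the sample frame is an assumption. It returns one sorted list of distinct values per requested column.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [("a", "x"), ("b", "y"), ("a", "y")], ["letter", "code"])

print(unique_values(df, ["letter", "code"]))
# {'letter': ['a', 'b'], 'code': ['x', 'y']}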
Example 4
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import numpy as np

df.cache().count()

pairs = df.groupBy(["tract", "patch"]).count()
p = pairs.groupBy("tract").count().withColumnRenamed("count", "npatch")
p.count()
good = p.filter(p['npatch'] == 49).sort("tract")
good.count()
good.show(200)

g = good.select("tract").collect()
a = np.array([gg[0] for gg in g])


bad = p.filter(p['npatch'] != 49).sort("tract")
bad.show(200)
pairs.join(bad, "tract").groupBy("tract") \
    .agg(F.count("patch"), F.array_sort(F.collect_list("patch"))) \
    .sort("tract").show(200, truncate=False)

# geometry
geo = df.groupBy("tract").agg(F.avg("ra"), F.min("ra"), F.max("ra"), F.min("dec"), F.max("dec"), F.avg("dec"))

# join with npatch
dfj = geo.join(p, "tract")

# check bad tracts
pdf = dfj.toPandas()
plt.plot(pdf["avg(ra)"], pdf["avg(dec)"], 'o')
bad_mask = (pdf.npatch != 49)
plt.plot(pdf[bad_mask]["avg(ra)"], pdf[bad_mask]["avg(dec)"], 'ro')
Example 5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--lang",
                        "-l",
                        default="enwiki",
                        type=str,
                        help="language to parse (en or enwiki)")

    parser.add_argument(
        "--start",
        "-t1",
        default=None,
        type=str,
        help=
        "start day to parse [inclusive] (YYYY-MM-DD-HH); default: previous day - 7days"
    )

    parser.add_argument(
        "--end",
        "-t2",
        default=None,
        type=str,
        help=
        "end day to parse [exclusive] (YYYY-MM-DD-HH); default: current day")

    args = parser.parse_args()
    lang = args.lang.replace('wiki', '')
    wiki_db = lang + 'wiki'

    t1 = args.start
    t2 = args.end
    if t1 is not None and t2 is not None:
        try:
            date_start = datetime.datetime.strptime(t1, '%Y-%m-%d-%H')
            date_end = datetime.datetime.strptime(t2, '%Y-%m-%d-%H')
        except ValueError:
            print('Provide dates in the format YYYY-MM-DD-HH')
            raise
    else:
        date_start = datetime.date.today() - datetime.timedelta(days=8)
        date_end = datetime.date.today()

    date_start_str = date_start.strftime('%Y-%m-%d-%H')
    date_end_str = date_end.strftime('%Y-%m-%d-%H')

    #### other parameters
    ## filter out actors with more than 500 pageviews per day;
    ## the aim is to remove automated traffic that is not tagged as spider
    n_p_max = 500  ## maximum number of pageviews/user/day
    n_p_min = 1  ## minimum number of pageviews/user/day

    ## filtering sessions
    dt = 3600  ## cutoff for splitting sessions (inter-event time between 2 pageviews)
    nlen_min = 2  ## min length of session
    nlen_max = 30  ## max length of session

    ## sessions will be saved locally in filename_save
    path_save = os.path.abspath('../output/sessions/')
    # filename_save = '%s.reading-sessions-%s--%s'%(lang,date_start_str,date_end_str)
    filename_save = 'reading-sessions-actors_%s_%s_%s' % (
        wiki_db, date_start_str, date_end_str)

    ## tmp-directory for data on hive (will be deleted)
    base_dir_hdfs = '/tmp/reader-embedding/sessions'

    ### start
    spark = SparkSession.builder\
        .master('yarn')\
        .appName('reading-sessions')\
        .enableHiveSupport()\
        .getOrCreate()

    ########
    ## query
    ################################################
    ## time-window
    ts_start = calendar.timegm(date_start.timetuple())
    ts_end = calendar.timegm(date_end.timetuple())
    row_timestamp = F.unix_timestamp(
        F.concat(F.col('year'), F.lit('-'), F.col('month'), F.lit('-'),
                 F.col('day'), F.lit(' '), F.col('hour'), F.lit(':00:00')))

    ## window for counting pageviews per actor per day
    w_p = Window.partitionBy(F.col('actor_signature_per_project_family'),
                             F.col('year'), F.col('month'), F.col('day'))

    ### actor table (filtered webrequests)
    ## https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Traffic/Pageview_actor
    df_actor = (
        spark.read.table('wmf.pageview_actor').where(
            row_timestamp >= ts_start).where(row_timestamp < ts_end).where(
                F.col('is_pageview') == True)
        ## agent-type user to filter spiders
        ## https://meta.wikimedia.org/wiki/Research:Page_view/Tags#Spider
        .where(F.col('agent_type') == "user")
        ## access method is desktop / mobile web / mobile app; filter out the mobile app (following Isaac's filtering)
        .where(F.col('access_method') != "mobile app")
        ## only wikis
        .where(F.col('normalized_host.project_family') == 'wikipedia')
        ## only namespace 0
        .where(F.col('namespace_id') == 0).withColumn(
            'wiki_db', F.concat(F.col('normalized_host.project'),
                                F.lit('wiki'))))
    ## filter only specific wiki (or all if wiki_db=='wikidata')
    if wiki_db != 'wikidata':
        df_actor = df_actor.where(F.col('wiki_db') == wiki_db)

    ## checkpoint for inspecting table
    # df_actor.limit(10).write.mode('overwrite').parquet('/user/mgerlach/sessions/test.parquet')

    # filter maximum and minimum pageviews per user
    # n_p is the number of pageviews per actor per day (across projects)
    df_actor = (df_actor.withColumn(
        'n_p',
        F.sum(F.lit(1)).over(w_p)).where(F.col('n_p') >= n_p_min).where(
            F.col('n_p') <= n_p_max))

    ## join the wikidata-item to each pageview
    ## we keep only pageviews for which we have a corresponding wikidata-item id

    ## table with mapping wikidata-ids to page-ids
    ## partition wikidb and page-id ordered by snapshot
    w_wd = Window.partitionBy(F.col('wiki_db'), F.col('page_id')).orderBy(
        F.col('snapshot').desc())
    df_wd = (
        spark.read.table('wmf.wikidata_item_page_link')
        ## snapshot: this is a partition!
        .where(
            F.col('snapshot') >=
            '2020-07-01')  ## resolve issues with non-matching wikidata-items
        ## only wikis (enwiki, ... not: wikisource)
        .where(F.col('wiki_db').endswith('wiki')))
    ## filter only specific wiki (or all if wiki_db=='wikidata')
    if wiki_db != 'wikidata':
        df_wd = df_wd.where(F.col('wiki_db') == wiki_db)
    ## get the most recent wikidata-item for each pid+wikidb
    df_wd = (df_wd.withColumn(
        'item_id_latest',
        F.first(F.col('item_id')).over(w_wd)).select(
            'wiki_db', 'page_id',
            F.col('item_id_latest').alias('item_id')).drop_duplicates())
    df_actor_wd = (df_actor.join(df_wd, on=['page_id', 'wiki_db'],
                                 how='inner'))

    ## aggregate all pageviews with same actor-signature across wikis to get sessions
    df_actor_wd_agg = (
        df_actor_wd.groupby('actor_signature_per_project_family').agg(
            # F.first(F.col('access_method')).alias('access_method'), ## this could change along a session
            # F.first(F.col('geocoded_data')).alias('geocoded_data'),
            #              F.first(F.col('n_p_by_user')).alias('session_length'),
            F.array_sort(
                F.collect_list(
                    F.struct(
                        F.col('ts'),
                        F.col('page_id'),
                        F.col('pageview_info.page_title').alias('page_title'),
                        F.col('wiki_db'),
                        F.col('item_id').alias('qid'),
                    ))).alias('session')))

    ## apply filter to the sessions
    try:
        os.mkdir(path_save)
    except FileExistsError:
        pass
    PATH_TMP = os.path.join(path_save, 'tmp')
    try:
        os.mkdir(PATH_TMP)
    except FileExistsError:
        pass

    ## hdfs-storing, some temporary files which will be deleted later
    output_hdfs_dir = os.path.join(base_dir_hdfs, filename_save)
    os.system('hadoop fs -rm -r %s' % output_hdfs_dir)
    ## local storing
    base_dir_local = path_save
    output_local_dir_tmp = os.path.join(base_dir_local, 'tmp', filename_save)
    output_local_file = os.path.join(base_dir_local, filename_save)

    ## load data
    # requests = spark.read.load(filename).rdd.map(lambda x: x['session'])
    requests = df_actor_wd_agg.rdd.map(lambda x: x['session'])
    ## keep only pageviews from a language
    requests = requests.map(lambda rs: [r for r in rs if r['page_id'] is not None])
    to_str = lambda x: ' '.join([str(e['page_id']) for e in x])

    (requests
     .map(parse_requests)
     .filter(filter_blacklist_qid)             ## remove main_page
     .filter(lambda x: len(x) >= nlen_min)     ## only sessions with at least length nlen_min
     .map(filter_unique_articles)              ## remove repeated occurrences of the same article in a session
     .filter(lambda x: len(x) >= nlen_min)     ## only sessions with at least length nlen_min
     .flatMap(lambda x: sessionize(x, dt=dt))  ## break sessions if the inter-event time is too large
     .filter(lambda x: len(x) >= nlen_min)     ## only sessions with at least length nlen_min
     .filter(lambda x: len(x) <= nlen_max)     ## only sessions with at most length nlen_max
     .map(to_str)                              ## concatenate each session into a single string
     ## write to hdfs
     .saveAsTextFile(
         output_hdfs_dir,
         compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec"))

    ## copy to local (set of tmp-dirs)
    os.system('hadoop fs -copyToLocal %s %s' %
              (output_hdfs_dir, output_local_dir_tmp))
    ## concatenate and unzip into single file
    os.system('cat %s/* | gunzip > %s' %
              (output_local_dir_tmp, output_local_file))
    # ## remove set of tmp-dirs
    os.system('rm -rf %s' % output_local_dir_tmp)
    # ## remove hadoop data
    os.system('hadoop fs -rm -r %s' % output_hdfs_dir)

    print('Path to reading sessions: %s' % filename_save)
    return filename_save
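The helpers parse_requests, filter_blacklist_qid, filter_unique_articles, and sessionize are defined elsewhere in this project and are not shown here. As a rough sketch only, the inter-event split performed by sessionize could look like the following, assuming (this is an assumption) that parse_requests leaves each pageview as a dict-like record whose 'ts' field is a unix timestamp in seconds.

def sessionize(pageviews, dt=3600):
    """Hypothetical sketch: split a time-ordered pageview list at gaps larger than dt seconds."""
    sessions, current = [], []
    for pv in pageviews:
        # start a new session whenever the inter-event time exceeds dt
        if current and pv['ts'] - current[-1]['ts'] > dt:
            sessions.append(current)
            current = []
        current.append(pv)
    if current:
        sessions.append(current)
    return sessions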
Example 6
    def eq(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
        if isinstance(right, (list, tuple)):
            from pyspark.pandas.series import first_series, scol_for
            from pyspark.pandas.frame import DataFrame
            from pyspark.pandas.internal import NATURAL_ORDER_COLUMN_NAME, InternalField

            len_right = len(right)
            if len(left) != len_right:
                raise ValueError("Lengths must be equal")

            sdf = left._internal.spark_frame
            structed_scol = F.struct(
                sdf[NATURAL_ORDER_COLUMN_NAME],
                *left._internal.index_spark_columns,
                left.spark.column,
            )
            # The size of the list is expected to be small.
            collected_structed_scol = F.collect_list(structed_scol)
            # Sort the array by NATURAL_ORDER_COLUMN so that we can guarantee the order.
            collected_structed_scol = F.array_sort(collected_structed_scol)
            right_values_scol = F.array(*(F.lit(x) for x in right))
            index_scol_names = left._internal.index_spark_column_names
            scol_name = left._internal.spark_column_name_for(
                left._internal.column_labels[0])
            # Compare the values of left and right by using zip_with function.
            cond = F.zip_with(
                collected_structed_scol,
                right_values_scol,
                lambda x, y: F.struct(
                    *[
                        x[index_scol_name].alias(index_scol_name)
                        for index_scol_name in index_scol_names
                    ],
                    F.when(x[scol_name].isNull() | y.isNull(), False)
                    .otherwise(x[scol_name] == y).alias(scol_name),
                ),
            ).alias(scol_name)
            # 1. `sdf_new` here looks like the below (the first field of each set is Index):
            # +----------------------------------------------------------+
            # |0                                                         |
            # +----------------------------------------------------------+
            # |[{0, false}, {1, true}, {2, false}, {3, true}, {4, false}]|
            # +----------------------------------------------------------+
            sdf_new = sdf.select(cond)
            # 2. `sdf_new` after the explode looks like the below:
            # +----------+
            # |       col|
            # +----------+
            # |{0, false}|
            # | {1, true}|
            # |{2, false}|
            # | {3, true}|
            # |{4, false}|
            # +----------+
            sdf_new = sdf_new.select(F.explode(scol_name))
            # 3. Here, the final `sdf_new` looks like the below:
            # +-----------------+-----+
            # |__index_level_0__|    0|
            # +-----------------+-----+
            # |                0|false|
            # |                1| true|
            # |                2|false|
            # |                3| true|
            # |                4|false|
            # +-----------------+-----+
            sdf_new = sdf_new.select("col.*")

            index_spark_columns = [
                scol_for(sdf_new, index_scol_name)
                for index_scol_name in index_scol_names
            ]
            data_spark_columns = [scol_for(sdf_new, scol_name)]

            internal = left._internal.copy(
                spark_frame=sdf_new,
                index_spark_columns=index_spark_columns,
                data_spark_columns=data_spark_columns,
                index_fields=[
                    InternalField.from_struct_field(index_field)
                    for index_field in sdf_new.select(
                        index_spark_columns).schema.fields
                ],
                data_fields=[
                    InternalField.from_struct_field(
                        sdf_new.select(data_spark_columns).schema.fields[0])
                ],
            )
            return first_series(DataFrame(internal))
        else:
            from pyspark.pandas.base import column_op

            return column_op(Column.__eq__)(left, right)
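For context, this is the code path behind comparing a pandas-on-Spark Series with a plain Python list of the same length. A minimal usage sketch with invented values, assuming a recent pyspark release where list comparison is supported:

import pyspark.pandas as ps

psser = ps.Series([1, 2, 3])
print((psser == [1, 9, 3]).to_list())
# [True, False, True]; per the F.when above, null entries would compare as False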
Example 7
def main():
    """Main function"""

    # Get args
    args = get_args()

    # Azure credentials
    sas_token = args.sas
    storage_account_name = args.storage
    container_in = args.container_in
    container_out = args.container_out

    azure_accounts = list()
    azure_accounts.append({
        "storage": storage_account_name,
        "sas": sas_token,
        "container": container_in
    })
    azure_accounts.append({
        "storage": storage_account_name,
        "sas": sas_token,
        "container": container_out
    })

    # VM
    cores = args.vm_cores
    ram = args.vm_ram
    shuffle_partitions = args.shuffle_partitions

    # Geohash file path
    geohash_path = args.geohashpath

    # Date, country, prefix
    country = args.country
    date_string = args.date
    prefix = args.prefix

    # Set date variables
    day_time = datetime.strptime(date_string, "%Y-%m-%d")
    year = day_time.year
    month = day_time.month
    day = day_time.day

    # stop config
    seconds = 60
    accuracy = args.accuracy
    roam_dist = args.roam_dist
    min_stay = args.min_stay
    overlap_hours = args.overlap_hours

    # Path in - path out
    blob_in = f"wasbs://{container_in}@{storage_account_name}.blob.core.windows.net/preprocessed/{country}/"
    path_out = f"stoplocation-v{VERSION}_r{roam_dist}-s{min_stay}-a{accuracy}-h{overlap_hours}/{country}"

    if prefix:
        path_out = f"stoplocation-v{VERSION}_prefix_r{roam_dist}-s{min_stay}-a{accuracy}-h{overlap_hours}/{country}"

    # config spark
    conf = getSparkConfig(cores, ram, shuffle_partitions, azure_accounts)

    # Create spark session
    sc = SparkContext(conf=conf).getOrCreate()
    sqlContext = SQLContext(sc)
    spark = sqlContext.sparkSession

    # Init azure client
    blob_service_client = BlobServiceClient.from_connection_string(
        CONN_STRING.format(storage_account_name, sas_token))

    #  build keys, date is mandatory, prefix opt
    partition_key = "year={}/month={}/day={}".format(year, month, day)
    if prefix:
        partition_key = "year={}/month={}/day={}/prefix={}".format(
            year, month, day, prefix)

    blob_base = "{}/{}".format(path_out, partition_key)

    #
    # check for skip
    # TODO
    #
    skip = False

    print("process " + partition_key + " to " + blob_base)
    start_time = time.time()
    local_dir = LOCAL_PATH + partition_key
    print("write temp to " + local_dir)

    # cleanup local dir if it exists (use a loop: a bare map() is lazy and would never delete anything)
    if os.path.isdir(local_dir):
        for fname in os.listdir(local_dir):
            os.unlink(os.path.join(local_dir, fname))

    # TODO cleanup remote if exists

    # Output schema
    schema = ArrayType(
        StructType([
            #StructField('device_type', IntegerType(), False),
            StructField('serial', IntegerType(), False),
            StructField('latitude', DoubleType(), False),
            StructField('longitude', DoubleType(), False),
            StructField('begin', TimestampType(), False),
            StructField('end', TimestampType(), False),
            StructField('personal_area', BooleanType(), False),
            StructField('distance', DoubleType(), False),
            StructField('geohash6', StringType(), False),
            StructField('after_stop_distance', DoubleType(), False)
        ]))

    spark_get_stop_location = udf(
        lambda z: get_stop_location(z, roam_dist, min_stay), schema)

    # Geohash file
    print("read geohash parquet")
    csv_time = time.time()
    dfs_us_states = spark.read.format("parquet").load(geohash_path)
    # states = [s.STUSPS for s in dfs_us_states.select(
    #     'STUSPS').distinct().collect()]

    dfs_us_states = dfs_us_states.select(
        col('STUSPS').alias('state'),
        col('geohash').alias('geohash5'))
    dfs_us_states = dfs_us_states.drop_duplicates(subset=['geohash5'])

    # Input dataset
    print("read dataset table")
    read_time = time.time()

    # dfs = spark.read.format("parquet").load(blob_in)

    # # apply partition filter
    # dfs_partition = dfs.where(
    #     f"(year = {year} AND month = {month} AND day = {day}  AND prefix = '{prefix}')")

    # read only partition to reduce browse time
    dfs_cur_partition = spark.read.format("parquet").load(
        f"{blob_in}/{partition_key}")

    # lit partition filters as data
    dfs_cur_partition = dfs_cur_partition.withColumn('year', F.lit(year))
    dfs_cur_partition = dfs_cur_partition.withColumn('month', F.lit(month))
    dfs_cur_partition = dfs_cur_partition.withColumn('day', F.lit(day))
    if prefix:
        dfs_cur_partition = dfs_cur_partition.withColumn(
            'prefix', F.lit(prefix))

    # read next day for overlap
    next_day = day_time + timedelta(days=1)
    next_partition_key = "year={}/month={}/day={}".format(
        next_day.year, next_day.month, next_day.day)
    if prefix:
        next_partition_key = "year={}/month={}/day={}/prefix={}".format(
            next_day.year, next_day.month, next_day.day, prefix)

    dfs_next_partition = spark.read.format("parquet").load(
        f"{blob_in}/{next_partition_key}")
    dfs_next_partition = dfs_next_partition.where(
        F.hour("timestamp") <= (overlap_hours - 1))

    # lit partition filters as data
    dfs_next_partition = dfs_next_partition.withColumn('year',
                                                       F.lit(next_day.year))
    dfs_next_partition = dfs_next_partition.withColumn('month',
                                                       F.lit(next_day.month))
    dfs_next_partition = dfs_next_partition.withColumn('day',
                                                       F.lit(next_day.day))
    if prefix:
        dfs_next_partition = dfs_next_partition.withColumn(
            'prefix', F.lit(prefix))

    # union with overlap
    dfs_partition = dfs_cur_partition.unionAll(dfs_next_partition)

    print("process with spark")
    spark_time = time.time()

    # select columns
    dfs_partition = dfs_partition.select(
        'prefix', 'userID', 'timestamp', 'latitude', 'longitude',
        F.when(col('opt1') == 'PERSONAL_AREA', True).otherwise(False).alias('personal_area'),
        'accuracy')

    # keep only data with required accuracy
    dfs_partition = dfs_partition.where((col('accuracy') <= accuracy)
                                        & (col('accuracy') >= 0))

    # stats - enable only for debug!
    # num_inputs = dfs_partition.count()
    # print(f"read {num_inputs} rows from "+partition_key)

    # Lower the granularity to 1-minute windows, keeping the most accurate point per window

    # explicitly convert to timestamp
    #dfs_partition = dfs_partition.withColumn('timestamp', col('timestamp').cast('timestamp'))
    seconds_window = F.unix_timestamp(
        'timestamp') - F.unix_timestamp('timestamp') % seconds
    w = Window().partitionBy('userID', seconds_window).orderBy('accuracy')
    dfs_partition = dfs_partition.withColumn(
        'rn',
        F.row_number().over(w).cast('int')).where(col('rn') == 1).drop('rn')

    # Radians lat/lon
    dfs_partition = dfs_partition.withColumn('latitude',
                                             F.radians('latitude')).withColumn(
                                                 'longitude',
                                                 F.radians('longitude'))

    # Group GPS locations into chunks. A chunk is a run of consecutive points that are no more than roam_dist apart
    w = Window.partitionBy(['prefix', 'userID']).orderBy('timestamp')
    dfs_partition = dfs_partition.withColumn('next_lat',
                                             F.lead('latitude', 1).over(w))
    dfs_partition = dfs_partition.withColumn('next_lon',
                                             F.lead('longitude', 1).over(w))

    # Haversine distance
    dfs_partition = dfs_partition.withColumn(
        'distance_next', EARTH_RADIUS * 2 * F.asin(
            F.sqrt(
                F.pow(F.sin((col('next_lat') - col('latitude')) / 2.0), 2) +
                F.cos('latitude') * F.cos('next_lat') *
                F.pow(F.sin((col('next_lon') - col('longitude')) / 2.0), 2))))
    dfs_partition = dfs_partition.withColumn(
        'distance_prev',
        F.lag('distance_next', default=0).over(w))

    # Chunks
    dfs_partition = dfs_partition.withColumn(
        'chunk',
        F.when(col('distance_prev') > roam_dist, 1).otherwise(0))

    windowval = (Window.partitionBy(
        'prefix',
        'userID').orderBy('timestamp').rangeBetween(Window.unboundedPreceding,
                                                    0))
    dfs_partition = dfs_partition.withColumn(
        'chunk',
        F.sum('chunk').over(windowval).cast('int'))

    # Remove chunks of the next day
    w = Window.partitionBy(['prefix', 'userID', 'chunk'])
    dfs_partition = dfs_partition.withColumn(
        'min_timestamp', F.dayofmonth(F.min('timestamp').over(w)))
    dfs_partition = dfs_partition.where(
        col('min_timestamp') == day).drop('min_timestamp')

    # Get the stops
    result_df = dfs_partition.groupBy('prefix', 'userID', 'chunk').agg(
        F.array_sort(
            F.collect_list(
                F.struct('timestamp', 'latitude', 'longitude', 'distance_prev',
                         'personal_area'))).alias('gpsdata'),
        F.sum('distance_prev').alias('dist_sum'))
    result_df = result_df.withColumn('gpsdata',
                                     spark_get_stop_location('gpsdata'))

    result_df = result_df.select('userID', 'chunk',
                                 F.explode_outer('gpsdata').alias('e'),
                                 'dist_sum')
    result_df = result_df.select(
        'userID', 'chunk',
        col('e.latitude').alias('latitude'),
        col('e.longitude').alias('longitude'),
        col('e.begin').alias('begin'),
        col('e.end').alias('end'),
        col('e.personal_area').alias('personal_area'),
        col('e.geohash6').alias('geohash6'),
        col('e.serial').alias('serial'),
        col('e.distance').alias('stop_distance'),
        col('e.after_stop_distance').alias('after_stop_distance'), 'dist_sum')
    result_df = result_df.fillna(0, subset=['after_stop_distance'])

    # Remove all those stop that start the next day
    result_df = result_df.where((col('begin').isNull())
                                | (F.dayofmonth('begin') != next_day.day))

    result_df = result_df.withColumn(
        'isStop',
        F.when(col('serial').isNotNull(), 1).otherwise(0))

    result_df = result_df.withColumn(
        'dist_sum',
        F.when(col('isStop') == 1,
               col('stop_distance')).otherwise(col('dist_sum')))

    windowval = (Window.partitionBy('userId').orderBy(
        'chunk', 'serial').rowsBetween(Window.currentRow,
                                       Window.unboundedFollowing))
    result_df = result_df.withColumn('isStop_cum',
                                     F.sum('isStop').over(windowval))

    result_df = result_df.groupBy('userId', 'isStop_cum').agg(
        F.first('latitude', ignorenulls=True).alias('latitude'),
        F.first('longitude', ignorenulls=True).alias('longitude'),
        F.first('begin', ignorenulls=True).alias('begin'),
        F.first('end', ignorenulls=True).alias('end'),
        F.first('personal_area', ignorenulls=True).alias('personal_area'),
        F.first('geohash6', ignorenulls=True).alias('geohash6'),
        F.sum('dist_sum').alias('prev_travelled_distance'),
        F.sum('after_stop_distance').alias('after_stop_distance'))

    # compute next distance, which is null if it's the last
    windowval = Window.partitionBy('userId').orderBy(F.desc('isStop_cum'))
    result_df = result_df.withColumn(
        'next_travelled_distance',
        F.lead('prev_travelled_distance').over(windowval))
    result_df = result_df.withColumn(
        'next_travelled_distance',
        F.when((col('next_travelled_distance').isNull()) &
               (col('after_stop_distance') > 0),
               col('after_stop_distance')).otherwise(
                   col('next_travelled_distance')))

    # Drop nulls
    result_df = result_df.dropna(subset=['latitude']).drop('isStop_cum')

    # Transform latitude and longitude back to degrees
    result_df = result_df.withColumn('latitude', F.degrees('latitude'))
    result_df = result_df.withColumn('longitude', F.degrees('longitude'))

    # US states
    result_df = result_df.withColumn(
        "geohash5", F.expr("substring(geohash6, 1, length(geohash6)-1)"))
    result_df = result_df.join(F.broadcast(dfs_us_states),
                               on="geohash5",
                               how="inner").drop('geohash5')

    # lit partition data - enable only if added to partitionBy
    # result_df = result_df.withColumn('year', F.lit(year))
    # result_df = result_df.withColumn('month', F.lit(month))
    # result_df = result_df.withColumn('day', F.lit(day))

    # write
    out_partitions = len(US_STATES)
    result_df.repartition(out_partitions, "state").write.partitionBy(
        "state").format('parquet').mode("overwrite").save(local_dir + "/")

    # stats - enable only for debug!
    # num_records = result_df.count()
    # print(f"written {num_records} rows to "+local_dir)

    # if num_records == 0:
    #     raise Exception("Zero rows output")

    print("upload local data to azure")
    upload_time = time.time()

    # upload parts over states
    for state in US_STATES:
        print(f"upload files for {state}")
        state_dir = local_dir + "/state=" + state
        state_key = f"{partition_key}/state={state}/"

        if (os.path.isdir(state_dir)):
            files = [
                filename for filename in os.listdir(state_dir)
                if filename.startswith("part-")
            ]

            if len(files) > 0:

                for file_local in files:
                    file_path = state_dir + "/" + file_local
                    part_num = int(file_local.split('-')[1])
                    part_key = '{:05d}'.format(part_num)
                    # fix name as static hash to be reproducible
                    filename_hash = hashlib.sha1(
                        str.encode(state_key + part_key)).hexdigest()

                    blob_key = "{}/state={}/part-{}-{}.snappy.parquet".format(
                        blob_base, state, part_key, filename_hash)

                    print("upload " + file_path + " to " + container_out +
                          ":" + blob_key)

                    blob_client = blob_service_client.get_blob_client(
                        container_out, blob_key)

                    with open(file_path, "rb") as data:
                        blob_client.upload_blob(data, overwrite=True)

                    # cleanup
                    os.remove(file_path)
            else:
                print(f"no files to upload for {state}")

        else:
            print(f"missing partition for {state}")

    print("--- {} seconds elapsed ---".format(int(time.time() - start_time)))
    print()
    stop_time = time.time()
    spark.stop()

    end_time = time.time()
    print("Done in {} seconds (csv:{} read:{} spark:{} upload:{} stop:{})".
          format(int(end_time - start_time), int(read_time - csv_time),
                 int(spark_time - read_time), int(upload_time - spark_time),
                 int(stop_time - upload_time), int(end_time - stop_time)))
    print('Done.')
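For reference, the chunking step above keys off the haversine great-circle distance between consecutive points (coordinates already converted to radians with F.radians). A plain-Python sketch of the same formula; the Earth-radius constant here is an assumption and must be in the same unit as roam_dist:

import math

def haversine(lat1, lon1, lat2, lon2, earth_radius=6371000.0):
    """Great-circle distance between two points given in radians (earth_radius assumed in metres)."""
    a = (math.sin((lat2 - lat1) / 2.0) ** 2
         + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2.0) ** 2)
    return earth_radius * 2 * math.asin(math.sqrt(a))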
Example 8
# Extract event sequences and ground truth

# Normalize each event: make timestamps relative to the first event, shifted by the
# mean inter-event gap and divided by args.time_divisor; cast the event type to float.
udf_normalize = F.udf(
    lambda x: [[
        (x[i][0] - x[0][0] + (x[-1][0] - x[0][0]) /
         (len(x) - 1)) / args.time_divisor,
        float(x[i][1]),
    ] for i in range(len(x))],
    psql.types.ArrayType(psql.types.ArrayType(psql.types.FloatType())),
)

with Timer("extract event sequences"):
    event_seqs = (df_filtered.withColumn(
        "phrase", F.explode("phrases")).withColumn(
            "event", F.array("ts", "type")).groupby("phrase").agg(
                F.array_sort(
                    F.collect_set("event")).alias("event_seq")).filter(
                        F.size("event_seq").between(
                            args.min_seq_length,
                            args.max_seq_length)).withColumn(
                                "event_seq",
                                udf_normalize("event_seq"))).persist()

event_seqs.limit(5).toPandas()

# seq_lengths = (
#     event_seqs.select("phrase", F.size("event_seq").alias("size"))
#     .groupby("size")
#     .count()
#     .sort("size")
# )
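A standalone sketch (with invented sample data) of the array_sort plus collect_set pattern used above: collect_set gives no ordering guarantee, so sorting the [ts, type] arrays lexicographically puts each phrase's events back in time order.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

events = spark.createDataFrame(
    [("p1", 3.0, 1.0), ("p1", 1.0, 2.0), ("p1", 2.0, 1.0)],
    ["phrase", "ts", "type"])

(events
 .withColumn("event", F.array("ts", "type"))
 .groupby("phrase")
 .agg(F.array_sort(F.collect_set("event")).alias("event_seq"))
 .show(truncate=False))
# event_seq -> [[1.0, 2.0], [2.0, 1.0], [3.0, 1.0]]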