def test_mixed_sql_and_udf(self):
        df = self.data
        w = self.unbounded_window
        ow = self.ordered_window
        max_udf = self.pandas_agg_max_udf
        min_udf = self.pandas_agg_min_udf

        # Test using only window udfs in the expression
        result1 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min_udf(df['v']).over(w))
        expected1 = df.withColumn('v_diff', max(df['v']).over(w) - min(df['v']).over(w))

        # Test mixing sql window function and window udf in the same expression
        result2 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min(df['v']).over(w))
        expected2 = expected1

        # Test chaining sql aggregate function and udf
        result3 = df.withColumn('max_v', max_udf(df['v']).over(w)) \
                    .withColumn('min_v', min(df['v']).over(w)) \
                    .withColumn('v_diff', col('max_v') - col('min_v')) \
                    .drop('max_v', 'min_v')
        expected3 = expected1

        # Test mixing sql window function and udf
        result4 = df.withColumn('max_v', max_udf(df['v']).over(w)) \
                    .withColumn('rank', rank().over(ow))
        expected4 = df.withColumn('max_v', max(df['v']).over(w)) \
                      .withColumn('rank', rank().over(ow))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
        self.assertPandasEqual(expected2.toPandas(), result2.toPandas())
        self.assertPandasEqual(expected3.toPandas(), result3.toPandas())
        self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
Example #2
    def test_window_functions(self):
        df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
        w = Window.partitionBy("value").orderBy("key")
        from pyspark.sql import functions as F

        sel = df.select(
            df.value,
            df.key,
            F.max("key").over(w.rowsBetween(0, 1)),
            F.min("key").over(w.rowsBetween(0, 1)),
            F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
            F.rowNumber().over(w),
            F.rank().over(w),
            F.denseRank().over(w),
            F.ntile(2).over(w),
        )
        rs = sorted(sel.collect())
        expected = [
            ("1", 1, 1, 1, 1, 1, 1, 1, 1),
            ("2", 1, 1, 1, 3, 1, 1, 1, 1),
            ("2", 1, 2, 1, 3, 2, 1, 1, 1),
            ("2", 2, 2, 2, 3, 3, 3, 2, 2),
        ]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[: len(r)])
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file"""
    print("Process MEDLINE file to parquet")
    # remove any previously generated parquet output if it still exists
    existing = glob(os.path.join(save_dir, 'medline_*.parquet'))
    if existing:
        subprocess.call(['rm', '-rf'] + existing)

    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(os.path.join(download_dir, 'medline*.xml.gz')), numSlices=1000)
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             mode='overwrite')

    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      mode='overwrite')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           mode='overwrite')
 def getValueFieldValueLists(self, handlerId, keyFields, valueFields):
     df = self.entity.groupBy(keyFields)
     agg = self.options.get("aggregation",self.getDefaultAggregation(handlerId))
     maxRows = int(self.options.get("rowCount","100"))
     numRows = min(maxRows,df.count())
     valueLists = []
     for valueField in valueFields:
         valueDf = None
         if agg == "SUM":
             valueDf = df.agg(F.sum(valueField).alias("agg"))
         elif agg == "AVG":
             valueDf = df.agg(F.avg(valueField).alias("agg"))
         elif agg == "MIN":
             valueDf = df.agg(F.min(valueField).alias("agg"))
         elif agg == "MAX":
             valueDf = df.agg(F.max(valueField).alias("agg"))
         else:
             valueDf = df.agg(F.count(valueField).alias("agg"))
         for keyField in keyFields:
             valueDf = valueDf.sort(F.col(keyField).asc())
         valueDf = valueDf.dropna()
         rows = valueDf.select("agg").take(numRows)
         valueList = []
         for row in rows:
             valueList.append(row["agg"])
         valueLists.append(valueList)
     return valueLists   
Example #5
def reduce_to_ohlc(time, rdd):
    row_rdd = rdd.map(lambda row: row.split(',')) \
                 .filter(lambda row: len(row) == 3) \
                 .map(lambda row: Row(
                       symbol=row[0],
                       tx_time=datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S.%f'),
                       price=float(row[1])
                 ))
    sql_context = get_sql_context_instance(rdd.context)
    data = sql_context.createDataFrame(row_rdd)
    data.cache()
    data.write.format('org.apache.spark.sql.cassandra') \
            .options(table='transactions2', keyspace='stock', cluster='Test Cluster') \
            .mode('append') \
            .save()

    ohlc = data.select('symbol', truncate_min(data.tx_time).alias('batch_time'), 'price', 'tx_time') \
                .orderBy('tx_time') \
                .groupBy('symbol', 'batch_time') \
                .agg(
                   F.first(data.price).alias('open'),
                   F.max(data.price).alias('high'),
                   F.min(data.price).alias('low'),
                   F.last(data.price).alias('close'),
                   F.first(data.tx_time).alias('open_time'),
                   F.last(data.tx_time).alias('close_time')
                )

    existing_ohlc = sql_context.read.format('org.apache.spark.sql.cassandra') \
            .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \
            .load() \
            .select('symbol', 'batch_time', 'open', 'open_time', 'high', 'low', 'close', 'close_time')

    merged_ohlc = ohlc.join(existing_ohlc,
                             (ohlc.symbol == existing_ohlc.symbol) &
                             (ohlc.batch_time == existing_ohlc.batch_time),
                             'left'
                           )

    merged_ohlc = merged_ohlc.select(
        ohlc.symbol.alias('symbol'),
        ohlc.batch_time.alias('batch_time'),
        F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open).otherwise(ohlc.open).alias('open'),
        F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open_time).otherwise(ohlc.open_time).alias('open_time'),
        F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close).otherwise(ohlc.close).alias('close'),
        F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close_time).otherwise(ohlc.close_time).alias('close_time'),
        F.when(existing_ohlc.low < ohlc.low, existing_ohlc.low).otherwise(ohlc.low).alias('low'),
        F.when(existing_ohlc.high > ohlc.high, existing_ohlc.high).otherwise(ohlc.high).alias('high')
    )
    merged_ohlc.write.format('org.apache.spark.sql.cassandra') \
                .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \
                .mode('append') \
                .save()
    def test_multiple_udfs(self):
        df = self.data
        w = self.unbounded_window

        result1 = df.withColumn('mean_v', self.pandas_agg_mean_udf(df['v']).over(w)) \
                    .withColumn('max_v', self.pandas_agg_max_udf(df['v']).over(w)) \
                    .withColumn('min_w', self.pandas_agg_min_udf(df['w']).over(w))

        expected1 = df.withColumn('mean_v', mean(df['v']).over(w)) \
                      .withColumn('max_v', max(df['v']).over(w)) \
                      .withColumn('min_w', min(df['w']).over(w))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
def test_timestamp_splitter(test_specs, spark_dataset):
    dfs_rating = spark_dataset.withColumn(DEFAULT_TIMESTAMP_COL, col(DEFAULT_TIMESTAMP_COL).cast("float"))

    splits = spark_timestamp_split(
        dfs_rating, ratio=test_specs["ratio"], col_timestamp=DEFAULT_TIMESTAMP_COL
    )

    assert splits[0].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratio"], test_specs["tolerance"]
    )
    assert splits[1].count() / test_specs["number_of_rows"] == pytest.approx(
        1 - test_specs["ratio"], test_specs["tolerance"]
    )

    max_split0 = splits[0].agg(F.max(DEFAULT_TIMESTAMP_COL)).first()[0]
    min_split1 = splits[1].agg(F.min(DEFAULT_TIMESTAMP_COL)).first()[0]
    assert(max_split0 <= min_split1)

    # Test multi split
    splits = spark_timestamp_split(dfs_rating, ratio=test_specs["ratios"])

    assert splits[0].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][0], test_specs["tolerance"]
    )
    assert splits[1].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][1], test_specs["tolerance"]
    )
    assert splits[2].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][2], test_specs["tolerance"]
    )

    max_split0 = splits[0].agg(F.max(DEFAULT_TIMESTAMP_COL)).first()[0]
    min_split1 = splits[1].agg(F.min(DEFAULT_TIMESTAMP_COL)).first()[0]
    assert(max_split0 <= min_split1)

    max_split1 = splits[1].agg(F.max(DEFAULT_TIMESTAMP_COL)).first()[0]
    min_split2 = splits[2].agg(F.min(DEFAULT_TIMESTAMP_COL)).first()[0]
    assert(max_split1 <= min_split2)
Example #8
    def handleUIOptions(self, displayColName):
        agg = self.options.get("aggregation")
        valFields = self.options.get("valueFields")

        if agg == 'COUNT':
            return self.entity.groupBy(displayColName).agg(F.count(displayColName).alias("agg")).toPandas()
        elif agg == 'SUM':
            return self.entity.groupBy(displayColName).agg(F.sum(valFields).alias("agg")).toPandas()
        elif agg == 'AVG':
            return self.entity.groupBy(displayColName).agg(F.avg(valFields).alias("agg")).toPandas()
        elif agg == 'MIN':
            return self.entity.groupBy(displayColName).agg(F.min(valFields).alias("agg")).toPandas()
        elif agg == 'MAX':
            return self.entity.groupBy(displayColName).agg(F.max(valFields).alias("agg")).toPandas()
        elif agg == 'MEAN':
            return self.entity.groupBy(displayColName).agg(F.mean(valFields).alias("agg")).toPandas()
        else:
            return self.entity.groupBy(displayColName).agg(F.count(displayColName).alias("agg")).toPandas()
Example #9
def do_something_only_once():
    # the command I use to run this script:
    #~/spark-1.6.1/bin/spark-submit --packages=com.databricks:spark-avro_2.10:2.0.1,com.databricks:spark-csv_2.10:1.4.0 server.py
    global topdis, meta, dic, towo, cluto, doctopdat, maxdate, mindate, lda
    ## Loading of data
    sc = SparkContext(appName='Simple App') #"local"
    sqlContext = SQLContext(sc)
    # Load metadata avro
    reader = sqlContext.read.format('com.databricks.spark.avro')
    meta = reader.load('data/spark_metadata.avro')
    # # Loading topic distributions
    topdisFile = 'data/spark_output.tuples'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    topdis = csvLoader.options(delimiter=',',header='false', inferschema='true').load(topdisFile)
    strip_first_col_int = udf(lambda row: int(row[1:]), IntegerType())
    topdis = topdis.withColumn('C0',strip_first_col_int(topdis['C0']))
    strip_first_col_float = udf(lambda row: float(row[1:]), FloatType())
    topdis = topdis.withColumn('C1',strip_first_col_float(topdis['C1']))
    strip_last_col = udf(lambda row: float(row[:-2]), FloatType())
    topdis = topdis.withColumn('C20',strip_last_col(topdis['C20']))
    # # Load dictionary CSV
    dicFile = 'data/spark_dic.csv'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    dic = csvLoader.options(delimiter='\t', header='false', inferschema='true').load(dicFile)
    dic = dic.select(dic['C0'].alias('id'), dic['C1'].alias('word'), dic['C2'].alias('count'))
    ldaFile = 'data/spark_lda.csv'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    lda = csvLoader.options(delimiter='\t', header='false', inferschema='true').load(ldaFile)
    lda = lda.select(rowNumber().alias('id'), *lda.columns)
    lda = lda.join(dic, dic['id'] == lda['id'], 'inner').cache()
    # dic = dic.select(dic['C0'].alias('id'), dic['C1'].alias('word'), dic['C2'].alias('count'))
    # # # Load clustertopics CSV
    # clutoFile = 'enron_small_clustertopics.csv'
    # csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    # cluto = csvLoader.options(delimiter=',', header='false', inferschema='true').load(clutoFile)
    # # # Load topicswords CSV
    # towoFile = 'enron_small_lda_transposed.csv'
    # csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    # towo = csvLoader.options(delimiter=',', header='false', inferschema='true').load(towoFile)
    # # Merge topdis which has document id and with metadata, based on document id
    metasmall = meta.select('id',unix_timestamp(meta['date'],"yyyy-MM-dd'T'HH:mm:ssX").alias("timestamp"))
    doctopdat = topdis.join(metasmall, metasmall.id == topdis.C0,'inner').cache()
    maxdate = doctopdat.select(max('timestamp').alias('maxtimestamp')).collect()[0]['maxtimestamp']
    mindate = doctopdat.select(min('timestamp').alias('mintimestamp')).collect()[0]['mintimestamp']
Example #10
    def test_bounded_mixed(self):
        from pyspark.sql.functions import mean, max

        df = self.data
        w1 = self.sliding_row_window
        w2 = self.unbounded_window

        mean_udf = self.pandas_agg_mean_udf
        max_udf = self.pandas_agg_max_udf

        result1 = df.withColumn('mean_v', mean_udf(df['v']).over(w1)) \
            .withColumn('max_v', max_udf(df['v']).over(w2)) \
            .withColumn('mean_unbounded_v', mean_udf(df['v']).over(w2))

        expected1 = df.withColumn('mean_v', mean(df['v']).over(w1)) \
            .withColumn('max_v', max(df['v']).over(w2)) \
            .withColumn('mean_unbounded_v', mean(df['v']).over(w2))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
def _if_later(data1, data2):
    """Helper function to test if records in data1 are earlier than that in data2.
    Returns:
        bool: True or False indicating if data1 is earlier than data2.
    """
    x = (data1.select(DEFAULT_USER_COL, DEFAULT_TIMESTAMP_COL)
         .groupBy(DEFAULT_USER_COL)
         .agg(F.max(DEFAULT_TIMESTAMP_COL).cast('long').alias('max'))
         .collect())
    max_times = {row[DEFAULT_USER_COL]: row['max'] for row in x}

    y = (data2.select(DEFAULT_USER_COL, DEFAULT_TIMESTAMP_COL)
         .groupBy(DEFAULT_USER_COL)
         .agg(F.min(DEFAULT_TIMESTAMP_COL).cast('long').alias('min'))
         .collect())
    min_times = {row[DEFAULT_USER_COL]: row['min'] for row in y}

    result = True
    for user, max_time in max_times.items():
        result = result and min_times[user] >= max_time

    return result
Example #12
    def test_bounded_simple(self):
        from pyspark.sql.functions import mean, max, min, count

        df = self.data
        w1 = self.sliding_row_window
        w2 = self.shrinking_range_window

        plus_one = self.python_plus_one
        count_udf = self.pandas_agg_count_udf
        mean_udf = self.pandas_agg_mean_udf
        max_udf = self.pandas_agg_max_udf
        min_udf = self.pandas_agg_min_udf

        result1 = df.withColumn('mean_v', mean_udf(plus_one(df['v'])).over(w1)) \
            .withColumn('count_v', count_udf(df['v']).over(w2)) \
            .withColumn('max_v',  max_udf(df['v']).over(w2)) \
            .withColumn('min_v', min_udf(df['v']).over(w1))

        expected1 = df.withColumn('mean_v', mean(plus_one(df['v'])).over(w1)) \
            .withColumn('count_v', count(df['v']).over(w2)) \
            .withColumn('max_v', max(df['v']).over(w2)) \
            .withColumn('min_v', min(df['v']).over(w1))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
# assumes a SparkContext `sc` is already available (e.g. from the pyspark shell)
from pyspark.sql import SQLContext, Row
from pyspark.sql import functions

sqlCtx = SQLContext(sc)

lines = sc.parallelize(["m1,d1,1", "m1,d2,2", "m2,d1,1", "m2,d2,2"])

record = lines.map(lambda line: line.split(",")).map(
    lambda columns: Row(machine=columns[0], domain=columns[1], request=columns[2]))

recordSchema = sqlCtx.createDataFrame(record)

recordSchema.groupBy().agg({"*": "count"}).show()

recordSchema.groupBy("machine", recordSchema["domain"]).agg(
    {"domain": "max", "request": "min"}).show()

recordSchema.groupBy("machine", recordSchema.domain).agg(functions.count("*"), functions.max(
    recordSchema.request), functions.min(recordSchema["request"]), functions.sum(recordSchema["request"]), functions.avg(recordSchema["request"])).show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int")).groupBy("machine").count().show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").max("request").show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").min("request").show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").sum("request").show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").avg("request").show()
Example #14
from pyspark.shell import sqlContext
from pyspark.sql.functions import rand, randn
from pyspark.sql import *
from pyspark.sql.functions import mean, min, max

df = sqlContext.range(0, 7)

df.show()

df.select("id",
          rand(seed=10).alias("uniform"),
          randn(seed=27).alias("normal")).show()

df.describe("uniform", "normal").show()

dfNew = df.describe("uniform", "normal").show()

dfNew.select([mean("uniform"), min("uniform"), max("uniform")]).show()
Example #15
print('OR')
df_new.orderBy(df_new['High'].desc()).head(1)[0][0]

print('Which day presented the 2nd highest peak in High?'.upper())
# Note that I need the second row, i.e., head(2)
df_new.orderBy(df_new['High'].desc()).head(2)[1][0]

print('What is the mean of the Close column?'.upper())
from pyspark.sql.functions import mean, format_number
df_new.select(format_number(mean('Close'), 2).alias('avg')).show()

print('What is the min and max values of the Volume column?'.upper())
from pyspark.sql.functions import min, max
df_new.select(
    format_number(min('Volume'), 2).alias('min_volume'),
    format_number(max('Volume'), 2).alias('max_volume')).show()

print('How many days had the Close lower than 60 dollars?'.upper())
from pyspark.sql.functions import count
df_filt = df_new.filter(df_new['Close'] < 60)
df_filt.select(count(df_filt['Close'])).show()

print('Percentage of days where High>80'.upper())
df_filt = df_new.filter(df_new['High'] > 80)
df_filt = df_filt.select(count(df_filt['Date']).alias('days_80'))

df_filt2 = df_new.select(count(df_new['Date']).alias('total_days'))

percentage = 100 * df_filt.head(1)[0][0] / df_filt2.head(1)[0][0]
print('The percentage is {}'.format(percentage))
def min_and_max_year(df: DataFrame):
    return df.select("Year").agg(f.min("Year").alias("min"), f.max("Year").alias("max"))
 def tree_json(self, tree, df):
     data = []
     for line in tree.splitlines():
         if line.strip():
             line = line.strip()
             data.append(line)
         else:
             break
         if not line:
             break
     res = []
     res.append({'name': 'Root', 'children':self.parse(data[1:], df)})
     measure_column_name = self._target_column
     self._splits = []
     start = self._data_frame.filter(col(measure_column_name).isNotNull()).select(FN.min(measure_column_name)).collect()[0][0]
     self._splits.append(start)
     self._label_code = {}
     label_code = 0.0
     self._coding = []
     for idx in range(len(self._predicts)):
         if idx == len(self._predicts) - 1:
             end = self._data_frame.filter(col(measure_column_name).isNotNull()).select(FN.max(measure_column_name)).collect()[0][0]
         else:
             end = old_div((self._predicts[idx]+self._predicts[idx+1]),2)
         group_name = NarrativesUtils.round_number(start,2) + ' to ' + NarrativesUtils.round_number(end,2)
         self._map[self._predicts[idx]] ={'start':start, 'end': end, 'group': group_name}
         self._label_code[label_code] = group_name
         start = end
         label_code = label_code+1
         self._splits.append(start)
     return res[0]
Example #18
def main():
    # set up the logger
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'ra_summary.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)
    # table nsjoin (day, uuid)
    # table mapmon (day, uuid)
    datenow = str(datetime.date.today()-datetime.timedelta(1))
    day_idx = datenow[0:4]+datenow[5:7]+datenow[8:10]
    uuid_list = [x.split('=')[-1] for x in beeline.show_partitions('mrqos.mapmon_sum').split('\n') if day_idx in x]
    sc = SparkContext()
    hiveCtx = HiveContext(sc)
    post_partition_n = 1000

    for uuid_idx in uuid_list:
        # ns_ip, demand, asnum ns_asnum, ns_country, ns_continent, ns_lat, ns_lon, ns_mpgid, mpgload
        nsjoin_query = """ select ns_ip, demand, asnum ns_asnum, country_code ns_country, continent ns_continent, round(latitude,3) ns_lat, round(longitude,3) ns_lon, mpgid ns_mpgid, mpgload from mapper.nsjoin where day={} and mpd_uuid='{}' and longitude is not NULL and latitude is not NULL and demand > 1""".format(day_idx,
                                                                                                                                                                                                                                                                                                                            uuid_idx)

        # mpgid, mrid, mpg_type, region, link, min_s, max_s, min_r, max_r, ping, local, cont_fb, mpd_dftime, ecor, continent, country, latitude, longitude, prp
        mapmon_query = """ select mpgid, mrid, mpg_type, region, link, min_s, max_s, min_r, max_r, ping, local, cont_fb, mpd_dftime, ecor, continent, country, latitude, longitude, prp from mrqos.mapmon_sum where day={} and mpd_uuid='{}' and longitude is not NULL and latitude is not NULL""".format(day_idx,
                                                                                                                                                                                                                                                                                                          uuid_idx)
        logger.info('Processing data in day=%s, uuid=%s' % (day_idx, uuid_idx))

        nsjoin = hiveCtx.sql(nsjoin_query)
        nsjoin_rows = nsjoin.repartition(post_partition_n).cache()
        data = hiveCtx.sql(mapmon_query)
        data_rows = data.repartition(post_partition_n).cache()

        col = ['mpgid', 'mrid', 'mpg_type', 'region', 'link', 'min_s', 'max_s', 'min_r', 'max_r',
               'ping', 'local', 'cont_fb', 'mpd_dftime', 'ecor', 'continent', 'country', 'latitude', 'longitude', 'prp',
               'ns_ip', 'demand', 'ns_asnum', 'ns_country', 'ns_continent', 'ns_lat', 'ns_lon', 'mpgload']

        cols_appended = ['nsip', 'mrid', 'ns_demand', 'ns_asnum', 'ns_country', 'ns_continent', 'ns_lat', 'ns_lon',
                         'mpgid', 'mpg_type', 'mpg_load', 'regions', 'region_links', 'dftime_ratio', 'ecors',
                         'list_min_s', 'list_max_s', 'list_min_r', 'list_max_r',
                         'region_lats', 'region_lons', 'min_s', 'max_s', 'min_r', 'max_r', 'ping_ratio', 'local_ratio',
                         'cont_fb_ratio', 'in_cont_ratio', 'in_country_ratio', 'private_ratio', 'avg_distance',
                         'num_region_mapped', 'mapping_entropy', 'sum_dftime']

        df = nsjoin_rows.join(data_rows, data_rows.mpgid == nsjoin_rows.ns_mpgid, 'inner')[col].cache()
        row1 = data_rows.agg(F.max(data_rows.mpd_dftime)).collect()[0]
        max_dftime = row1[0]

        df2 = df.map(lambda x: x + Row(geodesic_distance_weighted(x.ns_lat,
                                                                  x.ns_lon,
                                                                  x.latitude,
                                                                  x.longitude,
                                                                  x.mpd_dftime)))\
                .map(lambda x: ((   x[19], # nsip
                                    x[20], # demand
                                    x[21], # ns_asnum
                                    x[22], # ns_country
                                    x[23], # ns_continent
                                    round(x[24], 3), # ns_lat & ns_lon
                                    round(x[25], 3),
                                    x[0], # mpgid
                                    x[1], # mrid
                                    x[2], # mpg type
                                    x[26], # mpg load
                                    ),
                               [   [int(x[3])], # region
                                   [str(int(x[3])) + "_" + str(int(x[4]))], # region_link
                                   x[5]/max_dftime, # min_s
                                   x[6]/max_dftime, # max_s
                                   x[7]/max_dftime, # min_r
                                   x[8]/max_dftime, # max_r
                                   x[9]/max_dftime, # ping ratio
                                   x[10]/max_dftime, # local ratio
                                   x[11]/max_dftime, # cont_fb ratio
                                   [round(x[12]/max_dftime, 3)], # mpd_dftime/max_dftime (time ratio)
                                   [int(x[13])], # ecor
                                   x[12]/max_dftime * [0, 1][x[14] == x[23]], # mapping in-continent ratio
                                   x[12]/max_dftime * [0, 1][x[15] == x[22]], # mapping in-country ratio
                                   [round(x[16], 3)], # lat
                                   [round(x[17], 3)], # lon
                                   x[18]/max_dftime, # prp
                                   x[27]/max_dftime, # w_distance
                                   x[12],
                                   [round(x[5]/x[12], 2)], # min_s list
                                   [round(x[6]/x[12], 2)], # max_s list
                                   [round(x[7]/x[12], 2)], # min_r list
                                   [round(x[8]/x[12], 2)], # max_r list
                               ]))\
                .reduceByKey(lambda a, b: [x+y for x, y in zip(a, b)])\
                .map(lambda x: [x[0][0], # nsip
                                x[0][8], # mrid
                                x[0][1], # demand
                                x[0][2], # ns_asnum
                                x[0][3], # ns_country
                                x[0][4], # ns_continent
                                x[0][5], # ns_lat
                                x[0][6], # ns_lon
                                x[0][7], # mpgid
                                x[0][9], # mpg type
                                x[0][10], # mpg load
                                x[1][0], # list of region
                                x[1][1], # list of region_link
                                [round(100 * float(y), 2) for y in x[1][9]], # list of covered_record ratio
                                x[1][10], # list of ecor
                                x[1][13], # list of region lat
                                x[1][14], # list of region lon
                                round(x[1][2] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1, # min_s
                                round(x[1][3] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1, # max_s
                                round(x[1][4] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1, # min_r
                                round(x[1][5] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1, # max_r
                                round(100 * x[1][6] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # ping ratio
                                round(100 * x[1][7] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # local ratio
                                round(100 * x[1][8] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # cont_fb ratio
                                round(100 * x[1][11] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # mapping in-continent ratio
                                round(100 * x[1][12] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # mapping in-country ratio
                                round(100 * x[1][15] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # private ratio
                                round(x[1][16] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # w_distance
                                round(x[1][17], 3), # summation of covered dftime
                                x[1][18], # list of min_s
                                x[1][19], # list of max_s
                                x[1][20], # list of min_r
                                x[1][21], # list of max_r
                                len(x[1][9]), # number of different regions mapped
                                round(computeEntropyPMF(x[1][9]), 6), # entropy of the region assignments
                                ])\
                .map(lambda x: x + [[i[0] for i in sorted(enumerate([float(y) for y in x[13]]), key=lambda z:z[1], reverse=True)]])\
                .map(lambda x: x[:11] + [':'.join([str(x[11][i]) for i in x[35]]), # list of region
                                         ':'.join([str(x[12][i]) for i in x[35]]), # list of region_link
                                         ':'.join([str(x[13][i]) for i in x[35]]), # list of covered_record ratio
                                         ':'.join([str(x[14][i]) for i in x[35]]), # list of ecor
                                         ':'.join([str(x[29][i]) for i in x[35]]), # list of min_s
                                         ':'.join([str(x[30][i]) for i in x[35]]), # list of max_s
                                         ':'.join([str(x[31][i]) for i in x[35]]), # list of min_r
                                         ':'.join([str(x[32][i]) for i in x[35]]), # list of max_r
                                         ':'.join([str(x[15][i]) for i in x[35]]), # list of region lat
                                         ':'.join([str(x[16][i]) for i in x[35]]), # list of region lon
                                         ] + x[17:28] + x[33:35] + [x[28]])\
                .toDF(cols_appended).cache()

        df_all = df2.map(lambda x: toCSVLine(x))
        logger.info('writing into HDFS')
        df_all.saveAsTextFile('/ghostcache/hadoop/data/MRQOS/mrqos_mapmon_stats/datestamp={}/uuid={}'.format(day_idx, uuid_idx))
        logger.info('updating Hive table: mrqos_mapmon_stats')
        beeline.add_partitions("mrqos.mrqos_mapmon_stats", "datestamp='{}',uuid='{}'".format(day_idx, uuid_idx))
    # (assumed reconstruction: the snippet begins mid-chain, so the read call below is inferred)
    df = sqlContext.read \
        .format('com.databricks.spark.csv') \
        .options(header='false') \
        .load(args.file, schema=StructType(fields))

    # calculate the totals summed across all dates
    countDF = df.groupBy('name').agg({"count": "sum"}).withColumnRenamed('sum(count)', 'total')

    # read from the column dates
    dates = sorted(df.select("date")
        .distinct()
        .map(lambda row: row[0])
        .collect())

    # find the counts for each date
    cols = [when(col("date") == m, col("percentage")).otherwise(None).alias(m)
        for m in  dates]
    maxs = [max(col(m)).alias(m) for m in dates]

    # reformat dataframe
    series = (df
        .select(col("name"), *cols)
        .groupBy("name")
        .agg(*maxs)
        .na.fill(0))

    compressedTimeseries = series.select("name", concat_ws(",", *dates).alias("timeseries"))

    # add totals to timeseries table
    resultDF = compressedTimeseries.join(countDF, 'name', 'inner')

    resultDF.write.format('com.databricks.spark.csv').save('converted.csv.files')
Example #20
                                            "user_st_et_record.ser_id",
                                            "user_st_et_record.st",
                                            "user_st_et_record.et")

        else:
            LOGGER.info('For non first runs of SPARK_PROCESSOR')
            #Get Hive Partitions
            from pyspark.sql.functions import max
            from pyspark.sql.functions import first

            spark.sql("MSCK REPAIR TABLE " + settings.hive_db + "." +
                      settings.hive_table)
            hive_partitions = spark.sql("SHOW PARTITIONS " + settings.hive_db +
                                        "." + settings.hive_table)
            latest_partition = hive_partitions.select(
                regexp_extract(max(hive_partitions.result), '(et_hr)=(\d+)',
                               2).alias('part_hr'))
            LOGGER.info('Hive Partitions:- {}'.format(
                hive_partitions.collect()))
            LOGGER.info('Latest Hive Partition:- {}'.format(
                latest_partition.collect()))

            import pandas as pd
            part_df = latest_partition.toPandas()
            old_users = spark.read.csv(settings.hive_dir +
                                       str(part_df.iloc[0]['part_hr']))

            old_users_hive = old_users.select(
                (old_users._c0).alias("user_id"),
                (old_users._c1).alias("st")).distinct()
            join_users = all_users.join(old_users_hive, "user_id",
    def chisquare_trend(self,column_name,base_dir):
        if self._date_columns is not None:
            if self._dateFormatDetected:
                output = []
                date_column = self._date_column_suggested
                chisquare_column = column_name
                result_column = self._result_column
                if chisquare_column in self._dataframe_helper.get_numeric_columns():
                    min_max = self._data_frame.select([FN.min(chisquare_column), FN.max(chisquare_column)]).collect()
                    maxval = min_max[0][1]
                    minval = min_max[0][0]
                    step = (maxval - minval) / 5.0
                    splits = [math.floor(minval), minval + step, minval + (step * 2), minval + (step * 3), minval + (step * 4), math.ceil(maxval)]
                    bucketizer = Bucketizer(splits=splits,inputCol=chisquare_column,outputCol="BINNED_COL")
                    self._data_frame = self._data_frame.withColumn(chisquare_column, self._data_frame[chisquare_column].cast(DoubleType()))
                    bucketedData = bucketizer.transform(self._data_frame)
                    df = bucketedData.select([col for col in bucketedData.columns if col != chisquare_column])
                    df = df.withColumnRenamed("BINNED_COL",chisquare_column)
                    ranges = []
                    for idx in range(len(splits)-1):
                        text = str(splits[idx])+" to "+str(splits[idx+1])
                        ranges.append(text)
                    bin_dict = dict(list(zip(list(range(len(ranges))),ranges)))
                else:
                    df = self._data_frame

                df = df.select([date_column,chisquare_column,result_column]).toPandas()
                df["suggestedDate"] = df[date_column].apply(lambda x: datetime.strptime(x,self._existingDateFormat))
                df["year_month"] = df["suggestedDate"].apply(lambda x:x.strftime("%b-%y"))
                # result_column_count_df = df.groupBy(self._result_column).count().orderBy("count",ascending=False)
                # grouped_data.sort_values(by='key', ascending=True)
                result_column_count = df[result_column].value_counts()
                top2levels = result_column_count[:2].index
                for level in top2levels:
                    filtered_df = df.loc[df[result_column] == level]
                    grouped_result = pd.DataFrame(filtered_df[date_column].value_counts()).reset_index()
                    grouped_result.columns=[date_column,"value"]
                    # grouped_result["suggestedDate"] = grouped_result[date_column].apply(lambda x: datetime.strptime(x,self._existingDateFormat))
                    grouped_result["year_month"] = grouped_result[date_column].apply(lambda x: datetime.strptime(x,self._existingDateFormat).strftime("%b-%y"))
                    crosstab_df = pd.DataFrame(pd.crosstab(filtered_df["suggestedDate"],filtered_df[chisquare_column])).reset_index()
                    if chisquare_column in self._dataframe_helper.get_numeric_columns():
                        crosstab_columns = crosstab_df.columns
                        chisquare_levels = crosstab_columns[1:]
                        chisquare_levels = [bin_dict[x] for x in chisquare_levels]
                        crosstab_df.columns = [crosstab_columns[0]]+chisquare_levels
                    else:
                        chisquare_levels = crosstab_df.columns[1:]


                    crosstab_df["year_month"] = crosstab_df["suggestedDate"].apply(lambda x:x.strftime("%b-%y"))
                    final_df = pd.merge(grouped_result,crosstab_df, how='outer', on=['year_month'])
                    final_df.sort_values(by="suggestedDate",ascending=True,inplace=True)
                    final_df.reset_index(drop=True,inplace=True)
                    final_df["overallPerChange"] = [0]+[round((x-y)*100/float(y),self._num_significant_digits) for x,y in zip(final_df["value"].iloc[1:],final_df["value"])]

                    growth_dict = {}
                    for val in chisquare_levels:
                        growth_dict[val]  = {}
                        growth_dict[val]["growth"] = round(((final_df[val].iloc[-1]-final_df[val].iloc[0])*100/float(final_df[val].iloc[0])),self._num_significant_digits)
                        if growth_dict[val]["growth"] > 3 or final_df[val].iloc[0] == 0:
                            growth_dict[val]["growthType"] = "positive"
                            print(growth_dict[val]["growth"])
                        elif growth_dict[val]["growth"] < -3:
                            growth_dict[val]["growthType"] = "negative"
                        else:
                            growth_dict[val]["growthType"] = "stable"
                        growth_dict[val]["total"] = sum(final_df[val])
                    growth_dict["overall"] = {}
                    growth_dict["overall"]["growth"] = round((final_df["value"].iloc[-1]-final_df["value"].iloc[0]/float(final_df["value"].iloc[0])),self._num_significant_digits)
                    data_dict = {}
                    total_tuple = []
                    for k,v in list(growth_dict.items()):
                        if k != "overall":
                            total_tuple.append((k,v["total"]))
                    sorted_total_tuple = sorted(total_tuple,key=lambda x:x[1],reverse=True)
                    top_dimension = sorted_total_tuple[0][0]
                    final_df["topDimensionPerChange"] = [0]+[round((x-y)*100/float(y),self._num_significant_digits) for x,y in zip(final_df[top_dimension].iloc[1:],final_df[top_dimension])]
                    data_dict["dimension"] = chisquare_column
                    data_dict["correlation"] = final_df["value"].corr(final_df[top_dimension])
                    data_dict["subset_increase_percent"] = growth_dict[top_dimension]["growth"]
                    data_dict["overall_increase_percent"] = growth_dict["overall"]["growth"]
                    data_dict["target"] = level
                    data_dict["top_dimension"] = top_dimension
                    overall_peak_index = np.argmax(final_df["value"])
                    overall_low_index = np.argmin(final_df["value"])
                    top_dimension_peak_index = np.argmax(final_df[top_dimension])
                    top_dimension_low_index = np.argmin(final_df[top_dimension])
                    data_dict["overallPeakValue"] = final_df["value"][overall_peak_index]
                    data_dict["overallLowestValue"] = final_df["value"][overall_low_index]
                    data_dict["overallPeakTime"] = final_df["year_month"][overall_peak_index]
                    data_dict["overallLowestTime"] = final_df["year_month"][overall_low_index]
                    data_dict["overallPeakIncrease"] = final_df["overallPerChange"][overall_peak_index]
                    data_dict["topDimensionPeakValue"] = final_df[top_dimension][top_dimension_peak_index]
                    data_dict["topDimensionLowestValue"] = final_df[top_dimension][top_dimension_low_index]
                    data_dict["topDimensionPeakTime"] = final_df["year_month"][top_dimension_peak_index]
                    data_dict["topDimensionLowestTime"] = final_df["year_month"][top_dimension_low_index]
                    data_dict["topDimensionPeakIncrease"] = final_df["topDimensionPerChange"][top_dimension_peak_index]
                    data_dict["overall_streak"] = NarrativesUtils.streak_data(final_df,overall_peak_index,overall_low_index,\
                                                    "overallPerChange","value")
                    data_dict["top_dimension_streak"] = NarrativesUtils.streak_data(final_df,top_dimension_peak_index,top_dimension_low_index,\
                                                    "topDimensionPerChange",top_dimension)
                    # print growth_dict
                    data_dict["num_positive_growth_dimensions"] = 0
                    data_dict["positive_growth_dimensions"] = []
                    data_dict["positive_growth_values"] = []
                    data_dict["num_negative_growth_dimensions"] = 0
                    data_dict["negative_growth_dimensions"] = []
                    data_dict["negative_growth_values"] = []
                    data_dict["num_stable_growth_dimensions"] = 0
                    data_dict["stable_growth_dimensions"] = []
                    data_dict["stable_growth_values"] = []
                    data_dict["overall_growth_rate"] = growth_dict["overall"]["growth"]
                    data_dict["total_levels"] = len(chisquare_levels)
                    for val in chisquare_levels:
                        if growth_dict[val]["growthType"] == "positive":
                            data_dict["num_positive_growth_dimensions"] += 1
                            data_dict["positive_growth_dimensions"].append(val)
                            data_dict["positive_growth_values"].append(growth_dict[val]["growth"])
                        elif growth_dict[val]["growthType"] == "negative":
                            data_dict["num_negative_growth_dimensions"] += 1
                            data_dict["negative_growth_dimensions"].append(val)
                            data_dict["negative_growth_values"].append(growth_dict[val]["growth"])
                        else:
                            data_dict["num_stable_growth_dimensions"] += 1
                            data_dict["stable_growth_dimensions"].append(val)
                            data_dict["stable_growth_values"].append(growth_dict[val]["growth"])
                    summary1 = NarrativesUtils.get_template_output(self._base_dir,\
                                                                    'chisquare_trend.html',data_dict)
                    chart_data = {"data":[],"header":[]}
                    chart_data["header"] = ["time",result_column,top_dimension]
                    chart_data["data"]=[["time"],[result_column],[top_dimension]]
                    for idx in range(final_df.shape[0]):
                        chart_data["data"][0].append(final_df["year_month"].iloc[idx])
                        chart_data["data"][1].append(final_df["value"].iloc[idx])
                        chart_data["data"][2].append(final_df[top_dimension].iloc[idx])

                    paragraphs = NarrativesUtils.paragraph_splitter(summary1)
                    card_data = {"paragraphs":paragraphs,"chart":chart_data}
                    output.append([card_data])
                print(json.dumps(output,indent=2))
Example #22
#Groupby and list form , collect_list
df.groupBy("username").agg(F.collect_list("friend").alias("friends_grouped")).show(10)

from collections import Counter
from pyspark.sql.functions import collect_list, col, udf
from pyspark.sql.types import StringType

def top_ss(ss_list):
    tsk = str(Counter(ss_list).most_common(50))
    return tsk

udf_top = udf(top_ss, StringType())
final_data = useful_data.groupBy("single_col_l").agg(udf_top(collect_list(col('single_col_2'))).alias('ss_frequencies'))



#Select max or maximum from a column
train.select(max("datetime")).show(truncate=False)

#Get Item: Extract an item from a specific position of a column consisting of lists
#Previously id was [ab,fg,fe], out of which new_id [ab] is to be selected
ans=df_tmp.withColumn('new_id',split(df_tmp.id,',').getItem(0))


#Add a row number column to a dataframe: useful because PySpark dataframes cannot be accessed by index, there is no tail command, and joins reshuffle rows
df.withColumn("id", monotonically_increasing_id()).show()

#Replacing null values / missing values
train_test=train_test.na.fill({'siteid':3696590,'browserid_merged':2, 'devid_encode':1})
#siteid, browserid_merged are column names
#Select non-null values of a column
df1.filter(df1.ColumnName_to_check.isNotNull()).show()
Example #23
new_fire_df.select("ResponseDelayedinMins")\
            .where(F.col("ResponseDelayedinMins") > 5)\
            .show(5, False)

#Date and time columns...
fire_ts_df = new_fire_df\
                .withColumn("IncidentDate", to_timestamp(F.col("CallDate"), "MM/dd/yyyy"))\
                .drop("CallDate")\
                .withColumn("OnWatchDate", to_timestamp(F.col("WatchDate"), "MM/dd/yyyy"))\
                .drop("WatchDate") \
                .withColumn("AvailableDtTm", to_timestamp(F.col("AvailableDtTm"), "MM/dd/yyyy hh:mm:ss a"))

fire_ts_df.select("IncidentDate", "OnWatchDate", "AvailableDtTm")\
            .show(5, False)

fire_ts_df.select(year("IncidentDate"))\
        .distinct()\
        .orderBy(year("IncidentDate"))\
        .show()

#AGGREGATIONS
fire_ts_df.select("CallType")\
        .where(F.col("CallType").isNotNull())\
        .groupBy("CallType")\
        .count()\
        .orderBy("count", ascending=False)\
        .show(n=10, truncate=False)

fire_ts_df.select(F.sum("NumAlarms"), F.avg("ResponseDelayedinMins"), F.min("ResponseDelayedinMins"), F.max("ResponseDelayedinMins"))\
            .show()
Example #24
def execute(spark, logger, s3_bucket, run_id, aoi_name, complete_catalog,
            probability_images, seed, config_filename):
    """The primary script

    Args:
        spark (``pyspark.sql.SparkSession``)
        logger (``py4j.JavaObject``)
        s3_bucket (str): Name of the S3 bucket to search for configuration objects
            and save results to
        run_id (str): The identifier of the current run
        aoi_name (str): The identifier for the current area of interest
        probability_images (int): The number of tiles to save the generated
            probability images for
        seed (int): A random seed used to sample the probability images, for
            reproducibility

    Required external inputs:
        <s3_bucket>/cvmapper_config.yaml
            under ``learner`` key:
                    prefix: The S3 prefix under which CSVs can be read and written
                    pool: Name of CSV file under s3_bucket/prefix giving the
                        comprehensive list of active grid cells
                    incoming_names: Name of CSV file under s3_bucket/prefix giving
                        list of cells used for training/validation
                    image_catalog: Name of CSV file under s3_bucket giving catalog
                        of imagery
                    image_output_pattern: URI pattern used for output of probability
                        images.  Must contain two '{}' tokens to be replaced by the
                        column and row for the relevant cell
                    outgoing: S3 URI to save the CSV of worst-performing cells to

        location pool:
            A CSV of ``name``, ``col``, ``row`` for each grid cell under
            consideration.  Identified by ``pool`` parameter above.

        incoming names:
            CSV containing (at least) ``name``, ``iteration``, and ``usage``
            columns.  Every name in this file must also be contained in the image
            pool.  Location of this file given in YAML file.

        image catalog:
            A CSV minimally containing ``col``, ``row``, ``season``, and ``uri``
            columns.  Season is either 'GS' or 'OS'.  Every grid cell in the
            location pool must be contained here, and must have an entry for both
            seasons.  URI points to TIFF that completely covers listed cell with
            valid image data (no NODATA values).

    Note:

        Grid cells are defined according to the master_layout object, which
        specifies a rectangular extent in long/lat coords.  This extent is
        subdivided into cells (in this case, 13792 columns and 14477 rows).
        Each cell is then given a pixel resolution (in this case 200x200, but
        whatever is chosen must match the resolution of the label images
        provided in the ``s3://<s3_bucket>/<prefix>/<name>_<col>_<row>.tif``
        files identified by the incoming names CSV).  When we refer to tiles,
        we mean image chips of the stated resolution, indexed by
        ``gps.SpatialKey`` objects.  The key is a col/row pair where row=0,
        col=0 corresponds to the chip in the upper left corner of the bounding
        extent.

    Note:

        Grid cell names for the output probability images
        (`image_output_pattern`) are relative to a different, coarser layout.
        These grid cell ids need not be clearly defined, since the output of
        this process is simply a bucket of COGs for display using another
        tool.  However, see the `coarse_layout` definition below for specific
        details of the layout.

    """
    params = parse_yaml_from_s3(s3_bucket, config_filename)['learner']
    label_path = parse_yaml_from_s3(
        s3_bucket, config_filename)['labeller']['consensus_directory'][1:-1]
    s3_prefix = params['prefix']
    s3_prefix = s3_prefix[0:-1] if s3_prefix.endswith('/') else s3_prefix

    catalog_prefix = params['image_catalog']
    catalog_prefix_fix = params['image_catalog_fix']

    feature_names = functools.reduce(lambda a, b: a + b, [[
        "{}_raw_{}".format(season, n), "{}_avg_{}".format(season, n),
        "{}_std_{}".format(season, n)
    ] for season in ["GS", "OS"] for n in range(1, 5)])

    master_layout = gps.LayoutDefinition(
        gps.Extent(-17.541, -35.46, 51.459, 37.54),
        gps.TileLayout(13800, 14600, 200, 200))
    master_metadata = gps.Metadata(
        gps.Bounds(gps.SpatialKey(0, 0), gps.SpatialKey(13800, 14600)),
        "+proj=longlat +datum=WGS84 +no_defs ", gps.CellType.INT8,
        master_layout.extent, master_layout)

    ####################################
    logger.warn("Reading source tables")

    checkpoint = time.time()
    f_pool = spark\
         .read\
         .option('inferSchema', True)\
         .option('header', True)\
         .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['pool']))\
         .repartition('col', 'row')

    qs_in = spark \
        .read \
        .option('inferSchema', True) \
        .option('header', True) \
        .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['qs'])) \
        .repartition('col', 'row')

    incoming = spark.read\
                    .option('header', True)\
                    .schema(StructType([
                        StructField('name', StringType()),
                        StructField('run', IntegerType()),
                        StructField('iteration', IntegerType()),
                        StructField('processed', BooleanType()),
                        StructField('usage', StringType()),
                        StructField('label', StringType())
                    ]))\
                    .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['incoming_names']))

    # merge incoming_names and incoming_names_static
    incoming = incoming.union(spark.read \
        .option('header', True) \
        .schema(StructType([
        StructField('name', StringType()),
        StructField('run', IntegerType()),
        StructField('iteration', IntegerType()),
        StructField('processed', BooleanType()),
        StructField('usage', StringType()),
        StructField('label', StringType())
    ])) \
        .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['incoming_names_static'])))

    incoming = incoming.filter(incoming['run'] == params['runid']).filter(
        incoming['label'] == True)
    test_names = f_pool.join(incoming.select('name'), 'name',
                             'left_anti').withColumn("usage", lit("test"))
    all_names = f_pool.join(incoming.select('name', 'usage'),
                            f_pool.name == incoming.name,
                            how='left')\
                      .select(f_pool.name.alias('name'), 'col', 'row', 'usage')
    num_test_images = test_names.count()

    image_catalog = spark.read\
                          .option('inferSchema', True)\
                          .option('header', True)\
                          .csv('s3n://{}/{}'.format(s3_bucket, catalog_prefix))\
                          .repartition('col', 'row')
    all_image_uris = image_catalog\
                     .filter(image_catalog['season'] == 'GS')\
                     .alias('gs')\
                     .join(image_catalog.filter(image_catalog['season'] == 'OS').alias('os'),
                           (col('gs.col') == col('os.col')) & (col('gs.row') == col('os.row')))\
                     .select(col('gs.col'), col('gs.row'), col('gs.uri').alias('GS'), col('os.uri').alias('OS'))
    logger.warn(
        "Elapsed time for reading source tables: {}s".format(time.time() -
                                                             checkpoint))
    ####################################
    logger.warn("Reading training labels & building training features")

    checkpoint = time.time()
    training_data = gather_data(all_image_uris,
                                all_names.filter(all_names.usage == 'train'),
                                master_metadata,
                                feature_names,
                                s3_bucket,
                                label_path,
                                include_masks=True)
    training_data.show()
    logger.warn(
        "Elapsed time for reading training labels and feature building: {}s".
        format(time.time() - checkpoint))

    ####################################
    logger.warn("Balancing data")

    checkpoint = time.time()
    balanced_data = balance_samples(spark, training_data, 'mask')
    balanced_data.show()
    logger.warn("Elapsed time for balancing data: {}s".format(time.time() -
                                                              checkpoint))

    ####################################
    logger.warn("Training model")

    checkpoint = time.time()
    pipeline = ml_pipeline(feature_names, 'mask')
    model = pipeline.fit(balanced_data)
    print(model)
    logger.warn("Elapsed time for training the model: {}s".format(time.time() -
                                                                  checkpoint))

    ####################################
    logger.warn("Validating model results")

    checkpoint = time.time()
    validation_data = gather_data(
        all_image_uris,
        all_names.filter(all_names.usage == 'validate'),
        master_metadata,
        feature_names,
        s3_bucket,
        label_path,
        include_masks=True)

    valid_fit = model.transform(validation_data).select(
        'prediction', 'probability', 'mask')

    metrics = MulticlassMetrics(
        valid_fit.rdd.map(lambda r: (r.prediction, r.mask)))
    confusion_matrix = metrics.confusionMatrix().toArray().flatten().tolist(
    )  #left to right, top to bottom
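    # Assuming binary labels ordered [0.0, 1.0], the flattened 2x2 matrix reads
    # [TN, FP, FN, TP], so the TSS below is TP/(TP+FN) + TN/(TN+FP) - 1
    # (sensitivity + specificity - 1).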
    tss = 1.0 * confusion_matrix[3] / (confusion_matrix[3] + confusion_matrix[2]) + \
          1.0 * confusion_matrix[0] / (confusion_matrix[0] + confusion_matrix[1]) - 1
    binmetrics = BinaryClassificationMetrics(
        valid_fit.rdd.map(lambda r: (float(r['probability'][1]), r['mask'])))

    last_iteration = incoming.agg(F.max('iteration')).collect()[0][0]
    report = pd.DataFrame({
        'run': [run_id],
        'iteration': [last_iteration + 1],
        'tss': [tss],
        'accuracy': [metrics.accuracy],
        'precision': [metrics.precision(1.0)],
        'recall': [metrics.recall(1.0)],
        'fpr': [metrics.falsePositiveRate(1.0)],
        'tpr': [metrics.truePositiveRate(1.0)],
        'AUC': [binmetrics.areaUnderROC],
        'aoi': [aoi_name],
        'iteration_time': [datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')]
    })
    # TODO: allow target location to be derived from params (local or s3)
    # added because of an error where incoming_metrics.csv contained different iteration number (10)
    # than expected by DB (4). Ryan's guess is that this is due to multiple test clusters overwriting csv
    # print("############Old Iteration Metrics  to overwrite###########")
    # incoming_previous = pd.read_csv(os.path.join("s3://",s3_bucket,s3_prefix,params['metrics']))
    # print(incoming_previous.to_string())
    # print("############New Iteration Metrics to use to overwrite###########")
    # print(report.to_string())
    pd_df_to_s3_csv(report, s3_bucket,
                    os.path.join(s3_prefix, params['metrics']))
    logger.warn(
        "Elapsed time for validating and saving metrics to s3: {}s".format(
            time.time() - checkpoint))

    ####################################
    logger.warn("Classifying test data")

    checkpoint = time.time()
    filtered_names = test_names.filter(test_names.usage == "test")
    # filtered_names.cache()
    # filtered_names.show()
    test_features = gather_data(all_image_uris, filtered_names,
                                master_metadata, feature_names, s3_bucket)

    test_features_sample = test_features.sample(True, 0.1)

    fitted = model.transform(test_features_sample).select(
        'spatial_key', 'column_index', 'row_index', 'probability',
        'prediction')
    # fitted.cache()
    # fitted.show()
    grouped = fitted.groupBy('spatial_key')

    # don't want to use following UDF, but indication is that there is a bug in pyspark preventing vector accesses:
    # https://stackoverflow.com/questions/44425159/access-element-of-a-vector-in-a-spark-dataframe-logistic-regression-probability
    # (This did not work without the UDF!)
    firstelement = F.udf(lambda v: float(v[0]), FloatType())
    # added this UDF to select the probability of field rather than no field to write to probability images
    secondelement = F.udf(lambda v: float(v[1]), FloatType())
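    # A hedged alternative (assuming Spark >= 3.0): pyspark.ml.functions.vector_to_array
    # converts the ML vector into a plain array column, avoiding the Python UDFs above, e.g.:
    # from pyspark.ml.functions import vector_to_array
    # fitted.withColumn('prob_array', vector_to_array('probability')) \
    #       .select(F.col('prob_array')[1].alias('prob_field'))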

    logger.warn(
        "Elapsed time for classifying test grids: {}s".format(time.time() -
                                                              checkpoint))

    ####################################
    if probability_images > 0 or complete_catalog:
        logger.warn("Write catalog of {} probability images".format(
            probability_images))
        checkpoint = time.time()

        if complete_catalog:

            # new catalog
            image_catalog_fix = spark.read \
                .option('inferSchema', True) \
                .option('header', True) \
                .csv('s3n://{}/{}'.format(s3_bucket, catalog_prefix_fix)) \
                .repartition('col', 'row')
            all_image_uris_fix = image_catalog_fix \
                .filter(image_catalog_fix['season'] == 'GS') \
                .alias('gs') \
                .join(image_catalog_fix.filter(image_catalog_fix['season'] == 'OS').alias('os'),
                      (col('gs.col') == col('os.col')) & (col('gs.row') == col('os.row'))) \
                .select(col('gs.col'), col('gs.row'), col('gs.uri').alias('GS'), col('os.uri').alias('OS'))

            #recollect all pixels for all testing images
            compreh_names = f_pool.join(qs_in,
                                        ['name', 'col', 'row', 'name_col_row'],
                                        'outer')
            features_compreh = gather_data(all_image_uris_fix, compreh_names,
                                           master_metadata, feature_names,
                                           s3_bucket)
            fitted_compreh = model.transform(features_compreh)\
                 .select('spatial_key', 'column_index', 'row_index', 'probability', 'prediction')
            grouped_compreh = fitted_compreh.groupBy('spatial_key')
            # added to test sampling
            assembled = grouped_compreh.agg(
                assembleTile('column_index', 'row_index',
                             secondelement('probability'),
                             master_layout.tileLayout.tileCols,
                             master_layout.tileLayout.tileRows,
                             'float32').alias('probability'))
            layer = gps.TiledRasterLayer.from_rasterframe(assembled.asRF())

        else:
            ####################################
            logger.warn("Identify worst performing cells")
            checkpoint = time.time()
            # TODO: Determine which images to take
            certainty = grouped \
                .agg(F.avg(F.pow(firstelement(fitted.probability) - lit(0.5), 2.0)).alias('certainty')).cache()
            certainty.show()

            worst_keys_rdd = certainty \
                .sort('certainty') \
                .select('spatial_key') \
                .limit(round(certainty.count() * 0.05)) \
                .rdd.takeSample(False, (params['number_outgoing_names']))
            worst_keys = spark.createDataFrame(worst_keys_rdd)
            outgoing_names = worst_keys \
                .join(f_pool, (col('spatial_key.col') == col('col')) & (col('spatial_key.row') == col('row'))) \
                .select('name') \
                .withColumn('run', lit(run_id)) \
                .withColumn('iteration', lit(last_iteration + 1)) \
                .withColumn('processed', lit(False)) \
                .withColumn('usage', lit('train')) \
                .toPandas()
            uri = urlparse.urlparse(params['outgoing'])
            pd_df_to_s3_csv(outgoing_names, uri.netloc, uri.path[1:])
            logger.warn(
                "Elapsed time for sorting certainty, converting to Pandas Dataframe, and saving to s3: {}s"
                .format(time.time() - checkpoint))

            ###########################################
            checkpoint = time.time()
            # sampling testing images (num = probability_images)
            filtered_names_sample = filtered_names\
                .sample(False, min(1.0, float(probability_images) / float(num_test_images)), seed=seed)\
                .join(image_catalog.filter(image_catalog['season'] == 'GS'), ['col', 'row'])\
                .select('scene_id')\
                .dropDuplicates()\
                .join(image_catalog.filter(image_catalog['season'] == 'GS'), 'scene_id')\
                .join(f_pool.join(qs_in, ['name', 'col', 'row', 'name_col_row'], 'outer'), ['col','row'])\
                .select('name', 'col', 'row', 'name_col_row')

            #re-collect all pixels within sampled images
            features_images = gather_data(all_image_uris,
                                          filtered_names_sample,
                                          master_metadata, feature_names,
                                          s3_bucket)
            #reclassify sampled testing images
            fitted_images = model.transform(features_images)\
                    .select('spatial_key', 'column_index', 'row_index', 'probability', 'prediction')
            grouped_sample = fitted_images.join(
                filtered_names_sample, (col('spatial_key.col') == col('col')) &
                (col('spatial_key.row') == col('row'))).groupby('spatial_key')
            assembled = grouped_sample.agg(
                assembleTile('column_index', 'row_index',
                             secondelement('probability'),
                             master_layout.tileLayout.tileCols,
                             master_layout.tileLayout.tileRows,
                             'float32').alias('probability'))
            layer = gps.TiledRasterLayer.from_rasterframe(assembled.asRF())

        coarse_layout = gps.LayoutDefinition(
            gps.Extent(-17.541, -35.46, 51.459, 37.54),
            gps.TileLayout(1380, 1460, 2000, 2000))
        # we multiply by 100 to select the digits that will be kept after converting from float to int.
        # the range of int8 is -128 to 127, so we can only preserve 2 significant figures
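        # e.g. a probability of 0.87 becomes 87, which still fits in int8
        # while keeping two significant figures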
        output_tiles = (layer*100).convert_data_type(gps.CellType.INT8)\
                            .tile_to_layout(coarse_layout)\
                            .to_geotiff_rdd(storage_method=gps.StorageMethod.TILED)

        cog_location = params['image_output_pattern'] if 'image_output_pattern' in params \
            else '/tmp/image_{}_{}.tif'
        output_tiles.foreach(lambda pair: write_bytes_to_s3(
            cog_location.format(pair[0].col, pair[0].row, aoi_name, run_id,
                                str(last_iteration + 1)), pair[1]))
        logger.warn(
            "Elapsed time for writing catalog of probability images: {}s".
            format(time.time() - checkpoint))
# Samuel Tribe - 201318996 - [email protected]
from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as F
from pyspark.sql.session import SparkSession
from pyspark.sql.types import DateType
conf = SparkConf().setAppName("covid19").setMaster("local")
spark = SparkSession(SparkContext(conf=conf))
csvPath = r"C:\spark\COMP336-Coursework-1\data\covid19.csv"  # raw string so backslashes are not treated as escapes
covidDF = spark.read.csv(csvPath,header=True,inferSchema=True)
covidDF = covidDF.withColumn("date", F.col("date").cast(DateType()))
print("covid19.csv read as Dataframe with header=True")
covidDF.show()
print("Schema for dataframe")
covidDF.printSchema()
print("Filtering out NULL values from dataframe")
covidDF = covidDF.filter(
    covidDF.continent.isNotNull() & covidDF.location.isNotNull()
    & covidDF.date.isNotNull() & covidDF.total_cases.isNotNull()
    & covidDF.new_cases.isNotNull() & covidDF.total_deaths.isNotNull()
    & covidDF.new_deaths.isNotNull())
covidDF.show()
print("Highest deaths per country")
covidDF.groupBy(['location']).agg(F.max(covidDF.total_deaths)).show()
print("max and min function results on total_cases")
covidDF.groupBy(['location']).agg(F.max(covidDF.total_cases).alias('total_cases_max'), F.min(covidDF.total_cases).alias('total_cases_min')).show()
Example #26
0
# count by different code type
logs_df.groupBy("code").count().show()
# rank by counts
from pyspark.sql.functions import asc, desc
logs_df.groupBy('code').count().orderBy(desc('count')).show()

# calculate average size of different code
logs_df.groupBy("code").avg("bytes").show()
# more calculation by code - average, min, max
import pyspark.sql.functions as F
logs_df.groupBy("code").agg(
logs_df.code,
F.avg(logs_df.bytes),
F.min(logs_df.bytes),
F.max(logs_df.bytes)
).show()


# homework
# 1
yelp_df.select("cool").agg({"cool" : "mean"}).collect()
# 2
import pyspark.sql.functions as F
yelp_df.filter('review_count >= 10').groupBy("stars").agg(yelp_df.stars, F.avg(yelp_df.cool)).show()
# 3
yelp_df.filter((yelp_df.review_count >= 10) & (yelp_df.open == 'True')).groupBy("stars").agg(yelp_df.stars, F.avg(yelp_df.cool)).show()
# 4
from pyspark.sql.functions import asc, desc
yelp_df.filter((yelp_df.review_count >= 10) & (yelp_df.open == 'True')).groupBy('state').count().orderBy(desc('count')).show()
# 5
Example #27
0
def calculate_and_plot_ar(jnd_path, sdk_path, dest):

    jnd = sqlContext.read.parquet(
        jnd_path)  #'/Users/amirdavidoff/Desktop/data/enriched_data/jnd'

    window = Window.partitionBy("jnd_sender_id").orderBy(["jnd_ts"])
    ''' plot response code by answer '''

    rates = jnd.select(['nbr_response_code', 'is_answered',
                        'jnd_retailer']).toPandas()
    count_codes = rates.groupby(['nbr_response_code', 'jnd_retailer'],
                                as_index=False).agg(["count"]).reset_index()
    count_codes.columns = ['code', 'retailer', 'count']

    count_codes.to_csv(dest + '/count_codes.csv')

    rates = rates.groupby(['nbr_response_code', 'jnd_retailer'],
                          as_index=False).agg(
                              {"is_answered": ["count", "mean", binary_ci]})
    #rates["codes"] = rates.index
    rates.columns = [
        'codes', 'retailer', 'is_answered_count', 'is_answered_mean',
        'is_answered_ci'
    ]
    rates = rates.sort_values('is_answered_mean',
                              ascending=True).reset_index(drop=True)
    rates = rates[rates["is_answered_count"] >= 50]
    rates["codes"] = np.where(rates["codes"] == "", "empty", rates["codes"])

    fig = plt.figure(figsize=(10, 10))

    for retailer in rates["retailer"].unique().tolist():

        temp = rates[rates["retailer"] == retailer]
        plt.errorbar(temp["is_answered_mean"],
                     temp["codes"],
                     xerr=temp['is_answered_ci'].values,
                     fmt='o',
                     label=retailer,
                     alpha=0.7)

    plt.xlabel('answer rate')
    plt.ylabel('nbr response code')
    plt.title('nbr codes answer rate')
    plt.legend()
    plt.savefig(
        '{}/nbr_code_ctr2.png'.format(dest)
    )  # /Users/amirdavidoff/mmuze-research/Improve_conversation_quality/tasks/task5
    plt.show(block=False)
    plt.close(fig)
    ''' sdk '''

    #sdk_path = '/Users/amirdavidoff/Desktop/data/yaron_sql_dumps/sdk_reports.csv'

    spark_sdk = sqlContext.read.options(header=True).csv(sdk_path)  #,sep='\t'
    spark_sdk = spark_sdk.withColumn(
        'date', F.to_date(F.from_unixtime(F.col('timestamp') / F.lit(1000.0))))

    spark_sdk = spark_sdk.withColumn(
        'date_time', F.from_unixtime(F.col('timestamp') / F.lit(1000.0)))

    spark_sdk = spark_sdk.withColumn("timestamp_int",
                                     spark_sdk.timestamp.cast(IntegerType()))

    spark_sdk = spark_sdk.where((F.to_date('date') >= F.lit("2019-07-01")))

    sdk_window = Window.partitionBy("user_id").orderBy(["date_time"])
    sdk_window_no_order = Window.partitionBy("user_id")

    #sdk = sqlContext.read.parquet('/Users/amirdavidoff/Desktop/data/enriched_data/sdk')

    #sdk.count()

    #sdk.groupBy('action').count().show(100)

    actions = ['click', 'add to cart']

    spark_sdk = spark_sdk.where(spark_sdk.action.isin(actions))
    #sdk.count()

    sdk_cols = [
        'retailer_id', 'user_id', 'timestamp', 'date', 'date_time', 'action',
        'value'
    ]
    spark_sdk = spark_sdk.select(sdk_cols)

    spark_sdk = spark_sdk.withColumn('rank', F.rank().over(sdk_window))
    spark_sdk = spark_sdk.withColumn('max_rank',
                                     F.max('rank').over(sdk_window_no_order))

    spark_sdk = spark_sdk.withColumn(
        'dates',
        F.collect_list(F.col('date_time').cast(StringType())).over(sdk_window))
    spark_sdk = spark_sdk.withColumn('actions',
                                     F.collect_list('action').over(sdk_window))

    spark_sdk2 = spark_sdk.where(spark_sdk.rank == spark_sdk.max_rank)

    #sdkp2 = spark_sdk2.toPandas()
    ''' join jnd and sdk '''
    jnd2 = jnd.join(spark_sdk2.select(
        ['user_id', 'dates', 'actions', 'retailer_id']),
                    spark_sdk.user_id == jnd.jnd_sender_id,
                    how="left")

    #jnd2.count()

    def len_clicks(ls):

        try:
            return len([c for c in ls if c == 'click'])
        except:
            return None

    len_clicks_udf = F.udf(len_clicks, IntegerType())

    def len_adds(ls):

        try:
            return len([c for c in ls if c == 'add to cart'])
        except:
            return None

    len_adds_udf = F.udf(len_adds, IntegerType())

    jnd2 = jnd2.withColumn('clicks', len_clicks_udf(F.col('actions')))
    jnd2 = jnd2.withColumn('adds', len_adds_udf(F.col('actions')))
    ''' add nbr count '''
    jnd_window = Window.partitionBy("jnd_sender_id")
    jnd2 = jnd2.withColumn(
        "nbr_count",
        F.sum(F.when(F.col('nbr_date').isNotNull(),
                     1).otherwise(0)).over(jnd_window))

    #jnd2.where(jnd2.nbr_count<=3).select('jnd_sender_id').dropDuplicates().show(100)

    #jnd2.where(jnd2.nlu_positive_product_type=='shirt').select('jnd_sender_id').dropDuplicates().show(100)
    ''' grp convs '''
    convs_grp = jnd2.groupBy('jnd_sender_id').agg(
        F.count('nbr_date').alias('nbr_count'),
        F.sum('is_answered').alias('sum_is_answered'),
        F.first('clicks').alias('clicks'),
        F.first('adds').alias('adds'),
        F.first('jnd_retailer').alias('retailer')).toPandas()

    convs_grp[['clicks', 'adds', 'sum_is_answered'
               ]] = convs_grp[['clicks', 'adds', 'sum_is_answered']].fillna(0)

    convs_grp["answers_cut"] = pd.cut(convs_grp.sum_is_answered,
                                      [0, 1, 2, np.Inf],
                                      include_lowest=True,
                                      right=False)
    convs_grp["answers_cut"].value_counts()
    ''' plot answer on click and add '''
    grp = convs_grp.groupby(['answers_cut', 'retailer'], as_index=False).agg({
        "clicks": ["count", "mean", binary_ci],
        "adds": ["count", "mean", binary_ci]
    })
    grp.columns = [
        'cut', 'retailer', 'click_count', 'click_rate', 'click_ci',
        'add_to_cart_count', 'add_to_cart_rate', 'add_to_cart_ci'
    ]

    for retailer in ['429']:  #convs_grp['retailer'].unique().tolist()

        for c in ['click', 'add_to_cart']:

            temp = grp[grp["retailer"] == retailer]

            plt.errorbar([i for i in range(len(temp[c + "_rate"]))],
                         temp[c + "_rate"].values,
                         yerr=temp[c + "_ci"].values,
                         fmt='o',
                         label=c + "_" + retailer)

            plt.xticks([i for i in range(len(temp[c + "_rate"]))], temp["cut"])
            plt.legend()

            plt.title("user answer count effect on click / add to cart")
            plt.ylabel('rate')
            plt.xlabel("answer count")
    plt.savefig(
        '{}/answer_click.png'.format(dest)
    )  #/Users/amirdavidoff/mmuze-research/Improve_conversation_quality/tasks/task5
    plt.show(block=False)
    plt.close(fig)
    ''' plot questions for each type'''
    window_type2 = Window.partitionBy(["jnd_sender_id",
                                       'lag_type']).orderBy(["jnd_ts"])

    jnd2 = jnd2.withColumn('lag_type',
                           F.lag(F.col('nlu_positive_product_type')).over(
                               window))  #F.lit('_'),F.col('code_rank')))

    #jnd2 = jnd2.withColumn('lag_type2',F.lag(F.col('nlu_positive_product_type'),2).over(window))#F.lit('_'),F.col('code_rank')))

    #jnd2 = jnd2.withColumn('lag_type3',F.when(F.col('lag_type').isNull(),F.col('lag_type2')).otherwise(F.col('lag_type')))#F.lit('_'),F.col('code_rank')))

    #jnd2 = jnd2.withColumn('lag_type_filled',F.when(F.col('nlu_date').isNull(),F.last('nlu_positive_product_type',True).over(Window.partitionBy('jnd_sender_id').orderBy('jnd_ts').rowsBetween(-sys.maxsize, 0))))

    #jnd2 = jnd2.withColumn('type_q_rank',F.row_number().over(window_type2))

    jnd2 = jnd2.withColumn(
        'type_q_rank2',
        F.sum(F.when(F.col('nbr_response_code').isNotNull(),
                     1).otherwise(0)).over(window_type2))

    jnd2 = jnd2.withColumn(
        'q_rank',
        F.concat(F.col('type_q_rank2'), F.lit('_'),
                 F.col('nbr_response_code')))

    jnd2 = jnd2.withColumn('lag_nlu_date', F.lag('nlu_date').over(window))

    cols = [
        'nlu_positive_product_type', 'lag_type', 'nlu_text', 'nlu_date',
        'is_answered', 'nbr_date', 'nbr_response_code', 'type_q_rank2',
        'q_rank', 'jnd_sender_id', 'jnd_ts'
    ]
    temp = jnd2.where((jnd2.nbr_response_code.isNotNull())
                      & (jnd2.lag_type.isNotNull())).select(cols)
    #temp = temp.withColumn('type_q_rank',F.dense_rank().over(window_type2))
    #delete = temp.where(temp.jnd_sender_id=='4rbvkcyxbrg').toPandas()

    grp = temp.groupBy(['lag_type', 'q_rank']).agg(
        F.count('is_answered').alias('count'),
        F.mean('is_answered').alias('answer_rate')).toPandas()

    grp["ci"] = grp[["answer_rate", "count"]].apply(binary_ci_pd, axis=1)

    grp = grp[grp["count"] >= 100]

    grp = grp.sort_values(["lag_type", "count"], ascending=False)
    ''' plot types bar plot '''

    for typ in grp["lag_type"].unique().tolist():

        fig = plt.figure(figsize=(10, 10))

        temp = grp[grp["lag_type"] == typ]
        temp = temp.sort_values('q_rank', ascending=False)

        #temp = temp.sort_values("answer_rate")
        plt.errorbar(temp["answer_rate"],
                     temp["q_rank"],
                     xerr=temp['ci'].values,
                     fmt='o',
                     label=typ,
                     alpha=0.7)

        plt.xlabel('answer rate')
        plt.ylabel('order_question')
        plt.title('answer rate per question and order for type {}'.format(typ))
        plt.legend()
        # plt.show()

        plt.savefig('{}/{}.png'.format(dest, typ))
        plt.show(block=False)
        plt.close(fig)
    ''' original plots - there is a bug to check sometime; it appeared after filling answered NA with 0 (12/11/19) '''
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql import functions as F

sc = SparkContext(appName = "Lab2ex3")
sqlContext = SQLContext(sc)

temp_file = sc.textFile("BDA/input/temperature-readings.csv")
temp_lines = temp_file.map(lambda line: line.split(";"))

tempReadingsRow = temp_lines.map(lambda p: Row(station=p[0], date=p[1], year=p[1].split("-")[0], month=p[1].split("-")[1], day=p[1].split("-")[2], time=p[2], value=float(p[3]), quality=p[4]))

schemaTempReadings = sqlContext.createDataFrame(tempReadingsRow)
schemaTempReadings.registerTempTable("tempReadingsTable")

min_max_val_day = schemaTempReadings \
    .select("year", "month", "day", "station", "value") \
    .filter((schemaTempReadings["year"] >= 1960) & (schemaTempReadings["year"] <= 2014)) \
    .groupBy("year", "month", "day", "station") \
    .agg(F.max(schemaTempReadings["value"]).alias("maxVal"),
         F.min(schemaTempReadings["value"]).alias("minVal"))

monthly_avg = min_max_val_day \
    .select("year", "month", "station",
            ((min_max_val_day["maxVal"] + min_max_val_day["minVal"]) / 2).alias("dailyAvg")) \
    .groupBy("year", "month", "station") \
    .agg(F.avg("dailyAvg").alias("monthlyAvg")) \
    .orderBy("monthlyAvg", ascending=False)

monthly_avg.rdd.coalesce(1,shuffle=True).saveAsTextFile("BDA/output")
Example #29
0
sc = SparkContext(conf = conf)
sqlcontext = SQLContext(sc)

# 1. Create a DataFrame with one int column and 10 rows.
df = sqlcontext.range(0, 10)
df.show()

# Generate two other columns using uniform distribution and normal distribution.
df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal"))
df.show()

# 2. Summary and Descriptive Statistics
df = sqlcontext.range(0, 10).withColumn('uniform', rand(seed=10)).withColumn('normal', randn(seed=27))
df.describe('uniform', 'normal').show()

df.select([mean('uniform'), min('uniform'), max('uniform')]).show()

# 3. Sample covariance and correlation
# Covariance is a measure of how two variables change with respect to each other. 
# A positive number would mean that there is a tendency that as one variable increases, 
# the other increases as well. 
# A negative number would mean that as one variable increases, 
# the other variable has a tendency to decrease.
df = sqlcontext.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))
df.stat.cov('rand1', 'rand2')
df.stat.cov('id', 'id')

# Correlation is a normalized measure of covariance that is easier to understand, 
# as it provides quantitative measurements of the statistical dependence between two random variables.
df.stat.corr('rand1', 'rand2')
df.stat.corr('id', 'id')
Example #30
0
 def calc_whiskers(colname, outliers):
     # Computes min and max values of non-outliers - the whiskers
     minmax = (outliers.filter("not `__{}_outlier`".format(colname)).agg(
         F.min("`%s`" % colname).alias("min"),
         F.max(colname).alias("max")).toPandas())
     return minmax.iloc[0][["min", "max"]].values
    # Build vocabulary of categorical columns.
    vocab = build_vocabulary(
        train_df.select(*categorical_cols).unionAll(
            test_df.select(*categorical_cols)).cache(), categorical_cols)

    # Cast continuous columns to float & lookup categorical columns.
    train_df = cast_columns(train_df, continuous_cols + ['Sales'])
    train_df = lookup_columns(train_df, vocab)
    test_df = cast_columns(test_df, continuous_cols)
    test_df = lookup_columns(test_df, vocab)

    # Split into training & validation.
    # Test set is in 2015, use the same period in 2014 from the training set as a validation set.
    test_min_date = test_df.agg(F.min(test_df.Date)).collect()[0][0]
    test_max_date = test_df.agg(F.max(test_df.Date)).collect()[0][0]
    one_year = datetime.timedelta(365)
    train_df = train_df.withColumn('Validation',
                                   (train_df.Date > test_min_date - one_year) &
                                   (train_df.Date <= test_max_date - one_year))

    # Determine max Sales number.
    max_sales = train_df.agg(F.max(train_df.Sales)).collect()[0][0]

    # Convert Sales to log domain
    train_df = train_df.withColumn('Sales', F.log(train_df.Sales))

    print('===================================')
    print('Data frame with transformed columns')
    print('===================================')
    train_df.show()
Example #32
0
 def fn(col):
     return ~(F.max(col))
Example #33
0
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


def write_file(accommodates):
    with open("../output/out_2_4.txt", "a") as f:
        f.write(str(accommodates))


spark: SparkSession = SparkSession.builder.master("local[*]").appName(
    "task1").getOrCreate()
hotelDF = spark.read.parquet(
    "../input/part-00000-tid-4320459746949313749-5c3d407c-c844-4016-97ad-2edec446aa62-6688-1-c000.snappy.parquet"
)
min_max = hotelDF.agg(F.min(hotelDF.price),
                      F.max(hotelDF.review_scores_value)).head()
col_min_price = min_max[0]
col_max_rating = min_max[1]
data = hotelDF.filter((F.col("price") == col_min_price) & (
    F.col("review_scores_value") == col_max_rating)).collect()
accommodates = data[0]['accommodates']  # collect() returns a list of Rows; take the first match
write_file(accommodates)
Example #34
0
def market_x_seg(config, **dict):

    header_df = dict['table1'].filter(col('contact_stage_code') == 'ALC')

    header_df = header_df.drop('contact_stage_code')
    print '1'
    header_df.cache()
    print header_df.count()
    print header_df.distinct().count()
    header_df.show()

    detail_df = dict['table2']

    # details control customers required
    detail_df = detail_df.withColumn("offer_rank_num",
                                     col("offer_rank_num").cast(DoubleType()))

    detail_df.cache()
    print '2'
    print detail_df.count()
    print detail_df.distinct().count()
    detail_df.show()

    detail_header_df = detail_df.join(header_df, 'prsn_code', 'left_outer')

    detail_header_df.cache()
    print '3'
    print detail_header_df.count()
    print detail_header_df.distinct().count()
    detail_header_df.show(truncate=False)

    volume_udf = udf(
        lambda col1, col2: 'baulcm' if (col1 >= 1 and col1 <= 8 and col2 != 'Y'
                                        and col2 is not None) else 'lessloyal'
        if (col1 >= 1 and col1 <= 3 and col2 == 'Y') else 'baby'
        if (col1 >= 9 and col1 <= 12) else 'npm' if (col1 >= 13 and col1 <= 20)
        else 'extralcm' if (col1 >= 21 and col1 <= 24) else 'babyextra' if
        (col1 >= 25 and col1 <= 28) else 'nfm' if (col1 >= 29 and col1 <= 32)
        else 'additional' if (col1 >= 33 and col1 <= 40) else 'others')
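    # Rank-to-segment summary (derived from the lambda above): ranks 1-8 without the
    # less-loyal flag -> 'baulcm'; 1-3 with less_loyal_flag == 'Y' -> 'lessloyal';
    # 9-12 -> 'baby'; 13-20 -> 'npm'; 21-24 -> 'extralcm'; 25-28 -> 'babyextra';
    # 29-32 -> 'nfm'; 33-40 -> 'additional'; otherwise 'others'.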

    slot_df = detail_header_df.withColumn(
        'slot_segment',
        (volume_udf(col('offer_rank_num'), col('less_loyal_flag'))))

    slot_df.cache()
    print '4'
    print slot_df.count()
    print slot_df.distinct().count()
    slot_df.filter(col('contact_stage_code') == 'RDM').show(200)

    slot_df = slot_df.drop('offer_rank_num')
    slot_df = slot_df.drop('less_loyal_flag')

    print 'count for slot_df'
    slot_df.cache()
    print slot_df.count()
    print slot_df.distinct().count()
    slot_df.show()

    card_lifestyle_seg_df = dict['card_lifestyle_seg'].filter(
        trim(col(config['identity_type_code'])) != '')
    slot_segment_df = slot_df.join(card_lifestyle_seg_df,
                                   config['identity_type_code'], 'left_outer')

    print 'count for slot_segment_df'
    slot_segment_df.cache()
    print slot_segment_df.count()
    print slot_segment_df.distinct().count()
    slot_segment_df.show()

    card_loyalty_seg_df = dict['card_loyalty_seg'].filter(
        trim(col(config['identity_type_code'])) != '')
    slot_segment_df = slot_segment_df.join(card_loyalty_seg_df,
                                           config['identity_type_code'],
                                           'left_outer')

    print 'count for slot_segment_df'
    slot_segment_df.cache()
    print slot_segment_df.count()
    print slot_segment_df.distinct().count()
    slot_segment_df.show()

    card_pricesence_seg_df = dict['card_pricesence_seg'].filter(
        col(config['identity_type_code']) != '')
    slot_segment_df = slot_segment_df.join(card_pricesence_seg_df,
                                           config['identity_type_code'],
                                           'left_outer')

    print 'count for slot_segment_df'
    slot_segment_df.cache()
    print slot_segment_df.count()
    print slot_segment_df.distinct().count()
    slot_segment_df.show()

    date_df = dict['date_dim']
    targ_date = date_df.filter(
        col("fis_week_id") == str(config['seg_week'])).filter(
            col('fis_day_of_week_num') == '7').select(
                col('date').cast(StringType())).collect()[0][0]

    # age
    card_dim_df = dict['card_dim'].withColumn(
        'age_1',
        F.floor(F.datediff(F.lit(targ_date), F.col('card_birth_date'))) /
        365).withColumn(
            'Age',
            F.when(F.col('age_1') > 66, '67 eller mer').otherwise(
                F.when(F.col('age_1') > 55, '56-66').otherwise(
                    F.when(F.col('age_1') > 45, '46-55').otherwise(
                        F.when(F.col('age_1') > 35, '36-45').otherwise(
                            F.when(F.col('age_1') > 25, '26-35').otherwise(
                                F.when(F.col('age_1') > 0, '0-25').otherwise(
                                    'Uklassifiserte'))))))).drop(
                                        F.col('age_1'))
    slot_segment_df = slot_segment_df.join(card_dim_df,
                                           config['identity_type_code'],
                                           'left_outer')

    slot_segment_df.cache()
    print '6'
    print slot_segment_df.count()
    print slot_segment_df.distinct().count()
    slot_segment_df.show()

    # supplier_name
    print dict
    offer_dim_df = dict['offer_dim']
    #     .select("offer_code", "supplier_name").distinct()

    # CHECK FOR NULL EMPTY OFFER_CODE and empty
    offer_dim_df = offer_dim_df.filter(trim(col("offer_code")) != "").groupBy(
        'offer_code', 'supplier_name').agg(
            F.max('offer_discount_amt').alias("offer_discount_amt"),
            F.max('offer_amount').alias("offer_amount")).dropDuplicates(
                ['offer_code'])

    # removed ACT after discussion with Sharang
    slot_segment_df = slot_segment_df.join(
        offer_dim_df, 'offer_code', 'left_outer').select(
            slot_segment_df["*"], offer_dim_df.supplier_name,
            offer_dim_df.offer_amount,
            F.when(
                (slot_segment_df.contact_stage_code.isin("ALC", "DLV", "EXP")),
                offer_dim_df.offer_discount_amt).otherwise(
                    slot_segment_df.offer_discount_amt).alias(
                        "offer_discount_amt")).drop(
                            slot_segment_df.offer_discount_amt)

    slot_segment_df.cache()
    print '7'
    print slot_segment_df.count()
    print slot_segment_df.distinct().count()
    slot_segment_df.show()

    # private label and prod_hier_l20_code
    #     prod_dim_df = dict['prod_dim'].withColumn('supplier_private_label', F.when(
    #         F.upper(F.col('prod_desc')).like('%X-TRA%'),
    #         F.lit('private')
    #     ).when(
    #         F.upper(F.col('prod_desc')).like('%NGLAMARK%'),
    #         F.lit('private')
    #     ).when(
    #         F.upper(F.col('prod_desc')).like('%MARKET%'),
    #         F.lit('private')
    #     ).otherwise(F.lit('non-private')))
    #     slot_segment_df = slot_segment_df.join(prod_dim_df, 'prod_code', 'left_outer')

    # banner_name
    #     store_dim_df = dict['store_dim']
    #     slot_segment_df = slot_segment_df.join(store_dim_df, 'store_code', 'left_outer')

    return slot_segment_df
# read the precipitation data
rdd = sc.textFile("data/precipitation-readings.csv")
# create DataFrame from RDD
parts = rdd.map(lambda a: a.split(';'))
precReadingsRow = parts.map(lambda x: (x[0], x[1], int(x[1].split("-")[0]),
                                       int(x[1].split("-")[1]), x[2], float(x[3]), x[4]))
precReadingsString = [
    "station", "date", "year", "month", "time", "value", "quality"
]
schemaPrecReadings = sqlContext.createDataFrame(precReadingsRow,
                                                precReadingsString)

# find the max temp per station
maxTemps = schemaTempReadings.groupBy('station').agg(
    F.max('value').alias('maxTemp'))
# filter
maxTemps = maxTemps.filter((maxTemps['maxTemp'] >= 25)
                           & (maxTemps['maxTemp'] <= 30))

# calculate the daily precipitation and find the max
maxPrecs = schemaPrecReadings.groupBy('station', 'date').agg(
    F.sum('value')).groupBy('station').agg(
        F.max('sum(value)').alias('maxDailyPrecipitation'))
# filter
maxPrecs = maxPrecs.filter((maxPrecs['maxDailyPrecipitation'] >= 100)
                           & (maxPrecs['maxDailyPrecipitation'] <= 200))

# join and output the max temp and max precipitation
joined = maxTemps.join(maxPrecs, 'station',
                       'inner').orderBy('station', ascending=False).show()
Example #36
0
sc = SparkContext("local", "Test")
print(sc)

sqlContext = SQLContext(sc)
sqlContext

#Creating data frame from list
data = [('John', 'Smith', 47),('Jane', 'Smith', 22), ('Frank', 'Jones', 28)]
schema = ['fname', 'lname', 'age']
df = sqlContext.createDataFrame(data, schema)
df

#Retrieving contents of data frame
df.printSchema()
df.show()
df.first()
df.count()

#Adding columns
df = df.withColumn('salary', F.lit(0))
df.show()
df.withColumn('salary2', df['age'] * 100).show()

#Filtering and subsetting 
df.filter(df['age'] > 30).select('fname','age').show()
df.select(F.max('age').alias('max-age')).show()

#Grouped aggregations
df.groupBy('lname').max('age').show()
###HadoopLink = "hdfs://10.82.187.10:8020/hadoop/hdfs/INPUTPARQUET/"

CashLoanForRandomSample = hq.read.parquet(
    HadoopLink + "var/CashLoanForRandomSample_parquet").persist()
CashLoanForRandomSample.registerTempTable("CashLoanForRandomSample")

ClientContractDateMapping = hq.read.parquet(
    HadoopLink + "dict/ClientContractDateMapping_parquet").persist()
ClientContractDateMapping.registerTempTable("ClientContractDateMapping")

SaleOfCredits = hq.read.parquet(HadoopLink +
                                "contr/SaleOfCredits_parquet").persist()
SaleOfCredits.registerTempTable("SaleOfCredits")

max_date = str(
    CashLoanForRandomSample.agg(psf.max("ReportingDate")).take(1)[0][0])
tag = str(sys.argv[1]) if len(sys.argv) > 1 else 'Last3MWindowsChurnIn3M'
#tag='Last3MWindowsChurnIn3M'

soc = hq.sql("\
SELECT \
    clfrs.ContractID,clfrs.ReportingDate \
    ,COUNT(case when soc.ContractID is not null then 1 else null end) AS ExistFlag \
FROM CashLoanForRandomSample clfrs \
LEFT JOIN SaleOfCredits soc \
ON soc.ContractID = clfrs.ContractID AND soc.SalesDate < clfrs.ReportingDate \
GROUP BY clfrs.ContractID,clfrs.ReportingDate \
")
soc.registerTempTable("soc")

step1 = hq.sql("\
#print('Best maxIter: ' + str(cvModel_gdbt.bestModel._java_obj.getMaxIter()))

bestModel_gdbt = cvModel_gdbt.bestModel
#Get the best model with best hyper-parameter
# According to the AUC result on test samples, GDBT with maxDepth=4, maxBins=20, and maxIter=10, is the best model.
best_model = bestModel_gdbt
#Apply the best model
# 2 Classify all the users
# Predict over all comments
predictions_over_comments = best_model.transform(dataset_noEmpty)

# Predict over all users. If a user has more than one comment, he or she has more than one prediction.
# We assume that we want to find potential buyers, so we don't want to miss any candidates.
# As a result, we apply a max-wins rule: unless all of a user's predictions are 0, the user is marked as 1.
from pyspark.sql import functions as F
predictions_over_users = predictions_over_comments.groupBy('userid').agg(F.max('prediction').alias('predictions_over_users'))
predictions_over_users.show(5)

# Display the percentage of users who are cat or dog owners.
#print('%.2f% of users are cat or dog owner.' % (predictions_over_users.filter(F.col('predictions_over_users') == 1).count()/predictions_over_users.count()*100))
print(predictions_over_users.filter(F.col('predictions_over_users') == 1).count()/predictions_over_users.count()*100)
#investigate the reasons from the text
# 3 get insight of users
# First, select cat or dog owners from the dataset
cat_dog_owner = ((predictions_over_users.filter(F.col('predictions_over_users') == 1)).join(predictions_over_comments, ['userid'])).select('userid', 'comment', 'words','predictions_over_users','creator_name')
# Second, find the top 10 most popular words in cat and dog owners' comments.
# Common words such as 'and', 'I', 'you', and 'we' are excluded.
common_words = ['i', 'the', 'and', 'a', 'to', 'you', 'is', 'it', 'of', 'my',
               'that', 'in', 'so', 'for', 'have', 'this', 'your', 'are',
               'was', 'on', 'with', 'but', 'he', 'they', 'be', 'me',
               'just', 'do', 'all', 'one', 'not', 'what', 'im', 'if',
Example #39
0
    def any(self, axis: Union[int, str] = 0) -> bool:
        """
        Return whether any element is True.

        Returns False unless there at least one element within a series that is
        True or equivalent (e.g. non-zero or non-empty).

        Parameters
        ----------
        axis : {0 or 'index'}, default 0
            Indicate which axis or axes should be reduced.

            * 0 / 'index' : reduce the index, return a Series whose index is the
              original column labels.

        Examples
        --------
        >>> ks.Series([False, False]).any()
        False

        >>> ks.Series([True, False]).any()
        True

        >>> ks.Series([0, 0]).any()
        False

        >>> ks.Series([0, 1, 2]).any()
        True

        >>> ks.Series([False, False, None]).any()
        False

        >>> ks.Series([True, False, None]).any()
        True

        >>> ks.Series([]).any()
        False

        >>> ks.Series([np.nan]).any()
        False

        >>> df = ks.Series([True, False, None]).rename("a").to_frame()
        >>> df.set_index("a").index.any()
        True
        """
        axis = validate_axis(axis)
        if axis != 0:
            raise NotImplementedError(
                'axis should be either 0 or "index" currently.')

        sdf = self._internal._sdf.select(self.spark_column)
        col = scol_for(sdf, sdf.columns[0])

        # Note that we're ignoring `None`s here for now.
        # any and every was added as of Spark 3.0
        # ret = sdf.select(F.expr("any(CAST(`%s` AS BOOLEAN))" % sdf.columns[0])).collect()[0][0]
        # Here we use max as its alternative:
        ret = sdf.select(F.max(F.coalesce(col.cast("boolean"),
                                          F.lit(False)))).collect()[0][0]
        if ret is None:
            return False
        else:
            return ret
Example #40
0
# Note also that the alias function is a way of specifying the name of the column
# in the output 

avg_cost_by_animal = (
    recent_rescue.filter(
        recent_rescue.AnimalGroup.isin(
            "Horse", 
            "Goat", 
            "Cat", 
            "Bird"
        ))
    .groupBy("AnimalGroup")
    .agg(
        f.min('TotalCost').alias('Min'), 
        f.avg('TotalCost').alias('Mean'), 
        f.max('TotalCost').alias('Max'), 
        f.count('TotalCost').alias('Count'))
    .sort("Mean", ascending=False)
    .toPandas()
)
avg_cost_by_animal


#------------------------
## Joining Data
#------------------------

# Let's load in another data source that gives population by postcode, and join it
# onto the rescue data (a minimal sketch of the join follows below)

filepath = "/tmp/training/population_by_postcode.csv"
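# A minimal sketch of the load-and-join described above, assuming a `spark` session
# is in scope; the join key column name ("Postcode") is hypothetical and would need
# to match the columns actually present in the file.
population = spark.read.csv(filepath, header=True, inferSchema=True)
rescue_with_population = recent_rescue.join(population, on="Postcode", how="left")
rescue_with_population.show(5)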
Example #41
0
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

if __name__ == "__main__":
    spark = SparkSession.builder.master("local").appName("pyspark homework").getOrCreate()
    file_path = "hdfs:///dataset/bank-data.csv"
    df = spark.read.csv(path=file_path, header=True, inferSchema=True)

    df.groupBy("sex").agg(F.min("income"), F.max("income"), F.mean("income")).show()

    df.groupBy("region").agg({"income": "mean"}).show()
#import SQLContext and pyspark SQL functions

from pyspark.sql import SQLContext, Row
import pyspark.sql.functions as func
sqlContext = SQLContext(sc)

inputRDD = sc.textFile("/user/pravat/auctiondata.csv").map(lambda l: l.split(","))
auctions = inputRDD.map(lambda p:Row(auctionid=p[0], bid=float(p[1]), bidtime=float(p[2]), bidder=p[3], bidrate=int(p[4]), openbid=float(p[5]), price=float(p[6]), itemtype=p[7], dtl=int(p[8])))

# Infer the schema, and register the DataFrame as a table.
auctiondf = sqlContext.createDataFrame(auctions)
auctiondf.registerTempTable("auctions")

auctiondf.show()

auctiondf.printSchema()

totbids = auctiondf.count()
print totbids

totalauctions = auctiondf.select("auctionid").distinct().count()
print totalauctions

itemtypes = auctiondf.select("itemtype").distinct().count()
print itemtypes
auctiondf.groupBy("itemtype","auctionid").count().show()
auctiondf.groupBy("itemtype","auctionid").count().agg(func.min("count"), func.max("count"), func.avg("count")).show()
auctiondf.groupBy("itemtype", "auctionid").agg(func.min("bid"), func.max("bid"), func.avg("bid")).show()
auctiondf.filter(auctiondf.price>200).count()
xboxes = sqlContext.sql("SELECT auctionid, itemtype,bid,price,openbid FROM auctions WHERE itemtype = 'xbox'").show()
    # konsum_user_agg=konsum_user.groupBy('a_user_key').agg(sqlfuncs.max('reg_date').alias('reg_date'),\
    #     sqlfuncs.avg('age').alias('age'), sqlfuncs.max('gender').alias('gender'),sqlfuncs.max('date').alias('last_consume'),\
    #     sqlfuncs.min('date').alias('first_consume')  )
    # konsum_user_agg.registerTempTable('user_agg')
    #
    #
    # print(konsum_user_agg.first())
    # konsum_user_agg.write.save('/home/erlenda/data/konsum/a_users_parquet')
    #
    #
    #
    # #reg_late=konsum_user.filter(konsum_user.reg_date<datetime.datetime(2015,11,16,0,0))
    #
    pvs=konsum_user.groupBy('a_virtual','a_user_key','timegroup','device').agg(sqlfuncs.sum(konsum_user.pv).alias("pvs"),\
                                                  sqlfuncs.sum(konsum_user.pv_bet).alias("pvs_bet"),\
                                                  sqlfuncs.max('date').alias('last_consume'),\
                                                  sqlfuncs.min('date').alias('first_consume'),\
                                                  sqlfuncs.sum(konsum_user.visits).alias("visits"))

    pprint(pvs.take(10))
    print()
    #print(pvs.take(100)[55])
    pvs_tot1=pvs.agg(sqlfuncs.sum(pvs.pvs)).first()
    print('Total after basic aggregation',pvs_tot1)

    pvs_mapped=pvs.rdd.map(lambda x:((x.a_user_key,x.a_virtual), (Counter({literal_eval(x.timegroup):x.pvs}),\
        Counter({literal_eval(x.timegroup):1}),\
        x.pvs,\
        x.pvs_bet,\
        Counter({x.device:x.pvs}) ) )  )
# MAGIC %md ### Question: What is the difference between the revenue of a product and the revenue of the best selling product in the same category as this product?

# COMMAND ----------

import sys
from pyspark.sql.window import Window
import pyspark.sql.functions as func

# Window function partitioned by Category and ordered by Revenue
windowSpec = \
  Window \
    .partitionBy(df['category']) \
    .orderBy(df['revenue'].desc()) \
    .rangeBetween(-sys.maxsize, sys.maxsize)
    
# Create dataframe based on the productRevenue table    
dataFrame = sqlContext.table("productRevenue")

# Calculate the Revenue difference
revenue_difference = \
  (func.max(dataFrame['revenue']).over(windowSpec) - dataFrame['revenue'])
  
# Generate a new dataframe (original dataframe and the revenue difference)
revenue_diff = dataFrame.select(
  dataFrame['product'],
  dataFrame['category'],
  dataFrame['revenue'],
  revenue_difference.alias("revenue_difference"))

# Display revenue_diff
display(revenue_diff)
# A slightly different way to generate the two random columns
df = sqlContext.range(0, 10).withColumn('uniform', rand(seed=10)).withColumn('normal', randn(seed=27))
#df.describe().show()
display(df.describe())


# COMMAND ----------

#df.describe('uniform', 'normal').show()
display(df.describe('uniform', 'normal'))

# COMMAND ----------

from pyspark.sql.functions import mean, min, max
#df.select([mean('uniform'), min('uniform'), max('uniform')]).show()
display(df.select([mean('uniform'), min('uniform'), max('uniform')]))

# COMMAND ----------

# MAGIC %md ### Sample covariance and correlation
# MAGIC 
# MAGIC Covariance is a measure of how two variables change with respect to each other. A positive number would mean that there is a tendency that as one variable increases, the other increases as well. A negative number would mean that as one variable increases, the other variable has a tendency to decrease. The sample covariance of two columns of a DataFrame can be calculated as follows:

# COMMAND ----------

from pyspark.sql.functions import rand
df = sqlContext.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))
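# The covariance call itself is cut off in this excerpt; a minimal sketch matching the
# description above, using the same API shown earlier in this document:
df.stat.cov('rand1', 'rand2')
df.stat.cov('id', 'id')  # covariance of a column with itself equals its variance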


# COMMAND ----------
Example #46
0
 def gen_report_table(hc,curUnixDay):
     rows_indoor=sc.textFile("/data/indoor/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),seconds=int(p[4]),utoday=int(p[5]),ufirstday=int(p[6])))
     HiveContext.createDataFrame(hc,rows_indoor).registerTempTable("df_indoor")
     #ClientMac|etime|ltime|seconds|utoday|ENTITYID|UFIRSTDAY 
     sql="select entityid,clientmac,utoday,UFIRSTDAY,seconds,"
     sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  2505600 preceding) as day_30," # 2505600 is 29 days
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  518400 preceding)  as day_7," #518400 is 6 days
     sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY  range 1 preceding) as pre_mon "
     sql=sql+"from df_indoor order by entityid,clientmac,utoday" 
     df_id_stat=hc.sql(sql)
     df_id_mm=df_id_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
     #df_id_mm is the min/max df, used to calculate first arrival and last arrival
     df_id_stat_distinct=df_id_stat.drop("seconds").drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
     #distinct df is for lag function to work
     df_id_prepremon=df_id_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
     
     cond_id = [df_id_mm.clientmac == df_id_prepremon.clientmac, df_id_mm.entityid == df_id_prepremon.entityid, df_id_mm.UFIRSTDAY==df_id_prepremon.UFIRSTDAY]
     df_indoor_fin_tmp=df_id_mm.join(df_id_prepremon, cond_id, 'outer').select(df_id_mm.entityid,df_id_mm.clientmac,df_id_mm.utoday,df_id_mm.UFIRSTDAY,df_id_mm.seconds,df_id_mm.day_30,df_id_mm.day_7,df_id_mm.min,df_id_mm.max,df_id_mm.total_cnt,df_id_prepremon.prepre_mon)
     df_indoor_fin_tmp=df_indoor_fin_tmp.selectExpr("entityid as entityid","clientmac as  clientmac","utoday as utoday","UFIRSTDAY as ufirstday","seconds as secondsbyday","day_30 as indoors30","day_7 as indoors7","min as FirstIndoor","max as LastIndoor","total_cnt as indoors","prepre_mon as indoorsPrevMonth")
     
     #newly added part for indoors7 and indoors30 based on current date
     df_indoor_fin_tmp1= df_indoor_fin_tmp.withColumn("r_day_7", func.when((curUnixDay- df_indoor_fin_tmp.utoday)/86400<7 , 1).otherwise(0))
     df_indoor_fin_tmp2=df_indoor_fin_tmp1.withColumn("r_day_30", func.when((curUnixDay- df_indoor_fin_tmp1.utoday)/86400<30 , 1).otherwise(0))
     df_indoor_fin_tmp3=df_indoor_fin_tmp2.withColumn("r_indoors7",func.sum("r_day_7").over(Window.partitionBy("entityid","clientmac")))
     df_indoor_fin_tmp4=df_indoor_fin_tmp3.withColumn("r_indoors30",func.sum("r_day_30").over(Window.partitionBy("entityid","clientmac")))
     df_indoor_fin=df_indoor_fin_tmp4.drop("r_day_7").drop("r_day_30")
     hc.sql("drop table if exists df_indoor_fin")
     df_indoor_fin.write.saveAsTable("df_indoor_fin")
     
     rows_flow=sc.textFile("/data/flow/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),utoday=int(p[4]),ufirstday=int(p[5])))
     HiveContext.createDataFrame(hc,rows_flow).registerTempTable("df_flow")
     
     # ClientMac|ENTITYID|UFIRSTDAY|etime|ltime|utoday
     sql="select entityid,clientmac,utoday,UFIRSTDAY,"
     sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  2505600 preceding) as day_30," # 2505600 is 29 days
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  518400 preceding)  as day_7," #518400 is 6 days
     sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY  range 1 preceding) as pre_mon "
     sql=sql+"from df_flow order by entityid,clientmac,utoday" 
     df_fl_stat=hc.sql(sql)
     df_fl_mm=df_fl_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
     #df_fl_mm is the min/max df, used to calculate first arrival and last arrival
     df_fl_stat_distinct=df_fl_stat.drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
     #distinct df is for lag function to work
     df_fl_prepremon=df_fl_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
     
     cond_fl = [df_fl_mm.clientmac == df_fl_prepremon.clientmac, df_fl_mm.entityid == df_fl_prepremon.entityid, df_fl_mm.UFIRSTDAY==df_fl_prepremon.UFIRSTDAY]
     df_flow_fin=df_fl_mm.join(df_fl_prepremon, cond_fl, 'outer').select(df_fl_mm.entityid,df_fl_mm.clientmac,df_fl_mm.utoday,df_fl_mm.UFIRSTDAY,df_fl_mm.day_30,df_fl_mm.day_7,df_fl_mm.min,df_fl_mm.max,df_fl_mm.total_cnt,df_fl_prepremon.prepre_mon)
     df_flow_fin=df_flow_fin.selectExpr("entityid as entityid","clientmac as  clientmac","utoday as utoday","UFIRSTDAY as ufirstday","day_30 as visits30","day_7 as visits7","min as FirstVisit","max as LastVisit","total_cnt as visits","prepre_mon as visitsPrevMonth")
     hc.sql("drop table if exists df_flow_fin")
     df_flow_fin.write.saveAsTable("df_flow_fin") 
# COMMAND ----------

from pyspark.sql.functions import approx_count_distinct
df.select(approx_count_distinct("StockCode", 0.1)).show() # 3364


# COMMAND ----------

from pyspark.sql.functions import first, last
df.select(first("StockCode"), last("StockCode")).show()


# COMMAND ----------

from pyspark.sql.functions import min, max
df.select(min("Quantity"), max("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import sum
df.select(sum("Quantity")).show() # 5176450


# COMMAND ----------

from pyspark.sql.functions import sumDistinct
df.select(sumDistinct("Quantity")).show() # 29310


# COMMAND ----------
Example #48
0
    def doRenderMpld3(self, handlerId, figure, axes, keyFields, keyFieldValues, keyFieldLabels, valueFields, valueFieldValues):
        allNumericCols = self.getNumericalFieldNames()
        if len(allNumericCols) == 0:
            self._addHTML("Unable to find a numerical column in the dataframe")
            return
        
                 
        keyFields = self.options.get("keyFields")
        valueField = self.options.get("valueFields")

        if(keyFields==None and valueField==None):
            keyFields=self.getFirstStringColInfo()
            valueField=self.getFirstNumericalColInfo() 
        else:
            keyFields = keyFields.split(',') 
            valueField = valueField.split(',') 
            if(len(valueField) > 1):
                self._addHTML("You can enter only have one value field for Bar Charts (2-D)"+str(len(valueField)))
                return
            keyFields = keyFields[0]
            valueField=valueField[0]
        
                
        #if(len(valueFields>)):


    
        #init
        fig=figure
        ax=axes
        
        #fig, ax = plt.subplots()   
        #fig = plt.figure()
        

        params = plt.gcf()
        plSize = params.get_size_inches()
        params.set_size_inches( (plSize[0]*2, plSize[1]*2) )


        agg=self.options.get("aggregation")
        groupByCol=self.options.get("groupByCol")
        
        if (agg=="None" or agg==None):
            colLabel = keyFields
            y = self.entity.select(valueField).toPandas()[valueField].dropna().tolist()
            x_intv = np.arange(len(y))
            labels =  self.entity.select(keyFields).toPandas()[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel(valueField, fontsize=18)
        elif(agg=='AVG'):
            y1=self.entity.groupBy(keyFields).agg(F.avg(valueField).alias("avg")).toPandas().sort_values(by=keyFields)
            y=y1["avg"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("Average "+valueField, fontsize=18)
        elif(agg=='SUM'):
            y1=self.entity.groupBy(keyFields).agg(F.sum(valueField).alias("sum")).toPandas().sort_values(by=keyFields)
            y=y1["sum"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("sum "+valueField, fontsize=18)
        elif(agg=='MAX'):
            y1=self.entity.groupBy(keyFields).agg(F.max(valueField).alias("max")).toPandas().sort_values(by=keyFields)
            y=y1["max"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("max "+valueField, fontsize=18)
        elif(agg=='MIN'):
            y1=self.entity.groupBy(keyFields).agg(F.min(valueField).alias("min")).toPandas().sort_values(by=keyFields)
            y=y1["min"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("min "+valueField, fontsize=18)
        elif(agg=='COUNT'):
            y1=self.entity.groupBy(keyFields).agg(F.count(valueField).alias("count")).toPandas().sort_values(by=keyFields)
            y=y1["count"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("count "+valueField, fontsize=18)

        mpld3.enable_notebook()      
        plt.bar(x_intv,y,color="blue",alpha=0.5)
        ax_fmt = BarChart(labels)
        mpld3.plugins.connect(fig, ax_fmt)
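
# A minimal sketch (not part of the original handler) showing how the repeated
# AVG/SUM/MAX/MIN/COUNT branches above could be collapsed into a lookup table.
# `entity`, `keyFields`, `valueField` and `agg` stand in for the handler's state.
from pyspark.sql import functions as F

AGG_FUNCS = {"AVG": F.avg, "SUM": F.sum, "MAX": F.max, "MIN": F.min, "COUNT": F.count}

def aggregate_for_chart(entity, keyFields, valueField, agg):
    """Return (labels, values) for the bar chart, aggregating when requested."""
    if agg in AGG_FUNCS:
        pdf = entity.groupBy(keyFields) \
                    .agg(AGG_FUNCS[agg](valueField).alias("value")) \
                    .toPandas().sort_values(by=keyFields)
        return pdf[keyFields].dropna().tolist(), pdf["value"].dropna().tolist()
    # no aggregation requested: plot the raw column values
    pdf = entity.select(keyFields, valueField).toPandas()
    return pdf[keyFields].dropna().tolist(), pdf[valueField].dropna().tolist()
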
Example #49
0
File: queries.py Project: jiep/ABD
'''
Needed in order to use the to_date function
'''
from pyspark.sql.functions import *

df.select("*")\
  .where((to_date(df.CreationDate) ==
    df.select(
      min(
        to_date("CreationDate"))\
        .alias("min"))\
        .collect()[0].min) | (
        to_date(df.CreationDate) ==
          df.select(
            max(to_date("CreationDate"))\
            .alias("max"))\
            .collect()[0].max))\
  .orderBy(to_date("CreationDate"))\
  .show()
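
# A sketch (assuming the same `df` with a CreationDate column) that fetches the
# earliest and latest creation dates in one aggregation pass instead of two
# separate collect() calls; aliased imports avoid clashing with the * import above.
from pyspark.sql.functions import to_date, min as sql_min, max as sql_max

bounds = df.select(sql_min(to_date("CreationDate")).alias("first_day"),
                   sql_max(to_date("CreationDate")).alias("last_day")).collect()[0]

df.where((to_date(df.CreationDate) == bounds.first_day) |
         (to_date(df.CreationDate) == bounds.last_day))\
  .orderBy(to_date("CreationDate"))\
  .show()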

''' Comparing dates down to the millisecond '''

'''
Oldest user
'''
df.sort("CreationDate", ascending=False)\
  .limit(1)\
  .show()
'''
Most recent user
'''
    StructField("Likes", StringType(), True),
    StructField("RTs", StringType(), True),
    StructField("Hashtags", StringType(), True),
    StructField("UserMentionNames", StringType(), True),
    StructField("UserMentionID", StringType(), True),
    StructField("name", StringType(), True),
    StructField("Place", StringType(), True),
    StructField("Followers", IntegerType(), True),
    StructField("Friends", IntegerType(), True)
])
csvDF =( spark \
    .readStream \
    .schema(userSchema) \
    .option("delimiter", ";") \
    .option("maxFilesPerTrigger",1)\
    .csv(inputpath))

query2 = csvDF.select("name", "Followers", "Friends")
aria = query2.withColumn("FRRatio", query2.Followers / query2.Friends).select(
    "name", "FRRatio").groupBy("name").agg(max("FRRatio").alias("FRRatio"))
spar = aria.orderBy('FRRatio', ascending=False)

stream_query = spar.writeStream\
 .outputMode("complete")\
 .format("console")\
 .option("numRows", 1)\
 .queryName("counts1")\
 .start()
stream_query.awaitTermination(60)
stream_query.stop()
department_schema = StructType([
    StructField("dept_id", StringType(), True),
    StructField("dept_name", StringType(), True)
])
department_data = spark.sparkContext.textFile("C:\\data\\department.txt").map(
    lambda x: x.split(','))
department = spark.createDataFrame(department_data, department_schema)
department = department.select(
    department.dept_id.cast("int").alias('dept_id'), department.dept_name)
print(department.count())
department = department.dropDuplicates()
print(department.count())

department_people = people.where('dept_id in (10,20,30)').join(department, people.dept_id == department.dept_id) \
    .groupBy(people.name, people.age, department.dept_name).agg(F.max(department.dept_id).alias('dept_id'))
department_peopleSorted = department_people.orderBy(
    department_people.name, department_people.dept_id.desc())
department_peopleSorted = department_peopleSorted.dropDuplicates()

#print(department_peopleSorted.collect())

# department_peopleSorted.coalesce(1).write.mode("overwrite").save("C:\\data\\out_put")
#department_peopleSorted.coalesce(1).rdd.saveAsTextFile("file:///C:/data/out_put/")

department_peopleSorted.registerTempTable("test_table")
people.registerTempTable("people_table")

query = '''
select people.name,people.age,people_dept.dept_id from people_table people 
join test_table people_dept on people.dept_id = people_dept.dept_id
Example #52
0
# COMMAND ----------

flightData2015 = spark\
  .read\
  .option("inferSchema", "true")\
  .option("header", "true")\
  .csv("/mnt/enterprise/flightdata/2015-summary.csv")


# COMMAND ----------

flightData2015.sort("count").explain()

# COMMAND ----------

from pyspark.sql.functions import max

flightData2015.select(max("count")).take(1)

# COMMAND ----------

from pyspark.sql.functions import desc

flightData2015\
 .groupBy("DEST_COUNTRY_NAME")\
 .sum("count")\
 .withColumnRenamed("sum(count)", "destination_total")\
 .sort(desc("destination_total"))\
 .limit(5)\
 .show()
from pyspark import SparkConf,SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext, SparkSession, Window
from pyspark.sql.types import StringType, IntegerType, TimestampType, StructType, StructField
from pyspark.sql.functions import explode, split, max, rank, min, current_timestamp, expr

spark = SparkSession.builder.appName("Task1b").getOrCreate()

userSchema = StructType().add("id", "integer").add("lang", "string").add("date", "string").add("source", "string").add("len", "integer").add("likes", "integer").add("RTs", "string").add("hashtags", "string").add("umn", "string").add("umid", "string").add("name", "string").add("place", "string").add("followers", "integer").add("friends", "integer")

csvDF = spark.readStream.option("sep", ";").schema(userSchema).csv("hdfs://localhost:9000/stream")

#csvDF = spark.readStream.option("sep", ";").schema(userSchema).csv("/home/chaitra/data")

csvDF = csvDF.withColumn("ratio", csvDF.followers/csvDF.friends).where("followers != 0 and friends != 0")

q = csvDF.groupBy('name').agg(max('ratio').alias("FRRatio")).sort("FRRatio", ascending=False)

query = q.writeStream.outputMode("complete").format("console").option("numRows", 1).start()

query.awaitTermination(60)
query.stop()

def create_monthly_yearly_award(pargs, params):
    """
    Read in denormalized_awards and cntry_award_no data and create combined_yr_mo_awards data

    :inputs: denormalized_awards, cntry_award_no
    :outputs: combined_yr_mo_awards
    """

    df = spark.read.option("delimiter",
                           "|").csv(data_paths['denormalized_awards'],
                                    inferSchema=True)
    award_ref = spark.read.csv(data_paths[configs['cntry_award_no']], header=True, inferSchema=True) \
        .withColumnRenamed("awd_desc_{cntry}".format(cntry=configs["cntry"]), "awd_desc_cntry")

    # Rename columns
    orig_col = df.schema.names
    for i in range(0, len(orig_col)):
        df = df.withColumnRenamed(orig_col[i], configs["de_awd_col_names"][i])

    # Filter by country and create award rank column
    rnk_scores = configs["rnk_scores"]
    monthly_awards1 = df.filter(F.col("cntry_key_no").isin(run['cntry_key_no'])) \
        .withColumn("awd_rnk_no",
                    F.when(df.CROWN_PLUS_FLG == 1, rnk_scores["CROWN_PLUS_FLG"])
                    .when(df.F_TRIPLE_DIA_FLG == 1, rnk_scores["F_TRIPLE_DIA_FLG"])
                    .when(df.TRIPLE_DIA_FLG == 1, rnk_scores["TRIPLE_DIA_FLG"])
                    .when(df.F_DOUBLE_DIA_FLG == 1, rnk_scores["F_DOUBLE_DIA_FLG"])
                    .when(df.DOUBLE_DIA_FLG == 1, rnk_scores["DOUBLE_DIA_FLG"])
                    .when(df.F_EXEC_DIA_FLG == 1, rnk_scores["F_EXEC_DIA_FLG"])
                    .when(df.EXEC_DIA_FLG == 1, rnk_scores["EXEC_DIA_FLG"])
                    .when(df.F_DIA_FLG == 1, rnk_scores["F_DIA_FLG"])
                    .when(df.DIA_FLG == 1, rnk_scores["DIA_FLG"])
                    .when(df.F_EMRLD_FLG == 1, rnk_scores["F_EMRLD_FLG"])
                    .when(df.EMRLD_FLG == 1, rnk_scores["EMRLD_FLG"])
                    .when(df.F_SAPPHIRE_FLG == 1, rnk_scores["F_SAPPHIRE_FLG"])
                    .when(df.SAPPHIRE_FLG == 1, rnk_scores["SAPPHIRE_FLG"])
                    .when(df.PEARL_FLG == 1, rnk_scores["PEARL_FLG"])
                    .when(df.F_PLAT_FLG == 1, rnk_scores["F_PLAT_FLG"])
                    .when(df.RUBY_FLG == 1, rnk_scores["RUBY_FLG"])
                    .when(df.PLAT_FLG == 1, rnk_scores["PLAT_FLG"])
                    .when(df.GOLD_FLG == 1, rnk_scores["GOLD_FLG"])
                    .when(df.SILVER_FLG == 1, rnk_scores["SILVER_FLG"])
                    .otherwise(F.lit(None))
                    ).select('imc_key_no', 'mo_yr_key_no', 'cntry_key_no', 'awd_rnk_no')

    # Merge two tables
    monthly_awards2 = monthly_awards1.withColumn("month", expr("substring(mo_yr_key_no, length(mo_yr_key_no)-1, length(mo_yr_key_no))").cast('int')) \
        .withColumn("year", expr("substring(mo_yr_key_no, 1, 4)").cast('int'))

    monthly_awards3 = monthly_awards2.withColumn("perf_yr",
                                                 when(monthly_awards2.month >= configs["first_month_of_perf_yr"], monthly_awards2.year + 1).otherwise(monthly_awards2.year)) \
        .select('imc_key_no', 'mo_yr_key_no', 'cntry_key_no', 'perf_yr', 'awd_rnk_no')

    monthly_awards = monthly_awards3.join(award_ref, monthly_awards3.awd_rnk_no == award_ref.cur_awd_awd_rnk_no, 'left') \
        .select('imc_key_no', 'mo_yr_key_no', 'cntry_key_no', 'perf_yr', 'awd_rnk_no', 'awd_desc_cntry').withColumnRenamed("awd_desc_cntry", "i_mthly_awd_cd") \
        .withColumnRenamed("awd_rnk_no", "i_mthly_awd_rnk_no")

    yearly_awards1 = monthly_awards.groupBy(
        'imc_key_no', 'cntry_key_no',
        'perf_yr').agg(F.max("i_mthly_awd_rnk_no").alias("i_yrly_awd_rnk_no"))
    yearly_awards = yearly_awards1.join(award_ref, yearly_awards1.i_yrly_awd_rnk_no == award_ref.cur_awd_awd_rnk_no, 'left')\
        .select('imc_key_no', 'perf_yr', 'awd_desc_cntry', 'i_yrly_awd_rnk_no').withColumnRenamed("awd_desc_cntry", "i_yrly_awd_cd")\
        .withColumnRenamed("awd_desc_cntry","i_yrly_awd_cd").withColumnRenamed("imc_key_no", "imc_key_no_yr").withColumnRenamed("perf_yr", "perf_yr_yr")

    combined_awards = monthly_awards.join(yearly_awards, (monthly_awards.imc_key_no == yearly_awards.imc_key_no_yr) & (monthly_awards.perf_yr == yearly_awards.perf_yr_yr), 'left') \
        .select('imc_key_no', 'mo_yr_key_no', 'cntry_key_no', 'perf_yr', 'i_mthly_awd_cd', 'i_yrly_awd_cd', 'i_mthly_awd_rnk_no', 'i_yrly_awd_rnk_no')

    combined_awards = combined_awards.withColumn(
        'mo_yr_key_no',
        combined_awards.mo_yr_key_no.cast('string')).withColumn(
            'imc_key_no', combined_awards.imc_key_no.cast('string'))
    combined_awards = combined_awards.withColumn(
        'mo_yr_key_no',
        to_timestamp(combined_awards.mo_yr_key_no, 'yyyyMM')).withColumn(
            'mo_yr_key_no', date_format('mo_yr_key_no', 'yyyy-MM-dd'))

    # Write final result
    combined_awards.write.parquet(data_paths['combined_yr_mo_awards'].format(
        run_mode=run['run_mode'], run_id=run['run_id']),
                                  mode='overwrite')
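
# A sketch (not from the original pipeline) of building the long F.when() chain
# above from an ordered list of flag column names; `rnk_scores` is assumed to map
# each flag name to its rank score, and the list order sets the precedence.
from pyspark.sql import functions as F

def award_rank_column(flag_names_in_order, rnk_scores):
    """First flag equal to 1 wins; returns NULL when no flag is set."""
    col_expr = F.lit(None)
    # build from the lowest-precedence flag outwards so the first name checked wins
    for name in reversed(flag_names_in_order):
        col_expr = F.when(F.col(name) == 1, F.lit(rnk_scores[name])).otherwise(col_expr)
    return col_expr

# e.g. df.withColumn("awd_rnk_no", award_rank_column(list(rnk_scores), rnk_scores))
# (assumes the rnk_scores dict is already ordered from highest to lowest award)
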
Example #55
0
# In[ ]:

sqlCtx.sql("select program,avg(age) AS AverageAge FROM st GROUP BY program").show()


# In[ ]:




# In[ ]:

from pyspark.sql import functions as funcs

AvgMin = students.groupBy('program').agg(funcs.avg('age').alias('AverageAge'), funcs.max('age').alias('MaximumAge'))

AvgMin.show()


# In[ ]:




# #How the queries are optimized

# In[ ]:

sqlCtx.sql("select name, program FROM st").explain()
def create_classroom_feature(pargs, params):
    """
    :inputs: download_df, browse_df, share_df, search_df, fav_df
    :outputs: classroom_data
    """

    download_df = spark.read.parquet(data_paths['download_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))
    browse_df = spark.read.parquet(data_paths['browse_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))
    share_df = spark.read.parquet(data_paths['share_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))
    search_df = spark.read.parquet(data_paths['search_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))
    fav_df = spark.read.parquet(data_paths['fav_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))

    fav_df2 = fav_df.withColumn("date",
                                to_date(fav_df.CRTIME, 'yyyy/MM/dd HH:mm:ss'))
    fav_df2 = fav_df2.withColumn(
        "MONTH_tmp", F.from_unixtime(F.unix_timestamp(fav_df2.date, "yyyyMM")))
    fav_df2 = fav_df2.withColumn(
        "MONTH",
        F.concat(expr("substring(MONTH_tmp, 1, 4)"),
                 expr("substring(MONTH_tmp, 6, 2)")))
    fav_df3 = fav_df2.withColumn(
        "ADJ_USERID", expr("substring(USERNAME, 1, length(USERNAME)-2)"))
    fav_df3 = fav_df3.withColumn(
        "ADJ_USERID", expr("substring(ADJ_USERID, 4, length(ADJ_USERID))"))
    fav = fav_df3.withColumn("ADJ_USERID",
                             regexp_replace(F.col("ADJ_USERID"), "^0*", ""))
    fav = fav.groupby(['ADJ_USERID', 'MONTH']).count()
    fav = fav.withColumnRenamed("count", "num_fav")

    download_df2 = download_df.withColumn(
        "date", to_date(download_df.CRTIME, 'yyyy/MM/dd HH:mm:ss'))
    download_df2 = download_df2.withColumn(
        "MONTH_tmp",
        F.from_unixtime(F.unix_timestamp(download_df2.date, "yyyyMM")))
    download_df2 = download_df2.withColumn(
        "MONTH",
        F.concat(expr("substring(MONTH_tmp, 1, 4)"),
                 expr("substring(MONTH_tmp, 6, 2)")))
    download_df3 = download_df2.withColumn(
        "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)"))
    download_df4 = download_df3.withColumn(
        "ADJ_USERID", regexp_replace(F.col("ADJ_USERID"), "^0*", ""))
    download = download_df4.groupby(['ADJ_USERID', 'MONTH']).count()
    download = download.withColumnRenamed("count", "num_" + "download")

    browse_df2 = browse_df.withColumn(
        "date", to_date(browse_df.CRTIME, 'yyyy/MM/dd HH:mm:ss'))
    browse_df2 = browse_df2.withColumn(
        "MONTH_tmp",
        F.from_unixtime(F.unix_timestamp(browse_df2.date, "yyyyMM")))
    browse_df2 = browse_df2.withColumn(
        "MONTH",
        F.concat(expr("substring(MONTH_tmp, 1, 4)"),
                 expr("substring(MONTH_tmp, 6, 2)")))
    browse_df3 = browse_df2.withColumn(
        "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)"))
    browse_df4 = browse_df3.withColumn(
        "ADJ_USERID", regexp_replace(F.col("ADJ_USERID"), "^0*", ""))
    browse = browse_df4.groupby(['ADJ_USERID', 'MONTH']).count()
    browse = browse.withColumnRenamed("count", "num_" + "browse")

    share_df2 = share_df.withColumn(
        "date", to_date(share_df.CRTIME, 'yyyy/MM/dd HH:mm:ss'))
    share_df2 = share_df2.withColumn(
        "MONTH_tmp", F.from_unixtime(F.unix_timestamp(share_df2.date,
                                                      "yyyyMM")))
    share_df2 = share_df2.withColumn(
        "MONTH",
        F.concat(expr("substring(MONTH_tmp, 1, 4)"),
                 expr("substring(MONTH_tmp, 6, 2)")))
    share_df3 = share_df2.withColumn(
        "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)"))
    share_df4 = share_df3.withColumn(
        "ADJ_USERID", regexp_replace(F.col("ADJ_USERID"), "^0*", ""))
    share = share_df4.groupby(['ADJ_USERID', 'MONTH']).count()
    share = share.withColumnRenamed("count", "num_" + "share")

    search_df2 = search_df.withColumn(
        "date", to_date(search_df.CRTIME, 'yyyy/MM/dd HH:mm:ss'))
    search_df2 = search_df2.withColumn(
        "MONTH_tmp",
        F.from_unixtime(F.unix_timestamp(search_df2.date, "yyyyMM")))
    search_df2 = search_df2.withColumn(
        "MONTH",
        F.concat(expr("substring(MONTH_tmp, 1, 4)"),
                 expr("substring(MONTH_tmp, 6, 2)")))
    search_df3 = search_df2.withColumn(
        "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)"))
    search_df4 = search_df3.withColumn(
        "ADJ_USERID", regexp_replace(F.col("ADJ_USERID"), "^0*", ""))
    search = search_df4.groupby(['ADJ_USERID', 'MONTH']).count()
    search = search.withColumnRenamed("count", "num_" + "search")

    data = [("2013-01-01", str(datetime.date.today()))]

    df = spark.createDataFrame(data, ["minDate", "maxDate"])

    df = df.withColumn("monthsDiff", F.months_between("maxDate", "minDate")) \
        .withColumn("repeat", F.expr("split(repeat(',', monthsDiff), ',')")) \
        .select("*", F.posexplode("repeat").alias("date", "val")) \
        .withColumn("date", F.expr("add_months(minDate, date)")) \
        .select('date')
    df = df.withColumn(
        "MONTH", F.from_unixtime(F.unix_timestamp(F.col("date")),
                                 "yyyyMM")).select('MONTH')
    unique_id = download.select('ADJ_USERID').distinct() \
        .union(browse.select('ADJ_USERID').distinct()) \
        .union(share.select('ADJ_USERID').distinct()) \
        .union(search.select('ADJ_USERID').distinct()) \
        .union(fav.select('ADJ_USERID').distinct())
    unique_id = unique_id.distinct()
    all_abo_month = unique_id.crossJoin(df)
    combine = download.select(['ADJ_USERID', 'MONTH']).union(browse.select(['ADJ_USERID', 'MONTH'])) \
        .union(share.select(['ADJ_USERID', 'MONTH'])) \
        .union(search.select(['ADJ_USERID', 'MONTH'])) \
        .union(fav.select(['ADJ_USERID', 'MONTH']))
    min_max_date = combine.groupby("ADJ_USERID").agg(F.min("MONTH"),
                                                     F.max("MONTH"))
    all_abo_month = all_abo_month.join(
        min_max_date,
        all_abo_month.ADJ_USERID == min_max_date.ADJ_USERID,
        how='left').drop(min_max_date.ADJ_USERID)
    all_abo_month = all_abo_month.filter(F.col("MONTH") >= F.col("min(MONTH)"))
    all_abo_month = all_abo_month.filter(F.col("MONTH") <= F.col("max(MONTH)"))

    all_abo_month = all_abo_month.select(["ADJ_USERID", "MONTH"])

    download = all_abo_month.join(download, ['ADJ_USERID', 'MONTH'],
                                  'left').na.fill(0)
    for n in range(1, 12):
        download = download.withColumn('num_' + "download" + str(n), F.lag(download['num_' + "download"], n, 0) \
                                       .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH")))
    download = download.withColumn(
        "n_lag_currentyr_" + "download" + "_sum_3m",
        download['num_' + "download"] + download['num_' + "download" + "1"] +
        download['num_' + "download" + "2"])
    download = download.withColumn(
        "n_lag_currentyr_" + "download" + "_sum_6m",
        download["n_lag_currentyr_" + "download" + "_sum_3m"] +
        download['num_' + "download" + "3"] +
        download['num_' + "download" + "4"] +
        download['num_' + "download" + "5"])
    download = download.withColumn(
        "n_lag_currentyr_" + "download" + "_sum_9m",
        download["n_lag_currentyr_" + "download" + "_sum_6m"] +
        download['num_' + "download" + "6"] +
        download['num_' + "download" + "7"] +
        download['num_' + "download" + "8"])
    download = download.withColumn(
        "n_lag_currentyr_" + "download" + "_sum_12m",
        download["n_lag_currentyr_" + "download" + "_sum_9m"] +
        download['num_' + "download" + "9"] +
        download['num_' + "download" + "10"] +
        download['num_' + "download" + "11"])
    droplist = []
    for n in range(1, 12):
        droplist = droplist + ['num_' + "download" + str(n)]
    download = download.drop(*droplist)

    browse = all_abo_month.join(browse, ['ADJ_USERID', 'MONTH'],
                                'left').na.fill(0)
    for n in range(1, 12):
        browse = browse.withColumn('num_' + "browse" + str(n), F.lag(browse['num_' + "browse"], n, 0) \
                                   .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH")))
    browse = browse.withColumn(
        "n_lag_currentyr_" + "browse" + "_sum_3m", browse['num_' + "browse"] +
        browse['num_' + "browse" + "1"] + browse['num_' + "browse" + "2"])
    browse = browse.withColumn(
        "n_lag_currentyr_" + "browse" + "_sum_6m",
        browse["n_lag_currentyr_" + "browse" + "_sum_3m"] +
        browse['num_' + "browse" + "3"] + browse['num_' + "browse" + "4"] +
        browse['num_' + "browse" + "5"])
    browse = browse.withColumn(
        "n_lag_currentyr_" + "browse" + "_sum_9m",
        browse["n_lag_currentyr_" + "browse" + "_sum_6m"] +
        browse['num_' + "browse" + "6"] + browse['num_' + "browse" + "7"] +
        browse['num_' + "browse" + "8"])
    browse = browse.withColumn(
        "n_lag_currentyr_" + "browse" + "_sum_12m",
        browse["n_lag_currentyr_" + "browse" + "_sum_9m"] +
        browse['num_' + "browse" + "9"] + browse['num_' + "browse" + "10"] +
        browse['num_' + "browse" + "11"])
    droplist = []
    for n in range(1, 12):
        droplist = droplist + ['num_' + "browse" + str(n)]
    browse = browse.drop(*droplist)

    share = all_abo_month.join(share, ['ADJ_USERID', 'MONTH'],
                               'left').na.fill(0)
    for n in range(1, 12):
        share = share.withColumn('num_' + "share" + str(n), F.lag(share['num_' + "share"], n, 0) \
                                 .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH")))
    share = share.withColumn(
        "n_lag_currentyr_" + "share" + "_sum_3m", share['num_' + "share"] +
        share['num_' + "share" + "1"] + share['num_' + "share" + "2"])
    share = share.withColumn(
        "n_lag_currentyr_" + "share" + "_sum_6m",
        share["n_lag_currentyr_" + "share" + "_sum_3m"] +
        share['num_' + "share" + "3"] + share['num_' + "share" + "4"] +
        share['num_' + "share" + "5"])
    share = share.withColumn(
        "n_lag_currentyr_" + "share" + "_sum_9m",
        share["n_lag_currentyr_" + "share" + "_sum_6m"] +
        share['num_' + "share" + "6"] + share['num_' + "share" + "7"] +
        share['num_' + "share" + "8"])
    share = share.withColumn(
        "n_lag_currentyr_" + "share" + "_sum_12m",
        share["n_lag_currentyr_" + "share" + "_sum_9m"] +
        share['num_' + "share" + "9"] + share['num_' + "share" + "10"] +
        share['num_' + "share" + "11"])
    droplist = []
    for n in range(1, 12):
        droplist = droplist + ['num_' + "share" + str(n)]
    share = share.drop(*droplist)

    search = all_abo_month.join(search, ['ADJ_USERID', 'MONTH'],
                                'left').na.fill(0)
    for n in range(1, 12):
        search = search.withColumn('num_' + "search" + str(n), F.lag(search['num_' + "search"], n, 0) \
                                   .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH")))
    search = search.withColumn(
        "n_lag_currentyr_" + "search" + "_sum_3m", search['num_' + "search"] +
        search['num_' + "search" + "1"] + search['num_' + "search" + "2"])
    search = search.withColumn(
        "n_lag_currentyr_" + "search" + "_sum_6m",
        search["n_lag_currentyr_" + "search" + "_sum_3m"] +
        search['num_' + "search" + "3"] + search['num_' + "search" + "4"] +
        search['num_' + "search" + "5"])
    search = search.withColumn(
        "n_lag_currentyr_" + "search" + "_sum_9m",
        search["n_lag_currentyr_" + "search" + "_sum_6m"] +
        search['num_' + "search" + "6"] + search['num_' + "search" + "7"] +
        search['num_' + "search" + "8"])
    search = search.withColumn(
        "n_lag_currentyr_" + "search" + "_sum_12m",
        search["n_lag_currentyr_" + "search" + "_sum_9m"] +
        search['num_' + "search" + "9"] + search['num_' + "search" + "10"] +
        search['num_' + "search" + "11"])
    droplist = []
    for n in range(1, 12):
        droplist = droplist + ['num_' + "search" + str(n)]
    search = search.drop(*droplist)

    fav = all_abo_month.join(fav, ['ADJ_USERID', 'MONTH'], 'left').na.fill(0)
    for n in range(1, 12):
        fav = fav.withColumn('num_' + "fav" + str(n), F.lag(fav['num_' + "fav"], n, 0) \
                             .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH")))
    fav = fav.withColumn(
        "n_lag_currentyr_" + "fav" + "_sum_3m", fav['num_' + "fav"] +
        fav['num_' + "fav" + "1"] + fav['num_' + "fav" + "2"])
    fav = fav.withColumn(
        "n_lag_currentyr_" + "fav" + "_sum_6m",
        fav["n_lag_currentyr_" + "fav" + "_sum_3m"] +
        fav['num_' + "fav" + "3"] + fav['num_' + "fav" + "4"] +
        fav['num_' + "fav" + "5"])
    fav = fav.withColumn(
        "n_lag_currentyr_" + "fav" + "_sum_9m",
        fav["n_lag_currentyr_" + "fav" + "_sum_6m"] +
        fav['num_' + "fav" + "6"] + fav['num_' + "fav" + "7"] +
        fav['num_' + "fav" + "8"])
    fav = fav.withColumn(
        "n_lag_currentyr_" + "fav" + "_sum_12m",
        fav["n_lag_currentyr_" + "fav" + "_sum_9m"] +
        fav['num_' + "fav" + "9"] + fav['num_' + "fav" + "10"] +
        fav['num_' + "fav" + "11"])
    droplist = []
    for n in range(1, 12):
        droplist = droplist + ['num_' + "fav" + str(n)]
    fav = fav.drop(*droplist)

    classroom_data = all_abo_month.join(
        download, ['ADJ_USERID', 'MONTH'],
        'left').join(browse, ['ADJ_USERID', 'MONTH'],
                     'left').join(share, ['ADJ_USERID', 'MONTH'], 'left').join(
                         search, ['ADJ_USERID', 'MONTH'],
                         'left').join(fav, ['ADJ_USERID', 'MONTH'],
                                      'left').na.fill(0)
    classroom_data = classroom_data.withColumnRenamed("ADJ_USERID", "imc_no")
    classroom_data = classroom_data.withColumnRenamed("MONTH", "mo_yr_key_no")

    df = classroom_data
    df = df.withColumn('mo_yr_key_no', df.mo_yr_key_no.cast('string'))
    df = df.withColumn('mo_yr_key_no', to_timestamp(df.mo_yr_key_no, 'yyyyMM'))
    df = df.withColumn('mo_yr_key_no', date_format('mo_yr_key_no',
                                                   'yyyy-MM-dd'))
    classroom_data = df

    print("now saving the data")
    classroom_data.write.parquet(data_paths['classroom_data'].format(
        run_mode=run['run_mode'], run_id=run['run_id']),
                                 mode='overwrite')
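
# A sketch (not part of the original job) of a helper that factors out the repeated
# download/browse/share/search/fav lag-sum pattern above; it assumes a DataFrame
# with ADJ_USERID, MONTH and a num_<activity> column, as built in the function.
from functools import reduce
from pyspark.sql import functions as F, Window

def add_rolling_sums(df, activity, window_sizes=(3, 6, 9, 12)):
    """Add n_lag_currentyr_<activity>_sum_<k>m columns from lagged monthly counts."""
    w = Window.partitionBy("ADJ_USERID").orderBy("MONTH")
    base = "num_" + activity
    lag_cols = []
    for n in range(1, max(window_sizes)):
        name = base + str(n)
        df = df.withColumn(name, F.lag(df[base], n, 0).over(w))
        lag_cols.append(name)
    for k in window_sizes:
        parts = [df[base]] + [df[base + str(n)] for n in range(1, k)]
        df = df.withColumn("n_lag_currentyr_%s_sum_%dm" % (activity, k),
                           reduce(lambda a, b: a + b, parts))
    return df.drop(*lag_cols)

# e.g. download = add_rolling_sums(all_abo_month.join(download, ['ADJ_USERID', 'MONTH'], 'left').na.fill(0), "download")
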
from pyspark.sql import functions as F

#Creating data frame from list
data = [('John', 'Smith', 47),('Jane', 'Smith', 22), ('Frank', 'Jones', 28)]
schema = ['fname', 'lname', 'age']
df = sqlContext.createDataFrame(data, schema)
df

#Retrieving contents of data frame
df.printSchema()
df.show()
df.first()
df.count()

#Adding columns
df = df.withColumn('salary', F.lit(0))
df.show()
df.withColumn('salary2', df['age'] * 100).show()

#Filtering and subsetting 
df.filter(df['age'] > 30).select('fname','age').show()
df.select(F.max('age').alias('max-age')).show()

#Grouped aggregations
df.groupBy('lname').max('age').show()
df.groupBy('lname').agg(F.avg('age').alias('avg-age'), F.min('age'), F.max('age')).show()
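
#As an aside (not in the original snippet): agg() also accepts a column-to-function
#mapping, a compact alternative to listing the functions explicitly
df.groupBy('lname').agg({'age': 'avg', 'salary': 'max'}).show()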


def wechat_cloudcommerce(pargs, params):
    """
    wechat cloud commerce (yungou) feature engineering including search,browse,order placement, product purchased
    :inputs: wechat_miniprogram_input
    :outputs: wechat_cloudcommerce
    """

    wechat_mini = spark.read.option("delimiter", "\t").option(
        "header",
        "true").option("encoding",
                       "UTF-8").csv(data_paths['wechat_miniprogram_input'])

    wechat_mini = wechat_mini.withColumn('time',
                                         to_timestamp('时间戳', 'yyyy-MM-dd'))
    wechat_mini = wechat_mini.withColumn('month',
                                         to_timestamp('时间戳', 'yyyy-MM'))
    wechat_mini = wechat_mini.withColumn('month',
                                         date_format('month', 'yyyyMM'))

    wechat_mini2 = wechat_mini.withColumnRenamed('事件类型', 'event_type') \
        .withColumnRenamed('时间戳', 'timestamp') \
        .withColumnRenamed('诸葛id', 'trip_id') \
        .withColumnRenamed('事件id', 'event_id') \
        .withColumnRenamed('事件名', 'event_name') \
        .withColumnRenamed('商品id', 'product_id') \
        .withColumnRenamed('商品名称', 'product_name') \
        .withColumnRenamed('搜索词', 'search_word')

    # clean up imc_no
    wechat_mini3 = wechat_mini2.withColumn("leading360",
                                           expr("substring(amwayid, 1, 3)"))
    wechat_mini4 = wechat_mini3.withColumn(
        "ADJ_USERID",
        when(
            F.col("leading360") == "360",
            expr("substring(amwayid, 4, length(amwayid)-2)")).otherwise(
                F.col("amwayid")))
    wechat_mini5 = wechat_mini4.withColumn(
        "imc_no", regexp_replace(F.col("ADJ_USERID"), "^0*", ""))
    wechat_mini_all = wechat_mini5.withColumn(
        "imc_no",
        when(
            F.col("leading360") == "360",
            expr("substring(imc_no, 1, length(imc_no)-2)")).otherwise(
                F.col("imc_no")))

    # browse
    wechat_mini_browse = wechat_mini_all.where((F.col("event_type") == '页面浏览'))
    wechat_mini_browse2 = wechat_mini_browse.groupBy('imc_no', 'month').agg(
        F.count("event_id").alias("n_num_cloudcommerce_browse"))

    # search
    wechat_mini_search = wechat_mini_all.where((F.col("event_type") == '站内搜索'))
    wechat_mini_search2 = wechat_mini_search.groupBy('imc_no', 'month').agg(
        F.count("event_id").alias("n_num_cloudcommerce_search"))

    # order
    wechat_mini_order = wechat_mini_all.where(
        (F.col("event_name") == '小程序_订单确认'))
    wechat_mini_order2 = wechat_mini_order.groupBy('imc_no', 'month').agg(
        F.count("event_id").alias("n_num_cloudcommerce_order"))

    # cart
    purchase_trip = wechat_mini_order.select('trip_id').distinct()
    wechat_mini_cart = wechat_mini_all.join(
        purchase_trip, 'trip_id', 'inner').where(
            (F.col("event_type") == '商品加购'))
    wechat_mini_cart2 = wechat_mini_cart.groupBy(
        'imc_no', 'month', 'trip_id').agg(
            F.count("product_id").alias(
                "n_num_cloudcommerce_product_per_cart"))
    wechat_mini_cart3 = wechat_mini_cart2.groupBy('imc_no', 'month').agg(
        F.avg("n_num_cloudcommerce_product_per_cart").alias(
            "n_num_cloudcommerce_product_per_cart"))

    # all abo and month combination
    unique_id = wechat_mini_all.select('imc_no').distinct()
    month = wechat_mini_all.select('month').distinct()
    all_abo_month = unique_id.crossJoin(month)
    min_max_date = wechat_mini_all.groupby("imc_no").agg(
        F.min("month"), F.max("month"))
    all_abo_month = all_abo_month.join(
        min_max_date, all_abo_month.imc_no == min_max_date.imc_no,
        how='left').drop(min_max_date.imc_no)
    all_abo_month = all_abo_month.filter(F.col("month") >= F.col("min(month)"))
    all_abo_month = all_abo_month.filter(F.col("month") <= F.col("max(month)"))

    # join everything together
    combine1 = all_abo_month.join(wechat_mini_browse2, ['imc_no', 'month'],
                                  'left').na.fill(0)
    combine2 = combine1.join(wechat_mini_search2, ['imc_no', 'month'],
                             'left').na.fill(0)
    combine3 = combine2.join(wechat_mini_order2, ['imc_no', 'month'],
                             'left').na.fill(0)
    combine4 = combine3.join(wechat_mini_cart3, ['imc_no', 'month'],
                             'left').na.fill(0)

    # create lag features
    combine = combine4.withColumnRenamed("month", "mo_yr_key_no")
    feature_list = [
        'n_num_cloudcommerce_browse', 'n_num_cloudcommerce_search',
        'n_num_cloudcommerce_order', 'n_num_cloudcommerce_product_per_cart'
    ]
    lag_features = configs["lag_features"]

    for feature in feature_list:
        for lag_mo in lag_features:
            for lag in range(0, lag_mo):
                colname = feature + "_" + str(lag)
                feature_col = feature + "_sum_" + str(lag_mo) + "m"
                combine = combine.withColumn(
                    colname,
                    F.lag(combine[feature], lag).over(
                        Window.partitionBy("imc_no").orderBy("mo_yr_key_no")))
                if lag == 0:
                    combine = combine.withColumn(feature_col, combine[colname])
                else:
                    combine = combine.withColumn(
                        feature_col, combine[feature_col] + combine[colname])

    main_col = ['imc_no', 'mo_yr_key_no']
    selected_feature = []
    for feature in feature_list:
        for lag_mo in lag_features:
            feature_col = feature + "_sum_" + str(lag_mo) + "m"
            selected_feature.append(feature_col)

    selected_feature = main_col + feature_list + selected_feature
    wechat_cloudcommerce = combine.select(selected_feature)

    wechat_formatting = wechat_cloudcommerce
    wechat_formatting = wechat_formatting.withColumn(
        'mo_yr_key_no', wechat_formatting.mo_yr_key_no.cast('string'))
    # wechat_formatting = wechat_formatting.withColumn('imc_no',wechat_formatting.imc_no.cast('string'))
    wechat_formatting = wechat_formatting.withColumn(
        'mo_yr_key_no', to_timestamp(wechat_formatting.mo_yr_key_no, 'yyyyMM'))
    wechat_formatting = wechat_formatting.withColumn(
        'mo_yr_key_no', date_format('mo_yr_key_no', 'yyyy-MM-dd'))
    wechat_cloudcommerce = wechat_formatting

    wechat_cloudcommerce.write.parquet(
        data_paths['wechat_cloudcommerce'].format(run_mode=run['run_mode'],
                                                  run_id=run['run_id']),
        mode='overwrite')
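
# A sketch (not part of the original job) of the same rolling sums expressed with a
# rowsBetween window instead of the explicit lag loop above; it assumes one row per
# imc_no and month, as produced by the cross join in the function.
from pyspark.sql import functions as F, Window

def rolling_sum(df, feature, months, id_col="imc_no", time_col="mo_yr_key_no"):
    """Sum `feature` over the current month and the previous `months - 1` months."""
    w = Window.partitionBy(id_col).orderBy(time_col).rowsBetween(-(months - 1), 0)
    return df.withColumn("%s_sum_%dm" % (feature, months), F.sum(feature).over(w))

# e.g. combine = rolling_sum(combine, "n_num_cloudcommerce_browse", 3)
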
# COMMAND ----------

# MAGIC %md
# MAGIC 
# MAGIC Alternatively, we can use SQL to directly calculate these statistics.  You can explore the many useful functions within the `pyspark.sql.functions` module in the [documentation](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions).
# MAGIC 
# MAGIC After we apply the `.agg()` function, we call `.first()` to extract the first value, which is equivalent to `.take(1)[0]`.

# COMMAND ----------

from pyspark.sql import functions as sqlFunctions
contentSizeStats =  (logs_df
                     .agg(sqlFunctions.min(logs_df['content_size']),
                          sqlFunctions.avg(logs_df['content_size']),
                          sqlFunctions.max(logs_df['content_size']))
                     .first())

print('Using SQL functions:')
print('Content Size Avg: {1:,.2f}; Min: {0:.2f}; Max: {2:,.0f}'.format(*contentSizeStats))

# COMMAND ----------

# MAGIC %md
# MAGIC ### (3b) Example: HTTP Status Analysis
# MAGIC 
# MAGIC Next, let's look at the status values that appear in the log. We want to know which status values appear in the data and how many times.  We again start with `logs_df`, then group by the `status` column, apply the `.count()` aggregation function, and sort by the `status` column.

# COMMAND ----------

status_to_count_df = (logs_df
                      .groupBy('status')
                      .count()
                      .sort('status'))

# COMMAND ----------

sqlWay = spark.sql("""
SELECT DEST_COUNTRY_NAME, count(1)
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
""")

dataFrameWay = flightData2015\
  .groupBy("DEST_COUNTRY_NAME")\
  .count()

sqlWay.explain()
dataFrameWay.explain()


# COMMAND ----------

from pyspark.sql.functions import max

flightData2015.select(max("count")).take(1)


# COMMAND ----------

maxSql = spark.sql("""
SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 5
""")

maxSql.show()