Example #1
0
    def test_window_functions(self):
        df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
        from pyspark.sql import functions as F
        from pyspark.sql.window import Window
        w = Window.partitionBy("value").orderBy("key")

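        # rowsBetween(0, 1) frames the current row plus the next row; the (-inf, inf) frame spans the whole "value" partition.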
        sel = df.select(
            df.value,
            df.key,
            F.max("key").over(w.rowsBetween(0, 1)),
            F.min("key").over(w.rowsBetween(0, 1)),
            F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
            F.rowNumber().over(w),
            F.rank().over(w),
            F.denseRank().over(w),
            F.ntile(2).over(w),
        )
        rs = sorted(sel.collect())
        expected = [
            ("1", 1, 1, 1, 1, 1, 1, 1, 1),
            ("2", 1, 1, 1, 3, 1, 1, 1, 1),
            ("2", 1, 2, 1, 3, 2, 1, 1, 1),
            ("2", 2, 2, 2, 3, 3, 3, 2, 2),
        ]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[: len(r)])
    def doRender(self, handlerId):
        self.addProfilingTime = False
        self._addScriptElement("https://d3js.org/d3.v3.js", checkJSVar="d3")
        #self._addScriptElement("https://mbostock.github.io/d3/talk/20111116/d3/d3.geo.js")
        #self._addScriptElement("https://cdnjs.cloudflare.com/ajax/libs/d3-geo-projection/0.2.16/d3.geo.projection.js")
        #self._addScriptElement("https://mbostock.github.io/d3/talk/20111116/d3/d3.geom.js")

        #Load the data from the flight history db
        df = loadFlightHistory()
        ShellAccess.flightHistoryDF = df

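        # Build a (code, name, lat, long) tuple for both the departure and arrival airport of every flight,
        # de-duplicate them, and render each one as an entry of a JSON map keyed by airport code.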
        res = df.flatMap(lambda row: [
                (row.depAirportFSCode, row.depAirportName.encode("ascii", "ignore"),
                 0.0 if row.depAirportLat is None else row.depAirportLat,
                 0.0 if row.depAirportLong is None else row.depAirportLong),
                (row.arrAirportFSCode, row.arrAirportName.encode("ascii", "ignore"),
                 0.0 if row.arrAirportLat is None else row.arrAirportLat,
                 0.0 if row.arrAirportLong is None else row.arrAirportLong)
            ])\
            .distinct()\
            .map(lambda t: """ "{0}":{{"id":"{0}","name":"{1}","latitude":{2},"longitude":{3}}}""".format(t[0], t[1], t[2], t[3]))

        graphNodesJson="{"
        for r in res.collect():
            graphNodesJson+=("," if len(graphNodesJson)>1 else "") + str(r)
        graphNodesJson+="}"
        myLogger.debug("graphNodesJson: {0}".format(graphNodesJson))        

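        # Count flights per (src, dst) airport pair to build the graph links.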
        graphLinksJson = df.select("arrAirportFSCode","depAirportFSCode")\
            .withColumnRenamed("depAirportFSCode", "src").withColumnRenamed("arrAirportFSCode", "dst")\
            .groupBy("src","dst").agg(F.count("src").alias("count"))\
            .toJSON().map(lambda j: yaml.safe_load(j)).collect()
        myLogger.debug("graphLinksJson: {0}".format(graphLinksJson))


        self._addHTMLTemplate("mapResults.html", graphNodesJson=graphNodesJson, graphLinksJson=graphLinksJson)
 def getValueFieldValueLists(self, handlerId, keyFields, valueFields):
     df = self.entity.groupBy(keyFields)
     agg = self.options.get("aggregation",self.getDefaultAggregation(handlerId))
     maxRows = int(self.options.get("rowCount","100"))
     # df is a GroupedData, so count the distinct key groups instead of calling count() on it directly
     numRows = min(maxRows, self.entity.select(keyFields).distinct().count())
     valueLists = []
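     # Aggregate each value field per key group, sort by the key fields, and collect up to numRows aggregated values.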
     for valueField in valueFields:
         valueDf = None
         if agg == "SUM":
             valueDf = df.agg(F.sum(valueField).alias("agg"))
         elif agg == "AVG":
             valueDf = df.agg(F.avg(valueField).alias("agg"))
         elif agg == "MIN":
             valueDf = df.agg(F.min(valueField).alias("agg"))
         elif agg == "MAX":
             valueDf = df.agg(F.max(valueField).alias("agg"))
         else:
             valueDf = df.agg(F.count(valueField).alias("agg"))
         valueDf = valueDf.sort(*[F.col(keyField).asc() for keyField in keyFields])
         valueDf = valueDf.dropna()
         rows = valueDf.select("agg").take(numRows)
         valueList = []
         for row in rows:
             valueList.append(row["agg"])
         valueLists.append(valueList)
     return valueLists   
Example #4
0
    def handleUIOptions(self, displayColName):
        agg = self.options.get("aggregation")
        valFields = self.options.get("valueFields")

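        # Group by the display column and apply the aggregation selected in the UI, defaulting to a count.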
        if agg == 'COUNT':
            return self.entity.groupBy(displayColName).agg(F.count(displayColName).alias("agg")).toPandas()
        elif agg == 'SUM':
            return self.entity.groupBy(displayColName).agg(F.sum(valFields).alias("agg")).toPandas()
        elif agg == 'AVG':
            return self.entity.groupBy(displayColName).agg(F.avg(valFields).alias("agg")).toPandas()
        elif agg == 'MIN':
            return self.entity.groupBy(displayColName).agg(F.min(valFields).alias("agg")).toPandas()
        elif agg == 'MAX':
            return self.entity.groupBy(displayColName).agg(F.max(valFields).alias("agg")).toPandas()
        elif agg == 'MEAN':
            return self.entity.groupBy(displayColName).agg(F.mean(valFields).alias("agg")).toPandas()
        else:
            return self.entity.groupBy(displayColName).agg(F.count(displayColName).alias("agg")).toPandas()
Example #5
0
    def test_smvRenameField_preserve_meta_for_unrenamed_fields(self):
        df = self.createDF("a:Integer; b:String", "1,abc;1,def;2,ghij")
        desc = "c description"
        res1 = df.groupBy(col("a")).agg(count(col("a")).alias("c"))\
                 .smvDesc(("c", desc))
        self.assertEqual(res1.smvGetDesc(), [("a", ""), ("c", desc)])

        res2 = res1.smvRenameField(("a", "d"))
        self.assertEqual(res2.smvGetDesc(), [("d", ""), ("c", desc)])
def acquire_majority_clusters(communities_in):
    # based on the top 5 clusters, where the majority of nodes are:
    # 1 large community, 1 small community, and several micro-communities
    q = communities_in.select("id", "type", "label").groupBy("label").agg(count("id").alias("count")).orderBy(desc("count"))
    maj_clusters    = communities_in.select("id", "department", "loan", "type", "label")
    maj_clusters    = maj_clusters.join(q.limit(5), on='label').select('id', 'department', 'type', 'loan', 'label')
    df_maj_clusters = maj_clusters.toPandas()
    df_maj_clusters = df_maj_clusters.rename(columns={'id': 'obj_id'})
    df_maj_clusters.obj_id = df_maj_clusters.obj_id.astype(long)
    #n_vertices_clusters = df_maj_clusters.shape[0]
    return maj_clusters, df_maj_clusters
Example #7
0
 def sampleColumn(self, numerical):
     default=None
     if Environment.hasSpark:
         from pyspark.sql import functions as F
         for field in self.entity.schema.fields:
             # Ignore unique ids
             if field.name.lower() != 'id' and ( not numerical or dataFrameMisc.isNumericType(field.dataType) ):
                 # Find a good column to display in pie ChartDisplay
                 default = default or (field.name.decode("utf-8") if PY2 else field.name)
                 count = self.entity.count()
                 sample = self.entity.sample(False, (float(200) / count)) if count > 200 else self.entity
                 orderedSample = sample.groupBy(field.name).agg(F.count(field.name).alias("agg")).orderBy(F.desc("agg")).select("agg")
                 if orderedSample.take(1)[0]["agg"] > 10:
                     return [field.name.decode("utf-8") if PY2 else field.name]
     
     # Otherwise, return first non-id column
     return [default]
Example #8
0
def runAggregateFunctions(spark, df1, df2):
    # collect_list, collect_set
    doubledDf1 = df1.union(df1)
    doubledDf1.select(functions.collect_list(doubledDf1["name"])).show(truncate=False)
    doubledDf1.select(functions.collect_set(doubledDf1["name"])).show(truncate=False)

    # count, countDistinct
    doubledDf1.select(functions.count(doubledDf1["name"]), functions.countDistinct(doubledDf1["name"])).show(
        truncate=False)

    # sum
    df2.printSchema()
    df2.select(functions.sum(df2["price"])).show(truncate=False)

    # grouping, grouping_id
    df2.cube(df2["store"], df2["product"]).agg(functions.sum(df2["amount"]), functions.grouping(df2["store"])).show(truncate=False)
    df2.cube(df2["store"], df2["product"]).agg(functions.sum(df2["amount"]),
                                               functions.grouping_id(df2["store"], df2["product"])).show(truncate=False)
Example #9
0
    def getPieColInfo(self, numerical):
        # If user selects a column in dialog box, give it to them
        keyFields = self.options.get("keyFields")
        if keyFields is not None:
            return keyFields

        schema = self.entity.schema
        default=None
        for field in schema.fields:
            # Ignore unique ids
            if field.name.lower() != 'id' and ( not numerical or isNum(field.dataType.__class__.__name__) ):
                # Find a good column to display in pie ChartDisplay
                default = default or field.name
                count = self.entity.count()
                sample = self.entity.sample(False, (float(200) / count)) if count > 200 else self.entity
                orderedSample = sample.groupBy(field.name).agg(F.count(field.name).alias("agg")).orderBy(F.desc("agg")).select("agg")
                if orderedSample.take(1)[0]["agg"] > 10:
                    return field.name
        # Otherwise, return first non-id column
        return default
Example #10
0
    def describe_categorical_1d(df, column):
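        # Count the occurrences of each category, most frequent first.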
        value_counts = (df.select(column).na.drop()
                        .groupBy(column)
                        .agg(count(col(column)))
                        .orderBy("count({c})".format(c=column),ascending=False)
                       ).cache()

        # Get the most frequent class:
        stats = (value_counts
                 .limit(1)
                 .withColumnRenamed(column, "top")
                 .withColumnRenamed("count({c})".format(c=column), "freq")
                ).toPandas().iloc[0]

        # Get the top 50 classes by value count,
        # and put the rest of them grouped at the
        # end of the Series:
        top_50 = value_counts.limit(50).toPandas().sort_values("count({c})".format(c=column),
                                                               ascending=False)
        top_50_categories = top_50[column].values.tolist()

        others_count = pd.Series([df.select(column).na.drop()
                        .where(~(col(column).isin(*top_50_categories)))
                        .count()
                        ], index=["***Other Values***"])
        others_distinct_count = pd.Series([value_counts
                                .where(~(col(column).isin(*top_50_categories)))
                                .count()
                                ], index=["***Other Values Distinct Count***"])

        top = top_50.set_index(column)["count({c})".format(c=column)]
        top = pd.concat([top, others_count])
        top = pd.concat([top, others_distinct_count])
        stats["value_counts"] = top
        stats["type"] = "CAT"
        value_counts.unpersist()
        unparsed_valid_jsons = df.select(column).na.drop().rdd.map(
            lambda x: guess_json_type(x[column])).filter(
            lambda x: x).distinct().collect()
        stats["unparsed_json_types"] = unparsed_valid_jsons
        return stats
Example #11
0
    def test_bounded_simple(self):
        from pyspark.sql.functions import mean, max, min, count

        df = self.data
        w1 = self.sliding_row_window
        w2 = self.shrinking_range_window

        plus_one = self.python_plus_one
        count_udf = self.pandas_agg_count_udf
        mean_udf = self.pandas_agg_mean_udf
        max_udf = self.pandas_agg_max_udf
        min_udf = self.pandas_agg_min_udf

        result1 = df.withColumn('mean_v', mean_udf(plus_one(df['v'])).over(w1)) \
            .withColumn('count_v', count_udf(df['v']).over(w2)) \
            .withColumn('max_v',  max_udf(df['v']).over(w2)) \
            .withColumn('min_v', min_udf(df['v']).over(w1))

        expected1 = df.withColumn('mean_v', mean(plus_one(df['v'])).over(w1)) \
            .withColumn('count_v', count(df['v']).over(w2)) \
            .withColumn('max_v', max(df['v']).over(w2)) \
            .withColumn('min_v', min(df['v']).over(w1))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
Example #12
0
df_tab2.coalesce(1).write.mode('overwrite').option(
    "header",
    "true").format('com.databricks.spark.csv').save(output_path + "tab2.csv")
com_list = [
    "McDonald's", "Starbucks", "Chipotle Mexican Grill", "Dunkin'",
    "Buffalo Wild Wings", "Denny's", "Panera Bread", "Pizza Hut", "Taco Bell",
    "Wendy's"
]

df_10_bsn = df_re_bsn  #filter(df_re_bsn.name.isin(com_list)).select("text")

from pyspark.sql.functions import split, col, explode, count

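# Split the text into words, explode to one row per word, and count how often each word appears.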
df_tab4 = (df_10_bsn.withColumn('words', split(col('text'), ' '))
           .withColumn('word', explode(col('words')))
           .drop('text', 'words')
           .groupBy('word')
           .agg(count('word').alias('count'))
           .orderBy('count', ascending=False))

df_tab4.coalesce(1).write.mode('overwrite').option(
    "header",
    "true").format('com.databricks.spark.csv').save(output_path + "tab4.csv")

for i in range(10):
    df_tab3_iter = df_10_bsn.filter(df_10_bsn.name == com_list[i])
    df_tab3_iter = df_tab3_iter.withColumn('words', split(
        col('text'), ' ')).withColumn('word', explode(col('words'))).drop(
            'text', 'words').groupBy('word').agg(
                count('word').alias('count')).orderBy('count', ascending=False)
    df_tab3_iter.coalesce(1).write.mode('overwrite').option(
        "header",
        "true").format('com.databricks.spark.csv').save(output_path + str(i) +
                                                        "tab4.csv")
def groupByMention(df):
    return df.withColumn('mentioned', f.explode(df.mentioned)).groupBy('mentioned')\
     .agg(f.count('id').alias('count'),f.avg('sentiment').alias('sentiment'))
    print('misstatement_precision is {}, misstatement recall is {}'.format(
        misstatement_precision, misstatement_recall))
    print('non_misstatement_precision is {}, non_misstatement recall is {}'.
          format(non_misstatement_precision, non_misstatement_recall))


# Downsampling:
misstated_df = integrated_df.filter(integrated_df.label == 1.0)
misstated_count = misstated_df.count()
non_misstated_df = integrated_df.filter(
    integrated_df.label == 0.0).limit(misstated_count)
integrated_df = misstated_df.union(non_misstated_df).cache()

# Using nullcounts to filter columns to keep
nullcounts = integrated_df.select([
    count(when(isnan(c) | col(c).isNull(), c)).alias(c)
    for c in integrated_df.columns
])
nc = list(nullcounts.first())

# Services-packaged software category selection (from EDA)
services_prepacked_software = integrated_df  # .filter(integrated_df.sic == '7372')
print('Total records in integrated file: ', integrated_df.count())
print('Number of records in Services-packaged software industrial category: ',
      services_prepacked_software.count())

# Reusing preprocessing steps implemented by Vincent
# filling nulls and nones with zeroes.
some_dict = {}
for x in services_prepacked_software.columns:
    some_dict[x] = 0
Example #15
0
# )
# broken_readings.createOrReplaceTempView("broken_readings")

# COMMAND ----------

# ANSWER
from pyspark.sql.functions import col, count

broken_readings = (
  spark.read
  .format("delta")
  .load(health_tracker + "processed")
  .select(col("heartrate"), col("dte"))
  .where(col("heartrate") < 0)
  .groupby("dte")
  .agg(count("heartrate"))
  .orderBy("dte")
)
broken_readings.createOrReplaceTempView("broken_readings")

# COMMAND ----------

%sql

SELECT SUM(`count(heartrate)`) FROM broken_readings

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC #### Step 2: Verify That These are New Broken Readings
def main(sc):
    """
    Read GDELT data from S3, select columns, join tables,
    and perform calculations with grouped themes and document
    times
    """

    #Obtain taxonomy dictionary and broadcast to the workers
    tax_file = os.environ['TAX_LIST_FILE']
    tax_list = f.read_tax_file(tax_file)
    rdd_tax_list = sc.broadcast(tax_list)

    #Obtain list of top 500 themes used for filtering
    theme_file = os.environ['THEME_LIST_FILE']
    theme_list = f.read_theme_file(theme_file)
    rdd_theme_list = sc.broadcast(theme_list)

    #Obtain list of top news sources used for filtering
    src_file = os.environ['SRC_LIST_FILE']
    src_list = f.read_src_file(src_file)
    rdd_src_list = sc.broadcast(src_list)


    #Read "mentions" table from GDELT S3 bucket. Transform into RDD
    mentionRDD = sc.textFile('s3a://gdelt-open-data/v2/mentions/*.mentions.csv')
    mentionRDD = mentionRDD.map(lambda x: x.encode("utf", "ignore"))
    mentionRDD = mentionRDD.map(lambda x : x.split('\t'))
    mentionRDD = mentionRDD.filter(lambda x: len(x)==16)
    mentionRDD = mentionRDD.filter(lambda x: f.is_not_empty([x[2], x[5], x[13]]))
    mentionRDD = mentionRDD.filter(lambda x: f.is_number(x[13])) 
    mentionRowRDD = mentionRDD.map(lambda x: Row(
                                        mention_id = x[5],
                                        mention_doc_tone = float(x[13]),
                                        mention_time_date = f.transform_to_timestamptz_daily(x[2])
                                        ))
 
    #Read "GKG" table from GDELT S3 bucket. Transform into RDD
    gkgRDD = sc.textFile('s3a://gdelt-open-data/v2/gkg/YEARMONTH*0000.gkg.csv')
    gkgRDD = gkgRDD.map(lambda x: x.encode("utf", "ignore"))
    gkgRDD = gkgRDD.map(lambda x: x.split('\t'))
    gkgRDD = gkgRDD.filter(lambda x: len(x)==27)   
    gkgRDD = gkgRDD.filter(lambda x: f.is_not_empty([x[3], x[4], x[7]]))
    gkgRowRDD = gkgRDD.map(lambda x : Row(src_common_name = x[3],
                                        doc_id = x[4],
                                        themes = f.clean_taxonomy(x[7].split(';')[:-1], rdd_tax_list)
                                        ))


    sqlContext = SQLContext(sc)

    #Transform RDDs to dataframes
    mentionDF = sqlContext.createDataFrame(mentionRowRDD)
    gkgDF     = sqlContext.createDataFrame(gkgRowRDD)


    df1 = mentionDF.alias('df1')
    df2 = gkgDF.alias('df2')

    #Themes and tones information are stored in two different tables
    joinedDF = df1.join(df2, df1.mention_id == df2.doc_id, "inner").select('df1.*'
                                                , 'df2.src_common_name','df2.themes').repartition(2000)

    #Each document could contain multiple themes. Explode on the themes and make a new column on filtered themes
    explodedDF = joinedDF.select('mention_id',
                                 'mention_doc_tone',
                                 'mention_time_date',
                                 'src_common_name',
                                 explode(joinedDF.themes).alias("theme")) \
                         .filter(col('theme').isin(*(rdd_theme_list.value)))

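    # UDFs that bucket the tone values into histogram bins and compute their quantiles.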
    hist_data_udf = udf(f.hist_data, ArrayType(IntegerType()))
    get_quantile_udf = udf(f.get_quantile, ArrayType(FloatType()))
    
    #Compute statistics for each theme
    explodedDF.cache()
    
    #Over all sources
    testDF1 = explodedDF.groupBy('theme', 'mention_time_date').agg(
            count('*').alias('num_mentions'),
            avg('mention_doc_tone').alias('avg'),
            collect_list('mention_doc_tone').alias('tones')
            )
    
    #For each source 
    testDF2 = explodedDF.groupBy('theme', 'mention_time_date', 'src_common_name').agg(
            count('*').alias('num_mentions'),
            avg('mention_doc_tone').alias('avg'),
            collect_list('mention_doc_tone').alias('tones')
            ).repartition(2000)
    
    #Histogram and compute quantiles for tones
    
    histDF1 = testDF1.withColumn("bin_vals", hist_data_udf('tones')) \
                   .withColumn("quantiles", get_quantile_udf('tones'))
    
    histDF2 = testDF2.withColumn("bin_vals", hist_data_udf('tones')) \
                   .withColumn("quantiles", get_quantile_udf('tones'))
   
    
    finalDF1 = histDF1.select('theme', 'num_mentions', 'avg', 'quantiles', 'bin_vals', col('mention_time_date').alias('time'))
    #Filter sources
    finalDF2 = histDF2.select('theme', 'src_common_name', 'num_mentions', 'avg', 'quantiles', 'bin_vals', 
            col('mention_time_date').alias('time')).filter(col('src_common_name').isin(*(rdd_src_list.value)))
    
    
    
    #Preparing to write to TimescaleDB
    #First write the table aggregated over all sources
    
    db_properties = {}
    config = configparser.ConfigParser()
    
    config.read("db_properties.ini")
    db_prop = config['postgresql']
    db_url = db_prop['url']
    db_properties['username'] = db_prop['username']
    db_properties['password'] = db_prop['password']
    db_properties['url'] = db_prop['url']
    db_properties['driver'] = db_prop['driver']

    #Write to table
    finalDF1.write.format("jdbc").options(
    url=db_properties['url'],
    dbtable='bubblebreaker_schema.tones_table_v3',
    user='******',
    password='******',
    stringtype="unspecified"
    ).mode('append').save()
    
    #Then write to per-src table

    config.read("db_properties_src.ini")
    db_prop = config['postgresql']
    db_url = db_prop['url']
    db_properties['username'] = db_prop['username']
    db_properties['password'] = db_prop['password']
    db_properties['url'] = db_prop['url']
    db_properties['driver'] = db_prop['driver']

    #Write to table
    finalDF2.write.format("jdbc").options(
    url=db_properties['url'],
    dbtable='bubblebreaker_src_schema.tones_table_v2',
    user='******',
    password='******',
    stringtype="unspecified"
    ).mode('append').save()
Example #17
0
 def range_frame_match():
     return "RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" in df.select(
         F.count("*").over(
             window.Window.rangeBetween(-sys.maxsize,
                                        sys.maxsize))).columns[0]
Example #18
0
kmeans = KMeans().setK(20).setMaxIter(5)

# fitting out features into K means
model = kmeans.fit(Dataframe.select('features'))

# Save your model
# model.save("F:\\kMeans")

# Adding the prediction from K means to the Dataset
clusters = model.transform(Dataframe)

clusters.show()
print("K means predictions")

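# Prediction counts per cluster, broken down by month, day and hour.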
clusters.select(
    month("dt").alias("month"),
    dayofmonth("dt").alias("day"),
    hour("dt").alias("hour"),
    "prediction").groupBy("month", "day", "hour", "prediction").agg(
        func.count("prediction").alias("count")).orderBy(
            "day", "hour", "prediction").show()
print("Count Total")

clusters.select(hour("dt").alias("hour"), "prediction").groupBy(
    "hour", "prediction").agg(func.count("prediction").alias("count")).orderBy(
        func.desc("count")).show()
print("Count Total ordered by count")

clusters.groupBy("prediction").count().show()
print("Counts in each cluster")
Example #19
0
def summary(df, datatypes=None):
    spark = df.sql_ctx
    types = {x.name: x.dataType for x in list(df.schema)}

    #filter datatypes
    if datatypes is not None:
        types = {
            k: v
            for k, v in types.items()
            if any([x in datatypes
                    for x in [v, str(v), v.simpleString()]])
        }

    res = pd.DataFrame.from_dict(types, orient='index')
    res.columns = ['datatype']

    count = df.count()
    res['count'] = count

    d = df.select([F.approx_count_distinct(c).alias(c)
                   for c in df.columns]).toPandas().T
    d.columns = ['approx_distinct']
    d.index.name = 'index'
    res = res.join(d)

    res['unique_ratio'] = res['approx_distinct'] / count

    sel = []
    for c, v in types.items():
        if isinstance(v, (T.NumericType)):
            sel += [F.mean(c).alias(c)]
        else:
            sel += [F.min(F.lit(None)).alias(c)]
    d = df.select(sel).toPandas().T
    d.columns = ['mean']
    d.index.name = 'index'
    res = res.join(d)

    d = df.select([F.min(c).alias(c) for c in df.columns]).toPandas().T
    d.columns = ['min']
    d.index.name = 'index'
    res = res.join(d)

    d = df.select([F.max(c).alias(c) for c in df.columns]).toPandas().T
    d.columns = ['max']
    d.index.name = 'index'
    res = res.join(d)

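    # Nulls per column: F.count only counts rows where the when() condition holds (the rest become null).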
    d = df.select([
        F.count(F.when(F.isnull(c), c)).alias(c) for c in df.columns
    ]).toPandas().T
    d.columns = ['null']
    d.index.name = 'index'
    res = res.join(d)

    sel = []
    for c, v in types.items():
        if isinstance(v, (T.NumericType)):
            sel += [F.count(F.when(F.isnan(c), c)).alias(c)]
        else:
            sel += [F.min(F.lit(0)).alias(c)]
    d = df.select(sel).toPandas().T
    d.columns = ['nan']
    d.index.name = 'index'
    res = res.join(d)

    sel = []
    for c, v in types.items():
        if isinstance(v, (T.StringType)):
            sel += [F.count(F.when(F.col(c).isin(''), c)).alias(c)]
        else:
            sel += [F.min(F.lit(0)).alias(c)]
    d = df.select(sel).toPandas().T
    d.columns = ['empty']
    d.index.name = 'index'
    res = res.join(d)

    return res
Example #20
0
 def zeros(col_name):
     return F.count(F.when(F.col(col_name) == 0, col_name))
Example #21
0
 def na(col_name):
     return F.count(
         F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name))
def main(username):
    # Print the username received for verification in the console during the demo. For actual deployment,
    # this can be commented away.
    print(f"Received username= {username}")

    # Start the Spark instance
    cnfg = SparkConf().setAppName("TwitterUserProfile").setMaster("local[2]")
    sc = SparkContext(conf=cnfg)
    spark = SparkSession(sc)

    # Initialise the first page of tweets & user (1 page consists of 10 entries)
    url = create_url(target=username)
    headers = create_headers(bearer_token)
    json_response = connect_to_endpoint(url, headers)

    # Parsing the JSON response returned by Twitter
    tweet_df = spark.createDataFrame(json_response['data'])

    # Check if there's geolocation field in the response.
    geo_exist = has_column(tweet_df, "geo")

    # Extracting the geolocation information via geo.place_id
    if geo_exist:
        tweet_df = tweet_df.select("author_id", "created_at", "geo.place_id",
                                   "id", "text")
    else:
        tweet_df = tweet_df.select("author_id", "created_at", "id", "text")

    # Extracting the user details
    user_df = spark.createDataFrame(json_response['includes']['users'])

    # flatten the public_metrics
    cols = list(
        map(lambda f: F.col("public_metrics").getItem(f).alias(str(f)), [
            "following_count", "tweet_count", "listed_count", "followers_count"
        ]))

    public_metrics = user_df.select(cols)
    user_df = user_df.drop('public_metrics')

    # Merge user_df with public_metrics
    user_df = with_column_index(user_df)
    public_metrics = with_column_index(public_metrics)
    user_df = user_df.join(public_metrics,
                           user_df.ColumnIndex == public_metrics.ColumnIndex,
                           'inner').drop("ColumnIndex")

    # If there are more tweets (next page / next token), append it to tweet_df.
    # user_df is just for a single user, so no need to append. Info will be the same.

    if 'next_token' not in json_response['meta']:
        pass
    else:
        next_token = json_response['meta']['next_token']

        while next_token is not None:
            url = create_url(username, next_token)
            json_response = connect_to_endpoint(url, headers)

            new_tweets = spark.createDataFrame(json_response['data'])

            # Check if there's geolocation field in the new tweets
            new_tweet_geo_exist = has_column(new_tweets, "geo")

            if new_tweet_geo_exist:
                new_tweets = new_tweets.select("author_id", "created_at",
                                               "geo.place_id", "id", "text")
            else:
                new_tweets = new_tweets.select("author_id", "created_at", "id",
                                               "text")

            # to make sure all have the same number of columns
            for column in tweet_df.columns:
                if column not in new_tweets.columns:
                    new_tweets = new_tweets.withColumn(column, F.lit(None))

            for column in new_tweets.columns:
                if column not in tweet_df.columns:
                    tweet_df = tweet_df.withColumn(column, F.lit(None))

            # Reordering the column of new_tweets for union function
            if geo_exist:
                new_tweets = new_tweets.select("author_id", "created_at",
                                               "place_id", "id", "text")
            else:
                new_tweets = new_tweets.select("author_id", "created_at", "id",
                                               "text")

            tweet_df = tweet_df.union(new_tweets)

            if 'next_token' not in json_response['meta']:
                next_token = None
            else:
                next_token = json_response['meta']['next_token']

    # Show the df. Can comment away in actual production.
    tweet_df.show(truncate=False)
    user_df.show(truncate=False)

    # Extract geolocation information within the tweets. Currently not in use.
    if geo_exist:
        location_df = tweet_df.select("author_id", "id", "place_id").dropna()
        location_df.show(truncate=False)

    # WORD FREQUENCY - to be made into word cloud in Tableau or other visualisation software.
    tweet_only = tweet_df.select("author_id", "text")

    # Remove punctuation, convert to lower case
    df_clean = tweet_only.select(
        "author_id",
        (lower(regexp_replace('text', "[^a-zA-Z\\s]", "")).alias('text')))

    # Tokenize text
    tokenizer = Tokenizer(inputCol='text', outputCol='words_token')
    df_words_token = tokenizer.transform(df_clean).select(
        'author_id', 'words_token')

    # Remove stop words
    remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
    df_words_no_stopw = remover.transform(df_words_token).select(
        'author_id', 'words_clean')

    # Keep only words between 3 and 13 characters long
    filter_length_udf = udf(lambda row: [x for x in row if 3 <= len(x) <= 13],
                            ArrayType(StringType()))
    df_final_words = df_words_no_stopw.withColumn(
        'words', filter_length_udf(col('words_clean')))

    # Printing the word list. Can comment away in actual deployment.
    df_final_words.show(truncate=False)

    word_count = df_final_words.select('author_id', F.explode('words').alias('word')).\
        groupBy('author_id', 'word').\
        count().\
        sort('count', ascending=False)

    # Printing the word list and count. Can comment away in actual deployment.
    word_count.show()

    # SENTIMENT ANALYSIS. Sentiment is in the range of (-1, 1).
    sentiment = udf(lambda x: TextBlob(x).sentiment[0])
    tweet_sentiment = tweet_df.withColumn(
        "sentiment_score",
        sentiment(tweet_df["text"]).cast("double"))

    classify_sentiment_udf = udf(classify_sentiment)

    tweet_sentiment = tweet_sentiment.withColumn(
        "sentiment",
        classify_sentiment_udf(tweet_sentiment["sentiment_score"]))
    tweet_sentiment = tweet_sentiment.select('author_id', 'created_at', 'id',
                                             'text', 'sentiment_score',
                                             'sentiment')
    # Can comment away the show statement. Left here to display the progress in console for demo.
    tweet_sentiment.show()

    sentiment_count = tweet_sentiment.groupBy('author_id', 'sentiment').agg(
        F.mean('sentiment_score'),
        F.count('sentiment')).toDF("author_id", "sentiment",
                                   "avg_sentiment_score", "count")
    # Can comment away the show statement. Left here to display the progress in console for demo.
    sentiment_count.show()

    # Read in existing data from Amazon RedShift DB. If user already exists, need to merge and deduplicate, then write data back.
    with redshift_conn.connect() as conn, conn.begin():

        # Check if Table exists first. If so, read in existing Twitter users that are already in RedShift DB.
        # The unique key is the id, which is the author_id, Twitter user id.
        if redshift_conn.has_table("user_data"):
            user = pd.read_sql("""
               select * from user_data;""", conn)

            # Append latest data retrieved to those in DB and remove duplicates, keeping the latest.
            user = user.append(user_df.toPandas())
            user = user.drop_duplicates(subset="id", keep="last")
        else:
            user = user_df.toPandas()

        # Similarly, check if the Table for sentiment count exists. If so, read in existing sentiment count
        # for existing users in RedShift DB. The pair (author_id, sentiment) is used for deduplication.
        if redshift_conn.has_table("sentiment_count"):
            senti_df = pd.read_sql(
                """
               select * from sentiment_count;""", conn)

            # Append latest data to those in DB and remove duplicates, keeping the latest.
            senti_df = senti_df.append(sentiment_count.toPandas())
            senti_df = senti_df.drop_duplicates(
                subset=["author_id", "sentiment"], keep="last")
        else:
            senti_df = sentiment_count.toPandas()

        # Checking if Table for word_count already exists in RedShift. If so, read in existing word count for
        # existing users in RedShift DB. Distinct pair of author_id and word is used for comparison.
        if redshift_conn.has_table("word_count"):
            word_df = pd.read_sql(
                """
                       select * from word_count;""", conn)

            # Append latest data to those in DB and remove duplicates, keeping the latest.
            word_df = word_df.append(word_count.toPandas())
            word_df = word_df.drop_duplicates(subset=["author_id", "word"],
                                              keep="last")
        else:
            word_df = word_count.toPandas()

        # Check for Table, tweet_sentiment. If exists, read in existing tweet sentiment for existing users in
        # RedShift DB. The unique ID used is the tweet id, which is unique for each tweet. All unique tweets
        # are kept. Thus even if the Twitter user deleted his old tweets, it will still be retained in the
        # Redshift DB if it was previously captured.
        if redshift_conn.has_table("tweet_sentiment"):
            tweet_db = pd.read_sql(
                """
                       select * from tweet_sentiment;""", conn)

            # Append latest data to those in DB and remove duplicates, keeping the latest.
            tweet_db = tweet_db.append(tweet_sentiment.toPandas())
            tweet_db = tweet_db.drop_duplicates(subset="id", keep="last")
        else:
            tweet_db = tweet_sentiment.toPandas()

    # Update the data to Redshift.
    user.to_sql('user_data', redshift_conn, index=False, if_exists='replace')
    word_df.to_sql('word_count',
                   redshift_conn,
                   index=False,
                   if_exists='replace')
    senti_df.to_sql('sentiment_count',
                    redshift_conn,
                    index=False,
                    if_exists='replace')
    tweet_db.to_sql('tweet_sentiment',
                    redshift_conn,
                    index=False,
                    if_exists='replace',
                    dtype={
                        'author_id':
                        sqlalchemy.types.VARCHAR(length=255),
                        'created_at':
                        sqlalchemy.types.VARCHAR(length=255),
                        'id':
                        sqlalchemy.types.VARCHAR(length=255),
                        'text':
                        sqlalchemy.types.VARCHAR(length=5000),
                        'sentiment_score':
                        sqlalchemy.types.Float(precision=3, asdecimal=True),
                        'sentiment':
                        sqlalchemy.types.VARCHAR(length=255),
                    })
    # Location information in tweet. Currently not in use.
    # location.to_sql('location_data', redshift_conn, index=False, if_exists='replace')

    # Can comment away print statement for actual deployment. Left here so that status will be printed in
    # console for demo purpose.
    print("Redshift DB updated successfully.")
Example #23
0
def basic_eda(df, dependent_var, id_var):

    eda_start_time = time()

    # Extracting Data Types of All Columns
    print("\n++++++ Printing Data Types of All Columns ++++++\n")
    df.printSchema()

    # Duplicate Observation Checking
    print("\n++++++ Printing Duplicate Removal Summary ++++++\n")
    print("Total No of Obs Before Duplicate Removal: " + str(df.count()))
    print("Unique No of Obs Before Duplicate Removal: " +
          str(df.distinct().count()))

    # Removing Duplicate Observations
    df = df.dropDuplicates()
    df = df.na.drop('all')
    print("Total No of Obs After Duplicate Removal: " + str(df.count()))
    print("Unique No of Obs After Duplicate Removal: " +
          str(df.distinct().count()))

    # Extracting Dependent and Independent Variables
    column_names = [item[0] for item in df.dtypes]
    categorical_var = [
        item[0] for item in df.dtypes if item[1].startswith('string')
    ]
    independent_catgorical_var = [
        x for x in categorical_var if x not in [id_var, dependent_var]
    ]
    independent_continuous_var = [
        x for x in column_names
        if x not in independent_catgorical_var + [id_var, dependent_var]
    ]

    # Descriptive Summary of Numeric Variables
    temp_df_1 = pd.DataFrame()
    desc_summary_1 = pd.DataFrame()

    for col_name in df[independent_continuous_var].columns:
        temp_df_1.loc[0, "Column_Name"] = col_name
        temp_df_1.loc[0, "Total_Obs"] = df.agg({
            col_name: "count"
        }).collect()[0][0]
        temp_df_1.loc[0, "Unique_No_Obs"] = df.select(
            col_name).distinct().count()
        temp_df_1.loc[0, "Missing_No_Obs"] = df.select(
            count(when(isnan(col_name)
                       | col(col_name).isNull(), col_name))).toPandas().iloc[0,
                                                                             0]
        temp_df_1.loc[0, "Min"] = df.agg({col_name: "min"}).collect()[0][0]
        temp_var = df.approxQuantile(col_name, [
            0.01,
            0.05,
            0.1,
            0.25,
            0.5,
            0.75,
            0.85,
            0.95,
            0.99,
        ], 0)
        temp_df_1.loc[0, "Pct_1"] = temp_var[0]
        temp_df_1.loc[0, "Pct_5"] = temp_var[1]
        temp_df_1.loc[0, "Pct_10"] = temp_var[2]
        temp_df_1.loc[0, "Pct_25"] = temp_var[3]
        temp_df_1.loc[0, "Median"] = temp_var[4]
        temp_df_1.loc[0, "Average"] = df.agg({col_name: "avg"}).collect()[0][0]
        temp_df_1.loc[0, "Pct_75"] = temp_var[5]
        temp_df_1.loc[0, "Pct_85"] = temp_var[6]
        temp_df_1.loc[0, "Pct_95"] = temp_var[7]
        temp_df_1.loc[0, "Pct_99"] = temp_var[8]
        temp_df_1.loc[0, "Max"] = df.agg({col_name: "max"}).collect()[0][0]
        desc_summary_1 = desc_summary_1.append(temp_df_1)
        desc_summary_1.reset_index(inplace=True, drop=True)

    print(
        "\n++++++ Printing Summary Statistics For Numeric Variables ++++++\n")
    display(desc_summary_1)

    # Target Variables V/s Categorical Variables
    temp_df_2 = pd.DataFrame()
    desc_summary_2 = pd.DataFrame()

    for x in independent_catgorical_var:
        temp_df_2 = df.groupby(x).agg({dependent_var: "avg"}).toPandas()
        temp_df_2.columns = ["Column_Value", "Avg_Target_Var"]
        temp_df_2["Column_Name"] = x
        temp_df_2 = temp_df_2.iloc[:, [2, 0, 1]]
        desc_summary_2 = desc_summary_2.append(temp_df_2)

    print(
        "\n++++++ Printing Averages of Target Variable Grouped By All Categorical Variable ++++++\n"
    )
    display(desc_summary_2)

    # Returning Final Output
    desc_summary = [desc_summary_1, desc_summary_2]
    final_list = (df, independent_catgorical_var, independent_continuous_var,
                  desc_summary)

    eda_end_time = time()
    eda_elapsed_time = (eda_end_time - eda_start_time) / 60
    print("\nTime To Perform EDA: %.3f Minutes\n" % eda_elapsed_time)

    return (final_list)
Example #24
0
)


# In[238]:


match_india_played = india_filtered_data.distinct().count()


# In[239]:


win_loss_percentage = india_filtered_data.groupby(
    "win_flag"
).agg(
    F.count("Team 1").alias("match_count")
).withColumn(
    "percentage",
    (F.col("match_count")*100.0)/F.lit(match_india_played)
)


# In[240]:


win_loss_percentage.show()


# # 2. What is India’s Win/Loss/Tie percentage in away and home matches?

# In[148]:
Example #25
0
def main(output):
    stream_sch, channel_sch = createSchema()

    # data_s = spark.read.json('stream_base/part*', schema = stream_sch)
    # data_c = spark.read.json('channel_base/part*', schema = channel_sch)

    convertTime = functions.udf(timeToFrame)

    data_s = spark.read.json('stream_info.json', schema=stream_sch)
    data_c = spark.read.json('channel_info.json', schema=channel_sch)
    data_s = data_s.withColumn('time_frame',
                               convertTime(data_s.created_at)).cache()

    data_s.createOrReplaceTempView('data_s')
    data_c.createOrReplaceTempView('data_c')

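    # Number of stream records per game in each time frame, most-streamed first.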
    game_count_by_time = data_s.groupBy('time_frame', 'game').count()
    game_count_by_time = game_count_by_time.orderBy(
        game_count_by_time['count'].desc())

    view_count_by_time = data_s.groupBy('time_frame', 'game').agg(
        functions.sum('viewers').alias('total_view'))
    view_count_by_time = view_count_by_time.orderBy(
        view_count_by_time['total_view'].desc())

    # game_count_by_time.coalesce(1).write.json('game_count_by_time', mode='overwrite')
    # view_count_by_time.coalesce(1).write.json('view_count_by_time', mode='overwrite')

    # see which games have the most audiences and followers
    view_num_by_game = data_c.groupby(data_c['game'])\
        .agg(functions.sum(data_c['views']),functions.sum(data_c['followers']))

    # see who are the currently most popular streamers
    view_num_by_streamer = data_c\
        .select('stream_id','channel_id','game','name','views','followers','created_at','updated_at','partner')\
        .orderBy(functions.desc('views'),'game')
    #print(view_num_by_streamer.show(5))

    # see which games have the most total views and total followers (the most popular games in Twitch's recent history)
    viewcount_by_game = view_num_by_game\
        .select('game', view_num_by_game['sum(views)'].alias('total_views'),
                view_num_by_game['sum(followers)'].alias('total_followers'))\
        .orderBy(functions.desc('total_views'))
    #print(viewcount_by_game.show(5))

    # see which are the most popular non-English-speaking streams (by game and language)
    yuyan = spark.sql(
        """SELECT broadcaster_language, game, SUM(views) AS total_views
        FROM data_c
        WHERE broadcaster_language != 'en'
        GROUP BY broadcaster_language, game
        ORDER BY total_views DESC
        """)
    yuyan.createOrReplaceTempView('yuyan')
    #print(yuyan.show(5))

    # see what are the biggest broadcaster communities (by language)
    yuyan_by_game = spark.sql(
        """SELECT broadcaster_language, game, count(*) AS total_streamer
        FROM data_c
        GROUP BY broadcaster_language, game
        ORDER BY total_streamer DESC
        """)
    yuyan_by_game.createOrReplaceTempView('yuyan_by_game')
    #print(yuyan_by_game.show(5))

    # ------------------------- now joining the 2 tables ---------------------------------------

    # joint_df = t_max.join(t_min, (t_max.stationmax == t_min.stationmin) & (t_max.date == t_min.date), 'inner')

    # put WHERE above ORDER BY; stream.game is dropped since some streamers are playing games different from what is shown in stream.game
    cs_joint_table = spark.sql("""
        SELECT s.stream_id AS stream_id, c.game AS game, c.name AS name, s.viewers AS watchings,
        s.time_frame as time_frame, c.views AS views, c.followers AS followers, s.created_at AS stream_created_date, 
        c.updated_at AS channel_last_updated, c.broadcaster_language, c.language, c.created_at AS channel_created_date,  
        c.display_name, c.status, c.mature, c.partner, s.average_fps, s.delay, s.video_height, c.broadcaster_software
        FROM data_c AS c JOIN data_s AS s
        ON s.stream_id = c.stream_id
        ORDER BY watchings DESC
        """).cache()

    cs_joint_table.createOrReplaceTempView('cs_joint_table')
    #cs_joint_table.coalesce(1).write.csv(output, mode='overwrite')
    #cs_joint_table.coalesce(1).write.json(output, mode='overwrite')

    #-------------------------------------list of attributes in cs_joint_table:---------------------------------------
    # """
    # stream_id
    # game (game name)
    # name (streamer name)
    # watchings (current number of audiences)
    # time_frame
    # views (current total views of the stream)
    # followers
    # stream_created_date
    # channel_last_updated
    # broadcaster_language
    # language
    # channel_created_date
    # display_name (streamer's displayed name, has emojis and stuff)
    # status (like a brief intro to the channel)
    # mature
    # partner
    # average_fps
    # delay
    # video_height
    # broadcaster_software (most of the streamers didn't specify this)
    # """

    # ------partnership and average streaming fps and current audiences(num of people watching) by game and streamer--------

    partner = spark.sql("""
        SELECT game, partner, COUNT(name), AVG(average_fps), AVG(delay), 
        SUM(watchings), SUM(views), SUM(followers), AVG(video_height)
        FROM cs_joint_table
        WHERE game LIKE 'Call of Duty%' 
        GROUP BY partner, game     
        HAVING COUNT(name) > 100
        ORDER BY game
        """)
    #print(partner.show(50))

    # ----------------------mature vs non-mature contents------------------------------

    mature = spark.sql("""
        SELECT game, mature, COUNT(name), SUM(watchings), SUM(views), SUM(followers) 
        FROM cs_joint_table
        GROUP BY mature, game
        ORDER BY game
        """)
    #print(mature.show(50))

    mature_total = cs_joint_table.select('game', 'mature', 'name', 'watchings', 'views', 'followers').groupBy('mature')\
        .agg(functions.count('mature'), functions.sum('watchings'),
             functions.sum('views').alias('total_views'), functions.sum('followers'))
    mature_total = mature_total.orderBy(mature_total['total_views'].desc())
    print(mature_total.show(2))
# MAGIC +-------+-----------------------------+-----+-------+
# MAGIC |5.0    |Ella Lola, a la Trilby (1898)|1    |94431  |
# MAGIC |5.0    |Serving Life (2011)          |1    |129034 |
# MAGIC |5.0    |Diplomatic Immunity (2009? ) |1    |107434 |
# MAGIC +-------+-----------------------------+-----+-------+
# MAGIC only showing top 3 rows
# MAGIC ```

# COMMAND ----------

# TODO: Replace <FILL_IN> with appropriate code
from pyspark.sql import functions as F

# From ratingsDF, create a movie_ids_with_avg_ratings_df that combines the two DataFrames
ratings_df.show(3)
movie_ids_with_avg_ratings_df = ratings_df.groupBy('movieId').agg(F.count(ratings_df.rating).alias("count"), F.avg(ratings_df.rating).alias("average"))
print('movie_ids_with_avg_ratings_df:')
movie_ids_with_avg_ratings_df.show(3, truncate=False)

# Note: movie_names_df is a temporary variable, used only to separate the steps necessary
# to create the movie_names_with_avg_ratings_df DataFrame.
movie_names_df = movie_ids_with_avg_ratings_df.join(movies_df,movie_ids_with_avg_ratings_df["movieId"]==movies_df["Id"])
movie_names_with_avg_ratings_df = movie_names_df.drop("Id")

print('movie_names_with_avg_ratings_df:')
movie_names_with_avg_ratings_df.show(3, truncate=False)

# COMMAND ----------

# TEST Movies with Highest Average Ratings (1a)
Test.assertEquals(movie_ids_with_avg_ratings_df.count(), 26744,
Example #27
0
 def range_frame_match():
     return "RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" in df.select(
         F.count("*").over(window.Window.rangeBetween(-sys.maxsize, sys.maxsize))
     ).columns[0]
 def findGenderAggCount(self, userDF):
     genderAggCntDF = userDF.groupBy("gender").\
         agg(F.count("gender").alias("CountOfEmployeesByGender")).\
         sort(F.desc("CountOfEmployeesByGender"))
     return genderAggCntDF
Example #29
0
def app_and_plays(result_loc_, date_):
    """
    Compute App Sessions and content play sessions and time spent on content consumption.
    :param result_loc_: pathlib.Path object to store resultant CSV at.
    :param date_: datetime object to use in query and path
    :return: None
    """
    spark = SparkSession.builder.appName("content_plays").master(
        "local[*]").getOrCreate()
    account_name = os.environ['AZURE_STORAGE_ACCOUNT']
    account_key = os.environ['AZURE_STORAGE_ACCESS_KEY']
    container = 'telemetry-data-store'
    spark.conf.set(
        'fs.azure.account.key.{}.blob.core.windows.net'.format(account_name),
        account_key)
    path = 'wasbs://{}@{}.blob.core.windows.net/telemetry-denormalized/summary/{}-*'.format(
        container, account_name, date_.strftime('%Y-%m-%d'))
    data = spark.read.json(path).filter(
        func.col("dimensions.pdata.id").isin(
            config['context']['pdata']['id']['app'], config['context']['pdata']
            ['id']['portal'], config['context']['pdata']['id']['desktop'])
        & func.col("dimensions.type").isin("content", "app")).select(
            func.col("dimensions.sid"),
            func.col("dimensions.pdata.id").alias("pdata_id"),
            func.col("dimensions.type"), func.col("dimensions.mode"),
            func.col("dimensions.did"),
            func.col("object.id").alias("object_id"),
            func.col("edata.eks.time_spent"), func.col("object.rollup.l1"))
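    # App sessions: events of type 'app' coming from the app or desktop producers.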
    app = data.filter(
        func.col('type').isin('app') & func.col('pdata_id').isin(
            config['context']['pdata']['id']['app'], config['context']['pdata']
            ['id']['desktop']))
    app_df = app.groupBy(func.col('pdata_id')).agg(
        func.count('sid').alias('Total App Sessions'),
        func.countDistinct('did').alias('Total Devices on App'),
        (func.sum('time_spent') /
         3600).alias('Total Time on App (in hours)')).toPandas()
    app_df['x_index'] = 0
    app_df.set_index("x_index", inplace=True)
    x_app = app_df.pivot(columns='pdata_id')
    result_loc_.joinpath(date_.strftime('%Y-%m-%d')).mkdir(exist_ok=True)
    x_app.to_csv(result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                      'app_sessions.csv'),
                 index=False)
    post_data_to_blob(result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                           'app_sessions.csv'),
                      backup=True)
    play = data.filter(func.col('mode').isin('play'))
    content = spark.read.csv(
        str(
            result_loc_.parent.joinpath('tb_metadata',
                                        date_.strftime('%Y-%m-%d'),
                                        'textbook_snapshot.csv')),
        header=True).select(func.col('identifier'),
                            func.col('channel')).distinct()
    play_df = play.join(
        content, play.l1 == content.identifier,
        how='left').groupBy(func.col('channel'), func.col('pdata_id')).agg(
            func.count('sid').alias('Total Content Plays'),
            func.countDistinct('did').alias(
                'Total Devices that played content'),
            (func.sum('time_spent') /
             3600).alias('Content Play Time (in hours)')).toPandas()
    x_play = play_df.pivot(index='channel', columns='pdata_id')
    x_play.to_csv(result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                       'plays.csv'))
    post_data_to_blob(result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                           'plays.csv'),
                      backup=True)
    spark.stop()
 def findOccupAggCount(self, userDF):
     occupAggCntDF = userDF.groupBy("occupation"). \
         agg(F.count("occupation").alias("CountOfEmployeesByOccup")). \
         sort(F.desc("CountOfEmployeesByOccup"))
     return occupAggCntDF
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # timestamp from 01/10/2019
    timestamp = 1569888000

    ## Phonetic
    dyf_learning_object = glueContext.create_dynamic_frame.from_catalog(
        database="nvn_knowledge",
        table_name="learning_object"
    )
    dyf_phonemic = Filter.apply(frame=dyf_learning_object, f=lambda x: x["learning_object_type"] == 'phonetic')
    dyf_phonemic = dyf_phonemic.select_fields(['learning_object_id', 'learning_object_name'])
    # df_phonemic = dyf_phonemic.toDF()
    # df_phonemic = df_phonemic.withColumn('lo_name', convertedudf(df_phonemic.learning_object_name))
    # df_phonemic.show()
    # Extract the phonetics
    df1 = dyf_phonemic.toDF()
    df1 = df1.select('learning_object_id', 'learning_object_name')
    # myArr = np.array(df1.select('phonemic').collect())
    arrPhonetic = [row.learning_object_name for row in df1.collect()]
    arrPhoneticId = [[row.learning_object_name, row.learning_object_id] for row in df1.collect()]
    # print(unicode(arrPhonetic[2]))
    # print('ARR:', arrPhonetic)
    # print('ARR:', arrPhonetic[2].encode('utf-8', 'replace'))
    # print('ARR1 :', (u'i:' in arrPhonetic))

    # ETL TBHV
    # Custom function


    def doAddScoreAll(plus, minus):
        if plus is None and minus is not None:
            return minus
        if minus is None and plus is not None:
            return plus
        if minus is not None and plus is not None:
            return plus + minus
        return 0

    addScoreAll = udf(doAddScoreAll, IntegerType())

    def do_get_phone_tic_id(phonetic):
        phonetic = phonetic.encode('utf-8', 'replace').strip()
        for x in arrPhoneticId:
            p = x[0].encode('utf-8', 'replace').strip()
            if p == phonetic:
                return x[1]

    get_phone_tic_id = udf(do_get_phone_tic_id, IntegerType())

    def do_check_null(val1, val2):
        if val1 is None and val2 is not None:
            return val2
        if val2 is None and val1 is not None:
            return val1
        if val1 is not None and val2 is not None:
            return val1
        return 0

    check_data_null = udf(do_check_null, StringType())

    def doSplitWord(word):
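        # Greedily split the word into phonetic symbols, preferring 2-character symbols over 1-character ones.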
        rs = []
        if word is not None:
            i = 0
            size = len(word)
            while i < size:
                s = word[i:i + 2]
                i += 2
                if s in arrPhonetic:
                    rs.append(s)
                if s not in arrPhonetic:
                    i -= 2
                    s = word[i:i + 1]
                    i += 1
                    if s in arrPhonetic:
                        rs.append(s)

        return rs

    splitWord = udf(lambda x: doSplitWord(x))

    state_right = 'state_right'
    state_wrong = 'state_wrong'

    # knowledge points are added by default
    # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D1; P3_D2; P4_D1; P4_D2
    # knowledge = []
    # add comprehension points:
    # Need to list the names that get comprehension points:
    # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2
    comprehension = ['P1_D1', 'P1_D2', 'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D1', 'P3_D2', 'P4_D1', 'P4_D2']
    # add application points:
    # Need to list the names that get application points:
    # P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2
    application = ['P1_D1', 'P1_D2', 'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D1', 'P3_D2', 'P4_D1', 'P4_D2']
    # add analysis points:
    # Need to list the names that get analysis points
    # P2_D3; P3_D2; P4_D1; P4_D2
    analysis = ['P2_D3', 'P3_D2', 'P4_D1', 'P4_D2']
    # add synthesis points:
    # Need to list the names that get synthesis points
    # P4_D1; P4_D2
    synthesis = []
    # add evaluation points:
    # Need to list the names that get evaluation points
    evaluation = []

    def doAddScore(name, state, type):
        arr = ['']
        score = 0
        if type == 'comprehension':
            arr = comprehension

        if type == 'application':
            arr = application

        if type == 'analysis':
            arr = analysis

        if type == 'synthesis':
            arr = synthesis

        name = name.lower() if name is not None else None
        if state == state_right:
            score = 2
        if state == state_wrong:
            score = -1

        if name is not None:
            for x in arr:
                if x.lower() in name:
                    return score
        return 0

    addScore = udf(doAddScore, IntegerType())

    # special characters to strip from the word lists
    special_str = '["] ;'

    ########## top_quiz_attempts
    dyf_top_quiz_attempts = glueContext.create_dynamic_frame.from_catalog(
        database="moodle",
        table_name="top_quiz_attempts"
    )
    dyf_top_quiz_attempts = dyf_top_quiz_attempts.select_fields(['_key', 'id', 'timestart', 'quiz'])

    dyf_top_quiz_attempts = dyf_top_quiz_attempts.resolveChoice(specs=[('_key', 'cast:long')])

    # print dyf_top_quiz_attempts.count()
    # dyf_top_quiz_attempts.show(2)

    dyf_top_quiz_attempts = Filter.apply(frame=dyf_top_quiz_attempts,
                                         f=lambda x: x["timestart"] >= timestamp)

    # print dyf_top_quiz_attempts.count()
    # dyf_top_quiz_attempts.show()

    # handle the case where start_read is null
    # try:
    #     # read the flag checkpoint from s3
    #     df_flag = spark.read.parquet("s3a://dtsodin/flag/flag_knowledge_ngu_am_top_ai")
    #     start_read = df_flag.collect()[0]['flag']
    #     print('read from index: ', start_read)
    #
    #     # compare the datasource _key with the flag and keep only rows with _key > flag
    #     dyf_top_quiz_attempts = Filter.apply(frame=dyf_top_quiz_attempts, f=lambda x: x['_key'] > start_read)
    # except:
    #     print('read flag file error ')

    # print('the number of new contacts: ', dyf_top_quiz_attempts.count())

    if dyf_top_quiz_attempts.count() > 0:
        ########## dyf_top_user
        dyf_top_user = glueContext.create_dynamic_frame.from_catalog(
            database="moodle",
            table_name="do_top_user"
        )
        dyf_top_user = dyf_top_user.select_fields(
            ['id', 'student_id']).rename_field('id', 'top_user_id')
        ######### top_question
        dyf_top_question = glueContext.create_dynamic_frame.from_catalog(
            database="moodle",
            table_name="top_question"
        )
        dyf_top_question = dyf_top_question.select_fields(
            ['id', 'name'])
        # dyf_top_result_ai = dyf_top_result_ai.resolveChoice(specs=[('_key', 'cast:long')])

        ######### top_result_ai
        dyf_top_result_ai = glueContext.create_dynamic_frame.from_catalog(
            database="moodle",
            table_name="top_result_ai"
        )
        dyf_top_result_ai = dyf_top_result_ai.select_fields(
            ['question_id', 'attempt_id', 'user_id', 'ratio', 'right_word', 'wrong_word'])

        # JOIN and FILTER the tables by the required conditions
        dyf_join01 = Join.apply(dyf_top_result_ai, dyf_top_question, 'question_id', 'id')
        dyf_join02 = Join.apply(dyf_join01, dyf_top_quiz_attempts, 'attempt_id', 'id')
        dyf_join02 = Filter.apply(frame=dyf_join02, f=lambda x: x["quiz"] not in [7, 9, 918])
        dyf_join02 = Join.apply(dyf_join02, dyf_top_user, 'user_id', 'top_user_id')

        # dyf_join02 = Filter.apply(frame=dyf_join02, f=lambda x: x["student_id"] == 259442)

        # dyf_join02.show()
        df_study = dyf_join02.toDF()
        df_study.cache()
        if (df_study.count() > 0):
            try:

                # print("COUNT 1:", df_study.count())
                # Strip the special characters [ ] "
                # The data currently looks like: ["house","her","to","how","get","long"] or "environmental", ...
                # df_study = df_study.select(
                #     'quiz', 'name', 'user_id', 'timestart', 'right_word', 'wrong_word', f.translate(df_study.right_word,
                #                                                                                     special_str, ''), f.translate(df_study.wrong_word,
                #                                        special_str, ''))
                df_study = df_study.select(
                    'quiz', 'name', 'student_id', 'timestart', 'right_word', 'wrong_word')
                df_study = df_study.withColumn("right_word_new", f.translate(df_study.right_word, special_str, '')) \
                    .withColumn("wrong_word_new", f.translate(df_study.wrong_word, special_str, ''))

                # Split the sentence into an array of words:
                # house, her => [house, her]
                # Analyze the correct words
                df_study_right = df_study.withColumn("right_word_list", f.split(
                    df_study.right_word_new, ','))

                # Explode the array column into multiple rows
                # row: [house, her] =>
                # row1: house
                # row2: her
                df_study_right = df_study_right.withColumn("right", f.explode(df_study_right.right_word_list))
                # convert to lowercase
                df_study_right = df_study_right.withColumn("right", f.lower(f.col("right")))
                df_study_right = df_study_right.select('quiz', 'name', 'student_id', 'timestart', 'right')
                # print("COUNT 2:", df_study_right.count())
                # df_study_right.printSchema()
                # df_study_right.show()
                dyf_study_right = DynamicFrame.fromDF(df_study_right, glueContext, "dyf_study_right")
                ## Learning Object
                dyf_learning_object = glueContext.create_dynamic_frame.from_catalog(
                    database="nvn_knowledge",
                    table_name="nvn_knowledge_learning_object"
                )
                dyf_learning_object = Filter.apply(frame=dyf_learning_object,
                                            f=lambda x: x["learning_object_type"] == 'vocabulary')
                dyf_learning_object = dyf_learning_object.select_fields(
                    ['learning_object_id', 'learning_object_name', 'transcription'])
                df_learning_object = dyf_learning_object.toDF()
                # convert to lowercase
                df_learning_object = df_learning_object.withColumn("learning_object_name", f.lower(f.col("learning_object_name")))
                # strip the apostrophe and comma characters from the transcription
                df_learning_object = df_learning_object.withColumn("phone_tic_new",
                                                                   f.translate(df_learning_object.transcription, '\',', ''))

                df_learning_object = df_learning_object.withColumn("phone_tic_tmp",
                                                                   splitWord(df_learning_object.phone_tic_new))
                df_learning_object = df_learning_object.withColumn("phone_tic_tmp_01",
                                                                   f.translate(df_learning_object.phone_tic_tmp, '[]',
                                                                               ''))
                df_learning_object = df_learning_object.withColumn("phone_tic_arr",
                                                                   f.split(df_learning_object.phone_tic_tmp_01, ','))

                df_learning_object = df_learning_object.withColumn("split_phonetic",
                                                                   f.explode(df_learning_object.phone_tic_arr))

                df_learning_object = df_learning_object.select('learning_object_id', 'learning_object_name',
                                                               'split_phonetic')

                dyf_learning_object = DynamicFrame.fromDF(df_learning_object, glueContext, "dyf_learning_object")

                dyf_knowledge_right = Join.apply(dyf_study_right, dyf_learning_object, 'right', 'learning_object_name')


                # print("COUNT 3:", dyf_knowledge_right.count())
                # dyf_knowledge_right.printSchema()
                # 1
                df_knowledge_right = dyf_knowledge_right.toDF()
                # df_knowledge_right = df_knowledge_right.withColumn("right_phonetic",
                #                                                    f.explode(df_knowledge_right.phone_tic_arr))
                df_knowledge_right = df_knowledge_right.select('timestart', 'name', 'student_id', 'split_phonetic')
                df_knowledge_right = df_knowledge_right.withColumn("learning_object_id", get_phone_tic_id(df_knowledge_right.split_phonetic))
                # dyf_phonemic_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_phonemic_right")



                # dyf_phonemic_right = Join.apply(dyf_study_right, dyf_phonemic, 'split_phonetic', 'learning_object_name')
                #
                # dropnullfields = DropNullFields.apply(frame=dyf_phonemic_right, transformation_ctx="dropnullfields")
                # datasink6 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields,
                #                                                            catalog_connection="glue_redshift",
                #                                                            connection_options={
                #                                                                "dbtable": "mapping_lo_student_history_v06",
                #                                                                "database": "dts_odin"
                #                                                            },
                #                                                            redshift_tmp_dir="s3n://dts-odin/temp1/top_question_attempt/",
                #                                                            transformation_ctx="datasink6")

                # dyf_knowledge_wrong.printSchema()
                # Add points for the correct words
                # df_knowledge_right = dyf_phonemic_right.toDF()
                # print("COUNT 4:")
                # df_knowledge_right.printSchema()
                df_knowledge_right.cache()

                df_knowledge_right = df_knowledge_right.withColumn("knowledge", f.lit(2)) \
                    .withColumn("comprehension",
                                addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('comprehension'))) \
                    .withColumn("application",
                                addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('application'))) \
                    .withColumn("analysis", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('analysis'))) \
                    .withColumn("synthesis",
                                addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('synthesis'))) \
                    .withColumn("evaluation", f.lit(0)) \
                    .withColumn("date_id", from_unixtime(df_knowledge_right['timestart'], 'yyyyMMdd')) \
                    .withColumn("lo_type", f.lit(2))

                dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right")
                # dropnullfields = DropNullFields.apply(frame=dyf_knowledge_right, transformation_ctx="dropnullfields")
                # datasink6 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields,
                #                                                            catalog_connection="glue_redshift",
                #                                                            connection_options={
                #                                                                "dbtable": "mapping_lo_student_history_v02",
                #                                                                "database": "dts_odin"
                #                                                            },
                #                                                            redshift_tmp_dir="s3n://dts-odin/temp1/top_question_attempt/",
                #                                                            transformation_ctx="datasink6")

                # print("COUNT 444444444444444:", df_knowledge_right.count())
                # df_knowledge_right.printSchema()
                # df_knowledge_right.show()
                #
                # dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right")
                # # select the fields and data types to push into the db
                # applymapping = ApplyMapping.apply(frame=dyf_knowledge_right,
                #                                   mappings=[("timestart", "long", "timestart", "long"),
                #                                             ("student_id", 'int', 'student_id', 'long'),
                #                                             ("name", 'string', 'name', 'string'),
                #                                             ("learning_object_id", "long", "learning_object_id", "long"),
                #                                             ("date_id", "string", "date_id", "long"),
                #                                             ("knowledge", "int", "knowledge", "long"),
                #                                             ("comprehension", "int", "comprehension", "long"),
                #                                             ("application", "int", "application", "long"),
                #                                             ("analysis", "int", "analysis", "long"),
                #                                             ("synthesis", "int", "synthesis", "long"),
                #                                             ("evaluation", "int", "evaluation", "long"),
                #                                             ("lo_type", "int", "lo_type", "int")])
                # resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                #                                     transformation_ctx="resolvechoice")
                # dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields")
                #
                # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields,
                #                                                            catalog_connection="glue_redshift",
                #                                                            connection_options={
                #                                                                "dbtable": "t_temp_right_learning_object_phonetic",
                #                                                                "database": "dts_odin"
                #                                                            },
                #                                                            redshift_tmp_dir="s3n://dts-odin/temp1/",
                #                                                            transformation_ctx="datasink5")
                # END add points for the correct words

                ##################################################
                # Deduct points for the wrong words: handled the same way as the correct words.
                # Rule: deduct 1 point for each wrong word
                df_study_wrong = df_study.withColumn("wrong_word_list", f.split(
                    df_study.wrong_word_new, ','))

                # Explode the array column into multiple rows
                # row: [house, her] =>
                # row1: house
                # row2: her
                df_study_wrong = df_study_wrong.withColumn("wrong", f.explode(df_study_wrong.wrong_word_list))
                # convert to lowercase
                df_study_wrong = df_study_wrong.withColumn("wrong", f.lower(f.col("wrong")))
                df_study_wrong = df_study_wrong.select('quiz', 'name', 'student_id', 'timestart', 'wrong')
                # print("COUNT 2222:", df_study_wrong.count())
                # df_study_wrong.printSchema()
                # df_study_wrong.show()
                dyf_study_wrong = DynamicFrame.fromDF(df_study_wrong, glueContext, "dyf_study_wrong")
                ## Learning Object
                dyf_knowledge_wrong = Join.apply(dyf_study_wrong, dyf_learning_object, 'wrong', 'learning_object_name')

                df_knowledge_wrong = dyf_knowledge_wrong.toDF()
                # df_knowledge_wrong = df_knowledge_wrong.withColumn("wrong_phonetic",
                #                                                    f.explode(df_knowledge_wrong.phone_tic_arr))
                df_knowledge_wrong = df_knowledge_wrong.select('timestart', 'name', 'student_id', 'split_phonetic')

                df_knowledge_wrong = df_knowledge_wrong.withColumn("learning_object_id",
                                                                   get_phone_tic_id(df_knowledge_wrong.split_phonetic))

                # dyf_study_wrong = DynamicFrame.fromDF(df_knowledge_wrong, glueContext, "dyf_study_wrong")

                # dyf_phonemic_wrong = Join.apply(dyf_study_wrong, dyf_phonemic, 'split_phonetic', 'learning_object_name')

                # print("COUNT 3:", dyf_knowledge_wrong.count())
                # dyf_knowledge_wrong.printSchema()
                # print("COUNT 4:", dyf_knowledge_wrong.count())
                # dyf_knowledge_wrong.printSchema()
                # Deduct points for the wrong words
                # df_knowledge_wrong = dyf_phonemic_wrong.toDF()
                df_knowledge_wrong.cache()

                df_knowledge_wrong = df_knowledge_wrong.withColumn("knowledge", f.lit(-1)) \
                    .withColumn("comprehension",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('comprehension'))) \
                    .withColumn("application",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('application'))) \
                    .withColumn("analysis", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('analysis'))) \
                    .withColumn("synthesis",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('synthesis'))) \
                    .withColumn("evaluation", f.lit(0)) \
                    .withColumn("date_id", from_unixtime(df_knowledge_wrong['timestart'], 'yyyyMMdd'))

                # df_knowledge_wrong.printSchema()
                # df_knowledge_wrong.show()
                #
                # dyf_knowledge_wrong = DynamicFrame.fromDF(df_knowledge_wrong, glueContext, "dyf_knowledge_wrong")
                #
                # # select the fields and data types to push into the db
                # applymapping1 = ApplyMapping.apply(frame=dyf_knowledge_wrong,
                #                                    mappings=[("timestart", "long", "timestart", "long"),
                #                                              ("name", 'string', 'name', 'string'),
                #                                              ("student_id", 'int', 'student_id', 'long'),
                #                                              ("id", "int", "learning_object_id", 'long'),
                #                                              ("date_id", "string", "date_id", "long"),
                #                                              ("knowledge", "int", "knowledge", "long"),
                #                                              ("comprehension", "int", "comprehension", "long"),
                #                                              ("application", "int", "application", "long"),
                #                                              ("analysis", "int", "analysis", "long"),
                #                                              ("synthesis", "int", "synthesis", "long"),
                #                                              ("evaluation", "int", "evaluation", "long")])
                # resolvechoice1 = ResolveChoice.apply(frame=applymapping1, choice="make_cols",
                #                                      transformation_ctx="resolvechoice1")
                # dropnullfields1 = DropNullFields.apply(frame=resolvechoice1, transformation_ctx="dropnullfields1")
                #
                # datasink6 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields1,
                #                                                            catalog_connection="glue_redshift",
                #                                                            connection_options={
                #                                                                "dbtable": "t_temp_right_learning_object_phonetic",
                #                                                                "database": "dts_odin",
                #                                                                "postactions": """ call proc_knowledge_ngu_am_top_result_ai () """
                #                                                            },
                #                                                            redshift_tmp_dir="s3n://dts-odin/temp1/",
                #                                                            transformation_ctx="datasink5")



                ### Save to the mapping_lo_student_history table
                df_knowledge_right = df_knowledge_right.groupby('student_id', 'date_id',
                                                                'learning_object_id').agg(
                    f.count('knowledge').alias("count_plus"),
                    f.sum('knowledge').alias("knowledge_plus"),
                    f.sum('comprehension').alias("comprehension_plus"),
                    f.sum('application').alias("application_plus"),
                    f.sum('analysis').alias("analysis_plus"),
                    f.sum('synthesis').alias("synthesis_plus"),
                    f.sum('evaluation').alias("evaluation_plus"))
                df_knowledge_right = df_knowledge_right.where('student_id is not null')

                df_knowledge_wrong = df_knowledge_wrong.groupby('student_id', 'date_id',
                                                                'learning_object_id').agg(
                    f.count('knowledge').alias("count_minus"),
                    f.sum('knowledge').alias("knowledge_minus"),
                    f.sum('comprehension').alias("comprehension_minus"),
                    f.sum('application').alias("application_minus"),
                    f.sum('analysis').alias("analysis_minus"),
                    f.sum('synthesis').alias("synthesis_minus"),
                    f.sum('evaluation').alias("evaluation_minus")) \
                    .withColumnRenamed('student_id', 'student_id_wrong') \
                    .withColumnRenamed('date_id', 'date_id_wrong') \
                    .withColumnRenamed('learning_object_id', 'learning_object_id_wrong')
                df_knowledge_wrong = df_knowledge_wrong.where('student_id_wrong is not null')
                df_knowledge = df_knowledge_right.join(df_knowledge_wrong, (
                        df_knowledge_right['student_id'] == df_knowledge_wrong['student_id_wrong']) & (
                                                               df_knowledge_right['date_id'] ==
                                                               df_knowledge_wrong['date_id_wrong']) & (
                                                               df_knowledge_right['learning_object_id'] ==
                                                               df_knowledge_wrong['learning_object_id_wrong']), 'outer')
                df_knowledge = df_knowledge.withColumn("user_id",
                                check_data_null(df_knowledge.student_id, df_knowledge.student_id_wrong)) \
                    .withColumn("learning_object_id",
                                check_data_null(df_knowledge.learning_object_id, df_knowledge.learning_object_id_wrong)) \
                    .withColumn("created_date_id",
                                check_data_null(df_knowledge.date_id, df_knowledge.date_id_wrong)) \
                    .withColumn("source_system", f.lit('top_result_ai_phonetic')) \
                    .withColumn("lu_id", f.lit(0))

                dyf_knowledge = DynamicFrame.fromDF(df_knowledge, glueContext, "df_knowledge")

                # dyf_knowledge.printSchema()
                dyf_knowledge.printSchema()
                dyf_knowledge.show()

                # dyf_knowledge = DynamicFrame.fromDF(dyf_knowledge, glueContext, "dyf_knowledge")
                # select the fields and data types to push into the db
                applymapping = ApplyMapping.apply(frame=dyf_knowledge,
                                                  mappings=[("user_id", 'string', 'student_id', 'long'),
                                                             ("learning_object_id", "string", "learning_object_id", "long"),
                                                             # ("knowledge", "int", "knowledge", "long"),
                                                             # ("comprehension", "int", "comprehension", "long"),
                                                             # ("application", "int", "application", "long"),
                                                             # ("analysis", "int", "analysis", "long"),
                                                             # ("synthesis", "int", "synthesis", "long"),
                                                             # ("evaluation", "int", "evaluation", "long"),
                                                             ("knowledge_plus", "long", "knowledge_plus", "long"),
                                                             ("comprehension_plus", "long", "comprehension_plus", "long"),
                                                             ("application_plus", "long", "application_plus", "long"),
                                                             ("analysis_plus", "long", "analysis_plus", "long"),
                                                             ("synthesis_plus", "long", "synthesis_plus", "long"),
                                                             ("evaluation_plus", "long", "evaluation_plus", "long"),
                                                             ("knowledge_minus", "long", "knowledge_minus", "long"),
                                                             ("comprehension_minus", "long", "comprehension_minus", "long"),
                                                             ("application_minus", "long", "application_minus", "long"),
                                                             ("analysis_minus", "long", "analysis_minus", "long"),
                                                             ("synthesis_minus", "long", "synthesis_minus", "long"),
                                                             ("evaluation_minus", "long", "evaluation_minus", "long"),
                                                             ("count_plus", "long", "plus_number", "long"),
                                                             ("count_minus", "long", "minus_number", "long"),
                                                             # ("lo_type", "string", "lo_type", "long"),
                                                             ("source_system", "string", "source_system", "string"),
                                                             ("created_date_id", "string", "created_date_id", "long"),
                                                             ("lu_id", "int", "lu_type", "long")
                                                             # ("student_level", "string", "student_level", "string"),
                                                             # ("advisor_id", "string", "advisor_id", "long"),
                                                             # ("package_code", "string", "package_code", "string")
                                                             ])
                resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                                                    transformation_ctx="resolvechoice")
                dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields")

                print('START WRITE TO S3-------------------------')

                datasink6 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields, connection_type="s3",
                                                                         connection_options={
                                                                             "path": "s3://dtsodin/nvn_knowledge/mapping_lo_student_history_v2/",
                                                                             "partitionKeys": ["created_date_id", "source_system"]},
                                                                         format="parquet",
                                                                         transformation_ctx="datasink6")
                print('END WRITE TO S3-------------------------')

                # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields,
                #                                                            catalog_connection="glue_redshift",
                #                                                            connection_options={
                #                                                                "dbtable": "mapping_lo_student_history",
                #                                                                "database": "dts_odin"
                #                                                            },
                #                                                            redshift_tmp_dir="s3n://dts-odin/temp1/top_question_attempt/",
                #                                                            transformation_ctx="datasink5")


                ### END save to the mapping_lo_student_history table
                # END deduct points for the wrong words
                # get the max _key from the datasource
                datasource = dyf_top_quiz_attempts.toDF()
                flag = datasource.agg({"_key": "max"}).collect()[0][0]
                flag_data = [flag]
                df = spark.createDataFrame(flag_data, "long").toDF('flag')

                # overwrite the new flag in s3
                df.write.parquet("s3a://dtsodin/flag/flag_knowledge_ngu_am_top_ai", mode="overwrite")
                # clear the cached DataFrames
                df_study.unpersist()
                df_knowledge_right.unpersist()
                df_knowledge_wrong.unpersist()
            except Exception as e:
                print("###################### Exception ##########################")
                print(e)
df.show()
df.count()
df.describe().show()

pd.options.display.html.table_schema = True
df.describe().toPandas()

df.select("dept_name").describe().show()


# The count in the describe method is a count of non-missing values.
df.describe().show()
df.count()

df.head(5)
rows = df.head(5)
type(rows)
rows[0][0]
rows[0]['dept_division']
df.take(5)

df.show(10)

df.describe("dept_division").show()

from pyspark.sql.functions import count, countDistinct
df.select(count("dept_division"), countDistinct("dept_division")).show()

df.select("dept_name").show(10)

df.select("dept_name").distinct().show()
Example #33
0
    def describe_1d(df, column, nrows, lookup_config=None):
        column_type = df.select(column).dtypes[0][1]
        # TODO: think about implementing analysis for complex
        # data types:
        if ("array" in column_type) or ("stuct"
                                        in column_type) or ("map"
                                                            in column_type):
            raise NotImplementedError(
                "Column {c} is of type {t} and cannot be analyzed".format(
                    c=column, t=column_type))

        distinct_count = df.select(column).agg(
            countDistinct(col(column)).alias("distinct_count")).toPandas()
        non_nan_count = df.select(column).na.drop().select(
            count(col(column)).alias("count")).toPandas()
        results_data = pd.concat([distinct_count, non_nan_count], axis=1)
        results_data["p_unique"] = results_data["distinct_count"] / float(
            results_data["count"])
        results_data["is_unique"] = results_data["distinct_count"] == nrows
        results_data["n_missing"] = nrows - results_data["count"]
        results_data["p_missing"] = results_data["n_missing"] / float(nrows)
        results_data["p_infinite"] = 0
        results_data["n_infinite"] = 0
        result = results_data.iloc[0].copy()
        result["memorysize"] = 0
        result.name = column

        if result["distinct_count"] <= 1:
            result = result.append(describe_constant_1d(df, column))
        elif column_type in {"tinyint", "smallint", "int", "bigint"}:
            result = result.append(
                describe_integer_1d(df, column, result, nrows))
        elif column_type in {"float", "double", "decimal"}:
            result = result.append(describe_float_1d(df, column, result,
                                                     nrows))
        elif column_type in {"date", "timestamp"}:
            result = result.append(describe_date_1d(df, column))
        elif result["is_unique"] == True:
            result = result.append(describe_unique_1d(df, column))
        else:
            result = result.append(describe_categorical_1d(df, column))
            # Fix to also count the MISSING value in the distinct_count field:
            if result["n_missing"] > 0:
                result["distinct_count"] = result["distinct_count"] + 1

        # TODO: check whether it is worth it to
        # implement the "real" mode:
        if (result["count"] > result["distinct_count"] > 1):
            try:
                result["mode"] = result["top"]
            except KeyError:
                result["mode"] = 0
        else:
            try:
                result["mode"] = result["value_counts"].index[0]
            except KeyError:
                result["mode"] = 0
            # If an IndexError happens,
            # it is because all of the column's values are NULL:
            except IndexError:
                result["mode"] = "MISSING"

        if lookup_config:
            lookup_object = lookup_config['object']
            col_name_in_db = lookup_config[
                'col_name_in_db'] if 'col_name_in_db' in lookup_config else None
            try:
                matched, unmatched = lookup_object.lookup(
                    df.select(column), col_name_in_db)
                result['lookedup_values'] = str(matched.count()) + "/" + str(
                    df.select(column).count())
            except:
                result['lookedup_values'] = 'FAILED'
        else:
            result['lookedup_values'] = ''

        return result
Example #34
0
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import count
reload(sys)
sys.setdefaultencoding('utf8')

spark = SparkSession \
    .builder \
    .appName("p3b") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

df_escuelas = spark.read.csv("hdfs://localhost/data/escuelasPR.csv")
df_count = df_escuelas.filter(df_escuelas._c0 == "Arecibo").groupBy(
    "_c1", "_c2").agg(count("*"))

df_escuelas.show()
df_count = df_count.toDF("Distrito", "Ciudad", "Count")
df_count.show()
Example #35
0
def main(input1,input2,input3,input4,input5,input6,output):
    # main logic starts here
 business = spark.read.json(input1)
 bus = business.select(business['business_id'],business['name'],business['latitude'],business['longitude'], business['categories'], business['stars'], business['review_count']).filter(business['categories'].contains("Restaurants,")).filter(business['city'].contains("Toronto"))
 review = spark.read.json(input2)
 review_final = review.select(review['business_id'].alias("bus_id_rev"), review['user_id'], review['stars'], review['text'])
 rev = review.select(review['business_id'].alias("bus_id"), review['text'])
 bus_rev = bus.join(rev, rev['bus_id']==bus['business_id']).filter(bus['business_id']=='m2xeKBhS0szlm7xfU5b8ew')
 bus_count = bus_rev.select(functions.count(bus_rev['business_id'])).collect()
 
 
 food = spark.read.csv(input3, header = True, schema = food_schema)
 fd = food.select(food['business_id'].alias("bus_id_fd"), food['text'], food['food'].alias("food_rating"), (functions.regexp_extract(food['prob1'], '(.)(\d+.\d+)(.)', 2)).alias("f_prob1"), (functions.regexp_extract(food['prob2'], '(.)(\d+.\d+)(.)', 2)).alias("f_prob2"), (functions.regexp_extract(food['prob3'], '(.)(\d+.\d+)(.)', 2)).alias("f_prob3"))
 bus_fd = bus.join(fd, fd['bus_id_fd']==bus['business_id'])
 fd_business1 = bus_fd.select(bus_fd['bus_id_fd'], (bus_fd['food_rating']-bus_fd['food_rating']+1).alias("f_positive"), (bus_fd['food_rating']-bus_fd['food_rating']+0).alias("f_neutral"), (bus_fd['food_rating']-bus_fd['food_rating']+0).alias("f_negative")).filter(bus_fd['food_rating']==1)
 fd_business2 = bus_fd.select(bus_fd['bus_id_fd'], (bus_fd['food_rating']-bus_fd['food_rating']+0).alias("f_positive"), (bus_fd['food_rating']-bus_fd['food_rating']+1).alias("f_neutral"), (bus_fd['food_rating']-bus_fd['food_rating']+0).alias("f_negative")).filter(bus_fd['food_rating']==2)
 fd_business3 = bus_fd.select(bus_fd['bus_id_fd'], (bus_fd['food_rating']-bus_fd['food_rating']+0).alias("f_positive"), (bus_fd['food_rating']-bus_fd['food_rating']+0).alias("f_neutral"), (bus_fd['food_rating']-bus_fd['food_rating']+1).alias("f_negative")).filter(bus_fd['food_rating']==3)
 fd_bus = fd_business1.unionAll(fd_business2)
 fd_business = fd_bus.unionAll(fd_business3)
 fd_group = fd_business.groupby(fd_business['bus_id_fd']).agg(functions.sum(fd_business['f_positive']).alias("f_positive"), functions.sum(fd_business['f_neutral']).alias("f_neutral"), functions.sum(fd_business['f_negative']).alias("f_negative")) 
 fd_count = fd_group.select(functions.count(fd_group['bus_id_fd'])).collect()
 fd_c1 = fd_group.select(functions.sum(fd_group['f_positive'])).collect()
 fd_c2 = fd_group.select(functions.sum(fd_group['f_neutral'])).collect()
 fd_c3 = fd_group.select(functions.sum(fd_group['f_negative'])).collect()

 price = spark.read.csv(input5, header = True, schema = price_schema)
 pr = price.select(price['business_id'].alias("bus_id_pr"), price['text'], price['price'].alias("price_rating"), (functions.regexp_extract(price['p_prob1'], '(.)(\d+.\d+)(.)', 2)).alias("p_prob1"), (functions.regexp_extract(price['p_prob2'], '(.)(\d+.\d+)(.)', 2)).alias("p_prob2"), (functions.regexp_extract(price['p_prob3'], '(.)(\d+.\d+)(.)', 2)).alias("p_prob3"))
 bus_pr = bus.join(pr, pr['bus_id_pr']==bus['business_id'])
 pr_business1 = bus_pr.select(bus_pr['bus_id_pr'], (bus_pr['price_rating']-bus_pr['price_rating']+1).alias("p_positive"), (bus_pr['price_rating']-bus_pr['price_rating']+0).alias("p_neutral"), (bus_pr['price_rating']-bus_pr['price_rating']+0).alias("p_negative")).filter(bus_pr['price_rating']==1)
 pr_business2 = bus_pr.select(bus_pr['bus_id_pr'], (bus_pr['price_rating']-bus_pr['price_rating']+0).alias("p_positive"), (bus_pr['price_rating']-bus_pr['price_rating']+1).alias("p_neutral"), (bus_pr['price_rating']-bus_pr['price_rating']+0).alias("p_negative")).filter(bus_pr['price_rating']==2)
 pr_business3 = bus_pr.select(bus_pr['bus_id_pr'], (bus_pr['price_rating']-bus_pr['price_rating']+0).alias("p_positive"), (bus_pr['price_rating']-bus_pr['price_rating']+0).alias("p_neutral"), (bus_pr['price_rating']-bus_pr['price_rating']+1).alias("p_negative")).filter(bus_pr['price_rating']==3)
 pr_bus = pr_business1.unionAll(pr_business2)
 pr_business = pr_bus.unionAll(pr_business3)
 pr_group = pr_business.groupby(pr_business['bus_id_pr']).agg(functions.sum(pr_business['p_positive']).alias("p_positive"), functions.sum(pr_business['p_neutral']).alias("p_neutral"), functions.sum(pr_business['p_negative']).alias("p_negative")) 
 pr_count = pr_group.select(functions.count(pr_group['bus_id_pr'])).collect()
 pr_c1 = pr_group.select(functions.sum(pr_group['p_positive'])).collect()
 pr_c2 = pr_group.select(functions.sum(pr_group['p_neutral'])).collect()
 pr_c3 = pr_group.select(functions.sum(pr_group['p_negative'])).collect()
 
 service = spark.read.csv(input4, header = True, schema = service_schema)
 sr = service.select(service['business_id'].alias("bus_id_sr"), service['text'], service['service'].alias("service_rating"), (functions.regexp_extract(service['s_prob1'], '(.)(\d+.\d+)(.)', 2)).alias("s_prob1"), (functions.regexp_extract(service['s_prob2'], '(.)(\d+.\d+)(.)', 2)).alias("s_prob2"), (functions.regexp_extract(service['s_prob3'], '(.)(\d+.\d+)(.)', 2)).alias("s_prob3"))
 bus_sr = bus.join(sr, sr['bus_id_sr']==bus['business_id'])
 sr_business1 = bus_sr.select(bus_sr['bus_id_sr'], (bus_sr['service_rating']-bus_sr['service_rating']+1).alias("s_positive"), (bus_sr['service_rating']-bus_sr['service_rating']+0).alias("s_neutral"), (bus_sr['service_rating']-bus_sr['service_rating']+0).alias("s_negative")).filter(bus_sr['service_rating']==1)
 sr_business2 = bus_sr.select(bus_sr['bus_id_sr'], (bus_sr['service_rating']-bus_sr['service_rating']+0).alias("s_positive"), (bus_sr['service_rating']-bus_sr['service_rating']+1).alias("s_neutral"), (bus_sr['service_rating']-bus_sr['service_rating']+0).alias("s_negative")).filter(bus_sr['service_rating']==2)
 sr_business3 = bus_sr.select(bus_sr['bus_id_sr'], (bus_sr['service_rating']-bus_sr['service_rating']+0).alias("s_positive"), (bus_sr['service_rating']-bus_sr['service_rating']+0).alias("s_neutral"), (bus_sr['service_rating']-bus_sr['service_rating']+1).alias("s_negative")).filter(bus_sr['service_rating']==3)
 sr_bus = sr_business1.unionAll(sr_business2)
 sr_business = sr_bus.unionAll(sr_business3)
 sr_group = sr_business.groupby(sr_business['bus_id_sr']).agg(functions.sum(sr_business['s_positive']).alias("s_positive"), functions.sum(sr_business['s_neutral']).alias("s_neutral"), functions.sum(sr_business['s_negative']).alias("s_negative")) 
 sr_count = sr_group.select(functions.count(sr_group['bus_id_sr'])).collect()
 sr_c1 = sr_group.select(functions.sum(sr_group['s_positive'])).collect()
 sr_c2 = sr_group.select(functions.sum(sr_group['s_neutral'])).collect()
 sr_c3 = sr_group.select(functions.sum(sr_group['s_negative'])).collect()

 bus_fd_pr= bus.join(fd_group, fd_group['bus_id_fd']==bus['business_id'])
 bus_fd_final = bus_fd_pr.select(bus_fd_pr['business_id'],bus_fd_pr['name'],bus_fd_pr['latitude'],bus_fd_pr['longitude'], bus_fd_pr['categories'], bus_fd_pr['stars'], bus_fd_pr['review_count'],bus_fd_pr['f_positive'],bus_fd_pr['f_neutral'],bus_fd_pr['f_negative'])
 bus_pr_fd = bus_fd_final.join(pr_group, pr_group['bus_id_pr']==bus_fd_pr['business_id'])
 bus_fd_pr_final = bus_pr_fd.select(bus_pr_fd['business_id'],bus_pr_fd['name'],bus_pr_fd['latitude'],bus_pr_fd['longitude'], bus_pr_fd['categories'], bus_pr_fd['stars'], bus_pr_fd['review_count'],bus_pr_fd['f_positive'],bus_pr_fd['f_neutral'],bus_pr_fd['f_negative'],bus_pr_fd['p_positive'],bus_pr_fd['p_neutral'],bus_pr_fd['p_negative'])
 bus_pr_fd_sr = bus_fd_pr_final.join(sr_group, sr_group['bus_id_sr']==bus_fd_pr_final['business_id'])
 bus_fd_pr_sr_final = bus_pr_fd_sr.select(bus_pr_fd_sr['business_id'],bus_pr_fd_sr['name'],bus_pr_fd_sr['latitude'],bus_pr_fd_sr['longitude'], bus_pr_fd_sr['categories'], bus_pr_fd_sr['stars'], bus_pr_fd_sr['review_count'],bus_pr_fd_sr['f_positive'],bus_pr_fd_sr['f_neutral'],bus_pr_fd_sr['f_negative'],bus_pr_fd_sr['p_positive'],bus_pr_fd_sr['p_neutral'],bus_pr_fd_sr['p_negative'],bus_pr_fd_sr['s_positive'],bus_pr_fd_sr['s_neutral'],bus_pr_fd_sr['s_negative'])
 
 vader = spark.read.json(input6)
 vd = vader.select(vader['id'], (vader['composite']).alias("v_composite"), (vader['positive']).alias("v_positive"), (vader['neutral']).alias("v_neutral"), (vader['negative']).alias("v_negative"))
 vd_group = vd.groupby(vader['id']).agg(functions.round(functions.sum(vd['v_composite'])/functions.count(vd['id']),2).alias("v_composite"), functions.round(functions.sum(vd['v_positive'])/functions.count(vd['id']),2).alias("v_positive"),functions.round(functions.sum(vd['v_neutral'])/functions.count(vd['id']),2).alias("v_neutral"), functions.round(functions.sum(vd['v_negative'])/functions.count(vd['id']),2).alias("v_negative")) 
 bus_pr_fd_sr_vd = bus_fd_pr_sr_final.join(vd_group, vd_group['id']==bus_fd_pr_sr_final['business_id'])
 bus_fd_pr_sr_vd_final = bus_pr_fd_sr_vd.select(bus_pr_fd_sr_vd['business_id'],bus_pr_fd_sr_vd['name'],bus_pr_fd_sr_vd['latitude'],bus_pr_fd_sr_vd['longitude'], bus_pr_fd_sr_vd['categories'], bus_pr_fd_sr_vd['stars'], bus_pr_fd_sr_vd['review_count'],bus_pr_fd_sr_vd['f_positive'],bus_pr_fd_sr_vd['f_neutral'],bus_pr_fd_sr_vd['f_negative'],bus_pr_fd_sr_vd['p_positive'],bus_pr_fd_sr_vd['p_neutral'],bus_pr_fd_sr_vd['p_negative'],bus_pr_fd_sr_vd['s_positive'],bus_pr_fd_sr_vd['s_neutral'],bus_pr_fd_sr_vd['s_negative'],bus_pr_fd_sr_vd['v_composite'],bus_pr_fd_sr_vd['v_positive'],bus_pr_fd_sr_vd['v_neutral'],bus_pr_fd_sr_vd['v_negative'])
 vd_c3 = bus_fd_pr_sr_vd_final.select(functions.count(bus_fd_pr_sr_final['business_id'])).collect()

 bus_fd_pr_sr_vd_final.show(10)
df = spark.read.json("data/purchases.json")

# Basic Operations
df.printSchema()
df.describe().show()
df.show(2)
print "Num of records:", df.count()

print ""
print "Answers: Juan David Botero"
print ""

# 1. Top 10 most purchased products
print "1. Top 10 most purchased products:"
df.groupBy(df.product_id, df.item_type).agg(
    sf.count(df.product_id).alias("top_items")).orderBy(
        "top_items", ascending=False).show(10)

# Top 10 by product_id
print "Top 10 by product_id:"
df.groupBy(df.product_id).agg(sf.count(
    df.product_id).alias("top_items")).orderBy("top_items",
                                               ascending=False).show(10)

# Top by item_type
print "Top by item_type:"
df.groupBy(df.item_type).agg(sf.count(
    df.item_type).alias("top_item_type")).orderBy("top_item_type",
                                                  ascending=False).show()

# 2. Purchase percentage of each product type (item_type)
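# A minimal sketch of how this percentage could be computed, assuming the same
# `df` and `sf` alias used above; the result column names are illustrative.
total_purchases = df.count()
df.groupBy(df.item_type).agg(
    sf.count(df.item_type).alias("purchases")).withColumn(
        "purchase_pct",
        sf.round(sf.col("purchases") / total_purchases * 100, 2)).orderBy(
            "purchase_pct", ascending=False).show()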
def groupByUserId(df):
    return df.groupBy('user_id')\
     .agg(f.count('text_lower').alias('tweet_count'),f.avg('sentiment').alias('sentiment'), \
     f.first('user_name').alias('user_name'), f.avg('followers_count').cast(IntegerType()).alias('followers_count'))
Example #38
0
    def doRender(self,handlerId):
        g = self.entity
        width = int(self.getPreferredOutputWidth() - 10 )
        height = int(self.getPreferredOutputHeight() - 10  )

        if handlerId == "graphMap":
            graphNodesJson="{"

            for r in g.vertices.rdd.map(lambda row: """"{0}":{{"id":"{0}","name":"{1}","latitude":{2},"longitude":{3}}}"""
                .format(row.id, row.name.encode("ascii","ignore").decode("ascii"),0.0 if row.latitude is None else row.latitude,0.0 if row.longitude is None else row.longitude)).collect():
                graphNodesJson+=("," if len(graphNodesJson)>1 else "") + str(r)

            graphNodesJson+="}"

            graphLinksJson = str(g.edges.select("src","dst").groupBy("src","dst").agg(F.count("src").alias("count")).toJSON().map(lambda j: yaml.safe_load(j)).collect())
            
            myLogger.debug("graphMap - nodes: {0}".format(graphNodesJson))
            myLogger.debug("graphMap - links: {0}".format(graphLinksJson))

            self._addScriptElement("https://d3js.org/d3.v3.js", checkJSVar="d3",
                callback=self.renderTemplate("graphMap.js", graphNodesJson=graphNodesJson, graphLinksJson=graphLinksJson, preferredWidth=width, preferredHeight=height))

            self._addHTMLTemplate("graphMap.html")

        elif handlerId == "graphTree":
            def expand(values, visited, level):
                results=[]
                if values is not None and level < maxDepth:
                    for v in values:
                        if v not in visited and len(results)<maxChildren:
                            visited[v]=True
                            results.append({ "name": str(v), "children": {}})
                    for item in results:
                        nextVisited = {}
                        nextVisited.update(visited)
                        item["children"]=expand(dic.get(item["name"]), nextVisited, level+1)
                return results
            
            ar = g.edges.select("src","dst").rdd.map(lambda row: (row[0],[row[1]]))\
                .reduceByKey(lambda d1,d2: d1+d2).map(lambda row: (row[0], list(set(row[1]))))\
                .collect()

            dic = {item[0] : item[1] for item in ar}
            maxDepth = self.options.get("maxDepth", 5)
            maxChildren = self.options.get("maxChildren", 10)
            root = self.options.get("root")
            rootNode = ar[0]

            if root:
                def findRoot(ar):
                    for a in ar:
                        if a[0]==root:
                            return a
                rootNode = findRoot(ar)
            
            if not rootNode:
                self._addHTML("<p>Root node not found!</p>")
            else:
                res = { "name": str(rootNode[0]), "children":expand(dic[ar[0][0]], {rootNode[0]:True}, 1)}
                tree = json.dumps(res)

                myLogger.debug("graphTree - tree: {0}".format(res))

                #if user specified root, then only send back the json tree
                if root:
                    self.addProfilingTime = False
                    print(tree)
                else:
                    nodes = g.vertices.select('id').orderBy('id').rdd.map(lambda r: r[0]).collect()
                    self._addScriptElement("https://d3js.org/d3.v3.js", checkJSVar="d3", 
                        callback=self.renderTemplate("graphTree.js", root=str(rootNode[0]), tree=tree, preferredWidth=width, preferredHeight=height))
                    self._addHTMLTemplate("graph.html", root=str(rootNode[0]), nodes=nodes, maxDepth=maxDepth, maxChildren=maxChildren, handlerId=handlerId)
            
        else:
            # force-directed graph
            maxEdges = self.options.get("maxEdges", 100)
            cols = [g.edges.columns[i] for i in range(len(g.edges.columns)) if g.edges.columns[i] not in ['src', 'dst']]
            edges = g.edges.toPandas()[:maxEdges].to_json(orient='records')
            graph = json.dumps(edges)
            isupdate = self.options.get("isupdate")

            cols.sort()
            colorBy = self.options.get("colorBy", cols[0] if len(cols) > 0 else "")

            myLogger.debug("graphDirected - edges: {0}".format(edges))

            #if user specified update, then only send back the json graph
            if isupdate:
                self.addProfilingTime = False
                print(graph)
            else:
                self._addScriptElement("https://d3js.org/d3.v3.js", checkJSVar="d3",
                    callback=self.renderTemplate("graphDirected.js", graph=graph, preferredWidth=width, preferredHeight=height, colorBy=colorBy))
                self._addHTMLTemplate("graph.html", maxEdges=maxEdges, handlerId=handlerId, cols=cols, colorBy=colorBy)
ranks = tripGraph.pageRank(resetProbability=0.15, maxIter=5)
display(ranks.vertices.orderBy(ranks.vertices.pagerank.desc()).limit(20))

# COMMAND ----------

# MAGIC %md ## Most popular flights (single city hops)
# MAGIC Using the `tripGraph`, we can quickly determine what are the most popular single city hop flights

# COMMAND ----------

# Determine the most popular flights (single city hops)
import pyspark.sql.functions as func
topTrips = tripGraph \
  .edges \
  .groupBy("src", "dst") \
  .agg(func.count("delay").alias("trips")) 

# COMMAND ----------

# Show the top 20 most popular flights (single city hops)
display(topTrips.orderBy(topTrips.trips.desc()).limit(20))

# COMMAND ----------

# MAGIC %md ## Top Transfer Cities
# MAGIC Many airports are used as transfer points rather than as the final destination.  An easy way to estimate this is to compute the ratio of inDegree (the number of flights into the airport) to outDegree (the number of flights leaving the airport).  Values close to 1 may indicate many transfers, whereas values < 1 indicate many outgoing flights and values > 1 indicate many incoming flights.  Note that this is a simple calculation that does not take into account the timing or scheduling of flights, just the overall aggregate counts within the dataset.

# COMMAND ----------

# Calculate the inDeg (flights into the airport) and outDeg (flights leaving the airport)
inDeg = tripGraph.inDegrees
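
# A minimal sketch of the degree-ratio calculation described above, assuming the
# same GraphFrames `tripGraph`; `inDegrees`/`outDegrees` return DataFrames with
# (`id`, `inDegree`) and (`id`, `outDegree`) columns.
outDeg = tripGraph.outDegrees

degreeRatio = inDeg.join(outDeg, "id") \
  .selectExpr("id", "cast(inDegree as double) / cast(outDegree as double) as degreeRatio")

display(degreeRatio.orderBy(degreeRatio.degreeRatio.desc()).limit(10))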
Example #40
0
flights \
  .withColumn("flight_code", concat("carrier", "flight")) \
  .show()

# `agg()` runs aggregations using specific
# expressions

# the agg() statement lets you create an aggregated DataFrame

# import and use aggregation functions such as `count()`,
# `countDistinct()`, `sum()`, and `mean()`:

from pyspark.sql.functions import count, countDistinct

flights.agg(count("*")).show()

flights.agg(countDistinct("carrier")).show()

# use the Column method `alias()`
# to assign a name to the resulting column:

flights \
  .agg(countDistinct("carrier").alias("num_carriers")) \
  .show()

# `groupBy()` groups the data by specific columns;
# aggregations can then be computed per group:

from pyspark.sql.functions import mean
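
# A minimal sketch of a grouped aggregation, assuming the same `flights`
# DataFrame as above; the "origin" and "dep_delay" column names are assumptions
# about that dataset.
flights \
  .groupBy("origin") \
  .agg(count("*").alias("num_flights"), mean("dep_delay").alias("avg_dep_delay")) \
  .show()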
from pyspark.sql import Row, SQLContext
from pyspark.sql import functions

sqlCtx = SQLContext(sc)

lines = sc.parallelize(["m1,d1,1", "m1,d2,2", "m2,d1,1", "m2,d2,2"])

record = lines.map(lambda line: line.split(",")).map(
    lambda columns: Row(machine=columns[0], domain=columns[1], request=columns[2]))

recordSchema = sqlCtx.createDataFrame(record)

recordSchema.groupBy().agg({"*": "count"}).show()

recordSchema.groupBy("machine", recordSchema["domain"]).agg(
    {"domain": "max", "request": "min"}).show()

recordSchema.groupBy("machine", recordSchema.domain).agg(functions.count("*"), functions.max(
    recordSchema.request), functions.min(recordSchema["request"]), functions.sum(recordSchema["request"]), functions.avg(recordSchema["request"])).show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int")).groupBy("machine").count().show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").max("request").show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").min("request").show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").sum("request").show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").avg("request").show()
Example #42
0
    def doRender(self,handlerId):
        g=self.entity
        
        if ( handlerId == "nodeLinkGraph"):
            import json
            ar = g.edges.select("src","dst").rdd.map(lambda row: (row[0], [row[1]]))\
                .reduceByKey(lambda d1,d2: d1+d2).map(lambda row: (row[0], list(set(row[1]))))\
                .collect()

            dic = {item[0] : item[1] for item in ar}
            limitLevel = self.options.get("limitLevel", 5)
            limitChildren = self.options.get("limitChildren", 10)
            def expand(values, visited,level):
                results=[]
                if values is not None and level < limitLevel:
                    for v in values:
                        if v not in visited and len(results)<limitChildren:
                            visited[v]=True
                            results.append({ "name": str(v), "children": {}})
                    for item in results:
                        nextVisited = {}
                        nextVisited.update(visited)
                        item["children"]=expand(dic.get(item["name"]), nextVisited, level+1)
                return results
            
            root = self.options.get("root")
            rootNode = ar[0]
            if root:
                def findRoot(ar):
                    for a in ar:
                        if a[0]==root:
                            return a
                rootNode = findRoot(ar)
            
            if not rootNode:
                self._addHTML("<p>Can't find the airport</p>");
                return;

            res = { "name": str(rootNode[0]), "children":expand(dic[ar[0][0]], {rootNode[0]:True}, 1)}
            tree = json.dumps(res)

            #if user specified root, then only send back the json tree
            if root:
                print(tree)
                return

            self._addScriptElement("https://d3js.org/d3.v3.js", checkJSVar="d3", callback=self.renderTemplate("nodeLinkGraph.js", root=tree))
            self._addHTMLTemplate("nodeLinkGraph.html", root=tree, res=res)
        else:
            graphNodesJson="{"
            for r in g.vertices.rdd.map(lambda row: """"{0}":{{"id":"{0}","name":"{1}","latitude":{2},"longitude":{3}}}"""
                .format(row.id, row.name.encode("ascii","ignore"),0.0 if row.latitude is None else row.latitude,0.0 if row.longitude is None else row.longitude)).collect():
                graphNodesJson+=("," if len(graphNodesJson)>1 else "") + str(r)
            graphNodesJson+="}"        
            graphLinksJson=str(g.edges.select("src","dst").groupBy("src","dst").agg(F.count("src").alias("count")).toJSON().map(lambda j: yaml.safe_load(j)).collect())
    
            self._addScriptElement("https://mbostock.github.io/d3/talk/20111116/d3/d3.js", checkJSVar="d3")
            self._addScriptElement("https://mbostock.github.io/d3/talk/20111116/d3/d3.geo.js")
            self._addScriptElement(
                "https://mbostock.github.io/d3/talk/20111116/d3/d3.geom.js", 
                callback= self.renderTemplate("graphMap.js", graphNodesJson=graphNodesJson, graphLinksJson=graphLinksJson)
            )        
            self._addHTMLTemplate("graphMap.html")
df = spark.read.format("csv")\
  .option("header", "true")\
  .option("inferSchema", "true")\
  .load("/data/retail-data/all/*.csv")\
  .coalesce(5)
df.cache()
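# cache() is lazy; the data is only materialized the first time an action runs against df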
df.createOrReplaceTempView("dfTable")


# COMMAND ----------

from pyspark.sql.functions import count
df.select(count("StockCode")).show() # 541909


# COMMAND ----------

from pyspark.sql.functions import countDistinct
df.select(countDistinct("StockCode")).show() # 4070


# COMMAND ----------

from pyspark.sql.functions import approx_count_distinct
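# the second argument is the maximum estimation error (rsd) allowed; larger values are faster but less precise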
df.select(approx_count_distinct("StockCode", 0.1)).show() # 3364


# COMMAND ----------

from pyspark.sql.functions import first, last
df.select(first("StockCode"), last("StockCode")).show()
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.appName("SimpleApp").getOrCreate()

df = spark.createDataFrame(
    [(0, 0, 4.0), (0, 1, 2.0), (0, 3, 3.0), (1, 0, 4.0), (1, 1, 1.0), (1, 2, 5.0)],
    ["user", "item", "rating"]
)

df_pandas = df.groupBy("user").agg(F.count(F.col("item"))).toPandas()
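# the aggregated column comes back named "count(item)"; chain .alias(...) on the count to rename it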
print(df_pandas)

spark.stop()
Example #45
0
# COMMAND ----------

# MAGIC %md
# MAGIC For your final task, you'll group by word and count the number of times each word occurs.  Make sure to return the counts in descending order and to call them `counts`.
# MAGIC  
# MAGIC For this task, you can use:
# MAGIC  * `DataFrame` operations `groupBy`, `agg`, and `sort`
# MAGIC  * the `Column` operation `alias`
# MAGIC  * functions `func.count` and `func.desc`.

# COMMAND ----------

# ANSWER
wordGroupCount = (wordList
                  .groupBy('word')  # group
                  .agg(func.count('word').alias('counts'))  # aggregate
                  .sort(func.desc('counts')))  #sort

wordGroupCount.take(5)

# COMMAND ----------

# TEST
Test.assertEquals(tuple(wordGroupCount.first()), (u'ref', 29263), 'incorrect counts.')

# COMMAND ----------

# MAGIC %md
# MAGIC We could also use SQL to accomplish this counting.
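
# COMMAND ----------

# MAGIC %md
# MAGIC A minimal sketch of the SQL approach, assuming the `wordList` DataFrame from the cells above and the `spark` session; the temporary view name `word_list` is an illustration, not part of the original lab.

# COMMAND ----------

# SQL equivalent of the groupBy/agg/sort pipeline above (sketch)
wordList.createOrReplaceTempView('word_list')
spark.sql(
    "SELECT word, COUNT(*) AS counts "
    "FROM word_list "
    "GROUP BY word "
    "ORDER BY counts DESC"
).take(5)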

# COMMAND ----------
# DBTITLE 1,What are the variable types in the "Closed Deals" dataset?
closed_deals.dtypes

# COMMAND ----------

# DBTITLE 1,Statistical summary of the "Closed Deals" dataset before cleaning
display(closed_deals.describe())

# COMMAND ----------

# DBTITLE 1,Number of missing values in the "Closed Deals" dataset
from pyspark.sql.functions import isnull, when, count, col

aux = []
for c in closed_deals.columns:
  aux.append(count(when(isnull(c), c)).alias(c))

display(closed_deals.select(aux))

# COMMAND ----------

# DBTITLE 1,Handling missing values in the categorical variables
closed_deals = closed_deals.fillna('NA', subset=[
    'business_segment', 'lead_type', 'lead_behaviour_profile', 'has_company',
    'has_gtin', 'average_stock', 'business_type'])

# COMMAND ----------
# Reformat the date: split Tran_time into year, month, and day
tDf = (pDf.withColumn('dateYear', pDf['Tran_time'].substr(1, 4)).withColumn(
    'dateMonth',
    pDf['Tran_time'].substr(6, 2)).withColumn('dateDay',
                                              pDf['Tran_time'].substr(9, 2)))

# Drop the column that is no longer needed
tDf = tDf.drop(tDf.Tran_time)

# Payment method codes
paymentColumn = ['900', '901', '902', '903', '905', '906', '907', '931', '933']

# Count how many times each payment method is used, grouped by gas station (Deptno), year, and month
groupColumn = ['Deptno', 'dateYear', 'dateMonth']
deptnoYMaPayment = (tDf.groupBy(groupColumn).agg(
    count(when((col("Payment") == paymentColumn[0]), True)).alias('a900'),
    count(when((col("Payment") == paymentColumn[1]), True)).alias('a901'),
    count(when((col("Payment") == paymentColumn[2]), True)).alias('a902'),
    count(when((col("Payment") == paymentColumn[3]), True)).alias('a903'),
    count(when((col("Payment") == paymentColumn[4]), True)).alias('a905'),
    count(when((col("Payment") == paymentColumn[5]), True)).alias('a906'),
    count(when((col("Payment") == paymentColumn[6]), True)).alias('a907'),
    count(when((col("Payment") == paymentColumn[7]), True)).alias('a931'),
    count(when((col("Payment") == paymentColumn[8]),
               True)).alias('a933')).orderBy(groupColumn))
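
# An equivalent, more compact formulation (a sketch using the same paymentColumn
# and groupColumn lists; paymentCountCols is just an illustrative name): build the
# count(when(...)) columns with a list comprehension instead of spelling out each code.
paymentCountCols = [
    count(when(col("Payment") == p, True)).alias("a" + p) for p in paymentColumn
]
deptnoYMaPayment = tDf.groupBy(groupColumn).agg(*paymentCountCols).orderBy(groupColumn)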

# Output path
outputPath = "/home/cpc/data/resultData"
# Output file
outputFile = "stdnoPaymentYearMonthDayCount.json"
# Full path and file name
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import count
from pyspark.sql.types import StructType, StructField, StringType

master_url = open("/root/spark-ec2/cluster-url").read().strip()
context = SparkContext(master_url)
context.setLogLevel("WARN")
sqlcontext = SQLContext(context)

def extract_kmers(r):
    for i in range(0,len(r.seq)-k+1):
        yield r.seq[i:i+k]
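
# Note: k (the k-mer length) and samples (the list of sample names) are assumed
# to be defined earlier in the full script; this excerpt uses them directly.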

for sample_name in samples:
    sample_filename = "s3n://helgag/ocean_metagenome/overlapped/{sample_name}.csv".format(sample_name=sample_name)
    customSchema = StructType([ \
                StructField("id", StringType(), True), \
                StructField("seq", StringType(), True)])
    sample = sqlcontext.read.format('com.databricks.spark.csv').options(header='true').load(sample_filename, schema=customSchema).repartition(80)
    sample = sample.flatMap(extract_kmers).map(Row("kmer")).toDF().groupBy("kmer").agg(count("*"))
    #Toggle comment the following to export the data
    sample.registerTempTable(sample_name + "_count")
    #sample.repartition(1).write.format('com.databricks.spark.csv').options(header='true').save(sample_name+'.csv')
    #Or this for pushing to s3
    #sample.repartition(1).write.format('com.databricks.spark.csv').options(header='true').save('s3n://oceankmers/overlapped/'+sample_name+'.csv')

for i, sample_a in enumerate(samples):
    for j in range(i+1):
        if i == j:
            print(0)
            continue

        X_sql = """
        select '{sample1}' as asample, '{sample2}' as bsample,
               case when a.count < b.count then a.count else b.count end as minv,
Example #49
0
        sys.exit(-1)

    spark = SparkSession\
        .builder\
        .appName("PythonMnMCount")\
        .getOrCreate()
    mnm_file = sys.argv[1]
    mnm_df = spark.read.format("csv") \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .load(mnm_file)
    # aggregate count of all colors and groupBy state and color
    # orderBy descending order
    count_mnm_df = mnm_df.select("State", "Color", "Count") \
                    .groupBy("State", "Color") \
                    .agg(count("Count") \
                    .alias("Total")) \
                    .orderBy("Total", ascending=False)

    count_mnm_df.show(n=60, truncate=False)
    print("Total Rows = %d" % (count_mnm_df.count()))
    #
    # find the aggregate count for California
    ca_count_mnm_df = mnm_df.select("*") \
                       .where(mnm_df.State == 'CA') \
                       .groupBy("State", "Color") \
                       .agg(count("Count") \
                            .alias("Total")) \
                       .orderBy("Total", ascending=False)
    ca_count_mnm_df.show(n=10, truncate=False)
    # stop the SparkSession
    spark.stop()
Example #50
0
    def compute_hist(psdf, bins):
        # psdf is a pandas-on-Spark DataFrame; compute per-column histogram counts for the given bins.
        assert isinstance(bins, (np.ndarray, np.generic))

        sdf = psdf._internal.spark_frame
        scols = []
        input_column_names = []
        for label in psdf._internal.column_labels:
            input_column_name = name_like_string(label)
            input_column_names.append(input_column_name)
            scols.append(psdf._internal.spark_column_for(label).alias(input_column_name))
        sdf = sdf.select(*scols)

        # 1. Make the bucket output flat to:
        #     +----------+-------+
        #     |__group_id|buckets|
        #     +----------+-------+
        #     |0         |0.0    |
        #     |0         |0.0    |
        #     |0         |1.0    |
        #     |0         |2.0    |
        #     |0         |3.0    |
        #     |0         |3.0    |
        #     |1         |0.0    |
        #     |1         |1.0    |
        #     |1         |1.0    |
        #     |1         |2.0    |
        #     |1         |1.0    |
        #     |1         |0.0    |
        #     +----------+-------+
        colnames = sdf.columns
        bucket_names = ["__{}_bucket".format(colname) for colname in colnames]

        output_df = None
        for group_id, (colname, bucket_name) in enumerate(zip(colnames, bucket_names)):
            # creates a Bucketizer to get corresponding bin of each value
            bucketizer = Bucketizer(
                splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip"
            )
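            # handleInvalid="skip" filters out rows with invalid (NaN) values instead of raising an error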

            bucket_df = bucketizer.transform(sdf)

            if output_df is None:
                output_df = bucket_df.select(
                    SF.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket")
                )
            else:
                output_df = output_df.union(
                    bucket_df.select(
                        SF.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket")
                    )
                )

        # 2. Calculate the count based on each group and bucket.
        #     +----------+-------+------+
        #     |__group_id|buckets| count|
        #     +----------+-------+------+
        #     |0         |0.0    |2     |
        #     |0         |1.0    |1     |
        #     |0         |2.0    |1     |
        #     |0         |3.0    |2     |
        #     |1         |0.0    |2     |
        #     |1         |1.0    |3     |
        #     |1         |2.0    |1     |
        #     +----------+-------+------+
        result = (
            output_df.groupby("__group_id", "__bucket")
            .agg(F.count("*").alias("count"))
            .toPandas()
            .sort_values(by=["__group_id", "__bucket"])
        )

        # 3. Fill empty bins and calculate based on each group id. From:
        #     +----------+--------+------+
        #     |__group_id|__bucket| count|
        #     +----------+--------+------+
        #     |0         |0.0     |2     |
        #     |0         |1.0     |1     |
        #     |0         |2.0     |1     |
        #     |0         |3.0     |2     |
        #     +----------+--------+------+
        #     +----------+--------+------+
        #     |__group_id|__bucket| count|
        #     +----------+--------+------+
        #     |1         |0.0     |2     |
        #     |1         |1.0     |3     |
        #     |1         |2.0     |1     |
        #     +----------+--------+------+
        #
        # to:
        #     +-----------------+
        #     |__values1__bucket|
        #     +-----------------+
        #     |2                |
        #     |1                |
        #     |1                |
        #     |2                |
        #     |0                |
        #     +-----------------+
        #     +-----------------+
        #     |__values2__bucket|
        #     +-----------------+
        #     |2                |
        #     |3                |
        #     |1                |
        #     |0                |
        #     |0                |
        #     +-----------------+
        output_series = []
        for i, (input_column_name, bucket_name) in enumerate(zip(input_column_names, bucket_names)):
            current_bucket_result = result[result["__group_id"] == i]
            # generates a pandas DF with one row for each bin
            # we need this as some of the bins may be empty
            indexes = pd.DataFrame({"__bucket": np.arange(0, len(bins) - 1)})
            # merges the bins with counts on it and fills remaining ones with zeros
            pdf = indexes.merge(current_bucket_result, how="left", on=["__bucket"]).fillna(0)[
                ["count"]
            ]
            pdf.columns = [input_column_name]
            output_series.append(pdf[input_column_name])

        return output_series
Example #51
0
    def describe_1d(df, column, nrows, lookup_config=None):
        column_type = df.select(column).dtypes[0][1]
        # TODO: think about implementing analysis for complex
        # data types:
        if ("array" in column_type) or ("stuct" in column_type) or ("map" in column_type):
            raise NotImplementedError("Column {c} is of type {t} and cannot be analyzed".format(c=column, t=column_type))

        distinct_count = df.select(column).agg(countDistinct(col(column)).alias("distinct_count")).toPandas()
        non_nan_count = df.select(column).na.drop().select(count(col(column)).alias("count")).toPandas()
        results_data = pd.concat([distinct_count, non_nan_count],axis=1)
        results_data["p_unique"] = results_data["distinct_count"] / float(results_data["count"])
        results_data["is_unique"] = results_data["distinct_count"] == nrows
        results_data["n_missing"] = nrows - results_data["count"]
        results_data["p_missing"] = results_data["n_missing"] / float(nrows)
        results_data["p_infinite"] = 0
        results_data["n_infinite"] = 0
        result = results_data.ix[0].copy()
        result["memorysize"] = 0
        result.name = column

        if result["distinct_count"] <= 1:
            result = result.append(describe_constant_1d(df, column))
        elif column_type in {"tinyint", "smallint", "int", "bigint"}:
            result = result.append(describe_integer_1d(df, column, result, nrows))
        elif column_type in {"float", "double", "decimal"}:
            result = result.append(describe_float_1d(df, column, result, nrows))
        elif column_type in {"date", "timestamp"}:
            result = result.append(describe_date_1d(df, column))
        elif result["is_unique"] == True:
            result = result.append(describe_unique_1d(df, column))
        else:
            result = result.append(describe_categorical_1d(df, column))
            # Fix to also count the MISSING value in the distinct_count field:
            if result["n_missing"] > 0:
                result["distinct_count"] = result["distinct_count"] + 1

        # TODO: check whether it is worth it to
        # implement the "real" mode:
        if (result["count"] > result["distinct_count"] > 1):
            try:
                result["mode"] = result["top"]
            except KeyError:
                result["mode"] = 0
        else:
            try:
                result["mode"] = result["value_counts"].index[0]
            except KeyError:
                result["mode"] = 0
            # If an IndexError happens,
            # it is because all of the column's values are NULL:
            except IndexError:
                result["mode"] = "MISSING"

        if lookup_config:
            lookup_object = lookup_config['object']
            col_name_in_db = lookup_config['col_name_in_db'] if 'col_name_in_db' in lookup_config else None
            try:
                matched, unmatched = lookup_object.lookup(df.select(column), col_name_in_db)
                result['lookedup_values'] = str(matched.count()) + "/" + str(df.select(column).count())
            except:
                result['lookedup_values'] = 'FAILED'
        else:
            result['lookedup_values'] = ''

        return result
Example #52
0
def countByWord(spark, df):
    return df.groupBy(WORD_COL).agg(
        F.count(WORD_COL).alias('count')).orderBy(WORD_COL)
Example #53
0
    def process_data(self):
        ##############################################################################
        # DECLARE VARIABLES
        ##############################################################################
        dt_range = self.study_dates("2020-07-30")
        dt = dt_range
        s1_bucket_name = 'b6-8f-fc-09-0f-db-50-3f-gpsdata'
        s1_initial_bucket_depth = 'cuebiq/daily-feed/US/'
        s1_bucket_output = 'cuebiq/daily-feed-reduced/US/'
        s2_bucket_name = 'b6-8f-fc-09-0f-db-50-3f-gpsdata'
        s2_initial_bucket_depth = 'cuebiq/daily-feed-reduced/US/'
        s2_bucket_output = 'cuebiq/processed-data/US/micro-clusters/'
        anchor_dist = 430
        time_thresh = 28800
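        # anchor_dist is compared against distances in feet; time_thresh is in seconds (28800 s = 8 hours)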
        part_num = 9

        gps_schema = StructType([
            StructField("utc_timestamp", IntegerType(), True),
            StructField("device_id", StringType(), True),
            StructField("os", IntegerType(), True),
            StructField("latitude", FloatType(), True),
            StructField("longitude", FloatType(), True),
            StructField("accuracy", IntegerType(), True),
            StructField("tz_offset", IntegerType(), True)
        ])

        s2_gps_schema = StructType([
            StructField("utc_timestamp", IntegerType(), True),
            StructField("device_id", StringType(), True),
            StructField("os", IntegerType(), True),
            StructField("latitude", FloatType(), True),
            StructField("longitude", FloatType(), True),
            StructField("accuracy", IntegerType(), True),
            StructField("tz_offset", IntegerType(), True),
            StructField("row_number", IntegerType(), True)
        ])

        ##############################################################################
        # WINDOWS
        ##############################################################################
        w = Window().partitionBy('device_id').orderBy('utc_timestamp')
        l = Window().partitionBy('device_id',
                                 'lin_grp').orderBy('utc_timestamp')
        w2 = Window().partitionBy('device_id').orderBy('row_number')

        ##############################################################################
        # BEGIN DAILY ITERATION
        ##############################################################################

        print("Reading in files for {}".format(str(dt['study_dt'])[:10]))
        print("s3://{}/{}[{}|{}|{}]/*.gz".format(s1_bucket_name,
                                                 s1_initial_bucket_depth,
                                                 dt['s3_before'],
                                                 dt['s3_study_dt'],
                                                 dt['s3_after']))
        print("")

        #################################################################################################
        # START STEP 1
        #################################################################################################
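        # dataFrameReader is assumed to be a DataFrameReader (e.g. spark.read) defined outside this excerpt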
        df1 = dataFrameReader \
            .options(header = 'false', delimiter = '\t', codec = 'gzip') \
            .schema(gps_schema) \
            .format("csv") \
            .load("/opt/spark/sample_data/daily-feed/US/2020729*/*.csv.gz")
        #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth +  dt['s3_before'] +"/*.gz") # the day before

        df2 = dataFrameReader \
            .options(header = 'false', delimiter = '\t', codec = 'gzip') \
            .schema(gps_schema) \
            .format("csv") \
            .load("/opt/spark/sample_data/daily-feed/US/2020730*/*.csv.gz")
        #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth +  dt['s3_study_dt'] +"/*.gz") # actual study date

        df3 = dataFrameReader \
            .options(header = 'false', delimiter = '\t', codec = 'gzip') \
            .schema(gps_schema) \
            .format("csv") \
            .load("/opt/spark/sample_data/daily-feed/US/2020731*/*.csv.gz")
        #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth +  dt['s3_after'] +"/*.gz") # the day after

        # Union data from three inputs into 1 dataframe
        df = df1.union(df2).union(df3) \
            .repartition(part_num, 'device_id')

        del df1
        del df2
        del df3

        ##############################################################################
        # FILTER INITIAL JUNK RECORDS
        # Removes duplicated records (based on time and id), poor accuracy, bad coordinates, and timestamps outside of study range
        ##############################################################################
        df = df.na.drop(subset=['latitude','longitude','tz_offset','accuracy']) \
                    .filter(((df['accuracy'] >= 5) & (df['accuracy'] <= 65)) \
                            & ((~(df['latitude'] == 0)) | ~(df['longitude'] == 0)) \
                            & (df['utc_timestamp'] + df['tz_offset']) \
                                    .between(dt['utc_study_dt'], dt['utc_after'])) \
                    .dropDuplicates(['utc_timestamp','device_id'])

        ##############################################################################
        # EXCESSIVE SPEED REMOVAL
        ##############################################################################
        df = df.withColumn('dist_to',distance(df['latitude'], df['longitude'], lead(df['latitude'],1).over(w), \
                            lead(df['longitude'],1).over(w))) \
            .withColumn('sec_to', (lead(df['utc_timestamp'], 1).over(w) - df['utc_timestamp'])) \
            .withColumn('speed_to', rate_of_speed(col('dist_to'), col('sec_to'),'hour')) \
            .withColumn('dist_from', lag(col('dist_to'), 1).over(w)) \
            .withColumn('sec_from', lag(col('sec_to'), 1).over(w)) \
            .withColumn('speed_from', lag(col('speed_to'), 1).over(w)) \
            .filter(((col('dist_to').isNull()) | (col('dist_from').isNull())) \
                        | ((((col('speed_from') + col('speed_to')) / 2) <= 90) | ((col('dist_to') >= 150) | (col('dist_from') >= 150))) \
                        & ((col('speed_from') < 600) & (col('speed_to') < 600)) \
                        & ((col('speed_from') < 20) | (col('speed_to') < 20))) \
            .select('utc_timestamp', 'device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset')

        ##############################################################################
        # LINEAR TRAVEL PING REMOVAL
        # Break pings out into groups of 4 to measure the linear distance
        ##############################################################################
        #Assign a record number and linear grouping and lead distance
        df = df.withColumn('RecordNum',row_number().over(w)) \
            .withColumn('lin_grp', py.ceil(row_number().over(w) / 4)) \
            .withColumn('dist_to', distance(df['latitude'], df['longitude'], \
                lead(df['latitude'],1).over(l), lead(df['longitude'],1).over(l),'meters'))

        # Create aggregated table for linear groupings
        expr = [py.min(col('utc_timestamp')).alias('min_utc_timestamp'),py.max(col('utc_timestamp')).alias('max_utc_timestamp'), \
            py.count(col('utc_timestamp')).alias('cnt'),py.sum(col('dist_to')).alias('sum_dist'),py.min(col('dist_to')).alias('min_dist')]

        dfl_grp = df.groupBy('device_id', 'lin_grp').agg(*expr)

        dfl_grp.createOrReplaceTempView('dfl_grp')
        df.createOrReplaceTempView('dfl')

        # Grab just the first and last records in each linear grouping and append aggregated info
        dfls = spark.sql(
            "SELECT a.utc_timestamp, a.device_id, a.os, a.latitude, a.longitude, a.accuracy, a.tz_offset, \
                    a.lin_grp, b.sum_dist, b.min_dist, b.cnt \
                    FROM dfl as a INNER JOIN dfl_grp as b \
                    ON a.device_id = b.device_id \
                    AND a.lin_grp = b.lin_grp \
                    AND a.utc_timestamp = b.min_utc_timestamp \
                    UNION ALL \
                    SELECT a.utc_timestamp, a.device_id, a.os, a.latitude, a.longitude, a.accuracy, a.tz_offset, \
                    a.lin_grp, b.sum_dist, b.min_dist, b.cnt \
                    FROM dfl as a INNER JOIN dfl_grp as b \
                    ON a.device_id = b.device_id \
                    AND a.lin_grp = b.lin_grp \
                    AND a.utc_timestamp = b.max_utc_timestamp")

        # Measure the distance between first and last in each linear grouping and compare to sum distance of all points
        # Only keep groups that meet criteria for being straight-line
        df_j = dfls.withColumn('strt_dist', distance(dfls['latitude'],dfls['longitude'], \
                    lead(dfls['latitude'],1).over(l), \
                    lead(dfls['longitude'],1).over(l), 'meters')) \
                .withColumn('lin',col('strt_dist') / dfls['sum_dist']) \
                .na.drop(subset=['strt_dist']) \
                .filter((dfls['min_dist'] > 0)  \
                    & (col('strt_dist').between(150, 2000)) \
                    & (dfls['cnt'] == 4) \
                    & (col('lin') >= .99825)) \
                .select('device_id','lin_grp', 'lin')

        # Outer join main dataframe to linears groups to filter non-linear pings
        df = df.join(df_j, ['device_id','lin_grp'], how='left_outer') \
            .filter(col('lin').isNull()) \
            .select('utc_timestamp','device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset')

        del dfl_grp
        del dfls
        del df_j

        #######################################
        # CHAIN
        # Calculating the dynamic chain threshold to find proximate ping relationships
        #######################################
        df = df.withColumn('chain_dist', ((((df['accuracy'] + lead(df['accuracy'],1).over(w)) - 10) * (230 / 120) + 200))) \
            .withColumn('chain', when((distance(df['latitude'], df['longitude'], \
                            lead(df['latitude'],1).over(w), lead(df['longitude'], 1).over(w),'feet')) <= col('chain_dist'), 1)
                            .when((distance(df['latitude'], df['longitude'], \
                            lag(df['latitude'],1).over(w), lag(df['longitude'], 1).over(w),'feet')) <= lag(col('chain_dist'), 1).over(w), 1)) \
            .filter(col('chain') == 1) \
            .withColumn('row_number', row_number().over(w)) \
            .select('utc_timestamp','device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset','row_number') \
            .persist()

        df \
            .repartition(100,'device_id').sortWithinPartitions('device_id','row_number') \
            .write \
            .csv(path="/opt/spark/sample_data/daily-feed-reduced/"+dt['s3_study_dt'], mode="append", compression="gzip", sep=",")
        #.csv(path="s3://" + s1_bucket_name + '/' + s1_bucket_output + dt['s3_study_dt'], mode="append", compression="gzip", sep=",")

        ##############################################################################################
        # START STEP 2
        ##############################################################################################

        print('Begin micro-clustering')

        # INITIALIZE ANCHOR TABLE - Create initial anchor start points based on row number = 1 and distance threshold
        self.df_dist = df.withColumn('tz_timestamp', df['utc_timestamp'] + df['tz_offset']) \
                        .withColumn('anchor', when(df['row_number'] == 1, col('tz_timestamp')) \
                                .when(distance(df['latitude'], df['longitude'], \
                                                lag(df['latitude'],1).over(w2),lag(df['longitude'],1).over(w2),'feet') \
                                            >= anchor_dist, col('tz_timestamp')) \
                                .when(col('tz_timestamp') - lag(col('tz_timestamp'),1).over(w2) >= time_thresh, col('tz_timestamp'))) \
                        .select('tz_timestamp','device_id','os','latitude','longitude','accuracy','row_number','anchor') \
                        .repartition(part_num, 'device_id') \
                        .persist()

        print('df_dist starting count = {}'.format(
            self.df_dist.count()))  # Materialize table for caching

        df.unpersist()
        del df

        #####################################################################################################
        # ITERATE THROUGH DATAFRAME ANCHOR PROCESS - iterations are broken out to speed up checkpointing
        # Checkpointing is used to chop off the physical plans of the dataframes that grow with each iteration
        ######################################################################################################
        df_anchor1 = self.anchor_func(3, 3)
        df_anchor2 = self.anchor_func(5, 5)
        df_anchor3 = self.anchor_func(12, 6)
        df_anchor4 = self.anchor_func(20, 5)
        df_anchor5 = self.anchor_func(30, 5)
        df_anchor6 = self.anchor_func(50, 5)
        df_anchor7 = self.anchor_func(80, 5, 1000000)
        df_anchor8 = self.anchor_func(1000, 5, 1000000)

        ##################################################################################################
        # Collect remaining pings to driver for Python analysis
        print('collect remaining pings')
        anchor_list = self.df_dist.rdd.map(lambda row: {'timestamp':row[0], 'device_id':row[1], 'latitude':row[3], \
                                                'longitude':row[4], 'anchor':row[7]}).collect()

        # Sort elements in list by device_id and timestamp
        anchor_list.sort(key=operator.itemgetter('device_id', 'timestamp'))

        # Python analysis on driver of final remaining pings
        print('iterate through remaining pings on driver')
        anchor_dr = []

        for r in anchor_list:
            if r['anchor'] is not None:
                anchor_dr.append(r)

            else:
                if anchor_dr[-1]['device_id'] == r['device_id']:
                    if distance_dr(r['latitude'], r['longitude'], \
                                anchor_dr[-1]['latitude'], \
                                anchor_dr[-1]['longitude'], 'feet') <= anchor_dist \
                                and r['timestamp'] - anchor_dr[-1]['timestamp'] < time_thresh:
                        anchor_dr.append({'timestamp':r['timestamp'], 'device_id':r['device_id'], \
                                        'latitude':anchor_dr[-1]['latitude'], 'longitude':anchor_dr[-1]['longitude'], \
                                        'anchor':anchor_dr[-1]['anchor']})

                    else:
                        r['anchor'] = r['timestamp']
                        anchor_dr.append(r)

        # Condense result table for dataframe distribution
        print('generate driver anchor table')
        new_anchor = []
        for r in anchor_dr:
            new_anchor.append([r['timestamp'], r['device_id'], r['anchor']])

        # Bring driver results back into a distributed dataframe and join results
        print('disperse driver anchor table back to cluster')
        new_anchor_schema = StructType([
            StructField('tz_timestamp', IntegerType(), True),
            StructField('device_id', StringType(), True),
            StructField('anchor', IntegerType(), True)
        ])

        df_anchor_dr = spark.createDataFrame(new_anchor,new_anchor_schema) \
                        .repartition(part_num, 'device_id')

        # Join remaining anchors to main analysis table
        self.df_dist = self.df_dist.select('tz_timestamp','device_id','os','latitude','longitude', \
                                'accuracy','row_number') \
                            .join(df_anchor_dr,['tz_timestamp','device_id'])

        # Union all anchor tables together and sort
        print('finalizing anchor results into central table')
        df_anchors_fnl = df_anchor1.union(df_anchor2).union(df_anchor3).union(df_anchor4).union(df_anchor5) \
                            .union(df_anchor6).union(df_anchor7).union(df_anchor8).union(self.df_dist) \
                            .repartition(part_num,'device_id') \
                            .persist()

        self.df_dist.unpersist()

        #######################################################################################
        # Calculate centroids
        #######################################################################################
        print('start calculating centroids')
        # Get max accuracy value for each micro-cluster and filter clusters with fewer than 2 pings
        df_anchor_grp = df_anchors_fnl.groupBy('device_id','anchor').agg(*[py.max(col('accuracy')).alias('max_accuracy'), \
                                                                        py.count(col('tz_timestamp')).alias('cnt')]) \
                                    .withColumn('max_acc_1', col('max_accuracy') + 1) \
                                    .filter(col('cnt') > 1) \
                                    .select('device_id','anchor','max_acc_1','cnt')

        # Calculate the nominator for each micro-cluster
        df_anchors_fnl = df_anchors_fnl.join(df_anchor_grp, ['device_id','anchor']) \
                                        .withColumn('nom',col('max_acc_1') - col('accuracy'))
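        # lower (better) accuracy pings get a larger 'nom', and therefore more weight in the centroid below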

        df_denom = df_anchors_fnl.groupBy(
            'device_id', 'anchor').agg(*[py.sum(col('nom')).alias('denom')])


        df_anchors_fnl = df_anchors_fnl.join(df_denom, ['device_id','anchor']) \
                            .withColumn('weight', df_anchors_fnl['nom'] / df_denom['denom']) \
                            .withColumn('lat', df_anchors_fnl['latitude'] * col('weight')) \
                            .withColumn('lon', df_anchors_fnl['longitude'] * col('weight'))


        expr = [py.sum(col('lat')).alias('new_latitude'), py.sum(col('lon')).alias('new_longitude'), \
                    py.avg(col('latitude')).alias('avg_latitude'), py.avg(col('longitude')).alias('avg_longitude'), \
                    py.count(col('tz_timestamp')).alias('cluster_png_cnt'), py.first(col('os')).alias('os'), \
                    py.min(col('tz_timestamp')).alias('start_timestamp'), py.max(col('tz_timestamp')).alias('end_timestamp'), \
                    py.avg(col('accuracy')).alias('avg_accuracy')]

        df_micro = df_anchors_fnl.groupBy('device_id','anchor').agg(*expr) \
                                .withColumn('fnl_lat', (col('new_latitude') * (3/4)) + (col('avg_latitude') * (1/4))) \
                                .withColumn('fnl_lon', (col('new_longitude') * (3/4)) + (col('avg_longitude') * (1/4))) \
                                .withColumn('geohash9', geohash_udf_9(col('fnl_lat'), col('fnl_lon'))) \
                                .withColumn('dwell_seconds', col('end_timestamp') - col('start_timestamp')) \
                                .withColumn('start_tm', py.from_unixtime(col('start_timestamp'))) \
                                .withColumn('end_tm', py.from_unixtime(col('end_timestamp'))) \
                                .filter(col('dwell_seconds') > 1) \
                                .select('device_id','os','start_tm','end_tm', \
                                        'dwell_seconds','cluster_png_cnt', col('fnl_lat').alias('latitude'), \
                                        col('fnl_lon').alias('longitude'), 'geohash9', 'avg_accuracy')


        df_micro \
                .repartition(100,'device_id').sortWithinPartitions('device_id','start_tm') \
                .write \
                .csv(path="/opt/spark/sample_data/processed-data/" + dt['s3_study_dt'], mode="append", compression="gzip", sep=",")
        #.csv(path="s3://" + s2_bucket_name + '/' + s2_bucket_output + dt['s3_study_dt'], mode="append", compression="gzip", sep=",")

        df_anchors_fnl.unpersist()

        return
Example #54
0
    def doRenderMpld3(self, handlerId, figure, axes, keyFields, keyFieldValues, keyFieldLabels, valueFields, valueFieldValues):
        allNumericCols = self.getNumericalFieldNames()
        if len(allNumericCols) == 0:
            self._addHTML("Unable to find a numerical column in the dataframe")
            return
        
                 
        keyFields = self.options.get("keyFields")
        valueField = self.options.get("valueFields")

        if(keyFields==None and valueField==None):
            keyFields=self.getFirstStringColInfo()
            valueField=self.getFirstNumericalColInfo() 
        else:
            keyFields = keyFields.split(',') 
            valueField = valueField.split(',') 
            if(len(valueField) > 1):
                self._addHTML("You can enter only have one value field for Bar Charts (2-D)"+str(len(valueField)))
                return
            keyFields = keyFields[0]
            valueField=valueField[0]
        
                
        #if(len(valueFields>)):


    
        #init
        fig=figure
        ax=axes
        
        #fig, ax = plt.subplots()   
        #fig = plt.figure()
        

        params = plt.gcf()
        plSize = params.get_size_inches()
        params.set_size_inches( (plSize[0]*2, plSize[1]*2) )


        agg=self.options.get("aggregation")
        groupByCol=self.options.get("groupByCol")
        
        if (agg=="None" or agg==None):
            colLabel = keyFields
            y = self.entity.select(valueField).toPandas()[valueField].dropna().tolist()
            x_intv = np.arange(len(y))
            labels =  self.entity.select(keyFields).toPandas()[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel(valueField, fontsize=18)
        elif(agg=='AVG'):
            y1=self.entity.groupBy(keyFields).agg(F.avg(valueField).alias("avg")).toPandas().sort_values(by=keyFields)
            y=y1["avg"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("Average "+valueField, fontsize=18)
        elif(agg=='SUM'):
            y1=self.entity.groupBy(keyFields).agg(F.sum(valueField).alias("sum")).toPandas().sort_values(by=keyFields)
            y=y1["sum"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("sum "+valueField, fontsize=18)
        elif(agg=='MAX'):
            y1=self.entity.groupBy(keyFields).agg(F.max(valueField).alias("max")).toPandas().sort_values(by=keyFields)
            y=y1["max"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("max "+valueField, fontsize=18)
        elif(agg=='MIN'):
            y1=self.entity.groupBy(keyFields).agg(F.min(valueField).alias("min")).toPandas().sort_values(by=keyFields)
            y=y1["min"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("min "+valueField, fontsize=18)
        elif(agg=='COUNT'):
            y1=self.entity.groupBy(keyFields).agg(F.count(valueField).alias("count")).toPandas().sort_values(by=keyFields)
            y=y1["count"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("count "+valueField, fontsize=18)

        mpld3.enable_notebook()      
        plt.bar(x_intv,y,color="blue",alpha=0.5)
        ax_fmt = BarChart(labels)
        mpld3.plugins.connect(fig, ax_fmt)