def test_window_functions(self):
    df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
    w = Window.partitionBy("value").orderBy("key")
    from pyspark.sql import functions as F
    sel = df.select(
        df.value, df.key,
        F.max("key").over(w.rowsBetween(0, 1)),
        F.min("key").over(w.rowsBetween(0, 1)),
        F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
        F.rowNumber().over(w),
        F.rank().over(w),
        F.denseRank().over(w),
        F.ntile(2).over(w),
    )
    rs = sorted(sel.collect())
    expected = [
        ("1", 1, 1, 1, 1, 1, 1, 1, 1),
        ("2", 1, 1, 1, 3, 1, 1, 1, 1),
        ("2", 1, 2, 1, 3, 2, 1, 1, 1),
        ("2", 2, 2, 2, 3, 3, 3, 2, 2),
    ]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[: len(r)])
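# For reference: F.rowNumber() and F.denseRank() are the camel-case names from early
# Spark releases; current PySpark exposes them as F.row_number() and F.dense_rank().
# A minimal sketch of the same window select with the modern names (illustrative only,
# assumes a local SparkSession is acceptable):
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
w = Window.partitionBy("value").orderBy("key")

df.select(
    "value", "key",
    F.row_number().over(w).alias("row_number"),   # replaces F.rowNumber()
    F.dense_rank().over(w).alias("dense_rank"),   # replaces F.denseRank()
    F.count("key").over(w.rowsBetween(Window.unboundedPreceding,
                                      Window.unboundedFollowing)).alias("cnt"),
).show()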
def doRender(self, handlerId):
    self.addProfilingTime = False
    self._addScriptElement("https://d3js.org/d3.v3.js", checkJSVar="d3")
    #self._addScriptElement("https://mbostock.github.io/d3/talk/20111116/d3/d3.geo.js")
    #self._addScriptElement("https://cdnjs.cloudflare.com/ajax/libs/d3-geo-projection/0.2.16/d3.geo.projection.js")
    #self._addScriptElement("https://mbostock.github.io/d3/talk/20111116/d3/d3.geom.js")

    # Load the data from the flight history db
    df = loadFlightHistory()
    ShellAccess.flightHistoryDF = df
    res = df.flatMap(lambda row: [
            (row.depAirportFSCode, row.depAirportName.encode("ascii", "ignore"),
             0.0 if row.depAirportLat is None else row.depAirportLat,
             0.0 if row.depAirportLong is None else row.depAirportLong),
            (row.arrAirportFSCode, row.arrAirportName.encode("ascii", "ignore"),
             0.0 if row.arrAirportLat is None else row.arrAirportLat,
             0.0 if row.arrAirportLong is None else row.arrAirportLong)
        ])\
        .distinct()\
        .map(lambda t: """ "{0}":{{"id":"{0}","name":"{1}","latitude":{2},"longitude":{3}}}""".format(t[0], t[1], t[2], t[3]))

    graphNodesJson = "{"
    for r in res.collect():
        graphNodesJson += ("," if len(graphNodesJson) > 1 else "") + str(r)
    graphNodesJson += "}"
    myLogger.debug("graphNodesJson: {0}".format(graphNodesJson))

    graphLinksJson = df.select("arrAirportFSCode", "depAirportFSCode")\
        .withColumnRenamed("depAirportFSCode", "src")\
        .withColumnRenamed("arrAirportFSCode", "dst")\
        .groupBy("src", "dst").agg(F.count("src").alias("count"))\
        .toJSON().map(lambda j: yaml.safe_load(j)).collect()
    myLogger.debug("graphLinksJson: {0}".format(graphLinksJson))

    self._addHTMLTemplate("mapResults.html", graphNodesJson=graphNodesJson, graphLinksJson=graphLinksJson)
def getValueFieldValueLists(self, handlerId, keyFields, valueFields):
    df = self.entity.groupBy(keyFields)
    agg = self.options.get("aggregation", self.getDefaultAggregation(handlerId))
    maxRows = int(self.options.get("rowCount", "100"))
    numRows = min(maxRows, df.count())
    valueLists = []
    for valueField in valueFields:
        valueDf = None
        if agg == "SUM":
            valueDf = df.agg(F.sum(valueField).alias("agg"))
        elif agg == "AVG":
            valueDf = df.agg(F.avg(valueField).alias("agg"))
        elif agg == "MIN":
            valueDf = df.agg(F.min(valueField).alias("agg"))
        elif agg == "MAX":
            valueDf = df.agg(F.max(valueField).alias("agg"))
        else:
            valueDf = df.agg(F.count(valueField).alias("agg"))
        for keyField in keyFields:
            valueDf = valueDf.sort(F.col(keyField).asc())
        valueDf = valueDf.dropna()
        rows = valueDf.select("agg").take(numRows)
        valueList = []
        for row in rows:
            valueList.append(row["agg"])
        valueLists.append(valueList)
    return valueLists
def handleUIOptions(self, displayColName):
    agg = self.options.get("aggregation")
    valFields = self.options.get("valueFields")

    if agg == 'COUNT':
        return self.entity.groupBy(displayColName).agg(F.count(displayColName).alias("agg")).toPandas()
    elif agg == 'SUM':
        return self.entity.groupBy(displayColName).agg(F.sum(valFields).alias("agg")).toPandas()
    elif agg == 'AVG':
        return self.entity.groupBy(displayColName).agg(F.avg(valFields).alias("agg")).toPandas()
    elif agg == 'MIN':
        return self.entity.groupBy(displayColName).agg(F.min(valFields).alias("agg")).toPandas()
    elif agg == 'MAX':
        return self.entity.groupBy(displayColName).agg(F.max(valFields).alias("agg")).toPandas()
    elif agg == 'MEAN':
        return self.entity.groupBy(displayColName).agg(F.mean(valFields).alias("agg")).toPandas()
    else:
        return self.entity.groupBy(displayColName).agg(F.count(displayColName).alias("agg")).toPandas()
def test_smvRenameField_preserve_meta_for_unrenamed_fields(self):
    df = self.createDF("a:Integer; b:String", "1,abc;1,def;2,ghij")
    desc = "c description"
    res1 = df.groupBy(col("a")).agg(count(col("a")).alias("c"))\
        .smvDesc(("c", desc))
    self.assertEqual(res1.smvGetDesc(), [("a", ""), ("c", desc)])
    res2 = res1.smvRenameField(("a", "d"))
    self.assertEqual(res2.smvGetDesc(), [("d", ""), ("c", desc)])
def acquire_majority_clusters(communities_in):
    # Based on the top 5 clusters, where the majority are
    # 1 large community, 1 small community, and several small micro-communities
    q = communities_in.select("id", "type", "label")\
        .groupBy("label").agg(count("id").alias("count")).orderBy(desc("count"))
    maj_clusters = communities_in.select("id", "department", "loan", "type", "label")
    maj_clusters = maj_clusters.join(q.limit(5), on='label')\
        .select('id', 'department', 'type', 'loan', 'label')
    df_maj_clusters = maj_clusters.toPandas()
    df_maj_clusters = df_maj_clusters.rename(columns={'id': 'obj_id'})
    df_maj_clusters.obj_id = df_maj_clusters.obj_id.astype(long)  # Python 2 `long`; use `int` on Python 3
    # n_vertices_clusters = df_maj_clusters.shape[0]
    return maj_clusters, df_maj_clusters
def sampleColumn(self, numerical):
    default = None
    if Environment.hasSpark:
        from pyspark.sql import functions as F
        for field in self.entity.schema.fields:
            # Ignore unique ids
            if field.name.lower() != 'id' and (
                    not numerical or dataFrameMisc.isNumericType(field.dataType)):
                # Find a good column to display in pie ChartDisplay
                default = default or (field.name.decode("utf-8") if PY2 else field.name)
                count = self.entity.count()
                sample = self.entity.sample(False, (float(200) / count)) if count > 200 else self.entity
                orderedSample = sample.groupBy(field.name).agg(F.count(field.name).alias("agg"))\
                    .orderBy(F.desc("agg")).select("agg")
                if orderedSample.take(1)[0]["agg"] > 10:
                    return [field.name.decode("utf-8") if PY2 else field.name]
    # Otherwise, return first non-id column
    return [default]
def runAggregateFunctions(spark, df1, df2):
    # collect_list, collect_set
    doubledDf1 = df1.union(df1)
    doubledDf1.select(functions.collect_list(doubledDf1["name"])).show(truncate=False)
    doubledDf1.select(functions.collect_set(doubledDf1["name"])).show(truncate=False)

    # count, countDistinct
    doubledDf1.select(functions.count(doubledDf1["name"]),
                      functions.countDistinct(doubledDf1["name"])).show(truncate=False)

    # sum (use functions.sum, not the Python builtin)
    df2.printSchema()
    df2.select(functions.sum(df2["price"])).show(truncate=False)

    # grouping, grouping_id
    df2.cube(df2["store"], df2["product"])\
        .agg(functions.sum(df2["amount"]), functions.grouping(df2["store"]))\
        .show(truncate=False)
    df2.cube(df2["store"], df2["product"])\
        .agg(functions.sum(df2["amount"]), functions.grouping_id(df2["store"], df2["product"]))\
        .show(truncate=False)
def getPieColInfo(self, numerical):
    # If user selects a column in dialog box, give it to them
    keyFields = self.options.get("keyFields")
    if keyFields is not None:
        return keyFields

    schema = self.entity.schema
    default = None
    for field in schema.fields:
        # Ignore unique ids
        if field.name.lower() != 'id' and (
                not numerical or isNum(field.dataType.__class__.__name__)):
            # Find a good column to display in pie ChartDisplay
            default = default or field.name
            count = self.entity.count()
            sample = self.entity.sample(False, (float(200) / count)) if count > 200 else self.entity
            orderedSample = sample.groupBy(field.name).agg(F.count(field.name).alias("agg"))\
                .orderBy(F.desc("agg")).select("agg")
            if orderedSample.take(1)[0]["agg"] > 10:
                return field.name
    # Otherwise, return first non-id column
    return default
def describe_categorical_1d(df, column):
    value_counts = (df.select(column).na.drop()
                    .groupBy(column)
                    .agg(count(col(column)))
                    .orderBy("count({c})".format(c=column), ascending=False)
                    ).cache()

    # Get the most frequent class:
    stats = (value_counts
             .limit(1)
             .withColumnRenamed(column, "top")
             .withColumnRenamed("count({c})".format(c=column), "freq")
             ).toPandas().ix[0]  # .ix has been removed in recent pandas; .iloc[0] is the modern equivalent

    # Get the top 50 classes by value count,
    # and put the rest of them grouped at the
    # end of the Series:
    top_50 = value_counts.limit(50).toPandas().sort_values("count({c})".format(c=column), ascending=False)
    top_50_categories = top_50[column].values.tolist()

    others_count = pd.Series([df.select(column).na.drop()
                              .where(~(col(column).isin(*top_50_categories)))
                              .count()
                              ], index=["***Other Values***"])
    others_distinct_count = pd.Series([value_counts
                                       .where(~(col(column).isin(*top_50_categories)))
                                       .count()
                                       ], index=["***Other Values Distinct Count***"])
    top = top_50.set_index(column)["count({c})".format(c=column)]
    top = top.append(others_count)
    top = top.append(others_distinct_count)

    stats["value_counts"] = top
    stats["type"] = "CAT"
    value_counts.unpersist()

    unparsed_valid_jsons = df.select(column).na.drop().rdd.map(
        lambda x: guess_json_type(x[column])).filter(
        lambda x: x).distinct().collect()
    stats["unparsed_json_types"] = unparsed_valid_jsons

    return stats
def test_bounded_simple(self):
    from pyspark.sql.functions import mean, max, min, count

    df = self.data
    w1 = self.sliding_row_window
    w2 = self.shrinking_range_window

    plus_one = self.python_plus_one
    count_udf = self.pandas_agg_count_udf
    mean_udf = self.pandas_agg_mean_udf
    max_udf = self.pandas_agg_max_udf
    min_udf = self.pandas_agg_min_udf

    result1 = df.withColumn('mean_v', mean_udf(plus_one(df['v'])).over(w1)) \
        .withColumn('count_v', count_udf(df['v']).over(w2)) \
        .withColumn('max_v', max_udf(df['v']).over(w2)) \
        .withColumn('min_v', min_udf(df['v']).over(w1))

    expected1 = df.withColumn('mean_v', mean(plus_one(df['v'])).over(w1)) \
        .withColumn('count_v', count(df['v']).over(w2)) \
        .withColumn('max_v', max(df['v']).over(w2)) \
        .withColumn('min_v', min(df['v']).over(w1))

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
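# For reference: the pandas_agg_* attributes above are grouped-aggregate pandas UDFs
# defined elsewhere in the test suite. A minimal sketch of how such a UDF can be
# declared (Spark 2.4+); the name below is illustrative, not the suite's actual fixture:
from pyspark.sql.functions import pandas_udf, PandasUDFType

@pandas_udf("double", PandasUDFType.GROUPED_AGG)
def example_mean_udf(v):
    # receives the column values for each group/window frame as a pandas Series
    return v.mean()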
df_tab2.coalesce(1).write.mode('overwrite').option(
    "header", "true").format('com.databricks.spark.csv').save(output_path + "tab2.csv")

com_list = [
    "McDonald's", "Starbucks", "Chipotle Mexican Grill", "Dunkin'",
    "Buffalo Wild Wings", "Denny's", "Panera Bread", "Pizza Hut",
    "Taco Bell", "Wendy's"
]

df_10_bsn = df_re_bsn  # filter(df_re_bsn.name.isin(com_list)).select("text")

from pyspark.sql.functions import split, col, explode, count

df_tab4 = df_10_bsn.withColumn('words', split(col('text'), ' '))\
    .withColumn('word', explode(col('words')))\
    .drop('text', 'words').groupBy('word')\
    .agg(count('word').alias('count')).orderBy('count', ascending=False)

df_tab4.coalesce(1).write.mode('overwrite').option(
    "header", "true").format('com.databricks.spark.csv').save(output_path + "tab4.csv")

for i in range(10):
    df_tab3_iter = df_10_bsn.filter(df_10_bsn.name == com_list[i])
    df_tab3_iter = df_tab3_iter.withColumn('words', split(
        col('text'), ' ')).withColumn('word', explode(col('words'))).drop(
        'text', 'words').groupBy('word').agg(
        count('word').alias('count')).orderBy('count', ascending=False)
    df_tab3_iter.coalesce(1).write.mode('overwrite').option(
        "header", "true").format('com.databricks.spark.csv').save(output_path + str(i) + "tab4.csv")
def groupByMention(df):
    return df.withColumn('mentioned', f.explode(df.mentioned)).groupBy('mentioned')\
        .agg(f.count('id').alias('count'), f.avg('sentiment').alias('sentiment'))
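# A minimal usage sketch for groupByMention, assuming tweets with an id, a sentiment
# score, and an array column of mentioned handles (the toy data below is illustrative):
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.getOrCreate()
tweets = spark.createDataFrame(
    [(1, 0.8, ["alice", "bob"]), (2, -0.2, ["alice"]), (3, 0.1, [])],
    ["id", "sentiment", "mentioned"],
)
# explode() drops rows whose array is empty, so tweet 3 is excluded;
# "alice" ends up with count=2 and the mean of 0.8 and -0.2
groupByMention(tweets).show()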
print('misstatement_precision is {}, misstatement recall is {}'.format(
    misstatement_precision, misstatement_recall))
print('non_misstatement_precision is {}, non_misstatement recall is {}'.format(
    non_misstatement_precision, non_misstatement_recall))

# Downsampling:
misstated_df = integrated_df.filter(integrated_df.label == 1.0)
misstated_count = misstated_df.count()
non_misstated_df = integrated_df.filter(
    integrated_df.label == 0.0).limit(misstated_count)
integrated_df = misstated_df.union(non_misstated_df).cache()

# Using nullcounts to filter columns to keep
nullcounts = integrated_df.select([
    count(when(isnan(c) | col(c).isNull(), c)).alias(c)
    for c in integrated_df.columns
])
nc = list(nullcounts.first())

# Services-packaged software category selection (from EDA)
services_prepacked_software = integrated_df  # .filter(integrated_df.sic == '7372')

print('Total records in integrated file: ', integrated_df.count())
print('Number of records in Services-packaged software industrial category: ',
      services_prepacked_software.count())

# Reusing preprocessing steps implemented by Vincent
# filling nulls and nones with zeroes.
some_dict = {}
for x in services_prepacked_software.columns:
    some_dict[x] = 0
# )
# broken_readings.createOrReplaceTempView("broken_readings")

# COMMAND ----------

# ANSWER
from pyspark.sql.functions import col, count

broken_readings = (
    spark.read
    .format("delta")
    .load(health_tracker + "processed")
    .select(col("heartrate"), col("dte"))
    .where(col("heartrate") < 0)
    .groupby("dte")
    .agg(count("heartrate"))
    .orderBy("dte")
)
broken_readings.createOrReplaceTempView("broken_readings")

# COMMAND ----------

# MAGIC %sql
# MAGIC SELECT SUM(`count(heartrate)`) FROM broken_readings

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC #### Step 2: Verify That These are New Broken Readings
def main(sc):
    """
    Read GDELT data from S3, select columns, join tables, and perform
    calculations with grouped themes and document times
    """
    # Obtain taxonomy dictionary and broadcast to the workers
    tax_file = os.environ['TAX_LIST_FILE']
    tax_list = f.read_tax_file(tax_file)
    rdd_tax_list = sc.broadcast(tax_list)

    # Obtain list of top 500 themes used for filtering
    theme_file = os.environ['THEME_LIST_FILE']
    theme_list = f.read_theme_file(theme_file)
    rdd_theme_list = sc.broadcast(theme_list)

    # Obtain list of top news sources used for filtering
    src_file = os.environ['SRC_LIST_FILE']
    src_list = f.read_src_file(src_file)
    rdd_src_list = sc.broadcast(src_list)

    # Read "mentions" table from GDELT S3 bucket. Transform into RDD
    mentionRDD = sc.textFile('s3a://gdelt-open-data/v2/mentions/*.mentions.csv')
    mentionRDD = mentionRDD.map(lambda x: x.encode("utf", "ignore"))
    mentionRDD = mentionRDD.map(lambda x: x.split('\t'))
    mentionRDD = mentionRDD.filter(lambda x: len(x) == 16)
    mentionRDD = mentionRDD.filter(lambda x: f.is_not_empty([x[2], x[5], x[13]]))
    mentionRDD = mentionRDD.filter(lambda x: f.is_number(x[13]))
    mentionRowRDD = mentionRDD.map(lambda x: Row(
        mention_id=x[5],
        mention_doc_tone=float(x[13]),
        mention_time_date=f.transform_to_timestamptz_daily(x[2])
    ))

    # Read "GKG" table from GDELT S3 bucket. Transform into RDD
    gkgRDD = sc.textFile('s3a://gdelt-open-data/v2/gkg/YEARMONTH*0000.gkg.csv')
    gkgRDD = gkgRDD.map(lambda x: x.encode("utf", "ignore"))
    gkgRDD = gkgRDD.map(lambda x: x.split('\t'))
    gkgRDD = gkgRDD.filter(lambda x: len(x) == 27)
    gkgRDD = gkgRDD.filter(lambda x: f.is_not_empty([x[3], x[4], x[7]]))
    gkgRowRDD = gkgRDD.map(lambda x: Row(
        src_common_name=x[3],
        doc_id=x[4],
        themes=f.clean_taxonomy(x[7].split(';')[:-1], rdd_tax_list)
    ))

    sqlContext = SQLContext(sc)

    # Transform RDDs to dataframes
    mentionDF = sqlContext.createDataFrame(mentionRowRDD)
    gkgDF = sqlContext.createDataFrame(gkgRowRDD)
    df1 = mentionDF.alias('df1')
    df2 = gkgDF.alias('df2')

    # Themes and tones information are stored in two different tables
    joinedDF = df1.join(df2, df1.mention_id == df2.doc_id, "inner")\
        .select('df1.*', 'df2.src_common_name', 'df2.themes').repartition(2000)

    # Each document could contain multiple themes.
    # Explode on the themes and make a new column on filtered themes
    explodedDF = joinedDF.select('mention_id',
                                 'mention_doc_tone',
                                 'mention_time_date',
                                 'src_common_name',
                                 explode(joinedDF.themes).alias("theme")) \
        .filter(col('theme').isin(*(rdd_theme_list.value)))

    hist_data_udf = udf(f.hist_data, ArrayType(IntegerType()))
    get_quantile_udf = udf(f.get_quantile, ArrayType(FloatType()))

    # Compute statistics for each theme at a time
    explodedDF.cache()

    # Over all sources
    testDF1 = explodedDF.groupBy('theme', 'mention_time_date').agg(
        count('*').alias('num_mentions'),
        avg('mention_doc_tone').alias('avg'),
        collect_list('mention_doc_tone').alias('tones')
    )
    # For each source
    testDF2 = explodedDF.groupBy('theme', 'mention_time_date', 'src_common_name').agg(
        count('*').alias('num_mentions'),
        avg('mention_doc_tone').alias('avg'),
        collect_list('mention_doc_tone').alias('tones')
    ).repartition(2000)

    # Histogram and compute quantiles for tones
    histDF1 = testDF1.withColumn("bin_vals", hist_data_udf('tones')) \
        .withColumn("quantiles", get_quantile_udf('tones'))
    histDF2 = testDF2.withColumn("bin_vals", hist_data_udf('tones')) \
        .withColumn("quantiles", get_quantile_udf('tones'))

    finalDF1 = histDF1.select('theme', 'num_mentions', 'avg', 'quantiles', 'bin_vals',
                              col('mention_time_date').alias('time'))
    # Filter sources
    finalDF2 = histDF2.select('theme', 'src_common_name', 'num_mentions', 'avg', 'quantiles', 'bin_vals',
                              col('mention_time_date').alias('time')) \
        .filter(col('src_common_name').isin(*(rdd_src_list.value)))

    # Preparing to write to TimescaleDB
    # First write to group-by-src table
    db_properties = {}
    config = configparser.ConfigParser()
    config.read("db_properties.ini")
    db_prop = config['postgresql']
    db_url = db_prop['url']
    db_properties['username'] = db_prop['username']
    db_properties['password'] = db_prop['password']
    db_properties['url'] = db_prop['url']
    db_properties['driver'] = db_prop['driver']

    # Write to table
    finalDF1.write.format("jdbc").options(
        url=db_properties['url'],
        dbtable='bubblebreaker_schema.tones_table_v3',
        user='******',
        password='******',
        stringtype="unspecified"
    ).mode('append').save()

    # Then write to per-src table
    config.read("db_properties_src.ini")
    db_prop = config['postgresql']
    db_url = db_prop['url']
    db_properties['username'] = db_prop['username']
    db_properties['password'] = db_prop['password']
    db_properties['url'] = db_prop['url']
    db_properties['driver'] = db_prop['driver']

    # Write to table
    finalDF2.write.format("jdbc").options(
        url=db_properties['url'],
        dbtable='bubblebreaker_src_schema.tones_table_v2',
        user='******',
        password='******',
        stringtype="unspecified"
    ).mode('append').save()
def range_frame_match():
    return "RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" in df.select(
        F.count("*").over(
            window.Window.rangeBetween(-sys.maxsize, sys.maxsize))).columns[0]
kmeans = KMeans().setK(20).setMaxIter(5)

# fitting our features into K means
model = kmeans.fit(Dataframe.select('features'))

# Save your model
# model.save("F:\\kMeans")

# Adding the prediction from K means to the Dataset
clusters = model.transform(Dataframe)
clusters.show()
print("K means predictions")

clusters.select(
    month("dt").alias("month"),
    dayofmonth("dt").alias("day"),
    hour("dt").alias("hour"),
    "prediction").groupBy("month", "day", "hour", "prediction").agg(
        func.count("prediction").alias("count")).orderBy(
            "day", "hour", "prediction").show()
print("Count Total")

clusters.select(hour("dt").alias("hour"), "prediction").groupBy(
    "hour", "prediction").agg(func.count("prediction").alias("count")).orderBy(
        func.desc("count")).show()
print("Count Total ordered by count")

clusters.groupBy("prediction").count().show()
print("Counts in each cluster")
def summary(df, datatypes=None):
    spark = df.sql_ctx
    types = {x.name: x.dataType for x in list(df.schema)}

    # filter datatypes
    if datatypes is not None:
        types = {
            k: v
            for k, v in types.items()
            if any([x in datatypes for x in [v, str(v), v.simpleString()]])
        }

    res = pd.DataFrame.from_dict(types, orient='index')
    res.columns = ['datatype']

    count = df.count()
    res['count'] = count

    d = df.select([F.approx_count_distinct(c).alias(c) for c in df.columns]).toPandas().T
    d.columns = ['approx_distinct']
    d.index.name = 'index'
    res = res.join(d)

    res['unique_ratio'] = res['approx_distinct'] / count

    sel = []
    for c, v in types.items():
        if isinstance(v, (T.NumericType)):
            sel += [F.mean(c).alias(c)]
        else:
            sel += [F.min(F.lit(None)).alias(c)]
    d = df.select(sel).toPandas().T
    d.columns = ['mean']
    d.index.name = 'index'
    res = res.join(d)

    d = df.select([F.min(c).alias(c) for c in df.columns]).toPandas().T
    d.columns = ['min']
    d.index.name = 'index'
    res = res.join(d)

    d = df.select([F.max(c).alias(c) for c in df.columns]).toPandas().T
    d.columns = ['max']
    d.index.name = 'index'
    res = res.join(d)

    d = df.select([
        F.count(F.when(F.isnull(c), c)).alias(c) for c in df.columns
    ]).toPandas().T
    d.columns = ['null']
    d.index.name = 'index'
    res = res.join(d)

    sel = []
    for c, v in types.items():
        if isinstance(v, (T.NumericType)):
            sel += [F.count(F.when(F.isnan(c), c)).alias(c)]
        else:
            sel += [F.min(F.lit(0)).alias(c)]
    d = df.select(sel).toPandas().T
    d.columns = ['nan']
    d.index.name = 'index'
    res = res.join(d)

    sel = []
    for c, v in types.items():
        if isinstance(v, (T.StringType)):
            sel += [F.count(F.when(F.col(c).isin(''), c)).alias(c)]
        else:
            sel += [F.min(F.lit(0)).alias(c)]
    d = df.select(sel).toPandas().T
    d.columns = ['empty']
    d.index.name = 'index'
    res = res.join(d)

    return res
def zeros(col_name):
    return F.count(F.when(F.col(col_name) == 0, col_name))
def na(col_name):
    return F.count(
        F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name))
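# The two helpers above can be applied across every column in one aggregation pass.
# A minimal sketch, assuming floating-point columns (F.isnan is meant for float/double
# columns) and a local SparkSession; the toy data is illustrative:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(0.0, 1.0), (float("nan"), 0.0), (2.0, None)], ["a", "b"])

df.select(
    *[zeros(c).alias(c + "_zeros") for c in df.columns],
    *[na(c).alias(c + "_missing") for c in df.columns],
).show()
# a_zeros=1, b_zeros=1, a_missing=1 (the NaN), b_missing=1 (the null)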
def main(username): # For verification on the username received and print in console for demo. For actual deployment, # can comment away. print(f"Received username= {username}") # Start the Spark instance cnfg = SparkConf().setAppName("TwitterUserProfile").setMaster("local[2]") sc = SparkContext(conf=cnfg) spark = SparkSession(sc) # Initialise the first page of tweets & user (1 page consist of 10 entries) url = create_url(target=username) headers = create_headers(bearer_token) json_response = connect_to_endpoint(url, headers) # Parsing the JSON response returned by Twitter tweet_df = spark.createDataFrame(json_response['data']) # Check if there's geolocation field in the response. geo_exist = has_column(tweet_df, "geo") # Extracting the geolocation information via geo.place_id if geo_exist: tweet_df = tweet_df.select("author_id", "created_at", "geo.place_id", "id", "text") else: tweet_df = tweet_df.select("author_id", "created_at", "id", "text") # Extracting the user details user_df = spark.createDataFrame(json_response['includes']['users']) # flatten the public_metrics cols = list( map(lambda f: F.col("public_metrics").getItem(f).alias(str(f)), [ "following_count", "tweet_count", "listed_count", "followers_count" ])) public_metrics = user_df.select(cols) user_df = user_df.drop('public_metrics') # Merge user_df with public_metrics user_df = with_column_index(user_df) public_metrics = with_column_index(public_metrics) user_df = user_df.join(public_metrics, user_df.ColumnIndex == public_metrics.ColumnIndex, 'inner').drop("ColumnIndex") # If there are more tweets (next page / next token), append it to tweet_df. # user_df is just for a single user, so no need to append. Info will be the same. if 'next_token' not in json_response['meta']: pass else: next_token = json_response['meta']['next_token'] while next_token is not None: url = create_url(username, next_token) json_response = connect_to_endpoint(url, headers) new_tweets = spark.createDataFrame(json_response['data']) # Check if there's geolocation field in the new tweets new_tweet_geo_exist = has_column(new_tweets, "geo") if new_tweet_geo_exist: new_tweets = new_tweets.select("author_id", "created_at", "geo.place_id", "id", "text") else: new_tweets = new_tweets.select("author_id", "created_at", "id", "text") # to make sure all have the same number of columns for column in tweet_df.columns: if column not in new_tweets.columns: new_tweets = new_tweets.withColumn(column, F.lit(None)) for column in new_tweets.columns: if column not in tweet_df.columns: tweet_df = tweet_df.withColumn(column, F.lit(None)) # Reordering the column of new_tweets for union function if geo_exist: new_tweets = new_tweets.select("author_id", "created_at", "place_id", "id", "text") else: new_tweets = new_tweets.select("author_id", "created_at", "id", "text") tweet_df = tweet_df.union(new_tweets) if 'next_token' not in json_response['meta']: next_token = None else: next_token = json_response['meta']['next_token'] # Show the df. Can comment away in actual production. tweet_df.show(truncate=False) user_df.show(truncate=False) # Extract geolocation information within the tweets. Currently not in use. if geo_exist: location_df = tweet_df.select("author_id", "id", "place_id").dropna() location_df.show(truncate=False) # WORD FREQUENCY - to be made into word cloud in Tableau or other visualisation software. 
tweet_only = tweet_df.select("author_id", "text") # Remove punctuation, covert to lower case df_clean = tweet_only.select( "author_id", (lower(regexp_replace('text', "[^a-zA-Z\\s]", "")).alias('text'))) # Tokenize text tokenizer = Tokenizer(inputCol='text', outputCol='words_token') df_words_token = tokenizer.transform(df_clean).select( 'author_id', 'words_token') # Remove stop words remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean') df_words_no_stopw = remover.transform(df_words_token).select( 'author_id', 'words_clean') # Filter length word > 3 filter_length_udf = udf(lambda row: [x for x in row if 3 <= len(x) <= 13], ArrayType(StringType())) df_final_words = df_words_no_stopw.withColumn( 'words', filter_length_udf(col('words_clean'))) # Printing the word list. Can comment away in actual deployment. df_final_words.show(truncate=False) word_count = df_final_words.select('author_id', F.explode('words').alias('word')).\ groupBy('author_id', 'word').\ count().\ sort('count', ascending=False) # Printing the word list and count. Can comment away in actual deployment. word_count.show() # SENTIMENT ANALYSIS. Sentiment is in the range of (-1, 1). sentiment = udf(lambda x: TextBlob(x).sentiment[0]) tweet_sentiment = tweet_df.withColumn( "sentiment_score", sentiment(tweet_df["text"]).cast("double")) classify_sentiment_udf = udf(classify_sentiment) tweet_sentiment = tweet_sentiment.withColumn( "sentiment", classify_sentiment_udf(tweet_sentiment["sentiment_score"])) tweet_sentiment = tweet_sentiment.select('author_id', 'created_at', 'id', 'text', 'sentiment_score', 'sentiment') # Can comment away the show statement. Left here to display the progress in console for demo. tweet_sentiment.show() sentiment_count = tweet_sentiment.groupBy('author_id', 'sentiment').agg( F.mean('sentiment_score'), F.count('sentiment')).toDF("author_id", "sentiment", "avg_sentiment_score", "count") # Can comment away the show statement. Left here to display the progress in console for demo. sentiment_count.show() # Read in existing data from Amazon RedShift DB. If user already exists, need to merge and deduplicate, then write data back. with redshift_conn.connect() as conn, conn.begin(): # Check if Table exists first. If so, read in existing Twitter users that are already in RedShift DB. # The unique key is the id, which is the author_id, Twitter user id. if redshift_conn.has_table("user_data"): user = pd.read_sql(""" select * from user_data;""", conn) # Append latest data retrieved to those in DB and remove duplicates, keeping the latest. user = user.append(user_df.toPandas()) user = user.drop_duplicates(subset="id", keep="last") else: user = user_df.toPandas() # Similarly, check if the Table for sentiment count exists. If so, read in existing sentiment count # for existing users in RedShift DB. The pair, author_id and sentiment," is used for deduplication. if redshift_conn.has_table("sentiment_count"): senti_df = pd.read_sql( """ select * from sentiment_count;""", conn) # Append latest data to those in DB and remove duplicates, keeping the latest. senti_df = senti_df.append(sentiment_count.toPandas()) senti_df = senti_df.drop_duplicates( subset=["author_id", "sentiment"], keep="last") else: senti_df = sentiment_count.toPandas() # Checking if Table for word_count already exists in RedShift. If so, read in existing word count for # existing users in RedShift DB. Distinct pair of author_id and word is used for comparison. 
if redshift_conn.has_table("word_count"): word_df = pd.read_sql( """ select * from word_count;""", conn) # Append latest data to those in DB and remove duplicates, keeping the latest. word_df = word_df.append(word_count.toPandas()) word_df = word_df.drop_duplicates(subset=["author_id", "word"], keep="last") else: word_df = word_count.toPandas() # Check for Table, tweet_sentiment. If exists, read in existing tweet sentiment for existing users in # RedShift DB. The unique ID used is the tweet id, which is unique for each tweet. All unique tweets # are kept. Thus even if the Twitter user deleted his old tweets, it will still be retained in the # Redshift DB if it was previously captured. if redshift_conn.has_table("tweet_sentiment"): tweet_db = pd.read_sql( """ select * from tweet_sentiment;""", conn) # Append latest data to those in DB and remove duplicates, keeping the latest. tweet_db = tweet_db.append(tweet_sentiment.toPandas()) tweet_db = tweet_db.drop_duplicates(subset="id", keep="last") else: tweet_db = tweet_sentiment.toPandas() # Update the data to Redshift. user.to_sql('user_data', redshift_conn, index=False, if_exists='replace') word_df.to_sql('word_count', redshift_conn, index=False, if_exists='replace') senti_df.to_sql('sentiment_count', redshift_conn, index=False, if_exists='replace') tweet_db.to_sql('tweet_sentiment', redshift_conn, index=False, if_exists='replace', dtype={ 'author_id': sqlalchemy.types.VARCHAR(length=255), 'created_at': sqlalchemy.types.VARCHAR(length=255), 'id': sqlalchemy.types.VARCHAR(length=255), 'text': sqlalchemy.types.VARCHAR(length=5000), 'sentiment_score': sqlalchemy.types.Float(precision=3, asdecimal=True), 'sentiment': sqlalchemy.types.VARCHAR(length=255), }) # Location information in tweet. Currently not in use. # location.to_sql('location_data', redshift_conn, index=False, if_exists='replace') # Can comment away print statement for actual deployment. Left here so that status will be printed in # console for demo purpose. print("Redshift DB updated successfully.")
def basic_eda(df, dependent_var, id_var): eda_start_time = time() # Extracting Data Types of All Columns print("\n++++++ Printing Data Types of All Columns ++++++\n") df.printSchema() # Duplicate Observation Checking print("\n++++++ Printing Duplicate Removal Summary ++++++\n") print("Total No of Obs Before Duplicate Removal: " + str(df.count())) print("Unique No of Obs Before Duplicate Removal: " + str(df.distinct().count())) # Removing Duplicate Observations df = df.dropDuplicates() df = df.na.drop('all') print("Total No of Obs After Duplicate Removal: " + str(df.count())) print("Unique No of Obs After Duplicate Removal: " + str(df.distinct().count())) # Extracting Dependent and Independent Variables column_names = [item[0] for item in df.dtypes] categorical_var = [ item[0] for item in df.dtypes if item[1].startswith('string') ] independent_catgorical_var = [ x for x in categorical_var if x not in [id_var, dependent_var] ] independent_continuous_var = [ x for x in column_names if x not in independent_catgorical_var + [id_var, dependent_var] ] # Descriptive Summary of Numeric Variables temp_df_1 = pd.DataFrame() desc_summary_1 = pd.DataFrame() for col_name in df[independent_continuous_var].columns: temp_df_1.loc[0, "Column_Name"] = col_name temp_df_1.loc[0, "Total_Obs"] = df.agg({ col_name: "count" }).collect()[0][0] temp_df_1.loc[0, "Unique_No_Obs"] = df.select( col_name).distinct().count() temp_df_1.loc[0, "Missing_No_Obs"] = df.select( count(when(isnan(col_name) | col(col_name).isNull(), col_name))).toPandas().iloc[0, 0] temp_df_1.loc[0, "Min"] = df.agg({col_name: "min"}).collect()[0][0] temp_var = df.approxQuantile(col_name, [ 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.85, 0.95, 0.99, ], 0) temp_df_1.loc[0, "Pct_1"] = temp_var[0] temp_df_1.loc[0, "Pct_5"] = temp_var[1] temp_df_1.loc[0, "Pct_10"] = temp_var[2] temp_df_1.loc[0, "Pct_25"] = temp_var[3] temp_df_1.loc[0, "Median"] = temp_var[4] temp_df_1.loc[0, "Average"] = df.agg({col_name: "avg"}).collect()[0][0] temp_df_1.loc[0, "Pct_75"] = temp_var[5] temp_df_1.loc[0, "Pct_85"] = temp_var[6] temp_df_1.loc[0, "Pct_95"] = temp_var[7] temp_df_1.loc[0, "Pct_99"] = temp_var[8] temp_df_1.loc[0, "Max"] = df.agg({col_name: "max"}).collect()[0][0] desc_summary_1 = desc_summary_1.append(temp_df_1) desc_summary_1.reset_index(inplace=True, drop=True) print( "\n++++++ Printing Summary Statistics For Numeric Variables ++++++\n") display(desc_summary_1) # Target Variables V/s Categorical Variables temp_df_2 = pd.DataFrame() desc_summary_2 = pd.DataFrame() for x in independent_catgorical_var: temp_df_2 = df.groupby(x).agg({dependent_var: "avg"}).toPandas() temp_df_2.columns = ["Column_Value", "Avg_Target_Var"] temp_df_2["Column_Name"] = x temp_df_2 = temp_df_2.iloc[:, [2, 0, 1]] desc_summary_2 = desc_summary_2.append(temp_df_2) print( "\n++++++ Printing Averages of Target Variable Grouped By All Categorical Variable ++++++\n" ) display(desc_summary_2) # Returning Final Output desc_summary = [desc_summary_1, desc_summary_2] final_list = (df, independent_catgorical_var, independent_continuous_var, desc_summary) eda_end_time = time() eda_elapsed_time = (eda_end_time - eda_start_time) / 60 print("\nTime To Perform EDA: %.3f Minutes\n" % eda_elapsed_time) return (final_list)
)

# In[238]:

match_india_played = india_filtered_data.distinct().count()

# In[239]:

win_loss_percentage = india_filtered_data.groupby(
    "win_flag"
).agg(
    F.count("Team 1").alias("match_count")
).withColumn(
    "percentage",
    (F.col("match_count") * 100.0) / F.lit(match_india_played)
)

# In[240]:

win_loss_percentage.show()

# # 2. What is India’s Win/Loss/Tie percentage in away and home matches?

# In[148]:
def main(output): stream_sch, channel_sch = createSchema() # data_s = spark.read.json('stream_base/part*', schema = stream_sch) # data_c = spark.read.json('channel_base/part*', schema = channel_sch) convertTime = functions.udf(timeToFrame) data_s = spark.read.json('stream_info.json', schema=stream_sch) data_c = spark.read.json('channel_info.json', schema=channel_sch) data_s = data_s.withColumn('time_frame', convertTime(data_s.created_at)).cache() data_s.createOrReplaceTempView('data_s') data_c.createOrReplaceTempView('data_c') game_count_by_time = data_s.groupBy('time_frame', 'game').count() game_count_by_time = game_count_by_time.orderBy( game_count_by_time['count'].desc()) view_count_by_time = data_s.groupBy('time_frame', 'game').agg( functions.sum('viewers').alias('total_view')) view_count_by_time = view_count_by_time.orderBy( view_count_by_time['total_view'].desc()) # game_count_by_time.coalesce(1).write.json('game_count_by_time', mode='overwrite') # view_count_by_time.coalesce(1).write.json('view_count_by_time', mode='overwrite') # see which games have the most audiences and followers view_num_by_game = data_c.groupby(data_c['game'])\ .agg(functions.sum(data_c['views']),functions.sum(data_c['followers'])) # see who are the currently most popular streamers view_num_by_streamer = data_c\ .select('stream_id','channel_id','game','name','views','followers','created_at','updated_at','partner')\ .orderBy(functions.desc('views'),'game') #print(view_num_by_streamer.show(5)) # see what are the games that have the most total vies and total follower (the most popular games in twitch recent history) viewcount_by_game = view_num_by_game\ .select('game', view_num_by_game['sum(views)'].alias('total_views'), view_num_by_game['sum(followers)'].alias('total_followers'))\ .orderBy(functions.desc('total_views')) #print(viewcount_by_game.show(5)) # see what are the most popular non-english speaking streams (by game and language) yuyan = spark.sql( """SELECT broadcaster_language, game, SUM(views) AS total_views FROM data_c WHERE broadcaster_language != 'en' GROUP BY broadcaster_language, game ORDER BY total_views DESC """) yuyan.createOrReplaceTempView('yuyan') #print(yuyan.show(5)) # see what are the biggest broadcaster communities (by language) yuyan_by_game = spark.sql( """SELECT broadcaster_language, game, count(*) AS total_streamer FROM data_c GROUP BY broadcaster_language, game ORDER BY total_streamer DESC """) yuyan.createOrReplaceTempView('yuyan_by_game') #print(yuyan_by_game.show(5)) # -------------------------ow jonning the 2 tables--------------------------------------- # joint_df = t_max.join(t_min, (t_max.stationmax == t_min.stationmin) & (t_max.date == t_min.date), 'inner') #put WHERE above ORDER BY ,stream.game is dropped since some streamers are playing games different than what are shown in stream.game cs_joint_table = spark.sql(""" SELECT s.stream_id AS stream_id, c.game AS game, c.name AS name, s.viewers AS watchings, s.time_frame as time_frame, c.views AS views, c.followers AS followers, s.created_at AS stream_created_date, c.updated_at AS channel_last_updated, c.broadcaster_language, c.language, c.created_at AS channel_created_date, c.display_name, c.status, c.mature, c.partner, s.average_fps, s.delay, s.video_height, c.broadcaster_software FROM data_c AS c JOIN data_s AS s ON s.stream_id = c.stream_id ORDER BY watchings DESC """).cache() cs_joint_table.createOrReplaceTempView('cs_joint_table') #cs_joint_table.coalesce(1).write.csv(output, mode='overwrite') 
#cs_joint_table.coalesce(1).write.json(output, mode='overwrite') #-------------------------------------list of attributes in cs_joint_table:--------------------------------------- # """ # stream_id # game (game name) # name (streamer name) # watchings (current number of audiences) # time_frame # views (current total views of the stream) # followers # stream_created_date # channel_last_updated # broadcaster_language # language # channel_created_date # display_name (streamer's displayed name, has emojis and stuff) # status (like a brief intro to the channel) # mature # partner # average_fps # delay # video_height # broadcaster_software (most of the streamers didn't specify this) # """ # ------partnership and average streaming fps and current audiences(num of people watching) by game and streamer-------- partner = spark.sql(""" SELECT game, partner, COUNT(name), AVG(average_fps), AVG(delay), SUM(watchings), SUM(views), SUM(followers), AVG(video_height) FROM cs_joint_table WHERE game LIKE 'Call of Duty%' GROUP BY partner, game HAVING COUNT(name) > 100 ORDER BY game """) #print(partner.show(50)) # ----------------------mature vs non-mature contents------------------------------ mature = spark.sql(""" SELECT game, mature, COUNT(name), SUM(watchings), SUM(views), SUM(followers) FROM cs_joint_table GROUP BY mature, game ORDER BY game """) #print(mature.show(50)) mature_total = cs_joint_table.select('game', 'mature', 'name', 'watchings', 'views', 'followers').groupBy('mature')\ .agg(functions.count('mature'), functions.sum('watchings'), functions.sum('views').alias('total_views'), functions.sum('followers')) mature_total = mature_total.orderBy(mature_total['total_views'].desc()) print(mature_total.show(2))
# MAGIC +-------+-----------------------------+-----+-------+
# MAGIC |5.0    |Ella Lola, a la Trilby (1898)|1    |94431  |
# MAGIC |5.0    |Serving Life (2011)          |1    |129034 |
# MAGIC |5.0    |Diplomatic Immunity (2009? ) |1    |107434 |
# MAGIC +-------+-----------------------------+-----+-------+
# MAGIC only showing top 3 rows
# MAGIC ```

# COMMAND ----------

# TODO: Replace <FILL_IN> with appropriate code
from pyspark.sql import functions as F

# From ratingsDF, create a movie_ids_with_avg_ratings_df that combines the two DataFrames
ratings_df.show(3)

movie_ids_with_avg_ratings_df = ratings_df.groupBy('movieId').agg(
    F.count(ratings_df.rating).alias("count"),
    F.avg(ratings_df.rating).alias("average"))
print('movie_ids_with_avg_ratings_df:')
movie_ids_with_avg_ratings_df.show(3, truncate=False)

# Note: movie_names_df is a temporary variable, used only to separate the steps necessary
# to create the movie_names_with_avg_ratings_df DataFrame.
movie_names_df = movie_ids_with_avg_ratings_df.join(
    movies_df, movie_ids_with_avg_ratings_df["movieId"] == movies_df["Id"])
movie_names_with_avg_ratings_df = movie_names_df.drop("Id")
print('movie_names_with_avg_ratings_df:')
movie_names_with_avg_ratings_df.show(3, truncate=False)

# COMMAND ----------

# TEST Movies with Highest Average Ratings (1a)
Test.assertEquals(movie_ids_with_avg_ratings_df.count(), 26744,
def range_frame_match():
    return "RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" in df.select(
        F.count("*").over(window.Window.rangeBetween(-sys.maxsize, sys.maxsize))
    ).columns[0]
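# Both range_frame_match helpers probe the generated column name to detect whether Spark
# turned the sys.maxsize bounds into an unbounded range frame. Newer Spark versions expose
# explicit constants for the same frame, which avoids the sys.maxsize trick; a minimal
# sketch with illustrative names:
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.range(3)

w = (Window.orderBy("id")
     .rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing))
df.select(F.count("*").over(w).alias("n")).show()  # n == 3 for every row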
def findGenderAggCount(self, userDF):
    genderAggCntDF = userDF.groupBy("gender").\
        agg(F.count("gender").alias("CountOfEmployeesByGender")).\
        sort(F.desc("CountOfEmployeesByGender"))
    return genderAggCntDF
def app_and_plays(result_loc_, date_): """ Compute App Sessions and content play sessions and time spent on content consumption. :param result_loc_: pathlib.Path object to store resultant CSV at. :param date_: datetime object to use in query and path :return: None """ spark = SparkSession.builder.appName("content_plays").master( "local[*]").getOrCreate() account_name = os.environ['AZURE_STORAGE_ACCOUNT'] account_key = os.environ['AZURE_STORAGE_ACCESS_KEY'] container = 'telemetry-data-store' spark.conf.set( 'fs.azure.account.key.{}.blob.core.windows.net'.format(account_name), account_key) path = 'wasbs://{}@{}.blob.core.windows.net/telemetry-denormalized/summary/{}-*'.format( container, account_name, date_.strftime('%Y-%m-%d')) data = spark.read.json(path).filter( func.col("dimensions.pdata.id").isin( config['context']['pdata']['id']['app'], config['context']['pdata'] ['id']['portal'], config['context']['pdata']['id']['desktop']) & func.col("dimensions.type").isin("content", "app")).select( func.col("dimensions.sid"), func.col("dimensions.pdata.id").alias("pdata_id"), func.col("dimensions.type"), func.col("dimensions.mode"), func.col("dimensions.did"), func.col("object.id").alias("object_id"), func.col("edata.eks.time_spent"), func.col("object.rollup.l1")) app = data.filter( func.col('type').isin('app') & func.col('pdata_id').isin( config['context']['pdata']['id']['app'], config['context']['pdata'] ['id']['desktop'])) app_df = app.groupBy(func.col('pdata_id')).agg( func.count('sid').alias('Total App Sessions'), func.countDistinct('did').alias('Total Devices on App'), (func.sum('time_spent') / 3600).alias('Total Time on App (in hours)')).toPandas() app_df['x_index'] = 0 app_df.set_index("x_index", inplace=True) x_app = app_df.pivot(columns='pdata_id') result_loc_.joinpath(date_.strftime('%Y-%m-%d')).mkdir(exist_ok=True) x_app.to_csv(result_loc_.joinpath(date_.strftime('%Y-%m-%d'), 'app_sessions.csv'), index=False) post_data_to_blob(result_loc_.joinpath(date_.strftime('%Y-%m-%d'), 'app_sessions.csv'), backup=True) play = data.filter(func.col('mode').isin('play')) content = spark.read.csv( str( result_loc_.parent.joinpath('tb_metadata', date_.strftime('%Y-%m-%d'), 'textbook_snapshot.csv')), header=True).select(func.col('identifier'), func.col('channel')).distinct() play_df = play.join( content, play.l1 == content.identifier, how='left').groupBy(func.col('channel'), func.col('pdata_id')).agg( func.count('sid').alias('Total Content Plays'), func.countDistinct('did').alias( 'Total Devices that played content'), (func.sum('time_spent') / 3600).alias('Content Play Time (in hours)')).toPandas() x_play = play_df.pivot(index='channel', columns='pdata_id') x_play.to_csv(result_loc_.joinpath(date_.strftime('%Y-%m-%d'), 'plays.csv')) post_data_to_blob(result_loc_.joinpath(date_.strftime('%Y-%m-%d'), 'plays.csv'), backup=True) spark.stop()
def findOccupAggCount(self, userDF):
    occupAggCntDF = userDF.groupBy("occupation").\
        agg(F.count("occupation").alias("CountOfEmployeesByOccup")).\
        sort(F.desc("CountOfEmployeesByOccup"))
    return occupAggCntDF
def main(): glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session # thoi gian tu 01/10/2019 timestamp = 1569888000 ## Phonetic dyf_learning_object = glueContext.create_dynamic_frame.from_catalog( database="nvn_knowledge", table_name="learning_object" ) dyf_phonemic = Filter.apply(frame=dyf_learning_object, f=lambda x: x["learning_object_type"] == 'phonetic') dyf_phonemic = dyf_phonemic.select_fields(['learning_object_id', 'learning_object_name']) # df_phonemic = dyf_phonemic.toDF() # df_phonemic = df_phonemic.withColumn('lo_name', convertedudf(df_phonemic.learning_object_name)) # df_phonemic.show() # Lay ra ngu am df1 = dyf_phonemic.toDF() df1 = df1.select('learning_object_id', 'learning_object_name') # myArr = np.array(df1.select('phonemic').collect()) arrPhonetic = [row.learning_object_name for row in df1.collect()] arrPhoneticId = [[row.learning_object_name, row.learning_object_id] for row in df1.collect()] # print(unicode(arrPhonetic[2])) # print('ARR:', arrPhonetic) # print('ARR:', arrPhonetic[2].encode('utf-8', 'replace')) # print('ARR1 :', (u'i:' in arrPhonetic)) # ETL TBHV # Custom function def doAddScoreAll(plus, minus): if plus is None and minus is not None: return minus if minus is None and plus is not None: return plus if minus is not None and plus is not None: return plus + minus return 0 addScoreAll = udf(doAddScoreAll, IntegerType()) def do_get_phone_tic_id(phonetic): phonetic = phonetic.encode('utf-8', 'replace').strip() for x in arrPhoneticId: p = x[0].encode('utf-8', 'replace').strip() if p == phonetic: return x[1] get_phone_tic_id = udf(do_get_phone_tic_id, IntegerType()) def do_check_null(val1, val2): if val1 is None and val2 is not None: return val2 if val2 is None and val1 is not None: return val1 if val1 is not None and val2 is not None: return val1 return 0 check_data_null = udf(do_check_null, StringType()) def doSplitWord(word): rs = [] if word is not None: i = 0 size = len(word) while i < size: s = word[i:i + 2] i += 2 if s in arrPhonetic: rs.append(s) if s not in arrPhonetic: i -= 2 s = word[i:i + 1] i += 1 if s in arrPhonetic: rs.append(s) return rs splitWord = udf(lambda x: doSplitWord(x)) state_right = 'state_right' state_wrong = 'state_wrong' # mac dinh duoc cong knowledge # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D1; P3_D2; P4_D1; P4_D2 # knowledge = [] # cong diem comprehension: # Can list cac name duoc cong diem comprehension: # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2 comprehension = ['P1_D1', 'P1_D2', 'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D1', 'P3_D2', 'P4_D1', 'P4_D2'] # cong diem application: # Can list cac name duoc cong diem application: # P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2 application = ['P1_D1', 'P1_D2', 'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D1', 'P3_D2', 'P4_D1', 'P4_D2'] # cong diem analysis: # Can list cac name duoc cong diem analysis # P2_D3; P3_D2; P4_D1; P4_D2 analysis = ['P2_D3', 'P3_D2', 'P4_D1', 'P4_D2'] # cong diem synthesis: # Can list cac name duoc cong diem synthesis # P4_D1; P4_D2 synthesis = [] # cong diem evaluation: # Can list cac name duoc cong diem evaluation evaluation = [] def doAddScore(name, state, type): arr = [''] score = 0 if type == 'comprehension': arr = comprehension if type == 'application': arr = application if type == 'analysis': arr = analysis if type == 'synthesis': arr = synthesis name = name.lower() if state == state_right: score = 2 if state == state_wrong: score = -1 if name is not None: for x in arr: if x.lower() in name: return score 
return 0 addScore = udf(doAddScore, IntegerType()) # chuoi ky tu can replace special_str = '["] ;' ########## top_quiz_attempts dyf_top_quiz_attempts = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_quiz_attempts" ) dyf_top_quiz_attempts = dyf_top_quiz_attempts.select_fields(['_key', 'id', 'timestart', 'quiz']) dyf_top_quiz_attempts = dyf_top_quiz_attempts.resolveChoice(specs=[('_key', 'cast:long')]) # print dyf_top_quiz_attempts.count() # dyf_top_quiz_attempts.show(2) dyf_top_quiz_attempts = Filter.apply(frame=dyf_top_quiz_attempts, f=lambda x: x["timestart"] >= timestamp) # print dyf_top_quiz_attempts.count() # dyf_top_quiz_attempts.show() # xu ly truong hop start_read is null # try: # # # doc moc flag tu s3 # df_flag = spark.read.parquet("s3a://dtsodin/flag/flag_knowledge_ngu_am_top_ai") # start_read = df_flag.collect()[0]['flag'] # print('read from index: ', start_read) # # # so sanh _key datasource voi flag, lay nhung gia tri co key > flag # dyf_top_quiz_attempts = Filter.apply(frame=dyf_top_quiz_attempts, f=lambda x: x['_key'] > start_read) # except: # print('read flag file error ') # print('the number of new contacts: ', dyf_top_quiz_attempts.count()) if dyf_top_quiz_attempts.count() > 0: ########## dyf_top_user dyf_top_user = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="do_top_user" ) dyf_top_user = dyf_top_user.select_fields( ['id', 'student_id']).rename_field('id', 'top_user_id') ######### top_question dyf_top_question = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_question" ) dyf_top_question = dyf_top_question.select_fields( ['id', 'name']) # dyf_top_result_ai = dyf_top_result_ai.resolveChoice(specs=[('_key', 'cast:long')]) ######### top_result_ai dyf_top_result_ai = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_result_ai" ) dyf_top_result_ai = dyf_top_result_ai.select_fields( ['question_id', 'attempt_id', 'user_id', 'ratio', 'right_word', 'wrong_word']) # JOIN va FILTER cac bang theo dieu kien dyf_join01 = Join.apply(dyf_top_result_ai, dyf_top_question, 'question_id', 'id') dyf_join02 = Join.apply(dyf_join01, dyf_top_quiz_attempts, 'attempt_id', 'id') dyf_join02 = Filter.apply(frame=dyf_join02, f=lambda x: x["quiz"] not in [7, 9, 918]) dyf_join02 = Join.apply(dyf_join02, dyf_top_user, 'user_id', 'top_user_id') # dyf_join02 = Filter.apply(frame=dyf_join02, f=lambda x: x["student_id"] == 259442) # dyf_join02.show() df_study = dyf_join02.toDF() df_study.cache() if (df_study.count() > 0): try: # print("COUNT 1:", df_study.count()) # Loc cac ky tu dac biet [ ] " # Hien data co dang nhu sau: ["house","her","to","how","get","long"] hoac "environmental", ... 
# df_study = df_study.select( # 'quiz', 'name', 'user_id', 'timestart', 'right_word', 'wrong_word', f.translate(df_study.right_word, # special_str, ''), f.translate(df_study.wrong_word, # special_str, '')) df_study = df_study.select( 'quiz', 'name', 'student_id', 'timestart', 'right_word', 'wrong_word') df_study = df_study.withColumn("right_word_new", f.translate(df_study.right_word, special_str, '')) \ .withColumn("wrong_word_new", f.translate(df_study.wrong_word, special_str, '')) # Tach cau thanh array tu: # house, her => [house, her] # PHan tich tu dung df_study_right = df_study.withColumn("right_word_list", f.split( df_study.right_word_new, ',')) # Split column array => nhieu row # row: [house, her] => # row1: house # row2: her df_study_right = df_study_right.withColumn("right", f.explode(df_study_right.right_word_list)) # convert to lowercase df_study_right = df_study_right.withColumn("right", f.lower(f.col("right"))) df_study_right = df_study_right.select('quiz', 'name', 'student_id', 'timestart', 'right') # print("COUNT 2:", df_study_right.count()) # df_study_right.printSchema() # df_study_right.show() dyf_study_right = DynamicFrame.fromDF(df_study_right, glueContext, "dyf_study_right") ## Learning Object # dyf_learning_object = glueContext.create_dynamic_frame.from_catalog( # database="nvn_knowledge", # table_name="nvn_knowledge_learning_object" # ) dyf_learning_object = Filter.apply(frame=dyf_learning_object, f=lambda x: x["learning_object_type"] == 'vocabulary') dyf_learning_object = dyf_learning_object.select_fields( ['learning_object_id', 'learning_object_name', 'transcription']) df_learning_object = dyf_learning_object.toDF() # convert to lowercase df_learning_object = df_learning_object.withColumn("learning_object_name", f.lower(f.col("learning_object_name"))) # replace cac ky tu df_learning_object = df_learning_object.withColumn("phone_tic_new", f.translate(df_learning_object.transcription, '\',', '')) df_learning_object = df_learning_object.withColumn("phone_tic_tmp", splitWord(df_learning_object.phone_tic_new)) df_learning_object = df_learning_object.withColumn("phone_tic_tmp_01", f.translate(df_learning_object.phone_tic_tmp, '[]', '')) df_learning_object = df_learning_object.withColumn("phone_tic_arr", f.split(df_learning_object.phone_tic_tmp_01, ',')) df_learning_object = df_learning_object.withColumn("split_phonetic", f.explode(df_learning_object.phone_tic_arr)) df_learning_object = df_learning_object.select('learning_object_id', 'learning_object_name', 'split_phonetic') dyf_learning_object = DynamicFrame.fromDF(df_learning_object, glueContext, "dyf_learning_object") dyf_knowledge_right = Join.apply(dyf_study_right, dyf_learning_object, 'right', 'learning_object_name') # print("COUNT 3:", dyf_knowledge_right.count()) # dyf_knowledge_right.printSchema() # 1 df_knowledge_right = dyf_knowledge_right.toDF() # df_knowledge_right = df_knowledge_right.withColumn("right_phonetic", # f.explode(df_knowledge_right.phone_tic_arr)) df_knowledge_right = df_knowledge_right.select('timestart', 'name', 'student_id', 'split_phonetic') df_knowledge_right = df_knowledge_right.withColumn("learning_object_id", get_phone_tic_id(df_knowledge_right.split_phonetic)) # dyf_phonemic_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_phonemic_right") # dyf_phonemic_right = Join.apply(dyf_study_right, dyf_phonemic, 'split_phonetic', 'learning_object_name') # # dropnullfields = DropNullFields.apply(frame=dyf_phonemic_right, transformation_ctx="dropnullfields") # datasink6 = 
glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "mapping_lo_student_history_v06", # "database": "dts_odin" # }, # redshift_tmp_dir="s3n://dts-odin/temp1/top_question_attempt/", # transformation_ctx="datasink6") # dyf_knowledge_wrong.printSchema() # Cong diem cac tu dung # df_knowledge_right = dyf_phonemic_right.toDF() # print("COUNT 4:") # df_knowledge_right.printSchema() df_knowledge_right.cache() df_knowledge_right = df_knowledge_right.withColumn("knowledge", f.lit(2)) \ .withColumn("comprehension", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('comprehension'))) \ .withColumn("application", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('application'))) \ .withColumn("analysis", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('analysis'))) \ .withColumn("synthesis", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('synthesis'))) \ .withColumn("evaluation", f.lit(0)) \ .withColumn("date_id", from_unixtime(df_knowledge_right['timestart'], 'yyyyMMdd')) \ .withColumn("lo_type", f.lit(2)) dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right") # dropnullfields = DropNullFields.apply(frame=dyf_knowledge_right, transformation_ctx="dropnullfields") # datasink6 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "mapping_lo_student_history_v02", # "database": "dts_odin" # }, # redshift_tmp_dir="s3n://dts-odin/temp1/top_question_attempt/", # transformation_ctx="datasink6") # print("COUNT 444444444444444:", df_knowledge_right.count()) # df_knowledge_right.printSchema() # df_knowledge_right.show() # # dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right") # # chon cac truong va kieu du lieu day vao db # applymapping = ApplyMapping.apply(frame=dyf_knowledge_right, # mappings=[("timestart", "long", "timestart", "long"), # ("student_id", 'int', 'student_id', 'long'), # ("name", 'string', 'name', 'string'), # ("learning_object_id", "long", "learning_object_id", "long"), # ("date_id", "string", "date_id", "long"), # ("knowledge", "int", "knowledge", "long"), # ("comprehension", "int", "comprehension", "long"), # ("application", "int", "application", "long"), # ("analysis", "int", "analysis", "long"), # ("synthesis", "int", "synthesis", "long"), # ("evaluation", "int", "evaluation", "long"), # ("lo_type", "int", "lo_type", "int")]) # resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols", # transformation_ctx="resolvechoice") # dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields") # # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "t_temp_right_learning_object_phonetic", # "database": "dts_odin" # }, # redshift_tmp_dir="s3n://dts-odin/temp1/", # transformation_ctx="datasink5") # END Cong diem cac tu dung ################################################## # Tru diem cac tu sai: Xu lu tuong tu tu dung. 
# rule tru diem la -1 diem neu sai df_study_wrong = df_study.withColumn("wrong_word_list", f.split( df_study.wrong_word_new, ',')) # Split column array => nhieu row # row: [house, her] => # row1: house # row2: her df_study_wrong = df_study_wrong.withColumn("wrong", f.explode(df_study_wrong.wrong_word_list)) #convert to lowercase df_study_wrong = df_study_wrong.withColumn("wrong", f.lower(f.col("wrong"))) df_study_wrong = df_study_wrong.select('quiz', 'name', 'student_id', 'timestart', 'wrong') # print("COUNT 2222:", df_study_wrong.count()) # df_study_wrong.printSchema() # df_study_wrong.show() dyf_study_wrong = DynamicFrame.fromDF(df_study_wrong, glueContext, "dyf_study_wrong") ## Learning Object dyf_knowledge_wrong = Join.apply(dyf_study_wrong, dyf_learning_object, 'wrong', 'learning_object_name') df_knowledge_wrong = dyf_knowledge_wrong.toDF() # df_knowledge_wrong = df_knowledge_wrong.withColumn("wrong_phonetic", # f.explode(df_knowledge_wrong.phone_tic_arr)) df_knowledge_wrong = df_knowledge_wrong.select('timestart', 'name', 'student_id', 'split_phonetic') df_knowledge_wrong = df_knowledge_wrong.withColumn("learning_object_id", get_phone_tic_id(df_knowledge_wrong.split_phonetic)) # dyf_study_wrong = DynamicFrame.fromDF(df_knowledge_wrong, glueContext, "dyf_study_wrong") # dyf_phonemic_wrong = Join.apply(dyf_study_wrong, dyf_phonemic, 'split_phonetic', 'learning_object_name') # print("COUNT 3:", dyf_knowledge_wrong.count()) # dyf_knowledge_wrong.printSchema() # print("COUNT 4:", dyf_knowledge_wrong.count()) # dyf_knowledge_wrong.printSchema() # Cong diem cac tu dung # df_knowledge_wrong = dyf_phonemic_wrong.toDF() df_knowledge_wrong.cache() df_knowledge_wrong = df_knowledge_wrong.withColumn("knowledge", f.lit(-1)) \ .withColumn("comprehension", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('comprehension'))) \ .withColumn("application", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('application'))) \ .withColumn("analysis", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('analysis'))) \ .withColumn("synthesis", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('synthesis'))) \ .withColumn("evaluation", f.lit(0)) \ .withColumn("date_id", from_unixtime(df_knowledge_wrong['timestart'], 'yyyyMMdd')) # df_knowledge_wrong.printSchema() # df_knowledge_wrong.show() # # dyf_knowledge_wrong = DynamicFrame.fromDF(df_knowledge_wrong, glueContext, "dyf_knowledge_wrong") # # # chon cac truong va kieu du lieu day vao db # applymapping1 = ApplyMapping.apply(frame=dyf_knowledge_wrong, # mappings=[("timestart", "long", "timestart", "long"), # ("name", 'string', 'name', 'string'), # ("student_id", 'int', 'student_id', 'long'), # ("id", "int", "learning_object_id", 'long'), # ("date_id", "string", "date_id", "long"), # ("knowledge", "int", "knowledge", "long"), # ("comprehension", "int", "comprehension", "long"), # ("application", "int", "application", "long"), # ("analysis", "int", "analysis", "long"), # ("synthesis", "int", "synthesis", "long"), # ("evaluation", "int", "evaluation", "long")]) # resolvechoice1 = ResolveChoice.apply(frame=applymapping1, choice="make_cols", # transformation_ctx="resolvechoice1") # dropnullfields1 = DropNullFields.apply(frame=resolvechoice1, transformation_ctx="dropnullfields1") # # datasink6 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields1, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "t_temp_right_learning_object_phonetic", # "database": "dts_odin", # "postactions": 
""" call proc_knowledge_ngu_am_top_result_ai () """ # }, # redshift_tmp_dir="s3n://dts-odin/temp1/", # transformation_ctx="datasink5") ### Luu bang mapping_lo_student_history df_knowledge_right = df_knowledge_right.groupby('student_id', 'date_id', 'learning_object_id').agg( f.count('knowledge').alias("count_plus"), f.sum('knowledge').alias("knowledge_plus"), f.sum('comprehension').alias("comprehension_plus"), f.sum('application').alias("application_plus"), f.sum('analysis').alias("analysis_plus"), f.sum('synthesis').alias("synthesis_plus"), f.sum('evaluation').alias("evaluation_plus")) df_knowledge_right = df_knowledge_right.where('student_id is not null') df_knowledge_wrong = df_knowledge_wrong.groupby('student_id', 'date_id', 'learning_object_id').agg( f.count('knowledge').alias("count_minus"), f.sum('knowledge').alias("knowledge_minus"), f.sum('comprehension').alias("comprehension_minus"), f.sum('application').alias("application_minus"), f.sum('analysis').alias("analysis_minus"), f.sum('synthesis').alias("synthesis_minus"), f.sum('evaluation').alias("evaluation_minus")) \ .withColumnRenamed('student_id', 'student_id_wrong') \ .withColumnRenamed('date_id', 'date_id_wrong') \ .withColumnRenamed('learning_object_id', 'learning_object_id_wrong') df_knowledge_wrong = df_knowledge_wrong.where('student_id_wrong is not null') df_knowledge = df_knowledge_right.join(df_knowledge_wrong, ( df_knowledge_right['student_id'] == df_knowledge_wrong['student_id_wrong']) & ( df_knowledge_right['date_id'] == df_knowledge_wrong['date_id_wrong']) & ( df_knowledge_right['learning_object_id'] == df_knowledge_wrong['learning_object_id_wrong']), 'outer') df_knowledge = df_knowledge.withColumn("user_id", check_data_null(df_knowledge.student_id, df_knowledge.student_id_wrong)) \ .withColumn("learning_object_id", check_data_null(df_knowledge.learning_object_id, df_knowledge.learning_object_id_wrong)) \ .withColumn("created_date_id", check_data_null(df_knowledge.date_id, df_knowledge.date_id_wrong)) \ .withColumn("source_system", f.lit('top_result_ai_phonetic')) \ .withColumn("lu_id", f.lit(0)) dyf_knowledge = DynamicFrame.fromDF(df_knowledge, glueContext, "df_knowledge") # dyf_knowledge.printSchema() dyf_knowledge.printSchema() dyf_knowledge.show() # dyf_knowledge = DynamicFrame.fromDF(dyf_knowledge, glueContext, "dyf_knowledge") # chon cac truong va kieu du lieu day vao db applymapping = ApplyMapping.apply(frame=dyf_knowledge, mappings=[("user_id", 'string', 'student_id', 'long'), ("learning_object_id", "string", "learning_object_id", "long"), # ("knowledge", "int", "knowledge", "long"), # ("comprehension", "int", "comprehension", "long"), # ("application", "int", "application", "long"), # ("analysis", "int", "analysis", "long"), # ("synthesis", "int", "synthesis", "long"), # ("evaluation", "int", "evaluation", "long"), ("knowledge_plus", "long", "knowledge_plus", "long"), ("comprehension_plus", "long", "comprehension_plus", "long"), ("application_plus", "long", "application_plus", "long"), ("analysis_plus", "long", "analysis_plus", "long"), ("synthesis_plus", "long", "synthesis_plus", "long"), ("evaluation_plus", "long", "evaluation_plus", "long"), ("knowledge_minus", "long", "knowledge_minus", "long"), ("comprehension_minus", "long", "comprehension_minus", "long"), ("application_minus", "long", "application_minus", "long"), ("analysis_minus", "long", "analysis_minus", "long"), ("synthesis_minus", "long", "synthesis_minus", "long"), ("evaluation_minus", "long", "evaluation_minus", "long"), ("count_plus", "long", 
"plus_number", "long"), ("count_minus", "long", "minus_number", "long"), # ("lo_type", "string", "lo_type", "long"), ("source_system", "string", "source_system", "string"), ("created_date_id", "string", "created_date_id", "long"), ("lu_id", "int", "lu_type", "long") # ("student_level", "string", "student_level", "string"), # ("advisor_id", "string", "advisor_id", "long"), # ("package_code", "string", "package_code", "string") ]) resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols", transformation_ctx="resolvechoice") dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields") print('START WRITE TO S3-------------------------') datasink6 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields, connection_type="s3", connection_options={ "path": "s3://dtsodin/nvn_knowledge/mapping_lo_student_history_v2/", "partitionKeys": ["created_date_id", "source_system"]}, format="parquet", transformation_ctx="datasink6") print('END WRITE TO S3-------------------------') # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "mapping_lo_student_history", # "database": "dts_odin" # }, # redshift_tmp_dir="s3n://dts-odin/temp1/top_question_attempt/", # transformation_ctx="datasink5") ### END Luu bang mapping_lo_student_history # END Tru diem cac tu sai # lay max _key tren datasource datasource = dyf_top_quiz_attempts.toDF() flag = datasource.agg({"_key": "max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') # ghi de flag moi vao s3 df.write.parquet("s3a://dtsodin/flag/flag_knowledge_ngu_am_top_ai", mode="overwrite") # xoa cache df_study.unpersist() df_knowledge_right.unpersist() # df_knowledge_right.unpersist() except Exception as e: print("###################### Exception ##########################") print(e)
df.show() df.count() df.describe().show() pd.options.display.html.table_schema = True df.describe().toPandas() df.select("dept_name").describe().show() # The count in the describe method is a count of non-missing values. df.describe().show() df.count() df.head(5) rows = df.head(5) type(rows) rows[0][0] rows[0]['dept_division'] df.take(5) df.show(10) df.describe("dept_division").show() from pyspark.sql.functions import count, countDistinct df.select(count("dept_division"), countDistinct("dept_division")).show() df.select("dept_name").show(10) df.select("dept_name").distinct().show()
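# Since describe()'s count only covers non-missing values, the gap to the total
# row count is the number of missing values. A minimal sketch for one column,
# reusing the same df and dept_division column as above:
from pyspark.sql.functions import col, count, sum as sum_, when

df.select(
    count(col("dept_division")).alias("non_missing"),
    sum_(when(col("dept_division").isNull(), 1).otherwise(0)).alias("missing"),
).show()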
def describe_1d(df, column, nrows, lookup_config=None): column_type = df.select(column).dtypes[0][1] # TODO: think about implementing analysis for complex # data types: if ("array" in column_type) or ("stuct" in column_type) or ("map" in column_type): raise NotImplementedError( "Column {c} is of type {t} and cannot be analyzed".format( c=column, t=column_type)) distinct_count = df.select(column).agg( countDistinct(col(column)).alias("distinct_count")).toPandas() non_nan_count = df.select(column).na.drop().select( count(col(column)).alias("count")).toPandas() results_data = pd.concat([distinct_count, non_nan_count], axis=1) results_data["p_unique"] = results_data["distinct_count"] / float( results_data["count"]) results_data["is_unique"] = results_data["distinct_count"] == nrows results_data["n_missing"] = nrows - results_data["count"] results_data["p_missing"] = results_data["n_missing"] / float(nrows) results_data["p_infinite"] = 0 results_data["n_infinite"] = 0 result = results_data.ix[0].copy() result["memorysize"] = 0 result.name = column if result["distinct_count"] <= 1: result = result.append(describe_constant_1d(df, column)) elif column_type in {"tinyint", "smallint", "int", "bigint"}: result = result.append( describe_integer_1d(df, column, result, nrows)) elif column_type in {"float", "double", "decimal"}: result = result.append(describe_float_1d(df, column, result, nrows)) elif column_type in {"date", "timestamp"}: result = result.append(describe_date_1d(df, column)) elif result["is_unique"] == True: result = result.append(describe_unique_1d(df, column)) else: result = result.append(describe_categorical_1d(df, column)) # Fix to also count MISSING value in the distict_count field: if result["n_missing"] > 0: result["distinct_count"] = result["distinct_count"] + 1 # TODO: check whether it is worth it to # implement the "real" mode: if (result["count"] > result["distinct_count"] > 1): try: result["mode"] = result["top"] except KeyError: result["mode"] = 0 else: try: result["mode"] = result["value_counts"].index[0] except KeyError: result["mode"] = 0 # If and IndexError happens, # it is because all column are NULLs: except IndexError: result["mode"] = "MISSING" if lookup_config: lookup_object = lookup_config['object'] col_name_in_db = lookup_config[ 'col_name_in_db'] if 'col_name_in_db' in lookup_config else None try: matched, unmatched = lookup_object.lookup( df.select(column), col_name_in_db) result['lookedup_values'] = str(matched.count()) + "/" + str( df.select(column).count()) except: result['lookedup_values'] = 'FAILED' else: result['lookedup_values'] = '' return result
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import count

# Python 2-only encoding workaround; not needed (and not available) on Python 3.
# reload(sys)
# sys.setdefaultencoding('utf8')

spark = SparkSession \
    .builder \
    .appName("p3b") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

df_escuelas = spark.read.csv("hdfs://localhost/data/escuelasPR.csv")

df_count = df_escuelas.filter(df_escuelas._c0 == "Arecibo").groupBy(
    "_c1", "_c2").agg(count("*"))

df_escuelas.show()

df_count = df_count.toDF("Distrito", "Ciudad", "Count")
df_count.show()
def main(input1,input2,input3,input4,input5,input6,output): # main logic starts here business = spark.read.json(input1) bus = business.select(business['business_id'],business['name'],business['latitude'],business['longitude'], business['categories'], business['stars'], business['review_count']).filter(business['categories'].contains("Restaurants,")).filter(business['city'].contains("Toronto")) review = spark.read.json(input2) review_final = review.select(review['business_id'].alias("bus_id_rev"), review['user_id'], review['user_id'], review['stars'], review['text']) rev = review.select(review['business_id'].alias("bus_id"), review['text']) bus_rev = bus.join(rev, rev['bus_id']==bus['business_id']).filter(bus['business_id']=='m2xeKBhS0szlm7xfU5b8ew') bus_count = bus_rev.select(functions.count(bus_rev['business_id'])).collect() food = spark.read.csv(input3, header = True, schema = food_schema) fd = food.select(food['business_id'].alias("bus_id_fd"), food['text'], food['food'].alias("food_rating"), (functions.regexp_extract(food['prob1'], '(.)(\d+.\d+)(.)', 2)).alias("f_prob1"), (functions.regexp_extract(food['prob2'], '(.)(\d+.\d+)(.)', 2)).alias("f_prob2"), (functions.regexp_extract(food['prob3'], '(.)(\d+.\d+)(.)', 2)).alias("f_prob3")) bus_fd = bus.join(fd, fd['bus_id_fd']==bus['business_id']) fd_business1 = bus_fd.select(bus_fd['bus_id_fd'], (bus_fd['food_rating']-bus_fd['food_rating']+1).alias("f_positive"), (bus_fd['food_rating']-bus_fd['food_rating']+0).alias("f_neutral"), (bus_fd['food_rating']-bus_fd['food_rating']+0).alias("f_negative")).filter(bus_fd['food_rating']==1) fd_business2 = bus_fd.select(bus_fd['bus_id_fd'], (bus_fd['food_rating']-bus_fd['food_rating']+0).alias("f_positive"), (bus_fd['food_rating']-bus_fd['food_rating']+1).alias("f_neutral"), (bus_fd['food_rating']-bus_fd['food_rating']+0).alias("f_negative")).filter(bus_fd['food_rating']==2) fd_business3 = bus_fd.select(bus_fd['bus_id_fd'], (bus_fd['food_rating']-bus_fd['food_rating']+0).alias("f_positive"), (bus_fd['food_rating']-bus_fd['food_rating']+0).alias("f_neutral"), (bus_fd['food_rating']-bus_fd['food_rating']+1).alias("f_negative")).filter(bus_fd['food_rating']==3) fd_bus = fd_business1.unionAll(fd_business2) fd_business = fd_bus.unionAll(fd_business3) fd_group = fd_business.groupby(fd_business['bus_id_fd']).agg(functions.sum(fd_business['f_positive']).alias("f_positive"), functions.sum(fd_business['f_neutral']).alias("f_neutral"), functions.sum(fd_business['f_negative']).alias("f_negative")) fd_count = fd_group.select(functions.count(fd_group['bus_id_fd'])).collect() fd_c1 = fd_group.select(functions.sum(fd_group['f_positive'])).collect() fd_c2 = fd_group.select(functions.sum(fd_group['f_neutral'])).collect() fd_c3 = fd_group.select(functions.sum(fd_group['f_negative'])).collect() price = spark.read.csv(input5, header = True, schema = price_schema) pr = price.select(price['business_id'].alias("bus_id_pr"), price['text'], price['price'].alias("price_rating"), (functions.regexp_extract(price['p_prob1'], '(.)(\d+.\d+)(.)', 2)).alias("p_prob1"), (functions.regexp_extract(price['p_prob2'], '(.)(\d+.\d+)(.)', 2)).alias("p_prob2"), (functions.regexp_extract(price['p_prob3'], '(.)(\d+.\d+)(.)', 2)).alias("p_prob3")) bus_pr = bus.join(pr, pr['bus_id_pr']==bus['business_id']) pr_business1 = bus_pr.select(bus_pr['bus_id_pr'], (bus_pr['price_rating']-bus_pr['price_rating']+1).alias("p_positive"), (bus_pr['price_rating']-bus_pr['price_rating']+0).alias("p_neutral"), 
(bus_pr['price_rating']-bus_pr['price_rating']+0).alias("p_negative")).filter(bus_pr['price_rating']==1) pr_business2 = bus_pr.select(bus_pr['bus_id_pr'], (bus_pr['price_rating']-bus_pr['price_rating']+0).alias("p_positive"), (bus_pr['price_rating']-bus_pr['price_rating']+1).alias("p_neutral"), (bus_pr['price_rating']-bus_pr['price_rating']+0).alias("p_negative")).filter(bus_pr['price_rating']==2) pr_business3 = bus_pr.select(bus_pr['bus_id_pr'], (bus_pr['price_rating']-bus_pr['price_rating']+0).alias("p_positive"), (bus_pr['price_rating']-bus_pr['price_rating']+0).alias("p_neutral"), (bus_pr['price_rating']-bus_pr['price_rating']+1).alias("p_negative")).filter(bus_pr['price_rating']==3) pr_bus = pr_business1.unionAll(pr_business2) pr_business = pr_bus.unionAll(pr_business3) pr_group = pr_business.groupby(pr_business['bus_id_pr']).agg(functions.sum(pr_business['p_positive']).alias("p_positive"), functions.sum(pr_business['p_neutral']).alias("p_neutral"), functions.sum(pr_business['p_negative']).alias("p_negative")) pr_count = pr_group.select(functions.count(pr_group['bus_id_pr'])).collect() pr_c1 = pr_group.select(functions.sum(pr_group['p_positive'])).collect() pr_c2 = pr_group.select(functions.sum(pr_group['p_neutral'])).collect() pr_c3 = pr_group.select(functions.sum(pr_group['p_negative'])).collect() service = spark.read.csv(input4, header = True, schema = service_schema) sr = service.select(service['business_id'].alias("bus_id_sr"), service['text'], service['service'].alias("service_rating"), (functions.regexp_extract(service['s_prob1'], '(.)(\d+.\d+)(.)', 2)).alias("s_prob1"), (functions.regexp_extract(service['s_prob2'], '(.)(\d+.\d+)(.)', 2)).alias("s_prob2"), (functions.regexp_extract(service['s_prob3'], '(.)(\d+.\d+)(.)', 2)).alias("s_prob3")) bus_sr = bus.join(sr, sr['bus_id_sr']==bus['business_id']) sr_business1 = bus_sr.select(bus_sr['bus_id_sr'], (bus_sr['service_rating']-bus_sr['service_rating']+1).alias("s_positive"), (bus_sr['service_rating']-bus_sr['service_rating']+0).alias("s_neutral"), (bus_sr['service_rating']-bus_sr['service_rating']+0).alias("s_negative")).filter(bus_sr['service_rating']==1) sr_business2 = bus_sr.select(bus_sr['bus_id_sr'], (bus_sr['service_rating']-bus_sr['service_rating']+0).alias("s_positive"), (bus_sr['service_rating']-bus_sr['service_rating']+1).alias("s_neutral"), (bus_sr['service_rating']-bus_sr['service_rating']+0).alias("s_negative")).filter(bus_sr['service_rating']==2) sr_business3 = bus_sr.select(bus_sr['bus_id_sr'], (bus_sr['service_rating']-bus_sr['service_rating']+0).alias("s_positive"), (bus_sr['service_rating']-bus_sr['service_rating']+0).alias("s_neutral"), (bus_sr['service_rating']-bus_sr['service_rating']+1).alias("s_negative")).filter(bus_sr['service_rating']==3) sr_bus = sr_business1.unionAll(sr_business2) sr_business = sr_bus.unionAll(sr_business3) sr_group = sr_business.groupby(sr_business['bus_id_sr']).agg(functions.sum(sr_business['s_positive']).alias("s_positive"), functions.sum(sr_business['s_neutral']).alias("s_neutral"), functions.sum(sr_business['s_negative']).alias("s_negative")) sr_count = sr_group.select(functions.count(sr_group['bus_id_sr'])).collect() sr_c1 = sr_group.select(functions.sum(sr_group['s_positive'])).collect() sr_c2 = sr_group.select(functions.sum(sr_group['s_neutral'])).collect() sr_c3 = sr_group.select(functions.sum(sr_group['s_negative'])).collect() bus_fd_pr= bus.join(fd_group, fd_group['bus_id_fd']==bus['business_id']) bus_fd_final = 
bus_fd_pr.select(bus_fd_pr['business_id'],bus_fd_pr['name'],bus_fd_pr['latitude'],bus_fd_pr['longitude'], bus_fd_pr['categories'], bus_fd_pr['stars'], bus_fd_pr['review_count'],bus_fd_pr['f_positive'],bus_fd_pr['f_neutral'],bus_fd_pr['f_negative']) bus_pr_fd = bus_fd_final.join(pr_group, pr_group['bus_id_pr']==bus_fd_pr['business_id']) bus_fd_pr_final = bus_pr_fd.select(bus_pr_fd['business_id'],bus_pr_fd['name'],bus_pr_fd['latitude'],bus_pr_fd['longitude'], bus_pr_fd['categories'], bus_pr_fd['stars'], bus_pr_fd['review_count'],bus_pr_fd['f_positive'],bus_pr_fd['f_neutral'],bus_pr_fd['f_negative'],bus_pr_fd['p_positive'],bus_pr_fd['p_neutral'],bus_pr_fd['p_negative']) bus_pr_fd_sr = bus_fd_pr_final.join(sr_group, sr_group['bus_id_sr']==bus_fd_pr_final['business_id']) bus_fd_pr_sr_final = bus_pr_fd_sr.select(bus_pr_fd_sr['business_id'],bus_pr_fd_sr['name'],bus_pr_fd_sr['latitude'],bus_pr_fd_sr['longitude'], bus_pr_fd_sr['categories'], bus_pr_fd_sr['stars'], bus_pr_fd_sr['review_count'],bus_pr_fd_sr['f_positive'],bus_pr_fd_sr['f_neutral'],bus_pr_fd_sr['f_negative'],bus_pr_fd_sr['p_positive'],bus_pr_fd_sr['p_neutral'],bus_pr_fd_sr['p_negative'],bus_pr_fd_sr['s_positive'],bus_pr_fd_sr['s_neutral'],bus_pr_fd_sr['s_negative']) vader = spark.read.json(input6) vd = vader.select(vader['id'], (vader['composite']).alias("v_composite"), (vader['positive']).alias("v_positive"), (vader['neutral']).alias("v_neutral"), (vader['negative']).alias("v_negative")) vd_group = vd.groupby(vader['id']).agg(functions.round(functions.sum(vd['v_composite'])/functions.count(vd['id']),2).alias("v_composite"), functions.round(functions.sum(vd['v_positive'])/functions.count(vd['id']),2).alias("v_positive"),functions.round(functions.sum(vd['v_neutral'])/functions.count(vd['id']),2).alias("v_neutral"), functions.round(functions.sum(vd['v_negative'])/functions.count(vd['id']),2).alias("v_negative")) bus_pr_fd_sr_vd = bus_fd_pr_sr_final.join(vd_group, vd_group['id']==bus_fd_pr_sr_final['business_id']) bus_fd_pr_sr_vd_final = bus_pr_fd_sr_vd.select(bus_pr_fd_sr_vd['business_id'],bus_pr_fd_sr_vd['name'],bus_pr_fd_sr_vd['latitude'],bus_pr_fd_sr_vd['longitude'], bus_pr_fd_sr_vd['categories'], bus_pr_fd_sr_vd['stars'], bus_pr_fd_sr_vd['review_count'],bus_pr_fd_sr_vd['f_positive'],bus_pr_fd_sr_vd['f_neutral'],bus_pr_fd_sr_vd['f_negative'],bus_pr_fd_sr_vd['p_positive'],bus_pr_fd_sr_vd['p_neutral'],bus_pr_fd_sr_vd['p_negative'],bus_pr_fd_sr_vd['s_positive'],bus_pr_fd_sr_vd['s_neutral'],bus_pr_fd_sr_vd['s_negative'],bus_pr_fd_sr_vd['v_composite'],bus_pr_fd_sr_vd['v_positive'],bus_pr_fd_sr_vd['v_neutral'],bus_pr_fd_sr_vd['v_negative']) vd_c3 = bus_fd_pr_sr_vd_final.select(functions.count(bus_fd_pr_sr_final['business_id'])).collect() print(bus_fd_pr_sr_vd_final.show(10))
df = spark.read.json("data/purchases.json") # Basic Operations df.printSchema() df.describe().show() df.show(2) print "Num of records:", df.count() print "" print "Answers: Juan David Botero" print "" # 1. Top 10 most purchased products print "1. Top 10 most purchased products:" df.groupBy(df.product_id, df.item_type).agg( sf.count(df.product_id).alias("top_items")).orderBy( "top_items", ascending=False).show(10) # Top 10 by product_id print "Top 10 by product_id:" df.groupBy(df.product_id).agg(sf.count( df.product_id).alias("top_items")).orderBy("top_items", ascending=False).show(10) # Top by item_type print "Top by item_type:" df.groupBy(df.item_type).agg(sf.count( df.item_type).alias("top_item_type")).orderBy("top_item_type", ascending=False).show() # 2. Purchase percentage of each product type (item_type)
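# A hedged sketch for item 2, reusing the df and sf alias from above:
# purchases per item_type as a percentage of all purchases.
total_purchases = df.count()
df.groupBy(df.item_type).agg(
    (sf.count(df.item_type) / total_purchases * 100).alias("purchase_pct")).orderBy(
        "purchase_pct", ascending=False).show()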
from pyspark.sql import functions as f
from pyspark.sql.types import IntegerType

def groupByUserId(df):
    # Per-user rollup: tweet count, mean sentiment, display name, and mean follower count (as int).
    return df.groupBy('user_id')\
        .agg(f.count('text_lower').alias('tweet_count'), f.avg('sentiment').alias('sentiment'), \
             f.first('user_name').alias('user_name'), f.avg('followers_count').cast(IntegerType()).alias('followers_count'))
def doRender(self,handlerId): g = self.entity width = int(self.getPreferredOutputWidth() - 10 ) height = int(self.getPreferredOutputHeight() - 10 ) if handlerId == "graphMap": graphNodesJson="{" for r in g.vertices.rdd.map(lambda row: """"{0}":{{"id":"{0}","name":"{1}","latitude":{2},"longitude":{3}}}""" .format(row.id, row.name.encode("ascii","ignore").decode("ascii"),0.0 if row.latitude is None else row.latitude,0.0 if row.longitude is None else row.longitude)).collect(): graphNodesJson+=("," if len(graphNodesJson)>1 else "") + str(r) graphNodesJson+="}" graphLinksJson = str(g.edges.select("src","dst").groupBy("src","dst").agg(F.count("src").alias("count")).toJSON().map(lambda j: yaml.safe_load(j)).collect()) myLogger.debug("graphMap - nodes: {0}".format(graphNodesJson)) myLogger.debug("graphMap - links: {0}".format(graphLinksJson)) self._addScriptElement("https://d3js.org/d3.v3.js", checkJSVar="d3", callback=self.renderTemplate("graphMap.js", graphNodesJson=graphNodesJson, graphLinksJson=graphLinksJson, preferredWidth=width, preferredHeight=height)) self._addHTMLTemplate("graphMap.html") elif handlerId == "graphTree": def expand(values, visited, level): results=[] if values is not None and level < maxDepth: for v in values: if v not in visited and len(results)<maxChildren: visited[v]=True results.append({ "name": str(v), "children": {}}) for item in results: nextVisited = {} nextVisited.update(visited) item["children"]=expand(dic.get(item["name"]), nextVisited, level+1) return results ar = g.edges.select("src","dst").rdd.map(lambda row: (row[0],[row[1]]))\ .reduceByKey(lambda d1,d2: d1+d2).map(lambda row: (row[0], list(set(row[1]))))\ .collect() dic = {item[0] : item[1] for item in ar} maxDepth = self.options.get("maxDepth", 5) maxChildren = self.options.get("maxChildren", 10) root = self.options.get("root") rootNode = ar[0] if root: def findRoot(ar): for a in ar: if a[0]==root: return a rootNode = findRoot(ar) if not rootNode: self._addHTML("<p>Root node not found!</p>") else: res = { "name": str(rootNode[0]), "children":expand(dic[ar[0][0]], {rootNode[0]:True}, 1)} tree = json.dumps(res) myLogger.debug("graphTree - tree: {0}".format(res)) #if user specified root, then only send back the json tree if root: self.addProfilingTime = False print(tree) else: nodes = g.vertices.select('id').orderBy('id').rdd.map(lambda r: r[0]).collect() self._addScriptElement("https://d3js.org/d3.v3.js", checkJSVar="d3", callback=self.renderTemplate("graphTree.js", root=str(rootNode[0]), tree=tree, preferredWidth=width, preferredHeight=height)) self._addHTMLTemplate("graph.html", root=str(rootNode[0]), nodes=nodes, maxDepth=maxDepth, maxChildren=maxChildren, handlerId=handlerId) else: # force-directed graph maxEdges = self.options.get("maxEdges", 100) cols = [g.edges.columns[i] for i in range(len(g.edges.columns)) if g.edges.columns[i] not in ['src', 'dst']] edges = g.edges.toPandas()[:maxEdges].to_json(orient='records') graph = json.dumps(edges) isupdate = self.options.get("isupdate") cols.sort() colorBy = self.options.get("colorBy", cols[0] if len(cols) > 0 else "") myLogger.debug("graphDirected - edges: {0}".format(edges)) #if user specified update, then only send back the json graph if isupdate: self.addProfilingTime = False print(graph) else: self._addScriptElement("https://d3js.org/d3.v3.js", checkJSVar="d3", callback=self.renderTemplate("graphDirected.js", graph=graph, preferredWidth=width, preferredHeight=height, colorBy=colorBy)) self._addHTMLTemplate("graph.html", maxEdges=maxEdges, 
handlerId=handlerId, cols=cols, colorBy=colorBy)
ranks = tripGraph.pageRank(resetProbability=0.15, maxIter=5) display(ranks.vertices.orderBy(ranks.vertices.pagerank.desc()).limit(20)) # COMMAND ---------- # MAGIC %md ## Most popular flights (single city hops) # MAGIC Using the `tripGraph`, we can quickly determine what are the most popular single city hop flights # COMMAND ---------- # Determine the most popular flights (single city hops) import pyspark.sql.functions as func topTrips = tripGraph \ .edges \ .groupBy("src", "dst") \ .agg(func.count("delay").alias("trips")) # COMMAND ---------- # Show the top 20 most popular flights (single city hops) display(topTrips.orderBy(topTrips.trips.desc()).limit(20)) # COMMAND ---------- # MAGIC %md ## Top Transfer Cities # MAGIC Many airports are used as transfer points instead of the final Destination. An easy way to calculate this is by calculating the ratio of inDegree (the number of flights to the airport) / outDegree (the number of flights leaving the airport). Values close to 1 may indicate many transfers, whereas values < 1 indicate many outgoing flights and > 1 indicate many incoming flights. Note, this is a simple calculation that does not take into account of timing or scheduling of flights, just the overall aggregate number within the dataset. # COMMAND ---------- # Calculate the inDeg (flights into the airport) and outDeg (flights leaving the airport) inDeg = tripGraph.inDegrees
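# A hedged continuation sketch of the ratio described above (assumed, not the
# notebook's own code): join flights into each airport with flights out of it
# and compute inDegree / outDegree.
outDeg = tripGraph.outDegrees
degreeRatio = inDeg.join(outDeg, "id") \
  .selectExpr("id", "double(inDegree) / double(outDegree) as degreeRatio")
display(degreeRatio.orderBy(degreeRatio.degreeRatio.desc()).limit(10))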
flights \
  .withColumn("flight_code", concat("carrier", "flight")) \
  .show()

# `agg()` runs aggregations using specific
# expressions

# the agg() statement lets you create an aggregated DataFrame

# import and use aggregation functions such as `count()`,
# `countDistinct()`, `sum()`, and `mean()`:

from pyspark.sql.functions import count, countDistinct

flights.agg(count("*")).show()

flights.agg(countDistinct("carrier")).show()

# use the Column method `alias()`
# to name the resulting column:

flights \
  .agg(countDistinct("carrier").alias("num_carriers")) \
  .show()

# `groupBy()` groups data by the specified columns

# aggregations can be computed per group:

from pyspark.sql.functions import mean
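# A minimal sketch of a grouped aggregation, assuming the flights DataFrame has
# origin and dep_delay columns (assumed names, not shown above):
flights \
  .groupBy("origin") \
  .agg(count("*").alias("num_flights"), mean("dep_delay").alias("mean_dep_delay")) \
  .show()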
sqlCtx = SQLContext(sc) lines = sc.parallelize(["m1,d1,1", "m1,d2,2", "m2,d1,1", "m2,d2,2"]) record = lines.map(lambda line: line.split(",")).map( lambda columns: Row(machine=columns[0], domain=columns[1], request=columns[2])) recordSchema = sqlCtx.createDataFrame(record) recordSchema.groupBy().agg({"*": "count"}).show() recordSchema.groupBy("machine", recordSchema["domain"]).agg( {"domain": "max", "request": "min"}).show() recordSchema.groupBy("machine", recordSchema.domain).agg(functions.count("*"), functions.max( recordSchema.request), functions.min(recordSchema["request"]), functions.sum(recordSchema["request"]), functions.avg(recordSchema["request"])).show() recordSchema.select(recordSchema.machine, recordSchema.request.cast( "int")).groupBy("machine").count().show() recordSchema.select(recordSchema.machine, recordSchema.request.cast( "int").alias("request")).groupBy("machine").max("request").show() recordSchema.select(recordSchema.machine, recordSchema.request.cast( "int").alias("request")).groupBy("machine").min("request").show() recordSchema.select(recordSchema.machine, recordSchema.request.cast( "int").alias("request")).groupBy("machine").sum("request").show() recordSchema.select(recordSchema.machine, recordSchema.request.cast(
def doRender(self,handlerId): g=self.entity if ( handlerId == "nodeLinkGraph"): import json ar = g.edges.select("src","dst").map(lambda (s,d): (s,[d]))\ .reduceByKey(lambda d1,d2: d1+d2).map(lambda (src, arTargets): (src, list(set(arTargets))))\ .collect() dic = {item[0] : item[1] for item in ar} limitLevel = self.options.get("limitLevel", 5) limitChildren = self.options.get("limitChildren", 10) def expand(values, visited,level): results=[] if values is not None and level < limitLevel: for v in values: if v not in visited and len(results)<limitChildren: visited[v]=True results.append({ "name": str(v), "children": {}}) for item in results: nextVisited = {} nextVisited.update(visited) item["children"]=expand(dic.get(item["name"]), nextVisited, level+1) return results root = self.options.get("root") rootNode = ar[0] if root: def findRoot(ar): for a in ar: if a[0]==root: return a rootNode = findRoot(ar) if not rootNode: self._addHTML("<p>Can't find the airport</p>"); return; res = { "name": str(rootNode[0]), "children":expand(dic[ar[0][0]], {rootNode[0]:True}, 1)} tree = json.dumps(res) #if user specified root, then only send back the json tree if root: print(tree) return self._addScriptElement("https://d3js.org/d3.v3.js", checkJSVar="d3", callback=self.renderTemplate("nodeLinkGraph.js", root=tree)) self._addHTMLTemplate("nodeLinkGraph.html", root=tree, res=res) else: graphNodesJson="{" for r in g.vertices.map(lambda row: """"{0}":{{"id":"{0}","name":"{1}","latitude":{2},"longitude":{3}}}""" .format(row.id, row.name.encode("ascii","ignore"),0.0 if row.latitude is None else row.latitude,0.0 if row.longitude is None else row.longitude)).collect(): graphNodesJson+=("," if len(graphNodesJson)>1 else "") + str(r) graphNodesJson+="}" graphLinksJson=str(g.edges.select("src","dst").groupBy("src","dst").agg(F.count("src").alias("count")).toJSON().map(lambda j: yaml.safe_load(j)).collect()) self._addScriptElement("https://mbostock.github.io/d3/talk/20111116/d3/d3.js", checkJSVar="d3") self._addScriptElement("https://mbostock.github.io/d3/talk/20111116/d3/d3.geo.js") self._addScriptElement( "https://mbostock.github.io/d3/talk/20111116/d3/d3.geom.js", callback= self.renderTemplate("graphMap.js", graphNodesJson=graphNodesJson, graphLinksJson=graphLinksJson) ) self._addHTMLTemplate("graphMap.html")
df = spark.read.format("csv")\ .option("header", "true")\ .option("inferSchema", "true")\ .load("/data/retail-data/all/*.csv")\ .coalesce(5) df.cache() df.createOrReplaceTempView("dfTable") # COMMAND ---------- from pyspark.sql.functions import count df.select(count("StockCode")).show() # 541909 # COMMAND ---------- from pyspark.sql.functions import countDistinct df.select(countDistinct("StockCode")).show() # 4070 # COMMAND ---------- from pyspark.sql.functions import approx_count_distinct df.select(approx_count_distinct("StockCode", 0.1)).show() # 3364 # COMMAND ---------- from pyspark.sql.functions import first, last df.select(first("StockCode"), last("StockCode")).show()
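# COMMAND ----------

# The 0.1 passed to approx_count_distinct above is the maximum allowed relative
# standard deviation; a tighter bound is slower but lands closer to the exact
# distinct count. A minimal sketch with a tighter bound (assumption: 1% rsd is acceptable):
df.select(approx_count_distinct("StockCode", 0.01)).show()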
from pyspark.sql import SparkSession import pyspark.sql.functions as F spark = SparkSession.builder.appName("SimpleApp").getOrCreate() df = spark.createDataFrame( [(0, 0, 4.0), (0, 1, 2.0), (0, 3, 3.0), (1, 0, 4.0), (1, 1, 1.0), (1, 2, 5.0)], ["user", "item", "rating"] ) df_pandas = df.groupBy("user").agg(F.count(F.col("item"))).toPandas() print(df_pandas) spark.stop()
# COMMAND ---------- # MAGIC %md # MAGIC For your final task, you'll group by word and count the number of times each word occurs. Make sure to return the counts in descending order and to call them `counts`. # MAGIC # MAGIC For this task, you can use: # MAGIC * `DataFrame` operations `groupBy`, `agg`, and `sort` # MAGIC * the `Column` operation `alias` # MAGIC * functions `func.count` and `func.desc`. # COMMAND ---------- # ANSWER wordGroupCount = (wordList .groupBy('word') # group .agg(func.count('word').alias('counts')) # aggregate .sort(func.desc('counts'))) #sort wordGroupCount.take(5) # COMMAND ---------- # TEST Test.assertEquals(tuple(wordGroupCount.first()), (u'ref', 29263), 'incorrect counts.') # COMMAND ---------- # MAGIC %md # MAGIC We could also use SQL to accomplish this counting. # COMMAND ----------
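# A hedged sketch of the SQL route mentioned above; the temp view name
# wordListView is an assumption, not something defined in the notebook.
wordList.createOrReplaceTempView('wordListView')
wordGroupCountSQL = spark.sql(
    "SELECT word, COUNT(*) AS counts FROM wordListView GROUP BY word ORDER BY counts DESC")
wordGroupCountSQL.take(5)

# COMMAND ----------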
# DBTITLE 1,What are the variable types in the "Closed Deals" dataset?
closed_deals.dtypes

# COMMAND ----------

# DBTITLE 1,Statistical summary of the "Closed Deals" dataset before cleaning
display(closed_deals.describe())

# COMMAND ----------

# DBTITLE 1,Missing-value counts in the "Closed Deals" dataset
from pyspark.sql.functions import isnull, when, count, col

aux = []
for c in closed_deals.columns:
  aux.append(count(when(isnull(c), c)).alias(c))

display(closed_deals.select(aux))

# COMMAND ----------

# DBTITLE 1,Handling missing values in the categorical variables
closed_deals = closed_deals.fillna('NA', subset=['business_segment'])
closed_deals = closed_deals.fillna('NA', subset=['lead_type'])
closed_deals = closed_deals.fillna('NA', subset=['lead_behaviour_profile'])
closed_deals = closed_deals.fillna('NA', subset=['has_company'])
closed_deals = closed_deals.fillna('NA', subset=['has_gtin'])
closed_deals = closed_deals.fillna('NA', subset=['average_stock'])
closed_deals = closed_deals.fillna('NA', subset=['business_type'])

# COMMAND ----------
# Rebuild the date fields from the transaction timestamp
tDf = (pDf.withColumn('dateYear', pDf['Tran_time'].substr(1, 4)).withColumn(
    'dateMonth', pDf['Tran_time'].substr(6, 2)).withColumn('dateDay',
                                                           pDf['Tran_time'].substr(9, 2)))
# Drop the column that is no longer used
tDf = tDf.drop(tDf.Tran_time)

# Payment methods
paymentColumn = ['900', '901', '902', '903', '905', '906', '907', '931', '933']
# Count how often each payment method is used per gas station, year, and month
groupColumn = ['Deptno', 'dateYear', 'dateMonth']
deptnoYMaPayment = (tDf.groupBy(groupColumn).agg(
    count(when((col("Payment") == paymentColumn[0]), True)).alias('a900'),
    count(when((col("Payment") == paymentColumn[1]), True)).alias('a901'),
    count(when((col("Payment") == paymentColumn[2]), True)).alias('a902'),
    count(when((col("Payment") == paymentColumn[3]), True)).alias('a903'),
    count(when((col("Payment") == paymentColumn[4]), True)).alias('a905'),
    count(when((col("Payment") == paymentColumn[5]), True)).alias('a906'),
    count(when((col("Payment") == paymentColumn[6]), True)).alias('a907'),
    count(when((col("Payment") == paymentColumn[7]), True)).alias('a931'),
    count(when((col("Payment") == paymentColumn[8]), True)).alias('a933')).orderBy(groupColumn))

# Output path
outputPath = "/home/cpc/data/resultData"
# Output file
outputFile = "stdnoPaymentYearMonthDayCount.json"
# Full path and file name
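# An alternative sketch of the same count: pivot on Payment so each method
# becomes its own column. Unused methods come back as null rather than 0, and
# the columns are named by the raw codes instead of the aXXX aliases above.
deptnoYMPivot = (tDf.groupBy(groupColumn)
                 .pivot("Payment", paymentColumn)
                 .count()
                 .orderBy(groupColumn))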
master_url = open("/root/spark-ec2/cluster-url").read().strip() context = SparkContext(master_url) context.setLogLevel("WARN") sqlcontext = SQLContext(context) def extract_kmers(r): for i in range(0,len(r.seq)-k+1): yield r.seq[i:i+k] for sample_name in samples: sample_filename = "s3n://helgag/ocean_metagenome/overlapped/{sample_name}.csv".format(sample_name=sample_name) customSchema = StructType([ \ StructField("id", StringType(), True), \ StructField("seq", StringType(), True)]) sample = sqlcontext.read.format('com.databricks.spark.csv').options(header='true').load(sample_filename, schema=customSchema).repartition(80) sample = sample.flatMap(extract_kmers).map(Row("kmer")).toDF().groupBy("kmer").agg(count("*")) #Toggle comment the following to export the data sample.registerTempTable(sample_name + "_count") #sample.repartition(1).write.format('com.databricks.spark.csv').options(header='true').save(sample_name+'.csv') #Or this for pushing to s3 #sample.repartition(1).write.format('com.databricks.spark.csv').options(header='true').save('s3n://oceankmers/overlapped/'+sample_name+'.csv') for i, sample_a in enumerate(samples): for j in range(i+1): if i == j: print 0 continue X_sql = """ select '{sample1}' as asample, '{sample2}' as bsample, case when a.count < b.count then a.count else b.count end as minv,
sys.exit(-1) spark = SparkSession\ .builder\ .appName("PythonMnMCount")\ .getOrCreate() mnm_file = sys.argv[1] mnm_df = spark.read.format("csv") \ .option("header", "true") \ .option("inferSchema", "true") \ .load(mnm_file) # aggregate count of all colors and groupBy state and color # orderBy descending order count_mnm_df = mnm_df.select("State", "Color", "Count") \ .groupBy("State", "Color") \ .agg(count("Count") \ .alias("Total")) \ .orderBy("Total", ascending=False) count_mnm_df.show(n=60, truncate=False) print("Total Rows = %d" % (count_mnm_df.count())) # # find the aggregate count for California ca_count_mnm_df = mnm_df.select("*") \ .where(mnm_df.State == 'CA') \ .groupBy("State", "Color") \ .agg(count("Count") \ .alias("Total")) \ .orderBy("Total", ascending=False) ca_count_mnm_df.show(n=10, truncate=False) # stop the SparkSession spark.stop()
def compute_hist(psdf, bins): # 'data' is a Spark DataFrame that selects one column. assert isinstance(bins, (np.ndarray, np.generic)) sdf = psdf._internal.spark_frame scols = [] input_column_names = [] for label in psdf._internal.column_labels: input_column_name = name_like_string(label) input_column_names.append(input_column_name) scols.append(psdf._internal.spark_column_for(label).alias(input_column_name)) sdf = sdf.select(*scols) # 1. Make the bucket output flat to: # +----------+-------+ # |__group_id|buckets| # +----------+-------+ # |0 |0.0 | # |0 |0.0 | # |0 |1.0 | # |0 |2.0 | # |0 |3.0 | # |0 |3.0 | # |1 |0.0 | # |1 |1.0 | # |1 |1.0 | # |1 |2.0 | # |1 |1.0 | # |1 |0.0 | # +----------+-------+ colnames = sdf.columns bucket_names = ["__{}_bucket".format(colname) for colname in colnames] output_df = None for group_id, (colname, bucket_name) in enumerate(zip(colnames, bucket_names)): # creates a Bucketizer to get corresponding bin of each value bucketizer = Bucketizer( splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip" ) bucket_df = bucketizer.transform(sdf) if output_df is None: output_df = bucket_df.select( SF.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket") ) else: output_df = output_df.union( bucket_df.select( SF.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket") ) ) # 2. Calculate the count based on each group and bucket. # +----------+-------+------+ # |__group_id|buckets| count| # +----------+-------+------+ # |0 |0.0 |2 | # |0 |1.0 |1 | # |0 |2.0 |1 | # |0 |3.0 |2 | # |1 |0.0 |2 | # |1 |1.0 |3 | # |1 |2.0 |1 | # +----------+-------+------+ result = ( output_df.groupby("__group_id", "__bucket") .agg(F.count("*").alias("count")) .toPandas() .sort_values(by=["__group_id", "__bucket"]) ) # 3. Fill empty bins and calculate based on each group id. From: # +----------+--------+------+ # |__group_id|__bucket| count| # +----------+--------+------+ # |0 |0.0 |2 | # |0 |1.0 |1 | # |0 |2.0 |1 | # |0 |3.0 |2 | # +----------+--------+------+ # +----------+--------+------+ # |__group_id|__bucket| count| # +----------+--------+------+ # |1 |0.0 |2 | # |1 |1.0 |3 | # |1 |2.0 |1 | # +----------+--------+------+ # # to: # +-----------------+ # |__values1__bucket| # +-----------------+ # |2 | # |1 | # |1 | # |2 | # |0 | # +-----------------+ # +-----------------+ # |__values2__bucket| # +-----------------+ # |2 | # |3 | # |1 | # |0 | # |0 | # +-----------------+ output_series = [] for i, (input_column_name, bucket_name) in enumerate(zip(input_column_names, bucket_names)): current_bucket_result = result[result["__group_id"] == i] # generates a pandas DF with one row for each bin # we need this as some of the bins may be empty indexes = pd.DataFrame({"__bucket": np.arange(0, len(bins) - 1)}) # merges the bins with counts on it and fills remaining ones with zeros pdf = indexes.merge(current_bucket_result, how="left", on=["__bucket"]).fillna(0)[ ["count"] ] pdf.columns = [input_column_name] output_series.append(pdf[input_column_name]) return output_series
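# Hypothetical usage sketch for compute_hist (the DataFrame and bin edges below
# are illustrative, not from the source): it expects a pandas-on-Spark DataFrame
# and a numpy array of bin edges, and returns one pandas Series of counts per column.
import numpy as np
import pyspark.pandas as ps

psdf_demo = ps.DataFrame({"values1": [0.1, 0.4, 1.2, 2.5, 3.0, 3.2],
                          "values2": [0.0, 1.1, 1.3, 2.2, 1.9, 0.4]})
bin_edges = np.linspace(0, 4, 5)                 # four equal-width buckets
hist_series = compute_hist(psdf_demo, bin_edges)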
def describe_1d(df, column, nrows, lookup_config=None): column_type = df.select(column).dtypes[0][1] # TODO: think about implementing analysis for complex # data types: if ("array" in column_type) or ("stuct" in column_type) or ("map" in column_type): raise NotImplementedError("Column {c} is of type {t} and cannot be analyzed".format(c=column, t=column_type)) distinct_count = df.select(column).agg(countDistinct(col(column)).alias("distinct_count")).toPandas() non_nan_count = df.select(column).na.drop().select(count(col(column)).alias("count")).toPandas() results_data = pd.concat([distinct_count, non_nan_count],axis=1) results_data["p_unique"] = results_data["distinct_count"] / float(results_data["count"]) results_data["is_unique"] = results_data["distinct_count"] == nrows results_data["n_missing"] = nrows - results_data["count"] results_data["p_missing"] = results_data["n_missing"] / float(nrows) results_data["p_infinite"] = 0 results_data["n_infinite"] = 0 result = results_data.ix[0].copy() result["memorysize"] = 0 result.name = column if result["distinct_count"] <= 1: result = result.append(describe_constant_1d(df, column)) elif column_type in {"tinyint", "smallint", "int", "bigint"}: result = result.append(describe_integer_1d(df, column, result, nrows)) elif column_type in {"float", "double", "decimal"}: result = result.append(describe_float_1d(df, column, result, nrows)) elif column_type in {"date", "timestamp"}: result = result.append(describe_date_1d(df, column)) elif result["is_unique"] == True: result = result.append(describe_unique_1d(df, column)) else: result = result.append(describe_categorical_1d(df, column)) # Fix to also count MISSING value in the distict_count field: if result["n_missing"] > 0: result["distinct_count"] = result["distinct_count"] + 1 # TODO: check whether it is worth it to # implement the "real" mode: if (result["count"] > result["distinct_count"] > 1): try: result["mode"] = result["top"] except KeyError: result["mode"] = 0 else: try: result["mode"] = result["value_counts"].index[0] except KeyError: result["mode"] = 0 # If and IndexError happens, # it is because all column are NULLs: except IndexError: result["mode"] = "MISSING" if lookup_config: lookup_object = lookup_config['object'] col_name_in_db = lookup_config['col_name_in_db'] if 'col_name_in_db' in lookup_config else None try: matched, unmatched = lookup_object.lookup(df.select(column), col_name_in_db) result['lookedup_values'] = str(matched.count()) + "/" + str(df.select(column).count()) except: result['lookedup_values'] = 'FAILED' else: result['lookedup_values'] = '' return result
def countByWord(spark, df):
    # Alias the aggregated column (not the DataFrame) so the count comes back as 'count'.
    return df.groupBy(WORD_COL).agg(
        F.count(WORD_COL).alias('count')).orderBy(WORD_COL)
def process_data(self): ############################################################################## # DECLARE VARIABLES ############################################################################## dt_range = self.study_dates("2020-07-30") dt = dt_range s1_bucket_name = 'b6-8f-fc-09-0f-db-50-3f-gpsdata' s1_initial_bucket_depth = 'cuebiq/daily-feed/US/' s1_bucket_output = 'cuebiq/daily-feed-reduced/US/' s2_bucket_name = 'b6-8f-fc-09-0f-db-50-3f-gpsdata' s2_initial_bucket_depth = 'cuebiq/daily-feed-reduced/US/' s2_bucket_output = 'cuebiq/processed-data/US/micro-clusters/' anchor_dist = 430 time_thresh = 28800 part_num = 9 gps_schema = StructType([ StructField("utc_timestamp", IntegerType(), True), StructField("device_id", StringType(), True), StructField("os", IntegerType(), True), StructField("latitude", FloatType(), True), StructField("longitude", FloatType(), True), StructField("accuracy", IntegerType(), True), StructField("tz_offset", IntegerType(), True) ]) s2_gps_schema = StructType([ StructField("utc_timestamp", IntegerType(), True), StructField("device_id", StringType(), True), StructField("os", IntegerType(), True), StructField("latitude", FloatType(), True), StructField("longitude", FloatType(), True), StructField("accuracy", IntegerType(), True), StructField("tz_offset", IntegerType(), True), StructField("row_number", IntegerType(), True) ]) ############################################################################## # WINDOWS ############################################################################## w = Window().partitionBy('device_id').orderBy('utc_timestamp') l = Window().partitionBy('device_id', 'lin_grp').orderBy('utc_timestamp') w2 = Window().partitionBy('device_id').orderBy('row_number') ############################################################################## # BEGIN DAILY ITERATION ############################################################################## print("Reading in files for {}".format(str(dt['study_dt'])[:10])) print("s3://{}/{}[{}|{}|{}]/*.gz".format(s1_bucket_name, s1_initial_bucket_depth, dt['s3_before'], dt['s3_study_dt'], dt['s3_after'])) print("") ################################################################################################# # START STEP 1 ################################################################################################# df1 = dataFrameReader \ .options(header = 'false', delimiter = '\t', codec = 'gzip') \ .schema(gps_schema) \ .format("csv") \ .load("/opt/spark/sample_data/daily-feed/US/2020729*/*.csv.gz") #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth + dt['s3_before'] +"/*.gz") # the day before df2 = dataFrameReader \ .options(header = 'false', delimiter = '\t', codec = 'gzip') \ .schema(gps_schema) \ .format("csv") \ .load("/opt/spark/sample_data/daily-feed/US/2020730*/*.csv.gz") #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth + dt['s3_study_dt'] +"/*.gz") # actual study date df3 = dataFrameReader \ .options(header = 'false', delimiter = '\t', codec = 'gzip') \ .schema(gps_schema) \ .format("csv") \ .load("/opt/spark/sample_data/daily-feed/US/2020731*/*.csv.gz") #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth + dt['s3_after'] +"/*.gz") # the day after # Union data from three inputs into 1 dataframe df = df1.union(df2).union(df3) \ .repartition(part_num, 'device_id') del df1 del df2 del df3 ############################################################################## # FILTER INITIAL JUNK RECORDS # Removes duplicated records (based on time and id), poor 
accuracy, bad coordinates, and timestamps outside of study range ############################################################################## df = df.na.drop(subset=['latitude','longitude','tz_offset','accuracy']) \ .filter(((df['accuracy'] >= 5) & (df['accuracy'] <= 65)) \ & ((~(df['latitude'] == 0)) | ~(df['longitude'] == 0)) \ & (df['utc_timestamp'] + df['tz_offset']) \ .between(dt['utc_study_dt'], dt['utc_after'])) \ .dropDuplicates(['utc_timestamp','device_id']) ############################################################################## # EXCESSIVE SPEED REMOVAL ############################################################################## df = df.withColumn('dist_to',distance(df['latitude'], df['longitude'], lead(df['latitude'],1).over(w), \ lead(df['longitude'],1).over(w))) \ .withColumn('sec_to', (lead(df['utc_timestamp'], 1).over(w) - df['utc_timestamp'])) \ .withColumn('speed_to', rate_of_speed(col('dist_to'), col('sec_to'),'hour')) \ .withColumn('dist_from', lag(col('dist_to'), 1).over(w)) \ .withColumn('sec_from', lag(col('sec_to'), 1).over(w)) \ .withColumn('speed_from', lag(col('speed_to'), 1).over(w)) \ .filter(((col('dist_to').isNull()) | (col('dist_from').isNull())) \ | ((((col('speed_from') + col('speed_to')) / 2) <= 90) | ((col('dist_to') >= 150) | (col('dist_from') >= 150))) \ & ((col('speed_from') < 600) & (col('speed_to') < 600)) \ & ((col('speed_from') < 20) | (col('speed_to') < 20))) \ .select('utc_timestamp', 'device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset') ############################################################################## # LINEAR TRAVEL PING REMOVAL # Break pings out into groups of 4 to measure the linear distance ############################################################################## #Assign a record number and linear grouping and lead distance df = df.withColumn('RecordNum',row_number().over(w)) \ .withColumn('lin_grp', py.ceil(row_number().over(w) / 4)) \ .withColumn('dist_to', distance(df['latitude'], df['longitude'], \ lead(df['latitude'],1).over(l), lead(df['longitude'],1).over(l),'meters')) # Create aggregated table for linear groupings expr = [py.min(col('utc_timestamp')).alias('min_utc_timestamp'),py.max(col('utc_timestamp')).alias('max_utc_timestamp'), \ py.count(col('utc_timestamp')).alias('cnt'),py.sum(col('dist_to')).alias('sum_dist'),py.min(col('dist_to')).alias('min_dist')] dfl_grp = df.groupBy('device_id', 'lin_grp').agg(*expr) dfl_grp.createOrReplaceTempView('dfl_grp') df.createOrReplaceTempView('dfl') # Grab just the first and last records in each linear grouping and append aggregated info dfls = spark.sql( "SELECT a.utc_timestamp, a.device_id, a.os, a.latitude, a.longitude, a.accuracy, a.tz_offset, \ a.lin_grp, b.sum_dist, b.min_dist, b.cnt \ FROM dfl as a INNER JOIN dfl_grp as b \ ON a.device_id = b.device_id \ AND a.lin_grp = b.lin_grp \ AND a.utc_timestamp = b.min_utc_timestamp \ UNION ALL \ SELECT a.utc_timestamp, a.device_id, a.os, a.latitude, a.longitude, a.accuracy, a.tz_offset, \ a.lin_grp, b.sum_dist, b.min_dist, b.cnt \ FROM dfl as a INNER JOIN dfl_grp as b \ ON a.device_id = b.device_id \ AND a.lin_grp = b.lin_grp \ AND a.utc_timestamp = b.max_utc_timestamp") # Measure the distance between first and last in each linear grouping and compare to sum distance of all points # Only keep groups that meet criteria for being straight-line df_j = dfls.withColumn('strt_dist', distance(dfls['latitude'],dfls['longitude'], \ lead(dfls['latitude'],1).over(l), \ lead(dfls['longitude'],1).over(l), 
'meters')) \ .withColumn('lin',col('strt_dist') / dfls['sum_dist']) \ .na.drop(subset=['strt_dist']) \ .filter((dfls['min_dist'] > 0) \ & (col('strt_dist').between(150, 2000)) \ & (dfls['cnt'] == 4) \ & (col('lin') >= .99825)) \ .select('device_id','lin_grp', 'lin') # Outer join main dataframe to linears groups to filter non-linear pings df = df.join(df_j, ['device_id','lin_grp'], how='left_outer') \ .filter(col('lin').isNull()) \ .select('utc_timestamp','device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset') del dfl_grp del dfls del df_j ####################################### # CHAIN # Calculating the dynamic chain threshold to find proximate ping relationships ####################################### df = df.withColumn('chain_dist', ((((df['accuracy'] + lead(df['accuracy'],1).over(w)) - 10) * (230 / 120) + 200))) \ .withColumn('chain', when((distance(df['latitude'], df['longitude'], \ lead(df['latitude'],1).over(w), lead(df['longitude'], 1).over(w),'feet')) <= col('chain_dist'), 1) .when((distance(df['latitude'], df['longitude'], \ lag(df['latitude'],1).over(w), lag(df['longitude'], 1).over(w),'feet')) <= lag(col('chain_dist'), 1).over(w), 1)) \ .filter(col('chain') == 1) \ .withColumn('row_number', row_number().over(w)) \ .select('utc_timestamp','device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset','row_number') \ .persist() df \ .repartition(100,'device_id').sortWithinPartitions('device_id','row_number') \ .write \ .csv(path="/opt/spark/sample_data/daily-feed-reduced/"+dt['s3_study_dt'], mode="append", compression="gzip", sep=",") #.csv(path="s3://" + s1_bucket_name + '/' + s1_bucket_output + dt['s3_study_dt'], mode="append", compression="gzip", sep=",") ############################################################################################## # START STEP 2 ############################################################################################## print('Begin micro-clustering') # INITIALIZE ANCHOR TABLE - Create initial anchor start points based on row number = 1 and distance threshold self.df_dist = df.withColumn('tz_timestamp', df['utc_timestamp'] + df['tz_offset']) \ .withColumn('anchor', when(df['row_number'] == 1, col('tz_timestamp')) \ .when(distance(df['latitude'], df['longitude'], \ lag(df['latitude'],1).over(w2),lag(df['longitude'],1).over(w2),'feet') \ >= anchor_dist, col('tz_timestamp')) \ .when(col('tz_timestamp') - lag(col('tz_timestamp'),1).over(w2) >= time_thresh, col('tz_timestamp'))) \ .select('tz_timestamp','device_id','os','latitude','longitude','accuracy','row_number','anchor') \ .repartition(part_num, 'device_id') \ .persist() print('df_dist starting count = {}'.format( self.df_dist.count())) # Materialize table for caching df.unpersist() del df ##################################################################################################### # ITERATE THROUGH DATAFRAME ANCHOR PROCESS - iterations are broken out to speed up checkpointing # Checkpointing is used to chop off the physical plans of the dataframes that grow with each iteration ###################################################################################################### df_anchor1 = self.anchor_func(3, 3) df_anchor2 = self.anchor_func(5, 5) df_anchor3 = self.anchor_func(12, 6) df_anchor4 = self.anchor_func(20, 5) df_anchor5 = self.anchor_func(30, 5) df_anchor6 = self.anchor_func(50, 5) df_anchor7 = self.anchor_func(80, 5, 1000000) df_anchor8 = self.anchor_func(1000, 5, 1000000) 
################################################################################################## # Collect remaining pings to driver for Python analysis print('collect remaining pings') anchor_list = self.df_dist.rdd.map(lambda row: {'timestamp':row[0], 'device_id':row[1], 'latitude':row[3], \ 'longitude':row[4], 'anchor':row[7]}).collect() # Sort elements in list by device_id and timestamp anchor_list.sort(key=operator.itemgetter('device_id', 'timestamp')) # Python analysis on driver of final remaining pings print('iterate through remaining pings on driver') anchor_dr = [] for r in anchor_list: if r['anchor'] is not None: anchor_dr.append(r) else: if anchor_dr[-1]['device_id'] == r['device_id']: if distance_dr(r['latitude'],r['longitude'], \ anchor_dr[-1]['latitude'], \ anchor_dr[-1]['longitude'], 'feet') <= anchor_dist \ & r['timestamp'] - anchor_dr[-1]['timestamp'] < time_thresh: anchor_dr.append({'timestamp':r['timestamp'], 'device_id':r['device_id'], \ 'latitude':anchor_dr[-1]['latitude'], 'longitude':anchor_dr[-1]['longitude'], \ 'anchor':anchor_dr[-1]['anchor']}) else: r['anchor'] = r['timestamp'] anchor_dr.append(r) # Condense result table for dataframe distribution print('generate driver anchor table') new_anchor = [] for r in anchor_dr: new_anchor.append([r['timestamp'], r['device_id'], r['anchor']]) # Bring driver results back into a distributed dataframe and join results print('disperse driver anchor table back to cluster') new_anchor_schema = StructType([ StructField('tz_timestamp', IntegerType(), True), StructField('device_id', StringType(), True), StructField('anchor', IntegerType(), True) ]) df_anchor_dr = spark.createDataFrame(new_anchor,new_anchor_schema) \ .repartition(part_num, 'device_id') # Join remaining anchors to main analysis table self.df_dist = self.df_dist.select('tz_timestamp','device_id','os','latitude','longitude', \ 'accuracy','row_number') \ .join(df_anchor_dr,['tz_timestamp','device_id']) \ # Union all anchor tables together and sort print('finalizing anchor results into central table') df_anchors_fnl = df_anchor1.union(df_anchor2).union(df_anchor3).union(df_anchor4).union(df_anchor5) \ .union(df_anchor6).union(df_anchor7).union(df_anchor8).union(self.df_dist) \ .repartition(part_num,'device_id') \ .persist() self.df_dist.unpersist() ####################################################################################### # Calculate centroids ####################################################################################### print('start calculating centroids') # Get max accuracy value for each micro-cluster and filter clusters with fewer than 2 pings df_anchor_grp = df_anchors_fnl.groupBy('device_id','anchor').agg(*[py.max(col('accuracy')).alias('max_accuracy'), \ py.count(col('tz_timestamp')).alias('cnt')]) \ .withColumn('max_acc_1', col('max_accuracy') + 1) \ .filter(col('cnt') > 1) \ .select('device_id','anchor','max_acc_1','cnt') # Calculate the nominator for each micro-cluster df_anchors_fnl = df_anchors_fnl.join(df_anchor_grp, ['device_id','anchor']) \ .withColumn('nom',col('max_acc_1') - col('accuracy')) df_denom = df_anchors_fnl.groupBy( 'device_id', 'anchor').agg(*[py.sum(col('nom')).alias('denom')]) df_anchors_fnl = df_anchors_fnl.join(df_denom, ['device_id','anchor']) \ .withColumn('weight', df_anchors_fnl['nom'] / df_denom['denom']) \ .withColumn('lat', df_anchors_fnl['latitude'] * col('weight')) \ .withColumn('lon', df_anchors_fnl['longitude'] * col('weight')) expr = [py.sum(col('lat')).alias('new_latitude'), 
py.sum(col('lon')).alias('new_longitude'), \ py.avg(col('latitude')).alias('avg_latitude'), py.avg(col('longitude')).alias('avg_longitude'), \ py.count(col('tz_timestamp')).alias('cluster_png_cnt'), py.first(col('os')).alias('os'), \ py.min(col('tz_timestamp')).alias('start_timestamp'), py.max(col('tz_timestamp')).alias('end_timestamp'), \ py.avg(col('accuracy')).alias('avg_accuracy')] df_micro = df_anchors_fnl.groupBy('device_id','anchor').agg(*expr) \ .withColumn('fnl_lat', (col('new_latitude') * (3/4)) + (col('avg_latitude') * (1/4))) \ .withColumn('fnl_lon', (col('new_longitude') * (3/4)) + (col('avg_longitude') * (1/4))) \ .withColumn('geohash9', geohash_udf_9(col('fnl_lat'), col('fnl_lon'))) \ .withColumn('dwell_seconds', col('end_timestamp') - col('start_timestamp')) \ .withColumn('start_tm', py.from_unixtime(col('start_timestamp'))) \ .withColumn('end_tm', py.from_unixtime(col('end_timestamp'))) \ .filter(col('dwell_seconds') > 1) \ .select('device_id','os','start_tm','end_tm', \ 'dwell_seconds','cluster_png_cnt', col('fnl_lat').alias('latitude'), \ col('fnl_lon').alias('longitude'), 'geohash9', 'avg_accuracy') df_micro \ .repartition(100,'device_id').sortWithinPartitions('device_id','start_tm') \ .write \ .csv(path="/opt/spark/sample_data/processed-data/" + dt['s3_study_dt'], mode="append", compression="gzip", sep=",") #.csv(path="s3://" + s2_bucket_name + '/' + s2_bucket_output + dt['s3_study_dt'], mode="append", compression="gzip", sep=",") df_anchors_fnl.unpersist() return
def doRenderMpld3(self, handlerId, figure, axes, keyFields, keyFieldValues, keyFieldLabels, valueFields, valueFieldValues): allNumericCols = self.getNumericalFieldNames() if len(allNumericCols) == 0: self._addHTML("Unable to find a numerical column in the dataframe") return keyFields = self.options.get("keyFields") valueField = self.options.get("valueFields") if(keyFields==None and valueField==None): keyFields=self.getFirstStringColInfo() valueField=self.getFirstNumericalColInfo() else: keyFields = keyFields.split(',') valueField = valueField.split(',') if(len(valueField) > 1): self._addHTML("You can enter only have one value field for Bar Charts (2-D)"+str(len(valueField))) return keyFields = keyFields[0] valueField=valueField[0] #if(len(valueFields>)): #init fig=figure ax=axes #fig, ax = plt.subplots() #fig = plt.figure() params = plt.gcf() plSize = params.get_size_inches() params.set_size_inches( (plSize[0]*2, plSize[1]*2) ) agg=self.options.get("aggregation") groupByCol=self.options.get("groupByCol") if (agg=="None" or agg==None): colLabel = keyFields y = self.entity.select(valueField).toPandas()[valueField].dropna().tolist() x_intv = np.arange(len(y)) labels = self.entity.select(keyFields).toPandas()[keyFields].dropna().tolist() plt.xticks(x_intv,labels) plt.xlabel(keyFields, fontsize=18) plt.ylabel(valueField, fontsize=18) elif(agg=='AVG'): y1=self.entity.groupBy(keyFields).agg(F.avg(valueField).alias("avg")).toPandas().sort_values(by=keyFields) y=y1["avg"].dropna().tolist() x_intv = np.arange(len(y)) labels=y1[keyFields].dropna().tolist() plt.xticks(x_intv,labels) plt.xlabel(keyFields, fontsize=18) plt.ylabel("Average "+valueField, fontsize=18) elif(agg=='SUM'): y1=self.entity.groupBy(keyFields).agg(F.sum(valueField).alias("sum")).toPandas().sort_values(by=keyFields) y=y1["sum"].dropna().tolist() x_intv = np.arange(len(y)) labels=y1[keyFields].dropna().tolist() plt.xticks(x_intv,labels) plt.xlabel(keyFields, fontsize=18) plt.ylabel("sum "+valueField, fontsize=18) elif(agg=='MAX'): y1=self.entity.groupBy(keyFields).agg(F.max(valueField).alias("max")).toPandas().sort_values(by=keyFields) y=y1["max"].dropna().tolist() x_intv = np.arange(len(y)) labels=y1[keyFields].dropna().tolist() plt.xticks(x_intv,labels) plt.xlabel(keyFields, fontsize=18) plt.ylabel("max "+valueField, fontsize=18) elif(agg=='MIN'): y1=self.entity.groupBy(keyFields).agg(F.min(valueField).alias("min")).toPandas().sort_values(by=keyFields) y=y1["min"].dropna().tolist() x_intv = np.arange(len(y)) labels=y1[keyFields].dropna().tolist() plt.xticks(x_intv,labels) plt.xlabel(keyFields, fontsize=18) plt.ylabel("min "+valueField, fontsize=18) elif(agg=='COUNT'): y1=self.entity.groupBy(keyFields).agg(F.count(valueField).alias("count")).toPandas().sort_values(by=keyFields) y=y1["count"].dropna().tolist() x_intv = np.arange(len(y)) labels=y1[keyFields].dropna().tolist() plt.xticks(x_intv,labels) plt.xlabel(keyFields, fontsize=18) plt.ylabel("count "+valueField, fontsize=18) mpld3.enable_notebook() plt.bar(x_intv,y,color="blue",alpha=0.5) ax_fmt = BarChart(labels) mpld3.plugins.connect(fig, ax_fmt)