def graphframes_pagerank(self, sc, sqlc):
    """ GraphFrame's PageRank implementation """

    from graphframes import GraphFrame  # pylint: disable=import-error

    edge_df = sqlc.read.load(os.path.join(self.args.webgraph, "edges"))
    vertex_df = sqlc.read.load(os.path.join(self.args.webgraph, "vertices"))

    graph = GraphFrame(vertex_df, edge_df)

    withPageRank = graph.pageRank(maxIter=self.args.maxiter)

    final_df = sql(sqlc, """
        SELECT CONCAT(ranks.domain, ' ', ranks.pagerank) r
        FROM ranks
        ORDER BY ranks.pagerank DESC
    """, {"ranks": withPageRank.vertices})

    if self.args.dump:
        final_df.coalesce(1).write.text(
            self.args.dump,
            compression="gzip" if self.args.gzip else "none"
        )
    else:
        print(final_df.rdd.collect())
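# The sql() helper used above is not defined in this snippet. A minimal
# sketch of what it might look like, assuming it exposes each DataFrame in
# the mapping as a temporary view before running the query (this body is a
# hypothetical reconstruction, not the project's actual helper):
def sql(sqlc, query, tables=None):
    for name, df in (tables or {}).items():
        df.createOrReplaceTempView(name)  # make the DataFrame queryable by name
    return sqlc.sql(query)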
motifs.selectExpr("*",
        "to_timestamp(ab.`Start Date`, 'MM/dd/yyyy HH:mm') as abStart",
        "to_timestamp(bc.`Start Date`, 'MM/dd/yyyy HH:mm') as bcStart",
        "to_timestamp(ca.`Start Date`, 'MM/dd/yyyy HH:mm') as caStart")\
    .where("ca.`Bike #` = bc.`Bike #`").where("ab.`Bike #` = bc.`Bike #`")\
    .where("a.id != b.id").where("b.id != c.id")\
    .where("abStart < bcStart").where("bcStart < caStart")\
    .orderBy(expr("cast(caStart as long) - cast(abStart as long)"))\
    .selectExpr("a.id", "b.id", "c.id", "ab.`Start Date`", "ca.`End Date`")\
    .limit(1).show(1, False)

# COMMAND ----------

from pyspark.sql.functions import desc

ranks = stationGraph.pageRank(resetProbability=0.15, maxIter=10)
ranks.vertices.orderBy(desc("pagerank")).select("id", "pagerank").show(10)

# COMMAND ----------

inDeg = stationGraph.inDegrees
inDeg.orderBy(desc("inDegree")).show(5, False)

# COMMAND ----------

outDeg = stationGraph.outDegrees
outDeg.orderBy(desc("outDegree")).show(5, False)
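# COMMAND ----------

# The stationGraph and motifs objects used above are assumed to be built in
# earlier cells. A minimal sketch of that setup (the CSV paths are
# hypothetical; the motif's ab/bc/ca edge names match the filters above):
from graphframes import GraphFrame
from pyspark.sql.functions import expr

bikeStations = spark.read.option("header", True)\
    .csv("/data/bike-data/201508_station_data.csv")
tripData = spark.read.option("header", True)\
    .csv("/data/bike-data/201508_trip_data.csv")

stationVertices = bikeStations.withColumnRenamed("name", "id").distinct()
tripEdges = tripData\
    .withColumnRenamed("Start Station", "src")\
    .withColumnRenamed("End Station", "dst")

stationGraph = GraphFrame(stationVertices, tripEdges)

# Triangle motif: a trip a->b, then b->c, then c->a
motifs = stationGraph.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[ca]->(a)")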
def generateGraphs():
    '''
    This function takes the top routes found by the countRoutes function,
    for the source_ip and dest_ip columns, and creates a graph of each one.
    Ultimately it generates a list of the most significant target and attack
    IPs and ports.

    This function uses the GraphX-backed PageRank implementation exposed by
    GraphFrames to determine the significance of each route.
    '''
    for column_key in GRAPH_DATA.keys():
        print(f'\nGenerating graph for column {column_key}...')

        # Start by concatenating the route counts into one dict object
        routes = {}
        for route_counts in GRAPH_DATA[column_key]:
            for key in route_counts.keys():
                routes[key] = route_counts[key]

        # Now we'll extract each vertex (unique IP address) from the routes,
        # and we'll take the number of connections as the weight for the edge.
        #
        # We'll also have to assign an ID to each IP address and build an array
        # of edges where each route is represented with its corresponding weight
        verts = []
        edges = []
        ids = {}
        vert_id = 1
        for key in routes.keys():
            weight = getWeight(routes[key])  # Get the weight from the dict value
            ips_from_route = (key.split('/')[1], key.split('/')[2])
            for ip in ips_from_route:
                # Create a new ID for the IP address if it hasn't appeared yet
                if ip not in ids:
                    verts.append((vert_id, ip))  # Add it to the vertex array too
                    ids[ip] = vert_id
                    vert_id += 1  # And increment the ID

            # Now take the route and add it to the edges array as a tuple of:
            # (src, dst, weight)
            edges.append((ids[ips_from_route[0]], ids[ips_from_route[1]], weight))

        # Next, we'll convert the verts and edges arrays into Spark DataFrames
        # so that we can load them into GraphFrames
        verts = SPARK.createDataFrame(verts, ['id', 'ip_address'])
        edges = SPARK.createDataFrame(edges, ['src', 'dst', 'count'])

        # Then we'll create the GraphFrame object, look at the edges, verts
        # and degrees, then run PageRank.
        graph = GraphFrame(verts, edges)
        print('Graph vertices:')
        graph.vertices.show()
        print('Graph edges:')
        graph.edges.show()
        print('Graph degrees:')
        graph.degrees.show()

        print('Performing PageRank on the graph:')
        pr = graph.pageRank(resetProbability=0.15, tol=0.01)
        print('Graph PageRank vertices:')
        pr.vertices.show()
        print('Graph PageRank edges:')
        pr.edges.show()
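# getWeight() is defined elsewhere in this project. A minimal sketch,
# assuming the dict value is either a plain count or a collection of
# recorded occurrences (hypothetical reconstruction):
def getWeight(value):
    if isinstance(value, (int, float)):
        return value       # the value is already a connection count
    return len(value)      # otherwise, use the number of recorded occurrences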
deg_table = 'direct-analog-308416:project_data.degree'

logging.info('Writing Degree centrality data to BQ')
deg_centrality.write.format('bigquery')\
    .mode('overwrite')\
    .option('table', deg_table)\
    .save()
logging.info('Done!')

"""
PageRank damping factor = 1 - resetProbability.
The damping factor is the probability that the next click will be through
a link (in the webpage context) rather than a jump to a random page.
"""
print('Attempting pagerank')
results = g.pageRank(resetProbability=0.15, maxIter=1)
print(results)
#results.vertices.select("id", "pagerank").show()

# sort() returns a DataFrame, while show() only prints and returns None,
# so keep the DataFrame and display it separately.
pagerank_res = results.vertices.sort("pagerank", ascending=False)
pagerank_res.show()

# save pagerank results to BQ
pagerank_table = 'direct-analog-308416:project_data.pagerank_results'
logging.info('Writing Pagerank data to BQ')
pagerank_res.write\
    .format('bigquery')\
    .mode('overwrite')\
    .option('table', pagerank_table)\
    .save()
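# Worked example of the relationship noted in the docstring above
# (plain arithmetic, no Spark involved):
reset_probability = 0.15
damping_factor = 1 - reset_probability  # chance the next click follows a link
assert abs(damping_factor - 0.85) < 1e-9
# Note: maxIter=1 above runs a single PageRank iteration; scores usually
# need more iterations (e.g. 10-20) to settle.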
# COMMAND ----------

from pyspark.sql.functions import col, concat_ws, round
from graphframes import GraphFrame

flightVerticesDF = (spark.read
                    .option("header", True)
                    .option("delimiter", "\t")
                    .csv("dbfs:/mnt/training/asa/airport-codes/airport-codes.txt")
                    .withColumnRenamed("IATA", "id"))

flightEdgesDF = (spark.table("Databricks.AirlineFlight")
                 .withColumnRenamed("Origin", "src")
                 .withColumnRenamed("Dest", "dst"))

flightGF = GraphFrame(flightVerticesDF, flightEdgesDF)

pageRankDF = flightGF.pageRank(tol=0.05)

resultsDF = (pageRankDF.vertices
             .select(concat_ws(", ", col("city"), col("state")).alias("Location"),
                     round(col("pagerank"), 1).alias("Rank"))
             .orderBy(col("pagerank").desc()))

display(resultsDF)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Review
# MAGIC **Question:** Which of the following are good applications for Apache Spark? (Select all that apply.)
# MAGIC 0. Querying, exploring, and analyzing very large files and data sets
# MAGIC 0. Joining data lakes
# MAGIC 0. Machine learning and predictive analytics
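# COMMAND ----------

# Note on the pageRank call above: GraphFrames accepts either tol (iterate
# until scores change by less than the tolerance) or maxIter (a fixed number
# of iterations), but not both in one call. A fixed-iteration variant of the
# call above would be (pageRankFixedDF is a hypothetical name):
pageRankFixedDF = flightGF.pageRank(resetProbability=0.15, maxIter=10)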
edges.show(3)

# COMMAND ----------

# Build GraphFrame object
from graphframes import GraphFrame

friendGraph = GraphFrame(vertices, edges)
friendGraph.vertices.show(3)
friendGraph.edges.show(3)

# COMMAND ----------

# Determine the top 5 most important users.
# resetProbability=0.15 is the same value used in the original Google
# search engine; pageRank() returns a new GraphFrame.
from pyspark.sql.functions import desc

ranks = friendGraph.pageRank(resetProbability=0.15, maxIter=10)
ranks.vertices.orderBy(desc("pagerank")).select("id", "pagerank").limit(5).show(5, False)

# COMMAND ----------

# Determine the top 5 users with the highest indegree/outdegree
inDeg = friendGraph.inDegrees
inDeg.orderBy(desc("inDegree")).limit(5).show(5, False)

# Alternative
#outDeg = friendGraph.outDegrees
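# COMMAND ----------

# The vertices and edges DataFrames used above are assumed to be loaded in
# earlier cells. A minimal sketch of the shape GraphFrame expects ("id",
# "src" and "dst" are the required column names; the rows are hypothetical):
vertices = spark.createDataFrame([("alice",), ("bob",), ("carol",)], ["id"])
edges = spark.createDataFrame(
    [("alice", "bob"), ("bob", "carol"), ("carol", "alice")], ["src", "dst"])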
def display_graph(item):
    # Redirect standard out into the handout document
    with io.StringIO() as buf, redirect_stdout(buf):
        item.show()
        redirect_to_handout(buf.getvalue())


doc.show()
"""
Show all motifs which satisfy a->b->a
"""
display_graph(graph.find("(a)-[e]->(b); (b)-[e2]->(a)"))

doc.show()
"""
## Get pagerank using m=0.15 and tolerance=0.01
"""
pr = graph.pageRank(resetProbability=0.15, tol=0.01)

"""
### look at the pagerank score for every vertex
"""
display_graph(pr.vertices)
doc.show()

"""
### look at the weight of every edge
"""
display_graph(pr.edges)
doc.show()

"""
We can compare the results as follows:
"""
# GraphFrames rankings sum to N, where N is the number of nodes
graphframes_pagerank = get_pagerank_dictionary(pr)
# Google rankings sum to 1
google_pagerank = {1: 0.368, 2: 0.142, 3: 0.288, 4: 0.202}
pretty_print_pagerank(graphframes_pagerank, google_pagerank)
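# Since GraphFrames scores sum to N (the number of vertices) while the
# reference Google scores sum to 1, dividing each GraphFrames score by N
# puts the two on a common scale. A minimal sketch of that normalization
# (reusing pr and get_pagerank_dictionary from above):
n = pr.vertices.count()
normalized_pagerank = {k: v / n for k, v in get_pagerank_dictionary(pr).items()}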
import sys
from functools import reduce

import pyspark
from pyspark.sql.functions import col, lit, when
from graphframes.examples import Graphs
from graphframes import GraphFrame

import config

sc = pyspark.SparkContext()
sqlContext = pyspark.SQLContext(sc)

inputFile = sys.argv[1]

# g = Graphs(sqlContext).friends()  # Get example graph

df = sqlContext.read.format("csv").option("delimiter", config.delimiter).load(inputFile)

# Rename columns to something decent.
df = df.withColumnRenamed("_c0", "src")\
       .withColumnRenamed("_c1", "dst")\
       .withColumnRenamed("_c2", "weight")

df.show(5)

aggcodes = df.select("src", "dst").rdd.flatMap(lambda x: x).distinct()
vertices = aggcodes.map(lambda x: (x, x)).toDF(["id", "name"])
edges = df.select("src", "dst")

graph = GraphFrame(vertices, edges)
results = graph.pageRank(resetProbability=0.01, maxIter=20)

results.vertices.select("id", "pagerank")\
    .join(vertices, on="id").orderBy("pagerank", ascending=False)\
    .show(10)
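# To run this script, the GraphFrames package must be available to Spark.
# A typical invocation (the script and input file names are hypothetical,
# and the package coordinates must match your Spark/Scala version):
#
#   spark-submit --packages graphframes:graphframes:0.8.2-spark3.2-s_2.12 \
#       pagerank.py edges.csv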
# Determine the most popular flights (single city hops)
import pyspark.sql.functions as func

topTrips = tripGraph \
    .edges \
    .groupBy("src", "dst") \
    .agg(func.count("delay").alias("trips"))

display(topTrips.orderBy(topTrips.trips.desc()).limit(20))

# COMMAND ----------

# MAGIC %md ## Determining airport ranking of importance using `pageRank`

# COMMAND ----------

# Determine airport ranking of importance using `pageRank`
ranks = tripGraph.pageRank(resetProbability=0.15, maxIter=5)
display(ranks.vertices.orderBy(ranks.vertices.pagerank.desc()).limit(20))

# COMMAND ----------

# MAGIC %md ## D3 Visualization

# COMMAND ----------

# MAGIC %scala
# MAGIC package d3a
# MAGIC // We use a package object so that we can define top level classes like Edge that need to be used in other cells
# MAGIC
# MAGIC import org.apache.spark.sql._
# MAGIC import com.databricks.backend.daemon.driver.EnhancedRDDFunctions.displayHTML
# MAGIC
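# COMMAND ----------

# The tripGraph used in the cells above is assumed to be created earlier in
# the notebook. A minimal sketch, assuming an airports DataFrame (with an
# "id" vertex column) and a departureDelays DataFrame (with "src", "dst" and
# "delay" columns) were loaded beforehand (both names are hypothetical):
from graphframes import GraphFrame

tripGraph = GraphFrame(airports, departureDelays)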