Example 1
    def graphframes_pagerank(self, sc, sqlc):
        """ GraphFrame's PageRank implementation """

        from graphframes import GraphFrame  # pylint: disable=import-error

        edge_df = sqlc.read.load(os.path.join(self.args.webgraph, "edges"))
        vertex_df = sqlc.read.load(os.path.join(self.args.webgraph, "vertices"))

        graph = GraphFrame(vertex_df, edge_df)

        withPageRank = graph.pageRank(maxIter=self.args.maxiter)

        final_df = sql(sqlc, """
            SELECT CONCAT(ranks.domain, ' ', ranks.pagerank) r
            FROM ranks
            ORDER BY ranks.pagerank DESC
        """, {"ranks": withPageRank.vertices})

        if self.args.dump:

            final_df.coalesce(1).write.text(
                self.args.dump,
                compression="gzip" if self.args.gzip else "none"
            )

        else:
            print(final_df.rdd.collect())
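Note that `sql()` above is a helper defined elsewhere in that project, not a PySpark built-in. A minimal sketch of what such a helper might look like, assuming it simply registers each passed DataFrame as a temporary view before executing the query:

def sql(sqlc, query, tables=None):
    # Hypothetical helper: expose each DataFrame under its given name as a
    # temporary view, then run the SQL query against those views.
    for name, df in (tables or {}).items():
        df.createOrReplaceTempView(name)
    return sqlc.sql(query)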
motifs.selectExpr("*",
    "to_timestamp(ab.`Start Date`, 'MM/dd/yyyy HH:mm') as abStart",
    "to_timestamp(bc.`Start Date`, 'MM/dd/yyyy HH:mm') as bcStart",
    "to_timestamp(ca.`Start Date`, 'MM/dd/yyyy HH:mm') as caStart")\
  .where("ca.`Bike #` = bc.`Bike #`").where("ab.`Bike #` = bc.`Bike #`")\
  .where("a.id != b.id").where("b.id != c.id")\
  .where("abStart < bcStart").where("bcStart < caStart")\
  .orderBy(expr("cast(caStart as long) - cast(abStart as long)"))\
  .selectExpr("a.id", "b.id", "c.id", "ab.`Start Date`", "ca.`End Date`")\
  .limit(1).show(1, False)


# COMMAND ----------

from pyspark.sql.functions import desc
ranks = stationGraph.pageRank(resetProbability=0.15, maxIter=10)
ranks.vertices.orderBy(desc("pagerank")).select("id", "pagerank").show(10)


# COMMAND ----------

inDeg = stationGraph.inDegrees
inDeg.orderBy(desc("inDegree")).show(5, False)


# COMMAND ----------

outDeg = stationGraph.outDegrees
outDeg.orderBy(desc("outDegree")).show(5, False)

def generateGraphs():
    '''
    This function takes the top routes found by the countRoutes function for
    the source_ip and dest_ip columns and creates a graph for each one.

    Ultimately it will generate a list of the most significant target and
    attack IPs and ports.

    This function uses GraphFrames' built-in PageRank algorithm (which wraps
    the GraphX implementation) to determine the significance of each route.
    '''
    for column_key in GRAPH_DATA.keys():
        print(f'\nGenerating graph for column {column_key}...')
        # Start by concatenating the route counts into one dict object
        routes = {}
        for route_counts in GRAPH_DATA[column_key]:
            for key in route_counts.keys():
                routes[key] = route_counts[key]

        # Now we'll extract each vertex (unique IP address) from the routes,
        # and we'll take the number of connections as the weight for the edge.
        #
        # We'll also have to assign an ID for each IP address and build an array
        # of edges where each route is represented with its corresponding weight
        verts = []
        edges = []
        ids = {}
        next_id = 1
        for key in routes.keys():
            weight = getWeight(routes[key])  # Get the weight from the dict value
            ips_from_route = (key.split('/')[1], key.split('/')[2])
            for ip in ips_from_route:
                # Create a new ID for the IP address if it hasn't appeared yet
                if ip not in ids:
                    verts.append((next_id, ip))  # Add it to the vertex array too
                    ids[ip] = next_id
                    next_id += 1  # And increment the ID
            # Now get the route and add it to the edges array as a tuple of:
            # (src, dest, weight)
            edges.append(
                (ids[ips_from_route[0]], ids[ips_from_route[1]], weight))

        # Next, we'll convert the verts and edges arrays into Spark dataframes
        # so that we can load them into GraphFrames
        verts = SPARK.createDataFrame(verts, ['id', 'ip_address'])
        edges = SPARK.createDataFrame(edges, ['src', 'dst', 'count'])

        # Then we'll create the GraphFrames object, look at the edges, verts
        # and degrees, then run PageRank.
        graph = GraphFrame(verts, edges)
        print('Graph vertices:')
        graph.vertices.show()
        print('Graph edges:')
        graph.edges.show()
        print('Graph degrees:')
        graph.degrees.show()

        print('Performing PageRank on the graph:')
        pr = graph.pageRank(resetProbability=0.15, tol=0.01)
        print('Graph PageRank vertices:')
        pr.vertices.show()
        print('Graph PageRank edges:')
        pr.edges.show()
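The `getWeight` helper called above is not part of this snippet. A minimal sketch, under the assumption that each route's stored value is a connection count that can be used directly as the edge weight:

def getWeight(route_value):
    # Hypothetical helper: treat the stored route value as the number of
    # observed connections and use it directly as the edge weight.
    return int(route_value)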
deg_table = 'direct-analog-308416:project_data.degree'

logging.info('Writing Degree centrality data to BQ')
deg_centrality.write.format('bigquery')\
    .mode('overwrite')\
    .option('table',deg_table)\
    .save()

logging.info('Done!')
"""
Pagerank
damping factor = 1 - resetProbability
damping factor defines the probability that the next click will be through a link (webpage context)
"""
print('Attempting pagerank')
results = g.pageRank(resetProbability=0.15, maxIter=1)
print(results)
#results.vertices.select("id", "pagerank").show()
pagerank_res = results.vertices.sort("pagerank", ascending=False)
pagerank_res.show()

# save pagerank results to BQ

pagerank_table = 'direct-analog-308416:project_data.pagerank_results'

logging.info('Writing Pagerank data to BQ')
pagerank_res.write\
    .format('bigquery')\
    .mode('overwrite')\
    .option('table',pagerank_table)\
    .save()
# COMMAND ----------

from pyspark.sql.functions import col, concat_ws, round
from graphframes import GraphFrame

flightVerticesDF = (spark.read
                    .option("header", True)
                    .option("delimiter", "\t")
                    .csv("dbfs:/mnt/training/asa/airport-codes/airport-codes.txt")
                    .withColumnRenamed("IATA", "id"))

flightEdgesDF = (spark.table("Databricks.AirlineFlight")
                 .withColumnRenamed("Origin", "src")
                 .withColumnRenamed("Dest", "dst"))

flightGF = GraphFrame(flightVerticesDF, flightEdgesDF)
pageRankDF = flightGF.pageRank(tol=0.05)

resultsDF = (pageRankDF.vertices
             .select(concat_ws(", ", col("city"), col("state")).alias("Location"),
                     round(col("pagerank"), 1).alias("Rank"))
             .orderBy(col("pagerank").desc()))

display(resultsDF)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Review
# MAGIC **Question:** Which of the following are good applications for Apache Spark? (Select all that apply.)
# MAGIC 0. Querying, exploring, and analyzing very large files and data sets
# MAGIC 0. Joining data lakes
# MAGIC 0. Machine learning and predictive analytics
edges.show(3)

# COMMAND ----------

#Build GraphFrame object
from graphframes import GraphFrame

friendGraph = GraphFrame(vertices, edges)
friendGraph.vertices.show(3)
friendGraph.edges.show(3)

# COMMAND ----------

#Determine the top 5 most important users
from pyspark.sql.functions import desc

ranks = friendGraph.pageRank(
    resetProbability=0.15, maxIter=10
)  # 0.15 matches the reset probability used in the original Google PageRank; pageRank returns a GraphFrame
ranks.vertices.orderBy(desc("pagerank")).select("id", "pagerank").limit(5).show(5, False)

# COMMAND ----------

#Determine the top 5 users with the highest indegree/outdegree
from pyspark.sql.functions import desc

inDeg = friendGraph.inDegrees
inDeg.orderBy(desc("inDegree")).limit(5).show(5, False)

#Alternative
#outDeg = friendGraph.outDegrees
Example 8
doc.show()
""" Show all motifs which satisfy a->b->c """
display_graph(graph.find("(a)-[e]->(b); (b)-[e2]->(a)"))


def display_graph(item):
    # Redirect standard out to the document so the passed DataFrame's
    # output ends up in the generated handout
    with io.StringIO() as buf, redirect_stdout(buf):
        item.show()
        redirect_to_handout(buf.getvalue())


doc.show()
""" ## Get pagerank using m=0.15 and tolerance=0.01
"""
pr = graph.pageRank(resetProbability=0.15, tol=0.01)
""" ### look at the pagerank score for every vertex """
display_graph(pr.vertices)
doc.show()
""" ### look at the weight of every edge
"""
display_graph(pr.edges)
doc.show()
""" We can compare the results as follows: """
# GraphFrames rankings sum to N where N is the number of nodes
graphframes_pagerank = get_pagerank_dictionary(pr)

# Google rankings sum to 1
google_pagerank = {1: 0.368, 2: 0.142, 3: 0.288, 4: 0.202}

pretty_print_pagerank(graphframes_pagerank, google_pagerank)
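Because the two conventions differ (the GraphFrames scores sum to N, the Google-style scores sum to 1), a minimal sketch of normalising the GraphFrames scores for a direct comparison, assuming `get_pagerank_dictionary` returns a plain `{vertex_id: score}` dict:

# Divide each score by the vertex count so the normalised values sum to 1,
# matching the convention of the Google-style scores above.
n = len(graphframes_pagerank)
normalized_pagerank = {v: score / n for v, score in graphframes_pagerank.items()}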
Example 9
import sys
from functools import reduce
from pyspark.sql.functions import col, lit, when
import pyspark
from graphframes.examples import Graphs
from graphframes import GraphFrame
import config

sc = pyspark.SparkContext()
sqlContext = pyspark.SQLContext(sc)
inputFile = sys.argv[1]
# g = Graphs(sqlContext).friends()  # Get example graph
df = sqlContext.read.format("csv").option("delimiter",
                                          config.delimiter).load(inputFile)
# Rename columns to something decent.
df = df.withColumnRenamed("_c0", "src")\
.withColumnRenamed("_c1", "dst")\
.withColumnRenamed("_c2", "weight")
df.show(5)

aggcodes = df.select("src", "dst").rdd.flatMap(lambda x: x).distinct()
vertices = aggcodes.map(lambda x: (x, x)).toDF(["id", "name"])

edges = df.select("src", "dst")
graph = GraphFrame(vertices, edges)

results = graph.pageRank(resetProbability=0.01, maxIter=20)
results.vertices.select("id", "pagerank")\
.join(vertices, on="id").orderBy("pagerank", ascending=False)\
.show(10)
# Determine the most popular flights (single city hops)
import pyspark.sql.functions as func
topTrips = tripGraph \
  .edges \
  .groupBy("src", "dst") \
  .agg(func.count("delay").alias("trips"))
display(topTrips.orderBy(topTrips.trips.desc()).limit(20))

# COMMAND ----------

# MAGIC %md ## Determining airport ranking of importance using `pageRank`

# COMMAND ----------

# Determining Airport ranking of importance using `pageRank`
ranks = tripGraph.pageRank(resetProbability=0.15, maxIter=5)
display(ranks.vertices.orderBy(ranks.vertices.pagerank.desc()).limit(20))

# COMMAND ----------

# MAGIC %md ## D3 Visualization

# COMMAND ----------

# MAGIC %scala
# MAGIC package d3a
# MAGIC // We use a package object so that we can define top level classes like Edge that need to be used in other cells
# MAGIC
# MAGIC import org.apache.spark.sql._
# MAGIC import com.databricks.backend.daemon.driver.EnhancedRDDFunctions.displayHTML
# MAGIC