Code Example #1
File: webgraph.py  Project: jhildreth/cosr-back
    def run_job(self, sc, sqlc):

        if self.args.input_parquet:
            edge_df = sqlc.read.load(os.path.join(self.args.input_parquet, "edges"))
            vertex_df = sqlc.read.load(os.path.join(self.args.input_parquet, "vertices"))
        else:
            raise Exception("No input given!")

        if self.args.output_txt:
            vertices = sql(sqlc, """
                SELECT CONCAT(id, " ", domain) r
                FROM vertices
            """, {"vertices": vertex_df})

            edges = sql(sqlc, """
                SELECT CONCAT(src, " ", dst) r
                FROM edges
            """, {"edges": edge_df})

            vertices.coalesce(self.args.coalesce).write.text(
                os.path.join(self.args.output_txt, "vertices"),
                compression="gzip" if self.args.gzip else "none"
            )

            edges.coalesce(self.args.coalesce).write.text(
                os.path.join(self.args.output_txt, "edges"),
                compression="gzip" if self.args.gzip else "none"
            )
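
These examples call the project's small sql() helper rather than sqlc.sql() directly. The helper itself is not shown on this page; a minimal sketch of what it presumably does (register each DataFrame under the given alias, then run the query) is:

# Hypothetical sketch of the sql() helper used throughout these examples; this is an
# assumption, not the project's actual implementation.
def sql(sqlc, query, dataframes=None):
    for alias, df in (dataframes or {}).items():
        df.registerTempTable(alias)  # createOrReplaceTempView() on Spark >= 2.0
    return sqlc.sql(query)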
Code Example #2
    def run_job(self, sc, sqlc):

        if self.args.input_parquet:
            edge_df = sqlc.read.load(
                os.path.join(self.args.input_parquet, "edges"))
            vertex_df = sqlc.read.load(
                os.path.join(self.args.input_parquet, "vertices"))
        else:
            raise Exception("No input given!")

        if self.args.output_txt:
            vertices = sql(
                sqlc, """
                SELECT CONCAT(id, " ", domain) r
                FROM vertices
            """, {"vertices": vertex_df})

            edges = sql(
                sqlc, """
                SELECT CONCAT(src, " ", dst) r
                FROM edges
            """, {"edges": edge_df})

            vertices.coalesce(self.args.coalesce).write.text(
                os.path.join(self.args.output_txt, "vertices"),
                compression="gzip" if self.args.gzip else "none")

            edges.coalesce(self.args.coalesce).write.text(
                os.path.join(self.args.output_txt, "edges"),
                compression="gzip" if self.args.gzip else "none")
Code Example #3
File: webgraph.py  Project: jhildreth/cosr-back
    def save_vertex_graph(self, sqlc, df):
        """ Transforms a document metadata DataFrame into a Parquet dump of the vertices of the webgraph """

        vertex_graph_schema = SparkTypes.StructType([
            SparkTypes.StructField("id", SparkTypes.LongType(), nullable=False),
            SparkTypes.StructField("domain", SparkTypes.StringType(), nullable=False)
        ])

        # TODO ?!
        if self.args.get("shuffle_partitions"):
            sqlc.setConf("spark.sql.shuffle.partitions", self.args["shuffle_partitions"])

        # We collect all unique domains from the page URLs & destination of all external links
        d1_df = sql(sqlc, """
            SELECT parse_url(url, "HOST") as domain from df
        """, {"df": df}).distinct()

        d2_df = sql(sqlc, """
            SELECT parse_url(link, "HOST") as domain
            FROM (
                SELECT EXPLODE(external_links.href) as link FROM df
            ) as pairs
        """, {"df": df})

        all_domains_df = d1_df.unionAll(d2_df).distinct()

        def iter_domain(record):
            """ Transforms Row(domain=www.example.com) into tuple([int64 ID], "example.com") """

            domain = record["domain"]
            if not domain or not domain.strip():
                return []

            name = URL("http://" + domain).normalized_domain

            try:
                _id = _fast_make_domain_id(name)
            except Exception:  # pylint: disable=broad-except
                return []

            return [(long(_id), str(name))]

        rdd_domains = all_domains_df.rdd.flatMap(iter_domain)

        vertex_df = createDataFrame(sqlc, rdd_domains, vertex_graph_schema).distinct()

        if self.args.get("coalesce_vertices") or self.args.get("coalesce"):
            vertex_df = vertex_df.coalesce(
                int(self.args.get("coalesce_vertices") or self.args.get("coalesce"))
            )

        vertex_df.write.parquet(os.path.join(self.args["path"], "vertices"))
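
As a quick aside, the vertex dump written above can be read straight back with the Parquet reader. The path and the sqlc handle below are assumptions for illustration, not part of the original job:

# Sanity check (path is an assumption): the dump should round-trip with the schema
# declared above, i.e. id: long, domain: string.
vertices = sqlc.read.parquet("/tmp/webgraph/vertices")
vertices.printSchema()
vertices.show(5, truncate=False)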
Code Example #4
File: pagerank.py  Project: bakztfuture/cosr-back
    def graphframes_pagerank(self, sc, sqlc):
        """ GraphFrame's PageRank implementation """

        from graphframes import GraphFrame  # pylint: disable=import-error

        edge_df = sqlc.read.load(os.path.join(self.args.webgraph, "edges"))
        vertex_df = sqlc.read.load(os.path.join(self.args.webgraph, "vertices"))

        graph = GraphFrame(vertex_df, edge_df)

        withPageRank = graph.pageRank(maxIter=self.args.maxiter)

        final_df = sql(sqlc, """
            SELECT CONCAT(ranks.domain, ' ', ranks.pagerank) r
            FROM ranks
            ORDER BY ranks.pagerank DESC
        """, {"ranks": withPageRank.vertices})

        if self.args.dump:

            final_df.coalesce(1).write.text(
                self.args.dump,
                compression="gzip" if self.args.gzip else "none"
            )

        else:
            print(final_df.rdd.collect())
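
For reference, the column contract GraphFrame expects (an "id" column on vertices, "src"/"dst" on edges, and a "pagerank" column added by pageRank()) can be exercised on hand-built toy data. This is only an illustrative sketch, not project code:

# Toy GraphFrames run on in-memory data (illustration only).
from graphframes import GraphFrame  # pylint: disable=import-error

v = sqlc.createDataFrame([(1, "example.com"), (2, "example.org")], ["id", "domain"])
e = sqlc.createDataFrame([(1, 2), (2, 1)], ["src", "dst"])

ranks = GraphFrame(v, e).pageRank(resetProbability=0.15, maxIter=10).vertices
ranks.select("domain", "pagerank").show()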
Code Example #5
    def graphframes_pagerank(self, sc, sqlc):
        """ GraphFrame's PageRank implementation """

        from graphframes import GraphFrame  # pylint: disable=import-error

        edge_df = sqlc.read.load(os.path.join(self.args.webgraph, "edges"))
        vertex_df = sqlc.read.load(os.path.join(self.args.webgraph,
                                                "vertices"))

        graph = GraphFrame(vertex_df, edge_df)

        withPageRank = graph.pageRank(maxIter=self.args.maxiter)

        final_df = sql(
            sqlc, """
            SELECT CONCAT(ranks.domain, ' ', ranks.pagerank) r
            FROM ranks
            ORDER BY ranks.pagerank DESC
        """, {"ranks": withPageRank.vertices})

        if self.args.dump:

            final_df.coalesce(1).write.text(
                self.args.dump,
                compression="gzip" if self.args.gzip else "none")

        else:
            print(final_df.rdd.collect())
Code Example #6
File: webgraph.py  Project: bakztfuture/cosr-back
    def hook_spark_pipeline_action(self, sc, sqlc, df, indexer):

        # Get all unique (host1 => host2) pairs
        domain_pairs = sql(sqlc, """
            SELECT parse_url(url, "HOST") as d1, parse_url(CONCAT("http://", link), "HOST") as d2
            FROM (
                SELECT url, EXPLODE(external_links.href) as link FROM df
            ) as pairs
        """, {"df": df}).distinct()

        # Format as csv
        lines = sql(sqlc, """
            SELECT CONCAT(d1, " ", d2) as r
            FROM pairs
        """, {"pairs": domain_pairs})

        self.save_dataframe(lines, "text")

        return True
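
The inner subquery relies on EXPLODE(external_links.href) to turn each page row into one row per outgoing link. A toy illustration of that step on hand-built data (not project code):

# EXPLODE yields one row per href in the array, keeping the page url alongside it.
from pyspark.sql import Row

toy_df = sqlc.createDataFrame([
    Row(url="http://a.com/page",
        external_links=Row(href=["http://b.com/x", "http://c.com/y"]))
])
toy_df.selectExpr("url", "EXPLODE(external_links.href) AS link").show(truncate=False)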
Code Example #7
File: grep.py  Project: x0rzkov/cosr-back
    def hook_spark_pipeline_action(self, sc, sqlc, df, indexer):

        lines_df = sql(
            sqlc, """
            SELECT CONCAT(CONCAT_WS(",", SORT_ARRAY(grep_words)), " ", url) r
            FROM df
            WHERE size(grep_words) > 0
        """, {"df": df})

        self.save_dataframe(lines_df, "text")

        return True
Code Example #8
    def hook_spark_pipeline_action(self, sc, sqlc, df, indexer):

        # Get all unique (host1 => host2) pairs
        domain_pairs = sql(
            sqlc, """
            SELECT parse_url(url, "HOST") as d1, parse_url(CONCAT("http://", link), "HOST") as d2
            FROM (
                SELECT url, EXPLODE(external_links.href) as link FROM df
            ) as pairs
        """, {
                "df": df
            }).distinct()

        # Format as csv
        lines = sql(
            sqlc, """
            SELECT CONCAT(d1, " ", d2) as r
            FROM pairs
        """, {"pairs": domain_pairs})

        self.save_dataframe(lines, "text")

        return True
Code Example #9
File: hyperlinks.py  Project: tarunjindl/cosr-back
    def spark_pipeline_action(self, sc, sqlc, df, indexer):

        domain = self.args["domain"]

        if self.args.get("shuffle_partitions"):
            sqlc.setConf("spark.sql.shuffle.partitions", self.args["shuffle_partitions"])

        lines_df = sql(sqlc, """
            SELECT
                CONCAT(
                    regexp_replace(url_to, "^http(s?)://", ""),
                    " ",
                    COUNT(*),
                    " ",
                    CONCAT_WS(" ", COLLECT_LIST(url_from))
                ) r
            FROM (
                SELECT url url_from, EXPLODE(external_links.href) url_to
                FROM df
                WHERE size(external_links) > 0
            ) links
            WHERE SUBSTRING(
                PARSE_URL(links.url_to, "HOST"),
                LENGTH(PARSE_URL(links.url_to, "HOST")) - %s,
                %s
            ) == "%s"
            GROUP BY regexp_replace(url_to, "^http(s?)://", "")
            ORDER BY COUNT(*) DESC
        """ % (len(domain), len(domain), domain), {"df": df})

        if self.args.get("limit"):
            lines_df = lines_df.limit(int(self.args["limit"]))

        if self.args.get("partitions"):
            lines_df = lines_df.coalesce(int(self.args["partitions"]))
            lines_df.persist()
            print "Number of destination URLs: %s" % lines_df.count()

        if self.args.get("coalesce"):
            lines_df = lines_df.coalesce(int(self.args["coalesce"]))

        lines_df.write.text(
            self.args["path"],
            compression="gzip" if self.args.get("gzip") else "none"
        )

        return True
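
One detail worth flagging: Hive's SUBSTRING() is 1-based, so taking the last LENGTH(domain) characters of a host means starting at LENGTH(host) - LENGTH(domain) + 1. The query above subtracts len(domain), which appears to be off by one; the variant in Code Example #11 below subtracts len(domain) - 1, matching that rule. The intended filter is simply a suffix check, shown here in plain Python for reference (illustration only):

# Plain-Python equivalent of the SQL suffix filter: keep links whose host ends with
# the target domain.
def host_matches_domain(host, domain):
    return host is not None and host.endswith(domain)

assert host_matches_domain("news.example.com", "example.com")
assert not host_matches_domain("example.org", "example.com")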
Code Example #10
File: grep.py  Project: jhildreth/cosr-back
    def spark_pipeline_action(self, sc, sqlc, df, indexer):

        lines_df = sql(sqlc, """
            SELECT CONCAT(CONCAT_WS(",", SORT_ARRAY(grep_words)), " ", url) r
            FROM df
            WHERE size(grep_words) > 0
        """, {"df": df})

        if self.args.get("coalesce"):
            lines_df = lines_df.coalesce(int(self.args["coalesce"]))

        lines_df.write.text(
            self.args["path"],
            compression="gzip" if self.args.get("gzip") else "none"
        )

        return True
Code Example #11
    def hook_spark_pipeline_action(self, sc, sqlc, df, indexer):

        domain = self.args["domain"]

        if self.args.get("shuffle_partitions"):
            sqlc.setConf("spark.sql.shuffle.partitions",
                         self.args["shuffle_partitions"])

        lines_df = sql(
            sqlc, """
            SELECT
                CONCAT(
                    regexp_replace(url_to, "^http(s?)://", ""),
                    " ",
                    COUNT(*),
                    " ",
                    CONCAT_WS(" ", COLLECT_LIST(url_from))
                ) r
            FROM (
                SELECT url url_from, EXPLODE(external_links.href) url_to
                FROM df
                WHERE size(external_links) > 0
            ) links
            WHERE SUBSTRING(
                PARSE_URL(links.url_to, "HOST"),
                LENGTH(PARSE_URL(links.url_to, "HOST")) - %s,
                %s
            ) == "%s"
            GROUP BY regexp_replace(url_to, "^http(s?)://", "")
            ORDER BY COUNT(*) DESC
        """ % (len(domain) - 1, len(domain), domain), {"df": df})

        if self.args.get("limit"):
            lines_df = lines_df.limit(int(self.args["limit"]))

        if self.args.get("partitions"):
            lines_df = lines_df.coalesce(int(self.args["partitions"]))
            lines_df.persist()
            print("Number of destination URLs: %s" % lines_df.count())

        self.save_dataframe(lines_df, "text")

        return True
Code Example #12
File: pagerank.py  Project: bakztfuture/cosr-back
    def custom_pagerank_2(self, sc, sqlc):
        """ Alternative PageRank implementation, with fixed number of steps """

        sc.setCheckpointDir("/tmp/spark-checkpoints")

        # ranks_schema = SparkTypes.StructType([
        #     SparkTypes.StructField("id", SparkTypes.LongType(), nullable=False),
        #     SparkTypes.StructField("rank", SparkTypes.FloatType(), nullable=False)
        # ])

        edge_df = sqlc.read.load(os.path.join(self.args.webgraph, "edges"))

        if self.args.maxedges:
            edge_df = edge_df.limit(self.args.maxedges)

        vertex_df = sqlc.read.load(os.path.join(self.args.webgraph, "vertices"))

        if self.args.maxvertices:
            vertex_df = vertex_df.limit(self.args.maxvertices)

        sqlc.setConf("spark.sql.shuffle.partitions", str(self.args.shuffle_partitions))

        # TODO: bootstrap with previous pageranks to accelerate convergence?
        ranks_df = sql(sqlc, """
            SELECT id, cast(1.0 as float) rank
            FROM vertices
        """, {"vertices": vertex_df})

        edge_df.persist()
        vertex_df.persist()
        print("Starting iterations. %s edges, %s vertices." % (edge_df.count(), vertex_df.count()))

        iteration_tmpdir = None

        for iteration in range(self.args.maxiter):

            new_ranks_df = sql(sqlc, """
                SELECT ranks.id id, cast(0.15 + 0.85 * COALESCE(contribs.contrib, 0) as float) rank
                FROM ranks
                LEFT OUTER JOIN (
                    SELECT edges.dst id, cast(sum(ranks.rank * COALESCE(edges.weight, 0)) as float) contrib
                    FROM edges
                    LEFT OUTER JOIN ranks ON edges.src = ranks.id
                    GROUP BY edges.dst
                ) contribs ON contribs.id = ranks.id
            """, {"ranks": ranks_df, "edges": edge_df})

            # At this point we need to break the RDD dependency chain
            # Writing & loading Parquet seems to be more efficient than checkpointing the RDD.

            iteration_tmpdir_previous = iteration_tmpdir
            iteration_tmpdir = os.path.join(self.args.tmpdir, "iter_%s" % iteration)

            # Every N iterations, we check if we got below the tolerance level.
            if (self.args.tol >= 0 or self.args.stats > 0) and (iteration % self.args.stats == 0):

                new_ranks_df.persist()
                ranks_df.persist()
                vertex_df.persist()

                stats_df = sql(sqlc, """
                    SELECT
                        sum(diff) as sum_diff,
                        count(*) as count_diff,
                        min(diff) as min_diff,
                        max(diff) as max_diff,
                        avg(diff) as avg_diff,
                        stddev(diff) as stddev_diff
                    FROM (
                        SELECT ABS(old_ranks.rank - new_ranks.rank) diff
                        FROM old_ranks
                        JOIN new_ranks ON old_ranks.id = new_ranks.id
                        WHERE old_ranks.rank != new_ranks.rank
                    ) diffs
                """, {"old_ranks": ranks_df, "new_ranks": new_ranks_df})

                stats = stats_df.collect()[0]
                print("Max diff at iteration %s : %s" % (iteration, stats["max_diff"]))
                print("Other stats: %s" % repr(stats))

                if (stats["count_diff"] == 0) or (stats["max_diff"] <= self.args.tol):
                    print("Max diff was below tolerance: stopping iterations!")
                    break

                top_diffs_df = sql(sqlc, """
                    SELECT
                        (new_ranks.rank - old_ranks.rank) diff,
                        old_ranks.rank old_rank,
                        new_ranks.rank new_rank,
                        names.domain domain
                    FROM old_ranks
                    JOIN new_ranks ON old_ranks.id = new_ranks.id
                    JOIN names ON names.id = old_ranks.id
                    WHERE old_ranks.rank != new_ranks.rank
                    ORDER BY ABS(diff) DESC
                """, {"old_ranks": ranks_df, "new_ranks": new_ranks_df, "names": vertex_df})

                print("Top 100 diffs")
                print("\n".join(["%3.3f %3.3f %3.3f %s " % x for x in top_diffs_df.limit(100).collect()]))

            new_ranks_df.write.parquet(iteration_tmpdir)

            # S3 in us-east-1 should support read-after-write consistency since 2015
            # but we still have transient errors
            self.wait_for_tmpdir(iteration_tmpdir)

            new_ranks_df.unpersist()
            ranks_df.unpersist()

            ranks_df = sqlc.read.load(iteration_tmpdir)

            if iteration_tmpdir_previous is not None:
                self.clean_tmpdir(directory=iteration_tmpdir_previous)

        # No more need for the edges after iterations
        edge_df.unpersist()

        final_df = sql(sqlc, """
            SELECT CONCAT(names.domain, ' ', ranks.rank) r
            FROM ranks
            JOIN names ON names.id = ranks.id
            ORDER BY ranks.rank DESC
        """, {"names": vertex_df, "ranks": ranks_df})

        if self.args.dump:

            final_df.coalesce(1).write.text(
                self.args.dump,
                compression="gzip" if self.args.gzip else "none"
            )

        else:
            print(final_df.rdd.collect())
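
Each iteration's SQL implements the usual damped update: rank(dst) = 0.15 + 0.85 * sum(rank(src) * weight) over the incoming edges of dst. A minimal in-memory sketch of the same update on a toy graph (illustration only, not project code):

# Same damped PageRank update as the SQL above, on a hand-built toy graph.
edges = [(1, 2, 1.0), (2, 1, 0.5), (2, 3, 0.5)]  # (src, dst, weight); weights sum to 1 per src
ranks = {1: 1.0, 2: 1.0, 3: 1.0}

for _ in range(10):
    contribs = {node: 0.0 for node in ranks}
    for src, dst, weight in edges:
        contribs[dst] += ranks[src] * weight
    ranks = {node: 0.15 + 0.85 * contrib for node, contrib in contribs.items()}

print(ranks)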
Code Example #13
File: pagerank.py  Project: bakztfuture/cosr-back
    def custom_pagerank(self, sc, sqlc):
        """ Our own PageRank implementation, based on Spark SQL and Pregel-like behaviour """
        # pylint: disable=too-many-statements

        # sc.setCheckpointDir("/tmp/spark-checkpoints")

        edge_df = sqlc.read.load(os.path.join(self.args.webgraph, "edges"))

        if self.args.maxedges:
            edge_df = edge_df.limit(self.args.maxedges)

        vertex_df = sqlc.read.load(os.path.join(self.args.webgraph, "vertices"))

        if self.args.maxvertices:
            vertex_df = vertex_df.limit(self.args.maxvertices)

        sqlc.setConf("spark.sql.shuffle.partitions", str(self.args.shuffle_partitions))

        edge_df.persist(StorageLevel.MEMORY_AND_DISK)
        vertex_df.persist(StorageLevel.MEMORY_AND_DISK)

        print("Starting iterations. %s edges, %s vertices." % (edge_df.count(), vertex_df.count()))

        # TODO: bootstrap with previous pageranks to accelerate convergence?
        ranks_df = sql(sqlc, """
            SELECT vertices.id id, cast(1.0 as float) rank
            FROM vertices
            JOIN edges ON edges.dst = vertices.id
            GROUP BY vertices.id
        """, {"vertices": vertex_df, "edges": edge_df})

        # TODO: optimize further by taking out outDegree=0 vertices and computing their pagerank
        # as a post-filter.
        # LEFT OUTER JOIN edges edges_src on edges_src.src  = vertices.id
        # WHERE edges_src.src IS NOT NULL

        iteration_tmpdir = None

        for iteration in range(self.args.maxiter):

            changed_ranks_df = sql(sqlc, """
                SELECT
                    edges.dst id,
                    cast(
                        0.15 + 0.85 * sum(COALESCE(ranks_src.rank, 0.15) * edges.weight)
                        as float
                    ) rank_new,
                    first(ranks_dst.rank) rank_old
                FROM edges
                LEFT OUTER JOIN ranks_src ON edges.src = ranks_src.id
                LEFT OUTER JOIN ranks_dst ON edges.dst = ranks_dst.id
                GROUP BY edges.dst
                HAVING ABS(rank_old - rank_new) > %s
            """ % self.args.precision, {"ranks_src": ranks_df, "ranks_dst": ranks_df, "edges": edge_df})

            # Every N iterations, we check if we got below the tolerance level.
            if (self.args.tol >= 0 or self.args.stats > 0) and (iteration % self.args.stats == 0):

                changed_ranks_df.persist(StorageLevel.MEMORY_AND_DISK)

                stats_df = sql(sqlc, """
                    SELECT
                        sum(abs(rank_new - rank_old)) as sum_diff,
                        count(*) as count_diff,
                        min(abs(rank_new - rank_old)) as min_diff,
                        max(abs(rank_new - rank_old)) as max_diff,
                        avg(abs(rank_new - rank_old)) as avg_diff,
                        stddev(abs(rank_new - rank_old)) as stddev_diff
                    FROM changes
                """, {"changes": changed_ranks_df})

                stats = stats_df.collect()[0]

                print("Iteration %s, %s changed ranks" % (iteration, stats["count_diff"]))
                print("Stats: %s" % repr(stats))

                if (stats["count_diff"] == 0) or (stats["max_diff"] <= self.args.tol):
                    print("Max diff was below tolerance: stopping iterations!")
                    break

                if self.args.top_diffs > 0:

                    top_changes_df = sql(sqlc, """
                        SELECT
                            (rank_new - rank_old) diff,
                            rank_old,
                            rank_new,
                            names.domain domain
                        FROM changes
                        JOIN names ON names.id = changes.id
                        ORDER BY abs(rank_new - rank_old) DESC
                    """, {"changes": changed_ranks_df, "names": vertex_df})

                    print("Top %s diffs" % self.args.top_changes)
                    print("\n".join([
                        "%3.3f (%3.3f => %3.3f) %s " % x
                        for x in top_changes_df.limit(self.args.top_diffs).collect()
                    ]))

                    top_changes_df.unpersist()

            new_ranks_df = sql(sqlc, """
                SELECT ranks.id id, COALESCE(changed_ranks.rank_new, ranks.rank) rank
                FROM ranks
                LEFT JOIN changed_ranks ON changed_ranks.id = ranks.id
            """, {"ranks": ranks_df, "changed_ranks": changed_ranks_df})

            if (iteration + 1) % 5 != 0:

                new_ranks_df.persist(StorageLevel.MEMORY_AND_DISK)

                new_ranks_df.count()  # Materialize the RDD

                print("Iteration %s cached" % (iteration, ))

                ranks_df.unpersist()
                changed_ranks_df.unpersist()
                ranks_df = new_ranks_df

            # At this point we need to break the RDD dependency chain
            # Writing & loading Parquet seems to be more efficient than checkpointing the RDD.
            else:

                print("Iteration %s, saving to parquet" % iteration)

                iteration_tmpdir_previous = iteration_tmpdir
                iteration_tmpdir = os.path.join(self.args.tmpdir, "iter_%s" % iteration)

                new_ranks_df.write.parquet(iteration_tmpdir)

                # S3 in us-east-1 should support read-after-write consistency since 2015
                # but we still have transient errors
                self.wait_for_tmpdir(iteration_tmpdir)

                new_ranks_df.unpersist()
                ranks_df.unpersist()
                changed_ranks_df.unpersist()

                ranks_df = sqlc.read.load(iteration_tmpdir)

                if iteration_tmpdir_previous is not None:
                    self.clean_tmpdir(directory=iteration_tmpdir_previous)

        if self.args.include_orphans:

            ranks_df = ranks_df.unionAll(sql(sqlc, """
                SELECT vertices.id id, cast(0.15 as float) rank
                FROM vertices
                LEFT OUTER JOIN edges ON edges.dst  = vertices.id
                WHERE edges.dst is NULL
            """, {"vertices": vertex_df, "edges": edge_df}))

        # No more need for the edges after iterations
        edge_df.unpersist()

        final_df = sql(sqlc, """
            SELECT CONCAT(names.domain, ' ', ranks.rank) r
            FROM ranks
            JOIN names ON names.id = ranks.id
            ORDER BY ranks.rank DESC
        """, {"names": vertex_df, "ranks": ranks_df})

        if self.args.dump:

            final_df.coalesce(1).write.text(
                self.args.dump,
                compression="gzip" if self.args.gzip else "none"
            )

        else:
            print(final_df.rdd.collect())
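
Compared to custom_pagerank_2 above, this variant recomputes only the ranks whose change exceeds the precision threshold and folds them back over the previous ranks with COALESCE. The merge step, sketched on toy data (illustration only):

# Keep a new rank only if it moved by more than the precision threshold, otherwise
# carry the old rank forward (the SQL COALESCE(changed_ranks.rank_new, ranks.rank)).
precision = 0.001
old_ranks = {1: 1.00, 2: 0.80, 3: 0.50}
new_ranks = {1: 1.00, 2: 0.95, 3: 0.5004}

changed = {node: rank for node, rank in new_ranks.items()
           if abs(rank - old_ranks[node]) > precision}
merged = {node: changed.get(node, rank) for node, rank in old_ranks.items()}
print(merged)  # {1: 1.0, 2: 0.95, 3: 0.5}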
Code Example #14
    def custom_pagerank_2(self, sc, sqlc):
        """ Alternative PageRank implementation, with fixed number of steps """

        sc.setCheckpointDir("/tmp/spark-checkpoints")

        # ranks_schema = SparkTypes.StructType([
        #     SparkTypes.StructField("id", SparkTypes.LongType(), nullable=False),
        #     SparkTypes.StructField("rank", SparkTypes.FloatType(), nullable=False)
        # ])

        edge_df = sqlc.read.load(os.path.join(self.args.webgraph, "edges"))

        if self.args.maxedges:
            edge_df = edge_df.limit(self.args.maxedges)

        vertex_df = sqlc.read.load(os.path.join(self.args.webgraph,
                                                "vertices"))

        if self.args.maxvertices:
            vertex_df = vertex_df.limit(self.args.maxvertices)

        sqlc.setConf("spark.sql.shuffle.partitions",
                     str(self.args.shuffle_partitions))

        # TODO: bootstrap with previous pageranks to accelerate convergence?
        ranks_df = sql(
            sqlc, """
            SELECT id, cast(1.0 as float) rank
            FROM vertices
        """, {"vertices": vertex_df})

        edge_df.persist()
        vertex_df.persist()
        print("Starting iterations. %s edges, %s vertices." %
              (edge_df.count(), vertex_df.count()))

        iteration_tmpdir = None

        for iteration in range(self.args.maxiter):

            new_ranks_df = sql(
                sqlc, """
                SELECT ranks.id id, cast(0.15 + 0.85 * COALESCE(contribs.contrib, 0) as float) rank
                FROM ranks
                LEFT OUTER JOIN (
                    SELECT edges.dst id, cast(sum(ranks.rank * COALESCE(edges.weight, 0)) as float) contrib
                    FROM edges
                    LEFT OUTER JOIN ranks ON edges.src = ranks.id
                    GROUP BY edges.dst
                ) contribs ON contribs.id = ranks.id
            """, {
                    "ranks": ranks_df,
                    "edges": edge_df
                })

            # At this point we need to break the RDD dependency chain
            # Writing & loading Parquet seems to be more efficient than checkpointing the RDD.

            iteration_tmpdir_previous = iteration_tmpdir
            iteration_tmpdir = os.path.join(self.args.tmpdir,
                                            "iter_%s" % iteration)

            # Every N iterations, we check if we got below the tolerance level.
            if (self.args.tol >= 0
                    or self.args.stats > 0) and (iteration % self.args.stats
                                                 == 0):

                new_ranks_df.persist()
                ranks_df.persist()
                vertex_df.persist()

                stats_df = sql(
                    sqlc, """
                    SELECT
                        sum(diff) as sum_diff,
                        count(*) as count_diff,
                        min(diff) as min_diff,
                        max(diff) as max_diff,
                        avg(diff) as avg_diff,
                        stddev(diff) as stddev_diff
                    FROM (
                        SELECT ABS(old_ranks.rank - new_ranks.rank) diff
                        FROM old_ranks
                        JOIN new_ranks ON old_ranks.id = new_ranks.id
                        WHERE old_ranks.rank != new_ranks.rank
                    ) diffs
                """, {
                        "old_ranks": ranks_df,
                        "new_ranks": new_ranks_df
                    })

                stats = stats_df.collect()[0]
                print("Max diff at iteration %s : %s" %
                      (iteration, stats["max_diff"]))
                print("Other stats: %s" % repr(stats))

                if (stats["count_diff"]
                        == 0) or (stats["max_diff"] <= self.args.tol):
                    print("Max diff was below tolerance: stopping iterations!")
                    break

                top_diffs_df = sql(
                    sqlc, """
                    SELECT
                        (new_ranks.rank - old_ranks.rank) diff,
                        old_ranks.rank old_rank,
                        new_ranks.rank new_rank,
                        names.domain domain
                    FROM old_ranks
                    JOIN new_ranks ON old_ranks.id = new_ranks.id
                    JOIN names ON names.id = old_ranks.id
                    WHERE old_ranks.rank != new_ranks.rank
                    ORDER BY ABS(diff) DESC
                """, {
                        "old_ranks": ranks_df,
                        "new_ranks": new_ranks_df,
                        "names": vertex_df
                    })

                print("Top 100 diffs")
                print("\n".join([
                    "%3.3f %3.3f %3.3f %s " % x
                    for x in top_diffs_df.limit(100).collect()
                ]))

            new_ranks_df.write.parquet(iteration_tmpdir)

            # S3 in us-east-1 should support read-after-write consistency since 2015
            # but we still have transient errors
            self.wait_for_tmpdir(iteration_tmpdir)

            new_ranks_df.unpersist()
            ranks_df.unpersist()

            ranks_df = sqlc.read.load(iteration_tmpdir)

            if iteration_tmpdir_previous is not None:
                self.clean_tmpdir(directory=iteration_tmpdir_previous)

        # No more need for the edges after iterations
        edge_df.unpersist()

        final_df = sql(
            sqlc, """
            SELECT CONCAT(names.domain, ' ', ranks.rank) r
            FROM ranks
            JOIN names ON names.id = ranks.id
            ORDER BY ranks.rank DESC
        """, {
                "names": vertex_df,
                "ranks": ranks_df
            })

        if self.args.dump:

            final_df.coalesce(1).write.text(
                self.args.dump,
                compression="gzip" if self.args.gzip else "none")

        else:
            print(final_df.rdd.collect())
Code Example #15
    def custom_pagerank(self, sc, sqlc):
        """ Our own PageRank implementation, based on Spark SQL and Pregel-like behaviour """
        # pylint: disable=too-many-statements

        # sc.setCheckpointDir("/tmp/spark-checkpoints")

        edge_df = sqlc.read.load(os.path.join(self.args.webgraph, "edges"))

        if self.args.maxedges:
            edge_df = edge_df.limit(self.args.maxedges)

        vertex_df = sqlc.read.load(os.path.join(self.args.webgraph,
                                                "vertices"))

        if self.args.maxvertices:
            vertex_df = vertex_df.limit(self.args.maxvertices)

        sqlc.setConf("spark.sql.shuffle.partitions",
                     str(self.args.shuffle_partitions))

        edge_df.persist(StorageLevel.MEMORY_AND_DISK)
        vertex_df.persist(StorageLevel.MEMORY_AND_DISK)

        print("Starting iterations. %s edges, %s vertices." %
              (edge_df.count(), vertex_df.count()))

        # TODO: bootstrap with previous pageranks to accelerate convergence?
        ranks_df = sql(
            sqlc, """
            SELECT vertices.id id, cast(1.0 as float) rank
            FROM vertices
            JOIN edges ON edges.dst = vertices.id
            GROUP BY vertices.id
        """, {
                "vertices": vertex_df,
                "edges": edge_df
            })

        # TODO: optimize further by taking out outDegree=0 vertices and computing their pagerank
        # as a post-filter.
        # LEFT OUTER JOIN edges edges_src on edges_src.src  = vertices.id
        # WHERE edges_src.src IS NOT NULL

        iteration_tmpdir = None

        for iteration in range(self.args.maxiter):

            changed_ranks_df = sql(
                sqlc, """
                SELECT
                    edges.dst id,
                    cast(
                        0.15 + 0.85 * sum(COALESCE(ranks_src.rank, 0.15) * edges.weight)
                        as float
                    ) rank_new,
                    first(ranks_dst.rank) rank_old
                FROM edges
                LEFT OUTER JOIN ranks_src ON edges.src = ranks_src.id
                LEFT OUTER JOIN ranks_dst ON edges.dst = ranks_dst.id
                GROUP BY edges.dst
                HAVING ABS(rank_old - rank_new) > %s
            """ % self.args.precision, {
                    "ranks_src": ranks_df,
                    "ranks_dst": ranks_df,
                    "edges": edge_df
                })

            # Every N iterations, we check if we got below the tolerance level.
            if (self.args.tol >= 0
                    or self.args.stats > 0) and (iteration % self.args.stats
                                                 == 0):

                changed_ranks_df.persist(StorageLevel.MEMORY_AND_DISK)

                stats_df = sql(
                    sqlc, """
                    SELECT
                        sum(abs(rank_new - rank_old)) as sum_diff,
                        count(*) as count_diff,
                        min(abs(rank_new - rank_old)) as min_diff,
                        max(abs(rank_new - rank_old)) as max_diff,
                        avg(abs(rank_new - rank_old)) as avg_diff,
                        stddev(abs(rank_new - rank_old)) as stddev_diff
                    FROM changes
                """, {"changes": changed_ranks_df})

                stats = stats_df.collect()[0]

                print("Iteration %s, %s changed ranks" %
                      (iteration, stats["count_diff"]))
                print("Stats: %s" % repr(stats))

                if (stats["count_diff"]
                        == 0) or (stats["max_diff"] <= self.args.tol):
                    print("Max diff was below tolerance: stopping iterations!")
                    break

                if self.args.top_diffs > 0:

                    top_changes_df = sql(
                        sqlc, """
                        SELECT
                            (rank_new - rank_old) diff,
                            rank_old,
                            rank_new,
                            names.domain domain
                        FROM changes
                        JOIN names ON names.id = changes.id
                        ORDER BY abs(rank_new - rank_old) DESC
                    """, {
                            "changes": changed_ranks_df,
                            "names": vertex_df
                        })

                    print("Top %s diffs" % self.args.top_changes)
                    print("\n".join([
                        "%3.3f (%3.3f => %3.3f) %s " % x for x in
                        top_changes_df.limit(self.args.top_diffs).collect()
                    ]))

                    top_changes_df.unpersist()

            new_ranks_df = sql(
                sqlc, """
                SELECT ranks.id id, COALESCE(changed_ranks.rank_new, ranks.rank) rank
                FROM ranks
                LEFT JOIN changed_ranks ON changed_ranks.id = ranks.id
            """, {
                    "ranks": ranks_df,
                    "changed_ranks": changed_ranks_df
                })

            if (iteration + 1) % 5 != 0:

                new_ranks_df.persist(StorageLevel.MEMORY_AND_DISK)

                new_ranks_df.count()  # Materialize the RDD

                print("Iteration %s cached" % (iteration, ))

                ranks_df.unpersist()
                changed_ranks_df.unpersist()
                ranks_df = new_ranks_df

            # At this point we need to break the RDD dependency chain
            # Writing & loading Parquet seems to be more efficient than checkpointing the RDD.
            else:

                print("Iteration %s, saving to parquet" % iteration)

                iteration_tmpdir_previous = iteration_tmpdir
                iteration_tmpdir = os.path.join(self.args.tmpdir,
                                                "iter_%s" % iteration)

                new_ranks_df.write.parquet(iteration_tmpdir)

                # S3 in us-east-1 should support read-after-write consistency since 2015
                # but we still have transient errors
                self.wait_for_tmpdir(iteration_tmpdir)

                new_ranks_df.unpersist()
                ranks_df.unpersist()
                changed_ranks_df.unpersist()

                ranks_df = sqlc.read.load(iteration_tmpdir)

                if iteration_tmpdir_previous is not None:
                    self.clean_tmpdir(directory=iteration_tmpdir_previous)

        if self.args.include_orphans:

            ranks_df = ranks_df.unionAll(
                sql(
                    sqlc, """
                SELECT vertices.id id, cast(0.15 as float) rank
                FROM vertices
                LEFT OUTER JOIN edges ON edges.dst  = vertices.id
                WHERE edges.dst is NULL
            """, {
                        "vertices": vertex_df,
                        "edges": edge_df
                    }))

        # No more need for the edges after iterations
        edge_df.unpersist()

        final_df = sql(
            sqlc, """
            SELECT CONCAT(names.domain, ' ', ranks.rank) r
            FROM ranks
            JOIN names ON names.id = ranks.id
            ORDER BY ranks.rank DESC
        """, {
                "names": vertex_df,
                "ranks": ranks_df
            })

        if self.args.dump:
            final_df.coalesce(1).write.format('text').mode(
                self.get_write_mode()).save(
                    self.args.dump,
                    compression="gzip" if self.args.gzip else "none")

        else:
            print(final_df.rdd.collect())
Code Example #16
File: webgraph.py  Project: bakztfuture/cosr-back
    def save_edge_graph(self, sqlc, df):
        """ Transforms a document metadata DataFrame into a Parquet dump of the edges of the webgraph """

        edge_graph_schema = SparkTypes.StructType([
            SparkTypes.StructField("src", SparkTypes.LongType(), nullable=False),
            SparkTypes.StructField("dst", SparkTypes.LongType(), nullable=False),

            # Sum of weights must be 1
            # This field will automatically be added by the SQL query
            # SparkTypes.StructField("weight", SparkTypes.FloatType(), nullable=True)
        ])

        # TODO?!
        if self.args.get("shuffle_partitions"):
            sqlc.setConf("spark.sql.shuffle.partitions", self.args["shuffle_partitions"])

        # Get all unique (host1 => host2) pairs
        new_df = sql(sqlc, """
            SELECT parse_url(url, "HOST") as d1, parse_url(CONCAT("http://", link), "HOST") as d2
            FROM (
                SELECT url, EXPLODE(external_links.href) as link FROM df
            ) as pairs
        """, {"df": df}).distinct()

        def iter_links_domain(record):
            """ Transforms Row(d1="x.com", d2="y.com") into tuple([int64 ID], [int64 ID]) """

            d1 = record["d1"]
            d2 = record["d2"]
            if not d1 or not d2:
                return []

            try:
                from_domain = _fast_make_domain_id(d1)
                to_domain = _fast_make_domain_id(d2)
            except Exception:  # pylint: disable=broad-except
                return []

            if from_domain == to_domain:
                return []
            else:
                return [(py2_long(from_domain), py2_long(to_domain))]

        rdd_couples = new_df.rdd.flatMap(iter_links_domain)

        edge_df = createDataFrame(sqlc, rdd_couples, edge_graph_schema).distinct()

        # After collecting all the unique (from_id, to_id) pairs, we add the weight of every edge
        # The current algorithm is naive: edge weight is equally split between all the links, with
        # the sum of all weights for a source domain always = 1.
        weights_df = sql(sqlc, """
            SELECT src id, cast(1 / count(*) as float) weight
            FROM edges
            GROUP BY src
        """, {"edges": edge_df})

        weighted_edge_df = sql(sqlc, """
            SELECT cast(src as long) src, cast(dst as long) dst, cast(weights.weight as float) weight
            FROM edges
            JOIN weights on edges.src = weights.id
        """, {"edges": edge_df, "weights": weights_df})

        coalesce = int(self.args.get("coalesce_edges") or self.args.get("coalesce", 1) or 0)
        if coalesce > 0:
            weighted_edge_df = weighted_edge_df.coalesce(coalesce)

        weighted_edge_df.write.parquet(os.path.join(self.args["path"], "edges"))
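
Because the weight of each edge is 1 / count(*) per source, the weights written to Parquet should sum to (roughly) 1.0 for every src. A quick check, with the path being an assumption for illustration:

# Sanity check on the edges dump: per-source weights should sum to about 1.0 under
# the naive even split described above.
edges = sqlc.read.parquet("/tmp/webgraph/edges")
edges.groupBy("src").sum("weight").show(5)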
Code Example #17
File: pagerank.py  Project: tarunjindl/cosr-back
    def custom_pagerank(self, sc, sqlc):
        """ Our own PageRank implementation, based on Spark SQL and Pregel-like behaviour """

        sc.setCheckpointDir("/tmp/spark-checkpoints")

        edge_df = sqlc.read.load(self.args.edges)

        if self.args.maxedges:
            edge_df = edge_df.limit(self.args.maxedges)

        vertex_df = sqlc.read.load(self.args.vertices)

        if self.args.maxvertices:
            vertex_df = vertex_df.limit(self.args.maxvertices)

        sqlc.setConf("spark.sql.shuffle.partitions", str(self.args.shuffle_partitions))

        # TODO: bootstrap with previous pageranks to accelerate convergence?
        ranks_df = sql(sqlc, """
            SELECT id, cast(0.15 as float) rank
            FROM vertices
        """, {"vertices": vertex_df})

        edge_df.persist()
        vertex_df.persist()
        print "Starting iterations. %s edges, %s vertices." % (edge_df.count(), vertex_df.count())

        iteration_tmpdir = None

        for iteration in range(self.args.maxiter):

            # We cast as strings because of https://issues.apache.org/jira/browse/SPARK-16802
            # TODO: remove them once it's fixed!
            #
            changed_ranks_df = sql(sqlc, """
                SELECT
                    cast(edges.dst as string) id,
                    cast(
                        0.15 + 0.85 * sum(ranks_src.rank * edges.weight)
                        as float
                    ) rank_new,
                    first(ranks_dst.rank) rank_old
                FROM edges
                LEFT OUTER JOIN ranks_src ON cast(edges.src as string) = cast(ranks_src.id as string)
                LEFT OUTER JOIN ranks_dst ON cast(edges.dst as string) = cast(ranks_dst.id as string)
                GROUP BY cast(edges.dst as string)
                HAVING ABS(rank_old - rank_new) > %s
            """ % self.args.precision, {"ranks_src": ranks_df, "ranks_dst": ranks_df, "edges": edge_df})

            # Every N iterations, we check if we got below the tolerance level.
            if (self.args.tol >= 0 or self.args.stats > 0) and (iteration % self.args.stats == 0):

                changed_ranks_df.persist()

                stats_df = sql(sqlc, """
                    SELECT
                        sum(abs(rank_new - rank_old)) as sum_diff,
                        count(*) as count_diff,
                        min(abs(rank_new - rank_old)) as min_diff,
                        max(abs(rank_new - rank_old)) as max_diff,
                        avg(abs(rank_new - rank_old)) as avg_diff,
                        stddev(abs(rank_new - rank_old)) as stddev_diff
                    FROM changes
                """, {"changes": changed_ranks_df})

                stats = stats_df.collect()[0]

                print "Iteration %s, %s changed ranks" % (iteration, stats["count_diff"])
                print "Stats: %s" % repr(stats)

                if (stats["count_diff"] == 0) or (stats["max_diff"] <= self.args.tol):
                    print "Max diff was below tolerance: stopping iterations!"
                    break

                top_changes_df = sql(sqlc, """
                    SELECT
                        (rank_new - rank_old) diff,
                        rank_old,
                        rank_new,
                        names.domain domain
                    FROM changes
                    JOIN names ON names.id = changes.id
                    ORDER BY abs(rank_new - rank_old) DESC
                """, {"changes": changed_ranks_df, "names": vertex_df})

                print "Top 20 diffs"
                print "\n".join([
                    "%3.3f (%3.3f => %3.3f) %s " % x
                    for x in top_changes_df.limit(20).collect()
                ])

            new_ranks_df = sql(sqlc, """
                SELECT ranks.id id, COALESCE(changed_ranks.rank_new, ranks.rank) rank
                FROM ranks
                LEFT JOIN changed_ranks ON cast(changed_ranks.id as string) = cast(ranks.id as string)
            """, {"ranks": ranks_df, "changed_ranks": changed_ranks_df})

            # At this point we need to break the RDD dependency chain
            # Writing & loading Parquet seems to be more efficient than checkpointing the RDD.

            iteration_tmpdir_previous = iteration_tmpdir
            iteration_tmpdir = os.path.join(self.args.tmpdir, "iter_%s" % iteration)

            new_ranks_df.write.parquet(iteration_tmpdir)

            # S3 in us-east-1 should support read-after-write consistency since 2015
            # but we still have transient errors
            self.wait_for_tmpdir(iteration_tmpdir)

            new_ranks_df.unpersist()
            ranks_df.unpersist()
            changed_ranks_df.unpersist()

            ranks_df = sqlc.read.load(iteration_tmpdir)

            if iteration_tmpdir_previous is not None:
                self.clean_tmpdir(directory=iteration_tmpdir_previous)

        # No more need for the edges after iterations
        edge_df.unpersist()

        final_df = sql(sqlc, """
            SELECT CONCAT(names.domain, ' ', ranks.rank) r
            FROM ranks
            JOIN names ON cast(names.id as string) = cast(ranks.id as string)
            ORDER BY ranks.rank DESC
        """, {"names": vertex_df, "ranks": ranks_df})

        if self.args.dump:

            final_df.coalesce(1).write.text(
                self.args.dump,
                compression="gzip" if self.args.gzip else "none"
            )

        else:
            print(final_df.rdd.collect())
Code Example #18
    def save_edge_graph(self, sqlc, df):
        """ Transforms a document metadata DataFrame into a Parquet dump of the edges of the webgraph """

        edge_graph_schema = SparkTypes.StructType([
            SparkTypes.StructField("src",
                                   SparkTypes.LongType(),
                                   nullable=False),
            SparkTypes.StructField("dst",
                                   SparkTypes.LongType(),
                                   nullable=False),

            # Sum of weights must be 1
            # This field will automatically be added by the SQL query
            # SparkTypes.StructField("weight", SparkTypes.FloatType(), nullable=True)
        ])

        # TODO?!
        if self.args.get("shuffle_partitions"):
            sqlc.setConf("spark.sql.shuffle.partitions",
                         self.args["shuffle_partitions"])

        # Get all unique (host1 => host2) pairs
        new_df = sql(
            sqlc, """
            SELECT parse_url(url, "HOST") as d1, parse_url(CONCAT("http://", link), "HOST") as d2
            FROM (
                SELECT url, EXPLODE(external_links.href) as link FROM df
            ) as pairs
        """, {
                "df": df
            }).distinct()

        def iter_links_domain(record):
            """ Transforms Row(d1="x.com", d2="y.com") into tuple([int64 ID], [int64 ID]) """

            d1 = record["d1"]
            d2 = record["d2"]
            if not d1 or not d2:
                return []

            try:
                from_domain = _fast_make_domain_id(d1)
                to_domain = _fast_make_domain_id(d2)
            except Exception:  # pylint: disable=broad-except
                return []

            if from_domain == to_domain:
                return []
            else:
                return [(py2_long(from_domain), py2_long(to_domain))]

        rdd_couples = new_df.rdd.flatMap(iter_links_domain)

        edge_df = createDataFrame(sqlc, rdd_couples,
                                  edge_graph_schema).distinct()

        # After collecting all the unique (from_id, to_id) pairs, we add the weight of every edge
        # The current algorithm is naive: edge weight is equally split between all the links, with
        # the sum of all weights for a source domain always = 1.
        weights_df = sql(
            sqlc, """
            SELECT src id, cast(1 / count(*) as float) weight
            FROM edges
            GROUP BY src
        """, {"edges": edge_df})

        weighted_edge_df = sql(
            sqlc, """
            SELECT cast(src as long) src, cast(dst as long) dst, cast(weights.weight as float) weight
            FROM edges
            JOIN weights on edges.src = weights.id
        """, {
                "edges": edge_df,
                "weights": weights_df
            })

        coalesce = int(
            self.args.get("coalesce_edges") or self.args.get("coalesce", 1)
            or 0)
        if coalesce > 0:
            weighted_edge_df = weighted_edge_df.coalesce(coalesce)

        weighted_edge_df.write.parquet(
            os.path.join(self.args["output"], "edges"))
Code Example #19
    def save_vertex_graph(self, sqlc, df):
        """ Transforms a document metadata DataFrame into a Parquet dump of the vertices of the webgraph """

        vertex_graph_schema = SparkTypes.StructType([
            SparkTypes.StructField("id", SparkTypes.LongType(),
                                   nullable=False),
            SparkTypes.StructField("domain",
                                   SparkTypes.StringType(),
                                   nullable=False)
        ])

        # TODO ?!
        if self.args.get("shuffle_partitions"):
            sqlc.setConf("spark.sql.shuffle.partitions",
                         self.args["shuffle_partitions"])

        # We collect all unique domains from the page URLs & destination of all external links
        d1_df = sql(
            sqlc, """
            SELECT parse_url(url, "HOST") as domain from df
        """, {
                "df": df
            }).distinct()

        d2_df = sql(
            sqlc, """
            SELECT parse_url(CONCAT("http://", link), "HOST") as domain
            FROM (
                SELECT EXPLODE(external_links.href) as link FROM df
            ) as pairs
        """, {"df": df})

        all_domains_df = d1_df.unionAll(d2_df).distinct()

        def iter_domain(record):
            """ Transforms Row(domain=www.example.com) into tuple([int64 ID], "example.com") """

            domain = record["domain"]
            if not domain or not domain.strip():
                return []

            name = URL("http://" + domain).normalized_domain

            try:
                _id = _fast_make_domain_id(name)
            except Exception:  # pylint: disable=broad-except
                return []

            return [(py2_long(_id), str(name))]

        rdd_domains = all_domains_df.rdd.flatMap(iter_domain)

        vertex_df = createDataFrame(sqlc, rdd_domains,
                                    vertex_graph_schema).distinct()

        coalesce = int(
            self.args.get("coalesce_vertices") or self.args.get("coalesce", 1)
            or 0)
        if coalesce > 0:
            vertex_df = vertex_df.coalesce(coalesce)

        vertex_df.write.parquet(os.path.join(self.args["output"], "vertices"))