Example #1
    def save_vertex_graph(self, sqlc, df):
        """ Transforms a document metadata DataFrame into a Parquet dump of the vertices of the webgraph """

        vertex_graph_schema = SparkTypes.StructType([
            SparkTypes.StructField("id", SparkTypes.LongType(), nullable=False),
            SparkTypes.StructField("domain", SparkTypes.StringType(), nullable=False)
        ])

        # TODO ?!
        if self.args.get("shuffle_partitions"):
            sqlc.setConf("spark.sql.shuffle.partitions", self.args["shuffle_partitions"])

        # We collect all unique domains from the page URLs and from the destinations of all external links
        d1_df = sql(sqlc, """
            SELECT parse_url(url, "HOST") as domain from df
        """, {"df": df}).distinct()

        d2_df = sql(sqlc, """
            SELECT parse_url(link, "HOST") as domain
            FROM (
                SELECT EXPLODE(external_links.href) as link FROM df
            ) as pairs
        """, {"df": df})

        all_domains_df = d1_df.unionAll(d2_df).distinct()

        def iter_domain(record):
            """ Transforms Row(domain=www.example.com) into tuple([int64 ID], "example.com") """

            domain = record["domain"]
            if not domain or not domain.strip():
                return []

            name = URL("http://" + domain).normalized_domain

            try:
                _id = _fast_make_domain_id(name)
            except Exception:  # pylint: disable=broad-except
                return []

            return [(long(_id), str(name))]

        rdd_domains = all_domains_df.rdd.flatMap(iter_domain)

        vertex_df = createDataFrame(sqlc, rdd_domains, vertex_graph_schema).distinct()

        if self.args.get("coalesce_vertices") or self.args.get("coalesce"):
            vertex_df = vertex_df.coalesce(
                int(self.args.get("coalesce_vertices") or self.args.get("coalesce"))
            )

        vertex_df.write.parquet(os.path.join(self.args["path"], "vertices"))
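
The sql() helper used throughout these examples is not shown here. A minimal sketch of what it presumably does, assuming it simply exposes each DataFrame in the mapping as a temporary table under the given name before running the query (the helper name comes from the calls above; its body is an assumption, not the project's actual code):

    def sql(sqlc, query, tables=None):
        """ Hypothetical sketch: register each DataFrame as a temp table, then run the query """
        for name, table_df in (tables or {}).items():
            # Spark 1.x SQLContext API; on Spark 2.x+ use table_df.createOrReplaceTempView(name)
            sqlc.registerDataFrameAsTable(table_df, name)
        return sqlc.sql(query)
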
Example #2
    def get_indexed_documents_from_source(self, sc, sqlc, document_schema, source):
        """ Returns indexed documents from a source """

        if source.already_parsed:

            # Some sources return already parsed documents
            return source.get_documents(sqlc), False

        else:

            partitions = source.get_partitions()

            rdd = sc \
                .parallelize(partitions, len(partitions)) \
                .flatMap(lambda partition: self.index_documents(source, partition))

            source_documents = createDataFrame(sqlc, rdd, document_schema)

            return source_documents, True
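
The source object passed into this method is only visible through its call sites. A sketch of the interface it assumes (the attribute and method names are taken from the code above; everything else is hypothetical):

    class DocumentSource(object):
        """ Hypothetical sketch of the interface get_indexed_documents_from_source relies on """

        # True when the source can hand back an already-parsed DataFrame directly
        already_parsed = False

        def get_partitions(self):
            """ Returns a list of partition descriptors (e.g. WARC file paths), one per Spark task """
            raise NotImplementedError

        def get_documents(self, sqlc):
            """ Used only when already_parsed is True: returns a DataFrame of parsed documents """
            raise NotImplementedError
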
Example #3
    def save_edge_graph(self, sqlc, df):
        """ Transforms a document metadata DataFrame into a Parquet dump of the edges of the webgraph """

        edge_graph_schema = SparkTypes.StructType([
            SparkTypes.StructField("src", SparkTypes.LongType(), nullable=False),
            SparkTypes.StructField("dst", SparkTypes.LongType(), nullable=False),

            # Sum of weights must be 1
            # This field will automatically be added by the SQL query
            # SparkTypes.StructField("weight", SparkTypes.FloatType(), nullable=True)
        ])

        # TODO?!
        if self.args.get("shuffle_partitions"):
            sqlc.setConf("spark.sql.shuffle.partitions", self.args["shuffle_partitions"])

        # Get all unique (host1 => host2) pairs
        new_df = sql(sqlc, """
            SELECT parse_url(url, "HOST") as d1, parse_url(CONCAT("http://", link), "HOST") as d2
            FROM (
                SELECT url, EXPLODE(external_links.href) as link FROM df
            ) as pairs
        """, {"df": df}).distinct()

        def iter_links_domain(record):
            """ Transforms Row(d1="x.com", d2="y.com") into tuple([int64 ID], [int64 ID]) """

            d1 = record["d1"]
            d2 = record["d2"]
            if not d1 or not d2:
                return []

            try:
                from_domain = _fast_make_domain_id(d1)
                to_domain = _fast_make_domain_id(d2)
            except Exception:  # pylint: disable=broad-except
                return []

            if from_domain == to_domain:
                return []
            else:
                return [(py2_long(from_domain), py2_long(to_domain))]

        rdd_couples = new_df.rdd.flatMap(iter_links_domain)

        edge_df = createDataFrame(sqlc, rdd_couples, edge_graph_schema).distinct()

        # After collecting all the unique (from_id, to_id) pairs, we add the weight of every edge.
        # The current algorithm is naive: the weight is split equally between all the outgoing
        # links, so the weights of a source domain always sum to 1.
        weights_df = sql(sqlc, """
            SELECT src id, cast(1 / count(*) as float) weight
            FROM edges
            GROUP BY src
        """, {"edges": edge_df})

        weighted_edge_df = sql(sqlc, """
            SELECT cast(src as long) src, cast(dst as long) dst, cast(weights.weight as float) weight
            FROM edges
            JOIN weights on edges.src = weights.id
        """, {"edges": edge_df, "weights": weights_df})

        coalesce = int(self.args.get("coalesce_edges") or self.args.get("coalesce", 1) or 0)
        if coalesce > 0:
            weighted_edge_df = weighted_edge_df.coalesce(coalesce)

        weighted_edge_df.write.parquet(os.path.join(self.args["path"], "edges"))
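
A toy, standalone illustration of the weighting rule described in the comments above: every outgoing edge of a source gets weight 1 / outdegree(src), so the weights of each source sum to 1 (plain Python, not part of the pipeline):

    from collections import Counter

    edges = [(1, 2), (1, 3), (1, 4), (2, 3)]                             # (src, dst) pairs
    outdegree = Counter(src for src, _ in edges)                         # Counter({1: 3, 2: 1})
    weighted = [(src, dst, 1.0 / outdegree[src]) for src, dst in edges]
    # -> [(1, 2, 0.333...), (1, 3, 0.333...), (1, 4, 0.333...), (2, 3, 1.0)]
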
Example #4
    def run_job(self, sc, sqlc):

        """ Execute our indexing pipeline with a Spark Context """

        self.plugins = load_plugins(self.args.plugin)
        self.accumulator_indexed = sc.accumulator(0)

        maxdocs = {}

        # What fields will be sent to Spark
        document_schema_columns = [
            SparkTypes.StructField("id", SparkTypes.LongType(), nullable=False),
            SparkTypes.StructField("url", SparkTypes.StringType(), nullable=False),
            SparkTypes.StructField("rank", SparkTypes.FloatType(), nullable=False)
        ]

        # Some plugins need to add new fields to the schema
        exec_hook(
            self.plugins, "spark_pipeline_init", sc, sqlc, document_schema_columns, indexer
        )

        exec_hook(self.plugins, "document_schema", document_schema_columns)

        document_schema = SparkTypes.StructType(document_schema_columns)

        # Spark DataFrame containing everything we indexed
        all_documents = None

        executed_pipeline = False

        for source_spec in self.args.source:

            source_documents = None

            source_name, source_args = parse_plugin_cli_args(source_spec)
            maxdocs[source_spec] = source_args.get("maxdocs")

            if source_name == "commoncrawl":
                partitions = list_commoncrawl_warc_filenames(
                    limit=source_args.get("limit"),
                    skip=source_args.get("skip"),
                    version=source_args.get("version")
                )

                def index_partition(filename):
                    ds = load_source("commoncrawl", {
                        "file": filename,
                        "plugins": self.plugins,
                        "maxdocs": maxdocs[source_spec]  # pylint: disable=cell-var-from-loop
                    })
                    return self.index_documents(ds)

            elif source_name == "warc":

                # We have been given a .txt file with a list of WARC file paths
                if source_args["file"].endswith(".txt"):
                    with open(source_args["file"], "rb") as f:
                        partitions = [x.strip() for x in f.readlines()]

                # Single WARC file path
                else:
                    partitions = [source_args["file"]]

                def index_partition(filename):
                    ds = load_source("webarchive", {
                        "file": filename,
                        "plugins": self.plugins,
                        "maxdocs": maxdocs[source_spec]  # pylint: disable=cell-var-from-loop
                    })
                    return self.index_documents(ds)

            elif source_name == "wikidata":

                partitions = ["__wikidata_single_dump__"]

                def index_partition(_):
                    ds = load_source("wikidata", {
                        "maxdocs": maxdocs[source_spec],  # pylint: disable=cell-var-from-loop
                        "plugins": self.plugins
                    })
                    return self.index_documents(ds)

            elif source_name == "corpus":

                partitions = source_args.get("docs", ["__from_file__"])
                path = source_args.get("path")

                def index_partition(doc):

                    ds = load_source("corpus", {
                        "maxdocs": maxdocs[source_spec],  # pylint: disable=cell-var-from-loop
                        "docs": [doc],
                        "path": path,  # pylint: disable=cell-var-from-loop
                        "plugins": self.plugins
                    })
                    return self.index_documents(ds)

            elif source_name == "url":

                partitions = source_args.get("urls") or [source_args["url"]]

                def index_partition(url):
                    ds = load_source("url", {
                        "urls": [url],
                        "plugins": self.plugins
                    })
                    return self.index_documents(ds)

            elif source_name == "parquet":

                # Read an intermediate dump of document metadata generated by
                # --plugin plugins.dump.DocumentMetadataParquet
                df = sqlc.read.parquet(source_args["path"])

                if maxdocs[source_spec]:
                    df = df.limit(int(maxdocs[source_spec]))

                if source_args.get("fields"):
                    df = df.select(source_args["fields"].split("+"))

                source_documents = df

            # Split indexing of each partition in Spark workers
            if source_documents is None:

                executed_pipeline = False
                rdd = sc \
                    .parallelize(partitions, len(partitions)) \
                    .flatMap(index_partition)

                source_documents = createDataFrame(sqlc, rdd, document_schema)

            if source_args.get("persist") == "1":
                source_documents.persist(StorageLevel.MEMORY_AND_DISK)

            # The count() here executes the pipeline built so far, so that sources can be processed sequentially
            if source_args.get("block") == "1":
                executed_pipeline = True
                print "Source %s done, indexed %s documents (%s total so far)" % (
                    source_name, source_documents.rdd.count(), self.accumulator_indexed.value
                )

            if all_documents is None:
                all_documents = source_documents
            else:
                all_documents = all_documents.unionAll(source_documents)

        done_actions = exec_hook(
            self.plugins, "spark_pipeline_action", sc, sqlc, all_documents, indexer
        )

        # If no action was done, we need to do a count() to actually execute the Spark pipeline
        if any(done_actions):
            executed_pipeline = True

        if not executed_pipeline:
            print "Total documents: %s" % all_documents.rdd.count()
Example #5
    def save_edge_graph(self, sqlc, df):
        """ Transforms a document metadata DataFrame into a Parquet dump of the edges of the webgraph """

        edge_graph_schema = SparkTypes.StructType([
            SparkTypes.StructField("src",
                                   SparkTypes.LongType(),
                                   nullable=False),
            SparkTypes.StructField("dst",
                                   SparkTypes.LongType(),
                                   nullable=False),

            # Sum of weights must be 1
            # This field will automatically be added by the SQL query
            # SparkTypes.StructField("weight", SparkTypes.FloatType(), nullable=True)
        ])

        # TODO?!
        if self.args.get("shuffle_partitions"):
            sqlc.setConf("spark.sql.shuffle.partitions",
                         self.args["shuffle_partitions"])

        # Get all unique (host1 => host2) pairs
        new_df = sql(
            sqlc, """
            SELECT parse_url(url, "HOST") as d1, parse_url(CONCAT("http://", link), "HOST") as d2
            FROM (
                SELECT url, EXPLODE(external_links.href) as link FROM df
            ) as pairs
        """, {
                "df": df
            }).distinct()

        def iter_links_domain(record):
            """ Transforms Row(d1="x.com", d2="y.com") into tuple([int64 ID], [int64 ID]) """

            d1 = record["d1"]
            d2 = record["d2"]
            if not d1 or not d2:
                return []

            try:
                from_domain = _fast_make_domain_id(d1)
                to_domain = _fast_make_domain_id(d2)
            except Exception:  # pylint: disable=broad-except
                return []

            if from_domain == to_domain:
                return []
            else:
                return [(py2_long(from_domain), py2_long(to_domain))]

        rdd_couples = new_df.rdd.flatMap(iter_links_domain)

        edge_df = createDataFrame(sqlc, rdd_couples,
                                  edge_graph_schema).distinct()

        # After collecting all the unique (from_id, to_id) pairs, we add the weight of every edge.
        # The current algorithm is naive: the weight is split equally between all the outgoing
        # links, so the weights of a source domain always sum to 1.
        weights_df = sql(
            sqlc, """
            SELECT src id, cast(1 / count(*) as float) weight
            FROM edges
            GROUP BY src
        """, {"edges": edge_df})

        weighted_edge_df = sql(
            sqlc, """
            SELECT cast(src as long) src, cast(dst as long) dst, cast(weights.weight as float) weight
            FROM edges
            JOIN weights on edges.src = weights.id
        """, {
                "edges": edge_df,
                "weights": weights_df
            })

        coalesce = int(
            self.args.get("coalesce_edges") or self.args.get("coalesce", 1)
            or 0)
        if coalesce > 0:
            weighted_edge_df = weighted_edge_df.coalesce(coalesce)

        weighted_edge_df.write.parquet(
            os.path.join(self.args["output"], "edges"))
Example #6
    def save_vertex_graph(self, sqlc, df):
        """ Transforms a document metadata DataFrame into a Parquet dump of the vertices of the webgraph """

        vertex_graph_schema = SparkTypes.StructType([
            SparkTypes.StructField("id", SparkTypes.LongType(),
                                   nullable=False),
            SparkTypes.StructField("domain",
                                   SparkTypes.StringType(),
                                   nullable=False)
        ])

        # TODO ?!
        if self.args.get("shuffle_partitions"):
            sqlc.setConf("spark.sql.shuffle.partitions",
                         self.args["shuffle_partitions"])

        # We collect all unique domains from the page URLs and from the destinations of all external links
        d1_df = sql(
            sqlc, """
            SELECT parse_url(url, "HOST") as domain from df
        """, {
                "df": df
            }).distinct()

        d2_df = sql(
            sqlc, """
            SELECT parse_url(CONCAT("http://", link), "HOST") as domain
            FROM (
                SELECT EXPLODE(external_links.href) as link FROM df
            ) as pairs
        """, {"df": df})

        all_domains_df = d1_df.unionAll(d2_df).distinct()

        def iter_domain(record):
            """ Transforms Row(domain=www.example.com) into tuple([int64 ID], "example.com") """

            domain = record["domain"]
            if not domain or not domain.strip():
                return []

            name = URL("http://" + domain).normalized_domain

            try:
                _id = _fast_make_domain_id(name)
            except Exception:  # pylint: disable=broad-except
                return []

            return [(py2_long(_id), str(name))]

        rdd_domains = all_domains_df.rdd.flatMap(iter_domain)

        vertex_df = createDataFrame(sqlc, rdd_domains,
                                    vertex_graph_schema).distinct()

        coalesce = int(
            self.args.get("coalesce_vertices") or self.args.get("coalesce", 1)
            or 0)
        if coalesce > 0:
            vertex_df = vertex_df.coalesce(coalesce)

        vertex_df.write.parquet(os.path.join(self.args["output"], "vertices"))
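
One possible way to consume the two Parquet dumps written above: the column names chosen ("id" on vertices, "src"/"dst"/"weight" on edges) match what the GraphFrames package expects, so the graph can be rebuilt roughly like this. This is only a usage sketch; graphframes and the output path are assumptions, not part of the original code, and sqlc is the same SQLContext the methods above receive:

    import os
    from graphframes import GraphFrame

    output_path = "/tmp/webgraph"                 # placeholder for self.args["output"]
    vertices = sqlc.read.parquet(os.path.join(output_path, "vertices"))
    edges = sqlc.read.parquet(os.path.join(output_path, "edges"))

    graph = GraphFrame(vertices, edges)
    ranks = graph.pageRank(resetProbability=0.15, maxIter=10)
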