Example #1
    def index_from_source(self, source, partition, _indexer, **kwargs):
        """ Indexes all documents from a source """

        for document in source.iter_documents(partition):

            print("Indexing", document.source_url.url)

            metadata = {}

            exec_hook(self.plugins, "document_pre_index", document, metadata)

            metadata.update(_indexer.index_document(document, **kwargs))

            exec_hook(self.plugins, "document_post_index", document, metadata)

            yield metadata
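
For context, here is a minimal, self-contained sketch of how such a generator is typically consumed. The stub source and indexer below are purely illustrative placeholders, not part of the project's API, and the plugin hooks are omitted:

    # Illustrative stubs standing in for the project's source and indexer objects.
    class StubSource(object):
        def iter_documents(self, partition):
            for url in ["http://example.com/a", "http://example.com/b"]:
                yield {"url": url}

    class StubIndexer(object):
        def index_document(self, document):
            return {"url": document["url"], "rank": 0.5}

    def index_from_source(source, partition, indexer):
        # Same shape as the example above, minus the plugin hooks.
        for document in source.iter_documents(partition):
            metadata = {}
            metadata.update(indexer.index_document(document))
            yield metadata

    for metadata in index_from_source(StubSource(), 0, StubIndexer()):
        print(metadata)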
Example #2
    def index_from_source(self, source, _indexer, **kwargs):
        """ Indexes all documents from a source """

        # plugins = load_plugins(plugin_list)

        for document in source.iter_documents():

            print "Indexing", document.source_url.url

            metadata = {}

            exec_hook(self.plugins, "document_pre_index", document, metadata)

            metadata.update(_indexer.index_document(document, **kwargs))

            exec_hook(self.plugins, "document_post_index", document, metadata)

            yield metadata
Example #3
    def run_job(self, sc, sqlc):

        """ Execute our indexing pipeline with a Spark Context """

        self.plugins = load_plugins(self.args.plugin)
        self.accumulator_indexed = sc.accumulator(0)

        maxdocs = {}

        # What fields will be sent to Spark
        document_schema_columns = [
            SparkTypes.StructField("id", SparkTypes.LongType(), nullable=False),
            SparkTypes.StructField("url", SparkTypes.StringType(), nullable=False),
            SparkTypes.StructField("rank", SparkTypes.FloatType(), nullable=False)
        ]

        # Some plugins need to add new fields to the schema
        exec_hook(
            self.plugins, "spark_pipeline_init", sc, sqlc, document_schema_columns, indexer
        )

        exec_hook(self.plugins, "document_schema", document_schema_columns)

        document_schema = SparkTypes.StructType(document_schema_columns)

        # Spark DataFrame containing everything we indexed
        all_documents = None

        executed_pipeline = False

        for source_spec in self.args.source:

            source_documents = None

            source_name, source_args = parse_plugin_cli_args(source_spec)
            maxdocs[source_spec] = source_args.get("maxdocs")

            if source_name == "commoncrawl":
                partitions = list_commoncrawl_warc_filenames(
                    limit=source_args.get("limit"),
                    skip=source_args.get("skip"),
                    version=source_args.get("version")
                )

                def index_partition(filename):
                    ds = load_source("commoncrawl", {
                        "file": filename,
                        "plugins": self.plugins,
                        "maxdocs": maxdocs[source_spec]  # pylint: disable=cell-var-from-loop
                    })
                    return self.index_documents(ds)

            elif source_name == "warc":

                # We have been given a .txt file with a list of WARC file paths
                if source_args["file"].endswith(".txt"):
                    with open(source_args["file"], "rb") as f:
                        partitions = [x.strip() for x in f.readlines()]

                # Single WARC file path
                else:
                    partitions = [source_args["file"]]

                def index_partition(filename):
                    ds = load_source("webarchive", {
                        "file": filename,
                        "plugins": self.plugins,
                        "maxdocs": maxdocs[source_spec]  # pylint: disable=cell-var-from-loop
                    })
                    return self.index_documents(ds)

            elif source_name == "wikidata":

                partitions = ["__wikidata_single_dump__"]

                def index_partition(_):
                    ds = load_source("wikidata", {
                        "maxdocs": maxdocs[source_spec],  # pylint: disable=cell-var-from-loop
                        "plugins": self.plugins
                    })
                    return self.index_documents(ds)

            elif source_name == "corpus":

                partitions = source_args.get("docs", ["__from_file__"])
                path = source_args.get("path")

                def index_partition(doc):

                    ds = load_source("corpus", {
                        "maxdocs": maxdocs[source_spec],  # pylint: disable=cell-var-from-loop
                        "docs": [doc],
                        "path": path,  # pylint: disable=cell-var-from-loop
                        "plugins": self.plugins
                    })
                    return self.index_documents(ds)

            elif source_name == "url":

                partitions = source_args.get("urls") or [source_args["url"]]

                def index_partition(url):
                    ds = load_source("url", {
                        "urls": [url],
                        "plugins": self.plugins
                    })
                    return self.index_documents(ds)

            elif source_name == "parquet":

                # Read an intermediate dump of document metadata generated by
                # --plugin plugins.dump.DocumentMetadataParquet
                df = sqlc.read.parquet(source_args["path"])

                if maxdocs[source_spec]:
                    df = df.limit(int(maxdocs[source_spec]))

                if source_args.get("fields"):
                    df = df.select(source_args["fields"].split("+"))

                source_documents = df

            # Split indexing of each partition in Spark workers
            if source_documents is None:

                executed_pipeline = False
                rdd = sc \
                    .parallelize(partitions, len(partitions)) \
                    .flatMap(index_partition)

                source_documents = createDataFrame(sqlc, rdd, document_schema)

            if source_args.get("persist") == "1":
                source_documents.persist(StorageLevel.MEMORY_AND_DISK)

            # The count() here executes the pipeline so far, allowing sources to be processed sequentially
            if source_args.get("block") == "1":
                executed_pipeline = True
                print("Source %s done, indexed %s documents (%s total so far)" % (
                    source_name, source_documents.rdd.count(), self.accumulator_indexed.value
                ))

            if all_documents is None:
                all_documents = source_documents
            else:
                all_documents = all_documents.unionAll(source_documents)

        done_actions = exec_hook(
            self.plugins, "spark_pipeline_action", sc, sqlc, all_documents, indexer
        )

        # If no action was done, we need to do a count() to actually execute the spark pipeline
        if any(done_actions):
            executed_pipeline = True

        if not executed_pipeline:
            print "Total documents: %s" % all_documents.rdd.count()
Example #4
    def run_job(self, sc, sqlc):

        """ Execute our indexing pipeline with a Spark Context """

        self.plugins = load_plugins(self.args.plugin)
        self.accumulator_indexed = sc.accumulator(0)

        # What fields will be sent to Spark
        document_schema_columns = [
            SparkTypes.StructField("id", SparkTypes.LongType(), nullable=False),
            SparkTypes.StructField("url", SparkTypes.StringType(), nullable=False),
            SparkTypes.StructField("rank", SparkTypes.FloatType(), nullable=False)
        ]

        # Some plugins need to add new fields to the schema
        exec_hook(
            self.plugins, "spark_pipeline_init", sc, sqlc, document_schema_columns, indexer
        )

        exec_hook(self.plugins, "document_schema", document_schema_columns)

        document_schema = SparkTypes.StructType(document_schema_columns)

        # Spark DataFrame containing everything we indexed
        all_documents = None

        executed_pipeline = False

        for source_spec in self.args.source:

            source_name, source_args = parse_plugin_cli_args(source_spec)

            ds = load_source(source_name, source_args, plugins=self.plugins)

            source_documents, needs_execution = self.get_indexed_documents_from_source(
                sc, sqlc, document_schema, ds
            )

            executed_pipeline = executed_pipeline and (not needs_execution)

            #
            # At this point, we have a DataFrame with every document from this source.
            #

            if source_args.get("persist") == "1":
                source_documents.persist(StorageLevel.MEMORY_AND_DISK)

            # The count() here executes the pipeline so far, allowing sources to be processed sequentially
            if source_args.get("block") == "1":
                executed_pipeline = True
                print("Source %s done, indexed %s documents (%s total so far)" % (
                    source_name, source_documents.rdd.count(), self.accumulator_indexed.value
                ))

            if all_documents is None:
                all_documents = source_documents
            else:
                all_documents = all_documents.unionAll(source_documents)

        #
        # At this point, we have a DataFrame with all documents from all sources.
        #

        done_actions = exec_hook(
            self.plugins, "spark_pipeline_action", sc, sqlc, all_documents, indexer
        )

        # If no action was done, we need to do a count() to actually
        # execute ("materialize") the spark pipeline
        if any(done_actions):
            executed_pipeline = True

        if not executed_pipeline:
            print("Total documents: %s" % all_documents.rdd.count())