Code example #1
class PersistGrobidRefsWorker(SandcrawlerWorker):
    """
    Simple persist worker to backfill GROBID references into postgresql
    locally. Consumes the JSON output from GROBID CrossrefRefsWorker.
    """
    def __init__(self, db_url: str, **kwargs):
        super().__init__(**kwargs)
        self.db = SandcrawlerPostgresClient(db_url)
        self.cur = self.db.conn.cursor()

    def process(self, record: Any, key: Optional[str] = None) -> Any:
        """Only do batches (as transactions)"""
        raise NotImplementedError

    def push_batch(self, batch: list) -> list:
        self.counts["total"] += len(batch)

        refs_batch = []
        for record in batch:
            assert record["source"]
            assert record["source_id"]
            refs_batch.append(record)

        resp = self.db.insert_grobid_refs(self.cur, refs_batch)
        if len(refs_batch) < len(batch):
            self.counts["skip"] += len(batch) - len(refs_batch)
        self.counts["insert-grobid_refs"] += resp[0]
        self.counts["update-grobid_refs"] += resp[1]

        self.db.commit()
        return []
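A minimal usage sketch for this worker might look like the following. The import path, the database DSN, and the extra record field (`refs_json`) are assumptions for illustration; only the `source` and `source_id` fields are required by the code above.

# Hypothetical usage sketch; assumes a reachable postgresql instance and that
# the worker is importable from the sandcrawler package as shown.
from sandcrawler.persist import PersistGrobidRefsWorker

worker = PersistGrobidRefsWorker(
    db_url="postgres://sandcrawler:password@localhost:5432/sandcrawler")

# Each record must carry at least 'source' and 'source_id'; any other fields
# (here an assumed 'refs_json' payload) pass through to insert_grobid_refs().
batch = [
    {"source": "crossref", "source_id": "10.1234/example-doi", "refs_json": []},
]
worker.push_batch(batch)
print(worker.counts)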
Code example #2
def __init__(self,
             db_url: str,
             grobid_client: Optional[GrobidClient],
             parse_refs: bool = True,
             **kwargs):
    super().__init__(**kwargs)
    self.db = SandcrawlerPostgresClient(db_url)
    self.cur = self.db.conn.cursor()
    if grobid_client:
        self.grobid_client = grobid_client
    else:
        self.grobid_client = GrobidClient()
    self.parse_refs = parse_refs
Code example #3
class PersistCdxWorker(SandcrawlerWorker):
    def __init__(self, db_url: str, **kwargs):
        super().__init__()
        self.db = SandcrawlerPostgresClient(db_url)
        self.cur = self.db.conn.cursor()

    def process(self, record: Any, key: Optional[str] = None) -> Any:
        """Only do batches (as transactions)"""
        raise NotImplementedError

    def push_batch(self, batch: list) -> list:
        self.counts["total"] += len(batch)
        # filter to full CDX lines, no liveweb
        cdx_batch = [
            r for r in batch if r.get("warc_path") and ("/" in r["warc_path"])
        ]
        resp = self.db.insert_cdx(self.cur, cdx_batch)
        if len(cdx_batch) < len(batch):
            self.counts["skip"] += len(batch) - len(cdx_batch)
        self.counts["insert-cdx"] += resp[0]
        self.counts["update-cdx"] += resp[1]
        self.db.commit()
        return []
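The only per-record filtering here is on `warc_path`: rows without a slash in the path (liveweb captures) or with no path at all are skipped. The short sketch below illustrates that check; the URLs and WARC paths are invented.

# Illustration of the warc_path filter above; field values are invented.
batch = [
    {"url": "https://example.com/a.pdf", "warc_path": "crawl/FOO-0000.warc.gz"},  # kept
    {"url": "https://example.com/b.pdf", "warc_path": "liveweb"},                 # skipped: no "/"
    {"url": "https://example.com/c.pdf"},                                         # skipped: no warc_path
]
cdx_batch = [r for r in batch if r.get("warc_path") and ("/" in r["warc_path"])]
assert [r["url"] for r in cdx_batch] == ["https://example.com/a.pdf"]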
Code example #4
class PersistPdfTrioWorker(SandcrawlerWorker):
    def __init__(self, db_url: str, **kwargs):
        super().__init__()
        self.db = SandcrawlerPostgresClient(db_url)
        self.cur = self.db.conn.cursor()

    def process(self, record: Any, key: Optional[str] = None) -> Any:
        """Only do batches (as transactions)"""
        raise NotImplementedError

    def push_batch(self, batch: list) -> list:
        self.counts["total"] += len(batch)

        batch = [
            r for r in batch
            if "pdf_trio" in r and r["pdf_trio"].get("status_code")
        ]
        for r in batch:
            # copy key (sha1hex) into sub-object
            r["pdf_trio"]["key"] = r["key"]
        pdftrio_batch = [r["pdf_trio"] for r in batch]
        resp = self.db.insert_pdftrio(self.cur,
                                      pdftrio_batch,
                                      on_conflict="update")
        self.counts["insert-pdftrio"] += resp[0]
        self.counts["update-pdftrio"] += resp[1]

        file_meta_batch = [
            r["file_meta"] for r in batch
            if r["pdf_trio"]["status"] == "success" and r.get("file_meta")
        ]
        resp = self.db.insert_file_meta(self.cur, file_meta_batch)
        self.counts["insert-file-meta"] += resp[0]
        self.counts["update-file-meta"] += resp[1]

        self.db.commit()
        return []
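The worker above expects each record to carry a `key` (sha1hex) and a `pdf_trio` sub-object with a `status_code`; records whose `pdf_trio["status"]` is "success" also have their `file_meta` persisted. A plausible record shape is sketched below; all values, and the `ensemble_score` field itself, are invented for illustration.

# Hypothetical input record for PersistPdfTrioWorker.push_batch(); values invented.
record = {
    "key": "da39a3ee5e6b4b0d3255bfef95601890afd80709",  # sha1hex, copied into pdf_trio["key"]
    "pdf_trio": {
        "status": "success",     # only "success" records also persist file_meta
        "status_code": 200,      # records without a status_code are dropped
        "ensemble_score": 0.97,  # assumed classifier output field
    },
    "file_meta": {
        "sha1hex": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
        "mimetype": "application/pdf",
        "size_bytes": 12345,
    },
}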
Code example #5
def __init__(self, db_url: str, **kwargs):
    super().__init__()
    self.s3 = SandcrawlerMinioClient(
        host_url=kwargs.get("s3_url", "localhost:9000"),
        access_key=kwargs["s3_access_key"],
        secret_key=kwargs["s3_secret_key"],
        default_bucket=kwargs["s3_bucket"],
    )
    self.s3_only = kwargs.get("s3_only", False)
    self.db_only = kwargs.get("db_only", False)
    assert not (self.s3_only and
                self.db_only), "Only one of s3_only and db_only allowed"
    if not self.s3_only:
        self.db: Optional[
            SandcrawlerPostgresClient] = SandcrawlerPostgresClient(db_url)
        self.cur: Optional[
            psycopg2.extensions.cursor] = self.db.conn.cursor()
    else:
        self.db = None
        self.cur = None
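The snippet does not show which worker class this constructor belongs to, so the sketch below uses a placeholder name; it only illustrates the keyword arguments the constructor reads and the `s3_only`/`db_only` exclusivity check. The DSN, S3 endpoint, credentials, and bucket name are all invented.

# Placeholder class name; the S3 settings and DSN are invented values.
worker = SomePersistWorker(
    db_url="postgres://sandcrawler:password@localhost:5432/sandcrawler",
    s3_url="localhost:9000",
    s3_access_key="minioadmin",
    s3_secret_key="minioadmin",
    s3_bucket="sandcrawler",
    s3_only=False,  # persist to both S3 and postgresql
    db_only=False,  # setting both flags to True would trip the assert
)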
Code example #6
def __init__(self, db_url: str, **kwargs):
    super().__init__(**kwargs)
    self.db = SandcrawlerPostgresClient(db_url)
    self.cur = self.db.conn.cursor()
Code example #7
class PersistCrossrefWorker(SandcrawlerWorker):
    """
    Pushes Crossref API JSON records into postgresql. Can also talk to GROBID
    to parse 'unstructured' references and push the results into postgresql at
    the same time.
    """
    def __init__(self,
                 db_url: str,
                 grobid_client: Optional[GrobidClient],
                 parse_refs: bool = True,
                 **kwargs):
        super().__init__(**kwargs)
        self.db = SandcrawlerPostgresClient(db_url)
        self.cur = self.db.conn.cursor()
        if grobid_client:
            self.grobid_client = grobid_client
        else:
            self.grobid_client = GrobidClient()
        self.parse_refs = parse_refs

    def process(self, record: Any, key: Optional[str] = None) -> Any:
        """Only do batches (as transactions)"""
        raise NotImplementedError

    def push_batch(self, batch: list) -> list:
        self.counts["total"] += len(batch)

        crossref_batch = []
        refs_batch = []
        for record in batch:
            crossref_batch.append(
                dict(
                    doi=record["DOI"].lower().strip(),
                    indexed=record["indexed"]["date-time"],
                    record=record,
                ))
            if self.parse_refs:
                try:
                    parsed_refs = self.grobid_client.crossref_refs(record)
                    refs_batch.append(parsed_refs)
                except (
                        xml.etree.ElementTree.ParseError,
                        requests.exceptions.HTTPError,
                        requests.exceptions.ReadTimeout,
                ):
                    print(
                        "GROBID crossref refs parsing error, skipping with a sleep"
                    )
                    time.sleep(3)

        resp = self.db.insert_crossref(self.cur, crossref_batch)
        if len(crossref_batch) < len(batch):
            self.counts["skip"] += len(batch) - len(crossref_batch)
        self.counts["insert-crossref"] += resp[0]
        self.counts["update-crossref"] += resp[1]

        if refs_batch:
            resp = self.db.insert_grobid_refs(self.cur, refs_batch)
            if len(refs_batch) < len(batch):
                self.counts["skip"] += len(batch) - len(refs_batch)
            self.counts["insert-grobid_refs"] += resp[0]
            self.counts["update-grobid_refs"] += resp[1]

        self.db.commit()
        return []
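A minimal call sketch for this worker follows, with an invented Crossref record trimmed to the two fields `push_batch()` actually reads (`DOI` and `indexed.date-time`). The import path and DSN are assumptions; passing `parse_refs=False` keeps the sketch from needing a running GROBID service (a `GrobidClient()` is still constructed, but never called).

# Hypothetical usage sketch; the DSN and record contents are invented.
from sandcrawler.persist import PersistCrossrefWorker

worker = PersistCrossrefWorker(
    db_url="postgres://sandcrawler:password@localhost:5432/sandcrawler",
    grobid_client=None,
    parse_refs=False,
)
crossref_record = {
    "DOI": "10.1234/Example-DOI",  # lower-cased and stripped before insert
    "indexed": {"date-time": "2021-06-01T00:00:00Z"},
    # ... the rest of the Crossref API record is stored as-is in the 'record' column
}
worker.push_batch([crossref_record])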
Code example #8
class PersistIngestFileResultWorker(SandcrawlerWorker):
    def __init__(self, db_url: str, **kwargs):
        super().__init__()
        self.db = SandcrawlerPostgresClient(db_url)
        self.cur = self.db.conn.cursor()

    def process(self, record: Any, key: Optional[str] = None) -> Any:
        """Only do batches (as transactions)"""
        raise NotImplementedError

    def request_to_row(self, raw: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Converts ingest-request JSON schema (eg, from Kafka) to SQL ingest_request schema

        if there is a problem with conversion, return None
        """
        # backwards compat hacks; transform request to look like current schema
        if raw.get("ingest_type") == "file":
            raw["ingest_type"] = "pdf"
        if (not raw.get("link_source") and raw.get("base_url")
                and raw.get("ext_ids", {}).get("doi") and raw["base_url"]
                == "https://doi.org/{}".format(raw["ext_ids"]["doi"])):
            # set link_source(_id) for old ingest requests
            raw["link_source"] = "doi"
            raw["link_source_id"] = raw["ext_ids"]["doi"]
        if (not raw.get("link_source") and raw.get(
                "ingest_request_source", "").startswith("savepapernow")
                and raw.get("fatcat", {}).get("release_ident")):
            # set link_source(_id) for old ingest requests
            raw["link_source"] = "spn"
            raw["link_source_id"] = raw["fatcat"]["release_ident"]

        for k in ("ingest_type", "base_url", "link_source", "link_source_id"):
            if k not in raw:
                self.counts["skip-request-fields"] += 1
                return None
        if raw["ingest_type"] not in ("pdf", "xml", "html"):
            self.counts["skip-ingest-type"] += 1
            return None
        request = {
            "ingest_type": raw["ingest_type"],
            "base_url": raw["base_url"],
            "link_source": raw["link_source"],
            "link_source_id": raw["link_source_id"],
            "ingest_request_source": raw.get("ingest_request_source"),
            "request": {},
        }
        # extra/optional fields
        if raw.get("release_stage"):
            request["release_stage"] = raw["release_stage"]
        if raw.get("fatcat", {}).get("release_ident"):
            request["request"]["release_ident"] = raw["fatcat"][
                "release_ident"]
        for k in ("ext_ids", "edit_extra", "rel"):
            if raw.get(k):
                request["request"][k] = raw[k]
        # if this dict is empty, trim it to save DB space
        if not request["request"]:
            request["request"] = None
        return request

    def file_result_to_row(self, raw: dict) -> Optional[dict]:
        """
        Converts ingest-result JSON schema (eg, from Kafka) to SQL ingest_file_result schema

        if there is a problem with conversion, return None and set skip count
        """
        for k in ("request", "hit", "status"):
            if k not in raw:
                self.counts["skip-result-fields"] += 1
                return None
        if "base_url" not in raw["request"]:
            self.counts["skip-result-fields"] += 1
            return None
        ingest_type = raw["request"].get("ingest_type")
        if ingest_type == "file":
            ingest_type = "pdf"
        if ingest_type not in (
                "pdf",
                "xml",
                "html",
                "component",
                "src",
                "dataset",
                "dataset-file",
        ):
            self.counts["skip-ingest-type"] += 1
            return None
        if raw["status"] in ("existing", ):
            self.counts["skip-existing"] += 1
            return None
        result = {
            "ingest_type": ingest_type,
            "base_url": raw["request"]["base_url"],
            "hit": raw["hit"],
            "status": raw["status"],
        }
        terminal = raw.get("terminal")
        if terminal:
            result["terminal_url"] = terminal.get(
                "terminal_url") or terminal.get("url")
            result["terminal_dt"] = terminal.get("terminal_dt")
            result["terminal_status_code"] = (
                terminal.get("terminal_status_code")
                or terminal.get("status_code") or terminal.get("http_code"))
            if result["terminal_status_code"]:
                result["terminal_status_code"] = int(
                    result["terminal_status_code"])
            result["terminal_sha1hex"] = terminal.get("terminal_sha1hex")
            if len(result["terminal_url"]) > 2048:
                # postgresql13 doesn't like extremely large URLs in b-tree index
                self.counts["skip-huge-url"] += 1
                return None
        return result

    def result_to_html_meta(self, record: dict) -> Optional[HtmlMetaRow]:
        html_body = record.get("html_body")
        file_meta = record.get("file_meta")
        if not (file_meta and html_body):
            return None
        return HtmlMetaRow(
            sha1hex=file_meta["sha1hex"],
            status=record.get("status"),
            scope=record.get("scope"),
            has_teixml=bool(html_body and html_body["status"] == "success"),
            has_thumbnail=False,  # TODO
            word_count=(html_body and html_body.get("word_count")) or None,
            biblio=record.get("html_biblio"),
            resources=record.get("html_resources"),
        )

    def result_to_platform_row(self, raw: dict) -> Optional[dict]:
        """
        Converts fileset ingest-result JSON schema (eg, from Kafka) to SQL ingest_fileset_platform schema

        if there is a problem with conversion, return None and set skip count
        """
        for k in ("request", "hit", "status"):
            if k not in raw:
                return None
        if "base_url" not in raw["request"]:
            return None
        ingest_type = raw["request"].get("ingest_type")
        if ingest_type not in ("dataset",):
            return None
        if raw["status"] in ("existing", ):
            return None
        if not raw.get("platform_name"):
            return None
        result = {
            "ingest_type": ingest_type,
            "base_url": raw["request"]["base_url"],
            "hit": raw["hit"],
            "status": raw["status"],
            "platform_name": raw.get("platform_name"),
            "platform_domain": raw.get("platform_domain"),
            "platform_id": raw.get("platform_id"),
            "ingest_strategy": raw.get("ingest_strategy"),
            "total_size": raw.get("total_size"),
            "file_count": raw.get("file_count"),
            "archiveorg_item_name": raw.get("archiveorg_item_name"),
            "archiveorg_item_bundle_path": None,
            "web_bundle_url": None,
            "web_bundle_dt": None,
            "manifest": raw.get("manifest"),
        }
        if result.get("fileset_bundle"):
            result["archiveorg_item_bundle_path"] = result[
                "fileset_bundle"].get("archiveorg_item_bundle_path")
            result["web_bundle_url"] = (result["fileset_bundle"].get(
                "terminal", {}).get("terminal_url"))
            result["web_bundle_dt"] = (result["fileset_bundle"].get(
                "terminal", {}).get("terminal_dt"))
        return result

    def push_batch(self, batch: List[Any]) -> List[Any]:
        self.counts["total"] += len(batch)

        if not batch:
            return []

        results_unfiltered = [self.file_result_to_row(raw) for raw in batch]
        results = [r for r in results_unfiltered if r]

        irequests_unfiltered = [
            self.request_to_row(raw["request"]) for raw in batch
            if raw.get("request")
        ]
        irequests = [
            r for r in irequests_unfiltered
            if r and r["ingest_type"] != "dataset-file"
        ]

        if irequests:
            resp = self.db.insert_ingest_request(self.cur, irequests)
            self.counts["insert-requests"] += resp[0]
            self.counts["update-requests"] += resp[1]
        if results:
            resp = self.db.insert_ingest_file_result(self.cur,
                                                     results,
                                                     on_conflict="update")
            self.counts["insert-results"] += resp[0]
            self.counts["update-results"] += resp[1]

        # these schemas match, so can just pass through
        cdx_batch = [r["cdx"] for r in batch if r.get("hit") and r.get("cdx")]
        revisit_cdx_batch = [
            r["revisit_cdx"] for r in batch
            if r.get("hit") and r.get("revisit_cdx")
        ]
        cdx_batch.extend(revisit_cdx_batch)
        # filter to full CDX lines, with full warc_paths (not liveweb)
        cdx_batch = [
            r for r in cdx_batch
            if r.get("warc_path") and ("/" in r["warc_path"])
        ]
        if cdx_batch:
            resp = self.db.insert_cdx(self.cur, cdx_batch)
            self.counts["insert-cdx"] += resp[0]
            self.counts["update-cdx"] += resp[1]

        file_meta_batch = [
            r["file_meta"] for r in batch
            if r.get("hit") and r.get("file_meta")
        ]
        if file_meta_batch:
            resp = self.db.insert_file_meta(self.cur,
                                            file_meta_batch,
                                            on_conflict="nothing")
            self.counts["insert-file_meta"] += resp[0]
            self.counts["update-file_meta"] += resp[1]

        html_meta_batch = [
            self.result_to_html_meta(r) for r in batch
            if r.get("hit") and r.get("html_body")
        ]
        if html_meta_batch:
            rows = [d.to_sql_tuple() for d in html_meta_batch if d]
            resp = self.db.insert_html_meta(self.cur,
                                            rows,
                                            on_conflict="update")
            self.counts["insert-html_meta"] += resp[0]
            self.counts["update-html_meta"] += resp[1]

        fileset_platform_batch_all = [
            self.result_to_platform_row(raw) for raw in batch
            if raw.get("request", {}).get("ingest_type") == "dataset"
            and raw.get("platform_name")
        ]
        fileset_platform_batch: List[Dict] = [
            p for p in fileset_platform_batch_all if p
        ]
        if fileset_platform_batch:
            resp = self.db.insert_ingest_fileset_platform(
                self.cur, fileset_platform_batch, on_conflict="update")
            self.counts["insert-fileset_platform"] += resp[0]
            self.counts["update-fileset_platform"] += resp[1]

        self.db.commit()
        return []
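To make the backwards-compat handling in `request_to_row()` concrete, here is a small hand-made example of an old-style ingest request and the row it would produce. The DSN and every field value are invented, and the constructor assumes a reachable postgresql instance.

# Hypothetical demonstration of request_to_row(); not project code.
worker = PersistIngestFileResultWorker(
    db_url="postgres://sandcrawler:password@localhost:5432/sandcrawler")

# Old-style ingest request as it might arrive from Kafka: ingest_type "file",
# no link_source, and a DOI-derived base_url.
raw_request = {
    "ingest_type": "file",  # rewritten to "pdf"
    "base_url": "https://doi.org/10.1234/example",
    "ext_ids": {"doi": "10.1234/example"},
    "ingest_request_source": "fatcat-changelog",
}
row = worker.request_to_row(raw_request)
# row == {
#     "ingest_type": "pdf",
#     "base_url": "https://doi.org/10.1234/example",
#     "link_source": "doi",
#     "link_source_id": "10.1234/example",
#     "ingest_request_source": "fatcat-changelog",
#     "request": {"ext_ids": {"doi": "10.1234/example"}},
# }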