Ejemplo n.º 1:
class PersistGrobidRefsWorker(SandcrawlerWorker):
    """
    Simple persist worker to backfill GROBID references in to postgresql
    locally. Consumes the JSON output from GROBID CrossrefRefsWorker.
    """
    def __init__(self, db_url: str, **kwargs):
        super().__init__(**kwargs)
        self.db = SandcrawlerPostgresClient(db_url)
        self.cur = self.db.conn.cursor()

    def process(self, record: Any, key: Optional[str] = None) -> Any:
        """Only do batches (as transactions)"""
        raise NotImplementedError

    def push_batch(self, batch: list) -> list:
        """Persist a batch of GROBID refs records in one transaction.

        Records missing a truthy 'source' or 'source_id' field are skipped
        (and counted under 'skip') rather than asserted on: ``assert`` is
        stripped under ``python -O``, and the pre-existing skip accounting
        below shows filtering was the intent.

        Returns an empty list (nothing is forwarded to a sink).
        """
        self.counts["total"] += len(batch)

        refs_batch = []
        for record in batch:
            # Both identifying fields are required for the upsert key.
            if not record.get("source") or not record.get("source_id"):
                continue
            refs_batch.append(record)

        resp = self.db.insert_grobid_refs(self.cur, refs_batch)
        if len(refs_batch) < len(batch):
            self.counts["skip"] += len(batch) - len(refs_batch)
        # resp is (inserted_count, updated_count) from the upsert helper.
        self.counts["insert-grobid_refs"] += resp[0]
        self.counts["update-grobid_refs"] += resp[1]

        self.db.commit()
        return []
Ejemplo n.º 2:
class PersistCrossrefWorker(SandcrawlerWorker):
    """
    Pushes Crossref API JSON records into postgresql. Can also talk to GROBID,
    parsed 'unstructured' references, and push the results in to postgresql at
    the same time.
    """
    def __init__(self,
                 db_url: str,
                 grobid_client: Optional[GrobidClient],
                 parse_refs: bool = True,
                 **kwargs):
        super().__init__(**kwargs)
        self.db = SandcrawlerPostgresClient(db_url)
        self.cur = self.db.conn.cursor()
        # Fall back to a default client when none is injected.
        if grobid_client:
            self.grobid_client = grobid_client
        else:
            self.grobid_client = GrobidClient()
        self.parse_refs = parse_refs

    def process(self, record: Any, key: Optional[str] = None) -> Any:
        """Only do batches (as transactions)"""
        raise NotImplementedError

    def push_batch(self, batch: list) -> list:
        """Persist a batch of Crossref API records in one transaction.

        Each record is normalized to (doi, indexed, record) for the
        crossref table; when parse_refs is enabled, references are also
        run through GROBID and persisted. GROBID failures are tolerated:
        the record's refs are skipped (and counted) after a short sleep,
        to give a struggling GROBID service room to recover.

        Returns an empty list (nothing is forwarded to a sink).
        """
        self.counts["total"] += len(batch)

        crossref_batch = []
        refs_batch = []
        for record in batch:
            crossref_batch.append(
                dict(
                    # DOIs are case-insensitive; normalize for the upsert key.
                    doi=record["DOI"].lower().strip(),
                    indexed=record["indexed"]["date-time"],
                    record=record,
                ))
            if not self.parse_refs:
                continue
            try:
                refs_batch.append(self.grobid_client.crossref_refs(record))
            except (
                    xml.etree.ElementTree.ParseError,
                    requests.exceptions.HTTPError,
                    requests.exceptions.ReadTimeout,
            ):
                print(
                    "GROBID crossref refs parsing error, skipping with a sleep"
                )
                time.sleep(3)

        resp = self.db.insert_crossref(self.cur, crossref_batch)
        if len(crossref_batch) < len(batch):
            self.counts["skip"] += len(batch) - len(crossref_batch)
        self.counts["insert-crossref"] += resp[0]
        self.counts["update-crossref"] += resp[1]

        # Fix: skip accounting used to live inside `if refs_batch:`, so a
        # batch where *every* GROBID parse failed counted zero skips. Guard
        # on parse_refs instead, so a deliberately-empty refs_batch (refs
        # parsing disabled) is still not counted as skips.
        if self.parse_refs and len(refs_batch) < len(batch):
            self.counts["skip"] += len(batch) - len(refs_batch)
        if refs_batch:
            resp = self.db.insert_grobid_refs(self.cur, refs_batch)
            self.counts["insert-grobid_refs"] += resp[0]
            self.counts["update-grobid_refs"] += resp[1]

        self.db.commit()
        return []