Example #1
    def process_batch(self, batch: List[dict]) -> None:
        """Transform a batch of IntermediateBundle dicts and bulk-index the results into Elasticsearch."""
        bulk_actions = []
        for obj in batch:
            bundle = IntermediateBundle.from_json(obj)
            assert bundle.doc_type
            es_doc = transform_heavy(bundle)
            if not es_doc:
                continue
            bulk_actions.append(
                {
                    "_index": self.es_index,
                    "_op_type": "index",
                    "_id": es_doc.key,
                    "_source": es_doc.json(exclude_none=True, sort_keys=True),
                }
            )
            self.counts["docs-indexed"] += 1

        if not bulk_actions:
            return

        elasticsearch.helpers.bulk(self.es_client, bulk_actions, timeout="50s")
        self.counts["batches-indexed"] += 1
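For context, a minimal driver sketch for this method: it groups newline-delimited JSON from stdin into fixed-size batches and hands each one to process_batch. The batch size, the stdin source, and the index_stdin name are illustrative assumptions, not part of the original snippet.

# Hypothetical glue code (assumed, not from the original source):
# read NDJSON from stdin, batch it, and index each batch.
import json
import sys

def index_stdin(worker, batch_size: int = 100) -> None:
    batch = []
    for line in sys.stdin:
        batch.append(json.loads(line))
        if len(batch) >= batch_size:
            worker.process_batch(batch)
            batch = []
    if batch:
        # flush the final partial batch
        worker.process_batch(batch)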
Example #2
def run_refs(infile: Sequence) -> None:
    """Read IntermediateBundle JSON lines from infile and print extracted references as JSON lines."""
    for line in infile:
        obj = json.loads(line)

        heavy = IntermediateBundle.from_json(obj)
        assert heavy.doc_type
        refs = refs_from_heavy(heavy)
        for ref in refs:
            print(ref.json(exclude_none=True, sort_keys=True))
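Since run_refs only needs an iterable of JSON lines, wiring it to stdin is enough for shell pipelines; the __main__ guard below is an assumed invocation, not shown in the original snippet.

# Hypothetical CLI wiring (an assumption): stream bundles through run_refs,
# e.g.  zcat bundles.json.gz | python refs_cli.py > refs.json
if __name__ == "__main__":
    import sys
    run_refs(sys.stdin)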
Example #3
    def run_issue_db(self, limit: Optional[int] = None) -> None:
        """Iterate SIM issues with few matched releases; print one IntermediateBundle JSON line per page."""
        count = 0
        self.issue_db.db.row_factory = sqlite3.Row
        cur = self.issue_db.db.cursor()
        for row in cur.execute(
            "SELECT * FROM sim_issue LEFT JOIN sim_pub ON sim_issue.sim_pubid = sim_pub.sim_pubid WHERE sim_issue.release_count < 3"
        ):
            # filter out "contents" and "index" items
            # TODO: more filters; also redundant with IssueDB code?
            if row["issue_item"].endswith("_contents") or row["issue_item"].endswith("_index"):
                continue
            try:
                full_issue = self.fetch_sim_issue(row)
            except (
                requests.exceptions.ConnectionError,
                requests.exceptions.ReadTimeout,
            ) as e:
                print(str(e), file=sys.stderr)
                continue
            if not full_issue:
                continue
            for leaf in full_issue["page_texts"]:
                bundle = IntermediateBundle(
                    doc_type=DocType.sim_page,
                    releases=[],
                    biblio_release_ident=None,
                    grobid_fulltext=None,
                    pdftotext_fulltext=None,
                    sim_fulltext=dict(
                        issue_item=full_issue["issue_item"],
                        pages=str(leaf["page_num"]),
                        page_texts=[leaf],
                        release_ident=None,
                        pub_item_metadata=full_issue["pub_item_metadata"],
                        issue_item_metadata=full_issue["issue_item_metadata"],
                    ),
                )
                print(bundle.json(exclude_none=True, sort_keys=True))
                count += 1
                if limit is not None and count >= limit:
                    break
            if limit is not None and count >= limit:
                break
Example #4
def run_transform(infile: Sequence) -> None:
    """Read IntermediateBundle JSON lines from infile and print Elasticsearch documents as JSON lines."""
    for line in infile:
        obj = json.loads(line)

        heavy = IntermediateBundle.from_json(obj)
        assert heavy.doc_type
        es_doc = transform_heavy(heavy)
        if not es_doc:
            continue
        print(es_doc.json(exclude_none=True, sort_keys=True))
Example #5
    def full_issue_to_pages(self, full_issue: dict) -> List[IntermediateBundle]:
        """Convert a fetched SIM issue into one IntermediateBundle per page of text."""
        pages = []
        for leaf in full_issue["page_texts"]:
            bundle = IntermediateBundle(
                doc_type=DocType.sim_page,
                releases=[],
                biblio_release_ident=None,
                grobid_fulltext=None,
                pdftotext_fulltext=None,
                sim_fulltext=dict(
                    issue_item=full_issue["issue_item"],
                    pages=str(leaf["page_num"]),
                    page_texts=[leaf],
                    release_ident=None,
                    pub_item_metadata=full_issue["pub_item_metadata"],
                    issue_item_metadata=full_issue["issue_item_metadata"],
                ),
            )
            pages.append(bundle)
        return pages
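Note that full_issue_to_pages builds exactly the per-page bundles that Example #3 constructs inline; assuming both methods live on the same class, the inner loop of run_issue_db could be rewritten on top of this helper, as in this sketch (surrounding control flow elided):

# Hypothetical refactor of Example #3's inner loop (an assumption, not from
# the original code): reuse full_issue_to_pages instead of inlining the
# IntermediateBundle construction.
for bundle in self.full_issue_to_pages(full_issue):
    print(bundle.json(exclude_none=True, sort_keys=True))
    count += 1
    if limit is not None and count >= limit:
        break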
Example #6
    def process_batch(self, batch: List[dict]) -> None:
        """Parse raw JSON objects into IntermediateBundles, transform, and bulk-index into Elasticsearch."""
        bulk_actions = []
        for obj in batch:
            bundle = IntermediateBundle(
                doc_type=DocType(obj["doc_type"]),
                releases=[
                    entity_from_json(json.dumps(re), ReleaseEntity)
                    for re in obj["releases"]
                ],
                biblio_release_ident=obj.get("biblio_release_ident"),
                grobid_fulltext=obj.get("grobid_fulltext"),
                pdftotext_fulltext=obj.get("pdftotext_fulltext"),
                pdf_meta=obj.get("pdf_meta"),
                html_fulltext=obj.get("html_fulltext"),
                sim_fulltext=obj.get("sim_fulltext"),
            )
            es_doc = transform_heavy(bundle)
            if not es_doc:
                continue
            bulk_actions.append(
                {
                    "_index": self.es_index,
                    "_op_type": "index",
                    "_id": es_doc.key,
                    "_source": es_doc.json(exclude_none=True, sort_keys=True),
                }
            )
            self.counts["docs-indexed"] += 1

        if not bulk_actions:
            return

        elasticsearch.helpers.bulk(self.es_client, bulk_actions, timeout="30s")
        self.counts["batches-indexed"] += 1

    def process_release_list(
        self, releases: List[ReleaseEntity]
    ) -> IntermediateBundle:
        """
        1. find best accessible fatcat file
            => fetch GROBID XML if available, else pdftotext if available
            => link to thumbnail if available
        2. find best SIM microfilm copy available
        """
        pref_idents = fulltext_pref_list(releases)
        release_dict = {r.ident: r for r in releases}

        # print(f"pref_idents={pref_idents}", file=sys.stderr)

        # find best accessible fatcat file
        grobid_fulltext: Optional[Any] = None
        pdf_meta: Optional[Any] = None
        pdftotext_fulltext: Optional[Any] = None
        html_fulltext: Optional[Any] = None
        for ident in pref_idents:
            release = release_dict[ident]
            if not (release.files or release.webcaptures):
                continue
            for fe in release.files:
                if not fe.sha1 or fe.mimetype not in (None, "application/pdf"):
                    continue
                if not fe.urls:
                    continue
                grobid_fulltext = self.fetch_file_grobid(fe, ident)
                pdf_meta = self.fetch_pdf_meta(fe, ident)
                pdftotext_fulltext = None
                if pdf_meta and not grobid_fulltext:
                    pdftotext_fulltext = self.fetch_file_pdftotext(fe, ident)
                if grobid_fulltext or pdftotext_fulltext:
                    break
                pdf_meta = None
            for wc in release.webcaptures:
                # find primary web capture object
                html_fulltext = self.fetch_webcapture_html_fulltext(wc, ident)
                if html_fulltext and html_fulltext.get("tei_xml"):
                    break
                html_fulltext = None
            if grobid_fulltext or pdftotext_fulltext or html_fulltext:
                break

        # find best accessible SIM metadata and fulltext
        sim_fulltext: Optional[Any] = None
        sim_issue: Optional[Any] = None
        for ident in pref_idents:
            release = release_dict[ident]
            # print(f"{release.extra}\n{release.pages}", file=sys.stderr)
            if not release.pages:
                continue
            # TODO: in the future, will use release.extra.ia.sim.sim_pubid for lookup
            sim_issue = self.lookup_sim(release)
            # print(f"release_{release.ident}: sim_issue={sim_issue}", file=sys.stderr)
            if not sim_issue:
                continue
            sim_pub = self.issue_db.lookup_pub(sim_issue.sim_pubid)
            if not sim_pub:
                continue
            # XXX: control flow tweak?
            try:
                sim_fulltext = self.fetch_sim(
                    sim_issue, sim_pub, release.pages, release.ident
                )
            except (
                requests.exceptions.ConnectionError,
                requests.exceptions.ReadTimeout,
            ) as e:
                print(str(e), file=sys.stderr)
                continue
            if sim_fulltext:
                break

        return IntermediateBundle(
            doc_type=DocType.work,
            releases=releases,
            biblio_release_ident=pref_idents[0],
            grobid_fulltext=grobid_fulltext,
            pdftotext_fulltext=pdftotext_fulltext,
            pdf_meta=pdf_meta,
            html_fulltext=html_fulltext,
            sim_fulltext=sim_fulltext,
        )
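The bundle returned here has the same shape that run_transform (Example #4) and process_batch (Examples #1 and #6) consume, so the hand-off to the Elasticsearch transform is direct; the pipeline variable and single-work flow below are assumed glue, not part of the original snippet.

# Hypothetical hand-off sketch (assumed glue code): turn one work's releases
# into an IntermediateBundle and emit the corresponding Elasticsearch doc.
bundle = pipeline.process_release_list(releases)
es_doc = transform_heavy(bundle)
if es_doc:
    print(es_doc.json(exclude_none=True, sort_keys=True))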