def process_batch(self, batch: List[dict]) -> None:
    bulk_actions = []
    for obj in batch:
        bundle = IntermediateBundle.from_json(obj)
        assert bundle.doc_type
        es_doc = transform_heavy(bundle)
        if not es_doc:
            continue
        bulk_actions.append(
            {
                "_index": self.es_index,
                "_op_type": "index",
                "_id": es_doc.key,
                "_source": es_doc.json(exclude_none=True, sort_keys=True),
            }
        )
        self.counts["docs-indexed"] += 1
    if not bulk_actions:
        return
    elasticsearch.helpers.bulk(self.es_client, bulk_actions, timeout="50s")
    self.counts["batches-indexed"] += 1
def run_refs(infile: Sequence) -> None:
    for line in infile:
        obj = json.loads(line)
        heavy = IntermediateBundle.from_json(obj)
        assert heavy.doc_type
        refs = refs_from_heavy(heavy)
        for ref in refs:
            print(ref.json(exclude_none=True, sort_keys=True))
def run_issue_db(self, limit: Optional[int] = None) -> None:
    count = 0
    self.issue_db.db.row_factory = sqlite3.Row
    cur = self.issue_db.db.cursor()
    for row in cur.execute(
        "SELECT * FROM sim_issue LEFT JOIN sim_pub ON sim_issue.sim_pubid = sim_pub.sim_pubid WHERE sim_issue.release_count < 3"
    ):
        # filter out "contents" and "index" items
        # TODO: more filters; also redundant with IssueDB code?
        if row["issue_item"].endswith("_contents") or row["issue_item"].endswith("_index"):
            continue
        try:
            full_issue = self.fetch_sim_issue(row)
        except requests.exceptions.ConnectionError as e:
            print(str(e), file=sys.stderr)
            continue
        except requests.exceptions.ReadTimeout as e:
            print(str(e), file=sys.stderr)
            continue
        if not full_issue:
            continue
        for leaf in full_issue["page_texts"]:
            bundle = IntermediateBundle(
                doc_type=DocType.sim_page,
                releases=[],
                biblio_release_ident=None,
                grobid_fulltext=None,
                pdftotext_fulltext=None,
                sim_fulltext=dict(
                    issue_item=full_issue["issue_item"],
                    pages=str(leaf["page_num"]),
                    page_texts=[leaf],
                    release_ident=None,
                    pub_item_metadata=full_issue["pub_item_metadata"],
                    issue_item_metadata=full_issue["issue_item_metadata"],
                ),
            )
            print(bundle.json(exclude_none=True, sort_keys=True))
            count += 1
            if limit is not None and count >= limit:
                break
        if limit is not None and count >= limit:
            break
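# Sketch: sqlite3.Row (set as row_factory in run_issue_db() above) allows
# name-based access like row["issue_item"]. A self-contained demonstration
# against an in-memory table; the column values here are made up:
def _row_factory_demo() -> None:
    db = sqlite3.connect(":memory:")
    db.row_factory = sqlite3.Row
    db.execute("CREATE TABLE sim_issue (issue_item TEXT, release_count INT)")
    db.execute("INSERT INTO sim_issue VALUES ('sim_example_1901-02', 1)")
    for row in db.execute("SELECT * FROM sim_issue WHERE release_count < 3"):
        assert row["issue_item"] == "sim_example_1901-02"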
def run_transform(infile: Sequence) -> None:
    for line in infile:
        obj = json.loads(line)
        heavy = IntermediateBundle.from_json(obj)
        assert heavy.doc_type
        es_doc = transform_heavy(heavy)
        if not es_doc:
            continue
        print(es_doc.json(exclude_none=True, sort_keys=True))
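# Sketch: run_refs() and run_transform() above share the same JSON-lines
# filter shape (parse a line, transform, print), so they slot into a shell
# pipeline. A minimal, hypothetical CLI driver in that spirit; the subcommand
# names and argparse wiring are assumptions, not the original entrypoint:
def _main_sketch() -> None:
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("command", choices=["run_refs", "run_transform"])
    parser.add_argument(
        "json_file",
        nargs="?",
        type=argparse.FileType("r"),
        default=sys.stdin,
        help="newline-delimited JSON of IntermediateBundle objects",
    )
    args = parser.parse_args()
    if args.command == "run_refs":
        run_refs(args.json_file)
    else:
        run_transform(args.json_file)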
def full_issue_to_pages(self, full_issue: dict) -> List[IntermediateBundle]:
    """
    Explode a fetched SIM issue into one sim_page IntermediateBundle per page.
    """
    pages = []
    for leaf in full_issue["page_texts"]:
        bundle = IntermediateBundle(
            doc_type=DocType.sim_page,
            releases=[],
            biblio_release_ident=None,
            grobid_fulltext=None,
            pdftotext_fulltext=None,
            sim_fulltext=dict(
                issue_item=full_issue["issue_item"],
                pages=str(leaf["page_num"]),
                page_texts=[leaf],
                release_ident=None,
                pub_item_metadata=full_issue["pub_item_metadata"],
                issue_item_metadata=full_issue["issue_item_metadata"],
            ),
        )
        pages.append(bundle)
    return pages
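# Sketch: with full_issue_to_pages() factored out, the inline per-page loop in
# run_issue_db() above reduces to a helper like this (hypothetical, not in the
# original):
def _print_issue_pages_sketch(self, full_issue: dict) -> None:
    # Emit one JSON line per page bundle, matching run_issue_db() output.
    for bundle in self.full_issue_to_pages(full_issue):
        print(bundle.json(exclude_none=True, sort_keys=True))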
def process_batch(self, batch: List[dict]) -> None:
    bulk_actions = []
    for obj in batch:
        bundle = IntermediateBundle(
            doc_type=DocType(obj["doc_type"]),
            releases=[
                entity_from_json(json.dumps(re), ReleaseEntity)
                for re in obj["releases"]
            ],
            biblio_release_ident=obj.get("biblio_release_ident"),
            grobid_fulltext=obj.get("grobid_fulltext"),
            pdftotext_fulltext=obj.get("pdftotext_fulltext"),
            pdf_meta=obj.get("pdf_meta"),
            html_fulltext=obj.get("html_fulltext"),
            sim_fulltext=obj.get("sim_fulltext"),
        )
        es_doc = transform_heavy(bundle)
        if not es_doc:
            continue
        bulk_actions.append(
            {
                "_index": self.es_index,
                "_op_type": "index",
                "_id": es_doc.key,
                "_source": es_doc.json(exclude_none=True, sort_keys=True),
            }
        )
        self.counts["docs-indexed"] += 1
    if not bulk_actions:
        return
    elasticsearch.helpers.bulk(self.es_client, bulk_actions, timeout="30s")
    self.counts["batches-indexed"] += 1
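# Sketch of how process_batch() might be driven: chunk newline-delimited JSON
# from stdin into fixed-size batches and index each. The batch size and the
# stdin wiring are illustrative assumptions:
def _index_from_stdin_sketch(self, batch_size: int = 50) -> None:
    batch: List[dict] = []
    for line in sys.stdin:
        batch.append(json.loads(line))
        if len(batch) >= batch_size:
            self.process_batch(batch)
            batch = []
    if batch:
        self.process_batch(batch)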
def process_release_list(self, releases: List[ReleaseEntity]) -> IntermediateBundle:
    """
    1. find best accessible fatcat file
       => fetch GROBID XML if available, else pdftotext if available
       => link to thumbnail if available
    2. find best SIM microfilm copy available
    """
    pref_idents = fulltext_pref_list(releases)
    release_dict = {r.ident: r for r in releases}
    # print(f"pref_idents={pref_idents}", file=sys.stderr)

    # find best accessible fatcat file
    grobid_fulltext: Optional[Any] = None
    pdf_meta: Optional[Any] = None
    pdftotext_fulltext: Optional[Any] = None
    html_fulltext: Optional[Any] = None
    for ident in pref_idents:
        release = release_dict[ident]
        if not (release.files or release.webcaptures):
            continue
        for fe in release.files:
            if not fe.sha1 or fe.mimetype not in (None, "application/pdf"):
                continue
            if not fe.urls:
                continue
            grobid_fulltext = self.fetch_file_grobid(fe, ident)
            pdf_meta = self.fetch_pdf_meta(fe, ident)
            pdftotext_fulltext = None
            if pdf_meta and not grobid_fulltext:
                pdftotext_fulltext = self.fetch_file_pdftotext(fe, ident)
            if grobid_fulltext or pdftotext_fulltext:
                break
            pdf_meta = None
        for wc in release.webcaptures:
            # find primary web capture object
            html_fulltext = self.fetch_webcapture_html_fulltext(wc, ident)
            if html_fulltext and html_fulltext.get("tei_xml"):
                break
            html_fulltext = None
        if grobid_fulltext or pdftotext_fulltext or html_fulltext:
            break

    # find best accessible SIM metadata and fulltext
    sim_fulltext: Optional[Any] = None
    sim_issue: Optional[Any] = None
    for ident in pref_idents:
        release = release_dict[ident]
        # print(f"{release.extra}\n{release.pages}", file=sys.stderr)
        if not release.pages:
            continue
        # TODO: in the future, will use release.extra.ia.sim.sim_pubid for lookup
        sim_issue = self.lookup_sim(release)
        # print(f"release_{release.ident}: sim_issue={sim_issue}", file=sys.stderr)
        if not sim_issue:
            continue
        sim_pub = self.issue_db.lookup_pub(sim_issue.sim_pubid)
        if not sim_pub:
            continue
        # XXX: control flow tweak?
        try:
            sim_fulltext = self.fetch_sim(sim_issue, sim_pub, release.pages, release.ident)
        except requests.exceptions.ConnectionError as e:
            print(str(e), file=sys.stderr)
            continue
        except requests.exceptions.ReadTimeout as e:
            print(str(e), file=sys.stderr)
            continue
        if sim_fulltext:
            break

    return IntermediateBundle(
        doc_type=DocType.work,
        releases=releases,
        biblio_release_ident=pref_idents[0],
        grobid_fulltext=grobid_fulltext,
        pdftotext_fulltext=pdftotext_fulltext,
        pdf_meta=pdf_meta,
        html_fulltext=html_fulltext,
        sim_fulltext=sim_fulltext,
    )
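# Sketch: process_release_list() expects every release grouped under a single
# work, and pref_idents[0] becomes the bundle's biblio_release_ident, so the
# list must be non-empty. A hypothetical driver for one work's worth of
# release JSON objects:
def _process_work_sketch(self, release_objs: List[dict]) -> None:
    releases = [
        entity_from_json(json.dumps(obj), ReleaseEntity) for obj in release_objs
    ]
    assert releases, "process_release_list() requires at least one release"
    bundle = self.process_release_list(releases)
    print(bundle.json(exclude_none=True, sort_keys=True))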