def retrieve_journals_from_titles(stub, withText=0, titleIds=[]):
    """
    Retrieves the pages belonging to the requested journals, collects them
    into Journal objects, and streams them back to the caller.

    The format of a page id is "JournalName_PageNumber". For example,
    "americanjournalo03amer_0497" indicates the journal name is
    "americanjournalo03amer" and we are looking at page 0497. The page number
    is everything after the last underscore and is handed to the Journal as a
    string (it is not converted to an int here).

    Pages are grouped by consecutive runs of equal `title_id`: a new Journal
    is started every time the `title_id` of the incoming page differs from the
    one before it.

    Inputs:
        stub - The client stub whose `Pages` call returns a stream of Page
               objects
        withText - Either 0 or 1, indicating whether or not we want to
                   stream the pages with their associated text. By default,
                   we do not stream the page's associated text. Page text is
                   only stored on the Journal when this compares equal to 1.
        titleIds - A list of journal ids that we are interested in. It is
                   forwarded unchanged as `title_ids` on the request.
                   If this field is [], we collect every journal

    Yields:
        One Journal per run of pages sharing a `title_id`, including the
        final run once the stream is exhausted. Nothing is yielded when the
        stream is empty.
    """
    pagesOpt = protob_pb2.PagesOpt(with_text=withText, title_ids=titleIds)
    pages = stub.Pages(pagesOpt)  # A stream of Page objects

    journal = None  # Journal currently being filled; None until the first page
    prev_title = None
    for page in pages:
        page_num = page.id[page.id.rfind("_") + 1:]

        if journal is None or page.title_id != prev_title:
            if journal is not None:
                # Yield what we have currently before creating a new journal
                yield journal
            # Create a new journal without information about its id, path,
            # or language
            # NOTE(review): only page.title_id is available here; confirm it
            # is the value Journal expects as its first argument.
            journal = Journal(page.title_id)
            prev_title = page.title_id

        if withText == 1:
            journal.add_page(page_num, page.names, str(page.text))
        else:
            journal.add_names(page_num, page.names)

    # The loop above only emits a journal when the *next* title starts, so
    # the last journal has to be flushed explicitly once the stream ends.
    if journal is not None:
        yield journal
def stream_journal_collections(stub, filepath, withText=1):
    """
    Pulls the titles from the database, gathers each title's pages into a
    Journal, files that Journal under the JournalCollection for its archive
    id, and finally dumps every collection to `filepath` as JSON.

    Inputs:
        stub - The client stub providing the `Titles` and `Pages` streams
        filepath - Location of the JSON dump (opened in "w" mode)
        withText - Forwarded as `with_text` on every Pages request.
                   Defaults to 1.

    The title loop stops as soon as the running index exceeds 10000, so at
    most 10001 titles are processed. Each archive id is printed as its title
    is handled, and again (after "Dumping") as its collection is converted.
    """
    collections_by_archive = {}

    for index, title in enumerate(stub.Titles(protob_pb2.TitlesOpt())):
        if index > 10000:
            break

        archive_id = title.archive_id
        print(archive_id)

        # A fresh collection is built up front and only used when this
        # archive id has not been seen yet.
        fallback = JournalCollection(archive_id)
        collection = collections_by_archive.get(archive_id, fallback)

        edition = Journal(title)
        request = protob_pb2.PagesOpt(with_text=withText,
                                      title_ids=[title.id])
        for page in stub.Pages(request):  # A stream of Page objects
            edition.add_page(page)

        collection.add_edition(edition)
        collections_by_archive[archive_id] = collection

    print("Dumping")
    serializable = {}
    for archive_id, collection in collections_by_archive.items():
        print(archive_id)
        serializable[archive_id] = collection.convertToDict()

    with open(filepath, "w") as out_file:
        json.dump(serializable, out_file)
def Pages(stub, withText=False, titles=[]):
    """
    Implements the Pages RPC service. Streams Page objects back to the
    caller.

    Inputs:
        stub - The client stub whose `Pages` call returns the page stream
        withText - Forwarded as `with_text` on the request. Defaults to False.
        titles - Forwarded as `title_ids` on the request. If the caller
                 provides a list of Journal titles, they will receive the
                 pages corresponding to those journals. Otherwise, they will
                 receive every page in the database.

    Yields:
        Every Page object produced by the stream, unmodified and in order.

    NOTE(review): this file contains a second function named `Pages` with a
    different signature; whichever is defined later replaces the other at
    import time - confirm which one is intended to be live.
    """
    pagesOpt = protob_pb2.PagesOpt(with_text=withText, title_ids=titles)
    # Hand the stream of Pages through to the caller one item at a time.
    yield from stub.Pages(pagesOpt)
def retrieve_journals(stub, withText=0):
    """
    Walks every title in the database, builds one Journal per title from the
    pages streamed for it, and hands each finished Journal back to the caller.

    Inputs:
        stub - The client stub providing the `Titles` and `Pages` streams
        withText - Forwarded as `with_text` on every Pages request. Page text
                   is stored on the Journal only when this compares equal
                   to 1; otherwise only the page's names are recorded.

    Yields:
        One Journal per title, after all of that title's pages have been
        consumed. A title with no pages still yields its (empty) Journal.
    """
    for title in stub.Titles(protob_pb2.TitlesOpt()):
        request = protob_pb2.PagesOpt(with_text=withText,
                                      title_ids=[title.id])
        current = Journal(title.archive_id, title.id, title.path, title.lang)

        for page in stub.Pages(request):  # A stream of Page objects
            # The page number is whatever follows the last underscore of the
            # page id (the whole id when there is no underscore).
            suffix = page.id.rpartition("_")[2]
            number = int(suffix)

            if withText != 1:
                current.add_names(number, page.names)
                continue
            current.add_page(number, page.names, str(page.text))

        yield current
def Pages(stub, withText=0):
    """
    Requests every page (no title filter), prints a summary of each one, and
    streams the pages back to the caller.

    Inputs:
        stub - The client stub whose `Pages` call returns the page stream
        withText - Forwarded as `with_text` on the request. When it equals 0
                   only the page metadata is printed; for any other value the
                   page text is printed as well.

    Yields:
        Each Page object from the stream, right after it has been printed.
    """
    request = protob_pb2.PagesOpt(with_text=withText, title_ids=[])
    stream = stub.Pages(request)  # A stream of pages
    metadata_only = withText == 0

    for page in stream:
        if metadata_only:
            summary = (
                "ID: {}\nOffset: {}\nTitle_ID: {}\nTitle_Path: {}\nNames: {}\n\n"
                .format(page.id, page.offset, page.title_id, page.title_path,
                        page.names))
        else:
            # Strip carriage returns and newlines from the raw bytes, then
            # decode as ASCII, silently dropping anything that is not ASCII.
            flattened = page.text.replace(b'\r', b'').replace(b'\n', b'')
            readable = flattened.decode("ascii", "ignore")
            summary = (
                "ID: {}\nOffset: {}\nTitle_ID: {}\nTitle_Path: {}\nNames: {}\nText:{}\n\n"
                .format(page.id, page.offset, page.title_id, page.title_path,
                        page.names, readable))
        print(summary)
        yield page
def pages(self):
    """
    Calls the stub's Pages RPC for title ids 1 and 2 with page text enabled
    and returns whatever that call returns.
    """
    rpc = self.stub.Pages
    request = protob_pb2.PagesOpt(title_ids=[1, 2], with_text=True)
    return rpc(request)