Esempio n. 1
0
def retrieve_journals_from_titles(stub, withText=0, titleIds=[]):
    """Stream Journal objects built from the pages of the requested journals.

    Pages are requested from the service in a single streaming call and are
    grouped into one Journal per run of consecutive pages sharing the same
    ``title_id``. A Journal is yielded as soon as a page with a different
    ``title_id`` arrives, and the last Journal is yielded when the stream ends.

    The format of a page id is "JournalName_PageNumber". For example,
    "americanjournalo03amer_0497" indicates the journal name is
    "americanjournalo03amer" and the page is 0497. The page number (kept as
    the string that follows the last underscore) is what gets passed to the
    Journal as the page key.

    Inputs:
    stub - The service stub; its ``Pages`` method is called with a PagesOpt.
    withText - Either 0 or 1, indicating whether the pages are requested and
               stored with their associated text. By default the text is
               not included.
    titleIds - A list of journal ids of interest. It is forwarded unchanged
               to the request (and never mutated here). According to the
               original documentation, [] selects every journal.

    Yields:
    Journal objects constructed from ``page.title_id`` only, i.e. without
    information about their path or language.

    NOTE(review): grouping assumes the service streams all pages of a journal
    contiguously; interleaved titles would produce several Journal objects
    for the same title - confirm against the server implementation.
    """
    pagesOpt = protob_pb2.PagesOpt(with_text=withText, title_ids=titleIds)
    pages = stub.Pages(pagesOpt)  # A stream of Page objects

    journal = None  # Journal currently being filled; None until the first page
    current_title = None

    for page in pages:
        # Everything after the last underscore of the id is the page number.
        page_num = page.id[page.id.rfind("_") + 1:]

        if journal is None or page.title_id != current_title:
            if journal is not None:
                # The previous journal is complete: hand it to the caller
                # before starting the next one.
                yield journal
            journal = Journal(page.title_id)
            current_title = page.title_id

        if withText == 1:
            journal.add_page(page_num, page.names, str(page.text))
        else:
            journal.add_names(page_num, page.names)

    # The loop only yields on a title change, so the journal still being
    # filled when the stream ends must be yielded explicitly.
    if journal is not None:
        yield journal
Esempio n. 2
0
def stream_journal_collections(stub, filepath, withText=1):
    """Collect journals per archive id and dump the result as JSON.

    Every title streamed by ``stub.Titles`` is turned into a Journal, filled
    with the pages streamed by ``stub.Pages`` for that title, and added as an
    edition to the JournalCollection keyed by the title's ``archive_id``.
    Once the titles are processed, each collection is converted with
    ``convertToDict`` and the resulting mapping is written to ``filepath``.

    Inputs:
    stub - The service stub providing the ``Titles`` and ``Pages`` streams.
    filepath - Path of the file the JSON document is written to.
    withText - Forwarded as ``with_text`` in every page request (default 1).
    """
    collections_by_archive = {}
    # The countdown starts at 10000 and the loop stops only once it has
    # dropped below zero, so at most 10001 titles are processed.
    remaining = 10000

    for title in stub.Titles(protob_pb2.TitlesOpt()):
        if remaining < 0:
            break

        archive_id = title.archive_id
        print(archive_id)

        # The fallback collection is constructed on every iteration, whether
        # or not the archive id is already known.
        fallback = JournalCollection(archive_id)
        collection = collections_by_archive.get(archive_id, fallback)

        journal = Journal(title)
        request = protob_pb2.PagesOpt(with_text=withText,
                                      title_ids=[title.id])
        for page in stub.Pages(request):  # A stream of Page objects
            journal.add_page(page)

        collection.add_edition(journal)
        collections_by_archive[archive_id] = collection
        remaining -= 1

    print("Dumping")
    serialisable = {}
    for archive_id, collection in collections_by_archive.items():
        print(archive_id)
        serialisable[archive_id] = collection.convertToDict()

    with open(filepath, "w") as f:
        json.dump(serialisable, f)
Esempio n. 3
0
def Pages(stub, withText=False, titles=[]):
    """Call the Pages RPC and stream the resulting Page objects to the caller.

    If the caller provides a list of journal titles, it is forwarded as
    ``title_ids`` so that only the pages of those journals are requested.
    According to the original documentation, an empty list returns every
    page in the database.

    Inputs:
    stub - The service stub; its ``Pages`` method is called with a PagesOpt.
    withText - Forwarded as ``with_text`` in the request (default False).
    titles - Journal title ids to restrict the stream to. The list is only
             forwarded to the request and never mutated here.

    Yields:
    Each Page object, in the order the stream delivers them.
    """
    pagesOpt = protob_pb2.PagesOpt(with_text=withText, title_ids=titles)
    pages = stub.Pages(pagesOpt)  # A stream of Pages

    for page in pages:
        yield page
Esempio n. 4
0
def retrieve_journals(stub, withText=0):
    """Yield one fully populated Journal per title streamed by the service.

    For every title returned by ``stub.Titles`` a Journal is created from the
    title's archive id, id, path and language. The pages of that title are
    then requested with ``stub.Pages`` and added to the journal before it is
    yielded.

    Inputs:
    stub - The service stub providing the ``Titles`` and ``Pages`` streams.
    withText - When equal to 1, each page is stored together with its text
               via ``add_page``; otherwise only its names are stored via
               ``add_names``. Also forwarded as ``with_text`` in the request.
    """
    for title in stub.Titles(protob_pb2.TitlesOpt()):
        request = protob_pb2.PagesOpt(with_text=withText,
                                      title_ids=[title.id])
        journal = Journal(title.archive_id, title.id, title.path, title.lang)

        for page in stub.Pages(request):  # A stream of Page objects
            # The integer page number is whatever follows the last
            # underscore of the page id (the whole id if there is none).
            number = int(page.id.rpartition("_")[2])
            if withText != 1:
                journal.add_names(number, page.names)
                continue
            journal.add_page(number, page.names, str(page.text))

        yield journal
Esempio n. 5
0
def Pages(stub, withText=0):
    """Print a summary of every streamed page and yield the page.

    All pages are requested (``title_ids`` is empty). For each page the id,
    offset, title id, title path and names are printed. When ``withText`` is
    not 0 the page text is printed as well, after removing carriage returns
    and newlines and decoding it as ASCII with undecodable bytes ignored.

    Inputs:
    stub - The service stub; its ``Pages`` method is called with a PagesOpt.
    withText - Forwarded as ``with_text``; 0 (the default) omits the text
               from the printed summary.

    Yields:
    Each Page object, unmodified, right after it has been printed.
    """
    request = protob_pb2.PagesOpt(with_text=withText, title_ids=[])
    stream = stub.Pages(request)  # A stream of pages
    skip_text = withText == 0

    for page in stream:
        if skip_text:
            summary = (
                "ID: {}\nOffset: {}\nTitle_ID: {}\nTitle_Path: {}\nNames: {}\n\n"
                .format(page.id, page.offset, page.title_id, page.title_path,
                        page.names))
        else:
            # Flatten the raw bytes onto one line before decoding them.
            flattened = page.text.replace(b'\r', b'').replace(b'\n', b'')
            text = flattened.decode("ascii", "ignore")
            summary = (
                "ID: {}\nOffset: {}\nTitle_ID: {}\nTitle_Path: {}\nNames: {}\nText:{}\n\n"
                .format(page.id, page.offset, page.title_id, page.title_path,
                        page.names, text))
        print(summary)
        yield page
Esempio n. 6
0
 def pages(self):
     """Request the pages of titles 1 and 2, with text, from the stub.

     Returns whatever ``self.stub.Pages`` returns for that request.
     """
     rpc = self.stub.Pages
     request = protob_pb2.PagesOpt(title_ids=[1, 2], with_text=True)
     return rpc(request)