def get_headers_for_document_id(document_id, document_text=None):
    """Return the header data extracted from a document's text.

    Args:
        document_id: Identifier passed to
            ``OctavoEccoClient.get_text_for_document_id`` when the text
            has to be fetched.
        document_text: Text of the document. When ``None`` (the default)
            the text is fetched through a new ``OctavoEccoClient`` and
            taken from the ``'text'`` key of the response.

    Returns:
        Whatever ``get_headers_from_document_text`` returns for the text.
    """
    # Caller already has the text: no API client is created at all.
    if document_text is not None:
        return get_headers_from_document_text(document_text)

    response = OctavoEccoClient().get_text_for_document_id(document_id)
    return get_headers_from_document_text(response.get('text'))
Example #2
0
 def set_octavo_indices(self):
     """Set the octavo index on every fragment in ``fragments_by_ecco_id``.

     For each document id the document text is fetched once and handed to
     ``set_octavo_index`` of each of its fragments. Ids containing
     ``"_note_"`` are not fetched; their fragments receive ``None``.
     """
     ecco_client = OctavoEccoClient()
     eebo_client = OctavoEeboClient()
     for doc_id, fragments in self.fragments_by_ecco_id.items():
         if "_note_" in str(doc_id):
             # TODO: handle the notes somehow!
             doc_text = None
         else:
             # Ids shorter than 10 characters go to the EEBO client,
             # everything else to the ECCO client.
             # NOTE(review): len() is applied to the raw id while the note
             # check above uses str(id) -- confirm ids are always strings.
             client = eebo_client if len(doc_id) < 10 else ecco_client
             doc_text = client.get_text_for_document_id(doc_id).get('text')
         for frag in fragments:
             frag.set_octavo_index(doc_text)
from lib.octavo_api_client import (
    OctavoEccoClient,
    OctavoEccoClusterClient
    )

from lib.fragmentlists import (
    get_fragmentlist,
    get_doctext_indexmap,
    test_fragment_text)

from lib.utils_common import create_dir_if_not_exists
from lib.headerdata_dump_common import read_docid_asciimap_csv


# API clients created once at module level.
# NOTE(review): timeout=600 is presumably in seconds -- confirm against
# OctavoEccoClusterClient.
ecco_api_client = OctavoEccoClient()
cluster_api_client = OctavoEccoClusterClient(timeout=600)

# Loaded from a CSV file at a path relative to the working directory.
docids_asciimap = read_docid_asciimap_csv('data/eccoids/asciilines.csv')

# Field-name lists. field_eccocluster is passed as ``fields=`` to the
# cluster client call below; fields_ecco is not used in the visible part
# of this script.
fields_ecco = ["documentID", "content"]
field_eccocluster = ["documentID", "fragmentID", "text",
                     "startIndex", "endIndex"]


# Hard-coded id of the single document this script processes.
docid_to_process = "0162900301"

# Fetch the cluster data for that document, restricted to the fields
# listed in field_eccocluster.
docid_clusterdata = (
    cluster_api_client.get_cluster_data_for_document_id(
        docid_to_process, fields=field_eccocluster))