def get_headers_for_document_id(document_id, document_text=None):
    """Return the header data extracted from a document's text.

    Args:
        document_id: ID passed to ``get_text_for_document_id`` when the
            text has to be fetched.
        document_text: Text to extract headers from. When ``None`` the
            text is fetched through a new ``OctavoEccoClient``.

    Returns:
        Whatever ``get_headers_from_document_text`` returns for the text.
    """
    # Text supplied by the caller: no API round trip needed.
    if document_text is not None:
        return get_headers_from_document_text(document_text)

    fetched = OctavoEccoClient().get_text_for_document_id(document_id)
    return get_headers_from_document_text(fetched.get('text'))
Esempio n. 2
0
 def set_octavo_indices(self):
     """Call ``set_octavo_index`` on every fragment with its document text.

     Iterates ``self.fragments_by_ecco_id``. For each document ID the text
     is fetched once and handed to all fragments listed under that ID:

     * IDs containing ``"_note_"`` get ``None`` instead of a text.
     * IDs shorter than 10 characters are fetched with the EEBO client.
     * All other IDs are fetched with the ECCO client.
     """
     ecco_client = OctavoEccoClient()
     eebo_client = OctavoEeboClient()
     for doc_id, fragments in self.fragments_by_ecco_id.items():
         # TODO: handle the notes somehow!
         if "_note_" in str(doc_id):
             doc_text = None
         else:
             client = eebo_client if len(doc_id) < 10 else ecco_client
             doc_text = client.get_text_for_document_id(doc_id).get('text')
         for frag in fragments:
             frag.set_octavo_index(doc_text)
from lib.octavo_api_client import (
    OctavoEccoClient,
    OctavoEccoClusterClient
    )

from lib.fragmentlists import (
    get_fragmentlist,
    get_doctext_indexmap,
    test_fragment_text)

from lib.utils_common import create_dir_if_not_exists
from lib.headerdata_dump_common import read_docid_asciimap_csv


# Module-level API clients; the cluster client is created with timeout=600.
ecco_api_client = OctavoEccoClient()
cluster_api_client = OctavoEccoClusterClient(timeout=600)

# The CSV path is relative to the current working directory.
docids_asciimap = read_docid_asciimap_csv('data/eccoids/asciilines.csv')

# Field-name lists; field_eccocluster is passed as `fields=` further down.
fields_ecco = ["documentID", "content"]
field_eccocluster = ["documentID", "fragmentID", "text",
                     "startIndex", "endIndex"]


# Hard-coded ID of the document whose cluster data is fetched.
docid_to_process = "0162900301"

# Fetch the cluster data for that document, requesting field_eccocluster.
docid_clusterdata = (
    cluster_api_client.get_cluster_data_for_document_id(
        docid_to_process, fields=field_eccocluster))
Esempio n. 4
0

def get_datadir():
    """Return the data directory given as the single command line argument.

    Exits via ``sys.exit`` with a message when no argument is given, and
    with a different message for any other argument count.
    """
    argc = len(sys.argv)
    if argc == 2:
        return sys.argv[1]
    if argc == 1:
        sys.exit("Provide datadir.")
    sys.exit("Too many command line args.")


# ---------------------------
# main script
# ---------------------------

# NOTE(review): each client is constructed and then immediately rebound
# ("local" / None), so the constructed instance is discarded. Presumably a
# manual switch between API-backed and local operation -- confirm.
ecco_api_client = OctavoEccoClient()
ecco_api_client = "local"
cluster_api_client = OctavoEccoClusterClient()
cluster_api_client = None

# docids_asciimap = read_docid_asciimap_csv('data/eccoids/asciilines.csv')
# Directory path relative to the current working directory.
xml_img_page_datadir = ("../data/raw/ecco-xml-img/")

# Field-name lists (presumably passed as `fields=` to the API clients --
# confirm against the rest of the script).
fields_ecco = ["documentID", "content"]
field_eccocluster = [
    "documentID", "fragmentID", "text", "startIndex", "endIndex"
]

# Data directory from the command line, with a trailing "/" appended.
datadir = get_datadir() + "/"

# reuse data list of JSON files
#     # "../data/work/hackathon/",
#     # "../data/work/hume_full/",
#     ]


# all_data = []
# for path in jsonpaths:
#     jsonfiles = get_datafiles(path)
#     for jsonfileloc in jsonfiles:
#         with open(jsonfileloc, 'r') as jsonfile:
#             jsondata = json.load(jsonfile)
#             all_data.extend(jsondata)


# xxxx
# One client per corpus API, plus an empty dict that is not filled in the
# visible part of the script.
ecco_api_client = OctavoEccoClient()
eebo_api_client = OctavoEeboClient()
char_offsets = {}

# Collect the 'document_id' of every entry in galeitems, in order.
# NOTE(review): galeitems is defined outside the visible code.
text_ids = []
for item in galeitems:
    text_ids.append(item['document_id'])
    # text_ids.append(item['id_secondary'])

# text_ids = text_ids[:50]
# text_ids = ["A56206.headed_2_text", "A56206.headed_1_text"]
# text_ids = ['0081400111']
# text_ids = ["A65112.headed_2_text"]
# text_ids = ["A56206.headed_2_text"]
# text_ids = ['0818700401']
# text_ids = ["A90295.headed_1_text"]
                            get_headers_from_document_text)
from lib.author_metadata import read_author_metadata_csv
from lib.text_reuse_common import (load_good_metadata)


def read_txt_file_to_string(file_path, encoding="utf-8"):
    """Return the entire contents of a text file as a single string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to
            UTF-8 so the result does not depend on the platform locale.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as txtfile:
        return txtfile.read()


# document_id = document_id_dict.get('id')

# get doc from api
# api_limit = -1
# API clients; the cluster client is created with limit=-1 and timeout=60.
ecco_api_client = OctavoEccoClient()
cluster_api_client = OctavoEccoClusterClient(limit=-1, timeout=60)

# Hard-coded document to inspect.
document_id = "0175300500"
# Indexing with ['text'] raises if the response has no 'text' entry.
document_text = ecco_api_client.get_text_for_document_id(document_id)['text']
# document_text = document_data.get('text')
document_meta = ecco_api_client.get_document_id_metadata(document_id)

# test_text_loc = "/media/vvaara/uh-villevaara-ext1/eccotxt/ECCO_I/ECCO_2of2/RelAndPhil/0010800104/xml/0010800104.txt"
# test_text = read_txt_file_to_string(test_text_loc)

# The already fetched text is passed along with the ID.
headerdata = get_headers_for_document_id(document_id, document_text)
print("> Fetching clusterIDs ...")
cluster_ids = cluster_api_client.get_cluster_ids_list_for_document_id(
    document_id)
print("  >> Done!")
Esempio n. 7
0
#             page_snip_start = snip_start
#         else:
#             page_snip_start = page_first_char_index
#         page_snip = fulltext[page_snip_start:(page_snip_end + 1)]
#         snip_page_dict[page_number] = {
#             'snip_start': page_snip_start,
#             'snip_end': page_snip_end,
#             'snip_text': page_snip
#         }
#     return snip_page_dict

# transfer >>>

# estc T082481
# API clients; the cluster client is created with timeout=600.
cluster_api_client = OctavoEccoClusterClient(timeout=600)
ecco_api_client = OctavoEccoClient()

# The CSV path is relative to the current working directory.
docids_asciimap = read_docid_asciimap_csv('data/eccoids/asciilines.csv')

# Field-name lists (presumably passed as `fields=` to the API clients --
# confirm against the rest of the script).
fields_ecco = ["documentID", "content"]
field_eccocluster = [
    "documentID", "fragmentID", "text", "startIndex", "endIndex"
]

# 'text' entry of the response for one hard-coded document ID.
docid_text = ecco_api_client.get_text_for_document_id('0162200200').get('text')

from lib.octavo_api_client import (OctavoEccoClient, OctavoEccoClusterClient)
from lib.tr_bookcontainer import BookContainer

ecco_api_client = OctavoEccoClient()
humebook = BookContainer(
Esempio n. 8
0
# find_ecco_id.py

from lib.octavo_api_client import (
    OctavoEccoClient, )


def print_response(responsedata):
    """Print every item of a response as ``key: value`` lines.

    Each item is printed as one block, preceded and followed by a blank
    line. An empty or ``None`` response prints "No match" instead.

    Args:
        responsedata: Sequence of mappings (each must provide ``items()``),
            or ``None``.
    """
    # Truthiness also covers None, which len() would reject.
    if not responsedata:
        print("\nNo match\n")
        return

    for item in responsedata:
        print()
        for key, value in item.items():
            # str(key) keeps non-string keys from breaking concatenation.
            print(str(key) + ": " + str(value))
        print()


ecco_api_client = OctavoEccoClient()

# Look up the metadata for one hard-coded ESTC id and print the result.
print_response(ecco_api_client.get_estc_id_metadata("R223440"))
Esempio n. 9
0
# Output path prefixes and per-document metadata collected by the loop below.
all_outpaths = []
documents_meta_dict = {}

# NOTE(review): document_ids, outpath_prefix_base and api_limit are defined
# outside the visible code.
for document_id_dict in document_ids:
    document_id = document_id_dict.get('id')

    # NOTE(review): unlike the 'below' check, this one has no `is not None`
    # test, so a missing key (assuming .get returns None) assigns None.
    if (document_id_dict.get('filter_out_year_above') != -1):
        filter_out_year_above = document_id_dict.get('filter_out_year_above')
    # NOTE(review): when a condition is false the variable is not assigned
    # here, so it keeps any value set by an earlier iteration or earlier code.
    if (document_id_dict.get('filter_out_year_below') != -1
            and document_id_dict.get('filter_out_year_below') is not None):
        filter_out_year_below = document_id_dict.get('filter_out_year_below')

    outpath_prefix = outpath_prefix_base + "/" + document_id
    all_outpaths.append(get_outpath_prefix_with_date(outpath_prefix))
    # get doc from api
    # NOTE(review): both clients are constructed anew on every iteration.
    ecco_api_client = OctavoEccoClient()
    cluster_api_client = OctavoEccoClusterClient(limit=api_limit, timeout=60)
    document_data = ecco_api_client.get_text_for_document_id(document_id)
    document_text = document_data.get('text')

    # Metadata record for this document, keyed by its ID.
    # len(document_text) raises if the response had no 'text' (None).
    documents_meta_dict[document_id] = {
        'id': document_id,
        'length': len(document_text),
        'sequence': document_id_dict.get('sequence'),
        'description': document_id_dict.get('description')
    }

    # The already fetched text is passed along with the ID.
    headerdata = get_headers_for_document_id(document_id, document_text)
    print("> Fetching clusterIDs ...")
    cluster_ids = cluster_api_client.get_cluster_ids_list_for_document_id(
        document_id)