def get_image(id):
    # The hab.de site does not expose any metadata and just sends unnecessary requests to the backend.
    # Using HEAD requests to guess the maximum available zoom level and the tile grid size.
    class UrlMaker:
        def __init__(self, zoom):
            self.zoom = zoom

        def __call__(self, tile_x, tile_y):
            for tile_group in [0, 1, 2]:
                probable_url = f"http://diglib.hab.de/varia/{id}/TileGroup{tile_group}/{self.zoom}-{tile_x}-{tile_y}.jpg"
                head_response = requests.head(probable_url)
                if head_response.status_code == 200:
                    return probable_url
            return None

    MAX_ZOOM = 10
    TILE_SIZE = 256

    max_zoom = None
    for test_zoom in range(MAX_ZOOM + 1):
        if UrlMaker(test_zoom)(0, 0) is not None:
            max_zoom = test_zoom
        else:
            # current zoom is not available - consider the previous one to be maximal
            break
    assert max_zoom is not None
    print(f"Guessed max_zoom={max_zoom}")

    url_maker = UrlMaker(max_zoom)
    tiles_number_x = utils.guess_tiles_number_x(url_maker)
    print(f"Guessed tiles_number_x={tiles_number_x}")
    tiles_number_y = utils.guess_tiles_number_y(url_maker)
    print(f"Guessed tiles_number_y={tiles_number_y}")

    policy = utils.TileSewingPolicy(tiles_number_x, tiles_number_y, TILE_SIZE)
    output_filename = utils.make_output_filename(id.replace("/", "."))
    utils.download_and_sew_tiles(output_filename, url_maker, policy)
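# utils.guess_tiles_number_x/guess_tiles_number_y are not shown in this section.
# A minimal sketch of the probing idea they rely on, assuming a helper of this shape
# (the name, signature and upper bound are assumptions, not the actual utils code):
def guess_tiles_number_x(url_maker, max_tiles=100):
    # Probe tile columns left to right until the backend stops answering with an image;
    # the y-direction helper is analogous, probing url_maker(0, tile_y).
    tiles_number_x = 0
    while tiles_number_x < max_tiles and url_maker(tiles_number_x, 0) is not None:
        tiles_number_x += 1
    return tiles_number_x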
def get(id):
    # First, normalize the id
    id = id.replace('/', '_')
    if id.startswith("ABO"):
        flavour = "OnbViewer"
    elif id.startswith("DTL"):
        flavour = "RepViewer"
    else:
        raise RuntimeError(f"Cannot determine flavour for {id}")

    # Second, obtain the JSESSIONID cookie value
    viewer_url = f"http://digital.onb.ac.at/{flavour}/viewer.faces?doc={id}"
    viewer_response = requests.get(viewer_url)
    cookies = viewer_response.cookies

    metadata_url = f"http://digital.onb.ac.at/{flavour}/service/viewer/imageData?doc={id}&from=1&to=1000"
    metadata = utils.get_json(metadata_url, cookies=cookies)
    output_folder = utils.make_output_folder("onb", id)

    image_data = metadata["imageData"]
    print(f"Going to download {len(image_data)} images")
    for image in image_data:
        query_args = image["queryArgs"]
        image_id = image["imageID"]
        image_url = f"http://digital.onb.ac.at/{flavour}/image?{query_args}&s=1.0&q=100"
        output_filename = utils.make_output_filename(output_folder, image_id, extension=None)
        if os.path.isfile(output_filename):
            print(f"Skip downloading existing image {image_id}")
            continue
        print(f"Downloading {image_id}")
        utils.get_binary(output_filename, image_url, cookies=cookies)
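# utils.get_json and utils.get_binary are project helpers whose implementation is not
# shown in this section. A minimal sketch of what the calls above assume, based purely
# on how they are used here (the real helpers may differ, e.g. in how they signal
# missing pages):
import requests

def get_json(url, cookies=None):
    # Fetch a URL and decode the response body as JSON.
    response = requests.get(url, cookies=cookies)
    response.raise_for_status()
    return response.json()

def get_binary(output_filename, url, cookies=None):
    # Download a URL into a local file and return the number of bytes written.
    # Raising ValueError on a failed request matches how the sequential downloaders
    # below use it to detect the last page (an assumption about the real helper).
    response = requests.get(url, cookies=cookies)
    if response.status_code != 200:
        raise ValueError(f"Got HTTP {response.status_code} for {url}")
    with open(output_filename, "wb") as f:
        f.write(response.content)
    return len(response.content)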
def compute_stats_for_all_combinations(combinations, pilot):
    """
    Compute statistics for all combinations of incident type and languages.
    """
    bin_folder = 'bin'
    for incident_type, languages in combinations:
        if pilot:
            languages.append('pilot')
        filename = utils.make_output_filename(bin_folder, incident_type, languages)
        with open(filename, 'rb') as f:
            collection = pickle.load(f)

        (num_incidents,
         num_with_wikipedia,
         wiki_from_which_method,
         num_with_prim_rt,
         num_with_annotations,
         desc_prim_rt,
         cntr_prim_rt,
         countries_dist,
         numwiki_dist,
         numlang_dist,
         extra_info_dist_agg,
         count_occurrences,
         count_values,
         all_info) = collection.compute_stats()

        example_incident = collection.incidents.pop()
        print(example_incident.extra_info)
        print()

        print('*' * 50)
        print('Incident type:', incident_type, '; Languages:', '-'.join(languages))
        print('*' * 50)
        print('Num incidents:', num_incidents)
        print('With wiki content:', num_with_wikipedia)
        print('Found by:', wiki_from_which_method)
        print('Wikipages with primary reference texts:', num_with_prim_rt)
        print('Description of primary reference texts:', desc_prim_rt)
        print('Distribution of primary reference texts:', cntr_prim_rt)
        print('Wikipages with annotations:', num_with_annotations)
        print('Countries distribution:\n', countries_dist)
        print('Number of Wikipages per incident:\n', numwiki_dist)
        print('Number of languages per incident:\n', numlang_dist)
        print('Distribution of properties:', extra_info_dist_agg)
        print('Count of occurrences:', count_occurrences)
        print('Count of values:', count_values)
        print('Incidents with full info:', all_info)
    return
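# A minimal usage sketch, assuming `config.incident_types` and `config.languages_list`
# are defined as in the serialization script further down (building the combinations
# this way is an assumption; the copy of `languages` avoids mutating the config when
# pilot mode appends 'pilot'):
if __name__ == '__main__':
    combinations = [(incident_type, list(languages))
                    for incident_type in config.incident_types
                    for languages in config.languages_list]
    compute_stats_for_all_combinations(combinations, pilot=False)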
def get(id):
    children_url = f"https://kramerius.difmoe.eu/search/api/v5.0/item/uuid:{id}/children"
    children = utils.get_json(children_url)
    print(f"Downloading {len(children)} images from kramerius.difmoe.eu")
    output_folder = utils.make_output_folder("difmoe", id)
    for page, child in enumerate(children, start=1):
        child_pid = child["pid"]
        image_url = f"https://kramerius.difmoe.eu/search/img?pid={child_pid}&stream=IMG_FULL"
        output_filename = utils.make_output_filename(output_folder, page=page, extension="jpg")
        utils.get_binary(output_filename, image_url)
def get_book(id):
    output_folder = utils.make_output_folder("hab", id)
    for page in range(1, 1000):
        url = f"http://diglib.hab.de/{id}/max/{page:05d}.jpg"
        output_filename = utils.make_output_filename(output_folder, page=page, extension="jpg")
        if os.path.exists(output_filename):
            print(f"Skip downloading existing page #{page:05d}")
            continue
        try:
            print(f"Downloading page #{page:05d} from {url}")
            utils.get_binary(output_filename, url)
        except ValueError:
            # No more pages available - assume the book has ended
            break
def download_book(manifest_url, output_folder):
    """
    Downloads entire book via IIIF protocol.
    API is documented here: http://iiif.io/about/
    """
    manifest = utils.get_json(manifest_url)
    canvases = manifest["sequences"][0]["canvases"]
    for page, metadata in enumerate(canvases):
        output_filename = utils.make_output_filename(output_folder, page)
        if os.path.isfile(output_filename):
            print(f"Skip downloading existing page #{page:04d}")
            continue
        base_url = metadata["images"][-1]["resource"]["service"]["@id"]
        download_image(base_url, output_filename)
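# download_image is defined elsewhere in the project. A minimal sketch, assuming it
# builds a IIIF Image API request from the service @id; the "full/full/0/default.jpg"
# suffix follows the IIIF Image API URL scheme ({region}/{size}/{rotation}/{quality}.{format}),
# but the real helper may tile the image or pick a different size:
def download_image(base_url, output_filename):
    # Request the whole image at full size in its default quality.
    full_url = f"{base_url}/full/full/0/default.jpg"
    print(f"Downloading {full_url}")
    utils.get_binary(output_filename, full_url)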
def get(id):
    output_folder = utils.make_output_folder("fulda", id)
    # It looks like Fulda library does not use manifest.json,
    # hence it is not possible to guess number of pages in the book in advance
    for page in range(1, 1000):
        image_url = f"https://fuldig.hs-fulda.de/viewer/rest/image/{id}/{page:08d}.tif/full/10000,/0/default.jpg"
        output_filename = utils.make_output_filename(output_folder, page, extension="jpg")
        if os.path.exists(output_filename):
            print(f"Skip downloading existing page #{page:08d}")
            continue
        print(f"Downloading page {page} to {output_filename}")
        try:
            utils.get_binary(output_filename, image_url)
        except ValueError:
            break
def get(id):
    output_folder = utils.make_output_folder("hathitrust", id)
    metadata_url = f"https://babel.hathitrust.org/cgi/imgsrv/meta?id={id}"
    metadata = utils.get_json(metadata_url)
    total_pages = metadata["total_items"]
    print(f"Going to download {total_pages} pages to {output_folder}")
    for page in range(1, total_pages + 1):
        url = f"https://babel.hathitrust.org/cgi/imgsrv/image?id={id};seq={page};width=1000000"
        output_filename = utils.make_output_filename(output_folder, page, extension="jpg")
        if os.path.exists(output_filename):
            print(f"Skip downloading existing page #{page:08d}")
            continue
        print(f"Downloading page {page} to {output_filename}")
        utils.get_binary(output_filename, url)
def download_book_fast(manifest_url, output_folder):
    """
    Downloads entire book via IIIF protocol.
    Issues a single request per image, but might be unsupported by certain backends.
    API is documented here: http://iiif.io/about/
    """
    manifest = utils.get_json(manifest_url)
    canvases = manifest["sequences"][0]["canvases"]
    for page, metadata in enumerate(canvases):
        output_filename = utils.make_output_filename(output_folder, page, extension="jpg")
        if os.path.isfile(output_filename):
            print(f"Skip downloading existing page #{page:04d}")
            continue
        full_url = metadata["images"][-1]["resource"]["@id"]
        print(f"Downloading page #{page:04d} from {full_url}")
        utils.get_binary(output_filename, full_url)
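# Example invocation (the manifest URL and output folder below are placeholders,
# not taken from a real backend):
if __name__ == "__main__":
    download_book_fast(
        "https://example.org/iiif/some-book/manifest.json",
        "downloads/some-book",
    )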
def get(id):
    full_id = f"oai:www.internetculturale.sbn.it/{id}"

    # FIXME: this xpath is just broken
    # metadata_url = f"http://www.internetculturale.it/jmms/magparser?id={full_id}&teca=MagTeca+-+ICCU&mode=all"
    # metadata = utils.get_xml(metadata_url)
    # page_nodes = metadata.findall("./package/medias/media[1]/pages")
    # page_count = int(page_nodes[0].attrib("count"))

    page_url_base = f"http://www.internetculturale.it/jmms/objdownload?id={full_id}&teca=MagTeca%20-%20ICCU&resource=img&mode=raw"
    output_folder = utils.make_output_folder("iculturale", id)
    for page in range(1, 1000):
        page_url = f"{page_url_base}&start={page}"
        output_filename = utils.make_output_filename(output_folder, page=page, extension="jpg")
        if os.path.exists(output_filename):
            print(f"Skip downloading existing page #{page:08d}")
            continue
        print(f"Downloading page #{page} from {page_url}")
        data_size = utils.get_binary(output_filename, page_url)
        if data_size == 0:
            # An empty response means we have run past the last page
            os.remove(output_filename)
            break
import pickle

import utils
import config


if __name__ == '__main__':
    incident_types = config.incident_types
    languages_list = config.languages_list

    cartesian_product = [(x, y) for x in incident_types for y in languages_list]

    for incident_type, languages in cartesian_product:
        filename = utils.make_output_filename(incident_type, languages)
        with open(filename, 'rb') as f:
            collection = pickle.load(f)
        ttl_filename = filename.rsplit('.', 1)[0] + '.ttl'
        collection.serialize(ttl_filename)
# Fragment from the extraction script, inside the loop over incident types and languages:
if not len(incidents):
    print('NO INCIDENTS FOUND FOR %s. Continuing to next type...' % incident_type)
    continue

new_incidents = obtain_reference_texts(incidents, wiki_folder, wiki_uri2path_info, language2info)

collection = classes.IncidentCollection(
    incidents=new_incidents,
    incident_type=incident_type,
    incident_type_uri=inc_type_uri,
    languages=languages)

output_file = utils.make_output_filename(bin_folder, incident_type, languages)
with open(output_file, 'wb') as of:
    pickle.dump(collection, of)

inc_stats.append(len(collection.incidents))

ttl_filename = '%s/%s_%s.ttl' % (rdf_folder, incident_type, '_'.join(languages))
collection.serialize(ttl_filename)

after_extraction = time.time()

pilots = pilot_utils.create_pilot_data(collection)

after_pilot_selection = time.time()