Example #1
def get_image(id):
	# The hab.de viewer does not expose any tile metadata and just fires requests at the backend,
	# so HEAD requests are used here to find the maximum available zoom level and the tile grid size.
	class UrlMaker:
		def __init__(self, zoom):
			self.zoom = zoom

		def __call__(self, tile_x, tile_y):
			for tile_group in [0, 1, 2]:
				probable_url = f"http://diglib.hab.de/varia/{id}/TileGroup{tile_group}/{self.zoom}-{tile_x}-{tile_y}.jpg"
				head_response = requests.head(probable_url)
				if head_response.status_code == 200:
					return probable_url
			return None

	MAX_ZOOM = 10
	TILE_SIZE = 256
	max_zoom = None
	for test_zoom in range(MAX_ZOOM + 1):
		if UrlMaker(test_zoom)(0, 0) is not None:
			max_zoom = test_zoom
		else:
			# current zoom is not available - consider previous one to be maximal
			break
	assert max_zoom is not None
	print(f"Guessed max_zoom={max_zoom}")
	url_maker = UrlMaker(max_zoom)
	tiles_number_x = utils.guess_tiles_number_x(url_maker)
	print(f"Guessed tiles_number_x={tiles_number_x}")
	tiles_number_y = utils.guess_tiles_number_y(url_maker)
	print(f"Guessed tiles_number_y={tiles_number_y}")

	policy = utils.TileSewingPolicy(tiles_number_x, tiles_number_y, TILE_SIZE)
	output_filename = utils.make_output_filename(id.replace("/", "."))
	utils.download_and_sew_tiles(output_filename, url_maker, policy)
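The helpers utils.guess_tiles_number_x / utils.guess_tiles_number_y are not shown in this listing; a minimal sketch of how such a probe could work, assuming url_maker returns None for missing tiles (as the UrlMaker above does), is:

def guess_tiles_number_x(url_maker, limit=100):
	# Hypothetical sketch: walk along the first tile row until a tile is missing.
	# Assumes url_maker(tile_x, tile_y) returns None when the tile does not exist.
	for tile_x in range(limit):
		if url_maker(tile_x, 0) is None:
			return tile_x
	return limit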
Example #2
def get(id):
    # First, normalize the id
    id = id.replace('/', '_')
    if id.startswith("ABO"):
        flavour = "OnbViewer"
    elif id.startswith("DTL"):
        flavour = "RepViewer"
    else:
        raise RuntimeError(f"Can not determine flavour for {id}")

    # Second, obtain the JSESSIONID cookie value
    viewer_url = f"http://digital.onb.ac.at/{flavour}/viewer.faces?doc={id}"
    viewer_response = requests.get(viewer_url)
    cookies = viewer_response.cookies
    metadata_url = f"http://digital.onb.ac.at/{flavour}/service/viewer/imageData?doc={id}&from=1&to=1000"
    metadata = utils.get_json(metadata_url, cookies=cookies)
    output_folder = utils.make_output_folder("onb", id)
    image_data = metadata["imageData"]
    print(f"Going to download {len(image_data)} images")
    for image in image_data:
        query_args = image["queryArgs"]
        image_id = image["imageID"]
        image_url = f"http://digital.onb.ac.at/{flavour}/image?{query_args}&s=1.0&q=100"
        output_filename = utils.make_output_filename(output_folder,
                                                     image_id,
                                                     extension=None)
        if os.path.isfile(output_filename):
            print(f"Skip downloading existing image {image_id}")
            continue
        print(f"Downloading {image_id}")
        utils.get_binary(output_filename, image_url, cookies=cookies)
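A hypothetical invocation (the document id below is a placeholder, not a real ONB identifier; it only has to start with "ABO" or "DTL" for the flavour check above):

# Hypothetical usage; the id is a placeholder.
get("ABO_+Z123456789")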
Example #3
def compute_stats_for_all_combinations(combinations, pilot):
    """
    Compute statistics for all combinations of incident type and languages.
    """
    bin_folder = 'bin'

    for incident_type, languages in combinations:

        if pilot:
            languages.append('pilot')
        filename = utils.make_output_filename(bin_folder, incident_type,
                                              languages)

        with open(filename, 'rb') as f:
            collection = pickle.load(f)

        (num_incidents,
         num_with_wikipedia,
         wiki_from_which_method,
         num_with_prim_rt,
         num_with_annotations,
         desc_prim_rt,
         cntr_prim_rt,
         countries_dist,
         numwiki_dist,
         numlang_dist,
         extra_info_dist_agg,
         count_occurrences,
         count_values,
         all_info) = collection.compute_stats()

        example_incident = collection.incidents.pop()
        print(example_incident.extra_info)

        print()
        print('*' * 50)
        print('Incident type:', incident_type, '; Languages:',
              '-'.join(languages))
        print('*' * 50)
        print('Num incidents:', num_incidents)
        print('With wiki content:', num_with_wikipedia)
        print('Found by:', wiki_from_which_method)

        print('Wikipages with primary reference texts:', num_with_prim_rt)
        print('Description of primary reference texts:', desc_prim_rt)
        print('Distribution of primary reference texts:', cntr_prim_rt)
        print('Wikipages with annotations', num_with_annotations)

        print('Countries distribution:\n', countries_dist)
        print('Number of Wikipages per incident:\n', numwiki_dist)
        print('Number of languages per incident:\n', numlang_dist)
        print('Distribution of properties', extra_info_dist_agg)
        print('Count of occurrences', count_occurrences)
        print('Count of values', count_values)

        print('Incidents with full info', all_info)
    return
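A hypothetical call, showing the expected shape of the combinations argument (the incident type and language codes are placeholders):

# Hypothetical usage; 'election' and the language codes are placeholders.
compute_stats_for_all_combinations([('election', ['en', 'nl'])], pilot=False)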
Example #4
def get(id):
	children_url = f"https://kramerius.difmoe.eu/search/api/v5.0/item/uuid:{id}/children"
	children = utils.get_json(children_url)
	print(f"Downloading {len(children)} images from kramerius.difmoe.eu")
	
	output_folder = utils.make_output_folder("difmoe", id)
	for page, child in enumerate(children, start=1):
		child_pid = child["pid"]
		image_url = f"https://kramerius.difmoe.eu/search/img?pid={child_pid}&stream=IMG_FULL"
		output_filename = utils.make_output_filename(output_folder, page=page, extension="jpg")
		utils.get_binary(output_filename, image_url)
Example #5
def get_book(id):
	output_folder = utils.make_output_folder("hab", id)
	for page in range(1, 1000):
		url = f"http://diglib.hab.de/{id}/max/{page:05d}.jpg"
		output_filename = utils.make_output_filename(output_folder, page=page, extension="jpg")
		if os.path.exists(output_filename):
			print(f"Skip downloading existing page #{page:05d}")
			continue
		try:
			print(f"Downloading page #{page:05d} from {url}")
			utils.get_binary(output_filename, url)
		except ValueError:
			break
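utils.get_binary itself is not included in this listing; a plausible sketch, assuming it raises ValueError for missing pages (which is what the except clause above relies on) and returns the number of bytes written, is:

import requests

def get_binary(output_filename, url, cookies=None):
	# Hypothetical sketch only; the real utils.get_binary is not shown in these examples.
	# Assumed behaviour: raise ValueError on HTTP errors so callers can stop probing pages.
	response = requests.get(url, cookies=cookies)
	if response.status_code != 200:
		raise ValueError(f"Got HTTP {response.status_code} for {url}")
	with open(output_filename, "wb") as f:
		f.write(response.content)
	return len(response.content)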
Example #6
def download_book(manifest_url, output_folder):
    """
	Downloads entire book via IIIF protocol.
	API is documented here:
	http://iiif.io/about/
	"""
    manifest = utils.get_json(manifest_url)
    canvases = manifest["sequences"][0]["canvases"]
    for page, metadata in enumerate(canvases):
        output_filename = utils.make_output_filename(output_folder, page)
        if os.path.isfile(output_filename):
            print(f"Skip downloading existing page #{page:04d}")
            continue
        base_url = metadata["images"][-1]["resource"]["service"]["@id"]
        download_image(base_url, output_filename)
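download_image is defined elsewhere in the repository; a minimal sketch, assuming the IIIF Image API URL pattern {identifier}/{region}/{size}/{rotation}/{quality}.{format}, could look like:

def download_image(base_url, output_filename):
    # Hypothetical sketch: request the full-size rendition via the IIIF Image API.
    # "full/full/0/default.jpg" asks for the whole image at full size, unrotated.
    full_url = f"{base_url}/full/full/0/default.jpg"
    utils.get_binary(output_filename, full_url)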
Example #7
def get(id):
    output_folder = utils.make_output_folder("fulda", id)
    for page in range(1, 1000):
        # The Fulda library does not appear to expose a manifest.json, so the number of pages cannot be determined in advance.
        image_url = f"https://fuldig.hs-fulda.de/viewer/rest/image/{id}/{page:08d}.tif/full/10000,/0/default.jpg"
        output_filename = utils.make_output_filename(output_folder,
                                                     page,
                                                     extension="jpg")
        if os.path.exists(output_filename):
            print(f"Skip downloading existing page #{page:08d}")
            continue
        print(f"Downloading page {page} to {output_filename}")
        try:
            utils.get_binary(output_filename, image_url)
        except ValueError:
            break
Example #8
def get(id):
    output_folder = utils.make_output_folder("hathitrust", id)
    metadata_url = f"https://babel.hathitrust.org/cgi/imgsrv/meta?id={id}"
    metadata = utils.get_json(metadata_url)
    total_pages = metadata["total_items"]
    print(f"Going to download {total_pages} pages to {output_folder}")
    for page in range(1, total_pages + 1):
        url = f"https://babel.hathitrust.org/cgi/imgsrv/image?id={id};seq={page};width=1000000"
        output_filename = utils.make_output_filename(output_folder,
                                                     page,
                                                     extension="jpg")
        if os.path.exists(output_filename):
            print(f"Skip downloading existing page #{page:08d}")
            continue
        print(f"Downloading page {page} to {output_filename}")
        utils.get_binary(output_filename, url)
Example #9
def download_book_fast(manifest_url, output_folder):
    """
	Downloads entire book via IIIF protocol.
	Issues single request per image, but might be unsupported by certain backends.

	API is documented here:
	http://iiif.io/about/
	"""
    manifest = utils.get_json(manifest_url)
    canvases = manifest["sequences"][0]["canvases"]
    for page, metadata in enumerate(canvases):
        output_filename = utils.make_output_filename(output_folder,
                                                     page,
                                                     extension="jpg")
        if os.path.isfile(output_filename):
            print(f"Skip downloading existing page #{page:04d}")
            continue
        full_url = metadata["images"][-1]["resource"]["@id"]
        print(f"Downloading page #{page:04d} from {full_url}")
        utils.get_binary(output_filename, full_url)
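For reference, a hypothetical call (the manifest URL and folder name are placeholders, not real endpoints):

# Hypothetical usage; replace the manifest URL with a real IIIF manifest.
download_book_fast(
    "https://example.org/iiif/some-book/manifest.json",
    utils.make_output_folder("iiif", "some-book"))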
Example #10
def get(id):
    full_id = f"oai:www.internetculturale.sbn.it/{id}"
    # FIXME: this xpath is just broken
    # metadata_url = f"http://www.internetculturale.it/jmms/magparser?id={full_id}&teca=MagTeca+-+ICCU&mode=all"
    # metadata = utils.get_xml(metadata_url)
    # page_nodes = metadata.findall("./package/medias/media[1]/pages")
    # page_count = int(page_nodes[0].attrib("count"))
    page_url_base = f"http://www.internetculturale.it/jmms/objdownload?id={full_id}&teca=MagTeca%20-%20ICCU&resource=img&mode=raw"

    output_folder = utils.make_output_folder("iculturale", id)
    for page in range(1, 1000):
        page_url = f"{page_url_base}&start={page}"
        print(f"Downloading page #{page} from {page_url}")
        output_filename = utils.make_output_filename(output_folder,
                                                     page=page,
                                                     extension="jpg")
        if os.path.exists(output_filename):
            print(f"Skip downloading existing page #{page:08d}")
            continue
        data_size = utils.get_binary(output_filename, page_url)
        if data_size == 0:
            os.remove(output_filename)
            break
Example #11
import pickle

import utils
import config

if __name__ == '__main__':

    incident_types = config.incident_types
    languages_list = config.languages_list

    cartesian_product = [(x, y) for x in incident_types
                         for y in languages_list]

    for incident_type, languages in cartesian_product:
        filename = utils.make_output_filename(incident_type, languages)

        with open(filename, 'rb') as f:
            collection = pickle.load(f)

        ttl_filename = filename.rsplit('.', 1)[0] + '.ttl'
        collection.serialize(ttl_filename)
            if not len(incidents):
                print('NO INCIDENTS FOUND FOR %s. Continuing to next type...' % incident_type)
                continue

            new_incidents = obtain_reference_texts(incidents, wiki_folder,
                                                   wiki_uri2path_info,
                                                   language2info)

            collection = classes.IncidentCollection(
                incidents=new_incidents,
                incident_type=incident_type,
                incident_type_uri=inc_type_uri,
                languages=languages)

            output_file = utils.make_output_filename(bin_folder, incident_type,
                                                     languages)

            with open(output_file, 'wb') as of:
                pickle.dump(collection, of)

            inc_stats.append(len(collection.incidents))

            ttl_filename = '%s/%s_%s.ttl' % (rdf_folder, incident_type,
                                             '_'.join(languages))
            collection.serialize(ttl_filename)

            after_extraction = time.time()

            pilots = pilot_utils.create_pilot_data(collection)

            after_pilot_selection = time.time()