Ejemplo n.º 1
0
	def process_constituent_row(constituent, current_id):
		"""Fold one altname CSV row into the constituent currently being built.

		Rows arrive grouped by constituent id; when the id changes, the
		finished accumulator is flushed to elasticsearch and the stored
		document for the new id is loaded.  Assumes `row`, `indices`, `save`
		and the elasticsearch helpers come from the enclosing scope.
		Returns the updated (constituent, current_id) pair.
		"""
		constituent_id = row[indices['constituent_id_index']]
		doc_type = CONSTITUENTTYPES.get(int(row[indices['type_id_index']]))

		if constituent_id != current_id:
			# A new id means the previous constituent is complete: persist it,
			# then start accumulating onto the stored document for this id.
			save(constituent)
			current_id = constituent_id
			constituent = {}
			if not elasticsearch_connection.item_exists(constituent_id, doc_type, ELASTICSEARCH_INDEX):
				print("%s could not be found!" % constituent_id)
				return (constituent, current_id)
			constituent = elasticsearch_connection.get_item(constituent_id, doc_type, ELASTICSEARCH_INDEX)

		constituent.setdefault('altnames', []).append({
			'name' : row[indices['altname_index']],
			'type' : row[indices['name_type_index']]
		})
		return (constituent, current_id)
Ejemplo n.º 2
0
    def process_constituent_row(constituent, current_id):
        """Fold one publication-document CSV row into the constituent being built.

        Flushes the previous constituent to elasticsearch when the row's
        constituent id changes, then appends this row's pubdoc entry to
        constituent['relateditems']['pubdocs'] (kept sorted by display text).
        Assumes `row`, `indices`, `save` and helpers come from the enclosing
        scope.  Returns the updated (constituent, current_id) pair.
        """
        constituent_id = row[indices["constituent_id_index"]]
        type_key = int(row[indices["type_id_index"]])
        # renamed from `type` to avoid shadowing the builtin
        constituent_type = CONSTITUENTTYPES.get(type_key)

        if constituent_id != current_id:
            # will likely have multiple rows for one constituent because of many related published
            # only get a new constituent if we have a new constituent id, but first save old constituent to elasticsearch
            save(constituent)
            current_id = constituent_id
            constituent = {}
            if elasticsearch_connection.item_exists(constituent_id, constituent_type):
                constituent = elasticsearch_connection.get_item(constituent_id, constituent_type)
            else:
                # print() replaces the Python 2 print statement so this is
                # valid Python 3, consistent with the other examples here
                print("%s could not be found!" % constituent_id)
                return (constituent, current_id)
        if "relateditems" not in constituent:
            constituent["relateditems"] = {}

        reference_id = row[indices["reference_id_index"]]
        title = row[indices["title_index"]]
        boiler_text = row[indices["boiler_text_index"]]
        date = row[indices["date_index"]]
        main_url = get_media_url(row[indices["path_index"]], row[indices["file_index"]])

        if "pubdocs" not in constituent["relateditems"]:
            constituent["relateditems"]["pubdocs"] = []
        constituent["relateditems"]["pubdocs"].append(
            {"id": reference_id, "boilertext": boiler_text, "displaytext": title, "date": date, "url": main_url}
        )
        # keep the related items sorted
        constituent["relateditems"]["pubdocs"].sort(key=operator.itemgetter("displaytext"))
        return (constituent, current_id)
Ejemplo n.º 3
0
    def process_media_row(media, current_id):
        """Fold one constituent-role CSV row into the media record being built.

        Skips Microfilm/Document media, flushes the previous media item to
        elasticsearch when the row's id changes, updates the media's role
        list, and appends this row's constituent to media['relateditems'].
        Assumes `row`, `indices`, `save` and helpers come from the enclosing
        scope.  Returns the updated (media, current_id) pair.
        """
        id = row[indices['id_index']]
        media_type_key = int(row[indices['media_type_id_index']])
        media_type = MEDIATYPES.get(media_type_key)

        # for now, ignore Microfilm (4) and Document (5) media types
        if (media_type_key in [4, 5]):
            return (media, current_id)

        if id != current_id:
            save(media)
            current_id = id
            media = {}
            if elasticsearch_connection.item_exists(id, media_type):
                media = elasticsearch_connection.get_item(id, media_type)
            else:
                # print() replaces the Python 2 print statement (Python 3)
                print("%s could not be found!" % id)
                return (media, current_id)
        if 'relateditems' not in media:
            media['relateditems'] = {}

        constituent_id = row[indices['constituent_id_index']]
        display_name = row[indices['display_name_index']]
        description = row[indices['remarks_index']] if row[
            indices['remarks_index']] != "NULL" else ""
        display_date = ""
        if row[indices['display_date_index']] != "NULL":
            display_date = row[indices['display_date_index']]
        thumbnail_url = get_media_url(row[indices['thumb_path_index']],
                                      row[indices['thumb_file_index']])

        constituent_dict = {}
        role = row[indices['role_index']]
        # update the set of roles for this media
        # NOTE(review): assumes media['roles'] already exists on the stored
        # document (a missing key would raise KeyError) — confirm upstream.
        if role not in media['roles']:
            # make sure Photographer is first
            if role == "Photographer":
                media['roles'].insert(0, role)
            else:
                media['roles'].append(role)

        constituent_dict['role'] = role
        constituent_dict['roleid'] = row[indices['role_id_index']]
        constituent_dict['id'] = constituent_id
        constituent_dict['displayname'] = display_name
        constituent_dict['displaydate'] = display_date
        constituent_dict['displaytext'] = display_name
        constituent_dict['description'] = description
        constituent_dict['thumbnail'] = thumbnail_url

        constituent_type_key = int(row[indices['constituent_type_id_index']])
        constituent_type = CONSTITUENTTYPES.get(constituent_type_key)
        if constituent_type not in media['relateditems']:
            media['relateditems'][constituent_type] = []
        media['relateditems'][constituent_type].append(constituent_dict)
        # keep the related items sorted
        media['relateditems'][constituent_type].sort(
            key=operator.itemgetter('displaytext'))

        return (media, current_id)
Ejemplo n.º 4
0
	def process_site_row(site, current_id):
		"""Fold one related-photo CSV row into the site record being built.

		Flushes the previous site to elasticsearch when the row's site id
		changes, then appends this row's media-master id to
		site['relateditems']['photos'].  Assumes `row` and the index
		variables come from the enclosing scope.  Returns (site, current_id).
		"""
		site_id = row[site_id_index]
		#if site_id not in SAMPLE_SITES:
		#	continue
		if site_id != current_id:
			# will likely have multiple rows for one site because of many related photos
			# only get a new site if we have a new site id, but first save old site to elasticsearch
			save(site)
			current_id = site_id
			site = {}
			if elasticsearch_connection.item_exists(site_id, 'sites'):
				site = elasticsearch_connection.get_item(site_id, 'sites')
			else:
				# print() replaces the Python 2 print statement (Python 3)
				print("%s could not be found!" % site_id)
				return(site, current_id)
		if 'relateditems' not in site:
			site['relateditems'] = {}

		media_master_id = row[media_master_id_index]
		if "photos" not in site['relateditems']:
			site['relateditems']["photos"] = []
		site['relateditems']["photos"].append({
			'id' : media_master_id,
			'displaytext' : media_master_id
			})
		return(site, current_id)
Ejemplo n.º 5
0
    def process_pub_row(pub, current_id):
        """Attach primary-display and PDF URLs from this row to the pub.

        Flushes the previous pub to elasticsearch when the row's pub id
        changes.  Assumes `row`, `indices` and helpers come from the
        enclosing scope.  Returns the updated (pub, current_id) pair.
        """
        pub_id = row[indices['pub_id_index']]

        if pub_id != current_id:
            # New pub id: persist the finished pub, then load the stored doc.
            save(pub)
            current_id = pub_id
            pub = {}
            if not elasticsearch_connection.item_exists(pub_id, 'pubdocs',
                                                        ELASTICSEARCH_INDEX):
                print("%s could not be found!" % pub_id)
                return (pub, current_id)
            pub = elasticsearch_connection.get_item(pub_id, 'pubdocs',
                                                    ELASTICSEARCH_INDEX)

        thumb_url = get_media_url(row[indices['thumb_path_index']],
                                  row[indices['thumb_file_index']])
        pdf_url = get_media_url(row[indices['main_path_index']],
                                row[indices['main_file_index']])

        # Both display slots use the thumbnail; the full-size media URL is
        # exposed separately as the PDF link.
        pub['primarydisplay'] = {'thumbnail': thumb_url, 'main': thumb_url}
        pub['pdf'] = pdf_url

        return (pub, current_id)
Ejemplo n.º 6
0
    def process_object_row(object, current_id):
        """Fold one alternate-number CSV row into the object being built.

        Flushes the previous object to elasticsearch when the row's object id
        changes, then records this row's altnum (with and without its
        underscore prefix).  Assumes `row`, `indices` and helpers come from
        the enclosing scope.  Returns the updated (object, current_id) pair.
        """
        object_id = row[indices['object_id_index']]
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)

        if object_id != current_id:
            # will likely have multiple rows
            save(object)
            current_id = object_id
            object = {}
            if elasticsearch_connection.item_exists(object_id, classification):
                object = elasticsearch_connection.get_item(
                    object_id, classification)
            else:
                # print() replaces the Python 2 print statement (Python 3)
                print("%s could not be found!" % object_id)
                return (object, current_id)

        if 'altnums' not in object:
            object['altnums'] = []
        altnum = row[indices['altnum_index']]
        # str.find returns -1 when '_' is absent, so the slice below then
        # keeps the whole string — the prefix is only dropped when present
        prefix_idx = altnum.find('_')
        without_prefix = altnum[prefix_idx + 1:]
        description = row[indices['description_index']] if row[
            indices['description_index']] != "NULL" else ""
        object['altnums'].append({
            "altnum": altnum,
            "description": description,
            'without_prefix': without_prefix
        })
        # NOTE(review): assumes object['allnumbers'] already exists on the
        # stored document — confirm against the indexer that builds it.
        object['allnumbers'].extend((altnum, without_prefix))
        return (object, current_id)
Ejemplo n.º 7
0
	def process_site_row(site, current_id):
		"""Fold one related-constituent CSV row into the site being built.

		Flushes the previous site to elasticsearch when the row's site id
		changes, updates the site's role list (Tomb Owner first), records
		people for search, and appends the constituent to
		site['relateditems'] (kept sorted by display text).  Assumes `row`,
		`indices` and helpers come from the enclosing scope.  Returns the
		updated (site, current_id) pair.
		"""
		site_id = row[indices['site_id_index']]
		#if site_id not in SAMPLE_SITES:
		#	continue
		if site_id != current_id:
			# will likely have multiple rows for one site because of many related constituents
			# only get a new site if we have a new site id, but first save old site to elasticsearch
			save(site)
			current_id = site_id
			site = {}
			if elasticsearch_connection.item_exists(site_id, 'sites'):
				site = elasticsearch_connection.get_item(site_id, 'sites')
			else:
				# print() replaces the Python 2 print statement (Python 3)
				print("%s could not be found!" % site_id)
				return(site, current_id)
		if 'relateditems' not in site:
			site['relateditems'] = {}

		constituent_id = row[indices['constituent_id_index']]
		display_name = row[indices['display_name_index']]
		display_date = ""
		if row[indices['display_date_index']] != "NULL":
			display_date = row[indices['display_date_index']]
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

		constituent_dict = {}
		role = row[indices['role_index']]
		# update the set of roles for this site
		# NOTE(review): assumes site['roles'] already exists on the stored doc
		if role not in site['roles']:
			# make sure Tomb Owner is first
			if role == "Tomb Owner":
				site['roles'].insert(0, role)
			else:
				site['roles'].append(role)

		description = row[indices['remarks_index']] if row[indices['remarks_index']] != "NULL" else ""
		constituent_dict['role'] = role
		constituent_dict['id'] = constituent_id
		constituent_dict['displayname'] = display_name
		constituent_dict['displaydate'] = display_date
		constituent_dict['displaytext'] = display_name
		constituent_dict['description'] = description
		constituent_dict['thumbnail'] = thumbnail_url

		constituent_type_key = int(row[indices['constituent_type_id_index']])
		constituent_type = CONSTITUENTTYPES.get(constituent_type_key)

		# add to array of people for easier searching (type keys 1 and 3)
		if (constituent_type_key in [1,3]):
			site['people'].append(display_name)

		if constituent_type not in site['relateditems']:
			site['relateditems'][constituent_type] = []
		site['relateditems'][constituent_type].append(constituent_dict)
		# keep the related items sorted
		site['relateditems'][constituent_type].sort(key=operator.itemgetter('displaytext'))

		if role == 'Tomb Owner':
			site['tombowner'] = "Yes"
		return(site, current_id)
Ejemplo n.º 8
0
	def process_site_row(site, current_id):
		"""Fold one site-date CSV row into the site record being built.

		Flushes the previous site to elasticsearch when the row's site id
		changes, then appends this row's event type/date to the site's date
		lists.  Assumes `row`, `indices` and helpers come from the enclosing
		scope.  Returns the updated (site, current_id) pair.
		"""
		site_id = row[indices['site_id_index']]

		if site_id != current_id:
			# will likely have multiple rows for one site because of many related objects
			# only get a new site if we have a new site id, but first save old site to elasticsearch
			save(site)
			current_id = site_id
			site = {}
			if elasticsearch_connection.item_exists(site_id, 'sites'):
				site = elasticsearch_connection.get_item(site_id, 'sites')
			else:
				# print() replaces the Python 2 print statement (Python 3)
				print("%s could not be found!" % site_id)
				return (site, current_id)

		# BUG FIX: the date-appending code below was nested inside the
		# id-change branch above, so every row for a site already being
		# accumulated was silently dropped.  It now runs for each row,
		# matching the equivalent process_site_row elsewhere in this file.
		if 'sitedates' not in site:
			site['sitedates'] = []

		event_type = row[indices['event_type_index']]
		date_text = row[indices['date_text_index']]

		site['sitedates'].append({
			'type' : event_type,
			'date' : date_text
		})
		# NOTE(review): assumes site['datevalues'] already exists on the
		# stored document — confirm against the indexer that builds it.
		site['datevalues'].append(date_text)
		return (site, current_id)
Ejemplo n.º 9
0
	def process_site_row(site, current_id):
		"""Fold one alternate-number CSV row into the site being built.

		Flushes the previous site to elasticsearch when the row's site id
		changes, then records this row's altnum and its description type.
		Assumes `row`, `indices` and helpers come from the enclosing scope.
		Returns the updated (site, current_id) pair.
		"""
		site_id = row[indices['site_id_index']]
		#if site_id not in SAMPLE_SITES:
		#	continue

		if site_id != current_id:
			# will likely have multiple rows for one site because of many related objects
			# only get a new site if we have a new site id, but first save old site to elasticsearch
			save(site)
			current_id = site_id
			site = {}
			if elasticsearch_connection.item_exists(site_id, 'sites'):
				site = elasticsearch_connection.get_item(site_id, 'sites')
			else:
				# print() replaces the Python 2 print statement (Python 3)
				print("%s could not be found!" % site_id)
				return (site, current_id)

		if 'altnums' not in site:
			site['altnums'] = []
		if 'altnum_types' not in site:
			site['altnum_types'] = []

		altnum = row[indices['altnum_index']]
		description = row[indices['description_index']] if row[indices['description_index']] != "NULL" else ""
		# track the distinct description types seen for this site
		if description not in site['altnum_types']:
			site['altnum_types'].append(description)

		site['altnums'].append({"altnum" : altnum, "description" : description})
		return (site, current_id)
Ejemplo n.º 10
0
    def process_site_row(site, current_id):
        """Append this row's event type and date to the site's date lists.

        Flushes the previous site to elasticsearch when the row's site id
        changes.  Assumes `row`, `indices` and helpers come from the
        enclosing scope.  Returns the updated (site, current_id) pair.
        """
        site_id = row[indices['site_id_index']]

        if site_id != current_id:
            # A new site id means the previous accumulator is complete:
            # persist it, then load the stored document for the new site.
            save(site)
            current_id = site_id
            site = {}
            if not elasticsearch_connection.item_exists(site_id, 'sites',
                                                        ELASTICSEARCH_INDEX):
                print("%s could not be found!" % site_id)
                return (site, current_id)
            site = elasticsearch_connection.get_item(site_id, 'sites',
                                                     ELASTICSEARCH_INDEX)

        event_type = row[indices['event_type_index']]
        date_text = row[indices['date_text_index']]

        site.setdefault('sitedates', []).append({'type': event_type,
                                                 'date': date_text})
        site['datevalues'].append(date_text)
        return (site, current_id)
Ejemplo n.º 11
0
    def process_object_row(object, current_id):
        """Fold one flex-field CSV row into the object record being built.

        Flushes the previous object to elasticsearch when the row's object id
        changes, then files this row's field name/value under its group in
        object['flexfields'].  Assumes `row`, `indices` and helpers come from
        the enclosing scope.  Returns the updated (object, current_id) pair.
        """
        object_id = row[indices['object_id_index']]
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)

        if object_id != current_id:
            # will likely have multiple rows
            save(object)
            current_id = object_id
            object = {}
            if elasticsearch_connection.item_exists(object_id, classification):
                object = elasticsearch_connection.get_item(
                    object_id, classification)
            else:
                # print() replaces the Python 2 print statement (Python 3)
                print("%s could not be found!" % object_id)
                return (object, current_id)

        if 'flexfields' not in object:
            object['flexfields'] = {}

        groupname = row[indices['group_name_index']]
        if groupname not in object['flexfields']:
            object['flexfields'][groupname] = []

        fieldname = row[indices['field_name_index']]
        fieldvalue = row[indices['field_value_index']]
        object['flexfields'][groupname].append({fieldname: fieldvalue})
        return (object, current_id)
Ejemplo n.º 12
0
    def process_site_row(site, current_id):
        """Record one alternate number (and its description type) on the site.

        Flushes the previous site to elasticsearch when the row's site id
        changes.  Assumes `row`, `indices` and helpers come from the
        enclosing scope.  Returns the updated (site, current_id) pair.
        """
        site_id = row[indices['site_id_index']]

        if site_id != current_id:
            # New site id: persist the finished site, then load the stored
            # document so further rows accumulate onto it.
            save(site)
            current_id = site_id
            site = {}
            if not elasticsearch_connection.item_exists(site_id, 'sites',
                                                        ELASTICSEARCH_INDEX):
                print("%s could not be found!" % site_id)
                return (site, current_id)
            site = elasticsearch_connection.get_item(site_id, 'sites',
                                                     ELASTICSEARCH_INDEX)

        altnums = site.setdefault('altnums', [])
        altnum_types = site.setdefault('altnum_types', [])

        altnum = row[indices['altnum_index']]
        raw_description = row[indices['description_index']]
        description = "" if raw_description == "NULL" else raw_description
        # keep a list of the distinct description types seen for this site
        if description not in altnum_types:
            altnum_types.append(description)

        altnums.append({"altnum": altnum, "description": description})
        return (site, current_id)
Ejemplo n.º 13
0
	def process_object_row(object, current_id):
		"""Fold one alternate-number CSV row into the object being built.

		Flushes the previous object to elasticsearch when the row's object id
		changes, then records the altnum both with and without its underscore
		prefix.  Assumes `row`, `indices` and helpers come from the enclosing
		scope.  Returns the updated (object, current_id) pair.
		"""
		object_id = row[indices['object_id_index']]
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)

		if object_id != current_id:
			# will likely have multiple rows
			save(object)
			current_id = object_id
			object = {}
			if elasticsearch_connection.item_exists(object_id, classification):
				object = elasticsearch_connection.get_item(object_id, classification)
			else:
				# print() replaces the Python 2 print statement (Python 3)
				print("%s could not be found!" % object_id)
				return (object, current_id)

		if 'altnums' not in object:
			object['altnums'] = []
		altnum = row[indices['altnum_index']]
		# find returns -1 when '_' is absent; the slice then keeps the whole string
		prefix_idx = altnum.find('_')
		without_prefix = altnum[prefix_idx+1:]
		description = row[indices['description_index']] if row[indices['description_index']] != "NULL" else ""
		object['altnums'].append({"altnum" : altnum, "description" : description, 'without_prefix': without_prefix})
		# NOTE(review): assumes object['allnumbers'] exists on the stored doc
		object['allnumbers'].extend((altnum, without_prefix))
		return (object, current_id)
Ejemplo n.º 14
0
	def process_object_row(object, current_id):
		"""Fold one flex-field CSV row into the object record being built.

		Flushes the previous object to elasticsearch when the row's object id
		changes, then files this row's field name/value under its group in
		object['flexfields'].  Assumes `row`, `indices` and helpers come from
		the enclosing scope.  Returns the updated (object, current_id) pair.
		"""
		object_id = row[indices['object_id_index']]
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)

		if object_id != current_id:
			# will likely have multiple rows
			save(object)
			current_id = object_id
			object = {}
			if elasticsearch_connection.item_exists(object_id, classification):
				object = elasticsearch_connection.get_item(object_id, classification)
			else:
				# print() replaces the Python 2 print statement (Python 3)
				print("%s could not be found!" % object_id)
				return (object, current_id)

		if 'flexfields' not in object:
			object['flexfields'] = {}

		groupname = row[indices['group_name_index']]
		if groupname not in object['flexfields']:
			object['flexfields'][groupname] = []

		fieldname = row[indices['field_name_index']]
		fieldvalue = row[indices['field_value_index']]
		object['flexfields'][groupname].append({fieldname : fieldvalue})
		return (object, current_id)
Ejemplo n.º 15
0
	def process_pub_row(pub, current_id):
		"""Fold one related-site CSV row into the pub record being built.

		Flushes the previous pub to elasticsearch when the row's id changes,
		then appends this row's site to pub['relateditems']['sites'] (kept
		sorted by display text).  Assumes `row`, `indices` and helpers come
		from the enclosing scope.  Returns the updated (pub, current_id) pair.
		"""
		# renamed from `id` to avoid shadowing the builtin
		pub_id = row[indices['id_index']]

		if pub_id != current_id:
			save(pub)
			current_id = pub_id
			pub = {}
			if elasticsearch_connection.item_exists(pub_id, 'pubdocs'):
				pub = elasticsearch_connection.get_item(pub_id, 'pubdocs')
			else:
				# print() replaces the Python 2 print statement (Python 3)
				print("%s could not be found!" % pub_id)
				return(pub, current_id)
		if 'relateditems' not in pub:
			pub['relateditems'] = {}

		site_id = row[indices['site_id_index']]
		site_name = row[indices['site_name_index']]
		site_number = row[indices['site_number_index']]
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

		site_dict = {}
		site_dict['id'] = site_id
		site_dict['sitename'] = site_name
		site_dict['sitenumber'] = site_number
		site_dict['displaytext'] = site_number
		site_dict['thumbnail'] = thumbnail_url

		if 'sites' not in pub['relateditems']:
			pub['relateditems']['sites'] = []
		pub['relateditems']['sites'].append(site_dict)
		# keep the related items sorted
		pub['relateditems']['sites'].sort(key=operator.itemgetter('displaytext'))

		return(pub, current_id)
Ejemplo n.º 16
0
	def process_object_row(object, current_id):
		"""Fold one related-photo CSV row into the object record being built.

		Flushes the previous object to elasticsearch when the row's id
		changes, then appends this row's media-master id to
		object['relateditems']['photos'].  Assumes `row` and the index
		variables come from the enclosing scope.  Returns (object, current_id).
		"""
		# renamed from `id` to avoid shadowing the builtin
		object_id = row[id_index]
		classification_key = int(row[classification_id_index])
		classification = CLASSIFICATIONS.get(classification_key)

		if object_id != current_id:
			# may have multiple rows for one object because of many related constituents
			save(object)
			current_id = object_id
			object = {}
			if elasticsearch_connection.item_exists(object_id, classification):
				object = elasticsearch_connection.get_item(object_id, classification)
			else:
				# print() replaces the Python 2 print statement (Python 3)
				print("%s could not be found!" % object_id)
				return(object, current_id)
		if 'relateditems' not in object:
			object['relateditems'] = {}

		media_master_id = row[media_master_id_index]

		if 'photos' not in object['relateditems']:
			object['relateditems']['photos'] = []
		object['relateditems']['photos'].append({
			'id' : media_master_id,
			'displaytext' : media_master_id})
		return(object, current_id)
Ejemplo n.º 17
0
    def process_constituent_row(constituent, current_id):
        """Fold one related-object CSV row into the constituent being built.

        Flushes the previous constituent to elasticsearch when the row's id
        changes, then appends this row's object (keyed by classification) to
        constituent['relateditems'], kept sorted by display text.  Assumes
        `row`, `indices` and helpers come from the enclosing scope.  Returns
        the updated (constituent, current_id) pair.
        """
        constituent_id = row[indices['constituent_id_index']]
        type_key = int(row[indices['type_id_index']])
        # renamed from `type` to avoid shadowing the builtin
        constituent_type = CONSTITUENTTYPES.get(type_key)

        if constituent_id != current_id:
            # will likely have multiple rows for one constituent because of many related objects
            # only get a new constituent if we have a new constituent id, but first save old constituent to elasticsearch
            save(constituent)
            current_id = constituent_id
            constituent = {}
            if elasticsearch_connection.item_exists(constituent_id, constituent_type):
                constituent = elasticsearch_connection.get_item(
                    constituent_id, constituent_type)
            else:
                # print() replaces the Python 2 print statement (Python 3)
                print("%s could not be found!" % constituent_id)
                return (constituent, current_id)

        if 'relateditems' not in constituent:
            constituent['relateditems'] = {}
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)
        object_id = int(row[indices['object_id_index']])
        thumbnail_url = get_media_url(row[indices['thumb_path_index']],
                                      row[indices['thumb_file_index']])

        date = "" if row[indices['object_date_index']].lower(
        ) == "null" else row[indices['object_date_index']]
        object_title = row[indices['object_title_index']]
        object_number = row[indices['object_number_index']]
        if classification == "diarypages" and object_title.lower() == "null":
            # diary pages often lack a title; derive one from the object number
            idx = object_number.find('_')
            object_title = object_number[idx + 1:]
        if object_title.lower() == "null":
            object_title = "[No Title]"

        if classification not in constituent['relateditems']:
            constituent['relateditems'][classification] = []
        constituent['relateditems'][classification].append({
            'id': object_id,
            'title': object_title,
            'displaytext': object_title,
            'classificationid': classification_key,
            'number': object_number,
            'date': date,
            'thumbnail': thumbnail_url
        })
        # keep the related items sorted
        constituent['relateditems'][classification].sort(
            key=operator.itemgetter('displaytext'))

        return (constituent, current_id)
Ejemplo n.º 18
0
    def process_pub_row(pub, current_id):
        """Fold one related-constituent CSV row into the pub being built.

        Flushes the previous pub to elasticsearch when the row's pub id
        changes, updates the pub's role and author lists, and appends the
        constituent to pub['relateditems'] (kept sorted by display text).
        Assumes `row`, `indices` and helpers come from the enclosing scope.
        Returns the updated (pub, current_id) pair.
        """
        pub_id = row[indices['pub_id_index']]

        if pub_id != current_id:
            # will likely have multiple rows for one pub because of many related constituents
            # only get a new pub if we have a new pub id, but first save old pub to elasticsearch
            save(pub)
            current_id = pub_id
            pub = {}
            if elasticsearch_connection.item_exists(pub_id, 'pubdocs'):
                pub = elasticsearch_connection.get_item(pub_id, 'pubdocs')
            else:
                # print() replaces the Python 2 print statement (Python 3)
                print("%s could not be found!" % pub_id)
                return (pub, current_id)
        if 'relateditems' not in pub:
            pub['relateditems'] = {}

        constituent_id = row[indices['constituent_id_index']]
        display_name = row[indices['display_name_index']]
        display_date = ""
        if row[indices['display_date_index']] != "NULL":
            display_date = row[indices['display_date_index']]
        thumbnail_url = get_media_url(row[indices['thumb_path_index']],
                                      row[indices['thumb_file_index']])
        alpha_sort = row[indices['alpha_sort_index']]

        constituent_dict = {}
        role = row[indices['role_index']]
        # update the set of roles for this pub
        # NOTE(review): assumes pub['roles']/pub['authors'] already exist on
        # the stored document — confirm against the indexer that builds them.
        if role not in pub['roles']:
            pub['roles'].append(role)

        if role == "Author":
            pub["authors"].append(alpha_sort)

        description = row[indices['remarks_index']] if row[
            indices['remarks_index']] != "NULL" else ""
        constituent_dict['role'] = role
        constituent_dict['id'] = constituent_id
        constituent_dict['displayname'] = display_name
        constituent_dict['displaydate'] = display_date
        constituent_dict['displaytext'] = display_name
        constituent_dict['description'] = description
        constituent_dict['thumbnail'] = thumbnail_url

        constituent_type_key = int(row[indices['constituent_type_id_index']])
        constituent_type = CONSTITUENTTYPES.get(constituent_type_key)
        if constituent_type not in pub['relateditems']:
            pub['relateditems'][constituent_type] = []
        pub['relateditems'][constituent_type].append(constituent_dict)
        # keep the related items sorted
        pub['relateditems'][constituent_type].sort(
            key=operator.itemgetter('displaytext'))

        return (pub, current_id)
Ejemplo n.º 19
0
    def process_pub_row(pub, current_id):
        """Attach one related object (from this row) to the pub document.

        Flushes the previous pub to elasticsearch when the row's pub id
        changes, then appends the row's object — keyed by classification —
        to pub['relateditems'], kept sorted by display text.  Assumes `row`,
        `indices` and helpers come from the enclosing scope.  Returns the
        updated (pub, current_id) pair.
        """
        pub_id = row[indices['pub_id_index']]

        if pub_id != current_id:
            # New pub id: persist the finished pub, then load the stored doc.
            save(pub)
            current_id = pub_id
            pub = {}
            if not elasticsearch_connection.item_exists(
                    pub_id, 'pubdocs', ELASTICSEARCH_INDEX):
                print("%s could not be found!" % pub_id)
                return (pub, current_id)
            pub = elasticsearch_connection.get_item(pub_id, 'pubdocs',
                                                    ELASTICSEARCH_INDEX)

        related = pub.setdefault('relateditems', {})
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)
        object_id = int(row[indices['object_id_index']])
        thumbnail_url = get_media_url(row[indices['thumb_path_index']],
                                      row[indices['thumb_file_index']])

        raw_drs = row[indices['drs_id']]
        drs_id = "" if raw_drs.lower() == "null" else raw_drs
        has_manifest = bool(drs_id)
        if drs_id and not thumbnail_url:
            # a DRS-digitized object can supply a derived thumbnail
            thumbnail_url = create_thumbnail_url(drs_id)

        raw_date = row[indices['object_date_index']]
        date = "" if raw_date.lower() == "null" else raw_date
        object_title = row[indices['object_title_index']]
        object_number = row[indices['object_number_index']]
        if classification == "diarypages" and object_title.lower() == "null":
            # diary pages lack titles; derive one from the object number
            object_title = object_number[object_number.find('_') + 1:]
        if object_title.lower() == "null":
            object_title = "[No Title]"

        items = related.setdefault(classification, [])
        items.append({
            'id': object_id,
            'title': object_title,
            'displaytext': object_title,
            'classificationid': classification_key,
            'number': object_number,
            'date': date,
            'thumbnail': thumbnail_url,
            'has_manifest': has_manifest
        })
        # keep the related items ordered by display text
        items.sort(key=operator.itemgetter('displaytext'))

        return (pub, current_id)
Ejemplo n.º 20
0
    def process_object_row(object, current_id):
        """Fold one related-constituent CSV row into the object being built.

        Flushes the previous object to elasticsearch when the row's id
        changes, updates the object's role list, and appends the constituent
        to object['relateditems'] (kept sorted by display text).  Assumes
        `row`, `indices` and helpers come from the enclosing scope.  Returns
        the updated (object, current_id) pair.
        """
        # renamed from `id` to avoid shadowing the builtin
        object_id = row[indices['id_index']]
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)

        if object_id != current_id:
            # may have multiple rows for one object because of many related constituents
            save(object)
            current_id = object_id
            object = {}
            if elasticsearch_connection.item_exists(object_id, classification):
                object = elasticsearch_connection.get_item(object_id, classification)
            else:
                # print() replaces the Python 2 print statement (Python 3)
                print("%s could not be found!" % object_id)
                return (object, current_id)
        if 'relateditems' not in object:
            object['relateditems'] = {}

        constituent_id = row[indices['constituent_id_index']]
        display_name = row[indices['display_name_index']]
        description = row[indices['remarks_index']] if row[
            indices['remarks_index']] != "NULL" else ""
        display_date = ""
        if row[indices['display_date_index']] != "NULL":
            display_date = row[indices['display_date_index']]
        thumbnail_url = get_media_url(row[indices['thumb_path_index']],
                                      row[indices['thumb_file_index']])

        constituent_dict = {}
        role = row[indices['role_index']]
        # update the set of roles for this object
        # NOTE(review): assumes object['roles'] exists on the stored document
        if role not in object['roles']:
            object['roles'].append(role)

        constituent_dict['role'] = role
        constituent_dict['roleid'] = row[indices['role_id_index']]
        constituent_dict['id'] = constituent_id
        constituent_dict['displayname'] = display_name
        constituent_dict['displaydate'] = display_date
        constituent_dict['displaytext'] = display_name
        constituent_dict['description'] = description
        constituent_dict['thumbnail'] = thumbnail_url

        constituent_type_key = int(row[indices['constituent_type_id_index']])
        constituent_type = CONSTITUENTTYPES.get(constituent_type_key)
        if constituent_type not in object['relateditems']:
            object['relateditems'][constituent_type] = []
        object['relateditems'][constituent_type].append(constituent_dict)
        # keep the related items sorted
        object['relateditems'][constituent_type].sort(
            key=operator.itemgetter('displaytext'))

        return (object, current_id)
Ejemplo n.º 21
0
	def process_object_row(object, current_id):
		"""Fold one related-constituent CSV row into the object being built.

		Flushes the previous object to elasticsearch when the row's id
		changes, appends the constituent to object['relateditems'], and also
		files it under object['mentioned']['people'] (role id 48) or
		object['author'] (role id 50).  Assumes `row` and the index variables
		come from the enclosing scope.  Returns (object, current_id).
		"""
		# renamed from `id` to avoid shadowing the builtin
		object_id = row[id_index]
		classification_key = int(row[classification_id_index])
		classification = CLASSIFICATIONS.get(classification_key)

		if object_id != current_id:
			# may have multiple rows for one object because of many related constituents
			save(object)
			current_id = object_id
			object = {}
			if elasticsearch_connection.item_exists(object_id, classification):
				object = elasticsearch_connection.get_item(object_id, classification)
			else:
				# print() replaces the Python 2 print statement (Python 3)
				print("%s could not be found!" % object_id)
				return(object, current_id)
		if 'relateditems' not in object:
			object['relateditems'] = {}

		constituent_id = row[constituent_id_index]
		display_name = row[display_name_index]
		display_date = ""
		if row[display_date_index] != "NULL":
			display_date = row[display_date_index]

		constituent_dict = {}
		constituent_dict['role'] = row[role_index]
		constituent_dict['roleid'] = row[role_id_index]
		constituent_dict['id'] = constituent_id
		constituent_dict['displayname'] = display_name
		constituent_dict['displaydate'] = display_date
		constituent_dict['displaytext'] = display_name

		constituent_type_key = int(row[constituent_type_id_index])
		constituent_type = CONSTITUENTTYPES.get(constituent_type_key)
		if constituent_type not in object['relateditems']:
			object['relateditems'][constituent_type] = []
		object['relateditems'][constituent_type].append(constituent_dict)

		# parse out any constituents "Mentioned on this page" (RoleID==48)
		if constituent_dict['roleid'] == '48':
			if 'mentioned' not in object:
				object['mentioned'] = {}
			if 'people' not in object['mentioned']:
				object['mentioned']['people'] = []
			object['mentioned']['people'].append(constituent_dict)

		# parse out any "Author" (RoleID==50)
		if constituent_dict['roleid'] == '50':
			if 'author' not in object:
				object['author'] = []
			object['author'].append(constituent_dict)

		return(object, current_id)
Ejemplo n.º 22
0
    def process_media_row(media, current_id):
        """Fold one row of related-object data into the media record.

        Reads `row`/`indices` from the enclosing scope.  When the row's
        media id differs from `current_id`, the previous media record is
        saved to elasticsearch and the next one is loaded.

        Returns the (possibly replaced) (media, current_id) pair.
        """
        id = row[indices['media_id_index']]
        media_type_key = int(row[indices['media_type_id_index']])
        media_type = MEDIATYPES.get(media_type_key)

        # for now, ignore Microfilm and Document media types
        if (media_type_key in [4, 5]):
            return (media, current_id)

        if id != current_id:
            save(media)
            current_id = id
            media = {}
            if elasticsearch_connection.item_exists(id, media_type):
                media = elasticsearch_connection.get_item(id, media_type)
            else:
                # print() call (not a Py2 print statement) for Python 3 compatibility
                print("%s could not be found!" % id)
                return (media, current_id)

        if 'relateditems' not in media:
            media['relateditems'] = {}
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)
        object_id = int(row[indices['object_id_index']])
        thumbnail_url = get_media_url(row[indices['thumb_path_index']],
                                      row[indices['thumb_file_index']])

        # "null" (any case) in the export means no date
        date = "" if row[indices['object_date_index']].lower(
        ) == "null" else row[indices['object_date_index']]
        object_title = row[indices['object_title_index']]
        object_number = row[indices['object_number_index']]
        # untitled diary pages take their title from the object number's suffix
        if classification == "diarypages" and object_title.lower() == "null":
            idx = object_number.find('_')
            object_title = object_number[idx + 1:]
        if object_title.lower() == "null":
            object_title = "[No Title]"

        if classification not in media['relateditems']:
            media['relateditems'][classification] = []
        media['relateditems'][classification].append({
            'id': object_id,
            'title': object_title,
            'displaytext': object_title,
            'classificationid': classification_key,
            'number': object_number,
            'date': date,
            'thumbnail': thumbnail_url
        })
        # keep the related items sorted
        media['relateditems'][classification].sort(
            key=operator.itemgetter('displaytext'))

        return (media, current_id)
Ejemplo n.º 23
0
	def process_object_row(object, current_id):
		"""Fold one row of related-media data into the object record.

		Reads `row`/`indices` from the enclosing scope.  When the row's
		object id differs from `current_id`, the previous object is saved
		to elasticsearch and the next one is loaded.
		"""
		id = row[indices['id_index']]
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)

		if id != current_id:
			# may have multiple rows for one object because of many related constituents
			save(object)
			current_id = id
			object = {}
			if elasticsearch_connection.item_exists(id, classification):
				object = elasticsearch_connection.get_item(id, classification)
			else:
				# print() call (not a Py2 print statement) for Python 3 compatibility
				print("%s could not be found!" % id)
				return(object, current_id)
		if 'relateditems' not in object:
			object['relateditems'] = {}

		media_type_key = int(row[indices['media_type_id_index']])
		media_type = MEDIATYPES.get(media_type_key)
		media_master_id = row[indices['media_master_id_index']]
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])
		main_url = get_media_url(row[indices['main_path_index']], row[indices['main_file_index']])
		display_text = row[indices['caption_index']]

		# this is a bit of a hack because the MediaFormats for videos (in the TMS database) does not correctly identify the type of video
		# so, make sure we are only using videos that are mp4s
		if media_type_key == 3:
			if not row[indices['main_file_index']].endswith('mp4'):
				return(object, current_id)

		if media_type not in object['relateditems']:
			object['relateditems'][media_type] = []

		if media_type == 'photos':
			object['hasphoto'] = "Yes"
		# add primary photo as a top level item as well
		if row[indices['primary_display_index']] == '1':
			object['primarydisplay'] = {
			'thumbnail' : thumbnail_url,
			'main' : main_url,
			'displaytext' : display_text
			}
		# NOTE(review): 3d-model media on a 3dmodels object is skipped here —
		# presumably to avoid a self-referencing related item; confirm intent
		if not (classification == '3dmodels' and media_type == '3dmodels'):
			object['relateditems'][media_type].append({
				'id' : media_master_id,
				'displaytext' : display_text,
				'primarydisplay' : True if row[indices['primary_display_index']] == '1' else False,
				'thumbnail' : thumbnail_url,
				'main' : main_url
				})
		return(object, current_id)
Ejemplo n.º 24
0
	def process_pub_row(pub, current_id):
		"""Fold one row of related-constituent data into the pub record.

		Reads `row`/`indices` from the enclosing scope.  When the row's
		pub id differs from `current_id`, the previous pub is saved to
		elasticsearch and the next one is loaded.
		"""
		pub_id = row[indices['pub_id_index']]

		if pub_id != current_id:
			# will likely have multiple rows for one pub because of many related constituents
			# only get a new pub if we have a new pub id, but first save old pub to elasticsearch
			save(pub)
			current_id = pub_id
			pub = {}
			if elasticsearch_connection.item_exists(pub_id, 'pubdocs'):
				pub = elasticsearch_connection.get_item(pub_id, 'pubdocs')
			else:
				# print() call (not a Py2 print statement) for Python 3 compatibility
				print("%s could not be found!" % pub_id)
				return(pub, current_id)
		if 'relateditems' not in pub:
			pub['relateditems'] = {}

		constituent_id = row[indices['constituent_id_index']]
		display_name = row[indices['display_name_index']]
		display_date = ""
		if row[indices['display_date_index']] != "NULL":
			display_date = row[indices['display_date_index']]
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])
		alpha_sort = row[indices['alpha_sort_index']]

		constituent_dict = {}
		role = row[indices['role_index']]
		# update the set of roles for this pub
		# NOTE(review): assumes the loaded pub already has 'roles'/'authors'
		# lists — presumably set by an earlier pass; verify against caller
		if role not in pub['roles']:
			pub['roles'].append(role)

		if role == "Author":
			pub["authors"].append(alpha_sort)

		description = row[indices['remarks_index']] if row[indices['remarks_index']] != "NULL" else ""
		constituent_dict['role'] = role
		constituent_dict['id'] = constituent_id
		constituent_dict['displayname'] = display_name
		constituent_dict['displaydate'] = display_date
		constituent_dict['displaytext'] = display_name
		constituent_dict['description'] = description
		constituent_dict['thumbnail'] = thumbnail_url

		constituent_type_key = int(row[indices['constituent_type_id_index']])
		constituent_type = CONSTITUENTTYPES.get(constituent_type_key)
		if constituent_type not in pub['relateditems']:
			pub['relateditems'][constituent_type] = []
		pub['relateditems'][constituent_type].append(constituent_dict)
		# keep the related items sorted
		pub['relateditems'][constituent_type].sort(key=operator.itemgetter('displaytext'))

		return(pub, current_id)
Ejemplo n.º 25
0
	def process_object_row(object, current_id):
		"""Fold one row of related-constituent data into the object record.

		Reads `row`/`indices` from the enclosing scope.  When the row's
		object id differs from `current_id`, the previous object is saved
		to elasticsearch and the next one is loaded.
		"""
		id = row[indices['id_index']]
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)

		if id != current_id:
			# may have multiple rows for one object because of many related constituents
			save(object)
			current_id = id
			object = {}
			if elasticsearch_connection.item_exists(id, classification):
				object = elasticsearch_connection.get_item(id, classification)
			else:
				# print() call (not a Py2 print statement) for Python 3 compatibility
				print("%s could not be found!" % id)
				return(object, current_id)
		if 'relateditems' not in object:
			object['relateditems'] = {}

		constituent_id = row[indices['constituent_id_index']]
		display_name = row[indices['display_name_index']]
		description = row[indices['remarks_index']] if row[indices['remarks_index']] != "NULL" else ""
		display_date = ""
		if row[indices['display_date_index']] != "NULL":
			display_date = row[indices['display_date_index']]
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

		constituent_dict = {}
		role = row[indices['role_index']]
		# update the set of roles for this object
		# NOTE(review): assumes the loaded object already has a 'roles' list —
		# presumably set by an earlier pass; verify against caller
		if role not in object['roles']:
			object['roles'].append(role)

		constituent_dict['role'] = role
		constituent_dict['roleid'] = row[indices['role_id_index']]
		constituent_dict['id'] = constituent_id
		constituent_dict['displayname'] = display_name
		constituent_dict['displaydate'] = display_date
		constituent_dict['displaytext'] = display_name
		constituent_dict['description'] = description
		constituent_dict['thumbnail'] = thumbnail_url

		constituent_type_key = int(row[indices['constituent_type_id_index']])
		constituent_type = CONSTITUENTTYPES.get(constituent_type_key)
		if constituent_type not in object['relateditems']:
			object['relateditems'][constituent_type] = []
		object['relateditems'][constituent_type].append(constituent_dict)
		# keep the related items sorted
		object['relateditems'][constituent_type].sort(key=operator.itemgetter('displaytext'))

		return(object, current_id)
Ejemplo n.º 26
0
    def process_constituent_row(constituent, current_id):
        """Fold one row of related-object data into the constituent record.

        Reads `row`/`indices` from the enclosing scope.  When the row's
        constituent id differs from `current_id`, the previous constituent
        is saved to elasticsearch and the next one is loaded.
        """
        constituent_id = row[indices["constituent_id_index"]]
        type_key = int(row[indices["type_id_index"]])
        type = CONSTITUENTTYPES.get(type_key)

        if constituent_id != current_id:
            # will likely have multiple rows for one constituent because of many related objects
            # only get a new constituent if we have a new constituent id, but first save old constituent to elasticsearch
            save(constituent)
            current_id = constituent_id
            constituent = {}
            if elasticsearch_connection.item_exists(constituent_id, type):
                constituent = elasticsearch_connection.get_item(constituent_id, type)
            else:
                # print() call (not a Py2 print statement) for Python 3 compatibility
                print("%s could not be found!" % constituent_id)
                return (constituent, current_id)

        if "relateditems" not in constituent:
            constituent["relateditems"] = {}
        classification_key = int(row[indices["classification_id_index"]])
        classification = CLASSIFICATIONS.get(classification_key)
        object_id = int(row[indices["object_id_index"]])
        thumbnail_url = get_media_url(row[indices["thumb_path_index"]], row[indices["thumb_file_index"]])

        # "null" (any case) in the export means no date
        date = "" if row[indices["object_date_index"]].lower() == "null" else row[indices["object_date_index"]]
        object_title = row[indices["object_title_index"]]
        object_number = row[indices["object_number_index"]]
        # untitled diary pages take their title from the object number's suffix
        if classification == "diarypages" and object_title.lower() == "null":
            idx = object_number.find("_")
            object_title = object_number[idx + 1 :]
        if object_title.lower() == "null":
            object_title = "[No Title]"

        if classification not in constituent["relateditems"]:
            constituent["relateditems"][classification] = []
        constituent["relateditems"][classification].append(
            {
                "id": object_id,
                "title": object_title,
                "displaytext": object_title,
                "classificationid": classification_key,
                "number": object_number,
                "date": date,
                "thumbnail": thumbnail_url,
            }
        )
        # keep the related items sorted
        constituent["relateditems"][classification].sort(key=operator.itemgetter("displaytext"))

        return (constituent, current_id)
Ejemplo n.º 27
0
    def process_site_row(site, current_id):
        """Fold one row of related-object data into the site record.

        Reads `row`/`indices` from the enclosing scope.  When the row's
        site id differs from `current_id`, the previous site is saved to
        elasticsearch and the next one is loaded.
        """
        site_id = row[indices['site_id_index']]
        #if site_id not in SAMPLE_SITES:
        #	continue
        if site_id != current_id:
            # will likely have multiple rows for one site because of many related objects
            # only get a new site if we have a new site id, but first save old site to elasticsearch
            save(site)
            current_id = site_id
            site = {}
            if elasticsearch_connection.item_exists(site_id, 'sites'):
                site = elasticsearch_connection.get_item(site_id, 'sites')
            else:
                # print() call (not a Py2 print statement) for Python 3 compatibility
                print("%s could not be found!" % site_id)
                return (site, current_id)

        if 'relateditems' not in site:
            site['relateditems'] = {}
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)
        object_id = int(row[indices['object_id_index']])
        thumbnail_url = get_media_url(row[indices['thumb_path_index']],
                                      row[indices['thumb_file_index']])

        # "null" (any case) in the export means no date
        date = "" if row[indices['object_date_index']].lower(
        ) == "null" else row[indices['object_date_index']]
        object_title = row[indices['object_title_index']]
        object_number = row[indices['object_number_index']]
        # untitled diary pages take their title from the object number's suffix
        if classification == "diarypages" and object_title.lower() == "null":
            idx = object_number.find('_')
            object_title = object_number[idx + 1:]
        if object_title.lower() == "null":
            object_title = "[No Title]"

        if classification not in site['relateditems']:
            site['relateditems'][classification] = []
        site['relateditems'][classification].append({
            'id': object_id,
            'title': object_title,
            'displaytext': object_title,
            'classificationid': classification_key,
            'number': object_number,
            'date': date,
            'thumbnail': thumbnail_url
        })
        # keep the related items sorted
        site['relateditems'][classification].sort(
            key=operator.itemgetter('displaytext'))
        return (site, current_id)
Ejemplo n.º 28
0
    def process_media_row(media, current_id):
        """Attach the current row's site data to the media record being built.

        Relies on `row`/`indices` from the enclosing scope.  Saves the
        previous media record and loads the next one whenever the row's
        media id differs from `current_id`.
        """
        media_id = row[indices['id_index']]
        type_code = int(row[indices['media_type_id_index']])
        kind = MEDIATYPES.get(type_code)

        # for now, ignore Microfilm and Document media types
        if type_code in (4, 5):
            return (media, current_id)

        if media_id != current_id:
            save(media)
            current_id = media_id
            media = {}
            if not elasticsearch_connection.item_exists(
                    media_id, kind, ELASTICSEARCH_INDEX):
                print("%s could not be found!" % media_id)
                return (media, current_id)
            media = elasticsearch_connection.get_item(
                media_id, kind, ELASTICSEARCH_INDEX)
        media.setdefault('relateditems', {})

        thumb = get_media_url(row[indices['thumb_path_index']],
                              row[indices['thumb_file_index']])
        drs = row[indices['drs_id']]
        drs = "" if drs.lower() == "null" else drs
        if not thumb and drs:
            thumb = create_thumbnail_url(drs)

        number = row[indices['site_number_index']]
        site_dict = {
            'id': row[indices['site_id_index']],
            'sitename': row[indices['site_name_index']],
            'sitenumber': number,
            'displaytext': number,
            'thumbnail': thumb,
        }

        sites = media['relateditems'].setdefault('sites', [])
        sites.append(site_dict)
        # keep the related items sorted
        sites.sort(key=operator.itemgetter('displaytext'))

        return (media, current_id)
Ejemplo n.º 29
0
	def process_site_row(site, current_id):
		"""Fold one row of related-media data into the site record.

		Reads `row`/`indices` from the enclosing scope.  When the row's
		site id differs from `current_id`, the previous site is saved to
		elasticsearch and the next one is loaded.
		"""
		site_id = row[indices['site_id_index']]
		#if site_id not in SAMPLE_SITES:
		#	continue
		if site_id != current_id:
			# will likely have multiple rows for one site because of many related photos
			# only get a new site if we have a new site id, but first save old site to elasticsearch
			save(site)
			current_id = site_id
			site = {}
			if elasticsearch_connection.item_exists(site_id, 'sites'):
				site = elasticsearch_connection.get_item(site_id, 'sites')
			else:
				# print() call (not a Py2 print statement) for Python 3 compatibility
				print("%s could not be found!" % site_id)
				return(site, current_id)
		if 'relateditems' not in site:
			site['relateditems'] = {}

		media_type_key = int(row[indices['media_type_id_index']])
		media_type = MEDIATYPES.get(media_type_key)
		media_master_id = row[indices['media_master_id_index']]
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])
		main_url = get_media_url(row[indices['main_path_index']], row[indices['main_file_index']])
		display_text = row[indices['caption_index']]

		# this is a bit of a hack because the MediaFormats for videos (in the TMS database) does not correctly identify the type of video
		# so, make sure we are only using videos that are mp4s
		if media_type_key == 3:
			if not row[indices['main_file_index']].endswith('mp4'):
				return(site, current_id)

		if media_type not in site['relateditems']:
			site['relateditems'][media_type] = []
		# add primary photo as a top level item as well
		if row[indices['primary_display_index']] == '1':
			site['primarydisplay'] = {
			'thumbnail' : thumbnail_url,
			'main' : main_url,
			'displaytext' : display_text
			}
		site['relateditems'][media_type].append({
			'id' : media_master_id,
			'displaytext' : display_text,
			'primarydisplay' : True if row[indices['primary_display_index']] == '1' else False,
			'thumbnail' : thumbnail_url,
			'main' : main_url
			})
		return(site, current_id)
Ejemplo n.º 30
0
    def process_object_row(object, current_id):
        """Fold one row of related-site data into the object record.

        Reads `row`/`indices` from the enclosing scope.  When the row's
        object id differs from `current_id`, the previous object is saved
        to elasticsearch and the next one is loaded.
        """
        id = row[indices['id_index']]
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)

        if id != current_id:
            # may have multiple rows for one object because of many related constituents
            save(object)
            current_id = id
            object = {}
            if elasticsearch_connection.item_exists(id, classification):
                object = elasticsearch_connection.get_item(id, classification)
            else:
                # print() call (not a Py2 print statement) for Python 3 compatibility
                print("%s could not be found!" % id)
                return (object, current_id)
        if 'relateditems' not in object:
            object['relateditems'] = {}

        site_id = row[indices['site_id_index']]
        site_name = row[indices['site_name_index']]
        site_number = row[indices['site_number_index']]
        thumbnail_url = get_media_url(row[indices['thumb_path_index']],
                                      row[indices['thumb_file_index']])

        site_dict = {}
        site_dict['id'] = site_id
        site_dict['sitename'] = site_name
        site_dict['sitenumber'] = site_number
        site_dict['displaytext'] = site_number
        site_dict['thumbnail'] = thumbnail_url

        if 'sites' not in object['relateditems']:
            object['relateditems']['sites'] = []
        object['relateditems']['sites'].append(site_dict)
        # keep the related items sorted
        object['relateditems']['sites'].sort(
            key=operator.itemgetter('displaytext'))

        # for unpubdocs, add sites for "Mentioned on this page"
        if classification == "unpubdocs":
            if 'mentioned' not in object:
                object['mentioned'] = {}
            if 'sites' not in object['mentioned']:
                object['mentioned']['sites'] = []
            object['mentioned']['sites'].append(site_dict)

        return (object, current_id)
Ejemplo n.º 31
0
    def process_constituent_row(constituent, current_id):
        """Fold one row of related-media data into the constituent record.

        Reads `row`/`indices` from the enclosing scope.  When the row's
        constituent id differs from `current_id`, the previous constituent
        is saved to elasticsearch and the next one is loaded.
        """
        constituent_id = row[indices["constituent_id_index"]]
        type_key = int(row[indices["type_id_index"]])
        type = CONSTITUENTTYPES.get(type_key)

        if constituent_id != current_id:
            # will likely have multiple rows for one constituent because of many related photos
            # only get a new constituent if we have a new constituent id, but first save old constituent to elasticsearch
            save(constituent)
            current_id = constituent_id
            constituent = {}
            if elasticsearch_connection.item_exists(constituent_id, type):
                constituent = elasticsearch_connection.get_item(constituent_id, type)
            else:
                # print() call (not a Py2 print statement) for Python 3 compatibility
                print("%s could not be found!" % constituent_id)
                return (constituent, current_id)
        if "relateditems" not in constituent:
            constituent["relateditems"] = {}

        media_type_key = int(row[indices["media_type_id_index"]])
        media_type = MEDIATYPES.get(media_type_key)
        media_master_id = row[indices["media_master_id_index"]]
        thumbnail_url = get_media_url(row[indices["thumb_path_index"]], row[indices["thumb_file_index"]])
        main_url = get_media_url(row[indices["main_path_index"]], row[indices["main_file_index"]])
        display_text = row[indices["caption_index"]]

        # this is a bit of a hack because the MediaFormats for videos (in the TMS database) does not correctly identify the type of video
        # so, make sure we are only using videos that are mp4s
        if media_type_key == 3:
            if not row[indices["main_file_index"]].endswith("mp4"):
                return (constituent, current_id)

        if media_type not in constituent["relateditems"]:
            constituent["relateditems"][media_type] = []
        # add primary photo as a top level item as well
        if row[indices["primary_display_index"]] == "1":
            constituent["primarydisplay"] = {"thumbnail": thumbnail_url, "main": main_url, "displaytext": display_text}
        constituent["relateditems"][media_type].append(
            {
                "id": media_master_id,
                "displaytext": display_text,
                "primarydisplay": True if row[indices["primary_display_index"]] == "1" else False,
                "thumbnail": thumbnail_url,
                "main": main_url,
            }
        )
        return (constituent, current_id)
Ejemplo n.º 32
0
	def process_object_row(object, current_id):
		"""Fold one row of related-site data into the object record.

		Reads `row`/`indices` from the enclosing scope.  When the row's
		object id differs from `current_id`, the previous object is saved
		to elasticsearch and the next one is loaded.
		"""
		id = row[indices['id_index']]
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)

		if id != current_id:
			# may have multiple rows for one object because of many related constituents
			save(object)
			current_id = id
			object = {}
			if elasticsearch_connection.item_exists(id, classification):
				object = elasticsearch_connection.get_item(id, classification)
			else:
				# print() call (not a Py2 print statement) for Python 3 compatibility
				print("%s could not be found!" % id)
				return(object, current_id)
		if 'relateditems' not in object:
			object['relateditems'] = {}

		site_id = row[indices['site_id_index']]
		site_name = row[indices['site_name_index']]
		site_number = row[indices['site_number_index']]
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

		site_dict = {}
		site_dict['id'] = site_id
		site_dict['sitename'] = site_name
		site_dict['sitenumber'] = site_number
		site_dict['displaytext'] = site_number
		site_dict['thumbnail'] = thumbnail_url

		if 'sites' not in object['relateditems']:
			object['relateditems']['sites'] = []
		object['relateditems']['sites'].append(site_dict)
		# keep the related items sorted
		object['relateditems']['sites'].sort(key=operator.itemgetter('displaytext'))

		# for unpubdocs, add sites for "Mentioned on this page"
		if classification == "unpubdocs":
			if 'mentioned' not in object:
				object['mentioned'] = {}
			if 'sites' not in object['mentioned']:
				object['mentioned']['sites'] = []
			object['mentioned']['sites'].append(site_dict)

		return(object, current_id)
Ejemplo n.º 33
0
	def process_site_row(site, current_id):
		"""Fold one row of related-object data into the site record.

		Reads `row`/`indices` from the enclosing scope.  When the row's
		site id differs from `current_id`, the previous site is saved to
		elasticsearch and the next one is loaded.
		"""
		site_id = row[indices['site_id_index']]
		#if site_id not in SAMPLE_SITES:
		#	continue
		if site_id != current_id:
			# will likely have multiple rows for one site because of many related objects
			# only get a new site if we have a new site id, but first save old site to elasticsearch
			save(site)
			current_id = site_id
			site = {}
			if elasticsearch_connection.item_exists(site_id, 'sites'):
				site = elasticsearch_connection.get_item(site_id, 'sites')
			else:
				# print() call (not a Py2 print statement) for Python 3 compatibility
				print("%s could not be found!" % site_id)
				return (site, current_id)

		if 'relateditems' not in site:
			site['relateditems'] = {}
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)
		object_id = int(row[indices['object_id_index']])
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

		# "null" (any case) in the export means no date
		date = "" if row[indices['object_date_index']].lower() == "null" else row[indices['object_date_index']]
		object_title = row[indices['object_title_index']]
		object_number = row[indices['object_number_index']]
		# untitled diary pages take their title from the object number's suffix
		if classification == "diarypages" and object_title.lower() == "null":
			idx = object_number.find('_')
			object_title = object_number[idx+1:]
		if object_title.lower() == "null":
			object_title = "[No Title]"

		if classification not in site['relateditems']:
			site['relateditems'][classification] = []
		site['relateditems'][classification].append({
			'id' : object_id,
			'title' : object_title,
			'displaytext' : object_title,
			'classificationid' : classification_key,
			'number' : object_number,
			'date' : date,
			'thumbnail' : thumbnail_url})
		# keep the related items sorted
		site['relateditems'][classification].sort(key=operator.itemgetter('displaytext'))
		return (site, current_id)
Ejemplo n.º 34
0
    def process_pub_row(pub, current_id):
        """Attach the current row's site data to the pub record being built.

        Relies on `row`/`indices` from the enclosing scope.  Saves the
        previous pub and loads the next one whenever the row's pub id
        differs from `current_id`.
        """
        pub_id = row[indices['id_index']]

        if pub_id != current_id:
            save(pub)
            current_id = pub_id
            pub = {}
            if not elasticsearch_connection.item_exists(
                    pub_id, 'pubdocs', ELASTICSEARCH_INDEX):
                print("%s could not be found!" % pub_id)
                return (pub, current_id)
            pub = elasticsearch_connection.get_item(
                pub_id, 'pubdocs', ELASTICSEARCH_INDEX)
        pub.setdefault('relateditems', {})

        thumb = get_media_url(row[indices['thumb_path_index']],
                              row[indices['thumb_file_index']])
        drs = row[indices['drs_id']]
        drs = "" if drs.lower() == "null" else drs
        if not thumb and drs:
            thumb = create_thumbnail_url(drs)

        number = row[indices['site_number_index']]
        site_dict = {
            'id': row[indices['site_id_index']],
            'sitename': row[indices['site_name_index']],
            'sitenumber': number,
            'displaytext': number,
            'thumbnail': thumb,
            'has_manifest': drs != "",
        }

        sites = pub['relateditems'].setdefault('sites', [])
        sites.append(site_dict)
        # keep the related items sorted
        sites.sort(key=operator.itemgetter('displaytext'))

        return (pub, current_id)
Ejemplo n.º 35
0
    def process_media_row(media, current_id):
        """Fold one row of related-pubdoc data into the media record.

        Reads `row`/`indices` from the enclosing scope.  When the row's
        media id differs from `current_id`, the previous media record is
        saved to elasticsearch and the next one is loaded.
        """
        id = row[indices['id_index']]
        media_type_key = int(row[indices['media_type_id_index']])
        media_type = MEDIATYPES.get(media_type_key)

        # for now, ignore Microfilm and Document media types
        if (media_type_key in [4, 5]):
            return (media, current_id)

        if id != current_id:
            save(media)
            current_id = id
            media = {}
            if elasticsearch_connection.item_exists(id, media_type,
                                                    ELASTICSEARCH_INDEX):
                media = elasticsearch_connection.get_item(
                    id, media_type, ELASTICSEARCH_INDEX)
            else:
                print("%s could not be found!" % id)
                return (media, current_id)
        if 'relateditems' not in media:
            media['relateditems'] = {}

        # (removed unused `title` local; the pubdoc entry uses boiler_text)
        reference_id = row[indices['reference_id_index']]
        boiler_text = row[indices['boiler_text_index']]
        date = row[indices['date_index']]
        main_url = get_media_url(row[indices['path_index']],
                                 row[indices['file_index']])

        if 'pubdocs' not in media['relateditems']:
            media['relateditems']['pubdocs'] = []
        media['relateditems']['pubdocs'].append({
            'id': reference_id,
            'boilertext': boiler_text,
            'displaytext': boiler_text,
            'date': date,
            'url': main_url
        })
        # keep the related items sorted
        media['relateditems']['pubdocs'].sort(
            key=operator.itemgetter('displaytext'))

        return (media, current_id)
Ejemplo n.º 36
0
	def process_pub_row(pub, current_id):
		"""Fold one row of related-object data into the pub record.

		Reads `row`/`indices` from the enclosing scope.  When the row's
		pub id differs from `current_id`, the previous pub is saved to
		elasticsearch and the next one is loaded.
		"""
		pub_id = row[indices['pub_id_index']]

		if pub_id != current_id:
			save(pub)
			current_id = pub_id
			pub = {}
			if elasticsearch_connection.item_exists(pub_id, 'pubdocs'):
				pub = elasticsearch_connection.get_item(pub_id, 'pubdocs')
			else:
				# print() call (not a Py2 print statement) for Python 3 compatibility
				print("%s could not be found!" % pub_id)
				return (pub, current_id)

		if 'relateditems' not in pub:
			pub['relateditems'] = {}
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)
		object_id = int(row[indices['object_id_index']])
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

		# "null" (any case) in the export means no date
		date = "" if row[indices['object_date_index']].lower() == "null" else row[indices['object_date_index']]
		object_title = row[indices['object_title_index']]
		object_number = row[indices['object_number_index']]
		# untitled diary pages take their title from the object number's suffix
		if classification == "diarypages" and object_title.lower() == "null":
			idx = object_number.find('_')
			object_title = object_number[idx+1:]
		if object_title.lower() == "null":
			object_title = "[No Title]"

		if classification not in pub['relateditems']:
			pub['relateditems'][classification] = []
		pub['relateditems'][classification].append({
			'id' : object_id,
			'title' : object_title,
			'displaytext' : object_title,
			'classificationid' : classification_key,
			'number' : object_number,
			'date' : date,
			'thumbnail' : thumbnail_url})
		# keep the related items sorted
		pub['relateditems'][classification].sort(key=operator.itemgetter('displaytext'))

		return (pub, current_id)
Ejemplo n.º 37
0
	def process_constituent_row(constituent, current_id):
		"""Attach the current row's site data to the constituent record.

		Relies on `row`/`indices` from the enclosing scope.  Saves the
		previous constituent and loads the next one whenever the row's
		constituent id differs from `current_id`.
		"""
		constituent_id = row[indices['constituent_id_index']]
		ctype = CONSTITUENTTYPES.get(int(row[indices['type_id_index']]))

		if constituent_id != current_id:
			# rows repeat per constituent (one per related site): persist the
			# finished record, then load the next one from elasticsearch
			save(constituent)
			current_id = constituent_id
			constituent = {}
			if not elasticsearch_connection.item_exists(constituent_id, ctype, ELASTICSEARCH_INDEX):
				print("%s could not be found!" % constituent_id)
				return (constituent, current_id)
			constituent = elasticsearch_connection.get_item(constituent_id, ctype, ELASTICSEARCH_INDEX)
		constituent.setdefault('relateditems', {})

		drs = row[indices['drs_id']]
		drs = "" if drs.lower() == "null" else drs
		thumb = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])
		if not thumb and drs:
			thumb = create_thumbnail_url(drs)

		name = row[indices['site_name_index']]
		number = row[indices['site_number_index']]
		site_dict = {
			'id': row[indices['site_id_index']],
			'sitename': name,
			'sitenumber': number,
			'displaytext': "%s, %s" % (name, number),
			'thumbnail': thumb,
			'has_manifest': drs != "",
		}

		sites = constituent['relateditems'].setdefault('sites', [])
		sites.append(site_dict)
		# keep the related items sorted
		sites.sort(key=operator.itemgetter('displaytext'))

		return (constituent, current_id)
Ejemplo n.º 38
0
    def process_site_row(site, current_id):
        """Merge one published-document row into the site document in progress.

        Uses ``row``/``indices`` from the enclosing scope.  Flushes the
        previous site to Elasticsearch whenever a new site id appears.

        Returns a (site, current_id) tuple for the caller's loop state.
        """
        site_id = row[indices['site_id_index']]
        #if site_id not in SAMPLE_SITES:
        #	continue
        if site_id != current_id:
            # Rows come grouped by site id: persist the finished site, then
            # load the next one from the index.
            save(site)
            current_id = site_id
            if not elasticsearch_connection.item_exists(site_id, 'sites', ELASTICSEARCH_INDEX):
                print("%s could not be found!" % site_id)
                return ({}, current_id)
            site = elasticsearch_connection.get_item(site_id, 'sites', ELASTICSEARCH_INDEX)
        site.setdefault('relateditems', {})

        # read in the same order as the original row layout
        reference_id = row[indices['reference_id_index']]
        title = row[indices['title_index']]
        boiler_text = row[indices['boiler_text_index']]
        date = row[indices['date_index']]
        main_url = get_media_url(row[indices['path_index']], row[indices['file_index']])
        thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

        pubdocs = site['relateditems'].setdefault("pubdocs", [])
        pubdocs.append({
            'id': reference_id,
            'boilertext': boiler_text,
            'displaytext': boiler_text,
            'date': date,
            'url': main_url,
            'thumbnail': thumbnail_url
        })
        # keep the related items sorted
        pubdocs.sort(key=operator.itemgetter('displaytext'))
        return (site, current_id)
Ejemplo n.º 39
0
    def process_constituent_row(constituent, current_id):
        """Merge one published-document row into the constituent document.

        Reads the current ``row`` and column ``indices`` from the enclosing
        scope.  When the constituent id changes, the finished document is saved
        and the next one is loaded from Elasticsearch.

        Returns:
            (constituent, current_id) tuple for the caller's loop state.
        """
        constituent_id = row[indices['constituent_id_index']]
        type_key = int(row[indices['type_id_index']])
        # renamed from `type` to avoid shadowing the builtin
        constituent_type = CONSTITUENTTYPES.get(type_key)

        if constituent_id != current_id:
            # will likely have multiple rows for one constituent because of many related published
            # only get a new constituent if we have a new constituent id, but first save old constituent to elasticsearch
            save(constituent)
            current_id = constituent_id
            constituent = {}
            if elasticsearch_connection.item_exists(constituent_id, constituent_type):
                constituent = elasticsearch_connection.get_item(constituent_id, constituent_type)
            else:
                # print() call form is valid on both Python 2 and 3
                print("%s could not be found!" % constituent_id)
                return (constituent, current_id)
        if 'relateditems' not in constituent:
            constituent['relateditems'] = {}

        reference_id = row[indices['reference_id_index']]
        title = row[indices['title_index']]
        boiler_text = row[indices['boiler_text_index']]
        date = "" if row[indices['date_index']].lower() == "null" else row[indices['date_index']]
        main_url = get_media_url(row[indices['path_index']], row[indices['file_index']])

        if "pubdocs" not in constituent['relateditems']:
            constituent['relateditems']["pubdocs"] = []
        constituent['relateditems']["pubdocs"].append({
            'id': reference_id,
            'boilertext': boiler_text,
            'displaytext': title,
            'date': date,
            'url': main_url
        })
        # keep the related items sorted
        constituent['relateditems']['pubdocs'].sort(key=operator.itemgetter('displaytext'))
        return (constituent, current_id)
Ejemplo n.º 40
0
	def process_object_row(object, current_id):
		"""Attach one related-site row to the object document.

		Reads ``row`` and the bare ``*_index`` offsets from the enclosing
		scope.  Saves the previous object and loads the next one from
		Elasticsearch whenever the object id changes.

		Returns:
			(object, current_id) tuple for the caller's loop state.
		"""
		# renamed from `id` to avoid shadowing the builtin
		object_id = row[id_index]
		classification_key = int(row[classification_id_index])
		classification = CLASSIFICATIONS.get(classification_key)

		if object_id != current_id:
			# may have multiple rows for one object because of many related constituents
			save(object)
			current_id = object_id
			object = {}
			if elasticsearch_connection.item_exists(object_id, classification):
				object = elasticsearch_connection.get_item(object_id, classification)
			else:
				# print() call form is valid on both Python 2 and 3
				print("%s could not be found!" % object_id)
				return (object, current_id)
		if 'relateditems' not in object:
			object['relateditems'] = {}

		site_name = row[site_name_index]
		site_number = row[site_number_index]
		site_dict = {
			'id': row[site_id_index],
			'sitename': site_name,
			'sitenumber': site_number,
			'displaytext': "%s, %s" % (site_name, site_number),
		}

		if 'sites' not in object['relateditems']:
			object['relateditems']['sites'] = []
		object['relateditems']['sites'].append(site_dict)

		# for unpubdocs, add sites for "Mentioned on this page"
		if classification == "unpubdocs":
			if 'mentioned' not in object:
				object['mentioned'] = {}
			if 'sites' not in object['mentioned']:
				object['mentioned']['sites'] = []
			object['mentioned']['sites'].append(site_dict)

		return (object, current_id)
Ejemplo n.º 41
0
    def process_object_row(object, current_id):
        """Attach one related unpublished-document row to the object document.

        Reads ``row``/``indices`` from the enclosing scope.  Saves the previous
        object and loads the next one from Elasticsearch when the id changes.

        Returns:
            (object, current_id) tuple for the caller's loop state.
        """
        # renamed from `id` to avoid shadowing the builtin
        object_id = row[indices['id_index']]
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)

        if object_id != current_id:
            # may have multiple rows for one object because of many related constituents
            save(object)
            current_id = object_id
            object = {}
            if elasticsearch_connection.item_exists(object_id, classification):
                object = elasticsearch_connection.get_item(object_id, classification)
            else:
                # print() call form is valid on both Python 2 and 3
                print("%s could not be found!" % object_id)
                return (object, current_id)
        if 'relateditems' not in object:
            object['relateditems'] = {}

        unpublished_id = row[indices['unpublished_id_index']]
        unpublished_title = row[indices['unpublished_title_index']]
        number = row[indices['object_number_index']]
        # "NULL" (any case) in the date column means no date
        date = "" if row[indices['object_date_index']].lower() == "null" else row[indices['object_date_index']]
        thumbnail_url = get_media_url(row[indices['thumb_path_index']],
                                      row[indices['thumb_file_index']])

        if 'unpubdocs' not in object['relateditems']:
            object['relateditems']['unpubdocs'] = []
        object['relateditems']['unpubdocs'].append({
            'id': unpublished_id,
            'text': unpublished_title,
            'displaytext': unpublished_title,
            'date': date,
            'number': number,
            'thumbnail': thumbnail_url
        })
        # keep the related items sorted
        object['relateditems']['unpubdocs'].sort(key=operator.itemgetter('displaytext'))

        return (object, current_id)
Ejemplo n.º 42
0
	def process_site_row(site, current_id):
		"""Attach one related-constituent row to the site document.

		Reads ``row`` and the bare ``*_index`` offsets from the enclosing
		scope.  Saves the previous site and loads the next one from
		Elasticsearch when the site id changes.

		Returns:
			(site, current_id) tuple for the caller's loop state.
		"""
		site_id = row[site_id_index]
		#if site_id not in SAMPLE_SITES:
		#	continue
		if site_id != current_id:
			# will likely have multiple rows for one site because of many related constituents
			# only get a new site if we have a new site id, but first save old site to elasticsearch
			save(site)
			current_id = site_id
			site = {}
			if elasticsearch_connection.item_exists(site_id, 'sites'):
				site = elasticsearch_connection.get_item(site_id, 'sites')
			else:
				# print() call form is valid on both Python 2 and 3
				print("%s could not be found!" % site_id)
				return (site, current_id)
		if 'relateditems' not in site:
			site['relateditems'] = {}

		constituent_id = row[constituent_id_index]
		display_name = row[display_name_index]
		display_date = ""
		if row[display_date_index] != "NULL":
			display_date = row[display_date_index]

		constituent_dict = {}
		role = row[role_index]
		constituent_dict['role'] = role
		constituent_dict['id'] = constituent_id
		constituent_dict['displayname'] = display_name
		constituent_dict['displaydate'] = display_date
		constituent_dict['displaytext'] = display_name

		constituent_type_key = int(row[constituent_type_id_index])
		constituent_type = CONSTITUENTTYPES.get(constituent_type_key)
		if constituent_type not in site['relateditems']:
			site['relateditems'][constituent_type] = []
		site['relateditems'][constituent_type].append(constituent_dict)
		if role == 'Tomb Owner':
			site['tombowner'] = True
		return (site, current_id)
Ejemplo n.º 43
0
    def process_media_row(manifest, current_id):
        """Append one constituent credit to a photo manifest's metadata list.

        Uses ``row``/``indices`` from the enclosing scope.  Rows whose media
        type is not 1 (photo) are ignored.  Returns (manifest, current_id).
        """
        media_id = row[indices['id_index']]
        media_type_key = int(row[indices['media_type_id_index']])
        manifest_id = MEDIATYPES.get(media_type_key) + '-' + media_id

        # ignore non-photos, if they somehow managed to show up in the query results
        if media_type_key != 1:
            return (manifest, current_id)

        if manifest_id != current_id:
            # new manifest id: persist the finished manifest, load the next
            save(manifest)
            current_id = manifest_id
            if not elasticsearch_connection.item_exists(manifest_id, 'manifest', ELASTICSEARCH_INDEX):
                print("%s could not be found!" % manifest_id)
                return ({}, current_id)
            manifest = elasticsearch_connection.get_item(manifest_id, 'manifest', ELASTICSEARCH_INDEX)

        def _cleaned(index_key):
            # "NULL" (any case) marks a missing value in these columns
            raw = row[indices[index_key]]
            return "" if raw.lower() == "null" else raw

        role = _cleaned('role_index')
        display_name = _cleaned('display_name_index')
        display_date = _cleaned('display_date_index')

        label_value = ("%s, %s" % (display_name, display_date)) if display_date else display_name
        manifest['manifest']['metadata'].append({
            'label': role,
            'value': label_value
        })

        return (manifest, current_id)
Ejemplo n.º 44
0
    def process_object_row(object, current_id):
        """Attach one related published-document row to the object document.

        Reads ``row``/``indices`` from the enclosing scope.  Saves the previous
        object and loads the next one from Elasticsearch when the id changes.

        Returns:
            (object, current_id) tuple for the caller's loop state.
        """
        # renamed from `id` to avoid shadowing the builtin
        object_id = row[indices['id_index']]
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)

        if object_id != current_id:
            # may have multiple rows for one object because of many related constituents
            save(object)
            current_id = object_id
            object = {}
            if elasticsearch_connection.item_exists(object_id, classification):
                object = elasticsearch_connection.get_item(object_id, classification)
            else:
                # print() call form is valid on both Python 2 and 3
                print("%s could not be found!" % object_id)
                return (object, current_id)
        if 'relateditems' not in object:
            object['relateditems'] = {}

        reference_id = row[indices['reference_id_index']]
        # NOTE: the title column is not used here; displaytext is the boiler text
        boiler_text = row[indices['boiler_text_index']]
        date = row[indices['date_index']]
        main_url = get_media_url(row[indices['path_index']],
                                 row[indices['file_index']])

        if 'pubdocs' not in object['relateditems']:
            object['relateditems']['pubdocs'] = []
        object['relateditems']['pubdocs'].append({
            'id': reference_id,
            'boilertext': boiler_text,
            'displaytext': boiler_text,
            'date': date,
            'url': main_url
        })
        # keep the related items sorted
        object['relateditems']['pubdocs'].sort(key=operator.itemgetter('displaytext'))

        return (object, current_id)
Ejemplo n.º 45
0
    def process_constituent_row(constituent, current_id):
        """Attach one related-site row to the constituent document.

        Reads ``row``/``indices`` from the enclosing scope.  Saves the previous
        constituent and loads the next one from Elasticsearch when the
        constituent id changes.

        Returns:
            (constituent, current_id) tuple for the caller's loop state.
        """
        constituent_id = row[indices["constituent_id_index"]]
        type_key = int(row[indices["type_id_index"]])
        # renamed from `type` to avoid shadowing the builtin
        constituent_type = CONSTITUENTTYPES.get(type_key)

        if constituent_id != current_id:
            # will likely have multiple rows for one constituent because of many related constituents
            # only get a new constituent if we have a new constituent id, but first save old constituent to elasticsearch
            save(constituent)
            current_id = constituent_id
            constituent = {}
            if elasticsearch_connection.item_exists(constituent_id, constituent_type):
                constituent = elasticsearch_connection.get_item(constituent_id, constituent_type)
            else:
                # print() call form is valid on both Python 2 and 3
                print("%s could not be found!" % constituent_id)
                return (constituent, current_id)
        if "relateditems" not in constituent:
            constituent["relateditems"] = {}

        site_id = row[indices["site_id_index"]]
        site_name = row[indices["site_name_index"]]
        site_number = row[indices["site_number_index"]]
        thumbnail_url = get_media_url(row[indices["thumb_path_index"]], row[indices["thumb_file_index"]])

        site_dict = {}
        site_dict["id"] = site_id
        site_dict["sitename"] = site_name
        site_dict["sitenumber"] = site_number
        site_dict["displaytext"] = "%s, %s" % (site_name, site_number)
        site_dict["thumbnail"] = thumbnail_url

        if "sites" not in constituent["relateditems"]:
            constituent["relateditems"]["sites"] = []
        constituent["relateditems"]["sites"].append(site_dict)
        # keep the related items sorted
        constituent["relateditems"]["sites"].sort(key=operator.itemgetter("displaytext"))

        return (constituent, current_id)
Ejemplo n.º 46
0
	def process_site_row(site, current_id):
		"""Attach one related-object row to the site document.

		Reads ``row`` and the bare ``*_index`` offsets from the enclosing
		scope.  Saves the previous site and loads the next one from
		Elasticsearch when the site id changes.

		Returns:
			(site, current_id) tuple for the caller's loop state.
		"""
		site_id = row[site_id_index]
		#if site_id not in SAMPLE_SITES:
		#	continue
		if site_id != current_id:
			# will likely have multiple rows for one site because of many related objects
			# only get a new site if we have a new site id, but first save old site to elasticsearch
			save(site)
			current_id = site_id
			site = {}
			if elasticsearch_connection.item_exists(site_id, 'sites'):
				site = elasticsearch_connection.get_item(site_id, 'sites')
			else:
				# print() call form is valid on both Python 2 and 3
				print("%s could not be found!" % site_id)
				return (site, current_id)

		if 'relateditems' not in site:
			site['relateditems'] = {}
		classification_key = int(row[classification_id_index])
		classification = CLASSIFICATIONS.get(classification_key)
		object_id = int(row[object_id_index])

		object_title = row[object_title_index]
		# diary pages have no title of their own; derive one from the part of
		# the object number after the first underscore
		if classification == "diarypages" and object_title.lower() == "null":
			object_number = row[object_number_index]
			idx = object_number.find('_')
			object_title = object_number[idx+1:]
		if object_title.lower() == "null":
			object_title = "[No Title]"

		if classification not in site['relateditems']:
			site['relateditems'][classification] = []
		site['relateditems'][classification].append({
			'id' : object_id,
			'title' : object_title,
			'displaytext' : object_title,
			'classificationid' : classification_key})
		return (site, current_id)
Ejemplo n.º 47
0
    def process_constituent_row(constituent, current_id):
        """Attach one alternate-name row to the constituent document.

        Reads ``row``/``indices`` from the enclosing scope.  Saves the previous
        constituent and loads the next one from Elasticsearch when the
        constituent id changes.

        Returns:
            (constituent, current_id) tuple for the caller's loop state.
        """
        constituent_id = row[indices["constituent_id_index"]]
        type_key = int(row[indices["type_id_index"]])
        # renamed from `type` to avoid shadowing the builtin
        constituent_type = CONSTITUENTTYPES.get(type_key)

        if constituent_id != current_id:
            # will likely have multiple rows for one constituent because of many related objects
            # only get a new constituent if we have a new constituent id, but first save old constituent to elasticsearch
            save(constituent)
            current_id = constituent_id
            constituent = {}
            if elasticsearch_connection.item_exists(constituent_id, constituent_type):
                constituent = elasticsearch_connection.get_item(constituent_id, constituent_type)
            else:
                # print() call form is valid on both Python 2 and 3
                print("%s could not be found!" % constituent_id)
                return (constituent, current_id)

        if "altnames" not in constituent:
            constituent["altnames"] = []
        altname = row[indices["altname_index"]]
        name_type = row[indices["name_type_index"]]
        constituent["altnames"].append({"name": altname, "type": name_type})
        return (constituent, current_id)
Ejemplo n.º 48
0
	def process_pub_row(pub, current_id):
		"""Set the primary display images and PDF link on a pubdoc document.

		Reads ``row``/``indices`` from the enclosing scope.  Saves the previous
		pub and loads the next one from Elasticsearch when the pub id changes.

		Returns:
			(pub, current_id) tuple for the caller's loop state.
		"""
		pub_id = row[indices['pub_id_index']]

		if pub_id != current_id:
			save(pub)
			current_id = pub_id
			pub = {}
			if elasticsearch_connection.item_exists(pub_id, 'pubdocs'):
				pub = elasticsearch_connection.get_item(pub_id, 'pubdocs')
			else:
				# print() call form is valid on both Python 2 and 3
				print("%s could not be found!" % pub_id)
				return (pub, current_id)

		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])
		main_url = get_media_url(row[indices['main_path_index']], row[indices['main_file_index']])

		# NOTE(review): 'main' reuses the thumbnail rather than main_url;
		# presumably intentional because the main asset is the PDF — confirm.
		pub['primarydisplay'] = {
		'thumbnail' : thumbnail_url,
		'main' : thumbnail_url
		}
		pub['pdf'] = main_url

		return (pub, current_id)
Ejemplo n.º 49
0
	def process_object_row(object, current_id):
		"""Attach one related unpublished-document row to the object document.

		Reads ``row``/``indices`` from the enclosing scope.  Saves the previous
		object and loads the next one from Elasticsearch when the id changes.

		Returns:
			(object, current_id) tuple for the caller's loop state.
		"""
		# renamed from `id` to avoid shadowing the builtin
		object_id = row[indices['id_index']]
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)

		if object_id != current_id:
			# may have multiple rows for one object because of many related constituents
			save(object)
			current_id = object_id
			object = {}
			if elasticsearch_connection.item_exists(object_id, classification):
				object = elasticsearch_connection.get_item(object_id, classification)
			else:
				# print() call form is valid on both Python 2 and 3
				print("%s could not be found!" % object_id)
				return (object, current_id)
		if 'relateditems' not in object:
			object['relateditems'] = {}

		unpublished_id = row[indices['unpublished_id_index']]
		unpublished_title = row[indices['unpublished_title_index']]
		number = row[indices['object_number_index']]
		# "NULL" (any case) in the date column means no date
		date = "" if row[indices['object_date_index']].lower() == "null" else row[indices['object_date_index']]
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

		if 'unpubdocs' not in object['relateditems']:
			object['relateditems']['unpubdocs'] = []
		object['relateditems']['unpubdocs'].append({
			'id' : unpublished_id,
			'text' : unpublished_title,
			'displaytext' : unpublished_title,
			'date' : date,
			'number' : number,
			'thumbnail' : thumbnail_url})
		# keep the related items sorted
		object['relateditems']['unpubdocs'].sort(key=operator.itemgetter('displaytext'))

		return (object, current_id)
Ejemplo n.º 50
0
	def process_site_row(site, current_id):
		"""Attach one related published-document row to the site document.

		Reads ``row``/``indices`` from the enclosing scope.  Saves the previous
		site and loads the next one from Elasticsearch when the site id changes.

		Returns:
			(site, current_id) tuple for the caller's loop state.
		"""
		site_id = row[indices['site_id_index']]
		#if site_id not in SAMPLE_SITES:
		#	continue
		if site_id != current_id:
			# will likely have multiple rows for one site because of many related published
			# only get a new site if we have a new site id, but first save old site to elasticsearch
			save(site)
			current_id = site_id
			site = {}
			if elasticsearch_connection.item_exists(site_id, 'sites'):
				site = elasticsearch_connection.get_item(site_id, 'sites')
			else:
				# print() call form is valid on both Python 2 and 3
				print("%s could not be found!" % site_id)
				return (site, current_id)
		if 'relateditems' not in site:
			site['relateditems'] = {}

		reference_id = row[indices['reference_id_index']]
		title = row[indices['title_index']]
		boiler_text = row[indices['boiler_text_index']]
		date = row[indices['date_index']]
		main_url = get_media_url(row[indices['path_index']], row[indices['file_index']])
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

		if "pubdocs" not in site['relateditems']:
			site['relateditems']["pubdocs"] = []
		site['relateditems']["pubdocs"].append({
			'id' : reference_id,
			'boilertext' : boiler_text,
			'displaytext' : title,
			'date' : date,
			'url' : main_url,
			'thumbnail' : thumbnail_url})
		# keep the related items sorted
		site['relateditems']['pubdocs'].sort(key=operator.itemgetter('displaytext'))
		return (site, current_id)
Ejemplo n.º 51
0
    def process_pub_row(pub, current_id):
        """Attach one related-site row to the pubdoc document.

        Reads ``row``/``indices`` from the enclosing scope.  Saves the previous
        pub and loads the next one from Elasticsearch when the id changes.

        Returns:
            (pub, current_id) tuple for the caller's loop state.
        """
        # renamed from `id` to avoid shadowing the builtin
        pub_id = row[indices['id_index']]

        if pub_id != current_id:
            save(pub)
            current_id = pub_id
            pub = {}
            if elasticsearch_connection.item_exists(pub_id, 'pubdocs'):
                pub = elasticsearch_connection.get_item(pub_id, 'pubdocs')
            else:
                # print() call form is valid on both Python 2 and 3
                print("%s could not be found!" % pub_id)
                return (pub, current_id)
        if 'relateditems' not in pub:
            pub['relateditems'] = {}

        site_id = row[indices['site_id_index']]
        site_name = row[indices['site_name_index']]
        site_number = row[indices['site_number_index']]
        thumbnail_url = get_media_url(row[indices['thumb_path_index']],
                                      row[indices['thumb_file_index']])

        site_dict = {}
        site_dict['id'] = site_id
        site_dict['sitename'] = site_name
        site_dict['sitenumber'] = site_number
        # pubs display related sites by their site number
        site_dict['displaytext'] = site_number
        site_dict['thumbnail'] = thumbnail_url

        if 'sites' not in pub['relateditems']:
            pub['relateditems']['sites'] = []
        pub['relateditems']['sites'].append(site_dict)
        # keep the related items sorted
        pub['relateditems']['sites'].sort(key=operator.itemgetter('displaytext'))

        return (pub, current_id)
Ejemplo n.º 52
0
    def process_object_row(object, current_id):
        """Set the geocode fields on the object document from the current row.

        Reads ``row``/``indices`` from the enclosing scope.  Saves the previous
        object and loads the next one from Elasticsearch when the id changes.

        Returns:
            (object, current_id) tuple for the caller's loop state.
        """
        # renamed from `id` to avoid shadowing the builtin
        object_id = row[indices['id_index']]
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)

        if object_id != current_id:
            # may have multiple rows for one object because of many related constituents
            save(object)
            current_id = object_id
            object = {}
            if elasticsearch_connection.item_exists(object_id, classification):
                object = elasticsearch_connection.get_item(object_id, classification)
            else:
                # print() call form is valid on both Python 2 and 3
                print("%s could not be found!" % object_id)
                return (object, current_id)

        # last geocode row wins; objects carry a single geocode dict
        geocode_dict = {}
        geocode_dict['id'] = row[indices['geo_code_id_index']]
        geocode_dict['geocode'] = row[indices['geo_code_index']]
        geocode_dict['region'] = row[indices['region_index']]
        geocode_dict['city'] = row[indices['city_index']]
        object['geocode'] = geocode_dict

        return (object, current_id)
Ejemplo n.º 53
0
	def process_object_row(object, current_id):
		"""Set the geocode fields on the object document from the current row.

		Reads ``row``/``indices`` from the enclosing scope.  Saves the previous
		object and loads the next one from Elasticsearch when the id changes.

		Returns:
			(object, current_id) tuple for the caller's loop state.
		"""
		# renamed from `id` to avoid shadowing the builtin
		object_id = row[indices['id_index']]
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)

		if object_id != current_id:
			# may have multiple rows for one object because of many related constituents
			save(object)
			current_id = object_id
			object = {}
			if elasticsearch_connection.item_exists(object_id, classification):
				object = elasticsearch_connection.get_item(object_id, classification)
			else:
				# print() call form is valid on both Python 2 and 3
				print("%s could not be found!" % object_id)
				return (object, current_id)

		# last geocode row wins; objects carry a single geocode dict
		geocode_dict = {}
		geocode_dict['id'] = row[indices['geo_code_id_index']]
		geocode_dict['geocode'] = row[indices['geo_code_index']]
		geocode_dict['region'] = row[indices['region_index']]
		geocode_dict['city'] = row[indices['city_index']]
		object['geocode'] = geocode_dict

		return (object, current_id)
Ejemplo n.º 54
0
	def process_object_row(object, current_id):
		"""Attach one related published-document row to the object document.

		Reads ``row``/``indices`` from the enclosing scope.  Saves the previous
		object and loads the next one from Elasticsearch when the id changes.

		Returns:
			(object, current_id) tuple for the caller's loop state.
		"""
		# renamed from `id` to avoid shadowing the builtin
		object_id = row[indices['id_index']]
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)

		if object_id != current_id:
			# may have multiple rows for one object because of many related constituents
			save(object)
			current_id = object_id
			object = {}
			if elasticsearch_connection.item_exists(object_id, classification):
				object = elasticsearch_connection.get_item(object_id, classification)
			else:
				# print() call form is valid on both Python 2 and 3
				print("%s could not be found!" % object_id)
				return (object, current_id)
		if 'relateditems' not in object:
			object['relateditems'] = {}

		reference_id = row[indices['reference_id_index']]
		# NOTE: the title column is not used here; displaytext is the boiler text
		title = row[indices['title_index']]
		boiler_text = row[indices['boiler_text_index']]
		date = row[indices['date_index']]
		main_url = get_media_url(row[indices['path_index']], row[indices['file_index']])

		if 'pubdocs' not in object['relateditems']:
			object['relateditems']['pubdocs'] = []
		object['relateditems']['pubdocs'].append({
			'id' : reference_id,
			'boilertext' : boiler_text,
			'displaytext' : boiler_text,
			'date' : date,
			'url' : main_url})
		# keep the related items sorted
		object['relateditems']['pubdocs'].sort(key=operator.itemgetter('displaytext'))

		return (object, current_id)
Ejemplo n.º 55
0
def create_library():
	"""Rebuild the 'library' doc type from every pubdoc, grouped by author.

	Deletes all existing library entries, then pages through the pubdocs in
	batches of ``size`` and, for each document that has a PDF, appends it to a
	per-author library record (creating the record on first sight).  Docs with
	no authors are filed under 'No Author'.
	"""
	print("Creating Digital Library...")

	author_ids = []
	size = 20
	results_from = 0
	es = elasticsearch_connection.get_connection()
	es_index = elasticsearch_connection.ELASTICSEARCH_INDEX

	# delete library
	results = es.search(index=es_index, doc_type='library', body={
		"size" : 500,
		"fields" : ["_id", "name"],
		"query": {
			"match_all" : {}
		}
	})['hits']['hits']
	for r in results:
		elasticsearch_connection.delete(r['_id'], 'library')

	total = es.search(index=es_index, doc_type='pubdocs', body={
		"size" : 0,
		"query": {
			"match_all" : {}
		}
	})['hits']['total']

	while results_from < total:
		results = es.search(index=es_index, doc_type='pubdocs', body={
			"size" : size,
			"from" : results_from,
			"query": {
				"match_all" : {}
			}
		})
		for r in results['hits']['hits']:
			result = r['_source']
			# only docs with an actual PDF belong in the library
			if 'pdf' not in result or result['pdf'] == '':
				continue
			authors = result['authors']

			# if this doc has no authors, set the author to 'No Author' and proceed
			if len(authors) == 0:
				authors.append('No Author')

			for author in authors:
				# author id is the author name with spaces stripped out
				author_id = author.replace(' ', '')
				# see if this author already exists
				if author_id in author_ids:
					author_data = elasticsearch_connection.get_item(author_id, 'library')
				else:
					author_ids.append(author_id)
					author_data = {}
					author_data['name'] = author
					author_data['docs'] = []

				author_data['docs'].append({
					'displaytext' : result['boilertext'],
					'format' : result['format'],
					# add file size
					'url' : result['pdf']
				})
				author_data['docs'].sort(key=operator.itemgetter('displaytext'))

				data = json.dumps(author_data)
				elasticsearch_connection.add_or_update_item(author_id, data, 'library')

		results_from = results_from + size
	print("Finished Digital Library...")
Ejemplo n.º 56
0
    def process_object_row(object, current_id):
        """Attach one related-media row (photo/video/3D model) to the object.

        Reads ``row``/``indices`` from the enclosing scope.  Saves the previous
        object and loads the next one from Elasticsearch when the id changes.
        Also records the primary display media on the object itself.

        Returns:
            (object, current_id) tuple for the caller's loop state.
        """
        # renamed from `id` to avoid shadowing the builtin
        object_id = row[indices['id_index']]
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)

        if object_id != current_id:
            # may have multiple rows for one object because of many related constituents
            save(object)
            current_id = object_id
            object = {}
            if elasticsearch_connection.item_exists(object_id, classification):
                object = elasticsearch_connection.get_item(object_id, classification)
            else:
                # print() call form is valid on both Python 2 and 3
                print("%s could not be found!" % object_id)
                return (object, current_id)
        if 'relateditems' not in object:
            object['relateditems'] = {}

        media_type_key = int(row[indices['media_type_id_index']])
        media_type = MEDIATYPES.get(media_type_key)
        # "NULL" (any case) marks a missing value in these columns
        number = "" if row[indices['rendition_number_index']].lower() == "null" else row[indices['rendition_number_index']]
        media_master_id = row[indices['media_master_id_index']]
        thumbnail_url = get_media_url(row[indices['thumb_path_index']],
                                      row[indices['thumb_file_index']])
        main_url = get_media_url(row[indices['main_path_index']],
                                 row[indices['main_file_index']])
        description = "" if row[indices['description_index']].lower() == "null" else row[indices['description_index']]
        mediaview = "" if row[indices['media_view_index']].lower() == "null" else row[indices['media_view_index']]
        caption = "" if row[indices['caption_index']].lower() == "null" else row[indices['caption_index']]
        display_text = ": ".join([mediaview, caption])
        # this is a bit of a hack because the MediaFormats for videos (in the TMS database) does not correctly identify the type of video
        # so, make sure we are only using videos that are mp4s
        if media_type_key == 3:
            if not row[indices['main_file_index']].endswith('mp4'):
                return (object, current_id)

        if media_type not in object['relateditems']:
            object['relateditems'][media_type] = []

        if media_type == 'photos':
            object['hasphoto'] = "Yes"
        # add primary photo as a top level item as well
        if row[indices['primary_display_index']] == '1':
            object['primarydisplay'] = {
                'thumbnail': thumbnail_url,
                'main': main_url,
                'displaytext': display_text,
                'number': number,
                'description': description
            }
        # 3D-model objects skip their own 3D-model media (already primary)
        if not (classification == '3dmodels' and media_type == '3dmodels'):
            object['relateditems'][media_type].append({
                'id': media_master_id,
                'displaytext': display_text,
                'primarydisplay': row[indices['primary_display_index']] == '1',
                'thumbnail': thumbnail_url,
                'main': main_url,
                'number': number,
                'description': description
            })
        return (object, current_id)
Ejemplo n.º 57
0
    def process_site_row(site, current_id):
        """Attach one related-constituent row to the site document.

        Reads ``row``/``indices`` from the enclosing scope.  Saves the previous
        site and loads the next one from Elasticsearch when the site id
        changes.  Also maintains the site's ``roles`` and ``people`` lists,
        which are assumed to already exist on the loaded document.

        Returns:
            (site, current_id) tuple for the caller's loop state.
        """
        site_id = row[indices['site_id_index']]
        #if site_id not in SAMPLE_SITES:
        #	continue
        if site_id != current_id:
            # will likely have multiple rows for one site because of many related constituents
            # only get a new site if we have a new site id, but first save old site to elasticsearch
            save(site)
            current_id = site_id
            site = {}
            if elasticsearch_connection.item_exists(site_id, 'sites',
                                                    ELASTICSEARCH_INDEX):
                site = elasticsearch_connection.get_item(
                    site_id, 'sites', ELASTICSEARCH_INDEX)
            else:
                print("%s could not be found!" % site_id)
                return (site, current_id)
        if 'relateditems' not in site:
            site['relateditems'] = {}

        constituent_id = row[indices['constituent_id_index']]
        display_name = row[indices['display_name_index']]
        display_date = ""
        # "NULL" marks a missing display date
        if row[indices['display_date_index']] != "NULL":
            display_date = row[indices['display_date_index']]
        # a DRS id implies a IIIF manifest exists for this constituent
        drs_id = "" if row[indices['drs_id']].lower() == "null" else row[
            indices['drs_id']]
        has_manifest = False if drs_id == "" else True
        thumbnail_url = get_media_url(row[indices['thumb_path_index']],
                                      row[indices['thumb_file_index']])
        # fall back to a DRS-derived thumbnail when no media path exists
        if not thumbnail_url and drs_id:
            thumbnail_url = create_thumbnail_url(drs_id)

        constituent_dict = {}
        role = row[indices['role_index']]
        # update the set of roles for this site
        if role not in site['roles']:
            # make sure Tomb Owner is first
            if role == "Tomb Owner":
                site['roles'].insert(0, role)
            else:
                site['roles'].append(role)

        description = row[indices['remarks_index']] if row[
            indices['remarks_index']] != "NULL" else ""
        constituent_dict['role'] = role
        constituent_dict['id'] = constituent_id
        constituent_dict['displayname'] = display_name
        constituent_dict['displaydate'] = display_date
        constituent_dict['displaytext'] = display_name
        constituent_dict['description'] = description
        constituent_dict['thumbnail'] = thumbnail_url
        constituent_dict['has_manifest'] = has_manifest

        constituent_type_key = int(row[indices['constituent_type_id_index']])
        constituent_type = CONSTITUENTTYPES.get(constituent_type_key)

        # add to array of people for easier searching
        # NOTE(review): keys 1 and 3 presumably map to person-like types in
        # CONSTITUENTTYPES — confirm against its definition
        if (constituent_type_key in [1, 3]):
            site['people'].append(display_name)

        if constituent_type not in site['relateditems']:
            site['relateditems'][constituent_type] = []
        site['relateditems'][constituent_type].append(constituent_dict)
        # keep the related items sorted
        site['relateditems'][constituent_type].sort(
            key=operator.itemgetter('displaytext'))

        if role == 'Tomb Owner':
            site['tombowner'] = "Yes"
        return (site, current_id)
Ejemplo n.º 58
0
    def process_site_row(site, current_id):
        """Fold one related-media row into its site document and, when the
        media has a DRS id, accumulate its IIIF canvas data in the global
        SITE_RELATIONS map for later manifest generation.

        Rows arrive grouped by site id: when the id changes, the finished
        site is saved and the next one is loaded from Elasticsearch.
        Returns the (site, current_id) pair for the next row.
        """
        site_id = row[indices['site_id_index']]
        #if site_id not in SAMPLE_SITES:
        #	continue
        if site_id != current_id:
            # will likely have multiple rows for one site because of many related photos
            # only get a new site if we have a new site id, but first save old site to elasticsearch
            save(site)
            current_id = site_id
            site = {}
            if elasticsearch_connection.item_exists(site_id, 'sites',
                                                    ELASTICSEARCH_INDEX):
                site = elasticsearch_connection.get_item(
                    site_id, 'sites', ELASTICSEARCH_INDEX)
            else:
                print("%s could not be found!" % site_id)
                return (site, current_id)
        if 'relateditems' not in site:
            site['relateditems'] = {}

        # The literal string "NULL"/"null" in a column means "no value".
        media_type_key = int(row[indices['media_type_id_index']])
        media_type = MEDIATYPES.get(media_type_key)
        number = "" if row[indices['rendition_number_index']].lower(
        ) == "null" else row[indices['rendition_number_index']]
        description = "" if row[indices['description_index']].lower(
        ) == "null" else row[indices['description_index']]
        mediaview = "" if row[indices['media_view_index']].lower(
        ) == "null" else row[indices['media_view_index']]
        caption = "" if row[indices['caption_index']].lower(
        ) == "null" else row[indices['caption_index']]
        display_text = ": ".join([mediaview, caption])
        media_master_id = row[indices['media_master_id_index']]
        main_url = get_media_url(row[indices['main_path_index']],
                                 row[indices['main_file_index']])
        # A non-null DRS id is taken to mean a IIIF manifest exists for
        # this media item (see the has_manifest branch below).
        drs_id = "" if row[indices['drs_id']].lower() == "null" else row[
            indices['drs_id']]
        has_manifest = False if drs_id == "" else True
        primary_display = True if row[
            indices['primary_display_index']] == '1' else False
        thumbnail_url = get_media_url(row[indices['thumb_path_index']],
                                      row[indices['thumb_file_index']])
        # Fall back to a DRS-derived thumbnail when no file path exists.
        if not thumbnail_url and drs_id:
            thumbnail_url = create_thumbnail_url(drs_id)

        # this is a bit of a hack because the MediaFormats for videos (in the TMS database) does not correctly identify the type of video
        # so, make sure we are only using videos that are mp4s
        if media_type_key == 3:
            if not row[indices['main_file_index']].endswith('mp4'):
                return (site, current_id)

        if media_type not in site['relateditems']:
            site['relateditems'][media_type] = []
        # add primary photo as a top level item as well
        if primary_display:
            site['primarydisplay'] = {
                'thumbnail': thumbnail_url,
                'main': main_url,
                'displaytext': display_text,
                'number': number,
                'description': description,
                'has_manifest': has_manifest,
                'media_id': media_master_id
            }
        site['relateditems'][media_type].append({
            'id': media_master_id,
            'displaytext': display_text,
            'primarydisplay': primary_display,
            'thumbnail': thumbnail_url,
            'main': main_url,
            'number': number,
            'description': description,
            'has_manifest': has_manifest,
            'drs_id': drs_id
        })

        if has_manifest:
            # NOTE(review): assumes a 'manifest' document already exists in
            # the IIIF index whenever drs_id is set -- get_item would fail
            # otherwise; confirm against the manifest-loading step.
            object = elasticsearch_connection.get_item(
                media_type + '-' + media_master_id, 'manifest',
                ELASTICSEARCH_IIIF_INDEX)
            resource = object['manifest']['sequences'][0]['canvases'][0][
                'images'][0]['resource']
            canvas_label = object['manifest']['description']
            canvas_metadata = object['manifest'][
                'metadata']  #add photo manifest-level metadata as canvas-level metadata for site

            # First media item for a site seeds its SITE_RELATIONS entry;
            # subsequent items append their canvas data to the lists.
            if site_id not in SITE_RELATIONS.keys():
                metadata = add_metadata_to_manifest(site)

                SITE_RELATIONS[site_id] = {
                    'description': site['description'],
                    'label': site['displaytext'],
                    'resources': [resource],
                    'drs_ids': [drs_id],
                    'canvas_labels': [canvas_label],
                    'canvas_metadatas': [canvas_metadata],
                    'metadata': metadata
                }
            else:
                SITE_RELATIONS[site_id]['resources'].append(resource)
                SITE_RELATIONS[site_id]['drs_ids'].append(drs_id)
                SITE_RELATIONS[site_id]['canvas_labels'].append(canvas_label)
                SITE_RELATIONS[site_id]['canvas_metadatas'].append(
                    canvas_metadata)
            # The primary display image becomes the manifest's start canvas.
            if primary_display:
                SITE_RELATIONS[site_id]['startCanvas'] = drs_id

        return (site, current_id)
Ejemplo n.º 59
0
def create_library():
    """Rebuild the 'library' doc type: one document per author, each
    holding that author's PDF publications sorted by title/notes."""
    print("Creating Digital Library...")
    # The pubdocs index is sometimes still filling in when this runs;
    # a short pause works around the race.
    time.sleep(3)

    es = elasticsearch_connection.get_connection()
    es_index = ELASTICSEARCH_INDEX
    seen_authors = []
    page_size = 20

    # Wipe any previously generated library documents.
    stale = es.search(index=es_index,
                      doc_type='library',
                      body={
                          "size": 500,
                          "stored_fields": ["_id", "name"],
                          "query": {
                              "match_all": {}
                          }
                      })['hits']['hits']
    for hit in stale:
        elasticsearch_connection.delete(hit['_id'], 'library',
                                        ELASTICSEARCH_INDEX)

    # Count pubdocs so we can page through all of them.
    total = es.search(index=es_index,
                      doc_type='pubdocs',
                      body={
                          "size": 0,
                          "query": {
                              "match_all": {}
                          }
                      })['hits']['total']

    offset = 0
    while offset < total:
        page = es.search(index=es_index,
                         doc_type='pubdocs',
                         body={
                             "size": page_size,
                             "from": offset,
                             "query": {
                                 "match_all": {}
                             }
                         })
        for hit in page['hits']['hits']:
            doc = hit['_source']
            # Only documents with an attached PDF go into the library.
            if 'pdf' not in doc or doc['pdf'] == '':
                continue

            authors = doc['authors']
            # if this doc has no authors, set the author to 'No Author' and proceed
            if not authors:
                authors.append('No Author')

            for author in authors:
                author_id = author.replace(' ', '')
                # ASCII-fold the lowercased name for sorting.
                sort_name = str(
                    unicodedata.normalize(
                        'NFD', author.lower().strip()).encode(
                            'ascii', 'ignore').decode("utf-8"))
                if author_id in seen_authors:
                    # Author already written this run: extend the stored doc.
                    author_data = elasticsearch_connection.get_item(
                        author_id, 'library', ELASTICSEARCH_INDEX)
                else:
                    seen_authors.append(author_id)
                    author_data = {
                        'name': author,
                        'sortname': sort_name,
                        'docs': []
                    }

                sort_text = doc['notes'] if doc[
                    'notes'] is not None else doc['title']
                author_data['docs'].append({
                    'displaytext': doc['boilertext'],
                    'sorttext': sort_text,
                    'format': doc['format'],
                    # add file size
                    'url': doc['pdf']
                })
                author_data['docs'].sort(key=operator.itemgetter('sorttext'))

                elasticsearch_connection.add_or_update_item(
                    author_id, json.dumps(author_data), 'library',
                    ELASTICSEARCH_INDEX)

        offset = offset + page_size
    print("Finished Digital Library...")
Ejemplo n.º 60
0
    def process_site_row(site, current_id):
        """Fold one related-media row into its site document.

        Rows arrive grouped by site id; when the id changes, the finished
        site is saved to Elasticsearch and the next one is loaded.
        Returns the (site, current_id) pair for the next row.
        """
        site_id = row[indices['site_id_index']]
        if site_id != current_id:
            # will likely have multiple rows for one site because of many related photos
            # only get a new site if we have a new site id, but first save old site to elasticsearch
            save(site)
            current_id = site_id
            site = {}
            if elasticsearch_connection.item_exists(site_id, 'sites'):
                site = elasticsearch_connection.get_item(site_id, 'sites')
            else:
                # fixed: was a Python 2 print statement (a syntax error under
                # Python 3, and inconsistent with the rest of the file)
                print("%s could not be found!" % site_id)
                return (site, current_id)
        if 'relateditems' not in site:
            site['relateditems'] = {}

        # The literal string "NULL"/"null" in a column means "no value".
        media_type_key = int(row[indices['media_type_id_index']])
        media_type = MEDIATYPES.get(media_type_key)
        number = "" if row[indices['rendition_number_index']].lower(
        ) == "null" else row[indices['rendition_number_index']]
        description = "" if row[indices['description_index']].lower(
        ) == "null" else row[indices['description_index']]
        mediaview = "" if row[indices['media_view_index']].lower(
        ) == "null" else row[indices['media_view_index']]
        caption = "" if row[indices['caption_index']].lower(
        ) == "null" else row[indices['caption_index']]
        display_text = ": ".join([mediaview, caption])
        media_master_id = row[indices['media_master_id_index']]
        thumbnail_url = get_media_url(row[indices['thumb_path_index']],
                                      row[indices['thumb_file_index']])
        main_url = get_media_url(row[indices['main_path_index']],
                                 row[indices['main_file_index']])
        # Compute the flag once instead of re-testing the row twice below.
        primary_display = row[indices['primary_display_index']] == '1'

        # this is a bit of a hack because the MediaFormats for videos (in the TMS database) does not correctly identify the type of video
        # so, make sure we are only using videos that are mp4s
        if media_type_key == 3:
            if not row[indices['main_file_index']].endswith('mp4'):
                return (site, current_id)

        if media_type not in site['relateditems']:
            site['relateditems'][media_type] = []
        # add primary photo as a top level item as well
        if primary_display:
            site['primarydisplay'] = {
                'thumbnail': thumbnail_url,
                'main': main_url,
                'displaytext': display_text,
                'number': number,
                'description': description
            }
        site['relateditems'][media_type].append({
            'id': media_master_id,
            'displaytext': display_text,
            'primarydisplay': primary_display,
            'thumbnail': thumbnail_url,
            'main': main_url,
            'number': number,
            'description': description
        })
        return (site, current_id)