def process_constituents_related_objects(CURSOR): def get_indices(): indices = { "constituent_id_index": columns.index("ConstituentID"), "type_id_index": columns.index("ConstituentTypeID"), "object_id_index": columns.index("ObjectID"), "object_title_index": columns.index("Title"), "object_number_index": columns.index("ObjectNumber"), "classification_id_index": columns.index("ClassificationID"), "object_date_index": columns.index("ObjectDate"), "thumb_path_index": columns.index("ThumbPathName"), "thumb_file_index": columns.index("ThumbFileName"), } return indices def process_constituent_row(constituent, current_id): constituent_id = row[indices["constituent_id_index"]] type_key = int(row[indices["type_id_index"]]) type = CONSTITUENTTYPES.get(type_key) if constituent_id != current_id: # will likely have multiple rows for one constituent because of many related objects # only get a new constituent if we have a new constituent id, but first save old constituent to elasticsearch save(constituent) current_id = constituent_id constituent = {} if elasticsearch_connection.item_exists(constituent_id, type): constituent = elasticsearch_connection.get_item(constituent_id, type) else: print "%s could not be found!" % constituent_id return (constituent, current_id) if "relateditems" not in constituent: constituent["relateditems"] = {} classification_key = int(row[indices["classification_id_index"]]) classification = CLASSIFICATIONS.get(classification_key) object_id = int(row[indices["object_id_index"]]) thumbnail_url = get_media_url(row[indices["thumb_path_index"]], row[indices["thumb_file_index"]]) date = "" if row[indices["object_date_index"]].lower() == "null" else row[indices["object_date_index"]] object_title = row[indices["object_title_index"]] object_number = row[indices["object_number_index"]] if classification == "diarypages" and object_title.lower() == "null": idx = object_number.find("_") object_title = object_number[idx + 1 :] if object_title.lower() == "null": object_title = "[No Title]" if classification not in constituent["relateditems"]: constituent["relateditems"][classification] = [] constituent["relateditems"][classification].append( { "id": object_id, "title": object_title, "displaytext": object_title, "classificationid": classification_key, "number": object_number, "date": date, "thumbnail": thumbnail_url, } ) # keep the related items sorted constituent["relateditems"][classification].sort(key=operator.itemgetter("displaytext")) return (constituent, current_id) print "Starting Constituents Related Objects..." 
if CURSOR: sql_command = constituents_sql.RELATED_OBJECTS CURSOR.execute(sql_command) columns = [column[0] for column in CURSOR.description] indices = get_indices() constituent = {} current_id = "-1" cursor_row = CURSOR.fetchone() while cursor_row is not None: row = process_cursor_row(cursor_row) (constituent, current_id) = process_constituent_row(constituent, current_id) cursor_row = CURSOR.fetchone() # save last object to elasticsearch save(constituent) else: with open("../data/constituents_objects_related.csv", "rb") as csvfile: # Get the query headers to use as keys in the JSON headers = next(csvfile) if headers.startswith(codecs.BOM_UTF8): headers = headers[3:] headers = headers.replace("\r\n", "") columns = headers.split(",") indices = get_indices() rows = csv.reader(csvfile, delimiter=",", quotechar='"') constituent = {} current_id = "-1" for row in rows: (constituent, current_id) = process_constituent_row(constituent, current_id) # save last object to elasticsearch save(constituent) print "Finished Constituents Related Objects..."
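# ---------------------------------------------------------------------------
# Editor's note: every processor in this module leans on a few shared helpers
# (process_cursor_row, get_media_url, save) and lookup tables (CLASSIFICATIONS,
# CONSTITUENTTYPES, MEDIATYPES) that are defined elsewhere in the project. The
# commented-out sketch below is only a hedged illustration of the behavior the
# code above and below appears to assume; the real implementations, and the
# media host URL, may differ.
#
# def process_cursor_row(cursor_row):
#     # assumed: normalize a pyodbc row into a list of strings so the same
#     # row-handling code works for both the SQL cursor and the CSV fallback
#     return [str(value) if value is not None else "NULL" for value in cursor_row]
#
# def get_media_url(path, filename):
#     # assumed: join a media path and file name into a public URL, returning
#     # "" when either piece is missing ("NULL"); the host below is hypothetical
#     if path.lower() == "null" or filename.lower() == "null":
#         return ""
#     return "http://media.example.org/%s/%s" % (path.replace("\\", "/"), filename)
#
# def save(document):
#     # assumed: index the accumulated document in Elasticsearch; the empty
#     # dict passed in on the first row of each loop should be a no-op
#     ...
# ---------------------------------------------------------------------------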
def process_object_related_unpublished(CURSOR): def get_indices(): indices = { 'id_index' : columns.index('ID'), 'unpublished_id_index' : columns.index('UnpublishedID'), 'unpublished_title_index' : columns.index('UnpublishedTitle'), 'classification_id_index' : columns.index('ClassificationID'), 'object_date_index' : columns.index('ObjectDate'), 'object_number_index' : columns.index('ObjectNumber'), 'thumb_path_index' : columns.index('ThumbPathName'), 'thumb_file_index' : columns.index('ThumbFileName') } return indices def process_object_row(object, current_id): id = row[indices['id_index']] classification_key = int(row[indices['classification_id_index']]) classification = CLASSIFICATIONS.get(classification_key) if id != current_id: # may have multiple rows for one object because of many related constituents save(object) current_id = id object = {} if elasticsearch_connection.item_exists(id, classification): object = elasticsearch_connection.get_item(id, classification) else: print "%s could not be found!" % id return(object, current_id) if 'relateditems' not in object: object['relateditems'] = {} unpublished_id = row[indices['unpublished_id_index']] unpublished_title = row[indices['unpublished_title_index']] number = row[indices['object_number_index']] date = "" if row[indices['object_date_index']].lower() == "null" else row[indices['object_date_index']] thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']]) if 'unpubdocs' not in object['relateditems']: object['relateditems']['unpubdocs'] = [] object['relateditems']['unpubdocs'].append({ 'id' : unpublished_id, 'text' : unpublished_title, 'displaytext' : unpublished_title, 'date' : date, 'number' : number, 'thumbnail' : thumbnail_url}) # keep the related items sorted object['relateditems']['unpubdocs'].sort(key=operator.itemgetter('displaytext')) return(object, current_id) print "Starting Objects Related Unpublished..." if CURSOR: sql_command = objects_sql.RELATED_UNPUBLISHED CURSOR.execute(sql_command) columns = [column[0] for column in CURSOR.description] indices = get_indices() object = {} current_id = '-1' cursor_row = CURSOR.fetchone() while cursor_row is not None: row = process_cursor_row(cursor_row) (object, current_id) = process_object_row(object, current_id) cursor_row = CURSOR.fetchone() # save last object to elasticsearch save(object) else: with open('../data/objects_unpublished_related.csv', 'rb') as csvfile: # Get the query headers to use as keys in the JSON headers = next(csvfile) if headers.startswith(codecs.BOM_UTF8): headers = headers[3:] headers = headers.replace('\r\n','') columns = headers.split(',') indices = get_indices() rows = csv.reader(csvfile, delimiter=',', quotechar='"') object = {} current_id = '-1' for row in rows: (object, current_id) = process_object_row(object, current_id) # save last object to elasticsearch save(object) print "Finished Objects Related Unpublished..."
def process_object_altnums(CURSOR):
    def get_indices():
        indices = {
            'object_id_index' : columns.index('ObjectID'),
            'classification_id_index' : columns.index('ClassificationID'),
            'altnum_index' : columns.index('AltNum'),
            'description_index' : columns.index('Description')
        }
        return indices

    def process_object_row(object, current_id):
        object_id = row[indices['object_id_index']]
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)

        if object_id != current_id:
            # will likely have multiple rows for one object because of multiple alternate numbers
            save(object)
            current_id = object_id
            object = {}
            if elasticsearch_connection.item_exists(object_id, classification):
                object = elasticsearch_connection.get_item(object_id, classification)
            else:
                print "%s could not be found!" % object_id
                return (object, current_id)
        if 'altnums' not in object:
            object['altnums'] = []

        altnum = row[indices['altnum_index']]
        prefix_idx = altnum.find('_')
        without_prefix = altnum[prefix_idx+1:]
        description = row[indices['description_index']] if row[indices['description_index']] != "NULL" else ""
        object['altnums'].append({"altnum" : altnum, "description" : description, 'without_prefix': without_prefix})
        # 'allnumbers' is created by process_objects, so that pass must run before this one
        object['allnumbers'].extend((altnum, without_prefix))
        return (object, current_id)

    print "Starting Objects AltNums..."

    if CURSOR:
        sql_command = objects_sql.ALTNUMS
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        indices = get_indices()

        object = {}
        current_id = '-1'
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (object, current_id) = process_object_row(object, current_id)
            cursor_row = CURSOR.fetchone()
        # save last object to elasticsearch
        save(object)
    else:
        with open('../data/objects_altnums.csv', 'rb') as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace('\r\n','')
            columns = headers.split(',')
            indices = get_indices()

            rows = csv.reader(csvfile, delimiter=',', quotechar='"')
            object = {}
            current_id = '-1'
            for row in rows:
                (object, current_id) = process_object_row(object, current_id)
            # save last object to elasticsearch
            save(object)

    print "Finished Objects AltNums..."
def process_object_related_sites(CURSOR): def get_indices(): indices = { 'id_index' : columns.index('ID'), 'site_id_index' : columns.index('SiteID'), 'site_name_index' : columns.index('SiteName'), 'site_number_index' : columns.index('SiteNumber'), 'classification_id_index' : columns.index('ClassificationID'), 'thumb_path_index' : columns.index('ThumbPathName'), 'thumb_file_index' : columns.index('ThumbFileName') } return indices def process_object_row(object, current_id): id = row[indices['id_index']] classification_key = int(row[indices['classification_id_index']]) classification = CLASSIFICATIONS.get(classification_key) if id != current_id: # may have multiple rows for one object because of many related constituents save(object) current_id = id object = {} if elasticsearch_connection.item_exists(id, classification): object = elasticsearch_connection.get_item(id, classification) else: print "%s could not be found!" % id return(object, current_id) if 'relateditems' not in object: object['relateditems'] = {} site_id = row[indices['site_id_index']] site_name = row[indices['site_name_index']] site_number = row[indices['site_number_index']] thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']]) site_dict = {} site_dict['id'] = site_id site_dict['sitename'] = site_name site_dict['sitenumber'] = site_number site_dict['displaytext'] = site_number site_dict['thumbnail'] = thumbnail_url if 'sites' not in object['relateditems']: object['relateditems']['sites'] = [] object['relateditems']['sites'].append(site_dict) # keep the related items sorted object['relateditems']['sites'].sort(key=operator.itemgetter('displaytext')) # for unpubdocs, add sites for "Mentioned on this page" if classification == "unpubdocs": if 'mentioned' not in object: object['mentioned'] = {} if 'sites' not in object['mentioned']: object['mentioned']['sites'] = [] object['mentioned']['sites'].append(site_dict) return(object, current_id) print "Starting Objects Related Sites..." if CURSOR: sql_command = objects_sql.RELATED_SITES CURSOR.execute(sql_command) columns = [column[0] for column in CURSOR.description] indices = get_indices() object = {} current_id = '-1' cursor_row = CURSOR.fetchone() while cursor_row is not None: row = process_cursor_row(cursor_row) (object, current_id) = process_object_row(object, current_id) cursor_row = CURSOR.fetchone() # save last object to elasticsearch save(object) else: with open('../data/objects_sites_related.csv', 'rb') as csvfile: # Get the query headers to use as keys in the JSON headers = next(csvfile) if headers.startswith(codecs.BOM_UTF8): headers = headers[3:] headers = headers.replace('\r\n','') columns = headers.split(',') indices = get_indices() rows = csv.reader(csvfile, delimiter=',', quotechar='"') object = {} current_id = '-1' for row in rows: (object, current_id) = process_object_row(object, current_id) # save last object to elasticsearch save(object) print "Finished Objects Related Sites..."
def process_constituents_related_media(CURSOR): def get_indices(): indices = { "constituent_id_index": columns.index("ID"), "type_id_index": columns.index("ConstituentTypeID"), "media_master_id_index": columns.index("MediaMasterID"), "primary_display_index": columns.index("PrimaryDisplay"), "media_type_id_index": columns.index("MediaTypeID"), "description_index": columns.index("Description"), "caption_index": columns.index("PublicCaption"), "thumb_path_index": columns.index("ThumbPathName"), "thumb_file_index": columns.index("ThumbFileName"), "main_path_index": columns.index("MainPathName"), "main_file_index": columns.index("MainFileName"), } return indices def process_constituent_row(constituent, current_id): constituent_id = row[indices["constituent_id_index"]] type_key = int(row[indices["type_id_index"]]) type = CONSTITUENTTYPES.get(type_key) if constituent_id != current_id: # will likely have multiple rows for one constituent because of many related photos # only get a new constituent if we have a new constituent id, but first save old constituent to elasticsearch save(constituent) current_id = constituent_id constituent = {} if elasticsearch_connection.item_exists(constituent_id, type): constituent = elasticsearch_connection.get_item(constituent_id, type) else: print "%s could not be found!" % constituent_id return (constituent, current_id) if "relateditems" not in constituent: constituent["relateditems"] = {} media_type_key = int(row[indices["media_type_id_index"]]) media_type = MEDIATYPES.get(media_type_key) media_master_id = row[indices["media_master_id_index"]] thumbnail_url = get_media_url(row[indices["thumb_path_index"]], row[indices["thumb_file_index"]]) main_url = get_media_url(row[indices["main_path_index"]], row[indices["main_file_index"]]) display_text = row[indices["caption_index"]] # this is a bit of a hack because the MediaFormats for videos (in the TMS database) does not correctly identify the type of video # so, make sure we are only using videos that are mp4s if media_type_key == 3: if not row[indices["main_file_index"]].endswith("mp4"): return (constituent, current_id) if media_type not in constituent["relateditems"]: constituent["relateditems"][media_type] = [] # add primary photo as a top level item as well if row[indices["primary_display_index"]] == "1": constituent["primarydisplay"] = {"thumbnail": thumbnail_url, "main": main_url, "displaytext": display_text} constituent["relateditems"][media_type].append( { "id": media_master_id, "displaytext": display_text, "primarydisplay": True if row[indices["primary_display_index"]] == "1" else False, "thumbnail": thumbnail_url, "main": main_url, } ) return (constituent, current_id) print "Starting Constituents Related Media..." 
if CURSOR: sql_command = constituents_sql.RELATED_MEDIA CURSOR.execute(sql_command) columns = [column[0] for column in CURSOR.description] indices = get_indices() constituent = {} current_id = "-1" cursor_row = CURSOR.fetchone() while cursor_row is not None: row = process_cursor_row(cursor_row) (constituent, current_id) = process_constituent_row(constituent, current_id) cursor_row = CURSOR.fetchone() # save last object to elasticsearch save(constituent) else: with open("../data/constituents_media_related.csv", "rb") as csvfile: # Get the query headers to use as keys in the JSON headers = next(csvfile) if headers.startswith(codecs.BOM_UTF8): headers = headers[3:] headers = headers.replace("\r\n", "") columns = headers.split(",") indices = get_indices() rows = csv.reader(csvfile, delimiter=",", quotechar='"') constituent = {} current_id = "-1" for row in rows: (constituent, current_id) = process_constituent_row(constituent, current_id) # save last object to elasticsearch save(constituent) print "Finished Constituents Related Media..."
def process_object_geocodes(CURSOR):
    def get_indices():
        indices = {
            'id_index' : columns.index('ID'),
            'geo_code_id_index' : columns.index('GeoCodeID'),
            'geo_code_index' : columns.index('GeoCode'),
            'region_index' : columns.index('Region'),
            'city_index' : columns.index('City'),
            'classification_id_index' : columns.index('ClassificationID')
        }
        return indices

    def process_object_row(object, current_id):
        id = row[indices['id_index']]
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)

        if id != current_id:
            # may have multiple rows for one object; each row overwrites object['geocode'], so the last geocode wins
            save(object)
            current_id = id
            object = {}
            if elasticsearch_connection.item_exists(id, classification):
                object = elasticsearch_connection.get_item(id, classification)
            else:
                print "%s could not be found!" % id
                return (object, current_id)

        geocode_dict = {}
        geocode_dict['id'] = row[indices['geo_code_id_index']]
        geocode_dict['geocode'] = row[indices['geo_code_index']]
        geocode_dict['region'] = row[indices['region_index']]
        geocode_dict['city'] = row[indices['city_index']]
        object['geocode'] = geocode_dict
        return (object, current_id)

    print "Starting Objects Geocodes..."

    if CURSOR:
        sql_command = objects_sql.GEOCODES
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        indices = get_indices()

        object = {}
        current_id = '-1'
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (object, current_id) = process_object_row(object, current_id)
            cursor_row = CURSOR.fetchone()
        # save last object to elasticsearch
        save(object)
    else:
        with open('../data/objects_geocodes.csv', 'rb') as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace('\r\n','')
            columns = headers.split(',')
            indices = get_indices()

            rows = csv.reader(csvfile, delimiter=',', quotechar='"')
            object = {}
            current_id = '-1'
            for row in rows:
                (object, current_id) = process_object_row(object, current_id)
            # save last object to elasticsearch
            save(object)

    print "Finished Objects Geocodes..."
def process_site_related_media(CURSOR): def get_indices(): indices = { 'site_id_index' : columns.index('SiteID'), 'media_master_id_index' : columns.index('MediaMasterID'), 'primary_display_index' : columns.index('PrimaryDisplay'), 'media_type_id_index' : columns.index('MediaTypeID'), 'description_index' : columns.index('Description'), 'caption_index' : columns.index('PublicCaption'), 'thumb_path_index' : columns.index('ThumbPathName'), 'thumb_file_index' : columns.index('ThumbFileName'), 'main_path_index' : columns.index('MainPathName'), 'main_file_index' : columns.index('MainFileName') } return indices def process_site_row(site, current_id): site_id = row[indices['site_id_index']] #if site_id not in SAMPLE_SITES: # continue if site_id != current_id: # will likely have multiple rows for one site because of many related photos # only get a new site if we have a new site id, but first save old site to elasticsearch save(site) current_id = site_id site = {} if elasticsearch_connection.item_exists(site_id, 'sites'): site = elasticsearch_connection.get_item(site_id, 'sites') else: print "%s could not be found!" % site_id return(site, current_id) if 'relateditems' not in site: site['relateditems'] = {} media_type_key = int(row[indices['media_type_id_index']]) media_type = MEDIATYPES.get(media_type_key) media_master_id = row[indices['media_master_id_index']] thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']]) main_url = get_media_url(row[indices['main_path_index']], row[indices['main_file_index']]) display_text = row[indices['caption_index']] # this is a bit of a hack because the MediaFormats for videos (in the TMS database) does not correctly identify the type of video # so, make sure we are only using videos that are mp4s if media_type_key == 3: if not row[indices['main_file_index']].endswith('mp4'): return(site, current_id) if media_type not in site['relateditems']: site['relateditems'][media_type] = [] # add primary photo as a top level item as well if row[indices['primary_display_index']] == '1': site['primarydisplay'] = { 'thumbnail' : thumbnail_url, 'main' : main_url, 'displaytext' : display_text } site['relateditems'][media_type].append({ 'id' : media_master_id, 'displaytext' : display_text, 'primarydisplay' : True if row[indices['primary_display_index']] == '1' else False, 'thumbnail' : thumbnail_url, 'main' : main_url }) return(site, current_id) print "Starting Sites Related Media..." if CURSOR: sql_command = sites_sql.RELATED_MEDIA CURSOR.execute(sql_command) columns = [column[0] for column in CURSOR.description] indices = get_indices() site = {} current_id = '-1' cursor_row = CURSOR.fetchone() while cursor_row is not None: row = process_cursor_row(cursor_row) (site, current_id) = process_site_row(site, current_id) cursor_row = CURSOR.fetchone() # save last object to elasticsearch save(site) else: with open('../data/sites_media_related.csv', 'rb') as csvfile: # Get the query headers to use as keys in the JSON headers = next(csvfile) if headers.startswith(codecs.BOM_UTF8): headers = headers[3:] headers = headers.replace('\r\n','') columns = headers.split(',') indices = get_indices() rows = csv.reader(csvfile, delimiter=',', quotechar='"') site = {} current_id = '-1' for row in rows: (site, current_id) = process_site_row(site, current_id) # save last object to elasticsearch save(site) print "Finished Sites Related Media..."
def process_site_dates(CURSOR):
    def get_indices():
        indices = {
            'site_id_index' : columns.index('SiteID'),
            'event_type_index' : columns.index('EventType'),
            'date_text_index' : columns.index('DateText')
        }
        return indices

    def process_site_row(site, current_id):
        site_id = row[indices['site_id_index']]

        if site_id != current_id:
            # will likely have multiple rows for one site because of multiple site dates
            # only get a new site if we have a new site id, but first save old site to elasticsearch
            save(site)
            current_id = site_id
            site = {}
            if elasticsearch_connection.item_exists(site_id, 'sites'):
                site = elasticsearch_connection.get_item(site_id, 'sites')
            else:
                print "%s could not be found!" % site_id
                return (site, current_id)
        if 'sitedates' not in site:
            site['sitedates'] = []

        event_type = row[indices['event_type_index']]
        date_text = row[indices['date_text_index']]
        site['sitedates'].append({
            'type' : event_type,
            'date' : date_text
        })
        # 'datevalues' is initialized by process_sites, so that pass must run before this one
        site['datevalues'].append(date_text)
        return (site, current_id)

    print "Starting Sites Dates..."

    if CURSOR:
        sql_command = sites_sql.SITEDATES
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        indices = get_indices()

        site = {}
        current_id = '-1'
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (site, current_id) = process_site_row(site, current_id)
            cursor_row = CURSOR.fetchone()
        # save last object to elasticsearch
        save(site)
    else:
        with open('../data/sites_dates.csv', 'rb') as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace('\r\n','')
            columns = headers.split(',')
            indices = get_indices()

            rows = csv.reader(csvfile, delimiter=',', quotechar='"')
            site = {}
            current_id = '-1'
            for row in rows:
                (site, current_id) = process_site_row(site, current_id)
            # save last object to elasticsearch
            save(site)

    print "Finished Sites Dates..."
def process_site_related_constituents(CURSOR): def get_indices(): indices = { 'site_id_index' : columns.index('SiteID'), 'role_index' : columns.index('Role'), 'constituent_id_index' : columns.index('ConstituentID'), 'constituent_type_id_index' : columns.index('ConstituentTypeID'), 'display_name_index' : columns.index('DisplayName'), 'display_date_index' : columns.index('DisplayDate'), 'remarks_index' : columns.index('Remarks'), 'thumb_path_index' : columns.index('ThumbPathName'), 'thumb_file_index' : columns.index('ThumbFileName') } return indices def process_site_row(site, current_id): site_id = row[indices['site_id_index']] #if site_id not in SAMPLE_SITES: # continue if site_id != current_id: # will likely have multiple rows for one site because of many related constituents # only get a new site if we have a new site id, but first save old site to elasticsearch save(site) current_id = site_id site = {} if elasticsearch_connection.item_exists(site_id, 'sites'): site = elasticsearch_connection.get_item(site_id, 'sites') else: print "%s could not be found!" % site_id return(site, current_id) if 'relateditems' not in site: site['relateditems'] = {} constituent_id = row[indices['constituent_id_index']] display_name = row[indices['display_name_index']] display_date = "" if row[indices['display_date_index']] != "NULL": display_date = row[indices['display_date_index']] thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']]) constituent_dict = {} role = row[indices['role_index']] # update the set of roles for this site if role not in site['roles']: # make sure Tomb Owner is first if role == "Tomb Owner": site['roles'].insert(0, role) else: site['roles'].append(role) description = row[indices['remarks_index']] if row[indices['remarks_index']] != "NULL" else "" constituent_dict['role'] = role constituent_dict['id'] = constituent_id constituent_dict['displayname'] = display_name constituent_dict['displaydate'] = display_date constituent_dict['displaytext'] = display_name constituent_dict['description'] = description constituent_dict['thumbnail'] = thumbnail_url constituent_type_key = int(row[indices['constituent_type_id_index']]) constituent_type = CONSTITUENTTYPES.get(constituent_type_key) # add to array of people for easier searching if (constituent_type_key in [1,3]): site['people'].append(display_name) if constituent_type not in site['relateditems']: site['relateditems'][constituent_type] = [] site['relateditems'][constituent_type].append(constituent_dict) # keep the related items sorted site['relateditems'][constituent_type].sort(key=operator.itemgetter('displaytext')) if role == 'Tomb Owner': site['tombowner'] = "Yes" return(site, current_id) print "Starting Sites Related Constituents..." 
if CURSOR: sql_command = sites_sql.RELATED_CONSTITUENTS CURSOR.execute(sql_command) columns = [column[0] for column in CURSOR.description] indices = get_indices() site = {} current_id = '-1' cursor_row = CURSOR.fetchone() while cursor_row is not None: row = process_cursor_row(cursor_row) (site, current_id) = process_site_row(site, current_id) cursor_row = CURSOR.fetchone() # save last object to elasticsearch save(site) else: with open('../data/sites_constituents_related.csv', 'rb') as csvfile: # Get the query headers to use as keys in the JSON headers = next(csvfile) if headers.startswith(codecs.BOM_UTF8): headers = headers[3:] headers = headers.replace('\r\n','') columns = headers.split(',') indices = get_indices() rows = csv.reader(csvfile, delimiter=',', quotechar='"') site = {} current_id = '-1' for row in rows: (site, current_id) = process_site_row(site, current_id) # save last object to elasticsearch save(site) print "Finished Sites Related Constituents..."
def process_site_related_published(CURSOR): def get_indices(): indices = { 'site_id_index' : columns.index('SiteID'), 'reference_id_index' : columns.index('ReferenceID'), 'title_index' : columns.index('Title'), 'boiler_text_index' : columns.index('BoilerText'), 'date_index' : columns.index('DisplayDate'), 'path_index' : columns.index('MainPathName'), 'file_index' : columns.index('MainFileName'), 'thumb_path_index' : columns.index('ThumbPathName'), 'thumb_file_index' : columns.index('ThumbFileName') } return indices def process_site_row(site, current_id): site_id = row[indices['site_id_index']] #if site_id not in SAMPLE_SITES: # continue if site_id != current_id: # will likely have multiple rows for one site because of many related published # only get a new site if we have a new site id, but first save old site to elasticsearch save(site) current_id = site_id site = {} if elasticsearch_connection.item_exists(site_id, 'sites'): site = elasticsearch_connection.get_item(site_id, 'sites') else: print "%s could not be found!" % site_id return(site, current_id) if 'relateditems' not in site: site['relateditems'] = {} reference_id = row[indices['reference_id_index']] title = row[indices['title_index']] boiler_text = row[indices['boiler_text_index']] date = row[indices['date_index']] main_url = get_media_url(row[indices['path_index']], row[indices['file_index']]) thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']]) if "pubdocs" not in site['relateditems']: site['relateditems']["pubdocs"] = [] site['relateditems']["pubdocs"].append({ 'id' : reference_id, 'boilertext' : boiler_text, 'displaytext' : title, 'date' : date, 'url' : main_url, 'thumbnail' : thumbnail_url}) # keep the related items sorted site['relateditems']['pubdocs'].sort(key=operator.itemgetter('displaytext')) return(site, current_id) print "Starting Sites Related Published..." if CURSOR: sql_command = sites_sql.RELATED_PUBLISHED CURSOR.execute(sql_command) columns = [column[0] for column in CURSOR.description] indices = get_indices() site = {} current_id = '-1' cursor_row = CURSOR.fetchone() while cursor_row is not None: row = process_cursor_row(cursor_row) (site, current_id) = process_site_row(site, current_id) cursor_row = CURSOR.fetchone() # save last object to elasticsearch save(site) else: with open('../data/sites_published_related.csv', 'rb') as csvfile: # Get the query headers to use as keys in the JSON headers = next(csvfile) if headers.startswith(codecs.BOM_UTF8): headers = headers[3:] headers = headers.replace('\r\n','') columns = headers.split(',') indices = get_indices() rows = csv.reader(csvfile, delimiter=',', quotechar='"') site = {} current_id = '-1' for row in rows: (site, current_id) = process_site_row(site, current_id) # save last object to elasticsearch save(site) print "Finished Sites Related Published..."
def process_site_related_objects(CURSOR): def get_indices(): indices = { 'site_id_index' : columns.index('SiteID'), 'classification_id_index' : columns.index('ClassificationID'), 'object_id_index' : columns.index('ObjectID'), 'object_title_index' : columns.index('Title'), 'object_number_index' : columns.index('ObjectNumber'), 'object_date_index' : columns.index('ObjectDate'), 'thumb_path_index' : columns.index('ThumbPathName'), 'thumb_file_index' : columns.index('ThumbFileName') } return indices def process_site_row(site, current_id): site_id = row[indices['site_id_index']] #if site_id not in SAMPLE_SITES: # continue if site_id != current_id: # will likely have multiple rows for one site because of many related objects # only get a new site if we have a new site id, but first save old site to elasticsearch save(site) current_id = site_id site = {} if elasticsearch_connection.item_exists(site_id, 'sites'): site = elasticsearch_connection.get_item(site_id, 'sites') else: print "%s could not be found!" % site_id return (site, current_id) if 'relateditems' not in site: site['relateditems'] = {} classification_key = int(row[indices['classification_id_index']]) classification = CLASSIFICATIONS.get(classification_key) object_id = int(row[indices['object_id_index']]) thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']]) date = "" if row[indices['object_date_index']].lower() == "null" else row[indices['object_date_index']] object_title = row[indices['object_title_index']] object_number = row[indices['object_number_index']] if classification == "diarypages" and object_title.lower() == "null": idx = object_number.find('_') object_title = object_number[idx+1:] if object_title.lower() == "null": object_title = "[No Title]" if classification not in site['relateditems']: site['relateditems'][classification] = [] site['relateditems'][classification].append({ 'id' : object_id, 'title' : object_title, 'displaytext' : object_title, 'classificationid' : classification_key, 'number' : object_number, 'date' : date, 'thumbnail' : thumbnail_url}) # keep the related items sorted site['relateditems'][classification].sort(key=operator.itemgetter('displaytext')) return (site, current_id) print "Starting Sites Related Objects..." if CURSOR: sql_command = sites_sql.RELATED_OBJECTS CURSOR.execute(sql_command) columns = [column[0] for column in CURSOR.description] indices = get_indices() site = {} current_id = '-1' cursor_row = CURSOR.fetchone() while cursor_row is not None: row = process_cursor_row(cursor_row) (site, current_id) = process_site_row(site, current_id) cursor_row = CURSOR.fetchone() # save last object to elasticsearch save(site) else: with open('../data/sites_objects_related.csv', 'rb') as csvfile: # Get the query headers to use as keys in the JSON headers = next(csvfile) if headers.startswith(codecs.BOM_UTF8): headers = headers[3:] headers = headers.replace('\r\n','') columns = headers.split(',') indices = get_indices() rows = csv.reader(csvfile, delimiter=',', quotechar='"') site = {} current_id = '-1' for row in rows: (site, current_id) = process_site_row(site, current_id) # save last object to elasticsearch save(site) print "Finished Sites Related Objects..."
def process_site_altnums(CURSOR): def get_indices(): indices = { 'site_id_index' : columns.index('SiteID'), 'altnum_index' : columns.index('AltNum'), 'description_index' : columns.index('Description') } return indices def process_site_row(site, current_id): site_id = row[indices['site_id_index']] #if site_id not in SAMPLE_SITES: # continue if site_id != current_id: # will likely have multiple rows for one site because of many related objects # only get a new site if we have a new site id, but first save old site to elasticsearch save(site) current_id = site_id site = {} if elasticsearch_connection.item_exists(site_id, 'sites'): site = elasticsearch_connection.get_item(site_id, 'sites') else: print "%s could not be found!" % site_id return (site, current_id) if 'altnums' not in site: site['altnums'] = [] if 'altnum_types' not in site: site['altnum_types'] = [] altnum = row[indices['altnum_index']] description = row[indices['description_index']] if row[indices['description_index']] != "NULL" else "" if description not in site['altnum_types']: site['altnum_types'].append(description) site['altnums'].append({"altnum" : altnum, "description" : description}) return (site, current_id) print "Starting Sites AltNums..." if CURSOR: sql_command = sites_sql.ALTNUMS CURSOR.execute(sql_command) columns = [column[0] for column in CURSOR.description] indices = get_indices() site = {} current_id = '-1' cursor_row = CURSOR.fetchone() while cursor_row is not None: row = process_cursor_row(cursor_row) (site, current_id) = process_site_row(site, current_id) cursor_row = CURSOR.fetchone() # save last object to elasticsearch save(site) else: with open('../data/sites_altnums.csv', 'rb') as csvfile: # Get the query headers to use as keys in the JSON headers = next(csvfile) if headers.startswith(codecs.BOM_UTF8): headers = headers[3:] headers = headers.replace('\r\n','') columns = headers.split(',') indices = get_indices() rows = csv.reader(csvfile, delimiter=',', quotechar='"') site = {} current_id = '-1' for row in rows: (site, current_id) = process_site_row(site, current_id) # save last object to elasticsearch save(site) print "Finished Sites AltNums..."
def process_sites(CURSOR):
    def get_indices():
        site_id_index = columns.index('ID')
        return site_id_index

    def process_site_row(site, current_id):
        site_id = row[site_id_index]
        #if site_id not in SAMPLE_SITES:
        #    continue

        # could have multiple rows for one site because of multiple SiteDates or other pieces of information
        # only create a new site if we have a new site id, but first save old site to elasticsearch
        if site_id != current_id:
            save(site)
            current_id = site_id
            site = {}

        # loop through each row
        for index, value in enumerate(columns):
            key = value.lower()
            row_value = row[index]

            # cleanup row data
            if row_value.isdigit():
                row_value = int(row_value)
            elif row_value == "NULL":
                row_value = None
            else:
                row_value = row_value.replace(',,','')

            if 'sitetype' in key:
                if not row_value:
                    continue
                # group sitetype fields into an object
                if 'sitetype' not in site:
                    site['sitetype'] = {}
                site['sitetype'][key] = row_value
            else:
                # no special processing - just add it to the JSON
                site[key] = row_value

        display_text = site['number']
        site['displaytext'] = display_text
        site['tombowner'] = "No"
        site['roles'] = []
        site['people'] = []
        site['datevalues'] = []
        return (site, current_id)

    print "Starting Sites..."

    if CURSOR:
        sql_command = sites_sql.SITES
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        site_id_index = get_indices()

        site = {}
        current_id = '-1'
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (site, current_id) = process_site_row(site, current_id)
            cursor_row = CURSOR.fetchone()
        # save last object to elasticsearch
        save(site)
    else:
        with open('../data/sites.csv', 'rb') as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace('\r\n','')
            columns = headers.split(',')
            site_id_index = get_indices()

            rows = csv.reader(csvfile, delimiter=',', quotechar='"')
            site = {}
            current_id = '-1'
            for row in rows:
                (site, current_id) = process_site_row(site, current_id)
            # save last object to elasticsearch
            save(site)

    print "Finished Sites..."
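# ---------------------------------------------------------------------------
# Editor's note: the process_*_related_* and other enrichment passes in this
# module fetch their parent document from Elasticsearch and skip the row (with
# a "could not be found!" message) when it is missing, so the base passes such
# as process_sites and process_objects have to run first. A hedged sketch of
# the assumed call order for the sites pipeline; get_cursor() is hypothetical
# and stands in for however the pyodbc connection is actually created (passing
# None forces the CSV fallback):
#
# if __name__ == "__main__":
#     cursor = get_cursor()
#     process_sites(cursor)                   # creates the base 'sites' documents
#     process_site_dates(cursor)              # enrichment passes assume the base documents exist
#     process_site_altnums(cursor)
#     process_site_related_objects(cursor)
#     process_site_related_constituents(cursor)
#     process_site_related_published(cursor)
#     process_site_related_media(cursor)
# ---------------------------------------------------------------------------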
def process_constituents_related_sites(CURSOR): def get_indices(): indices = { "constituent_id_index": columns.index("ConstituentID"), "type_id_index": columns.index("ConstituentTypeID"), "site_id_index": columns.index("SiteID"), "site_name_index": columns.index("SiteName"), "site_number_index": columns.index("SiteNumber"), "thumb_path_index": columns.index("ThumbPathName"), "thumb_file_index": columns.index("ThumbFileName"), } return indices def process_constituent_row(constituent, current_id): constituent_id = row[indices["constituent_id_index"]] type_key = int(row[indices["type_id_index"]]) type = CONSTITUENTTYPES.get(type_key) if constituent_id != current_id: # will likely have multiple rows for one constituent because of many related constituents # only get a new constituent if we have a new constituent id, but first save old constituent to elasticsearch save(constituent) current_id = constituent_id constituent = {} if elasticsearch_connection.item_exists(constituent_id, type): constituent = elasticsearch_connection.get_item(constituent_id, type) else: print "%s could not be found!" % constituent_id return (constituent, current_id) if "relateditems" not in constituent: constituent["relateditems"] = {} site_id = row[indices["site_id_index"]] site_name = row[indices["site_name_index"]] site_number = row[indices["site_number_index"]] thumbnail_url = get_media_url(row[indices["thumb_path_index"]], row[indices["thumb_file_index"]]) site_dict = {} site_dict["id"] = site_id site_dict["sitename"] = site_name site_dict["sitenumber"] = site_number site_dict["displaytext"] = "%s, %s" % (site_name, site_number) site_dict["thumbnail"] = thumbnail_url if "sites" not in constituent["relateditems"]: constituent["relateditems"]["sites"] = [] constituent["relateditems"]["sites"].append(site_dict) # keep the related items sorted constituent["relateditems"]["sites"].sort(key=operator.itemgetter("displaytext")) return (constituent, current_id) print "Starting Constituents Related Sites..." if CURSOR: sql_command = constituents_sql.RELATED_SITES CURSOR.execute(sql_command) columns = [column[0] for column in CURSOR.description] indices = get_indices() constituent = {} current_id = "-1" cursor_row = CURSOR.fetchone() while cursor_row is not None: row = process_cursor_row(cursor_row) (constituent, current_id) = process_constituent_row(constituent, current_id) cursor_row = CURSOR.fetchone() # save last object to elasticsearch save(constituent) else: with open("../data/constituents_sites_related.csv", "rb") as csvfile: # Get the query headers to use as keys in the JSON headers = next(csvfile) if headers.startswith(codecs.BOM_UTF8): headers = headers[3:] headers = headers.replace("\r\n", "") columns = headers.split(",") indices = get_indices() rows = csv.reader(csvfile, delimiter=",", quotechar='"') constituent = {} current_id = "-1" for row in rows: (constituent, current_id) = process_constituent_row(constituent, current_id) # save last object to elasticsearch save(constituent) print "Finished Constituents Related Sites..."
def process_pubs(CURSOR): def get_indices(): pub_id_index = columns.index('ID') return pub_id_index def process_pub_row(pub, current_id): pub_id = row[pub_id_index] if pub_id != current_id: save(pub) current_id = pub_id pub = {} # loop through columns in row for index, value in enumerate(columns): key = value.lower() row_value = row[index] # cleanup row data if row_value.isdigit(): row_value = int(row_value) elif row_value == "NULL": row_value = None else: row_value = row_value.replace(',,','') pub[key] = row_value display_text = pub['title'] pub['displaytext'] = display_text pub['roles'] = [] pub["authors"] = [] return (pub, current_id) print "Starting Pub Docs..." if CURSOR: sql_command = published_sql.PUBLISHED CURSOR.execute(sql_command) columns = [column[0] for column in CURSOR.description] pub_id_index = get_indices() pub = {} current_id = '-1' cursor_row = CURSOR.fetchone() while cursor_row is not None: row = process_cursor_row(cursor_row) (pub, current_id) = process_pub_row(pub, current_id) cursor_row = CURSOR.fetchone() # save last object to elasticsearch save(pub) else: with open('../data/pubdocs.csv', 'rb') as csvfile: # Get the query headers to use as keys in the JSON headers = next(csvfile) if headers.startswith(codecs.BOM_UTF8): headers = headers[3:] headers = headers.replace('\r\n','') columns = headers.split(',') pub_id_index = get_indices() rows = csv.reader(csvfile, delimiter=',', quotechar='"') pub = {} current_id = '-1' for row in rows: (pub, current_id) = process_pub_row(pub, current_id) # save last object to elasticsearch save(pub) print "Finished Pub Docs..."
def process_constituents_related_published(CURSOR): def get_indices(): indices = { "constituent_id_index": columns.index("ConstituentID"), "type_id_index": columns.index("ConstituentTypeID"), "reference_id_index": columns.index("ReferenceID"), "title_index": columns.index("Title"), "boiler_text_index": columns.index("BoilerText"), "date_index": columns.index("DisplayDate"), "path_index": columns.index("MainPathName"), "file_index": columns.index("MainFileName"), } return indices def process_constituent_row(constituent, current_id): constituent_id = row[indices["constituent_id_index"]] type_key = int(row[indices["type_id_index"]]) type = CONSTITUENTTYPES.get(type_key) if constituent_id != current_id: # will likely have multiple rows for one constituent because of many related published # only get a new constituent if we have a new constituent id, but first save old constituent to elasticsearch save(constituent) current_id = constituent_id constituent = {} if elasticsearch_connection.item_exists(constituent_id, type): constituent = elasticsearch_connection.get_item(constituent_id, type) else: print "%s could not be found!" % constituent_id return (constituent, current_id) if "relateditems" not in constituent: constituent["relateditems"] = {} reference_id = row[indices["reference_id_index"]] title = row[indices["title_index"]] boiler_text = row[indices["boiler_text_index"]] date = row[indices["date_index"]] main_url = get_media_url(row[indices["path_index"]], row[indices["file_index"]]) if "pubdocs" not in constituent["relateditems"]: constituent["relateditems"]["pubdocs"] = [] constituent["relateditems"]["pubdocs"].append( {"id": reference_id, "boilertext": boiler_text, "displaytext": title, "date": date, "url": main_url} ) # keep the related items sorted constituent["relateditems"]["pubdocs"].sort(key=operator.itemgetter("displaytext")) return (constituent, current_id) print "Starting Constituents Related Published..." if CURSOR: sql_command = constituents_sql.RELATED_PUBLISHED CURSOR.execute(sql_command) columns = [column[0] for column in CURSOR.description] indices = get_indices() constituent = {} current_id = "-1" cursor_row = CURSOR.fetchone() while cursor_row is not None: row = process_cursor_row(cursor_row) (constituent, current_id) = process_constituent_row(constituent, current_id) cursor_row = CURSOR.fetchone() # save last object to elasticsearch save(constituent) else: with open("../data/constituents_published_related.csv", "rb") as csvfile: # Get the query headers to use as keys in the JSON headers = next(csvfile) if headers.startswith(codecs.BOM_UTF8): headers = headers[3:] headers = headers.replace("\r\n", "") columns = headers.split(",") indices = get_indices() rows = csv.reader(csvfile, delimiter=",", quotechar='"') constituent = {} current_id = "-1" for row in rows: (constituent, current_id) = process_constituent_row(constituent, current_id) # save last object to elasticsearch save(constituent) print "Finished Constituents Related Published..."
def process_pub_related_objects(CURSOR): def get_indices(): indices = { 'pub_id_index' : columns.index('ReferenceID'), 'classification_id_index' : columns.index('ClassificationID'), 'object_id_index' : columns.index('ObjectID'), 'object_title_index' : columns.index('Title'), 'object_number_index' : columns.index('ObjectNumber'), 'object_date_index' : columns.index('ObjectDate'), 'thumb_path_index' : columns.index('ThumbPathName'), 'thumb_file_index' : columns.index('ThumbFileName') } return indices def process_pub_row(pub, current_id): pub_id = row[indices['pub_id_index']] if pub_id != current_id: save(pub) current_id = pub_id pub = {} if elasticsearch_connection.item_exists(pub_id, 'pubdocs'): pub = elasticsearch_connection.get_item(pub_id, 'pubdocs') else: print "%s could not be found!" % pub_id return (pub, current_id) if 'relateditems' not in pub: pub['relateditems'] = {} classification_key = int(row[indices['classification_id_index']]) classification = CLASSIFICATIONS.get(classification_key) object_id = int(row[indices['object_id_index']]) thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']]) date = "" if row[indices['object_date_index']].lower() == "null" else row[indices['object_date_index']] object_title = row[indices['object_title_index']] object_number = row[indices['object_number_index']] if classification == "diarypages" and object_title.lower() == "null": idx = object_number.find('_') object_title = object_number[idx+1:] if object_title.lower() == "null": object_title = "[No Title]" if classification not in pub['relateditems']: pub['relateditems'][classification] = [] pub['relateditems'][classification].append({ 'id' : object_id, 'title' : object_title, 'displaytext' : object_title, 'classificationid' : classification_key, 'number' : object_number, 'date' : date, 'thumbnail' : thumbnail_url}) # keep the related items sorted pub['relateditems'][classification].sort(key=operator.itemgetter('displaytext')) return (pub, current_id) print "Starting Pub Docs Related Objects..." if CURSOR: sql_command = published_sql.RELATED_OBJECTS CURSOR.execute(sql_command) columns = [column[0] for column in CURSOR.description] indices = get_indices() pub = {} current_id = '-1' cursor_row = CURSOR.fetchone() while cursor_row is not None: row = process_cursor_row(cursor_row) (pub, current_id) = process_pub_row(pub, current_id) cursor_row = CURSOR.fetchone() # save last object to elasticsearch save(pub) else: with open('../data/pubdocs_objects_related.csv', 'rb') as csvfile: # Get the query headers to use as keys in the JSON headers = next(csvfile) if headers.startswith(codecs.BOM_UTF8): headers = headers[3:] headers = headers.replace('\r\n','') columns = headers.split(',') indices = get_indices() rows = csv.reader(csvfile, delimiter=',', quotechar='"') pub = {} current_id = '-1' for row in rows: (pub, current_id) = process_pub_row(pub, current_id) # save last object to elasticsearch save(pub) print "Finished Pub Docs Related Objects..."
def process_constituents_altnames(CURSOR): def get_indices(): indices = { "constituent_id_index": columns.index("ConstituentID"), "type_id_index": columns.index("ConstituentTypeID"), "altname_index": columns.index("DisplayName"), "name_type_index": columns.index("NameType"), } return indices def process_constituent_row(constituent, current_id): constituent_id = row[indices["constituent_id_index"]] type_key = int(row[indices["type_id_index"]]) type = CONSTITUENTTYPES.get(type_key) if constituent_id != current_id: # will likely have multiple rows for one constituent because of many related objects # only get a new constituent if we have a new constituent id, but first save old constituent to elasticsearch save(constituent) current_id = constituent_id constituent = {} if elasticsearch_connection.item_exists(constituent_id, type): constituent = elasticsearch_connection.get_item(constituent_id, type) else: print "%s could not be found!" % constituent_id return (constituent, current_id) if "altnames" not in constituent: constituent["altnames"] = [] altname = row[indices["altname_index"]] name_type = row[indices["name_type_index"]] constituent["altnames"].append({"name": altname, "type": name_type}) return (constituent, current_id) print "Starting Constituents AltNames..." if CURSOR: sql_command = constituents_sql.ALT_NAMES CURSOR.execute(sql_command) columns = [column[0] for column in CURSOR.description] indices = get_indices() constituent = {} current_id = "-1" cursor_row = CURSOR.fetchone() while cursor_row is not None: row = process_cursor_row(cursor_row) (constituent, current_id) = process_constituent_row(constituent, current_id) cursor_row = CURSOR.fetchone() # save last object to elasticsearch save(constituent) else: with open("../data/constituents_altnames.csv", "rb") as csvfile: # Get the query headers to use as keys in the JSON headers = next(csvfile) if headers.startswith(codecs.BOM_UTF8): headers = headers[3:] headers = headers.replace("\r\n", "") columns = headers.split(",") indices = get_indices() rows = csv.reader(csvfile, delimiter=",", quotechar='"') constituent = {} current_id = "-1" for row in rows: (constituent, current_id) = process_constituent_row(constituent, current_id) # save last object to elasticsearch save(constituent) print "Finished Constituents AltNames..."
def process_pub_related_constituents(CURSOR): def get_indices(): indices = { 'pub_id_index' : columns.index('ReferenceID'), 'role_index' : columns.index('Role'), 'constituent_id_index' : columns.index('ConstituentID'), 'constituent_type_id_index' : columns.index('ConstituentTypeID'), 'display_name_index' : columns.index('DisplayName'), 'alpha_sort_index' : columns.index('AlphaSort'), 'display_date_index' : columns.index('DisplayDate'), 'remarks_index' : columns.index('Remarks'), 'thumb_path_index' : columns.index('ThumbPathName'), 'thumb_file_index' : columns.index('ThumbFileName') } return indices def process_pub_row(pub, current_id): pub_id = row[indices['pub_id_index']] if pub_id != current_id: # will likely have multiple rows for one pub because of many related constituents # only get a new pub if we have a new pub id, but first save old pub to elasticsearch save(pub) current_id = pub_id pub = {} if elasticsearch_connection.item_exists(pub_id, 'pubdocs'): pub = elasticsearch_connection.get_item(pub_id, 'pubdocs') else: print "%s could not be found!" % pub_id return(pub, current_id) if 'relateditems' not in pub: pub['relateditems'] = {} constituent_id = row[indices['constituent_id_index']] display_name = row[indices['display_name_index']] display_date = "" if row[indices['display_date_index']] != "NULL": display_date = row[indices['display_date_index']] thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']]) alpha_sort = row[indices['alpha_sort_index']] constituent_dict = {} role = row[indices['role_index']] # update the set of roles for this pub if role not in pub['roles']: pub['roles'].append(role) if role == "Author": pub["authors"].append(alpha_sort) description = row[indices['remarks_index']] if row[indices['remarks_index']] != "NULL" else "" constituent_dict['role'] = role constituent_dict['id'] = constituent_id constituent_dict['displayname'] = display_name constituent_dict['displaydate'] = display_date constituent_dict['displaytext'] = display_name constituent_dict['description'] = description constituent_dict['thumbnail'] = thumbnail_url constituent_type_key = int(row[indices['constituent_type_id_index']]) constituent_type = CONSTITUENTTYPES.get(constituent_type_key) if constituent_type not in pub['relateditems']: pub['relateditems'][constituent_type] = [] pub['relateditems'][constituent_type].append(constituent_dict) # keep the related items sorted pub['relateditems'][constituent_type].sort(key=operator.itemgetter('displaytext')) return(pub, current_id) print "Starting Pub Docs Related Constituents..." 
if CURSOR: sql_command = published_sql.RELATED_CONSTITUENTS CURSOR.execute(sql_command) columns = [column[0] for column in CURSOR.description] indices = get_indices() pub = {} current_id = '-1' cursor_row = CURSOR.fetchone() while cursor_row is not None: row = process_cursor_row(cursor_row) (pub, current_id) = process_pub_row(pub, current_id) cursor_row = CURSOR.fetchone() # save last object to elasticsearch save(pub) else: with open('../data/pubdocs_constituents_related.csv', 'rb') as csvfile: # Get the query headers to use as keys in the JSON headers = next(csvfile) if headers.startswith(codecs.BOM_UTF8): headers = headers[3:] headers = headers.replace('\r\n','') columns = headers.split(',') indices = get_indices() rows = csv.reader(csvfile, delimiter=',', quotechar='"') pub = {} current_id = '-1' for row in rows: (pub, current_id) = process_pub_row(pub, current_id) # save last object to elasticsearch save(pub) print "Finished Pub Docs Related Constituents..."

def process_objects(CURSOR):
    def get_indices():
        indices = {
            'id_index': columns.index('ID'),
            'classification_id_index': columns.index('ClassificationID'),
            'object_number_index': columns.index('Number')
        }
        return indices

    def process_object_row(object, current_id):
        id = row[indices['id_index']]
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)
        object_number = row[indices['object_number_index']]

        #if id not in SAMPLE_OBJECTS:
        #    return (object, current_id)

        # I don't think there are any duplicate rows for objects, but keep it here since it doesn't break anything
        if id != current_id:
            save(object)
            current_id = id
            object = {}

        object['classification'] = classification

        # loop through each column in the current row
        for index, value in enumerate(columns):
            key = value.lower()
            row_value = row[index]

            # cleanup row data
            if row_value.isdigit():
                row_value = int(row_value)
            elif row_value == "NULL":
                row_value = None
            else:
                row_value = row_value.replace(',,', '')

            if 'title' in key:
                object_title = row_value
                if classification == "diarypages" and object_title is None:
                    # derive a title for untitled diary pages from the object number
                    idx = object_number.find('_')
                    object_title = object_number[idx+1:]
                    object[key] = object_title
                else:
                    object[key] = row_value
            else:
                # remove whitespace at end of line if a string
                object[key] = row_value.rstrip() if type(row_value) is str else row_value

        # Add some extra fields not in the TMS data
        object['displaytext'] = object['title']
        prefix_idx = object_number.find('_')
        object['allnumbers'] = [object_number, object_number[prefix_idx+1:]]
        object['roles'] = []
        object['hasphoto'] = "No"

        return (object, current_id)

    print "Starting Objects..."
    if CURSOR:
        sql_command = objects_sql.OBJECTS
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        indices = get_indices()

        object = {}
        current_id = '-1'
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (object, current_id) = process_object_row(object, current_id)
            cursor_row = CURSOR.fetchone()
        # save last object to elasticsearch
        save(object)
    else:
        with open('../data/objects.csv', 'rb') as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace('\r\n', '')
            columns = headers.split(',')
            indices = get_indices()

            rows = csv.reader(csvfile, delimiter=',', quotechar='"')
            object = {}
            current_id = '-1'
            for row in rows:
                (object, current_id) = process_object_row(object, current_id)
            # save last object to elasticsearch
            save(object)

    print "Finished Objects..."

def process_pub_related_media(CURSOR):
    def get_indices():
        indices = {
            'pub_id_index': columns.index('ReferenceID'),
            'thumb_path_index': columns.index('ThumbPathName'),
            'thumb_file_index': columns.index('ThumbFileName'),
            'main_path_index': columns.index('MainPathName'),
            'main_file_index': columns.index('MainFileName')
        }
        return indices

    def process_pub_row(pub, current_id):
        pub_id = row[indices['pub_id_index']]

        if pub_id != current_id:
            save(pub)
            current_id = pub_id
            pub = {}
            if elasticsearch_connection.item_exists(pub_id, 'pubdocs'):
                pub = elasticsearch_connection.get_item(pub_id, 'pubdocs')
            else:
                print "%s could not be found!" % pub_id
                return (pub, current_id)

        thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])
        main_url = get_media_url(row[indices['main_path_index']], row[indices['main_file_index']])

        # note: 'main' reuses the thumbnail URL here; the full-size file is stored separately under 'pdf'
        pub['primarydisplay'] = {
            'thumbnail': thumbnail_url,
            'main': thumbnail_url
        }
        pub['pdf'] = main_url

        return (pub, current_id)

    print "Starting Pub Docs Related Media..."
    if CURSOR:
        sql_command = published_sql.RELATED_MEDIA
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        indices = get_indices()

        pub = {}
        current_id = '-1'
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (pub, current_id) = process_pub_row(pub, current_id)
            cursor_row = CURSOR.fetchone()
        # save last pub to elasticsearch
        save(pub)
    else:
        with open('../data/pubdocs_media_related.csv', 'rb') as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace('\r\n', '')
            columns = headers.split(',')
            indices = get_indices()

            rows = csv.reader(csvfile, delimiter=',', quotechar='"')
            pub = {}
            current_id = '-1'
            for row in rows:
                (pub, current_id) = process_pub_row(pub, current_id)
            # save last pub to elasticsearch
            save(pub)

    print "Finished Pub Docs Related Media..."

def process_object_flexfields(CURSOR):
    def get_indices():
        indices = {
            'object_id_index': columns.index('ObjectID'),
            'classification_id_index': columns.index('ClassificationID'),
            'group_name_index': columns.index('GroupName'),
            'field_name_index': columns.index('UserFieldName'),
            'field_value_index': columns.index('UserFieldValue')
        }
        return indices

    def process_object_row(object, current_id):
        object_id = row[indices['object_id_index']]
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)

        if object_id != current_id:
            # will likely have multiple rows for one object because of many flex fields
            save(object)
            current_id = object_id
            object = {}
            if elasticsearch_connection.item_exists(object_id, classification):
                object = elasticsearch_connection.get_item(object_id, classification)
            else:
                print "%s could not be found!" % object_id
                return (object, current_id)

        if 'flexfields' not in object:
            object['flexfields'] = {}

        groupname = row[indices['group_name_index']]
        if groupname not in object['flexfields']:
            object['flexfields'][groupname] = []

        fieldname = row[indices['field_name_index']]
        fieldvalue = row[indices['field_value_index']]
        object['flexfields'][groupname].append({fieldname: fieldvalue})

        return (object, current_id)

    print "Starting Objects Flex Fields..."
    if CURSOR:
        sql_command = objects_sql.FLEXFIELDS
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        indices = get_indices()

        object = {}
        current_id = '-1'
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (object, current_id) = process_object_row(object, current_id)
            cursor_row = CURSOR.fetchone()
        # save last object to elasticsearch
        save(object)
    else:
        with open('../data/objects_flexfields.csv', 'rb') as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace('\r\n', '')
            columns = headers.split(',')
            indices = get_indices()

            rows = csv.reader(csvfile, delimiter=',', quotechar='"')
            object = {}
            current_id = '-1'
            for row in rows:
                (object, current_id) = process_object_row(object, current_id)
            # save last object to elasticsearch
            save(object)

    print "Finished Objects Flex Fields..."
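
# For reference, a hypothetical object that has passed through process_object_flexfields would
# carry a 'flexfields' block grouped by GroupName, shaped like the sketch below; the group and
# field names are illustrative only, not actual TMS values:
#
#   object['flexfields'] = {
#       'Some Group': [
#           {'Some Field': 'some value'},
#           {'Another Field': 'another value'}
#       ]
#   }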

def process_pub_related_sites(CURSOR):
    def get_indices():
        indices = {
            'id_index': columns.index('ReferenceID'),
            'site_id_index': columns.index('SiteID'),
            'site_name_index': columns.index('SiteName'),
            'site_number_index': columns.index('SiteNumber'),
            'thumb_path_index': columns.index('ThumbPathName'),
            'thumb_file_index': columns.index('ThumbFileName')
        }
        return indices

    def process_pub_row(pub, current_id):
        id = row[indices['id_index']]

        if id != current_id:
            save(pub)
            current_id = id
            pub = {}
            if elasticsearch_connection.item_exists(id, 'pubdocs'):
                pub = elasticsearch_connection.get_item(id, 'pubdocs')
            else:
                print "%s could not be found!" % id
                return (pub, current_id)

        if 'relateditems' not in pub:
            pub['relateditems'] = {}

        site_id = row[indices['site_id_index']]
        site_name = row[indices['site_name_index']]
        site_number = row[indices['site_number_index']]
        thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

        site_dict = {}
        site_dict['id'] = site_id
        site_dict['sitename'] = site_name
        site_dict['sitenumber'] = site_number
        site_dict['displaytext'] = site_number
        site_dict['thumbnail'] = thumbnail_url

        if 'sites' not in pub['relateditems']:
            pub['relateditems']['sites'] = []
        pub['relateditems']['sites'].append(site_dict)
        # keep the related items sorted
        pub['relateditems']['sites'].sort(key=operator.itemgetter('displaytext'))

        return (pub, current_id)

    print "Starting Pub Docs Related Sites..."
    if CURSOR:
        sql_command = published_sql.RELATED_SITES
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        indices = get_indices()

        pub = {}
        current_id = '-1'
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (pub, current_id) = process_pub_row(pub, current_id)
            cursor_row = CURSOR.fetchone()
        # save last pub to elasticsearch
        save(pub)
    else:
        with open('../data/pubdocs_sites_related.csv', 'rb') as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace('\r\n', '')
            columns = headers.split(',')
            indices = get_indices()

            rows = csv.reader(csvfile, delimiter=',', quotechar='"')
            pub = {}
            current_id = '-1'
            for row in rows:
                (pub, current_id) = process_pub_row(pub, current_id)
            # save last pub to elasticsearch
            save(pub)

    print "Finished Pub Docs Related Sites..."

def process_object_related_constituents(CURSOR):
    def get_indices():
        indices = {
            'id_index': columns.index('ID'),
            'role_index': columns.index('Role'),
            'role_id_index': columns.index('RoleID'),
            'constituent_id_index': columns.index('ConstituentID'),
            'constituent_type_id_index': columns.index('ConstituentTypeID'),
            'display_name_index': columns.index('DisplayName'),
            'display_date_index': columns.index('DisplayDate'),
            'classification_id_index': columns.index('ClassificationID'),
            'remarks_index': columns.index('Remarks'),
            'thumb_path_index': columns.index('ThumbPathName'),
            'thumb_file_index': columns.index('ThumbFileName')
        }
        return indices

    def process_object_row(object, current_id):
        id = row[indices['id_index']]
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)

        if id != current_id:
            # may have multiple rows for one object because of many related constituents
            save(object)
            current_id = id
            object = {}
            if elasticsearch_connection.item_exists(id, classification):
                object = elasticsearch_connection.get_item(id, classification)
            else:
                print "%s could not be found!" % id
                return (object, current_id)

        if 'relateditems' not in object:
            object['relateditems'] = {}

        constituent_id = row[indices['constituent_id_index']]
        display_name = row[indices['display_name_index']]
        description = row[indices['remarks_index']] if row[indices['remarks_index']] != "NULL" else ""
        display_date = ""
        if row[indices['display_date_index']] != "NULL":
            display_date = row[indices['display_date_index']]
        thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

        constituent_dict = {}
        role = row[indices['role_index']]
        # update the set of roles for this object
        if role not in object['roles']:
            object['roles'].append(role)

        constituent_dict['role'] = role
        constituent_dict['roleid'] = row[indices['role_id_index']]
        constituent_dict['id'] = constituent_id
        constituent_dict['displayname'] = display_name
        constituent_dict['displaydate'] = display_date
        constituent_dict['displaytext'] = display_name
        constituent_dict['description'] = description
        constituent_dict['thumbnail'] = thumbnail_url

        constituent_type_key = int(row[indices['constituent_type_id_index']])
        constituent_type = CONSTITUENTTYPES.get(constituent_type_key)
        if constituent_type not in object['relateditems']:
            object['relateditems'][constituent_type] = []
        object['relateditems'][constituent_type].append(constituent_dict)
        # keep the related items sorted
        object['relateditems'][constituent_type].sort(key=operator.itemgetter('displaytext'))

        return (object, current_id)

    print "Starting Objects Related Constituents..."
    if CURSOR:
        sql_command = objects_sql.RELATED_CONSTITUENTS
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        indices = get_indices()

        object = {}
        current_id = '-1'
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (object, current_id) = process_object_row(object, current_id)
            cursor_row = CURSOR.fetchone()
        # save last object to elasticsearch
        save(object)
    else:
        with open('../data/objects_constituents_related.csv', 'rb') as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace('\r\n', '')
            columns = headers.split(',')
            indices = get_indices()

            rows = csv.reader(csvfile, delimiter=',', quotechar='"')
            object = {}
            current_id = '-1'
            for row in rows:
                (object, current_id) = process_object_row(object, current_id)
            # save last object to elasticsearch
            save(object)

    print "Finished Objects Related Constituents..."

def process_object_related_media(CURSOR):
    def get_indices():
        indices = {
            'id_index': columns.index('ID'),
            'media_master_id_index': columns.index('MediaMasterID'),
            'classification_id_index': columns.index('ClassificationID'),
            'primary_display_index': columns.index('PrimaryDisplay'),
            'media_type_id_index': columns.index('MediaTypeID'),
            'description_index': columns.index('Description'),
            'caption_index': columns.index('PublicCaption'),
            'thumb_path_index': columns.index('ThumbPathName'),
            'thumb_file_index': columns.index('ThumbFileName'),
            'main_path_index': columns.index('MainPathName'),
            'main_file_index': columns.index('MainFileName')
        }
        return indices

    def process_object_row(object, current_id):
        id = row[indices['id_index']]
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)

        if id != current_id:
            # may have multiple rows for one object because of many related media
            save(object)
            current_id = id
            object = {}
            if elasticsearch_connection.item_exists(id, classification):
                object = elasticsearch_connection.get_item(id, classification)
            else:
                print "%s could not be found!" % id
                return (object, current_id)

        if 'relateditems' not in object:
            object['relateditems'] = {}

        media_type_key = int(row[indices['media_type_id_index']])
        media_type = MEDIATYPES.get(media_type_key)
        media_master_id = row[indices['media_master_id_index']]
        thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])
        main_url = get_media_url(row[indices['main_path_index']], row[indices['main_file_index']])
        display_text = row[indices['caption_index']]

        # this is a bit of a hack because the MediaFormats for videos (in the TMS database)
        # does not correctly identify the type of video, so make sure we only use videos that are mp4s
        if media_type_key == 3:
            if not row[indices['main_file_index']].endswith('mp4'):
                return (object, current_id)

        if media_type not in object['relateditems']:
            object['relateditems'][media_type] = []

        if media_type == 'photos':
            object['hasphoto'] = "Yes"
        # add primary photo as a top level item as well
        if row[indices['primary_display_index']] == '1':
            object['primarydisplay'] = {
                'thumbnail': thumbnail_url,
                'main': main_url,
                'displaytext': display_text
            }
        if not (classification == '3dmodels' and media_type == '3dmodels'):
            object['relateditems'][media_type].append({
                'id': media_master_id,
                'displaytext': display_text,
                'primarydisplay': True if row[indices['primary_display_index']] == '1' else False,
                'thumbnail': thumbnail_url,
                'main': main_url
            })

        return (object, current_id)

    print "Starting Objects Related Media..."
    if CURSOR:
        sql_command = objects_sql.RELATED_MEDIA
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        indices = get_indices()

        object = {}
        current_id = '-1'
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (object, current_id) = process_object_row(object, current_id)
            cursor_row = CURSOR.fetchone()
        # save last object to elasticsearch
        save(object)
    else:
        with open('../data/objects_media_related.csv', 'rb') as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace('\r\n', '')
            columns = headers.split(',')
            indices = get_indices()

            rows = csv.reader(csvfile, delimiter=',', quotechar='"')
            object = {}
            current_id = '-1'
            for row in rows:
                (object, current_id) = process_object_row(object, current_id)
            # save last object to elasticsearch
            save(object)

    print "Finished Objects Related Media..."
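
# For reference, a hypothetical object that has passed through process_object_related_media might
# end up with media entries shaped like the sketch below (all values are illustrative only):
#
#   object['hasphoto'] = "Yes"
#   object['primarydisplay'] = {
#       'thumbnail': 'http://example.org/thumb.jpg',
#       'main': 'http://example.org/full.jpg',
#       'displaytext': 'Some caption'
#   }
#   object['relateditems']['photos'] = [{
#       'id': '12345',
#       'displaytext': 'Some caption',
#       'primarydisplay': True,
#       'thumbnail': 'http://example.org/thumb.jpg',
#       'main': 'http://example.org/full.jpg'
#   }]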

def process_constituents(CURSOR):
    def get_indices():
        indices = {
            'constituent_id_index': columns.index('ID'),
            'type_id_index': columns.index('ConstituentTypeID')
        }
        return indices

    def process_constituent_row(constituent, current_id):
        constituent_id = row[indices['constituent_id_index']]
        type_key = int(row[indices['type_id_index']])
        type = CONSTITUENTTYPES.get(type_key)

        # could have multiple rows
        if constituent_id != current_id:
            save(constituent)
            current_id = constituent_id
            constituent = {}

        constituent['type'] = type

        # loop through each column in the current row
        for index, value in enumerate(columns):
            key = value.lower()
            row_value = row[index]

            # cleanup row data
            if row_value.isdigit():
                row_value = int(row_value)
            elif row_value == "NULL":
                row_value = None
            else:
                row_value = row_value.replace(',,', '')

            if key in ['begindate', 'enddate']:
                if row_value == 0:
                    row_value = None

            constituent[key] = row_value

        display_text = constituent['displayname']
        constituent['displaytext'] = display_text

        return (constituent, current_id)

    print "Starting Constituents..."
    if CURSOR:
        sql_command = constituents_sql.CONSTITUENTS
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        indices = get_indices()

        constituent = {}
        current_id = '-1'
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (constituent, current_id) = process_constituent_row(constituent, current_id)
            cursor_row = CURSOR.fetchone()
        # save last constituent to elasticsearch
        save(constituent)
    else:
        with open('../data/constituents.csv', 'rb') as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace('\r\n', '')
            columns = headers.split(',')
            indices = get_indices()

            rows = csv.reader(csvfile, delimiter=',', quotechar='"')
            constituent = {}
            current_id = '-1'
            for row in rows:
                (constituent, current_id) = process_constituent_row(constituent, current_id)
            # save last constituent to elasticsearch
            save(constituent)

    print "Finished Constituents..."
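
# A minimal sketch (commented out so it does not interfere with the module's real entry point)
# of how these processors could be wired together. The CURSOR setup is hypothetical: pass an open
# TMS database cursor to read directly from the database, or None to fall back to the CSV exports
# in ../data. The base passes (process_objects, process_constituents) must run before the
# *_related_* passes, since those fetch the already-indexed documents back out of elasticsearch
# and extend them; the pub-related passes likewise assume a pubdocs base pass (defined elsewhere)
# has already populated the 'pubdocs' index.
#
# if __name__ == "__main__":
#     CURSOR = None  # or an open TMS database cursor
#     process_objects(CURSOR)
#     process_object_flexfields(CURSOR)
#     process_object_related_constituents(CURSOR)
#     process_object_related_media(CURSOR)
#     process_constituents(CURSOR)
#     process_constituents_related_objects(CURSOR)
#     process_pub_related_constituents(CURSOR)
#     process_pub_related_media(CURSOR)
#     process_pub_related_sites(CURSOR)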