def get_providers():
    """Fetch provider facets from the DPLA API and return them as a list.

    Queries the remote facet endpoint for the provider-name facet and
    flattens every facet's ``terms`` entries into summary dicts.

    Returns:
        list[dict]: one dict per provider term with keys
        ``DPLA_PROVIDER_HEADER_COUNT`` (item count) and
        ``DPLA_PROVIDER_HEADER_NAME`` (provider name).

    Side effects:
        Prints one "Provider ... contributes ... items." line per provider.
    """
    api_key = dpla_config.API_KEY
    query_terms = {DPLA_FACET: DPLA_PROVIDER_HEADER_NAME}
    facet_response = dpla_utils.dpla_fetch_facets_remote(
        api_key, **query_terms)

    provider_result_list = []
    # Traverse each facet sub-array; the keys themselves are not needed,
    # only each value's 'terms' list of {'term': ..., 'count': ...} entries.
    for facet_value in facet_response.values():
        for sub_facet_response in facet_value['terms']:
            single_facet_response = {
                DPLA_PROVIDER_HEADER_COUNT: sub_facet_response['count'],
                DPLA_PROVIDER_HEADER_NAME: sub_facet_response['term']
            }
            # question: How many items are contributed by each provider?
            print("Provider ", sub_facet_response['term'], " contributes ",
                  sub_facet_response['count'], " items.")
            provider_result_list.append(single_facet_response)
    return provider_result_list
def retrieve_facets_by_provider():
    """Fetch the provider-name and provider-id facet response from DPLA.

    Returns:
        The raw facet response from ``dpla_utils.dpla_fetch_facets_remote``.
    """
    facet_fields = ",".join((DPLA_PROP_providerName, DPLA_PROP_providerId))
    facet_query = {DPLA_API_QUERY_facets: facet_fields}
    return dpla_utils.dpla_fetch_facets_remote(
        api_key=config.API_KEY, **facet_query)
def get_provider_data():
    """Summarize each DPLA provider: name, item count, dataProvider count.

    For every provider-id facet term, fetches the dataProvider and
    providerName facets restricted to that provider and builds one summary
    record per provider.

    Returns:
        list[dict]: records keyed by ``DPLA_PROP_providerName``,
        ``CCD_PROP_LBL_itemCount`` and ``CCD_PROP_LBL_dataProviderCount``.
    """
    provider_summaries = []
    provider_facets = retrieve_facets_by_provider()

    for provider in provider_facets[DPLA_PROP_providerId][DPLA_API_PROP_terms]:
        # Fetch dataProvider + providerName facets restricted to this
        # provider id.
        facet_query = {
            DPLA_API_QUERY_facets:
                DPLA_PROP_dataProvider + "," + DPLA_PROP_providerName,
            DPLA_PROP_providerId: provider[DPLA_API_PROP_term],
        }
        facet_resp = dpla_utils.dpla_fetch_facets_remote(
            api_key=config.API_KEY, **facet_query)

        name_terms = facet_resp[DPLA_PROP_providerName][DPLA_API_PROP_terms]
        data_provider_terms = (
            facet_resp[DPLA_PROP_dataProvider][DPLA_API_PROP_terms])

        provider_summaries.append({
            # Provider display name comes from the first providerName term.
            DPLA_PROP_providerName: name_terms[0][DPLA_API_PROP_term],
            # Number of items from this provider.
            CCD_PROP_LBL_itemCount: provider[DPLA_API_PROP_count],
            # Number of dataProviders contributing through this provider.
            CCD_PROP_LBL_dataProviderCount: len(data_provider_terms),
        })
    return provider_summaries
def getAllProviders(base_dpla_filename):
    """Dump a {provider name: item count} mapping to *base_dpla_filename*.

    Queries the DPLA provider-name facet, builds the mapping, writes it as
    JSON, then terminates the interpreter.

    Args:
        base_dpla_filename: path of the JSON output file.

    NOTE(review): this function ends with ``sys.exit(0)``, so callers never
    regain control on success — kept as-is for backward compatibility.
    """
    query_provider_facets = {DPLA_API_QUERY_facets: DPLA_PROP_providerName}
    response_provider_facets = dpla_utils.dpla_fetch_facets_remote(
        api_key=config.API_KEY, **query_provider_facets)

    # Map provider name -> item count from the facet terms.
    provider_dict = {
        provider[DPLA_API_PROP_term]: provider[DPLA_API_PROP_count]
        for provider in response_provider_facets[DPLA_PROP_providerName][
            DPLA_API_PROP_terms]
    }

    # save the base dpla data to file; the with-block guarantees the handle
    # is closed even if json.dumps or write raises.
    with open(base_dpla_filename, 'w') as base_dpla_data_file:
        base_dpla_data_file.write(json.dumps(provider_dict))
    sys.exit(0)
def profile_dpla(base_dpla_filename):
    """Profile the DPLA at the top level and write results to JSON files.

    Collects the analysis date/time, item totals, missing-item counts and
    provider counts from the DPLA facet API, writes one JSON file per
    provider (named after the provider) plus the base dpla-level JSON file
    *base_dpla_filename*, then terminates the process via ``sys.exit(0)``.

    NOTE(review): everything after the ``sys.exit(0)`` call below is
    unreachable, and it references names (``collection_id``, ``filename``,
    ``collection_only``) that are never defined in this function. It looks
    like the ``def`` line of a separate collection-profiling function was
    lost when this file was assembled — TODO: confirm against version
    history and restore the missing definition.
    """
    # create base object to store the base dpla-level data
    CCD_OBJ_dpla_parent = {}
    # create the admin details object (records when this analysis ran)
    CCD_OBJ_adminDetails = {}
    CCD_OBJ_adminDetails[CCD_PROP_LBL_analysisDate] = time.strftime("%d/%m/%Y")
    CCD_OBJ_adminDetails[CCD_PROP_LBL_analysisTime] = time.strftime("%H:%M:%S")
    # add adminDetails to the root json structure in the dpla base data file
    CCD_OBJ_dpla_parent[CCD_OBJ_LBL_adminDetails] = CCD_OBJ_adminDetails
    # create the object to store the dpla-level data
    CCD_OBJ_dplaData = {}
    # retrieve the number of items in collections
    query_items_in_collections = {
        DPLA_API_QUERY_facets: DPLA_PROP_collectionId
    }
    response_provider_facets = dpla_utils.dpla_fetch_facets_remote(
        api_key=config.API_KEY, **query_items_in_collections)
    CCD_OBJ_dplaData[CCD_PROP_LBL_itemsInCollections] = \
        response_provider_facets[DPLA_PROP_collectionId][DPLA_API_PROP_total]
    # retrieve from the DPLA and store provider-related data
    query_provider_facets = {DPLA_API_QUERY_facets: DPLA_PROP_providerName
                             + "," + DPLA_PROP_providerId}
    response_provider_facets = dpla_utils.dpla_fetch_facets_remote(
        api_key=config.API_KEY, **query_provider_facets)
    # retrieve count of total DPLA items by name and id
    CCD_OBJ_dplaData[CCD_PROP_LBL_itemCountByName] = \
        response_provider_facets[DPLA_PROP_providerName][DPLA_API_PROP_total]
    CCD_OBJ_dplaData[CCD_PROP_LBL_itemCountById] = \
        response_provider_facets[DPLA_PROP_providerId][DPLA_API_PROP_total]
    # retrieve missing items by name and id
    CCD_OBJ_dplaData[CCD_PROP_LBL_missingItemsByName] = \
        response_provider_facets[DPLA_PROP_providerName][DPLA_API_PROP_missing]
    CCD_OBJ_dplaData[CCD_PROP_LBL_missingItemsById] = \
        response_provider_facets[DPLA_PROP_providerId][DPLA_API_PROP_missing]
    # number of distinct providers by name and by id (the original comment
    # here said "missing items", which did not match the code)
    CCD_OBJ_dplaData[CCD_PROP_LBL_providerCountByName] = \
        len(response_provider_facets[DPLA_PROP_providerName]
            [DPLA_API_PROP_terms])
    CCD_OBJ_dplaData[CCD_PROP_LBL_providerCountById] = \
        len(response_provider_facets[DPLA_PROP_providerId]
            [DPLA_API_PROP_terms])
    # for each provider, retrieve and process the data
    for provider in \
            response_provider_facets[DPLA_PROP_providerId][DPLA_API_PROP_terms]:
        # create object to store provider data
        CCD_OBJ_provider_parent = {}
        # retrieve dataProvider details restricted to this provider id
        query_dataProvider_facets = {DPLA_API_QUERY_facets:
                                     DPLA_PROP_dataProvider + "," +
                                     DPLA_PROP_providerName,
                                     DPLA_PROP_providerId:
                                     provider[DPLA_API_PROP_term]}
        response_dataProvider_facets = dpla_utils.dpla_fetch_facets_remote(
            api_key=config.API_KEY, **query_dataProvider_facets)
        # number of items from this provider
        CCD_OBJ_provider_parent[CCD_PROP_LBL_itemCount] = \
            provider[DPLA_API_PROP_count]
        # number of data providers providing data through this provider
        CCD_OBJ_provider_parent[CCD_PROP_LBL_dataProviderCount] = len(
            response_dataProvider_facets[DPLA_PROP_dataProvider]
            [DPLA_API_PROP_terms])
        pprint(CCD_OBJ_provider_parent)
        # save this provider's data to a file named after the provider,
        # with spaces replaced by dashes
        provider_name = response_dataProvider_facets[DPLA_PROP_providerName][
            DPLA_API_PROP_terms][0][DPLA_API_PROP_term].replace(' ', '-')
        provider_filename = provider_name + FILE_SUFFIX_provider_data
        provider_data_file = open(provider_filename, 'w')
        provider_data_file.write(json.dumps(CCD_OBJ_provider_parent))
        provider_data_file.close()
    # add dpla-level data to the root json structure
    CCD_OBJ_dpla_parent[CCD_OBJ_LBL_dplaData] = CCD_OBJ_dplaData
    # save the base dpla data to file
    base_dpla_data_file = open(base_dpla_filename, 'w')
    base_dpla_data_file.write(json.dumps(CCD_OBJ_dpla_parent))
    base_dpla_data_file.close()
    sys.exit(0)

    # ------------------------------------------------------------------
    # NOTE(review): UNREACHABLE from here on (sys.exit above). This code
    # references collection_id / filename / collection_only, which are not
    # defined in this function — presumably the body of a lost
    # collection-profiling function. TODO: confirm and restore the
    # original `def` line.
    # ------------------------------------------------------------------
    """
    :rtype : object
    """
    id_list = []
    id_duplicated = []
    condition = {
        'sourceResource.collection.id': collection_id,
        'fields': 'sourceResource.date.displayDate'
    }
    count_item = dpla_utils.dpla_get_count(config.CP_REMOTE,
                                           api_key=config.API_KEY,
                                           **condition)
    print("Total item:", count_item)
    page_size = 500
    total_num_pages = int(count_item / page_size) + 1
    print('Total Pages:', total_num_pages)
    # define the collection that will hold the output object
    collection = collections.OrderedDict()
    sample_collection = dpla_utils.dpla_get_collection_info(
        config.CP_REMOTE,
        collection_id=collection_id,
        api_key=config.API_KEY,
    )
    collection.update({'collection': sample_collection})
    # per-field "volume" counters (presumably total value occurrences per
    # field across the collection — the code that summed them is commented
    # out below; verify before relying on this)
    collection_volume_title = 0
    collection_volume_subject = 0
    collection_volume_displaydate = 0
    collection_volume_language = 0
    collection_volume_creator = 0
    collection_volume_publisher = 0
    collection_volume_spatialname = 0
    collection_volume_spatialcoords = 0
    collection_volume_rights = 0
    collection_volume_description = 0
    collection_volume_provider = 0
    collection_volume_dataprovider = 0
    collection_volume_format = 0
    collection_volume_type = 0
    # counter for collection
    collection_volume_collection = 0
    # per-field "usage" counters (presumably how many items use each field;
    # the summing code is also commented out below)
    collection_usage_title = 0
    collection_usage_subject = 0
    collection_usage_displaydate = 0
    collection_usage_language = 0
    collection_usage_creator = 0
    collection_usage_publisher = 0
    collection_usage_spatialname = 0
    collection_usage_spatialcoords = 0
    collection_usage_rights = 0
    collection_usage_description = 0
    collection_usage_provider = 0
    collection_usage_dataprovider = 0
    collection_usage_format = 0
    collection_usage_type = 0
    # counter for usage of collection
    collection_usage_collection = 0
    item_detail = []
    # process items page by page: fetch page_size items per request and
    # record each returned doc in id_list
    for i in range(1, total_num_pages + 1, 1):
        print('processing page', i)
        condition['page_size'] = page_size
        condition['page'] = i
        dpla_response = dpla_utils.dpla_fetch(api_key=config.API_KEY,
                                              count=1000,
                                              **condition)
        docs = dpla_response
        for doc in docs:
            id_list.append(doc)
            # NOTE(review): a large block of per-item aggregation code was
            # commented out here (item_volume construction per field,
            # spatial/displayDate handling, the collection_volume_* and
            # collection_usage_* sums, the collection_volume /
            # collection_usage result dicts, and the collMetadataDetail /
            # itemDetail assembly). Only the id collection above remains
            # active. See version history for the full implementation.
    # write the json form of the data to a local file named by the user
    dest_file = open(filename, 'w')
    dest_file.write(json.dumps(id_list))
    dest_file.close()
def retrieve_facets_by_collection_id():
    """Fetch the collection-id facet response from the remote DPLA endpoint.

    Returns:
        The raw facet response from ``dpla_utils.dpla_fetch_facets_remote``.
    """
    collection_facet_query = {DPLA_API_QUERY_facets: DPLA_PROP_collectionId}
    return dpla_utils.dpla_fetch_facets_remote(
        api_key=config.API_KEY, **collection_facet_query)
def set_dataProvider_list(): api_key = dpla_config.API_KEY # read input file and folder parser = argparse.ArgumentParser() parser.add_argument( '-i', action='append', dest='provider_input_file', help= "A path of input provider file generated by set_provider_list.py is needed" ) parser.add_argument( '-e', action='append', dest='dataProvider_folder', help= "A folder for storing dataProvider files is needed, the folder must exist" ) results = parser.parse_args() provider_input_file = results.provider_input_file[0] folder = results.dataProvider_folder[0] # append / into folder path if not folder.endswith("/"): folder = folder + "/" try: #open input file with open(provider_input_file) as input_file: provider_csv_reader = csv.reader(input_file) #check input header input_header = next(provider_csv_reader, None) if input_header != EXPECTED_INPUT_HEADER: print("input header doesn`t match") sys.exit() dataProvider_count = 0 #print("************* Question 7: How many items are contributed by each dataProvider? **********") for provider_information_row in provider_csv_reader: #query dataProvider by faceting provider.name current_provider_name = provider_information_row[0] query_terms = {PROVIDER_NAME: current_provider_name} query_terms['facets'] = DPLA_DATAPROVIDER dataProvider_query_response = dpla_utils.dpla_fetch_facets_remote( api_key, **query_terms) dataprovider_result_list = dataProvider_query_response[ 'dataProvider']['terms'] new_dataProvider_result_list = [] for index in range(len(dataprovider_result_list)): current_dataProvider_result_list_entry = dataprovider_result_list[ index] new_dataProvider_result_list_entry = { DATAPROVIDER_HEADER_NAME: current_dataProvider_result_list_entry['term'], DATAPROVIDER_HEADER_COUNT: current_dataProvider_result_list_entry['count'] } new_dataProvider_result_list.append( new_dataProvider_result_list_entry) # Question: How many items are contributed by each dataProvider? 
#print("dataProvider ", dataprovider_result_list['term'], " contributes ", dataprovider_result_list['term'], " items.") dataProvider_count += len(new_dataProvider_result_list) try: # process current_provider_name current_provider_name = current_provider_name.replace( ' ', '_', 256) # write dataProvider information into files #print(folder + current_provider_name + DATAPROVIDER_FILE_SUFFLIX) with open( folder + current_provider_name + DATAPROVIDER_FILE_SUFFLIX, 'w') as dataProvider_file: #header = ['term', 'count'] # put in begining header = [ DATAPROVIDER_HEADER_NAME, DATAPROVIDER_HEADER_COUNT ] dataProvider_csv_writer = csv.DictWriter( dataProvider_file, header) dataProvider_csv_writer.writeheader() dataProvider_csv_writer.writerows( new_dataProvider_result_list) #header = [DATAPROVIDER_HEADER_COUNT, DATAPROVIDER_HEADER_NAME] #dataProvider_csv_writer.writeheader() dataProvider_file.close() except IOError as folder_error: print("couldn`t create file in this folder") sys.exit() print("There are " + str(dataProvider_count) + " dataProviders contribute data for DPLA") input_file.close() except IOError as input_file_error: print("couldn`t read the input file ") #process provider list, add dataProvider_path column process_provider_list(provider_input_file, folder)