Example #1
0
def get_providers():
    """Fetch the provider facet from the DPLA API and report per-provider counts.

    Calls ``dpla_utils.dpla_fetch_facets_remote`` with ``dpla_config.API_KEY``
    and the query ``{DPLA_FACET: DPLA_PROVIDER_HEADER_NAME}``, prints one
    summary line per facet term, and collects the terms into a list.

    Returns:
        list: one dict per facet term, mapping ``DPLA_PROVIDER_HEADER_COUNT``
        to the term's ``'count'`` and ``DPLA_PROVIDER_HEADER_NAME`` to the
        term's ``'term'``.
    """
    api_key = dpla_config.API_KEY
    query_terms = {DPLA_FACET: DPLA_PROVIDER_HEADER_NAME}
    facet_response = dpla_utils.dpla_fetch_facets_remote(
        api_key, **query_terms)
    provider_result_list = []

    # Every top-level value of the response is expected to carry a 'terms'
    # list; the top-level keys themselves are not needed.
    for facet in facet_response.values():
        for sub_facet_response in facet['terms']:
            single_facet_response = {
                DPLA_PROVIDER_HEADER_COUNT: sub_facet_response['count'],
                DPLA_PROVIDER_HEADER_NAME: sub_facet_response['term']
            }

            # question: How many items are contributed by each provider?
            print("Provider ", sub_facet_response['term'], " contributes ",
                  sub_facet_response['count'], " items.")
            provider_result_list.append(single_facet_response)

    return provider_result_list
Example #2
0
def retrieve_facets_by_provider():
    """Request the provider-name and provider-id facets in a single call.

    The two facet field names are joined with a comma and passed under the
    ``DPLA_API_QUERY_facets`` query parameter.

    Returns:
        The value returned by ``dpla_utils.dpla_fetch_facets_remote``.
    """
    facet_fields = ",".join((DPLA_PROP_providerName, DPLA_PROP_providerId))
    return dpla_utils.dpla_fetch_facets_remote(
        api_key=config.API_KEY,
        **{DPLA_API_QUERY_facets: facet_fields})
Example #3
0
def get_provider_data():
    """Build one summary record per provider listed in the provider-id facet.

    For every term of the provider-id facet returned by
    ``retrieve_facets_by_provider``, a second facet query (data provider and
    provider name, filtered by that provider id) is issued. Each record holds
    the provider name, the provider's item count, and the number of
    data-provider terms returned by the second query.

    Returns:
        list: the per-provider dicts, in the order the facet terms arrive.
    """
    provider_terms = retrieve_facets_by_provider()[DPLA_PROP_providerId][
        DPLA_API_PROP_terms]

    all_provider_records = []
    for provider in provider_terms:
        # Facet on dataProvider and provider name, restricted to this provider id.
        detail_query = {
            DPLA_API_QUERY_facets:
            DPLA_PROP_dataProvider + "," + DPLA_PROP_providerName,
            DPLA_PROP_providerId: provider[DPLA_API_PROP_term],
        }
        detail_facets = dpla_utils.dpla_fetch_facets_remote(
            api_key=config.API_KEY, **detail_query)

        name_terms = detail_facets[DPLA_PROP_providerName][DPLA_API_PROP_terms]
        record = {
            # provider name: taken from the first provider-name facet term
            DPLA_PROP_providerName: name_terms[0][DPLA_API_PROP_term],
            # number of items from this provider
            CCD_PROP_LBL_itemCount: provider[DPLA_API_PROP_count],
            # number of data providers providing data through this provider
            CCD_PROP_LBL_dataProviderCount: len(
                detail_facets[DPLA_PROP_dataProvider][DPLA_API_PROP_terms]),
        }
        all_provider_records.append(record)

    return all_provider_records
Example #4
0
def getAllProviders(base_dpla_filename):
    """Write a JSON map of provider name to item count, then exit the process.

    Queries the provider-name facet through
    ``dpla_utils.dpla_fetch_facets_remote`` and maps each facet term to its
    count.

    Args:
        base_dpla_filename: path of the file to (over)write with the JSON map.

    Raises:
        SystemExit: always, with status 0, once the file has been written.
    """
    query_provider_facets = {DPLA_API_QUERY_facets: DPLA_PROP_providerName}

    response_provider_facets = dpla_utils.dpla_fetch_facets_remote(
        api_key=config.API_KEY, **query_provider_facets)

    provider_dict = {
        provider[DPLA_API_PROP_term]: provider[DPLA_API_PROP_count]
        for provider in response_provider_facets[DPLA_PROP_providerName][
            DPLA_API_PROP_terms]
    }

    # Serialise before opening so a serialisation error cannot leave an
    # existing file truncated; the context manager closes the handle even
    # if the write fails.
    serialized = json.dumps(provider_dict)
    with open(base_dpla_filename, 'w') as base_dpla_data_file:
        base_dpla_data_file.write(serialized)

    sys.exit(0)
def profile_dpla(base_dpla_filename):
    """Profile DPLA-level and per-provider facet data and save it as JSON.

    Three kinds of facet queries are issued through
    ``dpla_utils.dpla_fetch_facets_remote``:

    * the collection-id facet, for the items-in-collections total;
    * the provider-name and provider-id facets, for totals, missing counts
      and the number of provider terms;
    * one data-provider facet query per provider id, whose summary is
      pretty-printed and written to its own file. The file name is the
      provider name with spaces replaced by ``-`` plus
      ``FILE_SUFFIX_provider_data``, relative to the working directory.

    Args:
        base_dpla_filename: path of the file that receives the DPLA-level
            JSON (admin details plus DPLA data).

    Raises:
        SystemExit: always, with status 0, once all files have been written.
    """
    # create base object to store the base dpla-level data
    CCD_OBJ_dpla_parent = {}

    # create the admin details object; read the clock once so the recorded
    # date and time cannot fall on either side of midnight
    analysis_moment = time.localtime()
    CCD_OBJ_adminDetails = {
        CCD_PROP_LBL_analysisDate: time.strftime("%d/%m/%Y", analysis_moment),
        CCD_PROP_LBL_analysisTime: time.strftime("%H:%M:%S", analysis_moment),
    }

    # add adminDetails to the root json structure in the dpla base data file
    CCD_OBJ_dpla_parent[CCD_OBJ_LBL_adminDetails] = CCD_OBJ_adminDetails

    # create the object to store the dpla-level data
    CCD_OBJ_dplaData = {}

    # retrieve the number of items in collections
    query_items_in_collections = {
        DPLA_API_QUERY_facets: DPLA_PROP_collectionId
    }
    response_provider_facets = dpla_utils.dpla_fetch_facets_remote(
        api_key=config.API_KEY, **query_items_in_collections)

    CCD_OBJ_dplaData[CCD_PROP_LBL_itemsInCollections] = \
        response_provider_facets[DPLA_PROP_collectionId][DPLA_API_PROP_total]

    # retrieve from the DPLA and store provider-related data
    query_provider_facets = {
        DPLA_API_QUERY_facets:
        DPLA_PROP_providerName + "," + DPLA_PROP_providerId
    }
    response_provider_facets = dpla_utils.dpla_fetch_facets_remote(
        api_key=config.API_KEY, **query_provider_facets)

    facets_by_name = response_provider_facets[DPLA_PROP_providerName]
    facets_by_id = response_provider_facets[DPLA_PROP_providerId]

    # retrieve count of total DPLA items by name and id
    CCD_OBJ_dplaData[CCD_PROP_LBL_itemCountByName] = \
        facets_by_name[DPLA_API_PROP_total]
    CCD_OBJ_dplaData[CCD_PROP_LBL_itemCountById] = \
        facets_by_id[DPLA_API_PROP_total]

    # retrieve missing items by name and id
    CCD_OBJ_dplaData[CCD_PROP_LBL_missingItemsByName] = \
        facets_by_name[DPLA_API_PROP_missing]
    CCD_OBJ_dplaData[CCD_PROP_LBL_missingItemsById] = \
        facets_by_id[DPLA_API_PROP_missing]

    # count the provider terms returned by name and by id
    CCD_OBJ_dplaData[CCD_PROP_LBL_providerCountByName] = \
        len(facets_by_name[DPLA_API_PROP_terms])
    CCD_OBJ_dplaData[CCD_PROP_LBL_providerCountById] = \
        len(facets_by_id[DPLA_API_PROP_terms])

    # for each provider, retrieve and process the data
    for provider in facets_by_id[DPLA_API_PROP_terms]:

        # create object to store provider data
        CCD_OBJ_provider_parent = {}

        # retrieve dataProvider details for this provider id
        query_dataProvider_facets = {
            DPLA_API_QUERY_facets:
            DPLA_PROP_dataProvider + "," + DPLA_PROP_providerName,
            DPLA_PROP_providerId: provider[DPLA_API_PROP_term],
        }
        response_dataProvider_facets = dpla_utils.dpla_fetch_facets_remote(
            api_key=config.API_KEY, **query_dataProvider_facets)

        # number of items from this provider
        CCD_OBJ_provider_parent[CCD_PROP_LBL_itemCount] = \
            provider[DPLA_API_PROP_count]

        # number of data providers providing data through this provider
        CCD_OBJ_provider_parent[CCD_PROP_LBL_dataProviderCount] = len(
            response_dataProvider_facets[DPLA_PROP_dataProvider]
            [DPLA_API_PROP_terms])

        pprint(CCD_OBJ_provider_parent)

        # save the provider data to its own file; the name comes from the
        # first provider-name facet term, with spaces replaced by '-'
        provider_name = response_dataProvider_facets[DPLA_PROP_providerName][
            DPLA_API_PROP_terms][0][DPLA_API_PROP_term].replace(' ', '-')
        provider_filename = provider_name + FILE_SUFFIX_provider_data
        provider_payload = json.dumps(CCD_OBJ_provider_parent)
        with open(provider_filename, 'w') as provider_data_file:
            provider_data_file.write(provider_payload)

    # add dpla-level data to the root json structure
    CCD_OBJ_dpla_parent[CCD_OBJ_LBL_dplaData] = CCD_OBJ_dplaData

    # save the base dpla data to file; serialise first so a failure cannot
    # leave a truncated file behind
    base_payload = json.dumps(CCD_OBJ_dpla_parent)
    with open(base_dpla_filename, 'w') as base_dpla_data_file:
        base_dpla_data_file.write(base_payload)

    sys.exit(0)
    """
    :rtype : object

    """
    id_list = []
    id_duplicated = []
    condition = {
        'sourceResource.collection.id': collection_id,
        'fields': 'sourceResource.date.displayDate'
    }
    count_item = dpla_utils.dpla_get_count(config.CP_REMOTE,
                                           api_key=config.API_KEY,
                                           **condition)
    print("Total item:", count_item)
    page_size = 500

    total_num_pages = int(count_item / page_size) + 1
    print('Total Pages:', total_num_pages)
    # define the collection that will hold the output object
    collection = collections.OrderedDict()
    sample_collection = dpla_utils.dpla_get_collection_info(
        config.CP_REMOTE,
        collection_id=collection_id,
        api_key=config.API_KEY,
    )
    collection.update({'collection': sample_collection})
    # data_provider_info = {"dataProvider": sample_doc["dataProvider"], "provider": sample_doc["provider"]}
    # collection['collection'].update(data_provider_info)
    # define the counter for usage
    collection_volume_title = 0
    collection_volume_subject = 0
    collection_volume_displaydate = 0
    collection_volume_language = 0
    collection_volume_creator = 0
    collection_volume_publisher = 0
    collection_volume_spatialname = 0
    collection_volume_spatialcoords = 0
    collection_volume_rights = 0
    collection_volume_description = 0
    collection_volume_provider = 0
    collection_volume_dataprovider = 0
    collection_volume_format = 0
    collection_volume_type = 0
    # add counter for collection
    collection_volume_collection = 0
    # define the counter for usage
    collection_usage_title = 0
    collection_usage_subject = 0
    collection_usage_displaydate = 0
    collection_usage_language = 0
    collection_usage_creator = 0
    collection_usage_publisher = 0
    collection_usage_spatialname = 0
    collection_usage_spatialcoords = 0
    collection_usage_rights = 0
    collection_usage_description = 0
    collection_usage_provider = 0
    collection_usage_dataprovider = 0
    collection_usage_format = 0
    collection_usage_type = 0
    # add counter for usage of collection
    collection_usage_collection = 0
    item_detail = []
    # process items, in each time, get 500 items, process them and drop them
    for i in range(1, total_num_pages + 1, 1):
        print('processing page', i)
        condition['page_size'] = page_size
        condition['page'] = i
        dpla_response = dpla_utils.dpla_fetch(api_key=config.API_KEY,
                                              count=1000,
                                              **condition)
        docs = dpla_response

        for doc in docs:
            id_list.append(doc)

    #             sourceresource = doc['sourceResource']
    #             # compose item_volume section
    #             # add collection information
    #             item_volume = dict(title=get_item_volume('title', sourceresource),
    #                                format=get_item_volume('format', sourceresource),
    #                                type=get_item_volume('type', sourceresource),
    #                                subject=get_item_volume('subject', sourceresource),
    #                                rights=get_item_volume('rights', sourceresource),
    #                                language=get_item_volume('language', sourceresource),
    #                                creator=get_item_volume('creator', sourceresource),
    #                                publisher=get_item_volume('publisher', sourceresource),
    #                                description=get_item_volume('description', sourceresource),
    #                                collection=get_item_volume('collection', sourceresource),
    #                                provider=get_item_volume('provider', doc),
    #                                dataProvider=get_item_volume('dataProvider', doc))
    #             # add spatial to item_volume
    #             if 'spatial' in sourceresource:
    #                 spatials = sourceresource['spatial']
    #                 item_volume['spatialName'] = 0
    #                 item_volume['spatialCoords'] = 0
    #                 for spatial in spatials:
    #                     item_volume['spatialName'] += get_item_volume('name', spatial)
    #                     item_volume['spatialCoords'] += get_item_volume('coordinates', spatial)
    #             else:
    #                 item_volume['spatialName'] = 0
    #                 item_volume['spatialCoords'] = 0
    #
    #             # add displaydate to item_volume
    #             if 'date' in sourceresource:
    #                 date = sourceresource['date']
    #                 item_volume['displayDate'] = get_item_volume('displayDate', date)
    #
    #
    #             else:
    #                 item_volume['displayDate'] = 0
    #             # commented by unmil to improve code run time
# # `if item_volume['displayDate']<1:
#             #    print (item_volume['displayDate'])
#             item['itemVolume'] = item_volume
#             item_detail.append(item)
#             # sum each status
#             collection_volume_title = collection_volume_title + item['itemVolume']['title']
#             collection_volume_subject = collection_volume_subject + item['itemVolume']['subject']
#             collection_volume_displaydate = collection_volume_displaydate + item['itemVolume']['displayDate']
#             collection_volume_language = collection_volume_language + item['itemVolume']['language']
#             collection_volume_creator = collection_volume_creator + item['itemVolume']['creator']
#             collection_volume_publisher = collection_volume_publisher + item['itemVolume']['publisher']
#             collection_volume_rights = collection_volume_rights + item['itemVolume']['rights']
#             collection_volume_spatialname = collection_volume_spatialname + item['itemVolume']['spatialName']
#             collection_volume_spatialcoords = collection_volume_spatialcoords + item['itemVolume']['spatialCoords']
#             collection_volume_description = collection_volume_description + item['itemVolume']['description']
#             collection_volume_provider = collection_volume_provider + item['itemVolume']['provider']
#             collection_volume_dataprovider = collection_volume_dataprovider + item['itemVolume']['dataProvider']
#             collection_volume_format = collection_volume_format + item['itemVolume']['format']
#             collection_volume_type = collection_volume_type + item['itemVolume']['type']
#             # add collection
#             collection_volume_collection = collection_volume_collection + item['itemVolume']['collection']
#             # sum usage information
#             collection_usage_title += get_usage(item['itemVolume']['title'])
#             collection_usage_subject += get_usage(item['itemVolume']['subject'])
#             collection_usage_displaydate += get_usage(item['itemVolume']['displayDate'])
#             collection_usage_language += get_usage(item['itemVolume']['language'])
#             collection_usage_creator += get_usage(item['itemVolume']['creator'])
#             collection_usage_publisher += get_usage(item['itemVolume']['publisher'])
#             collection_usage_rights += get_usage(item['itemVolume']['rights'])
#             collection_usage_spatialcoords += get_usage(item['itemVolume']['spatialName'])
#             collection_usage_spatialname += get_usage(item['itemVolume']['spatialName'])
#             collection_usage_description += get_usage(item['itemVolume']['description'])
#             collection_usage_provider += get_usage(item['itemVolume']['provider'])
#             collection_usage_dataprovider += get_usage(item['itemVolume']['dataProvider'])
#             collection_usage_format += get_usage(item['itemVolume']['format'])
#             collection_usage_type += get_usage(item['itemVolume']['type'])
#             # add collection
#             collection_usage_collection += get_usage(collection_volume_collection)
#
# collection_volume = {
#     'title': collection_volume_title,
#     'format': collection_volume_format,
#     'type': collection_volume_type,
#     'subject': collection_volume_subject,
#     'rights': collection_volume_rights,
#     'language': collection_volume_language,
#     'creator': collection_volume_creator,
#     'publisher': collection_volume_publisher,
#     'description': collection_volume_description,
#     'provider': collection_volume_provider,
#     'dataProvider': collection_volume_dataprovider,
#     'displayDate': collection_volume_displaydate,
#     'spatialCoords': collection_volume_spatialcoords,
#     'spatialName': collection_volume_spatialname,
#     'collection': collection_volume_collection
# }
# collection_usage = {
#     'title': collection_usage_title,
#     'format': collection_usage_format,
#     'type': collection_usage_type,
#     'subject': collection_usage_subject,
#     'rights': collection_usage_rights,
#     'language': collection_usage_language,
#     'creator': collection_usage_creator,
#     'publisher': collection_usage_publisher,
#     'description': collection_usage_description,
#     'provider': collection_usage_provider,
#     'dataProvider': collection_usage_dataprovider,
#     'displayDate': collection_usage_displaydate,
#     'spatialCoords': collection_usage_spatialcoords,
#     'spatialName': collection_usage_spatialname,
#     'collection': collection_usage_collection
# }
#
# coll_metadata_detail = collections.OrderedDict()
# coll_metadata_detail['itemCount'] = len(item_detail)
# coll_metadata_detail['dateProfiled'] = time.strftime("%d/%m/%Y")
# if config.CP_REMOTE:
#     coll_metadata_detail['Source'] = "Remote"
# else:
#     coll_metadata_detail['Source'] = 'Local'
#
# coll_metadata_detail['collectionVolume'] = collection_volume
# coll_metadata_detail['collectionUsage'] = collection_usage
#
# # put the metadataDetail to the collection
# collection['collMetadataDetail'] = coll_metadata_detail
# collection['duplicated_item'] = id_duplicated
# # put itemDetail to collection
# if not collection_only:
#     collection['itemDetail'] = item_detail
# write the JSON form of the data to a local file named by the user
    dest_file = open(filename, 'w')
    dest_file.write(json.dumps(id_list))
    dest_file.close()
Example #6
0
def retrieve_facets_by_collection_id():
    """Request the collection-id facet from the DPLA API.

    Returns:
        The value returned by ``dpla_utils.dpla_fetch_facets_remote`` for the
        query ``{DPLA_API_QUERY_facets: DPLA_PROP_collectionId}``.
    """
    return dpla_utils.dpla_fetch_facets_remote(
        api_key=config.API_KEY,
        **{DPLA_API_QUERY_facets: DPLA_PROP_collectionId})
Example #7
0
def set_dataProvider_list():
    """Write one dataProvider CSV per provider listed in the input CSV.

    Command-line options (both required; only the first occurrence of each
    is used):

    * ``-i``: path of the provider CSV. Its header row must equal
      ``EXPECTED_INPUT_HEADER``.
    * ``-e``: existing folder that receives the dataProvider files.

    For every data row, the first column is taken as the provider name, the
    dataProvider facet is fetched for it, and the resulting term/count pairs
    are written to ``<folder>/<provider name with spaces as '_'>`` plus
    ``DATAPROVIDER_FILE_SUFFLIX``. The total number of dataProvider entries is
    printed, and ``process_provider_list`` is called last.

    Raises:
        SystemExit: if a required option is missing (raised by argparse), if
            the input header does not match, or if an output file cannot be
            created.
    """
    api_key = dpla_config.API_KEY

    # read input file and folder; required=True makes argparse report a
    # missing option instead of the code failing later on None[0]
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-i',
        action='append',
        dest='provider_input_file',
        required=True,
        help=
        "A path of input provider file generated by set_provider_list.py is needed"
    )
    parser.add_argument(
        '-e',
        action='append',
        dest='dataProvider_folder',
        required=True,
        help=
        "A folder for storing dataProvider files is needed, the folder must exist"
    )
    results = parser.parse_args()

    provider_input_file = results.provider_input_file[0]
    folder = results.dataProvider_folder[0]

    # append / into folder path
    if not folder.endswith("/"):
        folder = folder + "/"
    try:
        # open input file
        with open(provider_input_file) as input_file:

            provider_csv_reader = csv.reader(input_file)

            # check input header
            input_header = next(provider_csv_reader, None)

            if input_header != EXPECTED_INPUT_HEADER:
                print("input header doesn`t match")
                sys.exit()

            dataProvider_count = 0
            for provider_information_row in provider_csv_reader:

                # query dataProvider by faceting provider.name
                current_provider_name = provider_information_row[0]
                query_terms = {PROVIDER_NAME: current_provider_name}
                query_terms['facets'] = DPLA_DATAPROVIDER
                dataProvider_query_response = dpla_utils.dpla_fetch_facets_remote(
                    api_key, **query_terms)

                # Question: How many items are contributed by each dataProvider?
                new_dataProvider_result_list = [
                    {
                        DATAPROVIDER_HEADER_NAME: entry['term'],
                        DATAPROVIDER_HEADER_COUNT: entry['count']
                    }
                    for entry in dataProvider_query_response['dataProvider']
                    ['terms']
                ]
                dataProvider_count += len(new_dataProvider_result_list)

                try:
                    # build a file-name-friendly provider name; at most 256
                    # spaces are replaced
                    current_provider_name = current_provider_name.replace(
                        ' ', '_', 256)
                    # write dataProvider information into files
                    # NOTE(review): on Python 3 the csv docs recommend
                    # opening with newline='' -- confirm the target version
                    # before adding it.
                    with open(
                            folder + current_provider_name +
                            DATAPROVIDER_FILE_SUFFLIX,
                            'w') as dataProvider_file:
                        header = [
                            DATAPROVIDER_HEADER_NAME, DATAPROVIDER_HEADER_COUNT
                        ]
                        dataProvider_csv_writer = csv.DictWriter(
                            dataProvider_file, header)
                        dataProvider_csv_writer.writeheader()
                        dataProvider_csv_writer.writerows(
                            new_dataProvider_result_list)

                except IOError:
                    print("couldn`t create file in this folder")
                    sys.exit()
            print("There are " + str(dataProvider_count) +
                  " dataProviders contribute data for DPLA")
    except IOError:
        print("couldn`t read the input file ")

    # process provider list (per the original comment, this adds a
    # dataProvider_path column)
    process_provider_list(provider_input_file, folder)