Exemple #1
0
def get_facet_query(solr_url, field, **kwargs):
    '''Query Solr for facet counts on ``field`` and return them as a dict.

    The request parameters start from a copy of the module-level
    ``facet_query`` template with 'facet.field' set to ``field``. Any extra
    keyword arguments are forwarded unchanged to ``get_solr_json``. The
    response is converted with ``create_facet_dict``.
    '''
    params = facet_query.copy()
    params['facet.field'] = field
    response = get_solr_json(solr_url=solr_url, query=params, **kwargs)
    return create_facet_dict(response, field)
Exemple #2
0
def create_new_facet_values_sheet(facet, workbook, solr_url, api_key,
                                  solr_url_new, api_key_new):
    '''Add a worksheet reporting the new values for the given facet.

    The same facet query is sent to the production index
    (``solr_url`` / ``api_key``) and to the new index
    (``solr_url_new`` / ``api_key_new``). Both responses are turned into
    facet dicts and handed to ``compare_datasets``; its ``not_in_prod``
    result is written to a sheet named 'New <facet> Values' in ``workbook``,
    one (value, count) pair per row.

    The sheet tab and the count cells are coloured red only when there is
    at least one such value.
    '''
    # report new values for the given facet
    query = {
        'facet': 'true',
        'facet.field': [
            facet,
        ],
        'rows': 0,
        'facet.limit': -1,  # give them all
        'facet.sort': 'count',
        'facet.mincount': 1,
    }
    production_json = get_solr_json(solr_url, query, api_key=api_key)
    production_facet_dict = create_facet_dict(production_json, facet)
    new_json = get_solr_json(solr_url_new, query, api_key=api_key_new)
    new_facet_dict = create_facet_dict(new_json, facet)
    not_in_new, not_in_prod, count_equal, new_less, new_more = \
        compare_datasets(production_facet_dict, new_facet_dict)
    print("{}: NOT IN PROD: {}  NOT_IN_NEW: {}".format(facet, len(not_in_prod),
                                                       len(not_in_new)))

    page = workbook.add_worksheet('New {} Values'.format(facet))
    header_format = workbook.add_format({
        'bold': True,
    })
    number_format = workbook.add_format()
    number_format.set_num_format('#,##0')
    # ``not_in_prod`` is a collection of (value, count) pairs, so test it for
    # emptiness. The previous ``not_in_prod > 0`` raises TypeError on
    # Python 3 and was true regardless of content on Python 2.
    if not_in_prod:
        page.set_tab_color('red')
        number_format.set_bg_color('red')
    page.write(0, 0, 'New {} Values'.format(facet), header_format)
    page.write(0, 1, 'Counts', header_format)
    # width
    page.set_column(
        0,
        1,
        25,
    )
    # Data starts on row 2, leaving row 1 blank under the headers.
    for row, (value, count) in enumerate(not_in_prod, start=2):
        page.write(row, 0, value)
        page.write(row, 1, count, number_format)
def create_new_facet_values_sheet(facet, workbook, solr_url, api_key,
                                  solr_url_new, api_key_new):
    '''Write a 'New <facet> Values' worksheet into ``workbook``.

    Runs one facet query for ``facet`` against the production index
    (``solr_url``, ``api_key``) and the new index (``solr_url_new``,
    ``api_key_new``), compares the resulting facet dicts with
    ``compare_datasets`` and lists the ``not_in_prod`` (value, count) pairs
    on the sheet. A summary line with the sizes of ``not_in_prod`` and
    ``not_in_new`` is printed.

    When ``not_in_prod`` is non-empty the sheet tab and the count cells are
    highlighted in red.
    '''
    # report new values for the given facet
    query = {
        'facet': 'true',
        'facet.field': [facet, ],
        'rows': 0,
        'facet.limit': -1,  # give them all
        'facet.sort': 'count',
        'facet.mincount': 1,
    }
    production_json = get_solr_json(solr_url, query, api_key=api_key)
    production_facet_dict = create_facet_dict(production_json, facet)
    new_json = get_solr_json(solr_url_new, query, api_key=api_key_new)
    new_facet_dict = create_facet_dict(new_json, facet)
    not_in_new, not_in_prod, count_equal, new_less, new_more = \
        compare_datasets(production_facet_dict, new_facet_dict)
    print("{}: NOT IN PROD: {}  NOT_IN_NEW: {}".format(
        facet, len(not_in_prod), len(not_in_new)))

    page = workbook.add_worksheet('New {} Values'.format(facet))
    header_format = workbook.add_format({'bold': True, })
    number_format = workbook.add_format()
    number_format.set_num_format('#,##0')
    # Highlight only when there really are new values. Comparing the
    # collection itself with 0 (as the code did before) is a TypeError on
    # Python 3 and never false on Python 2.
    has_new_values = len(not_in_prod) > 0
    if has_new_values:
        page.set_tab_color('red')
        number_format.set_bg_color('red')
    page.write(0, 0, 'New {} Values'.format(facet), header_format)
    page.write(0, 1, 'Counts', header_format)
    # width
    page.set_column(
        0,
        1,
        25, )
    # First data row is 2; row 1 stays empty below the header row.
    for row, (value, count) in enumerate(not_in_prod, start=2):
        page.write(row, 0, value)
        page.write(row, 1, count, number_format)
Exemple #4
0
def create_missing_report(field, workbook, header_format, add_query=None):
    '''Add a 'missing <field>' worksheet, faceted by collection_url.

    The Solr query selects documents matching '-<field>:[* TO *]' and
    facets them on 'collection_url'. ``add_query`` is an optional dict of
    extra query parameters merged over the base query (needed, for example,
    as a filter query when reporting missing reference_image_md5).

    The connection settings ``solr_url``, ``api_key``, ``digest_user`` and
    ``digest_pswd`` are not parameters; they are read from the enclosing
    (module) scope.
    '''
    params = dict(
        q='-{}:[* TO *]'.format(field),
        rows=0,
        wt='json',
        facet='true',
    )
    params['facet.field'] = 'collection_url'
    if add_query:
        params.update(add_query)

    response = get_solr_json(
        solr_url,
        query=params,
        api_key=api_key,
        digest_user=digest_user,
        digest_pswd=digest_pswd)
    by_collection = create_facet_dict(response, 'collection_url')

    sheet_title = 'missing {}'.format(field)
    create_missing_worksheet(sheet_title, by_collection, workbook,
                             header_format)
Exemple #5
0
def main(solr_url='https://harvest-stg.cdlib.org/solr/dc-collection/query',
         outdir=None,
         api_key=None,
         digest_user=None,
         digest_pswd=None):
    '''Build the reference_image_md5 worksheet and the missing-field reports.

    Steps, in order:
    1. Facet on 'reference_image_md5' via ``get_facet_query``.
    2. For every md5 value, run a second Solr query (q=<md5>) faceted on
       'collection_url' and store (count, collection facet dict) for it.
    3. Create the workbook with ``create_report_workbook(outdir)`` and write
       one row per md5: the md5, its count, then one cell per collection
       formatted as '<collection_url> - <count>'.
    4. Call ``create_missing_report`` for a fixed list of fields.
    '''
    print("USING SOLR:{}".format(solr_url))
    md5_field = 'reference_image_md5'
    # The same credentials go to every Solr request made here.
    credentials = {
        'api_key': api_key,
        'digest_user': digest_user,
        'digest_pswd': digest_pswd,
    }
    dup_md5 = get_facet_query(solr_url, md5_field, **credentials)

    # now for each md5, get the collection_url that it is in
    # (values of existing keys are replaced in place; no keys are added)
    for md5, count in dup_md5.items():
        lookup = {
            'q': md5,
            'rows': 0,
            'wt': 'json',
            'facet': 'true',
            'facet.field': 'collection_url'
        }
        response = get_solr_json(solr_url, query=lookup, **credentials)
        dup_md5[md5] = (count, create_facet_dict(response, 'collection_url'))

    workbook, header_format, number_format = create_report_workbook(outdir)
    page = workbook.add_worksheet(md5_field)

    # headers
    headings = (md5_field, 'Number Dups', 'Collections')
    for col, heading in enumerate(headings):
        page.write(0, col, heading, header_format)

    # width: (first column, last column, width)
    for first_col, last_col, width in ((0, 0, 50), (1, 1, 10), (2, 10, 50)):
        page.set_column(first_col, last_col, width)

    for row, (md5, data) in enumerate(dup_md5.items(), start=1):
        dup_count, collections = data[0], data[1]
        page.write(row, 0, md5)
        page.write(row, 1, dup_count)
        for column, (c_url, num) in enumerate(collections.items(), start=2):
            page.write(row, column, ' - '.join((c_url, str(num))))
    # end md5 page

    # missing-field reports, in the original order
    for missing_field in ('type_ss', 'repository_data', 'title_ss',
                          'url_item'):
        create_missing_report(missing_field, workbook, header_format)
    # missing md5 only matters for image records, hence the filter query
    create_missing_report(
        'reference_image_md5',
        workbook,
        header_format,
        add_query={'fq': 'type_ss:image'})
    create_missing_report('rights_ss', workbook, header_format)
    parser = argparse.ArgumentParser()
    parser.add_argument('outdir', )
    argv = parser.parse_args()

    config = configparser.SafeConfigParser()
    config.read('report.ini')

    solr_url = config.get('new-index', 'solrUrl')
    api_key = config.get('new-index', 'solrAuth')

    couchdb_url = config.get('couchdb', 'url')

    solr_collection_json = get_solr_json(solr_url,
                                         solr_collection_query,
                                         api_key=api_key)
    solr_collection_facet = create_facet_dict(solr_collection_json,
                                              'collection_url')
    diffs = []
    couch_less = []
    for curl, count in solr_collection_facet.items():
        cid = curl.rsplit('/', 2)[-2]
        url_couchdb_count = ''.join(
            ('{}/couchdb/ucldc/_design/', 'all_provider_docs/_view/',
             'by_provider_name_count?', 'key="{}"')).format(couchdb_url, cid)
        resp = requests.get(url_couchdb_count, verify=False)
        couch_count = resp.json()['rows'][0]['value']
        if count != couch_count:
            diffs.append((cid, count, couch_count))
            if couch_count < count:
                couch_less.append((cid, count, couch_count))
            print "{} SOLR:{} COUCH:{}".format(cid, count, couch_count)
    print "FOR {} COLLECTIONS, {} have different counts".format(
if __name__ == '__main__':
    # Compare, per collection, the document count reported by the Solr
    # 'collection_url' facet with the count returned by the CouchDB
    # by_provider_name_count view, and print every mismatch.
    parser = argparse.ArgumentParser()
    parser.add_argument('outdir',)
    argv = parser.parse_args()

    # NOTE(review): SafeConfigParser is a deprecated alias that was removed
    # from the standard-library configparser in Python 3.12; kept here
    # because the import of ``configparser`` is not visible in this block.
    config = configparser.SafeConfigParser()
    config.read('report.ini')

    solr_url = config.get('new-index', 'solrUrl')
    api_key = config.get('new-index', 'solrAuth')

    couchdb_url = config.get('couchdb', 'url')

    solr_collection_json = get_solr_json(solr_url, solr_collection_query,
                                         api_key=api_key)
    solr_collection_facet = create_facet_dict(solr_collection_json,
                                              'collection_url')
    diffs = []  # every collection whose counts differ
    couch_less = []  # subset of diffs where CouchDB has fewer docs than Solr
    for curl, count in solr_collection_facet.items():
        # The collection id is the second-to-last '/'-separated piece of
        # the collection URL.
        cid = curl.rsplit('/', 2)[-2]
        url_couchdb_count = ''.join(('{}/couchdb/ucldc/_design/',
                                     'all_provider_docs/_view/',
                                     'by_provider_name_count?',
                                     'key="{}"')).format(couchdb_url, cid)
        # TLS certificate verification is disabled for this request.
        resp = requests.get(url_couchdb_count, verify=False)
        # The view returns no rows when CouchDB has nothing for this key;
        # report that as a count of 0 instead of failing with IndexError.
        rows = resp.json()['rows']
        couch_count = rows[0]['value'] if rows else 0
        if count != couch_count:
            diffs.append((cid, count, couch_count))
            if couch_count < count:
                couch_less.append((cid, count, couch_count))
            # print() call form runs on both Python 2 and Python 3.
            print("{} SOLR:{} COUCH:{}".format(cid, count, couch_count))
Exemple #8
0
def main(argv=None):
    '''Compare the production and new Solr indexes and write the report.

    ``argv`` is used as the parsed-arguments object (``argv.outdir[0]`` is
    read); when it is None the command line is parsed instead. Connection
    settings come from the 'calisphere' (production) and 'new-index'
    sections of 'report.ini'.

    Totals and 'type_ss' facets are fetched for both indexes, the
    'collection_url' facets are compared with ``compare_datasets``, registry
    data is loaded with ``get_registry_collection_data``, and everything is
    passed to ``create_report_workbook``. Three "new facet values" sheets
    are appended before the workbook is closed.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('outdir', nargs=1)
    if argv is None:
        argv = parser.parse_args()

    config = configparser.SafeConfigParser()
    config.read('report.ini')

    # get totals for reporting on first page
    totals_query = {
        'facet': 'true',
        'facet.field': ['type_ss', 'facet_decade'],
        'facet.missing': 'on',
        'rows': 0,
        'facet.limit': -1,
    }

    def index_totals(section):
        # Read one index's settings, then fetch its totals and type_ss facet.
        url = config.get(section, 'solrUrl')
        auth = config.get(section, 'solrAuth')
        totals = get_solr_json(url, totals_query, api_key=auth)
        return (url, auth, get_total_docs(totals),
                create_facet_dict(totals, 'type_ss'))

    solr_url, api_key, num_prod_docs, production_type_ss_dict = \
        index_totals('calisphere')
    solr_url_new, api_key_new, num_new_docs, new_type_ss_dict = \
        index_totals('new-index')

    # get calisphere current index data
    production_facet_dict = create_facet_dict(
        get_solr_json(solr_url, base_query, api_key=api_key),
        'collection_url')
    new_facet_dict = create_facet_dict(
        get_solr_json(solr_url_new, base_query, api_key=api_key_new),
        'collection_url')
    pp('OLD LEN:{} NEW LEN:{}'.format(len(production_facet_dict),
                                      len(new_facet_dict)))

    comparison = compare_datasets(production_facet_dict, new_facet_dict)
    not_in_new, not_in_prod, count_equal, new_less, new_more = comparison
    registry = get_registry_collection_data()
    all_collections, ready_for_pub, not_ready_for_pub = registry
    pp("READY FOR PUB:{} NOT READY:{}".format(len(ready_for_pub),
                                              len(not_ready_for_pub)))

    # ready in the registry but absent from the new index
    missing_ready_for_pub = [
        coll for coll in ready_for_pub if coll['url'] not in new_facet_dict
    ]
    # not ready in the registry but present in the new index
    not_ready_for_pub = [
        coll for coll in not_ready_for_pub if coll['url'] in new_facet_dict
    ]

    summary = (
        ('NOT IN NEW INDEX {}', not_in_new),
        ('NOT IN PROD INDEX {}', not_in_prod),
        ('COUNT EQUAL {}', count_equal),
        ('NEW LESS {}', new_less),
        ('NEW MORE {}', new_more),
    )
    for template, items in summary:
        pp(template.format(len(items)))

    workbook = create_report_workbook(
        argv.outdir[0],
        not_in_new,
        not_in_prod,
        count_equal,
        new_less,
        new_more,
        num_found_prod=num_prod_docs,
        num_found_new=num_new_docs,
        type_ss_prod=production_type_ss_dict,
        type_ss_new=new_type_ss_dict,
        all_collections=all_collections,
        missing_ready_for_pub=missing_ready_for_pub,
        not_ready_for_pub=not_ready_for_pub)

    for facet in ('coverage_ss', 'facet_decade', 'rights_ss'):
        create_new_facet_values_sheet(facet, workbook, solr_url, api_key,
                                      solr_url_new, api_key_new)

    workbook.close()
def main(argv=None):
    '''Generate the production-vs-new index comparison workbook.

    When ``argv`` is None the command line is parsed for the single
    positional 'outdir' argument; otherwise ``argv`` itself is used and
    must provide ``outdir[0]``. Solr URLs and auth keys are read from the
    'calisphere' and 'new-index' sections of 'report.ini'.

    The function prints summary counts with ``pp``, builds the workbook via
    ``create_report_workbook``, adds a "new values" sheet for each of
    'coverage_ss', 'facet_decade' and 'rights_ss', and closes the workbook.
    '''
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('outdir', nargs=1)
    if argv is None:
        argv = arg_parser.parse_args()

    settings = configparser.SafeConfigParser()
    settings.read('report.ini')

    # get totals for reporting on first page
    query_t = {
        'facet': 'true',
        'facet.field': ['type_ss', 'facet_decade'],
        'facet.missing': 'on',
        'rows': 0,
        'facet.limit': -1,
    }

    # production index
    solr_url = settings.get('calisphere', 'solrUrl')
    api_key = settings.get('calisphere', 'solrAuth')
    prod_totals = get_solr_json(solr_url, query_t, api_key=api_key)
    prod_doc_count = get_total_docs(prod_totals)
    prod_type_ss = create_facet_dict(prod_totals, 'type_ss')

    # new index
    solr_url_new = settings.get('new-index', 'solrUrl')
    api_key_new = settings.get('new-index', 'solrAuth')
    fresh_totals = get_solr_json(solr_url_new, query_t, api_key=api_key_new)
    fresh_doc_count = get_total_docs(fresh_totals)
    fresh_type_ss = create_facet_dict(fresh_totals, 'type_ss')

    # get calisphere current index data
    prod_json = get_solr_json(solr_url, base_query, api_key=api_key)
    prod_collections = create_facet_dict(prod_json, 'collection_url')
    fresh_json = get_solr_json(solr_url_new, base_query, api_key=api_key_new)
    fresh_collections = create_facet_dict(fresh_json, 'collection_url')
    pp('OLD LEN:{} NEW LEN:{}'.format(
        len(prod_collections), len(fresh_collections)))

    (not_in_new, not_in_prod, count_equal, new_less,
     new_more) = compare_datasets(prod_collections, fresh_collections)
    (all_collections, ready_for_pub,
     not_ready_for_pub) = get_registry_collection_data()
    pp("READY FOR PUB:{} NOT READY:{}".format(
        len(ready_for_pub), len(not_ready_for_pub)))

    # Registry says ready, but the collection is not in the new index.
    missing_ready_for_pub = [
        entry for entry in ready_for_pub
        if entry['url'] not in fresh_collections
    ]
    # Registry says not ready, yet the collection is in the new index.
    not_ready_for_pub = [
        entry for entry in not_ready_for_pub
        if entry['url'] in fresh_collections
    ]

    pp('NOT IN NEW INDEX {}'.format(len(not_in_new)))
    pp('NOT IN PROD INDEX {}'.format(len(not_in_prod)))
    pp('COUNT EQUAL {}'.format(len(count_equal)))
    pp('NEW LESS {}'.format(len(new_less)))
    pp('NEW MORE {}'.format(len(new_more)))

    report_options = dict(
        num_found_prod=prod_doc_count,
        num_found_new=fresh_doc_count,
        type_ss_prod=prod_type_ss,
        type_ss_new=fresh_type_ss,
        all_collections=all_collections,
        missing_ready_for_pub=missing_ready_for_pub,
        not_ready_for_pub=not_ready_for_pub,
    )
    workbook = create_report_workbook(argv.outdir[0], not_in_new,
                                      not_in_prod, count_equal, new_less,
                                      new_more, **report_options)

    index_args = (solr_url, api_key, solr_url_new, api_key_new)
    for facet_name in ('coverage_ss', 'facet_decade', 'rights_ss'):
        create_new_facet_values_sheet(facet_name, workbook, *index_args)

    workbook.close()