Example #1
def generate_quality_score():
    '''Read the completeness and error JSON reports and calculate the
    metadata quality scores.

    Returns:
        summary_data: a list of per-dataset score dicts.
        headers: the de-duplicated list of keys across those dicts.
    '''

    # Generate completeness percent & weighted completeness percent
    scores = get_json('reports/attribute_completeness.json')
    completion_weightings = get_json(WEIGHTS)
    data = {}
    for s in scores:
        data[s['id']] = {
            'id': s['id'],
            'publisher': s['publisher'],
            'title': s['title']
        }
        c_score = round((s['filled_attributes'] / s['total_attributes']) * 100, 2)  # completion score
        wc_score = round(attribute_weighted_score(s, completion_weightings) * 100, 2)  # weighted completion score
        data[s['id']]['completeness_percent'] = c_score
        data[s['id']]['weighted_completeness_percent'] = wc_score
    
    # Generate error percent and weighted error percent
    schema = get_json(DATASET_SCHEMA)
    total_attributes = len(schema['properties'])
    errors = get_json('reports/attribute_errors.json')
    error_weightings = get_json(WEIGHTS)
    for e in errors:
        e_score = round((e['attributes_with_errors'] / total_attributes) * 100, 2)
        we_score = round(attribute_weighted_score(e, error_weightings) * 100, 2)
        data[e['id']]['error_percent'] = e_score
        data[e['id']]['weighted_error_percent'] = we_score

    # Generate quality score, weighted quality score, quality score rating, and weighted quality score rating
    summary_data = []
    headers = []
    for d in data.values():
        avg_score = round(mean([d['completeness_percent'], 100 - d['error_percent']]), 2)
        d['quality_score'] = avg_score
        d['quality_rating'] = quality_ratings(d['quality_score'])

        weighted_avg_score = round(mean([d['weighted_completeness_percent'], 100 - d['weighted_error_percent']]), 2)
        d['weighted_quality_score'] = weighted_avg_score
        d['weighted_quality_rating'] = quality_ratings(d['weighted_quality_score'])

        headers.extend(d.keys())
        summary_data.append(d)

    return summary_data, list(set(headers))
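# attribute_weighted_score and quality_ratings are project helpers defined
# elsewhere (the snippets also assume `from statistics import mean` at module
# level). A minimal sketch of what they might look like, inferred from the
# call sites above; the weighting logic is an assumption, and the rating
# bands copy the inline if/elif chain used in a later example:
from statistics import mean

def attribute_weighted_score(record, weightings):
    '''Hypothetical: sum per-attribute scores in [0, 1] scaled by their
    weights. Assumes `weightings` maps attribute names to floats that
    sum to 1.0; non-weighted keys (id, publisher, title) are skipped.'''
    return sum(weightings[attribute] * value
               for attribute, value in record.items()
               if attribute in weightings)

def quality_ratings(score):
    '''Hypothetical: map a 0-100 score to a rating band.'''
    if score > 90:
        return "Platinum"
    if score > 80:
        return "Gold"
    if score > 70:
        return "Silver"
    if score > 50:
        return "Bronze"
    return "Not Rated"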
Example #2
def completeness_check():
    '''Score each data model's attribute completeness against the baseline schema.'''
    schema = generate_baseline_from_sections(METADATA_SECTIONS,
                                             REPORTING_LEVELS)
    data_models = get_json(DATASETS_JSON)
    data = []
    header = []
    for dm in data_models['dataModels']:
        print("Processing:", dm['id'])
        d = {
            'pid': dm.get('pid', None),
            'id': dm.get('id', None),
            'publisher': dm.get('publisher', None),
            'title': dm.get('title', None)
        }
        # Drop any attribute that is not part of the schema from the data model
        for attribute in (set(dm.keys()) - set(schema.keys())):
            dm.pop(attribute, None)
        s = copy.deepcopy(schema)
        s.update(dm)
        score = nullScore(s)
        score.update(d)
        header.extend(score.keys())
        data.append(score)
    return data, list(set(header))
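# nullScore is another project helper (the snippet above also assumes
# `import copy` at module level). A minimal sketch under the assumption
# that it simply counts null/empty values in the merged record; the real
# counting rule may differ:
def nullScore(record):
    '''Hypothetical: count filled vs. missing attributes in a record,
    using the output keys the completeness reports above expose.'''
    missing = sum(1 for value in record.values() if value in (None, '', [], {}))
    return {
        'total_attributes': len(record),
        'missing_attributes': missing,
        'filled_attributes': len(record) - missing,
    }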
Example #3
def schema_validation_check():
    '''Validate each data model against the dataset schema and collect the errors.'''
    schema = get_json(DATASET_SCHEMA)
    data_models = get_json(DATASETS_JSON)
    data = []
    headers = []
    for dm in data_models['dataModels']:
        errors = validate_schema(schema, dm)
        d = {
            'id': dm['id'],
            'publisher': dm['publisher'],
            'title': dm['title'],
            'schema_error_count': len(errors),
            'errors': errors
        }
        headers.extend(d.keys())
        data.append(d)
    return data, list(set(headers))
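# validate_schema presumably wraps the jsonschema package. A minimal
# sketch that returns human-readable messages, matching the
# schema_error_count / errors fields built above (the Draft-07 validator
# is an assumption):
import jsonschema

def validate_schema(schema, data_model):
    '''Hypothetical: collect every JSON Schema violation for one record.'''
    validator = jsonschema.Draft7Validator(schema)
    return [error.message for error in validator.iter_errors(data_model)]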
Example #4
def generate_quality_score():
    '''Combine the completeness and schema-error reports into a single
    quality score and rating per dataset.'''
    # TODO: Differentiate between section (A-G) completeness by weighting them
    scores = get_json('reports/completeness.json')
    data = {}
    for s in scores:
        data[s['id']] = {
            'id': s['id'],
            'publisher': s['publisher'],
            'title': s['title']
        }
        m_score = round((s['missing_attributes'] / s['total_attributes']) * 100, 2)
        data[s['id']]['missingness_percent'] = m_score
    
    # TODO: Differentiate between error classes (required vs format) by weighting them
    schema = get_json(DATASET_SCHEMA)
    total_attributes = len(schema['properties'])
    errors = get_json('reports/schema_errors.json')
    for e in errors:
        e_score = round((e['schema_error_count'] / total_attributes) * 100, 2)
        data[e['id']]['error_percent'] = e_score
    
    # # Calculate the average quality score across all datasets (higher is better)
    # quality_scores = [100 - round(mean([v['missingness_percent'], v['error_percent']]),2) for k, v in data.items()]
    # mean_quality_score = round(mean(quality_scores))
    # stdev_quality_score = round(stdev(quality_scores))
    # print("MEAN:", mean_quality_score)
    # print("STDEV:", stdev_quality_score)

    summary_data = []
    headers = []
    for d in data.values():
        avg_score = round(mean([d['missingness_percent'], d['error_percent']]), 2)
        d['quality_score'] = round(100 - avg_score, 2)
        if d['quality_score'] > 90:
            d['quality_rating'] = "Platinum"
        elif d['quality_score'] > 80:
            d['quality_rating'] = "Gold"
        elif d['quality_score'] > 70:
            d['quality_rating'] = "Silver"
        elif d['quality_score'] > 50:
            d['quality_rating'] = "Bronze"
        else:
            d['quality_rating'] = "Not Rated"
        headers.extend(d.keys())
        summary_data.append(d)
    return summary_data, list(set(headers))
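# The (rows, headers) return shape of these functions fits csv.DictWriter.
# A hypothetical usage sketch (the output path is an assumption):
import csv

summary_data, headers = generate_quality_score()
with open('reports/quality_score.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=sorted(headers))  # absent keys are written as ''
    writer.writeheader()
    writer.writerows(summary_data)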
Example #5
def completeness_check():
    '''Score each data model's completeness against the baseline sample schema.'''
    schema = get_json(BASELINE_SAMPLE)
    data_models = get_json(DATASETS_JSON)
    data = []
    header = []
    for dm in data_models['dataModels']:
        print("Processing:", dm['id'])
        d = {
            'id': dm['id'],
            'publisher': dm['publisher'],
            'title': dm['title']
        }
        dm.pop('dataClasses', None)
        s = copy.deepcopy(schema)
        s.update(dm)
        score = nullScore(s)
        score.update(d)
        header.extend(score.keys())
        data.append(score)
    return data, list(set(header))
Example #6
def schema_validation_check():
    '''Validate each data model against the dataset schema, restricted to the reporting attributes.'''
    schema = get_json(DATASET_SCHEMA)
    data_models = get_json(DATASETS_JSON)
    validation_attributes = set(generate_attribute_list(METADATA_SECTIONS, REPORTING_LEVELS))
    data = []
    headers = []
    for dm in data_models['dataModels']:
        # Validate a copy restricted to the attributes selected for reporting
        dm_validate = copy.deepcopy(dm)
        for attribute in (set(dm_validate.keys()) - validation_attributes):
            dm_validate.pop(attribute, None)
        errors = validate_schema(schema, dm_validate)
        d = {
            'id': dm.get('id', None),
            'publisher': dm.get('publisher', None),
            'title': dm.get('title', None),
            'schema_error_count': len(errors),
            'errors': errors
        }
        headers.extend(d.keys())
        data.append(d)
    return data, list(set(headers))
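# generate_attribute_list is another project helper. A sketch under the
# assumption that METADATA_SECTIONS maps section names to
# {attribute: reporting_level} dicts; the real structure may differ:
def generate_attribute_list(sections, reporting_levels):
    '''Hypothetical: flatten the section definitions into the attribute
    names that fall within the requested reporting levels.'''
    return [attribute
            for section in sections.values()
            for attribute, level in section.items()
            if level in reporting_levels]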