import copy
from statistics import mean


def generate_quality_score():
    '''Read the completeness and error JSON reports and calculate the
    metadata quality scores.

    Returns: summary_data, list(set(headers))
    '''
    # Generate completeness percent & weighted completeness percent
    scores = get_json('reports/attribute_completeness.json')
    completion_weightings = get_json(WEIGHTS)
    data = {}
    for s in scores:
        data[s['id']] = {
            'id': s['id'],
            'publisher': s['publisher'],
            'title': s['title']
        }
        # Completion score: share of attributes filled in
        c_score = round((s['filled_attributes'] / s['total_attributes']) * 100, 2)
        # Weighted completion score
        wc_score = round(attribute_weighted_score(s, completion_weightings) * 100, 2)
        data[s['id']]['completeness_percent'] = c_score
        data[s['id']]['weighted_completeness_percent'] = wc_score

    # Generate error percent and weighted error percent
    schema = get_json(DATASET_SCHEMA)
    total_attributes = len(schema['properties'])
    errors = get_json('reports/attribute_errors.json')
    error_weightings = get_json(WEIGHTS)
    for e in errors:
        e_score = round((e['attributes_with_errors'] / total_attributes) * 100, 2)
        we_score = round(attribute_weighted_score(e, error_weightings) * 100, 2)
        data[e['id']]['error_percent'] = e_score
        data[e['id']]['weighted_error_percent'] = we_score

    # Generate quality score, weighted quality score, and their ratings
    summary_data = []
    headers = []
    for d in data.values():
        avg_score = round(mean([d['completeness_percent'],
                                100 - d['error_percent']]), 2)
        d['quality_score'] = avg_score
        d['quality_rating'] = quality_ratings(d['quality_score'])
        weighted_avg_score = round(mean([d['weighted_completeness_percent'],
                                         100 - d['weighted_error_percent']]), 2)
        d['weighted_quality_score'] = weighted_avg_score
        d['weighted_quality_rating'] = quality_ratings(d['weighted_quality_score'])
        headers.extend(d.keys())
        summary_data.append(d)
    return summary_data, list(set(headers))
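# Neither attribute_weighted_score() nor quality_ratings() is defined in this
# section. A minimal sketch of both, assuming WEIGHTS maps attribute names to
# fractional weights summing to 1 and that each report row exposes a
# per-attribute 0/1 flag under a hypothetical 'attributes' key; the rating
# bands mirror the ones spelled out in the unweighted generate_quality_score()
# further below.

def attribute_weighted_score(record, weightings):
    # Sum the weights of every attribute flagged 1 (filled in, or in error);
    # yields a fraction in [0, 1].
    flags = record['attributes']  # assumed shape: {attribute_name: 0 or 1}
    return sum(weightings.get(attr, 0) * flag for attr, flag in flags.items())


def quality_ratings(score):
    # Map a 0-100 score onto the rating bands.
    if score > 90:
        return "Platinum"
    if score > 80:
        return "Gold"
    if score > 70:
        return "Silver"
    if score > 50:
        return "Bronze"
    return "Not Rated"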
def completeness_check():
    schema = generate_baseline_from_sections(METADATA_SECTIONS, REPORTING_LEVELS)
    data_models = get_json(DATASETS_JSON)
    data = []
    header = []
    for dm in data_models['dataModels']:
        print("Processing:", dm['id'])
        d = {
            'pid': dm.get('pid', None),
            'id': dm.get('id', None),
            'publisher': dm.get('publisher', None),
            'title': dm.get('title', None)
        }
        # Drop any attribute not in the schema from the data model
        for attribute in (set(dm.keys()) - set(schema.keys())):
            dm.pop(attribute, None)
        # Overlay the data model on an all-null baseline, then score it
        s = copy.deepcopy(schema)
        s.update(dm)
        score = nullScore(s)
        score.update(d)
        header.extend(score.keys())
        data.append(score)
    return data, list(set(header))
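# nullScore() is referenced but never shown. A plausible sketch, assuming the
# merged record maps attribute names to values with None marking an attribute
# the publisher left empty, and that it reports the counts the scoring steps
# read (filled_attributes / missing_attributes / total_attributes):

def nullScore(record):
    total = len(record)
    filled = sum(1 for value in record.values() if value is not None)
    return {
        'total_attributes': total,
        'filled_attributes': filled,
        'missing_attributes': total - filled,
    }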
def schema_validation_check():
    schema = get_json(DATASET_SCHEMA)
    data_models = get_json(DATASETS_JSON)
    data = []
    headers = []
    for dm in data_models['dataModels']:
        errors = validate_schema(schema, dm)
        d = {
            'id': dm['id'],
            'publisher': dm['publisher'],
            'title': dm['title'],
            'schema_error_count': len(errors),
            'errors': errors
        }
        headers.extend(d.keys())
        data.append(d)
    return data, list(set(headers))
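# validate_schema() is also not defined here. One plausible implementation on
# top of the jsonschema package, collecting every violation rather than
# stopping at the first; the list-of-messages return shape is an assumption.

from jsonschema import Draft7Validator


def validate_schema(schema, instance):
    validator = Draft7Validator(schema)
    # iter_errors() yields all violations, which is what schema_error_count
    # needs in order to count them.
    return [error.message for error in validator.iter_errors(instance)]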
def generate_quality_score():
    # TODO: Differentiate between section (A-G) completeness by weighting them
    scores = get_json('reports/completeness.json')
    data = {}
    for s in scores:
        data[s['id']] = {
            'id': s['id'],
            'publisher': s['publisher'],
            'title': s['title']
        }
        c_score = round((s['missing_attributes'] / s['total_attributes']) * 100, 2)
        data[s['id']]['missingness_percent'] = c_score

    # TODO: Differentiate between error classes (required vs format) by weighting them
    schema = get_json(DATASET_SCHEMA)
    total_attributes = len(schema['properties'])
    errors = get_json('reports/schema_errors.json')
    for e in errors:
        e_score = round((e['schema_error_count'] / total_attributes) * 100, 2)
        data[e['id']]['error_percent'] = e_score

    # # Calculate average quality score (lower the better)
    # quality_scores = [100 - round(mean([v['missingness_percent'], v['error_percent']]), 2)
    #                   for k, v in data.items()]
    # mean_quality_score = round(mean(quality_scores))
    # stdev_quality_score = round(stdev(quality_scores))
    # print("MEAN:", mean_quality_score)
    # print("STDEV:", stdev_quality_score)

    summary_data = []
    headers = []
    for d in data.values():
        # Average the two penalty percentages, then invert: higher is better
        avg_score = round(mean([d['missingness_percent'], d['error_percent']]), 2)
        d['quality_score'] = round(100 - avg_score, 2)
        if d['quality_score'] <= 50:
            d['quality_rating'] = "Not Rated"
        elif d['quality_score'] <= 70:
            d['quality_rating'] = "Bronze"
        elif d['quality_score'] <= 80:
            d['quality_rating'] = "Silver"
        elif d['quality_score'] <= 90:
            d['quality_rating'] = "Gold"
        else:
            d['quality_rating'] = "Platinum"
        headers.extend(d.keys())
        summary_data.append(d)
    return summary_data, list(set(headers))
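# Worked example of the banding above: 20% missingness and 10% schema errors
# average to a 15-point penalty, so quality_score = 100 - 15 = 85.0, which
# lands in the Gold band (80 < score <= 90):
#   round(100 - mean([20.0, 10.0]), 2)  # -> 85.0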
def completeness_check():
    schema = get_json(BASELINE_SAMPLE)
    data_models = get_json(DATASETS_JSON)
    data = []
    header = []
    for dm in data_models['dataModels']:
        print("Processing:", dm['id'])
        d = {
            'id': dm['id'],
            'publisher': dm['publisher'],
            'title': dm['title']
        }
        # Drop the nested dataClasses before the null count
        dm.pop('dataClasses', None)
        s = copy.deepcopy(schema)
        s.update(dm)
        score = nullScore(s)
        score.update(d)
        header.extend(score.keys())
        data.append(score)
    return data, list(set(header))
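# get_json() is used throughout but never shown; almost certainly a thin
# wrapper over json.load, along these lines (the encoding choice is an
# assumption):

import json


def get_json(path):
    with open(path, encoding='utf-8') as f:
        return json.load(f)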
def schema_validation_check():
    schema = get_json(DATASET_SCHEMA)
    data_models = get_json(DATASETS_JSON)
    validation_attributes = set(generate_attribute_list(METADATA_SECTIONS, REPORTING_LEVELS))
    data = []
    headers = []
    for dm in data_models['dataModels']:
        # Validate a copy stripped down to the in-scope attributes only
        dm_validate = copy.deepcopy(dm)
        for attribute in (set(dm_validate.keys()) - validation_attributes):
            dm_validate.pop(attribute, None)
        errors = validate_schema(schema, dm_validate)
        d = {
            'id': dm.get('id', None),
            'publisher': dm.get('publisher', None),
            'title': dm.get('title', None),
            'schema_error_count': len(errors),
            'errors': errors
        }
        headers.extend(d.keys())
        data.append(d)
    return data, list(set(headers))
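# generate_attribute_list() and generate_baseline_from_sections() are not
# shown. Sketches under the assumption that METADATA_SECTIONS maps section
# names to lists of attribute names and REPORTING_LEVELS names the sections
# in scope:

def generate_attribute_list(sections, levels):
    # Flatten the attribute names of every section in the reporting scope.
    return [attr for section, attrs in sections.items()
            if section in levels
            for attr in attrs]


def generate_baseline_from_sections(sections, levels):
    # All-null baseline record, so nullScore() can count what a publisher
    # left unfilled.
    return {attr: None for attr in generate_attribute_list(sections, levels)}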