Exemple #1
0
def read_result(file_id, worksheet_id, domain_id, index):
    """Read the stored data-check results for a selection of rows.

    Args:
        file_id: Identifier forwarded to ``get_check_results_df``.
        worksheet_id: Identifier forwarded to ``get_check_results_df``.
        domain_id: Domain whose target fields are loaded to build messages.
        index: Positional selector handed to ``DataFrame.iloc``.
            NOTE(review): it must select several rows (list/slice), because
            the selection is used as a DataFrame afterwards -- confirm
            against callers.

    Returns:
        dict: ``{row_label: {field_code: {error_type: {"check_type": ...,
        "error_message": ...}}}}``. Every selected row gets an entry; rows
        without any flagged check map to an empty dict. An empty dict is
        returned when ``pd.errors.EmptyDataError`` is raised.
    """

    result = {}
    try:
        result_df = get_check_results_df(file_id, worksheet_id).iloc[index]
        keys = ["label", "type", "rules", "ref_type_id"]
        target_fields = CheckerDocument().get_all_target_fields(domain_id, keys)

        for column in result_df.columns.values:
            flags = result_df[column]
            # Labels of the rows for which this check is flagged (True).
            flagged_rows = flags[flags].index

            # NOTE(review): eval() executes arbitrary code. The column labels
            # are presumably stringified tuples written by the check job; if
            # that is confirmed, ast.literal_eval is the safe replacement.
            check_type, field_code, error_type, _check_index = eval(column)

            # The message only depends on the column, so build it once per
            # column -- and only when at least one row is actually flagged.
            error_message = None
            if len(flagged_rows):
                checker = CheckerFactory.get_checker(check_type)
                error_message = checker.get_message(field_data=target_fields.get(field_code),
                                                    check_type=check_type)

            for row_label in result_df.index:
                # Every selected row appears in the output, flagged or not.
                row_errors = result.setdefault(row_label, {})
                if row_label in flagged_rows:
                    row_errors.setdefault(field_code, {})[error_type] = {
                        "check_type": check_type,
                        "error_message": error_message,
                    }

        return result

    except pd.errors.EmptyDataError:
        return result
Exemple #2
0
def start_check_job(job_id, file_id, worksheet_id, mapping_id, domain_id, is_transformed, user_id, modifications=None):
    """Run the data check service and persist the job result.

    Two modes are supported:

    * ``modifications`` is non-empty: only the modified rows are re-read from
      the mapped data, the modifications are saved and applied, and the
      existing check results are updated through ``check_modifications``.
    * otherwise: the imported data is mapped, modified, normalized and saved,
      then every check is run through ``run_checks``.

    Args:
        job_id: Identifier of the job, forwarded to ``create_check_metadata``.
        file_id: Identifier of the file; overridden when ``is_transformed``.
        worksheet_id: Identifier of the worksheet. When ``is_transformed`` is
            truthy it is a ``/``-separated path whose last two components are
            used as the file id and the worksheet id.
        mapping_id: Identifier of the mapping to apply / record.
        domain_id: Domain whose target fields drive the checks.
        is_transformed: Whether ``worksheet_id`` is a transformed-data path.
        user_id: User recorded as the author of the modifications.
        modifications: Optional mapping whose keys are row numbers
            (convertible to ``int``).

    Returns:
        Whatever ``JobResultDocument.save_check_job`` returns.
    """

    if is_transformed:
        transformed_path = worksheet_id.split('/')
        file_id = transformed_path[-2]
        worksheet_id = transformed_path[-1]

    keys = ["label", "type", "rules", "ref_type_id"]
    checker_document = CheckerDocument()
    modifier_document = ModifierDocument()
    target_fields = checker_document.get_all_target_fields(domain_id, keys)

    start = time.time()
    if modifications:
        rows_indices = set(map(int, modifications))
        # A set has no defined iteration order (e.g. list({1, 8}) == [8, 1]),
        # so use an explicitly sorted list wherever the order matters.
        # Assumes get_mapped_df returns the kept rows in ascending file
        # order -- TODO confirm.
        ordered_rows = sorted(rows_indices)
        nrows = len(ordered_rows)
        # Skip every line up to the last modified row except the modified
        # ones; the +1 offset presumably accounts for a header line.
        skiprows = set(range(1, ordered_rows[-1] + 1)) - {row + 1 for row in ordered_rows}

        result_df = get_check_results_df(file_id, worksheet_id)
        mapped_df = get_mapped_df(file_id, worksheet_id, nrows=nrows, skiprows=skiprows)
        mapped_df.index = ordered_rows
        modifier_document.save_modifications(worksheet_id, modifications, user_id)
        modifier_document.apply_modifications(mapped_df, worksheet_id, ordered_rows)
        data_check_result = create_check_metadata(result_df.reset_index(), job_id, file_id, worksheet_id, mapping_id,
                                                  domain_id)
        result_df = check_modifications(mapped_df, result_df, target_fields, data_check_result, modifications,
                                        rows_indices)

        save_check_results_df(result_df, file_id, worksheet_id)
        print("end checks")
        print(time.time() - start)

    else:
        df = get_imported_data_df(file_id, worksheet_id, nrows=None, skiprows=None)
        final_df = apply_mapping_transformation(df, mapping_id, target_fields)
        modifier_document.apply_modifications(final_df, worksheet_id, is_all=True)
        normalize_data(final_df, target_fields)
        save_mapped_df(final_df, file_id, worksheet_id)
        print("end mapping")
        print(time.time() - start)
        data_check_result = create_check_metadata(final_df.reset_index(), job_id, file_id, worksheet_id, mapping_id,
                                                  domain_id)
        result_df = run_checks(final_df, target_fields, data_check_result)
        save_check_results_df(result_df, file_id, worksheet_id)
        print("end checks")
        print(time.time() - start)

    job_result_document = JobResultDocument()

    return job_result_document.save_check_job(data_check_result)
Exemple #3
0
def get_check_modifications(worksheet_id, domain_id):
    """Build the audit trail of check modifications for a worksheet.

    Args:
        worksheet_id: Worksheet whose modifications are fetched.
        domain_id: Domain used to resolve column codes to field labels.

    Returns:
        dict: ``{field_label: {line: {"previous": ..., "new": ...,
        "updated_at": ..., "user": ...}}}`` where ``previous`` is the last
        entry of the stored previous values and ``user`` is the full name
        returned by ``UserDocument.get_user_fullname``.
    """

    audit_trial = {}
    user_document = UserDocument()
    checker_document = CheckerDocument()
    modifier_document = ModifierDocument()

    target_fields = checker_document.get_all_target_fields(domain_id, ["label"])
    modified_data = modifier_document.get_modifications(worksheet_id, is_all=True)

    for entry in modified_data:
        for field_code, change in entry["columns"].items():
            field_label = target_fields.get(field_code)["label"]
            change_summary = {
                "previous": change["previous"][-1],
                "new": change["new"],
                "updated_at": change["updatedAt"],
                "user": user_document.get_user_fullname(change["userId"]),
            }
            # Group the per-line summaries under the field label.
            per_line = audit_trial.setdefault(field_label, {})
            per_line[entry["line"]] = change_summary

    return audit_trial