def analyze_csv(file_path,
                model_ml=None,
                num_rows=500,
                date_process=TODAY,
                do_both_analysis="ml",
                return_probabilities=True,
                output_mode="ALL"):
    logger.info(" csv_detective on {}".format(file_path))
    try:
        if do_both_analysis:
            logger.info(f"Starting vanilla CSV Detective on file {file_path}")
            if do_both_analysis != "ml":
                dict_result = routine(file_path,
                                      num_rows=num_rows,
                                      output_mode="ALL")
            else:
                dict_result = routine(file_path,
                                      num_rows=num_rows,
                                      user_input_tests=None)
        else:
            # Get ML tagging
            logger.info(f"Starting ML CSV Detective on file {file_path}")
            dict_result = routine(file_path,
                                  num_rows=num_rows,
                                  user_input_tests=None)

        if do_both_analysis != 'rule':
            dict_result = routine_ml(csv_detective_results=dict_result,
                                     file_path=file_path,
                                     model_ml=model_ml,
                                     num_rows=num_rows,
                                     return_probabilities=return_probabilities)

        # combine rb and ml dicts into a single one
        if output_mode == "ALL" and return_probabilities:
            if "columns" in dict_result and "columns_ml" in dict_result:
                dict_result["columns"] = join_reports(
                    dict_rb=dict_result["columns"],
                    dict_ml=dict_result["columns_ml_probas"])
                dict_result.pop("columns_ml_probas")
            else:
                logger.error(f"Only ML or RULE analysis is ongoing...")

    except Exception as e:
        logger.error(f"Analyzing file {file_path} failed with {e}")
        return {"error": str(e)}

    dict_result['analysis_date'] = date_process

    return dict_result
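
A minimal usage sketch for the function above, assuming it sits in a module where routine, routine_ml, join_reports, TODAY and a logger are already available; the file path is illustrative, and do_both_analysis="rule" is chosen so that no ML model is needed (passing "both" together with a loaded model_ml would also run the ML tagging step).

# Hypothetical rule-based-only call on an illustrative file path; no ML model
# is required because do_both_analysis="rule" skips routine_ml entirely.
report = analyze_csv("data/example.csv",
                     do_both_analysis="rule",
                     return_probabilities=False,
                     output_mode="ALL")
print(report.get("analysis_date"), "error" in report)
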
Example #2

def analyze_csv(file_path,
                analysis_type="both",
                pipeline=None,
                num_rows=500,
                include_datasetID=None):
    logger.info(" csv_detective on {}".format(file_path))
    final_id = extract_id(file_path)
    if include_datasetID:
        if final_id in include_datasetID:
            final_id = f"{include_datasetID[final_id]}/{final_id}"
        else:
            logger.info(
                f"Resource ID {final_id} not found in RESOURCEID2DATASETID dict"
            )
            final_id = f"NODATASETID/{final_id}"
    try:
        if analysis_type == "both" or analysis_type == "rule":
            logger.info(f"Starting vanilla CSV Detective on file {file_path}")
            dict_result = routine(file_path, num_rows=num_rows)

            if "columns" in dict_result:
                dict_result["columns"] = {
                    k.strip('"'): v
                    for k, v in dict_result["columns"].items()
                }
                dict_result["columns_rb"] = dict_result["columns"]
                dict_result.pop("columns")
        else:
            # Get ML tagging
            logger.info(f"Starting ML CSV Detective on file {file_path}")
            dict_result = routine(file_path,
                                  num_rows=num_rows,
                                  user_input_tests=None)

        if analysis_type != "rule":
            assert pipeline is not None
            y_pred, csv_info = get_columns_ML_prediction(file_path,
                                                         pipeline,
                                                         dict_result,
                                                         num_rows=num_rows)
            dict_result["columns_ml"] = get_columns_types(y_pred, csv_info)

    except Exception as e:
        logger.error(f"Analyzing file {file_path} failed with {e}")
        return final_id, {"error": str(e)}

    dict_result['analysis_date'] = TODAY

    return final_id, dict_result
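
A hedged usage sketch for this variant, assuming extract_id, routine and TODAY are available in the same module; the file path and the resource-to-dataset mapping are illustrative, and analysis_type="rule" is used so that no fitted pipeline is required (passing "both" or "ml" would additionally need the pipeline argument).

# Hypothetical rule-based-only call; the file path and mapping are examples only.
resource_id, report = analyze_csv("data/example.csv",
                                  analysis_type="rule",
                                  num_rows=500,
                                  include_datasetID={"some-resource-id": "some-dataset-id"})
print(resource_id, sorted(report.get("columns_rb", {})))
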
Example #3
def test_old_detection():
    file_path = './annuaire-de-leducation.csv'
    with open("baseline_result.json") as f:
        expected_results = sort_keys(json.load(f))

    # Open your file and run csv_detective
    inspection_results = sort_keys(routine(file_path))
    with open("current_result.json", "w") as f:
        json.dump(inspection_results, f)
    pprint(inspection_results)
    assert str(inspection_results) == str(expected_results)
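
The test relies on a sort_keys helper that is not shown here; a minimal sketch of what such a helper could look like, assuming its only job is to order dictionary keys recursively so that the string comparison is stable:

def sort_keys(obj):
    # Recursively rebuild dicts with their keys in sorted order so that two
    # inspection reports compare equal via their string representation.
    if isinstance(obj, dict):
        return {k: sort_keys(obj[k]) for k in sorted(obj)}
    if isinstance(obj, list):
        return [sort_keys(x) for x in obj]
    return obj
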
Example #4
def run_csv_detective(file_path):
    logger.info("Treating file {}".format(file_path))
    try:
        inspection_results = routine(file_path)
    except Exception as e:
        logger.info(e)
        return

    if len(inspection_results) > 2 and len(inspection_results["columns"]):
        inspection_results["file"] = file_path
        logger.info("%s: %s", file_path, inspection_results)
        return inspection_results
    else:
        logger.info("Analysis output of file {} was empty".format(files_path))
Example #5
def get_csv_detective_metadata(csv_detective_cache: dict, csv_file_path: Path, num_rows=5000):
    """
    Try and get the already computed meta-data of the csv_id passed, whether from a cached dict or calling
    the csv_detective routines
    :param csv_detective_cache: A key:value dict csv_id:csv_detective_info. Or an empty dict.
    :param csv_id: The id of the currently analysed csv file
    :return: The metadata of the csv file
    """
    csv_file_path = Path(csv_file_path)
    csv_id = csv_file_path.stem
    if csv_detective_cache and csv_id in csv_detective_cache:
        return csv_detective_cache[csv_id]
    try:
        dict_result = routine(csv_file_path.as_posix(), num_rows=num_rows)
    except Exception:
        return {}
    csv_detective_cache[csv_id] = dict_result
    with open("./data/csv_detective_analysis.json", "w") as f:
        json.dump(csv_detective_cache, f, indent=4)
    return dict_result
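
A short sketch of how the cache-aware helper above might be used; it assumes a previous run already wrote ./data/csv_detective_analysis.json and otherwise starts from an empty cache (the csv path is illustrative):

import json
from pathlib import Path

cache_path = Path("./data/csv_detective_analysis.json")
# Reuse cached analyses when the file exists, otherwise start empty.
cache = json.loads(cache_path.read_text()) if cache_path.exists() else {}
metadata = get_csv_detective_metadata(cache, "data/example.csv", num_rows=5000)
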
Example #6
# Import the csv_detective package
import os
import json
from pathlib import Path

from csv_detective.explore_csv import routine

# Replace with your own file paths
input_folder = Path() / "tests" / "data"
output_folder = Path() / 'tests' / 'output_data'

for folder in os.listdir(input_folder):
    for file in os.listdir(input_folder / folder):
        # Open your file and run csv_detective
        file_path = input_folder / folder / file
        inspection_results = routine(file_path)

        # Write your file as json
        output_folder_file = output_folder / folder
        if not output_folder_file.exists():
            os.makedirs(output_folder_file)
        output_file_path = output_folder_file / file
        with open(output_file_path.with_suffix('.json'), 'w',
                  encoding="utf8") as fp:
            json.dump(inspection_results, fp, indent=4, separators=(',', ': '))
    if from_cache_only:
        generator = [os.path.join('.cache_csv', x) for x in os.listdir('.cache_csv')]
        # generator = [x for x in generator if '.zip' in x]
    else:
        generator = download_all()

    # zfile = zipfile.ZipFile(generator[0])
    # for name in zfile.namelist():
    #     (dirname, filename) = os.path.split(name)
    #     print "Decompressing " + filename + " on " + dirname
    #     if not os.path.exists(dirname):
    #         os.makedirs(dirname)
    #     zfile.extract(name, dirname)

    # import pdb
    # pdb.set_trace()

    for idx, file_path in enumerate(generator):
        print(idx, end=' ')
        if '.csv' in file_path:
            # Open your file and run csv_detective
            with open(file_path, 'r') as file:
                inspection_results = routine(file, user_input_tests=list_tests)

            # Write your file as json
            json_path = os.path.join('cache_json', os.path.basename(file_path).replace('.csv', '.json'))
            with open(json_path, 'w', encoding='utf-8') as fp:
                json.dump(inspection_results, fp, indent=4, separators=(',', ': '), ensure_ascii=False)

            if erase_csv_cache:
                os.remove(file_path)