def get_dataset_md5(dataset, use_cache=False, debug=True, location=temp_file_location):
    """
    Parameters
    ----------
    dataset : dataset script object
    use_cache : True to use cached data or False to download again
    debug : True to raise errors or False to fail silently
    location : path where temporary files are created while finding the md5

    Returns
    -------
    str : The md5 value of a particular dataset.

    Example
    -------
    >>> for dataset in reload_scripts():
    ...     if dataset.name=='aquatic-animal-excretion':
    ...         print(get_dataset_md5(dataset))
    ...
    683c8adfe780607ac31f58926cf1d326
    """
    try:
        db_name = '{}_sqlite.db'.format(dataset.name.replace('-', '_'))
        workdir = mkdtemp(dir=location)
        os.chdir(workdir)
        engine = sqlite_engine.__new__(sqlite_engine.__class__)
        engine.script_table_registry = {}
        args = {
            'command': 'install',
            'dataset': dataset,
            'file': os.path.join(workdir, db_name),
            'table_name': '{db}_{table}',
            'data_dir': '.'
        }
        engine.opts = args
        engine.use_cache = use_cache
        dataset.download(engine=engine, debug=debug)
        engine.to_csv(sort=False)
        engine.final_cleanup()
        os.remove(os.path.join(workdir, db_name))
        current_md5 = getmd5(os.path.join(file_location, workdir),
                             data_type='dir',
                             encoding=dataset.encoding)
        if not os.path.exists(os.path.join(file_location, 'current', dataset.name)):
            os.makedirs(os.path.join(file_location, 'current', dataset.name))
        for file in os.listdir(workdir):
            move(os.path.join(workdir, file),
                 os.path.join(file_location, 'current', dataset.name))
    finally:
        if os.path.isfile(db_name):
            os.remove(db_name)
        if os.path.exists(os.path.join(HOME_DIR, 'raw_data', dataset.name)):
            rmtree(os.path.join(HOME_DIR, 'raw_data', dataset.name))
        os.chdir(os.path.dirname(file_location))
        rmtree(workdir)
    return current_md5
def test_status_dashboard():
    if not os.path.exists(os.path.join(file_location, 'test_dir')):
        os.makedirs(os.path.join(file_location, 'test_dir'))
    create_dirs(test_files_location)
    os.makedirs(os.path.join(file_location, 'test_dir', 'old', 'sample-dataset'))
    os.chdir(os.path.join(test_files_location, 'old', 'sample-dataset'))
    script_module = get_script_module('sample_dataset')
    sqlite_engine.opts = {
        'install': 'sqlite',
        'file': 'test_db.sqlite3',
        'table_name': '{db}_{table}',
        'data_dir': '.'
    }
    sqlite_engine.use_cache = False
    script_module.download(engine=sqlite_engine)
    script_module.engine.final_cleanup()
    script_module.engine.to_csv()
    os.remove('test_db.sqlite3')

    # Find the md5 of the modified dataset
    setattr(script_module.tables['main'], 'path', modified_dataset_path)
    workdir = mkdtemp(dir=test_files_location)
    os.chdir(workdir)
    sqlite_engine.use_cache = False
    script_module.download(engine=sqlite_engine)
    script_module.engine.final_cleanup()
    script_module.engine.to_csv()
    os.remove('test_db.sqlite3')
    calculated_md5 = getmd5(os.getcwd(), data_type='dir')
    rmtree(workdir)

    # If the md5 of the modified dataset doesn't match the precalculated
    # md5, generate the diff
    if calculated_md5 != precalculated_md5:
        os.chdir(os.path.join(test_files_location, 'current'))
        sqlite_engine.opts = {
            'install': 'sqlite',
            'file': 'test_db_new.sqlite3',
            'table_name': '{db}_{table}',
            'data_dir': '.'
        }
        sqlite_engine.use_cache = False
        script_module.download(sqlite_engine)
        script_module.engine.final_cleanup()
        script_module.engine.to_csv()
        os.remove('test_db_new.sqlite3')
        diff_generator(script_module, location=test_files_location)
    diff_exist = os.path.isfile(
        os.path.join(test_files_location, 'diffs', 'sample_dataset_main.html'))
    csv_exist = os.path.isfile(
        os.path.join(test_files_location, 'old', 'sample-dataset',
                     'sample_dataset_main.csv'))
    os.chdir(file_location)
    rmtree(test_files_location)
    assert diff_exist
    assert csv_exist
def get_dataset_md5(dataset, use_cache=False, debug=True, location=temp_file_location):
    """Get the md5 value of a particular dataset.

    dataset: dataset script object
    use_cache: True to use cached data or False to download again
    debug: True to raise errors or False to fail silently
    location: path where temporary files are created while finding the md5
    """
    try:
        db_name = '{}_sqlite.db'.format(dataset.name.replace('-', '_'))
        workdir = mkdtemp(dir=location)
        os.chdir(workdir)
        engine = sqlite_engine.__new__(sqlite_engine.__class__)
        engine.script_table_registry = {}
        args = {
            'command': 'install',
            'dataset': dataset,
            'file': os.path.join(workdir, db_name),
            'table_name': '{db}_{table}',
            'data_dir': '.'
        }
        engine.opts = args
        engine.use_cache = use_cache
        dataset.download(engine=engine, debug=debug)
        engine.to_csv(sort=False)
        engine.final_cleanup()
        try:
            os.remove(join_path([workdir, db_name]))
        except OSError as error:
            print("There was an error.", error)
        current_md5 = getmd5(workdir, data_type='dir', encoding=dataset.encoding)
        ds = os.path.join(file_location, 'current', dataset.name)
        try:
            if os.path.exists(ds):
                rmtree(ds)
        except OSError as error:
            print(error)
        os.makedirs(ds)
        for file in os.listdir(workdir):
            try:
                move(os.path.join(workdir, file), ds)
            except OSError as error:
                print(error)
    finally:
        if os.path.isfile(db_name):  # delete the database file
            os.remove(db_name)
        delete_raw_data(dataset)  # delete the raw data
        os.chdir(os.path.dirname(file_location))
        rmtree(workdir)  # delete the temp directory
    return current_md5
def test_download_regression(dataset, expected):
    """Test download regression."""
    os.chdir(retriever_root_dir)
    base_path = 'test_raw_data'
    path = os.path.join(base_path, dataset)
    data_dir = "test_temp"
    data = os.path.normpath(os.path.join(retriever_root_dir, path, data_dir))
    rt.download(dataset, path=path)
    current_md5 = getmd5(data=path, data_type='dir')
    assert current_md5 == expected
    shutil.rmtree(base_path)

    # download using path and sub_dir
    os.chdir(retriever_root_dir)
    rt.download(dataset, path=path, sub_dir=data_dir)
    current_md5 = getmd5(data=data, data_type='dir')
    assert current_md5 == expected
    shutil.rmtree(base_path)
def check_dataset(dataset):
    md5 = None
    status = None
    reason = None
    diff = None
    dataset_detail = None
    previous_md5 = ""
    try:
        dataset_detail = load_dataset_details()
        previous_detail_records = "dataset_details" in dataset_detail \
            and dataset_detail["dataset_details"]
        dataset_has_record = dataset.name in dataset_detail['dataset_details']
        if previous_detail_records and dataset_has_record:
            previous_md5 = dataset_detail['dataset_details'][dataset.name]['md5']
        if dataset_type(dataset) == 'spatial':
            install_postgres(dataset)
            dir_path = DATASET_DATA_FOLDER.format(dataset_name=dataset.name)
            md5 = getmd5(dir_path, data_type='dir')
            if not dataset_has_record or md5 != previous_md5:
                diff = diff_generator_spatial(dataset)
            else:
                remove_old_diff(dataset)
                data_shift(dataset, is_spatial=True)
        else:
            md5 = get_dataset_md5(dataset)
            if not dataset_has_record or md5 != previous_md5:
                diff = diff_generator(dataset)
            else:
                remove_old_diff(dataset)
                data_shift(dataset)
        status = True
    except Exception as e:
        reason = str(e)
        status = False
    finally:
        json_file_details = dataset_detail
        json_file_details["dataset_details"][dataset.name] = {
            "md5": md5,
            "status": status,
            "reason": reason,
            "diff": diff
        }
        json_file_details["last_checked_on"] = datetime.now(
            timezone.utc).strftime("%d %b %Y")
        with open(DATASET_DETAIL_JSON, 'w') as dataset_details_write:
            json.dump(json_file_details,
                      dataset_details_write,
                      sort_keys=True,
                      indent=4)
        delete_raw_data(dataset)
def get_csv_md5(dataset, engine, tmpdir, install_function, config, cols=None):
    workdir = tmpdir.mkdtemp()
    workdir.chdir()
    final_direct = os.getcwd()
    engine.script_table_registry = {}
    engine_obj = install_function(dataset.replace('_', '-'), **config)
    time.sleep(5)
    engine_obj.to_csv(select_columns=cols)
    current_md5 = getmd5(data=final_direct, data_type='dir')
    os.chdir(retriever_root_dir)
    return current_md5
def commit_info_for_commit(dataset, commit_message):
    """Generate info for a particular commit."""
    info = {
        "packages": package_details(),
        "time": datetime.now(timezone.utc).strftime("%m/%d/%Y, %H:%M:%S"),
        "version": dataset.version,
        "commit_message": commit_message,
        "script_name": os.path.basename(dataset._file),
    }
    path_to_raw_data = os.path.join(HOME_DIR, "raw_data", dataset.name)
    if os.path.exists(path_to_raw_data):
        info["md5_dataset"] = getmd5(path_to_raw_data, "dir", encoding=ENCODING)
    info["md5_script"] = getmd5(dataset._file, data_type="file", encoding=ENCODING)
    return info
def get_csv_md5(dataset, engine, tmpdir, install_function, config):
    workdir = tmpdir.mkdtemp()
    os.system("cp -r {} {}/".format(os.path.join(retriever_root_dir, 'scripts'),
                                    os.path.join(str(workdir), 'scripts')))
    workdir.chdir()
    script_module = get_script_module(dataset)
    install_function(dataset.replace("_", "-"), **config)
    engine_obj = script_module.checkengine(engine)
    engine_obj.to_csv()
    # need to remove scripts before checking md5 on dir
    os.system("rm -r scripts")
    current_md5 = getmd5(data=str(workdir), data_type='dir')
    return current_md5
def check_dataset(dataset):
    os.chdir(file_location)
    md5 = None
    status = None
    reason = None
    diff = None
    try:
        try:
            with open('dataset_details.json', 'r') as json_file:
                dataset_detail = json.load(json_file)
        except FileNotFoundError:
            with open('dataset_details.json', 'w') as json_file:
                dataset_detail = dict()
                json.dump(dataset_detail, json_file)
        if dataset_type(dataset) == 'spatial':
            workdir = None
            try:
                workdir = mkdtemp(dir=file_location)
                download(dataset.name, path=workdir)
                md5 = getmd5(workdir, data_type='dir')
            except Exception:
                raise
            finally:
                if workdir:
                    rmtree(workdir)
        else:
            md5 = get_dataset_md5(dataset)
        if dataset.name not in dataset_detail \
                or md5 != dataset_detail[dataset.name]['md5']:
            diff = diff_generator(dataset)
        status = True
    except Exception as e:
        reason = str(e)
        status = False
    finally:
        os.chdir(file_location)
        with FileLock('dataset_details.json.lock'):
            with open('dataset_details.json', 'r') as dataset_details_read:
                json_file_details = json.load(dataset_details_read)
            json_file_details[dataset.name] = {
                "md5": md5,
                "status": status,
                "reason": reason,
                "diff": diff
            }
            with open('dataset_details.json', 'w') as dataset_details_write:
                json.dump(json_file_details,
                          dataset_details_write,
                          sort_keys=True,
                          indent=4)
def test_commit_installation(zip_file_name, expected_md5):
    """Install the committed dataset zip into sqlite, then convert it to
    csv and compare its md5 against expected_md5."""
    db_name = 'test_sqlite.db'
    zip_file_path = os.path.join(file_location, "raw_data/dataset-provenance/",
                                 zip_file_name)
    engine = install_sqlite(zip_file_path, file=db_name, force=True)
    workdir = mkdtemp(dir=file_location)
    os.chdir(workdir)
    engine.to_csv()
    os.chdir(file_location)
    if os.path.isfile(db_name):
        os.remove(db_name)
    calculated_md5 = getmd5(workdir, data_type='dir', encoding=ENCODING)
    rmtree(workdir)
    assert calculated_md5 == expected_md5
def get_csv_md5(dataset, engine, tmpdir, install_function, config):
    workdir = tmpdir.mkdtemp()
    src = os.path.join(retriever_root_dir, 'scripts')
    dest = os.path.join(str(workdir), 'scripts')
    subprocess.call(['cp', '-r', src, dest])
    workdir.chdir()
    final_direct = os.getcwd()
    engine.script_table_registry = {}
    engine_obj = install_function(dataset.replace('_', '-'), **config)
    engine_obj.to_csv()
    # need to remove scripts before checking md5 on dir
    subprocess.call(['rm', '-r', 'scripts'])
    current_md5 = getmd5(data=final_direct, data_type='dir')
    os.chdir(retriever_root_dir)
    return current_md5
def get_csv_md5(dataset, engine, tmpdir, install_function, config, cols=None):
    workdir = tmpdir.mkdtemp()
    src = os.path.join(retriever_root_dir, 'scripts')
    dest = os.path.join(str(workdir), 'scripts')
    subprocess.call(['cp', '-r', src, dest])
    workdir.chdir()
    final_direct = os.getcwd()
    engine.script_table_registry = {}
    engine_obj = install_function(dataset.replace('_', '-'), **config)
    time.sleep(5)
    engine_obj.to_csv(select_columns=cols)
    # need to remove scripts before checking md5 on dir
    subprocess.call(['rm', '-r', 'scripts'])
    current_md5 = getmd5(data=final_direct, data_type='dir')
    os.chdir(retriever_root_dir)
    return current_md5
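# A hedged sketch of how the get_csv_md5 helpers above might be driven from a
# parametrized pytest test. `sqlite_engine` and `install_sqlite` are assumed
# to be the module-level names the snippets already use; the config keys and
# the expected md5 placeholder are illustrative, with real values taken from
# a known-good run.
import pytest


@pytest.mark.parametrize("dataset, expected_md5", [
    ("aquatic_animal_excretion", "<md5-from-a-known-good-run>"),
])
def test_csv_regression(dataset, expected_md5, tmpdir):
    config = {'file': 'output_database', 'table_name': '{db}_{table}'}
    current_md5 = get_csv_md5(dataset, sqlite_engine, tmpdir,
                              install_sqlite, config)
    assert current_md5 == expected_md5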
def check_dataset(dataset):
    md5 = None
    status = None
    reason = None
    diff = None
    dataset_detail = None
    try:
        try:
            with open(os.path.join(file_location, "dataset_details.json"), 'r') as json_file:
                dataset_detail = json.load(json_file)
        except (OSError, JSONDecodeError):
            dataset_detail = dict()
            dataset_detail['dataset_details'] = {}
        if dataset_type(dataset) == 'spatial':
            workdir = None
            try:
                workdir = mkdtemp(dir=file_location)
                download(dataset.name, path=workdir)
                md5 = getmd5(workdir, data_type='dir')
            except Exception:
                raise
            finally:
                if workdir:
                    rmtree(workdir)
        else:
            md5 = get_dataset_md5(dataset)
        # look records up under 'dataset_details', where they are written below
        if dataset.name not in dataset_detail['dataset_details'] \
                or md5 != dataset_detail['dataset_details'][dataset.name]['md5']:
            diff = diff_generator(dataset)
        status = True
    except Exception as e:
        reason = str(e)
        status = False
    finally:
        json_file_details = dataset_detail
        json_file_details["dataset_details"][dataset.name] = {
            "md5": md5,
            "status": status,
            "reason": reason,
            "diff": diff
        }
        json_file_details["last_checked_on"] = datetime.now(
            timezone.utc).strftime("%d %b %Y")
        with open(os.path.join(file_location, 'dataset_details.json'), 'w') as dataset_details_write:
            json.dump(json_file_details,
                      dataset_details_write,
                      sort_keys=True,
                      indent=4)
def test_getmd5_path():
    """Test md5 sum calculation given a path to a data source."""
    data_file = create_file(['a,b,c', '1,2,3', '4,5,6'])
    exp_hash = '0bec5bf6f93c547bc9c6774acaf85e1a'
    assert getmd5(data=data_file, data_type='file') == exp_hash
def test_getmd5_line_end():
    """Test md5 sum calculation given lines with end-of-line characters."""
    lines_end = ['a,b,c\n', '1,2,3\n', '4,5,6\n']
    exp_hash = '0bec5bf6f93c547bc9c6774acaf85e1a'
    assert getmd5(data=lines_end, data_type='lines') == exp_hash
def test_getmd5_lines():
    """Test md5 sum calculation given lines without end-of-line characters."""
    lines = ['a,b,c', '1,2,3', '4,5,6']
    exp_hash = 'ca471abda3ebd4ae8ce1b0814b8f470c'
    assert getmd5(data=lines, data_type='lines') == exp_hash
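# Taken together, the three tests above pin down how getmd5 treats 'lines'
# input: newline-terminated lines hash to the same value as the file on disk,
# while bare lines do not. A minimal illustrative sketch of that behaviour,
# assuming getmd5 feeds each line to the hash exactly as given (this helper
# is not the retriever's implementation):
import hashlib


def md5_of_lines(lines):
    md5 = hashlib.md5()
    for line in lines:
        md5.update(line.encode('utf-8'))
    return md5.hexdigest()


# Per the tests above this prints 0bec5bf6f93c547bc9c6774acaf85e1a, matching
# the 'file' hash, because the newline-terminated lines reproduce the file's
# bytes; dropping the newlines yields ca471abda3ebd4ae8ce1b0814b8f470c.
print(md5_of_lines(['a,b,c\n', '1,2,3\n', '4,5,6\n']))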
def test_download_regression(dataset, expected):
    """Test download regression."""
    os.chdir(retriever_root_dir)
    download(dataset, "raw_data/{0}".format(dataset))
    current_md5 = getmd5(data="raw_data/{0}".format(dataset), data_type='dir')
    assert current_md5 == expected
def check_dataset(dataset):
    md5 = None
    status = None
    reason = None
    diff = None
    dataset_detail = None
    try:
        try:
            with open(os.path.join(file_location, "dataset_details.json"), 'r') as json_file:
                dataset_detail = json.load(json_file)
        except (OSError, JSONDecodeError):
            dataset_detail = dict()
            dataset_detail['dataset_details'] = {}
        if dataset_type(dataset) == 'spatial':
            install_postgres(dataset)
            md5 = getmd5(path.join(file_location, 'current', dataset.name),
                         data_type='dir')
            if dataset.name not in dataset_detail['dataset_details'] \
                    or md5 != dataset_detail['dataset_details'][dataset.name]['md5']:
                diff = diff_generator_spatial(dataset)
            else:
                for keys in dataset.tables:
                    file_name = '{}.{}'.format(dataset.name.replace('-', '_'), keys)
                    html_file_name = '{}.html'.format(file_name)
                    if os.path.exists(os.path.join(file_location, 'diffs', html_file_name)):
                        remove(os.path.join(file_location, 'diffs', html_file_name))
                data_shift(dataset, is_spatial=True)
        else:
            md5 = get_dataset_md5(dataset)
            if dataset.name not in dataset_detail['dataset_details'] \
                    or md5 != dataset_detail['dataset_details'][dataset.name]['md5']:
                diff = diff_generator(dataset)
            else:
                for keys in dataset.tables:
                    file_name = '{}_{}'.format(dataset.name.replace('-', '_'), keys)
                    html_file_name = '{}.html'.format(file_name)
                    if os.path.exists(os.path.join(file_location, 'diffs', html_file_name)):
                        remove(os.path.join(file_location, 'diffs', html_file_name))
                data_shift(dataset)
        status = True
    except Exception as e:
        reason = str(e)
        status = False
    finally:
        json_file_details = dataset_detail
        json_file_details["dataset_details"][dataset.name] = {
            "md5": md5,
            "status": status,
            "reason": reason,
            "diff": diff
        }
        json_file_details["last_checked_on"] = datetime.now(
            timezone.utc).strftime("%d %b %Y")
        with open(os.path.join(file_location, 'dataset_details.json'), 'w') as dataset_details_write:
            json.dump(json_file_details,
                      dataset_details_write,
                      sort_keys=True,
                      indent=4)
        if os.path.exists(os.path.join(HOME_DIR, 'raw_data', dataset.name)):
            rmtree(os.path.join(HOME_DIR, 'raw_data', dataset.name))
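# A hedged sketch of the dataset_details.json layout the check_dataset
# variants above write (field values here are illustrative, not real output):
example_details = {
    "dataset_details": {
        "sample-dataset": {
            "md5": "683c8adfe780607ac31f58926cf1d326",
            "status": True,   # False when an exception was caught
            "reason": None,   # str(e) of the caught exception, if any
            "diff": None,     # set by diff_generator* when the md5 changed
        }
    },
    "last_checked_on": "01 Jan 2024",  # "%d %b %Y" in UTC
}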