def dataset_preparation_test():
    """Check that a raw dataset fixture can be copied and preprocessed."""
    here = os.path.dirname(__file__)
    raw_target = os.path.join(utils.get_project_root(),
                              'raw-datasets/unittests-tiny-raw.pickle')
    # Stage the tiny raw fixture where the preprocessing step expects it.
    shutil.copyfile(os.path.join(here, 'data/unittests-tiny-raw.pickle'),
                    raw_target)
    preprocess_dataset.create_preprocessed_dataset(
        raw_target,
        os.path.join(utils.get_project_root(),
                     'preprocessed/small-baseline/data.pickle'),
        [preprocessing.ScaleAndShift()])
def dataset_preparation_test():
    """Smoke test: copy the tiny raw dataset and run preprocessing on it."""
    root = utils.get_project_root()
    fixture = os.path.join(os.path.dirname(__file__),
                           'data/unittests-tiny-raw.pickle')
    target = os.path.join(root, 'raw-datasets/unittests-tiny-raw.pickle')
    shutil.copyfile(fixture, target)
    output_path = os.path.join(root, 'preprocessed/small-baseline/data.pickle')
    queue = [preprocessing.ScaleAndShift()]
    preprocess_dataset.create_preprocessed_dataset(target, output_path, queue)
def test_execution():
    """Exercise the private helpers of the ``view`` module."""
    view._fetch_data_from_server(31, "mysql_online")
    fixture_dir = os.path.dirname(__file__)
    raw_target = os.path.join(utils.get_project_root(),
                              "raw-datasets/unittests-tiny-raw.pickle")
    shutil.copyfile(os.path.join(fixture_dir,
                                 "data/unittests-tiny-raw.pickle"),
                    raw_target)
    view._get_data_from_rawfile(raw_target, 345)  # Is in tiny test set
    view._get_data_from_rawfile(raw_target, 42)  # Is not in tiny test set
    view._list_ids(raw_target)
    model_folder = os.path.join(utils.get_project_root(),
                                "models/small-baseline")
    view._get_system(model_folder)
def execution_test():
    """Exercise the parser and private helpers of the ``view`` module."""
    view.get_parser()
    view._fetch_data_from_server(31, 'mysql_online')
    fixture = os.path.join(os.path.dirname(__file__),
                           'data/unittests-tiny-raw.pickle')
    target = os.path.join(utils.get_project_root(),
                          'raw-datasets/unittests-tiny-raw.pickle')
    shutil.copyfile(fixture, target)
    view._get_data_from_rawfile(target, 345)  # Is in tiny test set
    view._get_data_from_rawfile(target, 42)  # Is not in tiny test set
    view._list_ids(target)
    model_folder = os.path.join(utils.get_project_root(),
                                'models/small-baseline')
    view._get_system(model_folder)
def test_execution_test2():
    """Run the TimeBetweenPointsAndStrokes analysis on the tiny dataset."""
    fixture = os.path.join(os.path.dirname(__file__),
                           "data/unittests-tiny-raw.pickle")
    raw_datasets = os.path.join(utils.get_project_root(),
                                "raw-datasets/unittests-tiny-raw.pickle")
    shutil.copyfile(fixture, raw_datasets)
    dam.TimeBetweenPointsAndStrokes(raw_datasets)
def get_test_results(model_folder, basename, test_file):
    """Evaluate the latest model from ``model_folder`` on ``test_file``.

    Returns the path of the evaluation log file, or ``None`` when no
    model with ``basename`` exists in ``model_folder``.
    """
    model_src = utils.get_latest_model(model_folder, basename)
    if model_src is None:
        logging.error("No model with basename '%s' found in '%s'.",
                      basename, model_folder)
        return None
    _, model_use = tempfile.mkstemp(suffix='.json', text=True)
    utils.create_adjusted_model_for_percentages(model_src, model_use)

    # Start evaluation
    project_root = utils.get_project_root()
    time_prefix = time.strftime("%Y-%m-%d-%H-%M")
    logging.info("Evaluate '%s' with '%s'...", model_src, test_file)
    logfile = os.path.join(project_root,
                           "logs/%s-error-evaluation.log" % time_prefix)
    logging.info('Write log to %s...', logfile)
    with open(logfile, "w") as log, open(model_use, "r") as model_src_p:
        evaluator = subprocess.Popen([utils.get_nntoolkit(), 'run',
                                      '--batch-size', '1', '-f%0.4f',
                                      test_file],
                                     stdin=model_src_p,
                                     stdout=log)
        ret = evaluator.wait()
    if ret != 0:
        logging.error("nntoolkit finished with ret code %s", str(ret))
        sys.exit(-1)
    os.remove(model_use)
    return logfile
def get_test_results(model_folder, basename, test_file):
    """Run the latest model from ``model_folder`` against ``test_file``.

    Returns the path of the evaluation log, or ``None`` when no model
    with ``basename`` exists.
    """
    model_src = utils.get_latest_model(model_folder, basename)
    if model_src is None:
        logger.error(
            f"No model with basename '{basename}' found in '{model_folder}'.")
        return None
    _, model_use = tempfile.mkstemp(suffix=".json", text=True)
    utils.create_adjusted_model_for_percentages(model_src, model_use)

    # Start evaluation
    project_root = utils.get_project_root()
    time_prefix = time.strftime("%Y-%m-%d-%H-%M")
    logger.info(f"Evaluate '{model_src}' with '{test_file}'...")
    logfile = os.path.join(project_root,
                           "logs/%s-error-evaluation.log" % time_prefix)
    logger.info(f"Write log to {logfile}...")
    command = [utils.get_nntoolkit(), "run", "--batch-size", "1",
               "-f%0.4f", test_file]
    with open(logfile, "w") as log, open(model_use) as model_src_p:
        proc = subprocess.Popen(command, stdin=model_src_p, stdout=log)
        ret = proc.wait()
    if ret != 0:
        logger.error(f"nntoolkit finished with ret code {ret}")
        sys.exit(-1)
    os.remove(model_use)
    return logfile
def test_get_latest_model():
    """Check if get_latest_model works."""
    basename = "model"
    # /etc contains no model files, so the lookup must come back empty.
    assert utils.get_latest_model("/etc", basename) is None
    small = os.path.join(utils.get_project_root(), "models/small-baseline")
    utils.get_latest_model(small, basename)
def test_execute_main():
    """Call view.main with each boolean flag toggled in turn."""
    model_small = os.path.join(utils.get_project_root(),
                               "models", "small-baseline")
    flag_sets = [(True, False, False),
                 (False, False, False),
                 (False, True, False),
                 (False, False, True)]
    for first_flag, second_flag, third_flag in flag_sets:
        view.main(first_flag, model_small, second_flag, 31, third_flag,
                  "mysql_online")
def execution_test2():
    """Run the TimeBetweenPointsAndStrokes analysis on the tiny fixture."""
    root = utils.get_project_root()
    raw_datasets = os.path.join(root,
                                'raw-datasets/unittests-tiny-raw.pickle')
    source = os.path.join(os.path.dirname(__file__),
                          'data/unittests-tiny-raw.pickle')
    shutil.copyfile(source, raw_datasets)
    dam.TimeBetweenPointsAndStrokes(raw_datasets)
def get_parser():
    """Return the parser object for this script."""
    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
    archive_path = os.path.join(utils.get_project_root(), "raw-datasets")
    parser = ArgumentParser(description=__doc__,
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument("-d", "--destination",
                        dest="destination",
                        default=archive_path,
                        help="where to write the handwriting_dataset.pickle",
                        type=lambda x: utils.is_valid_folder(parser, x),
                        metavar="FOLDER")
    parser.add_argument("--dataset",
                        dest="dataset",
                        default='all',
                        help="of which symbols do you want the recordings?")
    parser.add_argument("-r", "--renderings",
                        dest="renderings",
                        action="store_true",
                        default=False,
                        help="should the svg renderings be downloaded?")
    parser.add_argument("--dropbox",
                        dest="dropbox",
                        action="store_true",
                        default=False,
                        help="upload to new files to DropBox")
    return parser
def dropbox_upload(filename, directory, client):
    """
    Upload the data to DropBox.

    Parameters
    ----------
    filename : string
        Name of the file that gets uploaded.
    directory : string
        Name of the directory in which the file is that gets uploaded
        (relative to the project root)
    client : a DropBox client object

    Returns
    -------
    str
        A direct-download share URL for the uploaded file.
    """
    local_path = os.path.join(utils.get_project_root(), directory, filename)
    online_path = os.path.join(directory, filename)
    filesize = os.path.getsize(local_path)
    logging.info("Start uploading '%s' (%s)...",
                 filename,
                 utils.sizeof_fmt(filesize))
    with open(local_path, 'rb') as f:
        uploader = client.get_chunked_uploader(f, filesize)
        uploader.upload_chunked()
        uploader.finish(online_path, overwrite=True)
    # Bug fix: str.encode returns bytes on Python 3, so the following
    # str-argument replace() raised a TypeError.  Strip non-ASCII
    # characters, then decode back to str before rewriting the URL.
    url = client.share(online_path, short_url=False)['url']
    url = url.encode('ascii', 'ignore').decode('ascii')
    url = url.replace("?dl=0", "?dl=1")  # force direct download
    return url
def dropbox_upload(filename, directory, client):
    """
    Upload the data to DropBox.

    Parameters
    ----------
    filename : string
        Name of the file that gets uploaded.
    directory : string
        Name of the directory in which the file is that gets uploaded
        (relative to the project root)
    client : a DropBox client object

    Returns
    -------
    str
        A direct-download share URL for the uploaded file.
    """
    local_path = os.path.join(utils.get_project_root(), directory, filename)
    online_path = os.path.join(directory, filename)
    filesize = os.path.getsize(local_path)
    logging.info("Start uploading '%s' (%s)...",
                 filename,
                 utils.sizeof_fmt(filesize))
    with open(local_path, 'rb') as f:
        uploader = client.get_chunked_uploader(f, filesize)
        uploader.upload_chunked()
        uploader.finish(online_path, overwrite=True)
    # Bug fix: on Python 3, encode() yields bytes and the subsequent
    # replace() with str arguments raised a TypeError.  Decode after
    # dropping non-ASCII characters so the URL stays a str.
    url = client.share(online_path, short_url=False)['url']
    url = url.encode('ascii', 'ignore').decode('ascii')
    url = url.replace("?dl=0", "?dl=1")  # force direct download
    return url
def get_test_results(model_folder, basename, test_file):
    """Evaluate the newest model in ``model_folder`` on ``test_file`` and
    return the path of the evaluation log (``None`` if no model found)."""
    model_src = utils.get_latest_model(model_folder, basename)
    if model_src is None:
        logging.error("No model with basename '%s' found in '%s'.",
                      basename, model_folder)
        return None
    _, model_use = tempfile.mkstemp(suffix='.json', text=True)
    utils.create_adjusted_model_for_percentages(model_src, model_use)

    # Start evaluation
    time_prefix = time.strftime("%Y-%m-%d-%H-%M")
    logging.info("Evaluate '%s' with '%s'...", model_src, test_file)
    logfile = os.path.join(utils.get_project_root(),
                           "logs/%s-error-evaluation.log" % time_prefix)
    logging.info('Write log to %s...', logfile)
    command = [utils.get_nntoolkit(), 'run', '--batch-size', '1',
               '-f%0.4f', test_file]
    with open(logfile, "w") as log, open(model_use, "r") as model_src_p:
        ret = subprocess.Popen(command,
                               stdin=model_src_p,
                               stdout=log).wait()
    if ret != 0:
        logging.error("nntoolkit finished with ret code %s", str(ret))
        sys.exit(-1)
    os.remove(model_use)
    return logfile
def get_latest_model_test():
    """Check if get_latest_model works."""
    model_folder = "/etc"
    basename = "model"
    # Plain assert instead of the unmaintained nose.tools helpers;
    # matches the pytest-style assertions used elsewhere in this suite.
    assert utils.get_latest_model(model_folder, basename) is None
    small = os.path.join(utils.get_project_root(), "models/small-baseline")
    utils.get_latest_model(small, basename)
def execute_main_test():
    """Run view.main once per boolean flag combination used by the CLI."""
    model_small = os.path.join(utils.get_project_root(),
                               "models", "small-baseline")
    for flags in ((True, False, False),
                  (False, False, False),
                  (False, True, False),
                  (False, False, True)):
        first, second, third = flags
        view.main(first, model_small, second, 31, third, 'mysql_online')
def _get_default_pickle():
    """Return the path of the newest raw-dataset pickle, or ``None``.

    "Newest" is determined by natural sort order of the file names in
    the project's ``raw-datasets`` folder.
    """
    project_root = utils.get_project_root()
    raw_dir = os.path.join(project_root, "raw-datasets")
    # Bug fix: filter() returns an iterator on Python 3, so the original
    # len(models) call raised a TypeError.  Build a list instead.
    models = [name for name in os.listdir(raw_dir)
              if name.endswith(".pickle")]
    models = natsort.natsorted(models, reverse=True)
    if not models:
        return None
    return os.path.join(raw_dir, models[0])
def get_latest_model_test():
    """Check if get_latest_model works."""
    model_folder = "/etc"
    basename = "model"
    # nose is unmaintained; use a plain assert like the rest of the
    # pytest-style tests in this suite.
    assert utils.get_latest_model(model_folder, basename) is None
    small = os.path.join(utils.get_project_root(), "models/small-baseline")
    utils.get_latest_model(small, basename)
def test_create_translation_file():
    """Test create_ffiles._create_translation_file."""
    feature_folder = os.path.join(utils.get_project_root(),
                                  "feature-files",
                                  "small-baseline")
    # (raw_data_id, latex, formula_id) triple plus its index mapping.
    create_ffiles._create_translation_file(feature_folder,
                                           "testtestdata",
                                           [(133700, "\\alpha", 42)],
                                           {42: 1})
def _get_default_pickle():
    """Return the path of the newest raw-dataset pickle, or ``None``.

    The newest file is picked by natural sort order of the names in the
    ``raw-datasets`` folder under the project root.
    """
    project_root = utils.get_project_root()
    raw_dir = os.path.join(project_root, "raw-datasets")
    # Bug fix: len() on a filter object raises a TypeError on Python 3;
    # materialize the matching names as a list first.
    models = [name for name in os.listdir(raw_dir)
              if name.endswith(".pickle")]
    models = natsort.natsorted(models, reverse=True)
    if not models:
        return None
    return os.path.join(raw_dir, models[0])
def get_recognizer_folders_test():
    """Test if all folders are catched."""
    small = os.path.join(utils.get_project_root(), "models/small-baseline")
    folders = utils.get_recognizer_folders(small)
    wanted_folders = ['preprocessed/small-baseline',
                      'feature-files/small',
                      'models/small-baseline']
    # Plain asserts instead of nose.tools, consistent with the
    # pytest-style variants of this test in the suite.
    for folder, wanted_folder in zip(folders, wanted_folders):
        assert folder.endswith(wanted_folder)
def main(model_folder, aset='test', n=3, merge=True):
    """Main part of the test script.

    Parameters
    ----------
    model_folder : str
        Folder that contains the model description (``info.yml``).
    aset : str
        Which dataset to evaluate: 'test', 'valid', or anything else
        for the training set.
    n : int
        Passed through to ``analyze_results``.
    merge : bool
        Passed through to ``analyze_results``.
    """
    project_root = utils.get_project_root()
    if aset == 'test':
        key_model, key_file = 'testing', 'testdata'
    elif aset == 'valid':
        key_model, key_file = 'validating', 'validdata'
    else:
        key_model, key_file = 'training', 'traindata'

    # Get model description
    model_description_file = os.path.join(model_folder, "info.yml")

    # Read the model description file.  safe_load avoids arbitrary
    # object construction (yaml.load without a Loader is unsafe and
    # deprecated).
    with open(model_description_file, 'r') as ymlfile:
        model_description = yaml.safe_load(ymlfile)

    # Get the data paths (hdf5)
    data = {}
    data['training'] = os.path.join(project_root,
                                    model_description["data-source"],
                                    "traindata.hdf5")
    data['testing'] = os.path.join(project_root,
                                   model_description["data-source"],
                                   "testdata.hdf5")
    data['validating'] = os.path.join(project_root,
                                      model_description["data-source"],
                                      "validdata.hdf5")
    test_data_path = os.path.join(model_folder, data[key_model])
    evaluation_file = get_test_results(model_folder, "model", test_data_path)
    translation_csv = os.path.join(project_root,
                                   model_description["data-source"],
                                   "index2formula_id.csv")
    what_evaluated_file = os.path.join(project_root,
                                       model_description["data-source"],
                                       "translation-%s.csv" % key_file)
    analyze_results(translation_csv, what_evaluated_file, evaluation_file,
                    n, merge)
def test_execution():
    """Test if the functions execute at all."""
    utils.get_project_root()
    utils.get_latest_model(".", "model")
    utils.get_latest_working_model(".")
    utils.get_latest_successful_run(".")
    expected_times = {
        123: "123ms",
        1000 * 30: "30s 0ms",
        1000 * 60: "1 minutes 0s 0ms",
        1000 * 60 * 60: "1h, 0 minutes 0s 0ms",
        2 * 1000 * 60 * 60: "2h, 0 minutes 0s 0ms",
        25 * 1000 * 60 * 60 + 3: "25h, 0 minutes 0s 3ms",
    }
    for milliseconds, readable in expected_times.items():
        assert utils.get_readable_time(milliseconds) == readable
    utils.print_status(3, 1, 123)
    utils.get_nntoolkit()
    utils.get_database_config_file()
    utils.get_database_configuration()
    assert utils.sizeof_fmt(1) == "1.0 bytes"
    assert utils.sizeof_fmt(1111) == "1.1 KB"
def test_get_recognizer_folders():
    """Test if all folders are catched."""
    small = os.path.join(utils.get_project_root(), "models/small-baseline")
    expected_suffixes = ["preprocessed/small-baseline",
                         "feature-files/small",
                         "models/small-baseline"]
    recognizer_folders = utils.get_recognizer_folders(small)
    for folder, suffix in zip(recognizer_folders, expected_suffixes):
        assert folder.endswith(suffix)
def get_recognizer_folders_test():
    """Test if all folders are catched."""
    small = os.path.join(utils.get_project_root(), "models/small-baseline")
    folders = utils.get_recognizer_folders(small)
    wanted_folders = ['preprocessed/small-baseline',
                      'feature-files/small',
                      'models/small-baseline']
    # nose is unmaintained; plain asserts match the pytest-style
    # version of this test elsewhere in the suite.
    for folder, wanted_folder in zip(folders, wanted_folders):
        assert folder.endswith(wanted_folder)
def create_translation_file_test():
    """Test create_ffiles._create_translation_file."""
    project_root = utils.get_project_root()
    feature_folder = os.path.join(project_root,
                                  "feature-files",
                                  "small-baseline")
    translation = [(133700, '\\alpha', 42)]
    index_map = {42: 1}
    create_ffiles._create_translation_file(feature_folder,
                                           "testtestdata",
                                           translation,
                                           index_map)
def main(model_folder, aset='test', n=3, merge=True):
    """Main part of the test script.

    Parameters
    ----------
    model_folder : str
        Folder containing the model description (``info.yml``).
    aset : str
        Dataset to evaluate: 'test', 'valid', or anything else for the
        training set.
    n : int
        Passed through to ``analyze_results``.
    merge : bool
        Passed through to ``analyze_results``.
    """
    project_root = utils.get_project_root()
    if aset == 'test':
        key_model, key_file = 'testing', 'testdata'
    elif aset == 'valid':
        key_model, key_file = 'validating', 'validdata'
    else:
        key_model, key_file = 'training', 'traindata'

    # Get model description
    model_description_file = os.path.join(model_folder, "info.yml")

    # Read the model description file — yaml.load without an explicit
    # Loader is unsafe and deprecated, so use safe_load.
    with open(model_description_file, 'r') as ymlfile:
        model_description = yaml.safe_load(ymlfile)

    # Get the data paths (hdf5)
    data = {}
    data['training'] = os.path.join(project_root,
                                    model_description["data-source"],
                                    "traindata.hdf5")
    data['testing'] = os.path.join(project_root,
                                   model_description["data-source"],
                                   "testdata.hdf5")
    data['validating'] = os.path.join(project_root,
                                      model_description["data-source"],
                                      "validdata.hdf5")
    test_data_path = os.path.join(model_folder, data[key_model])
    evaluation_file = get_test_results(model_folder, "model", test_data_path)
    translation_csv = os.path.join(project_root,
                                   model_description["data-source"],
                                   "index2formula_id.csv")
    what_evaluated_file = os.path.join(project_root,
                                       model_description["data-source"],
                                       "translation-%s.csv" % key_file)
    analyze_results(translation_csv, what_evaluated_file, evaluation_file,
                    n, merge)
def update_if_outdated(folder):
    """Check if the currently watched instance (model, feature or
    preprocessing) is outdated and update it eventually.

    Walks the chain of ``data-source`` entries declared in each
    ``info.yml`` down to the raw pickle file, then rebuilds every stage
    whose last successful run predates its source.
    """
    folders = []
    while os.path.isdir(folder):
        folders.append(folder)
        # Get info.yml — safe_load avoids arbitrary object construction
        # (yaml.load without a Loader is unsafe and deprecated).
        with open(os.path.join(folder, "info.yml")) as ymlfile:
            content = yaml.safe_load(ymlfile)
        folder = os.path.join(utils.get_project_root(),
                              content['data-source'])
    raw_source_file = folder
    if not os.path.isfile(raw_source_file):
        logging.error("File '%s' was not found.", raw_source_file)
        logging.error("You should eventually execute 'hwrt download'.")
        sys.exit(-1)
    dt = os.path.getmtime(raw_source_file)
    source_mtime = datetime.datetime.utcfromtimestamp(dt)
    folders = folders[::-1]  # Reverse order to get the most "basic one first"
    for target_folder in folders:
        target_mtime = utils.get_latest_successful_run(target_folder)
        if target_mtime is None or source_mtime > target_mtime:
            # The source is later than the target. That means we need to
            # refresh the target
            if "preprocessed" in target_folder:
                logging.info("Preprocessed file was outdated. Update...")
                preprocess_dataset.main(
                    os.path.join(utils.get_project_root(), target_folder))
            elif "feature-files" in target_folder:
                logging.info("Feature file was outdated. Update...")
                create_ffiles.main(target_folder)
            elif "model" in target_folder:
                logging.info("Model file was outdated. Update...")
                create_model.main(target_folder, True)
            target_mtime = datetime.datetime.utcnow()
        else:
            logging.info("'%s' is up-to-date.", target_folder)
        source_mtime = target_mtime
def update_if_outdated(folder):
    """Check if the currently watched instance (model, feature or
    preprocessing) is outdated and update it eventually.

    Follows each stage's ``data-source`` down to the raw pickle, then
    refreshes any stage whose last successful run is older than its
    source.
    """
    folders = []
    while os.path.isdir(folder):
        folders.append(folder)
        # Get info.yml; yaml.load without a Loader is unsafe and
        # deprecated, so use safe_load for this plain-data config.
        with open(os.path.join(folder, "info.yml")) as ymlfile:
            content = yaml.safe_load(ymlfile)
        folder = os.path.join(utils.get_project_root(),
                              content['data-source'])
    raw_source_file = folder
    if not os.path.isfile(raw_source_file):
        logging.error("File '%s' was not found.", raw_source_file)
        logging.error("You should eventually execute 'hwrt download'.")
        sys.exit(-1)
    dt = os.path.getmtime(raw_source_file)
    source_mtime = datetime.datetime.utcfromtimestamp(dt)
    folders = folders[::-1]  # Reverse order to get the most "basic one first"
    for target_folder in folders:
        target_mtime = utils.get_latest_successful_run(target_folder)
        if target_mtime is None or source_mtime > target_mtime:
            # The source is later than the target. That means we need to
            # refresh the target
            if "preprocessed" in target_folder:
                logging.info("Preprocessed file was outdated. Update...")
                preprocess_dataset.main(
                    os.path.join(utils.get_project_root(), target_folder))
            elif "feature-files" in target_folder:
                logging.info("Feature file was outdated. Update...")
                create_ffiles.main(target_folder)
            elif "model" in target_folder:
                logging.info("Model file was outdated. Update...")
                create_model.main(target_folder, True)
            target_mtime = datetime.datetime.utcnow()
        else:
            logging.info("'%s' is up-to-date.", target_folder)
        source_mtime = target_mtime
def execution_test():
    """Test if the functions execute at all."""
    utils.get_project_root()
    utils.get_latest_model(".", "model")
    utils.get_latest_working_model(".")
    utils.get_latest_successful_run(".")
    # Plain asserts instead of nose.tools.assert_equal — nose is
    # unmaintained and the rest of the suite uses pytest-style asserts.
    assert utils.get_readable_time(123) == "123ms"
    assert utils.get_readable_time(1000 * 30) == "30s 0ms"
    assert utils.get_readable_time(1000 * 60) == "1 minutes 0s 0ms"
    assert utils.get_readable_time(1000 * 60 * 60) == "1h, 0 minutes 0s 0ms"
    assert (utils.get_readable_time(2 * 1000 * 60 * 60)
            == "2h, 0 minutes 0s 0ms")
    assert (utils.get_readable_time(25 * 1000 * 60 * 60 + 3)
            == "25h, 0 minutes 0s 3ms")
    utils.print_status(3, 1, 123)
    utils.get_nntoolkit()
    utils.get_database_config_file()
    utils.get_database_configuration()
    assert utils.sizeof_fmt(1) == "1.0 bytes"
    assert utils.sizeof_fmt(1111) == "1.1 KB"
def generate_training_command(model_folder):
    """Generate a string that contains a command with all necessary
    parameters to train the model.

    Returns ``None`` when no working model file exists in
    ``model_folder``.
    """
    update_if_outdated(model_folder)
    model_description_file = os.path.join(model_folder, "info.yml")

    # Read the model description file — safe_load avoids arbitrary
    # object construction (yaml.load without a Loader is unsafe and
    # deprecated).
    with open(model_description_file, 'r') as ymlfile:
        model_description = yaml.safe_load(ymlfile)

    # Get the data paths (hdf5 files)
    project_root = utils.get_project_root()
    data = {}
    data['training'] = os.path.join(project_root,
                                    model_description["data-source"],
                                    "traindata.hdf5")
    data['testing'] = os.path.join(project_root,
                                   model_description["data-source"],
                                   "testdata.hdf5")
    data['validating'] = os.path.join(project_root,
                                      model_description["data-source"],
                                      "validdata.hdf5")

    # Get latest model file
    basename = "model"
    latest_model = utils.get_latest_working_model(model_folder)

    if latest_model == "":
        logging.error("There is no model with basename '%s'.", basename)
        return None
    logging.info("Model '%s' found.", latest_model)
    i = int(latest_model.split("-")[-1].split(".")[0])
    model_src = os.path.join(model_folder, "%s-%i.json" % (basename, i))
    model_target = os.path.join(model_folder,
                                "%s-%i.json" % (basename, i + 1))

    # generate the training command
    training = model_description['training']
    training = training.replace("{{testing}}", data['testing'])
    training = training.replace("{{training}}", data['training'])
    training = training.replace("{{validation}}", data['validating'])
    training = training.replace("{{src_model}}", model_src)
    training = training.replace("{{target_model}}", model_target)
    training = training.replace("{{nntoolkit}}", utils.get_nntoolkit())
    return training
def generate_training_command(model_folder):
    """Generate a string that contains a command with all necessary
    parameters to train the model.

    Returns ``None`` when there is no working model in ``model_folder``.
    """
    update_if_outdated(model_folder)
    model_description_file = os.path.join(model_folder, "info.yml")

    # Read the model description file.  yaml.load without an explicit
    # Loader is unsafe and deprecated; info.yml is plain data, so
    # safe_load is the right call.
    with open(model_description_file, 'r') as ymlfile:
        model_description = yaml.safe_load(ymlfile)

    # Get the data paths (hdf5 files)
    project_root = utils.get_project_root()
    data = {}
    data['training'] = os.path.join(project_root,
                                    model_description["data-source"],
                                    "traindata.hdf5")
    data['testing'] = os.path.join(project_root,
                                   model_description["data-source"],
                                   "testdata.hdf5")
    data['validating'] = os.path.join(project_root,
                                      model_description["data-source"],
                                      "validdata.hdf5")

    # Get latest model file
    basename = "model"
    latest_model = utils.get_latest_working_model(model_folder)

    if latest_model == "":
        logging.error("There is no model with basename '%s'.", basename)
        return None
    logging.info("Model '%s' found.", latest_model)
    i = int(latest_model.split("-")[-1].split(".")[0])
    model_src = os.path.join(model_folder, "%s-%i.json" % (basename, i))
    model_target = os.path.join(model_folder,
                                "%s-%i.json" % (basename, i + 1))

    # generate the training command
    training = model_description['training']
    training = training.replace("{{testing}}", data['testing'])
    training = training.replace("{{training}}", data['training'])
    training = training.replace("{{validation}}", data['validating'])
    training = training.replace("{{src_model}}", model_src)
    training = training.replace("{{target_model}}", model_target)
    training = training.replace("{{nntoolkit}}", utils.get_nntoolkit())
    return training
def generate_training_command(model_folder):
    """Build the shell command (all placeholders filled in) that trains
    the model in ``model_folder``, or return ``None`` when no working
    model file exists."""
    update_if_outdated(model_folder)
    with open(os.path.join(model_folder, "info.yml")) as ymlfile:
        model_description = yaml.safe_load(ymlfile)

    # Resolve the hdf5 data paths relative to the project root.
    project_root = utils.get_project_root()
    source = model_description["data-source"]
    data = {
        key: os.path.join(project_root, source, prefix + "data.hdf5")
        for key, prefix in (("training", "train"),
                            ("testing", "test"),
                            ("validating", "valid"))
    }

    # Get latest model file
    basename = "model"
    latest_model = utils.get_latest_working_model(model_folder)
    if latest_model == "":
        logger.error(
            f"There is no model with basename '{basename}' in {model_folder}")
        return None
    logger.info(f"Model '{latest_model}' found.")
    i = int(latest_model.split("-")[-1].split(".")[0])

    # Fill the placeholders of the training template in order.
    substitutions = {
        "{{testing}}": data["testing"],
        "{{training}}": data["training"],
        "{{validation}}": data["validating"],
        "{{src_model}}": os.path.join(model_folder, f"{basename}-{i}.json"),
        "{{target_model}}": os.path.join(model_folder,
                                         f"{basename}-{i + 1}.json"),
        "{{nntoolkit}}": utils.get_nntoolkit(),
    }
    training = model_description["training"]
    for placeholder, value in substitutions.items():
        training = training.replace(placeholder, value)
    return training
def get_parser():
    """Return the parser object for this script."""
    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
    project_root = utils.get_project_root()
    parser = ArgumentParser(description=__doc__,
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "-d", "--destination",
        dest="destination",
        default=os.path.join(project_root, "raw-datasets"),
        help="where to write the handwriting_dataset.pickle",
        type=lambda x: utils.is_valid_folder(parser, x),
        metavar="FOLDER")
    parser.add_argument(
        "--dataset",
        dest="dataset",
        default='all',
        help="of which symbols do you want the recordings?")
    parser.add_argument(
        "-r", "--renderings",
        dest="renderings",
        action="store_true",
        default=False,
        help="should the svg renderings be downloaded?")
    parser.add_argument(
        "--dropbox",
        dest="dropbox",
        action="store_true",
        default=False,
        help="upload to new files to DropBox")
    return parser
def sync_directory(directory):
    """Sync a directory to DropBox.  Return if syncing was successful.

    Parameters
    ----------
    directory : str
        Path (relative to the project root) whose ``*.pickle`` files
        should be uploaded.

    Returns
    -------
    bool
        True on success; False when the Dropbox connection failed or an
        upload did not yield a share URL.
    """
    # Developers should read
    # https://www.dropbox.com/developers/core/start/python
    # before modifying the following code
    cfg = utils.get_project_configuration()

    # Information about files in this folder
    project_root = utils.get_project_root()
    directory_information_file = os.path.join(project_root,
                                              directory,
                                              "info.yml")
    if not os.path.isfile(directory_information_file):
        # create if not exists
        with open(directory_information_file, 'w') as ymlfile:
            ymlfile.write(yaml.dump([]))

    # Dropbox stuff
    APP_KEY = cfg['dropbox_app_key']
    APP_SECRET = cfg['dropbox_app_secret']
    flow = dropbox.client.DropboxOAuth2FlowNoRedirect(APP_KEY, APP_SECRET)
    authorize_url = flow.start()
    webbrowser.open_new_tab(authorize_url)
    print("1. Go to: " + authorize_url)
    print("2. Click 'Allow' (you might have to log in first)")
    print("3. Copy the authorization code.")
    access_token = input_string().strip()
    try:
        # This will fail if the user enters an invalid authorization code
        access_token, user_id = flow.finish(access_token)
        client = dropbox.client.DropboxClient(access_token)
    except Exception as e:
        logging.error("Dropbox connection error: %s", e)
        return False

    # Get all local files
    local_path = os.path.join(project_root, directory)
    files = [f for f in os.listdir(local_path)
             if os.path.isfile(os.path.join(local_path, f))]
    files = filter(lambda n: n.endswith(".pickle"), files)
    new_yaml_content = []

    # upload them
    for filename in files:
        file_meta = {}
        file_meta['filename'] = filename
        file_meta['online_path'] = os.path.join(directory, filename)
        local_path_file = os.path.join(local_path, filename)
        # Bug fix: close the file handle after hashing instead of
        # leaking it (the original opened it without a context manager).
        with open(local_path_file, 'rb') as fhandle:
            file_meta['md5'] = hashlib.md5(fhandle.read()).hexdigest()
        new_yaml_content.append(file_meta)
        file_meta['url'] = dropbox_upload(filename, directory, client)
        if not file_meta['url']:
            return False
    # TODO: Remove all files from Dropbox that are not in local folder

    # Update YAML file
    with open(directory_information_file, 'w') as ymlfile:
        ymlfile.write(yaml.dump(new_yaml_content, default_flow_style=False))
    return True
def main(destination=os.path.join(utils.get_project_root(), "raw-datasets"),
         dataset='all',
         renderings=False):
    """Main part of the backup script.

    Download raw handwriting data from the MySQL server, pickle it to
    ``destination`` and optionally fetch the SVG renderings.

    Parameters
    ----------
    destination : str
        Folder where the pickle file gets written.
    dataset : str
        Which symbols to fetch ('all' or a dataset identifier).
    renderings : bool
        If True, also download the SVG renderings and tar them.
    """
    time_prefix = time.strftime("%Y-%m-%d-%H-%M")
    filename = ("%s-handwriting_datasets-%s-raw.pickle" %
                (time_prefix, dataset.replace('/', '-')))
    destination_path = os.path.join(destination, filename)
    logging.info("Data will be written to '%s'", destination_path)
    cfg = utils.get_database_configuration()
    mysql = cfg['mysql_online']
    connection = pymysql.connect(host=mysql['host'],
                                 user=mysql['user'],
                                 passwd=mysql['passwd'],
                                 db=mysql['db'],
                                 cursorclass=pymysql.cursors.DictCursor)
    cursor = connection.cursor()

    formulas = get_formulas(cursor, dataset)
    logging.info('Received %i formulas.', len(formulas))
    handwriting_datasets = []
    formula_id2latex = {}
    # Go through each formula and download every raw_data instance
    for formula in formulas:
        formula_id2latex[formula['id']] = formula['formula_in_latex']
        sql = (("SELECT `wm_raw_draw_data`.`id`, `data`, `is_in_testset`, "
                "`wild_point_count`, `missing_line`, `user_id`, "
                "`display_name` "
                "FROM `wm_raw_draw_data` "
                "JOIN `wm_users` ON "
                "(`wm_users`.`id` = `wm_raw_draw_data`.`user_id`) "
                "WHERE `accepted_formula_id` = %s "
                # "AND `display_name` LIKE 'MfrDB::%%'"
                ) % str(formula['id']))
        cursor.execute(sql)
        raw_datasets = cursor.fetchall()
        logging.info("%s (%i)",
                     formula['formula_in_latex'],
                     len(raw_datasets))
        for raw_data in raw_datasets:
            try:
                handwriting = HandwrittenData(
                    raw_data['data'],
                    formula['id'],
                    raw_data['id'],
                    formula['formula_in_latex'],
                    raw_data['wild_point_count'],
                    raw_data['missing_line'],
                    raw_data['user_id'],
                    user_name=raw_data['display_name'])
                handwriting_datasets.append(
                    {'handwriting': handwriting,
                     'id': raw_data['id'],
                     'formula_id': formula['id'],
                     'formula_in_latex': formula['formula_in_latex'],
                     'is_in_testset': raw_data['is_in_testset']})
            except Exception as e:
                logging.info("Raw data id: %s", raw_data['id'])
                logging.info(e)
    # Bug fix: open the pickle target with a context manager so the file
    # handle gets flushed and closed deterministically (the original
    # passed an anonymous open() to pickle.dump and never closed it).
    with open(destination_path, "wb") as pickle_file:
        pickle.dump({'handwriting_datasets': handwriting_datasets,
                     'formula_id2latex': formula_id2latex},
                    pickle_file,
                    2)
    if renderings:
        logging.info("Start downloading SVG renderings...")
        svgfolder = tempfile.mkdtemp()
        # For each formula keep only the newest rendering (the self-join
        # filters out rows that have a later creation_time sibling).
        sql = ("SELECT t1.formula_id, t1.svg from wm_renderings t1 "
               "LEFT JOIN wm_renderings t2 "
               "ON t1.formula_id = t2.formula_id "
               "AND t1.creation_time < t2.creation_time "
               "WHERE t2.id is null")
        cursor.execute(sql)
        formulas = cursor.fetchall()
        logging.info("Create svg...")
        for formula in formulas:
            filename = os.path.join(svgfolder,
                                    "%s.svg" % str(formula['formula_id']))
            with open(filename, 'wb') as temp_file:
                temp_file.write(formula['svg'])
        logging.info("Tar at %s", os.path.abspath("renderings.tar"))
        # Context manager guarantees the archive is closed even on error.
        with tarfile.open("renderings.tar.bz2", "w:bz2") as tar:
            for fn in os.listdir(svgfolder):
                filename = os.path.join(svgfolder, fn)
                if os.path.isfile(filename):
                    print(filename)
                    tar.add(filename, arcname=os.path.basename(filename))
def sync_directory(directory):
    """Sync a directory's pickle files to Dropbox.

    Parameters
    ----------
    directory : str
        Directory (relative to the project root) whose '*.pickle' files
        get uploaded.

    Returns
    -------
    bool
        True if syncing was successful, False otherwise (bad auth code,
        connection error, or a failed upload).
    """
    # Developers should read
    # https://www.dropbox.com/developers/core/start/python
    # before modifying the following code
    cfg = utils.get_project_configuration()
    # Information about files in this folder
    project_root = utils.get_project_root()
    directory_information_file = os.path.join(project_root,
                                              directory,
                                              "info.yml")
    if not os.path.isfile(directory_information_file):
        # create if not exists
        with open(directory_information_file, 'w') as ymlfile:
            ymlfile.write(yaml.dump([]))
    # Dropbox stuff: interactive OAuth2 flow — the user pastes the code.
    APP_KEY = cfg['dropbox_app_key']
    APP_SECRET = cfg['dropbox_app_secret']
    flow = dropbox.client.DropboxOAuth2FlowNoRedirect(APP_KEY, APP_SECRET)
    authorize_url = flow.start()
    webbrowser.open_new_tab(authorize_url)
    print("1. Go to: " + authorize_url)
    print("2. Click 'Allow' (you might have to log in first)")
    print("3. Copy the authorization code.")
    access_token = input_string().strip()
    try:
        # This will fail if the user enters an invalid authorization code
        access_token, user_id = flow.finish(access_token)
        client = dropbox.client.DropboxClient(access_token)
    except Exception as e:
        logging.error("Dropbox connection error: %s", e)
        return False
    # Get all local pickle files (the filter step is folded into the
    # comprehension instead of a separate filter() pass).
    local_path = os.path.join(project_root, directory)
    files = [f for f in os.listdir(local_path)
             if os.path.isfile(os.path.join(local_path, f))
             and f.endswith(".pickle")]
    new_yaml_content = []
    # upload them
    for filename in files:
        file_meta = {}
        file_meta['filename'] = filename
        file_meta['online_path'] = os.path.join(directory, filename)
        local_path_file = os.path.join(local_path, filename)
        # Close the handle deterministically
        # (was: hashlib.md5(open(...).read()) — leaked one handle per file).
        with open(local_path_file, 'rb') as f:
            file_meta['md5'] = hashlib.md5(f.read()).hexdigest()
        new_yaml_content.append(file_meta)
        file_meta['url'] = dropbox_upload(filename, directory, client)
        if not file_meta['url']:
            return False
    # TODO: Remove all files from Dropbox that are not in local folder
    # Update YAML file
    with open(directory_information_file, 'w') as ymlfile:
        ymlfile.write(yaml.dump(new_yaml_content, default_flow_style=False))
    return True
def main(model_folder):
    """Export the newest model in *model_folder* as a portable 'model.tar'.

    Writes model.yml, semantics CSVs, preprocessing/features YAML and the
    per-layer HDF5 weight files, tars them up, then deletes the temp files.

    Parameters
    ----------
    model_folder : str
        Path to a folder in which a model (json file) is.
    """
    # safe_load + context manager: the model file is plain structured data,
    # and yaml.load(open(...)) both leaked the handle and allowed arbitrary
    # object construction.
    with open(utils.get_latest_in_folder(model_folder, ".json")) as fp:
        a = yaml.safe_load(fp)
    layers = []
    filenames = ["model.yml", "input_semantics.csv", "output_semantics.csv",
                 "preprocessing.yml", "features.yml"]
    # Create input_semantics.csv
    inputs = a['layers'][0]['_props']['n_visible']
    # csv.writer needs a text-mode file on Python 3 ('wb' raised TypeError);
    # newline='' is the csv-module-documented way to open the target file.
    with open('input_semantics.csv', 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=';',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for row in range(inputs):
            spamwriter.writerow(["inputs %i" % row])
    # Create output_semantics.csv
    outputs = a['layers'][-1]['_props']['n_hidden']
    create_output_semantics(model_folder, outputs)
    # Write one W/b HDF5 pair per layer and record their metadata.
    for layer_index, layer in enumerate(a['layers']):
        W = _as_ndarray(layer['params']['W'])
        Wfile = h5py.File('W%i.hdf5' % layer_index, 'w')
        Wfile.create_dataset(Wfile.id.name, data=W)
        Wfile.close()
        b = _as_ndarray(layer['params']['b'])
        bfile = h5py.File('b%i.hdf5' % layer_index, 'w')
        bfile.create_dataset(bfile.id.name, data=b)
        bfile.close()
        activation = layer['_props']['activation']
        activation = activation.replace('sigmoid', 'Sigmoid')
        activation = activation.replace('softmax', 'Softmax')
        layers.append({'W': {'size': list(W.shape),
                             'filename': 'W%i.hdf5' % layer_index},
                       'b': {'size': list(b.shape),
                             'filename': 'b%i.hdf5' % layer_index},
                       'activation': activation})
        filenames.append('W%i.hdf5' % layer_index)
        filenames.append('b%i.hdf5' % layer_index)
    model = {'type': 'mlp', 'layers': layers}
    with open("model.yml", 'w') as f:
        yaml.dump(model, f, default_flow_style=False)
    logging.info("Get preprocessing.yml")
    # Get model folder
    model_description_file = os.path.join(model_folder, "info.yml")
    with open(model_description_file, 'r') as ymlfile:
        model_description = yaml.safe_load(ymlfile)
    # Get feature folder
    feature_description_file = os.path.join(utils.get_project_root(),
                                            model_description["data-source"],
                                            "info.yml")
    with open(feature_description_file, 'r') as ymlfile:
        feature_description = yaml.safe_load(ymlfile)
    with open("features.yml", 'w') as f:
        yaml.dump(feature_description, f, default_flow_style=False)
    # Get preprocessing folder
    preprocessing_description_file = os.path.join(
        utils.get_project_root(),
        feature_description["data-source"],
        "info.yml")
    with open(preprocessing_description_file, 'r') as ymlfile:
        preprocessing_description = yaml.safe_load(ymlfile)
    with open("preprocessing.yml", 'w') as f:
        yaml.dump(preprocessing_description, f, default_flow_style=False)
    with tarfile.open("model.tar", "w:") as tar:
        for name in filenames:
            tar.add(name)
    # Remove temporary files which are now in tar file
    for filename in filenames:
        os.remove(filename)
def main(model_folder):
    """Export the newest model in *model_folder* as a portable 'model.tar'.

    Writes model.yml, semantics CSVs, preprocessing/features YAML and the
    per-layer HDF5 weight files, tars them up, then deletes the temp files.

    Parameters
    ----------
    model_folder : str
        Path to a folder in which a model (json file) is.
    """
    with open(utils.get_latest_in_folder(model_folder, ".json")) as fp:
        a = yaml.safe_load(fp)
    layers = []
    filenames = [
        "model.yml",
        "input_semantics.csv",
        "output_semantics.csv",
        "preprocessing.yml",
        "features.yml",
    ]
    # Create input_semantics.csv
    inputs = a["layers"][0]["_props"]["n_visible"]
    # csv.writer needs a text-mode file on Python 3 ('wb' raised TypeError);
    # newline="" is the csv-module-documented way to open the target file.
    with open("input_semantics.csv", "w", newline="") as csvfile:
        spamwriter = csv.writer(
            csvfile, delimiter=";", quotechar="|", quoting=csv.QUOTE_MINIMAL
        )
        for row in range(inputs):
            spamwriter.writerow(["inputs %i" % row])
    # Create output_semantics.csv
    outputs = a["layers"][-1]["_props"]["n_hidden"]
    create_output_semantics(model_folder, outputs)
    # Write one W/b HDF5 pair per layer and record their metadata.
    for layer_index, layer in enumerate(a["layers"]):
        w = _as_ndarray(layer["params"]["W"])
        wfile = h5py.File(f"W{layer_index}.hdf5", "w")
        wfile.create_dataset(wfile.id.name, data=w)
        wfile.close()
        b = _as_ndarray(layer["params"]["b"])
        bfile = h5py.File(f"b{layer_index}.hdf5", "w")
        bfile.create_dataset(bfile.id.name, data=b)
        bfile.close()
        activation = layer["_props"]["activation"]
        activation = activation.replace("sigmoid", "Sigmoid")
        activation = activation.replace("softmax", "Softmax")
        layers.append(
            {
                "W": {"size": list(w.shape), "filename": "W%i.hdf5" % layer_index},
                "b": {"size": list(b.shape), "filename": "b%i.hdf5" % layer_index},
                "activation": activation,
            }
        )
        filenames.append(f"W{layer_index}.hdf5")
        filenames.append(f"b{layer_index}.hdf5")
    model = {"type": "mlp", "layers": layers}
    with open("model.yml", "w") as f:
        yaml.dump(model, f, default_flow_style=False)
    logging.info("Get preprocessing.yml")
    # Get model folder
    model_description_file = os.path.join(model_folder, "info.yml")
    with open(model_description_file) as ymlfile:
        model_description = yaml.safe_load(ymlfile)
    # Get feature folder
    feature_description_file = os.path.join(
        utils.get_project_root(), model_description["data-source"], "info.yml"
    )
    with open(feature_description_file) as ymlfile:
        feature_description = yaml.safe_load(ymlfile)
    with open("features.yml", "w") as f:
        yaml.dump(feature_description, f, default_flow_style=False)
    # Get preprocessing folder
    preprocessing_description_file = os.path.join(
        utils.get_project_root(), feature_description["data-source"], "info.yml"
    )
    with open(preprocessing_description_file) as ymlfile:
        preprocessing_description = yaml.safe_load(ymlfile)
    with open("preprocessing.yml", "w") as f:
        yaml.dump(preprocessing_description, f, default_flow_style=False)
    with tarfile.open("model.tar", "w:") as tar:
        for name in filenames:
            tar.add(name)
    # Remove temporary files which are now in tar file
    for filename in filenames:
        os.remove(filename)
def create_report(true_data, eval_data, index2latex, n, merge=True):
    r"""Render an HTML classification-error report into 'reports/'.

    Parameters
    ----------
    true_data : list
        Labels
    eval_data : list
        Predicted labels
    index2latex : dict
        Maps the output neurons index to LaTeX
    n : int
        Number of top predictions in which the true label may appear to
        count as correct (top-n).
    merge : bool
        If set to True, some symbols like \sum and \Sigma will not be
        count as errors when confused.
    """
    # Gather data
    correct = []
    wrong = []
    # Get MER classes
    merge_cfg_path = pkg_resources.resource_filename('hwrt', 'misc/')
    merge_cfg_file = os.path.join(merge_cfg_path, "merge.yml")
    # safe_load + context manager: don't construct arbitrary objects from
    # YAML and don't leak the file handle (was: yaml.load(open(...))).
    with open(merge_cfg_file) as fp:
        merge_data = yaml.safe_load(fp)
    # Make classes
    confusing = make_all(merge_data)
    if not merge:
        confusing = []
    # Get false/true negative/positive for each symbol
    statistical = {}
    possible_keys = []
    assert len(true_data) > 0, "true_data was empty"
    assert len(true_data) == len(eval_data), \
        ("len(true_data)=%i, len(eval_data)=%i" %
         (len(true_data), len(eval_data)))
    for known, evaluated in zip(true_data, eval_data):
        # Dict views are not subscriptable on Python 3
        # (was: evaluated.keys()[0]).
        evaluated_t1 = list(evaluated.keys())[0]
        if known['index'] not in statistical:
            statistical[known['index']] = {
                'FP': 0,
                'TP': 0,
                'FN': 0,
                'TN': 0,
                'latex': index2latex[known['index']]
            }
            possible_keys.append(known['index'])
        for key in evaluated.keys():
            if key not in statistical:
                if key not in index2latex:
                    logging.error(
                        "Key '%s' is not in index2latex. Did you "
                        "probaly define a too small number of "
                        "outputnodes?", str(key))
                    logging.error("index2latex.keys(): %s",
                                  str(index2latex.keys()))
                    sys.exit(-1)
                statistical[key] = {
                    'FP': 0,
                    'TP': 0,
                    'FN': 0,
                    'TN': 0,
                    'latex': index2latex[key]
                }
                possible_keys.append(key)
        # Hoist the top-n key list; also fixes the Python-2-only slicing
        # of a dict view (was: evaluated.keys()[:n]).
        top_n_keys = list(evaluated.keys())[:n]
        if known['index'] in top_n_keys:
            statistical[known['index']]['TP'] += 1
            correct.append(known)
            for key in possible_keys:
                if key != known['index']:
                    statistical[key]['TN'] += 1
        elif (index2latex[known['index']],
              index2latex[evaluated_t1]) in confusing:
            # Some confusions are ok!
            statistical[known['index']]['TP'] += 1
            correct.append(known)
            for key in possible_keys:
                if key != known['index']:
                    statistical[key]['TN'] += 1
        else:
            for key in possible_keys:
                if key != known['index']:
                    if key not in top_n_keys:
                        statistical[key]['TN'] += 1
                    else:
                        statistical[key]['FP'] += 1
                else:
                    statistical[key]['FN'] += 1
            formula_id = index2latex[evaluated_t1]
            known['confused'] = formula_id  # That's an index!
            wrong.append(known)
    classification_error = (len(wrong) / float(len(wrong) + len(correct)))
    logging.info("Classification error (n=%i, MER=%r): %0.4f (%i of %i wrong)",
                 n, merge, classification_error, len(wrong), len(eval_data))
    # Get the data
    errors_by_correct_classification = DefaultOrderedDict(list)
    errors_by_wrong_classification = DefaultOrderedDict(list)
    for el in wrong:
        errors_by_correct_classification[el['latex']].append(el)
        errors_by_wrong_classification[el['confused']].append(el)
    # Sort errors_by_correct_classification
    # (.iteritems() was Python 2 only; .items() works everywhere.
    #  Lambda parameters renamed so they no longer shadow parameter `n`.)
    tmp = sorted(errors_by_correct_classification.items(),
                 key=lambda item: len(item[1]),
                 reverse=True)
    errors_by_correct_classification = OrderedDict(tmp)
    for key in errors_by_correct_classification:
        tmp = sorted(errors_by_correct_classification[key],
                     key=lambda el: el['confused'])
        errors_by_correct_classification[key] = tmp
    # Sort errors_by_wrong_classification
    tmp = sorted(errors_by_wrong_classification.items(),
                 key=lambda item: len(item[1]),
                 reverse=True)
    errors_by_wrong_classification = OrderedDict(tmp)
    for key in errors_by_wrong_classification:
        tmp = sorted(errors_by_wrong_classification[key],
                     key=lambda el: el['latex'])
        errors_by_wrong_classification[key] = tmp
    # Get the template
    project_root = utils.get_project_root()
    template_path = pkg_resources.resource_filename('hwrt', 'templates/')
    template = os.path.join(template_path, "classification-error-report.html")
    with open(template) as f:
        template = f.read()
    # Find right place for report file
    time_prefix = time.strftime("%Y-%m-%d-%H-%M")
    directory = os.path.join(project_root, "reports")
    if not os.path.exists(directory):
        os.makedirs(directory)
    target = os.path.join(project_root,
                          ("reports/"
                           "%s-classification-error-report.html") %
                          time_prefix)
    # Fill the template
    from jinja2 import FileSystemLoader
    from jinja2.environment import Environment
    env = Environment()
    env.loader = FileSystemLoader(template_path)
    t = env.get_template('classification-error-report.html')
    rendered = t.render(
        wrong=wrong,
        correct=correct,
        classification_error=classification_error,
        errors_by_correct_classification=errors_by_correct_classification,
        errors_by_wrong_classification=errors_by_wrong_classification,
        statistical=statistical)
    with open(target, "w") as f:
        f.write(rendered)
def create_report(true_data, eval_data, index2latex, n, merge=True):
    r"""Render an HTML classification-error report into 'reports/'.

    Parameters
    ----------
    true_data : list
        Labels
    eval_data : list
        Predicted labels (each element appears to be a dict of
        index -> score, ordered best-first — TODO confirm)
    index2latex : dict
        Maps the output neurons index to LaTeX
    n : int
        Presumably the number of top predictions in which the true label
        may appear to count as correct (top-n) — verify against callers.
    merge : bool
        If set to True, some symbols like \sum and \Sigma will not be
        count as errors when confused.
    """
    # Gather data
    correct = []
    wrong = []
    # Get MER classes (the merge table ships with the package)
    merge_cfg_path = pkg_resources.resource_filename(__name__, "misc/")
    merge_cfg_file = os.path.join(merge_cfg_path, "merge.yml")
    with open(merge_cfg_file) as fp:
        merge_data = yaml.safe_load(fp)
    # Make classes
    confusing = make_all(merge_data)
    if not merge:
        confusing = []
    # Get false/true negative/positive for each symbol
    statistical = {}
    possible_keys = []
    assert len(true_data) > 0, "true_data was empty"
    assert len(true_data) == len(
        eval_data), "len(true_data)=%i, len(eval_data)=%i" % (
        len(true_data),
        len(eval_data),
    )
    for known, evaluated in zip(true_data, eval_data):
        # Top-1 prediction; relies on the ordering of evaluated's keys.
        evaluated_t1 = list(evaluated.keys())[0]
        # Lazily create the per-symbol counters for the true label ...
        if known["index"] not in statistical:
            statistical[known["index"]] = {
                "FP": 0,
                "TP": 0,
                "FN": 0,
                "TN": 0,
                "latex": index2latex[known["index"]],
            }
            possible_keys.append(known["index"])
        # ... and for every predicted label we encounter.
        for key in list(evaluated.keys()):
            if key not in statistical:
                if key not in index2latex:
                    logger.error(f"Key '{key}' is not in index2latex. Did you "
                                 "probaly define a too small number of "
                                 "outputnodes?")
                    logger.error(f"index2latex.keys(): {index2latex.keys()}")
                    sys.exit(-1)
                statistical[key] = {
                    "FP": 0,
                    "TP": 0,
                    "FN": 0,
                    "TN": 0,
                    "latex": index2latex[key],
                }
                possible_keys.append(key)
        # Correct if the true label is among the top-n predictions.
        if known["index"] in list(evaluated.keys())[:n]:
            statistical[known["index"]]["TP"] += 1
            correct.append(known)
            for key in possible_keys:
                if key != known["index"]:
                    statistical[key]["TN"] += 1
        elif (index2latex[known["index"]], index2latex[evaluated_t1]) in confusing:
            # Some confusions are ok!
            statistical[known["index"]]["TP"] += 1
            correct.append(known)
            for key in possible_keys:
                if key != known["index"]:
                    statistical[key]["TN"] += 1
        else:
            # Misclassification: book FN for the true label, FP for the
            # wrongly-predicted top-n labels, TN for everything else.
            for key in possible_keys:
                if key != known["index"]:
                    if key not in list(evaluated.keys())[:n]:
                        statistical[key]["TN"] += 1
                    else:
                        statistical[key]["FP"] += 1
                else:
                    statistical[key]["FN"] += 1
            formula_id = index2latex[evaluated_t1]
            # NOTE: mutates the caller's dict.
            known["confused"] = formula_id  # That's an index!
            wrong.append(known)
    classification_error = len(wrong) / float(len(wrong) + len(correct))
    logger.info(
        f"Classification error (n={n}, MER={merge}): "
        f"{classification_error:0.4f} ({len(wrong)} of {len(eval_data)} wrong)",
    )
    # Get the data: group errors by true label and by predicted label.
    errors_by_correct_classification = DefaultOrderedDict(list)
    errors_by_wrong_classification = DefaultOrderedDict(list)
    for el in wrong:
        errors_by_correct_classification[el["latex"]].append(el)
        errors_by_wrong_classification[el["confused"]].append(el)
    # Sort errors_by_correct_classification (most frequent first)
    tmp = sorted(
        iter(errors_by_correct_classification.items()),
        key=lambda n: len(n[1]),
        reverse=True,
    )
    errors_by_correct_classification = OrderedDict(tmp)
    for key in errors_by_correct_classification:
        tmp = sorted(errors_by_correct_classification[key], key=lambda n: n["confused"])
        errors_by_correct_classification[key] = tmp
    # Sort errors_by_wrong_classification (most frequent first)
    tmp = sorted(
        iter(errors_by_wrong_classification.items()),
        key=lambda n: len(n[1]),
        reverse=True,
    )
    errors_by_wrong_classification = OrderedDict(tmp)
    for key in errors_by_wrong_classification:
        tmp = sorted(errors_by_wrong_classification[key], key=lambda n: n["latex"])
        errors_by_wrong_classification[key] = tmp
    # Get the template
    project_root = utils.get_project_root()
    template_path = pkg_resources.resource_filename("hwrt", "templates/")
    template = os.path.join(template_path, "classification-error-report.html")
    with open(template) as f:
        template = f.read()
    # Find right place for report file
    time_prefix = time.strftime("%Y-%m-%d-%H-%M")
    directory = os.path.join(project_root, "reports")
    if not os.path.exists(directory):
        os.makedirs(directory)
    target = os.path.join(
        project_root, f"reports/{time_prefix}-classification-error-report.html")
    # Fill the template
    # Third party modules
    from jinja2 import FileSystemLoader
    from jinja2.environment import Environment

    env = Environment()
    env.loader = FileSystemLoader(template_path)
    t = env.get_template("classification-error-report.html")
    rendered = t.render(
        wrong=wrong,
        correct=correct,
        classification_error=classification_error,
        errors_by_correct_classification=errors_by_correct_classification,
        errors_by_wrong_classification=errors_by_wrong_classification,
        statistical=statistical,
    )
    with open(target, "w") as f:
        f.write(rendered)
def main(destination=os.path.join(utils.get_project_root(), "raw-datasets"),
         dataset='all',
         renderings=False):
    """Back up raw handwriting data from the MySQL server to a pickle file.

    Parameters
    ----------
    destination : str
        Directory into which the timestamped pickle file is written.
    dataset : str
        Which dataset to fetch; 'all' downloads everything.
    renderings : bool
        If True, additionally download the newest SVG rendering per formula
        and bundle them into 'renderings.tar.bz2' in the working directory.
    """
    time_prefix = time.strftime("%Y-%m-%d-%H-%M")
    filename = ("%s-handwriting_datasets-%s-raw.pickle" %
                (time_prefix, dataset.replace('/', '-')))
    destination_path = os.path.join(destination, filename)
    logging.info("Data will be written to '%s'", destination_path)
    cfg = utils.get_database_configuration()
    mysql = cfg['mysql_online']
    connection = pymysql.connect(host=mysql['host'],
                                 user=mysql['user'],
                                 passwd=mysql['passwd'],
                                 db=mysql['db'],
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        cursor = connection.cursor()
        formulas = get_formulas(cursor, dataset)
        logging.info('Received %i formulas.', len(formulas))
        handwriting_datasets = []
        formula_id2latex = {}
        # Go through each formula and download every raw_data instance
        for formula in formulas:
            formula_id2latex[formula['id']] = formula['formula_in_latex']
            sql = ("SELECT `wm_raw_draw_data`.`id`, `data`, `is_in_testset`, "
                   "`wild_point_count`, `missing_line`, `user_id`, "
                   "`display_name` "
                   "FROM `wm_raw_draw_data` "
                   "JOIN `wm_users` ON "
                   "(`wm_users`.`id` = `wm_raw_draw_data`.`user_id`) "
                   "WHERE `accepted_formula_id` = %s "
                   # "AND `display_name` LIKE 'MfrDB::%%'"
                   )
            # Parameterized query instead of %-interpolating the id into SQL.
            cursor.execute(sql, (formula['id'],))
            raw_datasets = cursor.fetchall()
            logging.info("%s (%i)", formula['formula_in_latex'],
                         len(raw_datasets))
            for raw_data in raw_datasets:
                try:
                    handwriting = HandwrittenData(
                        raw_data['data'],
                        formula['id'],
                        raw_data['id'],
                        formula['formula_in_latex'],
                        raw_data['wild_point_count'],
                        raw_data['missing_line'],
                        raw_data['user_id'],
                        user_name=raw_data['display_name'])
                    handwriting_datasets.append({
                        'handwriting': handwriting,
                        'id': raw_data['id'],
                        'formula_id': formula['id'],
                        'formula_in_latex': formula['formula_in_latex'],
                        'is_in_testset': raw_data['is_in_testset']
                    })
                except Exception as e:
                    # Best-effort: skip malformed records, but log which one.
                    logging.info("Raw data id: %s", raw_data['id'])
                    logging.info(e)
        # Close the pickle file deterministically
        # (was: pickle.dump(..., open(path, "wb"), 2) — leaked the handle).
        with open(destination_path, "wb") as f:
            pickle.dump({'handwriting_datasets': handwriting_datasets,
                         'formula_id2latex': formula_id2latex},
                        f,
                        2)
        if renderings:
            logging.info("Start downloading SVG renderings...")
            svgfolder = tempfile.mkdtemp()
            # Self-join keeps only the newest rendering per formula_id.
            sql = """SELECT t1.formula_id, t1.svg from wm_renderings t1
                     LEFT JOIN wm_renderings t2
                     ON t1.formula_id = t2.formula_id
                        AND t1.creation_time < t2.creation_time
                     WHERE t2.id is null"""
            cursor.execute(sql)
            formulas = cursor.fetchall()
            logging.info("Create svg...")
            for formula in formulas:
                filename = os.path.join(svgfolder,
                                        "%s.svg" % str(formula['formula_id']))
                with open(filename, 'wb') as temp_file:
                    temp_file.write(formula['svg'])
            # Log the actual archive name (the old message said
            # 'renderings.tar' although the file is 'renderings.tar.bz2').
            logging.info("Tar at %s", os.path.abspath("renderings.tar.bz2"))
            with tarfile.open("renderings.tar.bz2", "w:bz2") as tar:
                for fn in os.listdir(svgfolder):
                    filename = os.path.join(svgfolder, fn)
                    if os.path.isfile(filename):
                        print(filename)
                        tar.add(filename,
                                arcname=os.path.basename(filename))
    finally:
        # Always release the database connection.
        connection.close()
def test_get_parameters():
    """Smoke test: read the preprocessing parameters of the small baseline."""
    # TODO: nose.proxy.UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80
    # in position 0: invalid start byte
    dataset_path = os.path.join(utils.get_project_root(),
                                "preprocessed/small-baseline")
    preprocess_dataset.get_parameters(dataset_path)
def create_report(true_data, eval_data, index2latex, n, merge=True):
    r"""Render an HTML classification-error report into 'reports/'.

    Parameters
    ----------
    true_data : list
        Labels
    eval_data : list
        Predicted labels
    index2latex : dict
        Maps the output neurons index to LaTeX
    n : int
        Number of top predictions in which the true label may appear to
        count as correct (top-n).
    merge : bool
        If set to True, some symbols like \sum and \Sigma will not be
        count as errors when confused.
    """
    # Gather data
    correct = []
    wrong = []
    # Get MER classes
    merge_cfg_path = pkg_resources.resource_filename('hwrt', 'misc/')
    merge_cfg_file = os.path.join(merge_cfg_path, "merge.yml")
    # safe_load + context manager: don't construct arbitrary objects from
    # YAML and don't leak the file handle (was: yaml.load(open(...))).
    with open(merge_cfg_file) as fp:
        merge_data = yaml.safe_load(fp)
    # Make classes
    confusing = make_all(merge_data)
    if not merge:
        confusing = []
    # Get false/true negative/positive for each symbol
    statistical = {}
    possible_keys = []
    assert len(true_data) > 0, "true_data was empty"
    assert len(true_data) == len(eval_data), \
        ("len(true_data)=%i, len(eval_data)=%i" %
         (len(true_data), len(eval_data)))
    for known, evaluated in zip(true_data, eval_data):
        # Dict views are not subscriptable on Python 3
        # (was: evaluated.keys()[0]).
        evaluated_t1 = list(evaluated.keys())[0]
        if known['index'] not in statistical:
            statistical[known['index']] = {'FP': 0,
                                           'TP': 0,
                                           'FN': 0,
                                           'TN': 0,
                                           'latex': index2latex[known['index']]}
            possible_keys.append(known['index'])
        for key in evaluated.keys():
            if key not in statistical:
                if key not in index2latex:
                    logging.error("Key '%s' is not in index2latex. Did you "
                                  "probaly define a too small number of "
                                  "outputnodes?", str(key))
                    logging.error("index2latex.keys(): %s",
                                  str(index2latex.keys()))
                    sys.exit(-1)
                statistical[key] = {'FP': 0,
                                    'TP': 0,
                                    'FN': 0,
                                    'TN': 0,
                                    'latex': index2latex[key]}
                possible_keys.append(key)
        # Hoist the top-n key list; also fixes the Python-2-only slicing
        # of a dict view (was: evaluated.keys()[:n]).
        top_n_keys = list(evaluated.keys())[:n]
        if known['index'] in top_n_keys:
            statistical[known['index']]['TP'] += 1
            correct.append(known)
            for key in possible_keys:
                if key != known['index']:
                    statistical[key]['TN'] += 1
        elif (index2latex[known['index']],
              index2latex[evaluated_t1]) in confusing:
            # Some confusions are ok!
            statistical[known['index']]['TP'] += 1
            correct.append(known)
            for key in possible_keys:
                if key != known['index']:
                    statistical[key]['TN'] += 1
        else:
            for key in possible_keys:
                if key != known['index']:
                    if key not in top_n_keys:
                        statistical[key]['TN'] += 1
                    else:
                        statistical[key]['FP'] += 1
                else:
                    statistical[key]['FN'] += 1
            formula_id = index2latex[evaluated_t1]
            known['confused'] = formula_id  # That's an index!
            wrong.append(known)
    classification_error = (len(wrong) / float(len(wrong) + len(correct)))
    logging.info("Classification error (n=%i, MER=%r): %0.4f (%i of %i wrong)",
                 n, merge, classification_error, len(wrong), len(eval_data))
    # Get the data
    errors_by_correct_classification = DefaultOrderedDict(list)
    errors_by_wrong_classification = DefaultOrderedDict(list)
    for el in wrong:
        errors_by_correct_classification[el['latex']].append(el)
        errors_by_wrong_classification[el['confused']].append(el)
    # Sort errors_by_correct_classification
    # (.iteritems() was Python 2 only; .items() works everywhere.
    #  Lambda parameters renamed so they no longer shadow parameter `n`.)
    tmp = sorted(errors_by_correct_classification.items(),
                 key=lambda item: len(item[1]),
                 reverse=True)
    errors_by_correct_classification = OrderedDict(tmp)
    for key in errors_by_correct_classification:
        tmp = sorted(errors_by_correct_classification[key],
                     key=lambda el: el['confused'])
        errors_by_correct_classification[key] = tmp
    # Sort errors_by_wrong_classification
    tmp = sorted(errors_by_wrong_classification.items(),
                 key=lambda item: len(item[1]),
                 reverse=True)
    errors_by_wrong_classification = OrderedDict(tmp)
    for key in errors_by_wrong_classification:
        tmp = sorted(errors_by_wrong_classification[key],
                     key=lambda el: el['latex'])
        errors_by_wrong_classification[key] = tmp
    # Get the template
    project_root = utils.get_project_root()
    template_path = pkg_resources.resource_filename('hwrt', 'templates/')
    template = os.path.join(template_path, "classification-error-report.html")
    with open(template) as f:
        template = f.read()
    # Find right place for report file
    time_prefix = time.strftime("%Y-%m-%d-%H-%M")
    directory = os.path.join(project_root, "reports")
    if not os.path.exists(directory):
        os.makedirs(directory)
    target = os.path.join(project_root,
                          ("reports/"
                           "%s-classification-error-report.html") %
                          time_prefix)
    # Fill the template
    from jinja2 import FileSystemLoader
    from jinja2.environment import Environment
    env = Environment()
    env.loader = FileSystemLoader(template_path)
    t = env.get_template('classification-error-report.html')
    rendered = t.render(
        wrong=wrong,
        correct=correct,
        classification_error=classification_error,
        errors_by_correct_classification=errors_by_correct_classification,
        errors_by_wrong_classification=errors_by_wrong_classification,
        statistical=statistical)
    with open(target, "w") as f:
        f.write(rendered)
def get_parameters_test():
    """Smoke test: read the preprocessing parameters of the small baseline."""
    # TODO: nose.proxy.UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80
    # in position 0: invalid start byte
    dataset_path = os.path.join(utils.get_project_root(),
                                "preprocessed/small-baseline")
    preprocess_dataset.get_parameters(dataset_path)
def execution_test():
    """Smoke test: build the small baseline model end to end."""
    model_path = os.path.join(utils.get_project_root(),
                              "models/small-baseline")
    create_model.main(model_path)