def dataset_preparation_test():
    d = os.path.dirname(__file__)
    target = os.path.join(utils.get_project_root(),
                          'raw-datasets/unittests-tiny-raw.pickle')
    shutil.copyfile(os.path.join(d, 'data/unittests-tiny-raw.pickle'), target)
    preprocess_dataset.create_preprocessed_dataset(
        target,
        os.path.join(utils.get_project_root(),
                     'preprocessed/small-baseline/data.pickle'),
        [preprocessing.ScaleAndShift()])
Example #2
def dataset_preparation_test():
    d = os.path.dirname(__file__)
    target = os.path.join(utils.get_project_root(),
                          'raw-datasets/unittests-tiny-raw.pickle')
    shutil.copyfile(os.path.join(d, 'data/unittests-tiny-raw.pickle'),
                    target)
    preprocess_dataset.create_preprocessed_dataset(
        target,
        os.path.join(utils.get_project_root(),
                     'preprocessed/small-baseline/data.pickle'),
        [preprocessing.ScaleAndShift()])
Example #3
def test_execution():
    view._fetch_data_from_server(31, "mysql_online")

    d = os.path.dirname(__file__)
    target = os.path.join(utils.get_project_root(),
                          "raw-datasets/unittests-tiny-raw.pickle")
    shutil.copyfile(os.path.join(d, "data/unittests-tiny-raw.pickle"), target)
    view._get_data_from_rawfile(target, 345)  # Is in tiny test set
    view._get_data_from_rawfile(target, 42)  # Is not in tiny test set
    view._list_ids(target)
    model_folder = os.path.join(utils.get_project_root(),
                                "models/small-baseline")
    view._get_system(model_folder)
Example #4
def execution_test():
    view.get_parser()
    view._fetch_data_from_server(31, 'mysql_online')

    d = os.path.dirname(__file__)
    target = os.path.join(utils.get_project_root(),
                          'raw-datasets/unittests-tiny-raw.pickle')
    shutil.copyfile(os.path.join(d, 'data/unittests-tiny-raw.pickle'),
                    target)
    view._get_data_from_rawfile(target, 345)  # Is in tiny test set
    view._get_data_from_rawfile(target, 42)  # Is not in tiny test set
    view._list_ids(target)
    model_folder = os.path.join(utils.get_project_root(),
                                'models/small-baseline')
    view._get_system(model_folder)
Example #5
def test_execution_test2():
    d = os.path.dirname(__file__)
    raw_datasets = os.path.join(utils.get_project_root(),
                                "raw-datasets/unittests-tiny-raw.pickle")
    shutil.copyfile(os.path.join(d, "data/unittests-tiny-raw.pickle"),
                    raw_datasets)
    dam.TimeBetweenPointsAndStrokes(raw_datasets)
Example #6
def get_test_results(model_folder, basename, test_file):
    model_src = utils.get_latest_model(model_folder, basename)
    if model_src is None:
        logging.error("No model with basename '%s' found in '%s'.", basename,
                      model_folder)
    else:
        _, model_use = tempfile.mkstemp(suffix='.json', text=True)
        utils.create_adjusted_model_for_percentages(model_src, model_use)

        # Start evaluation
        project_root = utils.get_project_root()
        time_prefix = time.strftime("%Y-%m-%d-%H-%M")
        logging.info("Evaluate '%s' with '%s'...", model_src, test_file)
        logfile = os.path.join(project_root,
                               "logs/%s-error-evaluation.log" % time_prefix)
        logging.info('Write log to %s...', logfile)
        with open(logfile, "w") as log, open(model_use, "r") as model_src_p:
            p = subprocess.Popen([
                utils.get_nntoolkit(), 'run', '--batch-size', '1', '-f%0.4f',
                test_file
            ],
                                 stdin=model_src_p,
                                 stdout=log)
            ret = p.wait()
            if ret != 0:
                logging.error("nntoolkit finished with ret code %s", str(ret))
                sys.exit(-1)
        os.remove(model_use)
        return logfile
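A brief usage sketch, hedged: the exact location of the test file depends on the model's data-source, so the path below is illustrative only.

model_folder = os.path.join(utils.get_project_root(), "models/small-baseline")
test_file = os.path.join(utils.get_project_root(),
                         "feature-files/small-baseline/testdata.hdf5")  # assumed path
logfile = get_test_results(model_folder, "model", test_file)
if logfile is not None:
    print("Evaluation log written to", logfile)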
Example #7
def get_test_results(model_folder, basename, test_file):
    model_src = utils.get_latest_model(model_folder, basename)
    if model_src is None:
        logger.error(
            f"No model with basename '{basename}' found in '{model_folder}'.")
    else:
        _, model_use = tempfile.mkstemp(suffix=".json", text=True)
        utils.create_adjusted_model_for_percentages(model_src, model_use)

        # Start evaluation
        project_root = utils.get_project_root()
        time_prefix = time.strftime("%Y-%m-%d-%H-%M")
        logger.info(f"Evaluate '{model_src}' with '{test_file}'...")
        logfile = os.path.join(project_root,
                               "logs/%s-error-evaluation.log" % time_prefix)
        logger.info(f"Write log to {logfile}...")
        with open(logfile, "w") as log, open(model_use) as model_src_p:
            p = subprocess.Popen(
                [
                    utils.get_nntoolkit(),
                    "run",
                    "--batch-size",
                    "1",
                    "-f%0.4f",
                    test_file,
                ],
                stdin=model_src_p,
                stdout=log,
            )
            ret = p.wait()
            if ret != 0:
                logger.error(f"nntoolkit finished with ret code {ret}")
                sys.exit(-1)
        os.remove(model_use)
        return logfile
Example #8
def test_get_latest_model():
    """Check if get_latest_model works."""
    model_folder = "/etc"
    basename = "model"
    assert utils.get_latest_model(model_folder, basename) is None
    small = os.path.join(utils.get_project_root(), "models/small-baseline")
    utils.get_latest_model(small, basename)
Example #9
def test_execute_main():
    model_small = os.path.join(utils.get_project_root(), "models",
                               "small-baseline")
    view.main(True, model_small, False, 31, False, "mysql_online")
    view.main(False, model_small, False, 31, False, "mysql_online")
    view.main(False, model_small, True, 31, False, "mysql_online")
    view.main(False, model_small, False, 31, True, "mysql_online")
Example #10
def execution_test2():
    d = os.path.dirname(__file__)
    raw_datasets = os.path.join(utils.get_project_root(),
                                'raw-datasets/unittests-tiny-raw.pickle')
    shutil.copyfile(os.path.join(d, 'data/unittests-tiny-raw.pickle'),
                    raw_datasets)
    dam.TimeBetweenPointsAndStrokes(raw_datasets)
Example #11
def get_parser():
    """Return the parser object for this script."""
    project_root = utils.get_project_root()
    archive_path = os.path.join(project_root, "raw-datasets")
    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
    parser = ArgumentParser(description=__doc__,
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument("-d",
                        "--destination",
                        dest="destination",
                        default=archive_path,
                        help="where to write the handwriting_dataset.pickle",
                        type=lambda x: utils.is_valid_folder(parser, x),
                        metavar="FOLDER")
    parser.add_argument("--dataset",
                        dest="dataset",
                        default='all',
                        help=("of which symbols do you want the recordings?"))
    parser.add_argument("-r",
                        "--renderings",
                        dest="renderings",
                        action="store_true",
                        default=False,
                        help=("should the svg renderings be downloaded?"))
    parser.add_argument("--dropbox",
                        dest="dropbox",
                        action="store_true",
                        default=False,
                        help=("upload to new files to DropBox"))
    return parser
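A minimal sketch of how this parser might be wired to the download logic; the main(destination=..., dataset=..., renderings=...) signature is taken from Example #36, while the __main__ guard and the omission of --dropbox handling are assumptions.

if __name__ == "__main__":
    args = get_parser().parse_args()
    main(destination=args.destination,
         dataset=args.dataset,
         renderings=args.renderings)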
Example #12
def dropbox_upload(filename, directory, client):
    """
    Upload the data to DropBox.

    Parameters
    ----------
    filename : string
        Name of the file that gets uploaded.
    directory : string
        Name of the directory that contains the file to upload (relative
        to the project root).
    client :
        a DropBox client object
    """
    local_path = os.path.join(utils.get_project_root(), directory, filename)
    online_path = os.path.join(directory, filename)
    filesize = os.path.getsize(local_path)
    logging.info("Start uploading '%s' (%s)...",
                 filename,
                 utils.sizeof_fmt(filesize))
    with open(local_path, 'rb') as f:
        uploader = client.get_chunked_uploader(f, filesize)
        uploader.upload_chunked()
        uploader.finish(online_path, overwrite=True)
    url = client.share(online_path,
                       short_url=False)['url'].encode('ascii', 'ignore')
    url = url.replace("?dl=0", "?dl=1")
    return url
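A hedged usage sketch: Example #35 obtains the client through an OAuth2 flow with the legacy dropbox.client.DropboxClient API, so the token below is only a placeholder.

client = dropbox.client.DropboxClient("ACCESS_TOKEN")  # placeholder token
url = dropbox_upload("unittests-tiny-raw.pickle", "raw-datasets", client)
print(url)  # share link rewritten to a direct download ("?dl=1")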
Example #13
def dropbox_upload(filename, directory, client):
    """
    Upload the data to DropBox.

    Parameters
    ----------
    filename : string
        Name of the file that gets uploaded.
    directory : string
        Name of the directory that contains the file to upload (relative
        to the project root).
    client :
        a DropBox client object
    """
    local_path = os.path.join(utils.get_project_root(), directory, filename)
    online_path = os.path.join(directory, filename)
    filesize = os.path.getsize(local_path)
    logging.info("Start uploading '%s' (%s)...", filename,
                 utils.sizeof_fmt(filesize))
    with open(local_path, 'rb') as f:
        uploader = client.get_chunked_uploader(f, filesize)
        uploader.upload_chunked()
        uploader.finish(online_path, overwrite=True)
    url = client.share(online_path,
                       short_url=False)['url'].encode('ascii', 'ignore')
    url = url.replace("?dl=0", "?dl=1")
    return url
Example #14
def get_test_results(model_folder, basename, test_file):
    model_src = utils.get_latest_model(model_folder, basename)
    if model_src is None:
        logging.error("No model with basename '%s' found in '%s'.",
                      basename,
                      model_folder)
    else:
        _, model_use = tempfile.mkstemp(suffix='.json', text=True)
        utils.create_adjusted_model_for_percentages(model_src, model_use)

        # Start evaluation
        project_root = utils.get_project_root()
        time_prefix = time.strftime("%Y-%m-%d-%H-%M")
        logging.info("Evaluate '%s' with '%s'...", model_src, test_file)
        logfile = os.path.join(project_root,
                               "logs/%s-error-evaluation.log" %
                               time_prefix)
        logging.info('Write log to %s...', logfile)
        with open(logfile, "w") as log, open(model_use, "r") as model_src_p:
            p = subprocess.Popen([utils.get_nntoolkit(), 'run',
                                  '--batch-size', '1', '-f%0.4f', test_file],
                                 stdin=model_src_p,
                                 stdout=log)
            ret = p.wait()
            if ret != 0:
                logging.error("nntoolkit finished with ret code %s", str(ret))
                sys.exit(-1)
        os.remove(model_use)
        return logfile
Example #15
def get_latest_model_test():
    """Check if get_latest_model works."""
    model_folder = "/etc"
    basename = "model"
    nose.tools.assert_equal(utils.get_latest_model(model_folder, basename),
                            None)
    small = os.path.join(utils.get_project_root(), "models/small-baseline")
    utils.get_latest_model(small, basename)
Example #16
def execute_main_test():
    model_small = os.path.join(utils.get_project_root(),
                               "models",
                               "small-baseline")
    view.main(True, model_small, False, 31, False, 'mysql_online')
    view.main(False, model_small, False, 31, False, 'mysql_online')
    view.main(False, model_small, True, 31, False, 'mysql_online')
    view.main(False, model_small, False, 31, True, 'mysql_online')
Example #17
def _get_default_pickle():
    project_root = utils.get_project_root()
    raw_dir = os.path.join(project_root, "raw-datasets")
    models = filter(lambda n: n.endswith(".pickle"), os.listdir(raw_dir))
    models = natsort.natsorted(models, reverse=True)
    if len(models) == 0:
        return None
    else:
        return os.path.join(raw_dir, models[0])
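A short usage note: the helper returns the naturally sorted newest .pickle under raw-datasets/, or None when the folder holds none, so callers should handle both cases.

default_pickle = _get_default_pickle()
if default_pickle is None:
    print("No raw dataset found; 'hwrt download' can fetch one.")  # see Example #28
else:
    print("Using", default_pickle)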
Example #18
def get_latest_model_test():
    """Check if get_latest_model works."""
    model_folder = "/etc"
    basename = "model"
    nose.tools.assert_equal(utils.get_latest_model(model_folder, basename),
                            None)
    small = os.path.join(utils.get_project_root(),
                         "models/small-baseline")
    utils.get_latest_model(small, basename)
Example #19
def test_create_translation_file():
    """Test create_ffiles._create_translation_file."""
    feature_folder = os.path.join(utils.get_project_root(), "feature-files",
                                  "small-baseline")
    dataset_name = "testtestdata"
    translation = [(133700, "\\alpha", 42)]
    formula_id2index = {42: 1}
    create_ffiles._create_translation_file(feature_folder, dataset_name,
                                           translation, formula_id2index)
Example #20
def _get_default_pickle():
    project_root = utils.get_project_root()
    raw_dir = os.path.join(project_root, "raw-datasets")
    models = filter(lambda n: n.endswith(".pickle"), os.listdir(raw_dir))
    models = natsort.natsorted(models, reverse=True)
    if len(models) == 0:
        return None
    else:
        return os.path.join(raw_dir, models[0])
Example #21
def get_recognizer_folders_test():
    """Test if all folders are catched."""
    small = os.path.join(utils.get_project_root(), "models/small-baseline")
    folders = utils.get_recognizer_folders(small)
    wanted_folders = [
        'preprocessed/small-baseline', 'feature-files/small',
        'models/small-baseline'
    ]
    for folder, wanted_folder in zip(folders, wanted_folders):
        nose.tools.assert_equal(folder.endswith(wanted_folder), True)
Example #22
def main(model_folder, aset='test', n=3, merge=True):
    """Main part of the test script."""
    project_root = utils.get_project_root()

    if aset == 'test':
        key_model, key_file = 'testing', 'testdata'
    elif aset == 'valid':
        key_model, key_file = 'validating', 'validdata'
    else:
        key_model, key_file = 'training', 'traindata'

    # Get model description
    model_description_file = os.path.join(model_folder, "info.yml")
    # Read the model description file
    with open(model_description_file, 'r') as ymlfile:
        model_description = yaml.load(ymlfile)

    # Get the data paths (hdf5)
    project_root = utils.get_project_root()
    data = {}
    data['training'] = os.path.join(project_root,
                                    model_description["data-source"],
                                    "traindata.hdf5")
    data['testing'] = os.path.join(project_root,
                                   model_description["data-source"],
                                   "testdata.hdf5")
    data['validating'] = os.path.join(project_root,
                                      model_description["data-source"],
                                      "validdata.hdf5")

    test_data_path = os.path.join(model_folder, data[key_model])
    evaluation_file = get_test_results(model_folder,
                                       "model",
                                       test_data_path)
    translation_csv = os.path.join(project_root,
                                   model_description["data-source"],
                                   "index2formula_id.csv")
    what_evaluated_file = os.path.join(project_root,
                                       model_description["data-source"],
                                       "translation-%s.csv" % key_file)
    analyze_results(translation_csv, what_evaluated_file, evaluation_file, n,
                    merge)
Example #23
def test_execution():
    """Test if the functions execute at all."""
    utils.get_project_root()
    utils.get_latest_model(".", "model")
    utils.get_latest_working_model(".")
    utils.get_latest_successful_run(".")
    assert utils.get_readable_time(123) == "123ms"
    assert utils.get_readable_time(1000 * 30) == "30s 0ms"
    assert utils.get_readable_time(1000 * 60) == "1 minutes 0s 0ms"
    assert utils.get_readable_time(1000 * 60 * 60) == "1h, 0 minutes 0s 0ms"
    assert utils.get_readable_time(2 * 1000 * 60 *
                                   60) == "2h, 0 minutes 0s 0ms"
    assert utils.get_readable_time(25 * 1000 * 60 * 60 +
                                   3) == "25h, 0 minutes 0s 3ms"
    utils.print_status(3, 1, 123)
    utils.get_nntoolkit()
    utils.get_database_config_file()
    utils.get_database_configuration()
    assert utils.sizeof_fmt(1) == "1.0 bytes"
    assert utils.sizeof_fmt(1111) == "1.1 KB"
Example #24
def test_get_recognizer_folders():
    """Test if all folders are catched."""
    small = os.path.join(utils.get_project_root(), "models/small-baseline")
    folders = utils.get_recognizer_folders(small)
    wanted_folders = [
        "preprocessed/small-baseline",
        "feature-files/small",
        "models/small-baseline",
    ]
    for folder, wanted_folder in zip(folders, wanted_folders):
        assert folder.endswith(wanted_folder)
Example #25
def get_recognizer_folders_test():
    """Test if all folders are catched."""
    small = os.path.join(utils.get_project_root(),
                         "models/small-baseline")
    folders = utils.get_recognizer_folders(small)
    wanted_folders = ['preprocessed/small-baseline',
                      'feature-files/small',
                      'models/small-baseline']
    for folder, wanted_folder in zip(folders, wanted_folders):
        nose.tools.assert_equal(folder.endswith(wanted_folder),
                                True)
Example #26
def create_translation_file_test():
    """Test create_ffiles._create_translation_file."""
    feature_folder = os.path.join(utils.get_project_root(),
                                  "feature-files",
                                  "small-baseline")
    dataset_name = "testtestdata"
    translation = [(133700, '\\alpha', 42)]
    formula_id2index = {42: 1}
    create_ffiles._create_translation_file(feature_folder,
                                           dataset_name,
                                           translation,
                                           formula_id2index)
Example #27
def main(model_folder, aset='test', n=3, merge=True):
    """Main part of the test script."""
    project_root = utils.get_project_root()

    if aset == 'test':
        key_model, key_file = 'testing', 'testdata'
    elif aset == 'valid':
        key_model, key_file = 'validating', 'validdata'
    else:
        key_model, key_file = 'training', 'traindata'

    # Get model description
    model_description_file = os.path.join(model_folder, "info.yml")
    # Read the model description file
    with open(model_description_file, 'r') as ymlfile:
        model_description = yaml.load(ymlfile)

    # Get the data paths (hdf5)
    project_root = utils.get_project_root()
    data = {}
    data['training'] = os.path.join(project_root,
                                    model_description["data-source"],
                                    "traindata.hdf5")
    data['testing'] = os.path.join(project_root,
                                   model_description["data-source"],
                                   "testdata.hdf5")
    data['validating'] = os.path.join(project_root,
                                      model_description["data-source"],
                                      "validdata.hdf5")

    test_data_path = os.path.join(model_folder, data[key_model])
    evaluation_file = get_test_results(model_folder, "model", test_data_path)
    translation_csv = os.path.join(project_root,
                                   model_description["data-source"],
                                   "index2formula_id.csv")
    what_evaluated_file = os.path.join(project_root,
                                       model_description["data-source"],
                                       "translation-%s.csv" % key_file)
    analyze_results(translation_csv, what_evaluated_file, evaluation_file, n,
                    merge)
Example #28
def update_if_outdated(folder):
    """Check if the currently watched instance (model, feature or
        preprocessing) is outdated and update it eventually.
    """

    folders = []
    while os.path.isdir(folder):
        folders.append(folder)
        # Get info.yml
        with open(os.path.join(folder, "info.yml")) as ymlfile:
            content = yaml.load(ymlfile)
        folder = os.path.join(utils.get_project_root(), content['data-source'])
    raw_source_file = folder
    if not os.path.isfile(raw_source_file):
        logging.error("File '%s' was not found.", raw_source_file)
        logging.error("You should eventually execute 'hwrt download'.")
        sys.exit(-1)
    dt = os.path.getmtime(raw_source_file)
    source_mtime = datetime.datetime.utcfromtimestamp(dt)
    folders = folders[::-1]  # Reverse order to get the most "basic" one first

    for target_folder in folders:
        target_mtime = utils.get_latest_successful_run(target_folder)
        if target_mtime is None or source_mtime > target_mtime:
            # The source is later than the target. That means we need to
            # refresh the target
            if "preprocessed" in target_folder:
                logging.info("Preprocessed file was outdated. Update...")
                preprocess_dataset.main(
                    os.path.join(utils.get_project_root(), target_folder))
            elif "feature-files" in target_folder:
                logging.info("Feature file was outdated. Update...")
                create_ffiles.main(target_folder)
            elif "model" in target_folder:
                logging.info("Model file was outdated. Update...")
                create_model.main(target_folder, True)
            target_mtime = datetime.datetime.utcnow()
        else:
            logging.info("'%s' is up-to-date.", target_folder)
        source_mtime = target_mtime
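The chain walk above depends on each folder's info.yml naming its data-source; a minimal sketch of one hop (the folder and the resulting value are illustrative):

with open(os.path.join("models/small-baseline", "info.yml")) as ymlfile:
    content = yaml.load(ymlfile)
# e.g. content["data-source"] == "feature-files/small-baseline"  (illustrative)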
Example #29
def update_if_outdated(folder):
    """Check if the currently watched instance (model, feature or
        preprocessing) is outdated and update it eventually.
    """

    folders = []
    while os.path.isdir(folder):
        folders.append(folder)
        # Get info.yml
        with open(os.path.join(folder, "info.yml")) as ymlfile:
            content = yaml.load(ymlfile)
        folder = os.path.join(utils.get_project_root(), content['data-source'])
    raw_source_file = folder
    if not os.path.isfile(raw_source_file):
        logging.error("File '%s' was not found.", raw_source_file)
        logging.error("You should eventually execute 'hwrt download'.")
        sys.exit(-1)
    dt = os.path.getmtime(raw_source_file)
    source_mtime = datetime.datetime.utcfromtimestamp(dt)
    folders = folders[::-1]  # Reverse order to get the most "basic" one first

    for target_folder in folders:
        target_mtime = utils.get_latest_successful_run(target_folder)
        if target_mtime is None or source_mtime > target_mtime:
            # The source is later than the target. That means we need to
            # refresh the target
            if "preprocessed" in target_folder:
                logging.info("Preprocessed file was outdated. Update...")
                preprocess_dataset.main(os.path.join(utils.get_project_root(),
                                                     target_folder))
            elif "feature-files" in target_folder:
                logging.info("Feature file was outdated. Update...")
                create_ffiles.main(target_folder)
            elif "model" in target_folder:
                logging.info("Model file was outdated. Update...")
                create_model.main(target_folder, True)
            target_mtime = datetime.datetime.utcnow()
        else:
            logging.info("'%s' is up-to-date.", target_folder)
        source_mtime = target_mtime
Example #30
def execution_test():
    """Test if the functions execute at all."""
    utils.get_project_root()
    utils.get_latest_model(".", "model")
    utils.get_latest_working_model(".")
    utils.get_latest_successful_run(".")
    nose.tools.assert_equal(utils.get_readable_time(123), "123ms")
    nose.tools.assert_equal(utils.get_readable_time(1000*30),
                            "30s 0ms")
    nose.tools.assert_equal(utils.get_readable_time(1000*60),
                            "1 minutes 0s 0ms")
    nose.tools.assert_equal(utils.get_readable_time(1000*60*60),
                            "1h, 0 minutes 0s 0ms")
    nose.tools.assert_equal(utils.get_readable_time(2*1000*60*60),
                            "2h, 0 minutes 0s 0ms")
    nose.tools.assert_equal(utils.get_readable_time(25*1000*60*60+3),
                            "25h, 0 minutes 0s 3ms")
    utils.print_status(3, 1, 123)
    utils.get_nntoolkit()
    utils.get_database_config_file()
    utils.get_database_configuration()
    nose.tools.assert_equal(utils.sizeof_fmt(1), "1.0 bytes")
    nose.tools.assert_equal(utils.sizeof_fmt(1111), "1.1 KB")
Example #31
def generate_training_command(model_folder):
    """Generate a string that contains a command with all necessary
       parameters to train the model."""
    update_if_outdated(model_folder)
    model_description_file = os.path.join(model_folder, "info.yml")
    # Read the model description file
    with open(model_description_file, 'r') as ymlfile:
        model_description = yaml.load(ymlfile)

    # Get the data paths (hdf5 files)
    project_root = utils.get_project_root()
    data = {}
    data['training'] = os.path.join(project_root,
                                    model_description["data-source"],
                                    "traindata.hdf5")
    data['testing'] = os.path.join(project_root,
                                   model_description["data-source"],
                                   "testdata.hdf5")
    data['validating'] = os.path.join(project_root,
                                      model_description["data-source"],
                                      "validdata.hdf5")

    # Get latest model file
    basename = "model"
    latest_model = utils.get_latest_working_model(model_folder)

    if latest_model == "":
        logging.error("There is no model with basename '%s'.", basename)
        return None
    else:
        logging.info("Model '%s' found.", latest_model)
        i = int(latest_model.split("-")[-1].split(".")[0])
        model_src = os.path.join(model_folder, "%s-%i.json" % (basename, i))
        model_target = os.path.join(model_folder,
                                    "%s-%i.json" % (basename, i + 1))

    # generate the training command
    training = model_description['training']
    training = training.replace("{{testing}}", data['testing'])
    training = training.replace("{{training}}", data['training'])
    training = training.replace("{{validation}}", data['validating'])
    training = training.replace("{{src_model}}", model_src)
    training = training.replace("{{target_model}}", model_target)
    training = training.replace("{{nntoolkit}}", utils.get_nntoolkit())
    return training
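The function returns a shell command string (or None). One way it might be run, under the assumption that the rendered command is intended for a shell:

import subprocess

model_folder = os.path.join(utils.get_project_root(), "models/small-baseline")
training = generate_training_command(model_folder)
if training is not None:
    # Execute the rendered nntoolkit training command.
    subprocess.run(training, shell=True, check=True)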
Example #32
def generate_training_command(model_folder):
    """Generate a string that contains a command with all necessary
       parameters to train the model."""
    update_if_outdated(model_folder)
    model_description_file = os.path.join(model_folder, "info.yml")
    # Read the model description file
    with open(model_description_file, 'r') as ymlfile:
        model_description = yaml.load(ymlfile)

    # Get the data paths (hdf5 files)
    project_root = utils.get_project_root()
    data = {}
    data['training'] = os.path.join(project_root,
                                    model_description["data-source"],
                                    "traindata.hdf5")
    data['testing'] = os.path.join(project_root,
                                   model_description["data-source"],
                                   "testdata.hdf5")
    data['validating'] = os.path.join(project_root,
                                      model_description["data-source"],
                                      "validdata.hdf5")

    # Get latest model file
    basename = "model"
    latest_model = utils.get_latest_working_model(model_folder)

    if latest_model == "":
        logging.error("There is no model with basename '%s'.", basename)
        return None
    else:
        logging.info("Model '%s' found.", latest_model)
        i = int(latest_model.split("-")[-1].split(".")[0])
        model_src = os.path.join(model_folder, "%s-%i.json" % (basename, i))
        model_target = os.path.join(model_folder,
                                    "%s-%i.json" % (basename, i+1))

    # generate the training command
    training = model_description['training']
    training = training.replace("{{testing}}", data['testing'])
    training = training.replace("{{training}}", data['training'])
    training = training.replace("{{validation}}", data['validating'])
    training = training.replace("{{src_model}}", model_src)
    training = training.replace("{{target_model}}", model_target)
    training = training.replace("{{nntoolkit}}", utils.get_nntoolkit())
    return training
Example #33
def generate_training_command(model_folder):
    """Generate a string that contains a command with all necessary
    parameters to train the model."""
    update_if_outdated(model_folder)
    model_description_file = os.path.join(model_folder, "info.yml")
    # Read the model description file
    with open(model_description_file) as ymlfile:
        model_description = yaml.safe_load(ymlfile)

    # Get the data paths (hdf5 files)
    project_root = utils.get_project_root()
    data = {}
    data["training"] = os.path.join(project_root,
                                    model_description["data-source"],
                                    "traindata.hdf5")
    data["testing"] = os.path.join(project_root,
                                   model_description["data-source"],
                                   "testdata.hdf5")
    data["validating"] = os.path.join(project_root,
                                      model_description["data-source"],
                                      "validdata.hdf5")

    # Get latest model file
    basename = "model"
    latest_model = utils.get_latest_working_model(model_folder)

    if latest_model == "":
        logger.error(
            f"There is no model with basename '{basename}' in {model_folder}")
        return None

    logger.info(f"Model '{latest_model}' found.")
    i = int(latest_model.split("-")[-1].split(".")[0])
    model_src = os.path.join(model_folder, f"{basename}-{i}.json")
    model_target = os.path.join(model_folder, f"{basename}-{i + 1}.json")

    # generate the training command
    training = model_description["training"]
    training = training.replace("{{testing}}", data["testing"])
    training = training.replace("{{training}}", data["training"])
    training = training.replace("{{validation}}", data["validating"])
    training = training.replace("{{src_model}}", model_src)
    training = training.replace("{{target_model}}", model_target)
    training = training.replace("{{nntoolkit}}", utils.get_nntoolkit())
    return training
Example #34
def get_parser():
    """Return the parser object for this script."""
    project_root = utils.get_project_root()
    archive_path = os.path.join(project_root, "raw-datasets")
    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
    parser = ArgumentParser(description=__doc__,
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument("-d", "--destination", dest="destination",
                        default=archive_path,
                        help="where to write the handwriting_dataset.pickle",
                        type=lambda x: utils.is_valid_folder(parser, x),
                        metavar="FOLDER")
    parser.add_argument("--dataset", dest="dataset",
                        default='all',
                        help=("of which symbols do you want the recordings?"))
    parser.add_argument("-r", "--renderings", dest="renderings",
                        action="store_true", default=False,
                        help=("should the svg renderings be downloaded?"))
    parser.add_argument("--dropbox", dest="dropbox",
                        action="store_true", default=False,
                        help=("upload to new files to DropBox"))
    return parser
Example #35
def sync_directory(directory):
    """Sync a directory. Return if syncing was successful."""
    # Developers should read
    # https://www.dropbox.com/developers/core/start/python
    # before modifying the following code
    cfg = utils.get_project_configuration()

    # Information about files in this folder
    project_root = utils.get_project_root()
    directory_information_file = os.path.join(project_root, directory,
                                              "info.yml")
    if not os.path.isfile(directory_information_file):  # create if not exists
        with open(directory_information_file, 'w') as ymlfile:
            ymlfile.write(yaml.dump([]))

    # Dropbox stuff
    APP_KEY = cfg['dropbox_app_key']
    APP_SECRET = cfg['dropbox_app_secret']

    flow = dropbox.client.DropboxOAuth2FlowNoRedirect(APP_KEY, APP_SECRET)
    authorize_url = flow.start()
    webbrowser.open_new_tab(authorize_url)
    print("1. Go to: " + authorize_url)
    print("2. Click 'Allow' (you might have to log in first)")
    print("3. Copy the authorization code.")
    access_token = input_string().strip()

    try:
        # This will fail if the user enters an invalid authorization code
        access_token, user_id = flow.finish(access_token)
        client = dropbox.client.DropboxClient(access_token)
    except Exception as e:
        logging.error("Dropbox connection error: %s", e)
        return False

    # Get all local files
    local_path = os.path.join(project_root, directory)
    files = [
        f for f in os.listdir(local_path)
        if os.path.isfile(os.path.join(local_path, f))
    ]
    files = filter(lambda n: n.endswith(".pickle"), files)

    new_yaml_content = []

    # upload them
    for filename in files:
        file_meta = {}
        file_meta['filename'] = filename
        file_meta['online_path'] = os.path.join(directory, filename)
        local_path_file = os.path.join(local_path, filename)
        file_meta['md5'] = hashlib.md5(open(local_path_file,
                                            'rb').read()).hexdigest()
        new_yaml_content.append(file_meta)
        file_meta['url'] = dropbox_upload(filename, directory, client)
        if not file_meta['url']:
            return False

    # TODO: Remove all files from Dropbox that are not in local folder

    # Update YAML file
    with open(directory_information_file, 'w') as ymlfile:
        ymlfile.write(yaml.dump(new_yaml_content, default_flow_style=False))

    return True
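A hedged invocation sketch: directory is resolved relative to the project root, and the boolean return value signals success.

if sync_directory("raw-datasets"):
    logging.info("Sync finished.")
else:
    logging.error("Sync failed.")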
Example #36
def main(destination=os.path.join(utils.get_project_root(), "raw-datasets"),
         dataset='all',
         renderings=False):
    """Main part of the backup script."""
    time_prefix = time.strftime("%Y-%m-%d-%H-%M")
    filename = ("%s-handwriting_datasets-%s-raw.pickle" %
                (time_prefix, dataset.replace('/', '-')))
    destination_path = os.path.join(destination, filename)
    logging.info("Data will be written to '%s'", destination_path)

    cfg = utils.get_database_configuration()
    mysql = cfg['mysql_online']
    connection = pymysql.connect(host=mysql['host'],
                                 user=mysql['user'],
                                 passwd=mysql['passwd'],
                                 db=mysql['db'],
                                 cursorclass=pymysql.cursors.DictCursor)
    cursor = connection.cursor()

    formulas = get_formulas(cursor, dataset)
    logging.info('Received %i formulas.', len(formulas))
    handwriting_datasets = []
    formula_id2latex = {}

    # Go through each formula and download every raw_data instance
    for formula in formulas:
        formula_id2latex[formula['id']] = formula['formula_in_latex']
        sql = ((
            "SELECT `wm_raw_draw_data`.`id`, `data`, `is_in_testset`, "
            "`wild_point_count`, `missing_line`, `user_id`, "
            "`display_name` "
            "FROM `wm_raw_draw_data` "
            "JOIN `wm_users` ON "
            "(`wm_users`.`id` = `wm_raw_draw_data`.`user_id`) "
            "WHERE `accepted_formula_id` = %s "
            # "AND `display_name` LIKE 'MfrDB::%%'"
        ) % str(formula['id']))
        cursor.execute(sql)
        raw_datasets = cursor.fetchall()
        logging.info("%s (%i)", formula['formula_in_latex'], len(raw_datasets))
        for raw_data in raw_datasets:
            try:
                handwriting = HandwrittenData(
                    raw_data['data'],
                    formula['id'],
                    raw_data['id'],
                    formula['formula_in_latex'],
                    raw_data['wild_point_count'],
                    raw_data['missing_line'],
                    raw_data['user_id'],
                    user_name=raw_data['display_name'])
                handwriting_datasets.append({
                    'handwriting': handwriting,
                    'id': raw_data['id'],
                    'formula_id': formula['id'],
                    'formula_in_latex': formula['formula_in_latex'],
                    'is_in_testset': raw_data['is_in_testset']
                })
            except Exception as e:
                logging.info("Raw data id: %s", raw_data['id'])
                logging.info(e)
    pickle.dump(
        {
            'handwriting_datasets': handwriting_datasets,
            'formula_id2latex': formula_id2latex
        }, open(destination_path, "wb"), 2)

    if renderings:
        logging.info("Start downloading SVG renderings...")
        svgfolder = tempfile.mkdtemp()
        sql = """SELECT t1.formula_id, t1.svg from wm_renderings t1
                 LEFT JOIN wm_renderings t2 ON t1.formula_id = t2.formula_id
                 AND t1.creation_time < t2.creation_time
                 WHERE t2.id is null"""
        cursor.execute(sql)
        formulas = cursor.fetchall()
        logging.info("Create svg...")
        for formula in formulas:
            filename = os.path.join(svgfolder,
                                    "%s.svg" % str(formula['formula_id']))
            with open(filename, 'wb') as temp_file:
                temp_file.write(formula['svg'])
        logging.info("Tar at %s", os.path.abspath("renderings.tar"))

        tar = tarfile.open("renderings.tar.bz2", "w:bz2")
        for fn in os.listdir(svgfolder):
            filename = os.path.join(svgfolder, fn)
            if os.path.isfile(filename):
                print(filename)
                tar.add(filename, arcname=os.path.basename(filename))
        tar.close()
Example #37
def sync_directory(directory):
    """Sync a directory. Return if syncing was successful."""
    # Developers should read
    # https://www.dropbox.com/developers/core/start/python
    # before modifying the following code
    cfg = utils.get_project_configuration()

    # Information about files in this folder
    project_root = utils.get_project_root()
    directory_information_file = os.path.join(project_root,
                                              directory, "info.yml")
    if not os.path.isfile(directory_information_file):  # create if not exists
        with open(directory_information_file, 'w') as ymlfile:
            ymlfile.write(yaml.dump([]))

    # Dropbox stuff
    APP_KEY = cfg['dropbox_app_key']
    APP_SECRET = cfg['dropbox_app_secret']

    flow = dropbox.client.DropboxOAuth2FlowNoRedirect(APP_KEY, APP_SECRET)
    authorize_url = flow.start()
    webbrowser.open_new_tab(authorize_url)
    print("1. Go to: " + authorize_url)
    print("2. Click 'Allow' (you might have to log in first)")
    print("3. Copy the authorization code.")
    access_token = input_string().strip()

    try:
        # This will fail if the user enters an invalid authorization code
        access_token, user_id = flow.finish(access_token)
        client = dropbox.client.DropboxClient(access_token)
    except Exception as e:
        logging.error("Dropbox connection error: %s", e)
        return False

    # Get all local files
    local_path = os.path.join(project_root, directory)
    files = [f for f in os.listdir(local_path)
             if os.path.isfile(os.path.join(local_path, f))]
    files = filter(lambda n: n.endswith(".pickle"), files)

    new_yaml_content = []

    # upload them
    for filename in files:
        file_meta = {}
        file_meta['filename'] = filename
        file_meta['online_path'] = os.path.join(directory, filename)
        local_path_file = os.path.join(local_path, filename)
        file_meta['md5'] = hashlib.md5(open(local_path_file,
                                            'rb').read()).hexdigest()
        new_yaml_content.append(file_meta)
        file_meta['url'] = dropbox_upload(filename, directory, client)
        if not file_meta['url']:
            return False

    # TODO: Remove all files from Dropbox that are not in local folder

    # Update YAML file
    with open(directory_information_file, 'w') as ymlfile:
        ymlfile.write(yaml.dump(new_yaml_content, default_flow_style=False))

    return True
Example #38
def main(model_folder):
    """
    Parameters
    ----------
    model_folder : str
        Path to a folder in which a model (json file) is.
    """
    a = yaml.load(open(utils.get_latest_in_folder(model_folder, ".json")))

    layers = []
    filenames = ["model.yml", "input_semantics.csv", "output_semantics.csv",
                 "preprocessing.yml", "features.yml"]

    # Create input_semantics.csv
    inputs = a['layers'][0]['_props']['n_visible']
    with open('input_semantics.csv', 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=';',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for row in range(inputs):
            spamwriter.writerow(["inputs %i" % row])

    # Create output_semantics.csv
    outputs = a['layers'][-1]['_props']['n_hidden']
    create_output_semantics(model_folder, outputs)

    # Write layers
    for layer_index, layer in enumerate(a['layers']):
        W = _as_ndarray(layer['params']['W'])
        Wfile = h5py.File('W%i.hdf5' % layer_index, 'w')
        Wfile.create_dataset(Wfile.id.name, data=W)
        Wfile.close()

        b = _as_ndarray(layer['params']['b'])
        bfile = h5py.File('b%i.hdf5' % layer_index, 'w')
        bfile.create_dataset(bfile.id.name, data=b)
        bfile.close()

        activation = layer['_props']['activation']
        activation = activation.replace('sigmoid', 'Sigmoid')
        activation = activation.replace('softmax', 'Softmax')
        layers.append({'W': {'size': list(W.shape),
                             'filename': 'W%i.hdf5' % layer_index},
                       'b': {'size': list(b.shape),
                             'filename': 'b%i.hdf5' % layer_index},
                       'activation': activation})
        filenames.append('W%i.hdf5' % layer_index)
        filenames.append('b%i.hdf5' % layer_index)

    model = {'type': 'mlp', 'layers': layers}

    with open("model.yml", 'w') as f:
        yaml.dump(model, f, default_flow_style=False)

    logging.info("Get preprocessing.yml")
    # Get model folder
    model_description_file = os.path.join(model_folder, "info.yml")
    with open(model_description_file, 'r') as ymlfile:
        model_description = yaml.load(ymlfile)

    # Get feature folder
    feature_description_file = os.path.join(utils.get_project_root(),
                                            model_description["data-source"],
                                            "info.yml")
    with open(feature_description_file, 'r') as ymlfile:
        feature_description = yaml.load(ymlfile)

    with open("features.yml", 'w') as f:
        yaml.dump(feature_description, f, default_flow_style=False)

    # Get preprocessing folder
    preprocessing_description_file = os.path.join(utils.get_project_root(),
                                                  feature_description["data-source"],
                                                  "info.yml")
    with open(preprocessing_description_file, 'r') as ymlfile:
        preprocessing_description = yaml.load(ymlfile)

    with open("preprocessing.yml", 'w') as f:
        yaml.dump(preprocessing_description, f, default_flow_style=False)

    with tarfile.open("model.tar", "w:") as tar:
        for name in filenames:
            tar.add(name)

    # Remove temporary files which are now in tar file
    for filename in filenames:
        os.remove(filename)
Example #39
def main(model_folder):
    """
    Parameters
    ----------
    model_folder : str
        Path to a folder in which a model (json file) is.
    """
    with open(utils.get_latest_in_folder(model_folder, ".json")) as fp:
        a = yaml.safe_load(fp)

    layers = []
    filenames = [
        "model.yml",
        "input_semantics.csv",
        "output_semantics.csv",
        "preprocessing.yml",
        "features.yml",
    ]

    # Create input_semantics.csv
    inputs = a["layers"][0]["_props"]["n_visible"]
    with open("input_semantics.csv", "wb") as csvfile:
        spamwriter = csv.writer(
            csvfile, delimiter=";", quotechar="|", quoting=csv.QUOTE_MINIMAL
        )
        for row in range(inputs):
            spamwriter.writerow(["inputs %i" % row])

    # Create output_semantics.csv
    outputs = a["layers"][-1]["_props"]["n_hidden"]
    create_output_semantics(model_folder, outputs)

    # Write layers
    for layer_index, layer in enumerate(a["layers"]):
        w = _as_ndarray(layer["params"]["W"])
        wfile = h5py.File(f"W{layer_index}.hdf5", "w")
        wfile.create_dataset(wfile.id.name, data=w)
        wfile.close()

        b = _as_ndarray(layer["params"]["b"])
        bfile = h5py.File(f"b{layer_index}.hdf5", "w")
        bfile.create_dataset(bfile.id.name, data=b)
        bfile.close()

        activation = layer["_props"]["activation"]
        activation = activation.replace("sigmoid", "Sigmoid")
        activation = activation.replace("softmax", "Softmax")
        layers.append(
            {
                "W": {"size": list(w.shape), "filename": "W%i.hdf5" % layer_index},
                "b": {"size": list(b.shape), "filename": "b%i.hdf5" % layer_index},
                "activation": activation,
            }
        )
        filenames.append(f"W{layer_index}.hdf5")
        filenames.append(f"b{layer_index}.hdf5")

    model = {"type": "mlp", "layers": layers}

    with open("model.yml", "w") as f:
        yaml.dump(model, f, default_flow_style=False)

    logging.info("Get preprocessing.yml")
    # Get model folder
    model_description_file = os.path.join(model_folder, "info.yml")
    with open(model_description_file) as ymlfile:
        model_description = yaml.safe_load(ymlfile)

    # Get feature folder
    feature_description_file = os.path.join(
        utils.get_project_root(), model_description["data-source"], "info.yml"
    )
    with open(feature_description_file) as ymlfile:
        feature_description = yaml.safe_load(ymlfile)

    with open("features.yml", "w") as f:
        yaml.dump(feature_description, f, default_flow_style=False)

    # Get preprocessing folder
    preprocessing_description_file = os.path.join(
        utils.get_project_root(), feature_description["data-source"], "info.yml"
    )
    with open(preprocessing_description_file) as ymlfile:
        preprocessing_description = yaml.safe_load(ymlfile)

    with open("preprocessing.yml", "w") as f:
        yaml.dump(preprocessing_description, f, default_flow_style=False)

    with tarfile.open("model.tar", "w:") as tar:
        for name in filenames:
            tar.add(name)

    # Remove temporary files which are now in tar file
    for filename in filenames:
        os.remove(filename)
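To sanity-check the archive written above, the standard library suffices (a quick sketch; the member names follow the filenames list built in the example):

import tarfile

with tarfile.open("model.tar") as tar:
    # Expect model.yml, input_semantics.csv, output_semantics.csv,
    # preprocessing.yml, features.yml and one W<i>.hdf5/b<i>.hdf5 pair per layer.
    print(tar.getnames())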
Example #40
def create_report(true_data, eval_data, index2latex, n, merge=True):
    """
    Parameters
    ----------
    true_data : list
        Labels
    eval_data : list
        Predicted labels
    index2latex : dict
        Maps the output neurons index to LaTeX
    n : int
        Number of top predictions within which the true symbol counts as
        correctly classified.
    merge : bool
        If set to True, some symbols like \sum and \Sigma will not be counted
        as errors when confused.
    """
    # Gather data
    correct = []
    wrong = []
    # Get MER classes
    merge_cfg_path = pkg_resources.resource_filename('hwrt', 'misc/')
    merge_cfg_file = os.path.join(merge_cfg_path, "merge.yml")
    merge_data = yaml.load(open(merge_cfg_file, 'r'))
    # Make classes
    confusing = make_all(merge_data)
    if not merge:
        confusing = []

    # Get false/true negative/positive for each symbol
    statistical = {}
    possible_keys = []

    assert len(true_data) > 0, "true_data was empty"
    assert len(true_data) == len(eval_data), \
        ("len(true_data)=%i, len(eval_data)=%i" %
         (len(true_data), len(eval_data)))
    for known, evaluated in zip(true_data, eval_data):
        evaluated_t1 = evaluated.keys()[0]
        if known['index'] not in statistical:
            statistical[known['index']] = {
                'FP': 0,
                'TP': 0,
                'FN': 0,
                'TN': 0,
                'latex': index2latex[known['index']]
            }
            possible_keys.append(known['index'])
        for key in evaluated.keys():
            if key not in statistical:
                if key not in index2latex:
                    logging.error(
                        "Key '%s' is not in index2latex. Did you "
                        "perhaps define too small a number of "
                        "output nodes?", str(key))
                    logging.error("index2latex.keys(): %s",
                                  str(index2latex.keys()))
                    sys.exit(-1)
                statistical[key] = {
                    'FP': 0,
                    'TP': 0,
                    'FN': 0,
                    'TN': 0,
                    'latex': index2latex[key]
                }
                possible_keys.append(key)
        if known['index'] in evaluated.keys()[:n]:
            statistical[known['index']]['TP'] += 1
            correct.append(known)
            for key in possible_keys:
                if key != known['index']:
                    statistical[key]['TN'] += 1
        elif (index2latex[known['index']],
              index2latex[evaluated_t1]) in confusing:
            # Some confusions are ok!
            statistical[known['index']]['TP'] += 1
            correct.append(known)
            for key in possible_keys:
                if key != known['index']:
                    statistical[key]['TN'] += 1
        else:
            for key in possible_keys:
                if key != known['index']:
                    if key not in evaluated.keys()[:n]:
                        statistical[key]['TN'] += 1
                    else:
                        statistical[key]['FP'] += 1
                else:
                    statistical[key]['FN'] += 1
            formula_id = index2latex[evaluated_t1]
            known['confused'] = formula_id  # That's an index!
            wrong.append(known)
    classification_error = (len(wrong) / float(len(wrong) + len(correct)))
    logging.info("Classification error (n=%i, MER=%r): %0.4f (%i of %i wrong)",
                 n, merge, classification_error, len(wrong), len(eval_data))

    # Get the data
    errors_by_correct_classification = DefaultOrderedDict(list)
    errors_by_wrong_classification = DefaultOrderedDict(list)
    for el in wrong:
        errors_by_correct_classification[el['latex']].append(el)
        errors_by_wrong_classification[el['confused']].append(el)

    # Sort errors_by_correct_classification
    tmp = sorted(errors_by_correct_classification.iteritems(),
                 key=lambda n: len(n[1]),
                 reverse=True)
    errors_by_correct_classification = OrderedDict(tmp)
    for key in errors_by_correct_classification:
        tmp = sorted(errors_by_correct_classification[key],
                     key=lambda n: n['confused'])
        errors_by_correct_classification[key] = tmp
    # Sort errors_by_wrong_classification
    tmp = sorted(errors_by_wrong_classification.iteritems(),
                 key=lambda n: len(n[1]),
                 reverse=True)
    errors_by_wrong_classification = OrderedDict(tmp)
    for key in errors_by_wrong_classification:
        tmp = sorted(errors_by_wrong_classification[key],
                     key=lambda n: n['latex'])
        errors_by_wrong_classification[key] = tmp

    # Get the template
    project_root = utils.get_project_root()
    template_path = pkg_resources.resource_filename('hwrt', 'templates/')
    template = os.path.join(template_path, "classification-error-report.html")
    with open(template) as f:
        template = f.read()

    # Find right place for report file
    time_prefix = time.strftime("%Y-%m-%d-%H-%M")
    directory = os.path.join(project_root, "reports")
    if not os.path.exists(directory):
        os.makedirs(directory)
    target = os.path.join(
        project_root, ("reports/"
                       "%s-classification-error-report.html") % time_prefix)
    # Fill the template
    from jinja2 import FileSystemLoader
    from jinja2.environment import Environment
    env = Environment()
    env.loader = FileSystemLoader(template_path)
    t = env.get_template('classification-error-report.html')
    rendered = t.render(
        wrong=wrong,
        correct=correct,
        classification_error=classification_error,
        errors_by_correct_classification=errors_by_correct_classification,
        errors_by_wrong_classification=errors_by_wrong_classification,
        statistical=statistical)
    with open(target, "w") as f:
        f.write(rendered)
Example #41
def create_report(true_data, eval_data, index2latex, n, merge=True):
    r"""
    Parameters
    ----------
    true_data : list
        Labels
    eval_data : list
        Predicted labels
    index2latex : dict
        Maps the output neurons index to LaTeX
    n : int
        Number of top predictions within which the true symbol counts as
        correctly classified.
    merge : bool
        If set to True, some symbols like \sum and \Sigma will not be counted
        as errors when confused.
    """
    # Gather data
    correct = []
    wrong = []
    # Get MER classes
    merge_cfg_path = pkg_resources.resource_filename(__name__, "misc/")
    merge_cfg_file = os.path.join(merge_cfg_path, "merge.yml")
    with open(merge_cfg_file) as fp:
        merge_data = yaml.safe_load(fp)
    # Make classes
    confusing = make_all(merge_data)
    if not merge:
        confusing = []

    # Get false/true negative/positive for each symbol
    statistical = {}
    possible_keys = []

    assert len(true_data) > 0, "true_data was empty"
    assert len(true_data) == len(
        eval_data), "len(true_data)=%i, len(eval_data)=%i" % (
            len(true_data),
            len(eval_data),
        )
    for known, evaluated in zip(true_data, eval_data):
        evaluated_t1 = list(evaluated.keys())[0]
        if known["index"] not in statistical:
            statistical[known["index"]] = {
                "FP": 0,
                "TP": 0,
                "FN": 0,
                "TN": 0,
                "latex": index2latex[known["index"]],
            }
            possible_keys.append(known["index"])
        for key in list(evaluated.keys()):
            if key not in statistical:
                if key not in index2latex:
                    logger.error(f"Key '{key}' is not in index2latex. Did you "
                                 "probaly define a too small number of "
                                 "outputnodes?")
                    logger.error(f"index2latex.keys(): {index2latex.keys()}")
                    sys.exit(-1)
                statistical[key] = {
                    "FP": 0,
                    "TP": 0,
                    "FN": 0,
                    "TN": 0,
                    "latex": index2latex[key],
                }
                possible_keys.append(key)
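        # Top-n accounting: a sample counts as correct (TP for the true
        # class, TN for every other class) if its true index appears among
        # the first n predicted keys, or if the confusion is whitelisted by
        # the MER merge rules; otherwise the true class gets an FN, every
        # other class inside the top n an FP, and the remaining classes a TN.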
        if known["index"] in list(evaluated.keys())[:n]:
            statistical[known["index"]]["TP"] += 1
            correct.append(known)
            for key in possible_keys:
                if key != known["index"]:
                    statistical[key]["TN"] += 1
        elif (index2latex[known["index"]],
              index2latex[evaluated_t1]) in confusing:
            # Some confusions are ok!
            statistical[known["index"]]["TP"] += 1
            correct.append(known)
            for key in possible_keys:
                if key != known["index"]:
                    statistical[key]["TN"] += 1
        else:
            for key in possible_keys:
                if key != known["index"]:
                    if key not in list(evaluated.keys())[:n]:
                        statistical[key]["TN"] += 1
                    else:
                        statistical[key]["FP"] += 1
                else:
                    statistical[key]["FN"] += 1
            formula_id = index2latex[evaluated_t1]
            known["confused"] = formula_id  # That's an index!
            wrong.append(known)
    classification_error = len(wrong) / float(len(wrong) + len(correct))
    logger.info(
        f"Classification error (n={n}, MER={merge}): "
        f"{classification_error:0.4f} ({len(wrong)} of {len(eval_data)} wrong)",
    )

    # Get the data
    errors_by_correct_classification = DefaultOrderedDict(list)
    errors_by_wrong_classification = DefaultOrderedDict(list)
    for el in wrong:
        errors_by_correct_classification[el["latex"]].append(el)
        errors_by_wrong_classification[el["confused"]].append(el)

    # Sort errors_by_correct_classification
    tmp = sorted(
        iter(errors_by_correct_classification.items()),
        key=lambda n: len(n[1]),
        reverse=True,
    )
    errors_by_correct_classification = OrderedDict(tmp)
    for key in errors_by_correct_classification:
        tmp = sorted(errors_by_correct_classification[key],
                     key=lambda n: n["confused"])
        errors_by_correct_classification[key] = tmp
    # Sort errors_by_wrong_classification
    tmp = sorted(
        iter(errors_by_wrong_classification.items()),
        key=lambda n: len(n[1]),
        reverse=True,
    )
    errors_by_wrong_classification = OrderedDict(tmp)
    for key in errors_by_wrong_classification:
        tmp = sorted(errors_by_wrong_classification[key],
                     key=lambda n: n["latex"])
        errors_by_wrong_classification[key] = tmp

    # Get the template
    project_root = utils.get_project_root()
    template_path = pkg_resources.resource_filename("hwrt", "templates/")
    template = os.path.join(template_path, "classification-error-report.html")
    with open(template) as f:
        template = f.read()

    # Find right place for report file
    time_prefix = time.strftime("%Y-%m-%d-%H-%M")
    directory = os.path.join(project_root, "reports")
    if not os.path.exists(directory):
        os.makedirs(directory)
    target = os.path.join(
        project_root,
        f"reports/{time_prefix}-classification-error-report.html")
    # Fill the template
    # Third party modules
    from jinja2 import FileSystemLoader
    from jinja2.environment import Environment

    env = Environment()
    env.loader = FileSystemLoader(template_path)
    t = env.get_template("classification-error-report.html")
    rendered = t.render(
        wrong=wrong,
        correct=correct,
        classification_error=classification_error,
        errors_by_correct_classification=errors_by_correct_classification,
        errors_by_wrong_classification=errors_by_wrong_classification,
        statistical=statistical,
    )
    with open(target, "w") as f:
        f.write(rendered)
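
A minimal, hypothetical invocation of create_report, shown only for
orientation: the input shapes below are inferred from the function body
(each entry of eval_data is read as a dict whose keys are ordered by
descending confidence; each entry of true_data needs at least an "index"
and a "latex" key), not taken from the project's actual pipeline, and it
assumes the hwrt resources (merge.yml, templates) are installed. Running
it writes an HTML report below <project_root>/reports.

from collections import OrderedDict

index2latex = {0: "\\sum", 1: "\\Sigma", 2: "x"}
true_data = [{"index": 0, "latex": "\\sum"},
             {"index": 2, "latex": "x"}]
eval_data = [OrderedDict([(0, 0.9), (1, 0.1)]),   # top-1 correct
             OrderedDict([(1, 0.7), (0, 0.3)])]   # top-1 wrong
create_report(true_data, eval_data, index2latex, n=1, merge=False)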
Exemple #42
0
def main(destination=os.path.join(utils.get_project_root(),
                                  "raw-datasets"),
         dataset='all',
         renderings=False):
    """Main part of the backup script."""
    time_prefix = time.strftime("%Y-%m-%d-%H-%M")
    filename = ("%s-handwriting_datasets-%s-raw.pickle" %
                (time_prefix, dataset.replace('/', '-')))
    destination_path = os.path.join(destination, filename)
    logging.info("Data will be written to '%s'", destination_path)

    cfg = utils.get_database_configuration()
    mysql = cfg['mysql_online']
    connection = pymysql.connect(host=mysql['host'],
                                 user=mysql['user'],
                                 passwd=mysql['passwd'],
                                 db=mysql['db'],
                                 cursorclass=pymysql.cursors.DictCursor)
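    # cursorclass=pymysql.cursors.DictCursor makes every fetched row a
    # dict keyed by column name instead of a tuple.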
    cursor = connection.cursor()

    formulas = get_formulas(cursor, dataset)
    logging.info('Received %i formulas.', len(formulas))
    handwriting_datasets = []
    formula_id2latex = {}

    # Go through each formula and download every raw_data instance
    for formula in formulas:
        formula_id2latex[formula['id']] = formula['formula_in_latex']
        sql = (("SELECT `wm_raw_draw_data`.`id`, `data`, `is_in_testset`, "
                "`wild_point_count`, `missing_line`, `user_id`, "
                "`display_name` "
                "FROM `wm_raw_draw_data` "
                "JOIN `wm_users` ON "
                "(`wm_users`.`id` = `wm_raw_draw_data`.`user_id`) "
                "WHERE `accepted_formula_id` = %s "
                # "AND `display_name` LIKE 'MfrDB::%%'"
                ) %
               str(formula['id']))
        cursor.execute(sql)
        raw_datasets = cursor.fetchall()
        logging.info("%s (%i)", formula['formula_in_latex'], len(raw_datasets))
        for raw_data in raw_datasets:
            try:
                handwriting = HandwrittenData(raw_data['data'],
                                              formula['id'],
                                              raw_data['id'],
                                              formula['formula_in_latex'],
                                              raw_data['wild_point_count'],
                                              raw_data['missing_line'],
                                              raw_data['user_id'],
                                              user_name=raw_data['display_name'])
                handwriting_datasets.append({'handwriting': handwriting,
                                             'id': raw_data['id'],
                                             'formula_id': formula['id'],
                                             'formula_in_latex':
                                             formula['formula_in_latex'],
                                             'is_in_testset':
                                             raw_data['is_in_testset']})
            except Exception as e:
                logging.info("Raw data id: %s", raw_data['id'])
                logging.info(e)
    with open(destination_path, "wb") as fp:
        # Protocol 2 keeps the dump readable from Python 2 as well.
        pickle.dump({'handwriting_datasets': handwriting_datasets,
                     'formula_id2latex': formula_id2latex},
                    fp, 2)

    if renderings:
        logging.info("Start downloading SVG renderings...")
        svgfolder = tempfile.mkdtemp()
        sql = """SELECT t1.formula_id, t1.svg from wm_renderings t1
                 LEFT JOIN wm_renderings t2 ON t1.formula_id = t2.formula_id
                 AND t1.creation_time < t2.creation_time
                 WHERE t2.id is null"""
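        # Greatest-n-per-group anti-join: a rendering row from t1 survives
        # only if no newer row (t2) exists for the same formula_id, so this
        # selects the most recent SVG per formula. On MySQL 8+ the same
        # result could be written with ROW_NUMBER() OVER (PARTITION BY
        # formula_id ORDER BY creation_time DESC); that variant is only a
        # sketch and is not used here.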
        cursor.execute(sql)
        formulas = cursor.fetchall()
        logging.info("Create svg...")
        for formula in formulas:
            filename = os.path.join(svgfolder,
                                    "%s.svg" % str(formula['formula_id']))
            with open(filename, 'wb') as temp_file:
                temp_file.write(formula['svg'])
        logging.info("Tar at %s", os.path.abspath("renderings.tar"))

        with tarfile.open("renderings.tar.bz2", "w:bz2") as tar:
            for fn in os.listdir(svgfolder):
                filename = os.path.join(svgfolder, fn)
                if os.path.isfile(filename):
                    print(filename)
                    tar.add(filename, arcname=os.path.basename(filename))
Exemple #43
0
def test_get_parameters():
    # TODO: nose.proxy.UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80
    #                                      in position 0: invalid start byte
    small = os.path.join(utils.get_project_root(),
                         "preprocessed/small-baseline")
    preprocess_dataset.get_parameters(small)
Exemple #44
0
def create_report(true_data, eval_data, index2latex, n, merge=True):
    """
    Parameters
    ----------
    true_data : list
        Labels
    eval_data : list
        Predicted labels
    index2latex : dict
        Maps the index of an output neuron to a LaTeX command
    n : int
        Number of top predictions considered when deciding whether a
        classification is correct (top-n evaluation).
    merge : bool
        If set to True, confusions between symbols like \sum and \Sigma are
        not counted as errors.
    """
    # Gather data
    correct = []
    wrong = []
    # Get MER classes
    merge_cfg_path = pkg_resources.resource_filename('hwrt', 'misc/')
    merge_cfg_file = os.path.join(merge_cfg_path, "merge.yml")
    with open(merge_cfg_file) as fp:
        merge_data = yaml.safe_load(fp)
    # Make classes
    confusing = make_all(merge_data)
    if not merge:
        confusing = []

    # Get false/true negative/positive for each symbol
    statistical = {}
    possible_keys = []

    assert len(true_data) > 0, "true_data was empty"
    assert len(true_data) == len(eval_data), \
        ("len(true_data)=%i, len(eval_data)=%i" %
         (len(true_data), len(eval_data)))
    for known, evaluated in zip(true_data, eval_data):
        evaluated_t1 = list(evaluated.keys())[0]
        if known['index'] not in statistical:
            statistical[known['index']] = {'FP': 0,
                                           'TP': 0,
                                           'FN': 0,
                                           'TN': 0,
                                           'latex': index2latex[known['index']]
                                           }
            possible_keys.append(known['index'])
        for key in evaluated.keys():
            if key not in statistical:
                if key not in index2latex:
                    logging.error("Key '%s' is not in index2latex. Did you "
                                  "probaly define a too small number of "
                                  "outputnodes?", str(key))
                    logging.error("index2latex.keys(): %s",
                                  str(index2latex.keys()))
                    sys.exit(-1)
                statistical[key] = {'FP': 0,
                                    'TP': 0,
                                    'FN': 0,
                                    'TN': 0,
                                    'latex': index2latex[key]}
                possible_keys.append(key)
        if known['index'] in list(evaluated.keys())[:n]:
            statistical[known['index']]['TP'] += 1
            correct.append(known)
            for key in possible_keys:
                if key != known['index']:
                    statistical[key]['TN'] += 1
        elif (index2latex[known['index']],
              index2latex[evaluated_t1]) in confusing:
            # Some confusions are ok!
            statistical[known['index']]['TP'] += 1
            correct.append(known)
            for key in possible_keys:
                if key != known['index']:
                    statistical[key]['TN'] += 1
        else:
            for key in possible_keys:
                if key != known['index']:
                    if key not in list(evaluated.keys())[:n]:
                        statistical[key]['TN'] += 1
                    else:
                        statistical[key]['FP'] += 1
                else:
                    statistical[key]['FN'] += 1
            formula_id = index2latex[evaluated_t1]
            known['confused'] = formula_id  # The LaTeX of the top-1 prediction
            wrong.append(known)
    classification_error = (len(wrong) / float(len(wrong) + len(correct)))
    logging.info("Classification error (n=%i, MER=%r): %0.4f (%i of %i wrong)",
                 n, merge, classification_error, len(wrong), len(eval_data))

    # Get the data
    errors_by_correct_classification = DefaultOrderedDict(list)
    errors_by_wrong_classification = DefaultOrderedDict(list)
    for el in wrong:
        errors_by_correct_classification[el['latex']].append(el)
        errors_by_wrong_classification[el['confused']].append(el)

    # Sort errors_by_correct_classification
    tmp = sorted(errors_by_correct_classification.items(),
                 key=lambda n: len(n[1]),
                 reverse=True)
    errors_by_correct_classification = OrderedDict(tmp)
    for key in errors_by_correct_classification:
        tmp = sorted(errors_by_correct_classification[key],
                     key=lambda n: n['confused'])
        errors_by_correct_classification[key] = tmp
    # Sort errors_by_wrong_classification
    tmp = sorted(errors_by_wrong_classification.items(),
                 key=lambda n: len(n[1]),
                 reverse=True)
    errors_by_wrong_classification = OrderedDict(tmp)
    for key in errors_by_wrong_classification:
        tmp = sorted(errors_by_wrong_classification[key],
                     key=lambda n: n['latex'])
        errors_by_wrong_classification[key] = tmp

    # Get the template
    project_root = utils.get_project_root()
    template_path = pkg_resources.resource_filename('hwrt', 'templates/')
    template = os.path.join(template_path, "classification-error-report.html")
    with open(template) as f:
        template = f.read()

    # Find right place for report file
    time_prefix = time.strftime("%Y-%m-%d-%H-%M")
    directory = os.path.join(project_root, "reports")
    if not os.path.exists(directory):
        os.makedirs(directory)
    target = os.path.join(project_root,
                          ("reports/"
                           "%s-classification-error-report.html") %
                          time_prefix)
    # Fill the template
    from jinja2 import FileSystemLoader
    from jinja2.environment import Environment
    env = Environment()
    env.loader = FileSystemLoader(template_path)
    t = env.get_template('classification-error-report.html')
    rendered = t.render(
        wrong=wrong,
        correct=correct,
        classification_error=classification_error,
        errors_by_correct_classification=errors_by_correct_classification,
        errors_by_wrong_classification=errors_by_wrong_classification,
        statistical=statistical)
    with open(target, "w") as f:
        f.write(rendered)
Exemple #45
0
def get_parameters_test():
    # TODO: nose.proxy.UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80
    #                                      in position 0: invalid start byte
    small = os.path.join(utils.get_project_root(),
                         "preprocessed/small-baseline")
    preprocess_dataset.get_parameters(small)
Exemple #46
0
def execution_test():
    small = os.path.join(utils.get_project_root(), "models/small-baseline")
    create_model.main(small)