def main():
    """
    Fetch all formulas of type 'nesting symbol' from the database and
    hand them to store_symbols().

    NOTE(review): the previous docstring described a different function
    (``get_formulas(cursor, dataset)`` with a return value); it was
    replaced to match the code below, which takes no parameters and
    returns nothing.
    """
    cfg = utils.get_database_configuration()
    mysql = cfg['mysql_online']
    connection = pymysql.connect(host=mysql['host'],
                                 user=mysql['user'],
                                 passwd=mysql['passwd'],
                                 db=mysql['db'],
                                 cursorclass=pymysql.cursors.DictCursor,
                                 charset='utf8')
    cursor = connection.cursor()
    # The 'single symbol' filter is kept (commented out) as an alternative.
    sql = ("SELECT `id`, `formula_in_latex` FROM `wm_formula` "
           # "WHERE `formula_type` = 'single symbol' "
           "WHERE `formula_type` = 'nesting symbol' "
           "ORDER BY `formula_in_latex` ASC")
    cursor.execute(sql)
    symbols = cursor.fetchall()
    store_symbols(symbols)
def test_execution():
    """Test if the functions execute at all."""
    utils.get_project_root()
    utils.get_latest_model(".", "model")
    utils.get_latest_working_model(".")
    utils.get_latest_successful_run(".")
    # (milliseconds, expected human-readable rendering)
    readable_time_cases = [
        (123, "123ms"),
        (1000 * 30, "30s 0ms"),
        (1000 * 60, "1 minutes 0s 0ms"),
        (1000 * 60 * 60, "1h, 0 minutes 0s 0ms"),
        (2 * 1000 * 60 * 60, "2h, 0 minutes 0s 0ms"),
        (25 * 1000 * 60 * 60 + 3, "25h, 0 minutes 0s 3ms"),
    ]
    for milliseconds, expected in readable_time_cases:
        assert utils.get_readable_time(milliseconds) == expected
    utils.print_status(3, 1, 123)
    utils.get_nntoolkit()
    utils.get_database_config_file()
    utils.get_database_configuration()
    assert utils.sizeof_fmt(1) == "1.0 bytes"
    assert utils.sizeof_fmt(1111) == "1.1 KB"
def main():
    """Dump the database structure and its foreign-key constraints to
    SQL files inside the write-math repository checkout."""
    mysql = utils.get_database_configuration()['mysql_online']
    logging.info("Start dumping structure and constraints...")
    dir_s = "/home/moose/GitHub/write-math"
    structure_file = "%s/database/structure/write-math.sql" % dir_s
    constraints_file = "%s/database/structure/foreign-keys.sql" % dir_s
    # NOTE: 'filename_strucutre' (sic) matches dump_structure()'s actual
    # parameter name; do not "fix" the spelling here alone.
    tables = dump_structure(mysql,
                            prefix='wm_',
                            filename_strucutre=structure_file,
                            filename_constraints=constraints_file)
    logging.info(tables)
def execution_test():
    """Test if the functions execute at all."""
    utils.get_project_root()
    utils.get_latest_model(".", "model")
    utils.get_latest_working_model(".")
    utils.get_latest_successful_run(".")
    # Plain asserts instead of nose.tools.assert_equal: nose is
    # unmaintained, and this matches the style of test_execution().
    assert utils.get_readable_time(123) == "123ms"
    assert utils.get_readable_time(1000 * 30) == "30s 0ms"
    assert utils.get_readable_time(1000 * 60) == "1 minutes 0s 0ms"
    assert utils.get_readable_time(1000 * 60 * 60) == "1h, 0 minutes 0s 0ms"
    assert utils.get_readable_time(2 * 1000 * 60 * 60) == \
        "2h, 0 minutes 0s 0ms"
    assert utils.get_readable_time(25 * 1000 * 60 * 60 + 3) == \
        "25h, 0 minutes 0s 3ms"
    utils.print_status(3, 1, 123)
    utils.get_nntoolkit()
    utils.get_database_config_file()
    utils.get_database_configuration()
    assert utils.sizeof_fmt(1) == "1.0 bytes"
    assert utils.sizeof_fmt(1111) == "1.1 KB"
def main(destination=os.path.join(utils.get_project_root(), "raw-datasets"),
         dataset='all',
         renderings=False):
    """Main part of the backup script.

    Download every raw recording for each formula of `dataset` from the
    MySQL database and pickle them to `destination`. If `renderings` is
    True, additionally download the most recent SVG rendering of each
    formula and pack them into 'renderings.tar.bz2'.

    Parameters
    ----------
    destination : string
        Directory the pickle file is written to.
    dataset : string
        Either 'all' or a path to a yaml symbol file (forwarded to
        get_formulas).
    renderings : bool
        Also download SVG renderings.
    """
    time_prefix = time.strftime("%Y-%m-%d-%H-%M")
    filename = ("%s-handwriting_datasets-%s-raw.pickle" %
                (time_prefix, dataset.replace('/', '-')))
    destination_path = os.path.join(destination, filename)
    logging.info("Data will be written to '%s'", destination_path)
    cfg = utils.get_database_configuration()
    mysql = cfg['mysql_online']
    connection = pymysql.connect(host=mysql['host'],
                                 user=mysql['user'],
                                 passwd=mysql['passwd'],
                                 db=mysql['db'],
                                 cursorclass=pymysql.cursors.DictCursor)
    cursor = connection.cursor()
    formulas = get_formulas(cursor, dataset)
    logging.info('Received %i formulas.', len(formulas))
    handwriting_datasets = []
    formula_id2latex = {}
    # Go through each formula and download every raw_data instance
    for formula in formulas:
        formula_id2latex[formula['id']] = formula['formula_in_latex']
        sql = (("SELECT `wm_raw_draw_data`.`id`, `data`, `is_in_testset`, "
                "`wild_point_count`, `missing_line`, `user_id`, "
                "`display_name` "
                "FROM `wm_raw_draw_data` "
                "JOIN `wm_users` ON "
                "(`wm_users`.`id` = `wm_raw_draw_data`.`user_id`) "
                "WHERE `accepted_formula_id` = %s "
                # "AND `display_name` LIKE 'MfrDB::%%'"
                ) % str(formula['id']))
        cursor.execute(sql)
        raw_datasets = cursor.fetchall()
        logging.info("%s (%i)",
                     formula['formula_in_latex'],
                     len(raw_datasets))
        for raw_data in raw_datasets:
            try:
                handwriting = HandwrittenData(
                    raw_data['data'],
                    formula['id'],
                    raw_data['id'],
                    formula['formula_in_latex'],
                    raw_data['wild_point_count'],
                    raw_data['missing_line'],
                    raw_data['user_id'],
                    user_name=raw_data['display_name'])
                handwriting_datasets.append(
                    {'handwriting': handwriting,
                     'id': raw_data['id'],
                     'formula_id': formula['id'],
                     'formula_in_latex': formula['formula_in_latex'],
                     'is_in_testset': raw_data['is_in_testset']})
            except Exception as e:
                # Skip recordings that fail to parse, but keep a trace.
                logging.info("Raw data id: %s", raw_data['id'])
                logging.info(e)
    # Bug fix: use a context manager so the pickle file handle is
    # closed and flushed deterministically (it was left dangling).
    with open(destination_path, "wb") as pickle_file:
        pickle.dump({'handwriting_datasets': handwriting_datasets,
                     'formula_id2latex': formula_id2latex},
                    pickle_file,
                    2)
    if renderings:
        logging.info("Start downloading SVG renderings...")
        svgfolder = tempfile.mkdtemp()
        # For each formula, keep only the most recent rendering.
        sql = """SELECT t1.formula_id, t1.svg from wm_renderings t1
                 LEFT JOIN wm_renderings t2
                 ON t1.formula_id = t2.formula_id
                    AND t1.creation_time < t2.creation_time
                 WHERE t2.id is null"""
        cursor.execute(sql)
        formulas = cursor.fetchall()
        logging.info("Create svg...")
        for formula in formulas:
            filename = os.path.join(svgfolder,
                                    "%s.svg" % str(formula['formula_id']))
            with open(filename, 'wb') as temp_file:
                temp_file.write(formula['svg'])
        # Bug fix: log the archive that is actually written (.tar.bz2),
        # not 'renderings.tar'.
        logging.info("Tar at %s", os.path.abspath("renderings.tar.bz2"))
        tar = tarfile.open("renderings.tar.bz2", "w:bz2")
        for fn in os.listdir(svgfolder):
            filename = os.path.join(svgfolder, fn)
            if os.path.isfile(filename):
                print(filename)
                tar.add(filename, arcname=os.path.basename(filename))
        tar.close()
def main(dataset='all'):
    """
    Collect per-symbol stroke-count statistics from the database, log
    outlier recordings, and write the stroke-count frequencies to
    'prob_stroke_count_by_symbol.yml'.

    Parameters
    ----------
    dataset : string
        Either 'all' or a path to a yaml symbol file.
    """
    cfg = utils.get_database_configuration()
    mysql = cfg['mysql_online']
    connection = pymysql.connect(host=mysql['host'],
                                 user=mysql['user'],
                                 passwd=mysql['passwd'],
                                 db=mysql['db'],
                                 cursorclass=pymysql.cursors.DictCursor)
    cursor = connection.cursor()
    # TODO: no formulas, only single-symbol ones.
    formulas = get_formulas(cursor, dataset)
    prob = {}
    # Go through each formula and download every raw_data instance
    for formula in formulas:
        stroke_counts = []
        recordings = []
        # Only clean recordings: no wild points, no corrections.
        sql = (("SELECT `wm_raw_draw_data`.`id`, `data`, `is_in_testset`, "
                "`wild_point_count`, `missing_line`, `user_id`, "
                "`display_name` "
                "FROM `wm_raw_draw_data` "
                "JOIN `wm_users` ON "
                "(`wm_users`.`id` = `wm_raw_draw_data`.`user_id`) "
                "WHERE `accepted_formula_id` = %s "
                "AND wild_point_count=0 "
                "AND has_correction=0 "
                # "AND `display_name` LIKE 'MfrDB::%%'"
                ) % str(formula['id']))
        cursor.execute(sql)
        raw_datasets = cursor.fetchall()
        logging.info("%s (%i)",
                     formula['formula_in_latex'],
                     len(raw_datasets))
        for raw_data in raw_datasets:
            try:
                handwriting = HandwrittenData(raw_data['data'],
                                              formula['id'],
                                              raw_data['id'],
                                              formula['formula_in_latex'],
                                              raw_data['wild_point_count'],
                                              raw_data['missing_line'],
                                              raw_data['user_id'])
                stroke_counts.append(len(handwriting.get_pointlist()))
                recordings.append(handwriting)
            except Exception as e:
                # Skip recordings that fail to parse, but log them.
                logging.info("Raw data id: %s", raw_data['id'])
                logging.info(e)
        if len(stroke_counts) > 0:
            logging.info("\t[%i - %i]",
                         min(stroke_counts),
                         max(stroke_counts))
            median = numpy.median(stroke_counts)
            logging.info("\tMedian: %0.2f\tMean: %0.2f\tstd: %0.2f",
                         median,
                         numpy.mean(stroke_counts),
                         numpy.std(stroke_counts))
            # Make prob
            # Frequency of each stroke count, most common first.
            s = sorted(Counter(stroke_counts).items(),
                       key=lambda n: n[1],
                       reverse=True)
            key = formula['formula_in_latex']
            prob[key] = {}
            for stroke_nr, count in s:
                prob[key][stroke_nr] = count
            # Outliers
            # Recordings whose stroke count is not a mode get reported
            # with a review URL and their distance to the nearest mode.
            modes = get_modes(stroke_counts)
            logging.info("\tModes: %s", modes)
            exceptions = []
            for rec in recordings:
                if len(rec.get_pointlist()) not in modes:
                    url = (("http://www.martin-thoma.de/"
                            "write-math/view/?raw_data_id=%i - "
                            "%i strokes") % (rec.raw_data_id,
                                             len(rec.get_pointlist())))
                    dist = get_dist(len(rec.get_pointlist()), modes)
                    exceptions.append((url, len(rec.get_pointlist()), dist))
            print_exceptions(exceptions, max_print=10)
        else:
            logging.debug("No recordings for symbol "
                          "'http://www.martin-thoma.de/"
                          "write-math/symbol/?id=%s'.",
                          formula['id'])
    write_prob(prob, "prob_stroke_count_by_symbol.yml")
# NOTE(review): the statements below look like the tail of a function
# defined elsewhere -- they reference `add_new`, `cursor` and
# `connection`, none of which are defined at module level here.
# Confirm against the original file before relying on this layout.
for el in add_new:
    print("\thttp://write-math.com/view/?raw_data_id=%i" % el)
for rid in add_new:
    # Mark the newly selected recordings as part of the test set.
    sql = ("UPDATE `wm_raw_draw_data` SET `is_in_testset`=1 "
           "WHERE `id` = %i LIMIT 1") % rid
    cursor.execute(sql)
connection.commit()


def get_parser():
    """Return the parser object for this script."""
    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
    parser = ArgumentParser(description=__doc__,
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument("-s", "--symbol",
                        dest="symbol_filename",
                        type=lambda x: utils.is_valid_file(parser, x),
                        required=True,
                        help="symbol yml file",
                        metavar="FILE")
    return parser


if __name__ == '__main__':
    args = get_parser().parse_args()
    cfg = utils.get_database_configuration()
    # Run against every configured database (online and/or local).
    if 'mysql_online' in cfg:
        main(cfg['mysql_online'], args.symbol_filename)
    if 'mysql_local' in cfg:
        main(cfg['mysql_local'], args.symbol_filename)
def main(cfg, raw_data_start_id):
    """Inspect the effect of DotReduction on raw recordings.

    For each formula (id > `raw_data_start_id`) with at least 100
    recordings, preprocess every recording, apply DotReduction and show
    the recordings where the reduction removed more than 2 points.

    Parameters
    ----------
    cfg : dict or None
        Database configuration. If None, it is read via
        utils.get_database_configuration().
    raw_data_start_id : int
        Only formulas with an id greater than this are examined.
    """
    # Bug fix: the `cfg` parameter used to be unconditionally
    # overwritten; the caller-supplied configuration is now honored.
    if cfg is None:
        cfg = utils.get_database_configuration()
    mysql = cfg['mysql_online']
    connection = pymysql.connect(host=mysql['host'],
                                 user=mysql['user'],
                                 passwd=mysql['passwd'],
                                 db=mysql['db'],
                                 cursorclass=pymysql.cursors.DictCursor)
    cursor = connection.cursor()

    # Get formulas
    print("Get formulas")
    sql = ("SELECT `id`, `formula_in_latex` FROM `wm_formula` "
           "WHERE `id` > %s")
    cursor.execute(sql, (raw_data_start_id, ))
    formulas = cursor.fetchall()
    formulaid2latex = {}
    for el in formulas:
        formulaid2latex[el['id']] = el['formula_in_latex']

    preprocessing_queue = [preprocessing.ScaleAndShift(),
                           # preprocessing.Douglas_peucker(EPSILON=0.2),
                           # preprocessing.Space_evenly(number=100,
                           #                            kind='cubic')
                           ]
    checked_formulas = 0
    checked_raw_data_instances = 0
    for formula_id in formulaid2latex.keys():
        if formula_id == 1:
            # This formula id is for trash. No need to look at it.
            continue
        # Get data
        print("Get data for formula_id %i (%s)" %
              (formula_id, formulaid2latex[formula_id]))
        # Parameterized query instead of %-interpolation into the SQL.
        sql = ("SELECT `id`, `data`, `accepted_formula_id`, "
               "`wild_point_count`, `missing_line`, `has_hook`, "
               "`has_too_long_line`, `is_image`, `administrator_edit`, "
               "`other_problem`, `has_interrupted_line` "
               "FROM `wm_raw_draw_data` "
               "WHERE `accepted_formula_id` = %s "
               "ORDER BY `administrator_edit` DESC, "
               "`creation_date` ASC;")
        cursor.execute(sql, (formula_id, ))
        raw_datasets = cursor.fetchall()
        print("Raw datasets: %i" % len(raw_datasets))
        checked_raw_data_instances += len(raw_datasets)
        checked_formulas += 1
        if len(raw_datasets) < 100:
            continue
        for i, data in enumerate(raw_datasets):
            if data['data'] == "[]":
                # Empty recording: nothing to analyze.
                continue
            B = HandwrittenDataM(data['data'],
                                 data['accepted_formula_id'],
                                 data['wild_point_count'],
                                 data['missing_line'],
                                 data['has_hook'],
                                 data['has_too_long_line'],
                                 data['is_image'],
                                 data['other_problem'],
                                 data['has_interrupted_line'],
                                 data['id'],
                                 formulaid2latex[formula_id])
            B.preprocessing(preprocessing_queue)
            Bs = deepcopy(B)
            Bs.preprocessing([preprocessing.DotReduction(0.01)])
            if B != Bs:
                before_pointcount = sum([len(line)
                                         for line in B.get_pointlist()])
                after_pointcount = sum([len(line)
                                        for line in Bs.get_pointlist()])
                print("Reduced %i lines to %i lines." %
                      (len(B.get_pointlist()), len(Bs.get_pointlist())))
                print("Reduced %i points to %i points." %
                      (before_pointcount, after_pointcount))
                if before_pointcount - after_pointcount > 2:
                    B.show()
                    Bs.show()
        print("[Status] Checked formulas: %i of %i" %
              (checked_formulas, len(formulaid2latex)))
        print("[Status] Checked raw_data_instances: %i" %
              checked_raw_data_instances)
    print("done")
def main(dataset='all'):
    """
    Collect per-symbol stroke-count statistics from the database, log
    outlier recordings, and write the stroke-count frequencies to
    'prob_stroke_count_by_symbol.yml'.

    Parameters
    ----------
    dataset : string
        Either 'all' or a path to a yaml symbol file.
    """
    cfg = utils.get_database_configuration()
    mysql = cfg['mysql_online']
    connection = pymysql.connect(host=mysql['host'],
                                 user=mysql['user'],
                                 passwd=mysql['passwd'],
                                 db=mysql['db'],
                                 cursorclass=pymysql.cursors.DictCursor)
    cursor = connection.cursor()
    # TODO: no formulas, only single-symbol ones.
    formulas = get_formulas(cursor, dataset)
    prob = {}
    # Go through each formula and download every raw_data instance
    for formula in formulas:
        stroke_counts = []
        recordings = []
        # Only clean recordings: no wild points, no corrections.
        sql = (("SELECT `wm_raw_draw_data`.`id`, `data`, `is_in_testset`, "
                "`wild_point_count`, `missing_line`, `user_id`, "
                "`display_name` "
                "FROM `wm_raw_draw_data` "
                "JOIN `wm_users` ON "
                "(`wm_users`.`id` = `wm_raw_draw_data`.`user_id`) "
                "WHERE `accepted_formula_id` = %s "
                "AND wild_point_count=0 "
                "AND has_correction=0 "
                # "AND `display_name` LIKE 'MfrDB::%%'"
                ) % str(formula['id']))
        cursor.execute(sql)
        raw_datasets = cursor.fetchall()
        logging.info("%s (%i)",
                     formula['formula_in_latex'],
                     len(raw_datasets))
        for raw_data in raw_datasets:
            try:
                handwriting = HandwrittenData(raw_data['data'],
                                              formula['id'],
                                              raw_data['id'],
                                              formula['formula_in_latex'],
                                              raw_data['wild_point_count'],
                                              raw_data['missing_line'],
                                              raw_data['user_id'])
                stroke_counts.append(len(handwriting.get_pointlist()))
                recordings.append(handwriting)
            except Exception as e:
                # Skip recordings that fail to parse, but log them.
                logging.info("Raw data id: %s", raw_data['id'])
                logging.info(e)
        if len(stroke_counts) > 0:
            logging.info("\t[%i - %i]",
                         min(stroke_counts),
                         max(stroke_counts))
            median = numpy.median(stroke_counts)
            logging.info("\tMedian: %0.2f\tMean: %0.2f\tstd: %0.2f",
                         median,
                         numpy.mean(stroke_counts),
                         numpy.std(stroke_counts))
            # Make prob
            # Frequency of each stroke count, most common first.
            s = sorted(Counter(stroke_counts).items(),
                       key=lambda n: n[1],
                       reverse=True)
            key = formula['formula_in_latex']
            prob[key] = {}
            for stroke_nr, count in s:
                prob[key][stroke_nr] = count
            # Outliers
            # Recordings whose stroke count is not a mode get reported
            # with a review URL and their distance to the nearest mode.
            modes = get_modes(stroke_counts)
            logging.info("\tModes: %s", modes)
            exceptions = []
            for rec in recordings:
                if len(rec.get_pointlist()) not in modes:
                    url = (("http://www.martin-thoma.de/"
                            "write-math/view/?raw_data_id=%i - "
                            "%i strokes") % (rec.raw_data_id,
                                             len(rec.get_pointlist())))
                    dist = get_dist(len(rec.get_pointlist()), modes)
                    exceptions.append((url, len(rec.get_pointlist()), dist))
            print_exceptions(exceptions, max_print=10)
        else:
            logging.debug(
                "No recordings for symbol "
                "'http://www.martin-thoma.de/"
                "write-math/symbol/?id=%s'.",
                formula['id'])
    write_prob(prob, "prob_stroke_count_by_symbol.yml")
def main(cfg, raw_data_start_id):
    """Inspect the effect of DotReduction on raw recordings.

    For each formula (id > `raw_data_start_id`) with at least 100
    recordings, preprocess every recording, apply DotReduction and show
    the recordings where the reduction removed more than 2 points.

    Parameters
    ----------
    cfg : dict or None
        Database configuration. If None, it is read via
        utils.get_database_configuration().
    raw_data_start_id : int
        Only formulas with an id greater than this are examined.
    """
    # Bug fix: the `cfg` parameter used to be unconditionally
    # overwritten; the caller-supplied configuration is now honored.
    if cfg is None:
        cfg = utils.get_database_configuration()
    mysql = cfg['mysql_online']
    connection = pymysql.connect(host=mysql['host'],
                                 user=mysql['user'],
                                 passwd=mysql['passwd'],
                                 db=mysql['db'],
                                 cursorclass=pymysql.cursors.DictCursor)
    cursor = connection.cursor()

    # Get formulas
    print("Get formulas")
    sql = ("SELECT `id`, `formula_in_latex` FROM `wm_formula` "
           "WHERE `id` > %s")
    cursor.execute(sql, (raw_data_start_id, ))
    formulas = cursor.fetchall()
    formulaid2latex = {}
    for el in formulas:
        formulaid2latex[el['id']] = el['formula_in_latex']

    preprocessing_queue = [
        preprocessing.ScaleAndShift(),
        # preprocessing.Douglas_peucker(EPSILON=0.2),
        # preprocessing.Space_evenly(number=100,
        #                            kind='cubic')
    ]
    checked_formulas = 0
    checked_raw_data_instances = 0
    for formula_id in formulaid2latex.keys():
        if formula_id == 1:
            # This formula id is for trash. No need to look at it.
            continue
        # Get data
        print("Get data for formula_id %i (%s)" %
              (formula_id, formulaid2latex[formula_id]))
        # Parameterized query instead of %-interpolation into the SQL.
        sql = ("SELECT `id`, `data`, `accepted_formula_id`, "
               "`wild_point_count`, `missing_line`, `has_hook`, "
               "`has_too_long_line`, `is_image`, `administrator_edit`, "
               "`other_problem`, `has_interrupted_line` "
               "FROM `wm_raw_draw_data` "
               "WHERE `accepted_formula_id` = %s "
               "ORDER BY `administrator_edit` DESC, "
               "`creation_date` ASC;")
        cursor.execute(sql, (formula_id, ))
        raw_datasets = cursor.fetchall()
        print("Raw datasets: %i" % len(raw_datasets))
        checked_raw_data_instances += len(raw_datasets)
        checked_formulas += 1
        if len(raw_datasets) < 100:
            continue
        for i, data in enumerate(raw_datasets):
            if data['data'] == "[]":
                # Empty recording: nothing to analyze.
                continue
            B = HandwrittenDataM(data['data'],
                                 data['accepted_formula_id'],
                                 data['wild_point_count'],
                                 data['missing_line'],
                                 data['has_hook'],
                                 data['has_too_long_line'],
                                 data['is_image'],
                                 data['other_problem'],
                                 data['has_interrupted_line'],
                                 data['id'],
                                 formulaid2latex[formula_id])
            B.preprocessing(preprocessing_queue)
            Bs = deepcopy(B)
            Bs.preprocessing([preprocessing.DotReduction(0.01)])
            if B != Bs:
                before_pointcount = sum(
                    [len(line) for line in B.get_pointlist()])
                after_pointcount = sum(
                    [len(line) for line in Bs.get_pointlist()])
                print("Reduced %i lines to %i lines." %
                      (len(B.get_pointlist()), len(Bs.get_pointlist())))
                print("Reduced %i points to %i points." %
                      (before_pointcount, after_pointcount))
                if before_pointcount - after_pointcount > 2:
                    B.show()
                    Bs.show()
        print("[Status] Checked formulas: %i of %i" %
              (checked_formulas, len(formulaid2latex)))
        print("[Status] Checked raw_data_instances: %i" %
              checked_raw_data_instances)
    print("done")
def main():
    """Run find_wrong_count against the online MySQL configuration."""
    configuration = utils.get_database_configuration()
    find_wrong_count(configuration['mysql_online'])
def main(destination=os.path.join(utils.get_project_root(), "raw-datasets"),
         dataset='all',
         renderings=False):
    """Main part of the backup script.

    Download every raw recording for each formula of `dataset` from the
    MySQL database and pickle them to `destination`. If `renderings` is
    True, additionally download the most recent SVG rendering of each
    formula and pack them into 'renderings.tar.bz2'.

    Parameters
    ----------
    destination : string
        Directory the pickle file is written to.
    dataset : string
        Either 'all' or a path to a yaml symbol file (forwarded to
        get_formulas).
    renderings : bool
        Also download SVG renderings.
    """
    time_prefix = time.strftime("%Y-%m-%d-%H-%M")
    filename = ("%s-handwriting_datasets-%s-raw.pickle" %
                (time_prefix, dataset.replace('/', '-')))
    destination_path = os.path.join(destination, filename)
    logging.info("Data will be written to '%s'", destination_path)
    cfg = utils.get_database_configuration()
    mysql = cfg['mysql_online']
    connection = pymysql.connect(host=mysql['host'],
                                 user=mysql['user'],
                                 passwd=mysql['passwd'],
                                 db=mysql['db'],
                                 cursorclass=pymysql.cursors.DictCursor)
    cursor = connection.cursor()
    formulas = get_formulas(cursor, dataset)
    logging.info('Received %i formulas.', len(formulas))
    handwriting_datasets = []
    formula_id2latex = {}
    # Go through each formula and download every raw_data instance
    for formula in formulas:
        formula_id2latex[formula['id']] = formula['formula_in_latex']
        sql = (("SELECT `wm_raw_draw_data`.`id`, `data`, `is_in_testset`, "
                "`wild_point_count`, `missing_line`, `user_id`, "
                "`display_name` "
                "FROM `wm_raw_draw_data` "
                "JOIN `wm_users` ON "
                "(`wm_users`.`id` = `wm_raw_draw_data`.`user_id`) "
                "WHERE `accepted_formula_id` = %s "
                # "AND `display_name` LIKE 'MfrDB::%%'"
                ) % str(formula['id']))
        cursor.execute(sql)
        raw_datasets = cursor.fetchall()
        logging.info("%s (%i)",
                     formula['formula_in_latex'],
                     len(raw_datasets))
        for raw_data in raw_datasets:
            try:
                handwriting = HandwrittenData(
                    raw_data['data'],
                    formula['id'],
                    raw_data['id'],
                    formula['formula_in_latex'],
                    raw_data['wild_point_count'],
                    raw_data['missing_line'],
                    raw_data['user_id'],
                    user_name=raw_data['display_name'])
                handwriting_datasets.append(
                    {'handwriting': handwriting,
                     'id': raw_data['id'],
                     'formula_id': formula['id'],
                     'formula_in_latex': formula['formula_in_latex'],
                     'is_in_testset': raw_data['is_in_testset']})
            except Exception as e:
                # Skip recordings that fail to parse, but keep a trace.
                logging.info("Raw data id: %s", raw_data['id'])
                logging.info(e)
    # Bug fix: use a context manager so the pickle file handle is
    # closed and flushed deterministically (it was left dangling).
    with open(destination_path, "wb") as pickle_file:
        pickle.dump({'handwriting_datasets': handwriting_datasets,
                     'formula_id2latex': formula_id2latex},
                    pickle_file,
                    2)
    if renderings:
        logging.info("Start downloading SVG renderings...")
        svgfolder = tempfile.mkdtemp()
        # For each formula, keep only the most recent rendering.
        sql = """SELECT t1.formula_id, t1.svg from wm_renderings t1
                 LEFT JOIN wm_renderings t2
                 ON t1.formula_id = t2.formula_id
                    AND t1.creation_time < t2.creation_time
                 WHERE t2.id is null"""
        cursor.execute(sql)
        formulas = cursor.fetchall()
        logging.info("Create svg...")
        for formula in formulas:
            filename = os.path.join(svgfolder,
                                    "%s.svg" % str(formula['formula_id']))
            with open(filename, 'wb') as temp_file:
                temp_file.write(formula['svg'])
        # Bug fix: log the archive that is actually written (.tar.bz2),
        # not 'renderings.tar'.
        logging.info("Tar at %s", os.path.abspath("renderings.tar.bz2"))
        tar = tarfile.open("renderings.tar.bz2", "w:bz2")
        for fn in os.listdir(svgfolder):
            filename = os.path.join(svgfolder, fn)
            if os.path.isfile(filename):
                print(filename)
                tar.add(filename, arcname=os.path.basename(filename))
        tar.close()