def test_json2tl(shared_data_dir):
    src = path_append(shared_data_dir, "junyi", "data", "student_log_kt_1000", to_str=True)
    tl_tar = path_append(shared_data_dir, "junyi", "data", "student_log_kt_1000.tl", to_str=True)
    json_tar = path_append(shared_data_dir, "junyi", "data", "student_log_kt_1000.json", to_str=True)
    json2tl(src, tl_tar)
    tl2json(tl_tar, json_tar)
    assert True
def test_build_interactions(shared_data_dir):
    question_csv_path = path_append(shared_data_dir, "tests", "EdNet", "contents", "questions.csv")
    users_dir = path_append(shared_data_dir, "tests", "EdNet", "KT1")
    user_csv = path_append(users_dir, "u1.csv")
    judgement = Judgement(question_csv_path)
    interactions = csv2interactions(user_csv, judgement)
    assert interactions[0] == [5011, 0]
    tar = path_append(shared_data_dir, "tests", "EdNet", "KT1", "data", "kt.json")
    build_interactions(users_dir, question_csv_path, tar)
    with open(tar) as f:
        assert json.loads(f.readline())[0] == [5011, 0]
        assert len(json.loads(f.readline())) == 16
    tar2 = path_append(tar, to_str=True) + ".%s" % 1
    select_n_most_active(tar, tar2, 1)
    with open(tar2) as f:
        assert json.loads(f.readline())[0] == [5011, 0]
def test_loading(tmpdir):
    csv_src = path_append(tmpdir, "test.csv")
    json_src = path_append(tmpdir, "test.json")
    text_to_csv(csv_src)
    csv2jsonl(csv_src, json_src)
    jsonl2csv(json_src, csv_src)
    for src in [csv_src, json_src, load_jsonl(json_src)]:
        for i, line in enumerate(loading(src)):
            assert int(line["id"]) == i, line
            if i == 0:
                assert line["name"] == "Tom", line
            elif i == 1:
                assert line["name"] == "Jerry", line
    src = path_append(tmpdir, "test")
    with as_out_io(src) as wf:
        print(DEMO_TEXT.strip(), file=wf)
    assert [line.strip() for line in loading(src)] == DEMO_TEXT.strip().split("\n")
    with as_io(src) as f:
        assert [line.strip() for line in loading(f)] == DEMO_TEXT.strip().split("\n")
    assert "hello world" == loading(lambda: "hello world")
def download_data(url, data_dir, override, bloom_filter: set = None):
    bloom_filter = set() if bloom_filter is None else bloom_filter
    if url in bloom_filter:  # pragma: no cover
        return
    if url.endswith("/"):  # a url ending with "/" is a directory, anything else is a file
        _data_dir = path_append(data_dir, url.split('/')[-2], to_str=True)
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, "lxml")
        al = soup.find_all('a')
        for a in al:
            # get the link target
            h = a.get('href')
            if h[0] != '.':
                url_h = url + h
                if url_h not in bloom_filter:
                    download_data(url_h, _data_dir, override, bloom_filter)
        bloom_filter.add(url)
    else:
        os.makedirs(data_dir, exist_ok=True)
        save_path = path_append(data_dir, url.split('/')[-1], to_str=True)
        download_file(url, save_path, override)
        bloom_filter.add(url)
    return data_dir
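# A hypothetical usage sketch of download_data above (the urls and target directory are
# placeholders for illustration, not values defined in this project): a url ending with "/"
# is treated as a directory listing and mirrored recursively, any other url is fetched as a
# single file.
#
#   download_data("https://example.com/datasets/demo/", "./data", override=False)
#   download_data("https://example.com/datasets/demo.zip", "./data", override=False)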
def _update(self, **kwargs):
    params = kwargs
    params["logger"] = params.pop(
        "logger",
        config_logging(logger=params.get("model_name", self.model_name), console_log_level="info"))
    for key in params:
        if key.endswith("_params") and key + "_update" in params:
            params[key].update(params[key + "_update"])
    self.deep_update(**params)
    _vars = ["ctx"]
    for _var in _vars:
        if _var in kwargs:
            try:
                setattr(self, _var, eval_var(kwargs[_var]))
            except TypeError:
                pass
    self.validation_result_file = path_append(self.model_dir, RESULT_JSON, to_str=True)
    self.cfg_path = path_append(self.model_dir, CFG_JSON, to_str=True)
def test_copy(tmpdir):
    src_dir = path_append(tmpdir, "src")
    tar_dir = path_append(tmpdir, "tar")
    src = path_append(src_dir, "src.txt")
    tar = path_append(tar_dir, "tar.txt")
    with as_out_io(src) as wf:
        print("hello world", file=wf)
    config.OVERRIDE = False
    copytree(src_dir, tar_dir)
    copytree(src_dir, tar_dir)
    copyfile(src, tar)
    template_copy(src, tar)
    config.OVERRIDE = True
    copytree(src_dir, tar_dir)
    copyfile(src, tar)
    config.OVERRIDE = None
    with simulate_stdin("y", "y"):
        copytree(src_dir, tar_dir)
        copyfile(src, tar)
    with simulate_stdin("n", "n"):
        copytree(src_dir, tar_dir)
        copyfile(src, tar)
    with simulate_stdin("unk", "y"):
        default_legal_input("", __legal_input={"y"})
def build_json_sequence(src_root: str = "../raw_data/junyi/",
                        tar_root: str = "../data/junyi/data/",
                        ku_dict_path: str = "../data/junyi/data/graph_vertex.json",
                        n: int = 1000):
    select_n_most_frequent_students(
        path_append(src_root, "junyi_ProblemLog_for_PSLC.txt", to_str=True),
        path_append(tar_root, "student_log_kt_", to_str=True),
        ku_dict_path,
        n,
    )
def transfer_synthetic_dataset(src_dir, tar_dir):
    for root, dirs, files in os.walk(src_dir):
        for filename in files:
            src = PurePath(path_append(root, filename))
            if src.suffix != ".csv":  # pragma: no cover
                continue
            tar = path_append(tar_dir, src.with_suffix(".json").name)
            synthetic2json(src, tar)
def test_analysis(shared_data_dir):
    src = path_append(shared_data_dir, "junyi", "data", "student_log_kt_1000", to_str=True)
    analysis_records(src)
    graph_src = path_append(shared_data_dir, "dense_graph", to_str=True)
    analysis_edges(graph_src)
    graph_src = path_append(shared_data_dir, "transition_graph", to_str=True)
    analysis_edges(graph_src, threshold=0.5)
    analysis_edges(graph_src, threshold=None)
def test_path(tmp_path):
    path_append(tmp_path, "../data", "../dataset1/", "train", to_str=True)
    tmp_file = path_append(tmp_path, "test_path.txt")
    with wf_open(tmp_file) as wf:
        print("hello world", file=wf)
    assert file_exist(tmp_file)
    _dir = abs_current_dir(tmp_file)
    assert parent_dir(_dir, 2) == path_append(_dir, "..", "..")
def test_graph(shared_data_dir):
    json_src = path_append(shared_data_dir, "junyi", "data", "student_log_kt_1000.json", to_str=True)
    dense_graph(835, path_append(shared_data_dir, "dense_graph", to_str=True))
    trans_graph = path_append(shared_data_dir, "transition_graph", to_str=True)
    transition_graph(835, json_src, tar=trans_graph)
    ctrans_graph = path_append(shared_data_dir, "correct_transition_graph", to_str=True)
    correct_transition_graph(835, json_src, tar=ctrans_graph)
    ctrans_sim = path_append(shared_data_dir, "correct_transition_sim_graph", to_str=True)
    similarity_graph(835, ctrans_graph, ctrans_sim)
def _update(self, **kwargs):
    params = kwargs
    params["logger"] = params.pop(
        "logger",
        config_logging(logger=params.get("model_name", self.model_name), console_log_level="info"))
    for key in params:
        if key.endswith("_params") and key + "_update" in params:
            params[key].update(params[key + "_update"])

    # path_override_check
    path_check_list = [
        "dataset", "root_data_dir", "workspace", "root_model_dir", "model_dir"
    ]
    _overridden = {}
    for path_check in path_check_list:
        if kwargs.get(path_check) is None or kwargs[path_check] == getattr(self, "%s" % path_check):
            _overridden[path_check] = False
        else:
            _overridden[path_check] = True

    for param, value in params.items():
        setattr(self, "%s" % param, value)

    def is_overridden(varname):
        return _overridden["%s" % varname]

    # set dataset
    if is_overridden("dataset") and not is_overridden("root_data_dir"):
        kwargs["root_data_dir"] = path_append("$root", "data", "$dataset")

    # set workspace
    if (is_overridden("workspace") or is_overridden("root_model_dir")) and not is_overridden("model_dir"):
        kwargs["model_dir"] = path_append("$root_model_dir", "$workspace")

    # rebuild relevant directory or file path according to the kwargs
    _dirs = [
        "workspace", "root_data_dir", "data_dir", "root_model_dir", "model_dir"
    ]
    for _dir in _dirs:
        exp = var2exp(kwargs.get(_dir, getattr(self, _dir)), env_wrap=lambda x: "self.%s" % x)
        setattr(self, _dir, eval(exp))

    self.validation_result_file = path_append(self.model_dir, RESULT_JSON, to_str=True)
    self.cfg_path = path_append(self.model_dir, CFG_JSON, to_str=True)
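# The "$var" tokens written into kwargs above (e.g. "$root_model_dir", "$workspace") are
# turned by var2exp into expressions over the configuration object and then eval'ed.
# Below is a minimal, self-contained sketch of that kind of substitution, given as an
# assumed illustration only; it is not the longling implementation.
import re


def _expand_path_template(template, obj):
    # replace each "$name" token with str(getattr(obj, "name"))
    return re.sub(r"\$(\w+)", lambda m: str(getattr(obj, m.group(1))), str(template))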
def test_load_json(tmpdir):
    csv_src = path_append(tmpdir, "test.csv")
    src = path_append(tmpdir, "json.csv")
    text_to_csv(csv_src)
    csv2jsonl(csv_src, src)
    for i, line in enumerate(load_jsonl(src)):
        assert int(line["id"]) == i
        if i == 0:
            assert line["name"] == "Tom"
        elif i == 1:
            assert line["name"] == "Jerry"
def test_io_type(tmp_path):
    with pytest.raises(TypeError):
        wf_open(12345)
    with pytest.raises(TypeError):
        rf_open(12345)
    with as_out_io(path_append(tmp_path, "test_out")) as wf:
        with as_out_io(wf):
            pass
    with as_io(path_append(tmp_path, "test_out")) as f:
        with rf_open(f):
            pass
def load_environment_parameters(directory=None):
    if directory is None:
        directory = path_append(abs_current_dir(__file__), "meta_data")
    return {
        "configuration": load_configuration(path_append(directory, "configuration.json")),
        "knowledge_structure": load_knowledge_structure(
            path_append(directory, "knowledge_structure.csv")),
        "learning_order": load_learning_order(path_append(directory, "learning_order.json")),
        "items": load_items(path_append(directory, "items.json"))
    }
def test_encode(tmpdir):
    demo_text = "测试用中文\nhello world\n如果再重来"
    src = path_append(tmpdir, "gbk.txt")
    tar = path_append(tmpdir, "utf8.txt")
    with wf_open(src, encoding="gbk") as wf:
        print(demo_text, end='', file=wf)
    encode(src, "gbk", tar, "utf-8")
    with rf_open(tar) as f:
        for line in f:
            print(line)
def load_environment_parameters(directory):
    return {
        "transition_matrix": load_transition_matrix(path_append(directory, "transition_matrix.json")),
        "configuration": load_configuration(path_append(directory, "configuration.json")),
        "knowledge_structure": load_knowledge_structure(
            path_append(directory, "knowledge_structure.csv")),
        "state2vector": load_state_to_vector(path_append(directory, "state2vector.json")),
        "initial_states": load_initial_states(path_append(directory, "initial_states.json")),
    }
def result_file(tmp_path_factory):
    tmp_path = tmp_path_factory.mktemp("result")
    tmp_file = path_append(tmp_path, "result.json", to_str=True)
    with wf_open(tmp_file) as wf:
        for r in result_demo:
            print(json.dumps(r), file=wf)
    return tmp_file
def net_viz(_net, _cfg, view_tag=False, **kwargs):  # pragma: no cover
    """visualization check, only supports pure static networks"""
    batch_size = _cfg.batch_size
    model_dir = _cfg.model_dir
    logger = kwargs.get(
        'logger',
        _cfg.logger if hasattr(_cfg, 'logger') else logging
    )
    try:
        viz_dir = path_append(model_dir, "plot/network")
        logger.info("visualization: file in %s" % viz_dir)
        from copy import deepcopy
        viz_net = deepcopy(_net)
        viz_net.length = 2
        viz_shape = {'data': (batch_size,) + (2,)}
        x = mx.sym.var("data")
        sym = viz_net(x)[1][-1]
        plot_network(
            nn_symbol=sym,
            save_path=viz_dir,
            shape=viz_shape,
            node_attrs={"fixedsize": "false"},
            view=view_tag
        )
    except VizError as e:
        logger.error("error happen in visualization, aborted")
        logger.error(e)
def toolbox_init(
        self,
        evaluation_formatter_parameters=None,
        validation_logger_mode="w",
        silent=False,
):
    from longling import path_append
    from longling.lib.clock import Clock
    from longling.lib.utilog import config_logging
    from longling.ML.toolkit import EvalFormatter as Formatter
    from longling.ML.toolkit import MovingLoss, ConsoleProgressMonitor as ProgressMonitor

    self.toolbox = {
        "monitor": dict(),
        "timer": None,
        "formatter": dict(),
    }

    mod = self.mod
    cfg = self.mod.cfg

    # 4.1 todo: define the loss functions
    # bp_loss_f defines the loss function used for back propagation;
    # there must be exactly one, and its name must not be of the form *_\d+
    assert self.loss_function is not None
    loss_monitor = MovingLoss(self.loss_function)

    # 4.1 todo: initialize the progress and timing information used during training
    timer = Clock()
    progress_monitor = ProgressMonitor(
        indexes={"Loss": [name for name in self.loss_function]},
        values={"Loss": loss_monitor.losses},
        end_epoch=cfg.end_epoch - 1,
        silent=silent
    )

    validation_logger = config_logging(
        filename=path_append(cfg.model_dir, "result.log"),
        logger="%s-validation" % cfg.model_name,
        mode=validation_logger_mode,
        log_format="%(message)s",
    )

    # set evaluation formatter
    evaluation_formatter_parameters = {} \
        if evaluation_formatter_parameters is None \
        else evaluation_formatter_parameters

    evaluation_formatter = Formatter(
        logger=validation_logger,
        dump_file=mod.cfg.validation_result_file,
        **evaluation_formatter_parameters
    )

    self.toolbox["monitor"]["loss"] = loss_monitor
    self.toolbox["monitor"]["progress"] = progress_monitor
    self.toolbox["timer"] = timer
    self.toolbox["formatter"]["evaluation"] = evaluation_formatter
def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False):
    try:
        return download_data(url_dict[dataset], data_dir, override)
    except FileExistsError:
        return path_append(data_dir, url_dict[dataset].split('/')[-1], to_str=True)
def test_configuration(tmpdir, file_format):
    _config = DemoConfiguration()
    assert _config.class_var == DemoConfiguration.vars()
    assert _config.parsable_var == DemoConfiguration.pvars()
    filename = path_append(tmpdir, "test_config.%s" % file_format, to_str=True)
    _config.b = 4
    if file_format == ".err":
        with pytest.raises(TypeError):
            _config.dump(filename, override=True, file_format=file_format)
        return
    else:
        _config.dump(filename, override=True, file_format=file_format)
    _config.dump(filename, override=False, file_format=file_format)
    _config = DemoConfiguration.load(filename, file_format=file_format)
    assert "a" in _config
    assert _config["a"] == 1 and _config.b == 4
    print(_config)
    assert len(_config.items()) == 2
def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False, url_dict: dict = None):
    """
    Parameters
    ----------
    dataset: str
        name of the dataset
    data_dir: str
        directory where the data is stored
    override: bool
        whether to override existing files
    url_dict: dict
        mapping from dataset names to download urls

    Returns
    -------
    """
    url_dict = URL_DICT if not url_dict else url_dict
    if dataset in url_dict:
        url = url_dict[dataset]
    elif re.match("http(s?)://.*", dataset):
        url = dataset
    else:
        raise ValueError("%s is neither a valid dataset name nor an url" % dataset)
    try:
        return download_data(url, data_dir, override)
    except FileExistsError:
        return path_append(data_dir, url.split('/')[-1], to_str=True)
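# A hypothetical usage sketch of get_data above (the dataset name and url below are
# placeholders for illustration; the real names are the keys of URL_DICT):
#
#   get_data("some_dataset", data_dir="./data")              # resolve the url by dataset name
#   get_data("https://example.com/data/demo.zip", "./data")  # or pass a url directly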
def numerical_check(_net, _cfg: Configuration, train_data, test_data, dump_result=False):  # pragma: no cover
    ctx = _cfg.ctx
    batch_size = _cfg.batch_size
    _net.initialize(ctx=ctx)

    bp_loss_f = get_bp_loss(**_cfg.loss_params)
    loss_function = {}
    loss_function.update(bp_loss_f)

    from longling.ML.MxnetHelper.glue import module
    from longling.ML.toolkit import EvalFormatter as Formatter
    from longling.ML.toolkit import MovingLoss
    from tqdm import tqdm

    loss_monitor = MovingLoss(loss_function)
    progress_monitor = tqdm

    if dump_result:
        from longling import config_logging
        validation_logger = config_logging(
            filename=path_append(_cfg.model_dir, "result.log"),
            logger="%s-validation" % _cfg.model_name,
            mode="w",
            log_format="%(message)s",
        )
        evaluation_formatter = Formatter(
            logger=validation_logger,
            dump_file=_cfg.validation_result_file,
        )
    else:
        evaluation_formatter = Formatter()

    # train check
    trainer = module.Module.get_trainer(
        _net, optimizer=_cfg.optimizer,
        optimizer_params=_cfg.optimizer_params,
        select=_cfg.train_select
    )

    for epoch in range(_cfg.begin_epoch, _cfg.end_epoch):
        for batch_data in progress_monitor(train_data, "Epoch: %s" % epoch):
            fit_f(
                net=_net, batch_size=batch_size, batch_data=batch_data,
                trainer=trainer, bp_loss_f=bp_loss_f,
                loss_function=loss_function,
                loss_monitor=loss_monitor,
                ctx=ctx,
            )
        if epoch % 1 == 0:
            print(
                evaluation_formatter(
                    epoch=epoch,
                    loss_name_value=dict(loss_monitor.items()),
                    eval_name_value=eval_f(_net, test_data, ctx=ctx),
                    extra_info=None,
                    dump=True,
                )[0]
            )
def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False, url_dict: dict = None):
    """
    Parameters
    ----------
    dataset: str
        name of the dataset
    data_dir: str
        directory where the data is stored
    override: bool
        whether to override existing files
    url_dict: dict
        mapping from dataset names to download urls

    Returns
    -------
    """
    url_dict = URL_DICT if not url_dict else url_dict
    try:
        return download_data(url_dict[dataset], data_dir, override)
    except FileExistsError:  # pragma: no cover
        return path_append(data_dir, url_dict[dataset].split('/')[-1], to_str=True)
def _update(self, **kwargs):
    params = kwargs
    params["logger"] = params.pop(
        "logger",
        config_logging(logger=params.get("model_name", self.model_name), console_log_level="info"))
    for key in params:
        if key.endswith("_params") and key + "_update" in params:
            params[key].update(params[key + "_update"])
    self.deep_update(**params)
    self.validation_result_file = path_append(self.model_dir, RESULT_JSON, to_str=True)
    self.cfg_path = path_append(self.model_dir, CFG_JSON, to_str=True)
def get_epoch_params_filepath(model_name: str, epoch: int, model_dir: str = "./"):
    """
    Examples
    --------
    >>> get_epoch_params_filepath("CNN", 10)
    'CNN-0010.params'
    """
    return path_append(model_dir, epoch_params_filename(model_name, epoch), to_str=True)
def get_params_filepath(model_name: str, model_dir: str = "./"):
    """
    Examples
    --------
    >>> get_params_filepath("CNN")
    'CNN.params'
    """
    return path_append(model_dir, params_filename(model_name), to_str=True)
def build_knowledge_graph(src_root: str, tar_root: (str, None) = None,
                          ku_dict_path: str = None,
                          prerequisite_path: (str, None) = None,
                          similarity_path: (str, None) = None,
                          difficulty_path: (str, None) = None):
    tar_root = tar_root if tar_root is not None else src_root
    exercise_src = path_append(src_root, "junyi_Exercise_table.csv")

    assert ku_dict_path is not None

    relation_src = merge_relationship_annotation(
        [path_append(src_root, "relationship_annotation_{}.csv".format(name)) for name in ["testing", "training"]],
        path_append(src_root, "relationship_annotation.csv")
    )

    ku_dict_path = path_append(tar_root, ku_dict_path)
    build_ku_dict(exercise_src, ku_dict_path)

    if prerequisite_path is not None:
        prerequisite_path = path_append(tar_root, prerequisite_path)
        extract_prerequisite(exercise_src, prerequisite_path, ku_dict_path)

    if similarity_path is not None:
        similarity_path = path_append(tar_root, "similarity.json")
        extract_similarity(relation_src, similarity_path, ku_dict_path)

    if difficulty_path is not None:
        difficulty_path = path_append(tar_root, "difficulty.json")
        extract_difficulty(relation_src, difficulty_path, ku_dict_path)
def test_load_csv(tmpdir):
    src = path_append(tmpdir, "test.csv")
    text_to_csv(src)
    for i, line in enumerate(load_csv(src)):
        assert int(line["id"]) == i
        if i == 0:
            assert line["name"] == "Tom"
        elif i == 1:
            assert line["name"] == "Jerry"