def gensim2json(src, tar):
    model = _load_gensim(src)
    with wf_open(tar) as wf:
        for word in tqdm(model.wv.vocab, "gensim2json: %s --> %s" % (src, tar)):
            print(json.dumps([word, model.wv[word].tolist()]), file=wf)
def dump_kt(self, learner_num, filename, step=50):
    learners = self.generate_learners(learner_num)
    with wf_open(filename) as wf:
        for learner in tqdm(learners, "kss for kt"):
            self._learner_warm_up(learner, step)
            print(json.dumps(learner.exercise_history), file=wf)
def seq2idx(src, tar, vec_json, src_encoding="utf-8", tar_encoding="utf-8"):
    """Convert token sequences in json format into idx sequences in json format."""
    vec_dict = WVDict.from_file(vec_json)
    with rf_open(src, encoding=src_encoding) as f, wf_open(
            tar, encoding=tar_encoding) as wf:
        for line in tqdm(f, desc="converting %s -> %s" % (src, tar)):
            print(json.dumps(vec_dict.token2idx(json.loads(line))), file=wf)
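# A hedged sketch of the per-line conversion seq2idx performs, with a plain
# dict standing in for WVDict (token2idx is assumed to map each token of a
# sequence to its vocabulary index; the indices below are illustrative):
import json

token2idx = {"hello": 12, "world": 47}
line = json.dumps(["hello", "world"])  # one src line: a JSON token list
print(json.dumps([token2idx[t] for t in json.loads(line)]))  # -> [12, 47]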
def extract_students_log(source, target, ku_dict):
    """Requires a large amount of memory to run."""
    outcome = {
        "INCORRECT": 0,
        "CORRECT": 1,
        "HINT": 0,
    }
    students = {}
    with open(ku_dict) as f:
        ku_dict = json.load(f)
    with open(source) as f:
        f.readline()
        for line in tqdm(csv.reader(f, delimiter='\t'), "reading data"):
            student, session, exercise, correct, timestamp = line[0], line[1], ku_dict[line[-5]], \
                outcome[line[10]], line[8]
            if student not in students:
                students[student] = {}
            if session not in students[student]:
                students[student][session] = []
            students[student][session].append([int(timestamp), exercise, correct])

    with wf_open(target) as wf:
        for student_id, sessions in tqdm(students.items(), "sorting"):
            for session_id, exercises in sessions.items():
                exercises.sort(key=lambda x: x[0])
                exercise_response = [(exercise[1], exercise[2]) for exercise in exercises]
                print(json.dumps(exercise_response), file=wf)
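# Output format note: each line written to `target` above is one session's
# (exercise, correct) sequence in chronological order, e.g. (ids illustrative):
#   [[12, 1], [12, 0], [47, 1]]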
def result_file(tmp_path_factory):
    tmp_path = tmp_path_factory.mktemp("result")
    tmp_file = path_append(tmp_path, "result.json", to_str=True)
    with wf_open(tmp_file) as wf:
        for r in result_demo:
            print(json.dumps(r), file=wf)
    return tmp_file
def dump(self, cfg_path: str, override=True, file_format=None):
    """
    Write the configuration parameters to a file.

    Updated in version 1.3.16

    Parameters
    ----------
    cfg_path: str
    override: bool
    file_format: str
    """
    if os.path.isfile(cfg_path) and not override:
        self.logger.warning(
            "file %s already exists, dump aborted" % os.path.abspath(cfg_path)
        )
        return
    self.logger.info(
        "writing configuration parameters to %s" % os.path.abspath(cfg_path)
    )
    file_format = file_format if file_format is not None else self.default_file_format()
    with wf_open(cfg_path) as wf:
        if file_format == "json":
            json.dump(self.parsable_var, wf, indent=2)
        elif file_format == "toml":
            toml.dump(self.parsable_var, wf)
        elif file_format == "yaml":
            yaml.dump(self.parsable_var, wf)
        else:
            raise TypeError(
                "Unsupported file format: %s, only `json`, `toml` and `yaml` are supported" % file_format
            )
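# Usage sketch (assumes `cfg` is an instance of the class defining `dump` above):
#   cfg.dump("params.json")                      # format inferred via default_file_format()
#   cfg.dump("params.toml", file_format="toml")  # explicit format
#   cfg.dump("params.json", override=False)      # warns and aborts if the file exists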
def extract_prerequisite(source, target, ku_dict):
    """In target: (A, B) means predecessor --> successor."""
    with codecs.open(source, encoding="utf-8") as f, open(ku_dict) as kf, wf_open(target) as wf:
        ku_dict = json.load(kf)
        prerequisite_edges = []
        f.readline()
        for line in tqdm(csv.reader(f)):
            if not line[2]:
                continue
            successor = ku_dict[line[0]]
            for prerequisite in line[2].split(','):
                predecessor = ku_dict[prerequisite]
                if predecessor == successor:
                    continue
                if predecessor == 61 and successor == 498:
                    # there is a loop 498 -> 510 -> 61 -> 498 in the original data
                    continue
                prerequisite_edges.append((predecessor, successor))

        logger.info("prerequisite edges: %s" % len(prerequisite_edges))

        # verify that no loop remains in the prerequisite graph
        graph = nx.DiGraph()
        graph.add_edges_from(prerequisite_edges)
        assert not list(nx.algorithms.simple_cycles(graph)), "loop in DiGraph"

        json.dump(prerequisite_edges, wf, indent=2)
def template_copy(src: PATH_TYPE, tar: PATH_TYPE, default_value: (str, dict, None) = "",
                  quotation="\'", key_lower=True, **variables):
    """
    Generate the tar file based on the template file where the variables will be replaced.
    Usually, a variable is specified like `$PROJECT` in the template file.

    Parameters
    ----------
    src: template file
    tar: target location
    default_value: the default value
    quotation: the quotation to wrap the variable value
    variables: the real variable values which are used to replace the variables in the template file
    """
    if not override_check(tar):
        return
    with open(src) as f, wf_open(tar) as wf:
        for line in f:
            print(
                default_variable_replace(line, default_value=default_value, quotation=quotation,
                                         key_lower=key_lower, **variables),
                end='', file=wf
            )
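# Usage sketch (mirrors test_template_copy further below): for a template line
# "project=$PROJECT", calling template_copy(src, tar, quotation='',
# project="longling") yields "project=longling"; with the default quotation,
# the substituted value is assumed to be wrapped as 'longling'.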
def json2csv(src, tar, delimiter=' '):
    with rf_open(src) as f, wf_open(tar) as wf:
        writer = csv.writer(wf, delimiter=delimiter)
        for line in f:
            token, vec = json.loads(line)
            writer.writerow([token] + list(map(str, vec)))
    return tar
def gensim2csv(src, tar, delimiter=" "):
    model = _load_gensim(src)
    with wf_open(tar) as wf:
        writer = csv.writer(wf, delimiter=delimiter)
        for word in tqdm(model.wv.vocab, "gensim2csv: %s --> %s" % (src, tar)):
            writer.writerow([word] + model.wv[word].tolist())
def synthetic2json(src, tar):
    with open(src) as f, wf_open(tar) as wf:
        for line in tqdm(f, desc="%s -> %s" % (src, tar)):
            line = line.strip()
            if not line:  # pragma: no cover
                continue
            elems = line.split(",")
            print(json.dumps([[i, int(ans)] for i, ans in enumerate(elems)]), file=wf)
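# Standalone illustration of the line transformation synthetic2json performs:
# a comma-separated answer row becomes [exercise_index, answer] pairs.
import json

line = "0,1,1,0"
print(json.dumps([[i, int(ans)] for i, ans in enumerate(line.split(","))]))
# -> [[0, 0], [1, 1], [2, 1], [3, 0]]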
def cached(self):
    assert self.cache_queue is not None
    with wf_open(self.cache_file, mode="w") as wf:
        while True:
            data = self.cache_queue.get()
            if isinstance(data, StopIteration):
                break
            print(json.dumps(data), file=wf)
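# Producer-side sketch (assumes another thread fills self.cache_queue):
#   self.cache_queue.put({"x": 1})         # each item becomes one JSON line
#   self.cache_queue.put(StopIteration())  # sentinel instance that ends the loop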
def merge_relationship_annotation(sources, target):
    with wf_open(target) as wf:
        with codecs.open(sources[0]) as f:
            for line in f:
                wf.write(line)
        with codecs.open(sources[1]) as f:
            # skip the duplicated header line of the second file
            f.readline()
            for line in f:
                wf.write(line)
def _write(students, target):
    with wf_open(target) as wf:
        for student_id, sessions in tqdm(students.items(), "writing -> %s" % target):
            for session_id, exercises in sessions.items():
                exercises.sort(key=lambda x: x[0])
                exercise_response = [(exercise[1], exercise[2]) for exercise in exercises]
                print(json.dumps(exercise_response), file=wf)
def csv2json(src, tar, delimiter=' ', skip_first_line=False):
    with rf_open(src) as f, wf_open(tar) as wf:
        if skip_first_line:  # pragma: no cover
            f.readline()
        for line in tqdm(csv.reader(f, delimiter=delimiter), "csv2json: %s --> %s" % (src, tar)):
            token = line[0]
            vec = list(map(float, line[1:]))
            print(json.dumps([token, vec]), file=wf)
    return tar
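# Round-trip note: csv2json inverts json2csv above (up to float formatting).
# Standalone illustration of the per-row parsing (row values illustrative):
import json

row = ["w", "0.1", "0.2"]
print(json.dumps([row[0], list(map(float, row[1:]))]))  # -> ["w", [0.1, 0.2]]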
def build_interactions(users_dir, questions_csv, tar):
    judgement = Judgement(questions_csv)
    with wf_open(tar) as wf:
        for root, dirs, files in os.walk(users_dir):
            for filename in tqdm(files, "building interactions"):
                if re.match(r"u.*\.csv", filename):
                    interactions_seq = csv2interactions(
                        path_append(root, filename, to_str=True), judgement
                    )
                    print(json.dumps(interactions_seq), file=wf)
def dense_graph(ku_num, tar):
    _graph = []
    for i in range(ku_num):
        for j in range(ku_num):
            if i != j:
                _graph.append([i, j])
    with wf_open(tar) as wf:
        json.dump(_graph, wf, indent=2)
def test_path(tmp_path):
    # exercise path_append with mixed relative components
    path_append(tmp_path, "../data", "../dataset1/", "train", to_str=True)
    tmp_file = path_append(tmp_path, "test_path.txt")
    with wf_open(tmp_file) as wf:
        print("hello world", file=wf)
    assert file_exist(tmp_file)
    _dir = abs_current_dir(tmp_file)
    assert parent_dir(_dir, 2) == path_append(_dir, "..", "..")
def build_ku_dict(source, target):
    with codecs.open(source, encoding="utf-8") as f, wf_open(target) as wf:
        f.readline()
        idx = 0
        vertex_dict = {}
        for line in tqdm(csv.reader(f)):
            if line[0] not in vertex_dict:
                vertex_dict[line[0]] = idx
                idx += 1
        logger.info("vertex num: %s" % len(vertex_dict))
        json.dump(vertex_dict, wf, indent=2)
def dense_graph(ku_num: int, tar=None, undirected: bool = False):
    """
    Dense graph where any two vertices are linked.
    No self loops are included.

    Parameters
    ----------
    ku_num: int
    tar
    undirected

    Examples
    --------
    The target file is a json file; json.load can be used to read it.

    Demo of the target file when undirected is False:

    [
        [0, 1],
        [0, 2],
        [1, 0],
        ...
        [2, 0],
        [2, 1]
    ]

    Demo of the target file when undirected is True:

    [
        [0, 1],
        [0, 2],
        [1, 2]
    ]

    >>> dense_graph(3)
    [[0, 1], [0, 2], [1, 0], [1, 2], [2, 0], [2, 1]]
    >>> dense_graph(3, undirected=True)
    [[0, 1], [0, 2], [1, 2]]
    """
    _graph = []
    if undirected:
        for i in range(ku_num):
            for j in range(i + 1, ku_num):
                _graph.append([i, j])
    else:
        for i in range(ku_num):
            for j in range(ku_num):
                if i != j:
                    _graph.append([i, j])
    if tar is not None:
        with wf_open(tar) as wf:
            json.dump(_graph, wf, indent=2)
    return _graph
def _output_graph(graph, tar):
    ku_num = len(graph)
    _graph = []
    for i in range(ku_num):
        for j in range(ku_num):
            if i != j and graph[i][j] > 0:
                _graph.append([i, j, graph[i][j]])
    with wf_open(tar) as wf:
        json.dump(_graph, wf, indent=2)
def extract_difficulty(source, target, ku_dict):
    """
    In target: (A, B, v) means A is similar to B with degree v.
    If v is small, A and B should be considered not similar.
    """
    difficulty = []
    with codecs.open(source, encoding="utf-8") as f, open(ku_dict) as kf, wf_open(target) as wf:
        f.readline()
        ku_dict = json.load(kf)
        for line in csv.reader(f):
            difficulty.append((ku_dict[line[0]], ku_dict[line[1]], float(line[4])))
        logger.info("edges: %s" % len(difficulty))
        logger.info(pandas.Series([sim[-1] for sim in difficulty]).describe())
        json.dump(difficulty, wf, indent=2)
def select_n_most_active(src, tar, n):
    lengths = []
    with open(src) as f:
        for i, line in tqdm(enumerate(f), "evaluating length of each row"):
            lengths.append([i, len(json.loads(line))])
    selected_idx = set(
        list(zip(*heapq.nlargest(n, lengths, key=lambda x: x[1])))[0]
    )
    with open(src) as f, wf_open(tar) as wf:
        for i, line in tqdm(
                enumerate(f),
                "selecting %s most active students from %s to %s" % (n, src, tar)):
            if i not in selected_idx:
                continue
            print(line, end='', file=wf)
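# Core selection logic of select_n_most_active in isolation: keep the indices
# of the n longest rows (a minimal sketch with in-memory data, n=2).
import heapq

lengths = [[0, 5], [1, 3], [2, 8]]  # [row_index, sequence_length]
selected_idx = set(list(zip(*heapq.nlargest(2, lengths, key=lambda x: x[1])))[0])
print(selected_idx)  # {0, 2}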
def test_template_copy(tmpdir):
    config.OVERRIDE = True
    pseudo_template = """
project=$PROJECT
author=$AUTHOR
""".lstrip()
    src = path_append(tmpdir, "src.template")
    tar = path_append(tmpdir, "tar")
    with wf_open(src) as wf:
        print(pseudo_template, file=wf)
    template_copy(src, tar, quotation='', project="longling", author="sherlock")
    with open(tar) as f:
        assert f.readline().strip() == "project=longling"
        assert f.readline().strip() == "author=sherlock"
def test_load_configuration_json(tmpdir, file_format):
    configuration = {"id": "12345", "name": "test_config"}
    filename = path_append(tmpdir, "test_config.%s" % file_format)
    with wf_open(filename) as wf:
        if file_format == "json":
            json.dump(configuration, wf)
        elif file_format == "toml":
            toml.dump(configuration, wf)
        elif file_format == "yaml":
            yaml.dump(configuration, wf)
        else:
            print(configuration, file=wf)
    if file_format == "err":
        with pytest.raises(TypeError):
            with open(filename) as f:
                _c = load_configuration(f, file_format=file_format)
    else:
        with open(filename) as f:
            _c = load_configuration(f, file_format=file_format)
        assert _c["id"] == "12345"
        assert _c["name"] == "test_config"
    # (tail of extract_students_log above)
    with wf_open(target) as wf:
        for student_id, sessions in tqdm(students.items(), "sorting"):
            for session_id, exercises in sessions.items():
                exercises.sort(key=lambda x: x[0])
                exercise_response = [(exercise[1], exercise[2]) for exercise in exercises]
                print(json.dumps(exercise_response), file=wf)


if __name__ == '__main__':
    root = "../../"
    student_log_raw_file = root + "raw_data/junyi/junyi_ProblemLog_for_PSLC.txt"
    student_log_file = root + "data/junyi/student_log_kt.json"
    ku_dict_file = root + "data/junyi/graph_vertex.json"

    # extract_students_log(student_log_raw_file, student_log_file, ku_dict_file)

    student_log_file_small = student_log_file + ".small"
    with open(student_log_file) as f, wf_open(student_log_file_small) as wf:
        for i, line in tqdm(enumerate(f)):
            if i > 50000:
                break
            print(line, end="", file=wf)

    print(train_valid_test(
        student_log_file_small,
        valid_ratio=0.,
        test_ratio=0.2,
        root_dir=root + "data/junyi/",
        silent=False,
    ))
def gitlab_ci(private, stages: dict, atype: str = "", tar_dir: PATH_TYPE = "./", version_in_path=True):
    """
    cli alias: ``arch gitlab_ci``

    Parameters
    ----------
    private
    stages
    atype
    tar_dir
    version_in_path
    """
    base_src = path_append(META, "gitlab-ci", ".gitlab-ci.yml")
    src = path_append(META, "gitlab-ci", "%s.gitlab-ci.yml" % atype)
    tar = path_append(tar_dir, ".gitlab-ci.yml")

    config_template = OrderedDict()
    with open(base_src) as f:
        config_template.update(ordered_yaml_load(f))
    with open(src) as f:
        config_template.update(ordered_yaml_load(f))

    logger.info("generate %s" % tar)

    with wf_open(tar) as wf:
        for _c in ["variables", "cache"]:
            if _c in config_template:
                print(dump_folded_yaml({_c: config_template[_c]}), file=wf)
        print(dump_folded_yaml({
            "stages": [stage for stage in stages.keys() if stage in config_template]
        }), file=wf)
        for stage, params in stages.items():
            if stage == "docs":
                params["registry_suffix"] = "/docs"
            elif stage in {"test", "build"}:
                params["deployment"] = False
            if stage not in config_template:
                logger.warning("%s is not listed in %s, skipped" % (stage, src))
                continue
            commands = {stage: config_template[stage]}
            _gitlab_ci(commands, stage, private=private, version_in_path=version_in_path, **params)
            print(dump_folded_yaml(commands), file=wf)

    if private:
        print("*" * 30)
        print("Note for private projects")
        print(
            "In the project's Settings -> Repository -> Deploy Tokens, "
            "add a token named gitlab-deploy-token with read_registry permission, "
            "leaving the other two fields empty"
        )
        print("*" * 30)