# NOTE: these functions assume module-level imports defined elsewhere in the
# repo: numpy as np, collections.Counter, and the project helpers PATH,
# MODELS, load_data, dump_data, iter_trees, Holstep, BaseModel, Ref.
def clean_unique_tokens(data_prefix, part_prefix):
    tokens = load_data(PATH() + '{}_{}_unique_tokens_base.data'.format(part_prefix, data_prefix))
    print(len(tokens))
    # Collect every variable and variable-function name seen in the trees.
    varset = set()
    varfuncset = set()
    for v, vf, tree in iter_trees(data_prefix, part_prefix):
        varset.update(v)
        varfuncset.update(vf)
    print(len(varset))
    print(len(varfuncset))
    keys_to_delete = []
    # Fold all variable / variable-function tokens into the single buckets
    # VAR and VARFUNC.  (Bug fix: 'varfuncset' was quoted, so the loop
    # iterated over the characters of the string instead of the set.)
    for k, kset in [('VAR', varset), ('VARFUNC', varfuncset)]:
        tokens[k] = 0
        for v in kset:
            if v in tokens:
                tokens[k] += tokens[v]
                keys_to_delete.append(v)
    # Fold numeric tokens of the form _<digits> into a single _n bucket.
    tokens['_n'] = 0
    for k in tokens:
        if k[0] == '_' and k[1:].isdigit():
            tokens['_n'] += tokens[k]
            keys_to_delete.append(k)
    tokens['UNK'] = 0
    print(len(keys_to_delete))
    for k in keys_to_delete:
        del tokens[k]
    print(len(tokens))
    print()
    print()
    dump_data(PATH() + '{}_{}_unique_tokens.data'.format(part_prefix, data_prefix), tokens)

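# Hypothetical driver sketch (the entry point is not shown in this file):
# the two token-building stages must run in this order, since
# clean_unique_tokens consumes the _unique_tokens_base.data file that
# build_unique_tokens (defined further down) produces.  The prefix values
# below are assumptions, not taken from the repo.
def rebuild_token_vocab():
    for part_prefix in ('train', 'test'):
        build_unique_tokens('premise', part_prefix)
        clean_unique_tokens('premise', part_prefix)
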
def load_texts(table, data_prefix, part_prefix):
    # set() turns the O(n) numpy membership test into an O(1) lookup;
    # the kept texts are unchanged.
    ids = set(np.load(PATH() + '{}_{}_ids.npy'.format(part_prefix, data_prefix)))
    with Holstep.Setup() as db:
        sql = 'SELECT Id, Text FROM {} ORDER BY Id'.format(table)
        texts = db.ex_many(sql)
        texts = [a[1] for a in texts if a[0] in ids]
    dump_data(PATH() + '{}_{}_loaded_texts.data'.format(part_prefix, data_prefix), texts)

def build_premise_to_conjecture_map(part_prefix):
    # Invert the (conjecture_id, premise_id) relationship rows into a
    # premise -> [conjecture ids] dictionary.
    relationships = np.load(PATH() + '{}_relationships.npy'.format(part_prefix))
    pcmap = dict()
    for cid, pid in relationships:
        if pid not in pcmap:
            pcmap[pid] = []
        pcmap[pid].append(cid)
    print(len(pcmap))
    dump_data(PATH() + '{}_relationships_dict.data'.format(part_prefix), pcmap)

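# Toy illustration of the inversion above (made-up ids): each row of
# relationships is (conjecture_id, premise_id), so the result maps a
# premise to every conjecture that uses it.
def _pcmap_example():
    rows = np.array([[10, 1], [11, 1], [12, 2]])
    pcmap = {}
    for cid, pid in rows:
        pcmap.setdefault(pid, []).append(cid)
    assert pcmap == {1: [10, 11], 2: [12]}
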
def build_ids(data_prefix, part_prefix, input_counter, output_name):
    objs = load_data(PATH() + '{}_{}_{}.data'.format(part_prefix, data_prefix, input_counter))
    # Stable id assignment: sort the keys, then map each key to its index.
    ids = sorted(objs.keys())
    idmap = {k: i for i, k in enumerate(ids)}
    dump_data(PATH() + '{}_{}_{}_ids.data'.format(part_prefix, data_prefix, output_name), ids)
    dump_data(PATH() + '{}_{}_{}_idmap.data'.format(part_prefix, data_prefix, output_name), idmap)

def savemodel(model, name, params, desc='', history=None, withnowstring=False):
    file = '{}_{}'.format(BaseModel.getfilename(name, params), desc)
    if withnowstring:
        file += '_{}'.format(BaseModel.nowstring())
    # Only a post-training call (history given) writes anything to disk;
    # without a history the function just derives the canonical filename,
    # which is how ConjectureTokenVAE.build uses it below.
    if history is not None:
        model.save_weights(MODELS() + file + '.h5')
        dump_data(MODELS() + file + '.history', history.history)
        dump_data(MODELS() + file + '.params', params)
    return file

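# Sketch of the two call modes of savemodel ('vae' and hist are placeholder
# names, not from the repo):
#   after training -- persist weights, history, and params:
#     path = BaseModel.savemodel(model, 'vae', params, history=hist, withnowstring=True)
#   at build time -- derive the canonical filename only; nothing is written
#   (this is what ConjectureTokenVAE.build below relies on):
#     path = BaseModel.savemodel(model, 'vae', params)
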
def dump(args, client):
    dump_data(client, args["measurements"], args["folder"],
              dryrun=args["dryrun"], chunk_size=args["chunksize"],
              start=args["start"], end=args["end"], retry=args["retry"],
              typecast=args["typecast"], cast=args["cast"],
              verbose=args["verbose"])

def build_premise_subtrees(part_prefix):
    tokens = load_data(PATH() + '{}_premise_tokens_ids.data'.format(part_prefix))
    token_map = load_data(PATH() + '{}_premise_tokens_idmap.data'.format(part_prefix))

    def save(t, n):
        # Zero-pad the chunk counter so filenames sort in write order.
        n = str(n).zfill(9)
        dump_data(PATH() + 'subtrees/{}_premise_subtrees_{}.data'.format(part_prefix, n), t)

    def worksubtree(pid, tree, ref, depth=0):
        # Post-order walk: intern the (up to two) children first so their
        # subtree ids are available when this node's record is emitted.
        first_child = 0
        second_child = 0
        l1, l2 = 0, 0
        if len(tree.children) >= 1:
            first_child, l1 = worksubtree(pid, tree.children[0], ref, depth=depth + 1)
        if len(tree.children) >= 2:
            second_child, l2 = worksubtree(pid, tree.children[1], ref, depth=depth + 1)
        layers = max(l1, l2) + 1
        # Hash-consing: structurally identical subtrees share one id,
        # keyed by their serialized text.
        text = tree.simpletext()
        if text not in ref.subtreemap:
            ref.sti += 1
            ref.subtreemap[text] = ref.sti
            ref.subtreemaplist.append(text)
        # Record: (premise id, token id, subtree id, child subtree ids, depth, layers).
        ref.subtreelist.append((pid, token_map[tree.value], ref.subtreemap[text],
                                first_child, second_child, depth, layers))
        ref.counter += 1
        # Flush the record buffer to disk every million rows.
        if ref.counter % 1000000 == 0:
            print(ref.counter)
            save(ref.subtreelist, ref.counter)
            ref.subtreelist = []
        return ref.subtreemap[text], layers

    ref = Ref()
    for pid, (v, vf, tree) in enumerate(iter_trees('premise', part_prefix)):
        tree.cleanreplace(v, vf, tokens)
        worksubtree(pid, tree, ref)
    save(ref.subtreelist, ref.counter)  # flush the final partial chunk
    print()
    print(len(ref.subtreemaplist))
    dump_data(PATH() + '{}_premise_subtrees_idmap.data'.format(part_prefix), ref.subtreemap)
    dump_data(PATH() + '{}_premise_subtrees_ids.data'.format(part_prefix), ref.subtreemaplist)

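# The core trick in worksubtree is hash-consing: structurally identical
# subtrees are detected via their serialized text and share one integer id.
# A stripped-down, self-contained illustration (_Node and _intern_subtree
# are hypothetical, not part of the repo):
class _Node:
    def __init__(self, value, children=()):
        self.value, self.children = value, list(children)

    def simpletext(self):
        return '({} {})'.format(self.value, ' '.join(c.simpletext() for c in self.children))

_seen = {}

def _intern_subtree(t):
    key = t.simpletext()
    if key not in _seen:
        _seen[key] = len(_seen)   # first occurrence gets a fresh id
    return _seen[key]             # later structural duplicates reuse it

assert _intern_subtree(_Node('+', [_Node('x'), _Node('x')])) == \
       _intern_subtree(_Node('+', [_Node('x'), _Node('x')]))
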
def build_premise_identifiers(prefix, premise_lower_bound):
    cids = set(np.load(PATH() + '{}_conjecture_ids.npy'.format(prefix)))
    with Holstep.Setup() as db:
        # Keep only steps that appear in at least premise_lower_bound
        # conjectures.  Note: ConjectureId is not aggregated here, so (in
        # SQLite) each group returns an arbitrary representative conjecture,
        # and the cids filter below checks only that one row per step.
        sql = 'SELECT ConjectureId, StepId FROM ConjectureStep '
        sql += 'GROUP BY StepId HAVING COUNT(StepId) >= {} '.format(premise_lower_bound)
        steps = db.ex_many(sql)
    steps = sorted(set(a[1] for a in steps if a[0] in cids))
    id_to_step = np.array(steps)
    np.save(PATH() + '{}_premise_id_to_step.npy'.format(prefix), id_to_step)
    print(id_to_step)
    print(len(id_to_step))
    step_to_id = {x: i for i, x in enumerate(steps)}
    dump_data(PATH() + '{}_premise_step_to_id.data'.format(prefix), step_to_id)
    print(step_to_id)
    print(len(step_to_id))

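# The two artifacts written above are mutual inverses over the kept steps:
# id_to_step[i] is a StepId and step_to_id maps it back to the dense index i.
def _premise_id_roundtrip_example():
    steps = [101, 205, 390]                       # toy StepIds
    id_to_step = np.array(steps)
    step_to_id = {s: i for i, s in enumerate(steps)}
    assert all(step_to_id[id_to_step[i]] == i for i in range(len(steps)))
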
def dump(args, client):
    dump_data(client, args["measurements"], args["folder"],
              dryrun=args["dryrun"], verbose=args["verbose"])

def build(cls, params):
    model, _ = ConjectureTokenVAE.build_model(params)
    # savemodel without a history only computes the canonical filename,
    # so the params are dumped here explicitly.
    path = BaseModel.savemodel(model, ConjectureTokenVAE._name(), params)
    dump_data(MODELS() + path + '.params', params)
    return cls(path)

def save(self):
    # Zero-pad the chunk counter so filenames sort in write order.
    n = str(self.counter).zfill(9)
    path = 'records/{}_{}_{}.data'.format(self.part_prefix, self.name, n)
    print(path)
    dump_data(PATH() + path, self.records)
    self.records = []

def save(t, n):
    # Nested helper from build_premise_subtrees (part_prefix comes from the
    # enclosing scope); zero-pads the chunk counter to 9 digits.
    n = str(n).zfill(9)
    dump_data(PATH() + 'subtrees/{}_premise_subtrees_{}.data'.format(part_prefix, n), t)

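# Why the zero-padding: it makes lexicographic filename order match numeric
# write order, so a plain sorted(glob(...)) replays the chunks correctly.
_names = [str(n).zfill(9) for n in (2, 10, 100)]
assert _names == sorted(_names)
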
def build_unique_tokens(data_prefix, part_prefix):
    # Count every distinct token across all trees for this split.
    unique_tokens = Counter()
    for v, vf, tree in iter_trees(data_prefix, part_prefix):
        unique_tokens.update(tree.unique_tokens())
    dump_data(PATH() + '{}_{}_unique_tokens_base.data'.format(part_prefix, data_prefix), unique_tokens)

def save(t, n):
    # Variant of the subtree saver: 6-digit padding, trees/ directory.
    n = str(n).zfill(6)
    dump_data(PATH() + 'trees/{}_{}_trees_{}.data'.format(part_prefix, data_prefix, n), t)

def build_id_map(prefix):
    arr = np.load(PATH() + '{}_ids.npy'.format(prefix))
    # Map each original id to its row index in the array.
    idmap = {k: i for i, k in enumerate(arr)}
    dump_data(PATH() + '{}_idmap.data'.format(prefix), idmap)

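# Quick check of the id-map contract: idmap inverts positional lookup, so
# row i of any array aligned with arr corresponds to original id k.
def _idmap_example():
    arr = np.array([5, 9, 12])                # toy ids
    idmap = {k: i for i, k in enumerate(arr)}
    assert idmap[9] == 1 and arr[idmap[9]] == 9
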