Esempio n. 1
0
def clean_unique_tokens(data_prefix, part_prefix):
    """Collapse variable / variable-function / numeric tokens into the
    placeholder tokens 'VAR', 'VARFUNC' and '_n', then persist the cleaned
    token-count mapping.

    Reads '{part}_{data}_unique_tokens_base.data' (a token -> count mapping),
    folds counts of per-tree variable tokens into aggregate buckets, deletes
    the individual tokens, adds an 'UNK' bucket, and writes
    '{part}_{data}_unique_tokens.data'.
    """
    tokens = load_data(PATH() + '{}_{}_unique_tokens_base.data'.format(part_prefix, data_prefix))
    print(len(tokens))
    varset = set()
    varfuncset = set()
    # Collect every variable / variable-function token across all trees.
    for v, vf, tree in iter_trees(data_prefix, part_prefix):
        varset.update(v)
        varfuncset.update(vf)
    print(len(varset))
    print(len(varfuncset))
    # A set (not a list): a token may qualify under more than one category,
    # and deleting the same key twice would raise KeyError below.
    keys_to_delete = set()

    # BUG FIX: the original iterated the *string* 'varfuncset' (its characters)
    # instead of the varfuncset set, so VARFUNC counts were wrong.
    for k, kset in [('VAR', varset), ('VARFUNC', varfuncset)]:
        tokens[k] = 0
        for v in kset:
            if v in tokens:
                tokens[k] += tokens[v]
                keys_to_delete.add(v)

    # Fold tokens that look like '_<digits>' into the numeric bucket '_n'.
    # ('_n' itself is safe: 'n'.isdigit() is False.)
    tokens['_n'] = 0
    for k in tokens:
        if k[0] == '_' and k[1:].isdigit():
            tokens['_n'] += tokens[k]
            keys_to_delete.add(k)

    # Reserve an unknown-token bucket for downstream lookup misses.
    tokens['UNK'] = 0

    print(len(keys_to_delete))
    for k in keys_to_delete:
        del tokens[k]

    print(len(tokens))
    print()
    print()
    dump_data(PATH() + '{}_{}_unique_tokens.data'.format(part_prefix, data_prefix), tokens)
Esempio n. 2
0
def load_texts(table, data_prefix, part_prefix):
    """Load the Text column for every row of *table* whose Id appears in the
    precomputed '{part}_{data}_ids.npy' array, and persist the texts.

    PERF: the original tested `a[0] in ids` against a numpy array, an O(n)
    scan per row; a one-time set gives O(1) membership.
    """
    ids = np.load(PATH() + '{}_{}_ids.npy'.format(part_prefix, data_prefix))
    id_set = set(ids.tolist())  # build once, O(1) lookups below
    with Holstep.Setup() as db:
        # NOTE(review): table name is interpolated into the SQL — assumed to be
        # caller-controlled, never user input; confirm if exposed externally.
        sql = 'SELECT Id, Text FROM {} ORDER BY Id'.format(table)
        rows = db.ex_many(sql)
        texts = [text for rid, text in rows if rid in id_set]
    dump_data(PATH() + '{}_{}_loaded_texts.data'.format(part_prefix, data_prefix), texts)
Esempio n. 3
0
def build_premise_to_conjecture_map(part_prefix):
    """Invert the (conjecture_id, premise_id) relationship pairs into a
    premise_id -> [conjecture_id, ...] mapping and persist it.

    Loads '{part}_relationships.npy' (rows of (cid, pid)) and writes
    '{part}_relationships_dict.data'.
    """
    relationships = np.load(PATH() + '{}_relationships.npy'.format(part_prefix))
    # Plain dict (not defaultdict) so the pickled object type is unchanged.
    pcmap = dict()
    for cid, pid in relationships:
        # setdefault replaces the manual "if pid not in pcmap" dance.
        pcmap.setdefault(pid, []).append(cid)
    print(len(pcmap))
    dump_data(PATH() + '{}_relationships_dict.data'.format(part_prefix), pcmap)
Esempio n. 4
0
def build_ids(data_prefix, part_prefix, input_counter, output_name):
    """Derive a stable id ordering from the keys of a persisted mapping.

    Loads '{part}_{data}_{input_counter}.data' (a dict-like object), sorts its
    keys, and writes both the ordered id list ('..._ids.data') and the inverse
    key -> index mapping ('..._idmap.data').
    """
    objs = load_data(PATH() + '{}_{}_{}.data'.format(part_prefix, data_prefix, input_counter))

    # sorted(objs) iterates the keys directly — no intermediate list + .sort().
    ids = sorted(objs)
    idmap = {k: i for i, k in enumerate(ids)}

    dump_data(PATH() + '{}_{}_{}_ids.data'.format(part_prefix, data_prefix, output_name), ids)
    dump_data(PATH() + '{}_{}_{}_idmap.data'.format(part_prefix, data_prefix, output_name), idmap)
Esempio n. 5
0
 def savemodel(model, name, params, desc='', history=None, withnowstring=False):
     """Build a filename for *model* and, when a training *history* is given,
     persist its weights, history and params under MODELS(); return the name.

     NOTE(review): weights/params are only written when history is not None —
     presumably "save only after training"; confirm this gating is intended.
     """
     fname = '{}_{}'.format(BaseModel.getfilename(name, params), desc)
     if withnowstring:
         fname = fname + '_{}'.format(BaseModel.nowstring())
     if history is not None:
         base = MODELS() + fname
         model.save_weights(base + '.h5')
         dump_data(base + '.history', history.history)
         dump_data(base + '.params', params)
     return fname
Esempio n. 6
0
def dump(args, client):
    """Forward the parsed CLI *args* mapping to dump_data for *client*."""
    options = dict(
        dryrun=args["dryrun"],
        chunk_size=args["chunksize"],
        start=args["start"],
        end=args["end"],
        retry=args["retry"],
        typecast=args["typecast"],
        cast=args["cast"],
        verbose=args["verbose"],
    )
    dump_data(client, args["measurements"], args["folder"], **options)
Esempio n. 7
0
def build_premise_subtrees(part_prefix):
    """Enumerate every subtree of every premise tree, assign each distinct
    subtree (by its simpletext) a numeric id, and stream the flattened
    (pid, token_id, subtree_id, first_child, second_child, depth, layers)
    records to chunked files under 'subtrees/', plus the id maps at the end.
    """
    # token -> numeric-id mapping produced by an earlier pipeline stage.
    tokens = load_data(PATH() + '{}_premise_tokens_ids.data'.format(part_prefix))
    token_map = load_data(PATH() + '{}_premise_tokens_idmap.data'.format(part_prefix))
    
    def save(t, n):
        # Zero-pad the record counter to 9 digits so chunk files sort lexicographically.
        n = ("000000000" + str(n))[-9:]
        dump_data(PATH() + 'subtrees/{}_premise_subtrees_{}.data'.format(part_prefix, n), t)
    
    def worksubtree(pid, tree, ref, depth=0):
        # Post-order recursion: visit (up to two) children first so their
        # subtree ids exist before the parent record referencing them is built.
        # Returns (subtree_id, layers) where layers is the subtree height.
        # 0 is the sentinel for "no child" — assumes real subtree ids start > 0
        # (depends on Ref's initial sti value; TODO confirm).
        first_child = 0
        second_child = 0
        l1, l2 = 0, 0
        if len(tree.children) >= 1:
            first_child, l1 = worksubtree(pid, tree.children[0], ref, depth=depth+1)
        if len(tree.children) >= 2:
            second_child, l2 = worksubtree(pid, tree.children[1], ref, depth=depth+1)
        layers = max(l1, l2) + 1
        
        # Deduplicate subtrees by their textual form; assign the next id on
        # first sight.  ref carries the mutable traversal state (sti,
        # subtreemap, subtreemaplist, subtreelist, counter) — presumably Ref
        # is a plain attribute bag initialized elsewhere; confirm.
        text = tree.simpletext()
        if text not in ref.subtreemap:
            ref.sti += 1
            ref.subtreemap[text] = ref.sti
            ref.subtreemaplist.append(text)

        ref.subtreelist.append((pid, token_map[tree.value], ref.subtreemap[text], 
                                first_child, second_child, depth, layers))
        ref.counter += 1
        # Flush to disk every 1M records to bound memory; list is reset after.
        if ref.counter % 1000000 == 0:
            print(ref.counter)
            save(ref.subtreelist, ref.counter)
            ref.subtreelist = []
        
        return ref.subtreemap[text], layers

    ref = Ref()
    for pid, (v, vf, tree) in enumerate(iter_trees('premise', part_prefix)):
        # Normalize variable/varfunc tokens in place before enumeration.
        tree.cleanreplace(v, vf, tokens)
        worksubtree(pid, tree, ref)
    # Flush the final partial chunk (may be empty if counter hit an exact 1M).
    save(ref.subtreelist, ref.counter)
    
    print()
    print(len(ref.subtreemaplist))
    
    dump_data(PATH() + '{}_premise_subtrees_idmap.data'.format(part_prefix), ref.subtreemap)
    dump_data(PATH() + '{}_premise_subtrees_ids.data'.format(part_prefix), ref.subtreemaplist)
def build_premise_identifiers(prefix, premise_lower_bound):
    """Select steps used by at least *premise_lower_bound* conjectures (from
    the known conjecture set) and persist both directions of the id mapping:
    '{prefix}_premise_id_to_step.npy' and '{prefix}_premise_step_to_id.data'.

    PERF: membership tests go against a set built once, not an O(n) scan of
    the numpy cids array per row.
    """
    cids = np.load(PATH() + '{}_conjecture_ids.npy'.format(prefix))
    cid_set = set(cids.tolist())  # O(1) lookups in the filter below
    with Holstep.Setup() as db:
        # NOTE(review): premise_lower_bound is formatted into the SQL —
        # assumed to be an internally supplied integer, not user input.
        sql = 'SELECT ConjectureId, StepId FROM ConjectureStep '
        sql += 'GROUP BY StepId HAVING COUNT(StepId) >= {} '.format(
            premise_lower_bound)
        steps = db.ex_many(sql)
        # Keep only steps tied to a known conjecture; dedupe and order them.
        steps = sorted({a[1] for a in steps if a[0] in cid_set})

        id_to_step = np.array(steps)
        np.save(PATH() + '{}_premise_id_to_step.npy'.format(prefix),
                id_to_step)
        print(id_to_step)
        print(len(id_to_step))

        step_to_id = {x: i for i, x in enumerate(steps)}
        dump_data(PATH() + '{}_premise_step_to_id.data'.format(prefix),
                  step_to_id)
        print(step_to_id)
        print(len(step_to_id))
Esempio n. 9
0
def dump(args, client):
    """Forward the parsed CLI *args* mapping to dump_data for *client*."""
    measurements = args["measurements"]
    folder = args["folder"]
    dump_data(
        client,
        measurements,
        folder,
        dryrun=args["dryrun"],
        verbose=args["verbose"],
    )
Esempio n. 10
0
 def build(cls, params):
     """Build a fresh ConjectureTokenVAE model, persist it with its params,
     and return a wrapper instance pointing at the saved path.
     """
     built = ConjectureTokenVAE.build_model(params)
     model = built[0]
     path = BaseModel.savemodel(model, ConjectureTokenVAE._name(), params)
     dump_data(MODELS() + path + '.params', params)
     return cls(path)
Esempio n. 11
0
 def save(self):
     """Flush the accumulated records to a numbered chunk file and reset.

     The counter is zero-padded (and truncated) to 9 digits so chunk files
     sort lexicographically.
     """
     padded = ("000000000" + str(self.counter))[-9:]
     relpath = 'records/{}_{}_{}.data'.format(self.part_prefix, self.name, padded)
     print(relpath)
     dump_data(PATH() + relpath, self.records)
     self.records = []
Esempio n. 12
0
 def save(t, n):
     """Dump *t* to a subtree chunk file numbered by *n* (zero-padded to 9)."""
     padded = ("000000000" + str(n))[-9:]
     target = PATH() + 'subtrees/{}_premise_subtrees_{}.data'.format(part_prefix, padded)
     dump_data(target, t)
Esempio n. 13
0
def build_unique_tokens(data_prefix, part_prefix):
    """Count the unique tokens of every tree and persist the Counter to
    '{part}_{data}_unique_tokens_base.data'.
    """
    counts = Counter()
    for v, vf, tree in iter_trees(data_prefix, part_prefix):
        counts.update(tree.unique_tokens())
    target = PATH() + '{}_{}_unique_tokens_base.data'.format(part_prefix, data_prefix)
    dump_data(target, counts)
Esempio n. 14
0
 def save(t, n):
     """Dump *t* to a tree chunk file numbered by *n* (zero-padded to 6)."""
     padded = ("000000" + str(n))[-6:]
     target = PATH() + 'trees/{}_{}_trees_{}.data'.format(part_prefix, data_prefix, padded)
     dump_data(target, t)
Esempio n. 15
0
def build_id_map(prefix):
    """Map each id in '{prefix}_ids.npy' to its array position and persist
    the mapping as '{prefix}_idmap.data'.
    """
    ids = np.load(PATH() + '{}_ids.npy'.format(prefix))
    idmap = {}
    for position, key in enumerate(ids):
        idmap[key] = position
    dump_data(PATH() + '{}_idmap.data'.format(prefix), idmap)