Example #1
    print("{} unique sqls".format(len(lines)))
    print("{} unique sqls have T2".format(len(T2lines)))
    print("{} unique sqls have T3".format(len(T3lines)))
    print("{} unique sqls have T4".format(len(T4lines)))
    print("{} unique sqls have T5".format(len(T5lines)))
    print("{} unique sqls have GROUP BY".format(len(groupbylines)))
    print("{} unique sqls have nesting".format(len(nestedlines)))
    print("{} unique sqls have JOIN".format(len(joinlines)))
    print("{} unique sqls have JOIN without aliases".format(
        len(joinwithouttlines)))
    print("{} unique sqls have JOIN without GROUP BY".format(
        len(joinwithoutgroupby)))
    print("{} unique sqls have multiple aliases for same table".format(
        len(rejoinlines)))
    print("{} unique sqls have argmax".format(len(argmaxlines)))
    print("{} unique sqls have order by without argmax".format(
        len(nonargmaxlines)))
    print("{} unique sqls have COUNT(".format(len(countlines)))
    print("{} unique sqls have COUNT(DISTINCT ...)".format(
        len(countdistinctlines)))
    print("{} unique sqls with COUNT have no COUNT(*) or COUNT(DISTINCT(...))".
          format(len(nonstarcountlines)))

    # for line in joinwithoutgroupby:
    #     print(line)
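
    # Hedged sketch (added, not from the original file): one plausible way the
    # subsets counted above could have been built from `lines`; the exact
    # predicates used upstream are assumptions.
    # groupbylines = [l for l in lines if "GROUP BY" in l]
    # nestedlines = [l for l in lines if l.count("SELECT") > 1]
    # joinlines = [l for l in lines if "JOIN" in l]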


if __name__ == '__main__':
    q.argprun(load_tables)
    # q.argprun(check_gold_sql)
Example #2
    threshold = 2
    stt.msg("{} unique rels at least {} time(s) in train data".format(
        len([xe for xe in allrelwcounts if xe[1] > threshold]), threshold))
    rarerels = set([xe[0] for xe in allrelwcounts if xe[1] <= threshold])
    testrarecount = 0
    for rel in bert_rel_test:
        if rel in rarerels:
            testrarecount += 1
    stt.msg("{}/{} test examples affected by rare rel".format(
        testrarecount, len(bert_rel_test)
    ))

    tt.tick("reload")
    reloaded = np.load(open(outp + ".npz", "rb"), allow_pickle=True)  # allow_pickle needed on NumPy >= 1.16.3
    _relD = reloaded["relD"].item()  # unpack the pickled dict from a 0-d object array
    _tokmat = reloaded["tokmat"]
    print(reloaded["devstart"])
    tt.tock("reloaded")


def run(lr=0):
    load_data()



if __name__ == '__main__':
    # q.argprun(run)
    # q.argprun(get_all_types)
    q.argprun(get_names_for_entities)
Example #3
    # print(traverser.query)
    # print(res)
    return qid, vnts, traverser.query_fn


def get_vnt_for_datasets(p="../../../../datasets/webqsp/webqsp.",
                         files=("train", "test", "core.test", "core.train")):
    tt = q.ticktock("vnt builder")
    for file in files:
        tt.tick("doing {}".format(file))
        filep = p + file + ".butd"
        vnts = {}
        with open(filep) as f:
            i = 0
            for line in f:
                qid, vnt, _ = get_vnt_for_butd(line)
                vnts[qid] = vnt
                i += 1
                if i % 10 == 0:
                    tt.msg("{}".format(i))
        with open(p + file + ".butd.vnt", "wb") as vntf:  # pickle needs a binary file handle
            pickle.dump(vnts, vntf)
        tt.tock("done {}".format(file))


if __name__ == "__main__":
    _, _, query = get_vnt_for_butd(
        "WebQTrn-1102	what is the name of <E0> 's son	<E0> :people.person.children <BRANCH> :people.person.gender m.05zppz <JOIN> <RETURN>	(<E0>|walt disney|m.081nh)"
    )
    print(query())
    q.argprun(get_vnt_for_datasets)
Example #4
    #         optimizer.step()
    #
    #         if (i+1) % 100 == 0:
    #             btt.tock("100 batches done")
    #             print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
    #                    %(epoch+1, num_epochs, i+1, len(train_dataset)//batch_size, loss.data[0]))
    #             btt.tick()
    #         #tt.tock("batch done")
    #     tt.tock("epoch {} done".format(epoch))
    # Test the Model
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = q.var(images.view(-1, sequence_length,
                                   input_size)).cuda(crit=gpu).v
        labels = q.var(labels).cuda(crit=gpu).v
        outputs = rnn(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels.data).sum()

    print('Test Accuracy of the model on the 10000 test images: %d %%' %
          (100 * correct / total))

    # Save the Model
    torch.save(rnn.state_dict(), 'rnn.pkl')
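
    # Hedged sketch (added): the same evaluation against the current PyTorch
    # API, without the q.var wrapper; the surrounding names are assumed.
    # with torch.no_grad():
    #     correct, total = 0, 0
    #     for images, labels in test_loader:
    #         images = images.view(-1, sequence_length, input_size).cuda(gpu)
    #         outputs = rnn(images)
    #         predicted = outputs.argmax(dim=1).cpu()
    #         total += labels.size(0)
    #         correct += (predicted == labels).sum().item()
    #     print("Test accuracy: {:.2f}%".format(100.0 * correct / total))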


if __name__ == "__main__":
    q.argprun(main)
Example #5
        [2, 3, 4, 5, 6, 7, 8, 0],
        [9, 10, 0, 0, 0, 0, 0, 0]
    ]
    c = torch.tensor(c)

    bert = q.bert.TransformerBERT.load_from_dir("../../../data/bert/bert-base/")
    m = AdaptedBERTEncoderPairSlotPtr(bert, oldvocab=vocab)

    print("made model")

    y = m(a, b, c)
    print(y)
    pass

    # this should reset bert transformer's params only, but not bert's embeddings
    m.bert.reset_parameters()
    # this deletes some layers
    del m.bert.encoder.layers[6:]
    # m.bert.emb.word_embeddings.weight[0]

    y2 = m(a, b, c)
    pass
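
    # Hedged check (added): one way to verify the comment above, i.e. that
    # reset_parameters() leaves the word embeddings untouched; attribute
    # names are taken from the lines above.
    # emb_before = m.bert.emb.word_embeddings.weight.clone()
    # m.bert.reset_parameters()
    # assert torch.allclose(emb_before, m.bert.emb.word_embeddings.weight)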

# endregion


if __name__ == '__main__':
    # q.argprun(test_adapted_bert_encoder_pair)
    q.argprun(test_adapted_bert_encoder_pair_slotptr)
    # q.argprun(test_adapted_bert_compare_slotptr)
Example #6
            "alignments": alignments,
            "align_entropies": align_entropies
        })

    ret = {
        "sentence": sentence,
        "gold": gold,
        "candidates": cands
    }
    return ret


def run_hpo(numsplits=6, cuda=False, gpu=0):
    ranges = {"encdim": [256, 400],
              "dropout": [.25, .4],
              "smoothing": [.1],
              "epochs": [60, 75],
              "beta": [0.5, 0.3, 0.7],
              }
    results = q.run_hpo_cv(run, ranges, numcvfolds=numsplits, path=__file__+".hpo", cuda=cuda, gpu=gpu)
    print(results["best"])


if __name__ == '__main__':
    # try_basic_query_tokenizer()
    # try_build_grammar()
    # try_dataset()
    # try_tree_permutations()
    # q.argprun(run)
    q.argprun(run_hpo)
    # q.argprun(run_rerank)
Example #7
        if uris_gold[i] not in at150ents:
            at150.add(i)
            # debug_print(i)
            # print(_questions_test[i])

    print("{:.4} % corrected (end before start)".format(
        100 * len(invalid_startends) / len(questions_test)))
    print("{:.4} % accuracy after postprocessing".format(
        100 * (1 - (len(errors) / len(questions_test)))))
    print("{:.4} % overlap after postprocessing".format(
        100 * (1 - (len(nooverlap) / len(questions_test)))))
    print("{:.4} % R@1".format(100 * (1 - (len(at1) / len(questions_test)))))
    print("{:.4} % R@5".format(100 * (1 - (len(at5) / len(questions_test)))))
    print("{:.4} % R@20".format(100 * (1 - (len(at20) / len(questions_test)))))
    print("{:.4} % R@50".format(100 * (1 - (len(at50) / len(questions_test)))))
    print("{:.4} % R@150".format(100 * (1 -
                                        (len(at150) / len(questions_test)))))
    # for k in range(50):
    #     debug_print(list(someoverlap)[k])
    #     print(_questions_test[list(someoverlap)[k]])

    with open(os.path.join(p, "entcands.{}.pkl".format(which)), "wb") as f:
        pkl.dump(allcands, f)
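
# Hedged sketch (added): the R@k lines above print 100 * (1 - misses / total);
# `recall_at_k` is a hypothetical helper and `ranks` a list of gold-entity
# ranks per question (None when the gold entity was not retrieved).
def recall_at_k(ranks, k):
    hits = sum(1 for r in ranks if r is not None and r < k)
    return 100.0 * hits / len(ranks)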


if __name__ == '__main__':
    # build_entity_bloom()
    # q.argprun(get_dsF1_wordlevel)
    q.argprun(run_borders)
    # test_index()
    # build_entity_index(testsearch=True)
Example #8
                outpushpop.append(-numpops)
        ret = torch.tensor(outpushpop)
        return ret


class TreeRNNDecoderCellNumChild(NumChildPushPopper, TreeRNNDecoderCell):
    pass



def test_gated_tree_lstm_cell(lr=0.):
    m = GatedTreeLSTMCell(4, 3)

    x = torch.randn(2, 6, 4)
    prev_pushpop = torch.tensor(
        [
            [1, 0, 1, 0, -1, -1],  # output of the last step will be "masked"
            [1, 1, 1, 0, -3, 0]
        ]
    )

    for t in range(6):
        y = m(x[:, t], prev_pushpop[:, t])

    print("done")
# endregion

if __name__ == '__main__':
    q.argprun(test_gated_tree_lstm_cell)

Example #9

        "lr": [0.0001],
        "enclrmul": [0.1],
        "warmup": [1],
        "epochs": [75],
        "numheads": [16],
        "numlayers": [3],
        "dropout": [.1],
        "hdim": [960],
        "seed": [12345678, 65748390, 98387670, 23655798, 66453829],     # TODO: add more later
    }
    p = __file__ + f".{domain}"
    def check_config(x):
        effectiveenclr = x["enclrmul"] * x["lr"]
        if effectiveenclr < 0.000005:  # reject configs where the encoder lr is effectively zero
            return False
        dimperhead = x["hdim"] / x["numheads"]
        if dimperhead < 20 or dimperhead > 100:  # keep the per-head dimension in a sensible range
            return False
        return True

    q.run_experiments(run, ranges, path_prefix=p, check_config=check_config,
                      domain=domain, gpu=gpu, patience=patience, cosinelr=cosinelr)




if __name__ == '__main__':
    ret = q.argprun(run)
    # print(ret)
    # q.argprun(run_experiments)
    # q.argprun(run_experiments_seed)
Example #10
                                                  score_norm=scorenorm,
                                                  renormalize=False,
                                                  **kw)


def test_lstm_phrase_attention(lr=0):
    print("testing lstm phrase attention")
    m = LSTMPhraseAttention(4)
    ctx = torch.randn(2, 5, 4)
    qrys = torch.randn(2, 6, 4)
    ctx_mask = torch.tensor([[1, 1, 1, 1, 1], [1, 1, 1, 0, 0]])
    pushpop = [[1, 0, 1, 0, -1, -1], [1, 1, 1, 1, -4, 0]]
    pushpop = list(zip(*pushpop))

    for i in range(qrys.size(1)):
        alphas, summary, scores = m(qrys[:, i],
                                    ctx,
                                    ctx_mask=ctx_mask,
                                    pushpop=pushpop[i])

    overlap = m.get_sibling_overlap()
    pass


# endregion

if __name__ == '__main__':
    # q.argprun(test_custom_f)
    # q.argprun(test_phrase_attention)
    q.argprun(test_lstm_phrase_attention)
Example #11
                                         cooldown=1,
                                         warmup=warmup,
                                         threshold=0.,
                                         verbose=True,
                                         eps=1e-9)
    on_after_valid = [lambda: lrplateau.step(vlosses[1].get_epoch_error())]
    _devloop = partial(devloop, on_end=on_after_valid)
    stoptrain = [lambda: all([pg["lr"] <= 1e-7 for pg in optim.param_groups])]

    tt.tick("training")
    q.run_training(trainloop,
                   _devloop,
                   max_epochs=epochs,
                   check_stop=stoptrain)
    tt.tock("done training")

    tt.tick("testing")
    testres = testloop()
    print(testres)
    settings["testres"] = testres
    tt.tock("tested")

    devres = devloop()
    print(devres, vlosses[0].get_epoch_error())

    return vlosses[1].get_epoch_error()
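
# Hedged standalone sketch (added): the "reduce lr on plateau, stop once the
# lr bottoms out" pattern used above, with stock PyTorch; names are assumptions.
import torch

def make_lr_controller(optim, patience=1):
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optim, mode="min", patience=patience, cooldown=1, threshold=0., eps=1e-9)
    def should_stop():
        return all(pg["lr"] <= 1e-7 for pg in optim.param_groups)
    return sched, should_stop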


if __name__ == '__main__':
    q.argprun(run_seq2seq)
Example #12
        testpreds = q.eval_loop(m, evalloader, device=device)
        borderpreds = testpreds[0].cpu().detach().numpy()
        relpreds = testpreds[1].cpu().detach().numpy()
        np.save(os.path.join(savedir, "borderpreds.test.npy"), borderpreds)
        np.save(os.path.join(savedir, "relpreds.test.npy"), relpreds)
        # DEV data
        testpreds = q.eval_loop(m, evalloader_dev, device=device)
        borderpreds = testpreds[0].cpu().detach().numpy()
        relpreds = testpreds[1].cpu().detach().numpy()
        np.save(os.path.join(savedir, "borderpreds.dev.npy"), borderpreds)
        np.save(os.path.join(savedir, "relpreds.dev.npy"), relpreds)
        # save bert-tokenized questions
        # tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        # with open(os.path.join(savedir, "testquestions.txt"), "w") as f:
        #     for batch in evalloader:
        #         ques, io = batch
        #         ques = ques.numpy()
        #         for question in ques:
        #             qstr = " ".join([x for x in tokenizer.convert_ids_to_tokens(question) if x != "[PAD]"])
        #             f.write(qstr + "\n")

        tt.tock("done")
    # endregion


if __name__ == '__main__':
    # test_io_span_detector()
    # q.argprun(run_span_borders)
    # test_spanf1_borders()
    q.argprun(run_both)
    # q.argprun(run_relations)
Example #13
    m = LSTMPhraseAttention(4)
    ctx = torch.randn(2, 5, 4)
    qrys = torch.randn(2, 6, 4)
    ctx_mask = torch.tensor([[1, 1, 1, 1, 1], [1, 1, 1, 0, 0]])
    pushpop = [
        [1, 0, 1, 0, -1, -1],  # output of last step will be "masked"
        [1, 1, 1, 1, -4, 0]
    ]  # output of last two steps will be "masked"
    pushpop = torch.tensor(pushpop)
    # pushpop = list(zip(*pushpop))

    for i in range(qrys.size(1)):
        alphas, summary, scores = m(qrys[:, i],
                                    ctx,
                                    ctx_mask=ctx_mask,
                                    prev_pushpop=pushpop[:, i])

    overlap = m.get_sibling_overlap()
    pass


# endregion

if __name__ == '__main__':
    # q.argprun(test_custom_f)
    # q.argprun(test_phrase_attention)
    # q.argprun(test_phrase_attention_teacher)
    # q.argprun(test_lstm_phrase_attention)
    # q.argprun(test_pooled_lstm_summ_comp)
    q.argprun(test_rel_attention)

Example #14
    if epochs >= 0:
        ranges["epochs"] = [epochs]
    p = __file__ + f".{domain}"

    def check_config(x):
        effectiveenclr = x["enclrmul"] * x["lr"]
        # if effectiveenclr < 0.000005:
        #     return False
        dimperhead = x["hdim"] / x["numheads"]
        if dimperhead < 20 or dimperhead > 100:
            return False
        return True

    q.run_experiments(run,
                      ranges,
                      path_prefix=p,
                      check_config=check_config,
                      domain=domain,
                      gpu=gpu,
                      trainonvalid=trainonvalid)


if __name__ == '__main__':
    # test_multi_celoss()
    # test_tensors_to_tree()
    # try_tree_insertion_model_decode()
    # try_tree_insertion_model_tagger()
    # try_real_tree_insertion_model_tagger()
    # q.argprun(run)
    q.argprun(run_experiments_seed)

Example #15
        optim=optim,
        device=device,
        losses=[q.LossWrapper(ce),
                q.LossWrapper(elemacc),
                q.LossWrapper(acc)],
        print_every_batch=False,
        _train_batch=batchloop)
    validloop = partial(q.test_epoch,
                        model=test_encdec,
                        dataloader=vloader,
                        device=device,
                        losses=[q.LossWrapper(treeacc)],
                        print_every_batch=False)

    tt.tick("training")
    q.run_training(trainloop, validloop, max_epochs=epochs)
    tt.tock("trained")

    tt.tick("testing")
    test_results = validloop(model=test_encdec, dataloader=xloader)
    print("Test results (freerunning): {}".format(test_results))
    test_results = validloop(model=train_encdec, dataloader=xloader)
    print("Test results (TF): {}".format(test_results))
    tt.tock("tested")
    # endregion
    tt.msg("done")


if __name__ == '__main__':
    q.argprun(run_normal)
Example #16
        if k in settings:
            if isinstance(settings[k], str) and settings[k] != "default":
                ranges[k] = [settings[k]]
            elif isinstance(settings[k], (int, float)) and settings[k] >= 0:
                ranges[k] = [settings[k]]
            else:
                pass
                # raise Exception(f"something wrong with setting '{k}'")
            del settings[k]

    def checkconfig(spec):
        if spec["dataset"].startswith("cfq"):
            if spec["epochs"] != 25 or spec["batsize"] != 128:
                return False
        elif spec["dataset"].startswith("scan"):
            if spec["epochs"] != 40 or spec["batsize"] != 256:
                return False
        return True

    print(__file__)
    p = __file__ + f".baseline.{dataset}"
    q.run_experiments_random(run,
                             ranges,
                             path_prefix=p,  # assumption: `p` computed above was meant to be used here
                             check_config=checkconfig,
                             **settings)


if __name__ == '__main__':
    q.argprun(run_experiment)
Example #17
        self.post_core_update()

        alphas, summaries, scores = self.att(core_out, ctx, ctx_mask=ctx_mask, values=ctx)  # do attention
        out_vec = self.merge(core_out, summaries, core_inp)
        out_vec = self.dropout(out_vec)
        self._outvec_tm1 = out_vec      # store outvec (this is how Luong, 2015 does it)

        if self.out is None:
            ret_normal = out_vec
        else:
            if isinstance(self.out, PointerGeneratorOut):
                _out_vec = self.out(out_vec, scores=scores)
            else:
                _out_vec = self.out(out_vec)
            ret_normal = _out_vec

        l = locals()
        ret = tuple([l[k] for k in sum(self.returns, [])])  # gather the locals named in self.returns (flattened)
        return ret[0] if len(ret) == 1 else ret
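
# Hedged standalone sketch (added): the Luong (2015)-style input feeding that
# self._outvec_tm1 implements above, in isolation; names are illustrative.
import torch

class InputFeedingCell(torch.nn.Module):
    def __init__(self, indim, hdim):
        super().__init__()
        self.cell = torch.nn.LSTMCell(indim + hdim, hdim)
        self.outvec_tm1 = None  # previous step's (post-attention) output vector

    def forward(self, x, state=None):
        if self.outvec_tm1 is None:
            self.outvec_tm1 = x.new_zeros(x.size(0), self.cell.hidden_size)
        h, c = self.cell(torch.cat([x, self.outvec_tm1], -1), state)
        self.outvec_tm1 = h  # in the full cell this would be the merged out_vec
        return h, (h, c)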


def test_training_stackcell(lr=0.001):
    pass



if __name__ == '__main__':
    q.argprun(test_training_stackcell)
Example #18
    for bstate in x.bstates._list:
        cand = query_vocab.tostr(bstate.followed_actions[0],
                                 return_tokens=True)
        alignments = []
        align_entropies = []

        for i in range(len(cand)):
            att = bstate.stored_attentions[0, i]
            entropy = -(att.clamp_min(1e-6).log() * att).sum()
            _, amax = att.max(-1)
            alignments.append(amax.cpu().item())
            align_entropies.append(entropy.cpu().item())

        cands.append({
            "tokens": cand,
            "alignments": alignments,
            "align_entropies": align_entropies
        })

    ret = {"sentence": sentence, "gold": gold, "candidates": cands}
    return ret
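
# Hedged sanity check (added) for the attention-entropy formula used above:
# a uniform distribution over n slots should give entropy log(n).
import math
import torch

_att = torch.full((8,), 1. / 8)
_ent = -(_att.clamp_min(1e-6).log() * _att).sum()
assert abs(_ent.item() - math.log(8)) < 1e-4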


if __name__ == '__main__':
    # try_basic_query_tokenizer()
    # try_build_grammar()
    # try_dataset()
    q.argprun(run)
    # q.argprun(run_rerank)
    # try_tree_gru_encoder()
Example #19
                                             question_sm=question_sm,
                                             query_sm=query_sm)

    vnt_mat, vnt_mat_shape = load_vnt_mats(qids=qids, p=qp, tgtdict=tgt_emb.D)
    print(vnt_mat.nbytes)
    # print(np.sum(vnt_mat == 1), np.sum(vnt_mat == 0))

    tt.tock("loaded everything")
    # vnt_mat = vnt_mat.todense()
    # print(vnt_mat.shape)
    # vnt_mat = vnt_mat.reshape(vnt_mat_shape)
    print(vnt_mat.shape)
    assert (len(vnt_mat) == len(question_sm.matrix))
    print("vnt mat has same length as question mat")

    # check real next token in vnt
    tt.tick("checking loaded vnts")
    for i in range(query_sm.matrix.shape[0]):
        for j in range(query_sm.matrix.shape[1]):
            if query_sm.matrix[i, j] != query_sm.D["<MASK>"]:
                next_symbol = query_sm.matrix[i, j]
                assert (vnt_mat[i, j, next_symbol] == 1)

    tt.tock("checked loaded vnts")
    return (question_sm, query_sm, vnt_mat, tx_sep, qids), (src_emb, tgt_emb,
                                                            tgt_lin)


if __name__ == "__main__":
    q.argprun(load_all)