Example #1
    def train_and_eval(self):
        print("Training the LSTM TuckER on {}...".format(args.dataset))
        self.entity_idxs = {d.entities[i]: i for i in range(len(d.entities))}
        self.relation_idxs = {
            d.relations[i]: i
            for i in range(len(d.relations))
        }

        train_data_idxs = self.get_data_idxs(d.train_data)
        print("Number of training data points: %d" % len(train_data_idxs))

        ########
        entities_ids, self.Evocab = self.strings_to_ids(vocab=self.Evocab,
                                                        data=d.entities)
        relation_ids, self.Rvocab = self.strings_to_ids(vocab=self.Rvocab,
                                                        data=d.relations)
        print("entities_ids len=%d" % len(entities_ids))
        print("relation_ids len=%d" % len(relation_ids))
        print("read vocab ready.")
        d.Etextdata = d.get_index(
            entities_ids, self.maxlength)  # list of entity token ids, padded to maxlength
        self.Etextdata = np.array(d.Etextdata)
        d.Rtextdata = d.get_index(relation_ids, self.maxlength)
        self.Rtextdata = np.array(d.Rtextdata)
        print("text data ready")
        cfg = config(dict(read_json(args.config)))
        es_idx = torch.LongTensor(self.Etextdata)
        if self.cuda:
            es_idx = es_idx.cuda()
        model = LSTMTuckER(d=d, es_idx=es_idx, ent_vec_dim=self.ent_vec_dim,
                           rel_vec_dim=self.rel_vec_dim, cfg=cfg,
                           Evocab=len(self.Evocab), Rvocab=len(self.Rvocab),
                           n_ctx=self.maxlength, **self.kwargs)  # n_ctx = 52 as computed in COMET
        print("model ready")

        ########
        if self.cuda:
            model.cuda()
        #model.init()
        opt = torch.optim.Adam(model.parameters(), lr=self.learning_rate)
        if self.decay_rate:
            scheduler = ExponentialLR(opt, self.decay_rate)

        er_vocab = self.get_er_vocab(train_data_idxs)  # dict: (e1, r) -> [e2, ...]
        er_vocab_pairs = list(er_vocab.keys())  # list of (e1, r) keys

        print("Starting training...")
        for it in range(1, self.num_iterations + 1):
            start_train = time.time()
            model.train()
            losses = []
            np.random.shuffle(er_vocab_pairs)

            for j in range(0, len(er_vocab_pairs), self.batch_size):

                data_batch, targets = self.get_batch(er_vocab, er_vocab_pairs,
                                                     j)
                # targets: FloatTensor [batch, num_entities], 1.0 at every correct tail
                opt.zero_grad()

                e1 = data_batch[:, 0]
                r = data_batch[:, 1]
                e1_idx = torch.LongTensor(self.Etextdata[e1])
                r_idx = torch.LongTensor(self.Rtextdata[r])

                if self.cuda:
                    e1_idx = e1_idx.cuda()
                    r_idx = r_idx.cuda()
                if e1_idx.size(0) == 1:
                    # skip size-1 batches (layers such as BatchNorm need more than one sample)
                    print(j)
                    continue
                predictions = model.forward(e1_idx, r_idx)
                if self.label_smoothing:
                    targets = ((1.0 - self.label_smoothing) *
                               targets) + (1.0 / targets.size(1))
                loss = model.loss(predictions, targets)
                loss.backward()
                opt.step()
                losses.append(loss.item())
            if self.decay_rate:
                scheduler.step()
            print(it)
            print(time.time() - start_train)
            print('loss=' + str(np.mean(losses)))
            model.eval()
            with torch.no_grad():
                if not it % 2:
                    if it % 10 == 0:
                        print("Train:")
                        start_test = time.time()
                        self.evaluate(model, d.train_data)
                        print(time.time() - start_test)
                    # print("Valid:")
                    # start_test = time.time()
                    # self.evaluate(model, d.valid_data)
                    # print(time.time() - start_test)
                    print("Test:")
                    start_test = time.time()
                    self.evaluate(model, d.test_data)
                    print(time.time() - start_test)
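Example #1 trains with 1-N scoring: each (e1, r) pair is scored against every entity at once, against a multi-hot target vector. The helpers self.get_er_vocab and self.get_batch are not shown above; the sketch below shows how they typically look in TuckER-style code. It is a minimal, self-contained version: the explicit batch_size and num_entities parameters are assumptions, whereas the methods above read them from self.

from collections import defaultdict
import numpy as np
import torch

def get_er_vocab(data):
    # Map each (head, relation) pair to the list of all correct tails,
    # so one forward pass can be scored against every entity.
    er_vocab = defaultdict(list)
    for e1, r, e2 in data:
        er_vocab[(e1, r)].append(e2)
    return er_vocab

def get_batch(er_vocab, er_vocab_pairs, idx, batch_size, num_entities):
    # Slice out one batch of (e1, r) keys and build a multi-hot target
    # matrix of shape [batch, num_entities] with 1.0 at every known tail.
    batch = er_vocab_pairs[idx:idx + batch_size]
    targets = torch.zeros(len(batch), num_entities)
    for i, pair in enumerate(batch):
        targets[i, er_vocab[pair]] = 1.0
    return np.array(batch), targets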
Example #2
    def train_and_eval(self):
        print("Training the TuckER model...")
        self.entity_idxs = {d.entities[i]: i for i in range(len(d.entities))}
        self.relation_idxs = {d.relations[i]: i for i in range(len(d.relations))}

        train_data_idxs = self.get_data_idxs(d.train_data)
        print("Number of training data points: %d" % len(train_data_idxs))

        ########
        data_ids, self.vocab = self.strings_to_ids(data=d.data, vocab=self.vocab)
        print("read vocab ready.")
        d.textdata = d.get_index(data_ids, self.maxlength)
        self.textdata = np.array(d.textdata)
        print("text data ready")
        cfg = config(dict(read_json(args.config)))
        model = TransformerTucker(d, self.ent_vec_dim, self.rel_vec_dim, cfg=cfg,
                                  vocab=40508,  # hard-coded encoder vocabulary size
                                  n_ctx=self.maxlength,
                                  **self.kwargs)  # n_ctx = 52 as computed in COMET
        print("model ready")
        load_openai_pretrained_model(
            model.transformer, n_ctx=self.maxlength)
        print("pretrained model loaded")

        ########
        if self.cuda:
            model.cuda()
        # model.init()
        opt = torch.optim.Adam(model.parameters(), lr=self.learning_rate)
        if self.decay_rate:
            scheduler = ExponentialLR(opt, self.decay_rate)

        er_vocab = self.get_er_vocab(train_data_idxs)
        er_vocab_pairs = list(er_vocab.keys())

        print("Starting training...")
        for it in range(1, self.num_iterations + 1):
            start_train = time.time()
            model.train()
            losses = []
            np.random.shuffle(er_vocab_pairs)
            for j in range(0, len(er_vocab_pairs), self.batch_size):
                data_batch, targets = self.get_batch(er_vocab, er_vocab_pairs, j)
                opt.zero_grad()

                # token ids reshaped to [batch, maxlength, 1] before position ids are appended
                e1_idx = torch.LongTensor(self.textdata[data_batch[:, 0]][:, :, np.newaxis])
                r_idx = torch.LongTensor(self.textdata[data_batch[:, 1]][:, :, np.newaxis])
                e1_idx = prepare_position_embeddings(encoder_vocab=self.vocab, sequences=e1_idx)
                r_idx = prepare_position_embeddings(encoder_vocab=self.vocab, sequences=r_idx)

                if self.cuda:
                    e1_idx = e1_idx.cuda()
                    r_idx = r_idx.cuda()
                predictions = model.forward(e1_idx, r_idx)
                if self.label_smoothing:
                    targets = ((1.0 - self.label_smoothing) * targets) + (1.0 / targets.size(1))
                loss = model.loss(predictions, targets)
                loss.backward()
                opt.step()
                losses.append(loss.item())
            if self.decay_rate:
                scheduler.step()
            print(it)
            print(time.time() - start_train)
            print('loss=' + str(np.mean(losses)))
            model.eval()
            with torch.no_grad():
                if it % 5:  # evaluates on every iteration not divisible by 5
                    print("Validation:")
                    self.evaluate(model, d.valid_data)
                    if not it % 2:
                        print("Test:")
                        start_test = time.time()
                        self.evaluate(model, d.test_data)
                        print(time.time() - start_test)
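Example #2 feeds the transformer token sequences of shape [batch, maxlength, 1] and relies on prepare_position_embeddings to append position ids. Below is a minimal sketch of that helper, assuming the OpenAI GPT / COMET convention of indexing positions into the same embedding matrix as words, offset by the vocabulary size; the actual helper in this repository may differ in detail.

import torch

def prepare_position_embeddings(encoder_vocab, sequences):
    # sequences: LongTensor [batch, seq_len, 1] of token ids.
    # Append a second channel of position ids offset by the vocabulary
    # size, so one embedding matrix covers both tokens and positions.
    vocab_size = len(encoder_vocab)
    seq_len = sequences.size(-2)
    positions = torch.arange(vocab_size, vocab_size + seq_len,
                             dtype=torch.long, device=sequences.device)
    sequences = sequences.repeat(1, 1, 2)  # -> [batch, seq_len, 2]
    sequences[:, :, 1] = positions         # broadcasts over the batch
    return sequences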
Example #3
    def train_and_eval(self):
        print("Training the {} model on {}...".format(args.model,
                                                      args.dataset))
        self.entity_idxs = {d.entities[i]: i for i in range(len(d.entities))}
        self.relation_idxs = {
            d.relations[i]: i
            for i in range(len(d.relations))
        }

        train_data_idxs = self.get_data_idxs(d.train_data)
        print("Number of training data points: %d" % len(train_data_idxs))

        ########
        entities_ids, self.Evocab = self.strings_to_ids(vocab=['NULL'],
                                                        data=d.entities)
        relation_ids, self.Rvocab = self.strings_to_ids(vocab=['NULL'],
                                                        data=d.relations)
        print("entities_ids len=%d" % len(entities_ids))
        print("relation_ids len=%d" % len(relation_ids))
        cfg = config(dict(read_json(args.config)))
        if args.do_pretrain == 1:
            cfg.hSize = 768
            Eembs = self.get_vocab_emb(self.Evocab, cfg.hSize)
        print("read vocab ready.")

        d.Etextdata = d.get_index(
            entities_ids, self.maxlength)  # list of entity token ids, padded to maxlength
        self.Etextdata = np.array(d.Etextdata)
        d.Rtextdata = d.get_index(relation_ids, 1)
        self.Rtextdata = np.array(d.Rtextdata)

        print("text data ready")
        es_idx = torch.LongTensor(self.Etextdata)
        if self.cuda:
            es_idx = es_idx.cuda()
            print("es ready")
        if args.model == 'Mean':
            model = MeanTuckER(d=d,
                               es_idx=es_idx,
                               ent_vec_dim=self.ent_vec_dim,
                               rel_vec_dim=self.rel_vec_dim,
                               cfg=cfg,
                               Evocab=len(self.Evocab),
                               Rvocab=len(self.Rvocab))
        elif args.model == 'CNN':
            model = CNNTuckER(d=d,
                              es_idx=es_idx,
                              ent_vec_dim=self.ent_vec_dim,
                              rel_vec_dim=self.rel_vec_dim,
                              cfg=cfg,
                              max_length=self.maxlength,
                              Evocab=len(self.Evocab),
                              Rvocab=len(self.Rvocab))
        elif args.model == 'LSTM':
            model = LSTMTuckER(d=d,
                               es_idx=es_idx,
                               ent_vec_dim=self.ent_vec_dim,
                               rel_vec_dim=self.rel_vec_dim,
                               cfg=cfg,
                               max_length=self.maxlength,
                               Evocab=len(self.Evocab),
                               Rvocab=len(self.Rvocab))
        else:
            print("Unknown model: %s" % args.model)
            exit(1)
        print("model ready")
        if args.do_pretrain == 1:
            model.Eembed.weight.data.copy_(torch.from_numpy(np.array(Eembs)))
            print("Embedding Loaded")

        ########
        if self.cuda:
            model.cuda()
        #model.init()
        opt = torch.optim.Adam(model.parameters(), lr=self.learning_rate)
        if self.decay_rate:
            scheduler = ExponentialLR(opt, self.decay_rate)

        er_vocab = self.get_er_vocab(train_data_idxs)  # dict: (e1, r) -> [e2, ...]

        print("Starting training...")

        for it in range(1, self.num_iterations + 1):
            start_train = time.time()
            model.train()
            losses = []
            np.random.shuffle(train_data_idxs)

            for j in range(0, len(train_data_idxs), self.batch_size):

                data_batch, e2n_idx = self.get_batch_train(
                    er_vocab, train_data_idxs, j)
                # data_batch: positive (e1, r, e2) triples; e2n_idx: sampled negative tails
                opt.zero_grad()

                e1_idx = torch.LongTensor(self.Etextdata[data_batch[:, 0]])
                r_idx = torch.LongTensor(self.Rtextdata[data_batch[:, 1]])
                e2p_idx = torch.LongTensor(self.Etextdata[data_batch[:, 2]])
                e2n_idx = torch.LongTensor(self.Etextdata[e2n_idx])
                # binary targets: 1.0 for positive tails, 0.0 for negatives
                targets = torch.cat((torch.ones(e2p_idx.size(0)),
                                     torch.zeros(e2n_idx.size(0))), 0)

                if self.cuda:
                    e1_idx = e1_idx.cuda()
                    r_idx = r_idx.cuda()
                    e2p_idx = e2p_idx.cuda()
                    e2n_idx = e2n_idx.cuda()
                    targets = targets.cuda()
                if e1_idx.size(0) == 1:
                    # skip size-1 batches (layers such as BatchNorm need more than one sample)
                    print(j)
                    continue
                pred_p, pred_n = model.forward(e1_idx, r_idx, e2p_idx, e2n_idx)
                predictions = torch.cat((pred_p, pred_n), 0)

                if self.label_smoothing:
                    targets = ((1.0 - self.label_smoothing) *
                               targets) + (1.0 / len(d.entities))
                loss = model.loss(predictions, targets)
                loss.backward()
                opt.step()
                losses.append(loss.item())

            if self.decay_rate:
                scheduler.step()
            print(it)
            print(time.time() - start_train)
            print("loss=" + str(np.mean(losses)))
            model.eval()
            with torch.no_grad():
                # print("Validation:")
                # self.evaluate(model, d.valid_data)
                # if not it % 2:
                # if it % 10 == 0:
                #     print("Train:")
                #     start_test = time.time()
                #     self.evaluate(model, d.train_data)
                #     print(time.time() - start_test)
                print("Test:")
                start_test = time.time()
                self.evaluate(model, d.test_data)
                print(time.time() - start_test)
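Unlike the first two examples, Example #3 trains with negative sampling: get_batch_train returns a batch of positive triples plus sampled negative tails, and the loss is computed over binary targets. That helper is not shown above; below is one plausible sketch (uniform corruption of the tail, rejecting known correct answers). The explicit batch_size and num_entities parameters are assumptions; the method above reads them from self.

import numpy as np

def get_batch_train(er_vocab, train_data_idxs, idx, batch_size, num_entities):
    # One batch of positive (e1, r, e2) triples plus, for each triple,
    # a uniformly sampled negative tail that is not a known correct
    # answer for the (e1, r) pair.
    batch = np.array(train_data_idxs[idx:idx + batch_size])
    neg_tails = []
    for e1, r, _ in batch:
        e2n = np.random.randint(num_entities)
        while e2n in er_vocab[(e1, r)]:
            e2n = np.random.randint(num_entities)
        neg_tails.append(e2n)
    return batch, np.array(neg_tails)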