Ejemplo n.º 1
0
 def predict_similarity(self, sa, sb):
     """Return ``self.f2sim`` evaluated on the single pair ``(sa, sb)``.

     The pair is wrapped as a one-sample data set (with ``0`` as the third
     field) and passed through ``prepare_data``; the fifth value returned
     by ``prepare_data`` is not used here. Entries of the first sentence
     are embedded with ``embed(..., 'en', W=self.W)`` and entries of the
     second with ``embed(..., 'jp')``.

     Args:
         sa: first sentence of the pair (embedded as 'en').
         sb: second sentence of the pair (embedded as 'jp').

     Returns:
         Whatever ``self.f2sim`` returns for the one-sample batch
         (presumably an array holding one similarity score -- confirm).
     """
     single_pair = [[sa, sb, 0]]
     x1, mas1, x2, mas2, y2 = prepare_data(single_pair, self.maxlen)

     # Switch the shared noise flag off for inference.
     use_noise.set_value(0.)

     en_vectors, jp_vectors = [], []
     for idx, en_item in enumerate(x1):
         en_vectors.append(embed(en_item, 'en', W=self.W))
         jp_vectors.append(embed(x2[idx], 'jp'))

     # Stack the per-sample embeddings along a new third axis, then swap
     # axes 1 and 2 to get the layout that f2sim is called with.
     emb1 = np.swapaxes(np.dstack(en_vectors), 1, 2)
     emb2 = np.swapaxes(np.dstack(jp_vectors), 1, 2)
     return self.f2sim(emb1, mas1, emb2, mas2)
Ejemplo n.º 2
0
    def evaluate(self, data):
        x1, mas1, x2, mas2, y2 = prepare_data(data, self.maxlen)
        use_noise.set_value(0.)

        n_samples = len(data)

        ls = []  # Embedding results of xa
        ls2 = []  # Embedding results of xb
        for j in range(0, n_samples):
            ls.append(embed(x1[j], 'en', W=self.W))
            ls2.append(embed(x2[j], 'jp'))

        # print "ls: (should be the same ref_embed)", ls
        rank_results = []

        for i in range(0, n_samples):

            # NOTE: mas1 and mas2 are verticle matrix, not a normal one!
            # ref_ls refers to n_samples(999,EN) of duplicated ls[i]
            # So we can compare the ls[i](EN) with other sentences(999,JP)
            # to derive the ranking results for this given article ls[i](EN)
            # 用一个英语文章比较所有可能为pairs的日语文章(如999篇)求出ranking
            # ref_ls 就是一个重复了999(n_samples)次的文章ls[i]
            # 而 ls2 就是可能为paris的999篇日语的文章
            ref_ls = [ls[i]] * n_samples
            # print "ref_embed", ref_embed
            ref_mas1 = np.array([
                mas1[:, i],
            ] * n_samples).T
            # print "ref_mas", ref_mas
            # print "mas1", mas1
            # return mas1, ref_mas
            trconv = np.dstack(ref_ls)
            trconv2 = np.dstack(ls2)
            emb2 = np.swapaxes(trconv2, 1, 2)
            emb1 = np.swapaxes(trconv, 1, 2)
            pred = self.f2sim(emb1, ref_mas1, emb2, mas2)

            rank = pd.Series(pred).rank(ascending=False)[i]
            rank_results.append(rank)
            print "the round", i, "rank:", rank

        return rank_results
Ejemplo n.º 3
0
    def get_mse(self, data):
        """Return ``self.f_cost`` evaluated on all of ``data`` in one batch.

        Every sample is embedded ('en' side with ``W=self.W``, 'jp' side
        without), the embeddings are stacked into a single batch and passed
        to ``self.f_cost`` together with the masks and targets produced by
        ``prepare_data``. The whole data set is processed in one call.

        Args:
            data: sequence of samples accepted by ``prepare_data``.

        Returns:
            The value returned by ``self.f_cost`` for the full batch.
        """
        x1, mas1, x2, mas2, y2 = prepare_data(data, self.maxlen)

        # Switch the shared noise flag off for evaluation.
        use_noise.set_value(0.)

        en_vectors = []  # embeddings of the first ('en') sentences
        jp_vectors = []  # embeddings of the second ('jp') sentences
        for idx in range(len(data)):
            en_vectors.append(embed(x1[idx], 'en', W=self.W))
            jp_vectors.append(embed(x2[idx], 'jp'))

        # Stack along a new third axis, then swap axes 1 and 2 to obtain
        # the layout that f_cost is called with.
        emb1 = np.swapaxes(np.dstack(en_vectors), 1, 2)
        emb2 = np.swapaxes(np.dstack(jp_vectors), 1, 2)

        return self.f_cost(emb1, mas1, emb2, mas2, y2)
Ejemplo n.º 4
0
 def chkterr2(self, mydata, batch_size=256):
     """Score ``mydata`` in mini-batches and compare predictions to targets.

     The data is processed ``batch_size`` samples at a time so the whole
     set never has to be embedded at once. Each batch is run through
     ``self.f2sim`` and the result is mapped with ``* 4.0 + 1.0`` before
     being compared with the targets returned by ``prepare_data``.

     Args:
         mydata: indexable sequence of samples accepted by
             ``prepare_data``.
         batch_size: number of samples evaluated per ``f2sim`` call.
             Defaults to 256, the value that used to be hard-coded.

     Returns:
         A 3-tuple ``(mse, pearson, spearman)``: the mean squared error
         between rescaled predictions and targets, and the first element
         of ``meas.pearsonr`` / ``meas.spearmanr`` for the two vectors
         (presumably the correlation coefficients -- ``meas`` is assumed
         to be ``scipy.stats``; confirm against the file's imports).

     Raises:
         ValueError: if ``batch_size`` is smaller than 1.
     """
     if batch_size < 1:
         raise ValueError("batch_size must be a positive integer")

     num = len(mydata)
     px = []  # predictions, rescaled with * 4.0 + 1.0
     yx = []  # targets as returned by prepare_data
     use_noise.set_value(0.)
     for start in range(0, num, batch_size):
         # The last batch may be shorter than batch_size.
         stop = min(start + batch_size, num)
         q = [mydata[j] for j in range(start, stop)]
         x1, mas1, x2, mas2, y2 = prepare_data(q, self.maxlen)
         ls = []
         ls2 = []
         for j in range(0, len(q)):
             ls.append(embed(x1[j], 'en', W=self.W))
             ls2.append(embed(x2[j], 'jp'))
         # Stack along a new third axis, then swap axes 1 and 2 to get the
         # layout that f2sim is called with.
         emb1 = np.swapaxes(np.dstack(ls), 1, 2)
         emb2 = np.swapaxes(np.dstack(ls2), 1, 2)
         pred = (self.f2sim(emb1, mas1, emb2, mas2)) * 4.0 + 1.0
         for z in range(0, len(q)):
             yx.append(y2[z])
             px.append(pred[z])
     px = np.array(px)
     yx = np.array(yx)
     return (np.mean(np.square(px - yx)),
             meas.pearsonr(px, yx)[0],
             meas.spearmanr(yx, px)[0])
Ejemplo n.º 5
0
    def train_lstm(self,
                   train,
                   max_epochs,
                   correct,
                   test_correct,
                   batchsize=32):
        print "Training"
        print "the length of the training data is ", len(train)

        # test = train

        print "Batchsize =", batchsize
        print "max_epochs =", max_epochs
        lrate = 0.0001  # Learning rate, but Not USED ???
        freq = 0  # ???
        batchsize = 64
        dfreq = 21  #display frequency

        self.mse = []  # MSE of train1 + train2
        self.rank = []
        self.tops = {}

        self.mse_test = []  # MSE of test1
        self.mse_train = []  # MSE of train1
        self.rank_test = []
        self.tops_test = {}

        self.top_keys = [1, 5, 10]

        print "Before trianing, the error is:"
        # print self.chkterr2(train) # MSE check
        cst_all = self.chkterr2(train)[0] / 16
        self.mse.append(cst_all)
        cst_test = self.chkterr2(test_correct)[0] / 16
        self.mse_test.append(cst_test)
        cst_train = self.chkterr2(correct)[0] / 16
        self.mse_train.append(cst_train)
        # 【注意】内存不足时使用chkterr2但是会慢,内存足够时使用 , self.get_mse(train)
        # 【注意】不要直接使用cst变量作为cost,因为这里的cst是最后一个batch的cost而已,不是全部的
        print "Training error:", cst_all  #, "==", self.get_mse(train)
        print "Training_correct error", cst_train
        print "Testing_correct error:", cst_test

        # Saving (Initialization) the ranking and top1,5,10 information (Trianing data)
        rank_results_train, n_tops = self.evaluate2(
            correct, tops=self.top_keys)  # Similairty check
        # print "[debug]", n_tops
        for top_key in self.top_keys:
            # print "[debug]", n_tops[top_key]
            self.tops[top_key] = []
            self.tops[top_key].append(n_tops[top_key])
            print "top-", top_key, "=", self.tops[top_key], ":", n_tops[
                top_key]
        print "Discription of evaluation (ranking) for training data:"
        print pd.Series(rank_results_train).describe()

        # Saving (Initialization) the ranking and top1,5,10 information (Testing data)
        rank_results_test, n_tops_test = self.evaluate2(
            test_correct, tops=self.top_keys)  # Similairty check
        # print "[debug]", n_tops
        for top_key in self.top_keys:
            # print "[debug]", n_tops[top_key]
            self.tops_test[top_key] = []
            self.tops_test[top_key].append(n_tops_test[top_key])
            print "top-", top_key, "=", self.tops_test[
                top_key], ":", n_tops_test[top_key]
        print "Discription of evaluation (ranking) for testing data:"
        print pd.Series(rank_results_test).describe()

        # eidx -> index of epoch
        for eidx in xrange(0, max_epochs):
            sta = time.time()
            print ""
            print 'Epoch', eidx, '...'

            num = len(train)  # length of training data

            #---------------------Shuffle the data------------------------------#
            # 为何不直接用shuffle函数?
            # generates a list with length of num from the population xrange(num)
            # Used for shuffling the training data each time for each epoches
            # [5,2,6,.11,...] length -> len(train)
            rnd = random.sample(xrange(num), num)

            # i would be (0,32,64,...)
            # Iterate all batches
            for i in range(0, num, batchsize):
                q = []
                x = i + batchsize
                if x > num:
                    x = num

                # Shuffle data
                # Iterate samples inside each batch
                # i -> start index of the batch
                # x -> end index of the batch
                for z in range(i, x):
                    # shuffling the training data to the list q
                    q.append(train[rnd[z]])
            #---------------------------------------------------------------------#
                """
                Mask for LSTM is prepared by sentence module
                x1 = np.array([["我","很","好",",",",",","][...]...])
                len(x1) => 文档的总数
                mas1 = np.array([[1,1,1,0,0,0,0,0,0,0][...]...])
                """
                x1, mas1, x2, mas2, y2 = prepare_data(q, self.maxlen)

                ls = []
                ls2 = []
                freq += 1
                use_noise.set_value(1.)
                for j in range(0, len(x1)):
                    ls.append(embed(x1[j], 'en', W=self.W))
                    ls2.append(embed(x2[j], 'jp'))
                trconv = np.dstack(ls)
                trconv2 = np.dstack(ls2)
                emb2 = np.swapaxes(trconv2, 1, 2)
                emb1 = np.swapaxes(trconv, 1, 2)

                cst = self.f_grad_shared(emb2, mas2, emb1, mas1, y2)
                s = self.f_update(lrate)  # Not USED ???

                if np.mod(freq, dfreq) == 0:
                    print 'Epoch ', eidx, 'Update ', freq, 'Cost ', cst
                # print 'Epoch ', eidx, 'Update ', freq, 'Cost ', cst

            # Evalution
            # print self.chkterr2(train) # MSE check
            cst_all = self.chkterr2(train)[0] / 16
            self.mse.append(cst_all)
            cst_test = self.chkterr2(test_correct)[0] / 16
            self.mse_test.append(cst_test)
            cst_train = self.chkterr2(correct)[0] / 16
            self.mse_train.append(cst_train)
            # 【注意】内存不足时使用chkterr2但是会慢,内存足够时使用 , self.get_mse(train)
            # 【注意】不要直接使用cst变量作为cost,因为这里的cst是最后一个batch的cost而已,不是全部的
            # 错误用法: print "Training error:", cst, "=", self.chkterr2(train)[0]/16, "==", self.get_mse(train)
            print "Training error:", cst_all  #, "==", self.get_mse(train)
            print "Training_correct error", cst_train
            print "Testing_correct error:", cst_test

            # Saving the ranking and top1,5,10 information
            rank_results_train, n_tops = self.evaluate2(
                correct, tops=self.top_keys)  # Similairty check
            self.rank.append(rank_results_train)
            for top_key in self.top_keys:
                self.tops[top_key].append(n_tops[top_key])
                print "top-", top_key, "=", self.tops[top_key], ":", n_tops[
                    top_key]
            print "Discription of evaluation (ranking) for training data:"
            print pd.Series(rank_results_train).describe()

            # Saving the ranking and top1,5,10 information
            rank_results_test, n_tops_test = self.evaluate2(
                test_correct, tops=self.top_keys)  # Similairty check
            self.rank_test.append(rank_results_test)
            for top_key in self.top_keys:
                self.tops_test[top_key].append(n_tops_test[top_key])
                print "top-", top_key, "=", self.tops_test[
                    top_key], ":", n_tops_test[top_key]
            print "Discription of evaluation (ranking) for testing data:"
            print pd.Series(rank_results_test).describe()

            # Saving the present weights:
            self.save_model(name=self.model_name + "_" + str(eidx) + ".p")

            sto = time.time()
            self.time_saver = sto - sta
            print "epoch took:", self.time_saver