Example 1
    def test_cos_sim(self):
        import numpy as np
        import scipy.spatial.distance

        def gen_testcase(n):
            np.random.seed(0)
            x = np.random.rand(n)
            y = np.random.rand(n)
            ans = 1 - scipy.spatial.distance.cosine(x, y)
            return x, y, ans

        # Check against SciPy's reference value for a few input sizes.
        n_test = 3
        for i in range(n_test):
            x, y, expected = gen_testcase(10**i)
            self.assertLess(abs(expected - cos_sim(x, y)), 1e-10)
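
The `cos_sim` under test is not shown in this example; a minimal NumPy implementation that would satisfy the assertion might look like this (an assumed sketch, not the tested code):

import numpy as np

def cos_sim(x, y):
    # Cosine similarity: dot product divided by the product of the norms.
    return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))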
Example 2
    def fit(self):

        # Old list-based buffers: cosine similarity between one document and
        # each centroid (Cos_cent), and Euclidean distance between one
        # document and the documents in each category (Cos_neigh).
        #Cos_cent = [None] * (self.ds_len * self.crd_len)
        #Cos_neigh = [None] * (self.ds_len ** 2)
        Cos_cent = np.empty((len(self.ds.index), len(self.crd.index)),
                            dtype=np.float32)
        #Cos_neigh = np.empty((len(self.ds.index), len(self.ds.index)), dtype=np.float32)  # original method
        Cos_neigh = np.empty((len(self.ds.index), self.k),
                             dtype=np.float32)  # method 2
        # faiss needs C-contiguous float32 rows; L2-normalize them so that
        # inner-product search returns cosine similarities.
        tmpDS = self.ds.values.copy(order='C').astype(np.float32)
        faiss.normalize_L2(tmpDS)
        self.knn.add(tmpDS)
        """
        for i in tqdm(range(self.ds_len), desc='fit'): 
            ###Cos_neighを生成###
            for j in range(self.ds_len):
                #Cos_neigh[i*self.ds_len + j] = np.linalg.norm(self.ds.iloc[j] - self.ds.iloc[i]) #1つの文書から近いカテゴリに含まれている各文書とのコサイン類似度を求める(Cos_neigh)
                Cos_neigh[i][j] = self.cos_sim(self.ds.iloc[i].values, self.ds.iloc[j].values)
            
            ###Cos_centを生成###
            for j in range(self.crd_len):
                #Cos_cent[i*self.crd_len + j] = self.cos_sim(self.ds.iloc[i], self.crd.iloc[j]) #1つの文書に対して各重心とのコサイン類似度(距離)を計算(Cos_cent)
                Cos_cent[i][j] = self.cos_sim(self.ds.iloc[i].values, self.crd.iloc[j].values)
        """

        print('fit')
        ### Build Cos_cent ###
        Cos_cent = cs.cos_sim(self.ds.values, self.crd.values)
        """
        ### Build Cos_neigh ###
        Cos_neigh = cs.cos_sim(self.ds.values, self.ds.values)
        """
        for i, d in enumerate(tqdm(self.ds.values, desc='cos_neigh')):
            # faiss expects a 2D float32 query; normalize it so the scores
            # are cosine similarities, then drop the first hit (the
            # document itself).
            q = np.ascontiguousarray(d.reshape(1, -1), dtype=np.float32)
            faiss.normalize_L2(q)
            D, I = self.knn.search(q, self.k + 1)
            Cos_neigh[i] = D[0, 1:]

        # Old Cos_neigh variants and reshape steps:
        #Cos_neigh = cn.cos_neigh(self.ds.values, self.ds.values)  # Euclidean distance between one document and each document in nearby categories (Cos_neigh)
        #Cos_cent = np.array(Cos_cent).reshape(self.ds_len, self.crd_len)  # reshape into matrix form
        #Cos_neigh = np.array(Cos_neigh).reshape(self.ds_len, self.ds_len)
        self.XMFextend = np.concatenate([self.ds.values, Cos_cent, Cos_neigh],
                                        1)  # build XMFextend (original method)
Example 3
    def predict(self, target):
        #test_Mcent = [None] * self.crd_len
        #test_Cent_ext = [None] * (target_len * self.ds_len)
        test_Mcent = np.empty((len(target.index), len(self.crd.index)),
                              dtype=np.float32)
        test_Cent_ext = np.empty((len(target.index), len(self.ds.index)),
                                 dtype=np.float32)
        """
        for i in tqdm(range(len(target.index)), desc='predict'): #ターゲット文書と各重心とのコサイン類似度を計算(Cos_cent)
            for j in range(len(self.crd.index)):
                test_Mcent[j] = self.cos_sim(target.iloc[i].values, self.crd.iloc[j].values)

            for j in range(self.ds_len):
                #test_Cent_ext[i*self.ds_len + j] = np.linalg.norm(self.Mcent[j] - test_Mcent) #各文書とのユークリッド距離を求める(Cos_neigh)
                test_Cent_ext[i][j] = np.linalg.norm(self.Mcent[j] - test_Mcent)
        """
        test_Mcent = cs.cos_sim(target.values, self.crd.values)
        test_Cent_ext = cn.cos_neigh(test_Mcent, self.Mcent)

        #test_Cent_ext = np.array(test_Cent_ext).reshape(target_len, self.ds_len)
        test_pred = self.clf.predict(test_Cent_ext)

        return test_pred
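
The cs.cos_sim helper used here and in the other examples is not shown; judging by how its result is used, it appears to return the full pairwise cosine-similarity matrix between the rows of its two arguments. A hypothetical reconstruction:

import numpy as np

def cos_sim(A, B):
    # Pairwise cosine similarity: normalize the rows, then one matrix
    # product yields the (len(A), len(B)) similarity matrix.
    A_norm = A / np.linalg.norm(A, axis=1, keepdims=True)
    B_norm = B / np.linalg.norm(B, axis=1, keepdims=True)
    return (A_norm @ B_norm.T).astype(np.float32)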
Example 4
    def fit(self):
        Cent_ext = np.empty((len(self.ds.index), len(self.ds.index)),
                            dtype=np.float32)
        """
        for i in tqdm(range(self.ds_len), desc='Mcent'): #1つの文書に対して各重心とのコサイン類似度(距離)を計算(Cos_cent)
            for j in range(self.crd_len):
                #self.Mcent[i*self.crd_len + j] = self.cos_sim(self.ds.iloc[i], self.crd.iloc[j])
                self.Mcent[i][j] = self.cos_sim(self.ds.iloc[i].values, self.crd.iloc[j].values)

        #self.Mcent = np.array(self.Mcent).reshape(self.ds_len, self.crd_len)
        #Cent_ext = [None] * ((self.ds_len) ** 2)
        #Cent_ext = np.empty((self.ds_len, self.ds_len), dtype=np.float32)

        for i in tqdm(range(self.ds_len), desc='Cent_ext'):
            for j in range(self.ds_len):
                #Cent_ext[i*self.ds_len + j] = np.linalg.norm(self.Mcent[j] - self.Mcent[i]) #1つの文書から近いカテゴリに含まれている各文書とのユークリッド距離を求める(Cos_neigh)
                self.Cent_ext[i][j] = np.linalg.norm(self.Mcent[j] - self.Mcent[i])
        """
        self.Mcent = cs.cos_sim(self.ds.values, self.crd.values)
        Cent_ext = cn.cos_neigh(self.Mcent, self.Mcent)

        #Cent_ext = np.array(Cent_ext).reshape(self.ds_len, self.ds_len)  # reshape into matrix form
        self.clf.fit(Cent_ext, np.ravel(self.cl))
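
Like cs.cos_sim, the cn.cos_neigh helper is not shown. The commented-out loops it replaces compute np.linalg.norm(self.Mcent[j] - self.Mcent[i]) for every pair, so a plausible sketch is a pairwise Euclidean-distance matrix (an assumption, not the actual module):

import numpy as np
import scipy.spatial.distance

def cos_neigh(A, B):
    # Pairwise Euclidean distances between the rows of A and B,
    # matching the commented-out np.linalg.norm double loop.
    return scipy.spatial.distance.cdist(A, B).astype(np.float32)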
Example 5
    def predict(self, target):
        test_XMFextend = None

        # Old list-based buffers: cosine similarity (distance) between one
        # document and each centroid, and Euclidean distance between one
        # document and the documents in each category.
        #test_Cos_cent = [None] * (target_len * self.crd_len)
        #test_Cos_neigh = [None] * (target_len * self.ds_len)
        test_Cos_cent = np.empty((len(target.index), len(self.crd.index)),
                                 dtype=np.float32)
        test_Cos_neigh = np.empty((len(target.index), len(self.ds.index)),
                                  dtype=np.float32)
        """
        for i in tqdm(range(target_len), desc="test"): #ターゲット文書に対して各重心とのコサイン類似度(距離)を計算(Cos_cent)
            ###Cos_neighを生成###
            for j in range(self.ds_len): 
                #test_Cos_neigh[i*self.ds_len + j] = np.linalg.norm(self.ds.iloc[j] - target.iloc[i]) #各文書とのコサイン類似度を求める(Cos_neigh)
                test_Cos_neigh[i][j] = self.cos_sim(target.iloc[i].values, self.ds.iloc[j].values)

            ###Cos_centを生成###
            for j in range(self.crd_len):      
                #test_Cos_cent[i*self.crd_len + j] = self.cos_sim(target.iloc[i], self.crd.iloc[j])
                test_Cos_cent[i][j] = self.cos_sim(target.iloc[i].values, self.crd.iloc[j].values)
        """

        print('test')
        ### Build Cos_cent ###
        test_Cos_cent = cs.cos_sim(target.values, self.crd.values)

        ### Build Cos_neigh ###
        test_Cos_neigh = cs.cos_sim(target.values, self.ds.values)

        # Old Cos_neigh variants and reshape steps:
        #test_Cos_neigh = cn.test_cos_neigh(target.values, self.ds.values)  # Euclidean distance to each document (Cos_neigh)
        #test_Cos_cent = np.array(test_Cos_cent).reshape(target_len, self.crd_len)
        #test_Cos_neigh = np.array(test_Cos_neigh).reshape(target_len, self.ds_len)
        """
        test_Mcent = cs.cos_sim(target.values, self.crd.values) #改良手法1
        test_cent_ext = cn.cos_neigh(test_Mcent, self.Mcent)
        #print(test_cent_ext.shape)
        """
        test_XMFextend = np.concatenate(
            [target.values, test_Cos_neigh, test_Cos_cent],
            1)  # build test_XMFextend (original method)
        #test_XMFextend = np.concatenate([target.values, test_cent_ext], 1)  # build test_XMFextend (improved method 1)
        #print(test_XMFextend.shape)

        orig_ext = pd.DataFrame(columns=self.crd.index,
                                index=target.index)  # stores the predicted categories
        """
        del self.Mcent
        del test_Mcent
        del test_cent_ext
        gc.collect()
        """
        for i in tqdm(range(len(self.hwc)), desc='hwc fit'):  # one hyperplane per category
            Bool_cl_train = (self.cl['Category'] == self.crd.index[i]
                             ).values  # binarize the categories passed to fit
            self.hwc[i].fit(self.XMFextend, Bool_cl_train)  # binary classification, one model per category

        del self.XMFextend
        gc.collect()

        for i in tqdm(range(len(self.crd.index)), desc='predict'):
            # Distance of each test document from the hyperplane, squashed
            # through a sigmoid and written into orig_ext column by column.
            orig_ext[self.crd.index[i]] = [
                self.sigmoid_func(d)
                for d in self.hwc[i].decision_function(test_XMFextend)
            ]

        test_pred = orig_ext.astype(np.float32).idxmax(
            axis=1)  # pick the category with the largest score per target document

        return test_pred
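
sigmoid_func is referenced but not defined in this example; the standard logistic sigmoid would fit its role of squashing decision_function margins into (0, 1) (an assumed sketch):

import numpy as np

def sigmoid_func(x):
    # Logistic sigmoid: maps any real margin into (0, 1).
    return 1.0 / (1.0 + np.exp(-x))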
Example 6
def main():
    path = r"glove.6B.50d.txt.w2v"
    glove = KeyedVectors.load_word2vec_format(path, binary=False)

    # Load the COCO train2014 captions JSON.
    path_to_json = "captions_train2014.json"
    with open(path_to_json, "rb") as f:
        json_data = json.load(f)
    resnet = unpickle.unpickle()  # image id -> image descriptor

    with open("idfs1.pkl", mode="rb") as idf:
        idfs = pickle.load(idf)
    with open("img_to_caption1.pkl", mode="rb") as cap:
        img_to_caption = pickle.load(cap)
    #with open("img_to_coco1.pkl", mode="rb") as coco:
    #    img_to_coco = pickle.load(coco)
    model = Model()

    # Warm-start the dense layer from previously saved parameters.
    model.dense1.weight = mg.Tensor(np.load('weight.npy'))
    model.dense1.bias = mg.Tensor(np.load('bias.npy'))
    optim = Adam(model.parameters)

    batch_size = 100
    for epoch_cnt in range(100):

        idxs = list(resnet.keys())
        np.random.shuffle(idxs)
        # The -1 leaves room for the offset "bad" batch below.
        for batch_cnt in range(0, len(idxs) // batch_size - 1):
            batch_indices = idxs[(batch_cnt * batch_size):((batch_cnt + 1) *
                                                           batch_size)]
            batch_indices2 = idxs[((batch_cnt + 1) *
                                   batch_size):((batch_cnt + 2) * batch_size)]
            # id1 holds the matching images, id2 the mismatched ("bad") ones.
            id1 = batch_indices
            id2 = batch_indices2

            # Stack the batch of image descriptors and caption embeddings
            # row by row.
            good_image = resnet[id1[0]]
            bad_image = resnet[id2[0]]
            text = embed_text.se_text(img_to_caption[id1[0]][0], glove, idfs)
            for i in id1[1:]:
                good_image = np.vstack((good_image, resnet[i]))
                text = np.vstack(
                    (text, embed_text.se_text(img_to_caption[i][0], glove,
                                              idfs)))

            for i in id2[1:]:
                bad_image = np.vstack((bad_image, resnet[i]))

            sim_to_good = cos_sim.cos_sim(model(good_image), text)
            sim_to_bad = cos_sim.cos_sim(model(bad_image), text)

            # compute the margin ranking loss for our predictions
            loss = margin_ranking_loss(sim_to_good, sim_to_bad, 1, 0.1)
            # back-propagate through your computational graph through your loss
            loss.backward()

            # compute the accuracy between the prediction and the truth
            acc = accuracy(sim_to_good.data, sim_to_bad.data)
            # execute gradient descent by calling step() of optim
            optim.step()
            # null your gradients
            loss.null_gradients()

    # Persist the trained dense-layer parameters for the next run.
    np.save('weight', model.dense1.parameters[0].data)
    np.save('bias', model.dense1.parameters[1].data)
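
The margin_ranking_loss used above is a MyGrad-differentiable function; what it computes can be sketched in plain NumPy (a hypothetical illustration, not the library code): with y = 1, the loss penalizes every pair where sim_to_good does not beat sim_to_bad by at least the margin.

import numpy as np

def margin_ranking_loss_np(sim_good, sim_bad, y, margin):
    # mean(max(0, margin - y * (sim_good - sim_bad)))
    return np.mean(np.maximum(0.0, margin - y * (sim_good - sim_bad)))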
Example 7
def guess_category(dv, v):
    # Score v against every category's reference vector, then return the
    # category with the highest cosine similarity.
    cos_sim_dict = {cate: cos_sim(np.array(x), np.array(v))
                    for cate, x in dv.items()}
    return max(cos_sim_dict, key=cos_sim_dict.get)
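
A small usage sketch with made-up category vectors (hypothetical data):

dv = {"sports": [1.0, 0.0], "politics": [0.0, 1.0]}
print(guess_category(dv, [0.9, 0.1]))  # -> "sports"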