def test_cos_sim(self):
    """Check cos_sim against scipy's reference cosine distance.

    Exercises vector lengths 1, 10 and 100 with deterministic random
    fixtures and requires agreement to within 1e-10.
    """
    import numpy as np
    import scipy.spatial.distance

    def make_case(dim):
        # Seed immediately before drawing so each case is reproducible.
        np.random.seed(0)
        a = np.random.rand(dim)
        b = np.random.rand(dim)
        # scipy returns cosine *distance*; similarity = 1 - distance.
        reference = 1 - scipy.spatial.distance.cosine(a, b)
        return a, b, reference

    for exponent in range(3):
        np.random.seed(0)
        a, b, reference = make_case(10**exponent)
        self.assertLess(abs(reference - cos_sim(a, b)), 1e-10)
def fit(self):
    """Build the extended training matrix ``self.XMFextend``.

    Per training document, concatenates: the raw feature vector, its
    cosine similarity to every category centroid (Cos_cent), and the
    faiss inner-product scores of its k nearest training documents
    (Cos_neigh, "method 2").
    """
    #Cos_cent = [None] * (self.ds_len * self.crd_len) #cosine similarity (distance) of one document to each centroid
    #Cos_neigh = [None] * (self.ds_len ** 2) #Euclidean distance between one document and the documents in each category
    Cos_cent = np.empty((len(self.ds.index), len(self.crd.index)), dtype=np.float32)
    #Cos_neigh = np.empty((len(self.ds.index), len(self.ds.index)), dtype=np.float32) #conventional method
    Cos_neigh = np.empty((len(self.ds.index), self.k), dtype=np.float32) #method 2
    # faiss needs C-contiguous float32; vectors are L2-normalised before
    # indexing, so inner-product search scores behave like cosine similarity.
    tmpDS = self.ds.values.copy(order='C').astype(np.float32)
    faiss.normalize_L2(tmpDS)
    self.knn.add(tmpDS)
    """
    for i in tqdm(range(self.ds_len), desc='fit'):
        for j in range(self.ds_len):
            #Cos_neigh[i*self.ds_len + j] = np.linalg.norm(self.ds.iloc[j] - self.ds.iloc[i])
            Cos_neigh[i][j] = self.cos_sim(self.ds.iloc[i].values, self.ds.iloc[j].values)
        for j in range(self.crd_len):
            #Cos_cent[i*self.crd_len + j] = self.cos_sim(self.ds.iloc[i], self.crd.iloc[j])
            Cos_cent[i][j] = self.cos_sim(self.ds.iloc[i].values, self.crd.iloc[j].values)
    """
    print('fit')
    ### Generate Cos_cent ###
    Cos_cent = cs.cos_sim(self.ds.values, self.crd.values)
    """
    ### Generate Cos_neigh (disabled: full pairwise variant) ###
    Cos_neigh = cs.cos_sim(self.ds.values, self.ds.values)
    """
    ### Generate Cos_neigh via faiss k-NN ###
    for i, d in enumerate(tqdm(self.ds.values, desc='cos_neigh')):
        # NOTE(review): faiss .search expects a 2-D (n, dim) float32 query
        # array, but `d` is a 1-D row and is not L2-normalised like the
        # indexed vectors — confirm this works with the installed faiss.
        D, I = self.knn.search(d, self.k + 1)
        # Drop the first column: the nearest hit is the query document itself.
        D = np.delete(D, 0, 1)
        print(D)
        # I (neighbour indices) is unused; only the scores are kept.
        Cos_neigh[i] = D[0]
    #Cos_neigh = cn.cos_neigh(self.ds.values, self.ds.values)
    #Euclidean distance from one document to the documents in nearby categories (Cos_neigh)
    #Cos_cent = np.array(Cos_cent).reshape(self.ds_len, self.crd_len) #reshape into matrix form
    #Cos_neigh = np.array(Cos_neigh).reshape(self.ds_len, self.ds_len)
    self.XMFextend = np.concatenate([self.ds.values, Cos_cent, Cos_neigh], 1)  #build XMextend  #conventional method
    # """  NOTE(review): the original source ends with a dangling triple-quote
    # here, apparently opening a disabled block that continues beyond this
    # chunk; kept as a comment so the function parses.
def predict(self, target):
    """Predict category labels for target documents.

    Maps each target document into centroid-similarity space, computes
    its distance to every training document's centroid-similarity vector
    (``self.Mcent``, built in ``fit``), and lets ``self.clf`` classify
    the resulting distance features.

    Parameters
    ----------
    target : pandas.DataFrame
        Rows = documents to classify, columns = raw features.
        (Assumed from the ``.index`` / ``.values`` usage — confirm.)

    Returns
    -------
    Predicted labels, as returned by ``self.clf.predict``.
    """
    # Cosine similarity of each target document to every category centroid.
    test_Mcent = cs.cos_sim(target.values, self.crd.values)
    # Distance of each target's centroid-similarity vector to every training
    # document's vector; per the original comments cn.cos_neigh computes
    # Euclidean distances — confirm against its implementation.
    test_Cent_ext = cn.cos_neigh(test_Mcent, self.Mcent)
    test_pred = self.clf.predict(test_Cent_ext)
    return test_pred
def fit(self):
    """Fit the classifier on centroid-similarity distance features.

    Builds ``self.Mcent`` (cosine similarity of every training document
    to every category centroid) and ``Cent_ext`` (distance between every
    pair of rows of ``Mcent``; per the original comments ``cn.cos_neigh``
    computes Euclidean distances — confirm against its implementation),
    then fits ``self.clf`` on those features with the flattened labels.
    """
    # Cosine similarity of each training document to each category centroid.
    self.Mcent = cs.cos_sim(self.ds.values, self.crd.values)
    # Pairwise distances between the centroid-similarity vectors.
    Cent_ext = cn.cos_neigh(self.Mcent, self.Mcent)
    # np.ravel flattens the label frame/column into the 1-D y sklearn expects.
    self.clf.fit(Cent_ext, np.ravel(self.cl))
def predict(self, target):
    """Train the per-category hyperplanes, then score the target documents.

    NOTE(review): despite its name, this method also *fits* the
    one-vs-rest hyperplanes in ``self.hwc`` on ``self.XMFextend`` and
    deletes that matrix afterwards, so it can only be called once after
    ``fit``.

    NOTE(review): the train matrix is [ds, Cos_cent, Cos_neigh] while the
    test matrix below is [target, Cos_neigh, Cos_cent] — the column order
    (and, with the faiss "method 2" Cos_neigh, the width) differ between
    train and test. Confirm which layout is intended.

    Parameters
    ----------
    target : pandas.DataFrame
        Rows = documents to classify, columns = raw features.

    Returns
    -------
    pandas.Series
        Predicted category label per target document (argmax of the
        sigmoid-normalised hyperplane margins).
    """
    print('test')
    # Cosine similarity of each target document to every category centroid.
    test_Cos_cent = cs.cos_sim(target.values, self.crd.values)
    # Cosine similarity of each target document to every training document.
    test_Cos_neigh = cs.cos_sim(target.values, self.ds.values)
    # Extended feature matrix for the targets (conventional method).
    test_XMFextend = np.concatenate(
        [target.values, test_Cos_neigh, test_Cos_cent], 1)
    # One column per category; will hold the normalised margin scores.
    orig_ext = pd.DataFrame(columns=self.crd.index, index=target.index)
    # Fit one binary (one-vs-rest) hyperplane per category.
    for i in tqdm(range(len(self.hwc)), desc='hwc fit'):
        # Binarise the labels: True for documents of category i.
        Bool_cl_train = (self.cl['Category'] == self.crd.index[i]).values
        self.hwc[i].fit(self.XMFextend, Bool_cl_train)
    del self.XMFextend  # free the (large) training matrix
    gc.collect()
    for i in tqdm(range(len(self.crd.index)), desc='predict'):
        # Margin of each target doc to hyperplane i, squashed by sigmoid.
        # (Renamed the comprehension variable: the original reused `i`,
        # shadowing the loop index.)
        orig_ext[self.crd.index[i]] = [
            self.sigmoid_func(margin)
            for margin in self.hwc[i].decision_function(test_XMFextend)
        ]
    # The category with the largest normalised score wins.
    test_pred = orig_ext.astype(np.float32).idxmax(axis=1)
    return test_pred
def main():
    """Train the image-caption embedding model with a margin ranking loss.

    Loads GloVe vectors, COCO caption metadata, precomputed ResNet image
    features and IDF weights, then runs batched training: each "good"
    image embedding is pulled toward its caption embedding while a
    mismatched "bad" image from the next batch is pushed away.
    """
    path = r"glove.6B.50d.txt.w2v"
    glove = KeyedVectors.load_word2vec_format(path, binary=False)
    # loads the json file
    path_to_json = "captions_train2014.json"
    with open(path_to_json, "rb") as f:
        json_data = json.load(f)  # NOTE(review): loaded but never used below
    resnet = unpickle.unpickle()  # presumably image id -> ResNet feature row; confirm
    with open("idfs1.pkl", mode="rb") as idf:
        idfs = pickle.load(idf)
    with open("img_to_caption1.pkl", mode="rb") as cap:
        img_to_caption = pickle.load(cap)
    #with open("img_to_coco1.pkl", mode="rb") as coco:
        #img_to_coco=pickle.load(coco)
    model = Model()
    # Resume training from previously saved dense-layer parameters.
    model.dense1.weight = mg.Tensor(np.load('weight.npy'))
    model.dense1.bias = mg.Tensor(np.load('bias.npy'))
    optim = Adam(model.parameters)
    batch_size = 100
    for epoch_cnt in range(100):
        idxs = list(resnet.keys())
        np.random.shuffle(idxs)
        # Stop one batch early so the following batch (the "bad" images) exists.
        for batch_cnt in range(0, len(idxs) // batch_size - 1):
            batch_indices = idxs[(batch_cnt * batch_size):((batch_cnt + 1) * batch_size)]
            batch_indices2 = idxs[((batch_cnt + 1) * batch_size):((batch_cnt + 2) * batch_size)]
            # id1 = np.random.choice(list(resnet.keys()))
            # print(id1)
            id1 = batch_indices   # image ids that match their captions
            # while id1 == id2:
            id2 = batch_indices2  # mismatched ("bad") image ids
            # print(type(resnet[id1]),type(img_to_caption[id1][0]),type(resnet[id2]))
            good_image = resnet[id1[0]]
            bad_image = resnet[id2[0]]
            text = embed_text.se_text(img_to_caption[id1[0]][0], glove, idfs)
            # Stack the remaining batch members row by row.
            for i in id1[1:]:
                good_image = np.vstack((good_image, resnet[i]))
                text = np.vstack(
                    (text, embed_text.se_text(img_to_caption[i][0], glove, idfs)))
            for i in id2[1:]:
                bad_image = np.vstack((bad_image, resnet[i]))
            sim_to_good = cos_sim.cos_sim(model(good_image), text)
            sim_to_bad = cos_sim.cos_sim(model(bad_image), text)
            # compute the loss associated with our predictions(use softmax_cross_entropy)
            loss = margin_ranking_loss(sim_to_good, sim_to_bad, 1, 0.1)
            # back-propagate through your computational graph through your loss
            loss.backward()
            # compute the accuracy between the prediction and the truth
            acc = accuracy(sim_to_good.data, sim_to_bad.data)  # NOTE(review): computed but never reported
            # execute gradient descent by calling step() of optim
            optim.step()
            # null your gradients
            loss.null_gradients()
    # Persist the learned parameters.
    # NOTE(review): indentation reconstructed from a whitespace-mangled
    # source — confirm these saves belong after the training loops rather
    # than inside them.
    np.save('weight', model.dense1.parameters[0].data)
    np.save('bias', model.dense1.parameters[1].data)
def guess_category(dv, v):
    """Return the category whose vector is most cosine-similar to ``v``.

    Parameters
    ----------
    dv : mapping
        Category name -> array-like feature vector.
    v : array-like
        Feature vector to classify.

    Returns
    -------
    The key of ``dv`` with the highest cosine similarity to ``v``.
    """
    # Dict comprehension instead of dict([[k, v] ...]) — same pairs, clearer.
    sims = {cate: cos_sim(np.array(x), np.array(v)) for cate, x in dv.items()}
    return max(sims, key=sims.get)