def eval(self, cur_lang, x, y, label_vecs, bs=8, av='micro', L=0, source=None, avg=True, mode='none'):
    """ Evaluate the model on the given validation or test set. """
    cur_lang = 0 if source is not None else cur_lang
    preds, real, watts, satts = [], [], [], []
    batch, curbatch, init = 0, 0, 0
    rls, aps, oes, elapsed = [], [], [], 0.0
    total = len(x)
    keys = list(x.keys())
    num_labels = label_vecs.shape[1]
    if mode == 'seen':
        eval_ids = pickle.load(open(self.args['seen_ids']))
        eval_ids = self.revids[eval_ids]  # select evaluation ids
    elif mode == 'unseen':
        eval_ids = pickle.load(open(self.args['unseen_ids']))
        eval_ids = self.revids[eval_ids]  # select evaluation ids
    else:  # validation
        eval_ids = np.arange(label_vecs.shape[1])
        total = 5000  # use a small sample for validation, otherwise evaluation is too slow

    while batch < total / (1.0 * bs):
        start_time = time.time()
        init_ids = [init + curbatch + cur for cur in range(bs) if init + curbatch + cur < len(keys)]
        idxs = np.array(keys)[init_ids]
        x_vecs, y_vecs = load_vectors(x, y, idxs, self.args['wpad'], num_labels, self)

        if self.args['la']:  # zero-shot models
            if self.args['train']:
                # Predictions for all labels are built chunk by chunk, because sampling
                # fixes the label-vocabulary size the model expects per forward pass.
                ll = int(self.args["sampling"] * num_labels)
                done, pred, pi = False, None, 0
                while not done:
                    if pi == 0:
                        totest = label_vecs[:, :ll]
                    else:
                        totest = label_vecs[:, pi * ll:(pi + 1) * ll]
                    if totest.shape[1] != ll:
                        # Last chunk: remember its true width and zero-pad up to ll.
                        remained = totest.shape[1]
                        totest = np.hstack([totest, np.zeros((bs, ll - totest.shape[1], totest.shape[2]))])
                        done = True
                    cur_pred = self.model.predict([np.array(x_vecs), totest], batch_size=self.args['bs'])
                    if pred is None:
                        pred = cur_pred
                    elif done:
                        pred = np.hstack([pred, cur_pred[:, :remained]])
                    else:
                        pred = np.hstack([pred, cur_pred])
                    pi += 1
            else:
                pred = self.model.predict([np.array(x_vecs), label_vecs], batch_size=self.args['bs'])
        else:  # non-zero-shot models
            pred = self.model.predict(np.array(x_vecs), batch_size=self.args['bs'])

        real = np.array(y_vecs)
        pred = np.array(pred)
        rls.append(rankloss(real[:, eval_ids], pred[:, eval_ids]))
        aps.append(avgprec(real[:, eval_ids], pred[:, eval_ids]))
        cur_oes = [one_error(real[j][eval_ids], pred[j][eval_ids]) for j in range(len(pred))]
        oes.append(np.array(cur_oes).mean())
        elapsed += time.time() - start_time
        sys.stdout.write("\t%d/%d rls=%.5f - aps=%.5f - oe=%.5f \t %ds\r" % (
            (batch + 1) * bs, len(x), np.array(rls).mean(), np.array(aps).mean(), np.array(oes).mean(), elapsed))
        sys.stdout.flush()
        batch += 1
        curbatch += bs

    if avg:
        rls = np.array(rls).mean()
        aps = np.array(aps).mean()
        oes = np.array(oes).mean()
        print("rl: %.4f - ap: %.4f - oe: %.4f" % (rls, aps, oes))
    return rls, aps, oes
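# NOTE: load_vectors, rankloss, avgprec and one_error used by eval() above are assumed
# to be defined elsewhere in this repository. A minimal, illustrative sketch of the three
# ranking metrics, assuming they follow the standard multi-label definitions (scikit-learn
# already provides the first two), could look like this; it is not the repository's code:
import numpy as np
from sklearn.metrics import label_ranking_loss, label_ranking_average_precision_score

def rankloss(y_true, y_score):
    # Average fraction of (relevant, irrelevant) label pairs ranked in the wrong order.
    return label_ranking_loss(y_true, y_score)

def avgprec(y_true, y_score):
    # Label-ranking average precision, averaged over samples.
    return label_ranking_average_precision_score(y_true, y_score)

def one_error(y_true_row, y_score_row):
    # One-error for a single sample: 1 if the top-ranked label is not relevant, else 0.
    return 0.0 if y_true_row[np.argmax(y_score_row)] > 0 else 1.0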
# Imports assumed by this fragment (avgprec is the repository's own helper):
# import numpy as np
# from scipy.io import savemat
# from sklearn.metrics import label_ranking_loss, coverage_error

model.compile("adam", "binary_crossentropy", metrics=["binary_accuracy", "binary_crossentropy"])
# model.fit(train_x, train_y, batch_size=16, epochs=2)  # , validation_data=(x_val, y_val))
for i in range(10):
    model.fit(train_x, train_y, batch_size=32, epochs=3, verbose=2)
    pred_y = model.predict(test_x)
    savemat('result_large-reg-3106_' + str(count) + '_' + str(i) + '.mat',
            {'pred_y': pred_y, 'test_y': test_y})
    ap_list.append(avgprec(test_y, pred_y))
    rl_list.append(label_ranking_loss(test_y, pred_y))
    # sklearn's coverage_error counts ranks starting at 1, so subtract 1 to match
    # the common coverage definition.
    ce_list.append(coverage_error(test_y, pred_y) - 1)
    print('ap_list: {}'.format(ap_list))
    print('rl_list: {}'.format(rl_list))
    print('ce_list: {}'.format(ce_list))

# count is incremented once per outer run (the enclosing loop is not shown in this fragment).
count += 1

ap_values = np.array(ap_list).reshape((5, 10))
rl_values = np.array(rl_list).reshape((5, 10))
ce_values = np.array(ce_list).reshape((5, 10))

with open('new_encoding_3106_lrg_reg-250.txt', 'w') as result_file:
    result_file.write('the ap score is: \n')
    result_file.write(str(ap_values) + '\n')
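# A minimal summary sketch, assuming the (5, 10) reshape above corresponds to 5 outer
# runs (count) x 10 fit/evaluate rounds (i); that axis interpretation is an assumption,
# not something stated in the original script.
def summarize(name, values):
    # Mean and standard deviation over the 5 runs for each of the 10 rounds.
    print('{}: mean per round = {}'.format(name, values.mean(axis=0)))
    print('{}: std  per round = {}'.format(name, values.std(axis=0)))

summarize('ap', ap_values)
summarize('rl', rl_values)
summarize('ce', ce_values)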