def plot_losses(cls_losses, ssh_losses, fname):
    import matplotlib.pyplot as plt
    from scipy.stats import pearsonr
    from utils.misc import normalize

    cls_losses = normalize(cls_losses)
    ssh_losses = normalize(ssh_losses)
    correlation = pearsonr(cls_losses, ssh_losses)
    print('correlation: %.3f, significance: %.3f' % (correlation[0], correlation[1]))

    plt.scatter(cls_losses, ssh_losses, color='r', s=4)
    plt.xlabel('supervised loss')
    plt.ylabel('self-supervised loss')
    plt.savefig('%s_scatter.pdf' % fname)
    plt.close()
def plot_losses(cls_losses, ssh_losses, fname, use_agg=True):
    from utils.misc import normalize
    import matplotlib.pyplot as plt
    if use_agg:
        plt.switch_backend('agg')

    colors = ['r', 'g', 'b', 'm']
    labels = range(4)
    cls_losses = normalize(cls_losses)
    # One scatter plot (and one PDF) per self-supervised head.
    for losses, color, label in zip(ssh_losses, colors, labels):
        losses = normalize(losses)
        plt.scatter(cls_losses, losses, label=str(label), color=color, s=4)
        plt.xlabel('classification loss')
        plt.ylabel('rotation loss')
        plt.savefig('%s_scatter_%d.pdf' % (fname, label))
        plt.close()
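# A minimal usage sketch for the second plot_losses above (the names and data
# are illustrative; it assumes utils.misc.normalize accepts 1-D numpy arrays):
import numpy as np

cls_losses = np.random.rand(500)                      # per-sample supervised losses
ssh_losses = [np.random.rand(500) for _ in range(4)]  # one array per rotation head
plot_losses(cls_losses, ssh_losses, 'demo', use_agg=True)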
import argparse
# load_sparse_txt, load_dense_txt, normalize and prepare_similarities are the
# repo's own helpers, assumed to be imported at module level.


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--vector_file", type=str, required=True,
                        help="Path to the vector file.")
    parser.add_argument('--sparse', action='store_true',
                        help="Load sparse representation.")
    parser.add_argument('--normalize', action='store_true',
                        help="If set, vector is normalized.")
    parser.add_argument("--top_num", type=int, default=10,
                        help="The number of neighbours returned.")
    args = parser.parse_args()

    if args.sparse:
        matrix, vocab, _ = load_sparse_txt(args.vector_file)
    else:
        matrix, vocab, _ = load_dense_txt(args.vector_file)
    if args.normalize:
        matrix = normalize(matrix, args.sparse)
    top_num = args.top_num

    while True:
        target = input("Enter a word (EXIT to break): ")
        if target == "EXIT":
            break
        if target not in vocab["w2i"]:  # dict lookup instead of scanning the i2w list
            print("Out of vocabulary")
            continue
        target_vocab = {"i2w": [target], "w2i": {target: 0}}
        sim_matrix = prepare_similarities(matrix, target_vocab, vocab, args.sparse)

        # Keep a list of the top_num most similar words, sorted descending.
        neighbours = []
        for i, w in enumerate(vocab["i2w"]):
            sim = sim_matrix[0, i]
            if target == w:
                continue
            if len(neighbours) == 0:
                neighbours.append((w, sim))
                continue
            if sim <= neighbours[-1][1] and len(neighbours) >= top_num:
                continue
            for j in range(len(neighbours)):
                if sim > neighbours[j][1]:
                    neighbours.insert(j, (w, sim))
                    break
            else:
                # Not larger than any kept similarity, but the list is not
                # full yet: append at the end.
                neighbours.append((w, sim))
            if len(neighbours) > top_num:
                neighbours.pop(-1)

        print("{0: <20} {1: <20}".format("word", "similarity"))
        for w, sim in neighbours:
            print("{0: <20} {1: <20}".format(w, sim))
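# For reference, a simpler (hypothetical) equivalent of the neighbour loop in
# main() above, using numpy's argsort on one row of the similarity matrix;
# the function name and signature are illustrative:
import numpy as np

def top_neighbours(sim_row, i2w, target, top_num):
    order = np.argsort(-sim_row)  # candidate indices, most similar first
    picks = [(i2w[i], sim_row[i]) for i in order if i2w[i] != target]
    return picks[:top_num]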
import argparse

import numpy as np
from sparsesvd import sparsesvd
# load_vocabulary, load_sparse, normalize and save_dense are the repo's own
# helpers, assumed to be imported at module level.


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--ppmi_file", type=str, required=True,
                        help="Path to the counts (matrix) file.")
    parser.add_argument("--svd_file", type=str, required=True,
                        help="Path to the SVD file.")
    parser.add_argument("--input_vocab_file", type=str, required=True,
                        help="Path to the input vocabulary file.")
    parser.add_argument("--output_vocab_file", type=str, required=True,
                        help="Path to the output vocabulary file.")
    parser.add_argument("--size", type=int, default=100,
                        help="Vector size.")
    parser.add_argument("--normalize", action="store_true",
                        help="If set, factorize the normalized PPMI matrix.")
    args = parser.parse_args()

    print("Ppmi2svd")
    input_vocab, _ = load_vocabulary(args.input_vocab_file)
    output_vocab, _ = load_vocabulary(args.output_vocab_file)
    ppmi, _, _ = load_sparse(args.ppmi_file)
    if args.normalize:
        ppmi = normalize(ppmi, sparse=True)

    # Truncated SVD of the sparse PPMI matrix: ppmi ~ ut.T * diag(s) * vt.
    ut, s, vt = sparsesvd(ppmi.tocsc(), args.size)
    np.save(args.svd_file + ".ut.npy", ut)
    np.save(args.svd_file + ".s.npy", s)
    np.save(args.svd_file + ".vt.npy", vt)
    save_dense(args.svd_file + ".input", ut.T, input_vocab)
    save_dense(args.svd_file + ".output", vt.T, output_vocab)
    print("Ppmi2svd finished")
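# A small check sketch (hypothetical, not part of the script): the saved
# factors give a rank-`size` approximation of the PPMI matrix, and the input
# word vectors are the left singular vectors, optionally scaled by the
# singular values. File names assume the script was run with --svd_file svd.
import numpy as np

ut = np.load("svd.ut.npy")  # shape: (size, |input vocab|)
s = np.load("svd.s.npy")    # shape: (size,)
vt = np.load("svd.vt.npy")  # shape: (size, |output vocab|)

approx_ppmi = ut.T.dot(np.diag(s)).dot(vt)  # ppmi ~ U Sigma V^T
word_vectors = ut.T * s                     # a common choice: U Sigma (or U Sigma^0.5)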
def update_use_cbow(self, word, word_list):
    """
    word: the current (center) word
    word_list: the surrounding context words
    """
    if word not in self.word_dict:
        return
    huffman_code = self.word_dict[word]['Huffman']  # e.g. '001'
    vector_sum = np.zeros([1, self.vec_len])
    # Iterate backwards so that popping does not shift unvisited indices.
    for i in np.arange(len(word_list))[::-1]:  # i: 3, 2, 1, 0
        item = word_list[i]
        if item in self.word_dict:
            # In vocabulary: add its vector to the context sum.
            vector_sum += self.word_dict[item]['vector']
        else:
            # Out of vocabulary: drop it from the context window.
            word_list.pop(i)
    if len(word_list) == 0:
        # No context word is in the vocabulary; nothing to update.
        return
    e = self.go_along_huffman(huffman_code, vector_sum, self.huffman.root)
    for item in word_list:
        self.word_dict[item]['vector'] += e
        self.word_dict[item]['vector'] = normalize(self.word_dict[item]['vector'])
def save_word_vector(self):
    dir_save = './data'
    if not os.path.exists(dir_save):
        os.mkdir(dir_save)
    # Full entries (vector, Huffman code, ...) keyed by word.
    with open(dir_save + '/word_vector.pkl', 'wb') as f:
        pickle.dump(self.word_dict, f)
    # Unit-length vectors only, keyed by word.
    word_vector_norm = {w: normalize(self.word_dict[w]['vector'])
                        for w in self.word_dict}
    with open(dir_save + '/word_vector_norm.pkl', 'wb') as f:
        pickle.dump(word_vector_norm, f)
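# A minimal sketch of reading the saved vectors back, assuming the same
# ./data layout as in save_word_vector above:
import pickle

with open('./data/word_vector_norm.pkl', 'rb') as f:
    word_vector_norm = pickle.load(f)
# Each value is a (1, vec_len) numpy array normalized to unit length.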
import argparse

import numpy as np
from scipy.stats import spearmanr
# load_similarity, load_sparse, load_dense, align_matrix, normalize and
# similarity are the repo's own helpers, assumed to be imported at module level.


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--input_vector_file", type=str, required=True,
                        help="Path to the input vector file.")
    parser.add_argument("--output_vector_file", type=str,
                        help="Path to the output vector file.")
    parser.add_argument("--test_file", type=str, required=True,
                        help="Path to the similarity task.")
    parser.add_argument('--sparse', action='store_true',
                        help="Load sparse representation.")
    parser.add_argument('--normalize', action='store_true',
                        help="If set, vector is normalized.")
    parser.add_argument("--ensemble", type=str, default="input",
                        choices=["input", "output", "add", "concat"],
                        help="""Strategies for using input/output vectors.
                        One can use only input, only output, the addition of
                        input and output, or their concatenation.
                        Options are [input|output|add|concat].""")
    args = parser.parse_args()

    testset = load_similarity(args.test_file)
    if args.sparse:
        matrix, vocab, _ = load_sparse(args.input_vector_file)
    else:
        matrix, vocab, _ = load_dense(args.input_vector_file)

    # Ensembling input and output vectors is only supported for dense vectors.
    if not args.sparse:
        if args.ensemble == "add":
            output_matrix, output_vocab, _ = load_dense(args.output_vector_file)
            output_matrix = align_matrix(matrix, output_matrix, vocab, output_vocab)
            matrix = matrix + output_matrix
        elif args.ensemble == "concat":
            output_matrix, output_vocab, _ = load_dense(args.output_vector_file)
            output_matrix = align_matrix(matrix, output_matrix, vocab, output_vocab)
            matrix = np.concatenate([matrix, output_matrix], axis=1)
        elif args.ensemble == "output":
            matrix, vocab, _ = load_dense(args.output_vector_file)
        else:  # args.ensemble == "input"
            pass

    if args.normalize:
        matrix = normalize(matrix, args.sparse)

    results = []
    for (w1, w2), sim_expected in testset:
        sim_actual = similarity(matrix, vocab["w2i"], w1, w2, args.sparse)
        if sim_actual is not None:
            results.append((sim_actual, sim_expected))
    actual, expected = zip(*results)
    print("seen/total: {}/{}".format(len(results), len(testset)))
    print("{}: {:.3f}".format(args.test_file, spearmanr(actual, expected)[0]))
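# Example invocation (the script name and file paths are illustrative):
#
#   python similarity_eval.py --input_vector_file sgns.input \
#                             --output_vector_file sgns.output \
#                             --test_file ws353.txt --ensemble add --normalize
#
# This prints the test-set coverage and the Spearman correlation between the
# model's similarities and the human ratings.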
def update_use_skip_gram(self, word, word_list):
    if word not in self.word_dict:
        return
    word_vector = self.word_dict[word]['vector']
    # Drop out-of-vocabulary context words (iterate backwards so popping
    # does not shift unvisited indices).
    for i in np.arange(len(word_list))[::-1]:
        if word_list[i] not in self.word_dict:
            word_list.pop(i)
    if len(word_list) == 0:
        return
    # For each context word u, walk down u's Huffman path and apply the
    # accumulated gradient to the center word's vector.
    for u in word_list:
        u_huffman = self.word_dict[u]['Huffman']
        e = self.go_along_huffman(u_huffman, word_vector, self.huffman.root)
        self.word_dict[word]['vector'] += e
        self.word_dict[word]['vector'] = normalize(self.word_dict[word]['vector'])
def go_along_huffman(self, word_huffman, input_vector, root):
    """
    :param word_huffman: the Huffman code of the current word, e.g. '001'
    :param input_vector: the sum of the context words' vectors
    :param root: the root node of the Huffman tree
    :return: the accumulated gradient for the current word
    """
    node = root
    e = np.zeros([1, self.vec_len])
    for level in range(len(word_huffman)):  # walk down the Huffman tree
        huffman_char = word_huffman[level]  # '0' or '1'
        q = sigmoid(input_vector.dot(node.value.T))  # why not average the context sum before the dot product???
        grad = self.lr * (1 - int(huffman_char) - q)  # gradient at this node
        e += grad * node.value  # accumulate the update for the word vectors
        node.value += grad * input_vector  # update the inner node's vector
        node.value = normalize(node.value)
        if huffman_char == '0':
            node = node.right
        else:
            node = node.left
    return e
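# A minimal numeric sketch of one step of go_along_huffman at a single inner
# node, mirroring the conventions above (lr, sigmoid, the 1 - bit - q target);
# all values here are illustrative:
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

lr = 0.025                             # learning rate (self.lr above)
x = np.array([[0.1, -0.2, 0.3]])       # context sum (input_vector)
theta = np.array([[0.05, 0.05, 0.0]])  # inner node's vector (node.value)

q = sigmoid(x.dot(theta.T))  # predicted probability at this node
grad = lr * (1 - 0 - q)      # Huffman bit '0' -> target label 1
e = grad * theta             # contribution pushed back to the word vectors
theta = theta + grad * x     # the node's vector moves toward the context sum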
import argparse

import numpy as np
# load_analogy, get_ana_vocab, load_sparse, load_dense, align_matrix,
# normalize, retain_words, prepare_similarities and guess are the repo's own
# helpers, assumed to be imported at module level.


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--input_vector_file", type=str, required=True,
                        help="Path to the input vector file.")
    parser.add_argument("--output_vector_file", type=str,
                        help="Path to the output vector file.")
    parser.add_argument("--test_file", type=str, required=True,
                        help="Path to the analogy task.")
    parser.add_argument('--sparse', action='store_true',
                        help="Load sparse representation.")
    parser.add_argument('--normalize', action='store_true',
                        help="If set, vector is normalized.")
    parser.add_argument("--ensemble", type=str, default="input",
                        choices=["input", "output", "add", "concat"],
                        help="""Strategies for using input/output vectors.
                        One can use only input, only output, the addition of
                        input and output, or their concatenation.
                        Options are [input|output|add|concat].""")
    args = parser.parse_args()

    testset = load_analogy(args.test_file)
    ana_vocab, vocab = {}, {}
    ana_vocab["i2w"], ana_vocab["w2i"] = get_ana_vocab(testset)
    if args.sparse:
        matrix, vocab, _ = load_sparse(args.input_vector_file)
    else:
        matrix, vocab, _ = load_dense(args.input_vector_file)

    # Ensembling input and output vectors is only supported for dense vectors.
    if not args.sparse:
        if args.ensemble == "add":
            output_matrix, output_vocab, _ = load_dense(args.output_vector_file)
            output_matrix = align_matrix(matrix, output_matrix, vocab, output_vocab)
            matrix = matrix + output_matrix
        elif args.ensemble == "concat":
            output_matrix, output_vocab, _ = load_dense(args.output_vector_file)
            output_matrix = align_matrix(matrix, output_matrix, vocab, output_vocab)
            matrix = np.concatenate([matrix, output_matrix], axis=1)
        elif args.ensemble == "output":
            matrix, vocab, _ = load_dense(args.output_vector_file)
        else:  # args.ensemble == "input"
            pass

    if args.normalize:
        matrix = normalize(matrix, args.sparse)
    matrix, vocab["i2w"], vocab["w2i"] = retain_words(matrix, vocab["i2w"], vocab["w2i"])
    sim_matrix = prepare_similarities(matrix, ana_vocab, vocab, sparse=args.sparse)

    # Each test item is an analogy a : a_ :: b : b_, where b_ is the answer.
    seen, correct_add, correct_mul = 0, 0, 0
    for a, a_, b, b_ in testset:
        if a not in vocab["w2i"] or a_ not in vocab["w2i"] or b not in vocab["w2i"]:
            continue
        seen += 1
        guess_add, guess_mul = guess(sim_matrix, ana_vocab, vocab, a, a_, b)
        if guess_add == b_:
            correct_add += 1
        if guess_mul == b_:
            correct_mul += 1
    accuracy_add = float(correct_add) / seen
    accuracy_mul = float(correct_mul) / seen
    print("seen/total: {}/{}".format(seen, len(testset)))
    print("{}: {:.3f} {:.3f}".format(args.test_file, accuracy_add, accuracy_mul))
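# A hedged sketch of what guess() above is assumed to compute: the standard
# 3CosAdd and 3CosMul analogy objectives (Levy & Goldberg, 2014). sims_a,
# sims_a_ and sims_b stand for rows of sim_matrix (similarities of a, a_ and b
# to every vocabulary word), assumed shifted into (0, 1] so that 3CosMul is
# well defined; the function name and signature are illustrative.
import numpy as np

def guess_add_mul(sims_a, sims_a_, sims_b, i2w, exclude):
    add_scores = sims_b + sims_a_ - sims_a           # 3CosAdd: b_ ~ b + a_ - a
    mul_scores = sims_b * sims_a_ / (sims_a + 1e-3)  # 3CosMul
    for idx in exclude:  # never return a, a_ or b themselves
        add_scores[idx] = mul_scores[idx] = -np.inf
    return i2w[int(np.argmax(add_scores))], i2w[int(np.argmax(mul_scores))]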