Example #1
def plot_losses(cls_losses, ssh_losses, fname):
    import matplotlib.pyplot as plt
    from scipy.stats import pearsonr
    from utils.misc import normalize

    # Put both loss arrays on a common scale before correlating them.
    cls_losses = normalize(cls_losses)
    ssh_losses = normalize(ssh_losses)
    correlation = pearsonr(cls_losses, ssh_losses)
    print('correlation: %.3f, significance: %.3f' %
          (correlation[0], correlation[1]))
    plt.scatter(cls_losses, ssh_losses, color='r', s=4)
    plt.xlabel('supervised loss')
    plt.ylabel('self-supervised loss')
    plt.savefig('%s_scatter.pdf' % (fname))
    plt.close()
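Both plotting helpers in this listing import normalize from utils.misc, which is not shown. A minimal sketch, assuming simple min-max scaling into [0, 1] (the scaling choice is an assumption, not confirmed by the source):

import numpy as np

def normalize(values):
    # Assumed behaviour: min-max scaling into [0, 1].
    values = np.asarray(values, dtype=np.float64)
    span = values.max() - values.min()
    if span == 0:
        return np.zeros_like(values)
    return (values - values.min()) / span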
Example #2
def plot_losses(cls_losses, ssh_losses, fname, use_agg=True):
    from utils.misc import normalize
    import matplotlib.pyplot as plt
    if use_agg:
        plt.switch_backend('agg')

    colors = ['r', 'g', 'b', 'm']
    labels = range(4)
    cls_losses = normalize(cls_losses)
    # One scatter plot (and one output file) per self-supervised head.
    for losses, color, label in zip(ssh_losses, colors, labels):
        losses = normalize(losses)
        plt.scatter(cls_losses, losses, label=str(label), color=color, s=4)
        plt.xlabel('classification loss')
        plt.ylabel('rotation loss')
        plt.savefig('%s_scatter_%d.pdf' % (fname, label))
        plt.close()
Example #3
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--vector_file",
                        type=str,
                        required=True,
                        help="Path to the vector file.")
    parser.add_argument('--sparse',
                        action='store_true',
                        help="Load sparse representation.")
    parser.add_argument('--normalize',
                        action='store_true',
                        help="If set, vector is normalized.")
    parser.add_argument("--top_num",
                        type=int,
                        default=10,
                        help="The number of neighbours returned.")

    args = parser.parse_args()

    if args.sparse:
        matrix, vocab, _ = load_sparse_txt(args.vector_file)
    else:
        matrix, vocab, _ = load_dense_txt(args.vector_file)

    if args.normalize:
        matrix = normalize(matrix, args.sparse)
    top_num = args.top_num

    while True:
        target = input("Enter a word (EXIT to break): ")
        if target == "EXIT":
            break
        if target not in vocab["i2w"]:
            print("Out of vocabulary")
            continue
        target_vocab = {}
        target_vocab["i2w"], target_vocab["w2i"] = [target], {target: 0}
        sim_matrix = prepare_similarities(matrix, target_vocab, vocab,
                                          args.sparse)
        neighbours = []
        for i, w in enumerate(vocab["i2w"]):
            sim = sim_matrix[0, i]
            if target == w:
                continue
            if len(neighbours) == 0:
                neighbours.append((w, sim))
                continue
            if sim <= neighbours[-1][1] and len(neighbours) >= top_num:
                continue
            # Insert in descending order of similarity; if sim is the new
            # minimum but the list is not full yet, append it at the end.
            for j in range(len(neighbours)):
                if sim > neighbours[j][1]:
                    neighbours.insert(j, (w, sim))
                    break
            else:
                neighbours.append((w, sim))
            if len(neighbours) > top_num:
                neighbours.pop(-1)

        print("{0: <20} {1: <20}".format("word", "similarity"))
        for w, sim in neighbours:
            print("{0: <20} {1: <20}".format(w, sim))
Example #4
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--ppmi_file", type=str, required=True,
                        help="Path to the counts (matrix) file.")
    parser.add_argument("--svd_file", type=str, required=True,
                        help="Path to the SVD file.")
    parser.add_argument("--input_vocab_file", type=str, required=True,
                        help="Path to the input vocabulary file.")
    parser.add_argument("--output_vocab_file", type=str, required=True,
                        help="Path to the output vocabulary file.")

    parser.add_argument("--size", type=int, default=100,
                        help="Vector size.")
    parser.add_argument("--normalize", action="store_true",
                        help="If set, we factorize normalized PPMI matrix")

    args = parser.parse_args()

    print("Ppmi2svd")
    input_vocab, _ = load_vocabulary(args.input_vocab_file)
    output_vocab, _ = load_vocabulary(args.output_vocab_file)
    ppmi, _, _ = load_sparse(args.ppmi_file)
    if args.normalize:
        ppmi = normalize(ppmi, sparse=True)
    # sparsesvd returns ut of shape (size, n_rows), s of shape (size,), and
    # vt of shape (size, n_cols) for an (n_rows, n_cols) input matrix.
    ut, s, vt = sparsesvd(ppmi.tocsc(), args.size)

    np.save(args.svd_file + ".ut.npy", ut)
    np.save(args.svd_file + ".s.npy", s)
    np.save(args.svd_file + ".vt.npy", vt)

    save_dense(args.svd_file + ".input", ut.T, input_vocab)
    save_dense(args.svd_file + ".output", vt.T, output_vocab)
    print("Ppmi2svd finished")
Example #5
    def update_use_cbow(self, word, word_list):
        """
        word: the current (center) word
        word_list: the surrounding context words
        """
        if word not in self.word_dict:
            return

        huffman_code = self.word_dict[word]['Huffman']  # e.g. 001
        vector_sum = np.zeros([1, self.vec_len])
        for i in np.arange(len(word_list))[::-1]:  # iterate backwards: i = 3, 2, 1, 0
            item = word_list[i]
            if item in self.word_dict:  # in the vocabulary: accumulate its vector
                vector_sum += self.word_dict[item]['vector']
            else:  # otherwise drop it from the context list
                word_list.pop(i)

        if len(word_list) == 0:  # no context word is in the vocabulary
            return

        e = self.go_along_huffman(huffman_code, vector_sum, self.huffman.root)

        for item in word_list:
            self.word_dict[item]['vector'] += e
            self.word_dict[item]['vector'] = normalize(self.word_dict[item]['vector'])
Example #6
    def save_word_vector(self):
        dir_save = './data'
        if not os.path.exists(dir_save):
            os.mkdir(dir_save)
        with open(dir_save + '/word_vector.pkl', 'wb') as f:
            pickle.dump(self.word_dict, f)
        word_vector_norm = {w: normalize(self.word_dict[w]['vector'])
                            for w in self.word_dict}
        with open(dir_save + '/word_vector_norm.pkl', 'wb') as f:
            pickle.dump(word_vector_norm, f)
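Reading the pickled vectors back mirrors the save above:

import pickle

with open('./data/word_vector_norm.pkl', 'rb') as f:
    word_vector_norm = pickle.load(f)  # maps each word to its normalized vector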
Example #7
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--input_vector_file", type=str, required=True,
                        help="Path to the input vector file.")
    parser.add_argument("--output_vector_file", type=str,
                        help="Path to the output vector file.")
    parser.add_argument("--test_file", type=str, required=True,
                        help="Path to the similarity task.")
    parser.add_argument('--sparse', action='store_true',
                        help="Load sparse representation.")
    parser.add_argument('--normalize', action='store_true',
                        help="If set, vector is normalized.")
    parser.add_argument("--ensemble", type=str, default="input",
                        choices=["input", "output", "add", "concat"],
                        help="""Strategies for using input/output vectors.
                        One can use only input, only output, the addition of input and output,
                        or their concatenation. Options are
                        [input|output|add|concat].""")

    args = parser.parse_args()
    
    testset = load_similarity(args.test_file)
    if args.sparse:
        matrix, vocab, _ = load_sparse(args.input_vector_file)
    else:
        matrix, vocab, _ = load_dense(args.input_vector_file)

    if not args.sparse:
        if args.ensemble == "add":
            output_matrix, output_vocab, _ = load_dense(args.output_vector_file)
            output_matrix = align_matrix(matrix, output_matrix, vocab, output_vocab)
            matrix = matrix + output_matrix
        elif args.ensemble == "concat":
            output_matrix, output_vocab, _ = load_dense(args.output_vector_file)
            output_matrix = align_matrix(matrix, output_matrix, vocab, output_vocab)
            matrix = np.concatenate([matrix, output_matrix], axis=1)
        elif args.ensemble == "output":
            matrix, vocab, _ = load_dense(args.output_vector_file)
        else:  # args.ensemble == "input"
            pass

    if args.normalize:
        matrix = normalize(matrix, args.sparse)

    results = []
    for (w1, w2), sim_expected in testset:
        sim_actual = similarity(matrix, vocab["w2i"], w1, w2, args.sparse)
        if sim_actual is not None:
            results.append((sim_actual, sim_expected))
    actual, expected = zip(*results)
    print("seen/total: {}/{}".format(len(results), len(testset)))
    print("{}: {:.3f}".format(args.test_file, spearmanr(actual, expected)[0]))
Example #8
    def update_use_skip_gram(self, word, word_list):
        """
        word: the current (center) word
        word_list: the surrounding context words
        """
        if word not in self.word_dict:
            return

        word_vector = self.word_dict[word]['vector']
        for i in np.arange(len(word_list))[::-1]:  # drop out-of-vocabulary context words
            if word_list[i] not in self.word_dict:
                word_list.pop(i)

        if len(word_list) == 0:
            return

        for u in word_list:
            u_huffman = self.word_dict[u]['Huffman']
            e = self.go_along_huffman(u_huffman, word_vector, self.huffman.root)
            self.word_dict[word]['vector'] += e
            self.word_dict[word]['vector'] = normalize(self.word_dict[word]['vector'])
Example #9
    def go_along_huffman(self, word_huffman, input_vector, root):
        """
        :param word_huffman:    Huffman code of the current word, e.g. 001
        :param input_vector:    sum of the context words' vectors
        :param root:            root node of the Huffman tree
        :return:                accumulated gradient update for the current word
        """
        node = root
        e = np.zeros([1, self.vec_len])
        for level in range(len(word_huffman)):           # walk down the Huffman tree
            huffman_char = word_huffman[level]           # '0' or '1'
            q = sigmoid(input_vector.dot(node.value.T))  # why not average before the dot product???
            grad = self.lr * (1 - int(huffman_char) - q) # gradient step for this node
            e += grad * node.value                       # accumulate the word's own update
            node.value += grad * input_vector            # update the inner node's vector
            node.value = normalize(node.value)
            if huffman_char == '0':
                node = node.right
            else:
                node = node.left
        return e
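sigmoid and normalize are used throughout the word2vec snippets but never defined in them; minimal sketches (the unit-norm behaviour assumed for normalize here differs from the min-max variant sketched for the loss plots, and both are assumptions):

import numpy as np

def sigmoid(x):
    # Element-wise logistic function.
    return 1.0 / (1.0 + np.exp(-x))

def normalize(vector):
    # Assumed behaviour: rescale to unit L2 norm; leave zero vectors unchanged.
    norm = np.linalg.norm(vector)
    return vector / norm if norm > 0 else vector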
Example #10
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--input_vector_file",
                        type=str,
                        required=True,
                        help="")
    parser.add_argument("--output_vector_file", type=str, help="")
    parser.add_argument("--test_file", type=str, required=True, help="")
    parser.add_argument('--sparse',
                        action='store_true',
                        help="Load sparse representation.")
    parser.add_argument('--normalize',
                        action='store_true',
                        help="If set, vector is normalized.")
    parser.add_argument("--ensemble",
                        type=str,
                        default="input",
                        choices=["input", "output", "add", "concat"],
                        help="""Strategies for using input/output vectors.
                        One can use only input, only output, the addition of input and output,
                        or their concatenation. Options are
                        [input|output|add|concat].""")

    args = parser.parse_args()

    testset = load_analogy(args.test_file)
    ana_vocab, vocab = {}, {}
    ana_vocab["i2w"], ana_vocab["w2i"] = get_ana_vocab(testset)
    if args.sparse:
        matrix, vocab, _ = load_sparse(args.input_vector_file)
    else:
        matrix, vocab, _ = load_dense(args.input_vector_file)

    if not args.sparse:
        if args.ensemble == "add":
            output_matrix, output_vocab, _ = load_dense(
                args.output_vector_file)
            output_matrix = align_matrix(matrix, output_matrix, vocab,
                                         output_vocab)
            matrix = matrix + output_matrix
        elif args.ensemble == "concat":
            output_matrix, output_vocab, _ = load_dense(
                args.output_vector_file)
            output_matrix = align_matrix(matrix, output_matrix, vocab,
                                         output_vocab)
            matrix = np.concatenate([matrix, output_matrix], axis=1)
        elif args.ensemble == "output":
            matrix, vocab, _ = load_dense(args.output_vector_file)
        else:  # args.ensemble == "input"
            pass

    if args.normalize:
        matrix = normalize(matrix, args.sparse)

    matrix, vocab["i2w"], vocab["w2i"] = retain_words(matrix, vocab["i2w"],
                                                      vocab["w2i"])
    sim_matrix = prepare_similarities(matrix,
                                      ana_vocab,
                                      vocab,
                                      sparse=args.sparse)

    seen, correct_add, correct_mul = 0, 0, 0
    for a, a_, b, b_ in testset:
        if a not in vocab["i2w"] or a_ not in vocab["i2w"] or b not in vocab["i2w"]:
            continue
        seen += 1
        guess_add, guess_mul = guess(sim_matrix, ana_vocab, vocab, a, a_, b)
        if guess_add == b_:
            correct_add += 1
        if guess_mul == b_:
            correct_mul += 1
    accuracy_add = float(correct_add) / seen if seen else 0.0
    accuracy_mul = float(correct_mul) / seen if seen else 0.0
    print("seen/total: {}/{}".format(seen, len(testset)))
    print("{}: {:.3f} {:.3f}".format(args.test_file, accuracy_add,
                                     accuracy_mul))
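guess is not shown in this listing; for an analogy a : a* :: b : b* it presumably scores candidates with the additive (3CosAdd) and multiplicative (3CosMul) objectives of Levy and Goldberg (2014). A rough sketch over precomputed similarity rows; every name below is illustrative, and the similarities are assumed to be non-negative for the multiplicative score:

import numpy as np

def guess_sketch(sim_a, sim_a_, sim_b, i2w, exclude, eps=1e-3):
    # sim_a, sim_a_, sim_b: similarity of every vocabulary word to a, a*, b.
    add_scores = sim_a_ - sim_a + sim_b          # 3CosAdd
    mul_scores = sim_a_ * sim_b / (sim_a + eps)  # 3CosMul
    mask = [i for i, w in enumerate(i2w) if w in exclude]  # never guess a, a*, b
    add_scores[mask] = -np.inf
    mul_scores[mask] = -np.inf
    return i2w[int(np.argmax(add_scores))], i2w[int(np.argmax(mul_scores))]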