Esempio n. 1
0
def main():
    """Interactively print the top-scoring neighbours of words typed by the user.

    Command-line options:
        --vector_file  path of the vector file to load (required).
        --sparse       load the file with ``load_sparse`` instead of
                       ``load_dense``.
        --normalize    pass the loaded matrix through ``normalize``.
        --top_num      how many neighbours to print per query (default 10).

    After loading, the function loops reading a word from standard input.
    ``EXIT`` ends the loop; a word that is not in ``vocab["i2w"]`` is
    reported as out of vocabulary. For any other word, the scores returned
    by ``prepare_similarities`` are ranked and the ``top_num`` best words
    (excluding the query word itself) are printed in descending order.
    """
    import heapq  # local import: only this entry point needs it

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--vector_file",
                        type=str,
                        required=True,
                        help="Path to the vector file.")
    parser.add_argument('--sparse',
                        action='store_true',
                        help="Load sparse representation.")
    parser.add_argument('--normalize',
                        action='store_true',
                        help="If set, vector is normalized.")
    parser.add_argument("--top_num",
                        type=int,
                        default=10,
                        help="The number of neighbours returned.")

    args = parser.parse_args()

    if args.sparse:
        matrix, vocab, _ = load_sparse(args.vector_file)
    else:
        matrix, vocab, _ = load_dense(args.vector_file)

    if args.normalize:
        matrix = normalize(matrix, args.sparse)
    top_num = args.top_num

    while True:
        target = input("Enter a word (EXIT to break): ")
        if target == "EXIT":
            break
        if target not in vocab["i2w"]:
            print("Out of vocabulary")
            continue

        # A one-word vocabulary so that row 0 of the result holds the scores
        # of the query word against every word of the full vocabulary.
        target_vocab = {"i2w": [target], "w2i": {target: 0}}
        sim_matrix = prepare_similarities(matrix, target_vocab, vocab,
                                          args.sparse)

        candidates = ((w, sim_matrix[0, i])
                      for i, w in enumerate(vocab["i2w"])
                      if w != target)
        # nlargest keeps every candidate in play until the list is full (the
        # previous hand-written insertion silently dropped any candidate that
        # did not beat an existing entry while the list was still short) and
        # preserves vocabulary order among equal scores.
        neighbours = heapq.nlargest(top_num, candidates,
                                    key=lambda pair: pair[1])

        print("{0: <20} {1: <20}".format("word", "similarity"))
        for w, sim in neighbours:
            print("{0: <20} {1: <20}".format(w, sim))
Esempio n. 2
0
def main():
    """Factorize a PPMI matrix with ``sparsesvd`` and write the results to disk.

    Reads the matrix from ``--ppmi_file`` and the vocabularies from
    ``--input_vocab_file`` / ``--output_vocab_file``, optionally runs the
    matrix through ``normalize`` (``--normalize``), and calls ``sparsesvd``
    with ``--size`` as its second argument. The three returned arrays are
    saved as ``<svd_file>.ut.npy``, ``<svd_file>.s.npy`` and
    ``<svd_file>.vt.npy``; the transposes of ``ut`` and ``vt`` are also
    written with ``save_dense`` to ``<svd_file>.input`` and
    ``<svd_file>.output`` together with the matching vocabulary.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # All path options share the same shape: a required string.
    required_paths = (
        ("--ppmi_file", "Path to the counts (matrix) file."),
        ("--svd_file", "Path to the SVD file."),
        ("--input_vocab_file", "Path to the input vocabulary file."),
        ("--output_vocab_file", "Path to the output vocabulary file."),
    )
    for flag, description in required_paths:
        arg_parser.add_argument(flag, type=str, required=True,
                                help=description)

    arg_parser.add_argument("--size", type=int, default=100,
                            help="Vector size.")
    arg_parser.add_argument("--normalize", action="store_true",
                            help="If set, we factorize normalized PPMI matrix")

    options = arg_parser.parse_args()

    print("Ppmi2svd")
    input_vocab, _unused_in = load_vocabulary(options.input_vocab_file)
    output_vocab, _unused_out = load_vocabulary(options.output_vocab_file)
    ppmi, _unused_vocab, _unused_extra = load_sparse(options.ppmi_file)
    if options.normalize:
        ppmi = normalize(ppmi, sparse=True)

    # sparsesvd is handed the matrix in CSC form.
    ut, s, vt = sparsesvd(ppmi.tocsc(), options.size)

    prefix = options.svd_file
    for suffix, array in ((".ut.npy", ut), (".s.npy", s), (".vt.npy", vt)):
        np.save(prefix + suffix, array)

    save_dense(prefix + ".input", ut.T, input_vocab)
    save_dense(prefix + ".output", vt.T, output_vocab)
    print("Ppmi2svd finished")
def main():
    """Evaluate word vectors on a word-similarity test file.

    Loads the test set with ``load_similarity`` and the vectors from
    ``--input_vector_file`` (sparse or dense, per ``--sparse``). For dense
    vectors, ``--ensemble`` selects how the vectors from
    ``--output_vector_file`` are used: ignored (``input``), used alone
    (``output``), added to the aligned input matrix (``add``) or concatenated
    with it along axis 1 (``concat``). With ``--sparse`` the ensemble option
    is not applied.

    Every test pair is scored with ``similarity``; pairs for which it returns
    ``None`` are skipped. The function prints how many pairs were scored and
    the Spearman correlation between the computed and the expected scores.

    Exits through ``parser.error`` when the chosen ensemble needs
    ``--output_vector_file`` but none was given.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--input_vector_file", type=str, required=True,
                        help="Path to the input vector file.")
    parser.add_argument("--output_vector_file", type=str,
                        help="Path to the output vector file.")
    parser.add_argument("--test_file", type=str, required=True,
                        help="Path to the similarity task.")
    parser.add_argument('--sparse', action='store_true',
                        help="Load sparse representation.")
    parser.add_argument('--normalize', action='store_true',
                        help="If set, vector is normalized.")
    parser.add_argument("--ensemble", type=str, default="input",
                        choices=["input", "output", "add", "concat"],
                        help="""Strategies for using input/output vectors.
                        One can use only input, only output, the addition of input and output,
                        or their concatenation. Options are
                        [input|output|add|concat].""")

    args = parser.parse_args()

    # Fail fast with a usage message instead of passing None to load_dense.
    needs_output = not args.sparse and args.ensemble != "input"
    if needs_output and args.output_vector_file is None:
        parser.error("--output_vector_file is required when --ensemble is "
                     "'{}'".format(args.ensemble))

    testset = load_similarity(args.test_file)
    if args.sparse:
        matrix, vocab, _ = load_sparse(args.input_vector_file)
    else:
        matrix, vocab, _ = load_dense(args.input_vector_file)

    if needs_output:
        output_matrix, output_vocab, _ = load_dense(args.output_vector_file)
        if args.ensemble == "output":
            matrix, vocab = output_matrix, output_vocab
        else:
            # "add" and "concat" both combine the input matrix with the
            # output matrix after aligning it to the input vocabulary.
            output_matrix = align_matrix(matrix, output_matrix, vocab, output_vocab)
            if args.ensemble == "add":
                matrix = matrix + output_matrix
            else:  # args.ensemble == "concat"
                matrix = np.concatenate([matrix, output_matrix], axis=1)

    if args.normalize:
        matrix = normalize(matrix, args.sparse)

    results = []
    for (w1, w2), sim_expected in testset:
        sim_actual = similarity(matrix, vocab["w2i"], w1, w2, args.sparse)
        if sim_actual is not None:
            results.append((sim_actual, sim_expected))

    print("seen/total: {}/{}".format(len(results), len(testset)))
    if not results:
        # zip(*[]) cannot be unpacked into two sequences, and a correlation
        # over zero pairs is undefined, so report it and stop.
        print("{}: no test pair could be scored".format(args.test_file))
        return

    actual, expected = zip(*results)
    print("{}: {:.3f}".format(args.test_file, spearmanr(actual, expected)[0]))
Esempio n. 4
0
def main():
    """Evaluate word vectors on an analogy test file.

    Loads the test set with ``load_analogy`` and the vectors from
    ``--input_vector_file`` (sparse or dense, per ``--sparse``). For dense
    vectors, ``--ensemble`` selects how the vectors from
    ``--output_vector_file`` are used: ignored (``input``), used alone
    (``output``), added to the aligned input matrix (``add``) or concatenated
    with it along axis 1 (``concat``). With ``--sparse`` the ensemble option
    is not applied.

    Each question ``(a, a_, b, b_)`` whose first three words are all in the
    vocabulary is passed to ``guess``, which returns two answers; each answer
    is counted as correct when it equals ``b_``. The function prints how many
    questions were attempted and the two resulting accuracies (both reported
    as 0.0 when no question could be attempted).

    Exits through ``parser.error`` when the chosen ensemble needs
    ``--output_vector_file`` but none was given.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--input_vector_file",
                        type=str,
                        required=True,
                        help="")
    parser.add_argument("--output_vector_file", type=str, help="")
    parser.add_argument("--test_file", type=str, required=True, help="")
    parser.add_argument('--sparse',
                        action='store_true',
                        help="Load sparse representation.")
    parser.add_argument('--normalize',
                        action='store_true',
                        help="If set, vector is normalized.")
    parser.add_argument("--ensemble",
                        type=str,
                        default="input",
                        choices=["input", "output", "add", "concat"],
                        help="""Strategies for using input/output vectors.
                        One can use only input, only output, the addition of input and output,
                        or their concatenation. Options are
                        [input|output|add|concat].""")

    args = parser.parse_args()

    # Fail fast with a usage message instead of passing None to load_dense.
    needs_output = not args.sparse and args.ensemble != "input"
    if needs_output and args.output_vector_file is None:
        parser.error("--output_vector_file is required when --ensemble is "
                     "'{}'".format(args.ensemble))

    testset = load_analogy(args.test_file)
    ana_vocab = {}
    ana_vocab["i2w"], ana_vocab["w2i"] = get_ana_vocab(testset)
    if args.sparse:
        matrix, vocab, _ = load_sparse(args.input_vector_file)
    else:
        matrix, vocab, _ = load_dense(args.input_vector_file)

    if needs_output:
        output_matrix, output_vocab, _ = load_dense(args.output_vector_file)
        if args.ensemble == "output":
            matrix, vocab = output_matrix, output_vocab
        else:
            # "add" and "concat" both combine the input matrix with the
            # output matrix after aligning it to the input vocabulary.
            output_matrix = align_matrix(matrix, output_matrix, vocab,
                                         output_vocab)
            if args.ensemble == "add":
                matrix = matrix + output_matrix
            else:  # args.ensemble == "concat"
                matrix = np.concatenate([matrix, output_matrix], axis=1)

    if args.normalize:
        matrix = normalize(matrix, args.sparse)

    matrix, vocab["i2w"], vocab["w2i"] = retain_words(matrix, vocab["i2w"],
                                                      vocab["w2i"])
    sim_matrix = prepare_similarities(matrix,
                                      ana_vocab,
                                      vocab,
                                      sparse=args.sparse)

    # Build the membership set once: testing against the i2w sequence
    # directly would rescan the whole vocabulary three times per question.
    known_words = set(vocab["i2w"])

    seen, correct_add, correct_mul = 0, 0, 0
    for a, a_, b, b_ in testset:
        if not (a in known_words and a_ in known_words and b in known_words):
            continue
        seen += 1
        guess_add, guess_mul = guess(sim_matrix, ana_vocab, vocab, a, a_, b)
        if guess_add == b_:
            correct_add += 1
        if guess_mul == b_:
            correct_mul += 1

    # Avoid ZeroDivisionError when no question is covered by the vocabulary.
    accuracy_add = float(correct_add) / seen if seen else 0.0
    accuracy_mul = float(correct_mul) / seen if seen else 0.0
    print("seen/total: {}/{}".format(seen, len(testset)))
    print("{}: {:.3f} {:.3f}".format(args.test_file, accuracy_add,
                                     accuracy_mul))