Code example #1
    def test_save(self):
        embs = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
        path_save = "/tmp/vecto/saved"
        embs.save_to_dir(path_save)
        embs = load_from_dir(path_save)
        print(embs.matrix.shape)
        embs.save_to_dir_plain_txt("/tmp/vecto/saved_plain")
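These snippets are shown without their imports. For reference, a minimal self-contained version of the save/load round trip above could look like the sketch below; the module path vecto.embeddings is an assumption based on the vecto project layout, so check it against your installed version.

# Sketch of the round trip in code example #1, assuming vecto exposes
# load_from_dir from vecto.embeddings as these snippets suggest.
from vecto.embeddings import load_from_dir

embs = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
embs.save_to_dir("/tmp/vecto/saved")
restored = load_from_dir("/tmp/vecto/saved")
# A successful round trip should preserve the matrix shape.
assert restored.matrix.shape == embs.matrix.shape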
Code example #2
File: test_embeddings.py Project: vecto-ai/vecto
    def test_save(self):
        embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
        path_save = path.join('/tmp', 'vecto', 'saved')
        embs.save_to_dir(path_save)
        embs = load_from_dir(path_save)
        print(embs.matrix.shape)
        embs.save_to_dir_plain_txt(path.join('/tmp', 'vecto', 'saved_plain'))
Code example #3
def main():
    # config = load_config()
    # print(config)
    parser = argparse.ArgumentParser()
    parser.add_argument("embeddings")
    parser.add_argument("--window_size", default=5, type=int)
    parser.add_argument("--test", default=True, help='use small test dataset')
    parser.add_argument("--method",
                        default='lstm',
                        choices=['lr', '2FFNN', 'lstm'],
                        help='name of method')
    parser.add_argument("--path_out",
                        default=False,
                        help="destination folder to save results")
    args = parser.parse_args()
    embeddings = load_from_dir(args.embeddings)
    # print("embeddings", embeddings)
    language_modeling = Language_modeling(window_size=args.window_size,
                                          method=args.method,
                                          test=args.test)
    results = language_modeling.get_result(embeddings)
    if args.path_out:
        if os.path.isdir(args.path_out) or args.path_out.endswith("/"):
            name_file_out = os.path.join(args.path_out, "language_modeling",
                                         "results.json")
            save_json(results, name_file_out)
        else:
            save_json(results, args.path_out)
    else:
        print_json(results)
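The output routing at the end of main() recurs in later examples: a directory-like --path_out yields a generated results.json path, any other value is used as the output file directly, and no --path_out means print to stdout. Factored out as a standalone helper (save_or_print is an illustrative name; save_json and print_json are assumed to behave as in these snippets):

import os

def save_or_print(results, path_out, subdir, save_json, print_json):
    # No destination -> stdout; directory (or trailing slash) -> generated
    # file name under it; anything else -> treat as the output file itself.
    if not path_out:
        print_json(results)
    elif os.path.isdir(path_out) or path_out.endswith("/"):
        save_json(results, os.path.join(path_out, subdir, "results.json"))
    else:
        save_json(results, path_out)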
Code example #4
    def test_api(self):
        embs = load_from_dir(
            path.join('tests', 'data', 'embeddings', 'text',
                      'plain_with_file_header'))
        analogy = Analogy(method="3CosAdd")
        result = analogy.get_result(embs, path_analogy_dataset)
        self.assertIsInstance(result[0], dict)

        analogy = Analogy(method="PairDistance")
        result = analogy.get_result(embs, path_analogy_dataset)
        self.assertIsInstance(result[0], dict)

        analogy = Analogy(method="3CosMul")
        result = analogy.get_result(embs, path_analogy_dataset)
        self.assertIsInstance(result[0], dict)

        analogy = Analogy(method="3CosMul2")
        result = analogy.get_result(embs, path_analogy_dataset)
        self.assertIsInstance(result[0], dict)

        analogy = Analogy(method="3CosAvg")
        result = analogy.get_result(embs, path_analogy_dataset)
        self.assertIsInstance(result[0], dict)

        analogy = Analogy(method="SimilarToAny")
        result = analogy.get_result(embs, path_analogy_dataset)
        print(result)

        analogy = Analogy(method="SimilarToB")
        result = analogy.get_result(embs, path_analogy_dataset)
        print(result)

        analogy = Analogy(method="LRCos")
        result = analogy.get_result(embs, path_analogy_dataset)
        print(result)
Code example #5
    def test_categorization_data(self):
        embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
        categorization = KMeansCategorization()
        result = categorization.get_result(embs, path_categorization_dataset)
        word_stats = result[0]['word_stats']
        # self.assertEqual(word_stats['4. banana']['true_category'], 'food')
        self.assertEqual(len(word_stats.keys()), 7)
Code example #6
def main():
    # config = load_config()
    # print(config)
    parser = argparse.ArgumentParser()
    parser.add_argument("embeddings")
    parser.add_argument("dataset")
    parser.add_argument("--method",
                        help="analogy solving method",
                        default="LRCos")
    parser.add_argument("--path_out",
                        help="destination folder to save results")
    args = parser.parse_args()
    embeddings = load_from_dir(args.embeddings)
    # print("embeddings", embeddings)
    benchmark = select_method(args.method)
    results = benchmark.get_result(embeddings, args.dataset)
    if args.path_out:
        if os.path.isdir(args.path_out) or args.path_out.endswith("/"):
            dataset = os.path.basename(os.path.normpath(args.dataset))
            name_file_out = os.path.join(args.path_out, dataset, args.method,
                                         "results.json")
            save_json(results, name_file_out)
        else:
            save_json(results, args.path_out)
    else:
        print_json(results)
Code example #7
    def test_filter(self):
        embs = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
        path_vocab = "./tests/data/vocabs/plain"
        vocab = Vocabulary()
        vocab.load(path_vocab)
        embs.filter_by_vocab(["the", "apple"])
        embs.filter_by_vocab([])
Code example #8
def run_benchmark_by_name(name, args):
    print(name, args)
    print("running ", name)
    mod = importlib.import_module("vecto.benchmarks." + name)
    parser = argparse.ArgumentParser()
    add_extra_args = getattr(mod, 'add_extra_args')
    add_extra_args(parser)
    parser.add_argument("--path_out",
                        default=None,
                        help="destination folder to save results")
    args = parser.parse_args(args)
    dict_args = vars(args)
    embeddings = load_from_dir(args.embeddings)
    # TODO: this is an ugly hack; do subparsers or something
    if name == "language_modeling":
        dataset = Dataset("/tmp/")
        dataset.name = "ptb"
    else:
        dataset = Dataset(args.dataset)
        dict_args.pop("dataset")

    dict_args.pop("embeddings")
    # TODO: not sure if all benchmarks use dataset arg
    path_out = dict_args.pop("path_out")
    Benchmark = getattr(mod, "Benchmark")
    benchmark = Benchmark(**dict_args)

    print("SHAPE:", embeddings.matrix.shape)
    print("vocab size:", embeddings.vocabulary.cnt_words)
    results = benchmark.run(embeddings, dataset)
    if path_out:
        save_results(results, path_out, dataset.metadata["name"])
    else:
        print_json(results)
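run_benchmark_by_name resolves a benchmark module at runtime with importlib.import_module and then pulls what it needs off the module with getattr. The same dispatch pattern in isolation looks like the sketch below; load_plugin and its parameters are illustrative names, not part of vecto's API.

import importlib

def load_plugin(package, name, required=("Benchmark", "add_extra_args")):
    # Resolve "<package>.<name>" and fail early if the module is missing
    # any attribute the caller is going to use.
    mod = importlib.import_module(package + "." + name)
    missing = [attr for attr in required if not hasattr(mod, attr)]
    if missing:
        raise AttributeError("{} lacks {}".format(mod.__name__, missing))
    return mod

Checking the required attributes up front produces one clear error instead of a failure at whichever getattr call happens to run first.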
Code example #9
    def test_text_classification(self):
        embs = load_from_dir(
            "./tests/data/embeddings/text/plain_with_file_header")

        tc = Text_classification(model='cnn')
        result = tc.get_result(
            embs, path_text_classification_dataset,
            "/tmp/tests/data/benchmarks_results/text_classification/")
        print(result)
        tc = Text_classification(model='rnn')
        result = tc.get_result(
            embs, path_text_classification_dataset,
            "/tmp/tests/data/benchmarks_results/text_classification/")
        print(result)
        tc = Text_classification(model='bow')
        result = tc.get_result(
            embs, path_text_classification_dataset,
            "/tmp/tests/data/benchmarks_results/text_classification/")
        print(result)

        model = text_classification.load_model(
            "./tests/data/benchmarks_results/text_classification/args.json",
            embs.matrix)
        print(text_classification.predict(model, "I like this"))
        print(
            text_classification.get_vectors(model,
                                            ["I like this", "I hate this"]))
Code example #10
    def test_analogy(self):
        embs = load_from_dir(
            path.join('tests', 'data', 'embeddings', 'text',
                      'plain_with_file_header'))
        analogy = LinearOffset()
        result = analogy.get_result(embs, path_analogy_dataset)
        print(result)

        analogy = PairDistance()
        result = analogy.get_result(embs, path_analogy_dataset)
        print(result)

        analogy = ThreeCosMul()
        result = analogy.get_result(embs, path_analogy_dataset)
        print(result)

        analogy = ThreeCosMul2()
        result = analogy.get_result(embs, path_analogy_dataset)
        print(result)

        analogy = ThreeCosAvg()
        result = analogy.get_result(embs, path_analogy_dataset)
        print(result)

        # analogy = SimilarToAny()
        # result = analogy.get_result(embs, path_analogy_dataset)
        # print(result)
        # analogy = SimilarToB()
        # result = analogy.get_result(embs, path_analogy_dataset)
        # print(result)
        analogy = LRCos()
        result = analogy.get_result(embs, path_analogy_dataset)
        print(result)
        analogy_visualize.plot_accuracy()
Code example #11
    def test_api(self):
        embs = load_from_dir(path_emb)
        dataset = Dataset(path_text_classification_dataset)

        tc = Text_classification(model='cnn')
        result = tc.run(embs, dataset,
                        "/tmp/vecto/benchmarks/text_classification_model/")
        self.assertIsInstance(result[0], dict)
        print(result)

        tc = Text_classification(model='rnn')
        result = tc.run(embs, dataset,
                        "/tmp/vecto/benchmarks/text_classification_model/")
        self.assertIsInstance(result[0], dict)
        print(result)

        tc = Text_classification(model='bow')
        result = tc.run(embs, dataset,
                        "/tmp/vecto/benchmarks/text_classification_model/")
        self.assertIsInstance(result[0], dict)
        print(result)

        model = load_model("/tmp/vecto/benchmarks/text_classification_model/args.json",
                                               embs.matrix)
        print(predict(model, "I like this"))
        print(get_vectors(model, ["I like this", "I hate this"]))
Code example #12
File: test_embeddings.py Project: vecto-ai/vecto
    def test_filter(self):
        embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
        path_vocab = path.join('.', 'tests', 'data', 'vocabs', 'plain')
        vocab = Vocabulary()
        vocab.load(path_vocab)
        embs.filter_by_vocab(["the", "apple"])
        embs.filter_by_vocab([])
Code example #13
def main():
    max_len = 30
    min_count = 2

    embeddings_dir = '/home/mattd/embeddings/reddit_2/'
    #dataset_path = '/home/mattd/datasets/AskReddit/'
    dataset_path = "/home/mattd/PycharmProjects/reddit/data/"
    dataset_train_filename = "{}train.csv".format(dataset_path)
    dataset_val_filename = "{}validation.csv".format(dataset_path)
    save_dir = "/home/mattd/PycharmProjects/reddit/embeddings/"

    dataset_train = PairsDataset(dataset_train_filename, max_len, min_count)
    dataset_val = PairsDataset(dataset_val_filename, max_len, min_count,
                               dataset_train.vocab)
    #dataset.add_file(eng_fr_filename2)

    vectors = embeddings.load_from_dir(embeddings_dir)

    #emb = embeddings.load_from_dir(embeddings_dir)

    embs_matrix = np.zeros((len(dataset_val.vocab), len(vectors.matrix[0])))

    for i, token in enumerate(dataset_val.vocab.token2id):
        if vectors.has_word(token):
            embs_matrix[i] = vectors.get_vector(token)
    np.save('{}embeddings_min{}_max{}'.format(save_dir, min_count, max_len),
            embs_matrix)
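The loop above implements a common pattern: align pretrained vectors with a task vocabulary, leaving all-zero rows for out-of-vocabulary tokens. Factored out under the assumption that the vectors object exposes has_word and get_vector as in the snippet (build_matrix itself is an illustrative name, not a vecto function):

import numpy as np

def build_matrix(tokens, vectors, dim):
    # One row per token; rows stay zero for tokens the pretrained
    # embeddings do not cover.
    matrix = np.zeros((len(tokens), dim))
    for i, token in enumerate(tokens):
        if vectors.has_word(token):
            matrix[i] = vectors.get_vector(token)
    return matrix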
Code example #14
def main():
    # config = load_config()
    # print(config)
    parser = argparse.ArgumentParser()
    parser.add_argument("embeddings")
    parser.add_argument("dataset")
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=64,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=30,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--layer',
                        '-l',
                        type=int,
                        default=1,
                        help='Number of layers of RNN or MLP following CNN')
    parser.add_argument('--dropout',
                        '-d',
                        type=float,
                        default=0.4,
                        help='Dropout rate')
    parser.add_argument('--model',
                        '-model',
                        default='cnn',
                        choices=['cnn', 'rnn', 'bow'],
                        help='Name of encoder model type')
    parser.add_argument("--path_out",
                        default=False,
                        help="destination folder to save results")
    args = parser.parse_args()
    embeddings = load_from_dir(args.embeddings)
    # print("embeddings", embeddings)
    text_classification = Text_classification(batchsize=args.batchsize,
                                              epoch=args.epoch,
                                              gpu=args.gpu,
                                              layer=args.layer,
                                              dropout=args.dropout,
                                              model=args.model)
    results = text_classification.get_result(embeddings, args.dataset)
    if args.path_out:
        if os.path.isdir(args.path_out) or args.path_out.endswith("/"):
            dataset = os.path.basename(os.path.normpath(args.dataset))
            name_file_out = os.path.join(args.path_out, dataset,
                                         "results.json")
            save_json(results, name_file_out)
        else:
            save_json(results, args.path_out)
    else:
        print_json(results)
Code example #15
File: test_embeddings.py Project: vecto-ai/vecto
    def test_basic(self):
        WordEmbeddingsDense()
        model = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
        model.cmp_words("apple", "banana")
        model.cmp_words("apple", "bananaaaaa")
        x = np.array([0.0, 0.0, 0.0])
        x.fill(np.nan)
        model.cmp_vectors(x, x)
Code example #16
    def test_language_modeling(self):
        embs = load_from_dir(
            "./tests/data/embeddings/text/plain_with_file_header")

        for method in ['lr', '2FFNN', 'rnn', 'lstm']:
            language_modeling = Language_modeling(test=True, method=method)
            results = language_modeling.get_result(embs)
            print(results)
Code example #17
    def test_basic(self):
        WordEmbeddingsDense()
        model = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
        model.cmp_words("apple", "banana")
        model.cmp_words("apple", "bananaaaaa")
        x = np.array([0.0, 0.0, 0.0])
        x.fill(np.nan)
        model.cmp_vectors(x, x)
Code example #18
    def test_fetcher(self):
        if path.isdir(path.join('.', 'tests', 'data', 'benchmarks_test')):
            return
        fetch_benchmarks(path.join('.', 'tests', 'data', 'benchmarks_test'))
        embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
        similarity = Similarity()
        path_similarity_dataset = path.join('.', 'tests', 'data', 'benchmarks_test', 'benchmarks', 'similarity', 'en')
        similarity.get_result(embs, path_similarity_dataset)
Code example #19
    def test_categorization_scores(self):
        embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
        categorization = KMeansCategorization()
        result = categorization.get_result(embs, path_categorization_dataset)
        scores = result[0]['global_stats']['scores']
        self.assertEqual(len(scores.keys()), 7)
        self.assertEqual(len(result[0]['global_stats']['true_labels']), 7)
        self.assertEqual(result[0]['global_stats']['true_labels'][3], 1)
Code example #20
    def test_load(self):
        load_from_dir("tests/data/embeddings/text/plain_with_file_header")
        # TODO: assert right class
        load_from_dir("tests/data/embeddings/text/plain_no_file_header")
        # TODO: assert right class
        load_from_dir("tests/data/embeddings/npy")

        embs = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
        embs.get_vector('apple')
        with self.assertRaises(RuntimeError):
            embs.get_vector('word_that_not_in_vocabulary_27')
        embs = load_from_dir("tests/data/embeddings/text/corrupted")
        with self.assertRaises(RuntimeError):
            embs = load_from_dir("tests/data/embeddings/text")
Code example #21
File: test_embeddings.py Project: vecto-ai/vecto
    def test_load(self):
        load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
        # TODO: assert right class
        load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_no_file_header'))
        # TODO: assert right class
        load_from_dir(path.join('tests', 'data', 'embeddings', 'npy'))

        embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
        embs.get_vector('apple')
        #with self.assertRaises(RuntimeError):
        #    embs.get_vector('word_that_not_in_vocabulary_27')
        embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'corrupted'))
        with self.assertRaises(RuntimeError):
            embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text'))
Code example #22
    def test_api(self):
        embs = load_from_dir(path_emb)

        for method in ['lr', '2FFNN']:
            sequence_labeling = Sequence_labeling(method=method)
            for subtask in ['chunk', 'pos', 'ner']:
                result = sequence_labeling.get_result(
                    embs, path.join(path_sequence_labeling_dataset, subtask))
                self.assertIsInstance(result[0], dict)
                print(result)
Code example #23
    def test_sequence_labeling(self):
        embs = load_from_dir(
            "./tests/data/embeddings/text/plain_with_file_header")

        for method in ['lr', '2FFNN']:
            sequence_labeling = Sequence_labeling(method=method)
            for subtask in ['chunk', 'pos', 'ner']:
                results = sequence_labeling.get_result(
                    embs, os.path.join(path_sequence_labeling_dataset,
                                       subtask))
                print(results)
Code example #24
    def test_utils(self):
        embs = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
        results = embs.get_most_similar_words('apple', 5)
        print(results)
        embs.cache_normalized_copy()
        results = embs.get_most_similar_words('apple', 5)
        print(results)

        results = embs.get_most_similar_words(embs.get_vector('apple'), 5)
        print(results)
        embs.get_x_label(0)
Code example #25
File: test_embeddings.py Project: vecto-ai/vecto
    def test_utils(self):
        embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
        results = embs.get_most_similar_words('apple', 5)
        print(results)
        embs.cache_normalized_copy()
        results = embs.get_most_similar_words('apple', 5)
        print(results)

        results = embs.get_most_similar_words(embs.get_vector('apple'), 5)
        print(results)
        embs.get_x_label(0)
Code example #26
    def test_outliers_results(self):
        embs = load_from_dir(
            path.join('tests', 'data', 'embeddings', 'text',
                      'plain_with_file_header'))
        outliers = AveragePairwiseCosine()
        result = outliers.get_result(embs, path_outliers_dataset)['test']
        amount_of_categories = 2
        amount_of_word_in_cats = 3

        self.assertEqual(len(result.keys()), amount_of_categories)
        self.assertEqual(len(result['cats']), amount_of_word_in_cats)
Code example #27
File: test_outliers.py Project: vecto-ai/vecto
    def test_outliers_results(self):
        embs = load_from_dir(
            path.join('tests', 'data', 'embeddings', 'text',
                      'plain_with_file_header'))
        outliers = AveragePairwiseCosine()
        result = outliers.get_result(embs, path_outliers_dataset)['test']
        amount_of_categories = 2
        # TODO: refactor to be understandable; check if ok after vocab to UNK
        amount_of_word_in_cats = 4

        self.assertEqual(len(result.keys()), amount_of_categories)
        self.assertEqual(len(result['cats']), amount_of_word_in_cats)
Code example #28
def main():
    embeddings_dir = '/mnt/data1/embeddings/crawl/'
    eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'

    dataset = SentenceDataset(eng_fr_filename, 20, 2)

    emb = embeddings.load_from_dir(embeddings_dir)
    vocab_embs = np.zeros((len(dataset.vocab), emb.matrix.shape[1]))

    for i, token in enumerate(dataset.vocab.token2id):
        if emb.has_word(token):
            vocab_embs[i] = emb.get_vector(token)
    np.save('embeddings', vocab_embs)
Code example #29
    def test_similarity(self):
        embs = load_from_dir(
            path.join('tests', 'data', 'embeddings', 'text',
                      'plain_with_file_header'))
        similarity = Similarity()
        result = similarity.get_result(embs, path_similarity_dataset)
        print(result)

        similarity = Similarity(ignore_oov=False)
        result = similarity.get_result(embs, path_similarity_dataset)
        print(result)

        similarity_visualize.plot_accuracy()
Code example #30
    def test_synonymy_results(self):
        embs = load_from_dir(
            path.join('tests', 'data', 'embeddings', 'text',
                      'plain_with_file_header'))
        synonymy = CosineDistance()
        result = synonymy.get_result(embs, path_synonymy_dataset)['test']
        cat_is_synonym = 'yes'
        cat_is_hit = False
        distance_to_cat = 1.0

        self.assertEqual(result['tiger'][0]['is_synonym'], cat_is_synonym)
        self.assertEqual(result['tiger'][0]['hit'], cat_is_hit)
        self.assertEqual(result['tiger'][0]['distance'], distance_to_cat)