Example 1
def run_benchmark_by_name(name, args):
    print(name, args)
    print("running ", name)
    mod = importlib.import_module("vecto.benchmarks." + name)
    parser = argparse.ArgumentParser()
    add_extra_args = getattr(mod, 'add_extra_args')
    add_extra_args(parser)
    parser.add_argument("--path_out",
                        default=None,
                        help="destination folder to save results")
    args = parser.parse_args(args)
    dict_args = vars(args)
    embeddings = load_from_dir(args.embeddings)
    # TODO: this is an ugly hack; switch to subparsers or something similar
    if name == "language_modeling":
        dataset = Dataset("/tmp/")
        dataset.name = "ptb"
    else:
        dataset = Dataset(args.dataset)
        dict_args.pop("dataset")

    dict_args.pop("embeddings")
    # TODO: not sure if all benchmarks use the dataset arg
    path_out = dict_args.pop("path_out")
    Benchmark = getattr(mod, "Benchmark")
    benchmark = Benchmark(**dict_args)

    print("SHAPE:", embeddings.matrix.shape)
    print("vocab size:", embeddings.vocabulary.cnt_words)
    results = benchmark.run(embeddings, dataset)
    if path_out:
        save_results(results, path_out, dataset.metadata["name"])
    else:
        print_json(results)
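A minimal invocation sketch for the function above. The paths are hypothetical, and the --embeddings and --dataset flags are assumed to be registered by the benchmark module's add_extra_args (the code reads args.embeddings and args.dataset):

# Sketch only: benchmark name and paths are illustrative
run_benchmark_by_name(
    "similarity",
    ["--embeddings", "/path/to/embeddings",
     "--dataset", "/path/to/similarity_dataset",
     "--path_out", "/tmp/vecto/results/"])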
Example 2
    def run(self, embs, path_dataset):  # group_subcategory
        self.embs = embs
        self.solver = select_method(self.method)(self.embs,
                                                 exclude=self.exclude)

        if self.normalize:
            self.embs.normalize()
        # a normalized copy is cached regardless of whether in-place normalization was requested
        self.embs.cache_normalized_copy()

        results = []
        dataset = Dataset(path_dataset)
        for filename in dataset.file_iterator():
            logger.info("processing " + filename)
            pairs = get_pairs(filename)
            name_category = os.path.basename(os.path.dirname(filename))
            name_subcategory = os.path.basename(filename)
            experiment_setup = dict()
            experiment_setup["dataset"] = dataset.metadata
            experiment_setup["embeddings"] = self.embs.metadata
            experiment_setup["category"] = name_category
            experiment_setup["subcategory"] = name_subcategory
            experiment_setup["task"] = "word_analogy"
            experiment_setup["default_measurement"] = "accuracy"
            experiment_setup["method"] = self.method
            experiment_setup["uuid"] = str(uuid.uuid4())
            if not self.exclude:
                experiment_setup["method"] += "_honest"
            experiment_setup["timestamp"] = datetime.datetime.now().isoformat()
            result_for_category = self.run_category(pairs)
            result_for_category["experiment_setup"] = experiment_setup
            results.append(result_for_category)
        # if group_subcategory:
        # results.extend(self.group_subcategory_results(results))
        return results
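A sketch of consuming the list this run method returns; each entry is a plain dict whose "experiment_setup" keys are the ones assembled above. The call assumes an Analogy-style benchmark object `analogy`, loaded embeddings `embs`, and a hypothetical dataset path:

results = analogy.run(embs, "/path/to/analogy_dataset")
for r in results:
    setup = r["experiment_setup"]
    print(setup["category"], setup["subcategory"], setup["method"])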
Example 3
    def test_api(self):
        embs = load_from_dir(path_emb)
        dataset = Dataset(path_text_classification_dataset)

        tc = Text_classification(model='cnn')
        result = tc.run(embs, dataset,
                        "/tmp/vecto/benchmarks/text_classification_model/")
        self.assertIsInstance(result[0], dict)
        print(result)

        tc = Text_classification(model='rnn')
        result = tc.run(embs, dataset,
                        "/tmp/vecto/benchmarks/text_classification_model/")
        self.assertIsInstance(result[0], dict)
        print(result)

        tc = Text_classification(model='bow')
        result = tc.run(embs, dataset,
                        "/tmp/vecto/benchmarks/text_classification_model/")
        self.assertIsInstance(result[0], dict)
        print(result)

        model = load_model("/tmp/vecto/benchmarks/text_classification_model/args.json",
                           embs.matrix)
        print(predict(model, "I like this"))
        print(get_vectors(model, ["I like this", "I hate this"]))
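For reference, a sketch of pulling the headline number out of one of these result lists; the key layout follows the run() implementation shown later in Example 12, and the names reuse the objects from the test above:

result = tc.run(embs, dataset,
                "/tmp/vecto/benchmarks/text_classification_model/")
print(result[0]["result"]["accuracy"])            # best validation accuracy
print(result[0]["experiment_setup"]["method"])    # 'cnn', 'rnn' or 'bow'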
Example 4
def run_benchmark_by_name(name, args):
    print(name, args)
    print("running ", name)
    mod = importlib.import_module("vecto.benchmarks." + name)
    parser = argparse.ArgumentParser()
    add_extra_args = getattr(mod, 'add_extra_args')
    add_extra_args(parser)
    parser.add_argument("--path_out",
                        default=None,
                        help="destination folder to save results")
    args = parser.parse_args(args)
    dict_args = vars(args)
    embeddings = load_from_dir(args.embeddings)
    # TODO: this is an ugly hack; switch to subparsers or something similar
    if name == "language_modeling":
        dataset = Dataset("/tmp/")
        dataset.name = "ptb"
    else:
        dataset = Dataset(args.dataset)
        dict_args.pop("dataset")

    dict_args.pop("embeddings")
    # TODO: not sure if all benchmarks use the dataset arg
    path_out = dict_args.pop("path_out")
    Benchmark = getattr(mod, "Benchmark")
    benchmark = Benchmark(**dict_args)

    print("SHAPE:", embeddings.matrix.shape)
    print("vocab size:", embeddings.vocabulary.cnt_words)
    results = benchmark.run(embeddings, dataset)
    if path_out:
        if os.path.isdir(path_out) or path_out.endswith("/"):
            dataset = dataset.metadata["name"]
            timestamp = get_time_str()
            if isinstance(results, list):
                task = results[0]["experiment_setup"]["task"]
            else:
                task = results["experiment_setup"]["task"]
            name_file_out = os.path.join(path_out, task, dataset, timestamp,
                                         "results.json")
            save_json(results, name_file_out)
        else:
            save_json(results, path_out)
    else:
        print_json(results)
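This longer variant distinguishes a directory --path_out from an explicit file path. A small sketch of the path the directory branch assembles, with purely illustrative values:

import os

path_out = "/tmp/vecto/results/"      # hypothetical --path_out directory
task = "word_analogy"                 # results[0]["experiment_setup"]["task"] in the code above
dataset_name = "some_dataset"         # dataset.metadata["name"]
timestamp = "2024-01-01_12.00.00"     # placeholder for whatever get_time_str() returns
print(os.path.join(path_out, task, dataset_name, timestamp, "results.json"))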
Example 5
    def test_api(self):
        embs = load_from_dir(path_emb)

        for method in ['lr', '2FFNN']:
            sequence_labeling = Sequence_labeling(method=method)
            for subtask in ['chunk', 'pos', 'ner']:
                dataset = Dataset(
                    path.join(path_sequence_labeling_dataset, subtask))
                result = sequence_labeling.run(embs, dataset)
                self.assertIsInstance(result[0], dict)
                print(result)
Example 6
    def test_api(self):
        embs = load_from_dir(path_emb)
        dataset = Dataset(path_similarity_dataset)
        similarity = Similarity()
        result = similarity.run(embs, dataset)
        self.assertIsInstance(result[0], dict)
        print(result)

        similarity = Similarity(ignore_oov=False)
        result = similarity.run(embs, dataset)
        self.assertIsInstance(result[0], dict)
        print(result)

        similarity = Similarity(normalize=False)
        result = similarity.run(embs, dataset)
        self.assertIsInstance(result[0], dict)
        print(result)
Example 7
    def test_api(self):
        embs = load_from_dir(
            path.join('tests', 'data', 'embeddings', 'text',
                      'plain_with_file_header'))
        analogy = Analogy(method="3CosAdd")
        dataset = Dataset(path_analogy_dataset)
        result = analogy.run(embs, dataset)
        self.assertIsInstance(result[0], dict)

        analogy = Analogy(method="PairDistance")
        result = analogy.run(embs, dataset)
        self.assertIsInstance(result[0], dict)

        analogy = Analogy(method="3CosMul")
        result = analogy.run(embs, dataset)
        self.assertIsInstance(result[0], dict)

        analogy = Analogy(method="3CosMul2")
        result = analogy.run(embs, dataset)
        self.assertIsInstance(result[0], dict)

        analogy = Analogy(method="3CosAvg")
        result = analogy.run(embs, dataset)
        self.assertIsInstance(result[0], dict)

        analogy = Analogy(method="SimilarToAny")
        result = analogy.run(embs, dataset)
        print(result)

        analogy = Analogy(method="SimilarToB")
        result = analogy.run(embs, dataset)
        print(result)

        analogy = Analogy(method="LRCos")
        result = analogy.run(embs, dataset)
        print(result)
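The method names exercised above are exactly the strings this test passes to Analogy. A compact sketch that loops over the same list, reusing `embs` and `dataset` from the test (result keys follow the run() code in Example 2):

for method in ["3CosAdd", "PairDistance", "3CosMul", "3CosMul2",
               "3CosAvg", "SimilarToAny", "SimilarToB", "LRCos"]:
    result = Analogy(method=method).run(embs, dataset)
    print(method, result[0]["experiment_setup"]["method"])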
Example 8
from vecto.data import Dataset
path = "/mnt/storage/data/NLP/datasets/text_classification/SST-2"
#path = "/home/blackbird/Projects/NLP/datasets/STSA/binary"

ds = Dataset(path)

print(ds)
print(ds.metadata)
train = ds.get_train()
print(train)
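A sketch that combines the Dataset accessors seen across these examples (the path is hypothetical; file_iterator and get_test appear in Examples 2 and 12, metadata and get_train above):

ds = Dataset("/path/to/some/dataset")   # hypothetical location
print(ds.metadata)
for filename in ds.file_iterator():     # iterate over the dataset's files
    print(filename)
test = ds.get_test()                    # held-out split, as used in Example 12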
Example 9
    def test_dataset(self):
        with self.assertRaises(FileNotFoundError):
            Dataset("./path/does/not/exist/")
Example 10
    def test_datasets(self):
        Dataset("./")
Example 11
def _parse(args):
    embs = load_from_dir(args.embs)
    dataset = Dataset(args.data)
    # calls the main function
    apply_method_analogy(embs, dataset)
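A minimal sketch of building the namespace _parse expects; the --embs and --data flag names are assumptions inferred from the attribute names used above, and the paths are hypothetical:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--embs")   # directory with embeddings, read via load_from_dir
parser.add_argument("--data")   # dataset directory, wrapped in Dataset
_parse(parser.parse_args(["--embs", "/path/to/embeddings",
                          "--data", "/path/to/analogy_dataset"]))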
Example 12
    def run(self,
            embeddings,
            dataset,
            path_output='/tmp/text_classification/'):
        self.out = path_output
        self.unit = embeddings.matrix.shape[1]

        if not os.path.isdir(path_output):
            os.makedirs(path_output)

        # TODO: move this to protonn ds management
        # self.path_dataset = path_dataset
        # if self.path_dataset == 'dbpedia':
        #     train, test, vocab = text_datasets.get_dbpedia(
        #         char_based=self.char_based,
        #         vocab=embeddings.vocabulary.dic_words_ids,
        #         shrink=self.shrink)
        # elif self.path_dataset.startswith('imdb.'):
        #     train, test, vocab = text_datasets.get_imdb(
        #         fine_grained=self.path_dataset.endswith('.fine'),
        #         char_based=self.char_based,
        #         vocab=embeddings.vocabulary.dic_words_ids,
        #         shrink=self.shrink)
        # elif self.path_dataset in ['TREC', 'stsa.binary', 'stsa.fine',
        #                            'custrev', 'mpqa', 'rt-polarity', 'subj']:
        #     train, test, vocab = text_datasets.get_other_text_dataset(
        #         self.path_dataset,
        #         char_based=self.char_based,
        #         vocab=embeddings.vocabulary.dic_words_ids,
        #         shrink=self.shrink)
        # else:  # finally, if the file is not downloadable, load from a local path

        # TODO: make sure dataset module support adapter.py
        path_dataset = dataset.path
        print(path_dataset)
        path_adapter = os.path.join(path_dataset, "adapter.py")
        # TODO: get array of ids for train and test here
        if os.path.isfile(path_adapter):
            spec = importlib.util.spec_from_file_location(
                "ds_adapter", path_adapter)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            adapter = module.Adapter()
            train, test, _ = adapter.read()
            vocab = embeddings.vocabulary.dic_words_ids
            train = nlp_utils.transform_to_array(train, vocab)
            test = nlp_utils.transform_to_array(test, vocab)

        else:
            print("loading though DS")
            ds = Dataset(path_dataset)
            train = ds.get_train()
            train = [(word_tokenize_txt(i), j) for i, j in train]
            test = ds.get_test()
            test = [(word_tokenize_txt(i), j) for i, j in test]
            vocab = embeddings.vocabulary.dic_words_ids
            train = nlp_utils.transform_to_array(train, vocab)
            test = nlp_utils.transform_to_array(test, vocab)

        print('# cnt train samples: {}'.format(len(train)))
        print('# cnt test  samples: {}'.format(len(test)))
        print('# size vocab: {}'.format(len(vocab)))
        n_class = len(set([int(d[1]) for d in train]))
        print('# cnt classes: {}'.format(n_class))

        train_iter = chainer.iterators.SerialIterator(train, self.batchsize)
        test_iter = chainer.iterators.SerialIterator(test,
                                                     self.batchsize,
                                                     repeat=False,
                                                     shuffle=False)

        # Setup a model
        if self.model == 'rnn':
            Encoder = nets.RNNEncoder
        elif self.model == 'cnn':
            Encoder = nets.CNNEncoder
        elif self.model == 'bow':
            Encoder = nets.BOWMLPEncoder
        encoder = Encoder(n_layers=self.layer,
                          n_vocab=len(vocab),
                          n_units=self.unit,
                          dropout=self.dropout,
                          wv=embeddings.matrix)
        model = nets.TextClassifier(encoder, n_class)
        if self.gpu >= 0:
            # Make a specified GPU current
            chainer.backends.cuda.get_device_from_id(self.gpu).use()
            model.to_gpu()  # Copy the model to the GPU

        # Setup an optimizer
        optimizer = chainer.optimizers.Adam()
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

        # Set up a trainer
        updater = training.StandardUpdater(train_iter,
                                           optimizer,
                                           converter=nlp_utils.convert_seq,
                                           device=self.gpu)
        trainer = training.Trainer(updater, (self.epoch, 'epoch'),
                                   out=self.out)

        # Evaluate the model with the test dataset for each epoch
        trainer.extend(
            extensions.Evaluator(test_iter,
                                 model,
                                 converter=nlp_utils.convert_seq,
                                 device=self.gpu))

        # Take a best snapshot
        record_trigger = training.triggers.MaxValueTrigger(
            'validation/main/accuracy', (1, 'epoch'))
        trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                       trigger=record_trigger)

        # Write a log of evaluation statistics for each epoch
        trainer.extend(extensions.LogReport())
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))

        # Print a progress bar to stdout
        trainer.extend(extensions.ProgressBar())

        # Save vocabulary and model's setting
        if not os.path.isdir(self.out):
            os.mkdir(self.out)
        vocab_path = os.path.join(self.out, 'vocab.json')
        with open(vocab_path, 'w') as f:
            json.dump(vocab, f)
        model_path = os.path.join(self.out, 'best_model.npz')
        experiment_setup = self.__dict__
        # TODO: move all this to the parent class
        experiment_setup['task'] = "text classification"
        experiment_setup['vocab_path'] = vocab_path
        experiment_setup['model_path'] = model_path
        experiment_setup['n_class'] = n_class
        experiment_setup['datetime'] = self.current_datetime
        with open(os.path.join(self.out, 'args.json'), 'w') as f:
            json.dump(self.__dict__, f)

        # Run the training
        trainer.run()

        result = {}
        result['experiment_setup'] = experiment_setup
        result['experiment_setup']['default_measurement'] = 'accuracy'
        result['experiment_setup']['dataset'] = os.path.basename(
            os.path.normpath(path_dataset))
        result['experiment_setup']['method'] = self.model
        result['experiment_setup']['embeddings'] = embeddings.metadata
        result['log'] = load_json(os.path.join(self.out, 'log'))

        # TODO: the old version returned the last test value; add a footnote about this
        # result['result'] = {"accuracy": result['log'][-1]['validation/main/accuracy']}
        accuracy = max(_["validation/main/accuracy"] for _ in result['log'])
        result['result'] = {"accuracy": accuracy}
        return [result]
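The adapter.py hook near the top of this run() method loads a dataset-local module with importlib. Below is a hypothetical sketch of such a module, assuming read() returns train, test and a third value the caller ignores, with each split as a list of (token_list, label) pairs, which is what the loading code above appears to expect:

# Hypothetical <dataset>/adapter.py -- a sketch, not part of vecto itself
class Adapter:
    def read(self):
        # toy splits: each entry is (list of tokens, integer class label)
        train = [(["a", "good", "movie"], 1),
                 (["a", "bad", "movie"], 0)]
        test = [(["not", "bad"], 1)]
        return train, test, None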