def run_benchmark_by_name(name, args):
    print(name, args)
    print("running", name)
    mod = importlib.import_module("vecto.benchmarks." + name)
    parser = argparse.ArgumentParser()
    add_extra_args = getattr(mod, 'add_extra_args')
    add_extra_args(parser)
    parser.add_argument("--path_out", default=None,
                        help="destination folder to save results")
    args = parser.parse_args(args)
    dict_args = vars(args)
    embeddings = load_from_dir(args.embeddings)
    # TODO: this is an ugly hack; use subparsers or something similar
    if name == "language_modeling":
        dataset = Dataset("/tmp/")
        dataset.name = "ptb"
    else:
        dataset = Dataset(args.dataset)
    dict_args.pop("dataset")
    dict_args.pop("embeddings")
    # TODO: not sure if all benchmarks use the dataset arg
    path_out = dict_args.pop("path_out")
    Benchmark = getattr(mod, "Benchmark")
    benchmark = Benchmark(**dict_args)
    print("SHAPE:", embeddings.matrix.shape)
    print("vocab size:", embeddings.vocabulary.cnt_words)
    results = benchmark.run(embeddings, dataset)
    if path_out:
        save_results(results, path_out, dataset.metadata["name"])
    else:
        print_json(results)
def run(self, embs, path_dataset):  # group_subcategory
    self.embs = embs
    self.solver = select_method(self.method)(self.embs, exclude=self.exclude)
    if self.normalize:
        self.embs.normalize()
        self.embs.cache_normalized_copy()
    results = []
    dataset = Dataset(path_dataset)
    for filename in dataset.file_iterator():
        logger.info("processing " + filename)
        pairs = get_pairs(filename)
        name_category = os.path.basename(os.path.dirname(filename))
        name_subcategory = os.path.basename(filename)
        experiment_setup = dict()
        experiment_setup["dataset"] = dataset.metadata
        experiment_setup["embeddings"] = self.embs.metadata
        experiment_setup["category"] = name_category
        experiment_setup["subcategory"] = name_subcategory
        experiment_setup["task"] = "word_analogy"
        experiment_setup["default_measurement"] = "accuracy"
        experiment_setup["method"] = self.method
        experiment_setup["uuid"] = str(uuid.uuid4())
        if not self.exclude:
            experiment_setup["method"] += "_honest"
        experiment_setup["timestamp"] = datetime.datetime.now().isoformat()
        result_for_category = self.run_category(pairs)
        result_for_category["experiment_setup"] = experiment_setup
        results.append(result_for_category)
    # if group_subcategory:
    #     results.extend(self.group_subcategory_results(results))
    return results
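# --- Usage sketch (illustrative, not part of the benchmark code). A minimal
# consumer of the list returned by run() above; only the "experiment_setup"
# dict is shown being populated in the source, so the example touches just
# those keys. The Analogy module path is assumed from the dynamic import in
# run_benchmark_by_name, and the paths are hypothetical placeholders.
from vecto.benchmarks.analogy import Analogy
from vecto.embeddings import load_from_dir

embs = load_from_dir("path/to/embeddings")              # hypothetical path
results = Analogy(method="3CosAdd").run(embs, "path/to/analogy/dataset")
for entry in results:
    setup = entry["experiment_setup"]
    print(setup["category"], "/", setup["subcategory"], "->", setup["method"])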
def test_api(self):
    embs = load_from_dir(path_emb)
    dataset = Dataset(path_text_classification_dataset)
    path_model = "/tmp/vecto/benchmarks/text_classification_model/"
    for model_name in ('cnn', 'rnn', 'bow'):
        tc = Text_classification(model=model_name)
        result = tc.run(embs, dataset, path_model)
        self.assertIsInstance(result[0], dict)
        print(result)
    model = load_model(path_model + "args.json", embs.matrix)
    print(predict(model, "I like this"))
    print(get_vectors(model, ["I like this", "I hate this"]))
def run_benchmark_by_name(name, args):
    print(name, args)
    print("running", name)
    mod = importlib.import_module("vecto.benchmarks." + name)
    parser = argparse.ArgumentParser()
    add_extra_args = getattr(mod, 'add_extra_args')
    add_extra_args(parser)
    parser.add_argument("--path_out", default=None,
                        help="destination folder to save results")
    args = parser.parse_args(args)
    dict_args = vars(args)
    embeddings = load_from_dir(args.embeddings)
    # TODO: this is an ugly hack; use subparsers or something similar
    if name == "language_modeling":
        dataset = Dataset("/tmp/")
        dataset.name = "ptb"
    else:
        dataset = Dataset(args.dataset)
    dict_args.pop("dataset")
    dict_args.pop("embeddings")
    # TODO: not sure if all benchmarks use the dataset arg
    path_out = dict_args.pop("path_out")
    Benchmark = getattr(mod, "Benchmark")
    benchmark = Benchmark(**dict_args)
    print("SHAPE:", embeddings.matrix.shape)
    print("vocab size:", embeddings.vocabulary.cnt_words)
    results = benchmark.run(embeddings, dataset)
    if path_out:
        if os.path.isdir(path_out) or path_out.endswith("/"):
            # compose destination: <path_out>/<task>/<dataset>/<timestamp>/results.json
            name_dataset = dataset.metadata["name"]
            timestamp = get_time_str()
            if isinstance(results, list):
                task = results[0]["experiment_setup"]["task"]
            else:
                task = results["experiment_setup"]["task"]
            name_file_out = os.path.join(path_out, task, name_dataset,
                                         timestamp, "results.json")
            save_json(results, name_file_out)
        else:
            save_json(results, path_out)
    else:
        print_json(results)
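# --- Usage sketch (illustrative). How this dispatcher might be invoked
# programmatically; the --embeddings and --dataset flags are assumed to be
# registered by the benchmark module's add_extra_args(), since the function
# reads them from the parsed namespace above. Paths are placeholders.
run_benchmark_by_name(
    "similarity",
    ["--embeddings", "path/to/embeddings",          # hypothetical path
     "--dataset", "path/to/similarity/dataset",     # hypothetical path
     "--path_out", "/tmp/results/"])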
def test_api(self):
    embs = load_from_dir(path_emb)
    for method in ['lr', '2FFNN']:
        sequence_labeling = Sequence_labeling(method=method)
        for subtask in ['chunk', 'pos', 'ner']:
            dataset = Dataset(
                path.join(path_sequence_labeling_dataset, subtask))
            result = sequence_labeling.run(embs, dataset)
            self.assertIsInstance(result[0], dict)
            print(result)
def test_api(self):
    embs = load_from_dir(path_emb)
    dataset = Dataset(path_similarity_dataset)
    similarity = Similarity()
    result = similarity.run(embs, dataset)
    self.assertIsInstance(result[0], dict)
    print(result)
    similarity = Similarity(ignore_oov=False)
    result = similarity.run(embs, dataset)
    self.assertIsInstance(result[0], dict)
    print(result)
    similarity = Similarity(normalize=False)
    result = similarity.run(embs, dataset)
    self.assertIsInstance(result[0], dict)
    print(result)
def test_api(self):
    embs = load_from_dir(
        path.join('tests', 'data', 'embeddings', 'text',
                  'plain_with_file_header'))
    dataset = Dataset(path_analogy_dataset)
    for method in ["3CosAdd", "PairDistance", "3CosMul", "3CosMul2",
                   "3CosAvg"]:
        analogy = Analogy(method=method)
        result = analogy.run(embs, dataset)
        self.assertIsInstance(result[0], dict)
    for method in ["SimilarToAny", "SimilarToB", "LRCos"]:
        analogy = Analogy(method=method)
        result = analogy.run(embs, dataset)
        print(result)
from vecto.data import Dataset

path = "/mnt/storage/data/NLP/datasets/text_classification/SST-2"
# path = "/home/blackbird/Projects/NLP/datasets/STSA/binary"
ds = Dataset(path)
print(ds)
print(ds.metadata)
train = ds.get_train()
print(train)
def test_dataset(self):
    with self.assertRaises(FileNotFoundError):
        Dataset("./path/does/not/exist/")
def test_datasets(self):
    Dataset("./")
def _parse(args):
    embs = load_from_dir(args.embs)
    dataset = Dataset(args.data)
    # call the main function
    apply_method_analogy(embs, dataset)
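# --- Usage sketch (illustrative). The argparse wiring that would produce the
# args.embs and args.data attributes read by _parse(); the flag names are
# inferred from those attribute names and are otherwise an assumption.
import argparse

def main():
    parser = argparse.ArgumentParser(description="run the analogy benchmark")
    parser.add_argument("--embs", help="directory with the embeddings")
    parser.add_argument("--data", help="directory with the analogy dataset")
    _parse(parser.parse_args())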
def run(self, embeddings, dataset, path_output='/tmp/text_classification/'):
    self.out = path_output
    self.unit = embeddings.matrix.shape[1]
    if not os.path.isdir(path_output):
        os.makedirs(path_output)
    # TODO: move this to protonn ds management
    # self.path_dataset = path_dataset
    # if self.path_dataset == 'dbpedia':
    #     train, test, vocab = text_datasets.get_dbpedia(
    #         char_based=self.char_based,
    #         vocab=embeddings.vocabulary.dic_words_ids,
    #         shrink=self.shrink)
    # elif self.path_dataset.startswith('imdb.'):
    #     train, test, vocab = text_datasets.get_imdb(
    #         fine_grained=self.path_dataset.endswith('.fine'),
    #         char_based=self.char_based,
    #         vocab=embeddings.vocabulary.dic_words_ids,
    #         shrink=self.shrink)
    # elif self.path_dataset in ['TREC', 'stsa.binary', 'stsa.fine',
    #                            'custrev', 'mpqa', 'rt-polarity', 'subj']:
    #     train, test, vocab = text_datasets.get_other_text_dataset(
    #         self.path_dataset,
    #         char_based=self.char_based,
    #         vocab=embeddings.vocabulary.dic_words_ids,
    #         shrink=self.shrink)
    # else:  # finally, if the file is not downloadable, load from a local path

    # TODO: make sure the dataset module supports adapter.py
    path_dataset = dataset.path
    print(path_dataset)
    path_adapter = os.path.join(path_dataset, "adapter.py")
    # TODO: get array of ids for train and test here
    if os.path.isfile(path_adapter):
        spec = importlib.util.spec_from_file_location(
            "ds_adapter", path_adapter)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        adapter = module.Adapter()
        train, test, _ = adapter.read()
        vocab = embeddings.vocabulary.dic_words_ids
        train = nlp_utils.transform_to_array(train, vocab)
        test = nlp_utils.transform_to_array(test, vocab)
    else:
        print("loading through DS")
        ds = Dataset(path_dataset)
        train = ds.get_train()
        train = [(word_tokenize_txt(i), j) for i, j in train]
        test = ds.get_test()
        test = [(word_tokenize_txt(i), j) for i, j in test]
        vocab = embeddings.vocabulary.dic_words_ids
        train = nlp_utils.transform_to_array(train, vocab)
        test = nlp_utils.transform_to_array(test, vocab)

    print('# cnt train samples: {}'.format(len(train)))
    print('# cnt test samples: {}'.format(len(test)))
    print('# size vocab: {}'.format(len(vocab)))
    n_class = len(set([int(d[1]) for d in train]))
    print('# cnt classes: {}'.format(n_class))

    train_iter = chainer.iterators.SerialIterator(train, self.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, self.batchsize,
                                                 repeat=False, shuffle=False)

    # Set up a model
    if self.model == 'rnn':
        Encoder = nets.RNNEncoder
    elif self.model == 'cnn':
        Encoder = nets.CNNEncoder
    elif self.model == 'bow':
        Encoder = nets.BOWMLPEncoder
    encoder = Encoder(n_layers=self.layer, n_vocab=len(vocab),
                      n_units=self.unit, dropout=self.dropout,
                      wv=embeddings.matrix)
    model = nets.TextClassifier(encoder, n_class)
    if self.gpu >= 0:
        # Make the specified GPU current
        chainer.backends.cuda.get_device_from_id(self.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    # Set up an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer,
                                       converter=nlp_utils.convert_seq,
                                       device=self.gpu)
    trainer = training.Trainer(updater, (self.epoch, 'epoch'), out=self.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        extensions.Evaluator(test_iter, model,
                             converter=nlp_utils.convert_seq,
                             device=self.gpu))

    # Take the best snapshot
    record_trigger = training.triggers.MaxValueTrigger(
        'validation/main/accuracy', (1, 'epoch'))
    trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                   trigger=record_trigger)

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    # Save the vocabulary and the model's settings
    if not os.path.isdir(self.out):
        os.mkdir(self.out)
    vocab_path = os.path.join(self.out, 'vocab.json')
    with open(vocab_path, 'w') as f:
        json.dump(vocab, f)
    model_path = os.path.join(self.out, 'best_model.npz')
    experiment_setup = self.__dict__
    # TODO: move all this to the parent class
    experiment_setup['task'] = "text classification"
    experiment_setup['vocab_path'] = vocab_path
    experiment_setup['model_path'] = model_path
    experiment_setup['n_class'] = n_class
    experiment_setup['datetime'] = self.current_datetime
    with open(os.path.join(self.out, 'args.json'), 'w') as f:
        json.dump(self.__dict__, f)

    # Run the training
    trainer.run()

    result = {}
    result['experiment_setup'] = experiment_setup
    result['experiment_setup']['default_measurement'] = 'accuracy'
    result['experiment_setup']['dataset'] = os.path.basename(
        os.path.normpath(path_dataset))
    result['experiment_setup']['method'] = self.model
    result['experiment_setup']['embeddings'] = embeddings.metadata
    result['log'] = load_json(os.path.join(self.out, 'log'))
    # TODO: the old version returned the last test value; note this in a footnote
    # result['result'] = {"accuracy": result['log'][-1]['validation/main/accuracy']}
    accuracy = max(_["validation/main/accuracy"] for _ in result['log'])
    result['result'] = {"accuracy": accuracy}
    return [result]
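# --- Usage sketch (illustrative). End-to-end training plus reading off the
# best validation accuracy from the returned structure. The module path for
# Text_classification is assumed from the dynamic import in
# run_benchmark_by_name, the load_from_dir import location is an assumption,
# and all paths are placeholders.
from vecto.benchmarks.text_classification import Text_classification
from vecto.data import Dataset
from vecto.embeddings import load_from_dir

embs = load_from_dir("path/to/embeddings")              # hypothetical path
ds = Dataset("path/to/text_classification/dataset")     # hypothetical path
results = Text_classification(model="cnn").run(embs, ds, "/tmp/tc_out/")
# run() returns a single-element list; the best validation accuracy across
# epochs is stored under result["result"]["accuracy"]
print(results[0]["result"]["accuracy"])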