def _test_conversion(settings, level='token'):
    reader = Reader(settings, settings.input_path)
    label_encoder = MultiLabelEncoder.from_settings(settings)
    label_encoder.fit_reader(reader)
    data = Dataset(settings, reader, label_encoder)
    le = label_encoder.tasks['lemma']

    for (inp, tasks), (rinp, rtasks) in data.batch_generator(return_raw=True):
        # preds
        tinp, tlen = tasks['lemma']
        preds = [le.stringify(t, l)
                 for t, l in zip(tinp.t().tolist(), tlen.tolist())]
        if level == 'token':
            preds = [w for line in preds for w in line]
        # tokens
        tokens = [tok for line in rinp for tok in line]
        # trues
        trues = [w for line in rtasks for w in line['lemma']]
        # check
        for pred, token, true in zip(preds, tokens, trues):
            rec = le.preprocessor_fn.inverse_transform(pred, token)
            assert rec == true, (pred, token, true, rec)
def setUp(self):
    settings = settings_from_file(testpath)
    settings['batch_size'] = 1
    reader = Reader(settings, settings.input_path)
    label_encoder = MultiLabelEncoder.from_settings(settings)
    insts = label_encoder.fit(line for _, line in reader.readsents())
    self.insts = insts
    self.num_batches = insts // settings.batch_size
    self.data = Dataset(settings, reader, label_encoder)
def load(fpath):
    """
    Load model from path
    """
    import pie

    with tarfile.open(utils.ensure_ext(fpath, 'tar'), 'r') as tar:
        # check commit
        try:
            commit = utils.get_gzip_from_tar(tar, 'pie-commit.zip')
        except Exception:
            commit = None
        if (pie.__commit__ and commit) and pie.__commit__ != commit:
            logging.warning(
                ("Model {} was serialized with a previous "
                 "version of `pie`. This might result in issues. "
                 "Model commit is {}, whereas current `pie` commit is {}."
                 ).format(fpath, commit, pie.__commit__))

        # load label encoder
        le = MultiLabelEncoder.load_from_string(
            utils.get_gzip_from_tar(tar, 'label_encoder.zip'))

        # load tasks
        tasks = json.loads(utils.get_gzip_from_tar(tar, 'tasks.zip'))

        # load model parameters
        params = json.loads(utils.get_gzip_from_tar(tar, 'parameters.zip'))

        # instantiate model
        model_type = getattr(pie.models,
                             utils.get_gzip_from_tar(tar, 'class.zip'))
        with utils.shutup():
            model = model_type(le, tasks, *params['args'], **params['kwargs'])

        # load settings
        try:
            settings = Settings(
                json.loads(utils.get_gzip_from_tar(tar, 'settings.zip')))
            model._settings = settings
        except Exception:
            logging.warning(
                "Couldn't load settings for model {}!".format(fpath))

        # load state_dict
        with utils.tmpfile() as tmppath:
            tar.extract('state_dict.pt', path=tmppath)
            dictpath = os.path.join(tmppath, 'state_dict.pt')
            model.load_state_dict(torch.load(dictpath, map_location='cpu'))

    model.eval()

    return model
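A minimal usage sketch for the `load` function above (the archive path is hypothetical; `load` expects a tar previously written by `model.save()`):

model = load('/path/to/model.tar')  # hypothetical path to a saved model archive
print(model.label_encoder.tasks.keys())  # label encoder restored from the archive
# the returned model is already on CPU (map_location='cpu') and in eval mode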
def test_serialization(self):
    le = self.data.label_encoder
    le.save('/tmp/encoder.json')
    le2 = MultiLabelEncoder.load_from_file('/tmp/encoder.json')
    self.assertEqual(len(le.tasks), len(le2.tasks),
                     "Unequal number of Modality encoders")
    self.assertEqual(le.word, le2.word, "word encoder")
    self.assertEqual(le.char, le2.char, "char encoder")
    for task in le.tasks:
        self.assertTrue(
            le.tasks[task] == le2.tasks[task],
            "Unequal serialized label encoder for task {}".format(task))
def load(fpath):
    """
    Load model from path
    """
    import pie

    with tarfile.open(utils.ensure_ext(fpath, 'tar'), 'r') as tar:
        # check commit
        try:
            commit = get_gzip_from_tar(tar, 'pie-commit.zip')
        except Exception:
            # no commit in file
            commit = None
        if pie.__commit__ is not None and commit is not None \
           and pie.__commit__ != commit:
            logging.warning(
                ("Model {} was serialized with a previous "
                 "version of `pie`. This might result in issues. "
                 "Model commit is {}, whereas current `pie` commit is {}."
                 ).format(fpath, commit, pie.__commit__))

        # load label encoder
        le = MultiLabelEncoder.load_from_string(
            get_gzip_from_tar(tar, 'label_encoder.zip'))

        # load model parameters
        params = json.loads(get_gzip_from_tar(tar, 'parameters.zip'))

        # instantiate model
        model_type = getattr(pie.models, get_gzip_from_tar(tar, 'class.zip'))
        with utils.shutup():
            model = model_type(le, *params['args'], **params['kwargs'])

        # (optional) load settings
        try:
            settings = Settings(
                json.loads(get_gzip_from_tar(tar, 'settings.zip')))
            model._settings = settings
        except Exception:
            pass

        # load state_dict
        tmppath = '/tmp/{}'.format(str(uuid.uuid1()))
        tar.extract('state_dict.pt', path=tmppath)
        model.load_state_dict(
            torch.load(os.path.join(tmppath, 'state_dict.pt')))
        shutil.rmtree(tmppath)

    model.eval()

    return model
def test_batch_level(self):
    settings = settings_from_file(testpath)
    settings['batch_size'] = 20
    reader = Reader(settings, settings.input_path)
    label_encoder = MultiLabelEncoder.from_settings(settings)
    label_encoder.fit(line for _, line in reader.readsents())
    data = Dataset(settings, reader, label_encoder)

    pre_batches = 0
    for batch in data.batch_generator():
        pre_batches += 1

    self.assertAlmostEqual(pre_batches, self.insts // 20, delta=delta)

    devset = data.get_dev_split(self.insts, split=0.05)

    post_batches = 0
    for batch in data.batch_generator():
        post_batches += 1

    self.assertAlmostEqual(pre_batches * 0.95, post_batches, delta=delta)
def setUp(self):
    settings = settings_from_file(testpath)
    reader = Reader(settings, settings.input_path)
    label_encoder = MultiLabelEncoder.from_settings(settings)
    label_encoder.fit_reader(reader)
    self.data = Dataset(settings, reader, label_encoder)
import uuid
import torch
import os
import unittest

from pie.models import SimpleModel
from pie.data import MultiLabelEncoder, Reader, Dataset
from pie.settings import settings_from_file

testpath = os.path.join(os.path.dirname(__file__), 'testconfig.json')
settings = settings_from_file(testpath)
label_encoder = MultiLabelEncoder.from_settings(settings)
reader = Reader(settings, settings.input_path)
label_encoder.fit_reader(reader)
dataset = Dataset(settings, reader, label_encoder)


class TestModelSerialization(unittest.TestCase):
    def setUp(self):
        emb_dim, hidden_size, num_layers = 64, 100, 1
        self.model = SimpleModel(label_encoder, emb_dim, emb_dim,
                                 hidden_size, num_layers)

    def test_serialization(self):
        model = self.model
        fid = '/tmp/{}'.format(str(uuid.uuid1()))
        model.save(fid)
        model2 = SimpleModel.load(fid)
        os.remove('{}.tar'.format(fid))
        self.assertEqual(model.label_encoder, model2.label_encoder)
    return fname, infix


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('config_path', nargs='?', default='config.json')
    args = parser.parse_args()

    settings = settings_from_file(args.config_path)

    # datasets
    reader = Reader(settings, settings.input_path)
    tasks = reader.check_tasks(expected=None)
    label_encoder = MultiLabelEncoder.from_settings(settings, tasks=tasks)
    if settings.verbose:
        print("::: Available tasks :::")
        print()
        for task in tasks:
            print("- {}".format(task))
        print()

    # fit
    start = time.time()
    if settings.verbose:
        print("::: Fitting data :::")
        print()
    ninsts = label_encoder.fit_reader(reader)
    if settings.verbose:
        print("Found {} total instances in training set in {:g} secs".format(
            ninsts, time.time() - start))
def setUp(self):
    settings = settings_from_file(testpath)
    reader = Reader(settings, settings.input_path)
    label_encoder = MultiLabelEncoder.from_settings(settings)
    label_encoder.fit(line for _, line in reader.readsents())
    self.data = Dataset(settings, reader, label_encoder)
def run(config_path):
    now = datetime.now()
    seed = now.hour * 10000 + now.minute * 100 + now.second
    print("Using seed:", seed)
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    settings = settings_from_file(config_path)

    # check settings
    # - check at least and at most one target
    has_target = False
    for task in settings.tasks:
        if len(settings.tasks) == 1:
            task['target'] = True
        if task.get('target', False):
            if has_target:
                raise ValueError("Got more than one target task")
            has_target = True
    if not has_target:
        raise ValueError("Needs at least one target task")

    # datasets
    reader = Reader(settings, settings.input_path)
    tasks = reader.check_tasks(expected=None)
    if settings.verbose:
        print("::: Available tasks :::")
        print()
        for task in tasks:
            print("- {}".format(task))
        print()

    # label encoder
    label_encoder = MultiLabelEncoder.from_settings(settings, tasks=tasks)
    if settings.verbose:
        print("::: Fitting data :::")
        print()
    label_encoder.fit_reader(reader)
    if settings.verbose:
        print()
        print("::: Vocabulary :::")
        print()
        types = '{}/{}={:.2f}'.format(*label_encoder.word.get_type_stats())
        tokens = '{}/{}={:.2f}'.format(*label_encoder.word.get_token_stats())
        print("- {:<15} types={:<10} tokens={:<10}".format(
            "word", types, tokens))
        types = '{}/{}={:.2f}'.format(*label_encoder.char.get_type_stats())
        tokens = '{}/{}={:.2f}'.format(*label_encoder.char.get_token_stats())
        print("- {:<15} types={:<10} tokens={:<10}".format(
            "char", types, tokens))
        print()
        print("::: Tasks :::")
        print()
        for task, le in label_encoder.tasks.items():
            print("- {:<15} target={:<6} level={:<6} vocab={:<6}".format(
                task, le.target, le.level, len(le)))
        print()

    trainset = Dataset(settings, reader, label_encoder)

    devset = None
    if settings.dev_path:
        devset = Dataset(settings, Reader(settings, settings.dev_path),
                         label_encoder)
    else:
        logging.warning("No devset: cannot monitor/optimize training")

    # model
    model = SimpleModel(label_encoder, settings.tasks,
                        settings.wemb_dim, settings.cemb_dim,
                        settings.hidden_size, settings.num_layers,
                        dropout=settings.dropout,
                        cell=settings.cell,
                        cemb_type=settings.cemb_type,
                        cemb_layers=settings.cemb_layers,
                        custom_cemb_cell=settings.custom_cemb_cell,
                        linear_layers=settings.linear_layers,
                        scorer=settings.scorer,
                        word_dropout=settings.word_dropout,
                        lm_shared_softmax=settings.lm_shared_softmax,
                        include_lm=settings.include_lm)

    # pretrain(/load pretrained) embeddings
    if model.wemb is not None:
        if settings.pretrain_embeddings:
            print("Pretraining word embeddings")
            wemb_reader = Reader(
                settings, settings.input_path, settings.dev_path,
                settings.test_path)
            weight = get_pretrained_embeddings(
                wemb_reader, label_encoder, size=settings.wemb_dim,
                window=5, negative=5, min_count=1)
            model.wemb.weight.data = torch.tensor(weight, dtype=torch.float32)

        elif settings.load_pretrained_embeddings:
            print("Loading pretrained embeddings")
            if not os.path.isfile(settings.load_pretrained_embeddings):
                print("Couldn't find pretrained embeddings in: {}".format(
                    settings.load_pretrained_embeddings))
            initialization.init_pretrained_embeddings(
                settings.load_pretrained_embeddings, label_encoder.word,
                model.wemb)

    # load pretrained weights
    if settings.load_pretrained_encoder:
        model.init_from_encoder(
            pie.Encoder.load(settings.load_pretrained_encoder))

    # freeze embeddings
    if settings.freeze_embeddings:
        model.wemb.weight.requires_grad = False

    model.to(settings.device)

    print("::: Model :::")
    print()
    print(model)
    print()
    print("::: Model parameters :::")
    print()
    trainable = sum(p.nelement() for p in model.parameters()
                    if p.requires_grad)
    total = sum(p.nelement() for p in model.parameters())
    print("{}/{} trainable/total".format(trainable, total))
    print()

    # training
    print("Starting training")
    running_time = time.time()
    trainer = Trainer(settings, model, trainset, reader.get_nsents())
    scores = None
    try:
        scores = trainer.train_epochs(settings.epochs, devset=devset)
    except KeyboardInterrupt:
        print("Stopping training")
    finally:
        model.eval()
    running_time = time.time() - running_time

    if settings.test_path:
        print("Evaluating model on test set")
        testset = Dataset(settings, Reader(settings, settings.test_path),
                          label_encoder)
        for task in model.evaluate(testset, trainset).values():
            task.print_summary()

    # save model
    fpath, infix = get_fname_infix(settings)
    if not settings.run_test:
        fpath = model.save(fpath, infix=infix, settings=settings)
        print("Saved best model to: [{}]".format(fpath))

    if devset is not None and not settings.run_test:
        scorers = model.evaluate(devset, trainset)
        scores = []
        for task in sorted(scorers):
            scorer = scorers[task]
            result = scorer.get_scores()
            for acc in result:
                scores.append('{}:{:.6f}'.format(
                    task, result[acc]['accuracy']))
                scores.append('{}-support:{}'.format(
                    task, result[acc]['support']))
        path = '{}.results.{}.csv'.format(
            settings.modelname, '-'.join(get_targets(settings)))
        with open(path, 'a') as f:
            line = [infix, str(seed), str(running_time)]
            line += scores
            f.write('{}\n'.format('\t'.join(line)))

    print("Bye!")
    from pie import utils

    with utils.shutup():
        # avoid pattern warning
        from gensim.models import Word2Vec

    word2vec = Word2Vec(reader.get_token_iterator(), **kwargs)

    weight = np.zeros((len(label_encoder.word), word2vec.wv.vector_size))
    found = 0
    for w, idx in label_encoder.word.table.items():
        try:
            weight[idx] = word2vec.wv[w]
            found += 1
        except KeyError:
            # reserved symbols are not in training sentences
            pass

    print("A total of {}/{} word embeddings were pretrained"
          .format(found, len(label_encoder.word)))

    return weight


if __name__ == '__main__':
    from pie.data import Reader, MultiLabelEncoder
    from pie.settings import settings_from_file

    settings = settings_from_file("config.json")
    reader = Reader(settings, settings.input_path)
    le = MultiLabelEncoder.from_settings(settings)
    le.fit_reader(reader)
    get_pretrained_embeddings(reader, le, min_count=1)
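As in the training script above, the matrix returned by `get_pretrained_embeddings` can be copied into a model's word embedding layer. A minimal sketch, assuming a `SimpleModel` instance `model` built with the same `le` (the `model` variable is not defined in this script and is only illustrative):

import torch

weight = get_pretrained_embeddings(reader, le, size=settings.wemb_dim,
                                   window=5, negative=5, min_count=1)
# `model` is assumed to be a SimpleModel sharing `le`; its wemb layer must
# have shape (len(le.word), settings.wemb_dim) for the copy to be valid.
model.wemb.weight.data = torch.tensor(weight, dtype=torch.float32)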