def load(self, dir_path):
    # load options from the json file
    self.options = opts.load(dir_path)
    # load vocabularies for each field
    fields.load_vocabs(dir_path, self.fields_tuples)
    # set the current gpu
    self.options.gpu_id = self.gpu_id
    # load model, optimizer and scheduler
    self.model = models.load(dir_path, self.fields_tuples, self.gpu_id)
    self.optimizer = optimizer.load(dir_path, self.model.parameters())
    self.scheduler = scheduler.load(dir_path, self.optimizer)
    # now we have a loaded tagger
    self._loaded = True

def detect(self, text=None, test_path=None):
    self.options.text = text
    self.options.test_path = test_path

    words_field = fields.WordsField()
    tags_field = fields.TagsField()
    fields_tuples = [('words', words_field), ('tags', tags_field)]

    dataset_iter = None

    if self.options.test_path is None and self.options.text is None:
        raise Exception('You should provide a path to test data or a text.')
    if self.options.test_path is not None and self.options.text is not None:
        raise Exception(
            'You cannot provide both a path to test data and a text.')

    if self.options.test_path is not None and self.options.text is None:
        logger.info('Building test dataset: {}'.format(
            self.options.test_path))
        # the test data has no gold tags, so drop the tags field
        test_tuples = list(filter(lambda x: x[0] != 'tags', fields_tuples))
        test_dataset = dataset.build(self.options.test_path, test_tuples,
                                     self.options)
        logger.info('Building test iterator...')
        dataset_iter = iterator.build(test_dataset,
                                      self.options.gpu_id,
                                      self.options.dev_batch_size,
                                      is_train=False)

    if self.options.text is not None and self.options.test_path is None:
        logger.info('Preparing text...')
        test_tuples = list(filter(lambda x: x[0] != 'tags', fields_tuples))
        test_dataset = dataset.build_texts(self.options.text, test_tuples,
                                           self.options)
        logger.info('Building iterator...')
        dataset_iter = iterator.build(test_dataset,
                                      self.options.gpu_id,
                                      self.options.dev_batch_size,
                                      is_train=False)

    logger.info('Loading vocabularies...')
    fields.load_vocabs(self.options.load, fields_tuples)

    logger.info('Loading model...')
    model = models.load(self.options.load, fields_tuples,
                        self.options.gpu_id)

    logger.info('Predicting...')
    predicter = Predicter(dataset_iter, model)
    predictions = predicter.predict(self.options.prediction_type)

    logger.info('Preparing to save...')
    if self.options.prediction_type == 'classes':
        prediction_tags = transform_classes_to_tags(tags_field, predictions)
        predictions_str = transform_predictions_to_text(prediction_tags)
    else:
        predictions_str = transform_predictions_to_text(predictions)

    words_labels = None
    if self.options.text is not None:
        orig_words = self.options.text.split()
        labels = predictions_str.split()
        words_labels = join_words_and_labels(orig_words, labels)

    return predictions, predictions_str, words_labels

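# A minimal usage sketch for the `load`/`detect` methods above. Assumptions
# (not confirmed by this file): the methods live on a `Tagger` class whose
# constructor takes a `gpu_id`, 'runs/example-model' is a hypothetical
# directory written by a previous training run, and `join_words_and_labels`
# yields (word, label) pairs.
def _example_load_and_detect():
    tagger = Tagger(gpu_id=None)       # hypothetical constructor signature
    tagger.load('runs/example-model')  # restores options, vocabs, model,
                                       # optimizer and scheduler
    predictions, predictions_str, words_labels = tagger.detect(
        text='the quick brown fox')
    for word, label in words_labels:
        logger.info('{} -> {}'.format(word, label))
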
def run(options):
    words_field = fields.WordsField()
    tags_field = fields.TagsField()
    fields_tuples = [('words', words_field), ('tags', tags_field)]
    # fields_tuples += features.load(options.load)

    if options.test_path is None and options.text is None:
        raise Exception('You should provide a path to test data or a text.')
    if options.test_path is not None and options.text is not None:
        raise Exception('You cannot provide both a path to test data '
                        'and a text.')

    dataset_iter = None
    save_dir_path = None
    if options.test_path is not None and options.text is None:
        logger.info('Building test dataset: {}'.format(options.test_path))
        test_tuples = list(filter(lambda x: x[0] != 'tags', fields_tuples))
        test_dataset = dataset.build(options.test_path, test_tuples, options)
        logger.info('Building test iterator...')
        dataset_iter = iterator.build(test_dataset,
                                      options.gpu_id,
                                      options.dev_batch_size,
                                      is_train=False)
        save_dir_path = options.test_path

    if options.text is not None and options.test_path is None:
        logger.info('Preparing text...')
        test_tuples = list(filter(lambda x: x[0] != 'tags', fields_tuples))
        test_dataset = dataset.build_texts(options.text, test_tuples, options)
        logger.info('Building iterator...')
        dataset_iter = iterator.build(test_dataset,
                                      options.gpu_id,
                                      options.dev_batch_size,
                                      is_train=False)
        save_dir_path = None

    logger.info('Loading vocabularies...')
    fields.load_vocabs(options.load, fields_tuples)

    logger.info('Loading model...')
    model = models.load(options.load, fields_tuples, options.gpu_id)

    logger.info('Predicting...')
    predicter = Predicter(dataset_iter, model)
    predictions = predicter.predict(options.prediction_type)

    logger.info('Preparing to save...')
    if options.prediction_type == 'classes':
        prediction_tags = transform_classes_to_tags(tags_field, predictions)
        predictions_str = transform_predictions_to_text(prediction_tags)
    else:
        predictions_str = transform_predictions_to_text(predictions)

    if options.test_path is not None:
        save_predictions(
            options.output_dir,
            predictions_str,
            save_dir_path=save_dir_path,
        )
    else:
        logger.info(options.text)
        logger.info(predictions_str)

    return predictions

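# Sketch of invoking the prediction `run` above directly. Assumption: in the
# real package `options` comes from its argument parser; the namespace below
# only lists the attributes this function reads itself (dataset and iterator
# building may consume additional ones), and every value is illustrative.
def _example_predict_run():
    from types import SimpleNamespace
    options = SimpleNamespace(
        test_path='data/test.txt',   # hypothetical path; set `text` instead
        text=None,                   # to tag a raw string
        load='runs/example-model',   # dir holding saved vocabs and model
        gpu_id=None,
        dev_batch_size=32,
        prediction_type='classes',
        output_dir='predictions',
    )
    return run(options)
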
def run(options):
    logger.info('Running with options: {}'.format(options))

    words_field = fields.WordsField()
    tags_field = fields.TagsField()
    fields_tuples = [('words', words_field), ('tags', tags_field)]

    logger.info('Building train corpus: {}'.format(options.train_path))
    train_dataset = dataset.build(options.train_path, fields_tuples, options)

    logger.info('Building train iterator...')
    train_iter = iterator.build(train_dataset,
                                options.gpu_id,
                                options.train_batch_size,
                                is_train=True)

    dev_dataset = None
    dev_iter = None
    if options.dev_path is not None:
        logger.info('Building dev dataset: {}'.format(options.dev_path))
        dev_dataset = dataset.build(options.dev_path, fields_tuples, options)
        logger.info('Building dev iterator...')
        dev_iter = iterator.build(dev_dataset,
                                  options.gpu_id,
                                  options.dev_batch_size,
                                  is_train=False)

    test_dataset = None
    test_iter = None
    if options.test_path is not None:
        logger.info('Building test dataset: {}'.format(options.test_path))
        test_dataset = dataset.build(options.test_path, fields_tuples,
                                     options)
        logger.info('Building test iterator...')
        test_iter = iterator.build(test_dataset,
                                   options.gpu_id,
                                   options.dev_batch_size,
                                   is_train=False)

    datasets = [train_dataset, dev_dataset, test_dataset]
    datasets = list(filter(lambda x: x is not None, datasets))

    # BUILD
    if not options.load:
        logger.info('Building vocabulary...')
        fields.build_vocabs(fields_tuples, train_dataset, datasets, options)
        loss_weights = None
        if options.loss_weights == 'balanced':
            loss_weights = train_dataset.get_loss_weights()
        logger.info('Building model...')
        model = models.build(options, fields_tuples, loss_weights)
        logger.info('Building optimizer...')
        optim = optimizer.build(options, model.parameters())
        logger.info('Building scheduler...')
        sched = scheduler.build(options, optim)
    # OR LOAD
    else:
        logger.info('Loading vocabularies...')
        fields.load_vocabs(options.load, fields_tuples)
        logger.info('Loading model...')
        model = models.load(options.load, fields_tuples, options.gpu_id)
        logger.info('Loading optimizer...')
        optim = optimizer.load(options.load, model.parameters())
        logger.info('Loading scheduler...')
        sched = scheduler.load(options.load, optim)

    # STATS
    logger.info('Word vocab size: {}'.format(len(words_field.vocab)))
    logger.info('Tag vocab size: {}'.format(len(tags_field.vocab) - 1))
    logger.info('Number of training examples: {}'.format(len(train_dataset)))
    if dev_dataset:
        logger.info('Number of dev examples: {}'.format(len(dev_dataset)))
    if test_dataset:
        logger.info('Number of test examples: {}'.format(len(test_dataset)))
    logger.info('Model info: ')
    logger.info(str(model))
    logger.info('Optimizer info: ')
    logger.info(str(optim))
    logger.info('Scheduler info: ')
    logger.info(str(sched))

    nb_trainable_params = 0
    for p_name, p_tensor in model.named_parameters():
        if p_tensor.requires_grad:
            if options.print_parameters_per_layer:
                logger.info('{} {}: {}'.format(p_name,
                                               tuple(p_tensor.size()),
                                               p_tensor.size().numel()))
            nb_trainable_params += p_tensor.size().numel()
    logger.info('Nb of trainable parameters: {}'.format(nb_trainable_params))

    # TRAIN
    logger.info('Building trainer...')
    trainer = Trainer(train_iter, model, optim, sched, options,
                      dev_iter=dev_iter, test_iter=test_iter)
    if options.resume_epoch and options.load is None:
        logger.info('Resuming training...')
        trainer.resume(options.resume_epoch)
    trainer.train()

    # SAVE
    if options.save:
        logger.info('Saving path: {}'.format(options.save))
        config_path = Path(options.save)
        config_path.mkdir(parents=True, exist_ok=True)
        logger.info('Saving config options...')
        opts.save(config_path, options)
        logger.info('Saving vocabularies...')
        fields.save_vocabs(config_path, fields_tuples)
        logger.info('Saving model...')
        models.save(config_path, model)
        logger.info('Saving optimizer...')
        optimizer.save(config_path, optim)
        logger.info('Saving scheduler...')
        scheduler.save(config_path, sched)

    return fields_tuples, model, optim, sched

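# Sketch of a training invocation of the `run` above. Assumption: the real
# CLI builds `options` with the package's argument parser, which likely
# defines more attributes than those listed here (these are only the ones
# this function touches directly); all values are illustrative.
def _example_train_run():
    from types import SimpleNamespace
    options = SimpleNamespace(
        train_path='data/train.txt',  # hypothetical paths
        dev_path='data/dev.txt',
        test_path=None,
        gpu_id=None,
        train_batch_size=64,
        dev_batch_size=64,
        load=None,                    # build model/optim/sched from scratch
        loss_weights=None,            # anything but 'balanced' skips weighting
        print_parameters_per_layer=False,
        resume_epoch=None,            # falsy -> start training from scratch
        save='runs/example-model',    # where config/vocabs/model/etc. go
    )
    return run(options)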