def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
        for batch in generator_tqdm:
            batch = util.move_to_device(batch, cuda_device)
            model(**batch)
            metrics = model.get_metrics()

            if (not _warned_tqdm_ignores_underscores and
                    any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value)
                                     for name, value in metrics.items()
                                     if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        return model.get_metrics(reset=True)
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value)
                                 for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description, refresh=False)

    return model.get_metrics(reset=True)
def setUp(self):
    super().setUp()
    param_file = self.FIXTURES_ROOT / 'simple_tagger' / 'experiment_with_regularization.json'
    self.set_up_model(param_file, self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    params = Params.from_file(param_file)
    self.reader = DatasetReader.from_params(params['dataset_reader'])
    self.iterator = DataIterator.from_params(params['iterator'])
    self.trainer = Trainer.from_params(
        self.model,
        self.TEST_DIR,
        self.iterator,
        self.dataset,
        None,
        params.get('trainer')
    )
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device, args.batch_weight_key)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    output_file = args.output_file
    if output_file:
        with open(output_file, "w") as file:
            json.dump(metrics, file, indent=4)
    return metrics
def get_trainer_from_config(config: Params,
                            train_instances: List[Instance],
                            val_instances: List[Instance],
                            vocab: Optional[Vocabulary] = None,
                            device: Optional[int] = -1) -> Trainer:
    trainer_params = config.pop("trainer")
    trainer_params["cuda_device"] = device
    model_params = config.pop("model")
    vocab = vocab or Vocabulary.from_instances(train_instances)
    model = Model.from_params(model_params, vocab=vocab)
    iterator = DataIterator.from_params(config.pop("iterator"))
    iterator.index_with(vocab)
    trainer = Trainer.from_params(
        model=model,
        iterator=iterator,
        train_data=train_instances,
        validation_data=val_instances,
        serialization_dir=None,
        params=trainer_params)
    return trainer
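# A minimal usage sketch for get_trainer_from_config above, assuming the pre-1.0
# AllenNLP API that these snippets already use. The config path, data paths, and
# the "dataset_reader" key are illustrative placeholders, not taken from this code.
from allennlp.common import Params
from allennlp.data import DatasetReader

config = Params.from_file("experiment.jsonnet")                     # hypothetical config file
reader = DatasetReader.from_params(config.pop("dataset_reader"))
train_instances = list(reader.read("data/train.tsv"))               # hypothetical paths
val_instances = list(reader.read("data/dev.tsv"))
trainer = get_trainer_from_config(config, train_instances, val_instances, device=-1)
metrics = trainer.train()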
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(
            ["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description, refresh=False)

    return model.get_metrics()
def get_model_predictions(model: Model,
                          instances: Iterable[Instance],
                          data_iterator: DataIterator,
                          cuda_device: int) -> (Dict[str, Any], List):
    model.eval()
    model_predictions = []

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        result = model(**batch)
        predictions = model.decode(result)
        model_predictions.extend(predictions["tags"])

    return model.get_metrics(), model_predictions
def get_model_from_file(archive_path, model_path, overrides=None, eval_suffix='', device=0):
    if archive_path.endswith('gz'):
        archive = load_archive(archive_path, device, overrides)
        config = archive.config
        prepare_environment(config)
        model = archive.model
        serialization_dir = os.path.dirname(archive_path)
    elif archive_path.endswith('yaml'):
        config = yaml_to_params(archive_path, overrides)
        prepare_environment(config)
        config_dir = os.path.dirname(archive_path)
        serialization_dir = os.path.join(config_dir, 'serialization')

        all_datasets = datasets_from_params(config)

        # We want to create the vocab from scratch since it might be of a
        # different type. Vocabulary.from_files will always create the base
        # Vocabulary instance.
        if os.path.exists(os.path.join(serialization_dir, "vocabulary")):
            vocab_path = os.path.join(serialization_dir, "vocabulary")
            vocab = Vocabulary.from_files(vocab_path)
        vocab = Vocabulary.from_params(config.pop('vocabulary'))
        model = Model.from_params(vocab=vocab, params=config.pop('model'))
        if model_path:
            best_model_state = torch.load(model_path)
            model.load_state_dict(best_model_state)

    # instances = all_datasets.get('test')
    iterator = DataIterator.from_params(config.pop("validation_iterator"))
    iterator.index_with(model.vocab)
    model.eval().to(device)
    model.evaluate_mode = True

    return model
def evaluate(model: Model,
             dataset: Dataset,
             iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    generator = iterator(dataset, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(generator, total=iterator.get_num_batches(dataset))
    for batch in generator_tqdm:
        tensor_batch = arrays_to_variables(batch, cuda_device, for_training=False)
        model.forward(**tensor_batch)  # stores TP/FN counts for get_metrics as a side-effect
        metrics = model.get_metrics()
        description = ', '.join(
            ["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description)

    return model.get_metrics(reset=True)
def from_params(  # type: ignore
        cls,
        params: Params,
        serialization_dir: str,
        recover: bool = False,
        cache_directory: str = None,
        cache_prefix: str = None,
) -> "MultiTaskTrainer":
    readers = {
        name: DatasetReader.from_params(reader_params)
        for name, reader_params in params.pop("train_dataset_readers").items()
    }
    train_file_paths = params.pop("train_file_paths").as_dict()

    datasets = {
        name: reader.read(train_file_paths[name])
        for name, reader in readers.items()
    }

    instances = (instance for dataset in datasets.values() for instance in dataset)
    vocab = Vocabulary.from_params(Params({}), instances=instances)
    model = Model.from_params(params.pop("model"), vocab=vocab)
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    mingler = DatasetMingler.from_params(params.pop("mingler"))

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    num_epochs = params.pop_int("num_epochs", 10)

    _ = params.pop("trainer", Params({}))
    params.assert_empty(__name__)
    return MultiTaskTrainer(model, serialization_dir, iterator, mingler,
                            optimizer, datasets, num_epochs)
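# Sketch of the configuration shape that the MultiTaskTrainer.from_params above
# consumes, inferred from the keys it pops; the reader/model/mingler type names and
# file paths below are placeholders rather than real registered components.
multi_task_params = Params({
    "train_dataset_readers": {"task_a": {"type": "task_a_reader"},      # hypothetical readers
                              "task_b": {"type": "task_b_reader"}},
    "train_file_paths": {"task_a": "data/task_a_train.json",             # hypothetical paths
                         "task_b": "data/task_b_train.json"},
    "model": {"type": "my_multitask_model"},                             # hypothetical model
    "iterator": {"type": "basic", "batch_size": 32},
    "mingler": {"type": "round-robin"},                                  # hypothetical mingler
    "optimizer": {"type": "adam", "lr": 0.001},
    "num_epochs": 10,
})
trainer = MultiTaskTrainer.from_params(multi_task_params, serialization_dir="runs/multitask")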
def evaluating(**params):
    param_is_exist(["model_file", "input_file", "include_package"], params)
    for package_name in params["include_package"]:
        import_submodules(package_name)

    cuda_device = params["cuda_device"] if "cuda_device" in params else -1
    overrides = params["overrides"] if "overrides" in params else ""
    weights_file = params["weights_file"] if "weights_file" in params else ""

    archive = load_archive(params["model_file"], cuda_device, overrides, weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = params["input_file"]
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, cuda_device, batch_weight_key="loss")

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    return metrics
def evaluate(model: Model,
             dataset: Dataset,
             iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    generator = iterator(dataset, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(generator, total=iterator.get_num_batches(dataset))
    for batch in generator_tqdm:
        tensor_batch = arrays_to_variables(batch, cuda_device, for_training=False)
        if 'metadata' in tensor_batch and 'metadata' not in signature(model.forward).parameters:
            del tensor_batch['metadata']
        model.forward(**tensor_batch)
        metrics = model.get_metrics()
        description = ', '.join(
            ["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description)

    return model.get_metrics()
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.evaluation_data_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator = DataIterator.from_params(config.pop("iterator"))
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    return metrics
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             output_file: str = None,
             file_mode="w",
             id_to_meta: Dict[str, Any] = {},
             feat_id_to_feat_name: Dict[int, str] = {}) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1, shuffle=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

    with ExitStack() as stack:
        if output_file is None:
            file_handle = None
        else:
            file_handle = stack.enter_context(open(output_file, file_mode))

        for batch in generator_tqdm:
            model_output = model(**batch)
            metrics = model.get_metrics()
            if file_handle:
                id2label = model.vocab.get_index_to_token_vocabulary("labels")
                _persist_data(file_handle, batch.get("metadata"), model_output,
                              id2label=id2label,
                              id_to_meta=id_to_meta,
                              feat_id_to_feat_name=feat_id_to_feat_name)
            description = ', '.join([
                "%s: %.2f" % (name, value) for name, value in metrics.items()
            ]) + " ||"
            generator_tqdm.set_description(description)

    return model.get_metrics()
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--input-file', type=str,
                        help='path to the file containing the evaluation data')
    parser.add_argument('--output-file', type=str, help='path to output file')
    parser.add_argument('--weights-file', type=str,
                        help='a path that overrides which weights file to use')
    parser.add_argument('--cuda-device', type=int, default=-1,
                        help='id of GPU to use (if any)')
    parser.add_argument('--overrides', type=str, default="",
                        help='a JSON structure used to override the experiment configuration')
    parser.add_argument('--include-package', type=str, default='')
    parser.add_argument('--archive-file', type=str)
    args = parser.parse_args()

    if '/' in args.weights_file:
        label_file = args.weights_file[:args.weights_file.rfind('/') + 1]
    else:
        label_file = ''
    label_file += (args.input_file[args.input_file.rfind('/') + 1: args.input_file.rfind('.')]
                   if '/' in args.input_file
                   else args.input_file[:args.input_file.rfind('.')])
    label_file += '_reallabel_guessedlabel.csv'
    print("Will write labels to " + label_file)
    print("Evaluating on " + args.input_file)
    print("Archive file being used is " + args.archive_file)
    print("Weights file being used is " + args.weights_file)
    print()

    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    if args.include_package.strip() != '':
        import_submodules(args.include_package)
    import_submodules("attn_tests_lib")
    import_submodules("textcat")

    if args.overrides != '':
        with open(args.overrides, 'r') as f:
            args.overrides = " ".join([l.strip() for l in f.readlines()])

    archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()
    if model._output_logit.get_output_dim() == 2:
        model.calculate_f1 = True
        model._f1 = F1Measure(1)

    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    new_param_dict = {'type': 'basic'}
    if 'batch_size' in iterator_params.params:
        new_param_dict['batch_size'] = iterator_params.params['batch_size']
    if 'maximum_samples_per_batch' in iterator_params.params:
        new_param_dict['maximum_samples_per_batch'] = iterator_params.params['maximum_samples_per_batch']
    iterator_params.params = new_param_dict
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device, label_file)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)
    print('\n' + json.dumps(metrics, indent=4))
    print("Successfully wrote labels to " + label_file)

    output_file = args.output_file
    if output_file:
        with open(output_file, "w") as file:
            json.dump(metrics, file, indent=4)
    return metrics
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    embedding_sources: Dict[str, str] = (json.loads(args.embedding_sources_mapping)
                                         if args.embedding_sources_mapping else {})
    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(Params({}), instances=instances)
        model.extend_embedder_vocab(embedding_sources)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    thrs = args.thresholds.replace("_", ",").split(",")
    for thr in thrs:
        model._temperature_threshold = float(thr)
        metrics = evaluate(model, instances, iterator, args.cuda_device, args.batch_weight_key)

        logger.info("Finished evaluating.")
        logger.info("Metrics:")
        for key, metric in metrics.items():
            logger.info("%s: %s: %s", thr, key, metric)

        output_file = args.output_file
        if output_file:
            with open(output_file + "_" + thr, "w") as file:
                json.dump(metrics, file, indent=4)

    return metrics
"The loaded model seems not to be an am-parser (GraphDependencyParser)" ) # Load the evaluation data # Try to use the validation dataset reader if there is one - otherwise fall back # to the default dataset_reader used for both training and validation. validation_dataset_reader_params = config.pop('validation_dataset_reader', None) if validation_dataset_reader_params is not None: dataset_reader = DatasetReader.from_params( validation_dataset_reader_params) else: dataset_reader = DatasetReader.from_params(config.pop('dataset_reader')) instances = dataset_reader.read([[args.formalism, args.input_file]]) iterator_params = config.pop("validation_iterator", None) if iterator_params is None: iterator_params = config.pop("iterator") iterator = DataIterator.from_params(iterator_params) iterator.index_with(model.vocab) metrics = evaluate(model, instances, iterator, args.cuda_device, args.batch_weight_key) logger.info("Finished evaluating.") logger.info("Metrics:") for key, metric in metrics.items(): logger.info("%s: %s", key, metric)
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            ############ Uncomment this block to save class_probabilities, logits, and losses for each batch #########
            # print(output_dict['class_probabilities'].shape)
            # import copy
            #
            # newoutput_dict = copy.deepcopy(output_dict)
            # newoutput_dict['class_probabilities'] = newoutput_dict['class_probabilities'].cpu().data.numpy()
            # newoutput_dict['logits'] = newoutput_dict['logits'].cpu().data.numpy()
            # newoutput_dict['loss'] = newoutput_dict['loss'].cpu().data.numpy()
            #
            # output_file = os.path.join(os.path.dirname(__file__), '..', "data", "test",
            #                            str(batch_count) + "_output.pkl")
            # import json
            # import pickle
            # if output_file:
            #     with open(output_file, "wb") as file:
            #         pickle.dump(newoutput_dict, file)
            #     file.close()
            ###########################################################################################################

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if (not HasBeenWarned.tqdm_ignores_underscores and
                    any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                HasBeenWarned.tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.4f" % (name, value)
                                     for name, value in metrics.items()
                                     if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes " +
                                   "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
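# Hedged sketch of wiring the evaluate variant above by hand, using the pre-1.0
# AllenNLP imports seen throughout these snippets; the archive and data paths are
# placeholders. Passing batch_weight_key="" gives every batch a weight of 1.0,
# as the loop above shows.
from allennlp.data import DatasetReader
from allennlp.data.iterators import DataIterator
from allennlp.models.archival import load_archive

archive = load_archive("model.tar.gz", cuda_device=-1)               # hypothetical archive
reader = DatasetReader.from_params(archive.config.pop("dataset_reader"))
test_instances = reader.read("data/test.jsonl")                      # hypothetical data file
iterator = DataIterator.from_params(archive.config.pop("iterator"))
iterator.index_with(archive.model.vocab)
metrics = evaluate(archive.model, test_instances, iterator, cuda_device=-1, batch_weight_key="")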
def main(): """The main entry point This is the main entry point for training HAN SOLO models. Usage:: ${PYTHONPATH} -m AttentionSegmentation/main --config_file ${CONFIG_FILE} """ args = get_arguments() # Setup Experiment Directory config = read_from_config_file(args.config_file) if args.seed > 0: np.random.seed(args.seed) torch.manual_seed(args.seed) if config.get('trainer', None) is not None and \ config.get('trainer', None).get('cuda_device', -1) > 0: torch.cuda.manual_seed(args.seed) serial_dir, config = setup_output_dir(config, args.loglevel) logger = logging.getLogger(__name__) # Load Training Data TRAIN_PATH = config.pop("train_data_path") logger.info("Loading Training Data from {0}".format(TRAIN_PATH)) dataset_reader_params = config.pop("dataset_reader") reader_type = dataset_reader_params.pop("type", None) assert reader_type is not None and hasattr(Readers, reader_type),\ f"Cannot find reader {reader_type}" reader = getattr(Readers, reader_type).from_params(dataset_reader_params) instances_train = reader.read(file_path=TRAIN_PATH) instances_train = instances_train logger.info("Length of {0}: {1}".format( "Training Data", len(instances_train))) # Load Validation Data VAL_PATH = config.pop("validation_data_path") logger.info("Loading Validation Data from {0}".format(VAL_PATH)) instances_val = reader.read(VAL_PATH) instances_val = instances_val logger.info("Length of {0}: {1}".format( "Validation Data", len(instances_val))) # Load Test Data TEST_PATH = config.pop("test_data_path", None) instances_test = None if TEST_PATH is not None: logger.info("Loading Test Data from {0}".format(TEST_PATH)) instances_test = reader.read(TEST_PATH) instances_test = instances_test logger.info("Length of {0}: {1}".format( "Testing Data", len(instances_test))) # # Load Pretrained Existing Model # load_config = config.pop("load_from", None) # # Construct Vocabulary vocab_size = config.pop("max_vocab_size", -1) logger.info("Constructing Vocab of size: {0}".format(vocab_size)) vocab_size = None if vocab_size == -1 else vocab_size vocab = Vocabulary.from_instances(instances_train, max_vocab_size=vocab_size) vocab_dir = os.path.join(serial_dir, "vocab") assert os.path.exists(vocab_dir), "Couldn't find the vocab directory" vocab.save_to_files(vocab_dir) # if load_config is not None: # # modify the vocab from the source model vocab # src_vocab_path = load_config.pop("vocab_path", None) # if src_vocab_path is not None: # vocab = construct_vocab(src_vocab_path, vocab_dir) # # Delete the old vocab # for file in os.listdir(vocab_dir): # os.remove(os.path.join(vocab_dir, file)) # # save the new vocab # vocab.save_to_files(vocab_dir) logger.info("Saving vocab to {0}".format(vocab_dir)) logger.info("Vocab Construction Done") # # Construct the data iterators logger.info("Constructing Data Iterators") data_iterator = DataIterator.from_params(config.pop("iterator")) data_iterator.index_with(vocab) logger.info("Data Iterators Done") # Create the model logger.info("Constructing The model") model_params = config.pop("model") model_type = model_params.pop("type") assert model_type is not None and hasattr(Models, model_type),\ f"Cannot find reader {model_type}" model = getattr(Models, model_type).from_params( vocab=vocab, params=model_params, label_indexer=reader.get_label_indexer() ) logger.info("Model Construction done") # visualize = config.pop("visualize", False) # visualizer = None # if visualize: # visualizer = html_visualizer(vocab, reader) segmenter_params = config.pop("segmentation") segment_class = 
segmenter_params.pop("type") segmenter = getattr(SegmentationModels, segment_class).from_params( vocab=vocab, reader=reader, params=segmenter_params ) logger.info("Segmenter Done") # if load_config is not None: # # Load the weights, as specified by the load_config # model_path = load_config.pop("model_path", None) # layers = load_config.pop("layers", None) # load_config.assert_empty("Load Config") # assert model_path is not None,\ # "You need to specify model path to load from" # model = load_model_from_existing(model_path, model, layers) # logger.info("Pretrained weights loaded") # logger.info("Starting the training process") trainer = Trainer.from_params( model=model, base_dir=serial_dir, iterator=data_iterator, train_data=instances_train, validation_data=instances_val, segmenter=segmenter, params=config.pop("trainer") ) trainer.train() logger.info("Training Done.") if instances_test is not None: logger.info("Computing final Test Accuracy") trainer.test(instances_test) logger.info("Done.")
def evaluate_from_args(args: argparse.Namespace, func_eval=None) -> Dict[str, Any]:
    # USAGE:
    # docqa/run.py
    # evaluate_custom
    # --archive_file
    # _trained_models/qanet_semantic_flat_concat_sdp_debug/model.tar.gz
    # --evaluation_data_file
    # /Users/mihaylov/research/document-parsing-pipeline/tests/fixtures/data/narrativeqa/third_party/wikipedia/summaries-all.csv.parsed.jsonl.srl.jsonl.with_q_spans.jsonl.with_exp.with_sdp.json.train.2
    # --output_file
    # predictions_dev.json
    # --batch_size=1
    # --item_ids
    # "00936497f5884881f1df23f4834f6739552cee8b##016[15:30];00936497f5884881f1df23f4834f6739552cee8b##005[17:45];0029bdbe75423337b551e42bb31f9a102785376f##023"
    # --output_attention
    # True

    if func_eval is None:
        func_eval = evaluate

    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    logging.info("Parameters:")
    for arg in vars(args):
        logging.info("{0}: {1}".format(arg, getattr(args, arg)))

    # Load from archive
    cuda_device = args.cuda_device
    output_attention = args.output_attention.lower() == "true"
    display_attention_matplot = args.display_attention_matplot.lower() == "true"

    # selected ids to validate
    item_ids = []
    item_ids_with_range = {}
    if args.item_ids:
        separator = ";"
        item_ids_str = args.item_ids
        item_ids = item_ids_str.split(separator)
        for item_id in item_ids:
            if "[" in item_id and "]" in item_id:
                tokens_range = [int(x) for x in
                                item_id.split("[")[-1].replace("]", "").split(":")]
                item_id_only = item_id.split("[")[0]
                item_ids_with_range[item_id_only] = {"attention_range": tokens_range}
            else:
                item_ids_with_range[item_id] = None
        item_ids = set(item_ids_with_range.keys())

    logging.info("cuda_device:{0}".format(cuda_device))
    archive = load_archive(args.archive_file, cuda_device=cuda_device, overrides=args.overrides)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    if output_attention:
        if hasattr(model, "return_output_metadata"):
            model.return_output_metadata = output_attention
        else:
            raise Exception(
                "Model {0} does not support output of the attention weights!".format(model))

    # Load the evaluation data
    dataset_reader = DatasetReader.from_params(
        config.pop('validation_dataset_reader')
        if "validation_dataset_reader" in config
        else config.pop('dataset_reader'))

    feat_id_to_feat_name = {}
    if hasattr(dataset_reader, "_semantic_views_extractor"):
        feat_id_to_feat_name = dataset_reader._semantic_views_extractor.get_vocab_feats_id2name()

    evaluation_data_path = args.evaluation_data_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)

    batch_size = args.batch_size
    start_id = args.start_id
    end_id = args.end_id

    dataset = dataset_reader.read(evaluation_data_path)
    file_mode = args.file_open_mode

    if start_id > 0 or end_id > 0:
        if not isinstance(dataset, list):
            raise ValueError("dataset must be list when start_id and end_id are set")
        start_id = max(start_id, 0)
        if end_id <= 0:
            end_id = len(dataset)
        dataset = dataset[start_id:end_id]

    selected_dataset = []
    if len(item_ids) > 0:
        for item in dataset:
            item_id = item.fields["metadata"]["id"]
            if item_id in item_ids:
                selected_dataset.append(item)
            else:
                del item
        dataset = selected_dataset

    iterator_config = config.pop('validation_iterator') \
        if "validation_iterator" in config else config.pop('iterator')

    if batch_size > -1:
        if "base_iterator" in iterator_config:
            iterator_config["base_iterator"]["batch_size"] = batch_size
        else:
            iterator_config["batch_size"] = batch_size

    iterator = DataIterator.from_params(iterator_config)
    iterator.index_with(model.vocab)
    metrics = func_eval(model, dataset, iterator, args.output_file,
                        file_mode=file_mode,
                        id_to_meta=item_ids_with_range,
                        feat_id_to_feat_name=feat_id_to_feat_name)

    if args.output_file:
        absolute_path = os.path.abspath(args.output_file)
        logging.info("Output saved to \n{}".format(absolute_path))
        with open(absolute_path + ".id2featname", mode="w") as fp:
            json.dump(feat_id_to_feat_name, fp)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    return metrics
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             serialization_dir: str,
             eval_suffix: str,
             batch_weight_key: str) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    nlp = spacy.load("en_core_web_lg")
    assert not os.path.exists(os.path.join(serialization_dir, f'generations{eval_suffix}.jsonl'))

    # caching saves us extra 30 minutes
    if 'goodnews' in serialization_dir:
        cache_path = 'data/goodnews/evaluation_cache.pkl'
    elif 'nytimes' in serialization_dir:
        cache_path = 'data/nytimes/evaluation_cache.pkl'
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            cache = pickle.load(f)
    else:
        cache = {}

    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            write_to_json(output_dict, serialization_dir, nlp, eval_suffix, cache)

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if (not HasBeenWarned.tqdm_ignores_underscores and
                    any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                HasBeenWarned.tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value)
                                     for name, value in metrics.items()
                                     if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            # if loss_count != batch_count:
            #     raise RuntimeError("The model you are trying to evaluate only sometimes " +
            #                        "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        if not os.path.exists(cache_path):
            with open(cache_path, 'wb') as f:
                pickle.dump(cache, f)

        return final_metrics
dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
evaluation_data_path = args.input_file

embedding_sources: Dict[str, str] = (json.loads(args.embedding_sources_mapping)
                                     if args.embedding_sources_mapping else {})

if args.extend_vocab:
    logger.info("Vocabulary is being extended with test instances.")
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)
    model.vocab.extend_from_instances(Params({}), instances=instances)
    model.extend_embedder_vocab(embedding_sources)

formalism = args.formalism
# we need to give the formalism to amconll dataset_reader
instances = dataset_reader.read([[formalism, args.input_file]])

model.train(False)
data_iterator = DataIterator.from_params(config.pop('iterator'))

with open(args.input_file) as f:
    conll_sentences = list(amconll_tools.parse_amconll(f))

predictions = dataset_reader.restore_order(forward_on_instances(model, instances, data_iterator))

i2edge_label = [model.vocab.get_token_from_index(i, namespace=formalism + "_head_tags")
                for i in range(model.vocab.get_vocab_size(formalism + "_head_tags"))]

i2supertag = [model.vocab.get_token_from_index(i, namespace=formalism + "_supertag_labels")
              for i in range(model.vocab.get_vocab_size(formalism + "_supertag_labels"))]

lexlabel2i = {model.vocab.get_token_from_index(i, namespace=formalism + "_lex_labels"): i
              for i in range(model.vocab.get_vocab_size(formalism + "_lex_labels"))}
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data
    dataset_reader_config = config.pop('dataset_reader')
    if "evaluator_type" in config:
        eval_type = config.get("evaluator_type")
    else:
        dataset_reader_type = dataset_reader_config.get("type")
        eval_type = dataset_reader_type

    dataset_reader = DatasetReader.from_params(dataset_reader_config)

    evaluation_data_paths_list = []
    evaluation_data_short_names = []
    output_files_list = args.output_file.split(";")

    if args.evaluation_data_file:
        evaluation_data_paths_list.append(args.evaluation_data_file)
        evaluation_data_short_names.append("input")
    else:
        if "validation_data_path" in config:
            evaluation_data_paths_list.append(config["validation_data_path"])
            evaluation_data_short_names.append("dev")

        if "test_data_path" in config:
            evaluation_data_paths_list.append(config["test_data_path"])
            evaluation_data_short_names.append("test")

    metrics_out = {}
    iterator = DataIterator.from_params(config.pop("iterator"))
    iterator.index_with(model.vocab)

    for i in range(len(evaluation_data_paths_list)):
        evaluation_data_path = evaluation_data_paths_list[i]
        evaluation_data_short_name = evaluation_data_path \
            if len(evaluation_data_short_names) - 1 < i \
            else evaluation_data_short_names[i]

        if len(output_files_list) == len(evaluation_data_paths_list):
            out_file = output_files_list[i]
        else:
            out_file = "{0}_{1}.txt".format(output_files_list[0], evaluation_data_short_name)

        logger.info("Reading evaluation data from %s", evaluation_data_path)
        dataset = dataset_reader.read(evaluation_data_path)

        metrics = evaluate(model, dataset, iterator, args.cuda_device, out_file, eval_type)
        if out_file is not None:
            logging.info("Predictions exported to {0}".format(out_file))

        logger.info("Finished evaluating.")
        logger.info("Metrics:")
        for key, metric in metrics.items():
            logger.info("%s: %s", key, metric)

        if len(evaluation_data_paths_list) == 1:
            metrics_out = metrics
        else:
            metrics_out[evaluation_data_short_name] = metrics

    return metrics_out
def evaluate(model: Model,
             instances: Iterable[Instance],
             task_name: str,
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    """
    Evaluate a model for a particular task (usually after training).

    Parameters
    ----------
    model : ``allennlp.models.model.Model``, required
        The model to evaluate.
    instances : ``Iterable[Instance]``, required
        The (usually test) dataset on which to evaluate the model.
    task_name : ``str``, required
        The name of the task on which to evaluate the model.
    data_iterator : ``DataIterator``
        Iterator that goes through the dataset.
    cuda_device : ``int``
        Cuda device to use.

    Returns
    -------
    metrics : ``Dict[str, Any]``
        A dictionary containing the metrics on the evaluated dataset.
    """
    from train import TASKS_NAME

    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
        eval_loss = 0
        nb_batches = 0
        for tensor_batch in generator_tqdm:
            nb_batches += 1

            # train_stages = ["stm", "sd", "valid"]
            # task_index = TASKS_NAME.index(task_name)
            # tensor_batch['task_index'] = torch.tensor(task_index)
            # tensor_batch["reverse"] = torch.tensor(False)
            # tensor_batch['for_training'] = torch.tensor(False)
            # train_stage = train_stages.index("stm")
            # tensor_batch['train_stage'] = torch.tensor(train_stage)
            # tensor_batch = move_to_device(tensor_batch, 0)
            # print(model)
            # print(tensor_batch.keys())

            # tensor_batch, task_name: str, epoch_trained=None, reverse=False, for_training=False
            eval_output_dict = model.forward(tensor_batch, task_name, for_training=False)
            loss = eval_output_dict["loss"]
            eval_loss += loss.item()
            metrics = model.get_metrics(task_name=task_name)
            metrics["stm_loss"] = float(eval_loss / nb_batches)

            description = training_util.description_from_metrics(metrics)
            generator_tqdm.set_description(description, refresh=False)

        metrics = model.get_metrics(task_name=task_name, reset=True)
        metrics["stm_loss"] = float(eval_loss / nb_batches)
        return metrics
def evaluate_from_file(archive_path, model_path, overrides=None, eval_suffix='', device=0):
    if archive_path.endswith('gz'):
        archive = load_archive(archive_path, device, overrides)
        config = archive.config
        prepare_environment(config)
        model = archive.model
        serialization_dir = os.path.dirname(archive_path)
    elif archive_path.endswith('yaml'):
        config = yaml_to_params(archive_path, overrides)
        prepare_environment(config)
        config_dir = os.path.dirname(archive_path)
        serialization_dir = os.path.join(config_dir, 'serialization')
        all_datasets = datasets_from_params(config)

        # We want to create the vocab from scratch since it might be of a
        # different type. Vocabulary.from_files will always create the base
        # Vocabulary instance.
        # if os.path.exists(os.path.join(serialization_dir, "vocabulary")):
        #     vocab_path = os.path.join(serialization_dir, "vocabulary")
        #     vocab = Vocabulary.from_files(vocab_path)
        vocab = Vocabulary.from_params(config.pop('vocabulary'))
        model = Model.from_params(vocab=vocab, params=config.pop('model'))
        if model_path:
            best_model_state = torch.load(model_path)
            model.load_state_dict(best_model_state)

    instances = all_datasets.get('test')
    iterator = DataIterator.from_params(config.pop("validation_iterator"))
    iterator.index_with(model.vocab)
    model.eval().to(device)
    model.evaluate_mode = True

    metrics = evaluate(model, instances, iterator, device, serialization_dir,
                       eval_suffix, batch_weight_key='')

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)
        print("%s: %s" % (key, metric))

    output_file = os.path.join(serialization_dir, f"evaluate-metrics3{eval_suffix}.json")
    print(metrics)
    if output_file:
        try:
            with open(output_file, "w") as file:
                json.dump(metrics, file, indent=4)
        except FileNotFoundError as file_not_found:
            print(file_not_found)
            with open(f"evaluate-metrics3{eval_suffix}.json", "w") as file:
                json.dump(metrics, file, indent=4)
            print(f"Written eval metrics to {os.path.curdir}")
    return metrics
def main(): parser = arg_parser() args = parser.parse_args() temp_value = "1._1._1._1." optimizer = args.optimizer n_epochs = 10000 if optimizer == 'adam': optimizer = Adam_optimizer(n_epochs) elif optimizer == 'lbfgs': optimizer = LBFGS_optimizer(n_epochs) else: print("Illegal optimizer {}, must be one of (adam, lbfgs)") return -2 temp_value_list = [float(x) for x in temp_value.split("_")] n_layers = len(temp_value_list) model_file = args.model_file serialization_dir = "/".join(args.model_file.split("/")[:-1]) evaluation_data_path = args.dev_file cuda_device = args.cuda_device # output file overrides = "{ iterator: {batch_size: 1}, model: {temperature_threshold: 1, scaling_temperature: '" + temp_value + "'}}" archive = load_archive(serialization_dir, cuda_device, overrides, model_file) config = archive.config model = archive.model if cuda_device >= 0: model = model.cuda() model._scaling_temperatures = temp_value_list model.eval() vocab = model.vocab validation_dataset_reader_params = config.pop("validation_dataset_reader", None) if validation_dataset_reader_params is not None: dataset_reader = DatasetReader.from_params( validation_dataset_reader_params) else: dataset_reader = DatasetReader.from_params( config.pop("dataset_reader")) instances = dataset_reader.read(evaluation_data_path) iterator_params = config.pop("iterator") data_iterator = DataIterator.from_params(iterator_params) data_iterator.index_with(vocab) iterator = data_iterator(instances, num_epochs=1, shuffle=False) generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances)) temp = set_temperature(model, n_layers, optimizer, generator_tqdm, cuda_device) print("\n\n######################################\n") print("# Finished computing temperatures. #\n") print("######################################\n") print("Values are: {}".format("_".join([str(x) for x in temp]))) return 0
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    logging.info("Parameters:")
    for arg in vars(args):
        logging.info("{0}: {1}".format(arg, getattr(args, arg)))

    # Load from archive
    cuda_device = args.cuda_device
    logging.info("cuda_device:{0}".format(cuda_device))
    archive = load_archive(args.archive_file, cuda_device=cuda_device, overrides=args.overrides)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data
    dataset_reader = DatasetReader.from_params(
        config.pop('validation_dataset_reader')
        if "validation_dataset_reader" in config
        else config.pop('dataset_reader'))
    evaluation_data_path = args.evaluation_data_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)

    batch_size = args.batch_size
    start_id = args.start_id
    end_id = args.end_id

    dataset = dataset_reader.read(evaluation_data_path)
    file_mode = args.file_open_mode

    if start_id > 0 or end_id > 0:
        if not isinstance(dataset, list):
            raise ValueError("dataset must be list when start_id and end_id are set")
        start_id = max(start_id, 0)
        if end_id <= 0:
            end_id = len(dataset)
        dataset = dataset[start_id:end_id]

    iterator_config = config.pop('validation_iterator') \
        if "validation_iterator" in config else config.pop('iterator')

    if batch_size > -1:
        if "base_iterator" in iterator_config:
            iterator_config["base_iterator"]["batch_size"] = batch_size
        else:
            iterator_config["batch_size"] = batch_size

    iterator = DataIterator.from_params(iterator_config)
    iterator.index_with(model.vocab)
    metrics = evaluate(model, dataset, iterator, args.output_file, file_mode=file_mode)

    if args.output_file:
        absolute_path = os.path.abspath(args.output_file)
        logging.info("Output saved to \n{}".format(absolute_path))

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    return metrics
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if (not _warned_tqdm_ignores_underscores and
                    any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value)
                                     for name, value in metrics.items()
                                     if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes " +
                                   "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
def evaluate_from_args(args):
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)
    print(args.version)

    # Load from archive
    if args.unziped_archive_directory != "default":
        if args.elmo == True:
            model, config = _load_elmo(args.unziped_archive_directory, args.archive_file,
                                       weights_file=None, cuda_device=args.cuda_device)
        else:
            model, config = _load(args.unziped_archive_directory,
                                  weights_file=None, cuda_device=args.cuda_device)
    else:
        archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
        config = archive.config
        prepare_environment(config)
        model = archive.model
    model.eval()

    # Load the evaluation data
    # Load the evaluation dataset for multilingual evaluation.
    # The validation dataset reader comes from the archived model,
    # so you need to reload the multilingual file.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = SQuADReaderML.from_params(config.pop('dataset_reader'))

    dataset_reader.set_nmt_models_resources(
        args.trans_embedding_model, args.trans_encdec_model,
        args.trans_train_source, args.trans_train_target,
        args.use_question_tag, args.replace_UNK, args.version,
        args.online_trans, args.beam, args.soft)
    if args.language == "Fr":
        dataset_reader.set_squad_test_resources_fr()
    elif args.language == "Ja":
        dataset_reader.set_squad_test_resources_ja()
    dataset_reader.set_google_translate_mode(args.use_google_translate)
    dataset_reader.set_bing_translate_mode(args.use_bing_translate)

    evaluation_data_path = args.evaluation_data_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    # TODO: Fix this file path argument, it is not used and misleading.
    instances = dataset_reader.read("inputdata_tmp/ja_question_v2.csv")
    iterator = DataIterator.from_params(config.pop("iterator"))
    iterator.index_with(model.vocab)

    if args.use_google_translate:
        metrics = evaluate_mlqa_google_translate(model, instances, iterator,
                                                 args.cuda_device, args.language, args.version)
    elif args.use_bing_translate:
        metrics = evaluate_mlqa_bing_translate(model, instances, iterator, args.cuda_device,
                                               args.language, args.version, "bing",
                                               args.back_trans_bing)
    else:
        if args.back_trans_ours == True:
            metrics = evaluate_mlqa_back_trans_ours(
                model, instances, iterator, args.cuda_device, args.language,
                args.version, args.enja_emb, args.enja_encdec,
                args.enja_train_source, args.enja_train_target)
        else:
            metrics = evaluate_mlqa(model, instances, iterator, args.cuda_device,
                                    args.language, args.version,
                                    args.trans_embedding_model, args.beam, args.soft)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    return metrics
# Load the evaluation data

# Try to use the validation dataset reader if there is one - otherwise fall back
# to the default dataset_reader used for both training and validation.
validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
if validation_dataset_reader_params is not None:
    dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
else:
    dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
evaluation_data_path = args.input_file
logger.info("Reading evaluation data from %s", evaluation_data_path)
instances = dataset_reader.read(evaluation_data_path)

config['iterator']['type'] = 'basic'
del config['iterator']['sorting_keys']
data_iterator = DataIterator.from_params(config.pop("iterator"))
data_iterator.index_with(model.vocab)
cuda_device = args.cuda_device

#### EVALUATION HERE
model.eval()

iterator = data_iterator(instances, num_epochs=1, shuffle=False,
                         cuda_device=cuda_device, for_training=False)
logger.info("Iterating over dataset")
generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
label_probs = []
for batch in generator_tqdm:
    lol = model(**batch)
def from_params(params: Params, serialization_dir: str, recover: bool = False) -> 'TrainerPieces':
    all_datasets = multitask_datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    if recover and os.path.exists(os.path.join(serialization_dir, "vocabulary")):
        vocab = Vocabulary.from_files(os.path.join(serialization_dir, "vocabulary"))
        params.pop("vocabulary", {})
    else:
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
        )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # If vocab extension is ON for training, embedding extension should also be
    # done. If vocab and embeddings are already in sync, it would be a no-op.
    model.extend_embedder_vocab()

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(model.vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(model.vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    return MultiTaskTrainerPieces(model, iterator,
                                  train_data, validation_data, test_data,
                                  validation_iterator, trainer_params)
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if (not HasBeenWarned.tqdm_ignores_underscores and
                    any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                HasBeenWarned.tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value)
                                     for name, value in metrics.items()
                                     if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes " +
                                   "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
def main():
    parse = argparse.ArgumentParser("")
    parse.add_argument("command", type=str,
                       help="one of the following options train, evaluate, generalize")
    parse.add_argument("--datasets", type=str, help="", default=None)
    parse.add_argument("--model", type=str, help="", default=None)
    parse.add_argument("--serialization_dir", type=str,
                       help="the directory storing the intermediate files and output",
                       default=None)
    parse.add_argument("--cuda_device", type=str, help="Cuda device ID", default="-1")
    parse.add_argument("--split", type=str, help="dev / test", default="dev")
    parse.add_argument("--bert_type", type=str, help="Base / Large /", default="Base")
    parse.add_argument("--config", type=str, help="dev / test", default=None)
    parse.add_argument("--output_path", type=str,
                       help="directory to which results JSONs of eval will written",
                       default='results/eval/')
    parse.add_argument("--models_dir", type=str,
                       help="directory containing the models used for eval , (please add '/' at the end)",
                       default=None)
    parse.add_argument("--data_dir", type=str,
                       help="directory containing the multiqa format datasets , (please add '/' at the end and make sure to have a headers directory with all headers under your specified path)",
                       default='https://multiqa.s3.amazonaws.com/data/')
    parse.add_argument("--t_total", type=str,
                       help="used for training, see BERT's learning rate schedule for details",
                       default=None)
    parse.add_argument("--sample_size", type=str,
                       help="used for sampling a subset of the training data", default=-1)
    parse.add_argument("--validation_sample_size", type=str,
                       help="used for sampling a subset of the training data", default=-1)
    parse.add_argument("--batch_size", type=str, help="the batch size", default=8)
    parse.add_argument("--max_instances_in_memory", type=str,
                       help="max number instances in memrory during training", default=5000)
    parse.add_argument("--num_epochs", type=str, help="", default=2)
    parse.add_argument("--lr", type=str, help="learning rate", default=0.00003)
    args = parse.parse_args()

    import_submodules("models")

    # TODO add best config for specific datasets as default, not one general default...
    if args.config is None:
        config = 'models/MultiQA_BERT' + args.bert_type + '.jsonnet'
    else:
        config = args.config
    config_params = Params(json.loads(_jsonnet.evaluate_file(config)))

    if args.command == 'train':
        # building the default dataset urls
        train_datasets = [args.data_dir + dataset + '_train.jsonl.gz'
                          for dataset in args.datasets.split(',')]
        val_datasets = [args.data_dir + dataset + '_' + args.split + '.jsonl.gz'
                        for dataset in args.datasets.split(',')]

        # calculating the t_total
        if args.t_total == None:
            logging.info('getting headers of the chosen dataset in order to compute learning rate schedule t_total')
            total_number_of_examples = 0
            for header_url in [args.data_dir + 'headers/' + dataset + '_train.json'
                               for dataset in args.datasets.split(',')]:
                with open(cached_path(header_url), 'r') as f:
                    header = json.load(f)
                total_number_of_examples += header['number_of_qas']

            t_total = int(total_number_of_examples / float(config_params['iterator']['batch_size'])
                          * float(config_params['trainer']['num_epochs'])) \
                / len(args.cuda_device.split(','))

        if args.serialization_dir is None:
            serialization_dir = 'models/' + args.datasets.replace(',', '_') \
                + f"num_epochs_{args.num_epochs}_batch_size_{args.batch_size}_lr_{args.lr}"
        else:
            serialization_dir = args.serialization_dir

        print(" >>>>>>>> overriding the parameters <<<<<<<<<<< ")
        overrides = {
            'train_data_path': ','.join(train_datasets),
            'validation_data_path': ','.join(val_datasets),
            'dataset_reader': {
                'sample_size': args.sample_size,
            },
            'validation_dataset_reader': {
                'sample_size': args.validation_sample_size,
            },
            'iterator': {
                'batch_size': args.batch_size,
                'max_instances_in_memory': args.max_instances_in_memory,
            },
            'trainer': {
                'cuda_device': args.cuda_device,
                'num_epochs': args.num_epochs,
                'optimizer': {
                    't_total': t_total,
                    'lr': args.lr,
                }
            }
        }
        overrides_str = str(overrides).replace('True', 'true').replace('False', 'false')

        train_model_from_file(config, serialization_dir, overrides_str, True, False, True, "", "")

    elif args.command == 'evaluate':
        print(" evaluate . . . ")
        if args.models_dir is None:
            model_path = 'https://multiqa.s3.amazonaws.com/models/BERT' + args.bert_type + '/' + args.model + '.tar.gz'
        else:
            model_path = args.models_dir + args.model + '.tar.gz'
        model_cached_path = cached_path(model_path)

        print(" loading models . . . .")
        overrides_str = ''
        # Load from archive
        archive = load_archive(model_cached_path, int(args.cuda_device), overrides_str, '')
        prepare_environment(config_params)
        model = archive.model
        model.eval()

        print(" loading data . . . .")
        # Load the evaluation data
        validation_dataset_reader_params = config_params.get('validation_dataset_reader', None)
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
        # print(" * * * * * * * ")
        # print(validation_dataset_reader_params)
        # print(dataset_reader)

        print(" looping over datasets . . . .")
        # running over all validation datasets specified
        val_dataset_names = args.datasets.split(',')
        val_datasets = [args.data_dir + dataset + '_' + args.split + '.jsonl.gz'
                        for dataset in val_dataset_names]
        for val_dataset_path, val_dataset_name in zip(val_datasets, val_dataset_names):
            print(f" * * * val_dataset_name: {val_dataset_name}")

            # This is a bit strange but there is a lot of config "popping" going on implicitly
            # in allennlp so we need to have the full config reloaded every iteration...
            config_params = Params(json.loads(_jsonnet.evaluate_file(config)))

            print("Reading evaluation data from %s", val_dataset_path)
            logger.info("Reading evaluation data from %s", val_dataset_path)
            instances = dataset_reader.read(val_dataset_path)

            # loading iterator
            iterator_params = config_params.get("validation_iterator", None)
            iterator = DataIterator.from_params(iterator_params)
            iterator.index_with(model.vocab)

            metrics = evaluate(model, instances, iterator, int(args.cuda_device), '')

            logger.info("Finished evaluating " + val_dataset_name)
            print("Finished evaluating " + val_dataset_name)
            logger.info("Metrics:")
            for key, metric in metrics.items():
                logger.info("%s: %s", key, metric)

            if not os.path.exists(args.output_path):
                os.makedirs(args.output_path)
            output_path = args.output_path + args.model + '_BERT' + args.bert_type + '_eval-on_' \
                + val_dataset_name + '_' + args.split + '.json'
            with open(output_path, "w") as file:
                json.dump(metrics, file, indent=4)

        return metrics

    elif args.command == 'generalize':
        logging.error('The command %s is not yet supported' % (args.command))
    else:
        logging.error('The command %s is not supported' % (args.command))