def extend_from_instances(self,
                          params: Params,
                          instances: Iterable['adi.Instance'] = ()) -> None:
    """
    Extends an already generated vocabulary using a collection of instances.
    """
    min_count = params.pop("min_count", None)
    max_vocab_size = pop_max_vocab_size(params)
    non_padded_namespaces = params.pop("non_padded_namespaces", DEFAULT_NON_PADDED_NAMESPACES)
    pretrained_files = params.pop("pretrained_files", {})
    min_pretrained_embeddings = params.pop("min_pretrained_embeddings", None)
    only_include_pretrained_words = params.pop_bool("only_include_pretrained_words", False)
    tokens_to_add = params.pop("tokens_to_add", None)
    params.assert_empty("Vocabulary - from dataset")

    logger.info("Fitting token dictionary from dataset.")
    namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for instance in Tqdm.tqdm(instances):
        instance.count_vocab_items(namespace_token_counts)
    self._extend(counter=namespace_token_counts,
                 min_count=min_count,
                 max_vocab_size=max_vocab_size,
                 non_padded_namespaces=non_padded_namespaces,
                 pretrained_files=pretrained_files,
                 only_include_pretrained_words=only_include_pretrained_words,
                 tokens_to_add=tokens_to_add,
                 min_pretrained_embeddings=min_pretrained_embeddings)
@classmethod
def from_instances(cls,
                   instances: Iterable['adi.Instance'],
                   min_count: Dict[str, int] = None,
                   max_vocab_size: Union[int, Dict[str, int]] = None,
                   non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
                   pretrained_files: Optional[Dict[str, str]] = None,
                   only_include_pretrained_words: bool = False,
                   tokens_to_add: Dict[str, List[str]] = None,
                   min_pretrained_embeddings: Dict[str, int] = None) -> 'Vocabulary':
    """
    Constructs a vocabulary given a collection of `Instances` and some parameters.
    We count all of the vocabulary items in the instances, then pass those counts
    and the other parameters to :func:`__init__`.  See that method for a
    description of what the other parameters do.
    """
    logger.info("Fitting token dictionary from dataset.")
    namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for instance in Tqdm.tqdm(instances):
        instance.count_vocab_items(namespace_token_counts)

    return cls(counter=namespace_token_counts,
               min_count=min_count,
               max_vocab_size=max_vocab_size,
               non_padded_namespaces=non_padded_namespaces,
               pretrained_files=pretrained_files,
               only_include_pretrained_words=only_include_pretrained_words,
               tokens_to_add=tokens_to_add,
               min_pretrained_embeddings=min_pretrained_embeddings)
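# A minimal usage sketch of from_instances, assuming an AllenNLP-style setup.
# The reader class, file path, and min_count values below are illustrative
# placeholders, not taken from the snippet above.
from allennlp.data import Vocabulary
from allennlp.data.dataset_readers import SequenceTaggingDatasetReader

reader = SequenceTaggingDatasetReader()
train_instances = list(reader.read("path/to/train.tsv"))
# Keep only tokens appearing at least 3 times in the "tokens" namespace.
vocab = Vocabulary.from_instances(train_instances, min_count={"tokens": 3})
print(vocab.get_vocab_size("tokens"))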
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
        for batch in generator_tqdm:
            batch = util.move_to_device(batch, cuda_device)
            model(**batch)
            metrics = model.get_metrics()
            if (not _warned_tqdm_ignores_underscores
                    and any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value)
                                     for name, value in metrics.items()
                                     if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        return model.get_metrics(reset=True)
def _validation_loss(self) -> Tuple[float, int]:
    """
    Computes the validation loss. Returns it and the number of batches.
    """
    logger.info("Validating")

    self._model.eval()

    val_generator = self._iterator(self._validation_data,
                                   num_epochs=1,
                                   cuda_device=self._iterator_device,
                                   for_training=False)
    num_validation_batches = self._iterator.get_num_batches(self._validation_data)
    val_generator_tqdm = Tqdm.tqdm(val_generator, total=num_validation_batches)
    batches_this_epoch = 0
    val_loss = 0
    for batch in val_generator_tqdm:
        batches_this_epoch += 1
        loss = self._batch_loss(batch, for_training=False)
        val_loss += loss.data.cpu().numpy()

        # Update the description with the latest metrics
        val_metrics = self._get_metrics(val_loss, batches_this_epoch)
        description = self._description_from_metrics(val_metrics)
        val_generator_tqdm.set_description(description, refresh=False)

    return val_loss, batches_this_epoch
def get_from_cache(url: str, cache_dir: str = None) -> str:
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
        cache_dir = DATASET_CACHE

    os.makedirs(cache_dir, exist_ok=True)

    # make HEAD request to check ETag
    response = requests.head(url, allow_redirects=True)
    if response.status_code != 200:
        raise IOError("HEAD request failed for url {}".format(url))

    # add ETag to filename if it exists
    etag = response.headers.get("ETag")
    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    if not os.path.exists(cache_path):
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with tempfile.NamedTemporaryFile() as temp_file:
            logger.info("%s not found in cache, downloading to %s", url, temp_file.name)

            # GET file object
            req = requests.get(url, stream=True)
            content_length = req.headers.get('Content-Length')
            total = int(content_length) if content_length is not None else None
            progress = Tqdm.tqdm(unit="B", total=total)
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    progress.update(len(chunk))
                    temp_file.write(chunk)
            progress.close()

            # we are copying the file before closing it, so flush to avoid truncation
            temp_file.flush()
            # shutil.copyfileobj() starts at the current position, so go to the start
            temp_file.seek(0)

            logger.info("copying %s to cache at %s", temp_file.name, cache_path)
            with open(cache_path, 'wb') as cache_file:
                shutil.copyfileobj(temp_file, cache_file)

            logger.info("creating metadata file for %s", cache_path)
            meta = {'url': url, 'etag': etag}
            meta_path = cache_path + '.json'
            with open(meta_path, 'w') as meta_file:
                json.dump(meta, meta_file)

            logger.info("removing temp file %s", temp_file.name)

    return cache_path
def http_get(url: str, temp_file: IO) -> None:
    req = requests.get(url, stream=True)
    content_length = req.headers.get('Content-Length')
    total = int(content_length) if content_length is not None else None
    progress = Tqdm.tqdm(unit="B", total=total)
    for chunk in req.iter_content(chunk_size=1024):
        if chunk:  # filter out keep-alive new chunks
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()
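# A hedged usage sketch: http_get streams into an already-open file object,
# and the caller handles the temp-file lifecycle (as get_from_cache does
# above). The URL is a hypothetical placeholder.
import tempfile

with tempfile.NamedTemporaryFile() as temp_file:
    http_get("https://example.com/datasets/model.tar.gz", temp_file)
    temp_file.flush()   # make sure all bytes hit disk before reading back
    temp_file.seek(0)   # rewind so the caller can copy the downloaded content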
def _validation_loss(self) -> Tuple[float, int]:
    """
    Computes the validation loss. Returns it and the number of batches.
    """
    logger.info("Validating")

    self.model.eval()

    # Replace parameter values with the shadow values from the moving averages.
    if self._moving_average is not None:
        self._moving_average.assign_average_value()

    if self._validation_iterator is not None:
        val_iterator = self._validation_iterator
    else:
        val_iterator = self.iterator

    num_gpus = len(self._cuda_devices)

    raw_val_generator = val_iterator(self._validation_data, num_epochs=1, shuffle=False)
    val_generator = lazy_groups_of(raw_val_generator, num_gpus)
    num_validation_batches = math.ceil(
        val_iterator.get_num_batches(self._validation_data) / num_gpus)
    val_generator_tqdm = Tqdm.tqdm(val_generator, total=num_validation_batches)
    batches_this_epoch = 0
    val_loss = 0
    for batch_group in val_generator_tqdm:
        loss = self.batch_loss(batch_group, for_training=False)
        if loss is not None:
            # You shouldn't necessarily have to compute a loss for validation, so we allow for
            # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
            # currently only used as the divisor for the loss function, so we can safely only
            # count those batches for which we actually have a loss.  If this variable ever
            # gets used for something else, we might need to change things around a bit.
            batches_this_epoch += 1
            val_loss += loss.detach().cpu().numpy()

        # Update the description with the latest metrics
        val_metrics = training_util.get_metrics(self.model, val_loss, batches_this_epoch)
        description = training_util.description_from_metrics(val_metrics)
        val_generator_tqdm.set_description(description, refresh=False)

    # Now restore the original parameter values.
    if self._moving_average is not None:
        self._moving_average.restore()

    return val_loss, batches_this_epoch
def tokenize_and_detect_answers(self, contexts, shuffle=True,
                                search_answer_within_supp_context=False):
    if shuffle:
        random.seed(0)
        random.shuffle(contexts)

    if self._n_processes == 1:
        for context in Tqdm.tqdm(contexts, ncols=80):
            self.preprocess_context(context, search_answer_within_supp_context)
    else:
        # multi process (creates chunks of 200 contexts each)
        preprocessed_instances = []
        with Pool(self._n_processes) as pool:
            chunks = split(contexts, self._n_processes)
            chunks = flatten_iterable(group(c, 200) for c in chunks)
            pbar = Tqdm.tqdm(total=len(chunks), ncols=80, smoothing=0.0)
            for preproc_inst in pool.imap_unordered(
                    self._preprocess_t,
                    [[c, search_answer_within_supp_context] for c in chunks]):
                preprocessed_instances += preproc_inst
                pbar.update(1)
            pbar.close()
        contexts = preprocessed_instances
    return contexts
def __iter__(self) -> Iterator[Instance]:
    instance_iterator: Iterator[Instance] = self.reader.read(self.file_path)
    worker_info = data.get_worker_info()
    if worker_info is None or worker_info.id == 0:
        # Wrap with Tqdm progress bar if this is the main or only worker.
        instance_iterator = Tqdm.tqdm(instance_iterator, desc="reading instances")
    for instance in instance_iterator:
        self.reader.apply_token_indexers(instance)
        if self.vocab is not None:
            instance.index_fields(self.vocab)
        yield instance
def run(args):
    print('\nArguments:')
    for k, v in vars(args).items():
        print('{}: {}'.format(k, v))
    print()

    device = args.device
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    print('Loading archive ...')
    archive = load_archive(args.model_path)
    # predictor = Predictor.from_archive(archive, 'protein_predictor')
    config = archive.config.duplicate()
    dataset_reader = DatasetReader.from_params(config["dataset_reader"])
    model = archive.model.to(device).eval()

    print('Loading data ...')
    dataset_reader.lazy = False
    dataset = dataset_reader.read(args.input_path)
    iterator = BasicIterator(args.batch_size)
    iterator.index_with(model.vocab)
    num_batches = iterator.get_num_batches(dataset)
    data_generator = iterator(dataset, num_epochs=1, shuffle=False)

    print('Predicting ...')
    output_dict = {}
    with torch.no_grad():
        for batch in Tqdm.tqdm(data_generator, total=num_batches):
            batch = move_to_device(batch, model._get_prediction_device())
            outputs = model(**batch)
            predictions = outputs['predictions'].cpu().numpy()
            for pid, length, pred in zip(outputs['protein_id'], outputs['length'], predictions):
                if model.target == 'dcalpha':
                    dcalpha = pred[:length, :length]
                    dcalpha = np.triu(dcalpha, 1) + np.tril(dcalpha.transpose(), -1)
                    output_dict[pid] = {'dcalpha': dcalpha}
                elif model.target == 'angles':
                    psi, phi = pred[:length, 0], pred[:length, 1]
                    # psi[0] = 0.
                    # phi[-1] = 0.
                    output_dict[pid] = {'psi': psi, 'phi': phi}
                else:
                    coords = pred[:length]
                    output_dict[pid] = {'coords': coords}

    print('Writing to {}'.format(args.output_path))
    with open(args.output_path, 'wb') as fout:
        pickle.dump(output_dict, fout)
    print('All done.')
def validate(self, trainer: 'CallbackTrainer'):
    # If the trainer has MovingAverage objects, use their weights for validation.
    for moving_average in self.moving_averages:
        moving_average.assign_average_value()

    with torch.no_grad():
        # We have a validation set, so compute all the metrics on it.
        logger.info("Validating")
        trainer.model.eval()

        num_gpus = len(trainer._cuda_devices)  # pylint: disable=protected-access

        raw_val_generator = self.iterator(self.instances, num_epochs=1, shuffle=False)
        val_generator = lazy_groups_of(raw_val_generator, num_gpus)
        num_validation_batches = math.ceil(
            self.iterator.get_num_batches(self.instances) / num_gpus)
        val_generator_tqdm = Tqdm.tqdm(val_generator, total=num_validation_batches)

        batches_this_epoch = 0
        val_loss = 0
        for batch_group in val_generator_tqdm:
            loss = trainer.batch_loss(batch_group, for_training=False)
            if loss is not None:
                # You shouldn't necessarily have to compute a loss for validation, so we allow for
                # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                # currently only used as the divisor for the loss function, so we can safely only
                # count those batches for which we actually have a loss.  If this variable ever
                # gets used for something else, we might need to change things around a bit.
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = training_util.get_metrics(trainer.model, val_loss, batches_this_epoch)
            description = training_util.description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        trainer.val_metrics = training_util.get_metrics(trainer.model,
                                                        val_loss,
                                                        batches_this_epoch,
                                                        reset=True)

    # If the trainer has a moving average, restore the original weights.
    for moving_average in self.moving_averages:
        moving_average.restore()
@classmethod
def from_instances(
        cls,
        instances: Iterable['adi.Instance'],
        min_count: Dict[str, int] = None,
        max_vocab_size: Union[int, Dict[str, int]] = None,
        non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
        pretrained_files: Optional[Dict[str, str]] = None,
        only_include_pretrained_words: bool = False,
        tokens_to_add: Dict[str, List[str]] = None,
        min_pretrained_embeddings: Dict[str, int] = None,
        instances_aux: Optional[Iterable['adi.Instance']] = None
) -> 'Vocabulary':
    """
    Constructs a vocabulary given a collection of `Instances` and some parameters.
    We count all of the vocabulary items in the instances, then pass those counts
    and the other parameters to :func:`__init__`.  See that method for a
    description of what the other parameters do.
    """
    logger.info("Fitting token dictionary from dataset.")
    namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(
        lambda: defaultdict(int))
    for instance in Tqdm.tqdm(instances):
        instance.count_vocab_items(namespace_token_counts)

    if instances_aux is not None:
        logger.info("Fitting token dictionary from auxiliary dataset.")
        for instance in Tqdm.tqdm(instances_aux):
            instance.count_vocab_items(namespace_token_counts)

    return VocabularyMultitask(
        counter=namespace_token_counts,
        min_count=min_count,
        max_vocab_size=max_vocab_size,
        non_padded_namespaces=non_padded_namespaces,
        pretrained_files=pretrained_files,
        only_include_pretrained_words=only_include_pretrained_words,
        tokens_to_add=tokens_to_add,
        min_pretrained_embeddings=min_pretrained_embeddings)
def test_reset_tqdm_logger_handlers(self):
    serialization_dir_a = os.path.join(self.TEST_DIR, "test_a")
    os.makedirs(serialization_dir_a, exist_ok=True)
    prepare_global_logging(serialization_dir_a)
    serialization_dir_b = os.path.join(self.TEST_DIR, "test_b")
    os.makedirs(serialization_dir_b, exist_ok=True)
    prepare_global_logging(serialization_dir_b)
    # Use range(1) to make sure there should be only 2 lines in the file (0% and 100%)
    for _ in Tqdm.tqdm(range(1)):
        pass
    with open(os.path.join(serialization_dir_a, "out.log"), "r") as f:
        assert len(f.readlines()) == 0
    with open(os.path.join(serialization_dir_b, "out.log"), "r") as f:
        assert len(f.readlines()) == 2
def embed_file(self,
               input_file: IO,
               output_file_path: str,
               output_format: str = "all",
               batch_size: int = DEFAULT_BATCH_SIZE) -> None:
    """
    Computes ELMo embeddings from an input_file where each line contains a sentence
    tokenized by whitespace. The ELMo embeddings are written out in HDF5 format,
    where each sentence is saved in a dataset.

    Parameters
    ----------
    input_file : ``IO``, required
        A file with one tokenized sentence per line.
    output_file_path : ``str``, required
        A path to the output hdf5 file.
    output_format : ``str``, optional, (default = "all")
        The embeddings to output.  Must be one of "all", "top", or "average".
    batch_size : ``int``, optional, (default = 64)
        The number of sentences to process in ELMo at one time.
    """
    assert output_format in ["all", "top", "average"]

    # Tokenizes the sentences.
    sentences = [line.strip() for line in input_file if line.strip()]
    split_sentences = [sentence.split() for sentence in sentences]
    # Uses the sentence as the key.
    embedded_sentences = zip(sentences, self.embed_sentences(split_sentences, batch_size))

    logger.info("Processing sentences.")
    with h5py.File(output_file_path, 'w') as fout:
        for key, embeddings in Tqdm.tqdm(embedded_sentences):
            if key in fout.keys():
                logger.warning(f"Key already exists in {output_file_path}, skipping: {key}")
            else:
                if output_format == "all":
                    output = embeddings
                elif output_format == "top":
                    output = embeddings[2]
                elif output_format == "average":
                    output = numpy.average(embeddings, axis=0)
                fout.create_dataset(key, output.shape, dtype='float32', data=output)

    input_file.close()
def predict_instances_to_file(self, instances, path, batch_size=128):
    all_outputs = []
    for i in Tqdm.tqdm(range(math.ceil(len(instances) / batch_size))):
        batch_instances = instances[i * batch_size:(i + 1) * batch_size]
        model_outputs = self.predict_batch_instance(batch_instances)
        for j, instance in enumerate(batch_instances):
            outputs = self._decode_by_output(model_outputs[j])
            outputs['text'] = ''.join(map(str, instance.fields['text'].tokens))
            all_outputs.append(outputs)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(all_outputs, f, indent=2, ensure_ascii=False)
def iter_instances(self) -> Iterator[Instance]:
    if self._instances:
        yield from self._instances
    else:
        if self.max_instances_in_memory is None:
            self._instances = []

        if self.num_workers <= 0:
            # Just read all instances in main process.
            for instance in Tqdm.tqdm(self.reader.read(self.data_path),
                                      desc="loading instances"):
                self.reader.apply_token_indexers(instance)
                if self.max_instances_in_memory is None:
                    self._instances.append(instance)  # type: ignore
                if self._vocab is not None:
                    instance.index_fields(self._vocab)
                yield instance
        else:
            ctx = mp.get_context(self.start_method)
            queue: mp.JoinableQueue = (
                ctx.JoinableQueue() if self._max_instance_queue_size is None
                else ctx.JoinableQueue(maxsize=self._max_instance_queue_size))
            workers = self._start_instance_workers(queue, ctx)

            try:
                for instance in Tqdm.tqdm(self._gather_instances(queue),
                                          desc="loading instances"):
                    if self.max_instances_in_memory is None:
                        self._instances.append(instance)  # type: ignore
                    yield instance
            finally:
                if hasattr(queue, "close"):  # for compat with different Python versions.
                    queue.close()  # type: ignore[attr-defined]
                self._join_workers(workers, queue)
def make_hdf5_file(self, sentences: List[str], out_fn: str) -> None:
    """
    Given a list of sentences, tokenize each one and vectorize the tokens.
    Write the embeddings to out_fn in the HDF5 file format. The index in the
    data corresponds to the sentence index.
    """
    sentence_index = 0

    with h5py.File(out_fn, 'w') as fout:
        for sentence in Tqdm.tqdm(sentences):
            embeddings = self.vectorize(sentence)
            fout.create_dataset(str(sentence_index),
                                embeddings.shape,
                                dtype='float32',
                                data=embeddings)
            sentence_index += 1
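# A short sketch (assumed, not from the snippet above) of reading back the
# file produced by make_hdf5_file: one float32 dataset per sentence, keyed
# by the stringified sentence index. The filename is a placeholder.
import h5py

with h5py.File("embeddings.hdf5", "r") as fin:
    embeddings_for_first_sentence = fin["0"][...]  # numpy array for sentence 0
    print(embeddings_for_first_sentence.shape)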
def dataset_path_iterator(file_path: str) -> Iterator[str]:
    """
    An iterator returning file_paths in a directory
    containing CONLL-formatted files.
    """
    logger.info("Reading CONLL sentences from dataset files at: %s", file_path)
    for root, _, files in Tqdm.tqdm(list(os.walk(file_path))):
        for data_file in files:
            # These are a relic of the dataset pre-processing. Every
            # file will be duplicated - one file called filename.gold_skel
            # and one generated from the preprocessing called filename.gold_conll.
            if not data_file.endswith("gold_conll"):
                continue
            yield os.path.join(root, data_file)
def _read(self, file_path):
    with open(file_path, 'r', encoding="utf8") as data_file:
        lines = data_file.readlines()
        self.total_instances = len(lines)
        if self._tqdm:
            lines = Tqdm.tqdm(lines)
        for line_num, line in enumerate(lines):
            line = line.strip("\n")
            if not line:
                continue
            tokenized_sentence, sentence_len, sentiment = \
                self._tokenizer.tokenize(line)
            if sentence_len > self._max_len or sentence_len < self._min_len:
                continue
            yield self.text_to_instance(tokenized_sentence, sentiment)
def _read(self, file_path):
    # We need to prepare word frequency stats before dealing with our corpus,
    # in order to subsample frequent words.
    if self._word_sample_prob is None:
        logger.info('Building word frequency stats...')
        self._word_sample_prob = {}
        total = 0
        with open(cached_path(file_path), 'r') as f:
            for line in Tqdm.tqdm(f.readlines()):
                tokens = line.strip().split()
                for token in tokens:
                    if token in self._word_sample_prob:
                        self._word_sample_prob[token] += 1
                    else:
                        self._word_sample_prob[token] = 1
                    total += 1
        for k, v in self._word_sample_prob.items():
            # convert count into frequency
            self._word_sample_prob[k] = v / total
            # Word downsampling to prevent frequent words from being shown so much.
            # This stores the word2vec-style *keep* probability min(1, sqrt(t / f(w))),
            # which shrinks as a word's frequency grows.
            self._word_sample_prob[k] = min(
                1.0, np.sqrt(self._subsampling_threshold / self._word_sample_prob[k]))

    logger.info(f'Reading instances from lines in file at {file_path}')
    with open(cached_path(file_path), 'r') as f:
        for line in Tqdm.tqdm(f.readlines()):
            tokens = line.strip().split()
            for i in range(len(tokens)):
                # Keep this occurrence with its subsampling probability.
                if np.random.binomial(1, self._word_sample_prob[tokens[i]]):
                    start = max(0, i - self._window_size)
                    end = min(len(tokens) - 1, i + self._window_size)
                    source = Token(tokens[i])
                    targets = [Token(tokens[j]) for j in range(start, end + 1) if i != j]
                    yield self.text_to_instance(source, targets)
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    # file_path = cached_path(file_path)
    for filename in os.listdir(file_path):
        filename_splitted = filename.split('_')
        task_name = filename_splitted[-3]
        domain_name = filename_splitted[-2]
        if task_name not in self._tasks or domain_name not in self._domains:
            continue
        with open(os.path.join(file_path, filename), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", filename)
            for line in Tqdm.tqdm(data_file):
                line = line.strip("\n")
                # skip blank lines
                if not line:
                    continue
                tokens_and_tags = [pair.rsplit(self._word_tag_delimiter, 1)
                                   for pair in line.split(self._token_delimiter)]
                tokens = [Token(token) for token, tag in tokens_and_tags]
                tags = [tag for token, tag in tokens_and_tags]

                sequence = TextField(tokens, self._token_indexers)
                sequence_tags = SequenceLabelField(tags, sequence,
                                                   label_namespace=task_name + '_labels')
                task_field = LabelField(task_name, label_namespace="task_labels")
                domain_field = LabelField(domain_name, label_namespace="domain_labels")
                input_dict = {'task_token': task_field,
                              'domain_token': domain_field,
                              'tokens': sequence}

                all_tags = []
                empty_tags = ['O'] * len(tags)
                for tsk in self._tasks:
                    if tsk != task_name:
                        empty_sequence_tags = SequenceLabelField(
                            empty_tags, sequence, label_namespace=tsk + '_labels')
                        all_tags.append(empty_sequence_tags)
                    else:
                        all_tags.append(sequence_tags)
                input_dict['all_tags'] = ListField(all_tags)

                yield Instance(input_dict)
def get_from_cache(url: str, cache_dir: str = None) -> str:
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
        cache_dir = DATASET_CACHE

    os.makedirs(cache_dir, exist_ok=True)

    # make HEAD request to check ETag
    response = requests.head(url)
    if response.status_code != 200:
        raise IOError("HEAD request failed for url {}".format(url))

    # add ETag to filename if it exists
    etag = response.headers.get("ETag")
    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    if not os.path.exists(cache_path):
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        _, temp_filename = tempfile.mkstemp()
        logger.info("%s not found in cache, downloading to %s", url, temp_filename)

        # GET file object
        req = requests.get(url, stream=True)
        content_length = req.headers.get('Content-Length')
        total = int(content_length) if content_length is not None else None
        progress = Tqdm.tqdm(unit="B", total=total)
        with open(temp_filename, 'wb') as temp_file:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    progress.update(len(chunk))
                    temp_file.write(chunk)
        progress.close()

        logger.info("copying %s to cache at %s", temp_filename, cache_path)
        shutil.copyfile(temp_filename, cache_path)
        logger.info("removing temp file %s", temp_filename)
        os.remove(temp_filename)

    return cache_path
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        batch_count = 0
        loss_count = 0
        total_loss = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = util.move_to_device(batch, cuda_device)
            loss = model(**batch).get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                metrics["loss"] = loss.item()
                total_loss += loss.item()

            if (not _warned_tqdm_ignores_underscores
                    and any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value)
                                     for name, value in metrics.items()
                                     if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes "
                                   "produced a loss!")
            final_metrics["loss"] = total_loss / batch_count

        return final_metrics
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances,
                             num_epochs=1,
                             cuda_device=cuda_device,
                             for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value)
                                 for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description, refresh=False)

    return model.get_metrics(reset=True)
def _read_pretrained_tokens(embeddings_file_uri: str) -> List[str]:
    # Moving this import to the top breaks everything (cyclic import, I guess)
    from allennlp.modules.token_embedders.embedding import EmbeddingsTextFile

    logger.info('Reading pretrained tokens from: %s', embeddings_file_uri)
    tokens: List[str] = []
    with EmbeddingsTextFile(embeddings_file_uri) as embeddings_file:
        for line_number, line in enumerate(Tqdm.tqdm(embeddings_file), start=1):
            token_end = line.find(' ')
            if token_end >= 0:
                token = line[:token_end]
                tokens.append(token)
            else:
                line_begin = line[:20] + '...' if len(line) > 20 else line
                logger.warning('Skipping line number %d: %s', line_number, line_begin)
    return tokens
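# An illustration (assumed) of the text format _read_pretrained_tokens parses:
# a token, a space, then the vector components, one entry per line, e.g.
#
#   the 0.418 0.24968 -0.41242 ...
#   cat 0.23682 -0.16899 0.40951 ...
#
# Only the leading token before the first space is kept; lines containing no
# space are skipped with a warning.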
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances,
                             num_epochs=1,
                             cuda_device=cuda_device,
                             for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value)
                                 for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description, refresh=False)

    return model.get_metrics()
def main(serialization_directory, device):
    """
    serialization_directory : str, required.
        The directory containing the serialized weights.
    device: int, default = -1
        The device to run the evaluation on.
    """
    config = Params.from_file(os.path.join(serialization_directory, "config.json"))
    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    evaluation_data_path = config['validation_data_path']

    model = Model.load(config, serialization_dir=serialization_directory, cuda_device=device)

    prediction_file_path = os.path.join(serialization_directory, "predictions.txt")
    gold_file_path = os.path.join(serialization_directory, "gold.txt")
    prediction_file = open(prediction_file_path, "w+")
    gold_file = open(gold_file_path, "w+")

    # Load the evaluation data and index it.
    print("Reading evaluation data from {}".format(evaluation_data_path))
    instances = dataset_reader.read(evaluation_data_path)
    iterator = BasicIterator(batch_size=32)
    iterator.index_with(model.vocab)

    model_predictions = []
    batches = iterator(instances, num_epochs=1, shuffle=False,
                       cuda_device=device, for_training=False)
    for batch in Tqdm.tqdm(batches):
        result = model(**batch)
        predictions = model.decode(result)
        model_predictions.extend(predictions["tags"])

    for instance, prediction in zip(instances, model_predictions):
        fields = instance.fields
        try:
            # Most sentences have a verbal predicate, but not all.
            verb_index = fields["verb_indicator"].labels.index(1)
        except ValueError:
            verb_index = None
        gold_tags = fields["tags"].labels
        sentence = fields["tokens"].tokens
        write_to_conll_eval_file(prediction_file, gold_file,
                                 verb_index, sentence, prediction, gold_tags)
    prediction_file.close()
    gold_file.close()
def evaluate(self, model: Model):
    model.eval()
    val_generator = self.iterator(self.dataset, num_epochs=1, shuffle=False)
    num_validation_batches = self.iterator.get_num_batches(self.dataset)
    val_generator_tqdm = Tqdm.tqdm(val_generator, total=num_validation_batches)
    vocabulary = self.vocab.get_index_to_token_vocabulary('tokens')

    predictions, sources, references, alignments = [], [], [], []
    for data in self.reader.read_raw(self.data_path):
        sources.append(data['source'])
        references.append(data['target'])

    for batch in val_generator_tqdm:
        batch = util.move_to_device(batch, self.cuda_device)
        output_dict = model.predict(batch['src'], max_decoding_step=self.max_decoding_step)
        alignments += output_dict['alignments']
        for pred in output_dict['output_ids']:
            # Map predicted ids back to tokens and truncate at the end-of-sentence marker.
            pred_sent = list(map(vocabulary.get, pred))
            if '@@EOS@@' in pred_sent:
                pred_sent = pred_sent[:pred_sent.index('@@EOS@@')]
            pred_sent = ' '.join(pred_sent)
            predictions.append(pred_sent)

    # Replace UNK tokens in each prediction with the aligned source token.
    for i in range(len(predictions)):
        source_sent = sources[i].split(' ')
        pred_sent = predictions[i].split(' ')
        for j in range(len(pred_sent)):
            if pred_sent[j] == '@@UNKNOWN@@' and alignments[i][j] < len(source_sent):
                pred_sent[j] = source_sent[alignments[i][j]]
        predictions[i] = ' '.join(pred_sent)

    if self.post_map is not None:
        predictions = [self.post_processs(p, m) for p, m in zip(predictions, self.post_map)]
        references = [self.post_processs(r, m) for r, m in zip(references, self.post_map)]

    score = {}
    score['bleu'] = calc_bleu_score(predictions, references, self.log_dir)
    model.train()
    return score
@classmethod
def from_dataset_reader(
    cls,
    reader: DatasetReader,
    data_path: str,
    batch_size: int,
    shuffle: bool = False,
    batches_per_epoch: Optional[int] = None,
    quiet: bool = False,
) -> "SimpleDataLoader":
    instance_iter = reader.read(data_path)
    if not quiet:
        instance_iter = Tqdm.tqdm(instance_iter, desc="loading instances")
    instances = list(instance_iter)
    return cls(instances, batch_size, shuffle=shuffle, batches_per_epoch=batches_per_epoch)
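# A hedged usage sketch of from_dataset_reader; the reader, vocab, and path
# are placeholders, and the index_with call assumes the standard AllenNLP
# data-loader API.
loader = SimpleDataLoader.from_dataset_reader(reader, "path/to/train.json", batch_size=32)
loader.index_with(vocab)
for batch in loader:
    pass  # feed each tensor batch to the model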
@classmethod
def from_instances(cls,
                   instances: Iterable['adi.Instance'],
                   min_count: Dict[str, int] = None,
                   max_vocab_size: Union[int, Dict[str, int]] = None,
                   non_padded_namespaces: Sequence[str] = DEFAULT_NON_PADDED_NAMESPACES,
                   pretrained_files: Optional[Dict[str, str]] = None,
                   only_include_pretrained_words: bool = False,
                   unk_token_num: Dict[str, int] = None,
                   exclude_namespaces=None,
                   include_namespaces=None) -> 'ExVocabulary':
    """
    Constructs a vocabulary given a collection of `Instances` and some parameters.
    We count all of the vocabulary items in the instances, then pass those counts
    and the other parameters to :func:`__init__`.  See that method for a
    description of what the other parameters do.
    """
    logger.info("Fitting token dictionary from dataset.")
    namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(
        lambda: defaultdict(int))
    for instance in Tqdm.tqdm(instances):
        instance.count_vocab_items(namespace_token_counts)

    if exclude_namespaces is not None:
        for namespace in namespace_token_counts:
            if namespace in exclude_namespaces:
                namespace_token_counts[namespace] = dict()

    if include_namespaces is not None:
        # If include_namespaces is given, we only keep those namespaces.
        for namespace in namespace_token_counts:
            if namespace not in include_namespaces:
                namespace_token_counts[namespace] = dict()

    print("Start counting for namespaces:")
    for namespace, counter in namespace_token_counts.items():
        if len(counter) != 0:
            print(namespace)

    return ExVocabulary(counter=namespace_token_counts,
                        min_count=min_count,
                        max_vocab_size=max_vocab_size,
                        non_padded_namespaces=non_padded_namespaces,
                        pretrained_files=pretrained_files,
                        only_include_pretrained_words=only_include_pretrained_words,
                        unk_token_num=unk_token_num)
def _read(self, file_path): with open(file_path, "r") as data_file: logger.info("Reading instances from lines in file at: %s", file_path) for line_num, line in enumerate(Tqdm.tqdm(data_file)): line = line.strip("\n") if not line: continue line_parts = line.split('\t') if len(line_parts) != 2: raise ConfigurationError( "Invalid line format: %s (line number %d)" % (line, line_num + 1)) source_sequence, target_sequence = line_parts yield self.text_to_instance(source_sequence, target_sequence)
def variance_explained_by_pc(embedding_fn: str,
                             word2sent_indexer: Dict[str, List[Tuple[int, int]]],
                             variance_explained_fn: str,
                             pc_fn: str) -> None:
    """
    Each word in word2sent_indexer appears in multiple sentences. Thus each occurrence
    of the word will have a different embedding at each layer. How much of the variance
    in these occurrence embeddings can be explained by the first principal component?
    In other words, to what extent can these different occurrence embeddings be replaced
    by a single, static word embedding?

    Create a table of size (#words x #layers) and write the variance explained to
    variance_explained_fn. Write the first principal component for each word to
    pc_fn + str(layer_index), where each row starts with a word followed by
    space-separated numbers.
    """
    f = h5py.File(embedding_fn, 'r')
    num_layers = f["0"].shape[0]

    # Write statistics to a csv file: one row per word, one column per layer,
    # excluding the first layer, since we don't expect the input embeddings to be
    # the same at all for gpt2/bert, and we expect them to be identical for elmo.
    fieldnames = ['word'] + list(map(lambda w: 'layer_' + w, map(str, range(1, num_layers))))
    writer = csv.DictWriter(open(variance_explained_fn, 'w'), fieldnames=fieldnames)
    writer.writeheader()

    # files to write the principal components to
    pc_vector_files = {layer: open(pc_fn + str(layer), 'w') for layer in range(1, num_layers)}

    for word in Tqdm.tqdm(word2sent_indexer):
        variance_explained = {'word': word}

        # calculate variance explained by the first principal component
        for layer in range(1, num_layers):
            embeddings = [f[str(sent_index)][layer, word_index].tolist()
                          for sent_index, word_index in word2sent_indexer[word]
                          if f[str(sent_index)][layer, word_index].shape != ()]

            pca = PCA(n_components=1)
            pca.fit(embeddings)

            pca_svd = TruncatedSVD(n_components=100)
            pca_svd.fit(embeddings)

            variance_explained[f'layer_{layer}'] = min(
                1.0, round(pca.explained_variance_ratio_[0], 3))
            pc_vector_files[layer].write(
                ' '.join([word] + list(map(str, pca_svd.components_[0]))) + '\n')

        writer.writerow(variance_explained)
def _read(self, file_path: str):
    self._dataset_cache = None
    if self._dataset_dir_out is not None:
        self._dataset_cache = []
    instances = self._read_internal(file_path)
    if self._dataset_cache is not None:
        if not isinstance(instances, list):
            instances = [instance for instance in Tqdm.tqdm(instances)]
        if not os.path.exists(self._dataset_dir_out):
            os.mkdir(self._dataset_dir_out)
        output_file = os.path.join(self._dataset_dir_out, os.path.basename(file_path))
        logger.info(f"Saving contextualized dataset to {output_file}.")
        with open(output_file, 'w') as file:
            for d in self._dataset_cache:
                file.write(json.dumps(d))
                file.write("\n")
    return instances
def _read(self, file_path: str) -> Iterator[Instance]:
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as text_file:
        instance_strings = text_file.readlines()

    if self._lower:
        instance_strings = [string.lower() for string in instance_strings]

    if self._tokens_per_instance is not None:
        all_text = " ".join([x.replace("\n", " ").strip() for x in instance_strings])
        tokenized_text = self._tokenizer.tokenize(all_text)
        num_tokens = self._tokens_per_instance + 1
        tokenized_strings = []
        logger.info("Creating dataset from all text in file: %s", file_path)
        for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
            tokenized_strings.append(tokenized_text[index:(index + num_tokens)])
    else:
        tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

    for line in tokenized_strings:
        sentence = []
        ent_types = []
        for token in line:  # Type: allennlp.data.tokenizers.token.Token
            token = str(token)
            ent_type = token[0]
            if ent_type not in ['!', '*']:
                ent_type = '_'  # Indicates irrelevant non-tagged tokens.
            else:
                token = token[1:]
            sentence.append(token)
            ent_types.append(ent_type)
        if sentence != []:
            yield self.text_to_instance([Token(word) for word in sentence], ent_types)
def _validation_loss(self) -> Tuple[float, int]:
    """
    Computes the validation loss. Returns it and the number of batches.
    """
    logger.info("Validating")

    self._model.eval()

    val_generator = self._iterator(self._validation_data,
                                   num_epochs=1,
                                   cuda_device=self._iterator_device,
                                   for_training=False)
    num_validation_batches = self._iterator.get_num_batches(self._validation_data)
    val_generator_tqdm = Tqdm.tqdm(val_generator, total=num_validation_batches)
    batch_num = 0
    val_loss = 0
    for batch in val_generator_tqdm:
        batch_num += 1
        self._optimizer.zero_grad()
        loss = self._batch_loss(batch, for_training=False)
        val_loss += loss.data.cpu().numpy()

        # Update the description with the latest metrics
        val_metrics = self._get_metrics(val_loss, batch_num, reset=False)
        if self._model.new and batch_num > 1:
            logger.info(" ")
            self.f1s_valid.append(val_metrics['f1-measure-overall'])
            np.save('results/f1', self.f1s_valid)
            self._get_metrics(val_loss, batch_num, reset=True)
            self._batch_loss(batch, for_training=False)
            val_metrics = self._get_metrics(val_loss, batch_num, reset=False)
        description = self._description_from_metrics(val_metrics)
        val_generator_tqdm.set_description(description, refresh=False)

    self.f1s_valid.append(val_metrics['f1-measure-overall'])
    np.save('results/f1', self.f1s_valid)
    return val_loss, batch_num
def _validation_loss(self) -> Tuple[float, int]:
    logger.info("Validating")

    self.model.eval()

    # Replace parameter values with the shadow values from the moving averages.
    if self._moving_average is not None:
        self._moving_average.assign_average_value()

    if self._validation_iterator is not None:
        val_iterator = self._validation_iterator
    else:
        val_iterator = self.iterator

    num_gpus = len(self._cuda_devices)

    if getattr(self, "val_dataset", None) is None:
        self.val_dataset = DMDataSet(data=self._validation_data[0],
                                     batch_size=self.batch_size,
                                     num_gpus=num_gpus,
                                     shuffle=False)
    num_validation_batches = math.ceil(
        len(self.val_dataset) / self.batch_size / num_gpus)
    val_generator_tqdm = Tqdm.tqdm(self.val_dataset, total=num_validation_batches)

    batches_this_epoch = 0
    val_loss = 0
    for batch_group in val_generator_tqdm:
        loss = self.batch_loss(batch_group, for_training=False)
        if loss is not None:
            batches_this_epoch += 1
            val_loss += loss.detach().cpu().numpy()

        # Update the description with the latest metrics
        val_metrics = training_util.get_metrics(self.model, val_loss, batches_this_epoch)
        description = training_util.description_from_metrics(val_metrics)
        val_generator_tqdm.set_description(description, refresh=False)

    # Now restore the original parameter values.
    if self._moving_average is not None:
        self._moving_average.restore()

    return val_loss, batches_this_epoch
def reptile_outer_update(self, train_generators: List[Iterable], iteration: int,
                         num_gpus: int):
    # https://github.com/farbodtm/reptile-pytorch/blob/master/reptile.py
    weights_before = deepcopy(self.model.state_dict())
    self.optimizer.zero_grad()
    random.shuffle(train_generators)
    new_weights = []
    total_loss = 0.0

    # for batch in train_generators[0]:
    #     print('[info]batch is:{}'.format(batch))
    task_wrap = Tqdm.tqdm(zip(train_generators[0], train_generators[1], train_generators[2]),
                          total=1)
    # , train_generators[3], train_generators[4]), \
    for i, batch_group in enumerate(task_wrap):
        if not i:
            for k in range(self.meta_batch_size):  # tasks per batch
                total_loss += self.reptile_inner_update(batch_group[k][0])
                new_weights.append(deepcopy(self.model.state_dict()))
                # Reset to the pre-update weights before the next inner task.
                self.model.load_state_dict({name: weights_before[name]
                                            for name in weights_before})
        else:
            break

    # Average the task-adapted weights.
    weights_after = {name: new_weights[0][name] / float(self.meta_batch_size)
                     for name in new_weights[0]}
    for i in range(1, self.meta_batch_size):
        for name in new_weights[i]:
            weights_after[name] += new_weights[i][name] / float(self.meta_batch_size)

    # They used self.step_size of 1.0 in some of their outer updates.
    outerstepsize = self.meta_step_size * (1 - iteration / self.meta_batches)  # linear schedule
    self.model.load_state_dict({
        name: weights_before[name] +
        (weights_after[name] - weights_before[name]) * outerstepsize
        for name in weights_before
    })
    return total_loss / self.meta_batch_size
def predict(self, model: Model):
    model.eval()
    generator_tqdm = Tqdm.tqdm(self.dataloader, total=len(self.dataloader))
    model_outputs = {}
    for batch in generator_tqdm:
        with torch.no_grad():
            batch = util.move_to_device(batch, self.cuda_device)
            output_dict = model.back2table(**batch)
            for key in output_dict:
                if key not in model_outputs:
                    model_outputs[key] = output_dict[key]
                else:
                    model_outputs[key] += output_dict[key]
    predictions = self.corpus.predict(model_outputs, self.dataset)
    model.train()
    return predictions
def _validation_loss(self) -> Tuple[float, int]:
    """
    Computes the validation loss. Returns it and the number of batches.
    """
    logger.info("Validating")

    self._model.eval()

    if self._validation_iterator is not None:
        val_iterator = self._validation_iterator
    else:
        val_iterator = self._iterator

    val_generator = val_iterator(self._validation_data,
                                 num_epochs=1,
                                 cuda_device=self._iterator_device)
    num_validation_batches = val_iterator.get_num_batches(self._validation_data)
    val_generator_tqdm = Tqdm.tqdm(val_generator, total=num_validation_batches)
    batches_this_epoch = 0
    val_loss = 0
    for batch in val_generator_tqdm:
        loss = self._batch_loss(batch, for_training=False)
        if loss is not None:
            # You shouldn't necessarily have to compute a loss for validation, so we allow for
            # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
            # currently only used as the divisor for the loss function, so we can safely only
            # count those batches for which we actually have a loss.  If this variable ever
            # gets used for something else, we might need to change things around a bit.
            batches_this_epoch += 1
            val_loss += loss.detach().cpu().numpy()

        # Update the description with the latest metrics
        val_metrics = self._get_metrics(val_loss, batches_this_epoch)
        description = self._description_from_metrics(val_metrics)
        val_generator_tqdm.set_description(description, refresh=False)

    return val_loss, batches_this_epoch
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as text_file:
        instance_strings = text_file.readlines()

    if self._tokens_per_instance is not None:
        all_text = " ".join([x.replace("\n", " ").strip() for x in instance_strings])
        tokenized_text = self._tokenizer.tokenize(all_text)
        num_tokens = self._tokens_per_instance + 1
        tokenized_strings = []
        logger.info("Creating dataset from all text in file: %s", file_path)
        for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
            tokenized_strings.append(tokenized_text[index:(index + num_tokens)])
    else:
        tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

    for tokenized_string in tokenized_strings:
        input_field = TextField(tokenized_string[:-1], self._token_indexers)
        output_field = TextField(tokenized_string[1:], self._output_indexer)
        yield Instance({'input_tokens': input_field,
                        'output_tokens': output_field})
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}")
    for gpu, memory in gpu_memory_mb().items():
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self._model.train()

    # Get tqdm for the training batches
    train_generator = self._iterator(self._train_data,
                                     num_epochs=1,
                                     cuda_device=self._iterator_device)
    num_training_batches = self._iterator.get_num_batches(self._train_data)
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    if self._histogram_interval is not None:
        histogram_parameters = set(self._model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches)
    for batch in train_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self._log_histograms_this_batch = self._histogram_interval is not None and (
            batch_num_total % self._histogram_interval == 0)

        self._optimizer.zero_grad()

        loss = self._batch_loss(batch, for_training=True)
        loss.backward()

        train_loss += loss.item()

        batch_grad_norm = self._rescale_gradients()

        # This does nothing if batch_num_total is None or you are using an
        # LRScheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)

        if self._log_histograms_this_batch:
            # Get the magnitude of parameter updates for logging.  We need a copy
            # of current parameters to compute the magnitude of updates, and we
            # copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {name: param.detach().cpu().clone()
                             for name, param in self._model.named_parameters()}
            self._optimizer.step()
            for name, param in self._model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(param_updates[name].view(-1, ))
                param_norm = torch.norm(param.view(-1, )).cpu()
                self._tensorboard.add_train_scalar("gradient_update/" + name,
                                                   update_norm / (param_norm + 1e-7),
                                                   batch_num_total)
        else:
            self._optimizer.step()

        # Update the description with the latest metrics
        metrics = self._get_metrics(train_loss, batches_this_epoch)
        description = self._description_from_metrics(metrics)
        train_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard
        if batch_num_total % self._summary_interval == 0:
            self._parameter_and_gradient_statistics_to_tensorboard(batch_num_total,
                                                                   batch_grad_norm)
            self._tensorboard.add_train_scalar("loss/loss_train",
                                               metrics["loss"],
                                               batch_num_total)
            self._metrics_to_tensorboard(batch_num_total,
                                         {"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._log_histograms_this_batch:
            self._histograms_to_tensorboard(batch_num_total, histogram_parameters)

        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint(
                '{0}.{1}'.format(epoch, time_to_str(int(last_save_time))),
                [],
                is_best=False)

    return self._get_metrics(train_loss, batches_this_epoch, reset=True)
def embed_file(self,
               input_file: IO,
               output_file_path: str,
               output_format: str = "all",
               batch_size: int = DEFAULT_BATCH_SIZE,
               forget_sentences: bool = False,
               use_sentence_keys: bool = False) -> None:
    """
    Computes ELMo embeddings from an input_file where each line contains a sentence
    tokenized by whitespace. The ELMo embeddings are written out in HDF5 format,
    where each sentence embedding is saved in a dataset with the line number in
    the original file as the key.

    Parameters
    ----------
    input_file : ``IO``, required
        A file with one tokenized sentence per line.
    output_file_path : ``str``, required
        A path to the output hdf5 file.
    output_format : ``str``, optional, (default = "all")
        The embeddings to output.  Must be one of "all", "top", or "average".
    batch_size : ``int``, optional, (default = 64)
        The number of sentences to process in ELMo at one time.
    forget_sentences : ``bool``, optional, (default = False).
        If use_sentence_keys is False, whether or not to include a string
        serialized JSON dictionary that associates sentences with their
        line number (its HDF5 key). The mapping is placed in the
        "sentence_to_index" HDF5 key. This is useful if you want to use the
        embeddings without keeping the original file of sentences around.
    use_sentence_keys : ``bool``, optional, (default = False).
        Whether or not to use full sentences as keys. By default,
        the line numbers of the input file are used as ids, which is more robust.
    """
    assert output_format in ["all", "top", "average"]

    # Tokenizes the sentences.
    sentences = [line.strip() for line in input_file]

    blank_lines = [i for (i, line) in enumerate(sentences) if line == ""]
    if blank_lines:
        raise ConfigurationError(f"Your input file contains empty lines at indexes "
                                 f"{blank_lines}. Please remove them.")
    split_sentences = [sentence.split() for sentence in sentences]
    # Uses the sentence index as the key.

    if use_sentence_keys:
        logger.warning("Using sentences as keys can fail if sentences "
                       "contain forward slashes or colons. Use with caution.")
        embedded_sentences = zip(sentences, self.embed_sentences(split_sentences, batch_size))
    else:
        embedded_sentences = ((str(i), x) for i, x in
                              enumerate(self.embed_sentences(split_sentences, batch_size)))

    sentence_to_index = {}
    logger.info("Processing sentences.")
    with h5py.File(output_file_path, 'w') as fout:
        for key, embeddings in Tqdm.tqdm(embedded_sentences):
            if use_sentence_keys and key in fout.keys():
                raise ConfigurationError(f"Key already exists in {output_file_path}. "
                                         f"To encode duplicate sentences, do not pass "
                                         f"the --use-sentence-keys flag.")

            if not forget_sentences and not use_sentence_keys:
                sentence = sentences[int(key)]
                sentence_to_index[sentence] = key

            if output_format == "all":
                output = embeddings
            elif output_format == "top":
                output = embeddings[-1]
            elif output_format == "average":
                output = numpy.average(embeddings, axis=0)

            fout.create_dataset(str(key),
                                output.shape,
                                dtype='float32',
                                data=output)
        if not forget_sentences and not use_sentence_keys:
            sentence_index_dataset = fout.create_dataset(
                "sentence_to_index",
                (1,),
                dtype=h5py.special_dtype(vlen=str))
            sentence_index_dataset[0] = json.dumps(sentence_to_index)

    input_file.close()
def main(serialization_directory: str,
         device: int,
         data: str,
         prefix: str,
         domain: str = None):
    """
    serialization_directory : str, required.
        The directory containing the serialized weights.
    device: int, default = -1
        The device to run the evaluation on.
    data: str, default = None
        The data to evaluate on. By default, we use the validation data from
        the original experiment.
    prefix: str, default=""
        The prefix to prepend to the generated gold and prediction files, to
        distinguish different models/data.
    domain: str, optional (default = None)
        If passed, filters the ontonotes evaluation/test dataset to only contain
        the specified domain. This overwrites the domain in the config file from
        the model, to allow evaluation on domains other than the one the model
        was trained on.
    """
    config = Params.from_file(os.path.join(serialization_directory, "config.json"))

    if domain is not None:
        # Hack to allow evaluation on different domains than the
        # model was trained on.
        config["dataset_reader"]["domain_identifier"] = domain
        prefix = f"{domain}_{prefix}"
    else:
        config["dataset_reader"].pop("domain_identifier", None)

    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    evaluation_data_path = data if data else config['validation_data_path']

    archive = load_archive(os.path.join(serialization_directory, "model.tar.gz"),
                           cuda_device=device)
    model = archive.model
    model.eval()

    prediction_file_path = os.path.join(serialization_directory, prefix + "_predictions.txt")
    gold_file_path = os.path.join(serialization_directory, prefix + "_gold.txt")
    prediction_file = open(prediction_file_path, "w+")
    gold_file = open(gold_file_path, "w+")

    # Load the evaluation data and index it.
    print("reading evaluation data from {}".format(evaluation_data_path))
    instances = dataset_reader.read(evaluation_data_path)

    with torch.autograd.no_grad():
        iterator = BasicIterator(batch_size=32)
        iterator.index_with(model.vocab)

        model_predictions = []
        batches = iterator(instances, num_epochs=1, shuffle=False, cuda_device=device)
        for batch in Tqdm.tqdm(batches):
            result = model(**batch)
            predictions = model.decode(result)
            model_predictions.extend(predictions["tags"])

    for instance, prediction in zip(instances, model_predictions):
        fields = instance.fields
        try:
            # Most sentences have a verbal predicate, but not all.
            verb_index = fields["verb_indicator"].labels.index(1)
        except ValueError:
            verb_index = None
        gold_tags = fields["tags"].labels
        sentence = [x.text for x in fields["tokens"].tokens]
        write_to_conll_eval_file(prediction_file, gold_file,
                                 verb_index, sentence, prediction, gold_tags)
    prediction_file.close()
    gold_file.close()
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if (not _warned_tqdm_ignores_underscores
                    and any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value)
                                     for name, value in metrics.items()
                                     if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes "
                                   "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics