Code example #1
File: vocabulary.py  Project: ziaridoy20/allennlp
    def extend_from_instances(self,
                              params: Params,
                              instances: Iterable['adi.Instance'] = ()) -> None:
        """
        Extends an already generated vocabulary using a collection of instances.
        """
        min_count = params.pop("min_count", None)
        max_vocab_size = pop_max_vocab_size(params)
        non_padded_namespaces = params.pop("non_padded_namespaces", DEFAULT_NON_PADDED_NAMESPACES)
        pretrained_files = params.pop("pretrained_files", {})
        min_pretrained_embeddings = params.pop("min_pretrained_embeddings", None)
        only_include_pretrained_words = params.pop_bool("only_include_pretrained_words", False)
        tokens_to_add = params.pop("tokens_to_add", None)
        params.assert_empty("Vocabulary - from dataset")

        logger.info("Fitting token dictionary from dataset.")
        namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
        for instance in Tqdm.tqdm(instances):
            instance.count_vocab_items(namespace_token_counts)
        self._extend(counter=namespace_token_counts,
                     min_count=min_count,
                     max_vocab_size=max_vocab_size,
                     non_padded_namespaces=non_padded_namespaces,
                     pretrained_files=pretrained_files,
                     only_include_pretrained_words=only_include_pretrained_words,
                     tokens_to_add=tokens_to_add,
                     min_pretrained_embeddings=min_pretrained_embeddings)
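A hedged usage sketch for the method above, assuming an already built `vocab` and an iterable `new_instances` of AllenNLP Instances (both placeholder names); the `Params` keys mirror the `params.pop` calls in the method.

from allennlp.common import Params

extension_params = Params({"min_count": {"tokens": 2},
                           "non_padded_namespaces": []})
vocab.extend_from_instances(extension_params, instances=new_instances)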
Code example #2
File: vocabulary.py  Project: ziaridoy20/allennlp
    def from_instances(cls,
                       instances: Iterable['adi.Instance'],
                       min_count: Dict[str, int] = None,
                       max_vocab_size: Union[int, Dict[str, int]] = None,
                       non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
                       pretrained_files: Optional[Dict[str, str]] = None,
                       only_include_pretrained_words: bool = False,
                       tokens_to_add: Dict[str, List[str]] = None,
                       min_pretrained_embeddings: Dict[str, int] = None) -> 'Vocabulary':
        """
        Constructs a vocabulary given a collection of `Instances` and some parameters.
        We count all of the vocabulary items in the instances, then pass those counts
        and the other parameters, to :func:`__init__`.  See that method for a description
        of what the other parameters do.
        """
        logger.info("Fitting token dictionary from dataset.")
        namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
        for instance in Tqdm.tqdm(instances):
            instance.count_vocab_items(namespace_token_counts)

        return cls(counter=namespace_token_counts,
                   min_count=min_count,
                   max_vocab_size=max_vocab_size,
                   non_padded_namespaces=non_padded_namespaces,
                   pretrained_files=pretrained_files,
                   only_include_pretrained_words=only_include_pretrained_words,
                   tokens_to_add=tokens_to_add,
                   min_pretrained_embeddings=min_pretrained_embeddings)
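A minimal, hedged sketch of calling the class method above; `my_reader` and the file path are illustrative placeholders rather than names from the project.

# `my_reader` is any DatasetReader whose read() yields Instance objects (placeholder).
instances = list(my_reader.read("train.txt"))
vocab = Vocabulary.from_instances(
    instances,
    min_count={"tokens": 3},           # keep tokens seen at least 3 times
    max_vocab_size={"tokens": 50000},  # cap the "tokens" namespace
)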
Code example #3
File: evaluate.py  Project: ziaridoy20/allennlp
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances,
                                 num_epochs=1,
                                 shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
        for batch in generator_tqdm:
            batch = util.move_to_device(batch, cuda_device)
            model(**batch)
            metrics = model.get_metrics()
            if (not _warned_tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        return model.get_metrics(reset=True)
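A hedged sketch of driving this `evaluate` helper; `model` and `instances` are assumed to exist already, and the batch size is illustrative.

iterator = BasicIterator(batch_size=32)
iterator.index_with(model.vocab)   # the iterator needs the vocabulary to turn tokens into ids
metrics = evaluate(model, instances, iterator, cuda_device=-1)  # -1 runs on CPU
print(metrics)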
Code example #4
File: trainer.py  Project: Jordan-Sauchuk/allennlp
    def _validation_loss(self) -> Tuple[float, int]:
        """
        Computes the validation loss. Returns it and the number of batches.
        """
        logger.info("Validating")

        self._model.eval()

        val_generator = self._iterator(self._validation_data,
                                       num_epochs=1,
                                       cuda_device=self._iterator_device,
                                       for_training=False)
        num_validation_batches = self._iterator.get_num_batches(self._validation_data)
        val_generator_tqdm = Tqdm.tqdm(val_generator,
                                       total=num_validation_batches)
        batches_this_epoch = 0
        val_loss = 0
        for batch in val_generator_tqdm:
            batches_this_epoch += 1

            loss = self._batch_loss(batch, for_training=False)
            val_loss += loss.data.cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = self._get_metrics(val_loss, batches_this_epoch)
            description = self._description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        return val_loss, batches_this_epoch
Code example #5
File: file_utils.py  Project: pyknife/allennlp
def get_from_cache(url: str, cache_dir: str = None) -> str:
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
        cache_dir = DATASET_CACHE

    os.makedirs(cache_dir, exist_ok=True)

    # make HEAD request to check ETag
    response = requests.head(url, allow_redirects=True)
    if response.status_code != 200:
        raise IOError("HEAD request failed for url {}".format(url))

    # add ETag to filename if it exists
    etag = response.headers.get("ETag")
    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    if not os.path.exists(cache_path):
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with tempfile.NamedTemporaryFile() as temp_file:
            logger.info("%s not found in cache, downloading to %s", url, temp_file.name)

            # GET file object
            req = requests.get(url, stream=True)
            content_length = req.headers.get('Content-Length')
            total = int(content_length) if content_length is not None else None
            progress = Tqdm.tqdm(unit="B", total=total)
            for chunk in req.iter_content(chunk_size=1024):
                if chunk: # filter out keep-alive new chunks
                    progress.update(len(chunk))
                    temp_file.write(chunk)
            progress.close()

            # we are copying the file before closing it, so flush to avoid truncation
            temp_file.flush()
            # shutil.copyfileobj() starts at the current position, so go to the start
            temp_file.seek(0)

            logger.info("copying %s to cache at %s", temp_file.name, cache_path)
            with open(cache_path, 'wb') as cache_file:
                shutil.copyfileobj(temp_file, cache_file)

            logger.info("creating metadata file for %s", cache_path)
            meta = {'url': url, 'etag': etag}
            meta_path = cache_path + '.json'
            with open(meta_path, 'w') as meta_file:
                json.dump(meta, meta_file)

            logger.info("removing temp file %s", temp_file.name)

    return cache_path
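A small, hedged sketch of calling the helper above; the URL is purely illustrative.

# The first call downloads into the cache; later calls return the cached path directly.
path = get_from_cache("https://example.com/data/train.jsonl")
with open(path, "rb") as f:
    head = f.read(64)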
Code example #6
File: file_utils.py  Project: apmoore1/allennlp
def http_get(url: str, temp_file: IO) -> None:
    req = requests.get(url, stream=True)
    content_length = req.headers.get('Content-Length')
    total = int(content_length) if content_length is not None else None
    progress = Tqdm.tqdm(unit="B", total=total)
    for chunk in req.iter_content(chunk_size=1024):
        if chunk: # filter out keep-alive new chunks
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()
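A hedged sketch of pairing `http_get` with a temporary file, which is what the `temp_file: IO` parameter suggests; the URL and output name are placeholders.

import shutil
import tempfile

with tempfile.NamedTemporaryFile() as temp_file:
    http_get("https://example.com/model.tar.gz", temp_file)
    temp_file.flush()    # make sure everything is on disk
    temp_file.seek(0)    # copyfileobj reads from the current position
    with open("model.tar.gz", "wb") as out_file:
        shutil.copyfileobj(temp_file, out_file)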
Code example #7
File: trainer.py  Project: hanseungwook/allennlp
    def _validation_loss(self) -> Tuple[float, int]:
        """
        Computes the validation loss. Returns it and the number of batches.
        """
        logger.info("Validating")

        self.model.eval()

        # Replace parameter values with the shadow values from the moving averages.
        if self._moving_average is not None:
            self._moving_average.assign_average_value()

        if self._validation_iterator is not None:
            val_iterator = self._validation_iterator
        else:
            val_iterator = self.iterator

        num_gpus = len(self._cuda_devices)

        raw_val_generator = val_iterator(self._validation_data,
                                         num_epochs=1,
                                         shuffle=False)
        val_generator = lazy_groups_of(raw_val_generator, num_gpus)
        num_validation_batches = math.ceil(
            val_iterator.get_num_batches(self._validation_data) / num_gpus)
        val_generator_tqdm = Tqdm.tqdm(val_generator,
                                       total=num_validation_batches)
        batches_this_epoch = 0
        val_loss = 0
        for batch_group in val_generator_tqdm:

            loss = self.batch_loss(batch_group, for_training=False)
            if loss is not None:
                # You shouldn't necessarily have to compute a loss for validation, so we allow for
                # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                # currently only used as the divisor for the loss function, so we can safely only
                # count those batches for which we actually have a loss.  If this variable ever
                # gets used for something else, we might need to change things around a bit.
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = training_util.get_metrics(self.model, val_loss,
                                                    batches_this_epoch)
            description = training_util.description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        # Now restore the original parameter values.
        if self._moving_average is not None:
            self._moving_average.restore()

        return val_loss, batches_this_epoch
Code example #8
File: preprocess.py  Project: lixinsu/MultiQA
    def tokenize_and_detect_answers(self, contexts, shuffle=True, search_answer_within_supp_context=False):
        if shuffle:
            random.seed(0)
            random.shuffle(contexts)

        if self._n_processes == 1:
            for context in Tqdm.tqdm(contexts, ncols=80):
                self.preprocess_context(context, search_answer_within_supp_context)
        else:
            # multi process (creates chunks of 200 contexts each )
            preprocessed_instances = []
            with Pool(self._n_processes) as pool:
                chunks = split(contexts, self._n_processes)
                chunks = flatten_iterable(group(c, 200) for c in chunks)
                pbar = Tqdm.tqdm(total=len(chunks), ncols=80, smoothing=0.0)
                for preproc_inst in pool.imap_unordered(self._preprocess_t, [[c, search_answer_within_supp_context] for c in chunks]):
                    preprocessed_instances += preproc_inst
                    pbar.update(1)
                pbar.close()
            contexts = preprocessed_instances

        return contexts
Code example #9
 def __iter__(self) -> Iterator[Instance]:
     instance_iterator: Iterator[Instance] = self.reader.read(
         self.file_path)
     worker_info = data.get_worker_info()
     if worker_info is None or worker_info.id == 0:
         # Wrap with Tqdm progress bar if this is the main or only worker.
         instance_iterator = Tqdm.tqdm(instance_iterator,
                                       desc="reading instances")
     for instance in instance_iterator:
         self.reader.apply_token_indexers(instance)
         if self.vocab is not None:
             instance.index_fields(self.vocab)
         yield instance
Code example #10
File: predictor.py  Project: neoTCR/cu-tsp
def run(args):
    print('\nArguments:')
    for k, v in vars(args).items():
        print('{}: {}'.format(k, v))
    print()

    device = args.device
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    print('Loading archive ...')
    archive = load_archive(args.model_path)
    # predictor = Predictor.from_archive(archive, 'protein_predictor')
    config = archive.config.duplicate()
    dataset_reader = DatasetReader.from_params(config["dataset_reader"])
    model = archive.model.to(device).eval()

    print('Loading data ...')
    dataset_reader.lazy = False
    dataset = dataset_reader.read(args.input_path)
    iterator = BasicIterator(args.batch_size)
    iterator.index_with(model.vocab)
    num_batches = iterator.get_num_batches(dataset)
    data_generator = iterator(dataset, num_epochs=1, shuffle=False)

    print('Predicting ...')
    output_dict = {}
    with torch.no_grad():
        for batch in Tqdm.tqdm(data_generator, total=num_batches):
            batch = move_to_device(batch, model._get_prediction_device())
            outputs = model(**batch)
            predictions = outputs['predictions'].cpu().numpy()
            for pid, length, pred in zip(outputs['protein_id'], outputs['length'], predictions):
                if model.target == 'dcalpha':
                    dcalpha = pred[:length, :length]
                    dcalpha = np.triu(dcalpha, 1) + np.tril(dcalpha.transpose(), -1)
                    output_dict[pid] = {'dcalpha': dcalpha}
                elif model.target == 'angles':
                    psi, phi = pred[:length, 0], pred[:length, 1]
                    # psi[0] = 0.
                    # phi[-1] = 0.
                    output_dict[pid] = {'psi': psi, 'phi': phi}
                else:
                    coords = pred[:length]
                    output_dict[pid] = {'coords': coords}

    print('Writing to {}'.format(args.output_path))
    with open(args.output_path, 'wb') as fout:
        pickle.dump(output_dict, fout)

    print('All done.')
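Since the script above pickles `output_dict`, a hedged sketch of loading the predictions back; the file name stands in for `args.output_path`.

import pickle

with open("predictions.pkl", "rb") as fin:
    output_dict = pickle.load(fin)
for pid, prediction in output_dict.items():
    # depending on model.target, the keys are 'dcalpha', 'psi'/'phi', or 'coords'
    print(pid, list(prediction.keys()))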
Code example #11
    def validate(self, trainer: 'CallbackTrainer'):
        # If the trainer has MovingAverage objects, use their weights for validation.
        for moving_average in self.moving_averages:
            moving_average.assign_average_value()

        with torch.no_grad():
            # We have a validation set, so compute all the metrics on it.
            logger.info("Validating")

            trainer.model.eval()

            num_gpus = len(trainer._cuda_devices)  # pylint: disable=protected-access

            raw_val_generator = self.iterator(self.instances,
                                              num_epochs=1,
                                              shuffle=False)
            val_generator = lazy_groups_of(raw_val_generator, num_gpus)
            num_validation_batches = math.ceil(
                self.iterator.get_num_batches(self.instances) / num_gpus)
            val_generator_tqdm = Tqdm.tqdm(val_generator,
                                           total=num_validation_batches)

            batches_this_epoch = 0
            val_loss = 0
            for batch_group in val_generator_tqdm:

                loss = trainer.batch_loss(batch_group, for_training=False)
                if loss is not None:
                    # You shouldn't necessarily have to compute a loss for validation, so we allow for
                    # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                    # currently only used as the divisor for the loss function, so we can safely only
                    # count those batches for which we actually have a loss.  If this variable ever
                    # gets used for something else, we might need to change things around a bit.
                    batches_this_epoch += 1
                    val_loss += loss.detach().cpu().numpy()

                # Update the description with the latest metrics
                val_metrics = training_util.get_metrics(
                    trainer.model, val_loss, batches_this_epoch)
                description = training_util.description_from_metrics(
                    val_metrics)
                val_generator_tqdm.set_description(description, refresh=False)

            trainer.val_metrics = training_util.get_metrics(trainer.model,
                                                            val_loss,
                                                            batches_this_epoch,
                                                            reset=True)

        # If the trainer has a moving average, restore
        for moving_average in self.moving_averages:
            moving_average.restore()
Code example #12
File: vocabulary_multitask.py  Project: hashc/scicite
    def from_instances(
        cls,
        instances: Iterable['adi.Instance'],
        min_count: Dict[str, int] = None,
        max_vocab_size: Union[int, Dict[str, int]] = None,
        non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
        pretrained_files: Optional[Dict[str, str]] = None,
        only_include_pretrained_words: bool = False,
        tokens_to_add: Dict[str, List[str]] = None,
        min_pretrained_embeddings: Dict[str, int] = None,
        instances_aux: Optional[Iterable['adi.Instance']] = None
    ) -> 'Vocabulary':
        """
        Constructs a vocabulary given a collection of `Instances` and some parameters.
        We count all of the vocabulary items in the instances, then pass those counts
        and the other parameters, to :func:`__init__`.  See that method for a description
        of what the other parameters do.
        """
        logger.info("Fitting token dictionary from dataset.")
        namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(
            lambda: defaultdict(int))
        for instance in Tqdm.tqdm(instances):
            instance.count_vocab_items(namespace_token_counts)

        if instances_aux is not None:
            logger.info("Fitting token dictionary from auxillary dataset.")
            for instance in Tqdm.tqdm(instances_aux):
                instance.count_vocab_items(namespace_token_counts)

        return VocabularyMultitask(
            counter=namespace_token_counts,
            min_count=min_count,
            max_vocab_size=max_vocab_size,
            non_padded_namespaces=non_padded_namespaces,
            pretrained_files=pretrained_files,
            only_include_pretrained_words=only_include_pretrained_words,
            tokens_to_add=tokens_to_add,
            min_pretrained_embeddings=min_pretrained_embeddings)
Code example #13
 def test_reset_tqdm_logger_handlers(self):
     serialization_dir_a = os.path.join(self.TEST_DIR, "test_a")
     os.makedirs(serialization_dir_a, exist_ok=True)
     prepare_global_logging(serialization_dir_a)
     serialization_dir_b = os.path.join(self.TEST_DIR, "test_b")
     os.makedirs(serialization_dir_b, exist_ok=True)
     prepare_global_logging(serialization_dir_b)
     # Use range(1) so that only two lines (0% and 100%) should end up in the log file
     for _ in Tqdm.tqdm(range(1)):
         pass
     with open(os.path.join(serialization_dir_a, "out.log"), "r") as f:
         assert len(f.readlines()) == 0
     with open(os.path.join(serialization_dir_b, "out.log"), "r") as f:
         assert len(f.readlines()) == 2
Code example #14
File: elmo.py  Project: vedeshk/allennlp
    def embed_file(self,
                   input_file: IO,
                   output_file_path: str,
                   output_format: str = "all",
                   batch_size: int = DEFAULT_BATCH_SIZE) -> None:
        """
        Computes ELMo embeddings from an input_file where each line contains a sentence tokenized by whitespace.
        The ELMo embeddings are written out in HDF5 format, where each sentence is saved in a dataset.

        Parameters
        ----------
        input_file : ``IO``, required
            A file with one tokenized sentence per line.
        output_file_path : ``str``, required
            A path to the output hdf5 file.
        output_format : ``str``, optional, (default = "all")
            The embeddings to output.  Must be one of "all", "top", or "average".
        batch_size : ``int``, optional, (default = 64)
            The number of sentences to process in ELMo at one time.
        """

        assert output_format in ["all", "top", "average"]

        # Tokenizes the sentences.
        sentences = [line.strip() for line in input_file if line.strip()]
        split_sentences = [sentence.split() for sentence in sentences]
        # Uses the sentence as the key.
        embedded_sentences = zip(
            sentences, self.embed_sentences(split_sentences, batch_size))

        logger.info("Processing sentences.")
        with h5py.File(output_file_path, 'w') as fout:
            for key, embeddings in Tqdm.tqdm(embedded_sentences):
                if key in fout.keys():
                    logger.warning(
                        f"Key already exists in {output_file_path}, skipping: {key}"
                    )
                else:
                    if output_format == "all":
                        output = embeddings
                    elif output_format == "top":
                        output = embeddings[2]
                    elif output_format == "average":
                        output = numpy.average(embeddings, axis=0)

                    fout.create_dataset(key,
                                        output.shape,
                                        dtype='float32',
                                        data=output)
        input_file.close()
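Assuming the HDF5 layout produced above (one dataset per sentence, keyed by the sentence text), a hedged sketch of reading the embeddings back; the file name is illustrative.

import h5py

with h5py.File("elmo_embeddings.hdf5", "r") as fin:
    for key in fin.keys():
        embeddings = fin[key][()]   # numpy array for this sentence
        print(key, embeddings.shape)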
Code example #15
    def predict_instances_to_file(self, instances, path, batch_size=128):
        all_outputs = []
        for i in Tqdm.tqdm(range(math.ceil(len(instances) / batch_size))):
            batch_instances = instances[i * batch_size:(i + 1) * batch_size]
            model_outputs = self.predict_batch_instance(batch_instances)

            for j, instance in enumerate(batch_instances):
                outputs = self._decode_by_output(model_outputs[j])
                outputs['text'] = ''.join(
                    map(str, instance.fields['text'].tokens))
                all_outputs.append(outputs)

        with open(path, 'w', encoding='utf-8') as f:
            json.dump(all_outputs, f, indent=2, ensure_ascii=False)
Code example #16
    def iter_instances(self) -> Iterator[Instance]:
        if self._instances:
            yield from self._instances
        else:
            if self.max_instances_in_memory is None:
                self._instances = []

            if self.num_workers <= 0:
                # Just read all instances in main process.
                for instance in Tqdm.tqdm(self.reader.read(self.data_path),
                                          desc="loading instances"):
                    self.reader.apply_token_indexers(instance)
                    if self.max_instances_in_memory is None:
                        self._instances.append(instance)  # type: ignore
                    if self._vocab is not None:
                        instance.index_fields(self._vocab)
                    yield instance
            else:
                ctx = mp.get_context(self.start_method)
                queue: mp.JoinableQueue = (
                    ctx.JoinableQueue()
                    if self._max_instance_queue_size is None else
                    ctx.JoinableQueue(maxsize=self._max_instance_queue_size))
                workers = self._start_instance_workers(queue, ctx)

                try:
                    for instance in Tqdm.tqdm(self._gather_instances(queue),
                                              desc="loading instances"):
                        if self.max_instances_in_memory is None:
                            self._instances.append(instance)  # type: ignore
                        yield instance
                finally:
                    if hasattr(queue, "close"
                               ):  # for compat with different Python versions.
                        queue.close()  # type: ignore[attr-defined]
                    self._join_workers(workers, queue)
Code example #17
    def make_hdf5_file(self, sentences: List[str], out_fn: str) -> None:
        """
		Given a list of sentences, tokenize each one and vectorize the tokens. Write the embeddings
		to out_fn in the HDF5 file format. The index in the data corresponds to the sentence index.
		"""
        sentence_index = 0

        with h5py.File(out_fn, 'w') as fout:
            for sentence in Tqdm.tqdm(sentences):
                embeddings = self.vectorize(sentence)
                fout.create_dataset(str(sentence_index),
                                    embeddings.shape,
                                    dtype='float32',
                                    data=embeddings)
                sentence_index += 1
Code example #18
File: ontonotes.py  Project: uganyasavur/allennlp
    def dataset_path_iterator(file_path: str) -> Iterator[str]:
        """
        An iterator returning file_paths in a directory
        containing CONLL-formatted files.
        """
        logger.info("Reading CONLL sentences from dataset files at: %s", file_path)
        for root, _, files in Tqdm.tqdm(list(os.walk(file_path))):
            for data_file in files:
                # These are a relic of the dataset pre-processing. Every
                # file will be duplicated - one file called filename.gold_skel
                # and one generated from the preprocessing called filename.gold_conll.
                if not data_file.endswith("gold_conll"):
                    continue

                yield os.path.join(root, data_file)
Code example #19
 def _read(self, file_path):
     with open(file_path, 'r', encoding="utf8") as data_file:
         lines = data_file.readlines()
         self.total_instances = len(lines)
         if self._tqdm:
             lines = Tqdm.tqdm(lines)
         for line_num, line in enumerate(lines):
             line = line.strip("\n")
             if not line:
                 continue
             tokenized_sentence, sentence_len, sentiment = \
                 self._tokenizer.tokenize(line)
             if sentence_len > self._max_len or sentence_len < self._min_len:
                 continue
             yield self.text_to_instance(tokenized_sentence, sentiment)
Code example #20
    def _read(self, file_path):
        # we need to prepare word frequency stats before dealing with our corpus to subsample frequent words
        if self._word_sample_prob is None:
            logger.info(f'Building word frequency stats...')
            self._word_sample_prob = {}
            total = 0
            with open(cached_path(file_path), 'r') as f:
                for line in Tqdm.tqdm(f.readlines()):
                    tokens = line.strip().split()
                    for token in tokens:
                        if token in self._word_sample_prob:
                            self._word_sample_prob[token] += 1
                        else:
                            self._word_sample_prob[token] = 1
                        total += 1

                for k, v in self._word_sample_prob.items():
                    # convert count into frequency
                    self._word_sample_prob[k] = v / total
                    # word downsampling to prevent frequent words from being shown so much
                    self._word_sample_prob[k] = max(0, 1 - np.sqrt(self._subsampling_threshold /
                                                                   self._word_sample_prob[k]))

        logger.info(f'Reading instances from lines in file at {file_path}')
        with open(cached_path(file_path), 'r') as f:
            for line in Tqdm.tqdm(f.readlines()):
                tokens = line.strip().split()
                for i in range(len(tokens)):
                    if np.random.binomial(1, self._word_sample_prob[tokens[i]]):
                        start = max(0, i - self._window_size)
                        end = min(len(tokens) - 1, i + self._window_size)

                        source = Token(tokens[i])
                        targets = [Token(tokens[j]) for j in range(start, end + 1) if i != j]

                        yield self.text_to_instance(source, targets)
Code example #21
 def _read(self, file_path):
     # if `file_path` is a URL, redirect to the cache
     # file_path = cached_path(file_path)
     for filename in os.listdir(file_path):
         filename_splitted = filename.split('_')
         task_name = filename_splitted[-3]
         domain_name = filename_splitted[-2]
         if task_name not in self._tasks or domain_name not in self._domains:
             continue
         with open(os.path.join(file_path, filename), "r") as data_file:
             logger.info("Reading instances from lines in file at: %s",
                         filename)
             for line in Tqdm.tqdm(data_file):
                 line = line.strip("\n")
                 # skip blank lines
                 if not line:
                     continue
                 tokens_and_tags = [
                     pair.rsplit(self._word_tag_delimiter, 1)
                     for pair in line.split(self._token_delimiter)
                 ]
                 tokens = [Token(token) for token, tag in tokens_and_tags]
                 tags = [tag for token, tag in tokens_and_tags]
                 sequence = TextField(tokens, self._token_indexers)
                 sequence_tags = SequenceLabelField(
                     tags, sequence, label_namespace=task_name + '_labels')
                 task_field = LabelField(task_name,
                                         label_namespace="task_labels")
                 domain_field = LabelField(domain_name,
                                           label_namespace="domain_labels")
                 input_dict = {
                     'task_token': task_field,
                     'domain_token': domain_field,
                     'tokens': sequence
                 }
                 all_tags = []
                 empty_tags = ['O'] * len(tags)
                 for tsk in self._tasks:
                     if tsk != task_name:
                         empty_sequence_tags = SequenceLabelField(
                             empty_tags,
                             sequence,
                             label_namespace=tsk + '_labels')
                         all_tags.append(empty_sequence_tags)
                     else:
                         all_tags.append(sequence_tags)
                 input_dict['all_tags'] = ListField(all_tags)
                 yield Instance(input_dict)
Code example #22
File: elmo.py  Project: Jordan-Sauchuk/allennlp
    def embed_file(self,
                   input_file: IO,
                   output_file_path: str,
                   output_format: str = "all",
                   batch_size: int = DEFAULT_BATCH_SIZE) -> None:
        """
        Computes ELMo embeddings from an input_file where each line contains a sentence tokenized by whitespace.
        The ELMo embeddings are written out in HDF5 format, where each sentence is saved in a dataset.

        Parameters
        ----------
        input_file : ``IO``, required
            A file with one tokenized sentence per line.
        output_file_path : ``str``, required
            A path to the output hdf5 file.
        output_format : ``str``, optional, (default = "all")
            The embeddings to output.  Must be one of "all", "top", or "average".
        batch_size : ``int``, optional, (default = 64)
            The number of sentences to process in ELMo at one time.
        """

        assert output_format in ["all", "top", "average"]

        # Tokenizes the sentences.
        sentences = [line.strip() for line in input_file if line.strip()]
        split_sentences = [sentence.split() for sentence in sentences]
        # Uses the sentence as the key.
        embedded_sentences = zip(sentences, self.embed_sentences(split_sentences, batch_size))

        logger.info("Processing sentences.")
        with h5py.File(output_file_path, 'w') as fout:
            for key, embeddings in Tqdm.tqdm(embedded_sentences):
                if key in fout.keys():
                    logger.warning(f"Key already exists in {output_file_path}, skipping: {key}")
                else:
                    if output_format == "all":
                        output = embeddings
                    elif output_format == "top":
                        output = embeddings[2]
                    elif output_format == "average":
                        output = numpy.average(embeddings, axis=0)

                    fout.create_dataset(
                            key,
                            output.shape, dtype='float32',
                            data=output
                    )
        input_file.close()
Code example #23
File: file_utils.py  Project: cl-tohoku/allennlp
def get_from_cache(url: str, cache_dir: str = None) -> str:
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
        cache_dir = DATASET_CACHE

    os.makedirs(cache_dir, exist_ok=True)

    # make HEAD request to check ETag
    response = requests.head(url)
    if response.status_code != 200:
        raise IOError("HEAD request failed for url {}".format(url))

    # add ETag to filename if it exists
    etag = response.headers.get("ETag")
    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    if not os.path.exists(cache_path):
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        _, temp_filename = tempfile.mkstemp()
        logger.info("%s not found in cache, downloading to %s", url,
                    temp_filename)

        # GET file object
        req = requests.get(url, stream=True)
        content_length = req.headers.get('Content-Length')
        total = int(content_length) if content_length is not None else None
        progress = Tqdm.tqdm(unit="B", total=total)
        with open(temp_filename, 'wb') as temp_file:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    progress.update(len(chunk))
                    temp_file.write(chunk)

        progress.close()

        logger.info("copying %s to cache at %s", temp_filename, cache_path)
        shutil.copyfile(temp_filename, cache_path)
        logger.info("removing temp file %s", temp_filename)
        os.remove(temp_filename)

    return cache_path
Code example #24
File: evaluate.py  Project: zwdcs/allennlp
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances,
                                 num_epochs=1,
                                 shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        batch_count = 0
        loss_count = 0
        total_loss = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = util.move_to_device(batch, cuda_device)
            loss = model(**batch).get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                metrics["loss"] = loss.item()
                total_loss += loss.item()

            if (not _warned_tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes " +
                                   "produced a loss!")
            final_metrics["loss"] = total_loss/batch_count

        return final_metrics
Code example #25
File: evaluate.py  Project: Jordan-Sauchuk/allennlp
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description, refresh=False)

    return model.get_metrics(reset=True)
Code example #26
File: vocabulary.py  Project: ziaridoy20/allennlp
def _read_pretrained_tokens(embeddings_file_uri: str) -> List[str]:
    # Moving this import to the top breaks everything (circular import, I guess)
    from allennlp.modules.token_embedders.embedding import EmbeddingsTextFile

    logger.info('Reading pretrained tokens from: %s', embeddings_file_uri)
    tokens: List[str] = []
    with EmbeddingsTextFile(embeddings_file_uri) as embeddings_file:
        for line_number, line in enumerate(Tqdm.tqdm(embeddings_file), start=1):
            token_end = line.find(' ')
            if token_end >= 0:
                token = line[:token_end]
                tokens.append(token)
            else:
                line_begin = line[:20] + '...' if len(line) > 20 else line
                logger.warning('Skipping line number %d: %s', line_number, line_begin)
    return tokens
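A hedged sketch of calling the helper directly; in the surrounding module it appears to be driven through the `pretrained_files` argument of the `Vocabulary` constructors shown earlier. The embeddings path is a placeholder.

tokens = _read_pretrained_tokens("glove.6B.100d.txt")
print(len(tokens), tokens[:5])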
Code example #27
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description, refresh=False)

    return model.get_metrics()
Code example #28
File: vocabulary.py  Project: wgc20/GrailQA
def _read_pretrained_tokens(embeddings_file_uri: str) -> List[str]:
    # Moving this import to the top breaks everything (circular import, I guess)
    from allennlp.modules.token_embedders.embedding import EmbeddingsTextFile

    logger.info('Reading pretrained tokens from: %s', embeddings_file_uri)
    tokens: List[str] = []
    with EmbeddingsTextFile(embeddings_file_uri) as embeddings_file:
        for line_number, line in enumerate(Tqdm.tqdm(embeddings_file), start=1):
            token_end = line.find(' ')
            if token_end >= 0:
                token = line[:token_end]
                tokens.append(token)
            else:
                line_begin = line[:20] + '...' if len(line) > 20 else line
                logger.warning('Skipping line number %d: %s', line_number, line_begin)
    return tokens
Code example #29
def main(serialization_directory, device):
    """
    serialization_directory : str, required.
        The directory containing the serialized weights.
    device: int, default = -1
        The device to run the evaluation on.
    """

    config = Params.from_file(os.path.join(serialization_directory, "config.json"))
    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    evaluation_data_path = config['validation_data_path']

    model = Model.load(config, serialization_dir=serialization_directory, cuda_device=device)

    prediction_file_path = os.path.join(serialization_directory, "predictions.txt")
    gold_file_path = os.path.join(serialization_directory, "gold.txt")
    prediction_file = open(prediction_file_path, "w+")
    gold_file = open(gold_file_path, "w+")

    # Load the evaluation data and index it.
    print("Reading evaluation data from {}".format(evaluation_data_path))
    instances = dataset_reader.read(evaluation_data_path)
    iterator = BasicIterator(batch_size=32)
    iterator.index_with(model.vocab)

    model_predictions = []
    batches = iterator(instances, num_epochs=1, shuffle=False, cuda_device=device, for_training=False)
    for batch in Tqdm.tqdm(batches):
        result = model(**batch)
        predictions = model.decode(result)
        model_predictions.extend(predictions["tags"])

    for instance, prediction in zip(instances, model_predictions):
        fields = instance.fields
        try:
            # Most sentences have a verbal predicate, but not all.
            verb_index = fields["verb_indicator"].labels.index(1)
        except ValueError:
            verb_index = None

        gold_tags = fields["tags"].labels
        sentence = fields["tokens"].tokens

        write_to_conll_eval_file(prediction_file, gold_file,
                                 verb_index, sentence, prediction, gold_tags)
    prediction_file.close()
    gold_file.close()
Code example #30
File: predictor.py  Project: yaolinxia/WEAN
    def evaluate(self, model: Model):
        model.eval()

        val_generator = self.iterator(self.dataset,
                                      num_epochs=1,
                                      shuffle=False)

        num_validation_batches = self.iterator.get_num_batches(self.dataset)
        val_generator_tqdm = Tqdm.tqdm(val_generator, total=num_validation_batches)
        vocabulary = self.vocab.get_index_to_token_vocabulary('tokens')

        predictions, sources, references, alignments = [], [], [], []
        
        for data in self.reader.read_raw(self.data_path):
            sources.append(data['source'])
            references.append(data['target'])

        for batch in val_generator_tqdm:
            batch = util.move_to_device(batch, self.cuda_device)

            output_dict = model.predict(batch['src'], max_decoding_step=self.max_decoding_step)
            alignments += output_dict['alignments']

            for pred in output_dict['output_ids']:
                pred_sent = list(map(vocabulary.get, pred))
                if '@@EOS@@' in pred_sent:
                    pred_sent = pred_sent[:pred_sent.index('@@EOS@@')]
                pred_sent = ' '.join(pred_sent)
                predictions.append(pred_sent)
        
        for i in range(len(predictions)):
            source_sent = sources[i].split(' ')
            pred_sent = predictions[i].split(' ')
            for j in range(len(pred_sent)):
                if pred_sent[j] == '@@UNKNOWN@@' and alignments[i][j] < len(source_sent):
                    pred_sent[j] = source_sent[alignments[i][j]]
            predictions[i] = ' '.join(pred_sent)

        if self.post_map is not None:
            predictions = [self.post_processs(p, m) for p, m in zip(predictions, self.post_map)]
            references = [self.post_processs(r, m) for r, m in zip(references, self.post_map)]

        score = {}
        score['bleu'] = calc_bleu_score(predictions, references, self.log_dir)
        model.train()

        return score
Code example #31
File: file_utils.py  Project: Jordan-Sauchuk/allennlp
def get_from_cache(url: str, cache_dir: str = None) -> str:
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
        cache_dir = DATASET_CACHE

    os.makedirs(cache_dir, exist_ok=True)

    # make HEAD request to check ETag
    response = requests.head(url)
    if response.status_code != 200:
        raise IOError("HEAD request failed for url {}".format(url))

    # add ETag to filename if it exists
    etag = response.headers.get("ETag")
    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    if not os.path.exists(cache_path):
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        _, temp_filename = tempfile.mkstemp()
        logger.info("%s not found in cache, downloading to %s", url, temp_filename)

        # GET file object
        req = requests.get(url, stream=True)
        content_length = req.headers.get('Content-Length')
        total = int(content_length) if content_length is not None else None
        progress = Tqdm.tqdm(unit="B", total=total)
        with open(temp_filename, 'wb') as temp_file:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk: # filter out keep-alive new chunks
                    progress.update(len(chunk))
                    temp_file.write(chunk)

        progress.close()

        logger.info("copying %s to cache at %s", temp_filename, cache_path)
        shutil.copyfile(temp_filename, cache_path)
        logger.info("removing temp file %s", temp_filename)
        os.remove(temp_filename)

    return cache_path
Code example #32
 def from_dataset_reader(
     cls,
     reader: DatasetReader,
     data_path: str,
     batch_size: int,
     shuffle: bool = False,
     batches_per_epoch: Optional[int] = None,
     quiet: bool = False,
 ) -> "SimpleDataLoader":
     instance_iter = reader.read(data_path)
     if not quiet:
         instance_iter = Tqdm.tqdm(instance_iter, desc="loading instances")
     instances = list(instance_iter)
     return cls(instances,
                batch_size,
                shuffle=shuffle,
                batches_per_epoch=batches_per_epoch)
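A hedged usage sketch of the constructor above; `my_reader`, the data path, and the prebuilt `vocab` are placeholders, and indexing the loader with a vocabulary before iterating is an assumption about the surrounding API.

loader = SimpleDataLoader.from_dataset_reader(
    reader=my_reader,
    data_path="train.jsonl",
    batch_size=16,
    shuffle=True,
)
loader.index_with(vocab)   # assumed: batches need an indexed vocabulary
for batch in loader:
    pass                   # feed the batch tensors to a model here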
Code example #33
File: exvocab.py  Project: j6mes/fever-unc-system
    def from_instances(cls,
                       instances: Iterable['adi.Instance'],
                       min_count: Dict[str, int] = None,
                       max_vocab_size: Union[int, Dict[str, int]] = None,
                       non_padded_namespaces: Sequence[
                           str] = DEFAULT_NON_PADDED_NAMESPACES,
                       pretrained_files: Optional[Dict[str, str]] = None,
                       only_include_pretrained_words: bool = False,
                       unk_token_num: Dict[str, int] = None,
                       exclude_namespaces=None,
                       include_namespaces=None) -> 'ExVocabulary':
        """
        Constructs a vocabulary given a collection of `Instances` and some parameters.
        We count all of the vocabulary items in the instances, then pass those counts
        and the other parameters, to :func:`__init__`.  See that method for a description
        of what the other parameters do.
        """
        logger.info("Fitting token dictionary from dataset.")
        namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(
            lambda: defaultdict(int))
        for instance in Tqdm.tqdm(instances):
            instance.count_vocab_items(namespace_token_counts)

        if exclude_namespaces is not None:
            for namespace in namespace_token_counts:
                if namespace in exclude_namespaces:
                    namespace_token_counts[namespace] = dict()

                if include_namespaces is not None:
                    # If include namespaces is not None, we only include those namespaces.
                    if namespace not in include_namespaces:
                        namespace_token_counts[namespace] = dict()

        print("Start counting for namespaces:")
        for namespace, counter in namespace_token_counts.items():
            if len(counter) != 0:
                print(namespace)

        return ExVocabulary(
            counter=namespace_token_counts,
            min_count=min_count,
            max_vocab_size=max_vocab_size,
            non_padded_namespaces=non_padded_namespaces,
            pretrained_files=pretrained_files,
            only_include_pretrained_words=only_include_pretrained_words,
            unk_token_num=unk_token_num)
Code example #34
    def _read(self, file_path):
        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s",
                        file_path)
            for line_num, line in enumerate(Tqdm.tqdm(data_file)):
                line = line.strip("\n")

                if not line:
                    continue

                line_parts = line.split('\t')
                if len(line_parts) != 2:
                    raise ConfigurationError(
                        "Invalid line format: %s (line number %d)" %
                        (line, line_num + 1))
                source_sequence, target_sequence = line_parts
                yield self.text_to_instance(source_sequence, target_sequence)
Code example #35
File: analyze.py  Project: oshaikh13/contextual
def variance_explained_by_pc(
	embedding_fn: str, 
	word2sent_indexer: Dict[str, List[Tuple[int, int]]],
	variance_explained_fn : str,
	pc_fn : str) -> None:
	"""
	Each word in word2sent_indexer appears in multiple sentences. Thus each occurrence of the word 
	will have a different embedding at each layer. How much of the variance in these occurrence 
	embeddings can be explained by the first principal component? In other words, to what extent
	can these different occurrence embeddings be replaced by a single, static word embedding?
	
	Create a table of size (#words x #layers) and write the variance explained to variance_explained_fn.
	Write the first principal component for each word to pc_fn + str(layer_index), where each row 
	starts with a word followed by space-separated numbers.
	"""
	f = h5py.File(embedding_fn, 'r')
	num_layers = f["0"].shape[0]

	# write statistics to csv file: one row per word, one column per layer
	# excluding first layer, since we don't expect the input embeddings to be the same at all for gpt2/bert
	# and we expect them to be identical for elmo
	fieldnames = ['word'] + list(map(lambda w: 'layer_' + w, map(str, range(1, num_layers))))
	writer = csv.DictWriter(open(variance_explained_fn, 'w'), fieldnames=fieldnames)
	writer.writeheader()

	# files to write the principal components to 
	pc_vector_files = { layer: open(pc_fn + str(layer), 'w') for layer in range(1, num_layers) }

	for word in Tqdm.tqdm(word2sent_indexer):
		variance_explained = { 'word' : word }

		# calculate variance explained by the first principal component
		for layer in range(1, num_layers):
			embeddings = [ f[str(sent_index)][layer, word_index].tolist() for sent_index, word_index 
				in word2sent_indexer[word] if f[str(sent_index)][layer, word_index].shape != () ]

			pca = PCA(n_components=1)
			pca.fit(embeddings)

			pca_svd = TruncatedSVD(n_components=100)
			pca_svd.fit(embeddings)
			
			variance_explained[f'layer_{layer}'] = min(1.0, round(pca.explained_variance_ratio_[0], 3))
			pc_vector_files[layer].write(' '.join([word] + list(map(str, pca_svd.components_[0]))) + '\n')

		writer.writerow(variance_explained)
Code example #36
 def _read(self, file_path: str):
     self._dataset_cache = None
     if self._dataset_dir_out is not None:
         self._dataset_cache = []
     instances = self._read_internal(file_path)
     if self._dataset_cache is not None:
         if not isinstance(instances, list):
             instances = [instance for instance in Tqdm.tqdm(instances)]
         if not os.path.exists(self._dataset_dir_out):
             os.mkdir(self._dataset_dir_out)
         output_file = os.path.join(self._dataset_dir_out, os.path.basename(file_path))
         logger.info(f"Saving contextualized dataset to {output_file}.")
         with open(output_file, 'w') as file:
             for d in self._dataset_cache:
                 file.write(json.dumps(d))
                 file.write("\n")
     return instances
Code example #37
    def _read(self, file_path: str) -> Iterator[Instance]:

        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as text_file:
            instance_strings = text_file.readlines()
            if self._lower:
                instance_strings = [
                    string.lower() for string in instance_strings
                ]

        if self._tokens_per_instance is not None:
            all_text = " ".join(
                [x.replace("\n", " ").strip() for x in instance_strings])
            tokenized_text = self._tokenizer.tokenize(all_text)
            num_tokens = self._tokens_per_instance + 1
            tokenized_strings = []
            logger.info("Creating dataset from all text in file: %s",
                        file_path)
            for index in Tqdm.tqdm(
                    range(0,
                          len(tokenized_text) - num_tokens, num_tokens - 1)):
                tokenized_strings.append(tokenized_text[index:(index +
                                                               num_tokens)])
        else:
            tokenized_strings = [
                self._tokenizer.tokenize(s) for s in instance_strings
            ]

        for line in tokenized_strings:
            sentence = []
            ent_types = []
            for token in line:  # Type: allennlp.data.tokenizers.token.Token
                token = str(token)
                ent_type = token[0]
                if ent_type not in ['!', '*']:
                    ent_type = '_'  # Indicates irrelevant non-tagged tokens.
                else:
                    token = token[1:]
                sentence.append(token)
                ent_types.append(ent_type)
            if sentence != []:
                yield self.text_to_instance([Token(word) for word in sentence],
                                            ent_types)
Code example #38
File: trainer.py  Project: Fritz449/ProtonetCode
    def _validation_loss(self) -> Tuple[float, int]:
        """
        Computes the validation loss. Returns it and the number of batches.
        """
        logger.info("Validating")

        self._model.eval()

        val_generator = self._iterator(self._validation_data,
                                       num_epochs=1,
                                       cuda_device=self._iterator_device,
                                       for_training=False)
        num_validation_batches = self._iterator.get_num_batches(
            self._validation_data)
        val_generator_tqdm = Tqdm.tqdm(val_generator,
                                       total=num_validation_batches)
        batch_num = 0
        val_loss = 0
        for batch in val_generator_tqdm:
            batch_num += 1
            self._optimizer.zero_grad()

            loss = self._batch_loss(batch, for_training=False)
            val_loss += loss.data.cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = self._get_metrics(val_loss, batch_num, reset=False)

            if self._model.new and batch_num > 1:
                logger.info(" ")
                self.f1s_valid.append(val_metrics['f1-measure-overall'])
                np.save('results/f1', self.f1s_valid)
                self._get_metrics(val_loss, batch_num, reset=True)
                self._batch_loss(batch, for_training=False)
                val_metrics = self._get_metrics(val_loss,
                                                batch_num,
                                                reset=False)

            description = self._description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        self.f1s_valid.append(val_metrics['f1-measure-overall'])
        np.save('results/f1', self.f1s_valid)

        return val_loss, batch_num
Code example #39
File: pt_trainner.py  Project: polixir/abl-sym
    def _validation_loss(self) -> Tuple[float, int]:
        logger.info("Validating")

        self.model.eval()

        # Replace parameter values with the shadow values from the moving averages.
        if self._moving_average is not None:
            self._moving_average.assign_average_value()

        if self._validation_iterator is not None:
            val_iterator = self._validation_iterator
        else:
            val_iterator = self.iterator

        num_gpus = len(self._cuda_devices)

        if getattr(self, "val_dataset", None) is None:
            self.val_dataset = DMDataSet(data=self._validation_data[0],
                                         batch_size=self.batch_size,
                                         num_gpus=num_gpus,
                                         shuffle=False)
        num_validation_batches = math.ceil(
            len(self.val_dataset) / self.batch_size / num_gpus)
        val_generator_tqdm = Tqdm.tqdm(self.val_dataset,
                                       total=num_validation_batches)
        batches_this_epoch = 0
        val_loss = 0
        for batch_group in val_generator_tqdm:

            loss = self.batch_loss(batch_group, for_training=False)
            if loss is not None:
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = training_util.get_metrics(self.model, val_loss,
                                                    batches_this_epoch)
            description = training_util.description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        # Now restore the original parameter values.
        if self._moving_average is not None:
            self._moving_average.restore()

        return val_loss, batches_this_epoch
Code example #40
File: metatrainer.py  Project: ha-lins/medical_dialog
    def reptile_outer_update(self, train_generators: List[Iterable],
                             iteration: int, num_gpus: int):
        # https://github.com/farbodtm/reptile-pytorch/blob/master/reptile.py
        weights_before = deepcopy(self.model.state_dict())
        self.optimizer.zero_grad()
        random.shuffle(train_generators)
        new_weights = []
        total_loss = 0.0
        # for batch in train_generators[0]:
        #     print('[info]batch is:{}'.format(batch))

        task_wrap = Tqdm.tqdm(zip(train_generators[0], train_generators[1],
                                  train_generators[2]),
                              total=1)
        # , train_generators[3], train_generators[4]), \

        for i, batch_group in enumerate(task_wrap):
            if not i:
                for k in range(self.meta_batch_size):  # tasks per batch
                    total_loss += self.reptile_inner_update(batch_group[k][0])
                    new_weights.append(deepcopy(self.model.state_dict()))
                    self.model.load_state_dict({
                        name: weights_before[name]
                        for name in weights_before
                    })
            else:
                break

        weights_after = {
            name: new_weights[0][name] / float(self.meta_batch_size)
            for name in new_weights[0]
        }
        for i in range(1, self.meta_batch_size):
            for name in new_weights[i]:
                weights_after[name] += new_weights[i][name] / float(
                    self.meta_batch_size)
        # They used a self.step_size of 1.0 in some of their outer updates.
        outerstepsize = self.meta_step_size * (
            1 - iteration / self.meta_batches)  # linear schedule
        self.model.load_state_dict({
            name: weights_before[name] +
            (weights_after[name] - weights_before[name]) * outerstepsize
            for name in weights_before
        })
        return total_loss / self.meta_batch_size
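
The final ``load_state_dict`` call above is the Reptile interpolation: each parameter moves a fraction ``outerstepsize`` of the way from the pre-task weights toward the task-averaged weights. A standalone sketch of that interpolation on plain tensors (the function name is illustrative, not part of the original trainer):

import torch

def reptile_interpolate(weights_before, weights_after, outerstepsize):
    # Move each parameter a fraction `outerstepsize` toward the task-averaged value.
    return {name: weights_before[name]
                  + (weights_after[name] - weights_before[name]) * outerstepsize
            for name in weights_before}

before = {"w": torch.zeros(3)}
after = {"w": torch.ones(3)}
print(reptile_interpolate(before, after, outerstepsize=0.1))  # {'w': tensor([0.1, 0.1, 0.1])}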
Code example #41
File: predictor.py  Project: lancopku/Pivot
    def predict(self, model: Model):
        model.eval()

        generator_tqdm = Tqdm.tqdm(self.dataloader, total=len(self.dataloader))
        model_outputs = {}

        for batch in generator_tqdm:
            with torch.no_grad():
                batch = util.move_to_device(batch, self.cuda_device)
                output_dict = model.back2table(**batch)
                for key in output_dict:
                    if key not in model_outputs:
                        model_outputs[key] = output_dict[key]
                    else:
                        model_outputs[key] += output_dict[key]

        predictions = self.corpus.predict(model_outputs, self.dataset)        
        model.train()
        return predictions
Code example #42
File: trainer.py  Project: pyknife/allennlp
    def _validation_loss(self) -> Tuple[float, int]:
        """
        Computes the validation loss. Returns it and the number of batches.
        """
        logger.info("Validating")

        self._model.eval()

        if self._validation_iterator is not None:
            val_iterator = self._validation_iterator
        else:
            val_iterator = self._iterator

        val_generator = val_iterator(self._validation_data,
                                     num_epochs=1,
                                     cuda_device=self._iterator_device)
        num_validation_batches = val_iterator.get_num_batches(self._validation_data)
        val_generator_tqdm = Tqdm.tqdm(val_generator,
                                       total=num_validation_batches)
        batches_this_epoch = 0
        val_loss = 0
        for batch in val_generator_tqdm:

            loss = self._batch_loss(batch, for_training=False)
            if loss is not None:
                # You shouldn't necessarily have to compute a loss for validation, so we allow for
                # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                # currently only used as the divisor for the loss function, so we can safely only
                # count those batches for which we actually have a loss.  If this variable ever
                # gets used for something else, we might need to change things around a bit.
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = self._get_metrics(val_loss, batches_this_epoch)
            description = self._description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        return val_loss, batches_this_epoch
Code example #43
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as text_file:
            instance_strings = text_file.readlines()

        if self._tokens_per_instance is not None:
            all_text = " ".join([x.replace("\n", " ").strip() for x in instance_strings])
            tokenized_text = self._tokenizer.tokenize(all_text)
            num_tokens = self._tokens_per_instance + 1
            tokenized_strings = []
            logger.info("Creating dataset from all text in file: %s", file_path)
            for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
                tokenized_strings.append(tokenized_text[index:(index + num_tokens)])
        else:
            tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

        for tokenized_string in tokenized_strings:
            input_field = TextField(tokenized_string[:-1], self._token_indexers)
            output_field = TextField(tokenized_string[1:], self._output_indexer)
            yield Instance({'input_tokens': input_field,
                            'output_tokens': output_field})
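
The two ``TextField`` slices above are what make this a language-modeling reader: the inputs are every token except the last, and the targets are every token except the first. A tiny illustration on a plain list of hypothetical tokens:

tokenized_string = ["the", "cat", "sat", "</S>"]  # hypothetical tokens
input_tokens = tokenized_string[:-1]   # ['the', 'cat', 'sat']
output_tokens = tokenized_string[1:]   # ['cat', 'sat', '</S>']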
Code example #44
File: trainer.py  Project: pyknife/allennlp
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}")
        for gpu, memory in gpu_memory_mb().items():
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._model.train()

        # Get tqdm for the training batches
        train_generator = self._iterator(self._train_data,
                                         num_epochs=1,
                                         cuda_device=self._iterator_device)
        num_training_batches = self._iterator.get_num_batches(self._train_data)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        if self._histogram_interval is not None:
            histogram_parameters = set(self._model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        for batch in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self._log_histograms_this_batch = self._histogram_interval is not None and (
                    batch_num_total % self._histogram_interval == 0)

            self._optimizer.zero_grad()

            loss = self._batch_loss(batch, for_training=True)
            loss.backward()

            train_loss += loss.item()

            batch_grad_norm = self._rescale_gradients()

            # This does nothing if batch_num_total is None or you are using an
            # LRScheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)

            if self._log_histograms_this_batch:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {name: param.detach().cpu().clone()
                                 for name, param in self._model.named_parameters()}
                self._optimizer.step()
                for name, param in self._model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar("gradient_update/" + name,
                                                       update_norm / (param_norm + 1e-7),
                                                       batch_num_total)
            else:
                self._optimizer.step()

            # Update the description with the latest metrics
            metrics = self._get_metrics(train_loss, batches_this_epoch)
            description = self._description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if batch_num_total % self._summary_interval == 0:
                self._parameter_and_gradient_statistics_to_tensorboard(batch_num_total, batch_grad_norm)
                self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"], batch_num_total)
                self._metrics_to_tensorboard(batch_num_total,
                                             {"epoch_metrics/" + k: v for k, v in metrics.items()})

            if self._log_histograms_this_batch:
                self._histograms_to_tensorboard(batch_num_total, histogram_parameters)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval
            ):
                last_save_time = time.time()
                self._save_checkpoint(
                        '{0}.{1}'.format(epoch, time_to_str(int(last_save_time))), [], is_best=False
                )

        return self._get_metrics(train_loss, batches_this_epoch, reset=True)
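
The histogram branch above logs how large each optimizer step is relative to the parameter it updates. A self-contained sketch of that ratio on plain tensors (function and argument names are illustrative):

import torch

def update_to_param_ratio(param_before: torch.Tensor, param_after: torch.Tensor) -> torch.Tensor:
    # Norm of the optimizer step divided by the norm of the updated parameter,
    # with a small epsilon to avoid division by zero.
    update_norm = torch.norm((param_before - param_after).view(-1))
    param_norm = torch.norm(param_after.view(-1))
    return update_norm / (param_norm + 1e-7)

print(update_to_param_ratio(torch.ones(4), torch.full((4,), 1.01)))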
Code example #45
File: elmo.py  Project: apmoore1/allennlp
    def embed_file(self,
                   input_file: IO,
                   output_file_path: str,
                   output_format: str = "all",
                   batch_size: int = DEFAULT_BATCH_SIZE,
                   forget_sentences: bool = False,
                   use_sentence_keys: bool = False) -> None:
        """
        Computes ELMo embeddings from an input_file where each line contains a sentence tokenized by whitespace.
        The ELMo embeddings are written out in HDF5 format, where each sentence embedding
        is saved in a dataset with the line number in the original file as the key.

        Parameters
        ----------
        input_file : ``IO``, required
            A file with one tokenized sentence per line.
        output_file_path : ``str``, required
            A path to the output hdf5 file.
        output_format : ``str``, optional, (default = "all")
            The embeddings to output.  Must be one of "all", "top", or "average".
        batch_size : ``int``, optional, (default = 64)
            The number of sentences to process in ELMo at one time.
        forget_sentences : ``bool``, optional, (default = False).
            If use_sentence_keys is False, whether or not to include a string
            serialized JSON dictionary that associates sentences with their
            line number (its HDF5 key). The mapping is placed in the
            "sentence_to_index" HDF5 key. This is useful if
            you want to use the embeddings without keeping the original file
            of sentences around.
        use_sentence_keys : ``bool``, optional, (default = False).
            Whether or not to use full sentences as keys. By default,
            the line numbers of the input file are used as ids, which is more robust.
        """

        assert output_format in ["all", "top", "average"]

        # Tokenizes the sentences.
        sentences = [line.strip() for line in input_file]

        blank_lines = [i for (i, line) in enumerate(sentences) if line == ""]
        if blank_lines:
            raise ConfigurationError(f"Your input file contains empty lines at indexes "
                                     f"{blank_lines}. Please remove them.")
        split_sentences = [sentence.split() for sentence in sentences]
        # Uses the sentence index as the key.

        if use_sentence_keys:
            logger.warning("Using sentences as keys can fail if sentences "
                           "contain forward slashes or colons. Use with caution.")
            embedded_sentences = zip(sentences, self.embed_sentences(split_sentences, batch_size))
        else:
            embedded_sentences = ((str(i), x) for i, x in
                                  enumerate(self.embed_sentences(split_sentences, batch_size)))

        sentence_to_index = {}
        logger.info("Processing sentences.")
        with h5py.File(output_file_path, 'w') as fout:
            for key, embeddings in Tqdm.tqdm(embedded_sentences):
                if use_sentence_keys and key in fout.keys():
                    raise ConfigurationError(f"Key already exists in {output_file_path}. "
                                             f"To encode duplicate sentences, do not pass "
                                             f"the --use-sentence-keys flag.")

                if not forget_sentences and not use_sentence_keys:
                    sentence = sentences[int(key)]
                    sentence_to_index[sentence] = key

                if output_format == "all":
                    output = embeddings
                elif output_format == "top":
                    output = embeddings[-1]
                elif output_format == "average":
                    output = numpy.average(embeddings, axis=0)

                fout.create_dataset(
                        str(key),
                        output.shape, dtype='float32',
                        data=output
                )
            if not forget_sentences and not use_sentence_keys:
                sentence_index_dataset = fout.create_dataset(
                        "sentence_to_index",
                        (1,),
                        dtype=h5py.special_dtype(vlen=str))
                sentence_index_dataset[0] = json.dumps(sentence_to_index)

        input_file.close()
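
A minimal sketch of reaching ``embed_file`` through ``ElmoEmbedder``, assuming the allennlp 0.x package layout; the input and output paths are placeholders:

from allennlp.commands.elmo import ElmoEmbedder

embedder = ElmoEmbedder()  # default ELMo options and weights
with open("sentences.txt", "r") as sentence_file:  # one whitespace-tokenized sentence per line
    embedder.embed_file(sentence_file,
                        "elmo_embeddings.hdf5",
                        output_format="average",
                        batch_size=64)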
Code example #46
def main(serialization_directory: str,
         device: int,
         data: str,
         prefix: str,
         domain: str = None):
    """
    serialization_directory : str, required.
        The directory containing the serialized weights.
    device: int, default = -1
        The device to run the evaluation on.
    data: str, default = None
        The data to evaluate on. By default, we use the validation data from
        the original experiment.
    prefix: str, default=""
        The prefix to prepend to the generated gold and prediction files, to distinguish
        different models/data.
    domain: str, optional (default = None)
        If passed, filters the ontonotes evaluation/test dataset to only contain the
        specified domain. This overwrites the domain in the config file from the model,
        to allow evaluation on domains other than the one the model was trained on.
    """
    config = Params.from_file(os.path.join(serialization_directory, "config.json"))

    if domain is not None:
        # Hack to allow evaluation on different domains than the
        # model was trained on.
        config["dataset_reader"]["domain_identifier"] = domain
        prefix = f"{domain}_{prefix}"
    else:
        config["dataset_reader"].pop("domain_identifier", None)

    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    evaluation_data_path = data if data else config['validation_data_path']

    archive = load_archive(os.path.join(serialization_directory, "model.tar.gz"), cuda_device=device)
    model = archive.model
    model.eval()

    prediction_file_path = os.path.join(serialization_directory, prefix + "_predictions.txt")
    gold_file_path = os.path.join(serialization_directory, prefix + "_gold.txt")
    prediction_file = open(prediction_file_path, "w+")
    gold_file = open(gold_file_path, "w+")

    # Load the evaluation data and index it.
    print("reading evaluation data from {}".format(evaluation_data_path))
    instances = dataset_reader.read(evaluation_data_path)

    with torch.autograd.no_grad():
        iterator = BasicIterator(batch_size=32)
        iterator.index_with(model.vocab)

        model_predictions = []
        batches = iterator(instances, num_epochs=1, shuffle=False, cuda_device=device)
        for batch in Tqdm.tqdm(batches):
            result = model(**batch)
            predictions = model.decode(result)
            model_predictions.extend(predictions["tags"])

        for instance, prediction in zip(instances, model_predictions):
            fields = instance.fields
            try:
                # Most sentences have a verbal predicate, but not all.
                verb_index = fields["verb_indicator"].labels.index(1)
            except ValueError:
                verb_index = None

            gold_tags = fields["tags"].labels
            sentence = [x.text for x in fields["tokens"].tokens]

            write_to_conll_eval_file(prediction_file, gold_file,
                                     verb_index, sentence, prediction, gold_tags)
        prediction_file.close()
        gold_file.close()
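
A hypothetical call to ``main``, using the argument names documented above and placeholder paths:

main(serialization_directory="/path/to/serialization_dir",
     device=-1,
     data="/path/to/evaluation_data",
     prefix="test",
     domain=None)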
Code example #47
File: evaluate.py  Project: apmoore1/allennlp
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances,
                                 num_epochs=1,
                                 shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if (not _warned_tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes " +
                                   "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
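
A minimal sketch of driving ``evaluate`` the way the ``allennlp evaluate`` command does, assuming an allennlp 0.x model archive; the archive and data paths are placeholders:

from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.iterators import DataIterator
from allennlp.models.archival import load_archive

archive = load_archive("model.tar.gz", cuda_device=-1)
model = archive.model
config = archive.config

# Rebuild the reader and iterator from the archived config, then index the data.
reader = DatasetReader.from_params(config.pop("dataset_reader"))
instances = reader.read("evaluation_data.jsonl")
iterator = DataIterator.from_params(config.pop("iterator"))
iterator.index_with(model.vocab)

metrics = evaluate(model, instances, iterator, cuda_device=-1, batch_weight_key="")
print(metrics)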