Example 1
    def forward_on_instances(
            self, instances: List[Instance],
            cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
        """
        Takes a list of :class:`~allennlp.data.instance.Instance`s, converts that text into
        arrays using this model's :class:`Vocabulary`, passes those arrays through
        :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
        and returns the result.  Before returning the result, we convert any
        ``torch.autograd.Variables`` or ``torch.Tensors`` into numpy arrays and separate the
        batched output into a list of individual dicts per instance. Note that typically
        this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
        :func:`forward_on_instance`.
        """
        dataset = Dataset(instances)
        dataset.index_instances(self.vocab)
        model_input = dataset.as_tensor_dict(cuda_device=cuda_device,
                                             for_training=False)
        outputs = self.decode(self(**model_input))

        instance_separated_output: List[Dict[str, numpy.ndarray]] = [
            {} for _ in dataset.instances
        ]
        for name, output in list(outputs.items()):
            if isinstance(output, torch.autograd.Variable):
                output = output.data.cpu().numpy()
            outputs[name] = output
            for instance_output, batch_element in zip(
                    instance_separated_output, output):
                instance_output[name] = batch_element
        return instance_separated_output
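A minimal usage sketch for the method above. The names `model` and `dataset_reader` are assumptions standing in for any trained AllenNLP model exposing this method and a matching reader; the `text_to_instance` arguments depend on the concrete reader, and `cuda_device=-1` keeps everything on the CPU.

# Hypothetical usage: `model` and `dataset_reader` are placeholders, not part of
# the snippet above.  Each returned element is a dict of numpy arrays for one instance.
instances = [
    dataset_reader.text_to_instance("The cat sat on the mat ."),
    dataset_reader.text_to_instance("Dogs bark ."),
]
outputs = model.forward_on_instances(instances, cuda_device=-1)
for output_dict in outputs:
    for name, array in output_dict.items():
        print(name, array.shape)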
Example 2
 def _yield_one_epoch(self, dataset: Dataset, shuffle: bool):
     grouped_instances = self._create_batches(dataset, shuffle)
     for group in grouped_instances:
         batch = Dataset(group)
         padding_lengths = batch.get_padding_lengths()
         logger.debug("Batch padding lengths: %s", str(padding_lengths))
         logger.debug("Batch size: %d", len(batch.instances))
         yield batch.as_array_dict(padding_lengths, verbose=False)
Example 3
def token_to_elmo_id(token):
    indexer = ELMoTokenCharactersIndexer()  # assumed indexer; defined outside this snippet in the original
    tokens = [Token(token)]
    field = TextField(tokens, {'character_ids': indexer})
    instance = Instance({"elmo": field})
    instances = [instance]
    dataset = Dataset(instances)
    vocab = Vocabulary()
    for instance in dataset.instances:
        instance.index_fields(vocab)
    #dataset.index_instances(vocab) # replaced by above, so that there's no progress bar
    return dataset.as_tensor_dict()['elmo']['character_ids']
Example 4
 def _yield_one_epoch(self, dataset: Dataset, shuffle: bool,
                      cuda_device: int, for_training: bool):
     grouped_instances = self._create_batches(dataset, shuffle)
     for group in grouped_instances:
         batch = Dataset(group)
         padding_lengths = batch.get_padding_lengths()
         logger.debug("Batch padding lengths: %s", str(padding_lengths))
         logger.debug("Batch size: %d", len(batch.instances))
         yield batch.as_tensor_dict(padding_lengths,
                                    cuda_device=cuda_device,
                                    for_training=for_training)
Example 5
    def test_elmo_bilm(self):
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        options_file = os.path.join(FIXTURES, 'options.json')
        weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
        elmo_bilm = _ElmoBiLm(options_file, weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {'character_ids': indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        dataset = Dataset(instances)
        vocab = Vocabulary()
        dataset.index_instances(vocab)

        # Now finally we can iterate through batches.
        iterator = BasicIterator(3)
        for i, batch in enumerate(iterator(dataset, num_epochs=1, shuffle=False)):
            lm_embeddings = elmo_bilm(batch['elmo']['character_ids'])
            top_layer_embeddings, mask = remove_sentence_boundaries(
                    lm_embeddings['activations'][2],
                    lm_embeddings['mask']
            )

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                    len(sentence.split()) for sentence in batch_sentences
            ]
            self.assertEqual(lengths.tolist(), expected_lengths)

            # get the expected embeddings and compare!
            expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
            for k in range(3):
                self.assertTrue(
                        numpy.allclose(
                                top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                                expected_top_layer[k],
                                atol=1.0e-6
                        )
                )
Example 6
    def test_lazy_as_tensor_dict(self):
        lazy_dataset = self.get_lazy_dataset()
        lazy_dataset.index_instances(self.vocab)

        for _ in range(10):
            dataset = Dataset([instance for instance in lazy_dataset])
            padding_lengths = dataset.get_padding_lengths()
            tensors = dataset.as_tensor_dict(padding_lengths)
            text1 = tensors["text1"]["tokens"].data.cpu().numpy()
            text2 = tensors["text2"]["tokens"].data.cpu().numpy()

            numpy.testing.assert_array_almost_equal(
                text1, numpy.array([[2, 3, 4, 5, 6], [1, 3, 4, 5, 6]]))
            numpy.testing.assert_array_almost_equal(
                text2, numpy.array([[2, 3, 4, 1, 5, 6], [2, 3, 1, 0, 0, 0]]))
Example 7
    def _sentences_to_ids(self, sentences):
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for sentence in sentences:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({'elmo': field})
            instances.append(instance)

        dataset = Dataset(instances)
        vocab = Vocabulary()
        dataset.index_instances(vocab)
        return dataset.as_tensor_dict()['elmo']['character_ids']
Example 8
def batch_to_ids(batch):
    """
    Given a batch (as list of tokenized sentences), return a batch
    of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Dataset(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
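A quick usage sketch for `batch_to_ids`: the input is a list of already-tokenized sentences (lists of strings) and the result is a padded character-id tensor. The trailing dimension of 50 is the ELMo per-token character limit; the exact shapes below are illustrative.

# Sentences must be pre-tokenized; shorter sentences are zero-padded along the token axis.
sentences = [["First", "sentence", "."], ["Another", "one"]]
character_ids = batch_to_ids(sentences)
# Expected size: (batch_size, max_sentence_length, 50), i.e. (2, 3, 50) for this toy batch.
print(character_ids.size())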
Example 9
    def read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        instances = []
        ontonotes_reader = Ontonotes()
        for sentences in tqdm(ontonotes_reader.dataset_document_iterator(file_path)):
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append((start + total_tokens,
                                              end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)
            instance = self.text_to_instance([s.words for s in sentences], canonical_clusters)
            instances.append(instance)

        if not instances:
            raise ConfigurationError("No instances were read from the given filepath {}. "
                                     "Is the path correct?".format(file_path))
        return Dataset(instances)
Example 10
    def read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:

            instances = []
            logger.info("Reading instances from lines in file at: %s",
                        file_path)
            for line in tqdm.tqdm(data_file):
                line = line.strip("\n")

                # skip blank lines
                if not line:
                    continue

                tokens_and_tags = [
                    pair.rsplit(self._word_tag_delimiter, 1)
                    for pair in line.split(self._token_delimiter)
                ]
                # TextField requires ``Token`` objects, so wrap the raw strings.
                tokens = [Token(x[0]) for x in tokens_and_tags]
                tags = [x[1] for x in tokens_and_tags]

                sequence = TextField(tokens, self._token_indexers)
                sequence_tags = SequenceLabelField(tags, sequence)
                instances.append(
                    Instance({
                        'tokens': sequence,
                        'tags': sequence_tags
                    }))
        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path))
        return Dataset(instances)
Example 11
    def read_fnc(self, d):
        instances = []

        for s in tqdm.tqdm(d.stances):

            headline = s['Headline']
            bodyid = s['Body ID']
            actualBody = d.articles[bodyid]
            label = s['Stance']

            if label != "unrelated":

                if label == 'discuss':
                    new_label = "NOT ENOUGH INFO"
                elif label == 'agree':
                    new_label = "SUPPORTS"
                elif label == 'disagree':
                    new_label = "REFUTES"

                hypothesis = headline
                premise = actualBody
                instances.append(
                    self.text_to_instance(premise, hypothesis, new_label))

                # print(new_label)
                # print(premise)
                # print(hypothesis)
                # sys.exit(1)

        if not instances:
            raise ConfigurationError(
                "No instances were read from the given stance data. "
                "Is the data correct?")
        return Dataset(instances)
Example 12
 def _sort_dataset_by_padding(
     dataset: Dataset,
     sorting_keys: List[Tuple[str, str]],  # pylint: disable=invalid-sequence-index
     padding_noise: float = 0.0
 ) -> Dataset:
     """
     Sorts the ``Instances`` in this ``Dataset`` by their padding lengths, using the keys in
     ``sorting_keys`` (in the order in which they are provided).  ``sorting_keys`` is a list of
     ``(field_name, padding_key)`` tuples.
     """
     instances_with_lengths = []
     for instance in dataset.instances:
         padding_lengths = cast(Dict[str, Dict[str, float]],
                                instance.get_padding_lengths())
         if padding_noise > 0.0:
             noisy_lengths = {}
             for field_name, field_lengths in padding_lengths.items():
                 noisy_lengths[field_name] = add_noise_to_dict_values(
                     field_lengths, padding_noise)
             padding_lengths = noisy_lengths
         instance_with_lengths = ([
             padding_lengths[field_name][padding_key]
             for (field_name, padding_key) in sorting_keys
         ], instance)
         instances_with_lengths.append(instance_with_lengths)
     instances_with_lengths.sort(key=lambda x: x[0])
     return Dataset([
         instance_with_lengths[-1]
         for instance_with_lengths in instances_with_lengths
     ])
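A sketch of a typical call, assuming the instances carry a "tokens" TextField whose padding lengths include a "num_tokens" key; the field name, padding key, and `dataset` variable are illustrative.

# Sort by sentence length, with a little noise so batches are not perfectly ordered.
sorted_dataset = _sort_dataset_by_padding(
    dataset,
    sorting_keys=[("tokens", "num_tokens")],
    padding_noise=0.1,
)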
Example 13
 def setUp(self):
     token_indexer = SingleIdTokenIndexer("tokens")
     text_field = TextField(["a", "a", "a", "a", "b", "b", "c", "c", "c"],
                            {"tokens": token_indexer})
     self.instance = Instance({"text": text_field})
     self.dataset = Dataset([self.instance])
     super(TestVocabulary, self).setUp()
Example 14
    def read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        instances = []
        with open(file_path, 'r') as snli_file:
            logger.info("Reading instances from tsv/jsonl dataset at: %s",
                        file_path)
            for line in tqdm.tqdm(snli_file):
                if file_path.endswith(".jsonl"):
                    # SNLI format
                    example = json.loads(line)
                    label = example["gold_label"]
                    premise = example["sentence1"]
                    hypothesis = example["sentence2"]
                else:
                    # DGEM/TSV format
                    fields = line.split("\t")
                    premise = fields[0]
                    hypothesis = fields[1]
                    label = fields[2]
                if label == '-':
                    # ignore unknown examples
                    continue
                instances.append(
                    self.text_to_instance(premise, hypothesis, label))
        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path))
        return Dataset(instances)
Example 15
    def read(self, file_path) -> Dataset:
        """
        Read data from the `file_path` and return a :class:`Dataset`.
        """
        # set trackers
        et = HCNEntityTracker()
        at = HCNActionTracker(et, file_path)

        action_templates = at.action_templates

        # get dialogs from file
        logger.info("Reading instances from lines in file at: {}".format(file_path))
        dialogs, dialog_indices = util.read_dialogs(file_path, with_indices=True)
        with open('out/dialog_indices.json', 'w') as f:
            json.dump(dialog_indices, f)

        # get utterances
        utterances = util.get_utterances(file_path, dialogs)
        # get responses
        responses = util.get_responses(file_path, dialogs)
        responses = [self.get_template_id(response, et, action_templates) for response in responses]

        instances = []
        for u, r in zip(utterances, responses):
            instances.append(self.text_to_instance(action_templates, u, r))

        if not instances:
            raise ConfigurationError("No instances read!")

        return Dataset(instances)
Example 16
    def read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        instances = []
        with open(file_path, 'r') as snli_file:
            logger.info("Reading SNLI instances from jsonl dataset at: %s",
                        file_path)
            for line in tqdm.tqdm(snli_file):
                example = json.loads(line)

                label = example["gold_label"]
                if label == '-':
                    # These were cases where the annotators disagreed; we'll just skip them.  It's
                    # like 800 out of 500k examples in the training data.
                    continue

                premise = example["sentence1"]
                hypothesis = example["sentence2"]
                instances.append(
                    self.text_to_instance(premise, hypothesis, label))
        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path))
        return Dataset(instances)
Example 17
    def read(self, file_path: str):
        logger.info(
            "Reading FrameNet full text instances from dataset files at: %s",
            file_path)

        instances = []
        # prev_len = len(instances)
        for root, _, directory in tqdm.tqdm(list(os.walk(file_path))):
            for data_file in sorted(directory):
                if not data_file.endswith(".xml"):
                    continue
                instances.extend(
                    self.read_single_fulltext_file(
                        os.path.join(root, data_file)))
                # logger.info("%s: # instances = %d", data_file, len(instances) - prev_len)
                # prev_len = len(instances)
        logger.info("# instances = %d", len(instances))
        logger.info("# sentences = %d", self._num_sents)
        logger.info("# valid sentences = %d", self._valid_sents)
        logger.info("# avg tokens in sentence = %f",
                    self._total_sentence_length / self._valid_sents)
        logger.info("# discontinuous targets = %d",
                    self._discontinuous_targets)
        logger.info("%% adjacent spans with same label = %f (%d/%d)",
                    self._adjacent_labeled_args / self._total_labeled_args,
                    self._adjacent_labeled_args, self._total_labeled_args)
        self._reset()
        return Dataset(instances)
Example 18
    def read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        logger.info("Reading file at %s", file_path)
        instances = []
        with open(file_path) as dataset_file:
            document_state = _DocumentState()

            for line in dataset_file:

                if self._begin_document_regex.match(line):
                    # We're beginning a document. Refresh the state.
                    document_state = _DocumentState()

                elif line.startswith("#end document"):
                    # We've finished a document.
                    document_state.assert_document_is_finished()
                    clusters = document_state.canonicalize_clusters()
                    instance = self.text_to_instance(document_state.sentences, clusters)
                    instances.append(instance)
                else:
                    # Process a line.
                    self._handle_line(line, document_state)

        if not instances:
            raise ConfigurationError("No instances were read from the given filepath {}. "
                                     "Is the path correct?".format(file_path))
        return Dataset(instances)
Example 19
    def read(self, file_path: str):

        instances = []

        ds = FEVERDataSet(file_path,
                          reader=self.reader,
                          formatter=self.formatter)
        ds.read()

        for instance in tqdm.tqdm(ds.data):
            if instance is None:
                continue

            if not self._sentence_level:
                pages = set(ev[0] for ev in instance["evidence"])
                premise = " ".join([self.db.get_doc_text(p) for p in pages])
            else:
                lines = set([
                    self.get_doc_line(d[0], d[1]) for d in instance['evidence']
                ])
                premise = " ".join(lines)

            if len(premise.strip()) == 0:
                premise = ""

            hypothesis = instance["claim"]
            label = instance["label_text"]
            instances.append(self.text_to_instance(premise, hypothesis, label))
        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path))
        return Dataset(instances)
Example 20
    def read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        instances = []

        # open data file and read lines
        with open(file_path, 'r') as ontm_file:
            logger.info(
                "Reading ontology matching instances from jsonl dataset at: %s",
                file_path)
            for line in tqdm.tqdm(ontm_file):
                training_pair = json.loads(line)
                s_ent = training_pair['source_ent']
                t_ent = training_pair['target_ent']
                label = training_pair['label']

                # convert entry to instance and append to instances
                instances.append(self.text_to_instance(s_ent, t_ent, label))

        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path))
        return Dataset(instances)
Example 21
    def read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        instances = []
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)

        for sentence in ontonotes_reader.dataset_iterator(file_path):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                instances.append(
                    self.text_to_instance(tokens, verb_label, tags))
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    instances.append(
                        self.text_to_instance(tokens, verb_indicator, tags))

        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path))
        return Dataset(instances)
Example 22
    def read(self, file_path: str):
        instances = []
        with open(file_path, 'r') as snli_file:
            logger.info("Reading SNLI instances from jsonl dataset at: %s",
                        file_path)
            for line in tqdm.tqdm(snli_file):
                example = json.loads(line)

                label = example["gold_label"]
                if label == '-':
                    # These were cases where the annotators disagreed; we'll just skip them.  It's
                    # like 800 out of 500k examples in the training data.
                    continue
                label_field = LabelField(label)

                premise = example["sentence1"]
                premise_field = TextField(self._tokenizer.tokenize(premise),
                                          self._token_indexers)
                hypothesis = example["sentence2"]
                hypothesis_field = TextField(
                    self._tokenizer.tokenize(hypothesis), self._token_indexers)
                instances.append(
                    Instance({
                        'label': label_field,
                        'premise': premise_field,
                        'hypothesis': hypothesis_field
                    }))
        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path))
        return Dataset(instances)
Example 23
    def read(self, file_path: str):
        with open(file_path, "r") as text_file:
            instance_strings = text_file.readlines()
        if self._tokens_per_instance is not None:
            all_text = " ".join(
                [x.replace("\n", " ").strip() for x in instance_strings])
            tokenized_text = self._tokenizer.tokenize(all_text)
            num_tokens = self._tokens_per_instance
            tokenized_strings = []
            logger.info("Creating dataset from all text in file: %s",
                        file_path)
            for index in tqdm.tqdm(
                    range(0,
                          len(tokenized_text) - num_tokens, num_tokens)):
                tokenized_strings.append(tokenized_text[index:index +
                                                        num_tokens])
        else:
            tokenized_strings = [
                self._tokenizer.tokenize(s) for s in instance_strings
            ]

        # TODO(matt): this isn't quite right, because you really want to split on sentences,
        # tokenize the sentences, add the start and end tokens per sentence, then change the tokens
        # per instance if desired.  But, we can fix that later, if someone actually wants to use
        # this for language modeling.  This is just another example of how to use the data reader
        # code, for now.
        tokenized_strings = [[self._start_token] + x + [self._end_token]
                             for x in tokenized_strings]

        # No matter how you want to represent the input, we'll always represent the output as a
        # single token id.  This code lets you learn a language model that concatenates word
        # embeddings with character-level encoders, in order to predict the word token that comes
        # next.
        output_indexer = None  # type: Dict[str, TokenIndexer]
        for name, indexer in self._token_indexers.items():
            if isinstance(indexer, SingleIdTokenIndexer):
                output_indexer = {name: indexer}
                break
        else:
            output_indexer = {"tokens": SingleIdTokenIndexer()}

        instances = []
        for tokenized_string in tokenized_strings:
            input_field = TextField(tokenized_string[:-1],
                                    self._token_indexers)
            output_field = TextField(tokenized_string[1:], output_indexer)
            instances.append(
                Instance({
                    'input_tokens': input_field,
                    'output_tokens': output_field
                }))

        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path))
        return Dataset(instances)
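The `[:-1]` / `[1:]` split in the loop above is the usual next-token language-modeling setup; a tiny illustration with plain strings (the real code wraps these in `TextField`s, and the start/end tokens are whatever the reader was configured with):

tokenized_string = ["<S>", "the", "cat", "sat", "</S>"]
input_tokens = tokenized_string[:-1]   # ["<S>", "the", "cat", "sat"]
output_tokens = tokenized_string[1:]   # ["the", "cat", "sat", "</S>"]  -- shifted by one position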
Example 24
    def read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        instances = []

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s",
                        file_path)

            # Group into alternative divider / sentence chunks.
            for is_divider, lines in tqdm.tqdm(
                    itertools.groupby(data_file, _is_divider)):
                # Ignore the divider chunks, so that `lines` corresponds to the words
                # of a single sentence.
                if not is_divider:
                    fields = [line.strip().split() for line in lines]
                    # unzipping trick returns tuples, but our Fields need lists
                    tokens, pos_tags, chunk_tags, ner_tags = [
                        list(field) for field in zip(*fields)
                    ]
                    # TextField requires ``Token`` objects
                    tokens = [Token(token) for token in tokens]
                    sequence = TextField(tokens, self._token_indexers)

                    instance_fields = {'tokens': sequence}

                    # Add "feature labels" to instance
                    if 'pos' in self.feature_labels:
                        instance_fields['pos_tags'] = SequenceLabelField(
                            pos_tags, sequence, "pos_tags")
                    if 'chunk' in self.feature_labels:
                        instance_fields['chunk_tags'] = SequenceLabelField(
                            chunk_tags, sequence, "chunk_tags")
                    if 'ner' in self.feature_labels:
                        instance_fields['ner_tags'] = SequenceLabelField(
                            ner_tags, sequence, "ner_tags")

                    # Add "tag label" to instance
                    if self.tag_label == 'ner':
                        instance_fields['tags'] = SequenceLabelField(
                            ner_tags, sequence)
                    elif self.tag_label == 'pos':
                        instance_fields['tags'] = SequenceLabelField(
                            pos_tags, sequence)
                    elif self.tag_label == 'chunk':
                        instance_fields['tags'] = SequenceLabelField(
                            chunk_tags, sequence)

                    instances.append(Instance(instance_fields))

        if not instances:
            raise ConfigurationError(
                "reading {} resulted in an empty Dataset".format(file_path))

        return Dataset(instances)
Example 25
    def read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as text_file:
            instance_strings = text_file.readlines()

        if self._tokens_per_instance is not None:
            all_text = " ".join(
                [x.replace("\n", " ").strip() for x in instance_strings])
            tokenized_text, _ = self._tokenizer.tokenize(all_text)
            num_tokens = self._tokens_per_instance + 1
            tokenized_strings = []
            logger.info("Creating dataset from all text in file: %s",
                        file_path)
            for index in tqdm.tqdm(
                    range(0,
                          len(tokenized_text) - num_tokens, num_tokens - 1)):
                tokenized_strings.append(tokenized_text[index:(index +
                                                               num_tokens)])
        else:
            tokenized_strings = [
                self._tokenizer.tokenize(s)[0] for s in instance_strings
            ]

        # No matter how you want to represent the input, we'll always represent the output as a
        # single token id.  This code lets you learn a language model that concatenates word
        # embeddings with character-level encoders, in order to predict the word token that comes
        # next.
        output_indexer = None  # type: Dict[str, TokenIndexer]
        for name, indexer in self._token_indexers.items():
            if isinstance(indexer, SingleIdTokenIndexer):
                output_indexer = {name: indexer}
                break
        else:
            output_indexer = {"tokens": SingleIdTokenIndexer()}

        instances = []
        for tokenized_string in tokenized_strings:
            input_field = TextField(tokenized_string[:-1],
                                    self._token_indexers)
            output_field = TextField(tokenized_string[1:], output_indexer)
            instances.append(
                Instance({
                    'input_tokens': input_field,
                    'output_tokens': output_field
                }))

        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path))
        return Dataset(instances)
Example 26
 def _create_batches(self, dataset: Dataset,
                     shuffle: bool) -> Iterable[Dataset]:
     instances = dataset.instances
     if shuffle:
         random.shuffle(instances)
     grouped_instances = group_by_count(instances, self._batch_size, None)
     # The last group might have not been full, so we check if any of the instances
     # are None, which is how group_by_count pads non-complete batches.
     grouped_instances[-1] = [
         instance for instance in grouped_instances[-1]
         if instance is not None
     ]
     return (Dataset(batch) for batch in grouped_instances)
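To make the comment about the last group concrete, here is a small sketch of the grouping behaviour the code relies on, assuming `group_by_count` pads an incomplete final group with the supplied default value as the comment above describes.

# With 5 instances and a batch size of 2, the last group is padded with None:
groups = group_by_count(list(range(5)), 2, None)   # [[0, 1], [2, 3], [4, None]]
groups[-1] = [x for x in groups[-1] if x is not None]
assert groups == [[0, 1], [2, 3], [4]]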
Example 27
    def read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        instances = []
        with open(file_path, 'r') as snli_file:
            logger.info("Reading JSONline instances from jsonl dataset at: %s", file_path)
            for line in tqdm.tqdm(snli_file):
                example = json.loads(line)
                input = example[self._input]
                label = str(example[self._gold_label])
                instances.append(self.text_to_instance(input, label))
        if not instances:
            raise ConfigurationError("No instances were read from the given filepath {}. "
                                     "Is the path correct?".format(file_path))
        return Dataset(instances)
Example 28
    def read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        logger.info("Reading file at %s", file_path)
        with open(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json['data']
        logger.info("Reading the dataset")
        instances = []
        for article in tqdm(dataset):
            for paragraph_json in article['paragraphs']:
                paragraph = paragraph_json["context"]
                tokenized_paragraph = self._tokenizer.tokenize(paragraph)

                for question_answer in paragraph_json['qas']:
                    question_text = question_answer["question"].strip().replace("\n", "")
                    question_id = question_answer['id'].strip()

                    # There may be multiple answer annotations, so we pick the one that occurs the
                    # most.  This only matters on the SQuAD dev set, and it means our computed
                    # metrics ("start_acc", "end_acc", and "span_acc") aren't quite the same as the
                    # official metrics, which look at all of the annotations.  This is why we have
                    # a separate official SQuAD metric calculation (the "em" and "f1" metrics use
                    # the official script).
                    candidate_answers: Counter = Counter()
                    for answer in question_answer["answers"]:
                        candidate_answers[(answer["answer_start"],
                                           answer["text"])] += 1
                    answer_texts = [
                        answer['text'] for answer in question_answer['answers']
                    ]
                    char_span_start, answer_text = candidate_answers.most_common(
                        1)[0][0]

                    instance = self.text_to_instance(question_text, paragraph,
                                                     question_id, answer_text,
                                                     char_span_start,
                                                     tokenized_paragraph,
                                                     answer_texts)
                    instances.append(instance)
        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path))
        return Dataset(instances)
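The answer selection described in the comment boils down to `collections.Counter.most_common`; a standalone illustration with made-up annotations:

from collections import Counter

# Two annotators picked the same span, one picked a different one.
candidate_answers = Counter()
for start, text in [(10, "the cat"), (10, "the cat"), (42, "a cat")]:
    candidate_answers[(start, text)] += 1
char_span_start, answer_text = candidate_answers.most_common(1)[0][0]
assert (char_span_start, answer_text) == (10, "the cat")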
Example 29
 def read(self, file_path):
     instances = []
     with open(cached_path(file_path), "r") as data_file:
         logger.info("Reading instances from lines in file at: %s",
                     file_path)
         for line_num, line in enumerate(tqdm.tqdm(data_file.readlines())):
             line = line.strip("\n")
             if not line:
                 continue
             paper_json = json.loads(line)
             title = paper_json['title']
             abstract = paper_json['paperAbstract']
             venue = paper_json['venue']
             instances.append(self.text_to_instance(title, abstract, venue))
     if not instances:
         raise ConfigurationError("No instances read!")
     return Dataset(instances)
Example 30
 def read(self, file_path):
     instances = []
     with open(cached_path(file_path), "r") as data_file:
         logger.info("Reading instances from lines in file at: %s",
                     file_path)
         for line_num, line in enumerate(tqdm.tqdm(data_file.readlines())):
             line = line.strip("\n")
             if not line:
                 continue
             line = line.split("@@@")
             pivot_phrase = line[0]
             context_word = line[1]
             instances.append(
                 self.text_to_instance(pivot_phrase, context_word))
     if not instances:
         raise ConfigurationError("No instances read!")
     return Dataset(instances)