Example #1
    def add_file_to_archive(self, name: str) -> None:
        """
        Any class in its ``from_params`` method can request that some of its
        input files be added to the archive by calling this method.

        For example, if some class ``A`` had an ``input_file`` parameter, it could call

        ```
        params.add_file_to_archive("input_file")
        ```

        which would store the supplied value for ``input_file`` at the key
        ``previous.history.and.then.input_file``. The ``files_to_archive`` dict
        is shared with child instances via the ``_check_is_dict`` method, so that
        the final mapping can be retrieved from the top-level ``Params`` object.

        NOTE: You must call ``add_file_to_archive`` before you ``pop()``
        the parameter, because the ``Params`` instance looks up the value
        of the filename inside itself.

        If the ``loading_from_archive`` flag is True, this will be a no-op.
        """
        if not self.loading_from_archive:
            self.files_to_archive[f"{self.history}{name}"] = cached_path(
                self.get(name))
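
A minimal usage sketch of the API described in the docstring above. The class and the ``input_file`` parameter are hypothetical; only ``Params.add_file_to_archive`` and ``Params.pop`` from the example are assumed.

    from allennlp.common import Params  # assumed import path

    class InputFileConsumer:  # hypothetical class, for illustration only
        def __init__(self, input_file: str) -> None:
            self.input_file = input_file

        @classmethod
        def from_params(cls, params: Params) -> 'InputFileConsumer':
            # Register the file *before* popping it: ``add_file_to_archive``
            # looks the filename up inside the ``Params`` object itself.
            params.add_file_to_archive("input_file")
            return cls(input_file=params.pop("input_file"))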
Example #2
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        logger.info("Reading file at %s", file_path)
        with open(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json['data']
        logger.info("Reading the dataset")
        for article in dataset:
            for paragraph_json in article['paragraphs']:
                paragraph = paragraph_json["context"]
                tokenized_paragraph = self._tokenizer.tokenize(paragraph)

                for question_answer in paragraph_json['qas']:
                    question_text = question_answer["question"].strip().replace("\n", "")
                    answer_texts = [
                        answer['text'] for answer in question_answer['answers']
                    ]
                    span_starts = [
                        answer['answer_start']
                        for answer in question_answer['answers']
                    ]
                    span_ends = [
                        start + len(answer)
                        for start, answer in zip(span_starts, answer_texts)
                    ]
                    instance = self.text_to_instance(
                        question_text, paragraph, zip(span_starts, span_ends),
                        answer_texts, tokenized_paragraph)
                    yield instance
Example #3
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as text_file:
            instance_strings = text_file.readlines()

        if self._tokens_per_instance is not None:
            all_text = " ".join(
                [x.replace("\n", " ").strip() for x in instance_strings])
            tokenized_text = self._tokenizer.tokenize(all_text)
            num_tokens = self._tokens_per_instance + 1
            tokenized_strings = []
            logger.info("Creating dataset from all text in file: %s",
                        file_path)
            for index in Tqdm.tqdm(
                    range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
                tokenized_strings.append(tokenized_text[index:(index + num_tokens)])
        else:
            tokenized_strings = [
                self._tokenizer.tokenize(s) for s in instance_strings
            ]

        for tokenized_string in tokenized_strings:
            input_field = TextField(tokenized_string[:-1],
                                    self._token_indexers)
            output_field = TextField(tokenized_string[1:],
                                     self._output_indexer)
            yield Instance({
                'input_tokens': input_field,
                'output_tokens': output_field
            })
Example #4
    def _load_cnn_weights(self):
        cnn_options = self._options['char_cnn']
        filters = cnn_options['filters']
        char_embed_dim = cnn_options['embedding']['dim']

        convolutions = []
        for i, (width, num) in enumerate(filters):
            conv = torch.nn.Conv1d(in_channels=char_embed_dim,
                                   out_channels=num,
                                   kernel_size=width,
                                   bias=True)
            # load the weights
            with h5py.File(cached_path(self._weight_file), 'r') as fin:
                weight = fin['CNN']['W_cnn_{}'.format(i)][...]
                bias = fin['CNN']['b_cnn_{}'.format(i)][...]

            w_reshaped = numpy.transpose(weight.squeeze(axis=0),
                                         axes=(2, 1, 0))
            if w_reshaped.shape != tuple(conv.weight.data.shape):
                raise ValueError("Invalid weight file")
            conv.weight.data.copy_(torch.FloatTensor(w_reshaped))
            conv.bias.data.copy_(torch.FloatTensor(bias))

            conv.weight.requires_grad = self.requires_grad
            conv.bias.requires_grad = self.requires_grad

            convolutions.append(conv)
            self.add_module('char_conv_{}'.format(i), conv)

        self._convolutions = convolutions
Example #5
    def __init__(self,
                 file_uri: str,
                 encoding: str = DEFAULT_ENCODING,
                 cache_dir: str = None) -> None:

        self.uri = file_uri
        self._encoding = encoding
        self._cache_dir = cache_dir
        self._archive_handle: Any = None  # only if the file is inside an archive

        main_file_uri, path_inside_archive = parse_embeddings_file_uri(
            file_uri)
        main_file_local_path = cached_path(main_file_uri, cache_dir=cache_dir)

        if zipfile.is_zipfile(main_file_local_path):  # ZIP archive
            self._open_inside_zip(main_file_uri, path_inside_archive)

        elif tarfile.is_tarfile(main_file_local_path):  # TAR archive
            self._open_inside_tar(main_file_uri, path_inside_archive)

        else:  # all the other supported formats, including uncompressed files
            if path_inside_archive:
                raise ValueError('Unsupported archive format: %s' %
                                 main_file_uri)

            # All the python packages for compressed files share the same interface of io.open
            extension = get_file_extension(main_file_uri)
            package = {
                '.txt': io,
                '.vec': io,
                '.gz': gzip,
                '.bz2': bz2,
                '.lzma': lzma,
            }.get(extension, None)

            if package is None:
                logger.warning(
                    'The embeddings file has an unknown file extension "%s". '
                    'We will assume the file is an (uncompressed) text file',
                    extension)
                package = io

            self._handle = package.open(main_file_local_path,
                                        'rt',
                                        encoding=encoding)  # type: ignore

        # To use this with tqdm we'd like to know the number of tokens. It's possible that the
        # first line of the embeddings file contains this: if it does, we want to start iteration
        # from the 2nd line, otherwise we want to start from the 1st.
        # Unfortunately, once we read the first line, we cannot move back the file iterator
        # because the underlying file may be "not seekable"; we use itertools.chain instead.
        first_line = next(self._handle)  # this moves the iterator forward
        self.num_tokens = EmbeddingsTextFile._get_num_tokens_from_first_line(
            first_line)
        if self.num_tokens:
            # the first line is a header line: start iterating from the 2nd line
            self._iterator = self._handle
        else:
            # the first line is not a header line: start iterating from the 1st line
            self._iterator = itertools.chain([first_line], self._handle)
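
The comment above describes the key constraint: once the first line has been consumed, a non-seekable handle cannot be rewound, so the consumed line is chained back in front of the iterator. Here is a standalone sketch of that pattern; the header check (one or two integer fields, largest taken as the token count) is an assumption modeled on common embedding-file headers.

    import itertools
    from typing import Iterator, Optional, Tuple

    def peek_embeddings_header(lines: Iterator[str]) -> Tuple[Optional[int], Iterator[str]]:
        first_line = next(lines)  # consumes the first line of a possibly non-seekable stream
        fields = first_line.split(' ')
        if 1 <= len(fields) <= 2 and all(field.strip().isdigit() for field in fields):
            # header line: start iterating from the 2nd line
            return max(int(field) for field in fields), lines
        # not a header: put the consumed line back without seeking
        return None, itertools.chain([first_line], lines)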
Example #6
    def _read(self, file_path: str) -> Iterable[Instance]:
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s",
                        file_path)

            # Group into alternative divider / sentence chunks.
            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                # Ignore the divider chunks, so that `lines` corresponds to the words
                # of a single sentence.
                if not is_divider:
                    fields = [line.strip().split() for line in lines]
                    # unzipping trick returns tuples, but our Fields need lists
                    fields = [list(field) for field in zip(*fields)]
                    if self.ignore_ner_tags:
                        tokens_, pos_tags, chunk_tags = fields[:3]
                        ner_tags = None
                    else:
                        tokens_, pos_tags, chunk_tags, ner_tags = fields
                    # TextField requires ``Token`` objects
                    tokens = [Token(token) for token in tokens_]

                    yield self.text_to_instance(tokens, pos_tags, chunk_tags,
                                                ner_tags)
Example #7
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, 'r') as conllu_file:
            logger.info("Reading UD instances from conllu dataset at: %s",
                        file_path)

            for annotation in lazy_parse(conllu_file.read()):
                # CoNLLU annotations sometimes add back in words that have been elided
                # in the original sentence; we remove these, as we're just predicting
                # dependencies for the original sentence.
                # We filter by None here as elided words have a non-integer word id,
                # and are replaced with None by the conllu python library.
                annotation = [x for x in annotation if x["id"] is not None]

                heads = [x["head"] for x in annotation]
                tags = [x["deprel"] for x in annotation]
                words = [x["form"] for x in annotation]
                if self.use_language_specific_pos:
                    pos_tags = [x["xpostag"] for x in annotation]
                else:
                    pos_tags = [x["upostag"] for x in annotation]
                yield self.text_to_instance(words, pos_tags,
                                            list(zip(tags, heads)))
Example #8
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    yield self.text_to_instance(tokens, verb_indicator, tags)
Example #9
    def load_weights(self, weight_file: str) -> None:
        """
        Load the pre-trained weights from the file.
        """
        requires_grad = self.requires_grad

        with h5py.File(cached_path(weight_file), 'r') as fin:
            for i_layer, lstms in enumerate(
                    zip(self.forward_layers, self.backward_layers)
            ):
                for j_direction, lstm in enumerate(lstms):
                    # lstm is an instance of LSTMCellWithProjection
                    cell_size = lstm.cell_size

                    dataset = fin['RNN_%s' % j_direction]['RNN'][
                        'MultiRNNCell']['Cell%s' % i_layer]['LSTMCell']

                    # tensorflow packs together both W and U matrices into one matrix,
                    # but pytorch maintains individual matrices.  In addition, tensorflow
                    # packs the gates as input, memory, forget, output but pytorch
                    # uses input, forget, memory, output.  So we need to modify the weights.
                    tf_weights = numpy.transpose(dataset['W_0'][...])
                    torch_weights = tf_weights.copy()

                    # split the W from U matrices
                    input_size = lstm.input_size
                    input_weights = torch_weights[:, :input_size]
                    recurrent_weights = torch_weights[:, input_size:]
                    tf_input_weights = tf_weights[:, :input_size]
                    tf_recurrent_weights = tf_weights[:, input_size:]

                    # handle the different gate order convention
                    for torch_w, tf_w in [[input_weights, tf_input_weights],
                                          [recurrent_weights, tf_recurrent_weights]]:
                        torch_w[(1 * cell_size):(2 * cell_size), :] = tf_w[(2 * cell_size):(3 * cell_size), :]
                        torch_w[(2 * cell_size):(3 * cell_size), :] = tf_w[(1 * cell_size):(2 * cell_size), :]

                    lstm.input_linearity.weight.data.copy_(torch.FloatTensor(input_weights))
                    lstm.state_linearity.weight.data.copy_(torch.FloatTensor(recurrent_weights))
                    lstm.input_linearity.weight.requires_grad = requires_grad
                    lstm.state_linearity.weight.requires_grad = requires_grad

                    # the bias weights
                    tf_bias = dataset['B'][...]
                    # tensorflow adds 1.0 to forget gate bias instead of modifying the
                    # parameters...
                    tf_bias[(2 * cell_size):(3 * cell_size)] += 1
                    torch_bias = tf_bias.copy()
                    torch_bias[(1 * cell_size):(2 * cell_size)] = tf_bias[(2 * cell_size):(3 * cell_size)]
                    torch_bias[(2 * cell_size):(3 * cell_size)] = tf_bias[(1 * cell_size):(2 * cell_size)]
                    lstm.state_linearity.bias.data.copy_(torch.FloatTensor(torch_bias))
                    lstm.state_linearity.bias.requires_grad = requires_grad

                    # the projection weights
                    proj_weights = numpy.transpose(dataset['W_P_0'][...])
                    lstm.state_projection.weight.data.copy_(torch.FloatTensor(proj_weights))
                    lstm.state_projection.weight.requires_grad = requires_grad
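
The gate-order comment in the middle of this function is the crux of the conversion, so here is a small standalone numpy sketch of the same block swap. The function name is illustrative and not part of the module.

    import numpy

    def swap_memory_and_forget_blocks(packed: numpy.ndarray, cell_size: int) -> numpy.ndarray:
        """Reorder packed gate blocks from (input, memory, forget, output)
        to (input, forget, memory, output) along the first axis."""
        reordered = packed.copy()
        reordered[1 * cell_size:2 * cell_size] = packed[2 * cell_size:3 * cell_size]
        reordered[2 * cell_size:3 * cell_size] = packed[1 * cell_size:2 * cell_size]
        return reordered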
Example #10
 def _read(self, file_path):
     logger.info("Reading instances from lines in file at: %s", file_path)
     with open(cached_path(file_path), "r") as data_file:
         tsv_in = csv.reader(data_file, delimiter='\t')
         for row in tsv_in:
             if len(row) == 4:
                 yield self.text_to_instance(premise=row[1],
                                             hypothesis=row[2],
                                             label=row[0])
Example #11
    def _load_char_embedding(self):
        with h5py.File(cached_path(self._weight_file), 'r') as fin:
            char_embed_weights = fin['char_embed'][...]

        weights = numpy.zeros(
            (char_embed_weights.shape[0] + 1, char_embed_weights.shape[1]),
            dtype='float32')
        weights[1:, :] = char_embed_weights

        self._char_embedding_weights = torch.nn.Parameter(
            torch.FloatTensor(weights), requires_grad=self.requires_grad)
Example #12
    def _execute_logical_form_on_table(logical_form: str, table: str):
        """
        The parameters are written out to files which the jar file reads and then executes the
        logical form.
        """
        logical_form_filename = os.path.join(SEMPRE_DIR, 'logical_forms.txt')
        with open(logical_form_filename, 'w') as temp_file:
            temp_file.write(logical_form + '\n')

        table_dir = os.path.join(SEMPRE_DIR, 'tsv/')
        os.makedirs(table_dir, exist_ok=True)
        # The .tsv file extension is important here since the table string parameter is in tsv format.
        # If this file was named with suffix .csv then Sempre would interpret it as comma separated
        # and return the wrong denotation.
        table_filename = 'context.tsv'
        with open(os.path.join(table_dir, table_filename),
                  'w',
                  encoding='utf-8') as temp_file:
            temp_file.write(table)

        # The id, target, and utterance are ignored; we just need to get the
        # table filename into sempre's lisp format.
        test_record = (
            '(example (id nt-0) (utterance none) (context (graph tables.TableKnowledgeGraph %s))'
            '(targetValue (list (description "6"))))' % (table_filename))
        test_data_filename = os.path.join(SEMPRE_DIR, 'data.examples')
        with open(test_data_filename, 'w') as temp_file:
            temp_file.write(test_record)

        # TODO(matt): The jar that we have isn't optimal for this use case - we're using a
        # script designed for computing accuracy, and just pulling out a piece of it. Writing
        # a new entry point to the jar that's tailored for this use would be cleaner.
        command = ' '.join([
            'java', '-jar',
            cached_path(DEFAULT_EXECUTOR_JAR), test_data_filename,
            logical_form_filename, table_dir
        ])
        run(command, shell=True)

        denotations_file = os.path.join(SEMPRE_DIR,
                                        'logical_forms_denotations.tsv')
        with open(denotations_file) as temp_file:
            line = temp_file.readline().split('\t')

        # Clean up all the temp files generated from this function.
        # Take care to not remove the auxiliary sempre files
        os.remove(logical_form_filename)
        shutil.rmtree(table_dir)
        os.remove(denotations_file)
        os.remove(test_data_filename)
        return line[1] if len(line) > 1 else line[0]
Example #13
 def _open_inside_zip(self,
                      archive_path: str,
                      member_path: Optional[str] = None) -> None:
     cached_archive_path = cached_path(archive_path,
                                       cache_dir=self._cache_dir)
     archive = zipfile.ZipFile(cached_archive_path, 'r')
     if member_path is None:
         members_list = archive.namelist()
         member_path = self._get_the_only_file_in_the_archive(
             members_list, archive_path)
     member_path = cast(str, member_path)
     member_file = archive.open(member_path, 'r')
     self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
     self._archive_handle = archive
Example #14
    def test_cached_path(self):
        url = 'http://fake.datastore.com/glove.txt.gz'
        set_up_glove(url, self.glove_bytes)

        # non-existent file
        with pytest.raises(FileNotFoundError):
            filename = cached_path(self.FIXTURES_ROOT / "does_not_exist" /
                                   "fake_file.tar.gz")

        # unparsable URI
        with pytest.raises(ValueError):
            filename = cached_path("fakescheme://path/to/fake/file.tar.gz")

        # existing file as path
        assert cached_path(self.glove_file) == str(self.glove_file)

        # caches urls
        filename = cached_path(url, cache_dir=self.TEST_DIR)

        assert len(responses.calls) == 2
        assert filename == os.path.join(self.TEST_DIR, url_to_filename(url, etag="0"))

        with open(filename, 'rb') as cached_file:
            assert cached_file.read() == self.glove_bytes
Example #15
    def from_file(params_file: str, params_overrides: str = "") -> 'Params':
        """
        Load a `Params` object from a configuration file.
        """
        # redirect to cache, if necessary
        params_file = cached_path(params_file)
        ext_vars = dict(os.environ)

        file_dict = json.loads(evaluate_file(params_file, ext_vars=ext_vars))

        overrides_dict = parse_overrides(params_overrides)
        param_dict = with_fallback(preferred=overrides_dict,
                                   fallback=file_dict)

        return Params(param_dict)
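
A minimal usage sketch of ``from_file``; the file name and override string below are hypothetical, and the override is expected to win over the value in the file because ``with_fallback`` prefers the overrides dict.

    from allennlp.common import Params  # assumed import path

    params = Params.from_file("experiment.jsonnet",  # hypothetical config file
                              params_overrides='{"trainer": {"num_epochs": 1}}')
    num_epochs = params["trainer"]["num_epochs"]  # -> 1, taken from the override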
Example #16
 def _open_inside_tar(self,
                      archive_path: str,
                      member_path: Optional[str] = None) -> None:
     cached_archive_path = cached_path(archive_path,
                                       cache_dir=self._cache_dir)
     archive = tarfile.open(cached_archive_path, 'r')
     if member_path is None:
         members_list = archive.getnames()
         member_path = self._get_the_only_file_in_the_archive(
             members_list, archive_path)
     member_path = cast(str, member_path)
     member = archive.getmember(
         member_path)  # raises exception if not present
     member_file = cast(IO[bytes], archive.extractfile(member))
     self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
     self._archive_handle = archive
Example #17
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path) as atis_file:
            logger.info("Reading ATIS instances from dataset at : %s", file_path)
            for line in _lazy_parse(atis_file.read()):
                utterances = []
                for current_interaction in line['interaction']:
                    if not current_interaction['utterance']:
                        continue
                    utterances.append(current_interaction['utterance'])
                    instance = self.text_to_instance(utterances, current_interaction['sql'])
                    if not instance:
                        continue
                    yield instance
Example #18
    def _load_projection(self):
        cnn_options = self._options['char_cnn']
        filters = cnn_options['filters']
        n_filters = sum(f[1] for f in filters)

        self._projection = torch.nn.Linear(n_filters,
                                           self.output_dim,
                                           bias=True)
        with h5py.File(cached_path(self._weight_file), 'r') as fin:
            weight = fin['CNN_proj']['W_proj'][...]
            bias = fin['CNN_proj']['b_proj'][...]
            self._projection.weight.data.copy_(
                torch.FloatTensor(numpy.transpose(weight)))
            self._projection.bias.data.copy_(torch.FloatTensor(bias))

            self._projection.weight.requires_grad = self.requires_grad
            self._projection.bias.requires_grad = self.requires_grad
Example #19
    def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        directory, filename = os.path.split(file_path)
        logger.info("Reading instances from lines in file at: %s", file_path)
        for parse in BracketParseCorpusReader(root=directory,
                                              fileids=[filename]).parsed_sents():

            self._strip_functional_tags(parse)
            # The "VROOT" node is unneeded and clutters the label space.
            # All the trees also contain a root S node.
            if parse.label() == "VROOT":
                parse = parse[0]
            pos_tags = [x[1]
                        for x in parse.pos()] if self._use_pos_tags else None
            yield self.text_to_instance(parse.leaves(), pos_tags, parse)
Example #20
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 requires_grad: bool = False,
                 vocab_to_cache: List[str] = None) -> None:
        super(_ElmoBiLm, self).__init__()

        self._token_embedder = _ElmoCharacterEncoder(
            options_file, weight_file, requires_grad=requires_grad)

        self._requires_grad = requires_grad
        if requires_grad and vocab_to_cache:
            logging.warning(
                "You are fine tuning ELMo and caching char CNN word vectors. "
                "This behaviour is not guaranteed to be well defined, particularly. "
                "if not all of your inputs will occur in the vocabulary cache."
            )
        # This is an embedding, used to look up cached
        # word vectors built from character level cnn embeddings.
        self._word_embedding = None
        self._bos_embedding: torch.Tensor = None
        self._eos_embedding: torch.Tensor = None
        if vocab_to_cache:
            logging.info(
                "Caching character cnn layers for words in vocabulary.")
            # This sets 3 attributes, _word_embedding, _bos_embedding and _eos_embedding.
            # They are set in the method so they can be accessed from outside the
            # constructor.
            self.create_cached_cnn_embeddings(vocab_to_cache)

        with open(cached_path(options_file), 'r') as fin:
            options = json.load(fin)
        if not options['lstm'].get('use_skip_connections'):
            raise ConfigurationError(
                'We only support pretrained biLMs with residual connections')
        self._elmo_lstm = ElmoLstm(
            input_size=options['lstm']['projection_dim'],
            hidden_size=options['lstm']['projection_dim'],
            cell_size=options['lstm']['dim'],
            num_layers=options['lstm']['n_layers'],
            memory_cell_clip_value=options['lstm']['cell_clip'],
            state_projection_clip_value=options['lstm']['proj_clip'],
            requires_grad=requires_grad)
        self._elmo_lstm.load_weights(weight_file)
        # Number of representation layers including context independent layer
        self.num_layers = options['lstm']['n_layers'] + 1
Example #21
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s",
                        file_path)
            for line_num, line in enumerate(data_file):
                line = line.strip("\n")

                if not line:
                    continue

                line_parts = line.split('\t')
                if len(line_parts) != 2:
                    raise ConfigurationError(
                        "Invalid line format: %s (line number %d)" %
                        (line, line_num + 1))
                source_sequence, target_sequence = line_parts
                yield self.text_to_instance(source_sequence, target_sequence)
Example #22
    def _create_sempre_executor(self) -> None:
        """
        Creates a server running SEMPRE that we can send logical forms to for evaluation.  This
        uses inter-process communication, because SEMPRE is java code.  We also need to be careful
        to clean up the process when our program exits.
        """
        if self._executor_process:
            return

        # It'd be much nicer to just use `cached_path` for these files.  However, the SEMPRE jar
        # that we're using expects to find these files in a particular location, so we need to make
        # sure we put the files in that location.
        os.makedirs(SEMPRE_DIR, exist_ok=True)
        abbreviations_path = os.path.join(SEMPRE_DIR, 'abbreviations.tsv')
        if not os.path.exists(abbreviations_path):
            subprocess.run(f'wget {ABBREVIATIONS_FILE}', shell=True)
            subprocess.run(
                f'mv wikitables-abbreviations.tsv {abbreviations_path}',
                shell=True)

        grammar_path = os.path.join(SEMPRE_DIR, 'grow.grammar')
        if not os.path.exists(grammar_path):
            subprocess.run(f'wget {GROW_FILE}', shell=True)
            subprocess.run(f'mv wikitables-grow.grammar {grammar_path}',
                           shell=True)

        args = [
            'java', '-jar',
            cached_path(SEMPRE_EXECUTOR_JAR), 'serve', self._table_directory
        ]
        self._executor_process = subprocess.Popen(args,
                                                  stdin=subprocess.PIPE,
                                                  stdout=subprocess.PIPE,
                                                  bufsize=1)

        lines = []
        for _ in range(6):
            # SEMPRE outputs six lines of stuff when it loads that I can't disable.  So, we clear
            # that here.
            lines.append(str(self._executor_process.stdout.readline()))
        assert 'Parser' in lines[-1], \
            "SEMPRE server output unexpected; the server may have changed"
        logger.info("Started SEMPRE server for evaluating logical forms")

        # This is supposed to ensure that the subprocess gets killed when python exits.
        atexit.register(self._stop_sempre_executor)
Example #23
    def __init__(self,
                 encoder: Dict[str, int] = None,
                 byte_pairs: List[Tuple[str, str]] = None,
                 n_ctx: int = 512,
                 model_path: str = None) -> None:

        too_much_information = model_path and (encoder or byte_pairs)
        too_little_information = not model_path and not (encoder and byte_pairs)

        if too_much_information or too_little_information:
            raise ConfigurationError("must specify either model path or (encoder + byte_pairs) but not both")

        if model_path:
            model_path = cached_path(model_path)

            # Load encoder and byte_pairs from tar.gz
            with tarfile.open(model_path) as tmp:
                encoder_name = next(m.name for m in tmp.getmembers() if 'encoder_bpe' in m.name)
                encoder_info = tmp.extractfile(encoder_name)

                if encoder_info:
                    encoder = json.loads(encoder_info.read())
                else:
                    raise ConfigurationError(f"expected encoder_bpe file in archive {model_path}")

                bpe_name = next(m.name for m in tmp.getmembers() if m.name.endswith('.bpe'))
                bpe_info = tmp.extractfile(bpe_name)

                if bpe_info:
                    # First line is "version", last line is blank
                    lines = bpe_info.read().decode('utf-8').split('\n')[1:-1]
                    # Convert "b1 b2" -> (b1, b2)
                    byte_pairs = [tuple(line.split()) for line in lines]  # type: ignore
                else:
                    raise ConfigurationError(f"expected .bpe file in archive {model_path}")

        self.encoder = encoder
        self.decoder = {word_id: word for word, word_id in self.encoder.items()}

        # Compute ranks
        self.bpe_ranks = {pair: idx for idx, pair in enumerate(byte_pairs)}

        self.cache: Dict[str, List[str]] = {}
        self.n_ctx = n_ctx
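
A small illustration of the rank computation at the end of this constructor: pairs that appear earlier in the ``.bpe`` file get lower ranks, which standard BPE treats as higher-priority merges. The pairs below are made up.

    byte_pairs = [('t', 'h'), ('th', 'e</w>'), ('a', 'n')]  # made-up merge list
    bpe_ranks = {pair: idx for idx, pair in enumerate(byte_pairs)}
    assert bpe_ranks[('t', 'h')] == 0  # merged before ('a', 'n'), which has rank 2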
Example #24
    def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:

            logger.info("Reading instances from lines in file at: %s", file_path)
            for line in data_file:
                line = line.strip("\n")

                # skip blank lines
                if not line:
                    continue

                tokens_and_tags = [pair.rsplit(self._word_tag_delimiter, 1)
                                   for pair in line.split(self._token_delimiter)]
                tokens = [Token(token) for token, tag in tokens_and_tags]
                tags = [tag for token, tag in tokens_and_tags]
                yield self.text_to_instance(tokens, tags)
Example #25
 def _read(self, file_path: str):
     # if `file_path` is a URL, redirect to the cache
     file_path = cached_path(file_path)
     logger.info("Reading file at %s", file_path)
     with open(file_path) as dataset_file:
         dataset_json = json.load(dataset_file)
         dataset = dataset_json['data']
     logger.info("Reading the dataset")
     for article in dataset:
         for paragraph_json in article['paragraphs']:
             paragraph = paragraph_json["context"]
             tokenized_paragraph = self._tokenizer.tokenize(paragraph)
             qas = paragraph_json['qas']
             metadata = {}
             metadata["instance_id"] = [qa['id'] for qa in qas]
             question_text_list = [
                 qa["question"].strip().replace("\n", "") for qa in qas
             ]
             answer_texts_list = [
                 [answer['text'] for answer in qa['answers']] for qa in qas
             ]
             metadata["question"] = question_text_list
             metadata['answer_texts_list'] = answer_texts_list
             span_starts_list = [[
                 answer['answer_start'] for answer in qa['answers']
             ] for qa in qas]
             span_ends_list = []
             for answer_starts, an_list in zip(span_starts_list,
                                               answer_texts_list):
                 span_ends = [
                     start + len(answer)
                     for start, answer in zip(answer_starts, an_list)
                 ]
                 span_ends_list.append(span_ends)
             yesno_list = [str(qa['yesno']) for qa in qas]
             followup_list = [str(qa['followup']) for qa in qas]
             instance = self.text_to_instance(question_text_list, paragraph,
                                              span_starts_list,
                                              span_ends_list,
                                              tokenized_paragraph,
                                              yesno_list, followup_list,
                                              metadata)
             yield instance
Example #26
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, 'r') as snli_file:
            logger.info("Reading SNLI instances from jsonl dataset at: %s",
                        file_path)
            for line in snli_file:
                example = json.loads(line)

                label = example["gold_label"]
                if label == '-':
                    # These were cases where the annotators disagreed; we'll just skip them.  It's
                    # like 800 out of 500k examples in the training data.
                    continue

                premise = example["sentence1"]
                hypothesis = example["sentence2"]

                yield self.text_to_instance(premise, hypothesis, label)
Example #27
 def _read(self, file_path):
     with open(cached_path(file_path), "r") as data_file:
         logger.info("Reading instances from lines in file at: %s",
                     file_path)
         for line in data_file.readlines():
             line = line.strip("\n")
             if not line:
                 continue
             parsed_line = Tree.fromstring(line)
             if self._use_subtrees:
                 for subtree in parsed_line.subtrees():
                     instance = self.text_to_instance(
                         subtree.leaves(), subtree.label())
                     if instance is not None:
                         yield instance
             else:
                 instance = self.text_to_instance(parsed_line.leaves(),
                                                  parsed_line.label())
                 if instance is not None:
                     yield instance
Example #28
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 requires_grad: bool = False) -> None:
        super(_ElmoCharacterEncoder, self).__init__()

        with open(cached_path(options_file), 'r') as fin:
            self._options = json.load(fin)
        self._weight_file = weight_file

        self.output_dim = self._options['lstm']['projection_dim']
        self.requires_grad = requires_grad

        self._load_weights()

        # Cache the arrays for use in forward -- +1 due to masking.
        self._beginning_of_sentence_characters = torch.from_numpy(
            numpy.array(ELMoCharacterMapper.beginning_of_sentence_characters) +
            1)
        self._end_of_sentence_characters = torch.from_numpy(
            numpy.array(ELMoCharacterMapper.end_of_sentence_characters) + 1)
Example #29
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append((start + total_tokens,
                                              end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)
            yield self.text_to_instance([s.words for s in sentences], canonical_clusters)
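
A tiny worked example of the offset bookkeeping in the loop above: per-sentence span indices are shifted by the number of tokens seen so far, giving document-level cluster spans. The sentences and the cluster are made up.

    sentences = [["John", "went", "home", "."], ["He", "slept", "."]]
    per_sentence_spans = [[(0, 0)], [(0, 0)]]  # "John" in sentence 1, "He" in sentence 2

    total_tokens = 0
    document_spans = []
    for words, spans in zip(sentences, per_sentence_spans):
        document_spans.extend((start + total_tokens, end + total_tokens)
                              for start, end in spans)
        total_tokens += len(words)
    assert document_spans == [(0, 0), (4, 4)]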
Example #30
    def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        logger.info("Reading instances from lines in file at: %s", file_path)

        with open(file_path) as input_file:
            for line in input_file:
                if line.startswith("(<"):
                    # Each leaf looks like
                    # (<L ccg_category modified_pos original_pos token predicate_arg_category>)
                    leaves = re.findall("<L (.*?)>", line)

                    # Use magic unzipping trick to split into tuples
                    tuples = zip(*[leaf.split() for leaf in leaves])

                    # Convert to lists and assign to variables.
                    ccg_categories, modified_pos_tags, original_pos_tags, tokens, predicate_arg_categories = \
                            [list(result) for result in tuples]

                    yield self.text_to_instance(tokens, ccg_categories,
                                                modified_pos_tags,
                                                original_pos_tags,
                                                predicate_arg_categories)
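
A standalone illustration of the regex extraction and the zip(*...) "unzipping trick" used above, on a made-up CCGbank-style line; the leaf field layout follows the comment in the code.

    import re

    line = "(<L N/N NNP NNP Pierre N_73/N_73>) (<L N NNP NNP Vinken N>)"  # made-up example
    leaves = re.findall("<L (.*?)>", line)
    # zip(*...) transposes the per-leaf field lists into per-field columns
    ccg_categories, modified_pos, original_pos, tokens, pred_arg_categories = \
        [list(column) for column in zip(*[leaf.split() for leaf in leaves])]
    assert tokens == ["Pierre", "Vinken"]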