Example 1
def main():
    parser = argparse.ArgumentParser(description="Perform surgery on a model.tar.gz archive")

    parser.add_argument("--input-file", required=True)
    parser.add_argument("--output-file", required=True)
    parser.add_argument("--editor")

    args = parser.parse_args()

    editor = args.editor or os.environ.get("EDITOR")
    if editor is None:
        raise RuntimeError("please specify an editor or set the $EDITOR environment variable")

    if os.path.exists(args.output_file):
        raise ValueError("output file already exists")

    archive_file = cached_path(args.input_file)
    if not os.path.exists(archive_file):
        raise ValueError("input file doesn't exist")

    # Extract archive to temp dir
    tempdir = tempfile.mkdtemp()
    with tarfile.open(archive_file, 'r:gz') as archive:
        archive.extractall(tempdir)
    atexit.register(lambda: shutil.rmtree(tempdir))

    config_path = os.path.join(tempdir, CONFIG_NAME)
    subprocess.run([editor, config_path])

    with tarfile.open(args.output_file, "w:gz") as tar:
        tar.add(tempdir, arcname=os.path.sep)
Example 2
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        logger.info("Reading file at %s", file_path)

        with open(file_path) as dataset_file:
            dataset = dataset_file.readlines()

        logger.info("Reading the dataset")

        context: List[List[str]] = [[]]
        for line in dataset:
            if '?' in line:
                question_str, answer, supports_str = line.replace('?', ' ?').split('\t')
                question = question_str.split()[1:]
                supports = [int(support) - 1 for support in supports_str.split()]

                yield self.text_to_instance(context, question, answer, supports)
            else:
                new_entry = line.replace('.', ' .').split()[1:]

                if line[0] == '1':
                    context = [new_entry]
                else:
                    context.append(new_entry)
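A minimal sketch of the line parsing above, using a hypothetical bAbI-style question line (the tab-separated question/answer/supports layout is inferred from the code, not quoted from the dataset):

line = "5 Where is the football?\tkitchen\t2 4\n"   # hypothetical input line
question_str, answer, supports_str = line.replace('?', ' ?').split('\t')
question = question_str.split()[1:]                 # drop the leading line number
supports = [int(support) - 1 for support in supports_str.split()]
print(question)   # ['Where', 'is', 'the', 'football', '?']
print(answer)     # 'kitchen'
print(supports)   # [1, 3]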
Example 3
 def _read(self, file_path: str):
     # if `file_path` is a URL, redirect to the cache
     file_path = cached_path(file_path)
     logger.info("Reading file at %s", file_path)
     with open(file_path) as dataset_file:
         dataset_json = json.load(dataset_file)
         dataset = dataset_json['data']
     logger.info("Reading the dataset")
     for article in dataset:
         for paragraph_json in article['paragraphs']:
             paragraph = paragraph_json["context"]
             tokenized_paragraph = self._tokenizer.tokenize(paragraph)
             qas = paragraph_json['qas']
             metadata = {}
             metadata["instance_id"] = [qa['id'] for qa in qas]
             question_text_list = [qa["question"].strip().replace("\n", "") for qa in qas]
             answer_texts_list = [[answer['text'] for answer in qa['answers']] for qa in qas]
             metadata["question"] = question_text_list
             metadata['answer_texts_list'] = answer_texts_list
             span_starts_list = [[answer['answer_start'] for answer in qa['answers']] for qa in qas]
             span_ends_list = []
             for answer_starts, an_list in zip(span_starts_list, answer_texts_list):
                 span_ends = [start + len(answer) for start, answer in zip(answer_starts, an_list)]
                 span_ends_list.append(span_ends)
             yesno_list = [str(qa['yesno']) for qa in qas]
             followup_list = [str(qa['followup']) for qa in qas]
             instance = self.text_to_instance(question_text_list,
                                              paragraph,
                                              span_starts_list,
                                              span_ends_list,
                                              tokenized_paragraph,
                                              yesno_list,
                                              followup_list,
                                              metadata)
             yield instance
Example 4
    def _read(self, file_path: str):

        for sentence in open(cached_path(file_path), "r"):
            tokens = sentence.strip().split(" ")
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
            words = []
            for index, token in enumerate(tokens):
                # Coreference is annotated using [square brackets]
                # or (round brackets) around coreferent phrases.
                if "[" in token and "]" in token:
                    clusters[0].append((index, index))
                elif "[" in token:
                    clusters[0].append((index, index))
                elif "]" in token:
                    old_span = clusters[0][-1]
                    clusters[0][-1] = (old_span[0], index)

                if "(" in token and ")" in token:
                    clusters[1].append((index, index))
                elif "(" in token:
                    clusters[1].append((index, index))
                elif ")" in token:
                    old_span = clusters[1][-1]
                    clusters[1][-1] = (old_span[0], index)

                if token.endswith("."):
                    # Winobias is tokenised, but not for full stops.
                    # We'll just special case them here.
                    token = token[:-1]
                    words.append(token.strip("[]()"))
                    words.append(".")
                else:
                    words.append(token.strip("[]()"))

            yield self.text_to_instance([Token(x) for x in words], [x for x in clusters.values()])
Example 5
    def _load_cnn_weights(self):
        cnn_options = self._options['char_cnn']
        filters = cnn_options['filters']
        char_embed_dim = cnn_options['embedding']['dim']

        convolutions = []
        for i, (width, num) in enumerate(filters):
            conv = torch.nn.Conv1d(
                    in_channels=char_embed_dim,
                    out_channels=num,
                    kernel_size=width,
                    bias=True
            )
            # load the weights
            with h5py.File(cached_path(self._weight_file), 'r') as fin:
                weight = fin['CNN']['W_cnn_{}'.format(i)][...]
                bias = fin['CNN']['b_cnn_{}'.format(i)][...]

            w_reshaped = numpy.transpose(weight.squeeze(axis=0), axes=(2, 1, 0))
            if w_reshaped.shape != tuple(conv.weight.data.shape):
                raise ValueError("Invalid weight file")
            conv.weight.data.copy_(torch.FloatTensor(w_reshaped))
            conv.bias.data.copy_(torch.FloatTensor(bias))

            conv.weight.requires_grad = self.requires_grad
            conv.bias.requires_grad = self.requires_grad

            convolutions.append(conv)
            self.add_module('char_conv_{}'.format(i), conv)

        self._convolutions = convolutions
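A minimal sketch of the transpose performed above, assuming the HDF5 weight is stored in a TensorFlow-style (1, kernel_width, char_embed_dim, num_filters) layout; the sizes here are made up:

import numpy
import torch

width, char_embed_dim, num = 3, 16, 32                     # hypothetical sizes
tf_weight = numpy.zeros((1, width, char_embed_dim, num), dtype='float32')

conv = torch.nn.Conv1d(in_channels=char_embed_dim, out_channels=num, kernel_size=width, bias=True)
# squeeze the leading singleton axis, then reorder to Conv1d's (out_channels, in_channels, kernel_size)
w_reshaped = numpy.transpose(tf_weight.squeeze(axis=0), axes=(2, 1, 0))
assert w_reshaped.shape == tuple(conv.weight.data.shape)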
Example 6
    def _load_highway(self):
        # pylint: disable=protected-access
        # the highway layers have same dimensionality as the number of cnn filters
        cnn_options = self._options['char_cnn']
        filters = cnn_options['filters']
        n_filters = sum(f[1] for f in filters)
        n_highway = cnn_options['n_highway']

        # create the layers, and load the weights
        self._highways = Highway(n_filters, n_highway, activation=torch.nn.functional.relu)
        for k in range(n_highway):
            # The AllenNLP highway is one matrix multiplication with concatenation of
            # transform and carry weights.
            with h5py.File(cached_path(self._weight_file), 'r') as fin:
                # The weights are transposed due to multiplication order assumptions in tf
                # vs pytorch (tf.matmul(X, W) vs pytorch.matmul(W, X))
                w_transform = numpy.transpose(fin['CNN_high_{}'.format(k)]['W_transform'][...])
                # -1.0 since AllenNLP is g * x + (1 - g) * f(x) but tf is (1 - g) * x + g * f(x)
                w_carry = -1.0 * numpy.transpose(fin['CNN_high_{}'.format(k)]['W_carry'][...])
                weight = numpy.concatenate([w_transform, w_carry], axis=0)
                self._highways._layers[k].weight.data.copy_(torch.FloatTensor(weight))
                self._highways._layers[k].weight.requires_grad = self.requires_grad

                b_transform = fin['CNN_high_{}'.format(k)]['b_transform'][...]
                b_carry = -1.0 * fin['CNN_high_{}'.format(k)]['b_carry'][...]
                bias = numpy.concatenate([b_transform, b_carry], axis=0)
                self._highways._layers[k].bias.data.copy_(torch.FloatTensor(bias))
                self._highways._layers[k].bias.requires_grad = self.requires_grad
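The -1.0 factors above rely on the sigmoid identity sigmoid(-z) = 1 - sigmoid(z): negating the carry pre-activation turns the tf-style (1 - g) * x + g * f(x) gating into AllenNLP's g * x + (1 - g) * f(x). A quick standalone check (not part of the loader):

import numpy

def sigmoid(z):
    return 1.0 / (1.0 + numpy.exp(-z))

z = numpy.linspace(-5.0, 5.0, 11)
# negating the pre-activation flips the gate: sigmoid(-z) == 1 - sigmoid(z)
assert numpy.allclose(sigmoid(-z), 1.0 - sigmoid(z))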
Example 7
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        logger.info("Reading file at %s", file_path)
        with open(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json['data']
        logger.info("Reading the dataset")
        for article in dataset:
            for paragraph_json in article['paragraphs']:
                paragraph = paragraph_json["context"]
                tokenized_paragraph = self._tokenizer.tokenize(paragraph)

                for question_answer in paragraph_json['qas']:
                    question_text = question_answer["question"].strip().replace("\n", "")
                    answer_texts = [answer['text'] for answer in question_answer['answers']]
                    span_starts = [answer['answer_start'] for answer in question_answer['answers']]
                    span_ends = [start + len(answer) for start, answer in zip(span_starts, answer_texts)]
                    instance = self.text_to_instance(question_text,
                                                     paragraph,
                                                     zip(span_starts, span_ends),
                                                     answer_texts,
                                                     tokenized_paragraph)
                    yield instance
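A minimal sketch (with made-up text) of the character-span arithmetic above: the dataset supplies a character start offset, and the end offset is simply the start plus the length of the answer string:

paragraph = "The quick brown fox jumps over the lazy dog."
answer_text = "brown fox"
answer_start = paragraph.index(answer_text)   # stands in for answer['answer_start']
answer_end = answer_start + len(answer_text)
assert paragraph[answer_start:answer_end] == answer_text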
Example 8
 def _read(self, file_path):
     logger.info("Reading instances from lines in file at: %s", file_path)
     with open(cached_path(file_path), "r") as data_file:
         tsv_in = csv.reader(data_file, delimiter='\t')
         for row in tsv_in:
             if len(row) == 4:
                 yield self.text_to_instance(premise=row[1], hypothesis=row[2], label=row[0])
Example 9
    def from_file(params_file: str, params_overrides: str = "", ext_vars: dict = None) -> 'Params':
        """
        Load a `Params` object from a configuration file.

        Parameters
        ----------
        params_file : ``str``
            The path to the configuration file to load.
        params_overrides : ``str``, optional
            A dict of overrides that can be applied to final object.
            e.g. {"model.embedding_dim": 10}
        ext_vars : ``dict``, optional
            Our config files are Jsonnet, which allows specifying external variables
            for later substitution. Typically we substitute these using environment
            variables; however, you can also specify them here, in which case they
            take priority over environment variables.
            e.g. {"HOME_DIR": "/Users/allennlp/home"}
        """
        if ext_vars is None:
            ext_vars = {}

        # redirect to cache, if necessary
        params_file = cached_path(params_file)
        ext_vars = {**dict(os.environ), **ext_vars}

        file_dict = json.loads(evaluate_file(params_file, ext_vars=ext_vars))

        overrides_dict = parse_overrides(params_overrides)
        param_dict = with_fallback(preferred=overrides_dict, fallback=file_dict)

        return Params(param_dict)
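A minimal sketch (not the AllenNLP implementation) of the preferred-over-fallback merge that `with_fallback` performs: override values win, and nested dicts are merged recursively:

def merge_with_fallback(preferred: dict, fallback: dict) -> dict:
    # values from `preferred` override `fallback`, recursing into nested dicts
    merged = dict(fallback)
    for key, value in preferred.items():
        if isinstance(value, dict) and isinstance(fallback.get(key), dict):
            merged[key] = merge_with_fallback(value, fallback[key])
        else:
            merged[key] = value
    return merged

config = {"model": {"type": "lstm", "embedding_dim": 300}}
overrides = {"model": {"embedding_dim": 10}}
assert merge_with_fallback(overrides, config) == {"model": {"type": "lstm", "embedding_dim": 10}}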
Example 10
    def _read(self, file_path: str):
        logger.info("Opening base tarball file at %s", self._base_tarball_path)
        base_tarball = tarfile.open(cached_path(self._base_tarball_path), 'r')
        if 'unfiltered' in file_path:
            logger.info("Opening unfiltered tarball file at %s", self._unfiltered_tarball_path)
            unfiltered_tarball = tarfile.open(cached_path(self._unfiltered_tarball_path), 'r')
            logger.info("Loading question file from tarball")
            data_json = json.loads(unfiltered_tarball.extractfile(file_path).read().decode('utf-8'))
        else:
            logger.info("Loading question file from tarball")
            path = os.path.join('qa', file_path)
            data_json = json.loads(base_tarball.extractfile(path).read().decode('utf-8'))

        logger.info("Reading the dataset")
        for question_json in data_json['Data']:
            question_text = question_json['Question']
            question_tokens = self._tokenizer.tokenize(question_text)

            evidence_files: List[List[str]] = []  # contains lines from each evidence file
            if 'web' in file_path:
                for result in question_json['SearchResults']:
                    filename = result['Filename']
                    evidence_file = base_tarball.extractfile(os.path.join("evidence", "web", filename))
                    evidence_files.append([line.decode('utf-8') for line in evidence_file.readlines()])
            else:
                for result in question_json['EntityPages']:
                    filename = result['Filename']
                    evidence_file = base_tarball.extractfile(os.path.join("evidence", "wikipedia", filename))
                    evidence_files.append([line.decode('utf-8') for line in evidence_file.readlines()])

            answer_json = question_json['Answer']
            human_answers = [util.normalize_text(answer) for answer in answer_json.get('HumanAnswers', [])]
            answer_texts = answer_json['NormalizedAliases'] + human_answers
            for paragraph in self.pick_paragraphs(evidence_files, question_text, answer_texts):
                paragraph_tokens = self._tokenizer.tokenize(paragraph)
                token_spans = util.find_valid_answer_spans(paragraph_tokens, answer_texts)
                if not token_spans:
                    # For now, we'll just ignore instances that we can't find answer spans for.
                    # Maybe we can do something smarter here later, but this will do for now.
                    continue
                instance = self.text_to_instance(question_text,
                                                 paragraph,
                                                 token_spans,
                                                 answer_texts,
                                                 question_tokens,
                                                 paragraph_tokens)
                yield instance
Example 11
    def load_weights(self, weight_file: str) -> None:
        """
        Load the pre-trained weights from the file.
        """
        requires_grad = self.requires_grad

        with h5py.File(cached_path(weight_file), 'r') as fin:
            for i_layer, lstms in enumerate(
                    zip(self.forward_layers, self.backward_layers)
            ):
                for j_direction, lstm in enumerate(lstms):
                    # lstm is an instance of LSTMCellWithProjection
                    cell_size = lstm.cell_size

                    dataset = fin['RNN_%s' % j_direction]['RNN']['MultiRNNCell']['Cell%s' % i_layer]['LSTMCell']

                    # tensorflow packs together both W and U matrices into one matrix,
                    # but pytorch maintains individual matrices.  In addition, tensorflow
                    # packs the gates as input, memory, forget, output but pytorch
                    # uses input, forget, memory, output.  So we need to modify the weights.
                    tf_weights = numpy.transpose(dataset['W_0'][...])
                    torch_weights = tf_weights.copy()

                    # split the W from U matrices
                    input_size = lstm.input_size
                    input_weights = torch_weights[:, :input_size]
                    recurrent_weights = torch_weights[:, input_size:]
                    tf_input_weights = tf_weights[:, :input_size]
                    tf_recurrent_weights = tf_weights[:, input_size:]

                    # handle the different gate order convention
                    for torch_w, tf_w in [[input_weights, tf_input_weights],
                                          [recurrent_weights, tf_recurrent_weights]]:
                        torch_w[(1 * cell_size):(2 * cell_size), :] = tf_w[(2 * cell_size):(3 * cell_size), :]
                        torch_w[(2 * cell_size):(3 * cell_size), :] = tf_w[(1 * cell_size):(2 * cell_size), :]

                    lstm.input_linearity.weight.data.copy_(torch.FloatTensor(input_weights))
                    lstm.state_linearity.weight.data.copy_(torch.FloatTensor(recurrent_weights))
                    lstm.input_linearity.weight.requires_grad = requires_grad
                    lstm.state_linearity.weight.requires_grad = requires_grad

                    # the bias weights
                    tf_bias = dataset['B'][...]
                    # tensorflow adds 1.0 to forget gate bias instead of modifying the
                    # parameters...
                    tf_bias[(2 * cell_size):(3 * cell_size)] += 1
                    torch_bias = tf_bias.copy()
                    torch_bias[(1 * cell_size):(2 * cell_size)] = tf_bias[(2 * cell_size):(3 * cell_size)]
                    torch_bias[(2 * cell_size):(3 * cell_size)] = tf_bias[(1 * cell_size):(2 * cell_size)]
                    lstm.state_linearity.bias.data.copy_(torch.FloatTensor(torch_bias))
                    lstm.state_linearity.bias.requires_grad = requires_grad

                    # the projection weights
                    proj_weights = numpy.transpose(dataset['W_P_0'][...])
                    lstm.state_projection.weight.data.copy_(torch.FloatTensor(proj_weights))
                    lstm.state_projection.weight.requires_grad = requires_grad
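A minimal sketch of the gate-block swap above, with made-up sizes: TensorFlow packs the four LSTM gates as (input, memory, forget, output) along the first axis, while the PyTorch side expects (input, forget, memory, output), so the second and third blocks of `cell_size` rows are exchanged:

import numpy

cell_size, input_size = 4, 3     # hypothetical sizes
tf_w = numpy.arange(4 * cell_size * input_size, dtype='float32').reshape(4 * cell_size, input_size)

torch_w = tf_w.copy()
torch_w[1 * cell_size:2 * cell_size, :] = tf_w[2 * cell_size:3 * cell_size, :]   # PyTorch slot 1 (forget) <- TF slot 2 (forget)
torch_w[2 * cell_size:3 * cell_size, :] = tf_w[1 * cell_size:2 * cell_size, :]   # PyTorch slot 2 (memory) <- TF slot 1 (memory)

# rows outside the swapped blocks are untouched
assert (torch_w[:cell_size] == tf_w[:cell_size]).all()
assert (torch_w[3 * cell_size:] == tf_w[3 * cell_size:]).all()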
Example 12
def _read_pretrained_words(embeddings_filename: str) -> Set[str]:
    words = set()
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            word = fields[0]
            words.add(word)
    return words
Example 13
 def _open_inside_zip(self, archive_path: str, member_path: Optional[str] = None) -> None:
     cached_archive_path = cached_path(archive_path, cache_dir=self._cache_dir)
     archive = zipfile.ZipFile(cached_archive_path, 'r')
     if member_path is None:
         members_list = archive.namelist()
         member_path = self._get_the_only_file_in_the_archive(members_list, archive_path)
     member_path = cast(str, member_path)
     member_file = archive.open(member_path, 'r')
     self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
     self._archive_handle = archive
Example 14
    def test_atis_keep_unparseable(self):
        database_file = cached_path("https://s3-us-west-2.amazonaws.com/allennlp/datasets/atis/atis.db")
        reader = AtisDatasetReader(database_file=database_file, keep_if_unparseable=True)
        instance = reader.text_to_instance(utterances=['show me the one way flights from detroit me to westchester county'],
                                           sql_query_labels=['this is not a query that can be parsed'])

        # If we have a query that can't be parsed, we check that it only has one element in the list of index fields and
        # that index is the padding index, -1.
        assert len(instance.fields['target_action_sequence'].field_list) == 1
        assert instance.fields['target_action_sequence'].field_list[0].sequence_index == -1
Example 15
    def __init__(self,
                 encoder: Dict[str, int] = None,
                 byte_pairs: List[Tuple[str, str]] = None,
                 n_ctx: int = 512,
                 model_path: str = None,
                 namespace: str = 'openai_transformer',
                 tokens_to_add: List[str] = None) -> None:
        self._namespace = namespace
        self._added_to_vocabulary = False

        too_much_information = model_path and (encoder or byte_pairs)
        too_little_information = not model_path and not (encoder and byte_pairs)

        if too_much_information or too_little_information:
            raise ConfigurationError("must specify either model path or (encoder + byte_pairs) but not both")

        if model_path:
            model_path = cached_path(model_path)

            # Load encoder and byte_pairs from tar.gz
            with tarfile.open(model_path) as tmp:
                encoder_name = next(m.name for m in tmp.getmembers() if 'encoder_bpe' in m.name)
                encoder_info = tmp.extractfile(encoder_name)

                if encoder_info:
                    encoder = json.loads(encoder_info.read())
                else:
                    raise ConfigurationError(f"expected encoder_bpe file in archive {model_path}")

                bpe_name = next(m.name for m in tmp.getmembers() if m.name.endswith('.bpe'))
                bpe_info = tmp.extractfile(bpe_name)

                if bpe_info:
                    # First line is "version", last line is blank
                    lines = bpe_info.read().decode('utf-8').split('\n')[1:-1]
                    # Convert "b1 b2" -> (b1, b2)
                    byte_pairs = [tuple(line.split()) for line in lines]  # type: ignore
                else:
                    raise ConfigurationError(f"expected .bpe file in archive {model_path}")

        if tokens_to_add is not None:
            for token in tokens_to_add:
                encoder[token + '</w>'] = len(encoder)
            self.tokens_to_add = set(tokens_to_add)
        else:
            self.tokens_to_add = None

        self.encoder = encoder
        self.decoder = {word_id: word for word, word_id in self.encoder.items()}

        # Compute ranks
        self.bpe_ranks = {pair: idx for idx, pair in enumerate(byte_pairs)}

        self.cache: Dict[str, List[str]] = {}
        self.n_ctx = n_ctx
Example 16
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading Fine-Grained NER instances from dataset files at: %s", file_path)
        if self._domain_identifier is not None:
            logger.info("Filtering to only include file paths containing the %s domain", self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier):
            tokens = [Token(_normalize_word(t)) for t in sentence.words]
            yield self.text_to_instance(tokens, sentence.named_entities)
Example 17
 def _open_inside_tar(self, archive_path: str, member_path: Optional[str] = None) -> None:
     cached_archive_path = cached_path(archive_path, cache_dir=self._cache_dir)
     archive = tarfile.open(cached_archive_path, 'r')
     if member_path is None:
         members_list = archive.getnames()
         member_path = self._get_the_only_file_in_the_archive(members_list, archive_path)
     member_path = cast(str, member_path)
     member = archive.getmember(member_path)   # raises exception if not present
     member_file = cast(IO[bytes], archive.extractfile(member))
     self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
     self._archive_handle = archive
Example 18
    def from_file(params_file: str, params_overrides: str = "") -> 'Params':
        """
        Load a `Params` object from a configuration file.
        """
        # redirect to cache, if necessary
        params_file = cached_path(params_file)

        file_dict = pyhocon.ConfigFactory.parse_file(params_file)

        overrides_dict = pyhocon.ConfigFactory.parse_string(params_overrides)
        param_dict = overrides_dict.with_fallback(file_dict)
        return Params(param_dict)
Example 19
    def __init__(self,
                 file_uri: str,
                 encoding: str = DEFAULT_ENCODING,
                 cache_dir: str = None) -> None:

        self.uri = file_uri
        self._encoding = encoding
        self._cache_dir = cache_dir
        self._archive_handle: Any = None   # only if the file is inside an archive

        main_file_uri, path_inside_archive = parse_embeddings_file_uri(file_uri)
        main_file_local_path = cached_path(main_file_uri, cache_dir=cache_dir)

        if zipfile.is_zipfile(main_file_local_path):  # ZIP archive
            self._open_inside_zip(main_file_uri, path_inside_archive)

        elif tarfile.is_tarfile(main_file_local_path):  # TAR archive
            self._open_inside_tar(main_file_uri, path_inside_archive)

        else:  # all the other supported formats, including uncompressed files
            if path_inside_archive:
                raise ValueError('Unsupported archive format: %s' % main_file_uri)

            # All the python packages for compressed files share the same interface of io.open
            extension = get_file_extension(main_file_uri)
            package = {
                    '.txt': io,
                    '.vec': io,
                    '.gz': gzip,
                    '.bz2': bz2,
                    '.lzma': lzma,
                    }.get(extension, None)

            if package is None:
                logger.warning('The embeddings file has an unknown file extension "%s". '
                               'We will assume the file is an (uncompressed) text file', extension)
                package = io

            self._handle = package.open(main_file_local_path, 'rt', encoding=encoding)  # type: ignore

        # To use this with tqdm we'd like to know the number of tokens. It's possible that the
        # first line of the embeddings file contains this: if it does, we want to start iteration
        # from the 2nd line, otherwise we want to start from the 1st.
        # Unfortunately, once we read the first line, we cannot move back the file iterator
        # because the underlying file may be "not seekable"; we use itertools.chain instead.
        first_line = next(self._handle)     # this moves the iterator forward
        self.num_tokens = EmbeddingsTextFile._get_num_tokens_from_first_line(first_line)
        if self.num_tokens:
            # the first line is a header line: start iterating from the 2nd line
            self._iterator = self._handle
        else:
            # the first line is not a header line: start iterating from the 1st line
            self._iterator = itertools.chain([first_line], self._handle)
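A minimal sketch (with made-up lines, and a crude stand-in for `_get_num_tokens_from_first_line`) of the itertools.chain trick above: once the first line of a possibly non-seekable stream has been consumed to inspect it, it is prepended back so iteration still covers every line:

import itertools

lines = iter(["the 0.1 0.2 0.3\n", "cat 0.4 0.5 0.6\n"])   # stands in for the open file handle
first_line = next(lines)                                    # moves the iterator forward
looks_like_header = len(first_line.split()) == 2            # stand-in heuristic, not the real check
if looks_like_header:
    iterator = lines                                        # header consumed: start from the 2nd line
else:
    iterator = itertools.chain([first_line], lines)         # not a header: put the line back
assert list(iterator)[0] == "the 0.1 0.2 0.3\n"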
Example 20
    def _load_char_embedding(self):
        with h5py.File(cached_path(self._weight_file), 'r') as fin:
            char_embed_weights = fin['char_embed'][...]

        weights = numpy.zeros(
                (char_embed_weights.shape[0] + 1, char_embed_weights.shape[1]),
                dtype='float32'
        )
        weights[1:, :] = char_embed_weights

        self._char_embedding_weights = torch.nn.Parameter(
                torch.FloatTensor(weights), requires_grad=self.requires_grad
        )
Example 21
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        logger.info("Reading semantic dependency parsing data from: %s", file_path)

        with open(file_path) as sdp_file:
            for annotated_sentence, directed_arc_indices, arc_tags in lazy_parse(sdp_file.read()):
                # If there are no arc indices, skip this instance.
                if not directed_arc_indices:
                    continue
                tokens = [word["form"] for word in annotated_sentence]
                pos_tags = [word["pos"] for word in annotated_sentence]
                yield self.text_to_instance(tokens, pos_tags, directed_arc_indices, arc_tags)
Example 22
 def _read(self, file_path):
     with open(cached_path(file_path), "r") as data_file:
         logger.info("Reading instances from lines in file at: %s", file_path)
         for line_num, line in enumerate(data_file):
             line = line.strip("\n")
             if not line:
                 continue
             line_parts = line.split('\t')
             if len(line_parts) != 2:
                 raise RuntimeError("Invalid line format: %s (line number %d)" % (line, line_num + 1))
             source_sequence, target_sequence = line_parts
             if not source_sequence:
                 continue
             yield self.text_to_instance(source_sequence, target_sequence)
Example 23
    def _load_projection(self):
        cnn_options = self._options['char_cnn']
        filters = cnn_options['filters']
        n_filters = sum(f[1] for f in filters)

        self._projection = torch.nn.Linear(n_filters, self.output_dim, bias=True)
        with h5py.File(cached_path(self._weight_file), 'r') as fin:
            weight = fin['CNN_proj']['W_proj'][...]
            bias = fin['CNN_proj']['b_proj'][...]
            self._projection.weight.data.copy_(torch.FloatTensor(numpy.transpose(weight)))
            self._projection.bias.data.copy_(torch.FloatTensor(bias))

            self._projection.weight.requires_grad = self.requires_grad
            self._projection.bias.requires_grad = self.requires_grad
Example 24
    def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        directory, filename = os.path.split(file_path)
        logger.info("Reading instances from lines in file at: %s", file_path)
        for parse in BracketParseCorpusReader(root=directory, fileids=[filename]).parsed_sents():

            self._strip_functional_tags(parse)
            # This is un-needed and clutters the label space.
            # All the trees also contain a root S node.
            if parse.label() == "VROOT":
                parse = parse[0]
            pos_tags = [x[1] for x in parse.pos()] if self._use_pos_tags else None
            yield self.text_to_instance(parse.leaves(), pos_tags, parse)
Example 25
    def test_cached_path(self):
        url = 'http://fake.datastore.com/glove.txt.gz'
        set_up_glove(url, self.glove_bytes)

        # non-existent file
        with pytest.raises(FileNotFoundError):
            filename = cached_path("tests/fixtures/does_not_exist/fake_file.tar.gz")

        # unparsable URI
        with pytest.raises(ValueError):
            filename = cached_path("fakescheme://path/to/fake/file.tar.gz")

        # existing file as path
        assert cached_path(self.glove_file) == self.glove_file

        # caches urls
        filename = cached_path(url, cache_dir=self.TEST_DIR)

        assert len(responses.calls) == 2
        assert filename == os.path.join(self.TEST_DIR, url_to_filename(url, etag="0"))

        with open(filename, 'rb') as cached_file:
            assert cached_file.read() == self.glove_bytes
Example 26
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        # Set debug_counter to, say, 5 to get extra information logged for first 5 instances
        debug_counter = 5
        counter = self._sample
        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file: %s", file_path)
            for line in tqdm.tqdm(data_file):
                counter -= 1
                if counter == 0:
                    break
                line = line.strip("\n")
                if not line:
                    continue
                question_data_orig = json.loads(line)
                question_data_list = self.preprocess(question_data_orig)

                debug_counter -= 1
                if debug_counter > 0:
                    logger.info(f'question_data_list = {question_data_list}')
                for question_data in question_data_list:
                    question = question_data['question']
                    question_id = question_data['id']
                    logical_forms = question_data['logical_forms']
                    # Skip examples with certain attributes
                    if (self._skip_attributes_regex is not None and
                                self._skip_attributes_regex.search(logical_forms[0])):
                        continue
                    # Somewhat hacky filtering to "friction" subset of questions based on id
                    if not self._compatible_question(question_data):
                        continue

                    if debug_counter > 0:
                        logger.info(f'logical_forms = {logical_forms}')
                    answer_index = question_data['answer_index']
                    world_extractions = question_data.get('world_extractions')
                    entity_literals = question_data.get('entity_literals')
                    if entity_literals is not None and world_extractions is not None:
                        # This will catch flipped worlds if need be
                        entity_literals.update(world_extractions)
                    additional_metadata = {'id': question_id,
                                           'question': question,
                                           'answer_index': answer_index,
                                           'logical_forms': logical_forms}

                    yield self.text_to_instance(question, logical_forms,
                                                additional_metadata, world_extractions,
                                                entity_literals, debug_counter=debug_counter)
Example 27
    def _execute_logical_form_on_table(logical_form: str, table: str):
        """
        The parameters are written out to files which the jar file reads and then executes the
        logical form.
        """
        logical_form_filename = os.path.join(SEMPRE_DIR, 'logical_forms.txt')
        with open(logical_form_filename, 'w') as temp_file:
            temp_file.write(logical_form + '\n')

        table_dir = os.path.join(SEMPRE_DIR, 'tsv/')
        os.makedirs(table_dir, exist_ok=True)
        # The .tsv file extension is important here since the table string parameter is in tsv format.
        # If this file was named with suffix .csv then Sempre would interpret it as comma separated
        # and return the wrong denotation.
        table_filename = 'context.tsv'
        with open(os.path.join(table_dir, table_filename), 'w', encoding='utf-8') as temp_file:
            temp_file.write(table)

        # The id, target, and utterance are ignored, we just need to get the
        # table filename into sempre's lisp format.
        test_record = ('(example (id nt-0) (utterance none) (context (graph tables.TableKnowledgeGraph %s))'
                       '(targetValue (list (description "6"))))' % (table_filename))
        test_data_filename = os.path.join(SEMPRE_DIR, 'data.examples')
        with open(test_data_filename, 'w') as temp_file:
            temp_file.write(test_record)

        # TODO(matt): The jar that we have isn't optimal for this use case - we're using a
        # script designed for computing accuracy, and just pulling out a piece of it. Writing
        # a new entry point to the jar that's tailored for this use would be cleaner.
        command = ' '.join(['java',
                            '-jar',
                            cached_path(DEFAULT_EXECUTOR_JAR),
                            test_data_filename,
                            logical_form_filename,
                            table_dir])
        run(command, shell=True)

        denotations_file = os.path.join(SEMPRE_DIR, 'logical_forms_denotations.tsv')
        with open(denotations_file) as temp_file:
            line = temp_file.readline().split('\t')

        # Clean up all the temp files generated from this function.
        # Take care to not remove the auxiliary sempre files
        os.remove(logical_form_filename)
        shutil.rmtree(table_dir)
        os.remove(denotations_file)
        os.remove(test_data_filename)
        return line[1] if len(line) > 1 else line[0]
Example 28
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path) as atis_file:
            logger.info("Reading ATIS instances from dataset at : %s", file_path)
            for line in _lazy_parse(atis_file.read()):
                utterances = []
                for current_interaction in line['interaction']:
                    if not current_interaction['utterance'] or not current_interaction['sql']:
                        continue
                    utterances.append(current_interaction['utterance'])
                    sql_query_labels = [query for query in current_interaction['sql'].split('\n') if query]
                    instance = self.text_to_instance(deepcopy(utterances), sql_query_labels)
                    if not instance:
                        continue
                    yield instance
Example 29
 def _read(self, file_path):
     with open(cached_path(file_path), "r") as data_file:
         logger.info("Reading instances from lines in file at: %s", file_path)
         for line in data_file.readlines():
             line = line.strip("\n")
             if not line:
                 continue
             parsed_line = Tree.fromstring(line)
             if self._use_subtrees:
                 for subtree in parsed_line.subtrees():
                     instance = self.text_to_instance(subtree.leaves(), subtree.label())
                     if instance is not None:
                         yield instance
             else:
                 instance = self.text_to_instance(parsed_line.leaves(), parsed_line.label())
                 if instance is not None:
                     yield instance
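A minimal sketch (hypothetical SST-style line) of the tree handling above: each line is a bracketed parse whose root label is the sentence-level sentiment, whose leaves are the tokens, and whose subtrees carry their own labels:

from nltk import Tree

line = "(3 (2 An) (4 (3 amazing) (2 film)))"
parsed_line = Tree.fromstring(line)
print(parsed_line.label())                  # '3'  -> label for the whole sentence
print(parsed_line.leaves())                 # ['An', 'amazing', 'film']
print(len(list(parsed_line.subtrees())))    # 5 subtrees, including the root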
Example 30
    def test_atis_read_from_file(self):
        data_path = AllenNlpTestCase.FIXTURES_ROOT / "data" / "atis" / "sample.json"
        database_file = cached_path("https://s3-us-west-2.amazonaws.com/allennlp/datasets/atis/atis.db")
        reader = AtisDatasetReader(database_file=database_file)

        instances = list(reader.read(str(data_path)))

        assert len(instances) == 13
        instance = instances[0]

        assert set(instance.fields.keys()) == \
                {'utterance',
                 'actions',
                 'world',
                 'sql_queries',
                 'target_action_sequence',
                 'linking_scores'}

        assert [t.text for t in instance.fields["utterance"].tokens] == \
                ['show', 'me', 'the', 'one', 'way',
                 'flights', 'from', 'detroit', 'to',
                 'westchester', 'county']

        assert isinstance(instance.fields['world'].as_tensor({}), AtisWorld)

        world = instance.fields['world'].metadata
        assert set(world.valid_actions['number']) == \
                {'number -> ["1"]',
                 'number -> ["0"]',
                 'number -> ["41"]',
                 'number -> ["60"]'}

        assert world.linked_entities['string']['airport_airport_code_string -> ["\'DTW\'"]'][2] == \
                [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0] # ``detroit`` -> ``DTW``
        assert world.linked_entities['string']['flight_stop_stop_airport_string -> ["\'DTW\'"]'][2] == \
                [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0] # ``detroit`` -> ``DTW``
        assert world.linked_entities['string']['city_city_code_string -> ["\'DDTT\'"]'][2] == \
                [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0] # ``detroit`` -> ``DDTT``
        assert world.linked_entities['string']['fare_basis_economy_string -> ["\'NO\'"]'][2] == \
                [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0] # ``one way`` -> ``NO``
        assert world.linked_entities['string']['city_city_name_string -> ["\'WESTCHESTER COUNTY\'"]'][2] == \
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1] # ``westchester county`` -> ``WESTCHESTER COUNTY``
        assert world.linked_entities['string']['city_city_code_string -> ["\'HHPN\'"]'][2] == \
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1] # ``westchester county`` -> ``HHPN``
Example 31
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 requires_grad: bool = False) -> None:
        super(_ElmoBiLm, self).__init__()

        self._token_embedder = _ElmoCharacterEncoder(options_file, weight_file, requires_grad=requires_grad)

        with open(cached_path(options_file), 'r') as fin:
            options = json.load(fin)
        if not options['lstm'].get('use_skip_connections'):
            raise ConfigurationError('We only support pretrained biLMs with residual connections')
        self._elmo_lstm = ElmoLstm(input_size=options['lstm']['projection_dim'],
                                   hidden_size=options['lstm']['projection_dim'],
                                   cell_size=options['lstm']['dim'],
                                   num_layers=options['lstm']['n_layers'],
                                   memory_cell_clip_value=options['lstm']['cell_clip'],
                                   state_projection_clip_value=options['lstm']['proj_clip'],
                                   requires_grad=requires_grad)
        self._elmo_lstm.load_weights(weight_file)
        # Number of representation layers including context independent layer
        self.num_layers = options['lstm']['n_layers'] + 1
Example 32
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, 'r') as te_file:
            logger.info("Reading Target Sentiment instances from jsonl "
                        "dataset at: %s", file_path)
            for line in te_file:
                example = json.loads(line)
                example_instance: Dict[str, Any] = {}

                example_instance["text"] = example["text"]
                if 'target_sentiments' in example and 'targets' in example:
                    example_instance['targets'] = example['targets']
                    example_instance['target_sentiments'] = example['target_sentiments']
                if 'categories' in example:
                    example_instance['categories'] = example['categories']
                if 'category_sentiments' in example:
                    example_instance['category_sentiments'] = example['category_sentiments']
                if 'spans' in example:
                    example_instance['spans'] = example['spans']
                yield self.text_to_instance(**example_instance)
Example 33
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 requires_grad: bool = False) -> None:
        super(_ElmoCharacterEncoder, self).__init__()

        with open(cached_path(options_file), 'r') as fin:
            self._options = json.load(fin)
        self._weight_file = weight_file

        self.output_dim = self._options['lstm']['projection_dim']
        self.requires_grad = requires_grad

        self._load_weights()

        # Cache the arrays for use in forward -- +1 due to masking.
        self._beginning_of_sentence_characters = torch.from_numpy(
                numpy.array(ELMoCharacterMapper.beginning_of_sentence_characters) + 1
        )
        self._end_of_sentence_characters = torch.from_numpy(
                numpy.array(ELMoCharacterMapper.end_of_sentence_characters) + 1
        )
Example 34
 def _read(self, file_path):
     with open(cached_path(file_path), "r") as data_file:
         logger.info("Reading instances from lines in file at: %s", file_path)
         columns = data_file.readline().strip('\n').split('\t')
         token_col_inds = [columns.index(self._column_titles_to_index[field_ind])
                           for field_ind in range(len(self._column_titles_to_index))]
         for line in data_file.readlines():
             if not line:
                 continue
             items = line.strip("\n").split("\t")
             tokens = ''
             for col_ind in token_col_inds:
                 tokens += items[col_ind] + ' '
             tokens = tokens[:-1]
             tokens = items[columns.index("tokens")]
             if len(tokens.strip()) == 0:
                 continue
             category = items[columns.index("category")]
             instance = self.text_to_instance(tokens=tokens,
                                              category=category)
             if instance is not None:
                 yield instance
Example 35
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s",
                        file_path)
            reader = csv.reader(data_file, delimiter="\t")

            line_num: int
            row: List[str]
            for line_num, row in enumerate(reader):
                if len(row) < 4:
                    raise ConfigurationError(
                        "Invalid line format: %s (line number %d)" %
                        (row, line_num + 1))
                sample_id, source_sequence, target_lang, target_sequence, *rest = row

                if not target_lang:
                    raise ConfigurationError(
                        "Empty target language: {} (line number {})".format(
                            row, line_num + 1))

                yield self.text_to_instance(source_sequence, target_lang,
                                            target_sequence)
Example 36
    def __init__(self,
                 all_tables: Dict[str, List[str]] = None,
                 tables_with_strings: Dict[str, List[str]] = None,
                 database_file: str = None) -> None:
        self.all_tables = all_tables
        self.tables_with_strings = tables_with_strings
        if database_file:
            self.database_file = cached_path(database_file)
            self.connection = sqlite3.connect(self.database_file)
            self.cursor = self.connection.cursor()

        grammar_dictionary, strings_list = self.create_grammar_dict_and_strings()
        self.grammar_dictionary: Dict[str, List[str]] = grammar_dictionary
        self.strings_list: List[Tuple[str, str]] = strings_list

        self.grammar_string: str = self.get_grammar_string()
        self.grammar: Grammar = Grammar(self.grammar_string)
        self.valid_actions: Dict[str, List[str]] = initialize_valid_actions(
            self.grammar, KEYWORDS)
        if database_file:
            self.connection.close()
Example 37
 def _read(self, file_path):
     with open(cached_path(file_path), "r") as data_file:
         logger.info("Reading instances from lines in file at: %s",
                     file_path)
         for line in data_file.readlines():
             line = line.strip("\n")
             if not line:
                 continue
             parsed_line = Tree.fromstring(line)
             if self._use_subtrees:
                 for subtree in parsed_line.subtrees():
                     instance = self.text_to_instance(
                         subtree.leaves(), subtree.label())
                     if instance is None:
                         continue
                     yield instance
             else:
                 instance = self.text_to_instance(parsed_line.leaves(),
                                                  parsed_line.label())
                 if instance is None:
                     continue
                 yield instance
Example 38
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path, extract_archive=True)

        logger.info("Reading file at %s", file_path)
        yielded_relation_count = 0
        from allennlp.common.file_utils import json_lines_from_file

        for relation in self.shard_iterable(json_lines_from_file(file_path)):
            premise = relation["premise"]
            hypothesis = relation["hypothesis"]
            if "label" in relation:
                label = relation["label"]
            else:
                label = None
            index = relation["idx"]

            # todo: see if we even need this to be in a separate method
            instance = self.text_to_instance(index, label, premise, hypothesis)

            yield instance
            yielded_relation_count += 1
Example 39
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as text_file:
            instance_strings = text_file.readlines()

        if self._tokens_per_instance is not None:
            all_text = " ".join([x.replace("\n", " ").strip() for x in instance_strings])
            tokenized_text = self._tokenizer.tokenize(all_text)
            num_tokens = self._tokens_per_instance + 1
            tokenized_strings = []
            logger.info("Creating dataset from all text in file: %s", file_path)
            for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
                tokenized_strings.append(tokenized_text[index : (index + num_tokens)])
        else:
            tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

        for tokenized_string in tokenized_strings:
            input_field = TextField(tokenized_string[:-1], self._token_indexers)
            output_field = TextField(tokenized_string[1:], self._output_indexer)
            yield Instance({"input_tokens": input_field, "output_tokens": output_field})
Example 40
    def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:

            logger.info("Reading instances from lines in file at: %s",
                        file_path)
            for line in data_file:
                line = line.strip("\n")

                # skip blank lines
                if not line:
                    continue

                tokens_and_tags = [
                    pair.rsplit(self._word_tag_delimiter, 1)
                    for pair in line.split(self._token_delimiter)
                ]
                tokens = [Token(token) for token, tag in tokens_and_tags]
                tags = [tag for token, tag in tokens_and_tags]
                yield self.text_to_instance(tokens, tags)
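A minimal sketch (made-up line, assuming a "###" word/tag delimiter and whitespace between pairs) of the splitting above:

line = "The###DET dog###NOUN barks###VERB"
tokens_and_tags = [pair.rsplit("###", 1) for pair in line.split(None)]
tokens = [token for token, tag in tokens_and_tags]
tags = [tag for token, tag in tokens_and_tags]
print(tokens)   # ['The', 'dog', 'barks']
print(tags)     # ['DET', 'NOUN', 'VERB']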
Example 41
    def _read(self, file_path: str) -> Iterable[Instance]:
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s",
                        file_path)

            # If we are sampling spans (i.e. we are training) we need to shuffle the data so that
            # we don't yield instances in the same order every epoch. Our current solution is to
            # read the entire file into memory. This is a little expensive (roughly 1G per 1 million
            # docs), so a better solution might be required down the line.
            data: Iterable[Any] = []
            if self.sample_spans:
                data = list(enumerate(data_file))
                random.shuffle(data)
                data = iter(data)
            else:
                data = enumerate(data_file)

            for _, text in data:
                yield self.text_to_instance(text)
Example 42
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info(
                "Reading SemEval 2010 Task 8 instances from "
                "jsonl dataset at: %s", file_path)
            for line in data_file:
                example = json.loads(line)

                text = " ".join(example["tokens"])
                relation = example["label"]

                entity1, entity2 = example["entities"]

                id_ = example["id"]

                head = (entity1[0], entity1[1])
                tail = (entity2[0], entity2[1])

                yield self.text_to_instance(text, head, tail, id_, relation)
Example 43
 def _read(self, file_path: str) -> Iterable[Instance]:
     with open(cached_path(file_path), "r") as data_file:
         for line_num, line in enumerate(tqdm.tqdm(data_file.readlines())):
             line = line.strip("\n")
             if not line:
                 continue
             try:
                 event_json = json.loads(line)
             except:
                 continue
             event = event_json['event']
             #if len(event['title']) and len(event['title']) == 0:
             #   continue
             tweets = event_json['tweets']
             #[preproccess_tweet(tweet) for tweet in tweets]
             #tweets = sorted(tweets,key=lambda tweet:tweet['boe_cosine'],reverse=True)
             tweet_texts = ' '.join([tweet['text'] for tweet in tweets])
             description = event.get('description', None)
             #title = event.get('title',None)
             #category = event.get('category',None)
             yield self.text_to_instance(tweet_texts,
                                         description)  #,title,category)
Example 44
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s", file_path)
        if self._domain_identifier is not None:
            logger.info("Filtering to only include file paths containing the %s domain", self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                    # for i in range(len(tags)):
                    #     if tags[i] != 'O':
                            # tags[i] = 'I-ARG1'
                    yield self.text_to_instance(tokens, verb_indicator, tags)
Example 45
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        logger.info("Reading file at %s", file_path)

        with open(file_path) as dataset_file:
            dataset = json.load(dataset_file)

        logger.info("Reading the dataset")
        for sample in dataset:

            instance = self.text_to_instance(
                sample["candidates"],
                sample["query"],
                sample["supports"],
                sample["id"],
                sample["answer"],
                sample["annotations"] if "annotations" in sample else [[]],
            )

            yield instance
Example 46
    def _read(self, file_path: str) -> Iterable[Instance]:
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s",
                        file_path)

            # Group into alternative divider / sentence chunks.
            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                # Ignore the divider chunks, so that `lines` corresponds to the words
                # of a single sentence.
                if not is_divider:
                    fields = [line.strip().split() for line in lines]
                    # unzipping trick returns tuples, but our Fields need lists
                    fields = [list(field) for field in zip(*fields)]
                    tokens_, pos_tags, chunk_tags, ner_tags = fields
                    # TextField requires ``Token`` objects
                    tokens = [Token(token) for token in tokens_]

                    yield self.text_to_instance(tokens, pos_tags, chunk_tags,
                                                ner_tags)
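A small, self-contained sketch of the itertools.groupby trick used above. _is_divider is not shown in the snippet, so the stand-in below assumes the usual CoNLL-2003 convention (blank lines and -DOCSTART- lines are dividers); the token lines are made up.

import itertools

def is_divider(line: str) -> bool:
    # Stand-in for _is_divider: blank lines and -DOCSTART- lines separate sentences.
    return line.strip() == "" or line.startswith("-DOCSTART-")

conll_lines = [
    "-DOCSTART- -X- -X- O\n",
    "\n",
    "EU NNP B-NP B-ORG\n",
    "rejects VBZ B-VP O\n",
    "\n",
    "German JJ B-NP B-MISC\n",
    "call NN I-NP O\n",
]

for divider, lines in itertools.groupby(conll_lines, is_divider):
    if not divider:
        fields = [line.strip().split() for line in lines]
        # The unzipping trick turns per-token rows into per-column lists.
        tokens, pos_tags, chunk_tags, ner_tags = (list(f) for f in zip(*fields))
        print(tokens, pos_tags, chunk_tags, ner_tags)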
Example n. 47
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, 'r') as snli_file:
            logger.info("Reading CSQA instances from jsonl dataset at: %s",
                        file_path)
            for line in snli_file:
                sample = json.loads(line)
                qid = sample["id"]
                question = sample["question"]["stem"]
                choices = [
                    choice['text']
                    for choice in sorted(sample["question"]["choices"],
                                         key=lambda c: c['label'])
                ]
                answer = sample['answerKey'] if 'answerKey' in sample else None

                choice_evidences = []
                for choice in sorted(sample['question']['choices'],
                                     key=lambda c: c['label']):
                    if 'evidence_selected' in choice and choice[
                            'evidence_selected']:
                        choice_evidences.append(choice['evidence_selected'])
                    else:
                        choice_evidences.append(None)

                if 'evidence_ranked' in sample['question']['choices'][0]:
                    choice_evidences = [
                        choice['evidence_ranked']
                        for choice in sorted(sample["question"]["choices"],
                                             key=lambda c: c['label'])
                    ]

                yield self.text_to_instance(qid,
                                            question,
                                            choices,
                                            choice_evidences=choice_evidences,
                                            answer=answer)
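A minimal, made-up jsonl line with only the keys the CSQA reader above uses, plus the choice-sorting step that keeps answer positions stable.

import json

# Purely illustrative record; field values are not real CSQA content.
line = json.dumps({
    "id": "q-0",
    "question": {
        "stem": "Where would you put a mug after washing it?",
        "choices": [
            {"label": "B", "text": "cupboard"},
            {"label": "A", "text": "sink"},
        ],
    },
    "answerKey": "B",
})

sample = json.loads(line)
# Sort by label so choice order is deterministic regardless of file order.
choices = [c["text"] for c in sorted(sample["question"]["choices"], key=lambda c: c["label"])]
print(choices)  # ['sink', 'cupboard']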
Example n. 48
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[
                int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append(
                        (start + total_tokens, end + total_tokens))
                total_tokens += len(sentence.words)

            yield self.text_to_instance([s.words for s in sentences],
                                        list(clusters.values()))
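A toy, self-contained example of the offset adjustment above: per-sentence coref spans are shifted by the number of tokens seen so far, so that all spans index into the flattened document. The sentences and spans are made up.

import collections

# Each sentence carries (cluster_id, (start, end)) pairs with sentence-local indices.
sentences = [
    (["John", "went", "home", "."], [(0, (0, 0))]),
    (["He", "was", "tired", "."], [(0, (0, 0))]),
]

clusters = collections.defaultdict(list)
total_tokens = 0
for words, coref_spans in sentences:
    for span_id, (start, end) in coref_spans:
        clusters[span_id].append((start + total_tokens, end + total_tokens))
    total_tokens += len(words)

print(dict(clusters))  # {0: [(0, 0), (4, 4)]} -- "John" and "He" in document-level offsets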
Example n. 49
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, 'r') as conllu_file:
            logger.info("Reading UD instances from conllu dataset at: %s",
                        file_path)

            for annotation in lazy_parse(conllu_file.read()):
                # CoNLLU annotations sometimes add back in words that have been elided
                # in the original sentence; we remove these, as we're just predicting
                # dependencies for the original sentence.
                # We filter by None here as elided words have a non-integer word id,
                # and are replaced with None by the conllu python library.
                annotation = [x for x in annotation if x["id"] is not None]

                heads = [x["head"] for x in annotation]
                tags = [x["deprel"] for x in annotation]
                words = [x["form"] for x in annotation]
                pos_tags = [x["upostag"] for x in annotation]
                yield self.text_to_instance(words, pos_tags,
                                            list(zip(tags, heads)))
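lazy_parse is not shown above; the sketch below parses a tiny CoNLL-U block by hand, keeping only the columns the reader uses (id, form, upostag, head, deprel), to show where words, pos_tags and (deprel, head) pairs come from. The sentence is made up.

# Columns in CoNLL-U: ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
conllu_block = """\
1\tThe\tthe\tDET\tDT\t_\t2\tdet\t_\t_
2\tcat\tcat\tNOUN\tNN\t_\t3\tnsubj\t_\t_
3\tsleeps\tsleep\tVERB\tVBZ\t_\t0\troot\t_\t_
"""

annotation = []
for row in conllu_block.splitlines():
    cols = row.split("\t")
    annotation.append({"id": int(cols[0]), "form": cols[1],
                       "upostag": cols[3], "head": int(cols[6]), "deprel": cols[7]})

# Mirror the reader's filtering of elided words (none in this toy sentence).
annotation = [x for x in annotation if x["id"] is not None]
words = [x["form"] for x in annotation]
pos_tags = [x["upostag"] for x in annotation]
heads = [x["head"] for x in annotation]
tags = [x["deprel"] for x in annotation]
print(words, pos_tags, list(zip(tags, heads)))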
Example n. 50
def load_words(
    fname: Union[str, PathLike],
    tokenizer: Tokenizer,
    vocab: Optional[Vocabulary] = None,
    namespace: str = "tokens",
    all_cases: bool = True,
) -> List[torch.Tensor]:
    """
    This function loads a list of words from a file,
    tokenizes each word into subword tokens, and converts the
    tokens into IDs.

    # Parameters

    fname : `Union[str, PathLike]`
        Name of file containing list of words to load.
    tokenizer : `Tokenizer`
        Tokenizer to tokenize words in file.
    vocab : `Vocabulary`, optional (default=`None`)
        Vocabulary of tokenizer. If `None`, assumes tokenizer is of
        type `PreTrainedTokenizer` and uses tokenizer's `vocab` attribute.
    namespace : `str`
        Namespace of vocab to use when tokenizing.
    all_cases : `bool`, optional (default=`True`)
        Whether to tokenize lower, title, and upper cases of each word.

    # Returns

    word_ids : `List[torch.Tensor]`
        List of tensors containing the IDs of subword tokens for
        each word in the file.
    """
    word_ids = []
    with open(cached_path(fname)) as f:
        words = json.load(f)
        for w in words:
            word_ids.extend(_convert_word_to_ids_tensor(w, tokenizer, vocab, namespace, all_cases))
    return word_ids
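_convert_word_to_ids_tensor is not shown above. The toy stand-in below uses a character-level vocabulary (an assumption made purely for illustration, not the real tokenizer) to show the lower/title/upper casing and ID-tensor conversion the docstring describes.

import torch

# Hand-built character vocabulary; any subword tokenizer plays the same role.
vocab = {ch: i for i, ch in enumerate("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")}

def word_to_ids(word: str, all_cases: bool = True) -> list:
    # Tokenize lower, title, and upper variants, then map symbols to ID tensors.
    cases = [word.lower(), word.title(), word.upper()] if all_cases else [word]
    return [torch.tensor([vocab[ch] for ch in w]) for w in cases]

word_ids = []
for w in ["nurse", "doctor"]:
    word_ids.extend(word_to_ids(w))
print([t.tolist() for t in word_ids])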
Example n. 51
    def _load_highway(self):
        # pylint: disable=protected-access
        # the highway layers have same dimensionality as the number of cnn
        # filters
        cnn_options = self._options['char_cnn']
        filters = cnn_options['filters']
        n_filters = sum(f[1] for f in filters)
        n_highway = cnn_options['n_highway']

        # create the layers, and load the weights
        self._highways = Highway(n_filters,
                                 n_highway,
                                 activation=torch.nn.functional.relu)
        # Open the weight file once and load every highway layer from it.
        with h5py.File(cached_path(self._weight_file), 'r') as fin:
            for k in range(n_highway):
                # The AllenNLP highway is one matrix multiplication with concatenation of
                # transform and carry weights.
                # The weights are transposed due to multiplication order assumptions in tf
                # vs pytorch (tf.matmul(X, W) vs pytorch.matmul(W, X))
                w_transform = numpy.transpose(fin['CNN_high_{}'.format(k)]['W_transform'][...])
                # -1.0 since AllenNLP is g * x + (1 - g) * f(x) but tf is (1 - g) * x + g * f(x)
                w_carry = -1.0 * numpy.transpose(fin['CNN_high_{}'.format(k)]['W_carry'][...])
                weight = numpy.concatenate([w_transform, w_carry], axis=0)
                self._highways._layers[k].weight.data.copy_(torch.FloatTensor(weight))
                self._highways._layers[k].weight.requires_grad = self.requires_grad

                b_transform = fin['CNN_high_{}'.format(k)]['b_transform'][...]
                b_carry = -1.0 * fin['CNN_high_{}'.format(k)]['b_carry'][...]
                bias = numpy.concatenate([b_transform, b_carry], axis=0)
                self._highways._layers[k].bias.data.copy_(torch.FloatTensor(bias))
                self._highways._layers[k].bias.requires_grad = self.requires_grad
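The -1.0 factors rely on the identity sigmoid(-z) = 1 - sigmoid(z): negating the carry-gate pre-activation swaps which term of the highway output receives the gate, turning tf's (1 - g) * x + g * f(x) into AllenNLP's g * x + (1 - g) * f(x). A small numeric check of that identity (toy numbers only):

import torch

# sigmoid(-z) == 1 - sigmoid(z), so negated carry weights and bias flip the gate roles.
z = torch.tensor([-2.0, 0.0, 1.5, 3.0])
assert torch.allclose(torch.sigmoid(-z), 1.0 - torch.sigmoid(z))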
Example n. 52
    def from_file(
        cls, params_file: Union[str, PathLike], params_overrides: str = "", ext_vars: dict = None
    ) -> "Params":
        """
        Load a `Params` object from a configuration file.

        # Parameters

        params_file: `str`

            The path to the configuration file to load.

        params_overrides: `str`, optional

            A dict of overrides that can be applied to final object.
            e.g. {"model.embedding_dim": 10}

        ext_vars: `dict`, optional

            Our config files are Jsonnet, which allows specifying external variables
            for later substitution. Typically we substitute these using environment
            variables; however, you can also specify them here, in which case they
            take priority over environment variables.
            e.g. {"HOME_DIR": "/Users/allennlp/home"}
        """
        if ext_vars is None:
            ext_vars = {}

        # redirect to cache, if necessary
        params_file = cached_path(params_file)
        ext_vars = {**_environment_variables(), **ext_vars}

        file_dict = json.loads(evaluate_file(params_file, ext_vars=ext_vars))

        overrides_dict = parse_overrides(params_overrides)
        param_dict = with_fallback(preferred=overrides_dict, fallback=file_dict)

        return cls(param_dict)
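A minimal usage sketch, assuming AllenNLP is installed; "experiment.jsonnet" is a placeholder for a real configuration file, and the overrides string follows the format shown in the docstring above.

from allennlp.common.params import Params

# The overrides take precedence over values read from the file; ext_vars take
# precedence over environment variables when substituting Jsonnet externals.
params = Params.from_file(
    "experiment.jsonnet",
    params_overrides='{"model.embedding_dim": 10}',
    ext_vars={"HOME_DIR": "/Users/allennlp/home"},
)
print(params.as_dict())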
Example n. 53
    def _read(self, file_path):
        label_file_path = file_path[:file_path.rfind(
            '.')] + "_attnperformancelabels_" + self.model_folder_name + ".txt"
        label_file = open(label_file_path, 'r')
        with open(cached_path(file_path), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s",
                        file_path)
            columns = data_file.readline().strip('\n').split('\t')
            token_col_inds = [
                columns.index(self._column_titles_to_index[field_ind])
                for field_ind in range(len(self._column_titles_to_index))
            ]
            for line in data_file.readlines():
                if not line:
                    continue
                items = line.strip("\n").split("\t")
                tokens = ''
                for col_ind in token_col_inds:
                    tokens += items[col_ind] + ' '
                tokens = tokens[:-1]
                tokens = items[columns.index("tokens")]
                if len(tokens.strip()) == 0:
                    continue
                instance = self.text_to_instance(tokens=tokens)
                if instance is not None:
                    str_category = label_file.readline().strip()
                    assert str_category != ''
                    instance.fields['label'] = LabelField(str_category)
                    yield instance

        # Sanity check: once the data file is exhausted, the label file should not
        # contain any leftover labels. Only I/O errors are tolerated here; an
        # AssertionError should propagate rather than be silently swallowed.
        try:
            next_label = label_file.readline()
        except OSError:
            next_label = ''
        if isinstance(next_label, str) and len(next_label.strip()) >= 1:
            assert not next_label[0].isdigit(), \
                "We had too many labels corresponding to the given data file " + file_path
        label_file.close()
Example n. 54
    def _load_cnn_weights(self, model_state: Dict = None):
        cnn_options = self._options['char_cnn']
        filters = cnn_options['filters']
        char_embed_dim = cnn_options['embedding']['dim']

        convolutions = []
        for i, (width, num) in enumerate(filters):
            conv = torch.nn.Conv1d(
                    in_channels=char_embed_dim,
                    out_channels=num,
                    kernel_size=width,
                    bias=True
            )
            # load the weights
            if model_state:
                convolution_layer = '_encoder._character_encoder.conv_{}'.format(i)
                w_reshaped = model_state['{}.weight'.format(convolution_layer)]
                bias = model_state['{}.bias'.format(convolution_layer)]
                if w_reshaped.size() != conv.weight.data.shape:
                    raise ValueError("Invalid weight file")
            else:
                with h5py.File(cached_path(self._weight_file), 'r') as fin:
                    weight = fin['CNN']['W_cnn_{}'.format(i)][...]
                    bias = fin['CNN']['b_cnn_{}'.format(i)][...]

                w_reshaped = numpy.transpose(weight.squeeze(axis=0), axes=(2, 1, 0))
                if w_reshaped.shape != tuple(conv.weight.data.shape):
                    raise ValueError("Invalid weight file")
            conv.weight.data.copy_(torch.FloatTensor(w_reshaped))
            conv.bias.data.copy_(torch.FloatTensor(bias))

            conv.weight.requires_grad = self.requires_grad
            conv.bias.requires_grad = self.requires_grad

            convolutions.append(conv)
            self.add_module('char_conv_{}'.format(i), conv)

        self._convolutions = convolutions
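A toy shape check (made-up dimensions) for the transpose above: tf stores a char-CNN kernel as (1, width, char_embed_dim, num_filters), while torch.nn.Conv1d expects (out_channels, in_channels, kernel_size).

import numpy
import torch

# Illustrative sizes only.
char_embed_dim, width, num = 16, 3, 32
weight = numpy.zeros((1, width, char_embed_dim, num), dtype=numpy.float32)

conv = torch.nn.Conv1d(in_channels=char_embed_dim, out_channels=num,
                       kernel_size=width, bias=True)
# squeeze(axis=0) -> (width, char_embed_dim, num); transpose -> (num, char_embed_dim, width)
w_reshaped = numpy.transpose(weight.squeeze(axis=0), axes=(2, 1, 0))
assert w_reshaped.shape == tuple(conv.weight.data.shape)  # (32, 16, 3)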
Example n. 55
    def read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        logger.info("Reading file at %s", file_path)
        with open(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json['data']
        logger.info("Reading the dataset")
        instances = []
        for article in tqdm(dataset):
            for paragraph_json in article['paragraphs']:
                paragraph = paragraph_json["context"]
                tokenized_paragraph = self._tokenizer.tokenize(paragraph)

                for question_answer in paragraph_json['qas']:
                    question_text = question_answer["question"].strip(
                    ).replace("\n", "")
                    answer_texts = [
                        answer['text'] for answer in question_answer['answers']
                    ]
                    span_starts = [
                        answer['answer_start']
                        for answer in question_answer['answers']
                    ]
                    span_ends = [
                        start + len(answer)
                        for start, answer in zip(span_starts, answer_texts)
                    ]
                    instance = self.text_to_instance(
                        question_text, paragraph, zip(span_starts, span_ends),
                        answer_texts, tokenized_paragraph)
                    instances.append(instance)
        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path))
        return Dataset(instances)
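A toy example of the span arithmetic above: the character end offset is the character start plus the answer length, so the resulting span is end-exclusive. The paragraph below is made up.

paragraph = "The quick brown fox jumps over the lazy dog."
answer_texts = ["brown fox"]
span_starts = [paragraph.index("brown fox")]
span_ends = [start + len(answer) for start, answer in zip(span_starts, answer_texts)]
assert paragraph[span_starts[0]:span_ends[0]] == "brown fox"
print(list(zip(span_starts, span_ends)))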
Example n. 56
    def _read(self, file_path: str) -> Iterable[Instance]:
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)

            # Group into alternative divider / sentence chunks.
            for is_divider, lines in Tqdm.tqdm(itertools.groupby(data_file, _is_divider)):
                # Ignore the divider chunks, so that `lines` corresponds to the words
                # of a single sentence.
                if not is_divider:
                    fields = [line.strip().split() for line in lines]
                    # unzipping trick returns tuples, but our Fields need lists
                    tokens, pos_tags, chunk_tags, ner_tags = [list(field) for field in zip(*fields)]
                    # TextField requires ``Token`` objects
                    tokens = [Token(token) for token in tokens]
                    sequence = TextField(tokens, self._token_indexers)

                    instance_fields: Dict[str, Field] = {'tokens': sequence}

                    # Add "feature labels" to instance
                    if 'pos' in self.feature_labels:
                        instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags")
                    if 'chunk' in self.feature_labels:
                        instance_fields['chunk_tags'] = SequenceLabelField(chunk_tags, sequence, "chunk_tags")
                    if 'ner' in self.feature_labels:
                        instance_fields['ner_tags'] = SequenceLabelField(ner_tags, sequence, "ner_tags")

                    # Add "tag label" to instance
                    if self.tag_label == 'ner':
                        instance_fields['tags'] = SequenceLabelField(ner_tags, sequence)
                    elif self.tag_label == 'pos':
                        instance_fields['tags'] = SequenceLabelField(pos_tags, sequence)
                    elif self.tag_label == 'chunk':
                        instance_fields['tags'] = SequenceLabelField(chunk_tags, sequence)

                    yield Instance(instance_fields)
Example n. 57
    def _read(self, file_path: str):
        self._debug_prints = 5
        cached_file_path = cached_path(file_path)

        if file_path.endswith('.gz'):
            data_file = gzip.open(cached_file_path, 'rb')
        else:
            data_file = open(cached_file_path, 'r')

        logger.info("Reading QA instances from jsonl dataset at: %s",
                    file_path)
        item_jsons = []
        for line in data_file:
            item_jsons.append(json.loads(line.strip()))

        if self._sample != -1:
            item_jsons = random.sample(item_jsons, self._sample)
            logger.info("Sampling %d examples", self._sample)

        for item_json in Tqdm.tqdm(item_jsons, total=len(item_jsons)):
            self._debug_prints -= 1
            if self._debug_prints >= 0:
                logger.info(f"====================================")
                logger.info(f"Input json: {item_json}")
            item_id = item_json["id"]

            statement_text = item_json["phrase"]
            metadata = {} if "metadata" not in item_json else item_json[
                "metadata"]
            context = item_json["context"] if "context" in item_json else None

            yield self.text_to_instance(item_id=item_id,
                                        question=statement_text,
                                        answer_id=item_json["answer"],
                                        context=context,
                                        org_metadata=metadata)

        data_file.close()
Example n. 58
    def _read(self, 
              file_path: str) -> Iterable[Instance]:
        # KB_path should be a pickle file of a dictionary
        # if `file_path` is a URL, redirect to the cache
        KB_path = self.KB_path
        file_path = cached_path(file_path)
        dict_entity_lookup = pickle.load(open(KB_path,"rb"))

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)
            
            tmp_instance = []
            for line in data_file.readlines():
                if line.strip()=="==================================================":
                    instance_dict = instance2dict(tmp_instance)
                    
                    question = instance_dict['question']
                    entity_surface = instance_dict['parameters_surface']
                    
                    entity = instance_dict['parameters']
                    try:
                        KB_gloss = dict_entity_lookup[entity]["itemListElement"][0]
                        # useful fields: @type -> list, description, detailedDescription
                        e_type = KB_gloss["result"].get("@type",[])
                        e_descr = [Token(token) for token in word_tokenize(KB_gloss["result"].get("description",[]))]
                        e_detail = [Token(token) for token in word_tokenize(KB_gloss["result"].get("detailedDescription",{}).get("articleBody",[]))]
                    except Exception:  # missing KB entry or malformed gloss
                        KB_gloss = None
                        e_type = []
                        e_descr = []
                        e_detail = []
                    
                    logical_form = instance_dict['logical_form']
                    if instance_dict["question_type"] == self.question_type:
                        yield self.text_to_instance(question, entity, entity_surface, e_type, e_descr, e_detail, logical_form)
                    tmp_instance = []
                else:
                    tmp_instance.append(line)
Example n. 59
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, 'r', encoding='utf8') as eds_file:
            logger.info("Reading EDS instances from conllu dataset at: %s",
                        file_path)
            for ret in lazy_parse(eds_file.read()):
                tokens = ret["tokens"]
                arc_indices = ret["arc_indices"]
                arc_tags = ret["arc_tags"]
                root_id = ret["root_id"]
                lemmas = ret["lemmas"]
                pos_tags = ret["pos_tags"]
                meta_info = ret["meta_info"]
                node_info_dict = ret["node_info_dict"]
                tokens_range = ret["tokens_range"]
                gold_mrps = ret["gold_mrps"]

                concept_node = ret["concept_node"]
                gold_actions = get_oracle_actions(
                    tokens, arc_indices, arc_tags, root_id, concept_node,
                    node_info_dict) if arc_indices else None

                # if len(gold_actions) / len(tokens) > 20:
                #     print(len(gold_actions) / len(tokens))

                if gold_actions and gold_actions[-1] == '-E-':
                    print('-E-', ret["graph_id"])
                    continue

                concept_label_list = list(
                    node_info_dict["node_label_dict"].values())
                yield self.text_to_instance(tokens, lemmas, pos_tags,
                                            arc_indices, arc_tags,
                                            gold_actions, [root_id],
                                            [meta_info], concept_label_list,
                                            tokens_range, [gold_mrps])
Example n. 60
    def _read(self, file_path: str):
        """
        Parameters
        ----------
        file_path : ``str``, required.
            For this dataset reader, file_path can either be a path to a file `or` a
            path to a directory containing json files. This is because some of the
            text2sql datasets require cross validation, which means they are split up
            into many small files, of which you only want to exclude one.
        """
        files = [
            p for p in glob.glob(file_path) if
            self._cross_validation_split_to_exclude not in os.path.basename(p)
        ]
        for path in files:
            split_data = []
            logger.info("Reading instances from lines in file at: %s", path)
            with open(cached_path(path), "r") as data_file:
                data = json.load(data_file)
            for text, sql in text2sql_utils.process_sql_data_standard(
                    data, self._use_prelinked_entities):
                instance = self.text_to_instance(text, sql)
                if instance is not None:
                    split_data.append(instance)
            # augment the data if requested
            if self._aug_ratio > 0:
                random.Random(self._random_seed).shuffle(self._aug_data)
                aug_num = int(self._aug_ratio * len(self._aug_data)) + 1
                aug_data = [
                    self.text_to_instance(" ".join(entry["inp"]),
                                          " ".join(entry["out"]))
                    for entry in self._aug_data[:aug_num]
                ]
                split_data.extend(aug_data)
            # shuffle and yield the instances for this split
            random.Random(self._random_seed).shuffle(split_data)
            for instance in split_data:
                yield instance
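A stand-alone sketch of the cross-validation filtering above; the glob pattern and split name are placeholders, assuming a layout where each split lives in its own json file.

import glob
import os

# Placeholder paths: every file matching the pattern is read except the one
# whose basename contains the excluded split identifier.
file_path = "data/text2sql/*.json"
cross_validation_split_to_exclude = "split_4"

files = [
    p for p in glob.glob(file_path)
    if cross_validation_split_to_exclude not in os.path.basename(p)
]
print(files)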