def main():
    """
    Interactively edit the configuration stored inside a model archive.

    The ``--input-file`` archive is unpacked into a temporary directory, the
    file named ``CONFIG_NAME`` inside it is opened with the chosen editor, and
    the directory is then packed into a new gzip'd tar at ``--output-file``.

    Raises
    ------
    RuntimeError
        If no ``--editor`` is given and ``$EDITOR`` is not set.
    ValueError
        If the output file already exists, or the (cached) input file does not.
    """
    parser = argparse.ArgumentParser(description="Perform surgery on a model.tar.gz archive")
    parser.add_argument("--input-file", required=True)
    parser.add_argument("--output-file", required=True)
    parser.add_argument("--editor")
    args = parser.parse_args()

    # An explicit --editor wins over the environment.
    editor = args.editor or os.environ.get("EDITOR")
    if editor is None:
        raise RuntimeError("please specify an editor or set the $EDITOR environment variable")

    # Refuse to clobber an existing archive.
    if os.path.exists(args.output_file):
        raise ValueError("output file already exists")

    archive_file = cached_path(args.input_file)
    if not os.path.exists(archive_file):
        raise ValueError("input file doesn't exist")

    # Extract archive to temp dir
    tempdir = tempfile.mkdtemp()
    # Register the cleanup *before* extracting, so that the temporary directory
    # is removed at exit even when the archive turns out to be unreadable.
    atexit.register(lambda: shutil.rmtree(tempdir))
    with tarfile.open(archive_file, 'r:gz') as archive:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only run this on archives from a trusted source.
        archive.extractall(tempdir)

    config_path = os.path.join(tempdir, CONFIG_NAME)
    # The editor's exit status is deliberately not checked (as before).
    subprocess.run([editor, config_path])

    with tarfile.open(args.output_file, "w:gz") as tar:
        tar.add(tempdir, arcname=os.path.sep)
def _read(self, file_path: str):
    """
    Yield one instance per question line of a numbered story file.

    Lines containing ``?`` are questions of the form
    ``<id> <question>?\\t<answer>\\t<supporting ids>``; every other line is a
    statement that is appended to the running story context.  A statement
    whose line id is exactly ``1`` starts a new story.

    Parameters
    ----------
    file_path : ``str``
        Path or URL of the data file; resolved through ``cached_path``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    logger.info("Reading file at %s", file_path)
    with open(file_path) as dataset_file:
        dataset = dataset_file.readlines()

    logger.info("Reading the dataset")

    context: List[List[str]] = [[]]
    for line in dataset:
        if '?' in line:
            question_str, answer, supports_str = line.replace('?', ' ?').split('\t')
            # Drop the leading line id.
            question = question_str.split()[1:]
            # Supporting-fact ids are 1-based in the file; shift to 0-based.
            supports = [int(support) - 1 for support in supports_str.split()]

            yield self.text_to_instance(context, question, answer, supports)
        else:
            numbered_tokens = line.replace('.', ' .').split()
            new_entry = numbered_tokens[1:]

            # Compare the whole id token: checking only the first character
            # would also match ids such as 10 or 11 and wrongly discard the
            # context in the middle of a story.
            if numbered_tokens[:1] == ['1']:
                context = [new_entry]
            else:
                context.append(new_entry)
def _read(self, file_path: str):
    """
    Yield one instance per paragraph, bundling all of that paragraph's questions.

    The file is JSON with a top-level ``data`` list of articles; each article
    has ``paragraphs``, each paragraph a ``context`` string and a ``qas`` list.

    Parameters
    ----------
    file_path : ``str``
        Path or URL of the JSON file; resolved through ``cached_path``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    logger.info("Reading file at %s", file_path)
    with open(file_path) as dataset_file:
        dataset = json.load(dataset_file)['data']

    logger.info("Reading the dataset")
    for article in dataset:
        for paragraph_json in article['paragraphs']:
            paragraph = paragraph_json["context"]
            tokenized_paragraph = self._tokenizer.tokenize(paragraph)
            qas = paragraph_json['qas']

            instance_ids = [qa['id'] for qa in qas]
            question_text_list = [qa["question"].strip().replace("\n", "") for qa in qas]
            answer_texts_list = [[answer['text'] for answer in qa['answers']] for qa in qas]
            metadata = {
                "instance_id": instance_ids,
                "question": question_text_list,
                'answer_texts_list': answer_texts_list,
            }

            span_starts_list = [[answer['answer_start'] for answer in qa['answers']] for qa in qas]
            # Each end offset is the start offset plus the answer text length.
            span_ends_list = [
                [start + len(text) for start, text in zip(starts, texts)]
                for starts, texts in zip(span_starts_list, answer_texts_list)
            ]

            yesno_list = [str(qa['yesno']) for qa in qas]
            followup_list = [str(qa['followup']) for qa in qas]

            yield self.text_to_instance(question_text_list,
                                        paragraph,
                                        span_starts_list,
                                        span_ends_list,
                                        tokenized_paragraph,
                                        yesno_list,
                                        followup_list,
                                        metadata)
def _read(self, file_path: str):
    """
    Yield one instance per line of a bracket-annotated coreference file.

    Each line is a space-tokenised sentence in which one coreferent group is
    marked with ``[square brackets]`` and another with ``(round brackets)``.
    Spans are recorded as inclusive ``(start, end)`` token indices; cluster
    ``0`` holds the square-bracket spans and cluster ``1`` the round-bracket
    ones.

    Parameters
    ----------
    file_path : ``str``
        Path or URL of the data file; resolved through ``cached_path``.
    """
    # Open with a context manager so the handle is released once the
    # generator is exhausted or closed (it was previously never closed).
    with open(cached_path(file_path), "r") as data_file:
        for sentence in data_file:
            tokens = sentence.strip().split(" ")
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
            words = []
            for index, token in enumerate(tokens):
                # Coreference is annotated using [square brackets]
                # or (round brackets) around coreferent phrases.
                for cluster_id, (opener, closer) in enumerate((("[", "]"), ("(", ")"))):
                    if opener in token:
                        # Start a span here; a token holding both brackets
                        # is a complete single-token span.
                        clusters[cluster_id].append((index, index))
                    elif closer in token:
                        # Close the most recently opened span of this kind.
                        span_start = clusters[cluster_id][-1][0]
                        clusters[cluster_id][-1] = (span_start, index)

                if token.endswith("."):
                    # Winobias is tokenised, but not for full stops.
                    # We'll just special case them here.
                    token = token[:-1]
                    words.append(token.strip("[]()"))
                    words.append(".")
                else:
                    words.append(token.strip("[]()"))

            yield self.text_to_instance([Token(x) for x in words], list(clusters.values()))
def _load_cnn_weights(self):
    """
    Build one ``Conv1d`` per configured character filter and load its weights.

    Filters come from ``self._options['char_cnn']['filters']`` as
    ``(width, num)`` pairs.  Weights are read from the ``CNN`` group of the
    HDF5 weight file (``W_cnn_<i>`` / ``b_cnn_<i>``).  Each convolution is
    registered as the submodule ``char_conv_<i>`` and the full list is stored
    on ``self._convolutions``.

    Raises
    ------
    ValueError
        If a stored kernel does not have the shape the convolution expects.
    """
    cnn_options = self._options['char_cnn']
    filters = cnn_options['filters']
    char_embed_dim = cnn_options['embedding']['dim']

    convolutions = []
    # Resolve and open the weight file once rather than once per filter.
    with h5py.File(cached_path(self._weight_file), 'r') as fin:
        for i, (width, num) in enumerate(filters):
            conv = torch.nn.Conv1d(
                    in_channels=char_embed_dim,
                    out_channels=num,
                    kernel_size=width,
                    bias=True
            )
            # load the weights
            weight = fin['CNN']['W_cnn_{}'.format(i)][...]
            bias = fin['CNN']['b_cnn_{}'.format(i)][...]

            # Drop the leading singleton axis and reverse the remaining axes
            # so the array lines up with ``conv.weight``.
            w_reshaped = numpy.transpose(weight.squeeze(axis=0), axes=(2, 1, 0))
            if w_reshaped.shape != tuple(conv.weight.data.shape):
                raise ValueError("Invalid weight file")
            conv.weight.data.copy_(torch.FloatTensor(w_reshaped))
            conv.bias.data.copy_(torch.FloatTensor(bias))

            conv.weight.requires_grad = self.requires_grad
            conv.bias.requires_grad = self.requires_grad

            convolutions.append(conv)
            self.add_module('char_conv_{}'.format(i), conv)

    self._convolutions = convolutions
def _load_highway(self):
    """
    Create the highway layers and load their weights from the weight file.

    For every layer ``k`` the ``CNN_high_<k>`` group of the HDF5 file supplies
    ``W_transform``, ``W_carry``, ``b_transform`` and ``b_carry``.  Transform
    and carry parameters are concatenated (the carry part negated) and copied
    into ``self._highways._layers[k]``.
    """
    # pylint: disable=protected-access
    # the highway layers have same dimensionality as the number of cnn filters
    cnn_options = self._options['char_cnn']
    filters = cnn_options['filters']
    n_filters = sum(f[1] for f in filters)
    n_highway = cnn_options['n_highway']

    # create the layers, and load the weights
    self._highways = Highway(n_filters, n_highway, activation=torch.nn.functional.relu)
    # Resolve and open the weight file once rather than once per layer.
    with h5py.File(cached_path(self._weight_file), 'r') as fin:
        for k in range(n_highway):
            # The AllenNLP highway is one matrix multiplication with concatenation of
            # transform and carry weights.
            group = fin['CNN_high_{}'.format(k)]
            layer = self._highways._layers[k]

            # The weights are transposed due to multiplication order assumptions in tf
            # vs pytorch (tf.matmul(X, W) vs pytorch.matmul(W, X))
            w_transform = numpy.transpose(group['W_transform'][...])
            # -1.0 since AllenNLP is g * x + (1 - g) * f(x) but tf is (1 - g) * x + g * f(x)
            w_carry = -1.0 * numpy.transpose(group['W_carry'][...])
            weight = numpy.concatenate([w_transform, w_carry], axis=0)
            layer.weight.data.copy_(torch.FloatTensor(weight))
            layer.weight.requires_grad = self.requires_grad

            b_transform = group['b_transform'][...]
            b_carry = -1.0 * group['b_carry'][...]
            bias = numpy.concatenate([b_transform, b_carry], axis=0)
            layer.bias.data.copy_(torch.FloatTensor(bias))
            layer.bias.requires_grad = self.requires_grad
def _read(self, file_path: str):
    """
    Yield one instance per question in a JSON question-answering file.

    The file has a top-level ``data`` list of articles; each article has
    ``paragraphs`` with a ``context`` string and a ``qas`` list.  The paragraph
    is tokenized once and shared by all of its questions.

    Parameters
    ----------
    file_path : ``str``
        Path or URL of the JSON file; resolved through ``cached_path``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    logger.info("Reading file at %s", file_path)
    with open(file_path) as dataset_file:
        dataset = json.load(dataset_file)['data']

    logger.info("Reading the dataset")
    for article in dataset:
        for paragraph_json in article['paragraphs']:
            paragraph = paragraph_json["context"]
            tokenized_paragraph = self._tokenizer.tokenize(paragraph)

            for qa in paragraph_json['qas']:
                question_text = qa["question"].strip().replace("\n", "")
                answers = qa['answers']
                answer_texts = [entry['text'] for entry in answers]
                span_starts = [entry['answer_start'] for entry in answers]
                # End offset = start offset + answer text length.
                span_ends = [begin + len(text) for begin, text in zip(span_starts, answer_texts)]
                # NOTE: the spans are handed over as a one-shot ``zip`` iterator.
                char_spans = zip(span_starts, span_ends)
                yield self.text_to_instance(question_text,
                                            paragraph,
                                            char_spans,
                                            answer_texts,
                                            tokenized_paragraph)
def _read(self, file_path):
    """
    Yield one instance per well-formed row of a tab-separated file.

    Only rows with exactly four columns are used: column 0 is the label,
    column 1 the premise and column 2 the hypothesis.  Other rows are skipped
    silently.

    Parameters
    ----------
    file_path : ``str``
        Path or URL of the TSV file; resolved through ``cached_path``.
    """
    logger.info("Reading instances from lines in file at: %s", file_path)
    with open(cached_path(file_path), "r") as data_file:
        for row in csv.reader(data_file, delimiter='\t'):
            if len(row) != 4:
                continue
            label, premise, hypothesis, _ = row
            yield self.text_to_instance(premise=premise, hypothesis=hypothesis, label=label)
def from_file(params_file: str, params_overrides: str = "", ext_vars: dict = None) -> 'Params':
    """
    Build a ``Params`` object from a configuration file.

    Parameters
    ----------
    params_file : ``str``
        Path (or URL) of the configuration file; resolved through ``cached_path``.
    params_overrides : ``str``, optional
        Overrides applied on top of the file contents,
        e.g. {"model.embedding_dim": 10}
    ext_vars : ``dict``, optional
        External variables handed to the config evaluator.  The process
        environment is always supplied; entries given here take priority over
        environment variables of the same name,
        e.g. {"HOME_DIR": "/Users/allennlp/home"}

    Returns
    -------
    ``Params``
        The file contents with the overrides taking precedence.
    """
    explicit_vars = {} if ext_vars is None else ext_vars

    # redirect to cache, if necessary
    params_file = cached_path(params_file)

    # Later entries win, so explicit variables shadow the environment.
    merged_vars = dict(os.environ)
    merged_vars.update(explicit_vars)

    file_dict = json.loads(evaluate_file(params_file, ext_vars=merged_vars))
    overrides_dict = parse_overrides(params_overrides)

    return Params(with_fallback(preferred=overrides_dict, fallback=file_dict))
def _read(self, file_path: str):
    """
    Yield instances for the question file ``file_path`` stored inside a tarball.

    The question JSON is read from the unfiltered tarball when ``file_path``
    contains ``'unfiltered'``, otherwise from ``qa/<file_path>`` in the base
    tarball.  Evidence documents always come from the base tarball:
    ``evidence/web`` when ``file_path`` contains ``'web'``, else
    ``evidence/wikipedia``.  Paragraphs in which no answer span can be found
    are skipped.

    Parameters
    ----------
    file_path : ``str``
        Name of the question file inside the tarball.
    """
    logger.info("Opening base tarball file at %s", self._base_tarball_path)
    # Both archives are context-managed so their handles are released when the
    # generator finishes or is closed (they were previously never closed).
    with tarfile.open(cached_path(self._base_tarball_path), 'r') as base_tarball:
        if 'unfiltered' in file_path:
            logger.info("Opening unfiltered tarball file at %s", self._unfiltered_tarball_path)
            with tarfile.open(cached_path(self._unfiltered_tarball_path), 'r') as unfiltered_tarball:
                logger.info("Loading question file from tarball")
                data_json = json.loads(unfiltered_tarball.extractfile(file_path).read().decode('utf-8'))
        else:
            logger.info("Loading question file from tarball")
            path = os.path.join('qa', file_path)
            data_json = json.loads(base_tarball.extractfile(path).read().decode('utf-8'))

        # The evidence source depends only on the file name, so pick it once.
        if 'web' in file_path:
            results_key, evidence_subdir = 'SearchResults', 'web'
        else:
            results_key, evidence_subdir = 'EntityPages', 'wikipedia'

        logger.info("Reading the dataset")
        for question_json in data_json['Data']:
            question_text = question_json['Question']
            question_tokens = self._tokenizer.tokenize(question_text)

            evidence_files: List[List[str]] = []  # contains lines from each evidence file
            for result in question_json[results_key]:
                filename = result['Filename']
                evidence_file = base_tarball.extractfile(os.path.join("evidence", evidence_subdir, filename))
                evidence_files.append([line.decode('utf-8') for line in evidence_file.readlines()])

            answer_json = question_json['Answer']
            human_answers = [util.normalize_text(answer) for answer in answer_json.get('HumanAnswers', [])]
            answer_texts = answer_json['NormalizedAliases'] + human_answers
            for paragraph in self.pick_paragraphs(evidence_files, question_text, answer_texts):
                paragraph_tokens = self._tokenizer.tokenize(paragraph)
                token_spans = util.find_valid_answer_spans(paragraph_tokens, answer_texts)
                if not token_spans:
                    # For now, we'll just ignore instances that we can't find answer spans for.
                    # Maybe we can do something smarter here later, but this will do for now.
                    continue
                instance = self.text_to_instance(question_text,
                                                 paragraph,
                                                 token_spans,
                                                 answer_texts,
                                                 question_tokens,
                                                 paragraph_tokens)
                yield instance
def load_weights(self, weight_file: str) -> None:
    """
    Load the pre-trained weights from the file.

    For every (layer, direction) pair the parameters stored under
    ``RNN_<direction>/RNN/MultiRNNCell/Cell<layer>/LSTMCell`` are copied into
    the corresponding LSTM cell's input, state and projection linearities.

    Parameters
    ----------
    weight_file : ``str``
        Path or URL of the HDF5 weight file; resolved through ``cached_path``.
    """
    requires_grad = self.requires_grad

    with h5py.File(cached_path(weight_file), 'r') as fin:
        layer_pairs = zip(self.forward_layers, self.backward_layers)
        for i_layer, lstms in enumerate(layer_pairs):
            for j_direction, lstm in enumerate(lstms):
                # lstm is an instance of LSTMCellWithProjection
                cell_size = lstm.cell_size
                # Row ranges of the second and third gate blocks, which are
                # the two blocks that get swapped below.
                second_gate = slice(1 * cell_size, 2 * cell_size)
                third_gate = slice(2 * cell_size, 3 * cell_size)

                cell_group = fin['RNN_%s' % j_direction]['RNN']['MultiRNNCell']
                dataset = cell_group['Cell%s' % i_layer]['LSTMCell']

                # tensorflow packs together both W and U matrices into one matrix,
                # but pytorch maintains individual matrices.  In addition, tensorflow
                # packs the gates as input, memory, forget, output but pytorch
                # uses input, forget, memory, output.  So we need to modify the weights.
                tf_weights = numpy.transpose(dataset['W_0'][...])
                torch_weights = tf_weights.copy()

                # split the W from U matrices
                input_size = lstm.input_size
                input_weights = torch_weights[:, :input_size]
                recurrent_weights = torch_weights[:, input_size:]
                tf_input_weights = tf_weights[:, :input_size]
                tf_recurrent_weights = tf_weights[:, input_size:]

                # handle the different gate order convention
                weight_pairs = ((input_weights, tf_input_weights),
                                (recurrent_weights, tf_recurrent_weights))
                for torch_w, tf_w in weight_pairs:
                    torch_w[second_gate, :] = tf_w[third_gate, :]
                    torch_w[third_gate, :] = tf_w[second_gate, :]

                lstm.input_linearity.weight.data.copy_(torch.FloatTensor(input_weights))
                lstm.state_linearity.weight.data.copy_(torch.FloatTensor(recurrent_weights))
                lstm.input_linearity.weight.requires_grad = requires_grad
                lstm.state_linearity.weight.requires_grad = requires_grad

                # the bias weights
                tf_bias = dataset['B'][...]
                # tensorflow adds 1.0 to forget gate bias instead of modifying the
                # parameters...
                tf_bias[third_gate] += 1
                torch_bias = tf_bias.copy()
                torch_bias[second_gate] = tf_bias[third_gate]
                torch_bias[third_gate] = tf_bias[second_gate]
                lstm.state_linearity.bias.data.copy_(torch.FloatTensor(torch_bias))
                lstm.state_linearity.bias.requires_grad = requires_grad

                # the projection weights
                proj_weights = numpy.transpose(dataset['W_P_0'][...])
                lstm.state_projection.weight.data.copy_(torch.FloatTensor(proj_weights))
                lstm.state_projection.weight.requires_grad = requires_grad
def _read_pretrained_words(embeddings_filename: str) -> Set[str]:
    """
    Collect the vocabulary of a gzip-compressed embeddings file.

    Each line is decoded as UTF-8 and stripped; the text before the first
    space is taken as the word.

    Parameters
    ----------
    embeddings_filename : ``str``
        Path or URL of the gzip file; resolved through ``cached_path``.

    Returns
    -------
    ``Set[str]``
        The distinct first fields of all lines.
    """
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        return {
            raw_line.decode('utf-8').strip().split(' ')[0]
            for raw_line in embeddings_file
        }
def _open_inside_zip(self, archive_path: str, member_path: Optional[str] = None) -> None:
    """
    Open a text member of a ZIP archive and store the resulting handles.

    On success ``self._handle`` is a text wrapper (using ``self._encoding``)
    around the member and ``self._archive_handle`` is the open archive.

    Parameters
    ----------
    archive_path : ``str``
        Path or URL of the ZIP archive; resolved through ``cached_path``.
    member_path : ``str``, optional
        Name of the member to open.  When omitted, the member is chosen by
        ``self._get_the_only_file_in_the_archive``.
    """
    cached_archive_path = cached_path(archive_path, cache_dir=self._cache_dir)
    archive = zipfile.ZipFile(cached_archive_path, 'r')
    try:
        if member_path is None:
            members_list = archive.namelist()
            member_path = self._get_the_only_file_in_the_archive(members_list, archive_path)
        member_path = cast(str, member_path)
        member_file = archive.open(member_path, 'r')
        handle = io.TextIOWrapper(member_file, encoding=self._encoding)
    except Exception:
        # Nothing will own the archive if we fail here, so release it
        # instead of leaking the open file handle.
        archive.close()
        raise
    self._handle = handle
    self._archive_handle = archive
def test_atis_keep_unparseable(self):
    """An unparseable SQL label must still yield an instance when ``keep_if_unparseable`` is set."""
    database_file = cached_path("https://s3-us-west-2.amazonaws.com/allennlp/datasets/atis/atis.db")
    reader = AtisDatasetReader(database_file=database_file, keep_if_unparseable=True)

    utterances = ['show me the one way flights from detroit me to westchester county']
    unparseable_labels = ['this is not a query that can be parsed']
    instance = reader.text_to_instance(utterances=utterances,
                                       sql_query_labels=unparseable_labels)

    # If we have a query that can't be parsed, we check that it only has one element in the
    # list of index fields and that index is the padding index, -1.
    action_fields = instance.fields['target_action_sequence'].field_list
    assert len(action_fields) == 1
    assert action_fields[0].sequence_index == -1
def __init__(self,
             encoder: Dict[str, int] = None,
             byte_pairs: List[Tuple[str, str]] = None,
             n_ctx: int = 512,
             model_path: str = None,
             namespace: str = 'openai_transformer',
             tokens_to_add: List[str] = None) -> None:
    """
    Set up the byte-pair vocabulary, either from explicit tables or from an archive.

    Parameters
    ----------
    encoder : ``Dict[str, int]``, optional
        Mapping from word piece to id.  Must be given together with
        ``byte_pairs`` and not together with ``model_path``.
    byte_pairs : ``List[Tuple[str, str]]``, optional
        Merge pairs in rank order.
    n_ctx : ``int``, optional (default = 512)
        Stored unchanged on ``self.n_ctx``.
    model_path : ``str``, optional
        Path or URL of an archive containing a member whose name includes
        ``encoder_bpe`` and a member whose name ends with ``.bpe``.
    namespace : ``str``, optional
        Stored unchanged on ``self._namespace``.
    tokens_to_add : ``List[str]``, optional
        Extra tokens; each is added to the encoder as ``token + '</w>'`` with
        the next free id.

    Raises
    ------
    ConfigurationError
        If both or neither of ``model_path`` / ``(encoder, byte_pairs)`` are
        given, or if the archive lacks one of the expected members.
    """
    self._namespace = namespace
    self._added_to_vocabulary = False

    too_much_information = model_path and (encoder or byte_pairs)
    too_little_information = not model_path and not (encoder and byte_pairs)

    if too_much_information or too_little_information:
        raise ConfigurationError("must specify either model path or (encoder + byte_pairs) but not both")

    if model_path:
        model_path = cached_path(model_path)

        # Load encoder and byte_pairs from tar.gz
        with tarfile.open(model_path) as tmp:
            member_names = [m.name for m in tmp.getmembers()]

            # Use a default of None so that a missing member reports a
            # ConfigurationError instead of escaping as a bare StopIteration.
            encoder_name = next((name for name in member_names if 'encoder_bpe' in name), None)
            encoder_info = tmp.extractfile(encoder_name) if encoder_name is not None else None

            if encoder_info:
                encoder = json.loads(encoder_info.read())
            else:
                raise ConfigurationError(f"expected encoder_bpe file in archive {model_path}")

            bpe_name = next((name for name in member_names if name.endswith('.bpe')), None)
            bpe_info = tmp.extractfile(bpe_name) if bpe_name is not None else None

            if bpe_info:
                # First line is "version", last line is blank
                lines = bpe_info.read().decode('utf-8').split('\n')[1:-1]
                # Convert "b1 b2" -> (b1, b2)
                byte_pairs = [tuple(line.split()) for line in lines]  # type: ignore
            else:
                raise ConfigurationError(f"expected .bpe file in archive {model_path}")

    if tokens_to_add is not None:
        # Work on a copy so that the caller's dictionary is not modified.
        encoder = dict(encoder)
        for token in tokens_to_add:
            encoder[token + '</w>'] = len(encoder)
        self.tokens_to_add = set(tokens_to_add)
    else:
        self.tokens_to_add = None

    self.encoder = encoder
    self.decoder = {word_id: word for word, word_id in self.encoder.items()}

    # Compute ranks
    self.bpe_ranks = {pair: idx for idx, pair in enumerate(byte_pairs)}

    self.cache: Dict[str, List[str]] = {}
    self.n_ctx = n_ctx
def _read(self, file_path: str):
    """
    Yield one NER instance per sentence found under ``file_path``.

    Sentences are produced by ``self._ontonotes_subset``, which is given
    ``self._domain_identifier`` as its filter argument.  Every word is passed
    through ``_normalize_word`` before being wrapped in a ``Token``.

    Parameters
    ----------
    file_path : ``str``
        Path or URL of the dataset; resolved through ``cached_path``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading Fine-Grained NER instances from dataset files at: %s", file_path)

    domain = self._domain_identifier
    if domain is not None:
        logger.info("Filtering to only include file paths containing the %s domain", domain)

    sentences = self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier)
    for sentence in sentences:
        tokens = [Token(_normalize_word(word)) for word in sentence.words]
        yield self.text_to_instance(tokens, sentence.named_entities)
def _open_inside_tar(self, archive_path: str, member_path: Optional[str] = None) -> None:
    """
    Open a text member of a TAR archive and store the resulting handles.

    On success ``self._handle`` is a text wrapper (using ``self._encoding``)
    around the member and ``self._archive_handle`` is the open archive.

    Parameters
    ----------
    archive_path : ``str``
        Path or URL of the TAR archive; resolved through ``cached_path``.
    member_path : ``str``, optional
        Name of the member to open.  When omitted, the member is chosen by
        ``self._get_the_only_file_in_the_archive``.

    Raises
    ------
    KeyError
        Raised by ``TarFile.getmember`` when the member is not present.
    """
    cached_archive_path = cached_path(archive_path, cache_dir=self._cache_dir)
    archive = tarfile.open(cached_archive_path, 'r')
    try:
        if member_path is None:
            members_list = archive.getnames()
            member_path = self._get_the_only_file_in_the_archive(members_list, archive_path)
        member_path = cast(str, member_path)
        member = archive.getmember(member_path)   # raises exception if not present
        member_file = cast(IO[bytes], archive.extractfile(member))
        handle = io.TextIOWrapper(member_file, encoding=self._encoding)
    except Exception:
        # Nothing will own the archive if we fail here, so release it
        # instead of leaking the open file handle.
        archive.close()
        raise
    self._handle = handle
    self._archive_handle = archive
def from_file(params_file: str, params_overrides: str = "") -> 'Params':
    """
    Build a ``Params`` object from a configuration file.

    Parameters
    ----------
    params_file : ``str``
        Path or URL of the configuration file; resolved through ``cached_path``.
    params_overrides : ``str``, optional
        Configuration text parsed and layered on top of the file contents.

    Returns
    -------
    ``Params``
        The overrides, falling back to the file for everything else.
    """
    # redirect to cache, if necessary
    local_params_file = cached_path(params_file)

    config_factory = pyhocon.ConfigFactory
    file_dict = config_factory.parse_file(local_params_file)
    overrides_dict = config_factory.parse_string(params_overrides)

    return Params(overrides_dict.with_fallback(file_dict))
def __init__(self,
             file_uri: str,
             encoding: str = DEFAULT_ENCODING,
             cache_dir: str = None) -> None:
    """
    Open an embeddings text file, possibly compressed or inside an archive.

    After construction ``self._iterator`` yields the data lines of the file
    and ``self.num_tokens`` holds whatever
    ``_get_num_tokens_from_first_line`` extracted from the first line.

    Parameters
    ----------
    file_uri : ``str``
        URI of the file; split by ``parse_embeddings_file_uri`` into the main
        file and an optional path inside an archive.
    encoding : ``str``, optional
        Text encoding used to decode the file.
    cache_dir : ``str``, optional
        Passed on to ``cached_path``.

    Raises
    ------
    ValueError
        If a path inside an archive is requested but the main file is neither
        a ZIP nor a TAR archive.
    """
    self.uri = file_uri
    self._encoding = encoding
    self._cache_dir = cache_dir
    self._archive_handle: Any = None   # only if the file is inside an archive

    main_file_uri, path_inside_archive = parse_embeddings_file_uri(file_uri)
    main_file_local_path = cached_path(main_file_uri, cache_dir=cache_dir)

    if zipfile.is_zipfile(main_file_local_path):  # ZIP archive
        self._open_inside_zip(main_file_uri, path_inside_archive)

    elif tarfile.is_tarfile(main_file_local_path):  # TAR archive
        self._open_inside_tar(main_file_uri, path_inside_archive)

    else:  # all the other supported formats, including uncompressed files
        if path_inside_archive:
            # The URI is interpolated into the message; concatenating it
            # after the placeholder left a literal "%s" in the text.
            raise ValueError(f'Unsupported archive format: {main_file_uri}')

        # All the python packages for compressed files share the same interface of io.open
        extension = get_file_extension(main_file_uri)
        package = {
                '.txt': io,
                '.vec': io,
                '.gz': gzip,
                '.bz2': bz2,
                '.lzma': lzma,
                }.get(extension, None)

        if package is None:
            logger.warning('The embeddings file has an unknown file extension "%s". '
                           'We will assume the file is an (uncompressed) text file', extension)
            package = io

        self._handle = package.open(main_file_local_path, 'rt', encoding=encoding)  # type: ignore

    # To use this with tqdm we'd like to know the number of tokens. It's possible that the
    # first line of the embeddings file contains this: if it does, we want to start iteration
    # from the 2nd line, otherwise we want to start from the 1st.
    # Unfortunately, once we read the first line, we cannot move back the file iterator
    # because the underlying file may be "not seekable"; we use itertools.chain instead.
    first_line = next(self._handle)     # this moves the iterator forward
    self.num_tokens = EmbeddingsTextFile._get_num_tokens_from_first_line(first_line)
    if self.num_tokens:
        # the first line is a header line: start iterating from the 2nd line
        self._iterator = self._handle
    else:
        # the first line is not a header line: start iterating from the 1st line
        self._iterator = itertools.chain([first_line], self._handle)
def _load_char_embedding(self):
    """
    Load the character embedding table into ``self._char_embedding_weights``.

    The ``char_embed`` array from the weight file is copied into rows
    ``1..`` of a float32 matrix that has one extra, all-zero row at index 0.
    """
    with h5py.File(cached_path(self._weight_file), 'r') as fin:
        char_embed_weights = fin['char_embed'][...]

    padded_shape = (char_embed_weights.shape[0] + 1, char_embed_weights.shape[1])
    weights = numpy.zeros(padded_shape, dtype='float32')
    # Row 0 stays zero; the stored rows are shifted down by one.
    weights[1:, :] = char_embed_weights

    self._char_embedding_weights = torch.nn.Parameter(
            torch.FloatTensor(weights), requires_grad=self.requires_grad
    )
def _read(self, file_path: str):
    """
    Yield one instance per annotated sentence that has at least one arc.

    The whole file is read and handed to ``lazy_parse``, which produces
    ``(annotated_sentence, directed_arc_indices, arc_tags)`` triples.

    Parameters
    ----------
    file_path : ``str``
        Path or URL of the data file; resolved through ``cached_path``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    logger.info("Reading semantic dependency parsing data from: %s", file_path)

    with open(file_path) as sdp_file:
        sdp_text = sdp_file.read()
        for annotated_sentence, directed_arc_indices, arc_tags in lazy_parse(sdp_text):
            # Only sentences with arc indices produce an instance.
            if directed_arc_indices:
                tokens = [word["form"] for word in annotated_sentence]
                pos_tags = [word["pos"] for word in annotated_sentence]
                yield self.text_to_instance(tokens, pos_tags, directed_arc_indices, arc_tags)
def _read(self, file_path):
    """
    Yield one instance per ``source<TAB>target`` line.

    Empty lines and lines whose source part is empty are skipped.

    Parameters
    ----------
    file_path : ``str``
        Path or URL of the data file; resolved through ``cached_path``.

    Raises
    ------
    RuntimeError
        If a non-empty line does not consist of exactly two tab-separated parts.
    """
    with open(cached_path(file_path), "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        # Number lines from 1 so that error messages match editor line numbers.
        for human_line_num, raw_line in enumerate(data_file, start=1):
            text = raw_line.strip("\n")
            if not text:
                continue

            fields = text.split('\t')
            if len(fields) != 2:
                raise RuntimeError("Invalid line format: %s (line number %d)" % (text, human_line_num))

            source_sequence, target_sequence = fields
            if source_sequence:
                yield self.text_to_instance(source_sequence, target_sequence)
def _load_projection(self):
    """
    Create ``self._projection`` and load its parameters from the weight file.

    The layer maps from the total number of character filters to
    ``self.output_dim``; weights come from ``CNN_proj/W_proj`` (transposed)
    and ``CNN_proj/b_proj``.
    """
    filter_specs = self._options['char_cnn']['filters']
    n_filters = sum(spec[1] for spec in filter_specs)

    projection = torch.nn.Linear(n_filters, self.output_dim, bias=True)
    self._projection = projection

    with h5py.File(cached_path(self._weight_file), 'r') as fin:
        proj_group = fin['CNN_proj']
        weight = proj_group['W_proj'][...]
        bias = proj_group['b_proj'][...]
        projection.weight.data.copy_(torch.FloatTensor(numpy.transpose(weight)))
        projection.bias.data.copy_(torch.FloatTensor(bias))

        projection.weight.requires_grad = self.requires_grad
        projection.bias.requires_grad = self.requires_grad
def _read(self, file_path):
    """
    Yield one instance per bracketed parse tree in the file.

    Trees are read with ``BracketParseCorpusReader``, passed through
    ``self._strip_functional_tags``, and unwrapped when their top label is
    ``VROOT``.  POS tags are supplied only if ``self._use_pos_tags`` is set.

    Parameters
    ----------
    file_path : ``str``
        Path or URL of the treebank file; resolved through ``cached_path``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    directory, filename = os.path.split(file_path)
    logger.info("Reading instances from lines in file at: %s", file_path)

    corpus = BracketParseCorpusReader(root=directory, fileids=[filename])
    for parse in corpus.parsed_sents():
        self._strip_functional_tags(parse)

        # This is un-needed and clutters the label space.
        # All the trees also contain a root S node.
        if parse.label() == "VROOT":
            parse = parse[0]

        if self._use_pos_tags:
            pos_tags = [word_and_tag[1] for word_and_tag in parse.pos()]
        else:
            pos_tags = None
        yield self.text_to_instance(parse.leaves(), pos_tags, parse)
def test_cached_path(self):
    """``cached_path`` rejects bad inputs, passes local paths through, and caches URLs."""
    url = 'http://fake.datastore.com/glove.txt.gz'
    set_up_glove(url, self.glove_bytes)

    # non-existent file
    with pytest.raises(FileNotFoundError):
        cached_path("tests/fixtures/does_not_exist/fake_file.tar.gz")

    # unparsable URI
    with pytest.raises(ValueError):
        cached_path("fakescheme://path/to/fake/file.tar.gz")

    # existing file as path
    assert cached_path(self.glove_file) == self.glove_file

    # caches urls
    cached_filename = cached_path(url, cache_dir=self.TEST_DIR)

    assert len(responses.calls) == 2
    expected_filename = os.path.join(self.TEST_DIR, url_to_filename(url, etag="0"))
    assert cached_filename == expected_filename

    with open(cached_filename, 'rb') as cached_file:
        assert cached_file.read() == self.glove_bytes
def _read(self, file_path: str):
    """
    Yield instances from a JSON-lines question file.

    Each non-empty line is parsed as JSON and expanded by ``self.preprocess``
    into a list of question dicts.  Questions are dropped when
    ``self._skip_attributes_regex`` matches their first logical form or when
    ``self._compatible_question`` rejects them.  Reading stops once the
    countdown started at ``self._sample`` reaches zero.

    Parameters
    ----------
    file_path : ``str``
        Path or URL of the data file; resolved through ``cached_path``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    # Extra information is logged while this counter, decremented once per
    # parsed line, is still positive.
    debug_counter = 5
    lines_left = self._sample

    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file: %s", file_path)
        for raw_line in tqdm.tqdm(data_file):
            lines_left -= 1
            if lines_left == 0:
                break

            stripped = raw_line.strip("\n")
            if not stripped:
                continue

            question_data_list = self.preprocess(json.loads(stripped))

            debug_counter -= 1
            verbose = debug_counter > 0
            if verbose:
                logger.info(f'question_data_list = {question_data_list}')

            for question_data in question_data_list:
                question = question_data['question']
                question_id = question_data['id']
                logical_forms = question_data['logical_forms']

                # Skip examples with certain attributes
                skip_regex = self._skip_attributes_regex
                if skip_regex is not None and skip_regex.search(logical_forms[0]):
                    continue
                # Somewhat hacky filtering to "friction" subset of questions based on id
                if not self._compatible_question(question_data):
                    continue

                if verbose:
                    logger.info(f'logical_forms = {logical_forms}')

                answer_index = question_data['answer_index']
                world_extractions = question_data.get('world_extractions')
                entity_literals = question_data.get('entity_literals')
                if not (entity_literals is None or world_extractions is None):
                    # This will catch flipped worlds if need be
                    entity_literals.update(world_extractions)

                additional_metadata = {'id': question_id,
                                       'question': question,
                                       'answer_index': answer_index,
                                       'logical_forms': logical_forms}

                yield self.text_to_instance(question, logical_forms,
                                            additional_metadata, world_extractions,
                                            entity_literals, debug_counter=debug_counter)
def _execute_logical_form_on_table(logical_form: str, table: str):
    """
    The parameters are written out to files which the jar file reads and then executes the
    logical form.

    Parameters
    ----------
    logical_form : ``str``
        The logical form to execute; written to ``logical_forms.txt``.
    table : ``str``
        The table contents in tsv format; written to ``tsv/context.tsv``.

    Returns
    -------
    The second tab-separated field of the first line of the denotations file,
    or the first field when that line has only one.
    """
    # Make sure the working directories exist before anything is written to them.
    table_dir = os.path.join(SEMPRE_DIR, 'tsv/')
    os.makedirs(table_dir, exist_ok=True)

    logical_form_filename = os.path.join(SEMPRE_DIR, 'logical_forms.txt')
    with open(logical_form_filename, 'w') as temp_file:
        temp_file.write(logical_form + '\n')

    # The .tsv file extension is important here since the table string parameter is in tsv format.
    # If this file was named with suffix .csv then Sempre would interpret it as comma separated
    # and return the wrong denotation.
    table_filename = 'context.tsv'
    with open(os.path.join(table_dir, table_filename), 'w', encoding='utf-8') as temp_file:
        temp_file.write(table)

    # The id, target, and utterance are ignored, we just need to get the
    # table filename into sempre's lisp format.
    test_record = ('(example (id nt-0) (utterance none) (context (graph tables.TableKnowledgeGraph %s))'
                   '(targetValue (list (description "6"))))' % (table_filename))
    test_data_filename = os.path.join(SEMPRE_DIR, 'data.examples')
    with open(test_data_filename, 'w') as temp_file:
        temp_file.write(test_record)

    # TODO(matt): The jar that we have isn't optimal for this use case - we're using a
    # script designed for computing accuracy, and just pulling out a piece of it. Writing
    # a new entry point to the jar that's tailored for this use would be cleaner.
    # Pass the arguments as a list and bypass the shell, so that paths containing
    # spaces or shell metacharacters reach java unmodified.
    command = ['java', '-jar', cached_path(DEFAULT_EXECUTOR_JAR),
               test_data_filename, logical_form_filename, table_dir]
    run(command, shell=False)

    denotations_file = os.path.join(SEMPRE_DIR, 'logical_forms_denotations.tsv')
    with open(denotations_file) as temp_file:
        line = temp_file.readline().split('\t')

    # Clean up all the temp files generated from this function.
    # Take care to not remove the auxiliary sempre files
    os.remove(logical_form_filename)
    shutil.rmtree(table_dir)
    os.remove(denotations_file)
    os.remove(test_data_filename)
    return line[1] if len(line) > 1 else line[0]
def _read(self, file_path: str):
    """
    Yield one instance per usable turn of every interaction in the file.

    The file contents are handed to ``_lazy_parse``.  Within an interaction,
    turns lacking an utterance or SQL are skipped; for the others the
    utterances seen so far (copied) and the non-empty lines of the turn's SQL
    are passed to ``text_to_instance``.  Falsy instances are dropped.

    Parameters
    ----------
    file_path : ``str``
        Path or URL of the data file; resolved through ``cached_path``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path) as atis_file:
        logger.info("Reading ATIS instances from dataset at : %s", file_path)
        for interaction in _lazy_parse(atis_file.read()):
            history = []
            for turn in interaction['interaction']:
                if not (turn['utterance'] and turn['sql']):
                    continue
                history.append(turn['utterance'])
                sql_query_labels = list(filter(None, turn['sql'].split('\n')))
                # Copy the history so that later turns cannot alter this instance's input.
                instance = self.text_to_instance(deepcopy(history), sql_query_labels)
                if instance:
                    yield instance
def _read(self, file_path):
    """
    Yield instances from a file with one bracketed tree per line.

    With ``self._use_subtrees`` set, every subtree of each tree becomes a
    candidate instance; otherwise only the full tree does.  Candidates for
    which ``text_to_instance`` returns ``None`` are dropped.

    Parameters
    ----------
    file_path : ``str``
        Path or URL of the data file; resolved through ``cached_path``.
    """
    with open(cached_path(file_path), "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for raw_line in data_file:
            tree_text = raw_line.strip("\n")
            if not tree_text:
                continue

            parsed_line = Tree.fromstring(tree_text)
            candidates = parsed_line.subtrees() if self._use_subtrees else [parsed_line]
            for tree in candidates:
                instance = self.text_to_instance(tree.leaves(), tree.label())
                if instance is not None:
                    yield instance
def test_atis_read_from_file(self):
    """Reading the sample file yields 13 instances; the first one is checked in detail."""
    data_path = AllenNlpTestCase.FIXTURES_ROOT / "data" / "atis" / "sample.json"
    database_file = cached_path("https://s3-us-west-2.amazonaws.com/allennlp/datasets/atis/atis.db")
    reader = AtisDatasetReader(database_file=database_file)

    instances = list(reader.read(str(data_path)))
    assert len(instances) == 13

    instance = instances[0]
    fields = instance.fields

    expected_field_names = {'utterance',
                            'actions',
                            'world',
                            'sql_queries',
                            'target_action_sequence',
                            'linking_scores'}
    assert set(fields.keys()) == expected_field_names

    expected_tokens = ['show', 'me', 'the', 'one', 'way', 'flights', 'from',
                       'detroit', 'to', 'westchester', 'county']
    assert [t.text for t in fields["utterance"].tokens] == expected_tokens

    assert isinstance(fields['world'].as_tensor({}), AtisWorld)

    world = fields['world'].metadata
    assert set(world.valid_actions['number']) == \
            {'number -> ["1"]',
             'number -> ["0"]',
             'number -> ["41"]',
             'number -> ["60"]'}

    # Expected per-token linking vector (element 2 of each entry), by production.
    expected_linking = [
            # ``detroit`` -> ``DTW``
            ('airport_airport_code_string -> ["\'DTW\'"]', [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
            # ``detroit`` -> ``DTW``
            ('flight_stop_stop_airport_string -> ["\'DTW\'"]', [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
            # ``detroit`` -> ``DDTT``
            ('city_city_code_string -> ["\'DDTT\'"]', [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
            # ``one way`` -> ``NO``
            ('fare_basis_economy_string -> ["\'NO\'"]', [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]),
            # ``westchester county`` -> ``WESTCHESTER COUNTY``
            ('city_city_name_string -> ["\'WESTCHESTER COUNTY\'"]', [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]),
            # ``westchester county`` -> ``HHPN``
            ('city_city_code_string -> ["\'HHPN\'"]', [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]),
    ]
    for production, expected_vector in expected_linking:
        assert world.linked_entities['string'][production][2] == expected_vector
def __init__(self,
             options_file: str,
             weight_file: str,
             requires_grad: bool = False) -> None:
    """
    Build the character encoder and the LSTM stack and load their weights.

    Parameters
    ----------
    options_file : ``str``
        Path or URL of the JSON options file; resolved through ``cached_path``.
    weight_file : ``str``
        Weight file handed to the character encoder and to
        ``ElmoLstm.load_weights``.
    requires_grad : ``bool``, optional (default = False)
        Forwarded to both sub-modules.

    Raises
    ------
    ConfigurationError
        If the options do not enable ``use_skip_connections``.
    """
    super(_ElmoBiLm, self).__init__()

    self._token_embedder = _ElmoCharacterEncoder(options_file, weight_file, requires_grad=requires_grad)

    with open(cached_path(options_file), 'r') as fin:
        options = json.load(fin)
    lstm_options = options['lstm']

    if not lstm_options.get('use_skip_connections'):
        raise ConfigurationError('We only support pretrained biLMs with residual connections')

    projection_dim = lstm_options['projection_dim']
    self._elmo_lstm = ElmoLstm(input_size=projection_dim,
                               hidden_size=projection_dim,
                               cell_size=lstm_options['dim'],
                               num_layers=lstm_options['n_layers'],
                               memory_cell_clip_value=lstm_options['cell_clip'],
                               state_projection_clip_value=lstm_options['proj_clip'],
                               requires_grad=requires_grad)
    self._elmo_lstm.load_weights(weight_file)

    # Number of representation layers including context independent layer
    self.num_layers = lstm_options['n_layers'] + 1
def _read(self, file_path: str):
    """
    Yield one instance per line of a JSON-lines target-sentiment file.

    ``text`` is always forwarded.  ``targets`` and ``target_sentiments`` are
    forwarded only when both are present; ``categories``,
    ``category_sentiments`` and ``spans`` are each forwarded when present.

    Parameters
    ----------
    file_path : ``str``
        Path or URL of the jsonl file; resolved through ``cached_path``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    optional_keys = ('categories', 'category_sentiments', 'spans')
    with open(file_path, 'r') as te_file:
        logger.info("Reading Target Sentiment instances from jsonl "
                    "dataset at: %s", file_path)
        for json_line in te_file:
            example = json.loads(json_line)

            instance_kwargs: Dict[str, Any] = {"text": example["text"]}
            # Targets and their sentiments only make sense as a pair.
            if 'target_sentiments' in example and 'targets' in example:
                instance_kwargs['targets'] = example['targets']
                instance_kwargs['target_sentiments'] = example['target_sentiments']
            for key in optional_keys:
                if key in example:
                    instance_kwargs[key] = example[key]

            yield self.text_to_instance(**instance_kwargs)
def __init__(self, options_file: str, weight_file: str, requires_grad: bool = False) -> None:
    """
    Load the encoder options from ``options_file`` and the weights via ``_load_weights``.

    ``options_file`` is a JSON file resolved through ``cached_path``; ``output_dim`` is
    taken from its ``lstm.projection_dim`` entry.
    """
    super(_ElmoCharacterEncoder, self).__init__()

    with open(cached_path(options_file), 'r') as options_handle:
        self._options = json.load(options_handle)

    self._weight_file = weight_file
    self.output_dim = self._options['lstm']['projection_dim']
    self.requires_grad = requires_grad

    self._load_weights()

    # Cache the arrays for use in forward -- +1 due to masking.
    bos_characters = numpy.array(ELMoCharacterMapper.beginning_of_sentence_characters)
    self._beginning_of_sentence_characters = torch.from_numpy(bos_characters + 1)
    eos_characters = numpy.array(ELMoCharacterMapper.end_of_sentence_characters)
    self._end_of_sentence_characters = torch.from_numpy(eos_characters + 1)
def _read(self, file_path):
    """
    Yield one instance per usable row of a tab-separated file with a header row.

    The first line is the header. It must contain a ``tokens`` and a ``category``
    column as well as every title in ``self._column_titles_to_index`` (looked up by the
    integer positions ``0 .. len - 1``). Blank rows, rows whose ``tokens`` cell is empty
    or whitespace-only, and rows for which ``text_to_instance`` returns ``None`` are
    skipped.

    Raises ``ValueError`` if a required column title is missing from the header.
    """
    with open(cached_path(file_path), "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        columns = data_file.readline().strip('\n').split('\t')

        # Fail fast if a configured column title is absent from the header.
        for field_ind in range(len(self._column_titles_to_index)):
            columns.index(self._column_titles_to_index[field_ind])

        # Column positions do not change from row to row, so resolve them once.
        tokens_col = columns.index("tokens")
        category_col = columns.index("category")

        for line in data_file:
            # Strip the newline first so that blank rows really are detected.
            line = line.strip("\n")
            if not line:
                continue
            items = line.split("\t")
            tokens = items[tokens_col]
            if not tokens.strip():
                continue
            category = items[category_col]
            instance = self.text_to_instance(tokens=tokens, category=category)
            if instance is not None:
                yield instance
def _read(self, file_path):
    """
    Yield one instance per row of a tab-separated file.

    Every row needs at least four columns; the second, third and fourth are passed to
    ``text_to_instance`` as source sequence, target language and target sequence.

    Raises ``ConfigurationError`` for rows with fewer than four columns or with an empty
    target-language column.
    """
    with open(cached_path(file_path), "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        rows = csv.reader(data_file, delimiter="\t")
        # Count from 1 so the number can be used directly in error messages.
        for line_number, fields in enumerate(rows, start=1):
            if len(fields) < 4:
                raise ConfigurationError(
                    "Invalid line format: %s (line number %d)" % (fields, line_number))
            # Column 0 (sample id) and any columns past the fourth are ignored.
            source_sequence = fields[1]
            target_lang = fields[2]
            target_sequence = fields[3]
            if not target_lang:
                raise ConfigurationError(
                    "Empty target language: {} (line number {})".format(
                        fields, line_number))
            yield self.text_to_instance(source_sequence, target_lang, target_sequence)
def __init__(self,
             all_tables: Dict[str, List[str]] = None,
             tables_with_strings: Dict[str, List[str]] = None,
             database_file: str = None) -> None:
    """
    Build the grammar and valid actions, optionally backed by a SQLite database.

    When ``database_file`` is given it is resolved through ``cached_path`` and a
    connection and cursor are opened for the duration of grammar construction. The
    connection is always closed before this constructor returns or raises.
    """
    self.all_tables = all_tables
    self.tables_with_strings = tables_with_strings
    if database_file:
        self.database_file = cached_path(database_file)
        self.connection = sqlite3.connect(self.database_file)
        self.cursor = self.connection.cursor()

    try:
        grammar_dictionary, strings_list = self.create_grammar_dict_and_strings()
        self.grammar_dictionary: Dict[str, List[str]] = grammar_dictionary
        self.strings_list: List[Tuple[str, str]] = strings_list

        self.grammar_string: str = self.get_grammar_string()
        self.grammar: Grammar = Grammar(self.grammar_string)
        self.valid_actions: Dict[str, List[str]] = initialize_valid_actions(
            self.grammar, KEYWORDS)
    finally:
        # Close the connection even when grammar construction fails, so that a
        # failed construction does not leave the database handle open.
        if database_file:
            self.connection.close()
def _read(self, file_path):
    """
    Yield instances from a file holding one bracketed tree per line.

    With ``self._use_subtrees`` set, every subtree of each parsed tree produces an
    instance; otherwise only the full tree does. Blank lines and ``None`` results from
    ``text_to_instance`` are skipped.
    """
    with open(cached_path(file_path), "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for raw_line in data_file.readlines():
            tree_text = raw_line.strip("\n")
            if tree_text:
                tree = Tree.fromstring(tree_text)
                nodes = tree.subtrees() if self._use_subtrees else [tree]
                for node in nodes:
                    instance = self.text_to_instance(node.leaves(), node.label())
                    if instance is not None:
                        yield instance
def _read(self, file_path: str):
    """
    Yield one instance per JSON-lines record, restricted by ``self.shard_iterable``.

    Each record must provide ``premise``, ``hypothesis`` and ``idx``; ``label`` is
    optional and is passed as ``None`` when missing.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path, extract_archive=True)
    logger.info("Reading file at %s", file_path)
    emitted = 0

    from allennlp.common.file_utils import json_lines_from_file

    records = self.shard_iterable(json_lines_from_file(file_path))
    for record in records:
        premise = record["premise"]
        hypothesis = record["hypothesis"]
        label = record.get("label")
        index = record["idx"]

        # todo: see if we even need this to be in a separate method
        yield self.text_to_instance(index, label, premise, hypothesis)
        emitted += 1
def _read(self, file_path: str):
    """
    Yield language-modelling instances whose output tokens are the input shifted by one.

    When ``self._tokens_per_instance`` is ``None`` every line of the file is tokenized
    on its own. Otherwise the whole file is joined into one string, tokenized, and cut
    into windows of ``tokens_per_instance + 1`` tokens.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as text_file:
        lines = text_file.readlines()

    if self._tokens_per_instance is None:
        token_sequences = [self._tokenizer.tokenize(line) for line in lines]
    else:
        joined_text = " ".join([line.replace("\n", " ").strip() for line in lines])
        all_tokens = self._tokenizer.tokenize(joined_text)
        window = self._tokens_per_instance + 1
        logger.info("Creating dataset from all text in file: %s", file_path)
        # The stride is one less than the window, so consecutive windows share a token.
        window_starts = range(0, len(all_tokens) - window, window - 1)
        token_sequences = [all_tokens[start:(start + window)]
                           for start in Tqdm.tqdm(window_starts)]

    for sequence in token_sequences:
        yield Instance({
            "input_tokens": TextField(sequence[:-1], self._token_indexers),
            "output_tokens": TextField(sequence[1:], self._output_indexer),
        })
def _read(self, file_path):
    """
    Yield one instance per non-blank line of a delimited word/tag file.

    A line is split on ``self._token_delimiter`` into pairs, and each pair is split on
    the last occurrence of ``self._word_tag_delimiter`` into a word and its tag.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for raw_line in data_file:
            sentence = raw_line.strip("\n")
            if sentence:
                pairs = [
                    pair.rsplit(self._word_tag_delimiter, 1)
                    for pair in sentence.split(self._token_delimiter)
                ]
                tokens = []
                tags = []
                for word, tag in pairs:
                    tokens.append(Token(word))
                    tags.append(tag)
                yield self.text_to_instance(tokens, tags)
def _read(self, file_path: str) -> Iterable[Instance]:
    """
    Yield one instance per line of the file, in shuffled order when sampling spans.

    With ``self.sample_spans`` set the whole file is read into memory and shuffled with
    ``random.shuffle``; otherwise lines are streamed in file order.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)

        # If we are sampling spans (i.e. we are training) we need to shuffle the data so
        # that we don't yield instances in the same order every epoch. Our current
        # solution is to read the entire file into memory. This is a little expensive
        # (roughly 1G per 1 million docs), so a better solution might be required down
        # the line.
        if self.sample_spans:
            texts = list(data_file)
            random.shuffle(texts)
        else:
            texts = data_file

        for text in texts:
            yield self.text_to_instance(text)
def _read(self, file_path: str):
    """
    Yield one relation instance per line of a JSON-lines file.

    Each record supplies ``tokens`` (joined with single spaces into the text),
    ``label``, ``id`` and a pair of ``entities``; the first two items of each entity
    become the head and tail tuples.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        logger.info(
            "Reading SemEval 2010 Task 8 instances from "
            "jsonl dataset at: %s", file_path)
        for raw_line in data_file:
            record = json.loads(raw_line)

            text = " ".join(record["tokens"])
            relation = record["label"]
            first_entity, second_entity = record["entities"]
            id_ = record["id"]

            head = first_entity[0], first_entity[1]
            tail = second_entity[0], second_entity[1]

            yield self.text_to_instance(text, head, tail, id_, relation)
def _read(self, file_path: str) -> Iterable[Instance]:
    """
    Yield one instance per JSON-lines event record.

    Each record must hold an ``event`` mapping and a ``tweets`` list. The ``text`` of
    all tweets is joined with single spaces and passed to ``text_to_instance`` together
    with the event's ``description`` (``None`` when absent). Blank lines and lines that
    are not valid JSON are skipped.
    """
    with open(cached_path(file_path), "r") as data_file:
        # ``readlines`` gives tqdm a sized list, so the progress bar has a total.
        for line in tqdm.tqdm(data_file.readlines()):
            line = line.strip("\n")
            if not line:
                continue
            try:
                event_json = json.loads(line)
            except ValueError:
                # ``json.JSONDecodeError`` is a ``ValueError``; skip malformed lines
                # only, and let interrupts and unrelated errors propagate.
                continue
            event = event_json['event']
            tweets = event_json['tweets']
            tweet_texts = ' '.join([tweet['text'] for tweet in tweets])
            description = event.get('description', None)
            yield self.text_to_instance(tweet_texts, description)
def _read(self, file_path: str):
    """
    Yield SRL instances from the sentences returned by ``self._ontonotes_subset``.

    A sentence without SRL frames yields a single instance with all-zero verb
    indicators and all-``"O"`` tags. Otherwise one instance is yielded per frame, with
    the verb indicator set to 1 wherever the frame's tag ends in ``"-V"``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)

    sentences = self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier)
    for sentence in sentences:
        tokens = [Token(word) for word in sentence.words]
        if sentence.srl_frames:
            for _, frame_tags in sentence.srl_frames:
                verb_indicator = [int(tag[-2:] == "-V") for tag in frame_tags]
                yield self.text_to_instance(tokens, verb_indicator, frame_tags)
        else:
            # Sentence contains no predicates.
            token_count = len(tokens)
            no_verb = [0] * token_count
            outside_tags = ["O"] * token_count
            yield self.text_to_instance(tokens, no_verb, outside_tags)
def _read(self, file_path: str):
    """
    Yield one instance per sample of a JSON file containing a list of samples.

    Each sample provides ``candidates``, ``query``, ``supports``, ``id`` and ``answer``;
    ``annotations`` defaults to ``[[]]`` when absent.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    logger.info("Reading file at %s", file_path)
    with open(file_path) as dataset_file:
        samples = json.load(dataset_file)

    logger.info("Reading the dataset")
    for sample in samples:
        yield self.text_to_instance(
            sample["candidates"],
            sample["query"],
            sample["supports"],
            sample["id"],
            sample["answer"],
            sample.get("annotations", [[]]),
        )
def _read(self, file_path: str) -> Iterable[Instance]:
    """
    Yield one instance per sentence of a four-column, whitespace-separated file.

    Lines are grouped with ``_is_divider``; divider groups are dropped and each
    remaining group is transposed into token, POS, chunk and NER columns.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)

        # Group into alternative divider / sentence chunks.
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            # Ignore the divider chunks, so that `lines` corresponds to the words
            # of a single sentence.
            if is_divider:
                continue
            rows = [line.strip().split() for line in lines]
            # unzipping trick returns tuples, but our Fields need lists
            columns = [list(column) for column in zip(*rows)]
            words, pos_tags, chunk_tags, ner_tags = columns
            # TextField requires ``Token`` objects
            tokens = [Token(word) for word in words]
            yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags)
def _read(self, file_path: str):
    """
    Yield one multiple-choice instance per line of a JSON-lines file.

    Choices are ordered by their ``label``. The per-choice evidence is the choice's
    ``evidence_ranked`` value when the first (unsorted) choice carries that key;
    otherwise it is the choice's ``evidence_selected`` value, or ``None`` when that is
    missing or falsy. ``answerKey`` is optional.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, 'r') as csqa_file:
        logger.info("Reading CSQA instances from jsonl dataset at: %s", file_path)
        for raw_line in csqa_file:
            sample = json.loads(raw_line)

            qid = sample["id"]
            question = sample["question"]["stem"]
            ordered_choices = sorted(sample["question"]["choices"],
                                     key=lambda c: c['label'])
            choices = [choice['text'] for choice in ordered_choices]
            answer = sample.get('answerKey')

            if 'evidence_ranked' in sample['question']['choices'][0]:
                choice_evidences = [choice['evidence_ranked']
                                    for choice in ordered_choices]
            else:
                choice_evidences = [choice.get('evidence_selected') or None
                                    for choice in ordered_choices]

            yield self.text_to_instance(qid,
                                        question,
                                        choices,
                                        choice_evidences=choice_evidences,
                                        answer=answer)
def _read(self, file_path: str):
    """
    Yield one coreference instance per document from ``dataset_document_iterator``.

    Span indices are shifted by the number of words in the preceding sentences so they
    index into the whole document, then grouped by span id into clusters.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    ontonotes_reader = Ontonotes()
    for sentences in ontonotes_reader.dataset_document_iterator(file_path):
        clusters = collections.defaultdict(list)
        offset = 0

        for sentence in sentences:
            # Coref annotations are on a _per sentence_ basis, so we need to
            # adjust them to be relative to the length of the document.
            for span_id, (start, end) in sentence.coref_spans:
                clusters[span_id].append((start + offset, end + offset))
            offset += len(sentence.words)

        document_words = [sentence.words for sentence in sentences]
        yield self.text_to_instance(document_words, list(clusters.values()))
def _read(self, file_path: str):
    """
    Yield one dependency-parsing instance per annotation returned by ``lazy_parse``.

    Entries whose ``id`` is ``None`` are dropped; the remaining entries supply the
    words, POS tags and ``(deprel, head)`` pairs.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, 'r') as conllu_file:
        logger.info("Reading UD instances from conllu dataset at: %s", file_path)

        for annotation in lazy_parse(conllu_file.read()):
            # CoNLLU annotations sometimes add back in words that have been elided
            # in the original sentence; we remove these, as we're just predicting
            # dependencies for the original sentence.
            # We filter by None here as elided words have a non-integer word id,
            # and are replaced with None by the conllu python library.
            kept = [entry for entry in annotation if entry["id"] is not None]

            heads = [entry["head"] for entry in kept]
            tags = [entry["deprel"] for entry in kept]
            words = [entry["form"] for entry in kept]
            pos_tags = [entry["upostag"] for entry in kept]
            dependencies = list(zip(tags, heads))
            yield self.text_to_instance(words, pos_tags, dependencies)
def load_words(
    fname: Union[str, PathLike],
    tokenizer: Tokenizer,
    vocab: Optional[Vocabulary] = None,
    namespace: str = "tokens",
    all_cases: bool = True,
) -> List[torch.Tensor]:
    """
    Load a JSON list of words and convert every word into tensors of subword IDs.

    # Parameters

    fname : `Union[str, PathLike]`
        Path or URL (resolved through `cached_path`) of a JSON file holding the words.
    tokenizer : `Tokenizer`
        Tokenizer handed to `_convert_word_to_ids_tensor` for each word.
    vocab : `Vocabulary`, optional (default=`None`)
        Vocabulary handed to `_convert_word_to_ids_tensor`.
    namespace : `str`
        Vocabulary namespace handed to `_convert_word_to_ids_tensor`.
    all_cases : `bool`, optional (default=`True`)
        Case-handling flag handed to `_convert_word_to_ids_tensor`.

    # Returns

    word_ids : `List[torch.Tensor]`
        The concatenation, in file order, of the results produced for each word.
    """
    with open(cached_path(fname)) as word_file:
        words = json.load(word_file)

    return [
        ids_tensor
        for word in words
        for ids_tensor in _convert_word_to_ids_tensor(word, tokenizer, vocab, namespace, all_cases)
    ]
def _load_highway(self):
    """
    Create ``self._highways`` and fill each layer from the ``CNN_high_<k>`` weight group.

    Transform and carry parameters are concatenated along axis 0, with the carry part
    negated, and ``requires_grad`` is set from ``self.requires_grad``.
    """
    # pylint: disable=protected-access
    # the highway layers have same dimensionality as the number of cnn filters
    cnn_options = self._options['char_cnn']
    filters = cnn_options['filters']
    n_filters = sum(f[1] for f in filters)
    n_highway = cnn_options['n_highway']

    # create the layers, and load the weights
    self._highways = Highway(n_filters, n_highway, activation=torch.nn.functional.relu)
    for k in range(n_highway):
        # The AllenNLP highway is one matrix multplication with concatenation of
        # transform and carry weights.
        with h5py.File(cached_path(self._weight_file), 'r') as fin:
            group = fin['CNN_high_{}'.format(k)]

            # The weights are transposed due to multiplication order assumptions in tf
            # vs pytorch (tf.matmul(X, W) vs pytorch.matmul(W, X))
            w_transform = numpy.transpose(group['W_transform'][...])
            # -1.0 since AllenNLP is g * x + (1 - g) * f(x) but tf is (1 - g) * x + g * f(x)
            w_carry = -1.0 * numpy.transpose(group['W_carry'][...])
            stacked_weight = numpy.concatenate([w_transform, w_carry], axis=0)

            layer = self._highways._layers[k]
            layer.weight.data.copy_(torch.FloatTensor(stacked_weight))
            layer.weight.requires_grad = self.requires_grad

            b_transform = group['b_transform'][...]
            b_carry = -1.0 * group['b_carry'][...]
            stacked_bias = numpy.concatenate([b_transform, b_carry], axis=0)

            layer.bias.data.copy_(torch.FloatTensor(stacked_bias))
            layer.bias.requires_grad = self.requires_grad
def from_file(
    cls, params_file: Union[str, PathLike], params_overrides: str = "", ext_vars: dict = None
) -> "Params":
    """
    Build a `Params` object from a configuration file plus optional overrides.

    # Parameters

    params_file: `str`
        Path or URL of the configuration file; it is resolved through `cached_path`
        and evaluated with `evaluate_file`, whose output is parsed as JSON.
    params_overrides: `str`, optional
        Overrides parsed with `parse_overrides`; they are preferred over values from
        the file, e.g. {"model.embedding_dim": 10}
    ext_vars: `dict`, optional
        External variables passed to `evaluate_file`. They are merged on top of the
        result of `_environment_variables()`, so entries given here win on conflicts.
        e.g. {"HOME_DIR": "/Users/allennlp/home"}
    """
    explicit_vars = {} if ext_vars is None else ext_vars

    # redirect to cache, if necessary
    params_file = cached_path(params_file)

    merged_vars = {**_environment_variables(), **explicit_vars}
    evaluated = evaluate_file(params_file, ext_vars=merged_vars)
    file_dict = json.loads(evaluated)

    overrides_dict = parse_overrides(params_overrides)

    return cls(with_fallback(preferred=overrides_dict, fallback=file_dict))
def _read(self, file_path):
    """
    Yield labelled instances from a tab-separated data file and a parallel label file.

    The label file path is ``file_path`` up to its last ``.`` followed by
    ``_attnperformancelabels_<self.model_folder_name>.txt``. The data file's first line
    is a header that must contain a ``tokens`` column and every title in
    ``self._column_titles_to_index``. One label line is consumed for each yielded
    instance and stored in the instance's ``label`` field. Blank rows, rows with an
    empty ``tokens`` cell, and rows for which ``text_to_instance`` returns ``None`` are
    skipped without consuming a label.

    Raises ``ValueError`` if a required column title is missing from the header, and
    ``AssertionError`` if the label file runs out of labels.
    """
    label_file_path = (file_path[:file_path.rfind('.')]
                       + "_attnperformancelabels_" + self.model_folder_name + ".txt")

    # Both files are managed by ``with`` so they are closed even when the consumer
    # abandons the generator early or an error is raised mid-way.
    with open(label_file_path, 'r') as label_file, \
            open(cached_path(file_path), "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        columns = data_file.readline().strip('\n').split('\t')

        # Fail fast if a configured column title is absent from the header.
        for field_ind in range(len(self._column_titles_to_index)):
            columns.index(self._column_titles_to_index[field_ind])

        # The column position does not change from row to row, so resolve it once.
        tokens_col = columns.index("tokens")

        for line in data_file:
            # Strip the newline first so that blank rows really are detected.
            line = line.strip("\n")
            if not line:
                continue
            tokens = line.split("\t")[tokens_col]
            if not tokens.strip():
                continue
            instance = self.text_to_instance(tokens=tokens)
            if instance is not None:
                str_category = label_file.readline().strip()
                assert str_category != ''
                instance.fields['label'] = LabelField(str_category)
                yield instance

        # Best-effort sanity check: a leftover line starting with a digit suggests
        # the label file has more labels than the data file has instances. Report it
        # rather than raising, so a finished read is never turned into a failure.
        next_label = label_file.readline()
        if next_label.strip() and next_label[0].isdigit():
            logger.warning(
                "We had too many labels corresponding to the given data file %s",
                file_path)
def _load_cnn_weights(self, model_state: Dict = None):
    """
    Create one ``Conv1d`` per configured filter and load its weights.

    Weights come from ``model_state`` when it is truthy (keys
    ``_encoder._character_encoder.conv_<i>.weight`` / ``.bias``), otherwise from the
    ``CNN`` group of the HDF5 weight file. Each convolution is registered as
    ``char_conv_<i>`` and collected in ``self._convolutions``.

    Raises ``ValueError`` when a loaded weight does not match the convolution's shape.
    """
    cnn_options = self._options['char_cnn']
    filters = cnn_options['filters']
    char_embed_dim = cnn_options['embedding']['dim']

    loaded_convolutions = []
    for i, (width, num) in enumerate(filters):
        conv = torch.nn.Conv1d(
            in_channels=char_embed_dim,
            out_channels=num,
            kernel_size=width,
            bias=True
        )

        # load the weights
        if not model_state:
            with h5py.File(cached_path(self._weight_file), 'r') as fin:
                cnn_group = fin['CNN']
                weight = cnn_group['W_cnn_{}'.format(i)][...]
                bias = cnn_group['b_cnn_{}'.format(i)][...]
            w_reshaped = numpy.transpose(weight.squeeze(axis=0), axes=(2, 1, 0))
            if w_reshaped.shape != tuple(conv.weight.data.shape):
                raise ValueError("Invalid weight file")
        else:
            convolution_layer = '_encoder._character_encoder.conv_{}'.format(i)
            w_reshaped = model_state['{}.weight'.format(convolution_layer)]
            bias = model_state['{}.bias'.format(convolution_layer)]
            if w_reshaped.size() != conv.weight.data.shape:
                raise ValueError("Invalid weight file")

        conv.weight.data.copy_(torch.FloatTensor(w_reshaped))
        conv.bias.data.copy_(torch.FloatTensor(bias))

        for parameter in (conv.weight, conv.bias):
            parameter.requires_grad = self.requires_grad

        loaded_convolutions.append(conv)
        self.add_module('char_conv_{}'.format(i), conv)

    self._convolutions = loaded_convolutions
def read(self, file_path: str):
    """
    Read a JSON file with a ``data`` list of articles and return a ``Dataset``.

    One instance is created per question. Each paragraph is tokenized once and shared
    by all of its questions; each answer's end offset is its start offset plus the
    length of its text.

    Raises ``ConfigurationError`` when no instances were produced.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    logger.info("Reading file at %s", file_path)
    with open(file_path) as dataset_file:
        articles = json.load(dataset_file)['data']

    logger.info("Reading the dataset")
    instances = []
    for article in tqdm(articles):
        for paragraph_json in article['paragraphs']:
            paragraph = paragraph_json["context"]
            tokenized_paragraph = self._tokenizer.tokenize(paragraph)

            for qa in paragraph_json['qas']:
                question_text = qa["question"].strip().replace("\n", "")
                answers = qa['answers']
                answer_texts = [answer['text'] for answer in answers]
                span_starts = [answer['answer_start'] for answer in answers]
                span_ends = [start + len(text)
                             for start, text in zip(span_starts, answer_texts)]
                instances.append(self.text_to_instance(
                    question_text,
                    paragraph,
                    zip(span_starts, span_ends),
                    answer_texts,
                    tokenized_paragraph))

    if instances:
        return Dataset(instances)
    raise ConfigurationError(
        "No instances were read from the given filepath {}. "
        "Is the path correct?".format(file_path))
def _read(self, file_path: str) -> Iterable[Instance]:
    """
    Yield one instance per sentence of a four-column, whitespace-separated file.

    Every instance has a ``tokens`` text field. ``pos_tags``, ``chunk_tags`` and
    ``ner_tags`` label fields are added for the kinds listed in ``self.feature_labels``,
    and a ``tags`` field is added when ``self.tag_label`` is ``ner``, ``pos`` or
    ``chunk``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)

        # Group into alternative divider / sentence chunks.
        for is_divider, lines in Tqdm.tqdm(itertools.groupby(data_file, _is_divider)):
            # Ignore the divider chunks, so that `lines` corresponds to the words
            # of a single sentence.
            if is_divider:
                continue

            rows = [line.strip().split() for line in lines]
            # unzipping trick returns tuples, but our Fields need lists
            words, pos_tags, chunk_tags, ner_tags = [list(column) for column in zip(*rows)]

            # TextField requires ``Token`` objects
            sequence = TextField([Token(word) for word in words], self._token_indexers)
            instance_fields: Dict[str, Field] = {'tokens': sequence}

            # Add "feature labels" to instance
            feature_specs = (
                ('pos', 'pos_tags', pos_tags),
                ('chunk', 'chunk_tags', chunk_tags),
                ('ner', 'ner_tags', ner_tags),
            )
            for kind, field_name, labels in feature_specs:
                if kind in self.feature_labels:
                    instance_fields[field_name] = SequenceLabelField(labels, sequence, field_name)

            # Add "tag label" to instance
            labels_by_kind = {'ner': ner_tags, 'pos': pos_tags, 'chunk': chunk_tags}
            if self.tag_label in labels_by_kind:
                instance_fields['tags'] = SequenceLabelField(
                    labels_by_kind[self.tag_label], sequence)

            yield Instance(instance_fields)
def _read(self, file_path: str):
    """
    Yield one QA instance per record of a JSON-lines file, optionally gzip-compressed.

    The file is opened with ``gzip`` when ``file_path`` ends in ``.gz``. All records
    are parsed up front and the file is closed before any instance is yielded. When
    ``self._sample`` is not ``-1`` a random sample of that many records is used. The
    first five records are also written to the log.

    Each record must provide ``id``, ``phrase`` and ``answer``; ``metadata`` defaults to
    ``{}`` and ``context`` to ``None``.
    """
    self._debug_prints = 5
    cached_file_path = cached_path(file_path)

    if file_path.endswith('.gz'):
        opened_file = gzip.open(cached_file_path, 'rb')
    else:
        opened_file = open(cached_file_path, 'r')

    # Everything is read eagerly, so the handle can be released right away instead
    # of staying open until the generator is exhausted (or forever if it never is).
    with opened_file as data_file:
        logger.info("Reading QA instances from jsonl dataset at: %s", file_path)
        item_jsons = [json.loads(line.strip()) for line in data_file]

    if self._sample != -1:
        item_jsons = random.sample(item_jsons, self._sample)
        logger.info("Sampling %d examples", self._sample)

    for item_json in Tqdm.tqdm(item_jsons, total=len(item_jsons)):
        self._debug_prints -= 1
        if self._debug_prints >= 0:
            logger.info("====================================")
            logger.info(f"Input json: {item_json}")

        item_id = item_json["id"]
        statement_text = item_json["phrase"]
        metadata = item_json.get("metadata", {})
        context = item_json.get("context")

        yield self.text_to_instance(item_id=item_id,
                                    question=statement_text,
                                    answer_id=item_json["answer"],
                                    context=context,
                                    org_metadata=metadata)
def _read(self, file_path: str) -> Iterable[Instance]:
    """
    Yield instances from a text file of records separated by a line of ``=`` characters.

    ``self.KB_path`` must point to a pickled dictionary keyed by entity. The lines
    collected before each separator are converted with ``instance2dict``. The record's
    entity is looked up in the dictionary to obtain its type list, tokenized
    description and tokenized detailed description; if any part of that lookup fails,
    all three fall back to empty lists. Only records whose ``question_type`` equals
    ``self.question_type`` are yielded. Lines after the last separator are ignored.
    """
    # KB_path should be a pickle file of a dictionary
    KB_path = self.KB_path
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    # NOTE(security): unpickling can execute arbitrary code; KB_path must only ever
    # point to a trusted file.
    with open(KB_path, "rb") as kb_file:
        dict_entity_lookup = pickle.load(kb_file)

    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        tmp_instance = []
        for line in data_file:
            if line.strip() != "==================================================":
                tmp_instance.append(line)
                continue

            instance_dict = instance2dict(tmp_instance)
            tmp_instance = []

            question = instance_dict['question']
            entity_surface = instance_dict['parameters_surface']
            entity = instance_dict['parameters']
            try:
                KB_gloss = dict_entity_lookup[entity]["itemListElement"][0]
                # useful fields: @type -> list, description, detailedDescription
                result = KB_gloss["result"]
                e_type = result.get("@type", [])
                e_descr = [Token(token) for token in
                           word_tokenize(result.get("description", []))]
                e_detail = [Token(token) for token in
                            word_tokenize(result.get("detailedDescription", {})
                                          .get("articleBody", []))]
            except Exception:
                # Deliberate best-effort: a missing or malformed KB entry leaves the
                # glosses empty. ``Exception`` (rather than a bare ``except``) keeps
                # KeyboardInterrupt and SystemExit propagating.
                e_type = []
                e_descr = []
                e_detail = []

            logical_form = instance_dict['logical_form']
            if instance_dict["question_type"] == self.question_type:
                yield self.text_to_instance(question, entity, entity_surface,
                                            e_type, e_descr, e_detail, logical_form)
def _read(self, file_path: str):
    """
    Yield one EDS parsing instance per graph returned by ``lazy_parse``.

    Oracle actions are computed with ``get_oracle_actions`` when the graph has arc
    indices, otherwise they are ``None``. Graphs whose action sequence ends in ``'-E-'``
    are reported with ``print`` and skipped.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, 'r', encoding='utf8') as eds_file:
        logger.info("Reading EDS instances from conllu dataset at: %s", file_path)

        for graph in lazy_parse(eds_file.read()):
            tokens = graph["tokens"]
            arc_indices = graph["arc_indices"]
            arc_tags = graph["arc_tags"]
            root_id = graph["root_id"]
            lemmas = graph["lemmas"]
            pos_tags = graph["pos_tags"]
            meta_info = graph["meta_info"]
            node_info_dict = graph["node_info_dict"]
            tokens_range = graph["tokens_range"]
            gold_mrps = graph["gold_mrps"]
            concept_node = graph["concept_node"]

            if arc_indices:
                gold_actions = get_oracle_actions(
                    tokens, arc_indices, arc_tags, root_id, concept_node, node_info_dict)
            else:
                gold_actions = None

            if gold_actions and gold_actions[-1] == '-E-':
                print('-E-', graph["graph_id"])
                continue

            concept_label_list = list(node_info_dict["node_label_dict"].values())
            yield self.text_to_instance(tokens, lemmas, pos_tags, arc_indices, arc_tags,
                                        gold_actions, [root_id], [meta_info],
                                        concept_label_list, tokens_range, [gold_mrps])
def _read(self, file_path: str):
    """
    Yield shuffled text-to-SQL instances, file by file.

    Parameters
    ----------
    file_path : ``str``, required.
        Expanded with ``glob.glob``, so it may name a single file or be a pattern
        matching several json files. Matches whose base name contains
        ``self._cross_validation_split_to_exclude`` are left out, which lets one
        cross-validation split be excluded.

    For every remaining file the instances (plus, when ``self._aug_ratio > 0``, a
    prefix of the shuffled ``self._aug_data``) are shuffled with a ``random.Random``
    seeded by ``self._random_seed`` and then yielded.
    """
    files = [
        candidate for candidate in glob.glob(file_path)
        if self._cross_validation_split_to_exclude not in os.path.basename(candidate)
    ]

    for path in files:
        logger.info("Reading instances from lines in file at: %s", path)
        with open(cached_path(path), "r") as data_file:
            data = json.load(data_file)

        split_data = []
        pairs = text2sql_utils.process_sql_data_standard(data, self._use_prelinked_entities)
        for text, sql in pairs:
            instance = self.text_to_instance(text, sql)
            if instance is not None:
                split_data.append(instance)

        # if needs to augment data
        if self._aug_ratio > 0:
            random.Random(self._random_seed).shuffle(self._aug_data)
            aug_num = int(self._aug_ratio * len(self._aug_data)) + 1
            for entry in self._aug_data[:aug_num]:
                split_data.append(
                    self.text_to_instance(" ".join(entry["inp"]), " ".join(entry["out"])))

        # randomize and output
        random.Random(self._random_seed).shuffle(split_data)
        yield from split_data