def test_language_model_data_collator():
    """
    Ensure `LanguageModelingDataCollator` works
    """
    norm_loader = MultiProcessDataLoader(MockDatasetReader(), "some path", batch_size=16)
    vocab = Vocabulary.from_instances(norm_loader.iter_instances())
    norm_loader.index_with(vocab)
    batch0 = list(norm_loader)[0]

    model_name = "epwalsh/bert-xsmall-dummy"
    data_collate = LanguageModelingDataCollator(model_name)
    mlm_loader = MultiProcessDataLoader(
        MockDatasetReader(), "some path", batch_size=16, collate_fn=data_collate
    )
    vocab = Vocabulary.from_instances(mlm_loader.iter_instances())
    mlm_loader.index_with(vocab)
    batch1 = list(mlm_loader)[0]

    norm_inputs = batch0["source"]["tokens"]["token_ids"]
    mlm_inputs = batch1["source"]["tokens"]["token_ids"]
    mlm_labels = batch1["source"]["tokens"]["labels"]

    # If we replace the masked (MLM) inputs with their labels, we should recover the original inputs.
    assert torch.where(mlm_labels != -100, mlm_labels, mlm_inputs).tolist() == norm_inputs.tolist()
def test_from_dataset_respects_inclusive_embedding_file(self):
    embeddings_filename = self.TEST_DIR + "embeddings.gz"
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
        embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))

    vocab = Vocabulary.from_instances(self.dataset,
                                      min_count=4,
                                      pretrained_files={'tokens': embeddings_filename},
                                      only_include_pretrained_words=False)
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' in words
    assert 'c' not in words

    vocab = Vocabulary.from_instances(self.dataset,
                                      min_count=-1,
                                      pretrained_files={'tokens': embeddings_filename},
                                      only_include_pretrained_words=False)
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' in words
    assert 'c' in words
def test_from_dataset_respects_exclusive_embedding_file(self):
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    with gzip.open(embeddings_filename, "wb") as embeddings_file:
        embeddings_file.write("a 1.0 2.3 -1.0\n".encode("utf-8"))
        embeddings_file.write("b 0.1 0.4 -4.0\n".encode("utf-8"))

    vocab = Vocabulary.from_instances(
        self.dataset,
        min_count={"tokens": 4},
        pretrained_files={"tokens": embeddings_filename},
        only_include_pretrained_words=True,
    )
    words = vocab.get_index_to_token_vocabulary().values()
    assert "a" in words
    assert "b" not in words
    assert "c" not in words

    vocab = Vocabulary.from_instances(
        self.dataset,
        pretrained_files={"tokens": embeddings_filename},
        only_include_pretrained_words=True,
    )
    words = vocab.get_index_to_token_vocabulary().values()
    assert "a" in words
    assert "b" in words
    assert "c" not in words
def test_from_instances_exclusive_embeddings_file_inside_archive(self):
    """ Just for ensuring there are no problems when reading pretrained tokens from an archive """
    # Write the embeddings files into an archive
    archive_path = str(self.TEST_DIR / "embeddings-archive.zip")
    with zipfile.ZipFile(archive_path, 'w') as archive:
        file_path = 'embedding.3d.vec'
        with archive.open(file_path, 'w') as embeddings_file:
            embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
            embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))
        with archive.open('dummy.vec', 'w') as dummy_file:
            dummy_file.write("c 1.0 2.3 -1.0 3.0\n".encode('utf-8'))

    embeddings_file_uri = format_embeddings_file_uri(archive_path, file_path)

    vocab = Vocabulary.from_instances(self.dataset,
                                      min_count={'tokens': 4},
                                      pretrained_files={'tokens': embeddings_file_uri},
                                      only_include_pretrained_words=True)
    words = set(vocab.get_index_to_token_vocabulary().values())
    assert 'a' in words
    assert 'b' not in words
    assert 'c' not in words

    vocab = Vocabulary.from_instances(self.dataset,
                                      pretrained_files={'tokens': embeddings_file_uri},
                                      only_include_pretrained_words=True)
    words = set(vocab.get_index_to_token_vocabulary().values())
    assert 'a' in words
    assert 'b' in words
    assert 'c' not in words
def test_from_dataset_respects_max_vocab_size_single_int(self):
    max_vocab_size = 1
    vocab = Vocabulary.from_instances(self.dataset, max_vocab_size=max_vocab_size)
    words = vocab.get_index_to_token_vocabulary().values()
    # Additional 2 tokens are '@@PADDING@@' and '@@UNKNOWN@@' by default
    assert len(words) == max_vocab_size + 2

    vocab = Vocabulary.from_instances(self.dataset, min_count=None)
    words = vocab.get_index_to_token_vocabulary().values()
    assert len(words) == 5
def test_from_dataset_respects_min_count(self):
    vocab = Vocabulary.from_instances(self.dataset, min_count={"tokens": 4})
    words = vocab.get_index_to_token_vocabulary().values()
    assert "a" in words
    assert "b" not in words
    assert "c" not in words

    vocab = Vocabulary.from_instances(self.dataset, min_count=None)
    words = vocab.get_index_to_token_vocabulary().values()
    assert "a" in words
    assert "b" in words
    assert "c" in words
def forward_on_instances(self, instances: List[Instance], cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
    model_input = {}
    dataset = Batch(instances)
    dataset.index_instances(self.vocab)

    if self._pointer_gen:
        model_input.update({'raw': dataset.as_tensor_dict(cuda_device=cuda_device, for_training=False)})
        # Extend the vocabulary with any new tokens from these instances, then re-index.
        extend_vocab = Vocabulary.from_instances(dataset.instances)
        self.vocab.extend_from(extend_vocab)
        dataset.index_instances(self.vocab)
        model_input.update({'extended': dataset.as_tensor_dict(cuda_device=cuda_device, for_training=False)})
    else:
        model_input = dataset.as_tensor_dict(cuda_device=cuda_device, for_training=False)

    # Additional inputs for prediction.
    model_input.update({'instances': instances})
    model_input.update({'predict': True})

    outputs = self.decode(self(**model_input))

    instance_separated_output: List[Dict[str, numpy.ndarray]] = [{} for _ in dataset.instances]
    for name, output in list(outputs.items()):
        if isinstance(output, torch.autograd.Variable):
            output = output.data.cpu().numpy()
        outputs[name] = output
        for instance_output, batch_element in zip(instance_separated_output, output):
            instance_output[name] = batch_element
    return instance_separated_output
def prepare1():
    """
    First part of preparing data for training

    :return: biLSTM model object, biLSTM vocabulary, data for training, data for validation,
             cuda device index, biLSTM reader object
    """
    reader = PosDatasetReader()
    train_dataset = reader.read(train_path)
    validation_dataset = reader.read(validation_path)
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    EMBEDDING_DIM = 200
    HIDDEN_DIM = 200

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM,
                                               batch_first=True, bidirectional=True))
    model = LstmTagger(word_embeddings, lstm, vocab)

    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    return model, vocab, train_dataset, validation_dataset, cuda_device, reader
def main():
    reader = LinzenDatasetReader(append_null=False)
    train_dataset = reader.read("StackNN/data/linzen/rnn_agr_simple/numpred.train")
    validation_dataset = reader.read("StackNN/data/linzen/rnn_agr_simple/numpred.val")
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    model = StackRNNAgreementPredictor(vocab, rnn_dim=100, rnn_cell_type=torch.nn.GRUCell)
    # model = SimpleRNNAgreementPredictor(vocab, rnn_dim=18, rnn_type=torch.nn.GRU)

    optimizer = torch.optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=16, sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=5)
    trainer.train()

    with open("/tmp/model.th", "wb") as fh:
        torch.save(model.state_dict(), fh)
    vocab.save_to_files("/tmp/vocabulary")
def test_read_from_file(self):
    MAX_LEN = 100
    OFFSET_INDICES_HEAD_NAME = 'offset_indices_head'
    OFFSET_INDICES_TAIL_NAME = 'offset_indices_tail'

    reader = FewRelDatasetReader(max_len=MAX_LEN)
    instances = ensure_list(reader.read("tests/fixtures/fewrel.json"))
    vocab = Vocabulary.from_instances(instances)

    fields = instances[0].fields
    tokens = fields['text'].tokens

    head_offsets = [-16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2]
    offset_indices_head = {OFFSET_INDICES_HEAD_NAME: [o + MAX_LEN for o in head_offsets]}
    tail_offsets = [-13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 0, 1, 2, 3, 4]
    offset_indices_tail = {OFFSET_INDICES_TAIL_NAME: [o + MAX_LEN for o in tail_offsets]}

    token_indexer_head = OffsetTokenIndexer(token_attribute='offset_head')
    token_indexer_tail = OffsetTokenIndexer(token_attribute='offset_tail')

    assert offset_indices_head == token_indexer_head.tokens_to_indices(
        tokens, vocab, OFFSET_INDICES_HEAD_NAME)
    assert offset_indices_tail == token_indexer_tail.tokens_to_indices(
        tokens, vocab, OFFSET_INDICES_TAIL_NAME)
def evaluate(model: Model, reader: readers.BaseReader, test_data: List[Instance]) -> None:
    visualise_model(model)

    vocab = Vocabulary.from_instances(test_data)
    iterator = BucketIterator(batch_size=ARGS.BATCH_SIZE, sorting_keys=reader.keys)
    # Our data should be indexed using the vocabulary we learned.
    iterator.index_with(vocab)

    data_types = split_list(test_data)
    results: Dict[str, Tuple[int, float]] = {}

    model.eval()
    print()
    print('#' * 5, 'PER TYPE EVALUATION', '#' * 5)
    for qtype, data in data_types.items():
        num_items = len(data)
        print(f'Type: {qtype} ({num_items})')
        metrics = allen_eval(model, data, iterator, ARGS.CUDA_DEVICE, "")
        print()

        accuracy = metrics['accuracy']
        results[qtype] = (num_items, accuracy)
def run_config(config):
    params = Params(json.loads(config))
    params_copy = params.duplicate()

    if 'dataset_reader' in params:
        reader = DatasetReader.from_params(params.pop('dataset_reader'))
    else:
        raise RuntimeError('`dataset_reader` section is required')

    all_instances = []
    if 'train_data_path' in params:
        print('Reading the training data...')
        train_data = reader.read(params.pop('train_data_path'))
        all_instances.extend(train_data)
    else:
        raise RuntimeError('`train_data_path` section is required')

    validation_data = None
    if 'validation_data_path' in params:
        print('Reading the validation data...')
        validation_data = reader.read(params.pop('validation_data_path'))
        all_instances.extend(validation_data)

    print('Building the vocabulary...')
    vocab = Vocabulary.from_instances(all_instances)

    model = None
    iterator = None
    if 'model' not in params:
        # 'dataset' mode: just preview the first 10 instances.
        print('Showing the first 10 instances:')
        for inst in all_instances[:10]:
            print(inst)
    else:
        model = Model.from_params(vocab=vocab, params=params.pop('model'))
        loader_params = deepcopy(params.pop("data_loader"))
        train_data_loader = DataLoader.from_params(dataset=train_data, params=loader_params)
        dev_data_loader = DataLoader.from_params(dataset=validation_data, params=loader_params)
        train_data.index_with(vocab)

        # set up a temporary, empty directory for serialization
        with tempfile.TemporaryDirectory() as serialization_dir:
            trainer = Trainer.from_params(model=model,
                                          serialization_dir=serialization_dir,
                                          data_loader=train_data_loader,
                                          validation_data_loader=dev_data_loader,
                                          params=params.pop('trainer'))
            trainer.train()

    return {
        'params': params_copy,
        'dataset_reader': reader,
        'vocab': vocab,
        'iterator': iterator,
        'model': model,
    }
def main(train_file_path, val_file_path, vocab_dir, max_vocab_size, min_frq, additional):
    logger = logging.getLogger(__name__)
    reader = CopyNetSharedDecoderDatasetReader("tokens")

    logger.info("Reading train file")
    train = reader.read(train_file_path)
    logger.info("Reading val file")
    val = reader.read(val_file_path)

    added_data = []
    for data in additional:
        logger.info("Adding additional data from {}".format(data))
        added_data.append(reader.read(data))
    if added_data:
        added_data = functools.reduce(lambda a, b: a + b, added_data)

    logger.info("Building vocabulary")
    logger.info("Minimal token frequency: {}".format(min_frq))
    logger.info("Max vocab size: {}".format(max_vocab_size))
    vocab = Vocabulary.from_instances(train + val + added_data,
                                      min_count={'tokens': min_frq},
                                      max_vocab_size=max_vocab_size)
    vocab.add_token_to_namespace('@COPY@', namespace='tokens')
    vocab.add_token_to_namespace('@BLANKED@', namespace='tokens')
    vocab.save_to_files(vocab_dir)
def test_read(self, lazy):
    reader = GLUESST2DatasetReader(
        tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
        token_indexers={'bert': PretrainedBertIndexer(pretrained_model=self.BERT_VOCAB_PATH)},
        skip_label_indexing=False)
    instances = reader.read(str(self.FIXTURES_ROOT / 'dev.tsv'))
    instances = ensure_list(instances)

    example = instances[0]
    tokens = [t.text for t in example.fields['tokens']]
    label = example.fields['label'].label
    print(label)
    print(tokens)

    batch = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    print(tokens['mask'].tolist()[0])
    print(tokens["bert"].tolist()[0])
    print([vocab.get_token_from_index(i, "bert") for i in tokens["bert"].tolist()[0]])
    print(len(tokens['bert'][0]))
    print(tokens["bert-offsets"].tolist()[0])
    print(tokens['bert-type-ids'].tolist()[0])
def main():
    reader = LanguageModelingReader()
    train_dataset = reader.read('data/mt/sentences.eng.10k.txt')
    # for inst in train_dataset:
    #     print(inst)

    vocab = Vocabulary.from_instances(train_dataset, min_count={'tokens': 5})

    iterator = BucketIterator(batch_size=32, sorting_keys=[("input_tokens", "num_tokens")])
    iterator.index_with(vocab)

    model = RNNLanguageModel(vocab, cuda_device=CUDA_DEVICE)
    optimizer = optim.Adam(model.parameters())

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      patience=10,
                      num_epochs=5,
                      cuda_device=CUDA_DEVICE)
    trainer.train()

    print(model.generate())
    print(model.generate())
    print(model.generate())
    print(model.generate())
    print(model.generate())
def test_batches(self):
    readers = {
        "a": PlainTextReader(),
        "b": PlainTextReader(),
        "c": PlainTextReader(),
    }
    reader = InterleavingDatasetReader(readers)
    data_dir = self.FIXTURES_ROOT / "data"
    file_path = f"""{{
        "a": "{data_dir / 'babi.txt'}",
        "b": "{data_dir / 'conll2000.txt'}",
        "c": "{data_dir / 'conll2003.txt'}"
    }}"""

    instances = list(reader.read(file_path))
    vocab = Vocabulary.from_instances(instances)

    actual_instance_type_counts = Counter(instance.fields["dataset"].metadata
                                          for instance in instances)

    iterator = HomogeneousBatchIterator(batch_size=3)
    iterator.index_with(vocab)

    observed_instance_type_counts = Counter()
    for batch in iterator(instances, num_epochs=1, shuffle=True):
        # batch should be homogeneous
        instance_types = set(batch["dataset"])
        assert len(instance_types) == 1
        observed_instance_type_counts.update(batch["dataset"])

    assert observed_instance_type_counts == actual_instance_type_counts
def __init__(self):
    self.reader = LinzenDatasetReader()
    self.dataset = self.reader.read('data/rnn_agr_simple/numpred.train')
    self.vocab = Vocabulary.from_instances(self.dataset)
    self.dataset_list = list(iter(self.dataset))
    self.instance = None
    self._label = None
def __init__(self, filename):
    self.reader = LinzenDatasetReader()
    self.dataset = self.reader.read(filename)
    self.vocab = Vocabulary.from_instances(self.dataset)
    self.dataset_list = list(iter(self.dataset))
    self.instance = None
    self._label = None
def main():
    reader = TatoebaSentenceReader()
    train_set = reader.read('data/mt/sentences.top10langs.train.tsv')
    dev_set = reader.read('data/mt/sentences.top10langs.dev.tsv')

    vocab = Vocabulary.from_instances(train_set, min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    positive_label = vocab.get_token_index('eng', namespace='labels')
    model = LstmClassifier(word_embeddings, encoder, vocab, positive_label=positive_label)

    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_set,
                      validation_dataset=dev_set,
                      num_epochs=3)
    trainer.train()
def test_skip_smaller_batches(self):
    readers = {
        "a": PlainTextReader(),
        "b": PlainTextReader(),
        "c": PlainTextReader(),
    }
    reader = InterleavingDatasetReader(readers)
    data_dir = self.FIXTURES_ROOT / "data"
    file_path = f"""{{
        "a": "{data_dir / 'babi.txt'}",
        "b": "{data_dir / 'conll2000.txt'}",
        "c": "{data_dir / 'conll2003.txt'}"
    }}"""

    instances = list(reader.read(file_path))
    vocab = Vocabulary.from_instances(instances)

    iterator = HomogeneousBatchIterator(batch_size=3, skip_smaller_batches=True)
    iterator.index_with(vocab)

    for batch in iterator(instances, num_epochs=1, shuffle=True):
        # every batch should have length 3 (batch size)
        assert len(batch["dataset"]) == 3
def from_partial_objects(
    cls,
    serialization_dir: str,
    train_dataset_readers: Dict[str, DatasetReader],
    train_file_paths: Dict[str, str],
    model: Lazy[Model],
    iterator: DataIterator,
    mingler: DatasetMingler,
    optimizer: Lazy[Optimizer],
    num_epochs: int = 10,
) -> "MultiTaskTrainer":
    datasets = {
        name: reader.read(train_file_paths[name])
        for name, reader in train_dataset_readers.items()
    }

    instances = (instance for dataset in datasets.values() for instance in dataset)
    vocab = Vocabulary.from_instances(instances=instances)
    model = model.construct(vocab=vocab)
    iterator.index_with(vocab)
    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer_ = optimizer.construct(model_parameters=parameters)

    return MultiTaskTrainer(model, serialization_dir, iterator, mingler, optimizer_,
                            datasets, num_epochs)
def setUp(self) -> None:
    super().setUp()
    # use SequenceTaggingDatasetReader as the base reader
    self.base_reader = SequenceTaggingDatasetReader(lazy=True)
    base_file_path = AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'

    # Make 100 copies of the data
    raw_data = open(base_file_path).read()
    for i in range(100):
        file_path = self.TEST_DIR / f'identical_{i}.tsv'
        with open(file_path, 'w') as f:
            f.write(raw_data)

    self.all_distinct_path = str(self.TEST_DIR / 'all_distinct.tsv')
    with open(self.all_distinct_path, 'w') as all_distinct:
        for i in range(100):
            file_path = self.TEST_DIR / f'distinct_{i}.tsv'
            line = f"This###DT\tis###VBZ\tsentence###NN\t{i}###CD\t.###.\n"
            with open(file_path, 'w') as f:
                f.write(line)
            all_distinct.write(line)

    self.identical_files_glob = str(self.TEST_DIR / 'identical_*.tsv')
    self.distinct_files_glob = str(self.TEST_DIR / 'distinct_*.tsv')

    # For some of the tests we need a vocab, we'll just use the base_reader for that.
    self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))
def train_model(parameters, name):
    token_indexer = {"tokens": ELMoTokenCharactersIndexer()} if parameters['use_elmo'] else None
    reader = SSJ500KReader(token_indexer) if parameters["dataset"] == "ssj" else SentiCorefReader(token_indexer)

    train_dataset = reader.read("train")
    validation_dataset = reader.read("test")
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
    # vocab = Vocabulary() if parameters['use_elmo'] else Vocabulary.from_instances(train_dataset + validation_dataset)

    model = get_model(vocab, parameters)
    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = optim.Adam(model.parameters(),
                           lr=parameters['lr'],
                           weight_decay=parameters['weight_decay'])
    iterator = BucketIterator(batch_size=parameters['batch_size'],
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=parameters['patience'],
                      num_epochs=parameters['num_epochs'],
                      cuda_device=cuda_device)
    trainer.train()

    metrics = evaluate(model, validation_dataset, iterator, cuda_device, None)
    save_model_and_vocab(model, vocab, metrics, parameters, fname=name)
def __init__(self,
             vocab_size: int,
             max_len: int,
             batch_size: int,
             log_dir: str = '',
             mode: str = 'train',
             scale: int = 10000):
    data_path = os.path.expanduser('~/data/wiki2bio/')
    train_path = os.path.join(data_path, 'train.t2p.{0}.jsonl'.format(scale))
    dev_path = os.path.join(data_path, 'valid.t2p.jsonl')
    test_path = os.path.join(data_path, 'test.t2p.jsonl')
    vocab_dir = os.path.join(data_path, 'dicts-{0}-t2p-{1}'.format(vocab_size, scale))

    self.metrics = '+f1'
    self.data_path = data_path
    self.mode = mode

    self.train_dataset = Table2PivotDataset(path=train_path, max_len=max_len)
    self.test_dataset = Table2PivotDataset(path=test_path, max_len=max_len)
    self.dev_dataset = Table2PivotDataset(path=dev_path, max_len=max_len)

    if os.path.exists(vocab_dir):
        vocab = Vocabulary.from_files(vocab_dir)
    else:
        vocab = Vocabulary.from_instances(instances=self.train_dataset,
                                          max_vocab_size=vocab_size)
        vocab.save_to_files(vocab_dir)

    collate_fn = basic_collate(vocab=vocab)
    self.train_loader = torch.utils.data.DataLoader(dataset=self.train_dataset,
                                                    batch_size=batch_size,
                                                    collate_fn=collate_fn,
                                                    shuffle=True)
    self.dev_loader = torch.utils.data.DataLoader(dataset=self.dev_dataset,
                                                  batch_size=128,
                                                  collate_fn=collate_fn,
                                                  shuffle=False)
    self.test_loader = torch.utils.data.DataLoader(dataset=self.test_dataset,
                                                   batch_size=128,
                                                   collate_fn=collate_fn,
                                                   shuffle=False)
    self.vocab = vocab
    self.scale = scale

    if not log_dir:
        self.log_dir = Path(data_path) / 'log' / time.strftime("%Y-%m-%dT%H_%M_%S")
    else:
        self.log_dir = Path(data_path) / 'log' / log_dir
    self.log_dir.mkdir(parents=True, exist_ok=True)
    self.log_dir = str(self.log_dir)
def _get_expected_vocab(dataset, namespace, model_name):
    vocab_from_instances = Vocabulary.from_instances(dataset)
    instance_tokens = set(vocab_from_instances._token_to_index[namespace].keys())
    transformer_tokens = set(
        Vocabulary.from_pretrained_transformer(model_name, namespace)
        ._token_to_index[namespace].keys())
    return instance_tokens.union(transformer_tokens)
def build_allennlp_vocab(self, splits=None):
    if splits is None:
        splits = [TRAIN, VALIDATION]
    # The loop over splits must come first so that `split` is bound before it is
    # used to look up the file path.
    iterator = [instance
                for split in splits
                for instance in self.read(self.data_file_paths[split])]
    vocab = Vocabulary.from_instances(iterator)
    return vocab
def train(model_args):
    model_name = model_args.serialization_dir
    checkpoint_dir = model_args.store_folder
    learning_rate = model_args.learning_rate
    rl_basic = model_args.rl_basic

    pretrain_folder = ''
    if checkpoint_dir == 'pretrain':
        is_pretrain = True
    else:
        # check if rl_basic is specified
        pretrain_folder = os.path.join('pretrain', rl_basic)
        if not os.path.exists(pretrain_folder):
            raise FileNotFoundError(f'Can not find the pretrained model {pretrain_folder}!')
        is_pretrain = False

    reader = construct_reader(is_pretrain=is_pretrain)
    train_dataset = reader.read("data_processed\\train.jsonl")
    test_dataset = reader.read("data_processed\\test.jsonl")

    # build vocabulary
    vocab = Vocabulary.from_instances(train_dataset + test_dataset)

    # build model and move it onto the GPU
    model = construct_model(vocab, model_args)
    model.cuda()

    optimizer = optim.Adam(model.parameters(), weight_decay=1e-5, lr=learning_rate)
    scheduler = construct_learning_scheduler(optimizer)

    iterator = BucketIterator(batch_size=2, sorting_keys=[("prev_tokens", "num_tokens")])
    iterator.index_with(vocab)

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # When not pretraining and not recovering from a previous state, load the pretrained model as the default.
    if not is_pretrain and not os.path.exists(os.path.join(checkpoint_dir, model_name, "best.th")):
        model_state = torch.load(os.path.join(pretrain_folder, "best.th"))
        model.load_state_dict(model_state)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=test_dataset,
                      learning_rate_scheduler=scheduler,
                      patience=model_args.patience,
                      validation_metric="+{}".format(model_args.validation_metric),
                      num_epochs=model_args.epoch,
                      serialization_dir=os.path.join(checkpoint_dir, model_name),
                      cuda_device=0,
                      should_log_learning_rate=True)
    trainer.train()

    return model_name
def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()

    reader = StanfordSentimentTreeBankDatasetReader(
        token_indexers={'tokens': elmo_token_indexer})

    train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens': 3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    # Use the 'Small' pre-trained model
    options_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                    '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json')
    weight_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                   '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)
    trainer.train()
def test_vocab_size_correct_scierc(self):
    vocab = Vocabulary.from_instances(self.instances_scierc)
    # There are 4 unique NER labels and 6 relation labels in the text fixture doc. For the ner
    # labels, there is an extra category for the null label. For the relation labels, there
    # isn't. This is due to the way their respective `Field`s represent labels.
    assert vocab.get_vocab_size("ner_labels") == 5
    assert vocab.get_vocab_size("relation_labels") == 6
    # For numeric labels, vocab size is 0.
    assert vocab.get_vocab_size("coref_labels") == 0
def test_vocab_from_instances_namespaces(self):
    reader = CcgBankDatasetReader(feature_labels=['modified_pos', 'original_pos', 'predicate_arg'])
    instances = ensure_list(reader.read(self.FIXTURES_ROOT / 'data' / 'ccgbank.txt'))
    # check that we didn't clobber the labels namespace
    vocab = Vocabulary.from_instances(instances)
    self.assertSetEqual(
        set(vocab._token_to_index.keys()),  # pylint: disable=protected-access
        {'tokens', 'labels', 'modified_pos_tags', 'original_pos_tags', 'predicate_arg_tags'}
    )
def test_from_dataset_respects_inclusive_embedding_file(self):
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
        embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))

    vocab = Vocabulary.from_instances(self.dataset,
                                      min_count={'tokens': 4},
                                      pretrained_files={'tokens': embeddings_filename},
                                      only_include_pretrained_words=False)
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' in words
    assert 'c' not in words

    vocab = Vocabulary.from_instances(self.dataset,
                                      pretrained_files={'tokens': embeddings_filename},
                                      only_include_pretrained_words=False)
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' in words
    assert 'c' in words
def setUp(self):
    super().setUp()
    self.base_reader = SequenceTaggingDatasetReader(lazy=True)
    base_file_path = AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'

    # Make 100 copies of the data
    raw_data = open(base_file_path).read()
    for i in range(100):
        file_path = self.TEST_DIR / f'sequence_tagging_{i}.tsv'
        with open(file_path, 'w') as f:
            f.write(raw_data)

    self.glob = str(self.TEST_DIR / 'sequence_tagging_*.tsv')

    # For some of the tests we need a vocab, we'll just use the base_reader for that.
    self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))
def test_saving_and_loading_works_with_byte_encoding(self):
    # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
    # vocab, load the vocab, then index the text field again, and make sure we get the same
    # result.
    tokenizer = CharacterTokenizer(byte_encoding='utf-8')
    token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
    tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
    text_field = TextField(tokens, {"characters": token_indexer})
    dataset = Batch([Instance({"sentence": text_field})])
    vocab = Vocabulary.from_instances(dataset)
    text_field.index(vocab)
    indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

    vocab_dir = self.TEST_DIR / 'vocab_save'
    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)

    text_field2 = TextField(tokens, {"characters": token_indexer})
    text_field2.index(vocab2)
    indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
    assert indexed_tokens == indexed_tokens2
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}

#### Now that we've implemented a <code>DatasetReader</code> and <code>Model</code>, we're ready to train. We first need an instance of our dataset reader.
reader = PosDatasetReader()

#### Which we can use to read in the training data and validation data. Here we read them in from a URL, but you could read them in from local files if your data was local. We use <code>cached_path</code> to cache the files locally (and to hand <code>reader.read</code> the path to the local cached version.)
train_dataset = reader.read(cached_path(
    'https://raw.githubusercontent.com/allenai/allennlp'
    '/master/tutorials/tagger/training.txt'))
validation_dataset = reader.read(cached_path(
    'https://raw.githubusercontent.com/allenai/allennlp'
    '/master/tutorials/tagger/validation.txt'))

#### Once we've read in the datasets, we use them to create our <code>Vocabulary</code> (that is, the mapping[s] from tokens / labels to ids).
vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

#### Now we need to construct the model. We'll choose a size for our embedding layer and for the hidden layer of our LSTM.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

#### For embedding the tokens we'll just use the <code>BasicTextFieldEmbedder</code> which takes a mapping from index names to embeddings. If you go back to where we defined our <code>DatasetReader</code>, the default parameters included a single index called "tokens", so our mapping just needs an embedding corresponding to that index. We use the <code>Vocabulary</code> to find how many embeddings we need and our <code>EMBEDDING_DIM</code> parameter to specify the output dimension. It's also possible to start with pre-trained embeddings (for example, GloVe vectors), but there's no need to do that on this tiny toy dataset.
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

#### We next need to specify the sequence encoder. The need for <code>PytorchSeq2SeqWrapper</code> here is slightly unfortunate (and if you use <a href = "https://github.com/allenai/allennlp/blob/master/tutorials/tagger/README.md#using-config-files">configuration files</a> you won't need to worry about it) but here it's required to add some extra functionality (and a cleaner interface) to the built-in PyTorch module. In AllenNLP we do everything batch first, so we specify that as well.
lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

#### Finally, we can instantiate the model.
model = LstmTagger(word_embeddings, lstm, vocab)