def from_params(self, params: Params) -> PytorchSeq2VecWrapper:
    if not params.pop('batch_first', True):
        raise ConfigurationError("Our encoder semantics assumes batch is always first!")
    if self._module_class in self.PYTORCH_MODELS:
        params['batch_first'] = True
    module = self._module_class(**params.as_dict())
    return PytorchSeq2VecWrapper(module)
def from_params(self, params: Params, **extras) -> PytorchSeq2VecWrapper:
    if not params.pop("batch_first", True):
        raise ConfigurationError("Our encoder semantics assumes batch is always first!")
    if self._module_class in self.PYTORCH_MODELS:
        params["batch_first"] = True
    module = self._module_class(**params.as_dict(infer_type_and_cast=True))
    return PytorchSeq2VecWrapper(module)
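# Context for the two from_params variants above: a minimal usage sketch
# (the toy dimensions below are assumptions, not values from the snippets)
# of what the constructed wrapper does. PytorchSeq2VecWrapper runs the
# wrapped batch-first RNN over a padded batch and returns only the final
# hidden state per sequence, which is why batch_first must be True.
import torch
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper

lstm = torch.nn.LSTM(input_size=8, hidden_size=16, batch_first=True)
encoder = PytorchSeq2VecWrapper(lstm)

tokens = torch.randn(4, 10, 8)               # (batch, num_tokens, embedding_dim)
mask = torch.ones(4, 10, dtype=torch.bool)   # no padding in this toy batch
vector = encoder(tokens, mask)               # shape: (4, 16)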
def train_on(dataset, params):
    print("Using hyperparameter configuration:", params)
    losses = []
    state_dicts = []
    kfold = StratifiedKFold(dataset, k=10, grouping=origin_of)
    for train, val in kfold:
        # TODO: Figure out how much of the following code we can put outside the loop
        vocab = Vocabulary.from_instances(dataset)
        # TODO: Figure out the best parameters here
        elmo = Elmo(cached_path(OPTIONS_FILE),
                    cached_path(WEIGHTS_FILE),
                    num_output_representations=2,
                    dropout=params["dropout"])  # TODO: Does dropout refer to the LSTM or ELMo?
        word_embeddings = ELMoTextFieldEmbedder({"tokens": elmo})
        # TODO: Figure out the best parameters here
        lstm = PytorchSeq2VecWrapper(
            torch.nn.LSTM(input_size=elmo.get_output_dim(),
                          hidden_size=64,
                          num_layers=params["num_layers"],
                          batch_first=True))
        model = RuseModel(word_embeddings, lstm, vocab)
        optimizer = optim.Adam(model.parameters())
        # TODO: What kind of iterator should be used?
        iterator = BucketIterator(batch_size=params["batch_size"],
                                  sorting_keys=[("mt_sent", "num_tokens"),
                                                ("ref_sent", "num_tokens")])
        iterator.index_with(vocab)
        # TODO: Figure out the best hyperparameters
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          iterator=iterator,
                          cuda_device=0,
                          train_dataset=train,
                          validation_dataset=val,
                          patience=5,
                          num_epochs=100)
        trainer.train()
        # TODO: Better way to access the validation loss?
        loss, _ = trainer._validation_loss()
        losses.append(loss)
        state_dicts.append(model.state_dict())
    mean_loss = np.mean(losses)
    print("Mean validation loss was:", mean_loss)
    return TrainResults(cv_loss=mean_loss, state_dicts=state_dicts)
def main():
    # Initializing the embeddings (ELMo)
    elmo_token_indexer = ELMoTokenCharactersIndexer()
    reader = AnalogyDatasetReader(token_indexers={'tokens': elmo_token_indexer})
    train_dataset, test_dataset, dev_dataset = (
        reader.read(DATA_ROOT + "/" + fname)
        for fname in ["train_all.txt", "test_all.txt", "val_all.txt"])
    # elmo_embedder = Elmo(options_file, weight_file, 2, dropout=0.5)
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    vocab = Vocabulary.from_instances(train_dataset + test_dataset + dev_dataset)
    word_embeddings = BasicTextFieldEmbedder({'tokens': elmo_embedder})

    # Initializing the model: the wrapper takes the hidden state at the last
    # time step of the LSTM as one single output vector.
    lstm_encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, hidden_dim,
                      batch_first=True, bidirectional=True))
    model = LstmModel(word_embeddings, lstm_encoder, vocab)
    if USE_GPU:
        model.cuda()

    # Training the model
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      cuda_device=0 if USE_GPU else -1,
                      num_epochs=20)
    trainer.train()

    # Saving the model
    with open("model.th", 'wb') as f:
        torch.save(model.state_dict(), f)
    vocab.save_to_files("vocabulary")
def _load_embedder(config, vocab, bert_max_length):
    embedders = {}
    for embedder_config in config.embedder.models:
        if embedder_config.name == 'elmo':
            embedders[embedder_config.name] = ElmoTokenEmbedder(
                options_file=os.path.join(config.data.pretrained_models_dir,
                                          'elmo/options.json'),
                weight_file=os.path.join(config.data.pretrained_models_dir,
                                         'elmo/model.hdf5'),
                requires_grad=embedder_config.params['requires_grad'],
                dropout=0.)
            embedders[embedder_config.name].eval()
        elif embedder_config.name.endswith('bert'):
            embedders[embedder_config.name] = PretrainedTransformerMismatchedEmbedder(
                model_name=os.path.join(config.data.pretrained_models_dir,
                                        embedder_config.name),
                max_length=bert_max_length,
                requires_grad=embedder_config.params['requires_grad'])
        elif embedder_config.name == 'char_bilstm':
            embedders[embedder_config.name] = TokenCharactersEncoder(
                embedding=Embedding(
                    num_embeddings=vocab.get_vocab_size('token_characters'),
                    embedding_dim=embedder_config.params['char_embedding_dim']),
                encoder=PytorchSeq2VecWrapper(
                    torch.nn.LSTM(
                        embedder_config.params['char_embedding_dim'],
                        embedder_config.params['lstm_dim'],
                        num_layers=embedder_config.params['lstm_num_layers'],
                        dropout=embedder_config.params['lstm_dropout'],
                        bidirectional=True,
                        batch_first=True)),
                dropout=embedder_config.params['dropout'])
        else:
            raise ValueError('Unknown embedder {}'.format(embedder_config.name))
    return BasicTextFieldEmbedder(embedders)
def _init_from_archive(self, pretrained_model: Model):
    """Given a TopicRNN instance, take its weights."""
    self.text_field_embedder = pretrained_model.text_field_embedder
    self.vocab_size = pretrained_model.vocab_size
    self.text_encoder = pretrained_model.text_encoder

    # This function is only to be invoked when needing to classify.
    # To avoid manually dealing with padding, instantiate a Seq2Vec encoder instead.
    self.text_to_vec = PytorchSeq2VecWrapper(
        self.text_encoder._modules['_module'])

    self.topic_dim = pretrained_model.topic_dim
    self.vocabulary_projection_layer = pretrained_model.vocabulary_projection_layer
    self.stopword_projection_layer = pretrained_model.stopword_projection_layer
    self.tokens_to_index = pretrained_model.tokens_to_index
    self.stop_indices = pretrained_model.stop_indices
    self.beta = pretrained_model.beta
    self.mu_linear = pretrained_model.mu_linear
    self.sigma_linear = pretrained_model.sigma_linear
    self.noise = pretrained_model.noise
    self.variational_autoencoder = pretrained_model.variational_autoencoder
    self.sentiment_classifier = pretrained_model.sentiment_classifier
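# The _modules['_module'] access above reaches into a Seq2Seq wrapper for the
# raw torch.nn RNN so it can be re-wrapped. A minimal sketch of that
# re-wrapping step (the GRU and its dimensions are assumptions for
# illustration, not TopicRNN's actual encoder):
import torch
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper

seq2seq = PytorchSeq2SeqWrapper(
    torch.nn.GRU(input_size=32, hidden_size=64, batch_first=True))

# Same underlying module, but the new wrapper yields one vector per sequence
# (the final hidden state) instead of one vector per time step.
text_to_vec = PytorchSeq2VecWrapper(seq2seq._modules['_module'])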
def predict():
    elmo_token_indexer = ELMoTokenCharactersIndexer()
    reader = AnalogyDatasetReader(token_indexers={'tokens': elmo_token_indexer})
    train_dataset, test_dataset, dev_dataset = (
        reader.read(DATA_ROOT + "/" + fname)
        for fname in ["train_all.txt", "test_all.txt", "val_all.txt"])
    # elmo_embedder = Elmo(options_file, weight_file, 2, dropout=0.5)
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    word_embeddings = BasicTextFieldEmbedder({'tokens': elmo_embedder})
    lstm_encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, hidden_dim,
                      batch_first=True, bidirectional=True))
    vocab2 = Vocabulary.from_files("./vocabulary")
    model2 = LstmModel(word_embeddings, lstm_encoder, vocab2)
    if USE_GPU:
        model2.cuda()
    with open("./model.th", 'rb') as f:
        model2.load_state_dict(torch.load(f))

    predictor2 = SentenceClassifierPredictor(model2, dataset_reader=reader)
    with open('test.txt', 'w+') as f:
        top_10_words_list = []
        for analogy_test in test_dataset:
            logits = predictor2.predict_instance(analogy_test)['logits']
            label_id = np.argmax(logits)
            label_predict = model2.vocab.get_token_from_index(label_id, 'labels')
            top_10_ids = np.argsort(logits)[-10:]
            top_10_words = [model2.vocab.get_token_from_index(id, 'labels')
                            for id in top_10_ids]
            top_10_words_list.append(top_10_words)
            f.write(label_predict + "\n")

    top_10_words_list = np.array(top_10_words_list)
    print(top_10_words_list.shape)
    np.save('elmo_top_10_words_list.npy', top_10_words_list)
# embedded_parse_label = word_embedder(parse_label)
# embedded_parse_label.shape

seq2vec_encoder = PytorchSeq2VecWrapper(
    torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
seq2seq_encoder = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

classifier_params = Params({
    "input_dim": HIDDEN_DIM * 2,
    "num_layers": 2,
    "hidden_dims": [50, 3],
    "activations": ["sigmoid", "linear"],
    "dropout": [0.2, 0.0]
})
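# The classifier_params above match the constructor arguments of AllenNLP's
# FeedForward module; a minimal sketch of turning that Params blob into the
# classifier head (assuming input_dim = HIDDEN_DIM * 2 because the seq2vec
# and seq2seq encoder outputs are concatenated):
from allennlp.modules import FeedForward

classifier = FeedForward.from_params(classifier_params)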
def main():
    prepare_global_logging(serialization_dir=args.serialization_dir,
                           file_friendly_logging=False)

    # DATA
    reader = MathDatasetReader(
        source_tokenizer=CharacterTokenizer(),
        target_tokenizer=CharacterTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')},
        target=False,
        label=True,
        lazy=False)
    # train_data = reader.read("../../datasets/math/label-data/train-all")
    # val_data = reader.read("../../datasets/math/label-data/interpolate")
    val_data = reader.read("./generate_files")

    vocab = Vocabulary()
    vocab.add_tokens_to_namespace(
        [START_SYMBOL, END_SYMBOL, ' ', '!', "'", '(', ')', '*', '+', ',', '-',
         '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<',
         '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
         'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y',
         'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
         'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
         '{', '}'],
        namespace='tokens')
    vocab.add_tokens_to_namespace(
        ['algebra', 'arithmetic', 'calculus', 'comparison', 'measurement',
         'numbers', 'polynomials', 'probability'],
        namespace='labels')

    # MODEL
    embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                          embedding_dim=EMBEDDING_DIM)
    source_embedder = BasicTextFieldEmbedder({"tokens": embedding})
    if args.model == 'lstm':
        encoder = PytorchSeq2VecWrapper(
            torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM,
                          num_layers=NUM_LAYERS, batch_first=True))
    elif args.model == 'cnn':
        encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM,
                             num_filters=NUM_FILTERS,
                             output_dim=HIDDEN_DIM)
    else:
        raise NotImplementedError("The classifier model should be LSTM or CNN")

    model = TextClassifier(vocab=vocab,
                           source_text_embedder=source_embedder,
                           encoder=encoder)
    model.to(device)

    serialization_path = Path(args.serialization_dir)
    if not serialization_path.exists() or not serialization_path.is_dir():
        raise FileNotFoundError("The model seems not to exist")
    with open(serialization_path / "best.th", "rb") as model_path:
        model_state = torch.load(model_path, map_location=nn_util.device_mapping(-1))
        model.load_state_dict(model_state)
    model.eval()
    predictor = TextClassifierPredictor(model, dataset_reader=reader)

    # TEST
    correct = 0
    total = 0
    pbar = tqdm(val_data)
    batch_instance = list()
    batch_gt = list()
    idx_last = 0
    for idx, instance in enumerate(pbar):
        if idx != (idx_last + BATCH_SIZE):
            batch_instance.append(instance)
            batch_gt.append(instance.fields["labels"].label)  # str
        else:
            idx_last = idx
            outputs = predictor.predict(batch_instance)
            for i, output in enumerate(outputs):
                if batch_gt[i] == output['predict_labels']:
                    correct += 1
                total += 1
            batch_instance = list()
            batch_gt = list()
            pbar.set_description("correct/total %.3f" % (correct / total))
def main():
    prepare_global_logging(serialization_dir=args.serialization_dir,
                           file_friendly_logging=False)

    # DATA
    reader = MathDatasetReader(
        source_tokenizer=CharacterTokenizer(),
        target_tokenizer=CharacterTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')},
        target=False,
        label=True,
        lazy=True)
    train_data = reader.read("../../datasets/math/label-data/train-all")
    # val_data = reader.read("../../datasets/math/label-data/interpolate")

    vocab = Vocabulary()
    vocab.add_tokens_to_namespace(
        [START_SYMBOL, END_SYMBOL, ' ', '!', "'", '(', ')', '*', '+', ',', '-',
         '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<',
         '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
         'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y',
         'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
         'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
         '{', '}'],
        namespace='tokens')
    vocab.add_tokens_to_namespace(
        ['algebra', 'arithmetic', 'calculus', 'comparison', 'measurement',
         'numbers', 'polynomials', 'probability'],
        namespace='labels')

    # MODEL
    embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                          embedding_dim=EMBEDDING_DIM)
    source_embedder = BasicTextFieldEmbedder({"tokens": embedding})
    if args.model == 'lstm':
        encoder = PytorchSeq2VecWrapper(
            torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM,
                          num_layers=NUM_LAYERS, batch_first=True))
    elif args.model == 'cnn':
        encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM,
                             num_filters=NUM_FILTERS,
                             output_dim=HIDDEN_DIM)
    else:
        raise NotImplementedError("The classifier model should be LSTM or CNN")

    model = TextClassifier(vocab=vocab,
                           source_text_embedder=source_embedder,
                           encoder=encoder)
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=1e-3,
                           betas=(0.9, 0.995), eps=1e-6)

    train_iterator = BucketIterator(batch_size=BATCH_SIZE,
                                    max_instances_in_memory=1024,
                                    sorting_keys=[("source_tokens", "num_tokens")])
    train_iterator = MultiprocessIterator(train_iterator, num_workers=16)
    train_iterator.index_with(vocab)

    val_iterator = BucketIterator(batch_size=BATCH_SIZE,
                                  max_instances_in_memory=1024,
                                  sorting_keys=[("source_tokens", "num_tokens")])
    val_iterator = MultiprocessIterator(val_iterator, num_workers=16)
    val_iterator.index_with(vocab)

    LR_SCHEDULER = {"type": "exponential", "gamma": 0.5, "last_epoch": -1}
    lr_scheduler = LearningRateScheduler.from_params(optimizer, Params(LR_SCHEDULER))

    # TRAIN
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=train_iterator,
                      validation_iterator=None,
                      train_dataset=train_data,
                      validation_dataset=None,
                      patience=None,
                      shuffle=True,
                      num_epochs=1,
                      summary_interval=100,
                      learning_rate_scheduler=lr_scheduler,
                      cuda_device=CUDA_DEVICES,
                      grad_norm=5,
                      grad_clipping=5,
                      model_save_interval=600,
                      serialization_dir=args.serialization_dir,
                      keep_serialized_model_every_num_seconds=3600,
                      should_log_parameter_statistics=True,
                      should_log_learning_rate=True)
    trainer.train()