if opts.gpu > -1:
    model.cuda(opts.gpu)
train_dataset, dev_dataset = split_to_train_and_dev(dataset, opts.train_ratio)
optimizer = BertAdam(model.parameters(), lr=opts.lr)
iterator = BucketIterator(batch_size=opts.batch_size,
                          sorting_keys=[("text", "num_tokens")])
iterator.index_with(vocab)
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  patience=opts.patience,
                  train_dataset=train_dataset,
                  validation_dataset=dev_dataset,
                  validation_metric='+accuracy',
                  cuda_device=opts.gpu,
                  serialization_dir=opts.save_dir,
                  num_epochs=opts.epoch)
trainer.train()

if opts.eval:
    vocab = Vocabulary.from_files(os.path.join(opts.save_dir, VOCAB_DIR))
    model = BertForMultiTaskSLU(vocab, opts.bert)
    model.load_state_dict(
        torch.load(os.path.join(opts.save_dir, BEST_MODEL_FILENAME),
                   map_location=device_mapping(opts.gpu)))
    predictor = SLUPredict(model, reader, vocab)
def main():
    # In order to use ELMo, each word in a sentence needs to be indexed with
    # an array of character IDs.
    elmo_token_indexer = ELMoTokenCharactersIndexer()
    reader = StanfordSentimentTreeBankDatasetReader(
        token_indexers={'tokens': elmo_token_indexer})

    train_dataset = reader.read(
        'data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # Initialize the ELMo-based token embedder using a pre-trained file.
    # This takes a while if you run this script for the first time

    # Original
    # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    # weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

    # Medium
    # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
    # weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"

    # Use the 'Small' pre-trained model
    options_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                    '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json')
    weight_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                   '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    # Pass in the ElmoTokenEmbedder instance instead
    embedder = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # The dimension of the ELMo embedding will be 2 x [size of LSTM hidden states]
    elmo_embedding_dim = 256
    lstm = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(embedder, lstm, vocab)
    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)
    trainer.train()

    tokens = ['This', 'is', 'the', 'best', 'movie', 'ever', '!']
    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict(tokens)['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
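# A small, hedged sketch (not part of the original script): instead of
# hard-coding elmo_embedding_dim = 256 above, the dimension can be read off
# the embedder itself, which keeps the LSTM input size in sync if a different
# ELMo model is swapped in.
# elmo_embedding_dim = embedder.get_output_dim()  # 256 for the 'Small' model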
# HW
encoder = TransformerSeq2VecEncoder(EMBEDDING_DIM, HIDDEN_DIM,
                                    projection_dim=256,
                                    feedforward_hidden_dim=128,
                                    num_layers=2,
                                    num_attention_heads=4)

model = LstmClassifier(word_embeddings, encoder, vocab)
model.cuda()

optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
iterator = BucketIterator(batch_size=64,
                          sorting_keys=[("tokens", "num_tokens")])
iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=dev_dataset,
                  cuda_device=0,
                  patience=5,
                  num_epochs=15)

metrics = trainer.train()
print(metrics)
def main(): target_namespace = "target_tokens" if not USE_COPY: reader = Seq2SeqDatasetReader( source_tokenizer=WordTokenizer( word_splitter=JustSpacesWordSplitter()), target_tokenizer=WordTokenizer( word_splitter=JustSpacesWordSplitter()), source_token_indexers={'tokens': SingleIdTokenIndexer()}, target_token_indexers={ 'tokens': SingleIdTokenIndexer(namespace=target_namespace) }) else: reader = CopyNetDatasetReader( source_tokenizer=WordTokenizer( word_splitter=JustSpacesWordSplitter()), target_tokenizer=WordTokenizer( word_splitter=JustSpacesWordSplitter()), target_namespace=target_namespace) train_dataset = reader.read('./data/data_train.tsv') validation_dataset = reader.read('./data/data_val.tsv') vocab = Vocabulary.from_instances(train_dataset, min_count={ 'tokens': 3, 'target_tokens': 3 }) en_embedding = Embedding( num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=SRC_EMBEDDING_DIM, pretrained_file="../opennmt/glove_dir/glove.840B.300d.txt") assert en_embedding.weight.requires_grad datas = _read_pretrained_embeddings_file(en_embedding._pretrained_file, SRC_EMBEDDING_DIM, vocab) datas.requires_grad = True en_embedding.weight.data = datas print(en_embedding.weight.data) assert en_embedding.weight.requires_grad encoder = PytorchSeq2SeqWrapper( torch.nn.LSTM(SRC_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True, bidirectional=True, dropout=0.3, num_layers=1)) #encoder = StackedSelfAttentionEncoder(input_dim=SRC_EMBEDDING_DIM, # hidden_dim=HIDDEN_DIM, # projection_dim=128, feedforward_hidden_dim=128, # num_layers=1, num_attention_heads=8) source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding}) attention = DotProductAttention() if not USE_COPY: model = SimpleSeq2Seq(vocab, source_embedder, encoder, MAX_DECODING_STEPS, target_embedding_dim=TGT_EMBEDDING_DIM, target_namespace='target_tokens', attention=attention, beam_size=8, use_bleu=True) else: model = MyCopyNet(vocab, source_embedder, encoder, max_decoding_steps=MAX_DECODING_STEPS, target_embedding_dim=TGT_EMBEDDING_DIM, target_namespace=target_namespace, attention=attention, beam_size=8, tgt_embedder_pretrain_file= "../opennmt/glove_dir/glove.840B.300d.txt") model.to(torch.device('cuda')) optimizer = optim.Adam(model.parameters()) iterator = BucketIterator(batch_size=64, sorting_keys=[("source_tokens", "num_tokens")], padding_noise=0.2) iterator.index_with(vocab) trainer = Trainer(model=model, optimizer=optimizer, iterator=iterator, train_dataset=train_dataset, validation_dataset=validation_dataset, num_epochs=22, patience=4, serialization_dir="./checkpoints", cuda_device=CUDA_DEVICE, summary_interval=100) trainer.train() print(en_embedding.weight.data) predictor = Seq2SeqPredictor(model, reader) # Dump all predictions to a file # TODO (DNGros): Is there an automatic way in allennlp to do this?? pred_toks = [] with open("pred.txt", "w") as outfile: for instance in tqdm(validation_dataset): pred = predictor.predict_instance(instance) toks = pred['predicted_tokens'] if toks: outfile.write(" ".join(toks[0]) + "\n") else: outfile.write("" + "\n")
#### Here we indicate that we want to sort the instances by the number of tokens in the sentence field.
iterator = BucketIterator(batch_size=2, sorting_keys=[("sentence", "num_tokens")])

#### We also specify that the iterator should make sure its instances are indexed using our vocabulary;
#### that is, that their strings have been converted to integers using the mapping we previously created.
iterator.index_with(vocab)

#### Now we instantiate our <code>Trainer</code> and run it.
#### Here we tell it to run for 1000 epochs and to stop training early
#### if it ever spends 10 epochs without the validation metric improving.
#### The default validation metric is loss (which improves by getting smaller),
#### but it's also possible to specify a different metric and direction (e.g. accuracy should get bigger).
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=1000)

#### When we launch it, it will print a progress bar for each epoch
#### that includes both the "loss" and the "accuracy" metric.
#### If our model is good, the loss should go down and the accuracy up as we train.
trainer.train()

#### As in the original PyTorch tutorial, we'd like to look at the predictions our model generates.
#### AllenNLP contains a <code>Predictor</code> abstraction that takes inputs,
#### converts them to instances, feeds them through your model,
#### and returns JSON-serializable results. Often you'd need to implement your own Predictor,
#### but AllenNLP already has a <code>SentenceTaggerPredictor</code> that works perfectly here, so we can use it.
#### It requires our model (for making predictions) and a dataset reader (for creating instances).
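#### As a sketch of that last step (assuming the dataset reader, the trained model,
#### and the numpy import from earlier in this tutorial are in scope), the predictor
#### can be used like this:
predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
tag_ids = np.argmax(tag_logits, axis=-1)
print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])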
def run(args): ALL_DATASET_PATHS = get_all_dataset_paths(args.dataset_paths_file, args.dataset_path_prefix) SELECTED_TASK_NAMES = args.task PROJECTION_DIM = args.proj_dim HIDDEN_DIM = args.hidden_dim # BIDIRECTIONAL=True # INTERMEDIATE_INPUT=2*HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM DROPOUT = args.dropout LR = args.lr WEIGHT_DECAY = args.weight_decay BATCH_SIZE = args.batch_size NUM_EPOCHS = args.epochs PATIENTCE = args.patience SERIALIZATION_DIR = args.model_dir CLEAN_MODEL_DIR = args.clean_model_dir CUDA_DEVICE = cuda_device(args.cuda) TEST_MODE = args.test_mode # device = torch.device(f"cuda:{CUDA_DEVICE}" if torch.cuda.is_available() and args.cuda else "cpu") TASKS = [TASK_CONFIGS[task_name] for task_name in SELECTED_TASK_NAMES] dataset_paths = { task_name: ALL_DATASET_PATHS[task_name] for task_name in SELECTED_TASK_NAMES } tag_namespace_hashing_fn = { tag_namespace: i for i, tag_namespace in enumerate(TASK_CONFIGS.keys()) }.get elmo_token_indexer = ELMoTokenCharactersIndexer() token_indexers = {"tokens": elmo_token_indexer} readers = {} for task in TASKS: if task.task_type in TAGGING_TASKS: readers[task.tag_namespace] = ConLLDatasetReader( task.tag_namespace, token_indexers=token_indexers, tag_namespace_hashing_fn=tag_namespace_hashing_fn, lazy=True) elif task.task_type in CLASSIFICATION_TASKS: readers[task.tag_namespace] = JSONDatasetReader( task.tag_namespace, token_indexers=token_indexers, tag_namespace_hashing_fn=tag_namespace_hashing_fn, lazy=True) else: raise NotImplementedError( f"task_namespace={task.task_type} not yet supported.") elmo_embedder = ElmoTokenEmbedder( options_file, weight_file, requires_grad=False, dropout=DROPOUT, projection_dim=PROJECTION_DIM, ) # elmo_embedder = Elmo(options_file, weight_file, num_output_representations=3) # Pass in the ElmoTokenEmbedder instance instead word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder}) ELMO_EMBEDDING_DIM = elmo_embedder.get_output_dim() # POS -> CHUNK -> NER task_suffixes = set( [task_name.rsplit("_", 1)[-1] for task_name in SELECTED_TASK_NAMES]) encoders = get_task_encoder_dict(args, task_suffixes, ELMO_EMBEDDING_DIM) if not TEST_MODE: train_dataset = read_datasets(dataset_paths, readers, data_split="train") validation_dataset = read_datasets(dataset_paths, readers, data_split="dev") vocab = create_classification_tagging_vocab( [train_dataset, validation_dataset]) # Special case for CCG if "ccg" in task_suffixes or "pos" in task_suffixes: for task in TASKS: if task.task_type == "ccg": for tag in ["B-NOUN.SHAPE", "I-NOUN.PROCESS"]: vocab.add_token_to_namespace(tag, task.tag_namespace) if task.tag_namespace == "ud_pos": for tag in ["CONJ"]: vocab.add_token_to_namespace(tag, task.tag_namespace) else: vocab = Vocabulary.from_files( os.path.join(SERIALIZATION_DIR, "vocabulary")) # encoder = PassThroughEncoder(ELMO_EMBEDDING_DIM) model = MultiTaskCRFTaggerAndClassifier(word_embeddings, encoders, vocab, TASKS) model = model.cuda(device=CUDA_DEVICE) if not TEST_MODE: iterator = CustomHomogeneousBatchIterator(partition_key="dataset", batch_size=BATCH_SIZE, cache_instances=True) iterator.index_with(vocab) if CLEAN_MODEL_DIR: if os.path.exists(SERIALIZATION_DIR): logger.info(f"Deleting {SERIALIZATION_DIR}") shutil.rmtree(SERIALIZATION_DIR) logger.info(f"Creating {SERIALIZATION_DIR}") os.makedirs(SERIALIZATION_DIR) logger.info( f"Writing arguments to arguments.json in {SERIALIZATION_DIR}") with open(os.path.join(SERIALIZATION_DIR, "arguments.json"), "w+") as fp: json.dump(vars(args), fp, indent=2) 
logger.info(f"Writing vocabulary in {SERIALIZATION_DIR}") vocab.save_to_files(os.path.join(SERIALIZATION_DIR, "vocabulary")) # Use list to ensure each epoch is a full pass through the data combined_training_dataset = list( roundrobin_iterator(*train_dataset.values())) combined_validation_dataset = list( roundrobin_iterator(*validation_dataset.values())) # optimizer = optim.ASGD(model.parameters(), lr=0.01, t0=100, weight_decay=0.1) optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY) training_stats = [] trainer = Trainer( model=model, optimizer=optimizer, iterator=iterator, train_dataset=combined_training_dataset, validation_dataset=combined_validation_dataset, patience=PATIENTCE, num_epochs=NUM_EPOCHS, cuda_device=CUDA_DEVICE, serialization_dir=SERIALIZATION_DIR, # model_save_interval=600 ) stats = trainer.train() training_stats.append(stats) with open(os.path.join(SERIALIZATION_DIR, "training_stats.json"), "w+") as fp: json.dump(training_stats, fp, indent=2) else: model.load_state_dict( torch.load(os.path.join(SERIALIZATION_DIR, "best.th"))) model = model.cuda(device=CUDA_DEVICE) # Empty cache to ensure larger batch can be loaded for testing torch.cuda.empty_cache() # Also garbage collect gc.collect() test_filepaths = { task.tag_namespace: dataset_paths[task.tag_namespace]["test"] for task in TASKS } logger.info("Evaluating on test data") test_iterator = CustomHomogeneousBatchIterator(partition_key="dataset", batch_size=BATCH_SIZE * 2) test_iterator.index_with(vocab) model = model.eval() test_stats = evaluate_multiple_data(model, readers, test_iterator, test_filepaths, cuda_device=CUDA_DEVICE) with open(os.path.join(SERIALIZATION_DIR, "test_stats.json"), "w+") as fp: json.dump(test_stats, fp, indent=2)
def train(train_data_path,
          validation_data_path,
          embedding_dim,
          hidden_dim,
          learning_rate=0.1,
          batch_size=2,
          num_epochs=100,
          save_dir="/tmp"):
    _train_data_path = cached_path(train_data_path)
    _validation_data_path = cached_path(validation_data_path)

    reader = PosDatasetReader()
    train_dataset = reader.read(_train_data_path)
    validation_dataset = reader.read(_validation_data_path)

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=embedding_dim)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True))
    model = LstmTagger(word_embeddings, lstm, vocab)

    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=num_epochs,
                      cuda_device=cuda_device)
    metrics = trainer.train()
    for m in metrics:
        if m.startswith("validation"):
            print("{}={}".format(m, metrics[m]))

    predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
    tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
    tag_ids = np.argmax(tag_logits, axis=-1)
    print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

    # Here's how to save the model.
    model_path = os.path.join(save_dir, "model.th")
    vocab_path = os.path.join(save_dir, "vocabulary")
    with open(model_path, 'wb') as f:
        torch.save(model.state_dict(), f)
    vocab.save_to_files(vocab_path)

    # And here's how to reload the model.
    vocab2 = Vocabulary.from_files(vocab_path)
    model2 = LstmTagger(word_embeddings, lstm, vocab2)
    with open(model_path, 'rb') as f:
        model2.load_state_dict(torch.load(f))
    if cuda_device > -1:
        model2.cuda(cuda_device)

    predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)
    tag_logits2 = predictor2.predict("The dog ate the apple")['tag_logits']
    np.testing.assert_array_almost_equal(tag_logits2, tag_logits)
                       eps=1e-9)
scheduler = NoamLR(optimizer=optimizer,
                   model_size=HIDDEN_DIM,
                   warmup_steps=WARMUP_STEPS,
                   factor=1)

iterator = BucketIterator(batch_size=BATCH_SIZE,
                          sorting_keys=[("sentence", "num_tokens")])
iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  learning_rate_scheduler=scheduler,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=PATIENCE,
                  num_epochs=EPOCH,
                  cuda_device=cuda_device)
trainer.train()

# Here's how to save the model.
with open("model.th", 'wb') as f:
    torch.save(model.state_dict(), f)
vocab.save_to_files("vocabulary")

# # And here's how to reload the model.
# vocab2 = Vocabulary.from_files("vocabulary")
# model2 = BiLSTMClassifier(word_embeddings, lstm, DROPOUT_RATE, vocab)
# just using ToyDatasetReader to build vocab
reader = ToyDatasetReader()
dataset = reader.read("")
vocab = Vocabulary.from_instances(dataset)
print(vocab.get_vocab_size())

reader = TestReader(vocab)
reader.set_compute_nnrank_features(False)
dataset = reader.read("")

opts = ModelOptions()
text_embedder = Text_Embedding(opts, vocab)
paper_embedder = Paper_Embedding()
embedder = EmbeddingModel(vocab, text_embedder, paper_embedder)

iterator = BasicIterator()
iterator.index_with(vocab)

optimizer = torch.optim.SGD(embedder.parameters(), lr=0.1)

trainer = Trainer(model=embedder,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=dataset,
                  validation_dataset=dataset,
                  patience=1000,
                  num_epochs=10,
                  summary_interval=2)
trainer.train()
def EnhancedRCNN_train(): print("enter train") with open (model_config.glove_file_path) as fp: text = fp.readlines() # 这里如何优雅地解决这个初始counter的问题 glove_lines = len(text) token_counts = {"tokens": dict([(line.split(' ')[0], glove_lines - idx + 2) for idx, line in enumerate(text)])} #print(list(token_counts.items())[:10]) vocab = Vocabulary(counter=token_counts, min_count={"tokens": 1}, #non_padded_namespaces=['tokens'], pretrained_files={'tokens': model_config.glove_file_path}, only_include_pretrained_words=True) EMBEDDING_DIM = 300 token_embedding = Embedding.from_params( vocab=vocab, params=Params({ 'trainable': False, 'pretrained_file': model_config.glove_file_path, 'embedding_dim': EMBEDDING_DIM, 'vocab_namespace': "tokens"}) ) print("GloVe loaded") word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding}) model = EnhancedRCNNModel(word_embeddings, model_config.num_class, vocab=vocab) if torch.cuda.is_available(): cuda_device = list(range(torch.cuda.device_count())) model = model.cuda(cuda_device[0]) else: cuda_device = -1 print("cuda device : {}".format(cuda_device)) reader = ListWiseDatasetReader(vocab=vocab) train_dataset = reader.read(os.path.join(model_config.snli_base_path, "snli_1.0_train.jsonl")) dev_dataset = reader.read(os.path.join(model_config.snli_base_path, "snli_1.0_dev.jsonl")) test_dataset = reader.read(os.path.join(model_config.snli_base_path, "snli_1.0_test.jsonl")) #fc_lr = 1e-3 optimizer = torch.optim.SGD(model.parameters(), lr=model_config.learning_rate, momentum=0.9) ''' optimizer = torch.optim.SGD([{'params': model.embedder.parameters()}, {'params': model.fc1.parameters(), 'lr': fc_lr}, {'params': model.fc2.parameters(), 'lr': fc_lr}, {'params': model.proj_1.parameters(), 'lr': fc_lr}, {'params': model.proj_2.parameters(), 'lr': fc_lr}, {'params': model.bert_prediction.parameters(), 'lr': fc_lr}, ], lr=model_config.learning_rate, momentum=0.9) ''' #optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) iterator_train = BucketIterator(batch_size=model_config.batch_size, sorting_keys=[("left_input_tokens_field", "num_tokens"), ("right_input_tokens_field", "num_tokens")]) iterator_train.index_with(vocab) model.train() trainer = Trainer(model = model, optimizer = optimizer, iterator = iterator_train, train_dataset = train_dataset, validation_dataset = dev_dataset, patience = model_config.patience, num_epochs = model_config.epochs, cuda_device = cuda_device, shuffle=True ) train_start_time = time.time() trainer.train() train_end_time = time.time() # test model.eval() preds = [] gd = [] gd_pos = [] with torch.no_grad(): iterator_test = BucketIterator(batch_size = model_config.batch_size, sorting_keys=[("left_input_tokens_field", "num_tokens"), ("right_input_tokens_field", "num_tokens")]) iterator_test.index_with(vocab) generator_test = iterator_test(test_dataset, 1, False) test_start_time = time.time() for batch in generator_test: batch = move_to_device(batch, cuda_device[0]) gd.extend(batch['label'].squeeze(-1).long().cpu().numpy().tolist()) out_dict = model(batch['left_input_tokens_field'], batch['right_input_tokens_field'], batch['label']) batch_pred = torch.argmax(out_dict['logits'], -1).cpu().numpy() preds.extend(batch_pred.tolist()) sorted_batch, sorted_idx = torch.sort(out_dict['logits'], dim=-1, descending=True) label_mat = batch['label'].repeat(1, out_dict['logits'].shape[-1]).long().cuda() pos_mat = label_mat.eq(sorted_idx.cuda()) pos_tensor = pos_mat.nonzero()[:, 1].cpu().numpy().tolist() gd_pos.extend(pos_tensor) test_end_time = 
time.time()

    print("p@1 : ", (np.sum(np.equal(gd, preds))) / len(gd))
    print("[train time] : {}".format(train_end_time - train_start_time))
    print("[test time] : {}".format(test_end_time - test_start_time))

    # Check whether the output file already exists; if not, write it, otherwise skip.
    save_path = os.path.join(root_path, model_config.save_path)
    if os.path.exists(save_path):
        print("save path already exists")
    else:
        pd = pandas.DataFrame({'gd': gd, 'preds': preds})
        pd.to_csv(save_path, index=False)
        print("save to path : {}".format(save_path))
tokens = batch["tokens"]
labels = batch

mask = get_text_field_mask(tokens)
mask

embeddings = model.word_embeddings(tokens)
state = model.encoder(embeddings, mask)
class_logits = model.projection(state)
class_logits

model(**batch)

loss = model(**batch)["loss"]
loss.backward()

optimizer = optim.Adam(model.parameters(), lr=config.lr)

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_ds,
    cuda_device=0 if USE_GPU else -1,
    num_epochs=config.epochs,
)
metrics = trainer.train()

tagger = SentenceTaggerPredictor(model, reader)
tagger.predict("this tutorial was great!")
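# A small illustrative addition (not from the original notebook): turn the
# manually computed class_logits above into probabilities and a predicted
# class index, mirroring what the predictor step does internally.
import torch.nn.functional as F
probs = F.softmax(class_logits, dim=-1)
predicted = probs.argmax(dim=-1)
print(predicted)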
embed_training_reader.set_compute_nnrank_features(False)
embed_training_data = embed_training_reader.read("")

rank_training_reader = CiteomaticReader(df,
                                        idx_to_id_dict,
                                        ann,
                                        train_frac=train_frac,
                                        validation=False)
rank_training_reader.set_compute_nnrank_features(True)
rank_training_data = rank_training_reader.read("")

embed_trainer = Trainer(
    model=embedder,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=embed_training_data,
    #validation_dataset=val_data,
    patience=10,
    num_epochs=1,
    shuffle=False,
    cuda_device=cuda_device)

rank_trainer = Trainer(
    model=ranker,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=rank_training_data,
    #validation_dataset=val_data,
    patience=10,
    num_epochs=1,
    shuffle=False,
    cuda_device=cuda_device)
def _build_trainer(config, model, vocab, train_data, valid_data): optimizer = optim.AdamW(model.parameters(), lr=config.trainer.lr) scheduler = None is_bert_based = any( model.name.endswith('bert') for model in config.embedder.models) is_trainable_elmo_based = any( model.name == 'elmo' and model.params['requires_grad'] for model in config.embedder.models) if is_bert_based or is_trainable_elmo_based: def _is_pretrained_param(name): return 'transformer_model' in name or '_elmo_lstm' in name pretrained_params, non_pretrained_params = [], [] for name, param in model.named_parameters(): if _is_pretrained_param(name): logger.info('Pretrained param: %s', name) pretrained_params.append(param) else: logger.info('Non-pretrained param: %s', name) non_pretrained_params.append(param) optimizer = optim.AdamW([{ 'params': pretrained_params, 'lr': config.trainer.bert_lr }, { 'params': non_pretrained_params, 'lr': config.trainer.lr }, { 'params': [] }]) scheduler = SlantedTriangular( optimizer=optimizer, num_epochs=config.trainer.num_epochs, num_steps_per_epoch=len(train_data) / config.trainer.batch_size, cut_frac=config.trainer.cut_frac, gradual_unfreezing=config.trainer.gradual_unfreezing, discriminative_fine_tuning=config.trainer. discriminative_fine_tuning) logger.info('Trainable params:') for name, param in model.named_parameters(): if param.requires_grad: logger.info('\t' + name) iterator = BucketIterator(batch_size=config.trainer.batch_size) iterator.index_with(vocab) if torch.cuda.is_available(): cuda_device = 0 model = model.cuda(cuda_device) logger.info('Using cuda') else: cuda_device = -1 logger.info('Using cpu') logger.info('Example batch:') _log_batch(next(iterator(train_data))) if is_bert_based: train_data = _filter_data(train_data, vocab) valid_data = _filter_data(valid_data, vocab) return Trainer(model=model, optimizer=optimizer, iterator=iterator, train_dataset=train_data, validation_dataset=valid_data, validation_metric='+MeanAcc', patience=config.trainer.patience, num_epochs=config.trainer.num_epochs, cuda_device=cuda_device, grad_clipping=5., learning_rate_scheduler=scheduler, serialization_dir=os.path.join(config.data.models_dir, config.model_name), should_log_parameter_statistics=False, should_log_learning_rate=False, num_gradient_accumulation_steps=config.trainer. num_gradient_accumulation_steps)
reader = ClassifierDatasetReader()
train_dataset = reader.read(cached_path(config.train_data_dir))
validation_dataset = reader.read(cached_path(config.dev_data_dir))

# 'tokens' are indexed by the pretrained language model; 'labels' are indexed with this vocab
vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

model = BertClassifier(
    vocab=vocab,
    bert_model=config.bert_model_dir,
)

if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1

optimizer = optim.SGD(model.parameters(), lr=config.learning_rate)
iterator = BucketIterator(batch_size=config.train_batch_size)
iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=config.early_stop,
                  num_epochs=config.num_train_epochs,
                  cuda_device=cuda_device)
trainer.train()
                           out_features=vocab.get_vocab_size('labels'))
model = TargetLSTMClassifier(vocab,
                             word_embeddings,
                             text_lstm,
                             target_lstm,
                             feed_forward)

# Data iterator
sort_fields = [("text", "num_tokens"), ("target", "num_tokens")]
iterator = BucketIterator(batch_size=32, sorting_keys=sort_fields)
iterator.index_with(vocab)

# Model training
optimizer = optim.Adam(model.parameters())
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=40,
                  histogram_interval=100,
                  should_log_learning_rate=True)

serialization_dir = '/tmp/anything100'
another_log = SummaryWriter(os.path.join(serialization_dir, "log", "embeddings"))
train_log = SummaryWriter(os.path.join(serialization_dir, "log", "train"))
validation_log = SummaryWriter(os.path.join(serialization_dir, "log", "validation"))
trainer._tensorboard = TensorboardWriter(train_log=train_log,
                                         validation_log=validation_log)
trainer.train()

# Project the learnt word embeddings
another_log.add_embedding(token_embedding.weight,
                          metadata=token_names,
# get a new model for each iteration.
model, optimizer, cuda_device = get_model(pretrained_file, WORD_EMB_DIM,
                                          vocab, len(reader.alltags))

iterator = BasicIterator(batch_size=batch_size)
iterator.index_with(vocab)

ser_dir_iter = serialization_dir + "/iter-{}".format(iteration)
prepare_global_logging(ser_dir_iter, False)

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    patience=10,
    num_epochs=25,  # FIXME: consider more iterations.
    validation_metric="+f1-measure-overall",
    cuda_device=cuda_device,
    num_serialized_models_to_keep=3,
    serialization_dir=ser_dir_iter)

metrics = trainer.train()

print("tagging training data...")
for inst in tqdm(train_dataset):
    model.eval()
    output = model.forward_on_instance(inst)
    seq_len, num_tags = output["logits"].shape
    orig_tags = inst["metadata"]["orig_tags"]
def train(train_dataset, val_dataset, cfg):
    # Build the Vocabulary
    VOCAB_SIZE = cfg.w2v.vocab_size
    vocab = Vocabulary.from_instances(train_dataset + val_dataset,
                                      max_vocab_size=VOCAB_SIZE)

    BATCH_SIZE = cfg.training.batch_size
    # Iterator that produces padded mini-batches
    iterator = BucketIterator(batch_size=BATCH_SIZE,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    # Use the pre-trained Japanese Wikipedia entity vectors provided by Tohoku University
    # http://www.cl.ecei.tohoku.ac.jp/~m-suzuki/jawiki_vector/
    model_name = cfg.w2v.model_name
    norm = cfg.w2v.norm
    cwd = hydra.utils.get_original_cwd()
    params = Params({
        'embedding_dim': 200,
        'padding_index': 0,
        'pretrained_file': os.path.join(cwd, f'embs/jawiki.{model_name}_vectors.200d.txt'),
        'norm_type': norm
    })
    token_embedding = Embedding.from_params(vocab=vocab, params=params)

    HIDDEN_SIZE = cfg.model.hidden_size
    dropout = cfg.model.dropout
    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder(
        {"tokens": token_embedding})
    encoder: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
        nn.LSTM(word_embeddings.get_output_dim(),
                HIDDEN_SIZE,
                bidirectional=True,
                batch_first=True))
    model = ClassifierWithAttn(word_embeddings, encoder, vocab, dropout)
    model.train()

    USE_GPU = True
    if USE_GPU and torch.cuda.is_available():
        model = model.cuda(0)

    LR = cfg.training.learning_rate
    EPOCHS = cfg.training.epoch
    patience = cfg.training.patience if cfg.training.patience > 0 else None

    optimizer = optim.Adam(model.parameters(), lr=LR)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=val_dataset,
                      patience=patience,
                      cuda_device=0 if USE_GPU else -1,
                      num_epochs=EPOCHS)
    metrics = trainer.train()
    logger.info(metrics)
    return model, metrics
#     # {'params': model.text_field_embedder.token_embedder_tokens.bert_model.encoder.layer[8].parameters(), 'lr': 0.000855}
# ], lr=1e-4)

# Default
# optimizer = optim.SGD(model.parameters(), lr=0.001)

model = model.cuda()

print('Start training')

# Old trainer
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    validation_iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    patience=5,
    validation_metric='-loss',
    num_epochs=cfg.num_epochs,
    cuda_device=[0, 1]
)

# New trainer
# trainer = Trainer(
#     model=model,
#     optimizer=optimizer,
#     iterator=iterator,
#     validation_iterator=iterator,
#     train_dataset=train_dataset,
#     validation_dataset=validation_dataset,
#     validation_metric='-loss',
dataset = reader.read("")
vocab = Vocabulary.from_instances(dataset)
print(vocab.get_vocab_size())

opts = ModelOptions()
reader = TestReader(vocab)
reader.set_compute_nnrank_features(True)
dataset = reader.read("")

text_embedder = Text_Embedding(opts, vocab)
nnrank = CitationRanker(vocab, opts, text_embedder)

iterator = BasicIterator()
iterator.index_with(vocab)

optimizer = torch.optim.SGD(nnrank.parameters(), lr=0.1)
move_optimizer_to_cuda(optimizer)

trainer = Trainer(model=nnrank,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=dataset,
                  validation_dataset=dataset,
                  patience=1000,
                  num_epochs=10,
                  summary_interval=2,
                  cuda_device=0)
trainer.train()
def objective_kw( num_epochs=10, lr=0.1, lr_gamma=0.25, EMBEDDING_DIM=16, HIDDEN_DIM=6, DROPOUT=0.5, AUGMENT=True, weight_exponent=1.0, ): weights = label_counts.map(lambda x: x** (-1 / (1 + weight_exponent))).loc[labels] weights = weights / ((label_counts * weights).mean() / label_counts.mean()) loss_params = dict(alpha=weights.values, gamma=None) token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM) word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding}) lstm = PytorchSeq2SeqWrapper( torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True, bidirectional=True, dropout=DROPOUT)) model = LstmTagger(word_embeddings, lstm, vocab, loss_params=loss_params) optimizer = optim.SGD(model.parameters(), lr=0.1) iterator = BucketIterator(batch_size=2, sorting_keys=[("sentence", "num_tokens")]) iterator.index_with(vocab) if AUGMENT: iterator = AdvancedBucketIterator( batch_size=2, sorting_keys=[("sentence", "num_tokens")], preprocess=partial(permute_token, frequency=0.2), ) iterator.index_with(vocab) val_iterator = AdvancedBucketIterator( batch_size=2, sorting_keys=[("sentence", "num_tokens")], ) val_iterator.index_with(vocab) else: val_iterator = iterator for _ in range(1): optimizer = optim.SGD(model.parameters(), lr=lr) learning_rate_scheduler = _PyTorchLearningRateSchedulerWrapper( MultiStepLR(optimizer, [10, 20, 40], gamma=lr_gamma, last_epoch=-1)) trainer = Trainer( model=model, optimizer=optimizer, iterator=iterator, validation_iterator=val_iterator, train_dataset=datasets['train_no_punctuation'] + datasets['train'], validation_dataset=datasets['val'], patience=10, num_epochs=num_epochs, learning_rate_scheduler=learning_rate_scheduler, # model_save_interval=10, # serialization_dir=serialization_dir, # num_serialized_models_to_keep=10, ) res = trainer.train() return 1 - res['validation_accuracy'] # res['validation_loss']
def run_experiment( use_soft_targets, soft_target_path, embedding_type, rnn_type, hparams ): log = {} log["name"] = "{} {} {}".format( rnn_type, embedding_type, "soft_target" if use_soft_targets else "hard_target" ) log["soft_target"] = soft_target_path if use_soft_targets else None vocab = Vocabulary().from_files(hparams["vocab_path"]) if embedding_type == "Chord": # data reader reader = CpmDatasetReader() # chord embedder token_embedding = Embedding( num_embeddings=vocab.get_vocab_size("tokens"), embedding_dim=hparams["chord_token_embedding_dim"], ) chord_embedder = BasicTextFieldEmbedder({"tokens": token_embedding}) elif embedding_type == "Note": # data reader note_tokenizer = NoteTokenizer() note_indexer = TokenCharactersIndexer( namespace="notes", min_padding_length=4, character_tokenizer=note_tokenizer ) reader = CpmDatasetReader( token_indexers={"tokens": SingleIdTokenIndexer(), "notes": note_indexer} ) # chord embedder token_embedding = Embedding( num_embeddings=vocab.get_vocab_size("tokens"), embedding_dim=hparams["chord_token_embedding_dim"], ) note_token_embedding = Embedding( vocab.get_vocab_size("notes"), hparams["note_embedding_dim"] ) note_encoder = CnnEncoder( num_filters=hparams["cnn_encoder_num_filters"], ngram_filter_sizes=hparams["cnn_encoder_n_gram_filter_sizes"], embedding_dim=hparams["note_embedding_dim"], output_dim=hparams["note_level_embedding_dim"], ) note_embedding = TokenCharactersEncoder(note_token_embedding, note_encoder) chord_embedder = BasicTextFieldEmbedder( {"tokens": token_embedding, "notes": note_embedding} ) else: raise ValueError("Unknown embedding type:", embedding_type) # read data train_dataset = reader.read(os.path.join(hparams["data_path"], "train.txt")) val_dataset = reader.read(os.path.join(hparams["data_path"], "val.txt")) test_dataset = reader.read(os.path.join(hparams["data_path"], "test.txt")) # contextualizer contextual_input_dim = chord_embedder.get_output_dim() if rnn_type == "RNN": contextualizer = PytorchSeq2SeqWrapper( torch.nn.RNN( contextual_input_dim, hparams["rnn_hidden_dim"], batch_first=True, bidirectional=False, ) ) elif rnn_type == "LSTM": contextualizer = PytorchSeq2SeqWrapper( torch.nn.LSTM( contextual_input_dim, hparams["lstm_hidden_dim"], batch_first=True, bidirectional=False, ) ) elif rnn_type == "GRU": contextualizer = PytorchSeq2SeqWrapper( torch.nn.GRU( contextual_input_dim, hparams["gru_hidden_dim"], batch_first=True, bidirectional=False, ) ) else: raise ValueError("Unknown rnn type:", rnn_type) if use_soft_targets: vocab_size = vocab.get_vocab_size("tokens") soft_targets = Embedding( num_embeddings=vocab_size, embedding_dim=vocab_size, weight=torch.load(soft_target_path), trainable=False, ) else: soft_targets = None iterator = BucketIterator( batch_size=hparams["batch_size"], sorting_keys=[("input_tokens", "num_tokens")] ) iterator.index_with(vocab) batches_per_epoch = math.ceil(len(train_dataset) / hparams["batch_size"]) model_hparams = { "dropout": None, "soft_targets": soft_targets, "T_initial": hparams["T_initial"], "decay_rate": hparams["decay_rate"], "batches_per_epoch": batches_per_epoch, } # chord progression model model = Cpm(vocab, chord_embedder, contextualizer, model_hparams) if torch.cuda.is_available(): cuda_device = 0 model = model.cuda(cuda_device) print("GPU available.") else: cuda_device = -1 optimizer = optim.Adam(model.parameters(), lr=hparams["lr"]) ts = time.gmtime() saved_model_path = os.path.join( hparams["saved_model_path"], time.strftime("%Y-%m-%d %H-%M-%S", ts) ) serialization_dir = 
os.path.join(saved_model_path, "checkpoints")

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=val_dataset,
        serialization_dir=serialization_dir,
        patience=hparams["patience"],
        num_epochs=hparams["num_epochs"],
        cuda_device=cuda_device,
    )
    trainer.train()

    saved_model_path = os.path.join(saved_model_path, "{}.th".format(log["name"]))
    torch.save(model.state_dict(), saved_model_path)

    predictor = Predictor(model=model, iterator=iterator, cuda_device=cuda_device)
    pred_metrics = predictor.predict(test_dataset)
    log["metrics"] = pred_metrics
    log["saved_mode_path"] = saved_model_path

    return log
def main(): parser = argparse.ArgumentParser(description='Evidence Inference experiments') parser.add_argument('--cuda_device', type=int, default=0, help='GPU number (default: 0)') parser.add_argument('--epochs', type=int, default=2, help='upper epoch limit (default: 2)') parser.add_argument('--patience', type=int, default=1, help='trainer patience (default: 1)') parser.add_argument('--batch_size', type=int, default=32, help='batch size (default: 32)') parser.add_argument('--dropout', type=float, default=0.2, help='dropout for the model (default: 0.2)') parser.add_argument('--emb_size', type=int, default=256, help='elmo embeddings size (default: 256)') parser.add_argument('--model_name', type=str, default='baseline', help='model name (default: baseline)') parser.add_argument('--tunable', action='store_true', help='tune the underlying embedding model (default: False)') args = parser.parse_args() annotations = pd.read_csv('data/data/annotations_merged.csv') prompts = pd.read_csv('data/data/prompts_merged.csv') feature_dictionary = {} prompts_dictionary = {} for index, row in prompts.iterrows(): prompts_dictionary[row['PromptID']] = [row['Outcome'], row['Intervention'], row['Comparator']] for index, row in annotations.iterrows(): if row['PMCID'] not in feature_dictionary: feature_dictionary[row['PMCID']] = [] feature_dictionary[row['PMCID']].append([row['Annotations'], row['Label']] + prompts_dictionary[row['PromptID']]) train = [] valid = [] test = [] with open('data/splits/train_article_ids.txt') as train_file: for line in train_file: train.append(int(line.strip())) with open('data/splits/validation_article_ids.txt') as valid_file: for line in valid_file: valid.append(int(line.strip())) with open('data/splits/test_article_ids.txt') as test_file: for line in test_file: test.append(int(line.strip())) elmo_token_indexer = {'elmo': ELMoTokenCharactersIndexer(), 'tokens': SingleIdTokenIndexer()} reader = EIDatasetReader(elmo_token_indexer, feature_dictionary) train_data = reader.read(train) valid_data = reader.read(valid) test_data = reader.read(test) vocab = Vocabulary.from_instances(train_data + valid_data + test_data) urls = [ 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_' '2xhighway_options.json', 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_' '2xhighway_weights.hdf5' ] elmo_token_embedding = ElmoTokenEmbedder(urls[0], urls[1], dropout=args.dropout, requires_grad=args.tunable, projection_dim=args.emb_size) word_embeddings = BasicTextFieldEmbedder({'elmo': elmo_token_embedding}, allow_unmatched_keys=True) model = Baseline(word_embeddings, vocab) cuda_device = args.cuda_device if torch.cuda.is_available(): model = model.cuda(cuda_device) else: cuda_device = -1 optimizer = torch.optim.Adam(model.parameters(), lr=0.001) iterator = BucketIterator(batch_size=args.batch_size, sorting_keys=[('article', 'num_tokens')], padding_noise=0.1) iterator.index_with(vocab) serialization_dir = 'model_checkpoints/' + args.model_name trainer = Trainer(model=model, optimizer=optimizer, iterator=iterator, train_dataset=train_data, validation_dataset=test_data, patience=args.patience, validation_metric='+accuracy', num_epochs=args.epochs, cuda_device=cuda_device, serialization_dir=serialization_dir) result = trainer.train() for key in result: print(str(key) + ': ' + str(result[key])) test_metrics = evaluate(trainer.model, test_data, iterator, cuda_device=cuda_device, batch_weight_key="") 
    print('Test Data statistics:')
    for key, value in test_metrics.items():
        print(str(key) + ': ' + str(value))
if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1

optimizer = optim.Adam(model.parameters(), lr=0.001)
iterator = BucketIterator(batch_size=8,
                          sorting_keys=[("target_tokens", "num_tokens")])
iterator.index_with(vocab)

serialization_dir = "./models_saved/SPNet/"
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=valid_dataset,
                  shuffle=True,
                  patience=5,
                  num_epochs=50,
                  summary_interval=100,  # to tensorboard
                  serialization_dir=serialization_dir,
                  num_serialized_models_to_keep=5,
                  grad_norm=2.0,
                  cuda_device=cuda_device)

print("The training starts, results will be serialized to dir", serialization_dir)
trainer.train()
def main(): trainFile = "../srcData/trainData.csv" validFile = "../srcData/devData.csv" testFile = "../srcData/testData.csv" trainSeq2SeqFile = data.dataPreparation(trainFile) validSeq2SeqFile = data.dataPreparation(validFile) testSeq2SeqFile = data.dataPreparation(testFile) print (testSeq2SeqFile) #TokenIndexer Determines how string tokens gets represented as arrays of indexes in a model #SingleIdTokenIndexer = Tokens are single integers #TokenCharactersIndexer = Tokens as a list of integers # Read a tsvfile with paired instances (source, target) reader = Seq2SeqDatasetReader( source_tokenizer = WordTokenizer(), target_tokenizer = WordTokenizer(), # Defaults to source_tokenizer source_token_indexers={'tokens': SingleIdTokenIndexer()}, target_token_indexers={'tokens': SingleIdTokenIndexer()} # Defaults to source_token_indexers ) # Each of the dataset is a list of each tokens (source_tokens, target_tokens) train_dataset = reader.read(trainSeq2SeqFile) validation_dataset = reader.read(validSeq2SeqFile) test_dataset = reader.read(testSeq2SeqFile) # Finding extra fact2 vocab trainExtraVocab = findExtraVocab(train_dataset) validExtraVocab = findExtraVocab(validation_dataset) testExtraVocab = findExtraVocab(test_dataset) finalExtraVocab = list(set(trainExtraVocab+validExtraVocab+testExtraVocab)) print("length:",len(finalExtraVocab)) #input() #vocab = Vocabulary.from_instances(train_dataset + validation_dataset, min_count={'tokens': 3, 'target_tokens': 3}) vocab = Vocabulary.from_instances(train_dataset + validation_dataset + test_dataset) # Train + Valid = 9703 # Train + Valid + Test = 10099 print ("Vocab SIze :",vocab.get_vocab_size('tokens')) encEmbedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=ENC_EMBEDDING_DIM) # Embedding for tokens since in the dataset creation time it is mentioned tokens source_embedder = BasicTextFieldEmbedder({"tokens": encEmbedding}) encoder = PytorchSeq2SeqWrapper(torch.nn.LSTM(ENC_EMBEDDING_DIM,HIDDEN_DIM,batch_first=True,dropout=0.2)) attention = DotProductAttention() max_decoding_steps = 4 # TODO: make this variable model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps, target_embedding_dim = TGT_EMBEDDING_DIM, #target_namespace = 'target_tokens', attention = attention, beam_size = beamSize, use_bleu = True, extra_vocab = finalExtraVocab) #Can also specify lr=0.001 optimizer = optim.Adam(model.parameters()) # Data Iterator that specify how to batch our dataset # Takes data shuffles it and creates fixed sized batches #iterator = BasicIterator(batch_size=2) #iterator.index_with(vocab) # Pads batches wrt max input lengths per batch, sorts dataset wrt the fieldnames and padding keys provided for efficient computations iterator = BucketIterator(batch_size=50, sorting_keys=[("source_tokens", "num_tokens")]) iterator.index_with(vocab) trainer = Trainer(model = model, optimizer = optimizer, iterator = iterator, train_dataset = train_dataset, validation_dataset = validation_dataset, #patience = 3, num_epochs = numEpochs, cuda_device = CUDA_DEVICE) trainer.train() predictor = SimpleSeq2SeqPredictor(model, reader) '''for i in range(2): print ("Epoch: {}".format(i)) trainer.train() predictor = SimpleSeq2SeqPredictor(model, reader) for instance in itertools.islice(validation_dataset, 10): print('SOURCE:', instance.fields['source_tokens'].tokens) print('GOLD:', instance.fields['target_tokens'].tokens) print('PRED:', predictor.predict_instance(instance)['predicted_tokens']) """'{'predictions': [[1, 4, 5, 92, 8, 6, 1, 8, 6, 26, 
3]], 'loss': 5.9835076332092285, 'class_log_probabilities': [-20.10894012451172],
        'predicted_tokens': ['@@UNKNOWN@@', 'is', 'a', 'type', 'of', 'the', '@@UNKNOWN@@', 'of', 'the', 'sun']}
        """
        print (predictor.predict_instance(instance))
    '''

    outFile = open("output_" + str(HIDDEN_DIM) + "_" + str(numEpochs) + "_" + str(beamSize) + ".csv", "w")
    writer = csv.writer(outFile, delimiter="\t")
    for instance in itertools.islice(test_dataset, 500):
        src = instance.fields['source_tokens'].tokens
        gold = instance.fields['target_tokens'].tokens
        pred = predictor.predict_instance(instance)['predicted_tokens']
        writer.writerow([src, gold, pred])

    outFile.close()
"mode": "max", "factor": 0.5, "patience": 5 }) lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_params) iterator = BasicIterator(batch_size=64) iterator.index_with(vocab) for (l, train_dataset, validation_dataset, n_classes, num_epochs, patience) in [(discourse_dict, discourse_train_dataset, discourse_validation_dataset, 5, 20, 2), (claim_dict, claim_train_dataset, claim_validation_dataset, 2, 20, 2), (discourse_dict, discourse_train_dataset, claim_validation_dataset, 5, 10, 2), (claim_dict, claim_train_dataset, claim_validation_dataset, 2, 20, 2)]: model.vocab._token_to_index['labels'] = l model.num_classes = n_classes trainer = Trainer(model=model, optimizer=optimizer, learning_rate_scheduler=lr_scheduler, iterator=iterator, train_dataset=train_dataset, validation_dataset=validation_dataset, patience=patience, num_epochs=num_epochs, cuda_device=0) trainer.train() # save trained weight torch.save(model.state_dict(), './model_alternate_training_crf.th')
def main():
    # Initializing the embeddings (BERT)
    bert_token_indexer = PretrainedBertIndexer(
        pretrained_model="./biobert_pubmed/vocab.txt",
        max_pieces=config.max_seq_len,
        do_lowercase=True,
    )

    reader = BertAnalogyDatasetReader(
        tokenizer=bert_tokenizer,
        token_indexers={'tokens': bert_token_indexer})

    train_dataset, test_dataset, dev_dataset = (
        reader.read(DATA_ROOT + "/" + fname)
        for fname in ["train_all.txt", "test_all.txt", "val_all.txt"])

    vocab = Vocabulary.from_instances(train_dataset + test_dataset + dev_dataset)

    bert_embedder = PretrainedBertEmbedder(
        pretrained_model='biobert_pubmed',
        top_layer_only=True,  # conserve memory
    )
    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder(
        {"tokens": bert_embedder},
        # we'll be ignoring masks so we'll need to set this to True
        allow_unmatched_keys=True)

    BERT_DIM = word_embeddings.get_output_dim()

    class BertSentencePooler(Seq2VecEncoder):
        def forward(self, embs: torch.tensor,
                    mask: torch.tensor = None) -> torch.tensor:
            # extract first token tensor
            return embs[:, 0]

        @overrides
        def get_output_dim(self) -> int:
            return BERT_DIM

    # Initializing the model
    # uses the embedding of the first ([CLS]) token as one single fixed-size output
    bert_encoder = BertSentencePooler(vocab)
    model = LstmModel(word_embeddings, bert_encoder, vocab)
    if USE_GPU:
        model.cuda()
    else:
        model

    # Training the model
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      cuda_device=0 if USE_GPU else -1,
                      num_epochs=20)
    trainer.train()

    # Saving the model
    with open("biobert/model.th", 'wb') as f:
        torch.save(model.state_dict(), f)
    vocab.save_to_files("biobert/vocabulary")

    return vocab
def train_only_lee(): # This is WORKING! # load datasetreader # Save logging to a local file # Multitasking log.getLogger().addHandler(log.FileHandler(directory+"/log.log")) lr = 0.00001 batch_size = 2 epochs = 100 max_seq_len = 512 max_span_width = 30 #token_indexer = BertIndexer(pretrained_model="bert-base-uncased", max_pieces=max_seq_len, do_lowercase=True,) token_indexer = PretrainedBertIndexer("bert-base-cased", do_lowercase=False) reader = ConllCorefBertReader(max_span_width = max_span_width, token_indexers = {"tokens": token_indexer}) EMBEDDING_DIM = 1024 HIDDEN_DIM = 200 processed_reader_dir = Path(directory+"processed/") train_ds = None if processed_reader_dir.is_dir(): print("Loading indexed from checkpoints") train_path = Path(directory +"processed/train_d") if train_path.exists(): train_ds = pickle.load(open(directory + "processed/conll/train_d", "rb")) val_ds = pickle.load(open(directory + "processed/conll/val_d", "rb")) test_ds = pickle.load(open(directory + "processed/conll/test_d", "rb")) else: print("checkpoints not found") train_ds, val_ds, test_ds = (reader.read(dataset_folder + fname) for fname in ["train.english.v4_gold_conll", "dev.english.v4_gold_conll", "test.english.v4_gold_conll"]) pickle.dump(train_ds,open(directory + "processed/train_d", "wb")) pickle.dump(val_ds,open(directory + "processed/val_d", "wb")) pickle.dump(test_ds,open(directory + "processed/test_d", "wb")) print("saved checkpoints") # restore checkpoint here #vocab = Vocabulary.from_instances(train_ds + val_ds) vocab = Vocabulary() iterator = BasicIterator(batch_size=batch_size) iterator.index_with(vocab) val_iterator = BasicIterator(batch_size=batch_size) val_iterator.index_with(vocab) from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder bert_embedder = PretrainedBertEmbedder( pretrained_model="bert-base-cased", top_layer_only=True, # conserve memory requires_grad=True ) # here, allow_unmatched_key = True since we dont pass in offsets since #we allow for word embedings of the bert-tokenized, wnot necessiarly the # original tokens # see the documetnation for offsets here for more info: # https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/bert_token_embedder.py word_embedding = BasicTextFieldEmbedder({"tokens": bert_embedder}, allow_unmatched_keys=True) BERT_DIM = word_embedding.get_output_dim() # at each batch, sample from the two, and load th eLSTM shared_layer = torch.nn.LSTM(BERT_DIM, HIDDEN_DIM, batch_first=True, bidirectional=True) seq2seq = PytorchSeq2SeqWrapper(shared_layer) mention_feedforward = FeedForward(input_dim = 2336, num_layers = 2, hidden_dims = 150, activations = torch.nn.ReLU()) antecedent_feedforward = FeedForward(input_dim = 7776, num_layers = 2, hidden_dims = 150, activations = torch.nn.ReLU()) model = CoreferenceResolver(vocab=vocab, text_field_embedder=word_embedding,context_layer= seq2seq, mention_feedforward=mention_feedforward,antecedent_feedforward=antecedent_feedforward , feature_size=768,max_span_width=max_span_width,spans_per_word=0.4,max_antecedents=250,lexical_dropout= 0.2) print(model) optimizer = optim.Adam(model.parameters(), lr=lr) # and then we can do the shared loss # # Get USE_GPU = 0 trainer = Trainer( model=model, optimizer=optimizer, iterator=iterator, validation_iterator = val_iterator, train_dataset=train_ds, validation_dataset = val_ds, validation_metric = "+coref_f1", cuda_device=0 if USE_GPU else -1, serialization_dir= directory + "saved_models/only_lee", num_epochs=epochs, ) metrics = trainer.train() # 
# save the model
    with open(directory + "saved_models/current_run_model_state", 'wb') as f:
        torch.save(model.state_dict(), f)
def main(): # load the binary SST dataset. single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True) # word tokenizer # use_subtrees gives us a bit of extra data by breaking down each example into sub sentences. reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class", token_indexers={"tokens": single_id_indexer}, use_subtrees=True) train_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt') reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class", token_indexers={"tokens": single_id_indexer}) dev_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt') # test_dataset = reader.read('data/sst/test.txt') vocab = Vocabulary.from_instances(train_data) # Randomly initialize vectors if EMBEDDING_TYPE == "None": token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=300) word_embedding_dim = 300 # Load word2vec vectors elif EMBEDDING_TYPE == "w2v": embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip" weight = _read_pretrained_embeddings_file(embedding_path, embedding_dim=300, vocab=vocab, namespace="tokens") token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=300, weight=weight, trainable=False) word_embedding_dim = 300 # Initialize model, cuda(), and optimizer word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding}) encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(word_embedding_dim, hidden_size=512, num_layers=2, batch_first=True)) model = LstmClassifier(word_embeddings, encoder, vocab) model.cuda() # where to save the model model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th" vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab" # if the model already exists (its been trained), load the pre-trained weights and vocabulary if os.path.isfile(model_path): vocab = Vocabulary.from_files(vocab_path) model = LstmClassifier(word_embeddings, encoder, vocab) with open(model_path, 'rb') as f: model.load_state_dict(torch.load(f)) # otherwise train model from scratch and save its weights else: iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")]) iterator.index_with(vocab) optimizer = optim.Adam(model.parameters()) trainer = Trainer(model=model, optimizer=optimizer, iterator=iterator, train_dataset=train_data, validation_dataset=dev_data, num_epochs=5, patience=1, cuda_device=0) trainer.train() with open(model_path, 'wb') as f: torch.save(model.state_dict(), f) vocab.save_to_files(vocab_path) model.train().cuda() # rnn cannot do backwards in train mode # Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word embeddings. # We use the gradient later in the attack. utils.add_hooks(model) embedding_weight = utils.get_embedding_weight(model) # also save the word embedding matrix # Use batches of size universal_perturb_batch_size for the attacks. 
universal_perturb_batch_size = 128 iterator = BasicIterator(batch_size=universal_perturb_batch_size) iterator.index_with(vocab) # Build k-d Tree if you are using gradient + nearest neighbor attack # tree = KDTree(embedding_weight.numpy()) # filter the dataset to only positive or negative examples # (the trigger will cause the opposite prediction) dataset_label_filter = "0" targeted_dev_data = [] for instance in dev_data: if instance['label'].label == dataset_label_filter: targeted_dev_data.append(instance) # get accuracy before adding triggers utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None) model.train() # rnn cannot do backwards in train mode # intiialize triggers which are concatenated to the input num_trigger_tokens = 3 trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens # sample batches, update the triggers, and repeat for batch in lazy_groups_of(iterator(targeted_dev_data, num_epochs=5, shuffle=True), group_size=1): # get accuracy with current triggers utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids) model.train() # rnn cannot do backwards in train mode # get gradient w.r.t. trigger embeddings for current batch averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids) # pass the gradients to a particular attack to generate token candidates for each token. cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad, embedding_weight, trigger_token_ids, num_candidates=40, increase_loss=True) # cand_trigger_token_ids = attacks.random_attack(embedding_weight, # trigger_token_ids, # num_candidates=40) # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad, # embedding_weight, # trigger_token_ids, # tree, # 100, # num_candidates=40, # increase_loss=True) # Tries all of the candidates and returns the trigger sequence with highest loss. trigger_token_ids = utils.get_best_candidates(model, batch, trigger_token_ids, cand_trigger_token_ids) # print accuracy after adding triggers utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
optimizer = optim.SGD(model.parameters(), lr=0.001)
iterator = BasicIterator(batch_size=64)
iterator.index_with(vocab)

model.cuda(0)

print('Start training 1:')
# freeze everything except the top layers, then train
for param in list(model.parameters())[:-4]:
    param.requires_grad = False

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  validation_iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=3,
                  num_epochs=100,
                  cuda_device=0)
trainer.train()

print('Start training 2:')
# unfreeze most layers and continue training
for param in list(model.parameters())[1:]:
    param.requires_grad = True

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  validation_iterator=iterator,
                  train_dataset=train_dataset,
def train_cnn(train_dataset, batch_size, num_filters, filter_sizes,
              epochs=15, learning_rate=3e-4, num_classes=2, use_gpu=False):
    """
    Trains CNN on train_dataset; uses pre-trained ELMo model to dynamically
    compute embeddings. The CNN has one convolution layer for each ngram
    filter size.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    batch_size: int
        number of Instances to process in a batch
    num_filters: int
        output dim for each convolutional layer, which is the number of
        'filters' learned by that layer
    filter_sizes: Tuple[int]
        specifies the number of convolutional layers and their sizes
    epochs: int
        total number of epochs to train on (default=15)
    learning_rate: float
        learning rate for Adam Optimizer
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained Model, Vocabulary
    """
    vocab = Vocabulary()
    word_embeddings: TextFieldEmbedder = load_elmo_embeddings()

    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)  # numericalize the data

    # CNN encoder
    encoder: Seq2VecEncoder = CnnEncoder(
        embedding_dim=word_embeddings.get_output_dim(),
        num_filters=num_filters,
        ngram_filter_sizes=filter_sizes)

    # Feedforward:
    classifier_feedforward: FeedForward = nn.Linear(encoder.get_output_dim(),
                                                    num_classes)

    model = models.Classifier(vocab=vocab,
                              word_embeddings=word_embeddings,
                              encoder=encoder,
                              classifier_feedforward=classifier_feedforward)
    if use_gpu:
        model.cuda()
    else:
        model

    optimizer = optim.Adam(model.parameters(), learning_rate)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      cuda_device=0 if use_gpu else -1,
                      num_epochs=epochs)
    metrics = trainer.train()
    print(metrics)
    return model, vocab
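# A hedged usage sketch for train_cnn (the reader and file path below are
# assumptions for illustration, not part of the function itself; any dataset
# reader that yields instances with a "tokens" text field would do):
# reader = MyClassificationDatasetReader()            # hypothetical reader
# train_instances = reader.read("data/train.jsonl")   # hypothetical path
# model, vocab = train_cnn(train_instances,
#                          batch_size=32,
#                          num_filters=100,
#                          filter_sizes=(2, 3, 4, 5),
#                          epochs=10,
#                          use_gpu=torch.cuda.is_available())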