def test_create_batches_groups_correctly(self):
    iterator = BucketIterator(batch_size=2,
                              padding_noise=0,
                              sorting_keys=[('text', 'num_tokens')])
    batches = list(iterator._create_batches(self.instances, shuffle=False))
    grouped_instances = [batch.instances for batch in batches]
    assert grouped_instances == [[self.instances[4], self.instances[2]],
                                 [self.instances[0], self.instances[1]],
                                 [self.instances[3]]]
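# The tests in this section rely on a `self.instances` fixture that is not shown
# here. Below is a minimal, illustrative sketch of what such a fixture could look
# like, assuming each instance carries a single 'text' TextField and a hypothetical
# `create_instance` helper (not the actual test base class):

from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token


def create_instance(tokens):
    # Wrap a list of strings in a single 'text' TextField.
    text_field = TextField([Token(t) for t in tokens],
                           {"tokens": SingleIdTokenIndexer()})
    return Instance({"text": text_field})


# Token lengths 4, 4, 3, 9 and 1, so sorting by ('text', 'num_tokens')
# yields the index order 4 -> 2 -> 0 -> 1 -> 3 asserted in the test above.
instances = [
    create_instance(["this", "is", "a", "sentence"]),
    create_instance(["this", "is", "another", "sentence"]),
    create_instance(["yet", "another", "sentence"]),
    create_instance(["this", "is", "a", "very", "very", "very",
                     "very", "long", "sentence"]),
    create_instance(["sentence"]),
]
vocab = Vocabulary.from_instances(instances)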
def test_from_params(self):
    # pylint: disable=protected-access
    params = Params({})
    with pytest.raises(ConfigurationError):
        iterator = BucketIterator.from_params(params)

    sorting_keys = [("s1", "nt"), ("s2", "nt2")]
    params['sorting_keys'] = sorting_keys
    iterator = BucketIterator.from_params(params)
    assert iterator._sorting_keys == sorting_keys
    assert iterator._padding_noise == 0.1
    assert not iterator._biggest_batch_first
    assert iterator._batch_size == 32

    params = Params({
            "sorting_keys": sorting_keys,
            "padding_noise": 0.5,
            "biggest_batch_first": True,
            "batch_size": 100
    })
    iterator = BucketIterator.from_params(params)
    assert iterator._sorting_keys == sorting_keys
    assert iterator._padding_noise == 0.5
    assert iterator._biggest_batch_first
    assert iterator._batch_size == 100
def test_biggest_batch_first_works(self):
    iterator = BucketIterator(batch_size=2,
                              padding_noise=0,
                              sorting_keys=[('text', 'num_tokens')],
                              biggest_batch_first=True)
    iterator.index_with(self.vocab)
    batches = list(iterator._create_batches(self.instances, shuffle=False))
    grouped_instances = [batch.instances for batch in batches]
    assert grouped_instances == [[self.instances[3]],
                                 [self.instances[0], self.instances[1]],
                                 [self.instances[4], self.instances[2]]]
def test_create_batches_groups_correctly_with_max_instances(self):
    # If we knew all the instances, the correct order is 4 -> 2 -> 0 -> 1 -> 3.
    # Here max_instances_in_memory is 3, so we load instances [0, 1, 2]
    # and then bucket them by size into batches of size 2 to get [2, 0] -> [1].
    # Then we load the remaining instances and bucket them by size to get [4, 3].
    iterator = BucketIterator(batch_size=2,
                              padding_noise=0,
                              sorting_keys=[('text', 'num_tokens')],
                              max_instances_in_memory=3)
    for test_instances in (self.instances, self.lazy_instances):
        batches = list(iterator._create_batches(test_instances, shuffle=False))
        grouped_instances = [batch.instances for batch in batches]
        assert grouped_instances == [[self.instances[2], self.instances[0]],
                                     [self.instances[1]],
                                     [self.instances[4], self.instances[3]]]
def test_bucket_iterator_maximum_samples_per_batch(self):
    iterator = BucketIterator(
            batch_size=3,
            padding_noise=0,
            sorting_keys=[('text', 'num_tokens')],
            maximum_samples_per_batch=['num_tokens', 9]
    )
    iterator.index_with(self.vocab)
    batches = list(iterator._create_batches(self.instances, shuffle=False))
    stats = self.get_batches_stats(batches)

    # ensure all instances are in a batch
    assert stats['total_instances'] == len(self.instances)

    # ensure correct batch sizes
    assert stats['batch_lengths'] == [2, 2, 1]

    # ensure correct sample sizes (<= 9)
    assert stats['sample_sizes'] == [6, 8, 9]
def test_bucket_iterator_maximum_samples_per_batch(self):
    iterator = BucketIterator(
            batch_size=3,
            padding_noise=0,
            sorting_keys=[('text', 'num_tokens')],
            maximum_samples_per_batch=['num_tokens', 9]
    )
    batches = list(iterator._create_batches(self.instances, shuffle=False))

    # ensure all instances are in a batch
    grouped_instances = [batch.instances for batch in batches]
    num_instances = sum(len(group) for group in grouped_instances)
    assert num_instances == len(self.instances)

    # ensure all batches are sufficiently small
    for batch in batches:
        batch_sequence_length = max(
                [instance.get_padding_lengths()['text']['num_tokens']
                 for instance in batch.instances]
        )
        assert batch_sequence_length * len(batch.instances) <= 9
def test_maximum_samples_per_batch_packs_tightly(self):
    token_counts = [10, 4, 3]
    test_instances = self.create_instances_from_token_counts(token_counts)

    iterator = BucketIterator(
            batch_size=3,
            padding_noise=0,
            sorting_keys=[('text', 'num_tokens')],
            maximum_samples_per_batch=['num_tokens', 11]
    )
    iterator.index_with(self.vocab)
    batches = list(iterator._create_batches(test_instances, shuffle=False))
    stats = self.get_batches_stats(batches)

    # ensure all instances are in a batch
    assert stats['total_instances'] == len(test_instances)

    # ensure correct batch sizes
    assert stats['batch_lengths'] == [2, 1]

    # ensure correct sample sizes (<= 11)
    assert stats['sample_sizes'] == [8, 10]
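# The two tests above call `self.get_batches_stats(batches)`, which is not shown in
# this snippet. A plausible sketch follows (written as a free function for brevity),
# assuming a single 'text' TextField per instance: the "sample size" of a batch is the
# length of its longest 'text' times the number of instances, which matches the inline
# check in the second maximum_samples_per_batch test above.

def get_batches_stats(batches):
    grouped_instances = [batch.instances for batch in batches]
    sample_sizes = []
    for batch in batches:
        # Longest 'text' field in the batch determines the padded length.
        batch_sequence_length = max(
            instance.get_padding_lengths()['text']['num_tokens']
            for instance in batch.instances
        )
        sample_sizes.append(batch_sequence_length * len(batch.instances))
    return {
        'total_instances': sum(len(group) for group in grouped_instances),
        'batch_lengths': [len(group) for group in grouped_instances],
        'sample_sizes': sample_sizes,
    }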
def load_SQUAD1_dataset(cf_a, vocab):
    """
    Loads the dataset and creates iterators and so on
    """
    ## Create the Data Reader with the Tokenization and indexing
    if cf_a.datareader_lazy:
        # If we do lazy loading, training will be slower, but we do not need to
        # hold the whole dataset in RAM. We can also specify:
        instances_per_epoch_train = cf_a.instances_per_epoch_train
        instances_per_epoch_validation = cf_a.instances_per_epoch_validation
        max_instances_in_memory = cf_a.max_instances_in_memory
    else:
        instances_per_epoch_train = None
        instances_per_epoch_validation = None
        max_instances_in_memory = None

    ## Instantiate the datareader
    squad_reader = Squad1Reader(lazy=cf_a.datareader_lazy,
                                tokenizer_indexer_type=cf_a.tokenizer_indexer_type)
    ## Load the datasets
    train_dataset = squad_reader.read(file_path=cf_a.train_squad1_file)
    validation_dataset = squad_reader.read(file_path=cf_a.validation_squad1_file)

    """
    ########################## ITERATORS ############################
    Iterator that will get the samples for the problem.
    """
    if not cf_a.datareader_lazy:
        instances_per_epoch_train = len(train_dataset)
        instances_per_epoch_validation = len(validation_dataset)

    train_iterator = BucketIterator(batch_size=cf_a.batch_size_train,
                                    instances_per_epoch=instances_per_epoch_train,
                                    max_instances_in_memory=max_instances_in_memory,
                                    sorting_keys=[["passage", "num_tokens"],
                                                  ["question", "num_tokens"]])
    train_iterator.index_with(vocab)

    validation_iterator = BucketIterator(batch_size=cf_a.batch_size_validation,
                                         instances_per_epoch=instances_per_epoch_validation,
                                         max_instances_in_memory=max_instances_in_memory,
                                         sorting_keys=[["passage", "num_tokens"],
                                                       ["question", "num_tokens"]])
    validation_iterator.index_with(vocab)

    num_batches = int(np.ceil(instances_per_epoch_train / cf_a.batch_size_train))
    num_batches_validation = int(np.ceil(instances_per_epoch_validation / cf_a.batch_size_validation))

    # Create the iterators over the data:
    train_iterable = train_iterator(train_dataset)
    validation_iterable = validation_iterator(validation_dataset)

    return squad_reader, num_batches, train_iterable, num_batches_validation, validation_iterable
def get_training_values(model, vocab, train_dataset, validation_dataset,
                        tr_data_loss, val_data_loss, KL_loss,
                        final_loss_tr, final_loss_val, batch_size=100):
    model.eval()
    model.set_posterior_mean(True)

    data_loss_validation = 0
    data_loss_train = 0
    loss_validation = 0
    loss_train = 0

    # Create our own iterators for this:
    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("text_field", "num_tokens")])
    iterator.index_with(vocab)

    iterator_validation = BucketIterator(batch_size=batch_size,
                                         sorting_keys=[("text_field", "num_tokens")])
    iterator_validation.index_with(vocab)

    num_batches = int(np.floor(len(train_dataset) / batch_size))
    num_batches_validation = int(np.floor(len(validation_dataset) / batch_size))

    # Create the iterators over the data:
    batches_iterable = iterator(train_dataset)
    batches_iterable_validation = iterator_validation(validation_dataset)

    # Compute the validation losses over the whole validation dataset, in batches.
    for j in range(num_batches_validation):
        batch = next(batches_iterable_validation)
        tensor_dict = batch  # Already converted
        data_loss_validation += model.get_data_loss(tensor_dict["text_field"], tensor_dict["tags_field"])
        loss_validation += model.get_loss(tensor_dict["text_field"], tensor_dict["tags_field"])
    data_loss_validation = data_loss_validation / num_batches_validation
    loss_validation = loss_validation / num_batches_validation

    ## Same for training
    for j in range(num_batches):
        batch = next(batches_iterable)
        tensor_dict = batch  # Already converted
        data_loss_train += model.get_data_loss(tensor_dict["text_field"], tensor_dict["tags_field"])
        loss_train += model.get_loss(tensor_dict["text_field"], tensor_dict["tags_field"])
    data_loss_train = data_loss_train / num_batches
    loss_train = loss_train / num_batches

    tr_data_loss.append(data_loss_train)
    val_data_loss.append(data_loss_validation)
    KL_loss.append(-model.get_KL_loss())
    final_loss_tr.append(loss_train)
    final_loss_val.append(loss_validation)

    model.train()
    model.set_posterior_mean(False)
             similarity_function=simfunc,
             projection_feedforward=projection_feedforward,
             output_feedforward=output_feedforward,
             output_logit=output_logit)

if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1

optimizer = optim.SGD(model.parameters(), lr=0.1)

iterator = BucketIterator(batch_size=2, sorting_keys=[("premise", "num_tokens")])
iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=1,
                  cuda_device=cuda_device)
trainer.train()

# predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
attention = DotProductAttention()
max_decoding_steps = 100  # TODO: make this variable
model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                      target_embedding_dim=ZH_EMBEDDING_DIM,
                      target_namespace='target_tokens',
                      attention=attention,
                      beam_size=8,
                      use_bleu=True).cuda()

optimizer = optim.Adam(model.parameters(), lr=lr)

iterator = BucketIterator(batch_size=32,
                          sorting_keys=[("source_tokens", "num_tokens")])
iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  num_epochs=8,
                  cuda_device=cuda_id)
def main(): parser = argparse.ArgumentParser(description='Input, output and other configurations') # Old eval on general RC # parser.add_argument('--csv_path', type=str, # default="/Users/xinq/Desktop/lit-review/de-contextualize/output/reproducibility_sentence_output_annotated_Xin_021319.csv") parser.add_argument('--csv_path', type=str, default="output/reproducibility_sentence_output_to_annotate_021919_randomized-Xin.csv") # parser.add_argument('--output', type=str, default="../output/reproducibility_sentence_output_to_annotate_new.csv") # parser.add_argument('--no_extract_candidates', dest='extract_candidates', action='store_false', default=True) parser.add_argument('--csv_test_path', type=str, default="output/reproducibility_sentence.csv") parser.add_argument('--csv_out_path', type=str, default="output/reproducibility_sentence_scored.csv") parser.add_argument('--embedding_dim', type=int, default=128) parser.add_argument('--hidden_dim', type=int, default=128) parser.add_argument('--glove', dest='glove', action='store_true', default=False) parser.add_argument('--small_test', dest='small_test', action='store_true', default=False) parser.add_argument('--model_path',type=str,default="model/model.th") parser.add_argument('--vocab_path',type=str,default="model/vocab.th") parser.add_argument('--embedding_path',type=str,default="model/embedding.th") parser.add_argument('--no_test', dest='no_test', action='store_true',default=False) # parser.add_argument('--split', type=int, default=0) args = parser.parse_args() reader = ReproducibilityClaimDatasetReader() train_dataset = reader.read(args.csv_path) reader.switch_to_test() ## Note: we implemented train/dev split (over the single annotation files that we have) ## Note (cont.) such that unlabelled are automatically considered as dev_dataset. dev_dataset = reader.read(args.csv_path) # Using the same path here if args.small_test or args.no_test: test_dataset = dev_dataset else: test_dataset = reader.read(args.csv_test_path) # The test set contains all sentence from 100 CHI 2018 papers vocab = Vocabulary.from_instances(train_dataset + dev_dataset, min_count={'tokens': 3}) # input(vocab._non_padded_namespaces) ## Still confused!! # print(vocab.get_index_to_token_vocabulary("tokens")) ## Output is like {0: '@@PADDING@@', 1: '@@UNKNOWN@@', 2: 'the', 3: 'to', 4: 'of', 5: 'and', 6: 'a', 7: 'in', 8: 'that', 9: 'for', 10: 'with' # print(vocab.__dict__) print("Namespaces of vocab are", vocab._token_to_index.keys()) # input("Get label_idx from label "+str(vocab.get_token_index("2","labels"))+str(type(vocab.get_token_index("2","labels")))) # input("Get label_idx from label "+str(vocab.get_token_index("1","labels"))) # input("Get label_idx from label "+str(vocab.get_token_index("0","labels"))) # input() print(vocab.get_vocab_size("tokens"), "vocab.get_vocab_size(tokens") print(vocab.__dict__['_token_to_index'].__dict__['_non_padded_namespaces']) print(vocab.__dict__['_token_to_index'].__dict__['_padded_function']) print(vocab.__dict__['_padding_token']) print(vocab.__dict__['_oov_token']) # input() EMBEDDING_DIM = args.embedding_dim if not args.glove else 100 HIDDEN_DIM = args.hidden_dim # TODO: switch to Glove for now!? (worked on 022119) # If you go back to where we defined our DatasetReader, the default parameters included a single index called "tokens", \ # so our mapping just needs an embedding corresponding to that index. 
# token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), # embedding_dim=EMBEDDING_DIM) token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM) if args.glove: params = Params({"pretrained_file": "output/glove.6B." + str(EMBEDDING_DIM) + "d" + ".txt", "embedding_dim": EMBEDDING_DIM}) token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM).from_params( vocab=vocab, params=params) # pretrained_file="/Users/xinq/Downloads/glove/glove.6B." + str( # EMBEDDING_DIM) + "d" + ".txt") word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding}) # torch.save(word_embeddings,open("../model/toy","wb")) # word_embeddings=torch.load(open("../model/toy","rb")) lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True)) # batch_size * seqlen * embedding/hidden model = LSTMClassifier(word_embeddings, lstm, vocab) # TODO: implement self-attention based on paper: (efficiency is also important!) # TODO: Option A: biattention (biattentive classifier) # # Compute biattention. This is a special case since the inputs are the same. # attention_logits = encoded_tokens.bmm(encoded_tokens.permute(0, 2, 1).contiguous()) # https://pytorch.org/docs/stable/torch.html#torch.bmm # attention_weights = util.masked_softmax(attention_logits, text_mask) # TODO: confirm where is text_mask -> text_mask = util.get_text_field_mask(tokens).float() # encoded_text = util.weighted_sum(encoded_tokens, attention_weights) # function https://github.com/allenai/allennlp/blob/6d8da97312bfbde05a41558668ff63d92a9928e9/allennlp/nn/util.py#L530 # TODO: Option B: Bilinear attention # Bilinear matrix attention (对吗???) ``X W Y^T + b``. W=weight # intermediate = torch.matmul(matrix_1.unsqueeze(1), weight) # final = torch.matmul(intermediate, matrix_2.unsqueeze(1).transpose(2, 3)) # return self._activation(final.squeeze(1) + self._bias) # # TODO (cont.) a structured self-attentive sentence embedding https://arxiv.org/pdf/1703.03130.pdf # optimizer = optim.SGD(model.parameters(), lr=0.1) # optimizer=optim.Adam(model.parameters,lr=0.1) optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5) # current setting that coverges on train: optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5) iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")]) # sort by num_tokens iterator.index_with(vocab) trainer = Trainer(model=model, optimizer=optimizer, iterator=iterator, train_dataset=train_dataset, # validation_dataset=None, # validation_dataset=train_dataset, patience=10, num_epochs=15) # 10 # seems that w/ Glove 20 will be better... trainer.train() predictor = SentenceTaggerPredictor(model, dataset_reader=reader) # SentenceTagger shares the same logic as sentence classification predictor ''' allennlp/allennlp/commands/predict.py The ``predict`` subcommand allows you to make bulk JSON-to-JSON or dataset to JSON predictions using a trained model and its :class:`~allennlp.service.predictors.predictor.Predictor` wrapper. 
''' if not args.no_test: sents = [] delimiter = "pdf_" # for line in open(args.csv_test_path) for instance in tqdm(test_dataset): # Loop over every single instance on test_dataset # print(instance.fields['tokens']['tokens'].__dict__) # print((instance.fields['tokens'][0].__dict__)) # NOTE: stop here # input() prediction = predictor.predict_instance(instance) # logits = prediction['logits'] # print(logits) softmax = prediction['softmax'] # print(softmax) # input() # label_id = np.argmax(logits) pos_label_idx = vocab.get_token_index("2", "labels") # getting the corresponding dimension integer idx for label "2" pos_score = softmax[pos_label_idx] # print("metadata for this instance",instance.fields['metadata']['sent_id'],type(instance.fields['metadata']['sent_id'])) # print(str(instance.fields['tokens'])) # print(instance.fields['tokens'].get_text()) # input() # input(type(instance.fields['tokens'])) # input(instance.fields['tokens']) # sents.append({"paperID": instance.fields['metadata']['sent_id'].split(delimiter)[0], "sent_pos": int( # instance.fields['metadata']['sent_id'].split(delimiter)[1]), "text": instance.fields['tokens'].get_text(), # "pos_score": float(pos_score)}) sents.append({"paperID": instance.fields['metadata']['sent_id'].split(delimiter)[0], "sent_pos": int( instance.fields['metadata']['sent_id'].split(delimiter)[1]), "text": instance.fields['metadata']['text'], "pos_score": float(pos_score)}) # write output into a .csv file. Takes about 2 mins df = pd.DataFrame(sents) # TODO: change the sort_values criteria when we generate the eval plot # df = df.sort_values(by=['paperID', 'pos_score'], ascending=False) df = df.sort_values(by=['pos_score'], ascending=False) df.to_csv(args.csv_out_path) # print("label_id=np.argmax(logits)", pos_label_idx, model.vocab.get_token_from_index(label_id, 'labels')) # print(instance.__dict__) # print(type(instance)) # logits = predictor.predict("We allow participants to speak out loud.")['logits'] # label_id=np.argmax(logits) # print("label_id=np.argmax(logits)",label_id, model.vocab.get_token_from_index(label_id, 'labels')) # tag_ids = np.argmax(tag_logits, axis=-1) # print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids]) # # Here's how to save the model. with open(args.model_path, 'wb') as f: torch.save(model.state_dict(), f) vocab.save_to_files(args.vocab_path) torch.save(word_embeddings,open(args.embedding_path,"wb"))
def train():
    reader = PWKPReader()
    train_dataset = reader.read(train_path)
    valid_dataset = reader.read(dev_path)

    if os.path.exists(vocab_dir):
        vocab = Vocabulary.from_files(vocab_dir)
    else:
        vocab = Vocabulary.from_instances(instances=train_dataset,
                                          max_vocab_size=opt.vocab_size)
        vocab.save_to_files(vocab_dir)

    iterator = BucketIterator(batch_size=opt.batch_size,
                              sorting_keys=[("src", "num_tokens"), ("tgt", "num_tokens")])
    iterator.index_with(vocab)

    model = Seq2Seq(emb_size=opt.emb_size,
                    hidden_size=opt.hidden_size,
                    enc_layers=opt.enc_layers,
                    dec_layers=opt.dec_layers,
                    dropout=opt.dropout,
                    bidirectional=opt.bidirectional,
                    beam_size=opt.beam_size,
                    label_smoothing=opt.label_smoothing,
                    vocab=vocab)

    optimizer = optim.Adam(model.parameters(), lr=opt.lr)
    # learning_rate_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=1, gamma=opt.lr_decay)

    val_iterator = BasicIterator(batch_size=opt.batch_size)
    val_iterator.index_with(vocab)

    predictor = Predictor(iterator=val_iterator,
                          max_decoding_step=opt.max_step,
                          vocab=vocab,
                          reader=reader,
                          data_path=test_path,
                          log_dir=save_dir,
                          map_path=ner_path,
                          cuda_device=opt.gpu)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        # learning_rate_scheduler=learning_rate_scheduler,
        learning_rate_decay=opt.lr_decay,
        ema_decay=opt.ema_decay,
        predictor=predictor,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=valid_dataset,
        validation_metric='+bleu',
        cuda_device=opt.gpu,
        num_epochs=opt.epoch,
        serialization_dir=save_dir,
        num_serialized_models_to_keep=5,
        # model_save_interval=60,
        # summary_interval=500,
        should_log_parameter_statistics=False,
        grad_norm=10)

    trainer.train()
    UccaSpanParserDatasetReader(word_tokenizer, word_indexer).read(folder)
    for folder in [train_dataset_folder, validation_dataset_folder])

if os.path.exists(vocab_dir):
    vocab = Vocabulary.from_files(vocab_dir)
else:
    vocab = Vocabulary.from_instances(itertools.chain(train_ds, validation_ds))
    vocab.save_to_files(vocab_dir)

vocab_namespaces = vocab._index_to_token.keys()
max_vocab_size = max(
    [vocab.get_vocab_size(namespace) for namespace in vocab_namespaces])

iterator = BucketIterator(
    batch_size=batch_size,
    # This is for testing, to see how big a batch size the GPU can handle.
    biggest_batch_first=True,
    sorting_keys=[("tokens", "num_tokens")],
)
iterator.index_with(vocab)

linguistic_features_embedding = Embedding(
    num_embeddings=max_vocab_size + 2,
    embedding_dim=linguistic_features_embedding_dim,
    # padding_index=0  # I do not understand what this does
)

bert_embedder = PretrainedBertEmbedder(
    pretrained_model=bert_mode,
    top_layer_only=False,
    requires_grad=bert_finetuning,
)

word_embedder = BasicTextFieldEmbedder(
def run_model(args):
    st_ds_conf = get_updated_settings(args)

    reader = data_adapter.GeoQueryDatasetReader()
    training_set = reader.read(config.DATASETS[args.dataset].train_path)
    try:
        validation_set = reader.read(config.DATASETS[args.dataset].dev_path)
    except:
        validation_set = None

    vocab = allennlp.data.Vocabulary.from_instances(training_set)
    model = get_model(vocab, st_ds_conf)

    device_tag = "cpu" if config.DEVICE < 0 else f"cuda:{config.DEVICE}"
    if args.models:
        model.load_state_dict(torch.load(args.models[0], map_location=device_tag))

    if not args.test or not args.models:
        iterator = BucketIterator(sorting_keys=[("source_tokens", "num_tokens")],
                                  batch_size=st_ds_conf['batch_sz'])
        iterator.index_with(vocab)

        optim = torch.optim.Adam(model.parameters(),
                                 lr=config.ADAM_LR,
                                 betas=config.ADAM_BETAS,
                                 eps=config.ADAM_EPS)

        savepath = os.path.join(config.SNAPSHOT_PATH, args.dataset, 'base_s2s',
                                datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
                                + "--" + args.memo)
        if not os.path.exists(savepath):
            os.makedirs(savepath, mode=0o755)

        trainer = allennlp.training.Trainer(
            model=model,
            optimizer=optim,
            iterator=iterator,
            train_dataset=training_set,
            validation_dataset=validation_set,
            serialization_dir=savepath,
            cuda_device=config.DEVICE,
            num_epochs=config.TRAINING_LIMIT,
            grad_clipping=config.GRAD_CLIPPING,
        )
        trainer.train()
    else:
        testing_set = reader.read(config.DATASETS[args.dataset].test_path)
        model.eval()

        if config.DEVICE > -1:
            model = model.cuda(config.DEVICE)

        predictor = allennlp.predictors.SimpleSeq2SeqPredictor(model, reader)

        for instance in tqdm.tqdm(testing_set, total=len(testing_set)):
            print('SRC: ', instance.fields['source_tokens'].tokens)
            print('GOLD:', ' '.join(
                str(x) for x in instance.fields['target_tokens'].tokens[1:-1]))
            del instance.fields['target_tokens']
            output = predictor.predict_instance(instance)
            print('PRED:', ' '.join(output['predicted_tokens']))
def run(trainp="overnight/calendar_train_delex.tsv", testp="overnight/calendar_test_delex.tsv", batsize=8, embdim=50, encdim=50, maxtime=100, lr=.001, gpu=0, cuda=False, epochs=20): device = torch.device("cuda", gpu) if cuda else torch.device("cpu") tt = q.ticktock("script") tt.tick("loading data") def tokenizer(x: str, splitter: WordSplitter = None) -> List[str]: return [xe.text for xe in splitter.split_words(x)] reader = OvernightReader( partial(tokenizer, splitter=JustSpacesWordSplitter()), partial(tokenizer, splitter=JustSpacesWordSplitter()), SingleIdTokenIndexer(namespace="nl_tokens"), SingleIdTokenIndexer(namespace="fl_tokens")) trainds = reader.read(trainp) testds = reader.read(testp) tt.tock("data loaded") tt.tick("building vocabulary") vocab = Vocabulary.from_instances(trainds) tt.tock("vocabulary built") tt.tick("making iterator") iterator = BucketIterator(sorting_keys=[("nl", "num_tokens"), ("fl", "num_tokens")], batch_size=batsize, biggest_batch_first=True) iterator.index_with(vocab) batch = next(iter(iterator(trainds))) #print(batch["id"]) #print(batch["nl"]) tt.tock("made iterator") # region model nl_emb = Embedding(vocab.get_vocab_size(namespace="nl_tokens"), embdim, padding_index=0) fl_emb = Embedding(vocab.get_vocab_size(namespace="fl_tokens"), embdim, padding_index=0) nl_field_emb = BasicTextFieldEmbedder({"tokens": nl_emb}) fl_field_emb = BasicTextFieldEmbedder({"tokens": fl_emb}) encoder = PytorchSeq2SeqWrapper( torch.nn.LSTM(embdim, encdim, bidirectional=True, batch_first=True)) attention = DotProductAttention() smodel = Seq2Seq(vocab, nl_field_emb, encoder, maxtime, target_embedding_dim=embdim, attention=attention, target_namespace='fl_tokens', beam_size=1, use_bleu=True) smodel_out = smodel(batch["nl"], batch["fl"]) smodel.to(device) optim = torch.optim.Adam(smodel.parameters(), lr=lr) trainer = Trainer(model=smodel, optimizer=optim, iterator=iterator, train_dataset=trainds, validation_dataset=testds, num_epochs=epochs, cuda_device=gpu if cuda else -1) metrics = trainer.train() sys.exit() class MModel(Model): def __init__(self, nlemb: Embedding, flemb: Embedding, vocab: Vocabulary, **kwargs): super(MModel, self).__init__(vocab, **kwargs) self.nlemb, self.flemb = nlemb, flemb @overrides def forward(self, nl: Dict[str, torch.Tensor], fl: Dict[str, torch.Tensor], id: Any): nlemb = self.nlemb(nl["tokens"]) flemb = self.flemb(fl["tokens"]) print(nlemb.size()) pass m = MModel(nl_emb, fl_emb, vocab) batch = next(iter(iterator(trainds))) out = m(**batch)
def main(param2val): # params params = Params.from_param2val(param2val) print(params, flush=True) # paths project_path = Path(param2val['project_path']) save_path = Path(param2val['save_path']) srl_eval_path = project_path / 'perl' / 'srl-eval.pl' data_path_mlm = project_path / 'data' / 'training' / f'{params.corpus_name}_mlm.txt' data_path_train_srl = project_path / 'data' / 'training' / f'{params.corpus_name}_no-dev_srl.txt' data_path_devel_srl = project_path / 'data' / 'training' / f'human-based-2018_srl.txt' data_path_test_srl = project_path / 'data' / 'training' / f'human-based-2008_srl.txt' childes_vocab_path = project_path / 'data' / f'{params.corpus_name}_vocab.txt' google_vocab_path = project_path / 'data' / 'bert-base-cased.txt' # to get word pieces # word-piece tokenizer - defines input vocabulary vocab = load_vocab(childes_vocab_path, google_vocab_path, params.vocab_size) # TODO testing google vocab with wordpieces assert vocab['[PAD]'] == 0 # AllenNLP expects this assert vocab['[UNK]'] == 1 # AllenNLP expects this assert vocab['[CLS]'] == 2 assert vocab['[SEP]'] == 3 assert vocab['[MASK]'] == 4 wordpiece_tokenizer = WordpieceTokenizer(vocab) print(f'Number of types in vocab={len(vocab):,}') # load utterances for MLM task utterances = load_utterances_from_file(data_path_mlm) train_utterances, devel_utterances, test_utterances = split(utterances) # load propositions for SLR task propositions = load_propositions_from_file(data_path_train_srl) train_propositions, devel_propositions, test_propositions = split( propositions) if data_path_devel_srl.is_file( ): # use human-annotated data as devel split print(f'Using {data_path_devel_srl.name} as SRL devel split') devel_propositions = load_propositions_from_file(data_path_devel_srl) if data_path_test_srl.is_file(): # use human-annotated data as test split print(f'Using {data_path_test_srl.name} as SRL test split') test_propositions = load_propositions_from_file(data_path_test_srl) # converters handle conversion from text to instances converter_mlm = ConverterMLM(params, wordpiece_tokenizer) converter_srl = ConverterSRL(params, wordpiece_tokenizer) # get output_vocab # note: Allen NLP vocab holds labels, wordpiece_tokenizer.vocab holds input tokens # what from_instances() does: # 1. it iterates over all instances, and all fields, and all token indexers # 2. the token indexer is used to update vocabulary count, skipping words whose text_id is already set # 4. a PADDING and MASK symbol are added to 'tokens' namespace resulting in vocab size of 2 # input tokens are not indexed, as they are already indexed by bert tokenizer vocab. 
# this ensures that the model is built with inputs for all vocab words, # such that words that occur only in LM or SRL task can still be input # make instances once - this allows iterating multiple times (required when num_epochs > 1) train_instances_mlm = converter_mlm.make_instances(train_utterances) devel_instances_mlm = converter_mlm.make_instances(devel_utterances) test_instances_mlm = converter_mlm.make_instances(test_utterances) train_instances_srl = converter_srl.make_instances(train_propositions) devel_instances_srl = converter_srl.make_instances(devel_propositions) test_instances_srl = converter_srl.make_instances(test_propositions) all_instances_mlm = chain(train_instances_mlm, devel_instances_mlm, test_instances_mlm) all_instances_srl = chain(train_instances_srl, devel_instances_srl, test_instances_srl) # make vocab from all instances output_vocab_mlm = Vocabulary.from_instances(all_instances_mlm) output_vocab_srl = Vocabulary.from_instances(all_instances_srl) # print(f'mlm vocab size={output_vocab_mlm.get_vocab_size()}') # contain just 2 tokens # print(f'srl vocab size={output_vocab_srl.get_vocab_size()}') # contain just 2 tokens assert output_vocab_mlm.get_vocab_size( 'tokens') == output_vocab_srl.get_vocab_size('tokens') # BERT print('Preparing Multi-task BERT...') input_vocab_size = len(converter_mlm.wordpiece_tokenizer.vocab) bert_config = BertConfig( vocab_size_or_config_json_file=input_vocab_size, # was 32K hidden_size=params.hidden_size, # was 768 num_hidden_layers=params.num_layers, # was 12 num_attention_heads=params.num_attention_heads, # was 12 intermediate_size=params.intermediate_size) # was 3072 bert_model = BertModel(config=bert_config) # Multi-tasking BERT mt_bert = MTBert(vocab_mlm=output_vocab_mlm, vocab_srl=output_vocab_srl, bert_model=bert_model, embedding_dropout=params.embedding_dropout) mt_bert.cuda() num_params = sum(p.numel() for p in mt_bert.parameters() if p.requires_grad) print('Number of model parameters: {:,}'.format(num_params), flush=True) # optimizers optimizer_mlm = BertAdam(params=mt_bert.parameters(), lr=params.lr) optimizer_srl = BertAdam(params=mt_bert.parameters(), lr=params.lr) move_optimizer_to_cuda(optimizer_mlm) move_optimizer_to_cuda(optimizer_srl) # batching bucket_batcher_mlm = BucketIterator(batch_size=params.batch_size, sorting_keys=[('tokens', "num_tokens") ]) bucket_batcher_mlm.index_with(output_vocab_mlm) bucket_batcher_srl = BucketIterator(batch_size=params.batch_size, sorting_keys=[('tokens', "num_tokens") ]) bucket_batcher_srl.index_with(output_vocab_srl) # big batcher to speed evaluation - 1024 is too big bucket_batcher_mlm_large = BucketIterator(batch_size=512, sorting_keys=[('tokens', "num_tokens")]) bucket_batcher_srl_large = BucketIterator(batch_size=512, sorting_keys=[('tokens', "num_tokens")]) bucket_batcher_mlm_large.index_with(output_vocab_mlm) bucket_batcher_srl_large.index_with(output_vocab_srl) # init performance collection name2col = { 'devel_pps': [], 'devel_f1s': [], } # init eval_steps = [] train_start = time.time() loss_mlm = None no_mlm_batches = False step = 0 # generators train_generator_mlm = bucket_batcher_mlm(train_instances_mlm, num_epochs=params.num_mlm_epochs) train_generator_srl = bucket_batcher_srl( train_instances_srl, num_epochs=None) # infinite generator num_train_mlm_batches = bucket_batcher_mlm.get_num_batches( train_instances_mlm) if params.srl_interleaved: max_step = num_train_mlm_batches else: max_step = num_train_mlm_batches * 2 print(f'Will stop training at step={max_step:,}') while 
step < max_step: # TRAINING if step != 0: # otherwise evaluation at step 0 is influenced by training on one batch mt_bert.train() # masked language modeling task try: batch_mlm = next(train_generator_mlm) except StopIteration: if params.srl_interleaved: break else: no_mlm_batches = True else: loss_mlm = mt_bert.train_on_batch('mlm', batch_mlm, optimizer_mlm) # semantic role labeling task if params.srl_interleaved: if random.random() < params.srl_probability: batch_srl = next(train_generator_srl) mt_bert.train_on_batch('srl', batch_srl, optimizer_srl) elif no_mlm_batches: batch_srl = next(train_generator_srl) mt_bert.train_on_batch('srl', batch_srl, optimizer_srl) # EVALUATION if step % config.Eval.interval == 0: mt_bert.eval() eval_steps.append(step) # evaluate perplexity devel_generator_mlm = bucket_batcher_mlm_large(devel_instances_mlm, num_epochs=1) devel_pp = evaluate_model_on_pp(mt_bert, devel_generator_mlm) name2col['devel_pps'].append(devel_pp) print(f'devel-pp={devel_pp}', flush=True) # test sentences if config.Eval.test_sentences: test_generator_mlm = bucket_batcher_mlm_large( test_instances_mlm, num_epochs=1) out_path = save_path / f'test_split_mlm_results_{step}.txt' predict_masked_sentences(mt_bert, test_generator_mlm, out_path) # probing - test sentences for specific syntactic tasks for name in config.Eval.probing_names: # prepare data probing_data_path_mlm = project_path / 'data' / 'probing' / f'{name}.txt' if not probing_data_path_mlm.exists(): print(f'WARNING: {probing_data_path_mlm} does not exist') continue probing_utterances_mlm = load_utterances_from_file( probing_data_path_mlm) # check that probing words are in vocab for u in probing_utterances_mlm: # print(u) for w in u: if w == '[MASK]': continue # not in output vocab # print(w) assert output_vocab_mlm.get_token_index( w, namespace='labels'), w # probing + save results to text probing_instances_mlm = converter_mlm.make_probing_instances( probing_utterances_mlm) probing_generator_mlm = bucket_batcher_mlm( probing_instances_mlm, num_epochs=1) out_path = save_path / f'probing_{name}_results_{step}.txt' predict_masked_sentences(mt_bert, probing_generator_mlm, out_path, print_gold=False, verbose=True) # evaluate devel f1 devel_generator_srl = bucket_batcher_srl_large(devel_instances_srl, num_epochs=1) devel_f1 = evaluate_model_on_f1(mt_bert, srl_eval_path, devel_generator_srl) name2col['devel_f1s'].append(devel_f1) print(f'devel-f1={devel_f1}', flush=True) # console min_elapsed = (time.time() - train_start) // 60 pp = torch.exp(loss_mlm) if loss_mlm is not None else np.nan print( f'step {step:<6,}: pp={pp :2.4f} total minutes elapsed={min_elapsed:<3}', flush=True) # only increment step once in each iteration of the loop, otherwise evaluation may never happen step += 1 # evaluate train perplexity if config.Eval.train_split: generator_mlm = bucket_batcher_mlm_large(train_instances_mlm, num_epochs=1) train_pp = evaluate_model_on_pp(mt_bert, generator_mlm) else: train_pp = np.nan print(f'train-pp={train_pp}', flush=True) # evaluate train f1 if config.Eval.train_split: generator_srl = bucket_batcher_srl_large(train_instances_srl, num_epochs=1) train_f1 = evaluate_model_on_f1(mt_bert, srl_eval_path, generator_srl, print_tag_metrics=True) else: train_f1 = np.nan print(f'train-f1={train_f1}', flush=True) # test sentences if config.Eval.test_sentences: test_generator_mlm = bucket_batcher_mlm(test_instances_mlm, num_epochs=1) out_path = save_path / f'test_split_mlm_results_{step}.txt' predict_masked_sentences(mt_bert, 
test_generator_mlm, out_path) # probing - test sentences for specific syntactic tasks for name in config.Eval.probing_names: # prepare data probing_data_path_mlm = project_path / 'data' / 'probing' / f'{name}.txt' if not probing_data_path_mlm.exists(): print(f'WARNING: {probing_data_path_mlm} does not exist') continue probing_utterances_mlm = load_utterances_from_file( probing_data_path_mlm) probing_instances_mlm = converter_mlm.make_probing_instances( probing_utterances_mlm) # batch and do inference probing_generator_mlm = bucket_batcher_mlm(probing_instances_mlm, num_epochs=1) out_path = save_path / f'probing_{name}_results_{step}.txt' predict_masked_sentences(mt_bert, probing_generator_mlm, out_path, print_gold=False, verbose=True) # put train-pp and train-f1 into pandas Series s1 = pd.Series([train_pp], index=[eval_steps[-1]]) s1.name = 'train_pp' s2 = pd.Series([train_f1], index=[eval_steps[-1]]) s2.name = 'train_f1' # return performance as pandas Series series_list = [s1, s2] for name, col in name2col.items(): print(f'Making pandas series with name={name} and length={len(col)}') s = pd.Series(col, index=eval_steps) s.name = name series_list.append(s) return series_list
class SpanBasedModelForAtsa(ModelTrainTemplate.ModelTrainTemplate): """ 2019-acl-Open-Domain Targeted Sentiment Analysisvia Span-Based Extraction and Classification """ def __init__(self, configuration): super().__init__(configuration) self.data_reader = None self.train_data = None self.dev_data = None self.test_data = None self.hard_test_data = None self.distinct_categories = None self.distinct_polarities = None self._load_data() if self.configuration['debug']: self.train_data = self.train_data[:128] self.dev_data = self.dev_data[:128] self.test_data = self.test_data[:128] self.vocab = None self._build_vocab() self.iterator = None self.val_iterator = None self._build_iterator() def _get_data_reader(self): token_indexer = SingleIdTokenIndexer(namespace="tokens") position_indexer = SingleIdTokenIndexer(namespace='position') reader = atsa_data_reader.TextAspectInSentimentOut( self.distinct_polarities, tokenizer=self._get_word_segmenter(), token_indexers={"tokens": token_indexer}, position_indexers={'position': position_indexer}, configuration=self.configuration) return reader def _load_data(self): data_filepath = self.base_data_dir + 'data' if os.path.exists(data_filepath): self.train_data, self.dev_data, self.test_data, self.distinct_polarities, max_aspect_term_num = \ super()._load_object(data_filepath) reader = self._get_data_reader() self.data_reader = reader self.configuration['max_aspect_term_num'] = max_aspect_term_num else: train_dev_test_data, distinct_polarities = self.dataset.generate_atsa_data( ) if self.configuration['data_augmentation']: augment_data_filepath = self.dataset.conceptnet_augment_data_filepath with open(augment_data_filepath, mode='rb') as input_file: augment_data = pickle.load(input_file) distinct_polarities_new = [] for polarity in distinct_polarities: if polarity != 'conflict': distinct_polarities_new.append(polarity) self.distinct_polarities = distinct_polarities_new train_dev_test_data_label_indexed = {} max_aspect_term_num = -1 for data_type, data in train_dev_test_data.items(): if data is None: continue data_new = [] for sample in data: sample_new = [sample[0]] labels_new = [] for label in sample[1]: if label.polarity == 'conflict': continue else: labels_new.append(label) if len(labels_new) != 0: max_aspect_term_num = max(max_aspect_term_num, len(labels_new)) labels_new.sort(key=lambda x: x.from_index) sample_new.append(labels_new) data_new.append(sample_new) train_dev_test_data_label_indexed[data_type] = data_new if self.configuration['sample_mode'] == 'single': max_aspect_term_num = 1 self.configuration['max_aspect_term_num'] = max_aspect_term_num self.model_meta_data['max_aspect_term_num'] = max_aspect_term_num reader = self._get_data_reader() self.data_reader = reader self.train_data = reader.read( train_dev_test_data_label_indexed['train']) self.dev_data = reader.read( train_dev_test_data_label_indexed['dev']) self.test_data = reader.read( train_dev_test_data_label_indexed['test']) data = [ self.train_data, self.dev_data, self.test_data, self.distinct_polarities, max_aspect_term_num ] super()._save_object(data_filepath, data) def _build_vocab(self): if self.configuration['train']: vocab_file_path = self.base_data_dir + 'vocab' if os.path.exists(vocab_file_path): self.vocab = super()._load_object(vocab_file_path) else: data = self.train_data + self.dev_data + self.test_data self.vocab = Vocabulary.from_instances( data, max_vocab_size=sys.maxsize) super()._save_object(vocab_file_path, self.vocab) self.model_meta_data['vocab'] = self.vocab else: self.vocab 
= self.model_meta_data['vocab'] def _build_iterator(self): self.iterator = BucketIterator( batch_size=self.configuration['batch_size'], sorting_keys=[("tokens", "num_tokens")], ) self.iterator.index_with(self.vocab) self.val_iterator = BasicIterator( batch_size=self.configuration['batch_size']) self.val_iterator.index_with(self.vocab) def _print_args(self, model): n_trainable_params, n_nontrainable_params = 0, 0 for p in model.parameters(): n_params = torch.prod(torch.tensor(p.shape)).item() if p.requires_grad: n_trainable_params += n_params else: n_nontrainable_params += n_params self.logger.info( 'n_trainable_params: {0}, n_nontrainable_params: {1}'.format( n_trainable_params, n_nontrainable_params)) self.logger.info('> training arguments:') for arg in self.configuration.keys(): self.logger.info('>>> {0}: {1}'.format(arg, self.configuration[arg])) def _find_model_function_pure(self): return pytorch_models.SpanBasedModel def _get_position_embeddings_dim(self): return 300 def _is_train_token_embeddings(self): return False def _find_model_function(self): embedding_dim = self.configuration['embed_size'] embedding_matrix_filepath = self.base_data_dir + 'embedding_matrix' if os.path.exists(embedding_matrix_filepath): embedding_matrix = super()._load_object(embedding_matrix_filepath) else: embedding_filepath = self.configuration['embedding_filepath'] embedding_matrix = embedding._read_embeddings_from_text_file( embedding_filepath, embedding_dim, self.vocab, namespace='tokens') super()._save_object(embedding_matrix_filepath, embedding_matrix) embedding_matrix = embedding_matrix.to(self.configuration['device']) token_embedding = Embedding( num_embeddings=self.vocab.get_vocab_size(namespace='tokens'), embedding_dim=embedding_dim, padding_index=0, vocab_namespace='tokens', trainable=self._is_train_token_embeddings(), weight=embedding_matrix) # the embedder maps the input tokens to the appropriate embedding matrix word_embedder: TextFieldEmbedder = BasicTextFieldEmbedder( {"tokens": token_embedding}) position_embedding = Embedding( num_embeddings=self.vocab.get_vocab_size(namespace='position'), embedding_dim=self._get_position_embeddings_dim(), padding_index=0) position_embedder: TextFieldEmbedder = BasicTextFieldEmbedder( {"position": position_embedding}, # we'll be ignoring masks so we'll need to set this to True allow_unmatched_keys=True) model_function = self._find_model_function_pure() model = model_function( word_embedder, position_embedder, self.distinct_polarities, self.vocab, self.configuration, ) self._print_args(model) model = model.to(self.configuration['device']) return model def _get_optimizer(self, model): _params = filter(lambda p: p.requires_grad, model.parameters()) return optim.Adam(_params, lr=0.001, weight_decay=0.00001) def _get_estimator(self, model): USE_GPU = torch.cuda.is_available() if USE_GPU: gpu_id = self.configuration['gpu_id'] else: gpu_id = -1 estimator = pytorch_models.SpanBasedModelEstimator( self.model, self.val_iterator, self.distinct_polarities, cuda_device=gpu_id, configuration=self.configuration) return estimator def _get_estimate_callback(self, model): result = [] data_type_and_data = { 'train': self.train_data, 'dev': self.dev_data, 'test': self.test_data } estimator = self._get_estimator(model) estimate_callback = allennlp_callback.EstimateCallback( data_type_and_data, estimator, self.logger) result.append(estimate_callback) return result def _get_loss_weight_callback(self): result = [] set_loss_weight_callback = allennlp_callback.SetLossWeightCallback( 
self.model, self.logger, acd_warmup_epoch_num=self._get_acd_warmup_epoch_num()) result.append(set_loss_weight_callback) return result def _get_fixed_loss_weight_callback(self, model, category_loss_weight=1, sentiment_loss_weight=1): result = [] fixed_loss_weight_callback = allennlp_callback.FixedLossWeightCallback( model, self.logger, category_loss_weight=category_loss_weight, sentiment_loss_weight=sentiment_loss_weight) result.append(fixed_loss_weight_callback) return result def _get_bert_word_embedder(self): return None def _inner_train(self): USE_GPU = torch.cuda.is_available() if USE_GPU: gpu_id = self.configuration['gpu_id'] else: gpu_id = -1 self.model = self._find_model_function() estimator = self._get_estimator(self.model) callbacks = self._get_estimate_callback(self.model) validation_metric = '+accuracy' self.logger.info('validation_metric: %s' % validation_metric) optimizer = self._get_optimizer(self.model) trainer = Trainer(model=self.model, optimizer=optimizer, iterator=self.iterator, train_dataset=self.train_data, validation_dataset=self.dev_data, cuda_device=gpu_id, num_epochs=self.configuration['epochs'], validation_metric=validation_metric, validation_iterator=self.val_iterator, serialization_dir=self.model_dir, patience=self.configuration['patience'], callbacks=callbacks, num_serialized_models_to_keep=0, early_stopping_by_batch=self. configuration['early_stopping_by_batch'], estimator=estimator, grad_clipping=5) metrics = trainer.train() self.logger.info('metrics: %s' % str(metrics)) def _save_model(self): torch.save(self.model, self.best_model_filepath) def _load_model(self): if torch.cuda.is_available(): self.model = torch.load(self.best_model_filepath) else: self.model = torch.load(self.best_model_filepath, map_location=torch.device('cpu')) self.model.configuration = self.configuration def evaluate(self): USE_GPU = torch.cuda.is_available() if USE_GPU: gpu_id = self.configuration['gpu_id'] else: gpu_id = -1 estimator = pytorch_models.SpanBasedModelEstimator( self.model, self.val_iterator, self.distinct_polarities, configuration=self.configuration, cuda_device=gpu_id) data_type_and_data = { 'train': self.train_data, 'dev': self.dev_data, 'test': self.test_data } if self.hard_test_data: data_type_and_data['hard_test'] = self.hard_test_data for data_type, data in data_type_and_data.items(): result = estimator.estimate(data) self.logger.info('data_type: %s result: %s' % (data_type, result)) def predict_backup(self): USE_GPU = torch.cuda.is_available() if USE_GPU: gpu_id = self.configuration['gpu_id'] else: gpu_id = -1 predictor = pytorch_models.SpanBasedModelPredictor( self.model, self.val_iterator, self.distinct_polarities, configuration=self.configuration, cuda_device=gpu_id) data_type_and_data = { # 'train': self.train_data, # 'dev': self.dev_data, 'test': self.test_data } if self.hard_test_data: data_type_and_data['hard_test'] = self.hard_test_data for data_type, data_temp in data_type_and_data.items(): # for multi data = [] for instance in data_temp: aspect_terms = instance.fields['sample'].metadata[ 'aspect_terms'] if len(aspect_terms) != 2: continue data.append(instance) # text = instance.fields['sample'].metadata['text'] # # i love the keyboard and the screen. () # # The best thing about this laptop is the price along with some of the newer features. 
# if 'that any existing MagSafe' in text: # data.append(instance) # break result = predictor.predict(data) correct_sentences = [] for e in result: sentiment_outputs_for_aspect_terms = e[ 'sentiment_outputs_for_aspect_terms'] aspect_terms = e['aspect_terms'] for i in range(len(aspect_terms)): if aspect_terms[ i].polarity != sentiment_outputs_for_aspect_terms[ i][1]: break else: correct_sentences.append(e['text']) file_utils.write_lines(correct_sentences, 'd:/correct_sentences.txt') self.logger.info('data_type: %s result: %s' % (data_type, result)) def predict(self): USE_GPU = torch.cuda.is_available() if USE_GPU: gpu_id = self.configuration['gpu_id'] else: gpu_id = -1 predictor = pytorch_models.SpanBasedModelPredictor( self.model, self.val_iterator, self.distinct_polarities, configuration=self.configuration, cuda_device=gpu_id) data_type_and_data = { # 'train': self.train_data, # 'dev': self.dev_data, 'test': self.test_data } if self.hard_test_data: data_type_and_data['hard_test'] = self.hard_test_data for data_type, data_temp in data_type_and_data.items(): # for multi correct_sentences = file_utils.read_all_lines( 'd:/correct_sentences.txt') for sentence in correct_sentences: data = [] for instance in data_temp: text = instance.fields['sample'].metadata['text'] # i love the keyboard and the screen. () # The best thing about this laptop is the price along with some of the newer features. if sentence in text: data.append(instance) result = predictor.predict(data) if result[0]['aspect_terms'][0].polarity == 'neutral' or result[ 1]['aspect_terms'][0].polarity == 'neutral': continue for e in result: sentiment_outputs_for_aspect_terms = e[ 'sentiment_outputs_for_aspect_terms'] aspect_terms = e['aspect_terms'] for i in range(len(aspect_terms)): if aspect_terms[i].polarity != 'neutral' and aspect_terms[ i].polarity != sentiment_outputs_for_aspect_terms[ i][1]: print() def predict_test(self, output_filepath): USE_GPU = torch.cuda.is_available() if USE_GPU: gpu_id = self.configuration['gpu_id'] else: gpu_id = -1 predictor = pytorch_models.SpanBasedModelPredictor( self.model, self.val_iterator, self.distinct_polarities, configuration=self.configuration, cuda_device=gpu_id) data = self.test_data result = predictor.predict(data) output_lines = [] for sample in result: text = sample['text'] words_for_test = text.split(' ') aspect_terms = sample['aspect_terms'] word_indices_of_aspect_terms = [] for aspect_term in aspect_terms: from_index = aspect_term.from_index term = aspect_term.term start_index = 0 if from_index > 0: start_index = len(text[:from_index].strip().split(' ')) term_length = len(term.split(' ')) word_indices_of_aspect_terms.append( [start_index, start_index + term_length]) sentiment_outputs_for_aspect_terms = sample[ 'sentiment_outputs_for_aspect_terms'] for i in range(len(word_indices_of_aspect_terms)): term = aspect_terms[i].term word_indices = word_indices_of_aspect_terms[i] if term != ' '.join( words_for_test[word_indices[0]:word_indices[1]]): print('error') sentiment = sentiment_outputs_for_aspect_terms[i][1] output_line = json.dumps({ 'text': text, 'aspect_term': '%s-%d-%d' % (term, word_indices[0], word_indices[1]), 'sentiment': sentiment }) output_lines.append(output_line) file_utils.write_lines(output_lines, output_filepath)
def predict(cuda_device: int,
            char_encoder: str,
            data_dir: Path,
            glove_path: Path,
            temp_dir: Path,
            random_seed: int = 13370,
            numpy_seed: int = 1337,
            torch_seed: int = 133) -> List[Tuple[float, float, str]]:
    '''
    This allows you to train an NER model that has either a CNN character
    encoder or an LSTM character encoder, based on the `char_encoder` argument.
    The encoded characters are then combined with 100D Glove vectors and put
    through a Bi-Directional LSTM.

    This is based on the following two papers:

    1. CNN character encoder version `Ma and Hovy \
       <https://arxiv.org/abs/1603.01354>`_
    2. LSTM character encoder version `Lample et al. \
       <https://arxiv.org/abs/1603.01360>`_

    :param cuda_device: Whether to use GPU or CPU, CPU = -1, GPU = 0
    :param char_encoder: Whether to use an LSTM or CNN. Acceptable values are:
                         1. lstm, 2. cnn
    :param data_dir: A file path to a directory that contains three files:
                     1. train.txt, 2. dev.txt, 3. test.txt, which are the train,
                     dev, and test files respectively in CONLL 2003 format,
                     where the NER labels are in BIO format.
    :param glove_path: A file path to the `Glove 6 billion word vectors 100D \
                       <https://nlp.stanford.edu/projects/glove/>`_
    :returns: The results as a list of tuples which are
              (dev f1 score, test f1 score, char encoder), where each entry in
              the list represents a different trained model using the same
              train, dev, and test split but a different random seed.
    '''
    # The dataset we are using has already been formatted from IOB1 to BIO.
    # When reading the dataset, state that the coding is the original, as this
    # will not affect the labels i.e. the labels and schema are not checked.
    label_encoding = 'BIO'
    constrain_crf_decoding = True
    dropout = 0.5

    char_embedding_dim = 30
    cnn_window_size = (3,)
    cnn_filters = 50
    cnn_output_dim = len(cnn_window_size) * cnn_filters

    lstm_char_dim = 25
    lstm_char_output_dim = lstm_char_dim * 2

    word_embedding_dim = 100
    # LSTM size is that of Ma and Hovy
    lstm_dim = 100

    # Dropout applies dropout after the encoded text and after the word embedding.
    #tensorboard_dir = Path('..', 'tensorboard ner')
    #tensorboard_dir.mkdir(parents=True, exist_ok=True)
    #train_log = SummaryWriter(Path(tensorboard_dir, "log", "train"))
    #validation_log = SummaryWriter(Path(tensorboard_dir, "log", "validation"))

    train_fp = Path(data_dir, 'train.txt')
    dev_fp = Path(data_dir, 'dev.txt')
    test_fp = Path(data_dir, 'test.txt')
    result_fp = Path(data_dir, 'results.json')
    result_data = []
    if result_fp.exists():
        with result_fp.open('r') as json_file:
            result_data = json.load(json_file)

    indexers = {'tokens': SingleIdTokenIndexer(namespace='tokens', lowercase_tokens=True),
                'chars': TokenCharactersIndexer(namespace='token_characters')}

    conll_reader = Conll2003DatasetReader(token_indexers=indexers)
    train_dataset = conll_reader.read(cached_path(train_fp))
    dev_dataset = conll_reader.read(cached_path(dev_fp))
    test_dataset = conll_reader.read(cached_path(test_fp))

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset + test_dataset)

    char_embedding = Embedding(num_embeddings=vocab.get_vocab_size("token_characters"),
                               embedding_dim=char_embedding_dim)

    if char_encoder.strip().lower() == 'lstm':
        character_lstm = torch.nn.LSTM(char_embedding_dim, lstm_char_dim,
                                       batch_first=True, bidirectional=True)
        character_lstm_wrapper = PytorchSeq2VecWrapper(character_lstm)
        token_character_encoder = TokenCharactersEncoder(embedding=char_embedding,
                                                         encoder=character_lstm_wrapper)
        total_char_embedding_dim = lstm_char_output_dim
    elif char_encoder.strip().lower() == 'cnn':
        character_cnn = CnnEncoder(embedding_dim=char_embedding_dim,
                                   num_filters=cnn_filters,
                                   ngram_filter_sizes=cnn_window_size,
                                   output_dim=cnn_output_dim)
        token_character_encoder = TokenCharactersEncoder(embedding=char_embedding,
                                                         encoder=character_cnn)
        total_char_embedding_dim = cnn_output_dim
    else:
        raise ValueError('The Character encoder can only be `lstm` or `cnn` '
                         f'and not {char_encoder}')

    glove_path = cached_path(glove_path)
    glove_100_weights = _read_pretrained_embeddings_file(glove_path, word_embedding_dim,
                                                         vocab, 'tokens')
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=word_embedding_dim,
                                weight=glove_100_weights)

    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding,
                                              "chars": token_character_encoder})

    total_embedding_dim = word_embedding_dim + total_char_embedding_dim
    lstm = torch.nn.LSTM(total_embedding_dim, lstm_dim,
                         batch_first=True, bidirectional=True)
    lstm_wrapper = PytorchSeq2SeqWrapper(lstm)

    model = CrfTagger(vocab, word_embeddings, lstm_wrapper,
                      label_encoding=label_encoding, dropout=dropout,
                      constrain_crf_decoding=constrain_crf_decoding)

    optimizer = optim.SGD(model.parameters(), lr=0.015, weight_decay=1e-8)
    schedule = LearningRateWithoutMetricsWrapper(
        torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9524))

    iterator = BucketIterator(batch_size=64,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    temp_dir_fp = str(temp_dir.resolve())
    temp_folder_path = tempfile.mkdtemp(dir=temp_dir_fp)

    set_random_env(cuda_device, random_seed, numpy_seed, torch_seed)
    trainer = Trainer(model=model,
                      grad_clipping=5.0,
                      learning_rate_scheduler=schedule,
                      serialization_dir=temp_folder_path,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      shuffle=True,
                      cuda_device=cuda_device,
                      patience=5,
                      num_epochs=1000)
    #trainer._tensorboard = TensorboardWriter(train_log=train_log,
    #                                         validation_log=validation_log)
    interesting_metrics = trainer.train()
    best_model_weights = Path(temp_folder_path, 'best.th')
    best_model_state = torch.load(best_model_weights)
    model.load_state_dict(best_model_state)

    test_result = evaluate(model, test_dataset, iterator, cuda_device)
    dev_result = evaluate(model, dev_dataset, iterator, cuda_device)
    test_f1 = test_result['f1-measure-overall']
    dev_f1 = dev_result['f1-measure-overall']
    result_data.append((dev_f1, test_f1, char_encoder))

    with result_fp.open('w+') as json_file:
        json.dump(result_data, json_file)
    print(f'{interesting_metrics}')
    return result_data
def main():
    parser = argparse.ArgumentParser(description='Evidence oracle QA')
    parser.add_argument('--epochs', type=int, default=5,
                        help='upper epoch limit (default: 5)')
    parser.add_argument('--patience', type=int, default=1,
                        help='trainer patience (default: 1)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size (default: 32)')
    parser.add_argument('--model_name', type=str, default='sentence_oracle_bert',
                        help='model name (default: sentence_oracle_bert)')
    parser.add_argument('--tunable', action='store_true',
                        help='tune the underlying embedding model (default: False)')
    parser.add_argument('--ev_type', type=str, default='sentence',
                        help='how to train the oracle - sentence or full (evidence) (default: sentence)')
    args = parser.parse_args()

    if args.ev_type == 'sentence':
        train = pickle.load(open('data/oracle_train.p', 'rb'))
        valid = pickle.load(open('data/oracle_val.p', 'rb'))
        test = pickle.load(open('data/oracle_test.p', 'rb'))
    elif args.ev_type == 'full':
        train = pickle.load(open('data/oracle_full_train.p', 'rb'))
        valid = pickle.load(open('data/oracle_full_val.p', 'rb'))
        test = pickle.load(open('data/oracle_full_test.p', 'rb'))
    else:
        print('ev_type should be either sentence or full')
        return

    bert_token_indexer = {'bert': PretrainedBertIndexer('scibert/vocab.txt', max_pieces=512)}

    pipeline_train = pickle.load(open('data/train_instances.p', 'rb'))
    pipeline_val = pickle.load(open('data/val_instances.p', 'rb'))
    pipeline_test = pickle.load(open('data/test_instances.p', 'rb'))

    pipeline_reader = PipelineDatasetReader(bert_token_indexer)
    p_train = pipeline_reader.read(pipeline_train)
    p_val = pipeline_reader.read(pipeline_val)
    p_test = pipeline_reader.read(pipeline_test)

    p_vocab = Vocabulary.from_instances(p_train + p_val + p_test)

    reader = EIDatasetReader(bert_token_indexer)
    train_data = reader.read(train)
    valid_data = reader.read(valid)
    test_data = reader.read(test)

    bert_token_embedding = PretrainedBertEmbedder(
        'scibert/weights.tar.gz', requires_grad=args.tunable
    )

    word_embeddings = BasicTextFieldEmbedder(
        {"bert": bert_token_embedding},
        {"bert": ['bert']},
        allow_unmatched_keys=True
    )

    model = Oracle(word_embeddings, p_vocab)

    cuda_device = list(range(torch.cuda.device_count()))
    if torch.cuda.is_available():
        model = model.cuda()
    else:
        cuda_device = -1

    t_total = len(train_data) // args.epochs

    optimizer = BertAdam(model.parameters(), lr=1e-5, warmup=0.05, t_total=t_total)

    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[('comb_prompt_ev', 'num_tokens')],
                              padding_noise=0.1)
    iterator.index_with(p_vocab)

    serialization_dir = 'model_checkpoints/' + args.model_name

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=valid_data,
                      patience=args.patience,
                      validation_metric='+accuracy',
                      num_epochs=args.epochs,
                      cuda_device=cuda_device,
                      serialization_dir=serialization_dir)

    result = trainer.train()
    for key in result:
        print(str(key) + ': ' + str(result[key]))

    if cuda_device != -1:
        cuda_device = 0
    test_metrics = evaluate(trainer.model, test_data, iterator,
                            cuda_device=cuda_device,
                            batch_weight_key="")

    print('Test Data statistics:')
    for key, value in test_metrics.items():
        print(str(key) + ': ' + str(value))
def main(args): fix_seed() if not os.path.exists(args.model_dir): os.mkdir(args.model_dir) weights_name = get_weights_name(args.transformer_model, args.lowercase_tokens) # read datasets reader = get_data_reader(weights_name, args.max_len, skip_correct=bool(args.skip_correct), skip_complex=args.skip_complex, test_mode=False, tag_strategy=args.tag_strategy, lowercase_tokens=args.lowercase_tokens, max_pieces_per_token=args.pieces_per_token, tn_prob=args.tn_prob, tp_prob=args.tp_prob, special_tokens_fix=args.special_tokens_fix) train_data = reader.read(args.train_set) dev_data = reader.read(args.dev_set) default_tokens = [DEFAULT_OOV_TOKEN, DEFAULT_PADDING_TOKEN] namespaces = ['labels', 'd_tags'] tokens_to_add = {x: default_tokens for x in namespaces} # build vocab if args.vocab_path: vocab = Vocabulary.from_files(args.vocab_path) else: vocab = Vocabulary.from_instances(train_data, max_vocab_size={'tokens': 30000, 'labels': args.target_vocab_size, 'd_tags': 2}, tokens_to_add=tokens_to_add) vocab.save_to_files(os.path.join(args.model_dir, 'vocabulary')) print("Data is loaded") model = get_model(weights_name, vocab, tune_bert=args.tune_bert, predictor_dropout=args.predictor_dropout, label_smoothing=args.label_smoothing, special_tokens_fix=args.special_tokens_fix) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print(device) if torch.cuda.is_available(): if torch.cuda.device_count() > 1: cuda_device = list(range(torch.cuda.device_count())) else: cuda_device = 0 else: cuda_device = -1 if args.pretrain: model.load_state_dict(torch.load(os.path.join(args.pretrain_folder, args.pretrain + '.th'))) model = model.to(device) print("Model is set") optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, factor=0.1, patience=10) instances_per_epoch = None if not args.updates_per_epoch else \ int(args.updates_per_epoch * args.batch_size * args.accumulation_size) iterator = BucketIterator(batch_size=args.batch_size, sorting_keys=[("tokens", "num_tokens")], biggest_batch_first=True, max_instances_in_memory=args.batch_size * 20000, instances_per_epoch=instances_per_epoch, ) iterator.index_with(vocab) trainer = Trainer(model=model, optimizer=optimizer, scheduler=scheduler, iterator=iterator, train_dataset=train_data, validation_dataset=dev_data, serialization_dir=args.model_dir, patience=args.patience, num_epochs=args.n_epoch, cuda_device=cuda_device, shuffle=False, accumulated_batch_count=args.accumulation_size, cold_step_count=args.cold_steps_count, cold_lr=args.cold_lr, cuda_verbose_step=int(args.cuda_verbose_steps) if args.cuda_verbose_steps else None ) print("Start training") trainer.train() # Here's how to save the model. out_model = os.path.join(args.model_dir, 'model.th') with open(out_model, 'wb') as f: torch.save(model.state_dict(), f) print("Model is dumped")
# Now names token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=WORD_EMBEDDING_DIM) word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding, "token_characters" : character_embeddings}) lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True)) model = NamesClassifier(word_embeddings, lstm, vocab) if torch.cuda.is_available(): cuda_device = 0 model = model.cuda(cuda_device) else: cuda_device = -1 # Train the model - 30 epochs seem to give a pretty good baseline accuracy - 0.7 val accuracy optimizer = optim.SGD(model.parameters(), lr=0.1) iterator = BucketIterator(batch_size=2, sorting_keys=[("tokens", "num_tokens"), ("token_characters", "num_token_characters")]) iterator.index_with(vocab) trainer = Trainer(model=model, optimizer=optimizer, iterator=iterator, train_dataset=train_set, validation_dataset=val_set, patience=10, num_epochs=2, cuda_device=cuda_device) trainer.train() # Manually test predictions from allennlp.predictors import Predictor class OwnPredictor(Predictor):
sum(p.numel() for p in model.parameters() if p.requires_grad)) print('Network:', model) # # train # iterBatchSize = 64 _triple_loader = IrTripleDatasetReader( lazy=True, max_doc_length=180, max_query_length=30, tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter() )) # already spacy tokenized, so that it is faster _iterator = BucketIterator(batch_size=iterBatchSize, sorting_keys=[("doc_pos_tokens", "num_tokens"), ("doc_neg_tokens", "num_tokens")]) _iterator.index_with(vocab) # Create a folder which will store the model state, and the results: model name + current timestamp without seconds from datetime import datetime import os dt_string = datetime.now().strftime("%d-%m-%Y-%H_%M") newFolder = str(config["model"]) + "_" + dt_string + '/' resultFolder = pathPrefix + '/air_results/' + newFolder os.mkdir(resultFolder) # %%
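Ranking-style instances like the triples above carry several variable-length text fields, and BucketIterator accepts one (field_name, padding_key) pair per field, sorting by them in order. A toy sketch with hypothetical query_tokens/doc_tokens fields, again assuming the AllenNLP 0.x API used throughout these snippets:

from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.iterators import BucketIterator
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

indexers = {"tokens": SingleIdTokenIndexer()}

def make_pair(query, doc):
    return Instance({"query_tokens": TextField([Token(w) for w in query.split()], indexers),
                     "doc_tokens": TextField([Token(w) for w in doc.split()], indexers)})

instances = [make_pair("a b", "x y z"), make_pair("a", "x y"),
             make_pair("a b c", "x"), make_pair("a b c d", "x y z w")]
vocab = Vocabulary.from_instances(instances)

# Sort primarily by document length, then by query length, so padding stays
# small for both fields inside each batch.
iterator = BucketIterator(batch_size=2,
                          padding_noise=0.0,
                          sorting_keys=[("doc_tokens", "num_tokens"),
                                        ("query_tokens", "num_tokens")])
iterator.index_with(vocab)

for batch in iterator(instances, num_epochs=1, shuffle=False):
    print(batch["doc_tokens"]["tokens"].shape, batch["query_tokens"]["tokens"].shape)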
# read data # token_indexer = SingleIdTokenIndexer() token_indexer = ELMoTokenCharactersIndexer() reader = JigsawDatasetReader(tokenizer=tokenizer, token_indexers={"tokens": token_indexer}) DATA_ROOT = Path("data") / "jigsaw" train_ds, test_ds = (reader.read(DATA_ROOT / fname) for fname in ["train.csv", "test_proced.csv"]) val_ds = None # prepare vocab # vocab = Vocabulary.from_instances(train_ds, max_vocab_size=config.max_vocab_size) vocab = Vocabulary() # prepare iterator iterator = BucketIterator(batch_size=config.batch_size, sorting_keys=[("tokens", "num_tokens")]) iterator.index_with(vocab) # test data # batch = next(iter(iterator(train_ds))) # print(batch) # print(batch.keys()) # print(batch["tokens"]["tokens"].shape) class BaselineModel(Model): def __init__(self, word_embeddings: TextFieldEmbedder, encoder: Seq2VecEncoder, out_sz: int=len(label_cols)): super(BaselineModel, self).__init__(vocab) self.word_embeddings = word_embeddings
def main():
    ###############################################################################################
    prepare_global_logging(serialization_dir=args.serialization_dir, file_friendly_logging=False)

    # DATA
    reader = MathDatasetReader(source_tokenizer=CharacterTokenizer(),
                               target_tokenizer=CharacterTokenizer(),
                               source_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')},
                               target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')},
                               target=False,
                               label=True,
                               lazy=True)
    train_data = reader.read("../../datasets/math/label-data/train-all")
    # val_data = reader.read("../../datasets/math/label-data/interpolate")

    vocab = Vocabulary()
    vocab.add_tokens_to_namespace([START_SYMBOL, END_SYMBOL, ' ', '!', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                                   '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?',
                                   'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
                                   'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
                                   'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
                                   'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}'],
                                  namespace='tokens')
    vocab.add_tokens_to_namespace(['algebra', 'arithmetic', 'calculus', 'comparison', 'measurement', 'numbers',
                                   'polynomials', 'probability'], namespace='labels')

    # MODEL
    embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM)
    source_embedder = BasicTextFieldEmbedder({"tokens": embedding})
    if args.model == 'lstm':
        encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM,
                                                      num_layers=NUM_LAYERS, batch_first=True))
    elif args.model == 'cnn':
        encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM, num_filters=NUM_FILTERS, output_dim=HIDDEN_DIM)
    else:
        raise NotImplementedError("The classifier model should be LSTM or CNN")

    model = TextClassifier(vocab=vocab,
                           source_text_embedder=source_embedder,
                           encoder=encoder)
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.995), eps=1e-6)

    train_iterator = BucketIterator(batch_size=BATCH_SIZE, max_instances_in_memory=1024,
                                    sorting_keys=[("source_tokens", "num_tokens")])
    train_iterator = MultiprocessIterator(train_iterator, num_workers=16)
    train_iterator.index_with(vocab)

    val_iterator = BucketIterator(batch_size=BATCH_SIZE, max_instances_in_memory=1024,
                                  sorting_keys=[("source_tokens", "num_tokens")])
    val_iterator = MultiprocessIterator(val_iterator, num_workers=16)
    val_iterator.index_with(vocab)
    # pdb.set_trace()

    LR_SCHEDULER = {"type": "exponential", "gamma": 0.5, "last_epoch": -1}
    lr_scheduler = LearningRateScheduler.from_params(optimizer, Params(LR_SCHEDULER))

    # TRAIN
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=train_iterator,
                      validation_iterator=None,
                      train_dataset=train_data,
                      validation_dataset=None,
                      patience=None,
                      shuffle=True,
                      num_epochs=1,
                      summary_interval=100,
                      learning_rate_scheduler=lr_scheduler,
                      cuda_device=CUDA_DEVICES,
                      grad_norm=5,
                      grad_clipping=5,
                      model_save_interval=600,
                      serialization_dir=args.serialization_dir,
                      keep_serialized_model_every_num_seconds=3600,
                      should_log_parameter_statistics=True,
                      should_log_learning_rate=True)
    trainer.train()
### Check CUDA
if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1

### Train Model
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)  # or optim.Adam; play with lr and momentum (SGD only)

iterator = BucketIterator(batch_size=32, sorting_keys=[('text', 'num_tokens')])  # batch size 32 for speed, 64 for precision
iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=100,
                  num_epochs=10000,
                  cuda_device=cuda_device)  # select patience and play with the number of epochs
results = trainer.train()
def main(): parser = argparse.ArgumentParser( description='Evidence Inference experiments') parser.add_argument('--cuda_device', type=int, default=0, help='GPU number (default: 0)') parser.add_argument('--epochs', type=int, default=2, help='upper epoch limit (default: 2)') parser.add_argument('--patience', type=int, default=1, help='trainer patience (default: 1)') parser.add_argument('--batch_size', type=int, default=8, help='batch size (default: 8)') parser.add_argument('--dropout', type=float, default=0.2, help='dropout for the model (default: 0.2)') parser.add_argument('--emb_size', type=int, default=256, help='elmo embeddings size (default: 256)') parser.add_argument('--model_name', type=str, default='attention', help='model name (default: attention)') parser.add_argument( '--tunable', action='store_true', help='tune the underlying embedding model (default: False)') args = parser.parse_args() processed_annotations = pickle.load(open('data/data/p_annotations.p', 'rb')) prompts = pd.read_csv('data/data/prompts_merged.csv') prompts_dictionary = {} for index, row in prompts.iterrows(): prompts_dictionary[row['PromptID']] = [ row['Outcome'], row['Intervention'], row['Comparator'] ] for article_key in processed_annotations: for article_item in processed_annotations[article_key]: article_item += prompts_dictionary[article_item[-1]] train = [] valid = [] test = [] with open('data/splits/train_article_ids.txt') as train_file: for line in train_file: train.append(int(line.strip())) with open('data/splits/validation_article_ids.txt') as valid_file: for line in valid_file: valid.append(int(line.strip())) with open('data/splits/test_article_ids.txt') as test_file: for line in test_file: test.append(int(line.strip())) elmo_token_indexer = { 'elmo': ELMoTokenCharactersIndexer(), 'tokens': SingleIdTokenIndexer() } reader = EIDatasetReader(elmo_token_indexer, processed_annotations) train_data = reader.read(train) valid_data = reader.read(valid) test_data = reader.read(test) vocab = Vocabulary.from_instances(train_data + valid_data + test_data) urls = [ 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_' '2xhighway_options.json', 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_' '2xhighway_weights.hdf5' ] elmo_token_embedding = ElmoTokenEmbedder(urls[0], urls[1], dropout=args.dropout, requires_grad=args.tunable, projection_dim=args.emb_size) word_embeddings = BasicTextFieldEmbedder({'elmo': elmo_token_embedding}, allow_unmatched_keys=True) model = Baseline(word_embeddings, vocab) global cuda_device cuda_device = args.cuda_device if torch.cuda.is_available(): logger.info('Running on GPU') model = model.cuda(cuda_device) else: logger.info('Running on CPU') cuda_device = -1 optimizer = torch.optim.Adam(model.parameters(), lr=0.001) iterator = BucketIterator(batch_size=args.batch_size, sorting_keys=[('article', 'num_fields')], padding_noise=0.1) iterator.index_with(vocab) serialization_dir = 'model_checkpoints/' + args.model_name trainer = Trainer(model=model, optimizer=optimizer, iterator=iterator, train_dataset=train_data, validation_dataset=test_data, patience=args.patience, validation_metric='+accuracy', num_epochs=args.epochs, cuda_device=cuda_device, serialization_dir=serialization_dir) result = trainer.train() for key in result: print(str(key) + ': ' + str(result[key])) test_metrics = evaluate(trainer.model, test_data, iterator, cuda_device=cuda_device, batch_weight_key="") 
print('Test Data statistics:') for key, value in test_metrics.items(): print(str(key) + ': ' + str(value))
word_embeddings = BasicTextFieldEmbedder({"tokens": bert_embedder}, allow_unmatched_keys=True) if torch.cuda.is_available(): cuda_device = 0 else: cuda_device = -1 print(cuda_device) mymodel = BERTWino(word_embeddings, vocab, cuda_device) if cuda_device >= 0: mymodel = mymodel.cuda(cuda_device) optimizer = optim.Adam(mymodel.parameters(), lr=LR) iterator = BucketIterator(batch_size=BATCH, sorting_keys=[("sentence", "num_tokens")]) iterator.index_with(vocab) trainer = Trainer(model=mymodel, optimizer=optimizer, iterator=iterator, train_dataset=train_dataset, validation_dataset=validation_dataset, num_epochs=EPOCHS, cuda_device=cuda_device) indexer = PretrainedBertIndexer(pretrained_model="bert-base-cased", do_lowercase=False #max_pieces=config.max_seq_length ) trainer.train()
def main():
    args = get_args()

    # TODO: add char n-gram embeddings
    if args.embedding == 'elmo':
        token_indexer = ELMoTokenCharactersIndexer()
    else:
        token_indexer = SingleIdTokenIndexer()

    # Kaggle's multi-label Toxic Comment Classification Challenge
    reader = JigsawDatasetReader(tokenizer=None,
                                 token_indexers={"tokens": token_indexer},
                                 max_seq_len=200)

    dataset_root = Path('../../data/jigsaw')
    train_dataset, dev_dataset = (reader.read(dataset_root / fname)
                                  for fname in ["train.csv", "test_proced.csv"])
    print(f"total train samples: {len(train_dataset)}, dev samples: {len(dev_dataset)}")

    # Build the vocabulary from the datasets
    # if args.embedding == 'elmo':
    #     vocab = Vocabulary()
    # else:
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset)
    vocab_dim = vocab.get_vocab_size('tokens')
    print("vocab: ", vocab.get_vocab_size('labels'), vocab_dim)

    # Build the token embedding
    token_embedding = None
    print(f"embedding dim: {args.embedding_dim}")
    if args.embedding == 'random':
        token_embedding = Embedding(num_embeddings=vocab_dim, embedding_dim=args.embedding_dim)
    elif args.embedding == 'glove':
        glove_embeddings_file = '~/nlp/pretrainedEmbeddings/glove/glove.6B.100d.txt'
        token_embedding = Embedding.from_params(vocab=vocab,
                                                params=Params({'pretrained_file': glove_embeddings_file,
                                                               'embedding_dim': args.embedding_dim,
                                                               'trainable': False}))
    elif args.embedding == 'elmo':
        # pretrained elmo LM model, transformed from bilm-tf with dump_weights in bin/training.py
        options_file = '~/nlp/pretrainedEmbeddings/elmo/elmo_2x4096_512_2048cnn_2xhighway_options.json'
        weight_file = '~/nlp/pretrainedEmbeddings/elmo/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'
        token_embedding = ElmoTokenEmbedder(options_file, weight_file,
                                            requires_grad=True, do_layer_norm=False)

    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    if args.embedding == 'elmo':
        args.embedding_dim = word_embeddings.get_output_dim()

    # Build the seq2vec encoder
    if args.encoder == 'lstm':
        hidden_dim = 256
        encoder = PytorchSeq2VecWrapper(
            torch.nn.LSTM(args.embedding_dim, hidden_dim, bidirectional=True, batch_first=True))
    elif args.encoder == 'cnn':
        encoder = CnnEncoder(embedding_dim=args.embedding_dim,
                             num_filters=128,
                             ngram_filter_sizes=(2, 3, 4, 5, 6, 7))
    else:
        encoder = None

    # Build the main classification network
    if args.network is None:
        model = MultiLabelClassifier(word_embeddings, 0.5, encoder, 0.2, vocab=vocab, out_dim=6)
    elif args.network == 'bcn':
        # TODO: instantiate the classifier network directly in code instead of from params
        bcn_params = {
            "text_field_embedder": {
                "token_embedders": {
                    "tokens": {
                        "pretrained_file": "/home/lirui/nlp/document-qa/data/glove/glove.840B.300d.txt",
                        "type": "embedding",
                        "embedding_dim": 300,
                        "trainable": False
                    }
                }
            },
            "embedding_dropout": 0.5,
            "pre_encode_feedforward": {
                "input_dim": 300,
                "num_layers": 1,
                "hidden_dims": [300],
                "activations": ["relu"],
                "dropout": [0.25]
            },
            "encoder": {
                "type": "lstm",
                "input_size": 300,
                "hidden_size": 300,
                "num_layers": 1,
                "bidirectional": True
            },
            "integrator": {
                "type": "lstm",
                "input_size": 1800,
                "hidden_size": 300,
                "num_layers": 1,
                "bidirectional": True
            },
            "integrator_dropout": 0.1,
            # "elmo": {
            #     "options_file": "/home/lirui/nlp/learning_allenNLP/learning_allennlp/models/elmo/elmo_2x4096_512_2048cnn_2xhighway_options.json",
            #     "weight_file": "/home/lirui/nlp/learning_allenNLP/learning_allennlp/models/elmo/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5",
            #     "do_layer_norm": False,
            #     "dropout": 0.0,
            #     "num_output_representations": 1
            # },
            # "use_input_elmo": False,
            # "use_integrator_output_elmo": False,
            "output_layer": {
                "input_dim": 2400,
                "num_layers": 3,
                "output_dims": [1200, 600, 5],
                "pool_sizes": 4,
                "dropout": [0.2, 0.3, 0.0]
            }
        }
        model = BiattentiveClassificationNetwork.from_params(vocab, params=Params(bcn_params))

    # Training parameters
    gpu_id = args.gpu_id if torch.cuda.is_available() else -1
    if gpu_id > -1:
        model.cuda(gpu_id)

    # Build the iterator and index it with the vocab
    iterator = BucketIterator(batch_size=args.batch_size, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      grad_norm=5.0,
                      # validation_metric='+accuracy',
                      cuda_device=gpu_id,
                      patience=5,
                      num_epochs=args.n_epochs)
    trainer.train()
def main(): parser = argparse.ArgumentParser( description='Evidence sentence classifier') parser.add_argument('--epochs', type=int, default=5, help='upper epoch limit (default: 5)') parser.add_argument('--patience', type=int, default=1, help='trainer patience (default: 1)') parser.add_argument('--batch_size', type=int, default=8, help='batch size (default: 8)') parser.add_argument( '--loss', type=str, default='hinge', help= 'loss function to train the model - choose bce or hinge (default: hinge)' ) parser.add_argument( '--hinge_margin', type=float, default=0.5, help='the margin for the hinge loss, if used (default: 0.5)') parser.add_argument('--model_name', type=str, default='ev_classifier_bert', help='model name (default: ev_classifier_bert)') parser.add_argument( '--tunable', action='store_true', help='tune the underlying embedding model (default: False)') args = parser.parse_args() if args.loss not in ['bce', 'hinge']: print('Loss must be bce or hinge') return bert_token_indexer = { 'bert': PretrainedBertIndexer('scibert/vocab.txt', max_pieces=512) } pipeline_train = pickle.load(open('data/train_instances.p', 'rb')) pipeline_val = pickle.load(open('data/val_instances.p', 'rb')) pipeline_test = pickle.load(open('data/test_instances.p', 'rb')) pipeline_reader = PipelineDatasetReader(bert_token_indexer) p_train = pipeline_reader.read(pipeline_train) p_val = pipeline_reader.read(pipeline_val) p_test = pipeline_reader.read(pipeline_test) p_vocab = Vocabulary.from_instances(p_train + p_val + p_test) classifier_train = pickle.load(open('data/classifier_train.p', 'rb')) classifier_val = pickle.load(open('data/classifier_val.p', 'rb')) reader = EvidenceDatasetReader(bert_token_indexer) train_data = reader.read(classifier_train) valid_data = reader.read(classifier_val) bert_token_embedding = PretrainedBertEmbedder('scibert/weights.tar.gz', requires_grad=args.tunable) word_embeddings = BasicTextFieldEmbedder({"bert": bert_token_embedding}, {"bert": ['bert']}, allow_unmatched_keys=True) model = Classifier(word_embeddings=word_embeddings, vocab=p_vocab, loss=args.loss, hinge_margin=args.hinge_margin) cuda_device = list(range(torch.cuda.device_count())) if torch.cuda.is_available(): model = model.cuda() else: cuda_device = -1 t_total = len(train_data) // args.epochs optimizer = BertAdam(model.parameters(), lr=2e-5, warmup=0.1, t_total=t_total) iterator = BucketIterator(batch_size=args.batch_size, sorting_keys=[('comb_evidence', 'num_tokens')], padding_noise=0.1, biggest_batch_first=True) iterator.index_with(p_vocab) serialization_dir = 'model_checkpoints/' + args.model_name trainer = Trainer( model=model, optimizer=optimizer, iterator=iterator, train_dataset=train_data, validation_dataset=valid_data, patience=args.patience, validation_metric='+accuracy', num_epochs=args.epochs, cuda_device=cuda_device, # learning_rate_scheduler=scheduler, serialization_dir=serialization_dir) result = trainer.train() for key in result: print(str(key) + ': ' + str(result[key]))
def main(): parser = utils.opt_parser.get_trainer_opt_parser() parser.add_argument('models', nargs='*', help='pretrained models for the same setting') parser.add_argument('--test', action="store_true", help='use testing mode') parser.add_argument('--emb-dim', type=int, help='basic embedding dimension') parser.add_argument('--act-max-layer', type=int, help='maximum number of stacked layers') parser.add_argument('--use-act', action="store_true", help='Use adaptive computation time for decoder') parser.add_argument('--act-loss-weight', type=float, help="the loss of the act weights") parser.add_argument('--enc-layers', type=int, help="layers in encoder") parser.add_argument('--act-mode', choices=['basic', 'random', 'mean_field']) parser.add_argument('--encoder', choices=['transformer', 'lstm', 'bilstm']) parser.add_argument( '--decoder', choices=['lstm', 'rnn', 'gru', 'ind_rnn', 'n_lstm', 'n_gru'], ) parser.add_argument('--dec-cell-height', type=int, help="the height for n_layer lstm/gru") args = parser.parse_args() reader = data_adapter.GeoQueryDatasetReader() training_set = reader.read(config.DATASETS[args.dataset].train_path) try: validation_set = reader.read(config.DATASETS[args.dataset].dev_path) except: validation_set = None vocab = allennlp.data.Vocabulary.from_instances(training_set) if args.epoch: config.TRAINING_LIMIT = args.epoch if args.device: config.DEVICE = args.device st_ds_conf = get_updated_settings(args) model = get_model(vocab, st_ds_conf) if args.models: model.load_state_dict(torch.load(args.models[0])) if not args.test or not args.models: iterator = BucketIterator(sorting_keys=[("source_tokens", "num_tokens") ], batch_size=st_ds_conf['batch_sz']) iterator.index_with(vocab) optim = torch.optim.Adam(model.parameters(), lr=config.ADAM_LR, betas=config.ADAM_BETAS, eps=config.ADAM_EPS) savepath = os.path.join( config.SNAPSHOT_PATH, args.dataset, 'ada_trans2seq', datetime.datetime.now().strftime('%Y%m%d-%H%M%S') + "--" + args.memo) if not os.path.exists(savepath): os.makedirs(savepath, mode=0o755) trainer = allennlp.training.Trainer( model=model, optimizer=optim, iterator=iterator, train_dataset=training_set, validation_dataset=validation_set, serialization_dir=savepath, cuda_device=config.DEVICE, num_epochs=config.TRAINING_LIMIT, grad_clipping=config.GRAD_CLIPPING, ) trainer.train() else: testing_set = reader.read(config.DATASETS[args.dataset].test_path) model.eval() if config.DEVICE > -1: model = model.cuda(config.DEVICE) predictor = allennlp.predictors.SimpleSeq2SeqPredictor(model, reader) for instance in tqdm.tqdm(testing_set, total=len(testing_set)): print('SRC: ', instance.fields['source_tokens'].tokens) print( 'GOLD:', ' '.join( str(x) for x in instance.fields['target_tokens'].tokens[1:-1])) del instance.fields['target_tokens'] output = predictor.predict_instance(instance) print('PRED:', ' '.join(output['predicted_tokens']))
def get_accuracy_detection(model, dev_dataset, vocab, trigger_token_ids=None, snli=False, get_threshold=False, verbose=False): """ When trigger_token_ids is None, gets accuracy on the dev_dataset. Otherwise, gets accuracy with triggers prepended for the whole dev_dataset. """ model.get_metrics(reset=True) model.eval() # model should be in eval() already, but just in case clean_dataset = [] adv_dataset = [] for data in dev_dataset: fields = {} fields['tokens'] = data['tokens'] fields['label'] = LabelField(0, skip_indexing=True) fields['adv'] = LabelField(0, skip_indexing=True) clean_dataset.append(Instance(fields)) fields = {} fields['tokens'] = data['tokens'] fields['label'] = LabelField(1, skip_indexing=True) fields['adv'] = LabelField(1, skip_indexing=True) adv_dataset.append(Instance(fields)) if snli: iterator = BucketIterator(batch_size=128, sorting_keys=[("premise", "num_tokens")]) else: iterator = BucketIterator(batch_size=128, sorting_keys=[("tokens", "num_tokens")]) iterator.index_with(vocab) print_string = [] for idx in trigger_token_ids: print_string += [vocab.get_token_from_index(idx)] logits = [] labels = [] for batch in lazy_groups_of(iterator(clean_dataset, num_epochs=1, shuffle=False), group_size=1): output = evaluate_batch(model, batch, None, snli) logits.append(output['logits'].detach().cpu().numpy()) labels.append(output['labels'].detach().cpu().numpy()) for batch in lazy_groups_of(iterator(adv_dataset, num_epochs=1, shuffle=False), group_size=1): output = evaluate_batch(model, batch, trigger_token_ids, snli) logits.append(output['logits'].detach().cpu().numpy()) labels.append(output['labels'].detach().cpu().numpy()) logits = np.concatenate(logits, 0) labels = np.concatenate(labels, 0) num = int(len(labels) / 2) if not model.use_cosine: if len(logits.shape) > 1: preds_int = np.argmax(logits, 1) preds_int[preds_int > 0] = 1 scores = preds_int else: if "use" in str(type(model)).lower() and model.threshold: best_threshold = model.threshold preds_int = logits <= best_threshold scores = preds_int print(logits) else: fpr, tpr, thresholds = roc_curve(labels, logits) gmeans = np.sqrt(tpr * (1 - fpr)) idx = np.argmax(gmeans) best_threshold = thresholds[idx] print("threshold", best_threshold) print("Median", np.median(logits)) print("TPR:", tpr[idx]) print("FPR:", fpr[idx]) preds_int = logits >= best_threshold scores = logits else: preds_int = (logits >= 0.5) # need to find threshold scores = logits acc = accuracy_score(labels, preds_int) auc = roc_auc_score(labels, scores) remain_clean = np.where(preds_int[:num] == 0)[0] remain_adv = np.where(preds_int[num:] == 0)[0] return acc, auc, remain_clean, remain_adv
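The loops above wrap the iterator in lazy_groups_of(..., group_size=1), which simply packages each tensor batch into a one-element list, so the actual batch dict is the group's first element. A minimal sketch of that pattern (the 'tokens' indexer key is a placeholder for whatever the dataset's token indexer produces):

from allennlp.common.util import lazy_groups_of

for group in lazy_groups_of(iterator(clean_dataset, num_epochs=1, shuffle=False), group_size=1):
    tensor_batch = group[0]                     # the padded tensor dict for this batch
    print(tensor_batch['tokens']['tokens'].shape)
    break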
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
model = LstmTagger(word_embeddings, lstm, vocab)

if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1

optimizer = optim.SGD(model.parameters(), lr=0.1)

# BucketIterator has no default sorting_keys, so they must be given explicitly;
# here we sort by the number of tokens in the sentence field.
iterator = BucketIterator(batch_size=2, sorting_keys=[("sentence", "num_tokens")])
iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=100,
                  cuda_device=cuda_device)
trainer.train()

predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
tag_ids = np.argmax(tag_logits, axis=-1)
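To turn the argmaxed tag_ids above back into tag strings, they can be looked up in the model's label vocabulary; a small sketch, assuming the dataset reader put the tags in the default 'labels' namespace:

predicted_tags = [model.vocab.get_token_from_index(int(i), namespace='labels') for i in tag_ids]
print(predicted_tags)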
def main(): parser = utils.opt_parser.get_trainer_opt_parser() parser.add_argument('models', nargs='*', help='pretrained models for the same setting') parser.add_argument('--enc-layers', type=int, default=1, help="encoder layer number defaulted to 1") parser.add_argument('--test', action="store_true", help='use testing mode') parser.add_argument('--use-dev', action="store_true") args = parser.parse_args() reader = data_adapter.GeoQueryDatasetReader() training_set = reader.read(config.DATASETS[args.dataset].train_path) if args.use_dev: validation_set = reader.read(config.DATASETS[args.dataset].dev_path) vocab = allennlp.data.Vocabulary.from_instances(training_set) st_ds_conf = config.SEQ2SEQ_CONF[args.dataset] if args.epoch: config.TRAINING_LIMIT = args.epoch bsz = st_ds_conf['batch_sz'] emb_sz = st_ds_conf['emb_sz'] src_embedder = BasicTextFieldEmbedder( token_embedders={ "tokens": Embedding(vocab.get_vocab_size('nltokens'), emb_sz) }) encoder = PytorchSeq2SeqWrapper( torch.nn.LSTM(emb_sz, emb_sz, num_layers=args.enc_layers, batch_first=True)) model = allennlp.models.SimpleSeq2Seq( vocab, source_embedder=src_embedder, encoder=encoder, max_decoding_steps=st_ds_conf['max_decoding_len'], attention=allennlp.modules.attention.DotProductAttention(), beam_size=8, target_namespace="lftokens", use_bleu=True) if args.models: model.load_state_dict(torch.load(args.models[0])) if not args.test or not args.models: iterator = BucketIterator(sorting_keys=[("source_tokens", "num_tokens") ], batch_size=bsz) iterator.index_with(vocab) optim = torch.optim.Adam(model.parameters()) savepath = os.path.join( config.SNAPSHOT_PATH, args.dataset, 'seq2seq', datetime.datetime.now().strftime('%Y%m%d-%H%M%S') + "--" + args.memo) if not os.path.exists(savepath): os.makedirs(savepath, mode=0o755) trainer = allennlp.training.Trainer( model=model, optimizer=optim, iterator=iterator, train_dataset=training_set, validation_dataset=validation_set if args.use_dev else None, serialization_dir=savepath, cuda_device=args.device, num_epochs=config.TRAINING_LIMIT, ) trainer.train() else: testing_set = reader.read(config.DATASETS[args.dataset].test_path) model.eval() predictor = allennlp.predictors.SimpleSeq2SeqPredictor(model, reader) for instance in testing_set: print('SRC: ', instance.fields['source_tokens'].tokens) print( 'GOLD:', ' '.join( str(x) for x in instance.fields['target_tokens'].tokens[1:-1])) print( 'PRED:', ' '.join( predictor.predict_instance(instance)['predicted_tokens']))
def train_epoch(model, train_dataset, validation_dataset, batch_size, optimizer,
                log_period, validation_period, save_dir, log_dir, cuda):
    """ Train the model for one epoch. """
    # Set model to train mode (turns on dropout and such).
    model.train()

    # Create objects for calculating metrics.
    span_start_accuracy_metric = CategoricalAccuracy()
    span_end_accuracy_metric = CategoricalAccuracy()
    span_accuracy_metric = BooleanAccuracy()
    squad_metrics = SquadEmAndF1()

    # Create Tensorboard logger.
    writer = SummaryWriter(log_dir)

    # Build iterator, and have it bucket batches by passage / question length.
    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("passage", "num_tokens"),
                                            ("question", "num_tokens")])
    num_training_batches = iterator.get_num_batches(train_dataset)

    # Get a generator of train batches.
    train_generator = tqdm(iterator(train_dataset, num_epochs=1,
                                    cuda_device=0 if cuda else -1),
                           total=num_training_batches, leave=False)
    log_period_losses = 0

    for batch in train_generator:
        # Extract the relevant data from the batch.
        passage = batch["passage"]["tokens"]
        question = batch["question"]["tokens"]
        span_start = batch["span_start"]
        span_end = batch["span_end"]
        metadata = batch.get("metadata", {})

        # Run data through model to get start and end logits.
        output_dict = model(passage, question)
        start_logits = output_dict["start_logits"]
        end_logits = output_dict["end_logits"]
        softmax_start_logits = output_dict["softmax_start_logits"]
        softmax_end_logits = output_dict["softmax_end_logits"]

        # Calculate loss for start and end indices.
        loss = nll_loss(softmax_start_logits, span_start.view(-1))
        loss += nll_loss(softmax_end_logits, span_end.view(-1))
        log_period_losses += loss.item()

        # Backprop and take a gradient step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        model.global_step += 1

        # Calculate categorical span start and end accuracy.
        span_start_accuracy_metric(start_logits, span_start.view(-1))
        span_end_accuracy_metric(end_logits, span_end.view(-1))
        # Compute the best span, and calculate overall span accuracy.
        best_span = get_best_span(start_logits, end_logits)
        span_accuracy_metric(best_span, torch.stack([span_start, span_end], -1))
        # Calculate EM and F1 scores
        calculate_em_f1(best_span, metadata, passage.size(0), squad_metrics)

        if model.global_step % log_period == 0:
            # Calculate metrics on train set.
            loss = log_period_losses / log_period
            span_start_accuracy = span_start_accuracy_metric.get_metric(reset=True)
            span_end_accuracy = span_end_accuracy_metric.get_metric(reset=True)
            span_accuracy = span_accuracy_metric.get_metric(reset=True)
            em, f1 = squad_metrics.get_metric(reset=True)
            tqdm_description = _make_tqdm_description(loss, em, f1)
            # Log training statistics to progress bar
            train_generator.set_description(tqdm_description)
            # Log training statistics to Tensorboard
            log_to_tensorboard(writer, model.global_step, "train", loss,
                               span_start_accuracy, span_end_accuracy,
                               span_accuracy, em, f1)
            log_period_losses = 0

        if model.global_step % validation_period == 0:
            # Calculate metrics on validation set.
            (loss, span_start_accuracy, span_end_accuracy,
             span_accuracy, em, f1) = evaluate(model, validation_dataset, batch_size, cuda)
            # Save a checkpoint.
            save_name = ("{}_step_{}_loss_{:.3f}_"
                         "em_{:.3f}_f1_{:.3f}.pth".format(
                             model.__class__.__name__, model.global_step, loss, em, f1))
            save_model(model, save_dir, save_name)
            # Log validation statistics to Tensorboard.
            log_to_tensorboard(writer, model.global_step, "validation", loss,
                               span_start_accuracy, span_end_accuracy,
                               span_accuracy, em, f1)
def main(): print("===experiment starts===") exp_start_time = time.time() P = Params() opts = P.opts experiment_logdir = experiment_logger(args=opts) print("experiment_logdir:", experiment_logdir) P.dump_params(experiment_dir=experiment_logdir) cuda_devices = cuda_device_parser(str_ids=opts.cuda_devices) TRAIN_WORLDS, DEV_WORLDS, TEST_WORLDS = worlds_loader(args=opts) vocab = Vocabulary() iterator_for_training_and_evaluating_mentions = BucketIterator(batch_size=opts.batch_size_for_train, sorting_keys=[('context', 'num_tokens')]) iterator_for_training_and_evaluating_mentions.index_with(vocab) embloader = EmbLoader(args=opts) emb_mapper, emb_dim, textfieldEmbedder = embloader.emb_returner() tokenIndexing = TokenIndexerReturner(args=opts) global_tokenizer = tokenIndexing.berttokenizer_returner() global_tokenIndexer = tokenIndexing.token_indexer_returner() mention_encoder = Pooler_for_mention(args=opts, word_embedder=textfieldEmbedder) entity_encoder = Pooler_for_title_and_desc(args=opts, word_embedder=textfieldEmbedder) model = Biencoder(args=opts, mention_encoder=mention_encoder, entity_encoder=entity_encoder, vocab=vocab) model = model.cuda() optimizer = optim.Adam(filter(lambda param: param.requires_grad, model.parameters()), lr=opts.lr, eps=opts.epsilon, weight_decay=opts.weight_decay, betas=(opts.beta1, opts.beta2), amsgrad=opts.amsgrad) devEvalEpochs = [j for j in range(1, 1000)] if opts.add_hard_negatives else \ [1, 3, 5] + [k * 10 for k in range(1, 100)] for epoch in range(opts.num_epochs): oneep_train_start = time.time() for world_name in TRAIN_WORLDS: reader = WorldsReader(args=opts, world_name=world_name, token_indexers=global_tokenIndexer, tokenizer=global_tokenizer) if opts.add_hard_negatives: with torch.no_grad(): mention_encoder.eval(), entity_encoder.eval() hardNegativeSearcher = HardNegativesSearcherForEachEpochStart(args=opts, world_name=world_name, reader=reader, embedder=textfieldEmbedder, mention_encoder=mention_encoder, entity_encoder=entity_encoder, vocab=vocab, berttokenizer=global_tokenizer, bertindexer=global_tokenIndexer) hardNegativeSearcher.hardNegativesSearcherandSetter() trains = reader.read('train') mention_encoder.train(), entity_encoder.train() trainer = Trainer(model=model, optimizer=optimizer, iterator=iterator_for_training_and_evaluating_mentions, train_dataset=trains, cuda_device=cuda_devices, num_epochs=1 ) trainer.train() if epoch + 1 in devEvalEpochs: print('\n===================\n', 'TEMP DEV EVALUATION@ Epoch', epoch + 1,'\n===================\n') t_entire_h1c, t_entire_h10c, t_entire_h50c, t_entire_h64c, t_entire_h100c, t_entire_h500c, t_entire_datapoints \ = oneLineLoaderForDevOrTestEvaluation( dev_or_test_flag='dev', opts=opts, global_tokenIndexer=global_tokenIndexer, global_tokenizer=global_tokenizer, textfieldEmbedder=textfieldEmbedder, mention_encoder=mention_encoder, entity_encoder=entity_encoder, vocab=vocab, experiment_logdir=experiment_logdir, finalEvalFlag=0, trainEpoch=epoch+1) devEvalExperimentEntireDevWorldLog(experiment_logdir, t_entire_h1c, t_entire_h10c, t_entire_h50c, t_entire_h64c, t_entire_h100c, t_entire_h500c, t_entire_datapoints, epoch=epoch) oneep_train_end = time.time() print('epoch {0} train time'.format(epoch+1), oneep_train_end - oneep_train_start, 'sec') print('====training finished=======') with torch.no_grad(): model.eval() print('===FINAL Evaluation starts===') for dev_or_test_flag in ['dev','test']: print('\n===================\n', dev_or_test_flag, 'EVALUATION', '\n===================\n') entire_h1c, entire_h10c, 
entire_h50c, entire_h64c, entire_h100c, entire_h500c, entire_datapoints \ = oneLineLoaderForDevOrTestEvaluation(dev_or_test_flag=dev_or_test_flag, opts=opts, global_tokenIndexer=global_tokenIndexer, global_tokenizer=global_tokenizer, textfieldEmbedder=textfieldEmbedder, mention_encoder=mention_encoder, entity_encoder=entity_encoder, vocab=vocab, experiment_logdir=experiment_logdir, finalEvalFlag=1, trainEpoch=-1) dev_or_test_finallog(entire_h1c, entire_h10c, entire_h50c, entire_h64c, entire_h100c, entire_h500c, entire_datapoints, dev_or_test_flag, experiment_logdir, ) exp_end_time = time.time() print('===experiment finished', exp_end_time-exp_start_time, 'sec') print(experiment_logdir)
from embeddings import get_token_utils, get_embedder
from dataloaders import ReutersDataSetReader, NewsGroupsDataSetReader

token_indexers, tokenizer = get_token_utils()
# reader = ReutersDataSetReader(tokenizer=tokenizer,  # TODO: the key for the token_indexer
#                               token_indexers={'tokens': token_indexers})
# train_ds, test_ds = [reader.read(fname) for fname in ['train.json', 'test.json']]
reader = NewsGroupsDataSetReader(tokenizer=tokenizer,  # TODO: the key for the token_indexer
                                 token_indexers={'tokens': token_indexers})
train_ds, test_ds = [reader.read(fname) for fname in ['train', 'test']]
val_ds = None

voc = Vocabulary()
iterator = BucketIterator(batch_size=config.batch_size, sorting_keys=[('sentence', 'num_tokens')])
iterator.index_with(vocab=voc)

# 2. Build the model
word_embeddings = get_embedder()
encoder = get_encoder(voc, word_embeddings.get_output_dim())
model = BaseModelWithoutKnowledge(voc=voc, word_embeddings=word_embeddings, encoder=encoder,
                                  out_sz=reader.label_length, multi=False)
model = model.cuda(cuda_device) if cuda_device > -1 else model

# 3. Training
""" ############### Instantiate the model and optimizer ################## """ model = Ncut.NameCountryModel(cf_a, vocab) optimizer = optim.SGD(model.parameters(), lr=0.01) cf_a.optimizer = optimizer model.to(device = device, dtype = dtype) """ ############ Iterator that will get the samples for the problem ############# """ batch_size=10 batch_size_validation = 100 iterator = BucketIterator(batch_size=batch_size, sorting_keys=[("text_field", "num_tokens")]) iterator.index_with(vocab) iterator_validation = BucketIterator(batch_size = batch_size_validation, sorting_keys=[("text_field", "num_tokens")]) iterator_validation.index_with(vocab) num_batches = int(np.floor(len(train_dataset)/batch_size)) num_batches_validation = int(np.floor(len(validation_dataset)/batch_size_validation)) # Create the iterator over the data: batches_iterable = iterator(train_dataset) batches_iterable_validation = iterator_validation(validation_dataset) """ ############################################################################## ######################### TRAINING ####################################### Probably should not use this one because we want more features for the Bayesian elements.
def tokenizer(x): return [w.text for w in SpacyWordSplitter(language='en_core_web_sm',pos_tags=False).split_words(x)[:config.max_seq_len]] reader = JigsawDataReader(tokenizer=tokenizer,token_indexers={'tokens':token_indexers}) train_ds,test_ds = (reader.read(DATA_PATH+w) for w in ["train.csv", "test_proced.csv"]) val_ds = None vars(train_ds[0].fields['tokens']) from allennlp.data.vocabulary import Vocabulary vocab = Vocabulary.from_instances(train_ds,max_vocab_size = config.max_vocab_size) from allennlp.data.iterators import BucketIterator iterator = BucketIterator(batch_size=config.batch_size,sorting_keys=[('tokens','num_tokens')],) iterator.index_with(vocab) batch = next(iter(iterator(train_ds))) from allennlp.modules.seq2vec_encoders import Seq2VecEncoder,PytorchSeq2VecWrapper from allennlp.modules.text_field_embedders import TextFieldEmbedder from allennlp.nn.util import get_text_field_mask from allennlp.models import Model class BaslinModel(Model): def __init__(self,word_embeddings,encoder,out_sz=len(label_cols)): super().__init__(vocab) self.word_embeddings = word_embeddings self.encoder = encoder
#### For embedding the tokens we'll just use the BasicTextFieldEmbedder, which takes a mapping from index names to embeddings. If you go back to where we defined our DatasetReader, the default parameters included a single index called "tokens", so our mapping just needs an embedding corresponding to that index. We use the Vocabulary to find how many embeddings we need and our EMBEDDING_DIM parameter to specify the output dimension. It's also possible to start with pre-trained embeddings (for example, GloVe vectors), but there's no need to do that on this tiny toy dataset.
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

#### We next need to specify the sequence encoder. The need for PytorchSeq2SeqWrapper here is slightly unfortunate (and if you use configuration files, https://github.com/allenai/allennlp/blob/master/tutorials/tagger/README.md#using-config-files, you won't need to worry about it), but here it's required to add some extra functionality (and a cleaner interface) to the built-in PyTorch module. In AllenNLP we do everything batch first, so we specify that as well.
lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

#### Finally, we can instantiate the model.
model = LstmTagger(word_embeddings, lstm, vocab)

#### Now we're ready to train the model. The first thing we'll need is an optimizer. We can just use PyTorch's stochastic gradient descent.
optimizer = optim.SGD(model.parameters(), lr=0.1)

#### And we need a DataIterator that handles batching for our datasets. The BucketIterator sorts instances by the specified fields in order to create batches with similar sequence lengths. Here we indicate that we want to sort the instances by the number of tokens in the sentence field.
iterator = BucketIterator(batch_size=2, sorting_keys=[("sentence", "num_tokens")])

#### We also specify that the iterator should make sure its instances are indexed using our vocabulary; that is, that their strings have been converted to integers using the mapping we previously created.
iterator.index_with(vocab)

#### Now we instantiate our Trainer and run it. Here we tell it to run for 1000 epochs and to stop training early if it ever spends 10 epochs without the validation metric improving. The default validation metric is loss (which improves by getting smaller), but it's also possible to specify a different metric and direction (e.g. accuracy should get bigger).
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=1000)

#### When we launch it it will print a progress bar for each epoch that includes both the "loss" and the "accuracy" metric. If our model is good, the loss should go down and the accuracy up as we train.
trainer.train()
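To see the bucketing described above in action, it can help to peek at a single tensor batch produced by the iterator; a small sketch, assuming the tutorial's reader named its text field "sentence" and its tag field "labels":

batch = next(iter(iterator(train_dataset, num_epochs=1, shuffle=False)))
print(batch.keys())                        # e.g. dict_keys(['sentence', 'labels'])
print(batch['sentence']['tokens'].shape)   # (batch_size, longest sentence in this batch)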
word = vocab.get_token_from_index(i, 'tokens') if word in word_vector.vocab: pretrained_weight[vocab.get_token_index(word)] = word_vector[word] del word_vector token_embedding = Embedding( num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=args.embedding_size, weight=torch.from_numpy(pretrained_weight).float()) word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding}) model = ATAE(args, word_embeddings, vocab) optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=1e-5) iterator = BucketIterator(batch_size=args.batch_size, sorting_keys=[("trigger_0", "num_tokens")]) iterator.index_with(vocab) trainer = Trainer( model=model, optimizer=optimizer, iterator=iterator, train_dataset=train_dataset, validation_dataset=eval_dataset, num_epochs=args.epochs, patience=args.patience, # stop training before loss raise cuda_device=args.cuda_device, # cuda device id ) # start train metrics = trainer.train()