def train_gemb(args):
    config = configuration.get_config(args.config)
    i = 0
    global_step = 0
    epoch = 0
    train_loss = 0.0

    with Timer('Data loading'):
        vocab_path = args.vocab if args.vocab != '' else None
        label_path = args.labels if args.labels != '' else None
        gold_props_path = args.gold if args.gold != '' else None

        print('Task: {}'.format(args.task))
        if args.task == 'srl':
            # Data and evaluator for SRL.
            data = TaggerData(
                config,
                *reader.get_srl_data(config, args.train, args.dev, vocab_path, label_path))
            evaluator = SRLEvaluator(
                data.get_development_data(),  # data for eval script
                data.label_dict,
                gold_props_file=gold_props_path,
                use_se_marker=config.use_se_marker,
                pred_props_file=None,
                word_dict=data.word_dict)
        else:
            # Data and evaluator for PropId.
            data = TaggerData(
                config,
                *reader.get_postag_data(config, args.train, args.dev, vocab_path, label_path))
            evaluator = PropIdEvaluator(data.get_development_data(), data.label_dict)

        batched_dev_data = data.get_gemb_development_data(
            batch_size=config.dev_batch_size)  # data for NN
        print('Dev data has {} batches.'.format(len(batched_dev_data)))

    with Timer('Preparation'):
        if not os.path.isdir(args.model):
            print('Directory {} does not exist. Creating new.'.format(args.model))
            os.makedirs(args.model)
        else:
            if len(os.listdir(args.model)) > 0:
                print('[WARNING] Log directory {} is not empty, previous checkpoints might be overwritten'
                      .format(args.model))
        shutil.copyfile(args.config, os.path.join(args.model, 'config'))
        # Save word and label dict to model directory.
        data.word_dict.save(os.path.join(args.model, 'word_dict'))
        data.label_dict.save(os.path.join(args.model, 'label_dict'))
        writer = open(os.path.join(args.model, 'gemb_checkpoints.tsv'), 'w')
        writer.write('step\tdatetime\tdev_loss\tdev_accuracy\tbest_dev_accuracy\n')

    with Timer('Building model'):
        model = BiLSTMTaggerModel(data, config=config)
        model.load(os.path.join(args.model, 'model.npz'))
        model.add_gemb()
        for param in model.gemb.params:
            print param, param.name, param.shape.eval()
        gemb_loss_function = model.get_gemb_loss_function()
        gemb_eval_function = model.get_eval_with_gemb_function()
        ctx_emb_function = model.get_ctx_emb_function()

    while epoch < config.max_epochs:
        with Timer("Epoch%d" % epoch) as timer:
            train_data = data.get_gemb_training_data(include_last_batch=True)
            for batched_tensor in train_data:
                x, y, oov_pos, _, weights = batched_tensor
                loss = gemb_loss_function(x, weights, oov_pos)
                train_loss += loss
                i += 1
                global_step += 1
                if i % 400 == 0:
                    timer.tick("{} training steps, loss={:.3f}".format(i, train_loss / i))

        train_loss = train_loss / i
        print("Epoch {}, steps={}, loss={:.3f}".format(epoch, i, train_loss))
        i = 0
        epoch += 1
        train_loss = 0.0

        if epoch % config.checkpoint_every_x_epochs == 0:
            with Timer('Evaluation'):
                evaluate_tagger(model, ctx_emb_function, gemb_eval_function,
                                batched_dev_data, evaluator, writer, global_step)

    # Done. :)
    writer.close()
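# train_gemb above fine-tunes an extra "gemb" component of an already-trained tagger: for each
# out-of-vocabulary position (oov_pos) it learns to build an embedding from the surrounding
# context (gemb_loss_function / get_ctx_emb_function). The snippet below is a purely hypothetical
# sketch of one way such a context-to-embedding objective can be set up in PyTorch: predict the
# pre-trained embedding of a held-out position from the BiLSTM states at that position and
# penalize the squared error. All names and sizes here are illustrative assumptions; this is not
# the objective compiled by model.get_gemb_loss_function().
class ExampleContextToEmbedding(torch.nn.Module):
    def __init__(self, emb_dim=100, hidden_dim=128):
        super(ExampleContextToEmbedding, self).__init__()
        self.encoder = torch.nn.LSTM(emb_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.project = torch.nn.Linear(2 * hidden_dim, emb_dim)

    def forward(self, word_vectors, oov_pos):
        # word_vectors: (batch, seq_len, emb_dim); oov_pos: (batch,) index of the held-out word.
        states, _ = self.encoder(word_vectors)
        batch_idx = torch.arange(word_vectors.size(0), dtype=torch.long)
        return self.project(states[batch_idx, oov_pos])  # predicted embedding at each OOV slot

# Usage sketch: loss = torch.nn.functional.mse_loss(predicted, gold_pretrained_embedding)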
def train_tagger(args):
    config = configuration.get_config(args.config)
    numpy.random.seed(666)
    torch.manual_seed(666)
    torch.set_printoptions(precision=20)
    ### gpu
    gpu = torch.cuda.is_available()
    if args.gpu and gpu:
        print("GPU available: {}\t GPU ID: {}".format(gpu, args.gpu))
        torch.cuda.manual_seed(666)
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    i = 0
    global_step = 0
    epoch = 0
    train_loss = 0.0

    with Timer('Data loading'):
        vocab_path = args.vocab if args.vocab != '' else None
        label_path = args.labels if args.labels != '' else None
        gold_props_path = args.gold if args.gold != '' else None

        print('Task: {}'.format(args.task))
        if args.task == 'srl':
            # Data and evaluator for SRL.
            data = TaggerData(
                config,
                *reader.get_srl_data(config, args.train, args.dev, vocab_path, label_path))
            evaluator = SRLEvaluator(data.get_development_data(),
                                     data.label_dict,
                                     gold_props_file=gold_props_path,
                                     use_se_marker=config.use_se_marker,
                                     pred_props_file=None,
                                     word_dict=data.word_dict)
        else:
            print "Not implemented yet!"
            exit()
            # Data and evaluator for PropId.
            data = TaggerData(
                config,
                *reader.get_postag_data(config, args.train, args.dev, vocab_path, label_path))
            evaluator = PropIdEvaluator(data.get_development_data(), data.label_dict)

        batched_dev_data = data.get_development_data(batch_size=config.dev_batch_size)
        print('Dev data has {} batches.'.format(len(batched_dev_data)))

    with Timer("Get training and devel sentences dict"):
        training_sentences = []
        for sen in get_srl_sentences(args.train):
            if len(sen[1]) <= config.max_train_length:
                training_sentences.append(' '.join(sen[1]))
        training_ids = [int(sen[0][0]) for sen in data.train_sents]
        temp = {}
        assert len(training_sentences) == len(training_ids)
        for idx, sen in zip(training_ids, training_sentences):
            temp[idx] = sen
        training_sentences = temp

        devel_sentences = [' '.join(sen[1]) for sen in get_srl_sentences(args.dev)]
        devel_ids = [int(sen[0][0]) for sen in data.dev_sents]
        temp = {}
        assert len(devel_sentences) == len(devel_ids)
        for idx, sen in zip(devel_ids, devel_sentences):
            temp[idx] = sen
        devel_sentences = temp

    with Timer('Syntactic Information Extracting'):
        # extract the syntactic information from file
        train_dep_trees = SyntacticCONLL()
        dev_dep_trees = SyntacticCONLL()
        train_dep_trees.read_from_file(args.train_dep_trees)
        dev_dep_trees.read_from_file(args.dev_dep_trees)

    with Timer("TPF2 generating..."):
        # generate the tree-based position features according to the dependency tree.
        train_tpf2, data.tpf2_dict = train_dep_trees.get_tpf2_dict(data.train_tensors)
        print("Extract {} training TPF2 features".format(len(train_tpf2)))
        assert len(train_tpf2) == len(data.train_tensors)
        data.tpf2_dict.accept_new = False
        dev_tpf2 = dev_dep_trees.get_tpf2_dict(data.dev_tensors, data.tpf2_dict)
        print("Extract {} dev TPF2 features".format(len(dev_tpf2)))
        assert len(dev_tpf2) == len(data.dev_tensors)

    with Timer("Loading ELMO"):
        train_elmo_hdf5 = hdf5_reader()
        train_elmo_hdf5.read_from_file(args.train_elmo, training_sentences)
        dev_elmo_hdf5 = hdf5_reader()
        dev_elmo_hdf5.read_from_file(args.dev_elmo, devel_sentences)

    with Timer('Preparation'):
        if not os.path.isdir(args.model):
            print('Directory {} does not exist. Creating new.'.format(args.model))
            os.makedirs(args.model)
        else:
            if len(os.listdir(args.model)) > 0:
                print('[WARNING] Log directory {} is not empty, previous checkpoints might be overwritten'
                      .format(args.model))
        shutil.copyfile(args.config, os.path.join(args.model, 'config'))
        # Save word and label dict to model directory.
        data.word_dict.save(os.path.join(args.model, 'word_dict'))
        data.label_dict.save(os.path.join(args.model, 'label_dict'))
        # data.syntactic_dict.save(os.path.join(args.model, 'syn_label_dict'))
        data.tpf2_dict.save(os.path.join(args.model, 'tpf2_dict'))
        writer = open(os.path.join(args.model, 'checkpoints.tsv'), 'w')
        writer.write('step\tdatetime\tdev_loss\tdev_accuracy\tbest_dev_accuracy\n')

    with Timer('Building model'):
        model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
        if args.gpu:
            print "BiLSTMTaggerModel initialize with Cuda!"
            model = model.cuda()
            if args.gpu != "" and not torch.cuda.is_available():
                raise Exception("No GPU Found!")
                exit()
        for param in model.parameters():
            print param.size()

    # initialize the optimizer outside the epoch loop
    optimizer = torch.optim.Adadelta(model.parameters(), lr=1.0, rho=0.95)

    while epoch < config.max_epochs:
        with Timer("Epoch%d" % epoch) as timer:
            model.train()
            train_data = data.get_training_data(include_last_batch=True)
            for batched_tensor in train_data:  # for each batch in the training corpus
                x, y, lengths, weights = batched_tensor
                word_inputs_seqs, predicate_inputs_seqs, tpf_ids, sentences_ids, answers, \
                    input_lengths, masks, padding_answers = \
                    batch_data_variable(train_tpf2, x, y, lengths, weights)
                elmo_representations = train_elmo_hdf5.forward(
                    sentences_ids, word_inputs_seqs.size()[-1], [len(ans) for ans in answers])
                if args.gpu:
                    word_inputs_seqs, predicate_inputs_seqs, tpf_ids, input_lengths, masks, padding_answers = \
                        word_inputs_seqs.cuda(), predicate_inputs_seqs.cuda(), tpf_ids.cuda(), \
                        input_lengths.cuda(), masks.cuda(), padding_answers.cuda()
                    elmo_representations = elmo_representations.cuda()

                optimizer.zero_grad()
                output = model.forward(word_inputs_seqs, predicate_inputs_seqs, tpf_ids,
                                       elmo_representations, input_lengths)
                loss = model.compute_loss(output, padding_answers, masks)
                loss.backward()
                # gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

                # accumulate the tensor value, not the Variable, so the autograd graph is not kept around
                train_loss += loss.data
                i += 1
                global_step += 1
                if i % 400 == 0:
                    timer.tick("{} training steps, loss={:.3f}".format(i, float(train_loss / i)))
                    sys.stdout.flush()

        train_loss = train_loss / i
        print("Epoch {}, steps={}, loss={:.3f}".format(epoch, i, float(train_loss)))
        i = 0
        epoch += 1
        train_loss = 0.0

        if epoch % config.checkpoint_every_x_epochs == 0:
            with Timer('Evaluation'):
                evaluate_tagger(model, batched_dev_data, dev_tpf2, dev_elmo_hdf5,
                                evaluator, writer, global_step)

    # Done. :)
    writer.close()
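# hdf5_reader above loads pre-computed ELMo representations from the HDF5 files passed in
# args.train_elmo / args.dev_elmo. The helper below is only a sketch of the generic h5py access
# pattern such a reader typically relies on, assuming each key in the file maps to an array of
# shape (num_layers, sentence_length, elmo_dim) for one sentence, as is common when ELMo
# embeddings are dumped to HDF5 offline. The key scheme and the layer-averaging step are
# assumptions for illustration, not this repository's exact format.
def example_load_elmo_hdf5(path, sentence_key):
    import h5py
    with h5py.File(path, 'r') as f:
        layers = f[sentence_key][...]   # (num_layers, seq_len, dim)
        return layers.mean(axis=0)      # simple layer average -> (seq_len, dim)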
# Getting evaluator
gold_props_file = args.gold if args.gold != '' else None
pred_props_file = args.inputprops if args.inputprops != '' else None

if args.task == 'srl':
    evaluator = SRLEvaluator(data.get_test_data(test_sentences, batch_size=None),
                             data.label_dict,
                             gold_props_file,
                             use_se_marker=config.use_se_marker,
                             pred_props_file=pred_props_file,
                             word_dict=data.word_dict)
else:
    evaluator = PropIdEvaluator(data.get_test_data(test_sentences, batch_size=None),
                                data.label_dict)

if args.proto != '':
    print 'Writing to proto {}'.format(args.proto)
    pb_file = open(args.proto, 'wb')
else:
    pb_file = None

with Timer("Decoding"):
    transition_params = get_transition_params(data.label_dict.idx2str)
    num_tokens = None

    # Collect sentence length information
    for (i, batched_tensor) in enumerate(test_data):
        _, _, nt, _ = batched_tensor
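# get_transition_params(data.label_dict.idx2str) above builds the pairwise scores that are
# presumably used for constrained (Viterbi-style) decoding over BIO tags. The function below is a
# minimal sketch of how such constraints are commonly constructed, not this repository's
# implementation: a transition into an I-X tag is forbidden (-inf) unless the previous tag is B-X
# or I-X; all other transitions score 0. The helper name and the B-/I- label prefixes are
# assumptions for illustration.
def example_bio_transition_params(label_strs):
    """Return a [num_tags, num_tags] matrix where entry (prev, cur) is 0 if the
    transition prev -> cur is allowed under BIO constraints and -inf otherwise."""
    num_tags = len(label_strs)
    scores = numpy.zeros([num_tags, num_tags], dtype=numpy.float32)
    for prev_idx, prev_label in enumerate(label_strs):
        for cur_idx, cur_label in enumerate(label_strs):
            if cur_label.startswith('I-') and prev_label not in ('B-' + cur_label[2:],
                                                                 'I-' + cur_label[2:]):
                scores[prev_idx, cur_idx] = float('-inf')
    return scores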
def train_tagger(args):
    config = configuration.get_config(args.config)
    numpy.random.seed(666)
    torch.manual_seed(666)
    torch.cuda.manual_seed(666)
    ### gpu
    gpu = torch.cuda.is_available()
    print("GPU available: ", gpu)

    i = 0
    global_step = 0
    epoch = 0
    train_loss = 0.0

    with Timer('Data loading'):
        vocab_path = args.vocab if args.vocab != '' else None
        label_path = args.labels if args.labels != '' else None
        gold_props_path = args.gold if args.gold != '' else None

        print('Task: {}'.format(args.task))
        if args.task == 'srl':
            # Data and evaluator for SRL.
            data = TaggerData(
                config,
                *reader.get_srl_data(config, args.train, args.dev, vocab_path, label_path))
            evaluator = SRLEvaluator(data.get_development_data(),
                                     data.label_dict,
                                     gold_props_file=gold_props_path,
                                     use_se_marker=config.use_se_marker,
                                     pred_props_file=None,
                                     word_dict=data.word_dict)
        else:
            # Data and evaluator for PropId.
            data = TaggerData(
                config,
                *reader.get_postag_data(config, args.train, args.dev, vocab_path, label_path))
            evaluator = PropIdEvaluator(data.get_development_data(), data.label_dict)

        batched_dev_data = data.get_development_data(batch_size=config.dev_batch_size)
        print('Dev data has {} batches.'.format(len(batched_dev_data)))

    with Timer('Preparation'):
        if not os.path.isdir(args.model):
            print('Directory {} does not exist. Creating new.'.format(args.model))
            os.makedirs(args.model)
        else:
            if len(os.listdir(args.model)) > 0:
                print('[WARNING] Log directory {} is not empty, previous checkpoints might be overwritten'
                      .format(args.model))
        shutil.copyfile(args.config, os.path.join(args.model, 'config'))
        # Save word and label dict to model directory.
        data.word_dict.save(os.path.join(args.model, 'word_dict'))
        data.label_dict.save(os.path.join(args.model, 'label_dict'))
        writer = open(os.path.join(args.model, 'checkpoints.tsv'), 'w')
        writer.write('step\tdatetime\tdev_loss\tdev_accuracy\tbest_dev_accuracy\n')

    with Timer('Building model'):
        model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
        if args.gpu:
            print "Use Cuda!"
            model = model.cuda()
            if args.gpu != "" and not torch.cuda.is_available():
                raise Exception("No GPU Found!")
                exit()
        for param in model.parameters():
            print param.size()
        """for param in model.params:
            print param, param.name, param.shape.eval()
        loss_function = model.get_loss_function()
        eval_function = model.get_eval_function()"""

    # Create the optimizer once, before training; re-creating it every step would reset
    # Adadelta's accumulated state.
    model.optimizer = torch.optim.Adadelta(model.parameters(), rho=0.95)

    while epoch < config.max_epochs:
        with Timer("Epoch%d" % epoch) as timer:
            train_data = data.get_training_data(include_last_batch=True)
            model.bilstm.dropout = 0.1
            for batched_tensor in train_data:  # for each batch in the training corpus
                x, y, _, weights = batched_tensor
                batch_input_lengths = [sentence_x.shape[0] for sentence_x in x]
                max_length = max(batch_input_lengths)
                # padding
                # input = [numpy.pad(sentence_x, (0, max_length - sentence_x.shape[0]), 'constant') for sentence_x in x]
                word_input = [numpy.pad(sentence_x[:, 0], (0, max_length - sentence_x.shape[0]), 'constant')
                              for sentence_x in x]  # padding
                predicate_input = [numpy.pad(sentence_x[:, 1], (0, max_length - sentence_x.shape[0]), 'constant')
                                   for sentence_x in x]  # padding
                word_input, predicate_input = numpy.vstack(word_input), numpy.vstack(predicate_input)
                # numpy batch input to Variable
                word_input_seqs = torch.autograd.Variable(torch.from_numpy(word_input.astype('int64')).long())
                predicate_input_seqs = torch.autograd.Variable(torch.from_numpy(predicate_input.astype('int64')).long())

                # First: order the batch by decreasing sequence length
                input_lengths = torch.LongTensor(batch_input_lengths)
                input_lengths, perm_idx = input_lengths.sort(0, descending=True)
                word_input_seqs = word_input_seqs[perm_idx]
                predicate_input_seqs = predicate_input_seqs[perm_idx]

                # Re-sort the answers according to the permuted input
                # (use a separate loop variable so the step counter `i` is not clobbered).
                answer = [None] * len(x)
                count = 0
                list_y = list(y)
                for idx in perm_idx:
                    answer[count] = list_y[idx]
                    count += 1
                answer = numpy.concatenate(answer)
                answer = torch.autograd.Variable(torch.from_numpy(answer).type(torch.LongTensor))
                answer = answer.view(-1)
                # print answer, answer.size()

                # Then pack the sequences
                # packed_input = torch.nn.utils.rnn.pack_padded_sequence(input_seqs, input_lengths.numpy(), batch_first=True)
                # packed_input = packed_input.cuda() if args.gpu else packed_input
                if args.gpu:
                    word_input_seqs, predicate_input_seqs, input_lengths, perm_idx = \
                        word_input_seqs.cuda(), predicate_input_seqs.cuda(), input_lengths.cuda(), perm_idx.cuda()
                    answer = answer.cuda()

                model.zero_grad()
                output = model.forward(word_input_seqs, predicate_input_seqs, input_lengths,
                                       perm_idx, len(x))  # (batch input, batch size)
                loss = model.loss(output, answer)
                loss.backward()
                # gradient clipping
                # torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)
                model.optimizer.step()

                train_loss += loss
                i += 1
                global_step += 1
                if i % 400 == 0:
                    timer.tick("{} training steps, loss={:.3f}".format(i, float(train_loss / i)))

        train_loss = train_loss / i
        print("Epoch {}, steps={}, loss={:.3f}".format(epoch, i, float(train_loss)))
        i = 0
        epoch += 1
        train_loss = 0.0

        if epoch % config.checkpoint_every_x_epochs == 0:
            with Timer('Evaluation'):
                evaluate_tagger(model, batched_dev_data, evaluator, writer, global_step)

    # Done. :)
    writer.close()
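# The batching code above pads each sentence to the batch maximum, sorts the batch by decreasing
# length, and (in the commented-out lines) packs it for the LSTM. The sketch below isolates that
# pad -> sort -> pack pattern with plain PyTorch calls; the function name, tensor names, and the
# embedding/vocabulary sizes are placeholders for illustration, not this model's real configuration.
def example_pad_sort_pack(batch_of_id_lists):
    """batch_of_id_lists: list of 1-D numpy arrays of word ids, one per sentence."""
    lengths = torch.LongTensor([len(seq) for seq in batch_of_id_lists])
    max_len = int(lengths.max())
    padded = torch.zeros(len(batch_of_id_lists), max_len, dtype=torch.long)
    for row, seq in enumerate(batch_of_id_lists):
        padded[row, :len(seq)] = torch.from_numpy(seq.astype('int64'))
    # Sort by decreasing length so pack_padded_sequence accepts the batch.
    lengths, perm_idx = lengths.sort(0, descending=True)
    padded = padded[perm_idx]
    embedded = torch.nn.Embedding(10000, 100)(padded)  # placeholder vocab/embedding sizes
    packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True)
    return packed, perm_idx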