def do_training(args):
    """Train the NER BiLSTM model and save dev-set predictions.

    The function seeds torch, builds the run configuration, attaches a
    DEBUG-level file handler to the root logger, loads data and embeddings,
    trains ``NerBiLstmModel`` through ``Trainer`` and finally reloads the
    weights stored at ``config.model_output`` to write predictions for the
    dev examples to ``model.config.conll_output`` and
    ``model.config.eval_output``.

    Args:
        args: Run options forwarded to ``Config``, ``load_data`` and
            ``load_embeddings`` (presumably parsed command-line arguments;
            confirm against the caller).
    """
    torch.manual_seed(133)

    # Set up configuration and output.
    config = Config(args)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(config.output_path, exist_ok=True)

    # Set up logging.
    handler = logging.FileHandler(config.log_output)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(
        logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)

    # Load data.
    helper, data = load_data(args)
    train_examples = data['train_examples']
    dev_examples = data['dev_examples']
    helper.save(config.output_path)

    # Load embeddings.
    embeddings = load_embeddings(args, helper, config.device)

    # Initialize model.
    logger.info("Initializing model...")
    model = NerBiLstmModel(helper, config, embeddings)
    model.to(config.device)

    # Preprocess data.
    data_preprocessor = DataPreprocessor(model, config, helper)
    train_examples = data_preprocessor.preprocess_sequence_data(train_examples)
    dev_examples = data_preprocessor.preprocess_sequence_data(dev_examples)

    # Start training.
    trainer = Trainer(model, config, helper, logger)
    logger.info("Starting training...")
    trainer.train(train_examples, dev_examples)

    # Save predictions of the best model.
    logger.info("Training completed, saving predictions of the best model...")
    with torch.no_grad():
        # Reload the weights stored at config.model_output before predicting.
        model.load_state_dict(torch.load(config.model_output))
        model.eval()
        predictor = Predictor(model, config)
        output = predictor.predict(dev_examples, use_str_labels=True)

    sentences, labels, predictions = zip(*output)
    predictions = [[LBLS[l] for l in preds] for preds in predictions]
    # A list (not a bare zip) because the triples are read twice below.
    output = list(zip(sentences, labels, predictions))

    with open(model.config.conll_output, 'w') as f:
        write_conll(f, output)
    with open(model.config.eval_output, 'w') as f:
        for sentence, gold_labels, predicted_labels in output:
            print_sentence(f, sentence, gold_labels, predicted_labels)
def infer(self, conll):
    """
    Tags a batch of CoNLL documents by running the external CRF tool.

    Every document in ``conll`` is written to ``self.test_path`` (each one
    followed by a newline), ``CRF_TEST`` is invoked with ``self.model_path``
    on that file, and its standard output is parsed with ``read_conll_doc``.

    @param: conll is a sized collection of CoNLL documents (the original
            note describes it as a list of arrays / set of strings built
            from the JAVANLP sentence object -- confirm with the caller).
    @return: one list per parsed document containing the last field of
             each of its tokens.
    """
    # Dump the input where the CRF binary can read it.
    with open(self.test_path, "w") as test_file:
        for document in conll:
            write_conll(test_file, document)
            test_file.write("\n")

    # Run the tagger and capture its stdout as text.
    command = [CRF_TEST, "-m", self.model_path, self.test_path]
    raw_output = check_output(command, universal_newlines=True)

    parsed_docs = read_conll_doc(raw_output)
    # The tool must hand back exactly one document per input document.
    assert len(parsed_docs) == len(conll)

    tags = []
    for parsed_doc in parsed_docs:
        tags.append([token[-1] for token in parsed_doc])
    return tags
def do_train(args):
    """Train an ``RNNModel`` in a fresh TensorFlow graph and save predictions.

    Loads and preprocesses the data, loads embeddings, attaches a DEBUG-level
    file handler to the root logger, fits the model inside a ``tf.Session``
    and then writes the model's output for ``dev_raw`` to
    ``model.config.conll_output`` and ``model.config.eval_output``.

    Args:
        args: Run options forwarded to ``Config``, ``load_and_preprocess_data``
            and ``load_embeddings`` (presumably parsed command-line
            arguments; confirm against the caller).
    """
    # Set up some parameters.
    config = Config(args)
    helper, train, dev, train_raw, dev_raw = load_and_preprocess_data(args)
    embeddings = load_embeddings(args, helper)
    # The embedding width is taken from the second dimension of the matrix.
    config.embed_size = embeddings.shape[1]
    helper.save(config.output_path)

    handler = logging.FileHandler(config.log_output)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(
        logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)

    # Reporting is disabled; assign a Report here to take the first branch below.
    report = None  # Report(Config.eval_output)

    with tf.Graph().as_default():
        logger.info("Building model...")
        start = time.time()
        model = RNNModel(helper, config, embeddings)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            model.fit(session, saver, train, dev)
            if report:
                report.log_output(model.output(session, dev_raw))
                report.save()
            else:
                # Save predictions in a text file.
                output = model.output(session, dev_raw)
                sentences, labels, predictions = zip(*output)
                predictions = [[LBLS[l] for l in preds] for preds in predictions]
                # Materialize the triples: on Python 3 zip() is a one-shot
                # iterator, and the result is read twice below (by
                # write_conll and by the print_sentence loop).
                output = list(zip(sentences, labels, predictions))

                with open(model.config.conll_output, 'w') as f:
                    write_conll(f, output)
                with open(model.config.eval_output, 'w') as f:
                    for sentence, gold_labels, predicted_labels in output:
                        print_sentence(f, sentence, gold_labels, predicted_labels)
def do_train(args):
    """Train a ``WindowModel`` in a fresh TensorFlow graph and save predictions.

    Loads and preprocesses the data, loads embeddings, attaches a DEBUG-level
    file handler to the root logger, fits the model inside a ``tf.Session``
    and then writes the model's output for ``dev_raw`` to
    ``model.config.conll_output`` and ``model.config.eval_output``.

    Args:
        args: Run options forwarded to ``load_and_preprocess_data`` and
            ``load_embeddings`` (presumably parsed command-line arguments;
            confirm against the caller). ``Config`` is built without them.
    """
    # Set up some parameters.
    config = Config()
    helper, train, dev, train_raw, dev_raw = load_and_preprocess_data(args)
    embeddings = load_embeddings(args, helper)
    # The embedding width is taken from the second dimension of the matrix.
    config.embed_size = embeddings.shape[1]
    helper.save(config.output_path)

    handler = logging.FileHandler(config.log_output)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(
        logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)

    # Reporting is disabled; assign a Report here to take the first branch below.
    report = None  # Report(Config.eval_output)

    with tf.Graph().as_default():
        logger.info("Building model...")
        start = time.time()
        model = WindowModel(helper, config, embeddings)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            # Debugging hint kept from the original: the session can be
            # wrapped in tf_debug.LocalCLIDebugWrapperSession with the
            # "has_inf_or_nan" tensor filter (tf_debug.has_inf_or_nan).
            session.run(init)
            model.fit(session, saver, train, dev)
            if report:
                report.log_output(model.output(session, dev_raw))
                report.save()
            else:
                # Save predictions in a text file.
                output = model.output(session, dev_raw)
                sentences, labels, predictions = zip(*output)
                predictions = [[LBLS[l] for l in preds] for preds in predictions]
                # zip() yields its items only once on Python 3; keep a list
                # so both write_conll and the print_sentence loop below see
                # every triple.
                output = list(zip(sentences, labels, predictions))

                with open(model.config.conll_output, 'w') as f:
                    write_conll(f, output)
                with open(model.config.eval_output, 'w') as f:
                    for sentence, gold_labels, predicted_labels in output:
                        print_sentence(f, sentence, gold_labels, predicted_labels)
def update(self, conll, tags):
    """
    Updates labels for the current example.

    Each row of ``conll`` is cut at ``self.TAG_LABEL`` and gets the matching
    entry of ``tags`` appended. The result either replaces the entry at
    ``self.cur_index - 1`` (followed by a full rewrite of the training file)
    or is appended to both ``self.labelled_data`` and the open file.
    """
    # Build the labelled rows.
    relabelled = []
    for features, tag in zip(conll, tags):
        relabelled.append(features[:self.TAG_LABEL] + [tag])

    # New example: extend the in-memory set and the already-open file.
    if self.cur_index > len(self.labelled_data):
        self.labelled_data.append(relabelled)
        write_conll(self.labelled_data_file, relabelled)
        return

    # An earlier example was revisited: overwrite it and rewrite the whole
    # labelled set from scratch.
    self.labelled_data[self.cur_index - 1] = relabelled
    self.labelled_data_file.close()
    self.labelled_data_file = open(self.train_path, 'w')
    for example in self.labelled_data:
        write_conll(self.labelled_data_file, example)
    self.labelled_data_file.close()
    # Reopen in append mode so later updates add to the end of the file.
    self.labelled_data_file = open(self.train_path, 'a')
def do_train(args):
    """Fit a ``WindowModel`` with TensorFlow and dump dev-set predictions.

    Steps: load and preprocess the data, load embeddings, add a DEBUG-level
    file handler to the root logger, build and fit the model in a new graph
    and session, then write the model's output for ``dev_raw`` to
    ``model.config.conll_output`` and ``model.config.eval_output``.

    Args:
        args: Run options forwarded to ``load_and_preprocess_data`` and
            ``load_embeddings`` (presumably parsed command-line arguments;
            confirm against the caller). ``Config`` is built without them.
    """
    # Set up some parameters.
    config = Config()
    helper, train, dev, train_raw, dev_raw = load_and_preprocess_data(args)
    embeddings = load_embeddings(args, helper)
    # The embedding width is taken from the second dimension of the matrix.
    config.embed_size = embeddings.shape[1]
    helper.save(config.output_path)

    handler = logging.FileHandler(config.log_output)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)

    # Reporting is disabled; assign a Report here to take the first branch below.
    report = None  # Report(Config.eval_output)

    with tf.Graph().as_default():
        logger.info("Building model...")
        start = time.time()
        model = WindowModel(helper, config, embeddings)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            model.fit(session, saver, train, dev)
            if report:
                report.log_output(model.output(session, dev_raw))
                report.save()
            else:
                # Save predictions in a text file.
                output = model.output(session, dev_raw)
                sentences, labels, predictions = zip(*output)
                predictions = [[LBLS[l] for l in preds] for preds in predictions]
                # Store the triples in a list: a Python 3 zip object can be
                # traversed only once, but the data is written to two files.
                output = list(zip(sentences, labels, predictions))

                with open(model.config.conll_output, 'w') as f:
                    write_conll(f, output)
                with open(model.config.eval_output, 'w') as f:
                    for sentence, gold_labels, predicted_labels in output:
                        print_sentence(f, sentence, gold_labels, predicted_labels)
def do_train(args):
    """Fit a ``WindowModel`` and dump dev-set predictions.

    Loads and preprocesses the data, loads embeddings, adds a DEBUG-level
    file handler to the root logger, builds the model, applies
    ``init_weights`` to it, fits it on ``train``/``dev`` and writes the
    model's output for ``dev_raw`` to ``model.config.conll_output`` and
    ``model.config.eval_output``.

    Args:
        args: Run options forwarded to ``load_and_preprocess_data`` and
            ``load_embeddings`` (presumably parsed command-line arguments;
            confirm against the caller). ``Config`` is built without them.
    """
    # Set up some parameters.
    config = Config()
    helper, train, dev, train_raw, dev_raw = load_and_preprocess_data(args)
    embeddings = load_embeddings(args, helper)
    # The embedding width is taken from the second dimension of the matrix.
    config.embed_size = embeddings.shape[1]
    helper.save(config.output_path)

    handler = logging.FileHandler(config.log_output)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(
        logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)

    # Reporting is disabled; assign a Report here to take the first branch below.
    report = None  # Report(Config.eval_output)

    logger.info("Building model...")
    start = time.time()
    model = WindowModel(helper, config, embeddings)
    logger.info("took %.2f seconds", time.time() - start)

    model.apply(init_weights)
    model.fit(train, dev)

    if report:
        report.log_output(model.output(dev_raw))
        report.save()
    else:
        # Save predictions in a text file.
        output = model.output(dev_raw)
        sentences, labels, predictions = zip(*output)
        predictions = [[LBLS[l] for l in preds] for preds in predictions]
        # Keep the triples in a list: on Python 3 a zip object is exhausted
        # after one pass, and the data is consumed twice below.
        output = list(zip(sentences, labels, predictions))

        with open(model.config.conll_output, 'w') as f:
            write_conll(f, output)
        with open(model.config.eval_output, 'w') as f:
            for sentence, gold_labels, predicted_labels in output:
                print_sentence(f, sentence, gold_labels, predicted_labels)