def main():
    from model import E2EETModel, MentionLevelModel
    from bert_serving.client import BertClient
    import jsonlines

    bc = BertClient()

    logger.info("Loading files...")
    data_loaders = dutils.load_obj_from_pkl_file(
        'data loaders', cf.ASSET_FOLDER + '/data_loaders.pkl')
    word_vocab = dutils.load_obj_from_pkl_file(
        'word vocab', cf.ASSET_FOLDER + '/word_vocab.pkl')
    wordpiece_vocab = dutils.load_obj_from_pkl_file(
        'wordpiece vocab', cf.ASSET_FOLDER + '/wordpiece_vocab.pkl')
    hierarchy = dutils.load_obj_from_pkl_file(
        'hierarchy', cf.ASSET_FOLDER + '/hierarchy.pkl')
    total_wordpieces = dutils.load_obj_from_pkl_file(
        'total wordpieces', cf.ASSET_FOLDER + '/total_wordpieces.pkl')

    logger.info("Building model.")
    model = create_model(data_loaders, word_vocab, wordpiece_vocab, hierarchy, total_wordpieces)
    model.cuda()
    model.load_state_dict(torch.load(cf.BEST_MODEL_FILENAME))

    modelEvaluator = ModelEvaluator(
        model, data_loaders['test'], word_vocab, wordpiece_vocab, hierarchy, bc, mode="test")

    # The training log is in JSON Lines format; iterating to the end leaves
    # f1_score and epoch set to the values recorded for the best model.
    with jsonlines.open(cf.BEST_MODEL_JSON_FILENAME, "r") as reader:
        for line in reader:
            f1_score, epoch = line['f1_score'], line['epoch']

    modelEvaluator.evaluate_model(epoch)
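# create_model is referenced above but not defined in this section. A minimal
# sketch, assuming it simply wraps the E2EETModel constructor with the same
# keyword arguments used by the training code further down; the real helper
# may differ (e.g. it may build a MentionLevelModel instead, depending on config).
def create_model(data_loaders, word_vocab, wordpiece_vocab, hierarchy, total_wordpieces):
    from model import E2EETModel
    return E2EETModel(
        embedding_dim=cf.EMBEDDING_DIM + cf.POSITIONAL_EMB_DIM,
        hidden_dim=cf.HIDDEN_DIM,
        vocab_size=len(wordpiece_vocab),
        label_size=len(hierarchy),
        total_wordpieces=total_wordpieces,
        category_counts=hierarchy.get_train_category_counts(),
        hierarchy_matrix=hierarchy.hierarchy_matrix,
        max_seq_len=cf.MAX_SENT_LEN,
        batch_size=cf.BATCH_SIZE)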
def main(opts):
    from model import CandidateFilteringModel

    if len(opts) == 0:
        raise ValueError("Usage: evaluate.py <dataset>")
    dataset = opts[0]
    if dataset not in ['cateringServices', 'automotiveEngineering', 'bbn']:
        raise ValueError(
            "Dataset must be either cateringServices, automotiveEngineering or bbn.")
    cf.load_config(dataset)

    logger.info("Loading data loaders...")
    data_loader_train = dutils.load_obj_from_pkl_file(
        'data loader (train)', cf.ASSET_FOLDER + '/data_loader_train.pkl')
    data_loader_dev = dutils.load_obj_from_pkl_file(
        'data loader (dev)', cf.ASSET_FOLDER + '/data_loader_dev.pkl')
    dataset_dev = pd.read_csv(cf.DEV_FILENAME)

    # Parse the ground-truth triples so that predictions can be scored against them.
    ground_truth_triples_df = pd.read_csv(cf.GROUND_TRUTH_TRIPLES_FILE)
    ground_truth_triples = parse_ground_truth_triples(ground_truth_triples_df)

    logger.info("Building model.")
    model = CandidateFilteringModel(
        embedding_dim=cf.EMBEDDING_DIM,
        hidden_dim=cf.HIDDEN_DIM)
    model.cuda()

    train(model, data_loader_train, data_loader_dev, dataset_dev, ground_truth_triples)
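# parse_ground_truth_triples is not defined in this section. A hedged sketch,
# assuming the CSV stores one (head, relation, tail) triple per row; the column
# names below are hypothetical and the real file may use different ones.
def parse_ground_truth_triples(df):
    triples = []
    for _, row in df.iterrows():
        # Collect each row as a plain (head, relation, tail) tuple.
        triples.append((row['head'], row['relation'], row['tail']))
    return triples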
def __init__(self, dataset, cf):
    from model import E2EETModel

    logger.info("Loading files...")
    data_loaders = dutils.load_obj_from_pkl_file(
        'data loaders', cf.ASSET_FOLDER + '/data_loaders.pkl')

    # Note: the word and wordpiece vocab are stored as attributes so that they
    # may be expanded if necessary during evaluation (if a new word appears).
    self.word_vocab = dutils.load_obj_from_pkl_file(
        'word vocab', cf.ASSET_FOLDER + '/word_vocab.pkl')
    self.wordpiece_vocab = dutils.load_obj_from_pkl_file(
        'wordpiece vocab', cf.ASSET_FOLDER + '/wordpiece_vocab.pkl')
    hierarchy = dutils.load_obj_from_pkl_file(
        'hierarchy', cf.ASSET_FOLDER + '/hierarchy.pkl')
    total_wordpieces = dutils.load_obj_from_pkl_file(
        'total wordpieces', cf.ASSET_FOLDER + '/total_wordpieces.pkl')

    # Initialise the coreference resolution pipeline for use in end-user evaluation.
    self.nlp = spacy.load('en')
    self.coref = neuralcoref.NeuralCoref(self.nlp.vocab)
    self.nlp.add_pipe(self.coref, name='neuralcoref')

    logger.info("Building model.")
    model = E2EETModel(
        embedding_dim=cf.EMBEDDING_DIM + cf.POSITIONAL_EMB_DIM,
        hidden_dim=cf.HIDDEN_DIM,
        vocab_size=len(self.wordpiece_vocab),
        label_size=len(hierarchy),
        total_wordpieces=total_wordpieces,
        category_counts=hierarchy.get_train_category_counts(),
        hierarchy_matrix=hierarchy.hierarchy_matrix,
        max_seq_len=cf.MAX_SENT_LEN,
        batch_size=1)
    model.cuda()
    model.load_state_dict(torch.load(cf.BEST_MODEL_FILENAME))

    # The data loader and BERT client arguments are not required for
    # end-user evaluation, hence None.
    self.modelEvaluator = ModelEvaluator(
        model, None, self.word_vocab, self.wordpiece_vocab, hierarchy, None, cf)
    self.cf = cf
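# A brief usage sketch (names hypothetical): neuralcoref registers the
# ._.has_coref and ._.coref_resolved extensions on each Doc, so coreferences
# can be resolved before mentions are typed by the model, e.g.:
#
#     evaluator = EndUserEvaluator('bbn', cf)   # enclosing class name assumed
#     doc = evaluator.nlp("Alice founded Acme. She is its CEO.")
#     if doc._.has_coref:
#         resolved_text = doc._.coref_resolved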
def main():
    logger.info("Loading files...")
    data_loaders = dutils.load_obj_from_pkl_file(
        'data loaders', cf.ASSET_FOLDER + '/data_loaders.pkl')
    word_vocab = dutils.load_obj_from_pkl_file(
        'word vocab', cf.ASSET_FOLDER + '/word_vocab.pkl')
    wordpiece_vocab = dutils.load_obj_from_pkl_file(
        'wordpiece vocab', cf.ASSET_FOLDER + '/wordpiece_vocab.pkl')
    hierarchy = dutils.load_obj_from_pkl_file(
        'hierarchy', cf.ASSET_FOLDER + '/hierarchy.pkl')
    total_wordpieces = dutils.load_obj_from_pkl_file(
        'total wordpieces', cf.ASSET_FOLDER + '/total_wordpieces.pkl')

    logger.info("%d training batches." % len(data_loaders['train']))

    logger.info("Building model.")
    model = create_model(data_loaders, word_vocab, wordpiece_vocab, hierarchy, total_wordpieces)
    model.cuda()

    train(model, data_loaders, word_vocab, wordpiece_vocab, hierarchy)
def main(opts):
    from model import E2EETModel

    if len(opts) == 0:
        raise ValueError("Usage: train.py <dataset>")
    dataset = opts[0]
    if dataset not in ['cateringServices', 'automotiveEngineering', 'bbn']:
        raise ValueError(
            "Dataset must be either cateringServices, automotiveEngineering, or bbn.")
    cf.load_config(dataset)

    logger.info("Loading files...")
    data_loaders = dutils.load_obj_from_pkl_file(
        'data loaders', cf.ASSET_FOLDER + '/data_loaders.pkl')
    word_vocab = dutils.load_obj_from_pkl_file(
        'word vocab', cf.ASSET_FOLDER + '/word_vocab.pkl')
    wordpiece_vocab = dutils.load_obj_from_pkl_file(
        'wordpiece vocab', cf.ASSET_FOLDER + '/wordpiece_vocab.pkl')
    hierarchy = dutils.load_obj_from_pkl_file(
        'hierarchy', cf.ASSET_FOLDER + '/hierarchy.pkl')
    total_wordpieces = dutils.load_obj_from_pkl_file(
        'total wordpieces', cf.ASSET_FOLDER + '/total_wordpieces.pkl')

    ground_truth_triples_df = pd.read_csv(cf.GROUND_TRUTH_TRIPLES_FILE)
    ground_truth_triples = parse_ground_truth_triples(ground_truth_triples_df)

    logger.info("Building model.")
    model = E2EETModel(
        # Token embeddings are concatenated with positional embeddings,
        # hence the summed input dimension.
        embedding_dim=cf.EMBEDDING_DIM + cf.POSITIONAL_EMB_DIM,
        hidden_dim=cf.HIDDEN_DIM,
        vocab_size=len(wordpiece_vocab),
        label_size=len(hierarchy),
        total_wordpieces=total_wordpieces,
        category_counts=hierarchy.get_train_category_counts(),
        hierarchy_matrix=hierarchy.hierarchy_matrix,
        max_seq_len=cf.MAX_SENT_LEN,
        batch_size=cf.BATCH_SIZE)
    model.cuda()

    train(model, data_loaders, word_vocab, wordpiece_vocab, hierarchy, ground_truth_triples)
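# Likely entry point, assuming these scripts are invoked directly from the
# command line (the opts argument matches sys.argv[1:]); the guard below is a
# sketch rather than part of the original file.
if __name__ == "__main__":
    import sys
    main(sys.argv[1:])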