Ejemplo n.º 1
0
def main():
	"""Evaluate the best saved model on the test set.

	Loads the pickled data loaders/vocabs/hierarchy, rebuilds the model and
	restores its best weights, then evaluates it for the epoch recorded in
	the best-model jsonl log.

	Raises:
		ValueError: if the best-model jsonl log contains no entries, so no
			epoch number can be determined.
	"""
	from model import E2EETModel, MentionLevelModel
	from bert_serving.client import BertClient
	import jsonlines

	bc = BertClient()

	logger.info("Loading files...")

	data_loaders = dutils.load_obj_from_pkl_file('data loaders', cf.ASSET_FOLDER + '/data_loaders.pkl')
	word_vocab = dutils.load_obj_from_pkl_file('word vocab', cf.ASSET_FOLDER + '/word_vocab.pkl')
	wordpiece_vocab = dutils.load_obj_from_pkl_file('wordpiece vocab', cf.ASSET_FOLDER + '/wordpiece_vocab.pkl')
	hierarchy = dutils.load_obj_from_pkl_file('hierarchy', cf.ASSET_FOLDER + '/hierarchy.pkl')
	total_wordpieces = dutils.load_obj_from_pkl_file('total wordpieces', cf.ASSET_FOLDER + '/total_wordpieces.pkl')

	logger.info("Building model.")
	model = create_model(data_loaders, word_vocab, wordpiece_vocab, hierarchy, total_wordpieces)
	model.cuda()

	model.load_state_dict(torch.load(cf.BEST_MODEL_FILENAME))

	modelEvaluator = ModelEvaluator(model, data_loaders['test'], word_vocab, wordpiece_vocab, hierarchy, bc, mode="test")

	# The jsonl log holds one record per saved best model; only the last
	# record's epoch (the overall best) is needed here.  The unused
	# 'f1_score' field is no longer read.
	epoch = None
	with jsonlines.open(cf.BEST_MODEL_JSON_FILENAME, "r") as reader:
		for line in reader:
			epoch = line['epoch']

	# Fix: previously an empty log file produced a NameError on 'epoch'.
	if epoch is None:
		raise ValueError("No entries found in %s; cannot determine which epoch to evaluate." % cf.BEST_MODEL_JSON_FILENAME)

	modelEvaluator.evaluate_model(epoch)
Ejemplo n.º 2
0
def main(opts):
    """Entry point for evaluation.

    Validates the requested dataset, loads its config, pickled data loaders
    and ground-truth triples, builds the candidate-filtering model on the
    GPU, and hands everything to train().

    Args:
        opts: command-line arguments; opts[0] must name the dataset.

    Raises:
        ValueError: if no dataset was given, or it is not one of the three
            supported corpora.
    """
    if not opts:
        raise ValueError("Usage: evaluate.py <dataset>")
    dataset = opts[0]
    if dataset not in ('cateringServices', 'automotiveEngineering', 'bbn'):
        raise ValueError(
            "Dataset must be either cateringServices, automotiveEngineering or bbn."
        )

    cf.load_config(dataset)

    logger.info("Loading data loaders...")

    train_loader = dutils.load_obj_from_pkl_file(
        'data loader (train)', cf.ASSET_FOLDER + '/data_loader_train.pkl')
    dev_loader = dutils.load_obj_from_pkl_file(
        'data loader (dev)', cf.ASSET_FOLDER + '/data_loader_dev.pkl')

    dev_dataset = pd.read_csv(cf.DEV_FILENAME)
    gt_triples_df = pd.read_csv(cf.GROUND_TRUTH_TRIPLES_FILE)
    gt_triples = parse_ground_truth_triples(gt_triples_df)

    logger.info("Building model.")
    filtering_model = CandidateFilteringModel(
        embedding_dim=cf.EMBEDDING_DIM,
        hidden_dim=cf.HIDDEN_DIM,
    )
    filtering_model.cuda()

    train(filtering_model, train_loader, dev_loader, dev_dataset, gt_triples)
Ejemplo n.º 3
0
    def __init__(self, dataset, cf):
        """Prepare everything needed for end-user evaluation.

        Loads the pickled assets, sets up the spaCy/neuralcoref pipeline,
        rebuilds the E2EET model with its best trained weights, and wraps
        it in a ModelEvaluator stored on self.

        Args:
            dataset: dataset name (not read here; kept for interface
                compatibility with callers).
            cf: configuration object for the chosen dataset.
        """
        from model import E2EETModel
        from bert_serving.client import BertClient
        import jsonlines

        logger.info("Loading files...")

        loaders = dutils.load_obj_from_pkl_file(
            'data loaders', cf.ASSET_FOLDER + '/data_loaders.pkl')
        # The word/wordpiece vocabs live on self so they can be expanded
        # during evaluation if a previously-unseen word appears.
        self.word_vocab = dutils.load_obj_from_pkl_file(
            'word vocab', cf.ASSET_FOLDER + '/word_vocab.pkl')
        self.wordpiece_vocab = dutils.load_obj_from_pkl_file(
            'wordpiece vocab', cf.ASSET_FOLDER + '/wordpiece_vocab.pkl')
        type_hierarchy = dutils.load_obj_from_pkl_file(
            'hierarchy', cf.ASSET_FOLDER + '/hierarchy.pkl')
        n_wordpieces = dutils.load_obj_from_pkl_file(
            'total wordpieces', cf.ASSET_FOLDER + '/total_wordpieces.pkl')

        # Coreference-resolution pipeline used by the end-user evaluation.
        self.nlp = spacy.load('en')
        self.coref = neuralcoref.NeuralCoref(self.nlp.vocab)
        self.nlp.add_pipe(self.coref, name='neuralcoref')

        logger.info("Building model.")
        e2e_model = E2EETModel(
            embedding_dim=cf.EMBEDDING_DIM + cf.POSITIONAL_EMB_DIM,
            hidden_dim=cf.HIDDEN_DIM,
            vocab_size=len(self.wordpiece_vocab),
            label_size=len(type_hierarchy),
            total_wordpieces=n_wordpieces,
            category_counts=type_hierarchy.get_train_category_counts(),
            hierarchy_matrix=type_hierarchy.hierarchy_matrix,
            max_seq_len=cf.MAX_SENT_LEN,
            batch_size=1)  # one sentence at a time during evaluation
        e2e_model.cuda()

        # Restore the best weights found during training.
        e2e_model.load_state_dict(torch.load(cf.BEST_MODEL_FILENAME))

        self.modelEvaluator = ModelEvaluator(
            e2e_model, None, self.word_vocab, self.wordpiece_vocab,
            type_hierarchy, None, cf)
        self.cf = cf
Ejemplo n.º 4
0
def main():
	"""Train an entity-typing model from the pickled assets.

	Loads the data loaders, vocabularies, type hierarchy and wordpiece
	count, builds the model via create_model(), moves it to the GPU and
	starts training.
	"""
	logger.info("Loading files...")

	assets = cf.ASSET_FOLDER
	data_loaders = dutils.load_obj_from_pkl_file('data loaders', assets + '/data_loaders.pkl')
	word_vocab = dutils.load_obj_from_pkl_file('word vocab', assets + '/word_vocab.pkl')
	wordpiece_vocab = dutils.load_obj_from_pkl_file('wordpiece vocab', assets + '/wordpiece_vocab.pkl')
	hierarchy = dutils.load_obj_from_pkl_file('hierarchy', assets + '/hierarchy.pkl')
	total_wordpieces = dutils.load_obj_from_pkl_file('total wordpieces', assets + '/total_wordpieces.pkl')

	# Report how many training batches were loaded (quick sanity check).
	print(len(data_loaders['train']))

	logger.info("Building model.")
	net = create_model(data_loaders, word_vocab, wordpiece_vocab, hierarchy, total_wordpieces)
	net.cuda()

	train(net, data_loaders, word_vocab, wordpiece_vocab, hierarchy)
Ejemplo n.º 5
0
def main(opts):
    """Entry point for training.

    Validates the requested dataset, loads its config, pickled assets and
    ground-truth triples, builds the end-to-end entity-typing model on the
    GPU, and starts training.

    Args:
        opts: command-line arguments; opts[0] must name the dataset.

    Raises:
        ValueError: if no dataset was given, or it is not one of the three
            supported corpora.
    """
    if not opts:
        raise ValueError("Usage: train.py <dataset>")
    dataset = opts[0]
    if dataset not in ('cateringServices', 'automotiveEngineering', 'bbn'):
        raise ValueError(
            "Dataset must be either cateringServices, automotiveEngineering, or bbn."
        )

    cf.load_config(dataset)

    logger.info("Loading files...")

    def _load(desc, filename):
        # Every pickled asset lives directly under cf.ASSET_FOLDER.
        return dutils.load_obj_from_pkl_file(desc, cf.ASSET_FOLDER + filename)

    data_loaders = _load('data loaders', '/data_loaders.pkl')
    word_vocab = _load('word vocab', '/word_vocab.pkl')
    wordpiece_vocab = _load('wordpiece vocab', '/wordpiece_vocab.pkl')
    hierarchy = _load('hierarchy', '/hierarchy.pkl')
    total_wordpieces = _load('total wordpieces', '/total_wordpieces.pkl')

    ground_truth_triples = parse_ground_truth_triples(
        pd.read_csv(cf.GROUND_TRUTH_TRIPLES_FILE))

    logger.info("Building model.")
    model = E2EETModel(
        embedding_dim=cf.EMBEDDING_DIM + cf.POSITIONAL_EMB_DIM,
        hidden_dim=cf.HIDDEN_DIM,
        vocab_size=len(wordpiece_vocab),
        label_size=len(hierarchy),
        total_wordpieces=total_wordpieces,
        category_counts=hierarchy.get_train_category_counts(),
        hierarchy_matrix=hierarchy.hierarchy_matrix,
        max_seq_len=cf.MAX_SENT_LEN,
        batch_size=cf.BATCH_SIZE)
    model.cuda()

    train(model, data_loaders, word_vocab, wordpiece_vocab, hierarchy,
          ground_truth_triples)