data.y = [item for sublist in data.y for item in sublist]
assert len(data.x[0]) == len(data.y)

# save the data for cnn since it takes forever to generate
# also save the concept dict order for faster prediction
concept_order = uniq(concept.ids)
data = [tr_data, val_data, concept_order]
with open('gitig_new_data.pickle', 'wb') as f:
    pickle.dump(data, f, protocol=4)
logger.info('Mentions and concepts saved.')

# cnn
if not int(config['model']['use_saved_model']):  # train new model
    import cnn, model_tools
    cnn.print_input(tr_data)
    model = cnn.build_model(config, tr_data, vocabulary, pretrained)

    # select hardest training samples from preliminary training
    if config.getint('training', 'sample_hard'):
        import sp_training
        from datetime import datetime
        # from callback import EarlyStoppingRankingAccuracy
        # evaluation_function_1 = EarlyStoppingRankingAccuracy(config, val_data)
        from callback import EarlyStoppingRankingAccuracySpedUp
        evaluation_function = EarlyStoppingRankingAccuracySpedUp(
            config, val_data, concept.padded, corpus_dev.padded, pretrained)
        try:
            new_tr_data = pickle.load(
                open('gitig_new_tr_data_ratio.pickle', 'rb'))
            logger.info('Using saved subsampled data')
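# A minimal sketch (not part of the original pipeline) of how the pickle
# written above could be reloaded at prediction time; it assumes the same
# three-element [tr_data, val_data, concept_order] layout dumped to
# gitig_new_data.pickle.
import pickle

with open('gitig_new_data.pickle', 'rb') as f:
    tr_data, val_data, concept_order = pickle.load(f)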
import pickle

data = [corpus_train.elmo, corpus_dev.elmo, can_list.elmo]
with open(config['embedding']['cache_elmo_emb'], 'wb') as f:
    pickle.dump(data, f, protocol=4)
logger.info('Elmo embedding for mentions and candidates saved.')

# append the ELMo-based features to each dataset's inputs
# (loop variable renamed from `data` to avoid shadowing the list above)
for corpus, dataset in zip([corpus_train, corpus_dev],
                           [training_data, val_data]):
    dataset.x.extend(
        np.array(vectorizer_elmo.elmo_format_x(corpus.elmo, can_list.elmo)))

if not int(config['model']['use_saved_model']):  # train new model
    import cnn, model_tools
    from callback import EarlyStoppingRankingAccuracy
    evaluation_function = EarlyStoppingRankingAccuracy(config, val_data)

    cnn.print_input(training_data)
    model = cnn.build_model(config, training_data, vocabulary, pretrained)

    if int(config['settings']['imp_tr']):  # importance sampling
        from importance_sampling.training import ImportanceTraining
        logger.warning('Using truncated data!')
        fake_data_x = [a[:1000000] for a in training_data.x]
        hist = ImportanceTraining(model).fit(
            fake_data_x, training_data.y[:1000000],
            epochs=int(config['training']['epoch']),
            batch_size=100,
            callbacks=[evaluation_function])
    else:
        logger.warning('Using truncated data!')
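# A minimal sketch (not part of the original script) of reloading the cached
# ELMo embeddings written above; it assumes the same three-element order and
# the same 'cache_elmo_emb' config key used by the dump. The variable names
# are illustrative.
import pickle

with open(config['embedding']['cache_elmo_emb'], 'rb') as f:
    corpus_train_elmo, corpus_dev_elmo, can_list_elmo = pickle.load(f)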