def __init__(self,
             init_checkpoint,
             vocab_file,
             stop_words_file,
             config_file,
             embedding_table_file,
             index_vecs_file,
             index_data_file,
             listen_port=5022,
             logger=None):
    super(WordVecTransformer, self).__init__()
    # A logging.StreamHandler has no .info() method (the old default), so
    # fall back to a real logger when none is supplied.
    self.logger = logger if logger is not None else logging.getLogger(__name__)
    self.port = listen_port
    self.vec_size = 2400
    if not os.path.exists("./tmp"):
        os.mkdir("./tmp")
    model_dir = "./tmp"
    self.index_vecs = np.asarray(self.load_index_bin(index_vecs_file),
                                 dtype=np.float32)
    self.index_data = self.load_index_data(index_data_file)
    self.tokenizer = tokenization.Tokenizer(vocab_file=vocab_file,
                                            stop_words_file=stop_words_file,
                                            use_pos=False)
    self.model_config = ModelConfig.from_json_file(config_file)
    self.estimator = create_estimator(self.model_config, init_checkpoint,
                                      model_dir, embedding_table_file)
    self.build_index(self.index_vecs)
    self.logger.info("Finished WordVecTransformer init.")
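# Hedged usage sketch for the constructor above; every path below is a
# placeholder for illustration, not a file from the original repository.
#
#   transformer = WordVecTransformer(
#       init_checkpoint="ckpt/model.ckpt",
#       vocab_file="data/vocab.txt",
#       stop_words_file="data/stop_words.txt",
#       config_file="config.json",
#       embedding_table_file="data/embedding_table.npy",
#       index_vecs_file="data/index_vecs.bin",
#       index_data_file="data/index_data.txt",
#       listen_port=5022)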
def main():
    mean = FLAGS.mean
    std = FLAGS.std
    config_path = os.path.join(FLAGS.model_dir, "config.json")
    if not os.path.isfile(config_path):
        # create the model config if the model dir does not have one yet
        config = ModelConfig(
            model_name=FLAGS.model_name,
            arch=FLAGS.arch,
            model_n_out=FLAGS.model_n_out,
            sz=FLAGS.sz,
            N=FLAGS.N,
            mean=np.array(mean),
            std=np.array(std),
            meta={"model_file_prefix": FLAGS.model_file_prefix})
        config.toDir(FLAGS.model_dir)
    evaluate_model_dir(FLAGS.model_dir,
                       sampler=None,
                       TRAIN=FLAGS.train_image_dir,
                       LABELS=FLAGS.train_csv)
def load_weights(model_dir, fold):
    """Loads the state dict of a single fold's model from model_dir."""
    # load config
    config = ModelConfig.fromDir(model_dir)
    n_folds = config.getMetaField('n_folds')
    if n_folds is None:
        n_folds = 4
    model_file_prefix = config.getMetaField('model_file_prefix')
    if model_file_prefix is None:
        model_file_prefix = ""
    # load the model weights for the requested fold
    model_path = os.path.join(model_dir, f'{model_file_prefix}{fold}.pth')
    assert os.path.isfile(model_path), f'Model not found {model_path}'
    state_dict = torch.load(model_path)
    return state_dict
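# Hedged usage sketch: restore a single fold's weights into a fresh model.
# The model construction mirrors load_models_from_dir below; the helper name
# load_single_fold_model is illustrative, not part of the original code.
def load_single_fold_model(model_dir, fold):
    config = ModelConfig.fromDir(model_dir)
    model_func = get_panda_model(config.getField('model_name'),
                                 config.getField('arch'),
                                 n=config.getField('model_n_out'),
                                 num_tiles=config.getField('N'),
                                 pretrained=False,
                                 is_train=False)
    model = model_func()
    model.load_state_dict(load_weights(model_dir, fold))
    model.eval()
    return model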
def load_models_from_dir(model_dir, tile_list_input=True):
    """Loads all per-fold models found in model_dir, skipping missing folds."""
    # load config
    config = ModelConfig.fromDir(model_dir)
    n_folds = config.getMetaField('n_folds')
    if n_folds is None:
        n_folds = 4
    model_file_prefix = config.getMetaField('model_file_prefix')
    if model_file_prefix is None:
        model_file_prefix = ""
    model_name = config.getField('model_name')
    arch = config.getField('arch')
    model_n_out = config.getField('model_n_out')
    N = config.getField('N')
    # load models
    model_paths = [
        os.path.join(model_dir, f'{model_file_prefix}{i}.pth')
        for i in range(n_folds)
    ]
    model_func = get_panda_model(model_name,
                                 arch,
                                 n=model_n_out,
                                 num_tiles=N,
                                 pretrained=False,
                                 is_train=False,
                                 tile_list_input=tile_list_input)
    models = []
    for model_path in model_paths:
        # folds whose checkpoint file is missing are silently skipped
        if os.path.isfile(model_path):
            # pass map_location=torch.device('cpu') for CPU-only loading
            state_dict = torch.load(model_path)
            model = model_func()
            model.load_state_dict(state_dict)
            model.float()
            model.eval()
            model.cuda()
            models.append(model)
    return models
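# A minimal ensembling sketch built on load_models_from_dir. Averaging the
# raw per-fold outputs is an assumption for illustration; the original
# pipeline may combine folds differently. `batch` stands in for an
# already-prepared input tensor.
def ensemble_predict(model_dir, batch):
    models = load_models_from_dir(model_dir)
    batch = batch.cuda()
    with torch.no_grad():
        # run every fold model on the same batch and average the outputs
        outs = [model(batch) for model in models]
    return torch.stack(outs).mean(dim=0)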
        shuffle_nonempty_imgs=shuffle_nonempty_imgs,
        is_ordinal=is_ordinal,
        num_workers=n_workers)

    # Model
    model_func = get_panda_model(model_name,
                                 arch=arch,
                                 n=model_n_out,
                                 final_dropout=final_dropout,
                                 num_tiles=N)
    # create the model config, which is saved alongside the models
    config = ModelConfig(model_name=model_name,
                         arch=arch,
                         model_n_out=model_n_out,
                         sz=sz,
                         N=N,
                         mean=mean.numpy(),
                         std=std.numpy(),
                         meta=meta)
    loss_func = get_default_loss(model_name, data, is_ordinal=is_ordinal)
    default_metrics, monitor_metric = get_default_metrics(model_name,
                                                          data=data,
                                                          is_ordinal=is_ordinal)

    def default_callback_fns():
        cb_fns = [ShowGraph]
        if gradient_accumulation > 1:
            accumulator = partial(GradientAccumulator,
def main(_):
    tf.logging.set_verbosity(tf.logging.DEBUG)
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")
    model_config = ModelConfig.from_json_file(FLAGS.config_file)
    if FLAGS.max_seq_length > model_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, model_config.max_position_embeddings))
    tf.gfile.MakeDirs(FLAGS.output_dir)
    processor = TextProcessor(labels=["1", "2"])
    label_list = processor.get_labels()
    tokenizer = tokenization.Tokenizer(vocab_file=FLAGS.vocab_file,
                                       stop_words_file=FLAGS.stop_words_file,
                                       use_pos=FLAGS.use_pos)
    tf.logging.info("model_config vocab_size:%d, tokenizer.vocab_size:%d" %
                    (model_config.vocab_size, tokenizer.vocab_size))
    assert model_config.vocab_size == tokenizer.vocab_size
    if FLAGS.embedding_table is not None:
        embedding_table = load_embedding_table(FLAGS.embedding_table)
        # only check the table shape when a table was actually loaded;
        # the unguarded assert crashed when embedding_table was None
        assert len(tokenizer.vocab) == embedding_table.shape[0]
    else:
        embedding_table = None
    # Training steps come directly from flags; counting examples in the raw
    # training data (and deriving steps/warmup from it) is disabled here.
    train_examples = None
    num_train_steps = FLAGS.num_train_steps
    num_warmup_steps = FLAGS.num_warmup_steps
    run_config = tf.estimator.RunConfig(model_dir=FLAGS.output_dir,
                                        save_summary_steps=100,
                                        save_checkpoints_steps=1000,
                                        keep_checkpoint_max=6,
                                        log_step_count_steps=100)
    model_fn = model_fn_builder(
        model_config=model_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        embedding_table_value=embedding_table,
        embedding_table_trainable=FLAGS.embedding_table_trainable,
        use_one_hot_embeddings=False)
    params = {
        "batch_size": FLAGS.batch_size,
    }
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       config=run_config,
                                       params=params)
    if FLAGS.do_train:
        # training reads a pre-built TFRecord file directly, so feature
        # conversion and the example-count logging are disabled here
        train_file = FLAGS.train_data
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    if FLAGS.do_eval:
        # dev-set evaluation (feature conversion, estimator.evaluate and the
        # eval_results.txt report) is disabled in this configuration
        pass
    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.pred_data)
        num_actual_predict_examples = len(predict_examples)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)
        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d", num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)
        result = estimator.predict(input_fn=predict_input_fn, hooks=None)
        output_predict_file = os.path.join(FLAGS.output_dir, "pred_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for i, prediction in enumerate(result):
                text_representation = prediction["text_representation"]
                keyword_probs = prediction["keyword_probs"]
                input_ids = prediction["input_ids"]
                if i >= num_actual_predict_examples:
                    break
                # token indices sorted by keyword probability, ascending
                sorted_keyword_probs = np.argsort(keyword_probs, axis=-1)
                top_keyword_ids = []
                top_keyword_probs = []
                # take the five most probable tokens; use a separate loop
                # variable so the outer `i` is not shadowed
                for rank in range(-1, -6, -1):
                    idx = sorted_keyword_probs[rank]
                    top_keyword_ids.append(input_ids[idx])
                    top_keyword_probs.append(keyword_probs[idx])
                top_keywords = tokenizer.convert_ids_to_tokens(top_keyword_ids)
                output_line = "\t".join(
                    kw + ":" + str(prob)
                    for kw, prob in zip(top_keywords, top_keyword_probs)) + "\n"
                writer.write(output_line)
                # also write every token with its probability for inspection
                words = tokenizer.convert_ids_to_tokens(input_ids)
                check_line = "\t".join(
                    w + ":" + str(prob)
                    for w, prob in zip(words, keyword_probs)) + "\n"
                writer.write(check_line)
                num_written_lines += 1
        print("num_written_lines:%d, num_actual_predict_examples:%d" %
              (num_written_lines, num_actual_predict_examples))
        assert num_written_lines == num_actual_predict_examples
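# Hedged reader sketch for the pred_results.tsv written above. The format is
# inferred from the writer loop: lines alternate between a top-5 keyword line
# and a full token line, each a tab-separated list of "token:prob" fields.
# The helper name read_pred_results is illustrative, not part of the original.
def read_pred_results(path):
    results = []
    with open(path, encoding="utf-8") as f:
        lines = [line.rstrip("\n") for line in f]
    # only the top-keyword lines (even indices) are parsed here
    for top_line in lines[0::2]:
        pairs = [field.rsplit(":", 1) for field in top_line.split("\t")]
        results.append([(tok, float(prob)) for tok, prob in pairs])
    return results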
def main(_):
    tf.logging.set_verbosity(tf.logging.DEBUG)
    if (not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict
            and not FLAGS.do_encode):
        raise ValueError(
            "At least one of `do_train`, `do_eval`, `do_predict` or "
            "`do_encode` must be True.")
    model_config = ModelConfig.from_json_file(FLAGS.config_file)
    if FLAGS.max_seq_length > model_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, model_config.max_position_embeddings))
    tf.gfile.MakeDirs(FLAGS.output_dir)
    processor = PairTextProcessor()
    tokenizer = tokenization.Tokenizer(vocab_file=FLAGS.vocab_file,
                                       stop_words_file=FLAGS.stop_words_file,
                                       use_pos=False)
    tf.logging.info("model_config vocab_size:%d, tokenizer.vocab_size:%d" %
                    (model_config.vocab_size, tokenizer.vocab_size))
    assert model_config.vocab_size == tokenizer.vocab_size
    if FLAGS.embedding_table is not None:
        embedding_table = load_embedding_table(FLAGS.embedding_table)
        # only check the table shape when a table was actually loaded;
        # the unguarded assert crashed when embedding_table was None
        assert len(tokenizer.vocab) == embedding_table.shape[0]
    else:
        embedding_table = None
    train_examples = None
    num_train_steps = FLAGS.num_train_steps
    num_warmup_steps = FLAGS.num_warmup_steps
    run_config = tf.estimator.RunConfig(model_dir=FLAGS.output_dir,
                                        save_summary_steps=100,
                                        save_checkpoints_steps=1000,
                                        keep_checkpoint_max=6,
                                        log_step_count_steps=100)
    model_fn = model_fn_builder(
        model_config=model_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        do_encode=FLAGS.do_encode,
        embedding_table_value=embedding_table,
        embedding_table_trainable=FLAGS.embedding_table_trainable,
        use_one_hot_embeddings=False)
    params = {
        "batch_size": FLAGS.batch_size,
    }
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       config=run_config,
                                       params=params)
    if FLAGS.do_train:
        train_file = FLAGS.train_data
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    elif FLAGS.do_eval:
        # evaluation is not implemented in this configuration
        pass
    elif FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.pred_data)
        num_actual_predict_examples = len(predict_examples)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_pairexamples_to_features(predict_examples,
                                                    FLAGS.max_seq_length,
                                                    tokenizer, predict_file)
        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d", num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)
        result = estimator.predict(input_fn=predict_input_fn, hooks=None)
        output_predict_file = os.path.join(FLAGS.output_dir, "pred_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for i, prediction in enumerate(result):
                text_representation = prediction["text_representation"]
                keyword_probs = prediction["keyword_probs"]
                input_ids = prediction["input_ids"]
                if i >= num_actual_predict_examples:
                    break
                # token indices sorted by keyword probability, ascending
                sorted_keyword_probs = np.argsort(keyword_probs, axis=-1)
                top_keyword_ids = []
                top_keyword_probs = []
                # take the five most probable tokens; use a separate loop
                # variable so the outer `i` is not shadowed
                for rank in range(-1, -6, -1):
                    idx = sorted_keyword_probs[rank]
                    top_keyword_ids.append(input_ids[idx])
                    top_keyword_probs.append(keyword_probs[idx])
                top_keywords = tokenizer.convert_ids_to_tokens(top_keyword_ids)
                output_line = "\t".join(
                    kw + ":" + str(prob)
                    for kw, prob in zip(top_keywords, top_keyword_probs)) + "\n"
                writer.write(output_line)
                # also write every token with its probability for inspection
                words = tokenizer.convert_ids_to_tokens(input_ids)
                check_line = "\t".join(
                    w + ":" + str(prob)
                    for w, prob in zip(words, keyword_probs)) + "\n"
                writer.write(check_line)
                num_written_lines += 1
        print("num_written_lines:%d, num_actual_predict_examples:%d" %
              (num_written_lines, num_actual_predict_examples))
        assert num_written_lines == num_actual_predict_examples
    elif FLAGS.do_encode:
        encode_input_file = FLAGS.encode_data
        encode_input_fn = file_based_encode_input_fn_builder(
            input_file=encode_input_file,
            max_seq_length=FLAGS.max_seq_length,
            tokenizer=tokenizer)
        output_file = FLAGS.encode_output
        result = estimator.predict(input_fn=encode_input_fn, hooks=None)
        text_embeddings = []
        for idx, item in enumerate(result):
            text_embeddings.append(item["text_representation"])
            if idx < 10:
                tf.logging.info("%s" % (item["text_representation"]))
        # persist all embeddings as a single pickle
        with open(output_file, "wb") as wfp:
            pickle.dump(text_embeddings, wfp)
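# Minimal read-back sketch for the embeddings pickle written by the do_encode
# branch above; `path` is the value that was passed as FLAGS.encode_output.
# Relies on the pickle and numpy imports of the surrounding script; the
# helper name load_text_embeddings is illustrative.
def load_text_embeddings(path):
    with open(path, "rb") as rfp:
        # list of per-text representation vectors, stacked into one matrix
        return np.asarray(pickle.load(rfp), dtype=np.float32)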
from train_config import FLAGS
from train_aux_fn import get_loss_heatmap
from train_aux_fn import learning_rate_schedule
from train_aux_fn import learning_rate_exp_decay
from train_aux_fn import get_heatmap_activation
from train_aux_fn import metric_fn
from train_aux_fn import summary_fn
from tensorflow.contrib.training.python.training import evaluation
from tensorflow.python.estimator import estimator

# config instance generation
train_config = TrainConfig()
model_config = ModelConfig()
preproc_config = PreprocessingConfig()

train_config_dict = train_config.__dict__
model_config_dict = model_config.__dict__
preproc_config_dict = preproc_config.__dict__


def model_fn(features, labels, mode, params):
    """The model_fn for the dontbeturtle model, to be used with an Estimator.

    Returns:
        An `EstimatorSpec` for the model.
    """
    del params  # unused
def evaluate_model_dir(model_dir, sampler=None, TRAIN=None, LABELS=None,
                       **kwargs):
    """Evaluates CV models out-of-fold and saves stats to the model dir.

    Provide either sampler or TRAIN and LABELS.

    model_dir: directory containing the models
    sampler (FoldSampler): optional data sampler instance
    TRAIN: optional training images folder
    LABELS: optional train.csv path
    """
    # load config and models
    config = ModelConfig.fromDir(model_dir)
    models = load_models_from_dir(model_dir)
    model_name = config.getField('model_name')
    regr = "regr" in model_name
    n_folds = len(models)
    sz = config.getField('sz')
    mean = torch.tensor(np.array(config.getField('mean')).astype(np.float32))
    std = torch.tensor(np.array(config.getField('std')).astype(np.float32))
    N = config.getField('N')
    is_ordinal = config.getMetaField('is_ordinal')
    if sampler is None:
        assert (TRAIN is not None and LABELS is not None), \
            "Either sampler or TRAIN + LABELS must be provided"
        sampler = FoldSampler(TRAIN, LABELS, mean, std, N,
                              tfms=[], sz=sz, bs=1, n_folds=n_folds,
                              is_ordinal=is_ordinal, model_name=model_name)
    # evaluate out of fold
    val_qwks = []
    karolinska_preds, karolinska_targets = [], []
    radboud_preds, radboud_targets = [], []
    all_preds, all_targets = [], []
    score_dict = {}
    for fold, model in zip(range(n_folds), models):
        data = sampler.get_data(fold)
        default_metrics, monitor_metric = get_default_metrics(
            model_name, data=data, is_ordinal=is_ordinal)
        learn = Learner(data, model, metrics=default_metrics,
                        opt_func=Over9000).to_fp16()
        learn.create_opt(1e-3, 0.9)
        # calculate data-provider-specific scores
        preds, targets, losses = learn.get_preds(with_loss=True)
        targets = targets.numpy()
        if is_ordinal:
            targets = ordinalRegs2cat(targets)
            losses = torch.sum(losses.view(preds.shape[0], preds.shape[1]),
                               axis=1)
        if not regr:
            if is_ordinal:
                preds = ordinalRegs2cat((preds > 0.5).numpy())
            else:
                preds = np.argmax(preds.numpy(), axis=1)
        else:
            # convert regression outputs to categories
            preds = regrPreds2cat(preds)
        all_preds += list(preds)
        all_targets += list(targets)
        # fold qwk
        val_qwk = cohen_kappa_score(preds, targets, weights="quadratic")
        val_qwks.append(val_qwk)
        score_dict[f'{fold}_qwk'] = str(val_qwk)
        # split predictions by data provider ('karolinska' vs 'radboud')
        data_providers = [
            sampler.df[sampler.df.image_id == os.path.basename(_id)]
            .data_provider.values[0] for _id in data.valid_ds.items
        ]
        for pred, target, provider in zip(preds, targets, data_providers):
            if provider == "karolinska":
                karolinska_preds.append(pred)
                karolinska_targets.append(target)
            else:
                radboud_preds.append(pred)
                radboud_targets.append(target)
        # plot top and min losses
        plot_samples(data, losses, preds,
                     sampler.df[sampler.df.split == fold].image_id.values)
        plt.savefig(os.path.join(model_dir,
                                 "losses_fold-{0}.png".format(fold)),
                    transparent=False)
        # confusion matrices (the original regression and classification
        # branches were identical, so they are collapsed into one block)
        _ = plot_confusion_matrix_scipy(
            preds, targets, normalize=False,
            title='fold:{0} - qwk:{1:.3f}'.format(fold, val_qwk))
        plt.savefig(os.path.join(model_dir, "cm_fold-{0}.png".format(fold)),
                    transparent=False)
        cm = plot_confusion_matrix_scipy(
            preds, targets, normalize=True,
            title='Norm. fold:{0} - qwk:{1:.3f}'.format(fold, val_qwk))
        plt.savefig(os.path.join(model_dir,
                                 "cm_fold-{0}-norm.png".format(fold)),
                    transparent=False)
        # save confusion matrix values
        np.save(os.path.join(model_dir, "cm_fold-{0}.npy".format(fold)), cm)
    cv_qwk = cohen_kappa_score(np.array(all_preds), np.array(all_targets),
                               weights="quadratic")
    score_dict['cv_qwk'] = str(cv_qwk)
    score_dict['karolinska_qwk'] = str(
        cohen_kappa_score(karolinska_preds, karolinska_targets,
                          weights="quadratic"))
    score_dict['radboud_qwk'] = str(
        cohen_kappa_score(radboud_preds, radboud_targets,
                          weights="quadratic"))
    # save out-of-fold predictions
    np.save(os.path.join(model_dir, 'oof_preds.npy'), np.array(all_preds))
    np.save(os.path.join(model_dir, 'oof_trues.npy'), np.array(all_targets))
    with open(os.path.join(model_dir, 'eval.json'), 'w') as outfile:
        json.dump(score_dict, outfile, indent=4)
    # record for the notebook
    print(score_dict)
    plt.close('all')
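# Minimal sketch for consuming the artifacts saved by evaluate_model_dir
# above (file names match what it writes). Recomputing the CV qwk from the
# saved out-of-fold arrays is just a consistency check for illustration; the
# helper name load_eval_results is not part of the original code.
def load_eval_results(model_dir):
    with open(os.path.join(model_dir, 'eval.json')) as f:
        scores = json.load(f)
    preds = np.load(os.path.join(model_dir, 'oof_preds.npy'))
    trues = np.load(os.path.join(model_dir, 'oof_trues.npy'))
    # the stored cv_qwk string should match a recomputation from the arrays
    assert np.isclose(float(scores['cv_qwk']),
                      cohen_kappa_score(preds, trues, weights="quadratic"))
    return scores, preds, trues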