Example #1
def __init__(self,
             init_checkpoint,
             vocab_file,
             stop_words_file,
             config_file,
             embedding_table_file,
             index_vecs_file,
             index_data_file,
             listen_port=5022,
             # Default must be a Logger (not a Handler) so .info() works.
             logger=logging.getLogger(__name__)):
    super(WordVecTransformer, self).__init__()
    self.logger = logger
    self.port = listen_port
    self.vec_size = 2400
    if not os.path.exists("./tmp"):
        os.mkdir("./tmp")
    model_dir = "./tmp"
    self.index_vecs = np.asarray(self.load_index_bin(index_vecs_file),
                                 dtype=np.float32)
    self.index_data = self.load_index_data(index_data_file)
    self.tokenizer = tokenization.Tokenizer(
        vocab_file=vocab_file,
        stop_words_file=stop_words_file,
        use_pos=False)
    self.model_config = ModelConfig.from_json_file(config_file)
    self.estimator = create_estimator(self.model_config, init_checkpoint,
                                      model_dir, embedding_table_file)
    self.build_index(self.index_vecs)
    self.logger.info("Finish WordVecTransformer init.")
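A minimal construction sketch for the class above; every path is a hypothetical placeholder, and only the keyword names come from the __init__ signature:

# Hypothetical paths throughout; the signature matches the example above.
transformer = WordVecTransformer(
    init_checkpoint="./ckpt/model.ckpt",
    vocab_file="./data/vocab.txt",
    stop_words_file="./data/stop_words.txt",
    config_file="./data/config.json",
    embedding_table_file="./data/embedding_table.npy",
    index_vecs_file="./data/index_vecs.bin",
    index_data_file="./data/index_data.txt",
    listen_port=5022)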
Example #2
def main():
    mean = FLAGS.mean
    std = FLAGS.std

    config_path = os.path.join(FLAGS.model_dir, "config.json")
    if not os.path.isfile(config_path):
        # create the model config
        config = ModelConfig(
            model_name=FLAGS.model_name,
            arch=FLAGS.arch,
            model_n_out=FLAGS.model_n_out,
            sz=FLAGS.sz,
            N=FLAGS.N,
            mean=np.array(mean),
            std=np.array(std),
            meta={"model_file_prefix": FLAGS.model_file_prefix})
        config.toDir(FLAGS.model_dir)

    evaluate_model_dir(FLAGS.model_dir,
                       sampler=None,
                       TRAIN=FLAGS.train_image_dir,
                       LABELS=FLAGS.train_csv)
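Example #2's `toDir` and the `fromDir` calls in the later examples imply a save/load round trip for ModelConfig; a hedged sketch with placeholder values (only the field names are taken from the snippets):

# Placeholder values; field names come from the ModelConfig calls above.
config = ModelConfig(model_name="panda_regr",
                     arch="resnet34",
                     model_n_out=6,
                     sz=128,
                     N=12,
                     mean=np.array([0.5, 0.5, 0.5]),
                     std=np.array([0.5, 0.5, 0.5]),
                     meta={"model_file_prefix": "model_"})
config.toDir("./model_dir")                   # persist next to the fold checkpoints
config = ModelConfig.fromDir("./model_dir")   # read it back later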
Example #3
def load_weights(model_dir, fold):
    # load config
    config = ModelConfig.fromDir(model_dir)

    n_folds = config.getMetaField('n_folds')
    if n_folds is None: n_folds = 4

    model_file_prefix = config.getMetaField('model_file_prefix')
    if model_file_prefix is None: model_file_prefix = ""

    # load models
    model_path = os.path.join(model_dir, f'{model_file_prefix}{fold}.pth')
    assert os.path.isfile(model_path), f'Model not found {model_path}'

    state_dict = torch.load(model_path)
    return state_dict
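A short usage sketch for `load_weights`; the directory is a placeholder, and the model builder is assumed to exist (compare `model_func` in Example #4 below):

state_dict = load_weights("./models/panda", fold=0)  # hypothetical model_dir
model = model_func()       # builder matching the saved architecture (assumed)
model.load_state_dict(state_dict)
model.eval()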
Example #4
def load_models_from_dir(model_dir, tile_list_input=True):
    # load config
    config = ModelConfig.fromDir(model_dir)

    n_folds = config.getMetaField('n_folds')
    if n_folds is None: n_folds = 4

    model_file_prefix = config.getMetaField('model_file_prefix')
    if model_file_prefix is None: model_file_prefix = ""

    model_name = config.getField('model_name')
    arch = config.getField('arch')
    model_n_out = config.getField('model_n_out')
    N = config.getField('N')

    # load models
    model_paths = [
        os.path.join(model_dir, f'{model_file_prefix}{i}.pth')
        for i in range(n_folds)
    ]
    model_func = get_panda_model(model_name,
                                 arch,
                                 n=model_n_out,
                                 num_tiles=N,
                                 pretrained=False,
                                 is_train=False,
                                 tile_list_input=tile_list_input)

    models = []
    for model_path in model_paths:
        # Missing folds are skipped rather than asserted on.
        if os.path.isfile(model_path):
            # Pass map_location=torch.device('cpu') for CPU-only loading.
            state_dict = torch.load(model_path)
            model = model_func()
            model.load_state_dict(state_dict)
            model.float()
            model.eval()
            model.cuda()
            models.append(model)

    return models
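Since `load_models_from_dir` returns one model per fold found on disk, inference is naturally an ensemble average over folds; a minimal sketch (the directory and input shape are placeholders):

import torch

models = load_models_from_dir("./models/panda")  # hypothetical directory
x = torch.randn(1, 12, 3, 128, 128).cuda()       # placeholder tile batch
with torch.no_grad():
    preds = torch.stack([m(x) for m in models]).mean(dim=0)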
Example #5
                      shuffle_nonempty_imgs=shuffle_nonempty_imgs,
                      is_ordinal=is_ordinal,
                      num_workers=n_workers)

# Model
model_func = get_panda_model(model_name,
                             arch=arch,
                             n=model_n_out,
                             final_dropout=final_dropout,
                             num_tiles=N)

# create the model config which is saved along the models
config = ModelConfig(model_name=model_name,
                     arch=arch,
                     model_n_out=model_n_out,
                     sz=sz,
                     N=N,
                     mean=mean.numpy(),
                     std=std.numpy(),
                     meta=meta)

loss_func = get_default_loss(model_name, data, is_ordinal=is_ordinal)

default_metrics, monitor_metric = get_default_metrics(model_name,
                                                      data=data,
                                                      is_ordinal=is_ordinal)


def default_callback_fns():
    cb_fns = [ShowGraph]
    if gradient_accumulation > 1:
        accumulator = partial(GradientAccumulator,
Example #6
def main(_):
    tf.logging.set_verbosity(tf.logging.DEBUG)
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True.")

    model_config = ModelConfig.from_json_file(FLAGS.config_file)
    if FLAGS.max_seq_length > model_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, model_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    processor = TextProcessor(labels=["1","2"])
    label_list = processor.get_labels()

    tokenizer = tokenization.Tokenizer(vocab_file=FLAGS.vocab_file,
                                       stop_words_file=FLAGS.stop_words_file,
                                       use_pos=FLAGS.use_pos)
    tf.logging.info("model_config vocab_size:%d, tokenizer.vocab_size:%d" %
                    (model_config.vocab_size, tokenizer.vocab_size))
    assert model_config.vocab_size == tokenizer.vocab_size

    if FLAGS.embedding_table is not None:
        embedding_table = load_embedding_table(FLAGS.embedding_table)
        # Check the vocab/table sizes only when a table was actually loaded;
        # the unconditional assert would crash when embedding_table is None.
        assert len(tokenizer.vocab) == embedding_table.shape[0]
    else:
        embedding_table = None

    #train_examples = processor.get_train_examples(FLAGS.train_data)
    train_examples = None
    num_train_steps = FLAGS.num_train_steps
    num_warmup_steps = FLAGS.num_warmup_steps
    #if FLAGS.do_train:
    #    train_examples = processor.get_train_examples(FLAGS.train_data)
    #    num_train_steps = int(
    #        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    #    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        save_summary_steps=100,
        save_checkpoints_steps=1000,
        keep_checkpoint_max=6,
        log_step_count_steps=100)

    model_fn = model_fn_builder(
        model_config=model_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        embedding_table_value=embedding_table,
        embedding_table_trainable=FLAGS.embedding_table_trainable,
        use_one_hot_embeddings=False)

    params = {
        "batch_size":FLAGS.batch_size,
    }
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params=params)

    if FLAGS.do_train:
        #train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        #file_based_convert_examples_to_features(
        #    train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
        #tf.logging.info("***** Running training *****")
        #tf.logging.info("  Num examples = %d", len(train_examples))
        #tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        #tf.logging.info("  Num steps = %d", num_train_steps)
        train_file = FLAGS.train_data
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        pass
        #eval_examples = processor.get_dev_examples(FLAGS.eval_data)
        #num_actual_eval_examples = len(eval_examples)
        #eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        #file_based_convert_examples_to_features(
        #    eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

        #tf.logging.info("***** Running evaluation *****")
        #tf.logging.info(" Num examples = %d", num_actual_eval_examples)
        #tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

        #eval_input_fn = file_based_input_fn_builder(
        #    input_file=eval_file,
        #    seq_length=FLAGS.max_seq_length,
        #    is_training=False,
        #    drop_remainder=False)

        #eval_steps = None
        #result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        #output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        #with tf.gfile.GFile(output_eval_file, "w") as writer:
        #    tf.logging.info("***** Eval results *****")
        #    for key in sorted(result.keys()):
        #        tf.logging.info("  %s = %s", key, str(result[key]))
        #        writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.pred_data)
        num_actual_predict_examples = len(predict_examples)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d ", num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        result = estimator.predict(input_fn=predict_input_fn, hooks=None)
        output_predict_file = os.path.join(FLAGS.output_dir, "pred_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                text_representation = prediction["text_representation"]
                keyword_probs = prediction["keyword_probs"]
                input_ids = prediction["input_ids"]
                if i >= num_actual_predict_examples:
                    break

                sorted_keyword_probs = np.argsort(keyword_probs, axis=-1)
                top_keyword_ids = []
                top_keyword_probs = []
                # Take the five largest probabilities by walking argsort from
                # the back; use j so the outer enumerate index i is not clobbered.
                for j in range(-1, -6, -1):
                    idx = sorted_keyword_probs[j]
                    top_keyword_ids.append(input_ids[idx])
                    top_keyword_probs.append(keyword_probs[idx])
                    
                #for i, idx in enumerate(sorted_keyword_probs):
                #    top_keyword_ids.append(input_ids[idx])
                #    top_keyword_probs.append(keyword_probs[idx])
                #    if i >= 5:
                #        break
                top_keywords = tokenizer.convert_ids_to_tokens(top_keyword_ids)
                output_line = "\t".join(kw + ":" + str(prob) for kw,prob in zip(top_keywords, top_keyword_probs)) + "\n"
                writer.write(output_line)
                words = tokenizer.convert_ids_to_tokens(input_ids)
                check_line = "\t".join(w + ":" + str(prob) for w, prob in zip(words, keyword_probs)) + "\n"
                writer.write(check_line)
                num_written_lines += 1
        print("num_writen_lines:%d,num_actual_predict_examples:%d"%(num_written_lines, num_actual_predict_examples))
        assert num_written_lines == num_actual_predict_examples
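The top-5 extraction above works because `np.argsort` returns indices in ascending order of value, so walking the result backwards yields the largest entries first; a self-contained check:

import numpy as np

probs = np.array([0.1, 0.7, 0.05, 0.9, 0.3, 0.2])
order = np.argsort(probs)                      # indices, ascending by value
top5 = [order[j] for j in range(-1, -6, -1)]   # largest first
print(top5)                                    # [3, 1, 4, 5, 0]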
Example #7
def main(_):
    tf.logging.set_verbosity(tf.logging.DEBUG)
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict and not FLAGS.do_encode:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' or `do_encode` must be True."
        )

    model_config = ModelConfig.from_json_file(FLAGS.config_file)
    if FLAGS.max_seq_length > model_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, model_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)
    processor = PairTextProcessor()
    tokenizer = tokenization.Tokenizer(vocab_file=FLAGS.vocab_file,
                                       stop_words_file=FLAGS.stop_words_file,
                                       use_pos=False)
    tf.logging.info("model_config vocab_size:%d, tokenizer.vocab_size:%d" %
                    (model_config.vocab_size, tokenizer.vocab_size))
    assert model_config.vocab_size == tokenizer.vocab_size

    if FLAGS.embedding_table is not None:
        embedding_table = load_embedding_table(FLAGS.embedding_table)
        # Only check sizes when a table was actually loaded; asserting on
        # None would raise an AttributeError.
        assert len(tokenizer.vocab) == embedding_table.shape[0]
    else:
        embedding_table = None

    train_examples = None
    num_train_steps = FLAGS.num_train_steps
    num_warmup_steps = FLAGS.num_warmup_steps

    run_config = tf.estimator.RunConfig(model_dir=FLAGS.output_dir,
                                        save_summary_steps=100,
                                        save_checkpoints_steps=1000,
                                        keep_checkpoint_max=6,
                                        log_step_count_steps=100)

    model_fn = model_fn_builder(
        model_config=model_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        do_encode=FLAGS.do_encode,
        embedding_table_value=embedding_table,
        embedding_table_trainable=FLAGS.embedding_table_trainable,
        use_one_hot_embeddings=False)

    params = {
        "batch_size": FLAGS.batch_size,
    }
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       config=run_config,
                                       params=params)

    if FLAGS.do_train:
        train_file = FLAGS.train_data
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    elif FLAGS.do_eval:
        pass

    elif FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.pred_data)
        num_actual_predict_examples = len(predict_examples)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_pairexamples_to_features(predict_examples,
                                                    FLAGS.max_seq_length,
                                                    tokenizer, predict_file)
        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d ", num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        result = estimator.predict(input_fn=predict_input_fn, hooks=None)
        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "pred_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                text_representation = prediction["text_representation"]
                keyword_probs = prediction["keyword_probs"]
                input_ids = prediction["input_ids"]
                if i >= num_actual_predict_examples:
                    break

                sorted_keyword_probs = np.argsort(keyword_probs, axis=-1)
                top_keyword_ids = []
                top_keyword_probs = []
                # Use j so the outer enumerate index i is not clobbered.
                for j in range(-1, -6, -1):
                    idx = sorted_keyword_probs[j]
                    top_keyword_ids.append(input_ids[idx])
                    top_keyword_probs.append(keyword_probs[idx])

                top_keywords = tokenizer.convert_ids_to_tokens(top_keyword_ids)
                output_line = "\t".join(
                    kw + ":" + str(prob) for kw, prob in zip(
                        top_keywords, top_keyword_probs)) + "\n"
                writer.write(output_line)
                words = tokenizer.convert_ids_to_tokens(input_ids)
                check_line = "\t".join(
                    w + ":" + str(prob)
                    for w, prob in zip(words, keyword_probs)) + "\n"
                writer.write(check_line)
                num_written_lines += 1
        print("num_writen_lines:%d,num_actual_predict_examples:%d" %
              (num_written_lines, num_actual_predict_examples))
        assert num_written_lines == num_actual_predict_examples
    elif FLAGS.do_encode:
        encode_input_file = FLAGS.encode_data
        encode_input_fn = file_based_encode_input_fn_builder(
            input_file=encode_input_file,
            max_seq_length=FLAGS.max_seq_length,
            tokenizer=tokenizer)

        output_file = FLAGS.encode_output
        result = estimator.predict(input_fn=encode_input_fn, hooks=None)
        text_embeddings = []
        for idx, item in enumerate(result):
            text_embeddings.append(item["text_representation"])
            if idx < 10:
                tf.logging.info("%s" % (item["text_representation"]))
        # Context manager replaces the manual open/close pair.
        with open(output_file, "wb") as wfp:
            pickle.dump(text_embeddings, wfp)
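Reading the encoded embeddings back is a plain pickle round-trip; a minimal sketch (the file name is a placeholder for FLAGS.encode_output):

import pickle

with open("encode_output.pkl", "rb") as fp:   # placeholder path
    text_embeddings = pickle.load(fp)         # list of representation vectors
print(len(text_embeddings))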
Example #8
from train_config import FLAGS

from train_aux_fn import get_loss_heatmap
from train_aux_fn import learning_rate_schedule
from train_aux_fn import learning_rate_exp_decay
from train_aux_fn import get_heatmap_activation
from train_aux_fn import metric_fn
from train_aux_fn import summary_fn

from tensorflow.contrib.training.python.training import evaluation
from tensorflow.python.estimator import estimator

# config instance generation
train_config = TrainConfig()
model_config = ModelConfig()
preproc_config = PreprocessingConfig()

train_config_dict = train_config.__dict__
model_config_dict = model_config.__dict__
preproc_config_dict = preproc_config.__dict__


def model_fn(features, labels, mode, params):
    """
    The model_fn for dontbeturtle model to be used with Estimator.
        Returns:
        A `EstimatorSpec` for the model
    """
    del params  # unused
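The three `__dict__` reads above snapshot each config object's instance attributes as a plain dict, which is convenient for logging or merging into Estimator params; a minimal sketch with hypothetical fields:

class DemoConfig:
    def __init__(self):
        self.batch_size = 32        # hypothetical fields
        self.learning_rate = 1e-3

print(DemoConfig().__dict__)        # {'batch_size': 32, 'learning_rate': 0.001}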
Example #9
def evaluate_model_dir(model_dir,
                       sampler=None,
                       TRAIN=None,
                       LABELS=None,
                       **kwargs):
    """
    Evaluates CV models in out-of-fold fashion and saves some stats to the model dir

    Provide either sampler or TRAIN and LABELS.
    model_dir: directory containing models
    sampler (FoldSampler): optional data sampler instance
    TRAIN: optional training images folder
    LABELS: optional train.csv path
    """
    # load config
    config = ModelConfig.fromDir(model_dir)
    # load models
    models = load_models_from_dir(model_dir)
    model_name = config.getField('model_name')
    regr = "regr" in model_name

    n_folds = len(models)
    sz = config.getField('sz')
    mean = torch.tensor(np.array(config.getField('mean')).astype(np.float32))
    std = torch.tensor(np.array(config.getField('std')).astype(np.float32))
    N = config.getField('N')
    is_ordinal = config.getMetaField('is_ordinal')

    if sampler is None:
        assert (TRAIN is not None and LABELS is not None
                ), "Either sampler or TRAIN + LABELS must be provided"

        sampler = FoldSampler(TRAIN,
                              LABELS,
                              mean,
                              std,
                              N,
                              tfms=[],
                              sz=sz,
                              bs=1,
                              n_folds=n_folds,
                              is_ordinal=is_ordinal,
                              model_name=model_name)

    # evaluate out of fold
    val_qwks = []
    karolinska_preds = []
    karolinska_targets = []
    radboud_preds = []
    radboud_targets = []
    all_preds = []
    all_targets = []
    score_dict = {}
    for fold, model in enumerate(models):
        data = sampler.get_data(fold)
        default_metrics, monitor_metric = get_default_metrics(
            model_name, data=data, is_ordinal=is_ordinal)
        learn = Learner(data,
                        model,
                        metrics=default_metrics,
                        opt_func=Over9000).to_fp16()
        learn.create_opt(1e-3, 0.9)

        # calculate data provider specific scores
        preds, targets, losses = learn.get_preds(with_loss=True)
        targets = targets.numpy()
        if is_ordinal:
            targets = ordinalRegs2cat(targets)
            losses = torch.sum(losses.view(preds.shape[0], preds.shape[1]),
                               axis=1)

        if not regr:
            if is_ordinal:
                preds = ordinalRegs2cat((preds > 0.5).numpy())
            else:
                preds = np.argmax(preds.numpy(), axis=1)
        else:
            # convert to categories
            preds = regrPreds2cat(preds)

        all_preds += list(preds)
        all_targets += list(targets)

        # fold qwk
        val_qwk = cohen_kappa_score(preds, targets, weights="quadratic")
        val_qwks.append(val_qwk)
        score_dict[f'{fold}_qwk'] = str(val_qwk)

        # get 'karolinska' 'radboud' labels
        data_providers = [
            sampler.df[sampler.df.image_id == os.path.basename(
                _id)].data_provider.values[0] for _id in data.valid_ds.items
        ]
        for pred, target, provider in zip(preds, targets, data_providers):
            if provider == "karolinska":
                karolinska_preds.append(pred)
                karolinska_targets.append(target)
            else:
                radboud_preds.append(pred)
                radboud_targets.append(target)

        # plot top and min losses
        plot_samples(data, losses, preds,
                     sampler.df[sampler.df.split == fold].image_id.values)
        plt.savefig(os.path.join(model_dir,
                                 "losses_fold-{0}.png".format(fold)),
                    transparent=False)

        # confusion matrices (raw and normalized; identical for regr and
        # non-regr outputs, so a single block suffices)
        _ = plot_confusion_matrix_scipy(
            preds,
            targets,
            normalize=False,
            title='fold:{0} - qwk:{1:.3f}'.format(fold, val_qwk))
        plt.savefig(os.path.join(model_dir,
                                 "cm_fold-{0}.png".format(fold)),
                    transparent=False)

        cm = plot_confusion_matrix_scipy(
            preds,
            targets,
            normalize=True,
            title='Norm. fold:{0} - qwk:{1:.3f}'.format(fold, val_qwk))
        plt.savefig(os.path.join(model_dir,
                                 "cm_fold-{0}-norm.png".format(fold)),
                    transparent=False)

        # save confusion matrix values
        np.save(os.path.join(model_dir, "cm_fold-{0}.npy".format(fold)), cm)

    cv_qwk = cohen_kappa_score(np.array(all_preds),
                               np.array(all_targets),
                               weights="quadratic")
    score_dict['cv_qwk'] = str(cv_qwk)
    score_dict['karolinska_qwk'] = str(
        cohen_kappa_score(karolinska_preds,
                          karolinska_targets,
                          weights="quadratic"))
    score_dict['radboud_qwk'] = str(
        cohen_kappa_score(radboud_preds, radboud_targets, weights="quadratic"))

    # save out-of-fold predictions
    np.save(os.path.join(model_dir, 'oof_preds.npy'), np.array(all_preds))
    np.save(os.path.join(model_dir, 'oof_trues.npy'), np.array(all_targets))

    with open(os.path.join(model_dir, 'eval.json'), 'w') as outfile:
        json.dump(score_dict, outfile, indent=4)

    # record for the notebook
    print(score_dict)
    plt.close('all')
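The QWK values written to eval.json come from scikit-learn's cohen_kappa_score with quadratic weights; a self-contained check (toy labels, assuming scikit-learn is installed):

from sklearn.metrics import cohen_kappa_score

preds = [0, 1, 2, 2, 4]     # toy predictions
targets = [0, 1, 1, 2, 5]   # toy ground truth
print(cohen_kappa_score(preds, targets, weights="quadratic"))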