Code example #1
def main():
    dataset = 'cifar100'
    num_samples = 1000

    datafile = DATAFILE_LIST[dataset]
    num_classes = NUM_CLASSES_DICT[dataset]

    categories, observations, confidences, idx2category, category2idx, labels = prepare_data(datafile, False)

    # accuracy models
    accuracy_model = BetaBernoulli(k=num_classes, prior=None)
    accuracy_model.update_batch(categories, observations)

    # ece models for each class
    ece_model = ClasswiseEce(num_classes, num_bins=10, pseudocount=2)
    ece_model.update_batch(categories, observations, confidences)

    # draw samples from posterior of classwise accuracy
    accuracy_samples = accuracy_model.sample(num_samples)  # (num_categories, num_samples)
    ece_samples = ece_model.sample(num_samples)  # (num_categories, num_samples)

    accuracy = np.array([np.quantile(accuracy_samples, 0.025, axis=1),
                         np.quantile(accuracy_samples, 0.5, axis=1),
                         np.quantile(accuracy_samples, 0.975, axis=1)]).T
    ece = np.array([np.quantile(ece_samples, 0.025, axis=1),
                    np.quantile(ece_samples, 0.5, axis=1),
                    np.quantile(ece_samples, 0.975, axis=1)]).T
    fig, axes = plot_figure_1(accuracy, ece, labels=CIFAR100_CLASSES, limit=10, reverse=False)

    fig.tight_layout()
    fig.subplots_adjust(bottom=-0.2, wspace=0.35)
    fig.set_size_inches(COLUMN_WIDTH * 1.3, 2.0)
    fig.savefig(FIGURE_DIR + 'figure1.pdf', bbox_inches="tight", pad_inches=0.05)
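A note on the quantile step above: the three columns of accuracy and ece are the 2.5%, 50%, and 97.5% posterior quantiles per class, i.e. the median plus a 95% credible interval. A minimal self-contained sketch of that computation, using a hypothetical Beta posterior per class in place of the project's BetaBernoulli/ClasswiseEce models:

import numpy as np

rng = np.random.default_rng(0)
num_classes, num_samples = 5, 1000

# hypothetical posterior: Beta(alpha_k, beta_k) per class, e.g.
# alpha = 1 + #correct, beta = 1 + #incorrect under a uniform prior
alpha = rng.integers(1, 50, size=num_classes)
beta = rng.integers(1, 50, size=num_classes)
accuracy_samples = rng.beta(alpha[:, None], beta[:, None],
                            size=(num_classes, num_samples))

# lower bound, median, upper bound -> one row per class, shape (num_classes, 3)
accuracy = np.array([np.quantile(accuracy_samples, 0.025, axis=1),
                     np.quantile(accuracy_samples, 0.5, axis=1),
                     np.quantile(accuracy_samples, 0.975, axis=1)]).T
print(accuracy.shape)  # (5, 3)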
Code example #2
def test_shaps_to_probs_with_data() -> None:
    """
    test whether the shaps_to_probs tensorflow function actually calculates the correct
    class probabilities given actual shap values
    """
    from data_utils import load_costa_rica_dataset, prepare_data
    from xgboost_utils import fit_xgboost_classifier, calculate_shap_values

    # load data, train xgboost model, calculate shap values
    X, y = load_costa_rica_dataset()
    (n_samples, n_features, n_classes, X_train, X_valid, y_train, y_valid,
     y_train_onehot, y_valid_onehot, y_onehot,
     class_weights) = prepare_data(X, y)

    xgb_model = fit_xgboost_classifier(X_train, y_train)
    shap_values, expected_logits = calculate_shap_values(xgb_model, X)
    xgb_probs = xgb_model.predict_proba(X)

    # test shaps_to_probs
    sess = tf.Session()

    t_shaps = tf.placeholder(tf.float32)
    t_expected_logits = tf.placeholder(tf.float32)
    t_res = shaps_to_probs(t_shaps, t_expected_logits)

    shap_probs = sess.run(t_res,
                          feed_dict={
                              t_shaps: shap_values,
                              t_expected_logits: expected_logits
                          })
    print(np.allclose(shap_probs, xgb_probs))
    print()
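shaps_to_probs itself is not shown in this example. Under the usual SHAP additivity convention for multiclass XGBoost (the per-class logit is the expected logit plus the sum of that class's SHAP values over features), a plain-NumPy equivalent would look roughly like the sketch below; this is an assumption about the function's contract, not its actual implementation:

import numpy as np

def shaps_to_probs_np(shap_values, expected_logits):
    """Hypothetical NumPy counterpart of shaps_to_probs.

    Assumes shap_values has shape (n_samples, n_features, n_classes) and
    expected_logits has shape (n_classes,): reconstruct per-class logits as
    expected_logit + sum of SHAP values over features, then apply a softmax.
    """
    logits = shap_values.sum(axis=1) + expected_logits   # (n_samples, n_classes)
    logits -= logits.max(axis=1, keepdims=True)          # numerical stability
    exp_logits = np.exp(logits)
    return exp_logits / exp_logits.sum(axis=1, keepdims=True)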
Code example #3
File: train.py  Project: asrulsibaoel/SimpleBayes
def main():
	# Prepare dataset.
	raw_file = './data/y_n_all' # raw data file.
	prepared_dir = './data/' # dir of prepared data (sentences segmented, labels converted to numbers)
	# cut_mode = 'character'
	cut_mode = 'jieba'
	# cut_mode = '2-gram'
	vocab_dir = './data/Bayes_vocabulary'
	prepared_data,prepared_label = data_utils.prepare_data(raw_file,prepared_dir,cut_method = cut_mode, vocab_dir = vocab_dir)
	print('Get prepared dataset.')
	# pdb.set_trace()
	# print(prepared_data)

	# Get training and test dataset.
	training_dir = './data/train/'
	test_dir = './data/test/'
	ratio = 0
	train,test = data_utils.get_data(list(zip(prepared_data,prepared_label)),training_dir,test_dir,ratio = ratio, cut_method = cut_mode)
	print('Get training and test dataset.')
	# pdb.set_trace()
	# print(train.data)
	# train
	model_name = 'Bayes'
	config_dir = './model/'
	train_model = init_model(model_name,config_dir)
	print('Initialize the model.')
	training(train_model,train)
	print('Training finished.')
	# store the variable.
	v = {'model':train_model}
	model_file = train_model.model_path + 'model.pickle'
	with open(model_file,'wb') as f:
		pickle.dump(v, f)
Code example #4
def evaluate_model(model_path, dataset_path='emnist/emnist-balanced-test.csv'):
    raw_test_x, raw_test_y, class_map = data_utils.load_dataset(dataset_path)
    test_x, test_y, _ = data_utils.prepare_data(raw_test_x, raw_test_y,
                                                class_map)
    best_model = load_model(model_path)
    print(best_model.evaluate(test_x, test_y))
    data_utils.print_confusion_matrix(test_x, test_y, model_path, class_map)
Code example #5
File: translate_broken.py  Project: zxsted/nn_chatbot
def get_prepared_data():
  """ Figures out from the passed in flags, what the training data should be, and
      prepared it by setting up the tokenizer. """
  from_train = None
  to_train = None
  from_dev = None
  to_dev = None
  if FLAGS.from_train_data and FLAGS.to_train_data:
    from_train_data = FLAGS.from_train_data
    to_train_data = FLAGS.to_train_data
    from_dev_data = from_train_data
    to_dev_data = to_train_data
    if FLAGS.from_dev_data and FLAGS.to_dev_data:
      from_dev_data = FLAGS.from_dev_data
      to_dev_data = FLAGS.to_dev_data
    from_train, to_train, from_dev, to_dev, _, _ = data_utils.prepare_data(
        FLAGS.data_dir,
        from_train_data,
        to_train_data,
        from_dev_data,
        to_dev_data,
        FLAGS.from_vocab_size,
        FLAGS.to_vocab_size)
  else:
    pdb.set_trace()
    print("Must specify drom_train_data and to_train_data directories")
    exit(1)

  return (from_train, to_train, from_dev, to_dev)
Code example #6
File: models.py  Project: DanaCohen95/copycat
def example_evaluate_random_classifier() -> None:
    """ compare a random classifier  """
    from data_utils import load_costa_rica_dataset, prepare_data
    from xgboost_utils import fit_xgboost_classifier, calculate_shap_values, evaluate_xgboost_classifier

    # load data, train xgboost model, calculate shap values
    X, y = load_costa_rica_dataset()
    (n_samples, n_features, n_classes, X_train, X_valid, y_train, y_valid,
     y_train_onehot, y_valid_onehot, y_onehot,
     class_weights) = prepare_data(X, y)

    xgb_model = fit_xgboost_classifier(X_train, y_train)
    shap_values, expected_logits = calculate_shap_values(xgb_model, X)

    # evaluate xgboost classifier
    print("\n", "evaluate xgboost classifier:")
    evaluate_xgboost_classifier(xgb_model, X_valid, y_valid)

    # evaluate random classifier
    print(
        "\n",
        "evaluate random classifier (based of xgboost expected probabilities):"
    )
    evaluate_random_classifier(expected_logits=expected_logits,
                               y_true=y_valid,
                               n_clones=1000)
Code example #7
File: test2.py  Project: sodawater/simplify
def main(_):
    from_train_data = FLAGS.from_train_data
    to_train_data = FLAGS.to_train_data
    from_valid_data = FLAGS.from_valid_data
    to_valid_data = FLAGS.to_valid_data
    from_train, to_train, from_valid, to_valid, _, _ = data_utils.prepare_data(
        FLAGS.data_dir,
        from_train_data,
        to_train_data,
        from_valid_data,
        to_valid_data,
        FLAGS.from_vocab_size,
        FLAGS.to_vocab_size,
        same_vocab=False)
    hparams = create_hparams(FLAGS)
    hparams.add_hparam(name="from_train", value=from_train)
    hparams.add_hparam(name="to_train", value=to_train)
    hparams.add_hparam(name="from_valid", value=from_valid)
    hparams.add_hparam(name="to_valid", value=to_valid)
    from_vocab_path = os.path.join(hparams.data_dir,
                                   "vocab%d.from" % hparams.from_vocab_size)
    to_vocab_path = os.path.join(hparams.data_dir,
                                 "vocab%d.to" % hparams.to_vocab_size)
    #train_ae(hparams, train=True, interact=True)
    train_nmt(hparams, train=False, interact=True)
Code example #8
    def load_data(self, debug=False):
        """Loads train/valid/test data and sentence encoding"""

        en_train, fr_train, en_dev, fr_dev, en_vocab_path, fr_vocab_path = data_utils.prepare_data(
            'tmp', 40000, 40000)

        self.source_vocab_to_id, self.source_id_to_vocab = data_utils.initialize_vocabulary(
            en_vocab_path)
        self.target_vocab_to_id, self.target_id_to_vocab = data_utils.initialize_vocabulary(
            fr_vocab_path)

        source_path = './tmp/train.ids40000.questions'
        target_path = './tmp/train.ids40000.answers'

        if self.config.train_mode:
            source_path = './tmp/train.ids40000.questions'
            target_path = './tmp/train.ids40000.answers'
            sources, targets = data_utils.read_data(source_path, target_path)
        else:
            source_path = './tmp/test.ids40000.questions'
            target_path = './tmp/test.ids40000.answers'
            sources, targets = data_utils.read_data(source_path, target_path)

        self.train, self.valid, self.max_t_len, self.max_input_len, self.max_sen_len = data_utils.pad_length_bucket(
            sources, targets, self.config)

        source_vocab_path = './tmp/vocab40000.questions'
        target_vocab_path = './tmp/vocab40000.answers'
        self.source_vocab_size = data_utils.get_vocab_size(source_vocab_path)
        self.target_vocab_size = data_utils.get_vocab_size(target_vocab_path)

        self.word_embedding = np.random.uniform(
            -self.config.embedding_init, self.config.embedding_init,
            (self.source_vocab_size, self.config.embed_size))
Code example #9
File: DAMSM.py  Project: khakhulin/Text2Img
    def train_epoch(self,
                    epoch,
                    dataloader,
                    optimizer,
                    image_dir,
                    args,
                    device='cpu'):
        self.train()

        s_total_loss0 = 0
        s_total_loss1 = 0
        w_total_loss0 = 0
        w_total_loss1 = 0

        for data in tqdm(dataloader, total=len(dataloader)):

            imgs, caps, caps_len, masks, class_ids = \
                prepare_data(data, device, is_damsm=True)

            if self.is_bert:
                w_loss0, w_loss1, s_loss0, s_loss1 = \
                    self.forward(
                        imgs, caps, caps_len, args,
                        class_ids=class_ids, bert_mask=masks
                    )
            else:
                w_loss0, w_loss1, s_loss0, s_loss1 = \
                    self.forward(
                        imgs, caps, caps_len, args, class_ids=class_ids
                    )

            loss = s_loss0 + s_loss1 + w_loss0 + w_loss1
            w_total_loss0 += w_loss0.item()
            w_total_loss1 += w_loss1.item()
            s_total_loss0 += s_loss0.item()
            s_total_loss1 += s_loss1.item()

            self.text_encoder.zero_grad()
            self.image_encoder.zero_grad()

            loss.backward()
            # `clip_grad_norm` helps prevent
            # the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(self.text_encoder.parameters(),
                                           args.damsm_rnn_grad_clip)
            optimizer.step()

        s_total_loss0 /= len(dataloader)
        s_total_loss1 /= len(dataloader)
        w_total_loss0 /= len(dataloader)
        w_total_loss1 /= len(dataloader)
        sumloss = s_total_loss0 + s_total_loss1 + w_total_loss0 + w_total_loss1

        print('[TRAIN] Epoch {:3d} | '
              's_loss {:5.2f} {:5.2f} | '
              'w_loss {:5.2f} {:5.2f} | Sum {:5.2f}'.format(
                  epoch, s_total_loss0, s_total_loss1, w_total_loss0,
                  w_total_loss1, sumloss))

        return
Code example #10
def train():
    print("Loading data...")
    vocab_word, vocab_word_list, train_words, train_labels, test_words, test_labels \
        = data_utils.prepare_data(args.data_path)  # load the data
    max_text_len = max([len(words) for words in train_words])
    vocab_len = len(vocab_word_list)

    dev_sample_size = int(args.dev_sample * float(len(train_words)))
    x_train, x_dev = train_words[:-1 * dev_sample_size], train_words[
        -1 * dev_sample_size:]  # split into training and validation sets
    y_train, y_dev = train_labels[:-1 * dev_sample_size], train_labels[
        -1 * dev_sample_size:]
    print("Vocabulary Size: {:d}".format(len(vocab_word_list)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

    # build the model
    model = models.TextCNN(max_text_len, 2, vocab_len, args.embedding_size,
                           list(map(int, args.filter_sizes.split(","))),
                           args.num_filters, args.max_gradient_norm,
                           args.learning_rate, args.l2_reg_lambda)

    if not os.path.exists(args.model_path):
        os.mkdir(args.model_path)
    # train the model on the data
    model.fit(x_train, y_train, x_dev, y_dev, args.epoch_size, args.batch_size,
              args.checkpoint_step, args.model_path)
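The train/dev split above peels the last dev_sample fraction off the training arrays with negative indexing; a quick standalone illustration of that slicing:

train_words = list(range(10))
dev_sample_size = int(0.2 * float(len(train_words)))   # 2
x_train, x_dev = train_words[:-1 * dev_sample_size], train_words[-1 * dev_sample_size:]
print(x_train, x_dev)  # [0, 1, 2, 3, 4, 5, 6, 7] [8, 9]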
Code example #11
def get_data_params(train_df, val_df, raw_df, is_have_death=False, state="Texas", country='US'):
    train_params = prepare_data(train_df['Confirmed'].values,
                                train_df['Deaths'].values,
                                train_df['Recovered'].values,
                                population[country][state], is_have_death)

    val_params = prepare_data(val_df['Confirmed'].values,
                            val_df['Deaths'].values,
                            val_df['Recovered'].values,
                            population[country][state], is_have_death)

    raw_params = prepare_data(raw_df['Confirmed'].values,
                            raw_df['Deaths'].values,
                            raw_df['Recovered'].values,
                            population[country][state], is_have_death)
    return train_params, val_params, raw_params
Code example #12
def _get_score(classifier):
    score_sum = 0
    for _ in tqdm(range(_score_iter)):
        samples, labels = prepare_data()
        score_sum += cross_val_score(classifier, samples, labels,
                                     cv=4).mean() * 100
    return score_sum / _score_iter
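A hypothetical call site for _get_score, assuming any scikit-learn estimator (here a RandomForestClassifier) and the no-argument prepare_data() used above:

from sklearn.ensemble import RandomForestClassifier

# mean 4-fold cross-validation accuracy (in %) over _score_iter re-preparations of the data
print(_get_score(RandomForestClassifier(n_estimators=100)))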
Code example #13
def test(test_dataset, source_vocab, target_vocab, source_vocab_list,
         target_vocab_list):
    model = create_model(len(source_vocab), len(target_vocab),
                         source_vocab_list, target_vocab_list, 0.0,
                         args.max_source_len, args.max_target_len)
    test_set = data_utils.prepare_data(test_dataset, source_vocab,
                                       target_vocab)
    evaluate(model, test_set, source_vocab, target_vocab, source_vocab_list,
             target_vocab_list)
Code example #14
def _get_score_with_optimal_features(classifier):
    score_sum = 0
    samples, labels = prepare_data()
    num_of_features = len(samples[0])
    histogram = [_score_iter for i in range(num_of_features)]
    for _ in tqdm(range(_score_iter)):
        samples, labels = prepare_data()
        search_problem = FeatureSearchProblem(classifier=classifier,
                                              initial_state=(samples, labels,
                                                             []))
        samples, labels, path = hill_climbing_stochastic(
            search_problem, iterations_limit=_search_iter).state
        dropped_features = _restore_path(path, num_of_features)
        _update_histogram(histogram, dropped_features)
        score_sum += cross_val_score(classifier, samples, labels,
                                     cv=4).mean() * 100
    print histogram
    return score_sum / _score_iter
Code example #15
def train():
  """Run SpeakEasy/server/python_model/scripts/run.sh to train model"""
  # prepare movie subtitle data.
  print("Preparing data in %s" % FLAGS.data_dir)
  sys.stdout.flush()
  data_train, data_dev, _ = data_utils.prepare_data(FLAGS.data_dir, FLAGS.vocab_size)

  with tf.Session() as sess:
    # create model.
    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    sys.stdout.flush()
    model = create_model(sess, False)

    # set up event logging. NOTE: added this
    merged_summaries = tf.merge_all_summaries()
    # writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph_def)

    print ("Reading development and training data (limit: %d)."
           % FLAGS.max_train_data_size)
    sys.stdout.flush()
    dev_set = read_data(data_dev)
    train_set = read_data(data_train, FLAGS.max_train_data_size)

    # this is the training loop.
    step_time, loss = 0.0, 0.0
    current_step = 0
    previous_losses = []
    while True:

      # get a batch and make a step.
      start_time = time.time()
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          train_set)
      summaries, _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, False)
      step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
      loss += step_loss / FLAGS.steps_per_checkpoint
      current_step += 1

      # once in a while, we save checkpoint, print statistics, and run evals.
      if current_step % FLAGS.steps_per_checkpoint == 0:

        # print statistics for the previous epoch.
        perplexity = math.exp(loss) if loss < 300 else float('inf')
        print ("global step %d learning rate %.4f step-time %.2f perplexity "
               "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                         step_time, perplexity))
        sys.stdout.flush()
        # decrease learning rate if no improvement was seen over last 3 times.
        if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
          result = sess.run([model.learning_rate_decay_op])
        previous_losses.append(loss)
        # save checkpoint and zero timer and loss.
        checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
        step_time, loss = 0.0, 0.0
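The perplexity printed at each checkpoint is just the exponential of the mean per-step loss accumulated over the checkpoint window (capped to avoid overflow); a quick standalone check of that arithmetic, with example loss values:

import math

step_losses = [4.1, 3.9, 3.7, 3.5]                 # example per-step losses
avg_loss = sum(step_losses) / len(step_losses)     # 3.8
perplexity = math.exp(avg_loss) if avg_loss < 300 else float('inf')
print(perplexity)  # ~44.7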
Code example #16
def test(test_data, source_vocab, target_vocab, source_vocab_list,
         target_vocab_list, source_serialize, target_serialize):
    model = create_model(len(source_vocab), len(target_vocab),
                         source_vocab_list, target_vocab_list, 0.0,
                         args.max_source_len, args.max_target_len)
    test_set = data_utils.prepare_data(test_data, source_vocab, target_vocab,
                                       args.input_format, args.output_format,
                                       source_serialize, target_serialize)
    evaluate(model, test_set, source_vocab, target_vocab, source_vocab_list,
             target_vocab_list)
Code example #17
def plot_input_data(country, state):
    train_df, val_df, raw_df = load_data(country=country, state = state)
    raw_params = prepare_data(
        raw_df['Confirmed'].values,
        raw_df['Deaths'].values,
        raw_df['Recovered'].values,
        population[country][state],
        is_have_death=True
    )
    save_dir = os.path.join(cfg.data.save_path+'/input', f'{country}_{state}.jpg')
    plot_single_set(raw_params, x_axis=raw_df['Day'].values, save_dir=save_dir)
Code example #18
def hyperparam_search(FLAGS, override=False):
    t1 = time.time()
    ofile = open('/home/qv/wikiqa-data/out.train_ldc.txt', 'a')
    i = 0
    while True:
        FLAGS.initial_learning_rate = np.random.choice([0.05, 0.005])
        FLAGS.l2_reg_strength = np.random.choice([0.005, 0.0005])
        FLAGS.keep_prob = np.random.choice([1, 0.75, 0.5])
        FLAGS.embedding_type = np.random.choice(
            ['enwiki-skipgram', 'GoogleNews'])
        FLAGS.remove_stopwords_from_s = np.random.choice([True, False])
        FLAGS.num_filters = np.random.choice([10, 100, 500])
        FLAGS.train_dir = train_dir_name(FLAGS)
        FLAGS.data_pkl_file = data_utils.processed_data_file_name(FLAGS)

        if os.path.exists(FLAGS.train_dir) and not override:
            continue

        data_dict = data_utils.prepare_data(FLAGS)
        FLAGS.max_q_sents = data_dict['max_q_sents']
        FLAGS.max_q_len = data_dict['max_q_len']

        epochs = 100
        if FLAGS.initial_learning_rate <= 0.01:
            epochs = 200
        print "Learning rate: ", FLAGS.initial_learning_rate
        tf.reset_default_graph()
        best_results = train(data_dict, epochs)

        t = (FLAGS.initial_learning_rate, FLAGS.l2_reg_strength,
             FLAGS.keep_prob, FLAGS.embedding_type, FLAGS.embedding_size,
             FLAGS.remove_stopwords_from_s, FLAGS.num_filters)
        opt_names = [
            'initial_learning_rate', 'l2_reg_strength', 'keep_prob',
            'embedding_type', 'embedding_size', 'stopwords removed',
            'num_filters'
        ]
        ofile.write(FLAGS.train_dir + '\n')
        ofile.write(FLAGS.data_pkl_file + '\n')
        ofile.write('Parameters: \n')
        for name, opt in zip(opt_names, t):
            ofile.write('  {}: {}\n'.format(name, opt))
        ofile.write('Best Results: ')
        ofile.write("  reg-loss %.4f loss %.4f tps/fps %d/%d tops %d\n"
                    "  mrr %.2f map %.2f corr preds %d/%d\n" % best_results)
        ofile.write('\n')
        ofile.flush()

        i = i + 1
        if i == 100:
            break

    ofile.close()
    print time.time() - t1
Code example #19
def eval_test():
    tf.reset_default_graph()
    test_out = os.path.join(FLAGS.data_dir, 'test_errors.out')
    deleteFiles([test_out])
    stats = {'R2W': 0, 'W2R': 0, 'W2W_C': 0, 'W2W_NC': 0}
    # change the reuse parameter if you want to build the data again
    _, _, _, _, en_test, fr_test, _, _ = data_utils.prepare_data(
        FLAGS.data_dir, reuse=FLAGS.reuse)
    with tf.Session(config=config_all) as sess:
        model = create_model(sess, True)
        test_set = read_data(en_test, fr_test)
        test_bucket_sizes = [len(test_set[b]) for b in range(len(_buckets))]
        print('Bucket Sizes : {}'.format(test_bucket_sizes))
        total_loss, num_batches = 0, 0

        for bucket_id in range(len(_buckets)):
            all_batches = ([u for u in k if u is not None]
                           for k in itertools.izip_longest(*[
                               test_set[bucket_id][i::FLAGS.batch_size]
                               for i in range(FLAGS.batch_size)
                           ]))
            for batch in all_batches:
                encoder_inputs, decoder_inputs, target_weights = model.prepare_batch(
                    batch, bucket_id)
                # setting the model batch size in case it is smaller (would be for the
                # last batch in the bucket)
                model.batch_size = len(batch)
                _, eval_loss, logits = model.step(sess, encoder_inputs,
                                                  decoder_inputs,
                                                  target_weights, bucket_id,
                                                  True)
                outputs = np.argmax(logits, axis=2).transpose()
                outseq = [
                    out[:list(out).index(data_utils.EOS_ID)] for out in outputs
                    if data_utils.EOS_ID in out
                ]
                stat_updates = update_error_counts(batch, outseq)
                stats = {k: stats[k] + v for k, v in stat_updates.items()}
                total_loss += math.exp(eval_loss)
                num_batches += 1
                # resetting the model batch size
                model.batch_size = FLAGS.batch_size
        print("Loss over the test set : {}".format(total_loss / num_batches))
        print(stats)
        precision = stats['W2R'] / sum(
            [stats['W2R'], stats['R2W'], stats['W2W_C']])
        recall = stats['W2R'] / sum(
            [stats['W2R'], stats['W2W_NC'], stats['W2W_C']])
        f_m = (2 * precision * recall) / (precision + recall)
        print('P: {}\nR: {}\nF: {}'.format(precision, recall, f_m))
Code example #20
File: training.py  Project: jianshaow/deep-learning
def re_run(model_params=MODEL_PARAMS,
           data=utils.prepare_data(),
           learning_rate=LEARNING_RATE,
           epochs=RERUN_EPOCHS):
    train_data, test_data = data

    model = cc_model.Model(model_params)
    model.load()
    model.compile(learning_rate, sparse=SPARSE)
    model.train(train_data, epochs=epochs, test_data=test_data)
    model.verify(utils.gen_sample_data(size=100))
    model.save(ask=True)

    return model
Code example #21
    def load_data(self, debug=False):
        """Loads train/valid/test data and sentence encoding"""
        '''
        en_train, fr_train, en_dev, fr_dev, _, _ = data_utils.prepare_data(
        FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size)
        '''

        en_train, fr_train, en_dev, fr_dev, en_vocab_path, fr_vocab_path = data_utils.prepare_data(
            'tmp', 40000, 40000)

        self.source_vocab_to_id, self.source_id_to_vocab = data_utils.initialize_vocabulary(
            en_vocab_path)
        self.target_vocab_to_id, self.target_id_to_vocab = data_utils.initialize_vocabulary(
            fr_vocab_path)
        #print self.source_vocab_to_id
        #print self.source_id_to_vocab
        '''
        print self.target_vocab_to_id
        print self.target_id_to_vocab
        '''
        '''
        for i in range(0, 10):
            print i
            print self.target_id_to_vocab[int(float(i))]
        #adsfas
        '''

        source_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/train.ids40000.questions'
        target_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/train.ids40000.answers'

        if self.config.train_mode:
            source_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/train.ids40000.questions'
            target_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/train.ids40000.answers'
            sources, targets = data_utils.read_data(source_path, target_path)
        else:
            source_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/test.ids40000.questions'
            target_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/test.ids40000.answers'
            sources, targets = data_utils.read_data(source_path, target_path)

        self.train, self.valid, self.max_t_len, self.max_input_len, self.max_sen_len = data_utils.pad_length_bucket(
            sources, targets, self.config)

        source_vocab_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/vocab40000.questions'
        target_vocab_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/vocab40000.answers'
        self.source_vocab_size = data_utils.get_vocab_size(source_vocab_path)
        self.target_vocab_size = data_utils.get_vocab_size(target_vocab_path)

        self.word_embedding = np.random.uniform(
            -self.config.embedding_init, self.config.embedding_init,
            (self.source_vocab_size, self.config.embed_size))
Code example #22
def main():

	config = Config()
	data = prepare_data(config , debug=config.debug)
	data.data_loader = {phase: torch.utils.data.DataLoader(data.data[phase], shuffle=False, 
									batch_size=config.batch_size, num_workers=2) for phase in ['train' , 'val']}


	encoder_model = encoder_RNN(config.embedding_size, data.vocab_size[config.source], config.hidden_size , 
		n_layers=config.n_layers_encoding , bidirectional=config.bidirectional, dropout=config.dropout).to(config.device)

	decoder_model = decoderAttn(config.attn_model , config.embedding_size , data.vocab_size[config.target], config.hidden_size, 
	                             n_layers=config.n_layers_decoding, dropout=config.dropout).to(config.device)

	loss_criterion = nn.CrossEntropyLoss(reduce=False)
	encoder_optimizer = torch.optim.Adam(encoder_model.parameters() , lr=config.lr)
	decoder_optimizer = torch.optim.Adam(decoder_model.parameters() , lr=config.lr)

	start = time.time()
	encoder_model , decoder_model , loss_curve = train(data, config, encoder_model, decoder_model, 
											encoder_optimizer, decoder_optimizer,loss_criterion,n_epochs=config.n_epochs)

	print('Training Completed. Took {} seconds'.format(time.time()-start))


	## Evaluate
	print("######## VALIDATION #########")
	for i in np.random.randint(0, len(data.data['val']), 5):
	    inp_seq = data.data['val'][i][0] + config.end_tok
	    print(data.data['val'][i][0])
	    print(data.data['val'][i][1])
	    gen_sen = generate_translation(inp_seq ,config, encoder_model , decoder_model, data.vocab)
	    print(gen_sen)
	    print(BLEU_score(config , data.data['val'][i][1], gen_sen))
	    print()

	print("######## TRAIN #########")
	for i in np.random.randint(0, len(data.data['train']), 5):
	    inp_seq = data.data['train'][i][0] + config.end_tok
	    print(data.data['train'][i][0])
	    print(data.data['train'][i][1])
	    gen_sen = generate_translation(inp_seq ,config, encoder_model , decoder_model, data.vocab)
	    print(gen_sen)
	    print(BLEU_score(config , data.data['train'][i][1], gen_sen))
	    print()


	return config , data
Code example #23
File: training.py  Project: jianshaow/deep-learning
def first_run(model_params=MODEL_PARAMS,
              data=utils.prepare_data(),
              learning_rate=LEARNING_RATE,
              dry_run=False):
    train_data, test_data = data

    model = cc_model.Model(model_params)
    model.build()
    model.compile(learning_rate, sparse=SPARSE)
    model.train(train_data, test_data=test_data)
    model.verify(utils.gen_sample_data(size=100))

    if not dry_run:
        model.save()

    return model
Code example #24
File: DAMSM.py  Project: khakhulin/Text2Img
    def evaluate(self, epoch, loader, image_dir, args, device='cpu'):
        self.eval()

        s_total_loss0 = 0
        s_total_loss1 = 0
        w_total_loss0 = 0
        w_total_loss1 = 0

        with torch.no_grad():
            for data in loader:

                imgs, caps, caps_len, masks, class_ids = \
                    prepare_data(data, device, is_damsm=True)

                if self.is_bert:
                    w_loss0, w_loss1, s_loss0, s_loss1 = \
                        self.forward(
                            imgs, caps, caps_len, args,
                            class_ids=class_ids, bert_mask=masks
                        )
                else:
                    w_loss0, w_loss1, s_loss0, s_loss1 = \
                        self.forward(
                            imgs, caps, caps_len, args,
                            class_ids=class_ids
                        )
                # loss = w_loss0 + w_loss1 + s_loss0 + s_loss1

                w_total_loss0 += w_loss0.item()
                w_total_loss1 += w_loss1.item()
                s_total_loss0 += s_loss0.item()
                s_total_loss1 += s_loss1.item()

        s_cur_loss0 = s_total_loss0 / len(loader)
        s_cur_loss1 = s_total_loss1 / len(loader)
        w_cur_loss0 = w_total_loss0 / len(loader)
        w_cur_loss1 = w_total_loss1 / len(loader)

        sum_loss = s_cur_loss0 + s_cur_loss1 + w_cur_loss0 + w_cur_loss1

        print('[VALID] Epoch {:3d} | s_loss {:5.2f} {:5.2f} | '
              'w_loss {:5.2f} {:5.2f} | Sum {:5.2f}'.format(
                  epoch, s_cur_loss0, s_cur_loss1, w_cur_loss0, w_cur_loss1,
                  sum_loss))

        return sum_loss
Code example #25
    def get_scores(self):
        cur_time = datetime.datetime.now().strftime('%d:%m:%Y:%H-%M-%S')
        run_name = os.path.join('gen_exp', cur_time)
        save_dir = os.path.join('generated_images', run_name)

        self.model.generator.eval()

        gen_iter = 0

        for data in tqdm.tqdm(self.data_loader, total=len(self.data_loader)):
            images, captions, cap_lens, masks, class_ids = prepare_data(
                data, self.device)
            noise = torch.FloatTensor(captions.size(0), self.model.z_dim).to(
                self.device).normal_(0, 1)

            gen_iter += 1
            gen_images, _, _, _, _ = self.model(captions, cap_lens, noise,
                                                masks)
            filenames = [
                str(gen_iter) + str(i) for i in range(gen_images[-1].size(0))
            ]
            img_tensor = save_images(gen_images[-1], filenames, save_dir, '',
                                     gen_images[-1].size(3))

        # inception score calculation
        gen_save_folder = os.path.join(save_dir, 'images', 'iter',
                                       str(gen_images[-1].size(3)))
        gen_img_iterator = GenImgData(gen_save_folder)
        mean_val, std_val = inception_score(gen_img_iterator,
                                            cuda=False,
                                            batch_size=32,
                                            resize=False,
                                            splits=4)
        print('Inception Score, mean: {0:.3f}, std: {1:.3f}'.format(
            mean_val, std_val))

        #fid calculation
        paths_to_fid = []
        paths_to_fid.append(gen_save_folder)
        paths_to_fid.append(self.test_imgs_paths)
        fid_val = calculate_fid_given_paths(paths_to_fid,
                                            batch_size=1,
                                            cuda=False,
                                            dims=2048)
        print("FID value: ", fid_val)
Code example #26
def initialize_params(experiment_dir_name,
                        experiment_dir_suffix,
                        stage_of_development,
                        params_initialization_for_training=None,
                        params_initialization_for_resume_training=None,
                        params_initialization_for_evaluation=None,
                        training_with_dev=False, 
                        use_ranges=False,):
    params = {}
    params['experiment_dir'] = None
    params['resume_training'] = False
    params['evaluate_model'] = False
    params['num_epochs'] = None
    params['max_steps'] = None
    params['num_steps_before_checkpoint'] = 1
    params['data_dir'] = None
    params['logs_dir'] = None
    params['summary_dir'] = None
    params['results_dir'] = None
    params['training_path'] = None
    params['dev_path'] = None
    params['testing_path'] = None
    params['forward_only'] = False
    params['use_preprocessing'] = False
    params['model_path'] = None
    params['dict_of_filePath_to_num_of_examples_in_tfrecord'] = None
    params['type_of_optimizer'] = 'adam'
    params['total_num_of_training_examples'] = None

    params, filenames_of_images, labels_of_images, = initialize_params_helper(params,
                                                                                stage_of_development,
                                                                                experiment_dir_name,
                                                                                experiment_dir_suffix,
                                                                                params_initialization_for_training=params_initialization_for_training,
                                                                                params_initialization_for_resume_training=params_initialization_for_resume_training,
                                                                                params_initialization_for_evaluation=params_initialization_for_evaluation)
    filenames_of_images_dev = None
    pmValues_dev = None
    if stage_of_development == "training" or stage_of_development == "resume_training":
        if training_with_dev:
            filenames_of_images_dev, _, _, _, _, _, _, _, pmValues_dev = data_utils.prepare_data(params['dev_path'])

    return params, filenames_of_images, labels_of_images,  filenames_of_images_dev, pmValues_dev
Code example #27
File: test.py  Project: safooray/RareEventAugment
def main(argv):
    X, y = prepare_data(data_path=FLAGS.data_path,
                        sheet_name=FLAGS.sheet_name,
                        label_name=FLAGS.label_name)

    n_features = X.shape[1]
    X, y = gradient_augment(X, y)
    X, y = temporalize(X, y, LOOKBACK)
    _, X_test, _, y_test = train_test_split(np.array(X),
                                            np.array(y),
                                            test_size=DATA_SPLIT_PCT,
                                            random_state=0)
    X_test = np.array(X_test)
    X_test = X_test.reshape(X_test.shape[0], LOOKBACK, n_features)

    ## Loads scaler fit on training data.
    with open(FLAGS.data_scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    X_test_scaled = scale(X_test, scaler)

    model = keras.models.load_model(FLAGS.final_model_path)

    get_layer_output = tf.keras.backend.function([model.layers[0].input],
                                                 [model.layers[1].output])
    layer_output = get_layer_output([X_test_scaled])[0]
    print(layer_output.shape, layer_output)

    classifier = LogisticRegression(class_weight='balanced',
                                    max_iter=200,
                                    penalty='l1',
                                    solver='saga',
                                    C=0.01,
                                    verbose=1)

    classifier.fit(layer_output[:, -384:], y_test)
    y_hat_test = classifier.predict(layer_output[:, -384:])
    print("Precision        Recall       F_score      Support")
    test_res = precision_recall_fscore_support(y_test,
                                               y_hat_test,
                                               average='binary')
    print(test_res)
Code example #28
def test_basic_SIR(raw_df, attribute2fix: str, state='Texas', country='US'):
    '''
        attribute2fix: ['I', 'R']
    '''
    print(f"--- Basic SIR ---")
    raw_params = prepare_data(
        raw_df['Confirmed'].values,
        raw_df['Deaths'].values,
        raw_df['Recovered'].values,
        population[country][state]
    )

    filename = f"{country}/{state}_basicSIR_{cfg.model.curvefit_sigma_rate}"
    save_dir = os.path.join(cfg.data.save_path, filename) 

    basic_sir = BasicSIR(cfg, raw_params)
    res = basic_sir.fit_single_attribute(attribute=attribute2fix, visualize=False)
    print("finish curve fitting")
    val_params = {'I': basic_sir.I, 'R': basic_sir.R}
    x_axis=raw_df['Day'].values

    visualize_basic_result(val_params, res, x_axis, save_dir, attribute2fix)
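BasicSIR above fits SIR-style curves to the prepared Confirmed/Deaths/Recovered series. As a generic point of reference (not this project's implementation), the underlying SIR system can be integrated with SciPy like this:

import numpy as np
from scipy.integrate import odeint

def sir_ode(y, t, beta, gamma, N):
    """Classic SIR dynamics: dS = -beta*S*I/N, dI = beta*S*I/N - gamma*I, dR = gamma*I."""
    S, I, R = y
    dS = -beta * S * I / N
    dI = beta * S * I / N - gamma * I
    dR = gamma * I
    return dS, dI, dR

# toy run with assumed parameters beta=0.3, gamma=0.1 and population N
N, I0, R0 = 1_000_000, 100, 0
t = np.arange(0, 120)
S, I, R = odeint(sir_ode, (N - I0 - R0, I0, R0), t, args=(0.3, 0.1, N)).T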
Code example #29
def test():
    print("Loading data...")
    vocab_word, vocab_word_list, train_words, train_labels, test_words, test_labels \
        = data_utils.prepare_data(args.data_path)
    max_text_len = max([len(words) for words in train_words])
    vocab_len = len(vocab_word_list)
    # build the model
    model = models.TextCNN(max_text_len, 2, vocab_len, args.embedding_size,
                           list(map(int, args.filter_sizes.split(","))),
                           args.num_filters, args.max_gradient_norm,
                           args.learning_rate, args.l2_reg_lambda)
    # test the model on the given data
    text = '''说实话没吃成, 但是对这家太不满意了, 所有都给差评!到那之后满屋子都是座, 服务员非得给安排在一个犄角旮旯, 黑咕隆咚的冷气还吹不到.
    点了餐喊半天服务员都不来拿单子, 而且是眼看着服务员从身边经过, 喊着服务员服务员, 她们就只当没听见.
    然后我自己换了个显眼的位置, 举着手喊服务员, 她们还是无视的从我旁边走过.我k, 你又不服务, 没事走来走去干什么?所以后来干脆不吃了.
    请问, 您家是要做生意么?'''
    words_ids = data_utils.sentence_to_token_ids(text, vocab_word)
    predicts = model.predict(words_ids, args.model_path)
    print text
    print "预测结果为:", predicts
    if predicts == [0]:
        print "正面评论"
    else:
        print "负面评论"

    text = '''瘦了点,可能和季节有关吧吃完加点青菜做泡饭满嗲的~孔雀开屏 45.00很大一条鱼,摆盘很漂亮,肉质挺嫩,如果加点醋更好,
    去腥更美味~~香菇菜心这个 我喜欢的呀~上面酱很嗲~ 香菇很入味,菜心很爽口~ 解油腻 总体来说这里感觉很实惠,虽然价格不贵,但是品质却不错,
    摆盘很用心很漂亮。酒香不怕巷子深 用在这里真是非常合适~雨天滴滴答答,不是很舒服,但却并没影响到FB的心情~~~店开在 比较老式的弄堂里,
    周围都是居民区,门面并不大,不过据说这里生意很好。性价比高么做的是绍兴菜,装修比较朴素,菜单也是很简单的A4纸
    塑封一下总体来说这里感觉很实惠,虽然价格不贵,但是品质却不错,摆盘很用心很漂亮。酒香不怕巷子深 用在这里真是非常合适~'''
    words_ids = data_utils.sentence_to_token_ids(text, vocab_word)
    predicts = model.predict(words_ids, args.model_path)
    print text
    print "预测结果为:", predicts
    if predicts == [0]:
        print "正面评论"
    else:
        print "负面评论"
Code example #30
def prepare_data():
    from_train = None
    to_train = None
    from_dev = None
    to_dev = None
    if FLAGS.from_train_data and FLAGS.to_train_data:
        from_train_data = FLAGS.from_train_data
        to_train_data = FLAGS.to_train_data
        from_dev_data = from_train_data
        to_dev_data = to_train_data
        if FLAGS.from_dev_data and FLAGS.to_dev_data:
            from_dev_data = FLAGS.from_dev_data
            to_dev_data = FLAGS.to_dev_data
        from_train, to_train, from_dev, to_dev, _, _ = data_utils.prepare_data(
            FLAGS.data_dir, from_train_data, to_train_data, from_dev_data,
            to_dev_data, FLAGS.from_vocab_size, FLAGS.to_vocab_size,
            data_utils.char_tokenizer)
    else:
        # Prepare WMT data.
        print("Preparing WMT data in %s" % FLAGS.data_dir)
        from_train, to_train, from_dev, to_dev, _, _ = data_utils.prepare_wmt_data(
            FLAGS.data_dir, FLAGS.from_vocab_size, FLAGS.to_vocab_size)
    return from_train, to_train, from_dev, to_dev
Code example #31
def test_data(country, state):
    if state in SEIR_STATE:
        return
    train_df, val_df, raw_df = load_data(country=country, state = state)

    raw_params = prepare_data(
        raw_df['Confirmed'].values,
        raw_df['Deaths'].values,
        raw_df['Recovered'].values,
        population[country][state]
    )
    print()
    print(f'test on {state}')

    # TEST TIME-DEPENDENT MODELS
    # test_time_SIR(train_df, val_df, raw_df, state, country)

    # test_time_SIRD(train_df, val_df, raw_df, state, country)

    # TEST BASIC MODELS
    for att in ['I', 'R']:
        test_basic_SIR(raw_df, att, state, country)
        test_basic_SIRD(raw_df, att, state, country)
Code example #32
def process_data():
    print("Preparing data in %s" % FLAGS.data_dir)

    data_utils.prepare_data(FLAGS)
Code example #33
def train():
    print "Preparing data in %s" % settings.data_dir
    sr_train_ids_path, tg_train_ids_path,sr_dev_ids_path, tg_dev_ids_path,sr_vocab_path, tg_vocab_path = data_utils.prepare_data(settings.data_dir)
    print "Reading training data from %s" % settings.data_dir
    train_set = data_utils.read_data(sr_train_ids_path,tg_train_ids_path,settings.max_train_num)
    train_batches,train_bucket_ids = data_utils.batchize(train_set)
    print "Reading development data from %s" % settings.data_dir
    dev_set = data_utils.read_data(sr_dev_ids_path,tg_dev_ids_path)
    dev_batches,dev_bucket_ids = data_utils.batchize(dev_set,False)

    log_file = open(settings.train_dir+'log.txt','w')
    log_file.write('epoch\tstep\ttime\ttrain-ppx\tdev-ppx\n')
    log_file.flush()

    with tf.Session() as sess:
        print("Creating %d layers of %d units." %
              (settings.num_layers, settings.size))
        model = create_model(sess, False)
        current_epoch,current_step,train_loss = 0,0,0.0
        start_time = time.time()
        while True:
            current_epoch+=1
            for batch_id in xrange(len(train_batches)):
                current_step+=1
                step_start_time = time.time()
                encoder_inputs, decoder_inputs, target_weights = model.preprocess_batch(train_batches[batch_id], train_bucket_ids[batch_id])
                _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, train_bucket_ids[batch_id], False)
                print "global-step %d\tstep-time %.2f\tstep-loss %.2f" % (model.global_step.eval(),time.time()-step_start_time,step_loss)
                train_loss+=step_loss/settings.steps_per_checkpoint
                if current_step % settings.steps_per_checkpoint == 0:
                    # evaluate in training set
                    train_ppx = math.exp(train_loss)/model.batch_size if train_loss < 300 else float('inf')
                    # evaluate in development set
                    dev_loss=0.0
                    for dev_batch_id in xrange(len(dev_batches)):
                        encoder_inputs, decoder_inputs, target_weights = model.preprocess_batch(dev_batches[dev_batch_id], dev_bucket_ids[dev_batch_id])
                        _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                                 target_weights, dev_bucket_ids[dev_batch_id], True)
                        dev_loss+=step_loss/len(dev_batches)
                    dev_ppx = math.exp(dev_loss)/model.batch_size if dev_loss < 300 else float('inf')
                    log_file.write("%d\t%d\t%.2f\t%.2f\t%.2f\n" % (current_epoch,model.global_step.eval(),time.time()-start_time,train_ppx,dev_ppx))
                    log_file.flush()
                    sys.stdout.flush()
                    train_loss,dev_loss = 0.0,0.0
                    checkpoint_path = os.path.join(settings.train_dir, "summary.ckpt")
                    model.saver.save(sess, checkpoint_path,global_step=model.global_step)
            train_batches,train_bucket_ids = data_utils.batchize(train_set)
Code example #34
def train():
  """Train a translation model using NMT data."""
  source = sys.argv[1]
  target = sys.argv[2]
  # Prepare NMT data.
  print("Preparing NMT data in %s" % FLAGS.data_dir)
  print("    source langauge: %s" % source)
  print("    target language: %s" % target)
  # Generates the preprocessed train and test files and gives their paths. These have tokenized ids.
  s_train, t_train, s_dev, t_dev, _, _ = data_utils.prepare_data(FLAGS.data_dir, FLAGS.s_vocab_size, FLAGS.t_vocab_size, source, target)
  print("Tokenized Inputs: ", s_train, t_train, s_dev, t_dev)
  with tf.Session() as sess:
    # Create model.
    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, True, False)

    # Read data into buckets and compute their sizes.
    print("Reading development and training data (limit: %d)."
          % FLAGS.max_train_data_size)
    dev_set = read_data(s_dev, t_dev)
    train_set = read_data(s_train, t_train, FLAGS.max_train_data_size)
    train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))

    # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
    # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
    # the size of the i-th training bucket, as used later.
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in xrange(len(train_bucket_sizes))]

    # This is the training loop.
    step_time, loss = 0.0, 0.0
    current_step = 0
    previous_losses = []
    perplexity = 1e10
    train_steps, train_ppx, bucket_ppx = [], [], {0:[], 1:[], 2:[], 3:[]}
    
    # Put a limit on the number of iterations it takes to train (instead of the perplexity)
    while current_step <= 12000:
      # Choose a bucket according to data distribution. We pick a random number
      # in [0, 1] and use the corresponding interval in train_buckets_scale.
      random_number_01 = np.random.random_sample()
      bucket_id = min([i for i in xrange(len(train_buckets_scale))
                       if train_buckets_scale[i] > random_number_01])

      # Get a batch and make a step.
      start_time = time.time()
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          train_set, bucket_id)
      _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, bucket_id, False)
      step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
      loss += step_loss / FLAGS.steps_per_checkpoint
      current_step += 1

      # Once in a while, we save checkpoint, print statistics, and run evals.
      if current_step % FLAGS.steps_per_checkpoint == 0:
        train_steps.append(current_step)
        # Print statistics for the previous epoch.
        perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
        train_ppx.append(perplexity)
        print("global step %d learning rate %.4f step-time %.2f perplexity "
              "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                        step_time, perplexity))
        # Decrease learning rate if no improvement was seen over last 3 times.
        if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
          sess.run(model.learning_rate_decay_op)
        previous_losses.append(loss)
        # Save checkpoint and zero timer and loss.
        checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
        step_time, loss, eval_loss_tot = 0.0, 0.0, 0.0
        # Run evals on development set and print their perplexity.
        for bucket_id in xrange(len(_buckets)):
          if len(dev_set[bucket_id]) == 0:
            print("  eval: empty bucket %d" % (bucket_id))
            continue
          encoder_inputs, decoder_inputs, target_weights = model.get_batch(
              dev_set, bucket_id)
          _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
          eval_ppx = math.exp(float(eval_loss)) if eval_loss < 300 else float("inf")
          bucket_ppx[bucket_id].append(eval_ppx)
          print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
          eval_loss_tot += eval_loss
        eval_loss_avg = eval_loss_tot/len(_buckets)
        eval_ppx = math.exp(float(eval_loss_avg)) if eval_loss_avg < 300 else float("inf")
        print("  eval: mean perplexity %.2f" % eval_ppx)
        sys.stdout.flush()
    print(train_steps)
    print(train_ppx)
    print(bucket_ppx)
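The bucket choice inside the loop above samples a bucket with probability proportional to its size via the cumulative train_buckets_scale list; a minimal standalone version of that sampling step:

import numpy as np

train_bucket_sizes = [500, 300, 150, 50]            # example bucket sizes
train_total_size = float(sum(train_bucket_sizes))
train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                       for i in range(len(train_bucket_sizes))]   # [0.5, 0.8, 0.95, 1.0]

random_number_01 = np.random.random_sample()
bucket_id = min(i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01)
print(bucket_id)  # 0 with prob 0.5, 1 with prob 0.3, 2 with prob 0.15, 3 with prob 0.05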
Code example #35
def train():
  """Train a nl -> machine-code translation model."""
  # Prepare training & dev data.
  print("Preparing data in %s" % FLAGS.data_dir)
  srce_train, trgt_train, trgt_train_pos, trgt_train_map, srce_dev, trgt_dev, trgt_dev_pos, trgt_dev_map, _, _, srce_vocab_size, trgt_vocab_size = data_utils.prepare_data(
      FLAGS.data_dir, FLAGS.srce_vocab_min, FLAGS.trgt_vocab_min)

  with tf.Session() as sess:
    # Create model.
    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, srce_vocab_size, trgt_vocab_size, False)

    # Read data into buckets and compute their sizes.
    print ("Reading development and training data (limit: %d)."
           % FLAGS.max_train_data_size)
    dev_set = read_data(srce_dev, trgt_dev, trgt_dev_pos, trgt_dev_map)
    train_set = read_data(srce_train, trgt_train, trgt_train_pos, trgt_train_map, max_size=FLAGS.max_train_data_size)
    train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))
    print("training set bucket: ", train_bucket_sizes)

    # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
    # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
    # the size of the i-th training bucket, as used later.
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in xrange(len(train_bucket_sizes))]
    # size of dev set
    dev_bucket_sizes = [len(dev_set[b]) for b in xrange(len(_buckets))]
    dev_size = float(sum(dev_bucket_sizes)) 
    dev_bucket_proportion = [b/dev_size for b in dev_bucket_sizes]# proportion
    print("dev set bucket: ", dev_bucket_sizes)

    # This is the training loop.
    step_time, loss = 0.0, 0.0
    current_step = 0
    previous_losses = []
    dev_losses = [] 
    steps_per_checkpoint = int(train_total_size / FLAGS.batch_size)
    print ("steps per checkpoint: ", steps_per_checkpoint)

    while current_step < (FLAGS.epoch * steps_per_checkpoint):
      # Choose a bucket according to data distribution. We pick a random number
      # in [0, 1] and use the corresponding interval in train_buckets_scale.
      random_number_01 = np.random.random_sample()
      bucket_id = min([i for i in xrange(len(train_buckets_scale))
                       if train_buckets_scale[i] > random_number_01])

      # check for empty bucket
      if len(train_set[bucket_id]) == 0:
        continue

      # Get a batch and make a step.
      start_time = time.time()
      encoder_inputs, decoder_inputs, target_weights, pos, maps = model.get_batch(
          train_set, bucket_id)

      # step
      _, step_loss, _, _, _, _= model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, bucket_id, False, 
                                   decoder_inputs_positions=pos, decoder_inputs_maps=maps)
      step_time += (time.time() - start_time) / steps_per_checkpoint
      loss += step_loss / steps_per_checkpoint
      current_step += 1


      # Once in a while, we save checkpoint, print statistics, and run evals.
      if current_step % steps_per_checkpoint == 0:
        # Print statistics for the previous epoch.
        perplexity = math.exp(loss) if loss < 300 else float('inf')
        print ("global step %d learning rate %.4f step-time %.2f perplexity "
               "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                         step_time, perplexity))

        # Decrease learning rate if no improvement was seen over last 3 times.
        # if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
        if current_step / steps_per_checkpoint > 5:
          sess.run(model.learning_rate_decay_op)
          print ("learning rate update to %.4f" % model.learning_rate.eval())
          if model.learning_rate == float(0):
            break
        previous_losses.append(loss)

        # Run evals on development set, print their perplexity and perform early stopping.
        eval_loss_per_bucket = [] # eval_loss for the whole dev set
        for bucket_id in xrange(len(_buckets)):
          if len(dev_set[bucket_id])==0:
            # print ("Bucket %s is empty." % bucket_id)
            eval_loss_per_bucket.append(float(0))
            continue
          
          encoder_inputs, decoder_inputs, target_weights, pos, maps = model.get_batch(
              dev_set, bucket_id)
          
          _, eval_loss, _, _, _, _= model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True, decoder_inputs_positions=pos, decoder_inputs_maps=maps)
          
          eval_loss_per_bucket.append(float(eval_loss)) 
          print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_loss))
        
        dev_loss = np.dot(np.asarray(eval_loss_per_bucket), np.asarray(dev_bucket_proportion))
        dev_losses.append(dev_loss)
        print(" eval: dev set weighted perplexity %.2f"% dev_loss)
        
        if dev_loss <= min(dev_losses):
          # Save checkpoint and zero timer and loss.
          checkpoint_path = os.path.join(FLAGS.data_dir, "checkpoint/ckpt")
          model.saver.save(sess, checkpoint_path, global_step=model.global_step)
        
        step_time, loss = 0.0, 0.0
        
        sys.stdout.flush()
Code example #36
File: speak_easy.py  Project: Vunb/SpeakEasy
def train():
  """Run SpeakEasy/server/python_model/scripts/run.sh to train model"""
  
  slack.connection.notify(
    text='Training SpeakEasy!',
  )
  # Prepare movie subtitle data.
  print("Preparing data in %s" % FLAGS.data_dir)
  sys.stdout.flush()
  data_train, data_dev, _ = data_utils.prepare_data(FLAGS.data_dir, FLAGS.vocab_size)

  with tf.Session() as sess:
    # Create model.
    print("Creating %s model with %d layers of %d units." % (FLAGS.model_type, FLAGS.num_layers, FLAGS.size))
    sys.stdout.flush()
    if FLAGS.buckets: print("Using bucketed model.")
    sys.stdout.flush()
    model = create_model(sess, False)

    # Set up event logging. NOTE: added this, this is not finished
    merged_summaries = tf.merge_all_summaries()
    # writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph_def)

    # Read data into buckets and compute their sizes.
    print ("Reading development and training data (limit: %d)."
           % FLAGS.max_train_data_size)
    sys.stdout.flush()
    dev_set = read_data(data_dev)
    train_set = read_data(data_train, FLAGS.max_train_data_size)

    if FLAGS.buckets:
      train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
      train_total_size = float(sum(train_bucket_sizes))
      # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
      # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
      # the size of the i-th training bucket, as used later.
      train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in xrange(len(train_bucket_sizes))]

    # This is the training loop.
    step_time, loss = 0.0, 0.0
    current_step = 0
    previous_losses = []
    while True:
      # Get a batch and make a step.
      start_time = time.time()

      if FLAGS.buckets:
        # Choose a bucket according to data distribution. We pick a random number
        # in [0, 1] and use the corresponding interval in train_buckets_scale.
        random_number_01 = np.random.random_sample()
        bucket_id = min([i for i in xrange(len(train_buckets_scale))
                        if train_buckets_scale[i] > random_number_01])
        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
            train_set, bucket_id=bucket_id)
        summaries, _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, False, bucket_id=bucket_id)
      else:
        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          train_set, bucket_id=None)
        summaries, _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, False, bucket_id=None)
      step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
      loss += step_loss / FLAGS.steps_per_checkpoint
      current_step += 1

      # Once in a while, we save checkpoint, print statistics, and run evals.
      if current_step % FLAGS.steps_per_checkpoint == 0:

        # Save summaries. NOTE: added this
        # result = sess.run(merged_summaries)
        # summary_str = result[0]
        # writer.add_summary(summary_str, current_step)

        # Print statistics for the previous epoch.
        perplexity = math.exp(loss) if loss < 300 else float('inf')
        log_line = ("global step %d learning rate %.4f step-time %.2f perplexity "
               "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                         step_time, perplexity))
        print(log_line)
        slack.connection.notify(
          text=log_line,
        )

        sys.stdout.flush()
        # Decrease learning rate if no improvement was seen over last 3 times.
        if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
          result = sess.run([model.learning_rate_decay_op])
        previous_losses.append(loss)
        # Save checkpoint and zero timer and loss.
        checkpoint_path = os.path.join(FLAGS.train_dir, "speakEasy" + str(FLAGS.vocab_size) + ".ckpt")
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
        step_time, loss = 0.0, 0.0

        if FLAGS.buckets:
          # Run evals on development set and print their perplexity.
          for bucket_id in xrange(len(_buckets)-1):
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                dev_set, bucket_id)
            _, _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, True, bucket_id=bucket_id)
            eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
            log_line = "eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx)
            print("  %s" % log_line)
            slack.connection.notify(
              text=log_line,
            )

          sys.stdout.flush()
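
The checkpoint block above lowers the learning rate only when the latest loss is worse than all of the last three recorded checkpoint losses. Here is a minimal sketch of that plateau test in plain Python, assuming a simple multiplicative factor stands in for model.learning_rate_decay_op; both the helper name and the 0.99 factor are illustrative.

def maybe_decay_learning_rate(previous_losses, current_loss, learning_rate, decay_factor=0.99):
    """Decay the learning rate when no improvement was seen over the last 3 checkpoints.

    Mirrors the plateau test used in the loop above: decay only if the current
    checkpoint loss is worse than every one of the last three recorded losses.
    """
    if len(previous_losses) > 2 and current_loss > max(previous_losses[-3:]):
        learning_rate *= decay_factor
    previous_losses.append(current_loss)
    return learning_rate

# Example: a loss that stops improving eventually triggers a decay.
lr = 0.5
history = []
for loss in [4.0, 3.5, 3.2, 3.3, 3.4, 3.6]:
    lr = maybe_decay_learning_rate(history, loss, lr)
print(lr)  # smaller than 0.5 once the plateau is detected
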
Code example #37
0
File: translate.py Project: galaxyh/ann-mt
def train():
    """Train a en->fr translation model using WMT data."""
    # Prepare parallel corpus data.
    print("Preparing parallel corpus data in %s" % FLAGS.data_dir)
    source_train, target_train, source_dev, target_dev, _, _ = data_utils.prepare_data(
        FLAGS.data_dir, FLAGS.source_vocab_size, FLAGS.target_vocab_size,
        FLAGS.train_name, FLAGS.dev_name, FLAGS.source_ext, FLAGS.target_ext,
        tokenizer=data_utils.whitespace_tokenizer)

    with tf.Session() as sess:
        # Create model.
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, False)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)."
              % FLAGS.max_train_data_size)
        dev_set = read_data(source_dev, target_dev)
        train_set = read_data(source_train, target_train, FLAGS.max_train_data_size)
        train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
        # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
        # the size of the i-th training bucket, as used later.
        train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                               for i in xrange(len(train_bucket_sizes))]

        # This is the training loop.
        print("Start training...")
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        while True:
            # Choose a bucket according to data distribution. We pick a random number
            # in [0, 1] and use the corresponding interval in train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([i for i in xrange(len(train_buckets_scale))
                             if train_buckets_scale[i] > random_number_01])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set, bucket_id)
            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, False)
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch.
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                print("global step %d learning rate %.4f step-time %.2f perplexity "
                      "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                                step_time, perplexity))
                # Decrease learning rate if no improvement was seen over last 3 times.
                if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(loss)
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
                model.saver.save(sess, checkpoint_path, global_step=model.global_step)
                step_time, loss = 0.0, 0.0
                # Run evals on development set and print their perplexity.
                for bucket_id in xrange(len(_buckets)):
                    if len(dev_set[bucket_id]) == 0:
                        print("  eval: empty bucket %d" % (bucket_id))
                        continue
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        dev_set, bucket_id)
                    _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                                 target_weights, bucket_id, True)
                    eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
                    print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
                sys.stdout.flush()
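
Both training loops above choose a bucket for each step by drawing a uniform random number and locating it in the cumulative train_buckets_scale list, so larger buckets are picked proportionally more often. A small self-contained sketch of that sampling scheme follows, with illustrative bucket sizes rather than real data.

import numpy as np

def build_buckets_scale(bucket_sizes):
    """Cumulative proportions in (0, 1]; interval widths match the bucket sizes."""
    total = float(sum(bucket_sizes))
    return [sum(bucket_sizes[:i + 1]) / total for i in range(len(bucket_sizes))]

def sample_bucket_id(buckets_scale):
    """Pick the first bucket whose cumulative proportion exceeds a uniform draw."""
    r = np.random.random_sample()
    return min(i for i in range(len(buckets_scale)) if buckets_scale[i] > r)

bucket_sizes = [1000, 3000, 6000]          # hypothetical bucket sizes
scale = build_buckets_scale(bucket_sizes)  # [0.1, 0.4, 1.0]
counts = [0, 0, 0]
for _ in range(10000):
    counts[sample_bucket_id(scale)] += 1
print(counts)  # roughly proportional to [1000, 3000, 6000]
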
Code example #38
0
    def __init__(self,
                 alpha,
                 batch_size,
                 n_epochs,
                 wordVecLen,
                 flag_dropout,
                 datapath,
                 random_seed,
                 dropoutRates,
                 optimizer,
                 dispFreq,
                 beam_size,
                 flag_random_lookup_table,
                 flag_toy_data,
                 size_hidden_layer,
                 dataset,
                 result_path,
                 sentence_modeling,
                 CNN_filter_length,
                 LSTM_go_backwards
                 ):
        model_options = locals().copy()
        model_options['rng'] = np.random.RandomState(random_seed)
        print 'Loading data'
        src_train,src_valid,src_test,dic_w2idx, dic_idx2w, dic_w2embed, dic_idx2embed, embedding = load_data(path=datapath)
        if flag_toy_data == True:
            src_valid = src_valid[:10]
            src_test = src_test[:10] 
            #src_train = copy.copy(src_valid)
            src_train = src_train[:10]
        elif flag_toy_data != False:
            valid_l = len(src_valid) * flag_toy_data
            test_l = len(src_test) * flag_toy_data
            train_l = len(src_train) * flag_toy_data
            src_valid = src_valid[:int(valid_l)]
            src_test = src_test[:int(test_l)] 
            src_train = src_train[:int(train_l)]
            
        train,pairdict_train = prepare_data(src_train)
        valid,pairdict_valid = prepare_data(src_valid)
        test,pairdict_test = prepare_data(src_test)
        model_options['embedding'] = embedding
        
        (sentence1,sentence1_mask,sentence2,sentence2_mask,y,cost,f_pred,tparams,f_debug) = build_model(model_options)
        #f_cost = theano.function([sentence1,sentence1_mask,sentence2,sentence2_mask,y], cost, name='f_cost')
    
        #grads = tensor.grad(theano.gradient.grad_clip(cost, -2.0, 2.0), wrt=tparams.values())
        grads = tensor.grad(theano.gradient.grad_clip(cost, -2.0, 2.0), wrt=tparams)
        # grads = tensor.grad(cost, wrt=tparams.values())
        #f_grad = theano.function([sentence1,sentence1_mask,sentence2,sentence2_mask,y], grads, name='f_grad')
    
        lr = tensor.scalar(name='lr')
        if model_options['optimizer'] == 'sgd': optimizer = sgd
        elif model_options['optimizer'] == 'rmsprop': optimizer = rmsprop
        else: optimizer = adadelta
        f_grad_shared, f_update = optimizer(lr, tparams, grads, sentence1,sentence1_mask,sentence2,sentence2_mask,y, cost)
        
        
        print 'Optimization'

        kf_valid = get_minibatches_idx(len(valid), model_options['batch_size'])
        kf_test = get_minibatches_idx(len(test), model_options['batch_size'])
    
        print "%d train examples" % len(train)
        print "%d valid examples" % len(valid)
        print "%d test examples" % len(test)
        sys.stdout.flush()
        
        
        best_validation_score = -np.inf
        best_iter = 0
        uidx = 0  # the number of updates done so far
        for epoch in xrange(model_options['n_epochs']):
            print ('Training on %d epoch' % epoch)
            sys.stdout.flush()
            kf = get_minibatches_idx(len(train), batch_size, shuffle=True)
            start_time = time.time()
            samples_seen = 0
            for _, train_index in kf:
                uidx += 1
                batch_samples = [train[t] for t in train_index]
                samples_seen += len(batch_samples)
                #print batch_samples
                sentence1,sentence1_mask,sentence2,sentence2_mask,y = data_padding(batch_samples)
                #print sentence1,sentence1_mask,sentence2,sentence2_mask,y
                #print sentence1.shape,sentence1_mask.shape,sentence2.shape,sentence2_mask.shape,y.shape
                #o = f_debug(sentence1,sentence1_mask,sentence2,sentence2_mask,y)
                #print o
                #print o[0].shape,o[1].shape,o[2].shape,o[3].shape
                cost = f_grad_shared(sentence1,sentence1_mask,sentence2,sentence2_mask,y)
                f_update(model_options['alpha'])
                if np.isnan(cost) or np.isinf(cost):
                    print 'NaN detected'
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    print 'Epoch ', epoch, 'Update ', uidx, 'Cost ', cost, 'Samples_seen ', samples_seen
                    sys.stdout.flush()
            print 'Epoch ', epoch, 'Update ', uidx, 'Cost ', cost, 'Samples_seen ', samples_seen
            sys.stdout.flush()
            '''
            if epoch % 5 == 0:
                kf_train = get_minibatches_idx(len(train), batch_size)
                print ('Train_score:')
                self.eva(f_pred, src_train, train, pairdict_train, kf_train, model_options)
                sys.stdout.flush()
            '''
            print ('Valid_score:')
            top1_res = self.eva(f_pred, src_valid, valid, pairdict_valid, kf_valid, model_options)
            self.save_result(model_options['result_path'] + 'dev.on.' + str(epoch) +'th_epoch_' + model_options['dataset'],top1_res)
            sys.stdout.flush()
            print ('Test_score:')
            top1_res = self.eva(f_pred, src_test, test, pairdict_test, kf_test, model_options)
            self.save_result(model_options['result_path'] + 'test.on.' + str(epoch) +'th_epoch_' + model_options['dataset'],top1_res)
            sys.stdout.flush()
            
            print ('%d epoch completed.' % epoch)
            sys.stdout.flush()
            '''
            if(best_validation_score < valid_score):
                best_iter = epoch
                best_validation_score = valid_score
            print ('Current best_dev_F is %.2f, at %d epoch'%(best_validation_score,best_iter))
            '''
        
            end_time = time.time()
            minu = int((end_time - start_time)/60)
            sec = (end_time - start_time) - 60 * minu
            print ('Time: %d min %.2f sec' % (minu, sec))
            sys.stdout.flush()
        print('Training completed!')
        sys.stdout.flush()
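
The epoch loop above iterates over (minibatch_number, index_array) pairs produced by get_minibatches_idx. Below is a minimal sketch of what such a helper could look like, written to match how the loop unpacks its return value; this is an assumption about its behavior, not the project's actual implementation.

import numpy as np

def get_minibatches_idx(n, minibatch_size, shuffle=False):
    """Split range(n) into index arrays of at most `minibatch_size` elements.

    Returns a list of (minibatch_index, example_indices) pairs so callers can
    iterate `for _, batch_idx in get_minibatches_idx(...)` as in the loop above.
    """
    idx_list = np.arange(n, dtype="int64")
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = []
    start = 0
    while start < n:
        minibatches.append(idx_list[start:start + minibatch_size])
        start += minibatch_size
    return list(zip(range(len(minibatches)), minibatches))

# Example: 10 examples in shuffled batches of 4 -> index arrays of sizes 4, 4, 2.
for i, batch in get_minibatches_idx(10, 4, shuffle=True):
    print(i, batch)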