Example #1
def run_iters_experiment(dir_path, file_name):
    res = {}
    res['x'] = []
    res['y'] = []
    # sweep the number of co-training iterations; average accuracy over 10 random splits per setting
    for num_of_iters in ITERS:
        X, y, view1, view2 = du.extract_data(dir_path + '/' + file_name,
                                             file_name)
        acc = []
        for i in range(10):
            print("CV: " + str(i))
            X, y = shuffle(X, y)
            X_labeled, X_unlabeled, y_labeled, X_test, y_test = du.split_data(
                X, y, train_test_split=0.8, labeled_unlabeled_split=0.2)
            base_models = [
                BASE_MODELS[ITERS_BASE_MODEL], BASE_MODELS[ITERS_BASE_MODEL]
            ]
            model = SP_coTrain.SP_coTrain(base_models,
                                          num_of_iters,
                                          add_rate=0.1,
                                          gamma=0.5)
            model.fit(X_labeled, X_unlabeled, y_labeled, view1, view2)
            y_pred = model.predict(X_test)
            acc.append(accuracy_score(y_test, y_pred))

        res['x'].append(num_of_iters)
        res['y'].append(sum(acc) / len(acc))
    plot_graph(res)
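
Several examples in this listing unpack du.split_data(X, y, train_test_split=0.8, labeled_unlabeled_split=...) into labeled, unlabeled, and test sets, but the helper itself is not shown. Below is a minimal sketch with the same return signature, assuming the first fraction is the train/test cut and the second is the labeled share of the training portion; the real du.split_data may differ.

import numpy as np

def split_data_sketch(X, y, train_test_split=0.8, labeled_unlabeled_split=0.2):
    """Return (X_labeled, X_unlabeled, y_labeled, X_test, y_test)."""
    X, y = np.asarray(X), np.asarray(y)
    n_train = int(len(X) * train_test_split)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    n_labeled = int(n_train * labeled_unlabeled_split)
    # the unlabeled pool keeps only its features; its labels are discarded
    return (X_train[:n_labeled], X_train[n_labeled:], y_train[:n_labeled],
            X_test, y_test)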
Example #2
def train(FLAGS):

    # Load the data
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/en.p', max_vocab_size=5000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/sp.p', max_vocab_size=5000, target_lang=True)

    # Split into train and validation sets
    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_en_seq_lens, train_sp_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_en_seq_lens, valid_sp_seq_len = \
        split_data(en_token_ids, sp_token_ids, en_seq_lens, sp_seq_lens,
            train_ratio=0.8)

    # Update parameters
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)

    # Start session
    with tf.Session() as sess:

        # Create new model or load old one
        model = create_model(sess, FLAGS)

        # Training begins
        losses = []
        for epoch_num, epoch in enumerate(
                generate_epoch(train_encoder_inputs, train_decoder_inputs,
                               train_targets, train_en_seq_lens,
                               train_sp_seq_len, FLAGS.num_epochs,
                               FLAGS.batch_size)):

            print "EPOCH: %i" % (epoch_num)
            # Decay learning rate
            sess.run(tf.assign(model.lr, FLAGS.learning_rate * \
                (FLAGS.learning_rate_decay_factor ** epoch_num)))

            batch_loss = []

            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                            batch_targets, batch_en_seq_lens,
                            batch_sp_seq_lens) in enumerate(epoch):

                loss, _ = model.step(sess, FLAGS, batch_encoder_inputs,
                                     batch_decoder_inputs, batch_targets,
                                     batch_en_seq_lens, batch_sp_seq_lens,
                                     FLAGS.dropout)

                batch_loss.append(loss)

            losses.append(np.mean(batch_loss))

        plt.plot(losses, label='loss')
        plt.legend()
        plt.show()
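
split_data here returns ten arrays: encoder inputs, decoder inputs, targets, and both sequence-length vectors for the train and validation portions. The helper is not shown, so the sketch below is only a rough guess at its behavior; it assumes the token-id matrices are already padded to a fixed length, that decoder inputs are the target sentences shifted right behind a GO token, and that targets are the unshifted target sentences.

import numpy as np

GO_ID = 1  # assumed id of the GO token; the real value depends on the vocabulary

def split_data_sketch(en_token_ids, sp_token_ids, en_seq_lens, sp_seq_lens,
                      train_ratio=0.8):
    en = np.asarray(en_token_ids)
    sp = np.asarray(sp_token_ids)
    en_lens = np.asarray(en_seq_lens)
    sp_lens = np.asarray(sp_seq_lens)

    # decoder inputs: GO token followed by the target sentence (dropping its last id)
    decoder_inputs = np.hstack(
        [np.full((len(sp), 1), GO_ID, dtype=sp.dtype), sp[:, :-1]])
    targets = sp

    cut = int(len(en) * train_ratio)
    return (en[:cut], decoder_inputs[:cut], targets[:cut],
            en_lens[:cut], sp_lens[:cut],
            en[cut:], decoder_inputs[cut:], targets[cut:],
            en_lens[cut:], sp_lens[cut:])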
Example #3
def predict_on_stocks(array: numpy.ndarray, model_path: str, interval: str,
                      stock_path: str):
    scaler = StandardScaler()
    open_data, close_data = init_data(array)

    open_data, close_data = normalize_data(open_data, close_data, scaler)

    (x_train, y_train, x_test, y_test) = split_data(open_data, close_data)
    (x_train, y_train) = shuffle_data(x_train, y_train)

    (model, checkpoint_callback) = create_model(model_path)
    model.fit(x_train,
              y_train,
              validation_data=(x_test, y_test),
              batch_size=64,
              epochs=EPOCHS,
              callbacks=[checkpoint_callback])

    # test_model(model, x_test, y_test, scaler, interval)  # uncomment to evaluate the model on the test set

    dump(scaler, f'{model_path}/std_scaler.bin', compress=True)
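
init_data, normalize_data, split_data, shuffle_data, and create_model are project helpers that are not shown. A minimal sketch of a split with the same (x_train, y_train, x_test, y_test) shape, assuming open prices are the inputs, close prices the targets, and a chronological 80/20 cut (time-series data is usually split without shuffling across the boundary):

import numpy as np

def split_data_sketch(open_data, close_data, train_ratio=0.8):
    """Chronological split into (x_train, y_train, x_test, y_test)."""
    open_data, close_data = np.asarray(open_data), np.asarray(close_data)
    cut = int(len(open_data) * train_ratio)
    return open_data[:cut], close_data[:cut], open_data[cut:], close_data[cut:]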
Example #4
def cross_validation(dir_path,
                     file_name,
                     cv=10,
                     base_model="RandomForest",
                     labeled_rate=0.2):
    X, y, view1, view2 = du.extract_data(dir_path + '/' + file_name, file_name)

    res_spaco = []
    res_co = []
    res_base = []
    res_reg_base = []

    for i in range(cv):
        print("CV: " + str(i))
        X, y = shuffle(X, y)
        X_labeled, X_unlabeled, y_labeled, X_test, y_test = du.split_data(
            X, y, train_test_split=0.8, labeled_unlabeled_split=labeled_rate)

        res_spaco.append(
            evaluate_model("spaco", base_model, X_labeled, X_unlabeled,
                           y_labeled, X_test, y_test, view1, view2,
                           labeled_rate))

        res_co.append(
            evaluate_model("co", base_model, X_labeled, X_unlabeled, y_labeled,
                           X_test, y_test, view1, view2, labeled_rate))

        res_base.append(
            evaluate_model("base", base_model, X_labeled, X_unlabeled,
                           y_labeled, X_test, y_test, view1, view2,
                           labeled_rate))

        res_reg_base.append(
            evaluate_model("reg_base", base_model, X_labeled, X_unlabeled,
                           y_labeled, X_test, y_test, view1, view2,
                           labeled_rate))

    return res_spaco, res_co, res_base, res_reg_base
Example #5
def main(args):

    # set random seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    print(args.dataset_root)

    # load data
    data, meta = data_utils.load_data(args.dataset_root,
                                      args.dataset,
                                      is_training=True)
    train_data, val_data = data_utils.split_data(data,
                                                 args.validate_rate,
                                                 shuffle=True)

    # build train dataloader
    train_dataset = data_utils.ImageDataset(*train_data,
                                            is_training=True,
                                            is_flip=args.dataset
                                            not in ['mnist', 'svhn'])
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   args.batch_size,
                                                   shuffle=True,
                                                   num_workers=2,
                                                   pin_memory=True)

    # build val dataloader
    val_dataset = data_utils.ImageDataset(*val_data, is_training=False)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 args.batch_size,
                                                 shuffle=False,
                                                 num_workers=2,
                                                 pin_memory=True)

    # remove temp dataset variables to reduce memory usage
    del data, train_data, val_data

    device = torch.device(args.device)

    # build model
    if args.model == 'resnet_20':
        model = models.Resnet20
    else:
        model = models.SimpleCNN
    net = model(train_dataset.shape, meta['n_class']).to(device=device)
    net.apply(init_param)

    criterion = torch.nn.CrossEntropyLoss()

    # build optim
    optim = torch.optim.SGD(make_param_groups(net, args.weight_decay),
                            0.1,
                            momentum=0.9)

    # make log directory
    logdir = Path(args.logdir)
    if not logdir.exists():
        os.makedirs(str(logdir))

    global_step = 0
    start_epoch = 0
    if args.restore:
        # restore checkpoint
        state = torch.load(args.restore)
        start_epoch = state['epoch'] + 1
        global_step = state['global_step']
        net.load_state_dict(state['net'])
        optim.load_state_dict(state['optim'])

    # lr strategy
    lr_boundaries = list(map(int, args.boundaries.split(',')))
    lr_values = list(map(float, args.values.split(',')))
    lr_manager = LRManager(lr_boundaries, lr_values)

    for e in range(start_epoch, args.n_epoch):
        print('-------epoch: {:d}-------'.format(e))

        # training phase
        net.train()
        mean_loss, acc = MeanValue(), Accuracy()
        lr_manager.set_lr_for_optim(e, optim)
        tm = TimeMeter()
        tm.start()
        train_log = {}
        for i, (x, y) in enumerate(train_dataloader):
            tm.add_counter()

            if device.type == 'cuda':
                x = x.cuda(device, non_blocking=True)
                y = y.cuda(device, non_blocking=True)

            optim.zero_grad()
            logits = net(x)
            loss = criterion(logits, y)

            loss.backward()
            optim.step()
            global_step += 1

            loss = loss.detach().cpu().numpy()
            predicts = torch.argmax(logits, dim=1).detach().cpu().numpy()
            y = y.detach().cpu().numpy()

            mean_loss.add(loss)
            acc.add(predicts, y)

            if i % args.log_every == 0:
                torch.cuda.synchronize()
                tm.stop()

                print(
                    'step: {:d}, lr: {:g}, loss: {:.4f}, acc: {:.2%}, speed: {:.2f} i/s.'
                    .format(i, lr_manager.get(e), mean_loss.get(), acc.get(),
                            args.batch_size / tm.get()))
                train_log[global_step] = {
                    'loss': mean_loss.get(),
                    'acc': acc.get()
                }
                tm.reset()
                tm.start()
                mean_loss.reset()
                acc.reset()

        # val phase
        net.eval()
        mean_loss, acc = MeanValue(), Accuracy()
        for x, y in val_dataloader:

            if device.type == 'cuda':
                x = x.cuda(device, non_blocking=True)
                y = y.cuda(device, non_blocking=True)

            logits = net(x)
            loss = criterion(logits, y)

            loss = loss.detach().cpu().numpy()
            predicts = torch.argmax(logits, dim=1).detach().cpu().numpy()
            y = y.detach().cpu().numpy()

            mean_loss.add(loss)
            acc.add(predicts, y)

        print('val_loss: {:.4f}, val_acc: {:.2%}'.format(
            mean_loss.get(), acc.get()))
        val_log = {global_step: {'loss': mean_loss.get(), 'acc': acc.get()}}

        # save checkpoint
        vars_to_saver = {
            'net': net.state_dict(),
            'optim': optim.state_dict(),
            'epoch': e,
            'global_step': global_step
        }
        cpt_file = logdir / 'checkpoint_{:d}.pk'.format(e)
        torch.save(vars_to_saver, str(cpt_file))

        log_file = logdir / 'log_{:d}.pk'.format(e)
        torch.save({'train': train_log, 'val': val_log}, str(log_file))
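
LRManager is used above with comma-separated epoch boundaries and learning-rate values, a get(epoch) accessor, and set_lr_for_optim(epoch, optim). A minimal piecewise-constant sketch consistent with that usage, assuming values has one more entry than boundaries (the real class may differ):

class LRManagerSketch:
    """Piecewise-constant learning-rate schedule over epochs."""

    def __init__(self, boundaries, values):
        assert len(values) == len(boundaries) + 1
        self.boundaries = boundaries
        self.values = values

    def get(self, epoch):
        # return the value of the interval this epoch falls into
        for boundary, value in zip(self.boundaries, self.values):
            if epoch < boundary:
                return value
        return self.values[-1]

    def set_lr_for_optim(self, epoch, optim):
        # push the scheduled rate into every parameter group of a torch optimizer
        lr = self.get(epoch)
        for group in optim.param_groups:
            group['lr'] = lr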
Example #6
def train(FLAGS):

    # Load the data
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/en.p', max_vocab_size=5000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/sp.p', max_vocab_size=5000, target_lang=True)

    # Split into train and validation sets
    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_en_seq_lens, train_sp_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_en_seq_lens, valid_sp_seq_len = \
        split_data(en_token_ids, sp_token_ids, en_seq_lens, sp_seq_lens,
            train_ratio=0.8)

    # Update parameters
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)
    FLAGS.sp_max_len = max(sp_seq_lens) + 1 # GO token

    # Start session
    with tf.Session() as sess:

        # Create new model or load old one
        model = create_model(sess, FLAGS, forward_only=False)

        # Training begins
        train_losses = []
        valid_losses = []
        for epoch_num, epoch in enumerate(generate_epoch(train_encoder_inputs,
            train_decoder_inputs, train_targets,
            train_en_seq_lens, train_sp_seq_len,
            FLAGS.num_epochs, FLAGS.batch_size)):

            print "EPOCH: %i" % (epoch_num)
            # Decay learning rate
            sess.run(tf.assign(model.lr, FLAGS.learning_rate * \
                (FLAGS.learning_rate_decay_factor ** epoch_num)))

            batch_loss = []

            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                batch_targets, batch_en_seq_lens,
                batch_sp_seq_lens) in enumerate(epoch):

                y_pred, loss, _ = model.step(sess, FLAGS,
                    batch_encoder_inputs, batch_decoder_inputs, batch_targets,
                    batch_en_seq_lens, batch_sp_seq_lens,
                    FLAGS.dropout, forward_only=False)

                batch_loss.append(loss)
            train_losses.append(np.mean(batch_loss))

            for valid_epoch_num, valid_epoch in enumerate(generate_epoch(valid_encoder_inputs,
                valid_decoder_inputs, valid_targets,
                valid_en_seq_lens, valid_sp_seq_len,
                num_epochs=1, batch_size=FLAGS.batch_size)):

                batch_loss = []

                for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                    batch_targets, batch_en_seq_lens,
                    batch_sp_seq_lens) in enumerate(valid_epoch):

                    loss = model.step(sess, FLAGS,
                        batch_encoder_inputs, batch_decoder_inputs, batch_targets,
                        batch_en_seq_lens, batch_sp_seq_lens,
                        dropout=0.0, forward_only=True, sampling=False)

                    batch_loss.append(loss)
                valid_losses.append(np.mean(batch_loss))

        # Save checkpoint.
        if not os.path.isdir(FLAGS.ckpt_dir):
            os.makedirs(FLAGS.ckpt_dir)
        checkpoint_path = os.path.join(FLAGS.ckpt_dir, "model.ckpt")
        print "Saving the model."
        model.saver.save(sess, checkpoint_path,
                         global_step=model.global_step)

        plt.plot(train_losses, label='train_loss')
        plt.plot(valid_losses, label='valid_loss')
        plt.legend()
        plt.show()
Example #7
import tensorflow as tf
import numpy as np
import data_processing
import config
import data_utils
import seq2seq_wrapper
from os import path

#load data and split into train and test sets
idx_headings, idx_descriptions = data_processing.process_data()
article_metadata = data_processing.unpickle_articles()
(x_train, x_test), (y_train, y_test), (x_valid, y_valid) = data_utils.split_data(idx_descriptions, idx_headings)

#define parameters
xseq_length = x_train.shape[-1]
yseq_length = y_train.shape[-1]
batch_size = config.batch_size
xvocab_size = len(article_metadata['idx2word'])
yvocab_size = xvocab_size
checkpoint_path = path.join(config.path_outputs, 'checkpoint')

print (checkpoint_path)

#define model
model = seq2seq_wrapper.Seq2Seq(xseq_len=xseq_length,
                                yseq_len=yseq_length,
                                xvocab_size=xvocab_size,
                                yvocab_size=yvocab_size,
                                emb_dim=config.embedding_dim,
                                num_layers=3,
                                ckpt_path=checkpoint_path)
Example #8
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform

from data_utils import get_data, split_data
from utils import DataFrameSelector, CombinedAttributesAdder, CustomLabelBinarizer

# loading the dataset
housing = get_data()

# split into train_set and test_set
train_set, test_set = split_data(housing)

housing_train = train_set.drop(
    "median_house_value",
    axis=1)  # median_house_value column contains the target values
housing_test = test_set.drop("median_house_value", axis=1)
housing_train_labels = train_set["median_house_value"].copy()
housing_test_labels = test_set["median_house_value"].copy()

# data preparation and prediction are going to be done in a pipeline

# pipeline to preprocess numerical features
numerical_attributes = train_set.drop(
    ['ocean_proximity', 'median_house_value'], axis=1).columns

numerical_pipeline = Pipeline(
Example #9
#seq2seq train
import tensorflow as tf
import numpy as np
import data_processing
import config
import data_utils
import seq2seq_wrapper
from os import path

#load data and split into train and test sets
idx_headings, idx_descriptions = data_processing.process_data()
article_metadata = data_processing.unpickle_articles()
(x_train, x_test), (y_train, y_test), (x_valid,
                                       y_valid) = data_utils.split_data(
                                           idx_descriptions, idx_headings)

#define parameters
xseq_length = x_train.shape[-1]
yseq_length = y_train.shape[-1]
batch_size = config.batch_size
xvocab_size = len(article_metadata['idx2word'])
yvocab_size = xvocab_size
checkpoint_path = path.join(config.path_outputs, 'checkpoint')

print(checkpoint_path)

#define model
model = seq2seq_wrapper.Seq2Seq(xseq_len=xseq_length,
                                yseq_len=yseq_length,
                                xvocab_size=xvocab_size,
                                yvocab_size=yvocab_size,
Example #10
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print "USAGE: " + sys.argv[0] + " input_file output_model_file"
        sys.exit(1)

    input_file = sys.argv[1]
    output_model_file = sys.argv[2]

    data = data_utils.read_from_csv(input_file)

    filtered_data = [x for x in data if x.diag_tag != "" and x.diag_tag != "u"]
    labels = [np.float32(x.diag_tag == "p") for x in filtered_data]
    data = [x.processed_sentence for x in filtered_data]
    report_ids = [x.report_id for x in filtered_data]

    train_data, train_labels, test_data, test_labels = data_utils.split_data(data, labels, report_ids, split=0.7)

    # change these parameters for the grid search
    # parameters = {'lsi__n_components': [100],
    #               'classifier__C': [3, 4, 5, 6, 7, 8, 9, 10],
    #               'classifier__kernel': ["rbf"]
    #               }

    parameters = {'lsi__n_components': [100],
                  'classifier__n_estimators': [1000],
                  'classifier__max_depth': [5, 10],
                  'classifier__min_samples_split': [5, 10],
                  'classifier__min_samples_leaf': [5, 10],
                  }

    # clf = GridSearchCV(pipelines.get_count_lsi_SVM(), parameters)
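
data_utils.split_data receives report_ids alongside the sentences and labels, presumably so that sentences from the same report never straddle the train/test boundary. A minimal group-aware sketch under that assumption (the real helper may behave differently):

import numpy as np

def split_by_report(sentences, labels, report_ids, split=0.7, seed=0):
    """Keep all sentences that share a report_id on the same side of the split."""
    rng = np.random.RandomState(seed)
    unique_ids = np.unique(report_ids)
    rng.shuffle(unique_ids)
    train_ids = set(unique_ids[:int(len(unique_ids) * split)])

    train_data, train_labels, test_data, test_labels = [], [], [], []
    for sentence, label, report_id in zip(sentences, labels, report_ids):
        if report_id in train_ids:
            train_data.append(sentence)
            train_labels.append(label)
        else:
            test_data.append(sentence)
            test_labels.append(label)
    return train_data, train_labels, test_data, test_labels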
Example #11
def train(FLAGS):

    # Load the data
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/my_en.txt', max_vocab_size=5000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/my_sp.txt', max_vocab_size=5000, target_lang=True)

    # Split into train and validation sets
    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_en_seq_lens, train_sp_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_en_seq_lens, valid_sp_seq_len = \
        split_data(en_token_ids, sp_token_ids, en_seq_lens, sp_seq_lens,
            train_ratio=0.8)

    output = open('data/vocab_en.pkl', 'wb')
    pickle.dump(en_vocab_dict, output)
    output.close()
    output = open('data/vocab_sp.pkl', 'wb')
    pickle.dump(sp_vocab_dict, output)
    output.close()

    # Update parameters
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)

    print 'len(en_vocab_dict)', len(en_vocab_dict)
    print 'len(sp_vocab_dict)', len(sp_vocab_dict)

    # Start session
    with tf.Session() as sess:

        # Create new model or load old one
        model = create_model(sess, FLAGS)

        # Training begins
        losses = []
        for epoch_num, epoch in enumerate(
                generate_epoch(train_encoder_inputs, train_decoder_inputs,
                               train_targets, train_en_seq_lens,
                               train_sp_seq_len, FLAGS.num_epochs,
                               FLAGS.batch_size)):

            print "EPOCH: %i" % (epoch_num)
            # Decay learning rate
            sess.run(tf.assign(model.lr, FLAGS.learning_rate * \
                (FLAGS.learning_rate_decay_factor ** epoch_num)))

            batch_loss = []

            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                            batch_targets, batch_en_seq_lens,
                            batch_sp_seq_lens) in enumerate(epoch):

                loss, _ = model.step(sess, FLAGS, batch_encoder_inputs,
                                     batch_decoder_inputs, batch_targets,
                                     batch_en_seq_lens, batch_sp_seq_lens,
                                     FLAGS.dropout)

                batch_loss.append(loss)

            losses.append(np.mean(batch_loss))

        checkpoint_path = "/tmp/model.ckpt"
        print "Saving the model."
        model.saver.save(sess, checkpoint_path)
        plt.plot(losses, label='loss')
        plt.legend()
        plt.savefig('seq_01.png')
Example #12
    acc = accuracy_score(y_test, y_pred)

    return fit_time, predict_time, auc, acc


X, y, view1, view2 = du.extract_data(dir_path + '/' + file_name, file_name)

res_spaco = []
res_spaco_ours = []
res_spaco_ours2 = []
res_reg = []

for i in range(cv):
    print("CV: " + str(i))
    X, y = shuffle(X, y)
    X_labeled, X_unlabeled, y_labeled, X_test, y_test = du.split_data(
        X, y, train_test_split=0.8, labeled_unlabeled_split=labeled_split)

    res_spaco.append(
        evaluate_model("spaco", base_model, X_labeled, X_unlabeled, y_labeled,
                       X_test, y_test, view1, view2))

    res_spaco_ours.append(
        evaluate_model("spaco_ours", base_model, X_labeled, X_unlabeled,
                       y_labeled, X_test, y_test, view1, view2))

    res_spaco_ours2.append(
        evaluate_model("spaco_ours2", base_model, X_labeled, X_unlabeled,
                       y_labeled, X_test, y_test, view1, view2))

    res_reg.append(
        evaluate_model("reg", base_model, X_labeled, X_unlabeled, y_labeled,
                       X_test, y_test, view1, view2))

f = open('./test_res.txt', mode='w')
f.write("spaco\n")
f.write(str(res_spaco[0]))
f.write("\n\n")
f.write("spaco_ours\n")
f.write(str(res_spaco_ours[0]))
Example #13
def train():

    X, y = load_data_and_labels()
    vocab_list, vocab_dict, rev_vocab_dict = create_vocabulary(
        X, FLAGS.en_vocab_size)
    X, seq_lens = data_to_token_ids(X, vocab_dict)
    train_X, train_y, train_seq_lens, valid_X, valid_y, valid_seq_lens = \
        split_data(X, y, seq_lens)
    FLAGS.max_sequence_length = len(train_X[0])

    with tf.Session() as sess:

        # Load old model or create new one
        model = create_model(sess, FLAGS)

        # Train results
        for epoch_num, epoch in enumerate(
                generate_epoch(train_X, train_y, train_seq_lens,
                               FLAGS.num_epochs, FLAGS.batch_size)):
            print "EPOCH:", epoch_num

            sess.run(tf.assign(model.lr, FLAGS.learning_rate * \
                (FLAGS.learning_rate_decay_factor ** epoch_num)))

            train_loss = []
            train_accuracy = []
            for batch_num, (batch_X, batch_y,
                            batch_seq_lens) in enumerate(epoch):

                _, loss, accuracy = model.step(
                    sess,
                    batch_X,
                    batch_seq_lens,
                    batch_y,
                    dropout_keep_prob=FLAGS.dropout_keep_prob,
                    forward_only=False,
                    sampling=False)

                train_loss.append(loss)
                train_accuracy.append(accuracy)

            print
            print "EPOCH %i SUMMARY" % epoch_num
            print "Training loss %.3f" % np.mean(train_loss)
            print "Training accuracy %.3f" % np.mean(train_accuracy)
            print "----------------------"

            # Validation results
            for valid_epoch_num, valid_epoch in enumerate(
                    generate_epoch(valid_X,
                                   valid_y,
                                   valid_seq_lens,
                                   num_epochs=1,
                                   batch_size=FLAGS.batch_size)):
                valid_loss = []
                valid_accuracy = []

                for valid_batch_num, \
                    (valid_batch_X, valid_batch_y, valid_batch_seq_lens) in \
                        enumerate(valid_epoch):

                    loss, accuracy = model.step(sess,
                                                valid_batch_X,
                                                valid_batch_seq_lens,
                                                valid_batch_y,
                                                dropout_keep_prob=1.0,
                                                forward_only=True,
                                                sampling=False)

                    valid_loss.append(loss)
                    valid_accuracy.append(accuracy)

            print "Validation loss %.3f" % np.mean(valid_loss)
            print "Validation accuracy %.3f" % np.mean(valid_accuracy)
            print "----------------------"

            # Save checkpoint every epoch.
            if not os.path.isdir(FLAGS.ckpt_dir):
                os.makedirs(FLAGS.ckpt_dir)
            checkpoint_path = os.path.join(FLAGS.ckpt_dir, "model.ckpt")
            print "Saving the model."
            model.saver.save(sess,
                             checkpoint_path,
                             global_step=model.global_step)
Example #14
def main():
    """
    Wrapper to run the classification task
    """
    # Parse command-line arguments
    parser = build_parser()
    options = parser.parse_args()

    if options.mode == "gen_data":
        # Split the data into train/dev/test sets
        split_data()

        # Load the data and reshape for training and evaluation
        X, y_media, y_emotion = load_data(update=options.update,
                                          remove_broken=options.remove_broken)

        for set_type in ["train", "dev", "test"]:
            total_media = np.sum(y_media[set_type], axis=0)
            total_emotion = np.sum(y_emotion[set_type], axis=0)

            print(f"Total images for each media category in {set_type} set:")
            for v, k in enumerate(MEDIA_LABELS):
                print(f"\t{k}: {total_media[v]}")
            print(f"Total images for each emotion category in {set_type} set:")
            for v, k in enumerate(EMOTION_LABELS):
                print(f"\t{k}: {total_emotion[v]}")

    elif options.mode == "train":
        # Create directory to save the results
        results_dir = "results"
        if not os.path.exists("./" + results_dir):
            os.makedirs("./" + results_dir)
        # Check if the given log folder already exists
        results_subdirs = os.listdir("./" + results_dir)
        if not options.log_folder:
            raise Exception(
                'Please specify log_folder argument to store results.')
        elif options.log_folder in results_subdirs:
            raise Exception('The given log folder already exists.')
        else:
            # Create a folder for each training run
            log_folder = os.path.join(results_dir, options.log_folder)
            os.makedirs(log_folder)

        # Load the data and organize into three tuples (train, val/dev, test)
        # Each tuple consists of input arrays, media labels, and emotion labels
        train_data, val_data, test_data = load_data(DATA_DIR, INPUT_FILE,
                                                    MEDIA_LABEL_FILE,
                                                    EMOTION_LABEL_FILE)

        # Preprocess the data
        train_dset, val_dset, test_dset = preprocess(
            train_data,
            val_data,
            test_data,
            augment=options.augment,
            train_stats_dir=TRAIN_STATS_DIR)

        # Specify the device:
        if options.device == "cpu":
            device = "/cpu:0"
        elif options.device == "gpu":
            device = "/device:GPU:0"

        # Train the model
        train(train_dset,
              val_dset,
              log_folder=log_folder,
              device=device,
              batch_size=64,
              num_epochs=100,
              model_type=options.model_type)

    elif options.mode == "test":
        # Load the data and organize into three tuples (train, val/dev, test)
        # Each tuple consists of input arrays, media labels, and emotion labels
        train_data, val_data, test_data = load_data(DATA_DIR, INPUT_FILE,
                                                    MEDIA_LABEL_FILE,
                                                    EMOTION_LABEL_FILE)
        # Preprocess the data
        if os.path.isfile(os.path.join(TRAIN_STATS_DIR, "train_stats.npz")):
            print(
                "Preprocess test data using saved statistics from train data..."
            )
            train_stats_file = os.path.join(TRAIN_STATS_DIR, "train_stats.npz")
            test_dset = preprocess_from_file(train_stats_file,
                                             test_data,
                                             augment=options.augment)
        else:
            print("Preprocess test data using train data...")
            train_dset, val_dset, test_dset = preprocess(
                train_data,
                val_data,
                test_data,
                augment=options.augment,
                train_stats_dir=TRAIN_STATS_DIR)

        # Specify the device:
        if options.device == "cpu":
            device = "/cpu:0"
        elif options.device == "gpu":
            device = "/device:GPU:0"

        # Load the model
        model_path = os.path.join("test_models", options.model_name)
        evaluate_test(model_path,
                      options.model_type,
                      test_dset,
                      batch_size=64,
                      confusion_mat=options.confusion_mat)

    elif options.mode == "ensemble":
        # Load the data and organize into three tuples (train, val/dev, test)
        # Each tuple consists of input arrays, media labels, and emotion labels
        train_data, val_data, test_data = load_data(DATA_DIR, INPUT_FILE,
                                                    MEDIA_LABEL_FILE,
                                                    EMOTION_LABEL_FILE)
        # Preprocess the data
        if os.path.isfile(os.path.join(TRAIN_STATS_DIR, "train_stats.npz")):
            print(
                "Preprocess test data using saved statistics from train data..."
            )
            train_stats_file = os.path.join(TRAIN_STATS_DIR, "train_stats.npz")
            test_dset = preprocess_from_file(train_stats_file,
                                             test_data,
                                             augment=options.augment)
        else:
            print("Preprocess test data using train data...")
            train_dset, val_dset, test_dset = preprocess(
                train_data,
                val_data,
                test_data,
                augment=options.augment,
                train_stats_dir=TRAIN_STATS_DIR)
        # Specify the device:
        if options.device == "cpu":
            device = "/cpu:0"
        elif options.device == "gpu":
            device = "/device:GPU:0"

        if not options.ensemble_folder:
            raise Exception(
                'Please specify ensemble_folder argument to find ensemble folders.'
            )
        elif len(os.listdir(options.ensemble_folder)) == 0:
            raise Exception('Ensemble folder is empty.')

        # Evaluate the ensemble
        evaluate_ensemble(options.ensemble_folder,
                          test_dset,
                          batch_size=64,
                          confusion_mat=options.confusion_mat)

    elif options.mode == "test_single":
        x_test = load_image(
            os.path.join('stylized_images_configs', options.image))
        train_stats_file = os.path.join(TRAIN_STATS_DIR, "train_stats.npz")
        x_test = preprocess_image(train_stats_file,
                                  x_test,
                                  augment=options.augment)

        model_path = os.path.join("test_models", options.model_name)
        predict_image(x_test, model_path)
Example #15
	logging.basicConfig(
		level=logging.INFO,
		format="%(asctime)s [%(levelname)s] %(message)s",
		handlers=[
			logging.FileHandler(logfile, mode='w'),
			logging.StreamHandler()]
	)

	logging.info('Loading dataset...')

	concepts, relation_types, relations = load_umls(umls_directory, data_folder)

	# logging.warning('Testing system with only 1000 examples!!!')
	# relations = relations[:1000]
	concept_list = list(concepts.values())
	train_data, val_data, _ = split_data(relations)
	train_relations_set = set(train_data)
	train_dataset = UmlsRelationDataset(train_data)
	val_dataset = UmlsRelationDataset(val_data)

	callbacks = []
	logging.info('Loading collator...')
	example_creator = NameRelationExampleCreator()
	# train_neg_sampler = BatchNegativeSampler(
	# 	negative_sample_size
	# )
	# val_neg_sampler = train_neg_sampler
	train_neg_sampler = UniformNegativeSampler(
		concept_list,
		train_relations_set,
		negative_sample_size,
Example #16
def train(params):
	hindi_token_ids, hindi_seq_lens, hindi_vocab_dict, hindi_rev_vocab_dict = process_data('../data/hindi_dump.p', max_vocab_size=100000, target_lang=False)
	bengali_token_ids, bengali_seq_lens, bengali_vocab_dict, bengali_rev_vocab_dict = process_data('../data/bengali_dump.p', max_vocab_size=100000, target_lang=True)
	train_encoder_inputs, train_decoder_inputs, train_targets, train_hindi_seq_lens, train_bengali_seq_len, valid_encoder_inputs, valid_decoder_inputs, valid_targets, valid_hindi_seq_lens, valid_bengali_seq_lens = split_data(hindi_token_ids, bengali_token_ids, hindi_seq_lens, bengali_seq_lens,train_ratio=0.8)

	params.hindi_vocab_size = len(hindi_vocab_dict)
	params.bengali_vocab_size = len(bengali_vocab_dict)

	print params.hindi_vocab_size, params.bengali_vocab_size

	with tf.Session() as sess:
		_model = model(params)
		sess.run(tf.global_variables_initializer())
		losses = []
		accs = []
		for epoch_num, epoch in enumerate(generate_epoch(train_encoder_inputs,train_decoder_inputs, train_targets,train_hindi_seq_lens, train_bengali_seq_len,params.num_epochs, params.batch_size)):
			print "EPOCH : ", epoch_num
			sess.run(tf.assign(_model.lr, 0.01 * (0.99 ** epoch_num)))
			batch_loss = []
			batch_acc = []
			for batch_num, (batch_encoder_inputs, batch_decoder_inputs,batch_targets, batch_hindi_seq_lens,batch_bengali_seq_lens) in enumerate(epoch):
				loss, _,acc = _model.step(sess, params,batch_encoder_inputs, batch_decoder_inputs, batch_targets,batch_hindi_seq_lens, batch_bengali_seq_lens,params.dropout)
				batch_loss.append(loss)
				batch_acc.append(acc)
			losses.append(np.mean(batch_loss))
			accs.append(np.mean(batch_acc))
			print "Training Loss: ",losses[-1]
			print "Training Accuracy",accs[-1]
		plt.plot(losses, label='loss')
		plt.legend()
		# plt.show()
		
		plt.title('Plot for Training Error versus Epochs', fontsize='20', style='oblique')
		plt.xlabel('Epochs', fontsize='16', color='green')
		plt.ylabel('Training Error', fontsize='16', color='green')
		plt.savefig('../output/plot.png')
		plt.show()

		acc = _model.test(sess, params, valid_encoder_inputs, valid_decoder_inputs, valid_targets, valid_hindi_seq_lens, valid_bengali_seq_lens, params.dropout)
		print acc
Example #17
def train(FLAGS):

    # Load the data
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/tst2013.en', max_vocab_size=30000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/tst2013.tr', max_vocab_size=30000, target_lang=True)

    # Split into train and validation sets
    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_en_seq_lens, train_sp_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_en_seq_lens, valid_sp_seq_len = \
        split_data(en_token_ids, sp_token_ids, en_seq_lens, sp_seq_lens,
            train_ratio=0.8)
    
    output = open('data/vocab_en.pkl', 'wb')
    pickle.dump(en_vocab_dict, output)
    output.close()
    output = open('data/vocab_sp.pkl', 'wb')
    pickle.dump(sp_vocab_dict, output)
    output.close()

    # Update parameters
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)

    print 'len(en_vocab_dict)', len(en_vocab_dict)
    print 'len(sp_vocab_dict)', len(sp_vocab_dict)
    
    # Start session
    with tf.Session() as sess:
        model = None
        # Create new model or load old one
        f = checkpoint_path + ".index"
        print f
        if os.path.isfile(f):
            model = restore_model(sess)
        else:
            model = create_model(sess, FLAGS)

        # Training begins
        losses = []
        for epoch_num, epoch in enumerate(generate_epoch(train_encoder_inputs,
            train_decoder_inputs, train_targets,
            train_en_seq_lens, train_sp_seq_len,
            FLAGS.num_epochs, FLAGS.batch_size)):

            print "EPOCH: %i" % (epoch_num)
            # Decay learning rate
            sess.run(tf.assign(model.lr, FLAGS.learning_rate * \
                (FLAGS.learning_rate_decay_factor ** epoch_num)))

            batch_loss = []

            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                batch_targets, batch_en_seq_lens,
                batch_sp_seq_lens) in enumerate(epoch):

                loss, _ = model.step(sess, FLAGS,
                    batch_encoder_inputs, batch_decoder_inputs, batch_targets,
                    batch_en_seq_lens, batch_sp_seq_lens,
                    FLAGS.dropout)
                print loss
                batch_loss.append(loss)
            print 'mean: ', np.mean(batch_loss)

            print "Saving the model."
            model.saver.save(sess, checkpoint_path)
Example #18
from os import path
import data_processing
import config
import data_utils
import seq2seq_wrapper

#load data and split into train and test sets
idx_headings, idx_descriptions = data_processing.process_data()
article_metadata = data_processing.unpickle_articles()
(x_train, x_test), (y_train, y_test), (x_valid, y_valid) = data_utils.split_data(idx_descriptions, idx_headings)

#define parameters
xseq_length = x_train.shape[-1]
yseq_length = y_train.shape[-1]
batch_size = config.batch_size
xvocab_size = len(article_metadata['idx2word'])
yvocab_size = xvocab_size
checkpoint_path = path.join(config.path_outputs, 'checkpoint')

print (checkpoint_path)

#define model
model = seq2seq_wrapper.Seq2Seq(xseq_len=xseq_length,
                                yseq_len=yseq_length,
                                xvocab_size=xvocab_size,
                                yvocab_size=yvocab_size,
                                emb_dim=config.embedding_dim,
                                num_layers=3,
                                ckpt_path=checkpoint_path)

val_batch_gen = data_utils.generate_random_batch(x_valid, y_valid, config.batch_size)
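
data_utils.generate_random_batch is not shown; a minimal sketch of an endless random mini-batch generator with that call signature, assuming x_valid and y_valid are padded NumPy arrays indexed along the first axis:

import numpy as np

def generate_random_batch_sketch(x, y, batch_size):
    """Yield random (x, y) mini-batches indefinitely."""
    while True:
        idx = np.random.randint(0, len(x), size=batch_size)
        yield x[idx], y[idx]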
Example #19
def main():
    args = parser.parse_args()
    pprint(args)

    # check and create directories
    if not os.path.exists(args.checkpoint):
        os.makedirs(args.checkpoint)

    if not os.path.exists(args.log):
        os.makedirs(args.log)

    print('==> Preparing data..')
    transformations_train = transforms.Compose([
        data.RandomTranslateWithReflect(32),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(), normalize
    ])

    transformations_test = transforms.Compose(
        [transforms.ToTensor(), normalize])

    mode = {'train': True, 'test': True}

    image_datasets = Cifar10(root='./data',
                             train=True,
                             transform=None,
                             download=True)

    trainData, trainLabel, testData, testLabel = du.split_data(image_datasets,
                                                               select_num=9000)

    unlabeled_idx, labeled_idx = du.split_idx(trainLabel, select_num=21000)

    idx = np.squeeze([0, 1, 2, 3, 4, 7, 8])

    labeled_trainData_75, labeled_trainLabel_75, labeled_trainData_25, labeled_trainLabel_25 = du.split_class(
        trainData[labeled_idx, :, :, :],
        trainLabel[labeled_idx],
        selected_class=idx)

    testData_75, testLabel_75, testData_25, testLabel_25 = du.split_class(
        testData, testLabel, selected_class=idx)

    anchor_idx = du.select_anchors_1(labeled_trainLabel_75,
                                     anchor_num=args.anchor_num)

    print("labeled_idx is:{}".format(labeled_idx.size))
    print("anchor_idx is:{}".format(anchor_idx.size))

    print("unique of labeled_trainLabel_25 is:{}".format(
        np.unique(labeled_trainLabel_25)))

    print("unique of testLabel_25 is:{}".format(np.unique(testLabel_25)))

    dict_data = DT(trainData=labeled_trainData_75[anchor_idx, :, :, :],
                   trainLabel=labeled_trainLabel_75[anchor_idx],
                   transform=transformations_train)
    dict_loader = torch.utils.data.DataLoader(dict_data,
                                              batch_size=args.anchor_num,
                                              shuffle=False,
                                              num_workers=args.workers)

    c_trainData = np.concatenate(
        (trainData[unlabeled_idx, :, :, :], labeled_trainData_75), axis=0)
    c_trainLabel = np.concatenate(
        (trainLabel[unlabeled_idx], labeled_trainLabel_75), axis=0)

    u_trn = unlabeled_idx.size
    l_trn = labeled_trainLabel_75.shape[0]
    n = u_trn + l_trn

    unlabeled_idx = np.squeeze(np.arange(u_trn).astype(np.int32))
    labeled_idx = np.squeeze(np.arange(u_trn, n).astype(np.int32))

    mask_labels = np.squeeze(np.zeros((n, 1)))
    mask_labels[u_trn:n] = 1

    train_data = DT(trainData=c_trainData,
                    trainLabel=c_trainLabel,
                    transform=transformations_train)
    test_data = DT(trainData=testData_25,
                   trainLabel=testLabel_25,
                   transform=transformations_test)

    labeled_trainData_25 = np.concatenate((labeled_trainData_25, testData_75),
                                          axis=0)
    labeled_trainLabel_25 = np.concatenate(
        (labeled_trainLabel_25, testLabel_75), axis=0)

    train_data_test = DT(trainData=labeled_trainData_25,
                         trainLabel=labeled_trainLabel_25,
                         transform=transformations_test)
    train_loader_test = torch.utils.data.DataLoader(train_data_test,
                                                    batch_size=100,
                                                    shuffle=True,
                                                    num_workers=args.workers)

    batch_sampler = data.TwoStreamBatchSampler(unlabeled_idx, labeled_idx,
                                               args.batch_size, 50)

    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_sampler=batch_sampler,
                                               num_workers=args.workers,
                                               pin_memory=True)

    # train_loader = torch.utils.data.DataLoader(train_data, batch_size=100,
    #                                           shuffle=True, num_workers=args.workers)

    test_loader = torch.utils.data.DataLoader(test_data,
                                              batch_size=100,
                                              shuffle=False,
                                              num_workers=args.workers)

    for iter in range(5):
        model = pre_train(train_loader,
                          test_loader,
                          dict_loader,
                          train_loader_test,
                          mask_labels,
                          total_epochs=100,
                          use_gpu=True,
                          seed=args.seed)
Example #20
        TARGET_POSITIVE = "p"
    elif test_type == "class":
        tag_attr = "report_class"
        TARGET_POSITIVE = TARGET_CLASS
    else:
        raise ValueError("Unknown tag: " + test_type)

    data = data_utils.read_from_csv(data_file)
    filtered_data = [x for x in data if getattr(x, tag_attr) != "" and getattr(x, tag_attr) != "u"]
    filtered_data = filtered_data[:2500]  # put a limit on the size for performance

    labels = [np.float32(getattr(x, tag_attr) == TARGET_POSITIVE) for x in filtered_data]
    report_ids = [x.report_id for x in filtered_data]
    sentences = [x.processed_sentence for x in filtered_data]

    train_data, train_labels, test_data, test_labels = data_utils.split_data(sentences, labels, report_ids, split_value)

    # Create transformation pipeline
    if USE_RF:
        pipe = pipelines.get_count_lsi_randomforest()
    else:
        pipe = pipelines.get_count_lsi_SVM()

    # set pipe parameters and train model
    pipe.set_params(**model_params)
    pipe.fit(train_data, train_labels)

    print "Total = " + str(len(filtered_data)) + " [" + str(labels.count(0)) + ", " + str(labels.count(1)) + "]"
    print "Train = " + str(len(train_data)) + " [" + str(train_labels.count(0)) + ", " + str(
        train_labels.count(1)
    ) + "]"
Example #21
def main(FLAGS):

    # set seed
    np.random.seed(FLAGS.seed)
    tf.set_random_seed(FLAGS.seed)

    with tf.device('/cpu:0'), tf.name_scope('input'):

        # load dataset into main memory
        data, meta = load_data(FLAGS.dataset_root,
                               FLAGS.dataset,
                               is_training=True)
        train_data, val_data = split_data(data, FLAGS.validate_rate)

        # build tf_dataset for training
        train_dataset = (tf.data.Dataset.from_tensor_slices(train_data).map(
            preprocess_for_train(FLAGS.dataset not in ['mnist', 'svhn']),
            8).shuffle(10000,
                       seed=FLAGS.seed).batch(FLAGS.batch_size).prefetch(1))

        # build tf_dataset for val
        val_dataset = (tf.data.Dataset.from_tensor_slices(val_data).map(
            preprocess_for_eval, 8).batch(FLAGS.batch_size).prefetch(1))

        # clean up and release memory
        del data, train_data, val_data

        # construct data iterator
        data_iterator = tf.data.Iterator.from_structure(
            train_dataset.output_types, train_dataset.output_shapes)

        # construct iterator initializer for training and validation
        train_data_init = data_iterator.make_initializer(train_dataset)
        val_data_init = data_iterator.make_initializer(val_dataset)

    # define useful scalars
    learning_rate = tf.placeholder(tf.float32, shape=(), name='learning_rate')
    tf.summary.scalar('lr', learning_rate)
    is_training = tf.placeholder(tf.bool, [], name='is_training')
    global_step = tf.train.create_global_step()

    # define optimizer
    optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)

    # build the net
    model = importlib.import_module('models.{}'.format(FLAGS.model))
    net = model.Net(meta['n_class'], FLAGS.weight_decay)

    # get data from data iterator
    images, labels = data_iterator.get_next()
    tf.summary.image('images', tf.transpose(images, [0, 2, 3, 1]))

    # get logits
    logits = net(images, is_training)
    tf.summary.histogram('logits', logits)

    # summary variable defined in net
    for w in net.global_variables:
        tf.summary.histogram(w.name, w)

    with tf.name_scope('losses'):
        # compute loss
        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                      logits=logits)

        # compute l2 regularization
        l2_reg = tf.losses.get_regularization_loss()

    with tf.name_scope('metrics') as scope:

        mean_loss, mean_loss_update_op = tf.metrics.mean(loss,
                                                         name='mean_loss')

        prediction = tf.argmax(logits, axis=1)
        accuracy, accuracy_update_op = tf.metrics.accuracy(labels,
                                                           prediction,
                                                           name='accuracy')

        reset_metrics = tf.variables_initializer(
            tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope))
        metrics_update_op = tf.group(mean_loss_update_op, accuracy_update_op)

        # collect the metric summaries separately, because they need to be
        # evaluated after the metric update ops have run
        metric_summary = [
            tf.summary.scalar('loss', mean_loss, collections=[]),
            tf.summary.scalar('accuracy', accuracy, collections=[])
        ]

    # compute grad
    grads_and_vars = optimizer.compute_gradients(loss + l2_reg)

    # summary grads
    for g, v in grads_and_vars:
        tf.summary.histogram(v.name + '/grad', g)

    # run train_op and update_op together
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group(train_op, *update_ops)

    # build summary
    train_summary_str = tf.summary.merge_all()
    metric_summary_str = tf.summary.merge(metric_summary)

    # init op
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())

    # prepare for the logdir
    if not tf.gfile.Exists(FLAGS.logdir):
        tf.gfile.MakeDirs(FLAGS.logdir)

    # saver
    saver = tf.train.Saver(max_to_keep=FLAGS.n_epoch)

    # summary writer
    train_writer = tf.summary.FileWriter(os.path.join(FLAGS.logdir, 'train'),
                                         tf.get_default_graph())
    val_writer = tf.summary.FileWriter(os.path.join(FLAGS.logdir, 'val'),
                                       tf.get_default_graph())

    # session
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False,
                            intra_op_parallelism_threads=4,
                            inter_op_parallelism_threads=4)
    config.gpu_options.allow_growth = True

    sess = tf.Session(config=config)

    # do initialization
    sess.run(init_op)

    # restore
    if FLAGS.restore:
        saver.restore(sess, FLAGS.restore)

    lr_boundaries = list(map(int, FLAGS.boundaries.split(',')))
    lr_values = list(map(float, FLAGS.values.split(',')))
    lr_manager = LRManager(lr_boundaries, lr_values)
    time_meter = TimeMeter()

    # start to train
    for e in range(FLAGS.n_epoch):
        print('-' * 40)
        print('Epoch: {:d}'.format(e))

        # training loop
        try:
            i = 0
            sess.run([train_data_init, reset_metrics])
            while True:

                lr = lr_manager.get(e)
                fetch = [train_summary_str] if i % FLAGS.log_every == 0 else []

                time_meter.start()
                result = sess.run([train_op, metrics_update_op] + fetch, {
                    learning_rate: lr,
                    is_training: True
                })
                time_meter.stop()

                if i % FLAGS.log_every == 0:
                    # fetch summary str
                    t_summary = result[-1]
                    t_metric_summary = sess.run(metric_summary_str)

                    t_loss, t_acc = sess.run([mean_loss, accuracy])
                    sess.run(reset_metrics)

                    spd = FLAGS.batch_size / time_meter.get()
                    time_meter.reset()

                    print(
                        'Iter: {:d}, LR: {:g}, Loss: {:.4f}, Acc: {:.2f}, Spd: {:.2f} i/s'
                        .format(i, lr, t_loss, t_acc, spd))

                    train_writer.add_summary(t_summary,
                                             global_step=sess.run(global_step))
                    train_writer.add_summary(t_metric_summary,
                                             global_step=sess.run(global_step))

                i += 1
        except tf.errors.OutOfRangeError:
            pass

        # save checkpoint
        saver.save(sess,
                   '{}/{}'.format(FLAGS.logdir, FLAGS.model),
                   global_step=sess.run(global_step),
                   write_meta_graph=False)

        # val loop
        try:
            sess.run([val_data_init, reset_metrics])
            while True:
                sess.run([metrics_update_op], {is_training: False})
        except tf.errors.OutOfRangeError:
            pass

        v_loss, v_acc = sess.run([mean_loss, accuracy])
        print('[VAL]Loss: {:.4f}, Acc: {:.2f}'.format(v_loss, v_acc))

        val_writer.add_summary(sess.run(metric_summary_str),
                               global_step=sess.run(global_step))

    print('-' * 40)
Example #22
def main(*kargs, **kwargs):

    # ============ Parse global parameters ============
    get_kwargs(kwargs)
    test_fname = kwargs['test']
    embeds_type = kwargs['embeds_type']
    logger_fname = kwargs['logger']
    # warm_start = kwargs['warm_start']
    # model_warm_start = [model.lower() for model in kwargs['model_warm_start']]
    config = kwargs['config']
    train_clean = kwargs['train_clean']
    train_labels = kwargs['train_labels']
    test_clean = kwargs['test_clean']
    embeds_clean = kwargs['embeds_clean']
    result_path = './outputs/'
    oof_path = './oof_predictions'

    if not os.path.exists(result_path):
        os.mkdir(result_path)
    if not os.path.exists(oof_path):
        os.mkdir(oof_path)

    # ==== Create logger ====
    logger = Logger(logging.getLogger(), logger_fname)

    # ==== Load data ====
    logger.info('Loading data...')
    test_df = load_data(test_fname)
    train_x = np.load(train_clean)
    test_x = np.load(test_clean)
    embedding_matrix = np.load(embeds_clean)
    train_y = np.load(train_labels)

    target_labels = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]

    # ==== Splitting training data ====
    x_train_nn, x_eval_nn, y_train_nn, y_eval_nn, train_idxs, eval_idxs = split_data(
        train_x, train_y, eval_size=0.1, shuffle=True, random_state=42)
    logger.debug('X shape = {}'.format(np.shape(x_train_nn)))

    # ============= Load params of models =============
    params = Params(config)
    models = params.get('models')

    # ============ Train models =============
    for model_name in models:
        model_func = get_model(model_name, embedding_matrix, params)
        if params.get(model_name).get('folding'):
            # =========== Training on folds ============
            batch_size = params.get(model_name).get('batch_size')

            logger.debug(
                'Starting {0} training on folds...'.format(model_name))
            models, val_predictions = train_folds(
                train_x,
                train_y,
                params.get(model_name).get('num_folds'),
                batch_size,
                model_func,
                params.get(model_name).get('optimizer'),
                logger=logger)
            val_predictions_array = np.concatenate(
                [minmax_scale(fold) for fold in val_predictions], axis=0)
            np.save(
                os.path.join(oof_path,
                             "{0}_{1}_oof.npy".format(model_name,
                                                      embeds_type)),
                val_predictions_array)
            logger.debug('Predicting results...')
            test_predicts_list = []
            for fold_id, model in enumerate(models):
                model_path = os.path.join(
                    result_path,
                    "{1}_{0}_{2}_weights.npy".format(fold_id, model_name,
                                                     embeds_type))
                np.save(model_path, model.get_weights())

                test_predicts_path = os.path.join(
                    result_path, "{1}_{2}_test_predicts{0}.npy".format(
                        fold_id, model_name, embeds_type))
                test_predictions = model.predict(test_x, batch_size=batch_size)
                test_predicts_list.append(test_predictions)
                np.save(test_predicts_path, test_predictions)

            test_predictions = np.ones(test_predicts_list[0].shape)
            for fold_predict in test_predicts_list:
                test_predictions *= minmax_scale(fold_predict)
            if params.get(model_name).get('norm_folds'):
                test_predictions **= (1. / len(test_predicts_list))

            logger.info('Saving prediction...')
            test_ids = test_df["id"].values
            test_ids = test_ids.reshape((len(test_ids), 1))

            test_predictions = pd.DataFrame(data=test_predictions,
                                            columns=target_labels)
            test_predictions["id"] = test_ids
            test_predictions = test_predictions[["id"] + target_labels]
            submit_path = os.path.join(
                result_path,
                "{0}_{1}_folds.submit".format(model_name, embeds_type))
            test_predictions.to_csv(submit_path, index=False)

        else:
            # ============ Single model training =============
            logger.info('Training a single {0} model...'.format(model_name))
            model = model_func()
            model_tr = _train_model(
                model,
                batch_size=params.get(model_name).get('batch_size'),
                train_x=x_train_nn,
                train_y=y_train_nn,
                val_x=x_eval_nn,
                val_y=y_eval_nn,
                opt=params.get(model_name).get('optimizer'),
                logger=logger)
            test_predictions = model_tr.predict(
                test_x, batch_size=params.get(model_name).get('batch_size'))

            # ============== Saving trained parameters ================
            logger.info('Saving model parameters...')
            model_path = os.path.join(
                result_path,
                "{0}_{1}_weights.npy".format(model_name, embeds_type))
            np.save(model_path, model.get_weights())

            # ============== Postprocessing ===============

            # test_predictions **= PROBABILITIES_NORMALIZE_COEFFICIENT

            # ============== Saving predictions ==============

            logger.info('Saving predictions...')
            test_ids = test_df["id"].values
            test_ids = test_ids.reshape((len(test_ids), 1))

            test_predicts = pd.DataFrame(data=test_predictions,
                                         columns=target_labels)
            test_predicts["id"] = test_ids
            test_predicts = test_predicts[["id"] + target_labels]
            submit_path = os.path.join(
                result_path, "{0}_{1}.csv".format(model_name, embeds_type))
            test_predicts.to_csv(submit_path, index=False)
Example #23
def main():
    args = parser.parse_args()
    pprint(args)

    # check and create directories
    if not os.path.exists(args.checkpoint):
        os.makedirs(args.checkpoint)

    if not os.path.exists(args.log):
        os.makedirs(args.log)

    print('==> Preparing data..')
    transformations_train = transforms.Compose([
        data.RandomTranslateWithReflect(32),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(), normalize
    ])

    transformations_test = transforms.Compose(
        [transforms.ToTensor(), normalize])

    mode = {'train': True, 'test': True}

    image_datasets = Cifar10(root='./data',
                             train=True,
                             transform=None,
                             download=True)
    trainData, trainLabel, testData, testLabel = du.split_data(image_datasets,
                                                               select_num=1000)
    unlabeled_idx, labeled_idx = du.split_idx(trainLabel, select_num=5000)

    anchor_idx = du.select_anchors(trainLabel,
                                   labeled_idx,
                                   anchor_num=args.anchor_num)

    print("labeled_idx is:{}".format(labeled_idx.size))
    print("anchor_idx is:{}".format(anchor_idx.size))

    dict_data = DT(trainData=trainData[anchor_idx, :, :, :],
                   trainLabel=trainLabel[anchor_idx],
                   transform=transformations_train)
    dict_loader = torch.utils.data.DataLoader(dict_data,
                                              batch_size=args.anchor_num,
                                              shuffle=False,
                                              num_workers=args.workers)

    n = trainLabel.shape[0]

    mask_labels = np.squeeze(np.zeros((n, 1)))
    mask_labels[labeled_idx] = 1

    train_data = DT(trainData=trainData,
                    trainLabel=trainLabel,
                    transform=transformations_train)
    test_data = DT(trainData=testData,
                   trainLabel=testLabel,
                   transform=transformations_test)

    train_data_test = DT(trainData=trainData,
                         trainLabel=trainLabel,
                         transform=transformations_test)
    train_loader_test = torch.utils.data.DataLoader(train_data_test,
                                                    batch_size=args.batch_size,
                                                    shuffle=False,
                                                    num_workers=args.workers)

    batch_sampler = data.TwoStreamBatchSampler(unlabeled_idx, labeled_idx,
                                               args.batch_size, 40)

    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_sampler=batch_sampler,
                                               num_workers=args.workers,
                                               pin_memory=True)

    test_loader = torch.utils.data.DataLoader(test_data,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=args.workers)

    model = pre_train(train_loader,
                      test_loader,
                      dict_loader,
                      train_loader_test,
                      mask_labels,
                      total_epochs=100,
                      use_gpu=True,
                      seed=args.seed)
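
du.split_data, du.split_idx, and du.select_anchors are dataset utilities that are not part of this listing. A minimal sketch of the unlabeled/labeled index split used above, assuming select_num is the number of labeled samples and that they are drawn roughly evenly per class (the real helper may choose differently):

import numpy as np

def split_idx_sketch(labels, select_num, seed=0):
    """Return (unlabeled_idx, labeled_idx) over the indices of `labels`."""
    labels = np.asarray(labels)
    rng = np.random.RandomState(seed)
    classes = np.unique(labels)
    per_class = select_num // len(classes)

    labeled_parts = []
    for c in classes:
        class_idx = np.where(labels == c)[0]
        rng.shuffle(class_idx)
        labeled_parts.append(class_idx[:per_class])
    labeled_idx = np.concatenate(labeled_parts)
    unlabeled_idx = np.setdiff1d(np.arange(len(labels)), labeled_idx)
    return unlabeled_idx, labeled_idx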