def run_iters_experiment(dir_path, file_name):
    res = {}
    res['x'] = []
    res['y'] = []
    for num_of_iters in ITERS:
        X, y, view1, view2 = du.extract_data(dir_path + '/' + file_name,
                                             file_name)
        acc = []
        for i in range(10):
            print("CV: " + str(i))
            X, y = shuffle(X, y)
            X_labeled, X_unlabeled, y_labeled, X_test, y_test = du.split_data(
                X, y, train_test_split=0.8, labeled_unlabeled_split=0.2)
            base_models = [
                BASE_MODELS[ITERS_BASE_MODEL], BASE_MODELS[ITERS_BASE_MODEL]
            ]
            model = SP_coTrain.SP_coTrain(base_models, num_of_iters,
                                          add_rate=0.1, gamma=0.5)
            model.fit(X_labeled, X_unlabeled, y_labeled, view1, view2)
            y_pred = model.predict(X_test)
            acc.append(accuracy_score(y_test, y_pred))
        res['x'].append(num_of_iters)
        res['y'].append(sum(acc) / len(acc))
    plot_graph(res)
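# `plot_graph` is not defined in this snippet; a minimal sketch of what it
# could look like, assuming `res` maps 'x' to iteration counts and 'y' to
# mean accuracies as built above (matplotlib assumed):
def plot_graph(res):
    import matplotlib.pyplot as plt
    plt.plot(res['x'], res['y'], marker='o')
    plt.xlabel('number of co-training iterations')
    plt.ylabel('mean accuracy')
    plt.show()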
def train(FLAGS):
    # Load the data
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/en.p', max_vocab_size=5000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/sp.p', max_vocab_size=5000, target_lang=True)

    # Split into train and validation sets
    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_en_seq_lens, train_sp_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_en_seq_lens, valid_sp_seq_len = \
        split_data(en_token_ids, sp_token_ids, en_seq_lens, sp_seq_lens,
                   train_ratio=0.8)

    # Update parameters
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)

    # Start session
    with tf.Session() as sess:
        # Create new model or load old one
        model = create_model(sess, FLAGS)

        # Training begins
        losses = []
        for epoch_num, epoch in enumerate(
                generate_epoch(train_encoder_inputs, train_decoder_inputs,
                               train_targets, train_en_seq_lens,
                               train_sp_seq_len, FLAGS.num_epochs,
                               FLAGS.batch_size)):
            print("EPOCH: %i" % epoch_num)

            # Decay learning rate
            sess.run(tf.assign(model.lr, FLAGS.learning_rate *
                               (FLAGS.learning_rate_decay_factor ** epoch_num)))

            batch_loss = []
            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                            batch_targets, batch_en_seq_lens,
                            batch_sp_seq_lens) in enumerate(epoch):
                loss, _ = model.step(sess, FLAGS, batch_encoder_inputs,
                                     batch_decoder_inputs, batch_targets,
                                     batch_en_seq_lens, batch_sp_seq_lens,
                                     FLAGS.dropout)
                batch_loss.append(loss)
            losses.append(np.mean(batch_loss))

    plt.plot(losses, label='loss')
    plt.legend()
    plt.show()
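# The schedule above is plain exponential decay, lr_e = lr_0 * decay**e.
# A quick illustration with made-up values (not taken from this code):
lr_0, decay = 0.1, 0.99
for e in [0, 10, 100]:
    print(e, lr_0 * decay ** e)   # 0.1, ~0.0904, ~0.0366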
def predict_on_stocks(array: numpy.array, model_path: str, interval: str,
                      stock_path: str):
    scaler = StandardScaler()
    open_data, close_data = init_data(array)
    open_data, close_data = normalize_data(open_data, close_data, scaler)
    (x_train, y_train, x_test, y_test) = split_data(open_data, close_data)
    (x_train, y_train) = shuffle_data(x_train, y_train)
    (model, checkpoint_callback) = create_model(model_path)

    model.fit(x_train, y_train, validation_data=(x_test, y_test),
              batch_size=64, epochs=EPOCHS, callbacks=[checkpoint_callback])
    # test_model(model, x_test, y_test, scaler, interval)  # uncomment to evaluate the model on the test set
    dump(scaler, f'{model_path}/std_scaler.bin', compress=True)
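# `normalize_data` is project code that is not shown; a minimal sketch of the
# assumed behaviour — fit the StandardScaler on one price series and reuse it
# on the other, which is consistent with the scaler being dumped above for
# reuse at prediction time. Shapes and names are assumptions.
def normalize_data(open_data, close_data, scaler):
    open_data = scaler.fit_transform(open_data.reshape(-1, 1))
    close_data = scaler.transform(close_data.reshape(-1, 1))
    return open_data, close_data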
def cross_validation(dir_path, file_name, cv=10, base_model="RandomForest",
                     labeled_rate=0.2):
    X, y, view1, view2 = du.extract_data(dir_path + '/' + file_name, file_name)
    res_spaco = []
    res_co = []
    res_base = []
    res_reg_base = []
    for i in range(cv):
        print("CV: " + str(i))
        X, y = shuffle(X, y)
        X_labeled, X_unlabeled, y_labeled, X_test, y_test = du.split_data(
            X, y, train_test_split=0.8, labeled_unlabeled_split=labeled_rate)
        res_spaco.append(
            evaluate_model("spaco", base_model, X_labeled, X_unlabeled,
                           y_labeled, X_test, y_test, view1, view2,
                           labeled_rate))
        res_co.append(
            evaluate_model("co", base_model, X_labeled, X_unlabeled,
                           y_labeled, X_test, y_test, view1, view2,
                           labeled_rate))
        res_base.append(
            evaluate_model("base", base_model, X_labeled, X_unlabeled,
                           y_labeled, X_test, y_test, view1, view2,
                           labeled_rate))
        res_reg_base.append(
            evaluate_model("reg_base", base_model, X_labeled, X_unlabeled,
                           y_labeled, X_test, y_test, view1, view2,
                           labeled_rate))
    return res_spaco, res_co, res_base, res_reg_base
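# `du.split_data` is external; a minimal sketch of the assumed contract, given
# how it is called above: hold out (1 - train_test_split) of the data for
# testing, then keep labels for only `labeled_unlabeled_split` of the training
# portion. Names and exact semantics are assumptions.
def split_data(X, y, train_test_split=0.8, labeled_unlabeled_split=0.2):
    n = len(X)
    n_train = int(n * train_test_split)
    n_labeled = int(n_train * labeled_unlabeled_split)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    X_labeled, y_labeled = X_train[:n_labeled], y_train[:n_labeled]
    X_unlabeled = X_train[n_labeled:]  # labels for this slice are discarded
    return X_labeled, X_unlabeled, y_labeled, X_test, y_test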
def main(args):
    # set random seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    print(args.dataset_root)

    # load data
    data, meta = data_utils.load_data(args.dataset_root, args.dataset,
                                      is_training=True)
    train_data, val_data = data_utils.split_data(data, args.validate_rate,
                                                 shuffle=True)

    # build train dataloader
    train_dataset = data_utils.ImageDataset(
        *train_data, is_training=True,
        is_flip=args.dataset not in ['mnist', 'svhn'])
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   args.batch_size,
                                                   shuffle=True,
                                                   num_workers=2,
                                                   pin_memory=True)
    # build val dataloader
    val_dataset = data_utils.ImageDataset(*val_data, is_training=False)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, args.batch_size,
                                                 shuffle=False, num_workers=2,
                                                 pin_memory=True)
    # remove temp dataset variables to reduce memory usage
    del data, train_data, val_data

    device = torch.device(args.device)

    # build model
    if args.model == 'resnet_20':
        model = models.Resnet20
    else:
        model = models.SimpleCNN
    net = model(train_dataset.shape, meta['n_class']).to(device=device)
    net.apply(init_param)
    criterion = torch.nn.CrossEntropyLoss()

    # build optim
    optim = torch.optim.SGD(make_param_groups(net, args.weight_decay), 0.1,
                            momentum=0.9)

    # make log directory
    logdir = Path(args.logdir)
    if not logdir.exists():
        os.makedirs(str(logdir))

    global_step = 0
    start_epoch = 0
    if args.restore:
        # restore checkpoint
        state = torch.load(args.restore)
        start_epoch = state['epoch'] + 1
        global_step = state['global_step']
        net.load_state_dict(state['net'])
        optim.load_state_dict(state['optim'])

    # lr strategy
    lr_boundaries = list(map(int, args.boundaries.split(',')))
    lr_values = list(map(float, args.values.split(',')))
    lr_manager = LRManager(lr_boundaries, lr_values)

    for e in range(start_epoch, args.n_epoch):
        print('-------epoch: {:d}-------'.format(e))

        # training phase
        net.train()
        mean_loss, acc = MeanValue(), Accuracy()
        lr_manager.set_lr_for_optim(e, optim)
        tm = TimeMeter()
        tm.start()
        train_log = {}
        for i, (x, y) in enumerate(train_dataloader):
            tm.add_counter()
            if device.type == 'cuda':
                x = x.cuda(device, non_blocking=True)
                y = y.cuda(device, non_blocking=True)
            optim.zero_grad()
            logits = net(x)
            loss = criterion(logits, y)
            loss.backward()
            optim.step()
            global_step += 1

            loss = loss.detach().cpu().numpy()
            predicts = torch.argmax(logits, dim=1).detach().cpu().numpy()
            y = y.detach().cpu().numpy()
            mean_loss.add(loss)
            acc.add(predicts, y)

            if i % args.log_every == 0:
                torch.cuda.synchronize()
                tm.stop()
                print('step: {:d}, lr: {:g}, loss: {:.4f}, acc: {:.2%}, '
                      'speed: {:.2f} i/s.'.format(i, lr_manager.get(e),
                                                  mean_loss.get(), acc.get(),
                                                  args.batch_size / tm.get()))
                train_log[global_step] = {
                    'loss': mean_loss.get(),
                    'acc': acc.get()
                }
                tm.reset()
                tm.start()
                mean_loss.reset()
                acc.reset()

        # val phase
        net.eval()
        mean_loss, acc = MeanValue(), Accuracy()
        for x, y in val_dataloader:
            if device.type == 'cuda':
                x = x.cuda(device, non_blocking=True)
                y = y.cuda(device, non_blocking=True)
            logits = net(x)
            loss = criterion(logits, y)

            loss = loss.detach().cpu().numpy()
            predicts = torch.argmax(logits, dim=1).detach().cpu().numpy()
            y = y.detach().cpu().numpy()
            mean_loss.add(loss)
            acc.add(predicts, y)
        print('val_loss: {:.4f}, val_acc: {:.2%}'.format(mean_loss.get(),
                                                         acc.get()))
        val_log = {global_step: {'loss': mean_loss.get(), 'acc': acc.get()}}

        # save checkpoint
        vars_to_saver = {
            'net': net.state_dict(),
            'optim': optim.state_dict(),
            'epoch': e,
            'global_step': global_step
        }
        cpt_file = logdir / 'checkpoint_{:d}.pk'.format(e)
        torch.save(vars_to_saver, str(cpt_file))
        log_file = logdir / 'log_{:d}.pk'.format(e)
        torch.save({'train': train_log, 'val': val_log}, str(log_file))
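# `LRManager` is not shown in this snippet; a plausible sketch, assuming it
# implements a piecewise-constant schedule where `values[i]` is used until
# epoch `boundaries[i]`. The class and method names come from the calls above;
# the body is an assumption.
class LRManager:
    def __init__(self, boundaries, values):
        assert len(values) == len(boundaries) + 1
        self.boundaries = boundaries
        self.values = values

    def get(self, epoch):
        # return the value for the interval the epoch falls into
        for b, v in zip(self.boundaries, self.values):
            if epoch < b:
                return v
        return self.values[-1]

    def set_lr_for_optim(self, epoch, optim):
        lr = self.get(epoch)
        for group in optim.param_groups:
            group['lr'] = lr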
def train(FLAGS):
    # Load the data
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/en.p', max_vocab_size=5000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/sp.p', max_vocab_size=5000, target_lang=True)

    # Split into train and validation sets
    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_en_seq_lens, train_sp_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_en_seq_lens, valid_sp_seq_len = \
        split_data(en_token_ids, sp_token_ids, en_seq_lens, sp_seq_lens,
                   train_ratio=0.8)

    # Update parameters
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)
    FLAGS.sp_max_len = max(sp_seq_lens) + 1  # GO token

    # Start session
    with tf.Session() as sess:
        # Create new model or load old one
        model = create_model(sess, FLAGS, forward_only=False)

        # Training begins
        train_losses = []
        valid_losses = []
        for epoch_num, epoch in enumerate(
                generate_epoch(train_encoder_inputs, train_decoder_inputs,
                               train_targets, train_en_seq_lens,
                               train_sp_seq_len, FLAGS.num_epochs,
                               FLAGS.batch_size)):
            print("EPOCH: %i" % epoch_num)

            # Decay learning rate
            sess.run(tf.assign(model.lr, FLAGS.learning_rate *
                               (FLAGS.learning_rate_decay_factor ** epoch_num)))

            batch_loss = []
            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                            batch_targets, batch_en_seq_lens,
                            batch_sp_seq_lens) in enumerate(epoch):
                y_pred, loss, _ = model.step(sess, FLAGS,
                                             batch_encoder_inputs,
                                             batch_decoder_inputs,
                                             batch_targets,
                                             batch_en_seq_lens,
                                             batch_sp_seq_lens,
                                             FLAGS.dropout,
                                             forward_only=False)
                batch_loss.append(loss)
            train_losses.append(np.mean(batch_loss))

            for valid_epoch_num, valid_epoch in enumerate(
                    generate_epoch(valid_encoder_inputs, valid_decoder_inputs,
                                   valid_targets, valid_en_seq_lens,
                                   valid_sp_seq_len, num_epochs=1,
                                   batch_size=FLAGS.batch_size)):
                batch_loss = []
                for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                                batch_targets, batch_en_seq_lens,
                                batch_sp_seq_lens) in enumerate(valid_epoch):
                    loss = model.step(sess, FLAGS, batch_encoder_inputs,
                                      batch_decoder_inputs, batch_targets,
                                      batch_en_seq_lens, batch_sp_seq_lens,
                                      dropout=0.0, forward_only=True,
                                      sampling=False)
                    batch_loss.append(loss)
                valid_losses.append(np.mean(batch_loss))

        # Save checkpoint.
        if not os.path.isdir(FLAGS.ckpt_dir):
            os.makedirs(FLAGS.ckpt_dir)
        checkpoint_path = os.path.join(FLAGS.ckpt_dir, "model.ckpt")
        print("Saving the model.")
        model.saver.save(sess, checkpoint_path,
                         global_step=model.global_step)

    plt.plot(train_losses, label='train_loss')
    plt.plot(valid_losses, label='valid_loss')
    plt.legend()
    plt.show()
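# `generate_epoch` is defined elsewhere; a minimal sketch of the assumed
# behaviour, based on how it is consumed above: yield `num_epochs` epoch
# generators, each of which yields tuples of batch-sized slices of the inputs.
def generate_epoch(encoder_inputs, decoder_inputs, targets, en_seq_lens,
                   sp_seq_lens, num_epochs, batch_size):
    for _ in range(num_epochs):
        yield generate_batches(encoder_inputs, decoder_inputs, targets,
                               en_seq_lens, sp_seq_lens, batch_size)

def generate_batches(encoder_inputs, decoder_inputs, targets, en_seq_lens,
                     sp_seq_lens, batch_size):
    for start in range(0, len(encoder_inputs), batch_size):
        end = start + batch_size
        yield (encoder_inputs[start:end], decoder_inputs[start:end],
               targets[start:end], en_seq_lens[start:end],
               sp_seq_lens[start:end])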
import tensorflow as tf
import numpy as np
import data_processing
import config
import data_utils
import seq2seq_wrapper
from os import path

# load data and split into train and test sets
idx_headings, idx_descriptions = data_processing.process_data()
article_metadata = data_processing.unpickle_articles()
(x_train, x_test), (y_train, y_test), (x_valid, y_valid) = \
    data_utils.split_data(idx_descriptions, idx_headings)

# define parameters
xseq_length = x_train.shape[-1]
yseq_length = y_train.shape[-1]
batch_size = config.batch_size
xvocab_size = len(article_metadata['idx2word'])
yvocab_size = xvocab_size
checkpoint_path = path.join(config.path_outputs, 'checkpoint')
print(checkpoint_path)

# define model
model = seq2seq_wrapper.Seq2Seq(xseq_len=xseq_length,
                                yseq_len=yseq_length,
                                xvocab_size=xvocab_size,
                                yvocab_size=yvocab_size,
                                emb_dim=config.embedding_dim,
                                num_layers=3,
                                ckpt_path=checkpoint_path)
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform
from data_utils import get_data, split_data
from utils import DataFrameSelector, CombinedAttributesAdder, CustomLabelBinarizer

# loading the dataset
housing = get_data()

# split into train_set and test_set
train_set, test_set = split_data(housing)
housing_train = train_set.drop(
    "median_house_value", axis=1)  # median_house_value column contains the target values
housing_test = test_set.drop("median_house_value", axis=1)
housing_train_labels = train_set["median_house_value"].copy()
housing_test_labels = test_set["median_house_value"].copy()

# data preparation and prediction going to be done in a pipeline
# pipeline to preprocess numerical features
numerical_attributes = train_set.drop(
    ['ocean_proximity', 'median_house_value'], axis=1).columns
numerical_pipeline = Pipeline([
    # steps assumed from the transformers imported above
    ('selector', DataFrameSelector(numerical_attributes)),
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
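# `DataFrameSelector` comes from the local `utils` module and is not shown; a
# common implementation (assumed here) is a transformer that selects a subset
# of DataFrame columns and returns them as a NumPy array:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.attribute_names].values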
# seq2seq train
import tensorflow as tf
import numpy as np
import data_processing
import config
import data_utils
import seq2seq_wrapper
from os import path

# load data and split into train and test sets
idx_headings, idx_descriptions = data_processing.process_data()
article_metadata = data_processing.unpickle_articles()
(x_train, x_test), (y_train, y_test), (x_valid, y_valid) = \
    data_utils.split_data(idx_descriptions, idx_headings)

# define parameters
xseq_length = x_train.shape[-1]
yseq_length = y_train.shape[-1]
batch_size = config.batch_size
xvocab_size = len(article_metadata['idx2word'])
yvocab_size = xvocab_size
checkpoint_path = path.join(config.path_outputs, 'checkpoint')
print(checkpoint_path)

# define model (trailing arguments completed from the identical call in the
# first copy of this script)
model = seq2seq_wrapper.Seq2Seq(xseq_len=xseq_length,
                                yseq_len=yseq_length,
                                xvocab_size=xvocab_size,
                                yvocab_size=yvocab_size,
                                emb_dim=config.embedding_dim,
                                num_layers=3,
                                ckpt_path=checkpoint_path)
if __name__ == "__main__":
    if len(sys.argv) != 3:  # script name plus the two arguments below
        print("USAGE: " + sys.argv[0] + " input_file output_model_file")
        sys.exit(1)

    input_file = sys.argv[1]
    output_model_file = sys.argv[2]

    data = data_utils.read_from_csv(input_file)
    filtered_data = [x for x in data
                     if x.diag_tag != "" and x.diag_tag != "u"]
    labels = [np.float32(x.diag_tag == "p") for x in filtered_data]
    data = [x.processed_sentence for x in filtered_data]
    report_ids = [x.report_id for x in filtered_data]

    train_data, train_labels, test_data, test_labels = \
        data_utils.split_data(data, labels, report_ids, split=0.7)

    # change these parameters for the grid search
    # parameters = {'lsi__n_components': [100],
    #               'classifier__C': [3, 4, 5, 6, 7, 8, 9, 10],
    #               'classifier__kernel': ["rbf"]
    #               }
    parameters = {'lsi__n_components': [100],
                  'classifier__n_estimators': [1000],
                  'classifier__max_depth': [5, 10],
                  'classifier__min_samples_split': [5, 10],
                  'classifier__min_samples_leaf': [5, 10],
                  }

    # clf = GridSearchCV(pipelines.get_count_lsi_SVM(), parameters)
def train(FLAGS):
    # Load the data
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/my_en.txt', max_vocab_size=5000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/my_sp.txt', max_vocab_size=5000, target_lang=True)

    # Split into train and validation sets
    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_en_seq_lens, train_sp_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_en_seq_lens, valid_sp_seq_len = \
        split_data(en_token_ids, sp_token_ids, en_seq_lens, sp_seq_lens,
                   train_ratio=0.8)

    # Save the vocabularies for later decoding
    output = open('data/vocab_en.pkl', 'wb')
    pickle.dump(en_vocab_dict, output)
    output.close()

    output = open('data/vocab_sp.pkl', 'wb')
    pickle.dump(sp_vocab_dict, output)
    output.close()

    # Update parameters
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)
    print('len(en_vocab_dict)', len(en_vocab_dict))
    print('len(sp_vocab_dict)', len(sp_vocab_dict))

    # Start session
    with tf.Session() as sess:
        # Create new model or load old one
        model = create_model(sess, FLAGS)

        # Training begins
        losses = []
        for epoch_num, epoch in enumerate(
                generate_epoch(train_encoder_inputs, train_decoder_inputs,
                               train_targets, train_en_seq_lens,
                               train_sp_seq_len, FLAGS.num_epochs,
                               FLAGS.batch_size)):
            print("EPOCH: %i" % epoch_num)

            # Decay learning rate
            sess.run(tf.assign(model.lr, FLAGS.learning_rate *
                               (FLAGS.learning_rate_decay_factor ** epoch_num)))

            batch_loss = []
            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                            batch_targets, batch_en_seq_lens,
                            batch_sp_seq_lens) in enumerate(epoch):
                loss, _ = model.step(sess, FLAGS, batch_encoder_inputs,
                                     batch_decoder_inputs, batch_targets,
                                     batch_en_seq_lens, batch_sp_seq_lens,
                                     FLAGS.dropout)
                batch_loss.append(loss)
            losses.append(np.mean(batch_loss))

        checkpoint_path = "/tmp/model.ckpt"
        print("Saving the model.")
        model.saver.save(sess, checkpoint_path)

    plt.plot(losses, label='loss')
    plt.legend()
    plt.savefig('seq_01.png')
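# For the decoding side, the pickled vocabularies can be reloaded like this
# (a usage sketch mirroring the dumps above):
import pickle

with open('data/vocab_en.pkl', 'rb') as f:
    en_vocab_dict = pickle.load(f)
with open('data/vocab_sp.pkl', 'rb') as f:
    sp_vocab_dict = pickle.load(f)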
    acc = accuracy_score(y_test, y_pred)
    return fit_time, predict_time, auc, acc


X, y, view1, view2 = du.extract_data(dir_path + '/' + file_name, file_name)
res_spaco = []
res_spaco_ours = []
res_spaco_ours2 = []
res_reg = []
for i in range(cv):
    print("CV: " + str(i))
    X, y = shuffle(X, y)
    X_labeled, X_unlabeled, y_labeled, X_test, y_test = du.split_data(
        X, y, train_test_split=0.8, labeled_unlabeled_split=labeled_split)
    res_spaco.append(
        evaluate_model("spaco", base_model, X_labeled, X_unlabeled,
                       y_labeled, X_test, y_test, view1, view2))
    res_spaco_ours.append(
        evaluate_model("spaco_ours", base_model, X_labeled, X_unlabeled,
                       y_labeled, X_test, y_test, view1, view2))
    res_spaco_ours2.append(
        evaluate_model("spaco_ours2", base_model, X_labeled, X_unlabeled,
                       y_labeled, X_test, y_test, view1, view2))
    res_reg.append(
        evaluate_model("reg", base_model, X_labeled, X_unlabeled,
                       y_labeled, X_test, y_test, view1, view2))

f = open('./test_res.txt', mode='w')
f.write("spaco\n")
f.write(str(res_spaco[0]))
f.write("\n\n")
f.write("spaco_ours\n")
f.write(str(res_spaco_ours[0]))
def train():
    X, y = load_data_and_labels()
    vocab_list, vocab_dict, rev_vocab_dict = create_vocabulary(
        X, FLAGS.en_vocab_size)
    X, seq_lens = data_to_token_ids(X, vocab_dict)
    train_X, train_y, train_seq_lens, valid_X, valid_y, valid_seq_lens = \
        split_data(X, y, seq_lens)
    FLAGS.max_sequence_length = len(train_X[0])

    with tf.Session() as sess:
        # Load old model or create new one
        model = create_model(sess, FLAGS)

        # Train results
        for epoch_num, epoch in enumerate(
                generate_epoch(train_X, train_y, train_seq_lens,
                               FLAGS.num_epochs, FLAGS.batch_size)):
            print("EPOCH:", epoch_num)

            sess.run(tf.assign(model.lr, FLAGS.learning_rate *
                               (FLAGS.learning_rate_decay_factor ** epoch_num)))

            train_loss = []
            train_accuracy = []
            for batch_num, (batch_X, batch_y,
                            batch_seq_lens) in enumerate(epoch):
                _, loss, accuracy = model.step(
                    sess, batch_X, batch_seq_lens, batch_y,
                    dropout_keep_prob=FLAGS.dropout_keep_prob,
                    forward_only=False, sampling=False)
                train_loss.append(loss)
                train_accuracy.append(accuracy)

            print()
            print("EPOCH %i SUMMARY" % epoch_num)
            print("Training loss %.3f" % np.mean(train_loss))
            print("Training accuracy %.3f" % np.mean(train_accuracy))
            print("----------------------")

            # Validation results
            for valid_epoch_num, valid_epoch in enumerate(
                    generate_epoch(valid_X, valid_y, valid_seq_lens,
                                   num_epochs=1,
                                   batch_size=FLAGS.batch_size)):
                valid_loss = []
                valid_accuracy = []
                for valid_batch_num, \
                    (valid_batch_X, valid_batch_y,
                     valid_batch_seq_lens) in enumerate(valid_epoch):
                    loss, accuracy = model.step(
                        sess, valid_batch_X, valid_batch_seq_lens,
                        valid_batch_y, dropout_keep_prob=1.0,
                        forward_only=True, sampling=False)
                    valid_loss.append(loss)
                    valid_accuracy.append(accuracy)

            print("Validation loss %.3f" % np.mean(valid_loss))
            print("Validation accuracy %.3f" % np.mean(valid_accuracy))
            print("----------------------")

            # Save checkpoint every epoch.
            if not os.path.isdir(FLAGS.ckpt_dir):
                os.makedirs(FLAGS.ckpt_dir)
            checkpoint_path = os.path.join(FLAGS.ckpt_dir, "model.ckpt")
            print("Saving the model.")
            model.saver.save(sess, checkpoint_path,
                             global_step=model.global_step)
def main():
    """Wrapper to run the classification task."""
    # Parse command-line arguments
    parser = build_parser()
    options = parser.parse_args()

    if options.mode == "gen_data":
        # Split the data into train/dev/test sets
        split_data()
        # Load the data and reshape for training and evaluation
        X, y_media, y_emotion = load_data(update=options.update,
                                          remove_broken=options.remove_broken)
        for set_type in ["train", "dev", "test"]:
            total_media = np.sum(y_media[set_type], axis=0)
            total_emotion = np.sum(y_emotion[set_type], axis=0)
            print(f"Total images for each media category in {set_type} set:")
            for v, k in enumerate(MEDIA_LABELS):
                print(f"\t{k}: {total_media[v]}")
            print(f"Total images for each emotion category in {set_type} set:")
            for v, k in enumerate(EMOTION_LABELS):
                print(f"\t{k}: {total_emotion[v]}")

    elif options.mode == "train":
        # Create directory to save the results
        results_dir = "results"
        if not os.path.exists("./" + results_dir):
            os.makedirs("./" + results_dir)

        # Check if the given log folder already exists
        results_subdirs = os.listdir("./" + results_dir)
        if not options.log_folder:
            raise Exception(
                'Please specify log_folder argument to store results.')
        elif options.log_folder in results_subdirs:
            raise Exception('The given log folder already exists.')
        else:
            # Create a folder for each training run
            log_folder = os.path.join(results_dir, options.log_folder)
            os.makedirs(log_folder)

        # Load the data and organize into three tuples (train, val/dev, test)
        # Each tuple consists of input arrays, media labels, and emotion labels
        train_data, val_data, test_data = load_data(DATA_DIR, INPUT_FILE,
                                                    MEDIA_LABEL_FILE,
                                                    EMOTION_LABEL_FILE)
        # Preprocess the data
        train_dset, val_dset, test_dset = preprocess(
            train_data, val_data, test_data, augment=options.augment,
            train_stats_dir=TRAIN_STATS_DIR)

        # Specify the device
        if options.device == "cpu":
            device = "/cpu:0"
        elif options.device == "gpu":
            device = "/device:GPU:0"

        # Train the model
        train(train_dset, val_dset, log_folder=log_folder, device=device,
              batch_size=64, num_epochs=100, model_type=options.model_type)

    elif options.mode == "test":
        # Load the data and organize into three tuples (train, val/dev, test)
        # Each tuple consists of input arrays, media labels, and emotion labels
        train_data, val_data, test_data = load_data(DATA_DIR, INPUT_FILE,
                                                    MEDIA_LABEL_FILE,
                                                    EMOTION_LABEL_FILE)
        # Preprocess the data
        if os.path.isfile(os.path.join(TRAIN_STATS_DIR, "train_stats.npz")):
            print("Preprocess test data using saved statistics from train data...")
            train_stats_file = os.path.join(TRAIN_STATS_DIR, "train_stats.npz")
            test_dset = preprocess_from_file(train_stats_file, test_data,
                                             augment=options.augment)
        else:
            print("Preprocess test data using train data...")
            train_dset, val_dset, test_dset = preprocess(
                train_data, val_data, test_data, augment=options.augment,
                train_stats_dir=TRAIN_STATS_DIR)

        # Specify the device
        if options.device == "cpu":
            device = "/cpu:0"
        elif options.device == "gpu":
            device = "/device:GPU:0"

        # Load the model
        model_path = os.path.join("test_models", options.model_name)
        evaluate_test(model_path, options.model_type, test_dset,
                      batch_size=64, confusion_mat=options.confusion_mat)

    elif options.mode == "ensemble":
        # Load the data and organize into three tuples (train, val/dev, test)
        # Each tuple consists of input arrays, media labels, and emotion labels
        train_data, val_data, test_data = load_data(DATA_DIR, INPUT_FILE,
                                                    MEDIA_LABEL_FILE,
                                                    EMOTION_LABEL_FILE)
        # Preprocess the data
        if os.path.isfile(os.path.join(TRAIN_STATS_DIR, "train_stats.npz")):
            print("Preprocess test data using saved statistics from train data...")
            train_stats_file = os.path.join(TRAIN_STATS_DIR, "train_stats.npz")
            test_dset = preprocess_from_file(train_stats_file, test_data,
                                             augment=options.augment)
        else:
            print("Preprocess test data using train data...")
            train_dset, val_dset, test_dset = preprocess(
                train_data, val_data, test_data, augment=options.augment,
                train_stats_dir=TRAIN_STATS_DIR)

        # Specify the device
        if options.device == "cpu":
            device = "/cpu:0"
        elif options.device == "gpu":
            device = "/device:GPU:0"

        if not options.ensemble_folder:
            raise Exception(
                'Please specify ensemble_folder argument to find ensemble folders.')
        elif len(os.listdir(options.ensemble_folder)) == 0:
            raise Exception('Ensemble folder is empty.')

        # Evaluate the ensemble
        evaluate_ensemble(options.ensemble_folder, test_dset, batch_size=64,
                          confusion_mat=options.confusion_mat)

    elif options.mode == "test_single":
        x_test = load_image(
            os.path.join('stylized_images_configs', options.image))
        train_stats_file = os.path.join(TRAIN_STATS_DIR, "train_stats.npz")
        x_test = preprocess_image(train_stats_file, x_test,
                                  augment=options.augment)
        model_path = os.path.join("test_models", options.model_name)
        predict_image(x_test, model_path)
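# `preprocess_from_file` is not shown; a plausible sketch, assuming
# "train_stats.npz" stores per-channel mean/std computed on the training set
# and that `test_data` is the (inputs, media labels, emotion labels) tuple
# described in the comments above. Key names are assumptions.
def preprocess_from_file(train_stats_file, test_data, augment=False):
    stats = np.load(train_stats_file)
    mean, std = stats["mean"], stats["std"]
    x_test, y_media, y_emotion = test_data
    x_test = (x_test - mean) / (std + 1e-8)  # normalize with train statistics
    return (x_test, y_media, y_emotion)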
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(logfile, mode='w'),
        logging.StreamHandler()
    ]
)

logging.info('Loading dataset...')
concepts, relation_types, relations = load_umls(umls_directory, data_folder)
# logging.warning('Testing system with only 1000 examples!!!')
# relations = relations[:1000]
concept_list = list(concepts.values())
train_data, val_data, _ = split_data(relations)
train_relations_set = set(train_data)
train_dataset = UmlsRelationDataset(train_data)
val_dataset = UmlsRelationDataset(val_data)

callbacks = []

logging.info('Loading collator...')
example_creator = NameRelationExampleCreator()
# train_neg_sampler = BatchNegativeSampler(
#     negative_sample_size
# )
# val_neg_sampler = train_neg_sampler
train_neg_sampler = UniformNegativeSampler(
    concept_list,
    train_relations_set,
    negative_sample_size,
def train(params):
    hindi_token_ids, hindi_seq_lens, hindi_vocab_dict, hindi_rev_vocab_dict = \
        process_data('../data/hindi_dump.p', max_vocab_size=100000,
                     target_lang=False)
    bengali_token_ids, bengali_seq_lens, bengali_vocab_dict, \
        bengali_rev_vocab_dict = \
        process_data('../data/bengali_dump.p', max_vocab_size=100000,
                     target_lang=True)

    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_hindi_seq_lens, train_bengali_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_hindi_seq_lens, valid_bengali_seq_lens = \
        split_data(hindi_token_ids, bengali_token_ids, hindi_seq_lens,
                   bengali_seq_lens, train_ratio=0.8)

    params.hindi_vocab_size = len(hindi_vocab_dict)
    params.bengali_vocab_size = len(bengali_vocab_dict)
    print(params.hindi_vocab_size, params.bengali_vocab_size)

    with tf.Session() as sess:
        _model = model(params)
        sess.run(tf.global_variables_initializer())

        losses = []
        accs = []
        for epoch_num, epoch in enumerate(
                generate_epoch(train_encoder_inputs, train_decoder_inputs,
                               train_targets, train_hindi_seq_lens,
                               train_bengali_seq_len, params.num_epochs,
                               params.batch_size)):
            print("EPOCH : ", epoch_num)

            sess.run(tf.assign(_model.lr, 0.01 * (0.99 ** epoch_num)))

            batch_loss = []
            batch_acc = []
            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                            batch_targets, batch_hindi_seq_lens,
                            batch_bengali_seq_lens) in enumerate(epoch):
                loss, _, acc = _model.step(sess, params, batch_encoder_inputs,
                                           batch_decoder_inputs, batch_targets,
                                           batch_hindi_seq_lens,
                                           batch_bengali_seq_lens,
                                           params.dropout)
                batch_loss.append(loss)
                batch_acc.append(acc)
            losses.append(np.mean(batch_loss))
            accs.append(np.mean(batch_acc))
            print("Training Loss: ", losses[-1])
            print("Training Accuracy", accs[-1])

        plt.plot(losses, label='loss')
        plt.legend()
        # plt.show()
        plt.title('Plot for Training Error versus Epochs', fontsize='20',
                  style='oblique')
        plt.xlabel('Epochs', fontsize='16', color='green')
        plt.ylabel('Training Error', fontsize='16', color='green')
        plt.savefig('../output/plot.png')
        plt.show()

        acc = _model.test(sess, params, valid_encoder_inputs,
                          valid_decoder_inputs, valid_targets,
                          valid_hindi_seq_lens, valid_bengali_seq_lens,
                          params.dropout)
        print(acc)
def train(FLAGS):
    # Load the data
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/tst2013.en', max_vocab_size=30000,
                     target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/tst2013.tr', max_vocab_size=30000,
                     target_lang=True)

    # Split into train and validation sets
    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_en_seq_lens, train_sp_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_en_seq_lens, valid_sp_seq_len = \
        split_data(en_token_ids, sp_token_ids, en_seq_lens, sp_seq_lens,
                   train_ratio=0.8)

    # Save the vocabularies
    output = open('data/vocab_en.pkl', 'wb')
    pickle.dump(en_vocab_dict, output)
    output.close()

    output = open('data/vocab_sp.pkl', 'wb')
    pickle.dump(sp_vocab_dict, output)
    output.close()

    # Update parameters
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)
    print('len(en_vocab_dict)', len(en_vocab_dict))
    print('len(sp_vocab_dict)', len(sp_vocab_dict))

    # Start session
    with tf.Session() as sess:
        model = None

        # Create new model or load old one
        f = checkpoint_path + ".index"
        print(f)
        if os.path.isfile(f):
            model = restore_model(sess)
        else:
            model = create_model(sess, FLAGS)

        # Training begins
        losses = []
        for epoch_num, epoch in enumerate(
                generate_epoch(train_encoder_inputs, train_decoder_inputs,
                               train_targets, train_en_seq_lens,
                               train_sp_seq_len, FLAGS.num_epochs,
                               FLAGS.batch_size)):
            print("EPOCH: %i" % epoch_num)

            # Decay learning rate
            sess.run(tf.assign(model.lr, FLAGS.learning_rate *
                               (FLAGS.learning_rate_decay_factor ** epoch_num)))

            batch_loss = []
            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                            batch_targets, batch_en_seq_lens,
                            batch_sp_seq_lens) in enumerate(epoch):
                loss, _ = model.step(sess, FLAGS, batch_encoder_inputs,
                                     batch_decoder_inputs, batch_targets,
                                     batch_en_seq_lens, batch_sp_seq_lens,
                                     FLAGS.dropout)
                print(loss)
                batch_loss.append(loss)
            print('mean: ', np.mean(batch_loss))

        print("Saving the model.")
        model.saver.save(sess, checkpoint_path)
from os import path

import data_processing
import config
import data_utils
import seq2seq_wrapper

# load data and split into train and test sets
idx_headings, idx_descriptions = data_processing.process_data()
article_metadata = data_processing.unpickle_articles()
(x_train, x_test), (y_train, y_test), (x_valid, y_valid) = \
    data_utils.split_data(idx_descriptions, idx_headings)

# define parameters
xseq_length = x_train.shape[-1]
yseq_length = y_train.shape[-1]
batch_size = config.batch_size
xvocab_size = len(article_metadata['idx2word'])
yvocab_size = xvocab_size
checkpoint_path = path.join(config.path_outputs, 'checkpoint')
print(checkpoint_path)

# define model
model = seq2seq_wrapper.Seq2Seq(xseq_len=xseq_length,
                                yseq_len=yseq_length,
                                xvocab_size=xvocab_size,
                                yvocab_size=yvocab_size,
                                emb_dim=config.embedding_dim,
                                num_layers=3,
                                ckpt_path=checkpoint_path)

val_batch_gen = data_utils.generate_random_batch(x_valid, y_valid,
                                                 config.batch_size)
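# `data_utils.generate_random_batch` is not shown; a minimal sketch of the
# assumed behaviour — an infinite generator yielding random batches from the
# validation arrays. The time-major transposition is an assumption, common in
# seq2seq wrappers of this style.
def generate_random_batch(x, y, batch_size):
    import numpy as np
    while True:
        idx = np.random.choice(len(x), size=batch_size, replace=False)
        yield x[idx].T, y[idx].T  # time-major batches assumed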
def main():
    args = parser.parse_args()
    pprint(args)

    # check and create directories
    if not os.path.exists(args.checkpoint):
        os.makedirs(args.checkpoint)
    if not os.path.exists(args.log):
        os.makedirs(args.log)

    print('==> Preparing data..')
    transformations_train = transforms.Compose([
        data.RandomTranslateWithReflect(32),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(), normalize
    ])
    transformations_test = transforms.Compose(
        [transforms.ToTensor(), normalize])
    mode = {'train': True, 'test': True}
    image_datasets = Cifar10(root='./data', train=True, transform=None,
                             download=True)

    trainData, trainLabel, testData, testLabel = du.split_data(
        image_datasets, select_num=9000)
    unlabeled_idx, labeled_idx = du.split_idx(trainLabel, select_num=21000)

    idx = np.squeeze([0, 1, 2, 3, 4, 7, 8])
    labeled_trainData_75, labeled_trainLabel_75, labeled_trainData_25, \
        labeled_trainLabel_25 = du.split_class(
            trainData[labeled_idx, :, :, :], trainLabel[labeled_idx],
            selected_class=idx)
    testData_75, testLabel_75, testData_25, testLabel_25 = du.split_class(
        testData, testLabel, selected_class=idx)
    anchor_idx = du.select_anchors_1(labeled_trainLabel_75,
                                     anchor_num=args.anchor_num)

    print("labeled_idx is:{}".format(labeled_idx.size))
    print("anchor_idx is:{}".format(anchor_idx.size))
    print("unique of labeled_trainLabel_25 is:{}".format(
        np.unique(labeled_trainLabel_25)))
    print("unique of testLabel_25 is:{}".format(np.unique(testLabel_25)))

    dict_data = DT(trainData=labeled_trainData_75[anchor_idx, :, :, :],
                   trainLabel=labeled_trainLabel_75[anchor_idx],
                   transform=transformations_train)
    dict_loader = torch.utils.data.DataLoader(dict_data,
                                              batch_size=args.anchor_num,
                                              shuffle=False,
                                              num_workers=args.workers)

    c_trainData = np.concatenate(
        (trainData[unlabeled_idx, :, :, :], labeled_trainData_75), axis=0)
    c_trainLabel = np.concatenate(
        (trainLabel[unlabeled_idx], labeled_trainLabel_75), axis=0)

    u_trn = unlabeled_idx.size
    l_trn = labeled_trainLabel_75.shape[0]
    n = u_trn + l_trn
    unlabeled_idx = np.squeeze(np.arange(u_trn).astype(np.int32))
    labeled_idx = np.squeeze(np.arange(u_trn, n).astype(np.int32))
    mask_labels = np.squeeze(np.zeros((n, 1)))
    mask_labels[u_trn:n] = 1

    train_data = DT(trainData=c_trainData, trainLabel=c_trainLabel,
                    transform=transformations_train)
    test_data = DT(trainData=testData_25, trainLabel=testLabel_25,
                   transform=transformations_test)

    labeled_trainData_25 = np.concatenate((labeled_trainData_25, testData_75),
                                          axis=0)
    labeled_trainLabel_25 = np.concatenate(
        (labeled_trainLabel_25, testLabel_75), axis=0)
    train_data_test = DT(trainData=labeled_trainData_25,
                         trainLabel=labeled_trainLabel_25,
                         transform=transformations_test)
    train_loader_test = torch.utils.data.DataLoader(train_data_test,
                                                    batch_size=100,
                                                    shuffle=True,
                                                    num_workers=args.workers)

    batch_sampler = data.TwoStreamBatchSampler(unlabeled_idx, labeled_idx,
                                               args.batch_size, 50)
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_sampler=batch_sampler,
                                               num_workers=args.workers,
                                               pin_memory=True)
    # train_loader = torch.utils.data.DataLoader(train_data, batch_size=100,
    #                                            shuffle=True,
    #                                            num_workers=args.workers)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=100,
                                              shuffle=False,
                                              num_workers=args.workers)

    for iter in range(5):
        model = pre_train(train_loader, test_loader, dict_loader,
                          train_loader_test, mask_labels, total_epochs=100,
                          use_gpu=True, seed=args.seed)
    TARGET_POSITIVE = "p"
elif test_type == "class":
    tag_attr = "report_class"
    TARGET_POSITIVE = TARGET_CLASS
else:
    raise ValueError("Unknown tag: " + test_type)

data = data_utils.read_from_csv(data_file)
filtered_data = [x for x in data
                 if getattr(x, tag_attr) != "" and getattr(x, tag_attr) != "u"]
filtered_data = filtered_data[:2500]  # put a limit on the size for performance

labels = [np.float32(getattr(x, tag_attr) == TARGET_POSITIVE)
          for x in filtered_data]
report_ids = [x.report_id for x in filtered_data]
sentences = [x.processed_sentence for x in filtered_data]

train_data, train_labels, test_data, test_labels = data_utils.split_data(
    sentences, labels, report_ids, split_value)

# Create transformation pipeline
if USE_RF:
    pipe = pipelines.get_count_lsi_randomforest()
else:
    pipe = pipelines.get_count_lsi_SVM()

# set pipe parameters and train model
pipe.set_params(**model_params)
pipe.fit(train_data, train_labels)

print("Total = " + str(len(filtered_data)) +
      " [" + str(labels.count(0)) + ", " + str(labels.count(1)) + "]")
print("Train = " + str(len(train_data)) +
      " [" + str(train_labels.count(0)) + ", " + str(train_labels.count(1)) + "]")
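# `pipelines.get_count_lsi_SVM` and `get_count_lsi_randomforest` are project
# helpers; a plausible sketch, inferred from the step names used in the grid
# parameters earlier ('lsi__n_components', 'classifier__C', ...). Exact
# vectorizer settings are assumptions.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

def get_count_lsi_SVM():
    return Pipeline([
        ('count', CountVectorizer()),             # bag-of-words counts
        ('lsi', TruncatedSVD(n_components=100)),  # latent semantic indexing
        ('classifier', SVC()),
    ])

def get_count_lsi_randomforest():
    return Pipeline([
        ('count', CountVectorizer()),
        ('lsi', TruncatedSVD(n_components=100)),
        ('classifier', RandomForestClassifier()),
    ])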
def main(FLAGS):
    # set seed
    np.random.seed(FLAGS.seed)
    tf.set_random_seed(FLAGS.seed)

    with tf.device('/cpu:0'), tf.name_scope('input'):
        # load dataset into main memory
        data, meta = load_data(FLAGS.dataset_root, FLAGS.dataset,
                               is_training=True)
        train_data, val_data = split_data(data, FLAGS.validate_rate)

        # build tf_dataset for training
        train_dataset = (tf.data.Dataset.from_tensor_slices(train_data).map(
            preprocess_for_train(FLAGS.dataset not in ['mnist', 'svhn']),
            8).shuffle(10000, seed=FLAGS.seed).batch(
                FLAGS.batch_size).prefetch(1))

        # build tf_dataset for val
        val_dataset = (tf.data.Dataset.from_tensor_slices(val_data).map(
            preprocess_for_eval, 8).batch(FLAGS.batch_size).prefetch(1))

        # clean up and release memory
        del data, train_data, val_data

        # construct data iterator
        data_iterator = tf.data.Iterator.from_structure(
            train_dataset.output_types, train_dataset.output_shapes)

        # construct iterator initializer for training and validation
        train_data_init = data_iterator.make_initializer(train_dataset)
        val_data_init = data_iterator.make_initializer(val_dataset)

    # define useful scalars
    learning_rate = tf.placeholder(tf.float32, shape=(), name='learning_rate')
    tf.summary.scalar('lr', learning_rate)
    is_training = tf.placeholder(tf.bool, [], name='is_training')
    global_step = tf.train.create_global_step()

    # define optimizer
    optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)

    # build the net
    model = importlib.import_module('models.{}'.format(FLAGS.model))
    net = model.Net(meta['n_class'], FLAGS.weight_decay)

    # get data from data iterator
    images, labels = data_iterator.get_next()
    tf.summary.image('images', tf.transpose(images, [0, 2, 3, 1]))

    # get logits
    logits = net(images, is_training)
    tf.summary.histogram('logits', logits)

    # summary variables defined in net
    for w in net.global_variables:
        tf.summary.histogram(w.name, w)

    with tf.name_scope('losses'):
        # compute loss
        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                      logits=logits)
        # compute l2 regularization
        l2_reg = tf.losses.get_regularization_loss()

    with tf.name_scope('metrics') as scope:
        mean_loss, mean_loss_update_op = tf.metrics.mean(loss,
                                                         name='mean_loss')
        prediction = tf.argmax(logits, axis=1)
        accuracy, accuracy_update_op = tf.metrics.accuracy(labels, prediction,
                                                           name='accuracy')
        reset_metrics = tf.variables_initializer(
            tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope))
        metrics_update_op = tf.group(mean_loss_update_op, accuracy_update_op)

        # collect metric summary alone, because it needs to be
        # summarized after the metrics update
        metric_summary = [
            tf.summary.scalar('loss', mean_loss, collections=[]),
            tf.summary.scalar('accuracy', accuracy, collections=[])
        ]

    # compute grad
    grads_and_vars = optimizer.compute_gradients(loss + l2_reg)

    # summary grads
    for g, v in grads_and_vars:
        tf.summary.histogram(v.name + '/grad', g)

    # run train_op and update_op together
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group(train_op, *update_ops)

    # build summary
    train_summary_str = tf.summary.merge_all()
    metric_summary_str = tf.summary.merge(metric_summary)

    # init op
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())

    # prepare for the logdir
    if not tf.gfile.Exists(FLAGS.logdir):
        tf.gfile.MakeDirs(FLAGS.logdir)

    # saver
    saver = tf.train.Saver(max_to_keep=FLAGS.n_epoch)

    # summary writer
    train_writer = tf.summary.FileWriter(os.path.join(FLAGS.logdir, 'train'),
                                         tf.get_default_graph())
    val_writer = tf.summary.FileWriter(os.path.join(FLAGS.logdir, 'val'),
                                       tf.get_default_graph())

    # session
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False,
                            intra_op_parallelism_threads=4,
                            inter_op_parallelism_threads=4)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # do initialization
    sess.run(init_op)

    # restore
    if FLAGS.restore:
        saver.restore(sess, FLAGS.restore)

    lr_boundaries = list(map(int, FLAGS.boundaries.split(',')))
    lr_values = list(map(float, FLAGS.values.split(',')))
    lr_manager = LRManager(lr_boundaries, lr_values)

    time_meter = TimeMeter()

    # start to train
    for e in range(FLAGS.n_epoch):
        print('-' * 40)
        print('Epoch: {:d}'.format(e))

        # training loop
        try:
            i = 0
            sess.run([train_data_init, reset_metrics])
            while True:
                lr = lr_manager.get(e)
                fetch = [train_summary_str] if i % FLAGS.log_every == 0 else []
                time_meter.start()
                result = sess.run([train_op, metrics_update_op] + fetch, {
                    learning_rate: lr,
                    is_training: True
                })
                time_meter.stop()
                if i % FLAGS.log_every == 0:
                    # fetch summary str
                    t_summary = result[-1]
                    t_metric_summary = sess.run(metric_summary_str)
                    t_loss, t_acc = sess.run([mean_loss, accuracy])
                    sess.run(reset_metrics)

                    spd = FLAGS.batch_size / time_meter.get()
                    time_meter.reset()

                    print('Iter: {:d}, LR: {:g}, Loss: {:.4f}, Acc: {:.2f}, '
                          'Spd: {:.2f} i/s'.format(i, lr, t_loss, t_acc, spd))
                    train_writer.add_summary(t_summary,
                                             global_step=sess.run(global_step))
                    train_writer.add_summary(t_metric_summary,
                                             global_step=sess.run(global_step))
                i += 1
        except tf.errors.OutOfRangeError:
            pass

        # save checkpoint
        saver.save(sess, '{}/{}'.format(FLAGS.logdir, FLAGS.model),
                   global_step=sess.run(global_step), write_meta_graph=False)

        # val loop
        try:
            sess.run([val_data_init, reset_metrics])
            while True:
                sess.run([metrics_update_op], {is_training: False})
        except tf.errors.OutOfRangeError:
            pass

        v_loss, v_acc = sess.run([mean_loss, accuracy])
        print('[VAL]Loss: {:.4f}, Acc: {:.2f}'.format(v_loss, v_acc))
        val_writer.add_summary(sess.run(metric_summary_str),
                               global_step=sess.run(global_step))
        print('-' * 40)
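# `TimeMeter` (used here and in the PyTorch variant earlier) is not shown; a
# minimal sketch of the assumed interface: accumulate wall-clock time across
# start/stop pairs, optionally count processed units, and report an average.
import time

class TimeMeter:
    def __init__(self):
        self.reset()

    def reset(self):
        self.elapsed = 0.0
        self.count = 0   # units registered via add_counter()
        self.stops = 0   # completed start/stop pairs
        self._t0 = None

    def start(self):
        self._t0 = time.perf_counter()

    def stop(self):
        self.elapsed += time.perf_counter() - self._t0
        self.stops += 1

    def add_counter(self):
        self.count += 1

    def get(self):
        # average seconds per counted unit; fall back to per start/stop pair
        units = self.count if self.count else self.stops
        return self.elapsed / max(units, 1)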
def main(*kargs, **kwargs):
    # ============ Parse global parameters ============
    get_kwargs(kwargs)
    test_fname = kwargs['test']
    embeds_type = kwargs['embeds_type']
    logger_fname = kwargs['logger']
    # warm_start = kwargs['warm_start']
    # model_warm_start = [model.lower() for model in kwargs['model_warm_start']]
    config = kwargs['config']
    train_clean = kwargs['train_clean']
    train_labels = kwargs['train_labels']
    test_clean = kwargs['test_clean']
    embeds_clean = kwargs['embeds_clean']

    result_path = './outputs/'
    oof_path = './oof_predictions'
    if not os.path.exists(result_path):
        os.mkdir(result_path)
    if not os.path.exists(oof_path):
        os.mkdir(oof_path)

    # ==== Create logger ====
    logger = Logger(logging.getLogger(), logger_fname)

    # ==== Load data ====
    logger.info('Loading data...')
    test_df = load_data(test_fname)
    train_x = np.load(train_clean)
    test_x = np.load(test_clean)
    embedding_matrix = np.load(embeds_clean)
    train_y = np.load(train_labels)

    target_labels = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate'
    ]

    # ==== Splitting training data ====
    x_train_nn, x_eval_nn, y_train_nn, y_eval_nn, train_idxs, eval_idxs = \
        split_data(train_x, train_y, eval_size=0.1, shuffle=True,
                   random_state=42)
    logger.debug('X shape = {}'.format(np.shape(x_train_nn)))

    # ============= Load params of models =============
    params = Params(config)
    models = params.get('models')

    # ============ Train models =============
    for model_name in models:
        model_func = get_model(model_name, embedding_matrix, params)
        if params.get(model_name).get('folding'):
            # =========== Training on folds ============
            batch_size = params.get(model_name).get('batch_size')
            logger.debug(
                'Starting {0} training on folds...'.format(model_name))
            models, val_predictions = train_folds(
                train_x, train_y,
                params.get(model_name).get('num_folds'), batch_size,
                model_func,
                params.get(model_name).get('optimizer'), logger=logger)
            val_predictions_array = np.concatenate(
                [minmax_scale(fold) for fold in val_predictions], axis=0)
            np.save(
                os.path.join(oof_path,
                             "{0}_{1}_oof.npy".format(model_name,
                                                      embeds_type)),
                val_predictions_array)

            logger.debug('Predicting results...')
            test_predicts_list = []
            for fold_id, model in enumerate(models):
                model_path = os.path.join(
                    result_path,
                    "{1}_{0}_{2}_weights.npy".format(fold_id, model_name,
                                                     embeds_type))
                np.save(model_path, model.get_weights())
                test_predicts_path = os.path.join(
                    result_path,
                    "{1}_{2}_test_predicts{0}.npy".format(
                        fold_id, model_name, embeds_type))
                test_predictions = model.predict(test_x,
                                                 batch_size=batch_size)
                test_predicts_list.append(test_predictions)
                np.save(test_predicts_path, test_predictions)

            # blend fold predictions with a geometric mean
            test_predictions = np.ones(test_predicts_list[0].shape)
            for fold_predict in test_predicts_list:
                test_predictions *= minmax_scale(fold_predict)
            if params.get(model_name).get('norm_folds'):
                test_predictions **= (1. / len(test_predicts_list))

            logger.info('Saving prediction...')
            test_ids = test_df["id"].values
            test_ids = test_ids.reshape((len(test_ids), 1))
            test_predictions = pd.DataFrame(data=test_predictions,
                                            columns=target_labels)
            test_predictions["id"] = test_ids
            test_predictions = test_predictions[["id"] + target_labels]
            submit_path = os.path.join(
                result_path,
                "{0}_{1}_folds.submit".format(model_name, embeds_type))
            test_predictions.to_csv(submit_path, index=False)
        else:
            # ============ Single model training =============
            logger.info('Training single {0} model...'.format(model_name))
            model = model_func()
            model_tr = _train_model(
                model,
                batch_size=params.get(model_name).get('batch_size'),
                train_x=x_train_nn, train_y=y_train_nn,
                val_x=x_eval_nn, val_y=y_eval_nn,
                opt=params.get(model_name).get('optimizer'),
                logger=logger)
            test_predictions = model_tr.predict(
                test_x, batch_size=params.get(model_name).get('batch_size'))

            # ============== Saving trained parameters ================
            logger.info('Saving model parameters...')
            model_path = os.path.join(
                result_path,
                "{0}_{1}_weights.npy".format(model_name, embeds_type))
            np.save(model_path, model.get_weights())

            # ============== Postprocessing ===============
            # test_predictions **= PROBABILITIES_NORMALIZE_COEFFICIENT

            # ============== Saving predictions ==============
            logger.info('Saving predictions...')
            test_ids = test_df["id"].values
            test_ids = test_ids.reshape((len(test_ids), 1))
            test_predicts = pd.DataFrame(data=test_predictions,
                                         columns=target_labels)
            test_predicts["id"] = test_ids
            test_predicts = test_predicts[["id"] + target_labels]
            submit_path = os.path.join(
                result_path, "{0}_{1}.csv".format(model_name, embeds_type))
            test_predicts.to_csv(submit_path, index=False)
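# The fold blending above is a geometric mean: multiply the (min-max scaled)
# per-fold predictions, then take the n-th root. A self-contained
# illustration with toy data (NumPy only):
import numpy as np

folds = [np.array([0.9, 0.2]), np.array([0.8, 0.4]), np.array([0.7, 0.1])]
blended = np.ones_like(folds[0])
for p in folds:
    blended *= p
blended **= 1.0 / len(folds)  # n-th root restores the probability scale
print(blended)  # ~[0.796, 0.2]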
def main():
    args = parser.parse_args()
    pprint(args)

    # check and create directories
    if not os.path.exists(args.checkpoint):
        os.makedirs(args.checkpoint)
    if not os.path.exists(args.log):
        os.makedirs(args.log)

    print('==> Preparing data..')
    transformations_train = transforms.Compose([
        data.RandomTranslateWithReflect(32),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(), normalize
    ])
    transformations_test = transforms.Compose(
        [transforms.ToTensor(), normalize])
    mode = {'train': True, 'test': True}
    image_datasets = Cifar10(root='./data', train=True, transform=None,
                             download=True)

    trainData, trainLabel, testData, testLabel = du.split_data(
        image_datasets, select_num=1000)
    unlabeled_idx, labeled_idx = du.split_idx(trainLabel, select_num=5000)
    anchor_idx = du.select_anchors(trainLabel, labeled_idx,
                                   anchor_num=args.anchor_num)
    print("labeled_idx is:{}".format(labeled_idx.size))
    print("anchor_idx is:{}".format(anchor_idx.size))

    dict_data = DT(trainData=trainData[anchor_idx, :, :, :],
                   trainLabel=trainLabel[anchor_idx],
                   transform=transformations_train)
    dict_loader = torch.utils.data.DataLoader(dict_data,
                                              batch_size=args.anchor_num,
                                              shuffle=False,
                                              num_workers=args.workers)

    n = trainLabel.shape[0]
    mask_labels = np.squeeze(np.zeros((n, 1)))
    mask_labels[labeled_idx] = 1

    train_data = DT(trainData=trainData, trainLabel=trainLabel,
                    transform=transformations_train)
    test_data = DT(trainData=testData, trainLabel=testLabel,
                   transform=transformations_test)
    train_data_test = DT(trainData=trainData, trainLabel=trainLabel,
                         transform=transformations_test)
    train_loader_test = torch.utils.data.DataLoader(train_data_test,
                                                    batch_size=args.batch_size,
                                                    shuffle=False,
                                                    num_workers=args.workers)

    batch_sampler = data.TwoStreamBatchSampler(unlabeled_idx, labeled_idx,
                                               args.batch_size, 40)
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_sampler=batch_sampler,
                                               num_workers=args.workers,
                                               pin_memory=True)
    test_loader = torch.utils.data.DataLoader(test_data,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=args.workers)

    model = pre_train(train_loader, test_loader, dict_loader,
                      train_loader_test, mask_labels, total_epochs=100,
                      use_gpu=True, seed=args.seed)
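# `data.TwoStreamBatchSampler` is not shown; a sketch of the usual two-stream
# sampler (as in the Mean Teacher codebase): each batch mixes
# `secondary_batch_size` labeled indices with the remaining slots filled by
# unlabeled indices. The body here is an assumption.
import itertools
import numpy as np
from torch.utils.data.sampler import Sampler

class TwoStreamBatchSampler(Sampler):
    def __init__(self, primary_indices, secondary_indices, batch_size,
                 secondary_batch_size):
        self.primary_indices = primary_indices
        self.secondary_indices = secondary_indices
        self.primary_batch_size = batch_size - secondary_batch_size
        self.secondary_batch_size = secondary_batch_size

    def __iter__(self):
        primary = np.random.permutation(self.primary_indices)
        secondary = itertools.cycle(
            np.random.permutation(self.secondary_indices))
        for i in range(len(self)):
            start = i * self.primary_batch_size
            p = primary[start:start + self.primary_batch_size]
            s = [next(secondary) for _ in range(self.secondary_batch_size)]
            yield list(p) + s

    def __len__(self):
        return len(self.primary_indices) // self.primary_batch_size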