def update_time_zones(df_android):
    dic_geo = {}
    df = load_data(r'Data\US_States_Timezones_clean.csv')
    df_clean = df[['State_Code', 'TimeZone_Code']]
    df_clean.apply(lambda x: put_geo_location_to_dict(x, dic_geo), axis=1)
    print('finished building geo dict')
    df_user_state = df_android[['user_state']]
    df_android['geo_location'] = df_user_state.apply(
        lambda x: get_geo_location(dic_geo, x), axis=1)
def update_app_categories(df_android):
    dic_app_cat = {}
    df = load_data(r'Data\AppCat.csv')
    df_clean = df[['app_Name', 'app_Cat']]
    df_clean.apply(lambda x: put_app_cat_to_dict(x, dic_app_cat), axis=1)
    print('finished building app_cat dict')
    df_app_ids = df_android[['app_id']]  # renamed from df_user_state: it holds app ids
    df_android['app_cat'] = df_app_ids.apply(
        lambda x: get_app_cat(dic_app_cat, x), axis=1)
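# The dictionary helpers used above (put_geo_location_to_dict, get_geo_location,
# and their app_cat twins) are defined elsewhere. A minimal sketch of the pattern,
# assuming each lookup row carries (key, value) in the two selected columns and
# that a missing key maps to a sentinel; this is an assumption, not the original
# implementation:
def put_geo_location_to_dict(row, dic_geo):
    # row comes from df[['State_Code', 'TimeZone_Code']].apply(..., axis=1)
    dic_geo[row['State_Code']] = row['TimeZone_Code']

def get_geo_location(dic_geo, row):
    # row comes from df_android[['user_state']].apply(..., axis=1)
    return dic_geo.get(row['user_state'], 'UNKNOWN')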
def write_created_new_features():
    """
    Load the original data, add two columns derived from external sources
    (Google Play app category and US time zones), and write the result
    to a new file.
    :return: None
    """
    df_android = load_data(r'Data\android_bids_us.csv')
    update_app_categories(df_android)
    update_time_zones(df_android)
    df_android.to_csv(r'Data\df_with_new_features.csv')
def main(_):
    sess = tf.compat.v1.Session()
    model = MyModel(sess, model_configs)

    if args.mode == "train":
        x_train, y_train, _, _ = load_data(args.data_dir)
        model.train(x_train, y_train, 200)
    elif args.mode == "test":
        # Testing on the public testing dataset
        _, _, x_test, y_test = load_data(args.data_dir)
        model.evaluate(x_test, y_test)
    elif args.mode == "predict":
        # Predicting and storing results on the private testing dataset
        x_test = load_testing_images(args.data_dir)
        predictions = model.predict_prob(x_test)
        np.save("../predictions.npy", predictions)
def get_app_type():
    dic = {}
    df = load_data(r'Data\android_bids_us.csv')
    app_ids_set = df['app_id'].unique()  # .unique() is a Series method, not a DataFrame method
    for app_id in app_ids_set:
        if app_id not in dic:
            app_cat = call_google_store(app_id)
            dic[app_id] = app_cat
            sleep(1)  # throttle requests to the store
    dt_res = pd.DataFrame(dic.items())
    dt_res.columns = ['app_Name', 'app_Cat']
    dt_res.to_csv(r'Data\AppCat.csv', index=False)
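# call_google_store is not defined in this file. A hedged sketch of one way to
# fetch an app's category from its Google Play page; the URL pattern and the
# itemprop="genre" markup are assumptions about the page layout and may break
# if Google changes it:
import re
import requests

def call_google_store(app_id):
    url = f"https://play.google.com/store/apps/details?id={app_id}"
    resp = requests.get(url, timeout=10)
    if resp.status_code != 200:
        return 'UNKNOWN'
    match = re.search(r'itemprop="genre"[^>]*>([^<]+)<', resp.text)
    return match.group(1) if match else 'UNKNOWN'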
def main(args):
    '''HYPER PARAMETERS'''
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    MESH_SIZE = args.mesh_size
    BATCHSIZE = args.batchsize
    LEARNING_RATE = args.learning_rate
    INPUT_SIZE = (1, MESH_SIZE, MESH_SIZE, MESH_SIZE)
    EPOCH = args.epoch
    COMPUTE_TRAIN_METRICS = args.train_metric
    ROUTING_ITER = args.n_routing_iter
    DATA_PATH = args.data_path
    NUM_CLASS = args.num_class
    DECAY_RATE = args.decay_rate
    if args.rotation is not None:
        ROTATION = (int(args.rotation[0:2]), int(args.rotation[3:5]))
    else:
        ROTATION = None

    '''CREATE DIRS'''
    experiment_dir = Path('./experiment/')
    experiment_dir.mkdir(exist_ok=True)
    result_dir = Path(args.result_dir)
    result_dir.mkdir(exist_ok=True)
    checkpoints_dir = Path('./experiment/checkpoints/')
    checkpoints_dir.mkdir(exist_ok=True)
    log_dir = Path(args.log_dir)
    log_dir.mkdir(exist_ok=True)

    '''LOG'''
    logger = logging.getLogger("PointCapsNet")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler = logging.FileHandler(
        args.log_dir + 'train-' +
        str(datetime.datetime.now().strftime('%Y-%m-%d %H-%M')) + '.txt')
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logger.info(
        '---------------------------------------------------TRAINING---------------------------------------------------'
    )
    logger.info('PARAMETERS ...')
    logger.info(args)

    '''DATA LOADING'''
    logger.info('Load dataset ...')
    train_data, train_label, test_data, test_label = load_data(DATA_PATH)
    logger.info("The number of training samples is: %d", train_data.shape[0])
    logger.info("The number of test samples is: %d", test_data.shape[0])
    trainDataset = myDataset(train_data, train_label, meshmode="density",
                             rotation=ROTATION)
    if ROTATION is not None:
        print('The range of training rotation is', ROTATION)
    testDataset = myDataset(test_data, test_label, meshmode="density",
                            rotation=ROTATION)
    trainDataLoader = torch.utils.data.DataLoader(trainDataset,
                                                  batch_size=BATCHSIZE,
                                                  shuffle=True)
    testDataLoader = torch.utils.data.DataLoader(testDataset,
                                                 batch_size=BATCHSIZE,
                                                 shuffle=False)

    '''MODEL LOADING'''
    model = PointCapsNet(INPUT_SIZE, NUM_CLASS, ROUTING_ITER).cuda()
    if args.pretrain is not None:
        print('Using pretrained model...')
        logger.info('Using pretrained model')
        checkpoint = torch.load(args.pretrain)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['model_state_dict'])
    else:
        print('No existing model, starting training from scratch...')
        start_epoch = 0
    print(model)
    print('Number of Parameters: %d' % model.n_parameters())

    criterion = PointCapsNetLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE,
                                 betas=(0.9, 0.999), eps=1e-08)
    global_epoch = 0
    global_step = 0
    best_tst_accuracy = 0.0
    history = defaultdict(lambda: list())

    '''TRAINING'''
    logger.info('Start training...')
    total_train_acc = []
    total_test_acc = []
    for epoch in range(start_epoch, EPOCH):
        print('Epoch %d (%d/%s):' % (global_epoch + 1, epoch + 1, EPOCH))
        logger.info('Epoch %d (%d/%s):', global_epoch + 1, epoch + 1, EPOCH)
        for batch_id, (x, y) in tqdm(enumerate(trainDataLoader),
                                     total=len(trainDataLoader),
                                     smoothing=0.9):
            optimizer = exponential_decay(optimizer, LEARNING_RATE,
                                          global_epoch, 1, DECAY_RATE)
            x = Variable(x).float().cuda()
            y = Variable(y.squeeze()).cuda()
            y_pred, x_reconstruction = model(x, y)
            loss, margin_loss, reconstruction_loss = criterion(
                x, y, x_reconstruction, y_pred.cuda(), NUM_CLASS)
            history['margin_loss'].append(margin_loss.cpu().data.numpy())
            history['reconstruction_loss'].append(
                reconstruction_loss.cpu().data.numpy())
            history['loss'].append(loss.cpu().data.numpy())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            global_step += 1

        train_metrics, train_hist_acc = test(
            model, trainDataLoader) if COMPUTE_TRAIN_METRICS else (None, [])
        test_metrics, test_hist_acc = test(model, testDataLoader)
        total_train_acc += train_hist_acc
        total_test_acc += test_hist_acc

        print('Margin Loss: %f' % history['margin_loss'][-1])
        logger.info('Margin Loss: %f', history['margin_loss'][-1])
        print('Reconstruction Loss: %f' % history['reconstruction_loss'][-1])
        logger.info('Reconstruction Loss: %f',
                    history['reconstruction_loss'][-1])
        print('Loss: %f' % history['loss'][-1])
        logger.info('Loss: %f', history['loss'][-1])
        if COMPUTE_TRAIN_METRICS:
            print('Train Accuracy: %f' % (train_metrics['accuracy']))
            logger.info('Train Accuracy: %f', (train_metrics['accuracy']))
        print('Test Accuracy: %f' % test_metrics['accuracy'])
        logger.info('Test Accuracy: %f', test_metrics['accuracy'])

        # TODO show reconstruction mesh
        # idx = np.random.randint(0, len(x))
        # show_example(x[idx], y[idx], x_reconstruction[idx], y_pred[idx],
        #              args.result_dir, 'Epoch_{}'.format(epoch))

        if (test_metrics['accuracy'] >= best_tst_accuracy) and epoch > 5:
            best_tst_accuracy = test_metrics['accuracy']
            logger.info('Save model...')
            save_checkpoint(
                global_epoch + 1,
                train_metrics['accuracy'] if COMPUTE_TRAIN_METRICS else 0.0,
                test_metrics['accuracy'], model, optimizer,
                str(checkpoints_dir))
        global_epoch += 1

    logger.info('End of training...')
    n_points_avg = 10
    n_points_plot = 1000
    plt.figure(figsize=(20, 10))
    plot_loss_curve(history, n_points_avg, n_points_plot, str(result_dir))
    plot_acc_curve(total_train_acc, total_test_acc, str(result_dir))
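# exponential_decay is imported from elsewhere in this repo. A minimal sketch of
# the usual scheme, assuming lr = base_lr * decay_rate ** (epoch / decay_steps);
# the signature matches the call above, but the body is an assumption:
def exponential_decay(optimizer, base_lr, epoch, decay_steps, decay_rate):
    lr = base_lr * decay_rate ** (epoch / decay_steps)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr  # update the learning rate in place
    return optimizer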
''' Net Settings '''
In_Nodes = 5567       ### number of genes
Pathway_Nodes = 860   ### number of pathways
Hidden_Nodes = 100    ### number of hidden nodes
Out_Nodes = 30        ### number of hidden nodes in the last hidden layer

''' Initialize '''
Initial_Learning_Rate = [0.03, 0.01, 0.001, 0.00075]
L2_Lambda = [0.1, 0.01, 0.005, 0.001]
num_epochs = 3000     ### for grid search
Num_EPOCHS = 20000    ### for training
### sub-network setup
Dropout_Rate = [0.7, 0.5]

''' load data and pathway '''
dtype = torch.FloatTensor  ### not defined in the original excerpt; assumed, as in the PASNet scripts below
pathway_mask = load_pathway("../data/pathway_mask.csv", dtype)
x_train, ytime_train, yevent_train, age_train = load_data(
    "../data/train.csv", dtype)
x_valid, ytime_valid, yevent_valid, age_valid = load_data(
    "../data/validation.csv", dtype)
x_test, ytime_test, yevent_test, age_test = load_data("../data/test.csv",
                                                      dtype)

opt_l2_loss = 0
opt_lr_loss = 0
opt_loss = torch.Tensor([float("Inf")])
### if gpu is being used
if torch.cuda.is_available():
    opt_loss = opt_loss.cuda()
###
opt_c_index_va = 0
opt_c_index_tr = 0
### grid search the optimal hyperparameters using train and validation data
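# The grid-search loop that these settings feed is cut off in this excerpt.
# A hedged sketch, modeled on the PASNet empirical search further below;
# train_survival_net is a stand-in name for the actual training routine:
for lr in Initial_Learning_Rate:
    for l2 in L2_Lambda:
        loss_va, c_index_tr, c_index_va = train_survival_net(
            x_train, age_train, ytime_train, yevent_train,
            x_valid, age_valid, ytime_valid, yevent_valid,
            pathway_mask, In_Nodes, Pathway_Nodes, Hidden_Nodes, Out_Nodes,
            lr, l2, num_epochs, Dropout_Rate)
        if loss_va < opt_loss:  # keep the hyperparameters with the lowest validation loss
            opt_l2_loss, opt_lr_loss, opt_loss = l2, lr, loss_va
            opt_c_index_tr, opt_c_index_va = c_index_tr, c_index_va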
import argparse
import os

from Model import MyModel
from DataLoader import load_data, train_valid_split, load_testing_images
from Configure import model_configs, training_configs

parser = argparse.ArgumentParser()
parser.add_argument("--mode", help="train, test or predict")
parser.add_argument("--data_dir", help="path to the data")
parser.add_argument("--test_file", help="path to the test file")
parser.add_argument("--save_dir", help="path to save the results")
args = parser.parse_args()

if __name__ == '__main__':
    model = MyModel(model_configs, training_configs)

    if args.mode == 'train':
        x_train, y_train, x_test, y_test = load_data(args.data_dir)
        x_train, y_train, x_valid, y_valid = train_valid_split(
            x_train, y_train)
        model.train(x_train, y_train, x_valid, y_valid)
        model.save_weights(
            os.path.join(args.save_dir, model_configs["version"], ""))
        model.evaluate(x_test, y_test)
    elif args.mode == 'test':
        # Testing on the public testing dataset
        model.load_weights(
            os.path.join(args.save_dir, model_configs["version"], ""))
        _, _, x_test, y_test = load_data(args.data_dir)
        model.evaluate(x_test, y_test)
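# train_valid_split is imported from DataLoader but its body is not shown. A
# minimal sketch matching the four-value call above; the 90/10 ratio and the
# absence of shuffling are assumptions (other scripts in this collection use a
# different variant of this helper):
def train_valid_split(x, y, train_ratio=0.9):
    split = int(len(x) * train_ratio)
    return x[:split], y[:split], x[split:], y[split:]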
import argparse

import torch

import utils
# module paths below are assumed to match the companion training script
from Model import MyModel
from DataLoader import load_data, train_valid_split
from Configure import model_configs, training_configs

parser = argparse.ArgumentParser()
parser.add_argument("mode", help="train, test or predict")
parser.add_argument("data_dir", help="path to the data")
parser.add_argument("--save_dir", help="path to save the results")
parser.add_argument("--resume_checkpoint",
                    help=".pth checkpoint file to resume")
parser.add_argument("--checkpoint",
                    help=".pth checkpoint file to use for evaluation")
args = parser.parse_args()

device = 'cuda' if torch.cuda.is_available() else 'cpu'

if __name__ == '__main__':
    model = MyModel(model_configs)

    if args.mode == 'train':
        print('----- training mode ----')
        # augment the train data according to the config
        train, test, orig_trainset = load_data(
            args.data_dir, train_aug=training_configs['train_augmentation'])
        train, valid = train_valid_split(train, orig_trainset, train_ratio=1)
        if args.resume_checkpoint is not None:
            checkpoint = torch.load('../saved_models/' + args.resume_checkpoint)
            epoch, accuracy_type, prev_accuracy = (
                checkpoint[k] for k in ['epoch', 'accuracy_type', 'accuracy'])
            print('RESUME---> Loading model from Epoch %d with %s Accuracy %f'
                  % (epoch, accuracy_type, prev_accuracy))
        else:
            checkpoint = None
        # note: test data is used only to evaluate model performance during training
        model.train(train, training_configs, valid=None, test=test,
                    checkpoint=checkpoint)
        model.evaluate(test)
    elif args.mode == 'test':
        # Testing on the public testing dataset
        _, test, _ = load_data(args.data_dir, None)
        if args.checkpoint is not None:
''' PASNet Settings '''
In_Nodes = 4359       ### number of genes
Pathway_Nodes = 574   ### number of pathways
Hidden_Nodes = 100    ### number of hidden nodes
Out_Nodes = 2         ### one is for LTS, and the other is for non-LTS

''' Initial Settings for Empirical Search '''
Learning_Rates = [0.05, 0.01, 0.007, 0.005, 0.001, 0.0007, 0.0005, 0.0001]
L2_Lambdas = [3e-4, 5e-4, 7e-4, 1e-3, 3e-3, 5e-3]
Dropout_Rates = [0.8, 0.7]  ### sub-network setup
nEpochs = 5000              ### for empirical search

''' load data and pathway '''
dtype = torch.FloatTensor
pathway_mask = load_pathway("data/gbm_binary_pathway_mask_reactome_574.csv",
                            dtype)
### loaded data were split for empirical search only
x_train, y_train = load_data("data/std_train.csv", dtype)
x_valid, y_valid = load_data("data/std_valid.csv", dtype)

opt_l2 = 0
opt_lr = 0
opt_loss = torch.Tensor([float("Inf")])
### if gpu is being used
if torch.cuda.is_available():
    opt_loss = opt_loss.cuda()
###

### grid search the optimal hyperparameters using train and validation data
for lr in Learning_Rates:
    for l2 in L2_Lambdas:
        pred_tr, pred_val, loss_tr, loss_val = trainPASNet(x_train, y_train, x_valid, y_valid, pathway_mask, \
                                                           In_Nodes, Pathway_Nodes, Hidden_Nodes, Out_Nodes, \
checkpoint = torch.load('./checkpoint/ckpt.pth')
net.load_state_dict(checkpoint['net'])  # load once; the duplicated load-and-print was redundant
best_acc = checkpoint['acc']
start_epoch = checkpoint['epoch']

if args.mode == 'train':
    x_train, y_train = load_data(args.data_dir)
    x_train, y_train, x_valid, y_valid = train_valid_split(x_train, y_train)
    trainset = batch_data_process(x_train, x_valid, y_train, y_valid,
                                  training=True, validation=False,
                                  testing=False)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=100,
                                              shuffle=True, num_workers=0)
    validset = batch_data_process(x_train, x_valid, y_train, y_valid,
                                  training=False, validation=True,
                                  testing=False)
    validloader = torch.utils.data.DataLoader(validset, batch_size=100,
                                              shuffle=False, num_workers=0)
    for epoch in range(start_epoch, start_epoch + max_epoch):
        model.train(trainloader, epoch)
        model.evaluate(validloader, epoch)
Dropout_Rates = [0.8, 0.7]  ### sub-network setup

''' load data and pathway '''
pathway_mask = load_pathway("data/gbm_binary_pathway_mask_reactome_574.csv",
                            dtype)

N = 10  # number of repeated times
K = 5   # number of folds
opt_lr = 1e-4
opt_l2 = 3e-4
test_auc = []
test_f1 = []
for replicate in range(N):
    for fold in range(K):
        print("replicate: ", replicate, "fold: ", fold)
        x_train, y_train = load_data(
            "data/std_train_" + str(replicate) + "_" + str(fold) + ".csv",
            dtype)
        x_test, y_test = load_data(
            "data/std_test_" + str(replicate) + "_" + str(fold) + ".csv",
            dtype)
        pred_train, pred_test, loss_train, loss_test = trainPASNet(x_train, y_train, x_test, y_test, pathway_mask, \
                                                                   In_Nodes, Pathway_Nodes, Hidden_Nodes, Out_Nodes, \
                                                                   opt_lr, opt_l2, nEpochs, Dropout_Rates,
                                                                   optimizer="Adam")
        ### if gpu is being used, transferring back to cpu
        if torch.cuda.is_available():
            pred_test = pred_test.cpu().detach()
        ###
        np.savetxt("PASNet_pred_" + str(replicate) + "_" + str(fold) + ".txt",
                   pred_test.numpy(), delimiter=",")
        auc_te = auc(y_test, pred_test)
from Network3 import *
from DataLoader import load_data
from LayersWrapper import *
import numpy as np
import pickle
import matplotlib.pyplot as plt

#### Virat
### Visor
### http://clickdamage.com/sourcecode/cv_datasets.php

# test
training_data, mnist_validation_data, test_data = load_data()
# expanded_data, _, _ = load_data("data/mnist_expanded.pkl.gz")

with open("data/dogs_and_snakes.pkl", 'rb') as f:
    training_data, validation_data = pickle.load(f, encoding='latin1')

# keep the first 150x200 pixels of each image and truncate the sets
t1 = training_data[0]
t2 = training_data[1]
t1 = [i[0:150 * 200] for i in t1]
print(len(t1))
training_data = [t1[:2000], t2[:2000]]

t1 = validation_data[0]
t2 = validation_data[1]
t1 = [i[0:150 * 200] for i in t1]
validation_data = [t1[:300], t2[:300]]

ar = t1[10]
ar = np.reshape(ar, newshape=(150, 200))
##### Net
In_Nodes = 24803        ### number of omics
Gene_Nodes = 5481       ### number of genes
Pathway_Nodes = 507     ### number of pathways
Hidden_Nodes = [22, 5]  ### number of hidden nodes

##### Initials
max_epochs = 10000
Drop_Rate = [0.7, 0.5]  ### dropout rates

''' load data '''
folder_path = "/home/NewUsersDir/jhao2/data/proposed/"
pathway_indices = load_sparse_indices(folder_path +
                                      "gbm_binary_pathway_mask.npz")
gene_indices = load_sparse_indices(folder_path + "gbm_binary_gene_mask.npz")
x_train, ytime_train, yevent_train, age_train = load_data(
    folder_path + "gbm_std_imputed_train_" + str(REPID) + ".csv")
x_valid, ytime_valid, yevent_valid, age_valid = load_data(
    folder_path + "gbm_std_imputed_valid_" + str(REPID) + ".csv")

### grid search the optimal hyperparameters using train and validation data
L2_Lambda = [0.01, 0.02, 0.04, 0.08, 0.10, 0.12]
Initial_Learning_Rate = [1e-2, 5e-3, 1e-3]
opt_cidx = 0.0
for lr in Initial_Learning_Rate:
    for l2 in L2_Lambda:
        print("L2: ", l2, "LR: ", lr)
        c_index_tr, c_index_va = train_omics_net(x_train, age_train, ytime_train, yevent_train, \
                                                 x_valid, age_valid, ytime_valid, yevent_valid, \
                                                 gene_indices, pathway_indices, \
                                                 In_Nodes, Gene_Nodes, Pathway_Nodes, Hidden_Nodes, \
Pathway_Nodes = 659  ### number of pathways
Image_Nodes = 50     ### number of aggregated feature maps
Hidden_Nodes = [100, 30, 30]  ### number of hidden nodes in hidden, hidden 2, and hidden 3
Max_Epochs = 1500    ### maximum number of epochs in training
### sub-network setup
Drop_Rate = [0.7, 0.5, 0.7]  ### dropout rates of pathway, hidden, image layers

''' load data and pathway '''
folder_path = "/home/NewUsersDir/jhao2/Integrative/src/data/"
pathway_indices = load_sparse_indices(folder_path + "binary_pathway_mask.npz")

C_index = []
for REPID in range(20):
    print("-----Split ", REPID)
    x_tr, y_tr, pt_tr, delta_tr, age_tr = load_data(
        folder_path + "std_train_genomic_" + str(REPID) + ".csv",
        folder_path + "norm_image_train_" + str(REPID) + ".csv")
    x_va, y_va, pt_va, delta_va, age_va = load_data(
        folder_path + "std_valid_genomic_" + str(REPID) + ".csv",
        folder_path + "norm_image_valid_" + str(REPID) + ".csv")
    x_te, y_te, pt_te, delta_te, age_te = load_data(
        folder_path + "std_test_genomic_" + str(REPID) + ".csv",
        folder_path + "norm_image_test_" + str(REPID) + ".csv")

    ### grid search the optimal hyperparameters using train and validation data
    L2_Lambda = [0.1, 0.2, 0.3, 0.4, 0.5]
    Initial_Learning_Rate = [0.001, 0.0015, 0.0001, 0.00015]
    opt_cidx = 0.0
    for lr in Initial_Learning_Rate:
        for l2 in L2_Lambda:
            print("Sparse coding is on")
            print("L2: ", l2, "LR: ", lr)
            torch.cuda.empty_cache()
            tr_cindex, va_cindex = train_model(x_tr, age_tr, pt_tr, y_tr, delta_tr, \
                                               x_va, age_va, pt_va, y_va, delta_va, \
                                               x_te, age_te, pt_te, y_te, delta_te, \
                                               pathway_indices, \
                                               Gene_Nodes, Pathway_Nodes, Image_Nodes, Hidden_Nodes, \
def crossover(parents: np.ndarray, crossover_rate):
    # select rows that will take part in crossover; an even count is needed for pairing
    xover_rows = np.random.choice([False, True], size=parents.shape[0],
                                  p=[1 - crossover_rate, crossover_rate])
    if xover_rows.sum() % 2 != 0:
        xover_rows[np.argwhere(xover_rows == False)[0]] = True
    xover_parents = parents[xover_rows]
    # np.random.random_integers was removed from NumPy; randint(0, 2) draws the
    # same {0, 1} bits, and integer division keeps the shape an int
    xover_bits = np.random.randint(
        0, 2, size=(xover_parents.shape[0] // 2,
                    xover_parents.shape[1])).repeat(2, axis=0)
    xover_bits[1::2] = 1 - xover_bits[1::2]
    xover_children = np.empty(shape=xover_parents.shape,
                              dtype=xover_parents.dtype)
    xover_parents1, xover_parents2 = (xover_parents * xover_bits,
                                      xover_parents * (1 - xover_bits))
    xover_children[0::2] = xover_parents1[0::2] + xover_parents1[1::2]
    xover_children[1::2] = xover_parents2[0::2] + xover_parents2[1::2]
    # return the recombined children (not the parents) plus the untouched rows
    return np.vstack((xover_children, parents[~xover_rows]))


def mutate(children: np.ndarray, mutation_rate):
    for row in range(0, children.shape[0], 2):
        if np.random.binomial(1, mutation_rate):
            mut_pos = np.random.randint(1, children.shape[1])  # bit to flip
            children[row, mut_pos] = 1 - children[row, mut_pos]


if __name__ == '__main__':
    (X_train, y_train), (X_test, y_test) = load_data(red_size=0.1)
    best_model = solve(X_train, y_train, X_test, y_test)
    model = build_network(best_model)
    print(model.summary())
    model.compile(optimizer='adadelta', loss='categorical_crossentropy')
    # nb_epoch was renamed to epochs in Keras 2
    model.fit(X_train, y_train, batch_size=2048, epochs=20)
    print(model.evaluate(X_test, y_test))
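# solve and build_network are defined elsewhere. A hedged sketch of the kind of
# generational loop that crossover and mutate support; the fitness evaluation
# and fitness-proportionate selection here are assumptions, not the original
# implementation:
def solve_sketch(population, fitness_fn, generations=10,
                 crossover_rate=0.7, mutation_rate=0.05):
    for _ in range(generations):
        scores = np.array([fitness_fn(ind) for ind in population])
        probs = scores / scores.sum()  # assumes positive fitness values
        idx = np.random.choice(len(population), size=len(population), p=probs)
        children = crossover(population[idx], crossover_rate)
        mutate(children, mutation_rate)  # flips bits in place
        population = children
    return population[np.argmax([fitness_fn(ind) for ind in population])]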
def train(data_path, epoch, batch_size, hidden_size, embedding_dim,
          testing=False):
    print('Loading data...')
    train, valid, test = load_data(data_path, valid_portion=0.1)

    mrr_list, recall_list, loss_list = [], [], []
    train_data = RecSysDataset(train)
    valid_data = RecSysDataset(valid)
    test_data = RecSysDataset(test)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True,
                              collate_fn=collate_fn)
    valid_loader = DataLoader(valid_data, batch_size=batch_size,
                              shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False,
                             collate_fn=collate_fn)

    n_items = 37484
    model = NARM(n_items, hidden_size=hidden_size,
                 embedding_dim=embedding_dim,
                 batch_size=batch_size).to(device)

    if testing:
        ckpt = torch.load('latest_checkpoint.pth.tar')
        model.load_state_dict(ckpt['state_dict'])
        model.eval()
        recall, mrr = validate(test_loader, model)
        print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(
            topk, recall, topk, mrr))
        return

    optimizer = optim.Adam(model.parameters(), 1e-3)
    criterion = nn.CrossEntropyLoss()
    scheduler = StepLR(optimizer, step_size=80, gamma=0.1)

    for e in tqdm(range(epoch)):
        # train for one epoch
        scheduler.step(epoch=e)
        sum_loss = trainForEpoch(train_loader, model, optimizer, e, epoch,
                                 criterion)
        # report the current epoch e, not the total epoch count, in the first slot
        print('[TRAIN] epoch %d/%d avg loss %.4f'
              % (e + 1, epoch, sum_loss / len(train_loader.dataset)))

        recall, mrr = validate(valid_loader, model)
        recall_list.append(recall)
        mrr_list.append(mrr)
        print('Epoch {} validation: Recall@{}: {:.4f}, MRR@{}: {:.4f} \n'.format(
            e, topk, recall, topk, mrr))

        # store best loss and save a model checkpoint
        ckpt_dict = {
            'epoch': e + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        torch.save(ckpt_dict, here + f'/checkpoint_{e}.pth.tar')

    return mrr_list, recall_list, loss_list
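# collate_fn is referenced above but not defined here. A hedged sketch of the
# usual padding collate for variable-length session sequences: sort by length,
# zero-pad, and return lengths so the model can pack the batch. The padding
# value and the exact return shape are assumptions about this codebase:
def collate_fn(batch):
    # batch: list of (session_items, target_item) pairs
    batch.sort(key=lambda pair: len(pair[0]), reverse=True)
    lens = [len(seq) for seq, _ in batch]
    padded = torch.zeros(len(batch), max(lens)).long()
    for i, (seq, _) in enumerate(batch):
        padded[i, :lens[i]] = torch.LongTensor(seq)
    labels = torch.LongTensor([target for _, target in batch])
    return padded, labels, lens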