def main():
    """Train a poetry-generation RNN and sample a poem after training."""
    config = Config()
    rawdata = loadRawData(config.train_data_path)
    data = processRawData(rawdata)
    vocabList = createVocabList(data)
    # Hold out 10% of the data for evaluation; 42 seeds the split shuffle.
    train_x, test_x = data_split(data, 0.1, 42)
    torch.manual_seed(64)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = PoetryModel(config.embed_size, config.hidden_size, vocabList, device).to(device)
    # Best score ('Hp') seen so far; inf until a checkpoint supplies one.
    hp = float('inf')
    if config.flag_load_model:
        # Resume from checkpoint: restore weights and the best recorded score.
        checkpoint = torch.load(config.params_path)
        model.load_state_dict(checkpoint['model_dict'])
        hp = checkpoint['Hp']
    # NOTE(review): momentum is set to config.lr — this looks like a copy-paste
    # bug (momentum equal to the learning rate); presumably a dedicated
    # momentum setting was intended. Confirm against Config before changing.
    optimizer = optim.SGD(model.parameters(), lr=config.lr, momentum=config.lr)
    # Log parameter names/shapes as a quick architecture sanity check.
    for key, value in model.named_parameters():
        print(key, value.shape)
    train(train_x, test_x, model, optimizer, device, config, Hp=hp)
    # Generate a sample poem of 24 (units as defined by the model) after training.
    model.generate_poetry(24)
def main():
    """Train an RNN sentiment classifier on the tsv review data and evaluate it.

    Loads the train/test files, builds one reproducible vocabulary over both,
    trains (or restores) the model, reports accuracy on the train and held-out
    splits, and writes the Kaggle submission file.
    """
    print("rnn algorithm")
    train_data, labels = loadDataSet("./data/train.tsv")
    test_data, _ = loadDataSet('./data/test.tsv', 1)
    train_x, test_x, train_y, test_y = data_split(train_data, labels, 0.1, 42)
    # Longest review length over all files:
    # max_sent_len = 56
    # Words appearing in the training samples.
    vocabListTrainData = createVocabList(train_data)
    # Words appearing in the test samples.
    vocabListTestData = createVocabList(test_data)
    # Use every word from both vocabularies. A past mistake: using the raw set
    # union directly — sets are hash-based and unordered, so every fresh run
    # produced a different word->embedding mapping. Sorting fixes the order.
    vocabList = sorted(vocabListTrainData | vocabListTestData)
    use_cuda = torch.cuda.is_available()
    torch.manual_seed(64)
    device = torch.device("cuda" if use_cuda else "cpu")
    # Hyper-parameters.
    batch, epoch = 64, 8
    embed_size, hidden_size = 100, 50
    model = RNN(embed_size, hidden_size, vocabList, device).to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
    # flag == 0: train from scratch; otherwise restore saved parameters.
    flag = 0
    if flag == 0:
        started = time.time()
        train(model, device, train_x, train_y, optimizer, epoch, batch, 0.2)
        finished = time.time()
        print("train time is : ", (finished - started) / 60.)
    else:
        model.load_state_dict(torch.load('./data/rnn_params.pth'))
    # Accuracy on the training split, then on the held-out split.
    test(model, device, train_x, train_y)
    test(model, device, test_x, test_y)
    kaggleTest(model, './data/kaggleData.csv')
def build_tree(self, data_array):
    '''
    Recursively grow a decision tree over categorical rows.

    input: current tree node object, data_array (rows whose last column
           is the class label)
    return: decision tree: linked list of nodes.

    Greedily picks the (feature, value) pair with the highest information
    gain, recurses into the two resulting subsets, or becomes a leaf when
    no split improves purity.
    '''
    base_entropy = entropy(data_array)
    n_features = len(data_array[0]) - 1
    total = float(len(data_array))

    best_gain = 0.0
    best_criteria = None
    best_sets = None

    # Evaluate every (feature, value) split candidate.
    for feature in range(n_features):
        # Unique values this feature takes in the current subset.
        candidate_values = {row[feature] for row in data_array}
        for value in candidate_values:
            true_rows, false_rows = data_split(data_array, feature, value)
            p_true = float(len(true_rows)) / total
            p_false = float(len(false_rows)) / total
            gain = (base_entropy
                    - p_true * entropy(true_rows)
                    - p_false * entropy(false_rows))
            # Keep the best gain among splits that leave both sides non-empty.
            if gain > best_gain and true_rows and false_rows:
                best_gain = gain
                best_criteria = (feature, value)
                best_sets = (true_rows, false_rows)

    if best_gain > 0.0:
        # Impure node: record the winning split and recurse into both branches.
        self.feature_index, self.feature_value = best_criteria
        if not self.true:
            self.true = DecisionTree()
        self.true.build_tree(best_sets[0])
        if not self.false:
            self.false = DecisionTree()
        self.false.build_tree(best_sets[1])
    else:
        # No informative split remains: leaf holding the label counts.
        self.class_label = label_counts(data_array)
def main():
    """Train and evaluate an RNN tagger on the CoNLL-2003 NER data."""
    # Tag-name -> integer-id mapping, including the sequence START/END markers.
    name2id = {'START':0, 'I-MISC':1, 'B-MISC':2, 'I-LOC':3, 'B-LOC':4,
               'I-ORG':5, 'B-ORG':6, 'I-PER':7, 'O':8, 'END':9}
    train_data_path = './data/conll2003/eng.train'
    params_path = './data/rnn_params.pth'
    data, labels = loadData(train_data_path)
    # Encode every tag name to its id, sentence by sentence.
    labels = [[name2id[name] for name in sents] for sents in labels]
    vocabList = createVocabList(data)
    train_x, test_x, train_y, test_y = data_split(data, labels, 0.1, 42)
    torch.manual_seed(64)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Hyper-parameters.
    batch, epoch = 32, 4
    embed_size, hidden_size = 100, 50
    flag_load_model = 1
    n_label = len(name2id)
    # Best correctness rate seen so far; -1 until a checkpoint supplies one.
    corate = -1
    model = RNN(embed_size, hidden_size, n_label, vocabList, device).to(device)
    if flag_load_model:
        # Resume: restore weights and the best recorded rate.
        checkpoint = torch.load(params_path)
        model.load_state_dict(checkpoint['model_dict'])
        corate = checkpoint['corate']
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    # Log parameter names/shapes as a quick architecture sanity check.
    for param_name, param in model.named_parameters():
        print(param_name, param.shape)
    train(train_x, train_y, test_x, test_y, model, optimizer, device,
          epoch, batch, params_path, corate=corate)
    print(model.transition)
    test(test_x, test_y, model, device)
Data_dir=fcn_setting['Data_dir'], patch_size=fcn_setting['patch_size'], exp_idx=exp_idx, seed=seed, model_name='fcn', metric='accuracy') fcn.train(lr=fcn_setting['learning_rate'], epochs=fcn_setting['train_epochs']) fcn.test_and_generate_DPMs() if __name__ == "__main__": config = read_json('./config.json') seed, repe_time = 1000, config[ 'repeat_time'] # if you only want to use 1 data split, set repe_time = 1 # data_split function splits ADNI dataset into training, validation and testing for several times (repe_time) data_split(repe_time=repe_time) # to perform FCN training ##################################### with torch.cuda.device(2): # specify which gpu to use fcn_main( seed ) # each FCN model will be independently trained on the corresponding data split # to perform CNN training ##################################### with torch.cuda.device(2): # specify which gpu to use cnn_main( seed ) # each CNN model will be independently trained on the corresponding data split
# newline='', 就不会产生空行 with open(filePath, 'w', newline='') as f: writer = csv.writer(f) writer.writerow(['PhraseId', 'Sentiment']) writer.writerows(kaggle_data) if __name__ == "__main__": print("bayes algrithm") train_data, labels = loadDataSet("./data/train.tsv") maxLen = 0 for it in train_data: maxLen = max(maxLen, len(it)) print('the max len is : ', maxLen) train_x, test_x, train_y, test_y = data_split(train_data, labels, 0.1, 42) vocabList = createVocabList(train_x) train_x_vec = [] print('change train data to vector.') for i, it in tqdm(enumerate(train_x)): train_x_vec.append(bagOfWord2Vec(vocabList, it)) pw, pc = train(np.array(train_x_vec), np.array(train_y)) test_x_vec = [] print('change test data to vector') for i, it in tqdm(enumerate(test_x)): test_x_vec.append(bagOfWord2Vec(vocabList, it)) # test(np.array(test_x_vec), np.array(test_y), pw, pc) # kaggleTest(np.array(test_x_vec), pw, pc, './data/kaggleData.csv') test_data, labels = loadDataSet("./data/test.tsv", 1)
# --- UNet training setup: resolve paths/hyper-parameters and build generators ---
# NOTE(review): `namespace` comes from scope outside this chunk (presumably an
# argparse Namespace) — confirm against the surrounding file.
mode = namespace.name                      # dataset / experiment mode name
d = namespace.d                            # grid edge length; volumes are d x d x d
path = os.path.join("data", mode, "matrices")
input_shape = (d, d, d, 4)                 # 4 channels per voxel — TODO confirm channel meaning
samples = namespace.samples
epochs = namespace.epochs
weights_dir = os.path.join("saved_models", "unet", mode)
os.makedirs(weights_dir, exist_ok=True)
os.makedirs(os.path.join("output/unet", mode), exist_ok=True)
# Checkpoint file holding the best weights found during training.
weights = os.path.join(weights_dir, "unet_weights_" + mode + ".best.hdf5")
lr = namespace.lr
batch_size = namespace.batch_size

# Split the data
training_ids, validation_ids = data_split(
    path, samples, frac=namespace.split, n_rot=namespace.nrot
)

# Both generators read from the same directory; only the id lists differ.
training_generator = UnetDataGenerator(
    training_ids,
    data_path=path,
    batch_size=batch_size,
    n_channels=input_shape[-1],
    shuffle=True,
)
validation_generator = UnetDataGenerator(
    validation_ids,
    data_path=path,
    batch_size=batch_size,
    n_channels=input_shape[-1],
    shuffle=True,
)
def train_supervised():
    """Grid-search (lr, weight_decay) for a GAT over 10 data splits.

    For each hyper-parameter pair, trains a fresh model on each of 10 splits
    with validation-based early stopping, then reports the best mean test
    accuracy found over the whole grid.

    Relies on module-level globals: args, features, labels, g, heads,
    plus helpers data_split, GAT, accuracy, test.
    """
    patience = 50  # early-stopping patience: epochs without val improvement
    best_result = 0
    best_std = 0
    best_dropout = None
    best_weight_decay = None
    best_lr = None
    best_time = 0
    best_epoch = 0
    # Hyper-parameter grids.
    lr = [0.05, 0.01, 0.002]  #,0.01,
    weight_decay = [1e-4, 5e-4, 5e-5, 5e-3]  #5e-5,1e-4,5e-4,1e-3,5e-3
    # NOTE(review): this dropout grid is declared but never iterated — only
    # (lr, weight_decay) pairs are searched; args.dropout is used as given.
    dropout = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    # Assigning directly into args.* so downstream code reads the current pair.
    for args.lr, args.weight_decay in itertools.product(lr, weight_decay):
        result = np.zeros(10)  # test accuracy per split
        t_total = time.time()
        num_epoch = 0          # total epochs run across all 10 splits
        for idx in range(10):
            #idx_train, idx_val, idx_test = rand_train_test_idx(labels)
            #idx_train, idx_val, idx_test = random_disassortative_splits(labels, num_class)
            idx_train, idx_val, idx_test = data_split(idx, args.dataset_name)
            #rank = OneVsRestClassifier(LinearRegression()).fit(features[idx_train], labels[idx_train]).predict(features)
            #print(rank)
            #adj = reconstruct(old_adj, rank, num_class)
            # Fresh model per split so runs are independent.
            model = GAT(num_layers=args.layers,
                        in_dim=features.shape[1],
                        num_hidden=args.hidden,
                        num_classes=labels.max().item() + 1,
                        heads=heads,
                        dropout=args.dropout)
            #model = TwoCPPooling(in_fea=features.shape[1], out_class=labels.max().item() + 1, hidden1=2*args.hidden, hidden2=args.hidden, dropout=args.dropout)
            if args.cuda:
                #adj = adj.cuda()
                idx_train = idx_train.cuda()
                idx_val = idx_val.cuda()
                idx_test = idx_test.cuda()
                model.cuda()
            optimizer = optim.Adam(model.parameters(),
                                   lr=args.lr, weight_decay=args.weight_decay)
            # Early-stopping state for this split.
            vlss_mn = np.inf   # lowest validation loss seen so far
            vacc_mx = 0.0      # highest validation accuracy seen so far
            vacc_early_model = None
            vlss_early_model = None
            curr_step = 0      # epochs since the last validation improvement
            best_test = 0
            best_training_loss = None
            for epoch in range(args.epochs):
                num_epoch = num_epoch + 1
                t = time.time()
                model.train()
                optimizer.zero_grad()
                output = model(g, features)
                #print(F.softmax(output,dim=1))
                output = F.log_softmax(output, dim=1)
                #print(output)
                loss_train = F.nll_loss(output[idx_train], labels[idx_train])
                acc_train = accuracy(output[idx_train], labels[idx_train])
                loss_train.backward()
                optimizer.step()
                if not args.fastmode:
                    # Evaluate validation set performance separately,
                    # deactivates dropout during validation run.
                    model.eval()
                    output = model(g, features)
                    output = F.log_softmax(output, dim=1)
                val_loss = F.nll_loss(output[idx_val], labels[idx_val])
                val_acc = accuracy(output[idx_val], labels[idx_val])
                # Improvement on either metric resets patience; improvement on
                # BOTH records the model's test score as the split's candidate.
                if val_acc >= vacc_mx or val_loss <= vlss_mn:
                    if val_acc >= vacc_mx and val_loss <= vlss_mn:
                        vacc_early_model = val_acc
                        vlss_early_model = val_loss
                        best_test = test(model, idx_train, idx_val, idx_test)
                        best_training_loss = loss_train
                    vacc_mx = val_acc
                    vlss_mn = val_loss
                    curr_step = 0
                else:
                    curr_step += 1
                    if curr_step >= patience:
                        break
            print(
                "Optimization Finished! Best Test Result: %.4f, Training Loss: %.4f"
                % (best_test, best_training_loss))
            #model.load_state_dict(state_dict_early_model)
            # Testing
            result[idx] = best_test
            # Free the model before the next split to keep GPU memory bounded.
            del model, optimizer
            if args.cuda:
                torch.cuda.empty_cache()
        five_epochtime = time.time() - t_total
        print("Total time elapsed: {:.4f}s, Total Epoch: {:.4f}".format(
            five_epochtime, num_epoch))
        print(
            "learning rate %.4f, weight decay %.6f, dropout %.4f, Test Result: %.4f"
            % (args.lr, args.weight_decay, args.dropout, np.mean(result)))
        # Track the best hyper-parameter pair by mean test accuracy.
        if np.mean(result) > best_result:
            best_result = np.mean(result)
            best_std = np.std(result)
            #best_dropout = args.dropout
            best_weight_decay = args.weight_decay
            best_lr = args.lr
            best_time = five_epochtime
            best_epoch = num_epoch
    # NOTE(review): dropout is printed as a literal 0 here, and Time/Run divides
    # by 5 although 10 splits are run — confirm both are intentional.
    print(
        "Best learning rate %.4f, Best weight decay %.6f, dropout %.4f, Test Mean: %.4f, Test Std: %.4f, Time/Run: %.4f, Time/Epoch: %.4f"
        % (best_lr, best_weight_decay, 0, best_result, best_std,
           best_time / 5, best_time / best_epoch))
# --- VAE evaluation setup: weight paths, data split, validation generator ---
# NOTE(review): `namespace`, `mode`, `data_path`, `csv_path`, `ncond` and
# `input_shape` come from scope outside this chunk — confirm in the full file.
n = namespace.samples
batch_size = namespace.batch_size
eps = namespace.eps_frac
vae_weights = os.path.join("saved_models", "vae", mode,
                           "vae_weights_" + mode + ".best.hdf5")
unet_weights = os.path.join("saved_models", "unet", mode,
                            "unet_weights_" + mode + ".best.hdf5")
# NOTE(review): same basename as unet_weights but with a .h5 extension —
# verify both files actually exist on disk.
perceptual_model = os.path.join("saved_models", "unet", mode,
                                "unet_weights_" + mode + ".best.h5")
clustering_max_iters = namespace.clus_iters
os.makedirs(os.path.join("output", "eval", mode), exist_ok=True)

# Split the data; n_rot=0 disables rotation augmentation for evaluation.
training_ids, validation_ids = data_split(data_path, n, frac=namespace.split, n_rot=0)

validation_generator = VAEDataGenerator(
    validation_ids,
    data_path=data_path,
    property_csv=csv_path,
    batch_size=batch_size,
    n_channels=input_shape[-1],
    shuffle=False,  # deterministic order for evaluation
    n_bins=ncond,
)

# Create the VAE
vae = LatticeDFCVAE(perceptual_model=perceptual_model, cond_shape=ncond)
vae._set_model(weights=vae_weights, batch_size=batch_size)

# Create the Unet
def gyro_from_data(_, data):
    """Return the gyroscope part of *data*: the second element of the split."""
    split_parts = data_split(data)
    return split_parts[1]
def acc_from_data(_, data):
    """Return the accelerometer part of *data*: the first element of the split."""
    split_parts = data_split(data)
    return split_parts[0]
def data_processing(DATA_PATH, ratio_list, debug, label_correct=True):
    """Prepare train/val/test file lists (and optional split images) for training.

    Args:
        DATA_PATH: root directory containing the 'png' and 'edge' sub-directories.
        ratio_list: per-set split ratios; an entry of 0 skips that set's split.
        debug: when 0, flist generation below runs; any other value skips it.
        label_correct: selects which config YAML is created and loaded.

    Side effects: writes a config file, .flist files under the save directory
    and, when SPLIT is enabled in the config, directories of split images and
    edges; finally records the flist paths back into the config.
    """
    # configuration
    if label_correct:
        config_path = './label_correct_config.yml'  # config loadpath
    else:
        config_path = './label_no_correct_config.yml'  # config loadpath
    create_config(config_path)
    with open(config_path, 'r') as f_obj:
        config = yaml.load(f_obj, Loader=yaml.FullLoader)
    split = config['SPLIT']
    split_num = config['SPLIT_NUM']  # final split image number is split_num^2
    if split:
        DATA_SAVE_PATH = os.path.join(DATA_PATH, 'datasets_split')  # flist savepath
    else:
        # BUGFIX: was os.path.join(DATA_PATH + 'datasets'), which concatenated
        # the two names with no separator (e.g. '/rootdatasets'); join them as
        # separate path components like the branch above does.
        DATA_SAVE_PATH = os.path.join(DATA_PATH, 'datasets')
    IMG_SPLIT_SAVE_PATH = os.path.join(DATA_PATH, 'png_split')    # img split savepath
    EDGE_SPLIT_SAVE_PATH = os.path.join(DATA_PATH, 'edge_split')  # edge split savepath
    # save path
    create_dir(DATA_SAVE_PATH)
    if split:
        create_dir(IMG_SPLIT_SAVE_PATH)
        create_dir(EDGE_SPLIT_SAVE_PATH)
    # generate edge from points
    # time_start=time.time()
    # print(time_start)
    # if label_correct:
    #     gen_edge_from_point_base_gradient(DATA_PATH, debug)
    # else:
    #     gen_edge_from_point(DATA_PATH, debug)
    # time_end=time.time()
    # print(time_end)
    # print('generate edge from points time cost',time_end-time_start,'s')
    if debug == 0:
        subject_word = config['SUBJECT_WORD']
        # generate a list of original edge
        edge_flist_src = os.path.join(DATA_SAVE_PATH, subject_word + '_edge.flist')
        gen_flist(os.path.join(DATA_PATH, 'edge'), edge_flist_src)
        # BUGFIX: dtype=np.str was removed in NumPy 1.24; the builtin str is
        # the documented equivalent and behaves identically here.
        edge_num = len(np.genfromtxt(
            edge_flist_src, dtype=str, encoding='utf-8'))
        # generate a list of original images
        png_flist_src = os.path.join(DATA_SAVE_PATH, subject_word + '_png.flist')
        gen_flist(os.path.join(DATA_PATH, 'png'), png_flist_src)
        # img (training set, verification set, test set)(not split)
        key_name = 'png'
        png_flist = os.path.join(DATA_SAVE_PATH, subject_word + '_' + key_name)
        png_val_test_PATH = [png_flist + '_train.flist',
                             png_flist + '_val.flist',
                             png_flist + '_test.flist']
        # id_list fixes the shuffled sample ids so the edge split below pairs
        # up with the image split.
        id_list = gen_flist_train_val_test(
            png_flist_src, edge_num, png_val_test_PATH, ratio_list,
            config['SEED'], [])
        # edge (training set, verification set, test set)(not split)
        key_name = 'edge'
        edge_flist = os.path.join(DATA_SAVE_PATH, subject_word + '_' + key_name)
        edge_val_test_PATH = [edge_flist + '_train.flist',
                              edge_flist + '_val.flist',
                              edge_flist + '_test.flist']
        gen_flist_train_val_test(
            edge_flist_src, edge_num, edge_val_test_PATH, ratio_list,
            config['SEED'], id_list)
        # split data
        if split:
            key_name = 'png_split'
            png_flist = os.path.join(DATA_SAVE_PATH, subject_word + '_' + key_name)
            png_val_test_PATH_save = [png_flist + '_train.flist',
                                      png_flist + '_val.flist',
                                      png_flist + '_test.flist']
            # id_img carries the running image id across the three sets.
            id_img = 0
            for i, path in enumerate(png_val_test_PATH):
                if ratio_list[i] != 0:
                    id_img = data_split(split_num, path, IMG_SPLIT_SAVE_PATH,
                                        'png', id_img,
                                        png_val_test_PATH_save[i], RGB=True)
            key_name = 'edge_split'
            png_flist = os.path.join(DATA_SAVE_PATH, subject_word + '_' + key_name)
            edge_val_test_PATH_save = [png_flist + '_train.flist',
                                       png_flist + '_val.flist',
                                       png_flist + '_test.flist']
            id_img = 0
            for i, path in enumerate(edge_val_test_PATH):
                if ratio_list[i] != 0:
                    id_img = data_split(split_num, path, EDGE_SPLIT_SAVE_PATH,
                                        'edge', id_img,
                                        edge_val_test_PATH_save[i], RGB=False)
            # From here on the split flists are the canonical ones.
            png_val_test_PATH = png_val_test_PATH_save
            edge_val_test_PATH = edge_val_test_PATH_save
        # setting path of data list
        set_flist_config(config_path, png_val_test_PATH, flag='data')
        set_flist_config(config_path, edge_val_test_PATH, flag='edge')
def main(model_dir, result_path, test_dir, save_path=None, device='cuda:0', debug=False):
    """
    Per-class universal/per-image perturbation search and result reporting.

    model_dir <--- model_filepath
    result_path <--- result_filepath
    test_dir <---- examples_dirpath
    save_path: optional directory for per-target perturbation images.
    device: torch device string, or None to leave the default device.
    debug: False writes the proposed score; True writes quantile statistics.
    """
    # some parameters
    REGU = "l1"       # l1 or l2, regularization of mask loss
    rate = 0.15       # final rate for threshold
    scale = 1.1       # 1.1 * bestarea
    data_thres = 5
    batch = 64        # batch size for input
    data_shuffle, labels_shuffle, batch_size_all, num_class = \
        load_data(test_dir, data_thres, batch)
    if device is not None:
        device = torch.device(device)
    bestarea_list_p = []     # universal: _p
    outputs_list_p = []      # for all classes: _list
    outputs_list = []
    # NOTE(review): similarities_best is never appended to in this function,
    # yet the debug branch below indexes it — that branch would raise
    # IndexError as written. Confirm intent.
    similarities_best = []
    jdict = {}
    # main part: try each class in turn as the attack target.
    for targets in range(0, num_class):
        # find images belonging to the selected label and images that are not
        imgs, imgs2, labs, labs2, size1, size2 = \
            data_split(data_shuffle, labels_shuffle, targets, num_class)
        size1_all = sum(size1)
        # print current information:
        print("-------------------------------------------------------------")
        print("- current target label: ", targets)
        print("- regularization form: ", REGU)
        # create path for saving images
        if save_path is not None:
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            save_path_i = os.path.join(save_path, 'target_{}.jpg'.format(targets))
        else:
            save_path_i = None
        # find the Universal Perturbation
        modifier_p, det_p, ind1, ind2, output_p =\
            UniversalPert(model_dir, size1, size2, device)\
            .attack(imgs, imgs2, labs, labs2, save_path_i)
        # find per-image target perturbation
        modifier, det, indic, output = \
            PerImgPert(model_dir, size1, device).attack(imgs, labs)
        print(
            "===========main-information================================================"
        )
        print("Universal Perturbation found in target\n", targets)
        print("Type 1 wrong labels: ", ind1)
        print("Type 2 wrong labels: ", ind2)
        print("Per-image Perturbation found in target\n", targets)
        print("Wrong indices: ", indic)
        print(
            "=============================================================================="
        )
        # jdict[targets] = {"modifier_p": modifier_p.cpu().detach().numpy().tolist(),
        #                   "det_p": det_p.cpu().detach().numpy().tolist(),
        #                   "output_p": output_p.tolist(),
        #                   # "modifier": modifier.cpu().detach().numpy().tolist(),
        #                   # "det": det.cpu().detach().numpy().tolist(),
        #                   "output": output.tolist(),
        #                   "size_all": batch_size_all}
        # with open(result_path, 'w') as f:
        #     json.dump(jdict, f)
        # bestarea_list_p.append(torch.sum(torch.abs(modifier_p)))
        outputs_list_p.append(output_p)
        outputs_list.append(output)
    # write result
    if not debug:
        # write result when proposed
        res = cal_result(outputs_list_p, outputs_list, num_class)
        with open(result_path, 'w') as f:
            f.write("{}".format(res))
    else:
        # write for statistics
        with open(result_path, 'w') as f:
            f.write('scale\tlabel\tqt_0.15\tqt_0.25\tqt_0.5\tqt_0.75\n')
            for i in range(num_class):
                qt0 = np.quantile(similarities_best[i], 0.15)
                qt1 = np.quantile(similarities_best[i], 0.25)
                qt2 = np.quantile(similarities_best[i], 0.5)
                qt3 = np.quantile(similarities_best[i], 0.75)
                print("scale: ", scale, "label ", i, qt0, qt1, qt2, qt3)
                # NOTE(review): 6 arguments are passed but the format string
                # has only placeholders {0}-{4}, so qt3 is never written —
                # the header above promises 6 columns. Confirm and fix upstream.
                f.write("\t{0}\t{1}\t{2}\t{3}\t{4}\n".format(
                    scale, i, qt0, qt1, qt2, qt3))
    print("Model Done")

# Commented-out batch driver kept for reference:
# for i in range(1042, 1100):
#     model_path = './round2/id-%08d/model.pt' % i
#     result = './result-data/id-%08d.json' % i
#     data_path = './round2/id-%08d/example_data' % i
#     # trigger_path = './reverse_trigger-2/id-%08d' % i
#     main(model_path, result, data_path, save_path=None, device='cuda:0', debug=True)
# main('./round2/id-00001003/model.pt', 'test.txt', './round2/id-00001003/example_data', debug=False)
) ML_df = ML_df.append( pd.DataFrame([all_stats], columns=list(ML_columns)), ignore_index=True ) # Save statistics to file ML_df.to_csv('figures/ML_increase_negs_combined.csv') # ---------------------- # Run classifiers on in # silico generated data # ---------------------- # Create collection with training and test split mHER_H3_all = data_split(mHER_H3_AgPos, mHER_H3_AgNeg) # Create model directory model_dir = 'classification' os.makedirs(model_dir, exist_ok=True) # Use tuned model parameters for CNN (performed in separate script) params = [['CONV', 400, 5, 1], ['DROP', 0.2], ['POOL', 2, 1], ['FLAT'], ['DENSE', 300]] # Train and test CNN with unadjusted (class split) data set CNN_all = CNN_classification( mHER_H3_all, 'All_data', save_model=model_dir, params=params
import networkx as nx import numpy as np from keras.models import Input, Model from layers import GraphConv GRAPH_DECOMPOSITION = 20 GRAPH_PAR_ITER = 10000 GC_LAYERS = 2 GC_UNITS = 100 GC_LAYERS_ACT = 'relu' CLUSTERS_PER_BATCH = 2 SPARSE_A = False #load and spilt data X, A, y = load_data(dataset="cora") #A is scipy spase matrix X_train, A_train, y_train, train_samples = data_split(X, A, y, test_size=0.4) def ClusterGCN(ft_length, gcn_layers, gcn_units, classes, activation=None): in_feature = Input(shape=(None, ft_length), name='X') in_adj = Input(shape=(None, None), name='A', sparse=SPARSE_A) #hidden gcn for _ in range(gcn_layers): if _ == 0: gcn = GraphConv(gcn_units, activation=activation, name='gcn_{}'.format(_))([in_feature, in_adj]) else: gcn = GraphConv(gcn_units, activation=activation,