def main(args):
    # build search space
    data = load_data(args.dataset, args.seed)
    ss, _ = pruning_search_space_by_eda(data)

    if data.setting == 'inductive':
        trainer = InductiveTrainer()
    else:
        trainer = TransductiveTrainer()

    sampler = Sampler(args.dataset, ss)

    archs = []
    val_scores = []
    test_scores = []

    # init training data for GBDT
    sampled_archs = sampler.sample(3000)
    i = 0
    while i < len(sampled_archs):
        arch = sampled_archs[i]
        data = sampler.load_data(arch)
        try:
            model = sampler.build_model(arch, data.x.shape[1], int(max(data.y)) + 1)
            trainer.init_trainer(model, arch[7], arch[6])
            val_score = trainer.train(data)
            test_score = trainer.test(data)
        except RuntimeError as e:
            if "cuda" in str(e) or "CUDA" in str(e):  # CUDA OOM, sample another arch
                print(e)
                sampled_archs += sampler.sample(1)
                i += 1
                continue
            else:
                raise e
        archs.append(arch)
        val_scores.append(val_score)
        test_scores.append(test_score)
        print(arch, f'real val score: {val_score} | real test score: {test_score}')
        print(f'Number of evaluated archs: {len(archs)}')
        i += 1
        if i % 500 == 0:
            print(f'Round {i // 500} | best test score: {max(test_scores)}')
        if i >= 2000:
            break
""" strategy = ['mean','median','most_frequent'] # 找出含有缺失值的列 missing_values_list = [] attribution_counts_dict = dict(self.data) for attribution,counts in attribution_counts_dict.items(): if counts < 2760: missing_values_list.append(attribution) # 如果是Nominal attribution,则可以使用‘most_frequent’ # 这里缺失值只涉及到Nominal attribution self.nominal_attribution_list = missing_values_list # 如果是Numerical attribution,则可以使用'mean' #self.numerical_attribution_list = [] class DealWithNoisyData(): """ 异常值处理 """ def __init__(self, data): self.data = data # test code if __name__ == '__main__': dataframe = data_prepare.load_data(data_path = '../datasets/tianchi_fresh_comp_train_item.csv') deal_with_missing_values_obj = DealWithMissingValues(dataframe)
        lstmattn = nn.DataParallel(lstmattn, device_ids=device_ids)
else:
    lstmattn = lstmattn
print(lstmattn)

##########################################################
all_train_art = []
all_train_lab = []
auto_valid_art = []
auto_valid_label = []
man_valid_art = []
man_valid_label = []

# to get training data and test data
############################################################
art_test = data_prepare.load_data(auto_test_set, auto_testdata_size)
y_test = data_prepare.load_labels(auto_test_label, auto_testdata_size).tolist()
all_train_art.extend(art_test)
all_train_lab.extend(y_test)

#################################################################
art_auto = data_prepare.load_data(auto_dataset, auto_data_size)
y_auto = data_prepare.load_labels(auto_labelset, auto_data_size).tolist()
art_auto_train = art_auto[0:len(art_auto) - auto_valid_size]
y_auto_train = y_auto[0:len(art_auto) - auto_valid_size]
art_auto_valid = art_auto[len(art_auto) - auto_valid_size:]
y_auto_valid = y_auto[len(art_auto) - auto_valid_size:]
all_train_art.extend(art_auto_train)
all_train_lab.extend(y_auto_train)
        # print("LOC", LOC)
        # print("ORG", ORG)
        instance['sen'] = sen
        instance['PER'] = PER
        instance['LOC'] = LOC
        instance['ORG'] = ORG
        result.append(instance)
    with open('./data/case/result.json', 'w', encoding='utf-8') as fw:
        fw.write(json.dumps(result, ensure_ascii=False))
    print("********The result is saved in the ./data/case/result.json*********" + '\r')


if __name__ == '__main__':
    word2id, embeddings, embedding_method = get_embedding()
    if args.model != 2:
        train_path = os.path.join('.', args.train_data, 'train_data')
        test_path = os.path.join('.', args.test_data, 'test_data')
        train_data = load_data(train_path)
        test_data = load_data(test_path)
        test_size = len(test_data)

    paths = {}
    if args.decode_method == 0:
        decode = 'CRF'
    else:
        decode = 'Softmax'
    output_path = os.path.join('.', args.model_path, decode + "_" + embedding_method)
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    model_path = os.path.join(output_path, "checkpoints/")
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    if args.model == 0:
        ckpt_prefix = os.path.join(model_path, "model")
import os.path as osp
import argparse

import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.nn import GATConv

from data_prepare import load_data
from feature_engineering import get_embedding
import pickle

parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str, default='cora')
args = parser.parse_args()

data = load_data(args.dataset, 0, transform=T.NormalizeFeatures())
if data.x is None:
    data.x = get_embedding(args.dataset, data, 'onehot')


class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GATConv(data.x.shape[1], 8, heads=8, dropout=0.6)
        # On the Pubmed dataset, use heads=8 in conv2.
        self.conv2 = GATConv(8 * 8, int(max(data.y)) + 1, heads=1, concat=False, dropout=0.6)
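    # A sketch of the forward pass for the two GATConv layers defined above, following the
    # standard PyG GAT example. The p=0.6 dropout probabilities and the use of data.edge_index
    # are assumptions, since the original snippet is cut off after __init__.
    def forward(self):
        x = F.dropout(data.x, p=0.6, training=self.training)
        x = F.elu(self.conv1(x, data.edge_index))
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, data.edge_index)
        return F.log_softmax(x, dim=-1)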
# ================== step 0: parameter setup =================
params = tools.Parameters()
params.set("k", 19)  # must not be smaller than conv1d_kernel_size[1] * 2, otherwise the second 1-D convolution cannot be applied
params.set("conv1d_channels", [16, 32])
params.set("conv1d_kernel_size", [0, 5])
params.set("dense_dim", 128)
params.set("gcnn_dims", [32, 32, 32, 1])
params.set("learning_rate", 0.01)
params.set("keep_prob", 1)
epoch_num = 10
batch_size = 50

# ================== step 1: data preparation =================
train_set, test_set, param = dp.load_data('MUTAG', 0, 1)
logger.info(f"training set size: {len(train_set)}, test set size: {len(test_set)}")
params.extend(param)
print(params)

model = dm.DGCNN(params)
model.build()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    summary_writer = tf.summary.FileWriter('./log', sess.graph)
    # train
    for i in range(epoch_num):
        loss, acc, auc = loop_dataset(model, params, train_set, sess,
        torch.backends.cudnn.benchmark = True
        lstmattn = lstmattn.to(device)
        lstmattn = nn.DataParallel(lstmattn, device_ids=device_ids)
else:
    lstmattn = lstmattn
print(lstmattn)

optimizer = torch.optim.Adam(lstmattn.parameters(), lr=lr)  # define an optimizer for backpropagation
loss_func = nn.CrossEntropyLoss()  # define the loss function

# to get training data
##########################################################
all_train_art = []
all_train_lab = []

art_test = data_prepare.load_data(auto_test_set, auto_testdata_size)
y_test = data_prepare.load_labels(auto_test_label, auto_testdata_size).tolist()
all_train_art.extend(art_test)
all_train_lab.extend(y_test)

#################################################################
art_auto = data_prepare.load_data(auto_dataset, auto_data_size)
y_auto = data_prepare.load_labels(auto_labelset, auto_data_size).tolist()
all_train_art.extend(art_auto)
all_train_lab.extend(y_auto)

#################################################################
with open('./output/bilstmattn_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

load_model = True
def main(args):
    # build search space
    data = load_data(args.dataset, args.seed)
    ss, _ = pruning_search_space_by_eda(data)

    if data.setting == 'inductive':
        trainer = InductiveTrainer()
    else:
        trainer = TransductiveTrainer()

    sampler = Sampler(args.dataset, ss)

    archs = []
    val_scores = []
    top_archs = []
    top_val_scores = []
    top_test_scores = []

    # init training data for GBDT
    sampled_archs = sampler.sample(args.n)
    i = 0
    while i < len(sampled_archs):
        arch = sampled_archs[i]
        data = sampler.load_data(arch)
        try:
            model = sampler.build_model(arch, data.x.shape[1], int(max(data.y)) + 1)
            trainer.init_trainer(model, arch[7], arch[6])
            val_score = trainer.train(data)
        except RuntimeError as e:
            if "cuda" in str(e) or "CUDA" in str(e):  # CUDA OOM, sample another arch
                print(e)
                sampled_archs += sampler.sample(1)
                i += 1
                continue
            else:
                raise e
        archs.append(arch)
        val_scores.append(val_score)
        print(arch, f'real val score: {val_score}')
        print(f'Number of evaluated archs: {len(archs)}')
        i += 1

    # train GBDT predictor
    for iter_round in range(1, args.iterations + 1):
        print(f'Iteration round {iter_round}, retraining model and sampling archs...',
              datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

        # train GBDT
        X = [[str(e) for e in row] for row in archs]
        y = np.array(val_scores)
        train_pool = Pool(X, y, cat_features=[i for i in range(len(X[0]))])
        # X = lgb.Dataset(pd.DataFrame(X, columns=ss.keys()), label=np.array(val_scores))
        # gbdt_model = lgb.train(gbdt_params, X, args.gbdt_num_boost_round, categorical_feature=ss.keys())
        gbdt_model = CatBoostRegressor(
            learning_rate=args.gbdt_lr,
            verbose=False
        )
        gbdt_model.fit(train_pool)

        # pruning search space
        ss = pruning_search_space_by_shap(archs, gbdt_model, ss, args.p)
        sampler.update_search_space(ss)

        # predict some archs
        sampled_archs = sampler.sample(args.m)
        X = [[str(e) for e in row] for row in sampled_archs]
        test_pool = Pool(X, cat_features=[i for i in range(len(X[0]))])
        predicted_val_scores = gbdt_model.predict(test_pool)

        # sort the archs according to the predicted value
        zipped = zip(sampled_archs, predicted_val_scores)
        zipped = sorted(zipped, key=lambda e: e[1], reverse=True)  # sort in decreasing order
        sampled_archs, predicted_val_scores = zip(*zipped)
        sampled_archs, predicted_val_scores = list(sampled_archs), list(predicted_val_scores)

        print(f'Iteration round {iter_round}, evaluating top k archs on valid set',
              datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

        # evaluate top k archs
        i = 0
        while i < len(sampled_archs):
            arch = sampled_archs[i]
            data = sampler.load_data(arch)
            try:
                model = sampler.build_model(arch, data.x.shape[1], int(max(data.y)) + 1)
                trainer.init_trainer(model, arch[7], arch[6])
                val_score = trainer.train(data)
                predicted_val_score = predicted_val_scores[i]
            except RuntimeError as e:
                if "cuda" in str(e) or "CUDA" in str(e):  # CUDA OOM, sample another arch
                    print(e)
                    sampled_archs += sampler.sample(1)
                    i += 1
                    continue
                else:
                    raise e
            archs.append(arch)
            val_scores.append(val_score)
            print(arch, f'predicted val score: {predicted_val_score} | real val score: {val_score}')
            print(f'Number of evaluated archs: {len(archs)}')
            if i + 1 >= args.k:
                break
            i += 1

        # sort all the evaluated archs
        zipped = zip(archs, val_scores)
        zipped = sorted(zipped, key=lambda e: e[1], reverse=True)
        archs, val_scores = zip(*zipped)
        archs, val_scores = list(archs), list(val_scores)

        print(f'Iteration round {iter_round}, evaluating top k_test archs on test set',
              datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

        # evaluate top k_test archs on test set
        i = 0
        while i < len(archs):
            arch = archs[i]
            data = sampler.load_data(arch)
            try:
                model = sampler.build_model(arch, data.x.shape[1], int(max(data.y)) + 1)
                trainer.init_trainer(model, arch[7], arch[6])
                val_score = trainer.train(data)
                test_score, z = trainer.test(data, return_logits=True)
                pickle.dump((z, data.y[data.test_mask]),
                            open(f'embeddings/{args.dataset}_AutoGRL-round{iter_round}-top{i + 1}.pt', 'wb'))
            except RuntimeError as e:
                if "cuda" in str(e) or "CUDA" in str(e):  # CUDA OOM, sample another arch
                    print(e)
                    i += 1
                    continue
                else:
                    raise e
            top_archs.append(arch)
            top_val_scores.append(val_score)
            top_test_scores.append(test_score)
            print(arch)
            print(f'Testing... round {iter_round} | arch top {i + 1} | real val score {val_score} | real test score {test_score}',
                  datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
            if i + 1 >= args.k_test:  # only test top k_test models for every round
                break
            i += 1

        zipped = zip(top_val_scores, top_test_scores)
        zipped = sorted(zipped, key=lambda e: e[0], reverse=True)
        best_val_score, corr_test_score = zipped[0][0], zipped[0][1]

        # logging
        print(f'Iteration {iter_round} | best val score {best_val_score} | corresponding test score {corr_test_score} | best test score {max(top_test_scores)}',
              datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        pickle.dump((ss, sampler, trainer, archs, val_scores, gbdt_model, sampled_archs, predicted_val_scores,
                     top_val_scores, top_test_scores),
                    open(f'cache/gbdt/{args.dataset}_seed{args.seed}_round{iter_round}.pt', 'wb'))
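

# A minimal command-line entry point sketch (an assumption: the original file's argument
# parsing is not shown in this snippet). The flag names mirror the attributes that main()
# reads (args.dataset, args.seed, args.n, args.m, args.k, args.k_test, args.p,
# args.iterations, args.gbdt_lr); the default values below are illustrative only.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='cora')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--n', type=int, default=1000, help='number of initial archs evaluated for the GBDT training data')
    parser.add_argument('--m', type=int, default=10000, help='number of archs sampled and scored by the GBDT per iteration')
    parser.add_argument('--k', type=int, default=100, help='top predicted archs evaluated on the valid set per iteration')
    parser.add_argument('--k_test', type=int, default=5, help='top archs evaluated on the test set per iteration')
    parser.add_argument('--p', type=float, default=0.05, help='threshold for SHAP-based search-space pruning')
    parser.add_argument('--iterations', type=int, default=5)
    parser.add_argument('--gbdt_lr', type=float, default=0.05)
    main(parser.parse_args())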
device = torch.device(("cuda:" + str(device_ids[0])) if torch.cuda.is_available() else "cpu")

if cuda_gpu:
    if torch.cuda.device_count() == 1:
        lstmattn = AttentionLSTM(embedding_dim, hidden_dim, num_layers, output_size, dropout).to(device)
    else:
        torch.backends.cudnn.benchmark = True
        lstmattn = AttentionLSTM(embedding_dim, hidden_dim, num_layers, output_size, dropout).to(device)
        lstmattn = nn.DataParallel(lstmattn, device_ids=device_ids)
else:
    lstmattn = AttentionLSTM(embedding_dim, hidden_dim, num_layers, output_size, dropout)

articles = data_prepare.load_data(dataset, data_size)
label_array = data_prepare.load_labels(labelset, data_size)
X_train = articles[0:train_size]
y_train = label_array[0:train_size]
X_test = articles[train_size:]
y_test = label_array[train_size:]

optimizer = torch.optim.Adam(lstmattn.parameters(), lr=lr)  # define an optimizer for backpropagation
loss_func = nn.CrossEntropyLoss()  # define the loss function

training_loss, training_acc, test_acc = train.train_elmo(
    num_epoch, train_size, batch_size, optimizer, X_train, y_train,
    sequence_length, embedding_dim, lstmattn, test_size, loss_func, X_test,
def test_with_w2v(test_set, test_labels, data_size, vocabulary_size,
                  sequence_length, load_model, tokenizer, batch_size,
                  embedding_dim, embedding_matrix, model, batch_first):
    # load data and labels
    articles = data_prepare.load_data(test_set, data_size)
    labels = data_prepare.load_labels(test_labels, data_size)
    # tokenize and transform to matrix
    X, tokenizer = data_prepare.tokenize(articles, vocabulary_size,
                                         sequence_length, load_model, tokenizer)

    pred_labels = []
    correct = 0
    total = 0
    acc = 0

    # predict labels for the test data with the given model
    for i in range(0, data_size, batch_size):
        # mini-batch processing
        batch_start = i
        batch_end = i + batch_size
        if batch_end > data_size - 1:
            batch_end = data_size
        if batch_first:
            input_x = data_prepare.trans2input_batch_first(
                X[batch_start:batch_end, :], batch_size, sequence_length,
                embedding_dim, embedding_matrix)  # word embedding using Google News vectors
        else:
            input_x = data_prepare.trans2input_batch_second(
                X[batch_start:batch_end, :], batch_size, sequence_length,
                embedding_dim, embedding_matrix)  # word embedding using Google News vectors
        b_x = torch.from_numpy(input_x).float()  # reshape x to (batch, time_step, input_size)
        test_output = model(b_x)  # model output
        del b_x

        y_true = np.asarray(labels[batch_start:batch_end])
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()  # class with the maximum probability for every article in the batch
        pred_labels.extend(pred_y.tolist())
        total += y_true.shape[0]  # the total number of articles in the test data
        correct += (pred_y == y_true).sum().item()  # the number of instances where pred_y matches y_true
        acc = 100.00 * float(correct) / float(total)  # get accuracy

    # calculate precision, recall and F1 score
    t0 = 0.0
    t1 = 0.0
    f0 = 0.0
    f1 = 0.0
    for i in range(len(pred_labels)):
        true_label = labels[i]
        predict = pred_labels[i]
        if predict == 1 and true_label == 1:
            t1 += 1
        elif predict == 0 and true_label == 0:
            t0 += 1
        elif true_label == 1 and predict == 0:
            f1 += 1
        elif true_label == 0 and predict == 1:
            f0 += 1

    precision0 = t0 / (t0 + f0)
    recall0 = t0 / (t0 + f1)
    f1_score0 = 2 * ((precision0 * recall0) / (precision0 + recall0))
    precision1 = t1 / (t1 + f1)
    recall1 = t1 / (t1 + f0)
    f1_score1 = 2 * ((precision1 * recall1) / (precision1 + recall1))

    # store all results into a dictionary
    output_dic = {}
    output_dic["t0"] = t0
    output_dic["t1"] = t1
    output_dic["f0"] = f0
    output_dic["f1"] = f1
    output_dic["precision0"] = precision0
    output_dic["recall0"] = recall0
    output_dic["f1_score0"] = f1_score0
    output_dic["precision1"] = precision1
    output_dic["recall1"] = recall1
    output_dic["f1_score1"] = f1_score1
    output_dic["test accuracy"] = acc

    return acc, output_dic
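

# Optional cross-check (not part of the original code): the per-class precision/recall/F1
# computed manually above should agree with scikit-learn's classification_report on the
# same label lists. The helper name `report_metrics` is hypothetical.
def report_metrics(true_labels, pred_labels):
    from sklearn.metrics import classification_report
    return classification_report(true_labels, pred_labels, digits=4)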
# import user's own package
import data_prepare

# load the dataset and the labelset
dataset = "../data/small_dataset.txt"
labelset = "../data/small_labelset.txt"

# set up training size and testing size
train_size = 150000
test_size = 20000

# vocabulary size is set to 100000
vocabulary_size = 100000

# get articles and labels from the dataset and labelset
articles = data_prepare.load_data(dataset, train_size)
labels = data_prepare.load_labels(labelset, train_size)

# assign an index to each word in the articles
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(articles)
# turn the words into one-hot vectors
X = tokenizer.texts_to_matrix(articles, mode='binary')

clf = MultinomialNB(alpha=1)
clf.fit(X, labels)

# get the cross validation scores
scores = cross_val_score(clf, X, labels, cv=10)

results = {}
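# A minimal sketch of how the 10-fold scores might be summarized into `results`
# (assumed; the original snippet ends right after `results = {}` is created):
results["cv_mean_accuracy"] = float(scores.mean())
results["cv_std_accuracy"] = float(scores.std())
print(f"10-fold CV accuracy: {results['cv_mean_accuracy']:.4f} +/- {results['cv_std_accuracy']:.4f}")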
seed = 0

import random
random.seed(seed)
import numpy as np
np.random.seed(seed)
import torch
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True

from torch_geometric.transforms import NormalizeFeatures

from search_space import pruning_search_space_by_eda, precompute_cache
from data_prepare import load_data

# for name in ['cora', 'usa-airports', 'photo', 'wikics']:
for name in ['photo']:
    data = load_data(name, seed, transform=NormalizeFeatures())
    ss, (data_aug, fe, hpo, nas) = pruning_search_space_by_eda(data)
    precompute_cache(name, data, data_aug, fe)