def main(argv):
    # Configure the CPU/GPU in TF; assume only one GPU is in use.
    # For a multi-GPU setting, please refer to
    # https://www.tensorflow.org/guide/gpu#using_multiple_gpus
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if len(gpus) == 0 or FLAGS.gpu_id is None:
        device_id = "/device:CPU:0"
    else:
        tf.config.experimental.set_visible_devices(gpus[FLAGS.gpu_id], 'GPU')
        device_id = '/device:GPU:0'

    A_mat, X_mat, z_vec, train_idx, val_idx, test_idx = load_data(FLAGS.dataset)
    An_mat = preprocess_graph(A_mat)
    N = A_mat.shape[0]
    K = z_vec.max() + 1

    with tf.device(device_id):
        gcn = GCN(An_mat, X_mat, [FLAGS.hidden1, K])
        gcn.train(train_idx, z_vec[train_idx], val_idx, z_vec[val_idx])
        test_res = gcn.evaluate(test_idx, z_vec[test_idx], training=False)

        # gcn = GCN(An_mat_diag, X_mat_stack, [FLAGS.hidden1, K])
        # gcn.train(train_idx_recal, z_vec[train_idx], val_idx_recal, z_vec[val_idx])
        # test_res = gcn.evaluate(test_idx_recal, z_vec[test_idx], training=False)

        print("Dataset {}".format(FLAGS.dataset),
              "Test loss {:.4f}".format(test_res[0]),
              "test acc {:.4f}".format(test_res[1]))
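
# A minimal sketch (assumed, not from the original script) of the scaffolding
# main() relies on: TensorFlow, absl FLAGS for dataset, gpu_id and hidden1,
# and an app.run() entry point. Flag defaults here are illustrative, and the
# repo may define load_data, preprocess_graph and GCN elsewhere.
import tensorflow as tf
from absl import app, flags

FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', 'cora', 'Dataset name passed to load_data.')
flags.DEFINE_integer('gpu_id', None, 'Index into the visible GPU list; None falls back to CPU.')
flags.DEFINE_integer('hidden1', 16, 'Number of units in the first GCN hidden layer.')

if __name__ == '__main__':
    app.run(main)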
from sklearn.decomposition import PCA

from models.utils import load_data, plot_prediction, print_stats
from models.positive_models import PositiveLinearRegression

if __name__ == '__main__':
    n_components = 8
    data_len, weeks, ts, xs = load_data()
    means, std_devs = [], []

    pca = PCA(n_components=n_components)
    xs_pca = pca.fit(xs).transform(xs)

    linear_model = PositiveLinearRegression()
    linear_model.fit(xs_pca, ts)

    mean, std_dev = print_stats(xs_pca, ts, linear_model)
    plot_prediction(weeks, xs_pca, ts, linear_model)
    means.append(mean)
    std_devs.append(std_dev)
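
# The means/std_devs lists above only ever collect a single entry. A minimal
# sketch (assumed, not part of the original script) of how they could be used
# to compare several PCA sizes, reusing load_data, print_stats and
# PositiveLinearRegression exactly as imported above.
def sweep_pca_components(candidate_components=(2, 4, 8, 16)):
    data_len, weeks, ts, xs = load_data()
    means, std_devs = [], []
    for n in candidate_components:
        xs_pca = PCA(n_components=n).fit_transform(xs)
        model = PositiveLinearRegression()
        model.fit(xs_pca, ts)
        mean, std_dev = print_stats(xs_pca, ts, model)
        means.append(mean)
        std_devs.append(std_dev)
    return means, std_devs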
"""
Usage:
    ./train.py -i <filename> -a <r> -n <n> -l <n>
    ./train.py -h | --help

Options:
    -i <filename>  Instance filename (data)
    -a <r>         Alpha (penalty parameter)
    -n <n>         Neurons per hidden layer
    -l <n>         Number of hidden layers
    -h --help      Show this screen.
"""
from docopt import docopt
from sklearn.neural_network import MLPRegressor

from models.utils import load_data, stats

if __name__ == '__main__':
    opts = docopt(__doc__)

    # Read the penalty parameter and network shape.
    alpha = float(opts['-a'])
    hidden = int(opts['-l'])
    neurons = int(opts['-n'])
    filename = opts['-i']

    data_len, weeks, ts, xs = load_data(filename=filename)

    MLPR_model = MLPRegressor(solver='lbfgs', activation='logistic', alpha=alpha,
                              hidden_layer_sizes=(neurons,) * hidden)
    MLPR_model.fit(xs, ts)
    mean, std_dev = stats(xs, ts, MLPR_model)
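
# Example invocation (hypothetical file name and values, shown only to
# illustrate the docopt interface above):
#   ./train.py -i data/instances.csv -a 0.001 -n 10 -l 2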
def explained_variance():
    # Explained variance of embeddings.
    d = './results/pretrained_embeddings/'
    metric = r'Sensitivity'  # Change to Specificity, YI, YI_max, etc.
    k = 0  # Must correspond to the metric. See load_results in analyse_results.py.
    mt = 'simple'

    vs = []
    evs = []
    for j, s in enumerate(['none', 'chemical', 'species', 'both']):
        tmp1, tmp2 = [], []
        for model1, model2 in product(models, models):
            X, y = load_data('./data/%s_data_test.csv' % s)
            y = np.asarray(y)

            f = d + model1 + '_chemical_entity_embeddings.npy'
            X1 = np.load(f)
            f = d + model1 + '_chemical_ids.npy'
            ids1 = dict(np.load(f))
            f = d + model2 + '_taxonomy_entity_embeddings.npy'
            X2 = np.load(f)
            f = d + model2 + '_taxonomy_ids.npy'
            ids2 = dict(np.load(f))

            X = np.asarray([
                np.concatenate([X1[int(ids1[c])], X2[int(ids2[s])], [conc]], axis=0)
                for c, s, conc in X if c in ids1 and s in ids2
            ])
            X = normalize(X, norm='l2', axis=0)  # normalize over each feature

            pca = PCA(n_components=10)
            pca.fit(X)
            ev = sum(pca.explained_variance_ratio_)

            f = 'results/%s_%s_pretrained_%s_%s.csv' % (s, mt, model1, model2)
            p = load_predictions(
                f.replace('/', '/predictions_').replace('csv', 'npy'))
            v = p['value'][k]

            tmp1.append(ev)
            tmp2.append(v)
        evs.append(tmp1)
        vs.append(tmp2)

    colours = ['red', 'blue', 'green', 'black']
    labels = [r'$\it{(i)}$', r'$\it{(ii)}$', r'$\it{(iii)}$', r'$\it{(iv)}$']
    plt.figure(figsize=(10, 10))
    for i in range(4):
        x = evs[i]
        y = vs[i]
        my_fitting, stats = poly.polyfit(x, y, 1, full=True)
        ssr = stats[0][0]  # residual sum of squares of the linear fit (not plotted)

        plt.scatter(x, y, color=colours[i])
        plt.plot(np.unique(x), np.poly1d(my_fitting[::-1])(np.unique(x)),
                 color=colours[i], linewidth=4, label=labels[i])

    plt.xlabel('Explained variance', fontsize=18)
    plt.ylabel(metric, fontsize=18)
    plt.legend(fontsize=18)
    plt.savefig('./plots/%s_ev_vs_%s.png' % (mt, metric))
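
# A sketch of the third-party imports explained_variance() relies on
# (load_data, load_predictions and the models list are defined elsewhere in
# the repository and are not reproduced here).
from itertools import product

import numpy as np
import numpy.polynomial.polynomial as poly
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize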
# -*- coding: utf-8 -*-
# @Time    : 2018/8/6 23:49
# @Author  : quincyqiang
# @File    : 01_memorization_baseline.py
# @Software: PyCharm
from models.utils import load_data

# 1. Load the data
ner_dataset_dir = '../data/ner_dataset.csv'
data = load_data(ner_dataset_dir)


# 2. Build the sentence-level data
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

    def get_next(self):
        """Return the words, POS tags and NER tags of the next sentence."""
        try:
            s = self.data[self.data['Sentence #'] == "Sentence: {}".format(
                self.n_sent)]
            self.n_sent += 1
            return s['Word'].tolist(), s['POS'].tolist(), s['Tag'].tolist()
        except:
            self.empty = True
            return None, None, None

# getter=SentenceGetter(data)
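
# A minimal usage sketch (assumed, expanding on the commented-out line above):
# fetch the first sentence and inspect its words and NER tags.
getter = SentenceGetter(data)
words, pos_tags, ner_tags = getter.get_next()
print(words[:10], ner_tags[:10])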
def main(args, params):
    # Fractions chosen so that splitting on chemicals/species yields
    # approximately a 0.15/0.15/0.70 split of the total data.
    sizes = {'none': (0.225, 0.225),
             'species': (0.22, 0.24),
             'chemical': (0.23, 0.23),
             'both': (0.45, 0.47)}
    SAMPLING = args.sampling

    if args.CREATE_DATA:
        valid_size, test_size = sizes[SAMPLING]
        X, y = load_data(DATA_FILE)
        train, valid, test = train_test_split_custom(X, y,
                                                     valid_size=valid_size,
                                                     test_size=test_size,
                                                     sampling=SAMPLING,
                                                     random_state=RANDOM_SEED)
        total = sum(map(len, [train[1], test[1], valid[1]]))
        print(len(valid[1]) / total, len(test[1]) / total)
        save_data('data/%s_data_train.csv' % SAMPLING, train)
        save_data('data/%s_data_valid.csv' % SAMPLING, valid)
        save_data('data/%s_data_test.csv' % SAMPLING, test)

    try:
        train = load_data('data/%s_data_train.csv' % SAMPLING)
        valid = load_data('data/%s_data_valid.csv' % SAMPLING)
        test = load_data('data/%s_data_test.csv' % SAMPLING)

        total = sum(map(len, [train[1], test[1], valid[1]]))
        print('Train Split', len(train[1]) / total)
        print('Valid Split', len(valid[1]) / total)
        print('Test Split', len(test[1]) / total)

        oversample = RandomOverSampler(sampling_strategy='minority')
        train = oversample.fit_resample(*train)
        train = shuffle(*train)
        test = shuffle(*test)
        valid = shuffle(*valid)
    except:
        # Data files missing or unreadable: (re)create them and retry.
        args.CREATE_DATA = True
        return main(args, params)

    params['cw'] = None

    if args.SIMPLE:
        SAMPLING += '_simple'
    else:
        SAMPLING += '_complex'

    if args.model == "onehot":
        fit_onehot(train, valid, test,
                   results_file='results/%s_one_hot.csv' % SAMPLING,
                   hp_file='pred_hp/%s_one_hot.csv' % SAMPLING,
                   params=params)

    if args.model == "hier":
        fit_hier_embeddings(train, valid, test,
                            chemical_hier_embeddings_files,
                            taxonomy_hier_embeddings_files,
                            results_file='results/%s_hierarchy_embedding.csv' % SAMPLING,
                            hp_file='pred_hp/%s_hierarchy_embedding.csv' % SAMPLING,
                            params=params)

    if args.model == "pretrained":
        for model1 in models:
            for model2 in models:
                fit_pretrained(train, valid, test,
                               KGE_EMBEDDINGS_DIR + model1,
                               KGE_EMBEDDINGS_DIR + model2,
                               results_file='results/%s_pretrained_' % SAMPLING + model1 + '_' + model2 + '.csv',
                               hp_file='pred_hp/%s_pretrained_' % SAMPLING + model1 + '_' + model2 + '.csv',
                               params=params)

    if args.model == "allpretrained":
        fit_pretrained(train, valid, test,
                       [KGE_EMBEDDINGS_DIR + m for m in models],
                       [KGE_EMBEDDINGS_DIR + m for m in models],
                       results_file='results/%s_all_pretrained_' % SAMPLING + '.csv',
                       hp_file='pred_hp/%s_all_pretrained_' % SAMPLING + '.csv',
                       params=params)

    # Select the best models from the pretrained runs and rerun them with the
    # sim embedding.
    if args.model in ['pretrainedensemble', 'sim']:
        # Ranked by balanced accuracy ('ba'), despite the variable name.
        best_models_auc = {}
        for model1 in models:
            for model2 in models:
                df = pd.read_csv('results/%s_pretrained_' % SAMPLING + model1 + '_' + model2 + '.csv',
                                 index_col='metric')
                best_models_auc[(model1, model2)] = df.loc['ba', 'value']
        best_models_auc = sorted(best_models_auc.items(), key=lambda x: x[1], reverse=True)

    if args.model == "sim":
        m, _ = best_models_auc[args.num_models - 1]
        model1, model2 = m

        if args.MAX_TRIALS < 1:
            hp_file = "sim_hp/%s_joint_finetune_" % SAMPLING + model1 + "_" + model2 + ".csv.json"
        else:
            hp_file = None

        hps = {}
        try:
            with open('pretrained_hp/%s_chemical_kg.json' % model1, 'r') as f:
                tmp = json.load(f)
            for k in tmp:
                hps[k + '1'] = tmp[k]
        except:
            pass
        try:
            with open('pretrained_hp/%s_taxonomy_kg.json' % model2, 'r') as f:
                tmp = json.load(f)
            for k in tmp:
                hps[k + '2'] = tmp[k]
        except:
            pass
        try:
            with open('pred_hp/%s_pretrained_%s_%s.csv' % (SAMPLING, model1, model2), 'r') as f:
                tmp = json.load(f)
            hps = {**hps, **tmp}
        except:
            pass

        if hp_file:
            try:
                with open(hp_file, 'r') as f:
                    tmp = json.load(f)
                hps = {**hps, **tmp}
            except:
                print(model1, model2, 'Missing HP file. Using default')

        params['use_pretrained'] = args.USE_PRETRAINED
        if not args.USE_PRETRAINED:
            SAMPLING += '_non_init'

        fit_sim_model(train, valid, test,
                      model1, model2,
                      results_file='results/%s_joint_finetune_' % SAMPLING + model1 + '_' + model2 + '.csv',
                      embedding_file='sim_embeddings/%s_joint_finetune_' % SAMPLING + model1 + '_' + model2,
                      hps=hps,
                      params=params)
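
# A minimal argparse sketch (assumed, not from the original module) covering
# the attributes main() reads: sampling, CREATE_DATA, SIMPLE, model,
# num_models, MAX_TRIALS and USE_PRETRAINED. Defaults here are illustrative.
import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--sampling', default='none',
                        choices=['none', 'chemical', 'species', 'both'])
    parser.add_argument('--CREATE_DATA', action='store_true')
    parser.add_argument('--SIMPLE', action='store_true')
    parser.add_argument('--model', default='onehot',
                        choices=['onehot', 'hier', 'pretrained', 'allpretrained',
                                 'pretrainedensemble', 'sim'])
    parser.add_argument('--num_models', type=int, default=1)
    parser.add_argument('--MAX_TRIALS', type=int, default=0)
    parser.add_argument('--USE_PRETRAINED', action='store_true')
    return parser.parse_args()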