def test_get_cross_validation_datasets(self):
    """A 3-fold split of [1..6] rotates the data: each fold yields a
    4-element train part and a 2-element test part, and unpacking into
    exactly three folds also checks the number of splits."""
    source = [1, 2, 3, 4, 5, 6]
    fold1, fold2, fold3 = datasets.get_cross_validation_datasets(source, 3)
    expected = [
        ([1, 2, 3, 4], [5, 6]),
        ([5, 6, 1, 2], [3, 4]),
        ([3, 4, 5, 6], [1, 2]),
    ]
    for fold, (want_train, want_test) in zip((fold1, fold2, fold3), expected):
        got_train, got_test = fold
        self.assertEqual(len(got_train), len(want_train))
        for idx, value in enumerate(want_train):
            self.assertEqual(got_train[idx], value)
        self.assertEqual(len(got_test), len(want_test))
        for idx, value in enumerate(want_test):
            self.assertEqual(got_test[idx], value)
F_list.append(mol_to_feature(mol,-1,args.atomsize)) T_list.append(mol.GetProp('_Name') ) #------------------------------- # Setting Dataset to model f.write("Reshape the Dataset...\n") Mf.random_list(F_list) Mf.random_list(T_list) data_t = cp.asarray(T_list, dtype=cp.int32).reshape(-1,1) data_f = cp.asarray(F_list, dtype=cp.float32).reshape(-1,1,args.atomsize,lensize) f.write('{0}\t{1}\n'.format(data_t.shape, data_f.shape)) f.write('Validate the Dataset...k ={0}\n'.format(args.validation)) dataset = datasets.TupleDataset(data_f, data_t) if args.validation > 1: dataset = datasets.get_cross_validation_datasets(dataset, args.validation) #dataset = datasets.get_cross_validation_datasets_random(dataset, args.validation) #------------------------------- # reset memory del mol, mols, data_f, data_t, F_list, T_list gc.collect() #------------------------------- # 5-fold print('Training...') f.write('Convolutional neural network is running...\n') v = 1 while v <= args.validation: print('...{0}'.format(v)) f.write('Cross-Validation : {0}\n'.format(v))
min_layer = 3 min_node = n_in * 1.5 max_epoch = 10000 min_epoch = 5000 #nnode = (min_node+np.array(hp[:,0])*(max_node-min_node)+0.5).astype(np.int32) #nlayer = (min_layer+np.array(hp[:,1])*(max_layer-min_layer)+0.5).astype(np.int32) #nepoch = (min_epoch+np.array(hp[:,2])*(max_epoch-min_epoch)+0.5).astype(np.int32) nnode0 = [172] nnode1 = [100] nnode2 = [100] nlayer = [4] nepoch = [2000] nreport = 100 #divide data about train and validation data_list = get_cross_validation_datasets(data, nfold) r2_train_1 = np.zeros(nexp) r2_valid_1 = np.zeros(nexp) loss_train = [] loss_valid = [] R2_train = [] R2_valid = [] it_train_loss = [] it_valid_loss = [] it_train_r2 = [] it_valid_r2 = [] r2_train_2 = np.zeros(nexp) r2_valid_2 = np.zeros(nexp)
def train():
    """Train a ConvLSTM model with k-fold cross validation.

    Command-line flags select the device (--gpu), optional model/optimizer
    checkpoints to resume from (--model/--opt), the number of folds
    (--validation), epochs, learning rate, input/output frame counts and
    batch size.  For each fold v, per-fold logs, a loss plot, and the model
    and optimizer snapshots are written under ./results with the fold index
    in the file name.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--model', '-m', type=str, default=None)
    parser.add_argument('--opt', type=str, default=None)
    parser.add_argument('--validation', '-v', type=int, default=5)
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--lr', '-l', type=float, default=0.001)
    parser.add_argument('--inf', type=int, default=3)
    parser.add_argument('--outf', type=int, default=3)
    parser.add_argument('--batch', '-b', type=int, default=1)
    args = parser.parse_args()

    train = dataset.UCSDped1Dataset(0, 200, args.inf, args.outf, "./ucsd_ped1_train.npy")
    # cross validation: list of (train, test) pairs, one per fold
    dataset_ = datasets.get_cross_validation_datasets(train, args.validation, order=None)

    v = 1
    while v <= args.validation:
        model = convlstm.Model(n_input=2, size=[128, 64, 64])
        if args.model is not None:
            print("loading model from " + args.model)
            serializers.load_npz(args.model, model)
        if args.gpu >= 0:
            # BUG FIX: device id was hard-coded to 0; honor the --gpu flag.
            cuda.get_device_from_id(args.gpu).use()
            model.to_gpu()

        optimizer = optimizers.RMSprop(lr=args.lr)
        optimizer.setup(model)
        if args.opt is not None:
            print("loading opt from " + args.opt)
            # BUG FIX: `opt` was an undefined name (NameError whenever --opt
            # was given); resume the optimizer state into `optimizer`.
            serializers.load_npz(args.opt, optimizer)

        # shuffle=False: presumably the samples are consecutive video frames
        # whose order matters — TODO confirm against UCSDped1Dataset.
        train_iter = chainer.iterators.SerialIterator(
            dataset_[v - 1][0], batch_size=args.batch, shuffle=False)
        test_iter = chainer.iterators.SerialIterator(
            dataset_[v - 1][1], batch_size=args.batch, repeat=False, shuffle=False)

        updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
        trainer = training.Trainer(updater, (args.epoch, 'epoch'), out='results')
        trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu))
        # Two log reports: a per-fold epoch log plus a default 10-iteration log.
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch'),
                                            log_name='log_' + str(v) + '_epoch'))
        trainer.extend(extensions.LogReport(trigger=(10, 'iteration')))
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss', 'elapsed_time']))
        trainer.extend(extensions.PlotReport(
            ['main/loss', 'validation/main/loss'], x_key='epoch',
            file_name='loss_' + str(v) + '_epoch.png'))
        trainer.extend(extensions.ProgressBar(update_interval=1))
        trainer.run()

        modelname = "./results/model" + str(v)
        print("saving model to " + modelname)
        serializers.save_npz(modelname, model)
        optname = "./results/opt" + str(v)
        print("saving opt to " + optname)
        serializers.save_npz(optname, optimizer)
        v = v + 1
def main(args):
    """Train and evaluate an ENE multi-label classifier with k-fold CV.

    Trains one model per cross-validation split, collects predictions on
    every test split, then reports entity-based and class-based
    precision/recall/F1 and writes per-entity predictions as JSON lines to
    <out_dir>/classification_result.json.
    """
    # Fix all RNGs for reproducibility (CPU and, if used, GPU).
    random.seed(0)
    np.random.seed(0)
    if args.gpu >= 0:
        cuda.get_device_from_id(args.gpu).use()
        cuda.cupy.random.seed(0)

    dataset, id2ene = load_dataset(args.dataset, args.features, args.redirects)
    print(f'# of examples in dataset: {len(dataset)}')

    def batch2tensors(batch, device):
        """Convert a minibatch of items into (x, t) arrays on CPU or GPU.

        x concatenates a multi-hot feature vector with a dense embedding;
        t is a multi-hot target over all ENE classes.
        """
        xp = cuda.cupy if device >= 0 else np
        xf = xp.zeros((len(batch), args.n_feature), dtype='f')
        xe = xp.zeros((len(batch), args.embed_size), dtype='f')
        t = xp.zeros((len(batch), len(id2ene)), dtype='i')
        for i, item in enumerate(batch):
            # Multi-hot encode the sparse feature ids; ids beyond
            # n_feature are silently dropped.
            for feature_id in item['feature_ids']:
                if feature_id < args.n_feature:
                    xf[i, feature_id] = 1.0
            # Items without an embedding keep a zero vector.
            if item['embedding']:
                xe[i] = xp.array(item['embedding'], dtype='f')
            for ene_id in item['ene_ids']:
                t[i, ene_id] = 1
        x = xp.concatenate((xf, xe), axis=1)
        return x, t

    cv_datasets = get_cross_validation_datasets(dataset, args.cv)
    ys = []  # predicted score matrices, one per split
    ts = []  # reference target matrices, one per split
    for split_idx, cv_dataset in enumerate(cv_datasets):
        print(f'cross validation ({split_idx + 1}/{len(cv_datasets)})')
        train, test = cv_dataset
        train_iter = SerialIterator(train, batch_size=args.batch)
        test_iter = SerialIterator(test, batch_size=args.batch, repeat=False, shuffle=False)
        # A fresh model and optimizer for every split.
        model = ENEClassifier(in_size=args.n_feature + args.embed_size,
                              hidden_size=args.hidden_size,
                              out_size=len(id2ene))
        if args.gpu >= 0:
            model.to_gpu(args.gpu)
        optimizer = optimizers.Adam()
        optimizer.setup(model)
        updater = StandardUpdater(train_iter, optimizer, converter=batch2tensors, device=args.gpu)
        trainer = Trainer(updater, (args.epoch, 'epoch'), out=args.out_dir)
        trainer.extend(extensions.LogReport())
        trainer.extend(
            extensions.snapshot_object(
                model, filename='epoch_{.updater.epoch}.model'))
        trainer.extend(
            extensions.Evaluator(test_iter, model, converter=batch2tensors, device=args.gpu))
        trainer.extend(
            extensions.PrintReport(
                ['epoch', 'main/loss', 'validation/main/loss', 'elapsed_time']))
        trainer.extend(extensions.ProgressBar(update_interval=1))
        trainer.run()

        # Collect predictions for this split's held-out data.
        test_iter.reset()
        for batch in test_iter:
            x, t = batch2tensors(batch, device=args.gpu)
            with chainer.using_config('train', False):
                y = model.predict(x)
            ys.append(y)
            ts.append(t)

    # Stack all splits; rows are entities, columns are ENE classes.
    # NOTE(review): row order assumes the CV test splits cover `dataset`
    # exactly once and in the original order — confirm against
    # get_cross_validation_datasets.
    y_all = F.concat(ys, axis=0)
    t_all = F.concat(ts, axis=0)
    # Binarize: a class counts as predicted at score >= 0.5.
    prediction_matrix = (y_all.data >= 0.5).astype('f')
    reference_matrix = (t_all.data == 1).astype('f')
    accuracy_matrix = prediction_matrix * reference_matrix  # true positives

    # Entity-based (per-row) metrics, averaged over entities.
    eb_pred = accuracy_matrix_sum = prediction_matrix.sum(
        axis=1)  # entity-based num. of predicted classes
    eb_ref = reference_matrix.sum(
        axis=1)  # entity-based num. of reference classes
    eb_acc = accuracy_matrix.sum(
        axis=1)  # entity-based num. of accurate classes
    eb_nopred = (eb_pred == 0.).astype('f')  # for avoiding zero-division
    eb_precision = (eb_acc / (eb_pred + eb_nopred)).mean()
    eb_recall = (eb_acc / eb_ref).mean()
    eb_f1 = (2 * eb_acc / (eb_pred + eb_ref)).mean()

    # Class-based (per-column) metrics, macro- and micro-averaged.
    cb_pred = prediction_matrix.sum(
        axis=0)  # class-based num. of predicted examples
    cb_ref = reference_matrix.sum(
        axis=0)  # class-based num. of reference examples
    cb_acc = accuracy_matrix.sum(
        axis=0)  # class-based num. of accurate examples
    cb_nopred = (cb_pred == 0.).astype('f')  # for avoiding zero-division
    cb_macro_precision = (cb_acc / (cb_pred + cb_nopred)).mean()
    cb_macro_recall = (cb_acc / cb_ref).mean()
    cb_macro_f1 = (2 * cb_acc / (cb_pred + cb_ref)).mean()
    cb_micro_precision = cb_acc.sum() / cb_pred.sum()
    cb_micro_recall = cb_acc.sum() / cb_ref.sum()
    cb_micro_f1 = (2 * cb_acc.sum()) / (cb_pred.sum() + cb_ref.sum())

    print(f'Entity-based Precision: {float(eb_precision):.2%}')
    print(f'Entity-based Recall: {float(eb_recall):.2%}')
    print(f'Entity-based F1 score: {float(eb_f1):.2%}')
    print(f'Class-based macro Precision: {float(cb_macro_precision):.2%}')
    print(f'Class-based macro Recall: {float(cb_macro_recall):.2%}')
    print(f'Class-based macro F1 score: {float(cb_macro_f1):.2%}')
    print(f'Class-based micro Precision: {float(cb_micro_precision):.2%}')
    print(f'Class-based micro Recall: {float(cb_micro_recall):.2%}')
    print(f'Class-based micro F1 score: {float(cb_micro_f1):.2%}')

    print(f'writing out classification results')
    # One JSON object per entity: title, predicted classes, reference classes.
    with open(Path(args.out_dir) / 'classification_result.json', 'w') as fo:
        for i, item in tqdm(enumerate(dataset)):
            title = item['title']
            predicted_classes = [
                id2ene[j] for j, v in enumerate(prediction_matrix[i])
                if v == 1.0
            ]
            reference_classes = [
                id2ene[j] for j, v in enumerate(reference_matrix[i])
                if v == 1.0
            ]
            out = {
                'title': title,
                'prediction': predicted_classes,
                'reference': reference_classes
            }
            print(json.dumps(out, ensure_ascii=False), file=fo)