def pca_train(n, normed_trainx, trainy, normed_validx, validy, opt, logger,
              layers, opt_lr, opt_epochs, optimizer):
    trainx, validx, var_ex = pca_nd(normed_trainx, normed_validx, n, logger)
    validy_ = validy.copy()  # for further convenience
    trainy_ = trainy.copy()
    if opt.gpu:  # store everything on the GPU all at once
        logger.info('Using GPU acceleration')
        device = torch.device("cuda:0")
        trainx = torch.Tensor(trainx).to(device)
        trainy = torch.Tensor(trainy).to(device)
        validx = torch.Tensor(validx).to(device)
        validy = torch.Tensor(validy).to(device)
    model = fitting.TorchMLPRegressor(len(trainx[0]),
                                      len(trainy[0]),
                                      layers,
                                      batch_size=opt.batch,
                                      is_gpu=opt.gpu != 0,
                                      args_opt={
                                          'optimizer': torch.optim.Adam,
                                          'lr': opt_lr[0],  # initial learning rate; opt.lr is the raw comma-separated string
                                          'weight_decay': opt.l2
                                      })
    model.init_session()
    print(model.regressor)
    model.load_data(trainx, trainy)
    header = 'Epoch Loss MeaSquE MeaSigE MeaUnsE MaxRelE Acc1% Acc2% Acc5% Acc10%'.split()
    logger.info('%-8s %8s %8s %8s %8s %8s %8s %8s %8s %8s' % tuple(header))
    total_epoch = 0
    for k, each_epoch in enumerate(opt_epochs):
        # separate learning rate for each training stage
        model.reset_optimizer({
            'optimizer': optimizer,
            'lr': opt_lr[k],
            'weight_decay': opt.l2
        })
        for i_epoch in range(each_epoch):
            total_epoch += 1
            loss = model.fit_epoch(trainx, trainy)
            if (i_epoch + 1) % 20 == 0 or i_epoch + 1 == each_epoch:
                predy = model.predict_batch(validx)
                err_line = '%d/%d %8.3e %8.3e %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f' % (
                    total_epoch, sum(opt_epochs), loss,
                    metrics.mean_squared_error(validy_, predy),
                    metrics.mean_signed_error(validy_, predy) * 100,
                    metrics.mean_unsigned_error(validy_, predy) * 100,
                    metrics.max_relative_error(validy_, predy) * 100,
                    metrics.accuracy(validy_, predy, 0.01) * 100,
                    metrics.accuracy(validy_, predy, 0.02) * 100,
                    metrics.accuracy(validy_, predy, 0.05) * 100,
                    metrics.accuracy(validy_, predy, 0.10) * 100)
                logger.info(err_line)
    # use validy_ (CPU copy): validy may be a CUDA tensor at this point
    return (var_ex,
            metrics.accuracy(validy_, predy, 0.02) * 100,
            metrics.mean_squared_error(validy_, predy))
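# NOTE: `pca_nd` (used above) is imported from elsewhere and not shown in this
# file. The sketch below is a hypothetical equivalent, assuming scikit-learn is
# available and that `var_ex` is the fraction of variance explained by the kept
# components; it is named `_pca_nd_sketch` so it does not shadow the real
# implementation.
def _pca_nd_sketch(trainx, validx, n, logger):
    from sklearn.decomposition import PCA
    pca = PCA(n_components=n)
    trainx_t = pca.fit_transform(trainx)  # fit the projection on training data only
    validx_t = pca.transform(validx)      # apply the same projection to validation data
    var_ex = pca.explained_variance_ratio_.sum()
    logger.info('PCA kept %d components, explained variance = %.4f' % (n, var_ex))
    return trainx_t, validx_t, var_ex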
def load_data(opt, logger):
    if opt.layer != "":
        layers = list(map(int, opt.layer.split(',')))
    else:
        layers = []
    if not os.path.exists(opt.output):
        os.mkdir(opt.output)
    if opt.featrm == 'auto':
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = list(map(int, opt.featrm.split(',')))
    logger.info('loading data...')
    datax, datay, data_names = dataloader.load(filename=opt.input,
                                               target=opt.target,
                                               fps=opt.fp.split(','),
                                               featrm=featrm)
    selector = preprocessing.Selector(datax, datay, data_names)
    if opt.part:
        selector.load(opt.part)
    else:
        selector.partition(0.8, 0.1)
        selector.save(opt.output + '/part.txt')
    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()
    logger.info('loading model...')
    scaler = preprocessing.Scaler()
    scaler.load(opt.output + '/scale.txt')
    normed_trainx = scaler.transform(trainx)
    normed_validx = scaler.transform(validx)
    model = fitting.TorchMLPRegressor(None, None, [], is_gpu=False)
    model.load(opt.output + '/model.pt')
    # if opt.pca != -1:
    #     normed_trainx, normed_validx, _ = pca_nd(normed_trainx, normed_validx,
    #                                              len(normed_trainx[0]) - opt.pca)
    return normed_validx, validy, model
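# A minimal usage sketch for `load_data`, assuming a model and scaler were
# previously saved under `opts.output`. The Namespace below is a hand-built
# stand-in for the CLI options parsed in main(), and the input/fingerprint
# file names are placeholders:
def _load_data_example():
    import argparse
    opts = argparse.Namespace(layer='16,16', output='out', featrm='', part='',
                              input='data.txt', target='raw_density', fp='fingerprint')
    logger = logging.getLogger('eval')
    normed_validx, validy, model = load_data(opts, logger)
    predy = model.predict_batch(normed_validx)
    print('Validation MSE: %.3e' % metrics.mean_squared_error(validy, predy))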
def main():
    parser = argparse.ArgumentParser(description='Alkane property fitting demo')
    parser.add_argument('-i', '--input', type=str, help='Data')
    parser.add_argument('-f', '--fp', type=str, help='Fingerprints')
    parser.add_argument('-o', '--output', default='out', type=str, help='Output directory')
    parser.add_argument('-t', '--target', default='raw_density', type=str, help='Fitting target')
    parser.add_argument('-p', '--part', default='', type=str, help='Partition cache file')
    parser.add_argument('-l', '--layer', default='16,16', type=str, help='Sizes of hidden layers')
    parser.add_argument('--visual', default=1, type=int, help='Visualization data')
    parser.add_argument('--gpu', default=1, type=int, help='Use GPU')
    parser.add_argument('--epoch', default='500,2000,2500', type=str, help='Number of epochs per stage')
    parser.add_argument('--batch', default=1000, type=int, help='Batch size')
    parser.add_argument('--lr', default='0.01,0.001,0.0001', type=str, help='Learning rate per stage')
    parser.add_argument('--l2', default=0.000, type=float, help='L2 penalty')
    parser.add_argument('--check', default=50, type=int, help='Number of epochs between convergence checks')
    parser.add_argument('--minstop', default=0.2, type=float, help='Minimum fraction of total epochs before early stop')
    parser.add_argument('--maxconv', default=2, type=int, help='Number of convergence detections required to stop')
    parser.add_argument('--featrm', default='', type=str, help='Remove features')
    parser.add_argument('--optim', default='rms', type=str, help='Optimizer')
    parser.add_argument('--continuation', default=False, type=bool, help='Continue training from a checkpoint')
    parser.add_argument('--pca', default=-1, type=int, help='Number of PCA dimensions to discard')
    parser.add_argument('--sobol', default=-1, type=int,
                        help='Number of dimensions to remove according to Sobol sensitivity analysis')
    opt = parser.parse_args()

    if opt.layer != "":
        layers = list(map(int, opt.layer.split(',')))
    else:
        layers = []
    opt_lr = list(map(float, opt.lr.split(',')))
    opt_epochs = list(map(int, opt.epoch.split(',')))
    if not os.path.exists(opt.output):
        os.mkdir(opt.output)

    logger = logging.getLogger('train')
    logger.setLevel(logging.INFO)
    flog = logging.FileHandler(opt.output + '/log.txt', mode='w')
    flog.setLevel(logging.INFO)
    formatter = logging.Formatter(fmt='[%(asctime)s] (%(levelname)s) %(message)s',
                                  datefmt='%Y-%m-%d %H:%M:%S')
    flog.setFormatter(formatter)
    clog = logging.StreamHandler()
    clog.setFormatter(formatter)
    logger.addHandler(flog)
    logger.addHandler(clog)

    if sys.platform == 'linux':
        logger.info('Using non-interactive Agg backend for matplotlib on Linux')
        matplotlib.use('Agg')

    if opt.featrm == 'auto':
        logger.info('Automatically removing features')
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = list(map(int, opt.featrm.split(',')))
    logger.info('Removed features: %s' % featrm)

    logger.info('Reading data...')
    datax, datay, data_names = dataloader.load(filename=opt.input,
                                               target=opt.target,
                                               fps=opt.fp.split(','),
                                               featrm=featrm)
    # Store fingerprint identifier files
    for fp in opt.fp.split(','):
        if os.path.exists(fp + '.idx') and Path(fp).parent.absolute() != Path(opt.output).absolute():
            shutil.copy(fp + '.idx', opt.output)

    logger.info('Selecting data...')
    selector = preprocessing.Selector(datax, datay, data_names)
    if opt.part:
        logger.info('Loading partition file %s' % opt.part)
        selector.load(opt.part)
    else:
        logger.warning('Partition file not found. Using auto-partition instead.')
        selector.partition(0.8, 0.1)
        selector.save(opt.output + '/part.txt')
    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()
    logger.info('Training size = %d, Validation size = %d' % (len(trainx), len(validx)))
    logger.info('X input example: (size=%d) %s' % (len(datax[0]), ','.join(map(str, datax[0]))))
    logger.info('Y input example: (size=%d) %s' % (len(datay[0]), ','.join(map(str, datay[0]))))

    logger.info('Normalizing...')
    scaler = preprocessing.Scaler()
    scaler.fit(trainx)
    scaler.save(opt.output + '/scale.txt')
    normed_trainx = scaler.transform(trainx)
    normed_validx = scaler.transform(validx)

    if opt.sobol != -1:
        with open(opt.output + '/sobol_idx.pkl', 'rb') as file:
            sobol_idx = pickle.load(file)
        normed_trainx, normed_validx = sobol_reduce(
            normed_trainx, normed_validx,
            len(normed_trainx[0]) - 2 - opt.sobol, sobol_idx)
        logger.info('Sobol SA reduced dimensions: %d' % opt.sobol)

    if opt.pca != -1:
        normed_trainx, normed_validx, _ = pca_nd(
            normed_trainx, normed_validx,
            len(normed_trainx[0]) - opt.pca, logger)
        logger.info('PCA reduced dimensions: %d' % opt.pca)

    logger.info('Final input length: %d' % len(normed_trainx[0]))

    logger.info('Building network...')
    logger.info('Hidden layers = %r' % layers)
    logger.info('Optimizer = %s' % opt.optim)
    logger.info('Learning rate = %s' % opt_lr)
    logger.info('Epochs = %s' % opt_epochs)
    logger.info('L2 penalty = %f' % opt.l2)
    logger.info('Batch size = %d' % opt.batch)

    validy_ = validy.copy()  # for further convenience
    trainy_ = trainy.copy()
    if opt.gpu:  # store everything on the GPU all at once
        logger.info('Using GPU acceleration')
        device = torch.device("cuda:0")
        normed_trainx = torch.Tensor(normed_trainx).to(device)
        trainy = torch.Tensor(trainy).to(device)
        normed_validx = torch.Tensor(normed_validx).to(device)
        validy = torch.Tensor(validy).to(device)

    if opt.optim == 'sgd':
        optimizer = torch.optim.SGD
    elif opt.optim == 'adam':
        optimizer = torch.optim.Adam
    elif opt.optim == 'rms':
        optimizer = torch.optim.RMSprop
    elif opt.optim == 'ada':
        optimizer = torch.optim.Adagrad
    else:
        raise ValueError('Unknown optimizer: %s' % opt.optim)

    model = fitting.TorchMLPRegressor(len(normed_trainx[0]),
                                      len(trainy[0]),
                                      layers,
                                      batch_size=opt.batch,
                                      is_gpu=opt.gpu != 0,
                                      args_opt={
                                          'optimizer': optimizer,
                                          'lr': opt_lr[0],  # initial learning rate; replaced per stage by reset_optimizer below
                                          'weight_decay': opt.l2
                                      })
    model.init_session()
    if opt.continuation:
        cpt = opt.output + '/model.pt'
        logger.info('Continuing training from checkpoint %s' % cpt)
        model.load(cpt)
    logger.info('Optimizer = %s' % optimizer)

    header = 'Step Loss MeaSquE MeaSigE MeaUnsE MaxRelE Acc2% Acc5% Acc10%'.split()
    logger.info('%-8s %8s %8s %8s %8s %8s %8s %8s %8s' % tuple(header))

    mse_history = []
    converge_times = 0
    mse_min = None
    model_saved = False
    converged = False
    all_epoch = sum(opt_epochs)
    total_epoch = 0
    for k, each_epoch in enumerate(opt_epochs):
        # separate learning rate for each training stage
        model.reset_optimizer({
            'optimizer': optimizer,
            'lr': opt_lr[k],
            'weight_decay': opt.l2
        })
        for i in range(each_epoch):
            total_epoch += 1
            loss = model.fit_epoch(normed_trainx, trainy)
            if total_epoch % opt.check == 0:
                predy = model.predict_batch(normed_validx)
                mse = metrics.mean_squared_error(validy_, predy)
                mse_history.append(mse)
                err_line = '%-8i %8.2e %8.2e %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f' % (
                    total_epoch,
                    loss.data.cpu().numpy() if model.is_gpu else loss.data.numpy(),
                    mse,
                    metrics.mean_signed_error(validy_, predy) * 100,
                    metrics.mean_unsigned_error(validy_, predy) * 100,
                    metrics.max_relative_error(validy_, predy) * 100,
                    metrics.accuracy(validy_, predy, 0.02) * 100,
                    metrics.accuracy(validy_, predy, 0.05) * 100,
                    metrics.accuracy(validy_, predy, 0.10) * 100)
                logger.info(err_line)
                if mse_min is None:
                    mse_min = mse
                elif mse < mse_min:
                    model.save(opt.output + '/model.pt')
                    model_saved = True
                    mse_min = mse
                if total_epoch > all_epoch * opt.minstop:
                    conv, cur_conv = validation.is_converge(np.array(mse_history), nskip=25)
                    if conv:
                        logger.info('Model convergence detected at epoch %d' % total_epoch)
                        converge_times += 1
                    if converge_times >= opt.maxconv and cur_conv:
                        logger.info('Model converged at epoch %d' % total_epoch)
                        converged = True
                        break
        if converged:
            break  # also leave the outer stage loop once converged
    if not converged:
        logger.warning('Model not converged')
    if not model_saved:
        model.save(opt.output + '/model.pt')

    visualizer = visualize.LinearVisualizer(trainy_.reshape(-1),
                                            model.predict_batch(normed_trainx).reshape(-1),
                                            trainname, 'training')
    visualizer.append(validy_.reshape(-1),
                      model.predict_batch(normed_validx).reshape(-1),
                      validname, 'validation')
    visualizer.dump(opt.output + '/fit.txt')
    visualizer.dump_bad_molecules(opt.output + '/error-0.05.txt', 'validation', threshold=0.05)
    visualizer.dump_bad_molecules(opt.output + '/error-0.10.txt', 'validation', threshold=0.1)
    visualizer.dump_bad_molecules(opt.output + '/error-0.15.txt', 'validation', threshold=0.15)
    visualizer.dump_bad_molecules(opt.output + '/error-0.20.txt', 'validation', threshold=0.2)
    logger.info('Fitting results saved')

    if opt.visual:
        visualizer.scatter_yy(savefig=opt.output + '/error-train.png',
                              annotate_threshold=0, marker='x', lw=0.2, s=5)
        visualizer.hist_error(savefig=opt.output + '/error-hist.png',
                              label='validation', histtype='step', bins=50)
        plt.show()
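# NOTE: `sobol_reduce` (used in main above) is not defined in this file. A
# hypothetical sketch, assuming `sobol_idx` lists feature columns ordered by
# Sobol sensitivity (most sensitive first) and that the `n_keep` most sensitive
# columns are retained; named `_sobol_reduce_sketch` to avoid shadowing the
# real implementation.
def _sobol_reduce_sketch(trainx, validx, n_keep, sobol_idx):
    keep = sorted(sobol_idx[:n_keep])  # column indices to keep, in original order
    return trainx[:, keep], validx[:, keep]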
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dir', default='out', help='Model directory')
    parser.add_argument('-i', '--input', help='Data')
    parser.add_argument('-f', '--fp', help='Fingerprints')
    parser.add_argument('-t', '--target', help='Target property')
    parser.add_argument('-p', '--part', help='Partition file')
    parser.add_argument('--gpu', default=1, type=int, help='Use GPU')
    parser.add_argument('--visual', default=1, type=int, help='Visualization data')
    parser.add_argument('--visualx', default='', help='Extra visualization on selected x columns')
    parser.add_argument('--dump', default='', help='Output of fitting results')
    parser.add_argument('--featrm', default='', type=str, help='Remove features')
    opt = parser.parse_args()

    model = fitting.TorchMLPRegressor(None, None, [])
    model.is_gpu = opt.gpu == 1
    model.load(opt.dir + '/model.pt')
    scaler = preprocessing.Scaler()
    scaler.load(opt.dir + '/scale.txt')

    if opt.featrm == 'auto':
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = list(map(int, opt.featrm.split(',')))

    datax, datay, data_names = mdlearn.dataloader.load(filename=opt.input,
                                                       target=opt.target,
                                                       fps=opt.fp.split(','),
                                                       featrm=featrm)
    selector = preprocessing.Selector(datax, datay, data_names)
    selector.load(opt.part)
    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()
    testx, testy, testname = selector.test_set()

    normed_trainx = scaler.transform(trainx)
    normed_validx = scaler.transform(validx)
    normed_testx = scaler.transform(testx)

    trainy = trainy.flatten()
    validy = validy.flatten()
    testy = testy.flatten()
    trainy_est = model.predict_batch(normed_trainx).flatten()
    validy_est = model.predict_batch(normed_validx).flatten()
    testy_est = model.predict_batch(normed_testx).flatten()

    def evaluate_model(y, y_est):
        mse = metrics.mean_squared_error(y, y_est)
        ae = np.average(metrics.abs_absolute_error(y, y_est))
        ave_y = np.average(y)
        ave_y_est = np.average(y_est)
        bias = ave_y_est - ave_y
        eval_results = OrderedDict()
        eval_results['MSE'] = mse
        eval_results['RMSE'] = np.sqrt(mse)
        eval_results['AE'] = ae
        eval_results['Max AAE'] = metrics.max_absolute_error(y, y_est)
        eval_results['Bias'] = bias
        eval_results['RRMSE'] = np.sqrt(mse) / np.abs(ave_y)
        eval_results['MARE'] = ae / np.abs(ave_y)
        eval_results['Max ARE'] = metrics.max_relative_error(y, y_est)
        eval_results['RBias'] = bias / np.abs(ave_y)
        eval_results['Accuracy1%'] = metrics.accuracy(y, y_est, 0.01)
        eval_results['Accuracy2%'] = metrics.accuracy(y, y_est, 0.02)
        eval_results['Accuracy5%'] = metrics.accuracy(y, y_est, 0.05)
        eval_results['Accuracy10%'] = metrics.accuracy(y, y_est, 0.1)
        return eval_results

    results = []
    results.append(evaluate_model(trainy, trainy_est))
    results.append(evaluate_model(validy, validy_est))
    results.append(evaluate_model(testy, testy_est))
    results.append(
        evaluate_model(np.concatenate((trainy, validy, testy)),
                       np.concatenate((trainy_est, validy_est, testy_est))))

    print('Dataset\t%s' % '\t'.join(results[0].keys()))
    fmt = lambda x: '%.3g' % x
    for name, result in zip(['Training', 'Validation', 'Test', 'Overall'], results):
        print('%s\t%s' % (name, '\t'.join([fmt(v) for v in result.values()])))

    visualizer = visualize.LinearVisualizer(trainy, trainy_est, trainname, 'training')
    visualizer.append(validy, validy_est, validname, 'validation')
    visualizer.append(testy, testy_est, testname, 'test')
    if opt.dump:
        visualizer.dump(opt.dump)
    if opt.visual:
        visualizer.scatter_yy(annotate_threshold=0.1, marker='x', lw=0.2, s=5,
                              figure_name='Value')
        visualizer.scatter_error(annotate_threshold=0.1, marker='x', lw=0.2, s=5,
                                 figure_name='Error')
        visualizer.hist_error(label='test', histtype='step', bins=50,
                              figure_name='Error Distribution')
    if opt.visualx:
        for i in map(int, opt.visualx.split(',')):
            visualizer2 = visualize.LinearVisualizer(trainx[:, i], trainy_est - trainy,
                                                     trainname, 'training')
            visualizer2.append(validx[:, i], validy_est - validy, validname, 'validation')
            visualizer2.append(testx[:, i], testy_est - testy, testname, 'test')
            visualizer2.scatter_yy(ref=None, annotate_threshold=-1, marker='x',
                                   lw=0.2, s=5, figure_name=str(i))
    plt.show()
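# The `metrics.accuracy(y, y_est, tol)` calls in these scripts feed the "AccX%"
# columns. A sketch of the presumed definition (fraction of samples whose
# relative error is within `tol`), named `_accuracy_sketch` since the real
# `metrics` module is not shown here:
def _accuracy_sketch(y, y_est, tol):
    y = np.asarray(y).flatten()
    y_est = np.asarray(y_est).flatten()
    return float(np.mean(np.abs(y_est - y) <= tol * np.abs(y)))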