# Partition script: build 5-fold splits on unique molecules and map them back to all data rows,
# so rows of the same SMILES never straddle the train/validation boundary.
def main():
    parser = argparse.ArgumentParser(description='Alkane property fitting demo')
    parser.add_argument('-i', '--input', type=str, help='Data')
    parser.add_argument('-o', '--output', default='fp', type=str, help='Output directory')
    opt = parser.parse_args()

    if not os.path.exists(opt.output):
        os.mkdir(opt.output)

    df = pd.read_csv(opt.input, sep=r'\s+', header=0)
    smiles_array = df.SMILES.values

    selector = preprocessing.Selector(smiles_array)
    sel_mol = preprocessing.Selector(df.SMILES.unique())

    fold = 5
    sel_mol.kfold_partition(1.0, fold)
    for n in range(fold):
        sel_mol.kfold_use(n)
        mol_train = sel_mol.training_set()
        mol_valid = sel_mol.validation_set()
        mol_train_dict = dict([(s, 1) for s in mol_train])
        mol_valid_dict = dict([(s, 1) for s in mol_valid])
        selector.train_index = np.array([mol_train_dict.get(m, 0) for m in smiles_array], dtype=bool)
        selector.valid_index = np.array([mol_valid_dict.get(m, 0) for m in smiles_array], dtype=bool)
        selector.test_index = np.logical_not(np.logical_or(selector.train_index, selector.valid_index))
        selector.save(opt.output + '/part-%i.txt' % (n + 1))
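# Hedged sketch (not part of the original script): the dict-of-ones lookups above are plain
# membership tests. The helper below shows the same per-molecule masking idea, assuming only
# numpy; the name smiles_mask is hypothetical.
import numpy as np

def smiles_mask(smiles_array, smiles_subset):
    """Return a boolean mask marking every data row whose SMILES is in the subset,
    so all rows of the same molecule fall on the same side of the split."""
    subset = set(smiles_subset)
    return np.array([s in subset for s in smiles_array], dtype=bool)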
def load_data(opt, logger):
    if opt.layer != "":
        layers = list(map(int, opt.layer.split(',')))
    else:
        layers = []

    if not os.path.exists(opt.output):
        os.mkdir(opt.output)

    if opt.featrm == 'auto':
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = list(map(int, opt.featrm.split(',')))

    logger.info('loading data...')
    datax, datay, data_names = dataloader.load(filename=opt.input,
                                               target=opt.target,
                                               fps=opt.fp.split(','),
                                               featrm=featrm)

    selector = preprocessing.Selector(datax, datay, data_names)
    if opt.part:
        selector.load(opt.part)
    else:
        selector.partition(0.8, 0.1)
        selector.save(opt.output + '/part.txt')
    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()

    logger.info('loading model...')
    scaler = preprocessing.Scaler()
    scaler.load(opt.output + '/scale.txt')
    normed_trainx = scaler.transform(trainx)
    normed_validx = scaler.transform(validx)
    model = fitting.TorchMLPRegressor(None, None, [], is_gpu=False)
    model.load(opt.output + '/model.pt')

    # if opt.pca != -1:
    #     normed_trainx, normed_validx, _ = pca_nd(normed_trainx, normed_validx,
    #                                              len(normed_trainx[0]) - opt.pca)

    return normed_validx, validy, model
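# Hedged usage sketch for load_data() (hypothetical values; the attribute names mirror the
# argparse options used elsewhere in this repo, the file paths are assumptions):
#
#     opt = argparse.Namespace(input='data/alkanes.txt', target='raw_density',
#                              fp='out/fingerprint.fp', featrm='', part='',
#                              output='out', layer='16,16')
#     logger = logging.getLogger('predict')
#     normed_validx, validy, model = load_data(opt, logger)
#     predy = model.predict_batch(normed_validx)   # predictions on the validation set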
# GNN training script: build DGL graphs from SMILES (or cached MSD files), train a GATModel,
# and report validation metrics every opt.check epochs.
def main():
    logger.info('Reading data and extra features...')
    fp_files = [] if opt.fp is None else opt.fp.split(',')
    fp_extra, y_array, name_array = dataloader.load(opt.input, opt.target, fp_files)
    smiles_list = [name.split()[0] for name in name_array]

    logger.info('Generating molecular graphs with %s...' % opt.graph)
    if opt.graph == 'rdk':
        graph_list, feats_list = smi2dgl(smiles_list)
    elif opt.graph == 'msd':
        msd_list = ['%s.msd' % base64.b64encode(smiles.encode()).decode() for smiles in smiles_list]
        graph_list, feats_list = msd2dgl(msd_list, '../data/msdfiles.zip')
    else:
        raise ValueError('Unknown graph type: %s' % opt.graph)

    logger.info('Node feature example: (size=%d) %s' % (len(feats_list[0][0]), ','.join(map(str, feats_list[0][0]))))
    logger.info('Extra graph feature example: (size=%d) %s' % (len(fp_extra[0]), ','.join(map(str, fp_extra[0]))))
    logger.info('Output example: (size=%d) %s' % (len(y_array[0]), ','.join(map(str, y_array[0]))))

    if fp_extra.shape[-1] > 0:
        logger.info('Normalizing extra graph features...')
        scaler = preprocessing.Scaler()
        scaler.fit(fp_extra)
        scaler.save(opt.output + '/scale.txt')
        fp_extra = scaler.transform(fp_extra)

    logger.info('Selecting data...')
    selector = preprocessing.Selector(smiles_list)
    if opt.part is not None:
        logger.info('Loading partition file %s' % opt.part)
        selector.load(opt.part)
    else:
        logger.warning('Partition file not provided. Using auto-partition instead')
        selector.partition(0.8, 0.2)

    device = torch.device('cuda:0')

    # Batched data for the training set
    data_list = [[data[i] for i in np.where(selector.train_index)[0]]
                 for data in (graph_list, y_array, feats_list, fp_extra, name_array, smiles_list)]
    n_batch, (graphs_batch, y_batch, feats_node_batch, feats_extra_batch, names_batch) = \
        preprocessing.separate_batches(data_list[:-1], opt.batch, data_list[-1])
    bg_batch_train = [dgl.batch(graphs).to(device) for graphs in graphs_batch]
    y_batch_train = [torch.tensor(y, dtype=torch.float32, device=device) for y in y_batch]
    feats_node_batch_train = [torch.tensor(np.concatenate(feats_node), dtype=torch.float32, device=device)
                              for feats_node in feats_node_batch]
    feats_extra_batch_train = [torch.tensor(feats_extra, dtype=torch.float32, device=device)
                               for feats_extra in feats_extra_batch]
    # For plotting
    y_train_array = np.concatenate(y_batch)
    names_train = np.concatenate(names_batch)

    # Data for the validation set
    graphs, y, feats_node, feats_extra, names_valid = \
        [[data[i] for i in np.where(selector.valid_index)[0]]
         for data in (graph_list, y_array, feats_list, fp_extra, name_array)]
    bg_valid, y_valid, feats_node_valid, feats_extra_valid = (
        dgl.batch(graphs).to(device),
        torch.tensor(y, dtype=torch.float32, device=device),
        torch.tensor(np.concatenate(feats_node), dtype=torch.float32, device=device),
        torch.tensor(feats_extra, dtype=torch.float32, device=device),
    )
    # For plotting
    y_valid_array = y_array[selector.valid_index]

    logger.info('Training size = %d, Validation size = %d' % (len(y_train_array), len(y_valid_array)))
    logger.info('Batches = %d, Batch size ~= %d' % (n_batch, opt.batch))

    in_feats_node = feats_list[0].shape[-1]
    in_feats_extra = fp_extra[0].shape[-1]
    n_heads = list(map(int, opt.head.split(',')))

    logger.info('Building network...')
    logger.info('Conv layers = %s' % n_heads)
    logger.info('Learning rate = %s' % opt.lr)
    logger.info('L2 penalty = %f' % opt.l2)

    model = GATModel(in_feats_node, opt.embed, n_head_list=n_heads, extra_feats=in_feats_extra)
    model.cuda()
    print(model)
    for name, param in model.named_parameters():
        print(name, param.data.shape)

    header = 'Step MaxRE(t) Loss MeaSquE MeaSigE MeaUnsE MaxRelE Acc2% Acc5% Acc10%'.split()
    logger.info('%-8s %8s %8s %8s %8s %8s %8s %8s %8s %8s' % tuple(header))

    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr, weight_decay=opt.l2)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=opt.lrsteps, gamma=opt.lrgamma)
    for epoch in range(opt.epoch):
        model.train()
        if (epoch + 1) % opt.check == 0:
            pred_train = [None] * n_batch
        for ib in np.random.permutation(n_batch):
            optimizer.zero_grad()
            pred = model(bg_batch_train[ib], feats_node_batch_train[ib], feats_extra_batch_train[ib])
            loss = F.mse_loss(pred, y_batch_train[ib])
            loss.backward()
            optimizer.step()
            if (epoch + 1) % opt.check == 0:
                pred_train[ib] = pred.detach().cpu().numpy()
        scheduler.step()

        if (epoch + 1) % opt.check == 0:
            model.eval()
            pred_train = np.concatenate(pred_train)
            pred_valid = model(bg_valid, feats_node_valid, feats_extra_valid).detach().cpu().numpy()
            err_line = '%-8i %8.1f %8.2e %8.2e %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f' % (
                epoch + 1,
                metrics.max_relative_error(y_train_array, pred_train) * 100,
                metrics.mean_squared_error(y_train_array, pred_train),
                metrics.mean_squared_error(y_valid_array, pred_valid),
                metrics.mean_signed_error(y_valid_array, pred_valid) * 100,
                metrics.mean_unsigned_error(y_valid_array, pred_valid) * 100,
                metrics.max_relative_error(y_valid_array, pred_valid) * 100,
                metrics.accuracy(y_valid_array, pred_valid, 0.02) * 100,
                metrics.accuracy(y_valid_array, pred_valid, 0.05) * 100,
                metrics.accuracy(y_valid_array, pred_valid, 0.10) * 100)
            logger.info(err_line)

    torch.save(model, opt.output + '/model.pt')

    visualizer = visualize.LinearVisualizer(y_train_array.reshape(-1), pred_train.reshape(-1), names_train, 'train')
    visualizer.append(y_valid_array.reshape(-1), pred_valid.reshape(-1), names_valid, 'valid')
    visualizer.dump(opt.output + '/fit.txt')
    visualizer.dump_bad_molecules(opt.output + '/error-0.10.txt', 'valid', threshold=0.1)
    visualizer.dump_bad_molecules(opt.output + '/error-0.20.txt', 'valid', threshold=0.2)
    visualizer.scatter_yy(savefig=opt.output + '/error-train.png', annotate_threshold=0.1, marker='x', lw=0.2, s=5)
    visualizer.hist_error(savefig=opt.output + '/error-hist.png', label='valid', histtype='step', bins=50)
    plt.show()
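# Hedged inference sketch (an assumption, not in the original script): the loop above saves the
# whole module with torch.save(model, ...), so it can be restored with torch.load and applied to
# a batched graph prepared the same way as the validation batch above. The path 'out/model.pt'
# is hypothetical; bg_valid, feats_node_valid, feats_extra_valid refer to the tensors built above.
#
#     model = torch.load('out/model.pt', map_location=torch.device('cuda:0'))
#     model.eval()
#     with torch.no_grad():
#         pred = model(bg_valid, feats_node_valid, feats_extra_valid).cpu().numpy()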
# Partition script with three mutually exclusive modes: a predefined training file,
# a similarity-based split, or a k-fold split.
def main():
    parser = argparse.ArgumentParser(description='Alkane property fitting demo')
    parser.add_argument('-i', '--input', type=str, help='Data')
    parser.add_argument('-o', '--output', default='fp', type=str, help='Output directory')
    parser.add_argument('--fold', default=0, type=int, help='Use n-fold partition as validation set')
    parser.add_argument('--similarity', default=-1.0, type=float, help='Use similarity partition as validation set')
    opt = parser.parse_args()

    if not os.path.exists(opt.output):
        os.mkdir(opt.output)

    smiles_list = []
    smiles_list_training = None
    input_list = opt.input.split(',')
    for file in input_list:
        df = pd.read_csv(file, sep=r'\s+', header=0)
        if 'train' in file:
            smiles_list_training = df.SMILES.unique().tolist()
        else:
            smiles_array = df.SMILES.values
            selector = preprocessing.Selector(smiles_array)
            sel_mol = preprocessing.Selector(smiles_array)

    if len(input_list) == 2 and smiles_list_training is not None:
        # Predefined split: molecules listed in the training file go to the training set
        sel_mol.partition_smiles_list(smiles_list_training=smiles_list_training)
        mol_train = sel_mol.training_set()
        mol_valid = sel_mol.validation_set()
        mol_train_dict = dict([(s, 1) for s in mol_train])
        mol_valid_dict = dict([(s, 1) for s in mol_valid])
        selector.training_index = np.array([mol_train_dict.get(m, 0) for m in smiles_array], dtype=bool)
        selector.validation_index = np.array([mol_valid_dict.get(m, 0) for m in smiles_array], dtype=bool)
        selector.test_index = np.logical_not(np.logical_or(selector.training_index, selector.validation_index))
        selector.save(opt.output + '/part.txt')
    elif opt.similarity > 0.0:
        # Similarity-based split with the given cutoff
        sel_mol.similarity_partition(cutoff=opt.similarity)
        mol_train = sel_mol.training_set()
        mol_valid = sel_mol.validation_set()
        mol_train_dict = dict([(s, 1) for s in mol_train])
        mol_valid_dict = dict([(s, 1) for s in mol_valid])
        selector.training_index = np.array([mol_train_dict.get(m, 0) for m in smiles_array], dtype=bool)
        selector.validation_index = np.array([mol_valid_dict.get(m, 0) for m in smiles_array], dtype=bool)
        selector.test_index = np.logical_not(np.logical_or(selector.training_index, selector.validation_index))
        selector.save(opt.output + '/part-similarity-%.2f.txt' % opt.similarity)
    elif opt.fold != 0:
        # k-fold split: write one partition file per fold
        fold = opt.fold
        sel_mol.kfold_partition(1.0, fold)
        for n in range(fold):
            sel_mol.kfold_use(n)
            mol_train = sel_mol.training_set()
            mol_valid = sel_mol.validation_set()
            mol_train_dict = dict([(s, 1) for s in mol_train])
            mol_valid_dict = dict([(s, 1) for s in mol_valid])
            selector.training_index = np.array([mol_train_dict.get(m, 0) for m in smiles_array], dtype=bool)
            selector.validation_index = np.array([mol_valid_dict.get(m, 0) for m in smiles_array], dtype=bool)
            selector.test_index = np.logical_not(np.logical_or(selector.training_index, selector.validation_index))
            selector.save(opt.output + '/part-%i.txt' % (n + 1))
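# Example invocations (hedged: the script filename gen_partition.py is an assumption; the flags
# come from the argparse definitions above). The three branches are mutually exclusive:
#   python gen_partition.py -i data/train.txt,data/test.txt -o fp       # predefined training file
#   python gen_partition.py -i data/all.txt -o fp --similarity 0.5      # similarity-based split
#   python gen_partition.py -i data/all.txt -o fp --fold 5              # 5-fold split, part-1..part-5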
# MLP training script: fingerprint features in, target property out, with optional Sobol/PCA
# feature reduction, staged learning rates, and convergence-based early stopping.
def main():
    parser = argparse.ArgumentParser(description='Alkane property fitting demo')
    parser.add_argument('-i', '--input', type=str, help='Data')
    parser.add_argument('-f', '--fp', type=str, help='Fingerprints')
    parser.add_argument('-o', '--output', default='out', type=str, help='Output directory')
    parser.add_argument('-t', '--target', default='raw_density', type=str, help='Fitting target')
    parser.add_argument('-p', '--part', default='', type=str, help='Partition cache file')
    parser.add_argument('-l', '--layer', default='16,16', type=str, help='Size of hidden layers')
    parser.add_argument('--visual', default=1, type=int, help='Visualization data')
    parser.add_argument('--gpu', default=1, type=int, help='Using GPU')
    parser.add_argument('--epoch', default='500,2000,2500', type=str, help='Number of epochs')
    parser.add_argument('--batch', default=1000, type=int, help='Batch size')
    parser.add_argument('--lr', default='0.01,0.001,0.0001', type=str, help='Initial learning rate')
    parser.add_argument('--l2', default=0.000, type=float, help='L2 penalty')
    parser.add_argument('--check', default=50, type=int, help='Number of epochs between convergence checks')
    parser.add_argument('--minstop', default=0.2, type=float, help='Minimum fraction of steps before stopping')
    parser.add_argument('--maxconv', default=2, type=int, help='Number of true convergences that triggers a stop')
    parser.add_argument('--featrm', default='', type=str, help='Remove features')
    parser.add_argument('--optim', default='rms', type=str, help='Optimizer')
    parser.add_argument('--continuation', default=False, type=bool, help='Continue training')
    parser.add_argument('--pca', default=-1, type=int, help='Dimensions to discard')
    parser.add_argument('--sobol', default=-1, type=int,
                        help='Dimensions to reduce according to sensitivity analysis')
    opt = parser.parse_args()

    if opt.layer != "":
        layers = list(map(int, opt.layer.split(',')))
    else:
        layers = []
    opt_lr = list(map(float, opt.lr.split(',')))
    opt_epochs = list(map(int, opt.epoch.split(',')))

    if not os.path.exists(opt.output):
        os.mkdir(opt.output)

    logger = logging.getLogger('train')
    logger.setLevel(logging.INFO)
    flog = logging.FileHandler(opt.output + '/log.txt', mode='w')
    flog.setLevel(logging.INFO)
    formatter = logging.Formatter(fmt='[%(asctime)s] (%(levelname)s) %(message)s',
                                  datefmt='%Y-%m-%d %H:%M:%S')
    flog.setFormatter(formatter)
    clog = logging.StreamHandler()
    clog.setFormatter(formatter)
    logger.addHandler(flog)
    logger.addHandler(clog)

    if sys.platform == 'linux':
        logger.info('Use non-interactive Agg backend for matplotlib on linux')
        matplotlib.use('Agg')

    if opt.featrm == 'auto':
        logger.info('Automatically remove features')
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = list(map(int, opt.featrm.split(',')))
    logger.info('Remove Feature: %s' % featrm)

    logger.info('Reading data...')
    datax, datay, data_names = dataloader.load(filename=opt.input,
                                               target=opt.target,
                                               fps=opt.fp.split(','),
                                               featrm=featrm)

    # Store fingerprint identifier files alongside the model
    for fp in opt.fp.split(','):
        if os.path.exists(fp + '.idx') and Path(fp).parent.absolute() != Path(opt.output).absolute():
            shutil.copy(fp + '.idx', opt.output)

    logger.info('Selecting data...')
    selector = preprocessing.Selector(datax, datay, data_names)
    if opt.part:
        logger.info('Loading partition file %s' % opt.part)
        selector.load(opt.part)
    else:
        logger.warning('Partition file not found. Using auto-partition instead.')
        selector.partition(0.8, 0.1)
        selector.save(opt.output + '/part.txt')
    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()
    logger.info('Training size = %d, Validation size = %d' % (len(trainx), len(validx)))
    logger.info('X input example: (size=%d) %s' % (len(datax[0]), ','.join(map(str, datax[0]))))
    logger.info('Y input example: (size=%d) %s' % (len(datay[0]), ','.join(map(str, datay[0]))))

    logger.info('Normalizing...')
    scaler = preprocessing.Scaler()
    scaler.fit(trainx)
    scaler.save(opt.output + '/scale.txt')
    normed_trainx = scaler.transform(trainx)
    normed_validx = scaler.transform(validx)

    if opt.sobol != -1:
        with open(opt.output + '/sobol_idx.pkl', 'rb') as file:
            sobol_idx = pickle.load(file)
        normed_trainx, normed_validx = sobol_reduce(normed_trainx, normed_validx,
                                                    len(normed_trainx[0]) - 2 - opt.sobol, sobol_idx)
        logger.info('sobol SA reduced dimension: %d' % opt.sobol)

    if opt.pca != -1:
        normed_trainx, normed_validx, _ = pca_nd(normed_trainx, normed_validx,
                                                 len(normed_trainx[0]) - opt.pca, logger)
        logger.info('pca reduced dimension: %d' % opt.pca)

    logger.info('final input length: %d' % len(normed_trainx[0]))

    logger.info('Building network...')
    logger.info('Hidden layers = %r' % layers)
    logger.info('optimizer = %s' % opt.optim)
    logger.info('Learning rate = %s' % opt_lr)
    logger.info('Epochs = %s' % opt_epochs)
    logger.info('L2 penalty = %f' % opt.l2)
    logger.info('Batch size = %d' % opt.batch)

    validy_ = validy.copy()  # keep CPU copies for metric evaluation
    trainy_ = trainy.copy()

    if opt.gpu:  # store everything on the GPU all at once
        logger.info('Using GPU acceleration')
        device = torch.device('cuda:0')
        normed_trainx = torch.Tensor(normed_trainx).to(device)
        trainy = torch.Tensor(trainy).to(device)
        normed_validx = torch.Tensor(normed_validx).to(device)
        validy = torch.Tensor(validy).to(device)

    if opt.optim == 'sgd':
        optimizer = torch.optim.SGD
    elif opt.optim == 'adam':
        optimizer = torch.optim.Adam
    elif opt.optim == 'rms':
        optimizer = torch.optim.RMSprop
    elif opt.optim == 'ada':
        optimizer = torch.optim.Adagrad
    else:
        raise ValueError('Unknown optimizer: %s' % opt.optim)

    model = fitting.TorchMLPRegressor(len(normed_trainx[0]), len(trainy[0]), layers,
                                      batch_size=opt.batch, is_gpu=opt.gpu != 0,
                                      args_opt={'optimizer': optimizer,
                                                'lr': opt_lr[0],  # opt.lr is the raw comma-separated string
                                                'weight_decay': opt.l2})
    model.init_session()
    if opt.continuation:
        cpt = opt.output + '/model.pt'
        logger.info('Continue training from checkpoint %s' % cpt)
        model.load(cpt)
    logger.info('Optimizer = %s' % optimizer)

    header = 'Step Loss MeaSquE MeaSigE MeaUnsE MaxRelE Acc2% Acc5% Acc10%'.split()
    logger.info('%-8s %8s %8s %8s %8s %8s %8s %8s %8s' % tuple(header))

    mse_history = []
    converge_times = 0
    mse_min = None
    model_saved = False
    converged = False
    all_epoch = sum(opt_epochs)
    total_epoch = 0
    for k, each_epoch in enumerate(opt_epochs):
        # Staged training: reset the optimizer with the learning rate of this stage
        model.reset_optimizer({'optimizer': optimizer, 'lr': opt_lr[k], 'weight_decay': opt.l2})
        for i in range(each_epoch):
            total_epoch += 1
            loss = model.fit_epoch(normed_trainx, trainy)
            if total_epoch % opt.check == 0:
                predy = model.predict_batch(normed_validx)
                mse = metrics.mean_squared_error(validy_, predy)
                mse_history.append(mse)
                err_line = '%-8i %8.2e %8.2e %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f' % (
                    total_epoch,
                    loss.data.cpu().numpy() if model.is_gpu else loss.data.numpy(),
                    mse,
                    metrics.mean_signed_error(validy_, predy) * 100,
                    metrics.mean_unsigned_error(validy_, predy) * 100,
                    metrics.max_relative_error(validy_, predy) * 100,
                    metrics.accuracy(validy_, predy, 0.02) * 100,
                    metrics.accuracy(validy_, predy, 0.05) * 100,
                    metrics.accuracy(validy_, predy, 0.10) * 100)
                logger.info(err_line)

                if mse_min is None:
                    mse_min = mse
                elif mse < mse_min:
                    model.save(opt.output + '/model.pt')
                    model_saved = True
                    mse_min = mse

                if total_epoch > all_epoch * opt.minstop:
                    conv, cur_conv = validation.is_converge(np.array(mse_history), nskip=25)
                    if conv:
                        logger.info('Model converge detected at epoch %d' % total_epoch)
                        converge_times += 1
                    if converge_times >= opt.maxconv and cur_conv:
                        logger.info('Model converged at epoch: %d' % total_epoch)
                        converged = True
                        break
        if converged:  # stop the remaining learning-rate stages as well
            break

    if not converged:
        logger.warning('Model not converged')
    if not model_saved:
        model.save(opt.output + '/model.pt')

    visualizer = visualize.LinearVisualizer(trainy_.reshape(-1),
                                            model.predict_batch(normed_trainx).reshape(-1),
                                            trainname, 'training')
    visualizer.append(validy_.reshape(-1), model.predict_batch(normed_validx).reshape(-1),
                      validname, 'validation')
    visualizer.dump(opt.output + '/fit.txt')
    visualizer.dump_bad_molecules(opt.output + '/error-0.05.txt', 'validation', threshold=0.05)
    visualizer.dump_bad_molecules(opt.output + '/error-0.10.txt', 'validation', threshold=0.1)
    visualizer.dump_bad_molecules(opt.output + '/error-0.15.txt', 'validation', threshold=0.15)
    visualizer.dump_bad_molecules(opt.output + '/error-0.20.txt', 'validation', threshold=0.2)
    logger.info('Fitting result saved')

    if opt.visual:
        visualizer.scatter_yy(savefig=opt.output + '/error-train.png', annotate_threshold=0,
                              marker='x', lw=0.2, s=5)
        visualizer.hist_error(savefig=opt.output + '/error-hist.png', label='validation',
                              histtype='step', bins=50)
        plt.show()
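# Hedged example invocation (the script name train.py is an assumption; all flags come from the
# argparse definitions above). Comma-separated --epoch/--lr entries define staged training:
# each stage resets the optimizer with the next learning rate.
#   python train.py -i data/alkanes.txt -f out/fingerprint.fp -t raw_density -o out \
#       -l 16,16 --epoch 500,2000,2500 --lr 0.01,0.001,0.0001 --batch 1000 --optim rms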
# PCA scan script: train the same MLP repeatedly while sweeping the number of retained
# PCA dimensions and record accuracy/MSE for each setting.
def main():
    parser = argparse.ArgumentParser(description='Alkane property fitting demo')
    parser.add_argument('-i', '--input', type=str, help='Data')
    parser.add_argument('-f', '--fp', type=str, help='Fingerprints')
    parser.add_argument('-o', '--output', default='out', type=str, help='Output directory')
    parser.add_argument('-t', '--target', default='raw_density', type=str, help='Fitting target')
    parser.add_argument('-p', '--part', default='', type=str, help='Partition cache file')
    parser.add_argument('-l', '--layer', default='16,16', type=str, help='Size of hidden layers')
    parser.add_argument('--visual', default=1, type=int, help='Visualization data')
    parser.add_argument('--gpu', default=1, type=int, help='Using GPU')
    parser.add_argument('--epoch', default='200', type=str, help='Number of epochs')
    parser.add_argument('--step', default=500, type=int, help='Number of steps trained for each batch')
    parser.add_argument('--batch', default=int(1e9), type=int, help='Batch size')
    parser.add_argument('--lr', default='0.005', type=str, help='Initial learning rate')
    parser.add_argument('--l2', default=0.000, type=float, help='L2 penalty')
    parser.add_argument('--check', default=10, type=int,
                        help='Number of epochs between convergence checks. Set 0 to disable.')
    parser.add_argument('--minstop', default=0.2, type=float, help='Minimum fraction of steps before stopping')
    parser.add_argument('--maxconv', default=2, type=int, help='Number of true convergences that triggers a stop')
    parser.add_argument('--featrm', default='', type=str, help='Remove features')
    parser.add_argument('--optim', default='rms', type=str, help='Optimizer')
    parser.add_argument('--continuation', default=False, type=bool, help='Continue training')
    parser.add_argument('--pca', default=0, type=int, help='Dimensions to discard')
    parser.add_argument('--sobol', default=-1, type=int,
                        help='Dimensions to reduce according to sensitivity analysis')
    opt = parser.parse_args()

    if opt.layer != "":
        layers = list(map(int, opt.layer.split(',')))
    else:
        layers = []
    opt_lr = list(map(float, opt.lr.split(',')))
    opt_epochs = list(map(int, opt.epoch.split(',')))

    if not os.path.exists(opt.output):
        os.mkdir(opt.output)

    logger = logging.getLogger('train')
    logger.setLevel(logging.INFO)
    flog = logging.FileHandler(opt.output + '/log.txt', mode='w')
    flog.setLevel(logging.INFO)
    formatter = logging.Formatter(fmt='[%(asctime)s] (%(levelname)s) %(message)s',
                                  datefmt='%Y-%m-%d %H:%M:%S')
    flog.setFormatter(formatter)
    clog = logging.StreamHandler()
    clog.setFormatter(formatter)
    logger.addHandler(flog)
    logger.addHandler(clog)

    if opt.featrm == 'auto':
        logger.info('Automatically remove features')
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = list(map(int, opt.featrm.split(',')))
    logger.info('Remove Feature: %s' % featrm)

    logger.info('Reading data...')
    datax, datay, data_names = dataloader.load(filename=opt.input,
                                               target=opt.target,
                                               fps=opt.fp.split(','),
                                               featrm=featrm)

    logger.info('Selecting data...')
    selector = preprocessing.Selector(datax, datay, data_names)
    if opt.part:
        logger.info('Loading partition file %s' % opt.part)
        selector.load(opt.part)
    else:
        logger.warning('Partition file not found. Using auto-partition instead.')
        selector.partition(0.8, 0.1)
        selector.save(opt.output + '/part.txt')
    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()
    logger.info('Training size = %d, Validation size = %d' % (len(trainx), len(validx)))
    logger.info('X input example: (size=%d) %s' % (len(datax[0]), ','.join(map(str, datax[0]))))
    logger.info('Y input example: (size=%d) %s' % (len(datay[0]), ','.join(map(str, datay[0]))))

    logger.info('Normalizing...')
    scaler = preprocessing.Scaler()
    scaler.fit(trainx)
    scaler.save(opt.output + '/scale.txt')
    normed_trainx = scaler.transform(trainx)
    normed_validx = scaler.transform(validx)

    logger.info('Building network...')
    logger.info('Hidden layers = %r' % layers)
    logger.info('optimizer = %s' % opt.optim)
    logger.info('Initial learning rate = %f' % opt_lr[0])
    logger.info('L2 penalty = %f' % opt.l2)
    logger.info('Total %d epochs' % sum(opt_epochs))
    logger.info('Batch = (%d values x %d steps)' % (opt.batch, opt.step))

    if opt.optim == 'sgd':
        optimizer = torch.optim.SGD
    elif opt.optim == 'adam':
        optimizer = torch.optim.Adam
    elif opt.optim == 'rms':
        optimizer = torch.optim.RMSprop
    elif opt.optim == 'ada':
        optimizer = torch.optim.Adagrad
    else:
        raise ValueError('Unknown optimizer: %s' % opt.optim)

    # Scan PCA dimensions from 60 up to the full feature length in steps of 5
    result = []
    for i in range(60, len(trainx[0]), 5):
        logger.info('Start PCA training of dimension ' + str(i))
        pca_i_result = pca_train(i, normed_trainx, trainy, normed_validx, validy,
                                 opt, logger, layers, opt_lr, opt_epochs, optimizer)
        logger.info('PCA reduced result of dimension %d :' % i)
        logger.info('%.3f variance_explained,\t acc2: %.3f,\t MSE %.3f ' % pca_i_result)
        result.append(pca_i_result)
    logger.info(result)
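# Hedged sketch of the dimensionality-reduction step that pca_train()/pca_nd() presumably wrap
# (their real implementations are not shown here); sklearn.decomposition.PCA and the helper name
# pca_reduce are assumptions, not the repo's code.
from sklearn.decomposition import PCA

def pca_reduce(n_dims, trainx, validx):
    # Fit PCA on the (already normalized) training features only, then project both sets.
    pca = PCA(n_components=n_dims).fit(trainx)
    variance_explained = float(pca.explained_variance_ratio_.sum())
    return pca.transform(trainx), pca.transform(validx), variance_explained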
# Evaluation script: reload a trained MLP and its scaler, then report metrics and plots
# for the training, validation, test, and overall sets.
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dir', default='out', help='Model directory')
    parser.add_argument('-i', '--input', help='Data')
    parser.add_argument('-f', '--fp', help='Fingerprints')
    parser.add_argument('-t', '--target', help='Target property')
    parser.add_argument('-p', '--part', help='Partition file')
    parser.add_argument('--gpu', default=1, type=int, help='Using GPU')
    parser.add_argument('--visual', default=1, type=int, help='Visualization data')
    parser.add_argument('--visualx', default='', help='Extra visualization against selected x columns')
    parser.add_argument('--dump', default='', help='Output of fitting results')
    parser.add_argument('--featrm', default='', type=str, help='Remove features')
    opt = parser.parse_args()

    model = fitting.TorchMLPRegressor(None, None, [])
    model.is_gpu = opt.gpu == 1
    model.load(opt.dir + '/model.pt')
    scaler = preprocessing.Scaler()
    scaler.load(opt.dir + '/scale.txt')

    if opt.featrm == 'auto':
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = list(map(int, opt.featrm.split(',')))

    datax, datay, data_names = mdlearn.dataloader.load(filename=opt.input,
                                                       target=opt.target,
                                                       fps=opt.fp.split(','),
                                                       featrm=featrm)
    selector = preprocessing.Selector(datax, datay, data_names)
    selector.load(opt.part)
    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()
    testx, testy, testname = selector.test_set()

    normed_trainx = scaler.transform(trainx)
    normed_validx = scaler.transform(validx)
    normed_testx = scaler.transform(testx)

    trainy = trainy.flatten()
    validy = validy.flatten()
    testy = testy.flatten()
    trainy_est = model.predict_batch(normed_trainx).flatten()
    validy_est = model.predict_batch(normed_validx).flatten()
    testy_est = model.predict_batch(normed_testx).flatten()

    def evaluate_model(y, y_est):
        mse = metrics.mean_squared_error(y, y_est)
        ae = np.average(metrics.abs_absolute_error(y, y_est))
        ave_y = np.average(y)
        ave_y_est = np.average(y_est)
        bias = ave_y_est - ave_y

        eval_results = OrderedDict()
        eval_results['MSE'] = mse
        eval_results['RMSE'] = np.sqrt(mse)
        eval_results['AE'] = ae
        eval_results['Max AAE'] = metrics.max_absolute_error(y, y_est)
        eval_results['Bias'] = bias
        eval_results['RRMSE'] = np.sqrt(mse) / np.abs(ave_y)
        eval_results['MARE'] = ae / np.abs(ave_y)
        eval_results['Max ARE'] = metrics.max_relative_error(y, y_est)
        eval_results['RBias'] = bias / np.abs(ave_y)
        eval_results['Accuracy1%'] = metrics.accuracy(y, y_est, 0.01)
        eval_results['Accuracy2%'] = metrics.accuracy(y, y_est, 0.02)
        eval_results['Accuracy5%'] = metrics.accuracy(y, y_est, 0.05)
        eval_results['Accuracy10%'] = metrics.accuracy(y, y_est, 0.1)
        return eval_results

    results = []
    results.append(evaluate_model(trainy, trainy_est))
    results.append(evaluate_model(validy, validy_est))
    results.append(evaluate_model(testy, testy_est))
    results.append(evaluate_model(np.concatenate((trainy, validy, testy)),
                                  np.concatenate((trainy_est, validy_est, testy_est))))

    print('Dataset\t%s' % '\t'.join(results[0].keys()))
    fmt = lambda x: '%.3g' % x
    for name, result in zip(['Training', 'Validation', 'Test', 'Overall'], results):
        print('%s\t%s' % (name, '\t'.join([fmt(v) for v in result.values()])))

    visualizer = visualize.LinearVisualizer(trainy, trainy_est, trainname, 'training')
    visualizer.append(validy, validy_est, validname, 'validation')
    visualizer.append(testy, testy_est, testname, 'test')
    if opt.dump:
        visualizer.dump(opt.dump)
    if opt.visual:
        visualizer.scatter_yy(annotate_threshold=0.1, marker='x', lw=0.2, s=5, figure_name='Value')
        visualizer.scatter_error(annotate_threshold=0.1, marker='x', lw=0.2, s=5, figure_name='Error')
        visualizer.hist_error(label='test', histtype='step', bins=50, figure_name='Error Distribution')
    if opt.visualx:
        for i in map(int, opt.visualx.split(',')):
            visualizer2 = visualize.LinearVisualizer(trainx[:, i], trainy_est - trainy, trainname, 'training')
            visualizer2.append(validx[:, i], validy_est - validy, validname, 'validation')
            visualizer2.append(testx[:, i], testy_est - testy, testname, 'test')
            visualizer2.scatter_yy(ref=None, annotate_threshold=-1, marker='x', lw=0.2, s=5, figure_name=str(i))
    plt.show()
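# Hedged example invocation (the script name predict.py is an assumption; flags come from the
# argparse definitions above). It reloads model.pt and scale.txt from the training output
# directory and reports metrics on the training, validation, and test sets:
#   python predict.py -d out -i data/alkanes.txt -f out/fingerprint.fp \
#       -t raw_density -p out/part.txt --visual 1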