def train_val_executor(task_queue, result_queue):
    """Worker loop for parallel cross validation.

    Consumes task tuples (param_idx, model_name, dataset, params,
    fold_num, val_fold_id) from task_queue until a None sentinel is
    received.  For each task it trains a SOL model on every fold except
    val_fold_id (dt.pass_num passes), scores it on the validation fold,
    and pushes (param_idx, train_accu, val_accu) to result_queue.

    Parameters
    ----------
    task_queue: queue of task tuples, terminated by a None sentinel
    result_queue: queue receiving (param_idx, train_accu, val_accu)
    """
    while True:
        task = task_queue.get()
        # identity test for the shutdown sentinel ('is None', not '== None')
        if task is None:
            break
        param_idx, model_name, dt, params, fold_num, val_fold_id = task
        m = SOL(algo=model_name, class_num=dt.class_num, **params)
        for _ in range(dt.pass_num):
            for i in range(fold_num):
                if i == val_fold_id:
                    continue
                train_accu = m.fit(dt.split_path(i), dt.slice_type)
        # validation fold is scored once, after all training passes
        val_accu = m.score(dt.split_path(val_fold_id), dt.slice_type)
        logging.info(
            'Cross validation of %s on %s, Fold %d/%d:\n\t'
            'params: %s\n\t'
            'Training Accuracy: %f, Validation Accuracy: %f',
            model_name, dt.name, val_fold_id, fold_num, str(params),
            train_accu, val_accu)
        result_queue.put((param_idx, train_accu, val_accu))
    # re-queue the sentinel so sibling workers sharing the queue also exit
    task_queue.put(None)
def main():
    """Load a saved SOL model, score it on a dataset, and optionally
    dump per-sample predictions to args.output."""
    args = getargs()
    dt_name = osp.basename(args.input)
    dt = DataSet(dt_name, args.input, args.data_type)
    m = SOL(batch_size=args.batch_size, buf_size=args.buf_size)
    m.load(args.model)
    algo = m.name
    logging.info("testing algorithm %s ..." % (algo))
    start_time = time.time()
    if args.output is None:
        accu = m.score(dt.data_path, dt.dtype)
    else:
        # need the raw scores/predictions to write the result file
        scores, predicts, labels = m.decision_function(
            dt.data_path, dt.dtype, get_labels=True)
        accu = np.sum(predicts == labels, dtype=np.float64) / predicts.shape[0]
    test_time = time.time() - start_time
    logging.info("test accuracy of %s: %.4f" % (algo, accu))
    logging.info("test time of %s: %.4f sec" % (algo, test_time))
    if args.output is not None:
        logging.info("write prediction results to %s" % (args.output))
        with open(args.output, 'w') as fh:
            if m.n_classes == 2:
                # binary: one scalar score per sample
                for i in range(scores.shape[0]):
                    fh.write('%d\t%d\t%f\n' % (int(labels[i]),
                                               int(predicts[i]), scores[i]))
            else:
                # multiclass: one score per class, tab separated
                for i in range(scores.shape[0]):
                    fh.write('%d\t%d\t%s\n' % (
                        int(labels[i]), int(predicts[i]),
                        '\t'.join([str(v) for v in scores[i, :]])))
def train_val_executor(task_queue, result_queue):
    """Cross-validation worker: process tasks until a None sentinel.

    A task is the tuple (param_idx, model_name, dataset, params,
    fold_num, val_fold_id).  The worker trains on all folds except the
    validation one, scores on the validation fold, and reports
    (param_idx, train_accu, val_accu) on result_queue.  On shutdown the
    sentinel is pushed back so other workers terminate too.
    """
    while True:
        task = task_queue.get()
        # 'is None' for the sentinel instead of the '== None' comparison
        if task is None:
            break
        param_idx, model_name, dt, params, fold_num, val_fold_id = task
        m = SOL(algo=model_name, class_num=dt.class_num, **params)
        for _ in range(dt.pass_num):
            for fold in range(fold_num):
                if fold == val_fold_id:
                    continue
                train_accu = m.fit(dt.split_path(fold), dt.slice_type)
        # score the held-out fold after training completes
        val_accu = m.score(dt.split_path(val_fold_id), dt.slice_type)
        logging.info(
            'Cross validation of %s on %s, Fold %d/%d:\n\t'
            'params: %s\n\t'
            'Training Accuracy: %f, Validation Accuracy: %f',
            model_name, dt.name, val_fold_id, fold_num, str(params),
            train_accu, val_accu)
        result_queue.put((param_idx, train_accu, val_accu))
    # propagate the sentinel to any sibling executors
    task_queue.put(None)
def run_sol(dtrain, dtest, opts):
    """Train and test a SOL model over a range of l1 regularization values.

    Parameters
    ----------
    dtrain, dtest: DataSet
        training and test datasets
    opts: dict
        must contain 'algo' and 'lambda' (iterable of l1 values);
        optional 'params' (list of 'k=v' strings) and 'cv' (load
        previously saved cross-validation results).

    Return
    ------
    (np.array of model sparsities, np.array of test accuracies)
    """
    logging.info('run sol: %s' % (opts['algo']))
    # fix: parameters are named dtrain/dtest; the original referenced
    # undefined names dt_train/dt_test (NameError on these paths)
    if opts['algo'] == 'liblinear':
        return liblinear.run(dtrain, dtest, opts)
    elif opts['algo'] == 'vw':
        return vw.run(dtrain, dtest, opts)
    model_params = []
    if 'params' in opts:
        model_params = [item.split('=') for item in opts['params']]
    if 'cv' in opts:
        # fix: the results file is written as cv-<algo>.txt elsewhere;
        # the original formatted opts['cv'] (the parameter list) into the name
        cv_output_path = osp.join(dtrain.work_dir,
                                  'cv-%s.txt' % (opts['algo']))
        if osp.exists(cv_output_path):
            best_params = CV.load_results(cv_output_path)
        else:
            raise Exception('%s does not exist!' % (cv_output_path))
        logging.info('cross validation parameters: %s' % (str(best_params)))
        for k, v in best_params:
            model_params.append([k, v])
    model_params = dict(model_params)
    sparsity_list = []
    test_accu_list = []
    for l1 in opts['lambda']:
        model_params['lambda'] = l1
        m = SOL(algo=opts['algo'], class_num=dtrain.class_num, **model_params)
        logging.info("train %s on %s with l1=%f ..." %
                     (opts['algo'], dtrain.name, l1))
        start_time = time.time()
        train_accu = m.fit(dtrain.rand_path('bin'), 'bin')
        end_time = time.time()
        sparsity_list.append(m.sparsity)
        logging.info("training accuracy: %.4f" % (train_accu))
        logging.info("training time: %.4f seconds" % (end_time - start_time))
        # fix: sparsity is a ratio, not a duration ('seconds' was a typo)
        logging.info("model sparsity: %.4f" % (m.sparsity))
        logging.info("test %s on %s with l1=%f ..." %
                     (opts['algo'], dtrain.name, l1))
        start_time = time.time()
        test_accu = m.score(dtest.rand_path('bin'), 'bin')
        end_time = time.time()
        logging.info("test accuracy: %.4f" % (test_accu))
        logging.info("test time: %.4f seconds" % (end_time - start_time))
        test_accu_list.append(test_accu)
    return np.array(sparsity_list), np.array(test_accu_list)
def finetune(dt, model_path, model_params=None, output_path=None):
    """Finetune from an existing model

    Parameter
    --------
    dt: DataSet
        the dataset used to train the model
    model_path: str
        path to existing model
    model_params: dict{param, val}
        model parameters (left unmodified; a copy is taken)
    output_path: str
        path to save the model

    Return
    ------
    tuple(train accuracy, train time, model)
    """
    logging.info("finetune model from %s ..." % (model_path))
    logging.info("parameter settings: %s" % (model_params))
    # fix: the original used a mutable default argument ({}) and then
    # deleted keys from it in place, mutating the caller's dict and the
    # shared default; work on a copy instead
    model_params = dict(model_params) if model_params else {}
    init_params = {}
    # these keys configure the SOL constructor, not set_params
    for key in ('batch_size', 'buf_size', 'verbose'):
        if key in model_params:
            init_params[key] = model_params.pop(key)
    m = SOL(**init_params)
    m.load(model_path)
    algo = m.name
    m.set_params(**model_params)
    start_time = time.time()
    train_accu = m.fit(dt.data_path, dt.dtype, dt.pass_num)
    train_time = time.time() - start_time
    logging.info("training accuracy of %s: %.4f" % (algo, train_accu))
    logging.info("training time of %s: %.4f sec" % (algo, train_time))
    if output_path is not None:
        logging.info("save model of %s to %s" % (algo, output_path))
        m.save(output_path)
    return train_accu, train_time, m
def main():
    """Train a SOL model from command-line arguments, optionally doing
    cross validation first, and save the model if requested."""
    args = getargs()
    try:
        dt_name = os.path.basename(args.input)
        dt = DataSet(dt_name, args.input, args.data_type)
        model_params = [('verbose', args.verbose)]
        if args.params is not None:
            # fix: the original read args.cv here (copy-paste bug) and
            # replaced the list, silently dropping the verbose setting;
            # extend with the user-supplied 'k=v' pairs instead
            model_params.extend(item.split('=') for item in args.params)
        if args.cv is not None:
            cv_output_path = os.path.join(dt.work_dir,
                                          'cv-%s.txt' % (args.algo))
            if os.path.exists(cv_output_path) and not args.retrain:
                best_params = CV.load_results(cv_output_path)
            else:
                # cross validation
                cv_params = [item.split('=') for item in args.cv]
                cv = CV(dt, args.fold_num, cv_params, model_params)
                cv.train_val(args.algo)
                best_params = cv.get_best_param()[0]
                cv.save_results(cv_output_path)
            logging.info('cross validation parameters: %s' %
                         (str(best_params)))
            for k, v in best_params:
                model_params.append((k, v))
        model_params = dict(model_params)
        start_time = time.time()
        m = SOL(args.algo, dt.class_num, batch_size=args.batch_size,
                buf_size=args.buf_size, **model_params)
        logging.info("learn model with %s algorithm..." % (args.algo))
        accu = m.fit(dt.data_path, dt.dtype, args.passes)
        logging.info("training accuracy of %s: %.4f" % (args.algo, accu))
        logging.info("training time of %s: %.4f seconds" %
                     (args.algo, time.time() - start_time))
        if args.output is not None:
            logging.info("save model of %s to %s" % (args.algo, args.output))
            m.save(args.output)
    except Exception as err:
        # str(err) instead of err.message (deprecated / removed)
        print('train failed: %s' % str(err))
def main():
    """Load a saved SOL model and evaluate it on a dataset, optionally
    writing per-sample predictions; failures are reported, not raised."""
    args = getargs()
    try:
        dt_name = os.path.basename(args.input)
        dt = DataSet(dt_name, args.input, args.data_type)
        # timing intentionally includes model loading (as in the original)
        start_time = time.time()
        m = SOL(batch_size=args.batch_size, buf_size=args.buf_size)
        m.load(args.model)
        algo = m.name
        logging.info("testing algorithm %s ..." % (algo))
        if args.output is None:
            accu = m.score(dt.data_path, dt.dtype)
        else:
            scores, predicts, labels = m.decision_function(
                dt.data_path, dt.dtype, get_labels=True)
            accu = np.sum(predicts == labels,
                          dtype=np.float64) / predicts.shape[0]
        logging.info("test accuracy of %s: %.4f" % (algo, accu))
        logging.info("test time of %s: %.4f seconds" %
                     (algo, time.time() - start_time))
        if args.output is not None:
            logging.info("write prediction results to %s" % (args.output))
            with open(args.output, 'w') as fh:
                if m.n_classes == 2:
                    # binary: single decision score per sample
                    for i in range(scores.shape[0]):
                        fh.write('%d\t%d\t%f\n' % (int(labels[i]),
                                                   int(predicts[i]),
                                                   scores[i]))
                else:
                    # multiclass: per-class scores joined with tabs
                    for i in range(scores.shape[0]):
                        fh.write('%d\t%d\t%s\n' % (
                            int(labels[i]), int(predicts[i]),
                            '\t'.join([str(v) for v in scores[i, :]])))
    except Exception as err:
        # str(err) instead of err.message (deprecated / removed)
        print('test failed %s' % str(err))
def __train_val_one_fold(self, model_name, val_fold_id):
    """ cross validation on one fold of data

    Parameters:
        model_name: string
            name of the model to be tuned
        val_fold_id: int
            fold id that is used as val data
    Return:
        tuple(list of train accuracies, list of validation accuracies),
        one entry per point of the parameter search space
    """
    train_accu_list = []
    val_accu_list = []
    # one model per candidate parameter setting
    for k in range(self.search_space.size):
        # fix: build a fresh dict instead of appending to the list
        # returned by get_param — the original mutated that list in
        # place, which accumulates extra_param entries across folds if
        # the search space returns a cached list
        # (assumes get_param yields (key, value) pairs; the original
        # called dict() on it too)
        params = dict(list(self.search_space.get_param(k)) +
                      list(self.extra_param))
        m = SOL(algo=model_name, class_num=self.dataset.class_num, **params)
        train_paths = [self.dataset.split_path(i)
                       for i in range(self.fold_num) if i != val_fold_id]
        for train_path in train_paths:
            train_accu = m.fit(train_path, self.dataset.slice_type,
                               self.dataset.pass_num)
        # score the held-out fold after training on all other folds
        val_accu = m.score(self.dataset.split_path(val_fold_id),
                           self.dataset.slice_type)
        print('Results of Cross Validation on Model %s with Data %s: Fold %d/%d' % (
            model_name, self.dataset.name, val_fold_id, self.fold_num))
        print('\tParameter Setting: %s' % (str(params)))
        print('\tTraining Accuracy: %f' % (train_accu))
        print('\tValidation Accuracy: %f' % (val_accu))
        train_accu_list.append(train_accu)
        val_accu_list.append(val_accu)
    return train_accu_list, val_accu_list
def train(dt, model_name, model_params=None, output_path=None, fold_num=5,
          cv_params=None, retrain=False, cv_process_num=1):
    """ train a SOL model

    Parameter
    ---------
    dt: DataSet
        the dataset used to train the model
    model_name: str
        name of the algorithm to use
    model_params: dict{param, val}
        model parameters (left unmodified; a copy is taken)
    output_path: str
        path to save the model
    fold_num: int
        number of folds to do cross validation
    cv_params: dict{param, range}
        cross validation parameters
    cv_process_num: int
        number of processes to do cross validation
    retrain: bool
        whether to re-do the cross validation

    Return
    ------
    tuple(train accuracy, train time, model)
    """
    # fix: the original used a mutable default argument ({}) and deleted
    # keys from it in place, mutating the caller's dict and the shared
    # default; work on a copy instead
    model_params = dict(model_params) if model_params else {}
    if cv_params is not None:
        cv_output_path = osp.join(dt.work_dir, 'cv-%s.txt' % (model_name))
        if osp.exists(cv_output_path) and not retrain:
            best_params = CV.load_results(cv_output_path)
        else:
            # cross validation: temporarily strip B/lambda so they are
            # not passed to the CV models, then restore them afterwards
            param_B = model_params.pop('B', None)
            param_lambda = model_params.pop('lambda', None)
            cv = CV(dt, fold_num, cv_params, model_params,
                    process_num=cv_process_num)
            cv.train_val(model_name)
            best_params = cv.get_best_param()[0]
            cv.save_results(cv_output_path)
            if param_B is not None:
                model_params['B'] = param_B
            if param_lambda is not None:
                model_params['lambda'] = param_lambda
        logging.info('cross validation results: %s' % (str(best_params)))
        model_params.update(best_params)
    logging.info("learn model with %s algorithm on %s ..."
                 % (model_name, dt.name))
    logging.info("parameter settings: %s" % (model_params))
    start_time = time.time()
    m = SOL(model_name, dt.class_num, **model_params)
    train_accu = m.fit(dt.data_path, dt.dtype, dt.pass_num)
    train_time = time.time() - start_time
    logging.info("training accuracy of %s: %.4f" % (model_name, train_accu))
    logging.info("training time of %s: %.4f sec" % (model_name, train_time))
    if output_path is not None:
        logging.info("save model of %s to %s" % (model_name, output_path))
        m.save(output_path)
    return train_accu, train_time, m
def run_ol(dtrain, dtest, opts, retrain=False, fold_num=5):
    """Train and test an online-learning algorithm, recording the
    learning curve.

    Parameters
    ----------
    dtrain, dtest: DataSet
        training and test datasets
    opts: dict
        must contain 'algo'; optional 'params' (list of 'k=v' strings)
        and 'cv' (list of 'k=range' strings for cross validation)
    retrain: bool
        whether to redo cross validation even if saved results exist
    fold_num: int
        number of cross-validation folds

    Return
    ------
    (train_accu, train_time, test_accu, test_time, np.array(train_log))
    """
    logging.info('run ol: %s' % (opts['algo']))
    # fix: parameters are named dtrain/dtest; the original referenced
    # undefined names dt_train/dt_test (NameError on these paths)
    if opts['algo'] == 'liblinear':
        return liblinear.run(dtrain, dtest, opts, retrain, fold_num)
    elif opts['algo'] == 'vw':
        return vw.run(dtrain, dtest, opts, retrain, fold_num)
    model_params = []
    if 'params' in opts:
        model_params = [item.split('=') for item in opts['params']]
    if 'cv' in opts:
        cv_output_path = osp.join(dtrain.work_dir,
                                  'cv-%s.txt' % (opts['algo']))
        if osp.exists(cv_output_path) and not retrain:
            best_params = CV.load_results(cv_output_path)
        else:
            # cross validation
            logging.info("cross validation on dataset %s with parameters %s" %
                         (dtrain.name, str(opts['cv'])))
            cv_params = [item.split('=') for item in opts['cv']]
            cv = CV(dtrain, fold_num, cv_params, model_params)
            cv.train_val(opts['algo'])
            best_params = cv.get_best_param()[0]
            cv.save_results(cv_output_path)
        logging.info('cross validation parameters: %s' % (str(best_params)))
        for k, v in best_params:
            model_params.append([k, v])
    model_params = dict(model_params)
    m = SOL(algo=opts['algo'], class_num=dtrain.class_num, **model_params)
    # record update number and error rate during training
    train_log = []

    def record_training_process(data_num, iter_num, update_num, err_rate):
        # closure over train_log; the original's stat=train_log default
        # argument was unused and has been dropped
        train_log.append([data_num, iter_num, update_num, err_rate])

    m.inspect_learning(record_training_process)
    logging.info("train %s on %s..." % (opts['algo'], dtrain.name))
    start_time = time.time()
    train_accu = m.fit(dtrain.rand_path(), dtrain.dtype)
    end_time = time.time()
    train_time = end_time - start_time
    logging.info("training accuracy: %.4f" % (train_accu))
    logging.info("training time: %.4f seconds" % (train_time))
    # fix: scoring uses dtest, so log dtest.name (the original printed
    # dtrain.name here)
    logging.info("test %s on %s..." % (opts['algo'], dtest.name))
    start_time = time.time()
    test_accu = m.score(dtest.data_path, dtest.dtype)
    end_time = time.time()
    test_time = end_time - start_time
    logging.info("test accuracy: %.4f" % (test_accu))
    logging.info("test time: %.4f seconds" % (test_time))
    return train_accu, train_time, test_accu, test_time, np.array(train_log)
def run_ol(dtrain, dtest, algo, opts, fold_num=5, cv_process_num=1):
    """ Run Online Learning Algorithm

    Parameter
    ---------
    dtrain: DataSet
        training dataset
    dtest: DataSet
        test dataset
    algo: str
        name of the algorithm to use
    opts: dict
        options to train the model (left unmodified; 'params' is copied)
    fold_num: int
        number of folds to do cross validation
    cv_process_num: int
        number of processes to do cross validaton

    Return
    ------
    (test_accu, test_time, train_accu, train_time, np.array(train_log))
    """
    logging.info('run ol: %s', algo)
    # fix: copy so opts['params'] is never mutated by update() below
    model_params = dict(opts['params']) if 'params' in opts else {}
    cv_params = opts['cv'] if 'cv' in opts else None
    if algo == 'liblinear':
        params = model_params.copy()
        # fix: cv_params may be None and dict.update(None) raises
        # TypeError; only merge when cross-validation params were given
        if cv_params:
            params.update(cv_params)
        return liblinear.train_test_l2(dtrain, dtest,
                                       fold_num=fold_num, **params)
    elif algo == 'vw':
        return vw.train_test(dtrain, dtest, model_params=model_params,
                             cv_params=cv_params, fold_num=fold_num,
                             cv_process_num=cv_process_num)
    # cross validation
    if cv_params is not None:
        cv_output_path = osp.join(dtrain.work_dir, 'cv-%s.txt' % (algo))
        if osp.exists(cv_output_path):
            best_params = CV.load_results(cv_output_path)
        else:
            cv_ = CV(dtrain, fold_num, cv_params, model_params,
                     process_num=cv_process_num)
            cv_.train_val(algo)
            best_params = cv_.get_best_param()[0]
            cv_.save_results(cv_output_path)
        logging.info('cross validation results: %s', str(best_params))
        model_params.update(best_params)
    logging.info("learn model with %s algorithm on %s ...", algo, dtrain.name)
    logging.info("parameter settings: %s", model_params)
    model = SOL(algo, dtrain.class_num, **model_params)
    # record update number and learning rate
    train_log = []

    def record_training_process(data_num, iter_num, update_num, err_rate):
        """closure logging function"""
        train_log.append([data_num, iter_num, update_num, err_rate])

    model.inspect_learning(record_training_process)
    # training
    start_time = time.time()
    train_accu = model.fit(dtrain.rand_path(), dtrain.dtype)
    end_time = time.time()
    train_time = end_time - start_time
    logging.info("training accuracy: %.4f", train_accu)
    logging.info("training time: %.4f seconds", train_time)
    # test
    logging.info("test %s on %s...", algo, dtest.name)
    start_time = time.time()
    test_accu = model.score(dtest.data_path, dtest.dtype)
    end_time = time.time()
    test_time = end_time - start_time
    logging.info("test accuracy: %.4f", test_accu)
    logging.info("test time: %.4f seconds", test_time)
    return test_accu, test_time, train_accu, train_time, np.array(train_log)