Example #1
0
File: cv.py Project: LIBOL/LIBSOL
def train_val_executor(task_queue,
                      result_queue):
    """Worker loop for parallel cross validation.

    Pulls task tuples ``(param_idx, model_name, dt, params, fold_num,
    val_fold_id)`` from ``task_queue``, trains a model on every fold except
    the validation fold, and pushes ``(param_idx, train_accu, val_accu)``
    onto ``result_queue``.  A ``None`` task is the shutdown sentinel; it is
    re-queued on exit so sibling workers also terminate.
    """
    while True:
        task = task_queue.get()
        if task is None:  # sentinel: stop this worker (was '== None')
            break
        # unpack the task in one step instead of indexing field by field
        param_idx, model_name, dt, params, fold_num, val_fold_id = task
        m = SOL(algo=model_name, class_num=dt.class_num, **params)

        # defensive default: with fold_num == 1 every fold is skipped and
        # train_accu would otherwise be unbound below
        train_accu = 0.0
        for p in xrange(dt.pass_num):
            for i in xrange(fold_num):
                if i == val_fold_id:
                    continue
                train_accu = m.fit(dt.split_path(i), dt.slice_type)
        val_accu = m.score(dt.split_path(val_fold_id), dt.slice_type)

        logging.info('Cross validation of %s on %s, Fold %d/%d: \n\t\
                     params: %s\n\t\
                     Training Accuracy: %f, Validation Accuracy: %f',
                     model_name, dt.name, val_fold_id, fold_num,
                     str(params), train_accu, val_accu)

        result_queue.put((param_idx, train_accu, val_accu))
    task_queue.put(None)  # propagate the sentinel to the next worker
Example #2
0
def main():
    """Load a saved SOL model and evaluate it on a dataset.

    When ``args.output`` is given, per-sample decision scores and
    predictions are also written to that path; otherwise only the test
    accuracy is computed via ``m.score``.
    """
    args = getargs()
    dt_name = osp.basename(args.input)
    dt = DataSet(dt_name, args.input, args.data_type)

    m = SOL(batch_size=args.batch_size, buf_size=args.buf_size)
    m.load(args.model)

    algo = m.name
    logging.info("testing algorithm %s ..." % (algo))
    start_time = time.time()
    if args.output is None:  # identity comparison (was '== None')
        accu = m.score(dt.data_path, dt.dtype)
    else:
        scores, predicts, labels = m.decision_function(
            dt.data_path, dt.dtype, get_labels=True)
        accu = np.sum(predicts == labels, dtype=np.float64) / predicts.shape[0]
    test_time = time.time() - start_time

    logging.info("test accuracy of %s: %.4f" % (algo, accu))
    logging.info("test time of %s: %.4f sec" % (algo, test_time))

    if args.output is not None:
        logging.info("write prediction results to %s" %(args.output))
        with open(args.output, 'w') as fh:
            if m.n_classes == 2:
                # binary: one scalar decision score per sample
                for i in xrange(scores.shape[0]):
                    fh.write('%d\t%d\t%f\n' %(int(labels[i]), int(predicts[i]), scores[i]))
            else:
                # multiclass: one score per class, tab separated
                for i in xrange(scores.shape[0]):
                    fh.write('%d\t%d\t%s\n' %(int(labels[i]), int(predicts[i]), '\t'.join([str(v) for v in scores[i,:]])))
Example #3
0
def train_val_executor(task_queue, result_queue):
    """Worker loop for parallel cross validation.

    Consumes task tuples ``(param_idx, model_name, dt, params, fold_num,
    val_fold_id)`` from ``task_queue`` and produces
    ``(param_idx, train_accu, val_accu)`` on ``result_queue``.
    ``None`` is the shutdown sentinel and is re-queued on exit so other
    workers sharing the queue terminate too.
    """
    while True:
        task = task_queue.get()
        if task is None:  # sentinel check by identity (was '== None')
            break
        # single-step tuple unpacking instead of positional indexing
        param_idx, model_name, dt, params, fold_num, val_fold_id = task
        m = SOL(algo=model_name, class_num=dt.class_num, **params)

        # defensive default: with fold_num == 1 the inner loop never runs
        # and train_accu would be unbound in the logging call below
        train_accu = 0.0
        for p in xrange(dt.pass_num):
            for i in xrange(fold_num):
                if i == val_fold_id:
                    continue
                train_accu = m.fit(dt.split_path(i), dt.slice_type)
        val_accu = m.score(dt.split_path(val_fold_id), dt.slice_type)

        logging.info(
            'Cross validation of %s on %s, Fold %d/%d: \n\t\
                     params: %s\n\t\
                     Training Accuracy: %f, Validation Accuracy: %f',
            model_name, dt.name, val_fold_id, fold_num, str(params),
            train_accu, val_accu)

        result_queue.put((param_idx, train_accu, val_accu))
    task_queue.put(None)  # propagate sentinel to the next worker
Example #4
0
def run_sol(dtrain, dtest, opts):
    """Train/test a SOL sparse model over a range of l1 regularizers.

    Parameter
    ---------
    dtrain: DataSet
        training dataset
    dtest: DataSet
        test dataset
    opts: dict
        must contain 'algo' and 'lambda' (iterable of l1 values); may
        contain 'params' (list of 'k=v' strings) and 'cv' (algorithm name
        whose saved cross-validation results to load)

    Return
    ------
    tuple(np.ndarray of model sparsity, np.ndarray of test accuracy)
    """
    logging.info('run sol: %s' % (opts['algo']))
    # bug fix: the parameters are dtrain/dtest — the original referenced
    # undefined names dt_train/dt_test (NameError on these branches)
    if opts['algo'] == 'liblinear':
        return liblinear.run(dtrain, dtest, opts)
    elif opts['algo'] == 'vw':
        return vw.run(dtrain, dtest, opts)

    model_params = []
    if 'params' in opts:
        model_params = [item.split('=') for item in opts['params']]

    if 'cv' in opts:
        cv_output_path = osp.join(dtrain.work_dir, 'cv-%s.txt' % (opts['cv']))
        if osp.exists(cv_output_path):
            best_params = CV.load_results(cv_output_path)
        else:
            raise Exception('%s does not exist!' % (cv_output_path))

        logging.info('cross validation parameters: %s' % (str(best_params)))
        for k, v in best_params:
            model_params.append([k, v])

    model_params = dict(model_params)
    sparsity_list = []
    test_accu_list = []
    for l1 in opts['lambda']:
        model_params['lambda'] = l1
        m = SOL(algo=opts['algo'], class_num=dtrain.class_num, **model_params)

        logging.info("train %s on %s with l1=%f ..." %
                     (opts['algo'], dtrain.name, l1))

        start_time = time.time()
        train_accu = m.fit(dtrain.rand_path('bin'), 'bin')
        end_time = time.time()

        sparsity_list.append(m.sparsity)

        logging.info("training accuracy: %.4f" % (train_accu))
        logging.info("training time: %.4f seconds" % (end_time - start_time))
        # bug fix: sparsity is a ratio, not a duration — dropped "seconds"
        logging.info("model sparsity: %.4f" % (m.sparsity))

        logging.info("test %s on %s with l1=%f ..." %
                     (opts['algo'], dtrain.name, l1))
        start_time = time.time()
        test_accu = m.score(dtest.rand_path('bin'), 'bin')
        end_time = time.time()

        logging.info("test accuracy: %.4f" % (test_accu))
        logging.info("test time: %.4f seconds" % (end_time - start_time))

        test_accu_list.append(test_accu)

    return np.array(sparsity_list), np.array(test_accu_list)
Example #5
0
def finetune(dt, model_path,
             model_params=None,
             output_path=None):
    """Finetune from an existing model

    Parameter
    --------
    dt: DataSet
        the dataset used to train the model
    model_path: str
        path to existing model
    model_params: dict{param, val}
        model parameters; 'batch_size', 'buf_size' and 'verbose' are
        consumed by the SOL constructor, the rest go to set_params
    output_path: str
        path to save the model

    Return
    ------
    tuple(train accuracy, train time, model)
    """
    # bug fix: the original used a mutable default argument ({}) and then
    # deleted keys from it in place, corrupting the shared default and the
    # caller's dict; work on a private copy instead
    model_params = dict(model_params) if model_params else {}

    logging.info("finetune model from %s ..." % (model_path))
    logging.info("parameter settings: %s" % (model_params))

    # split constructor-only parameters out of the generic params
    init_params = {}
    for key in ('batch_size', 'buf_size', 'verbose'):
        if key in model_params:
            init_params[key] = model_params.pop(key)

    m = SOL(**init_params)
    m.load(model_path)
    algo = m.name
    m.set_params(**model_params)

    start_time = time.time()
    train_accu = m.fit(dt.data_path, dt.dtype, dt.pass_num)
    train_time = time.time() - start_time

    logging.info("training accuracy of %s: %.4f" % (algo, train_accu))
    logging.info("training time of %s: %.4f sec" % (algo, train_time))

    if output_path is not None:  # identity comparison (was '!= None')
        logging.info("save model of %s to %s" % (algo, output_path))
        m.save(output_path)

    return train_accu, train_time, m
Example #6
0
def main():
    args = getargs()
    try:
        dt_name = os.path.basename(args.input)
        dt = DataSet(dt_name, args.input, args.data_type)
        model_params = [('verbose', args.verbose)]
        if args.params != None:
            model_params = [item.split('=') for item in args.cv]

        if args.cv != None:
            cv_output_path = os.path.join(dt.work_dir,
                                          'cv-%s.txt' % (args.algo))
            if os.path.exists(cv_output_path) and args.retrain == False:
                best_params = CV.load_results(cv_output_path)
            else:
                #cross validation
                cv_params = [item.split('=') for item in args.cv]
                cv = CV(dt, args.fold_num, cv_params, model_params)
                cv.train_val(args.algo)
                best_params = cv.get_best_param()[0]
                cv.save_results(cv_output_path)
            logging.info('cross validation parameters: %s' %
                         (str(best_params)))
            for k, v in best_params:
                model_params.append((k, v))

        model_params = dict(model_params)

        start_time = time.time()
        m = SOL(args.algo,
                dt.class_num,
                batch_size=args.batch_size,
                buf_size=args.buf_size,
                **model_params)
        logging.info("learn model with %s algorithm..." % (args.algo))
        accu = m.fit(dt.data_path, dt.dtype, args.passes)
        logging.info("training accuracy of %s: %.4f" % (args.algo, accu))
        logging.info("training time of %s: %.4f seconds" %
                     (args.algo, time.time() - start_time))

        if args.output != None:
            logging.info("save model of %s to %s" % (args.algo, args.output))
            m.save(args.output)

    except Exception as err:
        print 'train failed: %s' % (err.message)
Example #7
0
def finetune(dt, model_path,
             model_params=None,
             output_path=None):
    """Finetune from an existing model

    Parameter
    --------
    dt: DataSet
        the dataset used to train the model
    model_path: str
        path to existing model
    model_params: dict{param, val}
        model parameters; 'batch_size', 'buf_size' and 'verbose' go to the
        SOL constructor, the rest to set_params
    output_path: str
        path to save the model

    Return
    ------
    tuple(train accuracy, train time, model)
    """
    # bug fix: the original used a mutable default argument ({}) and then
    # deleted keys from it in place, mutating the shared default and the
    # caller's dict; operate on a private copy instead
    model_params = dict(model_params) if model_params else {}

    logging.info("finetune model from %s ..." % (model_path))
    logging.info("parameter settings: %s" % (model_params))

    # constructor-only parameters are separated from the generic ones
    init_params = {}
    for key in ('batch_size', 'buf_size', 'verbose'):
        if key in model_params:
            init_params[key] = model_params.pop(key)

    m = SOL(**init_params)
    m.load(model_path)
    algo = m.name
    m.set_params(**model_params)

    start_time = time.time()
    train_accu = m.fit(dt.data_path, dt.dtype, dt.pass_num)
    train_time = time.time() - start_time

    logging.info("training accuracy of %s: %.4f" % (algo, train_accu))
    logging.info("training time of %s: %.4f sec" % (algo, train_time))

    if output_path is not None:  # identity comparison (was '!= None')
        logging.info("save model of %s to %s" % (algo, output_path))
        m.save(output_path)

    return train_accu, train_time, m
Example #8
0
def main():
    args = getargs()

    try:
        dt_name = os.path.basename(args.input)
        dt = DataSet(dt_name, args.input, args.data_type)

        start_time = time.time()
        m = SOL(batch_size=args.batch_size, buf_size=args.buf_size)
        m.load(args.model)

        algo = m.name
        logging.info("testing algorithm %s ..." % (algo))
        if args.output == None:
            accu = m.score(dt.data_path, dt.dtype)
        else:
            scores, predicts, labels = m.decision_function(dt.data_path, dt.dtype,get_labels=True)
            accu = np.sum(predicts == labels, dtype=np.float64) / predicts.shape[0]
        logging.info("test accuracy of %s: %.4f" % (algo, accu))
        logging.info("test time of %s: %.4f seconds" %
                     (algo, time.time() - start_time))

        if args.output != None:
            logging.info("write prediction results to %s" %(args.output))
            with open(args.output, 'w') as fh:
                if m.n_classes == 2:
                    for i in xrange(scores.shape[0]):
                        fh.write('%d\t%d\t%f\n' %(int(labels[i]), int(predicts[i]), scores[i]))
                else:
                    for i in xrange(scores.shape[0]):
                        fh.write('%d\t%d\t%s\n' %(int(labels[i]), int(predicts[i]), '\t'.join([str(v) for v in scores[i,:]])))
    except Exception as err:
        print 'test failed %s' % (err.message)
Example #9
0
    def __train_val_one_fold(self, model_name, val_fold_id):
        """ cross validation on one fold of data
        Parameters:
        model_name: string
            name of the model to be tuned
        val_fold_id: int
            fold id that is used as val data
        Return:
            list of (train accuracy, validation accuracy)
        """
        train_accu_list = []
        val_accu_list = []
        #parameters
        for k in range(0, self.search_space.size):
            params = self.search_space.get_param(k)
            for param in self.extra_param:
                params.append(param)
            params = dict(params)
            m = SOL(algo=model_name,
                    class_num=self.dataset.class_num,
                    **params)

            for train_path in [
                    self.dataset.split_path(i) for i in xrange(self.fold_num)
                    if i != val_fold_id
            ]:
                train_accu = m.fit(train_path, self.dataset.slice_type,
                                   self.dataset.pass_num)
            val_accu = m.score(self.dataset.split_path(val_fold_id),
                               self.dataset.slice_type)

            print 'Results of Cross Validation on Model %s with Data %s: Fold %d/%d' % (
                model_name, self.dataset.name, val_fold_id, self.fold_num)
            print '\tParameter Setting: %s' % (str(params))
            print '\tTraining Accuracy: %f' % (train_accu)
            print '\tValidation Accuracy: %f' % (val_accu)
            train_accu_list.append(train_accu)
            val_accu_list.append(val_accu)
        return train_accu_list, val_accu_list
Example #10
0
def train(dt, model_name,
          model_params=None,
          output_path=None,
          fold_num=5,
          cv_params=None,
          retrain=False,
          cv_process_num=1):
    """
    train a SOL model

    Parameter
    ---------
    dt: DataSet
        the dataset used to train the model
    model_name: str
        name of the algorithm to use
    model_params: dict{param, val}
        model parameters
    output_path: str
        path to save the model
    fold_num: int
        number of folds to do cross validation
    cv_params: dict{param, range}
        cross validation parameters
    cv_process_num: int
        number of processes to do cross validation
    retrain: bool
        whether to re-do the cross validation

    Return
    ------
    tuple(train accuracy, train time, model)
    """
    # bug fix: the original used a mutable default argument ({}) and then
    # deleted keys from it ('B'/'lambda'), corrupting the shared default
    # and mutating the caller's dict; use a private copy instead
    model_params = dict(model_params) if model_params else {}

    if cv_params is not None:
        cv_output_path = osp.join(dt.work_dir, 'cv-%s.txt' % (model_name))
        # reuse cached cross-validation results unless a retrain is forced
        if osp.exists(cv_output_path) and not retrain:
            best_params = CV.load_results(cv_output_path)
        else:
            #cross validation: 'B' and 'lambda' must not take part in the
            #parameter search, so take them out and restore them afterwards
            param_B = model_params.pop('B', None)
            param_lambda = model_params.pop('lambda', None)

            cv = CV(dt, fold_num, cv_params, model_params,
                    process_num=cv_process_num)
            cv.train_val(model_name)
            best_params = cv.get_best_param()[0]
            cv.save_results(cv_output_path)

            if param_B is not None:
                model_params['B'] = param_B
            if param_lambda is not None:
                model_params['lambda'] = param_lambda

        logging.info('cross validation results: %s' % (str(best_params)))

        model_params.update(best_params)

    logging.info("learn model with %s algorithm on %s ..." % (model_name,
                                                              dt.name))
    logging.info("parameter settings: %s" % (model_params))

    start_time = time.time()
    m = SOL(model_name, dt.class_num, **model_params)
    train_accu = m.fit(dt.data_path, dt.dtype, dt.pass_num)
    train_time = time.time() - start_time

    logging.info("training accuracy of %s: %.4f" % (model_name, train_accu))
    logging.info("training time of %s: %.4f sec" % (model_name, train_time))

    if output_path is not None:  # identity comparison (was '!= None')
        logging.info("save model of %s to %s" % (model_name, output_path))
        m.save(output_path)

    return train_accu, train_time, m
Example #11
0
def train(dt, model_name,
          model_params=None,
          output_path=None,
          fold_num=5,
          cv_params=None,
          retrain=False,
          cv_process_num=1):
    """
    train a SOL model

    Parameter
    ---------
    dt: DataSet
        the dataset used to train the model
    model_name: str
        name of the algorithm to use
    model_params: dict{param, val}
        model parameters
    output_path: str
        path to save the model
    fold_num: int
        number of folds to do cross validation
    cv_params: dict{param, range}
        cross validation parameters
    cv_process_num: int
        number of processes to do cross validation
    retrain: bool
        whether to re-do the cross validation

    Return
    ------
    tuple(train accuracy, train time, model)
    """
    # bug fix: the original used a mutable default argument ({}) and then
    # deleted keys from it ('B'/'lambda') in place, corrupting the shared
    # default dict and the caller's dict; copy before modifying
    model_params = dict(model_params) if model_params else {}

    if cv_params is not None:
        cv_output_path = osp.join(dt.work_dir, 'cv-%s.txt' % (model_name))
        # reuse cached cross-validation results unless retraining is forced
        if osp.exists(cv_output_path) and not retrain:
            best_params = CV.load_results(cv_output_path)
        else:
            #cross validation: exclude 'B' and 'lambda' from the search,
            #then restore them once the best parameters are known
            param_B = model_params.pop('B', None)
            param_lambda = model_params.pop('lambda', None)

            cv = CV(dt, fold_num, cv_params, model_params,
                    process_num=cv_process_num)
            cv.train_val(model_name)
            best_params = cv.get_best_param()[0]
            cv.save_results(cv_output_path)

            if param_B is not None:
                model_params['B'] = param_B
            if param_lambda is not None:
                model_params['lambda'] = param_lambda

        logging.info('cross validation results: %s' % (str(best_params)))

        model_params.update(best_params)

    logging.info("learn model with %s algorithm on %s ..." % (model_name,
                                                              dt.name))
    logging.info("parameter settings: %s" % (model_params))

    start_time = time.time()
    m = SOL(model_name, dt.class_num, **model_params)
    train_accu = m.fit(dt.data_path, dt.dtype, dt.pass_num)
    train_time = time.time() - start_time

    logging.info("training accuracy of %s: %.4f" % (model_name, train_accu))
    logging.info("training time of %s: %.4f sec" % (model_name, train_time))

    if output_path is not None:  # identity comparison (was '!= None')
        logging.info("save model of %s to %s" % (model_name, output_path))
        m.save(output_path)

    return train_accu, train_time, m
Example #12
0
def run_ol(dtrain, dtest, opts, retrain=False, fold_num=5):
    """Train and test an online-learning algorithm.

    Parameter
    ---------
    dtrain: DataSet
        training dataset
    dtest: DataSet
        test dataset
    opts: dict
        must contain 'algo'; may contain 'params' (list of 'k=v' strings)
        and 'cv' (list of 'k=range' strings for cross validation)
    retrain: bool
        whether to re-do the cross validation
    fold_num: int
        number of folds for cross validation

    Return
    ------
    tuple(train accuracy, train time, test accuracy, test time,
          np.ndarray training log of [data_num, iter_num, update_num,
          err_rate] rows)
    """
    logging.info('run ol: %s' % (opts['algo']))
    # bug fix: the parameters are dtrain/dtest — the original referenced
    # undefined names dt_train/dt_test (NameError on these branches)
    if opts['algo'] == 'liblinear':
        return liblinear.run(dtrain, dtest, opts, retrain, fold_num)
    elif opts['algo'] == 'vw':
        return vw.run(dtrain, dtest, opts, retrain, fold_num)

    model_params = []
    if 'params' in opts:
        model_params = [item.split('=') for item in opts['params']]

    if 'cv' in opts:
        cv_output_path = osp.join(dtrain.work_dir,
                                  'cv-%s.txt' % (opts['algo']))
        # reuse cached cross-validation results unless retraining is forced
        if osp.exists(cv_output_path) and not retrain:
            best_params = CV.load_results(cv_output_path)
        else:
            #cross validation
            logging.info("cross validation on dataset %s with parameters %s" %
                         (dtrain.name, str(opts['cv'])))
            cv_params = [item.split('=') for item in opts['cv']]
            cv = CV(dtrain, fold_num, cv_params, model_params)
            cv.train_val(opts['algo'])
            best_params = cv.get_best_param()[0]
            cv.save_results(cv_output_path)

        logging.info('cross validation parameters: %s' % (str(best_params)))
        for k, v in best_params:
            model_params.append([k, v])

    model_params = dict(model_params)
    m = SOL(algo=opts['algo'], class_num=dtrain.class_num, **model_params)
    train_log = []

    # closure over train_log; the original's unused 'stat=train_log'
    # default parameter has been removed
    def record_training_process(data_num,
                                iter_num,
                                update_num,
                                err_rate):
        train_log.append([data_num, iter_num, update_num, err_rate])

    m.inspect_learning(record_training_process)

    logging.info("train %s on %s..." % (opts['algo'], dtrain.name))
    start_time = time.time()
    train_accu = m.fit(dtrain.rand_path(), dtrain.dtype)
    end_time = time.time()
    train_time = end_time - start_time

    logging.info("training accuracy: %.4f" % (train_accu))
    logging.info("training time: %.4f seconds" % (train_time))

    logging.info("test %s on %s..." % (opts['algo'], dtrain.name))
    start_time = time.time()
    test_accu = m.score(dtest.data_path, dtest.dtype)
    end_time = time.time()
    test_time = end_time - start_time

    logging.info("test accuracy: %.4f" % (test_accu))
    logging.info("test time: %.4f seconds" % (test_time))

    return train_accu, train_time, test_accu, test_time, np.array(train_log)
Example #13
0
def run_ol(dtrain,
           dtest,
           algo,
           opts,
           fold_num=5,
           cv_process_num=1):
    """
    Run Online Learning Algorithm

    Parameter
    ---------
    dtrain: DataSet
        training dataset
    dtest: DataSet
        test dataset
    algo: str
        name of the algorithm to use
    opts: dict
        options to train the model; may contain 'params' (dict of model
        parameters) and 'cv' (dict of cross-validation parameter ranges)
    fold_num: int
        number of folds to do cross validation
    cv_process_num: int
        number of processes to do cross validaton

    Return
    ------
    tuple(test accuracy, test time, train accuracy, train time,
          np.ndarray training log)
    """

    logging.info('run ol: %s', algo)

    # copy so that updating with cross-validation results below does not
    # mutate the caller's opts['params'] dict
    model_params = dict(opts['params']) if 'params' in opts else {}
    cv_params = opts['cv'] if 'cv' in opts else None

    if algo == 'liblinear':
        params = model_params.copy()
        # bug fix: cv_params is None when opts has no 'cv' entry and
        # dict.update(None) raises TypeError — only merge when present
        if cv_params:
            params.update(cv_params)
        return liblinear.train_test_l2(dtrain, dtest,
                                       fold_num=fold_num,
                                       **params)
    elif algo == 'vw':
        return vw.train_test(dtrain, dtest,
                             model_params=model_params,
                             cv_params=cv_params,
                             fold_num=fold_num,
                             cv_process_num=cv_process_num)

    #cross validation
    if cv_params is not None:  # identity comparison (was '!= None')
        cv_output_path = osp.join(dtrain.work_dir, 'cv-%s.txt' % (algo))
        if osp.exists(cv_output_path):
            best_params = CV.load_results(cv_output_path)
        else:
            cv_ = CV(dtrain, fold_num, cv_params, model_params,
                     process_num=cv_process_num)
            cv_.train_val(algo)
            best_params = cv_.get_best_param()[0]
            cv_.save_results(cv_output_path)

        logging.info('cross validation results: %s', str(best_params))

        model_params.update(best_params)

    logging.info("learn model with %s algorithm on %s ...", algo, dtrain.name)
    logging.info("parameter settings: %s", model_params)

    model = SOL(algo, dtrain.class_num, **model_params)

    #record update number and learning rate
    train_log = []
    def record_training_process(data_num, iter_num, update_num, err_rate):
        """closure logging function"""
        train_log.append([data_num, iter_num, update_num, err_rate])

    model.inspect_learning(record_training_process)

    #training
    start_time = time.time()
    train_accu = model.fit(dtrain.rand_path(), dtrain.dtype)
    end_time = time.time()
    train_time = end_time - start_time

    logging.info("training accuracy: %.4f", train_accu)
    logging.info("training time: %.4f seconds", train_time)

    #test
    logging.info("test %s on %s...", algo, dtest.name)
    start_time = time.time()
    test_accu = model.score(dtest.data_path, dtest.dtype)
    end_time = time.time()
    test_time = end_time - start_time

    logging.info("test accuracy: %.4f", test_accu)
    logging.info("test time: %.4f seconds", test_time)

    return test_accu, test_time, train_accu, train_time, np.array(train_log)