Example #1
def crossValid(x, y, cv=5, Nu=10, Nv=20):
    results = {"perf": [], "Nu": [], "Nv": []}

    np.random.seed(2017)
    kf = KFold(n_splits=cv, shuffle=True, random_state=0)
    for train, test in kf.split(x):
        x_train = x[train, :]
        y_train = y[train, :]
        x_test = x[test, :]
        y_test = y[test, :]

        ## hyperparameter tuning ##
        if isinstance(Nu, list):
            # both parameters should be of the same type #
            Nu_sel, Nv_sel = hyperparameters(x_train,
                                             y_train,
                                             cv=5,
                                             Nu=Nu,
                                             Nv=Nv)
        else:
            Nu_sel, Nv_sel = Nu, Nv

        y_pred = CADrank(Nu=Nu_sel, Nv=Nv_sel).fit(x_train,
                                                   y_train).predict(x_test)
        results["perf"].append(LogR.perfMeasure(y_pred, y_test, rankopt=True))
        results["Nu"].append(Nu_sel)
        results["Nv"].append(Nv_sel)

    for key in results.keys():
        item = np.array(results[key])
        mean = np.nanmean(item, axis=0)
        std = np.nanstd(item, axis=0)
        results[key] = [mean, std]

    return results
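For orientation, here is a minimal usage sketch, assuming `x` is an N-by-D feature matrix, `y` an N-by-L score matrix, and that `crossValid` plus its dependencies (`CADrank`, `LogR`, scikit-learn's `KFold`) are importable from this project; the shapes and candidate grids are illustrative only.

import numpy as np

# Illustrative shapes only; the real x/y come from the project's data pipeline.
x = np.random.rand(100, 8)   # 100 samples, 8 features
y = np.random.rand(100, 4)   # 100 samples, 4 labels

# Passing lists for Nu/Nv triggers the nested hyperparameter search on each
# training fold; passing scalars (the defaults) uses them directly.
res = crossValid(x, y, cv=5, Nu=[5, 10, 20], Nv=[10, 20, 40])
perf_mean, perf_std = res["perf"]   # per-metric mean and std across folds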
Example #2
def crossValidate(x, y, cv=5, Abstention=True, Inverse_laplace=4):
    results = {"perf": []}
    np.random.seed(2016)
    kf = KFold(n_splits=cv, shuffle=True, random_state=0)
    for train, test in kf.split(x):
        x_train = x[train, :]
        y_train = y[train, :]
        x_test = x[test, :]
        y_test = y[test, :]

        # convert score-format labels to pairwise preference input #
        x_tr, y_tr = score2pair(x_train,
                                y_train,
                                k=Inverse_laplace,
                                Abstention=Abstention)
        # train and predict ranks for test data #
        ranks = rankPairPref(x_tr, y_tr, x_test)
        # transform test score data to rank representation
        y_te = [LogR.rankOrder(scores) for scores in y_test.tolist()]

        results["perf"].append(
            LogR.perfMeasure(y_pred=ranks, y_test=y_te, rankopt=True))

    for key in results.keys():
        item = np.array(results[key])
        mean = np.nanmean(item, axis=0)
        std = np.nanstd(item, axis=0)
        results[key] = [mean, std]

    return results
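The heavy lifting above is done by the project's `score2pair` and `rankPairPref` helpers, which are not shown here. Purely as a generic illustration of the idea (not the project's implementation), the sketch below turns one score vector into "label i preferred over label j" pairs.

# Generic illustration only (not the project's score2pair).
def scores_to_pairs(scores):
    pairs = []
    for i in range(len(scores)):
        for j in range(len(scores)):
            if i != j and scores[i] > scores[j]:
                pairs.append((i, j))   # label i ranked above label j
    return pairs

# scores_to_pairs([0.7, 0.1, 0.4]) -> [(0, 1), (0, 2), (2, 1)]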
Example #3
def crossData(data_list,
              alpha=0.0,
              rank_weight=False,
              stop_criterion_mis_rate=None,
              stop_criterion_min_node=1,
              stop_criterion_gain=0.0,
              prune_criteria=0):
    results = {}
    for data_train in data_list:
        results[data_train] = {}
        for data_test in data_list:
            if data_test == data_train:
                continue
            x_train, y_tr = LogR.dataClean(data_train)
            y_train = label2Rank(y_tr.tolist())
            x_test, y_te = LogR.dataClean(data_test)
            y_test = label2Rank(y_te.tolist())
            tree = DecisionTree().buildtree(
                x_train,
                y_train,
                weights=None,
                stop_criterion_mis_rate=stop_criterion_mis_rate,
                stop_criterion_min_node=stop_criterion_min_node,
                stop_criterion_gain=stop_criterion_gain)
            y_pred = tree.predict(x_test, alpha)
            results[data_train][data_test] = LogR.perfMeasure(y_pred,
                                                              y_test,
                                                              rankopt=True)
    return results
Example #4
def crossValidateSimple(x, y, method="logReg", cv=5, alpha=None):
    #  error measure
    results = {"perf": []}
    # cross validation #
    np.random.seed(1100)
    kf = KFold(n_splits=cv, shuffle=True,
               random_state=0)  ## random_state fixed for reproducible testing
    for train, test in kf.split(x):
        x_train = x[train, :]
        y_train = y[train, :]
        x_test = x[test, :]
        y_test = y[test, :]

        # performance measure
        if method == "logReg":
            # NOTE: `weight` (one scalar per column) is not defined in this
            # excerpt; it is assumed to be learned elsewhere -- see the sketch
            # after this example.
            y_pred = np.zeros(x_test.shape)
            for i in range(y_pred.shape[0]):
                for j in range(y_pred.shape[1]):
                    y_pred[i, j] = x_test[i, j] * weight[j]
            results["perf"].append(LogR.perfMeasure(y_pred, y_test))

    for key in results.keys():
        item = np.array(results[key])
        mean = np.nanmean(item, axis=0)
        std = np.nanstd(item, axis=0)
        results[key] = [mean, std]

    return results
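The loop above uses a `weight` vector that is not defined in this excerpt; it apparently holds one scalar per column, i.e. the model is y[:, j] ≈ weight[j] * x[:, j]. A minimal sketch of how such per-column weights could be fitted by least squares follows; the function name and the fitting rule are assumptions, not part of the original example.

import numpy as np

# Hypothetical stand-in for the undefined `weight` above: one least-squares
# scale factor per column for the no-intercept model y[:, j] ~ w[j] * x[:, j].
def fit_column_weights(x_train, y_train):
    num = np.sum(x_train * y_train, axis=0)
    den = np.sum(x_train * x_train, axis=0)
    return num / den   # 1-D array, indexable as weight[j]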
Example #5
def crossValidate(x, y, cv=5, K=None):
    """
    :param y: N*L ranking vectors
    :return:
    """
    results = {"perf": []}

    ## cross validation ##
    np.random.seed(1100)
    kf = KFold(n_splits=cv, shuffle=True, random_state=0)
    for train, test in kf.split(x):
        x_train = x[train, :]
        y_train = y[train, :]
        x_test = x[test, :]
        y_test = y[test, :]

        # y_pred = KNN(K=K).fit(x_train, y_train).predict(x_test)
        y_pred = multithreadPredict(x_test, KNN(K=K).fit(x_train, y_train))
        print y_pred

        # print y_pred ### test
        results["perf"].append(LogR.perfMeasure(y_pred, y_test, rankopt=True))
        # print results["perf"][-1]

    for key in results.keys():
        item = np.array(results[key])
        mean = np.nanmean(item, axis=0)
        std = np.nanstd(item, axis=0)
        results[key] = [mean, std]

    return results
Example #6
def hyperParameter(x, y, x_valid=None, y_valid=None, cv=5, criteria=0):
    if x_valid is None:
        # no validation set, using cross validation #
        alpha_perform = []
        kf = KFold(n_splits=cv, shuffle=True, random_state=0)
        for train, valid in kf.split(x):
            x_train = x[train, :]
            y_train = y[train, :]
            x_valid = x[valid, :]
            y_valid = y[valid, :]

            tree = DecisionTree().buildtree(x_train, y_train)
            alpha_list = tree.alphalist()
            print "alpha_list in hyperparameter tuning: ", alpha_list
            alpha_best = [-1, None]
            for alpha in alpha_list:
                y_pred = tree.predict(x_valid, alpha=alpha)
                perf = LogR.perfMeasure(y_pred, y_valid, rankopt=True)
                perf_criteria = perf[criteria]
                if alpha_best[1] is None or perf_criteria >= alpha_best[1]:
                    alpha_best[0] = alpha
                    alpha_best[1] = perf_criteria

            alpha_perform.append(alpha_best)

        alpha_perform = np.array(alpha_perform, dtype=np.float32)
        print "inside hyperparameter:", alpha_perform ### test
        return np.average(alpha_perform, axis=0)[0]

    else:
        tree = DecisionTree().buildtree(x, y)
        alpha_list = tree.alphalist()
        alpha_best = [-1, None]
        for alpha in alpha_list:
            y_pred = tree.predict(x_valid, alpha=alpha)
            perf = LogR.perfMeasure(y_pred, y_valid, rankopt=True)
            perf_criteria = perf[criteria]
            if alpha_best[1] is None or perf_criteria >= alpha_best[1]:
                alpha_best[0] = alpha
                alpha_best[1] = perf_criteria
        return alpha_best[0]
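A hypothetical end-to-end driver is sketched below, combining this tuner with the cross-validation wrapper shown in the next example (Example #7): alpha is chosen by the inner search and then passed to the outer evaluation. The dataset identifier is a placeholder, and `LogR.dataClean` / `label2Rank` are assumed importable as in Example #3.

# Hypothetical driver; "some_dataset" is a placeholder identifier.
x, y_raw = LogR.dataClean("some_dataset")
y = label2Rank(y_raw.tolist())

alpha_sel = hyperParameter(x, y, cv=5, criteria=0)     # inner search for alpha
results = crossValidate(x, y, cv=5, alpha=alpha_sel)   # outer evaluation (Example #7)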
Example #7
def crossValidate(x,
                  y,
                  cv=5,
                  alpha=0,
                  rank_weight=False,
                  stop_criterion_mis_rate=None,
                  stop_criterion_min_node=1,
                  stop_criterion_gain=0.0):

    results = {"alpha": [], "perf": [], "size": []}

    # cross validation #
    np.random.seed(1100)
    kf = KFold(n_splits=cv, shuffle=True,
               random_state=0)  ## random_state fixed for reproducible testing
    for train, test in kf.split(x):
        x_train = x[train, :]
        y_train = y[train, :]
        x_test = x[test, :]
        y_test = y[test, :]

        # training and predict

        # if alpha == None:
        #     ## nested select validate and test ##
        #     # print "start searching alpha:", datetime.now() ### test
        #     alpha_sel, perf = DTme.hyperParometer(x_train, y_train)
        #     # print "finish searching alpha:", datetime.now(), alpha ### test
        # else:
        #     alpha_sel = alpha
        if rank_weight:
            weights = rank2Weight(y_train)
        else:
            weights = None
        tree = DecisionTree().buildtree(
            x_train,
            y_train,
            weights,
            stop_criterion_mis_rate=stop_criterion_mis_rate,
            stop_criterion_min_node=stop_criterion_min_node,
            stop_criterion_gain=stop_criterion_gain)

        # performance measure
        alpha_sel, y_pred = alpha, tree.predict(x_test, alpha)
        results["perf"].append(LogR.perfMeasure(y_pred, y_test, rankopt=True))
        results["alpha"].append(alpha_sel)
        results["size"].append(tree.size)
        print alpha_sel, "alpha"

    for key in results.keys():
        item = np.array(results[key])
        mean = np.nanmean(item, axis=0)
        std = np.nanstd(item, axis=0)
        results[key] = [mean, std]

    return results
Example #8
def crossValidate(x, y, method="logReg", cv=5, alpha=None):
    #  error measure
    results = []
    if method == "logReg":
        results = {"perf": [], "coef": [], "interc": []}
    elif method == "dT":
        results = {"alpha": [], "perf": []}

    # cross validation #
    np.random.seed(1100)
    kf = KFold(n_splits=cv, shuffle=True,
               random_state=0)  ## random_state fixed for reproducible testing
    for train, test in kf.split(x):
        x_train = x[train, :]
        y_train = y[train, :]
        x_test = x[test, :]
        y_test = y[test, :]

        # from multilabel to multiclass based on independence assumption
        if method == "logReg":
            x_train, y_train = LogR.multiClass(x_train, y_train)
        elif method == "dT":
            pass  # already in rank representation

        # training and predict
        if method == "dT":
            if alpha is None:
                ## nested select validate and test ##
                # print "start searching alpha:", datetime.now() ### test
                alpha_sel, perf = hyperParometer(x_train, y_train)
                # print "finish searching alpha:", datetime.now(), alpha ### test
            else:
                alpha_sel = alpha
            result = decisionTree(x_train, y_train, x_test, alpha=alpha_sel)

        # performance measure
        # NOTE: only the "dT" branch above produces `result` and an "alpha" key;
        # the logistic-regression prediction step is not shown in this excerpt.
        alpha_sel, y_pred = result
        results["perf"].append(LogR.perfMeasure(y_pred, y_test, rankopt=True))
        results["alpha"].append(alpha_sel)
        print alpha_sel, "alpha"

    for key in results.keys():
        item = np.array(results[key])
        mean = np.nanmean(item, axis=0)
        std = np.nanstd(item, axis=0)
        results[key] = [mean, std]

    return results
Example #9
def crossValidate(x, y, cv=5):
    results = {"perf": []}
    np.random.seed(1100)
    kf = KFold(n_splits=cv, shuffle=True, random_state=0)
    for train, test in kf.split(x):
        x_train = x[train, :]
        y_train = y[train, :]
        x_test = x[test, :]
        y_test = y[test, :]

        y_pred = labelWiseRanking(x_train, y_train, x_test)
        results["perf"].append(LogR.perfMeasure(y_pred, y_test, rankopt=True))
    for key in results.keys():
        item = np.array(results[key])
        mean = np.nanmean(item, axis=0)
        std = np.nanstd(item, axis=0)
        results[key] = [mean, std]
    return results
Example #10
def hyperparameters(x, y, Nu, Nv, cv=5, criterion=-1):
    best_result = None
    best_para = [None, None]
    for Nu_sel in Nu:
        for Nv_sel in Nv:
            perfs = []
            kf = KFold(n_splits=cv, shuffle=True, random_state=0)
            for train, test in kf.split(x):
                x_train = x[train, :]
                y_train = y[train, :]
                x_test = x[test, :]
                y_test = y[test, :]
                y_pred = CADrank(Nu=Nu_sel,
                                 Nv=Nv_sel).fit(x_train,
                                                y_train).predict(x_test)
                perf = LogR.perfMeasure(y_pred, y_test, rankopt=True)
                perfs.append(perf[criterion])
            result = sum(perfs) / cv
            if best_result is None or best_result < result:
                best_result = result
                best_para = [Nu_sel, Nv_sel]
    return best_para[0], best_para[1]
Example #11
def crossValidate(x, y, cv=5, nocross=False, cost=None, iter_max=ITER_MAX):

    results = {"perf": []}

    # cross validation #
    np.random.seed(1100)
    kf = KFold(n_splits=cv, shuffle=True,
               random_state=0)  ## random_state fixed for reproducible testing
    for train, test in kf.split(x):
        x_train = x[train, :]
        y_train = y[train, :]
        x_test = x[test, :]
        y_test = y[test, :]

        # training and predict

        # if alpha == None:
        #     ## nested select validate and test ##
        #     # print "start searching alpha:", datetime.now() ### test
        #     alpha_sel, perf = DTme.hyperParometer(x_train, y_train)
        #     # print "finish searching alpha:", datetime.now(), alpha ### test
        # else:
        #     alpha_sel = alpha
        classifiers = adaboost(x_train, y_train, x_test, y_test,
                               iter_max=iter_max, cost=cost)

        # performance measure
        y_pred = predict(x_test, classifiers)
        results["perf"].append(LogR.perfMeasure(y_pred, y_test, rankopt=True))

        if nocross:
            break

    for key in results.keys():
        item = np.array(results[key])
        mean = np.nanmean(item, axis=0)
        std = np.nanstd(item, axis=0)
        results[key] = [mean, std]

    return results
Example #12
def adaboost(x_train, y_train, x_test=None, y_test=None, cost_train=None,
             output=output, iter_max=ITER_MAX, cost=None):
    """

    :param cost_train: cost for each training sample, used in cost = "C2"
    :param output: How often output performance on test data by current classifier
    :param iter_max: maximum of adaboost iteration
    :param cost: cost sensitive version indicator: None, "init_weight", "C2"
    """

    Nsamp = y_train.shape[0]

    classifiers = []

    # initialize sample weights #
    if cost is None or cost == "C2":
        weights_init = np.full(Nsamp, 1.0 / Nsamp, dtype=np.float32)  # uniform start
        weights = weights_init
        if cost == "C2" and cost_train is None:
            cost_train = rank2Weight_cost(y_train, cost_level=COST_LEVEL)
    elif cost == "init_weight":
        weights_init = rank2Weight(y_train)
        weights = weights_init
    else:
        raise ValueError("unsupported cost type")

    start = datetime.now()  # timer
    for iter in range(iter_max):
        # base classifier: a decision tree for now #
        # (stop_criterion_mis_rate is assumed to be defined at module level)
        tree = DecisionTree().buildtree(x_train, y_train, weights,
                                        stop_criterion_mis_rate=stop_criterion_mis_rate)
        # tree.printtree()
        # training result: whether each sample is predicted correctly #
        compare_results = [False for i in range(Nsamp)]
        for i in range(Nsamp):
            y_pred = tree.predict(x_train[i])
            # print y_pred, y_train[i]
            cmp_result = not tree.diffLabel(y_pred, y_train[i])
            compare_results[i] = cmp_result
        compare_results = np.array(compare_results, dtype=bool)

        # compute the classifier weight wc #
        if cost is None or cost == "init_weight":
            weight_sum_cor = np.sum(weights[compare_results])
            weight_sum_dis = np.sum(weights[~compare_results])
        elif cost == "C2":
            weight_sum_cor = costWeightSum(weights, compare_results, cost_train,
                                           cordis=True)
            weight_sum_dis = costWeightSum(weights, compare_results, cost_train,
                                           cordis=False)

        if weight_sum_cor < weight_sum_dis:
            # the base classifier is too weak for boosting
            raise ValueError("too weak classifier")
        if weight_sum_dis == 0:
            # already a perfect classifier on the weighted training set
            warnings.warn("perfect classifier")  # assumes `import warnings` at module level
            break
        wc = 0.5 * (math.log(weight_sum_cor) - math.log(weight_sum_dis))

        # updating weights #
        weights = weightsUpdate(weights, compare_results, wc, cost_train)

        # add classifier to classifier list #
        classifiers.append([wc, tree])

        # realtime output #
        if output is not None and (iter + 1) % output == 0:
            # status of current classifiers #
            print "wc", wc  ### test
            print "weights stats: mean, std, min, max"
            print np.mean(weights), np.std(weights), np.min(weights), np.max(weights)
            # performance on test set #
            y_pred = predict(x_test, classifiers)
            performance = LogR.perfMeasure(y_pred, y_test, rankopt=True)
            print "iter: ", iter + 1
            print performance
            with open(LOGFILE, "a") as log:
                log.write(" ".join(map(str, performance)) + "\n")

            duration = datetime.now() - start
            start = datetime.now()
            print "time for %d iters: %f" % (output, duration.total_seconds())
    return classifiers
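The `weightsUpdate` helper called in the loop is not included in this example. Ignoring the cost-sensitive `cost_train` variant, a minimal sketch of the standard AdaBoost re-weighting it presumably performs is given below; the renormalization step is an assumption.

import numpy as np

# Standard AdaBoost step (sketch): down-weight correctly classified samples,
# up-weight misclassified ones, then renormalize the weights to sum to 1.
def weights_update_sketch(weights, compare_results, wc):
    factors = np.where(compare_results, np.exp(-wc), np.exp(wc))
    new_weights = weights * factors
    return new_weights / np.sum(new_weights)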