コード例 #1
0
def WicoxonTest(series, center, side='both', alpha=0.05):
    """Compare the location of `series` against `center` with rank sums.

    Each observation is flagged as above `center` (1) or not (0), ranked by
    its absolute distance to `center`, and the smaller of the two rank sums
    is standardised with a +/-0.5 shift before being passed to `Ncdf`.

    Parameters
    ----------
    series : sequence of numbers
        Observations to test.
    center : number
        Value the observations are compared with.
    side : str
        Only 'both' is accepted.
    alpha : float
        Accepted for interface compatibility; never read by this function.

    Returns
    -------
    WicoxonTestResult
        Built from (Z rounded to 4 digits, number of records as a float,
        center, `Ncdf(Z, 0, 1)` rounded to 4 digits).

    Raises
    ------
    AssertionError
        If any of the argument checks fails.
    """
    assert is_seq(series), 'Sign test expects sequence object stored data'
    assert all(map(is_math, series)), 'Sign test expects numerical data'
    assert is_math(center) is True, 'the value to compare must be a number'
    assert side == 'both', "don't supprot single side test in thie version"

    # Observations equal to `center` are flagged 0, i.e. grouped with the
    # values below it.
    above_center = [1 if value > center else 0 for value in series]
    table = SeriesSet({'X': series, 'SYMBOL': above_center})
    table['ABS'] = table.X.apply(lambda value: abs(value - center))
    table['RANK'] = get_ranks(table.ABS)

    def rank_sum(flag):
        # Sum of the ranks of every record carrying the given flag.
        return sum(table.select(lambda row: row['SYMBOL'] == flag).RANK)

    smaller_sum = min(rank_sum(1), rank_sum(0))

    size = float(table.shape.Ln)
    expected = size * (size + 1) / 4
    # NOTE(review): the shift is -0.5 when the rank sum is below its
    # expectation, which moves Z away from zero -- confirm the intended sign
    # of the continuity correction.
    correction = -0.5 if smaller_sum < expected else 0.5
    spread = sqrt(size * (size + 1) * (2 * size + 1) / 24)
    statistic = (smaller_sum - expected + correction) / spread
    # NOTE(review): presumably `Ncdf` is the normal CDF, which would make this
    # a single-tail probability although only side='both' is accepted --
    # confirm whether it should be doubled.
    p_value = Ncdf(statistic, 0, 1)
    return WicoxonTestResult(
        round(statistic, 4), size, center, round(p_value, 4))
コード例 #2
0
def ANOVA(data, cluster):
    """Compute an F statistic comparing one value column across groups.

    Parameters
    ----------
    data : SeriesSet or any object accepted by ``SeriesSet(...)``
        Table holding the grouping column and the value column.
    cluster : str
        Name of the column in `data` whose values define the groups.

    Returns
    -------
    ANOVA_result
        Built from ``(F, 1 - Fcdf(F, r - 1, n - r))`` where `r` is the number
        of groups and `n` the number of records.

    Raises
    ------
    AssertionError
        If `data` has fewer than 2 columns, too few records, or `cluster` is
        not a string naming a column of `data`.

    Notes
    -----
    Earlier revisions asserted on `control` and `report`, names that are not
    parameters of this function, so every call failed with NameError before
    doing any work. Those checks were removed.
    """
    if not isinstance(data, SeriesSet):
        data = SeriesSet(data)
    assert data.shape.Col > 1, 'ANOVA() expects more than 1 comparing group.'
    assert data.shape.Ln > 2, 'at least 2 records in the data'
    assert is_str(
        cluster
    ), '`cluster` must be a string object to represent the categorical variable in the data'
    assert cluster in data.columns
    cluster = [cluster]

    # The value column is whichever column is left after removing the
    # grouping column; with more than two columns the pick follows set order.
    value_column = tuple(set(data.columns) - set(cluster))[0]

    total_mean = data[value_column].mean()
    SSA, SSE, r, n = 0.0, 0.0, 0.0, data.shape.Ln
    for _label, subset in data.iter_groupby(cluster):
        seq = subset[value_column]
        r += 1
        # Between-group sum of squares.
        SSA += len(seq) * (seq.mean() - total_mean)**2
        # Within-group sum of squares.
        # NOTE(review): assumes `std()` is the population standard deviation,
        # so that len * std**2 is the group's sum of squared deviations --
        # confirm.
        SSE += len(seq) * seq.std()**2
    MSA = SSA / (r - 1.0)
    # A tiny stand-in keeps F finite when there is no within-group variation.
    MSE = SSE / (n - r) if SSE != 0 else 0.00001
    F = MSA / MSE
    return ANOVA_result(F, 1 - Fcdf(F, r - 1, n - r))
コード例 #3
0
 def _check_target_labels(self, target):
     """Turn `target` into the engine matrix used for training labels.

     `target` is wrapped in a SeriesSet; when it has a single column that
     column is expanded through `get_dummies`. The resulting column names are
     stored in `self._labels`, `self._final_func` is set to 'softmax', and the
     rows are handed to `self._engine.mat`, whose result is returned.
     """
     encoded = SeriesSet(target)
     has_single_column = encoded.shape.Col == 1
     if has_single_column:
         only_column = encoded.columns[0]
         encoded = get_dummies(encoded[only_column], dtype='SeriesSet')
     self._labels = encoded.columns
     self._final_func = 'softmax'
     rows = list(encoded.iter_rows())
     return self._engine.mat(rows)
コード例 #4
0
 def _Perf(self, X):
     """Return a one-row table with the method name, accuracy and kappa.

     The confusion matrix is computed from `X` through
     `self._calculate_confumat` only when `self._confumat` is still None;
     otherwise the cached matrix is reused.
     """
     if self._confumat is None:
         self._confumat = self._calculate_confumat(X)
     header = ['Method', 'Accuracy (%)', 'Kappa']
     report = SeriesSet(None, header, nan='-')
     method_name = self._solve.upper()
     accuracy = Accuracy(self._confumat)
     kappa = Kappa(self._confumat)
     report.append_row([method_name, accuracy, kappa])
     return report
コード例 #5
0
 def _Summary(self):
     """Tabulate every function's eigenvalue, rate and cumulative rate.

     One row is appended per pair taken from `self._value` and
     `self._valrate`; rates are multiplied by 100 and all numbers are rounded
     to 4 digits.
     """
     header = ['Function', 'Eigenvalue', 'Rate (%)', 'Cumulative (%)']
     summary = SeriesSet(None, header)
     running_rate = 0
     position = 0
     for eigenvalue, rate in zip(self._value, self._valrate):
         position += 1
         running_rate += rate
         row = [
             'Func%d' % position,
             round(eigenvalue, 4),
             round(rate * 100, 4),
             round(running_rate * 100, 4),
         ]
         summary.append_row(row)
     return summary
コード例 #6
0
def ANOVA(data, cluster, control=None, report=False):
    """Analysis of variance over a SeriesSet (appears to be unfinished).

    Parameters
    ----------
    data : SeriesSet or any object accepted by ``SeriesSet(...)``
    cluster : str
        Name of a column in `data`; asserted to be a string and a column.
    control : str or None
        Optional second column name; when given it is appended to `cluster`.
    report : bool
        Only validated; never read afterwards in this function.

    Returns
    -------
    ANOVA_result
        Called with a single list ``[F, 1 - Fcdf(F, r - 1, n - r)]``.

    NOTE(review): as written this function references several names that are
    not bound anywhere inside it (`sum_of_value`, `i`, `classes`); unless they
    exist at module level the call fails with NameError. See the inline notes.
    """
    if not isinstance(data, SeriesSet):
        data = SeriesSet(data)
    # NOTE(review): the check requires more than 2 columns while the message
    # speaks of "more than 1 comparing group" -- confirm which is intended.
    assert data.shape.Col > 2, 'ANOVA() expects more than 1 comparing group.'
    assert data.shape.Ln > 2, 'at least 2 records in the data'
    assert is_str(
        cluster
    ), '`cluster` must be a string object to represent the categorical variable in the data'
    # NOTE(review): the message says "False" but the check accepts None.
    assert is_str(
        control
    ) or control == None, '`control` must be False or a string object'
    assert report in (True, False)
    assert cluster in data.columns
    cluster = [cluster]
    if is_str(control) is True:
        assert control in data.columns
        cluster.append(control)

    # Every column that is not a grouping column is treated as a value column.
    value_column = set(data.columns) - set(cluster)
    value_data = data[value_column]
    sum_x = sum(ser.sum() for ser in data[value_column].values())
    num_of_value = float(value_data.shape.Ln * value_data.shape.Col)

    # `report_` is filled below but is not part of the return value.
    report_ = SeriesSet(None, ['Source', 'DF', 'SS', 'MS', 'F', 'Sig'], '')
    # Total sum of squares: sum of squared values minus (sum)^2 / count.
    SST = sum((val**2).sum()
              for val in value_data.values()) - sum_x**2 / num_of_value
    # NOTE(review): `sum_of_value` is not assigned in this function, while
    # `num_of_value` is assigned above and never read -- probably a typo.
    report_.append_row(['Total', sum_of_value - 1, SST])
    for subclass in cluster:
        # NOTE(review): `subset` is never read, and the generator below yields
        # `i`, which is not bound in this function -- this loop looks
        # unfinished.
        subset = data[set(data.columns) - set([subclass])]
        SSR = sum(i for val in data[set(data.columns) - set(cluster)])

    new_classes = list()
    # NOTE(review): `classes` is not a parameter or local of this function;
    # presumably the per-group sequences were meant here -- confirm.
    for sequence in classes:
        # Keep only the numeric entries of each group.
        sequence = Series(filter(is_math, sequence))
        assert len(
            sequence) > 1, 'ANOVA() expects more than 1 samples in each class.'
        new_classes.append(sequence)

    ni = list(map(len, new_classes))
    n = sum(ni)
    r = len(new_classes)
    # NOTE(review): on Python 3 `map` returns an iterator, so the `Ti[i]`
    # indexing below raises TypeError; this only works where `map` returns a
    # list.
    Ti = map(sum, new_classes)
    # Sum of squares of every observation.
    G = sum([sum(map(lambda x: x**2, sequence)) for sequence in new_classes])

    totals = sum([Ti[i]**2 / float(ni[i]) for i in range(r)])
    # Between-group (Sa) and within-group (Se) sums of squares.
    Sa = totals - sum(Ti)**2 / float(sum(ni))
    Se = G - totals
    MSa = Sa / float(r - 1)
    MSe = Se / float(sum(ni) - r)
    F = MSa / MSe
    return ANOVA_result([F, 1 - Fcdf(F, r - 1, n - r)])
コード例 #7
0
ファイル: evaluator.py プロジェクト: yxhust/DaPy
def ConfuMat(Y, y_, labels):
    """Build a confusion matrix with an extra totals row and column.

    The `labels` argument is replaced straight away by the sorted union of
    the values found in `Y` and `y_`, so the value passed in has no effect.
    Cell [i, j] counts the records whose first field equals label i and whose
    second field equals label j; the last column holds row sums and the last
    row holds the column sums as computed at the end.
    """
    labels = sorted(set(Y) | set(y_))
    size = len(labels) + 1
    matrix = zeros((size, size))
    pairs = SeriesSet({'Y': Y, 'y': y_})
    for row_no, first_label in enumerate(labels):
        # presumably field 0 is the 'Y' column and field 1 the 'y' column --
        # verify against SeriesSet's column ordering
        matching_first = pairs.select(lambda rec: rec[0] == first_label)
        for col_no, second_label in enumerate(labels):
            matching_both = matching_first.select(
                lambda rec: rec[1] == second_label)
            matrix[row_no, col_no] = len(matching_both)
        matrix[row_no, -1] = sum(matrix[row_no])

    for col_no in range(size):
        column_values = matrix[:, col_no].tolist()[0]
        matrix[-1, col_no] = sum(column_values)
    return matrix
コード例 #8
0
ファイル: evaluator.py プロジェクト: yxhust/DaPy
def Performance(predictor, data, target, mode='reg'):
    """Log evaluation metrics of `predictor` on `data` against `target`.

    Parameters
    ----------
    predictor : object
        Must offer `predict`; in 'clf' mode `predict_proba` is also called
        when the confusion matrix has 3 columns, and an optional `labels`
        attribute is used when present.
    data, target : sized collections of equal length
    mode : {'reg', 'clf'}

    Returns
    -------
    The confusion matrix in 'clf' mode; nothing (None) in 'reg' mode.

    Raises
    ------
    AssertionError
        On an unknown `mode`, mismatching lengths, or a multi-column target
        in 'clf' mode.
    """
    assert mode in ('clf', 'reg'), "`mode` must be `clf` or `reg` only."
    assert len(data) == len(
        target), "the number of target data is not equal to variable data"

    if mode == 'reg':
        target = Series(target)
        predicted = Series(predictor.predict(data).T.tolist()[0])
        # Compute every score first, then log them.
        mae = Score.MAE(target, predicted)
        mse = Score.MSE(target, predicted)
        r_squared = Score.R2_score(target, predicted)
        mape = Score.MAPE(target, predicted)
        LogInfo('Regression MAE: %.4f' % mae)
        LogInfo('Regression MSE: %.4f' % mse)
        LogInfo('Regression MAPE: %.4f' % mape)
        LogInfo(u'Regression R²: %.4f' % r_squared)

    elif mode == 'clf':
        predicted = predictor.predict(data)
        if not hasattr(predicted, 'shape'):
            predicted = SeriesSet(predicted)
        if not hasattr(target, 'shape'):
            target = SeriesSet(target)
            assert target.shape[1] == 1, 'testify target must be a sequence'
            first_column = target.columns[0]
            target = target[first_column]

        if hasattr(predictor, 'labels'):
            labels = predictor.labels
        else:
            labels = sorted(set(predicted) | set(target))

        matrix = ConfuMat(target, predicted, labels)
        LogInfo('Classification Accuracy: %.4f' % Accuracy(matrix))
        LogInfo('Classification Kappa: %.4f' % Kappa(matrix))

        # AUC is only reported when the matrix has three columns.
        if matrix.shape[1] == 3:
            proba = predictor.predict_proba(data)
            if proba.shape[1] == 2:
                proba = proba[:, 0]
            # Recode the target as 1 for the first label and 0 otherwise.
            target = Series(1 if item == labels[0] else 0 for item in target)
            LogInfo('Classification AUC: %.4f' % Auc(target, proba))
        return matrix
コード例 #9
0
 def _Info(self, shape):
     """Return a table of variable names plus one column per stored vector.

     The first column, 'Variables', lists 'X1' .. 'X<shape>'. Each entry of
     `self._vector` contributes a column named 'Func<k>' (k counted from 1)
     filled with the first row of the entry's `tolist()` output.
     """
     info = SeriesSet()
     variable_names = ['X%d' % number for number in range(1, shape + 1)]
     info.append_col(variable_names, 'Variables')
     func_no = 0
     for vector in self._vector:
         func_no += 1
         info.append_col(vector.tolist()[0], 'Func%d' % func_no)
     return info
コード例 #10
0
def proba2label(seq, labels):
    """Convert `seq` to labels via the multi-label or binary helper.

    Input without a `shape` attribute is first wrapped in a SeriesSet. When
    the second dimension of `seq.shape` is greater than 1 the work is
    delegated to `clf_multilabel`, otherwise to `clf_binlabel`; the helper's
    result is returned unchanged.
    """
    if not hasattr(seq, 'shape'):
        seq = SeriesSet(seq)
    converter = clf_multilabel if seq.shape[1] > 1 else clf_binlabel
    return converter(seq, labels)