def WicoxonTest(series, center, side='both', alpha=0.05):
    '''Two-sided Wilcoxon signed-rank test of `series` against `center`.

    Every observation is ranked by its absolute distance to `center`; the
    smaller of the positive and negative rank sums is then compared with
    its expectation through a normal approximation.

    Parameters
    ----------
    series : sequence of numbers
        The sample to test.
    center : number
        The hypothesised location the sample is compared with.
    side : str
        Only 'both' (two-sided test) is supported.
    alpha : float
        Accepted for interface compatibility; not used by the computation.

    Returns
    -------
    WicoxonTestResult
        Built from (Z rounded to 4 digits, n, center, two-sided p-value
        rounded to 4 digits).

    Raises
    ------
    AssertionError
        If `series` is not a numeric sequence, `center` is not a number or
        `side` is not 'both'.
    '''
    assert is_seq(series), 'Wilcoxon test expects sequence object stored data'
    assert all(map(is_math, series)), 'Wilcoxon test expects numerical data'
    assert is_math(center) is True, 'the value to compare must be a number'
    assert side == 'both', "don't support single side test in this version"

    # NOTE(review): observations equal to `center` are counted on the
    # negative side (SYMBOL == 0) rather than being dropped -- confirm that
    # this is the intended treatment of zero differences.
    symbol = [1 if _ > center else 0 for _ in series]
    data = SeriesSet({'X': series, 'SYMBOL': symbol})

    # distance to the compare center
    data['ABS'] = data.X.apply(lambda x: abs(x - center))

    # rank records by distance
    data['RANK'] = get_ranks(data.ABS)

    # calculate the sum of ranking
    W_pos = sum(data.select(lambda row: row['SYMBOL'] == 1).RANK)
    W_neg = sum(data.select(lambda row: row['SYMBOL'] == 0).RANK)
    W = min(W_pos, W_neg)

    # calculate the Statistic
    n = float(data.shape.Ln)
    deviation = W - n * (n + 1) / 4

    # The continuity correction moves W half a unit TOWARDS its expectation,
    # which makes the normal approximation conservative.
    if deviation < 0:
        C = 0.5
    elif deviation > 0:
        C = -0.5
    else:
        C = 0.0
    Z = (deviation + C) / (sqrt(n * (n + 1) * (2 * n + 1) / 24))

    # Two-sided p-value: twice the tail area beyond |Z|, capped at 1.
    pvalue = min(1.0, 2 * Ncdf(-abs(Z), 0, 1))
    return WicoxonTestResult(round(Z, 4), n, center, round(pvalue, 4))
def ANOVA(data, cluster):
    '''One-way analysis of variance of a value column grouped by `cluster`.

    Parameters
    ----------
    data : SeriesSet or anything `SeriesSet(...)` accepts
        Table holding the grouping column and the value column.
    cluster : str
        Name of the categorical column that defines the groups.

    Returns
    -------
    ANOVA_result
        Built from the F statistic and `1 - Fcdf(F, r - 1, n - r)`, where
        `r` is the number of groups and `n` the number of records.

    Raises
    ------
    AssertionError
        If the table is too small, `cluster` is not a string or it is not
        one of the columns.
    '''
    if not isinstance(data, SeriesSet):
        data = SeriesSet(data)
    assert data.shape.Col > 1, 'ANOVA() expects more than 1 comparing group.'
    assert data.shape.Ln > 2, 'at least 2 records in the data'
    assert is_str(
        cluster
    ), '`cluster` must be a string object to represent the categorical variable in the data'
    assert cluster in data.columns

    cluster = [cluster]
    # The value column is whichever column is left once the grouping column
    # is removed (an arbitrary one if several remain).
    value_column = tuple(set(data.columns) - set(cluster))[0]
    total_mean = data[value_column].mean()

    SSA, SSE, r, n = 0.0, 0.0, 0.0, data.shape.Ln
    for _label, subset in data.iter_groupby(cluster):
        seq = subset[value_column]
        r += 1
        # between-group sum of squares
        SSA += len(seq) * (seq.mean() - total_mean)**2
        # within-group sum of squares
        # NOTE(review): assumes `std()` is the population standard
        # deviation -- confirm against the Series implementation.
        SSE += len(seq) * seq.std()**2

    MSA = SSA / (r - 1.0)
    # A zero within-group variance is replaced by a tiny constant so that
    # the F ratio stays finite.
    MSE = SSE / (n - r) if SSE != 0 else 0.00001
    F = MSA / MSE
    return ANOVA_result(F, 1 - Fcdf(F, r - 1, n - r))
def _check_target_labels(self, target):
    '''Convert `target` into an engine matrix and remember its labels.

    The target is wrapped in a SeriesSet.  When it has exactly one column,
    that column is replaced by the output of `get_dummies` (presumably a
    one-hot encoding -- confirm).  The resulting column names are stored in
    `self._labels`, `self._final_func` is set to 'softmax', and the rows are
    returned as `self._engine.mat(...)`.
    '''
    frame = SeriesSet(target)
    if frame.shape.Col == 1:
        only_column = frame[frame.columns[0]]
        frame = get_dummies(only_column, dtype='SeriesSet')

    self._labels = frame.columns
    self._final_func = 'softmax'

    to_matrix = self._engine.mat
    return to_matrix(list(frame.iter_rows()))
def _Perf(self, X):
    '''Return a one-row table with the method name, accuracy and kappa.

    The confusion matrix is computed from `X` through
    `self._calculate_confumat` the first time it is needed and cached in
    `self._confumat`; later calls reuse the cached matrix.
    '''
    if self._confumat is None:
        self._confumat = self._calculate_confumat(X)

    header = ['Method', 'Accuracy (%)', 'Kappa']
    summary = SeriesSet(None, header, nan='-')
    row = [
        self._solve.upper(),
        Accuracy(self._confumat),
        Kappa(self._confumat),
    ]
    summary.append_row(row)
    return summary
def _Summary(self):
    '''Return a table with one row per entry of `self._value`.

    Each row holds the function name ('Func1', 'Func2', ...), the value
    rounded to 4 digits, its rate from `self._valrate` as a percentage and
    the running cumulative percentage.
    '''
    header = ['Function', 'Eigenvalue', 'Rate (%)', 'Cumulative (%)']
    summary = SeriesSet(None, header)

    cumulative = 0
    pairs = zip(self._value, self._valrate)
    for position, pair in enumerate(pairs, 1):
        eigenvalue, rate = pair
        cumulative += rate
        name = 'Func%d' % position
        summary.append_row([
            name,
            round(eigenvalue, 4),
            round(rate * 100, 4),
            round(cumulative * 100, 4),
        ])
    return summary
def ANOVA(data, cluster, control=None, report=False):
    '''Analysis of variance with an optional second grouping column.

    NOTE(review): this function looks unfinished.  It reads several names
    that are never assigned inside it (`sum_of_value`, `i`, `classes`), so
    unless they exist at module level it raises NameError before it can
    return.  See the inline notes.

    Parameters
    ----------
    data : SeriesSet or anything `SeriesSet(...)` accepts
    cluster : str
        Name of the categorical column.
    control : str or None
        Optional name of a second column appended to the grouping columns.
    report : bool
        Validated but not otherwise used in the visible code.

    Returns
    -------
    ANOVA_result
        Built from the list `[F, 1 - Fcdf(F, r - 1, n - r)]`.
    '''
    if not isinstance(data, SeriesSet):
        data = SeriesSet(data)
    assert data.shape.Col > 2, 'ANOVA() expects more than 1 comparing group.'
    assert data.shape.Ln > 2, 'at least 2 records in the data'
    assert is_str(
        cluster
    ), '`cluster` must be a string object to represent the categorical variable in the data'
    assert is_str(
        control
    ) or control == None, '`control` must be False or a string object'
    assert report in (True, False)
    assert cluster in data.columns

    # Grouping columns: `cluster`, plus `control` when it is given.
    cluster = [cluster]
    if is_str(control) is True:
        assert control in data.columns
        cluster.append(control)

    # Every column that is not a grouping column is treated as a value column.
    value_column = set(data.columns) - set(cluster)
    value_data = data[value_column]
    sum_x = sum(ser.sum() for ser in data[value_column].values())
    num_of_value = float(value_data.shape.Ln * value_data.shape.Col)
    report_ = SeriesSet(None, ['Source', 'DF', 'SS', 'MS', 'F', 'Sig'], '')

    # Total sum of squares: sum(x^2) - (sum x)^2 / N.
    SST = sum((val**2).sum() for val in value_data.values()) - sum_x**2 / num_of_value
    # NOTE(review): `sum_of_value` is not assigned in this function; only
    # `num_of_value` is.  Probably a typo -- confirm.
    report_.append_row(['Total', sum_of_value - 1, SST])

    # NOTE(review): the loop body is assumed to be the two statements below.
    # `subset` and `SSR` are never read afterwards, and `i` is not bound when
    # the generator runs -- confirm the intended computation.
    for subclass in cluster:
        subset = data[set(data.columns) - set([subclass])]
        SSR = sum(i for val in data[set(data.columns) - set(cluster)])

    # NOTE(review): `classes` is not assigned in this function.
    new_classes = list()
    for sequence in classes:
        # keep only the entries accepted by `is_math`
        sequence = Series(filter(is_math, sequence))
        assert len(
            sequence) > 1, 'ANOVA() expects more than 1 samples in each class.'
        new_classes.append(sequence)

    ni = list(map(len, new_classes))  # size of each class
    n = sum(ni)  # total number of samples
    r = len(new_classes)  # number of classes
    # NOTE(review): on Python 3 `map` returns an iterator, so the `Ti[i]`
    # indexing below would fail there -- confirm the targeted version.
    Ti = map(sum, new_classes)
    G = sum([sum(map(lambda x: x**2, sequence)) for sequence in new_classes])
    totals = sum([Ti[i]**2 / float(ni[i]) for i in range(r)])
    Sa = totals - sum(Ti)**2 / float(sum(ni))  # between-class sum of squares
    Se = G - totals  # within-class sum of squares
    MSa = Sa / float(r - 1)
    MSe = Se / float(sum(ni) - r)
    F = MSa / MSe
    return ANOVA_result([F, 1 - Fcdf(F, r - 1, n - r)])
def ConfuMat(Y, y_, labels):
    '''Calculate the confusion matrix of `Y` against `y_`, with totals.

    The result has one row and one column per distinct label, plus a final
    row and a final column that hold the totals.

    NOTE: the `labels` argument is not used.  The label set is always
    recomputed as the sorted union of the values found in `Y` and `y_`.
    '''
    labels = sorted(set(Y) | set(y_))
    size = len(labels) + 1
    confu = zeros((size, size))
    paired = SeriesSet({'Y': Y, 'y': y_})

    for row_idx, row_label in enumerate(labels):
        # records whose first field equals the row label
        # (presumably the 'Y' column -- confirm the column order)
        same_row = paired.select(lambda row: row[0] == row_label)
        for col_idx, col_label in enumerate(labels):
            hits = same_row.select(lambda row: row[1] == col_label)
            confu[row_idx, col_idx] = len(hits)
        # row total; the last cell is still zero when it is summed
        confu[row_idx, -1] = sum(confu[row_idx])

    for col_idx in range(size):
        column_values = confu[:, col_idx].tolist()[0]
        confu[-1, col_idx] = sum(column_values)
    return confu
def Performance(predictor, data, target, mode='reg'):
    '''Evaluate `predictor` on `data` / `target` and log the metrics.

    Parameters
    ----------
    predictor : object
        Must offer `predict(data)`; in 'clf' mode `predict_proba(data)` is
        also called when the confusion matrix has three columns.
    data : sized collection
        Input records, same length as `target`.
    target : sized collection
        Expected outputs.
    mode : str
        'clf' for classification or 'reg' for regression.

    Returns
    -------
    The confusion matrix from `ConfuMat` in 'clf' mode; None in 'reg' mode.
    All metrics are reported through `LogInfo`.

    Raises
    ------
    AssertionError
        On an unknown `mode`, mismatched lengths, or a multi-column target
        in 'clf' mode.
    '''
    assert mode in ('clf', 'reg'), "`mode` must be `clf` or `reg` only."
    assert len(data) == len(
        target), "the number of target data is not equal to variable data"

    if mode == 'clf':
        predicted = predictor.predict(data)
        if not hasattr(predicted, 'shape'):
            predicted = SeriesSet(predicted)
        if not hasattr(target, 'shape'):
            target = SeriesSet(target)
        assert target.shape[1] == 1, 'testify target must be a sequence'
        target = target[target.columns[0]]

        if hasattr(predictor, 'labels'):
            labels = predictor.labels
        else:
            labels = sorted(set(predicted) | set(target))

        confusion = ConfuMat(target, predicted, labels)
        LogInfo('Classification Accuracy: %.4f' % Accuracy(confusion))
        LogInfo('Classification Kappa: %.4f' % Kappa(confusion))

        # Three columns presumably means two classes plus a totals column;
        # only then is the AUC reported.
        if confusion.shape[1] == 3:
            proba = predictor.predict_proba(data)
            if proba.shape[1] == 2:
                proba = proba[:, 0]
            # recode the target as 1 for the first label and 0 otherwise
            target = Series(
                1 if value == labels[0] else 0 for value in target)
            LogInfo('Classification AUC: %.4f' % Auc(target, proba))
        return confusion

    elif mode == 'reg':
        actual = Series(target)
        fitted = Series(predictor.predict(data).T.tolist()[0])

        mae = Score.MAE(actual, fitted)
        mse = Score.MSE(actual, fitted)
        r_squared = Score.R2_score(actual, fitted)
        mape = Score.MAPE(actual, fitted)

        LogInfo('Regression MAE: %.4f' % mae)
        LogInfo('Regression MSE: %.4f' % mse)
        LogInfo('Regression MAPE: %.4f' % mape)
        LogInfo(u'Regression R²: %.4f' % r_squared)
def _Info(self, shape):
    '''Return a table listing the entries of every vector in `self._vector`.

    The first column, 'Variables', holds the names 'X1' ... 'X<shape>'.
    One further column 'Func<k>' is appended for the k-th vector, filled
    with the first row of `vec.tolist()`.
    '''
    info = SeriesSet()
    variable_names = ['X%d' % number for number in range(1, shape + 1)]
    info.append_col(variable_names, 'Variables')

    for position, vec in enumerate(self._vector, 1):
        first_row = vec.tolist()[0]
        info.append_col(first_row, 'Func%d' % position)
    return info
def proba2label(seq, labels):
    '''Turn predicted probabilities into labels.

    An input without a `shape` attribute is first wrapped in a SeriesSet.
    Inputs with more than one column are handled by `clf_multilabel`, all
    others by `clf_binlabel`.
    '''
    if not hasattr(seq, 'shape'):
        seq = SeriesSet(seq)

    has_many_columns = seq.shape[1] > 1
    converter = clf_multilabel if has_many_columns else clf_binlabel
    return converter(seq, labels)