Example #1
import numpy as np
import pandas as pd


def fea_weight_write(path, fn, feature_weight_list, exc_fun_label, fidx,
                     sort_flag=False, reverse_flag=False):
    """
    Write a table of feature weights to the file (path + fn).
    The output directory is also appended to the index file named by
    feature_weight_file_path.

    Input
    ----
    path: {str} directory to write the feature ranking into.
    fn: {str} file name (the '.csv' extension is appended).
    feature_weight_list: {list}, length m; each element is a numpy array of shape {n_features,}.
    exc_fun_label: {numpy array}, shape {m,}. row labels, one per ranking method.
    fidx: {numpy array}, shape {n_features,}. column labels (feature indices).

    Output
    ------
    None
    """
    num_fea = 0

    if len(feature_weight_list) > 0:
        num_fea = len(feature_weight_list[0])

    # new_path = path + "/n{0}/".format(num_fea)
    new_path = path + "/"

    create_path(new_path)

    # record new_path once in the index file; "a+" opens positioned at
    # end-of-file, so seek back to the start before reading existing entries
    with open(feature_weight_file_path, "a+") as f:
        f.seek(0)
        if new_path not in (line.rstrip('\n') for line in f):
            print(new_path, file=f)

    feature_weight_table_path = new_path + fn + '.csv'
    feature_weight_table = pd.DataFrame(data=np.array(feature_weight_list),
                                        index=exc_fun_label,
                                        columns=fidx)
    feature_weight_table.index.name = 'index name'
    print('write : ', feature_weight_table_path)
    if feature_weight_table_path is not None:
        feature_weight_table.to_csv(feature_weight_table_path,
                                    header=True,
                                    index=True)

    plot_table = feature_weight_table
    if sort_flag:
        # sort each row's weights, optionally largest first
        new_arr = np.sort(feature_weight_table.values, axis=1)
        if reverse_flag:
            new_arr = new_arr[:, ::-1]
        plot_table = pd.DataFrame(new_arr,
                                  index=feature_weight_table.index,
                                  columns=feature_weight_table.columns)
        plot_table.index.name = feature_weight_table.index.name

    plot_acc_arr(plot_table, picture_path=new_path + '/' + fn + '.png')
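
A hypothetical usage sketch (the weights, labels, and paths below are invented, and the module helpers create_path, plot_acc_arr, and feature_weight_file_path are assumed to be defined):

import numpy as np

# three made-up ranking methods scoring five features each
weights = [np.random.rand(5) for _ in range(3)]
labels = np.array(['fisher', 'relief', 'lap_score'])
fea_weight_write('./results', 'iris_weights', weights, labels,
                 np.arange(5), sort_flag=True, reverse_flag=True)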
Example #2
def save_time(fn, fun_name, save_value):
    time_file_name = 'exec_time.csv'
    columns = ['data name', 'fun name', 'which', 'time']

    time_table = pd.DataFrame(columns=columns)
    time_table.index.name = 'index name'
    if path_isExists(time_file_name):
        time_table = pd.read_csv(time_file_name)

    for which in save_value:
        t_table = pd.DataFrame(np.array(
            [fn, fun_name, which, save_value[which]]).reshape(1, -1),
                               columns=columns)

        value_dict = {columns[0]: fn, columns[1]: fun_name, columns[2]: which}
        flag = time_table.apply(lambda x: time_isExit(x, value_dict), axis=1)

        if flag.shape[0] == 0 or not flag.any():
            # no matching row yet: append the new timing
            # (DataFrame.append was removed in pandas 2.0; use pd.concat)
            time_table = pd.concat([time_table, t_table], ignore_index=True)
        else:
            # a matching row exists: average the stored time with the new
            # value, treating a missing entry as 0 (.ix was removed; use .loc)
            col = columns[3]
            time_table.loc[flag, col] = (
                time_table.loc[flag, col].astype(float).fillna(0)
                + save_value[which]) / 2.0

    time_table.to_csv(time_file_name, index=False)
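
save_value maps a label ('which') to a timing in seconds; a hypothetical call, assuming time_isExit reports whether a row matches value_dict:

# made-up timings for one method on one dataset; calling save_time
# again with the same keys averages the stored and new values
save_time('iris', 'fisher_score', {'rank_time': 0.82, 'acc_time': 3.15})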
def cal_many_acc_by_idx(x_train, y_train, x_test, y_test,
                        feature_order, idx_array, run_num=10):
    """
    calculate test accuracy for each feature-subset size in idx_array

    Input
    -----
    x_train: {numpy array}, shape {n_samples, n_features}
    y_train: {numpy array}, shape {n_samples,}
    x_test: {numpy array}, shape {n2_samples, n_features}
    y_test: {numpy array}, shape {n2_samples,}
    feature_order: {numpy array}, shape {n_features,}. feature indices, best first.
    idx_array: {numpy array}, shape {n,}. subset sizes to evaluate.

    Output
    ------
    acc_array: {numpy array}, shape {n,}

    """
    idx_array = np.array(idx_array)
    acc_array = np.zeros(idx_array.shape)
    for i, num_fea in enumerate(idx_array):
        idx = feature_order[:num_fea]
        new_x_train, new_x_test = x_train[:, idx], x_test[:, idx]
        new_y_train, new_y_test = y_train, y_test
        a = run_acc(new_x_train,
                    new_y_train,
                    new_x_test,
                    new_y_test,
                    run_num=run_num)
        acc_array[i] = a
    return acc_array
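
A sketch of the call pattern, assuming run_acc trains a classifier and returns its mean accuracy over run_num repetitions (x_train, y_train, x_test, y_test are placeholders):

import numpy as np

feature_order = np.array([3, 0, 4, 1, 2])   # hypothetical ranking, best first
idx_array = [1, 3, 5]                       # evaluate top-1, top-3, top-5 subsets
# acc_array = cal_many_acc_by_idx(x_train, y_train, x_test, y_test,
#                                 feature_order, idx_array, run_num=10)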
Example #4
def save_objectv(arr, name, output_path, sort_flag=False, reverse_flag=False):
    create_path(output_path)
    table = pd.DataFrame(np.array(arr).reshape(1, -1), index=[name])
    table.index.name = 'index name'
    table.to_csv(output_path + '/' + name + '.csv', header=True, index=True)

    plot_table = table
    if sort_flag:
        # sorted(..., reverse=True) replaces the sort-then-reverse dance
        new_arr = sorted(arr, reverse=reverse_flag)
        plot_table = pd.DataFrame(np.array(new_arr).reshape(1, -1),
                                  index=[name])
        plot_table.index.name = 'index name'

    plot_acc_arr(plot_table,
                 xlabel="iter",
                 ylabel='value',
                 picture_path=output_path + '/' + name + '.png')
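
save_objectv looks intended for per-iteration objective values; a made-up example that writes the CSV and the sorted plot:

# hypothetical convergence curve of an optimizer
save_objectv([10.2, 7.8, 6.1, 5.9, 5.85], 'ndfs_objective', './results/iris',
             sort_flag=True, reverse_flag=True)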
Example #5
def fea_rank_write(path, fn, feature_order_list, exc_fun_label, fidx):
    """
    Write a table of feature rankings to the file (path + fn).
    The output directory is also appended to the index file named by
    feature_ranking_file_path.

    Input
    ----
    path: {str} directory to write the feature ranking into.
    fn: {str} file name.
    feature_order_list: {list}, length m; each element is a numpy array of shape {n_features,}.
    exc_fun_label: {numpy array}, shape {m,}. row labels, one per ranking method.
    fidx: {numpy array}, shape {n_features,}. column labels (feature indices).

    Output
    ------
    None
    """
    num_fea = 0

    if len(feature_order_list) > 0:
        num_fea = len(feature_order_list[0])

    new_path = path + "/n{0}/".format(num_fea)

    create_path(new_path)

    # "a+" opens positioned at end-of-file, so seek back before reading
    with open(feature_ranking_file_path, "a+") as f:
        f.seek(0)
        if new_path not in (line.rstrip('\n') for line in f):
            print(new_path, file=f)

    feature_order_table_path = new_path + fn
    feature_order_table = pd.DataFrame(data=np.array(feature_order_list),
                                       index=exc_fun_label,
                                       columns=fidx)
    feature_order_table.index.name = 'index name'
    print('write : ', feature_order_table_path)
    if feature_order_table_path is not None:
        feature_order_table.to_csv(feature_order_table_path,
                                   header=True,
                                   index=True)
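
Unlike fea_weight_write, this function stores feature orderings rather than raw weights; an ordering can be derived from a weight vector with np.argsort (values below are illustrative):

import numpy as np

weights = np.array([0.1, 0.9, 0.4])
order = np.argsort(-weights)    # descending weights -> array([1, 2, 0])
# fea_rank_write('./results', 'iris_rank', [order],
#                np.array(['fisher']), np.arange(3))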
def my_normalized_mutual_info_score(labels_true, labels_pred):
    """Normalized Mutual Information between two clusterings

    This variant changes the normalization term: np.sqrt(h_true * h_pred) -> max(h_true, h_pred)

    Normalized Mutual Information (NMI) is a normalization of the Mutual
    Information (MI) score to scale the results between 0 (no mutual
    information) and 1 (perfect correlation). In this function, mutual
    information is normalized by ``max(H(labels_true), H(labels_pred))``.

    This measure is not adjusted for chance. Therefore
    :func:`adjusted_mutual_info_score` might be preferred.

    This metric is independent of the absolute values of the labels:
    a permutation of the class or cluster label values won't change the
    score value in any way.

    This metric is furthermore symmetric: switching ``labels_true`` with
    ``labels_pred`` will return the same score value. This can be useful to
    measure the agreement of two independent label assignment strategies
    on the same dataset when the real ground truth is not known.

    Read more in the :ref:`User Guide <mutual_info_score>`.

    Parameters
    ----------
    labels_true : int array, shape = [n_samples]
        A clustering of the data into disjoint subsets.

    labels_pred : array, shape = [n_samples]
        A clustering of the data into disjoint subsets.

    Returns
    -------
    nmi: float
       score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling

    See also
    --------
    adjusted_rand_score: Adjusted Rand Index
    adjusted_mutual_info_score: Adjusted Mutual Information (adjusted
        against chance)

    Examples
    --------

    Perfect labelings are both homogeneous and complete, hence have
    score 1.0::

      >>> my_normalized_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1])
      1.0
      >>> my_normalized_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0])
      1.0

    If class members are completely split across different clusters,
    the assignment is totally incomplete, hence the NMI is null::

      >>> my_normalized_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3])
      0.0

    """
    # both labelings must be 1-D and of the same length
    labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
    classes = np.unique(labels_true)
    clusters = np.unique(labels_pred)
    # Special limit cases: no clustering since the data is not split.
    # This is a perfect match hence return 1.0.
    if (classes.shape[0] == clusters.shape[0] == 1
        or classes.shape[0] == clusters.shape[0] == 0):
        return 1.0
    contingency = contingency_matrix(labels_true, labels_pred)
    contingency = np.array(contingency, dtype='float')
    # Calculate the MI for the two clusterings
    mi = mutual_info_score(labels_true, labels_pred,
                           contingency=contingency)
    # Calculate the expected value for the mutual information
    # Calculate entropy for each labeling
    h_true, h_pred = entropy(labels_true), entropy(labels_pred)
    nmi = mi / max(max(h_true, h_pred), 1e-10)
    return nmi
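
Since max(h_true, h_pred) >= sqrt(h_true * h_pred), this variant is never larger than the geometric-mean NMI. A comparison sketch, assuming the sklearn helpers used above (check_clusterings, contingency_matrix, entropy) are imported in this module:

from sklearn.metrics import normalized_mutual_info_score

a = [0, 0, 1, 1, 2, 2]
b = [0, 0, 1, 1, 1, 2]
print(normalized_mutual_info_score(a, b, average_method='geometric'))
print(my_normalized_mutual_info_score(a, b))   # <= the value above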
def mutual_info_score(labels_true, labels_pred, contingency=None):
    """Mutual Information between two clusterings

    The Mutual Information is a measure of the similarity between two labels of
    the same data. Where :math:`P(i)` is the probability of a random sample
    occurring in cluster :math:`U_i` and :math:`P'(j)` is the probability of a
    random sample occurring in cluster :math:`V_j`, the Mutual Information
    between clusterings :math:`U` and :math:`V` is given as:

    .. math::

        MI(U,V)=\sum_{i=1}^R \sum_{j=1}^C P(i,j)\log\\frac{P(i,j)}{P(i)P'(j)}


    This is equal to the Kullback-Leibler divergence of the joint distribution
    with the product distribution of the marginals.

    This metric is independent of the absolute values of the labels:
    a permutation of the class or cluster label values won't change the
    score value in any way.

    This metric is furthermore symmetric: switching ``labels_true`` with
    ``labels_pred`` will return the same score value. This can be useful to
    measure the agreement of two independent label assignment strategies
    on the same dataset when the real ground truth is not known.

    Read more in the :ref:`User Guide <mutual_info_score>`.

    Parameters
    ----------
    labels_true : int array, shape = [n_samples]
        A clustering of the data into disjoint subsets.

    labels_pred : array, shape = [n_samples]
        A clustering of the data into disjoint subsets.

    contingency: None or array, shape = [n_classes_true, n_classes_pred]
        A contingency matrix given by the :func:`contingency_matrix` function.
        If value is ``None``, it will be computed, otherwise the given value is
        used, with ``labels_true`` and ``labels_pred`` ignored.

    Returns
    -------
    mi: float
       Mutual information, a non-negative value

    See also
    --------
    adjusted_mutual_info_score: Adjusted against chance Mutual Information
    normalized_mutual_info_score: Normalized Mutual Information
    """
    if contingency is None:
        labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
        contingency = contingency_matrix(labels_true, labels_pred)
    contingency = np.array(contingency, dtype='float')

    # total number of samples
    contingency_sum = np.sum(contingency)
    # marginal counts for the true labels (rows)
    pi = np.sum(contingency, axis=1)
    # marginal counts for the predicted labels (columns)
    pj = np.sum(contingency, axis=0)
    # outer[i, j] = pi[i] * pj[j]
    outer = np.outer(pi, pj)
    nnz = contingency != 0.0
    # pick out the nonzero cells as a flat 1-D array
    contingency_nm = contingency[nnz]
    # log(N(i, j))
    log_contingency_nm = np.log(contingency_nm)
    # contingency_nm becomes p(i, j)
    contingency_nm /= contingency_sum
    # log(a / b) is computed as log(a) - log(b) to avoid loss of precision;
    # pi.sum() == pj.sum(), both equal the total sample count
    # log_outer = -log(p(i) p(j))
    log_outer = -np.log(outer[nnz]) + math.log(pi.sum()) + math.log(pj.sum())
    # log_contingency_nm - log(contingency_sum) = log(p(i, j))
    mi = (contingency_nm * (log_contingency_nm - math.log(contingency_sum))
          + contingency_nm * log_outer)
    return mi.sum()
    return mi.sum()
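
A quick sanity check: the MI of a labeling with itself equals its entropy (natural log), so two balanced classes give log 2 (again assuming the sklearn helper imports are in scope):

import numpy as np

labels = np.array([0, 0, 1, 1])
# the contingency of a labeling with itself is diagonal: [[2, 0], [0, 2]]
print(mutual_info_score(labels, labels))   # log(2) ~= 0.6931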