Ejemplo n.º 1
0
def get_feature_clusters(X: pd.DataFrame, dependence_metric: str, distance_metric: str = None,
                         linkage_method: str = None, n_clusters: int = None, critical_threshold: float = 0.0) -> list:
    """
    Machine Learning for Asset Managers
    Snippet 6.5.2.1, page 85. Step 1: Features Clustering

    Splits the given set of features into clustered feature subsets.

    Two clustering routes exist. If ``n_clusters`` is None and either ``distance_metric`` or ``linkage_method``
    is None, the clusters found by the ONC algorithm are returned directly. Otherwise hierarchical clustering
    is run on a distance matrix derived from the dependence matrix; when ``n_clusters`` is None in that case,
    the number of clusters found by ONC is used.

    :param X: (pd.DataFrame) Dataframe of features.
    :param dependence_metric: (str) Method used to generate the dependence matrix: 'linear' (uses ``X.corr()``),
                              'information_variation', 'mutual_information' or 'distance_correlation'.
    :param distance_metric: (str) Distance operator used to generate the distance matrix. The methods that can
                            be applied are: 'angular', 'squared_angular', 'absolute_angular'. Set it to None if
                            the clusters are to be taken as produced by the ONC algorithm.
    :param linkage_method: (str) Linkage method used for clustering. Methods include: 'single', 'ward',
                           'complete', 'average', 'weighted', and 'centroid'. Set it to None if the clusters
                           are to be taken as produced by the ONC algorithm.
    :param n_clusters: (int) Number of clusters to form. Must be less than the total number of features. If None,
                       the number of clusters is decided by the ONC algorithm.
    :param critical_threshold: (float) Threshold for detecting a low silhouette score. It can be any real number
                               in [-1, +1]. The default is 0, meaning any feature with a silhouette score below 0
                               is identified as having a low silhouette, in which case the required transformation
                               is applied to correct it.
    :return: (list) Feature subsets.
    :raises ValueError: If the number of clusters is not less than the number of columns of the (possibly
                        transformed) feature dataframe.
    """

    # Dependence matrix: plain correlation for 'linear', otherwise delegate to the dependence helper
    if dependence_metric == 'linear':
        dep_matrix = X.corr()
    else:
        dep_matrix = get_dependence_matrix(X, dependence_method=dependence_metric)

    # The silhouette check may hand back a transformed dataframe in place of X.
    # NOTE(review): dep_matrix is not recomputed afterwards, so everything below still works on the
    # dependence matrix of the original X - confirm this is intended.
    X = _check_for_low_silhouette_scores(X, dep_matrix, critical_threshold)

    if n_clusters is None:
        # ONC is needed either for the final answer or just for the cluster count
        onc_clusters = get_onc_clusters(dep_matrix.fillna(0))[1]
        if distance_metric is None or linkage_method is None:
            return list(onc_clusters.values())
        n_clusters = len(onc_clusters)

    # The number of clusters may not reach the number of features
    n_features = len(X.columns)
    if n_clusters >= n_features:
        raise ValueError('Number of clusters must be less than the number of features')

    # Turn dependence into distance, then run hierarchical clustering on the condensed form
    dist_matrix = get_distance_matrix(dep_matrix, distance_metric=distance_metric)
    condensed = squareform(dist_matrix)
    link = linkage(condensed, method=linkage_method)
    labels = fcluster(link, t=n_clusters, criterion='maxclust')

    # Collect the column names per cluster id, for ids 1..n_clusters in order
    clustered_subsets = []
    for cluster_id in range(1, n_clusters + 1):
        members = [feature for label, feature in zip(labels, X.columns) if label == cluster_id]
        clustered_subsets.append(members)

    return clustered_subsets
Ejemplo n.º 2
0
def _check_for_low_silhouette_scores(X: pd.DataFrame, dep_matrix: pd.DataFrame,
                                     critical_threshold: float = 0.0) -> pd.DataFrame:
    """
    Machine Learning for Asset Managers
    Snippet 6.5.2.1, page 85. Step 1: Features Clustering (last paragraph)

    Checks whether the dataset contains features with a low silhouette score, which can happen when one
    feature is a combination of multiple features across clusters. This is a problem because ONC cannot
    assign one feature to multiple clusters, so such features need a transformation.

    The ONC algorithm is run on the dependence matrix. If any feature has a silhouette score strictly below
    the threshold, the dataset is passed through the cluster transformation and that result is returned;
    otherwise X is returned unchanged. A message describing the outcome is printed in both cases.

    :param X: (pd.DataFrame) Dataframe of features.
    :param dep_matrix: (pd.DataFrame) Dataframe with dependences between features.
    :param critical_threshold: (float) Threshold for determining low silhouette score.
    :return: (pd.DataFrame) Dataframe of features.
    """
    _, onc_clusters, silhouette = get_onc_clusters(dep_matrix)

    # Index labels of the entries whose silhouette score falls strictly below the threshold
    below_threshold = silhouette < critical_threshold
    flagged = silhouette[below_threshold].index
    n_flagged = len(flagged)

    if n_flagged == 0:
        print('No feature/s found with low silhouette score. All features belongs to its respective clusters')
        return X

    print(f'{n_flagged} feature/s found with low silhouette score {flagged}. Returning the transformed dataset')

    # Returning the transformed dataset
    return _cluster_transformation(X, onc_clusters, flagged)