Beispiel #1
0
class HierarchicalClusteringAssetAllocation:
    """
    This class implements the Hierarchical Equal Risk Contribution (HERC) algorithm and it's extended components mentioned in the
    following papers: `Raffinot, Thomas, The Hierarchical Equal Risk Contribution Portfolio (August 23,
    2018). <https://ssrn.com/abstract=3237540>`_; and `Raffinot, Thomas, Hierarchical Clustering Based Asset Allocation (May 2017)
    <https://ssrn.com/abstract=2840729>`_;

    While the vanilla Hierarchical Risk Parity algorithm uses only the variance as a risk measure for assigning weights, the HERC
    algorithm proposed by Raffinot, allows investors to use other risk metrics like Expected Shortfall, Sharpe Ratio and
    Conditional Drawdown. Furthermore, it is flexible enough to be easily extended to include custom risk measures of our own.
    """

    def __init__(self, calculate_expected_returns='mean'):
        """
        Constructor.

        :param calculate_expected_returns: (str) the method to use for calculation of expected returns.
        Currently supports "mean" and "exponential"
        """

        self.weights = list()
        self.clusters = None
        self.ordered_indices = None
        self.returns_estimator = ReturnsEstimation()
        self.risk_metrics = RiskMetrics()
        self.calculate_expected_returns = calculate_expected_returns

    @staticmethod
    def _compute_cluster_inertia(labels, asset_returns):
        """
        Calculate the cluster inertia (within cluster sum-of-squares).

        :param labels: (list) cluster labels
        :param asset_returns: (pd.DataFrame) historical asset returns
        :return: (float) cluster inertia value
        """

        unique_labels = np.unique(labels)
        inertia = [np.mean(pairwise_distances(asset_returns[:, labels == label])) for label in unique_labels]
        inertia = np.log(np.sum(inertia))
        return inertia

    def _get_optimal_number_of_clusters(self,
                                        correlation,
                                        asset_returns,
                                        num_reference_datasets=5,
                                        max_number_of_clusters=10):
        """
        Find the optimal number of clusters for hierarchical clustering using the Gap statistic.

        :param correlation: (np.array) matrix of asset correlations
        :param asset_returns: (pd.DataFrame) historical asset returns
        :param num_reference_datasets: (int) the number of reference datasets to generate for calculating expected inertia
        :param max_number_of_clusters: (int) the maximum number of clusters to check for finding the optimal value
        :return: (int) the optimal number of clusters
        """

        cluster_func = AgglomerativeClustering(affinity='precomputed', linkage='single')
        original_distance_matrix = np.sqrt(2 * (1 - correlation).round(5))
        gap_values = []
        for num_clusters in range(1, max_number_of_clusters + 1):
            cluster_func.n_clusters = num_clusters

            # Calculate expected inertia from reference datasets
            reference_inertias = []
            for _ in range(num_reference_datasets):

                # Generate reference returns from uniform distribution and calculate the distance matrix.
                reference_asset_returns = pd.DataFrame(np.random.rand(*asset_returns.shape))
                reference_correlation = np.array(reference_asset_returns.corr())
                reference_distance_matrix = np.sqrt(2 * (1 - reference_correlation).round(5))

                reference_cluster_assignments = cluster_func.fit_predict(reference_distance_matrix)
                inertia = self._compute_cluster_inertia(reference_cluster_assignments, reference_asset_returns.values)
                reference_inertias.append(inertia)
            expected_inertia = np.mean(reference_inertias)

            # Calculate inertia from original data
            original_cluster_asignments = cluster_func.fit_predict(original_distance_matrix)
            inertia = self._compute_cluster_inertia(original_cluster_asignments, asset_returns.values)

            # Calculate the gap statistic
            gap = expected_inertia - inertia
            gap_values.append(gap)

        return np.argmax(gap_values)

    @staticmethod
    def _tree_clustering(correlation, num_clusters):
        """
        Perform agglomerative clustering on the current portfolio.

        :param correlation: (np.array) matrix of asset correlations
        :param num_clusters: (int) the number of clusters
        :return: (list) structure of hierarchical tree
        """

        cluster_func = AgglomerativeClustering(n_clusters=num_clusters,
                                               affinity='precomputed',
                                               linkage='single')
        distance_matrix = np.sqrt(2 * (1 - correlation).round(5))
        cluster_func.fit(distance_matrix)
        return cluster_func.children_

    def _quasi_diagnalization(self, num_assets, curr_index):
        """
        Rearrange the assets to reorder them according to hierarchical tree clustering order.

        :param num_assets: (int) the total number of assets
        :param curr_index: (int) current index
        :return: (list) the assets rearranged according to hierarchical clustering
        """

        if curr_index < num_assets:
            return [curr_index]

        left = int(self.clusters[curr_index - num_assets, 0])
        right = int(self.clusters[curr_index - num_assets, 1])

        return (self._quasi_diagnalization(num_assets, left) + self._quasi_diagnalization(num_assets, right))

    @staticmethod
    def _get_inverse_variance_weights(covariance):
        '''
        Calculate the inverse variance weight allocations.

        :param covariance: (pd.DataFrame) covariance matrix of assets
        :return: (list) inverse variance weight values
        '''

        inv_diag = 1 / np.diag(covariance.values)
        parity_w = inv_diag * (1 / np.sum(inv_diag))
        return parity_w

    def _get_cluster_variance(self, covariance, cluster_indices):
        """
        Calculate cluster variance.

        :param covariance: (pd.DataFrame) covariance matrix of assets
        :param cluster_indices: (list) list of asset indices for the cluster
        :return: (float) variance of the cluster
        """

        cluster_covariance = covariance.iloc[cluster_indices, cluster_indices]
        parity_w = self._get_inverse_variance_weights(cluster_covariance)
        cluster_variance = self.risk_metrics.calculate_variance(covariance=cluster_covariance, weights=parity_w)
        return cluster_variance

    def _get_cluster_sharpe_ratio(self, expected_asset_returns, covariance, cluster_indices):
        """
        Calculate cluster Sharpe Ratio.

        :param expected_asset_returns: (list) a list of mean asset returns (mu)
        :param covariance: (pd.DataFrame) covariance matrix of assets
        :param cluster_indices: (list) list of asset indices for the cluster
        :return: (float) sharpe ratio of the cluster
        """

        cluster_expected_returns = expected_asset_returns[cluster_indices]
        cluster_covariance = covariance.iloc[cluster_indices, cluster_indices]
        parity_w = self._get_inverse_variance_weights(cluster_covariance)
        cluster_variance = self.risk_metrics.calculate_variance(covariance=cluster_covariance, weights=parity_w)
        cluster_sharpe_ratio = (parity_w @ cluster_expected_returns) / np.sqrt(cluster_variance)
        return cluster_sharpe_ratio

    def _get_cluster_expected_shortfall(self, asset_returns, covariance, confidence_level, cluster_indices):
        """
        Calculate cluster expected shortfall.

        :param asset_returns: (pd.DataFrame) historical asset returns
        :param covariance: (pd.DataFrame) covariance matrix of assets
        :param confidence_level: (float) the confidence level (alpha)
        :param cluster_indices: (list) list of asset indices for the cluster
        :return: (float) expected shortfall of the cluster
        """

        cluster_asset_returns = asset_returns.iloc[:, cluster_indices]
        cluster_covariance = covariance.iloc[cluster_indices, cluster_indices]
        parity_w = self._get_inverse_variance_weights(cluster_covariance)
        portfolio_returns = cluster_asset_returns @ parity_w
        cluster_expected_shortfall = self.risk_metrics.calculate_expected_shortfall(returns=portfolio_returns,
                                                                                    confidence_level=confidence_level)
        return cluster_expected_shortfall

    def _get_cluster_conditional_drawdown_at_risk(self, asset_returns, covariance, confidence_level, cluster_indices):
        """
        Calculate cluster conditional drawdown at risk.

        :param asset_returns: (pd.DataFrame) historical asset returns
        :param covariance: (pd.DataFrame) covariance matrix of assets
        :param confidence_level: (float) the confidence level (alpha)
        :param cluster_indices: (list) list of asset indices for the cluster
        :return: (float) CDD of the cluster
        """

        cluster_asset_returns = asset_returns.iloc[:, cluster_indices]
        cluster_covariance = covariance.iloc[cluster_indices, cluster_indices]
        parity_w = self._get_inverse_variance_weights(cluster_covariance)
        portfolio_returns = cluster_asset_returns @ parity_w
        cluster_conditional_drawdown = self.risk_metrics.calculate_conditional_drawdown_risk(returns=portfolio_returns,
                                                                                             confidence_level=confidence_level)
        return cluster_conditional_drawdown

    def _recursive_bisection(self,
                             expected_asset_returns,
                             asset_returns,
                             covariance_matrix,
                             assets,
                             allocation_metric,
                             confidence_level):
        # pylint: disable=bad-continuation, too-many-locals
        """
        Recursively assign weights to the clusters - ultimately assigning weights to the individual assets.

        :param expected_asset_returns: (list) a list of mean asset returns (mu)
        :param asset_returns: (pd.DataFrame) historical asset returns
        :param covariance_matrix: (pd.DataFrame) the covariance matrix
        :param assets: (list) list of asset names in the portfolio
        :param allocation_metric: (str) the metric used for calculating weight allocations
        :param confidence_level: (float) the confidence level (alpha)
        """

        self.weights = pd.Series(1, index=self.ordered_indices)
        clustered_alphas = [self.ordered_indices]

        while clustered_alphas:
            clustered_alphas = [cluster[start:end]
                                for cluster in clustered_alphas
                                for start, end in ((0, len(cluster) // 2), (len(cluster) // 2, len(cluster)))
                                if len(cluster) > 1]

            for subcluster in range(0, len(clustered_alphas), 2):
                left_cluster = clustered_alphas[subcluster]
                right_cluster = clustered_alphas[subcluster + 1]

                # Calculate allocation factor based on the metric
                if allocation_metric == 'minimum_variance':
                    left_cluster_variance = self._get_cluster_variance(covariance_matrix, left_cluster)
                    right_cluster_variance = self._get_cluster_variance(covariance_matrix, right_cluster)
                    alloc_factor = 1 - left_cluster_variance / (left_cluster_variance + right_cluster_variance)
                elif allocation_metric == 'minimum_standard_deviation':
                    left_cluster_sd = np.sqrt(self._get_cluster_variance(covariance_matrix, left_cluster))
                    right_cluster_sd = np.sqrt(self._get_cluster_variance(covariance_matrix, right_cluster))
                    alloc_factor = 1 - left_cluster_sd / (left_cluster_sd + right_cluster_sd)
                elif allocation_metric == 'sharpe_ratio':
                    left_cluster_sharpe_ratio = self._get_cluster_sharpe_ratio(expected_asset_returns,
                                                                               covariance_matrix,
                                                                               left_cluster)
                    right_cluster_sharpe_ratio = self._get_cluster_sharpe_ratio(expected_asset_returns,
                                                                                covariance_matrix,
                                                                                right_cluster)
                    alloc_factor = left_cluster_sharpe_ratio / (left_cluster_sharpe_ratio + right_cluster_sharpe_ratio)

                    if alloc_factor < 0 or alloc_factor > 1:
                        left_cluster_variance = self._get_cluster_variance(covariance_matrix, left_cluster)
                        right_cluster_variance = self._get_cluster_variance(covariance_matrix, right_cluster)
                        alloc_factor = 1 - left_cluster_variance / (left_cluster_variance + right_cluster_variance)
                elif allocation_metric == 'expected_shortfall':
                    left_cluster_expected_shortfall = self._get_cluster_expected_shortfall(asset_returns=asset_returns,
                                                                                           covariance=covariance_matrix,
                                                                                           confidence_level=confidence_level,
                                                                                           cluster_indices=left_cluster)
                    right_cluster_expected_shortfall = self._get_cluster_expected_shortfall(asset_returns=asset_returns,
                                                                                           covariance=covariance_matrix,
                                                                                           confidence_level=confidence_level,
                                                                                           cluster_indices=right_cluster)
                    alloc_factor = \
                        1 - left_cluster_expected_shortfall / (left_cluster_expected_shortfall + right_cluster_expected_shortfall)
                elif allocation_metric == 'conditional_drawdown_risk':
                    left_cluster_conditional_drawdown = self._get_cluster_conditional_drawdown_at_risk(asset_returns=asset_returns,
                                                         covariance=covariance_matrix,
                                                         confidence_level=confidence_level,
                                                         cluster_indices=left_cluster)
                    right_cluster_conditional_drawdown = self._get_cluster_conditional_drawdown_at_risk(asset_returns=asset_returns,
                                                         covariance=covariance_matrix,
                                                         confidence_level=confidence_level,
                                                         cluster_indices=right_cluster)
                    alloc_factor = \
                        1 - left_cluster_conditional_drawdown / (left_cluster_conditional_drawdown + right_cluster_conditional_drawdown)
                else:
                    alloc_factor = 0.5 # equal weighting

                # Assign weights to each sub-cluster
                self.weights[left_cluster] *= alloc_factor
                self.weights[right_cluster] *= 1 - alloc_factor

        # Assign actual asset values to weight index
        self.weights.index = assets[self.ordered_indices]
        self.weights = pd.DataFrame(self.weights)
        self.weights = self.weights.T

    @staticmethod
    def _cov2corr(covariance):
        """
        Calculate the correlations from asset returns covariance matrix.

        :param covariance: (pd.DataFrame) asset returns covariances
        :return: (pd.DataFrame) correlations between asset returns
        """

        d_matrix = np.zeros_like(covariance)
        diagnoal_sqrt = np.sqrt(np.diag(covariance))
        np.fill_diagonal(d_matrix, diagnoal_sqrt)
        d_inv = np.linalg.inv(d_matrix)
        corr = np.dot(np.dot(d_inv, covariance), d_inv)
        corr = pd.DataFrame(corr, index=covariance.columns, columns=covariance.columns)
        return corr

    @staticmethod
    def _perform_checks(asset_prices, asset_returns, covariance_matrix, allocation_metric):
        # pylint: disable=bad-continuation
        """
        Perform initial warning checks.

        :param asset_prices: (pd.DataFrame) a dataframe of historical asset prices (daily close)
                                            indexed by date
        :param asset_returns: (pd.DataFrame/numpy matrix) user supplied matrix of asset returns
        :param covariance_matrix: (pd.DataFrame/numpy matrix) user supplied covariance matrix of asset returns
        :param allocation_metric: (str) the metric used for calculating weight allocations
        :return:
        """

        if asset_prices is None and asset_returns is None and covariance_matrix is None:
            raise ValueError("You need to supply either raw prices or returns or a covariance matrix of asset returns")

        if asset_prices is not None:
            if not isinstance(asset_prices, pd.DataFrame):
                raise ValueError("Asset prices matrix must be a dataframe")
            if not isinstance(asset_prices.index, pd.DatetimeIndex):
                raise ValueError("Asset prices dataframe must be indexed by date.")

        if allocation_metric not in \
                {'minimum_variance', 'minimum_standard_deviation', 'sharpe_ratio',
                 'equal_weighting', 'expected_shortfall', 'conditional_drawdown_risk'}:
            raise ValueError("Unknown allocation metric specified. Supported metrics are - minimum_variance, "
                             "minimum_standard_deviation, sharpe_ratio, equal_weighting, expected_shortfall, "
                             "conditional_drawdown_risk")

    def allocate(self,
                 asset_names,
                 asset_prices=None,
                 asset_returns=None,
                 covariance_matrix=None,
                 expected_asset_returns=None,
                 allocation_metric='equal_weighting',
                 confidence_level=0.05,
                 optimal_num_clusters=None,
                 resample_by=None):
        """
        Calculate asset allocations using the HCAA algorithm.

        :param asset_names: (list) a list of strings containing the asset names
        :param asset_prices: (pd.DataFrame) a dataframe of historical asset prices (daily close)
                                            indexed by date
        :param asset_returns: (pd.DataFrame/numpy matrix) user supplied matrix of asset returns
        :param covariance_matrix: (pd.DataFrame/numpy matrix) user supplied covariance matrix of asset returns
        :param expected_asset_returns: (list) a list of mean asset returns (mu)
        :param allocation_metric: (str) the metric used for calculating weight allocations
        :param confidence_level: (float) the confidence level (alpha) used for calculating expected shortfall and conditional
                                         drawdown at risk
        :param optimal_num_clusters: (int) optimal number of clusters for hierarchical clustering
        :param resample_by: (str) specifies how to resample the prices - weekly, daily, monthly etc.. Defaults to
                                  None for no resampling
        """

        # Perform initial checks
        self._perform_checks(asset_prices, asset_returns, covariance_matrix, allocation_metric)

        # Calculate the expected returns if the user does not supply any returns
        if allocation_metric == 'sharpe_ratio' and expected_asset_returns is None:
            if asset_prices is None:
                raise ValueError(
                    "Either provide pre-calculated expected returns or give raw asset prices for inbuilt returns calculation")

            if self.calculate_expected_returns == "mean":
                expected_asset_returns = self.returns_estimator.calculate_mean_historical_returns(
                    asset_prices=asset_prices,
                    resample_by=resample_by)
            elif self.calculate_expected_returns == "exponential":
                expected_asset_returns = self.returns_estimator.calculate_exponential_historical_returns(
                    asset_prices=asset_prices,
                    resample_by=resample_by)
            else:
                raise ValueError("Unknown returns specified. Supported returns - mean, exponential")

        # Calculate the returns if the user does not supply a returns dataframe
        if asset_returns is None:
            asset_returns = self.returns_estimator.calculate_returns(asset_prices=asset_prices, resample_by=resample_by)
        asset_returns = pd.DataFrame(asset_returns, columns=asset_names)

        # Calculate covariance of returns or use the user specified covariance matrix
        if covariance_matrix is None:
            covariance_matrix = asset_returns.cov()
        cov = pd.DataFrame(covariance_matrix, index=asset_names, columns=asset_names)

        # Calculate correlation from covariance matrix
        corr = self._cov2corr(covariance=cov)

        # Calculate the optimal number of clusters using the Gap statistic
        if not optimal_num_clusters:
            optimal_num_clusters = self._get_optimal_number_of_clusters(correlation=corr, asset_returns=asset_returns)

        # Tree Clustering
        self.clusters = self._tree_clustering(correlation=corr, num_clusters=optimal_num_clusters)

        # Quasi Diagnalization
        num_assets = len(asset_names)
        self.ordered_indices = self._quasi_diagnalization(num_assets, 2 * num_assets - 2)

        # Recursive Bisection
        self._recursive_bisection(expected_asset_returns=expected_asset_returns,
                                  asset_returns=asset_returns,
                                  covariance_matrix=cov,
                                  assets=asset_names,
                                  allocation_metric=allocation_metric,
                                  confidence_level=confidence_level)
Beispiel #2
0
class HierarchicalClusteringAssetAllocation:
    """
    This class implements the Hierarchical Equal Risk Contribution (HERC) algorithm and it's extended components mentioned in the
    following papers: `Raffinot, Thomas, The Hierarchical Equal Risk Contribution Portfolio (August 23,
    2018). <https://ssrn.com/abstract=3237540>`_; and `Raffinot, Thomas, Hierarchical Clustering Based Asset Allocation (May 2017)
    <https://ssrn.com/abstract=2840729>`_;

    While the vanilla Hierarchical Risk Parity algorithm uses only the variance as a risk measure for assigning weights, the HERC
    algorithm proposed by Raffinot, allows investors to use other risk metrics like Expected Shortfall, Sharpe Ratio and
    Conditional Drawdown. Furthermore, it is flexible enough to be easily extended to include custom risk measures of our own.
    """

    def __init__(self, calculate_expected_returns='mean', confidence_level=0.05):
        """
        Initialise.

        :param calculate_expected_returns: (str) The method to use for calculation of expected returns.
                                                 Currently supports: ``mean``, ``exponential``.
        :param confidence_level: (float) The confidence level (alpha) used for calculating expected shortfall and conditional
                                         drawdown at risk.
        """

        self.weights = list()
        self.clusters = None
        self.ordered_indices = None
        self.cluster_children = None
        self.returns_estimator = ReturnsEstimators()
        self.risk_metrics = RiskMetrics()
        self.calculate_expected_returns = calculate_expected_returns
        self.confidence_level = confidence_level

    def allocate(self, asset_names=None, asset_prices=None, asset_returns=None, covariance_matrix=None,
                 expected_asset_returns=None, allocation_metric='equal_weighting', linkage='ward',
                 optimal_num_clusters=None):
        # pylint: disable=too-many-branches
        """
        Calculate asset allocations using the HCAA algorithm.

        :param asset_names: (list) A list of strings containing the asset names.
        :param asset_prices: (pd.DataFrame) A dataframe of historical asset prices (daily close)
                                            indexed by date.
        :param asset_returns: (pd.DataFrame/numpy matrix) User supplied matrix of asset returns.
        :param covariance_matrix: (pd.DataFrame/numpy matrix) User supplied covariance matrix of asset returns.
        :param expected_asset_returns: (list) A list of mean asset returns (mu).
        :param allocation_metric: (str) The metric used for calculating weight allocations. Supported strings - ``equal_weighting``,
                                        ``minimum_variance``, ``minimum_standard_deviation``, ``sharpe_ratio``,
                                        ``expected_shortfall``, ``conditional_drawdown_risk``.
        :param linkage: (str) The type of linkage method to use for clustering. Supported strings - ``single``, ``average``,
                              ``complete``, ``ward``.
        :param optimal_num_clusters: (int) Optimal number of clusters for hierarchical clustering.
        """

        # Perform initial checks
        self._perform_checks(asset_prices, asset_returns, expected_asset_returns, allocation_metric)

        # Calculate the expected returns if the user does not supply any returns (only required for sharpe_ratio allocation metric)
        if allocation_metric == 'sharpe_ratio' and expected_asset_returns is None:
            if self.calculate_expected_returns == "mean":
                expected_asset_returns = self.returns_estimator.calculate_mean_historical_returns(
                    asset_prices=asset_prices)
            elif self.calculate_expected_returns == "exponential":
                expected_asset_returns = self.returns_estimator.calculate_exponential_historical_returns(
                    asset_prices=asset_prices)
            else:
                raise ValueError("Unknown returns specified. Supported returns - mean, exponential")

        if asset_names is None:
            if asset_prices is not None:
                asset_names = asset_prices.columns
            elif asset_returns is not None and isinstance(asset_returns, pd.DataFrame):
                asset_names = asset_returns.columns
            else:
                raise ValueError("Please provide a list of asset names")

        # Calculate the returns if the user does not supply a returns dataframe
        if asset_returns is None:
            if allocation_metric in {'expected_shortfall', 'conditional_drawdown_risk'} or \
                    covariance_matrix is None or not optimal_num_clusters:
                asset_returns = self.returns_estimator.calculate_returns(asset_prices=asset_prices)
        asset_returns = pd.DataFrame(asset_returns, columns=asset_names)

        # Calculate covariance of returns or use the user specified covariance matrix
        if covariance_matrix is None:
            covariance_matrix = asset_returns.cov()
        cov = pd.DataFrame(covariance_matrix, index=asset_names, columns=asset_names)

        # Calculate correlation from covariance matrix
        corr = self._cov2corr(covariance=cov)

        # Calculate the optimal number of clusters
        if not optimal_num_clusters:
            optimal_num_clusters = self._get_optimal_number_of_clusters(correlation=corr,
                                                                        linkage=linkage,
                                                                        asset_returns=asset_returns)
        else:
            optimal_num_clusters = self._check_max_number_of_clusters(num_clusters=optimal_num_clusters,
                                                                      linkage=linkage,
                                                                      correlation=corr)

        # Tree Clustering
        self.clusters, self.cluster_children = self._tree_clustering(correlation=corr,
                                                                     num_clusters=optimal_num_clusters,
                                                                     linkage=linkage)

        # Get the flattened order of assets in hierarchical clustering tree
        num_assets = len(asset_names)
        self.ordered_indices = self._quasi_diagnalization(num_assets, 2 * num_assets - 2)

        # Recursive Bisection
        self._recursive_bisection(expected_asset_returns=expected_asset_returns,
                                  asset_returns=asset_returns,
                                  covariance_matrix=cov,
                                  assets=asset_names,
                                  allocation_metric=allocation_metric,
                                  optimal_num_clusters=optimal_num_clusters)

    @staticmethod
    def _compute_cluster_inertia(labels, asset_returns):
        """
        Calculate the cluster inertia (within cluster sum-of-squares).

        :param labels: (list) Cluster labels.
        :param asset_returns: (pd.DataFrame) Historical asset returns.
        :return: (float) Cluster inertia value.
        """

        unique_labels = np.unique(labels)
        inertia = [np.mean(pairwise_distances(asset_returns[:, labels == label])) for label in unique_labels]
        inertia = np.log(np.sum(inertia))
        return inertia

    @staticmethod
    def _check_max_number_of_clusters(num_clusters, linkage, correlation):
        """
        In some cases, the optimal number of clusters value given by the users is greater than the maximum number of clusters
        possible with the given data. This function checks this and assigns the proper value to the number of clusters when the
        given value exceeds maximum possible clusters.

        :param num_clusters: (int) The number of clusters.
        :param linkage (str): The type of linkage method to use for clustering.
        :param correlation: (np.array) Matrix of asset correlations.
        :return: (int) New value for number of clusters.
        """

        distance_matrix = np.sqrt(2 * (1 - correlation).round(5))
        clusters = scipy_linkage(squareform(distance_matrix.values), method=linkage)
        clustering_inds = fcluster(clusters, num_clusters, criterion='maxclust')
        max_number_of_clusters_possible = max(clustering_inds)
        num_clusters = min(max_number_of_clusters_possible, num_clusters)
        return num_clusters

    def _get_optimal_number_of_clusters(self, correlation, asset_returns, linkage, num_reference_datasets=5):
        """
        Find the optimal number of clusters for hierarchical clustering using the Gap statistic.

        :param correlation: (np.array) Matrix of asset correlations.
        :param asset_returns: (pd.DataFrame) Historical asset returns.
        :param linkage: (str) The type of linkage method to use for clustering.
        :param num_reference_datasets: (int) The number of reference datasets to generate for calculating expected inertia.
        :return: (int) The optimal number of clusters.
        """

        original_distance_matrix = np.sqrt(2 * (1 - correlation).round(5))
        gap_values = []
        num_clusters = 1
        max_number_of_clusters = float("-inf")
        while True:

            # Calculate inertia from original data
            original_clusters = scipy_linkage(squareform(original_distance_matrix), method=linkage)
            original_cluster_assignments = fcluster(original_clusters, num_clusters, criterion='maxclust')
            if max(original_cluster_assignments) == max_number_of_clusters or max(original_cluster_assignments) > 10:
                break
            max_number_of_clusters = max(original_cluster_assignments)
            inertia = self._compute_cluster_inertia(original_cluster_assignments, asset_returns.values)

            # Calculate expected inertia from reference datasets
            expected_inertia = self._calculate_expected_inertia(num_reference_datasets, asset_returns, num_clusters, linkage)

            # Calculate the gap statistic
            gap = expected_inertia - inertia
            gap_values.append(gap)
            num_clusters += 1
        return 1 + np.argmax(gap_values)

    def _calculate_expected_inertia(self, num_reference_datasets, asset_returns, num_clusters, linkage):
        """
        Calculate the expected inertia by generating clusters from a uniform distribution.

        :param num_reference_datasets: (int) The number of reference datasets to generate from the distribution.
        :param asset_returns: (pd.DataFrame) Historical asset returns.
        :param num_clusters: (int) The number of clusters to generate.
        :param linkage: (str) The type of linkage criterion to use for hierarchical clustering.
        :return: (float) The expected inertia from the reference datasets.
        """

        reference_inertias = []
        for _ in range(num_reference_datasets):
            # Generate reference returns from uniform distribution and calculate the distance matrix.
            reference_asset_returns = pd.DataFrame(np.random.rand(*asset_returns.shape))
            reference_correlation = np.array(reference_asset_returns.corr())
            reference_distance_matrix = np.sqrt(2 * (1 - reference_correlation).round(5))

            reference_clusters = scipy_linkage(squareform(reference_distance_matrix), method=linkage)
            reference_cluster_assignments = fcluster(reference_clusters, num_clusters, criterion='maxclust')
            inertia = self._compute_cluster_inertia(reference_cluster_assignments, reference_asset_returns.values)
            reference_inertias.append(inertia)
        return np.mean(reference_inertias)

    @staticmethod
    def _tree_clustering(correlation, num_clusters, linkage):
        """
        Perform agglomerative clustering on the current portfolio.

        :param correlation: (np.array) Matrix of asset correlations.
        :param num_clusters: (int) The number of clusters.
        :param linkage (str): The type of linkage method to use for clustering.
        :return: (list) Structure of hierarchical tree.
        """

        distance_matrix = np.sqrt(2 * (1 - correlation).round(5))
        clusters = scipy_linkage(squareform(distance_matrix.values), method=linkage)
        clustering_inds = fcluster(clusters, num_clusters, criterion='maxclust')
        cluster_children = {index - 1: [] for index in range(min(clustering_inds), max(clustering_inds) + 1)}
        for index, cluster_index in enumerate(clustering_inds):
            cluster_children[cluster_index - 1].append(index)
        return clusters, cluster_children

    def _quasi_diagnalization(self, num_assets, curr_index):
        """
        Rearrange the assets to reorder them according to hierarchical tree clustering order.

        :param num_assets: (int) The total number of assets.
        :param curr_index: (int) Current index.
        :return: (list) The assets rearranged according to hierarchical clustering.
        """

        if curr_index < num_assets:
            return [curr_index]

        left = int(self.clusters[curr_index - num_assets, 0])
        right = int(self.clusters[curr_index - num_assets, 1])

        return (self._quasi_diagnalization(num_assets, left) + self._quasi_diagnalization(num_assets, right))

    def _recursive_bisection(self, expected_asset_returns, asset_returns, covariance_matrix, assets, allocation_metric,
                             optimal_num_clusters):
        """
        Recursively assign weights to the clusters - ultimately assigning weights to the individual assets.

        :param expected_asset_returns: (list) A list of mean asset returns (mu).
        :param asset_returns: (pd.DataFrame) Historical asset returns.
        :param covariance_matrix: (pd.DataFrame) The covariance matrix.
        :param assets: (list) List of asset names in the portfolio.
        :param allocation_metric: (str) The metric used for calculating weight allocations.
        :param optimal_num_clusters: (int) Optimal number of clusters for hierarchical tree clustering.
        """

        num_assets = len(assets)
        self.weights = np.ones(shape=num_assets)
        clusters_contribution = np.ones(shape=optimal_num_clusters)
        clusters_weights = np.ones(shape=optimal_num_clusters)
        clusters_variance = np.ones(shape=optimal_num_clusters)

        # Calculate the corresponding risk measure for the clusters
        self._calculate_risk_contribution_of_clusters(clusters_contribution,
                                                      clusters_variance,
                                                      allocation_metric,
                                                      optimal_num_clusters,
                                                      covariance_matrix,
                                                      expected_asset_returns,
                                                      asset_returns)

        # Recursive bisection taking into account the dendrogram structure
        for cluster_index in range(optimal_num_clusters - 1):

            # Get the left and right cluster ids
            left_cluster_ids, right_cluster_ids = self._get_children_cluster_ids(num_assets=num_assets,
                                                                                 parent_cluster_id=cluster_index)

            # Compute alpha
            left_cluster_contribution = np.sum(clusters_contribution[left_cluster_ids])
            right_cluster_contribution = np.sum(clusters_contribution[right_cluster_ids])
            if allocation_metric in {'minimum_variance', 'minimum_standard_deviation', 'expected_shortfall',
                                     'conditional_drawdown_risk'}:
                alloc_factor = 1 - left_cluster_contribution / (left_cluster_contribution + right_cluster_contribution)
            elif allocation_metric == 'sharpe_ratio':
                alloc_factor = left_cluster_contribution / (left_cluster_contribution + right_cluster_contribution)

                # If sharp ratio allocation factor is not within limits, then calculate normal cluster variance allocation
                # factor
                if alloc_factor < 0 or alloc_factor > 1:
                    left_cluster_variance = np.sum(clusters_variance[left_cluster_ids])
                    right_cluster_variance = np.sum(clusters_variance[right_cluster_ids])
                    alloc_factor = 1 - left_cluster_variance / (left_cluster_variance + right_cluster_variance)
            else:
                alloc_factor = 0.5  # equal weighting

            # Assign weights to each sub-cluster
            clusters_weights[left_cluster_ids] *= alloc_factor
            clusters_weights[right_cluster_ids] *= 1 - alloc_factor

        # Compute the final weights
        self._calculate_final_portfolio_weights(clusters_weights, covariance_matrix, optimal_num_clusters)

        # Assign actual asset names to weight index
        self.weights = pd.DataFrame(self.weights)
        self.weights.index = assets[self.ordered_indices]
        self.weights = self.weights.T

    def _calculate_final_portfolio_weights(self, clusters_weights, covariance_matrix, optimal_num_clusters):
        """
        Calculate the final asset weights.

        :param clusters_weights: (np.array) The cluster weights calculated using recursive bisection.
        :param covariance_matrix: (pd.DataFrame) The covariance matrix.
        :param optimal_num_clusters: (int) Optimal number of clusters for hierarchical tree clustering.
        """

        for cluster_index in range(optimal_num_clusters):
            cluster_asset_indices = self.cluster_children[cluster_index]
            cluster_covariance = covariance_matrix.iloc[cluster_asset_indices, cluster_asset_indices]
            ivp_weights = self._get_inverse_variance_weights(cluster_covariance)
            self.weights[cluster_asset_indices] = ivp_weights * clusters_weights[cluster_index]

    def _calculate_risk_contribution_of_clusters(self, clusters_contribution, clusters_variance, allocation_metric,
                                                 optimal_num_clusters, covariance_matrix, expected_asset_returns,
                                                 asset_returns):
        """
        Calculate the risk contribution of clusters based on the allocation metric.

        :param clusters_contribution: (np.array) The risk contribution value of the clusters.
        :param clusters_variance: (np.array) The variance of the clusters.
        :param allocation_metric: (str) The metric used for calculating weight allocations.
        :param optimal_num_clusters: (int) Optimal number of clusters for hierarchical tree clustering.
        :param covariance_matrix: (pd.DataFrame) The covariance matrix.
        :param expected_asset_returns: (list) A list of mean asset returns (mu).
        :param asset_returns: (pd.DataFrame) Historical asset returns.
        """

        for cluster_index in range(optimal_num_clusters):
            cluster_asset_indices = self.cluster_children[cluster_index]

            if allocation_metric == 'minimum_variance':
                clusters_contribution[cluster_index] = self._get_cluster_variance(covariance_matrix,
                                                                                  cluster_asset_indices)
            elif allocation_metric == 'minimum_standard_deviation':
                clusters_contribution[cluster_index] = np.sqrt(
                    self._get_cluster_variance(covariance_matrix, cluster_asset_indices))
            elif allocation_metric == 'sharpe_ratio':
                clusters_contribution[cluster_index] = self._get_cluster_sharpe_ratio(expected_asset_returns,
                                                                                      covariance_matrix,
                                                                                      cluster_asset_indices)
                clusters_variance[cluster_index] = self._get_cluster_variance(covariance_matrix, cluster_asset_indices)
            elif allocation_metric == 'expected_shortfall':
                clusters_contribution[cluster_index] = self._get_cluster_expected_shortfall(asset_returns=asset_returns,
                                                                                            covariance=covariance_matrix,
                                                                                            cluster_indices=cluster_asset_indices)
            elif allocation_metric == 'conditional_drawdown_risk':
                clusters_contribution[cluster_index] = self._get_cluster_conditional_drawdown_at_risk(
                    asset_returns=asset_returns,
                    covariance=covariance_matrix,
                    cluster_indices=cluster_asset_indices)

    def _get_children_cluster_ids(self, num_assets, parent_cluster_id):
        """
        Find the left and right children cluster id of the given parent cluster id.

        :param num_assets: (int) The number of assets in the portfolio.
        :param parent_cluster_index: (int) The current parent cluster id.
        :return: (list, list) List of cluster ids to the left and right of the parent cluster in the hierarchical tree.
        """

        left = int(self.clusters[num_assets - 2 - parent_cluster_id, 0])
        right = int(self.clusters[num_assets - 2 - parent_cluster_id, 1])
        left_cluster = self._quasi_diagnalization(num_assets, left)
        right_cluster = self._quasi_diagnalization(num_assets, right)

        left_cluster_ids = []
        right_cluster_ids = []
        for id_cluster, cluster in self.cluster_children.items():
            if sorted(self._intersection(left_cluster, cluster)) == sorted(cluster):
                left_cluster_ids.append(id_cluster)
            if sorted(self._intersection(right_cluster, cluster)) == sorted(cluster):
                right_cluster_ids.append(id_cluster)

        return left_cluster_ids, right_cluster_ids

    @staticmethod
    def _get_inverse_variance_weights(covariance):
        """
        Calculate the inverse variance weight allocations.

        :param covariance: (pd.DataFrame) Covariance matrix of assets.
        :return: (list) Inverse variance weight values.
        """

        inv_diag = 1 / np.diag(covariance.values)
        parity_w = inv_diag * (1 / np.sum(inv_diag))
        return parity_w

    def _get_cluster_variance(self, covariance, cluster_indices):
        """
        Calculate cluster variance.

        :param covariance: (pd.DataFrame) Covariance matrix of assets.
        :param cluster_indices: (list) List of asset indices for the cluster.
        :return: (float) Variance of the cluster.
        """

        cluster_covariance = covariance.iloc[cluster_indices, cluster_indices]
        parity_w = self._get_inverse_variance_weights(cluster_covariance)
        cluster_variance = self.risk_metrics.calculate_variance(covariance=cluster_covariance, weights=parity_w)
        return cluster_variance

    def _get_cluster_sharpe_ratio(self, expected_asset_returns, covariance, cluster_indices):
        """
        Calculate cluster Sharpe Ratio.

        :param expected_asset_returns: (list) A list of mean asset returns (mu).
        :param covariance: (pd.DataFrame) Covariance matrix of assets.
        :param cluster_indices: (list) List of asset indices for the cluster.
        :return: (float) Sharpe ratio of the cluster.
        """

        cluster_expected_returns = expected_asset_returns[cluster_indices]
        cluster_covariance = covariance.iloc[cluster_indices, cluster_indices]
        parity_w = self._get_inverse_variance_weights(cluster_covariance)
        cluster_variance = self.risk_metrics.calculate_variance(covariance=cluster_covariance, weights=parity_w)
        cluster_sharpe_ratio = (parity_w @ cluster_expected_returns) / np.sqrt(cluster_variance)
        return cluster_sharpe_ratio

    def _get_cluster_expected_shortfall(self, asset_returns, covariance, cluster_indices):
        """
        Calculate cluster expected shortfall.

        :param asset_returns: (pd.DataFrame) Historical asset returns.
        :param covariance: (pd.DataFrame) Covariance matrix of assets.
        :param cluster_indices: (list) List of asset indices for the cluster.
        :return: (float) Expected shortfall of the cluster.
        """

        cluster_asset_returns = asset_returns.iloc[:, cluster_indices]
        cluster_covariance = covariance.iloc[cluster_indices, cluster_indices]
        parity_w = self._get_inverse_variance_weights(cluster_covariance)
        portfolio_returns = cluster_asset_returns @ parity_w
        cluster_expected_shortfall = self.risk_metrics.calculate_expected_shortfall(returns=portfolio_returns,
                                                                                    confidence_level=self.confidence_level)
        return cluster_expected_shortfall

    def _get_cluster_conditional_drawdown_at_risk(self, asset_returns, covariance, cluster_indices):
        """
        Calculate cluster conditional drawdown at risk.

        :param asset_returns: (pd.DataFrame) Historical asset returns.
        :param covariance: (pd.DataFrame) Covariance matrix of assets.
        :param cluster_indices: (list) List of asset indices for the cluster.
        :return: (float) CDD of the cluster.
        """

        cluster_asset_returns = asset_returns.iloc[:, cluster_indices]
        cluster_covariance = covariance.iloc[cluster_indices, cluster_indices]
        parity_w = self._get_inverse_variance_weights(cluster_covariance)
        portfolio_returns = cluster_asset_returns @ parity_w
        cluster_conditional_drawdown = self.risk_metrics.calculate_conditional_drawdown_risk(returns=portfolio_returns,
                                                                                             confidence_level=self.confidence_level)
        return cluster_conditional_drawdown

    @staticmethod
    def _intersection(list1, list2):
        """
        Calculate the intersection of two lists

        :param list1: (list) The first list of items.
        :param list2: (list) The second list of items.
        :return: (list) List containing the intersection of the input lists.
        """

        return list(set(list1) & set(list2))

    @staticmethod
    def _cov2corr(covariance):
        """
        Calculate the correlations from asset returns covariance matrix.

        :param covariance: (pd.DataFrame) Asset returns covariances.
        :return: (pd.DataFrame) Correlations between asset returns.
        """

        d_matrix = np.zeros_like(covariance)
        diagnoal_sqrt = np.sqrt(np.diag(covariance))
        np.fill_diagonal(d_matrix, diagnoal_sqrt)
        d_inv = np.linalg.inv(d_matrix)
        corr = np.dot(np.dot(d_inv, covariance), d_inv)
        corr = pd.DataFrame(corr, index=covariance.columns, columns=covariance.columns)
        return corr

    @staticmethod
    def _perform_checks(asset_prices, asset_returns, expected_asset_returns, allocation_metric):
        """
        Perform initial warning checks.

        :param asset_prices: (pd.DataFrame) A dataframe of historical asset prices (daily close)
                                            indexed by date.
        :param asset_returns: (pd.DataFrame/numpy matrix) User supplied matrix of asset returns.
        :param expected_asset_returns: (list) A list of mean asset returns (mu).
        :param allocation_metric: (str) The metric used for calculating weight allocations.
        """

        if asset_prices is None and asset_returns is None and expected_asset_returns is None:
            raise ValueError("You need to supply either raw prices or returns or expected asset returns.")

        if asset_prices is not None:
            if not isinstance(asset_prices, pd.DataFrame):
                raise ValueError("Asset prices matrix must be a dataframe")
            if not isinstance(asset_prices.index, pd.DatetimeIndex):
                raise ValueError("Asset prices dataframe must be indexed by date.")

        if allocation_metric not in \
                {'minimum_variance', 'minimum_standard_deviation', 'sharpe_ratio',
                 'equal_weighting', 'expected_shortfall', 'conditional_drawdown_risk'}:
            raise ValueError("Unknown allocation metric specified. Supported metrics are - minimum_variance, "
                             "minimum_standard_deviation, sharpe_ratio, equal_weighting, expected_shortfall, "
                             "conditional_drawdown_risk")

        if allocation_metric == 'sharpe_ratio' and expected_asset_returns is None and asset_prices is None:
            raise ValueError(
                "An expected asset returns list is required for sharpe ratio metric. Either provide pre-calculated"
                "expected asset returns or give raw asset prices for inbuilt returns calculation.")
Beispiel #3
0
class HierarchicalEqualRiskContribution:
    """
    This class implements the Hierarchical Equal Risk Contribution (HERC) algorithm and it's extended components mentioned in the
    following papers: `Raffinot, Thomas, The Hierarchical Equal Risk Contribution Portfolio (August 23,
    2018). <https://ssrn.com/abstract=3237540>`_; and `Raffinot, Thomas, Hierarchical Clustering Based Asset Allocation (May 2017)
    <https://ssrn.com/abstract=2840729>`_;

    While the vanilla Hierarchical Risk Parity algorithm uses only the variance as a risk measure for assigning weights, the HERC
    algorithm proposed by Raffinot, allows investors to use other risk metrics like Standard Deviation, Expected Shortfall and
    Conditional Drawdown at Risk.
    """

    UniqueColors = [
        'darkred', 'deepskyblue', 'springgreen', 'darkorange', 'deeppink',
        'slateblue', 'navy', 'blueviolet', 'pink', 'darkslategray'
    ]
    UnclusteredColor = "#808080"

    def __init__(self, confidence_level=0.05):
        """
        Initialise.

        :param confidence_level: (float) The confidence level (alpha) used for calculating expected shortfall and conditional
                                         drawdown at risk.
        """

        self.weights = list()
        self.clusters = None
        self.ordered_indices = None
        self.cluster_children = None
        self.optimal_num_clusters = None
        self.returns_estimator = ReturnsEstimators()
        self.risk_estimator = RiskEstimators()
        self.risk_metrics = RiskMetrics()
        self.confidence_level = confidence_level

    def allocate(self,
                 asset_names=None,
                 asset_prices=None,
                 asset_returns=None,
                 covariance_matrix=None,
                 risk_measure='equal_weighting',
                 linkage='ward',
                 optimal_num_clusters=None):
        # pylint: disable=too-many-branches
        """
        Calculate asset allocations using the Hierarchical Equal Risk Contribution algorithm.

        :param asset_names: (list) A list of strings containing the asset names.
        :param asset_prices: (pd.DataFrame) A dataframe of historical asset prices (daily close)
                                            indexed by date.
        :param asset_returns: (pd.DataFrame/numpy matrix) User supplied matrix of asset returns.
        :param covariance_matrix: (pd.DataFrame/numpy matrix) User supplied covariance matrix of asset returns.
        :param risk_measure: (str) The metric used for calculating weight allocations. Supported strings - ``equal_weighting``,
                                   ``variance``, ``standard_deviation``, ``expected_shortfall``, ``conditional_drawdown_risk``.
        :param linkage: (str) The type of linkage method to use for clustering. Supported strings - ``single``, ``average``,
                              ``complete``, ``ward``.
        :param optimal_num_clusters: (int) Optimal number of clusters for hierarchical clustering.
        """

        # Perform error checks
        self._error_checks(asset_prices, asset_returns, risk_measure,
                           covariance_matrix)

        if asset_names is None:
            if asset_prices is not None:
                asset_names = asset_prices.columns
            elif asset_returns is not None and isinstance(
                    asset_returns, pd.DataFrame):
                asset_names = asset_returns.columns
            else:
                raise ValueError("Please provide a list of asset names")

        # Calculate the returns if the user does not supply a returns dataframe
        if asset_returns is None and (risk_measure in {
                'expected_shortfall', 'conditional_drawdown_risk'
        } or covariance_matrix is None or not optimal_num_clusters):
            asset_returns = self.returns_estimator.calculate_returns(
                asset_prices=asset_prices)
        asset_returns = pd.DataFrame(asset_returns, columns=asset_names)

        # Calculate covariance of returns or use the user specified covariance matrix
        if covariance_matrix is None:
            covariance_matrix = asset_returns.cov()
        cov = pd.DataFrame(covariance_matrix,
                           index=asset_names,
                           columns=asset_names)

        # Calculate correlation from covariance matrix
        corr = self.risk_estimator.cov_to_corr(cov)

        # Calculate the optimal number of clusters
        if not optimal_num_clusters:
            self.optimal_num_clusters = self._get_optimal_number_of_clusters(
                correlation=corr, linkage=linkage, asset_returns=asset_returns)
        else:
            self.optimal_num_clusters = self._check_max_number_of_clusters(
                num_clusters=optimal_num_clusters,
                linkage=linkage,
                correlation=corr)

        # Tree Clustering
        self.clusters, self.cluster_children = self._tree_clustering(
            correlation=corr, linkage=linkage)

        # Get the flattened order of assets in hierarchical clustering tree
        num_assets = len(asset_names)
        self.ordered_indices = self._quasi_diagnalization(
            num_assets, 2 * num_assets - 2)

        # Recursive Bisection
        self._recursive_bisection(asset_returns=asset_returns,
                                  covariance_matrix=cov,
                                  assets=asset_names,
                                  risk_measure=risk_measure)

    def plot_clusters(self, assets):
        """
        Plot a dendrogram of the hierarchical clusters.

        :param assets: (list) Asset names in the portfolio
        :return: (dict) Dendrogram
        """

        colors = dict()
        for cluster_idx, children in self.cluster_children.items():
            color = self.UniqueColors[cluster_idx]

            for child in children:
                colors[assets[child]] = color
        dendrogram_plot = dendrogram(
            self.clusters,
            labels=assets,
            link_color_func=lambda k: self.UnclusteredColor)
        plot_axis = plt.gca()
        xlbls = plot_axis.get_xmajorticklabels()
        for lbl in xlbls:
            lbl.set_color(colors[lbl.get_text()])
        return dendrogram_plot

    @staticmethod
    def _compute_cluster_inertia(labels, asset_returns):
        """
        Calculate the cluster inertia (within cluster sum-of-squares).

        :param labels: (list) Cluster labels.
        :param asset_returns: (pd.DataFrame) Historical asset returns.
        :return: (float) Cluster inertia value.
        """

        unique_labels = np.unique(labels)
        inertia = [
            np.mean(pairwise_distances(asset_returns[:, labels == label]))
            for label in unique_labels
        ]
        inertia = np.log(np.sum(inertia))
        return inertia

    @staticmethod
    def _check_max_number_of_clusters(num_clusters, linkage, correlation):
        """
        In some cases, the optimal number of clusters value given by the users is greater than the maximum number of clusters
        possible with the given data. This function checks this and assigns the proper value to the number of clusters when the
        given value exceeds maximum possible clusters.

        :param num_clusters: (int) The number of clusters.
        :param linkage (str): The type of linkage method to use for clustering.
        :param correlation: (np.array) Matrix of asset correlations.
        :return: (int) New value for number of clusters.
        """

        distance_matrix = np.sqrt(2 * (1 - correlation).round(5))
        clusters = scipy_linkage(squareform(distance_matrix.values),
                                 method=linkage)
        clustering_inds = fcluster(clusters,
                                   num_clusters,
                                   criterion='maxclust')
        max_number_of_clusters_possible = max(clustering_inds)
        num_clusters = min(max_number_of_clusters_possible, num_clusters)
        return num_clusters

    def _get_optimal_number_of_clusters(self,
                                        correlation,
                                        asset_returns,
                                        linkage,
                                        num_reference_datasets=5):
        """
        Find the optimal number of clusters for hierarchical clustering using the Gap statistic.

        :param correlation: (np.array) Matrix of asset correlations.
        :param asset_returns: (pd.DataFrame) Historical asset returns.
        :param linkage: (str) The type of linkage method to use for clustering.
        :param num_reference_datasets: (int) The number of reference datasets to generate for calculating expected inertia.
        :return: (int) The optimal number of clusters.
        """

        original_distance_matrix = np.sqrt(2 * (1 - correlation).round(5))
        gap_values = []
        num_clusters = 1
        max_number_of_clusters = float("-inf")
        while True:

            # Calculate inertia from original data
            original_clusters = scipy_linkage(
                squareform(original_distance_matrix), method=linkage)
            original_cluster_assignments = fcluster(original_clusters,
                                                    num_clusters,
                                                    criterion='maxclust')
            if max(original_cluster_assignments
                   ) == max_number_of_clusters or max(
                       original_cluster_assignments) > 10:
                break
            max_number_of_clusters = max(original_cluster_assignments)
            inertia = self._compute_cluster_inertia(
                original_cluster_assignments, asset_returns.values)

            # Calculate expected inertia from reference datasets
            expected_inertia = self._calculate_expected_inertia(
                num_reference_datasets, asset_returns, num_clusters, linkage)

            # Calculate the gap statistic
            gap = expected_inertia - inertia
            gap_values.append(gap)
            num_clusters += 1
        return 1 + np.argmax(gap_values)

    def _calculate_expected_inertia(self, num_reference_datasets,
                                    asset_returns, num_clusters, linkage):
        """
        Calculate the expected inertia by generating clusters from a uniform distribution.

        :param num_reference_datasets: (int) The number of reference datasets to generate from the distribution.
        :param asset_returns: (pd.DataFrame) Historical asset returns.
        :param num_clusters: (int) The number of clusters to generate.
        :param linkage: (str) The type of linkage criterion to use for hierarchical clustering.
        :return: (float) The expected inertia from the reference datasets.
        """

        reference_inertias = []
        for _ in range(num_reference_datasets):
            # Generate reference returns from uniform distribution and calculate the distance matrix.
            reference_asset_returns = pd.DataFrame(
                np.random.rand(*asset_returns.shape))
            reference_correlation = np.array(reference_asset_returns.corr())
            reference_distance_matrix = np.sqrt(
                2 * (1 - reference_correlation).round(5))

            reference_clusters = scipy_linkage(
                squareform(reference_distance_matrix), method=linkage)
            reference_cluster_assignments = fcluster(reference_clusters,
                                                     num_clusters,
                                                     criterion='maxclust')
            inertia = self._compute_cluster_inertia(
                reference_cluster_assignments, reference_asset_returns.values)
            reference_inertias.append(inertia)
        return np.mean(reference_inertias)

    def _tree_clustering(self, correlation, linkage):
        """
        Perform agglomerative clustering on the current portfolio.

        :param correlation: (np.array) Matrix of asset correlations.
        :param linkage (str): The type of linkage method to use for clustering.
        :return: (list) Structure of hierarchical tree.
        """

        distance_matrix = np.sqrt(2 * (1 - correlation).round(5))
        clusters = scipy_linkage(squareform(distance_matrix.values),
                                 method=linkage)
        clustering_inds = fcluster(clusters,
                                   self.optimal_num_clusters,
                                   criterion='maxclust')
        cluster_children = {
            index - 1: []
            for index in range(min(clustering_inds),
                               max(clustering_inds) + 1)
        }
        for index, cluster_index in enumerate(clustering_inds):
            cluster_children[cluster_index - 1].append(index)
        return clusters, cluster_children

    def _quasi_diagnalization(self, num_assets, curr_index):
        """
        Rearrange the assets to reorder them according to hierarchical tree clustering order.

        :param num_assets: (int) The total number of assets.
        :param curr_index: (int) Current index.
        :return: (list) The assets rearranged according to hierarchical clustering.
        """

        if curr_index < num_assets:
            return [curr_index]

        left = int(self.clusters[curr_index - num_assets, 0])
        right = int(self.clusters[curr_index - num_assets, 1])

        return (self._quasi_diagnalization(num_assets, left) +
                self._quasi_diagnalization(num_assets, right))

    def _recursive_bisection(self, asset_returns, covariance_matrix, assets,
                             risk_measure):
        """
        Recursively assign weights to the clusters - ultimately assigning weights to the individual assets.

        :param asset_returns: (pd.DataFrame) Historical asset returns.
        :param covariance_matrix: (pd.DataFrame) The covariance matrix.
        :param assets: (list) List of asset names in the portfolio.
        :param risk_measure: (str) The metric used for calculating weight allocations.
        """

        num_assets = len(assets)
        self.weights = np.ones(shape=num_assets)
        clusters_contribution = np.ones(shape=self.optimal_num_clusters)
        clusters_weights = np.ones(shape=self.optimal_num_clusters)

        # Calculate the corresponding risk measure for the clusters
        self._calculate_risk_contribution_of_clusters(clusters_contribution,
                                                      risk_measure,
                                                      covariance_matrix,
                                                      asset_returns)

        # Recursive bisection taking into account the dendrogram structure
        for cluster_index in range(self.optimal_num_clusters - 1):

            # Get the left and right cluster ids
            left_cluster_ids, right_cluster_ids = self._get_children_cluster_ids(
                num_assets=num_assets, parent_cluster_id=cluster_index)

            # Compute alpha
            left_cluster_contribution = np.sum(
                clusters_contribution[left_cluster_ids])
            right_cluster_contribution = np.sum(
                clusters_contribution[right_cluster_ids])
            if risk_measure == 'equal_weighting':
                alloc_factor = 0.5
            else:
                alloc_factor = 1 - left_cluster_contribution / (
                    left_cluster_contribution + right_cluster_contribution)

            # Assign weights to each sub-cluster
            clusters_weights[left_cluster_ids] *= alloc_factor
            clusters_weights[right_cluster_ids] *= 1 - alloc_factor

        # Compute the final weights
        self._calculate_final_portfolio_weights(risk_measure, clusters_weights,
                                                covariance_matrix,
                                                asset_returns)

        # Assign actual asset names to weight index
        self.weights = pd.DataFrame(self.weights)
        self.weights.index = assets
        self.weights = self.weights.T
        self.weights = self.weights.iloc[:, self.ordered_indices]

    def _calculate_final_portfolio_weights(self, risk_measure,
                                           clusters_weights, covariance_matrix,
                                           asset_returns):
        """
        Calculate the final asset weights.

        :param risk_measure: (str) The metric used for calculating weight allocations.
        :param clusters_weights: (np.array) The cluster weights calculated using recursive bisection.
        :param covariance_matrix: (pd.DataFrame) The covariance matrix.
        :param asset_returns: (pd.DataFrame) Historical asset returns.
        """

        for cluster_index in range(self.optimal_num_clusters):
            cluster_asset_indices = self.cluster_children[cluster_index]

            # Covariance of assets in this cluster
            cluster_covariance = covariance_matrix.iloc[cluster_asset_indices,
                                                        cluster_asset_indices]

            # Historical returns of assets in this cluster
            cluster_asset_returns = None
            if not asset_returns.empty:
                cluster_asset_returns = asset_returns.iloc[:,
                                                           cluster_asset_indices]

            parity_weights = self._calculate_naive_risk_parity(
                cluster_index=cluster_index,
                risk_measure=risk_measure,
                covariance=cluster_covariance,
                asset_returns=cluster_asset_returns)
            self.weights[
                cluster_asset_indices] = parity_weights * clusters_weights[
                    cluster_index]

    def _calculate_naive_risk_parity(self, cluster_index, risk_measure,
                                     covariance, asset_returns):
        # pylint: disable=no-else-return
        """
        Calculate the naive risk parity weights.

        :param cluster_index: (int) Index of the current cluster.
        :param risk_measure: (str) The metric used for calculating weight allocations.
        :param covariance: (pd.DataFrame) The covariance matrix of asset returns.
        :param asset_returns: (pd.DataFrame) Historical asset returns.
        :return: (np.array) list of risk parity weights for assets in current cluster.
        """

        if risk_measure == 'equal_weighting':
            num_assets_in_cluster = len(self.cluster_children[cluster_index])
            return np.ones(num_assets_in_cluster) * 1 / num_assets_in_cluster
        elif risk_measure in {'variance', 'standard_deviation'}:
            return self._get_inverse_variance_weights(covariance)
        elif risk_measure == 'expected_shortfall':
            return self._get_inverse_CVaR_weights(asset_returns)
        return self._get_inverse_CDaR_weights(asset_returns)

    def _calculate_risk_contribution_of_clusters(self, clusters_contribution,
                                                 risk_measure,
                                                 covariance_matrix,
                                                 asset_returns):
        """
        Calculate the risk contribution of clusters based on the allocation metric.

        :param clusters_contribution: (np.array) The risk contribution value of the clusters.
        :param risk_measure: (str) The metric used for calculating weight allocations.
        :param covariance_matrix: (pd.DataFrame) The covariance matrix.
        :param asset_returns: (pd.DataFrame) Historical asset returns.
        """

        for cluster_index in range(self.optimal_num_clusters):
            cluster_asset_indices = self.cluster_children[cluster_index]

            if risk_measure == 'variance':
                clusters_contribution[
                    cluster_index] = self._get_cluster_variance(
                        covariance_matrix, cluster_asset_indices)
            elif risk_measure == 'standard_deviation':
                clusters_contribution[cluster_index] = np.sqrt(
                    self._get_cluster_variance(covariance_matrix,
                                               cluster_asset_indices))
            elif risk_measure == 'expected_shortfall':
                clusters_contribution[
                    cluster_index] = self._get_cluster_expected_shortfall(
                        asset_returns, cluster_asset_indices)
            elif risk_measure == 'conditional_drawdown_risk':
                clusters_contribution[
                    cluster_index] = self._get_cluster_conditional_drawdown_at_risk(
                        asset_returns=asset_returns,
                        cluster_indices=cluster_asset_indices)

    def _get_children_cluster_ids(self, num_assets, parent_cluster_id):
        """
        Find the left and right children cluster id of the given parent cluster id.

        :param num_assets: (int) The number of assets in the portfolio.
        :param parent_cluster_index: (int) The current parent cluster id.
        :return: (list, list) List of cluster ids to the left and right of the parent cluster in the hierarchical tree.
        """

        left = int(self.clusters[num_assets - 2 - parent_cluster_id, 0])
        right = int(self.clusters[num_assets - 2 - parent_cluster_id, 1])
        left_cluster = self._quasi_diagnalization(num_assets, left)
        right_cluster = self._quasi_diagnalization(num_assets, right)

        left_cluster_ids = []
        right_cluster_ids = []
        for id_cluster, cluster in self.cluster_children.items():
            if sorted(self._intersection(left_cluster,
                                         cluster)) == sorted(cluster):
                left_cluster_ids.append(id_cluster)
            if sorted(self._intersection(right_cluster,
                                         cluster)) == sorted(cluster):
                right_cluster_ids.append(id_cluster)

        return left_cluster_ids, right_cluster_ids

    @staticmethod
    def _get_inverse_variance_weights(covariance):
        """
        Calculate inverse variance weight allocations.

        :param covariance: (pd.DataFrame) Covariance matrix of assets.
        :return: (np.array) Inverse variance weight values.
        """

        inv_diag = 1 / np.diag(covariance.values)
        parity_weights = inv_diag * (1 / np.sum(inv_diag))
        return parity_weights

    def _get_inverse_CVaR_weights(self, asset_returns):
        # pylint: disable=invalid-name
        """
        Calculate inverse CVaR weight allocations.

        :param asset_returns: (pd.DataFrame) Historical asset returns.
        :return: (np.array) Inverse CVaR weight values.
        """

        parity_weights = []
        for asset_index in range(asset_returns.shape[1]):
            returns = asset_returns.iloc[:, asset_index]
            cvar = self.risk_metrics.calculate_expected_shortfall(
                returns=returns, confidence_level=self.confidence_level)
            parity_weights.append(cvar)

        parity_weights = np.array(parity_weights)
        parity_weights = 1 / parity_weights
        parity_weights = parity_weights * (1 / np.sum(parity_weights))
        return parity_weights

    def _get_inverse_CDaR_weights(self, asset_returns):
        # pylint: disable=invalid-name
        """
        Calculate inverse CDaR weight allocations.

        :param asset_returns: (pd.DataFrame) Historical asset returns.
        :return: (np.array) Inverse CDaR weight values.
        """

        parity_weights = []
        for asset_index in range(asset_returns.shape[1]):
            returns = asset_returns.iloc[:, asset_index]
            cdar = self.risk_metrics.calculate_conditional_drawdown_risk(
                returns=returns, confidence_level=self.confidence_level)
            parity_weights.append(cdar)

        parity_weights = np.array(parity_weights)
        parity_weights = 1 / parity_weights
        parity_weights = parity_weights * (1 / np.sum(parity_weights))
        return parity_weights

    def _get_cluster_variance(self, covariance, cluster_indices):
        """
        Calculate cluster variance.

        :param covariance: (pd.DataFrame) Covariance matrix of asset returns.
        :param cluster_indices: (list) List of asset indices for the cluster.
        :return: (float) Variance of the cluster.
        """

        cluster_covariance = covariance.iloc[cluster_indices, cluster_indices]
        parity_weights = self._get_inverse_variance_weights(cluster_covariance)
        cluster_variance = self.risk_metrics.calculate_variance(
            covariance=cluster_covariance, weights=parity_weights)
        return cluster_variance

    def _get_cluster_expected_shortfall(self, asset_returns, cluster_indices):
        """
        Calculate cluster expected shortfall.

        :param asset_returns: (pd.DataFrame) Historical asset returns.
        :param cluster_indices: (list) List of asset indices for the cluster.
        :return: (float) Expected shortfall of the cluster.
        """

        cluster_asset_returns = asset_returns.iloc[:, cluster_indices]
        parity_weights = self._get_inverse_CVaR_weights(cluster_asset_returns)
        portfolio_returns = cluster_asset_returns @ parity_weights
        cluster_expected_shortfall = self.risk_metrics.calculate_expected_shortfall(
            returns=portfolio_returns, confidence_level=self.confidence_level)
        return cluster_expected_shortfall

    def _get_cluster_conditional_drawdown_at_risk(self, asset_returns,
                                                  cluster_indices):
        """
        Calculate cluster conditional drawdown at risk.

        :param asset_returns: (pd.DataFrame) Historical asset returns.
        :param cluster_indices: (list) List of asset indices for the cluster.
        :return: (float) CDD of the cluster.
        """

        cluster_asset_returns = asset_returns.iloc[:, cluster_indices]
        parity_weights = self._get_inverse_CDaR_weights(cluster_asset_returns)
        portfolio_returns = cluster_asset_returns @ parity_weights
        cluster_conditional_drawdown = self.risk_metrics.calculate_conditional_drawdown_risk(
            returns=portfolio_returns, confidence_level=self.confidence_level)
        return cluster_conditional_drawdown

    @staticmethod
    def _intersection(list1, list2):
        """
        Calculate the intersection of two lists

        :param list1: (list) The first list of items.
        :param list2: (list) The second list of items.
        :return: (list) List containing the intersection of the input lists.
        """

        return list(set(list1) & set(list2))

    @staticmethod
    def _error_checks(asset_prices, asset_returns, risk_measure,
                      covariance_matrix):
        """
        Perform initial warning checks.

        :param asset_prices: (pd.DataFrame) A dataframe of historical asset prices (daily close)
                                            indexed by date.
        :param asset_returns: (pd.DataFrame/numpy matrix) User supplied matrix of asset returns.
        :param risk_measure: (str) The metric used for calculating weight allocations.
        :param covariance_matrix: (pd.DataFrame/numpy matrix) User supplied covariance matrix of asset returns.
        """

        if asset_prices is None and asset_returns is None and covariance_matrix is None:
            raise ValueError(
                "You need to supply either raw prices or returns or covariance matrix"
            )

        if asset_prices is not None:
            if not isinstance(asset_prices, pd.DataFrame):
                raise ValueError("Asset prices matrix must be a dataframe")
            if not isinstance(asset_prices.index, pd.DatetimeIndex):
                raise ValueError(
                    "Asset prices dataframe must be indexed by date.")

        if risk_measure not in \
                {'variance', 'standard_deviation', 'equal_weighting', 'expected_shortfall',
                 'conditional_drawdown_risk'}:
            raise ValueError(
                "Unknown allocation metric specified. Supported metrics are - variance, "
                "standard_deviation, equal_weighting, expected_shortfall, "
                "conditional_drawdown_risk")