Example #1
    def transform(self) -> DataFrame:
        """
        Transforms a starting block matrix to the reduced block matrix, using a reducer model produced by the
        RidgeReduction fit method.

        Returns:
             Spark DataFrame representing the reduced block matrix
        """
        _check_model(self.model_df)

        transform_key_pattern = ['header_block', 'sample_block']

        if 'label' in self.block_df.columns:
            transform_key_pattern.append('label')
            joined = self.block_df.drop('sort_key') \
                .join(self.model_df, ['header_block', 'sample_block', 'header'], 'right') \
                .withColumn('label', f.coalesce(f.col('label'), f.col('labels').getItem(0)))
        else:
            joined = self.block_df.drop('sort_key') \
                .join(self.model_df, ['header_block', 'sample_block', 'header'], 'right')

        transform_udf = pandas_udf(
            lambda key, pdf:
            apply_model(key, transform_key_pattern, pdf, self._std_label_df,
                        self.sample_blocks, self._alphas, self._std_cov_df),
            reduced_matrix_struct, PandasUDFType.GROUPED_MAP)

        record_hls_event('wgrRidgeReduceTransform')

        self.reduced_block_df = joined.groupBy(transform_key_pattern).apply(
            transform_udf)

        return self.reduced_block_df
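
A minimal usage sketch for the stateful API above. The fit/transform calls match this page's examples; the RidgeReduction constructor arguments and import path are assumptions, not confirmed here.

from glow.wgr import RidgeReduction  # assumed import path

# Assumed constructor: a blocked matrix, labels, and the sample-block mapping
# produced by an earlier blocking step.
reduction = RidgeReduction(block_df, label_df, sample_blocks)
reduction.fit()                           # populates reduction.model_df
reduced_block_df = reduction.transform()  # Spark DataFrame: the reduced block matrix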
Example #2
    def transform(self) -> pd.DataFrame:
        """
        Generates predictions for the target labels in the provided label DataFrame by applying the model resulting from
        the RidgeRegression fit method to the reduced block matrix.

        Returns:
            Pandas DataFrame containing prediction y_hat values. The shape and order match label_df such that the
            rows are indexed by sample ID and the columns by label. The column types are float64.
        """
        _check_model(self.model_df)
        _check_cv(self.cv_df)

        transform_key_pattern = ['sample_block', 'label']

        transform_udf = pandas_udf(
            lambda key, pdf:
            apply_model(key, transform_key_pattern, pdf, self._std_label_df,
                        self.sample_blocks, self._alphas, self._std_cov_df),
            reduced_matrix_struct, PandasUDFType.GROUPED_MAP)

        blocked_prediction_df = apply_model_df(self.reduced_block_df,
                                               self.model_df, self.cv_df,
                                               transform_udf,
                                               transform_key_pattern, 'right')

        self.y_hat_df = flatten_prediction_df(blocked_prediction_df,
                                              self.sample_blocks,
                                              self._std_label_df)

        record_hls_event('wgrRidgeRegressionTransform')

        return self.y_hat_df
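
A hedged sketch of how this method chains after fit (constructor arguments are assumed; see Example #7 below for the matching fit method):

# Assumed constructor; fit() must run first so model_df and cv_df exist.
regression = RidgeRegression(reduced_block_df, label_df, sample_blocks)
model_df, cv_df = regression.fit()  # Spark model DataFrame + cross-validation results
y_hat_df = regression.transform()   # pandas: rows indexed by sample ID, columns by label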
Example #3
    def transform(
        self,
        blockdf: DataFrame,
        labeldf: pd.DataFrame,
        sample_blocks: Dict[str, List[str]],
        modeldf: DataFrame,
        cvdf: DataFrame,
        covdf: pd.DataFrame = pd.DataFrame({})) -> pd.DataFrame:
        """
        Generates predictions for the target labels in the provided label DataFrame by applying the model resulting from
        the RidgeRegression fit method to the starting block matrix.

        Args:
            blockdf : Spark DataFrame representing the beginning block matrix X
            labeldf : Pandas DataFrame containing the target labels used in fitting the ridge models
            sample_blocks : Dict containing a mapping of sample_block ID to a list of corresponding sample IDs
            modeldf : Spark DataFrame produced by the RidgeRegression fit method, representing the reducer model
            cvdf : Spark DataFrame produced by the RidgeRegression fit method, containing the results of the
                cross validation routine.
            covdf : Pandas DataFrame containing covariates to be included in every model in the stacking
                ensemble (optional).

        Returns:
            Pandas DataFrame containing prediction y_hat values. The shape and order match labeldf such that the
            rows are indexed by sample ID and the columns by label. The column types are float64.
        """

        validate_inputs(labeldf, covdf)
        transform_key_pattern = ['sample_block', 'label']

        transform_udf = pandas_udf(
            lambda key, pdf: apply_model(key, transform_key_pattern, pdf,
                                         labeldf, sample_blocks, self.alphas,
                                         covdf), reduced_matrix_struct,
            PandasUDFType.GROUPED_MAP)

        blocked_prediction_df = blockdf.drop('header_block', 'sort_key') \
            .join(modeldf.drop('header_block'), ['sample_block', 'header'], 'right') \
            .withColumn('label', f.coalesce(f.col('label'), f.col('labels').getItem(0))) \
            .groupBy(transform_key_pattern) \
            .apply(transform_udf) \
            .join(cvdf, ['label', 'alpha'], 'inner')

        sample_block_df = blockdf.sql_ctx \
            .createDataFrame(sample_blocks.items(), ['sample_block', 'sample_ids']) \
            .selectExpr('sample_block', 'posexplode(sample_ids) as (idx, sample_id)')

        flattened_prediction_df = blocked_prediction_df \
            .selectExpr('sample_block', 'label', 'posexplode(values) as (idx, value)') \
            .join(sample_block_df, ['sample_block', 'idx'], 'inner') \
            .select('sample_id', 'label', 'value')

        pivoted_df = flattened_prediction_df.toPandas() \
            .pivot(index='sample_id', columns='label', values='value') \
            .reindex(index=labeldf.index, columns=labeldf.columns)

        record_hls_event('wgrRidgeRegressionTransform')

        return pivoted_df
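
The closing pivot turns the long (sample_id, label, value) table into a wide matrix whose row and column order match labeldf. A minimal pandas illustration with toy data in place of the collected Spark output:

import pandas as pd

flat = pd.DataFrame({
    'sample_id': ['s1', 's1', 's2', 's2'],
    'label': ['trait_a', 'trait_b', 'trait_a', 'trait_b'],
    'value': [0.1, 0.2, 0.3, 0.4],
})
labeldf = pd.DataFrame(index=['s2', 's1'], columns=['trait_b', 'trait_a'])

# pivot yields one row per sample and one column per label; reindex then
# restores the exact row/column order of labeldf.
y_hat = flat.pivot(index='sample_id', columns='label', values='value') \
            .reindex(index=labeldf.index, columns=labeldf.columns)
print(y_hat)  # rows: s2, s1; columns: trait_b, trait_a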
Example #4
    def fit(
        self,
        blockdf: DataFrame,
        labeldf: pd.DataFrame,
        sample_blocks: Dict[str, List[str]],
        covdf: pd.DataFrame = pd.DataFrame({})
    ) -> (DataFrame, DataFrame):
        """
        Fits a ridge regression model, represented by a Spark DataFrame containing coefficients for each of the ridge
        alpha parameters, for each block in the starting matrix, for each label in the target labels, as well as a
        Spark DataFrame containing the optimal ridge alpha value for each label.

        Args:
            blockdf : Spark DataFrame representing the beginning block matrix X
            labeldf : Pandas DataFrame containing the target labels used in fitting the ridge models
            sample_blocks : Dict containing a mapping of sample_block ID to a list of corresponding sample IDs
            covdf : Pandas DataFrame containing covariates to be included in every model in the stacking
                ensemble (optional).

        Returns:
            Two Spark DataFrames, one containing the model resulting from the fitting routine and one containing the
            results of the cross validation procedure.
        """

        validate_inputs(labeldf, covdf)
        map_key_pattern = ['sample_block', 'label']
        reduce_key_pattern = ['header_block', 'header', 'label']
        metric = 'r2'

        if not self.alphas:
            self.alphas = generate_alphas(blockdf)

        map_udf = pandas_udf(
            lambda key, pdf: map_normal_eqn(key, map_key_pattern, pdf, labeldf,
                                            sample_blocks, covdf),
            normal_eqn_struct, PandasUDFType.GROUPED_MAP)
        reduce_udf = pandas_udf(
            lambda key, pdf: reduce_normal_eqn(key, reduce_key_pattern, pdf),
            normal_eqn_struct, PandasUDFType.GROUPED_MAP)
        model_udf = pandas_udf(
            lambda key, pdf: solve_normal_eqn(key, map_key_pattern, pdf,
                                              labeldf, self.alphas, covdf),
            model_struct, PandasUDFType.GROUPED_MAP)
        score_udf = pandas_udf(
            lambda key, pdf: score_models(key, map_key_pattern, pdf, labeldf,
                                          sample_blocks, self.alphas, covdf,
                                          pd.DataFrame({}), metric),
            cv_struct, PandasUDFType.GROUPED_MAP)

        modeldf = blockdf \
            .groupBy(map_key_pattern) \
            .apply(map_udf) \
            .groupBy(reduce_key_pattern) \
            .apply(reduce_udf) \
            .groupBy(map_key_pattern) \
            .apply(model_udf)

        cvdf = cross_validation(blockdf, modeldf, score_udf, map_key_pattern, self.alphas, metric)

        record_hls_event('wgrRidgeRegressionFit')

        return modeldf, cvdf
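
The map/reduce/model UDF chain distributes the ridge normal equations across blocks. A single-machine NumPy sketch of the computation it assembles, assuming solve_normal_eqn solves (X'X + alpha*I) beta = X'y per alpha (covariate handling omitted):

import numpy as np

def ridge_coefficients(X: np.ndarray, y: np.ndarray, alphas: dict) -> dict:
    """Toy stand-in for the distributed map/reduce/solve pipeline."""
    xtx = X.T @ X  # accumulated blockwise by the map and reduce steps
    xty = X.T @ y
    eye = np.eye(X.shape[1])
    # one coefficient vector per named alpha, as in the model DataFrame
    return {name: np.linalg.solve(xtx + value * eye, xty)
            for name, value in alphas.items()}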
Example #5
    def fit(
        self,
        blockdf: DataFrame,
        labeldf: pd.DataFrame,
        sample_blocks: Dict[str, List[str]],
        covdf: pd.DataFrame = pd.DataFrame({})
    ) -> DataFrame:
        """
        Fits a ridge reducer model, represented by a Spark DataFrame containing coefficients for each of the ridge
        alpha parameters, for each block in the starting matrix, for each label in the target labels.

        Args:
            blockdf : Spark DataFrame representing the beginning block matrix X
            labeldf : Pandas DataFrame containing the target labels used in fitting the ridge models
            sample_blocks : Dict containing a mapping of sample_block ID to a list of corresponding sample IDs
            covdf : Pandas DataFrame containing covariates to be included in every model in the stacking
                ensemble (optional).

        Returns:
            Spark DataFrame containing the model resulting from the fitting routine.
        """

        validate_inputs(labeldf, covdf)
        map_key_pattern = ['header_block', 'sample_block']
        reduce_key_pattern = ['header_block', 'header']

        if 'label' in blockdf.columns:
            map_key_pattern.append('label')
            reduce_key_pattern.append('label')
        if not self.alphas:
            self.alphas = generate_alphas(blockdf)

        map_udf = pandas_udf(
            lambda key, pdf: map_normal_eqn(key, map_key_pattern, pdf, labeldf,
                                            sample_blocks, covdf),
            normal_eqn_struct, PandasUDFType.GROUPED_MAP)
        reduce_udf = pandas_udf(
            lambda key, pdf: reduce_normal_eqn(key, reduce_key_pattern, pdf),
            normal_eqn_struct, PandasUDFType.GROUPED_MAP)
        model_udf = pandas_udf(
            lambda key, pdf: solve_normal_eqn(key, map_key_pattern, pdf,
                                              labeldf, self.alphas, covdf),
            model_struct, PandasUDFType.GROUPED_MAP)

        record_hls_event('wgrRidgeReduceFit')

        return blockdf \
            .groupBy(map_key_pattern) \
            .apply(map_udf) \
            .groupBy(reduce_key_pattern) \
            .apply(reduce_udf) \
            .groupBy(map_key_pattern) \
            .apply(model_udf)
Example #6
    def transform(
        self,
        blockdf: DataFrame,
        labeldf: pd.DataFrame,
        sample_blocks: Dict[str, List[str]],
        modeldf: DataFrame,
        covdf: pd.DataFrame = pd.DataFrame({})) -> DataFrame:
        """
        Transforms a starting block matrix to the reduced block matrix, using a reducer model produced by the
        RidgeReducer fit method.

        Args:
            blockdf : Spark DataFrame representing the beginning block matrix
            labeldf : Pandas DataFrame containing the target labels used in fitting the ridge models
            sample_blocks: Dict containing a mapping of sample_block ID to a list of corresponding sample IDs
            modeldf : Spark DataFrame produced by the RidgeReducer fit method, representing the reducer model
            covdf : Pandas DataFrame containing covariates to be included in every model in the stacking
                ensemble (optional).

        Returns:
             Spark DataFrame representing the reduced block matrix
        """

        validate_inputs(labeldf, covdf)
        transform_key_pattern = ['header_block', 'sample_block']

        if 'label' in blockdf.columns:
            transform_key_pattern.append('label')
            joined = blockdf.drop('sort_key') \
                .join(modeldf, ['header_block', 'sample_block', 'header'], 'right') \
                .withColumn('label', f.coalesce(f.col('label'), f.col('labels').getItem(0)))
        else:
            joined = blockdf.drop('sort_key') \
                .join(modeldf, ['header_block', 'sample_block', 'header'], 'right')

        transform_udf = pandas_udf(
            lambda key, pdf: apply_model(key, transform_key_pattern, pdf,
                                         labeldf, sample_blocks, self.alphas,
                                         covdf), reduced_matrix_struct,
            PandasUDFType.GROUPED_MAP)

        record_hls_event('wgrRidgeReduceTransform')

        return joined \
            .groupBy(transform_key_pattern) \
            .apply(transform_udf)
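
A usage sketch for the stateless reducer, with fit and transform signatures as shown in Examples #5 and #6 (the no-argument constructor is an assumption):

reducer = RidgeReducer()  # assumed; alphas are generated from block_df when unset
model_df = reducer.fit(block_df, label_df, sample_blocks)
reduced_block_df = reducer.transform(block_df, label_df, sample_blocks, model_df)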
Example #7
    def fit(self) -> (DataFrame, DataFrame):
        """
        Fits a ridge regression model, represented by a Spark DataFrame containing coefficients for each of the ridge
        alpha parameters, for each block in the reduced block matrix, for each label in the target labels, as well as a
        Spark DataFrame containing the optimal ridge alpha value for each label.

        Returns:
            Two Spark DataFrames, one containing the model resulting from the fitting routine and one containing the
            results of the cross validation procedure.
        """

        map_key_pattern = ['sample_block', 'label']
        reduce_key_pattern = ['header_block', 'header', 'label']
        metric = 'r2'

        map_udf = pandas_udf(
            lambda key, pdf: map_normal_eqn(key, map_key_pattern, pdf,
                                            self._std_label_df,
                                            self.sample_blocks,
                                            self._std_cov_df),
            normal_eqn_struct, PandasUDFType.GROUPED_MAP)
        reduce_udf = pandas_udf(
            lambda key, pdf: reduce_normal_eqn(key, reduce_key_pattern, pdf),
            normal_eqn_struct, PandasUDFType.GROUPED_MAP)
        model_udf = pandas_udf(
            lambda key, pdf: solve_normal_eqn(key, map_key_pattern, pdf,
                                              self._std_label_df, self._alphas,
                                              self._std_cov_df),
            model_struct, PandasUDFType.GROUPED_MAP)
        score_udf = pandas_udf(
            lambda key, pdf: score_models(key, map_key_pattern, pdf,
                                          self._std_label_df,
                                          self.sample_blocks, self._alphas,
                                          self._std_cov_df, pd.DataFrame({}),
                                          metric),
            cv_struct, PandasUDFType.GROUPED_MAP)

        self.model_df = self.reduced_block_df \
            .groupBy(map_key_pattern) \
            .apply(map_udf) \
            .groupBy(reduce_key_pattern) \
            .apply(reduce_udf) \
            .groupBy(map_key_pattern) \
            .apply(model_udf)

        self.cv_df = cross_validation(self.reduced_block_df, self.model_df,
                                      score_udf, map_key_pattern, self._alphas,
                                      metric)

        record_hls_event('wgrRidgeRegressionFit')

        return self.model_df, self.cv_df
Example #8
    def transform(self,
                  blockdf: DataFrame,
                  labeldf: pd.DataFrame,
                  sample_blocks: Dict[str, List[str]],
                  modeldf: DataFrame,
                  cvdf: DataFrame,
                  covdf: pd.DataFrame = pd.DataFrame({}),
                  response: str = 'linear') -> pd.DataFrame:
        """
        Generates GWAS covariates for the target labels in the provided label DataFrame by applying the model resulting
        from the LogisticRegression fit method to the starting block matrix.

        Args:
            blockdf : Spark DataFrame representing the beginning block matrix X
            labeldf : Pandas DataFrame containing the target labels used in fitting the ridge models
            sample_blocks : Dict containing a mapping of sample_block ID to a list of corresponding sample IDs
            modeldf : Spark DataFrame produced by the LogisticRegression fit method, representing the reducer model
            cvdf : Spark DataFrame produced by the LogisticRegression fit method, containing the results of the
                cross validation routine.
            covdf : Pandas DataFrame containing covariates to be included in every model in the stacking
                ensemble (optional). The covariates should not include an explicit intercept term, as one will be
                added automatically.
            response : String specifying the desired output.  Can be 'linear' to specify the direct output of the linear
                WGR model (default) or 'sigmoid' to specify predicted label probabilities.

        Returns:
            Pandas DataFrame containing covariate values. The shape and order match labeldf such that the
            rows are indexed by sample ID and the columns by label. The column types are float64.
        """

        block_prediction_df = self.reduce_block_matrix(blockdf, labeldf,
                                                       sample_blocks, modeldf,
                                                       cvdf, covdf, response)
        pivoted_df = flatten_prediction_df(block_prediction_df, sample_blocks,
                                           labeldf)

        record_hls_event('wgrLogisticRegressionTransform')

        return pivoted_df
Example #9
    def fit(self) -> DataFrame:
        """
        Fits a ridge reducer model, represented by a Spark DataFrame containing coefficients for each of the ridge
        alpha parameters, for each block in the starting matrix, for each label in the target labels.

        Returns:
            Spark DataFrame containing the model resulting from the fitting routine.
        """

        map_key_pattern = ['header_block', 'sample_block']
        reduce_key_pattern = ['header_block', 'header']

        if 'label' in self.block_df.columns:
            map_key_pattern.append('label')
            reduce_key_pattern.append('label')

        map_udf = pandas_udf(
            lambda key, pdf: map_normal_eqn(key, map_key_pattern, pdf,
                                            self._std_label_df,
                                            self.sample_blocks,
                                            self._std_cov_df),
            normal_eqn_struct, PandasUDFType.GROUPED_MAP)
        reduce_udf = pandas_udf(
            lambda key, pdf: reduce_normal_eqn(key, reduce_key_pattern, pdf),
            normal_eqn_struct, PandasUDFType.GROUPED_MAP)
        model_udf = pandas_udf(
            lambda key, pdf: solve_normal_eqn(key, map_key_pattern, pdf,
                                              self._std_label_df, self._alphas,
                                              self._std_cov_df),
            model_struct, PandasUDFType.GROUPED_MAP)

        record_hls_event('wgrRidgeReduceFit')

        self.model_df = self.block_df \
            .groupBy(map_key_pattern) \
            .apply(map_udf) \
            .groupBy(reduce_key_pattern) \
            .apply(reduce_udf) \
            .groupBy(map_key_pattern) \
            .apply(model_udf)

        return self.model_df
Example #10
    def transform(self, response: str = 'linear') -> pd.DataFrame:
        """
        Generates GWAS covariates for the target labels in the provided label DataFrame by applying the model resulting
        from the LogisticRidgeRegression fit method to the starting reduced block matrix.

        Args:
            response : String specifying the desired output.  Can be 'linear' to specify the direct output of the linear
                WGR model (default) or 'sigmoid' to specify predicted label probabilities.

        Returns:
            Pandas DataFrame containing covariate values. The shape and order match label_df such that the
            rows are indexed by sample ID and the columns by label. The column types are float64.
        """
        _check_model(self.model_df)
        _check_cv(self.cv_df)

        block_prediction_df = self.reduce_block_matrix(response)
        self.y_hat_df = flatten_prediction_df(block_prediction_df,
                                              self.sample_blocks,
                                              self._label_df)

        record_hls_event('wgrLogisticRegressionTransform')

        return self.y_hat_df
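
In NumPy terms, the 'sigmoid' response simply passes the direct linear output through the logistic function. A one-function sketch:

import numpy as np

def sigmoid(linear_y_hat: np.ndarray) -> np.ndarray:
    # maps linear WGR output to predicted label probabilities in (0, 1)
    return 1.0 / (1.0 + np.exp(-linear_y_hat))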
Example #11
    def fit(
        self,
        blockdf: DataFrame,
        labeldf: pd.DataFrame,
        sample_blocks: Dict[str, List[str]],
        covdf: pd.DataFrame = pd.DataFrame({})
    ) -> (DataFrame, DataFrame):
        """
        Fits a ridge regression model, represented by a Spark DataFrame containing coefficients for each of the ridge
        alpha parameters, for each block in the starting matrix, for each label in the target labels, as well as a
        Spark DataFrame containing the optimal ridge alpha value for each label.

        Args:
            blockdf : Spark DataFrame representing the beginning block matrix X
            labeldf : Pandas DataFrame containing the target labels used in fitting the ridge models
            sample_blocks : Dict containing a mapping of sample_block ID to a list of corresponding sample IDs
            covdf : Pandas DataFrame containing covariates to be included in every model in the stacking
                ensemble (optional).

        Returns:
            Two Spark DataFrames, one containing the model resulting from the fitting routine and one containing the
            results of the cross validation procedure.
        """

        validate_inputs(labeldf, covdf)
        map_key_pattern = ['sample_block', 'label']
        reduce_key_pattern = ['header_block', 'header', 'label']

        if not self.alphas:
            self.alphas = generate_alphas(blockdf)

        map_udf = pandas_udf(
            lambda key, pdf: map_normal_eqn(key, map_key_pattern, pdf, labeldf,
                                            sample_blocks, covdf),
            normal_eqn_struct, PandasUDFType.GROUPED_MAP)
        reduce_udf = pandas_udf(
            lambda key, pdf: reduce_normal_eqn(key, reduce_key_pattern, pdf),
            normal_eqn_struct, PandasUDFType.GROUPED_MAP)
        model_udf = pandas_udf(
            lambda key, pdf: solve_normal_eqn(key, map_key_pattern, pdf,
                                              labeldf, self.alphas, covdf),
            model_struct, PandasUDFType.GROUPED_MAP)
        score_udf = pandas_udf(
            lambda key, pdf: score_models(key, map_key_pattern, pdf, labeldf,
                                          sample_blocks, self.alphas, covdf),
            cv_struct, PandasUDFType.GROUPED_MAP)

        modeldf = blockdf \
            .groupBy(map_key_pattern) \
            .apply(map_udf) \
            .groupBy(reduce_key_pattern) \
            .apply(reduce_udf) \
            .groupBy(map_key_pattern) \
            .apply(model_udf)

        # Break ties in favor of the larger alpha
        alpha_df = blockdf.sql_ctx \
            .createDataFrame([Row(alpha=k, alpha_value=float(v)) for k, v in self.alphas.items()])
        window_spec = Window.partitionBy('label').orderBy(
            f.desc('r2_mean'), f.desc('alpha_value'))

        cvdf = blockdf.drop('header_block', 'sort_key') \
            .join(modeldf, ['header', 'sample_block'], 'right') \
            .withColumn('label', f.coalesce(f.col('label'), f.col('labels').getItem(0))) \
            .groupBy(map_key_pattern) \
            .apply(score_udf) \
            .join(alpha_df, ['alpha']) \
            .groupBy('label', 'alpha', 'alpha_value').agg(f.mean('r2').alias('r2_mean')) \
            .withColumn('modelRank', f.row_number().over(window_spec)) \
            .filter('modelRank = 1') \
            .drop('modelRank')

        record_hls_event('wgrRidgeRegressionFit')

        return modeldf, cvdf
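
The window specification above ranks each label's alphas by mean r2 and breaks ties toward the larger alpha value. A toy PySpark illustration of the same row_number pattern:

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as f

spark = SparkSession.builder.getOrCreate()
scores = spark.createDataFrame(
    [('y0', 'a1', 0.1, 0.80), ('y0', 'a2', 1.0, 0.80), ('y0', 'a3', 10.0, 0.75)],
    ['label', 'alpha', 'alpha_value', 'r2_mean'])

w = Window.partitionBy('label').orderBy(f.desc('r2_mean'), f.desc('alpha_value'))
best = scores.withColumn('modelRank', f.row_number().over(w)) \
             .filter('modelRank = 1') \
             .drop('modelRank')
best.show()  # y0 keeps a2: it ties a1 on r2_mean, and the larger alpha wins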
Example #12
    def fit(
        self,
        blockdf: DataFrame,
        labeldf: pd.DataFrame,
        sample_blocks: Dict[str, List[str]],
        covdf: pd.DataFrame = pd.DataFrame({})
    ) -> (DataFrame, DataFrame):
        """
        Fits a logistic regression model, represented by a Spark DataFrame containing coefficients for each of the ridge
        alpha parameters, for each block in the starting matrix, for each label in the target labels, as well as a
        Spark DataFrame containing the optimal ridge alpha value for each label.

        Args:
            blockdf : Spark DataFrame representing the beginning block matrix X
            labeldf : Pandas DataFrame containing the target labels used in fitting the ridge models
            sample_blocks : Dict containing a mapping of sample_block ID to a list of corresponding sample IDs
            covdf : Pandas DataFrame containing covariates to be included in every model in the stacking
                ensemble (optional).  The covariates should not include an explicit intercept term, as one will be
                added automatically.  If empty, the intercept will be used as the only covariate.

        Returns:
            Two Spark DataFrames, one containing the model resulting from the fitting routine and one containing the
            results of the cross validation procedure.
        """
        map_key_pattern = ['sample_block', 'label', 'alpha_name']
        reduce_key_pattern = ['header_block', 'header', 'label', 'alpha_name']
        model_key_pattern = ['sample_block', 'label', 'alpha_name']
        score_key_pattern = ['sample_block', 'label']
        metric = 'log_loss'

        if not self.alphas:
            self.alphas = generate_alphas(blockdf)

        if covdf.empty:
            covdf = pd.DataFrame(data=np.ones(labeldf.shape[0]),
                                 columns=['intercept'],
                                 index=labeldf.index)
            validate_inputs(labeldf, pd.DataFrame({}), 'binary')
        else:
            covdf = covdf.copy()
            validate_inputs(labeldf, covdf, 'binary')
            covdf.insert(0, 'intercept', 1)

        maskdf = pd.DataFrame(data=np.where(np.isnan(labeldf), False, True),
                              columns=labeldf.columns,
                              index=labeldf.index)

        beta_cov_dict = {}
        for label in labeldf:
            row_mask = slice_label_rows(maskdf, label, list(labeldf.index),
                                        np.array([])).ravel()
            cov_mat = slice_label_rows(covdf, 'all', list(labeldf.index),
                                       row_mask)
            y = slice_label_rows(labeldf, label, list(labeldf.index),
                                 row_mask).ravel()
            fit_result = constrained_logistic_fit(cov_mat,
                                                  y,
                                                  np.zeros(cov_mat.shape[1]),
                                                  guess=np.array([]),
                                                  n_cov=0)
            beta_cov_dict[label] = fit_result.x

        map_udf = pandas_udf(
            lambda key, pdf: map_irls_eqn(
                key, map_key_pattern, pdf, labeldf, sample_blocks, covdf,
                beta_cov_dict, maskdf, self.alphas), irls_eqn_struct,
            PandasUDFType.GROUPED_MAP)

        reduce_udf = pandas_udf(
            lambda key, pdf: reduce_irls_eqn(key, reduce_key_pattern, pdf),
            irls_eqn_struct, PandasUDFType.GROUPED_MAP)

        model_udf = pandas_udf(
            lambda key, pdf: solve_irls_eqn(key, model_key_pattern, pdf,
                                            labeldf, self.alphas, covdf),
            model_struct, PandasUDFType.GROUPED_MAP)

        score_udf = pandas_udf(
            lambda key, pdf:
            score_models(key, score_key_pattern, pdf, labeldf, sample_blocks,
                         self.alphas, covdf, maskdf, metric), cv_struct,
            PandasUDFType.GROUPED_MAP)

        modeldf = blockdf.drop('alpha') \
            .withColumn('alpha_name', f.explode(f.array([f.lit(n) for n in self.alphas.keys()]))) \
            .groupBy(map_key_pattern) \
            .apply(map_udf) \
            .groupBy(reduce_key_pattern) \
            .apply(reduce_udf) \
            .groupBy(model_key_pattern) \
            .apply(model_udf) \
            .withColumn('alpha_label_coef', f.expr('struct(alphas[0] AS alpha, labels[0] AS label, coefficients[0] AS coefficient)')) \
            .groupBy('header_block', 'sample_block', 'header', 'sort_key', f.col('alpha_label_coef.label')) \
            .agg(f.sort_array(f.collect_list('alpha_label_coef')).alias('alphas_labels_coefs')) \
            .selectExpr('*', 'alphas_labels_coefs.alpha AS alphas', 'alphas_labels_coefs.label AS labels', 'alphas_labels_coefs.coefficient AS coefficients') \
            .drop('alphas_labels_coefs', 'label')

        cvdf = cross_validation(blockdf, modeldf, score_udf, score_key_pattern,
                                self.alphas, metric)

        record_hls_event('wgrLogisticRegressionFit')

        return modeldf, cvdf
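
The map/reduce/solve IRLS UDFs distribute iteratively reweighted least squares for the L2-penalized logistic model. A single-machine sketch of one IRLS (Newton) step, assuming a plain ridge penalty alpha (the library's exact covariate and penalty handling is not shown here):

import numpy as np

def irls_step(X: np.ndarray, y: np.ndarray, beta: np.ndarray, alpha: float) -> np.ndarray:
    p = 1.0 / (1.0 + np.exp(-X @ beta))  # current predicted probabilities
    W = p * (1.0 - p)                    # diagonal IRLS weights
    hessian = X.T @ (W[:, None] * X) + alpha * np.eye(X.shape[1])
    gradient = X.T @ (y - p) - alpha * beta  # penalized score vector
    return beta + np.linalg.solve(hessian, gradient)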
Example #13
    def fit(self) -> (DataFrame, DataFrame):
        """
        Fits a logistic regression model, represented by a Spark DataFrame containing coefficients for each of the ridge
        alpha parameters, for each block in the reduced block matrix, for each label in the target labels, as well as a
        Spark DataFrame containing the optimal ridge alpha value for each label.

        Returns:
            Two Spark DataFrames, one containing the model resulting from the fitting routine and one containing the
            results of the cross validation procedure.
        """
        map_key_pattern = ['sample_block', 'label', 'alpha_name']
        reduce_key_pattern = ['header_block', 'header', 'label', 'alpha_name']
        model_key_pattern = ['sample_block', 'label', 'alpha_name']
        score_key_pattern = ['sample_block', 'label']
        metric = 'log_loss'

        maskdf = pd.DataFrame(data=np.where(np.isnan(self._label_df), False, True),
                              columns=self._label_df.columns,
                              index=self._label_df.index)

        beta_cov_dict = {}

        for label in self._label_df:
            if self._std_cov_df.empty:
                beta_cov_dict[label] = np.array([])
            else:
                row_mask = slice_label_rows(maskdf, label,
                                            list(self._label_df.index),
                                            np.array([])).ravel()

                cov_mat = slice_label_rows(self._std_cov_df, 'all',
                                           list(self._label_df.index),
                                           row_mask)
                y = slice_label_rows(self._label_df, label,
                                     list(self._label_df.index),
                                     row_mask).ravel()
                fit_result = constrained_logistic_fit(cov_mat,
                                                      y,
                                                      np.zeros(cov_mat.shape[1]),
                                                      guess=np.array([]),
                                                      n_cov=0)
                beta_cov_dict[label] = fit_result.x

        map_udf = pandas_udf(
            lambda key, pdf: map_irls_eqn(
                key, map_key_pattern, pdf, self._label_df, self.sample_blocks,
                self._std_cov_df, beta_cov_dict, maskdf, self._alphas),
            irls_eqn_struct, PandasUDFType.GROUPED_MAP)

        reduce_udf = pandas_udf(
            lambda key, pdf: reduce_irls_eqn(key, reduce_key_pattern, pdf),
            irls_eqn_struct, PandasUDFType.GROUPED_MAP)

        model_udf = pandas_udf(
            lambda key, pdf: solve_irls_eqn(key, model_key_pattern, pdf,
                                            self._label_df, self._alphas,
                                            self._std_cov_df),
            model_struct, PandasUDFType.GROUPED_MAP)

        score_udf = pandas_udf(
            lambda key, pdf: score_models(key, score_key_pattern, pdf,
                                          self._label_df, self.sample_blocks,
                                          self._alphas, self._std_cov_df,
                                          maskdf, metric),
            cv_struct, PandasUDFType.GROUPED_MAP)

        self.model_df = self.reduced_block_df.drop('alpha') \
            .withColumn('alpha_name', f.explode(f.array([f.lit(n) for n in self._alphas.keys()]))) \
            .groupBy(map_key_pattern) \
            .apply(map_udf) \
            .groupBy(reduce_key_pattern) \
            .apply(reduce_udf) \
            .groupBy(model_key_pattern) \
            .apply(model_udf) \
            .withColumn('alpha_label_coef', f.expr('struct(alphas[0] AS alpha, labels[0] AS label, coefficients[0] AS coefficient)')) \
            .groupBy('header_block', 'sample_block', 'header', 'sort_key', f.col('alpha_label_coef.label')) \
            .agg(f.sort_array(f.collect_list('alpha_label_coef')).alias('alphas_labels_coefs')) \
            .selectExpr('*', 'alphas_labels_coefs.alpha AS alphas', 'alphas_labels_coefs.label AS labels', 'alphas_labels_coefs.coefficient AS coefficients') \
            .drop('alphas_labels_coefs', 'label')

        self.cv_df = cross_validation(self.reduced_block_df, self.model_df,
                                      score_udf, score_key_pattern,
                                      self._alphas, metric)

        record_hls_event('wgrLogisticRegressionFit')

        return self.model_df, self.cv_df
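
A hypothetical end-to-end sketch of the stateful logistic workflow (constructor arguments assumed; fit is shown above and transform in Example #10):

log_reg = LogisticRidgeRegression(reduced_block_df, label_df, sample_blocks)  # assumed
model_df, cv_df = log_reg.fit()
prob_df = log_reg.transform(response='sigmoid')  # pandas DataFrame of probabilities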