# Imports assumed by this snippet (a best-guess reconstruction; the original
# file's import block is not shown here, and Log is a project-local logging
# helper defined elsewhere)
import glob
import gzip
import os
import random
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from IPython.display import HTML, display
from scipy.sparse import csr_matrix
from sklearn import metrics
from sklearn.decomposition import NMF


class JobAppDatasets:

    ## DATASET LOADING ##
    def __init__(self):
        '''
        The constructor for the JobAppDatasets class. Loads the TSV data files
        into Pandas DataFrames from a local "data" folder, downloading them
        first if the folder does not exist.
        '''
        if not os.path.isdir("data"):
            self.load_data_from_storage()

        # Initialize logger
        self.log = Log()

        # Load users
        self.users = pd.read_csv("data/users.tsv", sep="\t")
        self.assign_user_groups()
        self.log.info("users.tsv", "h4")
        self.log.info("Holds all users and their metadata", "p")
        display(self.users.head(2))

        # Load apps
        self.apps = pd.read_csv("data/apps.tsv", sep="\t")
        self.log.info("apps.tsv", "h4")
        self.log.info("Holds the applications users submitted", "p")
        display(self.apps.head(2))

        # Load jobs assigned to first window
        self.jobs1 = pd.read_csv("data/jobs1.tsv", sep="\t")
        self.log.info("jobs1.tsv", "h4")
        self.log.info(
            "Holds the jobs available on CareerBuilder.com during a 13-day window",
            "p")
        display(self.jobs1.head(2))

        # Load user history
        self.user_history = pd.read_csv("data/user_history.tsv", sep="\t")
        self.log.info("user_history.tsv", "h4")
        self.log.info("Holds users' past job title(s)", "p")
        display(self.user_history.head(2))

        # Load window dates
        self.window_dates = pd.read_csv("data/window_dates.tsv", sep="\t")
        self.log.info("window_dates.tsv", "h4")
        self.log.info("Holds the application window dates", "p")
        display(self.window_dates)

    def load_data_from_storage(self):
        '''
        Retrieves the data files from a remote storage endpoint and
        writes them to a local folder named "data".

        Parameters:
            None

        Returns:
            None
        '''
        # Retrieve zip file from Dropbox and write to base/default folder
        r = requests.get(
            "https://www.dropbox.com/s/v2fdobitjrjieku/data.zip?dl=1")
        with open("data.zip", 'wb') as f:
            f.write(r.content)

        # Extract zip file contents to create local data folder with .tsv.gz files
        with zipfile.ZipFile("data.zip", 'r') as zip_ref:
            zip_ref.extractall(".")

        # For each unzipped file path
        for path in glob.glob("data/*.tsv.gz"):

            # Create destination file path
            dest_path = f'data/{os.path.basename(path)[:-3]}'

            # Open unzipped file for reading and destination file for writing
            with open(path, 'rb') as f:
                with open(dest_path, 'wb') as g:

                    # Decompress unzipped file data and write to destination
                    decompressed = gzip.decompress(f.read())
                    g.write(decompressed)

            # Delete original compressed file
            os.remove(path)

        # Delete zip file
        os.remove("data.zip")

    def assign_user_groups(self):
        '''
        Assigns each user to a user group.
        '''
        def assign_to_educ_group(degree_type):

            if degree_type in ["Associate's", "Bachelor's", "Vocational"]:
                return "College"

            elif degree_type in ["Master's", "PhD"]:
                return "Post-Graduate"

            else:
                return degree_type

        self.users["Group"] = (
            self.users["DegreeType"].apply(lambda d: assign_to_educ_group(d)))

    ## DATA SUMMARY AND VISUALIZATION ##
    def preview_csr_matrix(self, csr):
        '''
        Displays a portion of the CSR matrix.

        Parameters:
            csr (scipy.sparse.csr_matrix): the matrix

        Returns:
            None
        '''
        plt.figure(figsize=(50, 50))
        tick_range = np.arange(0, 80000, 100)
        plt.yticks(tick_range, list(tick_range))

        plt.xlabel("JobID")
        plt.ylabel("UserID")
        plt.spy(csr, markersize=1, origin="lower")

    def summarize_csr_matrix(self, csr):
        '''
        Summarizes the CSR matrix.

        Parameters:
            csr (scipy.sparse.csr_matrix): the matrix

        Returns:
            None
        '''
        num_rows, num_cols = csr.shape
        num_cells = num_rows * num_cols
        num_entries = csr.nnz  # here, the number of apps
        num_zeroes = num_cells - num_entries
        sparsity = (num_zeroes / num_cells) * 100

        display(
            HTML(
                f"<h5>The CSR matrix has {num_rows:,} rows and " +
                f"{num_cols:,} columns with {num_cells:,} matrix cells total.</h5>"
            ))

        display(
            HTML(f"<h5>{num_zeroes:,} of those elements are zeroes, so " +
                 f"the matrix is ~{sparsity:.2f} percent sparse.</h5>"))

    ## DATA RETRIEVAL ##
    def get_jobs_dataset(self, window_id):
        '''
        Retrieves the job dataset corresponding to the given window id.

        Parameters:
            window_id (int): the window id

        Returns:
            (pd.DataFrame): the jobs DataFrame
        '''
        if window_id == 1:
            return self.jobs1
        else:
            return None

    def get_window_date_range(self, window_id):
        '''
        Generates a DataFrame representing the date range of the application window.
        
        Parameters:
            window_id (int): the window id
            
        Returns:
            pd.DataFrame: the date range DataFrame. Contains one column, "Date",
                          whose entries run from the window's "Train Start" to
                          its "Test End", formatted as "yyyy-MM-dd", as well as
                          an index containing the same values.
        '''
        window_start = self.window_dates.query(
            "Window == @window_id")["Train Start"].iloc[0]
        window_end = self.window_dates.query(
            "Window == @window_id")["Test End"].iloc[0]
        return (pd.date_range(start=window_start, end=window_end,
                              freq="D").to_frame().rename(columns={0: "Date"}))

    ## CREATION OF TRAIN AND TEST SETS ##
    def create_csr_matrix(self, app_ids, user_lookup, job_lookup,
                          users_as_rows, log):
        '''
        Creates a CSR ("Compressed Sparse Row") matrix.

        Parameters:
            app_ids (pd.DataFrame): the UserID-JobID pairs related to an app
            user_lookup (dict<int, int>): a lookup of user index by UserID
            job_lookup (dict<int, int>): a lookup of job index by JobID
            users_as_rows (bool): whether the matrix should have users as
                                  rows and jobs as columns
            log (Log): the logger used to report progress

        Returns:
            (scipy.sparse.csr_matrix): the sparse matrix
        '''
        users = app_ids["UserID"].apply(lambda id: user_lookup[id]).tolist()
        jobs = app_ids["JobID"].apply(lambda id: job_lookup[id]).tolist()

        data = [1] * len(app_ids)
        data_rows = users if users_as_rows else jobs
        data_columns = jobs if users_as_rows else users
        num_rows = len(user_lookup) if users_as_rows else len(job_lookup)
        num_cols = len(job_lookup) if users_as_rows else len(user_lookup)

        log.info(f"Data Length: {len(data)}", "p")

        # Note: csr_matrix sums duplicate (row, column) entries, so a user
        # who applied to the same job more than once yields a cell value > 1
        csr = csr_matrix((data, (data_rows, data_columns)),
                         shape=(num_rows, num_cols),
                         dtype=np.int8)

        return csr
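
    # A minimal usage sketch for create_csr_matrix. The values below are
    # hypothetical and `datasets` is assumed to be a JobAppDatasets instance;
    # kept as comments so the class body stays importable:
    #
    #   app_ids = pd.DataFrame({"UserID": [7, 7, 9], "JobID": [101, 102, 101]})
    #   user_lookup = {7: 0, 9: 1}     # UserID -> row index
    #   job_lookup = {101: 0, 102: 1}  # JobID -> column index
    #   csr = datasets.create_csr_matrix(app_ids, user_lookup, job_lookup,
    #                                    users_as_rows=True, log=Log())
    #   csr.toarray()  # -> [[1, 1], [1, 0]]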

    def create_train_and_test_sets(self,
                                   window_id,
                                   users_as_rows=True,
                                   app_threshold=5,
                                   subset_pct=0.5,
                                   pct_test=0.2,
                                   seed=1,
                                   random_mask=True,
                                   log_disabled=False):
        '''
        Creates train and test CSR matrices. The matrices are identical
        except that the train matrix has masked (i.e., 0) values for the
        applications chosen for testing (either at random or, if random_mask
        is False, those submitted during the testing portion of the
        application window).

        Parameters:
            window_id (int): the window id
            users_as_rows (bool): whether matrices should have users as rows
                                  and jobs as columns
            app_threshold (int): the minimum number of apps a user must send
                                 (or a job must receive) to be included
            subset_pct (float): the fraction of filtered apps to randomly keep
            pct_test (float): the fraction of kept apps to mask for testing
            seed (int): the seed for the random generator
            random_mask (bool): whether to mask apps at random rather than by
                                test window
            log_disabled (bool): whether logging is suppressed

        Returns:
            (scipy.sparse.csr_matrix, scipy.sparse.csr_matrix, pd.DataFrame,
             dict, dict):
                a tuple of five elements: (1) a CSR matrix representing the
                training set; (2) a CSR matrix representing the testing set;
                (3) a Pandas DataFrame holding user and job ids of the
                applications that were masked; (4) the UserID-to-row-index
                lookup; and (5) the JobID-to-column-index lookup.
        '''
        log = Log(log_disabled)

        log.header(f"Preparing data for application Window {window_id} "
                   "train and test sets...")

        # Restrict applications to users meeting app submission threshold
        log.header(
            "Filtering data to only include most popular jobs and users (i.e., "
            f"those sending or receiving {app_threshold} or more applications "
            "in the given window)")
        app_ids = self.filter_applications(window_id, app_threshold, log)
        log.detail(f"{len(app_ids)} apps after filtering")

        # Select a random subset of app ids for the train and test sets
        log.header(f"Selecting random {subset_pct * 100} percent "
                   "of these apps...")
        random.seed(seed * 2)
        num_samples = int(np.ceil(subset_pct * len(app_ids)))
        population = app_ids.index.tolist()
        samples = random.sample(population, num_samples)
        app_ids = app_ids.loc[app_ids.index.isin(samples)]
        log.detail(f"{len(app_ids)} apps chosen by random selection")

        # Create id-index lookup for the qualified users
        user_ids = app_ids["UserID"].sort_values().unique().tolist()
        user_lookup = {val: idx for idx, val in enumerate(user_ids)}
        log.detail(f"Users successfully filtered. {len(user_ids)} users "
                   f"submitted {app_threshold} or more apps in the window.")

        # Create id-index lookup for jobs in current window
        job_ids = app_ids["JobID"].sort_values().unique().tolist()
        job_lookup = {val: idx for idx, val in enumerate(job_ids)}
        log.detail(
            "Jobs successfully filtered. The qualified users submitted "
            f"apps to {len(job_ids)} different qualified jobs in the window.")
        log.detail("Filtering complete")

        # Mask a percentage of applications for testing
        log.header("Masking applications to use as test cells...")
        masked_ids = self.mask_applications(app_ids, pct_test, seed,
                                            random_mask, window_id, log)
        log.detail(
            f"{len(masked_ids)} app ids successfully masked with zeroes.")

        # Finalize application ids used for training and test sets
        log.header(
            "Finalizing application ids to use in train and test sets...")
        train_app_ids = (app_ids.merge(
            masked_ids, on=["UserID", "JobID"], how="outer",
            indicator=True).query("_merge == 'left_only'")[["UserID",
                                                            "JobID"]])
        test_app_ids = app_ids
        log.header("Data preparation complete")

        # Create training and testing matrices
        log.header("Creating train and test sets...")
        log.detail("Creating CSR train matrix...")
        train_matrix = self.create_csr_matrix(train_app_ids, user_lookup,
                                              job_lookup, users_as_rows, log)

        log.detail("Creating CSR test matrix...")
        test_matrix = self.create_csr_matrix(test_app_ids, user_lookup,
                                             job_lookup, users_as_rows, log)

        log.detail("Training and testing sets successfully created")

        return (train_matrix, test_matrix, masked_ids, user_lookup, job_lookup)
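
    # A hedged usage sketch (assumes the TSV files are available so the
    # constructor can load them; the parameter values are illustrative):
    #
    #   datasets = JobAppDatasets()
    #   train, test, masked, user_lookup, job_lookup = (
    #       datasets.create_train_and_test_sets(window_id=1,
    #                                           app_threshold=5,
    #                                           subset_pct=0.5,
    #                                           pct_test=0.2))
    #   datasets.summarize_csr_matrix(train)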

    def filter_applications(self, window_id, app_threshold, log):
        '''
        Retrieves all job applications submitted during a given window and then
        filters them to include only those whose user submitted, and whose job
        received, a minimum "threshold" number of apps (e.g., 2 or more,
        5 or more, etc.).

        Parameters:
            window_id (int): the window id
            app_threshold (int): the number of apps a user must submit (or a
                                 job must receive) to be included in the train
                                 or test datasets
        
        Returns:
            (pd.DataFrame): the filtered job application DataFrame. Contains
                            only "UserID" and "JobID" columns.
        '''
        # Filter job application DataFrame to show app counts by user
        app_counts_by_user = (self.apps.query("WindowID == @window_id").
                              groupby("UserID").size().to_frame().rename(
                                  columns={
                                      0: "count"
                                  }).query("count >= @app_threshold"))

        # Display app counts for improved troubleshooting
        app_counts_for_display = (app_counts_by_user["count"].sort_values(
            ascending=False).to_frame())

        log.dataframe(app_counts_for_display)

        # Filter job application DataFrame to show app counts by job
        app_counts_by_job = (self.apps.query("WindowID == @window_id").groupby(
            "JobID").size().to_frame().rename(columns={
                0: "count"
            }).query("count >= @app_threshold"))

        # Display app counts for improved troubleshooting
        app_counts_for_display = (app_counts_by_job["count"].sort_values(
            ascending=False).to_frame())

        log.dataframe(app_counts_for_display)

        # Return app ids belonging to qualified users and jobs in the window
        qualified_users = app_counts_by_user.index.tolist()
        qualified_jobs = app_counts_by_job.index.tolist()
        query = ("WindowID == @window_id & UserID in @qualified_users "
                 "& JobID in @qualified_jobs")
        app_ids = self.apps.query(query)[["UserID", "JobID"]]

        return app_ids
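
    # A toy illustration of the threshold filter used above (hypothetical
    # data; a threshold of 2 keeps only user 1):
    #
    #   apps = pd.DataFrame({"UserID": [1, 1, 2], "JobID": [10, 11, 10]})
    #   counts = (apps.groupby("UserID").size().to_frame()
    #             .rename(columns={0: "count"}).query("count >= 2"))
    #   counts.index.tolist()  # -> [1]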

    def mask_applications(self, app_ids, pct_test, seed, random_mask,
                          window_id, log):
        '''
        Masks applications according to one of two strategies:
        (1) at random for a certain given percentage or (2) targeting only
        test users' applications in the test window.

        Parameters:
            app_ids (pd.DataFrame): the UserID and JobID columns comprising
                                    the application ids
            pct_test (float): the percentage of apps to assign to the test
                              group. Only relevant if "random_mask" is True.
            seed (int): the seed for the random generator. Only relevant if
                        "random_mask" is True.
            random_mask (bool): whether the random masking strategy
                                should be used.
            window_id (int): the window id. Only relevant if "random_mask"
                             is False.

        Returns:
            (pd.DataFrame): a subset of the app_ids DataFrame indicating
                            applications that should be masked
        '''
        if random_mask:

            log.detail("Randomly masking applications in dataset...")

            # Initialize random seed
            random.seed(seed)
            log.detail(f"Initializing random generator with seed {seed}...")

            # Initialize number of samples and population to draw from
            num_samples = int(np.ceil(pct_test * len(app_ids)))
            population = app_ids.index.tolist()
            log.detail(f"Reading configuration setting...{pct_test * 100} "
                       "percent of apps should be used for testing out of the "
                       f"{len(population)} total available...")

            # Sample from population of indices
            samples = random.sample(population, num_samples)
            log.detail(
                f"{len(samples)} app ids randomly chosen for masking...")

            # Mask apps corresponding to sampled/selected index values
            masked_ids = app_ids.loc[app_ids.index.isin(samples)]

        else:
            log.detail("Masking apps made by test users in test window...")

            # Determine date range for testing window
            window_start = (self.window_dates.query("Window == @window_id")
                            ["Test Start"].iloc[0])
            window_end = (self.window_dates.query("Window == @window_id")
                          ["Test End"].iloc[0])
            log.detail(f"The test window is {window_start} to {window_end}")

            # Filter apps to only include those in testing window
            masked_ids_query = ("WindowID == @window_id & " +
                                "Split == 'Test' & " +
                                "ApplicationDate >= @window_start & " +
                                "ApplicationDate <= @window_end")
            masked_ids = self.apps.query(masked_ids_query)[["UserID", "JobID"]]

        return masked_ids

    ## CROSS VALIDATION ##
    def perform_nmf(self,
                    csr,
                    k,
                    init="nndsvd",
                    max_iter=500,
                    random_state=0,
                    alpha=0):
        '''
        Performs Non-Negative Matrix Factorization (NMF) of a compressed
        sparse matrix using the sklearn.decomposition library.

        Parameters:
            csr (scipy.sparse.csr_matrix): the sparse matrix

            k (int): the number of principal components to use

            init (str): the method used to initialize the procedure. Defaults
                        to 'nndsvd' (Nonnegative Double Singular Value
                        Decomposition).

            max_iter (int): the maximum number of iterations to perform

            random_state (int): the seed used to make the factorization
                                reproducible

            alpha (float): the regularization strength. 0 disables
                           regularization.

        Returns:
            (array): the row by component aspect of the data. For example, if
                the original matrix had users as rows and jobs as 
                features/columns, was of size 1000 x 20, and was decomposed into
                2 components, the method would return a user-job aspect matrix
                of size 1000 by 2.

            (array): the component by feature aspect of the data. For example,
                if the original matrix had users as rows and jobs as 
                features/columns, was of size 1000 x 20, and was decomposed into
                2 components, the method would return a job aspect-job matrix
                of size 2 by 20.
        '''
        # Note: scikit-learn deprecated NMF's "alpha" argument in 1.0 and
        # removed it in 1.2 in favor of "alpha_W" and "alpha_H"; this call
        # assumes an older version
        model = NMF(n_components=k,
                    init=init,
                    max_iter=max_iter,
                    random_state=random_state,
                    alpha=alpha)
        W = model.fit_transform(csr)
        H = model.components_
        return W, H
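
    # A minimal sketch of how the factorization is typically consumed
    # (hypothetical; `train` is a CSR matrix from create_train_and_test_sets):
    #
    #   W, H = datasets.perform_nmf(train, k=15)
    #   scores = W @ H           # dense reconstruction of the train matrix
    #   scores[user_idx, :]     # predicted affinity of one user to every job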

    def tune_hyperparameters(self,
                             window_id=1,
                             users_as_rows=True,
                             app_threshold=5,
                             pct_test=0.2,
                             subset_pct=0.5,
                             num_iterations=10,
                             max_nmf_iterations=500,
                             k_range=(10, 20),
                             random_mask=True):
        '''
        Tunes the number of latent features (and the regularization
        parameter) by repeatedly splitting the data with different random
        seeds, running NMF over the candidate values, and scoring each fit
        with mean AUC on the masked test cells.

        Parameters:
            window_id (int): the window id
            users_as_rows (bool): whether matrices have users as rows
            app_threshold (int): the minimum app count for users and jobs
            pct_test (float): the fraction of apps to mask for testing
            subset_pct (float): the fraction of filtered apps to keep
            num_iterations (int): the number of random train/test splits
            max_nmf_iterations (int): the maximum NMF iterations per fit
            k_range (tuple<int, int>): the inclusive range of latent feature
                                       counts to try
            random_mask (bool): whether to mask apps at random

        Returns:
            (int, float, tuple): the best number of latent features, the
                best regularization parameter, and the (W, H) pair of the
                best-scoring model
        '''
        # Initialize list of auc metrics
        aucs = []

        # Initialize list of best number of latent features at each iteration
        best_latent_features = []

        # Initialize best model
        best_model = None

        # Initialize best mean AUC
        best_mean_auc = -1

        # Initialize best alphas
        best_alphas = []

        # Initialize list of regularization parameters to try
        alphas = [0]

        # Initialize range of latent factor counts to tune
        min_num_components = k_range[0]
        max_num_components = k_range[1] + 1

        # Iterate over range of random seeds
        for seed in range(num_iterations):

            self.log.header(f"Iteration {seed + 1}")

            # Split data into training and test sets.
            # Different random seeds will ensure different splits of data.
            train, test, masked, user_lookup, job_lookup = self.create_train_and_test_sets(
                window_id=window_id,
                users_as_rows=users_as_rows,
                app_threshold=app_threshold,
                subset_pct=subset_pct,
                pct_test=pct_test,
                seed=seed,
                random_mask=random_mask,
                log_disabled=True)

            # Keep track of AUC metrics for each CV fold
            cv_auc = {}

            # Iterate over range of latent features
            for num_components in range(min_num_components,
                                        max_num_components):
                for alpha in alphas:

                    # Define model
                    # Note: Initialize estimates using non-negative double SVD,
                    # which is better for sparseness
                    self.log.header(f"Running NMF with k = {num_components} "
                                    f"and alpha = {alpha}")
                    user_vecs, item_vecs = self.perform_nmf(
                        train,
                        k=num_components,
                        init="nndsvd",
                        max_iter=max_nmf_iterations,
                        random_state=0,
                        alpha=alpha)

                    # Calculate mean AUC on test data
                    mean_auc, popular_auc = self.calc_mean_auc(
                        train=train,
                        masked_apps=masked,
                        predictions=[
                            csr_matrix(user_vecs),
                            csr_matrix(item_vecs)
                        ],
                        test=test,
                        user_lookup=user_lookup,
                        job_lookup=job_lookup)

                    # Update best mean AUC and model
                    if mean_auc >= best_mean_auc:
                        best_mean_auc = mean_auc
                        best_model = (user_vecs, item_vecs)

                    # Log results
                    self.log.detail(f"Number of Components: {num_components}")
                    self.log.detail(f"Mean AUC: {mean_auc}")
                    self.log.detail(f"Popular AUC: {popular_auc}")

                    # Keep track of AUC score corresponding to each number
                    # of latent features and regularization param
                    cv_auc[(num_components, alpha)] = (mean_auc, popular_auc)

            # Define the "best" number of latent features
            # as the one with the highest AUC
            best_latent_feature, best_alpha = self.get_key(
                cv_auc, max(cv_auc.values()))
            best_alphas.append(best_alpha)

            best_latent_features.append(best_latent_feature)
            cv_auc_mean = np.mean(list(cv_auc.values()))
            aucs.append(cv_auc_mean)

        # Define best number of latent features as rounded mean of best latent features across random seeds
        overall_best_latent_feature = int(
            np.round(np.mean(best_latent_features)))
        overall_best_alpha = np.mean(best_alphas)
        self.log.header(
            f"Optimal number of latent features: {overall_best_latent_feature}"
        )
        self.log.header(
            f"Optimal regularization parameter: {overall_best_alpha}")

        return overall_best_latent_feature, overall_best_alpha, best_model
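
    # A hedged usage sketch (the parameter values are illustrative):
    #
    #   best_k, best_alpha, best_model = datasets.tune_hyperparameters(
    #       window_id=1, num_iterations=5, k_range=(10, 20))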

    def train_data(self,
                   num_components,
                   alpha,
                   window_id=1,
                   users_as_rows=True,
                   app_threshold=5,
                   pct_test=0.2,
                   subset_pct=0.5,
                   num_iterations=10,
                   max_nmf_iterations=500,
                   random_mask=True):
        '''
        Trains NMF models over several random train/test splits using the
        given (tuned) hyperparameters and collects AUC metrics overall, for a
        popularity benchmark, and per user group.

        Parameters:
            num_components (int): the number of latent features
            alpha (float): the regularization parameter
            (the remaining parameters are as in tune_hyperparameters)

        Returns:
            (list, list, list, list, list): per-fold lists of mean AUCs,
                popularity-benchmark AUCs, group AUC dicts, group prediction
                dicts, and group actuals dicts
        '''
        # Keep track of AUC metrics for each CV fold
        mean_aucs_for_folds = []
        popular_aucs_for_folds = []
        group_aucs_for_folds = []
        group_preds_for_folds = []
        group_actual_for_folds = []

        # Iterate over range of random seeds
        for seed in range(num_iterations):

            self.log.header(f"Fold {seed + 1}")

            # Split data into training and test sets.
            # Different random seeds will ensure different splits of data.
            train, test, masked, user_lookup, job_lookup = self.create_train_and_test_sets(
                window_id=window_id,
                users_as_rows=users_as_rows,
                app_threshold=app_threshold,
                pct_test=pct_test,
                subset_pct=subset_pct,
                seed=seed,
                random_mask=random_mask,
                log_disabled=True)

            # Define model/run NMF
            # Note: Initialize estimates using non-negative double SVD,
            # which is better for sparseness
            self.log.detail(f"Running NMF with k = {num_components} "
                            f"and alpha = {alpha}")
            user_vecs, item_vecs = self.perform_nmf(
                train,
                k=num_components,
                init="nndsvd",
                max_iter=max_nmf_iterations,
                random_state=0,
                alpha=alpha)

            # Calculate mean AUC on overall test data
            mean_auc, popular_auc = self.calc_mean_auc(
                train=train,
                masked_apps=masked,
                predictions=[csr_matrix(user_vecs),
                             csr_matrix(item_vecs)],
                test=test,
                user_lookup=user_lookup,
                job_lookup=job_lookup)

            # Calculate mean AUC of user groups specifically
            group_actual, group_preds, group_aucs = self.get_group_aucs(
                user_lookup, user_vecs, item_vecs, test)

            # Store results
            mean_aucs_for_folds.append(mean_auc)
            popular_aucs_for_folds.append(popular_auc)
            group_aucs_for_folds.append(group_aucs)
            group_preds_for_folds.append(group_preds)
            group_actual_for_folds.append(group_actual)

            # Log results
            self.log.detail(f"Number of Components: {num_components}")
            self.log.detail(f"Mean AUC: {mean_auc}")
            self.log.detail(f"Popular AUC: {popular_auc}")
            self.log.detail(f"Group AUCs:")
            for name, auc in group_aucs.items():
                self.log.detail(f"- {name}: {auc}")

        return mean_aucs_for_folds, popular_aucs_for_folds, group_aucs_for_folds, group_preds_for_folds, group_actual_for_folds
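
    # A hedged usage sketch chaining tuning and training (hypothetical
    # values; best_k and best_alpha come from tune_hyperparameters):
    #
    #   folds = datasets.train_data(num_components=best_k, alpha=best_alpha,
    #                               num_iterations=10)
    #   mean_aucs, pop_aucs, grp_aucs, grp_preds, grp_actual = folds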

    ## EVALUATION ##
    def auc_score(self, predictions, targets):
        '''
        Outputs the area under the curve using sklearn's metrics.

        Parameters:
            predictions (np.ndarray): your prediction output
            targets (np.ndarray): the actual target results you are
                                  comparing to

        Returns:
            (float): the AUC (area under the Receiver Operating
                     Characteristic curve)
        '''
        fpr, tpr, thresholds = metrics.roc_curve(targets, predictions)
        return metrics.auc(fpr, tpr)
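
    # A worked toy example (illustrative values): with two positives ranked
    # mostly above two negatives, three of the four positive/negative pairs
    # are ordered correctly, so AUC = 0.75:
    #
    #   targets = np.array([0, 0, 1, 1])
    #   predictions = np.array([0.1, 0.4, 0.35, 0.8])
    #   datasets.auc_score(predictions, targets)  # -> 0.75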

    def avg_auc_score(self, predicted, actual):
        '''
        Calculates the average row-wise AUC score between a predicted and an
        actual sparse matrix.
        '''
        rows, cols = predicted.shape
        scores = []

        for i in range(rows):
            pred_row = predicted[i, :].toarray().reshape((cols, 1))
            actual_row = actual[i, :].toarray().reshape((cols, 1))
            score = metrics.roc_auc_score(actual_row, pred_row)
            scores.append(score)

        return np.mean(scores)

    def calc_mean_auc(self, train, test, masked_apps, predictions, user_lookup,
                      job_lookup):
        '''
        Calculates the mean AUC by user for any user that had their
        user-item matrix altered.

        Parameters:
            train (scipy.sparse.csr_matrix): the training set, where a certain
                                             percentage of the original
                                             user/item interactions are reset
                                             to zero to hide them from
                                             the model

            test (scipy.sparse.csr_matrix): the test set

            masked_apps (pd.DataFrame): the UserID-JobID pairs of the
                                        applications that were masked

            predictions (list<scipy.sparse.csr_matrix>): the factor matrices
                            output by the implicit MF, stored in a list with
                            user vectors as item zero and item vectors as
                            item one

            user_lookup (dict<int, int>): a lookup of user index by UserID

            job_lookup (dict<int, int>): a lookup of job index by JobID

        Returns:
            (float, float): the mean AUC (area under the Receiver Operating
                Characteristic curve) of the test set, scored only on
                user-item interactions that were originally zero to test
                ranking ability, along with the AUC of a most-popular-items
                benchmark.
        '''
        # Initialize empty list to store the AUC for each masked user
        user_auc = []

        # Initialize empty list to store popular AUC scores
        popularity_auc = []

        # Get sum of item interactions to find most popular
        popular_items = np.array(test.sum(axis=0)).reshape(-1)

        # Retrieve predictions (i.e., sklearn's W and H)
        user_vecs, item_vecs = predictions

        # Iterate through each user that had an application masked
        for user_id in masked_apps["UserID"]:

            # Get the training set row
            user_idx = user_lookup[user_id]
            training_row = train[user_idx, :].toarray().reshape(-1)

            # Find where the interaction had not yet occurred
            zero_inds = np.where(training_row == 0)

            # Get the predicted values based on our user/item vectors
            user_vec = user_vecs[user_idx, :]
            pred = user_vec.dot(item_vecs).toarray()[0, zero_inds].reshape(-1)

            # Select the binarized yes/no interactions from the test set that
            # align with the items this user had not yet interacted with
            # in training
            actual = test[user_idx, :].toarray()[0, zero_inds].reshape(-1)

            # Get the item popularity for our chosen items
            pop = popular_items[zero_inds]

            # Calculate AUC for the given user and job application
            user_auc.append(self.auc_score(pred, actual))

            # Calculate AUC using most popular job application and score
            popularity_auc.append(self.auc_score(pop, actual))

        # Return the mean AUC rounded to three decimal places for both
        # test and popularity benchmark
        return (round(float(np.mean(user_auc)), 3),
                round(float(np.mean(popularity_auc)), 3))
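
    # A hedged usage sketch (hypothetical; W and H come from perform_nmf and
    # the remaining values from create_train_and_test_sets):
    #
    #   mean_auc, popular_auc = datasets.calc_mean_auc(
    #       train=train, test=test, masked_apps=masked,
    #       predictions=[csr_matrix(W), csr_matrix(H)],
    #       user_lookup=user_lookup, job_lookup=job_lookup)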

    def get_key(self, dictionary, val):
        '''
        Retrieves the first key in a given dictionary that is
        associated with a given value.

        Parameters:
            dictionary (dict): the dictionary
            val (dynamic): the value

        Returns:
            (dynamic): the first key corresponding to the value, or None if
                       no keys are found
        '''
        for key, value in dictionary.items():
            if val == value:
                return key

        return None

    def get_group_aucs(self, user_lookup, user_vecs, item_vecs, test):
        '''
        Calculates each user group's average AUC along with the group's
        predicted and actual interaction matrices.

        Parameters:
            user_lookup (dict<int, int>): a lookup of user index by UserID
            user_vecs (np.ndarray): the user factor matrix (W) from NMF
            item_vecs (np.ndarray): the item factor matrix (H) from NMF
            test (scipy.sparse.csr_matrix): the test set

        Returns:
            (dict, dict, dict): the actual interactions, predictions, and
                average AUC for each group, keyed by group name
        '''
        # Create DataFrame to look up trained users' group affiliations
        data = {
            'Index': list(user_lookup.values()),
            'UserID': list(user_lookup.keys())
        }
        data = pd.DataFrame(data)
        group_lookup = data.merge(self.users, on="UserID", how="inner")
        display(group_lookup)

        # Initialize function to calculate a single group's average AUC
        def calc_group_avg_auc(group_name, group_lookup):
            grp_indices = (group_lookup.query(f"Group == '{group_name}'")
                           ["Index"].tolist())
            grp_actual = test[grp_indices, :]
            grp_pred = csr_matrix(user_vecs.dot(item_vecs))[grp_indices, :]
            grp_avg_auc = self.avg_auc_score(grp_pred, grp_actual)

            return grp_pred, grp_avg_auc, grp_actual

        # Store predictions and average AUCs for each group
        group_predictions = {}
        group_actual = {}
        group_aucs = {}
        group_names = self.users["Group"].unique().tolist()

        for name in group_names:
            grp_pred, grp_avg_auc, grp_actual = calc_group_avg_auc(
                name, group_lookup)
            group_predictions[name] = grp_pred
            group_aucs[name] = grp_avg_auc
            group_actual[name] = grp_actual

        return group_actual, group_predictions, group_aucs

    ## FAIRNESS ##
    def value_unfairness(self, advantaged_true, advantaged_pred,
                         disadvantaged_true, disadvantaged_pred):
        '''
        Measures inconsistency in signed estimation error across
        the user types
        
        Value unfairness occurs when one class of user is consistently 
        given higher or lower predictions than their true preferences. If 
        the errors in prediction are evenly balanced between overestimation
        and underestimation or if both classes of users have the same 
        direction and magnitude of error, the value unfairness becomes 
        small. Value unfairness becomes large when predictions for one class
        are consistently overestimated and predictions for the other class 
        are consistently underestimated.
        '''

        rows, cols = advantaged_true.shape
        n = cols
        ddiff = disadvantaged_pred.mean(axis=0) - disadvantaged_true.mean(
            axis=0)
        adiff = advantaged_pred.mean(axis=0) - advantaged_true.mean(axis=0)
        return (1 / n) * np.sum(abs(ddiff - adiff))
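
    # A toy numeric check (illustrative values; rows are users, columns are
    # items). Both groups truly like item 0, but the disadvantaged group's
    # predictions underestimate it far more:
    #
    #   adv_true = np.array([[1.0, 0.0], [1.0, 0.0]])
    #   adv_pred = np.array([[0.8, 0.2], [0.6, 0.0]])
    #   dis_true = np.array([[1.0, 0.0], [1.0, 0.0]])
    #   dis_pred = np.array([[0.2, 0.1], [0.4, 0.3]])
    #   datasets.value_unfairness(adv_true, adv_pred, dis_true, dis_pred)
    #   # -> 0.25 = (|(-0.7) - (-0.3)| + |0.2 - 0.1|) / 2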

    def absolute_unfairness(self, advantaged_true, advantaged_pred,
                            disadvantaged_true, disadvantaged_pred):
        '''
        Measures inconsistency in absolute estimation error
        across user types
        
        Absolute unfairness is unsigned, so it captures a single statistic 
        representing the quality of prediction for each user type. If one user 
        type has small reconstruction error and the other user type has large 
        reconstruction error, one type of user has the unfair advantage of good 
        recommendation, while the other user type has poor recommendation. 
        In contrast to value unfairness, absolute unfairness does not consider 
        the direction of error. 
        
        For example, if female students are given predictions 0.5 points
        below their true preferences and male students are given predictions 
        0.5 points above their true preferences, there is no absolute unfairness.
        '''
        rows, cols = advantaged_true.shape
        n = cols
        ddiff = abs(
            disadvantaged_pred.mean(axis=0) - disadvantaged_true.mean(axis=0))
        adiff = abs(
            advantaged_pred.mean(axis=0) - advantaged_true.mean(axis=0))
        return (1 / n) * np.sum(abs(ddiff - adiff))

    def underestimation_unfairness(self, advantaged_true, advantaged_pred,
                                   disadvantaged_true, disadvantaged_pred):
        '''
        Measures inconsistency in how much the predictions underestimate
        the true ratings.
        
        Underestimation unfairness is important in settings where missing 
        recommendations are more critical than extra recommendations. For 
        example, underestimation could lead to a top student not being
        recommended to explore a topic they would excel in.
        '''
        rows, cols = advantaged_true.shape
        n = cols

        ddiff = disadvantaged_true.mean(axis=0) - disadvantaged_pred.mean(
            axis=0)
        adiff = advantaged_true.mean(axis=0) - advantaged_pred.mean(axis=0)

        ddiff_clip = ddiff.clip(min=0)
        adiff_clip = adiff.clip(min=0)
        return (1 / n) * np.sum(abs(ddiff_clip - adiff_clip))

    def overestimation_unfairness(self, advantaged_true, advantaged_pred,
                                  disadvantaged_true, disadvantaged_pred):
        '''
        Measures inconsistency in how much the predictions overestimate 
        the true ratings.
        
        Overestimation unfairness may be important in settings where users 
        may be overwhelmed by recommendations, so providing too many 
        recommendations would be especially detrimental.
        
        For example, if users must invest large amounts of time to evaluate each 
        recommended item, overestimating essentially costs the user time. 
        Thus, uneven amounts of overestimation could cost one type of user 
        more time than the other.
        '''
        rows, cols = advantaged_true.shape
        n = cols

        ddiff = disadvantaged_pred.mean(axis=0) - disadvantaged_true.mean(
            axis=0)
        adiff = advantaged_pred.mean(axis=0) - advantaged_true.mean(axis=0)

        ddiff_clip = ddiff.clip(min=0)
        adiff_clip = adiff.clip(min=0)
        return (1 / n) * np.sum(abs(ddiff_clip - adiff_clip))

    def nonparity_unfairness(self, advantaged_true, advantaged_pred,
                             disadvantaged_true, disadvantaged_pred):
        '''
        Measure based on the regularization term introduced by Kamishima
        et al. Can be computed as the absolute difference between the 
        overall average ratings of disadvantaged users and 
        those of advantaged users.
        '''
        return abs(np.mean(disadvantaged_pred) - np.mean(advantaged_pred))

    def fairness_df(self, group_true_and_preds, unfairness_func, group_names):
        '''
        Creates a DataFrame of fairness metrics between groups.      
        '''
        # Named "scores" to avoid shadowing the sklearn "metrics" module
        scores = np.zeros((len(group_names), len(group_names)))
        for i, val1 in enumerate(group_true_and_preds):
            for j, val2 in enumerate(group_true_and_preds):
                advantaged_true, advantaged_pred = val1
                disadvantaged_true, disadvantaged_pred = val2
                scores[i, j] = unfairness_func(advantaged_true,
                                               advantaged_pred,
                                               disadvantaged_true,
                                               disadvantaged_pred)

        data = pd.DataFrame(scores, columns=group_names)
        data["group_names"] = group_names
        data = data.set_index("group_names")

        return data
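
    # A hedged usage sketch tying the group outputs to the fairness metrics
    # (hypothetical; group_actual and group_preds come from get_group_aucs):
    #
    #   group_names = datasets.users["Group"].unique().tolist()
    #   group_true_and_preds = [
    #       (group_actual[name].toarray(), group_preds[name].toarray())
    #       for name in group_names]
    #   datasets.fairness_df(group_true_and_preds,
    #                        datasets.value_unfairness, group_names)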