Beispiel #1
0
    def postprocess_result(self):
        """Rerank the stored predictions of the engineering students dataset.

        Does nothing unless ``self.dataset`` equals ``"engineering-NoSemi"``.
        For that dataset the method visits the ``gender`` and ``highschool``
        result directories and, inside each, the folds ``fold_1`` .. ``fold_5``.
        For every fold it wraps the ``predictions_ORIG.pred`` /
        ``predictions.pred`` file pair in a ``DatasetDescription`` and hands
        it to the ``rerank`` module with ``post_process=True``.

        When ``self.p_classifier`` contains "figr" (case-insensitive) the
        files are looked up directly below the fold directory and
        ``rerank.rerank_featurevectors_figr`` is called with ``self.p`` and
        ``self.k``; otherwise the files are looked up in the fold's ``FA-IR``
        subdirectory and ``rerank.rerank_featurevectors`` is called with
        ``self.p``.
        """
        if self.dataset != "engineering-NoSemi":
            return

        folds = ["fold_1", "fold_2", "fold_3", "fold_4", "fold_5"]

        for attribute in ("gender", "highschool"):
            print("Start reranking of Engineering Students Data - No Semi Private - "
                  + attribute)
            header = ["query_id", "doc_id", "score", "prot_attr"]
            # presumably column positions within `header` (3 -> "prot_attr",
            # 2 -> "score") -- TODO confirm against DatasetDescription
            protected_attribute = 3
            score_attribute = 2
            protected_group = "prot_attr"
            judgment = "score"

            for fold in folds:
                print("post-processing for " + fold + " with " +
                      self.p_classifier)

                use_figr = "figr" in self.p_classifier.lower()
                # FIGR predictions sit directly in the fold directory, all
                # other classifiers below its "FA-IR" subdirectory.
                method_dir = "" if use_figr else "FA-IR/"
                result_dir = ("../results/EngineeringStudents/NoSemiPrivate/"
                              + attribute + "/" + fold + "/" + method_dir
                              + self.p_classifier + "/")
                orig_predictions = result_dir + "predictions_ORIG.pred"
                reranked_predictions = result_dir + "predictions.pred"

                engineering_data = DatasetDescription(
                    reranked_predictions, orig_predictions,
                    protected_attribute, score_attribute, protected_group,
                    header, judgment)

                if use_figr:
                    rerank.rerank_featurevectors_figr(engineering_data,
                                                      self.dataset,
                                                      self.p,
                                                      self.k,
                                                      post_process=True)
                else:
                    rerank.rerank_featurevectors(engineering_data,
                                                 self.dataset,
                                                 self.p,
                                                 post_process=True)
    def postprocess_on_groundtruth_result(self):
        """Rerank the ground-truth files of the dataset named by ``self.dataset``.

        Supported values of ``self.dataset`` and the files that are processed:

        * ``"engineering-NoSemi"`` -- the ``*_test.txt`` file of folds 1-5,
          once for the ``gender`` and once for the ``highschool`` directory.
        * ``"german"`` -- ``GermanCredit_age25.csv``, ``GermanCredit_age35.csv``
          and ``GermanCredit_sex.csv``.
        * ``"compas"`` -- ``ProPublica_race.csv`` and ``ProPublica_sex.csv``.

        Any other value leaves the method without doing anything.

        For every input file a ``DatasetDescription`` is built whose result
        file name is the input name extended by ``self.description_classifier``
        (with a ``.txt`` suffix). The description goes to
        ``rerank.rerank_featurevectors_figr`` when ``self.p_classifier``
        contains "figr" (case-insensitive), else to
        ``rerank.rerank_featurevectors``.
        """
        if self.dataset == "engineering-NoSemi":
            folds = ["fold_1", "fold_2", "fold_3", "fold_4", "fold_5"]
            # (directory / file tag, name of the protected column)
            variants = (("gender", "hombre"),
                        ("highschool", "highschool_type"))

            for attribute, protected_group in variants:
                print("Start reranking of Engineering Students Data - No Semi Private - " + attribute)
                # presumably column positions within `header` -- TODO confirm
                protected_attribute = 1
                score_attribute = 6
                header = ['query_id', protected_group, 'psu_mat', 'psu_len', 'psu_cie', 'nem', 'score']
                judgment = "score"

                for fold_count, fold in enumerate(folds, start=1):
                    print("Reranking for " + fold)
                    file_stem = ("../data/EngineeringStudents/NoSemiPrivate/"
                                 + attribute + "/" + fold
                                 + "/chileDataL2R_" + attribute
                                 + "_nosemi_fold" + str(fold_count) + "_test")
                    orig_file = file_stem + ".txt"
                    result_file = file_stem + "_" + self.description_classifier + ".txt"
                    engineering_data = DatasetDescription(result_file,
                                                          orig_file,
                                                          protected_attribute,
                                                          score_attribute,
                                                          protected_group,
                                                          header,
                                                          judgment)
                    if "figr" in self.p_classifier.lower():
                        # NOTE(review): this branch passes self.alpha to the
                        # FIGR variant while the german/compas branches below
                        # pass self.p -- kept as found; confirm the intent.
                        rerank.rerank_featurevectors_figr(engineering_data, self.dataset, self.alpha, self.k)
                    else:
                        rerank.rerank_featurevectors(engineering_data, self.dataset, self.p)

        elif self.dataset == "german":
            # (text shown in the progress message, protected column / file tag)
            variants = (("Age 25", "age25"),
                        ("Age 35", "age35"),
                        ("gender", "sex"))

            for title, protected_group in variants:
                print("Start reranking of German Credit - " + title)
                # presumably column positions within `header` -- TODO confirm
                protected_attribute = 3
                score_attribute = 2
                header = ['DurationMonth', 'CreditAmount', 'score', protected_group]
                judgment = "score"

                file_stem = "../data/GermanCredit/GermanCredit_" + protected_group
                orig_file = file_stem + ".csv"
                result_file = file_stem + "_" + self.description_classifier + ".txt"
                german_credit_data = DatasetDescription(result_file,
                                                        orig_file,
                                                        protected_attribute,
                                                        score_attribute,
                                                        protected_group,
                                                        header,
                                                        judgment)
                if "figr" in self.p_classifier.lower():
                    rerank.rerank_featurevectors_figr(german_credit_data, self.dataset, self.p, self.k)
                else:
                    rerank.rerank_featurevectors(german_credit_data, self.dataset, self.p)

        elif self.dataset == 'compas':
            # (text shown in the progress message, protected column / file tag)
            variants = (("Race", "race"),
                        ("gender", "sex"))

            for title, protected_group in variants:
                print("Start reranking of COMPAS propublica - " + title)
                # presumably column positions within `header` -- TODO confirm
                protected_attribute = 3
                score_attribute = 2
                header = ['priors_count', 'Violence_rawscore', 'Recidivism_rawscore', protected_group]
                judgment = "Recidivism_rawscore"

                file_stem = "../data/COMPAS/ProPublica_" + protected_group
                orig_file = file_stem + ".csv"
                result_file = file_stem + "_" + self.description_classifier + ".txt"
                compas_data = DatasetDescription(result_file,
                                                 orig_file,
                                                 protected_attribute,
                                                 score_attribute,
                                                 protected_group,
                                                 header,
                                                 judgment)
                if "figr" in self.p_classifier.lower():
                    rerank.rerank_featurevectors_figr(compas_data, self.dataset, self.p, self.k)
                else:
                    rerank.rerank_featurevectors(compas_data, self.dataset, self.p)
    def postprocess_result(self):
        """Rerank the stored FA-IR predictions of the dataset in ``self.dataset``.

        Supported values of ``self.dataset``:

        * ``"trec"`` -- folds ``fold_1`` .. ``fold_6`` below ``../results/TREC``.
        * ``"law"`` -- the ``gender`` results, followed by the ``race_black``
          results unless ``self.p`` is negative.
        * ``"engineering-NoSemi"`` -- folds ``fold_1`` .. ``fold_5`` of the
          ``gender`` and then of the ``highschool`` results.

        Any other value leaves the method without doing anything.

        Each visited directory is
        ``<results>/FA-IR/<self.p_classifier>/``; its ``predictions_ORIG.pred``
        and ``predictions.pred`` files are wrapped in a ``DatasetDescription``
        that is passed to ``rerank.rerank_featurevectors`` together with
        ``self.p`` and ``post_process=True``.
        """
        header = ["query_id", "rank", "pred_score", "prot_attr"]
        protected_attribute = "prot_attr"
        judgment = "pred_score"

        def rerank_predictions(result_dir, protected_group):
            # Describe the prediction file pair in `result_dir` and rerank it.
            orig_predictions = result_dir + "predictions_ORIG.pred"
            reranked_predictions = result_dir + "predictions.pred"
            description = DatasetDescription(reranked_predictions,
                                             orig_predictions,
                                             protected_attribute,
                                             protected_group,
                                             header,
                                             judgment)
            rerank.rerank_featurevectors(description, self.p, post_process=True)

        if self.dataset == "trec":
            print("Start post-processing TREC Data")
            for fold in ["fold_1", "fold_2", "fold_3", "fold_4", "fold_5", "fold_6"]:
                print("Post-processing for " + fold + " with " + self.p_classifier)
                rerank_predictions(
                    "../results/TREC/" + fold + "/FA-IR/" + self.p_classifier + "/",
                    "female")

        elif self.dataset == "law":
            # LSAT data, protected attribute: sex
            print("Start post-processing of LSAT Data" + " with " + self.p_classifier)
            print("protected attribute: sex")
            rerank_predictions(
                "../results/LawStudents/gender/FA-IR/" + self.p_classifier + "/",
                "female")

            # LSAT data, protected attribute: race, protected group: black
            if self.p < 0:
                # the black group has only 6% blacks, so p_minus is not possible
                return
            print("Start post-processing of LSAT Data" + " with " + self.p_classifier)
            print("protected attribute: race - protected group: black")
            rerank_predictions(
                "../results/LawStudents/race_black/FA-IR/" + self.p_classifier + "/",
                "black")

        elif self.dataset == "engineering-NoSemi":
            # (results directory, protected group handed to DatasetDescription)
            variants = (("gender", "female"), ("highschool", "highschool"))
            for attribute, protected_group in variants:
                print("Start reranking of Engineering Students Data - No Semi Private - " + attribute)
                for fold in ["fold_1", "fold_2", "fold_3", "fold_4", "fold_5"]:
                    print("post-processing for " + fold + " with " + self.p_classifier)
                    rerank_predictions(
                        "../results/EngineeringStudents/NoSemiPrivate/" + attribute
                        + "/" + fold + "/FA-IR/" + self.p_classifier + "/",
                        protected_group)
Beispiel #4
0
    def preprocess_dataset(self):
        """Rerank the training files of the engineering students dataset.

        Only acts when ``self.dataset`` equals ``"engineering-NoSemi"``. The
        ``*_train.txt`` file of folds 1-5 is processed first for the
        ``gender`` directory and then for the ``highschool`` directory. Each
        file is described by a ``DatasetDescription`` whose result file name
        carries ``self.description_classifier`` as an extra suffix, and is
        then passed to the ``rerank`` module with ``pre_process=True``.

        ``rerank.rerank_featurevectors_figr`` is chosen when
        ``self.p_classifier`` contains "figr" (case-insensitive), otherwise
        ``rerank.rerank_featurevectors``.
        """
        if self.dataset != "engineering-NoSemi":
            return

        fold_names = ["fold_1", "fold_2", "fold_3", "fold_4", "fold_5"]
        # (directory / file tag, name of the protected column)
        variants = (("gender", "hombre"), ("highschool", "highschool_type"))

        for attribute, protected_group in variants:
            print(
                "Start reranking of Engineering Students Data - No Semi Private - "
                + attribute
            )
            # presumably column positions within `header` -- TODO confirm
            protected_attribute = 1
            score_attribute = 6
            header = [
                'query_id', protected_group, 'psu_mat', 'psu_len', 'psu_cie',
                'nem', 'score'
            ]
            judgment = "score"

            for fold_number, fold in enumerate(fold_names, start=1):
                print("Reranking for " + fold)
                train_stem = (
                    "../data/EngineeringStudents/NoSemiPrivate/" + attribute
                    + "/" + fold + "/chileDataL2R_" + attribute
                    + "_nosemi_fold" + str(fold_number) + "_train"
                )
                source_path = train_stem + ".txt"
                target_path = train_stem + "_" + self.description_classifier + ".txt"
                description = DatasetDescription(target_path, source_path,
                                                 protected_attribute,
                                                 score_attribute,
                                                 protected_group, header,
                                                 judgment)

                if "figr" in self.p_classifier.lower():
                    rerank.rerank_featurevectors_figr(description,
                                                      self.dataset,
                                                      self.p,
                                                      self.k,
                                                      pre_process=True)
                else:
                    rerank.rerank_featurevectors(description,
                                                 self.dataset,
                                                 self.p,
                                                 pre_process=True)
Beispiel #5
0
    def preprocess_dataset(self):
        """Rerank the training files of the dataset named by ``self.dataset``.

        Supported values of ``self.dataset``:

        * ``"trec"`` -- the training CSV of folds ``fold_1`` .. ``fold_6``.
        * ``"law"`` -- the gender training file, followed by the race
          training file unless ``self.p`` is negative.
        * ``"engineering-NoSemi"`` -- the ``*_train.txt`` file of folds 1-5 in
          the ``gender`` and then in the ``highschool`` directory.

        Any other value leaves the method without doing anything.

        Every file is described by a ``DatasetDescription`` whose result file
        name is the input name with ``self.description_classifier`` inserted
        before the extension, and is passed to
        ``rerank.rerank_featurevectors`` together with ``self.p``.
        """

        def rerank_file(stem, extension, protected_attribute, protected_group,
                        header, judgment):
            # Build the result/original path pair from `stem` and rerank it.
            target_path = stem + "_" + self.description_classifier + extension
            source_path = stem + extension
            description = DatasetDescription(target_path, source_path,
                                             protected_attribute,
                                             protected_group, header,
                                             judgment)
            rerank.rerank_featurevectors(description, self.p)

        if self.dataset == "trec":
            print("Start reranking of TREC Data")
            trec_header = [
                "query_id", "gender", "match_body_email_subject_score_norm",
                "match_body_email_subject_df_stdev",
                "match_body_email_subject_idf_stdev", "match_body_score_norm",
                "match_subject_score_norm", "judgment"
            ]
            for fold in ("fold_1", "fold_2", "fold_3", "fold_4", "fold_5",
                         "fold_6"):
                print("Reranking for " + fold)
                rerank_file(
                    "../data/TREC/" + fold
                    + "/features_withListNetFormat_withGender_withZscore_candidateAmount-200_train",
                    ".csv", "gender", "female", trec_header, "judgment")

        if self.dataset == "law":
            # LSAT data, protected attribute: sex
            print("Start reranking of LSAT Data")
            print("protected attribute: sex")
            rerank_file("../data/LawStudents/gender/LawStudents_Gender_train",
                        ".txt", "sex", "female",
                        ["query_id", "sex", "LSAT", "UGPA", "ZFYA"], "ZFYA")

            # LSAT data, protected attribute: race, protected group: black
            if self.p < 0:
                # the black group has only 6% blacks, so p_minus is not possible
                return
            print("Start reranking of LSAT Data")
            print("protected attribute: race - protected group: black")
            rerank_file("../data/LawStudents/race_black/LawStudents_Race_train",
                        ".txt", "race", "black",
                        ["query_id", "race", "LSAT", "UGPA", "ZFYA"], "ZFYA")

        if self.dataset == "engineering-NoSemi":
            # (directory / file tag, protected column, protected group)
            variants = (("gender", "hombre", "female"),
                        ("highschool", "highschool_type", "highschool"))
            for attribute, protected_attribute, protected_group in variants:
                print(
                    "Start reranking of Engineering Students Data - No Semi Private - "
                    + attribute
                )
                engineering_header = [
                    'query_id', protected_attribute, 'psu_mat', 'psu_len',
                    'psu_cie', 'nem', 'score'
                ]
                fold_names = ["fold_1", "fold_2", "fold_3", "fold_4", "fold_5"]
                for fold_number, fold in enumerate(fold_names, start=1):
                    print("Reranking for " + fold)
                    rerank_file(
                        "../data/EngineeringStudents/NoSemiPrivate/" + attribute
                        + "/" + fold + "/chileDataL2R_" + attribute
                        + "_nosemi_fold" + str(fold_number) + "_train",
                        ".txt", protected_attribute, protected_group,
                        engineering_header, "score")