Example #1
0
class SC2ASubmissions(SubmissionTools, ST2):
    """Load, re-score and rank the SCORED submissions of SC2A.

    Submissions are fetched through an ``HPNAdmin`` helper, filtered by week
    and by user, re-scored locally with ``scoring.HPNScoringPrediction`` and
    finally summarised in a :class:`pandas.DataFrame`.

    The processed submissions (a list of dict-like records) are stored in the
    ``submissions`` attribute by :meth:`load_submissions`.
    """

    def __init__(self, client=None, version=2):
        """Set up the submission tools and the HPN admin helper.

        :param client: forwarded to the parent constructor; ``self.client``
            (presumably set by that constructor) is then shared with
            ``HPNAdmin``.
        :param version: forwarded as ``version`` to
            ``scoring.HPNScoringPrediction`` when re-scoring submissions.
        """
        super(SC2ASubmissions, self).__init__(client=client, name='SC2A')
        # Imported here rather than at module level to avoid an import cycle.
        from hpn import HPNAdmin
        self.hpn = HPNAdmin(client=self.client)
        self.version = version

    def load_submissions(self, startweek=0, endweek=9, keep_latest=True):
        """Load all SCORED submissions from SC2A into ``self.submissions``.

        Attaches the week, status, ranking and z-scores to each submission,
        and stores the RMSEs parsed from the submission status report under
        the ``old_rmses`` key.

        :param startweek: first week (inclusive) to keep.
        :param endweek: last week (inclusive) to keep.
        :param keep_latest: if true, reduce the list with
            ``self._keep_latest_only()`` before scoring.
        """
        # Imported locally so that this method does not depend on a
        # module-level ``json`` import.
        import json

        # load all scored submissions
        self.submissions = self.hpn.get_submissions_prediction(status="SCORED")
        print("Got %s SCORED submissions" % len(self.submissions))

        # attach week and keep only the submissions in the requested range
        self.submissions = self.attach_week_to_submissions(self.submissions, "sc2a")
        self.submissions = [sub for sub in self.submissions
                            if startweek <= sub['week'] <= endweek]
        print("Keeping %s submissions in the week range requested" % len(self.submissions))

        if keep_latest:
            self.submissions = self._keep_latest_only()
            print("Keep %s latest scored submissions " % len(self.submissions))

        # attach the submission status (it holds the scoring report read below)
        print("attaching submissions")
        self.submissions = self.attach_status_to_submissions(self.submissions)

        print("remove some users")
        self.remove_users()

        print("attaching scores and compute final ranking")
        self._attach_zscores()
        print("all submissions available in the **submissions** attribute")

        # Keep the RMSEs stored in the status report next to the recomputed
        # ones.  ``json.loads`` returns a fresh object on every call, so no
        # extra copy is required.
        for sub in self.submissions:
            sub['old_rmses'] = json.loads(sub['substatus']['report'])

    def remove_users(self, userIds=("375805", "1991105", "1971259")):
        """Drop from ``self.submissions`` every entry submitted by *userIds*.

        :param userIds: collection of user identifiers, compared against the
            ``userId`` field of each submission.

        Notes on the default identifiers:

        * 375805 alphabeta is a test from TC
        * 1991105 sakve from week 5 has a different id from sakev in week 6.
          The week 5 entry is removed; it has a lower score anyway
        * 1971259 HD systems: see the SC1A function docstring

        """
        self.submissions = [sub for sub in self.submissions
                            if sub['userId'] not in userIds]

    def _get_ranking(self):
        """Re-score every submission and collect the RMSEs in a ranking object.

        Each submission file is downloaded through ``self.client`` (an
        existing local copy is kept), scored with
        ``scoring.HPNScoringPrediction`` and registered under the name
        ``<submitterAlias>_<index>``, so that entries follow the order of
        ``self.submissions``.

        :return: the populated ``scoring.HPNScoringPrediction_ranking``.
        """
        print("Getting ranking")
        ranking = scoring.HPNScoringPrediction_ranking()
        for i, sub in enumerate(self.submissions):
            filename = self.client.getSubmission(
                sub, downloadFile=True, ifcollision="keep.local")['filePath']
            print(i, filename)
            scorer = scoring.HPNScoringPrediction(filename, version=self.version)
            scorer.compute_all_rmse()

            # Deep copy so the ranking owns its data independently of the
            # scorer instance.
            ranking.add_rmse(copy.deepcopy(scorer.rmse),
                             sub['submitterAlias'] + "_" + str(i))
        return ranking

    def get_final_pvalue(self, submission):
        """Combine the z-scores of *submission* into a single p-value.

        Every z-score is converted to a one-sided p-value with the normal
        survival function; the p-values are then combined with Fisher's
        method: ``-2 * sum(log(p))`` is compared with a chi-square
        distribution with twice as many degrees of freedom as there are
        z-scores.

        :param submission: record holding a two-level mapping of z-scores
            under the ``zscores`` key.
        :return: the combined p-value.
        :raises AssertionError: if the number of z-scores is not 28
            (i.e. 56 degrees of freedom).
        """
        from scipy import stats

        # flatten the two-level mapping of z-scores
        nested = submission['zscores']
        zscores = [z for group in nested.values() for z in group.values()]
        dof = len(zscores) * 2
        # NOTE(review): earlier comments mentioned 32 scores (dof=64) whereas
        # this check enforces 28 scores; confirm which figure is intended.
        assert dof == 56, "expected 28 z-scores (dof=56), got dof=%s" % dof

        # z-scores are one-sided (they may be negative), hence a factor of 1
        # (a two-sided test would use 2)
        sided = 1
        total_score = sum(-2 * np.log(stats.norm.sf(z) * sided) for z in zscores)
        # Fisher's method: the combined statistic follows a chi-square law
        # with ``dof`` degrees of freedom under the null hypothesis.
        pvalue = stats.chi2.sf(total_score, dof)
        return pvalue

    def _attach_zscores(self):
        """Attach scores and ranks to every entry of ``self.submissions``.

        The following keys are set on each submission: ``zscore`` (mean
        z-score), ``zscores`` (individual z-scores), ``ranking`` (mean rank),
        ``ranks`` (individual ranks), ``rmses`` (individual RMSEs) and
        ``mean_rmse`` (mean of the non-NaN RMSEs).

        Relies on ``ranking.participants`` being in the same order as
        ``self.submissions``, which is how :meth:`_get_ranking` fills it.
        """
        ranking = self._get_ranking()
        zscores = ranking.get_mean_zscores()
        ranks = ranking.get_mean_ranks()

        for i, participant in enumerate(ranking.participants):
            sub = self.submissions[i]
            rmses = ranking.rmse[i]

            # mean zscore
            sub['zscore'] = zscores[participant]
            # individual zscores
            sub['zscores'] = ranking._get_zscores(rmses)
            # final (mean) rank
            sub['ranking'] = ranks[participant]
            # individual ranks
            sub['ranks'] = ranking.get_rank_participant(participant)
            # individual RMSEs
            sub['rmses'] = rmses.copy()
            # mean RMSE, ignoring the NaN entries
            values = [rmses[k1][k2] for k1 in rmses.keys()
                      for k2 in rmses[k1].keys()]
            values = [x for x in values if not np.isnan(x)]
            sub['mean_rmse'] = np.mean(values)

    def summary_final(self, show=True):
        """Build the final ranking table.

        Submissions are ordered by increasing mean rank; the ``Final Rank``
        index is the rank of the ``Mean Rank`` column as computed by
        :meth:`pandas.Series.rank` (ties therefore share an averaged rank).

        :param show: unless this is exactly ``False``, the table is also
            printed in wiki format with ``dataframe_towiki``.
        :return: a DataFrame indexed by ``Final Rank`` with the columns
            ``Team Name``, ``Team Id``, ``Submission Id``, ``Entity Id``,
            ``Mean Rank`` and ``Mean RMSE``.
        """
        subs = self.submissions

        # positions of the submissions sorted by increasing mean rank
        order = np.argsort([sub['ranking'] for sub in subs])
        ordered = [subs[k] for k in order]

        columns = ['Team Name', 'Team Id', 'Submission Id',
                   'Entity Id', 'Mean Rank', 'Mean RMSE']
        df = pd.DataFrame({
            'Team Name': [sub['submitterAlias'] for sub in ordered],
            'Team Id': [sub['userId'] for sub in ordered],
            'Submission Id': [sub['substatus']['id'] for sub in ordered],
            'Entity Id': [sub['substatus']['entityId'] for sub in ordered],
            'Mean Rank': [sub['ranking'] for sub in ordered],
            'Mean RMSE': [sub['mean_rmse'] for sub in ordered],
        }, columns=columns)

        # finally the ranks
        df['Final Rank'] = df['Mean Rank'].rank().values
        df = df.set_index('Final Rank')
        df = df[columns]

        if show is not False:
            print(dataframe_towiki(df))
        return df

    def save_rmse_to_json(self):
        """Delegate to the parent implementation with ``N1=162`` and ``N2=153``."""
        super(SC2ASubmissions, self).save_rmse_to_json(N1=162, N2=153)