from collections import defaultdict
import glob
import os
import sys

import numpy as np
import pandas as pd
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)

import Utilities


class Evaluator:

    SCORE_ACCURACY='acc'
    SCORE_CONFUSIONMATRIX='confmatrix'
    SCORE_CLASSREPORT='class'
    SCORE_F1='f1'
    SCORE_PRECISION='precision'
    SCORE_RECALL='recall'
    SCORE_F1_PERCLASS='f1_perclass'
    SCORE_PRECISION_PERCLASS='precision_perclass'
    SCORE_RECALL_PERCLASS='recall_perclass'


    def __init__(self, logFile=None, utilObj=None):
        # Reuse a caller-supplied Utility (shared logging/config) when given;
        # otherwise create a fresh one, optionally pointed at logFile.
        if utilObj is not None:
            self.util = utilObj
        else:
            self.util = Utilities.Utility()
            if logFile is not None:
                self.util.setupLogFileLoc(logFile)

    def generateGraphicalConfusionMatrix(self, array):
        """
        Render a confusion matrix as a seaborn heatmap; rows and columns are
        labelled with the four class initials 'E', 'S', 'P', 'W'.
        :param array: square confusion matrix, e.g. from confusion_matrix()
        :return: the matplotlib.pyplot module holding the rendered figure
        """
        import seaborn as sn
        import matplotlib.pyplot as plt
        df_cm = pd.DataFrame(array, index=list("ESPW"), columns=list("ESPW"))
        plt.figure(figsize=(10, 7))
        sn.heatmap(df_cm, annot=True, cmap="YlGnBu")
        return plt
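    # Usage sketch (hypothetical 4x4 counts for classes E/S/P/W):
    #   cm = np.array([[5, 1, 0, 0],
    #                  [0, 6, 1, 0],
    #                  [1, 0, 7, 0],
    #                  [0, 0, 1, 8]])
    #   Evaluator().generateGraphicalConfusionMatrix(cm).savefig('conf.png')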

    def generateSummary(self, folderpath):
        """
        Collect '<approach>_<vsm>_<metric>.metric' files under folderpath and
        write one '<metric>.summary' CSV per metric, with one row per VSM and
        one column per approach.
        """
        from decimal import Decimal
        metricDict = defaultdict(list)
        for metricFile in sorted(glob.iglob(folderpath + '/*.metric')):
            tokens = os.path.basename(metricFile).split('_')
            approach = tokens[0]
            vsm = tokens[1]
            metric = tokens[2].split('.')[0]
            value = str(round(Decimal(self.util.readFileContent(metricFile)), 2))
            metricDict[metric].append((approach, vsm, value))

        for metric, records in metricDict.items():
            metricResultsDF = pd.DataFrame(columns=['VSM']).set_index('VSM')
            for approach, vsm, metricValue in records:
                # DataFrame.set_value() was removed from pandas; .loc supports
                # setting with enlargement and is the replacement.
                metricResultsDF.loc[vsm, approach] = metricValue
            print(metricResultsDF)
            metricResultsDF.to_csv(folderpath + '/' + metric + '.summary',
                                   header=True, index_label='VSM', index=True,
                                   mode='w')
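    # Usage sketch: given files such as 'bow_tfidf_ACC.metric' (hypothetical
    # names), this writes 'ACC.summary' to the same folder:
    #   Evaluator().generateSummary('/u01/bigdata/02d_d2vModel1/features')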

    def printSummary(self):
        # Placeholder: summaries are currently written to disk by generateSummary().
        pass

    def score(self, y=None, ypred=None, type=SCORE_ACCURACY, filename=None, **kwargs):
        """
        Compute the requested metric for y vs ypred and persist it under the
        given filename prefix (e.g. '<filename>_ACC.metric').
        :param y: true labels
        :param ypred: predicted labels
        :param type: one of the SCORE_* constants (parameter name kept for
                     backward compatibility although it shadows the builtin)
        :param filename: prefix for the output file(s)
        :return: the computed metric (scalar, array or report string)
        """
        results = None
        if type == self.SCORE_ACCURACY:
            results = accuracy_score(y, ypred)
            self.util.saveStringToFile(results, filename=filename + '_ACC.metric')

        elif type == self.SCORE_CONFUSIONMATRIX:
            results = confusion_matrix(y, ypred)
            try:
                # Probe for plotting support before attempting the heatmap.
                import seaborn as sn
                import matplotlib.pyplot as plt
                snMatrix = self.generateGraphicalConfusionMatrix(results)
                snMatrix.savefig(filename + '_CONF.png')
            except Exception as error:
                print(error)
                self.util.logError('Evaluator', 'Graphical confusion matrix cannot be generated without X screen and tkinter support on this python')
            results = np.array2string(results)
            self.util.saveStringToFile(results, filename=filename + '_CONF.CONFU')

        elif type == self.SCORE_CLASSREPORT:
            results = classification_report(y, ypred)
        elif type == self.SCORE_F1_PERCLASS:
            results = f1_score(y, ypred, average=None)
            for counter, result in enumerate(results):
                self.util.saveStringToFile(result, filename=filename + '_F1-' + str(counter) + '.metric')
        elif type == self.SCORE_PRECISION_PERCLASS:
            results = precision_score(y, ypred, average=None)
            for counter, result in enumerate(results):
                self.util.saveStringToFile(result, filename=filename + '_PREC-' + str(counter) + '.metric')
        elif type == self.SCORE_RECALL_PERCLASS:
            results = recall_score(y, ypred, average=None)
            for counter, result in enumerate(results):
                self.util.saveStringToFile(result, filename=filename + '_RECALL-' + str(counter) + '.metric')
        elif type == self.SCORE_F1:
            results = f1_score(y, ypred, average='macro')
            self.util.saveStringToFile(results, filename=filename + '_F1.metric')
        elif type == self.SCORE_PRECISION:
            results = precision_score(y, ypred, average='macro')
            self.util.saveStringToFile(results, filename=filename + '_PREC.metric')
        elif type == self.SCORE_RECALL:
            results = recall_score(y, ypred, average='macro')
            self.util.saveStringToFile(results, filename=filename + '_RECALL.metric')

        return results
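    # Usage sketch (hypothetical labels and prefix): computes macro-F1,
    # writes 'run1_F1.metric' and returns the value.
    #   ev = Evaluator(logFile='eval.log')
    #   macroF1 = ev.score(y=[0, 1, 1, 0], ypred=[0, 1, 0, 0],
    #                      type=Evaluator.SCORE_F1, filename='run1')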


    def _getFirstString(self, myDelimitedStr=None, delimiter=','):
        """
        Return the first token from a delimited string.
        :param myDelimitedStr: the delimited string to split
        :param delimiter: the delimiter to split on
        :return: the first token, or '' if the input cannot be split
        """
        try:
            results = myDelimitedStr.split(delimiter)[0]
        except Exception:
            results = ''
        return results
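    # evaluateHeuristic() uses this to pick the top-ranked entry, e.g. a
    # categories value such as 'edu:skills' yields 'edu':
    #   top = self._getFirstString(myDelimitedStr='edu:skills', delimiter=':')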


    def evaluateHeuristic(self, resultsDatasetFilename=None, heuristicsFilename=None, appd2vEduFolder=None, appd2vSkillsFolder=None, appd2vPersonalDetailsFolder=None):
        """
        resultsDataset contains the categorised results from Test phase 2:
        appid, clientid, cvd2vfilename, content, and the inferred categories
        (edu, skills, personaldetails) with their corresponding scores.
        appd2vEduFolder, appd2vSkillsFolder and appd2vPersonalDetailsFolder
        contain the appd2v files, named app_xx_yyyy.d2v.
        For each row in resultsDataset:
        - pull the content of cvd2vfilename;
        - pull the content of appd2v[category]Folder/app_xx_yyyy.d2v;
        - strip the stop words;
        - compare every word in both contents: if the number of identical
          words hits the category's threshold, mark HIT, else MISS.
        The HIT/MISS verdict and the identical words are appended to the
        heuristics CSV as new columns.
        :param resultsDatasetFilename: names of input files separated by ';'
        :param heuristicsFilename: name of the file to save results in
        :param appd2vEduFolder: folder containing the edu appd2v files (app_xx_yyyy.d2v)
        :param appd2vSkillsFolder: folder containing the skills appd2v files (app_xx_yyyy.d2v)
        :param appd2vPersonalDetailsFolder: folder containing the personal-details appd2v files (app_xx_yyyy.d2v)
        :return:
        """
        filecounter=0
        resultsDatasetFilenames=resultsDatasetFilename.split(';')
        fullResults = pd.DataFrame(columns=(
            'appid', 'clientid', 'cvd2vfilename', 'categories', 'scores', 'heuristics', 'heuristics_reason','content'))

        for resultsDatasetFilename in resultsDatasetFilenames:
            # The input CSVs carry no header row, hence header=None below.
            self.util.logDebug('Evaluator-evaluateFromDataset', 'Reading ' + resultsDatasetFilename)
            resultsDF = pd.read_csv(resultsDatasetFilename, header=None)

            #Should load as 'appid', 'clientid', 'cvd2vfilename','content', 'categories', 'scores'
            self.util.logDebug('Evaluator-evaluateFromDataset', 'Processing...')

            counter=0
            errcounter=0

            for index, row in resultsDF.iterrows():
                try:
                    clientid=row[1]
                    clientid=(str(clientid)).zfill(3)
                    appid=row[0]
                    cvd2vFullpath=row[2]
                    categories=row[4]
                    scores=row[5]
                    content=row[3]
                    #The heuristics will only take the category with highest score.
                    category=self._getFirstString(myDelimitedStr=categories, delimiter=':')
                    score=self._getFirstString(myDelimitedStr=scores, delimiter=':')

                    with open(cvd2vFullpath, 'r') as cvd2vFile:
                        cvd2vContent = cvd2vFile.read()
                    cvd2vContentTokens = self.util.tokenize(cvd2vContent)  # used for comparison below
                    heuristics_reason=''
                    appd2vFilename=''
                    #Based on the matched category, pull the relevant app_xx_yyyy.d2v file.
                    if (category==self.util.LOOKUP_CAT_EDU):
                        appd2vFilename=appd2vEduFolder+'/'+'app_'+str(clientid)+'_'+str(appid)+'.'+ self.util.LOOKUP_EXT_APPD2V

                    elif(category==self.util.LOOKUP_CAT_SKILLS):
                        appd2vFilename = appd2vSkillsFolder + '/' + 'app_' + str(clientid) + '_' + str(
                            appid) + '.' + self.util.LOOKUP_EXT_APPD2V

                    elif (category == self.util.LOOKUP_CAT_PERSONALDETAILS):
                        appd2vFilename = appd2vPersonalDetailsFolder + '/' + 'app_' + str(clientid) + '_' + str(
                            appid) + '.' + self.util.LOOKUP_EXT_APPD2V

                    if os.path.exists(appd2vFilename):
                        with open(appd2vFilename, 'r') as appd2vFile:
                            appd2vContent = appd2vFile.read()
                    else:
                        appd2vContent = ''
                        heuristics_reason = 'FILE_NOT_FOUND_IN_CATEGORY: ' + appd2vFilename
                    appd2vContentTokens = self.util.tokenize(appd2vContent)  # used for comparison below

                    identicals=set(appd2vContentTokens) & set(cvd2vContentTokens)
                    heuristics='MISS'
                    if(category==self.util.LOOKUP_CAT_EDU and len(identicals)>=self.util.THRES_EDU):
                        heuristics='HIT'
                        heuristics_reason = self.util.tokensToStr(identicals)
                    elif(category==self.util.LOOKUP_CAT_SKILLS and len(identicals)>=self.util.THRES_SKILLS):
                        heuristics = 'HIT'
                        heuristics_reason = self.util.tokensToStr(identicals)
                    elif (category == self.util.LOOKUP_CAT_PERSONALDETAILS and len(identicals) >= self.util.THRES_PERSONALDETAILS):
                        # For personal details, be more restrictive: count only
                        # words that are not in the English dictionary.
                        identicals = self.util.returnNonEnglishDictWords(identicals)
                        if (len(identicals) >= self.util.THRES_PERSONALDETAILS):
                            heuristics = 'HIT'
                            heuristics_reason = self.util.tokensToStr(identicals)

                    currentRow = pd.DataFrame(data={'appid': [appid], 'clientid': [clientid], 'cvd2vfilename': [resultsDatasetFilename], 'categories': [categories], 'scores': [scores], 'heuristics': [heuristics], 'heuristics_reason': [heuristics_reason], 'content': [cvd2vContent]})
                    # DataFrame.append() was removed from pandas; concat replaces it.
                    fullResults = pd.concat([fullResults, currentRow])
                    counter = counter + 1
                    if counter%100 ==0:
                        self.util.logDebug('Evaluator-evaluateFromDataset',
                                             str(counter) + ' files completed with ' + str(errcounter) + ' errors.')
                        self.util.logDebug('Evaluator-evaluateFromDataset', 'Saving!')
                        fullResults.to_csv(heuristicsFilename.split('.')[0]+'_'+str((filecounter)).zfill(2)+'.csv', ',', mode='a', header=False, index=False,
                                           columns=['appid', 'clientid', 'cvd2vfilename', 'categories', 'scores', 'heuristics', 'heuristics_reason', 'content'])
                        fullResults = fullResults[0:0]
                except Exception as error:
                    errcounter = errcounter + 1
                    self.util.logDebug('Evaluator-evaluateFromDataset', 'Error encountered: ' + repr(error))
            self.util.logDebug('Evaluator-evaluateFromDataset', 'Final save!')
            fullResults.to_csv(heuristicsFilename.split('.')[0]+'_'+str((filecounter)).zfill(2)+'.csv', ',', mode='a', header=False, index=False,
                               columns=['appid', 'clientid', 'cvd2vfilename', 'categories', 'scores', 'heuristics',
                                        'heuristics_reason', 'content'])
            filecounter=filecounter+1
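    # The heuristic in a nutshell (hypothetical tokens): a category is a HIT
    # when enough tokens overlap between the CV and the application document.
    #   cv_tokens  = {'python', 'nus', 'bsc', '2010'}
    #   app_tokens = {'nus', 'bsc', '2010', 'java'}
    #   overlap    = cv_tokens & app_tokens          # {'nus', 'bsc', '2010'}
    #   heuristics = 'HIT' if len(overlap) >= THRES_EDU else 'MISS'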

# # TestA phase 1xa
# # python3 Evaluator.py '/u01/bigdata/03a_01b_test/cvd2v/test2/heuEval.log' '/u01/bigdata/03a_01b_test/cvd2v/test2/TestA_32_userlabelled_results_1000.csv;/u01/bigdata/03a_01b_test/cvd2v/test2/TestA_32_userlabelled_results_2000.csv;/u01/bigdata/03a_01b_test/cvd2v/test2/TestA_32_userlabelled_results_2759.csv' '/u01/bigdata/03a_01b_test/cvd2v/test2/heuEval.csv' '/u01/bigdata/01b_d2v/032/edu/doc2vecEdu' '/u01/bigdata/01b_d2v/032/skills/doc2vecSkills' '/u01/bigdata/01b_d2v/032/personaldetails/doc2vecPersonalDetails'
# if __name__ == "__main__":
#     if(len(sys.argv)==7):
#         logFile = sys.argv[1]
#         resultsDatasetFilename = sys.argv[2]
#         heuristicsFilename=(sys.argv[3])
#         appd2vEduFolder=(sys.argv[4])
#         appd2vSkillsFolder=sys.argv[5]
#         appd2vPersonalDetailsFolder=sys.argv[6]
#         print('Logging to ', logFile)
#         print('resultsDatasetFilename',resultsDatasetFilename)
#         print('heuristicsFilename', heuristicsFilename)
#         print('appd2vEduFolder', appd2vEduFolder)
#         print('appd2vSkillsFolder', appd2vSkillsFolder)
#         print('appd2vPersonalDetailsFolder', appd2vPersonalDetailsFolder)
#
#         eval=Evaluator(logFile)
#         eval.evaluateHeuristic(resultsDatasetFilename=resultsDatasetFilename, heuristicsFilename=heuristicsFilename, appd2vEduFolder=appd2vEduFolder, appd2vSkillsFolder=appd2vSkillsFolder, appd2vPersonalDetailsFolder=appd2vPersonalDetailsFolder)
#     else:
#         print('Arguments incorrect')

# e=Evaluator()
# e.generateSummary('/u01/bigdata/02d_d2vModel1/features')

# Example driver for building a W2V corpus (kept commented out like the
# Evaluator driver above; assumes the project's W2V module is importable):
# if __name__ == "__main__":
#     logFile = sys.argv[1]
#     folderListOfCorpus = sys.argv[2]
#     dstFilename = sys.argv[3]
#     ngram = int(sys.argv[4])
#     maxdocs = int(sys.argv[5])
#     maxDim = int(sys.argv[6])
#     tokenRules = sys.argv[7]
#
#     print('logFile:', logFile)
#     print('folderListOfCorpus:', folderListOfCorpus)
#     print('dstFilename:', dstFilename)
#     print('ngram:', ngram)
#     print('maxdocs:', maxdocs)
#     print('maxDim:', maxDim)
#     print('tokenRules:', tokenRules)
#
#     util = Utilities.Utility()
#     util.setupLogFileLoc(logFile=logFile)
#     util.setupTokenizationRules(tokenRules)
#
#     w2v = W2V(utilObj=util, wordFreqIgnored=2, vectordim=maxDim)
#     w2v.buildCorpus(folderListOfCorpus=folderListOfCorpus, ngram=ngram, maxdocs=maxdocs,
#                     dstFilename=dstFilename, maxDim=maxDim)
#     vector = w2v.inferVector('Non existing sebastian developing morning')
#     vector2 = w2v.inferVector('morning')
#     vector3 = w2v.inferVector('jax')
#     print(vector)
#     print(vector2)
#     print(vector3)
#     from sklearn.metrics.pairwise import cosine_similarity
#     print(cosine_similarity(vector, vector2))