def generateKWTopwords(filemanager):
    """
    Run the Kruskal-Wallis top-word test across the classes of the files.

    :param filemanager: object providing getMatrixOptions(), getMatrix() and
        getClassDivisionMap()
    :return: the value returned by KWtest() for the per-class matrix rows
    :raise ValueError: if the class division map contains exactly one class
    """
    testbyClass, option, Low, High = getTopWordOption()

    (ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord,
     showDeleted, onlyCharGramsWithinWords, MFW, culling) = filemanager.getMatrixOptions()

    # useTfidf and useFreq are forced to False here, whatever the stored
    # matrix options say.
    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=False,
                                        normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords,
                                        ngramSize=ngramSize, useFreq=False,
                                        greyWord=greyWord, showGreyWord=showDeleted,
                                        MFW=MFW, cull=culling)

    # create a word list to handle wordfilter in KWtest()
    WordLists = general_functions.matrixtodict(countMatrix)

    # create division map
    divisionmap, NameMap, classLabel = filemanager.getClassDivisionMap()

    if len(divisionmap) == 1:
        raise ValueError('only one class given, cannot do Kruaskal-Wallis test, at least 2 class needed')

    # divide the countMatrix via division map
    words = countMatrix[0][1:]  # get the list of word

    # Replace every file index in the division map (in place) with that
    # file's row of the count matrix.
    for classGroup in divisionmap:
        for position, fileIndex in enumerate(classGroup):
            classGroup[position] = countMatrix[fileIndex + 1]  # +1 because the first line is words

    Matrixs = divisionmap
    AnalysisResult = KWtest(Matrixs, words, WordLists=WordLists, option=option, Low=Low, High=High)

    return AnalysisResult
def GenerateZTestTopWord(filemanager):
    """
    Run the Z-test top-word analysis using the current top-word options.

    Args:
        filemanager: object supplying getMatrixOptions(), getMatrix() and,
            for the by-class test, getClassDivisionMap()

    Returns:
        When not testing by class: a list of [label, result] pairs, one per
        entry of the testall() result, where label is the decoded first cell
        of the matching count-matrix row.
        When testing by class: a dictionary mapping
        (decoded file name, compared class label) to the matching
        testgroup() result.

    Raises:
        ValueError: when testing by class and the division map holds exactly
            one class
    """
    by_class, option, Low, High = getTopWordOption()

    (ngram_size, use_word_tokens, _use_freq, _use_tfidf, norm_option, grey_word,
     show_deleted, within_words_only, mfw, culling) = filemanager.getMatrixOptions()

    # The matrix is always requested with useTfidf and useFreq switched off.
    count_matrix = filemanager.getMatrix(useWordTokens=use_word_tokens, useTfidf=False,
                                         normOption=norm_option,
                                         onlyCharGramsWithinWords=within_words_only,
                                         ngramSize=ngram_size, useFreq=False,
                                         greyWord=grey_word, showGreyWord=show_deleted,
                                         MFW=mfw, cull=culling)
    word_lists = matrixtodict(count_matrix)

    if by_class:
        # Test by class: split the word lists according to the division map.
        division_map, name_map, class_label_map = filemanager.getClassDivisionMap()
        if len(division_map) == 1:
            raise ValueError('only one class given, cannot do Z-test By class, at least 2 class needed')

        grouped_word_lists = groupdivision(word_lists, division_map)
        raw_result = testgroup(grouped_word_lists, option=option, Low=Low, High=High)

        # Swap the index-based keys for a readable (file name, class label) pair.
        readable = {}
        for key, value in raw_result.items():
            file_name = name_map[key[0]][key[1]]
            compared_class = class_label_map[key[2]]
            readable[(file_name.decode(), compared_class)] = value
    else:
        # Test every file against all the others.
        raw_result = testall(word_lists, option=option, Low=Low, High=High)

        # Attach the row label to each result; row 0 of the matrix is skipped.
        readable = []
        for index in range(len(raw_result)):
            label = count_matrix[index + 1][0].decode()
            readable.append([label, raw_result[index]])

    return readable
def GenerateZTestTopWord(filemanager):
    """
    Run the Z-test top-word analysis using the current top-word options.

    Args:
        filemanager: object supplying getMatrixOptions(), getMatrix() and,
            for the by-class test, getClassDivisionMap()

    Returns:
        When not testing by class: a list of [label, result] pairs, one per
        entry of the testall() result, where label is the decoded first cell
        of the matching count-matrix row.
        When testing by class: a dictionary mapping
        (decoded file name, compared class label) to the matching
        testgroup() result.

    Raises:
        ValueError: when testing by class and the division map holds exactly
            one class
    """
    by_class, option, Low, High = getTopWordOption()

    (ngram_size, use_word_tokens, _use_freq, _use_tfidf, norm_option, grey_word,
     show_deleted, within_words_only, mfw, culling) = filemanager.getMatrixOptions()

    # The matrix is always requested with useTfidf and useFreq switched off.
    count_matrix = filemanager.getMatrix(useWordTokens=use_word_tokens, useTfidf=False,
                                         normOption=norm_option,
                                         onlyCharGramsWithinWords=within_words_only,
                                         ngramSize=ngram_size, useFreq=False,
                                         greyWord=grey_word, showGreyWord=show_deleted,
                                         MFW=mfw, cull=culling)
    word_lists = matrixtodict(count_matrix)

    if by_class:
        # Test by class: split the word lists according to the division map.
        division_map, name_map, class_label_map = filemanager.getClassDivisionMap()
        if len(division_map) == 1:
            raise ValueError('only one class given, cannot do Z-test By class, at least 2 class needed')

        grouped_word_lists = groupdivision(word_lists, division_map)
        raw_result = testgroup(grouped_word_lists, option=option, Low=Low, High=High)

        # Swap the index-based keys for a readable (file name, class label) pair.
        readable = {}
        for key, value in raw_result.items():
            file_name = name_map[key[0]][key[1]]
            compared_class = class_label_map[key[2]]
            readable[(file_name.decode(), compared_class)] = value
    else:
        # Test every file against all the others.
        raw_result = testall(word_lists, option=option, Low=Low, High=High)

        # Attach the row label to each result; row 0 of the matrix is skipped.
        readable = []
        for index in range(len(raw_result)):
            label = count_matrix[index + 1][0].decode()
            readable.append([label, raw_result[index]])

    return readable
def generateStatistics(filemanager):
    """
    Collect statistics for each checked file and for the whole corpus.

    Every file whose id is not in the submitted 'segmentlist' form field is
    disabled on the filemanager before the count matrix is built, so calling
    this function changes which files are active.

    Args:
        filemanager: object exposing files (a mapping of file id to file
            object), getMatrixOptions(), getMatrix() and getActiveFiles()

    Returns:
        FileInfoList: a list of (file id, statistics) tuples, where statistics
            is the result of information.File_Information.returnstatistics()
        corpusInfoDict: the statistics for the whole corpus, the result of
            information.Corpus_Information.returnstatistics()
    """
    checkedLabels = request.form.getlist('segmentlist')
    ids = set(filemanager.files.keys())

    checkedLabels = set(map(int, checkedLabels))  # convert the checkedLabels into int

    for fileID in ids - checkedLabels:  # if the id is not in checked list
        filemanager.files[fileID].disable()  # make that file inactive in order to getMatrix

    FileInfoList = []
    # folder path for storing graphs and plots
    folderpath = os.path.join(session_manager.session_folder(), constants.RESULTS_FOLDER)
    try:
        os.mkdir(folderpath)  # attempt to make folder to store graphs/plots
    except OSError:
        # Best effort: creation may fail, e.g. because the folder already
        # exists. Only OS-level errors are ignored, so KeyboardInterrupt and
        # SystemExit still propagate.
        pass

    (ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord,
     showDeleted, onlyCharGramsWithinWords, MFW, culling) = filemanager.getMatrixOptions()

    # useTfidf and useFreq are forced to False here, whatever the stored
    # matrix options say.
    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=False,
                                        normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords,
                                        ngramSize=ngramSize, useFreq=False,
                                        greyWord=greyWord, showGreyWord=showDeleted,
                                        MFW=MFW, cull=culling)

    WordLists = general_functions.matrixtodict(countMatrix)
    Files = list(filemanager.getActiveFiles())

    for i, activeFile in enumerate(Files):
        # Row 0 of the matrix is the header row, so file i sits on row i + 1;
        # the first cell of that row is used as the label.
        templabel = countMatrix[i + 1][0]
        fileinformation = information.File_Information(WordLists[i], templabel)
        FileInfoList.append((activeFile.id, fileinformation.returnstatistics()))

    corpusInformation = information.Corpus_Information(WordLists, Files)  # make a new object called corpus
    corpusInfoDict = corpusInformation.returnstatistics()

    return FileInfoList, corpusInfoDict
def generateStatistics(filemanager):
    """
    Collect statistics for each checked file and for the whole corpus.

    Every file whose id is not in the submitted 'segmentlist' form field is
    disabled on the filemanager before the count matrix is built, so calling
    this function changes which files are active.

    Args:
        filemanager: object exposing files (a mapping of file id to file
            object), getMatrixOptions(), getMatrix() and getActiveFiles()

    Returns:
        FileInfoList: a list of (file id, statistics) tuples, where statistics
            is the result of information.File_Information.returnstatistics()
        corpusInfoDict: the statistics for the whole corpus, the result of
            information.Corpus_Information.returnstatistics()
    """
    checkedLabels = request.form.getlist('segmentlist')
    ids = set(filemanager.files.keys())

    checkedLabels = set(map(int, checkedLabels))  # convert the checkedLabels into int

    for fileID in ids - checkedLabels:  # if the id is not in checked list
        filemanager.files[fileID].disable()  # make that file inactive in order to getMatrix

    FileInfoList = []
    # folder path for storing graphs and plots
    folderpath = os.path.join(session_manager.session_folder(), constants.RESULTS_FOLDER)
    try:
        os.mkdir(folderpath)  # attempt to make folder to store graphs/plots
    except OSError:
        # Best effort: creation may fail, e.g. because the folder already
        # exists. Only OS-level errors are ignored, so KeyboardInterrupt and
        # SystemExit still propagate.
        pass

    (ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord,
     showDeleted, onlyCharGramsWithinWords, MFW, culling) = filemanager.getMatrixOptions()

    # useTfidf and useFreq are forced to False here, whatever the stored
    # matrix options say.
    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=False,
                                        normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords,
                                        ngramSize=ngramSize, useFreq=False,
                                        greyWord=greyWord, showGreyWord=showDeleted,
                                        MFW=MFW, cull=culling)

    WordLists = general_functions.matrixtodict(countMatrix)
    Files = list(filemanager.getActiveFiles())

    for i, activeFile in enumerate(Files):
        # Row 0 of the matrix is the header row, so file i sits on row i + 1;
        # the first cell of that row is used as the label.
        templabel = countMatrix[i + 1][0]
        fileinformation = information.File_Information(WordLists[i], templabel)
        FileInfoList.append((activeFile.id, fileinformation.returnstatistics()))

    corpusInformation = information.Corpus_Information(WordLists, Files)  # make a new object called corpus
    corpusInfoDict = corpusInformation.returnstatistics()

    return FileInfoList, corpusInfoDict
def generateStatistics(filemanager):
    """
    Collect statistics for each active file and for the whole corpus, and try
    to save a plot for each of them in the session's results folder.

    Plotting is best effort: a failure to create the folder or to draw a plot
    does not stop the statistics from being returned.

    :param filemanager: object exposing getMatrixOptions(), getMatrix() and
        getActiveFiles()
    :return:
        FileInfoList: a list of (file id, statistics) tuples, where statistics
            is the result of information.File_Information.returnstatistics()
        corpusInfoDict: the statistics for the whole corpus, the result of
            information.Corpus_Information.returnstatistics()
    """
    FileInfoList = []
    # folder path for storing graphs and plots
    folderpath = os.path.join(session_functions.session_folder(), constants.RESULTS_FOLDER)
    try:
        os.mkdir(folderpath)  # attempt to make folder to store graphs/plots
    except OSError:
        # Best effort: creation may fail, e.g. because the folder already
        # exists. Only OS-level errors are ignored, so KeyboardInterrupt and
        # SystemExit still propagate.
        pass

    (ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord,
     showDeleted, onlyCharGramsWithinWords, MFW, culling) = filemanager.getMatrixOptions()

    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=useTfidf,
                                        normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords,
                                        ngramSize=ngramSize, useFreq=useFreq,
                                        greyWord=greyWord, showGreyWord=showDeleted,
                                        MFW=MFW, cull=culling)

    WordLists = general_functions.matrixtodict(countMatrix)
    Files = list(filemanager.getActiveFiles())

    for i, activeFile in enumerate(Files):
        fileinformation = information.File_Information(WordLists[i], activeFile.name)
        FileInfoList.append((activeFile.id, fileinformation.returnstatistics()))
        try:
            fileinformation.plot(os.path.join(folderpath, str(activeFile.id) + constants.FILE_INFORMATION_FIGNAME))
        except Exception:
            # A failed plot must not discard the statistics already gathered.
            pass

    corpusInformation = information.Corpus_Information(WordLists, Files)  # make a new object called corpus
    corpusInfoDict = corpusInformation.returnstatistics()
    try:
        corpusInformation.plot(os.path.join(folderpath, constants.CORPUS_INFORMATION_FIGNAME))
    except Exception:
        # Same best-effort policy as for the per-file plots.
        pass

    return FileInfoList, corpusInfoDict
def generateKWTopwords(filemanager):
    """
    Run the Kruskal-Wallis top-word test across the classes of the files.

    Args:
        filemanager: object providing getMatrixOptions(), getMatrix() and
            getClassDivisionMap()

    Returns:
        The value returned by KWtest() for the per-class matrix rows

    Raises:
        ValueError: if the class division map contains exactly one class
    """
    _by_class, option, Low, High = getTopWordOption()

    (ngram_size, use_word_tokens, _use_freq, _use_tfidf, norm_option, grey_word,
     show_deleted, within_words_only, mfw, culling) = filemanager.getMatrixOptions()

    # The matrix is always requested with useTfidf and useFreq switched off.
    count_matrix = filemanager.getMatrix(useWordTokens=use_word_tokens, useTfidf=False,
                                         normOption=norm_option,
                                         onlyCharGramsWithinWords=within_words_only,
                                         ngramSize=ngram_size, useFreq=False,
                                         greyWord=grey_word, showGreyWord=show_deleted,
                                         MFW=mfw, cull=culling)

    # Each entry of the division map is one class, given as a list of file indices.
    class_groups, _name_map, _class_labels = filemanager.getClassDivisionMap()

    # KWtest() takes the word lists for its word filtering.
    word_lists = general_functions.matrixtodict(count_matrix)

    if len(class_groups) == 1:
        raise ValueError('only one class given, cannot do Kruaskal-Wallis test, at least 2 class needed')

    # Header row without its leading cell: the list of words.
    header_words = count_matrix[0][1:]

    # Overwrite each file index, in place, with that file's matrix row.
    # Rows are offset by one because row 0 holds the words.
    for group in class_groups:
        for slot, file_index in enumerate(group):
            group[slot] = count_matrix[file_index + 1]

    return KWtest(class_groups, header_words, WordLists=word_lists, option=option, Low=Low, High=High)