Esempio n. 1
0
def generateKWTopwords(filemanager):
    """
    Generates the Kruskal-Wallis topword results based on user options

    Args:
        filemanager: the file manager that supplies the matrix options, the count matrix
                     and the class division map

    Returns:
        The value returned by KWtest() for the count matrix rows grouped by class

    Raises:
        ValueError: if the class division map contains exactly one class
    """
    testbyClass, option, Low, High = getTopWordOption()

    (ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord, showDeleted,
     onlyCharGramsWithinWords, MFW, culling) = filemanager.getMatrixOptions()

    # useTfidf and useFreq are forced to False here, whatever the user options say
    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=False, normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords, ngramSize=ngramSize,
                                        useFreq=False, greyWord=greyWord, showGreyWord=showDeleted, MFW=MFW,
                                        cull=culling)

    # create a word list to handle wordfilter in KWtest()
    WordLists = general_functions.matrixtodict(countMatrix)

    # create division map
    divisionmap, NameMap, classLabel = filemanager.getClassDivisionMap()
    if len(divisionmap) == 1:
        raise ValueError('only one class given, cannot do Kruaskal-Wallis test, at least 2 class needed')

    # the first row of the count matrix holds the words; its first cell is skipped
    words = countMatrix[0][1:]

    # divide the countMatrix via division map: every index in the map is replaced by the
    # matching matrix row (+1 because the first line is words). A new nested list is
    # built so the structure returned by the file manager is left untouched.
    Matrixs = [[countMatrix[fileIndex + 1] for fileIndex in classIndices]
               for classIndices in divisionmap]

    return KWtest(Matrixs, words, WordLists=WordLists, option=option, Low=Low, High=High)
Esempio n. 2
0
def GenerateZTestTopWord(filemanager):
    """
    Runs the Z-test topword analysis selected by the user options

    Args:
        filemanager: the file manager that supplies the matrix options, the count matrix
                     and (when testing by class) the class division map

    Returns:
        When not testing by class: a list of [label, result] pairs, where the label is
        the decoded first cell of the matching count matrix row
        When testing by class: a dictionary mapping (decoded file name, compared class
        label) to the result of that comparison

    Raises:
        ValueError: if testing by class and the division map contains exactly one class
    """
    testbyClass, option, Low, High = getTopWordOption()

    (ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord, showDeleted,
     onlyCharGramsWithinWords, MFW, culling) = filemanager.getMatrixOptions()

    # useTfidf and useFreq are forced to False here, whatever the user options say
    countMatrix = filemanager.getMatrix(ngramSize=ngramSize, useWordTokens=useWordTokens,
                                        useFreq=False, useTfidf=False, normOption=normOption,
                                        greyWord=greyWord, showGreyWord=showDeleted,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords,
                                        MFW=MFW, cull=culling)
    WordLists = matrixtodict(countMatrix)

    if not testbyClass:
        # test every file against the others
        analysisResult = testall(WordLists, option=option, Low=Low, High=High)

        # pair each result with its label; row 0 of the matrix is skipped, hence the +1
        labelledResult = []
        for position in range(len(analysisResult)):
            tempLabel = countMatrix[position + 1][0].decode()
            labelledResult.append([tempLabel, analysisResult[position]])
        return labelledResult

    # test by class
    divisionmap, NameMap, classLabelMap = filemanager.getClassDivisionMap()
    if len(divisionmap) == 1:
        raise ValueError('only one class given, cannot do Z-test By class, at least 2 class needed')

    GroupWordLists = groupdivision(WordLists, divisionmap)
    analysisResult = testgroup(GroupWordLists, option=option, Low=Low, High=High)

    # translate the index-based result keys into (file name, class label) keys
    readableResult = {}
    for key in analysisResult:
        fileName = NameMap[key[0]][key[1]]
        CompClassName = classLabelMap[key[2]]
        readableResult[(fileName.decode(), CompClassName)] = analysisResult[key]
    return readableResult
Esempio n. 3
0
def GenerateZTestTopWord(filemanager):
    """
    Produces the Z-test topword results according to the user options

    Args:
        filemanager: the file manager that supplies the matrix options, the count matrix
                     and (when testing by class) the class division map

    Returns:
        A dictionary keyed by (decoded file name, compared class label) when testing by
        class; otherwise a list of [decoded row label, result] pairs

    Raises:
        ValueError: if testing by class and the division map contains exactly one class
    """
    testbyClass, option, Low, High = getTopWordOption()

    matrixOptions = filemanager.getMatrixOptions()
    (ngramSize, useWordTokens, useFreq, useTfidf, normOption,
     greyWord, showDeleted, onlyCharGramsWithinWords, MFW, culling) = matrixOptions

    # the matrix is requested with useTfidf and useFreq switched off
    countMatrix = filemanager.getMatrix(
        useWordTokens=useWordTokens, useTfidf=False, normOption=normOption,
        onlyCharGramsWithinWords=onlyCharGramsWithinWords, ngramSize=ngramSize,
        useFreq=False, greyWord=greyWord, showGreyWord=showDeleted, MFW=MFW,
        cull=culling)
    WordLists = matrixtodict(countMatrix)

    if testbyClass:
        divisionmap, NameMap, classLabelMap = filemanager.getClassDivisionMap()
        if len(divisionmap) == 1:
            raise ValueError('only one class given, cannot do Z-test By class, at least 2 class needed')

        # split the word lists into their classes, then run the grouped test
        GroupWordLists = groupdivision(WordLists, divisionmap)
        analysisResult = testgroup(GroupWordLists, option=option, Low=Low, High=High)

        # each result key is indexed as [0], [1] into NameMap and [2] into classLabelMap
        humanResult = dict(
            ((NameMap[key[0]][key[1]].decode(), classLabelMap[key[2]]), analysisResult[key])
            for key in analysisResult
        )
    else:
        analysisResult = testall(WordLists, option=option, Low=Low, High=High)

        # attach the label from the matrix row (offset by one for the first row)
        humanResult = [[countMatrix[row + 1][0].decode(), analysisResult[row]]
                       for row in range(len(analysisResult))]

    return humanResult
Esempio n. 4
0
def generateStatistics(filemanager):
    """
    Calls analyze/information to get the information about each file and the whole corpus

    Every file whose id is not listed in the submitted 'segmentlist' form field is
    disabled on the file manager before the count matrix is requested.

    Args:
        filemanager: the file manager holding the files and providing the count matrix

    Returns:
        FileInfoList: a list of tuples, each containing a file id and that file's information
                     (see analyze/information.py/File_Information.returnstatistics() function for more)
        corpusInfoDict: the statistics information about the whole corpus
                       (see analyze/information.py/Corpus_Information.returnstatistics() function for more)
    """
    # the form delivers the checked ids as strings; convert them to int
    checkedLabels = set(map(int, request.form.getlist('segmentlist')))
    allIds = set(filemanager.files.keys())

    # disable every file that was not checked (original note: done "in order to getMatrix")
    for fileId in allIds - checkedLabels:
        filemanager.files[fileId].disable()

    # folder path for storing graphs and plots
    folderpath = os.path.join(session_manager.session_folder(), constants.RESULTS_FOLDER)
    try:
        os.mkdir(folderpath)  # attempt to make folder to store graphs/plots
    except OSError:
        # best effort, as before (e.g. the folder already exists); only OSError is
        # ignored so that KeyboardInterrupt/SystemExit are no longer swallowed
        pass

    (ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord, showDeleted,
     onlyCharGramsWithinWords, MFW, culling) = filemanager.getMatrixOptions()

    # useTfidf and useFreq are forced to False here, whatever the user options say
    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=False, normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords, ngramSize=ngramSize,
                                        useFreq=False, greyWord=greyWord, showGreyWord=showDeleted, MFW=MFW,
                                        cull=culling)

    WordLists = general_functions.matrixtodict(countMatrix)
    Files = list(filemanager.getActiveFiles())

    FileInfoList = []
    for index, activeFile in enumerate(Files):
        # the label is the first cell of the file's row; row 0 is skipped, hence the +1
        templabel = countMatrix[index + 1][0]
        fileinformation = information.File_Information(WordLists[index], templabel)
        FileInfoList.append((activeFile.id, fileinformation.returnstatistics()))

    corpusInformation = information.Corpus_Information(WordLists, Files)  # make a new object called corpus
    corpusInfoDict = corpusInformation.returnstatistics()

    return FileInfoList, corpusInfoDict
Esempio n. 5
0
def generateStatistics(filemanager):
    """
    Calls analyze/information to get the information about each file and the whole corpus

    Every file whose id is not listed in the submitted 'segmentlist' form field is
    disabled on the file manager before the count matrix is requested.

    Args:
        filemanager: the file manager holding the files and providing the count matrix

    Returns:
        FileInfoList: a list of tuples, each containing a file id and that file's information
                     (see analyze/information.py/File_Information.returnstatistics() function for more)
        corpusInfoDict: the statistics information about the whole corpus
                       (see analyze/information.py/Corpus_Information.returnstatistics() function for more)
    """
    # the form delivers the checked ids as strings; convert them to int
    checkedLabels = set(map(int, request.form.getlist('segmentlist')))
    allIds = set(filemanager.files.keys())

    # disable every file that was not checked (original note: done "in order to getMatrix")
    for fileId in allIds - checkedLabels:
        filemanager.files[fileId].disable()

    # folder path for storing graphs and plots
    folderpath = os.path.join(session_manager.session_folder(), constants.RESULTS_FOLDER)
    try:
        os.mkdir(folderpath)  # attempt to make folder to store graphs/plots
    except OSError:
        # best effort, as before (e.g. the folder already exists); only OSError is
        # ignored so that KeyboardInterrupt/SystemExit are no longer swallowed
        pass

    (ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord, showDeleted,
     onlyCharGramsWithinWords, MFW, culling) = filemanager.getMatrixOptions()

    # useTfidf and useFreq are forced to False here, whatever the user options say
    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=False, normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords, ngramSize=ngramSize,
                                        useFreq=False, greyWord=greyWord, showGreyWord=showDeleted, MFW=MFW,
                                        cull=culling)

    WordLists = general_functions.matrixtodict(countMatrix)
    Files = list(filemanager.getActiveFiles())

    FileInfoList = []
    for index, activeFile in enumerate(Files):
        # the label is the first cell of the file's row; row 0 is skipped, hence the +1
        templabel = countMatrix[index + 1][0]
        fileinformation = information.File_Information(WordLists[index], templabel)
        FileInfoList.append((activeFile.id, fileinformation.returnstatistics()))

    corpusInformation = information.Corpus_Information(WordLists, Files)  # make a new object called corpus
    corpusInfoDict = corpusInformation.returnstatistics()

    return FileInfoList, corpusInfoDict
Esempio n. 6
0
def generateStatistics(filemanager):
    """
    Calls analyze/information to get the information about each file and the whole corpus

    A plot is also attempted for every active file and for the corpus; a failure to
    create the results folder or to draw a plot is ignored.

    Args:
        filemanager: the file manager providing the matrix options, the count matrix and
                     the active files

    Returns:
        FileInfoList: a list of tuples, each containing a file id and that file's information
                     (see analyze/information.py/File_Information.returnstatistics() function for more)
        corpusInfoDict: the statistics information about the whole corpus
                       (see analyze/information.py/Corpus_Information.returnstatistics() function for more)
    """
    # folder path for storing graphs and plots
    folderpath = os.path.join(session_functions.session_folder(), constants.RESULTS_FOLDER)
    try:
        os.mkdir(folderpath)  # attempt to make folder to store graphs/plots
    except OSError:
        # best effort, as before (e.g. the folder already exists); only OSError is
        # ignored so that KeyboardInterrupt/SystemExit are no longer swallowed
        pass

    (ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord, showDeleted,
     onlyCharGramsWithinWords, MFW, culling) = filemanager.getMatrixOptions()

    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=useTfidf,
                                        normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords,
                                        ngramSize=ngramSize, useFreq=useFreq, greyWord=greyWord,
                                        showGreyWord=showDeleted, MFW=MFW, cull=culling)
    WordLists = general_functions.matrixtodict(countMatrix)
    Files = list(filemanager.getActiveFiles())

    FileInfoList = []
    for index, activeFile in enumerate(Files):
        fileinformation = information.File_Information(WordLists[index], activeFile.name)
        FileInfoList.append((activeFile.id, fileinformation.returnstatistics()))
        try:
            fileinformation.plot(os.path.join(folderpath, str(activeFile.id) + constants.FILE_INFORMATION_FIGNAME))
        except Exception:
            # plotting stays best effort: the statistics are returned even without a figure
            pass

    corpusInformation = information.Corpus_Information(WordLists, Files)  # make a new object called corpus
    corpusInfoDict = corpusInformation.returnstatistics()
    try:
        corpusInformation.plot(os.path.join(folderpath, constants.CORPUS_INFORMATION_FIGNAME))
    except Exception:
        # plotting stays best effort: the statistics are returned even without a figure
        pass
    return FileInfoList, corpusInfoDict
Esempio n. 7
0
def generateKWTopwords(filemanager):
    """
    Runs the Kruskal Wallis topword analysis according to the user options

    Args:
        filemanager: the file manager that supplies the matrix options, the count matrix
                     and the class division map

    Returns:
        The value returned by KWtest() for the count matrix rows grouped by class

    Raises:
        ValueError: if the class division map contains exactly one class
    """
    testbyClass, option, Low, High = getTopWordOption()

    matrixOptions = filemanager.getMatrixOptions()
    (ngramSize, useWordTokens, useFreq, useTfidf, normOption,
     greyWord, showDeleted, onlyCharGramsWithinWords, MFW, culling) = matrixOptions

    # the matrix is requested with useTfidf and useFreq switched off
    countMatrix = filemanager.getMatrix(
        useWordTokens=useWordTokens, useTfidf=False, normOption=normOption,
        onlyCharGramsWithinWords=onlyCharGramsWithinWords, ngramSize=ngramSize,
        useFreq=False, greyWord=greyWord, showGreyWord=showDeleted, MFW=MFW,
        cull=culling)

    # class division map: one list of file indices per class
    classGroups, NameMap, classLabel = filemanager.getClassDivisionMap()

    # word list used by the word filter in KWtest()
    WordLists = general_functions.matrixtodict(countMatrix)

    if len(classGroups) == 1:
        raise ValueError('only one class given, cannot do Kruaskal-Wallis test, at least 2 class needed')

    # the first matrix row holds the words; its first cell is skipped
    headerRow = countMatrix[0]
    words = headerRow[1:]

    # swap every file index in the division map, in place, for the matching matrix row
    # (+1 because the first line is words)
    for group in classGroups:
        for slot, fileIndex in enumerate(group):
            group[slot] = countMatrix[fileIndex + 1]

    return KWtest(classGroups, words, WordLists=WordLists, option=option, Low=Low, High=High)