Example #1
0
    def handleUploadWorkSpace(self):
        """
        Handles the session when a workspace (.lexos) file is uploaded.

        Args:
            None

        Returns:
            None
        """
        # save .lexos file
        savePath = os.path.join(constants.UPLOAD_FOLDER, constants.WORKSPACE_DIR)
        savefile = os.path.join(savePath, str(self.nextID) + '.zip')
        try:
            os.makedirs(savePath)
        except OSError:
            pass  # the folder may already exist
        with open(savefile, 'wb') as f:
            f.write(request.data)

        # clean the session folder
        shutil.rmtree(session_manager.session_folder())

        # extract the zip
        with zipfile.ZipFile(savefile) as zf:
            zf.extractall(savePath)
        NewSessionPath = os.path.join(savePath, constants.WORKSPACE_UPLOAD_DIR)
        general_functions.copydir(NewSessionPath, session_manager.session_folder())

        # remove temp
        os.remove(savefile)
        shutil.rmtree(savePath)
Example #2
0
def generateRWmatrix(dataList):
    """
    Generates the raw data matrix for the rolling windows graph.

    Args:
        dataList: a list of data series (one list of values per series); each series becomes one column of the output matrix

    Returns:
        Output file path and extension.
    """

    extension = '.csv'
    deliminator = ','

    folderPath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
    if (not os.path.isdir(folderPath)):
        makedirs(folderPath)
    outFilePath = pathjoin(folderPath, 'RWresults' + extension)

    rows = ["" for _ in xrange(len(dataList[0]))]

    with open(outFilePath, 'w') as outFile:
        for i in xrange(len(dataList)):

            for j in xrange(len(dataList[i])):
                rows[j] = rows[j] + str(dataList[i][j]) + deliminator

        for i in xrange(len(rows)):
            outFile.write(rows[i] + '\n')

    return outFilePath, extension
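
# A minimal, self-contained sketch of the row-building idea above, using toy
# data and no session folder (hypothetical values, not from the Lexos source):
# each inner list of dataList is one series, and each series becomes one CSV
# column, so writing the matrix is effectively a transpose. The original also
# leaves a trailing delimiter on every row; this sketch drops it.
example_dataList = [[1, 2, 3], [10, 20, 30]]      # two series of equal length
example_lines = [','.join(str(v) for v in column) for column in zip(*example_dataList)]
print('\n'.join(example_lines))                   # prints: 1,10 / 2,20 / 3,30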
Example #3
0
def generateRWmatrix(dataList):
    """
    Generates the raw data matrix for the rolling windows graph.

    Args:
        dataList: a list of data series (one list of values per series); each series becomes one column of the output matrix

    Returns:
        Output file path and extension.
    """

    extension = '.csv'
    deliminator = ','

    folderPath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
    if (not os.path.isdir(folderPath)):
        makedirs(folderPath)
    outFilePath = pathjoin(folderPath, 'RWresults' + extension)

    rows = ["" for _ in xrange(len(dataList[0]))]

    with open(outFilePath, 'w') as outFile:
        for i in xrange(len(dataList)):

            for j in xrange(len(dataList[i])):
                rows[j] = rows[j] + str(dataList[i][j]) + deliminator

        for i in xrange(len(rows)):
            outFile.write(rows[i] + '\n')

    return outFilePath, extension
Example #4
0
    def __init__(self, originalFilename, fileName, fileString, fileID):
        """ Constructor
        Creates a new LexosFile object from the information passed in, and performs some preliminary processing.

        Args:
            originalFilename: Name of the file as originally uploaded.
            fileName: File name to use for the file within Lexos.
            fileString: Contents of the file's text.
            fileID: The ID to assign to the new file.

        Returns:
            The newly constructed LexosFile object.
        """
        self.id = fileID  # ID assigned by the FileManager
        self.originalSourceFilename = originalFilename
        self.name = fileName
        self.contentsPreview = self.generatePreview(fileString)
        self.savePath = pathjoin(session_manager.session_folder(), constants.FILECONTENTS_FOLDER,
                                 str(self.id) + '.txt')
        self.saveContents(fileString)

        self.active = True
        self.classLabel = ''

        splitName = self.name.split('.')

        self.label = '.'.join(splitName[:-1])

        self.setTypeFrom(splitName[-1], fileString)

        self.hasTags = self.checkForTags(fileString)

        self.isGutenberg = self.checkForGutenberg(fileString)

        self.options = {}
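
        # A small worked example of the name splitting above (hypothetical
        # file name, not taken from the original source):
        #   self.name = 'moby_dick.chapter1.txt'
        #   splitName        -> ['moby_dick', 'chapter1', 'txt']
        #   self.label       -> 'moby_dick.chapter1'   ('.'.join(splitName[:-1]))
        #   setTypeFrom(...) receives 'txt'            (splitName[-1])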
Example #5
0
    def handleUploadWorkSpace(self):
        """
        Handles the session when a workspace (.lexos) file is uploaded.

        Args:
            None

        Returns:
            None
        """
        # save .lexos file
        savePath = os.path.join(constants.UPLOAD_FOLDER,
                                constants.WORKSPACE_DIR)
        savefile = os.path.join(savePath, str(self.nextID) + '.zip')
        try:
            os.makedirs(savePath)
        except OSError:
            pass  # the folder may already exist
        with open(savefile, 'wb') as f:
            f.write(request.data)

        # clean the session folder
        shutil.rmtree(session_manager.session_folder())

        # extract the zip
        upload_session_path = os.path.join(
            constants.UPLOAD_FOLDER,
            str(self.nextID) + '_upload_work_space_folder')
        with zipfile.ZipFile(savefile) as zf:
            zf.extractall(upload_session_path)
        general_functions.copydir(upload_session_path,
                                  session_manager.session_folder())

        # remove temp
        shutil.rmtree(savePath)
        shutil.rmtree(upload_session_path)

        try:
            # If there is no file contents folder, make one; otherwise this
            # directory is lost from the downloaded zip when the original
            # file contents folder is empty.
            os.makedirs(
                os.path.join(session_manager.session_folder(),
                             constants.FILECONTENTS_FOLDER))
        except OSError:
            # the folder may already exist (WindowsError is a subclass of OSError on Windows)
            pass
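
# A minimal, self-contained sketch of the save-then-extract step above, using a
# temporary directory in place of the Lexos upload and session folders (all
# paths below are hypothetical):
import os
import shutil
import tempfile
import zipfile

work_dir = tempfile.mkdtemp()
zip_path = os.path.join(work_dir, 'workspace.zip')

# build a tiny zip standing in for the uploaded .lexos payload
with zipfile.ZipFile(zip_path, 'w') as zf:
    zf.writestr('file_contents/0.txt', 'hello workspace')

# extract it into its own folder, as handleUploadWorkSpace does
extract_dir = os.path.join(work_dir, 'extracted')
with zipfile.ZipFile(zip_path) as zf:
    zf.extractall(extract_dir)

print(os.listdir(os.path.join(extract_dir, 'file_contents')))  # ['0.txt']
shutil.rmtree(work_dir)  # remove the temporary files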
Example #6
0
def getTopWordCSV(TestResult, TestMethod):
    """
    Write the generated topword results to an output CSV file

    Args: 
        TestResult: Analysis Result generated by either generateKWTopwords() or GenerateZTestTopWord()
        TestMethod: 'pzClass' - proportional z-test for class, 'pzAll' - proportional z-test for all, 'KW' - Kruskal-Wallis test for class

    Returns: 
        Path of the generated CSV file
    """

    # make the path
    ResultFolderPath = os.path.join(session_manager.session_folder(), constants.RESULTS_FOLDER)
    try:
        os.makedirs(ResultFolderPath)  # attempt to make the save path directory
    except OSError:
        pass
    SavePath = os.path.join(ResultFolderPath, constants.TOPWORD_CSV_FILE_NAME)
    delimiter = ','
    CSVcontent = ''

    if TestMethod == 'pzClass':
        CSVcontent = 'Proportional-Z test for Class \n'  # add a header

        for key in TestResult:
            TableLegend = 'File: ' + key[0] + ' compared to Class: ' + key[1] + delimiter
            TableTopWord = 'TopWord, '
            TableZscore = 'Z-score, '
            for data in TestResult[key]:
                TableTopWord += data[0] + delimiter
                TableZscore += str(data[1]) + delimiter
            CSVcontent += TableLegend + TableTopWord + '\n' + delimiter + TableZscore + '\n'

    if TestMethod == 'pzAll':
        CSVcontent = 'Proportional-Z test for all \n'  # add a header

        for File in TestResult:
            TableLegend = 'File: ' + File[0] + delimiter
            TableTopWord = 'TopWord, '
            TableZscore = 'Z-score, '
            for data in File[1]:
                TableTopWord += data[0] + delimiter
                TableZscore += str(data[1]) + delimiter
            CSVcontent += TableLegend + TableTopWord + '\n' + delimiter + TableZscore + '\n'

    if TestMethod == 'KW':
        CSVcontent = 'Kruskal-Wallis test for Class \n'  # add a header
        TableTopWord = 'TopWord, '
        TableZscore = 'Z-score, '
        for data in TestResult:
            TableTopWord += data[0] + delimiter
            TableZscore += str(data[1]) + delimiter
        CSVcontent += TableTopWord + '\n' + TableZscore + '\n'

    with open(SavePath, 'w') as f:
        f.write(CSVcontent)
    return SavePath
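
# A minimal, self-contained sketch of the 'pzClass' table assembly above, fed
# with a toy result dict of {(file, class): [(word, z-score), ...]} (made-up
# data, not real analysis output):
example_result = {('A.txt', 'poems'): [('whale', 3.2), ('sea', -1.1)]}
example_delim = ','
example_lines = []
for example_key, example_data in example_result.items():
    legend = 'File: ' + example_key[0] + ' compared to Class: ' + example_key[1] + example_delim
    topwords = 'TopWord, ' + example_delim.join(w for w, _ in example_data) + example_delim
    zscores = 'Z-score, ' + example_delim.join(str(z) for _, z in example_data) + example_delim
    example_lines.append(legend + topwords)
    example_lines.append(example_delim + zscores)
print('\n'.join(example_lines))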
Example #7
0
def getTopWordCSV(TestResult, TestMethod):
    """
    Write the generated topword results to an output CSV file

    Args: 
        TestResult: Analysis Result generated by either generateKWTopwords() or GenerateZTestTopWord()
        TestMethod: 'pzClass' - proportional z-test for class, 'pzAll' - proportional z-test for all, 'KW' - Kruskal-Wallis test for class

    Returns: 
        Path of the generated CSV file
    """

    # make the path
    ResultFolderPath = os.path.join(session_manager.session_folder(), constants.RESULTS_FOLDER)
    try:
        os.makedirs(ResultFolderPath)  # attempt to make the save path directory
    except OSError:
        pass
    SavePath = os.path.join(ResultFolderPath, constants.TOPWORD_CSV_FILE_NAME)
    delimiter = ','
    CSVcontent = ''

    if TestMethod == 'pzClass':
        CSVcontent = 'Proportional-Z test for Class \n'  # add a header

        for key in TestResult:
            TableLegend = 'File: ' + key[0] + ' compared to Class: ' + key[1] + delimiter
            TableTopWord = 'TopWord, '
            TableZscore = 'Z-score, '
            for data in TestResult[key]:
                TableTopWord += data[0] + delimiter
                TableZscore += str(data[1]) + delimiter
            CSVcontent += TableLegend + TableTopWord + '\n' + delimiter + TableZscore + '\n'

    if TestMethod == 'pzAll':
        CSVcontent = 'Proportional-Z test for all \n'  # add a header

        for File in TestResult:
            TableLegend = 'File: ' + File[0] + delimiter
            TableTopWord = 'TopWord, '
            TableZscore = 'Z-score, '
            for data in File[1]:
                TableTopWord += data[0] + delimiter
                TableZscore += str(data[1]) + delimiter
            CSVcontent += TableLegend + TableTopWord + '\n' + delimiter + TableZscore + '\n'

    if TestMethod == 'KW':
        CSVcontent = 'Kruskal-Wallis test for Class \n'  # add a header
        TableTopWord = 'TopWord, '
        TableZscore = 'Z-score, '
        for data in TestResult:
            TableTopWord += data[0] + delimiter
            TableZscore += str(data[1]) + delimiter
        CSVcontent += TableTopWord + '\n' + TableZscore + '\n'

    with open(SavePath, 'w') as f:
        f.write(CSVcontent.encode('utf-8'))
    return SavePath
Example #8
0
def dendrogramimage():
    """
    Reads the PNG image of the dendrogram and displays it in the web browser.
    *dendrogramimage() is linked to in analysis.html to display dendrogram.png.
    Note: Returns a response object with the dendrogram PNG to Flask and eventually to the browser.
    """
    # dendrogramimage() is called in analysis.html, displaying the dendrogram.png (if session['dengenerated'] != False).
    imagePath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER, constants.DENDROGRAM_FILENAME)
    return send_file(imagePath)
Example #9
0
def dendrogramimage():
    """
    Reads the PNG image of the dendrogram and displays it in the web browser.
    *dendrogramimage() is linked to in analysis.html to display dendrogram.png.
    Note: Returns a response object with the dendrogram PNG to Flask and eventually to the browser.
    """
    # dendrogramimage() is called in analysis.html, displaying the dendrogram.png (if session['dengenerated'] != False).
    imagePath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER, constants.DENDROGRAM_FILENAME)
    return send_file(imagePath)
Example #10
0
    def zipWorkSpace(self):
        """
        Builds a zip file containing a pickled copy of the session and the contents of the session folder.

        Args:
            None

        Returns:
            the path of the zipped workspace
        """
        # initialize the save path
        savepath = os.path.join(constants.UPLOAD_FOLDER, constants.WORKSPACE_DIR)
        id = str(self.nextID % 10000)  # take the last 4 digits
        workspacefilepath = os.path.join(constants.UPLOAD_FOLDER, id + '_' + constants.WORKSPACE_FILENAME)

        # remove unnecessary content in the workspace
        try:
            # attempt to remove the results folder (e.g., cached CSV matrices)
            shutil.rmtree(os.path.join(session_manager.session_folder(), constants.RESULTS_FOLDER))
        except OSError:
            pass

        # move session folder to work space folder
        try:
            os.remove(workspacefilepath)  # remove any previous workspace to avoid a conflict
        except OSError:
            pass
        try:
            shutil.rmtree(savepath)  # empty the save path to avoid a conflict
        except OSError:
            pass
        general_functions.copydir(session_manager.session_folder(), savepath)

        # save session in the work space folder
        session_manager.save(savepath)

        # zip the dir
        with zipfile.ZipFile(workspacefilepath, 'w') as zipf:
            general_functions.zipdir(savepath, zipf)
        # remove the original dir
        shutil.rmtree(savepath)

        return workspacefilepath
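
# general_functions.zipdir is not shown in this listing; a typical
# implementation walks the directory and writes every file into the archive
# with paths relative to the directory root. A minimal, self-contained sketch
# of that idea (the real helper may differ):
import os
import zipfile

def example_zipdir(source_dir, zip_path):
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(source_dir):
            for name in files:
                full_path = os.path.join(root, name)
                # store paths relative to source_dir inside the archive
                zf.write(full_path, os.path.relpath(full_path, source_dir))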
Example #11
0
def kmeansimage():
    """
    Reads the PNG image of the k-means graph and displays it in the web browser.

    *kmeansimage() is linked to in analysis.html to display the k-means graph image.

    Note: Returns a response object with the k-means PNG to Flask and eventually to the browser.
    """
    # kmeansimage() is called in kmeans.html, displaying the KMEANS_GRAPH_FILENAME (if session['kmeansdatagenerated'] != False).
    imagePath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER, constants.KMEANS_GRAPH_FILENAME)
    return send_file(imagePath)
Example #12
0
def kmeansimage():
    """
    Reads the PNG image of the k-means graph and displays it in the web browser.

    *kmeansimage() is linked to in analysis.html to display the k-means graph image.

    Note: Returns a response object with the k-means PNG to Flask and eventually to the browser.
    """
    # kmeansimage() is called in kmeans.html, displaying the KMEANS_GRAPH_FILENAME (if session['kmeansdatagenerated'] != False).
    imagePath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER, constants.KMEANS_GRAPH_FILENAME)
    return send_file(imagePath)
Example #13
0
    def handleUploadWorkSpace(self):
        """
        Handles the session when a workspace (.lexos) file is uploaded.

        Args:
            None

        Returns:
            None
        """
        # save .lexos file
        savePath = os.path.join(constants.UPLOAD_FOLDER, constants.WORKSPACE_DIR)
        savefile = os.path.join(savePath, str(self.nextID) + '.zip')
        try:
            os.makedirs(savePath)
        except OSError:
            pass  # the folder may already exist
        with open(savefile, 'wb') as f:
            f.write(request.data)

        # clean the session folder
        shutil.rmtree(session_manager.session_folder())

        # extract the zip
        upload_session_path = os.path.join(constants.UPLOAD_FOLDER, str(self.nextID) + '_upload_work_space_folder')
        with zipfile.ZipFile(savefile) as zf:
            zf.extractall(upload_session_path)
        general_functions.copydir(upload_session_path, session_manager.session_folder())

        # remove temp
        shutil.rmtree(savePath)
        shutil.rmtree(upload_session_path)

        try:
            # If there is no file contents folder, make one; otherwise this
            # directory is lost from the downloaded zip when the original
            # file contents folder is empty.
            os.makedirs(os.path.join(session_manager.session_folder(), constants.FILECONTENTS_FOLDER))
        except OSError:
            # the folder may already exist (WindowsError is a subclass of OSError on Windows)
            pass
Example #14
0
def saveFileManager(fileManager):
    """
    Saves the file manager to the hard drive.

    Args:
        fileManager: File manager object to be saved.

    Returns:
        None
    """

    fileManagerPath = os.path.join(session_folder(), constants.FILEMANAGER_FILENAME)
    with open(fileManagerPath, 'wb') as managerFile:
        pickle.dump(fileManager, managerFile)
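
# A minimal, self-contained pickle round trip illustrating the save/load pair
# (toy object and a temporary file instead of the real session folder):
import os
import pickle
import tempfile

example_manager = {'files': {}, 'nextID': 0}              # stand-in for a FileManager
example_path = os.path.join(tempfile.mkdtemp(), 'filemanager.p')
with open(example_path, 'wb') as f:
    pickle.dump(example_manager, f)
with open(example_path, 'rb') as f:
    example_restored = pickle.load(f)
print(example_restored == example_manager)                # True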
Example #15
0
def saveFileManager(fileManager):
    """
    Saves the file manager to the hard drive.

    Args:
        fileManager: File manager object to be saved.

    Returns:
        None
    """

    fileManagerPath = os.path.join(session_folder(), constants.FILEMANAGER_FILENAME)
    with open(fileManagerPath, 'wb') as managerFile:
        pickle.dump(fileManager, managerFile)
Example #16
0
def generateCSV(filemanager):
    """
    Generates a CSV file from the active files.

    Args:
        filemanager: The FileManager containing the active files to export.

    Returns:
        The filepath where the CSV was saved, and the chosen extension (.csv or .tsv) for the file.
    """
    transpose = request.form['csvorientation'] == 'filerow'
    useTSV = request.form['csvdelimiter'] == 'tab'
    extension = '.tsv' if useTSV else '.csv'

    countMatrix = generateCSVMatrix(filemanager)

    delimiter = '\t' if useTSV else ','

    # replace newlines and tabs with spaces so they do not break the output sheet format
    countMatrix[0] = [item.replace('\t', ' ') for item in countMatrix[0]]
    countMatrix[0] = [item.replace('\n', ' ') for item in countMatrix[0]]

    # replace commas with full-width (Chinese) commas so they do not break the .csv format
    if delimiter == ',':
        newComma = u'\uFF0C'.encode('utf-8')
        countMatrix[0] = [item.replace(',', newComma) for item in countMatrix[0]]

    folderPath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
    if (not os.path.isdir(folderPath)):
        makedirs(folderPath)
    outFilePath = pathjoin(folderPath, 'results' + extension)

    # Write results to output file, and write class labels depending on transpose
    classLabelList = ["Class Label"]
    for lFile in filemanager.files.values():
        if lFile.active:
            classLabelList.append(lFile.classLabel)

    with open(outFilePath, 'w') as outFile:
        for i, row in enumerate(countMatrix):
            rowStr = delimiter.join([str(x) for x in row])
            if transpose:
                rowStr += delimiter + classLabelList[i]

            outFile.write(rowStr + '\n')

        if not transpose:
            outFile.write(delimiter.join(classLabelList) + '\n')

    return outFilePath, extension
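
# A minimal sketch of the header-cleaning step above (toy header cells): tabs
# and newlines become spaces, and ASCII commas become full-width commas so the
# cells do not break the CSV column layout. The original encodes the
# replacement comma to UTF-8 bytes for Python 2; it is kept as a unicode
# string here for simplicity.
example_header = ['lexos,\ttool', 'word\ncount']
example_header = [cell.replace('\t', ' ').replace('\n', ' ') for cell in example_header]
example_header = [cell.replace(',', u'\uFF0C') for cell in example_header]
print(example_header)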
Example #17
0
def generateCSV(filemanager):
    """
    Generates a CSV file from the active files.

    Args:
        filemanager: The FileManager containing the active files to export.

    Returns:
        The filepath where the CSV was saved, and the chosen extension (.csv or .tsv) for the file.
    """
    transpose = request.form['csvorientation'] == 'filerow'
    useTSV = request.form['csvdelimiter'] == 'tab'
    extension = '.tsv' if useTSV else '.csv'

    countMatrix = generateCSVMatrix(filemanager)

    delimiter = '\t' if useTSV else ','

    # replace newlines and tabs with spaces so they do not break the output sheet format
    countMatrix[0] = [item.replace('\t', ' ') for item in countMatrix[0]]
    countMatrix[0] = [item.replace('\n', ' ') for item in countMatrix[0]]

    # replace commas with full-width (Chinese) commas so they do not break the .csv format
    if delimiter == ',':
        newComma = u'\uFF0C'.encode('utf-8')
        countMatrix[0] = [item.replace(',', newComma) for item in countMatrix[0]]

    folderPath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
    if (not os.path.isdir(folderPath)):
        makedirs(folderPath)
    outFilePath = pathjoin(folderPath, 'results' + extension)

    # Write results to output file, and write class labels depending on transpose
    classLabelList = ["Class Label"]
    for lFile in filemanager.files.values():
        if lFile.active:
            classLabelList.append(lFile.classLabel)

    with open(outFilePath, 'w') as outFile:
        for i, row in enumerate(countMatrix):
            rowStr = delimiter.join([str(x) for x in row])
            if transpose:
                rowStr += delimiter + classLabelList[i]

            outFile.write(rowStr + '\n')

        if not transpose:
            outFile.write(delimiter.join(classLabelList) + '\n')

    return outFilePath, extension
Example #18
0
    def scrubContents(self, savingChanges):
        """
        Scrubs the contents of the file according to the options chosen by the user, optionally saves the changes,
        and returns a preview of the changes either way.

        Args:
            savingChanges: Boolean saying whether or not to save the changes made.

        Returns:
            Returns a preview string of the possibly changed file.
        """

        cache_options = []
        for key in request.form.keys():
            if 'usecache' in key:
                cache_options.append(key[len('usecache'):])

        if 'scrub' not in self.options:
            self.options['scrub'] = {}
        scrubOptions = self.getScrubOptions()

        textString = self.loadContents()

        textString = scrubber.scrub(textString,
                                    gutenberg=self.isGutenberg,
                                    lower=scrubOptions['lowercasebox'],
                                    punct=scrubOptions['punctuationbox'],
                                    apos=scrubOptions['aposbox'],
                                    hyphen=scrubOptions['hyphensbox'],
                                    amper=scrubOptions['ampersandbox'],
                                    digits=scrubOptions['digitsbox'],
                                    tags=scrubOptions['tagbox'],
                                    whiteSpace=scrubOptions['whitespacebox'],
                                    spaces=scrubOptions['spacesbox'],
                                    tabs=scrubOptions['tabsbox'],
                                    newLines=scrubOptions['newlinesbox'],
                                    opt_uploads=request.files,
                                    cache_options=cache_options,
                                    cache_folder=session_manager.session_folder() + '/scrub/',
                                    previewing=not savingChanges)

        if savingChanges:
            self.saveContents(textString)
            self.saveScrubOptions()

        # renew the preview
        self.contentsPreview = self.generatePreview()
        textString = self.contentsPreview

        return textString
Example #19
0
    def __init__(self):
        """ Constructor:
        Creates an empty file manager.

        Args:
            None

        Returns:
            FileManager object with no files.
        """
        self.files = {}
        self.nextID = 0

        makedirs(pathjoin(session_manager.session_folder(), constants.FILECONTENTS_FOLDER))
Example #20
0
    def updateWorkspace(self):
        """
        Updates the whole workspace.

        Args:
            None

        Returns:
            None
        """
        # update the savepath of each file
        for lFile in self.files.values():
            lFile.savePath = pathjoin(session_manager.session_folder(), constants.FILECONTENTS_FOLDER,
                                      str(lFile.id) + '.txt')
        # update the session
        session_manager.load()
Example #21
0
    def __init__(self):
        """ Constructor:
        Creates an empty file manager.

        Args:
            None

        Returns:
            FileManager object with no files.
        """
        self.files = {}
        self.nextID = 0

        makedirs(
            pathjoin(session_manager.session_folder(),
                     constants.FILECONTENTS_FOLDER))
Example #22
0
def generateStatistics(filemanager):
    """
    Calls analyze/information to get the information about each file and the whole corpus

    Args:
        filemanager: The FileManager containing the files to analyze.

    Returns:
        FileInfoList: a list of tuples, each containing a file id and that file's information
                     (see analyze/information.py/File_Information.returnstatistics() function for more)
        corpusInformation: the statistics information about the whole corpus
                          (see analyze/information.py/Corpus_Information.returnstatistics() function for more)
    """
    checkedLabels = request.form.getlist('segmentlist')
    ids = set(filemanager.files.keys())

    checkedLabels = set(map(int, checkedLabels))  # convert the checkedLabels into int

    for id in ids - checkedLabels:  # if the id is not in checked list
        filemanager.files[id].disable()  # make that file inactive in order to getMatrix

    FileInfoList = []
    folderpath = os.path.join(session_manager.session_folder(),
                              constants.RESULTS_FOLDER)  # folder path for storing graphs and plots
    try:
        os.mkdir(folderpath)  # attempt to make folder to store graphs/plots
    except OSError:
        pass

    ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord, showDeleted, onlyCharGramsWithinWords, MFW, culling = filemanager.getMatrixOptions()

    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=False, normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords, ngramSize=ngramSize,
                                        useFreq=False, greyWord=greyWord, showGreyWord=showDeleted, MFW=MFW,
                                        cull=culling)

    WordLists = general_functions.matrixtodict(countMatrix)
    Files = [file for file in filemanager.getActiveFiles()]
    for i in range(len(Files)):
        templabel = countMatrix[i + 1][0]  # skip the header row; column 0 of each row holds the file label
        fileinformation = information.File_Information(WordLists[i], templabel)
        FileInfoList.append((Files[i].id, fileinformation.returnstatistics()))

    corpusInformation = information.Corpus_Information(WordLists, Files)  # make a new object called corpus
    corpusInfoDict = corpusInformation.returnstatistics()

    return FileInfoList, corpusInfoDict
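
# A minimal, self-contained sketch of the id-filtering step above (toy ids in
# place of the real file manager): every file whose id was not checked in the
# form is marked inactive before the matrix is built.
example_ids = {0, 1, 2, 3}
example_checked = set(map(int, ['1', '3']))   # form values arrive as strings
example_inactive = example_ids - example_checked
print(sorted(example_inactive))               # [0, 2] would be disabled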
Example #23
0
def generateStatistics(filemanager):
    """
    Calls analyze/information to get the information about each file and the whole corpus

    Args:
        filemanager: The FileManager containing the files to analyze.

    Returns:
        FileInfoList: a list of tuples, each containing a file id and that file's information
                     (see analyze/information.py/File_Information.returnstatistics() function for more)
        corpusInformation: the statistics information about the whole corpus
                          (see analyze/information.py/Corpus_Information.returnstatistics() function for more)
    """
    checkedLabels = request.form.getlist('segmentlist')
    ids = set(filemanager.files.keys())

    checkedLabels = set(map(int, checkedLabels))  # convert the checkedLabels into int

    for id in ids - checkedLabels:  # if the id is not in checked list
        filemanager.files[id].disable()  # make that file inactive in order to getMatrix

    FileInfoList = []
    folderpath = os.path.join(session_manager.session_folder(),
                              constants.RESULTS_FOLDER)  # folder path for storing graphs and plots
    try:
        os.mkdir(folderpath)  # attempt to make folder to store graphs/plots
    except OSError:
        pass

    ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord, showDeleted, onlyCharGramsWithinWords, MFW, culling = filemanager.getMatrixOptions()

    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=False, normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords, ngramSize=ngramSize,
                                        useFreq=False, greyWord=greyWord, showGreyWord=showDeleted, MFW=MFW,
                                        cull=culling)

    WordLists = general_functions.matrixtodict(countMatrix)
    Files = [file for file in filemanager.getActiveFiles()]
    for i in range(len(Files)):
        templabel = countMatrix[i + 1][0]  # skip the header row; column 0 of each row holds the file label
        fileinformation = information.File_Information(WordLists[i], templabel)
        FileInfoList.append((Files[i].id, fileinformation.returnstatistics()))

    corpusInformation = information.Corpus_Information(WordLists, Files)  # make a new object called corpus
    corpusInfoDict = corpusInformation.returnstatistics()

    return FileInfoList, corpusInfoDict
Example #24
0
def generateRWmatrixPlot(dataPoints, legendLabelsList):
    """
    Generates the raw data matrix for the rolling windows graph.

    Args:
        dataPoints: a list of series, where each series is a list of [x, y] points
        legendLabelsList: a list whose first element is a '#'-separated string of legend labels

    Returns:
        Output file path and extension.
    """

    extension = '.csv'
    deliminator = ','

    folderPath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
    if (not os.path.isdir(folderPath)):
        makedirs(folderPath)
    outFilePath = pathjoin(folderPath, 'RWresults' + extension)

    maxlen = 0
    for i in xrange(len(dataPoints)):
        if len(dataPoints[i]) > maxlen:
            maxlen = len(dataPoints[i])
    maxlen += 1

    rows = [""] * maxlen

    legendLabelsList[0] = legendLabelsList[0].split('#')

    rows[0] = (deliminator + deliminator).join(legendLabelsList[0]) + deliminator + deliminator

    with open(outFilePath, 'w') as outFile:
        for i in xrange(len(dataPoints)):
            for j in xrange(1, len(dataPoints[i]) + 1):
                rows[j] = rows[j] + str(dataPoints[i][j - 1][0]) + deliminator + str(
                    dataPoints[i][j - 1][1]) + deliminator

        for i in xrange(len(rows)):
            outFile.write(rows[i] + '\n')

    return outFilePath, extension
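
# A minimal, self-contained sketch of the plot-matrix layout above (toy data,
# hypothetical labels): each series contributes an x column and a y column, so
# the legend row repeats each label across two columns, and every data row
# interleaves x and y values for all series.
example_dataPoints = [[[0, 1.5], [1, 2.0]], [[0, 0.3], [1, 0.9]]]
example_labels = ['seriesA', 'seriesB']
example_rows = [',,'.join(example_labels) + ',,']
for j in range(len(example_dataPoints[0])):
    example_rows.append(''.join('%s,%s,' % (x, y)
                                for x, y in (series[j] for series in example_dataPoints)))
print('\n'.join(example_rows))
# seriesA,,seriesB,,
# 0,1.5,0,0.3,
# 1,2.0,1,0.9,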
Example #25
0
def generateRWmatrixPlot(dataPoints, legendLabelsList):
    """
    Generates the raw data matrix for the rolling windows graph.

    Args:
        dataPoints: a list of series, where each series is a list of [x, y] points
        legendLabelsList: a list whose first element is a '#'-separated string of legend labels

    Returns:
        Output file path and extension.
    """

    extension = '.csv'
    deliminator = ','

    folderPath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
    if (not os.path.isdir(folderPath)):
        makedirs(folderPath)
    outFilePath = pathjoin(folderPath, 'RWresults' + extension)

    maxlen = 0
    for i in xrange(len(dataPoints)):
        if len(dataPoints[i]) > maxlen:
            maxlen = len(dataPoints[i])
    maxlen += 1

    rows = [""] * maxlen

    legendLabelsList[0] = legendLabelsList[0].split('#')

    rows[0] = (deliminator + deliminator).join(legendLabelsList[0]) + deliminator + deliminator

    with open(outFilePath, 'w') as outFile:
        for i in xrange(len(dataPoints)):
            for j in xrange(1, len(dataPoints[i]) + 1):
                rows[j] = rows[j] + str(dataPoints[i][j - 1][0]) + deliminator + str(
                    dataPoints[i][j - 1][1]) + deliminator

        for i in xrange(len(rows)):
            outFile.write(rows[i] + '\n')

    return outFilePath, extension
Example #26
0
def loadFileManager():
    """
    Loads the file manager for the specific session from the hard drive.

    Args:
        None

    Returns:
        The file manager object for the session.
    """

    fileManagerPath = os.path.join(session_folder(), constants.FILEMANAGER_FILENAME)
    # encryption
    # if constants.FILEMANAGER_KEY != '':
    #     fileManagerPath = general_function.decryptFile(path=fileManagerPath, key=constants.FILEMANAGER_KEY)

    with open(fileManagerPath, 'rb') as managerFile:
        fileManager = pickle.load(managerFile)

    # encryption
    # if constants.FILEMANAGER_KEY != '':
    #     os.remove(fileManagerPath)

    return fileManager
Example #27
0
def loadFileManager():
    """
    Loads the file manager for the specific session from the hard drive.

    Args:
        None

    Returns:
        The file manager object for the session.
    """

    fileManagerPath = os.path.join(session_folder(), constants.FILEMANAGER_FILENAME)
    # encryption
    # if constants.FILEMANAGER_KEY != '':
    #     fileManagerPath = general_function.decryptFile(path=fileManagerPath, key=constants.FILEMANAGER_KEY)

    with open(fileManagerPath, 'rb') as managerFile:
        fileManager = pickle.load(managerFile)

    # encryption
    # if constants.FILEMANAGER_KEY != '':
    #     os.remove(fileManagerPath)

    return fileManager
Example #28
0
def generateSimsCSV(filemanager):
    """
    Generates a CSV file from the calculated similarity rankings.

    Args:
        filemanager: The FileManager containing the documents to compare.

    Returns:
        The filepath where the CSV was saved, and the chosen extension .csv for the file.
    """
    extension = '.csv'

    cosineSims, DocumentName = generateSimilarities(filemanager)

    delimiter = ','

    cosineSims = cosineSims.split("***")
    DocumentName = DocumentName.split("***")

    folderPath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
    if (not os.path.isdir(folderPath)):
        makedirs(folderPath)
    outFilePath = pathjoin(folderPath, 'results' + extension)
    compFileId = request.form['uploadname']

    with open(outFilePath, 'w') as outFile:
        outFile.write("Similarity Rankings:" + '\n')
        outFile.write("\'The module used to produce this ranking employs Latent Semantic Analysis to generate unique\n vectors for each document. The cosine angle between your comparison document's vector and the vector\n of each document of your corpus is calculated and these values are then compared. Cosine similarity\n measures can be between 0 and 1 and the higher the value the closer the comparison document's vector is to that\n document's vector as opposed to the other documents' vectors." + '\n')
        outFile.write("Selected Comparison Document: " + delimiter + str(filemanager.getActiveLabels()[int(compFileId.encode("utf-8"))]) + '\n')
        outFile.write("Rank," + "Document," + "Cosine Similarity" + '\n')
        for i in range(0, len(cosineSims) - 1):
            outFile.write(str(i + 1) + delimiter + DocumentName[i] + delimiter + cosineSims[i] + '\n')

    return outFilePath, extension
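
# A minimal sketch of the ranking rows written above, assuming (as the loop
# bound suggests) that the '***'-separated strings end with a trailing
# separator, which leaves an empty final element after split (toy values):
example_sims = '0.91***0.42***'.split('***')            # ['0.91', '0.42', '']
example_names = 'chapter1***chapter2***'.split('***')
for i in range(len(example_sims) - 1):                   # skip the empty tail
    print(str(i + 1) + ',' + example_names[i] + ',' + example_sims[i])
# 1,chapter1,0.91
# 2,chapter2,0.42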
Example #29
0
def generateSimsCSV(filemanager):
    """
    Generates a CSV file from the calculated similarity rankings.

    Args:
        filemanager: The FileManager containing the documents to compare.

    Returns:
        The filepath where the CSV was saved, and the chosen extension .csv for the file.
    """
    extension = '.csv'

    cosineSims, DocumentName = generateSimilarities(filemanager)

    delimiter = ','

    cosineSims = cosineSims.split("***")
    DocumentName = DocumentName.split("***")

    folderPath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
    if (not os.path.isdir(folderPath)):
        makedirs(folderPath)
    outFilePath = pathjoin(folderPath, 'results' + extension)
    compFileId = request.form['uploadname']

    with open(outFilePath, 'w') as outFile:
        outFile.write("Similarity Rankings:" + '\n')
        outFile.write("\'The module used to produce this ranking employs Latent Semantic Analysis to generate unique\n vectors for each document. The cosine angle between your comparison document's vector and the vector\n of each document of your corpus is calculated and these values are then compared. Cosine similarity\n measures can be between 0 and 1 and the higher the value the closer the comparison document's vector is to that\n document's vector as opposed to the other documents' vectors." + '\n')
        outFile.write("Selected Comparison Document: " + delimiter + str(filemanager.getActiveLabels()[int(compFileId.encode("utf-8"))]) + '\n')
        outFile.write("Rank," + "Document," + "Cosine Similarity" + '\n')
        for i in range(0, len(cosineSims) - 1):
            outFile.write(str(i + 1) + delimiter + DocumentName[i] + delimiter + cosineSims[i] + '\n')

    return outFilePath, extension
Example #30
0
def generateMCJSONObj(filemanager):
    """
    Generates a JSON object for multicloud when working with a mallet .txt file.

    Args:
        filemanager: The FileManager, used when the analysis is run on the user's own files.

    Returns:
        An object, formatted in the JSON that d3 needs, either a list or a dictionary.
    """

    contentPath = os.path.join(session_manager.session_folder(), constants.FILECONTENTS_FOLDER,
                               constants.MALLET_INPUT_FILE_NAME)
    outputPath = os.path.join(session_manager.session_folder(), constants.RESULTS_FOLDER,
                              constants.MALLET_OUTPUT_FILE_NAME)
    try:
        makedirs(pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER))
        # attempt to make the result dir
    except OSError:
        pass  # result dir already exists

    if request.form['analysistype'] == 'userfiles':

        JSONObj = generateJSONForD3(filemanager, mergedSet=False)

    else:  # request.form['analysistype'] == 'topicfile'

        topicString = str(request.files['optuploadname'])
        topicString = re.search(r"'(.*?)'", topicString)
        topicString = topicString.group(1)

        if topicString != '':
            request.files['optuploadname'].save(contentPath)

        with open(contentPath, 'r') as f:
            content = f.read()  # reads content from the upload file
        if content.startswith('#doc source pos typeindex type topic'):
            # --- begin converting a Mallet file into the file d3 can understand ---
            tuples = []
            # Read the output_state file
            with open(contentPath) as f:
                # Skip the first three lines
                for _ in xrange(3):
                    next(f)
                # Create a list of type:topic combinations
                for line in f:
                    line = re.sub(r'\s+', ' ', line)  # collapse whitespace so the columns split consistently
                    try:
                        doc, source, pos, typeindex, type, topic = line.rstrip().split(' ')
                        tuple = type + ':' + topic
                        tuples.append(tuple)
                    except ValueError:  # this line does not have the expected six columns
                        raise Exception(
                            "Your source data cannot be parsed into a regular number of columns. Please ensure that there are no spaces in your file names or file paths. It may be easiest to open the output_state file in a spreadsheet using a space as the delimiter and text as the field type. Data should only be present in columns A to F. Please fix any misaligned data and run this script again.")

            # Count the number of times each type-topic combo appears
            from collections import defaultdict

            topicCount = defaultdict(int)
            for x in tuples:
                topicCount[x] += 1

            # Populate a topicCounts dict with type: topic:count
            words = []
            topicCounts = {}
            for k, v in topicCount.iteritems():
                type, topic = k.split(':')
                count = int(v)
                tc = topic + ":" + str(count)
                if type in words:
                    topicCounts[type] = topicCounts[type] + " " + tc
                else:
                    topicCounts[type] = tc
                words.append(type)

            # Add a word ID
            out = ""
            i = 0
            for k, v in topicCounts.iteritems():
                out += str(i) + " " + k + " " + v + "\n"
                i += 1

            # Write the output file
            with open(outputPath, 'w') as f:
                f.write(out)  # Python will convert \n to os.linesep
                # --- end converting a Mallet file into the file d3 can understand ---
        else:
            with open(outputPath, 'w') as f:
                f.write(content)  # if this is the jsonform, just write that in the output folder

        JSONObj = multicloud_topic.topicJSONmaker(outputPath)

    return JSONObj
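
# A minimal, self-contained sketch of the word:topic counting above, using toy
# Mallet-style pairs instead of a real output_state file (dict membership
# replaces the original's `words` list, but the resulting mapping is the same):
from collections import defaultdict

example_tuples = ['whale:3', 'sea:1', 'whale:3', 'whale:7']
example_counts = defaultdict(int)
for combo in example_tuples:
    example_counts[combo] += 1                 # count each word:topic pair

example_topicCounts = {}
for combo, count in example_counts.items():
    word, topic = combo.split(':')
    entry = topic + ':' + str(count)
    if word in example_topicCounts:
        example_topicCounts[word] += ' ' + entry
    else:
        example_topicCounts[word] = entry
print(example_topicCounts)                     # e.g. {'whale': '3:2 7:1', 'sea': '1:1'}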
Example #31
0
def generateMCJSONObj(filemanager):
    """
    Generates a JSON object for multicloud when working with a mallet .txt file.

    Args:
        filemanager: The FileManager, used when the analysis is run on the user's own files.

    Returns:
        An object, formatted in the JSON that d3 needs, either a list or a dictionary.
    """

    contentPath = os.path.join(session_manager.session_folder(), constants.FILECONTENTS_FOLDER,
                               constants.MALLET_INPUT_FILE_NAME)
    outputPath = os.path.join(session_manager.session_folder(), constants.RESULTS_FOLDER,
                              constants.MALLET_OUTPUT_FILE_NAME)
    try:
        makedirs(pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER))
        # attempt to make the result dir
    except OSError:
        pass  # result dir already exists

    if request.form['analysistype'] == 'userfiles':

        JSONObj = generateJSONForD3(filemanager, mergedSet=False)

    else:  # request.form['analysistype'] == 'topicfile'

        topicString = str(request.files['optuploadname'])
        topicString = re.search(r"'(.*?)'", topicString)
        topicString = topicString.group(1)

        if topicString != '':
            request.files['optuploadname'].save(contentPath)

        with open(contentPath, 'r') as f:
            content = f.read()  # reads content from the upload file
        if content.startswith('#doc source pos typeindex type topic'):
            # --- begin converting a Mallet file into the file d3 can understand ---
            tuples = []
            # Read the output_state file
            with open(contentPath) as f:
                # Skip the first three lines
                for _ in xrange(3):
                    next(f)
                # Create a list of type:topic combinations
                for line in f:
                    line = re.sub(r'\s+', ' ', line)  # collapse whitespace so the columns split consistently
                    try:
                        doc, source, pos, typeindex, type, topic = line.rstrip().split(' ')
                        tuple = type + ':' + topic
                        tuples.append(tuple)
                    except ValueError:  # this line does not have the expected six columns
                        raise Exception(
                            "Your source data cannot be parsed into a regular number of columns. Please ensure that there are no spaces in your file names or file paths. It may be easiest to open the output_state file in a spreadsheet using a space as the delimiter and text as the field type. Data should only be present in columns A to F. Please fix any misaligned data and run this script again.")

            # Count the number of times each type-topic combo appears
            from collections import defaultdict

            topicCount = defaultdict(int)
            for x in tuples:
                topicCount[x] += 1

            # Populate a topicCounts dict with type: topic:count
            words = []
            topicCounts = {}
            for k, v in topicCount.iteritems():
                type, topic = k.split(':')
                count = int(v)
                tc = topic + ":" + str(count)
                if type in words:
                    topicCounts[type] = topicCounts[type] + " " + tc
                else:
                    topicCounts[type] = tc
                words.append(type)

            # Add a word ID
            out = ""
            i = 0
            for k, v in topicCounts.iteritems():
                out += str(i) + " " + k + " " + v + "\n"
                i += 1

            # Write the output file
            with open(outputPath, 'w') as f:
                f.write(out)  # Python will convert \n to os.linesep
                # --- end converting a Mallet file into the file d3 can understand ---
        else:
            with open(outputPath, 'w') as f:
                f.write(content)  # if this is the jsonform, just write that in the output folder

        JSONObj = multicloud_topic.topicJSONmaker(outputPath)

    return JSONObj
Example #32
0
def generateDendrogram(filemanager):
    """
    Generates dendrogram image and PDF from the active files.

    Args:
        filemanager: The FileManager containing the active files to cluster.

    Returns:
        Total number of PDF pages, used to calculate the height of the embedded PDF on screen
    """

    ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord, showGreyWord, onlyCharGramsWithinWords, MFW, culling = filemanager.getMatrixOptions()

    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=useTfidf,
                                        normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords,
                                        ngramSize=ngramSize, useFreq=useFreq, greyWord=greyWord,
                                        showGreyWord=showGreyWord, MFW=MFW, cull=culling)

    # Gets options from request.form and uses options to generate the dendrogram (with the legends) in a PDF file
    orientation = str(request.form['orientation'])
    title = request.form['title']
    pruning = request.form['pruning']
    pruning = int(request.form['pruning']) if pruning else 0
    linkage = str(request.form['linkage'])
    metric = str(request.form['metric'])

    augmentedDendrogram = False
    if 'augmented' in request.form:
        augmentedDendrogram = request.form['augmented'] == 'on'

    showDendroLegends = False
    if 'dendroLegends' in request.form:
        showDendroLegends = request.form['dendroLegends'] == 'on'

    dendroMatrix = []
    fileNumber = len(countMatrix)
    totalWords = len(countMatrix[0])

    for row in range(1, fileNumber):
        wordCount = []
        for col in range(1, totalWords):
            wordCount.append(countMatrix[row][col])
        dendroMatrix.append(wordCount)

    distanceList = dendrogrammer.getDendroDistances(linkage, metric, dendroMatrix)

    legend = getDendrogramLegend(filemanager, distanceList)

    folderPath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
    if (not os.path.isdir(folderPath)):
        makedirs(folderPath)

    # we need labels (segment names)
    tempLabels = []
    for matrixRow in countMatrix:
        tempLabels.append(matrixRow[0])

    pdfPageNumber = dendrogrammer.dendrogram(orientation, title, pruning, linkage, metric, tempLabels, dendroMatrix,
                                             legend, folderPath, augmentedDendrogram, showDendroLegends)
    return pdfPageNumber
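
# A minimal sketch of how the count matrix is trimmed into dendroMatrix above
# (toy counts): drop the header row of word labels and the first column of
# segment names, keeping only the numeric counts fed to the distance function.
example_countMatrix = [['', 'whale', 'sea'],
                       ['chapter1', 4, 1],
                       ['chapter2', 0, 3]]
example_dendroMatrix = [row[1:] for row in example_countMatrix[1:]]
print(example_dendroMatrix)   # [[4, 1], [0, 3]]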
Example #33
0
def hierarchy():
    """
    Handles the functionality on the hierarchy page. It analyzes the various texts and
    displays a dendrogram.
    Note: Returns a response object (often a render_template call) to flask and eventually
          to the browser.
    """
    fileManager = managers.utility.loadFileManager()
    leq = '≤'.decode('utf-8')

    if request.method == "GET":
        # "GET" request occurs when the page is first loaded.
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALIZE_OPTIONS
        if 'hierarchyoption' not in session:
            session['hierarchyoption'] = constants.DEFAULT_HIERARCHICAL_OPTIONS
        labels = fileManager.getActiveLabels()
        thresholdOps = {}
        return render_template('hierarchy.html', labels=labels, thresholdOps=thresholdOps)

    if 'dendro_download' in request.form:
        # The 'Download Dendrogram' button is clicked on hierarchy.html.
        # sends pdf file to downloads folder.
        utility.generateDendrogram(fileManager)
        attachmentname = "den_" + request.form['title'] + ".pdf" if request.form['title'] != '' else 'dendrogram.pdf'
        session_manager.cacheAnalysisOption()
        session_manager.cacheHierarchyOption()
        return send_file(pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER + "dendrogram.pdf"),
                         attachment_filename=attachmentname, as_attachment=True)

    if 'dendroSVG_download' in request.form:
        utility.generateDendrogram(fileManager)
        attachmentname = "den_" + request.form['title'] + ".svg" if request.form['title'] != '' else 'dendrogram.svg'
        session_manager.cacheAnalysisOption()
        session_manager.cacheHierarchyOption()
        return send_file(pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER + "dendrogram.svg"),
                         attachment_filename=attachmentname, as_attachment=True)

    if 'getdendro' in request.form:
        # The 'Get Dendrogram' button is clicked on hierarchy.html.

        pdfPageNumber, score, inconsistentMax, maxclustMax, distanceMax, distanceMin, monocritMax, monocritMin, threshold = utility.generateDendrogram(
            fileManager)
        session['dengenerated'] = True
        labels = fileManager.getActiveLabels()

        inconsistentOp = "0 " + leq + " t " + leq + " " + str(inconsistentMax)
        maxclustOp = "2 " + leq + " t " + leq + " " + str(maxclustMax)
        distanceOp = str(distanceMin) + " " + leq + " t " + leq + " " + str(distanceMax)
        monocritOp = str(monocritMin) + " " + leq + " t " + leq + " " + str(monocritMax)

        thresholdOps = {"inconsistent": inconsistentOp, "maxclust": maxclustOp, "distance": distanceOp,
                        "monocrit": monocritOp}

        managers.utility.saveFileManager(fileManager)
        session_manager.cacheAnalysisOption()
        session_manager.cacheHierarchyOption()
        return render_template('hierarchy.html', labels=labels, pdfPageNumber=pdfPageNumber, score=score,
                               inconsistentMax=inconsistentMax, maxclustMax=maxclustMax, distanceMax=distanceMax,
                               distanceMin=distanceMin, monocritMax=monocritMax, monocritMin=monocritMin,
                               threshold=threshold, thresholdOps=thresholdOps)
Example #34
0
def generateKMeansVoronoi(filemanager):
    """
    Generates a table of cluster_number and file name from the active files.

    Args:
        filemanager: The FileManager containing the active files to cluster.

    Returns:
        kmeansIndex.tolist(): a list of index of the closest center of the file
        silttScore: a float of silhouette score based on KMeans algorithm
        fileNameStr: a string of file names, separated by '#'
        KValue: an int of the number of K from input
    """

    ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord, showGreyWord, onlyCharGramsWithinWords, MFW, culling = filemanager.getMatrixOptions()

    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=False, normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords, ngramSize=ngramSize,
                                        useFreq=False, greyWord=greyWord, showGreyWord=showGreyWord, MFW=MFW,
                                        cull=culling)

    del countMatrix[0]
    for row in countMatrix:
        del row[0]

    matrix = np.array(countMatrix)

    # Get options from request.form and use them to generate the K-means results
    KValue = len(filemanager.getActiveFiles()) / 2  # default K value
    max_iter = 300  # default number of iterations
    initMethod = request.form['init']
    n_init = 300
    tolerance = 1e-4

    if (request.form['nclusters'] != '') and (int(request.form['nclusters']) != KValue):
        KValue = int(request.form['nclusters'])
    if (request.form['max_iter'] != '') and (int(request.form['max_iter']) != max_iter):
        max_iter = int(request.form['max_iter'])
    if request.form['n_init'] != '':
        n_init = int(request.form['n_init'])
    if request.form['tolerance'] != '':
        tolerance = float(request.form['tolerance'])

    metric_dist = request.form['KMeans_metric']

    fileNameList = []
    for lFile in filemanager.files.values():
        if lFile.active:
            if request.form["file_" + str(lFile.id)] == lFile.label:
                fileNameList.append(lFile.label.encode("utf-8"))
            else:
                newLabel = request.form["file_" + str(lFile.id)].encode("utf-8")
                fileNameList.append(newLabel)

    fileNameStr = fileNameList[0]

    for i in range(1, len(fileNameList)):
        fileNameStr += "#" + fileNameList[i]

    folderPath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
    if (not os.path.isdir(folderPath)):
        makedirs(folderPath)

    kmeansIndex, silttScore, colorChart, finalPointsList, finalCentroidsList, textData, maxVal = KMeans.getKMeansVoronoi(
        matrix, KValue, max_iter, initMethod, n_init, tolerance, metric_dist, fileNameList)

    return kmeansIndex, silttScore, fileNameStr, KValue, colorChart, finalPointsList, finalCentroidsList, textData, maxVal
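
# A minimal sketch of the matrix preparation above (toy counts): drop the
# header row and the label column, then convert to a NumPy array for k-means.
import numpy as np

example_countMatrix = [['', 'whale', 'sea'],
                       ['chapter1', 4, 1],
                       ['chapter2', 0, 3]]
del example_countMatrix[0]
for row in example_countMatrix:
    del row[0]
example_matrix = np.array(example_countMatrix)
print(example_matrix.shape)   # (2, 2)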
Example #35
0
def hierarchy():
    """
    Handles the functionality on the hierarchy page. It analyzes the various texts and
    displays a dendrogram.
    Note: Returns a response object (often a render_template call) to flask and eventually
          to the browser.
    """
    fileManager = managers.utility.loadFileManager()
    leq = '≤'.decode('utf-8')

    if request.method == "GET":
        # "GET" request occurs when the page is first loaded.
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALIZE_OPTIONS
        if 'hierarchyoption' not in session:
            session['hierarchyoption'] = constants.DEFAULT_HIERARCHICAL_OPTIONS
        labels = fileManager.getActiveLabels()
        thresholdOps = {}
        return render_template('hierarchy.html', labels=labels, thresholdOps=thresholdOps)

    if 'dendro_download' in request.form:
        # The 'Download Dendrogram' button is clicked on hierarchy.html.
        # sends pdf file to downloads folder.
        utility.generateDendrogram(fileManager)
        attachmentname = "den_" + request.form['title'] + ".pdf" if request.form['title'] != '' else 'dendrogram.pdf'
        session_manager.cacheAnalysisOption()
        session_manager.cacheHierarchyOption()
        return send_file(pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER + "dendrogram.pdf"),
                         attachment_filename=attachmentname, as_attachment=True)

    if 'dendroSVG_download' in request.form:
        utility.generateDendrogram(fileManager)
        attachmentname = "den_" + request.form['title'] + ".svg" if request.form['title'] != '' else 'dendrogram.svg'
        session_manager.cacheAnalysisOption()
        session_manager.cacheHierarchyOption()
        return send_file(pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER + "dendrogram.svg"),
                         attachment_filename=attachmentname, as_attachment=True)

    if 'getdendro' in request.form:
        # The 'Get Dendrogram' button is clicked on hierarchy.html.

        pdfPageNumber, score, inconsistentMax, maxclustMax, distanceMax, distanceMin, monocritMax, monocritMin, threshold = utility.generateDendrogram(
            fileManager)
        session['dengenerated'] = True
        labels = fileManager.getActiveLabels()

        inconsistentOp = "0 " + leq + " t " + leq + " " + str(inconsistentMax)
        maxclustOp = "2 " + leq + " t " + leq + " " + str(maxclustMax)
        distanceOp = str(distanceMin) + " " + leq + " t " + leq + " " + str(distanceMax)
        monocritOp = str(monocritMin) + " " + leq + " t " + leq + " " + str(monocritMax)

        thresholdOps = {"inconsistent": inconsistentOp, "maxclust": maxclustOp, "distance": distanceOp,
                        "monocrit": monocritOp}

        managers.utility.saveFileManager(fileManager)
        session_manager.cacheAnalysisOption()
        session_manager.cacheHierarchyOption()
        return render_template('hierarchy.html', labels=labels, pdfPageNumber=pdfPageNumber, score=score,
                               inconsistentMax=inconsistentMax, maxclustMax=maxclustMax, distanceMax=distanceMax,
                               distanceMin=distanceMin, monocritMax=monocritMax, monocritMin=monocritMin,
                               threshold=threshold, thresholdOps=thresholdOps)
Example #36
0
def generateKMeansVoronoi(filemanager):
    """
    Generates a table of cluster_number and file name from the active files.

    Args:
        filemanager: The FileManager containing the active files to cluster.

    Returns:
        kmeansIndex.tolist(): a list of index of the closest center of the file
        silttScore: a float of silhouette score based on KMeans algorithm
        fileNameStr: a string of file names, separated by '#'
        KValue: an int of the number of K from input
    """

    ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord, showGreyWord, onlyCharGramsWithinWords, MFW, culling = filemanager.getMatrixOptions()

    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=False, normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords, ngramSize=ngramSize,
                                        useFreq=False, greyWord=greyWord, showGreyWord=showGreyWord, MFW=MFW,
                                        cull=culling)

    del countMatrix[0]
    for row in countMatrix:
        del row[0]

    matrix = np.array(countMatrix)

    # Get options from request.form and use them to generate the K-means results
    KValue = len(filemanager.getActiveFiles()) / 2  # default K value
    max_iter = 300  # default number of iterations
    initMethod = request.form['init']
    n_init = 300
    tolerance = 1e-4

    if (request.form['nclusters'] != '') and (int(request.form['nclusters']) != KValue):
        KValue = int(request.form['nclusters'])
    if (request.form['max_iter'] != '') and (int(request.form['max_iter']) != max_iter):
        max_iter = int(request.form['max_iter'])
    if request.form['n_init'] != '':
        n_init = int(request.form['n_init'])
    if request.form['tolerance'] != '':
        tolerance = float(request.form['tolerance'])

    metric_dist = request.form['KMeans_metric']

    fileNameList = []
    for lFile in filemanager.files.values():
        if lFile.active:
            if request.form["file_" + str(lFile.id)] == lFile.label:
                fileNameList.append(lFile.label.encode("utf-8"))
            else:
                newLabel = request.form["file_" + str(lFile.id)].encode("utf-8")
                fileNameList.append(newLabel)

    fileNameStr = fileNameList[0]

    for i in range(1, len(fileNameList)):
        fileNameStr += "#" + fileNameList[i]

    folderPath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
    if (not os.path.isdir(folderPath)):
        makedirs(folderPath)

    kmeansIndex, silttScore, colorChart, finalPointsList, finalCentroidsList, textData, maxVal = KMeans.getKMeansVoronoi(
        matrix, KValue, max_iter, initMethod, n_init, tolerance, metric_dist, fileNameList)

    return kmeansIndex, silttScore, fileNameStr, KValue, colorChart, finalPointsList, finalCentroidsList, textData, maxVal
Example #37
0
def generateDendrogram(filemanager):
    """
    Generates dendrogram image and PDF from the active files.

    Args:
        filemanager: The FileManager containing the active files to cluster.

    Returns:
        Total number of PDF pages, used to calculate the height of the embedded PDF on screen
    """

    ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord, showGreyWord, onlyCharGramsWithinWords, MFW, culling = filemanager.getMatrixOptions()

    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=useTfidf,
                                        normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords,
                                        ngramSize=ngramSize, useFreq=useFreq, greyWord=greyWord,
                                        showGreyWord=showGreyWord, MFW=MFW, cull=culling)

    # Gets options from request.form and uses options to generate the dendrogram (with the legends) in a PDF file
    orientation = str(request.form['orientation'])
    title = request.form['title']
    pruning = request.form['pruning']
    pruning = int(request.form['pruning']) if pruning else 0
    linkage = str(request.form['linkage'])
    metric = str(request.form['metric'])

    augmentedDendrogram = False
    if 'augmented' in request.form:
        augmentedDendrogram = request.form['augmented'] == 'on'

    showDendroLegends = False
    if 'dendroLegends' in request.form:
        showDendroLegends = request.form['dendroLegends'] == 'on'

    dendroMatrix = []
    fileNumber = len(countMatrix)
    totalWords = len(countMatrix[0])

    for row in range(1, fileNumber):
        wordCount = []
        for col in range(1, totalWords):
            wordCount.append(countMatrix[row][col])
        dendroMatrix.append(wordCount)

    distanceList = dendrogrammer.getDendroDistances(linkage, metric, dendroMatrix)

    legend = getDendrogramLegend(filemanager, distanceList)

    folderPath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
    if (not os.path.isdir(folderPath)):
        makedirs(folderPath)

    # we need labels (segment names)
    tempLabels = []
    for matrixRow in countMatrix:
        tempLabels.append(matrixRow[0])

    pdfPageNumber = dendrogrammer.dendrogram(orientation, title, pruning, linkage, metric, tempLabels, dendroMatrix,
                                             legend, folderPath, augmentedDendrogram, showDendroLegends)
    return pdfPageNumber