def handleUploadWorkSpace(self):
    """
    Replaces the current session folder with the content of an uploaded workspace (.lexos) file.

    The raw request body is written to a temporary zip file inside the workspace upload
    directory, the current session folder is deleted, the archive is extracted and the
    extracted workspace directory is copied to the session folder. The temporary zip file
    and the extraction directory are removed afterwards.

    Args:
        None

    Returns:
        None
    """
    # save .lexos file
    savePath = os.path.join(constants.UPLOAD_FOLDER, constants.WORKSPACE_DIR)
    savefile = os.path.join(savePath, str(self.nextID) + '.zip')
    try:
        os.makedirs(savePath)
    except OSError:
        # an already existing directory is expected; anything else is a real failure
        if not os.path.isdir(savePath):
            raise

    # the context manager closes the file even when the write fails
    with open(savefile, 'wb') as f:
        f.write(request.data)

    # clean the session folder
    shutil.rmtree(session_functions.session_folder())

    # extract the zip
    # NOTE(review): the archive is the raw request body, i.e. untrusted input. Nothing here
    # limits its size or content before the old session folder is deleted -- confirm that
    # this is acceptable for the deployment.
    with zipfile.ZipFile(savefile) as zf:
        zf.extractall(savePath)
    NewSessionPath = os.path.join(savePath, constants.WORKSPACE_UPLOAD_DIR)
    general_functions.copydir(NewSessionPath, session_functions.session_folder())

    # remove temp
    os.remove(savefile)
    shutil.rmtree(savePath)
def multicloud():
    """
    View function of the multicloud page.

    Makes sure the session holds multicloud options and that the results folder of the
    session exists, then renders 'multicloud.html'.

    Note: Returns a response object (a render_template call) to flask and eventually
          to the browser.
    """
    fileManager = session_functions.loadFileManager()

    if 'multicloudoptions' not in session:
        session['multicloudoptions'] = constants.DEFAULT_MC_OPTIONS

    resultsDir = pathjoin(session_functions.session_folder(), constants.RESULTS_FOLDER)
    if not os.path.isdir(resultsDir):
        makedirs(resultsDir)
    topicFilePath = pathjoin(resultsDir, "topicFile")

    if request.method == 'GET':
        # first load of the page: nothing to draw yet
        activeLabels = fileManager.getActiveLabels()
        return render_template('multicloud.html', jsonStr="", labels=activeLabels)

    if request.method == "POST":
        # the html form was submitted (i.e. 'Get Graphs', 'Download...')
        activeLabels = fileManager.getActiveLabels()
        cloudData = fileManager.generateMCJSONObj(topicFilePath)
        return render_template('multicloud.html',
                               JSONObj=cloudData,
                               labels=activeLabels,
                               loading='loading')
def hierarchy():
    """
    Handles the functionality on the hierarchy page. It analyzes the various texts and
    displays a dendrogram.

    Depending on the request it renders the empty page (GET), sends the dendrogram pdf as
    a download ('dendro_download'), refreshes the threshold bounds ('refreshThreshold') or
    generates and shows the dendrogram ('getdendro').

    Note: Returns a response object (often a render_template call) to flask and eventually
          to the browser. A POST carrying none of the three form keys returns None.
    """
    fileManager = session_functions.loadFileManager()

    if request.method == "GET":
        # "GET" request occurs when the page is first loaded.
        labels = fileManager.getActiveLabels()
        return render_template('hierarchy.html', labels=labels)

    if 'dendro_download' in request.form:
        # The 'Download Dendrogram' button is clicked on hierarchy.html.
        # sends pdf file to downloads folder.
        title = request.form['title']
        attachmentname = "den_" + title + ".pdf" if title != '' else 'dendrogram.pdf'
        # Join the file name as its own path component instead of concatenating it to
        # RESULTS_FOLDER: the result no longer depends on that constant ending with a
        # separator (assumes pathjoin is os.path.join, as its use elsewhere suggests).
        pdfPath = pathjoin(session_functions.session_folder(), constants.RESULTS_FOLDER, "dendrogram.pdf")
        return send_file(pdfPath, attachment_filename=attachmentname, as_attachment=True)

    if 'refreshThreshold' in request.form:
        # only the threshold bounds are handed to the template in this case
        (pdfPageNumber, score, inconsistentMax, maxclustMax, distanceMax, distanceMin,
         monocritMax, monocritMin, threshold) = fileManager.generateDendrogram()
        labels = fileManager.getActiveLabels()
        return render_template('hierarchy.html', labels=labels,
                               inconsistentMax=inconsistentMax, maxclustMax=maxclustMax,
                               distanceMax=distanceMax, distanceMin=distanceMin,
                               monocritMax=monocritMax, monocritMin=monocritMin,
                               threshold=threshold)

    if 'getdendro' in request.form:
        # The 'Get Dendrogram' button is clicked on hierarchy.html.
        (pdfPageNumber, score, inconsistentMax, maxclustMax, distanceMax, distanceMin,
         monocritMax, monocritMin, threshold) = fileManager.generateDendrogram()
        session['dengenerated'] = True
        labels = fileManager.getActiveLabels()
        return render_template('hierarchy.html', labels=labels, pdfPageNumber=pdfPageNumber,
                               score=score, inconsistentMax=inconsistentMax,
                               maxclustMax=maxclustMax, distanceMax=distanceMax,
                               distanceMin=distanceMin, monocritMax=monocritMax,
                               monocritMin=monocritMin, threshold=threshold)
def generateRWmatrix(dataList):
    """
    Generates the rolling windows graph raw data matrix and saves it as a CSV file.

    Every element of dataList becomes one column of the file: value j of each element is
    appended to row j. Every value is followed by the delimiter, so rows end with a
    trailing comma.

    Args:
        dataList: a list of sequences of values. The first sequence fixes the number of
                  rows; an empty dataList produces an empty file.

    Returns:
        Output file path and extension.

    Raises:
        IndexError: if a later sequence is longer than the first one.
    """
    extension = '.csv'
    deliminator = ','

    folderPath = pathjoin(session_functions.session_folder(), constants.RESULTS_FOLDER)
    if not os.path.isdir(folderPath):
        makedirs(folderPath)
    outFilePath = pathjoin(folderPath, 'RWresults' + extension)

    # guard against an empty dataList instead of failing on dataList[0]
    rowCount = len(dataList[0]) if dataList else 0

    # collect the cells of every row in a list and join once (avoids repeated string copies)
    rows = [[] for _ in range(rowCount)]
    for column in dataList:
        for j, value in enumerate(column):
            rows[j].append(str(value) + deliminator)

    with open(outFilePath, 'w') as outFile:
        for cells in rows:
            outFile.write(''.join(cells) + '\n')

    return outFilePath, extension
def dendrogramimage():
    """
    Sends the dendrogram image stored in the results folder of the session.

    Note: Returns the response object created by send_file to flask and eventually
          to the browser.
    """
    # the image lives at <session folder>/<results folder>/<dendrogram file name>
    resultsDir = pathjoin(session_functions.session_folder(), constants.RESULTS_FOLDER)
    return send_file(pathjoin(resultsDir, constants.DENDROGRAM_FILENAME))
def zipWorkSpace(self):
    """
    Builds a zip file containing a copy of the session folder and the saved session.

    The results folder is removed from the session folder first, so it is neither part of
    the archive nor available afterwards. The session folder is then copied to a temporary
    workspace directory, the session is saved into that directory, the directory is zipped
    and finally deleted again.

    Args:
        None

    Returns:
        the path of the zipped workspace
    """
    # initialize the save path
    savepath = os.path.join(constants.UPLOAD_FOLDER, constants.WORKSPACE_DIR)
    workspaceID = str(self.nextID % 10000)  # keep at most the last 4 digits
    workspacefilepath = os.path.join(constants.UPLOAD_FOLDER, workspaceID + '_' + constants.WORKSPACE_FILENAME)

    # remove unnecessary content in the workspace
    try:
        # best effort: the results folder (CSV matrices etc.) may not exist
        shutil.rmtree(os.path.join(session_functions.session_folder(), constants.RESULTS_FOLDER))
    except OSError:
        pass

    # move session folder to work space folder
    try:
        os.remove(workspacefilepath)  # remove a previous workspace file to resolve conflicts
    except OSError:
        pass  # there was no previous workspace file
    try:
        shutil.rmtree(savepath)  # empty the save path to resolve conflicts
    except OSError:
        pass  # the save path did not exist
    general_functions.copydir(session_functions.session_folder(), savepath)

    # save session in the work space folder
    session_functions.saveSession(savepath)

    # zip the dir; the context manager closes the archive even when zipdir fails
    with zipfile.ZipFile(workspacefilepath, 'w') as zipf:
        general_functions.zipdir(savepath, zipf)

    # remove the original dir
    shutil.rmtree(savepath)

    return workspacefilepath
def kmeansimage():
    """
    Sends the k-means graph image stored in the results folder of the session.

    Note: Returns the response object created by send_file to flask and eventually
          to the browser.
    """
    # the image lives at <session folder>/<results folder>/<k-means graph file name>
    resultsDir = pathjoin(session_functions.session_folder(), constants.RESULTS_FOLDER)
    return send_file(pathjoin(resultsDir, constants.KMEANS_GRAPH_FILENAME))
def updateWorkspace(self):
    """
    Points every managed file to the current session folder and reloads the session.

    Used after a workspace has been uploaded: each file gets the save path
    <session folder>/<file contents folder>/<file id>.txt.
    """
    for managedFile in self.files.values():
        fileName = str(managedFile.id) + '.txt'
        managedFile.savePath = pathjoin(session_functions.session_folder(),
                                        constants.FILECONTENTS_FOLDER,
                                        fileName)

    # update the session
    session_functions.loadSession()
def __init__(self):
    """
    Constructor: sets up a file manager without files and creates the file contents
    folder inside the session folder.

    Args:
        None

    Returns:
        FileManager object with no files.
    """
    self.files = {}
    self.nextID = 0

    contentsDir = pathjoin(session_functions.session_folder(), constants.FILECONTENTS_FOLDER)
    makedirs(contentsDir)
def generateCSV(filemanager):
    """
    Writes the matrix returned by generateCSVMatrix to a CSV or TSV file in the results folder.

    The 'csvdelimiter' form field selects tab or comma as delimiter, the 'csvorientation'
    form field decides whether the class labels are appended to every row ('filerow') or
    written as one extra line at the end.

    Args:
        filemanager: the file manager whose files provide the matrix and the class labels.

    Returns:
        The filepath where the file was saved, and the chosen extension (.csv or .tsv).
    """
    transpose = request.form['csvorientation'] == 'filerow'
    if request.form['csvdelimiter'] == 'tab':
        extension, delimiter = '.tsv', '\t'
    else:
        extension, delimiter = '.csv', ','

    countMatrix = generateCSVMatrix(filemanager)

    # tabs and newlines inside the header cells would break the sheet layout
    header = [cell.replace('\t', ' ').replace('\n', ' ') for cell in countMatrix[0]]
    if delimiter == ',':
        # a fullwidth comma stands in for ',' so header cells cannot split a csv column
        wideComma = u'\uFF0C'.encode('utf-8')
        header = [cell.replace(',', wideComma) for cell in header]
    countMatrix[0] = header

    folderPath = pathjoin(session_functions.session_folder(), constants.RESULTS_FOLDER)
    if not os.path.isdir(folderPath):
        makedirs(folderPath)
    outFilePath = pathjoin(folderPath, 'results' + extension)

    classLabelList = ["Class Label"]
    classLabelList.extend(lFile.classLabel for lFile in filemanager.files.values() if lFile.active)

    with open(outFilePath, 'w') as outFile:
        for rowIndex, row in enumerate(countMatrix):
            line = delimiter.join(str(cell) for cell in row)
            if transpose:
                line = line + delimiter + classLabelList[rowIndex]
            outFile.write(line + '\n')

        if not transpose:
            outFile.write(delimiter.join(classLabelList) + '\n')

    return outFilePath, extension
def generateStatistics(filemanager):
    """
    Collects the statistics of every active file and of the whole corpus.

    The matrix is built with the options returned by filemanager.getMatrixOptions() and
    turned into one word list per file. For every active file an
    information.File_Information object is created, for the corpus an
    information.Corpus_Information object. Plots are written to the results folder on a
    best-effort basis: a failing plot does not abort the statistics.

    Args:
        filemanager: the file manager holding the files to analyze.

    Returns:
        FileInfoList: a list of tuples (file id, result of File_Information.returnstatistics())
        corpusInfoDict: the result of Corpus_Information.returnstatistics()
    """
    # folder path for storing graphs and plots
    folderpath = os.path.join(session_functions.session_folder(), constants.RESULTS_FOLDER)
    try:
        os.mkdir(folderpath)
    except OSError:
        # best effort: the folder usually exists already; without it only the plots are lost
        pass

    (ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord, showDeleted,
     onlyCharGramsWithinWords, MFW, culling) = filemanager.getMatrixOptions()

    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=useTfidf,
                                        normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords,
                                        ngramSize=ngramSize, useFreq=useFreq, greyWord=greyWord,
                                        showGreyWord=showDeleted, MFW=MFW, cull=culling)

    WordLists = general_functions.matrixtodict(countMatrix)
    Files = list(filemanager.getActiveFiles())

    FileInfoList = []
    for i, lFile in enumerate(Files):
        # NOTE(review): assumes WordLists[i] belongs to the i-th active file -- confirm the
        # row order of getMatrix matches getActiveFiles()
        fileinformation = information.File_Information(WordLists[i], lFile.name)
        FileInfoList.append((lFile.id, fileinformation.returnstatistics()))
        try:
            fileinformation.plot(os.path.join(folderpath, str(lFile.id) + constants.FILE_INFORMATION_FIGNAME))
        except Exception:
            # plotting is optional; KeyboardInterrupt/SystemExit are no longer swallowed
            pass

    corpusInformation = information.Corpus_Information(WordLists, Files)  # make a new object called corpus
    corpusInfoDict = corpusInformation.returnstatistics()
    try:
        corpusInformation.plot(os.path.join(folderpath, constants.CORPUS_INFORMATION_FIGNAME))
    except Exception:
        # plotting is optional, see above
        pass

    return FileInfoList, corpusInfoDict
def generateRWmatrixPlot(dataPoints, legendLabelsList):
    """
    Generates the rolling windows graph raw data matrix (x and y of every point) and saves
    it as a CSV file.

    The first line holds the legend labels, each followed by two delimiters so that a label
    spans the x and the y column of its series. Line j (j >= 1) holds "x,y," of point j - 1
    of every series that has such a point.

    Args:
        dataPoints: a list of series, each series being a list of [x, y] points
        legendLabelsList: a list whose first element is a string of legend labels
                          separated by '#'. The list is not modified.

    Returns:
        Output file path and extension.
    """
    extension = '.csv'
    deliminator = ','

    folderPath = pathjoin(session_functions.session_folder(), constants.RESULTS_FOLDER)
    if not os.path.isdir(folderPath):
        makedirs(folderPath)
    outFilePath = pathjoin(folderPath, 'RWresults' + extension)

    # split into a local variable: writing the list back into legendLabelsList[0] changed
    # the caller's data and made a second call with the same list fail on .split()
    legendLabels = legendLabelsList[0].split('#')
    header = (deliminator + deliminator).join(legendLabels) + deliminator + deliminator

    # one data row per point index of the longest series
    longest = max([0] + [len(series) for series in dataPoints])
    rows = [[] for _ in range(longest)]
    for series in dataPoints:
        for j, point in enumerate(series):
            rows[j].append(str(point[0]) + deliminator + str(point[1]) + deliminator)

    with open(outFilePath, 'w') as outFile:
        outFile.write(header + '\n')
        for cells in rows:
            outFile.write(''.join(cells) + '\n')

    return outFilePath, extension
def hierarchy():
    """
    Handles the functionality on the hierarchy page. It analyzes the various texts and
    displays a dendrogram.

    Depending on the request it renders the empty page (GET), sends the dendrogram as pdf
    or svg download ('dendro_download' / 'dendroSVG_download') or generates and shows the
    dendrogram ('getdendro').

    Note: Returns a response object (often a render_template call) to flask and eventually
          to the browser. A POST carrying none of the three form keys returns None.
    """
    fileManager = session_functions.loadFileManager()

    if 'analyoption' not in session:
        session['analyoption'] = constants.DEFAULT_ANALIZE_OPTIONS
    if 'hierarchyoption' not in session:
        session['hierarchyoption'] = constants.DEFAULT_HIERARCHICAL_OPTIONS

    if request.method == "GET":
        # "GET" request occurs when the page is first loaded.
        labels = fileManager.getActiveLabels()
        thresholdOps = {}
        return render_template('hierarchy.html', labels=labels, thresholdOps=thresholdOps)

    if 'dendro_download' in request.form:
        # The 'Download Dendrogram' button is clicked on hierarchy.html.
        # sends pdf file to downloads folder.
        return _sendDendrogramDownload(fileManager, 'pdf')

    if 'dendroSVG_download' in request.form:
        # same as above, but for the svg version of the dendrogram
        return _sendDendrogramDownload(fileManager, 'svg')

    if 'getdendro' in request.form:
        # The 'Get Dendrogram' button is clicked on hierarchy.html.
        (pdfPageNumber, score, inconsistentMax, maxclustMax, distanceMax, distanceMin,
         monocritMax, monocritMin, threshold) = utility.generateDendrogram(fileManager)
        session['dengenerated'] = True
        labels = fileManager.getActiveLabels()

        # 'less than or equal' sign, written as an escape so that the value does not depend
        # on the encoding declaration of this source file
        leq = u'\u2264'

        def thresholdRange(lower, upper):
            # text of the form "<lower> <= t <= <upper>" shown for a threshold criterion
            return str(lower) + " " + leq + " t " + leq + " " + str(upper)

        thresholdOps = {"inconsistent": thresholdRange(0, inconsistentMax),
                        "maxclust": thresholdRange(2, maxclustMax),
                        "distance": thresholdRange(distanceMin, distanceMax),
                        "monocrit": thresholdRange(monocritMin, monocritMax)}

        session_functions.saveFileManager(fileManager)
        session_functions.cacheAnalysisOption()
        session_functions.cacheHierarchyOption()
        return render_template('hierarchy.html', labels=labels, pdfPageNumber=pdfPageNumber,
                               score=score, inconsistentMax=inconsistentMax,
                               maxclustMax=maxclustMax, distanceMax=distanceMax,
                               distanceMin=distanceMin, monocritMax=monocritMax,
                               monocritMin=monocritMin, threshold=threshold,
                               thresholdOps=thresholdOps)


def _sendDendrogramDownload(fileManager, extension):
    """
    Regenerates the dendrogram, caches the analysis and hierarchy options and returns the
    file 'dendrogram.<extension>' of the results folder as an attachment.

    Args:
        fileManager: the file manager handed to utility.generateDendrogram
        extension: file extension without the dot ('pdf' or 'svg')

    Returns:
        The response object created by send_file.
    """
    utility.generateDendrogram(fileManager)
    title = request.form['title']
    attachmentname = "den_" + title + "." + extension if title != '' else 'dendrogram.' + extension
    session_functions.cacheAnalysisOption()
    session_functions.cacheHierarchyOption()
    # Join the file name as its own path component instead of concatenating it to
    # RESULTS_FOLDER: the result no longer depends on that constant ending with a
    # separator (assumes pathjoin is os.path.join, as its use elsewhere suggests).
    filePath = pathjoin(session_functions.session_folder(), constants.RESULTS_FOLDER,
                        "dendrogram." + extension)
    return send_file(filePath, attachment_filename=attachmentname, as_attachment=True)
def generateMCJSONObj(filemanager):
    """
    Generates the JSON object for the multicloud page, either from the user's files or from
    an uploaded topic file.

    When the 'analysistype' form field is 'userfiles' the object comes from
    generateJSONForD3. Otherwise the uploaded file 'optuploadname' is saved (if a file was
    chosen) and read. Content starting with the Mallet output_state header is converted to
    lines of the form "<id> <word> <topic>:<count> ..."; any other content is copied
    unchanged. The resulting file is handed to multicloud_topic.topicJSONmaker.

    Args:
        filemanager: the file manager passed on to generateJSONForD3.

    Returns:
        The object returned by generateJSONForD3 or by multicloud_topic.topicJSONmaker.

    Raises:
        Exception: if a data line of the Mallet file does not consist of exactly six columns.
    """
    from collections import defaultdict

    contentPath = os.path.join(session_functions.session_folder(), constants.FILECONTENTS_FOLDER,
                               constants.MALLET_INPUT_FILE_NAME)
    outputPath = os.path.join(session_functions.session_folder(), constants.RESULTS_FOLDER,
                              constants.MALLET_OUTPUT_FILE_NAME)

    # make sure the result dir exists
    resultsDir = pathjoin(session_functions.session_folder(), constants.RESULTS_FOLDER)
    if not os.path.isdir(resultsDir):
        makedirs(resultsDir)

    if request.form['analysistype'] == 'userfiles':
        return generateJSONForD3(filemanager, mergedSet=False)

    # request.form['analysistype'] == 'topicfile'
    upload = request.files['optuploadname']
    # Ask the upload object for its file name instead of parsing it out of str(upload).
    # NOTE(review): assumes the upload is a werkzeug FileStorage, as flask provides -- confirm.
    if upload.filename:
        upload.save(contentPath)
    # without a newly chosen file, whatever is already stored at contentPath is read

    with open(contentPath, 'r') as f:
        content = f.read()  # reads content from the upload file

    if content.startswith('#doc source pos typeindex type topic'):
        # --- begin converting a Mallet file into the file d3 can understand ---
        wordTopicPairs = []
        # Read the output_state file
        with open(contentPath) as f:
            # Skip the first three lines
            for _ in range(3):
                next(f)
            # Create a list of type:topic combinations
            for line in f:
                columns = re.sub(r'\s+', ' ', line).rstrip().split(' ')
                # Make sure the number of columns is correct
                if len(columns) != 6:
                    raise Exception(
                        "Your source data cannot be parsed into a regular number of columns. Please ensure that there are no spaces in your file names or file paths. It; may be easiest to open the outpt_state file in a spreadsheet using a space as; the delimiter and text as the field type. Data should only be present in columns; A to F. Please fix any misaligned data and run this script again.")
                # columns: doc, source, pos, typeindex, type, topic
                wordTopicPairs.append(columns[4] + ':' + columns[5])

        # Count the number of times each type-topic combo appears
        topicCount = defaultdict(int)
        for pair in wordTopicPairs:
            topicCount[pair] += 1

        # Populate a topicCounts dict with type: topic:count
        topicCounts = {}
        for pair, count in topicCount.items():
            # split at the last ':' only, so that a word containing ':' stays intact
            word, topic = pair.rsplit(':', 1)
            entry = topic + ":" + str(count)
            if word in topicCounts:
                topicCounts[word] = topicCounts[word] + " " + entry
            else:
                topicCounts[word] = entry

        # Add a word ID
        out = "".join(str(i) + " " + word + " " + counts + "\n"
                      for i, (word, counts) in enumerate(topicCounts.items()))

        # Write the output file
        with open(outputPath, 'w') as f:
            f.write(out)  # Python will convert \n to os.linesep
        # --- end converting a Mallet file into the file d3 can understand ---
    else:
        with open(outputPath, 'w') as f:
            f.write(content)  # if this is the jsonform, just write that in the output folder

    return multicloud_topic.topicJSONmaker(outputPath)
def generateKMeansVoronoi(filemanager):
    """
    Runs KMeans.getKMeansVoronoi on the matrix of the active files, using the options of
    the submitted form.

    Args:
        filemanager: the file manager providing the matrix, the matrix options and the files.

    Returns:
        kmeansIndex, silttScore, colorChart, finalPointsList, finalCentroidsList, textData
        and maxVal exactly as returned by KMeans.getKMeansVoronoi, together with
        fileNameStr: a string of the file labels, separated by '#'
        KValue: an int of the number of K that was used
        The order is: kmeansIndex, silttScore, fileNameStr, KValue, colorChart,
        finalPointsList, finalCentroidsList, textData, maxVal.
    """
    (ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord, showGreyWord,
     onlyCharGramsWithinWords, MFW, culling) = filemanager.getMatrixOptions()

    # useFreq is deliberately passed as False here, whatever the matrix options say
    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=useTfidf,
                                        normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords,
                                        ngramSize=ngramSize, useFreq=False, greyWord=greyWord,
                                        showGreyWord=showGreyWord, MFW=MFW, cull=culling)

    # drop the first row and the first column without modifying the matrix object
    # returned by getMatrix
    matrix = np.array([row[1:] for row in countMatrix[1:]])

    # Gets options from request.form and uses options to generate the K-mean results
    KValue = len(filemanager.getActiveFiles()) // 2  # default K value (integer division)
    max_iter = 300  # default number of iterations
    initMethod = request.form['init']
    n_init = 300
    tolerance = 1e-4

    # an empty form field keeps the default value
    if request.form['nclusters'] != '':
        KValue = int(request.form['nclusters'])
    if request.form['max_iter'] != '':
        max_iter = int(request.form['max_iter'])
    if request.form['n_init'] != '':
        n_init = int(request.form['n_init'])
    if request.form['tolerance'] != '':
        tolerance = float(request.form['tolerance'])

    metric_dist = request.form['KMeans_metric']

    # The label submitted with the form is used for every active file; when it equals the
    # stored label both are the same text, so no comparison is needed.
    fileNameList = [request.form["file_" + str(lFile.id)].encode("utf-8")
                    for lFile in filemanager.files.values() if lFile.active]

    # an empty list yields an empty string instead of an IndexError
    fileNameStr = "#".join(fileNameList)

    folderPath = pathjoin(session_functions.session_folder(), constants.RESULTS_FOLDER)
    if not os.path.isdir(folderPath):
        makedirs(folderPath)

    (kmeansIndex, silttScore, colorChart, finalPointsList, finalCentroidsList, textData,
     maxVal) = KMeans.getKMeansVoronoi(matrix, KValue, max_iter, initMethod, n_init, tolerance,
                                       metric_dist, fileNameList)

    return (kmeansIndex, silttScore, fileNameStr, KValue, colorChart, finalPointsList,
            finalCentroidsList, textData, maxVal)
def generateDendrogram(filemanager):
    """
    Builds the dendrogram of the active files with the options of the submitted form.

    Args:
        filemanager: the file manager providing the matrix and the matrix options.

    Returns:
        The value returned by dendrogrammer.dendrogram (total number of PDF pages, ready to
        calculate the height of the embeded PDF on screen).
    """
    (ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord, showGreyWord,
     onlyCharGramsWithinWords, MFW, culling) = filemanager.getMatrixOptions()

    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=useTfidf,
                                        normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords,
                                        ngramSize=ngramSize, useFreq=useFreq, greyWord=greyWord,
                                        showGreyWord=showGreyWord, MFW=MFW, cull=culling)

    # options of the dendrogram (with the legends) as chosen in the form
    form = request.form
    orientation = str(form['orientation'])
    title = form['title']
    pruning = int(form['pruning']) if form['pruning'] else 0
    linkage = str(form['linkage'])
    metric = str(form['metric'])

    # both switches count as set only when the field is present with the value 'on'
    augmentedDendrogram = 'augmented' in form and form['augmented'] == 'on'
    showDendroLegends = 'dendroLegends' in form and form['dendroLegends'] == 'on'

    # the counts are the matrix without its first row and its first column
    rowCount = len(countMatrix)
    columnCount = len(countMatrix[0])
    dendroMatrix = [[countMatrix[r][c] for c in range(1, columnCount)]
                    for r in range(1, rowCount)]

    distanceList = dendrogrammer.getDendroDistances(linkage, metric, dendroMatrix)
    legend = getDendrogramLegend(filemanager, distanceList)

    folderPath = pathjoin(session_functions.session_folder(), constants.RESULTS_FOLDER)
    if not os.path.isdir(folderPath):
        makedirs(folderPath)

    # we need labels (segment names): the first cell of every matrix row
    tempLabels = [matrixRow[0] for matrixRow in countMatrix]

    return dendrogrammer.dendrogram(orientation, title, pruning, linkage, metric, tempLabels,
                                    dendroMatrix, legend, folderPath, augmentedDendrogram,
                                    showDendroLegends)