def multi_cloud():
    """Handles the functionality on the multicloud pages.

    :return: a response object (often a render_template call) to Flask and
    eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    labels = file_manager.get_active_labels_with_id()
    labels = OrderedDict(
        natsorted(list(labels.items()), key=lambda x: x[1]))
    if request.method == 'GET':
        # 'GET' request occurs when the page is first loaded.
        if 'cloudoption' not in session:
            session['cloudoption'] = constants.DEFAULT_CLOUD_OPTIONS
        if 'multicloudoptions' not in session:
            session['multicloudoptions'] = \
                constants.DEFAULT_MULTICLOUD_OPTIONS
        return render_template(
            'multicloud.html',
            itm="multicloud",
            jsonStr="",
            labels=labels,
            numActiveDocs=num_active_docs)
    if request.method == "POST":
        # This is legacy code.
        # The form is now submitted by Ajax do_multicloud()

        # 'POST' requests occur when the html form is submitted
        # (i.e. 'Get Graphs', 'Download...')
        file_manager = utility.load_file_manager()
        json_obj = utility.generate_mc_json_obj(file_manager)
        # Replaces client-side array generator
        word_counts_array = []
        for doc in json_obj:
            name = doc["name"]
            children = doc["children"]
            word_counts = {}
            for item in children:
                word_counts[item["text"]] = item["size"]
            word_counts_array.append(
                {"name": name, "word_counts": word_counts, "words": children})
        # Temporary fix because the front end needs a string
        json_obj = json.dumps(json_obj)
        session_manager.cache_cloud_option()
        session_manager.cache_multi_cloud_options()
        return render_template(
            'multicloud.html',
            itm="multicloud",
            JSONObj=json_obj,
            labels=labels,
            numActiveDocs=num_active_docs)

def multi_cloud():
    """Handles the functionality on the multicloud pages.

    :return: a response object (often a render_template call) to Flask and
    eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    labels = file_manager.get_active_labels_with_id()
    labels = OrderedDict(natsorted(list(labels.items()), key=lambda x: x[1]))
    if request.method == 'GET':
        # 'GET' request occurs when the page is first loaded.
        if 'cloudoption' not in session:
            session['cloudoption'] = constants.DEFAULT_CLOUD_OPTIONS
        if 'multicloudoptions' not in session:
            session['multicloudoptions'] = \
                constants.DEFAULT_MULTICLOUD_OPTIONS
        return render_template('multicloud.html',
                               itm="multicloud",
                               jsonStr="",
                               labels=labels,
                               numActiveDocs=num_active_docs)
    if request.method == "POST":
        # This is legacy code.
        # The form is now submitted by Ajax do_multicloud()

        # 'POST' requests occur when the html form is submitted
        # (i.e. 'Get Graphs', 'Download...')
        file_manager = utility.load_file_manager()
        json_obj = utility.generate_mc_json_obj(file_manager)
        # Replaces client-side array generator
        word_counts_array = []
        for doc in json_obj:
            name = doc["name"]
            children = doc["children"]
            word_counts = {}
            for item in children:
                word_counts[item["text"]] = item["size"]
            word_counts_array.append({
                "name": name,
                "word_counts": word_counts,
                "words": children
            })
        # Temporary fix because the front end needs a string
        json_obj = json.dumps(json_obj)
        session_manager.cache_cloud_option()
        session_manager.cache_multi_cloud_options()
        return render_template('multicloud.html',
                               itm="multicloud",
                               JSONObj=json_obj,
                               labels=labels,
                               numActiveDocs=num_active_docs)

def top_words():
    """Handles the topword page functionality.

    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    labels = file_manager.get_active_labels_with_id()
    # 'GET' request occurs when the page is first loaded
    if 'topwordoption' not in session:
        session['topwordoption'] = constants.DEFAULT_TOPWORD_OPTIONS
    if 'analyoption' not in session:
        session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
    # get the class division map and number of existing classes
    class_division_map = FileManagerModel().load_file_manager().\
        get_class_division_map()
    num_class = class_division_map.shape[0]
    return render_template('topword.html',
                           labels=labels,
                           classmap=class_division_map,
                           numclass=num_class,
                           topwordsgenerated='class_div',
                           itm='topwords',
                           numActiveDocs=num_active_docs)

def topword_html():
    # 'POST' request occurs when html form is submitted
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    labels = file_manager.get_active_labels_with_id()
    # get the class division map and number of existing classes
    class_division_map = FileManagerModel().load_file_manager().\
        get_class_division_map()
    num_class = class_division_map.shape[0]
    if 'get-topword' in request.form:  # download topword
        path = TopwordModel().get_topword_csv_path(
            class_division_map=class_division_map)
        session_manager.cache_analysis_option()
        session_manager.cache_top_word_options()
        return send_file(path,
                         attachment_filename=constants.TOPWORD_CSV_FILE_NAME,
                         as_attachment=True)
    else:
        session_manager.cache_analysis_option()
        session_manager.cache_top_word_options()
        topword_result = TopwordModel().get_readable_result(
            class_division_map=class_division_map)
        return render_template('topword.html',
                               result=topword_result.results,
                               labels=labels,
                               header=topword_result.header,
                               numclass=num_class,
                               topwordsgenerated='True',
                               classmap=[],
                               itm='topwords',
                               numActiveDocs=num_active_docs)

def delete_one():
    """:return: string indicating that it has succeeded
    """
    file_manager = utility.load_file_manager()
    file_manager.delete_files([int(request.data)])
    utility.save_file_manager(file_manager)
    return "success"

def scrub():  # Are you looking for scrubber.py?
    """Handles the functionality of the scrub page.

    It scrubs the files depending on the specifications chosen by the user,
    with an option to download the scrubbed files.
    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    # "GET" request occurs when the page is first loaded.
    if 'scrubbingoptions' not in session:
        session['scrubbingoptions'] = constants.DEFAULT_SCRUB_OPTIONS
    if 'xmlhandlingoptions' not in session:
        session['xmlhandlingoptions'] = {
            "myselect": {
                "action": '',
                "attribute": ""
            }
        }
    utility.xml_handling_options()
    previews = file_manager.get_previews_of_active()
    tags_present, doe_present, gutenberg_present = \
        file_manager.check_actives_tags()
    return render_template('scrub.html',
                           previews=previews,
                           itm="scrubber",
                           haveTags=tags_present,
                           haveDOE=doe_present,
                           haveGutenberg=gutenberg_present,
                           numActiveDocs=num_active_docs)

def get_document_previews() -> str:
    """Returns previews of the active documents.

    :return: Previews of the active documents.
    """
    file_manager = utility.load_file_manager()
    return json.dumps(file_manager.get_previews_of_active())

def delete_selected():
    """:return: a json object with the ids of the deleted files
    """
    file_manager = utility.load_file_manager()
    file_ids = file_manager.delete_active_files()
    utility.save_file_manager(file_manager)
    return json.dumps(file_ids)

def options_from_front_end(self) -> StatsFrontEndOption:
    """Get the options from the front end.

    Options include the active file ids, the sort column and direction, and
    the colors.
    """
    # Get the ids of the active files.
    active_file_ids = [
        file.id for file in load_file_manager().get_active_files()
    ]
    # Get the selected column.
    sort_column = int(
        self._front_end_data["statistics_table_selected_column"])
    # Get the sort direction.
    sort_ascending = bool(
        self._front_end_data["statistics_table_sort_mode"] == "Ascending")
    # Get the colors.
    text_color = self._front_end_data.get("text_color")
    highlight_color = self._front_end_data.get("highlight_color")
    # Return the stats front end option.
    return StatsFrontEndOption(active_file_ids=active_file_ids,
                               sort_column=sort_column,
                               sort_ascending=sort_ascending,
                               text_color=text_color,
                               highlight_color=highlight_color)

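# Illustrative only: a hypothetical _front_end_data payload that the
# options_from_front_end() method above could receive, using only the keys it
# reads. All values shown here are made up for the sketch.
_EXAMPLE_STATS_FRONT_END_DATA = {
    "statistics_table_selected_column": "2",    # parsed with int()
    "statistics_table_sort_mode": "Ascending",  # compared against "Ascending"
    "text_color": "#000000",                    # read with .get()
    "highlight_color": "#ffff00",               # read with .get()
}
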
def scrub():  # Are you looking for scrubber.py?
    """Handles the functionality of the scrub page.

    It scrubs the files depending on the specifications chosen by the user,
    with an option to download the scrubbed files.
    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    # "GET" request occurs when the page is first loaded.
    if 'scrubbingoptions' not in session:
        session['scrubbingoptions'] = constants.DEFAULT_SCRUB_OPTIONS
    if 'xmlhandlingoptions' not in session:
        session['xmlhandlingoptions'] = {
            "myselect": {"action": '', "attribute": ""}}
    utility.xml_handling_options()
    previews = file_manager.get_previews_of_active()
    tags_present, doe_present, gutenberg_present = \
        file_manager.check_actives_tags()
    return render_template(
        'scrub.html',
        previews=previews,
        itm="scrubber",
        haveTags=tags_present,
        haveDOE=doe_present,
        haveGutenberg=gutenberg_present,
        numActiveDocs=num_active_docs)

def download():
    """Downloads the cut files.

    :return: A .zip file containing the cut files.
    """
    file_manager = utility.load_file_manager()
    return file_manager.zip_active_files("cut-files.zip")

def upload():
    """Handles the functionality of the upload page.

    It uploads files to be used in the current session.
    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    if request.method == "GET":
        print("About to fix session in case of browser caching")
        # fix the session in case the browser is caching the old session
        session_manager.fix()
        print("Session fixed. Rendering template.")
        if 'generalsettings' not in session:
            session['generalsettings'] = \
                constants.DEFAULT_GENERALSETTINGS_OPTIONS
        return render_template(
            'upload.html',
            MAX_FILE_SIZE=constants.MAX_FILE_SIZE,
            MAX_FILE_SIZE_INT=constants.MAX_FILE_SIZE_INT,
            MAX_FILE_SIZE_UNITS=constants.MAX_FILE_SIZE_UNITS,
            itm="upload-tool",
            numActiveDocs=num_active_docs)
    # X-FILENAME is the flag to signify a file upload
    if 'X-FILENAME' in request.headers:
        # File upload through javascript
        file_manager = utility.load_file_manager()
        # --- check file name ---
        # Grab the filename, which will be UTF-8 percent-encoded (e.g. '%E7'
        # instead of python's '\xe7')
        file_name = request.headers['X-FILENAME']
        # Unquote using urllib's percent-encoding decoder (turns '%E7' into
        # '\xe7')
        file_name = unquote(file_name)
        # --- end check file name ---
        if file_name.endswith('.lexos'):
            file_manager.handle_upload_workspace()
            # update filemanager
            file_manager = utility.load_file_manager()
            file_manager.update_workspace()
        else:
            file_manager.add_upload_file(request.data, file_name)
        utility.save_file_manager(file_manager)
        return 'success'

def set_class_selected():
    file_manager = utility.load_file_manager()
    rows = request.json[0]
    new_class_label = request.json[1]
    for file_id in list(rows):
        file_manager.files[int(file_id)].set_class_label(new_class_label)
    utility.save_file_manager(file_manager)
    return json.dumps(rows)

def set_class_selected():
    file_manager = utility.load_file_manager()
    rows = request.json[0]
    new_class_label = request.json[1]
    for fileID in list(rows):
        file_manager.files[int(fileID)].set_class_label(new_class_label)
    utility.save_file_manager(file_manager)
    return json.dumps(rows)

def disable_rows():
    """:return: string indicating that it has succeeded
    """
    file_manager = utility.load_file_manager()
    for file_id in request.json:
        file_manager.disable_files([file_id, ])
    utility.save_file_manager(file_manager)
    return 'success'

def download_scrubbing():
    """downloads scrubbed files.

    :return: a .zip with all the scrubbed files
    """
    # The 'Download Scrubbed Files' button is clicked on scrub.html.
    # Sends zipped files to downloads folder.
    file_manager = utility.load_file_manager()
    return file_manager.zip_active_files('scrubbed.zip')

def select_all():
    """selects all files.

    :return: string indicating that it has succeeded
    """
    file_manager = utility.load_file_manager()
    file_manager.enable_all()
    utility.save_file_manager(file_manager)
    return 'success'

def download_documents():
    """downloads all selected files.

    :return: a .zip file containing all selected files
    """
    # The 'Download Selected Documents' button is clicked in manage.html.
    # Sends zipped files to downloads folder.
    file_manager = utility.load_file_manager()
    return file_manager.zip_active_files('selected_documents.zip')

def download_cutting():
    """downloads cut files.

    :return: a .zip with all the cut files
    """
    # The 'Download Segmented Files' button is clicked on cut.html
    # sends zipped files to downloads folder
    file_manager = utility.load_file_manager()
    return file_manager.zip_active_files('cut_files.zip')

def disable_rows():
    """:return: string indicating that it has succeeded
    """
    file_manager = utility.load_file_manager()
    for file_id in request.json:
        file_manager.disable_files([file_id, ])
    utility.save_file_manager(file_manager)
    return 'success'

def set_class():
    """sets a class.

    :return: string indicating that it has succeeded
    """
    file_manager = utility.load_file_manager()
    file_id = int(request.json[0])
    new_class_label = request.json[1]
    file_manager.files[file_id].set_class_label(new_class_label)
    utility.save_file_manager(file_manager)
    return 'success'

def download_workspace():
    """Sends the workspace file (.lexos) to the user.

    Note that the workspace file can be uploaded later to restore the
    workspace.
    :return: the workspace file sent to the user
    """
    file_manager = utility.load_file_manager()
    path = file_manager.zip_workspace()
    return send_file(path,
                     attachment_filename=constants.WORKSPACE_FILENAME,
                     as_attachment=True)

def set_label():
    """sets the label of a file.

    :return: string indicating that it has succeeded
    """
    file_manager = utility.load_file_manager()
    file_id = int(request.json[0])
    new_name = request.json[1]
    file_manager.files[file_id].set_name(new_name)
    file_manager.files[file_id].label = new_name
    utility.save_file_manager(file_manager)
    return 'success'

def download_workspace() -> str:
    """Sends the workspace file (.lexos) to the user.

    :return: The workspace file.
    """
    file_manager = utility.load_file_manager()
    path = file_manager.zip_workspace()
    return send_file(
        path,
        attachment_filename=constants.WORKSPACE_FILENAME,
        as_attachment=True)

def download_workspace():
    """Sends the workspace file (.lexos) to the user.

    Note that the workspace file can be uploaded later to restore the
    workspace.
    :return: the workspace file sent to the user
    """
    file_manager = utility.load_file_manager()
    path = file_manager.zip_workspace()
    return send_file(
        path,
        attachment_filename=constants.WORKSPACE_FILENAME,
        as_attachment=True)

def get_previews():
    """:return: a json object with the id, label, and preview text of the
    requested file
    """
    file_manager = utility.load_file_manager()
    file_id = int(request.data)
    file_label = file_manager.files[file_id].label
    file_preview = file_manager.files[file_id].load_contents()
    preview_vals = {
        "id": file_id,
        "label": file_label,
        "previewText": file_preview}
    return json.dumps(preview_vals)

def add_document() -> str:
    """Adds a document to the file manager or loads a .lexos file.

    :return: An empty string.
    """
    file_manager = utility.load_file_manager()
    # Get and decode the file name
    file_name = request.headers["file-name"]
    file_name = unquote(file_name)
    # If the file is a .lexos file, load it
    if file_name.endswith('.lexos'):
        file_manager.handle_upload_workspace()
        file_manager = utility.load_file_manager()
        file_manager.update_workspace()
    # Otherwise, add the document
    else:
        file_manager.add_upload_file(request.data, file_name)
    utility.save_file_manager(file_manager)
    return ''

def get_previews():
    """:return: a json object with the id, label, and preview text of the
    requested file
    """
    file_manager = utility.load_file_manager()
    file_id = int(request.data)
    file_label = file_manager.files[file_id].label
    file_preview = file_manager.files[file_id].load_contents()
    preview_vals = {
        "id": file_id,
        "label": file_label,
        "previewText": file_preview
    }
    return json.dumps(preview_vals)

def detect_active_docs() -> int:
    """Detects the number of active documents.

    :return: The number of active documents.
    """
    if session:
        file_manager = utility.load_file_manager()
        active = file_manager.get_active_files()
        if active:
            return len(active)
        else:
            return 0
    else:
        redirect(url_for('base.no_session'))
        return 0

def get_tokenizer_csv():
    """Called when the CSV button in Tokenizer is clicked.

    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    file_manager = utility.load_file_manager()
    session_manager.cache_analysis_option()
    session_manager.cache_csv_options()
    save_path, file_extension = utility.generate_csv(file_manager)
    utility.save_file_manager(file_manager)
    return send_file(save_path,
                     attachment_filename="frequency_matrix" + file_extension,
                     as_attachment=True)

def analyze():
    """Analyzes the files.

    :return: The results of the analysis.
    """
    path = get_path()
    analysis = ContentAnalysisModel()
    file_manager = load_file_manager()
    active_files = file_manager.get_active_files()
    # Set the formula
    session["formula"] = ContentAnalysisReceiver() \
        .options_from_front_end().formula
    # Add the files to analyze
    for file in active_files:
        analysis.add_file(file_name=file.name,
                          label=file.label,
                          content=file.load_contents())
    # Add the dictionaries
    for name in os.listdir(path):
        analysis.add_dictionary(
            file_name=name,
            label=name,
            content=open(os.path.join(path, name), 'r').read())
    # Analyze
    overview_results, overview_csv, corpus_results, corpus_csv, \
        document_results, errors = analysis.analyze()
    # Return the results
    if len(errors):
        return jsonify({"error": errors})
    if not len(corpus_results):
        return jsonify({"error": "Failed to perform the analysis."})
    return jsonify({
        "overview-table-head": overview_results[0],
        "overview-table-body": overview_results[1:],
        "overview-table-csv": overview_csv,
        "corpus-table-head": ["Dictionary", "Phrase", "Count"],
        "corpus-table-body": corpus_results,
        "corpus-table-csv": corpus_csv,
        "documents": document_results,
        "error": False
    })

def do_cutting():
    """cuts the files.

    :return: cut files and their preview in a json object
    """
    file_manager = utility.load_file_manager()
    # The 'Preview Cuts' or 'Apply Cuts' button is clicked on cut.html.
    session_manager.cache_cutting_options()
    # Saving changes only if action = apply
    saving_changes = True if request.form['action'] == 'apply' else False
    previews = file_manager.cut_files(saving_changes=saving_changes)
    if saving_changes:
        utility.save_file_manager(file_manager)
    data = {"data": previews}
    data = json.dumps(data)
    return data

def get_active_document_count() -> int:
    """Gets the number of active documents.

    :return: The number of active documents.
    """
    if session:
        file_manager = utility.load_file_manager()
        active_files = file_manager.get_active_files()
        if active_files:
            return len(active_files)
        else:
            return 0
    else:
        redirect("no-session")
        return 0

def download() -> str:
    """Returns a download of the active files.

    :return: the zip file to be downloaded.
    """
    file_manager = utility.load_file_manager()
    response = make_response(
        file_manager.zip_active_files("scrubbed_documents.zip"))
    # Disable download caching
    response.headers["Cache-Control"] = \
        "max-age=0, no-cache, no-store, must-revalidate"
    response.headers["Expires"] = 0
    response.headers["Pragma"] = "no-cache"
    return response

def execute():
    """Cuts the files.

    :return: Previews of the cut files.
    """
    file_manager = utility.load_file_manager()
    session_manager.cache_cutting_options()
    # Apply the cutting
    save = request.form["action"] == "apply"
    previews = file_manager.cut_files(saving_changes=save)
    # Save the results if requested
    if save:
        utility.save_file_manager(file_manager)
    return json.dumps(previews)

def detect_active_docs() -> int:
    """detects the number of active documents.

    This function can be called at the beginning of each tool.
    :return: number of active documents
    """
    # TODO: this function should probably be moved to file_manager.py
    if session:
        file_manager = utility.load_file_manager()
        active = file_manager.get_active_files()
        if active:
            return len(active)
        else:
            return 0
    else:
        redirect(url_for('base.no_session'))
        return 0

def cut():
    """Handles the functionality of the cut page.

    It cuts the files into various segments depending on the specifications
    chosen by the user, and sends the text segments.
    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    active = file_manager.get_active_files()
    if len(active) > 0:
        num_char = [x.num_letters() for x in active]
        num_word = [x.num_words() for x in active]
        num_line = [x.num_lines() for x in active]
        max_char = max(num_char)
        max_word = max(num_word)
        max_line = max(num_line)
        active_file_ids = [lfile.id for lfile in active]
    else:
        num_char = []
        num_word = []
        num_line = []
        max_char = 0
        max_word = 0
        max_line = 0
        active_file_ids = []
    if request.method == "GET":
        # "GET" request occurs when the page is first loaded.
        if 'cuttingoptions' not in session:
            session['cuttingoptions'] = constants.DEFAULT_CUT_OPTIONS
        previews = file_manager.get_previews_of_active()
        return render_template(
            'cut.html',
            previews=previews,
            num_active_files=len(previews),
            numChar=num_char,
            numWord=num_word,
            numLine=num_line,
            maxChar=max_char,
            maxWord=max_word,
            maxLine=max_line,
            activeFileIDs=active_file_ids,
            itm="cut",
            numActiveDocs=num_active_docs)

def do_scrubbing():
    """:return: a json object with a scrubbed preview
    """
    file_manager = utility.load_file_manager()
    # The 'Preview Scrubbing' or 'Apply Scrubbing' button is clicked on
    # scrub.html.
    session_manager.cache_alteration_files()
    session_manager.cache_scrub_options()
    # saves changes only if 'Apply Scrubbing' button is clicked
    saving_changes = True if request.form["formAction"] == "apply" else False
    # preview_info is a tuple of (id, file_name(label), class_label, preview)
    previews = file_manager.scrub_files(saving_changes=saving_changes)
    # escape the html elements, only transforms preview[3], because that is
    # the text:
    previews = [
        [preview[0], preview[1], preview[2],
         general_functions.html_escape(preview[3])]
        for preview in previews]
    if saving_changes:
        utility.save_file_manager(file_manager)
    data = {"data": previews}
    data = json.dumps(data)
    return data

def scrape():
    """scrapes the urls and generates a text file from each url.

    :return: json object with a string that indicates that it has succeeded
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    if request.method == "GET":
        return render_template('scrape.html', numActiveDocs=num_active_docs)
    if request.method == "POST":
        import requests
        urls = request.json["urls"]
        urls = urls.strip()
        urls = urls.replace(",", "\n")  # Replace commas with line breaks
        urls = re.sub(r"\s+", "\n", urls)  # Get rid of extra white space
        urls = urls.split("\n")
        file_manager = utility.load_file_manager()
        for i, url in enumerate(urls):
            r = requests.get(url)
            file_manager.add_upload_file(r.text, "url" + str(i) + ".txt")
        utility.save_file_manager(file_manager)
        response = "success"
        return json.dumps(response)

def merge_documents():
    """:return: json object with the new file's id and preview
    """
    print("Merging...")
    file_manager = utility.load_file_manager()
    file_manager.disable_all()
    file_ids = request.json[0]
    new_name = request.json[1]
    source_file = request.json[2]
    milestone = request.json[3]
    end_milestone = re.compile(milestone + '$')
    new_file = ""
    for file_id in file_ids:
        new_file += file_manager.files[int(file_id)].load_contents()
        new_file += request.json[3]  # Add the milestone string
    new_file = re.sub(end_milestone, '', new_file)  # Strip the last milestone
    # The routine below is ugly, but it works
    file_id = file_manager.add_file(source_file, new_name, new_file)
    file_manager.files[file_id].name = new_name
    file_manager.files[file_id].label = new_name
    file_manager.files[file_id].active = True
    utility.save_file_manager(file_manager)
    # Returns a new fileID and some preview text
    return json.dumps([file_id, new_file[0:152] + '...'])

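# Illustrative only: a hypothetical request.json payload for merge_documents()
# above, matching the positional fields it reads. The ids, file name, source
# file, and milestone values shown here are made up for the sketch.
_EXAMPLE_MERGE_PAYLOAD = [
    ["1", "2"],     # request.json[0]: ids of the files to merge
    "merged.txt",   # request.json[1]: new file name
    "1",            # request.json[2]: source file
    "#MILESTONE#",  # request.json[3]: milestone string appended between files
]
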
def viz():
    """Handles the functionality on the alternate bubbleViz page.

    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    labels = file_manager.get_active_labels_with_id()
    from collections import OrderedDict
    from natsort import natsorted
    labels = OrderedDict(natsorted(labels.items(), key=lambda x: x[1]))
    if request.method == "GET":
        # "GET" request occurs when the page is first loaded.
        if 'cloudoption' not in session:
            session['cloudoption'] = constants.DEFAULT_CLOUD_OPTIONS
        if 'bubblevisoption' not in session:
            session['bubblevisoption'] = constants.DEFAULT_BUBBLEVIZ_OPTIONS
        return render_template(
            'viz.html',
            JSONObj="",
            labels=labels,
            itm="bubbleviz",
            numActiveDocs=num_active_docs)
    if request.method == "POST":
        # "POST" requests occur when the html form is submitted
        # (i.e. 'Get Dendrogram', 'Download...')
        # Legacy function
        # json_obj = utility.generateJSONForD3(file_manager, mergedSet=True)

        # Get the file manager, sorted labels, and tokenization options
        file_manager = utility.load_file_manager()
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
        token_type = session['analyoption']['tokenType']
        token_size = int(session['analyoption']['tokenSize'])
        # Limit docs to those selected or to active docs
        chosen_doc_ids = [int(x) for x in request.form.getlist('segmentlist')]
        active_docs = []
        if chosen_doc_ids:
            for file_id in chosen_doc_ids:
                active_docs.append(file_id)
        else:
            for l_file in file_manager.files.values():
                if l_file.active:
                    active_docs.append(l_file.id)
        # Get the contents of all selected/active docs
        all_contents = []
        for file_id in active_docs:
            if file_manager.files[file_id].active:
                content = file_manager.files[file_id].load_contents()
                all_contents.append(content)
        # Generate a DTM
        dtm, vocab = utility.simple_vectorizer(
            all_contents, token_type, token_size)
        # Convert the DTM to a pandas dataframe with the terms as column
        # headers
        import pandas as pd
        df = pd.DataFrame(dtm, columns=vocab)
        # Get the Minimum Token Length and Maximum Term Settings
        minimum_length = int(
            request.form['minlength']) if 'minlength' in request.form else 0
        max_num_words = 100  # Default if no 'maxwords' value is submitted
        if 'maxwords' in request.form:
            # Make sure there is a number in the input form
            check_for_value = request.form['maxwords']
            if check_for_value == "":
                max_num_words = 100
            else:
                max_num_words = int(request.form['maxwords'])
        # Filter words that don't meet the minimum length from the dataframe
        for term in vocab:
            if len(term) < minimum_length:
                del df[term]
        # Extract a dictionary of term count sums
        sums_dict = df.sum(axis=0).to_dict()
        # Create a new dataframe of sums and sort it by counts, then terms
        # Warning!!! This is not natsort. Multiple terms at the edge of
        # the maximum number of words limit may be cut off in arbitrary
        # order. We need to implement natsort for dataframes.
        f = pd.DataFrame(list(sums_dict.items()), columns=['term', 'count'])
        f.sort_values(by=['count', 'term'], axis=0,
                      ascending=[False, True], inplace=True)
        # Convert the dataframe head to a dict for use below
        f = f.head(n=max_num_words).to_dict()
        # Build the JSON object for d3.js
        termslist = []
        countslist = []
        children = []
        for item in f['term'].items():
            termslist.append(item[1])
        for item in f['count'].items():
            countslist.append(item[1])
        for k, v in enumerate(termslist):
            children.append({"name": v, "size": str(countslist[k])})
        json_obj = {"name": "tokens", "children": children}
        # Turn the JSON object into a JSON string for the front end
        json_str = json.dumps(json_obj)
        session_manager.cache_cloud_option()
        session_manager.cache_bubble_viz_option()
        return render_template(
            'viz.html',
            JSONObj=json_str,
            labels=labels,
            itm="bubbleviz",
            numActiveDocs=num_active_docs)

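# Illustrative only: a minimal sketch of the JSON object that viz() builds for
# the d3.js bubble chart, assuming two hypothetical tokens "a" and "b" with
# counts 3 and 1. The real object is built from the document-term matrix
# above, and "size" values are strings because the code calls str() on counts.
_EXAMPLE_VIZ_JSON = {
    "name": "tokens",
    "children": [
        {"name": "a", "size": "3"},
        {"name": "b", "size": "1"},
    ],
}
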
def content_analysis():
    """Handles the functionality on the contentanalysis page.

    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    analysis = ContentAnalysisModel()
    path = os.path.join(constants.TMP_FOLDER, constants.UPLOAD_FOLDER,
                        session['id'], 'content_analysis/')
    if os.path.isdir(path):
        dictionary_names = [name for name in os.listdir(path)]
    else:
        dictionary_names = []
    if request.method == 'GET':
        if 'dictionary_labels' in session:
            dict_labels = session['dictionary_labels']
        else:
            dict_labels = []
        if 'active_dictionaries' in session:
            active_dicts = session['active_dictionaries']
        else:
            active_dicts = [True] * len(dict_labels)
        if 'toggle_all_value' in session:
            toggle_all_value = session['toggle_all_value']
        else:
            toggle_all_value = True
        if 'formula' in session:
            formula = session['formula']
        else:
            formula = ""
        return render_template('contentanalysis.html',
                               dictionary_labels=dict_labels,
                               active_dictionaries=active_dicts,
                               toggle_all_value=toggle_all_value,
                               itm="content-analysis",
                               formula=formula)
    else:
        num_active_docs = detect_active_docs()
        active_dicts = ContentAnalysisReceiver().options_from_front_end(
        ).active_dicts
        dict_labels = ContentAnalysisReceiver().options_from_front_end(
        ).dict_labels
        session['formula'] = ContentAnalysisReceiver().options_from_front_end(
        ).formula
        if len(dict_labels) == 0:
            dict_labels = [os.path.splitext(dict_name)[0]
                           for dict_name in dictionary_names]
            active_dicts = [True] * len(dict_labels)
        num_active_dicts = active_dicts.count(True)
        if num_active_docs == 0 and num_active_dicts == 0:
            return error("At least 1 active document and 1 active "
                         "dictionary are required to perform a "
                         "content analysis.")
        elif num_active_docs == 0:
            return error("At least 1 active document is required to perform "
                         "a content analysis.")
        elif num_active_dicts == 0:
            return error("At least 1 active dictionary is required to perform"
                         " a content analysis.")
        file_manager = load_file_manager()
        active_files = file_manager.get_active_files()
        for file in active_files:
            analysis.add_file(file_name=file.name,
                              label=file.label,
                              content=file.load_contents())
        for dict_name, dict_label, active in zip(dictionary_names,
                                                 dict_labels, active_dicts):
            if active:
                f = open(os.path.join(path, dict_name), "r")
                content = f.read()
                analysis.add_dictionary(file_name=dict_name,
                                        label=dict_label,
                                        content=content)
        result_table, corpus_raw_counts_table, files_raw_counts_tables,\
            formula_errors = analysis.analyze()
        if len(formula_errors) != 0 or result_table is None:
            return error(formula_errors)
        data = {"result_table": result_table,
                "dictionary_labels": dict_labels,
                "active_dictionaries": active_dicts,
                "corpus_raw_counts_table": corpus_raw_counts_table,
                "files_raw_counts_tables": files_raw_counts_tables,
                "error": False}
        return json.dumps(data)

def word_cloud():
    """Handles the functionality on the visualisation page.

    A prototype for displaying single word cloud graphs.
    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    labels = file_manager.get_active_labels_with_id()
    from collections import OrderedDict
    labels = OrderedDict(natsorted(list(labels.items()), key=lambda x: x[1]))
    if request.method == "GET":
        # "GET" request occurs when the page is first loaded.
        if 'cloudoption' not in session:
            session['cloudoption'] = constants.DEFAULT_CLOUD_OPTIONS
        # there is no wordcloud option so we don't initialize that
        return render_template(
            'wordcloud.html',
            itm="word-cloud",
            labels=labels,
            numActiveDocs=num_active_docs)
    if request.method == "POST":
        # "POST" requests occur when the html form is submitted
        # (i.e. 'Get Dendrogram', 'Download...')
        # Get the file manager, sorted labels, and tokenization options
        file_manager = utility.load_file_manager()
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
        token_type = session['analyoption']['tokenType']
        token_size = int(session['analyoption']['tokenSize'])
        # Limit docs to those selected or to active docs
        chosen_doc_ids = [int(x) for x in request.form.getlist('segmentlist')]
        active_docs = []
        if chosen_doc_ids:
            for file_id in chosen_doc_ids:
                active_docs.append(file_id)
        else:
            for l_file in file_manager.files.values():
                if l_file.active:
                    active_docs.append(l_file.id)
        # Get the contents of all selected/active docs
        all_contents = []
        for file_id in active_docs:
            if file_manager.files[file_id].active:
                content = file_manager.files[file_id].load_contents()
                all_contents.append(content)
        # Generate a DTM
        dtm, vocab = utility.simple_vectorizer(
            all_contents, token_type, token_size)
        # Convert the DTM to a pandas dataframe and save the sums
        import pandas as pd
        df = pd.DataFrame(dtm)
        df = df.sum(axis=0)
        # Build the JSON object for d3.js
        json_obj = {"name": "tokens", "children": []}
        for k, v in enumerate(vocab):
            json_obj["children"].append({"name": v, "size": str(df[k])})
        # Create a list of column values for the word count table
        from operator import itemgetter
        terms = natsorted(
            json_obj["children"], key=itemgetter('size'), reverse=True)
        column_values = []
        for term in terms:
            # rows = [term["name"].encode('utf-8'), term["size"]]
            rows = [term["name"], term["size"]]
            column_values.append(rows)
        # Turn the JSON object into a JSON string for the front end
        json_obj = json.dumps(json_obj)
        session_manager.cache_cloud_option()
        return render_template(
            'wordcloud.html',
            labels=labels,
            JSONObj=json_obj,
            columnValues=column_values,
            itm="word-cloud",
            numActiveDocs=num_active_docs)

def manage():
    """Handles the functionality of the select page.

    Its primary role is to activate/deactivate specific files depending on
    the user's input.
    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    # Usual loading of the FileManager
    file_manager = utility.load_file_manager()
    if request.method == "GET":
        rows = file_manager.get_previews_of_all()
        for row in rows:
            if row["state"]:
                row["state"] = "selected"
            else:
                row["state"] = ""
        return render_template(
            'manage.html',
            rows=rows,
            itm="manage",
            numActiveDocs=num_active_docs)
    if 'previewTest' in request.headers:
        file_id = int(request.data)
        file_label = file_manager.files[file_id].label
        file_preview = file_manager.files[file_id].get_preview()
        preview_vals = {
            "id": file_id,
            "label": file_label,
            "previewText": file_preview}
        return json.dumps(preview_vals)
    if 'toggleFile' in request.headers:
        # Catch-all for any POST request.
        # On the select page, POSTs come from JavaScript AJAX XHRequests.
        file_id = int(request.data)
        # Toggle the file from active to inactive or vice versa
        file_manager.toggle_file(file_id)
    elif 'toggliFy' in request.headers:
        file_ids = request.data
        file_ids = file_ids.split(",")
        file_manager.disable_all()
        # Toggle the file from active to inactive or vice versa
        file_manager.enable_files(file_ids)
    elif 'setLabel' in request.headers:
        new_name = (request.headers['setLabel'])
        file_id = int(request.data)
        file_manager.files[file_id].set_name(new_name)
        file_manager.files[file_id].label = new_name
    elif 'setClass' in request.headers:
        new_class_label = (request.headers['setClass'])
        file_id = int(request.data)
        file_manager.files[file_id].set_class_label(new_class_label)
    elif 'disableAll' in request.headers:
        file_manager.disable_all()
    elif 'selectAll' in request.headers:
        file_manager.enable_all()
    elif 'applyClassLabel' in request.headers:
        file_manager.classify_active_files()
    elif 'deleteActive' in request.headers:
        file_manager.delete_active_files()
    elif 'deleteRow' in request.headers:
        # delete the file in request.form
        file_manager.delete_files(list(request.form.keys()))
    utility.save_file_manager(file_manager)
    return ''  # Return an empty string because you have to return something

def download_scrub():
    """:return: the zip file to be downloaded."""
    file_manager = utility.load_file_manager()
    return file_manager.zip_active_files('scrubbed.zip')

def do_multicloud():
    """:return: a json object with all the word counts
    """
    # Get the file manager, sorted labels, and tokenization options
    file_manager = utility.load_file_manager()
    if 'analyoption' not in session:
        session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
    token_type = session['analyoption']['tokenType']
    token_size = int(session['analyoption']['tokenSize'])
    # Limit docs to those selected or to active docs
    chosen_doc_ids = [
        int(x) for x in request.form.getlist('segmentlist')
    ]
    active_docs = []
    if chosen_doc_ids:
        for file_id in chosen_doc_ids:
            active_docs.append(file_id)
    else:
        for l_file in file_manager.files.values():
            if l_file.active:
                active_docs.append(l_file.id)
    # Get a sorted list of the labels for each selected doc
    labels = []
    for file_id in active_docs:
        labels.append(file_manager.files[file_id].label)
    labels = sorted(labels)
    # Get the contents of all selected/active docs
    all_contents = []
    for file_id in active_docs:
        if file_manager.files[file_id].active:
            content = file_manager.files[file_id].load_contents()
            all_contents.append(content)
    # Generate a DTM
    dtm, vocab = utility.simple_vectorizer(all_contents,
                                           token_type,
                                           token_size)
    # Convert the DTM to a pandas dataframe with terms
    # as column headers
    df = pd.DataFrame(dtm, columns=vocab)  # Automatically sorts terms
    # Create a dict for each document.
    # Format:
    # {0: [{u'term1': 1}, {u'term2': 0}], 1: [{u'term1': 1},
    # {u'term2': 0}]}
    docs = {}
    for i, row in df.iterrows():
        countslist = []
        for k, term in enumerate(sorted(vocab)):
            countslist.append({term: row[k]})
        docs[i] = countslist
    # Build the JSON object expected by d3.js
    json_obj = []
    for i, doc in enumerate(docs.items()):
        children = []
        # Convert simple json values to full json values: {u'a': 1} >
        # {'text': u'a', 'size': 1}
        for simple_values in doc[1]:
            for val in simple_values.items():
                values = {"text": val[0], "size": str(val[1])}
                # Append the new values to the children list
                children.append(values)
        # Append the new doc object to the JSON object
        json_obj.append({"name": labels[i], "children": children})
    # Replaces client-side array generator
    word_counts_array = []
    for doc in json_obj:
        name = doc["name"]
        children = doc["children"]
        word_counts = {}
        for item in children:
            word_counts[item["text"]] = item["size"]
        word_counts_array.append(
            {"name": name, "word_counts": word_counts, "words": children})
    # The front end needs a string in the response
    response = json.dumps([json_obj, word_counts_array])
    session_manager.cache_cloud_option()
    session_manager.cache_multi_cloud_options()
    return response

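# Illustrative only: a minimal sketch of the response shape produced by
# do_multicloud() above, assuming a single hypothetical document labelled
# "doc1" that contains the token "a" once. The real response is
# json.dumps([json_obj, word_counts_array]) built from the DTM above.
_EXAMPLE_MULTICLOUD_RESPONSE = [
    # json_obj: one entry per document, with d3-ready children
    [{"name": "doc1", "children": [{"text": "a", "size": "1"}]}],
    # word_counts_array: the same counts keyed by token text
    [{"name": "doc1",
      "word_counts": {"a": "1"},
      "words": [{"text": "a", "size": "1"}]}],
]
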