Example #1
0
def k_means():
    """Handles the functionality on the K Means page.

    Seeds the analyze and K-Means option groups in the session when they are
    missing, resets the stored number of clusters to half the number of
    active documents, and renders ``kmeans.html``.
    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    # Set default number of clusters to be half of the number of documents.
    default_k = int(num_active_docs / 2)
    # Get file labels.
    labels = FileManagerModel().load_file_manager().get_active_labels_with_id()
    # Fill the default options.
    if 'analyoption' not in session:
        session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
    if 'kmeanoption' not in session:
        session['kmeanoption'] = constants.DEFAULT_KMEAN_OPTIONS
    # Always update the nclusters. Work on a copy and assign it back: writing
    # through session['kmeanoption'] directly would mutate the shared
    # constants.DEFAULT_KMEAN_OPTIONS dict whenever the defaults were just
    # installed, and a nested mutation is not registered as a session change.
    kmean_options = dict(session['kmeanoption'])
    kmean_options['nclusters'] = default_k
    session['kmeanoption'] = kmean_options
    return render_template(
        'kmeans.html',
        itm='kmeans',
        labels=labels,
        numActiveDocs=num_active_docs)
def scrub():
    # Are you looking for scrubber.py?
    """Prepare session defaults and render the scrub page.

    Seeds ``scrubbingoptions`` and ``xmlhandlingoptions`` in the session when
    they are missing, calls ``utility.xml_handling_options()``, and hands the
    previews and tag flags of the active documents to ``scrub.html``.
    :return: a response object (often a render_template call) to flask and
     eventually to the browser.
    """
    # Count the active documents and load the file manager.
    active_doc_count = detect_active_docs()
    manager = utility.load_file_manager()

    # Seed any option group that the session does not hold yet.
    if 'scrubbingoptions' not in session:
        session['scrubbingoptions'] = constants.DEFAULT_SCRUB_OPTIONS
    if 'xmlhandlingoptions' not in session:
        session['xmlhandlingoptions'] = {
            "myselect": {"action": '', "attribute": ""}}
    utility.xml_handling_options()

    # Collect everything the template needs, then render once.
    active_previews = manager.get_previews_of_active()
    has_tags, has_doe, has_gutenberg = manager.check_actives_tags()
    template_context = {
        'previews': active_previews,
        'itm': "scrubber",
        'haveTags': has_tags,
        'haveDOE': has_doe,
        'haveGutenberg': has_gutenberg,
        'numActiveDocs': active_doc_count,
    }
    return render_template('scrub.html', **template_context)
Example #3
0
def bct_analysis():
    """Render the bootstrap consensus tree page on first visit.

    Falls back to ``bct_analysis_import_error.html`` when an ImportError is
    raised while importing or instantiating ``BCTModel`` (or while rendering
    the main template).
    :return: The rendered template.
    """
    # Count the active documents and fetch their labels keyed by id.
    doc_count = detect_active_docs()
    file_manager = FileManagerModel().load_file_manager()
    labels_by_id = file_manager.get_active_labels_with_id()

    # Seed the option groups that the session does not hold yet.
    if 'analyoption' not in session:
        session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
    if 'bctoption' not in session:
        session['bctoption'] = constants.DEFAULT_BCT_OPTIONS

    try:
        from lexos.models.bct_model import BCTModel
        # The instance is only created to probe the import; it is not used.
        _ = BCTModel()
        page = render_template('bct_analysis.html',
                               itm="bct-analysis",
                               labels=labels_by_id,
                               numActiveDocs=doc_count)
    except ImportError:
        page = render_template('bct_analysis_import_error.html',
                               itm="bct-analysis")
    return page
Example #4
0
def bct_analysis():
    """Show the bootstrap consensus analysis page when it is first opened.

    An ImportError raised inside the guarded section (importing or creating
    ``BCTModel``, or rendering the page) switches to the import-error
    template instead.
    :return: The rendered template.
    """
    active_count = detect_active_docs()
    # Labels of the active documents together with their ids.
    labels = FileManagerModel().load_file_manager() \
        .get_active_labels_with_id()

    # Install defaults for option groups missing from the session.
    if 'analyoption' not in session:
        session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
    if 'bctoption' not in session:
        session['bctoption'] = constants.DEFAULT_BCT_OPTIONS

    try:
        from lexos.models.bct_model import BCTModel
        # Throwaway binding: the model is built only to silence a warning.
        _ = BCTModel()
        context = {
            'itm': "bct-analysis",
            'labels': labels,
            'numActiveDocs': active_count,
        }
        return render_template('bct_analysis.html', **context)
    except ImportError:
        return render_template(
            'bct_analysis_import_error.html',
            itm="bct-analysis"
        )
Example #5
0
def scrub():
    # Are you looking for scrubber.py?
    """Render the scrub page after making sure its session options exist.

    Missing ``scrubbingoptions`` / ``xmlhandlingoptions`` entries are filled
    in, ``utility.xml_handling_options()`` is invoked, and the previews and
    tag flags of the active documents are passed to ``scrub.html``.
    :return: a response object (often a render_template call) to flask and
     eventually to the browser.
    """
    doc_total = detect_active_docs()
    fm = utility.load_file_manager()

    # Fill in whichever option groups the session is still missing.
    if 'scrubbingoptions' not in session:
        session['scrubbingoptions'] = constants.DEFAULT_SCRUB_OPTIONS
    if 'xmlhandlingoptions' not in session:
        blank_selection = {"action": '', "attribute": ""}
        session['xmlhandlingoptions'] = {"myselect": blank_selection}
    utility.xml_handling_options()

    preview_list = fm.get_previews_of_active()
    tag_flags = fm.check_actives_tags()
    # check_actives_tags() is unpacked into exactly three flags.
    found_tags, found_doe, found_gutenberg = tag_flags
    return render_template('scrub.html',
                           previews=preview_list,
                           itm="scrubber",
                           haveTags=found_tags,
                           haveDOE=found_doe,
                           haveGutenberg=found_gutenberg,
                           numActiveDocs=doc_total)
Example #6
0
def similarity():
    """Render the similarity query page.

    Seeds the ``analyoption`` and ``similarities`` session entries when they
    are missing and renders ``similarity.html`` with the active labels.
    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Count the active documents and fetch their labels keyed by id.
    active_doc_count = detect_active_docs()
    file_manager = FileManagerModel().load_file_manager()
    labels_by_id = file_manager.get_active_labels_with_id()

    # Install defaults for any option group not yet in the session.
    if 'analyoption' not in session:
        session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
    if 'similarities' not in session:
        session['similarities'] = constants.DEFAULT_SIM_OPTIONS

    return render_template('similarity.html',
                           itm="similarity-query",
                           labels=labels_by_id,
                           numActiveDocs=active_doc_count)
Example #7
0
def topword_html():
    """Handle a submitted top-words form.

    When the form contains ``get-topword`` the top-word CSV is sent as a
    download; otherwise the readable result is rendered in ``topword.html``.
    In both cases the analysis and top-word options are cached through
    ``session_manager``.
    :return: a response object (a send_file or render_template call).
    """
    active_doc_count = detect_active_docs()
    id_labels = utility.load_file_manager().get_active_labels_with_id()

    # Class division map; its first dimension is used as the class count.
    division_map = FileManagerModel().load_file_manager() \
        .get_class_division_map()
    class_count = division_map.shape[0]

    if 'get-topword' in request.form:
        # Download request: build the CSV first, then cache the options.
        csv_path = TopwordModel().get_topword_csv_path(
            class_division_map=division_map)
        session_manager.cache_analysis_option()
        session_manager.cache_top_word_options()
        return send_file(
            csv_path,
            attachment_filename=constants.TOPWORD_CSV_FILE_NAME,
            as_attachment=True)

    # Display request: cache the options, then compute the result.
    session_manager.cache_analysis_option()
    session_manager.cache_top_word_options()
    readable = TopwordModel().get_readable_result(
        class_division_map=division_map)
    return render_template(
        'topword.html',
        result=readable.results,
        labels=id_labels,
        header=readable.header,
        numclass=class_count,
        topwordsgenerated='True',
        classmap=[],
        itm='topwords',
        numActiveDocs=active_doc_count)
Example #8
0
def top_words():
    """Render the top-words page with the class division map.

    Seeds the ``topwordoption`` and ``analyoption`` session entries when they
    are missing.
    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    active_doc_count = detect_active_docs()
    id_labels = utility.load_file_manager().get_active_labels_with_id()

    # Install defaults for any option group not yet in the session.
    if 'topwordoption' not in session:
        session['topwordoption'] = constants.DEFAULT_TOPWORD_OPTIONS
    if 'analyoption' not in session:
        session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS

    # Class division map; its first dimension is used as the class count.
    division_map = FileManagerModel().load_file_manager() \
        .get_class_division_map()
    return render_template(
        'topword.html',
        labels=id_labels,
        classmap=division_map,
        numclass=division_map.shape[0],
        topwordsgenerated='class_div',
        itm='topwords',
        numActiveDocs=active_doc_count)
Example #9
0
def upload_dictionaries():
    """Uploads dictionaries to the content analysis object.

    Every file posted under ``lemfileselect[]`` is decoded as UTF-8, has its
    line-feed characters removed, and is written into the session's
    ``content_analysis`` folder. The labels (file names without extension) of
    all files in that folder are then stored in the session and returned.

    :return: a json object.
    """
    path = os.path.join(constants.TMP_FOLDER,
                        constants.UPLOAD_FOLDER,
                        session['id'], 'content_analysis/')
    os.makedirs(path, exist_ok=True)
    data = {'dictionary_labels': [],
            'active_dictionaries': [],
            'formula': "",
            'toggle_all_value': True,
            'error': False}
    if detect_active_docs() == 0:
        data['error'] = True
    for upload_file in request.files.getlist('lemfileselect[]'):
        # The file name is supplied by the client. Keep only its final path
        # component so a crafted name cannot be written outside `path`.
        file_name = os.path.basename(upload_file.filename or '')
        if file_name in ('', '.', '..'):
            # Nothing usable to name the dictionary after; skip this part.
            continue
        content = upload_file.read().decode("utf-8").replace('\n', '')
        # Write back with the same encoding the content was decoded with, and
        # let the context manager close the handle even if the write fails.
        with open(os.path.join(path, file_name), 'w',
                  encoding='utf-8') as dictionary_file:
            dictionary_file.write(content)
    dictionary_names = os.listdir(path)
    data['dictionary_labels'] = [os.path.splitext(dict_name)[0]
                                 for dict_name in dictionary_names]
    data['active_dictionaries'] = [True] * len(dictionary_names)
    session['dictionary_labels'] = data['dictionary_labels']
    session['active_dictionaries'] = data['active_dictionaries']
    session['toggle_all_value'] = data['toggle_all_value']
    return json.dumps(data)
Example #10
0
def multi_cloud():
    """Handles the functionality on the multicloud pages.

    GET seeds the cloud option groups in the session and renders an empty
    page. POST (legacy form submission) generates the multicloud JSON,
    caches the submitted options and renders the page with that JSON.
    Any other request method falls through and returns None.

    :return: a response object (often a render_template call) to Flask and
    eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    labels = file_manager.get_active_labels_with_id()
    # Order the (id, label) items by their second element.
    labels = OrderedDict(
        natsorted(labels.items(), key=lambda x: x[1]))
    if request.method == 'GET':
        # 'GET' request occurs when the page is first loaded.
        if 'cloudoption' not in session:
            session['cloudoption'] = constants.DEFAULT_CLOUD_OPTIONS
        if 'multicloudoptions' not in session:
            session['multicloudoptions'] = \
                constants.DEFAULT_MULTICLOUD_OPTIONS
        return render_template(
            'multicloud.html',
            itm="multicloud",
            jsonStr="",
            labels=labels,
            numActiveDocs=num_active_docs)
    if request.method == "POST":
        # This is legacy code.
        # The form is now submitted by Ajax do_multicloud()
        # 'POST' request occur when html form is submitted
        # (i.e. 'Get Graphs', 'Download...')
        # The file manager loaded above is reused, and the per-document
        # word-count array that used to be built here was never read, so
        # both the reload and that loop are gone.
        json_obj = utility.generate_mc_json_obj(file_manager)
        # Temporary fix because the front end needs a string
        json_obj = json.dumps(json_obj)
        session_manager.cache_cloud_option()
        session_manager.cache_multi_cloud_options()
        return render_template(
            'multicloud.html',
            itm="multicloud",
            JSONObj=json_obj,
            labels=labels,
            numActiveDocs=num_active_docs)
Example #11
0
def dendrogram():
    """Render the dendrogram page.

    Seeds the ``analyoption`` and ``hierarchyoption`` session entries when
    they are missing and renders ``dendrogram.html`` with the active labels.
    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    active_doc_count = detect_active_docs()

    # Install defaults for any option group not yet in the session.
    if 'analyoption' not in session:
        session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
    if 'hierarchyoption' not in session:
        session['hierarchyoption'] = constants.DEFAULT_HIERARCHICAL_OPTIONS

    file_manager = FileManagerModel().load_file_manager()
    id_labels = file_manager.get_active_labels_with_id()
    return render_template(
        'dendrogram.html',
        labels=id_labels,
        numActiveDocs=active_doc_count,
        itm="hierarchical")
Example #12
0
def multi_cloud():
    """Handles the functionality on the multicloud pages.

    A GET request fills in missing cloud option groups and renders the page
    without data. A POST request (legacy form path) builds the multicloud
    JSON string, caches the submitted options and renders the page with it.
    Requests using any other method fall through and return None.

    :return: a response object (often a render_template call) to Flask and
    eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    labels = file_manager.get_active_labels_with_id()
    # Order the (id, label) items by their second element.
    labels = OrderedDict(natsorted(labels.items(), key=lambda x: x[1]))
    if request.method == 'GET':
        # 'GET' request occurs when the page is first loaded.
        if 'cloudoption' not in session:
            session['cloudoption'] = constants.DEFAULT_CLOUD_OPTIONS
        if 'multicloudoptions' not in session:
            session['multicloudoptions'] = \
                constants.DEFAULT_MULTICLOUD_OPTIONS
        return render_template('multicloud.html',
                               itm="multicloud",
                               jsonStr="",
                               labels=labels,
                               numActiveDocs=num_active_docs)
    if request.method == "POST":
        # This is legacy code.
        # The form is now submitted by Ajax do_multicloud()
        # 'POST' request occur when html form is submitted
        # (i.e. 'Get Graphs', 'Download...')
        # Reuses the file manager loaded above. The word_counts_array that
        # was assembled here was never used afterwards and has been dropped.
        # Temporary fix because the front end needs a string
        json_obj = json.dumps(utility.generate_mc_json_obj(file_manager))
        session_manager.cache_cloud_option()
        session_manager.cache_multi_cloud_options()
        return render_template('multicloud.html',
                               itm="multicloud",
                               JSONObj=json_obj,
                               labels=labels,
                               numActiveDocs=num_active_docs)
Example #13
0
def upload():
    """Handles the functionality of the upload page.

    GET fixes the session, seeds ``generalsettings`` when missing and renders
    ``upload.html``. A non-GET request carrying an ``X-FILENAME`` header is
    treated as a file upload: ``.lexos`` files are handled as workspaces,
    anything else is added from the request body. Any other request is
    answered with HTTP 400.
    :return: a response object (often a render_template call) to flask and
     eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    if request.method == "GET":
        print("About to fix session in case of browser caching")
        # fix the session in case the browser is caching the old session
        session_manager.fix()
        print("Session fixed. Rendering template.")
        if 'generalsettings' not in session:
            session['generalsettings'] = \
                constants.DEFAULT_GENERALSETTINGS_OPTIONS
        return render_template(
            'upload.html',
            MAX_FILE_SIZE=constants.MAX_FILE_SIZE,
            MAX_FILE_SIZE_INT=constants.MAX_FILE_SIZE_INT,
            MAX_FILE_SIZE_UNITS=constants.MAX_FILE_SIZE_UNITS,
            itm="upload-tool",
            numActiveDocs=num_active_docs)

    # X-FILENAME is the flag to signify a file upload
    if 'X-FILENAME' in request.headers:

        # File upload through javascript
        file_manager = utility.load_file_manager()
        # --- check file name ---
        # Grab the filename, which will be UTF-8 percent-encoded (e.g. '%E7'
        # instead of python's '\xe7')
        file_name = request.headers['X-FILENAME']
        # Unquote using urllib's percent-encoding decoder (turns '%E7' into
        # '\xe7')
        file_name = unquote(file_name)
        # --- end check file name ---
        if file_name.endswith('.lexos'):
            file_manager.handle_upload_workspace()
            # update filemanager
            file_manager = utility.load_file_manager()
            file_manager.update_workspace()
        else:
            file_manager.add_upload_file(request.data, file_name)
        utility.save_file_manager(file_manager)
        return 'success'

    # Without this, a request that is neither a GET nor a tagged upload fell
    # off the end and returned None, which Flask rejects with a TypeError
    # (an HTTP 500). Report the client error explicitly instead.
    return 'Missing X-FILENAME header', 400
Example #14
0
def upload():
    """Handles the functionality of the upload page.

    It uploads files to be used in the current session: a GET renders the
    upload page, and a request with an ``X-FILENAME`` header stores the
    request body (or processes a ``.lexos`` workspace). A non-GET request
    without that header gets an HTTP 400 answer.
    :return: a response object (often a render_template call) to flask and
     eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    if request.method == "GET":
        print("About to fix session in case of browser caching")
        # fix the session in case the browser is caching the old session
        session_manager.fix()
        print("Session fixed. Rendering template.")
        if 'generalsettings' not in session:
            session['generalsettings'] = \
                constants.DEFAULT_GENERALSETTINGS_OPTIONS
        return render_template(
            'upload.html',
            MAX_FILE_SIZE=constants.MAX_FILE_SIZE,
            MAX_FILE_SIZE_INT=constants.MAX_FILE_SIZE_INT,
            MAX_FILE_SIZE_UNITS=constants.MAX_FILE_SIZE_UNITS,
            itm="upload-tool",
            numActiveDocs=num_active_docs)

    # X-FILENAME is the flag to signify a file upload. Guard first: a view
    # that returns None makes Flask raise a TypeError (an HTTP 500), so a
    # request lacking the header is answered with an explicit client error.
    if 'X-FILENAME' not in request.headers:
        return 'Missing X-FILENAME header', 400

    # File upload through javascript
    file_manager = utility.load_file_manager()
    # --- check file name ---
    # The header value is UTF-8 percent-encoded (e.g. '%E7' instead of
    # python's '\xe7'); urllib's unquote turns '%E7' into '\xe7'.
    file_name = unquote(request.headers['X-FILENAME'])
    # --- end check file name ---
    if file_name.endswith('.lexos'):
        file_manager.handle_upload_workspace()
        # update filemanager
        file_manager = utility.load_file_manager()
        file_manager.update_workspace()
    else:
        file_manager.add_upload_file(request.data, file_name)
    utility.save_file_manager(file_manager)
    return 'success'
Example #15
0
def cut():
    """Render the cut page for a GET request.

    Gathers the letter, word and line counts of every active file (plus the
    maximum of each, 0 when there are no active files) and passes them with
    the previews to ``cut.html``. For any other request method the code
    shown here falls through and returns None.
    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Count the active documents and fetch the active files.
    active_doc_count = detect_active_docs()
    manager = utility.load_file_manager()
    active_files = manager.get_active_files()

    # Per-file statistics; every list is empty when nothing is active.
    letters_per_file = [lfile.num_letters() for lfile in active_files]
    words_per_file = [lfile.num_words() for lfile in active_files]
    lines_per_file = [lfile.num_lines() for lfile in active_files]
    most_letters = max(letters_per_file, default=0)
    most_words = max(words_per_file, default=0)
    most_lines = max(lines_per_file, default=0)
    file_ids = [lfile.id for lfile in active_files]

    if request.method == "GET":
        # "GET" request occurs when the page is first loaded.
        if 'cuttingoptions' not in session:
            session['cuttingoptions'] = constants.DEFAULT_CUT_OPTIONS
        active_previews = manager.get_previews_of_active()
        return render_template('cut.html',
                               previews=active_previews,
                               num_active_files=len(active_previews),
                               numChar=letters_per_file,
                               numWord=words_per_file,
                               numLine=lines_per_file,
                               maxChar=most_letters,
                               maxWord=most_words,
                               maxLine=most_lines,
                               activeFileIDs=file_ids,
                               itm="cut",
                               numActiveDocs=active_doc_count)
Example #16
0
def cut():
    """Render the cut page when it is requested with GET.

    Collects, for the active files, the per-file letter/word/line counts,
    their maxima (0 when no file is active) and the file ids, and renders
    ``cut.html`` with them. Other request methods are not handled in the
    code shown here, so the function returns None for them.
    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    doc_total = detect_active_docs()
    fm = utility.load_file_manager()
    active = fm.get_active_files()

    if not active:
        # Nothing is active: empty statistics and zero maxima.
        char_counts, word_counts, line_counts, ids = [], [], [], []
        char_max = word_max = line_max = 0
    else:
        char_counts = [entry.num_letters() for entry in active]
        word_counts = [entry.num_words() for entry in active]
        line_counts = [entry.num_lines() for entry in active]
        char_max = max(char_counts)
        word_max = max(word_counts)
        line_max = max(line_counts)
        ids = [entry.id for entry in active]

    if request.method == "GET":
        # "GET" request occurs when the page is first loaded.
        if 'cuttingoptions' not in session:
            session['cuttingoptions'] = constants.DEFAULT_CUT_OPTIONS
        preview_list = fm.get_previews_of_active()
        page_context = {
            'previews': preview_list,
            'num_active_files': len(preview_list),
            'numChar': char_counts,
            'numWord': word_counts,
            'numLine': line_counts,
            'maxChar': char_max,
            'maxWord': word_max,
            'maxLine': line_max,
            'activeFileIDs': ids,
            'itm': "cut",
            'numActiveDocs': doc_total,
        }
        return render_template('cut.html', **page_context)
Example #17
0
def statistics():
    """Render the Statistics page.

    Seeds the ``analyoption`` session entry when it is missing and renders
    ``statistics.html`` with the labels of the active documents.
    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Count the active documents and fetch their labels keyed by id.
    active_doc_count = detect_active_docs()
    file_manager = FileManagerModel().load_file_manager()
    labels_by_id = file_manager.get_active_labels_with_id()

    # Install the default analyze options on first load.
    if 'analyoption' not in session:
        session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS

    return render_template(
        'statistics.html',
        itm="statistics",
        labels=labels_by_id,
        numActiveDocs=active_doc_count)
Example #18
0
def statistics():
    """Show the Statistics page with the active document labels.

    The ``analyoption`` session entry is filled with its default when absent.
    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    doc_total = detect_active_docs()
    # Labels of the active documents together with their ids.
    labels = FileManagerModel().load_file_manager() \
        .get_active_labels_with_id()

    # First page load: make sure the analyze options exist.
    if 'analyoption' not in session:
        session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS

    page_context = {
        'itm': "statistics",
        'labels': labels,
        'numActiveDocs': doc_total,
    }
    return render_template('statistics.html', **page_context)
def rolling_window():
    """Render the rolling window page.

    Seeds the ``rwoption`` session entry when it is missing and renders
    ``rwanalysis.html`` with the active labels in sorted order.
    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    active_doc_count = detect_active_docs()

    # Fetch the active labels with their ids and order the items by their
    # second element.
    id_label_items = list(
        FileManagerModel().load_file_manager()
        .get_active_labels_with_id().items())
    sorted_labels = OrderedDict(
        natsorted(id_label_items, key=lambda pair: pair[1]))

    # Install the default rolling-window options when none are stored yet.
    if 'rwoption' not in session:
        session['rwoption'] = constants.DEFAULT_ROLLINGWINDOW_OPTIONS

    return render_template(
        'rwanalysis.html',
        itm="rolling-windows",
        labels=sorted_labels,
        numActiveDocs=active_doc_count)
Example #20
0
def scrape():
    """Scrapes the given URLs and generates a text file from each one.

    GET renders ``scrape.html``. POST reads the ``urls`` string from the JSON
    body, splits it on commas and whitespace, fetches every URL and adds the
    response text to the file manager as ``url<i>.txt``. Any other request
    method falls through and returns None.

    :return: the rendered page for GET; for POST a JSON-encoded string that
    indicates success.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    if request.method == "GET":
        return render_template('scrape.html', numActiveDocs=num_active_docs)
    if request.method == "POST":
        import requests
        # Commas and any run of white space both separate URLs. str.split()
        # never yields empty strings, so empty input or a leading/trailing
        # comma no longer produces an empty URL for requests.get() to
        # choke on.
        urls = request.json["urls"].replace(",", " ").split()
        file_manager = utility.load_file_manager()
        for i, url in enumerate(urls):
            # NOTE(review): the URL is user-supplied and fetched without a
            # timeout; consider validating it and bounding the wait.
            r = requests.get(url)
            file_manager.add_upload_file(r.text, "url" + str(i) + ".txt")
        utility.save_file_manager(file_manager)
        response = "success"
        return json.dumps(response)
Example #21
0
def scrape():
    """Scrapes the given URLs and generates a text file from each one.

    GET renders ``scrape.html``. POST takes the ``urls`` string from the JSON
    body, normalizes commas and white space to line breaks, fetches each
    non-empty entry and stores the response text in the file manager as
    ``url<i>.txt``. Any other request method falls through and returns None.

    :return: the rendered page for GET; for POST a JSON-encoded string that
    indicates success.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    if request.method == "GET":
        return render_template('scrape.html', numActiveDocs=num_active_docs)
    if request.method == "POST":
        import requests
        urls = request.json["urls"]
        urls = urls.strip()
        urls = urls.replace(",", "\n")  # Replace commas with line breaks
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        urls = re.sub(r"\s+", "\n", urls)  # Get rid of extra white space
        # Drop empty entries (empty input, leading or trailing comma) so
        # requests.get() is never handed an empty URL.
        urls = [url for url in urls.split("\n") if url]
        file_manager = utility.load_file_manager()
        for i, url in enumerate(urls):
            # NOTE(review): the URL is user-supplied and fetched without a
            # timeout; consider validating it and bounding the wait.
            r = requests.get(url)
            file_manager.add_upload_file(r.text, "url" + str(i) + ".txt")
        utility.save_file_manager(file_manager)
        response = "success"
        return json.dumps(response)
def similarity():
    """Show the similarity query page with the active document labels.

    The ``analyoption`` and ``similarities`` session entries are filled with
    their defaults when absent.
    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    doc_total = detect_active_docs()
    # Labels of the active documents together with their ids.
    labels = FileManagerModel().load_file_manager() \
        .get_active_labels_with_id()

    # First page load: make sure both option groups exist.
    if 'analyoption' not in session:
        session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
    if 'similarities' not in session:
        session['similarities'] = constants.DEFAULT_SIM_OPTIONS

    page_context = {
        'itm': "similarity-query",
        'labels': labels,
        'numActiveDocs': doc_total,
    }
    return render_template('similarity.html', **page_context)
Example #23
0
def viz():
    """Handles the functionality on the alternate bubbleViz page.

    GET seeds the cloud and bubble-viz option groups in the session and
    renders an empty page. POST vectorizes the selected (or all active)
    documents, sums the term counts, keeps the highest-count terms and
    renders the page with them as a JSON string for d3.js. Any other request
    method falls through and returns None.

    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    labels = file_manager.get_active_labels_with_id()
    from collections import OrderedDict
    from natsort import natsorted
    labels = OrderedDict(natsorted(labels.items(), key=lambda x: x[1]))
    if request.method == "GET":
        # "GET" request occurs when the page is first loaded.
        if 'cloudoption' not in session:
            session['cloudoption'] = constants.DEFAULT_CLOUD_OPTIONS
        if 'bubblevisoption' not in session:
            session['bubblevisoption'] = constants.DEFAULT_BUBBLEVIZ_OPTIONS
        return render_template(
            'viz.html',
            JSONObj="",
            labels=labels,
            itm="bubbleviz",
            numActiveDocs=num_active_docs)
    if request.method == "POST":
        # "POST" request occur when html form is submitted
        # (i.e. 'Get Dendrogram', 'Download...')
        # The file manager loaded above is reused; get the tokenization
        # options.
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
        token_type = session['analyoption']['tokenType']
        token_size = int(session['analyoption']['tokenSize'])
        # Limit docs to those selected or to active docs
        chosen_doc_ids = [int(x) for x in request.form.getlist('segmentlist')]
        if chosen_doc_ids:
            candidate_ids = chosen_doc_ids
        else:
            candidate_ids = [l_file.id
                             for l_file in file_manager.files.values()
                             if l_file.active]
        # Get the contents of all selected docs that are active
        all_contents = [file_manager.files[file_id].load_contents()
                        for file_id in candidate_ids
                        if file_manager.files[file_id].active]
        # Generate a DTM
        dtm, vocab = utility.simple_vectorizer(
            all_contents, token_type, token_size)
        # Convert the DTM to a pandas dataframe with the terms as column
        # headers
        import pandas as pd
        df = pd.DataFrame(dtm, columns=vocab)
        # Get the Minimum Token Length and Maximum Term Settings. A missing
        # or empty field falls back to its default; previously a missing
        # 'maxwords' left max_num_words unbound and an empty 'minlength'
        # made int() fail.
        minimum_length = int(request.form.get('minlength') or 0)
        max_num_words = int(request.form.get('maxwords') or 100)
        # Filter words that don't meet the minimum length from the dataframe
        short_terms = [term for term in vocab if len(term) < minimum_length]
        df = df.drop(short_terms, axis=1)
        # Extract a dictionary of term count sums
        sums_dict = df.sum(axis=0).to_dict()
        # Create a new dataframe of sums and sort it by counts, then terms
        # Warning!!! This is not natsort. Multiple terms at the edge of
        # the maximum number of words limit may be cut off in arbitrary
        # order. We need to implement natsort for dataframes.
        f = pd.DataFrame(list(sums_dict.items()), columns=['term', 'count'])
        f = f.sort_values(by=['count', 'term'], axis=0,
                          ascending=[False, True])
        # Convert the dataframe head to a dict for use below; both inner
        # dicts are keyed by the same row index in the same order.
        top_terms = f.head(n=max_num_words).to_dict()
        # Build the JSON object for d3.js
        children = [
            {"name": term, "size": str(count)}
            for term, count in zip(top_terms['term'].values(),
                                   top_terms['count'].values())]
        json_obj = {"name": "tokens", "children": children}
        # Turn the JSON object into a JSON string for the front end
        json_str = json.dumps(json_obj)
        session_manager.cache_cloud_option()
        session_manager.cache_bubble_viz_option()
        return render_template(
            'viz.html',
            JSONObj=json_str,
            labels=labels,
            itm="bubbleviz",
            numActiveDocs=num_active_docs)
Example #24
0
def content_analysis():
    """Handles the functionality on the contentanalysis page.

    On a GET request the page is rendered from the values stored in the
    session (dictionary labels, active-dictionary flags, toggle-all value
    and formula), using empty/True defaults for any key that is missing.

    On any other request the front end options are read, every active file
    and every active uploaded dictionary is added to a ContentAnalysisModel,
    and the result of ``analyze()`` is returned as a JSON string. An error
    response is returned when there is no active document, no active
    dictionary, or when the analysis reports formula errors.

    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    analysis = ContentAnalysisModel()
    path = os.path.join(constants.TMP_FOLDER,
                        constants.UPLOAD_FOLDER,
                        session['id'], 'content_analysis/')
    # Names of the dictionary files found in the session's upload folder.
    dictionary_names = os.listdir(path) if os.path.isdir(path) else []

    if request.method == 'GET':
        dict_labels = session.get('dictionary_labels', [])
        active_dicts = session.get('active_dictionaries',
                                   [True] * len(dict_labels))
        return render_template(
            'contentanalysis.html',
            dictionary_labels=dict_labels,
            active_dictionaries=active_dicts,
            toggle_all_value=session.get('toggle_all_value', True),
            itm="content-analysis",
            formula=session.get('formula', ""))

    num_active_docs = detect_active_docs()
    # Read the front end options once instead of re-parsing them for every
    # attribute that is needed.
    front_end_options = ContentAnalysisReceiver().options_from_front_end()
    active_dicts = front_end_options.active_dicts
    dict_labels = front_end_options.dict_labels
    session['formula'] = front_end_options.formula

    if len(dict_labels) == 0:
        # No labels were sent: derive them from the file names on disk and
        # treat every dictionary as active.
        dict_labels = [os.path.splitext(dict_name)[0]
                       for dict_name in dictionary_names]
        active_dicts = [True] * len(dict_labels)

    num_active_dicts = active_dicts.count(True)
    if num_active_docs == 0 and num_active_dicts == 0:
        return error("At least 1 active document and 1 active "
                     "dictionary are required to perform a "
                     "content analysis.")
    elif num_active_docs == 0:
        return error("At least 1 active document is required to perform "
                     "a content analysis.")
    elif num_active_dicts == 0:
        return error("At least 1 active dictionary is required to perform"
                     " a content analysis.")

    file_manager = load_file_manager()
    for file in file_manager.get_active_files():
        analysis.add_file(file_name=file.name,
                          label=file.label,
                          content=file.load_contents())

    for dict_name, dict_label, active in zip(dictionary_names,
                                             dict_labels,
                                             active_dicts):
        if not active:
            continue
        # Use a context manager so the dictionary file is always closed.
        with open(os.path.join(path, dict_name), "r") as dictionary_file:
            content = dictionary_file.read()
        analysis.add_dictionary(file_name=dict_name,
                                label=dict_label,
                                content=content)

    result_table, corpus_raw_counts_table, files_raw_counts_tables, \
        formula_errors = analysis.analyze()
    if len(formula_errors) != 0 or result_table is None:
        return error(formula_errors)

    data = {"result_table": result_table,
            "dictionary_labels": dict_labels,
            "active_dictionaries": active_dicts,
            "corpus_raw_counts_table": corpus_raw_counts_table,
            "files_raw_counts_tables": files_raw_counts_tables,
            "error": False}
    return json.dumps(data)
Example #25
0
def manage():
    """Handles the functionality of the select page.

    Its primary role is to activate/deactivate specific files depending on the
    user's input. A GET request renders the page; other requests are
    dispatched on the name of a custom request header (e.g. ``toggleFile``,
    ``setLabel``, ``deleteRow``), with the request body carrying the file
    id(s) where one is needed.

    :return: a response object (often a render_template call) to flask
    and eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    # Usual loading of the FileManager
    file_manager = utility.load_file_manager()

    if request.method == "GET":
        rows = file_manager.get_previews_of_all()
        for row in rows:
            # Replace the truthy/falsy state with the string the template
            # receives.
            row["state"] = "selected" if row["state"] else ""
        return render_template(
            'manage.html',
            rows=rows,
            itm="manage",
            numActiveDocs=num_active_docs)

    headers = request.headers

    if 'previewTest' in headers:
        # Preview requests only read data, so return before anything is
        # saved.
        file_id = int(request.data)
        l_file = file_manager.files[file_id]
        preview_vals = {
            "id": file_id,
            "label": l_file.label,
            "previewText": l_file.get_preview()}
        return json.dumps(preview_vals)

    if 'toggleFile' in headers:
        # Toggle the file from active to inactive or vice versa
        file_manager.toggle_file(int(request.data))
    elif 'toggliFy' in headers:
        # The body is a comma separated list of file ids. Flask exposes the
        # raw body as bytes on Python 3, and bytes cannot be split with a
        # str separator, so decode it first.
        raw_ids = request.data
        if isinstance(raw_ids, bytes):
            raw_ids = raw_ids.decode("utf-8")
        file_ids = raw_ids.split(",")
        # Make exactly the listed files the active ones.
        file_manager.disable_all()
        file_manager.enable_files(file_ids)
    elif 'setLabel' in headers:
        new_name = headers['setLabel']
        file_id = int(request.data)
        file_manager.files[file_id].set_name(new_name)
        file_manager.files[file_id].label = new_name
    elif 'setClass' in headers:
        new_class_label = headers['setClass']
        file_id = int(request.data)
        file_manager.files[file_id].set_class_label(new_class_label)
    elif 'disableAll' in headers:
        file_manager.disable_all()
    elif 'selectAll' in headers:
        file_manager.enable_all()
    elif 'applyClassLabel' in headers:
        file_manager.classify_active_files()
    elif 'deleteActive' in headers:
        file_manager.delete_active_files()
    elif 'deleteRow' in headers:
        # delete the file in request.form
        file_manager.delete_files(list(request.form.keys()))

    utility.save_file_manager(file_manager)
    return ''  # Return an empty string because you have to return something
Example #26
0
def word_cloud():
    """Handles the functionality on the visualisation page.

    A prototype for displaying single word cloud graphs. GET renders the
    empty page; POST builds term counts for the chosen (or all active)
    documents and renders them as a d3.js JSON string plus a count table.

    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    from collections import OrderedDict
    from operator import itemgetter

    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    id_label_pairs = list(file_manager.get_active_labels_with_id().items())
    # Order the labels naturally by label text.
    labels = OrderedDict(natsorted(id_label_pairs, key=lambda x: x[1]))

    if request.method == "GET":
        # First page load: make sure the cloud options exist in the session.
        # There is no wordcloud option so we don't initialize that.
        if 'cloudoption' not in session:
            session['cloudoption'] = constants.DEFAULT_CLOUD_OPTIONS
        return render_template(
            'wordcloud.html',
            itm="word-cloud",
            labels=labels,
            numActiveDocs=num_active_docs)

    if request.method != "POST":
        return None

    # Form submission: reload the file manager and read the tokenization
    # options from the session.
    file_manager = utility.load_file_manager()
    if 'analyoption' not in session:
        session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
    token_type = session['analyoption']['tokenType']
    token_size = int(session['analyoption']['tokenSize'])

    # Use the documents picked in the form, or every active document when
    # nothing was picked.
    chosen_doc_ids = [int(x) for x in request.form.getlist('segmentlist')]
    if chosen_doc_ids:
        doc_ids = list(chosen_doc_ids)
    else:
        doc_ids = [l_file.id
                   for l_file in file_manager.files.values()
                   if l_file.active]

    # Only active documents contribute their contents.
    all_contents = [file_manager.files[doc_id].load_contents()
                    for doc_id in doc_ids
                    if file_manager.files[doc_id].active]

    # Generate a DTM and sum each column.
    dtm, vocab = utility.simple_vectorizer(
        all_contents, token_type, token_size)
    import pandas as pd
    column_sums = pd.DataFrame(dtm).sum(axis=0)

    # Build the JSON object for d3.js
    children = [{"name": term, "size": str(column_sums[index])}
                for index, term in enumerate(vocab)]
    json_obj = {"name": "tokens", "children": children}

    # Rows for the word count table, largest size first.
    ranked = natsorted(children, key=itemgetter('size'), reverse=True)
    column_values = [[entry["name"], entry["size"]] for entry in ranked]

    # Turn the JSON object into a JSON string for the front end
    json_str = json.dumps(json_obj)
    session_manager.cache_cloud_option()
    return render_template(
        'wordcloud.html',
        labels=labels,
        JSONObj=json_str,
        columnValues=column_values,
        itm="word-cloud",
        numActiveDocs=num_active_docs)
Example #27
0
def k_means():
    """Handles the functionality on the kmeans page.

    It analyzes the various texts and displays the class label of the files.
    GET renders the page with empty result placeholders. POST caches the
    submitted options, saves the file manager and renders either the PCA or
    the Voronoi result, depending on the ``viz`` form field.

    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    labels = file_manager.get_active_labels_with_id()
    # Default number of clusters: half the number of active labels.
    default_k = len(labels) // 2

    # Template arguments that are identical for every rendering of the page.
    common_args = dict(labels=labels,
                       fileNumber=len(labels),
                       defaultK=default_k,
                       itm="kmeans",
                       numActiveDocs=num_active_docs)

    if request.method == 'GET':
        # 'GET' request occurs when the page is first loaded
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
        if 'kmeanoption' not in session:
            session['kmeanoption'] = constants.DEFAULT_KMEAN_OPTIONS
        return render_template('kmeans.html',
                               silhouettescore='',
                               kmeansIndex=[],
                               fileNameStr='',
                               KValue=0,
                               colorChartStr='',
                               kmeansdatagenerated=False,
                               **common_args)

    if request.method == "POST":
        # 'POST' request occur when html form is submitted
        # (i.e. 'Get Graphs', 'Download...')
        session_manager.cache_analysis_option()
        session_manager.cache_k_mean_option()
        utility.save_file_manager(file_manager)

        if request.form['viz'] == 'PCA':
            kmeans_index, silhouette_score, file_name_str, k_value, \
                color_chart_str = utility.generate_k_means_pca(file_manager)
            return render_template('kmeans.html',
                                   silhouettescore=silhouette_score,
                                   kmeansIndex=kmeans_index,
                                   fileNameStr=file_name_str,
                                   KValue=k_value,
                                   colorChartStr=color_chart_str,
                                   kmeansdatagenerated=True,
                                   **common_args)
        elif request.form['viz'] == 'Voronoi':
            kmeans_index, silhouette_score, file_name_str, k_value, \
                color_chart_str, final_points_list, final_centroids_list, \
                text_data, max_x = \
                utility.generate_k_means_voronoi(file_manager)
            return render_template('kmeans.html',
                                   silhouettescore=silhouette_score,
                                   kmeansIndex=kmeans_index,
                                   fileNameStr=file_name_str,
                                   KValue=k_value,
                                   colorChartStr=color_chart_str,
                                   finalPointsList=final_points_list,
                                   finalCentroidsList=final_centroids_list,
                                   textData=text_data,
                                   maxX=max_x,
                                   kmeansdatagenerated=True,
                                   **common_args)
        # NOTE(review): any other 'viz' value falls through and the view
        # returns None, as it always has — confirm whether that can happen.
Example #28
0
def word_cloud():
    """Handles the functionality on the visualisation page.

    a prototype for displaying single word cloud graphs. A GET request
    renders the empty page. A POST request counts the terms of the selected
    documents (or of all active documents when none are selected) and passes
    the counts to the template both as a JSON string for d3.js and as
    ``[term, count]`` rows for the word count table.

    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    labels = file_manager.get_active_labels_with_id()
    from collections import OrderedDict
    labels = OrderedDict(natsorted(list(labels.items()), key=lambda x: x[1]))
    if request.method == "GET":
        # "GET" request occurs when the page is first loaded.
        if 'cloudoption' not in session:
            session['cloudoption'] = constants.DEFAULT_CLOUD_OPTIONS
        # there is no wordcloud option so we don't initialize that
        return render_template('wordcloud.html',
                               labels=labels,
                               itm="word-cloud",
                               numActiveDocs=num_active_docs)
    if request.method == "POST":
        # "POST" request occur when html form is submitted
        # Get the file manager and tokenization options
        file_manager = utility.load_file_manager()
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
        token_type = session['analyoption']['tokenType']
        token_size = int(session['analyoption']['tokenSize'])
        # Limit docs to those selected or to active docs
        chosen_doc_ids = [int(x) for x in request.form.getlist('segmentlist')]
        if chosen_doc_ids:
            active_docs = list(chosen_doc_ids)
        else:
            active_docs = [l_file.id
                           for l_file in file_manager.files.values()
                           if l_file.active]
        # Get the contents of all selected/active docs; selected documents
        # that are not active are skipped.
        all_contents = [file_manager.files[file_id].load_contents()
                        for file_id in active_docs
                        if file_manager.files[file_id].active]
        # Generate a DTM
        dtm, vocab = utility.simple_vectorizer(all_contents, token_type,
                                               token_size)
        # Convert the DTM to a pandas dataframe and save the sums
        import pandas as pd
        df = pd.DataFrame(dtm)
        df = df.sum(axis=0)
        # Build the JSON object for d3.js
        json_obj = {"name": "tokens", "children": []}
        for k, v in enumerate(vocab):
            json_obj["children"].append({"name": v, "size": str(df[k])})
        # Create a list of column values for the word count table
        from operator import itemgetter
        terms = natsorted(json_obj["children"],
                          key=itemgetter('size'),
                          reverse=True)
        # Keep the term as text: encoding it to UTF-8 yields a bytes object
        # on Python 3, which would be shown as b'...' in the table.
        column_values = [[term["name"], term["size"]] for term in terms]
        # Turn the JSON object into a JSON string for the front end
        json_obj = json.dumps(json_obj)
        session_manager.cache_cloud_option()
        return render_template('wordcloud.html',
                               labels=labels,
                               JSONObj=json_obj,
                               columnValues=column_values,
                               itm="word-cloud",
                               numActiveDocs=num_active_docs)
Example #29
0
def rolling_window():
    """Handles the functionality on the rollingwindow page.

    It analyzes the various texts using a rolling window of analysis. GET
    renders the empty page. POST generates the rolling window data and then
    either sends a plot/matrix file as a download or renders the graph page.

    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    id_label_pairs = list(file_manager.get_active_labels_with_id().items())
    # Order the labels naturally by label text.
    labels = OrderedDict(natsorted(id_label_pairs, key=lambda x: x[1]))

    if request.method == "GET":
        # First page load: make sure the rolling window options exist.
        if 'rwoption' not in session:
            session['rwoption'] = constants.DEFAULT_ROLLINGWINDOW_OPTIONS
        return render_template(
            'rwanalysis.html',
            labels=labels,
            # default legendlabels
            legendLabels=[""],
            rwadatagenerated=False,
            itm="rolling-windows",
            numActiveDocs=num_active_docs)

    if request.method != "POST":
        return None

    # "POST" request occurs when user hits submit (Get Graph) button
    (data_points, data_list, graph_title,
     x_axis_label, y_axis_label, legend_labels) = \
        utility.generate_rwa(file_manager)

    if 'get-RW-plot' in request.form:
        # The 'Graph Data' button is clicked on rollingwindow.html.
        save_path, file_extension = utility.generate_rw_matrix_plot(
            data_points, legend_labels)
        return send_file(
            save_path,
            attachment_filename="rollingwindow_matrix" + file_extension,
            as_attachment=True)

    if 'get-RW-data' in request.form:
        # The 'CSV Matrix' button is clicked on rollingwindow.html.
        save_path, file_extension = utility.generate_rw_matrix(data_list)
        return send_file(
            save_path,
            attachment_filename="rollingwindow_matrix" + file_extension,
            as_attachment=True)

    session_manager.cache_rw_analysis_option()
    # The graph is flagged as generated only when a window size was given.
    window_size_given = session['rwoption']['rollingwindowsize'] != ''
    return render_template(
        'rwanalysis.html',
        labels=labels,
        data=data_points,
        graphTitle=graph_title,
        xAxisLabel=x_axis_label,
        yAxisLabel=y_axis_label,
        legendLabels=legend_labels,
        rwadatagenerated=window_size_given,
        itm="rolling-windows",
        numActiveDocs=num_active_docs)
Example #30
0
def manage():
    """Handles the functionality of the select page.

    Its primary role is to activate/deactivate specific files depending on the
    user's input. GET renders the page. Every other request is treated as an
    AJAX call whose action is chosen by the presence of a custom request
    header; the file manager is saved after the action has been applied.

    :return: a response object (often a render_template call) to flask
    and eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    # Usual loading of the FileManager
    file_manager = utility.load_file_manager()
    if request.method == "GET":
        rows = file_manager.get_previews_of_all()
        for row in rows:
            # The template receives "selected" or an empty string.
            row["state"] = "selected" if row["state"] else ""
        return render_template('manage.html',
                               rows=rows,
                               itm="manage",
                               numActiveDocs=num_active_docs)
    if 'previewTest' in request.headers:
        # Read-only request: answer with the preview and skip the save below.
        file_id = int(request.data)
        target_file = file_manager.files[file_id]
        preview_vals = {
            "id": file_id,
            "label": target_file.label,
            "previewText": target_file.get_preview()
        }
        return json.dumps(preview_vals)
    if 'toggleFile' in request.headers:
        # Toggle the file from active to inactive or vice versa
        file_id = int(request.data)
        file_manager.toggle_file(file_id)
    elif 'toggliFy' in request.headers:
        # request.data is bytes under Python 3; splitting bytes with a str
        # separator raises TypeError, so convert the body to text first.
        body = request.data
        if isinstance(body, bytes):
            body = body.decode("utf-8")
        file_ids = body.split(",")
        # Deactivate everything, then activate only the listed files.
        file_manager.disable_all()
        file_manager.enable_files(file_ids)
    elif 'setLabel' in request.headers:
        new_name = request.headers['setLabel']
        file_id = int(request.data)
        file_manager.files[file_id].set_name(new_name)
        file_manager.files[file_id].label = new_name
    elif 'setClass' in request.headers:
        new_class_label = request.headers['setClass']
        file_id = int(request.data)
        file_manager.files[file_id].set_class_label(new_class_label)
    elif 'disableAll' in request.headers:
        file_manager.disable_all()
    elif 'selectAll' in request.headers:
        file_manager.enable_all()
    elif 'applyClassLabel' in request.headers:
        file_manager.classify_active_files()
    elif 'deleteActive' in request.headers:
        file_manager.delete_active_files()
    elif 'deleteRow' in request.headers:
        # delete the file in request.form
        file_manager.delete_files(list(request.form.keys()))
    utility.save_file_manager(file_manager)
    return ''  # Return an empty string because you have to return something
def viz():
    """Handles the functionality on the alternate bubbleViz page.

    A GET request renders the empty page. A POST request sums the term
    counts of the selected documents (or of all active documents when none
    are selected), drops terms shorter than the ``minlength`` form field,
    keeps at most ``maxwords`` terms with the highest counts, and passes
    them to the template as a JSON string for d3.js.

    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    labels = file_manager.get_active_labels_with_id()
    from collections import OrderedDict
    from natsort import natsorted
    labels = OrderedDict(natsorted(labels.items(), key=lambda x: x[1]))
    if request.method == "GET":
        # "GET" request occurs when the page is first loaded.
        if 'cloudoption' not in session:
            session['cloudoption'] = constants.DEFAULT_CLOUD_OPTIONS
        if 'bubblevisoption' not in session:
            session['bubblevisoption'] = constants.DEFAULT_BUBBLEVIZ_OPTIONS
        return render_template('viz.html',
                               JSONObj="",
                               labels=labels,
                               itm="bubbleviz",
                               numActiveDocs=num_active_docs)
    if request.method == "POST":
        # "POST" request occur when html form is submitted
        # Get the file manager and tokenization options
        file_manager = utility.load_file_manager()
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
        token_type = session['analyoption']['tokenType']
        token_size = int(session['analyoption']['tokenSize'])
        # Limit docs to those selected or to active docs
        chosen_doc_ids = [int(x) for x in request.form.getlist('segmentlist')]
        if chosen_doc_ids:
            active_docs = list(chosen_doc_ids)
        else:
            active_docs = [l_file.id
                           for l_file in file_manager.files.values()
                           if l_file.active]
        # Get the contents of all selected/active docs; selected documents
        # that are not active are skipped.
        all_contents = [file_manager.files[file_id].load_contents()
                        for file_id in active_docs
                        if file_manager.files[file_id].active]
        # Generate a DTM
        dtm, vocab = utility.simple_vectorizer(all_contents, token_type,
                                               token_size)
        # Convert the DTM to a pandas dataframe with the terms as column
        # headers
        import pandas as pd
        df = pd.DataFrame(dtm, columns=vocab)
        # Get the Minimum Token Length and Maximum Term Settings. A missing
        # or empty field falls back to its default (0 and 100), so
        # max_num_words is always bound before it is used below.
        minimum_length = int(request.form.get('minlength') or 0)
        max_num_words = int(request.form.get('maxwords') or 100)
        # Filter words that don't meet the minimum length from the dataframe
        short_terms = [term for term in vocab if len(term) < minimum_length]
        df = df.drop(columns=short_terms)
        # Extract a dictionary of term count sums
        sums_dict = df.sum(axis=0).to_dict()
        # Create a new dataframe of sums and sort it by counts, then terms
        # Warning!!! This is not natsort. Multiple terms at the edge of
        # the maximum number of words limit may be cut off in arbitrary
        # order. We need to implement natsort for dataframes.
        f = pd.DataFrame(list(sums_dict.items()), columns=['term', 'count'])
        f = f.sort_values(by=['count', 'term'],
                          axis=0,
                          ascending=[False, True])
        # Keep only the top rows, in sorted order
        top = f.head(n=max_num_words)
        # Build the JSON object for d3.js
        children = [{"name": term, "size": str(count)}
                    for term, count in zip(top['term'].tolist(),
                                           top['count'].tolist())]
        json_obj = {"name": "tokens", "children": children}
        # Turn the JSON object into a JSON string for the front end
        json_str = json.dumps(json_obj)
        session_manager.cache_cloud_option()
        session_manager.cache_bubble_viz_option()
        return render_template('viz.html',
                               JSONObj=json_str,
                               labels=labels,
                               itm="bubbleviz",
                               numActiveDocs=num_active_docs)
Example #32
0
def tokenizer():
    """Handle the functionality on the tokenizer page.

    GET renders ``tokenizer.html`` with, at most, the first ten rows of the
    document-term matrix (DTM) pre-rendered as HTML table markup; the
    template receives the total row count so the page can request the rest.

    POST either sends the frequency matrix as a file download (when the
    form contains ``get-csv``) or answers an ajax request with a JSON
    string holding the requested slice of the table, its column headers and
    the footer "Total" / "Average" rows.

    Progress messages and elapsed times are printed to stdout at each stage.
    Any request method other than GET or POST falls through and returns
    ``None``.

    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Use timeit to test performance
    from timeit import default_timer as timer
    start_t = timer()
    print("Initialising GET request.")
    import pandas as pd
    from operator import itemgetter

    def report(message):
        """Print a stage message followed by the seconds elapsed so far."""
        print(message)
        print(timer() - start_t)

    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()

    if request.method == "GET":
        # Get the active labels and sort them
        labels = file_manager.get_active_labels_with_id()
        header_labels = natsorted(
            [file_manager.files[int(file_id)].label for file_id in labels])
        # Get the starting options from the session
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
        if 'csvoptions' not in session:
            session['csvoptions'] = constants.DEFAULT_CSV_OPTIONS
        analyze_options = session['analyoption']
        csv_options = session['csvoptions']
        csv_orientation = csv_options['csvorientation']
        data = {
            'cullnumber': analyze_options['cullnumber'],
            'tokenType': analyze_options['tokenType'],
            'normalizeType': analyze_options['normalizeType'],
            'csvdelimiter': csv_options['csvdelimiter'],
            'mfwnumber': '1',
            'csvorientation': csv_orientation,
            'tokenSize': analyze_options['tokenSize'],
            'norm': analyze_options['norm']
        }
        if num_active_docs > 0:
            # Get the DTM with the session options
            report("before generateCSVMatrixFromAjax")
            dtm = utility.generate_csv_matrix_from_ajax(data,
                                                        file_manager,
                                                        round_decimal=True)
            report("after generateCSVMatrixFromAjax")
            # Convert the DTM to a list of lists (matrix); row 0 holds the
            # column headers.
            if csv_orientation == "filerow":
                matrix = pd.DataFrame(dtm).values.tolist()
            else:
                df = pd.DataFrame(dtm)
                report("DataFrame created.")
                # The initial render only needs the Total and Average
                # columns to exist, so they are filled with zeros here.
                placeholders = [0] * (len(df.index) - 1)
                report("Sum and averages calculated.")
                df = pd.concat(
                    [df,
                     pd.DataFrame(["Total"] + placeholders,
                                  columns=['Total']),
                     pd.DataFrame(["Average"] + placeholders,
                                  columns=['Average'])],
                    axis=1)
                report("DataFrame modified.")
                matrix = df.values.tolist()
                report("DataFrame converted to matrix.")
            # Split the column headers from the data rows
            columns, matrix = matrix[0], matrix[1:]
            if csv_orientation == "filecolumn":
                columns[0] = "Terms"
            else:
                columns[0] = "Documents"
            # Calculate the number of rows in the matrix
            records_total = len(matrix)
            # Sort the matrix by column 0 and keep at most 10 records for
            # the initial load
            matrix = natsorted(matrix, key=itemgetter(0), reverse=False)[:10]
            # Escape the html characters in the row and column headers
            matrix = [[general_functions.html_escape(row[0])] + row[1:]
                      for row in matrix]
            columns = [general_functions.html_escape(item) for item in columns]
            # These rows are sent to the template as an HTML string.
            # After the template renders, an ajax request fetches new data
            # to re-render the table with the correct number of rows.
            cols = _tokenizer_html_row(columns, "th")
            rows = "".join(_tokenizer_html_row(row, "td") for row in matrix)
        # Catch instances where there is no active document (triggers the error
        # modal)
        else:
            cols = "<tr><th>Terms</th></tr>"
            rows = "<tr><td></td></tr>"
            records_total = 0
        # Render the template
        report("Matrix generated. Rendering template.")
        return render_template('tokenizer.html',
                               draw=1,
                               itm="tokenize",
                               labels=labels,
                               headers=header_labels,
                               columns=cols,
                               rows=rows,
                               numRows=records_total,
                               orientation=csv_orientation,
                               numActiveDocs=num_active_docs)

    if request.method == "POST":
        report("POST received.")
        session_manager.cache_analysis_option()
        session_manager.cache_csv_options()
        if 'get-csv' in request.form:
            # The 'Download Matrix' button is clicked on tokenizer.html.
            save_path, file_extension = utility.generate_csv(file_manager)
            utility.save_file_manager(file_manager)
            return send_file(save_path,
                             attachment_filename="frequency_matrix" +
                             file_extension,
                             as_attachment=True)

        # Get the Tokenizer options from the request json object
        options = request.json
        length = int(options["length"])
        # Increment for the ajax response
        draw = int(options["draw"]) + 1
        search = options["search"]
        sort_column = int(options["order"][0])
        reverse = str(options["order"][1]) == "desc"
        csv_orientation = options["csvorientation"]
        # Get the DTM with the requested options
        dtm = utility.generate_csv_matrix_from_ajax(options,
                                                    file_manager,
                                                    round_decimal=True)
        report("DTM received.")
        df = pd.DataFrame(dtm)
        report("DTM created. Calculating footer stats")
        # Per-column totals and averages over the data cells only (the
        # header row and the label column are excluded).
        footer_stats = df.iloc[1:, 1:]
        footer_totals = [round(total, 4)
                         for total in footer_stats.sum().tolist()]
        footer_averages = [round(ave, 4)
                           for ave in footer_stats.mean().tolist()]
        report("Footer stats calculated. "
               "Calculating totals and averages...")
        # Split the header row from the data rows and compute the per-row
        # totals and averages on plain lists.
        matrix = df.values.tolist()
        header, matrix = matrix[0], matrix[1:]
        row_totals, row_averages = _tokenizer_row_stats(matrix)
        report("Totals and averages calculated. Appending columns...")
        for row, row_total, row_average in zip(matrix, row_totals,
                                               row_averages):
            row.extend((row_total, row_average))
        report("Populating columns with rounded values.")
        # Populate the sum of sums and average of averages cells
        total_cells, average_cells = _tokenizer_grand_stats(row_totals,
                                                            row_averages)
        footer_totals.extend(total_cells)
        footer_averages.extend(average_cells)
        report("Rounded values added.")
        # The headers keep the order of the DTM so that they line up with
        # the data cells, which are never reordered column-wise.
        if csv_orientation == "filecolumn":
            first_header = "Terms"
        else:
            first_header = "Documents"
        columns = [first_header] + header[1:] + ["Total", "Average"]

        report("Starting common code.")
        # Calculate the number of rows in the matrix
        records_total = len(matrix)
        # Filter the rows by the search prefix, then sort by the column
        if search:
            matrix = [row for row in matrix if row[0].startswith(search)]
        matrix = natsorted(matrix,
                           key=itemgetter(sort_column),
                           reverse=reverse)
        # Get the number of filtered rows
        records_filtered = len(matrix)
        # Set the table length; a length of -1 returns every row
        if length != -1:
            start_index = int(options["start"])
            end_index = int(options["end"])
            matrix = matrix[start_index:end_index]
        # Correct the footer rows: normalise every value to a float with at
        # most four decimal places, then add the row label and a trailing
        # empty cell.
        footer_totals = [float("%.4f" % e) for e in footer_totals]
        footer_averages = [float("%.4f" % e) for e in footer_averages]
        footer_totals.insert(0, "Total")
        footer_averages.insert(0, "Average")
        footer_totals.append("")
        footer_averages.append("")
        response = {
            "draw": draw,
            "records_total": records_total,
            "records_filtered": records_filtered,
            "length": length,
            "columns": columns,
            "data": matrix,
            "totals": footer_totals,
            "averages": footer_averages
        }
        report("Returning table data to the browser.")
        return json.dumps(response)


def _tokenizer_html_row(cells, tag):
    """Return a ``<tr>`` string with every cell wrapped in a ``<tag>`` pair.

    :param cells: the values of the row; each one is converted with str().
    :param tag: the cell element name, e.g. "th" or "td".
    :return: the HTML markup of the row.
    """
    opening = "<" + tag + ">"
    closing = "</" + tag + ">"
    return "<tr>" + "".join(opening + str(cell) + closing
                            for cell in cells) + "</tr>"


def _tokenizer_row_stats(data_rows):
    """Return the total and the average of the values in each data row.

    The first cell of every row is its label and is skipped.

    :param data_rows: the rows of the matrix without the header row.
    :return: a pair of lists (totals, averages), one entry per row, each
    rounded to four decimal places.
    """
    totals = []
    averages = []
    for row in data_rows:
        values = row[1:]
        row_total = sum(values)
        totals.append(round(row_total, 4))
        # A row without value cells has no meaningful average; use 0
        # rather than dividing by zero.
        row_average = row_total / len(values) if values else 0
        averages.append(round(row_average, 4))
    return totals, averages


def _tokenizer_grand_stats(row_totals, row_averages):
    """Return the footer cells below the "Total" and "Average" columns.

    :param row_totals: the per-row totals (without the column label).
    :param row_averages: the per-row averages (without the column label).
    :return: a pair of lists; the first holds the sum of each column for the
    "Total" footer row, the second the mean of each column for the
    "Average" footer row. All values are rounded to four decimal places.
    """
    num_rows = len(row_totals)
    sum_of_sums = sum(row_totals)
    sum_of_aves = sum(row_averages)
    if num_rows:
        ave_of_sums = sum_of_sums / num_rows
        ave_of_aves = sum_of_aves / num_rows
    else:
        # No data rows: report zeros instead of dividing by zero.
        ave_of_sums = ave_of_aves = 0
    return ([round(sum_of_sums, 4), round(sum_of_aves, 4)],
            [round(ave_of_sums, 4), round(ave_of_aves, 4)])