Ejemplo n.º 1
0
def submit_stop_words():
    '''
    Route function that handles the submission of new stop words for the
    word-frequency distribution.

    The 'stopwords' form field is expected to contain the new stop words
    delimited by commas. Spaces are removed, the text is split on commas and
    frequency_dist is re-run on the keywords pickled for the current session
    id, with the new stop words passed as an extra parameter. The word cloud
    pickled for the same session id is loaded and shown again unchanged.

    If no usable stop words were submitted (the request is not a POST, the
    field is not filled, or it only holds commas/spaces), the graph is rebuilt
    without extra stop words so that the view always returns a response.

    Returns the rendered 'analysis_options.html' template.
    '''

    myid = session['myid']

    new_stopwords = []
    if request.method == 'POST' and is_filled(request.form['stopwords']):
        raw_stopwords = request.form['stopwords'].replace(' ', '')
        # stray commas ("a,,b" or "a,") would otherwise yield empty stop words
        new_stopwords = [word for word in raw_stopwords.split(",") if word]

    keywords_path = "pickles/keywords_" + str(myid) + '.p'
    wordcloud_html_path = "pickles/wordcloud_html_" + str(myid) + '.p'

    # NOTE: pickle is only safe on files this application wrote itself.
    # The context managers make sure the file handles are closed.
    with open(keywords_path, "rb") as keywords_file:
        keywords = pickle.load(keywords_file)

    title = session['title']
    if new_stopwords:
        graph_data = frequency_dist(keywords, 26, title, new_stopwords)
    else:
        graph_data = frequency_dist(keywords, 26, title)

    with open(wordcloud_html_path, "rb") as wordcloud_file:
        wordcloud_html = pickle.load(wordcloud_file)

    stop_words_form = StopWordsForm()
    return render_template('analysis_options.html',
                           title='Single file NLP analysis',
                           graph_data=graph_data,
                           stop_words_form=stop_words_form,
                           wordcloud_html=wordcloud_html)
Ejemplo n.º 2
0
def submit():
    '''
    Route function that deals with the text field submission from the homepage.

    It checks whether text has been entered. If it has, the text is parsed
    with simple_parse, the resulting keywords are pickled under the current
    session id, and the word-frequency graph and the word cloud are built and
    rendered in 'analysis_options.html'. The word cloud html is pickled as
    well so that it can be reloaded later.

    If the request is not a POST, or the submitted text is missing or blank,
    the user is sent back to the home page (the 'upload_file' endpoint).
    '''

    myid = session['myid']

    # a view must always return a response; anything but a POST goes home.
    if request.method != 'POST':
        return redirect(url_for('upload_file'))

    # checks if text has been entered in the text area. An empty text area is
    # still submitted as an empty string, so blank input is rejected as well.
    text = request.form.get('text', '')
    if not text.strip():
        print("no text entered")
        flash('No text was entered')
        return redirect(url_for('upload_file'))

    # processes the text so it is tokenized and linguistic noise is removed.
    text, _tokens, keywords = simple_parse(text)

    keywords_path = "pickles/keywords_" + str(myid) + '.p'
    with open(keywords_path, "wb") as keywords_file:
        pickle.dump(keywords, keywords_file)

    # gets the pygal word-frequency object
    graph_data = frequency_dist(keywords, 26,
                                ('Word frequency for input text'))

    # gets the wordcloud plot in html form
    wordcloud_html = build_word_cloud(text, 2000)

    wordcloud_html_path = "pickles/wordcloud_html_" + str(myid) + '.p'
    with open(wordcloud_html_path, "wb") as wordcloud_file:
        pickle.dump(wordcloud_html, wordcloud_file)

    session['title'] = 'NLP analysis'
    session['save'] = False

    stop_words_form = StopWordsForm()

    return render_template('analysis_options.html',
                           title='NLP analysis',
                           graph_data=graph_data,
                           wordcloud_html=wordcloud_html,
                           stop_words_form=stop_words_form)
Ejemplo n.º 3
0
def display_history_single():
    '''
    Route function that handles a previously saved single file (loaded from
    Azure blob storage), processes it and displays the word cloud and the
    word-frequency distribution graph.

    Reads 'save_path' and 'file_name_short_with_extension' from the query
    string, extracts the text with extract, pickles the keywords and the word
    cloud html under the current session id and renders
    'analysis_options.html'. The 'save' flag in the session is set to False.
    '''

    myid = session['myid']

    # NOTE(review): save_path comes straight from the query string and is
    # handed to extract() unchanged - confirm that the caller restricts it to
    # the expected download directory.
    save_path = request.args['save_path']
    file_name_short_with_extension = request.args[
        'file_name_short_with_extension']

    text, _tokens, keywords = extract(save_path)

    keywords_path = "pickles/keywords_" + str(myid) + '.p'
    with open(keywords_path, "wb") as keywords_file:
        pickle.dump(keywords, keywords_file)

    graph_data = frequency_dist(keywords, 26,
                                ('Word frequency for file  with filename: ' +
                                 file_name_short_with_extension))

    wordcloud_html = build_word_cloud(text, 2000)

    wordcloud_html_path = "pickles/wordcloud_html_" + str(myid) + '.p'
    with open(wordcloud_html_path, "wb") as wordcloud_file:
        pickle.dump(wordcloud_html, wordcloud_file)

    session['save'] = False

    stop_words_form = StopWordsForm()

    return render_template('analysis_options.html',
                           title='Single file NLP analysis',
                           graph_data=graph_data,
                           stop_words_form=stop_words_form,
                           wordcloud_html=wordcloud_html)
Ejemplo n.º 4
0
def save_single():
    '''
    Route function that saves the data of a single file input from a
    logged-in user.

    Saving involves uploading the actual uploaded file to Azure blob storage
    and saving the metadata to the SQLite database; both are done by
    blob_upload.upload_single_file. Since single files are usually small and
    quick to process, there is little advantage in serializing the resulting
    Python objects and saving these instead of the original file.

    After the upload the analysis page is rendered again from the keywords and
    word cloud pickled under the current session id, and the 'save' flag in
    the session is set to False.
    '''

    myid = session['myid']

    single_file_name_uuid_long_no_extension = session[
        'single_file_name_uuid_long_no_extension']
    single_file_name_short_with_extension = session[
        'single_file_name_short_with_extension']
    single_file_name_long_with_extension = session[
        'single_file_name_long_with_extension']

    #########################################################################
    # This snippet has been adapted from the following source:
    # Link: https://stackoverflow.com/questions/22520932/python-remove-all-non-alphabet-chars-from-string
    # Author: limasxgoesto0
    # Date: 20/03/2014
    #########################################################################
    # deletes any non alphanumeric characters (and dash) in the file name because Azure blob storage only allows for these characters in its container names.
    regex = re.compile('[^a-zA-Z0-9-]')
    single_file_name_uuid_long_no_extension = regex.sub(
        '', single_file_name_uuid_long_no_extension)

    single_file_path = os.path.join('uploads',
                                    single_file_name_long_with_extension)

    #########################################################################
    # This snippet has been adapted from the following source:
    # Link: https://stackoverflow.com/questions/5998245/get-current-time-in-milliseconds-in-python
    # Author: Naftuli Kay
    # Date: 13/05/2011
    #########################################################################
    # current time in tenths of a millisecond, appended to the container name.
    tenth_millis = int(round(time.time() * 10000))
    container_name = single_file_name_uuid_long_no_extension + str(
        tenth_millis)

    # this method handles the Azure blob storage upload and the database update
    blob_upload.upload_single_file(single_file_name_short_with_extension,
                                   single_file_path,
                                   container_name,
                                   single_file_name_long_with_extension,
                                   delete=False,
                                   update_db=True)

    # the next few lines reload the template after the form submission.
    # NOTE: pickle is only safe on files this application wrote itself.
    keywords_path = "pickles/keywords_" + str(myid) + '.p'
    with open(keywords_path, "rb") as keywords_file:
        keywords = pickle.load(keywords_file)

    title = session['title']
    graph_data = frequency_dist(keywords, 26, title)

    wordcloud_html_path = "pickles/wordcloud_html_" + str(myid) + '.p'
    with open(wordcloud_html_path, "rb") as wordcloud_file:
        wordcloud_html = pickle.load(wordcloud_file)

    stop_words_form = StopWordsForm()
    flash('Your model has been saved')

    # hides the 'save' button as there is no need to save the same model twice.
    session['save'] = False

    return render_template('analysis_options.html',
                           title='Single file NLP analysis',
                           graph_data=graph_data,
                           stop_words_form=stop_words_form,
                           wordcloud_html=wordcloud_html)
Ejemplo n.º 5
0
def upload_file():
    '''
    Route function that governs the home page of the web app.

    If the request is not a POST from the html file upload form, the function
    simply renders the home.html template.
    If there is a POST request from the file upload form, the uploaded file is
    processed. It is checked whether the uploaded file has an allowed
    extension (pdf, docx, txt for single files; rar, zip for compressed files).
    The file is stored in the 'uploads' folder under a name made of the
    cleaned file name (at most 28 characters) and the session identifier.

    If the file is a 'single file' (pdf, docx, txt), it is processed using
    the extractor.py module and the processed text is used to build a word
    cloud and a word-frequency graph ('analysis_options.html').
    If the file is a 'compressed folder' (rar, zip), it is decompressed and
    the text extracted and processed using the compressed_main.py module. The
    processed text is then used to build the scatter graph of the clusters
    (using the lda_tsne_model2.py module) and the pyLDAvis visualisation
    (using the mypyldavis.py module), rendered in 'bulk_analysis.html'.

    If fewer than 4 files are inside the compressed folder, a message is
    flashed and the user is redirected to the home page, as the LDA model does
    not work with so little data.
    '''

    # This flask session object is used to determine whether the 'save' button will be visible or not.
    session['save'] = True

    # If the user is not logged in, create a unique identifier for this specific user
    # (first 8 and last 12 characters of a uuid4 string).
    # If the user is already logged in, the identifier is read from the session.
    # current_user.is_anonymous comes from the Flask-Login Flask extension.
    if current_user.is_anonymous:
        an_id = str(uuid.uuid4())
        myid = an_id[:8] + an_id[24:]
        session['myid'] = myid
    else:
        myid = session['myid']

    if request.method != 'POST':
        uploadForm = UploadFileForm()
        inputTextForm = inputText()
        return render_template('home.html',
                               title='Welcome',
                               form=uploadForm,
                               textform=inputTextForm)

    # check if the post request has the file part
    if 'document' not in request.files:
        print("file not in request.files")
        flash('No file part')
        return redirect(request.url)

    upload = request.files['document']

    # if the user does not select a file, the browser also
    # submits an empty part without filename
    if upload.filename == '':
        flash('No selected file')
        return redirect(request.url)

    if not (upload and allowed_file(upload.filename)):
        flash('not an allowed file format')
        return redirect(url_for('upload_file'))

    filename = secure_filename(upload.filename)

    # secure_filename can strip the dot (e.g. ".pdf" becomes "pdf"); without
    # this guard the extension lookup below would raise an IndexError.
    file_name_no_extension, dot, file_extension = filename.rpartition('.')
    if not dot:
        flash('not an allowed file format')
        return redirect(url_for('upload_file'))

    file_extension = file_extension.lower()
    file_name_no_extension = file_name_no_extension.lower()

    if len(file_name_no_extension) > 28:
        file_name_no_extension = file_name_no_extension[0:28]
        filename = file_name_no_extension + '.' + file_extension

    # defining the regular expression of the allowed characters for the file name.
    regex = re.compile('[^a-zA-Z0-9-_]')

    # making sure that the file name contains only alpha-numeric and ('-', '_') characters. Any other character is deleted.
    file_name_no_extension = regex.sub('', file_name_no_extension)

    file_name_uuid_no_extension = str(file_name_no_extension) + '_' + str(
        myid)
    file_name_uuid = file_name_uuid_no_extension + '.' + file_extension

    upload_path = os.path.join('uploads', file_name_uuid)
    upload.save(upload_path)

    session['single_file_path'] = upload_path

    # checks if the uploaded file is a compressed format. This is pivotal in the program execution.
    if file_extension in compressed_extensions:

        # >>>>>>>>>>> This section of the code handles a compressed file (zip/rar)

        # creates the unique paths that are going to be used to store the serialised python objects (using the pickle module).
        # myid is the unique identifier that was previously created and is stored in session['myid']
        total_text_path = 'pickles/total_text_' + str(myid) + '.p'
        file_names_path = 'pickles/file_names_' + str(myid) + '.p'
        lda_model_path = "pickles/lda_model_" + str(myid) + '.p'
        lda_html_path = "pickles/lda_html_" + str(myid) + '.p'
        document_term_matrix_path = "pickles/document_term_matrix_" + \
            str(myid) + '.p'
        cvectorizer_path = "pickles/cvectorizer_" + str(myid) + '.p'
        pyldavis_html_path = "pickles/pyldavis_html_" + str(myid) + '.p'

        # stores the various path variables in the flask session so that they can be used later.
        session['total_text_path'] = total_text_path
        session['file_names_path'] = file_names_path
        session['vectorizer_path'] = cvectorizer_path
        session['document_term_matrix_path'] = document_term_matrix_path
        session['lda_model_path'] = lda_model_path
        session['lda_html_path'] = lda_html_path
        session['pyldavis_html_path'] = pyldavis_html_path

        # handle_compressed_file decompresses the compressed file, extracts the text from each document within and parses/tokenizes/stems/removes stop words.
        # total_text is a list of strings where each element in the list represents the text from one document
        # file_names is a list of the file names from the compressed folder.
        # the stemmed and tokenized vocabularies it also returns are not used here.
        total_text, _vocab_stemmed, _vocab_tokenized, file_names = handle_compressed_file(
            upload_path, filename)

        # once the compressed folder has been decompressed and processed there is no need to keep it on the filing system so it is removed.
        os.remove(upload_path)

        # LDA does not work with less than 4 files
        if len(file_names) < 4:
            flash('At least 4 files in the compressed folder are required')
            return redirect(url_for('upload_file'))

        # calls the lda_tsne method from lda_tsne_model2.py which performs text vectorization, LDA clustering and then tSNE before converting the plot into html format.
        lda_html = lda_tsne(total_text, file_names)

        # Flask form for inputting a new number of topics parameter.
        topic_number_form = inputTopicNumber()

        # calls the pyldavis_run method from mypyldavis.py which produces the pyLDAvis visualization and converts it to html format.
        pyldavis_html = pyldavis_run(lda_model_path,
                                     document_term_matrix_path,
                                     cvectorizer_path)

        # stores serialized versions of total_text, file_names, pyldavis_html and lda_html which will later be used.
        # each file is closed as soon as its object has been written.
        to_pickle = (
            (total_text, total_text_path),
            (file_names, file_names_path),
            (pyldavis_html, pyldavis_html_path),
            (lda_html, lda_html_path),
        )
        for obj, pickle_path in to_pickle:
            with open(pickle_path, "wb") as pickle_file:
                pickle.dump(obj, pickle_file)

        # Flask sessions object that dictates whether the 'Download' button is visible.
        session['download'] = True

        return render_template('bulk_analysis.html',
                               title='Clustering analysis',
                               lda_html=lda_html,
                               number_form=topic_number_form,
                               pyldavis_html=pyldavis_html)

    # >>>>>>>>>>> This section of the code handles a single file (docx/pdf/txt)

    # stores the file name with and without the file extension and unique identifier in flask session objects.
    session['single_file_name_short_no_extension'] = file_name_no_extension
    session[
        'single_file_name_uuid_long_no_extension'] = file_name_uuid_no_extension
    session['single_file_name_short_with_extension'] = filename
    session['single_file_name_long_with_extension'] = file_name_uuid

    # calls the extract method from extractor.py that extracts and processes the text from the document.
    text, _tokens, keywords = extract(upload_path)

    keywords_path = "pickles/keywords_" + str(myid) + '.p'
    with open(keywords_path, "wb") as keywords_file:
        pickle.dump(keywords, keywords_file)

    # gets the pygal word-frequency distribution graph
    graph_data = frequency_dist(
        keywords, 26,
        ('Word frequency for file  with filename: ' + filename))

    # gets the wordcloud plot html
    wordcloud_html = build_word_cloud(text, 2000)

    wordcloud_html_path = "pickles/wordcloud_html_" + str(myid) + '.p'
    with open(wordcloud_html_path, "wb") as wordcloud_file:
        pickle.dump(wordcloud_html, wordcloud_file)

    # gets the flask form that is used to input new stopwords.
    stop_words_form = StopWordsForm()

    session['title'] = 'Single file NLP analysis'

    return render_template('analysis_options.html',
                           title='Single file NLP analysis',
                           graph_data=graph_data,
                           wordcloud_html=wordcloud_html,
                           stop_words_form=stop_words_form)