Example #1
def display_history_group():
    '''
    This route function handles a saved group (compressed file) loaded from Azure blob storage,
    processes it and displays the LDA cluster scatter plot and the pyLDAvis visualization.
    This closely mirrors how an uploaded compressed file is handled.
    '''

    myid = session['myid']

    pyldavis_html_path = "pickles/pyldavis_html_" + str(myid) + '.p'
    lda_html_path = "pickles/lda_html_" + str(myid) + '.p'

    lda_html = pickle.load(open(lda_html_path, "rb"))
    pyldavis_html = pickle.load(open(pyldavis_html_path, "rb"))

    topic_number_form = inputTopicNumber()

    # there is no need to save a model that has already been saved before
    session['save'] = False
    # a model loaded from storage has no access to the original uploaded files and
    # therefore cannot reproduce a folder containing these files, so the 'Download'
    # button is hidden.
    session['download'] = False

    return render_template('bulk_analysis.html',
                           title='Clustering analysis',
                           lda_html=lda_html,
                           number_form=topic_number_form,
                           pyldavis_html=pyldavis_html)
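
The inputTopicNumber form is used in every example here but never shown. A minimal sketch, assuming Flask-WTF, with field names mirroring the request.form keys read in Example #2 ('number_topics', 'number_topwords'); the actual project's definition may differ:

# Hypothetical reconstruction (not part of the original source).
from flask_wtf import FlaskForm
from wtforms import IntegerField, SubmitField
from wtforms.validators import Optional, NumberRange

class inputTopicNumber(FlaskForm):
    number_topics = IntegerField('Number of topics',
                                 validators=[Optional(), NumberRange(min=2)])
    number_topwords = IntegerField('Number of top words',
                                   validators=[Optional(), NumberRange(min=1)])
    submit = SubmitField('Re-run analysis')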
Example #2
def submit_number_topics():
    '''
    This route function handles the submission of a new number of topics and/or top words
    for the LDA model. It re-runs the lda_tsne and pyldavis_run methods.
    '''

    myid = session['myid']

    # checks which of the two form fields contains data.
    if request.method == 'POST':
        if is_filled(request.form['number_topics']):
            number_topics = int(request.form['number_topics'])
            session['number_topics'] = str(number_topics)

        else:
            number_topics = int(session['number_topics'])

        if is_filled(request.form['number_topwords']):
            number_topwords = int(request.form['number_topwords'])
            session['number_topwords'] = str(number_topwords)

        else:
            number_topwords = int(session['number_topwords'])

        total_text_path = 'pickles/total_text_' + str(myid) + '.p'
        file_names_path = 'pickles/file_names_' + str(myid) + '.p'
        pyldavis_html_path = "pickles/pyldavis_html_" + str(myid) + '.p'

        # loads the serialized python objects for the total_text list and the file_names list,
        # which are going to be used to re-run the lda_tsne method.
        total_text = pickle.load(open(total_text_path, "rb"))
        file_names = pickle.load(open(file_names_path, "rb"))

        lda_html_path = "pickles/lda_html_" + str(myid) + '.p'

        # performs the LDA clustering and tSNE dimensionality reduction, resulting in a scatter plot that is converted to html.
        lda_html = lda_tsne(total_text,
                            file_names,
                            n_topics=number_topics,
                            n_top_words=number_topwords)

        lda_model_path = "pickles/lda_model_" + str(myid) + '.p'
        document_term_matrix_path = "pickles/document_term_matrix_" + \
            str(myid) + '.p'
        cvectorizer_path = "pickles/cvectorizer_" + str(myid) + '.p'

        # if only the number of top words is changing, there is no need to re-run the computationally
        # demanding pyldavis_run method; instead the last output can simply be loaded from the stored
        # serialized object.
        # if however the number of topics has changed, the pyldavis_run method needs to be called,
        # as it also incorporates the number of topics within its calculations.
        if is_filled(request.form['number_topics']):
            pyldavis_html = pyldavis_run(lda_model_path,
                                         document_term_matrix_path,
                                         cvectorizer_path)
        else:
            pyldavis_html = pickle.load(open(pyldavis_html_path, "rb"))

        pickle.dump(pyldavis_html, open(pyldavis_html_path, "wb"))
        pickle.dump(lda_html, open(lda_html_path, "wb"))

        topic_number_form = inputTopicNumber()
        return render_template('bulk_analysis.html',
                               title='Clustering analysis',
                               lda_html=lda_html,
                               number_form=topic_number_form,
                               pyldavis_html=pyldavis_html)
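
is_filled is called above but not defined in these examples. A plausible one-line helper, assuming it only tests whether a submitted form field contains non-whitespace text:

# Hypothetical helper (not in the original source): treat a form field as
# filled if it contains any non-whitespace characters.
def is_filled(field_value):
    return bool(field_value and field_value.strip())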
Example #3
def save_group():
    '''
    This route function handles the process of saving the data of a compressed file
    uploaded by a logged-in user.
    Saving the data involves writing the serialized pickle objects for the
    - total text list
    - file names list
    - vectorizer object (scikit-learn CountVectorizer)
    - document-term matrix obtained from the CountVectorizer
    - LDA model object
    - LDA model's resulting html
    - pyLDAvis html object
    to Azure blob storage, and saving the metadata to the SQLite database.
    '''

    compressed_file_name_without_extension_uuid = session[
        'compressed_file_name_without_extension_uuid']
    compressed_file_name_with_extension = session['compressed_file_name']

    compressed_file_name_uuid = session['compressed_file_name_uuid']

    total_text_path = session['total_text_path']
    file_names_path = session['file_names_path']
    vectorizer_path = session['vectorizer_path']
    document_term_matrix_path = session['document_term_matrix_path']
    lda_model_path = session['lda_model_path']
    lda_html_path = session['lda_html_path']
    pyldavis_html_path = session['pyldavis_html_path']

    #########################################################################
    # This snippet has been adapted from the following source:
    # Link: https://stackoverflow.com/questions/22520932/python-remove-all-non-alphabet-chars-from-string
    # Author: limasxgoesto0
    # Date: 20/03/2014
    #########################################################################
    # removes any character that is not alphanumeric or a dash from the compressed file name,
    # because Azure blob storage container names only allow lowercase letters, digits and dashes
    # (the name has already been lower-cased when the file was uploaded).
    regex = re.compile('[^a-zA-Z0-9-]')
    compressed_file_name_without_extension_uuid = regex.sub(
        '', compressed_file_name_without_extension_uuid)

    #########################################################################
    # This snippet has been adapted from the following source:
    # Link: https://stackoverflow.com/questions/5998245/get-current-time-in-milliseconds-in-python
    # Author: Naftuli Kay
    # Date: 13/05/2011
    #########################################################################
    # converts the current time into tenths of a millisecond
    millis = int(round(time.time() * 10000))

    container_name = compressed_file_name_without_extension_uuid + str(millis)

    # this method handles the Azure blob storage upload and the database update
    blob_upload.upload_group_file(compressed_file_name_with_extension,
                                  compressed_file_name_uuid,
                                  container_name,
                                  total_text_path,
                                  vectorizer_path,
                                  document_term_matrix_path,
                                  file_names_path,
                                  lda_model_path,
                                  lda_html_path,
                                  pyldavis_html_path,
                                  delete=False,
                                  update_db=True)

    lda_html = pickle.load(open(lda_html_path, "rb"))
    pyldavis_html = pickle.load(open(pyldavis_html_path, "rb"))

    topic_number_form = inputTopicNumber()

    flash('Your model has been saved')

    session['save'] = False

    return render_template('bulk_analysis.html',
                           title='Clustering analysis',
                           lda_html=lda_html,
                           number_form=topic_number_form,
                           pyldavis_html=pyldavis_html)
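
blob_upload.upload_group_file is defined elsewhere in the project. As a rough sketch of what uploading a single pickle might look like, assuming the azure-storage-blob v12 SDK (the function name and connection-string handling are illustrative; the real method uploads all seven pickles and also updates the SQLite database):

# Illustrative sketch only, assuming the azure-storage-blob v12 SDK; not the
# project's actual blob_upload module.
import os
from azure.storage.blob import BlobServiceClient

def upload_pickle(connection_string, container_name, local_path):
    service = BlobServiceClient.from_connection_string(connection_string)
    # container names must be 3-63 lowercase alphanumeric/dash characters,
    # hence the sanitising and timestamp suffix in save_group above
    container = service.create_container(container_name)
    with open(local_path, 'rb') as handle:
        container.upload_blob(name=os.path.basename(local_path), data=handle)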
Example #4
def upload_file():
    '''
    This is the route function that governs the home page of the web app.
    If there is no request from the html file upload form, the function simply renders the home.html template.
    If there is a POST request from the html file upload form, the uploaded file is processed.
    During processing it is checked whether the uploaded file has an allowed extension (pdf, docx, txt for single files; rar, zip for compressed files).
    If the file is a 'single file' (pdf, docx, txt), it is stored in the 'uploads' folder and processed using the extractor.py module.
    The processed text is then used to build a wordcloud and a word-frequency graph.
    If the file is a 'compressed folder' (rar, zip), it is decompressed and the text extracted and processed using the compressed_main.py module.
    The processed text is then used to build the scatter graph of the clusters (using the lda_tsne_model2.py module) and the pyLDAvis visualisation (using the mypyldavis.py module).

    Note that if fewer than 4 files are inside the compressed folder, the web app triggers an error, as the LDA model does not work with so little data.
    '''

    # This Flask session object is used to determine whether the 'Save' button will be visible or not.
    session['save'] = True

    # If the user is not logged in, create a unique identifier for this specific user.
    # If the user is already logged in, the identifier has already been created.
    # current_user.is_anonymous comes from the Flask-Login extension.
    if current_user.is_anonymous:
        an_id = str(uuid.uuid4())
        myid = an_id[:8] + an_id[24:]
        session['myid'] = myid

    else:
        myid = session['myid']

    if request.method == 'POST':
        # check if the post request has the file part
        if 'document' not in request.files:
            print("file not in request.files")
            flash('No file part')
            return redirect(request.url)
        file = request.files['document']
        # if the user does not select a file, the browser may also
        # submit an empty part without a filename
        if file.filename == '':
            flash('No selected file')
            return redirect(request.url)
        if file and allowed_file(file.filename):

            filename = secure_filename(file.filename)

            file_extension = filename.rsplit('.', 1)[1].lower()

            file_name_no_extension = filename.rsplit('.', 1)[0].lower()

            if len(file_name_no_extension) > 28:
                file_name_no_extension = file_name_no_extension[0:28]
                filename = file_name_no_extension + '.' + file_extension

            # defines the regular expression matching any character that is NOT allowed in the file name.
            regex = re.compile('[^a-zA-Z0-9-_]')

            # makes sure that the file name contains only alphanumeric, '-' and '_' characters; any other character is deleted.
            file_name_no_extension = regex.sub('', file_name_no_extension)

            file_name_uuid = str(file_name_no_extension) + \
                '_' + str(myid) + '.' + file_extension
            file_name_uuid_no_extension = str(
                file_name_no_extension) + '_' + str(myid)

            file.save(os.path.join('uploads', file_name_uuid))

            session['single_file_path'] = os.path.join('uploads',
                                                       file_name_uuid)

            # checks whether the uploaded file is a compressed format; this determines which branch of the program executes.
            if file_extension in compressed_extensions:

                # >>>>>>>>>>> This section of the code handles a compressed file (zip/rar)

                # creates the unique paths that are going to be used to store the serialised python objects (using the pickle module).
                # myid is the unique identifier that was previously created and is stored in session['myid']
                total_text_path = 'pickles/total_text_' + str(myid) + '.p'
                file_names_path = 'pickles/file_names_' + str(myid) + '.p'
                lda_model_path = "pickles/lda_model_" + str(myid) + '.p'
                lda_html_path = "pickles/lda_html_" + str(myid) + '.p'
                document_term_matrix_path = "pickles/document_term_matrix_" + \
                    str(myid) + '.p'
                cvectorizer_path = "pickles/cvectorizer_" + str(myid) + '.p'
                pyldavis_html_path = "pickles/pyldavis_html_" + \
                    str(myid) + '.p'

                # stores the various path variables in the Flask session so that they can be used later.
                session['total_text_path'] = total_text_path
                session['file_names_path'] = file_names_path
                session['vectorizer_path'] = cvectorizer_path
                session[
                    'document_term_matrix_path'] = document_term_matrix_path
                session['lda_model_path'] = lda_model_path
                session['lda_html_path'] = lda_html_path
                session['pyldavis_html_path'] = pyldavis_html_path

                # handle_compressed_file decompresses the compressed file, extracts the text from each document within and parses/tokenizes/stems/removes stop words.
                # total_text is a list of strings where each element in the list represents the text from one document
                # totalvocab_stemmed is a list of stemmed words from all the documents
                # totalvocab_tokenized is a list of text tokens from all the documents
                # file_names is a list of the file names from the compressed folder.
                total_text, totalvocab_stemmed, totalvocab_tokenized, file_names = handle_compressed_file(
                    (os.path.join('uploads', file_name_uuid)), filename)

                # once the compressed folder has been decompressed and processed, there is no need to keep it on the file system, so it is removed.
                os.remove(os.path.join('uploads', file_name_uuid))

                # LDA does not work with fewer than 4 files
                if len(file_names) < 4:
                    flash(
                        'At least 4 files in the compressed folder are required'
                    )
                    return redirect(url_for('upload_file'))

                # calls the lda_tsne method from lda_tsne_model2.py, which performs text vectorization, LDA clustering and then tSNE, before converting the plot into html format.
                lda_html = lda_tsne(total_text, file_names)

                # Flask form for inputting a new number of topics parameter.
                topic_number_form = inputTopicNumber()

                # calls the pyldavis_run method from mypyldavis.py, which produces the pyLDAvis visualization and converts it to html format.
                pyldavis_html = pyldavis_run(lda_model_path,
                                             document_term_matrix_path,
                                             cvectorizer_path)

                # stores serialized versions of total_text, file_names, pyldavis_html and lda_html which will later be used.
                pickle.dump(total_text, open(total_text_path, "wb"))
                pickle.dump(file_names, open(file_names_path, "wb"))
                pickle.dump(pyldavis_html, open(pyldavis_html_path, "wb"))
                pickle.dump(lda_html, open(lda_html_path, "wb"))

                # Flask session object that dictates whether the 'Download' button is visible.
                session['download'] = True

                return render_template('bulk_analysis.html',
                                       title='Clustering analysis',
                                       lda_html=lda_html,
                                       number_form=topic_number_form,
                                       pyldavis_html=pyldavis_html)

            # >>>>>>>>>>> This section of the code handles a single file (docx/pdf/txt)

            # stores the file name (with and without the file extension and unique identifier) in Flask session objects.
            session[
                'single_file_name_short_no_extension'] = file_name_no_extension
            session[
                'single_file_name_uuid_long_no_extension'] = file_name_uuid_no_extension
            session['single_file_name_short_with_extension'] = filename
            session['single_file_name_long_with_extension'] = file_name_uuid

            # calls the extract method from extractor.py that extracts and processes the text from the document.
            text, tokens, keywords = extract(
                os.path.join('uploads', file_name_uuid))

            keywords_path = "pickles/keywords_" + str(myid) + '.p'

            pickle.dump(keywords, open(keywords_path, "wb"))

            # gets the pygal word-frequency distribution graph
            graph_data = frequency_dist(
                keywords, 26,
                ('Word frequency for file with filename: ' + filename))

            # gets the wordcloud plot html
            wordcloud_html = build_word_cloud(text, 2000)

            wordcloud_html_path = "pickles/wordcloud_html_" + str(myid) + '.p'

            pickle.dump(wordcloud_html, open(wordcloud_html_path, "wb"))

            # gets the Flask form that is used to input new stopwords.
            stop_words_form = StopWordsForm()

            session['title'] = 'Single file NLP analysis'

            return render_template('analysis_options.html',
                                   title='Single file NLP analysis',
                                   graph_data=graph_data,
                                   wordcloud_html=wordcloud_html,
                                   stop_words_form=stop_words_form)

        else:
            flash('not an allowed file format')
            return redirect(url_for('upload_file'))
    else:
        uploadForm = UploadFileForm()
        inputTextForm = inputText()
        return render_template('home.html',
                               title='Welcome',
                               form=uploadForm,
                               textform=inputTextForm)
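
allowed_file and the extension sets are used throughout Example #4 but not shown. A minimal sketch following the standard Flask upload pattern, with the extensions named in the docstring (compressed_extensions matches the name used in upload_file; the other names are assumptions):

# Minimal sketch (not from the original source); follows the upload_file
# docstring: pdf/docx/txt for single files, zip/rar for compressed folders.
single_extensions = {'pdf', 'docx', 'txt'}
compressed_extensions = {'zip', 'rar'}
allowed_extensions = single_extensions | compressed_extensions

def allowed_file(filename):
    return ('.' in filename
            and filename.rsplit('.', 1)[1].lower() in allowed_extensions)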