Example #1
def OpenOutputFiles(window, openOutputFiles, filesToOpen):
    if filesToOpen == None or len(filesToOpen) == 0:
        return
    if len(filesToOpen) == 1:
        singularPlural = 'file'
    else:
        singularPlural = 'files'
    if openOutputFiles == True:  # all files are now opened at the end, so the extra "and runningAll == False" check is redundant
        # display a reminder about csv files with weird characters, most likely due to non utf-8 apostrophes and quotes;
        #   this reminder does not have a routine-specific config, so '*' is used
        reminders_util.checkReminder(
            '*', ['csv files'],
            'If csv output files display weird characters when opened in a Windows OS (e.g., â€™), the most likely cause is non utf-8 compliant input text. Apostrophes and quotes are the typical culprits, but other punctuation characters can also be responsible.\n\nPlease, run the tool to check documents for utf-8 compliance and, if necessary, run the tool for automatic apostrophe and quote conversion from non utf-8 to utf-8.\n\nTo learn more about utf-8 compliance, read the TIPS on utf-8 compliance.',
            True)
        routine_options = reminders_util.getReminder_list('*')
        timed_alert(
            window, 2000, 'Warning', 'Opening ' + str(len(filesToOpen)) +
            ' output ' + singularPlural + '... Please wait...', False)
        for file in filesToOpen:
            if os.path.isfile(file):
                if file.endswith('.kml'):
                    open_kmlFile(window, file)
                else:
                    openFile(window, file)
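
# A minimal sketch of the reminder pattern used above, NOT the actual
# reminders_util implementation: checkReminder(config_filename, title_options,
# message, show) is assumed to display the message once and record each title in
# a per-script config file so that the reminder is skipped on later runs.
import os

def checkReminder_sketch(config_filename, title_options, message, show=True):
    seen = set()
    if os.path.isfile(config_filename):
        with open(config_filename, encoding='utf-8') as f:
            seen = {line.strip() for line in f}
    for title in title_options:
        if show and title not in seen:
            print('REMINDER - ' + title + '\n' + message)  # the GUI uses a messagebox instead
            with open(config_filename, 'a', encoding='utf-8') as f:
                f.write(title + '\n')
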
def activate_fileName_wellFormedness(*args):
	if check_filename_var.get() == False:
		fileName_embeds_date_checkbox.configure(state="disabled")
		NER_checkbox.configure(state="disabled")
		NER_var.set(0)
		similarityIndex_Intruder_menu.configure(state="disabled")
		character_home_checkbox.configure(state="normal")
		missing_character_checkbox.configure(state="normal")
		Levenshtein_checkbox.configure(state='normal')
		character_checkbox.configure(state="normal")
		# character_home_checkbox.configure(state="normal")
		intruder_checkbox.configure(state="normal")
		plagiarist_checkbox.configure(state="normal")
	else:
		reminders_util.checkReminder(config_filename, ["Filename checker"], '', True)
		fileName_embeds_date_checkbox.configure(state="normal")
		NER_checkbox.configure(state="disabled")
		NER_var.set(0)
		similarityIndex_Intruder_menu.configure(state="disabled")
		character_home_checkbox.configure(state="disabled")
		missing_character_checkbox.configure(state="disabled")
		Levenshtein_checkbox.configure(state='disabled')
		character_checkbox.configure(state="disabled")
		# character_home_checkbox.configure(state="disabled")
		intruder_checkbox.configure(state="disabled")
		plagiarist_checkbox.configure(state="disabled")
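
# The activate_* callbacks above and below all follow the same pattern: ticking a
# "mode" checkbox disables every widget belonging to the other mode and re-enables
# its own. A table-driven sketch (hypothetical helper, not part of the NLP Suite)
# keeps the two branches from drifting apart:
def set_widget_states(widgets, state):
    # state is 'normal' or 'disabled', as in the tkinter configure calls above
    for widget in widgets:
        widget.configure(state=state)

# usage sketch:
#   enabled, disabled = (group_B, group_A) if var.get() else (group_A, group_B)
#   set_widget_states(enabled, 'normal'); set_widget_states(disabled, 'disabled')
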
Example #3
def checkUSSSUpdate():
    if annotator_dictionary_var.get() == True or plot_var.get() == True:
        currentYear = datetime.now().year
        yearDiff = currentYear - last_SS_year_var.get()
        if yearDiff >= 2:
            reminders_util.checkReminder(
                config_filename, ['Time to download new US SS data'],
                'It has been more than two years since the US Social Security gender data were downloaded to your machine.\n\nCheck whether more current data are available on the US Social Security website\n\nhttps://www.ssa.gov/oact/babynames/limits.html',
                True)
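
# The staleness test above reduces to a pure function; a hedged sketch for
# testability (names are illustrative, not part of the NLP Suite):
from datetime import datetime

def ss_data_is_stale(last_download_year, min_year_diff=2):
    # True when the US Social Security data were downloaded min_year_diff or more years ago
    return datetime.now().year - last_download_year >= min_year_diff
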
Example #4
def display_warning(*args):
    if GIS_package2_var.get():
        # routine_options = reminders_util.getReminder_list(config_filename)
        reminders_util.checkReminder(
            config_filename, ['Open Google Earth GUI'],
            'You should tick the Open GUI checkbox ONLY if you wish to open the GUI.\n\nThe Google Earth Pro GUI will provide a number of options to personalize a Google Earth Pro map. Press Run after selecting the Open GUI option.',
            True)
        routine_options = reminders_util.getReminder_list(config_filename)
        return
def activate_filenameEmbedsDate(*args):
	if plagiarist_var.get() == False:
		similarityIndex_Plagiarist_menu.configure(state="disabled")
		check_filename_checkbox.configure(state='normal')
		character_checkbox.configure(state="normal")
		character_home_checkbox.configure(state="normal")
		missing_character_checkbox.configure(state="normal")
		Levenshtein_checkbox.configure(state='normal')
		intruder_checkbox.configure(state="normal")
		fileName_embeds_date_checkbox.configure(state="disabled")
		fileName_embeds_date.set(0)
	else:
		reminders_util.checkReminder(config_filename, ["Plagiarist"], '', True)
		similarityIndex_Plagiarist_menu.configure(state="normal")
		check_filename_checkbox.configure(state='disabled')
		character_checkbox.configure(state="disabled")
		character_home_checkbox.configure(state="disabled")
		Levenshtein_checkbox.configure(state='disabled')
		missing_character_checkbox.configure(state="disabled")
		intruder_checkbox.configure(state="disabled")
		fileName_embeds_date_checkbox.configure(state="normal")
def create_js(output_filename, locations, api_key, geocoder, latLongList):
    gmaps_list = []
    if not latLongList:
        latLongList = []
        for l in locations:
            returned_loc = GIS_geocode_util.nominatim_geocode(geocoder, l)
            latLongList.append([returned_loc.latitude, returned_loc.longitude])
    else:
        latLongList = locations
    for item in latLongList:
        gmaps_str = ''.join([
            "new google.maps.LatLng(",
            str(item[0]), ", ",
            str(item[1]), "),"
        ])
        gmaps_list.append(gmaps_str)
        # gmaps_list now holds the JavaScript "new google.maps.LatLng(lat, lng)," strings
    create_google_heatmap(output_filename, gmaps_list, api_key)
    config_filename = 'GIS-config.txt'
    reminders_util.checkReminder(
        config_filename, ['Google Maps API'],
        'If the heatmap produced by Google Maps is displayed correctly for a split second and then displays "Oops! Something went wrong" you probably:\n  1. pasted the Google API key incorrectly into the API key widget;\n  2. did not enter billing information when applying for an API key; billing information is required although it is VERY unlikely you will be charged since you are not producing maps on a massive scale;\n  3. did not enable the Maps JavaScript API (and if you use Google for geocoding, you also need to enable the Geocoding API).\n\nPlease, check the API key, your billing information, and the enabled APIs and try again.',
        True)
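
# The LatLng string assembly inside create_js can be written more directly with
# an f-string; the output is identical:
#     gmaps_str = f"new google.maps.LatLng({item[0]}, {item[1]}),"
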
Example #7
def check_requirements(*args):
    inputDir=GUI_util.input_main_dir_path.get()
    inputFilename=GUI_util.inputFilename.get()
    sentimentAnalysis=sentiment_analysis_var.get()
    if inputDir == '' and (sentimentAnalysis == True or corpus_analysis_var.get() == True):
        mb.showwarning(title='Input folder error',
                       message='The \'Sentiment analysis\' and \'Compute & visualize corpus statistics\' options require as input a set of txt files for which to compute sentiment scores and/or corpus statistics.\n\nPlease, use the IO widget \'Select INPUT files directory\' to select the appropriate directory and try again.')
        return
    if inputDir != '' and (sentiment_analysis_var.get() == True or corpus_analysis_var.get() == True):
        nSAscoreFiles=IO_files_util.GetNumberOfDocumentsInDirectory(inputDir, 'txt')
        if nSAscoreFiles==0:
            mb.showwarning(title="Directory error",
                           message="Sentiment Analysis and Corpus Statistics algorithms require in input a set of txt files for which to compute sentiment scores and/or create statistics. The selected input directory\n\n"+inputDir+"\n\ndoes not contain any txt files.\n\nPlease, select a different directory (or untick the checkboxes 'Sentiment Analysis' and/or 'Compute & visualize corpus statistics') and try again.")
            return
        if sentiment_analysis_var.get() == True:
            title_options = ['Stanford CoreNLP Sentiment Analysis system requirements']
            message = 'The Stanford CoreNLP Sentiment Analysis tool requires two components.\n\n1. A copy of the FREEWARE Stanford CoreNLP suite installed on your machine. You can download the FREEWARE Stanford CoreNLP at https://stanfordnlp.github.io/CoreNLP/download.html.\n\n2. CoreNLP, in turn, requires the FREEWARE Java to be installed. You can download and install the FREEWARE Java at https://www.java.com/en/download/'
            reminders_util.checkReminder(config_filename,
                                         title_options,
                                         message,
                                         True)
            return
    if inputFilename=='' and sentiment_analysis_var.get() == False and corpus_analysis_var.get() == False and (
            hierarchical_clustering_var.get() == True or SVD_var.get() == True or NMF_var.get() == True):
        mb.showwarning(title="Data warning: Data reduction algorithms",
                       message="Data reduction algorithms (Hierarchical Clustering, Singular Value Decomposition, Non-Negative Matrix Factorization) require in input a csv file of sentiment analysis scores.\n\nPlease, use the IO widget \'Select INPUT csv file\' and try again.")
    if inputFilename!='' and sentiment_analysis_var.get() == False and corpus_analysis_var.get() == False and (
            hierarchical_clustering_var.get() == True or SVD_var.get() == True or NMF_var.get() == True):
        nSAscoreFiles = IO_csv_util.GetNumberOfDocumentsInCSVfile(inputFilename,'Shape of Stories')
        if nSAscoreFiles == None:
            return
        # nSAscoreFiles = IO_files_util.GetNumberOfDocumentsInDirectory(inputDir, 'csv')
        if nSAscoreFiles < 50:
            mb.showwarning(title="Data warning: Data reduction algorithms",
                                 message="Data reduction algorithms (Hierarchical Clustering, Singular Value Decomposition, Non-Negative Matrix Factorization) require as input a csv file of sentiment analysis scores for a large number of documents (at least 50). The selected input file\n\n" + inputFilename + "\n\ncontains only " + str(
                                     nSAscoreFiles) + " documents. TOO FEW!\n\nYou REALLY should select a different csv file and try again.")
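
# Python binds `and` more tightly than `or`, so `a and b or c` parses as
# `(a and b) or c`. The input-directory guards in check_requirements therefore
# need the explicit parentheses added above; without them, the warning fired
# whenever corpus statistics were selected, even with an input directory set.
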
Example #8
def display_reminder(*args):
    if SA_algorithm_var.get() == 'Stanford CoreNLP':
        message = 'The Stanford CoreNLP Sentiment Analysis tool requires two components.\n\n1. A copy of the FREEWARE Stanford CoreNLP suite installed on your machine. You can download the FREEWARE Stanford CoreNLP at https://stanfordnlp.github.io/CoreNLP/download.html.\n\n2. CoreNLP, in turn, requires the FREEWARE Java to be installed. You can download and install the FREEWARE Java at https://www.java.com/en/download/'
        title_option = [SA_algorithm_var.get()]
        reminders_util.checkReminder(config_filename, title_option, message,
                                     True)
    elif SA_algorithm_var.get() == 'VADER':
        message = 'VADER heavily relies on a number of NLTK libraries. If VADER fails to run, make sure that in the command line you run\n\npython -m nltk.downloader all'
        title_option = [SA_algorithm_var.get()]
        reminders_util.checkReminder(config_filename, title_option, message,
                                     True)
        if mean_var.get() == True or median_var.get() == True:
            message = 'VADER cannot compute sentence mean and median values because VADER computes a single compound value for the entire sentence.\n\nUse the hedonometer to compute separate values and a list of the words found.'
            title_option = ['VADER Mean/Median']
            reminders_util.checkReminder(config_filename, title_option,
                                         message, True)
    elif SA_algorithm_var.get() == 'SentiWordNet':
        message = 'SentiWordNet does not compute sentence mean and median values nor does it display a list of the individual words found.'
        title_option = ['SentiWordNet']
        reminders_util.checkReminder(config_filename, title_option, message,
                                     True)
    else:
        return
Example #9
    GUI_IO_util.place_help_button(
        window, help_button_x_coordinate, basic_y_coordinate + y_step * 8,
        "Help",
        "Please, tick the checkbox if you wish to MAP a list of geococed locations.\n\nUsing the dropdown menu, select the GIS (Geographic Information System) package you wish to use to produce maps.\n\nGoogle Maps requires an API key that you obtain from registering.\n\nWhen selecting Google Maps, the API key field will become available.\n\nYou will need to get the API key from the Google console and entering it there. REMEMBER! When applying for an API key you will need to enter billing information; billing information is required although it is VERY unlikely you will be charged since you are not producing maps on a massive scale.\n https://developers.google.com/maps/documentation/embed/get-api-key.\n\nAfter entering the Google API key, click OK to save it and the key will be read in automatically next time around.\n\nTick the Open GUI checkbox ONLY if you wish to open the Google Earth Pro GUI for more options. Do not tick the checkbox if you wish to run the pipeline automatically from text to maps."
        + GUI_IO_util.msg_Esc)
    GUI_IO_util.place_help_button(window, help_button_x_coordinate,
                                  basic_y_coordinate + y_step * 9, "Help",
                                  GUI_IO_util.msg_openOutputFiles)


help_buttons(window, GUI_IO_util.get_help_button_x_coordinate(),
             GUI_IO_util.get_basic_y_coordinate(), GUI_IO_util.get_y_step())

# change the value of the readMe_message
readMe_message = "This Python 3 script allows users to go from text to map in three steps:\n\n1. EXTRACT locations from a text file using Stanford CoreNLP NER extractor (NER values: CITY, STATE_OR_PROVINCE, COUNTRY);\n2. GEOCODE locations, previously extracted, using Nominatim or Google (an API is needed for Google);\n3. MAP locations, previously geocoded, using a selected GIS package (e.g., Google Earth Pro; Google Maps to produce heat maps; Google Maps requires an API key).\n\nOptions are preset and\or disabled depending upon the input type (directory or file; txt or csv file; csv CoNLL file or list of locations to be geocoded or already geocoded).\n\nAll three steps can be selected and carried out in sequence in a pipeline, going automatically from text to map."
readMe_command = lambda: GUI_IO_util.readme_button(
    window, GUI_IO_util.get_help_button_x_coordinate(),
    GUI_IO_util.get_basic_y_coordinate(), "Help", readMe_message)
GUI_util.GUI_bottom(config_input_output_options, y_multiplier_integer,
                    readMe_command, TIPS_lookup, TIPS_options)

# routine_options = reminders_util.getReminder_list(config_filename)
result = reminders_util.checkReminder(
    config_filename, ['GIS GUI options'],
    'The options available on the GUI have been automatically set for you depending upon the type of input file selected: txt or csv.\n\nWith a TXT file, NER extraction via Stanford CoreNLP must be first performed.\n\nWith a CSV file, the script checks whether the file is a CoNLL table, a geocoded file containing latitude and longitude values, or a file containing a list of locations that need to be geocoded.'
)
if result != None:
    routine_options = reminders_util.getReminder_list(config_filename)

GUI_util.window.mainloop()
def run(inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts,
        memory_var, manual_Coref, parser, parser_menu_var, dateInclude, sep,
        date_field_position, dateFormat, compute_sentence_var,
        CoNLL_table_analyzer_var, CoreNLP_annotators_var,
        CoreNLP_annotators_menu_var):
    # check internet connection
    filesToOpen = []

    if not IO_internet_util.check_internet_availability_warning(
            "Stanford CoreNLP"):
        return

    errorFound, error_code, system_output = IO_libraries_util.check_java_installation(
        'Stanford CoreNLP')
    if errorFound:
        return

    if parser == 0 and CoNLL_table_analyzer_var == 0 and CoreNLP_annotators_var == 0:
        mb.showinfo(
            "Warning",
            "No options have been selected.\n\nPlease, select an option and try again."
        )
        return

    if CoreNLP_annotators_var == True and 'Coreference PRONOMINAL resolution' in CoreNLP_annotators_menu_var:
        if IO_libraries_util.inputProgramFileCheck(
                "Stanford_CoreNLP_coReference_util.py") == False:
            return
        if "Neural" in CoreNLP_annotators_menu_var:
            CoRef_Option = 'Neural Network'
        file_open, error_indicator = Stanford_CoreNLP_coreference_util.run(
            inputFilename, inputDir, outputDir, openOutputFiles,
            createExcelCharts, memory_var, CoRef_Option, manual_Coref)
        if error_indicator == 0:
            IO_user_interface_util.timed_alert(
                GUI_util.window, 4000,
                'Stanford CoreNLP Co-Reference Resolution',
                'Finished running Stanford CoreNLP Co-Reference Resolution using the '
                + CoRef_Option + ' approach at', True)
        else:
            mb.showinfo(
                "Coreference Resolution Error",
                "Since Stanford CoreNLP Co-Reference Resolution threw an error, "
                +
                "and you either didn't choose manual Co-Reference Resolution or manual Co-Reference Resolution failed as well, the process ends now."
            )
        filesToOpen = filesToOpen + file_open

    outputCoNLLfilePath = ''

    # parser ---------------------------------------------------------------------------------------------------------------------------

    if parser:

        # Parser  ------------------------------
        if parser_menu_var == 'Probabilistic Context Free Grammar (PCFG)' or parser_menu_var == 'Neural Network':
            if IO_libraries_util.inputProgramFileCheck(
                    'Stanford_CoreNLP_annotator_util.py') == False:
                return
            if parser_menu_var == 'Probabilistic Context Free Grammar (PCFG)':
                tempOutputFiles = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                    inputFilename,
                    inputDir,
                    outputDir,
                    openOutputFiles,
                    createExcelCharts,
                    'parser (pcfg)',
                    False,
                    memory_var,
                    extract_date_from_filename_var=dateInclude,
                    date_format=dateFormat,
                    date_separator_var=sep,
                    date_position_var=date_field_position)
            else:
                # Parser (Neural Network) ------------------------------
                tempOutputFiles = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                    inputFilename,
                    inputDir,
                    outputDir,
                    openOutputFiles,
                    createExcelCharts,
                    'parser (nn)',
                    False,
                    memory_var,
                    extract_date_from_filename_var=dateInclude,
                    date_format=dateFormat,
                    date_separator_var=sep,
                    date_position_var=date_field_position)
            if len(tempOutputFiles) > 0:
                filesToOpen.extend(tempOutputFiles)
                if compute_sentence_var:
                    tempOutputFile = IO_CoNLL_util.compute_sentence_table(
                        tempOutputFiles[0], outputDir)
                    filesToOpen.append(tempOutputFile)

        if CoNLL_table_analyzer_var and len(filesToOpen) > 0:
            if IO_libraries_util.inputProgramFileCheck(
                    'CoNLL_table_analyzer_main.py') == False:
                return
            # open the analyzer after saving the new parser output in the config so that it opens the right input file
            if parser:
                config_filename_temp = 'conll-table-analyzer-config.txt'
                config_array = [
                    'EMPTY LINE', outputCoNLLfilePath, 'EMPTY LINE',
                    'EMPTY LINE', 'EMPTY LINE', outputDir
                ]
                config_util.saveConfig(GUI_util.window, config_filename_temp,
                                       config_array, True)

                reminders_util.checkReminder(
                    config_filename, ['CoNLL table analyzer'],
                    "The Stanford CoreNLP GUI will now open the 'CoNLL table analyzer' where you can:\n\n  1. search the words contained in the CoNLL table (the one just created or a different one) by their syntactical properties and the type of relations to other words;\n  2. compute frequency distributions of various types of linguistic objects: clauses, nouns, verbs, function words ('junk/stop' words).",
                    True)

                call("python CoNLL_table_analyzer_main.py", shell=True)

    if CoreNLP_annotators_var and CoreNLP_annotators_menu_var != '':

        # POS annotator ---------------------------------------------------------------------------------------------------------------------------
        if 'POS annotator' in CoreNLP_annotators_menu_var or CoreNLP_annotators_menu_var == '*':
            if IO_libraries_util.inputProgramFileCheck(
                    'Stanford_CoreNLP_annotator_util.py') == False:
                return

            tempOutputFiles = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename,
                inputDir,
                outputDir,
                openOutputFiles,
                createExcelCharts,
                'All POS',
                False,
                memory_var,
                extract_date_from_filename_var=dateInclude,
                date_format=dateFormat,
                date_separator_var=sep,
                date_position_var=date_field_position)
            if len(tempOutputFiles) > 0:
                filesToOpen.extend(tempOutputFiles)

        # DepRel annotator ---------------------------------------------------------------------------------------------------------------------------

        if 'DepRel annotator' in CoreNLP_annotators_menu_var or CoreNLP_annotators_menu_var == '*':
            if IO_libraries_util.inputProgramFileCheck(
                    'Stanford_CoreNLP_annotator_util.py') == False:
                return

            tempOutputFiles = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename,
                inputDir,
                outputDir,
                openOutputFiles,
                createExcelCharts,
                'DepRel',
                False,
                memory_var,
                extract_date_from_filename_var=dateInclude,
                date_format=dateFormat,
                date_separator_var=sep,
                date_position_var=date_field_position)
            if len(tempOutputFiles) > 0:
                filesToOpen.extend(tempOutputFiles)

        # NER annotator ---------------------------------------------------------------------------------------------------------------------------

        if 'NER (GUI)' in CoreNLP_annotators_menu_var:
            if IO_libraries_util.inputProgramFileCheck(
                    'Stanford_CoreNLP_NER_main.py') == False:
                return
            call("python Stanford_CoreNLP_NER_main.py", shell=True)

        # NER normalized date annotator ---------------------------------------------------------------------------------------------------------------------------

        if 'Normalized' in CoreNLP_annotators_menu_var or '**' in CoreNLP_annotators_menu_var:
            # date_extractor
            if IO_libraries_util.inputProgramFileCheck(
                    'Stanford_CoreNLP_annotator_util.py') == False:
                return

            tempOutputFiles = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename,
                inputDir,
                outputDir,
                openOutputFiles,
                createExcelCharts,
                'normalized-date',
                False,
                memory_var,
                extract_date_from_filename_var=dateInclude,
                date_format=dateFormat,
                date_separator_var=sep,
                date_position_var=date_field_position)
            if len(tempOutputFiles) > 0:
                filesToOpen.extend(tempOutputFiles)

        # quote annotator ---------------------------------------------------------------------------------------------------------------------------

        if 'Quote' in CoreNLP_annotators_menu_var or '**' in CoreNLP_annotators_menu_var:
            # if quote_extractor:
            if IO_libraries_util.inputProgramFileCheck(
                    'Stanford_CoreNLP_annotator_util.py') == False:
                return

            tempOutputFiles = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename,
                inputDir,
                outputDir,
                openOutputFiles,
                createExcelCharts,
                'quote',
                False,
                memory_var,
                extract_date_from_filename_var=dateInclude,
                date_format=dateFormat,
                date_separator_var=sep,
                date_position_var=date_field_position)

            if len(tempOutputFiles) > 0:
                filesToOpen.extend(tempOutputFiles)

        # gender annotator ---------------------------------------------------------------------------------------------------------------------------

        if 'Gender' in CoreNLP_annotators_menu_var or '**' in CoreNLP_annotators_menu_var:
            if IO_libraries_util.inputProgramFileCheck(
                    'Stanford_CoreNLP_annotator_util.py') == False:
                return

            tempOutputFiles = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename,
                inputDir,
                outputDir,
                openOutputFiles,
                createExcelCharts,
                'gender',
                False,
                memory_var,
                extract_date_from_filename_var=dateInclude,
                date_format=dateFormat,
                date_separator_var=sep,
                date_position_var=date_field_position)

            if len(tempOutputFiles) > 0:
                filesToOpen.extend(tempOutputFiles)

        # Sentiment analysis annotator ---------------------------------------------------------------------------------------------------------------------------

        if 'Sentiment analysis' in CoreNLP_annotators_menu_var:
            if IO_libraries_util.inputProgramFileCheck(
                    'Stanford_CoreNLP_annotator_util.py') == False:
                return

            tempOutputFiles = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename,
                inputDir,
                outputDir,
                openOutputFiles,
                createExcelCharts,
                'sentiment',
                False,
                memory_var,
                extract_date_from_filename_var=dateInclude,
                date_format=dateFormat,
                date_separator_var=sep,
                date_position_var=date_field_position)
            if len(tempOutputFiles) > 0:
                filesToOpen.extend(tempOutputFiles)

        # OpenIE SVO extractor ---------------------------------------------------------------------------------------------------------------------------

        if 'OpenIE' in CoreNLP_annotators_menu_var:
            if IO_libraries_util.inputProgramFileCheck(
                    'Stanford_CoreNLP_annotator_util.py') == False:
                return

            IO_user_interface_util.script_under_development(
                'Stanford CoreNLP OpenIE')

            tempOutputFiles = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename,
                inputDir,
                outputDir,
                openOutputFiles,
                createExcelCharts,
                'openIE',
                False,
                memory_var,
                extract_date_from_filename_var=dateInclude,
                date_format=dateFormat,
                date_separator_var=sep,
                date_position_var=date_field_position)
            if len(tempOutputFiles) > 0:
                filesToOpen.extend(tempOutputFiles)

    if openOutputFiles:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
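
# The repeated CoreNLP_annotate calls in run() above differ only in the annotator
# name. A hedged refactoring sketch (assuming the keyword interface shown above)
# would collapse them into one local helper:
#
#     def annotate(annotator):
#         return Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
#             inputFilename, inputDir, outputDir, openOutputFiles,
#             createExcelCharts, annotator, False, memory_var,
#             extract_date_from_filename_var=dateInclude, date_format=dateFormat,
#             date_separator_var=sep, date_position_var=date_field_position)
#
#     filesToOpen.extend(annotate('All POS'))
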
def run(inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts,
        utf8_var, ASCII_var, corpus_statistics_var, corpus_options_menu_var,
        topics_var, topics_Mallet_var, topics_Gensim_var, open_GUI_var,
        what_else_var, what_else_menu_var, memory_var):

    filesToOpen = []
    inputFilename = ''  # only corpus in dir used

    if (corpus_statistics_var==False and
            corpus_options_menu_var==False and
            topics_Mallet_var==False and
            topics_Gensim_var==False and
            what_else_var==False):
        mb.showwarning(
            title='No options selected',
            message=
            'No options have been selected.\n\nPlease, select an option and try again.'
        )
        return

    if utf8_var == True:
        IO_user_interface_util.timed_alert(
            GUI_util.window, 7000, 'Analysis start',
            'Started running utf8 compliance test at', True)
        file_utf8_compliance_util.check_utf8_compliance(
            GUI_util.window, inputFilename, inputDir, outputDir,
            openOutputFiles)

    if ASCII_var == True:
        IO_user_interface_util.timed_alert(
            GUI_util.window, 7000, 'Analysis start',
            'Started running characters conversion at', True)
        file_cleaner_util.convert_quotes(GUI_util.window, inputFilename,
                                         inputDir)

    if corpus_statistics_var == True:
        if IO_libraries_util.inputProgramFileCheck(
                'statistics_txt_util.py') == False:
            return

        lemmatize = False
        stopwords = False

        if '*' in corpus_options_menu_var or 'stopwords' in corpus_options_menu_var:
            stopwords = True
        if '*' in corpus_options_menu_var or 'Lemmatize' in corpus_options_menu_var:
            lemmatize = True

        if '*' in corpus_options_menu_var or 'stopwords' in corpus_options_menu_var or 'Lemmatize' in corpus_options_menu_var:
            output = statistics_txt_util.compute_corpus_statistics(
                window, '', inputDir, outputDir, False, createExcelCharts,
                stopwords, lemmatize)
            if output != None:
                filesToOpen.extend(output)

        if '*' in corpus_options_menu_var or 'lines' in corpus_options_menu_var:
            output = statistics_txt_util.read_line(window, '', inputDir,
                                                   outputDir, False,
                                                   createExcelCharts)
            if output != None:
                filesToOpen.extend(output)

    if topics_var == True:
        if topics_Gensim_var == True:
            if IO_libraries_util.inputProgramFileCheck(
                    'topic_modeling_gensim_main.py') == False:
                return
            reminders_util.checkReminder(
                config_filename, ['What is in your corpus - Gensim'],
                'The Gensim topic modeling routine run from here is a reduced version of the script, meant to provide a quick overview of the topics in your corpus.\n\nFor a more in-depth analysis of topics, use the topic modeling scripts for Gensim and Mallet.',
                True)
            routine_options = reminders_util.getReminder_list(config_filename)

            if open_GUI_var == True:
                call("python topic_modeling_gensim_main.py", shell=True)
            else:
                # run with all default values; do not run Mallet
                output = topic_modeling_gensim_util.run_Gensim(
                    GUI_util.window,
                    inputDir,
                    outputDir,
                    num_topics=20,
                    remove_stopwords_var=1,
                    lemmatize=1,
                    nounsOnly=0,
                    run_Mallet=False,
                    openOutputFiles=openOutputFiles,
                    createExcelCharts=createExcelCharts)
                if output != None:
                    filesToOpen.extend(output)

        if topics_Mallet_var == True:
            # def run(inputDir, outputDir, openOutputFiles, createExcelCharts, OptimizeInterval, numTopics):

            if open_GUI_var == True:
                call("python topic_modeling_mallet_main.py", shell=True)
            else:
                # running with default values
                output = topic_modeling_mallet_util.run(
                    inputDir,
                    outputDir,
                    openOutputFiles=openOutputFiles,
                    createExcelCharts=createExcelCharts,
                    OptimizeInterval=True,
                    numTopics=20)
                if output != None:
                    filesToOpen.extend(output)

    nouns_var = False
    verbs_var = False
    dialogues_var = False
    people_organizations_var = False
    gender_var = False
    times_var = False
    locations_var = False
    nature_var = False

    if what_else_var and what_else_menu_var == '*':
        nouns_var = True
        verbs_var = True

    if 'noun' in what_else_menu_var.lower():
        nouns_var = True
    if 'verb' in what_else_menu_var.lower():
        verbs_var = True
    if 'dialogue' in what_else_menu_var.lower():
        dialogues_var = True
    if 'people' in what_else_menu_var.lower():
        people_organizations_var = True
    if 'male' in what_else_menu_var.lower():
        gender_var = True
    if 'time' in what_else_menu_var.lower():
        times_var = True
    if 'location' in what_else_menu_var.lower():
        locations_var = True
    if 'nature' in what_else_menu_var.lower():
        nature_var = True

    if (
            what_else_var and what_else_menu_var == '*'
    ) or nouns_var == True or verbs_var == True or people_organizations_var == True or gender_var == True or dialogues_var == True or times_var == True or locations_var == True:
        if IO_libraries_util.inputProgramFileCheck(
                'Stanford_CoreNLP_annotator_util.py') == False:
            return

        if nouns_var or verbs_var:

            if nouns_var or verbs_var or what_else_menu_var == '*':
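                # NOTE: given the enclosing `if nouns_var or verbs_var:` guard, this
                # condition is always true, so the else branch below (the missing
                # WordNet alert) is unreachable as written.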
                WordNetDir = IO_libraries_util.get_external_software_dir(
                    'whats_in_your_corpus', 'WordNet')
                if WordNetDir == None:
                    return

                annotator = ['POS']
                files = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                    inputFilename, inputDir, outputDir, openOutputFiles,
                    createExcelCharts, annotator, False, memory_var)
                if len(files) > 0:
                    noun_verb = ''
                    if verbs_var == True:
                        inputFilename = files[0]  # Verbs but... double check
                        if "verbs" in inputFilename.lower():
                            noun_verb = 'VERB'
                        else:
                            return
                        output = WordNet_util.ancestor_GoingUP(
                            WordNetDir, inputFilename, outputDir, noun_verb,
                            openOutputFiles, createExcelCharts)
                        if output != None:
                            filesToOpen.extend(output)

                    if nouns_var == True:
                        inputFilename = files[1]  # Nouns but... double check
                        if "nouns" in inputFilename.lower():
                            noun_verb = 'NOUN'
                        else:
                            return
                        output = WordNet_util.ancestor_GoingUP(
                            WordNetDir, inputFilename, outputDir, noun_verb,
                            openOutputFiles, createExcelCharts)
                        if output != None:
                            filesToOpen.extend(output)
            else:
                if (what_else_var and what_else_menu_var == '*'):
                    IO_user_interface_util.timed_alert(
                        GUI_util.window, 4000, 'Missing WordNet',
                        'The analysis of \'what else is in your corpus\' will skip the nouns and verbs classification requiring WordNet and will continue with all other CoreNLP annotators'
                    )

        if what_else_var and what_else_menu_var == '*':
            annotator_list = ['NER', 'gender', 'quote', 'normalized-date']
            NER_list = [
                'PERSON', 'ORGANIZATION', 'CITY', 'STATE_OR_PROVINCE',
                'COUNTRY'
            ]
            output = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename,
                inputDir,
                outputDir,
                openOutputFiles,
                createExcelCharts,
                annotator_list,
                False,
                memory_var,
                NERs=NER_list)
            if output != None:
                filesToOpen.extend(output)

        if people_organizations_var == True:
            annotator = 'NER'
            NER_list = ['PERSON', 'ORGANIZATION']

            output = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename,
                inputDir,
                outputDir,
                openOutputFiles,
                createExcelCharts,
                annotator,
                False,
                memory_var,
                NERs=NER_list)
            if output != None:
                filesToOpen.extend(output)

        if gender_var == True:
            annotator = 'gender'
            output = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, annotator, False, memory_var)
            if output != None:
                filesToOpen.extend(output)

        if dialogues_var == True:
            annotator = 'quote'
            output = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, annotator, False, memory_var)
            if output != None:
                filesToOpen.extend(output)

        if times_var == True:
            annotator = 'normalized-date'
            output = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, annotator, False, memory_var)
            if output != None:
                filesToOpen.extend(output)

        if locations_var == True:
            annotator = 'NER'
            NER_list = ['CITY', 'STATE_OR_PROVINCE', 'COUNTRY']

            output = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename,
                inputDir,
                outputDir,
                openOutputFiles,
                createExcelCharts,
                annotator,
                False,
                memory_var,
                NERs=NER_list)
            if output != None:
                filesToOpen.extend(output)

    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
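
# Truthiness demo for the operator-precedence bug fixed in run() above: a
# non-empty string literal is always truthy, so `'*' or 'stopwords' in s`
# evaluates to '*' regardless of s.
def _truthiness_demo():
    assert ('*' or 'stopwords' in '') == '*'          # always truthy: the bug
    assert ('*' in '' or 'stopwords' in '') is False  # the intended test
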
def language_detection(window, inputFilename, inputDir, outputDir,
                       openOutputFiles, createExcelCharts):

    folderID = 0
    fileID = 0
    filesToOpen = []

    outputFilenameCSV = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', 'lang_detect')
    filesToOpen.append(outputFilenameCSV)

    files = IO_files_util.getFileList(inputFilename, inputDir, '.txt')
    if len(files) == 0:
        return

    if IO_csv_util.openCSVOutputFile(outputFilenameCSV):
        return

    fieldnames = [
        'LANGDETECT', 'Language', 'Probability', 'SPACY', 'Language',
        'Probability', 'LANGID', 'Language', 'Probability', 'Document ID',
        'Document'
    ]

    config_filename = 'file-spell-checker-config.txt'
    reminders_util.checkReminder(
        config_filename, ['Language detection'],
        'Language detection algorithms are very slow. The NLP Suite runs three different algorithms: LANGDETECT, SPACY, and LANGID.\n\nPlease, arm yourself with patience, depending upon the number and size of documents processed.',
        True)

    IO_user_interface_util.timed_alert(
        GUI_util.window, 2000, 'Analysis start',
        'Started running language detection algorithms at', True,
        'You can follow the algorithms in command line.')

    with open(outputFilenameCSV,
              'w',
              encoding='utf-8',
              errors='ignore',
              newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        docErrors_empty = 0
        docErrors_unknown = 0
        filenameSV = ''
        for filename in files:
            fileID = fileID + 1
            head, tail = os.path.split(filename)
            print("Processing file " + str(fileID) + "/" + str(len(files)) +
                  ' ' + tail)
            text = open(filename, 'r', encoding='utf-8',
                        errors='ignore').read()
            if len(text) == 0:
                print(
                    "  The file is empty. It will be discarded from processing."
                )
                docErrors_empty = docErrors_empty + 1
                continue
            # text = opened_file.read()
            # head, tail = os.path.split(filename)
            # head is path, tail is filename
            try:
                value = detect_langs(text)
            except:
                filenameSV = filename  # do not count the same document twice in this and the other algorithms that follow
                docErrors_unknown = docErrors_unknown + 1
                print("  Unknown file read error.")
                continue
            value = str(value[0]).split(':')
            language = value[0]
            probability = value[1]
            print('   LANGDETECT', language, probability)
            # print('   LANGDETECT',value[0],value[1])  # [cs:0.7142840957132709, pl:0.14285810606233737, sk:0.14285779665739756]
            currentLine = ['LANGDETECT', language, probability]

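            # NOTE: loading the spaCy model and adding the language detector anew on
            # every iteration is expensive; both could be hoisted above the for loop,
            # since the pipeline does not change between files.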
            nlp = spacy.load('en_core_web_sm')
            nlp.add_pipe(LanguageDetector(),
                         name='language_detector',
                         last=True)
            try:
                doc = nlp(text)
            except:
                if filename != filenameSV:  # do not count the same document twice in this and the other algorithm that follows
                    docErrors_unknown = docErrors_unknown + 1
                    filenameSV = filename
                print("  Unknown file read error.")
                continue
            value = doc._.language
            language = value['language']
            probability = value['score']
            print(
                '   SPACY', language,
                probability)  # {'language': 'en', 'score': 0.9999978351575265}
            currentLine.extend(['SPACY', language, probability])

            lang_identifier = LanguageIdentifier.from_modelstring(
                model, norm_probs=True)
            try:
                value = lang_identifier.classify(text)
            except:
                if filename != filenameSV:
                    docErrors_unknown = docErrors_unknown + 1
                    filenameSV = filename
                print("  Unknown file read error.")
                continue
            language = value[0]
            probability = value[1]
            print('   LANGID', language,
                  probability)  # ('en', 0.999999999999998)
            print()
            currentLine.extend(['LANGID', language, probability])
            currentLine.extend(
                [fileID,
                 IO_csv_util.dressFilenameForCSVHyperlink(filename)])

            writer = csv.writer(csvfile)
            writer.writerows([currentLine])
            filenameSV = filename
    msg = ''
    if docErrors_empty == 0 and docErrors_unknown == 0:
        msg = str(
            fileID
        ) + ' documents successfully processed for language detection.'
    else:
        if docErrors_empty > 0:
            msg = str(
                fileID
            ) + ' documents processed for language detection.\n  ' + str(
                docErrors_empty) + ' document(s) found empty.'
        if docErrors_unknown > 0:
            if msg != '':
                msg = msg + '\n  ' + str(
                    docErrors_unknown
                ) + ' document(s) read with unknown errors.'
            else:
                msg = str(fileID) + ' documents processed for language detection.\n  ' + \
                      str(docErrors_unknown) + ' document(s) read with unknown errors.'
        mb.showwarning(
            title='File read errors',
            message=msg +
            '\n\nFaulty files are listed in command line/terminal. Please, search for \'File read error\' and inspect each file carefully.'
        )
    if createExcelCharts:
        columns_to_be_plotted = [[1, 1], [4, 4], [7, 7]]
        chart_title = 'Frequency of Languages Detected by 3 Algorithms'
        hover_label = ['LANGDETECT', 'SPACY', 'LANGID']
        inputFilename = outputFilenameCSV
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted,
            inputFilename,
            outputDir,
            outputFileLabel='_bar_chart',
            chart_type_list=["bar"],
            chart_title=chart_title,
            column_xAxis_label_var='Language',
            hover_info_column_list=hover_label,
            count_var=1)
        if Excel_outputFilename != '':
            filesToOpen.append(Excel_outputFilename)

    if openOutputFiles:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
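
# Minimal standalone sketch of the first detector used above, mirroring the
# parsing of detect_langs() output done in language_detection():
from langdetect import detect_langs

def detect_language_sketch(text):
    # detect_langs returns e.g. [en:0.9999]; split the best guess into its parts
    best = str(detect_langs(text)[0]).split(':')
    return best[0], float(best[1])
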
Example #13
def run(inputDir, outputDir, openOutputFiles, createExcelCharts, n_grams_var,
        n_grams_menu_var, n_grams_list, n_grams_viewer_var, CoOcc_Viewer_var,
        search_words, date_options, temporal_aggregation, date_format,
        date_separator_var, date_position_var, viewer_list):
    # print(date_options, temporal_aggregation, date_format, date_separator_var, date_position_var)
    filesToOpen = []

    total_file_number = 0
    error_file_number = 0
    error_filenames = []
    error_flag = False

    if n_grams_var == False and n_grams_viewer_var == False and CoOcc_Viewer_var == False:
        mb.showwarning(
            title='Warning',
            message=
            'There are no options selected.\n\nPlease, select one of the available options and try again.'
        )
        return
    if date_options:
        new_date_format = date_format.replace('yyyy', '%Y').replace(
            'mm', '%m').replace('dd', '%d')
        for folder, subs, files in os.walk(inputDir):
            for filename in files:
                if not filename.endswith('.txt'):
                    continue
                filename = filename.replace('.txt', '')
                total_file_number = total_file_number + 1
                try:
                    date_text = ''
                    date_text = filename.split(date_separator_var)[
                        date_position_var - 1]
                except:  # a file without a date in its filename would otherwise break the loop
                    pass
                try:
                    datetime.datetime.strptime(date_text, new_date_format)
                except ValueError:
                    error_file_number = error_file_number + 1
                    error_filenames.append(
                        IO_csv_util.dressFilenameForCSVHyperlink(
                            os.path.join(folder, filename + '.txt')))
                    error_flag = True
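
    # Example of the date parsing above: with date_format 'mm-dd-yyyy' the strptime
    # pattern becomes '%m-%d-%Y'; for 'The New York Times_12-18-1899' with
    # separator '_' and date position 2, date_text is '12-18-1899'.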

    if error_flag:
        df = pd.DataFrame(error_filenames,
                          columns=[
                              'File with date not in position ' +
                              str(date_position_var)
                          ])
        error_output = IO_files_util.generate_output_file_name(
            '', inputDir, outputDir, '.csv', 'Date_position_errors_file')
        df.to_csv(error_output, index=False)
        mb.showwarning(
            title='Warning',
            message='There are ' + str(error_file_number) + ' files out of ' +
            str(total_file_number) +
            ' processed in the selected input directory with errors in either the date format or the date position.\n\nThe selected date format is '
            + str(date_format) + ' and the selected date position is ' +
            str(date_position_var) +
            '.\n\nClick OK to open a csv file with a list of files with erroneous dates. Check both the date format and the date position carefully. Any erroneous file will need to be fixed or removed from the input directory before processing. You may also simply need to select a different date format and/or date position.'
        )
        filesToOpen.append(error_output)
        if openOutputFiles == True:
            IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                          filesToOpen)
        return

# COMPUTE Ngrams ______________________________________________________________________________

    if n_grams_var:
        n_grams_word_var = False
        n_grams_character_var = False
        normalize = False
        n_grams_size = 4  # default n-gram size
        excludePunctuation = False
        bySentenceIndex_word_var = False
        bySentenceIndex_character_var = False
        if n_grams_menu_var == "Word":
            n_grams_word_var = True
        else:
            n_grams_character_var = True
        if 'Hapax' in str(n_grams_list):
            n_grams_size = 1
        if 'punctuation' in str(n_grams_list):
            excludePunctuation = True
        if 'sentence index' in str(n_grams_list):
            if n_grams_menu_var == "Word":
                bySentenceIndex_word_var = True
            else:
                bySentenceIndex_character_var = True

        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'N-Grams start',
            'Started running ' + n_grams_menu_var + ' n-grams at', True,
            'You can follow the script in command line.')

        if n_grams_word_var or n_grams_character_var or bySentenceIndex_word_var or bySentenceIndex_character_var:
            inputFilename = ''  # for now we only process a whole directory
            if IO_libraries_util.inputProgramFileCheck(
                    'statistics_txt_util.py') == False:
                return

        if n_grams_word_var or bySentenceIndex_word_var:
            statistics_txt_util.compute_character_word_ngrams(
                GUI_util.window, inputFilename, inputDir, outputDir,
                n_grams_size, normalize, excludePunctuation, 1,
                openOutputFiles, createExcelCharts, bySentenceIndex_word_var)
        if n_grams_character_var or bySentenceIndex_character_var:
            statistics_txt_util.compute_character_word_ngrams(
                GUI_util.window, inputFilename, inputDir, outputDir,
                n_grams_size, normalize, excludePunctuation, 0,
                openOutputFiles, createExcelCharts,
                bySentenceIndex_character_var)
        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'N-Grams end',
            'Finished running ' + n_grams_menu_var + ' n-grams at', True)

# VIEWER ____________________________________________________________________________________________

    if (n_grams_viewer_var == False and CoOcc_Viewer_var == False):
        return

    if (n_grams_viewer_var == True
            or CoOcc_Viewer_var == True) and (createExcelCharts == False):
        mb.showwarning(
            title='Warning',
            message=
            'The checkbox to compute Excel charts is unticked. Since the VIEWER produces Excel charts as output, the script will abort.\n\nPlease, tick the checkbox to produce Excel charts and try again.'
        )
        return

    txtCounter = len(glob.glob1(inputDir, "*.txt"))
    if txtCounter == 0:
        mb.showwarning(
            title='Warning',
            message=
            'There are no files with txt extension in the selected directory.\n\nPlease, select a different directory and try again.'
        )
        return

    if txtCounter == 1:
        mb.showwarning(
            title='Warning',
            message=
            'There is only one file with txt extension in the selected directory. The script requires at least two files.\n\nPlease, select a different directory and try again.'
        )
        return

    if (n_grams_viewer_var or CoOcc_Viewer_var):
        if IO_libraries_util.inputProgramFileCheck(
                'NGrams_CoOccurrences_Viewer.jar') == False:
            return
        errorFound, error_code, system_output = IO_libraries_util.check_java_installation(
            'Ngram/CoOccurrence Viewer')
        if errorFound:
            return

    if ',' in search_words:
        mb.showwarning(
            title='Warning',
            message=
            'Values entered in the search bar should not be comma-separated, but blank-separated (e.g., woman man, and not woman, man).\n\nPlease, check your search bar values and try again.'
        )
        return

    if search_words != '' and n_grams_viewer_var == False and CoOcc_Viewer_var == False:
        mb.showwarning(
            title='Warning',
            message="You have entered the string '" + search_words +
            "' in the Search widget but you have not selected which Viewer you wish to use, Ngram or Co-Occurrence.\n\nPlease, select an option and try again."
        )
        return

    if search_words == '' and (n_grams_viewer_var == True
                               or CoOcc_Viewer_var == True):
        mb.showwarning(
            title='Warning',
            message=
            "You have selected to run a Viewer but you have not entered any search strings in the Search widget.\n\nPlease, enter search values  and try again."
        )
        return

    normalize = False
    scaleData = False
    useLemma = False
    fullInfo = False
    if 'Normalize' in str(viewer_list):
        normalize = True
    if 'Scale' in str(viewer_list):
        scaleData = True
    if 'Lemmatize' in str(viewer_list):
        useLemma = True
    if 'full information' in str(viewer_list):
        fullInfo = True

    cmd = [
        'java', '-jar', 'NGrams_CoOccurrences_Viewer.jar', '-inputFolder',
        inputDir, '-outputFolder', outputDir
    ]

    if (n_grams_viewer_var == 1
            or CoOcc_Viewer_var == 1) and len(search_words) == 0:
        mb.showwarning(
            title='Warning',
            message=
            'No search words have been entered for either N-Grams or words co-occurrences.\n\nPlease, enter the search words and try again.'
        )
        return

    if n_grams_viewer_var == 1 and len(search_words) > 0:
        if date_options == 0:
            mb.showwarning(
                title='Warning',
                message=
                'No Date options selected. The N-Grams routine requires date metadata (i.e., date information embedded in the document filenames, e.g., The New York Times_12-18-1899).\n\nPlease, tick the Date options checkbox, enter the appropriate date options and try again.'
            )
            return
        ngram_list = processSearchWords(search_words)
        ngram_list = ['-checkNGrams'] + ngram_list
        cmd.extend(ngram_list)

    if date_options == 1:
        cmd.extend([
            '-AggregateBy', temporal_aggregation, '-dateFormat', date_format,
            '-datePos',
            str(date_position_var), '-itemsDelim', date_separator_var
        ])

    if CoOcc_Viewer_var == 1 and len(search_words) > 0:
        co_occurrences_list = processSearchWords(search_words)
        co_occurrences_list = ["-checkCoOccurrences"] + co_occurrences_list
        cmd.extend(co_occurrences_list)

    if normalize and n_grams_viewer_var == 1 and len(search_words) > 0:
        cmd.append('-normalize')  # only available for Ngrams
    if scaleData: cmd.append('-scaledata')
    if useLemma: cmd.append('-lemma')
    if fullInfo: cmd.append('-fullInfo')

    IO_user_interface_util.timed_alert(
        GUI_util.window, 3000, 'N-Grams Word Co-Occurrences start',
        'Started running N-Grams Word Co-Occurrences Viewer at', True,
        'Please, be patient. Depending upon the number of documents processed this may take a few minutes.\n\nYou can follow the script in command line.'
    )

    reminders_util.checkReminder(
        config_filename, ['subprocess.call(cmd) error'],
        'subprocess.call(cmd) error\n\nIf the VIEWER you are running exits with an error code about a file not found, most likely your selected INPUT & OUTPUT directory options are too long for Windows to handle.\n\nYou may need to move your input and output folders so as to have a shorter path (e.g., desktop).',
        True)
    print(cmd)
    try:
        subprocess.run(cmd, shell=True)
    except Exception:
        mb.showwarning(
            title='Warning',
            message=
            "The Java viewer script exited with errors. Please, check your command line for a possible error: 'Java' is not recognized as an internal or external command. If that's the case, please install the Java JDK, check the TIPS on Java download and installation, and try again."
        )
        return

    if n_grams_viewer_var == 1 and len(search_words) > 0:
        # this is the output filename generated by the Java script
        n_grams_outputFile = os.path.join(outputDir, 'Searched_N-Grams.csv')
        if IO_files_util.checkFile(n_grams_outputFile, '.csv', True) == False:
            mb.showwarning(
                title='Warning',
                message=
                "The Java viewer script did not produce an N-grams output file.\n\nPlease, check your command line for possible Java errors and try again."
            )
            return

    if CoOcc_Viewer_var == 1 and len(search_words) > 0:
        # this is the output filename generated by the Java script
        co_occurrences_outputFile = os.path.join(outputDir,
                                                 'Searched_CoOccurrences.csv')
        if IO_files_util.checkFile(co_occurrences_outputFile, '.csv',
                                   True) == False:
            mb.showwarning(
                title='Warning',
                message=
                "The Java viewer script did not produce a Co-occurrences output file.\n\nPlease, check your command line for possible Java errors and try again."
            )
            return

    # plot co-occurrences
    if createExcelCharts == True and CoOcc_Viewer_var == 1 and len(
            search_words) > 0:
        xlsxFilename = co_occurrences_outputFile
        filesToOpen.append(co_occurrences_outputFile)
        chartTitle = 'Co-Occurrences Viewer'
        if date_options == 0:
            xAxis = 'Document'
        else:
            xAxis = temporal_aggregation
        hover_label = ['More information']
        if xAxis == 'Document':
            columns_to_be_plotted = [[1, 1]]
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted,
                xlsxFilename,
                outputDir,
                'Co-Occ_viewer',
                chart_type_list=["pie"],
                chart_title=chartTitle,
                column_xAxis_label_var=xAxis,
                hover_info_column_list=hover_label,
                count_var=1)
        else:
            columns_to_be_plotted = [[0, 1]]
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted,
                xlsxFilename,
                outputDir,
                'Co-Occ_viewer',
                chart_type_list=["line"],
                chart_title=chartTitle,
                column_xAxis_label_var=xAxis,
                hover_info_column_list=hover_label)
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

    # plot Ngrams
    if createExcelCharts == True and n_grams_viewer_var == 1 and len(
            search_words) > 0:
        xlsxFilename = n_grams_outputFile
        filesToOpen.append(n_grams_outputFile)
        xAxis = temporal_aggregation
        chartTitle = 'N-Grams Viewer'
        columns_to_be_plotted = []
        for i in range(len(ngram_list) -
                       1):  # iterates through i = 0, 1, 2, ..., n-1
            columns_to_be_plotted.append([0, i + 1])
        hover_label = [
            'Total Word Count of This Group', 'Total Word Count of This Group',
            'Total Word Count of This Group'
        ]
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted,
            xlsxFilename,
            outputDir,
            'n-grams_viewer',
            chart_type_list=["line"],
            chart_title=chartTitle,
            column_xAxis_label_var=xAxis,
            hover_info_column_list=hover_label)
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

    # with both Ngrams and co-occurrences
    if n_grams_viewer_var == 1 and CoOcc_Viewer_var == 1 and len(
            search_words) > 0:
        n_grams_co_occurrences_outputFile = os.path.join(
            outputDir, 'N-Grams_CoOccurrences_Statistics.csv')
        filesToOpen.append(n_grams_co_occurrences_outputFile)
        chartTitle = ''

    IO_user_interface_util.timed_alert(
        GUI_util.window, 3000, 'N-Grams Word Co-Occurrences end',
        'Finished running N-Grams Word Co-Occurrences Viewer at', True)

    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
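
# processSearchWords is used above but not defined in this example. A minimal
# sketch of what such a helper could look like, assuming the search bar holds
# blank-separated terms (a hypothetical reconstruction, not the actual NLP
# Suite helper):
def processSearchWords_sketch(search_words):
    # split on any run of whitespace and drop empty tokens
    return [word for word in search_words.split() if word]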
Example #14
0
def GUI_bottom(config_input_output_options, y_multiplier_integer,
               readMe_command, TIPS_lookup, TIPS_options):
    """
    :type TIPS_options: object
    """
    # No bottom lines (README, TIPS, RUN, QUIT) displayed when opening the license agreement GUI
    if config_filename == 'license-config.txt':
        return
    # IO_options=[]
    reminder_options = []
    video_options = []
    missingIO = ""

    # for those GUIs (e.g., style analysis) that simply
    #   display options for opening more specialized GUIs
    #   do NOT display the next two sets of widgets
    #   since there is no output to display
    if config_input_output_options != [0, 0, 0, 0, 0, 0]:
        # open output csv files widget; defined above since it is used earlier
        open_csv_output_label = tk.Checkbutton(
            window,
            variable=open_csv_output_checkbox,
            onvalue=1,
            offvalue=0,
            command=lambda: trace_checkbox(
                open_csv_output_label, open_csv_output_checkbox,
                "Automatically open output csv file(s)",
                "Do NOT automatically open output csv file(s)"))
        open_csv_output_label.configure(
            text="Automatically open output csv file(s)")
        open_csv_output_label.place(
            x=GUI_IO_util.get_labels_x_coordinate(),
            y=GUI_IO_util.get_basic_y_coordinate() +
            GUI_IO_util.get_y_step() * y_multiplier_integer)
        open_csv_output_checkbox.set(1)

        # create Excel charts widget; defined above since it is used earlier
        create_Excel_chart_output_label = tk.Checkbutton(
            window,
            variable=create_Excel_chart_output_checkbox,
            onvalue=1,
            offvalue=0,
            command=lambda: trace_checkbox(
                create_Excel_chart_output_label,
                create_Excel_chart_output_checkbox,
                "Automatically compute and open Excel charts",
                "Do NOT automatically compute and open Excel charts"))
        create_Excel_chart_output_label.configure(
            text="Automatically compute and open Excel chart(s)")
        create_Excel_chart_output_label.place(
            x=GUI_IO_util.get_labels_x_coordinate() + 380,
            y=GUI_IO_util.get_basic_y_coordinate() +
            GUI_IO_util.get_y_step() * y_multiplier_integer)
        create_Excel_chart_output_checkbox.set(1)

        y_multiplier_integer = y_multiplier_integer + 1

    readme_button = tk.Button(window,
                              text='Read Me',
                              command=readMe_command,
                              width=10,
                              height=2)
    readme_button.place(x=GUI_IO_util.read_button_x_coordinate,
                        y=GUI_IO_util.get_basic_y_coordinate() +
                        GUI_IO_util.get_y_step() * y_multiplier_integer)

    video_options = ['No videos available']
    videos_dropdown_field.set('Watch videos')
    if video_options[0] == "No videos available":
        videos_menu_lb = tk.OptionMenu(window, videos_dropdown_field,
                                       "No videos available")
    else:
        videos_menu_lb = tk.OptionMenu(window, videos_dropdown_field,
                                       *video_options)
        videos_menu_lb.configure(foreground="red")
    videos_menu_lb.place(x=GUI_IO_util.watch_videos_x_coordinate,
                         y=GUI_IO_util.get_basic_y_coordinate() +
                         GUI_IO_util.get_y_step() * y_multiplier_integer)

    tips_dropdown_field.set('Open TIPS files')
    if len(TIPS_lookup) == 1:
        tips_menu_lb = tk.OptionMenu(window, tips_dropdown_field,
                                     TIPS_options)
        if TIPS_options != "No TIPS available":
            tips_menu_lb.configure(foreground="red")
    else:
        tips_menu_lb = tk.OptionMenu(window, tips_dropdown_field,
                                     *TIPS_options)
        tips_menu_lb.configure(foreground="red")
    tips_menu_lb.place(x=GUI_IO_util.open_TIPS_x_coordinate,
                       y=GUI_IO_util.get_basic_y_coordinate() +
                       GUI_IO_util.get_y_step() * y_multiplier_integer)

    TIPS_util.trace_open_tips(tips_dropdown_field, tips_menu_lb, TIPS_lookup)

    routine = config_filename[:-len('-config.txt')]
    # get the list of titles available for a given GUI
    reminder_options = reminders_util.getReminder_list(config_filename, True)
    # None is returned for a faulty reminders.csv
    reminders_error = False
    if reminder_options is None:
        reminders_error = True
        reminder_options = ["No Reminders available"]

    # reminders content for specific GUIs is set in the reminders csv file
    # called from any GUI
    reminders_dropdown_field.set('Open reminders')
    if len(reminder_options) == 0:
        reminder_options = ["No Reminders available"]
    reminders_menu_lb = tk.OptionMenu(window, reminders_dropdown_field,
                                      *reminder_options)
    if reminder_options != ["No Reminders available"]:
        reminders_menu_lb.configure(foreground="red")
    reminders_menu_lb.place(x=GUI_IO_util.open_reminders_x_coordinate,
                            y=GUI_IO_util.get_basic_y_coordinate() +
                            GUI_IO_util.get_y_step() * y_multiplier_integer)

    def trace_reminders_dropdown(*args):
        if len(reminder_options) > 0:
            reminders_util.resetReminder(config_filename,
                                         reminders_dropdown_field.get())

    reminders_dropdown_field.trace('w', trace_reminders_dropdown)

    # get_help_button_x_coordinate()+700
    run_button.place(x=GUI_IO_util.run_button_x_coordinate,
                     y=GUI_IO_util.get_basic_y_coordinate() +
                     GUI_IO_util.get_y_step() * y_multiplier_integer)

    def _close_window():
        configArray = \
        config_util.setup_IO_configArray(window, config_input_output_options, select_softwareDir_button, softwareDir,
                                         select_input_file_button, inputFilename, select_input_main_dir_button,
                                         input_main_dir_path, select_input_secondary_dir_button,
                                         input_secondary_dir_path,
                                         select_output_file_button, outputFilename, select_output_dir_button,
                                         output_dir_path)[0]

        GUI_IO_util.exit_window(window, config_filename, configArray)

    # quit_button = tk.Button(window, text='QUIT', width=10,height=2, command=lambda: GUI_IO_util.exit_window(window,config_filename,configArray))
    quit_button = tk.Button(window,
                            text='QUIT',
                            width=10,
                            height=2,
                            command=lambda: _close_window())
    # get_help_button_x_coordinate()+820
    quit_button.place(x=GUI_IO_util.quit_button_x_coordinate,
                      y=GUI_IO_util.get_basic_y_coordinate() +
                      GUI_IO_util.get_y_step() * y_multiplier_integer)

    # Any message should be displayed after the whole GUI has been displayed

    if noLicenceError == True:
        mb.showwarning(
            title='Fatal error',
            message=
            "The licence agreement file 'LICENSE-NLP-1.0.txt' could not be found in the 'lib' subdirectory of your main NLP Suite directory\n"
            + GUI_IO_util.NLPPath +
            "\n\nPlease, make sure to copy this file in the 'lib' subdirectory.\n\nThe NLP Suite will now exit."
        )
        sys.exit()

    if IO_options[0] == "EMPTY LINE":  # INPUT software directory
        softwareDir.set('')
    else:
        softwareDir.set(
            config_util.checkConfigDirExists(config_filename, IO_options[0],
                                             'INPUT'))

    if IO_options[1] == "EMPTY LINE":  # INPUT filename
        inputFilename.set('')
    else:
        inputFilename.set(
            config_util.checkConfigFileExists(config_filename, IO_options[1],
                                              'INPUT'))

    if IO_options[2] == "EMPTY LINE":  # INPUT main directory
        input_main_dir_path.set('')
    else:
        input_main_dir_path.set(
            config_util.checkConfigDirExists(config_filename, IO_options[2],
                                             'INPUT'))

    if IO_options[3] == "EMPTY LINE":  # INPUT secondary directory
        input_secondary_dir_path.set('')
    else:
        input_secondary_dir_path.set(
            config_util.checkConfigDirExists(config_filename, IO_options[3],
                                             'INPUT'))

    if IO_options[4] == "EMPTY LINE":  # OUTPUT file name
        outputFilename.set('')
    else:
        outputFilename.set(
            config_util.checkConfigFileExists(config_filename, IO_options[4],
                                              'OUTPUT'))

    if IO_options[5] == "EMPTY LINE":  # OUTPUT directory
        output_dir_path.set('')
    else:
        output_dir_path.set(
            config_util.checkConfigDirExists(config_filename, IO_options[5],
                                             'OUTPUT'))

    # set the state (enabled/disabled) of the RUN button
    #   depending upon IO widgets; no IO info, RUN disabled
    configArray, missingIO = config_util.setup_IO_configArray(
        window, config_input_output_options, select_softwareDir_button,
        softwareDir, select_input_file_button, inputFilename,
        select_input_main_dir_button, input_main_dir_path,
        select_input_secondary_dir_button, input_secondary_dir_path,
        select_output_file_button, outputFilename, select_output_dir_button,
        output_dir_path)
    run_button_state = GUI_IO_util.check_missingIO(window, missingIO,
                                                   config_filename)
    run_button.configure(state=run_button_state)

    if ('GUI front end' not in reminder_options) and (configArray == [
            'EMPTY LINE', 'EMPTY LINE', 'EMPTY LINE', 'EMPTY LINE',
            'EMPTY LINE', 'EMPTY LINE'
    ]):
        # reminders_util.No_IO_reminder(config_filename)
        reminder_options = ['GUI front end']
        message = 'The current GUI is a convenient front end that displays all the options available for the GUI.\n\nNo Input/Output options are displayed in this GUI since any selected option, when RUN, will open a specialized GUI with its own Input/Output requirements.'
        # recompute the options since a new line has been added
    else:
        message = ''

    # this will now display the error message
    if reminders_error == True:
        reminders_util.checkReminder(config_filename, reminder_options,
                                     message)

    window.protocol("WM_DELETE_WINDOW", _close_window)
Example #15
0
def display_reminder(*args):
    if best_topic_estimation_var.get():
        reminders_util.checkReminder(config_filename,
                                     ['Best topic estimation'],
                                     'The function that estimates the best topics is VERY slow and may take an hour or longer. You can follow its progress in command line.',
                                     True)
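
# checkReminder implements the NLP Suite's config-driven, show-once warning
# pattern used throughout these examples. A minimal sketch of the general
# idea, assuming reminders already shown are recorded in a csv keyed by
# config file and reminder title (a hypothetical sketch, not the actual
# reminders_util implementation):
import csv
import os
import tkinter.messagebox as mb

def check_reminder_sketch(config_filename, titles, message,
                          reminders_file='reminders.csv'):
    shown = set()
    if os.path.isfile(reminders_file):
        with open(reminders_file, newline='', encoding='utf-8') as f:
            shown = {(row[0], row[1]) for row in csv.reader(f) if len(row) >= 2}
    for title in titles:
        # display each reminder only once per GUI/config file
        if (config_filename, title) not in shown:
            mb.showwarning(title=title, message=message)
            with open(reminders_file, 'a', newline='', encoding='utf-8') as f:
                csv.writer(f).writerow([config_filename, title])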
Example #16
0
def run(inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts, sentimentAnalysis, sentimentAnalysisMethod, memory_var, corpus_analysis,
        hierarchical_clustering, SVD, NMF, best_topic_estimation):

# check all IO options ---------------------------------------------------------------------------

    if sentimentAnalysis==False and corpus_analysis==False and hierarchical_clustering==False and SVD==False and NMF==False and best_topic_estimation==False:
        mb.showwarning(title='Option selection error',
                       message='No options have been selected.\n\nPlease, select an option and try again.')
        return

    # check if "Shape of Stories" default output directory exists
    sosDir = os.path.join(outputDir, "Shape of Stories")
    if not os.path.exists(sosDir):
        os.mkdir(sosDir)


    tail = ''
    if inputFilename!='':
        sentiment_scores_input = inputFilename  # INPUT
        head, tail = os.path.split(sentiment_scores_input)
        outputDir = os.path.join(sosDir, os.path.basename(head))
    elif inputDir!='':
        sentiment_scores_input = inputDir  # INPUT
        head, tail = os.path.split(sentiment_scores_input)
        outputDir = os.path.join(sosDir, tail)

    # check that the specific default directory exists under "Shape of Stories"
    if not os.path.exists(outputDir):
        os.mkdir(outputDir)
    if GUI_util.output_dir_path.get()!=outputDir:
        # outputDir = head
        GUI_util.output_dir_path.set(outputDir)
        title_options = ['Output directory']
        message = 'The output directory was changed to:\n\n'+str(outputDir)
        reminders_util.checkReminder(config_filename,
                                     title_options,
                                     message,
                                     True)

    if inputDir=='' and inputFilename!='':
        if sentimentAnalysis == True:
            mb.showwarning(title='Input folder error',
                           message='The selected option requires as input a set of txt files for which to compute sentiment scores.\n\nPlease, use the IO widget \'Select INPUT files directory\' to select the appropriate directory and try again.')
            return
        if corpus_analysis == True:
            mb.showwarning(title='Input folder error',
                           message='The selected option requires as input a set of txt files for which to compute corpus statistics.\n\nPlease, use the IO widget \'Select INPUT files directory\' to select the appropriate directory and try again.')
            return

    if inputFilename!='':
        # get the headers to check that it is a sentiment score file
        str2 = ' '.join(IO_csv_util.get_csvfile_headers(inputFilename))
        if not('Document' in str2 and 'Sentence' in str2 and 'Sentiment' in str2):
            mb.showwarning(title='Input file error',
                           message='The selected file is not a file of sentiment scores.\n\nPlease, use the IO widget \'Select INPUT csv file\' to select the appropriate csv file containing sentiment scores and try again.')
            return

        computeSAScores = False

        nSAscoreFiles = IO_csv_util.GetNumberOfDocumentsInCSVfile(inputFilename,'Shape of Stories')
        if nSAscoreFiles == None:
            return
        if nSAscoreFiles < 50:
            answer = mb.askyesno("Data warning: Data reduction algorithms",
                                 message="Data reduction algorithms (Hierarchical Clustering, Singular Value Decomposition, Non-Negative Matrix Factorization) require in input a csv file of sentiment analysis scores for a large number of documents (at least 50). The selected input file\n\n" + inputFilename + "\n\ncontains only " + str(
                                     nSAscoreFiles) + " files. TOO FEW!\n\nYou REALLY should select a different csv file and try again.\n\nAre you sure you want to continue?")
            if answer == False:
                return
    else: # inputDir
        if sentimentAnalysis == True or corpus_analysis == True:
            nSAscoreFiles = IO_files_util.GetNumberOfDocumentsInDirectory(inputDir, 'txt')
            if nSAscoreFiles == 0:
                mb.showwarning(title="Directory error",
                               message="Sentiment Analysis and Corpus Statistics algorithms require as input a LARGE set of txt files for which to compute sentiment scores and/or compute corpus statistics. The selected input directory\n\n" + inputDir + "\n\ndoes not contain any txt files.\n\nPlease, select a different directory (or untick the checkboxes 'Sentiment Analysis' and/or 'Compute & visualize corpus statistics') and try again.")
                return
            if nSAscoreFiles < 50 and sentimentAnalysis == True:
                answer = mb.askyesno("Directory error",
                                     message="Data reduction algorithms (Hierarchical Clustering, Singular Value Decomposition, Non-Negative Matrix Factorization) require as input a LARGE set of txt files. The selected input directory\n\n" + inputDir + "\n\ncontains only " + str(
                                         nSAscoreFiles) + " txt files from which to compute sentiment scores. TOO FEW!\n\nYou REALLY should select a different directory (or untick the checkbox 'Sentiment Analysis') and try again.\n\nAre you sure you want to continue?")
                if answer == False:
                    return
        if not(sentimentAnalysis) and (hierarchical_clustering or SVD or NMF or best_topic_estimation):
            nSAscoreFiles = IO_files_util.GetNumberOfDocumentsInDirectory(inputDir, 'csv')
            if nSAscoreFiles == 0:
                mb.showwarning(title="Directory error",
                               message="Data reduction algorithms (Hierarchical Clustering, Singular Value Decomposition, Non-Negative Matrix Factorization) require as input a LARGE set of csv files. The selected input directory\n\n" + inputDir + "\n\ndoes not contain any csv files.\n\nPlease, select a different directory (or untick the checkboxes 'Hierarchical Clustering', 'Singular Value Decomposition', 'Non-Negative Matrix Factorization') and try again.")
                return
            elif nSAscoreFiles < 50:
                answer = mb.askyesno("Data reduction algorithms",
                                     message="Data reduction algorithms (Hierarchical Clustering, Singular Value Decomposition, Non-Negative Matrix Factorization) require as input a LARGE set of csv files of sentiment scores. The selected input directory\n\n" + inputDir + "\n\ncontains only " + str(
                                         nSAscoreFiles) + " csv files. TOO FEW!\n\nYou REALLY should select a different directory and try again.\n\nAre you sure you want to continue?")
                if answer == False:
                    return


# RUN SCRIPTS ---------------------------------------------------------------------------

    filesToOpen = []

    # utf.check_utf8_compliance(GUI_util.window, "", inputDir, outputDir, openOutputFiles)
    IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis start',
                        'Started running Shape of Stories at', True)

    # check corpus statistics
    if corpus_analysis:
        statistics_txt_util.compute_corpus_statistics(GUI_util.window, inputDir, inputDir, outputDir, openOutputFiles,
                                                      True)
    # step 1: run sentiment analysis
    if sentimentAnalysis == 1:
        # run appropriate sentiment analysis method as indicated by sentimentAnalysisMethod
        if sentimentAnalysisMethod == "Stanford CoreNLP Neural Network":
            title_options = ['Stanford CoreNLP Neural Network']
            message = 'The Stanford CoreNLP Neural Network approach to Sentiment analysis, like all neural network algorithms, is VERY slow. On a few hundred stories it may take hours to run.\n\nAlso, neural network algorithms are memory hogs. MAKE SURE TO ALLOCATE AS MUCH MEMORY AS YOU CAN AFFORD ON YOUR MACHINE.'
            reminders_util.checkReminder(config_filename,
                                         title_options,
                                         message,
                                         True)

            # TODO any changes in the way the CoreNLP_annotator generates output filenames will need to be edited here
            outputFilename = 'NLP_CoreNLP_sentiment_Dir_'+tail + '.csv'

            if os.path.isfile(os.path.join(outputDir,outputFilename)):
                computeSAScores=mb.askyesno("Sentiment Analysis","You have selected to run sentiment analysis on your corpus. But there already exists a csv file of sentiment scores for this corpus saved in the default output directory:\n\n"+outputFilename+"\n\nAre you sure you want to recompute the scores?")
                if not computeSAScores:
                    return
            tempOutputfile=Stanford_CoreNLP_annotator_util.CoreNLP_annotate('', inputDir, outputDir, openOutputFiles, createExcelCharts,'sentiment',False, memory_var)
            if tempOutputfile==None:
                return
            sentiment_scores_input=tempOutputfile[0]
        else:
            mb.showwarning(title="Sentiment Analysis Method not available", message=sentimentAnalysisMethod + " is not currently available. The only available option is the \'Stanford CoreNLP neural network\' method. Sorry!")
            return

    if hierarchical_clustering or SVD or NMF or best_topic_estimation:

        # step 2: vectorize
        # the sentiment_scores_input can either be a single merged csv file or a directory with multiple SA scores files

        vectz = vec.Vectorizer(sentiment_scores_input)

        # pop up window
        # window size

        val = GUI_IO_util.slider_widget(GUI_util.window,"Please, select the value for window size. Window size is the number of sentences "
                                 + "that will be averaged to obtain one point of the story arc. The recommend value is " + str(vectz.window_size)
                     + ".", 1, vectz.min_doc_len - 1, vectz.window_size)
        vectz.window_size = val

        # sentiment_vector_size
        val = GUI_IO_util.slider_widget(GUI_util.window,"Please, select the value for sentiment vector size. Sentiment vector size is the number of values "
                                 + "that each document will be represented with. The recommend value is " + str(vectz.ideal_sent_v_size)
                     + ".", 1, vectz.min_doc_len, vectz.ideal_sent_v_size)

        vectz.sentiment_vector_size = val

        sentiment_vectors, file_list, scoresFile_list = vectz.vectorize()

        rec_n_clusters = vectz.compute_suggested_n_clusters(sentiment_vectors)
        if rec_n_clusters==None:
            return

        # visualize a Principal Component Analysis (PCA) scatter plot of sentiment scores
        PCAFilename=viz.visualize_sentiment_arcs(sentiment_vectors, outputDir)
        filesToOpen.append(PCAFilename)

        # number of clusters
        val = GUI_IO_util.slider_widget(GUI_util.window,"Please, select the value for number of clusters (modes). The recommend value is " + str(
                         rec_n_clusters)
                              + ".", 1, vectz.sentiment_vector_size, rec_n_clusters)
        rec_n_clusters = val

    # hierarchical clustering
    if hierarchical_clustering:
        hier = cl.Clustering(rec_n_clusters)

        DendrogramFilename, grouped_vectors, clusters_indices, vectors = hier.cluster(sentiment_vectors, outputDir)
        filesToOpen.append(DendrogramFilename)
        sentiment_vectors = vectors
        clusters_file = cl.processCluster(clusters_indices, scoresFile_list,file_list, sentiment_vectors, rec_n_clusters, os.path.join(outputDir, "Hierarchical Clustering Documents.csv"), inputDir)
        vis = viz.Visualizer(outputDir)
        vis.visualize_clusters(grouped_vectors, "Hierarchical Clustering (HC)", "HC", clusters_file)
        for i in range(rec_n_clusters):
            filesToOpen.append(os.path.join(outputDir, "HC_Cluster_" + str(i + 1) + ".png"))
            filesToOpen.append(os.path.join(outputDir, "HC_Cluster_" + str(i + 1) + "_subplot.png"))
        filesToOpen.append(os.path.join(outputDir, "Hierarchical Clustering Documents.csv"))

    # svd
    if SVD:
        svd = cl.SVDClustering(rec_n_clusters)
        pos_vector_clusters, pos_clusters_indices, pos_modes, neg_vector_clusters, neg_clusters_indices, neg_modes = \
            svd.cluster(sentiment_vectors)
        clusters_file = cl.processCluster(pos_clusters_indices,scoresFile_list, file_list, sentiment_vectors, rec_n_clusters,
                       os.path.join(outputDir, "SVD Positive Documents.csv"), inputDir)
        vis = viz.Visualizer(outputDir)
        vis.visualize_clusters(pos_vector_clusters, "Singular Value Decomposition Positive (SVD Positive)", "SVDPositive",
                               clusters_file, modes=pos_modes)
        clusters_file = cl.processCluster(neg_clusters_indices, scoresFile_list,file_list, sentiment_vectors, rec_n_clusters,
                       os.path.join(outputDir, "SVD Negative Documents.csv"), inputDir)
        vis = viz.Visualizer(outputDir)
        vis.visualize_clusters(neg_vector_clusters, "Singular Value Decomposition Negative (SVD Negative)", "SVDNegative",
                               clusters_file, modes=neg_modes)
        for i in range(rec_n_clusters):
            filesToOpen.append(os.path.join(outputDir, "SVD_Positive_Cluster_" + str(i + 1) + ".png"))
        for i in range(rec_n_clusters):
            filesToOpen.append(os.path.join(outputDir, "SVD_Negative_Cluster_" + str(i + 1) + ".png"))
        filesToOpen.append(os.path.join(outputDir, "SVD Positive Documents.csv"))
        filesToOpen.append(os.path.join(outputDir, "SVD Negative Documents.csv"))

    # NMF
    if NMF:
        nmf = cl.NMFClustering(rec_n_clusters)
        grouped_vectors, clusters_indices, vectors = nmf.cluster(sentiment_vectors)
        sentiment_vectors = vectors
        clusters_file = cl.processCluster(clusters_indices, scoresFile_list,file_list, sentiment_vectors, rec_n_clusters,
                       os.path.join(outputDir, "NMF Documents.csv"), inputDir)
        vis = viz.Visualizer(outputDir)
        vis.visualize_clusters(grouped_vectors, "Non-negative Matrix Factorization (NMF)", "NMF", clusters_file)
        for i in range(rec_n_clusters):
            filesToOpen.append(os.path.join(outputDir, "NMF_Cluster_" + str(i + 1) + ".png"))
            filesToOpen.append(os.path.join(outputDir, "NMF_Cluster_" + str(i + 1) + "_subplot.png"))
        filesToOpen.append(os.path.join(outputDir, "NMF Documents.csv"))

    # best topic estimate
    if best_topic_estimation:
        IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis start',
                                           'Started running estimate_best_k at', True,'You can follow the progress bar in command line.')
        filesToOpen = cl.estimate_best_k(sentiment_vectors, outputDir, filesToOpen)
        IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis end',
                            'Finished running estimate_best_k at', True)

    IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis end',
                        'Finished running Shape of Stories at', True)

    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
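
# The two sliders above control how the Vectorizer turns sentence-level
# sentiment scores into a fixed-length story arc: window size sets how many
# consecutive sentence scores are averaged into one arc point, and sentiment
# vector size sets how many points represent each document. A minimal sketch
# of that idea (a hypothetical illustration, not the actual vec.Vectorizer
# code):
def sentiment_arc_sketch(sentence_scores, window_size, vector_size):
    # average each window of consecutive sentence scores into one arc point
    points = [sum(sentence_scores[i:i + window_size]) / window_size
              for i in range(len(sentence_scores) - window_size + 1)]
    # downsample the arc to (at most) the requested fixed vector size
    step = max(1, len(points) // vector_size)
    return points[::step][:vector_size]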
def language_detection(window, inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts):

    IO_user_interface_util.timed_alert(GUI_util.window, 1000, 'Analysis start',
                                       'Started running Language Detection at', True)

    folderID = 0
    fileID = 0
    filesToOpen=[]

    outputFilenameCSV=IO_files_util.generate_output_file_name(inputFilename, inputDir, outputDir, '.csv', 'lang_detect')
    filesToOpen.append(outputFilenameCSV)

    files=IO_files_util.getFileList(inputFilename, inputDir, '.txt')
    if len(files) == 0:
        return

    if IO_csv_util.openCSVOutputFile(outputFilenameCSV):
        return

    # the repeated 'Language'/'Probability' names are intentional (one pair per
    #   detector); DictWriter is only used to write this header row
    fieldnames = ['LANGDETECT',
                  'Language',
                  'Probability',
                  'SPACY',
                  'Language',
                  'Probability',
                  'LANGID',
                  'Language',
                  'Probability',
                  'Document ID',
                  'Document']

    config_filename = 'file-spell-checker-config.txt'
    reminders_util.checkReminder(config_filename,
                                 ['Language detection'],
                                 'Language detection algorithms are very slow. The NLP Suite runs three different types of algorithms: LANGDETECT, SPACY, and LANGID.\n\nPlease, arm yourself with patience, depending upon the number and size of documents processed.',
                                 True)

    IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis start',
                                       'Started running language detection algorithms at', True,
                                       'You can follow the algorithms in command line.')

    with open(outputFilenameCSV, 'w', encoding='utf-8', errors='ignore', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        docErrors_empty=0
        docErrors_unknown=0
        filenameSV=''
        for filename in files:
            fileID = fileID + 1
            head, tail = os.path.split(filename)
            print("Processing file " + str(fileID) + "/" + str(len(files)) + ' ' + tail)
            text = open(filename, 'r', encoding='utf-8', errors='ignore').read()
            if len(text)==0:
                print("  The file is empty. It will be discarded from processing.")
                docErrors_empty=docErrors_empty+1
                continue
            # text = opened_file.read()
            # head, tail = os.path.split(filename)
            # head is path, tail is filename
            try:
                value = detect_langs(text)
            except Exception:
                filenameSV=filename # do not count the same document twice in this and the other algorithms that follow
                docErrors_unknown=docErrors_unknown+1
                print("  Unknown file read error.")
                continue
            value=str(value[0]).split(':')
            language=value[0]
            probability=value[1]
            # https://pypi.org/project/langdetect/
            # langdetect supports 55 languages out of the box (ISO 639-1 codes)
            # af, ar, bg, bn, ca, cs, cy, da, de, el, en, es, et, fa, fi, fr, gu, he,
            # hi, hr, hu, id, it, ja, kn, ko, lt, lv, mk, ml, mr, ne, nl, no, pa, pl,
            # pt, ro, ru, sk, sl, so, sq, sv, sw, ta, te, th, tl, tr, uk, ur, vi, zh-cn, zh-tw
            # ISO codes https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
            print('   LANGDETECT', language, probability)
            # print('   LANGDETECT',value[0],value[1])  # [cs:0.7142840957132709, pl:0.14285810606233737, sk:0.14285779665739756]
            currentLine = ['LANGDETECT', language, probability]

            # note: the spaCy model is reloaded here for every document; it could
            #   be loaded once before the loop (see the sketch after this function)
            nlp = spacy.load('en_core_web_sm')
            nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
            try:
                doc = nlp(text)
            except Exception:
                if filename!=filenameSV: # do not count the same document twice in this and the other algorithm that follows
                    docErrors_unknown = docErrors_unknown + 1
                    filenameSV=filename
                print("  Unknown file read error.")
                continue
            value = doc._.language
            language=value['language']
            probability=value['score']
            #
            print('   SPACY', language, probability)  # {'language': 'en', 'score': 0.9999978351575265}
            currentLine.extend(['SPACY', language, probability])

            # note: the LANGID identifier is likewise re-created for every
            #   document and could be hoisted out of the loop
            lang_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
            try:
                value=lang_identifier.classify(text)
            except Exception:
                if filename!=filenameSV:
                    docErrors_unknown = docErrors_unknown + 1
                    filenameSV=filename
                print("  Unknown file read error.")
                continue
            language=value[0]
            probability=value[1]
            # LANGID ``langid.py`` comes pre-trained on 97 languages (ISO 639-1 codes given)
            # https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes for ISO codes
            # https://pypi.org/project/langid/1.1.5/
            # af, am, an, ar, as, az, be, bg, bn, br,
            # bs, ca, cs, cy, da, de, dz, el, en, eo,
            # es, et, eu, fa, fi, fo, fr, ga, gl, gu,
            # he, hi, hr, ht, hu, hy, id, is, it, ja,
            # jv, ka, kk, km, kn, ko, ku, ky, la, lb,
            # lo, lt, lv, mg, mk, ml, mn, mr, ms, mt,
            # nb, ne, nl, nn, no, oc, or, pa, pl, ps,
            # pt, qu, ro, ru, rw, se, si, sk, sl, sq,
            # sr, sv, sw, ta, te, th, tl, tr, ug, uk,
            # ur, vi, vo, wa, xh, zh, zu
            print('   LANGID', language, probability)  # ('en', 0.999999999999998)
            print()
            currentLine.extend(['LANGID',  language, probability])
            currentLine.extend([fileID, IO_csv_util.dressFilenameForCSVHyperlink(filename)])

            writer = csv.writer(csvfile)  # plain writer; replaces the DictWriter used for the header
            writer.writerows([currentLine])
            filenameSV=filename
    msg=''
    if docErrors_empty==0 and docErrors_unknown==0:
        msg=str(fileID) + ' documents successfully processed for language detection.'
    else:
        if docErrors_empty>0:
            msg=str(fileID) + ' documents processed for language detection.\n  ' + str(docErrors_empty) + ' document(s) found empty.'
        if docErrors_unknown>0:
            if msg!='':
                msg=msg + '\n  ' + str(docErrors_unknown) + ' document(s) read with unknown errors.'
            else:
                msg = str(fileID) + ' documents processed for language detection.\n  ' + \
                      str(docErrors_unknown) + ' document(s) read with unknown errors.'
        mb.showwarning(title='File read errors',
                message=msg+ '\n\nFaulty files are listed in command line/terminal. Please, search for \'File read error\' and inspect each file carefully.')
    IO_user_interface_util.timed_alert(GUI_util.window, 1000, 'Analysis end',
                                       'Finished running Language Detection at', True,'Languages detected are exported via the ISO 639 two-letter code. ISO 639 is a standardized nomenclature used to classify languages. Check the ISO list at https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes.')
    print('Languages detected are exported via the ISO 639 two-letter code. ISO 639 is a standardized nomenclature used to classify languages. Check the ISO list at https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes.')
    if createExcelCharts:
        columns_to_be_plotted = [[1, 1],[4,4],[7,7]]
        chart_title='Frequency of Languages Detected by 3 Algorithms'
        hover_label=['LANGDETECT', 'SPACY', 'LANGID']
        inputFilename = outputFilenameCSV
        Excel_outputFilename = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir,
                                                  outputFileLabel='_bar_chart',
                                                  chart_type_list=["bar"],
                                                  chart_title=chart_title,
                                                  column_xAxis_label_var='Language',
                                                  hover_info_column_list=hover_label,
                                                  count_var=1)
        if Excel_outputFilename!='':
            filesToOpen.append(Excel_outputFilename)

    if openOutputFiles:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
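
# The three detectors above are created inside the per-file loop, so the
# spaCy model and the LANGID identifier are reloaded for every document.
# A minimal sketch of the hoisted setup, loading each detector once and
# reusing it (a hypothetical refactoring using the same spaCy 2.x
# LanguageDetector pipe assumed by the code above, not the shipped code):
import spacy
from langdetect import detect_langs
from langid.langid import LanguageIdentifier, model
from spacy_langdetect import LanguageDetector

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
lang_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

def detect_all_sketch(text):
    # run all three detectors on one document; return (detector, language, probability)
    results = []
    lang, prob = str(detect_langs(text)[0]).split(':')
    results.append(('LANGDETECT', lang, prob))
    doc = nlp(text)
    results.append(('SPACY', doc._.language['language'], doc._.language['score']))
    lang, prob = lang_identifier.classify(text)
    results.append(('LANGID', lang, prob))
    return results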
Example #18
0
        "Help",
        "Please, using the dropdown menu, select one of the many options available for analyzing your corpus and/or a single document.\n\nTHE TOOLS IN THIS CATEGORY, APPLY TO EITHER MULTIPLE DOCUMENTS (THE 'CORPUS') OR TO A SINGLE DOCUMENT.\n\nIn INPUT the tools expect either multiple documents stored in a directory (the 'corpus') or a single document."
        + GUI_IO_util.msg_Esc)
    GUI_IO_util.place_help_button(
        window, help_button_x_coordinate, basic_y_coordinate + y_step * 9,
        "Help",
        "Please, using the dropdown menu, select one of the many options available for analyzing your corpus/document by sentence index.\n\nTHE TOOLS IN THIS CATEGORY, APPLY TO EITHER MULTIPLE DOCUMENTS (THE 'CORPUS') OR TO A SINGLE DOCUMENT; BUT THEY ALSO PROVIDE SENTENCE-BASED INFORMATION FOR MORE IN-GRAINED ANALYSES.\n\nIn INPUT the tools expect either multiple documents stored in a directory (the 'corpus') or a single document."
        + GUI_IO_util.msg_Esc)


help_buttons(window, GUI_IO_util.get_help_button_x_coordinate(),
             GUI_IO_util.get_basic_y_coordinate(), GUI_IO_util.get_y_step())

# change the value of the readMe_message
readMe_message = "This Python 3 script is the front end for a wide collection of Java and Python Natural Language Processing (NLP) tools.\n\nThe set of tools are divided into GENERAL TOOLS (data and file handling, pre-processing, statistical, visualization) and LINGUISTIC ANALYSIS TOOLS.\n\nLINGUISTIC ANALYSIS TOOLS are divided into tools that expect in input CORPUS DATA (i.e., multiple documents stored in a directory), CORPUS and/or SINGLE DOCUMENT, and SENTENCE.\n\nWhile some linguistic tools are specific for one of these three categories (e.g., topic modeling cannot be performed on a single document), MANY TOOLS OVERLAP. As a result, you may find the same tool under BOTH corpus and corpus/document. SENTENCE TOOLS still require either a corpus or a single document in input; but they also provide in output sentence-level information for more in-grained linguistic analyses.\n\nAll tools are open source freeware software released under the GNU LGPLv2.1 license (http://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html).\n\nYou can cite the NLP Suite as:\n\nR. Franzosi. 2020. NLP Suite: A  set of tools of Natural Language Processing (NLP) & Data Visualization."
readMe_command = lambda: GUI_IO_util.readme_button(
    window, GUI_IO_util.get_help_button_x_coordinate(),
    GUI_IO_util.get_basic_y_coordinate(), "Help", readMe_message)
GUI_util.GUI_bottom(config_input_output_options, y_multiplier_integer,
                    readMe_command, TIPS_lookup, TIPS_options)

if platform == "darwin":
    title_options = ['tkinter MacOS bug']
    message = 'MacOS bug in tkinter (https://www.python.org/download/mac/tcltk/).\n\nPython\'s integrated development environment, IDLE, and the tkinter GUI toolkit it uses, depend on the Tk GUI toolkit, which is not part of Python itself. For best results, it is important that the proper release of Tcl/Tk is installed on your machine. See the current recommendations at https://www.python.org/download/mac/tcltk/.'
    reminders_util.checkReminder(config_filename, title_options, message, True)

# check for software installation
IO_libraries_util.get_external_software_dir('NLP_menu', '')

GUI_util.window.mainloop()