def verb_voice_stats(inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts):
    filesToOpen = []  # Store all files that are to be opened once finished
    # print("\nRun verb voice analysis")
    # IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start', 'Started running VERB VOICE analysis at', True)
    data_prep = verb_voice_data_preparation(data)
    voice_list, voice_stats = voice_output(data_prep, data_divided_sents)

    # output file names
    verb_file_name = IO_files_util.generate_output_file_name(inputFilename, '', outputDir, '.csv',
                                                             'NVA', 'Verb Voice', 'list')
    verb_stats_file_name = IO_files_util.generate_output_file_name(inputFilename, '', outputDir, '.csv',
                                                                   'NVA', 'Verb Voice', 'stats')

    errorFound = IO_csv_util.list_to_csv(GUI_util.window,
                                         IO_CoNLL_util.sort_output_list('Verb Voice', voice_list, documentId_position),
                                         verb_file_name)
    if errorFound:
        return filesToOpen  # the original returned None here, inconsistent with the return below
    filesToOpen.append(verb_file_name)

    errorFound = IO_csv_util.list_to_csv(GUI_util.window, voice_stats, verb_stats_file_name)
    if errorFound:
        return filesToOpen
    filesToOpen.append(verb_stats_file_name)

    if createExcelCharts:
        Excel_outputFilename = Excel_util.create_excel_chart(GUI_util.window,
                                                             data_to_be_plotted=[voice_stats],
                                                             inputFilename=verb_stats_file_name,
                                                             outputDir=outputDir,
                                                             scriptType='Verb_Voice',
                                                             chartTitle="Frequency Distribution of Verb Voice",
                                                             chart_type_list=["pie"],
                                                             column_xAxis_label="Verb voice values",
                                                             column_yAxis_label="Frequency")
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

        # line plots by sentence index
        outputFiles = Excel_util.compute_csv_column_frequencies(GUI_util.window,
                                                                verb_file_name, '', outputDir,
                                                                openOutputFiles, createExcelCharts,
                                                                [[1, 4]],
                                                                ['Verb Voice'], ['FORM', 'Sentence'],
                                                                ['Document ID', 'Sentence ID', 'Document'],
                                                                'NVA', 'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

    # IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end', 'Finished running VERB VOICE analysis at', True)
    return filesToOpen
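# A minimal, hypothetical usage sketch (not part of the original module). It assumes
# the caller has already loaded a CoNLL table into `data` and split it into sentences
# with IO_CoNLL_util.sentence_division; the file and directory paths are placeholders.
#
#   data_divided_sents = IO_CoNLL_util.sentence_division(data)
#   files = verb_voice_stats('myCorpus.csv', 'C:/NLP_output', data, data_divided_sents,
#                            openOutputFiles=False, createExcelCharts=True)
#   # `files` lists the generated outputs (voice list csv, voice stats csv, optional charts)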
def preposition_stats(inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts):
    filesToOpen = []  # Store all files that are to be opened once finished
    # IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start', 'Started running PREPOSITION Analysis at', True)

    # output file names
    function_words_list_file_name = IO_files_util.generate_output_file_name(inputFilename, '', outputDir, '.csv',
                                                                            'FW', 'Prepositions', 'list')
    function_words_stats_file_name = IO_files_util.generate_output_file_name(inputFilename, '', outputDir, '.csv',
                                                                             'FW', 'Prepositions', 'stats')
    # filesToOpen.append(function_words_list_file_name)
    # not necessary to open stats since these stats are included in the pie chart
    # filesToOpen.append(function_words_stats_file_name)

    # data = get_data(inputFilename)
    # data_divided_sents = IO_CoNLL_util.sentence_division(data)

    if 0:  # disabled debug branch
        stats_prepositions(data)
        # IO_util.timed_alert(GUI_util.window, 3000, 'Analysis start', 'Started running PRONOUN Analysis at', True)
    else:
        if not os.path.isdir(outputDir):
            mb.showwarning(title='output file path error',
                           message='Please check OUTPUT DIRECTORY PATH and try again')
            return filesToOpen
        prepositions_list, prepositions_stats = stats_prepositions_output(data, data_divided_sents)

        errorFound = IO_csv_util.list_to_csv(GUI_util.window,
                                             IO_CoNLL_util.sort_output_list('PREPOSITIONS', prepositions_list, documentId_position),
                                             function_words_list_file_name)
        if errorFound:
            return filesToOpen
        filesToOpen.append(function_words_list_file_name)

        errorFound = IO_csv_util.list_to_csv(GUI_util.window, prepositions_stats, function_words_stats_file_name)
        if errorFound:
            return filesToOpen
        filesToOpen.append(function_words_stats_file_name)

        if createExcelCharts:
            Excel_outputFilename = Excel_util.create_excel_chart(GUI_util.window,
                                                                 data_to_be_plotted=[prepositions_stats],
                                                                 inputFilename=function_words_stats_file_name,
                                                                 outputDir=outputDir,
                                                                 scriptType='FuncWords_prep',
                                                                 chartTitle="Preposition Analysis",
                                                                 chart_type_list=["pie"])
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)

            # line plot by sentence index
            outputFiles = Excel_util.compute_csv_column_frequencies(GUI_util.window,
                                                                    function_words_list_file_name, '', outputDir,
                                                                    openOutputFiles, createExcelCharts,
                                                                    [[1, 4]],
                                                                    ['PREPOSITIONS'], ['FORM', 'Sentence'],
                                                                    ['Document ID', 'Sentence ID', 'Document'],
                                                                    'FW', 'line')
            if len(outputFiles) > 0:
                filesToOpen.extend(outputFiles)

    # IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end', 'Finished running PREPOSITION Analysis at', True)
    return filesToOpen
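# Hypothetical call sketch (not in the original source); same inputs as
# verb_voice_stats above, with placeholders for the paths:
#
#   files = preposition_stats('myCorpus.csv', 'C:/NLP_output', data, data_divided_sents,
#                             openOutputFiles=False, createExcelCharts=True)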
def createChart(inputFilename, outputDir, columns_to_be_plotted, hover_label):
    Excel_outputFileName = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir,
                                              outputFileLabel='Leven_spell',
                                              chart_type_list=["pie"],
                                              chart_title='Frequency of Potential Typos',
                                              column_xAxis_label_var='',
                                              hover_info_column_list=hover_label,
                                              count_var=1)
    return Excel_outputFileName
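# Illustrative call (values are assumptions): columns_to_be_plotted takes [x, y] pairs
# of 0-based csv column indices, and hover_label one header name per plotted series.
#
#   chart = createChart('typos.csv', 'C:/NLP_output',
#                       columns_to_be_plotted=[[0, 1]], hover_label=['Document'])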
def Wordnet_bySentenceID(ConnlTable, wordnetDict, outputFilename, outputDir, noun_verb,
                         openOutputFiles, createExcelCharts):
    filesToOpen = []  # was referenced below without ever being defined; initialize here
    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start',
                                       'Started running WordNet charts by sentence index at', True)
    if noun_verb == 'NOUN':
        checklist = ['NN', 'NNP', 'NNPS', 'NNS']
    else:
        checklist = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    # read in the CoreNLP CoNLL table
    connl = pd.read_csv(ConnlTable)
    # read in the dictionary file to be used to filter CoNLL values;
    # the file is expected to have 2 columns with headers: Word, WordNet Category
    wn_dict = pd.read_csv(wordnetDict)  # renamed from `dict` to avoid shadowing the built-in

    # keep only the columns needed from the CoNLL data
    connl = connl[['word', 'lemma', 'postag', 'Sentence ID', 'Document ID', 'Document']]
    # filter the list by noun or verb
    connl = connl[connl['postag'].isin(checklist)]
    # eliminate any duplicate value in Word
    wn_dict = wn_dict.drop_duplicates().rename(columns={'Word': 'lemma',
                                                        'WordNet Category': 'Category'})
    connl = connl.merge(wn_dict, how='left', on='lemma')
    # the CoNLL table value is not found in the dictionary Word value
    connl.fillna('Not in INPUT dictionary for ' + noun_verb, inplace=True)
    # add the WordNet Category to the conll list and order the columns
    connl = connl[['word', 'lemma', 'postag', 'Category', 'Sentence ID', 'Document ID', 'Document']]

    Row_list = []
    # Iterate over each row
    for index, rows in connl.iterrows():
        # Create list for the current row; bracket access is required for column
        # names containing spaces (the original rows.SentenceID would raise an error)
        my_list = [rows['word'], rows['lemma'], rows['postag'], rows['Category'],
                   rows['Sentence ID'], rows['Document ID'], rows['Document']]
        # append the list to the final list
        Row_list.append(my_list)

    # insert empty placeholder rows for any missing sentence IDs
    for index, row in enumerate(Row_list):
        if index == 0 and Row_list[index][4] != 1:
            for i in range(Row_list[index][4] - 1, 0, -1):
                Row_list.insert(0, ['', '', '', '', i, Row_list[index][5], Row_list[index][6]])
        else:
            if index < len(Row_list) - 1 and Row_list[index + 1][4] - Row_list[index][4] > 1:
                for i in range(Row_list[index + 1][4] - 1, Row_list[index][4], -1):
                    Row_list.insert(index + 1,
                                    ['', '', '', '', i, Row_list[index][5], Row_list[index][6]])

    # columns=, not index= as in the original, which would have misaligned the frame
    df = pd.DataFrame(Row_list,
                      columns=['word', 'lemma', 'postag', 'WordNet Category',
                               'Sentence ID', 'Document ID', 'Document'])
    df = Excel_util.add_missing_IDs(df)
    # Row_list.insert(0, ['word','lemma','postag','WordNet Category','SentenceID','DocumentID','Document'])
    # IO_util.list_to_csv('', Row_list, outputFilename)
    df.to_csv(outputFilename, index=False)

    if createExcelCharts:
        outputFiles = Excel_util.compute_csv_column_frequencies(GUI_util.window,
                                                                ConnlTable, df, outputDir,
                                                                openOutputFiles, createExcelCharts,
                                                                [[4, 5]],
                                                                ['WordNet Category'], ['word'],
                                                                ['Document ID', 'Sentence ID', 'Document'],
                                                                'WordNet', 'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                       'Finished running WordNet charts by sentence index at', True)
    return filesToOpen  # was collected but never returned in the original
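# Sketch of the two inputs this function expects (the sample rows are made up):
# `ConnlTable` is a CoreNLP CoNLL csv with at least word, lemma, postag,
# 'Sentence ID', 'Document ID', 'Document'; `wordnetDict` is a 2-column csv:
#
#   Word,WordNet Category
#   dog,animal
#   run,motion
#
#   Wordnet_bySentenceID('conll.csv', 'wordnet_dict.csv', 'out.csv', 'C:/NLP_output',
#                        'NOUN', openOutputFiles=False, createExcelCharts=True)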
def ancestor_GoingUP(WordNetDir, inputFile, outputDir, noun_verb, openOutputFiles, createExcelCharts):
    filesToOpen = []
    if IO_libraries_util.inputProgramFileCheck('WordNet_Search_UP.jar') == False:
        return
    errorFound, error_code, system_output = IO_libraries_util.check_java_installation('WordNet upward search')
    if errorFound:
        return

    IO_user_interface_util.timed_alert(GUI_util.window, 4000, 'Analysis start',
                                       'Started running WordNet (Zoom OUT/UP) at', True,
                                       '\n\nRunning WordNet with the ' + noun_verb + ' option.')

    # the java script produces two files: a list and a frequency
    warning = subprocess.call(['java', '-jar', 'WordNet_Search_UP.jar',
                               '-wordNetPath', os.path.join(WordNetDir, "dict"),
                               '-wordList', inputFile,
                               '-pos', noun_verb,
                               '-outputDir', outputDir])
    if warning == 1:
        # the original embedded the + noun_verb + concatenation inside the string literal
        IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Invalid Input',
            'Wordnet cannot find any word in the input csv file for ' + noun_verb +
            '.\n\nThis error can also occur if any of the files previously generated by WordNet are open. '
            'Please, check your files, close them, and try again.')
        return
    elif warning == 2:
        IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Invalid Input',
            'Some words in your list do not exist in Wordnet for ' + noun_verb +
            '.\n\nPlease, check the list of words in command line.')

    fileName = os.path.basename(inputFile).split(".")[0]
    outputFilenameCSV1 = os.path.join(outputDir, "NLP_WordNet_UP_" + fileName + "_output.csv")
    filesToOpen.append(outputFilenameCSV1)
    outputFilenameCSV2 = os.path.join(outputDir, "NLP_WordNet_UP_" + fileName + "_frequency.csv")
    filesToOpen.append(outputFilenameCSV2)

    if createExcelCharts:
        columns_to_be_plotted = [[1, 1]]
        chart_title = 'Frequency of WordNet Aggregate Categories for ' + noun_verb
        hover_label = ['Word']
        inputFilename = outputFilenameCSV1
        Excel_outputFilename = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir,
                                                  outputFileLabel='_bar_chart',
                                                  chart_type_list=["bar"],
                                                  chart_title=chart_title,
                                                  column_xAxis_label_var='WordNet ' + noun_verb + ' category',
                                                  hover_info_column_list=hover_label,
                                                  count_var=1)
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                       'Finished running WordNet (Zoom OUT/UP) at', True)
    return filesToOpen
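# Hypothetical invocation (paths are placeholders): WordNetDir must contain WordNet's
# "dict" subfolder, inputFile is a csv word list, and noun_verb is 'NOUN' or 'VERB'.
#
#   files = ancestor_GoingUP('C:/WordNet-3.0', 'nouns.csv', 'C:/NLP_output', 'NOUN',
#                            openOutputFiles=False, createExcelCharts=True)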
def run(inputDir, outputDir, openOutputFiles, createExcelCharts,
        n_grams_var, n_grams_menu_var, n_grams_list,
        n_grams_viewer_var, CoOcc_Viewer_var,
        search_words, date_options, temporal_aggregation,
        date_format, date_separator_var, date_position_var, viewer_list):

    # print(date_options, temporal_aggregation, date_format, date_separator_var, date_position_var)
    filesToOpen = []
    total_file_number = 0
    error_file_number = 0
    error_filenames = []
    error_flag = False

    if n_grams_var == False and n_grams_viewer_var == False and CoOcc_Viewer_var == False:
        mb.showwarning(title='Warning',
                       message='There are no options selected.\n\nPlease, select one of the available options and try again.')
        return

    if date_options:
        new_date_format = date_format.replace('yyyy', '%Y').replace('mm', '%m').replace('dd', '%d')
        for folder, subs, files in os.walk(inputDir):
            for filename in files:
                if not filename.endswith('.txt'):
                    continue
                filename = filename.replace('.txt', '')
                total_file_number = total_file_number + 1
                try:
                    date_text = ''
                    date_text = filename.split(date_separator_var)[date_position_var - 1]
                except:
                    # if a file in the folder has no date it would break the code
                    pass
                try:
                    datetime.datetime.strptime(date_text, new_date_format)
                except ValueError:
                    error_file_number = error_file_number + 1
                    error_filenames.append(IO_csv_util.dressFilenameForCSVHyperlink(
                        os.path.join(folder, filename + '.txt')))
                    error_flag = True

        if error_flag:
            df = pd.DataFrame(error_filenames,
                              columns=['File with date not in position ' + str(date_position_var)])
            error_output = IO_files_util.generate_output_file_name('', inputDir, outputDir, '.csv',
                                                                   'Date_position_errors_file')
            df.to_csv(error_output, index=False)
            mb.showwarning(title='Warning',
                           message='There are ' + str(error_file_number) + ' files out of ' + str(total_file_number) +
                                   ' processed in the selected input directory with errors in either the date format or the date position.\n\nThe selected date format is ' +
                                   str(date_format) + ' and the selected date position is ' + str(date_position_var) +
                                   '.\n\nClick OK to open a csv file with a list of files with erroneous dates. Check carefully, both date format and date position. Any erroneous file will need to be fixed or removed from the input directory before processing. You may also simply need to select a different date format and/or date position.')
            filesToOpen.append(error_output)
            if openOutputFiles == True:
                IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
            return

    # COMPUTE Ngrams ______________________________________________________________________________

    if n_grams_var:
        n_grams_word_var = False
        n_grams_character_var = False
        normalize = False
        n_grams_size = 4  # default number of n_grams
        excludePunctuation = False
        bySentenceIndex_word_var = False
        bySentenceIndex_character_var = False
        if n_grams_menu_var == "Word":
            n_grams_word_var = True
        else:
            n_grams_character_var = True
            bySentenceIndex_character_var = False
        if 'Hapax' in str(n_grams_list):
            n_grams_size = 1
        if 'punctuation' in str(n_grams_list):
            excludePunctuation = True
        if 'sentence index' in str(n_grams_list):
            if n_grams_menu_var == "Word":
                bySentenceIndex_word_var = True
            else:
                bySentenceIndex_character_var = True

        IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'N-Grams start',
                                           'Started running ' + n_grams_menu_var + ' n-grams at', True,
                                           'You can follow the script in command line.')
        if n_grams_word_var or n_grams_character_var or bySentenceIndex_word_var or bySentenceIndex_character_var:
            inputFilename = ''  # for now we only process a whole directory
            if IO_libraries_util.inputProgramFileCheck('statistics_txt_util.py') == False:
                return
            if n_grams_word_var or bySentenceIndex_word_var:
                statistics_txt_util.compute_character_word_ngrams(GUI_util.window, inputFilename, inputDir,
                                                                  outputDir, n_grams_size, normalize,
                                                                  excludePunctuation, 1,
                                                                  openOutputFiles, createExcelCharts,
                                                                  bySentenceIndex_word_var)
            if n_grams_character_var or bySentenceIndex_character_var:
                statistics_txt_util.compute_character_word_ngrams(GUI_util.window, inputFilename, inputDir,
                                                                  outputDir, n_grams_size, normalize,
                                                                  excludePunctuation, 0,
                                                                  openOutputFiles, createExcelCharts,
                                                                  bySentenceIndex_character_var)
        IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'N-Grams end',
                                           'Finished running ' + n_grams_menu_var + ' n-grams at', True)

    # VIEWER ____________________________________________________________________________________________

    if (n_grams_viewer_var == False and CoOcc_Viewer_var == False):
        return

    if (n_grams_viewer_var == True or CoOcc_Viewer_var == True) and (createExcelCharts == False):
        mb.showwarning(title='Warning',
                       message='The checkbox to compute Excel charts is unticked. Since the VIEWER produces Excel charts as output, the script will abort.\n\nPlease, tick the checkbox to produce Excel charts and try again.')
        return

    txtCounter = len(glob.glob1(inputDir, "*.txt"))
    if txtCounter == 0:
        mb.showwarning(title='Warning',
                       message='There are no files with txt extension in the selected directory.\n\nPlease, select a different directory and try again.')
        return
    if txtCounter == 1:
        mb.showwarning(title='Warning',
                       message='There is only one file with txt extension in the selected directory. The script requires at least two files.\n\nPlease, select a different directory and try again.')
        return

    if (n_grams_viewer_var or CoOcc_Viewer_var):
        if IO_libraries_util.inputProgramFileCheck('NGrams_CoOccurrences_Viewer.jar') == False:
            return
        errorFound, error_code, system_output = IO_libraries_util.check_java_installation('Ngram/CoOccurrence Viewer')
        if errorFound:
            return

    if ',' in search_words:
        mb.showwarning(title='Warning',
                       message='Values entered in the search bar should not be comma-separated, but blank-separated (e.g., woman man, and not woman, man).\n\nPlease, check your search bar values and try again.')
        return
    if search_words != '' and n_grams_viewer_var == False and CoOcc_Viewer_var == False:
        mb.showwarning(title='Warning',
                       message="You have entered the string '" + search_words +
                               "' in the Search widget but you have not selected which Viewer you wish to use, Ngram or Co-Occurrence.\n\nPlease, select an option and try again.")
        return
    if search_words == '' and (n_grams_viewer_var == True or CoOcc_Viewer_var == True):
        mb.showwarning(title='Warning',
                       message="You have selected to run a Viewer but you have not entered any search strings in the Search widget.\n\nPlease, enter search values and try again.")
        return

    normalize = False
    scaleData = False
    useLemma = False
    fullInfo = False
    if 'Normalize' in str(viewer_list):
        normalize = True
    if 'Scale' in str(viewer_list):
        scaleData = True
    if 'Lemmatize' in str(viewer_list):
        useLemma = True
    if 'full information' in str(viewer_list):
        fullInfo = True

    cmd = ['java', '-jar', 'NGrams_CoOccurrences_Viewer.jar',
           '-inputFolder', inputDir, '-outputFolder', outputDir]

    if (n_grams_viewer_var == 1 or CoOcc_Viewer_var == 1) and len(search_words) == 0:
        mb.showwarning(title='Warning',
                       message='No search words have been entered for either N-Grams or words co-occurrences.\n\nPlease, enter the search words and try again.')
        return

    if n_grams_viewer_var == 1 and len(search_words) > 0:
        if date_options == 0:
            mb.showwarning(title='Warning',
                           message='No Date options selected. The N-Grams routine requires date metadata (i.e., date information embedded in the document filenames, e.g., The New York Times_12-18-1899).\n\nPlease, tick the Date options checkbox, enter the appropriate date options and try again.')
            return
        ngram_list = processSearchWords(search_words)
        ngram_list = ['-checkNGrams'] + ngram_list
        cmd.extend(ngram_list)

    if date_options == 1:
        cmd.extend(['-AggregateBy', temporal_aggregation,
                    '-dateFormat', date_format,
                    '-datePos', str(date_position_var),
                    '-itemsDelim', date_separator_var])

    if CoOcc_Viewer_var == 1 and len(search_words) > 0:
        co_occurrences_list = processSearchWords(search_words)
        co_occurrences_list = ["-checkCoOccurrences"] + co_occurrences_list
        cmd.extend(co_occurrences_list)

    if normalize == 1 and n_grams_viewer_var == 1 and len(search_words) > 0:
        cmd.append('-normalize')  # only available for Ngrams
    if scaleData == 1:
        cmd.append('-scaledata')
    if useLemma == 1:
        cmd.append('-lemma')
    if fullInfo == 1:
        cmd.append('-fullInfo')

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'N-Grams Word Co-Occurrences start',
                                       'Started running N-Grams Word Co-Occurrences Viewer at', True,
                                       'Please, be patient. Depending upon the number of documents processed this may take a few minutes.\n\nYou can follow the script in command line.')

    reminders_util.checkReminder(config_filename,
                                 ['subprocess.call(cmd) error'],
                                 'subprocess.call(cmd) error\n\nIf the VIEWER you are running exits with an error code about a file not found, most likely your selected INPUT & OUTPUT directory options are too long for Windows to handle.\n\nYou may need to move your input and output folders so as to have a shorter path (e.g., desktop).',
                                 True)
    print(cmd)
    try:
        subprocess.run(cmd, shell=True)
    except:
        mb.showwarning(title='Warning',
                       message="The Java viewer script exited with errors. Please, check your command line for a possible error 'Java' is not recognized as an internal or external command. If that's the case, please install Java JDK. Please, check the TIPS on Java download and installation and try again.")
        return

    if n_grams_viewer_var == 1 and len(search_words) > 0:
        # this is the output filename generated by the Java script
        n_grams_outputFile = os.path.join(outputDir, 'Searched_N-Grams.csv')
        if IO_files_util.checkFile(n_grams_outputFile, '.csv', True) == False:
            mb.showwarning(title='Warning',
                           message="The Java viewer script did not produce an N-grams output file.\n\nPlease, check your command line for possible Java errors and try again.")
            return

    if CoOcc_Viewer_var == 1 and len(search_words) > 0:
        # this is the output filename generated by the Java script
        co_occurrences_outputFile = os.path.join(outputDir, 'Searched_CoOccurrences.csv')
        if IO_files_util.checkFile(co_occurrences_outputFile, '.csv', True) == False:
            mb.showwarning(title='Warning',
                           message="The Java viewer script did not produce a Co-occurrences output file.\n\nPlease, check your command line for possible Java errors and try again.")
            return

    # plot co-occurrences
    if createExcelCharts == True and CoOcc_Viewer_var == 1 and len(search_words) > 0:
        xlsxFilename = co_occurrences_outputFile
        filesToOpen.append(co_occurrences_outputFile)
        chartTitle = 'Co-Occurrences Viewer'
        if date_options == 0:
            xAxis = 'Document'
        else:
            xAxis = temporal_aggregation
        hover_label = ['More information']
        if xAxis == 'Document':
            columns_to_be_plotted = [[1, 1]]
            Excel_outputFilename = Excel_util.run_all(columns_to_be_plotted, xlsxFilename, outputDir,
                                                      'Co-Occ_viewer',
                                                      chart_type_list=["pie"],
                                                      chart_title=chartTitle,
                                                      column_xAxis_label_var=xAxis,
                                                      hover_info_column_list=hover_label,
                                                      count_var=1)
        else:
            columns_to_be_plotted = [[0, 1]]
            Excel_outputFilename = Excel_util.run_all(columns_to_be_plotted, xlsxFilename, outputDir,
                                                      'Co-Occ_viewer',
                                                      chart_type_list=["line"],
                                                      chart_title=chartTitle,
                                                      column_xAxis_label_var=xAxis,
                                                      hover_info_column_list=hover_label)
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

    # plot Ngrams
    if createExcelCharts == True and n_grams_viewer_var == 1 and len(search_words) > 0:
        xlsxFilename = n_grams_outputFile
        filesToOpen.append(n_grams_outputFile)
        xAxis = temporal_aggregation
        chartTitle = 'N-Grams Viewer'
        columns_to_be_plotted = []
        for i in range(len(ngram_list) - 1):
            # it will iterate through i = 0, 1, 2, ..., n-1
            columns_to_be_plotted.append([0, i + 1])
        hover_label = ['Total Word Count of This Group',
                       'Total Word Count of This Group',
                       'Total Word Count of This Group']
        Excel_outputFilename = Excel_util.run_all(columns_to_be_plotted, xlsxFilename, outputDir,
                                                  'n-grams_viewer',
                                                  chart_type_list=["line"],
                                                  chart_title=chartTitle,
                                                  column_xAxis_label_var=xAxis,
                                                  hover_info_column_list=hover_label)
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

    # with both Ngrams and co-occurrences
    # (the original tested CoOcc_Viewer_var twice in this condition)
    if n_grams_viewer_var == 1 and CoOcc_Viewer_var == 1 and len(search_words) > 0:
        n_grams_co_occurrences_outputFile = os.path.join(outputDir, 'N-Grams_CoOccurrences_Statistics.csv')
        filesToOpen.append(n_grams_co_occurrences_outputFile)
        chartTitle = ''

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'N-Grams Word Co-Occurrences end',
                                       'Finished running N-Grams Word Co-Occurrences Viewer at', True)

    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
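# Illustration (made-up filename) of the date-in-filename convention the viewer
# relies on: with date_separator_var='_' and date_position_var=2, the date is the
# second '_'-separated field, validated against date_format 'mm-dd-yyyy'
# (translated above to '%m-%d-%Y'):
#
#   'The New York Times_12-18-1899'.split('_')[2 - 1]      # -> '12-18-1899'
#   datetime.datetime.strptime('12-18-1899', '%m-%d-%Y')   # parses without ValueError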
def search(searchWord, searchSize, position, inputKWICfile, within, outFile, ranWithCLAs=False):
    # `window` and `filesToOpen` are expected to be module-level globals in this script

    # read in word1, word2, and the targeted window-size columns of the csv file
    if (within == 1):
        cols = []
        cols.append(0)
        cols.append(1)
        i = 12 - searchSize
        while i <= 11 + searchSize:
            cols.append(i)
            i += 1
        data = pd.read_csv(inputKWICfile, usecols=cols, engine='python')
    else:
        data = pd.read_csv(inputKWICfile, usecols=[0, 1, 12 - searchSize, 11 + searchSize],
                           engine='python')

    # filter down to the rows where searchWord appears
    target = data['word1'].str.lower() == searchWord.lower()
    target_rows = data[target]

    # get the values in the dataframe
    rownum = target_rows.shape[0]
    colnum = target_rows.shape[1]
    i = 0
    global leftKWIC, rightKWIC
    leftKWIC = []
    rightKWIC = []
    result = []
    mid = 1 + (colnum - 2) / 2

    while i < rownum:
        # every row refers to a new combination of two words; the counts start over at zero
        countLeft = 0
        countRight = 0
        # print("i ", i)  # i is correct
        if (position == "left"):
            j = 2
            while j <= mid:
                countLeft += target_rows.iloc[i][j]
                j += 1
            if not (countLeft == 0):
                result.append([target_rows.iloc[i]['word1'], target_rows.iloc[i]['word2'], countLeft])
            if countLeft > 0:
                leftKWIC.append([target_rows.iloc[i]['word2'], countLeft])
            # write into csv file
            with open(outFile, 'w', encoding="utf-8", errors="ignore", newline='') as csvFile:
                writer = csv.writer(csvFile)
                writer.writerow(["Searched Key Word", "Word in Context",
                                 "Position_left(-" + str(searchSize) + " words)"])
                for items in result:
                    writer.writerow(items)
                # leftKWIC.append(items[2], items[3])
        elif (position == "right"):
            j = round(mid + 1)
            while j < colnum:
                countRight += target_rows.iloc[i][j]
                j += 1
            if not (countRight == 0):
                result.append([target_rows.iloc[i]['word1'], target_rows.iloc[i]['word2'], countRight])
            if countRight > 0:
                rightKWIC.append([target_rows.iloc[i]['word2'], countRight])
            # write into csv file
            with open(outFile, 'w', encoding="utf-8", errors="ignore", newline='') as csvFile:
                writer = csv.writer(csvFile)
                writer.writerow(["Searched Key Word", "Word in Context",
                                 "Position_right(+" + str(searchSize) + " words)"])  # closing ) was missing
                for items in result:
                    writer.writerow(items)
        else:
            j = 2
            while j <= mid:
                countLeft += target_rows.iloc[i][j]
                j += 1
            while j < colnum:
                countRight += target_rows.iloc[i][j]
                j += 1
            if not (countLeft == 0 and countRight == 0):
                result.append([target_rows.iloc[i]['word1'], target_rows.iloc[i]['word2'],
                               countLeft, countRight])
            if countLeft > 0:
                leftKWIC.append([target_rows.iloc[i]['word2'], countLeft])
            if countRight > 0:
                rightKWIC.append([target_rows.iloc[i]['word2'], countRight])
            # write into csv file
            with open(outFile, 'w', encoding="utf-8", errors="ignore", newline='') as csvFile:
                writer = csv.writer(csvFile)
                writer.writerow(["Searched Key Word", "Word in Context",
                                 "Position_left(-" + str(searchSize) + " words)",
                                 "Position_right(+" + str(searchSize) + " words)"])
                for items in result:
                    writer.writerow(items)
        i += 1

    # TODO
    #   searchWord should be displayed in quotes in the chart
    #   should exclude stopwords (putting a widget on the GUI)

    # the original appended an undefined KWIC_search_output_filename;
    # outFile is the csv actually written above
    filesToOpen.append(outFile)

    """
    # display chart for searchWord within sentence
    KWIC_search_output_filename_sentence = KWIC_search_output_filename.strip()[:-4] + "_sentence_counts.xlsx"
    sentenceKWIC = IO_util.list_to_csv(window, sentenceKWIC, KWIC_search_output_filename_sentence)
    # sort will not work with headers; headers inserted after
    sentenceKWIC = stats_visuals_util.sort_data(sentenceKWIC, 1, True)
    sentenceKWIC.insert(0, ["KWIC (sentence tokens)", "Counts"])
    stats_visuals_util.create_excel_chart(window, "bar", "Sentence tokens for " + searchWord, rightKWIC,
                                          KWIC_search_output_filename_sentence, 20)
    filesToOpen.append(KWIC_search_output_filename_sentence)
    """

    if position == "left" or position == "both":
        if len(leftKWIC) > 0:
            KWIC_search_output_filename_stats = outFile.strip()[:-4] + "_left_counts.xlsx"
            IO_csv_util.list_to_csv(window, leftKWIC, KWIC_search_output_filename_stats)
            # sort will not work with headers; headers inserted after
            leftKWIC = stats_visuals_util.sort_data(leftKWIC, 1, True)
            leftKWIC.insert(0, ["KWIC (left-hand tokens)", "Counts"])
            Excel_util.create_excel_chart(window, "bar", "Left-hand tokens for " + searchWord,
                                          [leftKWIC], KWIC_search_output_filename_stats, 20)
            filesToOpen.append(KWIC_search_output_filename_stats)
        else:
            IO_user_interface_util.timed_alert(window, 3000, 'Searching KWIC Table',
                                               'There are no left-hand words for the searched keyword: ' + searchWord)
    if position == "right" or position == "both":
        if len(rightKWIC) > 0:
            KWIC_search_output_filename_stats = outFile.strip()[:-4] + "_right_counts.xlsx"
            IO_csv_util.list_to_csv(window, rightKWIC, KWIC_search_output_filename_stats)
            # sort will not work with headers; headers inserted after
            rightKWIC = stats_visuals_util.sort_data(rightKWIC, 1, True)
            rightKWIC.insert(0, ["KWIC (right-hand tokens)", "Counts"])
            Excel_util.create_excel_chart(window, "bar", "Right-hand tokens for " + searchWord,
                                          [rightKWIC], KWIC_search_output_filename_stats, 20)
            filesToOpen.append(KWIC_search_output_filename_stats)
        else:
            IO_user_interface_util.timed_alert(window, 3000, 'Searching KWIC Table',
                                               'There are no right-hand words for the searched keyword: ' + searchWord)
    if ranWithCLAs == False:
        IO_user_interface_util.timed_alert(window, 3000, 'Searching KWIC Table',
                                           'Finished running KWIC at', True)
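# Hypothetical usage sketch (not in the original source). The input KWIC csv is
# assumed to have 'word1' and 'word2' in columns 0-1 plus context-position count
# columns laid out so that columns 12-searchSize through 11+searchSize cover the window:
#
#   search('woman', searchSize=5, position='both',
#          inputKWICfile='kwic_table.csv', within=1, outFile='woman_kwic.csv')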
def compute_character_word_ngrams(window, inputFilename, inputDir, outputDir, ngramsNumber=4,
                                  normalize=False, excludePunctuation=False, wordgram=None,
                                  openOutputFiles=True, createExcelCharts=True, bySentenceID=None):
    filesToOpen = []
    container = []
    if inputFilename == '' and inputDir == '':
        mb.showwarning(title='Input error',
                       message='No input file or input directory have been specified.\n\nThe function will exit.\n\nPlease, enter the required input options and try again.')
        return
    files = IO_files_util.getFileList(inputFilename, inputDir, '.txt')
    nFile = len(files)
    if nFile == 0:
        return

    if wordgram == None:
        result = mb.askyesno("Word/character N-grams",
                             "Would you like to compute\n WORD n-grams (Yes) or\n CHARACTER n-grams (No)?")
        if result == True:
            wordgram = 1
        else:
            wordgram = 0
    if wordgram == 1:
        fn = "Wd"
        chartTitle = "Word "
    else:
        fn = "Ch"
        chartTitle = "Character "

    if bySentenceID == None:
        result = mb.askyesno("By sentence index",
                             "Would you like to compute n-grams by sentence index?")
        if result == True:
            bySentenceID = 1
        else:
            bySentenceID = 0

    i = 0
    for file in files:
        head, tail = os.path.split(file)
        i = i + 1
        print("Processing file " + str(i) + "/" + str(nFile) + ' ' + tail)
        ngramsList = get_ngramlist(file, ngramsNumber, wordgram, excludePunctuation, bySentenceID, isdir=True)
        container.append(ngramsList)

    # insert a 'Document ID' column (header in the first row, the document number in
    # every other row) right before the last column of each per-document n-gram list
    for index, f in enumerate(container):
        for n in f:
            for skip, gram in enumerate(n):
                if skip == 0:
                    gram.insert(-1, 'Document ID')
                    continue
                else:
                    gram.insert(-1, index + 1)

    # merge the per-document lists, keeping the header row only once
    one_gram = []
    for index, f in enumerate(container):
        if index == 0:
            one_gram += (f[0])
        else:
            one_gram += (f[0][1:])
    generalList = [one_gram]
    if ngramsNumber > 1:
        two_gram = []
        for index, f in enumerate(container):
            if index == 0:
                two_gram += (f[1])
            else:
                two_gram += (f[1][1:])
        generalList = [one_gram, two_gram]
    if ngramsNumber > 2:
        three_gram = []
        for index, f in enumerate(container):
            if index == 0:
                three_gram += (f[2])
            else:
                three_gram += (f[2][1:])
        generalList = [one_gram, two_gram, three_gram]
    if ngramsNumber > 3:
        four_gram = []
        for index, f in enumerate(container):
            if index == 0:
                four_gram += (f[3])
            else:
                four_gram += (f[3][1:])
        generalList = [one_gram, two_gram, three_gram, four_gram]

    result = True
    # n-grams
    # if createExcelCharts == True:
    #     if nFile > 10:
    #         result = mb.askyesno("Excel charts", "You have " + str(nFile) + " files for which to compute Excel charts.\n\nTHIS WILL TAKE A LONG TIME.\n\nAre you sure you want to do that?", default='no')

    for index, ngramsList in enumerate(generalList):
        if nFile > 1:
            csv_outputFilename = IO_files_util.generate_output_file_name(
                inputFilename, inputDir, outputDir, '.csv',
                'n-grams_' + str(index + 1) + '_' + fn, 'stats', '', '', '', False, True)
        else:
            csv_outputFilename = IO_files_util.generate_output_file_name(
                inputFilename, inputDir, outputDir, '.csv',
                'n-grams_' + str(index + 1) + '_' + fn, 'stats')
        filesToOpen.append(csv_outputFilename)
        IO_csv_util.list_to_csv(window, ngramsList, csv_outputFilename)

        # n-grams charts
        if createExcelCharts == True and result == True:
            inputFilename = csv_outputFilename
            if bySentenceID == True:
                columns_to_be_plotted = [[2, 2]]
                hover_label = [str(index + 1) + '-grams']
                Excel_outputFilename = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir,
                                                          outputFileLabel='n-grams_' + str(index + 1) + '_' + fn,
                                                          chart_type_list=["line"],
                                                          chart_title=chartTitle + str(index + 1) + '-grams',
                                                          column_xAxis_label_var='Sentence Index',
                                                          hover_info_column_list=hover_label)
                if Excel_outputFilename != "":
                    filesToOpen.append(Excel_outputFilename)
            else:
                columns_to_be_plotted = [[0, 2]]  # 0,1
                hover_label = [str(index + 1) + '-grams']  # change to sentence
                Excel_outputFilename = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir,
                                                          outputFileLabel='n-grams_' + str(index + 1) + '_' + fn,
                                                          chart_type_list=["bar"],
                                                          chart_title=chartTitle + str(index + 1) + '-grams',
                                                          column_xAxis_label_var='',
                                                          hover_info_column_list=hover_label)
                if Excel_outputFilename != "":
                    filesToOpen.append(Excel_outputFilename)
            # a legacy commented-out Excel_util.run_all call with .xlsm output was removed here

    if len(inputDir) != 0:
        mb.showwarning(title='Warning',
                       message='The output filename generated by N-grams is the name of the directory processed in input, rather than any individual file in the directory.\n\nThe output csv file includes all ' +
                               str(nFile) + ' files in the input directory processed by N-grams.')
    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
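# Hypothetical directory-level call (paths are placeholders). Passing wordgram and
# bySentenceID explicitly skips the two askyesno prompts:
#
#   compute_character_word_ngrams(GUI_util.window, '', 'C:/corpus', 'C:/NLP_output',
#                                 ngramsNumber=2, wordgram=1, bySentenceID=0,
#                                 openOutputFiles=False, createExcelCharts=False)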
def compute_stats_NLP_main(window, inputFilename, inputDataFrame, outputDir, openOutputFiles,
                           createExcelCharts, columns_to_be_plotted, selected_col, hover_col,
                           group_col, fileNameType='CSV', chartType='line'):
    filesToOpen = []
    container = []
    if len(inputDataFrame) != 0:
        data = inputDataFrame
    else:
        with open(inputFilename, encoding='utf-8', errors='ignore') as infile:
            reader = csv.reader(x.replace('\0', '') for x in infile)
            headers = next(reader)
        header_indices = [i for i, item in enumerate(headers) if item]
        data = pd.read_csv(inputFilename, usecols=header_indices, encoding='utf-8')

    if len(selected_col) == 0:
        mb.showwarning('Missing field',
                       'You have not selected the csv field for which to compute frequencies.\n\nPlease, select the field and try again.')
    elif len(selected_col) != 0 and len(group_col) == 0:
        for col in selected_col:
            output_file_name = IO_files_util.generate_output_file_name(inputFilename, '', outputDir, '.csv', col)
            data = data[col].value_counts().to_frame().reset_index()
            hdr = [col, col + ' Frequency']
            Hover_over_header = []
            if len(hover_col) != 0:
                hover_header = ', '.join(hover_col)
                Hover_over_header = ['Hover_over: ' + hover_header]
                hdr.append(Hover_over_header)
                data.columns = hdr
                temp_str = '%s' + '\n%s' * (len(hover_col) - 1)
                data['Hover_over: ' + hover_header] = data.apply(
                    lambda x: temp_str % tuple(x[h] for h in hover_col), axis=1)
                data.drop(hover_col, axis=1, inplace=True)
            else:
                data.columns = hdr
            data.to_csv(output_file_name, index=False)
            filesToOpen.append(output_file_name)
            if createExcelCharts:
                # columns_to_be_plotted = [[1, 2]]  # hard code Yi
                Excel_outputFilename = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir,
                                                          outputFileLabel=fileNameType,
                                                          chart_type_list=chartType,
                                                          chart_title='',
                                                          column_xAxis_label_var=col,
                                                          hover_info_column_list=Hover_over_header)
                if Excel_outputFilename != "":
                    filesToOpen.append(Excel_outputFilename)
    elif len(selected_col) != 0 and len(group_col) != 0 and len(hover_col) == 0:
        for col in selected_col:
            output_file_name = IO_files_util.generate_output_file_name(inputFilename, '', outputDir, '.csv', col)
            temp = group_col.copy()
            temp.append(col)
            data = data.groupby(temp).size().reset_index(name='Frequency')
            for index, row in data.iterrows():
                if row[col] == '':
                    data.at[index, 'Frequency'] = 0
            data.to_csv(output_file_name, index=False)
            filesToOpen.append(output_file_name)
            if createExcelCharts:
                # no hover columns in this branch; the original referenced
                # Hover_over_header here without defining it
                Hover_over_header = []
                # columns_to_be_plotted = [[1, 2]]  # hard code Yi
                Excel_outputFilename = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir,
                                                          outputFileLabel=fileNameType,
                                                          chart_type_list=chartType,
                                                          chart_title='',
                                                          column_xAxis_label_var=col,
                                                          hover_info_column_list=Hover_over_header)
                # the original appended the chart filename twice, once unconditionally
                if Excel_outputFilename != "":
                    filesToOpen.append(Excel_outputFilename)
    else:
        for col in hover_col:
            temp = group_col.copy()
            temp.append(col)
            c = data.groupby(group_col)[col].apply(list).to_dict()
            container.append(c)
        temp = group_col.copy()
        temp.extend(selected_col)
        data = data.groupby(temp).size().reset_index(name='Frequency')
        for index, row in data.iterrows():
            if row[selected_col[0]] == '':
                data.at[index, 'Frequency'] = 0
        hover_header = ', '.join(hover_col)
        Hover_over_header = ['Hover_over: ' + hover_header]
        for index, hover in enumerate(hover_col):
            df = pd.Series(container[index]).reset_index()
            temp = group_col.copy()
            temp.append(hover)
            df.columns = temp
            data = data.merge(df, how='left', left_on=group_col, right_on=group_col)
        temp_str = '%s' + '\n%s' * (len(hover_col) - 1)
        data['Hover_over: ' + hover_header] = data.apply(
            lambda x: temp_str % tuple(x[h] for h in hover_col), axis=1)
        data.drop(hover_col, axis=1, inplace=True)
        if createExcelCharts:
            # columns_to_be_plotted = [[1, 2]]  # hard code Yi
            Excel_outputFilename = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir,
                                                      outputFileLabel=fileNameType,
                                                      chart_type_list=chartType,
                                                      chart_title='',
                                                      column_xAxis_label_var=col,
                                                      hover_info_column_list=Hover_over_header)
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)
        # need change, put run_all
        # if createExcelCharts:
        #     filesToOpen = Excel_util.prepare_csv_data_for_chart(window,
        #         inputFilename, data, outputDir, selected_col,
        #         Hover_over_header, group_col, fileNameType,
        #         chartType, openOutputFiles, createExcelCharts)

    if openOutputFiles == 1:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
        filesToOpen = []  # empty the list so files are not displayed twice
    return filesToOpen  # 2 files
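# Hypothetical call sketch (the column names are made up): count 'Lemma' values
# grouped by document, with no hover columns:
#
#   compute_stats_NLP_main(GUI_util.window, 'conll.csv', '', 'C:/NLP_output',
#                          openOutputFiles=0, createExcelCharts=True,
#                          columns_to_be_plotted=[[1, 2]], selected_col=['Lemma'],
#                          hover_col=[], group_col=['Document ID'])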
def dictionary_items_bySentenceID(window, inputFilename, inputDir, outputDir, createExcelCharts,
                                  openOutputFiles=True, input_dictionary_file='', chartTitle=''):
    filesToOpen = []
    DictionaryList = []
    file_list = IO_files_util.getFileList(inputFilename, inputDir, '.txt')
    nFile = len(file_list)
    if nFile == 0:
        return

    # when running the function w/o a GUI, as currently is mostly the case,
    # we would not be able to pass a dictionary file to the function
    if input_dictionary_file == '':
        initialFolder = os.path.dirname(os.path.abspath(__file__))
        input_dictionary_file = tk.filedialog.askopenfilename(title="Select dictionary csv file",
                                                              initialdir=initialFolder,
                                                              filetypes=[("csv files", "*.csv")])
        if len(input_dictionary_file) == 0:
            return

    if IO_csv_util.get_csvfile_numberofColumns(input_dictionary_file) == 2:
        dic = pd.read_csv(input_dictionary_file)
        dic_value = dic.iloc[:, 0].tolist()
        dic_sec_value = dic.iloc[:, 1].tolist()
        dic = [(dic_value[i], dic_sec_value[i]) for i in range(len(dic_value))]
        if chartTitle == '':
            chartTitle = "Dictionary value"
        documentID = 0
        container = []
        for file in file_list:
            documentID += 1
            head, tail = os.path.split(file)
            print("Processing file ", str(documentID), "/", str(nFile), tail)
            text = open(file, "r", encoding="utf-8", errors='ignore').read()
            # process each word in the txt file
            Sentence_ID = 0
            sentences = tokenize.sent_tokenize(text)
            # word frequency sentenceID DocumentID FileName
            for each_sentence in sentences:
                In = []
                Sentence_ID += 1
                token = nltk.word_tokenize(each_sentence)
                for word in token:
                    for dict_word in dic:
                        if word == dict_word[0].rstrip():
                            In.append([word, dict_word[1], Sentence_ID, each_sentence, documentID, file])
                            break
                        else:
                            continue
                container.extend(In)
        ctr = collections.Counter(Extract(container))
        for word in container:
            word.insert(2, ctr.get(word[0]))
        for word in container:
            if word[0] not in Extract(DictionaryList):
                DictionaryList.append(word)
        DictionaryList.insert(0, ['Dict_value', 'Dict_second_value', 'Frequency',
                                  'Sentence ID', 'Sentence', 'Document ID', 'Document'])
    else:
        dic = pd.read_csv(input_dictionary_file)
        dic_value = dic.iloc[:, 0].tolist()
        if chartTitle == '':
            chartTitle = "Dictionary value"
        documentID = 0
        container = []
        for file in file_list:
            documentID += 1
            head, tail = os.path.split(file)
            print("Processing file ", str(documentID), "/", str(nFile), tail)
            text = open(file, "r", encoding="utf-8", errors='ignore').read()
            # process each word in the txt file
            Sentence_ID = 0
            sentences = tokenize.sent_tokenize(text)
            # word frequency sentenceID DocumentID FileName
            for each_sentence in sentences:
                In = []
                Sentence_ID += 1
                token = nltk.word_tokenize(each_sentence)
                for word in token:
                    for dict_word in dic_value:
                        if word == dict_word.rstrip():
                            In.append([word, Sentence_ID, each_sentence, documentID, file])
                            break
                        else:
                            continue
                container.extend(In)
        ctr = collections.Counter(Extract(container))
        for word in container:
            word.insert(1, ctr.get(word[0]))
        for word in container:
            if word[0] not in Extract(DictionaryList):
                DictionaryList.append(word)
        DictionaryList.insert(0, ['Dict_value', 'Frequency', 'Sentence ID', 'Sentence',
                                  'Document ID', 'Document'])

    # output names are built from the last file and Sentence_ID processed in the loops above
    outputFilename = IO_files_util.generate_output_file_name(file, '', outputDir, '.csv',
                                                             str(Sentence_ID) + '-Dict_value',
                                                             'stats', '', '', '', False, True)
    filesToOpen.append(outputFilename)
    IO_csv_util.list_to_csv(window, DictionaryList, outputFilename)

    outputFilename = IO_files_util.generate_output_file_name(file, '', outputDir, '.xlsx',
                                                             str(Sentence_ID) + '-Dict_value',
                                                             'chart', '', '', '', False, True)
    filesToOpen.append(outputFilename)
    Excel_util.create_excel_chart(GUI_util.window, [DictionaryList], outputFilename, chartTitle, ["bar"])

    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
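# Hypothetical call sketch; the dictionary csv may have one column (values only) or
# two columns (value, second value), which selects the branch taken above:
#
#   dictionary_items_bySentenceID(GUI_util.window, '', 'C:/corpus', 'C:/NLP_output',
#                                 createExcelCharts=True, openOutputFiles=False,
#                                 input_dictionary_file='my_dictionary.csv')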
def language_detection(window, inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts):
    IO_user_interface_util.timed_alert(GUI_util.window, 1000, 'Analysis start',
                                       'Started running Language Detection at', True)
    folderID = 0
    fileID = 0
    filesToOpen = []
    outputFilenameCSV = IO_files_util.generate_output_file_name(inputFilename, inputDir, outputDir,
                                                                '.csv', 'lang_detect')
    filesToOpen.append(outputFilenameCSV)

    files = IO_files_util.getFileList(inputFilename, inputDir, '.txt')
    if len(files) == 0:
        return
    if IO_csv_util.openCSVOutputFile(outputFilenameCSV):
        return

    fieldnames = ['LANGDETECT', 'Language', 'Probability',
                  'SPACY', 'Language', 'Probability',
                  'LANGID', 'Language', 'Probability',
                  'Document ID', 'Document']

    config_filename = 'file-spell-checker-config.txt'
    reminders_util.checkReminder(config_filename, ['Language detection'],
                                 'Language detection algorithms are very slow. The NLP Suite runs three different types of algorithms: LANGDETECT, SPACY, and LANGID.\n\nPlease, arm yourself with patience, depending upon the number and size of documents processed.',
                                 True)
    IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis start',
                                       'Started running language detection algorithms at', True,
                                       'You can follow the algorithms in command line.')

    with open(outputFilenameCSV, 'w', encoding='utf-8', errors='ignore', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        docErrors_empty = 0
        docErrors_unknown = 0
        filenameSV = ''
        for filename in files:
            fileID = fileID + 1
            head, tail = os.path.split(filename)
            print("Processing file " + str(fileID) + "/" + str(len(files)) + ' ' + tail)
            text = open(filename, 'r', encoding='utf-8', errors='ignore').read()
            if len(text) == 0:
                print("  The file is empty. It will be discarded from processing.")
                docErrors_empty = docErrors_empty + 1
                continue

            # LANGDETECT
            try:
                value = detect_langs(text)
            except:
                # do not count the same document twice in this and the other algorithms that follow
                filenameSV = filename
                docErrors_unknown = docErrors_unknown + 1
                print("  Unknown file read error.")
                continue
            value = str(value[0]).split(':')
            language = value[0]
            probability = value[1]
            # https://pypi.org/project/langdetect/
            # langdetect supports 55 languages out of the box (ISO 639-1 codes)
            # ISO codes: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
            print('  LANGDETECT', language, probability)
            # e.g., [cs:0.7142840957132709, pl:0.14285810606233737, sk:0.14285779665739756]
            currentLine = ['LANGDETECT', language, probability]

            # SPACY (note: the model is reloaded for every file, as in the original)
            nlp = spacy.load('en_core_web_sm')
            nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
            try:
                doc = nlp(text)
            except:
                if filename != filenameSV:  # do not count the same document twice
                    docErrors_unknown = docErrors_unknown + 1
                filenameSV = filename
                print("  Unknown file read error.")
                continue
            value = doc._.language
            language = value['language']
            probability = value['score']
            # print('  SPACY', language, probability)  # {'language': 'en', 'score': 0.9999978351575265}
            currentLine.extend(['SPACY', language, probability])

            # LANGID
            lang_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
            try:
                value = lang_identifier.classify(text)
            except:
                if filename != filenameSV:
                    docErrors_unknown = docErrors_unknown + 1
                filenameSV = filename
                print("  Unknown file read error.")
                continue
            language = value[0]
            probability = value[1]
            # langid.py comes pre-trained on 97 languages (ISO 639-1 codes)
            # https://pypi.org/project/langid/1.1.5/
            # ISO codes: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
            print('  LANGID', language, probability)  # ('en', 0.999999999999998)
            print()
            currentLine.extend(['LANGID', language, probability])

            currentLine.extend([fileID, IO_csv_util.dressFilenameForCSVHyperlink(filename)])
            writer = csv.writer(csvfile)
            writer.writerows([currentLine])
            filenameSV = filename

    msg = ''
    if docErrors_empty == 0 and docErrors_unknown == 0:
        msg = str(fileID) + ' documents successfully processed for language detection.'
    else:
        if docErrors_empty > 0:
            msg = str(fileID) + ' documents processed for language detection.\n  ' + \
                  str(docErrors_empty) + ' document(s) found empty.'
        if docErrors_unknown > 0:
            if msg != '':
                msg = msg + '\n  ' + str(docErrors_unknown) + ' document(s) read with unknown errors.'
            else:
                msg = str(fileID) + ' documents processed for language detection.\n  ' + \
                      str(docErrors_unknown) + ' document(s) read with unknown errors.'
        mb.showwarning(title='File read errors',
                       message=msg + '\n\nFaulty files are listed in command line/terminal. Please, search for \'File read error\' and inspect each file carefully.')

    # outputFilenameCSV was already appended to filesToOpen above; do not append it twice
    IO_user_interface_util.timed_alert(GUI_util.window, 1000, 'Analysis end',
                                       'Finished running Language Detection at', True,
                                       'Languages detected are exported via the ISO 639 two-letter code. ISO 639 is a standardized nomenclature used to classify languages. Check the ISO list at https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes.')
    print('Languages detected are exported via the ISO 639 two-letter code. ISO 639 is a standardized nomenclature used to classify languages. Check the ISO list at https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes.')

    if createExcelCharts:
        columns_to_be_plotted = [[1, 1], [4, 4], [7, 7]]
        chart_title = 'Frequency of Languages Detected by 3 Algorithms'
        hover_label = ['LANGDETECT', 'SPACY', 'LANGID']
        inputFilename = outputFilenameCSV
        Excel_outputFilename = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir,
                                                  outputFileLabel='_bar_chart',
                                                  chart_type_list=["bar"],
                                                  chart_title=chart_title,
                                                  column_xAxis_label_var='Language',
                                                  hover_info_column_list=hover_label,
                                                  count_var=1)
        if Excel_outputFilename != '':
            filesToOpen.append(Excel_outputFilename)

    if openOutputFiles:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
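# Hypothetical call (placeholders for paths). Output is one csv row per document with
# the language and probability guessed by each of LANGDETECT, SPACY, and LANGID:
#
#   language_detection(GUI_util.window, '', 'C:/corpus', 'C:/NLP_output',
#                      openOutputFiles=False, createExcelCharts=True)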
# Written by Yuhang Feng November 2019
def main(CoreNLPDir, input_main_dir_path, input_secondary_dir_path, outputDir, openOutputFiles, createExcelCharts, checkNER=False): articles_path = input_main_dir_path compilations_path = input_secondary_dir_path # summaries folder filesToOpen = [] IO_user_interface_util.timed_alert( GUI_util.window, 3000, 'Analysis start', 'Started running MISSING CHARACTER at', True, 'You can follow MISSING CHARACTER in command line.') if len(articles_path) == 0: mb.showerror( title='Articles directory not found', message= 'The summary checker script requires an input files directory.\n\nNo directory entered. Please, select the input files directory and try again.' ) sys.exit() if len(compilations_path) == 0: tk.messagebox.showerror( title='Summary directory not found', message= 'The summary checker script requires a secondary input directory for the summary files.\n\nNo secondary directory entered. Please, select the secondary input directory and try again.' ) sys.exit() if len(outputDir) == 0: mb.showerror( title='Output directory not found', message= 'The summary checker script requires an output directory.\n\nNo output directory entered. Please, select the output directory and try again.' ) sys.exit() if compilations_path[-1] == os.sep: compilations_path = compilations_path[:-1] if outputDir[-1] == os.sep: outputDir = outputDir[:-1] ############################# ##This is just for evaluation purposes freq_act_miss = 0 count_act_miss = 0 act_miss_list = [] id_act_miss = [] freq_loc_miss = 0 count_loc_miss = 0 loc_miss_list = [] id_loc_miss = [] freq_org_miss = 0 count_org_miss = 0 org_miss_list = [] id_org_miss = [] freq_per_miss = 0 count_per_miss = 0 per_miss_list = [] id_per_miss = [] freq_date_miss = 0 count_date_miss = 0 date_miss_list = [] id_date_miss = [] ##End of evaluation ############################# #write the output csv. if checkNER == 1: outputFilename = IO_files_util.generate_output_file_name( '', compilations_path, outputDir, '.csv', 'SSR', 'MA', 'NER', '', '', False, True) else: outputFilename = IO_files_util.generate_output_file_name( '', compilations_path, outputDir, '.csv', 'SSR', 'MA', '', '', '', False, True) fName = GUI_IO_util.libPath + os.sep + 'wordLists' + os.sep + 'social-actor-list.csv' if not os.path.isfile(fName): print( "The file " + fName + " could not be found. The routine expects a csv dictionary file 'social-actor-list.csv' in a directory 'lib\wordLists' expected to be a subdirectory of the main NLP directory.\n\nPlease, check your lib\wordLists directory and try again." ) mb.showerror( title='File not found', message='The file ' + fName + " could not be found.\n\nThe routine expects a csv dictionary file 'social-actor-list.csv' in a directory 'lib\wordLists' expected to be a subdirectory of the main NLP directory.\n\nPlease, check your lib\wordLists directory and try again." 
) sys.exit() actors = load_soc_actors(fName) f = open(outputFilename, 'w', encoding='utf-8', errors='ignore') sys.stdout = f dirs = glob(articles_path + os.sep + '*' + os.sep) nlp = StanfordCoreNLP(CoreNLPDir) num_id = 0 num_dir = 0 for compilation in glob(compilations_path + os.sep + '*'): num_id += 1 for dir in dirs: sys.stdout = terminal_out print("Processing folder " + str(num_dir + 1) + "/" + str(len(dirs)) + "; Folder name: " + dir.split(os.path.sep)[-2]) num_dir += 1 sys.stdout = f try: count_act_miss, act_miss_list, id_act_miss, count_loc_miss, loc_miss_list, id_loc_miss, count_org_miss, org_miss_list, id_org_miss, count_per_miss, per_miss_list, id_per_miss, count_date_miss, date_miss_list, id_date_miss, if_act, if_loc, if_org, if_per, if_date = check( dir, actors, nlp, compilations_path, checkNER, count_act_miss, act_miss_list, id_act_miss, count_loc_miss, loc_miss_list, id_loc_miss, count_org_miss, org_miss_list, id_org_miss, count_per_miss, per_miss_list, id_per_miss, count_date_miss, date_miss_list, id_date_miss) except: print(' Unspecified error in processing the file') continue ############################# #for evaluation if if_act == True: freq_act_miss += 1 if if_per == True: freq_per_miss += 1 if if_loc == True: freq_loc_miss += 1 if if_org == True: freq_org_miss += 1 if if_date == True: freq_date_miss += 1 ############################# nlp.close() ## ##This is to print out evaluation table if checkNER == 1: outputFilename = IO_files_util.generate_output_file_name( '', compilations_path, outputDir, '.csv', 'SSR', 'MA', 'NER', 'freq', '', False, True) else: outputFilename = IO_files_util.generate_output_file_name( '', compilations_path, outputDir, '.csv', 'SSR', 'MA', 'freq', '', '', False, True) f_e = open(outputFilename, 'w', encoding='utf-8', errors='ignore') sys.stdout = f_e if (len(act_miss_list) <= 320 and len(loc_miss_list) <= 320 and len(per_miss_list) <= 320 and len(org_miss_list) <= 320 and len(date_miss_list) <= 320): print( "Type of Error,Frequency of Summaries in Error,Percentage of Summaries in Error,Frequency of Error,List of Summary Filenames for Type of Error,List of Documents for Type of Error" ) print("Social Actor,", str(freq_act_miss), ",", freq_act_miss / num_id * 100, ",", count_act_miss, ",", "; ".join(a for a in id_act_miss), ",", "; ".join(c for c in act_miss_list)) print("Organization,", str(freq_org_miss), ",", freq_org_miss / num_id * 100, ",", count_org_miss, ",", "; ".join(b for b in id_org_miss), ",", "; ".join(d for d in org_miss_list)) print("Person,", str(freq_per_miss), ",", freq_per_miss / num_id * 100, ",", count_per_miss, ",", "; ".join(b for b in id_per_miss), ",", "; ".join(d for d in per_miss_list)) print("Date,", str(freq_date_miss), ",", freq_date_miss / num_id * 100, ",", count_date_miss, ",", "; ".join(b for b in id_date_miss), ",", "; ".join(d for d in date_miss_list)) print("Location,", str(freq_loc_miss), ",", freq_loc_miss / num_id * 100, ",", count_loc_miss, ",", "; ".join(b for b in id_loc_miss), ",", "; ".join(d for d in loc_miss_list)) elif (len(act_miss_list) <= 640 and len(loc_miss_list) <= 640 and len(per_miss_list) <= 640 and len(org_miss_list) <= 640 and len(date_miss_list) <= 640): print( "Type of Error,Frequency of Summaries in Error,Percentage of Summaries in Error,Frequency of Error,List of Summary Filenames for Type of Error,List of Documents for Type of Error (Cut List),List of Documents for Type of Error (Continue List)" ) print("Social Actor,", str(freq_act_miss), ",", freq_act_miss / num_id * 100, ",", 
count_act_miss, ",", "; ".join(a for a in id_act_miss), ",", "; ".join(c for c in act_miss_list[:320]), ",", "; ".join(c for c in act_miss_list[320:])) print("Organization,", str(freq_org_miss), ",", freq_org_miss / num_id * 100, ",", count_org_miss, ",", "; ".join(b for b in id_org_miss), ",", "; ".join(d for d in org_miss_list[:320]), ",", "; ".join(d for d in org_miss_list[320:])) print("Person,", str(freq_per_miss), ",", freq_per_miss / num_id * 100, ",", count_per_miss, ",", "; ".join(b for b in id_per_miss), ",", "; ".join(d for d in per_miss_list[:320]), ",", "; ".join(d for d in per_miss_list[320:])) print("Date,", str(freq_date_miss), ",", freq_date_miss / num_id * 100, ",", count_date_miss, ",", "; ".join(b for b in id_date_miss), ",", "; ".join(d for d in date_miss_list[:320]), ",", "; ".join(d for d in date_miss_list[320:])) print("Location,", str(freq_loc_miss), ",", freq_loc_miss / num_id * 100, ",", count_loc_miss, ",", "; ".join(b for b in id_loc_miss), ",", "; ".join(d for d in loc_miss_list[:320]), ",", "; ".join(d for d in loc_miss_list[320:])) else: print( "Type of Error,Frequency of Summaries in Error,Percentage of Summaries in Error,Frequency of Error,List of Summary Filenames for Type of Error,List of Documents for Type of Error (Cut List),List of Documents for Type of Error (Continue List),List of Documents for Type of Error (Continue List)" ) print("Social Actor,", str(freq_act_miss), ",", freq_act_miss / num_id * 100, ",", count_act_miss, ",", "; ".join(a for a in id_act_miss), ",", "; ".join(c for c in act_miss_list[:320]), ",", "; ".join(c for c in act_miss_list[320:640]), ",", "; ".join(c for c in act_miss_list[640:])) print("Organization,", str(freq_org_miss), ",", freq_org_miss / num_id * 100, ",", count_org_miss, ",", "; ".join(b for b in id_org_miss), ",", "; ".join(d for d in org_miss_list[:320]), ",", "; ".join(d for d in org_miss_list[320:640]), ",", "; ".join(c for c in org_miss_list[640:])) print("Person,", str(freq_per_miss), ",", freq_per_miss / num_id * 100, ",", count_per_miss, ",", "; ".join(b for b in id_per_miss), ",", "; ".join(d for d in per_miss_list[:320]), ",", "; ".join(d for d in per_miss_list[320:640]), ",", "; ".join(c for c in per_miss_list[640:])) print("Date,", str(freq_date_miss), ",", freq_date_miss / num_id * 100, ",", count_date_miss, ",", "; ".join(b for b in id_date_miss), ",", "; ".join(d for d in date_miss_list[:320]), ",", "; ".join(d for d in date_miss_list[320:640]), ",", "; ".join(c for c in date_miss_list[640:])) print("Location,", str(freq_loc_miss), ",", freq_loc_miss / num_id * 100, ",", count_loc_miss, ",", "; ".join(b for b in id_loc_miss), ",", "; ".join(d for d in loc_miss_list[:320]), ",", "; ".join(d for d in loc_miss_list[320:640]), ",", "; ".join(c for c in loc_miss_list[640:])) f_e.close() # type "sys.stdout = terminal_out" before print sys.stdout = terminal_out if createExcelCharts: if checkNER == 1: fileType = 'SSR_summary_NER' else: fileType = 'SSR_summary' columns_to_be_plotted = [[0, 1], [0, 2], [0, 3]] hover_label = [ 'List of Summary Filenames for Type of Error', 'List of Summary Filenames for Type of Error', 'List of Summary Filenames for Type of Error' ] inputFilename = outputFilename Excel_outputFilename = Excel_util.run_all( columns_to_be_plotted, inputFilename, outputDir, outputFileLabel=fileType, chart_type_list=["bar"], chart_title='Missing Character (File Summaries in Error)', column_xAxis_label_var='Type of Error', hover_info_column_list=hover_label) if Excel_outputFilename != "": 
        filesToOpen.append(Excel_outputFilename)

    if openOutputFiles:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
        filesToOpen = []  # avoid opening the files twice, here and in the calling function
    IO_user_interface_util.timed_alert(
        GUI_util.window, 3000, 'Analysis end',
        'Finished running MISSING CHARACTER at', True)
    return filesToOpen
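
# A minimal, hypothetical sketch of the list-chunking idea used in the evaluation
# table above: long lists of document names are split across several CSV columns
# (every 320 items, mirroring the cutoffs above, presumably to keep individual
# cells within what Excel handles comfortably). chunk_for_csv() is illustrative
# only; it is not part of the NLP Suite code.
def chunk_for_csv(items, chunk_size=320):
    """Return '; '-joined chunks of items, one string per CSV cell."""
    return ["; ".join(items[i:i + chunk_size])
            for i in range(0, max(len(items), 1), chunk_size)]

# Example: 700 items yield three cells covering items[0:320], [320:640], [640:700].
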
def nltk_unusual_words(window, inputFilename, inputDir, outputDir, openOutputFiles,
                       createExcelCharts=True, silent=False):
    filesToOpen = []
    container = []
    documentID = 0
    files = IO_files_util.getFileList(inputFilename, inputDir, '.txt')
    nFile = len(files)
    if nFile == 0:
        return filesToOpen
    outputFilename = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', 'NLTK_unus', 'stats')
    filesToOpen.append(outputFilename)
    IO_user_interface_util.timed_alert(
        GUI_util.window, 3000, 'NLTK unusual words/spelling checker start',
        'Started running NLTK unusual words/spelling checker at', True,
        'You can follow the NLTK words/spelling checker in command line.')
    # the English vocabulary does not change across documents; compute it once
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    for file in files:
        documentID = documentID + 1
        head, tail = os.path.split(file)
        print("Processing file " + str(documentID) + "/" + str(nFile) + ' ' + tail)
        text = open(file, "r", encoding="utf-8", errors="ignore").read()
        text_vocab = set(
            lemmatizing(w.lower()) for w in text.split(" ") if w.isalpha())
        # words in the document that are not in the NLTK English vocabulary
        unusual = sorted(text_vocab - english_vocab)
        unusual = [[documentID,
                    IO_csv_util.dressFilenameForCSVHyperlink(file),
                    word] for word in unusual]
        container.extend(unusual)
    container.insert(0, ['Document ID', 'Document', 'Misspelled/unusual word'])
    if len(container) > 1:  # more than just the header row
        if IO_csv_util.list_to_csv(window, container, outputFilename):
            return filesToOpen
    else:
        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'Spelling checker (via nltk)',
            'No misspelled/unusual words found in\n' + file, True)
        if nFile == 1:
            return []  # nothing was written
    if not silent:
        IO_user_interface_util.single_file_output_save(inputDir, 'NLTK')
    if createExcelCharts:
        computeCharts = True
        if nFile > 10:
            computeCharts = mb.askyesno(
                "Excel charts",
                "You have " + str(nFile) +
                " files for which to compute Excel charts.\n\nTHIS WILL TAKE A LONG TIME.\n\nAre you sure you want to do that?")
        if computeCharts:
            columns_to_be_plotted = [[2, 2]]
            hover_label = ['']
            Excel_outputFileName = Excel_util.run_all(
                columns_to_be_plotted, outputFilename, outputDir,
                outputFileLabel='NLTK_spell',
                chart_type_list=["bar"],
                chart_title='Misspelled/Unusual Words Frequency',
                column_xAxis_label_var='',
                hover_info_column_list=hover_label,
                count_var=1)
            if Excel_outputFileName != "":
                filesToOpen.append(Excel_outputFileName)
    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
        filesToOpen = []  # do not open the files twice, here and in the calling function
    return filesToOpen
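
# The core of nltk_unusual_words() is a plain set difference, shown here in a
# self-contained sketch (assumes the NLTK 'words' corpus has been downloaded via
# nltk.download('words')). unusual_words() is a hypothetical helper, not the
# NLP Suite entry point above, and it skips the lemmatizing step.
import nltk

def unusual_words(text):
    text_vocab = set(w.lower() for w in text.split() if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    return sorted(text_vocab - english_vocab)

# Example: unusual_words("Thiss sentence contains a typo") should flag 'thiss'.
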
def clause_stats(inputFilename, inputDir, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts): filesToOpen = [] # Store all files that are to be opened once finished IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start', 'Started running CLAUSE ANALYSES at', True) #output file names #clausal_analysis_file_name contains all the CoNLL table records that have a clausal tag clausal_analysis_file_name = IO_files_util.generate_output_file_name( inputFilename, inputDir, outputDir, '.csv', 'CA', 'Clause tags', 'list') filesToOpen.append(clausal_analysis_file_name) #clausal_analysis_stats_file_name will contain a data sheet with the frequency distribution of all available clausal tags and a chart sheet with the pie chart visualization of the data if 0: stats_clauses(data) else: if not os.path.isdir(outputDir): mb.showwarning( title='Output file path error', message='Please check OUTPUT DIRECTORY PATH and try again') return clausal_list = stats_clauses_output(data, data_divided_sents) IO_csv_util.list_to_csv( GUI_util.window, IO_CoNLL_util.sort_output_list('CLAUSE TAGS', clausal_list, documentId_position), clausal_analysis_file_name) column_stats = statistics_csv_util.compute_stats_CoreNLP_tag( clausal_list, 7, "Clause Tags, Frequency", "CLAUSALTAG") clausal_analysis_stats_file_name = IO_files_util.generate_output_file_name( inputFilename, inputDir, outputDir, '.csv', 'CA', 'Clause tags', 'stats') errorFound = IO_csv_util.list_to_csv(GUI_util.window, column_stats, clausal_analysis_stats_file_name) if errorFound == True: return if createExcelCharts == True: Excel_outputFilename = Excel_util.create_excel_chart( GUI_util.window, data_to_be_plotted=[column_stats], inputFilename=clausal_analysis_stats_file_name, outputDir=outputDir, scriptType='CoNLL_Clause', chartTitle="Frequency Distribution of Clause Type", chart_type_list=["pie"], column_xAxis_label="Clause Tags", column_yAxis_label="Frequency") if Excel_outputFilename != "": filesToOpen.append(Excel_outputFilename) # line plot by sentence index Excel_outputFilename = Excel_util.compute_csv_column_frequencies( GUI_util.window, clausal_analysis_file_name, '', outputDir, openOutputFiles, createExcelCharts, [[8, 8]], ['CLAUSE TAGS'], ['FORM', 'Sentence'], ['Document ID', 'Sentence ID'], 'CA', 'line') if len(Excel_outputFilename) > 0: filesToOpen.extend(Excel_outputFilename) # output_df= Excel_util.add_missing_IDs(clausal_analysis_file_name) # # overwrite original file having added any missing document ID and sentence ID # output_df.to_csv(clausal_analysis_file_name,index=False) # columns_to_be_plotted = [[1, 8]] hover_label = ['CLAUSAL TAG-DESCRIPTION'] inputFilename = clausal_analysis_file_name Excel_outputFilename = Excel_util.run_all( columns_to_be_plotted, inputFilename, outputDir, outputFileLabel='CoNLL_Clause', chart_type_list=["line"], chart_title='Frequency of Clause Tags', column_xAxis_label_var='Sentence index', hover_info_column_list=hover_label, count_var=1) if Excel_outputFilename != '': filesToOpen.append(Excel_outputFilename) IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end', 'Finished running CLAUSE ANALYSES at', True) return filesToOpen
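
# Sketch of the tag-counting step that statistics_csv_util.compute_stats_CoreNLP_tag()
# performs on the clause records above: tally how often each tag appears in a given
# column of the CoNLL-style rows (column index 7 matches the call above). The helper
# name count_column_tags() is hypothetical.
from collections import Counter

def count_column_tags(rows, column=7):
    counts = Counter(row[column] for row in rows
                     if len(row) > column and row[column])
    return [['Clause Tags', 'Frequency']] + \
           [[tag, freq] for tag, freq in counts.most_common()]
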
def language_detection(window, inputFilename, inputDir, outputDir, openOutputFiles,
                       createExcelCharts):
    filesToOpen = []
    fileID = 0
    outputFilenameCSV = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', 'lang_detect')
    filesToOpen.append(outputFilenameCSV)
    files = IO_files_util.getFileList(inputFilename, inputDir, '.txt')
    if len(files) == 0:
        return filesToOpen
    if IO_csv_util.openCSVOutputFile(outputFilenameCSV):
        return filesToOpen
    fieldnames = ['LANGDETECT', 'Language', 'Probability',
                  'SPACY', 'Language', 'Probability',
                  'LANGID', 'Language', 'Probability',
                  'Document ID', 'Document']
    config_filename = 'file-spell-checker-config.txt'
    reminders_util.checkReminder(
        config_filename, ['Language detection'],
        'Language detection algorithms are very slow. The NLP Suite runs three different types of algorithms: LANGDETECT, SPACY, and LANGID.\n\nPlease, arm yourself with patience, depending upon the number and size of documents processed.',
        True)
    IO_user_interface_util.timed_alert(
        GUI_util.window, 2000, 'Analysis start',
        'Started running language detection algorithms at', True,
        'You can follow the algorithms in command line.')
    # load the spaCy model and the LANGID identifier once, rather than once per document
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
    lang_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    with open(outputFilenameCSV, 'w', encoding='utf-8', errors='ignore',
              newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer = csv.writer(csvfile)  # reuse a plain csv writer for the data rows
        docErrors_empty = 0
        docErrors_unknown = 0
        filenameSV = ''
        for filename in files:
            fileID = fileID + 1
            head, tail = os.path.split(filename)
            print("Processing file " + str(fileID) + "/" + str(len(files)) + ' ' + tail)
            text = open(filename, 'r', encoding='utf-8', errors='ignore').read()
            if len(text) == 0:
                print("  The file is empty. It will be discarded from processing.")
                docErrors_empty = docErrors_empty + 1
                continue
            # LANGDETECT ----------------------------------------------------------
            try:
                value = detect_langs(text)
            except:
                filenameSV = filename  # do not count the same document twice in the algorithms that follow
                docErrors_unknown = docErrors_unknown + 1
                print("  Unknown file read error.")
                continue
            # e.g., [cs:0.7142840957132709, pl:0.14285810606233737, sk:0.14285779665739756]
            value = str(value[0]).split(':')
            language = value[0]
            probability = value[1]
            print('  LANGDETECT', language, probability)
            currentLine = ['LANGDETECT', language, probability]
            # SPACY ---------------------------------------------------------------
            try:
                doc = nlp(text)
            except:
                if filename != filenameSV:  # do not count the same document twice
                    docErrors_unknown = docErrors_unknown + 1
                    filenameSV = filename
                print("  Unknown file read error.")
                continue
            value = doc._.language  # {'language': 'en', 'score': 0.9999978351575265}
            language = value['language']
            probability = value['score']
            print('  SPACY', language, probability)
            currentLine.extend(['SPACY', language, probability])
            # LANGID --------------------------------------------------------------
            try:
                value = lang_identifier.classify(text)  # ('en', 0.999999999999998)
            except:
                if filename != filenameSV:
                    docErrors_unknown = docErrors_unknown + 1
                    filenameSV = filename
                print("  Unknown file read error.")
                continue
            language = value[0]
            probability = value[1]
            print('  LANGID', language, probability)
            print()
            currentLine.extend(['LANGID', language, probability])
            currentLine.extend([fileID,
                                IO_csv_util.dressFilenameForCSVHyperlink(filename)])
            writer.writerows([currentLine])
            filenameSV = filename
    if docErrors_empty == 0 and docErrors_unknown == 0:
        msg = str(fileID) + ' documents successfully processed for language detection.'
    else:
        msg = ''
        if docErrors_empty > 0:
            msg = str(fileID) + ' documents processed for language detection.\n  ' + \
                  str(docErrors_empty) + ' document(s) found empty.'
        if docErrors_unknown > 0:
            if msg != '':
                msg = msg + '\n  ' + str(docErrors_unknown) + \
                      ' document(s) read with unknown errors.'
            else:
                msg = str(fileID) + ' documents processed for language detection.\n  ' + \
                      str(docErrors_unknown) + ' document(s) read with unknown errors.'
        mb.showwarning(
            title='File read errors',
            message=msg + '\n\nFaulty files are listed in command line/terminal. Please, search for \'File read error\' and inspect each file carefully.')
    if createExcelCharts:
        columns_to_be_plotted = [[1, 1], [4, 4], [7, 7]]
        chart_title = 'Frequency of Languages Detected by 3 Algorithms'
        hover_label = ['LANGDETECT', 'SPACY', 'LANGID']
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted, outputFilenameCSV, outputDir,
            outputFileLabel='_bar_chart',
            chart_type_list=["bar"],
            chart_title=chart_title,
            column_xAxis_label_var='Language',
            hover_info_column_list=hover_label,
            count_var=1)
        if Excel_outputFilename != '':
            filesToOpen.append(Excel_outputFilename)
    if openOutputFiles:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
        filesToOpen = []  # do not open the files twice, here and in the calling function
    return filesToOpen
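
# Stand-alone sketch of the detector pattern above, reduced to the two libraries
# with stable one-call APIs (registering spacy_langdetect's LanguageDetector as a
# pipeline component is spaCy-version dependent and omitted here). Both entry
# points below are the same ones used in language_detection().
from langdetect import detect_langs
from langid.langid import LanguageIdentifier, model

def detect_language_pair(text):
    results = {}
    try:
        best = detect_langs(text)[0]  # best candidate, e.g. [en:0.99999...]
        results['LANGDETECT'] = (best.lang, best.prob)
    except Exception:
        results['LANGDETECT'] = None  # langdetect raises on undetectable text
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    results['LANGID'] = identifier.classify(text)  # e.g. ('en', 0.9999...)
    return results
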
def run(inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts, CoNLL_table_analysis_var, complexity_readability_analysis_var, vocabulary_analysis_var, ngrams_analysis_var, CoNLL_table_analysis_menu_var, complexity_readability_analysis_menu_var, vocabulary_analysis_menu_var, ngrams_analysis_menu_var, gender_guesser_var): filesToOpen = [] # Store all files that are to be opened once finished if (CoNLL_table_analysis_var == False and complexity_readability_analysis_var == False and vocabulary_analysis_var == False and ngrams_analysis_var == False and gender_guesser_var == False): mb.showwarning( 'Warning', 'No options have been selected.\n\nPlease, select an option and try again.' ) return if CoNLL_table_analysis_var == True: withHeader = True recordID_position = 8 documentId_position = 10 data, header = IO_csv_util.get_csv_data(inputFilename, withHeader) if len(data) == 0: return data_divided_sents = IO_CoNLL_util.sentence_division(data) if data_divided_sents == None: return if len(data_divided_sents) == 0: return if 'Clauses' in CoNLL_table_analysis_menu_var: tempfilesToOpen = CoNLL_clause_analysis_util.clause_stats( inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts) # only open the chart files filesToOpen.append(tempfilesToOpen[1]) filesToOpen.append(tempfilesToOpen[3]) elif 'Nouns' in CoNLL_table_analysis_menu_var: tempfilesToOpen = CoNLL_noun_analysis_util.noun_stats( inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts) # only open the chart files filesToOpen.append(tempfilesToOpen[0]) filesToOpen.append(tempfilesToOpen[1]) filesToOpen.append(tempfilesToOpen[2]) filesToOpen.append(tempfilesToOpen[4]) filesToOpen.append(tempfilesToOpen[6]) filesToOpen.append(tempfilesToOpen[8]) elif 'Verbs' in CoNLL_table_analysis_menu_var: tempfilesToOpen = CoNLL_verb_analysis_util.verb_voice_stats( inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts) # only open the chart files filesToOpen.append(tempfilesToOpen[1]) filesToOpen.append(tempfilesToOpen[3]) tempfilesToOpen = CoNLL_verb_analysis_util.verb_modality_stats( inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts) filesToOpen.append(tempfilesToOpen[1]) filesToOpen.append(tempfilesToOpen[3]) tempfilesToOpen = CoNLL_verb_analysis_util.verb_tense_stats( inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts) filesToOpen.append(tempfilesToOpen[1]) filesToOpen.append(tempfilesToOpen[3]) elif 'Function' in CoNLL_table_analysis_menu_var: # only open the chart files import CoNLL_function_words_analysis_util tempfilesToOpen = CoNLL_function_words_analysis_util.article_stats( inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts) filesToOpen.append(tempfilesToOpen[1]) filesToOpen.append(tempfilesToOpen[3]) tempfilesToOpen = CoNLL_function_words_analysis_util.auxiliary_stats( inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts) filesToOpen.append(tempfilesToOpen[1]) filesToOpen.append(tempfilesToOpen[3]) tempfilesToOpen = CoNLL_function_words_analysis_util.conjunction_stats( inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts) filesToOpen.append(tempfilesToOpen[1]) filesToOpen.append(tempfilesToOpen[3]) tempfilesToOpen = CoNLL_function_words_analysis_util.preposition_stats( inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts) 
filesToOpen.append(tempfilesToOpen[1]) filesToOpen.append(tempfilesToOpen[3]) tempfilesToOpen = CoNLL_function_words_analysis_util.pronoun_stats( inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts) filesToOpen.append(tempfilesToOpen[1]) filesToOpen.append(tempfilesToOpen[3]) elif 'POSTAG' in CoNLL_table_analysis_menu_var: mb.showwarning( 'Warning', 'The selected option is not available yet.\n\nSorry!') return elif 'DEPREL' in CoNLL_table_analysis_menu_var: mb.showwarning( 'Warning', 'The selected option is not available yet.\n\nSorry!') return elif 'NER' in CoNLL_table_analysis_menu_var: mb.showwarning( 'Warning', 'The selected option is not available yet.\n\nSorry!') return else: mb.showwarning( 'Warning', 'No option has been selected for CoNLL table analysis.\n\nPlease, select an option and try again.' ) return if complexity_readability_analysis_var == True: if 'Sentence' in complexity_readability_analysis_menu_var: if IO_libraries_util.inputProgramFileCheck( 'statistics_txt_util.py') == False: return filesToOpen = sentence_analysis_util.sentence_complexity( GUI_util.window, inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts) if filesToOpen == None: return elif 'Text' in complexity_readability_analysis_menu_var: if IO_libraries_util.inputProgramFileCheck( 'statistics_txt_util.py') == False: return sentence_analysis_util.sentence_text_readability( GUI_util.window, inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts) elif 'tree' in complexity_readability_analysis_menu_var: if IO_libraries_util.inputProgramFileCheck( 'DependenSee.Jar') == False: return errorFound, error_code, system_output = IO_libraries_util.check_java_installation( 'Sentence structure visualization') if errorFound: return if inputFilename == '' and inputFilename.strip()[-4:] != '.txt': mb.showwarning( title='Input file error', message= 'The Sentence tree viewer script requires a single txt file in input.\n\nPlease, select a txt file and try again.' ) return IO_user_interface_util.timed_alert( GUI_util.window, 2000, 'Analysis start', 'Started running Sentence visualization: Dependency tree viewer (png graphs) at', True, '\n\nYou can follow Sentence Complexity in command line.') subprocess.call( ['java', '-jar', 'DependenSee.Jar', inputFilename, outputDir]) mb.showwarning( title='Analysis end', message= 'Finished running the Dependency tree viewer (png graphs).\n\nMake sure to open the png files in output, one graph for each sentence.' ) else: mb.showwarning( 'Warning', 'No option has been selected for Complex/readability analysis.\n\nPlease, select an option and try again.' ) return if vocabulary_analysis_var == True: if vocabulary_analysis_menu_var == '': mb.showwarning( 'Warning', 'No option has been selected for Vocabulary analysis.\n\nPlease, select an option and try again.' 
            )
            return
        if 'Repetition' in vocabulary_analysis_menu_var:
            mb.showwarning(
                'Warning', 'The selected option is not available yet.\n\nSorry!')
            return
        if '*' == vocabulary_analysis_menu_var:
            # the '*' option runs every vocabulary analysis; extend, rather than
            # overwrite, the list of files collected from the language detection step
            filesToOpen = file_spell_checker_util.language_detection(
                window, inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts)
            tempOutputfile = statistics_txt_util.process_words(
                window, inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts)
            if tempOutputfile != None and len(tempOutputfile) > 0:
                filesToOpen.extend(tempOutputfile)
        elif 'detection' in vocabulary_analysis_menu_var:
            filesToOpen = file_spell_checker_util.language_detection(
                window, inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts)
        elif 'capital' in vocabulary_analysis_menu_var:
            filesToOpen = statistics_txt_util.process_words(
                window, inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, 'capital')
        elif 'Short' in vocabulary_analysis_menu_var:
            filesToOpen = statistics_txt_util.process_words(
                window, inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, 'short')
        elif 'Vowel' in vocabulary_analysis_menu_var:
            filesToOpen = statistics_txt_util.process_words(
                window, inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, 'vowel')
        elif 'Punctuation' in vocabulary_analysis_menu_var:
            filesToOpen = statistics_txt_util.process_words(
                window, inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, 'punctuation')
        if '*' == vocabulary_analysis_menu_var or 'Yule' in vocabulary_analysis_menu_var:
            statistics_txt_util.yule(window, inputFilename, inputDir, outputDir)
        if '*' == vocabulary_analysis_menu_var or 'Unusual' in vocabulary_analysis_menu_var:
            tempFiles = file_spell_checker_util.nltk_unusual_words(
                window, inputFilename, inputDir, outputDir, False, createExcelCharts)
            if len(tempFiles) > 0:
                filesToOpen.extend(tempFiles)
        if '*' == vocabulary_analysis_menu_var or 'Abstract' in vocabulary_analysis_menu_var:
            # ABSTRACT/CONCRETENESS _______________________________________________________
            mode = "both"  # mean, median, both (calculates both mean and median)
            if lib_util.checklibFile(
                    GUI_IO_util.concreteness_libPath + os.sep +
                    'Concreteness_ratings_Brysbaert_et_al_BRM.csv',
                    'concreteness_analysis_util.py') == False:
                return
            if IO_libraries_util.inputProgramFileCheck(
                    'concreteness_analysis_util.py') == False:
                return
            IO_user_interface_util.timed_alert(
                GUI_util.window, 3000, 'Analysis start',
                'Started running CONCRETENESS Analysis at', True)
            if len(inputFilename) > 0:
                outputFilename = IO_files_util.generate_output_file_name(
                    inputFilename, inputDir, outputDir, '.csv', 'SC', 'Concreteness',
                    '', '', '', False, True)
            else:
                outputFilename = IO_files_util.generate_output_file_name(
                    inputDir, inputDir, outputDir, '.csv', 'SC_dir', 'Concreteness',
                    '', '', '', False, True)
            concreteness_analysis_util.main(inputFilename, inputDir, outputDir,
                                            outputFilename, mode)
            filesToOpen.append(outputFilename)
            if createExcelCharts == True:
                if mode == "both":
                    columns_to_be_plotted = [[2, 4], [2, 5]]
                    hover_label = ['Sentence', 'Sentence']
                else:
                    columns_to_be_plotted = [[2, 4]]
                    hover_label = ['Sentence']
                Excel_outputFilename = Excel_util.run_all(
                    columns_to_be_plotted, outputFilename, outputDir,
                    outputFileLabel='Concret',
                    chart_type_list=["line"],
                    chart_title='Concreteness Scores by Sentence Index',
                    column_xAxis_label_var='Sentence index',
                    hover_info_column_list=hover_label,
                    count_var=0,
column_yAxis_label_var='Scores') if Excel_outputFilename != "": filesToOpen.append(Excel_outputFilename) # outputFilename = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir, # outputFilename, chart_type_list=["line"], # chart_title="Concreteness Scores by Sentence Index", # column_xAxis_label_var='Sentence index', # column_yAxis_label_var='Frequency of concreteness scores', # outputExtension='.xlsm', label1='SC', label2='Concreteness', # label3='line', label4='chart', label5='', useTime=False, # disable_suffix=True, # count_var=0, column_yAxis_field_list=[], # reverse_column_position_for_series_label=False, # series_label_list=[''], second_y_var=0, # second_yAxis_label='', hover_var=1, # hover_info_column_list=hover_label) # if outputFilename != "": # filesToOpen.append(outputFilename) IO_user_interface_util.timed_alert( GUI_util.window, 3000, 'Analysis end', 'Finished running CONCRETENESS Analysis at', True) if ngrams_analysis_var == True: if 'Character' in ngrams_analysis_menu_var or 'Word' in ngrams_analysis_menu_var: if 'Character' in ngrams_analysis_menu_var: ngramType = 0 else: ngramType = 1 IO_user_interface_util.timed_alert( GUI_util.window, 3000, 'N-Grams analysis start', 'Started running Word/Characters N-Grams at', True, 'You can follow the script in command line.') # (inputFilename = '' # for now we only process a whole directory if IO_libraries_util.inputProgramFileCheck( 'statistics_txt_util.py') == False: return ngramsNumber = 4 normalize = False excludePunctuation = False statistics_txt_util.compute_character_word_ngrams( GUI_util.window, inputFilename, inputDir, outputDir, ngramsNumber, normalize, excludePunctuation, ngramType, openOutputFiles, createExcelCharts, bySentenceIndex_var) IO_user_interface_util.timed_alert( GUI_util.window, 3000, 'N-Grams analysis end', 'Finished running Word/Characters N-Grams at', True) elif 'Hapax' in ngrams_analysis_menu_var: ngramsNumber = 1 ngramType = 1 normalize = False excludePunctuation = False statistics_txt_util.compute_character_word_ngrams( GUI_util.window, inputFilename, inputDir, outputDir, ngramsNumber, normalize, excludePunctuation, ngramType, openOutputFiles, createExcelCharts, bySentenceIndex_var) elif 'POSTAG' in ngrams_analysis_menu_var: mb.showwarning( 'Warning', 'The selected option is not available yet.\n\nSorry!') return elif 'DEPREL' in ngrams_analysis_menu_var: mb.showwarning( 'Warning', 'The selected option is not available yet.\n\nSorry!') return elif 'NER' in ngrams_analysis_menu_var: mb.showwarning( 'Warning', 'The selected option is not available yet.\n\nSorry!') return else: mb.showwarning( 'Warning', 'No option has been selected for N-grams analysis.\n\nPlease, select an option and try again.' ) return if gender_guesser_var == True: IO_files_util.runScript_fromMenu_option('Gender guesser', 0, inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts) return if openOutputFiles == True: IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
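
# Hedged sketch of the mean/median scoring that concreteness_analysis_util.main()
# is driven with above (mode = "mean", "median", or "both"). The ratings file is
# the Brysbaert et al. (BRM) lexicon referenced above; the column names 'Word' and
# 'Conc.M' are an assumption about its layout, and sentence_concreteness() is a
# hypothetical helper, not the NLP Suite implementation.
import csv
import statistics

def sentence_concreteness(sentence, ratings_path, mode="both"):
    with open(ratings_path, newline='', encoding='utf-8', errors='ignore') as f:
        ratings = {row['Word'].lower(): float(row['Conc.M'])
                   for row in csv.DictReader(f)}
    scores = [ratings[w] for w in sentence.lower().split() if w in ratings]
    if not scores:
        return None  # no rated words in this sentence
    if mode == "mean":
        return statistics.mean(scores)
    if mode == "median":
        return statistics.median(scores)
    return statistics.mean(scores), statistics.median(scores)
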
def compute_corpus_statistics(window, inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts, excludeStopWords=True, lemmatizeWords=True): filesToOpen = [] outputFilenameCSV = IO_files_util.generate_output_file_name( inputFilename, inputDir, outputDir, '.csv', 'corpus', 'stats') filesToOpen.append(outputFilenameCSV) inputDocs = IO_files_util.getFileList(inputFilename, inputDir, fileType='.txt') # read_line(inputFilename, inputDir, outputDir) # return Ndocs = str(len(inputDocs)) fieldnames = [ 'Number of documents in corpus', 'Document ID', 'Document', 'Number of Sentences in Document', 'Number of Words in Document', 'Number of Syllables in Document', 'Word1', 'Frequency1', 'Word2', 'Frequency2', 'Word3', 'Frequency3', 'Word4', 'Frequency4', 'Word5', 'Frequency5', 'Word6', 'Frequency6', 'Word7', 'Frequency7', 'Word8', 'Frequency8', 'Word9', 'Frequency9', 'Word10', 'Frequency10', 'Word11', 'Frequency11', 'Word12', 'Frequency12', 'Word13', 'Frequency13', 'Word14', 'Frequency14', 'Word15', 'Frequency15', 'Word16', 'Frequency16', 'Word17', 'Frequency17', 'Word18', 'Frequency18', 'Word19', 'Frequency19', 'Word20', 'Frequency20' ] if IO_csv_util.openCSVOutputFile(outputFilenameCSV): return IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis start', 'Started running corpus statistics at', True) with open(outputFilenameCSV, 'w', encoding='utf-8', errors='ignore', newline='') as csvfile: writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() #print("Number of corpus text documents: ",Ndocs) #currentLine.append([Ndocs]) index = 0 for doc in inputDocs: head, tail = os.path.split(doc) index = index + 1 # currentLine.append([index]) print("Processing file " + str(index) + "/" + str(Ndocs) + " " + tail) #currentLine.append([doc]) fullText = (open(doc, "r", encoding="utf-8", errors="ignore").read()) Nsentences = str(textstat.sentence_count(fullText)) #print('TOTAL number of sentences: ',Nsentences) Nwords = str(textstat.lexicon_count(fullText, removepunct=True)) #print('TOTAL number of words: ',Nwords) Nsyllables = textstat.syllable_count(fullText, lang='en_US') #print('TOTAL number of Syllables: ',Nsyllables) # words = fullText.split() words = nltk.word_tokenize(fullText) if excludeStopWords: words = excludeStopWords_list(words) if lemmatizeWords: lemmatizer = WordNetLemmatizer() text_vocab = set( lemmatizer.lemmatize(w.lower()) for w in fullText.split(" ") if w.isalpha()) words = set( lemmatizing(w.lower()) for w in words if w.isalpha()) # fullText.split(" ") if w.isalpha()) word_counts = Counter(words) # 20 most frequent words #print("\n\nTOP 20 most frequent words ----------------------------") # for item in word_counts.most_common(20): # print(item) # currentLine=[[Ndocs,index,doc,Nsentences,Nwords,Nsyllables]] currentLine = [[ Ndocs, index, IO_csv_util.dressFilenameForCSVHyperlink(doc), Nsentences, Nwords, Nsyllables ]] for item in word_counts.most_common(20): currentLine[0].append(item[0]) # word currentLine[0].append(item[1]) # frequency writer = csv.writer(csvfile) writer.writerows(currentLine) csvfile.close() # compute statistics about doc length grouped by Document list = ['Document ID'] tempOutputfile = statistics_csv_util.compute_field_statistics_groupBy( window, outputFilenameCSV, outputDir, list, openOutputFiles, createExcelCharts, 4) # ,4) # 'number of words in doc' if tempOutputfile != None: filesToOpen.extend(tempOutputfile) IO_user_interface_util.timed_alert( GUI_util.window, 2000, 'Analysis end', 'Finished running corpus statistics 
at', True) if createExcelCharts == True: columns_to_be_plotted = [[1, 3], [1, 4]] hover_label = ['Document', 'Document'] inputFilename = outputFilenameCSV Excel_outputFilename = Excel_util.run_all( columns_to_be_plotted, inputFilename, outputDir, outputFileLabel='corpus_stats', chart_type_list=["bar"], # chart_title='Corpus statistics\nCorpus directory: '+inputDir, chart_title= 'Corpus Statistics: Frequency of Sentences & Words by Document', column_xAxis_label_var='Document', hover_info_column_list=hover_label) if Excel_outputFilename != "": filesToOpen.append(Excel_outputFilename) # TODO # we should create 10 classes of values by distance to the median of # each value in the Number of Words in Document Col. E # -0-10 11-20 21-30,… 91-100 # and plot them as column charts. if openOutputFiles == True: IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen) return filesToOpen
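
# Compact sketch of the per-document measurements collected above: textstat
# supplies the sentence/word/syllable counts and collections.Counter the top-20
# word list. The textstat calls mirror compute_corpus_statistics(); syllable_count()
# is called here without the lang= keyword used above, which newer textstat
# releases no longer require. Assumes NLTK's 'punkt' tokenizer data is available.
from collections import Counter
import nltk
import textstat

def document_stats(text, top_n=20):
    return {
        'sentences': textstat.sentence_count(text),
        'words': textstat.lexicon_count(text, removepunct=True),
        'syllables': textstat.syllable_count(text),
        'top_words': Counter(w.lower() for w in nltk.word_tokenize(text)
                             if w.isalpha()).most_common(top_n),
    }
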
def main(window, inputDir, inputTargetDir, outputDir, openOutputFiles,
         createExcelCharts, relativity_threshold):
    filesToOpen = []
    # check that the CoreNLP directory has been set up
    CoreNLPdir = IO_libraries_util.get_external_software_dir(
        'file_classifier_NER_util', 'Stanford CoreNLP')
    if CoreNLPdir == None:
        return filesToOpen
    IO_user_interface_util.timed_alert(
        GUI_util.window, 2000, 'Analysis start',
        'Started running the File Classifier by NER values at', True,
        'You can follow the Classifier in command line.\n\nThe script will first build a dictionary of NER values for the documents in each subfolder, then process each unclassified document. Please, be patient.')
    if inputDir[-1] != '/':
        inputDir = inputDir + '/'
    outputFilename = IO_files_util.generate_output_file_name(
        '', inputTargetDir, outputDir, '.csv', 'SSR', 'NER_class', '', '', '',
        False, True)
    filesToOpen.append(outputFilename)
    f = open(outputFilename, 'w', encoding='utf-8', errors='ignore')
    terminal_output = sys.stdout
    sys.stdout = f
    print("Source document,Target directory,Highest index,Relativity index (>" +
          str(relativity_threshold) + "),Outcome")
    actors = load_soc_actors()
    dirs = glob(inputTargetDir + '/*/')
    if dirs == []:
        mb.showwarning(
            "Warning",
            "No target subdirectories.\n\nNo target subdirectories found in\n\n" +
            inputTargetDir +
            "\n\nPlease, check your target directory in the INPUT secondary directory in the IO widgets.")
        filesToOpen = []
        sys.stdout = terminal_output
        return filesToOpen
    nlp = StanfordCoreNLP(CoreNLPdir)
    compare = {}
    num_folder = 0
    sys.stdout = terminal_output
    for dir in dirs:
        print("Processing folder " + str(num_folder + 1) + "/" + str(len(dirs)) +
              "; Folder name: " + dir.split(os.path.sep)[-2])
        compare = get_NER_POSTAG(dir, actors, nlp, compare)
        num_folder += 1
    print("Finished all " + str(num_folder) +
          " folders. Start to process documents now.")
    sys.stdout = f
    # compare stores: key - folder id; value - a set of words
    num_doc, num_unclass, num_class, num_multiclass = find(
        inputDir, actors, nlp, compare, relativity_threshold, f, terminal_output)
    sys.stdout = terminal_output
    mb.showinfo(title="Final results",
                message=str(num_doc) + " SOURCE documents processed\n" +
                str(num_class) + " SOURCE documents classified in TARGET subdirectories\n" +
                str(num_multiclass) + " SOURCE documents classified in MULTIPLE TARGET subdirectories\n" +
                str(num_unclass) + " SOURCE documents unclassified")
    print("Number of unclassified documents processed in input: " + str(num_doc))
    print("Number of classified documents in output: " + str(num_class))
    print("Number of classified documents (with multiple targets) in output: " +
          str(num_multiclass))
    print("Number of unclassified documents in output: " + str(num_unclass))
    nlp.close()
    f.close()
    if createExcelCharts == True:
        columns_to_be_plotted = [[3, 3]]
        hover_label = ''
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted, outputFilename, outputDir,
            outputFileLabel='SSR_NER_home',
            chart_type_list=["pie"],
            chart_title='Frequency Distribution of Find a Home Outcome',
            column_xAxis_label_var='',
            hover_info_column_list=hover_label,
            count_var=1)
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)
    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
        filesToOpen = []  # to avoid opening the files twice, here and in the calling function
    return filesToOpen
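
# Hypothetical sketch of the classification rule implied by the CSV header above
# ("Highest index", "Relativity index"): each target folder contributes a set of
# NER values; a document is assigned to every folder whose overlap with the
# document's own NER values, relative to the best-scoring folder, clears
# relativity_threshold. This is an assumption about find()'s logic, not its code.
def classify_by_ner(doc_ner_values, folder_ner_sets, relativity_threshold):
    scores = {folder: len(doc_ner_values & ner_set)
              for folder, ner_set in folder_ner_sets.items()}
    best = max(scores.values(), default=0)
    if best == 0:
        return []  # unclassified
    return [folder for folder, score in scores.items()
            if score / best >= relativity_threshold]
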
def process_words(window, inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts, processType='', excludeStopWords=True, word_length=3): filesToOpen = [] index = 0 multiple_punctuation = 0 exclamation_punctuation = 0 question_punctuation = 0 punctuation_docs = [] inputDocs = IO_files_util.getFileList(inputFilename, inputDir, fileType='.txt') Ndocs = str(len(inputDocs)) word_list = [] for doc in inputDocs: head, tail = os.path.split(doc) index = index + 1 print("Processing file " + str(index) + "/" + str(Ndocs) + " " + tail) fullText = (open(doc, "r", encoding="utf-8", errors="ignore").read()) # words = fullText.translate(string.punctuation).lower().split() fullText = fullText.replace('\n', ' ') words = fullText.translate(string.punctuation).split() if excludeStopWords: words = excludeStopWords_list(words) if processType != '': hideMessage = False else: hideMessage = True if Ndocs == 1: hideMessage = False else: hideMessage = True if processType == '' or "short" in processType.lower(): header = 'Short words (<4 chars)' fileLabel = 'short_words' # exclude numbers from list word_list = [ word for word in words if word and len(word) <= int(word_length) and word.isalpha() ] filesToOpen = print_results(window, words, word_list, header, inputFilename, outputDir, excludeStopWords, fileLabel, hideMessage, filesToOpen) # filesToOpen.append(outputFilename) if processType == '' or "capital" in processType.lower(): header = 'Initial-capital words' fileLabel = 'init_cap_words' word_list = [word for word in words if word and word[0].isupper()] filesToOpen = print_results(window, words, word_list, header, inputFilename, outputDir, excludeStopWords, fileLabel, hideMessage, filesToOpen) # if outputFilename!='': # filesToOpen.append(outputFilename) if processType == '' or "vowel" in processType.lower(): header = 'Vowel words' fileLabel = 'vowel_words' word_list = [ word for word in words if word and word[0] in "aeiou" and word.isalpha() ] filesToOpen = print_results(window, words, word_list, header, inputFilename, outputDir, excludeStopWords, fileLabel, hideMessage, filesToOpen) # if outputFilename!='': # filesToOpen.append(outputFilename) if processType == '' or "punctuation" in processType.lower(): header = [ 'Word', 'Punctuation symbols of pathos (?!)', 'Document ID', 'Document' ] fileLabel = 'punctuation' for word in words: punctuation = '' character_index = 0 for i in word: if '!' in i or '?' in i: punctuation = word[character_index:len(word)] continue character_index = character_index + 1 if punctuation != '': if doc not in punctuation_docs: punctuation_docs.append(doc) word_list.extend([[ word, punctuation, index, IO_csv_util.dressFilenameForCSVHyperlink(doc) ]]) if '!' in punctuation and '?' in punctuation: multiple_punctuation = multiple_punctuation + 1 elif '!' in punctuation: exclamation_punctuation = exclamation_punctuation + 1 elif '?' in punctuation: question_punctuation = question_punctuation + 1 mb.showinfo(title='Results', message="Combinations of ! and ? punctuation symbols were used " + str(multiple_punctuation) + \ " times.\n\n! punctuation symbols were used " + str(exclamation_punctuation) + \ " times.\n\n? punctuation symbols were used " + str(question_punctuation) + \ " times.\n\n\nPunctuation symbols of pathos (!?) were used in " + str(len(punctuation_docs)) + " separate documents out of " + str(Ndocs) + " documents.\n\nCHECK COMMAND LINE FOR A COPY OF THESE RESULTS.") print("\nCombinations of ! and ? 
punctuation symbols were used " + str(multiple_punctuation) + \ " times.\n\n! punctuation symbols were used " + str(exclamation_punctuation) + \ " times.\n\n? punctuation symbols were used " + str(question_punctuation) + \ " times.\n\nPunctuation symbols of pathos (!?) were used in " + str(len(punctuation_docs)) + " separate documents out of " + str(Ndocs) + " documents.") outputFilename = IO_files_util.generate_output_file_name( inputFilename, inputDir, outputDir, '.csv', fileLabel) word_list.insert(0, header) IO_error = IO_csv_util.list_to_csv(window, word_list, outputFilename) if createExcelCharts == True: columns_to_be_plotted = [[1, 1]] hover_label = [] inputFilename = outputFilename Excel_outputFilename = Excel_util.run_all( columns_to_be_plotted, inputFilename, outputDir, outputFileLabel='punct_stats', chart_type_list=["bar"], # chart_title='Corpus statistics\nCorpus directory: '+inputDir, chart_title='Frequency of Punctuation Symbols of Pathos (?!)', column_xAxis_label_var='Punctuation symbols of pathos (?!)', hover_info_column_list=hover_label, count_var=True) if Excel_outputFilename != "": filesToOpen.append(Excel_outputFilename) # should also provide a bar chart of the frequency of distinct documents by punctuation symbol columns_to_be_plotted = [[2, 2]] hover_label = [] inputFilename = outputFilename Excel_outputFilename = Excel_util.run_all( columns_to_be_plotted, inputFilename, outputDir, outputFileLabel='punct_doc_stats', chart_type_list=["bar"], # chart_title='Corpus statistics\nCorpus directory: '+inputDir, chart_title='Frequency of ' + str(Ndocs) + ' Documents with Punctuation Symbols of Pathos (?!)', column_xAxis_label_var='Punctuation symbols of pathos (?!)', hover_info_column_list=hover_label, count_var=True) if Excel_outputFilename != "": filesToOpen.append(Excel_outputFilename) if not IO_error: filesToOpen.append(outputFilename) return filesToOpen
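
# Sketch of the pathos-punctuation extraction in process_words(): because the
# character loop above stops advancing its index after the first match, its net
# effect is to keep everything from the first '!' or '?' to the end of the token.
# trailing_pathos() is a compact, hypothetical equivalent.
def trailing_pathos(word):
    for i, ch in enumerate(word):
        if ch in '!?':
            return word[i:]
    return ''

# Examples: trailing_pathos('Really?!') -> '?!'; trailing_pathos('plain') -> ''
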
def noun_stats(inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts): # print("\nRun noun analysis") filesToOpen = [] # Store all files that are to be opened once finished IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start', 'Started running NOUN ANALYSES at', True) # TODO: fix postag_list, postag_counter, deprel_list, deprel_counter, ner_list, ner_counter = compute_stats( data) noun_postag, noun_deprel, noun_ner, \ noun_postag_stats, noun_deprel_stats, noun_ner_stats = noun_POSTAG_DEPREL_compute_frequencies(data, data_divided_sents) # output file names noun_postag_file_name = IO_files_util.generate_output_file_name( inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'POSTAG_list') noun_deprel_file_name = IO_files_util.generate_output_file_name( inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'DEPREL_list') noun_ner_file_name = IO_files_util.generate_output_file_name( inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'NER_list') noun_postag_stats_file_name = IO_files_util.generate_output_file_name( inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'POSTAG_stats') noun_deprel_stats_file_name = IO_files_util.generate_output_file_name( inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'DEPREL_stats') noun_ner_stats_file_name = IO_files_util.generate_output_file_name( inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'NER_stats') # save csv files ------------------------------------------------------------------------------------------------- errorFound = IO_csv_util.list_to_csv( GUI_util.window, IO_CoNLL_util.sort_output_list('Noun POS Tags', noun_postag, documentId_position), noun_postag_file_name) if errorFound == True: return filesToOpen filesToOpen.append(noun_postag_file_name) errorFound = IO_csv_util.list_to_csv( GUI_util.window, IO_CoNLL_util.sort_output_list('Noun DEPREL Tags', noun_deprel, documentId_position), noun_deprel_file_name) if errorFound == True: return filesToOpen filesToOpen.append(noun_deprel_file_name) errorFound = IO_csv_util.list_to_csv( GUI_util.window, IO_CoNLL_util.sort_output_list('Noun NER Tags', noun_ner, documentId_position), noun_ner_file_name) if errorFound == True: return filesToOpen filesToOpen.append(noun_ner_file_name) # save csv frequency files ---------------------------------------------------------------------------------------- errorFound = IO_csv_util.list_to_csv(GUI_util.window, noun_postag_stats, noun_postag_stats_file_name) if errorFound == True: return filesToOpen filesToOpen.append(noun_postag_stats_file_name) errorFound = IO_csv_util.list_to_csv(GUI_util.window, noun_deprel_stats, noun_deprel_stats_file_name) if errorFound == True: return filesToOpen filesToOpen.append(noun_deprel_stats_file_name) errorFound = IO_csv_util.list_to_csv(GUI_util.window, noun_ner_stats, noun_ner_stats_file_name) if errorFound == True: return filesToOpen filesToOpen.append(noun_ner_stats_file_name) if createExcelCharts == True: # pie charts ----------------------------------------------------------------------------------------------- Excel_outputFilename = Excel_util.create_excel_chart( GUI_util.window, data_to_be_plotted=[noun_postag_stats], inputFilename=noun_postag_stats_file_name, outputDir=outputDir, scriptType='Nouns_POS', chartTitle="Noun POS Analysis", chart_type_list=["pie"]) if Excel_outputFilename != "": filesToOpen.append(Excel_outputFilename) Excel_outputFilename = Excel_util.create_excel_chart( GUI_util.window, data_to_be_plotted=[noun_deprel_stats], 
inputFilename=noun_deprel_stats_file_name, outputDir=outputDir, scriptType='Nouns_DEPREL', chartTitle="Noun DEPREL Analysis", chart_type_list=["pie"]) if Excel_outputFilename != "": filesToOpen.append(Excel_outputFilename) Excel_outputFilename = Excel_util.create_excel_chart( GUI_util.window, data_to_be_plotted=[noun_ner_stats], inputFilename=noun_ner_stats_file_name, outputDir=outputDir, scriptType='Nouns_DEPREL', chartTitle="Nouns (NER Tags)", chart_type_list=["pie"]) if Excel_outputFilename != "": filesToOpen.append(Excel_outputFilename) # line plots by sentence index ----------------------------------------------------------------------------------------------- outputFiles = Excel_util.compute_csv_column_frequencies( GUI_util.window, noun_postag_file_name, '', outputDir, openOutputFiles, createExcelCharts, [[1, 4]], ['Noun POS Tags'], ['FORM', 'Sentence', 'Document ID', 'Sentence ID', 'Document'], 'NVA', 'line') if len(outputFiles) > 0: filesToOpen.extend(outputFiles) outputFiles = Excel_util.compute_csv_column_frequencies( GUI_util.window, noun_deprel_file_name, '', outputDir, openOutputFiles, createExcelCharts, [[1, 4]], ['Noun DEPREL Tags'], ['FORM', 'Sentence'], ['Document ID', 'Sentence ID', 'Document'], 'NVA', 'line') if len(outputFiles) > 0: filesToOpen.extend(outputFiles) outputFiles = Excel_util.compute_csv_column_frequencies( GUI_util.window, noun_ner_file_name, '', outputDir, openOutputFiles, createExcelCharts, [[1, 4]], ['Noun NER Tags'], ['FORM', 'Sentence'], ['Document ID', 'Sentence ID', 'Document'], 'NVA', 'line') if len(outputFiles) > 0: filesToOpen.extend(outputFiles) IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end', 'Finished running NOUN ANALYSES at', True) return filesToOpen
def run(inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts, doNotListIndividualFiles): global first_section, noun_cnt, nominalized_cnt first_section = re.compile("^(.+?)\.") noun_cnt = Counter() nominalized_cnt = Counter() filesToOpen = [] # Store all files that are to be opened once finished if __name__ == '__main__': nltk.data.path.append('./nltk_data') inputDocs = [] if os.path.isdir(inputDir): for f in os.listdir(inputDir): if f[:2] != '~$' and f[-4:] == '.txt': inputDocs.append(os.path.join(inputDir, f)) if len(inputDocs) == 0: print( "There are no txt files in the input path. The program will exit." ) mb.showwarning( title='No txt files found', message= 'There are no txt files in the selected input directory.\n\nPlease, select a different input directory and try again.' ) return else: inputDocs = [inputFilename] IO_user_interface_util.timed_alert( GUI_util.window, 3000, 'Analysis start', 'Started running Nominalization at', True) #add all into a sum result_dir = [] result_dir.append(["Word", "Is nominalized", "Document"]) docID = 0 result2 = [] result_dir2 = [] counter_nominalized_list = [] counter_nominalized_list.append(['Nominalized verb', 'Frequency']) counter_noun_list = [] counter_noun_list.append(['Noun', 'Frequency']) for doc in inputDocs: docID = docID + 1 print("Processing document", doc, "\n") #open the doc and create the list of result (words, T/F) fin = open(doc, 'r', encoding='utf-8', errors='ignore') # result1 contains the sentence and nominalized values fora a specific document result, result1 = nominalized_verb_detection( docID, doc, fin.read()) # result2 contains the sentence and nominalized values for all documents result2.extend(result1) fin.close() # list all verbs as TRUE/FALSE if nominalized for word, boolean in result: result_dir.append([ word, boolean, IO_csv_util.dressFilenameForCSVHyperlink(doc) ]) result_dir2.extend(result_dir) if len(inputDir) > 0: fname = os.path.basename(os.path.normpath(inputDir)) + "_dir" else: fname = doc # used for both individual files and directories output_filename_bySentenceIndex = IO_files_util.generate_output_file_name( fname, '', outputDir, '.csv', 'NOM', 'sent', '', '', '', False, True) if len(inputDir) == 0 or doNotListIndividualFiles == False: counter_nominalized_list = [] counter_noun_list = [] # refresh the headers counter_nominalized_list.insert( 0, ['Nominalized verb', 'Frequency']) counter_noun_list.insert(0, ['Noun', 'Frequency']) result1.insert(0, [ 'Document ID', 'Document', 'Sentence ID', 'Sentence', 'Number of words in sentence', 'Nominalized verbs', 'Number of nominalizations in sentence', 'Percentage of nominalizations in sentence' ]) # compute frequency of most common nominalized verbs for word, freq in nominalized_cnt.most_common(): counter_nominalized_list.append([word, freq]) # compute frequency of most common nouns for word, freq in noun_cnt.most_common(): counter_noun_list.append([word, freq]) head, fname = os.path.split(doc) fname = fname[:-4] output_filename_noun_frequencies = IO_files_util.generate_output_file_name( fname, '', outputDir, '.csv', 'NOM', 'noun_freq', '', '', '', False, True) filesToOpen.append(output_filename_noun_frequencies) output_filename_nominalized_frequencies = IO_files_util.generate_output_file_name( fname, '', outputDir, '.csv', 'NOM', 'nominal_freq', '', '', '', False, True) filesToOpen.append(output_filename_nominalized_frequencies) # export nominalized verbs list_to_csv(output_filename_nominalized_frequencies, counter_nominalized_list) # export nouns 
            list_to_csv(output_filename_noun_frequencies, counter_noun_list)
            output_filename_TRUE_FALSE = IO_files_util.generate_output_file_name(
                fname + '_TRUE_FALSE', '', outputDir, '.csv', 'NOM', '', '', '', '',
                False, True)
            filesToOpen.append(output_filename_TRUE_FALSE)
            list_to_csv(output_filename_TRUE_FALSE, result)
            filesToOpen.append(output_filename_bySentenceIndex)
            list_to_csv(output_filename_bySentenceIndex, result1)
            if createExcelCharts == True:
                # line chart of nominalizations by sentence index
                columns_to_be_plotted = [[2, 6]]
                chartTitle = 'Nominalized verbs (by Sentence Index)'
                xAxis = 'Sentence index'
                yAxis = 'Number of nominalizations in sentence'
                hover_label = ''
                Excel_outputFilename = Excel_util.run_all(
                    columns_to_be_plotted, output_filename_bySentenceIndex, outputDir,
                    '', chart_type_list=["line"], chart_title=chartTitle,
                    column_xAxis_label_var=xAxis,
                    hover_info_column_list=hover_label,
                    column_yAxis_label_var=yAxis)
                if len(Excel_outputFilename) > 0:
                    filesToOpen.append(Excel_outputFilename)
                # pie chart of nominalized verbs
                Excel_outputFilename = Excel_util.create_excel_chart(
                    GUI_util.window, [counter_nominalized_list], fname, outputDir,
                    'NOM_Verb', "Nominalized verbs", ["pie"])
                if len(Excel_outputFilename) > 0:
                    filesToOpen.append(Excel_outputFilename)
                # pie chart of nouns
                Excel_outputFilename = Excel_util.create_excel_chart(
                    GUI_util.window, [counter_noun_list], fname, outputDir,
                    'NOM_noun', "Nouns", ["pie"])
                if len(Excel_outputFilename) > 0:
                    filesToOpen.append(Excel_outputFilename)
    if len(inputDir) > 0 and doNotListIndividualFiles == True:
        output_filename_TRUE_FALSE_dir = IO_files_util.generate_output_file_name(
            fname + '_TRUE_FALSE', '', outputDir, '.csv', 'NOM', '', '', '', '',
            False, True)
        filesToOpen.append(output_filename_TRUE_FALSE_dir)
        output_filename_dir_noun_frequencies = IO_files_util.generate_output_file_name(
            fname, '', outputDir, '.csv', 'NOM', 'noun_freq', '', '', '', False, True)
        filesToOpen.append(output_filename_dir_noun_frequencies)
        output_filename_dir_nominalized_frequencies = IO_files_util.generate_output_file_name(
            fname, '', outputDir, '.csv', 'NOM', 'nominal_freq', '', '', '', False, True)
        filesToOpen.append(output_filename_dir_nominalized_frequencies)
        result2.insert(0, ['Document ID', 'Document', 'Sentence ID', 'Sentence',
                           'Number of words in sentence', 'Nominalized verbs',
                           'Number of nominalizations in sentence',
                           'Percentage of nominalizations in sentence'])
        list_to_csv(output_filename_bySentenceIndex, result2)
        # list all verbs as TRUE/FALSE if nominalized
        list_to_csv(output_filename_TRUE_FALSE_dir, result_dir2)
        counter_noun_list = [['Noun', 'Frequency']]
        for word, freq in noun_cnt.most_common():
            counter_noun_list.append([word, freq])
        list_to_csv(output_filename_dir_noun_frequencies, counter_noun_list)
        counter_nominalized_list = [['Nominalized verb', 'Frequency']]
        for word, freq in nominalized_cnt.most_common():
            counter_nominalized_list.append([word, freq])
        list_to_csv(output_filename_dir_nominalized_frequencies,
                    counter_nominalized_list)
        if createExcelCharts == True:
            # pie chart of nominalized verbs
            Excel_outputFilename = Excel_util.create_excel_chart(
                GUI_util.window, [counter_nominalized_list],
                output_filename_dir_nominalized_frequencies, outputDir,
                'NOM_verb', "Nominalized verbs", ["pie"])
            if len(Excel_outputFilename) > 0:
                filesToOpen.append(Excel_outputFilename)
            # pie chart of nouns
            Excel_outputFilename = Excel_util.create_excel_chart(
                GUI_util.window, [counter_noun_list],
                output_filename_dir_noun_frequencies, outputDir,
                'NOM_noun', "Nouns", ["pie"])
            if len(Excel_outputFilename) > 0:
                filesToOpen.append(Excel_outputFilename)
    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                       'Finished running Nominalization at', True)
    if openOutputFiles == 1:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
def run(inputDir, outputDir, openOutputFiles, createExcelCharts, OptimizeInterval, numTopics): # to setup environment variable programmatically # https://stackoverflow.com/questions/4906977/how-to-access-environment-variable-values # to get an environment variable # malletEnvDir=os.getenv('', 'MALLET_HOME') # os.environ lists all environment variables # # remove env variable; two alternatives # os.environ.pop("MALLET_HOME") # del os.environ['MALLET_HOME'] # check that the MalletDir as been setup MalletDir = IO_libraries_util.get_external_software_dir( 'topic_modeling_mallet', 'Mallet') if MalletDir == None: return MalletPath = '' try: # if MALLET_HOME has been set up os.getenv returns the Mallet installation path MalletPath = os.getenv('MALLET_HOME', 'MALLET_HOME') if MalletPath == 'MALLET_HOME': # the env variable has not been setup MalletPath = '' mb.showwarning( title='MALLET-HOME environment variable', message= 'The value MALLET-HOME needed by Mallet to run was not found in the environment variables.\n\nThe MALLET_HOME value was added programmatically to your environment variables.\n\nTHIS IS A TEMPORARY FIX VALID FOR RUNNING THE MALLET AS LONG AS THIS GUI REMAINS OPEN. For a more permanent solution, please read the TIPS on Mallet installation and setting Mallet environment variables.' ) # add environment variable os.environ["MALLET_HOME"] = MalletDir else: MalletDir = MalletDir.replace("\\", "/") MalletPath = MalletPath.replace("\\", "/") if str(MalletPath).lower() != str(MalletDir).lower(): # add updated environment variable os.environ["MALLET_HOME"] = MalletDir mb.showwarning( title='Mallet environment variable path update', message= 'The value MALLET-HOME in the environment variables was changed from\n\n ' + MalletPath + '\n\nto\n\n ' + MalletDir) except: mb.showwarning( title='MALLET-HOME environment variable', message= 'The value MALLET-HOME needed by Mallet to run was not found in the environment variables.\n\nThe MALLET_HOME value was added programmatically to your environment variables.\n\nTHIS IS A TEMPORARY FIX VALID FOR RUNNING THE MALLET AS LONG AS THIS GUI REMAINS OPEN. For a more permanent solution, please read the TIPS on Mallet installation and setting Mallet environment variables.' ) MalletDir = MalletDir.replace("\\", "/") MalletPath = MalletPath.replace("\\", "/") if str(MalletPath).lower() != str(MalletDir).lower(): # add environment variable os.environ["MALLET_HOME"] = MalletDir filesToOpen = [] MalletDir = MalletDir + os.sep + 'bin' if ' ' in inputDir: mb.showerror( title='Input file error', message= 'The selected INPUT directory contains a blank (space) in the path. The Mallet code cannot handle input/output paths that contain a space and will break.\n\nPlease, place your input files in a directory with a path containing no spaces and try again.' ) return if ' ' in outputDir: mb.showerror( title='Output file error', message= 'The selected OUTPUT directory contains a blank (space) in the path. The Mallet code cannot handle input/output paths that contain a space and will break.\n\nPlease, select an output directory with a path containing no spaces and try again.' ) return if not os.path.isdir(inputDir): mb.showerror( title='Input directory error', message= 'The selected input directory does NOT exist.\n\nPlease, select a different directory and try again.' ) return if not os.path.isdir(outputDir): mb.showerror( title='Output directory error', message= 'The selected output directory does NOT exist.\n\nPlease, select a different directory and try again.' 
        )
        return
    numFiles = IO_files_util.GetNumberOfDocumentsInDirectory(inputDir, 'txt')
    if numFiles == 0:
        mb.showerror(
            title='Number of files error',
            message='The selected input directory does NOT contain any file of txt type.\n\nPlease, select a different directory and try again.')
        return
    elif numFiles == 1:
        mb.showerror(
            title='Number of files error',
            message='The selected input directory contains only ' + str(numFiles) +
            ' file of txt type.\n\nTopic modeling requires a large number of files to produce valid results. That is true even if the available file contains several different documents merged together.')
        return
    elif numFiles < 10:
        mb.showwarning(
            title='Number of files',
            message='The selected input directory contains only ' + str(numFiles) +
            ' files of txt type.\n\nTopic modeling requires a large number of files to produce valid results.')
    """
    All OUTPUT file names can be changed and Mallet will still run successfully.
    OUTPUT file name extensions for step two can be TXT or CSV.
    """
    # output.mallet
    TXTFiles_MalletFormatted_FileName = os.path.join(
        outputDir, "MalletFormatted_TXTFiles.mallet")
    # output.csv or output.txt
    Composition_FileName = os.path.join(outputDir, "NLP-Mallet_Output_Composition")
    # keys.tsv or keys.txt
    Keys_FileName = os.path.join(outputDir, "NLP-Mallet_Output_Keys.tsv")
    # output.gz
    Compressed_FileName = os.path.join(outputDir, "NLP-Mallet_Output_Compressed.gz")
    """
    The Keys table has as many lines as desired topics and three columns:
    TOPIC #; WEIGHT OF TOPIC, which measures the weight of the topic across all the
    documents; KEY WORDS IN TOPIC, which lists a set of typical words belonging to
    the topic.
    The Composition table has as many lines as documents analyzed (one document per
    line) and several columns: column 1 (Document ID), column 2 (Document with path),
    and as many successive pairs of columns as the number of topics, with column
    pairs as follows: TOPIC is a number corresponding to the number in column 1 of
    the Keys file; PROPORTION measures the % of words in the document attributed to
    that topic (pairs sorted in descending PROPORTION order).
    """
    IO_user_interface_util.timed_alert(
        GUI_util.window, 2000, 'Analysis start',
        'Started running Mallet Topic modeling at ', True,
        "Depending upon corpus size, computations may take a while... Please, be patient...\n\nYou can follow Mallet in command line."
    # FIRST STEP
    # The output file MalletFormatted_TXTFiles.mallet contains all corpus TXT files
    # properly formatted for Mallet
    if platform == "win32":
        subprocess.call([MalletDir + os.sep + 'mallet', 'import-dir',
                         '--input', inputDir,
                         '--output', TXTFiles_MalletFormatted_FileName,
                         '--keep-sequence', '--remove-stopwords'], shell=True)
    # Linux or OS X
    elif platform == "linux" or platform == "linux2" or platform == "darwin":
        subprocess.call([MalletDir + os.sep + 'mallet', 'import-dir',
                         '--input', inputDir,
                         '--output', TXTFiles_MalletFormatted_FileName,
                         '--keep-sequence', '--remove-stopwords'])

    # SECOND STEP
    # The output file Composition_FileName is a tsv file indicating the breakdown,
    #   by percentage, of each topic within each original imported text file.
    # The output file Keys_FileName is a text file showing what the top key words are for each topic.
    # The .gz file contains, in .gz compressed form, every word in your corpus with the topic
    #   associated with each; see www.gzip.org on how to unzip this.
    # Interval optimization leads to better results according to
    #   http://programminghistorian.org/lessons/topic-modeling-and-mallet
    # The real format of the file created by Mallet is .tsv or .txt
    if platform == "win32":
        if OptimizeInterval == True:
            # NOTE: Mallet's --optimize-interval expects an iteration interval (the
            #   programminghistorian lesson uses 20); passing str(numTopics) here
            #   reproduces the original behavior but may not be the intended value
            subprocess.call([MalletDir + os.sep + 'mallet', 'train-topics',
                             '--input', TXTFiles_MalletFormatted_FileName,
                             '--num-topics', str(numTopics),
                             '--optimize-interval', str(numTopics),
                             '--output-state', Compressed_FileName,
                             '--output-topic-keys', Keys_FileName,
                             '--output-doc-topics', Composition_FileName], shell=True)
        else:
            subprocess.call([MalletDir + os.sep + 'mallet', 'train-topics',
                             '--input', TXTFiles_MalletFormatted_FileName,
                             '--num-topics', str(numTopics),
                             '--output-state', Compressed_FileName,
                             '--output-topic-keys', Keys_FileName,
                             '--output-doc-topics', Composition_FileName], shell=True)
    elif platform == "linux" or platform == "linux2" or platform == "darwin":
        if OptimizeInterval == True:
            subprocess.call([MalletDir + os.sep + 'mallet', 'train-topics',
                             '--input', TXTFiles_MalletFormatted_FileName,
                             '--num-topics', str(numTopics),
                             '--optimize-interval', str(numTopics),
                             '--output-state', Compressed_FileName,
                             '--output-topic-keys', Keys_FileName,
                             '--output-doc-topics', Composition_FileName])
        else:
            subprocess.call([MalletDir + os.sep + 'mallet', 'train-topics',
                             '--input', TXTFiles_MalletFormatted_FileName,
                             '--num-topics', str(numTopics),
                             '--output-state', Compressed_FileName,
                             '--output-topic-keys', Keys_FileName,
                             '--output-doc-topics', Composition_FileName])

    IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis end',
                                       'Finished running Mallet Topic modeling at ', True)

    # https://stackoverflow.com/questions/29759305/how-do-i-convert-a-tsv-to-csv
    # convert the tab-delimited Mallet output files to csv;
    # both Keys_FileName and Composition_FileName must be converted
    if (not os.path.isfile(Keys_FileName)) and (not os.path.isfile(Composition_FileName)):
        mb.showwarning(
            title='Mallet FATAL error',
            message='Mallet has not produced the expected Keys and Composition files. It looks like Mallet did NOT run.\n\nPlease, make sure that you have properly edited the environment variables, as described in the TIPS files on Mallet installation and on setting Mallet environment variables.')
        filesToOpen = []
        return
    Keys_FileName = file_type_converter_util.tsv_converter(GUI_util.window, Keys_FileName, outputDir)
    Composition_FileName = file_type_converter_util.tsv_converter(GUI_util.window, Composition_FileName, outputDir)
    filesToOpen.append(Keys_FileName)
    filesToOpen.append(Composition_FileName)

    if createExcelCharts:
        columns_to_be_plotted = [[0, 1]]
        hover_label = [2]
        chartTitle = 'Mallet Topics'
        xAxis = 'Topic #'
        yAxis = 'Topic weight'
        fileName = Keys_FileName
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted, fileName, outputDir, 'Mallet_TM',
            chart_type_list=["bar"],
            chart_title=chartTitle,
            column_xAxis_label_var=xAxis,
            hover_info_column_list=hover_label,
            count_var=0,
            column_yAxis_label_var=yAxis)
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
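# A minimal sketch (not called anywhere above) of how the Mallet Keys file could
# be inspected after a run. It assumes the tab-separated, three-column layout
# described in the Keys/Composition comments of run() above (TOPIC #, WEIGHT OF
# TOPIC, KEY WORDS IN TOPIC), i.e., the file as Mallet writes it, before the
# tsv-to-csv conversion. 'preview_mallet_keys', 'keys_path', and 'top_n' are
# hypothetical names introduced here for illustration only.
def preview_mallet_keys(keys_path, top_n=5):
    import csv
    topics = []
    with open(keys_path, newline='', encoding='utf-8') as f:
        for row in csv.reader(f, delimiter='\t'):
            if len(row) < 3:
                continue  # skip blank or malformed lines
            # row layout: topic number, topic weight, space-separated key words
            topics.append((row[0], float(row[1]), row[2]))
    # sort by descending topic weight and print the heaviest topics
    topics.sort(key=lambda t: -t[1])
    for topic_id, weight, words in topics[:top_n]:
        print(topic_id, round(weight, 4), words)
    return topics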
def run(CoreNLPdir, inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts,
        mean_var, median_var, SA_algorithm_var, memory_var, sentence_index_var,
        shape_of_stories_var):
    # if GUI_util.check_missingIO() == True:
    #     return
    usefile = False
    usedir = False
    flag = ""  # used by CoreNLP
    filesToOpen = []  # Store all files that are to be opened once finished

    if shape_of_stories_var:
        if IO_libraries_util.inputProgramFileCheck('shape_of_stories_main.py') == False:
            return
        call("python shape_of_stories_main.py", shell=True)

    if SA_algorithm_var == '':
        mb.showwarning(
            'Warning',
            "No option has been selected.\n\nPlease, select a Sentiment analysis option and try again.")
        return

    if len(inputFilename) > 3:
        usefile = True
        usedir = False
    if len(inputDir) > 3:
        usefile = False
        usedir = True

    # mean/median options; default to mean when neither is selected
    mode = "both"
    if mean_var == False and median_var == False:
        mode = "mean"
    elif mean_var == True and median_var == False:
        mode = "mean"
    elif mean_var == False and median_var == True:
        mode = "median"
    elif mean_var == True and median_var == True:
        mode = "both"

    SentiWordNet_var = 0
    CoreNLP_var = 0
    hedonometer_var = 0
    vader_var = 0
    anew_var = 0
    if SA_algorithm_var == '*':
        SentiWordNet_var = 1
        CoreNLP_var = 1
        hedonometer_var = 1
        vader_var = 1
        anew_var = 1
    elif SA_algorithm_var == 'Stanford CoreNLP':
        CoreNLP_var = 1
    elif SA_algorithm_var == 'SentiWordNet':
        SentiWordNet_var = 1
    elif SA_algorithm_var == 'ANEW':
        anew_var = 1
    elif SA_algorithm_var == 'hedonometer':
        hedonometer_var = 1
    elif SA_algorithm_var == 'VADER':
        vader_var = 1

    # CORENLP _______________________________________________________
    if CoreNLP_var == 1:
        # check internet connection
        import IO_internet_util
        if not IO_internet_util.check_internet_availability_warning('Stanford CoreNLP Sentiment Analysis'):
            return
        # flag="true": do NOT produce individual output files when processing a directory; only the merged file is produced
        # flag="false" or flag="": ONLY produce individual output files when processing a directory; NO merged file produced
        flag = "false"  # the "true" option does not seem to work
        if IO_libraries_util.inputProgramFileCheck('Stanford_CoreNLP_annotator_util.py') == False:
            return
        # IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Stanford CoreNLP Sentiment Analysis',
        #                                    'Started running Stanford CoreNLP Sentiment Analysis at', True,
        #                                    'You can follow CoreNLP in command line.')
        # @ need an additional variable CoreNLP dir and memory_var @
        # set memory_var if not provided
        if memory_var == 0:
            memory_var = 4
        outputFilename = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
            inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts,
            'sentiment', False, memory_var)
        outputFilename = outputFilename[0]  # annotators return a list, not a string
        if len(outputFilename) > 0:
            filesToOpen.append(outputFilename)
        # @ no longer need to call the java subprocess @
        # subprocess.call(['java', '-jar', 'Stanford_CoreNLP_sentiment_analysis.jar', inputDir, inputFilename, outputDir, flag])
        if not usedir:
            if createExcelCharts == True:
                # CoreNLP only computes mean values
                columns_to_be_plotted = [[2, 4]]
                hover_label = ['Sentence']
                # inputFilename = outputFilename
                Excel_outputFilename = Excel_util.run_all(
                    columns_to_be_plotted, outputFilename, outputDir,
                    outputFileLabel='CoreNLP_sent',
                    chart_type_list=["line"],
                    chart_title='Stanford CoreNLP - Sentiment Scores by Sentence Index',
                    column_xAxis_label_var='Sentence index',
                    hover_info_column_list=hover_label,
                    count_var=0,
                    column_yAxis_label_var='Scores')
                if Excel_outputFilename != "":
                    filesToOpen.append(Excel_outputFilename)
                columns_to_be_plotted = [[5, 5]]
                hover_label = []
                # inputFilename = outputFilename
                Excel_outputFilename = Excel_util.run_all(
                    columns_to_be_plotted, outputFilename, outputDir,
                    outputFileLabel='CoreNLP_SA',
                    chart_type_list=["bar"],
                    chart_title='Stanford CoreNLP - Sentiment Scores',
                    column_xAxis_label_var='Sentiment score',
                    hover_info_column_list=hover_label,
                    count_var=1,
                    column_yAxis_label_var='Scores')
                if Excel_outputFilename != "":
                    filesToOpen.append(Excel_outputFilename)
                # outputFilenameXlsm1 = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir, outputQuotefilePath,
                #                                          chart_type_list=["bar"], chart_title="Stanford CoreNLP (Sentiment Value)",
                #                                          column_xAxis_label_var='Sentiment value',
                #                                          column_yAxis_label_var='Frequency of sentiment value',
                #                                          outputExtension='.xlsm', label1='SC', label2='CoreNLP_Sentiment',
                #                                          label3='bar', label4='chart', label5='', useTime=False,
                #                                          disable_suffix=True, count_var=1, column_yAxis_field_list=[],
                #                                          reverse_column_position_for_series_label=False, series_label_list=[''],
                #                                          second_y_var=0, second_yAxis_label='', hover_info_column_list=hover_label)
        # else:
        #     # open only the merged file
        #     lastPart = os.path.basename(os.path.normpath(inputDir))
        #     outputFilename = IO_files_util.generate_output_file_name(lastPart, outputDir, '.csv', 'SC', 'Sentiment CoreNLP', '', '', '', False, True)
        #     filesToOpen.append(outputFilename)
        # IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end', 'Finished running CoreNLP Sentiment Analysis at', True)

    # HEDONOMETER _______________________________________________________
    if hedonometer_var == 1:
        if lib_util.checklibFile(GUI_IO_util.sentiment_libPath + os.sep + 'hedonometer.json',
                                 'sentiment_analysis_hedonometer_util.py') == False:
            return
        if IO_libraries_util.inputProgramFileCheck('sentiment_analysis_hedonometer_util.py') == False:
            return
        IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start',
                                           'Started running HEDONOMETER Sentiment Analysis at', True)
        if len(inputFilename) > 0:
            fileNamesToPass = []  # LINE ADDED
            outputFilename = IO_files_util.generate_output_file_name(
                inputFilename, inputDir, outputDir, '.csv', 'SC', 'Hedonometer', '', '', '', False, True)
        else:
            outputFilename = IO_files_util.generate_output_file_name(
                inputDir, inputDir, outputDir, '.csv', 'SC_dir', 'Hedonometer', '', '', '', False, True)
        sentiment_analysis_hedonometer_util.main(inputFilename, inputDir, outputDir, outputFilename, mode)
        # tkinter filedialog.askdirectory ALWAYS returns forward slashes /;
        # if you use os.sep you end up mixing the slashes
        # subprocess.call(['python', 'sentiment_analysis_hedonometer_util.py', '--file', inputFilename,
        #                  "--out", outputDir + os.sep, "--mode", mode])
        filesToOpen.append(outputFilename)
        if createExcelCharts == True:
            if mode == "both":
                columns_to_be_plotted = [[2, 4], [2, 6]]
                hover_label = ['Sentence', 'Sentence']
            else:
                columns_to_be_plotted = [[2, 4]]
                hover_label = ['Sentence']
            # inputFilename = outputFilename
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted, outputFilename, outputDir,
                outputFileLabel='Hedo_sent',
                chart_type_list=["line"],
                chart_title='Hedonometer - Sentiment Scores by Sentence Index',
                column_xAxis_label_var='Sentence index',
                hover_info_column_list=hover_label,
                count_var=0,
                column_yAxis_label_var='Scores')
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)
            columns_to_be_plotted = [[5, 5]]
            hover_label = []
            # inputFilename = outputFilename
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted, outputFilename, outputDir,
                outputFileLabel='Hedo_sent',
                chart_type_list=["bar"],
                chart_title='Hedonometer - Sentiment Scores',
                column_xAxis_label_var='Sentiment score',
                hover_info_column_list=hover_label,
                count_var=1,
                column_yAxis_label_var='Scores')
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)
        IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                           'Finished running HEDONOMETER Sentiment Analysis at', True)

    # SentiWordNet _______________________________________________________
    if SentiWordNet_var == 1:
        if IO_libraries_util.inputProgramFileCheck('sentiment_analysis_SentiWordNet_util.py') == False:
            return
        IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start',
                                           'Started running SentiWordNet Sentiment Analysis at', True)
        if len(inputFilename) > 0:
            outputFilename = IO_files_util.generate_output_file_name(
                inputFilename, inputDir, outputDir, '.csv', 'SC', 'SentiWordNet', '', '', '', False, True)
        else:
            outputFilename = IO_files_util.generate_output_file_name(
                inputDir, inputDir, outputDir, '.csv', 'SC_dir', 'SentiWordNet', '', '', '', False, True)
        sentiment_analysis_SentiWordNet_util.main(inputFilename, inputDir, outputDir, outputFilename, mode)
        filesToOpen.append(outputFilename)
        if createExcelCharts == True:
            # SentiWordNet computes a single sentiment score
            columns_to_be_plotted = [[2, 4]]
            hover_label = ['Sentence']
            # inputFilename = outputFilename
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted, outputFilename, outputDir,
                outputFileLabel='SentiWordNet_sent',
                chart_type_list=["line"],
                chart_title='SentiWordNet - Sentiment Scores by Sentence Index',
                column_xAxis_label_var='Sentence index',
                hover_info_column_list=hover_label,
                count_var=0,
                column_yAxis_label_var='Scores')
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)
            columns_to_be_plotted = [[5, 5]]
            hover_label = []
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted, outputFilename, outputDir,
                outputFileLabel='SentiWordNet_sent',
                chart_type_list=["bar"],
                chart_title='SentiWordNet - Sentiment Scores',
                column_xAxis_label_var='Sentiment score',
                hover_info_column_list=hover_label,
                count_var=1,
                column_yAxis_label_var='Scores')
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)
        IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                           'Finished running SentiWordNet Sentiment Analysis at', True)

    # VADER _______________________________________________________
    if vader_var == 1:
        if lib_util.checklibFile(GUI_IO_util.sentiment_libPath + os.sep + 'vader_lexicon.txt',
                                 'sentiment_analysis_VADER_util.py') == False:
            return
        if IO_libraries_util.inputProgramFileCheck('sentiment_analysis_VADER_util.py') == False:
            return
        IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start',
                                           'Started running VADER Sentiment Analysis at', True)
        if len(inputFilename) > 0:
            outputFilename = IO_files_util.generate_output_file_name(
                inputFilename, inputDir, outputDir, '.csv', 'SC', 'VADER', '', '', '', False, True)
        else:
            outputFilename = IO_files_util.generate_output_file_name(
                inputDir, inputDir, outputDir, '.csv', 'SC_dir', 'VADER', '', '', '', False, True)
        sentiment_analysis_VADER_util.main(inputFilename, inputDir, outputDir, outputFilename, mode)
        filesToOpen.append(outputFilename)
        if createExcelCharts == True:
            # VADER does not compute separate mean and median values
            columns_to_be_plotted = [[2, 4]]
            hover_label = ['Sentence']
            # inputFilename = outputFilename
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted, outputFilename, outputDir,
                outputFileLabel='VADER_sent',
                chart_type_list=["line"],
                chart_title='VADER - Sentiment Scores by Sentence Index',
                column_xAxis_label_var='Sentence index',
                hover_info_column_list=hover_label,
                count_var=0,
                column_yAxis_label_var='Scores')
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)
            columns_to_be_plotted = [[5, 5]]
            hover_label = []
            # inputFilename = outputFilename
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted, outputFilename, outputDir,
                outputFileLabel='VADER_sent',
                chart_type_list=["bar"],
                chart_title='VADER - Sentiment Scores',
                column_xAxis_label_var='Sentiment score',
                hover_info_column_list=hover_label,
                count_var=1,
                column_yAxis_label_var='Scores')
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)
        IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                           'Finished running VADER Sentiment Analysis at', True)

    # ANEW _______________________________________________________
    if anew_var == 1:
        if lib_util.checklibFile(GUI_IO_util.sentiment_libPath + os.sep + 'EnglishShortenedANEW.csv',
                                 'sentiment_analysis_ANEW') == False:
            return
        if IO_libraries_util.inputProgramFileCheck('sentiment_analysis_ANEW_util.py') == False:
            return
        IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start',
                                           'Started running ANEW Sentiment Analysis at', True)
        outputFilename = IO_files_util.generate_output_file_name(
            inputFilename, inputDir, outputDir, '.csv', 'SC', 'ANEW', '', '', '', False, True)
        sentiment_analysis_ANEW_util.main(inputFilename, inputDir, outputDir, outputFilename, mode)
        filesToOpen.append(outputFilename)  # open the ANEW output csv, as for the other algorithms
        if createExcelCharts == True:
            # sentiment by sentence index
            if mode == "both":
                columns_to_be_plotted = [[2, 4], [2, 6], [2, 8], [2, 10], [2, 12], [2, 14]]
                hover_label = ['Sentence', 'Sentence', 'Sentence', 'Sentence', 'Sentence', 'Sentence']
            else:
                columns_to_be_plotted = [[2, 4], [2, 6], [2, 8]]
                hover_label = ['Sentence', 'Sentence', 'Sentence']
            # inputFilename = outputFilename
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted, outputFilename, outputDir,
                outputFileLabel='ANEW_sent',
                chart_type_list=["line"],
                chart_title='ANEW - Sentiment Scores by Sentence Index',
                column_xAxis_label_var='Sentence index',
                hover_info_column_list=hover_label,
                count_var=0,
                column_yAxis_label_var='Scores')
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)
            # sentiment bar chart
            if mode == "both":
                columns_to_be_plotted = [[5, 5], [7, 7]]
            else:
                columns_to_be_plotted = [[5, 5]]
            hover_label = []
            # inputFilename = outputFilename
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted, outputFilename, outputDir,
                outputFileLabel='ANEW_sent',
                chart_type_list=["bar"],
                chart_title='ANEW - Sentiment Scores',
                column_xAxis_label_var='Sentiment score',
                hover_info_column_list=hover_label,
                count_var=1,
                column_yAxis_label_var='Scores')
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)
            # arousal
            if mode == "both":
                columns_to_be_plotted = [[9, 9], [11, 11]]
            else:
                columns_to_be_plotted = [[7, 7]]
            hover_label = []
            # inputFilename = outputFilename
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted, outputFilename, outputDir,
                outputFileLabel='ANEW_arous',
                chart_type_list=["bar"],
                chart_title='ANEW - Arousal Scores',
                column_xAxis_label_var='Arousal score',
                hover_info_column_list=hover_label,
                count_var=1,
                column_yAxis_label_var='Scores')
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)
            # dominance
            if mode == "both":
                columns_to_be_plotted = [[13, 13], [15, 15]]
            else:
                columns_to_be_plotted = [[9, 9]]
            hover_label = []
            # inputFilename = outputFilename
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted, outputFilename, outputDir,
                outputFileLabel='ANEW_dom',
                chart_type_list=["bar"],
                chart_title='ANEW - Dominance Scores',
                column_xAxis_label_var='Dominance score',
                hover_info_column_list=hover_label,
                count_var=1,
                column_yAxis_label_var='Scores')
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)
        IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                           'Finished running ANEW Sentiment Analysis at', True)

    if openOutputFiles == True:
        # IO_user_interface_util.timed_alert(GUI_util.window, 5000, 'Warning',
        #                                    'All csv output files have been saved to ' + outputDir)
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
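# A minimal sketch (not called anywhere above) of the mean/median aggregation that
# the 'mode' option asks the individual sentiment scripts to perform on per-sentence
# scores. 'aggregate_sentence_scores' and 'scores' are hypothetical names introduced
# here for illustration; the real computation happens inside the
# sentiment_analysis_*_util modules.
def aggregate_sentence_scores(scores, mode="both"):
    import statistics
    results = {}
    if mode in ("mean", "both"):
        results["mean"] = statistics.mean(scores)
    if mode in ("median", "both"):
        results["median"] = statistics.median(scores)
    return results

# Example: aggregate_sentence_scores([0.2, -0.5, 0.9], "both")
# returns {'mean': 0.2, 'median': 0.2}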
def plagiarist(input_main_dir_path, output_dir_path, open_csv_output_checkbox, createExcelCharts,
               similarityIndex_Plagiarist_var, fileName_embeds_date, DateFormat, DatePosition,
               DateCharacterSeparator):
    filesToOpen = []  # Store all files that are to be opened once finished
    if similarityIndex_Plagiarist_var < .8:
        mb.showwarning(
            title='Similarity Index warning',
            message="The level of similarity was set at " + str(similarityIndex_Plagiarist_var) +
                    ".\n\nCAVEAT! The default threshold for similarity is normally set at 80%.\n\nBe aware that lowering the default level may result in too many documents wrongly classified as similar; conversely, raising the level may exclude too many documents.")
    if IO_libraries_util.inputProgramFileCheck('Lucene.jar') == False:
        return
    if len(DateCharacterSeparator) == 0:
        tk.messagebox.showinfo(
            "Plagiarist",
            "Please, specify the character that separates the date elements embedded in your file names (DateCharacterSeparator) and try again.")
        return
    lib_stopwords = lib_util.check_lib_stopwords()
    if len(lib_stopwords) != 0:
        IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start',
                                           'Started running PLAGIARIST at', True)
        errorFound, error_code, system_output = IO_libraries_util.check_java_installation('Lucene')
        if errorFound:
            return
        subprocess.call(['java', '-jar', 'Lucene.jar',
                         '-inputDir', input_main_dir_path + os.sep,
                         '-outputDir', output_dir_path + os.sep,
                         '-stopword', lib_stopwords,
                         '-embedsDate', str(fileName_embeds_date),
                         '-dateFormat', DateFormat,
                         '-datePos', str(DatePosition),
                         '-itemsDelim', DateCharacterSeparator,
                         '-similarityIndex', str(similarityIndex_Plagiarist_var)])
        filesToOpen.append(output_dir_path + os.sep + "document_duplicates.txt")
        outputFilenameCSV_1 = output_dir_path + os.sep + "Lucene_classes_freq.csv"
        filesToOpen.append(outputFilenameCSV_1)
        if fileName_embeds_date:
            outputFilenameCSV_2 = output_dir_path + os.sep + "Lucene_classes_time_freq.csv"
            filesToOpen.append(outputFilenameCSV_2)
        outputFilenameCSV_3 = output_dir_path + os.sep + "Lucene_document_instance_classes_freq.csv"
        filesToOpen.append(outputFilenameCSV_3)
        outputFilenameCSV_4 = output_dir_path + os.sep + "Lucene_Document_classes_freq.csv"
        group_newspaper(outputFilenameCSV_3, outputFilenameCSV_4)
        filesToOpen.append(outputFilenameCSV_4)

        if createExcelCharts:
            # Lucene_classes_freq.csv; outputFilenameCSV_1
            outputDir = output_dir_path
            inputFilename = outputFilenameCSV_1
            columns_to_be_plotted = [[0, 1]]
            hover_label = ['List of Documents in Category']
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted, inputFilename, outputDir,
                outputFileLabel='SSR_plagiar',
                chart_type_list=["bar"],
                chart_title='Frequency of Plagiarism by Classes of % Duplication',
                column_xAxis_label_var='Classes of percentage duplication',
                hover_info_column_list=hover_label)
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)

            # Lucene_classes_time_freq.csv line plot (temporal plot); outputFilenameCSV_2
            if fileName_embeds_date:
                # columns_to_be_plotted = [[0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [0, 7], [0, 8], [0, 9], [0, 10]]
                # hover_label = ['', '', '', '', '', '', '', '', '', '']
                inputFilename = outputFilenameCSV_2
                columns_to_be_plotted = [[0, 1], [0, 2], [0, 3]]
                hover_label = ['', '', '']
                Excel_outputFilename = Excel_util.run_all(
                    columns_to_be_plotted, inputFilename, outputDir,
                    outputFileLabel='SSR_plagiar',
                    chart_type_list=["line"],
                    chart_title='Frequency of Plagiarism by Year',
                    column_xAxis_label_var='Year',
                    hover_info_column_list=hover_label)
                if Excel_outputFilename != "":
                    filesToOpen.append(Excel_outputFilename)

            # No plot for Lucene_document_instance_classes_freq.csv (outputFilenameCSV_3)
            # because it could potentially have thousands of documents
            # inputFilename = outputFilenameCSV_3

            # Lucene_Document_classes_freq.csv; outputFilenameCSV_4
            columns_to_be_plotted = [[0, 1], [0, 2], [0, 3]]
            hover_label = ['', '', '']
            inputFilename = outputFilenameCSV_4
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted, inputFilename, outputDir,
                outputFileLabel='SSR_plagiar',
                chart_type_list=["bar"],
                chart_title='Frequency of Plagiarism by Document Name & Classes',
                column_xAxis_label_var='',
                hover_info_column_list=hover_label)
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)

        IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                           'Finished running PLAGIARIST at', True)
        # open output files at the end of the analysis
        if open_csv_output_checkbox == True:
            IO_files_util.OpenOutputFiles(GUI_util.window, open_csv_output_checkbox, filesToOpen)
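# A minimal sketch of the kind of per-document grouping that group_newspaper()
# (called above) appears to perform on Lucene_document_instance_classes_freq.csv.
# The real column layout of that file is not shown in this module, so this is an
# illustration under assumed headers ('Document', 'Class', 'Frequency'), with the
# hypothetical name 'group_class_frequencies'; it is NOT a reimplementation of
# group_newspaper.
def group_class_frequencies(input_csv, output_csv):
    import csv
    from collections import defaultdict
    totals = defaultdict(int)
    with open(input_csv, newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            # sum frequencies of each (document, class) pair across instances
            totals[(row['Document'], row['Class'])] += int(row['Frequency'])
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Document', 'Class', 'Frequency'])
        for (doc, cls), freq in sorted(totals.items()):
            writer.writerow([doc, cls, freq])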