def preposition_stats(inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts):
    """Run the PREPOSITION analysis on a CoNLL table.

    Writes a csv list of prepositions and a csv of their frequencies, and
    (optionally) an Excel pie chart plus line plots by sentence index.

    Parameters:
        inputFilename: path of the input file (used to build output file names).
        outputDir: directory where output csv/xlsx files are written.
        data: CoNLL-table records.
        data_divided_sents: the same records divided by sentence.
        openOutputFiles: flag passed through to the column-frequency helper.
        createExcelCharts: when truthy, also produce the Excel charts.

    Returns:
        list of output file paths to be opened once finished (possibly
        partial if a csv write fails or outputDir is invalid).
    """
    filesToOpen = []  # Store all files that are to be opened once finished

    # output file names
    function_words_list_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'FW', 'Prepositions', 'list')
    function_words_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'FW', 'Prepositions', 'stats')
    # not necessary to open stats since these stats are included in the pie chart

    # BUG FIX: the original wrapped the live code in the else-arm of an
    # unreachable "if 0: stats_prepositions(data)" branch; dead code removed.
    if not os.path.isdir(outputDir):
        mb.showwarning(title='output file path error',
                       message='Please check OUTPUT DIRECTORY PATH and try again')
        return filesToOpen

    prepositions_list, prepositions_stats = stats_prepositions_output(data, data_divided_sents)

    errorFound = IO_csv_util.list_to_csv(
        GUI_util.window,
        IO_CoNLL_util.sort_output_list('PREPOSITIONS', prepositions_list, documentId_position),
        function_words_list_file_name)
    if errorFound:
        return filesToOpen
    filesToOpen.append(function_words_list_file_name)

    errorFound = IO_csv_util.list_to_csv(GUI_util.window, prepositions_stats,
                                         function_words_stats_file_name)
    if errorFound:
        return filesToOpen
    filesToOpen.append(function_words_stats_file_name)

    if createExcelCharts:
        # pie chart of preposition frequencies
        Excel_outputFilename = Excel_util.create_excel_chart(GUI_util.window,
                                                             data_to_be_plotted=[prepositions_stats],
                                                             inputFilename=function_words_stats_file_name,
                                                             outputDir=outputDir,
                                                             scriptType='FuncWords_prep',
                                                             chartTitle="Preposition Analysis",
                                                             chart_type_list=["pie"])
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

        # line plot by sentence index
        outputFiles = Excel_util.compute_csv_column_frequencies(GUI_util.window,
                                                                function_words_list_file_name,
                                                                '',
                                                                outputDir,
                                                                openOutputFiles, createExcelCharts,
                                                                [[1, 4]],
                                                                ['PREPOSITIONS'], ['FORM', 'Sentence'],
                                                                ['Document ID', 'Sentence ID', 'Document'],
                                                                'FW', 'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

    return filesToOpen
def verb_voice_stats(inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts):
    """Run the VERB VOICE analysis on a CoNLL table.

    Writes a csv list of verb-voice records and a csv of their frequency
    distribution, and (optionally) an Excel pie chart plus line plots by
    sentence index.

    Parameters mirror preposition_stats; see that function.

    Returns:
        list of output file paths to be opened once finished (possibly
        partial if a csv write fails).
    """
    filesToOpen = []  # Store all files that are to be opened once finished

    data_prep = verb_voice_data_preparation(data)
    voice_list, voice_stats = voice_output(data_prep, data_divided_sents)

    # output file names
    verb_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Verb Voice', 'list')
    verb_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Verb Voice', 'stats')

    errorFound = IO_csv_util.list_to_csv(
        GUI_util.window,
        IO_CoNLL_util.sort_output_list('Verb Voice', voice_list, documentId_position),
        verb_file_name)
    if errorFound:
        # BUG FIX: original had a bare "return" here, so callers received
        # None instead of the (partial) list of files, unlike every sibling
        # *_stats function in this file.
        return filesToOpen
    filesToOpen.append(verb_file_name)

    errorFound = IO_csv_util.list_to_csv(GUI_util.window, voice_stats, verb_stats_file_name)
    if errorFound:
        return filesToOpen
    filesToOpen.append(verb_stats_file_name)

    if createExcelCharts:
        # pie chart of the voice frequency distribution
        Excel_outputFilename = Excel_util.create_excel_chart(GUI_util.window,
                                                             data_to_be_plotted=[voice_stats],
                                                             inputFilename=verb_stats_file_name,
                                                             outputDir=outputDir,
                                                             scriptType='Verb_Voice',
                                                             chartTitle="Frequency Distribution of Verb Voice",
                                                             chart_type_list=["pie"],
                                                             column_xAxis_label="Verb voice values",
                                                             column_yAxis_label="Frequency")
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

        # line plots by sentence index
        outputFiles = Excel_util.compute_csv_column_frequencies(GUI_util.window,
                                                                verb_file_name,
                                                                '',
                                                                outputDir,
                                                                openOutputFiles, createExcelCharts,
                                                                [[1, 4]],
                                                                ['Verb Voice'], ['FORM', 'Sentence'],
                                                                ['Document ID', 'Sentence ID', 'Document'],
                                                                'NVA', 'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

    return filesToOpen
def search(searchWord, searchSize, position, inputKWICfile, within, outFile, ranWithCLAs=False):
    """Search a KWIC co-occurrence table for searchWord and export counts.

    Reads the KWIC csv (word1, word2 and the context-position columns),
    keeps the rows whose word1 matches searchWord case-insensitively, sums
    left and/or right context counts per co-occurring word, writes the
    result to outFile, and produces bar charts of left/right-hand tokens.

    Parameters:
        searchWord: keyword to look up in the word1 column.
        searchSize: context window size (number of words on each side).
        position: "left", "right", or anything else for both sides.
        inputKWICfile: csv produced by the KWIC co-occurrence script;
            columns 2..11 are assumed to be left context and 12.. right
            context — TODO confirm against the producer of this file.
        within: 1 to read every column in the window, otherwise only the
            outermost left/right columns.
        outFile: csv path for the search results.
        ranWithCLAs: when False, show the final timed alert.

    Side effects: rebinds the module globals leftKWIC/rightKWIC and appends
    to the module-level filesToOpen list.
    """
    # read in word1, word2, targeted window size columns in the csv file
    if within == 1:
        cols = [0, 1]
        cols.extend(range(12 - searchSize, 12 + searchSize))
        data = pd.read_csv(inputKWICfile, usecols=cols, engine='python')
    else:
        data = pd.read_csv(inputKWICfile, usecols=[0, 1, 12 - searchSize, 11 + searchSize],
                           engine='python')

    # filter out the rows where searchWord is in (case-insensitive match)
    target = data['word1'].str.lower() == searchWord.lower()
    target_rows = data[target]
    rownum = target_rows.shape[0]
    colnum = target_rows.shape[1]

    global leftKWIC, rightKWIC
    leftKWIC = []
    rightKWIC = []
    result = []
    # index of the last column belonging to the left context
    mid = 1 + (colnum - 2) / 2

    i = 0
    # every row refers to a new combination of two words; the counts start
    # over at zero for each row
    while i < rownum:
        countLeft = 0
        countRight = 0
        if position == "left":
            j = 2
            while j <= mid:
                countLeft += target_rows.iloc[i][j]
                j += 1
            if not (countLeft == 0):
                result.append([target_rows.iloc[i]['word1'],
                               target_rows.iloc[i]['word2'], countLeft])
            if countLeft > 0:
                leftKWIC.append([target_rows.iloc[i]['word2'], countLeft])
        elif position == "right":
            j = round(mid + 1)
            while j < colnum:
                countRight += target_rows.iloc[i][j]
                j += 1
            if not (countRight == 0):
                result.append([target_rows.iloc[i]['word1'],
                               target_rows.iloc[i]['word2'], countRight])
            if countRight > 0:
                rightKWIC.append([target_rows.iloc[i]['word2'], countRight])
        else:
            # both sides: j continues from the left sweep into the right sweep
            j = 2
            while j <= mid:
                countLeft += target_rows.iloc[i][j]
                j += 1
            while j < colnum:
                countRight += target_rows.iloc[i][j]
                j += 1
            if not (countLeft == 0 and countRight == 0):
                result.append([target_rows.iloc[i]['word1'],
                               target_rows.iloc[i]['word2'], countLeft, countRight])
            if countLeft > 0:
                leftKWIC.append([target_rows.iloc[i]['word2'], countLeft])
            if countRight > 0:
                rightKWIC.append([target_rows.iloc[i]['word2'], countRight])
        i += 1

    # BUG FIX: the original re-opened outFile with mode 'w' and rewrote the
    # header plus ALL accumulated results on EVERY loop iteration (O(n^2)
    # I/O). Writing once after the loop produces an identical final file.
    # As before, no file is written when there are no matching rows.
    if rownum > 0:
        if position == "left":
            header = ["Searched Key Word", "Word in Context",
                      "Position_left(-" + str(searchSize) + " words)"]
        elif position == "right":
            # NOTE(review): header string lacks a closing ")" in the original;
            # kept byte-identical in case downstream code matches on it.
            header = ["Searched Key Word", "Word in Context",
                      "Position_right(+" + str(searchSize) + " words"]
        else:
            header = ["Searched Key Word", "Word in Context",
                      "Position_left(-" + str(searchSize) + " words)",
                      "Position_right(+" + str(searchSize) + " words"]
        with open(outFile, 'w', encoding="utf-8", errors="ignore", newline='') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(header)
            for items in result:
                writer.writerow(items)

    # TODO: searchWord should be displayed in quotes in the chart
    # TODO: should exclude stopwords (putting a widget on GUI)
    # NOTE(review): filesToOpen, KWIC_search_output_filename and window are
    # module-level globals defined elsewhere in this file.
    filesToOpen.append(KWIC_search_output_filename)

    if position == "left" or position == "both":
        if len(leftKWIC) > 0:
            KWIC_search_output_filename_stats = KWIC_search_output_filename.strip()[:-4] + "_left_counts.xlsx"
            IO_csv_util.list_to_csv(window, leftKWIC, KWIC_search_output_filename_stats)
            # sort will not work with headers; headers inserted after
            leftKWIC = stats_visuals_util.sort_data(leftKWIC, 1, True)
            leftKWIC.insert(0, ["KWIC (left-hand tokens)", "Counts"])
            Excel_util.create_excel_chart(window, "bar", "Left-hand tokens for " + searchWord,
                                          [leftKWIC], KWIC_search_output_filename_stats, 20)
            filesToOpen.append(KWIC_search_output_filename_stats)
        else:
            IO_user_interface_util.timed_alert(window, 3000, 'Searching KWIC Table',
                                               'There are no left-hand words for the searched keyword: ' + searchWord)

    if position == "right" or position == "both":
        if len(rightKWIC) > 0:
            KWIC_search_output_filename_stats = KWIC_search_output_filename.strip()[:-4] + "_right_counts.xlsx"
            IO_csv_util.list_to_csv(window, rightKWIC, KWIC_search_output_filename_stats)
            # sort will not work with headers; headers inserted after
            rightKWIC = stats_visuals_util.sort_data(rightKWIC, 1, True)
            rightKWIC.insert(0, ["KWIC (right-hand tokens)", "Counts"])
            Excel_util.create_excel_chart(window, "bar", "Right-hand tokens for " + searchWord,
                                          [rightKWIC], KWIC_search_output_filename_stats, 20)
            filesToOpen.append(KWIC_search_output_filename_stats)
        else:
            IO_user_interface_util.timed_alert(window, 3000, 'Searching KWIC Table',
                                               'There are no right-hand words for the searched keyword: ' + searchWord)

    if ranWithCLAs == False:
        IO_user_interface_util.timed_alert(window, 3000, 'Searching KWIC Table',
                                           'Finished running KWIC at', True)
def run(inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts, doNotListIndividualFiles):
    """Run the nominalization analysis on one txt file or a directory of txt files.

    For each document, detects nominalized verbs, then exports per-document
    csv files (TRUE/FALSE word list, noun and nominalized-verb frequencies,
    sentence-level counts) and, for directory runs with
    doNotListIndividualFiles, a single set of directory-wide csv files.
    Optionally produces line/pie Excel charts.

    Side effects: rebinds the module globals first_section, noun_cnt and
    nominalized_cnt (the Counters accumulate across all documents).
    """
    global first_section, noun_cnt, nominalized_cnt
    first_section = re.compile("^(.+?)\.")
    noun_cnt = Counter()
    nominalized_cnt = Counter()
    filesToOpen = []  # Store all files that are to be opened once finished

    if __name__ == '__main__':
        nltk.data.path.append('./nltk_data')

    # Build the list of documents to process: every .txt in inputDir
    # (skipping Office lock files "~$..."), or the single inputFilename.
    inputDocs = []
    if os.path.isdir(inputDir):
        for f in os.listdir(inputDir):
            if f[:2] != '~$' and f[-4:] == '.txt':
                inputDocs.append(os.path.join(inputDir, f))
        if len(inputDocs) == 0:
            print("There are no txt files in the input path. The program will exit.")
            mb.showwarning(
                title='No txt files found',
                message='There are no txt files in the selected input directory.\n\nPlease, select a different input directory and try again.')
            return
    else:
        inputDocs = [inputFilename]

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start',
                                       'Started running Nominalization at', True)

    # add all into a sum
    result_dir = [["Word", "Is nominalized", "Document"]]
    docID = 0
    result2 = []      # sentence-level rows accumulated across ALL documents
    result_dir2 = []  # TRUE/FALSE rows accumulated across ALL documents
    counter_nominalized_list = [['Nominalized verb', 'Frequency']]
    counter_noun_list = [['Noun', 'Frequency']]

    for doc in inputDocs:
        docID = docID + 1
        print("Processing document", doc, "\n")
        # open the doc and create the list of result (words, T/F)
        fin = open(doc, 'r', encoding='utf-8', errors='ignore')
        # result1 contains the sentence and nominalized values for a specific document
        result, result1 = nominalized_verb_detection(docID, doc, fin.read())
        # result2 contains the sentence and nominalized values for all documents
        result2.extend(result1)
        fin.close()

        # list all verbs as TRUE/FALSE if nominalized
        for word, boolean in result:
            result_dir.append([word, boolean,
                               IO_csv_util.dressFilenameForCSVHyperlink(doc)])
        # NOTE(review): result_dir keeps growing across documents and is
        # re-extended into result_dir2 on every iteration, so earlier
        # documents' rows are duplicated — behavior kept as-is; verify intent.
        result_dir2.extend(result_dir)

        if len(inputDir) > 0:
            fname = os.path.basename(os.path.normpath(inputDir)) + "_dir"
        else:
            fname = doc
        # used for both individual files and directories
        output_filename_bySentenceIndex = IO_files_util.generate_output_file_name(
            fname, '', outputDir, '.csv', 'NOM', 'sent', '', '', '', False, True)

        if len(inputDir) == 0 or doNotListIndividualFiles == False:
            # per-document exports; refresh the headers
            counter_nominalized_list = [['Nominalized verb', 'Frequency']]
            counter_noun_list = [['Noun', 'Frequency']]
            result1.insert(0, [
                'Document ID', 'Document', 'Sentence ID', 'Sentence',
                'Number of words in sentence', 'Nominalized verbs',
                'Number of nominalizations in sentence',
                'Percentage of nominalizations in sentence'
            ])
            # compute frequency of most common nominalized verbs
            for word, freq in nominalized_cnt.most_common():
                counter_nominalized_list.append([word, freq])
            # compute frequency of most common nouns
            for word, freq in noun_cnt.most_common():
                counter_noun_list.append([word, freq])

            head, fname = os.path.split(doc)
            fname = fname[:-4]  # strip the ".txt" extension
            output_filename_noun_frequencies = IO_files_util.generate_output_file_name(
                fname, '', outputDir, '.csv', 'NOM', 'noun_freq', '', '', '', False, True)
            filesToOpen.append(output_filename_noun_frequencies)
            output_filename_nominalized_frequencies = IO_files_util.generate_output_file_name(
                fname, '', outputDir, '.csv', 'NOM', 'nominal_freq', '', '', '', False, True)
            filesToOpen.append(output_filename_nominalized_frequencies)
            # export nominalized verbs
            list_to_csv(output_filename_nominalized_frequencies, counter_nominalized_list)
            # export nouns
            list_to_csv(output_filename_noun_frequencies, counter_noun_list)

            output_filename_TRUE_FALSE = IO_files_util.generate_output_file_name(
                fname + '_TRUE_FALSE', '', outputDir, '.csv', 'NOM', '', '', '', '', False, True)
            filesToOpen.append(output_filename_TRUE_FALSE)
            list_to_csv(output_filename_TRUE_FALSE, result)

            filesToOpen.append(output_filename_bySentenceIndex)
            list_to_csv(output_filename_bySentenceIndex, result1)

            if createExcelCharts == True:
                # line chart
                columns_to_be_plotted = [[2, 6]]
                chartTitle = 'Nominalized verbs (by Sentence Index)'
                xAxis = 'Sentence index'
                yAxis = 'Number of nominalizations in sentence'
                hover_label = ''
                Excel_outputFilename = Excel_util.run_all(
                    columns_to_be_plotted, output_filename_bySentenceIndex,
                    outputDir, '', chart_type_list=["line"],
                    chart_title=chartTitle, column_xAxis_label_var=xAxis,
                    hover_info_column_list=hover_label,
                    column_yAxis_label_var=yAxis)
                if len(Excel_outputFilename) > 0:
                    filesToOpen.append(Excel_outputFilename)
                # pie chart of nominalized verbs
                Excel_outputFilename = Excel_util.create_excel_chart(
                    GUI_util.window, [counter_nominalized_list], fname, outputDir,
                    'NOM_Verb', "Nominalized verbs", ["pie"])
                if len(Excel_outputFilename) > 0:
                    filesToOpen.append(Excel_outputFilename)
                # pie chart of nouns
                Excel_outputFilename = Excel_util.create_excel_chart(
                    GUI_util.window, [counter_noun_list], fname, outputDir,
                    'NOM_noun', "Nouns", ["pie"])
                if len(Excel_outputFilename) > 0:
                    filesToOpen.append(Excel_outputFilename)

    # directory-wide exports (single set of files for the whole directory)
    if len(inputDir) > 0 and doNotListIndividualFiles == True:
        output_filename_TRUE_FALSE_dir = IO_files_util.generate_output_file_name(
            fname + '_TRUE_FALSE', '', outputDir, '.csv', 'NOM', '', '', '', '', False, True)
        filesToOpen.append(output_filename_TRUE_FALSE_dir)
        output_filename_dir_noun_frequencies = IO_files_util.generate_output_file_name(
            fname, '', outputDir, '.csv', 'NOM', 'noun_freq', '', '', '', False, True)
        filesToOpen.append(output_filename_dir_noun_frequencies)
        output_filename_dir_nominalized_frequencies = IO_files_util.generate_output_file_name(
            fname, '', outputDir, '.csv', 'NOM', 'nominal_freq', '', '', '', False, True)
        filesToOpen.append(output_filename_dir_nominalized_frequencies)

        result2.insert(0, [
            'Document ID', 'Document', 'Sentence ID', 'Sentence',
            'Number of words in sentence', 'Nominalized verbs',
            'Number of nominalizations in sentence',
            'Percentage of nominalizations in sentence'
        ])
        list_to_csv(output_filename_bySentenceIndex, result2)
        # list all verbs as TRUE/FALSE if nominalized
        list_to_csv(output_filename_TRUE_FALSE_dir, result_dir2)

        counter_noun_list = [['Noun', 'Frequency']]
        for word, freq in noun_cnt.most_common():
            counter_noun_list.append([word, freq])
        list_to_csv(output_filename_dir_noun_frequencies, counter_noun_list)

        counter_nominalized_list = [['Nominalized verb', 'Frequency']]
        for word, freq in nominalized_cnt.most_common():
            counter_nominalized_list.append([word, freq])
        list_to_csv(output_filename_dir_nominalized_frequencies, counter_nominalized_list)

        if createExcelCharts == True:
            # pie chart of nominalized verbs
            # BUG FIX: the original read 'NOM_verb' "Nominalized verbs" —
            # the missing comma made Python concatenate the two literals
            # into one argument, silently dropping a positional argument
            # (compare the parallel 'NOM_noun', "Nouns" call below).
            Excel_outputFilename = Excel_util.create_excel_chart(
                GUI_util.window, [counter_nominalized_list],
                output_filename_dir_nominalized_frequencies, outputDir,
                'NOM_verb', "Nominalized verbs", ["pie"])
            if len(Excel_outputFilename) > 0:
                filesToOpen.append(Excel_outputFilename)
            # pie chart of nouns
            Excel_outputFilename = Excel_util.create_excel_chart(
                GUI_util.window, [counter_noun_list],
                output_filename_dir_noun_frequencies, outputDir,
                'NOM_noun', "Nouns", ["pie"])
            if len(Excel_outputFilename) > 0:
                filesToOpen.append(Excel_outputFilename)

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                       'Finished running Nominalization at', True)

    if openOutputFiles == 1:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
def noun_stats(inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts):
    """Run the NOUN analyses (POSTAG, DEPREL, NER) on a CoNLL table.

    Writes three csv lists (nouns by POS tag, dependency relation, NER tag)
    and three frequency csvs, and (optionally) Excel pie charts plus line
    plots by sentence index.

    Returns:
        list of output file paths to be opened once finished (possibly
        partial if a csv write fails).
    """
    filesToOpen = []  # Store all files that are to be opened once finished

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start',
                                       'Started running NOUN ANALYSES at', True)
    # TODO: fix
    postag_list, postag_counter, deprel_list, deprel_counter, ner_list, ner_counter = compute_stats(data)

    noun_postag, noun_deprel, noun_ner, \
        noun_postag_stats, noun_deprel_stats, noun_ner_stats = \
        noun_POSTAG_DEPREL_compute_frequencies(data, data_divided_sents)

    # output file names
    noun_postag_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'POSTAG_list')
    noun_deprel_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'DEPREL_list')
    noun_ner_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'NER_list')
    noun_postag_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'POSTAG_stats')
    noun_deprel_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'DEPREL_stats')
    noun_ner_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'NER_stats')

    # save csv files -------------------------------------------------------
    errorFound = IO_csv_util.list_to_csv(
        GUI_util.window,
        IO_CoNLL_util.sort_output_list('Noun POS Tags', noun_postag, documentId_position),
        noun_postag_file_name)
    if errorFound:
        return filesToOpen
    filesToOpen.append(noun_postag_file_name)

    errorFound = IO_csv_util.list_to_csv(
        GUI_util.window,
        IO_CoNLL_util.sort_output_list('Noun DEPREL Tags', noun_deprel, documentId_position),
        noun_deprel_file_name)
    if errorFound:
        return filesToOpen
    filesToOpen.append(noun_deprel_file_name)

    errorFound = IO_csv_util.list_to_csv(
        GUI_util.window,
        IO_CoNLL_util.sort_output_list('Noun NER Tags', noun_ner, documentId_position),
        noun_ner_file_name)
    if errorFound:
        return filesToOpen
    filesToOpen.append(noun_ner_file_name)

    # save csv frequency files --------------------------------------------
    errorFound = IO_csv_util.list_to_csv(GUI_util.window, noun_postag_stats,
                                         noun_postag_stats_file_name)
    if errorFound:
        return filesToOpen
    filesToOpen.append(noun_postag_stats_file_name)

    errorFound = IO_csv_util.list_to_csv(GUI_util.window, noun_deprel_stats,
                                         noun_deprel_stats_file_name)
    if errorFound:
        return filesToOpen
    filesToOpen.append(noun_deprel_stats_file_name)

    errorFound = IO_csv_util.list_to_csv(GUI_util.window, noun_ner_stats,
                                         noun_ner_stats_file_name)
    if errorFound:
        return filesToOpen
    filesToOpen.append(noun_ner_stats_file_name)

    if createExcelCharts:
        # pie charts -------------------------------------------------------
        Excel_outputFilename = Excel_util.create_excel_chart(
            GUI_util.window,
            data_to_be_plotted=[noun_postag_stats],
            inputFilename=noun_postag_stats_file_name,
            outputDir=outputDir,
            scriptType='Nouns_POS',
            chartTitle="Noun POS Analysis",
            chart_type_list=["pie"])
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

        Excel_outputFilename = Excel_util.create_excel_chart(
            GUI_util.window,
            data_to_be_plotted=[noun_deprel_stats],
            inputFilename=noun_deprel_stats_file_name,
            outputDir=outputDir,
            scriptType='Nouns_DEPREL',
            chartTitle="Noun DEPREL Analysis",
            chart_type_list=["pie"])
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

        # NOTE(review): scriptType 'Nouns_DEPREL' is reused here for the NER
        # chart — looks like a copy-paste slip ('Nouns_NER'?); verify which
        # keys Excel_util.create_excel_chart accepts before changing.
        Excel_outputFilename = Excel_util.create_excel_chart(
            GUI_util.window,
            data_to_be_plotted=[noun_ner_stats],
            inputFilename=noun_ner_stats_file_name,
            outputDir=outputDir,
            scriptType='Nouns_DEPREL',
            chartTitle="Nouns (NER Tags)",
            chart_type_list=["pie"])
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

        # line plots by sentence index --------------------------------------
        # BUG FIX: the original merged the two list arguments into ONE list
        # (['FORM','Sentence','Document ID','Sentence ID','Document']),
        # shifting every following positional argument; the other two calls
        # below (and every sibling *_stats function) pass them separately.
        outputFiles = Excel_util.compute_csv_column_frequencies(
            GUI_util.window, noun_postag_file_name, '', outputDir,
            openOutputFiles, createExcelCharts, [[1, 4]],
            ['Noun POS Tags'], ['FORM', 'Sentence'],
            ['Document ID', 'Sentence ID', 'Document'], 'NVA', 'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

        outputFiles = Excel_util.compute_csv_column_frequencies(
            GUI_util.window, noun_deprel_file_name, '', outputDir,
            openOutputFiles, createExcelCharts, [[1, 4]],
            ['Noun DEPREL Tags'], ['FORM', 'Sentence'],
            ['Document ID', 'Sentence ID', 'Document'], 'NVA', 'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

        outputFiles = Excel_util.compute_csv_column_frequencies(
            GUI_util.window, noun_ner_file_name, '', outputDir,
            openOutputFiles, createExcelCharts, [[1, 4]],
            ['Noun NER Tags'], ['FORM', 'Sentence'],
            ['Document ID', 'Sentence ID', 'Document'], 'NVA', 'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                       'Finished running NOUN ANALYSES at', True)
    return filesToOpen
def clause_stats(inputFilename, inputDir, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts):
    """Run the CLAUSE analyses on a CoNLL table.

    Writes a csv of all CoNLL records carrying a clausal tag plus a csv of
    the clausal-tag frequency distribution, and (optionally) a pie chart
    and line plots of clause tags by sentence index.

    Returns:
        list of output file paths to be opened once finished (possibly
        partial if outputDir is invalid or a csv write fails).
    """
    filesToOpen = []  # Store all files that are to be opened once finished

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start',
                                       'Started running CLAUSE ANALYSES at', True)

    # clausal_analysis_file_name contains all the CoNLL table records that
    # have a clausal tag
    clausal_analysis_file_name = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', 'CA', 'Clause tags', 'list')
    filesToOpen.append(clausal_analysis_file_name)

    # BUG FIX: the original wrapped the live code in the else-arm of an
    # unreachable "if 0: stats_clauses(data)" branch; dead code removed.
    if not os.path.isdir(outputDir):
        mb.showwarning(title='Output file path error',
                       message='Please check OUTPUT DIRECTORY PATH and try again')
        # BUG FIX: original bare "return" handed callers None instead of the
        # list of files, unlike the sibling *_stats functions.
        return filesToOpen

    clausal_list = stats_clauses_output(data, data_divided_sents)

    IO_csv_util.list_to_csv(
        GUI_util.window,
        IO_CoNLL_util.sort_output_list('CLAUSE TAGS', clausal_list, documentId_position),
        clausal_analysis_file_name)

    # frequency distribution of all available clausal tags
    column_stats = statistics_csv_util.compute_stats_CoreNLP_tag(
        clausal_list, 7, "Clause Tags, Frequency", "CLAUSALTAG")
    clausal_analysis_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', 'CA', 'Clause tags', 'stats')
    errorFound = IO_csv_util.list_to_csv(GUI_util.window, column_stats,
                                         clausal_analysis_stats_file_name)
    if errorFound:
        return filesToOpen  # BUG FIX: was a bare "return"

    if createExcelCharts:
        # pie chart of the clause-tag frequency distribution
        Excel_outputFilename = Excel_util.create_excel_chart(
            GUI_util.window,
            data_to_be_plotted=[column_stats],
            inputFilename=clausal_analysis_stats_file_name,
            outputDir=outputDir,
            scriptType='CoNLL_Clause',
            chartTitle="Frequency Distribution of Clause Type",
            chart_type_list=["pie"],
            column_xAxis_label="Clause Tags",
            column_yAxis_label="Frequency")
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

        # line plot by sentence index (returns a list of files)
        outputFiles = Excel_util.compute_csv_column_frequencies(
            GUI_util.window, clausal_analysis_file_name, '', outputDir,
            openOutputFiles, createExcelCharts, [[8, 8]],
            ['CLAUSE TAGS'], ['FORM', 'Sentence'],
            ['Document ID', 'Sentence ID'], 'CA', 'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

        columns_to_be_plotted = [[1, 8]]
        hover_label = ['CLAUSAL TAG-DESCRIPTION']
        inputFilename = clausal_analysis_file_name
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted, inputFilename, outputDir,
            outputFileLabel='CoNLL_Clause',
            chart_type_list=["line"],
            chart_title='Frequency of Clause Tags',
            column_xAxis_label_var='Sentence index',
            hover_info_column_list=hover_label,
            count_var=1)
        if Excel_outputFilename != '':
            filesToOpen.append(Excel_outputFilename)

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                       'Finished running CLAUSE ANALYSES at', True)
    return filesToOpen
def dictionary_items_bySentenceID(window,inputFilename,inputDir, outputDir,createExcelCharts,openOutputFiles=True,input_dictionary_file='',chartTitle=''):
    """Count occurrences of dictionary words in each sentence of the input txt file(s).

    Reads a csv dictionary (one or two columns); for every sentence of every
    input file, records each token that matches a dictionary entry, computes
    per-word frequencies, and writes a csv list plus an Excel bar chart.

    Parameters:
        window: Tk parent window for csv-writing dialogs.
        inputFilename/inputDir: single txt file or directory of txt files.
        outputDir: where the csv/xlsx outputs are written.
        createExcelCharts: accepted but not consulted in this body.
        openOutputFiles: when True, open all produced files at the end.
        input_dictionary_file: csv dictionary; if empty, the user is prompted.
        chartTitle: chart title; defaults to "Dictionary value".
    """
    filesToOpen=[]
    DictionaryList=[]
    file_list = IO_files_util.getFileList(inputFilename, inputDir, '.txt')
    nFile=len(file_list)
    if nFile==0:
        return
    # when running the function w/o a GUI, as currently is mostly the case,
    # we would not be able to pass a dictionary file to the function
    if input_dictionary_file=='':
        initialFolder = os.path.dirname(os.path.abspath(__file__))
        input_dictionary_file = tk.filedialog.askopenfilename(title = "Select dictionary csv file", initialdir = initialFolder, filetypes = [("csv files", "*.csv")])
        if len(input_dictionary_file)==0:
            return
    if IO_csv_util.get_csvfile_numberofColumns(input_dictionary_file) == 2:
        # two-column dictionary: (value, secondary value) pairs
        dic = pd.read_csv(input_dictionary_file)
        dic_value = dic.iloc[:,0].tolist()
        dic_sec_value = dic.iloc[:,1].tolist()
        dic =[(dic_value[i],dic_sec_value[i])for i in range(len(dic_value))]
        if chartTitle=='':
            chartTitle="Dictionary value"
        documentID = 0
        container = []
        for file in file_list:
            documentID+=1
            head, tail = os.path.split(file)
            print("Processing file ", str(documentID),"\\",str(nFile),tail)
            text = (open(file, "r", encoding="utf-8",errors='ignore').read())
            #Process each word in txt
            Sentence_ID = 0
            sentences = tokenize.sent_tokenize(text)
            # word frequency sentenceID DocumentID FileName
            for each_sentence in sentences:
                In = []
                Sentence_ID += 1
                token=nltk.word_tokenize(each_sentence)
                for word in token:
                    for dict_word in dic:
                        if word == dict_word[0].rstrip():
                            In.append([word,dict_word[1],Sentence_ID,each_sentence,documentID,file])
                            break
                        else:
                            continue
                container.extend(In)
        # frequency of each matched word across all files; insert the count
        # into every row, then keep only the first row per distinct word
        # NOTE(review): indentation reconstructed — presumably this runs once
        # after the file loop (running it per sentence would insert the count
        # repeatedly); confirm against the original layout.
        ctr = collections.Counter(Extract(container))
        for word in container:
            word.insert(2,ctr.get(word[0]))
        for word in container:
            if word[0] not in Extract(DictionaryList):
                DictionaryList.append(word)
        DictionaryList.insert(0, ['Dict_value','Dict_second_value', 'Frequency', 'Sentence ID','Sentence','Document ID','Document'])
    else:
        # one-column dictionary: plain list of values
        dic = pd.read_csv(input_dictionary_file)
        dic_value = dic.iloc[:, 0].tolist()
        if chartTitle == '':
            chartTitle = "Dictionary value"
        documentID = 0
        container = []
        for file in file_list:
            documentID += 1
            head, tail = os.path.split(file)
            print("Processing file ", str(documentID), "\\", str(nFile), tail)
            text = (open(file, "r", encoding="utf-8", errors='ignore').read())
            # Process each word in txt
            Sentence_ID = 0
            sentences = tokenize.sent_tokenize(text)
            # word frequency sentenceID DocumentID FileName
            for each_sentence in sentences:
                In = []
                Sentence_ID += 1
                token = nltk.word_tokenize(each_sentence)
                for word in token:
                    for dict_word in dic_value:
                        if word == dict_word.rstrip():
                            In.append([word, Sentence_ID, each_sentence, documentID, file])
                            break
                        else:
                            continue
                container.extend(In)
        # same frequency/dedup pass as the two-column branch (see NOTE above)
        ctr = collections.Counter(Extract(container))
        for word in container:
            word.insert(1, ctr.get(word[0]))
        for word in container:
            if word[0] not in Extract(DictionaryList):
                DictionaryList.append(word)
        DictionaryList.insert(0, ['Dict_value', 'Frequency', 'Sentence ID', 'Sentence', 'Document ID', 'Document'])
    # export the combined list; note that `file` and `Sentence_ID` carry the
    # values from the LAST file/sentence processed — presumably intentional
    # for naming, but verify
    outputFilename=IO_files_util.generate_output_file_name(file, '', outputDir, '.csv', str(Sentence_ID) + '-Dict_value', 'stats', '', '', '', False, True)
    filesToOpen.append(outputFilename)
    IO_csv_util.list_to_csv(window,DictionaryList,outputFilename)
    outputFilename=IO_files_util.generate_output_file_name(file, '', outputDir, '.xlsx', str(Sentence_ID) + '-Dict_value', 'chart', '', '', '', False, True)
    filesToOpen.append(outputFilename)
    # NOTE(review): this call passes fewer arguments than the sibling
    # create_excel_chart calls elsewhere in this file (no outputDir/scriptType
    # positions) — confirm against Excel_util.create_excel_chart's signature.
    Excel_util.create_excel_chart(GUI_util.window,[DictionaryList],outputFilename,chartTitle,["bar"])
    if openOutputFiles==True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
# Written by Yuhang Feng November 2019