def preposition_stats(inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts):
    """Run the PREPOSITION analysis on a CoNLL table.

    Writes a csv list of preposition records and a csv of their frequency
    distribution, optionally builds an Excel pie chart of the distribution
    and a line plot by sentence index.

    inputFilename:      source file the CoNLL data was extracted from
    outputDir:          directory where all output files are written
    data:               CoNLL table records
    data_divided_sents: CoNLL records grouped by sentence
    openOutputFiles:    whether the caller will open the produced files
    createExcelCharts:  whether Excel charts should be generated

    Returns the list of output file paths to open (possibly partial on error).
    """
    filesToOpen = []  # Store all files that are to be opened once finished

    # output file names
    function_words_list_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'FW', 'Prepositions', 'list')
    function_words_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'FW', 'Prepositions', 'stats')

    # guard clause: bail out early when the output directory is invalid
    # (was wrapped in a dead `if 0: ... else:` branch; flattened)
    if not os.path.isdir(outputDir):
        mb.showwarning(title='output file path error',
                       message='Please check OUTPUT DIRECTORY PATH and try again')
        return filesToOpen

    prepositions_list, prepositions_stats = stats_prepositions_output(data, data_divided_sents)

    errorFound = IO_csv_util.list_to_csv(
        GUI_util.window,
        IO_CoNLL_util.sort_output_list('PREPOSITIONS', prepositions_list, documentId_position),
        function_words_list_file_name)
    if errorFound == True:
        return filesToOpen
    filesToOpen.append(function_words_list_file_name)

    errorFound = IO_csv_util.list_to_csv(GUI_util.window, prepositions_stats,
                                         function_words_stats_file_name)
    if errorFound == True:
        return filesToOpen
    filesToOpen.append(function_words_stats_file_name)

    if createExcelCharts == True:
        # pie chart of the preposition frequency distribution
        Excel_outputFilename = Excel_util.create_excel_chart(
            GUI_util.window,
            data_to_be_plotted=[prepositions_stats],
            inputFilename=function_words_stats_file_name,
            outputDir=outputDir,
            scriptType='FuncWords_prep',
            chartTitle="Preposition Analysis",
            chart_type_list=["pie"])
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

        # line plot by sentence index
        outputFiles = Excel_util.compute_csv_column_frequencies(
            GUI_util.window, function_words_list_file_name, '', outputDir,
            openOutputFiles, createExcelCharts,
            [[1, 4]], ['PREPOSITIONS'], ['FORM', 'Sentence'],
            ['Document ID', 'Sentence ID', 'Document'], 'FW', 'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

    return filesToOpen
def verb_voice_stats(inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts):
    """Run the VERB VOICE analysis on a CoNLL table.

    Writes a csv list of verb-voice records and a csv of their frequency
    distribution; optionally builds an Excel pie chart of the distribution
    and a line plot by sentence index.

    Returns the list of output file paths to open (possibly partial on error).
    """
    filesToOpen = []  # Store all files that are to be opened once finished

    data_prep = verb_voice_data_preparation(data)
    voice_list, voice_stats = voice_output(data_prep, data_divided_sents)

    # output file names
    verb_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Verb Voice', 'list')
    verb_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Verb Voice', 'stats')

    errorFound = IO_csv_util.list_to_csv(
        GUI_util.window,
        IO_CoNLL_util.sort_output_list('Verb Voice', voice_list, documentId_position),
        verb_file_name)
    if errorFound == True:
        # FIX: originally a bare `return` (None), unlike every sibling
        # function; return the list so callers can iterate it safely
        return filesToOpen
    filesToOpen.append(verb_file_name)

    errorFound = IO_csv_util.list_to_csv(GUI_util.window, voice_stats, verb_stats_file_name)
    if errorFound == True:
        return filesToOpen
    filesToOpen.append(verb_stats_file_name)

    if createExcelCharts == True:
        # pie chart of the verb-voice frequency distribution
        Excel_outputFilename = Excel_util.create_excel_chart(
            GUI_util.window,
            data_to_be_plotted=[voice_stats],
            inputFilename=verb_stats_file_name,
            outputDir=outputDir,
            scriptType='Verb_Voice',
            chartTitle="Frequency Distribution of Verb Voice",
            chart_type_list=["pie"],
            column_xAxis_label="Verb voice values",
            column_yAxis_label="Frequency")
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

        # line plots by sentence index
        outputFiles = Excel_util.compute_csv_column_frequencies(
            GUI_util.window, verb_file_name, '', outputDir, openOutputFiles,
            createExcelCharts, [[1, 4]], ['Verb Voice'], ['FORM', 'Sentence'],
            ['Document ID', 'Sentence ID', 'Document'], 'NVA', 'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

    return filesToOpen
def Wordnet_bySentenceID(ConnlTable, wordnetDict, outputFilename, outputDir, noun_verb, openOutputFiles, createExcelCharts):
    """Chart WordNet categories by sentence index.

    Filters the CoreNLP CoNLL table (csv at ConnlTable) to nouns or verbs,
    tags each lemma with its WordNet category from a user-supplied
    dictionary csv (expected headers: Word, WordNet Category), fills
    sentence-ID gaps with empty rows so line plots get a continuous x axis,
    writes the result to outputFilename and (optionally) produces line
    charts of category frequencies.

    Returns the list of chart files to open (possibly empty).
    """
    # FIX: filesToOpen was used below but never initialized (NameError
    # whenever createExcelCharts produced output)
    filesToOpen = []

    IO_user_interface_util.timed_alert(
        GUI_util.window, 3000, 'Analysis start',
        'Started running WordNet charts by sentence index at', True)

    if noun_verb == 'NOUN':
        checklist = ['NN', 'NNP', 'NNPS', 'NNS']
    else:
        checklist = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    # read in the CoreNLP CoNLL table
    connl = pd.read_csv(ConnlTable)
    # dictionary used to filter CoNLL values
    # FIX: renamed from `dict`, which shadowed the builtin
    wordnet_dict = pd.read_csv(wordnetDict)

    connl = connl[['word', 'lemma', 'postag', 'Sentence ID', 'Document ID', 'Document']]
    # keep only nouns or verbs, per noun_verb
    connl = connl[connl['postag'].isin(checklist)]
    # eliminate duplicate Word values and align the join column name
    wordnet_dict = wordnet_dict.drop_duplicates().rename(
        columns={'Word': 'lemma', 'WordNet Category': 'Category'})
    connl = connl.merge(wordnet_dict, how='left', on='lemma')
    # lemma not found in the dictionary Word values
    connl.fillna('Not in INPUT dictionary for ' + noun_verb, inplace=True)
    connl = connl[['word', 'lemma', 'postag', 'Category',
                   'Sentence ID', 'Document ID', 'Document']]

    # flatten the DataFrame into a plain list of rows
    Row_list = []
    for index, rows in connl.iterrows():
        # FIX: these column labels contain spaces, so attribute access
        # (rows.SentenceID / rows.DocumentID) raised AttributeError;
        # use item access instead
        Row_list.append([rows['word'], rows['lemma'], rows['postag'],
                         rows['Category'], rows['Sentence ID'],
                         rows['Document ID'], rows['Document']])

    # insert empty filler rows for missing sentence IDs so the line-plot
    # x axis is continuous (index 4 of each row is the Sentence ID)
    for index, row in enumerate(Row_list):
        if index == 0 and Row_list[index][4] != 1:
            for i in range(Row_list[index][4] - 1, 0, -1):
                Row_list.insert(0, ['', '', '', '', i,
                                    Row_list[index][5], Row_list[index][6]])
        else:
            if index < len(Row_list) - 1 and Row_list[index + 1][4] - Row_list[index][4] > 1:
                for i in range(Row_list[index + 1][4] - 1, Row_list[index][4], -1):
                    Row_list.insert(index + 1, ['', '', '', '', i,
                                                Row_list[index][5], Row_list[index][6]])

    # FIX: the seven labels are column headers, not a row index; passing
    # them as index= raised a length mismatch for any row count != 7
    df = pd.DataFrame(Row_list, columns=[
        'word', 'lemma', 'postag', 'WordNet Category',
        'Sentence ID', 'Document ID', 'Document'
    ])
    df = Excel_util.add_missing_IDs(df)
    df.to_csv(outputFilename, index=False)

    if createExcelCharts:
        outputFiles = Excel_util.compute_csv_column_frequencies(
            GUI_util.window, ConnlTable, df, outputDir, openOutputFiles,
            createExcelCharts, [[4, 5]], ['WordNet Category'], ['word'],
            ['Document ID', 'Sentence ID', 'Document'], 'WordNet', 'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

    IO_user_interface_util.timed_alert(
        GUI_util.window, 3000, 'Analysis end',
        'Finished running WordNet charts by sentence index at', True)
    return filesToOpen
def noun_stats(inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts):
    """Run the NOUN analyses on a CoNLL table.

    Produces three record lists (POSTAG, DEPREL, NER) and their frequency
    distributions as csv files; optionally builds Excel pie charts of each
    distribution and line plots by sentence index.

    Returns the list of output file paths to open (possibly partial on error).
    """
    filesToOpen = []  # Store all files that are to be opened once finished
    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start',
                                       'Started running NOUN ANALYSES at', True)

    # TODO: fix — results of compute_stats are currently unused here
    postag_list, postag_counter, deprel_list, deprel_counter, ner_list, ner_counter = compute_stats(data)
    noun_postag, noun_deprel, noun_ner, \
    noun_postag_stats, noun_deprel_stats, noun_ner_stats = noun_POSTAG_DEPREL_compute_frequencies(data, data_divided_sents)

    # output file names
    noun_postag_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'POSTAG_list')
    noun_deprel_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'DEPREL_list')
    noun_ner_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'NER_list')
    noun_postag_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'POSTAG_stats')
    noun_deprel_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'DEPREL_stats')
    noun_ner_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'NER_stats')

    # save csv list files --------------------------------------------------
    errorFound = IO_csv_util.list_to_csv(
        GUI_util.window,
        IO_CoNLL_util.sort_output_list('Noun POS Tags', noun_postag, documentId_position),
        noun_postag_file_name)
    if errorFound == True:
        return filesToOpen
    filesToOpen.append(noun_postag_file_name)

    errorFound = IO_csv_util.list_to_csv(
        GUI_util.window,
        IO_CoNLL_util.sort_output_list('Noun DEPREL Tags', noun_deprel, documentId_position),
        noun_deprel_file_name)
    if errorFound == True:
        return filesToOpen
    filesToOpen.append(noun_deprel_file_name)

    errorFound = IO_csv_util.list_to_csv(
        GUI_util.window,
        IO_CoNLL_util.sort_output_list('Noun NER Tags', noun_ner, documentId_position),
        noun_ner_file_name)
    if errorFound == True:
        return filesToOpen
    filesToOpen.append(noun_ner_file_name)

    # save csv frequency files ---------------------------------------------
    errorFound = IO_csv_util.list_to_csv(GUI_util.window, noun_postag_stats,
                                         noun_postag_stats_file_name)
    if errorFound == True:
        return filesToOpen
    filesToOpen.append(noun_postag_stats_file_name)

    errorFound = IO_csv_util.list_to_csv(GUI_util.window, noun_deprel_stats,
                                         noun_deprel_stats_file_name)
    if errorFound == True:
        return filesToOpen
    filesToOpen.append(noun_deprel_stats_file_name)

    errorFound = IO_csv_util.list_to_csv(GUI_util.window, noun_ner_stats,
                                         noun_ner_stats_file_name)
    if errorFound == True:
        return filesToOpen
    filesToOpen.append(noun_ner_stats_file_name)

    if createExcelCharts == True:
        # pie charts ---------------------------------------------------------
        Excel_outputFilename = Excel_util.create_excel_chart(
            GUI_util.window,
            data_to_be_plotted=[noun_postag_stats],
            inputFilename=noun_postag_stats_file_name,
            outputDir=outputDir,
            scriptType='Nouns_POS',
            chartTitle="Noun POS Analysis",
            chart_type_list=["pie"])
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

        Excel_outputFilename = Excel_util.create_excel_chart(
            GUI_util.window,
            data_to_be_plotted=[noun_deprel_stats],
            inputFilename=noun_deprel_stats_file_name,
            outputDir=outputDir,
            scriptType='Nouns_DEPREL',
            chartTitle="Noun DEPREL Analysis",
            chart_type_list=["pie"])
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

        # NOTE(review): scriptType here repeats 'Nouns_DEPREL' although this
        # is the NER chart — looks like a copy-paste slip; confirm against
        # the scriptType values Excel_util.create_excel_chart recognizes
        Excel_outputFilename = Excel_util.create_excel_chart(
            GUI_util.window,
            data_to_be_plotted=[noun_ner_stats],
            inputFilename=noun_ner_stats_file_name,
            outputDir=outputDir,
            scriptType='Nouns_DEPREL',
            chartTitle="Nouns (NER Tags)",
            chart_type_list=["pie"])
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

        # line plots by sentence index ----------------------------------------
        # FIX: the hover-column and ID-column lists were merged into a single
        # argument, which shifted 'NVA' and 'line' into the wrong positional
        # slots; now matches the two sibling calls below
        outputFiles = Excel_util.compute_csv_column_frequencies(
            GUI_util.window, noun_postag_file_name, '', outputDir,
            openOutputFiles, createExcelCharts, [[1, 4]], ['Noun POS Tags'],
            ['FORM', 'Sentence'], ['Document ID', 'Sentence ID', 'Document'],
            'NVA', 'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

        outputFiles = Excel_util.compute_csv_column_frequencies(
            GUI_util.window, noun_deprel_file_name, '', outputDir,
            openOutputFiles, createExcelCharts, [[1, 4]], ['Noun DEPREL Tags'],
            ['FORM', 'Sentence'], ['Document ID', 'Sentence ID', 'Document'],
            'NVA', 'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

        outputFiles = Excel_util.compute_csv_column_frequencies(
            GUI_util.window, noun_ner_file_name, '', outputDir,
            openOutputFiles, createExcelCharts, [[1, 4]], ['Noun NER Tags'],
            ['FORM', 'Sentence'], ['Document ID', 'Sentence ID', 'Document'],
            'NVA', 'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                       'Finished running NOUN ANALYSES at', True)
    return filesToOpen
def clause_stats(inputFilename, inputDir, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts):
    """Run the CLAUSE analyses on a CoNLL table.

    Writes a csv of all CoNLL table records that carry a clausal tag and a
    csv of the clausal-tag frequency distribution; optionally builds an
    Excel pie chart of the distribution and line plots by sentence index.

    Returns the list of output file paths to open (possibly partial on error).
    """
    filesToOpen = []  # Store all files that are to be opened once finished
    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start',
                                       'Started running CLAUSE ANALYSES at', True)

    # clausal_analysis_file_name contains all the CoNLL table records that
    # have a clausal tag
    clausal_analysis_file_name = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', 'CA', 'Clause tags', 'list')
    filesToOpen.append(clausal_analysis_file_name)

    # guard clause: bail out early when the output directory is invalid
    # (was wrapped in a dead `if 0: ... else:` branch; flattened)
    if not os.path.isdir(outputDir):
        mb.showwarning(title='Output file path error',
                       message='Please check OUTPUT DIRECTORY PATH and try again')
        # FIX: originally a bare `return` (None); return the list so
        # callers can iterate it safely
        return filesToOpen

    clausal_list = stats_clauses_output(data, data_divided_sents)
    IO_csv_util.list_to_csv(
        GUI_util.window,
        IO_CoNLL_util.sort_output_list('CLAUSE TAGS', clausal_list, documentId_position),
        clausal_analysis_file_name)

    # frequency distribution of the clausal tags (field 7 of each record)
    column_stats = statistics_csv_util.compute_stats_CoreNLP_tag(
        clausal_list, 7, "Clause Tags, Frequency", "CLAUSALTAG")
    clausal_analysis_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', 'CA', 'Clause tags', 'stats')
    errorFound = IO_csv_util.list_to_csv(GUI_util.window, column_stats,
                                         clausal_analysis_stats_file_name)
    if errorFound == True:
        # FIX: originally a bare `return` (None); return what was collected
        return filesToOpen

    if createExcelCharts == True:
        # pie chart of the clause-tag frequency distribution
        Excel_outputFilename = Excel_util.create_excel_chart(
            GUI_util.window,
            data_to_be_plotted=[column_stats],
            inputFilename=clausal_analysis_stats_file_name,
            outputDir=outputDir,
            scriptType='CoNLL_Clause',
            chartTitle="Frequency Distribution of Clause Type",
            chart_type_list=["pie"],
            column_xAxis_label="Clause Tags",
            column_yAxis_label="Frequency")
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

        # line plot by sentence index
        outputFiles = Excel_util.compute_csv_column_frequencies(
            GUI_util.window, clausal_analysis_file_name, '', outputDir,
            openOutputFiles, createExcelCharts, [[8, 8]], ['CLAUSE TAGS'],
            ['FORM', 'Sentence'], ['Document ID', 'Sentence ID'], 'CA', 'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

        # line plot of clause-tag frequency against sentence index
        columns_to_be_plotted = [[1, 8]]
        hover_label = ['CLAUSAL TAG-DESCRIPTION']
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted, clausal_analysis_file_name, outputDir,
            outputFileLabel='CoNLL_Clause',
            chart_type_list=["line"],
            chart_title='Frequency of Clause Tags',
            column_xAxis_label_var='Sentence index',
            hover_info_column_list=hover_label,
            count_var=1)
        if Excel_outputFilename != '':
            filesToOpen.append(Excel_outputFilename)

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                       'Finished running CLAUSE ANALYSES at', True)
    return filesToOpen