def preposition_stats(inputFilename,outputDir,data, data_divided_sents, openOutputFiles,createExcelCharts):
    """Write the preposition list and stats CSV files and, optionally, Excel charts.

    The rows come from ``stats_prepositions_output(data, data_divided_sents)``.
    When ``createExcelCharts`` is True, a pie chart of the stats and a line
    plot computed from the list file are requested from ``Excel_util``;
    ``openOutputFiles`` is only forwarded to that line-plot call.

    Returns:
        The list of output file names produced so far. It is returned early
        when ``outputDir`` is not an existing directory (after a warning box)
        or as soon as a CSV write reports an error.
    """
    produced = []  # every output file created by this run

    # Output file names for the detailed list and for the frequency stats.
    list_csv = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'FW', 'Prepositions', 'list')
    stats_csv = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'FW', 'Prepositions', 'stats')

    # Guard clause: nothing can be written without a valid output directory.
    if not os.path.isdir(outputDir):
        mb.showwarning(title='output file path error', message='Please check OUTPUT DIRECTORY PATH and try again')
        return produced

    prepositions_list, prepositions_stats = stats_prepositions_output(data, data_divided_sents)

    # Detailed list, sorted before being written.
    sorted_rows = IO_CoNLL_util.sort_output_list(
        'PREPOSITIONS', prepositions_list, documentId_position)
    write_failed = IO_csv_util.list_to_csv(GUI_util.window, sorted_rows, list_csv)
    if write_failed == True:
        return produced
    produced.append(list_csv)

    # Frequency stats.
    write_failed = IO_csv_util.list_to_csv(GUI_util.window, prepositions_stats, stats_csv)
    if write_failed == True:
        return produced
    produced.append(stats_csv)

    if createExcelCharts == True:
        # Pie chart of the frequency distribution.
        pie_chart_file = Excel_util.create_excel_chart(
            GUI_util.window,
            data_to_be_plotted=[prepositions_stats],
            inputFilename=stats_csv,
            outputDir=outputDir,
            scriptType='FuncWords_prep',
            chartTitle="Preposition Analysis",
            chart_type_list=["pie"])
        if pie_chart_file != "":
            produced.append(pie_chart_file)

        # Line plot by sentence index.
        line_plot_files = Excel_util.compute_csv_column_frequencies(
            GUI_util.window,
            list_csv,
            '',
            outputDir,
            openOutputFiles, createExcelCharts,
            [[1, 4]],
            ['PREPOSITIONS'], ['FORM', 'Sentence'], ['Document ID', 'Sentence ID', 'Document'],
            'FW', 'line')
        if len(line_plot_files) > 0:
            produced.extend(line_plot_files)

    return produced
Exemple #2
0
def verb_voice_stats(inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts):
    """Write the verb-voice list and stats CSV files and, optionally, Excel charts.

    ``data`` is first passed through ``verb_voice_data_preparation``; the
    result and ``data_divided_sents`` are handed to ``voice_output`` to obtain
    the detailed list and the frequency stats.

    Args:
        inputFilename: Used to build the output file names.
        outputDir: Directory passed to the file-name and chart helpers.
        data: Input forwarded to ``verb_voice_data_preparation``.
        data_divided_sents: Forwarded to ``voice_output``.
        openOutputFiles: Forwarded to ``Excel_util.compute_csv_column_frequencies``.
        createExcelCharts: When True, a pie chart and a line plot are requested.

    Returns:
        The list of output file names produced so far. The list is always
        returned, including when a CSV write reports an error, so callers can
        safely extend their own list with the result.
    """
    filesToOpen = []  # Store all files that are to be opened once finished

    data_prep = verb_voice_data_preparation(data)
    voice_list, voice_stats = voice_output(data_prep, data_divided_sents)

    # output file names
    verb_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Verb Voice', 'list')
    verb_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Verb Voice', 'stats')

    errorFound = IO_csv_util.list_to_csv(
        GUI_util.window,
        IO_CoNLL_util.sort_output_list('Verb Voice', voice_list, documentId_position),
        verb_file_name)
    if errorFound == True:
        # Return the (empty) list rather than None, like every other exit path.
        return filesToOpen
    filesToOpen.append(verb_file_name)

    errorFound = IO_csv_util.list_to_csv(GUI_util.window, voice_stats, verb_stats_file_name)
    if errorFound == True:
        return filesToOpen
    filesToOpen.append(verb_stats_file_name)

    if createExcelCharts == True:
        # pie chart of the frequency distribution
        Excel_outputFilename = Excel_util.create_excel_chart(
            GUI_util.window,
            data_to_be_plotted=[voice_stats],
            inputFilename=verb_stats_file_name,
            outputDir=outputDir,
            scriptType='Verb_Voice',
            chartTitle="Frequency Distribution of Verb Voice",
            chart_type_list=["pie"],
            column_xAxis_label="Verb voice values",
            column_yAxis_label="Frequency")
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

        # line plots by sentence index
        outputFiles = Excel_util.compute_csv_column_frequencies(
            GUI_util.window,
            verb_file_name,
            '',
            outputDir,
            openOutputFiles,
            createExcelCharts,
            [[1, 4]],
            ['Verb Voice'],
            ['FORM', 'Sentence'],
            ['Document ID', 'Sentence ID', 'Document'],
            'NVA', 'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

    return filesToOpen
Exemple #3
0
def _fill_sentence_gaps(rows):
    """Return a new row list with placeholder rows for skipped sentence IDs.

    Each row is ``[word, lemma, postag, category, sentence_id, document_id,
    document]``. Placeholders have empty strings in the first four fields:
    before the first row they cover sentence IDs 1..first-1 (using the first
    row's document fields); between two consecutive rows whose sentence IDs
    differ by more than 1 they cover the skipped IDs (using the earlier row's
    document fields).
    """
    filled = []
    if rows and rows[0][4] != 1:
        first = rows[0]
        filled.extend(['', '', '', '', sentence_id, first[5], first[6]]
                      for sentence_id in range(1, first[4]))
    for position, row in enumerate(rows):
        filled.append(row)
        if position + 1 < len(rows):
            next_sentence_id = rows[position + 1][4]
            if next_sentence_id - row[4] > 1:
                filled.extend(['', '', '', '', sentence_id, row[5], row[6]]
                              for sentence_id in range(row[4] + 1, next_sentence_id))
    return filled


def Wordnet_bySentenceID(ConnlTable, wordnetDict, outputFilename, outputDir,
                         noun_verb, openOutputFiles, createExcelCharts):
    """Tag CoNLL nouns or verbs with a WordNet category and save them by sentence.

    Args:
        ConnlTable: CSV file read with pandas; must contain the columns
            'word', 'lemma', 'postag', 'Sentence ID', 'Document ID', 'Document'.
        wordnetDict: CSV file with the columns 'Word' and 'WordNet Category'.
        outputFilename: CSV file the resulting table is written to.
        outputDir: Forwarded to ``Excel_util.compute_csv_column_frequencies``.
        noun_verb: 'NOUN' keeps noun POS tags; any other value keeps verb tags.
        openOutputFiles: Forwarded to ``Excel_util.compute_csv_column_frequencies``.
        createExcelCharts: When truthy, line charts are requested.

    Returns:
        The list of chart files reported by ``Excel_util``; empty when no chart
        was requested or produced. The CSV written to ``outputFilename`` is
        not added to this list.
    """
    filesToOpen = []  # previously undefined, which raised NameError below

    IO_user_interface_util.timed_alert(
        GUI_util.window, 3000, 'Analysis start',
        'Started running WordNet charts by sentence index at', True)

    if noun_verb == 'NOUN':
        checklist = ['NN', 'NNP', 'NNPS', 'NNS']
    else:
        checklist = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    # read in the CoreNLP CoNLL table
    connl = pd.read_csv(ConnlTable)
    # read in the dictionary file used to look up the category of each lemma
    # (named wordnet_df so the builtin ``dict`` is not shadowed)
    wordnet_df = pd.read_csv(wordnetDict)

    connl = connl[[
        'word', 'lemma', 'postag', 'Sentence ID', 'Document ID', 'Document'
    ]]
    # keep only the rows whose POS tag is in the noun or verb list
    connl = connl[connl['postag'].isin(checklist)]

    # drop exact duplicate dictionary rows and align the column names for the merge
    wordnet_df = wordnet_df.drop_duplicates().rename(columns={
        'Word': 'lemma',
        'WordNet Category': 'Category'
    })
    # left join: every CoNLL row is kept, with its category when the lemma is found
    connl = connl.merge(wordnet_df, how='left', on='lemma')
    # lemmas missing from the dictionary get an explicit label
    # (fillna applies to every column of the merged table, not only 'Category')
    connl = connl.fillna('Not in INPUT dictionary for ' + noun_verb)
    connl = connl[[
        'word', 'lemma', 'postag', 'Category', 'Sentence ID', 'Document ID',
        'Document'
    ]]

    # Plain row lists in column order; positional access avoids attribute
    # lookups, which cannot work for column names containing spaces.
    row_list = _fill_sentence_gaps(connl.values.tolist())

    # The names label the seven columns, hence columns= rather than index=.
    df = pd.DataFrame(row_list,
                      columns=[
                          'word', 'lemma', 'postag', 'WordNet Category',
                          'Sentence ID', 'Document ID', 'Document'
                      ])
    df = Excel_util.add_missing_IDs(df)
    df.to_csv(outputFilename, index=False)

    if createExcelCharts:
        outputFiles = Excel_util.compute_csv_column_frequencies(
            GUI_util.window, ConnlTable, df, outputDir, openOutputFiles,
            createExcelCharts, [[4, 5]], ['WordNet Category'], ['word'],
            ['Document ID', 'Sentence ID', 'Document'], 'WordNet', 'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

    IO_user_interface_util.timed_alert(
        GUI_util.window, 3000, 'Analysis end',
        'Finished running WordNet charts by sentence index at', True)

    return filesToOpen
def noun_stats(inputFilename, outputDir, data, data_divided_sents,
               openOutputFiles, createExcelCharts):
    """Write noun POSTAG/DEPREL/NER list and stats CSV files and optional charts.

    The rows come from
    ``noun_POSTAG_DEPREL_compute_frequencies(data, data_divided_sents)``.
    Three detailed lists are written first (sorted via
    ``IO_CoNLL_util.sort_output_list``), then three frequency tables. When
    ``createExcelCharts`` is True, one pie chart per frequency table and one
    line plot per detailed list are requested from ``Excel_util``;
    ``openOutputFiles`` is only forwarded to the line-plot calls.

    Returns:
        The list of output file names produced so far. It is returned early,
        without the closing alert, as soon as a CSV write reports an error.
    """
    filesToOpen = []  # Store all files that are to be opened once finished

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start',
                                       'Started running NOUN ANALYSES at',
                                       True)  # TODO: fix

    # NOTE(review): the values returned here were never used by this function;
    # the call is kept only in case compute_stats has side effects - confirm
    # and remove if it does not.
    compute_stats(data)

    noun_postag, noun_deprel, noun_ner, \
    noun_postag_stats, noun_deprel_stats, noun_ner_stats = noun_POSTAG_DEPREL_compute_frequencies(
        data, data_divided_sents)

    # output file names
    noun_postag_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'POSTAG_list')
    noun_deprel_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'DEPREL_list')
    noun_ner_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'NER_list')
    noun_postag_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'POSTAG_stats')
    noun_deprel_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'DEPREL_stats')
    noun_ner_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'NER_stats')

    # (sort label / plotted column, detailed rows, list file)
    list_outputs = [
        ('Noun POS Tags', noun_postag, noun_postag_file_name),
        ('Noun DEPREL Tags', noun_deprel, noun_deprel_file_name),
        ('Noun NER Tags', noun_ner, noun_ner_file_name),
    ]
    # (frequency rows, stats file, chart script type, chart title)
    stats_outputs = [
        (noun_postag_stats, noun_postag_stats_file_name,
         'Nouns_POS', "Noun POS Analysis"),
        (noun_deprel_stats, noun_deprel_stats_file_name,
         'Nouns_DEPREL', "Noun DEPREL Analysis"),
        # The NER chart previously reused the 'Nouns_DEPREL' label.
        (noun_ner_stats, noun_ner_stats_file_name,
         'Nouns_NER', "Nouns (NER Tags)"),
    ]

    # save csv files ---------------------------------------------------------

    for label, rows, list_file_name in list_outputs:
        errorFound = IO_csv_util.list_to_csv(
            GUI_util.window,
            IO_CoNLL_util.sort_output_list(label, rows, documentId_position),
            list_file_name)
        if errorFound == True:
            return filesToOpen
        filesToOpen.append(list_file_name)

    # save csv frequency files -----------------------------------------------

    for stats_rows, stats_file_name, _script_type, _title in stats_outputs:
        errorFound = IO_csv_util.list_to_csv(GUI_util.window, stats_rows,
                                             stats_file_name)
        if errorFound == True:
            return filesToOpen
        filesToOpen.append(stats_file_name)

    if createExcelCharts == True:

        # pie charts ---------------------------------------------------------

        for stats_rows, stats_file_name, script_type, title in stats_outputs:
            Excel_outputFilename = Excel_util.create_excel_chart(
                GUI_util.window,
                data_to_be_plotted=[stats_rows],
                inputFilename=stats_file_name,
                outputDir=outputDir,
                scriptType=script_type,
                chartTitle=title,
                chart_type_list=["pie"])
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)

        # line plots by sentence index ---------------------------------------

        for label, _rows, list_file_name in list_outputs:
            # Hover columns and group-by columns are two separate arguments;
            # the POS call used to merge them into one list, shifting the
            # remaining positional arguments.
            outputFiles = Excel_util.compute_csv_column_frequencies(
                GUI_util.window, list_file_name, '', outputDir,
                openOutputFiles, createExcelCharts, [[1, 4]], [label],
                ['FORM', 'Sentence'], ['Document ID', 'Sentence ID', 'Document'],
                'NVA', 'line')
            if len(outputFiles) > 0:
                filesToOpen.extend(outputFiles)

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                       'Finished running NOUN ANALYSES at',
                                       True)

    return filesToOpen
Exemple #5
0
def clause_stats(inputFilename, inputDir, outputDir, data, data_divided_sents,
                 openOutputFiles, createExcelCharts):
    """Write the clause-tag list and stats CSV files and, optionally, Excel charts.

    The detailed rows come from ``stats_clauses_output(data,
    data_divided_sents)`` and the frequency table from
    ``statistics_csv_util.compute_stats_CoreNLP_tag``. When
    ``createExcelCharts`` is True, a pie chart of the frequencies and two line
    charts built from the list file are requested from ``Excel_util``;
    ``openOutputFiles`` is only forwarded to
    ``Excel_util.compute_csv_column_frequencies``.

    Returns:
        The list of output file names produced so far. The list is always
        returned, including on the early exits taken when ``outputDir`` is not
        an existing directory or a CSV write reports an error; those early
        exits skip the closing alert.
    """
    filesToOpen = []  # Store all files that are to be opened once finished

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start',
                                       'Started running CLAUSE ANALYSES at',
                                       True)

    # clausal_analysis_file_name contains all the CoNLL table records that have a clausal tag
    clausal_analysis_file_name = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', 'CA', 'Clause tags',
        'list')

    if not os.path.isdir(outputDir):
        mb.showwarning(
            title='Output file path error',
            message='Please check OUTPUT DIRECTORY PATH and try again')
        return filesToOpen

    clausal_list = stats_clauses_output(data, data_divided_sents)

    errorFound = IO_csv_util.list_to_csv(
        GUI_util.window,
        IO_CoNLL_util.sort_output_list('CLAUSE TAGS', clausal_list,
                                       documentId_position),
        clausal_analysis_file_name)
    if errorFound == True:
        return filesToOpen
    # Listed only once the file has actually been written.
    filesToOpen.append(clausal_analysis_file_name)

    column_stats = statistics_csv_util.compute_stats_CoreNLP_tag(
        clausal_list, 7, "Clause Tags, Frequency", "CLAUSALTAG")

    # NOTE(review): the stats CSV is written but, as before, not added to
    # filesToOpen - presumably because the pie chart shows the same data.
    clausal_analysis_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', 'CA', 'Clause tags',
        'stats')
    errorFound = IO_csv_util.list_to_csv(GUI_util.window, column_stats,
                                         clausal_analysis_stats_file_name)
    if errorFound == True:
        return filesToOpen

    if createExcelCharts == True:
        # pie chart of the frequency distribution
        Excel_outputFilename = Excel_util.create_excel_chart(
            GUI_util.window,
            data_to_be_plotted=[column_stats],
            inputFilename=clausal_analysis_stats_file_name,
            outputDir=outputDir,
            scriptType='CoNLL_Clause',
            chartTitle="Frequency Distribution of Clause Type",
            chart_type_list=["pie"],
            column_xAxis_label="Clause Tags",
            column_yAxis_label="Frequency")
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

        # line plot by sentence index
        outputFiles = Excel_util.compute_csv_column_frequencies(
            GUI_util.window, clausal_analysis_file_name, '', outputDir,
            openOutputFiles, createExcelCharts, [[8, 8]], ['CLAUSE TAGS'],
            ['FORM', 'Sentence'], ['Document ID', 'Sentence ID'], 'CA',
            'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

        # line chart built from the list file; it is passed directly instead
        # of rebinding the inputFilename parameter
        Excel_outputFilename = Excel_util.run_all(
            [[1, 8]],
            clausal_analysis_file_name,
            outputDir,
            outputFileLabel='CoNLL_Clause',
            chart_type_list=["line"],
            chart_title='Frequency of Clause Tags',
            column_xAxis_label_var='Sentence index',
            hover_info_column_list=['CLAUSAL TAG-DESCRIPTION'],
            count_var=1)
        if Excel_outputFilename != '':
            filesToOpen.append(Excel_outputFilename)

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                       'Finished running CLAUSE ANALYSES at',
                                       True)
    return filesToOpen