def preposition_stats(inputFilename,outputDir,data, data_divided_sents, openOutputFiles,createExcelCharts):
    """Export preposition records and their frequencies to CSV, optionally with charts.

    Two CSV files are written in ``outputDir``: the preposition list (after
    being passed through ``IO_CoNLL_util.sort_output_list``) and the
    preposition stats. When ``createExcelCharts`` is True, a pie chart of the
    stats and a line plot computed from the list CSV are requested as well.

    Returns the list of output files produced so far. The list is empty when
    ``outputDir`` is not an existing directory, and partial when one of the
    CSV exports reports an error.
    """
    generated_files = []  # every file that should be opened once finished

    # Output file names are generated before the directory check.
    list_csv_name = IO_files_util.generate_output_file_name(inputFilename, '', outputDir, '.csv', 'FW', 'Prepositions', 'list')
    stats_csv_name = IO_files_util.generate_output_file_name(inputFilename, '', outputDir, '.csv', 'FW', 'Prepositions', 'stats')

    # Guard clause: nothing can be written without a valid output directory.
    if not os.path.isdir(outputDir):
        mb.showwarning(title='output file path error', message='Please check OUTPUT DIRECTORY PATH and try again')
        return generated_files

    prep_rows, prep_frequencies = stats_prepositions_output(data, data_divided_sents)

    # --- CSV with the individual preposition records ---
    write_failed = IO_csv_util.list_to_csv(
        GUI_util.window,
        IO_CoNLL_util.sort_output_list('PREPOSITIONS', prep_rows, documentId_position),
        list_csv_name)
    if write_failed == True:
        return generated_files
    generated_files.append(list_csv_name)

    # --- CSV with the frequency stats ---
    write_failed = IO_csv_util.list_to_csv(GUI_util.window, prep_frequencies, stats_csv_name)
    if write_failed == True:
        return generated_files
    generated_files.append(stats_csv_name)

    # Charts are optional; stop here when they were not requested.
    if not (createExcelCharts == True):
        return generated_files

    # pie chart of the stats
    pie_chart_file = Excel_util.create_excel_chart(
        GUI_util.window,
        data_to_be_plotted=[prep_frequencies],
        inputFilename=stats_csv_name,
        outputDir=outputDir,
        scriptType='FuncWords_prep',
        chartTitle="Preposition Analysis",
        chart_type_list=["pie"])
    if pie_chart_file != "":
        generated_files.append(pie_chart_file)

    # line plot by sentence index
    line_plot_files = Excel_util.compute_csv_column_frequencies(
        GUI_util.window,
        list_csv_name,
        '',
        outputDir,
        openOutputFiles, createExcelCharts,
        [[1, 4]],
        ['PREPOSITIONS'], ['FORM', 'Sentence'], ['Document ID', 'Sentence ID', 'Document'],
        'FW', 'line')
    if len(line_plot_files) > 0:
        generated_files.extend(line_plot_files)

    return generated_files
Esempio n. 2
0
def verb_voice_stats(inputFilename, outputDir, data, data_divided_sents, openOutputFiles, createExcelCharts):
    """Export verb-voice records and their frequencies to CSV, optionally with charts.

    The CoNLL ``data`` is first run through ``verb_voice_data_preparation``
    and ``voice_output``. Two CSV files are then written in ``outputDir``:
    the verb-voice list (after ``IO_CoNLL_util.sort_output_list``) and the
    verb-voice stats. When ``createExcelCharts`` is True, a pie chart of the
    stats and a line plot computed from the list CSV are requested as well.

    Returns:
        list: the output files produced so far. The list is always returned,
        including when a CSV export reports an error (in which case it only
        holds the files written before the failure).
    """
    filesToOpen = []  # Store all files that are to be opened once finished

    data_prep = verb_voice_data_preparation(data)
    voice_list, voice_stats = voice_output(data_prep, data_divided_sents)

    # output file names
    verb_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Verb Voice', 'list')
    verb_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Verb Voice', 'stats')

    errorFound = IO_csv_util.list_to_csv(
        GUI_util.window,
        IO_CoNLL_util.sort_output_list('Verb Voice', voice_list, documentId_position),
        verb_file_name)
    if errorFound == True:
        # Return the (still empty) list rather than None so callers can
        # always treat the result as a list, like on every other path.
        return filesToOpen
    filesToOpen.append(verb_file_name)

    errorFound = IO_csv_util.list_to_csv(GUI_util.window, voice_stats, verb_stats_file_name)
    if errorFound == True:
        return filesToOpen
    filesToOpen.append(verb_stats_file_name)

    if createExcelCharts == True:
        # pie chart of the frequency stats
        Excel_outputFilename = Excel_util.create_excel_chart(
            GUI_util.window,
            data_to_be_plotted=[voice_stats],
            inputFilename=verb_stats_file_name,
            outputDir=outputDir,
            scriptType='Verb_Voice',
            chartTitle="Frequency Distribution of Verb Voice",
            chart_type_list=["pie"],
            column_xAxis_label="Verb voice values",
            column_yAxis_label="Frequency")

        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

        # line plots by sentence index
        outputFiles = Excel_util.compute_csv_column_frequencies(
            GUI_util.window,
            verb_file_name,
            '',
            outputDir,
            openOutputFiles,
            createExcelCharts,
            [[1, 4]],
            ['Verb Voice'],
            ['FORM', 'Sentence'],
            ['Document ID', 'Sentence ID', 'Document'],
            'NVA', 'line')
        if len(outputFiles) > 0:
            filesToOpen.extend(outputFiles)

    return filesToOpen
Esempio n. 3
0
def search(searchWord, searchSize, position, inputKWICfile, within, outFile, ranWithCLAs=False):
    """Search a KWIC table for ``searchWord`` and export co-occurrence counts.

    Rows of ``inputKWICfile`` whose ``word1`` column equals ``searchWord``
    (case-insensitive) are selected. For each such row the count columns on
    the left and/or right of the keyword are summed and the non-zero results
    are written to ``outFile``; bar charts of the left/right tokens are then
    requested.

    Args:
        searchWord: keyword to look up in the ``word1`` column.
        searchSize: window size; selects CSV columns ``12 - searchSize``
            through ``11 + searchSize``.
        position: ``"left"``, ``"right"``, or anything else (handled as
            both sides).
        inputKWICfile: path of the KWIC CSV table.
        within: when 1, every column inside the window is read; otherwise
            only the two boundary columns of the window are read.
        outFile: path of the CSV file that receives the result rows.
        ranWithCLAs: when False, a final "finished" alert is shown.

    Side effects:
        Rebinds the module-level ``leftKWIC`` and ``rightKWIC`` lists.

    NOTE(review): this function reads ``window``, ``filesToOpen`` and
    ``KWIC_search_output_filename`` without defining them; they are assumed
    to be module-level names -- confirm against the rest of the file.
    """
    global leftKWIC, rightKWIC

    # read in word1, word2 and the targeted window-size columns of the csv file
    if within == 1:
        cols = [0, 1] + list(range(12 - searchSize, 12 + searchSize))
    else:
        cols = [0, 1, 12 - searchSize, 11 + searchSize]
    data = pd.read_csv(inputKWICfile, usecols=cols, engine='python')

    # keep only the rows where word1 is the searched word
    target_rows = data[data['word1'].str.lower() == searchWord.lower()]
    rownum, colnum = target_rows.shape

    leftKWIC = []
    rightKWIC = []
    result = []

    # Columns 0 and 1 hold word1/word2; the remaining columns are split in
    # half: positions 2..mid are the left-hand counts, the rest right-hand.
    mid = 1 + (colnum - 2) / 2
    left_stop = int(mid) + 1  # exclusive end of the left-hand columns

    left_header = "Position_left(-" + str(searchSize) + " words)"
    right_header = "Position_right(+" + str(searchSize) + " words"

    # every row refers to a new combination of two words, so the counts
    # start over at zero for each row
    for i in range(rownum):
        row = target_rows.iloc[i]
        word1 = row['word1']
        word2 = row['word2']
        countLeft = 0
        countRight = 0

        if position == "left":
            for j in range(2, left_stop):
                countLeft += row.iloc[j]  # explicit positional access
            if countLeft != 0:
                result.append([word1, word2, countLeft])
                if countLeft > 0:
                    leftKWIC.append([word2, countLeft])
        elif position == "right":
            for j in range(round(mid + 1), colnum):
                countRight += row.iloc[j]
            if countRight != 0:
                result.append([word1, word2, countRight])
                if countRight > 0:
                    rightKWIC.append([word2, countRight])
        else:
            for j in range(2, left_stop):
                countLeft += row.iloc[j]
            for j in range(left_stop, colnum):
                countRight += row.iloc[j]
            if not (countLeft == 0 and countRight == 0):
                result.append([word1, word2, countLeft, countRight])
            if countLeft > 0:
                leftKWIC.append([word2, countLeft])
            if countRight > 0:
                rightKWIC.append([word2, countRight])

    # Write the CSV once, after all rows have been collected. Writing inside
    # the loop rewrote the whole file for every matching row and, for the
    # two-sided search, dropped trailing rows that had left-hand counts only.
    if result:
        if position == "left":
            header = ["Searched Key Word", "Word in Context", left_header]
        elif position == "right":
            header = ["Searched Key Word", "Word in Context", right_header]
        else:
            header = ["Searched Key Word", "Word in Context", left_header, right_header]
        with open(outFile, 'w', encoding="utf-8", errors="ignore", newline='') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(header)
            writer.writerows(result)

    # TODO
    # searchWord should be displayed in quotes in the chart
    # should exclude stopwords (putting a widget on GUI)
    # a chart for searchWord within sentence (sentence token counts) is not implemented yet
    filesToOpen.append(KWIC_search_output_filename)

    if position == "left" or position == "both":
        if len(leftKWIC) > 0:
            KWIC_search_output_filename_stats = KWIC_search_output_filename.strip()[:-4] + "_left_counts.xlsx"
            IO_csv_util.list_to_csv(window, leftKWIC, KWIC_search_output_filename_stats)
            # sort will not work with headers; headers inserted after
            leftKWIC = stats_visuals_util.sort_data(leftKWIC, 1, True)
            leftKWIC.insert(0, ["KWIC (left-hand tokens)", "Counts"])
            Excel_util.create_excel_chart(window, "bar", "Left-hand tokens for " + searchWord, [leftKWIC], KWIC_search_output_filename_stats, 20)
            filesToOpen.append(KWIC_search_output_filename_stats)
        else:
            IO_user_interface_util.timed_alert(window, 3000, 'Searching KWIC Table', 'There are no left-hand words for the searched keyword: ' + searchWord)
    if position == "right" or position == "both":
        if len(rightKWIC) > 0:
            KWIC_search_output_filename_stats = KWIC_search_output_filename.strip()[:-4] + "_right_counts.xlsx"
            IO_csv_util.list_to_csv(window, rightKWIC, KWIC_search_output_filename_stats)
            # sort will not work with headers; headers inserted after
            rightKWIC = stats_visuals_util.sort_data(rightKWIC, 1, True)
            rightKWIC.insert(0, ["KWIC (right-hand tokens)", "Counts"])
            Excel_util.create_excel_chart(window, "bar", "Right-hand tokens for " + searchWord, [rightKWIC], KWIC_search_output_filename_stats, 20)
            filesToOpen.append(KWIC_search_output_filename_stats)
        else:
            IO_user_interface_util.timed_alert(window, 3000, 'Searching KWIC Table', 'There are no right-hand words for the searched keyword: ' + searchWord)

    if ranWithCLAs == False:
        IO_user_interface_util.timed_alert(window, 3000, 'Searching KWIC Table', 'Finished running KWIC at', True)
Esempio n. 4
0
def run(inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts,
        doNotListIndividualFiles):
    """Run the nominalization analysis on one file or on a directory of txt files.

    Each input document is passed to ``nominalized_verb_detection``. The
    results are exported as CSV files (sentence-level table, TRUE/FALSE word
    table, noun frequencies, nominalized-verb frequencies) and, when
    ``createExcelCharts`` is True, as Excel charts. Output is produced per
    document unless a directory is processed with
    ``doNotListIndividualFiles`` True, in which case directory-level files
    are written after all documents have been processed.

    Args:
        inputFilename: single input file, used when ``inputDir`` is not an
            existing directory.
        inputDir: directory whose ``.txt`` files are processed (files whose
            name starts with ``~$`` are skipped).
        outputDir: directory that receives the output files.
        openOutputFiles: when equal to 1, the produced files are opened at
            the end via ``IO_files_util.OpenOutputFiles``.
        createExcelCharts: when True, Excel charts are produced.
        doNotListIndividualFiles: when True (with a directory input), only
            directory-level output is written.

    Side effects:
        Rebinds the module-level ``first_section``, ``noun_cnt`` and
        ``nominalized_cnt``.

    NOTE(review): the analysis itself only runs when this module is executed
    as ``__main__``; when imported, only the final alert / file opening
    happens. This is kept as found -- confirm it is intended.
    """
    global first_section, noun_cnt, nominalized_cnt

    # Raw string: "\." is an invalid escape sequence in a plain string literal.
    first_section = re.compile(r"^(.+?)\.")
    noun_cnt = Counter()
    nominalized_cnt = Counter()
    filesToOpen = []  # Store all files that are to be opened once finished

    sentence_header = [
        'Document ID', 'Document', 'Sentence ID', 'Sentence',
        'Number of words in sentence', 'Nominalized verbs',
        'Number of nominalizations in sentence',
        'Percentage of nominalizations in sentence'
    ]

    if __name__ == '__main__':
        nltk.data.path.append('./nltk_data')

        inputDocs = []
        if os.path.isdir(inputDir):
            for f in os.listdir(inputDir):
                if f[:2] != '~$' and f[-4:] == '.txt':
                    inputDocs.append(os.path.join(inputDir, f))
            if len(inputDocs) == 0:
                print(
                    "There are no txt files in the input path. The program will exit."
                )
                mb.showwarning(
                    title='No txt files found',
                    message=
                    'There are no txt files in the selected input directory.\n\nPlease, select a different input directory and try again.'
                )
                return
        else:
            inputDocs = [inputFilename]

        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'Analysis start',
            'Started running Nominalization at', True)

        # result_dir accumulates the TRUE/FALSE rows of all documents
        result_dir = [["Word", "Is nominalized", "Document"]]
        # result2 accumulates the sentence-level rows of all documents
        result2 = []

        for docID, doc in enumerate(inputDocs, start=1):
            print("Processing document", doc, "\n")
            # read the doc and create the list of result (words, T/F);
            # the context manager closes the file even if reading fails
            with open(doc, 'r', encoding='utf-8', errors='ignore') as fin:
                text = fin.read()
            # result1 contains the sentence and nominalized values for a specific document
            result, result1 = nominalized_verb_detection(docID, doc, text)
            # result2 contains the sentence and nominalized values for all documents
            result2.extend(result1)

            # list all verbs as TRUE/FALSE if nominalized
            for word, boolean in result:
                result_dir.append([
                    word, boolean,
                    IO_csv_util.dressFilenameForCSVHyperlink(doc)
                ])

            if len(inputDir) > 0:
                fname = os.path.basename(os.path.normpath(inputDir)) + "_dir"
            else:
                fname = doc
            # used for both individual files and directories
            output_filename_bySentenceIndex = IO_files_util.generate_output_file_name(
                fname, '', outputDir, '.csv', 'NOM', 'sent', '', '', '', False,
                True)

            if len(inputDir) == 0 or doNotListIndividualFiles == False:
                result1.insert(0, list(sentence_header))

                # frequency of most common nominalized verbs, header first
                counter_nominalized_list = [['Nominalized verb', 'Frequency']]
                for word, freq in nominalized_cnt.most_common():
                    counter_nominalized_list.append([word, freq])

                # frequency of most common nouns, header first
                counter_noun_list = [['Noun', 'Frequency']]
                for word, freq in noun_cnt.most_common():
                    counter_noun_list.append([word, freq])

                head, fname = os.path.split(doc)
                fname = fname[:-4]

                output_filename_noun_frequencies = IO_files_util.generate_output_file_name(
                    fname, '', outputDir, '.csv', 'NOM', 'noun_freq', '', '',
                    '', False, True)
                filesToOpen.append(output_filename_noun_frequencies)
                output_filename_nominalized_frequencies = IO_files_util.generate_output_file_name(
                    fname, '', outputDir, '.csv', 'NOM', 'nominal_freq', '',
                    '', '', False, True)
                filesToOpen.append(output_filename_nominalized_frequencies)

                # export nominalized verbs
                list_to_csv(output_filename_nominalized_frequencies,
                            counter_nominalized_list)

                # export nouns
                list_to_csv(output_filename_noun_frequencies,
                            counter_noun_list)

                output_filename_TRUE_FALSE = IO_files_util.generate_output_file_name(
                    fname + '_TRUE_FALSE', '', outputDir, '.csv', 'NOM', '',
                    '', '', '', False, True)

                filesToOpen.append(output_filename_TRUE_FALSE)
                list_to_csv(output_filename_TRUE_FALSE, result)

                filesToOpen.append(output_filename_bySentenceIndex)
                list_to_csv(output_filename_bySentenceIndex, result1)

                if createExcelCharts == True:
                    # line chart
                    columns_to_be_plotted = [[2, 6]]
                    chartTitle = 'Nominalized verbs (by Sentence Index)'
                    xAxis = 'Sentence index'
                    yAxis = 'Number of nominalizations in sentence'
                    hover_label = ''
                    Excel_outputFilename = Excel_util.run_all(
                        columns_to_be_plotted,
                        output_filename_bySentenceIndex,
                        outputDir,
                        '',
                        chart_type_list=["line"],
                        chart_title=chartTitle,
                        column_xAxis_label_var=xAxis,
                        hover_info_column_list=hover_label,
                        column_yAxis_label_var=yAxis)
                    if len(Excel_outputFilename) > 0:
                        filesToOpen.append(Excel_outputFilename)

                    # pie chart of nominalized verbs
                    Excel_outputFilename = Excel_util.create_excel_chart(
                        GUI_util.window, [counter_nominalized_list], fname,
                        outputDir, 'NOM_Verb', "Nominalized verbs", ["pie"])
                    if len(Excel_outputFilename) > 0:
                        filesToOpen.append(Excel_outputFilename)

                    # pie chart of nouns
                    Excel_outputFilename = Excel_util.create_excel_chart(
                        GUI_util.window, [counter_noun_list], fname, outputDir,
                        'NOM_noun', "Nouns", ["pie"])
                    if len(Excel_outputFilename) > 0:
                        filesToOpen.append(Excel_outputFilename)

        if len(inputDir) > 0 and doNotListIndividualFiles == True:
            # fname still holds the "<inputDir basename>_dir" value set in the loop
            output_filename_TRUE_FALSE_dir = IO_files_util.generate_output_file_name(
                fname + '_TRUE_FALSE', '', outputDir, '.csv', 'NOM', '', '',
                '', '', False, True)
            filesToOpen.append(output_filename_TRUE_FALSE_dir)
            output_filename_dir_noun_frequencies = IO_files_util.generate_output_file_name(
                fname, '', outputDir, '.csv', 'NOM', 'noun_freq', '', '', '',
                False, True)
            filesToOpen.append(output_filename_dir_noun_frequencies)
            output_filename_dir_nominalized_frequencies = IO_files_util.generate_output_file_name(
                fname, '', outputDir, '.csv', 'NOM', 'nominal_freq', '', '',
                '', False, True)
            filesToOpen.append(output_filename_dir_nominalized_frequencies)

            result2.insert(0, list(sentence_header))
            list_to_csv(output_filename_bySentenceIndex, result2)

            # list all verbs as TRUE/FALSE if nominalized; result_dir already
            # holds one header plus the rows of every document exactly once
            list_to_csv(output_filename_TRUE_FALSE_dir, result_dir)

            counter_noun_list = [['Noun', 'Frequency']]
            for word, freq in noun_cnt.most_common():
                counter_noun_list.append([word, freq])
            list_to_csv(output_filename_dir_noun_frequencies,
                        counter_noun_list)

            counter_nominalized_list = [['Nominalized verb', 'Frequency']]
            for word, freq in nominalized_cnt.most_common():
                counter_nominalized_list.append([word, freq])
            list_to_csv(output_filename_dir_nominalized_frequencies,
                        counter_nominalized_list)

            if createExcelCharts == True:
                # pie chart of nominalized verbs
                # (script type and chart title are two separate arguments)
                Excel_outputFilename = Excel_util.create_excel_chart(
                    GUI_util.window, [counter_nominalized_list],
                    output_filename_dir_nominalized_frequencies, outputDir,
                    'NOM_verb', "Nominalized verbs", ["pie"])
                if len(Excel_outputFilename) > 0:
                    filesToOpen.append(Excel_outputFilename)

                # pie chart of nouns
                Excel_outputFilename = Excel_util.create_excel_chart(
                    GUI_util.window, [counter_noun_list],
                    output_filename_dir_noun_frequencies, outputDir,
                    'NOM_noun', "Nouns", ["pie"])
                if len(Excel_outputFilename) > 0:
                    filesToOpen.append(Excel_outputFilename)

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                       'Finished running Nominalization at',
                                       True)

    if openOutputFiles == 1:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
def noun_stats(inputFilename, outputDir, data, data_divided_sents,
               openOutputFiles, createExcelCharts):
    """Export noun POS-tag, DEPREL and NER records and frequencies, optionally with charts.

    Six CSV files are written in ``outputDir``: three record lists (each
    passed through ``IO_CoNLL_util.sort_output_list``) and three frequency
    tables, all computed by ``noun_POSTAG_DEPREL_compute_frequencies``. When
    ``createExcelCharts`` is True, a pie chart per frequency table and a line
    plot per record list are requested as well.

    Returns:
        list: the output files produced. If a CSV export reports an error the
        function returns immediately with the files written up to that point.
    """
    filesToOpen = []  # Store all files that are to be opened once finished

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start',
                                       'Started running NOUN ANALYSES at',
                                       True)  # TODO: fix

    # NOTE(review): none of these six values is used below; the call is kept
    # in case compute_stats has side effects -- confirm and remove if not.
    postag_list, postag_counter, deprel_list, deprel_counter, ner_list, ner_counter = compute_stats(
        data)

    noun_postag, noun_deprel, noun_ner, \
    noun_postag_stats, noun_deprel_stats, noun_ner_stats = noun_POSTAG_DEPREL_compute_frequencies(data,
                                                                                                  data_divided_sents)
    # output file names
    noun_postag_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'POSTAG_list')
    noun_deprel_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'DEPREL_list')
    noun_ner_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'NER_list')
    noun_postag_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'POSTAG_stats')
    noun_deprel_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'DEPREL_stats')
    noun_ner_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, '', outputDir, '.csv', 'NVA', 'Noun', 'NER_stats')

    # (column label, records, output file) for the three record lists
    record_lists = [
        ('Noun POS Tags', noun_postag, noun_postag_file_name),
        ('Noun DEPREL Tags', noun_deprel, noun_deprel_file_name),
        ('Noun NER Tags', noun_ner, noun_ner_file_name),
    ]
    # (frequency table, output file, chart script type, chart title)
    frequency_tables = [
        (noun_postag_stats, noun_postag_stats_file_name,
         'Nouns_POS', "Noun POS Analysis"),
        (noun_deprel_stats, noun_deprel_stats_file_name,
         'Nouns_DEPREL', "Noun DEPREL Analysis"),
        # the NER chart gets its own script type instead of reusing the
        # DEPREL one
        (noun_ner_stats, noun_ner_stats_file_name,
         'Nouns_NER', "Nouns (NER Tags)"),
    ]

    # save csv files -------------------------------------------------------------------------------------------------

    for label, records, list_file_name in record_lists:
        errorFound = IO_csv_util.list_to_csv(
            GUI_util.window,
            IO_CoNLL_util.sort_output_list(label, records,
                                           documentId_position),
            list_file_name)
        if errorFound == True:
            return filesToOpen
        filesToOpen.append(list_file_name)

    # save csv frequency files ----------------------------------------------------------------------------------------

    for stats, stats_file_name, _script_type, _title in frequency_tables:
        errorFound = IO_csv_util.list_to_csv(GUI_util.window, stats,
                                             stats_file_name)
        if errorFound == True:
            return filesToOpen
        filesToOpen.append(stats_file_name)

    if createExcelCharts == True:

        # pie charts -----------------------------------------------------------------------------------------------

        for stats, stats_file_name, script_type, title in frequency_tables:
            Excel_outputFilename = Excel_util.create_excel_chart(
                GUI_util.window,
                data_to_be_plotted=[stats],
                inputFilename=stats_file_name,
                outputDir=outputDir,
                scriptType=script_type,
                chartTitle=title,
                chart_type_list=["pie"])

            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)

        # line plots by sentence index -----------------------------------------------------------------------------------------------

        for label, _records, list_file_name in record_lists:
            # Two separate column lists are passed after the label list, the
            # same way for all three plots, so that 'NVA' and 'line' land in
            # the intended trailing parameters.
            outputFiles = Excel_util.compute_csv_column_frequencies(
                GUI_util.window, list_file_name, '', outputDir,
                openOutputFiles, createExcelCharts, [[1, 4]], [label],
                ['FORM', 'Sentence'], ['Document ID', 'Sentence ID', 'Document'],
                'NVA', 'line')
            if len(outputFiles) > 0:
                filesToOpen.extend(outputFiles)

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                       'Finished running NOUN ANALYSES at',
                                       True)

    return filesToOpen
Esempio n. 6
0
def clause_stats(inputFilename, inputDir, outputDir, data, data_divided_sents,
                 openOutputFiles, createExcelCharts):
    """Export clause-tag records, their frequency table and optional charts.

    Writes two csv files (the sorted list produced by ``stats_clauses_output``
    and the frequency statistics computed from it) and, when
    ``createExcelCharts`` is set, a pie chart plus two line charts.

    Args:
        inputFilename: input file name, used to build the output file names.
        inputDir: input directory, used to build the output file names.
        outputDir: directory where every output file is written; must exist.
        data: CoNLL records, passed through to ``stats_clauses_output``.
        data_divided_sents: the same records split by sentence, passed
            through to ``stats_clauses_output``.
        openOutputFiles: forwarded to ``compute_csv_column_frequencies``.
        createExcelCharts: when true, the Excel charts are produced.

    Returns:
        list: the output files to be opened by the caller. The list is also
        returned (possibly empty) when the output directory is invalid or a
        csv file cannot be written, so callers can always extend from it.
    """
    filesToOpen = []  # Store all files that are to be opened once finished

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start',
                                       'Started running CLAUSE ANALYSES at',
                                       True)

    # clausal_analysis_file_name receives the records returned by
    # stats_clauses_output, sorted by sort_output_list
    clausal_analysis_file_name = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', 'CA', 'Clause tags',
        'list')

    if not os.path.isdir(outputDir):
        mb.showwarning(
            title='Output file path error',
            message='Please check OUTPUT DIRECTORY PATH and try again')
        return filesToOpen

    clausal_list = stats_clauses_output(data, data_divided_sents)

    errorFound = IO_csv_util.list_to_csv(
        GUI_util.window,
        IO_CoNLL_util.sort_output_list('CLAUSE TAGS', clausal_list,
                                       documentId_position),
        clausal_analysis_file_name)
    if errorFound:
        return filesToOpen
    # only queue the list file once it has actually been written
    filesToOpen.append(clausal_analysis_file_name)

    # frequency distribution of the clause tags
    # NOTE(review): 7 is presumably the clause-tag column index - confirm
    column_stats = statistics_csv_util.compute_stats_CoreNLP_tag(
        clausal_list, 7, "Clause Tags, Frequency", "CLAUSALTAG")

    clausal_analysis_stats_file_name = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', 'CA', 'Clause tags',
        'stats')
    errorFound = IO_csv_util.list_to_csv(GUI_util.window, column_stats,
                                         clausal_analysis_stats_file_name)
    if errorFound:
        return filesToOpen

    if createExcelCharts:
        # pie chart of the tag frequencies
        Excel_outputFilename = Excel_util.create_excel_chart(
            GUI_util.window,
            data_to_be_plotted=[column_stats],
            inputFilename=clausal_analysis_stats_file_name,
            outputDir=outputDir,
            scriptType='CoNLL_Clause',
            chartTitle="Frequency Distribution of Clause Type",
            chart_type_list=["pie"],
            column_xAxis_label="Clause Tags",
            column_yAxis_label="Frequency")
        if Excel_outputFilename:
            filesToOpen.append(Excel_outputFilename)

        # line plot by sentence index; this helper returns a list of files
        outputFiles = Excel_util.compute_csv_column_frequencies(
            GUI_util.window, clausal_analysis_file_name, '', outputDir,
            openOutputFiles, createExcelCharts, [[8, 8]], ['CLAUSE TAGS'],
            ['FORM', 'Sentence'], ['Document ID', 'Sentence ID'], 'CA',
            'line')
        if outputFiles:
            filesToOpen.extend(outputFiles)

        # second line chart, built directly from the list csv file
        columns_to_be_plotted = [[1, 8]]
        hover_label = ['CLAUSAL TAG-DESCRIPTION']
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted,
            clausal_analysis_file_name,
            outputDir,
            outputFileLabel='CoNLL_Clause',
            chart_type_list=["line"],
            chart_title='Frequency of Clause Tags',
            column_xAxis_label_var='Sentence index',
            hover_info_column_list=hover_label,
            count_var=1)
        if Excel_outputFilename:
            filesToOpen.append(Excel_outputFilename)

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                       'Finished running CLAUSE ANALYSES at',
                                       True)
    return filesToOpen
Esempio n. 7
0
def _load_dictionary_lookup(input_dictionary_file, has_second_value):
    """Map each dictionary term (trailing whitespace stripped) to its second-column value (or None)."""
    dic = pd.read_csv(input_dictionary_file)
    terms = dic.iloc[:, 0].tolist()
    if has_second_value:
        second_values = dic.iloc[:, 1].tolist()
    else:
        second_values = [None] * len(terms)
    lookup = {}
    for term, second_value in zip(terms, second_values):
        if not isinstance(term, str):
            # non-text cells (e.g. NaN for a blank cell) can never equal a token
            continue
        # setdefault keeps the first entry for a term, like a first-match scan
        lookup.setdefault(term.rstrip(), second_value)
    return lookup


def _dictionary_matches_in_file(path, documentID, lookup, has_second_value):
    """Return (rows, sentence_count): one row per token of the file found in the dictionary."""
    # the context manager closes the file handle as soon as the text is read
    with open(path, "r", encoding="utf-8", errors='ignore') as text_file:
        text = text_file.read()
    rows = []
    sentence_count = 0
    for sentence_count, each_sentence in enumerate(tokenize.sent_tokenize(text), start=1):
        for word in nltk.word_tokenize(each_sentence):
            if word not in lookup:
                continue
            row = [word]
            if has_second_value:
                row.append(lookup[word])
            row.extend([sentence_count, each_sentence, documentID, path])
            rows.append(row)
    return rows, sentence_count


def dictionary_items_bySentenceID(window,inputFilename,inputDir, outputDir,createExcelCharts,openOutputFiles=True,input_dictionary_file='',chartTitle=''):
    """Find dictionary terms in the input text file(s) and export their frequencies.

    Every ``.txt`` file is split into sentences and tokens; each token equal
    to a term in the first column of the dictionary csv file is recorded.
    The csv output has one row per distinct term: the term, its second
    dictionary value (only when the dictionary has exactly two columns), its
    frequency over all processed files, and the sentence ID, sentence,
    document ID and document of its first occurrence.

    Args:
        window: GUI window handed to ``IO_csv_util.list_to_csv``.
        inputFilename: single input file, passed to ``getFileList``.
        inputDir: input directory, passed to ``getFileList``.
        outputDir: directory where the output files are written.
        createExcelCharts: when true, a bar chart file is also produced.
        openOutputFiles: when True, the output files are opened at the end.
        input_dictionary_file: dictionary csv file; when empty the user is
            asked to select one through a file dialog.
        chartTitle: chart title; defaults to "Dictionary value" when empty.

    Returns:
        None. Output is delivered through the files written to ``outputDir``.
    """
    filesToOpen=[]
    file_list = IO_files_util.getFileList(inputFilename, inputDir, '.txt')
    nFile=len(file_list)
    if nFile==0:
        return
    # when running the function w/o a GUI, as currently is mostly the case,
    #   we would not be able to pass a dictionary file to the function
    if input_dictionary_file=='':
        initialFolder = os.path.dirname(os.path.abspath(__file__))
        input_dictionary_file = tk.filedialog.askopenfilename(title = "Select dictionary csv file", initialdir = initialFolder, filetypes = [("csv files", "*.csv")])
        if len(input_dictionary_file)==0:
            return

    # a two-column dictionary contributes its second value to every output row
    has_second_value = IO_csv_util.get_csvfile_numberofColumns(input_dictionary_file) == 2
    lookup = _load_dictionary_lookup(input_dictionary_file, has_second_value)
    if chartTitle=='':
        chartTitle="Dictionary value"

    container = []  # every match found, across all files, in reading order
    Sentence_ID = 0
    for documentID, file in enumerate(file_list, start=1):
        head, tail = os.path.split(file)
        print("Processing file ", str(documentID),"\\",str(nFile),tail)
        rows, Sentence_ID = _dictionary_matches_in_file(file, documentID, lookup, has_second_value)
        container.extend(rows)

    # Frequencies and the header are computed once, after all files are read,
    #   so rows never receive more than one frequency value and the output
    #   has a single header row regardless of the number of input files
    frequencies = collections.Counter(row[0] for row in container)
    if has_second_value:
        header = ['Dict_value','Dict_second_value', 'Frequency', 'Sentence ID','Sentence','Document ID','Document']
        frequency_position = 2
    else:
        header = ['Dict_value', 'Frequency', 'Sentence ID', 'Sentence',
                  'Document ID', 'Document']
        frequency_position = 1

    DictionaryList = [header]
    seen_words = set()
    for row in container:
        if row[0] in seen_words:
            continue  # keep only the first occurrence of each term
        seen_words.add(row[0])
        DictionaryList.append(row[:frequency_position] + [frequencies[row[0]]] + row[frequency_position:])

    # Output is written for both dictionary layouts.
    # NOTE(review): the file names are built from the LAST processed file and
    #   its sentence count, as in the original code - confirm this is intended
    outputFilename=IO_files_util.generate_output_file_name(file, '', outputDir, '.csv', str(Sentence_ID) + '-Dict_value', 'stats', '', '', '', False, True)
    filesToOpen.append(outputFilename)
    IO_csv_util.list_to_csv(window,DictionaryList,outputFilename)

    if createExcelCharts:
        outputFilename=IO_files_util.generate_output_file_name(file, '', outputDir, '.xlsx', str(Sentence_ID) + '-Dict_value', 'chart', '', '', '', False, True)
        filesToOpen.append(outputFilename)
        # NOTE(review): positional arguments kept exactly as in the original;
        #   verify them against the current create_excel_chart signature
        Excel_util.create_excel_chart(GUI_util.window,[DictionaryList],outputFilename,chartTitle,["bar"])

    if openOutputFiles==True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
        
Esempio n. 8
0
# Written by Yuhang Feng November 2019