def createChart(inputFilename,outputDir,columns_to_be_plotted,hover_label):
    Excel_outputFileName = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir,
                                              outputFileLabel='Leven_spell',
                                              chart_type_list=["pie"],
                                              chart_title='Frequency of Potential Typos',
                                              column_xAxis_label_var='',
                                              hover_info_column_list=hover_label,
                                              count_var=1)
    return Excel_outputFileName
Esempio n. 2
0
def process_words(window,
                  inputFilename,
                  inputDir,
                  outputDir,
                  openOutputFiles,
                  createExcelCharts,
                  processType='',
                  excludeStopWords=True,
                  word_length=3):
    filesToOpen = []
    index = 0
    multiple_punctuation = 0
    exclamation_punctuation = 0
    question_punctuation = 0
    punctuation_docs = []

    inputDocs = IO_files_util.getFileList(inputFilename,
                                          inputDir,
                                          fileType='.txt')

    Ndocs = str(len(inputDocs))
    word_list = []
    for doc in inputDocs:
        head, tail = os.path.split(doc)
        index = index + 1
        print("Processing file " + str(index) + "/" + str(Ndocs) + " " + tail)
        fullText = (open(doc, "r", encoding="utf-8", errors="ignore").read())
        # words = fullText.translate(string.punctuation).lower().split()
        fullText = fullText.replace('\n', ' ')
        words = fullText.translate(string.punctuation).split()
        if excludeStopWords:
            words = excludeStopWords_list(words)
        if processType != '':
            hideMessage = False
        else:
            hideMessage = True
        if Ndocs == 1:
            hideMessage = False
        else:
            hideMessage = True
        if processType == '' or "short" in processType.lower():
            header = 'Short words (<4 chars)'
            fileLabel = 'short_words'
            # exclude numbers from list
            word_list = [
                word for word in words
                if word and len(word) <= int(word_length) and word.isalpha()
            ]
            filesToOpen = print_results(window, words, word_list, header,
                                        inputFilename, outputDir,
                                        excludeStopWords, fileLabel,
                                        hideMessage, filesToOpen)
            # filesToOpen.append(outputFilename)
        if processType == '' or "capital" in processType.lower():
            header = 'Initial-capital words'
            fileLabel = 'init_cap_words'
            word_list = [word for word in words if word and word[0].isupper()]
            filesToOpen = print_results(window, words, word_list, header,
                                        inputFilename, outputDir,
                                        excludeStopWords, fileLabel,
                                        hideMessage, filesToOpen)
            # if outputFilename!='':
            #     filesToOpen.append(outputFilename)
        if processType == '' or "vowel" in processType.lower():
            header = 'Vowel words'
            fileLabel = 'vowel_words'
            word_list = [
                word for word in words
                if word and word[0] in "aeiou" and word.isalpha()
            ]
            filesToOpen = print_results(window, words, word_list, header,
                                        inputFilename, outputDir,
                                        excludeStopWords, fileLabel,
                                        hideMessage, filesToOpen)
            # if outputFilename!='':
            #     filesToOpen.append(outputFilename)
        if processType == '' or "punctuation" in processType.lower():
            header = [
                'Word', 'Punctuation symbols of pathos (?!)', 'Document ID',
                'Document'
            ]
            fileLabel = 'punctuation'
            for word in words:
                punctuation = ''
                character_index = 0
                for i in word:
                    if '!' in i or '?' in i:
                        punctuation = word[character_index:len(word)]
                        continue
                    character_index = character_index + 1
                if punctuation != '':
                    if doc not in punctuation_docs:
                        punctuation_docs.append(doc)
                    word_list.extend([[
                        word, punctuation, index,
                        IO_csv_util.dressFilenameForCSVHyperlink(doc)
                    ]])
                    if '!' in punctuation and '?' in punctuation:
                        multiple_punctuation = multiple_punctuation + 1
                    elif '!' in punctuation:
                        exclamation_punctuation = exclamation_punctuation + 1
                    elif '?' in punctuation:
                        question_punctuation = question_punctuation + 1

    mb.showinfo(title='Results', message="Combinations of ! and ? punctuation symbols were used " + str(multiple_punctuation) + \
                        " times.\n\n! punctuation symbols were used " + str(exclamation_punctuation) + \
                        " times.\n\n? punctuation symbols were used " + str(question_punctuation) + \
                        " times.\n\n\nPunctuation symbols of pathos (!?) were used in " + str(len(punctuation_docs)) + " separate documents out of " + str(Ndocs) + " documents.\n\nCHECK COMMAND LINE FOR A COPY OF THESE RESULTS.")

    print("\nCombinations of ! and ? punctuation symbols were used " + str(multiple_punctuation) + \
                        " times.\n\n! punctuation symbols were used " + str(exclamation_punctuation) + \
                        " times.\n\n? punctuation symbols were used " + str(question_punctuation) + \
                        " times.\n\nPunctuation symbols of pathos (!?) were used in " + str(len(punctuation_docs)) + " separate documents out of " + str(Ndocs) + " documents.")

    outputFilename = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', fileLabel)
    word_list.insert(0, header)
    IO_error = IO_csv_util.list_to_csv(window, word_list, outputFilename)

    if createExcelCharts == True:
        columns_to_be_plotted = [[1, 1]]
        hover_label = []
        inputFilename = outputFilename
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted,
            inputFilename,
            outputDir,
            outputFileLabel='punct_stats',
            chart_type_list=["bar"],
            # chart_title='Corpus statistics\nCorpus directory: '+inputDir,
            chart_title='Frequency of Punctuation Symbols of Pathos (?!)',
            column_xAxis_label_var='Punctuation symbols of pathos (?!)',
            hover_info_column_list=hover_label,
            count_var=True)
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

        # should also provide a bar chart of the frequency of distinct documents by punctuation symbol
        columns_to_be_plotted = [[2, 2]]
        hover_label = []
        inputFilename = outputFilename
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted,
            inputFilename,
            outputDir,
            outputFileLabel='punct_doc_stats',
            chart_type_list=["bar"],
            # chart_title='Corpus statistics\nCorpus directory: '+inputDir,
            chart_title='Frequency of ' + str(Ndocs) +
            ' Documents with Punctuation Symbols of Pathos (?!)',
            column_xAxis_label_var='Punctuation symbols of pathos (?!)',
            hover_info_column_list=hover_label,
            count_var=True)
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

    if not IO_error:
        filesToOpen.append(outputFilename)
    return filesToOpen
Esempio n. 3
0
def compute_character_word_ngrams(window,
                                  inputFilename,
                                  inputDir,
                                  outputDir,
                                  ngramsNumber=4,
                                  normalize=False,
                                  excludePunctuation=False,
                                  wordgram=None,
                                  openOutputFiles=True,
                                  createExcelCharts=True,
                                  bySentenceID=None):
    filesToOpen = []
    container = []

    if inputFilename == '' and inputDir == '':
        mb.showwarning(
            title='Input error',
            message=
            'No input file or input directory have been specified.\n\nThe function will exit.\n\nPlease, enter the required input options and try again.'
        )
        return
    files = IO_files_util.getFileList(inputFilename, inputDir, '.txt')
    nFile = len(files)
    if nFile == 0:
        return

    if wordgram == None:
        result = mb.askyesno(
            "Word/character N-grams",
            "Would you like to compute\n  WORD n-grams (Yes) or\n  CHARACTER n-grams (No)?"
        )
        if result == True:
            wordgram = 1
        else:
            wordgram = 0

    if wordgram == 1:
        fn = "Wd"
        chartTitle = "Word "
    else:
        fn = "Ch"
        chartTitle = "Character "

    if bySentenceID == None:
        result = mb.askyesno(
            "By sentence index",
            "Would you like to compute n-grams by sentence index?")
        if result == True:
            bySentenceID = 1
        else:
            bySentenceID = 0

    i = 0
    for file in files:
        head, tail = os.path.split(file)
        i = i + 1
        print("Processing file " + str(i) + "/" + str(nFile) + ' ' + tail)
        ngramsList = get_ngramlist(file,
                                   ngramsNumber,
                                   wordgram,
                                   excludePunctuation,
                                   bySentenceID,
                                   isdir=True)
        container.append(ngramsList)

    for index, f in enumerate(container):

        for n in f:
            for skip, gram in enumerate(n):
                if skip == 0:
                    gram.insert(-1, 'Document ID')
                    continue
                else:
                    gram.insert(-1, index + 1)
    one_gram = []
    for index, f in enumerate(container):
        if index == 0:
            one_gram += (f[0])
        else:
            one_gram += (f[0][1:])
        generalList = [one_gram]
    if ngramsNumber > 1:
        two_gram = []
        for index, f in enumerate(container):
            if index == 0:
                two_gram += (f[1])
            else:
                two_gram += (f[1][1:])
        generalList = [one_gram, two_gram]
    if ngramsNumber > 2:
        three_gram = []
        for index, f in enumerate(container):
            if index == 0:
                three_gram += (f[2])
            else:
                three_gram += (f[2][1:])
        generalList = [one_gram, two_gram, three_gram]
    if ngramsNumber > 3:
        four_gram = []
        for index, f in enumerate(container):
            if index == 0:
                four_gram += (f[3])
            else:
                four_gram += (f[3][1:])
        generalList = [one_gram, two_gram, three_gram, four_gram]

    result = True
    # n-grams
    # if createExcelCharts==True:
    #     if nFile>10:
    #         result = mb.askyesno("Excel charts","You have " + str(nFile) + " files for which to compute Excel charts.\n\nTHIS WILL TAKE A LONG TIME.\n\nAre you sure you want to do that?",default='no')
    for index, ngramsList in enumerate(generalList):
        if nFile > 1:
            csv_outputFilename = IO_files_util.generate_output_file_name(
                inputFilename, inputDir, outputDir, '.csv',
                'n-grams_' + str(index + 1) + '_' + fn, 'stats', '', '', '',
                False, True)
        else:
            csv_outputFilename = IO_files_util.generate_output_file_name(
                inputFilename, inputDir, outputDir, '.csv',
                'n-grams_' + str(index + 1) + '_' + fn, 'stats')

        filesToOpen.append(csv_outputFilename)
        IO_csv_util.list_to_csv(window, ngramsList, csv_outputFilename)

        # n-grams
        if createExcelCharts == True and result == True:
            inputFilename = csv_outputFilename
            if bySentenceID == True:
                columns_to_be_plotted = [[2, 2]]
                hover_label = [str(index + 1) + '-grams']
                Excel_outputFilename = Excel_util.run_all(
                    columns_to_be_plotted,
                    inputFilename,
                    outputDir,
                    outputFileLabel='n-grams_' + str(index + 1) + '_' + fn,
                    chart_type_list=["line"],
                    chart_title=chartTitle + str(index + 1) + '-grams',
                    column_xAxis_label_var='Sentence Index',
                    hover_info_column_list=hover_label)
                if Excel_outputFilename != "":
                    filesToOpen.append(Excel_outputFilename)
            else:
                columns_to_be_plotted = [[0, 2]]  # 0,1
                hover_label = [str(index + 1) + '-grams']  # change to sentence
                Excel_outputFilename = Excel_util.run_all(
                    columns_to_be_plotted,
                    inputFilename,
                    outputDir,
                    outputFileLabel='n-grams_' + str(index + 1) + '_' + fn,
                    chart_type_list=["bar"],
                    chart_title=chartTitle + str(index + 1) + '-grams',
                    column_xAxis_label_var='',
                    hover_info_column_list=hover_label)
                if Excel_outputFilename != "":
                    filesToOpen.append(Excel_outputFilename)

                # excel_outputFilename = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir, csv_outputFilename,
                # chart_type_list=["bar"], chart_title=chartTitle + str(index+1) + '-grams', column_xAxis_label_var='', column_yAxis_label_var='Frequency', outputExtension = '.xlsm', label1='n-grams_'+str(index+1)+'_'+fn,label2='bar',label3='chart',label4='',label5='', useTime=False,disable_suffix=True,  count_var=0, column_yAxis_field_list = [], reverse_column_position_for_series_label=False , series_label_list=[str(index+1)+'-grams'], second_y_var=0, second_yAxis_label='', hover_info_column_list=hover_label)
                # if excel_outputFilename != "":
                #     filesToOpen.append(excel_outputFilename)

    if len(inputDir) != 0:
        mb.showwarning(
            title='Warning',
            message=
            'The output filename generated by N-grams is the name of the directory processed in input, rather than any individual file in the directory.\n\nThe output csv file includes all '
            + str(nFile) +
            ' files in the input directory processed by N-grams.')

    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
Esempio n. 4
0
def compute_corpus_statistics(window,
                              inputFilename,
                              inputDir,
                              outputDir,
                              openOutputFiles,
                              createExcelCharts,
                              excludeStopWords=True,
                              lemmatizeWords=True):
    filesToOpen = []
    outputFilenameCSV = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', 'corpus', 'stats')
    filesToOpen.append(outputFilenameCSV)
    inputDocs = IO_files_util.getFileList(inputFilename,
                                          inputDir,
                                          fileType='.txt')

    # read_line(inputFilename, inputDir, outputDir)
    # return

    Ndocs = str(len(inputDocs))
    fieldnames = [
        'Number of documents in corpus', 'Document ID', 'Document',
        'Number of Sentences in Document', 'Number of Words in Document',
        'Number of Syllables in Document', 'Word1', 'Frequency1', 'Word2',
        'Frequency2', 'Word3', 'Frequency3', 'Word4', 'Frequency4', 'Word5',
        'Frequency5', 'Word6', 'Frequency6', 'Word7', 'Frequency7', 'Word8',
        'Frequency8', 'Word9', 'Frequency9', 'Word10', 'Frequency10', 'Word11',
        'Frequency11', 'Word12', 'Frequency12', 'Word13', 'Frequency13',
        'Word14', 'Frequency14', 'Word15', 'Frequency15', 'Word16',
        'Frequency16', 'Word17', 'Frequency17', 'Word18', 'Frequency18',
        'Word19', 'Frequency19', 'Word20', 'Frequency20'
    ]
    if IO_csv_util.openCSVOutputFile(outputFilenameCSV):
        return

    IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis start',
                                       'Started running corpus statistics at',
                                       True)

    with open(outputFilenameCSV,
              'w',
              encoding='utf-8',
              errors='ignore',
              newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        #print("Number of corpus text documents: ",Ndocs)
        #currentLine.append([Ndocs])
        index = 0
        for doc in inputDocs:
            head, tail = os.path.split(doc)
            index = index + 1
            # currentLine.append([index])
            print("Processing file " + str(index) + "/" + str(Ndocs) + " " +
                  tail)
            #currentLine.append([doc])
            fullText = (open(doc, "r", encoding="utf-8",
                             errors="ignore").read())

            Nsentences = str(textstat.sentence_count(fullText))
            #print('TOTAL number of sentences: ',Nsentences)

            Nwords = str(textstat.lexicon_count(fullText, removepunct=True))
            #print('TOTAL number of words: ',Nwords)

            Nsyllables = textstat.syllable_count(fullText, lang='en_US')
            #print('TOTAL number of Syllables: ',Nsyllables)

            # words = fullText.split()
            words = nltk.word_tokenize(fullText)

            if excludeStopWords:
                words = excludeStopWords_list(words)

            if lemmatizeWords:
                lemmatizer = WordNetLemmatizer()
                text_vocab = set(
                    lemmatizer.lemmatize(w.lower())
                    for w in fullText.split(" ") if w.isalpha())
                words = set(
                    lemmatizing(w.lower()) for w in words
                    if w.isalpha())  # fullText.split(" ") if w.isalpha())

            word_counts = Counter(words)
            # 20 most frequent words
            #print("\n\nTOP 20 most frequent words  ----------------------------")
            # for item in word_counts.most_common(20):
            #     print(item)
            # currentLine=[[Ndocs,index,doc,Nsentences,Nwords,Nsyllables]]
            currentLine = [[
                Ndocs, index,
                IO_csv_util.dressFilenameForCSVHyperlink(doc), Nsentences,
                Nwords, Nsyllables
            ]]
            for item in word_counts.most_common(20):
                currentLine[0].append(item[0])  # word
                currentLine[0].append(item[1])  # frequency
            writer = csv.writer(csvfile)
            writer.writerows(currentLine)
        csvfile.close()

        # compute statistics about doc length grouped by Document
        list = ['Document ID']
        tempOutputfile = statistics_csv_util.compute_field_statistics_groupBy(
            window, outputFilenameCSV, outputDir, list, openOutputFiles,
            createExcelCharts, 4)
        # ,4)  # 'number of words in doc'
        if tempOutputfile != None:
            filesToOpen.extend(tempOutputfile)

        IO_user_interface_util.timed_alert(
            GUI_util.window, 2000, 'Analysis end',
            'Finished running corpus statistics at', True)

        if createExcelCharts == True:
            columns_to_be_plotted = [[1, 3], [1, 4]]
            hover_label = ['Document', 'Document']
            inputFilename = outputFilenameCSV
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted,
                inputFilename,
                outputDir,
                outputFileLabel='corpus_stats',
                chart_type_list=["bar"],
                # chart_title='Corpus statistics\nCorpus directory: '+inputDir,
                chart_title=
                'Corpus Statistics: Frequency of Sentences & Words by Document',
                column_xAxis_label_var='Document',
                hover_info_column_list=hover_label)
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)

        # TODO
        #   we should create 10 classes of values by distance to the median of
        #       each value in the Number of Words in Document Col. E
        #   -0-10 11-20 21-30,… 91-100
        #   and plot them as column charts.

        if openOutputFiles == True:
            IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                          filesToOpen)
    return filesToOpen
Esempio n. 5
0
def compute_stats_NLP_main(window,
                           inputFilename,
                           inputDataFrame,
                           outputDir,
                           openOutputFiles,
                           createExcelCharts,
                           columns_to_be_plotted,
                           selected_col,
                           hover_col,
                           group_col,
                           fileNameType='CSV',
                           chartType='line'):

    filesToOpen = []
    container = []
    if len(inputDataFrame) != 0:
        data = inputDataFrame
    else:
        with open(inputFilename, encoding='utf-8', errors='ignore') as infile:
            reader = csv.reader(x.replace('\0', '') for x in infile)
            headers = next(reader)
        header_indices = [i for i, item in enumerate(headers) if item]
        data = pd.read_csv(inputFilename,
                           usecols=header_indices,
                           encoding='utf-8')

    if len(selected_col) == 0:
        mb.showwarning(
            'Missing field',
            'You have not selected the csv field for which to compute frequencies.\n\nPlease, select the field and try again.'
        )

    elif len(selected_col) != 0 and len(group_col) == 0:
        for col in selected_col:
            output_file_name = IO_files_util.generate_output_file_name(
                inputFilename, '', outputDir, '.csv', col)
            data = data[col].value_counts().to_frame().reset_index()
            hdr = [col, col + ' Frequency']

            Hover_over_header = []
            if len(hover_col) != 0:
                hover_header = ', '.join(hover_col)
                Hover_over_header = ['Hover_over: ' + hover_header]
                hdr.append(Hover_over_header)
                data.columns = hdr
                temp_str = '%s' + '\n%s' * (len(hover_col) - 1)
                data['Hover_over: ' + hover_header] = data.apply(
                    lambda x: temp_str % tuple(x[h] for h in hover_col),
                    axis=1)
                data.drop(hover_col, axis=1, inplace=True)
            else:
                data.columns = hdr
            data.to_csv(output_file_name, index=False)
            filesToOpen.append(output_file_name)

            if createExcelCharts:
                # columns_to_be_plotted = [[1, 2]] # hard code Yi
                Excel_outputFilename = Excel_util.run_all(
                    columns_to_be_plotted,
                    inputFilename,
                    outputDir,
                    outputFileLabel=fileNameType,
                    chart_type_list=chartType,
                    chart_title='',
                    column_xAxis_label_var=col,
                    hover_info_column_list=Hover_over_header)
                if Excel_outputFilename != "":
                    filesToOpen.append(Excel_outputFilename)

    elif len(selected_col) != 0 and len(group_col) != 0 and len(
            hover_col) == 0:
        for col in selected_col:
            output_file_name = IO_files_util.generate_output_file_name(
                inputFilename, '', outputDir, '.csv', col)
            temp = group_col.copy()
            temp.append(col)
            data = data.groupby(temp).size().reset_index(name='Frequency')
            for index, row in data.iterrows():
                if row[col] == '':
                    data.at[index, 'Frequency'] = 0
            data.to_csv(output_file_name, index=False)
            filesToOpen.append(output_file_name)
            if createExcelCharts:
                # columns_to_be_plotted = [[1, 2]] # hard code Yi
                Excel_outputFilename = Excel_util.run_all(
                    columns_to_be_plotted,
                    inputFilename,
                    outputDir,
                    outputFileLabel=fileNameType,
                    chart_type_list=chartType,
                    chart_title='',
                    column_xAxis_label_var=col,
                    hover_info_column_list=Hover_over_header)
                filesToOpen.append(Excel_outputFilename)
                if Excel_outputFilename != "":
                    filesToOpen.append(Excel_outputFilename)

                # # columns_to_be_plotted = [[1, 2]] # hard code Yi
                # Excel_outputFilename = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir,
                #                                       outputFileLabel=fileNameType,
                #                                       chart_type_list=[chartType],
                #                                       chart_title='',
                #                                       column_xAxis_label_var=col,
                #                                       hover_info_column_list=Hover_over_header)
                # filesToOpen.append(Excel_outputFilename)
    else:
        for col in hover_col:
            temp = group_col.copy()
            temp.append(col)
            c = data.groupby(group_col)[col].apply(list).to_dict()

            container.append(c)

        temp = group_col.copy()
        temp.extend(selected_col)
        data = data.groupby(temp).size().reset_index(name='Frequency')
        for index, row in data.iterrows():
            if row[selected_col[0]] == '':
                data.at[index, 'Frequency'] = 0

        hover_header = ', '.join(hover_col)
        Hover_over_header = ['Hover_over: ' + hover_header]

        for index, hover in enumerate(hover_col):
            df = pd.Series(container[index]).reset_index()
            temp = group_col.copy()
            temp.append(hover)
            df.columns = temp
            data = data.merge(df,
                              how='left',
                              left_on=group_col,
                              right_on=group_col)
        temp_str = '%s' + '\n%s' * (len(hover_col) - 1)
        data['Hover_over: ' + hover_header] = data.apply(
            lambda x: temp_str % tuple(x[h] for h in hover_col), axis=1)
        data.drop(hover_col, axis=1, inplace=True)

        if createExcelCharts:
            # columns_to_be_plotted = [[1, 2]] # hard code Yi
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted,
                inputFilename,
                outputDir,
                outputFileLabel=fileNameType,
                chart_type_list=chartType,
                chart_title='',
                column_xAxis_label_var=col,
                hover_info_column_list=Hover_over_header)
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)

        # need change, put run_all
        # if createExcelCharts:
        #     filesToOpen=Excel_util.prepare_csv_data_for_chart(window,
        #                                                         inputFilename, data, outputDir,
        #                                                         selected_col,
        #                                                         Hover_over_header, group_col, fileNameType,
        #                                                         chartType,openOutputFiles, createExcelCharts)
    if openOutputFiles == 1:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
        filesToOpen = []  # empty list not to display twice

    return filesToOpen  #2 files
def language_detection(window, inputFilename, inputDir, outputDir,
                       openOutputFiles, createExcelCharts):

    folderID = 0
    fileID = 0
    filesToOpen = []

    outputFilenameCSV = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', 'lang_detect')
    filesToOpen.append(outputFilenameCSV)

    files = IO_files_util.getFileList(inputFilename, inputDir, '.txt')
    if len(files) == 0:
        return

    if IO_csv_util.openCSVOutputFile(outputFilenameCSV):
        return

    fieldnames = [
        'LANGDETECT', 'Language', 'Probability', 'SPACY', 'Language',
        'Probability', 'LANGID', 'Language', 'Probability', 'Document ID',
        'Document'
    ]

    config_filename = 'file-spell-checker-config.txt'
    reminders_util.checkReminder(
        config_filename, ['Language detection'],
        'Language detection algorithms are very slow. The NLP Suite runs three different types of algorithms: LANGDETECT, SPACY, and LANGID.\n\nPlease, arm yourself with patience, depennding upon the number and size of documents processed.',
        True)

    IO_user_interface_util.timed_alert(
        GUI_util.window, 2000, 'Analysis start',
        'Started running language detection algorithms at', True,
        'You can follow the algorithms in command line.')

    with open(outputFilenameCSV,
              'w',
              encoding='utf-8',
              errors='ignore',
              newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        docErrors_empty = 0
        docErrors_unknown = 0
        filenameSV = ''
        for filename in files:
            fileID = fileID + 1
            head, tail = os.path.split(filename)
            print("Processing file " + str(fileID) + "/" + str(len(files)) +
                  ' ' + tail)
            text = open(filename, 'r', encoding='utf-8',
                        errors='ignore').read()
            if len(text) == 0:
                print(
                    "  The file is empty. It will be discarded from processing."
                )
                docErrors_empty = docErrors_empty + 1
                continue
            # text = opened_file.read()
            # head, tail = os.path.split(filename)
            # head is path, tail is filename
            try:
                value = detect_langs(text)
            except:
                filenameSV = filename  # do not count the same document twice in this and the other algorithms that follow
                docErrors_unknown = docErrors_unknown + 1
                print("  Unknown file read error.")
                continue
            value = str(value[0]).split(':')
            language = value[0]
            probability = value[1]
            print('   LANGDETECT', language, probability)
            # print('   LANGDETECT',value[0],value[1])  # [cs:0.7142840957132709, pl:0.14285810606233737, sk:0.14285779665739756]
            currentLine = ['LANGDETECT', language, probability]

            nlp = spacy.load('en_core_web_sm')
            nlp.add_pipe(LanguageDetector(),
                         name='language_detector',
                         last=True)
            try:
                doc = nlp(text)
            except:
                if filename != filenameSV:  # do not count the same document twice in this and the other algorithm that follows
                    docErrors_unknown = docErrors_unknown + 1
                    filenameSV = filename
                print("  Unknown file read error.")
                continue
            value = doc._.language
            language = value['language']
            probability = value['score']
            print(
                '   SPACY', language,
                probability)  # {'language': 'en', 'score': 0.9999978351575265}
            currentLine.extend(['SPACY', language, probability])

            lang_identifier = LanguageIdentifier.from_modelstring(
                model, norm_probs=True)
            try:
                value = lang_identifier.classify(text)
            except:
                if filename != filenameSV:
                    docErrors_unknown = docErrors_unknown + 1
                    filenameSV = filename
                print("  Unknown file read error.")
                continue
            language = value[0]
            probability = value[1]
            print('   LANGID', language,
                  probability)  # ('en', 0.999999999999998)
            print()
            currentLine.extend(['LANGID', language, probability])
            currentLine.extend(
                [fileID,
                 IO_csv_util.dressFilenameForCSVHyperlink(filename)])

            writer = csv.writer(csvfile)
            writer.writerows([currentLine])
            filenameSV = filename
    csvfile.close()
    msg = ''
    if docErrors_empty == 0 and docErrors_unknown == 0:
        msg = str(
            fileID
        ) + ' documents successfully processed for language detection.'
    else:
        if docErrors_empty > 0:
            msg = str(
                fileID
            ) + ' documents processed for language detection.\n  ' + str(
                docErrors_empty) + ' document(s) found empty.'
        if docErrors_unknown > 0:
            if msg != '':
                msg = msg + '\n  ' + str(
                    docErrors_unknown
                ) + ' document(s) read with unknown errors.'
            else:
                msg = str(fileID) + ' documents processed for language detection.\n  ' + \
                      str(docErrors_unknown) + ' document(s) read with unknown errors.'
        mb.showwarning(
            title='File read errors',
            message=msg +
            '\n\nFaulty files are listed in command line/terminal. Please, search for \'File read error\' and inspect each file carefully.'
        )
    filesToOpen.append(outputFilenameCSV)
    if createExcelCharts:
        columns_to_be_plotted = [[1, 1], [4, 4], [7, 7]]
        chart_title = 'Frequency of Languages Detected by 3 Algorithms'
        hover_label = ['LANGDETECT', 'SPACY', 'LANGID']
        inputFilename = outputFilenameCSV
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted,
            inputFilename,
            outputDir,
            outputFileLabel='_bar_chart',
            chart_type_list=["bar"],
            chart_title=chart_title,
            column_xAxis_label_var='Language',
            hover_info_column_list=hover_label,
            count_var=1)
        if Excel_outputFilename != '':
            filesToOpen.append(Excel_outputFilename)

    if openOutputFiles:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
def nltk_unusual_words(window,
                       inputFilename,
                       inputDir,
                       outputDir,
                       openOutputFiles,
                       createExcelCharts=True,
                       silent=False):
    filesToOpen = []
    unusual = []
    container = []
    documentID = 0
    files = IO_files_util.getFileList(inputFilename, inputDir, '.txt')
    nFile = len(files)
    if nFile == 0:
        return
    outputFilename = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', 'NLTK_unus', 'stats')
    filesToOpen.append(outputFilename)

    IO_user_interface_util.timed_alert(
        GUI_util.window, 3000, 'NLTK unusual words/spelling checker start',
        'Started running NLTK unusual words/spelling checker at', True,
        'You can follow NLTK words/spelling checker in command line.')

    # already shown in NLP.py
    # IO_util.timed_alert(GUI_util.window,3000,'Analysis start','Started running NLTK unusual words at',True,'You can follow NLTK unusual words in command line.')
    for file in files:
        documentID = documentID + 1
        head, tail = os.path.split(file)
        print("Processing file " + str(documentID) + "/" + str(nFile) + ' ' +
              tail)
        text = (open(file, "r", encoding="utf-8", errors="ignore").read())
        #lemmatizer = WordNetLemmatizer()
        # text_vocab = set(lemmatizer.lemmatize(w.lower()) for w in text.split(" ") if w.isalpha())
        text_vocab = set(
            lemmatizing(w.lower()) for w in text.split(" ") if w.isalpha())
        english_vocab = set([w.lower() for w in nltk.corpus.words.words()])
        print("english_vocab", english_vocab)
        print("text_vocab", text_vocab)
        unusual = text_vocab - english_vocab
        #convert the set to a list
        unusual = list(unusual)
        #sort the list
        unusual.sort()
        # unusual = [[documentID, file, word] for word in unusual]
        unusual = [[
            documentID,
            IO_csv_util.dressFilenameForCSVHyperlink(file), word
        ] for word in unusual]
        container.extend(unusual)
    container.insert(0, ['Document ID', 'Document', 'Misspelled/unusual word'])
    if len(container) > 0:
        if IO_csv_util.list_to_csv(window, container, outputFilename): return
    else:
        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'Spelling checker (via nltk)',
            'No misspelled/unusual words found in\n' + file, True)
        if nFile == 1:
            return

    if not silent:
        IO_user_interface_util.single_file_output_save(inputDir, 'NLTK')

    # NLTK unusual words
    if createExcelCharts:
        if nFile > 10:
            result = mb.askyesno(
                "Excel charts", "You have " + str(nFile) +
                " files for which to compute Excel charts.\n\nTHIS WILL TAKE A LONG TIME.\n\nAre you sure you want to do that?"
            )
            if result == False:
                pass
        columns_to_be_plotted = [[2, 2]]
        hover_label = ['']
        inputFilename = outputFilename
        Excel_outputFileName = Excel_util.run_all(
            columns_to_be_plotted,
            inputFilename,
            outputDir,
            outputFileLabel='NLTK_spell',
            chart_type_list=["bar"],
            chart_title='Misspelled/Unusual Words Frequency',
            column_xAxis_label_var='',
            hover_info_column_list=hover_label,
            count_var=1)
        if Excel_outputFileName != "":
            filesToOpen.append(Excel_outputFileName)

    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
        filesToOpen = []  # do not open twice, hee and calling function
    # already shown in NLP.py
    # IO_util.timed_alert(GUI_util.window,3000,'Analysis end','Finished running NLTK unusual words at',True)
    for u in unusual:
        print(u[-1])

    print(len(unusual))
    return filesToOpen
Esempio n. 8
0
def run(inputDir, outputDir, openOutputFiles, createExcelCharts, n_grams_var,
        n_grams_menu_var, n_grams_list, n_grams_viewer_var, CoOcc_Viewer_var,
        search_words, date_options, temporal_aggregation, date_format,
        date_separator_var, date_position_var, viewer_list):
    # print(date_options, temporal_aggregation, date_format, date_separator_var, date_position_var)
    filesToOpen = []

    total_file_number = 0
    error_file_number = 0
    error_filenames = []
    error_flag = False

    if n_grams_var == False and n_grams_viewer_var == False and CoOcc_Viewer_var == False:
        mb.showwarning(
            title='Warning',
            message=
            'There are no options selected.\n\nPlease, select one of the available options and try again.'
        )
        return
    if date_options:
        new_date_format = date_format.replace('yyyy', '%Y').replace(
            'mm', '%m').replace('dd', '%d')
        for folder, subs, files in os.walk(inputDir):
            for filename in files:
                if not filename.endswith('.txt'):
                    continue
                filename = filename.replace('.txt', '')
                total_file_number = total_file_number + 1
                try:
                    date_text = ''
                    date_text = filename.split(date_separator_var)[
                        date_position_var - 1]
                except:  # if a file in the folder has no date it will break the code
                    pass
                try:
                    datetime.datetime.strptime(date_text, new_date_format)
                except ValueError:
                    error_file_number = error_file_number + 1
                    error_filenames.append(
                        IO_csv_util.dressFilenameForCSVHyperlink(
                            os.path.join(folder, filename + '.txt')))
                    error_flag = True

    if error_flag:
        df = pd.DataFrame(error_filenames,
                          columns=[
                              'File with date not in position ' +
                              str(date_position_var)
                          ])
        error_output = IO_files_util.generate_output_file_name(
            '', inputDir, outputDir, '.csv', 'Date_position_errors_file')
        df.to_csv(error_output, index=False)
        mb.showwarning(
            title='Warning',
            message='There are ' + str(error_file_number) + ' files out of ' +
            str(total_file_number) +
            ' processed in the selected input directory with errors in either the date format or the date position. \n\nThe selected date format is '
            + str(date_format) + ' and the selected date position is ' +
            str(date_position_var) +
            '.\n\nClick OK to open a csv file with a list of files with erroneous dates. Check carefully, both date format and date position. Any erroneous file will need to be fixed or removed from the input directory before processing. You may also simply need to select a different date format and/or date position.'
        )
        filesToOpen.append(error_output)
        if openOutputFiles == True:
            IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                          filesToOpen)
        return

# COMPUTE Ngrams ______________________________________________________________________________

    if n_grams_var:
        n_grams_word_var = False
        n_grams_character_var = False
        normalize = False
        n_grams_size = 4  # default number of n_grams
        excludePunctuation = False
        bySentenceIndex_word_var = False
        bySentenceIndex_character_var = False
        if n_grams_menu_var == "Word":
            n_grams_word_var = True
        else:
            n_grams_character_var = True
        bySentenceIndex_character_var = False
        if 'Hapax' in str(n_grams_list):
            n_grams_size = 1
        if 'punctuation' in str(n_grams_list):
            excludePunctuation = True
        if 'sentence index' in str(n_grams_list):
            if n_grams_menu_var == "Word":
                bySentenceIndex_word_var = True
            else:
                bySentenceIndex_character_var = True

        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'N-Grams start',
            'Started running ' + n_grams_menu_var + ' n-grams at', True,
            'You can follow the script in command line.')

        if n_grams_word_var or n_grams_character_var or bySentenceIndex_word_var or bySentenceIndex_character_var:
            inputFilename = ''  # for now we only process a whole directory
            if IO_libraries_util.inputProgramFileCheck(
                    'statistics_txt_util.py') == False:
                return

        if n_grams_word_var or bySentenceIndex_word_var:
            statistics_txt_util.compute_character_word_ngrams(
                GUI_util.window, inputFilename, inputDir, outputDir,
                n_grams_size, normalize, excludePunctuation, 1,
                openOutputFiles, createExcelCharts, bySentenceIndex_word_var)
        if n_grams_character_var or bySentenceIndex_character_var:
            statistics_txt_util.compute_character_word_ngrams(
                GUI_util.window, inputFilename, inputDir, outputDir,
                n_grams_size, normalize, excludePunctuation, 0,
                openOutputFiles, createExcelCharts,
                bySentenceIndex_character_var)
        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'N-Grams end',
            'Finished running ' + n_grams_menu_var + ' n-grams at', True)

# VIEWER ____________________________________________________________________________________________

    if (n_grams_viewer_var == False and CoOcc_Viewer_var == False):
        return

    if (n_grams_viewer_var == True
            or CoOcc_Viewer_var == True) and (createExcelCharts == False):
        mb.showwarning(
            title='Warning',
            message=
            'The checkbox to compute Excel charts is unticked. Since the VIEWER produces Excel charts as output, the script will abort.\n\nPlease, tick the checkbox to produce Excel charts and try again.'
        )
        return

    txtCounter = len(glob.glob1(inputDir, "*.txt"))
    if txtCounter == 0:
        mb.showwarning(
            title='Warning',
            message=
            'There are no files with txt extension in the selected directory.\n\nPlease, select a different directory and try again.'
        )
        return

    if txtCounter == 1:
        mb.showwarning(
            title='Warning',
            message=
            'There is only one file with txt extension in the selected directory. The script requires at least two files.\n\nPlease, select a different directory and try again.'
        )
        return

    if (n_grams_viewer_var or CoOcc_Viewer_var):
        if IO_libraries_util.inputProgramFileCheck(
                'NGrams_CoOccurrences_Viewer.jar') == False:
            return
        errorFound, error_code, system_output = IO_libraries_util.check_java_installation(
            'Ngram/CoOccurrence Viewer')
        if errorFound:
            return

    if ',' in search_words:
        mb.showwarning(
            title='Warning',
            message=
            'Values entered in the search bar should not be comma-separated, but blank-separated (e.g., woman man, and not woman, man).\n\nPlease, check your search bar values and try again.'
        )
        return

    if search_words != '' and n_grams_viewer_var == False and CoOcc_Viewer_var == False:
        mb.showwarning(
            title='Warning',
            message="You have entered the string '" + search_words +
            "' in the Search widget but you have not selected which Viewer you wish to use, Ngram or Co-Occurrence.\n\nPlease, select an option and try again."
        )
        return

    if search_words == '' and (n_grams_viewer_var == True
                               or CoOcc_Viewer_var == True):
        mb.showwarning(
            title='Warning',
            message=
            "You have selected to run a Viewer but you have not entered any search strings in the Search widget.\n\nPlease, enter search values  and try again."
        )
        return

    normalize = False
    scaleData = False
    useLemma = False
    fullInfo = False
    if 'Normalize' in str(viewer_list):
        normalize = True
    if 'Scale' in str(viewer_list):
        scaleData = True
    if 'Lemmatize' in str(viewer_list):
        useLemma = True
    if 'full information' in str(viewer_list):
        fullInfo = True

    cmd = [
        'java', '-jar', 'NGrams_CoOccurrences_Viewer.jar', '-inputFolder',
        inputDir, '-outputFolder', outputDir
    ]

    if (n_grams_viewer_var == 1
            or CoOcc_Viewer_var == 1) and len(search_words) == 0:
        mb.showwarning(
            title='Warning',
            message=
            'No search words have been entered for either N-Grams or words co-occurrences.\n\nPlease, enter the search words and try again.'
        )
        return

    if n_grams_viewer_var == 1 and len(search_words) > 0:
        if date_options == 0:
            mb.showwarning(
                title='Warning',
                message=
                'No Date options selected. The N-Grams routine requires date metadata (i.e., date information embedded in the document filenames, e.g., The New York Times_12-18-1899).\n\nPlease, tick the Date options checkbox, enter the appropariate date options and try again.'
            )
            return
        ngram_list = processSearchWords(search_words)
        ngram_list = ['-checkNGrams'] + ngram_list
        cmd.extend(ngram_list)

    if date_options == 1:
        cmd.extend([
            '-AggregateBy', temporal_aggregation, '-dateFormat', date_format,
            '-datePos',
            str(date_position_var), '-itemsDelim', date_separator_var
        ])

    if CoOcc_Viewer_var == 1 and len(search_words) > 0:
        co_occurrences_list = processSearchWords(search_words)
        co_occurrences_list = ["-checkCoOccurrences"] + co_occurrences_list
        cmd.extend(co_occurrences_list)

    if normalize == 1 and n_grams_viewer_var == 1 and len(search_words) > 0:
        cmd.append('-normalize')  # only available for Ngrams
    if scaleData == 1: cmd.append('-scaledata')
    if useLemma == 1: cmd.append('-lemma')
    if fullInfo == 1: cmd.append('-fullInfo')

    IO_user_interface_util.timed_alert(
        GUI_util.window, 3000, 'N-Grams Word Co-Occurrences start',
        'Started running N-Grams Word Co-Occurrences Viewer at', True,
        'Please, be patient. Depending upon the number of documents processed this may take a few minutes.\n\nYou can follow the script in command line.'
    )

    reminders_util.checkReminder(
        config_filename, ['subprocess.call(cmd) error'],
        'subprocess.call(cmd) error\n\nIf the VIEWER you are running exits with an error code about a file not found, most likely your selected INPUT & OUTPUT directory options are too long for Windows to handle.\n\nYou may need to move your input and output folders so as to have a shorter path (e.g., desktop).',
        True)
    print(cmd)
    try:
        subprocess.run(cmd, shell=True)
    except:
        mb.showwarning(
            title='Warning',
            message=
            "The Java viewer script exited with errors. Please, check your command line for a possible error 'Java' is not recognized as an internal or external command. If that's the case, please install Java JDK. Please, check the TIPS on Java download and installation and try again."
        )
        return

    if n_grams_viewer_var == 1 and len(search_words) > 0:
        # this is the output filename generated by the Java script
        n_grams_outputFile = os.path.join(outputDir, 'Searched_N-Grams.csv')
        if IO_files_util.checkFile(n_grams_outputFile, '.csv', True) == False:
            mb.showwarning(
                title='Warning',
                message=
                "The Java viewer script did not produce an N-grams output file.\n\nPlease, check your command line for possible Java errors and try again."
            )
            return

    if CoOcc_Viewer_var == 1 and len(search_words) > 0:
        # this is the output filename generated by the Java script
        co_occurrences_outputFile = os.path.join(outputDir,
                                                 'Searched_CoOccurrences.csv')
        if IO_files_util.checkFile(co_occurrences_outputFile, '.csv',
                                   True) == False:
            mb.showwarning(
                title='Warning',
                message=
                "The Java viewer script did not produce a Co-occurrences output file.\n\nPlease, check your command line for possible Java errors and try again."
            )
            return

    # plot co-occurrences
    if createExcelCharts == True and CoOcc_Viewer_var == 1 and len(
            search_words) > 0:
        xlsxFilename = co_occurrences_outputFile
        filesToOpen.append(co_occurrences_outputFile)
        chartTitle = 'Co-Occurrences Viewer'
        if date_options == 0:
            xAxis = 'Document'
        else:
            xAxis = temporal_aggregation
        hover_label = ['More information']
        if xAxis == 'Document':
            columns_to_be_plotted = [[1, 1]]
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted,
                xlsxFilename,
                outputDir,
                'Co-Occ_viewer',
                chart_type_list=["pie"],
                chart_title=chartTitle,
                column_xAxis_label_var=xAxis,
                hover_info_column_list=hover_label,
                count_var=1)
        else:
            columns_to_be_plotted = [[0, 1]]
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted,
                xlsxFilename,
                outputDir,
                'Co-Occ_viewer',
                chart_type_list=["line"],
                chart_title=chartTitle,
                column_xAxis_label_var=xAxis,
                hover_info_column_list=hover_label)
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

    # plot Ngrams
    if createExcelCharts == True and n_grams_viewer_var == 1 and len(
            search_words) > 0:
        xlsxFilename = n_grams_outputFile
        filesToOpen.append(n_grams_outputFile)
        xAxis = temporal_aggregation
        chartTitle = 'N-Grams Viewer'
        columns_to_be_plotted = []
        for i in range(len(ngram_list) -
                       1):  # it will iterate through i = 0, 1, 2, …., n-1
            columns_to_be_plotted.append([0, i + 1])
        hover_label = [
            'Total Word Count of This Group', 'Total Word Count of This Group',
            'Total Word Count of This Group'
        ]
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted,
            xlsxFilename,
            outputDir,
            'n-grams_viewer',
            chart_type_list=["line"],
            chart_title=chartTitle,
            column_xAxis_label_var=xAxis,
            hover_info_column_list=hover_label)
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

    # with both Ngrams and co-occurrences
    if n_grams_viewer_var == 1 and CoOcc_Viewer_var == 1 and CoOcc_Viewer_var == 1 and len(
            search_words) > 0:
        n_grams_co_occurrences_outputFile = os.path.join(
            outputDir, 'N-Grams_CoOccurrences_Statistics.csv')
        filesToOpen.append(n_grams_co_occurrences_outputFile)
        chartTitle = ''

    IO_user_interface_util.timed_alert(
        GUI_util.window, 3000, 'N-Grams Word Co-Occurrences end',
        'Finished running N-Grams Word Co-Occurrences Viewer at', True)

    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
Esempio n. 9
0
def run(inputDir, outputDir, openOutputFiles, createExcelCharts,
        OptimizeInterval, numTopics):

    # to setup environment variable programmatically
    #   https://stackoverflow.com/questions/4906977/how-to-access-environment-variable-values
    # to get an environment variable
    #   malletEnvDir=os.getenv('', 'MALLET_HOME')
    # os.environ lists all environment variables
    # # remove env variable; two alternatives
    # os.environ.pop("MALLET_HOME")
    # del os.environ['MALLET_HOME']

    # check that the MalletDir as been setup
    MalletDir = IO_libraries_util.get_external_software_dir(
        'topic_modeling_mallet', 'Mallet')
    if MalletDir == None:
        return

    MalletPath = ''
    try:
        # if MALLET_HOME has been set up os.getenv returns the Mallet installation path
        MalletPath = os.getenv('MALLET_HOME', 'MALLET_HOME')
        if MalletPath == 'MALLET_HOME':
            # the env variable has not been setup
            MalletPath = ''
            mb.showwarning(
                title='MALLET-HOME environment variable',
                message=
                'The value MALLET-HOME needed by Mallet to run was not found in the environment variables.\n\nThe MALLET_HOME value was added programmatically to your environment variables.\n\nTHIS IS A TEMPORARY FIX VALID FOR RUNNING THE MALLET AS LONG AS THIS GUI REMAINS OPEN. For a more permanent solution, please read the TIPS on Mallet installation and setting Mallet environment variables.'
            )
            # add environment variable
            os.environ["MALLET_HOME"] = MalletDir
        else:
            MalletDir = MalletDir.replace("\\", "/")
            MalletPath = MalletPath.replace("\\", "/")
            if str(MalletPath).lower() != str(MalletDir).lower():
                # add updated environment variable
                os.environ["MALLET_HOME"] = MalletDir
                mb.showwarning(
                    title='Mallet environment variable path update',
                    message=
                    'The value MALLET-HOME in the environment variables was changed from\n\n  '
                    + MalletPath + '\n\nto\n\n  ' + MalletDir)
    except:
        mb.showwarning(
            title='MALLET-HOME environment variable',
            message=
            'The value MALLET-HOME needed by Mallet to run was not found in the environment variables.\n\nThe MALLET_HOME value was added programmatically to your environment variables.\n\nTHIS IS A TEMPORARY FIX VALID FOR RUNNING THE MALLET AS LONG AS THIS GUI REMAINS OPEN. For a more permanent solution, please read the TIPS on Mallet installation and setting Mallet environment variables.'
        )
        MalletDir = MalletDir.replace("\\", "/")
        MalletPath = MalletPath.replace("\\", "/")
        if str(MalletPath).lower() != str(MalletDir).lower():
            # add environment variable
            os.environ["MALLET_HOME"] = MalletDir

    filesToOpen = []

    MalletDir = MalletDir + os.sep + 'bin'

    if ' ' in inputDir:
        mb.showerror(
            title='Input file error',
            message=
            'The selected INPUT directory contains a blank (space) in the path. The Mallet code cannot handle input/output paths that contain a space and will break.\n\nPlease, place your input files in a directory with a path containing no spaces and try again.'
        )
        return
    if ' ' in outputDir:
        mb.showerror(
            title='Output file error',
            message=
            'The selected OUTPUT directory contains a blank (space) in the path. The Mallet code cannot handle input/output paths that contain a space and will break.\n\nPlease, select an output directory with a path containing no spaces and try again.'
        )
        return

    if not os.path.isdir(inputDir):
        mb.showerror(
            title='Input directory error',
            message=
            'The selected input directory does NOT exist.\n\nPlease, select a different directory and try again.'
        )
        return

    if not os.path.isdir(outputDir):
        mb.showerror(
            title='Output directory error',
            message=
            'The selected output directory does NOT exist.\n\nPlease, select a different directory and try again.'
        )
        return

    numFiles = IO_files_util.GetNumberOfDocumentsInDirectory(inputDir, 'txt')
    if numFiles == 0:
        mb.showerror(
            title='Number of files error',
            message=
            'The selected input directory does NOT contain any file of txt type.\n\nPlease, select a different directory and try again.'
        )
        return
    elif numFiles == 1:
        mb.showerror(
            title='Number of files error',
            message='The selected input directory contains only ' +
            str(numFiles) +
            ' file of txt type.\n\nTopic modeling requires a large number of files to produce valid results. That is true even if the available file contains several different documents morged together.'
        )
        return
    elif numFiles < 10:
        mb.showwarning(
            title='Number of files',
            message='The selected input directory contains only ' +
            str(numFiles) +
            ' files of txt type.\n\nTopic modeling requires a large number of files to produce valid results.'
        )
    """
    All OUTPUT file names can be changed and Mallet will still run successfully
    OUTPUT file names extensions for step two can be TXT or CSV
    """
    # output.mallet
    TXTFiles_MalletFormatted_FileName = os.path.join(
        outputDir, "MalletFormatted_TXTFiles.mallet")
    # output.csv or output.txt
    Composition_FileName = os.path.join(outputDir,
                                        "NLP-Mallet_Output_Composition")
    # keys.tsv or keys.txt
    Keys_FileName = os.path.join(outputDir, "NLP-Mallet_Output_Keys.tsv")
    #output.gz
    Compressed_FileName = os.path.join(outputDir,
                                       "NLP-Mallet_Output_Compressed.gz")

    # filesToOpen.append(Composition_FileName+'.csv')
    # filesToOpen.append(Keys_FileName+'.csv')
    #
    """
    The Key table has as many lines as desired topics and three columns 
        TOPIC #, 
        WEIGHT OF TOPIC that measures the weight of the topic across all the documents,
        KEY WORDS IN TOPIC that lists a set of typical words belonging to the topic.
        
    The Composition table has as many lines as documents analyzed (one document per line) and several columns:
        column 1 (Document ID), 
        column 2 (Document with path), 
        and as many successive pairs of columns as the number of topics, with column pairs as follow: 
            TOPIC is a number corresponding to the number in column 1 in the Keys file; 
            PROPORTION measures the % of words in the document attributed to that topic (pairs sorted in descending PROPORTION order).
    """

    # mb.showwarning(title="Mallet output files",message="The Python Mallet wrapper runs Mallet with default options. If you want to provide custom options, please run Mallet from the command prompt.\n\nThe NLP Mallet produces four files in output (refer to the Mallet TIPS file for what each file contains):\n\n" +
    #     TXTFiles_MalletFormatted_FileName + "\n" +
    #     Composition_FileName + "\n" +
    #     Keys_FileName + "\n" +
    #     Compressed_FileName)

    IO_user_interface_util.timed_alert(
        GUI_util.window, 2000, 'Analysis start',
        'Started running Mallet Topic modeling at ', True,
        "Depending upon corpus size, computations may take a while... Please, be patient...\n\nYou can follow Mallet in command line."
    )

    #FIRST STEP

    # The output file MalletFormatted_TXTFiles.mallet contains all corpus TXT files properly formatted for Mallet
    if platform == "win32":
        subprocess.call([
            MalletDir + os.sep + 'mallet', 'import-dir', '--input', inputDir,
            '--output', TXTFiles_MalletFormatted_FileName, '--keep-sequence',
            '--remove-stopwords'
        ],
                        shell=True)
    # linux # OS X
    elif platform == "linux" or platform == "linux2" or platform == "darwin":
        subprocess.call([
            MalletDir + os.sep + 'mallet', 'import-dir', '--input', inputDir,
            '--output', TXTFiles_MalletFormatted_FileName, '--keep-sequence',
            '--remove-stopwords'
        ])

    #SECOND STEP
    #The output file Composition_FileName is a tsv file indicating the breakdown,
    #    by percentage, of each topic within each original imported text file
    #The output file Keys_FileName is a text file showing what the top key words are for each topic
    #the .gz file contains in .gz compressed form every word in your corpus, with each topic associated with each
    #see www.gzip.org on how to unzip this
    #Interval Optimization leads to better results according to http://programminghistorian.org/lessons/topic-modeling-and-mallet

    #     the real format of the file created by mallet is .tsv or .txt

    if platform == "win32":
        if OptimizeInterval == True:
            subprocess.call([
                MalletDir + os.sep + 'mallet', 'train-topics', '--input',
                TXTFiles_MalletFormatted_FileName, '--num-topics',
                str(numTopics), '--optimize-interval',
                str(numTopics), '--output-state', Compressed_FileName,
                '--output-topic-keys', Keys_FileName, '--output-doc-topics',
                Composition_FileName
            ],
                            shell=True)
        else:
            subprocess.call([
                MalletDir + os.sep + 'mallet', 'train-topics', '--input',
                TXTFiles_MalletFormatted_FileName, '--num-topics',
                str(numTopics), '--output-state', Compressed_FileName,
                '--output-topic-keys', Keys_FileName, '--output-doc-topics',
                Composition_FileName
            ],
                            shell=True)
    elif platform == "linux" or platform == "linux2" or platform == "darwin":
        if OptimizeInterval == True:
            subprocess.call([
                MalletDir + os.sep + 'mallet', 'train-topics', '--input',
                TXTFiles_MalletFormatted_FileName, '--num-topics',
                str(numTopics), '--optimize-interval',
                str(numTopics), '--output-state', Compressed_FileName,
                '--output-topic-keys', Keys_FileName, '--output-doc-topics',
                Composition_FileName
            ])
        else:
            subprocess.call([
                MalletDir + os.sep + 'mallet', 'train-topics', '--input',
                TXTFiles_MalletFormatted_FileName, '--num-topics',
                str(numTopics), '--output-state', Compressed_FileName,
                '--output-topic-keys', Keys_FileName, '--output-doc-topics',
                Composition_FileName
            ])

    IO_user_interface_util.timed_alert(
        GUI_util.window, 2000, 'Analysis end',
        'Finished running Mallet Topic modeling at ', True)

    # https://stackoverflow.com/questions/29759305/how-do-i-convert-a-tsv-to-csv

    # convert to csv Mallet tsv output files
    # read Mallet tab-delimited files; both Keys_FileName and Composition_FileName must be converted

    if (not os.path.isfile(Keys_FileName)) and (
            not os.path.isfile(Composition_FileName)):
        mb.showwarning(
            title='Mallet FATAL error',
            message=
            'Mallet has not produced the expected Keys and Composition files. It looks like Mallet did NOT run.\n\nPlease, make sure that you have edited properly the environment variables by reading the TIPS file for Mallet installation and setting Mallet environment variables.'
        )
        filesToOpen = []
        return
    Keys_FileName = file_type_converter_util.tsv_converter(
        GUI_util.window, Keys_FileName, outputDir)
    Composition_FileName = file_type_converter_util.tsv_converter(
        GUI_util.window, Composition_FileName, outputDir)
    filesToOpen.append(Keys_FileName)
    filesToOpen.append(Composition_FileName)

    if createExcelCharts:
        columns_to_be_plotted = [[0, 1]]
        hover_label = [2]
        chartTitle = 'Mallet Topics'
        xAxis = 'Topic #'
        yAxis = 'Topic weight'
        fileName = Keys_FileName
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted,
            fileName,
            outputDir,
            'Mallet_TM',
            chart_type_list=["bar"],
            chart_title=chartTitle,
            column_xAxis_label_var=xAxis,
            hover_info_column_list=hover_label,
            count_var=0,
            column_yAxis_label_var=yAxis)

        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
def main(window, inputDir, inputTargetDir, outputDir, openOutputFiles,
         createExcelCharts, relativity_threshold):

    filesToOpen = []
    # check that the CoreNLPdir as been setup
    CoreNLPdir = IO_libraries_util.get_external_software_dir(
        'file_classifier_NER_util', 'Stanford CoreNLP')
    if CoreNLPdir == None:
        return filesToOpen

    IO_user_interface_util.timed_alert(
        GUI_util.window, 2000, 'Analysis start',
        'Started running the File Classifier by NER values at', True,
        'You can follow the Classifier in command line.\n\nThe script will first build a dictionary of NER values for the documents in each subfolder, then process each unclassified document.  Please, be patient.'
    )

    if inputDir[-1] != '/':
        inputDir = inputDir + '/'
    outputFilename = IO_files_util.generate_output_file_name(
        '', inputTargetDir, outputDir, '.csv', 'SSR', 'NER_class', '', '', '',
        False, True)
    filesToOpen.append(outputFilename)
    f = open(outputFilename, 'w', encoding='utf-8', errors='ignore')
    terminal_output = sys.stdout
    sys.stdout = f
    print(
        "Source document,Target directory,Highest index,Relativity index (>" +
        str(relativity_threshold) + "),Outcome")
    actors = load_soc_actors()
    dirs = glob(inputTargetDir + '/*/')
    if dirs == []:
        mb.showwarning(
            "Warning",
            "No target subdirectories.\n\nNo target subdirectories found in\n\n"
            + inputTargetDir +
            "\n\nPlease, check your target directory in the INPUT secondary directory in the IO widgets."
        )
        filesToOpen = []
        sys.stdout = terminal_output
        return filesToOpen

    nlp = StanfordCoreNLP(CoreNLPDir)
    compare = {}
    num_folder = 0
    sys.stdout = terminal_output
    for dir in dirs:
        print("Processing folder " + str(num_folder + 1) + "/" +
              str(len(dirs)) + "; Folder name: " + dir.split(os.path.sep)[-2])
        compare = get_NER_POSTAG(dir, actors, nlp, compare)
        num_folder += 1
    print("Finished all " + str(num_folder) +
          " folders. Start to process documents now.")
    sys.stdout = f
    #compare stores: key- folder id; value: a set of words
    num_doc, num_unclass, num_class, num_multiclass = find(
        inputDir, actors, nlp, compare, relativity_threshold, f,
        terminal_output)
    sys.stdout = terminal_output
    mb.showinfo(title="Final results",
                message=str(num_doc) + " SOURCE document processed\n" + \
                        str(num_class) + " SOURCE documents classified in TARGET subdirectories\n" + \
                        str(num_multiclass) + " SOURCE documents classified in MULTIPLE TARGET subdirectories\n" + \
                        str(num_unclass) + " SOURCE documents unclassified")

    print("Number of unclassified documents processed in input: " +
          str(num_doc))
    print("Number of classified documents in output: " + str(num_class))
    print(
        "Number of classified documents (with multiple targets) in output: " +
        str(num_multiclass))
    print("Number of unclassified documents in output: " + str(num_unclass))

    nlp.close()
    f.close()

    if createExcelCharts == True:
        columns_to_be_plotted = [[3, 3]]
        hover_label = ''
        inputFilename = outputFilename
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted,
            inputFilename,
            outputDir,
            outputFileLabel='SSR_NER_home',
            chart_type_list=["pie"],
            chart_title='Frequency Distribution of Find a Home Outcome',
            column_xAxis_label_var='',
            hover_info_column_list=hover_label,
            count_var=1)
    if Excel_outputFilename != "":
        filesToOpen.append(Excel_outputFilename)

    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
        filesToOpen = [
        ]  # to avoid opening twice here and in calling fuunction

    return filesToOpen
Esempio n. 11
0
def run(inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts,
        CoNLL_table_analysis_var, complexity_readability_analysis_var,
        vocabulary_analysis_var, ngrams_analysis_var,
        CoNLL_table_analysis_menu_var,
        complexity_readability_analysis_menu_var, vocabulary_analysis_menu_var,
        ngrams_analysis_menu_var, gender_guesser_var):

    filesToOpen = []  # Store all files that are to be opened once finished

    if (CoNLL_table_analysis_var == False
            and complexity_readability_analysis_var == False
            and vocabulary_analysis_var == False
            and ngrams_analysis_var == False and gender_guesser_var == False):
        mb.showwarning(
            'Warning',
            'No options have been selected.\n\nPlease, select an option and try again.'
        )
        return

    if CoNLL_table_analysis_var == True:
        withHeader = True
        recordID_position = 8
        documentId_position = 10
        data, header = IO_csv_util.get_csv_data(inputFilename, withHeader)
        if len(data) == 0:
            return
        data_divided_sents = IO_CoNLL_util.sentence_division(data)
        if data_divided_sents == None:
            return
        if len(data_divided_sents) == 0:
            return

        if 'Clauses' in CoNLL_table_analysis_menu_var:
            tempfilesToOpen = CoNLL_clause_analysis_util.clause_stats(
                inputFilename, outputDir, data, data_divided_sents,
                openOutputFiles, createExcelCharts)
            # only open the chart files
            filesToOpen.append(tempfilesToOpen[1])
            filesToOpen.append(tempfilesToOpen[3])

        elif 'Nouns' in CoNLL_table_analysis_menu_var:
            tempfilesToOpen = CoNLL_noun_analysis_util.noun_stats(
                inputFilename, outputDir, data, data_divided_sents,
                openOutputFiles, createExcelCharts)
            # only open the chart files
            filesToOpen.append(tempfilesToOpen[0])
            filesToOpen.append(tempfilesToOpen[1])
            filesToOpen.append(tempfilesToOpen[2])
            filesToOpen.append(tempfilesToOpen[4])
            filesToOpen.append(tempfilesToOpen[6])
            filesToOpen.append(tempfilesToOpen[8])

        elif 'Verbs' in CoNLL_table_analysis_menu_var:
            tempfilesToOpen = CoNLL_verb_analysis_util.verb_voice_stats(
                inputFilename, outputDir, data, data_divided_sents,
                openOutputFiles, createExcelCharts)
            # only open the chart files
            filesToOpen.append(tempfilesToOpen[1])
            filesToOpen.append(tempfilesToOpen[3])
            tempfilesToOpen = CoNLL_verb_analysis_util.verb_modality_stats(
                inputFilename, outputDir, data, data_divided_sents,
                openOutputFiles, createExcelCharts)
            filesToOpen.append(tempfilesToOpen[1])
            filesToOpen.append(tempfilesToOpen[3])
            tempfilesToOpen = CoNLL_verb_analysis_util.verb_tense_stats(
                inputFilename, outputDir, data, data_divided_sents,
                openOutputFiles, createExcelCharts)
            filesToOpen.append(tempfilesToOpen[1])
            filesToOpen.append(tempfilesToOpen[3])

        elif 'Function' in CoNLL_table_analysis_menu_var:
            # only open the chart files
            import CoNLL_function_words_analysis_util
            tempfilesToOpen = CoNLL_function_words_analysis_util.article_stats(
                inputFilename, outputDir, data, data_divided_sents,
                openOutputFiles, createExcelCharts)
            filesToOpen.append(tempfilesToOpen[1])
            filesToOpen.append(tempfilesToOpen[3])
            tempfilesToOpen = CoNLL_function_words_analysis_util.auxiliary_stats(
                inputFilename, outputDir, data, data_divided_sents,
                openOutputFiles, createExcelCharts)
            filesToOpen.append(tempfilesToOpen[1])
            filesToOpen.append(tempfilesToOpen[3])
            tempfilesToOpen = CoNLL_function_words_analysis_util.conjunction_stats(
                inputFilename, outputDir, data, data_divided_sents,
                openOutputFiles, createExcelCharts)
            filesToOpen.append(tempfilesToOpen[1])
            filesToOpen.append(tempfilesToOpen[3])
            tempfilesToOpen = CoNLL_function_words_analysis_util.preposition_stats(
                inputFilename, outputDir, data, data_divided_sents,
                openOutputFiles, createExcelCharts)
            filesToOpen.append(tempfilesToOpen[1])
            filesToOpen.append(tempfilesToOpen[3])
            tempfilesToOpen = CoNLL_function_words_analysis_util.pronoun_stats(
                inputFilename, outputDir, data, data_divided_sents,
                openOutputFiles, createExcelCharts)
            filesToOpen.append(tempfilesToOpen[1])
            filesToOpen.append(tempfilesToOpen[3])

        elif 'POSTAG' in CoNLL_table_analysis_menu_var:
            mb.showwarning(
                'Warning',
                'The selected option is not available yet.\n\nSorry!')
            return
        elif 'DEPREL' in CoNLL_table_analysis_menu_var:
            mb.showwarning(
                'Warning',
                'The selected option is not available yet.\n\nSorry!')
            return
        elif 'NER' in CoNLL_table_analysis_menu_var:
            mb.showwarning(
                'Warning',
                'The selected option is not available yet.\n\nSorry!')
            return
        else:
            mb.showwarning(
                'Warning',
                'No option has been selected for CoNLL table analysis.\n\nPlease, select an option and try again.'
            )
            return

    if complexity_readability_analysis_var == True:
        if 'Sentence' in complexity_readability_analysis_menu_var:
            if IO_libraries_util.inputProgramFileCheck(
                    'statistics_txt_util.py') == False:
                return
            filesToOpen = sentence_analysis_util.sentence_complexity(
                GUI_util.window, inputFilename, inputDir, outputDir,
                openOutputFiles, createExcelCharts)
            if filesToOpen == None:
                return

        elif 'Text' in complexity_readability_analysis_menu_var:
            if IO_libraries_util.inputProgramFileCheck(
                    'statistics_txt_util.py') == False:
                return
            sentence_analysis_util.sentence_text_readability(
                GUI_util.window, inputFilename, inputDir, outputDir,
                openOutputFiles, createExcelCharts)
        elif 'tree' in complexity_readability_analysis_menu_var:
            if IO_libraries_util.inputProgramFileCheck(
                    'DependenSee.Jar') == False:
                return
            errorFound, error_code, system_output = IO_libraries_util.check_java_installation(
                'Sentence structure visualization')
            if errorFound:
                return
            if inputFilename == '' and inputFilename.strip()[-4:] != '.txt':
                mb.showwarning(
                    title='Input file error',
                    message=
                    'The Sentence tree viewer script requires a single txt file in input.\n\nPlease, select a txt file and try again.'
                )
                return
            IO_user_interface_util.timed_alert(
                GUI_util.window, 2000, 'Analysis start',
                'Started running Sentence visualization: Dependency tree viewer (png graphs) at',
                True,
                '\n\nYou can follow Sentence Complexity in command line.')
            subprocess.call(
                ['java', '-jar', 'DependenSee.Jar', inputFilename, outputDir])
            mb.showwarning(
                title='Analysis end',
                message=
                'Finished running the Dependency tree viewer (png graphs).\n\nMake sure to open the png files in output, one graph for each sentence.'
            )

        else:
            mb.showwarning(
                'Warning',
                'No option has been selected for Complex/readability analysis.\n\nPlease, select an option and try again.'
            )
            return

    if vocabulary_analysis_var == True:
        if vocabulary_analysis_menu_var == '':
            mb.showwarning(
                'Warning',
                'No option has been selected for Vocabulary analysis.\n\nPlease, select an option and try again.'
            )
            return
        if 'Repetition' in vocabulary_analysis_menu_var:
            mb.showwarning(
                'Warning',
                'The selected option is not available yet.\n\nSorry!')
            return
        if '*' == vocabulary_analysis_menu_var:
            filesToOpen = file_spell_checker_util.language_detection(
                window, inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts)
        if '*' == vocabulary_analysis_menu_var:
            filesToOpen = statistics_txt_util.process_words(
                window, inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts)
            # if len(tempOutputfile)>0:
            #     filesToOpen.extend(tempOutputfile)
        elif 'detection' in vocabulary_analysis_menu_var:
            filesToOpen = file_spell_checker_util.language_detection(
                window, inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts)
        elif 'capital' in vocabulary_analysis_menu_var:
            filesToOpen = statistics_txt_util.process_words(
                window, inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, 'capital')
        elif 'Short' in vocabulary_analysis_menu_var:
            filesToOpen = statistics_txt_util.process_words(
                window, inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, 'short')
        elif 'Vowel' in vocabulary_analysis_menu_var:
            filesToOpen = statistics_txt_util.process_words(
                window, inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, 'vowel')
        elif 'Punctuation' in vocabulary_analysis_menu_var:
            filesToOpen = statistics_txt_util.process_words(
                window, inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, 'punctuation')
        if '*' == vocabulary_analysis_menu_var or 'Yule' in vocabulary_analysis_menu_var:
            statistics_txt_util.yule(window, inputFilename, inputDir,
                                     outputDir)
        if '*' == vocabulary_analysis_menu_var or '*' == vocabulary_analysis_menu_var or 'Unusual' in vocabulary_analysis_menu_var:
            tempFiles = file_spell_checker_util.nltk_unusual_words(
                window, inputFilename, inputDir, outputDir, False,
                createExcelCharts)
            if len(tempFiles) > 0:
                filesToOpen.extend(tempFiles)
        if '*' == vocabulary_analysis_menu_var or 'Abstract' in vocabulary_analysis_menu_var:
            # ABSTRACT/CONCRETENESS _______________________________________________________
            mode = "both"  # mean, median, both (calculates both mean and median)
            if lib_util.checklibFile(
                    GUI_IO_util.concreteness_libPath + os.sep +
                    'Concreteness_ratings_Brysbaert_et_al_BRM.csv',
                    'concreteness_analysis_util.py') == False:
                return
            if IO_libraries_util.inputProgramFileCheck(
                    'concreteness_analysis_util.py') == False:
                return
            IO_user_interface_util.timed_alert(
                GUI_util.window, 3000, 'Analysis start',
                'Started running CONCRETENESS Analysis at', True)
            if len(inputFilename) > 0:
                outputFilename = IO_files_util.generate_output_file_name(
                    inputFilename, inputDir, outputDir, '.csv', 'SC',
                    'Concreteness', '', '', '', False, True)
            else:
                outputFilename = IO_files_util.generate_output_file_name(
                    inputDir, inputDir, outputDir, '.csv', 'SC_dir',
                    'Concreteness', '', '', '', False, True)

            concreteness_analysis_util.main(inputFilename, inputDir, outputDir,
                                            outputFilename, mode)

            filesToOpen.append(outputFilename)
            if createExcelCharts == True:
                inputFilename = outputFilename
                if mode == "both":
                    columns_to_be_plotted = [[2, 4], [2, 5]]
                    hover_label = ['Sentence', 'Sentence']
                else:
                    columns_to_be_plotted = [[2, 4]]
                    hover_label = ['Sentence']
                Excel_outputFilename = Excel_util.run_all(
                    columns_to_be_plotted,
                    inputFilename,
                    outputDir,
                    outputFileLabel='Concret',
                    chart_type_list=["line"],
                    chart_title='Concreteness Scores by Sentence Index',
                    column_xAxis_label_var='Sentence index',
                    hover_info_column_list=hover_label,
                    count_var=0,
                    column_yAxis_label_var='Scores')
                if Excel_outputFilename != "":
                    filesToOpen.append(Excel_outputFilename)

                # outputFilename = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir,
                #                                     outputFilename, chart_type_list=["line"],
                #                                     chart_title="Concreteness Scores by Sentence Index",
                #                                     column_xAxis_label_var='Sentence index',
                #                                     column_yAxis_label_var='Frequency of concreteness scores',
                #                                     outputExtension='.xlsm', label1='SC', label2='Concreteness',
                #                                     label3='line', label4='chart', label5='', useTime=False,
                #                                     disable_suffix=True,
                #                                     count_var=0, column_yAxis_field_list=[],
                #                                     reverse_column_position_for_series_label=False,
                #                                     series_label_list=[''], second_y_var=0,
                #                                     second_yAxis_label='', hover_var=1,
                #                                     hover_info_column_list=hover_label)
                # if outputFilename != "":
                #     filesToOpen.append(outputFilename)

            IO_user_interface_util.timed_alert(
                GUI_util.window, 3000, 'Analysis end',
                'Finished running CONCRETENESS Analysis at', True)

    if ngrams_analysis_var == True:
        if 'Character' in ngrams_analysis_menu_var or 'Word' in ngrams_analysis_menu_var:
            if 'Character' in ngrams_analysis_menu_var:
                ngramType = 0
            else:
                ngramType = 1
            IO_user_interface_util.timed_alert(
                GUI_util.window, 3000, 'N-Grams analysis start',
                'Started running Word/Characters N-Grams at', True,
                'You can follow the script in command line.')
            # (inputFilename = ''  # for now we only process a whole directory
            if IO_libraries_util.inputProgramFileCheck(
                    'statistics_txt_util.py') == False:
                return
            ngramsNumber = 4
            normalize = False
            excludePunctuation = False

            statistics_txt_util.compute_character_word_ngrams(
                GUI_util.window, inputFilename, inputDir, outputDir,
                ngramsNumber, normalize, excludePunctuation, ngramType,
                openOutputFiles, createExcelCharts, bySentenceIndex_var)
            IO_user_interface_util.timed_alert(
                GUI_util.window, 3000, 'N-Grams analysis end',
                'Finished running Word/Characters N-Grams at', True)
        elif 'Hapax' in ngrams_analysis_menu_var:
            ngramsNumber = 1
            ngramType = 1
            normalize = False
            excludePunctuation = False

            statistics_txt_util.compute_character_word_ngrams(
                GUI_util.window, inputFilename, inputDir, outputDir,
                ngramsNumber, normalize, excludePunctuation, ngramType,
                openOutputFiles, createExcelCharts, bySentenceIndex_var)
        elif 'POSTAG' in ngrams_analysis_menu_var:
            mb.showwarning(
                'Warning',
                'The selected option is not available yet.\n\nSorry!')
            return
        elif 'DEPREL' in ngrams_analysis_menu_var:
            mb.showwarning(
                'Warning',
                'The selected option is not available yet.\n\nSorry!')
            return
        elif 'NER' in ngrams_analysis_menu_var:
            mb.showwarning(
                'Warning',
                'The selected option is not available yet.\n\nSorry!')
            return
        else:
            mb.showwarning(
                'Warning',
                'No option has been selected for N-grams analysis.\n\nPlease, select an option and try again.'
            )
            return

    if gender_guesser_var == True:
        IO_files_util.runScript_fromMenu_option('Gender guesser', 0,
                                                inputFilename, inputDir,
                                                outputDir, openOutputFiles,
                                                createExcelCharts)
        return

    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
Esempio n. 12
0
def clause_stats(inputFilename, inputDir, outputDir, data, data_divided_sents,
                 openOutputFiles, createExcelCharts):

    filesToOpen = []  # Store all files that are to be opened once finished

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start',
                                       'Started running CLAUSE ANALYSES at',
                                       True)

    #output file names
    #clausal_analysis_file_name contains all the CoNLL table records that have a clausal tag
    clausal_analysis_file_name = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', 'CA', 'Clause tags',
        'list')
    filesToOpen.append(clausal_analysis_file_name)
    #clausal_analysis_stats_file_name will contain a data sheet with the frequency distribution of all available clausal tags and a chart sheet with the pie chart visualization of the data

    if 0:
        stats_clauses(data)
    else:
        if not os.path.isdir(outputDir):
            mb.showwarning(
                title='Output file path error',
                message='Please check OUTPUT DIRECTORY PATH and try again')
            return
        clausal_list = stats_clauses_output(data, data_divided_sents)

        IO_csv_util.list_to_csv(
            GUI_util.window,
            IO_CoNLL_util.sort_output_list('CLAUSE TAGS', clausal_list,
                                           documentId_position),
            clausal_analysis_file_name)
        column_stats = statistics_csv_util.compute_stats_CoreNLP_tag(
            clausal_list, 7, "Clause Tags, Frequency", "CLAUSALTAG")

        clausal_analysis_stats_file_name = IO_files_util.generate_output_file_name(
            inputFilename, inputDir, outputDir, '.csv', 'CA', 'Clause tags',
            'stats')
        errorFound = IO_csv_util.list_to_csv(GUI_util.window, column_stats,
                                             clausal_analysis_stats_file_name)
        if errorFound == True:
            return

        if createExcelCharts == True:
            Excel_outputFilename = Excel_util.create_excel_chart(
                GUI_util.window,
                data_to_be_plotted=[column_stats],
                inputFilename=clausal_analysis_stats_file_name,
                outputDir=outputDir,
                scriptType='CoNLL_Clause',
                chartTitle="Frequency Distribution of Clause Type",
                chart_type_list=["pie"],
                column_xAxis_label="Clause Tags",
                column_yAxis_label="Frequency")
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)

            # line plot by sentence index
            Excel_outputFilename = Excel_util.compute_csv_column_frequencies(
                GUI_util.window, clausal_analysis_file_name, '', outputDir,
                openOutputFiles, createExcelCharts, [[8, 8]], ['CLAUSE TAGS'],
                ['FORM', 'Sentence'], ['Document ID', 'Sentence ID'], 'CA',
                'line')
            if len(Excel_outputFilename) > 0:
                filesToOpen.extend(Excel_outputFilename)

            # output_df= Excel_util.add_missing_IDs(clausal_analysis_file_name)
            # # overwrite original file having added any missing document ID and sentence ID
            # output_df.to_csv(clausal_analysis_file_name,index=False)
            #
            columns_to_be_plotted = [[1, 8]]
            hover_label = ['CLAUSAL TAG-DESCRIPTION']
            inputFilename = clausal_analysis_file_name
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted,
                inputFilename,
                outputDir,
                outputFileLabel='CoNLL_Clause',
                chart_type_list=["line"],
                chart_title='Frequency of Clause Tags',
                column_xAxis_label_var='Sentence index',
                hover_info_column_list=hover_label,
                count_var=1)
            if Excel_outputFilename != '':
                filesToOpen.append(Excel_outputFilename)

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                       'Finished running CLAUSE ANALYSES at',
                                       True)
    return filesToOpen
def language_detection(window, inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts):

    IO_user_interface_util.timed_alert(GUI_util.window, 1000, 'Analysis start',
                                       'Started running Language Detection at', True)

    folderID = 0
    fileID = 0
    filesToOpen=[]

    outputFilenameCSV=IO_files_util.generate_output_file_name(inputFilename, inputDir, outputDir, '.csv', 'lang_detect')
    filesToOpen.append(outputFilenameCSV)

    files=IO_files_util.getFileList(inputFilename, inputDir, '.txt')
    if len(files) == 0:
        return

    if IO_csv_util.openCSVOutputFile(outputFilenameCSV):
        return

    fieldnames = ['LANGDETECT',
                  'Language',
                  'Probability',
                  'SPACY',
                  'Language',
                  'Probability',
                  'LANGID',
                  'Language',
                  'Probability',
                  'Document ID',
                  'Document']

    config_filename = 'file-spell-checker-config.txt'
    reminders_util.checkReminder(config_filename,
                                 ['Language detection'],
                                 'Language detection algorithms are very slow. The NLP Suite runs three different types of algorithms: LANGDETECT, SPACY, and LANGID.\n\nPlease, arm yourself with patience, depennding upon the number and size of documents processed.',
                                 True)

    IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis start',
                                       'Started running language detection algorithms at', True,
                                       'You can follow the algorithms in command line.')

    with open(outputFilenameCSV, 'w', encoding='utf-8', errors='ignore', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        docErrors_empty=0
        docErrors_unknown=0
        filenameSV=''
        for filename in files:
            fileID = fileID + 1
            head, tail = os.path.split(filename)
            print("Processing file " + str(fileID) + "/" + str(len(files)) + ' ' + tail)
            text = open(filename, 'r', encoding='utf-8', errors='ignore').read()
            if len(text)==0:
                print("  The file is empty. It will be discarded from processing.")
                docErrors_empty=docErrors_empty+1
                continue
            # text = opened_file.read()
            # head, tail = os.path.split(filename)
            # head is path, tail is filename
            try:
                value = detect_langs(text)
            except:
                filenameSV=filename # do not count the same document twice in this and the other algorithms that follow
                docErrors_unknown=docErrors_unknown+1
                print("  Unknown file read error.")
                continue
            value=str(value[0]).split(':')
            language=value[0]
            probability=value[1]
            # https://pypi.org/project/langdetect/
            # langdetect supports 55 languages out of the box (ISO 639-1 codes)
            # af, ar, bg, bn, ca, cs, cy, da, de, el, en, es, et, fa, fi, fr, gu, he,
            # hi, hr, hu, id, it, ja, kn, ko, lt, lv, mk, ml, mr, ne, nl, no, pa, pl,
            # pt, ro, ru, sk, sl, so, sq, sv, sw, ta, te, th, tl, tr, uk, ur, vi, zh-cn, zh-tw
            # ISO codes https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
            print('   LANGDETECT', language, probability)
            # print('   LANGDETECT',value[0],value[1])  # [cs:0.7142840957132709, pl:0.14285810606233737, sk:0.14285779665739756]
            currentLine = ['LANGDETECT', language, probability]

            nlp = spacy.load('en_core_web_sm')
            nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
            try:
                doc = nlp(text)
            except:
                if filename!=filenameSV: # do not count the same document twice in this and the other algorithm that follows
                    docErrors_unknown = docErrors_unknown + 1
                    filenameSV=filename
                print("  Unknown file read error.")
                continue
            value = doc._.language
            language=value['language']
            probability=value['score']
            #
            print('   SPACY', language, probability)  # {'language': 'en', 'score': 0.9999978351575265}
            currentLine.extend(['SPACY', language, probability])

            lang_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
            try:
                value=lang_identifier.classify(text)
            except:
                if filename!=filenameSV:
                    docErrors_unknown = docErrors_unknown + 1
                    filenameSV=filename
                print("  Unknown file read error.")
                continue
            language=value[0]
            probability=value[1]
            # LANGID ``langid.py`` comes pre-trained on 97 languages (ISO 639-1 codes given)
            # https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes for ISO codes
            # https://pypi.org/project/langid/1.1.5/
            # af, am, an, ar, as, az, be, bg, bn, br,
            # bs, ca, cs, cy, da, de, dz, el, en, eo,
            # es, et, eu, fa, fi, fo, fr, ga, gl, gu,
            # he, hi, hr, ht, hu, hy, id, is, it, ja,
            # jv, ka, kk, km, kn, ko, ku, ky, la, lb,
            # lo, lt, lv, mg, mk, ml, mn, mr, ms, mt,
            # nb, ne, nl, nn, no, oc, or, pa, pl, ps,
            # pt, qu, ro, ru, rw, se, si, sk, sl, sq,
            # sr, sv, sw, ta, te, th, tl, tr, ug, uk,
            # ur, vi, vo, wa, xh, zh, zu
            print('   LANGID', language, probability)  # ('en', 0.999999999999998)
            print()
            currentLine.extend(['LANGID',  language, probability])
            currentLine.extend([fileID, IO_csv_util.dressFilenameForCSVHyperlink(filename)])

            writer = csv.writer(csvfile)
            writer.writerows([currentLine])
            filenameSV=filename
    csvfile.close()
    msg=''
    if docErrors_empty==0 and docErrors_unknown==0:
        msg=str(fileID) + ' documents successfully processed for language detection.'
    else:
        if docErrors_empty>0:
            msg=str(fileID) + ' documents processed for language detection.\n  ' + str(docErrors_empty) + ' document(s) found empty.'
        if docErrors_unknown>0:
            if msg!='':
                msg=msg + '\n  ' + str(docErrors_unknown) + ' document(s) read with unknown errors.'
            else:
                msg = str(fileID) + ' documents processed for language detection.\n  ' + \
                      str(docErrors_unknown) + ' document(s) read with unknown errors.'
        mb.showwarning(title='File read errors',
                message=msg+ '\n\nFaulty files are listed in command line/terminal. Please, search for \'File read error\' and inspect each file carefully.')
    filesToOpen.append(outputFilenameCSV)
    IO_user_interface_util.timed_alert(GUI_util.window, 1000, 'Analysis end',
                                       'Finished running Language Detection at', True,'Languages detected are exported via the ISO 639 two-letter code. ISO 639 is a standardized nomenclature used to classify languages. Check the ISO list at https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes.')
    print('Languages detected are exported via the ISO 639 two-letter code. ISO 639 is a standardized nomenclature used to classify languages. Check the ISO list at https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes.')
    if createExcelCharts:
        columns_to_be_plotted = [[1, 1],[4,4],[7,7]]
        chart_title='Frequency of Languages Detected by 3 Algorithms'
        hover_label=['LANGDETECT', 'SPACY', 'LANGID']
        inputFilename = outputFilenameCSV
        Excel_outputFilename = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir,
                                                  outputFileLabel='_bar_chart',
                                                  chart_type_list=["bar"],
                                                  chart_title=chart_title,
                                                  column_xAxis_label_var='Language',
                                                  hover_info_column_list=hover_label,
                                                  count_var=1)
        if Excel_outputFilename!='':
            filesToOpen.append(Excel_outputFilename)

    if openOutputFiles:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
Esempio n. 14
0
def main(CoreNLPDir,
         input_main_dir_path,
         input_secondary_dir_path,
         outputDir,
         openOutputFiles,
         createExcelCharts,
         checkNER=False):
    articles_path = input_main_dir_path
    compilations_path = input_secondary_dir_path  # summaries folder
    filesToOpen = []

    IO_user_interface_util.timed_alert(
        GUI_util.window, 3000, 'Analysis start',
        'Started running MISSING CHARACTER at', True,
        'You can follow MISSING CHARACTER in command line.')

    if len(articles_path) == 0:
        mb.showerror(
            title='Articles directory not found',
            message=
            'The summary checker script requires an input files directory.\n\nNo directory entered. Please, select the input files directory and try again.'
        )
        sys.exit()

    if len(compilations_path) == 0:
        tk.messagebox.showerror(
            title='Summary directory not found',
            message=
            'The summary checker script requires a secondary input directory for the summary files.\n\nNo secondary directory entered. Please, select the secondary input directory and try again.'
        )
        sys.exit()

    if len(outputDir) == 0:
        mb.showerror(
            title='Output directory not found',
            message=
            'The summary checker script requires an output directory.\n\nNo output directory entered. Please, select the output directory and try again.'
        )
        sys.exit()

    if compilations_path[-1] == os.sep:
        compilations_path = compilations_path[:-1]
    if outputDir[-1] == os.sep:
        outputDir = outputDir[:-1]

    #############################
    ##This is just for evaluation purposes
    freq_act_miss = 0
    count_act_miss = 0
    act_miss_list = []
    id_act_miss = []
    freq_loc_miss = 0
    count_loc_miss = 0
    loc_miss_list = []
    id_loc_miss = []
    freq_org_miss = 0
    count_org_miss = 0
    org_miss_list = []
    id_org_miss = []
    freq_per_miss = 0
    count_per_miss = 0
    per_miss_list = []
    id_per_miss = []
    freq_date_miss = 0
    count_date_miss = 0
    date_miss_list = []
    id_date_miss = []
    ##End of evaluation
    #############################
    #write the output csv.

    if checkNER == 1:
        outputFilename = IO_files_util.generate_output_file_name(
            '', compilations_path, outputDir, '.csv', 'SSR', 'MA', 'NER', '',
            '', False, True)
    else:
        outputFilename = IO_files_util.generate_output_file_name(
            '', compilations_path, outputDir, '.csv', 'SSR', 'MA', '', '', '',
            False, True)
    fName = GUI_IO_util.libPath + os.sep + 'wordLists' + os.sep + 'social-actor-list.csv'
    if not os.path.isfile(fName):
        print(
            "The file " + fName +
            " could not be found. The routine expects a csv dictionary file 'social-actor-list.csv' in a directory 'lib\wordLists' expected to be a subdirectory of the main NLP directory.\n\nPlease, check your lib\wordLists directory and try again."
        )
        mb.showerror(
            title='File not found',
            message='The file ' + fName +
            " could not be found.\n\nThe routine expects a csv dictionary file 'social-actor-list.csv' in a directory 'lib\wordLists' expected to be a subdirectory of the main NLP directory.\n\nPlease, check your lib\wordLists directory and try again."
        )
        sys.exit()
    actors = load_soc_actors(fName)
    f = open(outputFilename, 'w', encoding='utf-8', errors='ignore')
    sys.stdout = f
    dirs = glob(articles_path + os.sep + '*' + os.sep)
    nlp = StanfordCoreNLP(CoreNLPDir)
    num_id = 0
    num_dir = 0
    for compilation in glob(compilations_path + os.sep + '*'):
        num_id += 1
    for dir in dirs:
        sys.stdout = terminal_out
        print("Processing folder " + str(num_dir + 1) + "/" + str(len(dirs)) +
              "; Folder name: " + dir.split(os.path.sep)[-2])
        num_dir += 1
        sys.stdout = f
        try:
            count_act_miss, act_miss_list, id_act_miss, count_loc_miss, loc_miss_list, id_loc_miss, count_org_miss, org_miss_list, id_org_miss, count_per_miss, per_miss_list, id_per_miss, count_date_miss, date_miss_list, id_date_miss, if_act, if_loc, if_org, if_per, if_date = check(
                dir, actors, nlp, compilations_path, checkNER, count_act_miss,
                act_miss_list, id_act_miss, count_loc_miss, loc_miss_list,
                id_loc_miss, count_org_miss, org_miss_list, id_org_miss,
                count_per_miss, per_miss_list, id_per_miss, count_date_miss,
                date_miss_list, id_date_miss)
        except:
            print('         Unspecified error in processing the file')
            continue
        #############################
        #for evaluation
        if if_act == True:
            freq_act_miss += 1
        if if_per == True:
            freq_per_miss += 1
        if if_loc == True:
            freq_loc_miss += 1
        if if_org == True:
            freq_org_miss += 1
        if if_date == True:
            freq_date_miss += 1
        #############################
    nlp.close()
    ##
    ##This is to print out evaluation table
    if checkNER == 1:
        outputFilename = IO_files_util.generate_output_file_name(
            '', compilations_path, outputDir, '.csv', 'SSR', 'MA', 'NER',
            'freq', '', False, True)
    else:
        outputFilename = IO_files_util.generate_output_file_name(
            '', compilations_path, outputDir, '.csv', 'SSR', 'MA', 'freq', '',
            '', False, True)
    f_e = open(outputFilename, 'w', encoding='utf-8', errors='ignore')
    sys.stdout = f_e
    if (len(act_miss_list) <= 320 and len(loc_miss_list) <= 320
            and len(per_miss_list) <= 320 and len(org_miss_list) <= 320
            and len(date_miss_list) <= 320):
        print(
            "Type of Error,Frequency of Summaries in Error,Percentage of Summaries in Error,Frequency of Error,List of Summary Filenames for Type of Error,List of Documents for Type of Error"
        )
        print("Social Actor,", str(freq_act_miss), ",",
              freq_act_miss / num_id * 100, ",", count_act_miss, ",",
              "; ".join(a for a in id_act_miss), ",",
              "; ".join(c for c in act_miss_list))
        print("Organization,", str(freq_org_miss), ",",
              freq_org_miss / num_id * 100, ",", count_org_miss, ",",
              "; ".join(b for b in id_org_miss), ",",
              "; ".join(d for d in org_miss_list))
        print("Person,", str(freq_per_miss), ",", freq_per_miss / num_id * 100,
              ",", count_per_miss, ",", "; ".join(b for b in id_per_miss), ",",
              "; ".join(d for d in per_miss_list))
        print("Date,", str(freq_date_miss), ",", freq_date_miss / num_id * 100,
              ",", count_date_miss, ",", "; ".join(b for b in id_date_miss),
              ",", "; ".join(d for d in date_miss_list))
        print("Location,", str(freq_loc_miss), ",",
              freq_loc_miss / num_id * 100, ",", count_loc_miss, ",",
              "; ".join(b for b in id_loc_miss), ",",
              "; ".join(d for d in loc_miss_list))
    elif (len(act_miss_list) <= 640 and len(loc_miss_list) <= 640
          and len(per_miss_list) <= 640 and len(org_miss_list) <= 640
          and len(date_miss_list) <= 640):
        print(
            "Type of Error,Frequency of Summaries in Error,Percentage of Summaries in Error,Frequency of Error,List of Summary Filenames for Type of Error,List of Documents for Type of Error (Cut List),List of Documents for Type of Error (Continue List)"
        )
        print("Social Actor,", str(freq_act_miss), ",",
              freq_act_miss / num_id * 100, ",", count_act_miss, ",",
              "; ".join(a for a in id_act_miss), ",",
              "; ".join(c for c in act_miss_list[:320]), ",",
              "; ".join(c for c in act_miss_list[320:]))
        print("Organization,", str(freq_org_miss), ",",
              freq_org_miss / num_id * 100, ",", count_org_miss, ",",
              "; ".join(b for b in id_org_miss), ",",
              "; ".join(d for d in org_miss_list[:320]), ",",
              "; ".join(d for d in org_miss_list[320:]))
        print("Person,", str(freq_per_miss), ",", freq_per_miss / num_id * 100,
              ",", count_per_miss, ",", "; ".join(b for b in id_per_miss), ",",
              "; ".join(d for d in per_miss_list[:320]), ",",
              "; ".join(d for d in per_miss_list[320:]))
        print("Date,", str(freq_date_miss), ",", freq_date_miss / num_id * 100,
              ",", count_date_miss, ",", "; ".join(b for b in id_date_miss),
              ",", "; ".join(d for d in date_miss_list[:320]), ",",
              "; ".join(d for d in date_miss_list[320:]))
        print("Location,", str(freq_loc_miss), ",",
              freq_loc_miss / num_id * 100, ",", count_loc_miss, ",",
              "; ".join(b for b in id_loc_miss), ",",
              "; ".join(d for d in loc_miss_list[:320]), ",",
              "; ".join(d for d in loc_miss_list[320:]))
    else:
        print(
            "Type of Error,Frequency of Summaries in Error,Percentage of Summaries in Error,Frequency of Error,List of Summary Filenames for Type of Error,List of Documents for Type of Error (Cut List),List of Documents for Type of Error (Continue List),List of Documents for Type of Error (Continue List)"
        )
        print("Social Actor,", str(freq_act_miss), ",",
              freq_act_miss / num_id * 100, ",", count_act_miss, ",",
              "; ".join(a for a in id_act_miss), ",",
              "; ".join(c for c in act_miss_list[:320]), ",",
              "; ".join(c for c in act_miss_list[320:640]), ",",
              "; ".join(c for c in act_miss_list[640:]))
        print("Organization,", str(freq_org_miss), ",",
              freq_org_miss / num_id * 100, ",", count_org_miss, ",",
              "; ".join(b for b in id_org_miss), ",",
              "; ".join(d for d in org_miss_list[:320]), ",",
              "; ".join(d for d in org_miss_list[320:640]), ",",
              "; ".join(c for c in org_miss_list[640:]))
        print("Person,", str(freq_per_miss), ",", freq_per_miss / num_id * 100,
              ",", count_per_miss, ",", "; ".join(b for b in id_per_miss), ",",
              "; ".join(d for d in per_miss_list[:320]), ",",
              "; ".join(d for d in per_miss_list[320:640]), ",",
              "; ".join(c for c in per_miss_list[640:]))
        print("Date,", str(freq_date_miss), ",", freq_date_miss / num_id * 100,
              ",", count_date_miss, ",", "; ".join(b for b in id_date_miss),
              ",", "; ".join(d for d in date_miss_list[:320]), ",",
              "; ".join(d for d in date_miss_list[320:640]), ",",
              "; ".join(c for c in date_miss_list[640:]))
        print("Location,", str(freq_loc_miss), ",",
              freq_loc_miss / num_id * 100, ",", count_loc_miss, ",",
              "; ".join(b for b in id_loc_miss), ",",
              "; ".join(d for d in loc_miss_list[:320]), ",",
              "; ".join(d for d in loc_miss_list[320:640]), ",",
              "; ".join(c for c in loc_miss_list[640:]))
    f_e.close()
    # type "sys.stdout = terminal_out" before print
    sys.stdout = terminal_out
    if createExcelCharts:
        if checkNER == 1:
            fileType = 'SSR_summary_NER'
        else:
            fileType = 'SSR_summary'

        columns_to_be_plotted = [[0, 1], [0, 2], [0, 3]]
        hover_label = [
            'List of Summary Filenames for Type of Error',
            'List of Summary Filenames for Type of Error',
            'List of Summary Filenames for Type of Error'
        ]
        inputFilename = outputFilename
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted,
            inputFilename,
            outputDir,
            outputFileLabel=fileType,
            chart_type_list=["bar"],
            chart_title='Missing Character (File Summaries in Error)',
            column_xAxis_label_var='Type of Error',
            hover_info_column_list=hover_label)
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

    if openOutputFiles:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
        filesToOpen = []  # avoid opening twice in the calling function

    IO_user_interface_util.timed_alert(
        GUI_util.window, 3000, 'Analysis end',
        'Finished running MISSING CHARACTER at', True)

    return filesToOpen
Esempio n. 15
0
def run(inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts,
        doNotListIndividualFiles):
    global first_section, noun_cnt, nominalized_cnt

    first_section = re.compile("^(.+?)\.")
    noun_cnt = Counter()
    nominalized_cnt = Counter()
    filesToOpen = []  # Store all files that are to be opened once finished

    if __name__ == '__main__':
        nltk.data.path.append('./nltk_data')

        inputDocs = []
        if os.path.isdir(inputDir):
            for f in os.listdir(inputDir):
                if f[:2] != '~$' and f[-4:] == '.txt':
                    inputDocs.append(os.path.join(inputDir, f))
            if len(inputDocs) == 0:
                print(
                    "There are no txt files in the input path. The program will exit."
                )
                mb.showwarning(
                    title='No txt files found',
                    message=
                    'There are no txt files in the selected input directory.\n\nPlease, select a different input directory and try again.'
                )
                return
        else:
            inputDocs = [inputFilename]

        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'Analysis start',
            'Started running Nominalization at', True)

        #add all into a sum
        result_dir = []
        result_dir.append(["Word", "Is nominalized", "Document"])
        docID = 0
        result2 = []
        result_dir2 = []
        counter_nominalized_list = []
        counter_nominalized_list.append(['Nominalized verb', 'Frequency'])
        counter_noun_list = []
        counter_noun_list.append(['Noun', 'Frequency'])

        for doc in inputDocs:

            docID = docID + 1
            print("Processing document", doc, "\n")
            #open the doc and create the list of result (words, T/F)
            fin = open(doc, 'r', encoding='utf-8', errors='ignore')
            # result1 contains the sentence and nominalized values fora a specific document
            result, result1 = nominalized_verb_detection(
                docID, doc, fin.read())
            # result2 contains the sentence and nominalized values for all documents
            result2.extend(result1)
            fin.close()

            # list all verbs as TRUE/FALSE if nominalized
            for word, boolean in result:
                result_dir.append([
                    word, boolean,
                    IO_csv_util.dressFilenameForCSVHyperlink(doc)
                ])

            result_dir2.extend(result_dir)

            if len(inputDir) > 0:
                fname = os.path.basename(os.path.normpath(inputDir)) + "_dir"
            else:
                fname = doc
            # used for both individual files and directories
            output_filename_bySentenceIndex = IO_files_util.generate_output_file_name(
                fname, '', outputDir, '.csv', 'NOM', 'sent', '', '', '', False,
                True)

            if len(inputDir) == 0 or doNotListIndividualFiles == False:
                counter_nominalized_list = []
                counter_noun_list = []
                # refresh the headers
                counter_nominalized_list.insert(
                    0, ['Nominalized verb', 'Frequency'])
                counter_noun_list.insert(0, ['Noun', 'Frequency'])

                result1.insert(0, [
                    'Document ID', 'Document', 'Sentence ID', 'Sentence',
                    'Number of words in sentence', 'Nominalized verbs',
                    'Number of nominalizations in sentence',
                    'Percentage of nominalizations in sentence'
                ])

                # compute frequency of most common nominalized verbs
                for word, freq in nominalized_cnt.most_common():
                    counter_nominalized_list.append([word, freq])

                # compute frequency of most common nouns
                for word, freq in noun_cnt.most_common():
                    counter_noun_list.append([word, freq])

                head, fname = os.path.split(doc)
                fname = fname[:-4]

                output_filename_noun_frequencies = IO_files_util.generate_output_file_name(
                    fname, '', outputDir, '.csv', 'NOM', 'noun_freq', '', '',
                    '', False, True)
                filesToOpen.append(output_filename_noun_frequencies)
                output_filename_nominalized_frequencies = IO_files_util.generate_output_file_name(
                    fname, '', outputDir, '.csv', 'NOM', 'nominal_freq', '',
                    '', '', False, True)
                filesToOpen.append(output_filename_nominalized_frequencies)

                # export nominalized verbs
                list_to_csv(output_filename_nominalized_frequencies,
                            counter_nominalized_list)

                # export nouns
                list_to_csv(output_filename_noun_frequencies,
                            counter_noun_list)

                output_filename_TRUE_FALSE = IO_files_util.generate_output_file_name(
                    fname + '_TRUE_FALSE', '', outputDir, '.csv', 'NOM', '',
                    '', '', '', False, True)

                filesToOpen.append(output_filename_TRUE_FALSE)
                list_to_csv(output_filename_TRUE_FALSE, result)

                filesToOpen.append(output_filename_bySentenceIndex)
                list_to_csv(output_filename_bySentenceIndex, result1)

                if createExcelCharts == True:
                    # line chart
                    columns_to_be_plotted = [[2, 6]]
                    chartTitle = 'Nominalized verbs (by Sentence Index)'
                    xAxis = 'Sentence index'
                    yAxis = 'Number of nominalizations in sentence'
                    hover_label = ''
                    Excel_outputFilename = Excel_util.run_all(
                        columns_to_be_plotted,
                        output_filename_bySentenceIndex,
                        outputDir,
                        '',
                        chart_type_list=["line"],
                        chart_title=chartTitle,
                        column_xAxis_label_var=xAxis,
                        hover_info_column_list=hover_label,
                        column_yAxis_label_var=yAxis)
                    if len(Excel_outputFilename) > 0:
                        filesToOpen.append(Excel_outputFilename)

                    # pie chart of nominalized verbs
                    Excel_outputFilename = Excel_util.create_excel_chart(
                        GUI_util.window, [counter_nominalized_list], fname,
                        outputDir, 'NOM_Verb', "Nominalized verbs", ["pie"])
                    if len(Excel_outputFilename) > 0:
                        filesToOpen.append(Excel_outputFilename)

                    # pie chart of nouns
                    Excel_outputFilename = Excel_util.create_excel_chart(
                        GUI_util.window, [counter_noun_list], fname, outputDir,
                        'NOM_noun', "Nouns", ["pie"])
                    if len(Excel_outputFilename) > 0:
                        filesToOpen.append(Excel_outputFilename)

        if len(inputDir) > 0 and doNotListIndividualFiles == True:
            output_filename_TRUE_FALSE_dir = IO_files_util.generate_output_file_name(
                fname + '_TRUE_FALSE', '', outputDir, '.csv', 'NOM', '', '',
                '', '', False, True)
            filesToOpen.append(output_filename_TRUE_FALSE_dir)
            output_filename_dir_noun_frequencies = IO_files_util.generate_output_file_name(
                fname, '', outputDir, '.csv', 'NOM', 'noun_freq', '', '', '',
                False, True)
            filesToOpen.append(output_filename_dir_noun_frequencies)
            output_filename_dir_nominalized_frequencies = IO_files_util.generate_output_file_name(
                fname, '', outputDir, '.csv', 'NOM', 'nominal_freq', '', '',
                '', False, True)
            filesToOpen.append(output_filename_dir_nominalized_frequencies)

            result2.insert(0, [
                'Document ID', 'Document', 'Sentence ID', 'Sentence',
                'Number of words in sentence', 'Nominalized verbs',
                'Number of nominalizations in sentence',
                'Percentage of nominalizations in sentence'
            ])
            list_to_csv(output_filename_bySentenceIndex, result2)

            # list all verbs as TRUE/FALSE if nominalized
            list_to_csv(output_filename_TRUE_FALSE_dir, result_dir2)

            counter_noun_list = []
            counter_noun_list.append(['Noun', 'Frequency'])
            for word, freq in noun_cnt.most_common():
                counter_noun_list.append([word, freq])
            list_to_csv(output_filename_dir_noun_frequencies,
                        counter_noun_list)

            counter_nominalized_list = []
            counter_nominalized_list.append(['Nominalized verb', 'Frequency'])
            for word, freq in nominalized_cnt.most_common():
                counter_nominalized_list.append([word, freq])
            list_to_csv(output_filename_dir_nominalized_frequencies,
                        counter_nominalized_list)

            if createExcelCharts == True:
                # pie chart of nominalized verbs
                Excel_outputFilename = Excel_util.create_excel_chart(
                    GUI_util.window, [counter_nominalized_list],
                    output_filename_dir_nominalized_frequencies, outputDir,
                    'NOM_verb'
                    "Nominalized verbs", ["pie"])
                if len(Excel_outputFilename) > 0:
                    filesToOpen.append(Excel_outputFilename)

                # pie chart of nouns
                Excel_outputFilename = Excel_util.create_excel_chart(
                    GUI_util.window, [counter_noun_list],
                    output_filename_dir_noun_frequencies, outputDir,
                    'NOM_noun', "Nouns", ["pie"])
                if len(Excel_outputFilename) > 0:
                    filesToOpen.append(Excel_outputFilename)

    IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end',
                                       'Finished running Nominalization at',
                                       True)

    if openOutputFiles == 1:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
Esempio n. 16
0
def ancestor_GoingUP(WordNetDir, inputFile, outputDir, noun_verb,
                     openOutputFiles, createExcelCharts):

    filesToOpen = []
    if IO_libraries_util.inputProgramFileCheck(
            'WordNet_Search_UP.jar') == False:
        return
    errorFound, error_code, system_output = IO_libraries_util.check_java_installation(
        'WordNet upward search')
    if errorFound:
        return
    IO_user_interface_util.timed_alert(
        GUI_util.window, 4000, 'Analysis start',
        'Started running WordNet (Zoom OUT/UP) at', True,
        '\n\nRunning WordNet with the ' + noun_verb + ' option.')
    # the java script produces two files:a list and a frequency
    warning = subprocess.call([
        'java', '-jar', 'WordNet_Search_UP.jar', '-wordNetPath',
        os.path.join(WordNetDir, "dict"), '-wordList', inputFile, "-pos",
        noun_verb, '-outputDir', outputDir
    ])
    if warning == 1:
        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'Invalid Input',
            'Wordnet cannot find any word in the input csv file for " + noun_verb + ".\n\nThis error can also occur if any of the files previously generated by WordNet are open. Please, check your files, close them, and try again.'
        )

        # mb.showwarning(title = "Invalid Input", message = "Wordnet cannot find any word in the input csv file for " + noun_verb + ".\n\nThis error can also occur if any of the files previously generated by WordNet are open. Please, check your files, close them, and try again.")
        return
    elif warning == 2:
        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'Invalid Input',
            'Some words in your list do not exist in Wordnet for " + noun_verb + ".\n\nPlease, check the list of words in command line.'
        )
        # mb.showwarning(title = "Invalid Input", message = "Some words in your list do not exist in Wordnet for " + noun_verb + ".\n\nPlease, check the list of words in command line.")
    fileName = os.path.basename(inputFile).split(".")[0]
    outputFilenameCSV1 = os.path.join(
        outputDir, "NLP_WordNet_UP_" + fileName + "_output.csv")
    filesToOpen.append(outputFilenameCSV1)
    outputFilenameCSV2 = os.path.join(
        outputDir, "NLP_WordNet_UP_" + fileName + "_frequency.csv")
    filesToOpen.append(outputFilenameCSV2)

    if createExcelCharts:
        columns_to_be_plotted = [[1, 1]]
        chart_title = 'Frequency of WordNet Aggregate Categories for ' + noun_verb
        hover_label = ['Word']
        inputFilename = outputFilenameCSV1
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted,
            inputFilename,
            outputDir,
            outputFileLabel='_bar_chart',
            chart_type_list=["bar"],
            chart_title=chart_title,
            column_xAxis_label_var='WordNet ' + noun_verb + ' category',
            hover_info_column_list=hover_label,
            count_var=1)

        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)

    IO_user_interface_util.timed_alert(
        GUI_util.window, 3000, 'Analysis end',
        'Finished running WordNet (Zoom OUT/UP) at', True)
    return filesToOpen
Esempio n. 17
0
def run(CoreNLPdir, inputFilename, inputDir, outputDir, openOutputFiles,
        createExcelCharts, mean_var, median_var, SA_algorithm_var, memory_var,
        sentence_index_var, shape_of_stories_var):

    #if GUI_util.check_missingIO()==True:
    #    return
    usefile = False
    usedir = False
    flag = ""  #used by CoreNLP
    filesToOpen = []  # Store all files that are to be opened once finished

    if shape_of_stories_var:
        if IO_libraries_util.inputProgramFileCheck(
                'shape_of_stories_main.py') == False:
            return
        call("python shape_of_stories_main.py", shell=True)

    if SA_algorithm_var == '':
        mb.showwarning(
            'Warning',
            "No option has been selected.\n\nPlease, select a Sentiment analysis option and try again."
        )
        return

    if len(inputFilename) > 3:
        usefile = True
        usedir = False

    if len(inputDir) > 3:
        usefile = False
        usedir = True

    mode = "both"
    if mean_var == False and median_var == False:
        mode = "mean"
    elif mean_var == True and median_var == False:
        mode = "mean"
    elif mean_var == False and median_var == True:
        mode = "median"
    elif mean_var == True and median_var == True:
        mode = "both"

    SentiWordNet_var = 0
    CoreNLP_var = 0
    hedonometer_var = 0
    vader_var = 0
    anew_var = 0

    if SA_algorithm_var == '*':
        SentiWordNet_var = 1
        CoreNLP_var = 1
        hedonometer_var = 1
        vader_var = 1
        anew_var = 1
    elif SA_algorithm_var == 'Stanford CoreNLP':
        CoreNLP_var = 1
    elif SA_algorithm_var == 'SentiWordNet':
        SentiWordNet_var = 1
    elif SA_algorithm_var == 'ANEW':
        anew_var = 1
    elif SA_algorithm_var == 'hedonometer':
        hedonometer_var = 1
    elif SA_algorithm_var == 'VADER':
        vader_var = 1

    #CORENLP  _______________________________________________________
    if CoreNLP_var == 1:
        #check internet connection
        import IO_internet_util
        if not IO_internet_util.check_internet_availability_warning(
                'Stanford CoreNLP Sentiment Analysis'):
            return
        #     flag="true" do NOT produce individual output files when processing a directory; only merged file produced
        #     flag="false" or flag="" ONLY produce individual output files when processing a directory; NO  merged file produced

        flag = "false"  # the true option does not seem to work

        if IO_libraries_util.inputProgramFileCheck(
                'Stanford_CoreNLP_annotator_util.py') == False:
            return
        # IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Stanford CoreNLP Sentiment Analysis',
        #                                    'Started running Stanford CoreNLP Sentiment Analysis at', True,
        #                                    'You can follow CoreNLP in command line.')

        #@ need an additional variable CoreNLP dir and memory_var @
        # set memory_var if not there
        if memory_var == 0:
            memory_var = 4
        outputFilename = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
            inputFilename, inputDir, outputDir, openOutputFiles,
            createExcelCharts, 'sentiment', False, memory_var)
        outputFilename = outputFilename[
            0]  # annotators return a list and not a string
        if len(outputFilename) > 0:
            filesToOpen.append(outputFilename)
        #@ not longer need to call java subprocess @
        # subprocess.call(['java', '-jar', 'Stanford_CoreNLP_sentiment_analysis.jar', inputDir, inputFilename, outputDir, flag])
        if not usedir:
            if createExcelCharts == True:
                # CoreNLP only computes mean values
                columns_to_be_plotted = [[2, 4]]
                hover_label = ['Sentence']
                # inputFilename = outputFilename
                Excel_outputFilename = Excel_util.run_all(
                    columns_to_be_plotted,
                    outputFilename,
                    outputDir,
                    outputFileLabel='CoreNLP_sent',
                    chart_type_list=["line"],
                    chart_title=
                    'Stanford CoreNLP - Sentiment Scores by Sentence Index',
                    column_xAxis_label_var='Sentence index',
                    hover_info_column_list=hover_label,
                    count_var=0,
                    column_yAxis_label_var='Scores')
                if Excel_outputFilename != "":
                    filesToOpen.append(Excel_outputFilename)

                columns_to_be_plotted = [[5, 5]]
                hover_label = []
                # inputFilename = inputFilename
                Excel_outputFilename = Excel_util.run_all(
                    columns_to_be_plotted,
                    outputFilename,
                    outputDir,
                    outputFileLabel='CoreNLP_SA',
                    chart_type_list=["bar"],
                    chart_title='Stanford CoreNLP - Sentiment Scores',
                    column_xAxis_label_var='Sentiment score',
                    hover_info_column_list=hover_label,
                    count_var=1,
                    column_yAxis_label_var='Scores')

                if Excel_outputFilename != "":
                    filesToOpen.append(Excel_outputFilename)

                # outputFilenameXlsm1 = Excel_util.run_all(columns_to_be_plotted,inputFilename,outputDir, outputQuotefilePath, chart_type_list = ["bar"], chart_title=
                # "Stanford CoreNLP (Sentiment Value)", column_xAxis_label_var = 'Sentiment value',
                # column_yAxis_label_var = 'Frequency of sentiment value',outputExtension = '.xlsm',
                # label1='SC',label2='CoreNLP_Sentiment',label3='bar',label4='chart',label5='',
                # useTime=False,disable_suffix=True,  count_var=1, column_yAxis_field_list = [], reverse_column_position_for_series_label=False , series_label_list=[''], second_y_var=0, second_yAxis_label='', hover_info_column_list=hover_label)

        # else:
        #     #open only the merged file
        #     lastPart=os.path.basename(os.path.normpath(inputDir))
        #     outputFilename = IO_files_util.generate_output_file_name(lastPart, outputDir, '.csv', 'SC', 'Sentiment CoreNLP', '', '', '', False, True)
        #     filesToOpen.append(outputFilename)

        # IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end', 'Finished running CoreNLP Sentiment Analysis at', True)

    #HEDONOMETER _______________________________________________________
    if hedonometer_var == 1:
        if lib_util.checklibFile(
                GUI_IO_util.sentiment_libPath + os.sep + 'hedonometer.json',
                'sentiment_analysis_hedonometer_util.py') == False:
            return
        if IO_libraries_util.inputProgramFileCheck(
                'sentiment_analysis_hedonometer_util.py') == False:
            return
        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'Analysis start',
            'Started running HEDONOMETER Sentiment Analysis at', True)
        if len(inputFilename) > 0:
            fileNamesToPass = []  # LINE ADDED
            outputFilename = IO_files_util.generate_output_file_name(
                inputFilename, inputDir, outputDir, '.csv', 'SC',
                'Hedonometer', '', '', '', False, True)
        else:
            outputFilename = IO_files_util.generate_output_file_name(
                inputDir, inputDir, outputDir, '.csv', 'SC_dir', 'Hedonometer',
                '', '', '', False, True)

        sentiment_analysis_hedonometer_util.main(inputFilename, inputDir,
                                                 outputDir, outputFilename,
                                                 mode)

        #tkinter filedialog.askdirectory ALWAYS returns forward slashes / if you use os.sep you end up mixing the slashes
        # subprocess.call(['python', 'sentiment_analysis_hedonometer_util.py', '--file', inputFilename, "--out", outputDir+os.sep
        #                  , "--mode", mode])
        filesToOpen.append(outputFilename)

        if createExcelCharts == True:
            if mode == "both":
                columns_to_be_plotted = [[2, 4], [2, 6]]
                hover_label = ['Sentence', 'Sentence']
            else:
                columns_to_be_plotted = [[2, 4]]
                hover_label = ['Sentence']
            # inputFilename = outputFilename
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted,
                outputFilename,
                outputDir,
                outputFileLabel='Hedo_sent',
                chart_type_list=["line"],
                chart_title='Hedonometer - Sentiment Scores by Sentence Index',
                column_xAxis_label_var='Sentence index',
                hover_info_column_list=hover_label,
                count_var=0,
                column_yAxis_label_var='Scores')
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)

            columns_to_be_plotted = [[5, 5]]
            hover_label = []
            # inputFilename = outputFilename
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted,
                outputFilename,
                outputDir,
                outputFileLabel='Hedo_sent',
                chart_type_list=["bar"],
                chart_title='Hedonometer - Sentiment Scores',
                column_xAxis_label_var='Sentiment score',
                hover_info_column_list=hover_label,
                count_var=1,
                column_yAxis_label_var='Scores')

            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)

        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'Analysis end',
            'Finished running HEDONOMETER Sentiment Analysis at', True)

    #SentiWordNet _______________________________________________________
    if SentiWordNet_var == 1:
        if IO_libraries_util.inputProgramFileCheck(
                'sentiment_analysis_SentiWordNet_util.py') == False:
            return
        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'Analysis start',
            'Started running SentiWordNet Sentiment Analysis at', True)

        if len(inputFilename) > 0:
            outputFilename = IO_files_util.generate_output_file_name(
                inputFilename, inputDir, outputDir, '.csv', 'SC',
                'SentiWordNet', '', '', '', False, True)
        else:
            outputFilename = IO_files_util.generate_output_file_name(
                inputDir, inputDir, outputDir, '.csv', 'SC_dir',
                'SentiWordNet', '', '', '', False, True)

        sentiment_analysis_SentiWordNet_util.main(inputFilename, inputDir,
                                                  outputDir, outputFilename,
                                                  mode)

        filesToOpen.append(outputFilename)
        if createExcelCharts == True:
            # sentiWordNet compute a single sentiment score
            columns_to_be_plotted = [[2, 4]]
            hover_label = ['Sentence']

            # inputFilename = outputFilename
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted,
                outputFilename,
                outputDir,
                outputFileLabel='SentiWordNet_sent',
                chart_type_list=["line"],
                chart_title='SentiWordNet - Sentiment Scores by Sentence Index',
                column_xAxis_label_var='Sentence index',
                hover_info_column_list=hover_label,
                count_var=0,
                column_yAxis_label_var='Scores')
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)

            columns_to_be_plotted = [[5, 5]]
            hover_label = []
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted,
                outputFilename,
                outputDir,
                outputFileLabel='SentiWordNet_sent',
                chart_type_list=["bar"],
                chart_title='SentiWordNet - Sentiment Scores',
                column_xAxis_label_var='Sentiment score',
                hover_info_column_list=hover_label,
                count_var=1,
                column_yAxis_label_var='Scores')

            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)

        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'Analysis end',
            'Finished running SentiWordNet Sentiment Analysis at', True)

    #VADER _______________________________________________________
    if vader_var == 1:
        if lib_util.checklibFile(
                GUI_IO_util.sentiment_libPath + os.sep + 'vader_lexicon.txt',
                'sentiment_analysis_VADER_util.py') == False:
            return
        if IO_libraries_util.inputProgramFileCheck(
                'sentiment_analysis_VADER_util.py') == False:
            return
        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'Analysis start',
            'Started running VADER Sentiment Analysis at', True)
        if len(inputFilename) > 0:
            outputFilename = IO_files_util.generate_output_file_name(
                inputFilename, inputDir, outputDir, '.csv', 'SC', 'VADER', '',
                '', '', False, True)
        else:
            outputFilename = IO_files_util.generate_output_file_name(
                inputDir, inputDir, outputDir, '.csv', 'SC_dir', 'VADER', '',
                '', '', False, True)

        sentiment_analysis_VADER_util.main(inputFilename, inputDir, outputDir,
                                           outputFilename, mode)

        filesToOpen.append(outputFilename)
        if createExcelCharts == True:
            # VADER does not compute separate mean and median values
            columns_to_be_plotted = [[2, 4]]
            hover_label = ['Sentence']
            # inputFilename = outputFilename

            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted,
                outputFilename,
                outputDir,
                outputFileLabel='VADER_sent',
                chart_type_list=["line"],
                chart_title='VADER - Sentiment Scores by Sentence Index',
                column_xAxis_label_var='Sentence index',
                hover_info_column_list=hover_label,
                count_var=0,
                column_yAxis_label_var='Scores')
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)

            columns_to_be_plotted = [[5, 5]]
            hover_label = []
            # inputFilename = outputFilename
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted,
                outputFilename,
                outputDir,
                outputFileLabel='VADER_sent',
                chart_type_list=["bar"],
                chart_title='VADER - Sentiment Scores',
                column_xAxis_label_var='Sentiment score',
                hover_info_column_list=hover_label,
                count_var=1,
                column_yAxis_label_var='Scores')

            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)
        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'Analysis end',
            'Finished running VADER Sentiment Analysis at', True)

    #ANEW _______________________________________________________
    if anew_var == 1:
        if lib_util.checklibFile(
                GUI_IO_util.sentiment_libPath + os.sep +
                'EnglishShortenedANEW.csv',
                'sentiment_analysis_ANEW') == False:
            return
        if IO_libraries_util.inputProgramFileCheck(
                'sentiment_analysis_ANEW_util.py') == False:
            return
        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'Analysis start',
            'Started running ANEW Sentiment Analysis at', True)
        outputFilename = IO_files_util.generate_output_file_name(
            inputFilename, inputDir, outputDir, '.csv', 'SC', 'ANEW', '', '',
            '', False, True)

        sentiment_analysis_ANEW_util.main(inputFilename, inputDir, outputDir,
                                          outputFilename, mode)
        if createExcelCharts == True:
            # # sentiment by sentence index
            if mode == "both":
                columns_to_be_plotted = [[2, 4], [2, 6], [2, 8], [2, 10],
                                         [2, 12], [2, 14]]
                hover_label = [
                    'Sentence', 'Sentence', 'Sentence', 'Sentence', 'Sentence',
                    'Sentence'
                ]
            else:
                columns_to_be_plotted = [[2, 4], [2, 6], [2, 8]]
                hover_label = ['Sentence', 'Sentence', 'Sentence']

            # inputFilename = outputFilename
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted,
                outputFilename,
                outputDir,
                outputFileLabel='ANEW_sent',
                chart_type_list=["line"],
                chart_title='ANEW - Sentiment Scores by Sentence Index',
                column_xAxis_label_var='Sentence index',
                hover_info_column_list=hover_label,
                count_var=0,
                column_yAxis_label_var='Scores')
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)

            # sentiment bar chart
            if mode == "both":
                columns_to_be_plotted = [[5, 5], [7, 7]]
            else:
                columns_to_be_plotted = [[5, 5]]
            hover_label = []
            # inputFilename = outputFilename
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted,
                outputFilename,
                outputDir,
                outputFileLabel='ANEW_sent',
                chart_type_list=["bar"],
                chart_title='ANEW - Sentiment Scores',
                column_xAxis_label_var='Sentiment score',
                hover_info_column_list=hover_label,
                count_var=1,
                column_yAxis_label_var='Scores')

            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)

            # arousal
            if mode == "both":
                columns_to_be_plotted = [[9, 9], [11, 11]]
            else:
                columns_to_be_plotted = [[7, 7]]
            hover_label = []
            # inputFilename = outputFilename
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted,
                outputFilename,
                outputDir,
                outputFileLabel='ANEW_arous',
                chart_type_list=["bar"],
                chart_title='ANEW - Arousal Scores',
                column_xAxis_label_var='Arousal score',
                hover_info_column_list=hover_label,
                count_var=1,
                column_yAxis_label_var='Scores')

            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)

            # dominance
            if mode == "both":
                columns_to_be_plotted = [[13, 13], [15, 15]]
            else:
                columns_to_be_plotted = [[9, 9]]
            hover_label = []
            # inputFilename = outputFilename
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted,
                outputFilename,
                outputDir,
                outputFileLabel='ANEW_dom',
                chart_type_list=["bar"],
                chart_title='ANEW - Dominance Scores',
                column_xAxis_label_var='Dominance score',
                hover_info_column_list=hover_label,
                count_var=1,
                column_yAxis_label_var='Scores')

            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)

        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'Analysis end',
            'Finished running ANEW Sentiment Analysis at', True)

    if openOutputFiles == True:
        # IO_user_interface_util.timed_alert(GUI_util.window, 5000, 'Warning', 'All csv output files have been saved to ' + outputDir)
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
def plagiarist(input_main_dir_path, output_dir_path, open_csv_output_checkbox, createExcelCharts,
			   similarityIndex_Plagiarist_var, fileName_embeds_date, DateFormat, DatePosition, DateCharacterSeparator):
	if similarityIndex_Plagiarist_var < .8:
		mb.showwarning(title='Similarity Index warning', message="The level of similarity was set at " + str(
			similarityIndex_Plagiarist_var) + ".\n\nCAVEAT! The default threshold for similarity is normally set at 80%.\n\nBe aware that lowering the default level may result in too many documents wrongly classified as similar; conversely, raising the level may exclude too many documents.")

	if IO_libraries_util.inputProgramFileCheck('Lucene.jar') == False:
		return
	if len(DateCharacterSeparator) == 0:
		tk.messagebox.showinfo("Plagiarist", "DateCharacterSeparator")
		return
	lib_stopwords = lib_util.check_lib_stopwords()

	if len(lib_stopwords) != 0:
		IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis start', 'Started running PLAGIARIST at',
										   True)
		errorFound, error_code, system_output = IO_libraries_util.check_java_installation('Lucene')
		if errorFound:
			return
		subprocess.call(['java', '-jar', 'Lucene.jar', '-inputDir', input_main_dir_path + os.sep, '-outputDir',
						 output_dir_path + os.sep
							, '-stopword', lib_stopwords, '-embedsDate', str(fileName_embeds_date), '-dateFormat',
						 DateFormat
							, '-datePos', str(DatePosition), '-itemsDelim', DateCharacterSeparator, '-similarityIndex',
						 str(similarityIndex_Plagiarist_var)])
		filesToOpen.append(output_dir_path + os.sep + "document_duplicates.txt")

		outputFilenameCSV_1 = output_dir_path + os.sep + "Lucene_classes_freq.csv"
		filesToOpen.append(outputFilenameCSV_1)

		if fileName_embeds_date:
			outputFilenameCSV_2 = output_dir_path + os.sep + "Lucene_classes_time_freq.csv"
			filesToOpen.append(outputFilenameCSV_2)

		outputFilenameCSV_3 = output_dir_path + os.sep + "Lucene_document_instance_classes_freq.csv"
		filesToOpen.append(outputFilenameCSV_3)

		outputFilenameCSV_4 = output_dir_path + os.sep + "Lucene_Document_classes_freq.csv"
		group_newspaper(outputFilenameCSV_3, outputFilenameCSV_4)
		filesToOpen.append(outputFilenameCSV_4)

	if createExcelCharts:
		# Lucene_classes_freq.csv; outputFilenameCSV_1
		outputDir=output_dir_path
		inputFilename = outputFilenameCSV_1
		columns_to_be_plotted = [[0, 1]]
		hover_label = ['List of Documents in Category']
		Excel_outputFilename = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir,
												  outputFileLabel='SSR_plagiar',
												  chart_type_list=["bar"],
												  chart_title='Frequency of Plagiarism by Classes of % Duplication',
												  column_xAxis_label_var='Classes of percentage duplication',
												  hover_info_column_list=hover_label)
		if Excel_outputFilename != "":
			filesToOpen.append(Excel_outputFilename)

		# Plot Lucene_classes_time_freq.csv line plot (temporal plot); outputFilenameCSV_2
		if fileName_embeds_date:
			# columns_to_be_plotted = [[0,1], [0,2], [0,3], [0,4], [0,5], [0,6],[0,7], [0,8], [0,9],[0,10]]
			# hover_label=['','','','','','','','','','']
			inputFilename = outputFilenameCSV_2
			columns_to_be_plotted = [[0, 1], [0, 2], [0, 3]]
			hover_label = ['', '', '']
			Excel_outputFilename = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir,
													  outputFileLabel='SSR_plagiar',
													  chart_type_list=["line"],
													  chart_title='Frequency of Plagiarism by Year',
													  column_xAxis_label_var='Year',
													  hover_info_column_list=hover_label)
			if Excel_outputFilename != "":
				filesToOpen.append(Excel_outputFilename)

		# No plot for Lucene_document_classes_freq.csv
		#   because it could potentially have thousands of documents
		# 	inputFilename = outputFilenameCSV_3


		# Lucene_Document_classes_freq.csv; outputFilenameCSV_4
		columns_to_be_plotted = [[0, 1],[0, 2],[0, 3]]
		hover_label = ['']
		inputFilename = outputFilenameCSV_4
		Excel_outputFilename = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir,
												  outputFileLabel='SSR_plagiar',
												  chart_type_list=["bar"],
												  chart_title='Frequency of Plagiarism by Document Name & Classes',
												  column_xAxis_label_var='',
												  hover_info_column_list=hover_label)
		if Excel_outputFilename != "":
			filesToOpen.append(Excel_outputFilename)

	IO_user_interface_util.timed_alert(GUI_util.window, 3000, 'Analysis end', 'Finished running PLAGIARIST at', True)