예제 #1
0
    def setup_non_academic(self):
        """
        Count word frequencies in the non-academic corpus and store them.

        Walks ``self.corpus_directory + self.non_academic_corpus``, reads every
        ``.okl`` file as ISO-8859-1 through ``read_lbk.read_cg3`` and tallies
        the second field of each non-string token (double quotes stripped),
        skipping tokens whose second field contains ``$``. The tally is passed
        through ``self.remove_threshold`` and ``self.store_dict`` writes it to
        ``counts/dictionary_non_academic.txt``; the number of counted tokens
        (taken before thresholding) is written to
        ``counts/word_count_non_academic``.

        :return: None
        """
        # Total count non-academic
        total_dict = {}
        total_count = 0
        print('Reading files in: non-academic corpus')
        corpus_root = self.corpus_directory + self.non_academic_corpus
        for subdir, _dirs, files in os.walk(corpus_root):
            for name in files:
                if not name.endswith('.okl'):
                    continue
                # Open inside ``with`` so each corpus file is closed as soon as
                # it has been consumed instead of leaking one handle per file.
                with codecs.open(os.path.join(subdir, name), 'r', 'ISO-8859-1') as corpus_file:
                    for sentence in read_lbk.read_cg3(corpus_file):
                        for word in sentence:
                            # Plain strings and '$'-tagged entries are not counted
                            if isinstance(word, str) or '$' in word[1]:
                                continue
                            current_word = word[1].replace('"', '')
                            self.add_count_to_dict(total_dict, current_word)
                            total_count += 1

        # Storing non-academic dictionaries
        self.remove_threshold(total_dict)
        self.store_dict(total_dict, 'counts/dictionary_non_academic.txt')
        with open('counts/word_count_non_academic', 'w') as f:
            f.write(str(total_count))
예제 #2
0
import cg3.read_lbk as lbk
import os
import codecs
import random

def _sample_corpus_words(corpus_root, sample_size):
    """Randomly sample unique words from the ``.okl`` files below *corpus_root*.

    Every non-string token whose second field does not contain ``$`` is a
    candidate; a candidate is kept when a random draw succeeds and the word
    (double quotes stripped) has not been collected before. Sampling stops
    after the first sentence that pushes the collection past *sample_size*,
    so the result may hold slightly more than *sample_size* words.

    :param corpus_root: directory that is walked recursively
    :param sample_size: number of collected words after which sampling stops
    :return: list of sampled words in the order they were collected
    """
    sampled = []
    seen = set()  # O(1) duplicate check; ``sampled`` keeps the order
    for subdir, _dirs, files in os.walk(corpus_root):
        for name in files:
            if not name.endswith('.okl'):
                continue
            # ``with`` closes the corpus file even when we return mid-file
            with codecs.open(os.path.join(subdir, name), 'r', 'ISO-8859-1') as corpus_file:
                for sentence in lbk.read_cg3(corpus_file):
                    for word in sentence:
                        if isinstance(word, str) or '$' in word[1]:
                            continue
                        current_word = word[1].replace('"', '')
                        # randint is inclusive on both ends: 20 of 1001 draws pass
                        if random.randint(0, 1000) > 980 and current_word not in seen:
                            seen.add(current_word)
                            sampled.append(current_word)
                    if len(sampled) > sample_size:
                        # Enough words: stop immediately instead of opening
                        # the remaining files of the directory.
                        return sampled
    return sampled


word_list = _sample_corpus_words('/Users/arashsaidi/Work/Corpus/lbk_22.04.14/TV', 750)

# Only the first 750 sampled words are written, one per line
with open('random_lbk.txt', 'w') as f:
    for w in word_list[0:750]:
        f.write(w + '\n')

# 46.12788739668003 67.80671117032298
예제 #3
0
def run_coverage(n, academic_list, save_file,
                 lists_dir=None, corpus_dir=None, output_dir=None):
    """Measure how much of the KIAP corpus is covered by the top *n* list words.

    Reads the first *n* lines of the word list (keeping the first
    space-separated field of each line), walks every ``.obt`` file below the
    corpus directory and counts the tokens whose second field (double quotes
    stripped, entries containing ``$`` ignored) is one of those words. A
    report with the total token count, the coverage percentage and the
    per-word shares (sorted by descending share) is written to
    ``<output_dir>/<n>_words_checked_KIAP<save_file>.txt``. Nothing is walked
    or written when no list words were read.

    :param n: number of words to take from the top of the word list
    :param academic_list: file name of the word list inside ``lists_dir``
    :param save_file: suffix inserted into the report file name
    :param lists_dir: directory holding the word lists; ``None`` selects the
        original hard-coded project path
    :param corpus_dir: root of the corpus to walk; ``None`` selects the
        original hard-coded KIAP path
    :param output_dir: directory the report is written to; ``None`` selects
        the original hard-coded project path
    :return: summed share of corpus tokens matched by the list words
        (``0`` when no list words were read or nothing matched)
    """
    if lists_dir is None:
        lists_dir = "/Users/arashsaidi/PycharmProjects/GardnerDavies2/lists/"
    if corpus_dir is None:
        corpus_dir = "/Users/arashsaidi/Work/Corpus/kiap-obt/"
    if output_dir is None:
        output_dir = "/Users/arashsaidi/PycharmProjects/GardnerDavies2/coverage/"

    # Number of words to include in academic list
    nr_of_words = n
    print("Running Coverage on KIAP...")

    # A set gives O(1) membership tests in the per-token loop below
    duo_words = set()
    words_checked = 0
    # Words list to check for coverage
    with open(os.path.join(lists_dir, academic_list)) as duo:
        for line in duo:
            if words_checked >= nr_of_words:
                break
            duo_words.add(line.split(" ")[0].replace("\n", ""))
            words_checked += 1

    found_count = 0.0
    word_counts = dict()
    total_word_count = 0.0
    coverage = 0

    if words_checked > 0:
        for dir_name, _dir_names, file_names in os.walk(corpus_dir):
            for file_name in file_names:
                if not file_name.endswith(".obt"):
                    continue
                # Close every corpus file once it has been consumed
                with codecs.open(os.path.join(dir_name, file_name), "r", "utf8") as corpus_file:
                    # NOTE(review): each item yielded by read_cg3 is treated as a
                    # token whose second element is the word; confirm this matches
                    # the reader's output shape for .obt files.
                    for word in read_lbk.read_cg3(corpus_file):
                        if isinstance(word, str) or not isinstance(word[1], str):
                            continue
                        current_word = word[1].replace('"', "")
                        if "$" in current_word:
                            continue
                        total_word_count += 1.0
                        if current_word in duo_words:
                            found_count += 1
                            word_counts[current_word] = word_counts.get(current_word, 0.0) + 1.0

        print(total_word_count)
        # An empty or missing corpus leaves the total at zero; report 0.0
        # instead of failing with ZeroDivisionError.
        if total_word_count:
            overall_share = found_count / total_word_count
        else:
            overall_share = 0.0
        print("Coverage: " + str(overall_share))

        # Turn absolute counts into shares of all counted tokens
        # (word_counts is empty whenever total_word_count is zero).
        for word, c in word_counts.items():
            word_counts[word] = c / total_word_count

        sorted_x = sorted(word_counts.items(), key=operator.itemgetter(1), reverse=True)

        report_name = str(nr_of_words) + "_words_checked_KIAP" + save_file + ".txt"
        with open(os.path.join(output_dir, report_name), "w") as f:
            f.write("Total words in comparison: " + str(total_word_count) + "\n")
            for word in sorted_x:
                coverage += word[1]
            f.write("Coverage: " + str(coverage * 100) + "\n\n")
            for word in sorted_x:
                f.write(word[0] + " " + str(word[1]) + "\n")

    print("Words should be checked: " + str(nr_of_words))
    print("Words checked: " + str(words_checked))

    return coverage
예제 #4
0
# Corpus locations as absolute paths; adjust them before running elsewhere.
# LBK directory that the counting loop further down walks for '.okl' files.
directory_lbk = '/Users/arashsaidi/Work/Corpus/lbk_22.04.14/Skjonnlitt/'
# KIAP corpus directory. NOTE(review): not referenced in the visible part of
# this script; confirm whether it is still needed.
directory_kiap = '/Users/arashsaidi/Work/Corpus/kiap-obt/'


def remove_threshold(dictionary, threshold=5):
    """Delete, in place, every entry whose count is below *threshold*.

    :param dictionary: mapping of key -> count; modified in place
    :param threshold: minimum count an entry needs in order to be kept
        (defaults to 5, the previously hard-coded cut-off)
    :return: None
    """
    # Snapshot the keys first: deleting while iterating the dict itself
    # raises RuntimeError.
    for key in list(dictionary):
        if dictionary[key] < threshold:
            del dictionary[key]

# Frequency of every counted word, and the total number of counted tokens
counts = {}
words = 0
# lbk
for dir_name, d, file_names in os.walk(directory_lbk):
    for f in file_names:
        if not f.endswith('.okl'):
            continue
        # ``with`` closes each corpus file once it has been consumed
        with codecs.open(os.path.join(dir_name, f), 'r', 'ISO-8859-1') as corpus_file:
            for sentence in read_lbk.read_cg3(corpus_file):
                for word in sentence:
                    # Plain strings and '$'-tagged entries are not counted
                    if isinstance(word, str) or '$' in word[1]:
                        continue
                    words += 1
                    current_word = word[1].replace('"', '')
                    counts[current_word] = counts.get(current_word, 0) + 1

# Drop rare words, then store the remaining counts as JSON
remove_threshold(counts)
# Close the output file explicitly so the JSON is flushed to disk
with open('lbk.txt', 'w') as counts_file:
    json.dump(counts, counts_file)
with open('count_lbk.txt', 'w') as f:
예제 #5
0
def run_coverage(n, academic_list, save_file,
                 lists_dir=None, corpus_dir=None, output_dir=None):
    """Measure how much of an LBK sub-corpus is covered by the top *n* list words.

    Reads the first *n* lines of the word list (keeping the first
    space-separated field of each line), walks every ``.okl`` file below the
    corpus directory and counts the tokens whose second field (double quotes
    stripped, entries containing ``$`` ignored) is one of those words. A
    report with the total token count, the coverage percentage and the
    per-word shares (sorted by descending share) is written to
    ``<output_dir>/<n>_words_checked_lbk_<save_file>.txt``.

    :param n: number of words to take from the top of the word list
    :param academic_list: file name of the word list inside ``lists_dir``
    :param save_file: suffix inserted into the report file name
    :param lists_dir: directory holding the word lists; ``None`` selects the
        original hard-coded project path
    :param corpus_dir: root of the corpus to walk; ``None`` selects the
        original hard-coded LBK ``Skjonnlitt/`` path
    :param output_dir: directory the report is written to; ``None`` selects
        the original hard-coded project path
    :return: summed share of corpus tokens matched by the list words
        (``0`` when nothing matched)
    """
    # For running the academic part of lbk. Not used by default: it is only
    # needed when the directory filter mentioned in the walk loop is enabled.
    academic_dir_name = ['/Users/arashsaidi/Work/Corpus/lbk_22.04.14/Sakprosa/SA02',
                         '/Users/arashsaidi/Work/Corpus/lbk_22.04.14/Sakprosa/SA04',
                         '/Users/arashsaidi/Work/Corpus/lbk_22.04.14/Sakprosa/SA05',
                         '/Users/arashsaidi/Work/Corpus/lbk_22.04.14/Sakprosa/SA22']

    print('Running Coverage on lbk...')
    lbk = 'Skjonnlitt/'
    if lists_dir is None:
        lists_dir = '/Users/arashsaidi/PycharmProjects/GardnerDavies2/lists/'
    if corpus_dir is None:
        corpus_dir = '/Users/arashsaidi/Work/Corpus/lbk_22.04.14/' + lbk
    if output_dir is None:
        output_dir = '/Users/arashsaidi/PycharmProjects/GardnerDavies2/coverage/'

    # Number of words to include in academic list
    nr_of_words = n
    # A set gives O(1) membership tests in the per-token loop below
    duo_words = set()
    words_checked = 0
    coverage = 0

    # Words list to check for coverage
    with open(os.path.join(lists_dir, academic_list)) as duo:
        for line in duo:
            if words_checked >= nr_of_words:
                break
            duo_words.add(line.split(' ')[0].replace('\n', ''))
            words_checked += 1

    found_count = 0.
    word_counts = dict()
    total_word_count = 0.
    for dir_name, _dir_names, file_names in os.walk(corpus_dir):
        for file_name in file_names:
            # To check only the academic part of LBK Sakprosa, additionally
            # require: dir_name in academic_dir_name
            if not file_name.endswith('.okl'):
                continue
            # Close every corpus file once it has been consumed
            with codecs.open(os.path.join(dir_name, file_name), 'r', 'ISO-8859-1') as corpus_file:
                for sentence in read_lbk.read_cg3(corpus_file):
                    for word in sentence:
                        # Plain strings and '$'-tagged entries are not counted
                        if isinstance(word, str) or '$' in word[1]:
                            continue
                        total_word_count += 1.
                        current_word = word[1].replace('"', '')
                        if current_word in duo_words:
                            found_count += 1
                            word_counts[current_word] = word_counts.get(current_word, 0.) + 1.

    print(total_word_count)

    # Turn absolute counts into shares of all counted tokens
    # (word_counts is empty whenever total_word_count is zero).
    for word, count in word_counts.items():
        word_counts[word] = count / total_word_count

    sorted_x = sorted(word_counts.items(), key=operator.itemgetter(1), reverse=True)

    report_name = str(nr_of_words) + '_words_checked_lbk_' + save_file + '.txt'
    with open(os.path.join(output_dir, report_name), 'w') as f:
        f.write('Total words in comparison: ' + str(total_word_count) + '\n')
        for word in sorted_x:
            coverage += word[1]
        f.write('Coverage: ' + str(coverage * 100) + '\n\n')
        for word in sorted_x:
            f.write(word[0] + ' ' + str(word[1]) + '\n')

    return coverage