def setup_non_academic(self):
    """
    Setup for LBK: produce the count files for the non-academic corpus.

    Walks ``self.corpus_directory + self.non_academic_corpus`` and reads
    every ``.okl`` file (decoded as ISO-8859-1) through
    ``read_lbk.read_cg3``.  For each non-string item of each sentence, the
    second field (``word[1]``) is tallied after stripping double quotes;
    items whose second field contains ``'$'`` are skipped.

    The tally is passed to ``self.remove_threshold`` and then to
    ``self.store_dict`` with the target ``counts/dictionary_non_academic.txt``
    (JSON, according to the original note on this method).  The number of
    tallied items, counted before thresholding, is written to
    ``counts/word_count_non_academic``.

    :return: None
    """
    # Total count non-academic
    total_dict = {}
    total_count = 0
    print('Reading files in: non-academic corpus')

    corpus_root = self.corpus_directory + self.non_academic_corpus
    for subdir, dirs, files in os.walk(corpus_root):
        for f in files:
            if not f.endswith('.okl'):
                continue
            # Open inside ``with`` so every corpus file is closed once it has
            # been consumed, instead of leaking one handle per file.
            with codecs.open(os.path.join(subdir, f), 'r', 'ISO-8859-1') as corpus_file:
                for sentence in read_lbk.read_cg3(corpus_file):
                    for word in sentence:
                        # Plain strings are not tokens; '$' marks entries
                        # that must not be counted.
                        if isinstance(word, str) or '$' in word[1]:
                            continue
                        current_word = word[1].replace('"', '')
                        self.add_count_to_dict(total_dict, current_word)
                        total_count += 1

    # Storing non-academic dictionaries
    self.remove_threshold(total_dict)
    self.store_dict(total_dict, 'counts/dictionary_non_academic.txt')
    with open('counts/word_count_non_academic', 'w') as f:
        f.write(str(total_count))
import cg3.read_lbk as lbk
import os
import codecs
import random

# Corpus section that is sampled and the number of distinct words to keep.
CORPUS_DIRECTORY = '/Users/arashsaidi/Work/Corpus/lbk_22.04.14/TV'
SAMPLE_SIZE = 750


def _sample_words(corpus_directory, sample_size):
    """
    Collect up to ``sample_size`` distinct words, chosen at random.

    Every ``.okl`` file below ``corpus_directory`` is read (ISO-8859-1)
    through ``lbk.read_cg3``.  For each non-string item whose second field
    does not contain ``'$'``, the field is stripped of double quotes and kept
    when ``random.randint(0, 1000) > 980`` and the word was not kept before.

    :param corpus_directory: root directory that is walked recursively
    :param sample_size: number of distinct words after which scanning stops
    :return: list of sampled words in the order they were encountered
    """
    sampled = []
    seen = set()  # constant-time duplicate check; ``sampled`` keeps the order
    for subdir, dirs, files in os.walk(corpus_directory):
        for f in files:
            if not f.endswith('.okl'):
                continue
            # ``with`` closes each corpus file, also on the early return.
            with codecs.open(os.path.join(subdir, f), 'r', 'ISO-8859-1') as corpus_file:
                for sentence in lbk.read_cg3(corpus_file):
                    for word in sentence:
                        if isinstance(word, str) or '$' in word[1]:
                            continue
                        current_word = word[1].replace('"', '')
                        # The random draw comes first so the number of draws
                        # per token is independent of the duplicate check.
                        if random.randint(0, 1000) > 980 and current_word not in seen:
                            seen.add(current_word)
                            sampled.append(current_word)
                            if len(sampled) >= sample_size:
                                # Sample is full: stop scanning altogether
                                # rather than opening the remaining files.
                                return sampled
    return sampled


word_list = _sample_words(CORPUS_DIRECTORY, SAMPLE_SIZE)
with open('random_lbk.txt', 'w') as f:
    for w in word_list:
        f.write(w + '\n')

# Figures left by the original author (their meaning is not stated):
# 46.12788739668003 67.80671117032298
def run_coverage(n, academic_list, save_file,
                 corpus_directory="/Users/arashsaidi/Work/Corpus/kiap-obt/",
                 lists_directory="/Users/arashsaidi/PycharmProjects/GardnerDavies2/lists/",
                 output_directory="/Users/arashsaidi/PycharmProjects/GardnerDavies2/coverage/"):
    """
    Measure how much of the KIAP corpus is covered by a word list.

    The first ``n`` lines of the list file are read and the text before the
    first space on each line is taken as a list word.  Every ``.obt`` file
    below ``corpus_directory`` is then read (UTF-8) through
    ``read_lbk.read_cg3``; for each counted token, occurrences of list words
    are tallied.  A report is written to
    ``<output_directory>/<n>_words_checked_KIAP<save_file>.txt`` containing
    the token total, the coverage in percent and one line per found word with
    its share of all counted tokens, largest share first.

    :param n: number of words to include from the word list
    :param academic_list: file name of the word list inside ``lists_directory``
    :param save_file: suffix inserted into the report file name
    :param corpus_directory: root of the corpus to compare against
    :param lists_directory: directory that holds the word list
    :param output_directory: directory the report is written to
    :return: coverage as a fraction (sum of the per-word shares); ``0`` when
        no list word was found or no token was counted
    """
    # Number of words to include in academic list
    nr_of_words = n
    print("Running Coverage on KIAP...")

    # Words list to check for coverage
    duo_words = []
    with open(os.path.join(lists_directory, academic_list)) as duo:
        for line in duo:
            if len(duo_words) >= nr_of_words:
                break
            duo_words.append(line.split(" ")[0].replace("\n", ""))
    words_checked = len(duo_words)
    # Set lookup keeps the per-token membership test constant-time.
    wanted = set(duo_words)

    found_count = 0.0
    word_counts = dict()
    total_word_count = 0.0

    if words_checked > 0:
        for dir_name, dir_names, file_names in os.walk(corpus_directory):
            for f in file_names:
                if not f.endswith(".obt"):
                    continue
                # ``with`` closes each corpus file once it has been consumed.
                with codecs.open(os.path.join(dir_name, f), "r", "utf8") as corpus_file:
                    # NOTE(review): every item yielded by read_cg3 is treated
                    # as a single token here - confirm this matches the
                    # reader's output for .obt files.
                    for word in read_lbk.read_cg3(corpus_file):
                        # Skip plain strings and items without a string in
                        # the second field.
                        if isinstance(word, str) or not isinstance(word[1], str):
                            continue
                        current_word = word[1].replace('"', "")
                        if "$" in current_word:
                            continue
                        total_word_count += 1.0
                        if current_word in wanted:
                            found_count += 1
                            word_counts[current_word] = word_counts.get(current_word, 0.0) + 1.0

    print(total_word_count)
    # Guard the division: nothing is counted for an empty corpus or when no
    # list words were requested.
    coverage_ratio = found_count / total_word_count if total_word_count else 0.0
    print("Coverage: " + str(coverage_ratio))

    # Turn raw counts into shares of all counted tokens, largest first.
    # ``word_counts`` is empty whenever ``total_word_count`` is zero.
    shares = [(word, count / total_word_count) for word, count in word_counts.items()]
    sorted_x = sorted(shares, key=operator.itemgetter(1), reverse=True)

    coverage = 0
    for _, share in sorted_x:
        coverage += share

    report_name = str(nr_of_words) + "_words_checked_KIAP" + save_file + ".txt"
    with open(os.path.join(output_directory, report_name), "w") as f:
        f.write("Total words in comparison: " + str(total_word_count) + "\n")
        f.write("Coverage: " + str(coverage * 100) + "\n\n")
        for word, share in sorted_x:
            f.write(word + " " + str(share) + "\n")

    print("Words should be checked: " + str(nr_of_words))
    print("Words checked: " + str(words_checked))
    return coverage
# Corpus roots. Only ``directory_lbk`` is used in the visible part of this
# script.
directory_lbk = '/Users/arashsaidi/Work/Corpus/lbk_22.04.14/Skjonnlitt/'
directory_kiap = '/Users/arashsaidi/Work/Corpus/kiap-obt/'


def remove_threshold(dictionary):
    """
    Delete, in place, every entry whose value is below 5.

    :param dictionary: mapping from key to count; it is modified directly
    :return: None
    """
    # Iterate over a snapshot of the keys so entries can be deleted safely
    # while looping.
    for k in list(dictionary):
        if dictionary[k] < 5:
            del dictionary[k]


# counts: word -> number of occurrences; words: number of counted tokens
# (taken before the threshold is applied).
counts = {}
words = 0
# lbk
for dir_name, d, file_names in os.walk(directory_lbk):
    for f in file_names:
        if f.endswith('.okl'):
            # NOTE(review): the handle returned by codecs.open is never
            # closed explicitly.
            cg3_data = read_lbk.read_cg3(codecs.open(os.path.join(dir_name, f), 'r', 'ISO-8859-1'))
            for sentence in cg3_data:
                for word in sentence:
                    # Check if list
                    if not isinstance(word, str):
                        # Entries whose second field contains '$' are not
                        # counted.
                        if '$' not in word[1]:
                            words += 1
                            current_word = word[1].replace('"', '')
                            if current_word in counts:
                                counts[current_word] += 1
                            else:
                                counts[current_word] = 1
# Drop words counted fewer than 5 times, then store the counts as JSON.
remove_threshold(counts)
# NOTE(review): the handle opened for 'lbk.txt' is never closed explicitly.
json.dump(counts, open('lbk.txt', 'w'))
# NOTE(review): the body of this ``with`` statement is not visible in the
# reviewed source; the statement is left exactly as found.
with open('count_lbk.txt', 'w') as f:
def run_coverage(n, academic_list, save_file,
                 corpus_directory='/Users/arashsaidi/Work/Corpus/lbk_22.04.14/Skjonnlitt/',
                 lists_directory='/Users/arashsaidi/PycharmProjects/GardnerDavies2/lists/',
                 output_directory='/Users/arashsaidi/PycharmProjects/GardnerDavies2/coverage/'):
    """
    Measure how much of an LBK corpus section is covered by a word list.

    The first ``n`` lines of the list file are read and the text before the
    first space on each line is taken as a list word.  Every ``.okl`` file
    below ``corpus_directory`` is then read (ISO-8859-1) through
    ``read_lbk.read_cg3``; for each counted token, occurrences of list words
    are tallied.  A report is written to
    ``<output_directory>/<n>_words_checked_lbk_<save_file>.txt`` containing
    the token total, the coverage in percent and one line per found word with
    its share of all counted tokens, largest share first.

    :param n: number of words to include from the word list
    :param academic_list: file name of the word list inside ``lists_directory``
    :param save_file: suffix inserted into the report file name
    :param corpus_directory: root of the corpus section to compare against
    :param lists_directory: directory that holds the word list
    :param output_directory: directory the report is written to
    :return: coverage as a fraction (sum of the per-word shares); ``0`` when
        no list word was found
    """
    # For running the academic part of lbk. Not used by default: to restrict
    # the run to these directories, add ``dir_name in academic_dir_name`` to
    # the file filter below.
    academic_dir_name = ['/Users/arashsaidi/Work/Corpus/lbk_22.04.14/Sakprosa/SA02',
                         '/Users/arashsaidi/Work/Corpus/lbk_22.04.14/Sakprosa/SA04',
                         '/Users/arashsaidi/Work/Corpus/lbk_22.04.14/Sakprosa/SA05',
                         '/Users/arashsaidi/Work/Corpus/lbk_22.04.14/Sakprosa/SA22']

    print('Running Coverage on lbk...')

    # Number of words to include in academic list
    nr_of_words = n

    # Words list to check for coverage
    duo_words = []
    with open(os.path.join(lists_directory, academic_list)) as duo:
        for line in duo:
            if len(duo_words) >= nr_of_words:
                break
            duo_words.append(line.split(' ')[0].replace('\n', ''))
    # Set lookup keeps the per-token membership test constant-time.
    wanted = set(duo_words)

    word_counts = dict()
    total_word_count = 0.

    for dir_name, dir_names, file_names in os.walk(corpus_directory):
        for f in file_names:
            if not f.endswith('.okl'):
                continue
            # ``with`` closes each corpus file once it has been consumed.
            with codecs.open(os.path.join(dir_name, f), 'r', 'ISO-8859-1') as corpus_file:
                for sentence in read_lbk.read_cg3(corpus_file):
                    for word in sentence:
                        # Skip plain strings and entries whose second field
                        # contains '$'.
                        if isinstance(word, str) or '$' in word[1]:
                            continue
                        total_word_count += 1.
                        current_word = word[1].replace('"', '')
                        if current_word in wanted:
                            word_counts[current_word] = word_counts.get(current_word, 0.) + 1.

    print(total_word_count)

    # Turn raw counts into shares of all counted tokens, largest first.
    # ``word_counts`` is empty whenever ``total_word_count`` is zero, so the
    # division below cannot fail.
    shares = [(word, count / total_word_count) for word, count in word_counts.items()]
    sorted_x = sorted(shares, key=operator.itemgetter(1), reverse=True)

    coverage = 0
    for _, share in sorted_x:
        coverage += share

    report_name = str(nr_of_words) + '_words_checked_lbk_' + save_file + '.txt'
    with open(os.path.join(output_directory, report_name), 'w') as f:
        f.write('Total words in comparison: ' + str(total_word_count) + '\n')
        f.write('Coverage: ' + str(coverage * 100) + '\n\n')
        for word, share in sorted_x:
            f.write(word + ' ' + str(share) + '\n')
    return coverage