def provide_corpus_functionwords(numOfFunctionwords):
    """Return the most frequent function words of the corpus, cached on disk.

    If the synchronized function-words file is missing, the corpus under
    ``setup.database`` is scanned: only the ``europe_data`` directory is
    visited, and only when ``setup.domain == 'in'``.  Every whitespace
    separated token that is a key of ``function_words_map`` is counted, the
    ``numOfFunctionwords`` most frequent ones are selected and written to
    the cache file with ``util.save_file``.

    In all cases the result is (re)loaded from the cache file with
    ``util.load_file``, stored in the module-level global
    ``top_corpus_functionwords`` and returned.

    Args:
        numOfFunctionwords: how many of the most frequent function words to
            keep when the cache has to be generated.  Ignored when the cache
            file already exists.

    Returns:
        Whatever ``util.load_file`` yields for the cache file (presumably the
        list of words that was saved -- confirm against ``util``).

    Raises:
        IndexError: if a country directory name contains no ``'.'`` (the
            country name is taken as the second dot-separated component).
    """
    global top_corpus_functionwords
    top_corpus_functionwords = {}
    synchronized_functionwords_file_path = Path("vectors_handling/vectors/synchronized_functionwords/synchronized_functionwords.txt")

    if not util.exists(synchronized_functionwords_file_path):
        # No cached result on disk: count function words over the corpus.
        log('Cannot find synchronized_functionwords file')
        corpus_functionwords = {}
        for domain_dir in os.scandir(setup.database):
            if not (domain_dir.name == 'europe_data' and setup.domain == 'in'):
                continue
            for country_dir in os.scandir(domain_dir):
                # Directory names look like "<prefix>.<country>[...]"; the
                # second component is used for logging only.
                country_name = os.path.basename(country_dir).split('.')[1]
                log('Counting function words in ' + country_name)
                for user_dir in os.scandir(country_dir):
                    for file_dir in os.scandir(user_dir):
                        # The context manager guarantees the handle is closed
                        # even if decoding fails part-way through the file.
                        with open(file_dir, "r", encoding="utf-8") as corpus_file:
                            for line in corpus_file:
                                for word in line.split():
                                    if word in function_words_map:
                                        corpus_functionwords[word] = corpus_functionwords.get(word, 0) + 1

        # Keys of the counting dict ordered by descending count.
        top_corpus_functionwords = heapq.nlargest(
            numOfFunctionwords, corpus_functionwords, key=corpus_functionwords.get
        )
        util.save_file(synchronized_functionwords_file_path, top_corpus_functionwords)

    # Always read back from disk so callers get the same representation
    # whether or not the cache was just generated.
    top_corpus_functionwords = util.load_file(synchronized_functionwords_file_path)
    return top_corpus_functionwords
def provide_top_spelling_errors():
    """Return the top spelling errors, generating the cache file if needed.

    When the cache file does not exist (per ``util.exists``), it is created
    by ``generate_top_spelling_errors``.  The result is then read with
    ``util.load_file`` and returned as-is.
    """
    spelling_file_path = Path("vectors_handling/vectors/spelling_errors/top_spelling_errors.txt")
    if not util.exists(spelling_file_path):
        # The message names the spelling-errors cache so the log points at
        # the file that is actually missing.
        log('Cannot find top spelling errors file')
        generate_top_spelling_errors(spelling_file_path)
    top_spelling_errors = util.load_file(spelling_file_path)
    return top_spelling_errors
def provide_top_unigram():
    """Return the top unigrams, generating the cache file on first use.

    The cache file is looked up with ``util.exists``; if it is missing it is
    produced by ``generate_top_unigrams``.  The returned value is whatever
    ``util.load_file`` reads from that file.
    """
    cache_path = Path("vectors_handling/vectors/unigrams/top_unigrams.txt")

    # Fast path: the cache is already on disk.
    if util.exists(cache_path):
        return util.load_file(cache_path)

    # Slow path: build the cache, then read it back.
    log('Cannot find top unigrams file')
    generate_top_unigrams(cache_path)
    return util.load_file(cache_path)
def provide_top_tripos():
    """Return the top POS trigrams, generating the cache file on first use.

    If ``util.exists`` reports the cache file as missing, it is created by
    ``generate_top_tripos`` before being read.  The returned value is
    whatever ``util.load_file`` reads from that file.
    """
    cache_path = Path("vectors_handling/vectors/pos/top_tripos.txt")
    cache_missing = not util.exists(cache_path)
    if cache_missing:
        # Nothing cached yet: report it and build the file.
        log('Cannot find top tripos file')
        generate_top_tripos(cache_path)
    return util.load_file(cache_path)