from gensim.corpora import WikiCorpus

# Logger, FileOpener, SpacyModel, StopWordModel, and Timer are project-local
# helpers; their import paths are assumed to live elsewhere in this repo.


def get_corpus(file_path: str):
    """Extract plain text from a Wikipedia dump into wiki.en.raw.txt."""
    logger = Logger()
    output_file = FileOpener().get_new_file("wiki.en.raw.txt")
    # Passing dictionary={} skips the (slow) vocabulary build; note that the
    # lemmatize argument was removed in gensim 4.0, so this needs gensim 3.x.
    wiki: WikiCorpus = WikiCorpus(file_path, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output_file.write(" ".join(text) + "\n")
        logger.every_n_wiki_status(100)
    output_file.close()
    logger.every_n_wiki_status(1)  # report final progress


def lemmatize_text(file_path: str, timer: Timer):
    """Lemmatize each line of the raw corpus with spaCy, appending to wiki.en.lemmatized.txt."""
    logger = Logger()
    output_file = FileOpener().get_new_file("wiki.en.lemmatized.txt", "a")
    with open(file_path, "r") as file:
        for line in file:
            lemmatized_list = [
                word.lemma_ for word in SpacyModel.instance.get_en_spacy_line(line)
            ]
            lemmatized_line = " ".join(lemmatized_list)
            # The join drops the source newline, so restore it to keep one
            # article per line in the output file.
            output_file.write(lemmatized_line + "\n")
            logger.every_n_wiki_status(10, timer.get_duration())
    output_file.close()
    logger.every_n_wiki_status(1)  # report final progress


def remove_stopwords(file_path: str):
    """Strip NLTK stop words from each line, appending to wiki.en.filtered.txt."""
    logger = Logger()
    stop_words = StopWordModel.instance.get_stop_words_nltk()
    output_file = FileOpener().get_new_file("wiki.en.filtered.txt", "a")
    with open(file_path, "r") as file:
        for line in file:
            # split() rather than split(" ") so the trailing newline is not
            # left attached to the last token, which would defeat the lookup.
            filtered_list = [word for word in line.split() if word not in stop_words]
            filtered_line = " ".join(filtered_list)
            output_file.write(filtered_line + "\n")
            logger.every_n_wiki_status(100)
    output_file.close()
    logger.every_n_wiki_status(1)  # report final progress


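# A minimal sketch of how these three steps might be chained end to end.
# The dump filename is a hypothetical placeholder; the intermediate names
# mirror the ones hard-coded above, and constructing Timer() with no
# arguments is an assumption, since only get_duration() is called here.
if __name__ == "__main__":
    timer = Timer()
    get_corpus("enwiki-latest-pages-articles.xml.bz2")  # hypothetical dump path
    lemmatize_text("wiki.en.raw.txt", timer)
    remove_stopwords("wiki.en.lemmatized.txt")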