from gensim.corpora import WikiCorpus


def get_corpus(file_path: str):
    """Extract plain text from a Wikipedia dump into wiki.en.raw.txt."""
    logger = Logger()
    output_file = FileOpener().get_new_file("wiki.en.raw.txt")
    # lemmatize=False requires gensim < 4.0; dictionary={} skips building a vocabulary.
    wiki: WikiCorpus = WikiCorpus(file_path, lemmatize=False, dictionary={})

    # Each text is one article's token list; write one article per line.
    for text in wiki.get_texts():
        output_file.write(" ".join(text) + "\n")
        logger.every_n_wiki_status(100)

    output_file.close()
    logger.every_n_wiki_status(1)
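
For context, here is a minimal sketch of the same extraction step without the project's Logger/FileOpener helpers. It assumes gensim < 4.0 (where WikiCorpus still accepts the lemmatize keyword) and uses a hypothetical dump file name:

from gensim.corpora import WikiCorpus

def get_corpus_plain(file_path: str) -> None:
    # Sketch only: plain open() instead of the project's FileOpener helper.
    wiki = WikiCorpus(file_path, lemmatize=False, dictionary={})
    with open("wiki.en.raw.txt", "w", encoding="utf-8") as out:
        for i, text in enumerate(wiki.get_texts(), start=1):
            # One article per line, tokens separated by single spaces.
            out.write(" ".join(text) + "\n")
            if i % 100 == 0:
                print(f"{i} articles written")

get_corpus_plain("enwiki-latest-pages-articles.xml.bz2")  # hypothetical dump name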
Example #2
def lemmatize_text(file_path: str, timer: Timer):
    """Lemmatize the raw corpus line by line, appending to wiki.en.lemmatized.txt."""
    logger = Logger()
    output_file = FileOpener().get_new_file("wiki.en.lemmatized.txt", "a")

    with open(file_path, "r") as file:
        for line in file:
            # Replace each token with its lemma; skip whitespace tokens so the
            # trailing newline never ends up in the output as a "lemma".
            lemmatized_list = [
                word.lemma_
                for word in SpacyModel.instance.get_en_spacy_line(line)
                if not word.is_space
            ]
            lemmatized_line = " ".join(lemmatized_list)
            # Re-add the newline to keep one article per line.
            output_file.write(lemmatized_line + "\n")
            logger.every_n_wiki_status(10, timer.get_duration())

    output_file.close()
    logger.every_n_wiki_status(1)
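
Without the project's SpacyModel singleton, the same lemmatization pass can be sketched with stock spaCy; en_core_web_sm is an assumed model name, and nlp.pipe streams lines in batches rather than building one Doc per call:

import spacy

def lemmatize_text_plain(file_path: str) -> None:
    # Sketch only: disable the parser and NER since only lemmas are needed.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    with open(file_path, "r", encoding="utf-8") as src, \
            open("wiki.en.lemmatized.txt", "a", encoding="utf-8") as out:
        for doc in nlp.pipe(src, batch_size=256):
            # Drop whitespace tokens so the trailing newline is not lemmatized.
            lemmas = [token.lemma_ for token in doc if not token.is_space]
            out.write(" ".join(lemmas) + "\n")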
Example #3
def remove_stopwords(file_path: str):
    """Strip NLTK stop words from the corpus, appending to wiki.en.filtered.txt."""
    logger = Logger()
    # A set makes each membership test O(1) instead of scanning a list.
    stop_words = set(StopWordModel.instance.get_stop_words_nltk())
    output_file = FileOpener().get_new_file("wiki.en.filtered.txt", "a")

    with open(file_path, "r") as file:
        for line in file:
            # split() with no argument also drops the trailing newline, so the
            # last token of each line is checked against the stop list too.
            split_line = line.split()
            filtered_list = [
                word for word in split_line if word not in stop_words
            ]
            filtered_line = " ".join(filtered_list)
            output_file.write(filtered_line + "\n")
            logger.every_n_wiki_status(100)

    output_file.close()
    logger.every_n_wiki_status(1)
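
A plain-NLTK version of this step, for comparison; it assumes the stopwords corpus is available (nltk.download fetches it on first run), and again converts the list to a set for fast lookups:

import nltk
from nltk.corpus import stopwords

def remove_stopwords_plain(file_path: str) -> None:
    # Sketch only: stock NLTK instead of the StopWordModel singleton.
    nltk.download("stopwords", quiet=True)
    stop_words = set(stopwords.words("english"))
    with open(file_path, "r", encoding="utf-8") as src, \
            open("wiki.en.filtered.txt", "a", encoding="utf-8") as out:
        for line in src:
            kept = [w for w in line.split() if w not in stop_words]
            out.write(" ".join(kept) + "\n")

Chained together, the three functions form one preprocessing pipeline; a hedged driver sketch, assuming Timer is the project helper seen in the signature of lemmatize_text:

get_corpus("enwiki-latest-pages-articles.xml.bz2")  # hypothetical dump name
lemmatize_text("wiki.en.raw.txt", Timer())
remove_stopwords("wiki.en.lemmatized.txt")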