Exemple #1
0
def lemmatize_text(file_path: str, timer: Timer):
    """Lemmatize ``file_path`` line by line and append the result to a file.

    Every input line is handed to ``SpacyModel.instance.get_en_spacy_line``;
    the ``lemma_`` attribute of each returned item is joined with single
    spaces and written to "wiki.en.lemmatized.txt", which is obtained in
    append mode from ``FileOpener``.

    Args:
        file_path: Path of the text file to read.
        timer: Timer whose current duration is passed to the progress logger
            after every processed line.
    """
    logger = Logger()
    output_file = FileOpener().get_new_file("wiki.en.lemmatized.txt", "a")

    # Release the output handle even when reading or lemmatizing fails
    # part-way, so already written lines are flushed rather than leaked.
    # NOTE(review): assumes get_new_file returns a file-like object exposing
    # close() (it is already used through write()) -- confirm.
    try:
        with open(file_path, "r") as file:
            for line in file:
                lemmas = [
                    word.lemma_
                    for word in SpacyModel.instance.get_en_spacy_line(line)
                ]
                # NOTE(review): no separator is written between lines here;
                # presumably the line break survives as one of the lemmas --
                # verify against the produced file.
                output_file.write(" ".join(lemmas))
                logger.every_n_wiki_status(10, timer.get_duration())
    finally:
        output_file.close()
    logger.every_n_wiki_status(1)
Exemple #2
0
def main():
    """Command-line entry point for the stopword-removal script.

    Expects exactly one command-line argument, the input file path. With any
    other argument count a usage message is logged and nothing else happens.
    When ``PathValidator`` accepts the file, ``remove_stopwords`` is called
    on it and the elapsed duration is logged together with the script name.
    """
    script_name: str = PathExtractor().get_file_name(sys.argv[0])

    arguments = sys.argv[1:]
    if len(arguments) != 1:
        Logger().usage(f'python {script_name} <wiki.en.raw.txt>')
        return

    (file_path,) = arguments
    if not PathValidator().is_valid_files([file_path]):
        return

    Logger().info(f'Input file: "{file_path}"')
    Logger().info("Starting to remove stopwords")
    timer = Timer()
    remove_stopwords(file_path)
    Logger().finish_script(timer.get_duration(), script_name)
Exemple #3
0
def main():
    """Command-line entry point for the lemmatization script.

    Requires a single command-line argument: the path of the text file to
    lemmatize. A wrong argument count only logs the usage line. If
    ``PathValidator`` accepts the file, a ``Timer`` is created, passed to
    ``lemmatize_text`` along with the path, and its duration is logged when
    the run finishes.
    """
    script_name: str = PathExtractor().get_file_name(sys.argv[0])

    has_single_argument = len(sys.argv) == 2
    if not has_single_argument:
        Logger().usage(f"python {script_name} <wiki.en.filtered.txt>")
        return

    file_path = sys.argv[1]
    input_is_valid = PathValidator().is_valid_files([file_path])
    if not input_is_valid:
        return

    log = Logger()
    log.info(f'Input file: "{file_path}"')
    Logger().info("Starting to lemmatize text")
    timer = Timer()
    lemmatize_text(file_path, timer)
    Logger().finish_script(timer.get_duration(), script_name)
def main():
    """Command-line entry point for the wiki-corpus creation script.

    Takes one command-line argument, the path of the dump file. Any other
    argument count logs the usage line and returns. When ``PathValidator``
    accepts the file, ``get_corpus`` is run on it and the elapsed duration
    is logged together with the script name.
    """
    script_name: str = PathExtractor().get_file_name(sys.argv[0])

    if len(sys.argv) != 2:
        usage_line = (
            f'python {script_name} <en.wiki-latest-pages-articles.xml.bz2>')
        Logger().usage(usage_line)
        return

    _, file_path = sys.argv
    if not PathValidator().is_valid_files([file_path]):
        return

    Logger().info(f'Input file: "{file_path}"')
    Logger().info(f'Starting to create wiki corpus from "{file_path}"')
    timer = Timer()
    get_corpus(file_path)
    Logger().finish_script(timer.get_duration(), script_name)
Exemple #5
0
def main():
    """Command-line entry point for the project parser.

    Accepts a file or directory path and, optionally, a word2vec model path
    as second argument. Any other argument count logs the usage line. A
    given model path is made absolute, validated and installed on
    ``Word2VecModel.instance`` before parsing; an invalid model path ends
    the run silently. The project path is made absolute, validated with
    ``PathValidator().is_valid_paths`` and handed to ``parse``; afterwards
    the timer duration is logged.
    """
    script_name: str = PathExtractor().get_file_name(sys.argv[0])
    # The timer is created before the optional model is set, so its duration
    # covers that step as well.
    timer = Timer()

    argument_count = len(sys.argv) - 1
    if argument_count not in (1, 2):
        Logger().usage(
            f"python {script_name} <file_or_directory_path> [<word2vec.model>]"
        )
        return

    if argument_count == 2:
        model_path = PathExtractor().get_absolute_path(sys.argv[2])
        model_is_valid = PathValidator().is_valid_files([model_path])
        if not model_is_valid:
            return
        Word2VecModel.instance.set_model(model_path)

    project_path = PathExtractor().get_absolute_path(sys.argv[1])
    if not PathValidator().is_valid_paths([project_path]):
        return

    parse(project_path)
    Logger().finish_script(timer.get_duration(), script_name)
Exemple #6
0
class FileModel():
    """Model of a single source file and the identifier data parsed from it.

    Typical use, as far as this class shows: construct it with a path, call
    ``is_valid()`` (which also loads the file content), then ``parse()``,
    and finally read the result through ``to_print()`` or ``to_csv()``.
    """

    # Path of the file as returned by PathExtractor().get_relative_path.
    relative_path: str = None
    # Populated by parse().
    identifier_list_model: IdentifierListModel = None
    identifier_dictionary_model: IdentifierDictionaryModel = None
    word_dictionary_model: WordDictionaryModel = None

    # Path exactly as handed to the constructor.
    path: str = None
    # Extensions accepted by is_valid().
    supported_extensions: [str] = None
    file_name: str = None
    extension: str = None
    # Created in __init__; its duration is logged at the end of parse().
    timer: Timer = None
    # File content; only assigned by is_valid().
    content: str = None

    def __init__(self, path: str, supported_extensions: [str]):
        """Store the path and derive relative path, file name and extension.

        Args:
            path: Path of the file this model describes.
            supported_extensions: Extensions that ``is_valid`` accepts; they
                are compared against the value returned by
                ``PathExtractor().get_file_extension``.
        """
        self.timer = Timer()
        self.path = path
        self.relative_path = PathExtractor().get_relative_path(path)
        self.supported_extensions = supported_extensions
        self.file_name = PathExtractor().get_file_name(path)
        self.extension = PathExtractor().get_file_extension(self.file_name)

    def to_print(self):
        """Return a dict with the relative path and each sub-model's
        ``to_print()`` output. Requires ``parse()`` to have run."""
        return {
            "relative_path": self.relative_path,
            "identifier_list": self.identifier_list_model.to_print(),
            "identifier_dictionary":
            self.identifier_dictionary_model.to_print(),
            "word_dictionary": self.word_dictionary_model.to_print()
        }

    def to_csv(self):
        """Return the concatenated ``to_csv`` output of every entry in the
        identifier dictionary. Requires ``parse()`` to have run."""
        content = [
            identifier.to_csv(self.relative_path, name) for (name, identifier)
            in self.identifier_dictionary_model.get_dictionary().items()
        ]
        return "".join(content)

    def is_valid(self) -> bool:
        """Load the file content if the extension is supported.

        Returns:
            True when the extension is supported and the loaded content is
            truthy; False otherwise. Unsupported extensions previously fell
            through and returned None; an explicit False is returned now.
        """
        if self.extension not in self.supported_extensions:
            return False
        self.content = FileOpener().get_file_content(self.path)
        return bool(self.content)

    def parse(self):
        """Parse ``self.content`` and build the identifier and word models.

        ``self.content`` is only assigned by ``is_valid()``, so that method
        has to be called first. Semantic metrics are calculated only when
        ``Word2VecModel.instance.exists()`` is truthy.
        """
        Logger().start_analyzing(self.relative_path)
        self.identifier_list_model = LanguageParser().parse_file(
            self.extension, self.content)
        self.identifier_dictionary_model = IdentifierDictionaryModel(
            self.identifier_list_model)
        self.word_dictionary_model = WordDictionaryModel(
            self.identifier_dictionary_model)
        if Word2VecModel.instance.exists():
            self.calculate_semantic_metrics()
        # Runs after the optional semantic step, so the word dictionary
        # handed over here already reflects that step when it was executed.
        self.identifier_dictionary_model.set_word_metrics(
            self.word_dictionary_model.get_dictionary())
        Logger().finish_analyzing(self.timer.get_duration(),
                                  self.relative_path)

    def calculate_semantic_metrics(self):
        """Register class and file context names on the word2vec model, then
        let the word dictionary calculate its semantic metrics."""
        self.set_word2vec_class_name()
        self.set_word2vec_file_context_name()
        self.word_dictionary_model.calculate_semantic_metrics()

    def set_word2vec_class_name(self):
        """Pass the words of all class identifiers to
        ``Word2VecModel.instance.set_class_name``."""
        class_identifiers: [str] = (
            self.identifier_list_model.get_filtered_identfier_names(
                IdentifierType.Class))
        class_identifier_words: [str] = (
            self.identifier_dictionary_model.get_filtered_words(
                class_identifiers))
        Word2VecModel.instance.set_class_name(class_identifier_words)

    def set_word2vec_file_context_name(self):
        """Pass the keys of the word dictionary to
        ``Word2VecModel.instance.set_file_context_name``."""
        file_context_words: [str] = (
            self.word_dictionary_model.get_dictionary_keys())
        Word2VecModel.instance.set_file_context_name(file_context_words)