def lemmatize_text(file_path: str, timer: Timer):
    """Lemmatize every line of the file at *file_path* with spaCy and append
    the result to ``wiki.en.lemmatized.txt``.

    :param file_path: path to the raw/filtered wiki text, one document per line.
    :param timer: running Timer used only for progress logging.
    """
    logger = Logger()
    output_file = FileOpener().get_new_file("wiki.en.lemmatized.txt", "a")
    try:
        with open(file_path, "r") as file:
            for line in file:
                # Join the lemma of every spaCy token back into one line.
                # (fixed typo: was "lemmazized_line")
                lemmatized_line = " ".join(
                    word.lemma_
                    for word in SpacyModel.instance.get_en_spacy_line(line)
                )
                output_file.write(lemmatized_line)
                logger.every_n_wiki_status(10, timer.get_duration())
        # NOTE(review): called with a single argument here while the in-loop
        # call passes a duration as well — confirm the intended signature.
        logger.every_n_wiki_status(1)
    finally:
        # The output handle was opened manually, so close it even on error.
        output_file.close()
def main():
    """CLI entry point: remove stopwords from the raw wiki text file."""
    script_name: str = PathExtractor().get_file_name(sys.argv[0])
    # Guard clause: exactly one positional argument is required.
    if len(sys.argv) != 2:
        Logger().usage(f'python {script_name} <wiki.en.raw.txt>')
        return
    file_path = sys.argv[1]
    # Guard clause: bail out silently if the validator rejects the path
    # (the validator does its own error reporting).
    if not PathValidator().is_valid_files([file_path]):
        return
    Logger().info(f'Input file: "{file_path}"')
    Logger().info("Starting to remove stopwords")
    timer = Timer()
    remove_stopwords(file_path)
    Logger().finish_script(timer.get_duration(), script_name)
def main():
    """CLI entry point: lemmatize the filtered wiki text file."""
    script_name: str = PathExtractor().get_file_name(sys.argv[0])
    # Guard clause: exactly one positional argument is required.
    if len(sys.argv) != 2:
        Logger().usage(f"python {script_name} <wiki.en.filtered.txt>")
        return
    file_path = sys.argv[1]
    # Guard clause: the validator reports its own errors on failure.
    if not PathValidator().is_valid_files([file_path]):
        return
    Logger().info(f'Input file: "{file_path}"')
    Logger().info("Starting to lemmatize text")
    timer = Timer()
    lemmatize_text(file_path, timer)
    Logger().finish_script(timer.get_duration(), script_name)
def main():
    """CLI entry point: build the wiki corpus from a compressed dump."""
    script_name: str = PathExtractor().get_file_name(sys.argv[0])
    # Guard clause: exactly one positional argument is required.
    if len(sys.argv) != 2:
        Logger().usage(
            f'python {script_name} <en.wiki-latest-pages-articles.xml.bz2>')
        return
    file_path: str = sys.argv[1]
    # Guard clause: the validator reports its own errors on failure.
    if not PathValidator().is_valid_files([file_path]):
        return
    Logger().info(f'Input file: "{file_path}"')
    Logger().info(f'Starting to create wiki corpus from "{file_path}"')
    timer = Timer()
    get_corpus(file_path)
    Logger().finish_script(timer.get_duration(), script_name)
def main():
    """CLI entry point: parse a file or directory of source code, optionally
    loading a word2vec model first.

    Usage: ``python <script> <file_or_directory_path> [<word2vec.model>]``
    """
    script_name: str = PathExtractor().get_file_name(sys.argv[0])
    # Validate the argument count BEFORE touching sys.argv[2]; the original
    # consumed the optional model argument first and only then checked arity.
    if len(sys.argv) < 2 or len(sys.argv) > 3:
        Logger().usage(
            f"python {script_name} <file_or_directory_path> [<word2vec.model>]"
        )
        return
    timer = Timer()
    if len(sys.argv) == 3:
        # Optional second argument: a pre-trained word2vec model.
        model_path = PathExtractor().get_absolute_path(sys.argv[2])
        if not PathValidator().is_valid_files([model_path]):
            return
        Word2VecModel.instance.set_model(model_path)
    project_path = PathExtractor().get_absolute_path(sys.argv[1])
    if PathValidator().is_valid_paths([project_path]):
        parse(project_path)
        Logger().finish_script(timer.get_duration(), script_name)
class FileModel():
    """One source file under analysis: holds its path metadata, raw content,
    and the identifier/word models produced by parsing it."""

    # Declarations below follow the project's existing annotation style.
    relative_path: str = None
    identifier_list_model: IdentifierListModel = None
    identifier_dictionary_model: IdentifierDictionaryModel = None
    word_dictionary_model: WordDictionaryModel = None
    path: str = None
    supported_extensions: [str] = None
    file_name: str = None
    extension: str = None
    timer: Timer = None
    content: str = None

    def __init__(self, path: str, supported_extensions: [str]):
        """Record path-derived metadata; no file I/O happens here.

        :param path: absolute path of the file.
        :param supported_extensions: extensions this analysis accepts.
            (fixed annotation: was ``[int]``, inconsistent with the class
            attribute's ``[str]``)
        """
        self.timer = Timer()
        self.path = path
        self.relative_path = PathExtractor().get_relative_path(path)
        self.supported_extensions = supported_extensions
        self.file_name = PathExtractor().get_file_name(path)
        self.extension = PathExtractor().get_file_extension(self.file_name)

    def to_print(self):
        """Return a dict representation suitable for human-readable output."""
        return {
            "relative_path": self.relative_path,
            "identifier_list": self.identifier_list_model.to_print(),
            "identifier_dictionary":
                self.identifier_dictionary_model.to_print(),
            "word_dictionary": self.word_dictionary_model.to_print()
        }

    def to_csv(self):
        """Return all identifiers as one concatenated CSV string."""
        content = [
            identifier.to_csv(self.relative_path, name)
            for (name, identifier) in
            self.identifier_dictionary_model.get_dictionary().items()
        ]
        return "".join(content)

    def is_valid(self):
        """Return True iff the extension is supported AND the file has
        non-empty content. Loads the content as a side effect.

        Fixed: the original fell off the end and returned ``None`` (not an
        explicit ``False``) for unsupported extensions.
        """
        if self.extension not in self.supported_extensions:
            return False
        self.content = FileOpener().get_file_content(self.path)
        return bool(self.content)

    def parse(self):
        """Parse the file content into identifier and word models, then
        compute metrics (semantic metrics only if a word2vec model is set)."""
        Logger().start_analyzing(self.relative_path)
        self.identifier_list_model = LanguageParser().parse_file(
            self.extension, self.content)
        self.identifier_dictionary_model = IdentifierDictionaryModel(
            self.identifier_list_model)
        self.word_dictionary_model = WordDictionaryModel(
            self.identifier_dictionary_model)
        if Word2VecModel.instance.exists():
            self.calculate_semantic_metrics()
        self.identifier_dictionary_model.set_word_metrics(
            self.word_dictionary_model.get_dictionary())
        Logger().finish_analyzing(self.timer.get_duration(),
                                  self.relative_path)

    def calculate_semantic_metrics(self):
        """Push class/file context into the word2vec model, then compute
        per-word semantic metrics."""
        self.set_word2vec_class_name()
        self.set_word2vec_file_context_name()
        self.word_dictionary_model.calculate_semantic_metrics()

    def set_word2vec_class_name(self):
        """Feed the words of all class-level identifiers to the word2vec
        model as the "class name" context."""
        # NOTE(review): "identfier" is misspelled in the external API of
        # IdentifierListModel; kept as-is so the call still resolves.
        class_identifiers: [
            str
        ] = self.identifier_list_model.get_filtered_identfier_names(
            IdentifierType.Class)
        class_identifier_words: [
            str
        ] = self.identifier_dictionary_model.get_filtered_words(
            class_identifiers)
        Word2VecModel.instance.set_class_name(class_identifier_words)

    def set_word2vec_file_context_name(self):
        """Feed every distinct word in the file to the word2vec model as the
        file-level context."""
        file_context_words: [
            str
        ] = self.word_dictionary_model.get_dictionary_keys()
        Word2VecModel.instance.set_file_context_name(file_context_words)