def build_language_model(self, word_occurrences_model):
    """
    From a given word occurrences model, build a probability language model.
    """
    start_time = time.time()
    print("Message=\"Starting to train a language model from a word occurrences model\"")
    for word, word_occurrence_node in word_occurrences_model.items():
        word_count = word_occurrence_node.word_count
        self.total_words_count += word_count
        probability_node = self.build_word_occurrence_probability_node(
            word, word_count, word_occurrence_node.children_nodes)
        self.probability_nodes[word] = probability_node
    self.calculate_nodes_probability(self.probability_nodes, self.total_words_count)
    self.sorted_probability_nodes = sort_probability_list(self.probability_nodes.values())
    print("ElapsedTime={}, TotalWordsCount={}, TotalDistinctWordsCount={}, "
          "Message=\"Finished training language model from word occurrences model\""
          .format(get_elapsed_time(start_time), self.total_words_count,
                  len(self.probability_nodes)))
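
# `calculate_nodes_probability` and `sort_probability_list` are project helpers
# defined elsewhere. A minimal sketch of the assumed probability computation,
# shown as a standalone function for clarity (assumptions: nodes expose
# `word_count`, `probability`, and a `children_nodes` dict, and a child's
# probability is conditional on its parent's count):
def calculate_nodes_probability_sketch(probability_nodes, total_count):
    """Assign each node the ratio of its count to the enclosing total, recursively."""
    for node in probability_nodes.values():
        node.probability = node.word_count / total_count
        calculate_nodes_probability_sketch(node.children_nodes, node.word_count)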
def extract_all_sentences_from_files(source_directory):
    """
    Extract all sentences from all files located in a source directory.
    """
    start_time = time.time()
    sentences = list()
    files = os.listdir(source_directory)
    print("FilesCount={}, Message=\"Starting to extract sentences from files\"".format(len(files)))
    for file in files:
        if not file.endswith(".txt"):
            print("FileName={}, Message=\"Ignoring file with invalid structure. "
                  "File extension needs to be .txt for sentences to be loaded\"".format(file))
            continue
        file_path = os.path.join(source_directory, file)
        sentences_file = extract_sentences(file_path)
        sentences.extend(sentences_file)
    print("ElapsedTime={}, FilesCount={}, TotalSentencesCount={}, "
          "Message=\"Finished extracting sentences from files\""
          .format(get_elapsed_time(start_time), len(files), len(sentences)))
    return sentences
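
# `extract_sentences` is defined elsewhere in the project. A naive sketch of
# the assumed behavior, splitting file contents on end-of-sentence punctuation
# (the real implementation may be more elaborate):
def extract_sentences_sketch(file_path):
    """Read a UTF-8 text file and split its contents into sentences."""
    import re
    with open(file_path, encoding="utf-8") as text_file:
        text = text_file.read()
    return [sentence.strip() for sentence in re.split(r"(?<=[.!?])\s+", text) if sentence.strip()]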
def save_language_model(language_model_data, path):
    """
    Save the language model in a pickle file to be later used by the
    language model service.
    """
    start_time = time.time()
    print("Message=\"Starting to save language model\"")
    with open(path, "wb") as pickle_file:
        pickle.dump(language_model_data, pickle_file)
    print("ElapsedTime={}, Message=\"Finished saving language model\"".format(
        get_elapsed_time(start_time)))
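
# A matching loader for the pickle written above (hypothetical helper, not
# part of the original module; shown for symmetry with save_language_model):
def load_language_model(path):
    """Load a previously saved language model from a pickle file."""
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)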
def trim_word_occurrences_model(word_occurrences_model):
    """
    Trim rarely occurring nodes from the word occurrences model to reduce
    its size and cut down noisy data.
    """
    start_time = time.time()
    print("BaseLevelNodes={}, Message=\"Starting to trim base word occurrence model\"".format(
        len(word_occurrences_model)))
    total_nodes, trimmed_nodes = trim_nodes(word_occurrences_model, 10)
    print("ElapsedTime={}, OriginalNodeCount={}, TrimmedNodes={}, UpdatedBaseLevelNodes={}, "
          "Message=\"Finished trimming word occurrence model\""
          .format(get_elapsed_time(start_time), total_nodes, trimmed_nodes,
                  len(word_occurrences_model)))
    return word_occurrences_model
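
# `trim_nodes` is defined elsewhere. A minimal sketch of the behavior assumed
# by the call above: drop nodes observed fewer than `minimum_word_count` times
# and report (total visited, trimmed) counts. Assumptions: the model is a dict
# of Node objects and each Node exposes `word_count` and a `children_nodes` dict.
def trim_nodes_sketch(nodes, minimum_word_count):
    """Recursively remove nodes whose word_count is below the threshold."""
    total_nodes, trimmed_nodes = 0, 0
    for word in list(nodes.keys()):
        node = nodes[word]
        total_nodes += 1
        if node.word_count < minimum_word_count:
            del nodes[word]  # trimming a node also discards its subtree
            trimmed_nodes += 1
            continue
        child_total, child_trimmed = trim_nodes_sketch(node.children_nodes, minimum_word_count)
        total_nodes += child_total
        trimmed_nodes += child_trimmed
    return total_nodes, trimmed_nodes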
def create_word_occurrence_model(n_gram_level, list_of_sentences_as_words):
    """
    Create an n-gram word occurrences model represented as a tree. Only words
    that appear in the dictionary of valid words are considered.
    """
    start_time = time.time()
    print("SentencesCount={}, Message=\"Starting to create base word occurrence model from sentences\"".format(
        len(list_of_sentences_as_words)))
    word_occurrences_model = dict()
    mk_dictionary = get_dictionary_words(settings["dictionary_file_path"])
    for words in list_of_sentences_as_words:
        sentence_word_count = len(words)
        for index, current_word in enumerate(words):
            if current_word not in mk_dictionary:
                continue
            if current_word not in word_occurrences_model:
                word_occurrences_model[current_word] = Node(current_word)
            current_word_combination = word_occurrences_model[current_word]
            current_word_combination.word_count += 1
            for n_gram_step in range(1, n_gram_level):
                if index + n_gram_step >= sentence_word_count:
                    break
                next_word = words[index + n_gram_step]
                if next_word not in mk_dictionary:
                    break
                current_word_combination.add_occurrence(next_word)
                current_word_combination = current_word_combination.get_or_add_node(next_word)
                current_word_combination.word_count += 1
    print("ElapsedTime={}, SentencesCount={}, Message=\"Finished creating base word occurrence model from sentences\""
          .format(get_elapsed_time(start_time), len(list_of_sentences_as_words)))
    return word_occurrences_model
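
# Worked example: with n_gram_level=3 and the single sentence
# ["the", "dog", "barks"] (all three words assumed to be in the dictionary),
# the resulting tree of counts is:
#   "the"   (count 1) -> "dog" (count 1) -> "barks" (count 1)
#   "dog"   (count 1) -> "barks" (count 1)
#   "barks" (count 1)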
def build_word_occurrence_model(list_of_sentences_as_words):
    """
    For a given list of sentences represented as lists of words, build a word
    occurrences model. The depth of the model depends on the setting
    'n_gram_level'. Building the model has two phases:
    1. Create the model using all the sentences.
    2. Trim the model in order to reduce its size and to cut down noisy data.
    The word occurrences model is represented as a dictionary where keys are
    the first-level words and values are Node objects.
    """
    n_gram_level = settings["n_gram_level"]
    start_time = time.time()
    print("SentencesCount={}, Message=\"Starting to build word occurrence model\"".format(
        len(list_of_sentences_as_words)))
    word_occurrences_model = create_word_occurrence_model(n_gram_level, list_of_sentences_as_words)
    word_occurrences_model = trim_word_occurrences_model(word_occurrences_model)
    print("ElapsedTime={}, SentencesCount={}, Message=\"Finished building word occurrence model\""
          .format(get_elapsed_time(start_time), len(list_of_sentences_as_words)))
    return word_occurrences_model
def convert_sentences_to_list_of_words(sentences, names_set, keep_names=False):
    """
    For a given list of sentences, convert each sentence to a list of words.
    Optionally, names and digits can be replaced with special placeholder
    tokens so that they do not carry distinct meanings. Returns a list of
    lists of words, where each word is lower-cased.
    """
    start_time = time.time()
    print("SentencesCount={}, Message=\"Starting to convert sentences to lists of words\"".format(
        len(sentences)))
    sentences_to_words = list()
    for sentence in sentences:
        words = token_to_words(sentence)
        if keep_names:
            sentences_to_words.append(words)
        else:
            sentences_to_words.append([check_word_type(word, names_set) for word in words])
    print("ElapsedTime={}, SentencesCount={}, Message=\"Finished converting sentences to list of words\""
          .format(get_elapsed_time(start_time), len(sentences)))
    return sentences_to_words
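
# `check_word_type` is imported from elsewhere. A minimal sketch of the assumed
# behavior; the placeholder tokens "<name>" and "<number>" are hypothetical:
def check_word_type_sketch(word, names_set):
    """Replace names and digit strings with placeholder tokens; lower-case the rest."""
    if word.lower() in names_set:
        return "<name>"
    if word.isdigit():
        return "<number>"
    return word.lower()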
    sys.exit(0)

start_time = time.time()
print("ModelName=\"{}\", Message=\"Starting language model training\"".format(model_name))
sentences = extract_all_sentences_from_files(settings["text_corpus_directory"])
names_set = read_names(settings["male_names_file_path"], settings["female_names_file_path"])
list_of_sentences_as_words = convert_sentences_to_list_of_words(sentences, names_set)
word_occurrence_model = build_word_occurrence_model(list_of_sentences_as_words)
language_model = LanguageModel()
language_model.build_language_model(word_occurrence_model)
model_data = language_model.prepare_model_data_for_saving()
save_language_model(model_data, model_name)
print("TotalElapsedTime={}, ModelName=\"{}\", Message=\"Finished language model training\""
      .format(get_elapsed_time(start_time), model_name))