def store_plots(models: List[TopicsModel], coherence_values: List[float], tsne=True):
    """
    Given a list of models and a list of coherence values, stores the wordcloud plots \
    and the interactive t-SNE html plot in the dir_path of the model with the highest coherence value.

    :param tsne: If True, calculates t-SNE and stores the plot.
    """
    pretty_print('Storing plots')

    # Get the best model using the coherence values
    index_max_coherence_value = coherence_values.index(max(coherence_values))
    best_model = models[index_max_coherence_value]

    # Store the wordcloud plots of only the best model of the list
    plot_word_clouds_of_topics(best_model.get_topics(), save=True, dir_save_path=best_model.dir_path,
                               show_plot=False, dpi=100)

    if tsne:
        # Store the t-SNE plot of only the best model of the list
        tsne_clustering_chart(best_model, save_path=best_model.dir_path, plot_name='tsne.html',
                              show_plot=False)
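# Example usage (a minimal sketch; assumes `lda_models_list` was built as in
# generate_and_store_models() below, which is where store_plots() is actually called):
#
#   lda_models, lda_coherence_values = \
#       lda_models_list.create_models_and_compute_coherence_values(MIN_TOPICS, MAX_TOPICS)
#   store_plots(lda_models, lda_coherence_values)               # wordclouds + t-SNE
#   store_plots(lda_models, lda_coherence_values, tsne=False)   # wordclouds only (e.g. for Mallet)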
def print_some_files(self, n=3, print_file_num=True):
    """
    Prints some text files from the corpus. \
    This function can be used to see how the preprocessing affects the dataset documents.
    """
    category_and_name_list = [
        ('comp.sys.ibm.pc.hardware', '60133'),
        ('sci.space', '59848'),
        ('rec.sport.hockey', '52609')
    ]

    if n > len(category_and_name_list):
        n = len(category_and_name_list)

    for i in range(n):
        if print_file_num:
            pretty_print('File {0}'.format(i + 1))
        doc_index_inside_category = self.get_document_index(*category_and_name_list[i])
        print(textwrap.fill(
            self.files_dict[category_and_name_list[i][0]][doc_index_inside_category].content,
            width=80))
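# Example usage (a minimal sketch, mirroring the calls made in the demo's execute() below):
#
#   dataset = TwentyNewsGroupsDataset()
#   dataset.print_some_files(n=2)                          # prints 2 files with 'File i' headers
#   dataset.print_some_files(n=1, print_file_num=False)    # prints 1 file, no header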
def execute(conf_ini_file_path: str):
    """
    Demo of the library functionality.

    :param conf_ini_file_path: Path to the demo-conf.ini configuration file. \
    This file contains some configuration needed to execute the demo, for example, absolute paths. \
    If the demo is executed with docker, the path to the demo-docker-conf.ini file must be passed instead.
    """
    # region 0. Obtain variables from the configuration file
    # Path to the 20_newsgroups dataset folder
    dataset_path = get_param_value_from_conf_ini_file(conf_ini_file_path, 'DATASETS',
                                                      'TWENTY_NEWS_GROUPS_DIR_PATH')
    # Path to the directory where the 'trigrams_dataset' object folder is stored
    dataset_obj_parent_dir_path = \
        get_param_value_from_conf_ini_file(conf_ini_file_path, 'DATASETS',
                                           'TRIGRAMS_DATASET_OBJECT_PARENT_DIR_PATH')
    # Name of the best LDA Mallet model
    best_lda_mallet_model_name = \
        get_param_value_from_conf_ini_file(conf_ini_file_path, 'MODELS',
                                           'BEST_LDA_MALLET_MODEL_NAME')
    # Path to the directory where the best Mallet model folder (called best_lda_mallet_model_name) is stored
    mallet_model_parent_dir_path = \
        get_param_value_from_conf_ini_file(conf_ini_file_path, 'MODELS',
                                           'BEST_LDA_MALLET_MODEL_PARENT_DIR_PATH')
    # Path to the Mallet source code
    mallet_source_code_path = get_param_value_from_conf_ini_file(conf_ini_file_path, 'MALLET',
                                                                 'SOURCE_CODE_PATH')
    # Path where the GloVe directory is located
    glove_embeddings_path = get_param_value_from_conf_ini_file(conf_ini_file_path, 'EMBEDDINGS',
                                                               'GLOVE_PATH')
    # Path to the directory where the wordcloud images will be saved
    wordcloud_images_dir_save_path = \
        get_param_value_from_conf_ini_file(conf_ini_file_path, 'WORDCLOUD_IMAGES',
                                           'DIRECTORY_PATH')
    # endregion

    # region 1. Load dataset and preprocessing
    pretty_print('1. Load dataset and preprocessing')
    user_input = input('Load previously preprocessed dataset from [d]isk (quick) or '
                       'load dataset and preprocess it in the [m]oment (slow)? (D/m): ')

    if user_input.lower() != 'm':
        # D option
        # Load a preprocessed 20newsgroups dataset object (with trigrams)
        preprocessed_dataset = TwentyNewsGroupsDataset.load(
            'trigrams_dataset',  # name of the dataset object
            parent_dir_path=dataset_obj_parent_dir_path,  # path to the dataset object parent dir
            dataset_path=dataset_path  # path to the dataset files
        )
        pretty_print('One of the files of the preprocessed dataset')
        preprocessed_dataset.print_some_files(n=1, print_file_num=False)
    else:
        # m option
        # Load the 20newsgroups dataset, applying the dataset-specific preprocessing
        # (remove the header, the footer and the quotes of the documents, as specified
        # in the __init__() default parameters).
        dataset = TwentyNewsGroupsDataset()

        # Print some files
        pretty_print('One of the files of the dataset after the dataset-specific preprocessing')
        dataset.print_some_files(n=1, print_file_num=False)

        # Apply the general preprocessing (generating trigrams):
        # normalize, lowercase, remove stopwords, remove emails, ...
        # All this preprocessing and more is applied, as specified in the default parameters
        # of the preprocess_dataset() function.
        preprocessed_dataset = preprocess_dataset(dataset, ngrams='tri')
        pretty_print('One of the files of the dataset after the preprocessing')
        preprocessed_dataset.print_some_files(n=1, print_file_num=False)
    # endregion

    # region 2. Generate LdaGensimModel or load LdaMalletModel
    pretty_print('2. Generate or load a TopicsModel')
    user_input = input('Load a previously generated Lda[M]alletModel (quick op. and better model) or '
                       'generate a Lda[G]ensimModel in the moment (slow op. and worse model)? (M/g): ')
    if user_input.lower() != 'g':
        # M option
        # Load a LdaMalletModel stored on disk (the best model found for this dataset).
        # The load() method also loads the dataset used to generate the model,
        # the preprocessing options, and the docs_topics_df DataFrame
        # (which contains the dominant topic of each document in the dataset).
        model = LdaMalletModel.load(best_lda_mallet_model_name,
                                    model_parent_dir_path=mallet_model_parent_dir_path,
                                    dataset_path=dataset_path,
                                    mallet_path=mallet_source_code_path)
    else:
        # g option
        # Generate a LdaGensimModel using the previously preprocessed dataset
        model = LdaGensimModel(preprocessed_dataset, num_topics=17)
    # endregion

    # region 3. Show topics
    pretty_print('3. Show the topics of the chosen model')
    user_input = input('In which format ([t]ext, [i]mages, [b]oth)? (t/i/B): ')

    text_format = images_format = False
    if user_input.lower() != 't' and user_input.lower() != 'i':
        # B option
        text_format = images_format = True
    elif user_input.lower() == 't':
        text_format = True
    elif user_input.lower() == 'i':
        images_format = True

    if text_format:
        pretty_print('Text format')
        model.print_topics(pretty_format=True)
    if images_format:
        pretty_print('Images')
        print('Images are being saved in the <project-root-path>/demo-images folder')
        # Create a plot with the most important keywords of each topic.
        # Plots are stored in the <project-root-path>/demo-images folder.
        plot_word_clouds_of_topics(
            model.get_topics(num_keywords=15),
            dpi=150,
            show_plot=False,
            save=True,
            dir_save_path=wordcloud_images_dir_save_path
        )
    # endregion

    # region 4. Get the most representative documents of one topic
    pretty_print('4. Show the k most representative documents of topic 16')
    k = input('k value (default is 2): ')
    try:
        k = int(k)
    except ValueError:
        k = 2

    # Obtain a DataFrame with the k most representative documents of topic 16
    k_most_repr_docs_topic16_df = model.get_k_most_repr_docs_of_topic_as_df(topic=16, k=k)

    for i in range(k):
        pretty_print('Document {0}'.format(i + 1))
        # The 'Topic prob' column contains the topic-document probability
        print('Probability: {0}'.format(k_most_repr_docs_topic16_df['Topic prob'][i]))
        pretty_print('Original document content')
        # The 'Original doc text' column contains the original text of the documents
        # (the text of the documents before the general preprocessing)
        print(k_most_repr_docs_topic16_df['Original doc text'][i])
    # endregion

    # region 5. Given a text, predict the topics probability
    pretty_print('5. Given a text, predict the topics probability')
    user_input = input('Use a religion [h]ardcoded text or '
                       'write your [o]wn text? (H/o): ')
    if user_input.lower() != 'o':
        # H option
        text = """The baptism of Jesus is described in the gospels of Matthew, Mark and Luke.
John's gospel does not directly describe Jesus' baptism.
Most modern theologians view the baptism of Jesus by John the Baptist as a historical event
to which a high degree of certainty can be assigned.[1][2][3][4][5] Along with the crucifixion
of Jesus, most biblical scholars view it as one of the two historically certain facts about him,
and often use it as the starting point for the study of the historical Jesus.[6]

The baptism is one of the five major milestones in the gospel narrative of the life of Jesus,
the others being the Transfiguration, Crucifixion, Resurrection, and Ascension.[7][8]
Most Christian denominations view the baptism of Jesus as an important event and a basis
for the Christian rite of baptism (see also Acts 19:1-7).
In Eastern Christianity, Jesus' baptism is commemorated on 6 January (the Julian calendar date
of which corresponds to 19 January on the Gregorian calendar), the feast of Epiphany.[9]
In the Roman Catholic Church, the Anglican Communion, the Lutheran Churches and some other
Western denominations, it is recalled on a day within the following week, the feast of the
baptism of the Lord. In Roman Catholicism, the baptism of Jesus is one of the Luminous Mysteries
sometimes added to the Rosary. It is a Trinitarian feast in the Eastern Orthodox Churches."""
    else:
        # o option
        print('Write your text (when finished, press Enter two times):')
        lines = []
        while True:
            line = input()
            if line:
                lines.append(line)
            else:
                break
        text = '\n'.join(lines)

    pretty_print('Text')
    print(text)

    pretty_print('Text-topics probability')
    # Predict the probability of the text being related with each topic.
    # Instead of storing the returned DataFrame, a table is printed to the standard output.
    model.predict_topic_prob_on_text(text)
    # endregion

    # region 6. Given a text, get the k most related documents
    pretty_print('6. Given a text, get the k most related documents')
    k = input('k value (default is 2): ')
    try:
        k = int(k)
    except ValueError:
        k = 2

    pretty_print('Text')
    print(text)

    # Obtain a DataFrame with the k documents most related to the given text
    related_docs_df = model.get_related_docs_as_df(text, num_docs=k)

    for i in range(k):
        pretty_print('Document {0}'.format(i + 1))
        # The 'Doc prob' column contains the document-text probability
        print('Probability: {0}'.format(related_docs_df['Doc prob'][i]))
        pretty_print('Original document content')
        # The 'Original doc text' column contains the original text of the documents
        # (the text of the documents before the general preprocessing)
        print(related_docs_df['Original doc text'][i])
    # endregion

    # region 7. Summarize a given text
    pretty_print('7. Summarize a given text (get the k best sentences)')
    k = input('k value (default is 2): ')
    try:
        k = int(k)
    except ValueError:
        k = 2

    pretty_print('Text')
    print(text)

    # Create a TextRank model (using GloVe word embeddings)
    pretty_print('Loading the GloVe word embeddings')
    tr = TextRank(embedding_model='glove', embeddings_path=glove_embeddings_path)

    # Use the created model to obtain the k sentences that best summarize the given text
    pretty_print('Generating the summary with the TextRank algorithm')
    summary = tr.get_k_best_sentences_of_text(text, k)

    pretty_print('Summary')
    for i, sent in enumerate(summary):
        if i > 0:
            print()
        print('Sentence {0}: {1}'.format(i + 1, sent))
    # endregion
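# Example invocation (a sketch; the .ini path below is hypothetical and must point to
# your own demo-conf.ini, or to demo-docker-conf.ini when running inside docker):
#
#   execute('/path/to/demo-conf.ini')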
def preprocess_dataset(dataset: StructuredDataset, trash_docs=True, normalize=True, lowercase=True,
                       stopwords=True, contractions=True, vulgar_words=True, emails=True,
                       punctuation=True, ngrams='uni', min_bigrams_count=50, bigrams_threshold=75,
                       min_trigrams_count=100, trigrams_threshold=175, lemmatize=True, stem=False,
                       trash_words=True, apostrophes=True, chars=True, empty_docs=True) \
        -> StructuredDataset:
    """
    Creates a copy of the given dataset and returns the copy with the specified preprocessing applied. \
    The preprocessing options applied (including the ngrams_model_func, if it's the case) are stored \
    in the preprocessing_options attribute of the returned dataset. The original dataset is not modified.

    :param dataset: Dataset to copy and apply preprocessing to.
    :param trash_docs: Remove the specified trash docs. True by default.
    :param normalize: Normalize words. True by default.
    :param lowercase: Transform to lowercase. True by default.
    :param stopwords: Remove stopwords. True by default.
    :param contractions: Expand contractions. True by default.
    :param vulgar_words: Substitute vulgar words. True by default.
    :param emails: Remove emails. True by default.
    :param punctuation: Remove punctuation. True by default.
    :param ngrams: If 'uni', uses unigrams. If 'bi', creates bigrams and stores the bigram function. \
    If 'tri', creates trigrams and stores the trigram function. 'uni' by default.
    :param min_bigrams_count: If ngrams is 'bi' or 'tri', this is the minimum number of occurrences \
    of a bigram for it to be transformed into a bigram.
    :param bigrams_threshold: If ngrams is 'bi' or 'tri', this is the threshold for creating a bigram.
    :param min_trigrams_count: If ngrams is 'tri', this is the minimum number of occurrences \
    of a trigram for it to be transformed into a trigram.
    :param trigrams_threshold: If ngrams is 'tri', this is the threshold for creating a trigram.
    :param lemmatize: Lemmatize words. True by default.
    :param stem: Stem words. False by default.
    :param trash_words: Remove documents that contain any of the 'trash words'. True by default.
    :param apostrophes: Remove apostrophes. True by default.
    :param chars: Remove single chars. True by default.
    :param empty_docs: Remove empty docs. True by default.
    :return: The dataset copy with the preprocessing applied.

    Note that lemmatize and stem shouldn't both be True, because only one of them will be applied.
    """
    # Print the selected options
    pretty_print('Preprocessing the dataset')
    # locals() returns all the local variables of the current function.
    # At the top of the function, the only local variables are its parameters.
    params = locals()
    del params['dataset']  # remove the dataset param from the params dict, because it's not an option
    print('Options selected:')
    for opt, value in params.items():
        print('\t{0}: {1}'.format(opt, value))

    # Create a copy of the dataset to avoid modifying the given dataset
    dataset_copy = deepcopy(dataset)
    ngrams_model_func = None

    if trash_docs:
        remove_trash_docs_specified_in_file(dataset_copy)
    if normalize:
        # TODO: Problem: here we can have 'USA,' and the 'USA' in the .txt file doesn't match it.
        # TODO: Problem: it can only transform single words, so it can't transform 'United States' into 'USA'.
        dataset_copy.apply_function_to_files(normalize_words)
    if lowercase:
        dataset_copy.apply_function_to_files(to_lowercase)
    if stopwords:
        dataset_copy.apply_function_to_files(remove_stopwords)
    if contractions:
        dataset_copy.apply_function_to_files(expand_contractions)
    if vulgar_words:
        dataset_copy.apply_function_to_files(substitute_vulgar_words)
    if emails:
        dataset_copy.apply_function_to_files(remove_emails)
    if punctuation:
        dataset_copy.apply_function_to_files(substitute_punctuation)
    if stopwords:
        # Remove stopwords a second time: expanding contractions and substituting
        # punctuation can uncover stopwords that were not present in the first pass.
        dataset_copy.apply_function_to_files(remove_stopwords)
    if ngrams == 'bi':
        ngrams_model_func = make_bigrams_and_get_bigrams_model_func(
            dataset_copy, min_bigrams_count, bigrams_threshold)
    elif ngrams == 'tri':
        ngrams_model_func = make_trigrams_and_get_trigrams_model_func(
            dataset_copy, min_bigrams_count, bigrams_threshold,
            min_trigrams_count, trigrams_threshold)
    if lemmatize:
        dataset_copy.apply_function_to_files(lemmatize_words)
    elif stem:
        dataset_copy.apply_function_to_files(stem_words)
    if trash_words:
        remove_docs_that_contain_any_of_the_words_in_file(dataset_copy)
    if apostrophes:
        dataset_copy.apply_function_to_files(remove_apostrophes)
    if chars:
        dataset_copy.apply_function_to_files(remove_single_chars)
    if empty_docs:
        remove_empty_docs(dataset_copy)

    # Store the preprocessing options in the dataset copy object
    dataset_copy.preprocessing_options = DatasetPreprocessingOptions(
        normalize=normalize, lowercase=lowercase, stopwords=stopwords, contractions=contractions,
        vulgar_words=vulgar_words, emails=emails, punctuation=punctuation, ngrams=ngrams,
        ngrams_model_func=ngrams_model_func, lemmatize=lemmatize, stem=stem,
        apostrophes=apostrophes, chars=chars)

    return dataset_copy
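# Example usage (a minimal sketch; the calls below mirror the demo and the comparison
# scripts, and the keyword values shown are just the documented defaults):
#
#   dataset = TwentyNewsGroupsDataset()
#   unigrams_dataset = preprocess_dataset(dataset, ngrams='uni')
#   trigrams_dataset = preprocess_dataset(dataset, ngrams='tri',
#                                         min_trigrams_count=100, trigrams_threshold=175)
#   # The applied options (and the trigrams model function) travel with the copy:
#   options = trigrams_dataset.preprocessing_options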
def generate_and_store_models(path, dataset, plot_first_name):
    # region LDA
    pretty_print(plot_first_name + ' LDA')
    lda_path = join_paths(path, 'lda')
    lda_models_list = LdaModelsList(dataset)

    # Create the models, compute their coherence values and store a plot of the coherence values
    pretty_print('Creating models')
    lda_models, lda_coherence_values = \
        lda_models_list.create_models_and_compute_coherence_values(
            MIN_TOPICS, MAX_TOPICS, title=plot_first_name + ' LDA models', save_plot=True,
            save_plot_path=join_paths(lda_path, 'coherence_values.png'))

    # Store the models and a txt file with the coherence value of each model
    pretty_print('Storing models')
    lda_models_list.save(base_name='model', path=lda_path)
    store_plots(lda_models, lda_coherence_values)
    # endregion

    # region LSA
    pretty_print(plot_first_name + ' LSA')
    lsa_path = join_paths(path, 'lsa')
    lsa_models_list = LsaModelsList(dataset)

    # Create the models, compute their coherence values and store a plot of the coherence values
    pretty_print('Creating models')
    lsa_models, lsa_coherence_values = \
        lsa_models_list.create_models_and_compute_coherence_values(
            MIN_TOPICS, MAX_TOPICS, title=plot_first_name + ' LSA models', save_plot=True,
            save_plot_path=join_paths(lsa_path, 'coherence_values.png'))

    # Store the models and a txt file with the coherence value of each model
    pretty_print('Storing models')
    lsa_models_list.save(base_name='model', path=lsa_path)
    # endregion

    # region LDA Mallet
    pretty_print(plot_first_name + ' LDA Mallet')
    lda_mallet_path = join_paths(path, 'lda-mallet')
    lda_mallet_models_list = LdaMalletModelsList(dataset)

    # Create the models, compute their coherence values and store a plot of the coherence values
    pretty_print('Creating models')
    lda_mallet_models, lda_mallet_coherence_values = \
        lda_mallet_models_list.create_models_and_compute_coherence_values(
            MIN_TOPICS, MAX_TOPICS, title=plot_first_name + ' LDA Mallet models', save_plot=True,
            save_plot_path=join_paths(lda_mallet_path, 'coherence_values.png'),
            models_base_name='model', model_path=lda_mallet_path)

    # Store the models and a txt file with the coherence value of each model
    pretty_print('Storing models')
    lda_mallet_models_list.save()

    # t-SNE is skipped here, because predictions with LdaMallet are too slow
    store_plots(lda_mallet_models, lda_mallet_coherence_values, tsne=False)
    # endregion
3.3 Models with LDA Mallet between 10 and 20 topics
"""

# %%
# Load dataset
dataset = TwentyNewsGroupsDataset()

# Topics info for the models
MIN_TOPICS = 10
MAX_TOPICS = 20

BASE_PATH = get_abspath_from_project_source_root('saved-elements/topics/comparison')

# %%
# Unigrams
pretty_print('Unigrams')
unigrams_dataset = preprocess_dataset(dataset, ngrams='uni')
unigrams_path = join_paths(BASE_PATH, 'unigrams')
generate_and_store_models(unigrams_path, unigrams_dataset, 'Unigrams')

# Bigrams
pretty_print('Bigrams')
bigrams_dataset = preprocess_dataset(dataset, ngrams='bi')
bigrams_path = join_paths(BASE_PATH, 'bigrams')
generate_and_store_models(bigrams_path, bigrams_dataset, 'Bigrams')

# Trigrams
pretty_print('Trigrams')
trigrams_dataset = preprocess_dataset(dataset, ngrams='tri')
trigrams_path = join_paths(BASE_PATH, 'trigrams')
generate_and_store_models(trigrams_path, trigrams_dataset, 'Trigrams')
import matplotlib.pyplot as plt
import seaborn as sns

from topics_and_summary.datasets import TwentyNewsGroupsDataset  # exact module path assumed
from topics_and_summary.preprocessing.dataset.structured import preprocess_dataset
from topics_and_summary.utils import pretty_print

if __name__ == '__main__':
    """
    This Python module generates some statistics about the number of words in each document,
    and plots a boxplot of the results.
    """
    dataset = TwentyNewsGroupsDataset()
    dataset = preprocess_dataset(dataset, ngrams='tri')
    df = dataset.as_dataframe()

    # Create a new column with the number of words in each document
    df['num_words'] = df['document'].apply(lambda x: len(x.split()))

    # Obtain statistics on the number of words in each document
    pretty_print('Stats on the number of words in each document')
    print(df['num_words'].describe())

    # Print percentiles
    print()
    print('80th percentile: ', df['num_words'].quantile(0.80))
    print('85th percentile: ', df['num_words'].quantile(0.85))
    print('90th percentile: ', df['num_words'].quantile(0.90))
    print('95th percentile: ', df['num_words'].quantile(0.95))

    # Plot a boxplot of the number of words in each document
    sns.set(style="whitegrid")
    sns.boxplot(x=df['num_words'])
    plt.show()
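    # A sketch of an alternative to plt.show(): persist the boxplot to disk instead
    # (the file name below is illustrative):
    #
    #   plt.savefig('num_words_boxplot.png', dpi=150, bbox_inches='tight')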
The series was originally published in English by two major publishers, Bloomsbury in the
United Kingdom and Scholastic Press in the United States. A play, Harry Potter and the
Cursed Child, based on a story co-written by Rowling, premiered in London on 30 July 2016
at the Palace Theatre, and its script was published by Little, Brown. The original seven
books were adapted into an eight-part namesake film series by Warner Bros. Pictures, which
is the third highest-grossing film series of all time as of February 2018. In 2016, the
total value of the Harry Potter franchise was estimated at $25 billion,[4] making Harry
Potter one of the highest-grossing media franchises of all time.

A series of many genres, including fantasy, drama, coming of age, and the British school
story (which includes elements of mystery, thriller, adventure, horror, and romance), the
world of Harry Potter explores numerous themes and includes many cultural meanings and
references.[5] According to Rowling, the main theme is death.[6] Other major themes in the
series include prejudice, corruption, and madness.[7]

The success of the books and films has allowed the Harry Potter franchise to expand with
numerous derivative works, a travelling exhibition that premiered in Chicago in 2009, a
studio tour in London that opened in 2012, a digital platform on which J.K. Rowling updates
the series with new information and insight, and a pentalogy of spin-off films premiering
in November 2016 with Fantastic Beasts and Where to Find Them, among many other developments.
Most recently, themed attractions, collectively known as The Wizarding World of Harry Potter,
have been built at several Universal Parks & Resorts amusement parks around the world. """

pretty_print('TextRank Word2Vec 300 results')
print_sentences(text_rank_word2vec_300.get_k_best_sentences_of_text(text))

pretty_print('TextRank Glove 100 results')
print_sentences(text_rank_glove_100.get_k_best_sentences_of_text(text))

pretty_print('TextRank Glove 300 results')
print_sentences(text_rank_glove_300.get_k_best_sentences_of_text(text))
from topics_and_summary.preprocessing.dataset.structured import preprocess_dataset
from topics_and_summary.utils import pretty_print, RANDOM_STATE
from topics_and_summary.visualizations import plot_word_clouds_of_topics, tsne_clustering_chart

if __name__ == '__main__':
    """
    This Python module shows some of the functionalities of the library.
    """
    # %%
    # Load the dataset and apply the preprocessing
    dataset = TwentyNewsGroupsDataset()
    dataset = preprocess_dataset(dataset, ngrams='tri')

    # Create the Lda model
    pretty_print('Creating the Lda model')
    model = LdaGensimModel(dataset, num_topics=20, random_state=RANDOM_STATE)

    # Visualize with t-SNE
    tsne_clustering_chart(model)

    # %%
    # Print the topics and the coherence score
    pretty_print('\nTopics')
    NUM_WORDS_EACH_TOPIC_TO_BE_PRINTED = 15
    model.print_topics(NUM_WORDS_EACH_TOPIC_TO_BE_PRINTED)

    coherence_score = model.compute_coherence_value()
    pretty_print('Coherence Score')
    print(coherence_score)
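    # plot_word_clouds_of_topics is imported above but not exercised in this script;
    # a minimal sketch of how it could be used with this model, following the same
    # call pattern as the demo (the save path below is illustrative):
    #
    #   plot_word_clouds_of_topics(model.get_topics(num_keywords=15), dpi=150,
    #                              show_plot=False, save=True, dir_save_path='demo-images')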