def store_plots(models: List[TopicsModel],
                coherence_values: List[float],
                tsne=True):
    """
    Given a list of models and a list of coherence values, stores the plots of the wordclouds \
    and the tsne html interactive plot in the dir_path of the model with max coherence value.

    :param tsne: If true, calculates tsne and stores plot.
    """
    pretty_print('Storing plots')

    # Get the best model using the coherence value
    index_max_coherence_value = coherence_values.index(max(coherence_values))
    best_model = models[index_max_coherence_value]

    # Store the word cloud plots of only the best model of the list
    plot_word_clouds_of_topics(best_model.get_topics(),
                               save=True,
                               dir_save_path=best_model.dir_path,
                               show_plot=False,
                               dpi=100)

    if tsne:
        # Store the t-SNE plot of only the best model of the list
        tsne_clustering_chart(best_model,
                              save_path=best_model.dir_path,
                              plot_name='tsne.html',
                              show_plot=False)
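
# A minimal usage sketch for store_plots; `models_list` is an illustrative name for
# an already created models list (e.g. an LdaModelsList, as used later in this file):
#
#     models, coherence_values = \
#         models_list.create_models_and_compute_coherence_values(10, 20)
#     store_plots(models, coherence_values, tsne=False)  # tsne=False skips the slow t-SNE step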
    def print_some_files(self, n=3, print_file_num=True):
        """
        Prints some text files from the corpus. \
        This function can be used to see how the preprocessing affects the dataset documents.
        """
        category_and_name_list = [('comp.sys.ibm.pc.hardware', '60133'),
                                  ('sci.space', '59848'),
                                  ('rec.sport.hockey', '52609')]

        if n > len(category_and_name_list):
            n = len(category_and_name_list)

        for i in range(n):
            if print_file_num:
                pretty_print('File {0}'.format(i + 1))

            doc_index_inside_category = self.get_document_index(
                *category_and_name_list[i])
            print(
                textwrap.fill(self.files_dict[category_and_name_list[i][0]]
                              [doc_index_inside_category].content,
                              width=80))
def execute(conf_ini_file_path: str):
    """
    Demo of the library functionality.

    :param conf_ini_file_path: Path to the demo-conf.ini configuration file. \
    This file contains some configuration to execute the demo, for example, absolute paths. \
    If the demo is executed with docker, the path to the demo-docker-conf.ini must be passed instead.
    """

    # region 0. Obtain variables from configuration file
    # Path to the 20_newsgroups dataset folder.
    dataset_path = get_param_value_from_conf_ini_file(conf_ini_file_path, 'DATASETS', 'TWENTY_NEWS_GROUPS_DIR_PATH')
    # Path to the directory where the 'trigrams_dataset' object folder is stored.
    dataset_obj_parent_dir_path = \
        get_param_value_from_conf_ini_file(conf_ini_file_path, 'DATASETS', 'TRIGRAMS_DATASET_OBJECT_PARENT_DIR_PATH')

    # Name of the best lda mallet model
    best_lda_mallet_model_name = \
        get_param_value_from_conf_ini_file(conf_ini_file_path, 'MODELS', 'BEST_LDA_MALLET_MODEL_NAME')
    # Path to the directory where the best mallet model folder (called best_lda_mallet_model_name) is stored.
    mallet_model_parent_dir_path = \
        get_param_value_from_conf_ini_file(conf_ini_file_path, 'MODELS', 'BEST_LDA_MALLET_MODEL_PARENT_DIR_PATH')

    # Path to the mallet source code.
    mallet_source_code_path = get_param_value_from_conf_ini_file(conf_ini_file_path, 'MALLET', 'SOURCE_CODE_PATH')

    # Path where the glove directory is located.
    glove_embeddings_path = get_param_value_from_conf_ini_file(conf_ini_file_path, 'EMBEDDINGS', 'GLOVE_PATH')

    # Path to the directory where the wordcloud images will be saved.
    wordcloud_images_dir_save_path = \
        get_param_value_from_conf_ini_file(conf_ini_file_path, 'WORDCLOUD_IMAGES', 'DIRECTORY_PATH')
    # endregion

    # region 1. Load dataset and preprocessing
    pretty_print('1. Load dataset and preprocessing')

    user_input = input('Load a previously preprocessed dataset from [d]isk (quick) or '
                       'load the dataset and preprocess it in the [m]oment (slow)? (D/m): ')
    if user_input.lower() != 'm':  # D option
        # Load a preprocessed 20newsgroups dataset object (with trigrams)
        preprocessed_dataset = TwentyNewsGroupsDataset.load(
            'trigrams_dataset',  # name of the dataset object
            parent_dir_path=dataset_obj_parent_dir_path,  # path to dataset obj parent dir
            dataset_path=dataset_path  # path to the dataset files
        )
        pretty_print("One of the files of the preprocessed dataset")
        preprocessed_dataset.print_some_files(n=1, print_file_num=False)
    else:  # m option
        # Load the 20newsgroups dataset, applying the dataset-specific preprocessing
        # (remove the header, footer and quotes of the documents, as specified
        # in the __init__() default parameters).
        dataset = TwentyNewsGroupsDataset()

        # Prints some files
        pretty_print("One of the files of the dataset after the dataset specific preprocessing")
        dataset.print_some_files(n=1, print_file_num=False)

        # Applies general preprocessing (generating trigrams):
        #   Normalize, lowercase, remove stopwords, remove emails, ...
        #   All this preprocessing and more is applied, as specified in the default parameters
        #   of the preprocess_dataset() function.
        preprocessed_dataset = preprocess_dataset(dataset, ngrams='tri')
        pretty_print("One of the files of the dataset after the preprocessing")
        preprocessed_dataset.print_some_files(n=1, print_file_num=False)
    # endregion

    # region 2. Generate LdaGensimModel or load LdaMalletModel
    pretty_print('2. Generate or load a TopicsModel')

    user_input = input(
        'Load a previously generated Lda[M]alletModel (quick op. and better model) or '
        'generate a Lda[G]ensimModel in the moment (slow op. and worse model)? (M/g): '
    )
    if user_input.lower() != 'g':  # M option
        # Load a LdaMalletModel stored on disk (the best model found for this dataset)
        # The load() method also loads the dataset used to generate the model,
        # the preprocessing options, and the docs_topics_df DataFrame
        # (contains the dominant topic of each document in the dataset).
        model = LdaMalletModel.load(best_lda_mallet_model_name,
                                    model_parent_dir_path=mallet_model_parent_dir_path,
                                    dataset_path=dataset_path,
                                    mallet_path=mallet_source_code_path)
    else:  # g option
        # Generate a LdaGensimModel using the previously preprocessed dataset
        model = LdaGensimModel(preprocessed_dataset, num_topics=17)
    # endregion

    # region 3. Show topics
    pretty_print('3. Show the topics of the chosen model')

    user_input = input('In which format ([t]ext, [i]mages, [b]oth)? (t/i/B): ')

    text_format = images_format = False
    if user_input.lower() != 't' and user_input.lower() != 'i':  # B option
        text_format = images_format = True
    elif user_input.lower() == 't':
        text_format = True
    elif user_input.lower() == 'i':
        images_format = True

    if text_format:
        pretty_print('Text format')
        model.print_topics(pretty_format=True)
    if images_format:
        pretty_print('Images')
        print('Images are being saved in the <project-root-path>/demo-images folder')
        # Create a plot with the most important keywords in each topic.
        # Plots are stored in the <project-root-path>/demo-images folder.
        plot_word_clouds_of_topics(
            model.get_topics(num_keywords=15), dpi=150, show_plot=False, save=True,
            dir_save_path=wordcloud_images_dir_save_path
        )
    # endregion

    # region 4. Get the most representative documents of one topic
    pretty_print('4. Show the k most representative documents of topic 16')

    k = input('k value (default is 2):')
    try:
        k = int(k)
    except ValueError:
        k = 2

    # Obtain a DataFrame with the k most representative documents of topic 16
    k_most_repr_docs_topic16_df = model.get_k_most_repr_docs_of_topic_as_df(topic=16, k=k)

    for i in range(k):
        pretty_print('Document {0}'.format(i + 1))
        # The 'Topic prob' column contains the topic-document probability
        print('Probability: {0}'.format(k_most_repr_docs_topic16_df['Topic prob'][i]))

        pretty_print('Original document content')
        # The 'Original doc text' column contains the original text of the documents
        # (the text of the documents before the general preprocessing)
        print(k_most_repr_docs_topic16_df['Original doc text'][i])
    # endregion

    # region 5. Given a text, predict the topic probabilities
    pretty_print('5. Given a text, predict the topic probabilities')

    user_input = input('Use a religion [h]ardcoded text or '
                       'write your [o]wn text? (H/o): ')

    if user_input.lower() != 'o':  # H option
        text = """The baptism of Jesus is described in the gospels of Matthew, Mark and Luke. 
John's gospel does not directly describe Jesus' baptism. Most modern theologians view the 
baptism of Jesus by John the Baptist as a historical event to which a high degree of 
certainty can be assigned.[1][2][3][4][5] Along with the crucifixion of Jesus, most biblical 
scholars view it as one of the two historically certain facts about him, and often use it
as the starting point for the study of the historical Jesus.[6] 
The baptism is one of the five major milestones in the gospel narrative of the life of Jesus, 
the others being the Transfiguration, Crucifixion, Resurrection, and Ascension.[7][8] 
Most Christian denominations view the baptism of Jesus as an important event and a basis for 
the Christian rite of baptism (see also Acts 19:1–7). In Eastern Christianity, Jesus' baptism 
is commemorated on 6 January (the Julian calendar date of which corresponds to 19 January on 
the Gregorian calendar), the feast of Epiphany.[9] In the Roman Catholic Church, the Anglican
Communion, the Lutheran Churches and some other Western denominations, it is recalled on a day 
within the following week, the feast of the baptism of the Lord. In Roman Catholicism, 
the baptism of Jesus is one of the Luminous Mysteries sometimes added to the Rosary.
It is a Trinitarian feast in the Eastern Orthodox Churches."""

    else:  # o option
        print('Write your text (when finished, press Enter two times):')
        lines = []
        while True:
            line = input()
            if line:
                lines.append(line)
            else:
                break
        text = '\n'.join(lines)

    pretty_print('Text')
    print(text)

    pretty_print('Text-topics probability')
    # Predict the probability of the text being related to each topic.
    # Instead of storing the returned DataFrame, a table is printed to the standard output.
    model.predict_topic_prob_on_text(text)
    # endregion

    # region 6. Given a text, get k most related documents
    pretty_print('6. Given a text, get k most related documents')

    k = input('k value (default is 2):')
    try:
        k = int(k)
    except ValueError:
        k = 2

    pretty_print('Text')
    print(text)

    # Obtain a DataFrame with the k documents most related to the given text
    related_docs_df = model.get_related_docs_as_df(text, num_docs=k)

    for i in range(k):
        pretty_print('Document {0}'.format(i + 1))
        # The 'Doc prob' column contains the document-text probability
        print('Probability: {0}'.format(related_docs_df['Doc prob'][i]))

        pretty_print('Original document content')
        # The 'Original doc text' column contains the original text of the documents
        # (the text of the documents before the general preprocessing)
        print(related_docs_df['Original doc text'][i])
    # endregion

    # region 7. Summarize a given text
    pretty_print('7. Summarize a given text (get k best sentences)')

    k = input('k value (default is 2):')
    try:
        k = int(k)
    except ValueError:
        k = 2

    pretty_print('Text')
    print(text)

    # Create a TextRank model (using Glove word embeddings)
    pretty_print('Loading the Glove word embeddings')
    tr = TextRank(embedding_model='glove', embeddings_path=glove_embeddings_path)

    # Use the created model to obtain the k sentences that best summarize the given text
    pretty_print('Generating the summary with the Text Rank algorithm')
    pretty_print('Summary')
    summary = tr.get_k_best_sentences_of_text(text, k)

    for i, sent in enumerate(summary):
        if i > 0:
            print()
        print('Sentence {0}: {1}'.format(i + 1, sent))
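
# A hedged invocation sketch for execute(); the .ini path below is a placeholder,
# not a real file from this project:
#
#     if __name__ == '__main__':
#         execute('/absolute/path/to/demo-conf.ini')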
def preprocess_dataset(dataset: StructuredDataset, trash_docs=True, normalize=True, lowercase=True, stopwords=True,
                       contractions=True, vulgar_words=True, emails=True, punctuation=True, ngrams='uni',
                       min_bigrams_count=50, bigrams_threshold=75, min_trigrams_count=100, trigrams_threshold=175,
                       lemmatize=True, stem=False, trash_words=True, apostrophes=True, chars=True, empty_docs=True) \
        -> StructuredDataset:
    """
    Creates a copy of the given dataset and returns the dataset copy with the specified preprocessing applied. \
    The preprocessing options applied (including the ngrams_model_func if it's the case) are stored in the
    preprocessing_options attribute of the returned dataset. The original dataset is not modified.

    :param min_bigrams_count: If ngrams is 'bi' or 'tri', this is the minimum number of occurrences \
    of a bigram to be transformed as a bigram.
    :param bigrams_threshold: If ngrams is 'bi' or 'tri', this is the threshold for creating a bigram.
    :param min_trigrams_count: If ngrams is 'tri', this is the minimum number of occurrences \
    of a trigram to be transformed as a trigram.
    :param trigrams_threshold: If ngrams is 'tri', this is the threshold for creating a trigram.
    :param dataset: Dataset to copy and apply preprocessing.
    :param trash_docs: Remove specified docs. By default is True.
    :param normalize: Normalize words. By default is True.
    :param lowercase: Transform to lowercase. By default is True.
    :param stopwords: Remove stopwords. By default is True.
    :param contractions: Expand contractions. By default is True.
    :param vulgar_words: Substitute vulgar words. By default is True.
    :param emails: Remove emails. By default is True.
    :param punctuation: Remove punctuation. By default is True.
    :param ngrams: If 'uni' uses unigrams. If 'bi' create bigrams and returns bigram function. \
    If 'tri' creates trigrams and returns trigram function. By default is 'uni'.
    :param lemmatize: Lemmatize words. By default is True.
    :param stem: Stemm words. By default is False.
    :param trash_words: Remove documents with any of the 'trash words'. By default is True.
    :param apostrophes: Remove apostrophes.
    :param chars: Remove single chars. By default is True.
    :param empty_docs: Remove empty docs. By default is True.
    :return: The dataset with the preprocessing applied.

    Note that lemmatize and stem shouldn't be both True, because only one of them will be applied.
    """

    # Print the options selected
    pretty_print('Preprocessing the dataset')
    # locals() returns all the local variables in the current function.
    # At the top of the function the only local variables are the parameters to the function.
    params = locals()
    # Remove the dataset param from the params dict, because it's not a preprocessing option
    del params['dataset']
    print('Options selected:')
    for opt, value in params.items():
        print('\t{0}: {1}'.format(opt, value))

    # Create a copy of the dataset to avoid modifying the given dataset
    dataset_copy = deepcopy(dataset)

    ngrams_model_func = None

    if trash_docs:
        remove_trash_docs_specified_in_file(dataset_copy)
    if normalize:
        # TODO: Problem: here we can have 'USA,' (with punctuation attached), which doesn't
        #  match the 'USA' entry in the .txt file.
        # TODO: Problem: it can only transform single words, so it can't transform
        #  multi-word expressions such as 'United States' into 'USA'.
        dataset_copy.apply_function_to_files(normalize_words)
    if lowercase:
        dataset_copy.apply_function_to_files(to_lowercase)
    if stopwords:
        dataset_copy.apply_function_to_files(remove_stopwords)
    if contractions:
        dataset_copy.apply_function_to_files(expand_contractions)
    if vulgar_words:
        dataset_copy.apply_function_to_files(substitute_vulgar_words)
    if emails:
        dataset_copy.apply_function_to_files(remove_emails)
    if punctuation:
        dataset_copy.apply_function_to_files(substitute_punctuation)
    if stopwords:
        # Remove stopwords a second time: expanding contractions and substituting
        # punctuation can reintroduce stopword tokens.
        dataset_copy.apply_function_to_files(remove_stopwords)
    if ngrams == 'bi':
        ngrams_model_func = make_bigrams_and_get_bigrams_model_func(
            dataset_copy, min_bigrams_count, bigrams_threshold)
    elif ngrams == 'tri':
        ngrams_model_func = make_trigrams_and_get_trigrams_model_func(
            dataset_copy, min_bigrams_count, bigrams_threshold,
            min_trigrams_count, trigrams_threshold)
    if lemmatize:
        dataset_copy.apply_function_to_files(lemmatize_words)
    elif stem:
        dataset_copy.apply_function_to_files(stem_words)
    if trash_words:
        remove_docs_that_contain_any_of_the_words_in_file(dataset_copy)
    if apostrophes:
        dataset_copy.apply_function_to_files(remove_apostrophes)
    if chars:
        dataset_copy.apply_function_to_files(remove_single_chars)
    if empty_docs:
        remove_empty_docs(dataset_copy)

    # Store the preprocessing options in the dataset copy object
    dataset_copy.preprocessing_options = DatasetPreprocessingOptions(
        normalize=normalize,
        lowercase=lowercase,
        stopwords=stopwords,
        contractions=contractions,
        vulgar_words=vulgar_words,
        emails=emails,
        punctuation=punctuation,
        ngrams=ngrams,
        ngrams_model_func=ngrams_model_func,
        lemmatize=lemmatize,
        stem=stem,
        apostrophes=apostrophes,
        chars=chars)

    return dataset_copy
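
# A minimal usage sketch for preprocess_dataset (the threshold values below are
# illustrative, not recommendations):
#
#     dataset = TwentyNewsGroupsDataset()
#     # Trigrams with explicit thresholds; lemmatize and stem are mutually
#     # exclusive, so only lemmatization is applied here.
#     preprocessed = preprocess_dataset(dataset, ngrams='tri',
#                                       min_trigrams_count=100, trigrams_threshold=175,
#                                       lemmatize=True, stem=False)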
def generate_and_store_models(path, dataset, plot_first_name):
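    """
    Generates LDA, LSA and LDA Mallet models for the given dataset (with between
    MIN_TOPICS and MAX_TOPICS topics), and stores each list of models, a
    coherence-values plot per list, and (for LDA and LDA Mallet) the plots of
    the best model, under the given path.
    """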
    # region LDA
    pretty_print(plot_first_name + ' LDA')

    lda_path = join_paths(path, 'lda')
    lda_models_list = LdaModelsList(dataset)
    # Create models, compute coherence values and store a plot with the coherence values
    pretty_print('Creating models')
    lda_models, lda_coherence_values = \
        lda_models_list.create_models_and_compute_coherence_values(MIN_TOPICS, MAX_TOPICS,
                                                                   title=plot_first_name + ' LDA models',
                                                                   save_plot=True,
                                                                   save_plot_path=join_paths(lda_path,
                                                                                             'coherence_values.png'))
    # Store the models and a txt file with the coherence value of each model
    pretty_print('Storing models')
    lda_models_list.save(base_name='model', path=lda_path)
    store_plots(lda_models, lda_coherence_values)
    # endregion

    # region LSA
    pretty_print(plot_first_name + ' LSA')

    lsa_path = join_paths(path, 'lsa')
    lsa_models_list = LsaModelsList(dataset)
    # Create models, compute coherence values and store a plot with the coherence values
    pretty_print('Creating models')
    lsa_models, lsa_coherence_values = \
        lsa_models_list.create_models_and_compute_coherence_values(MIN_TOPICS, MAX_TOPICS,
                                                                   title=plot_first_name + ' LSA models',
                                                                   save_plot=True,
                                                                   save_plot_path=join_paths(lsa_path,
                                                                                             'coherence_values.png'))
    # Store the models and a txt file with the coherence value of each model
    pretty_print('Storing models')
    lsa_models_list.save(base_name='model', path=lsa_path)
    # endregion

    # region LDA Mallet
    pretty_print(plot_first_name + ' LDA Mallet')

    lda_mallet_path = join_paths(path, 'lda-mallet')
    lda_mallet_models_list = LdaMalletModelsList(dataset)
    # Create models, compute coherence values and store a plot with the coherence values
    pretty_print('Creating models')
    lda_mallet_models, lda_mallet_coherence_values = \
        lda_mallet_models_list.create_models_and_compute_coherence_values(MIN_TOPICS, MAX_TOPICS,
                                                                          title=plot_first_name + ' LDA Mallet models',
                                                                          save_plot=True,
                                                                          save_plot_path=join_paths(lda_mallet_path,
                                                                                                    'coherence_values.png'),
                                                                          models_base_name='model',
                                                                          model_path=lda_mallet_path)
    # Store the models and a txt file with the coherence value of each model
    pretty_print('Storing models')
    lda_mallet_models_list.save()
    # t-SNE is not calculated here, because predictions with LdaMallet are too slow
    store_plots(lda_mallet_models, lda_mallet_coherence_values, tsne=False)
if __name__ == '__main__':
    """
    This Python module generates and stores LDA, LSA and LDA Mallet models
    (with between 10 and 20 topics) for the unigrams, bigrams and trigrams
    versions of the 20 Newsgroups dataset.
    """

    # %%
    # Load dataset
    dataset = TwentyNewsGroupsDataset()

    # Topics info for the models
    MIN_TOPICS = 10
    MAX_TOPICS = 20
    BASE_PATH = get_abspath_from_project_source_root(
        'saved-elements/topics/comparison')

    # %%
    # Unigrams
    pretty_print('Unigrams')
    unigrams_dataset = preprocess_dataset(dataset, ngrams='uni')
    unigrams_path = join_paths(BASE_PATH, 'unigrams')

    generate_and_store_models(unigrams_path, unigrams_dataset, 'Unigrams')

    # Bigrams
    pretty_print('Bigrams')
    bigrams_dataset = preprocess_dataset(dataset, ngrams='bi')
    bigrams_path = join_paths(BASE_PATH, 'bigrams')

    generate_and_store_models(bigrams_path, bigrams_dataset, 'Bigrams')

    # Trigrams
    pretty_print('Trigrams')
    trigrams_dataset = preprocess_dataset(dataset, ngrams='tri')
    trigrams_path = join_paths(BASE_PATH, 'trigrams')

    generate_and_store_models(trigrams_path, trigrams_dataset, 'Trigrams')
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE: the TwentyNewsGroupsDataset import path below is assumed from the package layout
from topics_and_summary.datasets.twenty_news_groups import TwentyNewsGroupsDataset
from topics_and_summary.preprocessing.dataset.structured import preprocess_dataset
from topics_and_summary.utils import pretty_print

if __name__ == '__main__':
    """
    This Python module generates some statistics about the number of words in each document,
    and plot a boxplot of the results.
    """

    dataset = TwentyNewsGroupsDataset()
    dataset = preprocess_dataset(dataset, ngrams='tri')
    df = dataset.as_dataframe()

    # Create a new column with the number of words in each document
    df['num_words'] = df['document'].apply(lambda x: len(x.split()))

    # Obtain statistics on the number of words in each document
    pretty_print('Stats on the number of words in each document')
    print(df['num_words'].describe())

    # Print percentiles
    print()
    print('80th percentile: ', df['num_words'].quantile(0.80))
    print('85th percentile: ', df['num_words'].quantile(0.85))
    print('90th percentile: ', df['num_words'].quantile(0.90))
    print('95th percentile: ', df['num_words'].quantile(0.95))

    # Plot a boxplot of the num words in each document
    sns.set(style="whitegrid")
    sns.boxplot(x=df['num_words'])
    plt.show()

    text = """
    The series was originally published in English by two major publishers, Bloomsbury in the United Kingdom and 
    Scholastic Press in the United States. A play, Harry Potter and the Cursed Child, based on a story co-written by 
    Rowling, premiered in London on 30 July 2016 at the Palace Theatre, and its script was published by Little, Brown. 
    The original seven books were adapted into an eight-part namesake film series by Warner Bros. Pictures, which is 
    the third highest-grossing film series of all time as of February 2018. In 2016, the total value of the Harry 
    Potter franchise was estimated at $25 billion,[4] making Harry Potter one of the highest-grossing media franchises 
    of all time.
    
    A series of many genres, including fantasy, drama, coming of age, and the British school story (which includes 
    elements of mystery, thriller, adventure, horror, and romance), the world of Harry Potter explores numerous themes 
    and includes many cultural meanings and references.[5] According to Rowling, the main theme is death.[6] Other major 
    themes in the series include prejudice, corruption, and madness.[7]
    
    The success of the books and films has allowed the Harry Potter franchise to expand with numerous derivative works, 
    a travelling exhibition that premiered in Chicago in 2009, a studio tour in London that opened in 2012, a digital 
    platform on which J.K. Rowling updates the series with new information and insight, and a pentalogy of spin-off 
    films premiering in November 2016 with Fantastic Beasts and Where to Find Them, among many other developments. 
    Most recently, themed attractions, collectively known as The Wizarding World of Harry Potter, have been built at 
    several Universal Parks & Resorts amusement parks around the world.
    """

    pretty_print('TextRank Word2Vec 300 results')
    print_sentences(text_rank_word2vec_300.get_k_best_sentences_of_text(text))

    pretty_print('TextRank Glove 100 results')
    print_sentences(text_rank_glove_100.get_k_best_sentences_of_text(text))

    pretty_print('TextRank Glove 300 results')
    print_sentences(text_rank_glove_300.get_k_best_sentences_of_text(text))
# NOTE: the first two import paths below are assumed from the package layout
from topics_and_summary.datasets.twenty_news_groups import TwentyNewsGroupsDataset
from topics_and_summary.models.topics import LdaGensimModel
from topics_and_summary.preprocessing.dataset.structured import preprocess_dataset
from topics_and_summary.utils import pretty_print, RANDOM_STATE
from topics_and_summary.visualizations import plot_word_clouds_of_topics, tsne_clustering_chart

if __name__ == '__main__':
    """
    This Python module shows some of the functionalities of the library.
    """
    # %%

    # Load dataset and apply preprocessing
    dataset = TwentyNewsGroupsDataset()
    dataset = preprocess_dataset(dataset, ngrams='tri')

    # Create the Lda model
    pretty_print('Creating the Lda model')
    model = LdaGensimModel(dataset, num_topics=20, random_state=RANDOM_STATE)

    # Visualize with tsne
    tsne_clustering_chart(model)

    # %%

    # Print topics and coherence score
    pretty_print('\nTopics')
    NUM_WORDS_EACH_TOPIC_TO_BE_PRINTED = 15
    model.print_topics(NUM_WORDS_EACH_TOPIC_TO_BE_PRINTED)
    coherence_score = model.compute_coherence_value()
    pretty_print('Coherence Score')
    print(coherence_score)
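
    # %%

    # A plausible continuation sketch using the plot_word_clouds_of_topics function
    # imported above (the './demo-images' directory is a placeholder):
    plot_word_clouds_of_topics(model.get_topics(num_keywords=15), dpi=150,
                               save=True, dir_save_path='./demo-images',
                               show_plot=False)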