def generate_and_store_models(path, dataset, plot_first_name):
    # region LDA
    pretty_print(plot_first_name + ' LDA')

    lda_path = join_paths(path, 'lda')
    lda_models_list = LdaModelsList(dataset)
    # Create models, compute coherence values and store a plot with the coherence values
    pretty_print('Creating models')
    lda_models, lda_coherence_values = \
        lda_models_list.create_models_and_compute_coherence_values(MIN_TOPICS, MAX_TOPICS,
                                                                   title=plot_first_name + ' LDA models',
                                                                   save_plot=True,
                                                                   save_plot_path=join_paths(lda_path,
                                                                                             'coherence_values.png'))
    # Store the models and a txt file with the coherence value of each model
    pretty_print('Storing models')
    lda_models_list.save(base_name='model', path=lda_path)
    store_plots(lda_models, lda_coherence_values)
    # endregion

    # region LSA
    pretty_print(plot_first_name + ' LSA')

    lsa_path = join_paths(path, 'lsa')
    lsa_models_list = LsaModelsList(dataset)
    # Create models, compute coherence values and store a plot with the coherence values
    pretty_print('Creating models')
    lsa_models, lsa_coherence_values = \
        lsa_models_list.create_models_and_compute_coherence_values(MIN_TOPICS, MAX_TOPICS,
                                                                   title=plot_first_name + ' LSA models',
                                                                   save_plot=True,
                                                                   save_plot_path=join_paths(lsa_path,
                                                                                             'coherence_values.png'))
    # Store the models and a txt file with the coherence value of each model
    pretty_print('Storing models')
    lsa_models_list.save(base_name='model', path=lsa_path)
    # endregion

    # region LDA Mallet
    pretty_print(plot_first_name + ' LDA Mallet')

    lda_mallet_path = join_paths(path, 'lda-mallet')
    lda_mallet_models_list = LdaMalletModelsList(dataset)
    # Create models, compute coherence values and store a plot with the coherence values
    pretty_print('Creating models')
    lda_mallet_models, lda_mallet_coherence_values = \
        lda_mallet_models_list.create_models_and_compute_coherence_values(MIN_TOPICS, MAX_TOPICS,
                                                                          title=plot_first_name + ' LDA Mallet models',
                                                                          save_plot=True,
                                                                          save_plot_path=join_paths(lda_mallet_path,
                                                                                                    'coherence_values.png'),
                                                                          models_base_name='model',
                                                                          model_path=lda_mallet_path)
    # Store the models and a txt file with the coherence value of each model
    pretty_print('Storing models')
    lda_mallet_models_list.save()
    # The t-SNE plot is skipped because predictions with LdaMallet are too slow to compute
    store_plots(lda_mallet_models, lda_mallet_coherence_values, tsne=False)
Example #2
    def test_dataset_save_and_load_with_preprocessing_options(self):
        trigrams_func = load_func_from_disk('trigrams_func', SAVED_FUNCS_PATH)
        options = DatasetPreprocessingOptions(normalize=True,
                                              lowercase=True,
                                              stopwords=False,
                                              contractions=False,
                                              vulgar_words=True,
                                              emails=True,
                                              punctuation=False,
                                              ngrams='tri',
                                              ngrams_model_func=trigrams_func,
                                              lemmatize=True,
                                              stem=True,
                                              apostrophes=True,
                                              chars=True)
        dataset = deepcopy(self.dataset)
        dataset.preprocessing_options = options

        # Save the dataset to disk
        dataset.save('test_dataset', SAVED_OBJECTS_PATH)

        # Load the dataset from disk
        dataset_from_disk = TwentyNewsGroupsDataset.load(
            'test_dataset', SAVED_OBJECTS_PATH,
            TwentyNewsGroupsDataset.DATASET_PATH)

        # Remove the dataset previously stored on disk
        rmtree(join_paths(SAVED_OBJECTS_PATH, 'test_dataset'))

        # Check that the original dataset and the dataset saved and loaded are the same
        self.assertEqual(dataset, dataset_from_disk)
    def save(self, name: str, folder_path: str = None):
        """
        Stores the dataset on disk. Creates a folder that contains the files needed to store \
        the dataset object attributes.

        :param name: Name that the dataset folder will have on disk.
        :param folder_path: Path of the folder where the dataset will be stored on disk.
        """
        # Create the directory where all the files will be saved
        files_folder = join_paths(folder_path, name)
        os.mkdir(files_folder)

        # Create a copy of self
        self_copy = deepcopy(self)
        # Remove the preprocessing_options from the self copy
        del self_copy.preprocessing_options

        # Save the copy of self in a file (the preprocessing_options are not saved because they were removed from the copy)
        save_obj_to_disk(self_copy, name + '_except_preprocessing_options',
                         files_folder)

        # Save the preprocessing options (if they are not None)
        if self.preprocessing_options is not None:
            self.preprocessing_options.save(name + '_preprocessing_options',
                                            files_folder)
    def test_save_and_load_func_on_disk(self):
        def test_func(x):
            return x**3

        save_func_to_disk(test_func, 'test_func', SAVED_FUNCS_PATH)
        test_func_from_disk = load_func_from_disk('test_func',
                                                  SAVED_FUNCS_PATH)

        os.remove(join_paths(SAVED_FUNCS_PATH, 'test_func.dill'))

        # To compare the functions, we have to use them
        test_func_result_list = [
            test_func(1),
            test_func(2),
            test_func(3),
            test_func(4)
        ]
        test_func_from_disk_result_list = [
            test_func_from_disk(1),
            test_func_from_disk(2),
            test_func_from_disk(3),
            test_func_from_disk(4)
        ]

        self.assertEqual(test_func_result_list,
                         test_func_from_disk_result_list)
    def __init__(self, vectors_dim=_GLOVE_VECTORS_DIM, glove_dir: str = None):
        """
        Reads a GloVe file where each row contains a word in the first position, \
        followed by the elements of its word vector in the rest of the line.

        :param glove_dir: Path where the glove directory is located. That directory must contain \
        text files with the structure mentioned above.
        :param vectors_dim: Size of the word vector. Possible values are: 50, 100, 200, 300.
        """
        if glove_dir is None:
            glove_dir = self._GLOVE_DIR

        self.vectors_dim = vectors_dim
        # A dict where keys are words and values are their corresponding word vectors
        self.embeddings = {}

        with open(
                join_paths(glove_dir,
                           'glove.6B.' + str(vectors_dim) + 'd.txt')) as f:
            for line in f:
                # Each line contains the word followed by the elements of its word vector.
                # E.g.: the 0.418 0.24968 -0.41242 0.1217 0.34527 ...
                values = line.split()
                word = values[0]  # the word is the first element of the line
                word_vector = np.asarray(
                    values[1:], dtype='float32')  # the word vector is the rest
                self.embeddings[word] = word_vector
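A minimal usage sketch of the class above (its name is not shown in this snippet, so it is assumed here to be called Glove, and the directory path is a placeholder):

import numpy as np

# Hypothetical usage of the embeddings loaded by the __init__ above
glove = Glove(vectors_dim=100, glove_dir='path/to/glove')

# Look up the 100-dimensional vector of a word (None if the word is unknown)
vector = glove.embeddings.get('computer')

# Cosine similarity between two word vectors
king, queen = glove.embeddings['king'], glove.embeddings['queen']
similarity = float(np.dot(king, queen) / (np.linalg.norm(king) * np.linalg.norm(queen)))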
    def test_save_and_load_obj_on_disk(self):
        test_list = [1, 2, 3, 4]
        save_obj_to_disk(test_list, 'test_list', SAVED_OBJECTS_PATH)
        test_list_from_disk = load_obj_from_disk('test_list',
                                                 SAVED_OBJECTS_PATH)

        os.remove(join_paths(SAVED_OBJECTS_PATH, 'test_list.pickle'))

        self.assertEqual(test_list, test_list_from_disk)
    def test_join_paths(self):
        path = join_paths('Users/name/', 'Desktop', 'class/', 'files')

        if platform.system() in ['Linux', 'Darwin']:
            self.assertEqual('Users/name/Desktop/class/files', path)
        elif platform.system() == 'Windows':
            self.assertEqual('Users\\name\\Desktop\\class\\files', path)
        else:
            raise Exception('OS not found!')
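The assertions above rely on join_paths producing OS-specific separators. A minimal sketch of what such a helper might look like (the real implementation lives in topics_and_summary.utils and may differ):

import os


def join_paths_sketch(*paths: str) -> str:
    # os.path.join concatenates the parts and os.path.normpath normalizes the
    # separators, giving '/' on Linux/macOS and '\' on Windows, which matches
    # the values asserted in the test above.
    return os.path.normpath(os.path.join(*paths))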
    def get_original_doc_content_from_disk(self, doc: 'Document') -> str:
        """
        Given a Document object, this method returns its content obtained from disk as a str.

        :param doc: Document.
        :return: Content of the given document obtained from disk.
        """
        return get_file_content(
            join_paths(self.dataset_path,
                       doc.get_doc_path_inside_dataset_folder()),
            self.encoding)
Example #9
    def _load_files(self):
        """
        Loads the files into the files_list.
        """
        for file_name in sorted(listdir(self.dataset_path)):
            # Skip hidden files
            if file_name.startswith('.'):
                continue

            file_content = get_file_content(
                join_paths(self.dataset_path, file_name), self.encoding)

            self.files_list.append(
                UnstructuredDocument(file_name, file_content))
Example #10
    def test_dataset_save_and_load_without_preprocessing_options(self):
        # Save the dataset to disk
        self.dataset.save('test_dataset', SAVED_OBJECTS_PATH)

        # Load the dataset from disk
        dataset_from_disk = TwentyNewsGroupsDataset.load(
            'test_dataset', SAVED_OBJECTS_PATH,
            TwentyNewsGroupsDataset.DATASET_PATH)

        # Remove the dataset previously stored on disk
        rmtree(join_paths(SAVED_OBJECTS_PATH, 'test_dataset'))

        # Check that the original dataset and the dataset saved and loaded are the same
        self.assertEqual(self.dataset, dataset_from_disk)
Example #11
    def _load_files(self):
        """
        Loads the files into the files_dict, with the keys being the categories of the files \
        and the values being lists of document objects, where each document is a file of that category.
        """
        for directory in sorted(listdir(self.dataset_path)):
            # Skip hidden files
            if directory.startswith('.'):
                continue

            self.files_dict[directory] = []

            # Add each file in the category to the dict
            for file_name in sorted(listdir(join_paths(self.dataset_path, directory))):
                # Skip hidden files
                if file_name.startswith('.'):
                    continue

                file_content = get_file_content(
                    join_paths(self.dataset_path, directory, file_name),
                    self.encoding
                )

                self.files_dict[directory].append(self._create_structured_document(directory, file_name, file_content))
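A small hedged sketch of how the resulting structure could be inspected, assuming dataset is an already-constructed instance of this structured dataset class:

# Each key of files_dict is a category folder name; each value is the list of
# documents loaded from that folder
for category, docs in dataset.files_dict.items():
    print('{0}: {1} documents'.format(category, len(docs)))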
    def test_save_and_load_lsa_gensim_model_on_disk(self):
        # Instead of creating a new model, we load a pre-created model from disk
        model = LsaGensimModel.load('lsa-gensim-model',
                                    SAVED_TOPICS_MODELS_PATH)

        # Here we really test the save and load methods
        model_name = 'test-lsa-gensim-model'
        model.save(model_name, SAVED_TOPICS_MODELS_PATH)
        test_model_from_disk = LsaGensimModel.load(model_name,
                                                   SAVED_TOPICS_MODELS_PATH)

        # Remove the created model (its directory and the files inside it)
        rmtree(join_paths(SAVED_TOPICS_MODELS_PATH, model_name))

        self.assertEqual(model, test_model_from_disk)
    def test_save_and_load_lda_mallet_model_on_disk(self):
        # LdaMallet models can't be stored in a path different from the original one
        # To correctly test these 2 methods, we need to create a new model
        model_name = 'test-lda-mallet-model'
        test_model = LdaMalletModel(
            self.dataset,
            num_topics=17,
            model_name=model_name,
            model_path=SAVED_TOPICS_MODELS_PATH,
            iterations=10
        )  # 10 iterations to make it much faster (default is 1000)

        # Here we really test the save and load methods
        test_model.save()
        test_model_from_disk = LdaMalletModel.load(model_name,
                                                   SAVED_TOPICS_MODELS_PATH)

        # Remove the created model (its directory and the files inside it)
        rmtree(join_paths(SAVED_TOPICS_MODELS_PATH, model_name))

        self.assertEqual(test_model, test_model_from_disk)
Example #14
    def save(self, name: str, folder_path: str = None):
        """
        Stores the DatasetPreprocessingOptions object attributes on disk. \
        A folder with the same name as the name parameter is created inside the folder_path folder. The folder contains:

        * A file with a dict with all the attributes (except the ngrams_model_func)
        * A file with the ngrams_model_func (even if it's None)

        :param name: Name that the folder with the object files will have.
        :param folder_path: Path of the folder where the DatasetPreprocessingOptions folder will be stored on disk.
        """
        # Create the directory
        files_folder = join_paths(folder_path, name)
        os.mkdir(files_folder)

        # Save the dict with all the attributes except the ngrams_model_func
        options_except_ngrams_model_func = self.as_dict()
        # as_dict() returns a copy of the params, so deleting ngrams_model_func from the dict
        # doesn't delete it from the original object
        del options_except_ngrams_model_func['ngrams_model_func']
        save_obj_to_disk(options_except_ngrams_model_func, name + '_options_except_ngrams_model_func', files_folder)

        # Save the ngrams_model_func
        save_func_to_disk(self.ngrams_model_func, name + '_ngrams_model_func', files_folder)
Example #15
    def load(cls, name: str, parent_folder_path: str = None) -> 'DatasetPreprocessingOptions':
        """
        Loads the options of a saved DatasetPreprocessingOptions object stored on disk.

        :param name: Name of the folder that contains the DatasetPreprocessingOptions object files.
        :param parent_folder_path: Path of the folder that contains the folder with the object files.
        :return: The DatasetPreprocessingOptions object loaded from disk.
        """
        files_folder = join_paths(parent_folder_path, name)

        # Load all the attributes except the ngrams_model_func (it's a dict)
        # noinspection PyTypeChecker
        options_except_ngrams_model_func: dict = load_obj_from_disk(name + '_options_except_ngrams_model_func',
                                                                    files_folder)

        # Load the ngrams_model_func
        ngrams_model_func = load_func_from_disk(name + '_ngrams_model_func', files_folder)

        # Join them in the same dict
        options = options_except_ngrams_model_func
        options['ngrams_model_func'] = ngrams_model_func

        # Create an instance of this class using the dict
        return cls(**options)
    def test_dataset_preprocessing_options_save_and_load(self):
        trigrams_func = load_func_from_disk('trigrams_func', SAVED_FUNCS_PATH)
        options = DatasetPreprocessingOptions(normalize=True,
                                              lowercase=True,
                                              stopwords=False,
                                              contractions=False,
                                              vulgar_words=True,
                                              emails=True,
                                              punctuation=False,
                                              ngrams='tri',
                                              ngrams_model_func=trigrams_func,
                                              lemmatize=True,
                                              stem=True,
                                              apostrophes=True,
                                              chars=True)

        # Save the options to disk
        options.save('test_options', SAVED_OBJECTS_PATH)

        # Load the options from disk
        options_from_disk = DatasetPreprocessingOptions.load(
            'test_options', SAVED_OBJECTS_PATH)

        # Remove the options previously stored on disk
        rmtree(join_paths(SAVED_OBJECTS_PATH, 'test_options'))

        # Check that the original options and the options saved and loaded are the same
        # This doesn't check that the ngrams_model_func functions behave the same. It only checks whether both are None or not None.
        self.assertEqual(options, options_from_disk)
        # Check that the ngrams_model_func behave the same
        words_list = ['windows', 'disk', 'operating', 'system']
        expected_ngrams = ['windows', 'disk_operating_system']
        self.assertEqual(expected_ngrams,
                         options.ngrams_model_func(words_list))
        self.assertEqual(options.ngrams_model_func(words_list),
                         options_from_disk.ngrams_model_func(words_list))
Example #17
from topics_and_summary.utils import get_abspath_from_project_source_root, join_paths

TESTS_BASE_PATH = get_abspath_from_project_source_root('tests')
SAVED_OBJECTS_PATH = join_paths(TESTS_BASE_PATH, 'saved-elements/objects')
SAVED_FUNCS_PATH = join_paths(TESTS_BASE_PATH, 'saved-elements/funcs')
SAVED_TOPICS_MODELS_PATH = join_paths(TESTS_BASE_PATH, 'saved-elements/models/topics')
def plot_word_clouds_of_topics(topics: List[Topic],
                               single_plot_per_topic=False,
                               all_horizontal=True,
                               save=False,
                               dir_save_path: str = None,
                               save_base_name='wordcloud',
                               dpi=350,
                               show_plot=True):
    """
    Plots the specified topics and their keywords as word clouds.

    :param topics: Topics obtained with the get_topics() method of the TopicsModel class.
    :param single_plot_per_topic: If True, each topic is plotted in a separated plot. \
    If False, each plot contains 4 topics.
    :param all_horizontal: If True, all the keywords are plotted horizontally.
    :param save: If true, the plots are saved to disk.
    :param dir_save_path: If save is True, this is the path of the directory where the plots will be saved.
    :param save_base_name: Base name for the saved image files. Their names will be <base-name>x, where x is an int \
    starting from zero and ending in num_plots-1.
    :param dpi: Dots per inches for the images.
    :param show_plot: If true, shows the plot while executing.
    """
    if len(topics) == 0:
        raise Exception("topics param can't be an empty list")

    colors = [color
              for color in mcolors.TABLEAU_COLORS.values()]  # List of colors
    # Index of the current topic to be plotted.
    # It is also used to select the color for that topic in the color_func below.
    topic_index = topics[0].id

    def color_func(*args, **kwargs):
        return colors[topic_index % len(colors)]

    if all_horizontal:
        prefer_horizontal = 1.0
    else:
        prefer_horizontal = 0.9

    cloud = WordCloud(stopwords=STOPWORDS,
                      background_color='white',
                      width=2500,
                      height=1800,
                      max_words=topics[0].num_keywords(),
                      colormap='tab10',
                      color_func=color_func,
                      prefer_horizontal=prefer_horizontal)

    num_topics_plotted = 0
    num_iterations = len(topics) if single_plot_per_topic else math.ceil(
        len(topics) / 4)

    progress_bar = tqdm(range(num_iterations))
    for i in progress_bar:
        progress_bar.set_description('Generating plots')
        # Each topic is plotted in a separate plot
        if single_plot_per_topic:
            topic = topics[num_topics_plotted]
            topic_index = topic.id
            topic_kws = dict(topic.as_list_of_tuples())
            # Known issue: the call below crashes with SIGSEGV (exit code 139: segmentation fault) when an LSAModel is used
            cloud.generate_from_frequencies(topic_kws, max_font_size=300)

            plt.imshow(cloud)
            plt.title('Topic ' + str(topic_index), fontdict=dict(size=20))
            plt.axis("off")
            plt.margins(x=0, y=0)
            plt.tight_layout()

            num_topics_plotted += 1
        # Each plot contains, as max, 4 topics
        else:
            # Each plot is formed by 4 subplots, each one containing the keywords of a topic
            # noinspection PyTypeChecker
            fig, axes = plt.subplots(2,
                                     2,
                                     figsize=(10, 10),
                                     dpi=dpi,
                                     sharex=True,
                                     sharey=True)

            for ax in axes.flatten():
                # If all the topics have already been plotted and we are still inside this loop,
                # the current plot has fewer than 4 topics to show, so we remove the remaining axes from the plot.
                if num_topics_plotted == len(topics):
                    fig.delaxes(ax)
                    continue

                fig.add_subplot(ax)

                topic = topics[num_topics_plotted]
                topic_index = topic.id
                topic_kws = dict(topic.as_list_of_tuples())
                # Known issue: the call below crashes with SIGSEGV (exit code 139: segmentation fault) when an LSAModel is used
                cloud.generate_from_frequencies(topic_kws, max_font_size=300)

                plt.gca().imshow(cloud)
                plt.gca().set_title('Topic ' + str(topic_index),
                                    fontdict=dict(size=20))
                plt.gca().axis('off')

                num_topics_plotted += 1

            plt.axis('off')
            plt.margins(x=0, y=0)
            plt.tight_layout()

        if save:
            save_name = '{0}{1}.png'.format(save_base_name, i)
            plot_path = join_paths(dir_save_path, save_name)
            plt.savefig(plot_path, dpi=dpi)

        if show_plot:
            plt.show()

        # Avoid showing the plots when show_plot is False and plt.show() is called in another place
        plt.clf()
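A hedged usage sketch, assuming model is an already-trained TopicsModel instance whose get_topics() method returns the Topic objects mentioned in the docstring; the save directory is a placeholder:

# Hypothetical usage: plot the model's topics as word clouds, 4 topics per figure
topics = model.get_topics()
plot_word_clouds_of_topics(topics,
                           single_plot_per_topic=False,
                           save=True,
                           dir_save_path='path/to/wordclouds',
                           show_plot=False)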
def tsne_clustering_chart(model: TopicsModel,
                          angle=.99,
                          doc_threshold=0,
                          plot_keywords=True,
                          num_keywords=5,
                          keywords_color_is_black=True,
                          save_path: str = None,
                          plot_name: str = None,
                          show_plot=True):
    """
    Use t-SNE technique for dimensionality reduction.

    :param model: Topics Model.
    :param angle: Number between 0 and 1. Angles below 0.2 quickly increase the computation \
    time and angles above 0.8 quickly increase the error.
    :param doc_threshold: Threshold that each document has to pass to be added to the plot.
    :param plot_keywords: If True, the keywords of each topic are plotted near a document of the topic.
    :param num_keywords: Number of keywords to show if plot_keywords is True.
    :param keywords_color_is_black: If True, the keywords are drawn in black. If not, they have the same color as their topic.
    :param save_path: Path where the html file with the interactive plot will be saved.
    :param plot_name: Name of the plot to be saved.
    :param show_plot: If true, opens a browser and shows the html with the plot.
    """
    if save_path is None:
        save_path = _TSNE_SAVE_PATH

    # Get doc topic prob matrix
    doc_topic_prob_matrix = model.get_doc_topic_prob_matrix()

    # Don't use docs that don't pass the threshold
    _idx = np.amax(
        doc_topic_prob_matrix,
        axis=1) > doc_threshold  # indices of the docs above the threshold
    doc_topic_prob_matrix = doc_topic_prob_matrix[_idx]

    # tSNE Dimension Reduction: 20-D -> 2-D
    # n_components is the number of dimensions of the plot. n_components=2 -> 2D
    num_dimensions = 2
    tsne_model = TSNE(n_components=num_dimensions,
                      verbose=1,
                      random_state=RANDOM_STATE,
                      angle=angle,
                      init='pca')
    tsne_lda = tsne_model.fit_transform(doc_topic_prob_matrix)

    # Colors for the points in the Bokeh plot
    colormap = np.array([
        "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c", "#98df8a",
        "#d62728", "#ff9896", "#9467bd", "#c5b0d5", "#8c564b", "#c49c94",
        "#e377c2", "#f7b6d2", "#7f7f7f", "#c7c7c7", "#bcbd22", "#dbdb8d",
        "#17becf", "#9edae5"
    ])

    # Get the most relevant topic for each doc
    dominant_topic_per_doc = []
    dominant_topic_prob_per_doc = []
    for dominant_topic_doc in tqdm(doc_topic_prob_matrix):
        dominant_topic_per_doc.append(dominant_topic_doc.argmax())
        dominant_topic_prob_per_doc.append(dominant_topic_doc.max())

    # Configure the default output state to generate output saved to a file when show() is called.
    if plot_name is None:
        now = now_as_str()
        plot_name = 'tsne_' + now + '.html'

    bp.output_file(join_paths(save_path, plot_name), mode='inline')

    # Create the plot for the Topic Clusters using Bokeh
    plot = figure(
        title="t-SNE Clustering of {} LDA Topics".format(model.num_topics),
        tools=
        "pan,wheel_zoom,box_zoom,reset,hover,previewsave",  # plot option tools
        plot_width=1400,
        plot_height=900)

    plot.scatter(
        x='x',
        y='y',
        color='color',
        # When source is provided, the kwargs above must refer to keys in the dict passed to source
        source=bp.ColumnDataSource({
            "x":
            tsne_lda[:, 0],
            "y":
            tsne_lda[:, 1],
            "topic index":
            dominant_topic_per_doc,
            "topic prob":
            dominant_topic_prob_per_doc,
            "doc text":
            list(
                map(lambda x: ' '.join(x),
                    model.documents[:doc_topic_prob_matrix.shape[0]])),
            "color":
            colormap[dominant_topic_per_doc]
        }))

    if plot_keywords:
        # Plot the keywords for each topic:

        # Randomly choose a doc (within a topic) coordinate as the keywords coordinate
        topic_coord = np.empty(
            (doc_topic_prob_matrix.shape[1], num_dimensions)) * np.nan
        for topic_num in dominant_topic_per_doc:
            if not np.isnan(topic_coord).any():
                break
            topic_coord[topic_num] = tsne_lda[dominant_topic_per_doc.index(
                topic_num)]

        # List of num_topics keywords as a str per each topic in the model
        topics_kws = [
            model.get_k_kws_of_topic_as_str(topic, num_keywords)
            for topic in range(model.num_topics)
        ]

        # Plot the keywords
        for i in range(doc_topic_prob_matrix.shape[1]):
            if keywords_color_is_black:
                text_color = ['#000000']
            else:
                # TODO: The library doesn't allow setting a contour color on the text,
                #  so this option doesn't allow the words to be visualized correctly
                text_color = [colormap[i]]

            plot.text(x='x',
                      y='y',
                      text='text',
                      text_color='text_color',
                      source=bp.ColumnDataSource({
                          "x": [topic_coord[i, 0]],
                          "y": [topic_coord[i, 1]],
                          "text": [topics_kws[i]],
                          "topic index": [i],
                          "text_color": text_color
                      }))

    # Add info box for each doc using hover tools
    hover = plot.select(dict(type=HoverTool))
    # With @ we refer to keys in the source dict. If the key contains spaces, it must be specified like @{key name}
    # TODO: This shows these fields for all glyphs, including the text glyphs that don't have all of them, but there
    #  seems to be no solution to this; at least the documentation only explains how to apply tooltips to the figure.
    hover.tooltips = [("doc_index", "$index"),
                      ("topic_index", "@{topic index}"),
                      ("topic_prob", "@{topic prob}"),
                      ("doc_text", "@{doc text}")]

    if show_plot:
        show(plot)

    bp.save(plot)
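A hedged usage sketch, again assuming model is an already-trained TopicsModel instance; the output path and file name are placeholders:

# Hypothetical usage: generate and save the interactive Bokeh t-SNE plot
tsne_clustering_chart(model,
                      doc_threshold=0.3,  # skip docs without a clear dominant topic
                      num_keywords=5,
                      save_path='path/to/tsne-plots',
                      plot_name='lda_tsne.html',
                      show_plot=False)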
from texttable import Texttable

from topics_and_summary.datasets.common import Dataset
from topics_and_summary.datasets.structured_dataset import StructuredDataset
from topics_and_summary.preprocessing.dataset_preprocessing_options import DatasetPreprocessingOptions
from topics_and_summary.preprocessing.ngrams import make_bigrams_and_get_bigrams_model_func, \
    make_trigrams_and_get_trigrams_model_func
from topics_and_summary.preprocessing.text import to_lowercase, expand_contractions, substitute_vulgar_words, \
    remove_stopwords, substitute_punctuation, lemmatize_words, stem_words, normalize_words, remove_emails, \
    remove_single_chars, remove_apostrophes
from topics_and_summary.utils import join_paths, get_abspath_from_project_source_root, pretty_print

_PREPROCESSING_FILES_DIR = get_abspath_from_project_source_root(
    'preprocessing-files')
_TRASH_WORDS_PATH = join_paths(_PREPROCESSING_FILES_DIR, 'trash_words.txt')
_TRASH_DOCS_PATH = join_paths(_PREPROCESSING_FILES_DIR, 'trash_docs.txt')


def print_words_that_contain_elem(dataset: Dataset, elem: str):
    """
    Prints a table with the following info:
        - Word that contains the given element.
        - Number of occurrences of the word in the whole dataset

    :param dataset: Dataset.
    :param elem: Elem contained in the printed words. \
    Will be used to create a regular expression, containing only that elem.
    """
    elem_re = re.compile(elem)
    # %%
    # Load dataset
    dataset = TwentyNewsGroupsDataset()

    # Topics info for the models
    MIN_TOPICS = 10
    MAX_TOPICS = 20
    BASE_PATH = get_abspath_from_project_source_root(
        'saved-elements/topics/comparison')

    # %%
    # Unigrams
    pretty_print('Unigrams')
    unigrams_dataset = preprocess_dataset(dataset, ngrams='uni')
    unigrams_path = join_paths(BASE_PATH, 'unigrams')

    generate_and_store_models(unigrams_path, unigrams_dataset, 'Unigrams')

    # Bigrams
    pretty_print('Bigrams')
    bigrams_dataset = preprocess_dataset(dataset, ngrams='bi')
    bigrams_path = join_paths(BASE_PATH, 'bigrams')

    generate_and_store_models(bigrams_path, bigrams_dataset, 'Bigrams')

    # Trigrams
    pretty_print('Trigrams')
    trigrams_dataset = preprocess_dataset(dataset, ngrams='tri')
    trigrams_path = join_paths(BASE_PATH, 'trigrams')

    generate_and_store_models(trigrams_path, trigrams_dataset, 'Trigrams')
Example #22
    def get_doc_path_inside_dataset_folder(self) -> str:
        return join_paths(self.directory_name, self.name)
Example #23
import json
import re
from typing import Union, Callable, Set

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from topics_and_summary.utils import join_paths, get_abspath_from_project_source_root

_BASIC_STOPWORDS = set(stopwords.words('english'))
_EMAILS_RE = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)")
_PUNCTUATION_RE = re.compile('[—ºª#$€%&*+-_.·,;:<=>@/¡!¿?^¨`´\"(){|}~[\\]]')
_PREPROCESSING_FILES_DIR = get_abspath_from_project_source_root(
    'preprocessing-files')
_ADDITIONAL_STOPWORDS_PATH = join_paths(_PREPROCESSING_FILES_DIR,
                                        'stopwords.txt')
_EXPAND_CONTRACTIONS_DICT_PATH = join_paths(_PREPROCESSING_FILES_DIR,
                                            'expand_contractions_dict.txt')
_VULGAR_WORDS_DICT_PATH = join_paths(_PREPROCESSING_FILES_DIR,
                                     'vulgar_words_dict.txt')
_NORMALIZE_WORDS_DICT_PATH = join_paths(_PREPROCESSING_FILES_DIR,
                                        'normalize_words_dict.txt')
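The compiled patterns above are presumably used by remove_emails and substitute_punctuation (imported elsewhere on this page). A minimal sketch of how they might be applied, which may differ from the library's actual implementations:

def remove_emails_sketch(text: str) -> str:
    # Drop anything that looks like an email address
    return _EMAILS_RE.sub('', text)


def substitute_punctuation_sketch(text: str) -> str:
    # Replace punctuation characters with a space
    return _PUNCTUATION_RE.sub(' ', text)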


def to_lowercase(text: str) -> str:
    """
    Returns the given text with all characters in lowercase.

    :param text: The text to be converted to lowercase. (String)
    :return: The given text with all characters in lowercase. (String)
    """
    def load(cls,
             name: str,
             parent_dir_path: str = None,
             dataset_path: str = None) -> 'Dataset':
        """
        Loads a saved dataset object from disk.

        :param name: Name of the folder where the dataset object files are stored.
        :param parent_dir_path: Path to the folder where the dataset object folder is stored on disk.
        :param dataset_path: Path to the folder that contains the original dataset documents.
        :return: The dataset loaded from disk.

        For example, consider the following directory structure:

        * stored-datasets-objects/dataset_obj_1/dataset_obj_1_preprocessing_options/...
        * stored-datasets-objects/dataset_obj_1/dataset_obj_1_except_preprocessing_options.pickle
        * datasets/20_newsgroups

        Where 20_newsgroups contains the original 20_newsgroups dataset documents and dataset_obj_1 contains \
        the files of a previously stored dataset object (with the save() method).

        To load the dataset_obj_1 dataset object that contains a dataset object of the 20 newsgroups dataset, \
        this method should be called this way:

        >>> from topics_and_summary.datasets.common import Dataset
        >>> dataset = Dataset.load('dataset_obj_1', 'path/to/stored-datasets-objects', 'path/to/datasets')
        """
        if parent_dir_path is None:
            parent_dir_path = get_abspath_from_project_source_root(
                'saved-elements/objects')

        files_folder = join_paths(parent_dir_path, name)

        # Load the dataset (except the preprocessing options)
        # noinspection PyTypeChecker
        dataset: Dataset = load_obj_from_disk(
            name + '_except_preprocessing_options', files_folder)

        # If the <dataset-name>_preprocessing_options folder exists, it means that the preprocessing_options were saved
        # In that case, the preprocessing_options are loaded
        if os.path.exists(
                join_paths(files_folder, name + '_preprocessing_options')):
            dataset.preprocessing_options = \
                DatasetPreprocessingOptions.load(name + '_preprocessing_options', files_folder)
        else:
            dataset.preprocessing_options = None

        # Update the dataset_path of the object if a value is given
        if dataset_path is not None:
            dataset.dataset_path = dataset_path
        else:
            # If the path to the files of the dataset has changed after the dataset object was stored,
            # the dataset_path attribute of the loaded object is wrong, but in this class we don't know the current
            # path of the dataset files, so the user needs to check whether the path is correct or needs to be updated
            warnings.warn(
                "\nThe dataset_path attribute of the loaded dataset object may need to be updated. "
                "It's current value is: {0}.\n"
                "If the path to the files of the dataset has changed after the dataset object was stored, "
                "the dataset_path attribute of the loaded object is wrong and needs to be changed manually.\n"
                "There are 2 ways to update the dataset path:\n"
                "\t1. Change it directly in the loaded model: dataset_obj.dataset_path = <path>\n"
                "\t2. Load the dataset again with load(), specifying the path in the dataset_path parameter"
                .format(dataset.dataset_path))

        return dataset
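A short hedged sketch of the two ways mentioned in the warning above (all paths are placeholders):

# 1. Update the attribute directly on an already-loaded dataset object
dataset.dataset_path = 'new/path/to/datasets/20_newsgroups'

# 2. Or pass the path explicitly when loading
dataset = Dataset.load('dataset_obj_1', 'path/to/stored-datasets-objects',
                       dataset_path='new/path/to/datasets/20_newsgroups')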