def generate_and_store_models(path, dataset, plot_first_name):
    # region LDA
    pretty_print(plot_first_name + ' LDA')
    lda_path = join_paths(path, 'lda')
    lda_models_list = LdaModelsList(dataset)

    # Create models, compute coherence values and store a plot with the coherence values
    pretty_print('Creating models')
    lda_models, lda_coherence_values = \
        lda_models_list.create_models_and_compute_coherence_values(
            MIN_TOPICS, MAX_TOPICS,
            title=plot_first_name + ' LDA models',
            save_plot=True,
            save_plot_path=join_paths(lda_path, 'coherence_values.png'))

    # Store the models and a txt file with the coherence value of each model
    pretty_print('Storing models')
    lda_models_list.save(base_name='model', path=lda_path)
    store_plots(lda_models, lda_coherence_values)
    # endregion

    # region LSA
    pretty_print(plot_first_name + ' LSA')
    lsa_path = join_paths(path, 'lsa')
    lsa_models_list = LsaModelsList(dataset)

    # Create models, compute coherence values and store a plot with the coherence values
    pretty_print('Creating models')
    lsa_models, lsa_coherence_values = \
        lsa_models_list.create_models_and_compute_coherence_values(
            MIN_TOPICS, MAX_TOPICS,
            title=plot_first_name + ' LSA models',
            save_plot=True,
            save_plot_path=join_paths(lsa_path, 'coherence_values.png'))

    # Store the models and a txt file with the coherence value of each model
    pretty_print('Storing models')
    lsa_models_list.save(base_name='model', path=lsa_path)
    # endregion

    # region LDA Mallet
    pretty_print(plot_first_name + ' LDA Mallet')
    lda_mallet_path = join_paths(path, 'lda-mallet')
    lda_mallet_models_list = LdaMalletModelsList(dataset)

    # Create models, compute coherence values and store a plot with the coherence values
    pretty_print('Creating models')
    lda_mallet_models, lda_mallet_coherence_values = \
        lda_mallet_models_list.create_models_and_compute_coherence_values(
            MIN_TOPICS, MAX_TOPICS,
            title=plot_first_name + ' LDA Mallet models',
            save_plot=True,
            save_plot_path=join_paths(lda_mallet_path, 'coherence_values.png'),
            models_base_name='model',
            model_path=lda_mallet_path)

    # Store the models and a txt file with the coherence value of each model
    pretty_print('Storing models')
    lda_mallet_models_list.save()
    # The t-SNE plot is skipped, because predictions with LdaMallet are too slow
    store_plots(lda_mallet_models, lda_mallet_coherence_values, tsne=False)
    # endregion
def test_dataset_save_and_load_with_preprocessing_options(self):
    trigrams_func = load_func_from_disk('trigrams_func', SAVED_FUNCS_PATH)
    options = DatasetPreprocessingOptions(
        normalize=True, lowercase=True, stopwords=False, contractions=False,
        vulgar_words=True, emails=True, punctuation=False, ngrams='tri',
        ngrams_model_func=trigrams_func, lemmatize=True, stem=True,
        apostrophes=True, chars=True)
    dataset = deepcopy(self.dataset)
    dataset.preprocessing_options = options

    # Save the dataset to disk
    dataset.save('test_dataset', SAVED_OBJECTS_PATH)
    # Load the dataset from disk
    dataset_from_disk = TwentyNewsGroupsDataset.load(
        'test_dataset', SAVED_OBJECTS_PATH, TwentyNewsGroupsDataset.DATASET_PATH)
    # Remove the dataset previously stored on disk
    rmtree(join_paths(SAVED_OBJECTS_PATH, 'test_dataset'))

    # Check that the original dataset and the dataset saved and loaded are the same
    self.assertEqual(dataset, dataset_from_disk)
def save(self, name: str, folder_path: str = None):
    """
    Stores the dataset on disk. Creates a folder that contains the files needed \
    to store the dataset object attributes.

    :param name: Name that the dataset folder will have on disk.
    :param folder_path: Path of the folder where the dataset will be stored on disk.
    """
    # Create the directory where all the files will be saved
    files_folder = join_paths(folder_path, name)
    os.mkdir(files_folder)

    # Create a copy of self
    self_copy = deepcopy(self)
    # Remove the preprocessing_options from the copy
    del self_copy.preprocessing_options

    # Save the copy of self in a file (the preprocessing_options are not saved,
    # because they were removed from the copy)
    save_obj_to_disk(self_copy, name + '_except_preprocessing_options', files_folder)

    # Save the preprocessing options (if they are not None)
    if self.preprocessing_options is not None:
        self.preprocessing_options.save(name + '_preprocessing_options', files_folder)
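
# A minimal usage sketch of save() (hedged: the folder path and the dataset name
# are illustrative placeholders; the file names follow the
# '<name>_except_preprocessing_options' / '<name>_preprocessing_options' pattern used above):
dataset = TwentyNewsGroupsDataset()
dataset.save('my_dataset', 'path/to/stored-datasets-objects')
# Creates path/to/stored-datasets-objects/my_dataset/ with the pickled dataset and,
# if preprocessing_options is not None, a my_dataset_preprocessing_options/ folder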
def test_save_and_load_func_on_disk(self):
    def test_func(x):
        return x ** 3

    save_func_to_disk(test_func, 'test_func', SAVED_FUNCS_PATH)
    test_func_from_disk = load_func_from_disk('test_func', SAVED_FUNCS_PATH)
    os.remove(join_paths(SAVED_FUNCS_PATH, 'test_func.dill'))

    # To compare the functions, we have to use them
    test_func_result_list = [test_func(1), test_func(2), test_func(3), test_func(4)]
    test_func_from_disk_result_list = [
        test_func_from_disk(1), test_func_from_disk(2),
        test_func_from_disk(3), test_func_from_disk(4)
    ]

    self.assertEqual(test_func_result_list, test_func_from_disk_result_list)
def __init__(self, vectors_dim=_GLOVE_VECTORS_DIM, glove_dir: str = None):
    """
    Reads a GloVe file in which each row contains a word in the first position, \
    followed by the elements of that word's vector.

    :param vectors_dim: Size of the word vectors. Possible values are: 50, 100, 200, 300.
    :param glove_dir: Path where the GloVe directory is located. That directory must contain \
    text files with the structure mentioned above.
    """
    if glove_dir is None:
        glove_dir = self._GLOVE_DIR

    self.vectors_dim = vectors_dim
    # A dict where keys are words and values are their corresponding word vectors
    self.embeddings = {}
    with open(join_paths(glove_dir, 'glove.6B.' + str(vectors_dim) + 'd.txt')) as f:
        for line in f:
            # Each line contains a word followed by the values of its word vector.
            # E.g. "the 0.418 0.24968 -0.41242 0.1217 0.34527 ..."
            values = line.split()
            word = values[0]  # the word is the first element of the line
            word_vector = np.asarray(values[1:], dtype='float32')  # the word vector is the rest
            self.embeddings[word] = word_vector
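
# A minimal usage sketch (hedged: 'Glove' is a hypothetical name for the enclosing
# class, and the word looked up is illustrative):
glove = Glove(vectors_dim=100)
king_vector = glove.embeddings['king']  # np.ndarray of shape (100,), dtype float32
print(king_vector[:5])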
def test_save_and_load_obj_on_disk(self):
    test_list = [1, 2, 3, 4]
    save_obj_to_disk(test_list, 'test_list', SAVED_OBJECTS_PATH)
    test_list_from_disk = load_obj_from_disk('test_list', SAVED_OBJECTS_PATH)
    os.remove(join_paths(SAVED_OBJECTS_PATH, 'test_list.pickle'))

    self.assertEqual(test_list, test_list_from_disk)
def test_join_paths(self):
    path = join_paths('Users/name/', 'Desktop', 'class/', 'files')
    if platform.system() in ['Linux', 'Darwin']:
        self.assertEqual('Users/name/Desktop/class/files', path)
    elif platform.system() == 'Windows':
        self.assertEqual('Users\\name\\Desktop\\class\\files', path)
    else:
        raise Exception('OS not found!')
def get_original_doc_content_from_disk(self, doc: 'Document') -> str:
    """
    Given a Document object, this method returns its content obtained from disk as a str.

    :param doc: Document.
    :return: Content of the given document obtained from disk.
    """
    return get_file_content(
        join_paths(self.dataset_path, doc.get_doc_path_inside_dataset_folder()),
        self.encoding)
def _load_files(self):
    """
    Loads the files into the files_list.
    """
    for file_name in sorted(listdir(self.dataset_path)):
        # Skip hidden files
        if file_name.startswith('.'):
            continue
        file_content = get_file_content(join_paths(self.dataset_path, file_name), self.encoding)
        self.files_list.append(UnstructuredDocument(file_name, file_content))
def test_dataset_save_and_load_without_preprocessing_options(self):
    # Save the dataset to disk
    self.dataset.save('test_dataset', SAVED_OBJECTS_PATH)
    # Load the dataset from disk
    dataset_from_disk = TwentyNewsGroupsDataset.load(
        'test_dataset', SAVED_OBJECTS_PATH, TwentyNewsGroupsDataset.DATASET_PATH)
    # Remove the dataset previously stored on disk
    rmtree(join_paths(SAVED_OBJECTS_PATH, 'test_dataset'))

    # Check that the original dataset and the dataset saved and loaded are the same
    self.assertEqual(self.dataset, dataset_from_disk)
def _load_files(self):
    """
    Loads the files into the files_dict, where each key is a category and each value is a list \
    of Document objects, one per file of that category.
    """
    for directory in sorted(listdir(self.dataset_path)):
        # Skip hidden files
        if directory.startswith('.'):
            continue
        self.files_dict[directory] = []
        # Add each file in the category to the dict
        for file_name in sorted(listdir(join_paths(self.dataset_path, directory))):
            # Skip hidden files
            if file_name.startswith('.'):
                continue
            file_content = get_file_content(
                join_paths(self.dataset_path, directory, file_name), self.encoding)
            self.files_dict[directory].append(
                self._create_structured_document(directory, file_name, file_content))
def test_save_and_load_lsa_gensim_model_on_disk(self):
    # Instead of creating a new model, we load a pre-created model from disk
    model = LsaGensimModel.load('lsa-gensim-model', SAVED_TOPICS_MODELS_PATH)

    # Here we really test the save and load methods
    model_name = 'test-lsa-gensim-model'
    model.save(model_name, SAVED_TOPICS_MODELS_PATH)
    test_model_from_disk = LsaGensimModel.load(model_name, SAVED_TOPICS_MODELS_PATH)

    # Remove the created model (its directory and the files inside that directory)
    rmtree(join_paths(SAVED_TOPICS_MODELS_PATH, model_name))

    self.assertEqual(model, test_model_from_disk)
def test_save_and_load_lda_mallet_model_on_disk(self):
    # LdaMallet models can't be stored in a different path than the original one.
    # To test these 2 methods correctly, we need to create a new model.
    model_name = 'test-lda-mallet-model'
    test_model = LdaMalletModel(self.dataset, num_topics=17,
                                model_name=model_name,
                                model_path=SAVED_TOPICS_MODELS_PATH,
                                iterations=10)  # 10 iterations to make it much faster (default is 1000)

    # Here we really test the save and load methods
    test_model.save()
    test_model_from_disk = LdaMalletModel.load(model_name, SAVED_TOPICS_MODELS_PATH)

    # Remove the created model (its directory and the files inside that directory)
    rmtree(join_paths(SAVED_TOPICS_MODELS_PATH, model_name))

    self.assertEqual(test_model, test_model_from_disk)
def save(self, name: str, folder_path: str = None):
    """
    Stores the DatasetPreprocessingOptions object attributes on disk. \
    A folder with the same name as the name parameter is created inside the folder_path folder. \
    The folder contains:

    * A file with a dict with all the attributes (except the ngrams_model_func)
    * A file with the ngrams_model_func (even if it's None)

    :param name: Name that the folder with the object files will have.
    :param folder_path: Path of the folder where the DatasetPreprocessingOptions folder will be stored on disk.
    """
    # Create the directory
    files_folder = join_paths(folder_path, name)
    os.mkdir(files_folder)

    # Save the dict with all the attributes except the ngrams_model_func.
    # as_dict() returns a copy of the params, so deleting ngrams_model_func from the dict
    # doesn't delete it from the original object.
    options_except_ngrams_model_func = self.as_dict()
    del options_except_ngrams_model_func['ngrams_model_func']
    save_obj_to_disk(options_except_ngrams_model_func,
                     name + '_options_except_ngrams_model_func', files_folder)

    # Save the ngrams_model_func
    save_func_to_disk(self.ngrams_model_func, name + '_ngrams_model_func', files_folder)
def load(cls, name: str, parent_folder_path: str = None) -> 'DatasetPreprocessingOptions':
    """
    Loads the options of a saved DatasetPreprocessingOptions object stored on disk.

    :param name: Name of the folder that contains the DatasetPreprocessingOptions object files.
    :param parent_folder_path: Path of the folder that contains the folder with the object files.
    :return: The DatasetPreprocessingOptions object loaded from disk.
    """
    files_folder = join_paths(parent_folder_path, name)

    # Load all the attributes except the ngrams_model_func (it's a dict)
    # noinspection PyTypeChecker
    options_except_ngrams_model_func: dict = load_obj_from_disk(
        name + '_options_except_ngrams_model_func', files_folder)

    # Load the ngrams_model_func
    ngrams_model_func = load_func_from_disk(name + '_ngrams_model_func', files_folder)

    # Join them in the same dict
    options = options_except_ngrams_model_func
    options['ngrams_model_func'] = ngrams_model_func

    # Create an instance of this class using the dict
    return cls(**options)
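
# A minimal round-trip sketch for save()/load() (hedged: the folder path is an
# illustrative placeholder, and ngrams_model_func is None only to keep the example
# self-contained; ngrams='uni' is assumed to pair naturally with a None function):
options = DatasetPreprocessingOptions(normalize=True, lowercase=True, stopwords=True,
                                      contractions=True, vulgar_words=True, emails=True,
                                      punctuation=True, ngrams='uni', ngrams_model_func=None,
                                      lemmatize=True, stem=False, apostrophes=True, chars=True)
options.save('example_options', 'path/to/saved-objects')
options_from_disk = DatasetPreprocessingOptions.load('example_options', 'path/to/saved-objects')
# Note: equality only checks whether both ngrams_model_func are None or not None
assert options == options_from_disk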
def test_dataset_preprocessing_options_save_and_load(self):
    trigrams_func = load_func_from_disk('trigrams_func', SAVED_FUNCS_PATH)
    options = DatasetPreprocessingOptions(
        normalize=True, lowercase=True, stopwords=False, contractions=False,
        vulgar_words=True, emails=True, punctuation=False, ngrams='tri',
        ngrams_model_func=trigrams_func, lemmatize=True, stem=True,
        apostrophes=True, chars=True)

    # Save the options to disk
    options.save('test_options', SAVED_OBJECTS_PATH)
    # Load the options from disk
    options_from_disk = DatasetPreprocessingOptions.load('test_options', SAVED_OBJECTS_PATH)
    # Remove the options previously stored on disk
    rmtree(join_paths(SAVED_OBJECTS_PATH, 'test_options'))

    # Check that the original options and the options saved and loaded are the same.
    # This doesn't check that the ngrams_model_func functions behave the same.
    # It only checks whether both are None or not None.
    self.assertEqual(options, options_from_disk)

    # Check that the ngrams_model_func functions behave the same
    words_list = ['windows', 'disk', 'operating', 'system']
    expected_ngrams = ['windows', 'disk_operating_system']
    self.assertEqual(expected_ngrams, options.ngrams_model_func(words_list))
    self.assertEqual(options.ngrams_model_func(words_list),
                     options_from_disk.ngrams_model_func(words_list))
from topics_and_summary.utils import get_abspath_from_project_source_root, join_paths

TESTS_BASE_PATH = get_abspath_from_project_source_root('tests')
SAVED_OBJECTS_PATH = join_paths(TESTS_BASE_PATH, 'saved-elements/objects')
SAVED_FUNCS_PATH = join_paths(TESTS_BASE_PATH, 'saved-elements/funcs')
SAVED_TOPICS_MODELS_PATH = join_paths(TESTS_BASE_PATH, 'saved-elements/models/topics')
def plot_word_clouds_of_topics(topics: List[Topic], single_plot_per_topic=False, all_horizontal=True,
                               save=False, dir_save_path: str = None, save_base_name='wordcloud',
                               dpi=350, show_plot=True):
    """
    Plots the specified topics and their keywords as word clouds.

    :param topics: Topics obtained with the get_topics() method of the TopicsModel class.
    :param single_plot_per_topic: If True, each topic is plotted in a separate plot. \
    If False, each plot contains 4 topics.
    :param all_horizontal: If True, all the keywords are plotted horizontally.
    :param save: If True, the plots are saved to disk.
    :param dir_save_path: If save is True, this is the path of the directory where the plots will be saved.
    :param save_base_name: Base name for the image files saved. Their names will be <base-name>x, \
    where x is an int starting at zero and ending at num_plots-1.
    :param dpi: Dots per inch for the images.
    :param show_plot: If True, shows the plot while executing.
    """
    if len(topics) == 0:
        raise Exception("topics param can't be an empty list")

    colors = [color for color in mcolors.TABLEAU_COLORS.values()]  # List of colors

    # Index of the current topic to be plotted.
    # It is also used for selecting the color for that topic in the function below.
    topic_index = topics[0].id

    def color_func(*args, **kwargs):
        return colors[topic_index % len(colors)]

    if all_horizontal:
        prefer_horizontal = 1.0
    else:
        prefer_horizontal = 0.9

    cloud = WordCloud(stopwords=STOPWORDS,
                      background_color='white',
                      width=2500,
                      height=1800,
                      max_words=topics[0].num_keywords(),
                      colormap='tab10',
                      color_func=color_func,
                      prefer_horizontal=prefer_horizontal)

    num_topics_plotted = 0
    num_iterations = len(topics) if single_plot_per_topic else math.ceil(len(topics) / 4)
    progress_bar = tqdm(range(num_iterations))
    for i in progress_bar:
        progress_bar.set_description('Generating plots')

        # Each topic is plotted in a separate plot
        if single_plot_per_topic:
            topic = topics[num_topics_plotted]
            topic_index = topic.id
            topic_kws = dict(topic.as_list_of_tuples())
            # Process finished with exit code 139 (interrupted by signal 11: SIGSEGV) when LSAModel is used below
            cloud.generate_from_frequencies(topic_kws, max_font_size=300)

            plt.imshow(cloud)
            plt.title('Topic ' + str(topic_index), fontdict=dict(size=20))
            plt.axis("off")
            plt.margins(x=0, y=0)
            plt.tight_layout()
            num_topics_plotted += 1

        # Each plot contains, at most, 4 topics
        else:
            # Each plot is formed by 4 subplots, each one containing the keywords of a topic
            # noinspection PyTypeChecker
            fig, axes = plt.subplots(2, 2, figsize=(10, 10), dpi=dpi, sharex=True, sharey=True)
            for ax in axes.flatten():
                # If all the topics have been plotted and we are inside this for,
                # the current plot has less than 4 topics to show,
                # so we remove the rest of the axes from the plot.
                if num_topics_plotted == len(topics):
                    fig.delaxes(ax)
                    continue

                fig.add_subplot(ax)
                topic = topics[num_topics_plotted]
                topic_index = topic.id
                topic_kws = dict(topic.as_list_of_tuples())
                # Process finished with exit code 139 (interrupted by signal 11: SIGSEGV) when LSAModel is used below
                cloud.generate_from_frequencies(topic_kws, max_font_size=300)

                plt.gca().imshow(cloud)
                plt.gca().set_title('Topic ' + str(topic_index), fontdict=dict(size=20))
                plt.gca().axis('off')
                num_topics_plotted += 1

            plt.axis('off')
            plt.margins(x=0, y=0)
            plt.tight_layout()

        if save:
            save_name = '{0}{1}.png'.format(save_base_name, i)
            plot_path = join_paths(dir_save_path, save_name)
            plt.savefig(plot_path, dpi=dpi)

        if show_plot:
            plt.show()

        # Avoid showing the plots when show_plot is False and plt.show() is called in another place
        plt.clf()
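
# A minimal usage sketch (hedged: 'model' stands for any already-trained TopicsModel,
# and the save directory is an illustrative placeholder):
topics = model.get_topics()
plot_word_clouds_of_topics(topics, single_plot_per_topic=True, save=True,
                           dir_save_path='path/to/word-clouds', show_plot=False)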
def tsne_clustering_chart(model: TopicsModel, angle=.99, doc_threshold=0, plot_keywords=True,
                          num_keywords=5, keywords_color_is_black=True, save_path: str = None,
                          plot_name: str = None, show_plot=True):
    """
    Uses the t-SNE technique for dimensionality reduction.

    :param model: Topics model.
    :param angle: Number between 0 and 1. An angle less than 0.2 has quickly increasing computation \
    time, and an angle greater than 0.8 has quickly increasing error.
    :param doc_threshold: Threshold that each document has to pass to be added to the plot.
    :param plot_keywords: If True, the keywords of each topic are plotted near a document of the topic.
    :param num_keywords: Number of keywords to show if plot_keywords is True.
    :param keywords_color_is_black: If True, the keywords color is black. If not, it is the same color as the topic.
    :param save_path: Path where the html file with the interactive plot will be saved.
    :param plot_name: Name of the plot to be saved.
    :param show_plot: If True, opens a browser and shows the html with the plot.
    """
    if save_path is None:
        save_path = _TSNE_SAVE_PATH

    # Get the document-topic probability matrix
    doc_topic_prob_matrix = model.get_doc_topic_prob_matrix()

    # Don't use docs that don't pass the threshold
    _idx = np.amax(doc_topic_prob_matrix, axis=1) > doc_threshold  # indices of docs above the threshold
    doc_topic_prob_matrix = doc_topic_prob_matrix[_idx]

    # t-SNE dimensionality reduction: 20-D -> 2-D
    # n_components is the number of dimensions of the plot. n_components=2 -> 2D
    num_dimensions = 2
    tsne_model = TSNE(n_components=num_dimensions, verbose=1, random_state=RANDOM_STATE,
                      angle=angle, init='pca')
    tsne_lda = tsne_model.fit_transform(doc_topic_prob_matrix)

    # Colors for the points in the Bokeh plot
    colormap = np.array([
        "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
        "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
        "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
        "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
    ])

    # Get the most relevant topic for each doc
    dominant_topic_per_doc = []
    dominant_topic_prob_per_doc = []
    for dominant_topic_doc in tqdm(doc_topic_prob_matrix):
        dominant_topic_per_doc.append(dominant_topic_doc.argmax())
        dominant_topic_prob_per_doc.append(dominant_topic_doc.max())

    # Configure the default output state to generate output saved to a file when show() is called
    if plot_name is None:
        now = now_as_str()
        plot_name = 'tsne_' + now + '.html'
    bp.output_file(join_paths(save_path, plot_name), mode='inline')

    # Create the plot for the topic clusters using Bokeh
    plot = figure(title="t-SNE Clustering of {} LDA Topics".format(model.num_topics),
                  tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",  # plot option tools
                  plot_width=1400, plot_height=900)

    plot.scatter(x='x', y='y', color='color',
                 # When source is provided, the kwargs above must refer to keys in the dict passed to source
                 source=bp.ColumnDataSource({
                     "x": tsne_lda[:, 0],
                     "y": tsne_lda[:, 1],
                     "topic index": dominant_topic_per_doc,
                     "topic prob": dominant_topic_prob_per_doc,
                     "doc text": list(map(lambda x: ' '.join(x),
                                          model.documents[:doc_topic_prob_matrix.shape[0]])),
                     "color": colormap[dominant_topic_per_doc]
                 }))

    if plot_keywords:
        # Plot the keywords for each topic:
        # Randomly choose a doc (within a topic) coordinate as the keywords coordinate
        topic_coord = np.empty((doc_topic_prob_matrix.shape[1], num_dimensions)) * np.nan
        for topic_num in dominant_topic_per_doc:
            if not np.isnan(topic_coord).any():
                break
            topic_coord[topic_num] = tsne_lda[dominant_topic_per_doc.index(topic_num)]

        # One string of num_keywords keywords for each topic in the model
        topics_kws = [model.get_k_kws_of_topic_as_str(topic, num_keywords)
                      for topic in range(model.num_topics)]

        # Plot the keywords
        for i in range(doc_topic_prob_matrix.shape[1]):
            if keywords_color_is_black:
                text_color = ['#000000']
            else:
                # TODO: The library doesn't allow putting a color in the contour,
                #  so this option doesn't let the words be visualized correctly
                text_color = [colormap[i]]
            plot.text(x='x', y='y', text='text', text_color='text_color',
                      source=bp.ColumnDataSource({
                          "x": [topic_coord[i, 0]],
                          "y": [topic_coord[i, 1]],
                          "text": [topics_kws[i]],
                          "topic index": [i],
                          "text_color": text_color
                      }))

    # Add an info box for each doc using hover tools.
    # With @ we refer to keys in the source dict. If a key contains spaces, it must be specified like @{key name}.
    # TODO: This shows these fields for all glyphs, including the text glyphs, which don't have all of them.
    #  There seems to be no solution to this; the documentation only explains how to apply tooltips to a figure.
    hover = plot.select(dict(type=HoverTool))
    hover.tooltips = [("doc_index", "$index"),
                      ("topic_index", "@{topic index}"),
                      ("topic_prob", "@{topic prob}"),
                      ("doc_text", "@{doc text}")]

    if show_plot:
        show(plot)
    bp.save(plot)
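
# A minimal usage sketch (hedged: 'model' stands for any already-trained TopicsModel,
# and the path and file name are illustrative placeholders):
tsne_clustering_chart(model, doc_threshold=0.1, num_keywords=5,
                      save_path='path/to/tsne-plots', plot_name='tsne_example.html',
                      show_plot=False)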
import re

from texttable import Texttable

from topics_and_summary.datasets.common import Dataset
from topics_and_summary.datasets.structured_dataset import StructuredDataset
from topics_and_summary.preprocessing.dataset_preprocessing_options import DatasetPreprocessingOptions
from topics_and_summary.preprocessing.ngrams import make_bigrams_and_get_bigrams_model_func, \
    make_trigrams_and_get_trigrams_model_func
from topics_and_summary.preprocessing.text import to_lowercase, expand_contractions, substitute_vulgar_words, \
    remove_stopwords, substitute_punctuation, lemmatize_words, stem_words, normalize_words, remove_emails, \
    remove_single_chars, remove_apostrophes
from topics_and_summary.utils import join_paths, get_abspath_from_project_source_root, pretty_print

_PREPROCESSING_FILES_DIR = get_abspath_from_project_source_root('preprocessing-files')
_TRASH_WORDS_PATH = join_paths(_PREPROCESSING_FILES_DIR, 'trash_words.txt')
_TRASH_DOCS_PATH = join_paths(_PREPROCESSING_FILES_DIR, 'trash_docs.txt')


def print_words_that_contain_elem(dataset: Dataset, elem: str):
    """
    Prints a table with the following info:

    - Word that contains the given element.
    - Number of occurrences of the word in the whole dataset.

    :param dataset: Dataset.
    :param elem: Element contained in the printed words. \
    It will be used to create a regular expression containing only that element.
    """
    elem_re = re.compile(elem)
# %%
# Load dataset
dataset = TwentyNewsGroupsDataset()

# Topics info for the models
MIN_TOPICS = 10
MAX_TOPICS = 20

BASE_PATH = get_abspath_from_project_source_root('saved-elements/topics/comparison')

# %%
# Unigrams
pretty_print('Unigrams')
unigrams_dataset = preprocess_dataset(dataset, ngrams='uni')
unigrams_path = join_paths(BASE_PATH, 'unigrams')
generate_and_store_models(unigrams_path, unigrams_dataset, 'Unigrams')

# Bigrams
pretty_print('Bigrams')
bigrams_dataset = preprocess_dataset(dataset, ngrams='bi')
bigrams_path = join_paths(BASE_PATH, 'bigrams')
generate_and_store_models(bigrams_path, bigrams_dataset, 'Bigrams')

# Trigrams
pretty_print('Trigrams')
trigrams_dataset = preprocess_dataset(dataset, ngrams='tri')
trigrams_path = join_paths(BASE_PATH, 'trigrams')
generate_and_store_models(trigrams_path, trigrams_dataset, 'Trigrams')
def get_doc_path_inside_dataset_folder(self) -> str:
    return join_paths(self.directory_name, self.name)
import json
import re
from typing import Union, Callable, Set

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from topics_and_summary.utils import join_paths, get_abspath_from_project_source_root

_BASIC_STOPWORDS = set(stopwords.words('english'))
_EMAILS_RE = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)")
_PUNCTUATION_RE = re.compile('[—ºª#$€%&*+-_.·,;:<=>@/¡!¿?^¨`´\"(){|}~[\\]]')
_PREPROCESSING_FILES_DIR = get_abspath_from_project_source_root('preprocessing-files')
_ADDITIONAL_STOPWORDS_PATH = join_paths(_PREPROCESSING_FILES_DIR, 'stopwords.txt')
_EXPAND_CONTRACTIONS_DICT_PATH = join_paths(_PREPROCESSING_FILES_DIR, 'expand_contractions_dict.txt')
_VULGAR_WORDS_DICT_PATH = join_paths(_PREPROCESSING_FILES_DIR, 'vulgar_words_dict.txt')
_NORMALIZE_WORDS_DICT_PATH = join_paths(_PREPROCESSING_FILES_DIR, 'normalize_words_dict.txt')


def to_lowercase(text: str) -> str:
    """
    Returns the given text with all characters in lowercase.

    :param text: The text to be converted to lowercase. (String)
    :return: The given text with all characters in lowercase. (String)
    """
    return text.lower()
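
# A quick illustrative check of the email regex above (hedged: the sample text is
# made up; the commented result is what re.sub yields for it):
sample_text = 'contact me at john.doe@example.com please'
print(_EMAILS_RE.sub('', sample_text))  # -> 'contact me at  please'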
def load(cls, name: str, parent_dir_path: str = None, dataset_path: str = None) -> 'Dataset':
    """
    Loads a saved dataset object from disk.

    :param name: Name of the folder where the dataset object files are stored.
    :param parent_dir_path: Path to the folder where the dataset object folder is stored on disk.
    :param dataset_path: Path to the folder that contains the original dataset documents.
    :return: The dataset loaded from disk.

    For example, consider the following directory structure:

    * stored-datasets-objects/dataset_obj_1/dataset_obj_1_preprocessing_options/...
    * stored-datasets-objects/dataset_obj_1/dataset_obj_1_except_preprocessing_options.pickle
    * datasets/20_newsgroups

    Where 20_newsgroups contains the original 20_newsgroups dataset documents and dataset_obj_1 contains \
    the files of a dataset object previously stored with the save() method.

    To load the dataset_obj_1 dataset object, which contains a dataset object of the 20 newsgroups dataset, \
    this method should be called this way:

    >>> from topics_and_summary.datasets.common import Dataset
    >>> dataset = Dataset.load('dataset_obj_1', 'path/to/stored-datasets-objects', 'path/to/datasets')
    """
    if parent_dir_path is None:
        parent_dir_path = get_abspath_from_project_source_root('saved-elements/objects')

    files_folder = join_paths(parent_dir_path, name)

    # Load the dataset (except the preprocessing options)
    # noinspection PyTypeChecker
    dataset: Dataset = load_obj_from_disk(name + '_except_preprocessing_options', files_folder)

    # If the <dataset-name>_preprocessing_options folder exists, it means that the
    # preprocessing_options were saved. In that case, the preprocessing_options are loaded.
    if os.path.exists(join_paths(files_folder, name + '_preprocessing_options')):
        dataset.preprocessing_options = \
            DatasetPreprocessingOptions.load(name + '_preprocessing_options', files_folder)
    else:
        dataset.preprocessing_options = None

    # Update the dataset_path of the object if a value is given
    if dataset_path is not None:
        dataset.dataset_path = dataset_path
    else:
        # If the path to the files of the dataset has changed after the dataset object was stored,
        # the dataset_path attribute of the loaded object is wrong. In this class we don't know the
        # current path of the dataset files, so the user needs to check whether the path is ok or
        # whether it needs to be updated.
        warnings.warn(
            "\nThe dataset_path attribute of the loaded dataset object may need to be updated. "
            "Its current value is: {0}.\n"
            "If the path to the files of the dataset has changed after the dataset object was stored, "
            "the dataset_path attribute of the loaded object is wrong and needs to be changed manually.\n"
            "There are 2 ways to update the dataset path:\n"
            "\t1. Change it directly in the loaded dataset: dataset_obj.dataset_path = <path>\n"
            "\t2. Load the dataset again with load(), specifying the path in the dataset_path parameter"
            .format(dataset.dataset_path))

    return dataset