def process_stopwords(configuration):
    """
    Remove stop words from one column of a delimited text file.

    Reads ``<data_path>/<input_name>`` as UTF-8, skips its first line, passes
    the fourth delimiter-separated field of every remaining line through
    ``CRMDataset.process_text`` and writes the results, one per line, to
    ``<data_path>/<input_name>.sw``. The output starts with a header line
    holding ``selected_attr``. A text that comes back empty is written as the
    placeholder ``all_stopwords``.

    :param configuration: Mapping with the keys 'stopwords_lang',
        'stopwords_files', 'selected_attr', 'data_path' and 'input_name'.
        The optional key 'delimiter' defaults to a tab character.
    :raises IndexError: If a data line has fewer than four fields
    """
    logging.debug('Processing stopwords')

    # Initialize stopwords remover class
    conf_language = configuration['stopwords_lang']
    conf_stopwords_files = configuration['stopwords_files']
    stopwords_remover = StopWords(conf_language, conf_stopwords_files)

    t0 = time()
    selected_attr = configuration['selected_attr']
    delimiter = configuration.get('delimiter', '\t')
    input_path = join(configuration['data_path'], configuration['input_name'])

    # Both files are handled by context managers so that they are flushed and
    # closed even when a malformed line aborts the processing.
    with codecs.open(input_path + '.sw', 'w') as ofile:
        ofile.write(selected_attr + '\n')
        with codecs.open(input_path, mode='rb', encoding='utf-8') as f:
            # Skip the first line; an empty input simply yields no data rows
            next(f, None)
            for line in f:
                text = line.split(delimiter)[3]
                o_text = CRMDataset.process_text(text, stopwords_remover)
                if o_text == '':
                    o_text = 'all_stopwords'
                ofile.write(o_text.encode("utf-8") + '\n')

    logging.debug(" done in %fs" % (time() - t0))
def __init__(self, dataset_name):
    """
    Read the wordcloud settings of a dataset and prepare stop word removal.

    The settings are obtained from ``get_wordcloud_cfg()`` of the module
    ``configuration.<dataset_name>_cfg``.

    :param dataset_name: Prefix of the configuration module to import
    """
    settings_module = importlib.import_module(
        'configuration.' + (dataset_name + '_cfg'))
    self.configuration = settings_module.get_wordcloud_cfg()

    language = self.configuration['stopwords_lang']
    if language is not None:
        self.stopwords_remover = StopWords(
            language, self.configuration['stopwords_files'])
        return

    # No language configured: leave the remover unset and report it
    self.stopwords_remover = None
    logging.info(
        "Stop words removal is not configured and will be skipped")
def test_remove_sw_from_dtm(self):
    """Stop word columns are dropped from a document-term matrix."""
    corpus = ["Esto, es un texto",
              "Este texto también: es otro texto",
              "Este texto contiene la letra EÑE"]
    vectorizer = CountVectorizer()
    remover = StopWords("spanish")

    # Sparse document-term matrix plus the term of each of its columns
    matrix = vectorizer.fit_transform(corpus)
    terms = np.array(vectorizer.get_feature_names())

    # Sanity check of the unfiltered matrix
    dense = matrix.toarray()
    self.assertEqual(len(terms), 11)
    self.assertEqual(dense.shape, (3, 11))
    self.assertEqual(np.sum(dense), 16)

    # Columns expected to go: es, este, esto, la, otro, también, un
    filtered_matrix, filtered_terms = remover.remove_sw_from_dtm(matrix, terms)
    filtered_dense = filtered_matrix.toarray()
    self.assertEqual(len(filtered_terms), 4)
    self.assertEqual(filtered_dense.shape, (3, 4))
    self.assertEqual(np.sum(filtered_dense), 7)
class WordCloud(object):
    """
    Creates wordcloud images from the text sources of a configured dataset.

    Typical sequence: construct, ``load_dataset()``, optionally
    ``preprocess_source_dtms()``, then ``generate_wc()``.
    """

    # Accepted values of the 'multi_src' configuration entry
    MULTISOURCE_ENUMERATE = 0
    MULTISOURCE_COMPARE = 1

    configuration = None
    dataset = None
    stopwords_remover = None
    # The DTM lists are created per instance (see __init__ and load_dataset).
    # Mutable class-level defaults would be shared by every instance.
    dtm_matrices = None
    dtm_col_names = None

    def __init__(self, dataset_name):
        """
        Loads the wordcloud configuration for a given dataset

        :param dataset_name: Prefix of the ``configuration.<name>_cfg`` module
            whose ``get_wordcloud_cfg()`` provides the configuration
        """
        cfg_name = dataset_name + '_cfg'
        module_name = 'configuration.' + cfg_name
        config_module = importlib.import_module(module_name)
        self.configuration = config_module.get_wordcloud_cfg()

        self.dataset = None
        self.dtm_matrices = []
        self.dtm_col_names = []

        # Initialize stopwords remover class
        conf_language = self.configuration['stopwords_lang']
        if conf_language is None:
            self.stopwords_remover = None
            logging.info(
                "Stop words removal is not configured and will be skipped")
        else:
            conf_stopwords_files = self.configuration['stopwords_files']
            self.stopwords_remover = StopWords(conf_language,
                                               conf_stopwords_files)

    @staticmethod
    def _strip_extension(filename):
        """ Returns ``filename`` without the text after its last dot """
        return ".".join(filename.split(".")[:-1])

    def load_dataset(self):
        """
        Create and load the dataset based on its configuration

        Builds one document-term matrix (DTM) per entry of
        'train_src_names'. The matrices are rebuilt from scratch on every
        call, so calling this twice does not leave stale entries behind.
        """
        self.dataset = self.configuration['dataset_class'](self.configuration)
        self.dataset.load_dataset()

        # Load texts sources in a dtm each
        vec = CountVectorizer()
        matrices = []
        col_names = []
        for index, src in enumerate(self.configuration['train_src_names']):
            logging.debug("Creating DTM for source %s", src)
            # fit_transform returns a sparse matrix of term frequencies
            matrices.append(vec.fit_transform(self.dataset.get_texts(index)))
            col_names.append(np.array(vec.get_feature_names()))
        self.dtm_matrices = matrices
        self.dtm_col_names = col_names

    def create_wordcloud_file(self, tags, output_file):
        """
        Renders the given tags as a wordcloud image

        :param tags: Sliceable sequence of tags; only the first 'num_tags'
            are drawn, so callers presumably pass them sorted by descending
            frequency
        :param output_file: Path of the image file to create
        """
        # Get configuration parameters
        conf_num_tags = self.configuration['num_tags']
        conf_min_tag_size = self.configuration['min_tag_size']
        conf_max_tag_size = self.configuration['max_tag_size']
        conf_image_size = self.configuration['image_size']
        conf_font = self.configuration['font']
        conf_background = self.configuration['background']

        logging.info("Creating wordcloud image file: %s", output_file)

        # Limit the tags to be displayed to those appearing more frequently
        tags = tags[:conf_num_tags]

        # Create the image
        tags = wc.make_tags(tags, minsize=conf_min_tag_size,
                            maxsize=conf_max_tag_size)

        # Save image to file
        wc.create_tag_image(tags, output_file, size=conf_image_size,
                            fontname=conf_font,
                            layout=wc.LAYOUT_HORIZONTAL,
                            background=conf_background)
        logging.info("Created wordcloud image file: %s", output_file)
        print("Created wordcloud image file: %s" % output_file)

    def create_wordcloud_for_dtm(self, freq_dtm, dtm_col_names, out_path):
        """
        Given a DTM, create a wordcloud file

        :param freq_dtm: Document Term Matrix to be displayed as a wordcloud
        :param dtm_col_names: Column names (terms) of the DTM
        :param out_path: Path to the output file with the wordcloud image
        """
        # Get ordered tags by frequency
        # We use our own implementation, because pytagcloud.get_tag_counts
        # has a poor performance
        words = dtm.get_term_freq(freq_dtm, dtm_col_names,
                                  sort=dtm.SORT_ON_FREQUENCY,
                                  reverse_sort=True)
        logging.debug("Term frequency (first 10): %s", words[:10])
        self.create_wordcloud_file(words, out_path)

    def preprocess_source_dtms(self):
        """ Applies ``preprocess_source_dtm`` to the DTM of every source """
        for index, _ in enumerate(self.configuration['train_src_names']):
            self.dtm_matrices[index], self.dtm_col_names[index] = \
                self.preprocess_source_dtm(self.dtm_matrices[index],
                                           self.dtm_col_names[index])

    def preprocess_source_dtm(self, source_dtm, dtm_col_names):
        """
        Replaces numbers in the terms and removes the stop word columns

        :param source_dtm: Document Term Matrix of one source
        :param dtm_col_names: Column names (terms) of the DTM; the terms are
            rewritten in place
        :return: Tuple (DTM, column names) after the preprocessing
        """
        # Do not consider numbers in text for the wordcloud
        for index, term in enumerate(dtm_col_names):
            dtm_col_names[index] = replace_numbers(term)

        # Remove stop words for all DTMs
        if self.stopwords_remover is not None:
            source_dtm, dtm_col_names = \
                self.stopwords_remover.remove_sw_from_dtm(source_dtm,
                                                          dtm_col_names)

        return source_dtm, dtm_col_names

    def create_wordclouds_for_sources(self):
        """
        Creates one wordcloud file for each source document
        """
        for index, src in enumerate(self.configuration['train_src_names']):
            logging.info("...Wordcloud generation for source '%s' ", src)

            # Output filename is the same as input without the extension
            path = self.configuration['output_path']
            filename = self._strip_extension(src) + ".png"
            self.create_wordcloud_for_dtm(self.dtm_matrices[index],
                                          self.dtm_col_names[index],
                                          path + "/" + filename)

    def _create_subtraction_outputs(self, idx_a, idx_b, basename_a,
                                    basename_b, path, max_freq_terms):
        """
        Writes the wordcloud and statistics files for source a minus source b

        :param idx_a: Index of the source the frequencies are taken from
        :param idx_b: Index of the source whose frequencies are subtracted
        :param basename_a: Name of source a, used in the output file names
        :param basename_b: Name of source b, used in the output file names
        :param path: Output directory
        :param max_freq_terms: Number of top terms passed to the statistics
            file
        """
        subtract_dtm = dtm.subtract_dtm_frequencies(
            self.dtm_matrices[idx_a], self.dtm_col_names[idx_a],
            self.dtm_matrices[idx_b], self.dtm_col_names[idx_b])
        words = dtm.get_term_freq(subtract_dtm, self.dtm_col_names[idx_a],
                                  sort=dtm.SORT_ON_FREQUENCY,
                                  reverse_sort=True)

        logging.debug("Length source(%d) = %d",
                      idx_a, len(self.dtm_col_names[idx_a]))
        logging.debug("Length source(%d) - source(%d) = %d: ",
                      idx_a, idx_b, len(words))
        logging.debug("First 10 words from subtraction: %s", words[:10])

        wc_file = "%s/%s_minus_%s.png" % (path, basename_a, basename_b)
        self.create_wordcloud_file(words, wc_file)

        # TODO: set how many words to track in configuration file
        xlsx_file = "%s/%s_minus_%s.xlsx" % (path, basename_a, basename_b)
        excel.create_docterm_statistics_file(
            xlsx_file, np.array(words)[:max_freq_terms, 0],
            self.dataset.get_texts(idx_a), image=wc_file)

    def create_comparison_wordclouds(self):
        """
        Creates two wordclouds with the subtraction, in both ways, of two
        given sources

        Sources are taken in consecutive pairs (0 and 1, 2 and 3, ...); a
        trailing unpaired source is ignored. The stored DTMs of the compared
        sources are replaced by their percentage version.
        """
        path = self.configuration['output_path']
        sources = self.configuration['train_src_names']
        max_freq_terms = self.configuration['max_freq_terms']

        # Floor division keeps the pair count an integer on every Python
        # version
        for iteration in range(len(sources) // 2):
            idx1, idx2 = iteration * 2, iteration * 2 + 1
            logging.info("...Wordcloud comparation of sources '%s' and '%s' ",
                         sources[idx1], sources[idx2])

            # Use source files without extension as basename
            basename_1 = self._strip_extension(sources[idx1])
            basename_2 = self._strip_extension(sources[idx2])

            # First we transform the frequencies as a percentage of the
            # total, so we can compare the two data sets
            self.dtm_matrices[idx1] = dtm.set_frequency_as_percentage(
                self.dtm_matrices[idx1])
            self.dtm_matrices[idx2] = dtm.set_frequency_as_percentage(
                self.dtm_matrices[idx2])

            # Calculate the differences, first in one way...
            self._create_subtraction_outputs(idx1, idx2,
                                             basename_1, basename_2,
                                             path, max_freq_terms)
            # ...and now the same, in the other way
            self._create_subtraction_outputs(idx2, idx1,
                                             basename_2, basename_1,
                                             path, max_freq_terms)

    def generate_wc(self):
        """
        Calls the appropriate generator based on the configuration properties
        """
        if self.configuration['multi_src'] == WordCloud.MULTISOURCE_ENUMERATE:
            self.create_wordclouds_for_sources()
        else:
            self.create_comparison_wordclouds()
def test_remove_not_default_sw(self):
    """With load_default=False only the "numbers" list is applied."""
    remover = StopWords("spanish", sw_files=["numbers"], load_default=False)
    cleaned = remover.remove_sw_from_text(
        u"Esta, es la Prueba número veinte",
        lowercase=True,
        remove_punctuation=False)
    self.assertEqual(u"esta , es la prueba número", cleaned)
def test_remove_additional_sw(self):
    """The "numbers" list is applied on top of the Spanish stop words."""
    remover = StopWords("spanish", sw_files=["numbers"])
    sentence = u"Esta, es la Prueba número veinte"
    expected = u"Prueba número"
    self.assertEqual(expected, remover.remove_sw_from_text(sentence))
def test_remove_default_sw_from_list_to_list(self):
    """A token list filtered with the English stop words is returned as a list."""
    tokens = ["This", "is", "number", "one", "test"]
    expected = ["number", "one", "test"]
    filtered = StopWords("english").remove_sw_from_list(tokens)
    self.assertEqual(expected, filtered)
def test_remove_default_sw_from_list_to_text(self):
    """With as_string=True the filtered token list comes back as one string."""
    tokens = ["This", "is", "number", "one", "test"]
    remover = StopWords("english")
    joined = remover.remove_sw_from_list(tokens, as_string=True)
    self.assertEqual(u"number one test", joined)
def test_remove_default_sw_from_text_to_list(self):
    """With as_string=False the filtered text comes back as a token list."""
    remover = StopWords("english")
    tokens = remover.remove_sw_from_text(u"This, is number one test",
                                         as_string=False)
    expected = ["number", "one", "test"]
    self.assertEqual(expected, tokens)
def test_remove_default_sw_from_text_to_text(self):
    """Text filtered with the default English stop words comes back as text."""
    sentence = u"This, is number one test"
    expected = u"number one test"
    cleaned = StopWords("english").remove_sw_from_text(sentence)
    self.assertEqual(expected, cleaned)