def word_count_dict_to_html(word_count_dict, type, ignores, includes):
    """Render the most frequent entries of ``word_count_dict`` as an HTML cloud.

    Args:
        word_count_dict: Mapping of word -> occurrence count.
        type: Cloud type. For ``'word'`` clouds, words of four characters or
            fewer are dropped unless one of ``includes`` matches them.
        ignores: Objects with a ``match`` method (e.g. compiled regular
            expressions); a word is dropped when any of them matches it.
        includes: Objects with a ``match`` method that keep an otherwise
            too-short word in a ``'word'`` cloud.

    Returns:
        A tuple ``(cloud_words, html_doc)``. ``cloud_words`` is the list of
        selected ``(word, count)`` pairs in the order they appear in the
        document; the number in each ``search/<index>`` link is the position
        of the word in that list. ``([], '')`` is returned when no word
        survives the filtering.
    """
    # Ascending by count, so the most frequent words end up at the tail.
    by_count = sorted(word_count_dict.items(), key=lambda item: item[1])

    if type == 'word':
        # Drop short words unless an include pattern rescues them.
        by_count = [
            (word, freq) for word, freq in by_count
            if len(word) > 4 or any(pattern.match(word) for pattern in includes)
        ]

    # NOTE(review): indentation was lost in the reviewed source; the ignore
    # filter is assumed to apply to every cloud type, not only 'word'.
    by_count = [
        (word, freq) for word, freq in by_count
        if not any(pattern.match(word) for pattern in ignores)
    ]

    number_of_words = 42
    # Keep only the most frequent words. If fewer than number_of_words are
    # left, all of them are taken.
    cloud_words = by_count[-number_of_words:]
    if not cloud_words:
        return [], ''

    # cloud_words is still ordered by count here, so the ends are min and max.
    min_count = cloud_words[0][1]
    max_count = cloud_words[-1][1]
    delta_count = max_count - min_count
    if delta_count == 0:
        # All selected words are equally frequent; avoid dividing by zero.
        delta_count = 1

    min_font_size = 10
    max_font_size = 50
    font_delta = max_font_size - min_font_size

    # Order the words for display. `unicode` is expected to be a module-level
    # name providing `coll` (presumably a collation helper, not the builtin
    # type) -- confirm against the file's imports.
    cloud_words.sort(key=lambda item: unicode.coll(item[0]))

    html_elements = []
    for index, (word, count) in enumerate(cloud_words):
        # float() keeps this a true division even for integer counts under
        # Python 2 without `from __future__ import division`; otherwise every
        # word except the most frequent one would get the minimum font size.
        font_factor = float(count - min_count) / delta_count
        font_size = int(min_font_size + font_factor * font_delta)
        # NOTE(review): `word` is interpolated without HTML escaping; confirm
        # that callers only pass markup-free words.
        # The trailing space separates neighbouring links.
        html_elements.append(
            '<a href="search/%s">'
            '<span style="font-size:%spx">%s</span></a> '
            % (index, font_size, word)
        )

    html_body = ''.join(['<body>', '\n'.join(html_elements), '\n</body>\n'])
    html_doc = ''.join(['<html><head>', CLOUD_CSS, '</head>', html_body, '</html>'])
    return (cloud_words, html_doc)
def word_count_dict_to_html(word_count_dict, type, ignore_list, include_list):
    """Render the most frequent entries of ``word_count_dict`` as an HTML cloud.

    Args:
        word_count_dict: Mapping of word -> occurrence count.
        type: Cloud type. For ``"word"`` clouds, words of four characters or
            fewer are dropped unless their lowercase form is in
            ``include_list`` (compared case-insensitively).
        ignore_list: Words to leave out; a word is dropped when its lowercase
            form is contained in this collection.
        include_list: Short words that should still appear in a ``"word"``
            cloud.

    Returns:
        A tuple ``(cloud_words, html_doc)``. ``cloud_words`` is the list of
        selected ``(word, count)`` pairs in the order they appear in the
        document; the number in each ``search/<index>`` link is the position
        of the word in that list. ``([], "")`` is returned when no word
        survives the filtering.
    """
    logging.debug("Turning the word_count_dict into html")
    logging.debug("Length word_count_dict: %s", len(word_count_dict))

    # Ascending by count, so the most frequent words end up at the tail.
    by_count = sorted(word_count_dict.items(), key=lambda item: item[1])

    if type == "word":
        # Drop short words unless they are explicitly included.
        included = set(word.lower() for word in include_list)
        by_count = [
            (word, freq) for word, freq in by_count
            if len(word) > 4 or word.lower() in included
        ]
        logging.debug("Filtered short words. Length word_count_dict: %s", len(by_count))

    # NOTE(review): indentation was lost in the reviewed source; the ignore
    # filter is assumed to apply to every cloud type, not only "word".
    by_count = [
        (word, freq) for word, freq in by_count
        if word.lower() not in ignore_list
    ]
    logging.debug("Filtered blacklist words. Length word_count_dict: %s", len(by_count))

    number_of_words = 42
    # Keep only the most frequent words. If fewer than number_of_words are
    # left, all of them are taken.
    cloud_words = by_count[-number_of_words:]
    logging.debug("Selected most frequent words. Length CloudWords: %s", len(cloud_words))

    if not cloud_words:
        return [], ""

    # cloud_words is still ordered by count here, so the ends are min and max.
    min_count = cloud_words[0][1]
    max_count = cloud_words[-1][1]
    logging.debug("Min word count: %s, Max word count: %s", min_count, max_count)

    delta_count = max_count - min_count
    if delta_count == 0:
        # All selected words are equally frequent; avoid dividing by zero.
        delta_count = 1

    min_font_size = 10
    max_font_size = 50
    font_delta = max_font_size - min_font_size

    # Order the words for display. `unicode` is expected to be a module-level
    # name providing `coll` (presumably a collation helper, not the builtin
    # type) -- confirm against the file's imports.
    cloud_words.sort(key=lambda item: unicode.coll(item[0]))
    logging.debug("Sorted cloud words. Length CloudWords: %s", len(cloud_words))

    css = """\
<style type="text/css">
    body {
        font-family: sans-serif;
        text-align: center;
    }
    a:link { color:black; text-decoration:none; }
    a:visited { color:black; text-decoration:none; }
    a:focus { color:black; text-decoration:none; }
    a:hover { color:black; text-decoration:none; }
    a:active { color:black; text-decoration:none; }
</style>"""

    html_elements = []
    for index, (word, count) in enumerate(cloud_words):
        # float() keeps this a true division even for integer counts under
        # Python 2 without `from __future__ import division`; otherwise every
        # word except the most frequent one would get the minimum font size.
        font_factor = float(count - min_count) / delta_count
        font_size = int(min_font_size + font_factor * font_delta)
        # NOTE(review): `word` is interpolated without HTML escaping; confirm
        # that callers only pass markup-free words.
        # The trailing space separates neighbouring links.
        html_elements.append(
            '<a href="search/%s">'
            '<span style="font-size:%spx">%s</span></a> '
            % (index, font_size, word)
        )

    html_body = "<body>" + "\n".join(html_elements) + "\n</body>\n"
    html_doc = "<html><head>" + css + "</head>" + html_body + "</html>"
    return (cloud_words, html_doc)
def word_count_dict_to_html(word_count_dict, type, ignore_list, include_list):
    '''Render the most frequent entries of ``word_count_dict`` as an HTML cloud.

    Args:
        word_count_dict: Mapping of word -> occurrence count.
        type: Cloud type. For ``'word'`` clouds, words of four characters or
            fewer are dropped unless their lowercase form is in
            ``include_list`` (compared case-insensitively).
        ignore_list: Words to leave out; a word is dropped when its lowercase
            form is contained in this collection.
        include_list: Short words that should still appear in a ``'word'``
            cloud.

    Returns:
        A tuple ``(cloud_words, html_doc)``. ``cloud_words`` is the list of
        selected ``(word, count)`` pairs in the order they appear in the
        document; the number in each ``search/<index>`` link is the position
        of the word in that list. ``([], '')`` is returned when no word
        survives the filtering.
    '''
    logging.debug('Turning the word_count_dict into html')
    logging.debug('Length word_count_dict: %s', len(word_count_dict))

    # Ascending by count, so the most frequent words end up at the tail.
    words_by_freq = sorted(word_count_dict.items(), key=lambda pair: pair[1])

    if type == 'word':
        # Drop short words unless they are explicitly included.
        lowered_includes = set(word.lower() for word in include_list)
        words_by_freq = [
            (word, freq) for word, freq in words_by_freq
            if len(word) > 4 or word.lower() in lowered_includes
        ]
        logging.debug('Filtered short words. Length word_count_dict: %s',
                      len(words_by_freq))

    # NOTE(review): indentation was lost in the reviewed source; the ignore
    # filter is assumed to apply to every cloud type, not only 'word'.
    words_by_freq = [
        (word, freq) for word, freq in words_by_freq
        if word.lower() not in ignore_list
    ]
    logging.debug('Filtered blacklist words. Length word_count_dict: %s',
                  len(words_by_freq))

    number_of_words = 42
    # Keep only the most frequent words. If fewer than number_of_words are
    # left, all of them are taken.
    cloud_words = words_by_freq[-number_of_words:]
    logging.debug('Selected most frequent words. Length CloudWords: %s',
                  len(cloud_words))

    if not cloud_words:
        return [], ''

    # cloud_words is still ordered by count here, so the ends are min and max.
    min_count = cloud_words[0][1]
    max_count = cloud_words[-1][1]
    logging.debug('Min word count: %s, Max word count: %s', min_count, max_count)

    delta_count = max_count - min_count
    if delta_count == 0:
        # All selected words are equally frequent; avoid dividing by zero.
        delta_count = 1

    min_font_size = 10
    max_font_size = 50
    font_delta = max_font_size - min_font_size

    # Order the words for display. `unicode` is expected to be a module-level
    # name providing `coll` (presumably a collation helper, not the builtin
    # type) -- confirm against the file's imports.
    cloud_words.sort(key=lambda pair: unicode.coll(pair[0]))
    logging.debug('Sorted cloud words. Length CloudWords: %s', len(cloud_words))

    css = '''\
<style type="text/css">
    body {
        font-family: sans-serif;
        text-align: center;
    }
    a:link { color:black; text-decoration:none; }
    a:visited { color:black; text-decoration:none; }
    a:focus { color:black; text-decoration:none; }
    a:hover { color:black; text-decoration:none; }
    a:active { color:black; text-decoration:none; }
</style>'''

    html_elements = []
    for index, (word, count) in enumerate(cloud_words):
        # float() keeps this a true division even for integer counts under
        # Python 2 without `from __future__ import division`; otherwise every
        # word except the most frequent one would get the minimum font size.
        font_factor = float(count - min_count) / delta_count
        font_size = int(min_font_size + font_factor * font_delta)
        # NOTE(review): `word` is interpolated without HTML escaping; confirm
        # that callers only pass markup-free words.
        # The trailing space separates neighbouring links.
        html_elements.append(
            '<a href="search/%s">'
            '<span style="font-size:%spx">%s</span></a> '
            % (index, font_size, word)
        )

    html_body = '<body>' + '\n'.join(html_elements) + '\n</body>\n'
    html_doc = '<html><head>' + css + '</head>' + html_body + '</html>'
    return (cloud_words, html_doc)