Ejemplo n.º 1
0
def word_count_dict_to_html(word_count_dict, type, ignores, includes):
    """Render the most frequent entries of a word-count mapping as HTML.

    Args:
        word_count_dict: Mapping of word -> occurrence count.
        type: Kind of cloud. For ``'word'``, words of four characters or
            fewer are dropped unless one of ``includes`` matches them.
        ignores: Sequence of objects exposing ``match(word)`` (e.g. compiled
            regular expressions). A word matched by any of them is dropped.
        includes: Sequence of objects exposing ``match(word)`` that exempt
            short words from the length filter.

    Returns:
        A ``(cloud_words, html_doc)`` tuple. ``cloud_words`` is the list of
        selected ``(word, count)`` pairs, sorted with ``unicode.coll`` as the
        key; a word's position in that list is the number used in its
        ``search/<index>`` link. When no word survives the filters,
        ``([], '')`` is returned.
    """
    number_of_words = 42
    min_font_size = 10
    max_font_size = 50

    # Ascending by frequency, so the most frequent words end up last.
    by_frequency = sorted(word_count_dict.items(), key=lambda item: item[1])

    if type == 'word':
        # Drop short words unless an include pattern matches them.
        by_frequency = [
            (word, freq) for word, freq in by_frequency
            if len(word) > 4
            or any(pattern.match(word) for pattern in includes)
        ]

    # Drop every word matched by an ignore pattern.
    by_frequency = [
        (word, freq) for word, freq in by_frequency
        if not any(pattern.match(word) for pattern in ignores)
    ]

    # Keep only the most frequent words. If fewer than number_of_words
    # remain, all of them are kept.
    cloud_words = by_frequency[-number_of_words:]
    if not cloud_words:
        return [], ''

    min_count = cloud_words[0][1]
    max_count = cloud_words[-1][1]

    # float() forces true division below even on interpreters where ``/`` on
    # two ints truncates; ``or 1.0`` avoids dividing by zero when all counts
    # are equal.
    delta_count = float(max_count - min_count) or 1.0
    font_delta = max_font_size - min_font_size

    # Order the cloud by the key that ``unicode.coll`` yields for each word
    # (``unicode`` is expected to be provided by the enclosing module).
    cloud_words.sort(key=lambda item: unicode.coll(item[0]))

    # NOTE(review): ``word`` is interpolated into the markup unescaped;
    # confirm that callers only pass HTML-safe words.
    html_elements = []
    for index, (word, count) in enumerate(cloud_words):
        font_factor = (count - min_count) / delta_count
        font_size = int(min_font_size + font_factor * font_delta)
        html_elements.append(
            '<a href="search/%s">'
            '<span style="font-size:%spx">%s</span></a>'
            # Trailing non-breaking space separates neighbouring words.
            '&#xA0;' % (index, font_size, word))

    html_body = ''.join(['<body>', '\n'.join(html_elements), '\n</body>\n'])
    html_doc = ''.join(['<html><head>', CLOUD_CSS, '</head>', html_body, '</html>'])

    return (cloud_words, html_doc)
Ejemplo n.º 2
0
def word_count_dict_to_html(word_count_dict, type, ignore_list, include_list):
    """Render the most frequent entries of a word-count mapping as HTML.

    Args:
        word_count_dict: Mapping of word -> occurrence count.
        type: Kind of cloud. For ``"word"``, words of four characters or
            fewer are dropped unless their lowercase form equals the
            lowercase form of an ``include_list`` entry.
        ignore_list: Words to drop; a word is dropped when its lowercase
            form is contained in this collection as-is.
        include_list: Short words that should be kept for ``"word"`` clouds.

    Returns:
        A ``(cloud_words, html_doc)`` tuple. ``cloud_words`` is the list of
        selected ``(word, count)`` pairs, sorted with ``unicode.coll`` as the
        key; a word's position in that list is the number used in its
        ``search/<index>`` link. When no word survives the filters,
        ``([], "")`` is returned.
    """
    number_of_words = 42
    min_font_size = 10
    max_font_size = 50

    logging.debug("Turning the word_count_dict into html")
    logging.debug("Length word_count_dict: %s", len(word_count_dict))

    # Ascending by frequency, so the most frequent words end up last.
    by_frequency = sorted(word_count_dict.items(), key=lambda item: item[1])

    if type == "word":
        # Drop short words unless they are explicitly included. The set is
        # built once so each membership test is a constant-time lookup.
        included = set(word.lower() for word in include_list)
        by_frequency = [
            (word, freq) for word, freq in by_frequency
            if len(word) > 4 or word.lower() in included
        ]
        logging.debug(
            "Filtered short words. Length word_count_dict: %s", len(by_frequency))

    # Drop blacklisted words.
    ignored = set(ignore_list)
    by_frequency = [
        (word, freq) for word, freq in by_frequency
        if word.lower() not in ignored
    ]
    logging.debug(
        "Filtered blacklist words. Length word_count_dict: %s", len(by_frequency))

    # Keep only the most frequent words. If fewer than number_of_words
    # remain, all of them are kept.
    cloud_words = by_frequency[-number_of_words:]
    logging.debug(
        "Selected most frequent words. Length CloudWords: %s", len(cloud_words))

    if not cloud_words:
        return [], ""

    min_count = cloud_words[0][1]
    max_count = cloud_words[-1][1]
    logging.debug("Min word count: %s, Max word count: %s", min_count, max_count)

    # float() forces true division below even on interpreters where ``/`` on
    # two ints truncates; ``or 1.0`` avoids dividing by zero when all counts
    # are equal.
    delta_count = float(max_count - min_count) or 1.0
    font_delta = max_font_size - min_font_size

    # Order the cloud by the key that ``unicode.coll`` yields for each word
    # (``unicode`` is expected to be provided by the enclosing module).
    cloud_words.sort(key=lambda item: unicode.coll(item[0]))
    logging.debug("Sorted cloud words. Length CloudWords: %s", len(cloud_words))

    css = """\
    <style type="text/css">
        body {
            font-family: sans-serif;
            text-align: center;
        }
        a:link { color:black; text-decoration:none; }
        a:visited { color:black; text-decoration:none; }
        a:focus { color:black; text-decoration:none; }
        a:hover { color:black; text-decoration:none; }
        a:active { color:black; text-decoration:none; }
    </style>"""

    # NOTE(review): ``word`` is interpolated into the markup unescaped;
    # confirm that callers only pass HTML-safe words.
    html_elements = []
    for index, (word, count) in enumerate(cloud_words):
        font_factor = (count - min_count) / delta_count
        font_size = int(min_font_size + font_factor * font_delta)
        html_elements.append(
            '<a href="search/%s">'
            '<span style="font-size:%spx">%s</span></a>'
            # Trailing non-breaking space separates neighbouring words.
            "&#xA0;" % (index, font_size, word)
        )

    html_body = "<body>" + "\n".join(html_elements) + "\n</body>\n"
    html_doc = "<html><head>" + css + "</head>" + html_body + "</html>"

    return (cloud_words, html_doc)
Ejemplo n.º 3
0
def word_count_dict_to_html(word_count_dict, type, ignore_list, include_list):
    '''Build an HTML word cloud from the most frequent words of a mapping.

    Args:
        word_count_dict: Mapping of word -> occurrence count.
        type: Kind of cloud. For ``'word'``, words of four characters or
            fewer are dropped unless their lowercase form equals the
            lowercase form of an ``include_list`` entry.
        ignore_list: Words to drop; a word is dropped when its lowercase
            form is contained in this collection as-is.
        include_list: Short words that should be kept for ``'word'`` clouds.

    Returns:
        A ``(cloud_words, html_doc)`` tuple. ``cloud_words`` is the list of
        selected ``(word, count)`` pairs, sorted with ``unicode.coll`` as the
        key; a word's position in that list is the number used in its
        ``search/<index>`` link. When no word survives the filters,
        ``([], '')`` is returned.
    '''
    number_of_words = 42
    min_font_size = 10
    max_font_size = 50

    logging.debug('Turning the word_count_dict into html')
    logging.debug('Length word_count_dict: %s', len(word_count_dict))

    def frequency_of(pair):
        # Each pair is (word, count).
        return pair[1]

    # Ascending by frequency, so the most frequent words end up last.
    candidates = sorted(word_count_dict.items(), key=frequency_of)

    if type == 'word':
        # Drop short words unless they are explicitly included. The set is
        # built once so each membership test is a constant-time lookup.
        keep_anyway = set(word.lower() for word in include_list)
        candidates = [
            pair for pair in candidates
            if len(pair[0]) > 4 or pair[0].lower() in keep_anyway
        ]
        logging.debug(
            'Filtered short words. Length word_count_dict: %s', len(candidates))

    # Drop blacklisted words.
    blacklist = set(ignore_list)
    candidates = [
        pair for pair in candidates if pair[0].lower() not in blacklist
    ]
    logging.debug(
        'Filtered blacklist words. Length word_count_dict: %s', len(candidates))

    # Keep only the most frequent words. If fewer than number_of_words
    # remain, all of them are kept.
    cloud_words = candidates[-number_of_words:]
    logging.debug(
        'Selected most frequent words. Length CloudWords: %s', len(cloud_words))

    if not cloud_words:
        return [], ''

    min_count = frequency_of(cloud_words[0])
    max_count = frequency_of(cloud_words[-1])
    logging.debug('Min word count: %s, Max word count: %s', min_count, max_count)

    # float() forces true division below even on interpreters where ``/`` on
    # two ints truncates; ``or 1.0`` avoids dividing by zero when all counts
    # are equal.
    delta_count = float(max_count - min_count) or 1.0
    font_delta = max_font_size - min_font_size

    # Order the cloud by the key that ``unicode.coll`` yields for each word
    # (``unicode`` is expected to be provided by the enclosing module).
    cloud_words.sort(key=lambda pair: unicode.coll(pair[0]))
    logging.debug('Sorted cloud words. Length CloudWords: %s', len(cloud_words))

    css = '''\
    <style type="text/css">
        body {
            font-family: sans-serif;
            text-align: center;
        }
        a:link { color:black; text-decoration:none; }
        a:visited { color:black; text-decoration:none; }
        a:focus { color:black; text-decoration:none; }
        a:hover { color:black; text-decoration:none; }
        a:active { color:black; text-decoration:none; }
    </style>'''

    # NOTE(review): ``word`` is interpolated into the markup unescaped;
    # confirm that callers only pass HTML-safe words.
    html_elements = []
    for index, (word, count) in enumerate(cloud_words):
        font_factor = (count - min_count) / delta_count
        font_size = int(min_font_size + font_factor * font_delta)
        html_elements.append(
            '<a href="search/%s">'
            '<span style="font-size:%spx">%s</span></a>'
            # Trailing non-breaking space separates neighbouring words.
            '&#xA0;' % (index, font_size, word))

    html_body = '<body>' + '\n'.join(html_elements) + '\n</body>\n'
    html_doc = '<html><head>' + css + '</head>' + html_body + '</html>'

    return (cloud_words, html_doc)