Example #1
def top_tfidf_words(queryset_person, document_collection):
    # functions and question_queryset_to_unicode_text are helpers defined elsewhere in this project.
    stopwords = functions.get_stopwords()
    # question_queryset_to_unicode_text flattens the tokenized question titles into one token list;
    # fast way to join lists: http://stackoverflow.com/questions/716477/join-list-of-lists-in-python
    # wordpunct_tokenize seems to work better than many other tokenizers, but would a regexp be better?
    person_words = question_queryset_to_unicode_text(queryset_person)

    evaluated_words = {}                                # dict mapping word -> tf-idf score

    if len(person_words) > 30:
        for word in set(person_words):                  # score each distinct word only once
            # without the stopword filter, common words like "har", "få", "som" end up on top
            if word.lower() not in stopwords:
                evaluated_words[word] = document_collection.tf_idf(word, person_words)
        sorted_words = sorted(evaluated_words.items(), key=lambda x: x[1], reverse=True)[:29]
        max_tfidf = sorted_words[0][1]
        # scale each score against the maximum; round(n, 2) rounds the percentage before it is mapped to a font size
        result = [{'tag': x, 'tfidf': y, 'size': functions.font_size_from_percent(round(float(y)/max_tfidf*100, 2))} for x, y in sorted_words]
    else:
        # message: "Too little data to make a word cloud."
        result = [{'tag': 'For lite data til å lage ordsky.', 'tfidf': 0, 'size': 7}]
    return result
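The tf_idf(term, text) call matches the API of NLTK's TextCollection, so document_collection is presumably one of those, built from every user's question text. Below is a minimal, self-contained sketch of just the scoring step under that assumption; the toy corpus and token lists are made up for illustration.

# Sketch of the tf-idf scoring step, assuming document_collection is an
# nltk.TextCollection. The corpus here is a hypothetical toy example.
from nltk.text import TextCollection

corpus = [
    ['skatt', 'på', 'bolig', 'skatt'],      # one token list per user
    ['vei', 'og', 'jernbane', 'bygging'],
    ['skatt', 'på', 'inntekt'],
]
collection = TextCollection(corpus)

person_words = corpus[0]
scores = {w: collection.tf_idf(w, person_words) for w in set(person_words)}
for word, score in sorted(scores.items(), key=lambda x: x[1], reverse=True):
    print(word, round(score, 4))
# 'bolig' occurs in only one document, so its idf (and tf-idf) is higher
# than 'skatt', which appears in two documents.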
Example #2
from collections import defaultdict

def create_weigted_list(queryset):
    ''' create a weighted list of the top words in questions from a particular user '''
    # first get a list of stop words
    stopwords = functions.get_stopwords()
    pure_words = question_queryset_to_unicode_text(queryset)

    # count how often each non-stopword occurs
    counts = defaultdict(int)
    for word in pure_words:
        if word.lower() not in stopwords:
            counts[word] += 1

    pairs = [(x, y) for x, y in counts.items() if y >= 3]                       # keep only words that occur at least 3 times
    if len(pairs) > 1:
        # enough data to work with:
        sorted_stems = sorted(pairs, key=lambda x: x[1], reverse=True)[:30]     # sort (word, count) pairs by count, keep the top n
        max_value = sorted_stems[0][1]
        # scale each count against the maximum; round(n, 2) rounds the percentage
        result = [{'tag': x, 'freq': y, 'size': functions.font_size_from_percent(round(float(y)/max_value*100, 2))} for x, y in sorted_stems]
    else:
        # message: "Too little data to make a word cloud; no single word is used more than 3 times."
        result = [{'tag': 'For lite data til å lage ordsky, ingen enkeltord er brukt mer enn 3 ganger.', 'freq': 0, 'size': 7}]
    return result
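functions.font_size_from_percent is a project helper whose implementation is not shown. The sketch below runs the same counting and scaling pipeline end to end with an assumed stand-in that maps a 0–100 percentage linearly onto a font-size range; that mapping, the stopword list, and the token data are illustrative guesses, not the project's actual values.

# Self-contained sketch of the counting/scaling pipeline above.
from collections import defaultdict

def font_size_from_percent(percent, min_size=7, max_size=30):
    # assumed stand-in: scale 0-100% linearly into a font-size range
    return int(round(min_size + (max_size - min_size) * percent / 100.0))

stopwords = {'og', 'på', 'i'}          # toy stopword list
pure_words = ['skatt', 'skatt', 'skatt', 'bolig', 'bolig', 'bolig', 'og', 'vei']

counts = defaultdict(int)
for word in pure_words:
    if word.lower() not in stopwords:
        counts[word] += 1

pairs = [(w, c) for w, c in counts.items() if c >= 3]
sorted_stems = sorted(pairs, key=lambda x: x[1], reverse=True)[:30]
max_value = sorted_stems[0][1]
print([{'tag': w, 'freq': c,
        'size': font_size_from_percent(round(float(c) / max_value * 100, 2))}
       for w, c in sorted_stems])
# 'skatt' and 'bolig' both occur 3 times, so each scales to 100% and gets
# the maximum font size; 'vei' (1 occurrence) and the stopword 'og' are dropped.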