def top_tfidf_words(queryset_person, document_collection):
    stopwords = functions.get_stopwords()
    # tokenized text for this person; wordpunct_tokenize seems to work better than
    # many of the alternatives, but a regexp tokenizer might still be better
    person_words = question_queryset_to_unicode_text(queryset_person)
    evaluated_words = {}  # dict mapping each word to its tf-idf score
    if len(person_words) > 30:
        for word in set(person_words):  # score each distinct word only once
            # skip stop words; without them filtered out, filler words like "har", "få" and "som" dominate the top
            if word.lower() not in stopwords:
                evaluated_words[word] = document_collection.tf_idf(word, person_words)
        # sort by tf-idf score and keep the top 30 words
        sorted_words = sorted(evaluated_words.items(), key=lambda x: x[1], reverse=True)[:30]
        max_tfidf = sorted_words[0][1]
        result = [{'tag': word,
                   'tfidf': score,
                   'size': functions.font_size_from_percent(round(float(score) / max_tfidf * 100, 2))}
                  for word, score in sorted_words]
    else:
        # "Too little data to build a word cloud."
        result = [{'tag': 'For lite data til å lage ordsky.', 'tfidf': 0, 'size': 7}]
    return result
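
# Note: the sketch below is not part of this module. It is a minimal illustration of
# what document_collection.tf_idf(word, document) is assumed to compute, so the scoring
# in top_tfidf_words() is easier to follow; the project's real DocumentCollection class
# (and the exact tf/idf weighting it uses) may differ.
import math


class DocumentCollectionSketch(object):
    ''' Hypothetical collection: a list of token lists, one list per document. '''

    def __init__(self, documents):
        self.documents = documents

    def tf(self, word, document):
        # term frequency: the share of tokens in the document that equal the word
        return float(document.count(word)) / len(document)

    def idf(self, word):
        # inverse document frequency: words that are rare across the collection score higher
        containing = sum(1 for doc in self.documents if word in doc)
        return math.log(float(len(self.documents)) / (1 + containing))

    def tf_idf(self, word, document):
        return self.tf(word, document) * self.idf(word)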
def create_weigted_list(queryset):
    ''' Create a weighted list of the top words from a particular user's questions. '''
    # first get a list of stop words
    stopwords = functions.get_stopwords()
    pure_words = question_queryset_to_unicode_text(queryset)
    counts = defaultdict(int)
    for word in pure_words:
        if word.lower() not in stopwords:
            counts[word] += 1
    # drop words that occur fewer than 3 times
    pairs = [(word, count) for word, count in counts.items() if count >= 3]
    if len(pairs) > 1:  # if we have data to work with
        # sort (word, count) pairs by count and pick the top 30 words
        sorted_stems = sorted(pairs, key=lambda x: x[1], reverse=True)[:30]
        max_value = sorted_stems[0][1]
        result = [{'tag': word,
                   'freq': count,
                   'size': functions.font_size_from_percent(round(float(count) / max_value * 100, 2))}
                  for word, count in sorted_stems]
    else:
        # too little data: "Too little data to build a word cloud, no single word is used more than 3 times."
        result = [{'tag': 'For lite data til å lage ordsky, ingen enkeltord er brukt mer enn 3 ganger.',
                   'freq': 0, 'size': 7}]
    return result
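
# functions.font_size_from_percent() is defined elsewhere in the project; the helper
# below is only an assumed sketch of one plausible mapping from a word's relative
# weight (0-100 per cent of the heaviest word) to a font size for the word cloud.
def font_size_from_percent_sketch(percent, min_size=7, max_size=30):
    ''' Hypothetical linear mapping from percent of the max weight to a font size. '''
    return int(round(min_size + (max_size - min_size) * percent / 100.0))

# Hypothetical usage: the heaviest word gets the largest font; the size 7 used for the
# "too little data" placeholders above matches the assumed minimum size here.
# font_size_from_percent_sketch(100) -> 30, font_size_from_percent_sketch(10) -> 9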