def __init__(self, docs, num_vec_words=100, num_doc_keywords=3, stopword_ratio=.15, min_word_length=3):
    """
    Calculates idf values. Words that appear in more than `stopword_ratio`
    of the documents are given an idf of 0. Gets the top `num_doc_keywords`
    from each doc and picks the `num_vec_words` most common.

    Input:
        docs - A list of (title, body) pairs.
        num_vec_words - Number of words to make the feature space out of.
            By definition, the number of dimensions. Defaults to 100.
        num_doc_keywords - Number of keywords to pick from each document.
            Defaults to 3. (At > top 3, stopwords started entering.)
        stopword_ratio - Fraction of documents a word may appear in before
            it is treated as a stopword. Defaults to .15 (chosen by
            experimentation).
        min_word_length - Minimum word length for a keyword to count.
            Defaults to 3.
    """
    self.docs = docs
    self.clean_docs = [clean_doc(doc) for doc in self.docs]
    self.idf = tfidf.TfIdf()
    for doc in self.clean_docs:
        self.idf.add_input_document(doc)
    # Fix: pass the stopword_ratio parameter through instead of the
    # hard-coded .15, which silently ignored the caller's value.
    self.idf.stopwords = self.idf.calculate_stopwords(stopword_ratio)
    self.top_words = [
        (word, score)
        for doc in self.clean_docs
        for (word, score) in self.idf.get_doc_keywords(doc)[:num_doc_keywords]
        if len(word) >= min_word_length and score > 0
    ]
    self.top_word_counts = Counter(p[0] for p in self.top_words)
    self.key_words = [
        word for (word, _) in self.top_word_counts.most_common(num_vec_words)
    ]
    # Documents that map to the same feature vector share one entry.
    self.vecs_to_docs = defaultdict(list)
    for (raw_doc, doc) in zip(self.docs, self.clean_docs):
        doc_dict = dict(self.idf.get_doc_keywords(doc))
        vector = tuple(doc_dict.get(word, 0) for word in self.key_words)
        self.vecs_to_docs[vector].append(raw_doc)
    self.doc_vectors = np.array(list(self.vecs_to_docs.keys()))
    self.tree = kmeans.bisecting_kmeans(self.doc_vectors)
def kmeans_test(request):
    """
    Django view: generate random 2-D samples and cluster them with k-means.

    On POST with a valid form, draws `num_samples` uniform points in the
    unit square and clusters them into `k` groups using either basic or
    bisecting k-means, then renders the visualization template. On GET,
    renders an unbound form with empty results.
    """
    k = None
    sample_size = None
    grouped_data = None
    clusters = None
    error_list = None
    data = None
    form = kmeansNumSamplesForm()
    if request.method == 'POST':
        form = kmeansNumSamplesForm(request.POST)
        if form.is_valid():
            cleaned = form.cleaned_data
            sample_size = int(cleaned['num_samples'])
            k = int(cleaned['k'])
            # Random points in the unit square to cluster.
            data = numpy.random.random((sample_size, 2))
            if cleaned['method'] == 'Basic':
                grouped_data, clusters, error_list = kmeans.kmeans(
                    data, num_clusters=k, min_error=0.01, max_iter=100)
            else:
                grouped_data, clusters, error_list = kmeans.bisecting_kmeans(
                    data, k=k, min_error=0.01, max_iter=50)
    context = {
        'data': grouped_data,
        'clusters': clusters,
        'error_list': error_list,
        'form': form,
        'k': k,
        'sample_size': sample_size,
    }
    return render_to_response('visualization/kmeans.html', context,
                              context_instance=RequestContext(request))
def __init__(self, docs, num_vec_words=100, num_doc_keywords=3, stopword_ratio=.15, min_word_length=3):
    """
    Calculates idf values. Words that appear in more than `stopword_ratio`
    of the documents are given an idf of 0. Gets the top `num_doc_keywords`
    from each doc and picks the `num_vec_words` most common.

    Input:
        docs - A list of (title, body) pairs.
        num_vec_words - Number of words to make the feature space out of.
            By definition, the number of dimensions. Defaults to 100.
        num_doc_keywords - Number of keywords to pick from each document.
            Defaults to 3. (At > top 3, stopwords started entering.)
        stopword_ratio - Fraction of documents a word may appear in before
            it is treated as a stopword. Defaults to .15 (chosen by
            experimentation).
        min_word_length - Minimum word length for a keyword to count.
            Defaults to 3.
    """
    self.docs = docs
    self.clean_docs = [clean_doc(doc) for doc in self.docs]
    self.idf = tfidf.TfIdf()
    for doc in self.clean_docs:
        self.idf.add_input_document(doc)
    # Fix: pass the stopword_ratio parameter through instead of the
    # hard-coded .15, which silently ignored the caller's value.
    self.idf.stopwords = self.idf.calculate_stopwords(stopword_ratio)
    self.top_words = [
        (word, score)
        for doc in self.clean_docs
        for (word, score) in self.idf.get_doc_keywords(doc)[:num_doc_keywords]
        if len(word) >= min_word_length and score > 0
    ]
    self.top_word_counts = Counter(p[0] for p in self.top_words)
    self.key_words = [
        word for (word, _) in self.top_word_counts.most_common(num_vec_words)
    ]
    # Documents that map to the same feature vector share one entry.
    self.vecs_to_docs = defaultdict(list)
    for (raw_doc, doc) in zip(self.docs, self.clean_docs):
        doc_dict = dict(self.idf.get_doc_keywords(doc))
        vector = tuple(doc_dict.get(word, 0) for word in self.key_words)
        self.vecs_to_docs[vector].append(raw_doc)
    self.doc_vectors = np.array(list(self.vecs_to_docs.keys()))
    self.tree = kmeans.bisecting_kmeans(self.doc_vectors)
def make_tree(vecs_to_docs):
    """
    Cluster the feature vectors of a vector->docs mapping.

    Input:
        vecs_to_docs - dict mapping feature-vector tuples to document lists.
    Returns the cluster tree built by kmeans.bisecting_kmeans.
    """
    vectors = np.array([vec for vec in vecs_to_docs])
    return kmeans.bisecting_kmeans(vectors)