Example #1
    # Assumed context: Counter, defaultdict, numpy as np, and the local
    # tfidf, kmeans, and clean_doc helpers are imported at module level.
    def __init__(self, docs, num_vec_words=100, num_doc_keywords=3,
                 stopword_ratio=.15, min_word_length=3):
        """
        Calculates idf values. words that appear in more than `stopword_ratio`
        of the documents are given an idf of 0. Gets top `num_doc_keywords` from
        each doc and picks the `num_vec_words` most common.
        
        Input:
        docs - A list of (title, body) pairs.
        num_vec_words - Number of words to make the feature space out of. By
            definition, the number of dimensions. Defaults to 100.
        num_doc_keywords - Number of keywords to pick from each document.
            Defaults to 3.
        """
        self.docs = docs
        self.clean_docs = [clean_doc(doc) for doc in self.docs]
        self.idf = tfidf.TfIdf()
        for doc in self.clean_docs:
            self.idf.add_input_document(doc)
        # Default ratio of .15 was picked just by trying random numbers.
        self.idf.stopwords = self.idf.calculate_stopwords(stopword_ratio)

        # Above the top 3 keywords per doc, stopwords started creeping in.
        self.top_words = [(word, score) for doc in self.clean_docs
                for (word, score) in self.idf.get_doc_keywords(doc)[:num_doc_keywords]
                if len(word) >= min_word_length and score > 0]
        self.top_word_counts = Counter(p[0] for p in self.top_words)
        self.key_words = [word for (word, _) in self.top_word_counts.most_common(num_vec_words)]

        self.vecs_to_docs = defaultdict(list)
        for (raw_doc, doc) in zip(self.docs, self.clean_docs):
            doc_dict = dict(self.idf.get_doc_keywords(doc))
            vector = tuple(doc_dict.get(word, 0) for word in self.key_words)
            self.vecs_to_docs[vector].append(raw_doc)
        self.doc_vectors = np.array(list(self.vecs_to_docs.keys()))
        self.tree = kmeans.bisecting_kmeans(self.doc_vectors)
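
A minimal usage sketch for the clusterer above. The class name DocClusterer
and the toy documents are assumptions for illustration; only the __init__
shown above comes from the source.

docs = [
    ("Intro to k-means", "k-means partitions points into k clusters ..."),
    ("TF-IDF basics", "tf-idf weighs terms by frequency and rarity ..."),
    ("Bisecting k-means", "split the largest cluster in two and repeat ..."),
]
clusterer = DocClusterer(docs, num_vec_words=50, num_doc_keywords=3)
print(clusterer.key_words[:10])  # the words spanning the feature space
print(clusterer.tree)            # cluster tree from bisecting k-means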
Example #2
# Assumed context for this view: numpy, a local kmeans module, and the
# form class (its import path below is a guess).
import numpy

from django.shortcuts import render_to_response
from django.template import RequestContext

import kmeans
from .forms import kmeansNumSamplesForm


def kmeans_test(request):
    data = None
    form = kmeansNumSamplesForm()
    k = None
    sample_size = None
    grouped_data = None
    clusters = None
    error_list = None
    if request.method == 'POST':
        form = kmeansNumSamplesForm(request.POST)
        if form.is_valid():
            sample_size = int(form.cleaned_data['num_samples'])
            k = int(form.cleaned_data['k'])
            # Generate random 2-d sample points.
            data = numpy.random.random((sample_size, 2))
            # Cluster with whichever method the user chose.
            if form.cleaned_data['method'] == 'Basic':
                grouped_data, clusters, error_list = kmeans.kmeans(
                    data, num_clusters=k, min_error=0.01, max_iter=100)
            else:
                grouped_data, clusters, error_list = kmeans.bisecting_kmeans(
                    data, k=k, min_error=0.01, max_iter=50)
    # render_to_response with context_instance is pre-Django-1.10 style.
    return render_to_response('visualization/kmeans.html', {
        'data': grouped_data,
        'clusters': clusters,
        'error_list': error_list,
        'form': form,
        'k': k,
        'sample_size': sample_size,
        }, context_instance=RequestContext(request))
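
Both branches above call into a local kmeans module that is not shown. As a
point of reference, here is a self-contained sketch of bisecting k-means:
start with one cluster and repeatedly 2-means-split the largest cluster until
k clusters exist. The (grouped points, centroids, error list) return shape
mirrors the call above, but everything here is an illustrative assumption,
not the module's actual code. Assumes k <= len(data).

import numpy as np

def bisecting_kmeans_sketch(data, k=2, min_error=0.01, max_iter=50):
    clusters = [np.asarray(data)]
    error_list = []
    while len(clusters) < k:
        # Split the cluster that currently holds the most points.
        idx = max(range(len(clusters)), key=lambda i: len(clusters[i]))
        points = clusters.pop(idx)
        # Plain 2-means on that cluster, seeded from two random members.
        centroids = points[np.random.choice(len(points), 2, replace=False)]
        for _ in range(max_iter):
            dists = np.linalg.norm(points[:, None] - centroids[None], axis=2)
            labels = dists.argmin(axis=1)
            new_centroids = np.array([
                points[labels == j].mean(axis=0) if np.any(labels == j)
                else centroids[j]  # keep the old centroid if a side is empty
                for j in (0, 1)])
            shift = np.linalg.norm(new_centroids - centroids)
            centroids = new_centroids
            if shift < min_error:
                break
        error_list.append(dists.min(axis=1).sum())  # within-split error proxy
        halves = [points[labels == j] for j in (0, 1) if np.any(labels == j)]
        if len(halves) < 2:  # degenerate split: give up on this cluster
            clusters.append(points)
            break
        clusters.extend(halves)
    centroids = np.array([c.mean(axis=0) for c in clusters])
    return clusters, centroids, error_list

groups, cents, errs = bisecting_kmeans_sketch(np.random.random((200, 2)), k=4)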
Example #3
import numpy as np

import kmeans  # local module providing bisecting_kmeans


def make_tree(vecs_to_docs):
    """
    Clusters a vector->docs dict; the keys are the points to cluster.
    """
    points = np.array(list(vecs_to_docs.keys()))
    return kmeans.bisecting_kmeans(points)
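
A quick usage sketch for make_tree, with made-up 3-d keyword vectors; it
assumes the same local kmeans.bisecting_kmeans as in the snippets above.

from collections import defaultdict

vecs_to_docs = defaultdict(list)
vecs_to_docs[(0.9, 0.1, 0.0)].append(("doc a", "body a"))
vecs_to_docs[(0.1, 0.8, 0.2)].append(("doc b", "body b"))
vecs_to_docs[(0.0, 0.2, 0.7)].append(("doc c", "body c"))
tree = make_tree(vecs_to_docs)  # cluster tree over three 3-d vectors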