def init(topic_file):
    """Parse a topic file and build the corpus.

    Reads one topic name per line from ``topic_file``; for each topic,
    loads ``<dir>/<topic>.txt``, splits it into ``--id--`` / text pairs,
    and wraps each pair in a ``Doc`` via ``Doc.factory``.

    Key arguments:
    topic_file -- path to the topic file (one topic per line).

    Returns:
    (bible, docs) -- the populated ``dotdict`` corpus and the flat list
    of all documents (also stored as ``bible.docs``).
    """
    bible = dotdict()
    bible.topics = {}
    bible.words = dotdict()
    docs = []
    topic_dir = os.path.dirname(topic_file)

    # `with` guarantees the handles are closed even if parsing raises.
    with open(topic_file, 'r') as topic_handle:
        for line in topic_handle:
            # rstrip instead of [:-1]: the last line of a file often has no
            # trailing newline, and [:-1] would silently chop a character.
            topic = line.rstrip("\n")
            if not topic:
                continue

            docs_file = topic_dir + '/' + topic + '.txt'
            # Topics are stored upper-cased in the bible.
            topic = topic.upper()
            bible.topics[topic] = []

            with open(docs_file, 'r') as docs_handle:
                # BUG FIX: re.split's third positional argument is
                # maxsplit, not flags. The original passed re.I | re.M
                # (== 10) as maxsplit, truncating files with many
                # documents. Flags must be passed by keyword.
                split = re.split(r"(--\w+--)", docs_handle.read(),
                                 flags=re.I | re.M)[1:]

            # split alternates (delimiter, text); walk it in pairs. The
            # len(split) - 1 bound also guards against a trailing
            # delimiter with no text, which would IndexError.
            for i in range(0, len(split) - 1, 2):
                doc_id = split[i][2:-2]  # strip the surrounding "--"
                words = re.findall(r'\w+', split[i + 1].upper())
                doc = Doc.factory(topic, doc_id, words, bible)
                docs.append(doc)
                bible.topics[topic].append(doc)

    # Run through each document and init the TF-IDF vector.
    for doc in docs:
        doc.init(bible)

    # Set the docs in the bible.
    bible.docs = docs
    return bible, docs
def execute(self, docs):
    """ @see parent

    Bisecting clustering: start with one cluster holding every document,
    then repeatedly pick a cluster (via the abstract ``select_cluster``),
    split it with 2-means ``self.iter`` times, and keep the most similar
    bisection — until exactly ``self.k`` clusters remain.
    """
    clusters = []

    # First cluster contains all of the docs.
    first = dotdict()
    first.docs = []
    for doc in self.bible.docs:
        doc.cluster = first
        first.docs.append(doc)
    first.centroid = self.centroid(first.docs)

    # Append the first cluster to the cluster list.
    clusters.append(first)

    while len(clusters) != self.k:
        # Use the abstract select cluster method.
        cluster = self.select_cluster(clusters)
        # Remove this cluster from the current set because it will be split.
        clusters.remove(cluster)

        max_sim = float("-inf")
        max_bicluster = None
        for _ in range(self.iter):
            # Free the docs from whatever cluster they are in.
            for doc in cluster.docs:
                doc.cluster = None

            kmeans = KMeans(2)
            kmeans.bible = self.bible
            kmeans.execute(cluster.docs)
            bicluster = kmeans.clusters
            sim = kmeans.similarity()
            if sim > max_sim:
                max_sim = sim
                max_bicluster = bicluster

        # BUG FIX: the original re-assigned docs and extended `clusters`
        # with `bicluster` (the LAST trial) instead of `max_bicluster`
        # (the BEST trial), making the max-similarity search pointless.
        # Also use a fresh loop variable instead of shadowing `cluster`.
        for sub in max_bicluster:
            for doc in sub.docs:
                doc.cluster = sub

        # Add the new max bicluster to the current cluster set.
        clusters.extend(max_bicluster)

    self.clusters = clusters
def execute(self, docs):
    """ @see parent

    Bottom-up agglomerative clustering: wrap every document in its own
    singleton cluster, then merge repeatedly until exactly ``self.k``
    clusters remain.
    """
    # Every document starts out as its own one-element cluster whose
    # centroid is simply a copy of the document's TF-IDF vector.
    current = []
    for document in docs:
        singleton = dotdict()
        singleton.docs = [document]
        singleton.centroid = dict(document.tfidf)
        document.cluster = singleton
        current.append(singleton)

    # Keep merging until we are down to k clusters.
    while len(current) != self.k:
        current = self.merge(current)

    self.clusters = current