def build_from_data(self, twn, docs, max_depth=5, min_doc_num=20):
    """Iteratively divide the document collection into word communities and
    return the resulting per-document labels (group.doc_labels())."""
    starttime = time.time()
    cowordnet = self.build_global_cowordnet(docs)
    com_dect = comdect.LabelCommunityDetection(min_nodes=30)
    group = CommunityGroup(cowordnet, docs, com_dect)
    real_labels = None
    if self.logfile:
        real_labels = self.load_doc_labels(docs)
        self.resoutput = open(self.logfile, 'w')
        self.resoutput.write(
            'NCluster \tAdjusted_Rand \tAdjusted_NMI \tF-Measure \tV-Measure \tprecision \trecall\n'
        )
    depth = 0
    rootcom = Community(twn, group, len(docs))
    group.add_community(rootcom)
    while depth <= max_depth:
        acoms = group.active_coms()
        if not acoms:
            break
        print 'dividing communities'
        for c in acoms:
            # Split each active community; replace the parent with its children.
            children = c.make_children()
            if children:
                group.remove_community(c)
                for ch in children:
                    group.add_community(ch)
        acoms = group.active_coms()
        if not acoms:
            break
        uncdocs = group.unclassified_docs()
        print 'Mapping unclassified documents into communities'
        Community.map_docs_to_coms(uncdocs, acoms)
        group.remove_null_community(min_doc_num)
        depth += 1
        if self.logfile:
            predicted = group.doc_labels()
            rs = cmp_cluster_result(predicted, len(group), real_labels)
            self.resoutput.write(rs)
        print 'rebuilding wordnet'
        for c in acoms:
            c.rebuild_wordnet()
    # Final pass: merge communities that turned out to be highly similar.
    self.merge_communities(group, 0.5)
    if self.logfile:
        predicted = group.doc_labels()
        rs = cmp_cluster_result(predicted, len(group), real_labels)
        self.resoutput.write(rs)
        self.resoutput.write('\r\n')
        self.resoutput.write(self.output_keywords(group).encode('utf8'))
        self.resoutput.close()
        #os.system('emacs ../doc_clustering_evalution.txt')
    self._run_time = time.time() - starttime
    print 'Elapsed time: %.2fs' % self._run_time
    return group.doc_labels()
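# ---------------------------------------------------------------------------
# Hedged sketch: cmp_cluster_result is defined elsewhere in this project and
# its implementation is not shown here. Judging from the column header written
# above, it scores the predicted labels against the gold labels; the stand-in
# below (name, signature and tab-separated format are assumptions) shows how
# three of those columns could be computed with sklearn.metrics. F-Measure,
# precision and recall are omitted for brevity.
# ---------------------------------------------------------------------------
def _cmp_cluster_result_sketch(predicted, ncluster, real_labels):
    from sklearn import metrics
    ari = metrics.adjusted_rand_score(real_labels, predicted)
    ami = metrics.adjusted_mutual_info_score(real_labels, predicted)
    vm = metrics.v_measure_score(real_labels, predicted)
    return '%d \t%.4f \t%.4f \t%.4f\n' % (ncluster, ari, ami, vm)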
def load_title_wordnet(self, min_coocur=2, min_weight=1e-3):
    titleiter = Community.wordpair_weight(self.iter_title_words(),
                                          min_coocur, min_weight)
    elist = []
    for w1, w2, co, weight in titleiter:
        elist.append({'source': w1, 'target': w2, 'weight': weight})
    return igraph.Graph.DictList(vertices=None, edges=elist)
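# ---------------------------------------------------------------------------
# Usage note (toy data, not from this project): igraph.Graph.DictList builds
# the graph from edge dictionaries; the 'source'/'target' strings become
# vertex names and any extra key such as 'weight' becomes an edge attribute.
#
#   toy_edges = [{'source': 'stock',  'target': 'market', 'weight': 0.8},
#                {'source': 'market', 'target': 'price',  'weight': 0.5}]
#   g = igraph.Graph.DictList(vertices=None, edges=toy_edges)
#   print g.vs['name']     # ['stock', 'market', 'price']
#   print g.es['weight']   # [0.8, 0.5]
# ---------------------------------------------------------------------------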
def merge_communities(self, group, merge_threshold=0.5):
    # Pairwise-compare all communities, union the pairs whose similarity
    # exceeds the threshold, then merge each resulting group of communities.
    n = len(group)
    dset = DisjoinSet(n)
    coms = list(iter(group))
    for i in range(0, n - 1):
        for j in range(i + 1, n):
            sim = Community.similarity(coms[i], coms[j])
            #print 'similarity: %.5f' % sim
            if sim > merge_threshold:
                dset.union(i, j)
    clusters = dset.sets(min_size=2)
    for c in clusters:
        group.merge_communities([coms[k] for k in c])
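# ---------------------------------------------------------------------------
# Hedged sketch: DisjoinSet is defined elsewhere in the project. Based only on
# how it is used above (constructed with n items, union(i, j) merges two
# items, sets(min_size=...) returns the surviving groups of indices), a
# minimal union-find stand-in could look like this.
# ---------------------------------------------------------------------------
class _DisjoinSetSketch(object):
    def __init__(self, n):
        self.parent = list(range(n))  # every item starts as its own root

    def find(self, x):
        # walk up to the root, halving the path as we go
        while self.parent[x] != x:
            self.parent[x] = self.parent[self.parent[x]]
            x = self.parent[x]
        return x

    def union(self, x, y):
        rx, ry = self.find(x), self.find(y)
        if rx != ry:
            self.parent[ry] = rx

    def sets(self, min_size=1):
        # group item indices by root, dropping groups smaller than min_size
        groups = {}
        for i in range(len(self.parent)):
            groups.setdefault(self.find(i), []).append(i)
        return [g for g in groups.values() if len(g) >= min_size]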
def build_global_cowordnet(self, docs, min_coocur=2):
    dociter = Community.wordpair_weight(docs, min_coocur, 0)
    # import itertools
    # co_dict = dict()
    # for words in docs:
    #     for wp in itertools.combinations(words, 2):
    #         if wp[1] > wp[0]:
    #             wp = (wp[1], wp[0])
    #         try:
    #             co_dict[wp] += 1
    #         except:
    #             co_dict[wp] = 1
    def dict2list(docs):
        for w1, w2, co, weight in docs:
            yield {'source': w1, 'target': w2, 'weight': co}
    return igraph.Graph.DictList(vertices=None, edges=dict2list(dociter))
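# ---------------------------------------------------------------------------
# Hedged sketch: Community.wordpair_weight is defined elsewhere. The
# commented-out block above hints at the counting step; the generator below is
# a runnable version of that idea which also yields the (w1, w2, cooccurrence,
# weight) tuples that dict2list expects. The real weight normalisation is not
# shown in this file, so the raw count is reused as a placeholder weight.
# ---------------------------------------------------------------------------
def _wordpair_weight_sketch(docs, min_coocur=2, min_weight=0.0):
    import itertools
    co_dict = {}
    for words in docs:                               # each doc: list of tokens
        for w1, w2 in itertools.combinations(words, 2):
            if w2 < w1:
                w1, w2 = w2, w1                      # canonical pair ordering
            co_dict[(w1, w2)] = co_dict.get((w1, w2), 0) + 1
    for (w1, w2), co in co_dict.items():
        weight = float(co)                           # placeholder (assumption)
        if co >= min_coocur and weight >= min_weight:
            yield w1, w2, co, weight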