def main():
    """Command-line entry point: print per-newsgroup IDFs, then cluster.

    Expected invocation: ``[--debug] SOURCE_DIR``.

    * ``--debug`` sets the module-level ``DEBUG`` flag to True, which makes
      this function dump every IDF table it prints.
    * The last positional argument is passed as the source directory to
      ``computeDocumentFrequency`` and ``computeNewsGroupCategory``.

    On a malformed command line, or when no positional argument is given,
    ``err()`` is called and the function stops.
    """
    global DEBUG

    try:
        # getopt expects shortopts as a string and longopts as a list of
        # option names (without the leading "--").
        opts, args = getopt.getopt(sys.argv[1:], "", ["debug"])
    except getopt.GetoptError:
        err()
        # NOTE(review): err() presumably exits the process; the explicit
        # return keeps us from touching unbound names if it does not.
        return

    if not args:
        err()
        return

    # args is the tail of sys.argv left after option parsing, so its last
    # element is the same value as sys.argv[-1].
    source_dir = args[-1]

    if any(opt == "--debug" for opt, _ in opts):
        DEBUG = True

    # Single-argument print(...) produces identical output under the
    # Python 2 print statement and the Python 3 print function.
    print("***************************************************************")
    print("Newsgroups with IDFs")
    newsgroups_IDFs = computeDocumentFrequency(source_dir)
    for newsgroup, IDF in newsgroups_IDFs.items():
        print(newsgroup)
        if DEBUG:
            print(dumps(IDF, sort_keys=True, indent=4))

    print("***************************************************************")
    print("hClustering")
    newsgroup_TFIDF = computeNewsGroupCategory(source_dir)
    root = hCluster(newsgroup_TFIDF)
    print(tree.printChildren(root))
def hCluster(S):
    """Build a dendrogram by repeatedly merging the two most similar entries.

    ``S`` maps a key (initially a category key, later a ``tree.Node`` created
    here) to a vector accepted by ``cosineSimilarity`` and ``merge``.  On each
    round the pair with the highest cosine similarity is removed from ``S``
    and replaced by a new ``tree.Node`` whose vector is ``merge`` of the two
    selected vectors.  ``S`` is modified in place and holds a single entry
    when the function returns.

    Returns the node created by the final merge (the root of the tree).
    For a single-entry ``S`` the node for that entry is returned; for an
    empty ``S`` the result is ``None``.
    """
    # Function-scope import so the block does not depend on the file header.
    from itertools import combinations

    if not S:
        return None
    if len(S) == 1:
        # Nothing to merge: the only entry is already the whole tree.
        # NOTE(review): assumes tree.assertCreateNode turns a key into a
        # node, as it is used for the children below -- confirm.
        return tree.assertCreateNode(next(iter(S)))

    node = None
    while len(S) > 1:
        # 1) find the two most similar elements e1 and e2 in S using cos().
        # Starting from None (rather than 0.0) guarantees a pair is chosen
        # even when every similarity is zero or negative.
        best_score = None
        best_pair = None
        # Each unordered pair is scored once.
        # NOTE(review): relies on cosineSimilarity being symmetric.
        for (key_a, vec_a), (key_b, vec_b) in combinations(list(S.items()), 2):
            score = cosineSimilarity(vec_a, vec_b)
            if best_score is None or score > best_score:
                best_score = score
                best_pair = (key_a, key_b)

        first, second = best_pair
        # Create the child nodes only for the winning pair.
        left = tree.assertCreateNode(first)
        right = tree.assertCreateNode(second)

        # 2) replace them in S with e1Ve2
        parent = str(first) + " && " + str(second)
        if DEBUG:
            print("parent :  %s" % (parent,))
            print("right :  %s" % (right,))
            print("left :  %s" % (left,))

        node = tree.Node(parent, right, left)
        # Merge the vectors of the *selected* pair, not whatever the search
        # loop happened to visit last.
        S[node] = merge(S[first], S[second])
        del S[first]
        del S[second]

        if DEBUG:
            print("\n")
            print("Tree: ")
            print(tree.printChildren(node))
            print("\n\n")

    return node