def print_clusters(clusters, no_of_docs):
    """
    Prints the clusters of documents (referenced by their numbers) that
    were grouped together because of common terms, and shows how many
    documents could be clustered overall.
    @param clusters: A list of clusters; each entry holds the document
                     numbers grouped together upon a certain number of
                     common terms (themselves represented by numbers).
    @param no_of_docs: The number (int) of documents the clustered
                       collection consists of.
    """
    if len(clusters) == 0 or no_of_docs == 0:
        raise VoidStructureError("Please provide non-empty/non-zero values.")

    # Collect the size of each cluster and the set of all documents
    # that ended up in at least one cluster.
    set_of_docs_clustered = set()
    cluster_sizes = list()
    for docs, _ in clusters:
        cluster_sizes.append(len(docs))
        for doc in docs:
            set_of_docs_clustered.add(doc)
    cluster_sizes = sorted(cluster_sizes)
    rate_of_docs_clustered = float(len(set_of_docs_clustered)) / no_of_docs

    print "Lowest IDF value considered for terms:", \
        get_def_idf_filter_val()
    print "Number of feature terms used to cluster:", \
        get_def_common_terms_no()
    print "Number of clusters built:", len(clusters)
    print "Number of docs clustered:", len(set_of_docs_clustered), "/", \
        no_of_docs
    print "Average cluster size:", \
        sum(cluster_sizes) / float(len(cluster_sizes))
    print "Median cluster size:", cluster_sizes[len(cluster_sizes) // 2]

    ten_biggest_clusters = sorted(cluster_sizes, reverse=True)[:10]
    print "Ten biggest cluster sizes:", ten_biggest_clusters
    print "Coverage of ten biggest clusters over docs clustered:", \
        sum(ten_biggest_clusters) / float(len(set_of_docs_clustered))
    print "Rate of docs clustered:", rate_of_docs_clustered
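# For illustration (the literal numbers are made up): a `clusters`
# argument as consumed by print_clusters() above and produced by
# process_project() below has the shape
#
#   [[(1, 3, 7), (12, 45, 101)],    # document numbers, common term numbers
#    [(2, 5), (8, 45)]]
#
# i.e. each entry pairs a tuple of document numbers with a tuple of the
# numbers of the terms those documents share.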
def process_project(tfidf_matrix_file, xmlcollection):
    """
    Here starts the classification upon the TF*IDF matrix.
    """
    pos_idx = get_positional_index(tfidf_matrix_file)
    no_of_docs = len(xmlcollection.get_docs())

    cluster_pairs = list()  # Collects pairs of documents clustered together
    soft_clusters = list()  # Collects the soft clusters being built

    doc_idx1 = 0
    max_doc_idx = no_of_docs - 1
    for doc_line1 in pos_idx:
        # The last document has no further document to compare to;
        # stop here.
        if doc_idx1 == max_doc_idx:
            break

        doc_idx2 = doc_idx1 + 1  # Start comparing with the next document
        terms1 = set(doc_line1)
        common_terms = set()
        soft_cluster = set()
        soft_cluster_common_terms = set()

        already_added = False
        while True:
            # Stop once the last document has been compared to
            if doc_idx2 > max_doc_idx:
                break
            terms2 = set(pos_idx[doc_idx2])
            common_terms = terms1.intersection(terms2)

            if len(common_terms) >= get_def_common_terms_no():
                # Record only terms that actually led to clustering
                soft_cluster_common_terms = \
                    soft_cluster_common_terms.union(common_terms)
                doc_no1 = doc_idx1 + 1
                doc_no2 = doc_idx2 + 1
                clustered_doc_pair = [doc_no1, doc_no2]
                if not already_added:
                    soft_cluster.add(doc_no1)
                    already_added = True
                soft_cluster.add(doc_no2)
                cluster_pairs.append([clustered_doc_pair, common_terms])
            doc_idx2 += 1

        if len(soft_cluster) > 0:
            soft_clusters.append([tuple(sorted(soft_cluster)),
                                  tuple(sorted(soft_cluster_common_terms))])
        doc_idx1 += 1

    # Print the soft cluster groups found
    print_line()
    soft_clusters = filter_subsets(soft_clusters, nested=True)
    print "Soft clustering (statistics): "
    print_clusters(soft_clusters, no_of_docs)
    print_line()

    # Print the hard cluster groups found
    print "Hard clustering (statistics): "
    hard_clusters = create_hard_clusters(soft_clusters, no_of_docs)
    print_clusters(hard_clusters, no_of_docs)

    # Write the soft & hard clusters found
    base_clust_dir = get_clustdir()
    write_clusters(xmlcollection, soft_clusters, base_clust_dir)
    write_clusters(xmlcollection, hard_clusters, base_clust_dir,
                   type_='hard')
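# Minimal smoke test with made-up document and term numbers, for
# illustration only; it assumes the project-level helpers referenced by
# print_clusters() (e.g. get_def_idf_filter_val(), get_def_common_terms_no())
# are defined when this module is run directly.
if __name__ == '__main__':
    demo_clusters = [[(1, 3, 7), (12, 45, 101)],
                     [(2, 5), (8, 45)]]
    print_clusters(demo_clusters, 10)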