def process_args(args):
    Ks = map(int, args.num_clusters.split(","))
    Ks.sort()
    num_exemplars = map(int, args.num_exemplars.split(","))
    num_exemplars.sort()
    subset_sizes = map(int, args.subset_sizes.split(","))
    subset_sizes.sort()
    num_types = map(int, args.num_types.split(",")) if not args.rand_exemplars else []
    num_types.sort()

    docs = doc.get_docs_nested(get_data_dir(args.dataset))
    random.shuffle(docs)
    num_docs = len(docs)

    subset_sizes = filter(lambda x: x >= 2 and x <= num_docs, subset_sizes)
    smallest_subset = min(subset_sizes)
    Ks = filter(lambda x: x >= 2 and x <= smallest_subset, Ks)
    num_exemplars = filter(lambda x: x >= 1 and x <= smallest_subset, num_exemplars)

    return docs, Ks, subset_sizes, num_exemplars, num_types
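# Hypothetical argument parser (a sketch, not the repo's actual CLI wiring):
# it only illustrates the attributes process_args() expects to find on `args`
# (dataset, num_clusters, num_exemplars, subset_sizes, num_types, rand_exemplars).
# Names, flags, and defaults here are assumptions for illustration.
def build_arg_parser():
    import argparse
    parser = argparse.ArgumentParser(description="Clustering experiment driver")
    parser.add_argument("dataset", help="name of the dataset directory")
    parser.add_argument("--num-clusters", dest="num_clusters", default="2",
                        help="comma-separated list of K values")
    parser.add_argument("--num-exemplars", dest="num_exemplars", default="1",
                        help="comma-separated list of exemplar counts")
    parser.add_argument("--subset-sizes", dest="subset_sizes", default="100",
                        help="comma-separated list of subset sizes")
    parser.add_argument("--num-types", dest="num_types", default="2",
                        help="comma-separated list of type counts (ignored with --rand-exemplars)")
    parser.add_argument("--rand-exemplars", dest="rand_exemplars", action="store_true",
                        help="choose exemplars at random instead of by type")
    return parser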
def main(args):
    if len(args) != 1:
        print "Usage: mds.py C clustering.pkl"
        print "  C is the cluster in clustering.pkl to display"
        sys.exit(0)
    #C = int(args[1])
    #path = args[2]

    print "Loading"
    #clustering = utils.load_obj(path)
    #docs = clustering[C].members
    docs = doc.get_docs_nested(driver.get_data_dir("small"))

    print "Calculating Pairwise Similarities"
    similarities = utils.pairwise(docs, lambda x, y: x.similarity(y))
    #print "INITIAL SIMILARITIES:"
    #utils.print_mat(similarities)
    #similarities = [[0, 93, 82, 133], [93, 0, 52, 60], [82, 52, 0, 111], [133, 60, 111, 0]]

    print "Starting MDS"
    #pos = reduction(similarities)
    pos = classicMDS(similarities)

    print "MDS:"
    utils.print_mat(pos)
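# Hypothetical visualization sketch (not part of the original driver): assuming
# classicMDS() returns an n x 2 matrix of coordinates, the embedding can be
# scatter-plotted with matplotlib for a quick visual check of the clustering.
def plot_mds(pos, labels=None):
    import matplotlib.pyplot as plt
    xs = [p[0] for p in pos]
    ys = [p[1] for p in pos]
    plt.scatter(xs, ys)
    if labels:
        # annotate each point with its document label, if provided
        for (x, y, label) in zip(xs, ys, labels):
            plt.annotate(str(label), (x, y))
    plt.show()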
def test_features_syn():
    docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
    max_size = int(sys.argv[3])
    num_combine = int(sys.argv[4])
    min_size = int(sys.argv[5])

    # group documents by label to form pure clusters
    d = collections.defaultdict(list)
    for _doc in docs:
        d[_doc.label].append(_doc)
    pure_clusters = d.values()

    # break each pure cluster into chunks of at most max_size documents
    broken_clusters = list()
    for x in xrange(10):
        for _cluster in pure_clusters:
            broken_clusters += [_cluster[i:i + max_size] for i in range(0, len(_cluster), max_size)]

    # randomly combine chunks, num_combine at a time, into mixed clusters
    combined_clusters = list()
    while broken_clusters:
        if len(broken_clusters) < num_combine:
            clusters = list(broken_clusters)
        else:
            clusters = random.sample(broken_clusters, num_combine)
        for _cluster in clusters:
            broken_clusters.remove(_cluster)
        combined_clusters.append(utils.flatten(clusters))

    clusters = map(lambda combined_cluster: cluster.Cluster(combined_cluster), combined_clusters)
    ncluster.test_features(clusters, min_size)
def test():
    docs = doc.get_docs_nested(get_data_dir("wales_20"))
    num_types = len(set(map(lambda _doc: _doc.label, docs)))
    num_subset = len(docs)
    num_seeds = 4
    initial_cluster_range = [3, 4, 5]
    min_pts = 2
    ncluster.overall(docs, num_subset, num_seeds, initial_cluster_range, min_pts)
def all_cluster():
    docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
    num_subset = int(sys.argv[3])
    num_initial_clusters = int(sys.argv[4])
    num_seeds = int(sys.argv[5])
    min_pts = int(sys.argv[6])
    outdir = os.path.join(_output_dir,
                          str(datetime.date.today()) + "_" + "_".join(sys.argv[1:]))
    ncluster.all_cluster(docs, num_subset, num_initial_clusters, num_seeds, min_pts, outdir)
def overall_experiment():
    docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
    num_types = len(set(map(lambda _doc: _doc.label, docs)))
    num_subset = len(docs)
    num_seeds = 10
    #initial_cluster_range = range(num_types / 2, int(1.5 * num_types))
    initial_cluster_range = [10]
    min_pts = 5
    ncluster.overall(docs, num_subset, num_seeds, initial_cluster_range, min_pts)
def check_init():
    docs = doc.get_docs_nested(get_data_dir("test"))
    random.shuffle(docs)
    confirm = cluster.MaxCliqueInitCONFIRM(docs, 2, 10)
    confirm._init_clusters()
    print
    print "Cluster Sim Mat"
    sim_mat = confirm.get_cluster_sim_mat()
    utils.print_mat(utils.apply_mat(sim_mat, lambda x: "%3.2f" % x))
def draw_all():
    docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
    # recreate the output directory for rendered documents
    try:
        shutil.rmtree('output/docs')
    except OSError:
        pass
    try:
        os.mkdir('output/docs')
    except OSError:
        pass
    for _doc in docs:
        _doc.draw().save("output/docs/%s.png" % _doc._id)
def subset_experiment2():
    docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
    num_types = len(set(map(lambda _doc: _doc.label, docs)))
    print "Num Types:", num_types

    initial_cluster_range = list()
    if num_types != 2:
        initial_cluster_range.append(num_types / 2)
    initial_cluster_range.append(num_types)
    initial_cluster_range.append(int(1.5 * num_types))

    subsets = [int(sys.argv[3])]
    for num_subset in subsets:
        num_seeds = 50
        min_pts = 30
        ncluster.overall(docs, num_subset, num_seeds, initial_cluster_range, min_pts)
def compare_true_templates():
    docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
    #confirm = cluster.PerfectCONFIRM(docs)
    confirm = cluster.BestPerfectCONFIRM(docs, lr=0.05)
    confirm.cluster()

    analyzer = metric.KnownClusterAnalyzer(confirm)
    analyzer.print_all()
    analyzer.draw_centers()

    analyzer.clusters[0].center.push_away(analyzer.clusters[1].center)
    print "PUSHING APART!"
    print
    print

    analyzer = metric.KnownClusterAnalyzer(confirm)
    analyzer.draw_centers()
    analyzer.print_all()
    print
    print
def double_cluster_known():
    docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
    epsilon = float(sys.argv[3])

    organizer = cluster.TemplateSorter(docs)
    organizer.go(epsilon)
    organizer.prune_clusters()
    clusters = organizer.get_clusters()
    print "Initial Clustering Complete"

    print "Reclustering..."
    centers = map(lambda x: x.center, clusters)
    organizer.go(epsilon, templates=centers)
    organizer.prune_clusters()
    clusters = organizer.get_clusters()
    print
    print

    analyzer = metric.KnownClusterAnalyzer(clusters)
    analyzer.draw_centers()
    analyzer.print_all()
def extract():
    dataset = sys.argv[2]
    outdir = "output/" + "_".join(sys.argv[1:])
    docs = doc.get_docs_nested(get_data_dir(dataset))
    random.shuffle(docs)

    rand_amounts = [10, 20, 30, 50, 75, 100]
    type_percs = [0.01, 0.25, 0.50, 0.75, 0.90, 1.0]
    #rand_amounts = [1, 2, 3, 5]
    #type_percs = [0.01, 0.50]
    num_type_seeds = 30 if dataset not in ['nist', 'wales_balanced'] else 50
    #num_type_seeds = 7 if dataset not in ['nist', 'wales_balanced'] else 50

    extractor = ncluster.FeatureExtractor(docs)
    #extractor.extract_random(os.path.join(outdir, 'rand'), rand_amounts)
    extractor.extract_type(os.path.join(outdir, 'type'), num_type_seeds, type_percs)
def extract():
    try:
        # sys.argv[3] is the number of threads
        num_seeds = int(sys.argv[4])
        feature_file = sys.argv[5]
        manifest_file = sys.argv[6]
        docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
    except:
        print "python driver.py extract dataset #threads num_seeds feature_file manifest_file"
        return

    seeds = random.sample(docs, min(num_seeds, len(docs)))
    feature_mat = ncluster.extract_features_par(docs, seeds)
    np.save(feature_file, feature_mat)

    out = open(manifest_file, 'w')
    for _doc in docs:
        out.write("%s\n" % _doc.source_file)
    out.close()
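# Hypothetical usage sketch (not part of the original driver): reloading what
# extract() writes. Note that np.save() appends ".npy" when feature_file lacks
# that extension, and the manifest lists one source file per feature-matrix row.
def load_extracted(feature_file, manifest_file):
    path = feature_file if feature_file.endswith('.npy') else feature_file + '.npy'
    feature_mat = np.load(path)
    sources = [line.strip() for line in open(manifest_file)]
    return feature_mat, sources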
def auto():
    try:
        # sys.argv[3] is the number of threads
        Ks = map(int, sys.argv[4].split(","))
        subsets = map(int, sys.argv[5].split(","))
        seeds = map(int, sys.argv[6].split(","))
        docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
    except:
        print "python driver.py auto dataset #threads Ks subsets seeds"
        return

    Ks.sort()
    subsets.sort()
    seeds.sort()

    # drop subset sizes larger than the corpus; if any were dropped, cap the
    # list with the full corpus size instead
    filtered = filter(lambda x: x < len(docs), subsets)
    if len(filtered) < len(subsets):
        filtered.append(len(docs))
    subsets = filtered

    ncluster.run_auto_minpts(docs, Ks, subsets, seeds)
def subset_experiment():
    docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
    num_types = len(set(map(lambda _doc: _doc.label, docs)))
    print "Num Types:", num_types

    initial_cluster_range = list()
    if num_types != 2:
        initial_cluster_range.append(num_types / 2)
    initial_cluster_range.append(num_types)
    initial_cluster_range.append(int(1.5 * num_types))

    possible_subsets = [100, 200, 500, 1000, 1500, 2000, 3000, 4000, 5000,
                        6000, 7000, 8000, 9000, 10000]
    subsets = list()
    for s in possible_subsets:
        if s < len(docs):
            subsets.append(s)
    subsets.append(len(docs))

    for num_subset in subsets:
        num_seeds = 50
        min_pts = 30
        ncluster.overall(docs, num_subset, num_seeds, initial_cluster_range, min_pts)
def test_par():
    docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
    num_seeds = 5
    seeds = random.sample(docs, num_seeds)

    # parallel
    start_time = time.time()
    features_par = ncluster.extract_features_par(docs, seeds)[0]
    end_time = time.time()
    print "Parallel Time elapsed: ", datetime.timedelta(seconds=(end_time - start_time))

    # serial
    start_time = time.time()
    features_ser = ncluster.extract_features(docs, seeds)[0]
    end_time = time.time()
    print "Serial Time elapsed: ", datetime.timedelta(seconds=(end_time - start_time))

    # report any entries where the parallel and serial feature matrices disagree
    for x in xrange(features_par.shape[0]):
        for y in xrange(features_par.shape[1]):
            if features_par[x, y] != features_ser[x, y]:
                print x, y, features_par[x, y], features_ser[x, y]
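# A vectorized alternative to the element-wise loop above (a sketch, assuming
# both feature matrices are NumPy arrays of the same shape): np.argwhere lists
# the (row, column) indices at which the two matrices disagree.
def diff_feature_mats(features_par, features_ser):
    return np.argwhere(features_par != features_ser)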
def cluster_known():
    docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
    random.shuffle(docs)
    param = int(sys.argv[3])
    param2 = int(sys.argv[5])
    param3 = int(sys.argv[6])
    factory = get_confirm(sys.argv[4])
    confirm = factory(docs,
        sim_thresh=param,          # BaseCONFIRM
        num_instances=3,           # InitCONFIRMs, how many instances to examine to find $num_clust clusters
        num_clust=2,               # MaxCliqueCONFIRM, how many clusters to try for
        lr=0.02,                   # WavgNet, learning rate
        instances_per_cluster=10,  # SupervisedInitCONFIRM, how many labeled instances start a cluster
        min_size=2,                # PruningCONFIRM, clusters under this size get pruned
        maxK=4,                    # MaxClustersCONFIRM, max on created clusters (doesn't work)
        num_initial_seeds=param,   # KumarCONFIRM, how many seeds to start with
        iterations=1,              # KumarCONFIRM, how many iterations to perform
        num_seeds=param,           # KumarCONFIRM, how many seeds to get each iteration
        cluster_range=(2, 5),      # KumarCONFIRM, how many clusters to search over
        seeds_per_batch=2,         # MaxCliqueSeedsKumarCONFIRM, how many seeds to get per batch
        batch_size=10,             # MaxCliqueSeedsKumarCONFIRM, how many batches
        num_per_seed=param,        # SemiSupervisedKumarCONFIRM, how many docs/label to make seeds
        init_subset=30000,         # PipelineCONFIRM, how many docs to initialize
        min_membership=1,          # PipelineCONFIRM, how many docs a cluster must have after initialization
        z_threshold=-100,          # PipelineCONFIRM, the reject threshold for the greedy pass
        use_labels=False,          # PipelineCONFIRM, skips KumarCONFIRM init and uses the labels
        use_ss=param3)
    confirm.cluster_bootstrap()
    print
    print
def test_features():
    docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
    random.shuffle(docs)
    ncluster.test_splitting(docs)