Example #1
def process_args(args):
    Ks = map(int, args.num_clusters.split(","))
    Ks.sort()

    num_exemplars = map(int, args.num_exemplars.split(","))
    num_exemplars.sort()

    subset_sizes = map(int, args.subset_sizes.split(","))
    subset_sizes.sort()

    num_types = map(
        int, args.num_types.split(",")) if not args.rand_exemplars else []
    num_types.sort()

    docs = doc.get_docs_nested(get_data_dir(args.dataset))
    random.shuffle(docs)
    num_docs = len(docs)

    subset_sizes = filter(lambda x: x >= 2 and x <= num_docs, subset_sizes)
    smallest_subset = min(subset_sizes)

    Ks = filter(lambda x: x >= 2 and x <= smallest_subset, Ks)
    num_exemplars = filter(lambda x: x >= 1 and x <= smallest_subset,
                           num_exemplars)

    return docs, Ks, subset_sizes, num_exemplars, num_types
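
A minimal sketch of how the args namespace consumed by process_args might be built; the flag names and defaults below are assumptions inferred from the attribute accesses above, not the real driver's interface.

import argparse

# Hypothetical parser: process_args only needs these attributes to exist as
# comma-separated strings, plus rand_exemplars as a boolean.
parser = argparse.ArgumentParser()
parser.add_argument("dataset")
parser.add_argument("--num_clusters", default="2,5,10")
parser.add_argument("--num_exemplars", default="5,10")
parser.add_argument("--subset_sizes", default="100,500")
parser.add_argument("--num_types", default="2,4")
parser.add_argument("--rand_exemplars", action="store_true")
args = parser.parse_args()
docs, Ks, subset_sizes, num_exemplars, num_types = process_args(args)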
Example #2
def main(args):
    if len(args) != 1:
        print "Usage: mds.py C clustering.pkl"
        print "     C is the cluster in clustering.pkl to display"
        sys.exit(0)

    #C = int(args[1])
    #path = args[2]
    
    print "Loading"
    #clustering = utils.load_obj(path)

    #docs = clustering[C].members
    docs = doc.get_docs_nested(driver.get_data_dir("small"))

    print "Calculating Pairwise Similarities"
    similarities = utils.pairwise(docs, lambda x,y: x.similarity(y))

    #print "INITIAL SIMILARITIES:"
    #utils.print_mat(similarities)

    #similarities = [[0,93,82,133],[93,0,52,60],[82,52,0,111],[133,60,111,0]]

    print "Starting MDS"
    #pos = reduction(similarities)
    pos = classicMDS(similarities)

    print "MDS:"
    utils.print_mat(pos)
Example #3
def test_features_syn():
	docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
	max_size = int(sys.argv[3])
	num_combine = int(sys.argv[4])
	min_size = int(sys.argv[5])

	# group documents into pure clusters by their gold labels
	d = collections.defaultdict(list)
	for _doc in docs:
		d[_doc.label].append(_doc)
	pure_clusters = d.values()
	broken_clusters = list()
	# repeat the break-and-recombine procedure 10 times
	for x in xrange(10):
		# split each pure cluster into chunks of at most max_size documents
		for _cluster in pure_clusters:
			broken_clusters += [_cluster[i:i + max_size] for i in range(0, len(_cluster), max_size)]
		combined_clusters = list()
		# randomly merge num_combine chunks at a time until none remain
		while broken_clusters:
			if len(broken_clusters) < num_combine:
				clusters = list(broken_clusters)
			else:
				clusters = random.sample(broken_clusters, num_combine)
			for _cluster in clusters:
				broken_clusters.remove(_cluster)
			combined_clusters.append(utils.flatten(clusters))

		clusters = map(lambda combined_cluster: cluster.Cluster(combined_cluster), combined_clusters)
		ncluster.test_features(clusters, min_size)
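
For reference, a small hedged illustration of the chunking step above in isolation, with plain integers standing in for document objects:

# A 7-element cluster split into chunks of at most max_size = 3 becomes
# [1, 2, 3], [4, 5, 6], [7]; the while loop above then randomly recombines
# such chunks num_combine at a time into synthetic clusters.
_cluster = [1, 2, 3, 4, 5, 6, 7]
max_size = 3
chunks = [_cluster[i:i + max_size] for i in range(0, len(_cluster), max_size)]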
Example #4
def test():
	docs = doc.get_docs_nested(get_data_dir("wales_20"))
	num_types = len(set(map(lambda _doc: _doc.label, docs)))
	num_subset = len(docs)
	num_seeds = 4
	initial_cluster_range = [3, 4, 5]
	min_pts = 2
	ncluster.overall(docs, num_subset, num_seeds, initial_cluster_range, min_pts)
Example #5
def all_cluster():
	docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
	num_subset = int(sys.argv[3])
	num_initial_clusters = int(sys.argv[4])
	num_seeds = int(sys.argv[5])
	min_pts = int(sys.argv[6])
	outdir = os.path.join(_output_dir, str(datetime.date.today()) + "_" + "_".join(sys.argv[1:]))
	ncluster.all_cluster(docs, num_subset, num_initial_clusters, num_seeds, min_pts, outdir)
Example #6
def overall_experiment():
	docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
	num_types = len(set(map(lambda _doc: _doc.label, docs)))
	num_subset = len(docs)
	num_seeds = 10
	#initial_cluster_range = range(num_types / 2, int(1.5 * num_types))
	initial_cluster_range = [10]
	min_pts = 5
	ncluster.overall(docs, num_subset, num_seeds, initial_cluster_range, min_pts)
Example #7
def check_init():
	docs = doc.get_docs_nested(get_data_dir("test"))
	random.shuffle(docs)
	confirm = cluster.MaxCliqueInitCONFIRM(docs, 2, 10)
	confirm._init_clusters()

	print
	print "Cluster Sim Mat"
	sim_mat = confirm.get_cluster_sim_mat()
	utils.print_mat(utils.apply_mat(sim_mat, lambda x: "%3.2f" % x))
Example #8
def draw_all():
	docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
	try:
		shutil.rmtree('output/docs')
	except OSError:
		pass  # nothing to remove
	try:
		os.mkdir('output/docs')
	except OSError:
		pass  # directory already exists
	for _doc in docs:
		_doc.draw().save("output/docs/%s.png" % _doc._id)
Example #9
def subset_experiment2():
	docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))

	num_types = len(set(map(lambda _doc: _doc.label, docs)))
	print "Num Types:", num_types
	initial_cluster_range = list()
	if num_types != 2:
		initial_cluster_range.append(num_types / 2)
	initial_cluster_range.append(num_types)
	initial_cluster_range.append(int(1.5 * num_types))

	subsets = [int(sys.argv[3])]
	for num_subset in subsets:
		num_seeds = 50
		min_pts = 30
		ncluster.overall(docs, num_subset, num_seeds, initial_cluster_range, min_pts)
Example #10
def compare_true_templates():
	docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
	#confirm = cluster.PerfectCONFIRM(docs)
	confirm = cluster.BestPerfectCONFIRM(docs, lr=0.05)
	confirm.cluster()
	analyzer = metric.KnownClusterAnalyzer(confirm)
	analyzer.print_all()
	analyzer.draw_centers()
	analyzer.clusters[0].center.push_away(analyzer.clusters[1].center)
	print "PUSHING APART!"
	print
	print
	analyzer = metric.KnownClusterAnalyzer(confirm)
	analyzer.draw_centers()
	analyzer.print_all()
	print
	print
Example #11
def double_cluster_known():
        docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
        epsilon = float(sys.argv[3])
        organizer = cluster.TemplateSorter(docs)
        organizer.go(epsilon)
        organizer.prune_clusters()
        clusters = organizer.get_clusters()
        print "Initial Clustering Complete"
        print "Reclustering..."
        centers = map(lambda x: x.center, clusters)
        organizer.go(epsilon,templates=centers)
        organizer.prune_clusters()
        clusters = organizer.get_clusters()
        print
        print
        analyzer = metric.KnownClusterAnalyzer(clusters)
        analyzer.draw_centers()
        analyzer.print_all()
Example #12
def extract():
	dataset = sys.argv[2]
	outdir = "output/" + "_".join(sys.argv[1:])

	docs = doc.get_docs_nested(get_data_dir(dataset))
	random.shuffle(docs)

	rand_amounts = [10, 20, 30, 50, 75, 100]
	type_percs = [0.01, 0.25, 0.50, 0.75, 0.90, 1.0]

	#rand_amounts = [1, 2, 3, 5]
	#type_percs = [0.01, 0.50]

	num_type_seeds = 30 if dataset not in ['nist', 'wales_balanced'] else 50
	#num_type_seeds = 7 if dataset not in ['nist', 'wales_balanced'] else 50

	extractor = ncluster.FeatureExtractor(docs)
	#extractor.extract_random(os.path.join(outdir, 'rand'), rand_amounts)
	extractor.extract_type(os.path.join(outdir, 'type'), num_type_seeds, type_percs)
Example #13
def extract():
	try:
		# sys.argv[3] is the number of threads
		num_seeds = int(sys.argv[4])
		feature_file = sys.argv[5]
		manifest_file = sys.argv[6]
		docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
	except:
		print "python driver.py extract dataset #threads num_seeds feature_file manifest_file"
		return

	seeds = random.sample(docs, min(num_seeds, len(docs)))
	feature_mat = ncluster.extract_features_par(docs, seeds)
	np.save(feature_file, feature_mat)
	out = open(manifest_file, 'w')
	for _doc in docs:
		out.write("%s\n" % _doc.source_file)

	out.close()
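
A hedged sketch of reading these artifacts back later; the file names are placeholders for whatever was passed on the command line, and the pairing of matrix rows with manifest lines is assumed from the write order above.

import numpy as np

# Illustrative reload of the saved features and manifest (np.save appends
# ".npy" to the file name if it is missing).
feature_mat = np.load("features.npy")
with open("manifest.txt") as f:
	source_files = [line.strip() for line in f]
# Presumably one feature row per document, in the same order as the manifest.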
Example #14
def auto():
	try:
		# sys.argv[3] is the number of threads
		Ks = map(int, sys.argv[4].split(","))
		subsets = map(int, sys.argv[5].split(","))
		seeds = map(int, sys.argv[6].split(","))
		docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
		
	except:
		print "python driver.py auto dataset #threads Ks subsets seeds"
		return

	Ks.sort()
	subsets.sort()
	seeds.sort()
	filtered = filter(lambda x: x < len(docs), subsets)
	if len(filtered) < len(subsets):
		filtered.append(len(docs))
		subsets = filtered

	ncluster.run_auto_minpts(docs, Ks, subsets, seeds)
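
A small hedged illustration of the subset adjustment above: requested subsets at least as large as the corpus are dropped, and the full corpus size is appended in their place (the counts here are made up).

# e.g. with 800 documents and requested subset sizes of 100, 500, and 10000:
subsets = [100, 500, 10000]
filtered = filter(lambda x: x < 800, subsets)   # [100, 500]
if len(filtered) < len(subsets):
	filtered.append(800)                        # [100, 500, 800]
	subsets = filtered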
Example #15
def subset_experiment():
	docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))

	num_types = len(set(map(lambda _doc: _doc.label, docs)))
	print "Num Types:", num_types
	initial_cluster_range = list()
	if num_types != 2:
		initial_cluster_range.append(num_types / 2)
	initial_cluster_range.append(num_types)
	initial_cluster_range.append(int(1.5 * num_types))

	possible_subsets = [100, 200, 500, 1000, 1500, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
	subsets = list()
	for s in possible_subsets:
		if s < len(docs):
			subsets.append(s)
	subsets.append(len(docs))
	for num_subset in subsets:
		num_seeds = 50
		min_pts = 30
		ncluster.overall(docs, num_subset, num_seeds, initial_cluster_range, min_pts)
Example #16
def test_par():
	docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
	num_seeds = 5
	seeds = random.sample(docs, num_seeds)

	# parallel
	start_time = time.time()
	features_par = ncluster.extract_features_par(docs, seeds)[0]
	end_time = time.time()
	print "Parallel Time elapsed: ", datetime.timedelta(seconds=(end_time - start_time))

	# serial
	start_time = time.time()
	features_ser = ncluster.extract_features(docs, seeds)[0]
	end_time = time.time()
	print "Serial Time elapsed: ", datetime.timedelta(seconds=(end_time - start_time))

	for x in xrange(features_par.shape[0]):
		for y in xrange(features_par.shape[1]):
			if features_par[x,y] != features_ser[x,y]:
				print x, y, features_par[x,y], features_ser[x,y]
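
The element-wise comparison above could equivalently be written with NumPy; a brief sketch, assuming both results are plain ndarrays of the same shape:

import numpy as np

# Report every position where the parallel and serial feature matrices differ.
for x, y in np.argwhere(features_par != features_ser):
	print x, y, features_par[x, y], features_ser[x, y]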
Example #17
def process_args(args):
	Ks = map(int, args.num_clusters.split(","))
	Ks.sort()
		
	num_exemplars = map(int, args.num_exemplars.split(","))
	num_exemplars.sort()

	subset_sizes = map(int, args.subset_sizes.split(","))
	subset_sizes.sort()

	num_types =  map(int, args.num_types.split(",")) if not args.rand_exemplars else []
	num_types.sort()

	docs = doc.get_docs_nested(get_data_dir(args.dataset))
	random.shuffle(docs)
	num_docs = len(docs)

	subset_sizes = filter(lambda x: x >= 2 and x <= num_docs, subset_sizes)
	smallest_subset = min(subset_sizes)

	Ks = filter(lambda x: x >= 2 and x <= smallest_subset, Ks)
	num_exemplars = filter(lambda x: x >= 1 and x <= smallest_subset, num_exemplars)

	return docs, Ks, subset_sizes, num_exemplars, num_types
Example #18
def cluster_known():
	docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
	random.shuffle(docs)
	param = int(sys.argv[3])
	param2 = int(sys.argv[5])
	param3 = int(sys.argv[6])

	factory = get_confirm(sys.argv[4])
	confirm = factory(docs, 
		sim_thresh=param,		 	# BaseCONFIRM
		num_instances=3, 			# InitCONFIRMs, how many instances to examine to find $num_clust clusters
		num_clust=2,  				# MaxCliqueCONFIRM, how many clusters to try for
		lr=0.02,  					# WavgNet, learning rate
		instances_per_cluster=10,  	# SupervisedInitCONFIRM, how many labeled instances start a cluster
		min_size=2, 				# PruningCONFIRM, clusters under this size get pruned
		maxK=4, 					# MaxClustersCONFIRM, maximum number of created clusters (doesn't work)

		num_initial_seeds=param, 	# KumarCONFIRM, how many seeds to start with
		iterations=1,				# KumarCONFIRM, how many iterations to perform
		num_seeds=param,            # KumarCONFIRM, how many seeds to get each iteration
		cluster_range=(2,5),	 	# KumarCONFIRM, how many clusters to search over

		seeds_per_batch=2,  		# MaxCliqueSeedsKumarCONFIRM, how many seeds to get per batch
		batch_size=10,  			# MaxCliqueSeedsKumarCONFIRM, how many batches
		num_per_seed=param,			# SemiSupervisedKumarCONFIRM, how many docs/label to make seeds

		init_subset=30000,			# PipelineCONFIRM, how many docs to initialize
		min_membership=1, 			# PipelineCONFIRM, how many docs a cluster must have after initialization
		z_threshold=-100,			# PipelineCONFIRM, the reject threshold for the greedy pass
		use_labels=False,			# PipelineCONFIRM, Skips kumarconfirm init and uses the labels
		use_ss=param3
		)

	confirm.cluster_bootstrap()
	print
	print
Example #19
def test_features():
	docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
	random.shuffle(docs)
	ncluster.test_splitting(docs)