def run_baseline():
    """Random-assignment baseline: split the book ids into two clusters
    uniformly at random and print f-score/recall/precision against the
    gold standard, plus a degenerate one-big-cluster variant.
    """
    # Collect every book id from the feature CSV; these ids define the
    # universe the gold standard is filtered down to.
    gold_filter = []
    with open(FEATURE_CSV, "r") as csvfile:
        for row in csv.DictReader(csvfile):
            gold_filter.append(int(row["book_id"]))

    # Assign each book to one of two clusters uniformly at random.
    # (renamed loop variable: `id` shadowed the builtin)
    clusters = [[], []]
    for book_id in gold_filter:
        clusters[random.randint(0, len(clusters) - 1)].append(book_id)

    # Compute the gold standard once and reuse it; the original
    # recomputed it three times.
    gold_standard = get_gold_standard(gold_filter)

    f_score, recall, precision = score_clusters(clusters, gold_standard)
    print("%s,%s,%s" % (f_score, recall, precision))

    # Degenerate clustering: everything in one cluster plus a singleton.
    # 2098 is a hard-coded book id -- presumably a known outlier; verify.
    f_score, recall, precision = score_clusters(
        [clusters[0] + clusters[1], [2098]], gold_standard)
    print("f-score: %s" % f_measure(
        [set(clusters[0] + clusters[1]), set()],
        [set(gold_standard[0]), set(gold_standard[1])]))
    print("%s,%s,%s" % (f_score, recall, precision))
def run_baseline():
    gold_filter = []
    with open(FEATURE_CSV, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            gold_filter += [int(row['book_id'])]
    clusters = [[], []]
    for id in gold_filter:
        clusters[random.randint(0, len(clusters) - 1)] += [id]
    f_score, recall, precision = score_clusters(clusters,
                                                get_gold_standard(gold_filter))
    print "%s,%s,%s" % (f_score, recall, precision)
    f_score, recall, precision = score_clusters(
        [clusters[0] + clusters[1], [2098]], get_gold_standard(gold_filter))
    gold_standard = get_gold_standard(gold_filter)
    print "f-score:", f_measure(
        [set(clusters[0] + clusters[1]),
         set([])],
        [set(gold_standard[0]), set(gold_standard[1])])
    print "%s,%s,%s" % (f_score, recall, precision)
# Example #3 (scrape artifact: the bare "Example #3" / "0" lines broke Python syntax)
def cluster_things(keys_to_use, gold_standard="normal", make_pickle=False):
    """Cluster books by their CSV feature vectors with k-means.

    keys_to_use   -- feature column names to include in each vector
    gold_standard -- "normal" selects get_gold_standard; any other value
                     selects get_kincaid_cluster
    make_pickle   -- when True, persist the trained clusterer to PICKLE_FILE

    Returns the result of score_clusters(gold_clusters, book_ids).
    """
    # One pass over the CSV: record each book id and its feature vector.
    vectors = []
    gold_filter = []
    with open(FEATURE_CSV, 'r') as csvfile:
        for row in csv.DictReader(csvfile):
            gold_filter.append(int(row['book_id']))
            vectors.append([float(row[key]) for key in row
                            if key != 'book_id' and key in keys_to_use])

    if gold_standard == "normal":
        gold_clusters = get_gold_standard(gold_filter)
    else:
        gold_clusters = get_kincaid_cluster(gold_filter)

    vectors = [array(f) for f in vectors]
    clusterer = cluster.KMeansClusterer(len(gold_clusters), euclidean_distance)
    clusterer.cluster(vectors, True)
    if make_pickle:
        # Close the pickle file deterministically (the original leaked the
        # handle) and open it in binary mode as pickle expects.
        with open(PICKLE_FILE, 'wb') as pickle_file:
            pickle.dump(clusterer, pickle_file)

    # Re-classify each vector so we know which cluster every book landed in.
    # BUG FIX: the original re-read the CSV and passed
    # [array(f) for f in row_values] -- a list of 0-d arrays, one per scalar
    # feature -- to classify() instead of a single feature vector. Reusing
    # the vectors built above (vectors[i] pairs with gold_filter[i]) fixes
    # that and avoids the second file pass.
    book_ids = [[] for _ in gold_clusters]
    for book_id, vec in zip(gold_filter, vectors):
        book_ids[clusterer.classify(vec)].append(book_id)

    # NOTE(review): arguments here are (gold, predicted) while run_baseline
    # calls score_clusters(predicted, gold) -- confirm which order
    # score_clusters actually expects.
    return score_clusters(gold_clusters, book_ids)
def cluster_things(keys_to_use, gold_standard="normal", make_pickle=False):
	"""Cluster books by their CSV feature vectors with k-means.

	keys_to_use   -- feature column names to include in each vector
	gold_standard -- "normal" selects get_gold_standard; any other value
	                 selects get_kincaid_cluster
	make_pickle   -- when True, persist the trained clusterer to PICKLE_FILE

	Returns the result of score_clusters(gold_clusters, book_ids).
	"""
	# One pass over the CSV: record each book id and its feature vector.
	vectors = []
	gold_filter = []
	with open(FEATURE_CSV, 'r') as csvfile:
		for row in csv.DictReader(csvfile):
			gold_filter.append(int(row['book_id']))
			vectors.append([float(row[key]) for key in row
			                if key != 'book_id' and key in keys_to_use])

	if gold_standard == "normal":
		gold_clusters = get_gold_standard(gold_filter)
	else:
		gold_clusters = get_kincaid_cluster(gold_filter)

	vectors = [array(f) for f in vectors]
	clusterer = cluster.KMeansClusterer(len(gold_clusters), euclidean_distance)
	clusterer.cluster(vectors, True)
	if make_pickle:
		# Close the pickle file deterministically (the original leaked the
		# handle) and open it in binary mode as pickle expects.
		with open(PICKLE_FILE, 'wb') as pickle_file:
			pickle.dump(clusterer, pickle_file)

	# Re-classify each vector so we know which cluster every book landed in.
	# BUG FIX: the original re-read the CSV and passed
	# [array(f) for f in row_values] -- a list of 0-d arrays, one per scalar
	# feature -- to classify() instead of a single feature vector. Reusing
	# the vectors built above (vectors[i] pairs with gold_filter[i]) fixes
	# that and avoids the second file pass.
	book_ids = [[] for _ in gold_clusters]
	for book_id, vec in zip(gold_filter, vectors):
		book_ids[clusterer.classify(vec)].append(book_id)

	# NOTE(review): arguments here are (gold, predicted) while run_baseline
	# calls score_clusters(predicted, gold) -- confirm which order
	# score_clusters actually expects.
	return score_clusters(gold_clusters, book_ids)