def run_baseline(): gold_filter = [] with open(FEATURE_CSV, "r") as csvfile: reader = csv.DictReader(csvfile) for row in reader: gold_filter += [int(row["book_id"])] clusters = [[], []] for id in gold_filter: clusters[random.randint(0, len(clusters) - 1)] += [id] f_score, recall, precision = score_clusters(clusters, get_gold_standard(gold_filter)) print "%s,%s,%s" % (f_score, recall, precision) f_score, recall, precision = score_clusters([clusters[0] + clusters[1], [2098]], get_gold_standard(gold_filter)) gold_standard = get_gold_standard(gold_filter) print "f-score:", f_measure( [set(clusters[0] + clusters[1]), set([])], [set(gold_standard[0]), set(gold_standard[1])] ) print "%s,%s,%s" % (f_score, recall, precision)
def run_baseline(): gold_filter = [] with open(FEATURE_CSV, 'r') as csvfile: reader = csv.DictReader(csvfile) for row in reader: gold_filter += [int(row['book_id'])] clusters = [[], []] for id in gold_filter: clusters[random.randint(0, len(clusters) - 1)] += [id] f_score, recall, precision = score_clusters(clusters, get_gold_standard(gold_filter)) print "%s,%s,%s" % (f_score, recall, precision) f_score, recall, precision = score_clusters( [clusters[0] + clusters[1], [2098]], get_gold_standard(gold_filter)) gold_standard = get_gold_standard(gold_filter) print "f-score:", f_measure( [set(clusters[0] + clusters[1]), set([])], [set(gold_standard[0]), set(gold_standard[1])]) print "%s,%s,%s" % (f_score, recall, precision)
def cluster_things(keys_to_use, gold_standard="normal", make_pickle=False):
    """K-means-cluster the books by the features named in keys_to_use and
    score the result against a gold standard.

    keys_to_use   -- feature column names (besides 'book_id') to include.
    gold_standard -- "normal" uses get_gold_standard; anything else uses
                     get_kincaid_cluster.
    make_pickle   -- if truthy, dump the trained clusterer to PICKLE_FILE.
    Returns whatever score_clusters(gold_clusters, book_ids) returns.
    """
    # Read the feature vectors (restricted to keys_to_use) and book ids.
    vectors = []
    gold_filter = []
    with open(FEATURE_CSV, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            gold_filter.append(int(row['book_id']))
            vectors.append([float(row[key]) for key in row
                            if key != 'book_id' and key in keys_to_use])

    if gold_standard == "normal":
        gold_clusters = get_gold_standard(gold_filter)
    else:
        gold_clusters = get_kincaid_cluster(gold_filter)

    vectors = [array(f) for f in vectors]
    clusterer = cluster.KMeansClusterer(len(gold_clusters), euclidean_distance)
    # Train the clusterer; per-row assignments are recomputed below, so the
    # returned assignment list is not kept.
    clusterer.cluster(vectors, True)

    if make_pickle:
        # Close the pickle file deterministically (original leaked the handle
        # via pickle.dump(clusterer, open(PICKLE_FILE, 'w'))).
        with open(PICKLE_FILE, 'w') as pickle_file:
            pickle.dump(clusterer, pickle_file)

    # Re-classify every row so we know which cluster each book fell into.
    results = [[] for _ in range(len(gold_clusters))]
    with open(FEATURE_CSV, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            row_values = [float(row[key]) for key in row
                          if key != 'book_id' and key in keys_to_use]
            # BUG FIX: classify takes a single feature vector. The original
            # passed [array(f) for f in row_values] — a *list* of
            # zero-dimensional arrays, one per scalar feature.
            results[clusterer.classify(array(row_values))].append(row)

    # Map each result cluster to the book ids it contains.
    book_ids = [[int(row['book_id']) for row in c] for c in results]

    return score_clusters(gold_clusters, book_ids)