def skill_correlations(runs=50, n_clusters=5):
    """Plot the student count needed to reach a 0.9 score per skill correlation.

    For every run, skill correlation and clustering function, data sets of
    increasing size are requested from ``data(...)`` until the clustering's
    ``rand_index`` score against the items' ``'concept'`` column reaches 0.9.
    The first student count that reaches the threshold is recorded; if no
    listed count reaches it, nothing is recorded for that combination.

    ``data``, ``similarity``, ``euclid``, ``clusterings`` and ``rand_index``
    are not defined here and are resolved from the enclosing module.

    Args:
        runs: Number of repetitions of the whole experiment.
        n_clusters: Passed to ``data`` as ``n_concepts`` and to every
            clustering function as the number of clusters.

    Returns:
        None. The collected results are printed and drawn as a seaborn point
        plot with a logarithmic y axis; the figure is not shown here.
    """
    # Ascending order matters: the search stops at the first sufficient size.
    student_counts = [
        10, 20, 30, 50, 100, 200, 300, 500, 1000, 2000, 3000, 5000,
        10000, 20000, 50000, 100000, 200000, 500000, 1000000,
    ]
    correlations = list(np.arange(0, 0.9, 0.1)) + [0.85]

    results = []
    for run in range(runs):
        for skill_correlation in correlations:
            for clustering in clusterings:
                for n_students in student_counts:
                    answers, items = data(
                        n_students=n_students,
                        n_items=20,
                        n_concepts=n_clusters,
                        skill_correlation=skill_correlation,
                    )
                    true_cluster_names = list(items['concept'].unique())

                    X = similarity(answers)
                    items_ids = X.index
                    # ``.at`` replaces ``DataFrame.get_value``, which was
                    # removed in pandas 1.0.
                    ground_truth = np.array([
                        true_cluster_names.index(items.at[item, 'concept'])
                        for item in items_ids
                    ])

                    labels = clustering(X, n_clusters, euclid=euclid)
                    rand = rand_index(ground_truth, labels)
                    print(run, skill_correlation, clustering.__name__, n_students, '===', rand)

                    if rand >= 0.9:
                        results.append([n_students, clustering.__name__, rand, skill_correlation])
                        break

    results = pd.DataFrame(
        results,
        columns=['students', 'clustering', 'rand_index', 'skill_correlation'],
    )
    print(results)

    f, ax = plt.subplots(figsize=(7, 7))
    ax.set(yscale="log")
    sns.pointplot(data=results, x='skill_correlation', y='students', hue='clustering', ax=ax)
def students(runs=15):
    """Plot the clustering score of each similarity measure per difficulty shift.

    For every run and every ``difficulty_shift`` in ``np.arange(-1, 1.1, 0.2)``
    a data set is requested from ``data(...)``. Each entry of ``similarities``
    (a ``(function, euclid flag, name)`` triple) is then applied to the
    answers, the result is clustered with ``clustering``, and the
    ``rand_index`` score against the items' ``'concept'`` column is recorded.

    ``n_students``, ``n_items``, ``n_clusters``, ``skill_correlation``,
    ``missing``, ``clustering``, ``similarities``, ``data`` and ``rand_index``
    are not defined here and are resolved from the enclosing module.

    Args:
        runs: Number of repetitions of the whole experiment.

    Returns:
        None. The collected results are printed and drawn as a seaborn point
        plot on a new 16x24 figure; the figure is not shown here.
    """
    results = []
    for run in range(runs):
        for difficulty_shift in np.arange(-1, 1.1, 0.2):
            answers, items = data(
                n_students=n_students,
                n_items=n_items,
                n_concepts=n_clusters,
                skill_correlation=skill_correlation,
                difficulty_shift=difficulty_shift,
                missing=missing,
            )
            true_cluster_names = list(items['concept'].unique())

            for similarity, euclid, similarity_name in similarities:
                X = similarity(answers)
                items_ids = X.index
                # ``.at`` replaces ``DataFrame.get_value``, which was removed
                # in pandas 1.0.
                ground_truth = np.array([
                    true_cluster_names.index(items.at[item, 'concept'])
                    for item in items_ids
                ])

                labels = clustering(X, n_clusters, euclid=euclid)
                rand = rand_index(ground_truth, labels)
                results.append([
                    n_students,
                    clustering.__name__,
                    rand,
                    skill_correlation,
                    difficulty_shift,
                    similarity_name,
                ])
                print(run, n_students, similarity_name, rand)

    results = pd.DataFrame(
        results,
        columns=[
            'students',
            'clustering',
            'rand_index',
            'skill_correlation',
            'difficulty_shift',
            'similarity',
        ],
    )
    print(results)

    plt.figure(figsize=(16, 24))
    sns.pointplot(data=results, x='difficulty_shift', y='rand_index', hue='similarity')
texts=[items.get_value(item, 'name') for item in items_ids], shapes=ground_truth, ) plt.legend(handles=[ mlines.Line2D([], [], color='black', linewidth=0, marker=markers[i], label=v) for i, v in enumerate(true_cluster_names) ]) plt.subplot(2, len(similarities) / 2 + 1, len(similarities) + 1) rands = [] for c1 in [ground_truth] + clusters: l = [] for c2 in [ground_truth] + clusters: l.append(rand_index(c1, c2)) rands.append(l) sns.heatmap(rands, xticklabels=['truth'] + similarities_names, yticklabels=['truth'] + similarities_names, annot=True) plt.title(data_set) # sns.clustermap(rands, xticklabels=['truth'] + similarities_names, yticklabels=['truth'] + similarities_names, annot=True) if False: for i, (similarity, similarities_name) in enumerate(zip(similarities, similarities_names)): print(similarities_name) X = similarity(answers) ground_truth =np.array([true_cluster_names.index(items.get_value(item, 'concept')) for item in X.index]) same, different = [], [] for concept1 in set(ground_truth): for concept2 in set(ground_truth):
# Compare every similarity measure / clustering function pair on repeated
# half-samples of ``answers`` and draw the scores as a grouped bar plot.
runs = 30
results = []
for run in range(runs):
    # Every run works on a fresh sample of half of ``answers``.
    A = answers.sample(frac=0.5)
    for similarity, euclid, similarity_name in similarities:
        print(similarity_name)
        X = similarity(A)
        items_ids = X.index
        if dimensions:
            # Optional PCA projection; the item index is kept so the rows
            # still line up with ``items_ids``.
            model = PCA(n_components=dimensions)
            X = pd.DataFrame(data=model.fit_transform(X), index=X.index)
        # ``.at`` replaces ``DataFrame.get_value``, which was removed in
        # pandas 1.0.
        ground_truth = np.array([
            true_cluster_names.index(items.at[item, "concept"])
            for item in items_ids
        ])

        for clustering in clusterings:
            labels = clustering(X, n_clusters, euclid=euclid)
            rand = rand_index(ground_truth, labels)
            print(" - ", clustering.__name__, rand)
            results.append([similarity_name, clustering.__name__, rand])

results = pd.DataFrame(results, columns=["similarity", "clustering", "rand_index"])
print(results)

plt.figure(figsize=(16, 24))
plt.title(data_set)
sns.barplot(data=results, x="similarity", y="rand_index", hue="clustering")
plt.show()