def skill_correlations(runs=50, n_clusters=5):
    """Find how many students each clustering needs as skill correlation grows.

    For every run, skill correlation and clustering function, data sets of
    increasing size are generated until the clustering reaches a Rand index
    of at least 0.9 against the true concepts.  Only that first sufficient
    number of students is recorded; combinations that never reach the
    threshold contribute no row.  The collected rows are printed and drawn as
    a point plot with a logarithmic y axis.

    ``data``, ``similarity``, ``euclid``, ``clusterings`` and ``rand_index``
    are not defined here and are resolved from the enclosing module.

    Args:
        runs: Number of repetitions of the whole sweep.
        n_clusters: Number of concepts passed to ``data`` and number of
            clusters requested from each clustering function.
    """
    # Both sweeps are loop-invariant, so build them once.
    correlations = list(np.arange(0, 0.9, 0.1)) + [0.85]
    student_counts = [
        10, 20, 30, 50, 100, 200, 300, 500, 1000, 2000, 3000, 5000,
        10000, 20000, 50000, 100000, 200000, 500000, 1000000,
    ]

    rows = []
    for run in range(runs):
        for skill_correlation in correlations:
            for clustering in clusterings:
                for students in student_counts:
                    answers, items = data(
                        n_students=students,
                        n_items=20,
                        n_concepts=n_clusters,
                        skill_correlation=skill_correlation,
                    )
                    true_cluster_names = list(items['concept'].unique())
                    X = similarity(answers)
                    items_ids = X.index
                    # .at replaces DataFrame.get_value, which was removed in pandas 1.0.
                    ground_truth = np.array([
                        true_cluster_names.index(items.at[item, 'concept'])
                        for item in items_ids
                    ])
                    labels = clustering(X, n_clusters, euclid=euclid)
                    rand = rand_index(ground_truth, labels)
                    print(run, skill_correlation, clustering.__name__, students, '===', rand)
                    if rand >= 0.9:
                        # Smallest tested size that is good enough: record it
                        # and skip the larger sizes.
                        rows.append([students, clustering.__name__, rand, skill_correlation])
                        break

    results = pd.DataFrame(
        rows, columns=['students', 'clustering', 'rand_index', 'skill_correlation']
    )
    print(results)

    _fig, ax = plt.subplots(figsize=(7, 7))
    ax.set(yscale="log")
    sns.pointplot(data=results, x='skill_correlation', y='students', hue='clustering', ax=ax)
def students(runs=15):
    """Compare similarity measures while sweeping the difficulty shift.

    For every run and every ``difficulty_shift`` in ``np.arange(-1, 1.1, 0.2)``
    one data set is generated; each entry of ``similarities`` is then applied
    to it, the result is clustered, and the Rand index against the true
    concepts is recorded.  The collected rows are printed and drawn as a
    point plot of Rand index over difficulty shift, one hue per similarity.

    Despite its name, the function does not vary the number of students:
    ``n_students`` is read from the enclosing module, as are ``n_items``,
    ``n_clusters``, ``skill_correlation``, ``missing``, ``clustering``,
    ``similarities``, ``data`` and ``rand_index``.

    Args:
        runs: Number of repetitions of the whole sweep.
    """
    rows = []
    for run in range(runs):
        for difficulty_shift in np.arange(-1, 1.1, 0.2):
            answers, items = data(
                n_students=n_students,
                n_items=n_items,
                n_concepts=n_clusters,
                skill_correlation=skill_correlation,
                difficulty_shift=difficulty_shift,
                missing=missing,
            )
            true_cluster_names = list(items['concept'].unique())
            for similarity, euclid, similarity_name in similarities:
                X = similarity(answers)
                items_ids = X.index
                # .at replaces DataFrame.get_value, which was removed in pandas 1.0.
                ground_truth = np.array([
                    true_cluster_names.index(items.at[item, 'concept'])
                    for item in items_ids
                ])
                labels = clustering(X, n_clusters, euclid=euclid)
                rand = rand_index(ground_truth, labels)
                rows.append([
                    n_students, clustering.__name__, rand,
                    skill_correlation, difficulty_shift, similarity_name,
                ])
                print(run, n_students, similarity_name, rand)

    results = pd.DataFrame(
        rows,
        columns=['students', 'clustering', 'rand_index', 'skill_correlation',
                 'difficulty_shift', 'similarity'],
    )
    print(results)

    plt.figure(figsize=(16, 24))
    sns.pointplot(data=results, x='difficulty_shift', y='rand_index', hue='similarity')
# Alternative data_set / n_clusters settings kept for reference:
# data_set, n_clusters = 'cestina-konc-prid', 7
# data_set, n_clusters = 'math_garden-multiplication', 3
# data_set, n_clusters = 'math_garden-addition', 3
# data_set, n_clusters = 'math_garden-subtraction', 3
# data_set, n_clusters = 'math_garden-all', 3

# `data_set` must already be assigned at this point; it selects the pickles.
# NOTE: read_pickle can execute arbitrary code from the file - only load
# trusted data files.
answers = pd.read_pickle('data/{}-answers.pd'.format(data_set))
items = pd.read_pickle('data/{}-items.pd'.format(data_set))
true_cluster_names = list(items['concept'].unique())
print(len(answers), len(items))

# Configuration of the 2D projection and of the item similarity it is based on.
projection = tsne
similarity, euclid = similarity_pearson, True

X = similarity(answers)
xs, ys = projection(X, euclid=euclid, perplexity=10)
items_ids = X.index
# Encode each item's concept as its position in true_cluster_names.
# .at replaces DataFrame.get_value, which was removed in pandas 1.0.
ground_truth = np.array([
    true_cluster_names.index(items.at[item, 'concept'])
    for item in items_ids
])

# The true concept drives both the label and the marker shape of every item.
plot_clustering(
    items_ids, xs, ys,
    labels=ground_truth,
    texts=[items.at[item, 'name'] for item in items_ids],
    shapes=ground_truth,
)

# Disabled branch, kept exactly as in the original.
if False:
    df = pd.read_csv('tsne-prid-jmena-data.csv', sep=';')
true_cluster_names = list(items['concept'].unique()) students = pd.Series(answers['student'].unique()) # print(data_set, len(students), len(items), len(answers)) # continue for frac in list(np.arange(0.02, 0.2, 0.02)) + list(np.arange(0.2, 1.1, 0.1)): for run in range(runs): S = students.sample(frac=frac) S1 = S[:len(S) // 2] S2 = S[len(S) // 2:] A1 = answers[answers['student'].isin(S1)] A2 = answers[answers['student'].isin(S2)] print(data_set, frac, len(A1), len(A2)) similarity, euclid, similarity_name = similarity_setting X1 = similarity(A1) X2 = similarity(A2) if len(X1.index) != len(X2.index): continue p, _ = pearsonr(X1.replace(1, 0).as_matrix().flatten(), X2.replace(1, 0).as_matrix().flatten()) results.append([frac, p, run, data_set]) results = pd.DataFrame(results, columns=['frac', 'correlation', 'run', 'data_set']) print(results) plt.figure(figsize=(16, 24)) # sns.pointplot(data=results, x='frac', y='rand_index', hue='clustering') sns.tsplot(data=results, time='frac', value='correlation', unit='run', condition='data_set')
(lambda x: similarity_yulesQ(x), False, "yuleQ"), (lambda x: similarity_pearson(x), True, "pearson -> euclid"), # (lambda x: similarity_kappa(x), True, 'kappa -> euclid'), (lambda x: similarity_yulesQ(x), True, "yuleQ -> euclid"), (lambda x: similarity_pearson(similarity_pearson(x)), True, "pearson -> pearson -> euclid"), ] dimensions = 0 clusterings = [kmeans, spectral_clustering2, hierarchical] runs = 30 results = [] for run in range(runs): A = answers.sample(frac=0.5) for similarity, euclid, similarity_name in similarities: print(similarity_name) X = similarity(A) items_ids = X.index if dimensions: model = PCA(n_components=dimensions) X = pd.DataFrame(data=model.fit_transform(X), index=X.index) ground_truth = np.array([true_cluster_names.index(items.get_value(item, "concept")) for item in items_ids]) for i, clustering in enumerate(clusterings): labels = clustering(X, n_clusters, euclid=euclid) rand = rand_index(ground_truth, labels) print(" - ", clustering.__name__, rand) results.append([similarity_name, clustering.__name__, rand]) results = pd.DataFrame(results, columns=["similarity", "clustering", "rand_index"])
similarity, euclid = similarity_pearson, True projection = tsne clustering = kmeans # answers = answers.loc[:67000, :] print(len(answers)) if True: plt.figure(figsize=(10, 5)) # students = answers['student'].unique() # students = students[: len(students) // 2] # answers = answers[answers['student'].isin(students)] for i, modificator in enumerate(modificators): print(modificator, len(answers)) modified_answers = modificator.modify(answers.copy()) X = similarity(modified_answers) xs, ys = projection(X, euclid=euclid) ground_truth = np.array([true_cluster_names.index(items.get_value(item, "concept")) for item in X.index]) plt.subplot(1, len(modificators), i + 1) plt.title(str(modificator)) for x, y, item, visualization in zip(xs, ys, X.index, ground_truth): value = items.get_value(item, "name") plt.plot(x, y, markers[visualization], color=colors[visualization], alpha=(int(value) + 5) / 25) plt.text(x, y, value) plt.legend( handles=[ mlines.Line2D([], [], color=colors[i], linewidth=0, marker=markers[i], label=v) for i, v in enumerate(true_cluster_names)