def skill_correlations(runs=50, n_clusters=5):
    """Plot how many simulated students each clustering needs per skill correlation.

    For every run, skill correlation and clustering function, data sets of
    increasing size are generated with ``data(...)`` until the score returned
    by ``rand_index`` against the true concepts is at least 0.9.  The first
    student count that reaches the threshold is recorded; if no count reaches
    it, nothing is recorded for that combination.  The collected rows are
    printed and drawn as a seaborn point plot with a logarithmic y axis.

    Reads the module-level names ``data``, ``similarity``, ``euclid``,
    ``clusterings`` and ``rand_index``.

    Args:
        runs: number of repetitions of the whole sweep.
        n_clusters: number of concepts to simulate and clusters to request.
    """
    # Loop-invariant grids, built once instead of on every iteration.
    correlation_grid = list(np.arange(0, 0.9, 0.1)) + [0.85]
    student_counts = [
        10, 20, 30, 50, 100, 200, 300, 500, 1000, 2000, 3000, 5000,
        10000, 20000, 50000, 100000, 200000, 500000, 1000000,
    ]

    results = []
    for run in range(runs):
        for skill_correlation in correlation_grid:
            for clustering in clusterings:
                for n_students in student_counts:
                    answers, items = data(n_students=n_students, n_items=20, n_concepts=n_clusters, skill_correlation=skill_correlation)
                    true_cluster_names = list(items['concept'].unique())
                    X = similarity(answers)
                    items_ids = X.index
                    # `.at` replaces DataFrame.get_value, which newer pandas no longer provides.
                    ground_truth = np.array([true_cluster_names.index(items.at[item, 'concept']) for item in items_ids])

                    labels = clustering(X, n_clusters, euclid=euclid)
                    rand = rand_index(ground_truth, labels)

                    print(run, skill_correlation, clustering.__name__, n_students, '===', rand)
                    if rand >= 0.9:
                        # Smallest tried student count that reaches the threshold; stop growing the data.
                        results.append([n_students, clustering.__name__, rand, skill_correlation])
                        break

    results = pd.DataFrame(results, columns=['students', 'clustering', 'rand_index', 'skill_correlation'])

    print(results)
    f, ax = plt.subplots(figsize=(7, 7))
    ax.set(yscale="log")
    sns.pointplot(data=results, x='skill_correlation', y='students', hue='clustering', ax=ax)
def students(runs=15):
    """Run a difficulty-shift sweep on simulated data, then plot a stored data set.

    Part 1: for each run and each ``difficulty_shift`` in
    ``np.arange(-1, 1.1, 0.2)`` a data set is generated with ``data(...)``.
    Every entry of ``similarities`` is applied to the answers, the result is
    clustered with ``clustering`` and scored with ``rand_index``.  The scores
    are printed and drawn as a point plot of ``rand_index`` against
    ``difficulty_shift``, one hue per similarity.

    Part 2: the pickled answers and items of ``data_set`` are loaded, a
    Pearson similarity is projected with ``tsne`` (perplexity 10) and handed to
    ``plot_clustering`` with the true concept index as both labels and shapes.

    The function takes only ``runs``; it reads these names from module scope:
    ``n_students``, ``n_items``, ``n_clusters``, ``skill_correlation``,
    ``missing``, ``similarities``, ``clustering``, ``data_set``, ``data``,
    ``rand_index``, ``tsne``, ``similarity_pearson`` and ``plot_clustering``.

    Args:
        runs: number of repetitions of the simulated sweep in part 1.
    """
    results = []
    for run in range(runs):
        # for n_students in range(100, 1001, 100):
        # for n_students in [10, 25, 50, 100, 200, 300,  400, 600]:
        for difficulty_shift in np.arange(-1, 1.1, 0.2):
            answers, items = data(n_students=n_students, n_items=n_items, n_concepts=n_clusters, skill_correlation=skill_correlation, difficulty_shift=difficulty_shift, missing=missing)
            true_cluster_names = list(items['concept'].unique())
            # for i, clustering in enumerate(clusterings):
            for similarity, euclid, similarity_name in similarities:
                X = similarity(answers)
                items_ids = X.index
                # `.at` replaces DataFrame.get_value, which newer pandas no longer provides.
                ground_truth = np.array([true_cluster_names.index(items.at[item, 'concept']) for item in items_ids])

                labels = clustering(X, n_clusters, euclid=euclid)
                rand = rand_index(ground_truth, labels)
                results.append([n_students, clustering.__name__, rand, skill_correlation, difficulty_shift, similarity_name])
                print(run, n_students, similarity_name, rand)

    results = pd.DataFrame(results, columns=['students', 'clustering', 'rand_index', 'skill_correlation', 'difficulty_shift', 'similarity'])
    print(results)

    plt.figure(figsize=(16, 24))
    sns.pointplot(data=results, x='difficulty_shift', y='rand_index', hue='similarity')

    # From here on `answers`, `items` and `true_cluster_names` are rebound to a
    # stored data set; `data_set` must be defined at module level.
    # data_set, n_clusters = 'cestina-konc-prid', 7
    # data_set, n_clusters  = 'math_garden-multiplication', 3
    # data_set, n_clusters  = 'math_garden-addition', 3
    # data_set, n_clusters  = 'math_garden-subtraction', 3
    # data_set, n_clusters  = 'math_garden-all', 3
    answers = pd.read_pickle('data/{}-answers.pd'.format(data_set))
    items = pd.read_pickle('data/{}-items.pd'.format(data_set))
    true_cluster_names = list(items['concept'].unique())

    print(len(answers), len(items))

    projection = tsne
    similarity, euclid = similarity_pearson, True

    X = similarity(answers)
    xs, ys = projection(X, euclid=euclid, perplexity=10)

    items_ids = X.index
    ground_truth = np.array([true_cluster_names.index(items.at[item, 'concept']) for item in items_ids])

    plot_clustering(
        items_ids, xs, ys,
        labels=ground_truth,
        texts=[items.at[item, 'name'] for item in items_ids],
        shapes=ground_truth,
    )


# Split-half experiment: correlate the similarity matrices computed from two
# disjoint halves of a student sample.  Currently disabled by the `if False`
# guard, so the statements after the guard run on whatever `results` already
# holds at this point.
if False:
    df = pd.read_csv('tsne-prid-jmena-data.csv', sep=';')
    true_cluster_names = list(items['concept'].unique())
    students = pd.Series(answers['student'].unique())

    # print(data_set, len(students), len(items), len(answers))
    # continue

    for frac in list(np.arange(0.02, 0.2, 0.02)) + list(np.arange(0.2, 1.1, 0.1)):
        for run in range(runs):
            # Sample a fraction of the students and cut the sample into two halves.
            S = students.sample(frac=frac)
            S1 = S[:len(S) // 2]
            S2 = S[len(S) // 2:]
            A1 = answers[answers['student'].isin(S1)]
            A2 = answers[answers['student'].isin(S2)]
            print(data_set, frac, len(A1), len(A2))
            similarity, euclid, similarity_name = similarity_setting
            X1 = similarity(A1)
            X2 = similarity(A2)
            # Only the sizes are compared here; NOTE(review): confirm that both
            # matrices also list the items in the same order.
            if len(X1.index) != len(X2.index):
                continue

            # Entries equal to 1 are replaced by 0 before flattening (presumably
            # to neutralise the self-similarity diagonal; confirm).  `.values`
            # replaces DataFrame.as_matrix(), which newer pandas no longer provides.
            p, _ = pearsonr(X1.replace(1, 0).values.flatten(), X2.replace(1, 0).values.flatten())

            results.append([frac, p, run, data_set])

results = pd.DataFrame(results, columns=['frac', 'correlation', 'run', 'data_set'])
print(results)

plt.figure(figsize=(16, 24))
# sns.pointplot(data=results, x='frac', y='rand_index', hue='clustering')
# NOTE(review): sns.tsplot is absent from recent seaborn releases (lineplot is
# the documented successor); confirm the seaborn version this script targets.
sns.tsplot(data=results, time='frac', value='correlation', unit='run', condition='data_set')
    (lambda x: similarity_yulesQ(x), False, "yuleQ"),
    (lambda x: similarity_pearson(x), True, "pearson -> euclid"),
    # (lambda x: similarity_kappa(x), True, 'kappa -> euclid'),
    (lambda x: similarity_yulesQ(x), True, "yuleQ -> euclid"),
    (lambda x: similarity_pearson(similarity_pearson(x)), True, "pearson -> pearson -> euclid"),
]
# Compare every similarity measure with every clustering algorithm on random
# halves of `answers`, scoring each labelling with `rand_index`.
dimensions = 0  # 0 skips the optional PCA reduction below
clusterings = [kmeans, spectral_clustering2, hierarchical]

runs = 30
results = []
for run in range(runs):
    # A fresh random 50 % sample of the answers for every run.
    A = answers.sample(frac=0.5)
    for similarity, euclid, similarity_name in similarities:
        print(similarity_name)
        X = similarity(A)
        # Keep the item order before X is possibly replaced by its PCA projection.
        items_ids = X.index
        if dimensions:
            model = PCA(n_components=dimensions)
            X = pd.DataFrame(data=model.fit_transform(X), index=X.index)

        # `.at` replaces DataFrame.get_value, which newer pandas no longer provides.
        ground_truth = np.array([true_cluster_names.index(items.at[item, "concept"]) for item in items_ids])

        for i, clustering in enumerate(clusterings):

            labels = clustering(X, n_clusters, euclid=euclid)
            rand = rand_index(ground_truth, labels)
            print("  - ", clustering.__name__, rand)
            results.append([similarity_name, clustering.__name__, rand])

results = pd.DataFrame(results, columns=["similarity", "clustering", "rand_index"])
# Settings picked up by the code that follows: similarity function and its
# `euclid` flag, the projection function and the clustering function.
similarity = similarity_pearson
euclid = True
projection = tsne
clustering = kmeans

# answers = answers.loc[:67000, :]
print(len(answers))

if True:
    plt.figure(figsize=(10, 5))
    # students = answers['student'].unique()
    # students = students[: len(students) // 2]
    # answers = answers[answers['student'].isin(students)]
    for i, modificator in enumerate(modificators):
        print(modificator, len(answers))
        modified_answers = modificator.modify(answers.copy())
        X = similarity(modified_answers)
        xs, ys = projection(X, euclid=euclid)
        ground_truth = np.array([true_cluster_names.index(items.get_value(item, "concept")) for item in X.index])

        plt.subplot(1, len(modificators), i + 1)
        plt.title(str(modificator))

        for x, y, item, visualization in zip(xs, ys, X.index, ground_truth):
            value = items.get_value(item, "name")
            plt.plot(x, y, markers[visualization], color=colors[visualization], alpha=(int(value) + 5) / 25)
            plt.text(x, y, value)

    plt.legend(
        handles=[
            mlines.Line2D([], [], color=colors[i], linewidth=0, marker=markers[i], label=v)
            for i, v in enumerate(true_cluster_names)