def concept_clustering(data, skill, cluster_number=3, plot=True):
    """Run spectral clustering over the level-3 skills found under ``skill``.

    The items belonging to ``skill`` are selected, the distinct non-null
    ``skill_lvl_3`` values of those items become the row/column labels of the
    correlation frame, and that frame is handed to ``SpectralClusterer``.

    Args:
        data: Project data object providing ``get_skill_id``,
            ``get_items_df`` and ``get_skills_df``; it is also passed to
            ``compute_corr`` and used as the plot title.
        skill: Skill identifier understood by ``data.get_skill_id``.
        cluster_number: Number of clusters requested from the clusterer.
        plot: When true, draw one labelled point per skill using columns 1
            and 2 of ``sc.eig_vect``, coloured by cluster label.

    Returns:
        The labels returned by ``SpectralClusterer.run`` (indexed by position
        in the same order as the columns of the correlation frame).
    """
    pk, level = data.get_skill_id(skill)
    items = data.get_items_df()
    items = items[items["skill_lvl_" + str(level)] == pk]
    skills = data.get_skills_df()
    skill_ids = items[~items["skill_lvl_3"].isnull()]["skill_lvl_3"].unique()

    # NOTE(review): presumably compute_corr returns something pandas can
    # align/shape against skill_ids -- confirm against compute_corr.
    corr = compute_corr(data, merge_skills=True)
    corr = pd.DataFrame(corr, index=skill_ids, columns=skill_ids)
    print("Corr ({}) contain total {} values and from that {} nans".format(
        corr.shape, corr.size, corr.isnull().sum().sum()))
    # Missing correlations are treated as "no similarity".
    corr[corr.isnull()] = 0

    try:
        sc = SpectralClusterer(corr, kcut=corr.shape[0] * 0.5, mutual=True)
        labels = sc.run(cluster_number=cluster_number, KMiter=50, sc_type=2)
    except np.linalg.LinAlgError:
        # Use the public exception name; ``np.linalg.linalg`` is a private
        # module path. On a linear-algebra failure retry with mutual=False.
        sc = SpectralClusterer(corr, kcut=corr.shape[0] * 0.5, mutual=False)
        labels = sc.run(cluster_number=cluster_number, KMiter=50, sc_type=2)

    if plot:
        colors = "rgbyk"
        for i, p in enumerate(corr.columns):
            # Separate name so the ``skill`` argument is not shadowed.
            skill_row = skills.loc[int(p)]
            # Cycle through the palette so more than len(colors) clusters
            # does not raise IndexError.
            color = colors[int(labels[i]) % len(colors)]
            plt.plot(sc.eig_vect[i, 1], sc.eig_vect[i, 2], "o", color=color)
            plt.text(sc.eig_vect[i, 1], sc.eig_vect[i, 2], skill_row["name"])
        plt.title(data)

    return labels
def item_clustering(data, skill, cluster_number=3, plot=True):
    """Run spectral clustering over the items belonging to ``skill``.

    Items whose ``visualization`` equals ``"pairing"`` are excluded. The
    remaining item index labels the rows/columns of the correlation frame
    that is handed to ``SpectralClusterer``.

    Args:
        data: Project data object providing ``get_skill_id`` and
            ``get_items_df``; it is also passed to ``compute_corr`` and used
            as the plot title.
        skill: Skill identifier understood by ``data.get_skill_id``.
        cluster_number: Number of clusters requested from the clusterer.
        plot: When true, draw one labelled point per item using columns 1
            and 2 of ``sc.eig_vect``, coloured by the item's visualization,
            plus a legend mapping colours to visualizations.

    Returns:
        The labels returned by ``SpectralClusterer.run``.
    """
    pk, level = data.get_skill_id(skill)
    items = data.get_items_df()
    items = items[items["skill_lvl_" + str(level)] == pk]
    items = items[items["visualization"] != "pairing"]

    # NOTE(review): presumably compute_corr returns something pandas can
    # align/shape against items.index -- confirm against compute_corr.
    corr = compute_corr(data)
    corr = pd.DataFrame(corr, index=items.index, columns=items.index)
    print("Corr ({}) contain total {} values and from that {} nans".format(
        corr.shape, corr.size, corr.isnull().sum().sum()))
    # Missing correlations are treated as "no similarity".
    corr[corr.isnull()] = 0

    sc = SpectralClusterer(corr, kcut=corr.shape[0] / 2, mutual=True)
    labels = sc.run(cluster_number=cluster_number, KMiter=50, sc_type=2)

    if plot:
        colors = "rgbyk"
        visualizations = list(items["visualization"].unique())
        for i, p in enumerate(corr.columns):
            item = items.loc[p]
            # Points are coloured by visualization type (not by cluster
            # label). The palette is cycled so more than len(colors)
            # visualizations does not raise IndexError.
            vis_idx = visualizations.index(item["visualization"])
            color = colors[vis_idx % len(colors)]
            plt.plot(sc.eig_vect[i, 1], sc.eig_vect[i, 2], "o", color=color)
            plt.text(sc.eig_vect[i, 1], sc.eig_vect[i, 2], item["name"])

        # One extra point per visualization at (0, 0) carries its legend entry.
        for i, vis in enumerate(visualizations):
            plt.plot(0, 0, "o", color=colors[i % len(colors)], label=vis)
        plt.title(data)
        plt.legend(loc=3)

    return labels
def spectral_clustering(similarity, concepts=2, euclid=False):
    """Cluster a similarity matrix with ``SpectralClusterer``.

    Args:
        similarity: Square array-like supporting boolean-mask assignment,
            ``.copy()`` and ``.shape`` (e.g. a NumPy array or DataFrame).
        concepts: Number of clusters requested from the clusterer.
        euclid: When true, the matrix is first passed through
            ``similarity_euclidean``; otherwise negative entries are
            replaced by 0.

    Returns:
        The labels returned by ``SpectralClusterer.run``.
    """
    if euclid:
        X = similarity_euclidean(similarity)
    else:
        # Clip on a copy so the caller's matrix is not modified in place.
        X = similarity.copy()
        X[X < 0] = 0
    sc = SpectralClusterer(X, kcut=X.shape[0] / 2, mutual=True)
    return sc.run(cluster_number=concepts, KMiter=50, sc_type=2)
def spectral(similarity, euclid=False):
    """Return two columns of the clusterer's ``eig_vect`` for a similarity matrix.

    Runs ``SpectralClusterer`` with two clusters and returns columns 1 and 2
    of its ``eig_vect`` attribute (the same columns the plotting code in this
    file uses as point coordinates).

    Args:
        similarity: Square array-like supporting boolean-mask assignment,
            ``.copy()`` and ``.shape`` (e.g. a NumPy array or DataFrame).
        euclid: When true, the matrix is first passed through
            ``similarity_euclidean``; otherwise negative entries are
            replaced by 0.

    Returns:
        A tuple ``(sc.eig_vect[:, 1], sc.eig_vect[:, 2])``.
    """
    if euclid:
        similarity = similarity_euclidean(similarity)
    else:
        # Clip on a copy so the caller's matrix is not modified in place.
        similarity = similarity.copy()
        similarity[similarity < 0] = 0
    sc = SpectralClusterer(similarity, kcut=similarity.shape[0] / 2, mutual=True)
    # run() is called for its effect on sc; the labels themselves are unused.
    sc.run(cluster_number=2, KMiter=50, sc_type=2)
    return (sc.eig_vect[:, 1], sc.eig_vect[:, 2])