# When true, the clustered 2-D projection is also handed to plot_cluster.
WHRITE_PLOT = True

if __name__ == '__main__':
    # TF-IDF document features; the file uses '§' as its column separator.
    data = pd.read_csv('features/text/tfidf_doc.csv', sep='§')

    # Cluster on every column except 'Sequence'.
    model = KMeans(n_clusters=2, random_state=42, n_init=30)
    labels = model.fit_predict(data.drop(columns=['Sequence']))

    # Keep the labels both as a column of `data` and as a named Series.
    data['cluster'] = labels
    cluster = pd.Series(labels, name='Cluster')

    if WHRITE_CLUSTER:
        # Persist the (Sequence, Cluster) pairs.
        result = pd.concat([data['Sequence'], cluster], axis='columns')
        result.to_csv('result/cluster_tfidf.csv', sep='§')

    # Project the feature columns (labels excluded again) onto two SVD
    # components, named 'Svd_0' and 'Svd_1'.
    reducer = TruncatedSVD(n_components=2)
    projected = reducer.fit_transform(
        data.drop(columns=['Sequence', 'cluster']))
    svd = pd.DataFrame(projected).add_prefix('Svd_')

    # Quick on-screen scatter, coloured by cluster label.
    plt.scatter(svd['Svd_0'], svd['Svd_1'], c=cluster, s=50, cmap='viridis')
    plt.show()

    if WHRITE_PLOT:
        # plot_cluster is a project helper; it receives the 2-D coordinates,
        # the 'Sequence' column, the labels and an .html output path.
        plot_cluster(
            svd[['Svd_0', 'Svd_1']].values,
            data['Sequence'],
            cluster.values,
            'result/plot_cluster_tfidf.html',
        )
if __name__ == '__main__':
    # Silence every warning for the duration of the script.
    warnings.filterwarnings('ignore')

    # Per-document emotion features; '§' is the column separator.
    data = pd.read_csv('features/text/emotion_doc.csv', sep='§')

    # Every column except 'Sequence' is treated as a feature.
    features = data.drop(columns=['Sequence'])

    # 2-D embedding used only for plotting. Despite the `pca` name and the
    # 'PCA_' column prefix, the embedding is produced by TSNE.
    embedder = TSNE(n_components=2, random_state=42)
    pca = pd.DataFrame(embedder.fit_transform(features)).add_prefix('PCA_')

    # K-means runs on the original features, not on the 2-D embedding.
    model = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=30)
    cluster = pd.Series(model.fit_predict(features), name='Cluster')

    if WHRITE_CLUSTER:
        # Persist the (Sequence, Cluster) pairs.
        result = pd.concat([data['Sequence'], cluster], axis='columns')
        result.to_csv('result/cluster_sentiment.csv', sep='§')

    # Quick on-screen scatter, coloured by cluster label.
    plt.scatter(pca['PCA_0'], pca['PCA_1'], c=cluster, s=50, cmap='viridis')
    plt.show()

    if WHRITE_PLOT:
        # plot_cluster is a project helper; it receives the 2-D coordinates,
        # the 'Sequence' column, the labels and an .html output path.
        plot_cluster(
            pca[['PCA_0', 'PCA_1']].values,
            data['Sequence'],
            cluster.values,
            'result/plot_cluster_sentiment_tsne.html',
        )
from sklearn.decomposition import PCA
import matplotlib as mpl
# Select the backend before pyplot is imported; any other backend works too.
mpl.use('TkAgg')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()  # seaborn default plot styling
import warnings
warnings.filterwarnings('ignore')
from plt import plot_cluster

# Per-document emotion features; '§' is the column separator.
data = pd.read_csv(
    '/home/mickael/Documents/Challenge_Video_Audio_Text/features/text/emotion_doc.csv',
    sep='§')

# Every column except 'Sequence' is treated as a feature.
features = data.drop(columns=['Sequence'])

# Two principal components, named 'PCA_0' and 'PCA_1', used for plotting.
reducer = PCA(n_components=2)
pca = pd.DataFrame(reducer.fit_transform(features)).add_prefix('PCA_')

# K-means (3 clusters) runs on the original features, not on the projection.
model = KMeans(n_clusters=3, random_state=42, n_init=30)
cluster = model.fit_predict(features)

# Quick on-screen scatter, coloured by cluster label.
plt.scatter(pca['PCA_0'], pca['PCA_1'], c=cluster, s=50, cmap='viridis')
plt.show()

# plot_cluster is a project helper; it receives the 2-D coordinates, the
# 'Sequence' column, the labels and an .html output path.
plot_cluster(
    pca[['PCA_0', 'PCA_1']].values,
    data['Sequence'],
    cluster,
    '/home/mickael/Documents/Challenge_Video_Audio_Text/result/plot_cluster_sentiments.html',
)
# Disabled static export:
# f.savefig("/home/mickael/Documents/Challenge_Video_Audio_Text/result/kmeans_sentiments.png", bbox_inches='tight')
# K-means clustering of the TF-IDF document features, plotted on a 2-D
# TruncatedSVD projection.
# Imported locally because the file's import block is not part of this
# fragment; scikit-learn is already a dependency of the script.
from sklearn.decomposition import TruncatedSVD

# TF-IDF document features; '§' is the column separator.
data = pd.read_csv(
    '/home/mickael/Documents/Challenge_Video_Audio_Text/features/text/tfidf_doc.csv',
    sep='§')

# Every column except 'Sequence' is treated as a feature. Taken before the
# label column is added so the labels never leak into the features.
features = data.drop(['Sequence'], axis='columns')

model = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=30)
cluster = model.fit_predict(features)
# Fix: this used to assign the undefined name `clusters`.
data['cluster'] = cluster

# Fix: `svd` was indexed below but never built in this fragment. Build the
# two-component projection with the 'Svd_0' / 'Svd_1' column names the plot
# call expects. Seeded like the KMeans model so the layout is reproducible.
# NOTE(review): confirm that no earlier part of the file was meant to
# provide a different `svd`.
svd = pd.DataFrame(
    TruncatedSVD(n_components=2, random_state=42).fit_transform(features))
svd = svd.add_prefix('Svd_')

# Disabled on-screen preview:
# plt.scatter(svd['Svd_0'], svd['Svd_1'], c=cluster, s=50, cmap='viridis')
# plt.show()

# plot_cluster is a project helper; it receives the 2-D coordinates, the
# 'Sequence' column, the labels and an .html output path.
# Fix: fit_predict returns a NumPy array, which has no `.values`; the labels
# are passed as-is.
plot_cluster(
    svd[['Svd_0', 'Svd_1']].values,
    data['Sequence'],
    cluster,
    '/home/mickael/Documents/Challenge_Video_Audio_Text/result/plot_cluster_tfidf_5.html',
)
# Disabled static export:
# f.savefig("/home/mickael/Documents/Challenge_Video_Audio_Text/result/kmeans_tfidf_2.png", bbox_inches='tight')

# Score computation with silhouette // k=2 is the best for each.
# best_k(data.drop(['Sequence'], axis='columns'), range_min = 20, verbose = True)

# print(data.head())
# ann = pd.read_csv('/home/mickael/Documents/Challenge_Video_Audio_Text/data/external/annotation.csv')
# ann = ann[['Sequence','Violent']]
# print(ann.head())
# print(ann.shape)
# val = pd.concat([data[['Sequence','cluster']],ann],axis = 'columns')
# val = val.dropna()
sns.set()  # seaborn default plot styling
import warnings
import numpy as np
warnings.filterwarnings('ignore')
from plt import plot_cluster
from evaluation_cluster import best_k
from sklearn.manifold import TSNE

# N_COMPONENTS selects which input file is read; N_CLUSTERS is the K-means k.
N_COMPONENTS = 2
N_CLUSTERS = 5

# Input CSV whose name embeds N_COMPONENTS; '§' is the column separator.
input_path = f'/home/mickael/Documents/Challenge_Video_Audio_Text/features/text/nmf_tfidf_{N_COMPONENTS}_TSNE.csv'
data = pd.read_csv(input_path, sep='§')

# Cluster on every column except 'Sequence', then keep the labels in `data`.
features = data.drop(columns=['Sequence'])
model = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=30)
cluster = model.fit_predict(features)
data['cluster'] = cluster

# Disabled on-screen preview:
# plt.scatter(data['nmf_0'], data['nmf_1'], c=cluster, s=50, cmap='viridis')
# plt.show()

# The 'nmf_0' / 'nmf_1' columns are used directly as plot coordinates.
# plot_cluster is a project helper; it receives the 2-D coordinates, the
# 'Sequence' column, the labels and an .html output path.
coordinates = data[['nmf_0', 'nmf_1']].values
plot_cluster(
    coordinates,
    data['Sequence'],
    cluster,
    '/home/mickael/Documents/Challenge_Video_Audio_Text/result/plot_cluster_nmf_5_TSNE.html',
)

# Disabled search for the best k:
# best_k(data.drop(['Sequence'], axis='columns'), range_min = 20, verbose = True)