Ejemplo n.º 1
0
# Gates the plot_cluster() export to result/plot_cluster_tfidf.html below.
WHRITE_PLOT = True

if __name__ == '__main__':
    # Cluster the rows of tfidf_doc.csv with k-means (k=2), then project the
    # same features to 2-D with truncated SVD to draw the clusters.
    # NOTE(review): WHRITE_CLUSTER, pd, KMeans, TruncatedSVD, plt and
    # plot_cluster are not defined in this part of the file; they are
    # presumably provided earlier in the module -- confirm.

    data = pd.read_csv('features/text/tfidf_doc.csv', sep='§')

    # Build the feature matrix once, before the 'cluster' column is added, so
    # the clustering and the projection are guaranteed to see the same input.
    features = data.drop(['Sequence'], axis='columns')

    model = KMeans(n_clusters=2, random_state=42, n_init=30)
    cluster = model.fit_predict(features)

    # Keep the labels on the frame as well as in a named Series.
    data['cluster'] = cluster
    cluster = pd.Series(cluster, name='Cluster')

    if WHRITE_CLUSTER:
        result = pd.concat([data['Sequence'], cluster], axis=1)
        result.to_csv('result/cluster_tfidf.csv', sep='§')

    # TruncatedSVD's default solver is randomized; seed it like KMeans so the
    # 2-D projection (and therefore the plots) is identical on every run.
    svd = TruncatedSVD(n_components=2, random_state=42)
    svd = pd.DataFrame(svd.fit_transform(features))
    svd = svd.add_prefix('Svd_')

    plt.scatter(svd['Svd_0'], svd['Svd_1'], c=cluster, s=50, cmap='viridis')
    plt.show()

    if WHRITE_PLOT:
        plot_cluster(svd[['Svd_0', 'Svd_1']].values, data['Sequence'],
                     cluster.values, 'result/plot_cluster_tfidf.html')

Ejemplo n.º 2
0
if __name__ == '__main__':
    # Cluster the rows of emotion_doc.csv with k-means and draw them on a
    # 2-D t-SNE embedding of the same features.

    # Suppress every warning raised from here on.
    warnings.filterwarnings('ignore')

    data = pd.read_csv('features/text/emotion_doc.csv', sep='§')

    # Everything except the 'Sequence' column is treated as a feature.
    feature_frame = data.drop(columns=['Sequence'])

    # Despite its name, `pca` ends up holding the t-SNE embedding; the
    # 'PCA_' column prefix is kept because the plotting code relies on it.
    embedder = TSNE(n_components=2, random_state=42)
    embedding = embedder.fit_transform(feature_frame)
    pca = pd.DataFrame(embedding).add_prefix('PCA_')

    model = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=30)
    labels = model.fit_predict(feature_frame)
    cluster = pd.Series(labels, name='Cluster')

    if WHRITE_CLUSTER:
        # One row per sequence with its assigned cluster label.
        result = pd.concat([data['Sequence'], cluster], axis=1)
        result.to_csv('result/cluster_sentiment.csv', sep='§')

    plt.scatter(pca['PCA_0'], pca['PCA_1'], c=cluster, s=50, cmap='viridis')
    plt.show()

    if WHRITE_PLOT:
        coords = pca[['PCA_0', 'PCA_1']].values
        plot_cluster(coords, data['Sequence'], cluster.values,
                     'result/plot_cluster_sentiment_tsne.html')
Ejemplo n.º 3
0
from sklearn.decomposition import PCA
import matplotlib as mpl
mpl.use('TkAgg')  # or whatever other backend that you want
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()  # for plot styling
import warnings
warnings.filterwarnings('ignore')
from plt import plot_cluster

# Read the feature table stored in emotion_doc.csv ('§'-separated).
data = pd.read_csv(
    '/home/mickael/Documents/Challenge_Video_Audio_Text/features/text/emotion_doc.csv',
    sep='§')

# Everything except the 'Sequence' column is treated as a feature.
emotion_features = data.drop(columns=['Sequence'])

# 2-D PCA projection, used only for drawing the clusters.
projection = PCA(n_components=2).fit_transform(emotion_features)
pca = pd.DataFrame(projection).add_prefix('PCA_')

# k-means with a fixed seed; k is hard-coded to 3 here.
model = KMeans(n_clusters=3, random_state=42, n_init=30)
cluster = model.fit_predict(emotion_features)

plt.scatter(pca['PCA_0'], pca['PCA_1'], c=cluster, s=50, cmap='viridis')
plt.show()

coords = pca[['PCA_0', 'PCA_1']].values
plot_cluster(
    coords, data['Sequence'], cluster,
    '/home/mickael/Documents/Challenge_Video_Audio_Text/result/plot_cluster_sentiments.html'
)
#f.savefig("/home/mickael/Documents/Challenge_Video_Audio_Text/result/kmeans_sentiments.png", bbox_inches='tight')
# Needed for the 2-D projection computed below; only PCA is imported from
# sklearn.decomposition in the visible part of this file.
from sklearn.decomposition import TruncatedSVD

# Read the feature table stored in tfidf_doc.csv ('§'-separated).
data = pd.read_csv(
    '/home/mickael/Documents/Challenge_Video_Audio_Text/features/text/tfidf_doc.csv',
    sep='§')

# Build the feature matrix once, before the 'cluster' column is added, so the
# clustering and the projection both see exactly the same input.
features = data.drop(['Sequence'], axis='columns')

# NOTE(review): N_CLUSTERS is not defined in this part of the file; it is
# presumably set earlier in the module -- confirm.
model = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=30)

cluster = model.fit_predict(features)

# Fixed: this used to assign `clusters`, a name that is never defined.
data['cluster'] = cluster

# Fixed: `svd` was used by plot_cluster() below without ever being computed
# for this data. Project the features to 2-D with a seeded truncated SVD so
# the plot is reproducible.
svd = TruncatedSVD(n_components=2, random_state=42)
svd = pd.DataFrame(svd.fit_transform(features)).add_prefix('Svd_')

#plt.scatter(svd['Svd_0'], svd['Svd_1'], c=cluster, s=50, cmap='viridis')
#plt.show()

# Fixed: fit_predict() returns a NumPy array, which has no `.values`
# attribute, so the labels are passed directly.
plot_cluster(
    svd[['Svd_0', 'Svd_1']].values, data['Sequence'], cluster,
    '/home/mickael/Documents/Challenge_Video_Audio_Text/result/plot_cluster_tfidf_5.html'
)
#f.savefig("/home/mickael/Documents/Challenge_Video_Audio_Text/result/kmeans_tfidf_2.png", bbox_inches='tight')
""" Calcul du score avec silhouette // k=2 le mieux pour chaque """
#best_k(data.drop(['Sequence'], axis='columns'), range_min = 20, verbose = True)

# print(data.head())

# ann = pd.read_csv('/home/mickael/Documents/Challenge_Video_Audio_Text/data/external/annotation.csv')
# ann = ann[['Sequence','Violent']]
# print(ann.head())
# print(ann.shape)

# val = pd.concat([data[['Sequence','cluster']],ann],axis = 'columns')
# val  = val.dropna()
Ejemplo n.º 5
0
sns.set()  # for plot styling
import warnings
import numpy as np
warnings.filterwarnings('ignore')
from plt import plot_cluster
from evaluation_cluster import best_k
from sklearn.manifold import TSNE

# N_COMPONENTS selects which nmf_tfidf_<N>_TSNE.csv file is read;
# N_CLUSTERS is the k passed to KMeans.
N_COMPONENTS = 2
N_CLUSTERS = 5

features_path = f'/home/mickael/Documents/Challenge_Video_Audio_Text/features/text/nmf_tfidf_{N_COMPONENTS}_TSNE.csv'
data = pd.read_csv(features_path, sep='§')

# Cluster on every column except 'Sequence', with a fixed seed.
model = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=30)
cluster = model.fit_predict(data.drop(columns=['Sequence']))

# Store the labels next to the features they were computed from.
data['cluster'] = cluster

#plt.scatter(data['nmf_0'], data['nmf_1'], c=cluster, s=50, cmap='viridis')
#plt.show()

# The 'nmf_0' / 'nmf_1' columns are used directly as the 2-D plot coordinates.
coords = data[['nmf_0', 'nmf_1']].values
plot_cluster(
    coords, data['Sequence'], cluster,
    '/home/mickael/Documents/Challenge_Video_Audio_Text/result/plot_cluster_nmf_5_TSNE.html'
)

#best_k(data.drop(['Sequence'], axis='columns'), range_min = 20, verbose = True)