def features_from(i):
    """Fit a 10-component GMM on theme ``i``'s features and save ``X``.

    ``i`` selects the file-name prefix of the feature dump:
    0 -> "responses fiscalite", 1 -> "responses democratie",
    2 -> "responses ecologie", 3 -> "responses organisation".
    Features are read from ``<prefix>_all_questions.tsv`` and ``X`` is
    written to ``X_<i>.csv``.

    Differences from the previous implementation:

    * the pool maps over ``range(len(four_surveys_taken_auth_ids))``
      rather than calling ``range`` on the collection itself;
    * the pool is closed and joined even when ``map`` raises;
    * the debug ``print`` and the per-theme JSON loading were removed.
      Their results only fed mixed string/DataFrame ``np.array`` tables
      from which nothing but the theme label was ever read.

    Raises:
        IndexError: if ``i`` does not select one of the four themes.
    """
    # File-name prefixes of the per-theme feature dumps, indexed by ``i``.
    theme_labels = (
        "responses fiscalite",
        "responses democratie",
        "responses ecologie",
        "responses organisation",
    )

    # read features
    features = np.loadtxt(theme_labels[i] + '_all_questions.tsv',
                          delimiter='\t')

    # Fit GMM
    gmm = GaussianMixture(n_components=10)
    gmm.fit(features)

    # NOTE(review): ``fill_X``, ``initializer``, ``X`` and
    # ``four_surveys_taken_auth_ids`` are defined outside this function.
    # The map results are discarded, so ``X`` is presumably filled through
    # state set up by ``initializer`` -- confirm, and confirm that the
    # workers can see the ``gmm``/``features`` fitted here.
    local_pool = multiprocessing.Pool(20, initializer)
    try:
        local_pool.map(fill_X, range(len(four_surveys_taken_auth_ids)))
    finally:
        local_pool.close()
        local_pool.join()

    np.savetxt("X_" + str(i) + ".csv", X, delimiter=",")
from sklearn.mixture import GaussianMixture


def fill_X(auth_index):
    """Return the GMM component probabilities for one author.

    ``auth_index`` indexes ``four_surveys_taken_auth_ids``; the matching
    row of ``features`` is the position of that author id in ``ids_auth``.

    Raises:
        ValueError: if the author id is not present in ``ids_auth``.
    """
    auth = four_surveys_taken_auth_ids[auth_index]
    # Constant-time lookup in a table built once at module level, instead
    # of converting ``ids_auth`` to a list and scanning it on every call.
    k = _auth_rows.get(auth)
    if k is None:
        # Same exception type the previous ``list.index`` lookup raised.
        raise ValueError("%r is not in ids_auth" % (auth,))
    return gmm.predict_proba(features[k].reshape(1, -1))[0]


n_compo = 10

df_ecologie = read_data('data/LA_TRANSITION_ECOLOGIQUE.json')
df_resp_eco = get_open_reponses(df_ecologie)
df_ids_eco = get_ids_open_reponses(df_ecologie)
four_surveys_taken_auth_ids = np.loadtxt("four_surveys_taken_auth_ids.csv",
                                         delimiter=",", dtype=str)
ids_auth = np.sort(list(set(df_resp_eco['authorId'].values)))
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")
# ``ids_auth`` is built from a set, so each id maps to exactly one row.
# The table must exist before the pool is created so workers can use it.
_auth_rows = {auth_id: row for row, auth_id in enumerate(ids_auth)}

# read features
features = np.loadtxt('responses ecologie_all_questions.tsv', delimiter='\t')

# Fit GMM
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)

# pool: one row of component probabilities per author, in the order of
# ``four_surveys_taken_auth_ids``. ``X`` is built directly from the map
# results, so it is no longer pre-allocated with zeros.
local_pool = multiprocessing.Pool(10)
try:
    X = np.array(local_pool.map(fill_X,
                                range(len(four_surveys_taken_auth_ids))))
finally:
    # Release the worker processes even if a lookup fails.
    local_pool.close()
    local_pool.join()
def fill_X(auth_index):
    """Return the GMM component probabilities for one author.

    ``auth_index`` indexes ``four_surveys_taken_auth_ids``; the matching
    row of ``features`` is the position of that author id in ``ids_auth``.

    Raises:
        ValueError: if the author id is not present in ``ids_auth``.
    """
    auth = four_surveys_taken_auth_ids[auth_index]
    # Constant-time lookup in a table built once at module level, instead
    # of converting ``ids_auth`` to a list and scanning it on every call.
    k = _auth_rows.get(auth)
    if k is None:
        # Same exception type the previous ``list.index`` lookup raised.
        raise ValueError("%r is not in ids_auth" % (auth,))
    return gmm.predict_proba(features[k].reshape(1, -1))[0]


n_compo = 10

df_organisation = read_data(
    'data/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json')
df_resp_org = get_open_reponses(df_organisation)
df_ids_org = get_ids_open_reponses(df_organisation)
four_surveys_taken_auth_ids = np.loadtxt("four_surveys_taken_auth_ids.csv",
                                         delimiter=",", dtype=str)
ids_auth = np.sort(list(set(df_resp_org['authorId'].values)))
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")
# ``ids_auth`` is built from a set, so each id maps to exactly one row.
# The table must exist before the pool is created so workers can use it.
_auth_rows = {auth_id: row for row, auth_id in enumerate(ids_auth)}

X = np.zeros((len(four_surveys_taken_auth_ids), n_compo))

# read features
features = np.loadtxt('responses organisation_all_questions.tsv',
                      delimiter='\t')

# Fit GMM
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)

# pool
local_pool = multiprocessing.Pool(10)
from src.kmeans_embeddings import FeaturesExtractor from src.utils import (read_data, get_open_reponses) from sklearn.mixture import GaussianMixture #%% extract data from json df_fiscalite = ut.read_data('data/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json') df_democratie = ut.read_data('data/DEMOCRATIE_ET_CITOYENNETE.json') df_ecologie = ut.read_data('data/LA_TRANSITION_ECOLOGIQUE.json') df_organisation = ut.read_data('data/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json') dfs = np.array([["fiscalite", df_fiscalite], ["democratie", df_democratie], ["ecologie", df_ecologie], ["organisation", df_organisation]]) #%% questionId = '162' df_responses = get_open_reponses(df_fiscalite) responses = (df_responses[df_responses.questionId == questionId].formattedValue.values.tolist()) # Extract embeddings for sentences s = FeaturesExtractor() features = [s.get_features(x) for x in responses] features_np = np.array(features) #samples_id = np.random.choice(range(len(features)), 5000) features_np_samples = features_np[:,:]#samples_id, :] np.savetxt('features_s_fiscalite_'+questionId+'.tsv', features_np_samples, delimiter='\t') #responses_samples = [responses[i] for i in samples_id] with open('labels_s_fiscalite_'+questionId+'.tsv', 'w') as f:
# -*- coding: utf-8 -*- from src.kmeans_embeddings import FeaturesExtractor from src.utils import (read_data, get_open_reponses) from sklearn.cluster import KMeans import numpy as np import pandas as pd if __name__ == '__main__': df = read_data('data/DEMOCRATIE_ET_CITOYENNETE.json') df_responses = get_open_reponses(df) responses = (df_responses[df_responses.questionId == '107']. formattedValue.values.tolist()) # Extract embeddings for sentences s = FeaturesExtractor() features = [s.get_features(x) for x in responses] features_np = np.array(features) print(features_np) samples_id = np.random.choice(range(len(features)), 5000) features_np_samples = features_np[samples_id, :] np.savetxt('features_s.tsv', features_np_samples, delimiter='\t') responses_samples = [responses[i] for i in samples_id] with open('labels_s.tsv', 'w') as f: for resp in responses_samples:
import numpy as np
from src.utils import read_data, get_open_reponses


def _labelled_table(pairs):
    """Return an ``(n, 2)`` object array of ``(label, frame)`` rows.

    Cells are assigned one at a time into a pre-allocated object array so
    each value is stored as-is. Passing the nested list straight to
    ``np.array`` relies on ragged-shape inference, which recent NumPy
    releases reject unless ``dtype=object`` is given.
    """
    table = np.empty((len(pairs), 2), dtype=object)
    for row, (label, frame) in enumerate(pairs):
        table[row, 0] = label
        table[row, 1] = frame
    return table


#%% extract data from json
df_fiscalite = read_data('data/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json')
df_democratie = read_data('data/DEMOCRATIE_ET_CITOYENNETE.json')
df_ecologie = read_data('data/LA_TRANSITION_ECOLOGIQUE.json')
df_organisation = read_data(
    'data/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json')

# Row r holds [theme label, theme data]; indexed as dfs[r, 0] / dfs[r, 1].
dfs = _labelled_table([("fiscalite", df_fiscalite),
                       ("democratie", df_democratie),
                       ("ecologie", df_ecologie),
                       ("organisation", df_organisation)])

#%%
#%% responses of each theme
df_resp_fis = get_open_reponses(df_fiscalite)
df_resp_dem = get_open_reponses(df_democratie)
df_resp_eco = get_open_reponses(df_ecologie)
df_resp_org = get_open_reponses(df_organisation)

dfs_responses = _labelled_table([("responses fiscalite", df_resp_fis),
                                 ("responses democratie", df_resp_dem),
                                 ("responses ecologie", df_resp_eco),
                                 ("responses organisation", df_resp_org)])

# allAuthIds is the set of every authorId seen in any of the four themes
allAuthIds = set()
for i in range(4):
    allAuthIds.update(dfs_responses[i, 1]['authorId'].values)
from sklearn.mixture import GaussianMixture


def fill_X(auth_index):
    """Return the GMM component probabilities for one author.

    ``auth_index`` indexes ``four_surveys_taken_auth_ids``; the matching
    row of ``features`` is the position of that author id in ``ids_auth``.

    Raises:
        ValueError: if the author id is not present in ``ids_auth``.
    """
    auth = four_surveys_taken_auth_ids[auth_index]
    # Constant-time lookup in a table built once at module level, instead
    # of converting ``ids_auth`` to a list and scanning it on every call.
    k = _auth_rows.get(auth)
    if k is None:
        # Same exception type the previous ``list.index`` lookup raised.
        raise ValueError("%r is not in ids_auth" % (auth,))
    return gmm.predict_proba(features[k].reshape(1, -1))[0]


n_compo = 10

df_democratie = read_data('data/DEMOCRATIE_ET_CITOYENNETE.json')
df_resp_dem = get_open_reponses(df_democratie)
df_ids_dem = get_ids_open_reponses(df_democratie)
four_surveys_taken_auth_ids = np.loadtxt("four_surveys_taken_auth_ids.csv",
                                         delimiter=",", dtype=str)
ids_auth = np.sort(list(set(df_resp_dem['authorId'].values)))
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")
# ``ids_auth`` is built from a set, so each id maps to exactly one row.
# The table must exist before the pool is created so workers can use it.
_auth_rows = {auth_id: row for row, auth_id in enumerate(ids_auth)}

# read features
features = np.loadtxt('responses democratie_all_questions.tsv',
                      delimiter='\t')

# Fit GMM
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)

# pool: one row of component probabilities per author, in the order of
# ``four_surveys_taken_auth_ids``. ``X`` is built directly from the map
# results, so it is no longer pre-allocated with zeros.
local_pool = multiprocessing.Pool(10)
try:
    X = np.array(local_pool.map(fill_X,
                                range(len(four_surveys_taken_auth_ids))))
finally:
    # Release the worker processes even if a lookup fails.
    local_pool.close()
    local_pool.join()
import src.utils as ut
import numpy as np
import pandas as pd
import string
import multiprocessing
from src.utils import (read_data, get_open_reponses, get_ids_open_reponses)
from som_batched_learning import (open_model, get_clusters)
from X import (get_X, get_auth_id)


def _labelled_table(pairs):
    """Return an ``(n, 2)`` object array of ``(label, frame)`` rows.

    Cells are assigned one at a time into a pre-allocated object array so
    each value is stored as-is. Passing the nested list straight to
    ``np.array`` relies on ragged-shape inference, which recent NumPy
    releases reject unless ``dtype=object`` is given.
    """
    table = np.empty((len(pairs), 2), dtype=object)
    for row, (label, frame) in enumerate(pairs):
        table[row, 0] = label
        table[row, 1] = frame
    return table


# Output of the first GMM learning stage
X = get_X()

# Output of the 2nd stage: best SOM model found after 500 models trained
best_som_model = open_model(60)
clusters = get_clusters(nb_clusters=10, X_projected=X, sm=best_som_model)

df_fiscalite = ut.read_data('data/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json')
df_democratie = ut.read_data('data/DEMOCRATIE_ET_CITOYENNETE.json')
df_ecologie = ut.read_data('data/LA_TRANSITION_ECOLOGIQUE.json')
df_organisation = ut.read_data(
    'data/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json')

# Row r holds [theme label, theme data]; indexed as dfs[r, 0] / dfs[r, 1].
dfs = _labelled_table([("fiscalite", df_fiscalite),
                       ("democratie", df_democratie),
                       ("ecologie", df_ecologie),
                       ("organisation", df_organisation)])

# NOTE(review): this passes the whole label/data table to
# get_open_reponses; confirm the helper accepts that rather than a single
# theme's data.
df_reponses = get_open_reponses(dfs)

print(clusters)