Example #1
0
def features_from(i):
    """Fit a 10-component GMM on one survey's features and save author scores.

    The survey is selected by position: 0 = fiscalite, 1 = democratie,
    2 = ecologie, 3 = organisation.  The feature matrix is read from
    ``"responses <theme>_all_questions.tsv"``, a ``GaussianMixture`` is
    fitted on it, ``fill_X`` is mapped over every index of the module-level
    ``four_surveys_taken_auth_ids`` in a worker pool, and the stacked rows
    are written to ``"X_<i>.csv"``.

    Side effect: the module-level names ``gmm`` and ``features`` are rebound,
    because ``fill_X`` reads them as globals.

    Args:
        i: Index of the survey to process.

    Raises:
        IndexError: If ``i`` does not select one of the four surveys.
    """
    # ``fill_X`` looks these names up at module level, so binding them as
    # locals would leave the workers with whatever the module held before.
    # They must be set before the pool is created so workers can see them.
    global gmm, features

    # (feature-file label, survey JSON path) per theme, in index order.
    surveys = (
        ("responses fiscalite",
         'data/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json'),
        ("responses democratie", 'data/DEMOCRATIE_ET_CITOYENNETE.json'),
        ("responses ecologie", 'data/LA_TRANSITION_ECOLOGIQUE.json'),
        ("responses organisation",
         'data/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json'),
    )
    label, json_path = surveys[i]

    # NOTE(review): nothing below consumes these three values; the loads are
    # kept because the original performed them.  ``fill_X`` also depends on
    # a module-level ``ids_auth`` that this function never sets -- confirm
    # whether it was meant to be derived from ``df_responses`` here.
    df_survey = ut.read_data(json_path)
    df_responses = get_open_reponses(df_survey)
    df_ids = get_ids_open_reponses(df_survey)

    # read features
    features = np.loadtxt(label + '_all_questions.tsv', delimiter='\t')

    # Fit GMM
    gmm = GaussianMixture(n_components=10)
    gmm.fit(features)

    local_pool = multiprocessing.Pool(20, initializer)
    try:
        # One task per author index; ``range`` needs the count, not the array.
        rows = local_pool.map(fill_X,
                              range(len(four_surveys_taken_auth_ids)))
    finally:
        # Release the workers even when a task raised.
        local_pool.close()
        local_pool.join()

    # Save what was just computed rather than an unrelated global ``X``.
    np.savetxt("X_" + str(i) + ".csv", np.array(rows), delimiter=",")
from sklearn.mixture import GaussianMixture


def fill_X(auth_index):
    """Score one author's feature row with the module-level mixture model.

    Reads the module globals ``four_surveys_taken_auth_ids``, ``ids_auth``,
    ``features`` and ``gmm``.

    Args:
        auth_index: Position of the author in ``four_surveys_taken_auth_ids``.

    Returns:
        The first (only) row of ``gmm.predict_proba`` for that author's
        feature row.

    Raises:
        ValueError: If the author id does not occur in ``ids_auth``.
    """
    global gmm, ids_auth, features, four_surveys_taken_auth_ids
    author_id = four_surveys_taken_auth_ids[auth_index]
    # The author's position in ``ids_auth`` is used as the row of ``features``.
    row = list(ids_auth).index(author_id)
    sample = features[row].reshape(1, -1)  # predict_proba is given a 2-D input
    posteriors = gmm.predict_proba(sample)
    return posteriors[0]


# Number of mixture components fitted below.
n_compo = 10
df_ecologie = read_data('data/LA_TRANSITION_ECOLOGIQUE.json')
df_resp_eco = get_open_reponses(df_ecologie)
df_ids_eco = get_ids_open_reponses(df_ecologie)
# Author ids to score; one output row of X per entry, in this order.
four_surveys_taken_auth_ids = np.loadtxt("four_surveys_taken_auth_ids.csv",
                                         delimiter=",",
                                         dtype=str)
# Sorted unique author ids of this survey.  fill_X uses an id's position in
# this array as the row index into `features`.
ids_auth = np.sort(list(set(df_resp_eco['authorId'].values)))
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")
# read features
features = np.loadtxt('responses ecologie_all_questions.tsv', delimiter='\t')
# Fit GMM
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)
# pool
# NOTE(review): fill_X reads gmm/features/ids_auth as module globals, so the
# workers presumably rely on inheriting them from this process -- confirm the
# start method in use.
local_pool = multiprocessing.Pool(10)
try:
    X = np.array(
        local_pool.map(fill_X, range(len(four_surveys_taken_auth_ids))))
finally:
    # Release the worker processes instead of leaving the pool open.
    local_pool.close()
    local_pool.join()
Example #3
0

def fill_X(auth_index):
    """Return the mixture-model probabilities for a single author.

    Uses the module globals ``four_surveys_taken_auth_ids``, ``ids_auth``,
    ``features`` and ``gmm``.

    Args:
        auth_index: Index into ``four_surveys_taken_auth_ids``.

    Returns:
        Row 0 of ``gmm.predict_proba`` evaluated on the author's feature row.

    Raises:
        ValueError: If the selected author id is missing from ``ids_auth``.
    """
    global gmm, ids_auth, features, four_surveys_taken_auth_ids
    wanted_id = four_surveys_taken_auth_ids[auth_index]
    known_ids = list(ids_auth)
    # Position in ``ids_auth`` doubles as the row index into ``features``.
    feature_row = features[known_ids.index(wanted_id)]
    # predict_proba is called with a single-row 2-D array.
    return gmm.predict_proba(feature_row.reshape(1, -1))[0]


# Number of mixture components; also the column count of X below.
n_compo = 10
df_organisation = read_data(
    'data/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json')
df_resp_org = get_open_reponses(df_organisation)
# NOTE(review): df_ids_org is not used in the visible part of this script.
df_ids_org = get_ids_open_reponses(df_organisation)
# Author ids to score, loaded as strings.
four_surveys_taken_auth_ids = np.loadtxt("four_surveys_taken_auth_ids.csv",
                                         delimiter=",",
                                         dtype=str)
# Sorted unique author ids of this survey.  fill_X uses an id's position in
# this array as the row index into `features`.
ids_auth = np.sort(list(set(df_resp_org['authorId'].values)))
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")
# Zero-filled placeholder: one row per author to score, one column per
# mixture component.
X = np.zeros((len(four_surveys_taken_auth_ids), n_compo))
# Read the feature matrix from a tab-separated file.
features = np.loadtxt('responses organisation_all_questions.tsv',
                      delimiter='\t')
# Fit the GMM on all feature rows.
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)
# Worker pool with 10 processes.
# NOTE(review): the snippet ends here -- the pool is never used, closed or
# joined in the visible code; confirm against the full script.
local_pool = multiprocessing.Pool(10)
Example #4
0
from src.kmeans_embeddings import FeaturesExtractor
from src.utils import (read_data, get_open_reponses)
from sklearn.mixture import GaussianMixture

#%% extract data from json
# The imports directly above bring in `read_data` itself, not a `ut` alias,
# so call it directly.
df_fiscalite = read_data('data/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json')
df_democratie = read_data('data/DEMOCRATIE_ET_CITOYENNETE.json')
df_ecologie = read_data('data/LA_TRANSITION_ECOLOGIQUE.json')
df_organisation = read_data(
    'data/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json')

# (theme, frame) table.  Filled cell by cell into an explicit object array:
# np.array() on mixed str/DataFrame rows is a ragged construction that
# recent NumPy releases reject.
dfs = np.empty((4, 2), dtype=object)
dfs[:, 0] = ["fiscalite", "democratie", "ecologie", "organisation"]
for _row, _frame in enumerate(
        [df_fiscalite, df_democratie, df_ecologie, df_organisation]):
    dfs[_row, 1] = _frame
#%%

# Question whose answers are embedded; also part of the output file name.
questionId = '162'

df_responses = get_open_reponses(df_fiscalite)

# Answers to the selected question, as a plain list.
_selected = df_responses[df_responses.questionId == questionId]
responses = _selected.formattedValue.values.tolist()

# Extract embeddings for sentences
s = FeaturesExtractor()
features = [s.get_features(x) for x in responses]

features_np = np.array(features)

# Random sub-sampling is disabled here: every row is kept.
features_np_samples = features_np[:, :]
np.savetxt('features_s_fiscalite_'+questionId+'.tsv', features_np_samples,
           delimiter='\t')
with open('labels_s_fiscalite_'+questionId+'.tsv', 'w') as f:
# -*- coding: utf-8 -*-


from src.kmeans_embeddings import FeaturesExtractor
from src.utils import (read_data, get_open_reponses)
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd


if __name__ == '__main__':
    df = read_data('data/DEMOCRATIE_ET_CITOYENNETE.json')
    df_responses = get_open_reponses(df)

    responses = (df_responses[df_responses.questionId == '107'].
                 formattedValue.values.tolist())

    # Extract embeddings for sentences
    s = FeaturesExtractor()
    features = [s.get_features(x) for x in responses]

    features_np = np.array(features)
    print(features_np)

    samples_id = np.random.choice(range(len(features)), 5000)

    features_np_samples = features_np[samples_id, :]
    np.savetxt('features_s.tsv', features_np_samples, delimiter='\t')
    responses_samples = [responses[i] for i in samples_id]
    with open('labels_s.tsv', 'w') as f:
        for resp in responses_samples:
import numpy as np
from src.utils import read_data, get_open_reponses

#%% extract data from json
df_fiscalite = read_data('data/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json')
df_democratie = read_data('data/DEMOCRATIE_ET_CITOYENNETE.json')
df_ecologie = read_data('data/LA_TRANSITION_ECOLOGIQUE.json')
df_organisation = read_data(
    'data/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json')

# (theme, frame) table.  Filled cell by cell into an explicit object array:
# np.array() on mixed str/DataFrame rows is a ragged construction that
# recent NumPy releases reject.
dfs = np.empty((4, 2), dtype=object)
dfs[:, 0] = ["fiscalite", "democratie", "ecologie", "organisation"]
for _row, _frame in enumerate(
        [df_fiscalite, df_democratie, df_ecologie, df_organisation]):
    dfs[_row, 1] = _frame
#%%

#%% responses of each themes
df_resp_fis = get_open_reponses(df_fiscalite)
df_resp_dem = get_open_reponses(df_democratie)
df_resp_eco = get_open_reponses(df_ecologie)
df_resp_org = get_open_reponses(df_organisation)

# Same layout as `dfs`: column 0 is the label, column 1 the responses frame.
dfs_responses = np.empty((4, 2), dtype=object)
dfs_responses[:, 0] = ["responses fiscalite", "responses democratie",
                       "responses ecologie", "responses organisation"]
for _row, _frame in enumerate(
        [df_resp_fis, df_resp_dem, df_resp_eco, df_resp_org]):
    dfs_responses[_row, 1] = _frame

# allAuthIds is the set of all authorIds across the four themes
allAuthIds = set()
for i in range(4):
    allAuthIds.update(dfs_responses[i, 1]['authorId'].values)
from sklearn.mixture import GaussianMixture


def fill_X(auth_index):
    """Compute the mixture-model probability row for one author.

    Depends on the module globals ``four_surveys_taken_auth_ids``,
    ``ids_auth``, ``features`` and ``gmm``.

    Args:
        auth_index: Index of the author in ``four_surveys_taken_auth_ids``.

    Returns:
        The first row returned by ``gmm.predict_proba`` for the author's
        feature row.

    Raises:
        ValueError: If the author id is not present in ``ids_auth``.
    """
    global gmm, ids_auth, features, four_surveys_taken_auth_ids
    target = four_surveys_taken_auth_ids[auth_index]
    # Where the author sits in ``ids_auth`` selects the row of ``features``.
    position = list(ids_auth).index(target)
    single_sample = features[position].reshape(1, -1)
    probabilities = gmm.predict_proba(single_sample)
    return probabilities[0]


# Number of mixture components fitted below.
n_compo = 10
df_democratie = read_data('data/DEMOCRATIE_ET_CITOYENNETE.json')
df_resp_dem = get_open_reponses(df_democratie)
df_ids_dem = get_ids_open_reponses(df_democratie)
# Author ids to score; one output row of X per entry, in this order.
four_surveys_taken_auth_ids = np.loadtxt("four_surveys_taken_auth_ids.csv",
                                         delimiter=",",
                                         dtype=str)
# Sorted unique author ids of this survey.  fill_X uses an id's position in
# this array as the row index into `features`.
ids_auth = np.sort(list(set(df_resp_dem['authorId'].values)))
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")
# read features
features = np.loadtxt('responses democratie_all_questions.tsv', delimiter='\t')
# Fit GMM
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)
# pool
# NOTE(review): fill_X reads gmm/features/ids_auth as module globals, so the
# workers presumably rely on inheriting them from this process -- confirm the
# start method in use.
local_pool = multiprocessing.Pool(10)
try:
    X = np.array(
        local_pool.map(fill_X, range(len(four_surveys_taken_auth_ids))))
finally:
    # Release the worker processes instead of leaving the pool open.
    local_pool.close()
    local_pool.join()
Example #8
0
import src.utils as ut
import numpy as np
import pandas as pd
import string
import multiprocessing
from src.utils import (read_data, get_open_reponses, get_ids_open_reponses)
from som_batched_learning import (open_model, get_clusters)
from X import (get_X, get_auth_id)

# Output of the first GMM learning stage
X = get_X()
# Output of the 2nd stage: best SOM model found after 500 models trained
best_som_model = open_model(60)

clusters = get_clusters(nb_clusters=10, X_projected=X, sm=best_som_model)

df_fiscalite = ut.read_data('data/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json')
df_democratie = ut.read_data('data/DEMOCRATIE_ET_CITOYENNETE.json')
df_ecologie = ut.read_data('data/LA_TRANSITION_ECOLOGIQUE.json')
df_organisation = ut.read_data(
    'data/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json')
# (theme, frame) table.  Filled cell by cell into an explicit object array:
# np.array() on mixed str/DataFrame rows is a ragged construction that
# recent NumPy releases reject.
dfs = np.empty((4, 2), dtype=object)
dfs[:, 0] = ["fiscalite", "democratie", "ecologie", "organisation"]
for _row, _frame in enumerate(
        [df_fiscalite, df_democratie, df_ecologie, df_organisation]):
    dfs[_row, 1] = _frame

# NOTE(review): every other call site in this file passes a single survey
# frame to get_open_reponses, whereas this one passes the whole 4x2 table,
# and df_reponses is not used afterwards -- confirm the intended argument.
df_reponses = get_open_reponses(dfs)

print(clusters)