def features_from(i):
    """Fit a GMM on one survey's saved features and export per-author posteriors.

    Loads ``<prefix>_all_questions.tsv`` for the survey selected by ``i``,
    fits a 10-component ``GaussianMixture`` on it, maps ``fill_X`` over every
    index of the module-level ``four_surveys_taken_auth_ids`` in a 20-worker
    pool, and writes the stacked results to ``X_<i>.csv``.

    Args:
        i: Survey index: 0 = fiscalite, 1 = democratie, 2 = ecologie,
            3 = organisation.

    Raises:
        IndexError: If ``i`` does not index the four known surveys.

    NOTE(review): ``fill_X`` reads the *module-level* ``gmm``, ``features``
    and ``ids_auth``; the model and features fitted here are locals and are
    therefore not what the workers use. Confirm whether ``initializer`` (not
    defined in this part of the file) is meant to bridge that gap.
    """
    # File-name prefixes of the per-survey feature dumps, indexed by survey.
    # A plain tuple replaces the former mixed string/DataFrame NumPy arrays,
    # which were only ever used for this name lookup.
    feature_file_prefixes = (
        "responses fiscalite",
        "responses democratie",
        "responses ecologie",
        "responses organisation",
    )

    # read features
    features = np.loadtxt(feature_file_prefixes[i] + '_all_questions.tsv',
                          delimiter='\t')

    # Fit GMM
    gmm = GaussianMixture(n_components=10)
    gmm.fit(np.array(features))

    local_pool = multiprocessing.Pool(20, initializer)
    try:
        # One task per author index; keep the mapped rows instead of
        # discarding them.
        X = np.array(local_pool.map(
            fill_X, range(len(four_surveys_taken_auth_ids))))
    finally:
        # Always release the worker processes, even if a task failed.
        local_pool.close()
        local_pool.join()

    np.savetxt("X_" + str(i) + ".csv", X, delimiter=",")
def extract_features():
    """Build and save the per-author feature matrix for survey index 1.

    Reads the module-level ``dfs`` and ``dfs_responses`` tables. For every
    distinct ``authorId`` (sorted) and every open-question id, the first
    matching ``formattedValue`` is passed to ``s.get_features`` and stored in
    a 300-wide slot of that author's row; slots without a response stay zero.
    The matrix is written to ``<label>_all_questions.tsv`` (tab-separated),
    where ``<label>`` is ``dfs_responses[k, 0]``.
    """
    # Width of one question's slot in a feature row.
    # NOTE(review): presumably the length of the vector returned by
    # s.get_features -- confirm.
    slot_width = 300

    for k in [1]:
        ids_questions = get_ids_open_reponses(dfs[k, 1])
        responses = dfs_responses[k, 1]
        ids_auth = np.sort(list(set(responses['authorId'].values)))

        # Extract embeddings for sentences
        features = np.zeros((len(ids_auth), slot_width * len(ids_questions)))
        for i, auth in enumerate(ids_auth):
            # Filter this author's rows once instead of twice per question.
            author_rows = responses[responses['authorId'] == auth]
            for j in range(len(ids_questions)):
                answers = author_rows[
                    author_rows['questionId'] == ids_questions[j]
                ].formattedValue.values.tolist()
                if answers:
                    # Only the first response to a question is embedded.
                    start = slot_width * j
                    features[i][start:start + slot_width] = s.get_features(answers[0])
                # No response: the slot keeps its initial zeros.

        np.savetxt(dfs_responses[k, 0] + '_all_questions.tsv', features,
                   delimiter='\t')
def fill_X(auth_index):
    """Return the GMM component posteriors for one author.

    Reads the module-level ``four_surveys_taken_auth_ids``, ``ids_auth``,
    ``features`` and ``gmm``.

    Args:
        auth_index: Position of the author in ``four_surveys_taken_auth_ids``.

    Returns:
        1-D array of ``gmm.predict_proba`` values for that author's feature
        row.

    Raises:
        ValueError: If the author id is not present in ``ids_auth``.
    """
    auth = four_surveys_taken_auth_ids[auth_index]
    # ids_auth is sorted and duplicate-free (np.sort over a set, see below),
    # so a binary search finds the same row as list(ids_auth).index(auth)
    # without copying and scanning the whole array on every call.
    k = np.searchsorted(ids_auth, auth)
    if k == len(ids_auth) or ids_auth[k] != auth:
        raise ValueError("%r is not in ids_auth" % (auth,))
    return gmm.predict_proba(features[k].reshape(1, -1))[0]


# --- Ecologie survey: fit a GMM and compute per-author posteriors ---
n_compo = 10
df_ecologie = read_data('data/LA_TRANSITION_ECOLOGIQUE.json')
df_resp_eco = get_open_reponses(df_ecologie)
df_ids_eco = get_ids_open_reponses(df_ecologie)
four_surveys_taken_auth_ids = np.loadtxt("four_surveys_taken_auth_ids.csv",
                                         delimiter=",", dtype=str)
# Sorted, unique author ids; row k of `features` is looked up through this.
ids_auth = np.sort(list(set(df_resp_eco['authorId'].values)))
# NOTE(review): every survey run writes to this same file name, so each run
# overwrites the previous one -- confirm that is intended.
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")

# read features
features = np.loadtxt('responses ecologie_all_questions.tsv', delimiter='\t')

# Fit GMM
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)

# pool -- created only after every global that fill_X reads is assigned
local_pool = multiprocessing.Pool(10)
try:
    X = np.array(local_pool.map(fill_X,
                                range(len(four_surveys_taken_auth_ids))))
finally:
    # Release the workers even if a task raised.
    local_pool.close()
    local_pool.join()
# NOTE(review): X is not written to disk here, unlike the fiscalite run
# (X_fiscalite.csv) -- confirm whether an export is missing.
def fill_X(auth_index):
    """Return the GMM component posteriors for one author.

    Reads the module-level ``four_surveys_taken_auth_ids``, ``ids_auth``,
    ``features`` and ``gmm``.

    Args:
        auth_index: Position of the author in ``four_surveys_taken_auth_ids``.

    Returns:
        1-D array of ``gmm.predict_proba`` values for that author's feature
        row.

    Raises:
        ValueError: If the author id is not present in ``ids_auth``.
    """
    auth = four_surveys_taken_auth_ids[auth_index]
    # ids_auth is sorted and duplicate-free (np.sort over a set, see below),
    # so a binary search finds the same row as list(ids_auth).index(auth)
    # without copying and scanning the whole array on every call.
    k = np.searchsorted(ids_auth, auth)
    if k == len(ids_auth) or ids_auth[k] != auth:
        raise ValueError("%r is not in ids_auth" % (auth,))
    return gmm.predict_proba(features[k].reshape(1, -1))[0]


# --- Organisation survey: fit a GMM and compute per-author posteriors ---
n_compo = 10
df_organisation = read_data(
    'data/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json')
df_resp_org = get_open_reponses(df_organisation)
df_ids_org = get_ids_open_reponses(df_organisation)
four_surveys_taken_auth_ids = np.loadtxt("four_surveys_taken_auth_ids.csv",
                                         delimiter=",", dtype=str)
# Sorted, unique author ids; row k of `features` is looked up through this.
ids_auth = np.sort(list(set(df_resp_org['authorId'].values)))
# NOTE(review): every survey run writes to this same file name, so each run
# overwrites the previous one -- confirm that is intended.
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")

# read features
features = np.loadtxt('responses organisation_all_questions.tsv',
                      delimiter='\t')

# Fit GMM
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)

# pool -- created only after every global that fill_X reads is assigned
local_pool = multiprocessing.Pool(10)
try:
    X = np.array(local_pool.map(fill_X,
                                range(len(four_surveys_taken_auth_ids))))
finally:
    # The pool was previously never closed, leaving its workers alive.
    local_pool.close()
    local_pool.join()
# NOTE(review): X is not written to disk here, unlike the fiscalite run
# (X_fiscalite.csv) -- confirm whether an export is missing.
def fill_X(auth_index):
    """Return the GMM component posteriors for one author.

    Reads the module-level ``four_surveys_taken_auth_ids``, ``ids_auth``,
    ``features`` and ``gmm``.

    Args:
        auth_index: Position of the author in ``four_surveys_taken_auth_ids``.

    Returns:
        1-D array of ``gmm.predict_proba`` values for that author's feature
        row.

    Raises:
        ValueError: If the author id is not present in ``ids_auth``.
    """
    auth = four_surveys_taken_auth_ids[auth_index]
    # ids_auth is sorted and duplicate-free (np.sort over a set, see below),
    # so a binary search finds the same row as list(ids_auth).index(auth)
    # without copying and scanning the whole array on every call.
    k = np.searchsorted(ids_auth, auth)
    if k == len(ids_auth) or ids_auth[k] != auth:
        raise ValueError("%r is not in ids_auth" % (auth,))
    return gmm.predict_proba(features[k].reshape(1, -1))[0]


# --- Democratie survey: fit a GMM and compute per-author posteriors ---
n_compo = 10
df_democratie = read_data('data/DEMOCRATIE_ET_CITOYENNETE.json')
df_resp_dem = get_open_reponses(df_democratie)
df_ids_dem = get_ids_open_reponses(df_democratie)
four_surveys_taken_auth_ids = np.loadtxt("four_surveys_taken_auth_ids.csv",
                                         delimiter=",", dtype=str)
# Sorted, unique author ids; row k of `features` is looked up through this.
ids_auth = np.sort(list(set(df_resp_dem['authorId'].values)))
# NOTE(review): every survey run writes to this same file name, so each run
# overwrites the previous one -- confirm that is intended.
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")

# read features
features = np.loadtxt('responses democratie_all_questions.tsv', delimiter='\t')

# Fit GMM
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)

# pool -- created only after every global that fill_X reads is assigned
local_pool = multiprocessing.Pool(10)
try:
    X = np.array(local_pool.map(fill_X,
                                range(len(four_surveys_taken_auth_ids))))
finally:
    # Release the workers even if a task raised.
    local_pool.close()
    local_pool.join()
# NOTE(review): X is not written to disk here, unlike the fiscalite run
# (X_fiscalite.csv) -- confirm whether an export is missing.
from sklearn.mixture import GaussianMixture


def fill_X(auth_index):
    """Return the GMM component posteriors for one author.

    Reads the module-level ``four_surveys_taken_auth_ids``, ``ids_auth``,
    ``features`` and ``gmm``.

    Args:
        auth_index: Position of the author in ``four_surveys_taken_auth_ids``.

    Returns:
        1-D array of ``gmm.predict_proba`` values for that author's feature
        row.

    Raises:
        ValueError: If the author id is not present in ``ids_auth``.
    """
    auth = four_surveys_taken_auth_ids[auth_index]
    # ids_auth is sorted and duplicate-free (np.sort over a set, see below),
    # so a binary search finds the same row as list(ids_auth).index(auth)
    # without copying and scanning the whole array on every call.
    k = np.searchsorted(ids_auth, auth)
    if k == len(ids_auth) or ids_auth[k] != auth:
        raise ValueError("%r is not in ids_auth" % (auth,))
    return gmm.predict_proba(features[k].reshape(1, -1))[0]


# --- Fiscalite survey: fit a GMM and export per-author posteriors ---
n_compo = 10
df_fiscalite = read_data('data/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json')
df_resp_fis = get_open_reponses(df_fiscalite)
df_ids_fis = get_ids_open_reponses(df_fiscalite)
four_surveys_taken_auth_ids = np.loadtxt("four_surveys_taken_auth_ids.csv",
                                         delimiter=",", dtype=str)
# Sorted, unique author ids; row k of `features` is looked up through this.
ids_auth = np.sort(list(set(df_resp_fis['authorId'].values)))
# NOTE(review): every survey run writes to this same file name, so each run
# overwrites the previous one -- confirm that is intended.
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")

# read features
features = np.loadtxt('responses fiscalite_all_questions.tsv', delimiter='\t')

# Fit GMM
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)

# pool -- created only after every global that fill_X reads is assigned
local_pool = multiprocessing.Pool(10)
try:
    X = np.array(local_pool.map(fill_X,
                                range(len(four_surveys_taken_auth_ids))))
finally:
    # Release the workers even if a task raised.
    local_pool.close()
    local_pool.join()

np.savetxt("X_fiscalite.csv", X, delimiter=",")