def build_all_2():
    """Build feature dicts, feature vectors and pairwise kernel matrices for
    the pre-built discourse trees, pickling each intermediate result.

    Reads:
        ../data/trees_with_labels.pkl -- list of (tree, label) pairs; only
        the tree component is used here.
    Writes (all prefixed with ``path_to_save``):
        dicts.pkl, vects.pkl, vect_kernels.pkl
    """
    print('For each class, we build all the trees and save them in CSVs')
    # NOTE(review): no trailing '/' -- outputs land in ../data/test/ with a
    # 'try' filename prefix (e.g. 'trydicts.pkl'); confirm this is intended.
    path_to_save = '../data/test/try'

    # Pickle files are binary: read with 'rb' (the original 'r' text mode
    # breaks on binary pickle protocols and non-POSIX platforms), and use
    # `with` so the handle is closed instead of leaked.
    with open('../data/trees_with_labels.pkl', 'rb') as fin:
        T = pickle.load(fin)
    T = [t[0] for t in T]  # drop the labels, keep only the trees

    # Feature families, in the canonical order used throughout the project.
    index = ['bin', 'count', 'norm', 'height', 'tfid']

    print('Dicts')
    D_all = {
        'bin': vectorizers.build_bin_vects(T),
        'count': vectorizers.build_count_vects(T),
        'norm': vectorizers.build_norm_vects(T),
        'height': vectorizers.build_height_vects(T),
        'tfid': vectorizers.build_tfid_vects(T),
    }
    with open(path_to_save + 'dicts.pkl', 'wb') as fout:
        pickle.dump(D_all, fout)

    print('Vects')
    vectorizer = feature_extraction.DictVectorizer(sparse=False)
    V_all = dict((name, vectorizer.fit_transform(D_all[name])) for name in index)
    with open(path_to_save + 'vects.pkl', 'wb') as fout:
        pickle.dump(V_all, fout)

    print('Kernels')
    print('vector kernels')
    # (result key, progress message, kernel function): one similarity or
    # distance matrix is computed per kernel per feature family, replacing
    # 25 copy-pasted statements with one data-driven loop.
    kernel_specs = [
        ('lin', 'linear', pairwise.linear_kernel),
        ('rbf', 'rbf', pairwise.rbf_kernel),
        ('cos_sim', 'cosine sim', pairwise.cosine_similarity),
        ('eucl_dist', 'euclidean distance',
         lambda V: pairwise.pairwise_distances(V, metric='euclidean')),
        ('mink_dist', 'minkowski distance',
         lambda V: pairwise.pairwise_distances(V, metric='minkowski')),
    ]
    K_all = {}
    for key, message, kernel in kernel_specs:
        print(message)
        K_all[key] = dict((name, kernel(V_all[name])) for name in index)
    with open(path_to_save + 'vect_kernels.pkl', 'wb') as fout:
        pickle.dump(K_all, fout)
    print("done")
def build_all():
    """For each discourse class, build the trees and persist them -- plus
    labels, feature dicts, feature vectors and euclidean-distance matrices --
    as pickles under the project data directory.

    Side effects: calls ``write_tree_in_csv`` per class, then writes
    labels_test.pkl, trees_test.pkl, dicts_test.pkl, vects_test.pkl and
    kernels_test.pkl to ``path_to_save``.
    """
    import os

    # Build the trees for each class and dump them to CSV.
    nar_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/narrative')
    write_tree_in_csv(nar_trees)
    arg_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/argumentative/')
    write_tree_in_csv(arg_trees)
    inf_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/informative/')
    write_tree_in_csv(inf_trees)
    des_trees = []  # descriptive corpus not available yet

    # Each element is a (tree, tree_ID) pair, tree_ID being the file name.
    all_trees = nar_trees + arg_trees + inf_trees + des_trees

    # BUG FIX: open() and DataFrame.to_pickle() do not expand '~', so the
    # original literal '~/...' path failed; expand it to an absolute path.
    path_to_save = os.path.expanduser('~/Documents/s2/tal/discourseAnalysis/data/')

    # Class labels: 0=narrative, 1=argumentative, 2=informative, 3=descriptive.
    y = np.array([0] * len(nar_trees) + [1] * len(arg_trees)
                 + [2] * len(inf_trees) + [3] * len(des_trees))
    with open(path_to_save + 'labels_test.pkl', 'wb') as fout:
        pickle.dump(y, fout)

    T = [t[0] for t in all_trees]  # trees only, IDs dropped
    with open(path_to_save + 'trees_test.pkl', 'wb') as fout:
        pickle.dump(T, fout)

    # Feature families, in the canonical order used throughout the project.
    index = ['bin', 'count', 'norm', 'height', 'tfid']

    # Dicts: one {feature -> value} mapping per tree, per feature family.
    D_bin = vectorizers.build_bin_vects(T)
    D_count = vectorizers.build_count_vects(T)
    D_norm = vectorizers.build_norm_vects(T)
    D_height = vectorizers.build_height_vects(T)
    D_tfid = vectorizers.build_tfid_vects(T)
    D_df = pd.DataFrame([D_bin, D_count, D_norm, D_height, D_tfid],
                        index=index).transpose()
    D_df.to_pickle(path_to_save + 'dicts_test.pkl')

    # Vects: dense feature matrices, one per family.
    vectorizer = feature_extraction.DictVectorizer(sparse=False)
    V_bin = vectorizer.fit_transform(D_bin)
    V_count = vectorizer.fit_transform(D_count)
    V_norm = vectorizer.fit_transform(D_norm)
    V_height = vectorizer.fit_transform(D_height)
    V_tfid = vectorizer.fit_transform(D_tfid)
    # Shape (5, n_trees, n_features); the dead pre-allocation the original
    # immediately overwrote has been removed.
    V_all = np.array([V_bin, V_count, V_norm, V_height, V_tfid])
    # One row per tree: {family -> that tree's feature vector}.
    rows = [dict((index[j], V_all[j, i]) for j in range(len(index)))
            for i in range(V_all.shape[1])]
    pd.DataFrame(rows).to_pickle(path_to_save + 'vects_test.pkl')

    # Euclidean distance matrices, listed in the same family order as `index`.
    K_all_eucl_dist = [pairwise.pairwise_distances(V, metric='euclidean')
                       for V in (V_bin, V_count, V_norm, V_height, V_tfid)]
    K_all = {'eucl_dist': K_all_eucl_dist}
    with open(path_to_save + 'kernels_test.pkl', 'wb') as fout:
        pickle.dump(K_all, fout)