def main(args): # datasets de train y test df = pd.read_csv("data/imdb_small.csv") #recortando data set para pruebas chiquitas if (args.elem > 0): df = df[:args.elem] X_train, y_train, X_test, y_test = get_instances(df) alpha = args.alpha_Start # rango de k k_range = np.arange(args.k_Start, args.k_Stop, args.k_Step) # rango de alpha alpha_range = np.arange(alpha, args.alpha_Stop, args.alpha_Step) # resultados de accuaracies para cada alpha results = [] if (alpha > 0): # PCA for alpha in alpha_range: pca = PCA(alpha) #print("fit") pca.fit(X_train.toarray()) #print("trainx") X_train_aux = pca.transform(X_train) #print("train_test") X_test_aux = pca.transform(X_test) # resultados de accuaracy con cada k accs = evaluate_knn(k_range, X_train_aux, y_train, X_test_aux, y_test, args.reps) results.append(accs) else: # resultados de accuaracy con cada k accs = evaluate_knn(k_range, X_train, y_train, X_test, y_test, args.reps) results.append(accs) if (alpha > 0): for i, result in enumerate(results): plt.plot(k_range, result, label='alpha = {0}'.format(alpha_range[i])) else: plt.plot(k_range, results[0], label='sin pca') plt.xlabel('k') plt.ylabel('accuaracy') plt.legend() plt.savefig('results/k_vs_accuaracy-{}'.format( time.strftime("%Y%m%d-%H%M%S"))) plt.show() '''
def run_test(df, TRAIN_SIZE=6225, TEST_SIZE=500, BINARIO=False, NEGACIONES=False, NORMA_PESADA=False, IDF=False, STOP_WORDS=False):
    """Run one experiment (PCA reduction + optional weighted norm) and
    return the accuracy obtained for each tested k."""
    print("--------------------------------------------------------")
    print("Test empezado con:")
    print("Train size:", TRAIN_SIZE)
    print("Test size:", TEST_SIZE)
    print("Negaciones:", NEGACIONES)
    print("Binario:", BINARIO)
    print("Norma pesada:", NORMA_PESADA)
    print("Stop words:", STOP_WORDS)
    print("IDF:", IDF)

    text_train, label_train, text_test, label_test = get_instances(
        df, TRAIN_SIZE, TEST_SIZE)
    X_train, y_train, X_test, y_test = vectorizar(text_train, label_train,
                                                  text_test, label_test,
                                                  BINARIO, IDF, NEGACIONES,
                                                  STOP_WORDS)

    X_train = X_train.todense()
    # Total dispersion of the raw training matrix (sum of per-row stds).
    var_total = np.std(X_train, axis=1).sum()

    # Reduce with PCA, then keep only enough components to cover a fixed
    # fraction of the total dispersion.
    pca = PCA(1500)
    pca.fit(X_train)
    X_train = pca.transform(X_train, 1500)
    X_train = tomar_porcentaje(X_train, 0.03, var_total)  # TODO: tune fraction
    X_test = pca.transform(X_test, X_train.shape[1])
    print("ALPHA =", X_train.shape[1])

    clf = KNNClassifier(1)
    clf.fit(X_train, y_train)

    if not NORMA_PESADA:
        mat = clf.testearK(X_test)
    else:
        # Covariance / correlation of each feature with the labels.
        y_centered = y_train - y_train.mean()
        y_std = np.std(y_train)
        n_feats = X_train.shape[1]
        covarianzas = np.zeros(n_feats)
        correlaciones = np.zeros(n_feats)
        for j in range(n_feats):
            col = X_train[:, j]
            covarianzas[j] = ((col - col.mean()) * y_centered).sum()
            correlaciones[j] = covarianzas[j] / (np.std(col) * y_std)
        # NOTE(review): the covariances (not the correlations) are passed as
        # weights here, unlike the other run_test variant — confirm intended.
        mat = clf.testearK_weighted(X_test, covarianzas)

    # One accuracy per tested k (the columns of `mat`).
    return [accuracy_score(y_test, mat[:, j]) for j in range(len(mat[0]))]
def main(args): # datasets de train y test df = pd.read_csv("data/imdb_small.csv") #recortando data set para pruebas chiquitas if (args.elem > 0): df = df[:args.elem] X_train, y_train, X_test, y_test = get_instances(df) alpha = args.alpha_Start # rango de k k_range = np.arange(args.k_Start, args.k_Stop, args.k_Step) # rango de alpha alpha_range = np.arange(alpha, args.alpha_Stop, args.alpha_Step) # resultados de accuaracies para cada k results = [] for i in range(len(k_range)): results.append([]) for alpha in alpha_range: pca = PCA(alpha) pca.fit(X_train.toarray()) X_train_aux = pca.transform(X_train) X_test_aux = pca.transform(X_test) # kNN for i in range(len(k_range)): k = k_range[i] clf = KNNClassifier(k) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) t = accuracy_score(y_test, y_pred) results[i].append(t) for i, result in enumerate(results): plt.plot(alpha_range, result, label='k = {0}'.format(k_range[i])) plt.xlabel('alpha') plt.ylabel('accuaracy') plt.legend() plt.savefig('results/k_vs_accuaracy-{}'.format( time.strftime("%Y%m%d-%H%M%S"))) plt.show() '''
# NOTE(review): this fragment runs inside an outer loop that defines `exp`,
# `out_name`, `paths`, `alpha`, `y_train` and the *_orig matrices — those
# bindings are outside the visible chunk.
print("exponente: {}".format(exp))
for rep in range(1):
    print("rep: {}".format(rep))
    out_path = "data/{}_{}_{}.out".format(out_name, exp, rep)
    paths[exp].append(out_path)
    # Tolerance shrinks as the exponent grows.
    eps = 10**(-exp)

    pca = PCA(alpha, eps)
    print("Entrenando PCA")
    # Bug fix: time.clock() was deprecated in 3.3 and removed in Python 3.8;
    # time.perf_counter() is the documented replacement for wall-clock timing.
    t = time.perf_counter()
    pca.fit(X_train_orig)
    print("Transformando datos")
    X_train = pca.transform(X_train_orig)
    X_test = pca.transform(X_test_orig)
    total_time = time.perf_counter() - t
    print("time: {}".format(total_time))

    # Train KNN.
    clf = KNNClassifier(5)
    print("Entrenando KNN")
    clf.fit(X_train, y_train)

    # Test.
    print("Prediciendo etiquetas...")
def run_test(df, TRAIN_SIZE=6225, TEST_SIZE=500, ALPHA=None, K=None, BINARIO=False, NEGACIONES=False, NORMA_PESADA=False, IDF=False, STOP_WORDS=False):
    """Vectorize, optionally PCA-reduce, classify with K-NN and return accuracy.

    ALPHA=None skips PCA; NORMA_PESADA selects the correlation-weighted
    distance.

    Improvements over the original:
    - `!= None` replaced with `is not None`; the inner `if ALPHA:` is made
      consistent with the outer check, so ALPHA=0 no longer takes the sparse
      branch on an already-dense matrix.
    - `X_train.todense()` is now computed once, instead of twice per column
      inside the correlation loop (it re-densified the whole matrix on every
      iteration).
    """
    print("--------------------------------------------------------")
    print("Test empezado con:")
    print("Train size:", TRAIN_SIZE)
    print("Test size:", TEST_SIZE)
    print("Alpha:", ALPHA)
    print("K:", K)
    print("Negaciones:", NEGACIONES)
    print("Binario:", BINARIO)
    print("Norma pesada:", NORMA_PESADA)
    print("Stop words:", STOP_WORDS)
    print("IDF:", IDF)

    text_train, label_train, text_test, label_test = get_instances(
        df, TRAIN_SIZE, TEST_SIZE)
    print("Vectorizando...")
    X_train, y_train, X_test, y_test = vectorizar(text_train, label_train,
                                                  text_test, label_test,
                                                  BINARIO, IDF, NEGACIONES,
                                                  STOP_WORDS)

    if ALPHA is not None:
        print("Obteniendo componentes principales...")
        pca = PCA(ALPHA)
        pca.fit(X_train.todense())
        X_train = pca.transform(X_train, ALPHA)
        X_test = pca.transform(X_test, ALPHA)

    clf = KNNClassifier(K)
    clf.fit(X_train, y_train)

    print("Prediciendo...")
    if not NORMA_PESADA:
        y_pred = clf.predict(X_test)
    else:
        # Weight each feature by |correlation with the training labels|.
        y_train_norm = y_train - y_train.mean()
        ystd = np.std(y_train)
        # After PCA X_train is already dense; otherwise densify ONCE here
        # (hoisted out of the loop below).
        X_dense = X_train if ALPHA is not None else X_train.todense()
        n_features = X_dense.shape[1]
        covarianzas = np.zeros(n_features)
        correlaciones = np.zeros(n_features)
        for i in range(n_features):
            col = X_dense[:, i]
            covarianzas[i] = ((col - col.mean()) * y_train_norm).sum()
            correlaciones[i] = covarianzas[i] / (np.std(col) * ystd)
        y_pred = clf.predict_weighted(X_test, np.abs(correlaciones))

    print("Test finalizado")
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy: {}".format(acc))
    return acc
# Command-line paths: input test CSV and output file.
# NOTE(review): `out_path` is not used in this visible chunk — presumably the
# predicted labels are written to it further down; confirm in the full file.
test_path = sys.argv[1]
out_path = sys.argv[2]
df = pd.read_csv("data/imdb_small.csv")
df_test = pd.read_csv(test_path)
print("Vectorizando datos...")
X_train, y_train, X_test, ids_test = get_instances(df, df_test)
# Comment this section out if our best configuration does not use PCA.
alpha = 450
pca = PCA(alpha)
print("Entrenando PCA")
pca.fit(X_train.toarray())
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
""" Entrenamos KNN """
clf = KNNClassifier(2000)
clf.fit(X_train, y_train)
""" Testeamos """
print("Prediciendo etiquetas...")
# Flatten to a 1-D vector of predicted class values.
y_pred = clf.predict(X_test).reshape(-1)
# Map the numeric predictions to 'pos' / 'neg' labels.
labels = ['pos' if val == 1 else 'neg' for val in y_pred]
# Experiment configuration flags for the vectorizer.
TRAIN_SIZE = 6225
NEGACIONES = True
BINARIO = True
NORMA_PESADA = True
STOP_WORDS = True
IDF = True
df_train = pd.read_csv("../data/imdb_small.csv")
text_train = get_instances(df_train, TRAIN_SIZE)
X_train = vectorizar(text_train, BINARIO, IDF, NEGACIONES, STOP_WORDS).todense()
# NOTE(review): `caso`, `alpha` and `P` are defined outside this chunk —
# presumably parsed from the command line; confirm in the full file.
if caso == "0":
    # Compare total dispersion before and after reducing to `alpha` components.
    # NOTE(review): despite the `var_` names this sums standard deviations,
    # not variances, and does so per ROW (axis=1) — confirm that is intended.
    var_total = np.std(X_train, axis=1).sum()
    pca = PCA(alpha)
    pca.fit(X_train)
    X_train = pca.transform(X_train, alpha)
    var_con_pca = np.std(X_train, axis=1).sum()
else:
    # Find how many leading principal components are needed before the
    # accumulated per-component std exceeds fraction P of the total.
    var_total = np.std(X_train, axis=1).sum()
    pca = PCA(X_train.shape[1])
    pca.fit(X_train)
    X_train = pca.transform(X_train, X_train.shape[1])
    var_parcial = np.std(X_train[:, 0])
    for i in range(1, X_train.shape[1]):
        # Print the first component index where the threshold is crossed.
        if var_parcial / var_total > P:
            print(i)
            break
        var_parcial = var_parcial + np.std(X_train[:, i])
# NOTE(review): this chunk is truncated — it starts after `test_path` /
# TRAIN_SIZE / ALPHA / K / the boolean flags are defined, and it ends
# mid-statement at the dangling `if ALPHA:` below.
out_path = sys.argv[2]
df_train = pd.read_csv("../data/imdb_small.csv")
df_test = pd.read_csv(test_path)
print("Vectorizando datos...")
text_train, label_train, text_test, ids_test = get_instances(
    df_train, df_test, TRAIN_SIZE)
X_train, y_train, X_test = vectorizar(text_train, label_train, text_test,
                                      BINARIO, IDF, NEGACIONES, STOP_WORDS)
# Optional PCA reduction to ALPHA components.
if ALPHA != None:
    print("Obteniendo componentes principales...")
    pca = PCA(ALPHA)
    pca.fit(X_train.todense())
    X_train = pca.transform(X_train, ALPHA)
    X_test = pca.transform(X_test, ALPHA)
clf = KNNClassifier(K)
clf.fit(X_train, y_train)
print("Prediciendo...")
print(X_test.shape)
if not NORMA_PESADA:
    y_pred = clf.predict(X_test)
else:
    # Correlation-weighted distance: covariance/correlation of each feature
    # with the training labels (same scheme as run_test elsewhere in the repo).
    y_train_norm = y_train - y_train.mean()
    ystd = np.std(y_train)
    covarianzas = np.zeros(X_train.shape[1])
    correlaciones = np.zeros(X_train.shape[1])
    for i in range(X_train.shape[1]):
        if ALPHA:  # NOTE(review): body truncated here in the source chunk
# NOTE(review): `df` is defined earlier, outside this chunk, and the twin-axis
# plot continues after it.
X_train, y_train, X_test, y_test = get_instances(df)
k = 500
# Alpha (number of PCA components) sweep range.
alpha_range = np.arange(50, 150, 30)
y_alpha = []  # accuracy per alpha
y_time = []   # elapsed seconds per alpha
for alpha in alpha_range:
    t0 = time.time()
    # PCA reduction.
    pca = PCA(alpha)
    pca.fit(X_train.toarray())
    X_train_aux = pca.transform(X_train)
    X_test_aux = pca.transform(X_test)
    # kNN on the reduced data.
    # Bug fix: previously fit/predicted on the raw X_train / X_test, so the
    # PCA transforms above were never used.
    clf = KNNClassifier(k)
    clf.fit(X_train_aux, y_train)
    y_pred = clf.predict(X_test_aux)
    acc = accuracy_score(y_test, y_pred)
    y_alpha.append(acc)
    # Bug fix: t0 was captured but the elapsed time was never recorded,
    # leaving y_time empty for the plot below.
    y_time.append(time.time() - t0)
fig, ax1 = plt.subplots()
color = 'tab:red'
ax1.set_xlabel('Alpha')
ax1.set_ylabel('Accuaracy', color=color)