def main(args): # datasets de train y test df = pd.read_csv("data/imdb_small.csv") #recortando data set para pruebas chiquitas if (args.elem > 0): df = df[:args.elem] X_train, y_train, X_test, y_test = get_instances(df) alpha = args.alpha_Start # rango de k k_range = np.arange(args.k_Start, args.k_Stop, args.k_Step) # rango de alpha alpha_range = np.arange(alpha, args.alpha_Stop, args.alpha_Step) # resultados de accuaracies para cada k results = [] for i in range(len(k_range)): results.append([]) for alpha in alpha_range: pca = PCA(alpha) pca.fit(X_train.toarray()) X_train_aux = pca.transform(X_train) X_test_aux = pca.transform(X_test) # kNN for i in range(len(k_range)): k = k_range[i] clf = KNNClassifier(k) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) t = accuracy_score(y_test, y_pred) results[i].append(t) for i, result in enumerate(results): plt.plot(alpha_range, result, label='k = {0}'.format(k_range[i])) plt.xlabel('alpha') plt.ylabel('accuaracy') plt.legend() plt.savefig('results/k_vs_accuaracy-{}'.format( time.strftime("%Y%m%d-%H%M%S"))) plt.show() '''
def evaluate_knn(k_range, X_train, y_train, X_test, y_test, reps):
    """Return the mean kNN accuracy for each k in ``k_range``.

    For every k a fresh ``KNNClassifier(k)`` is fit on the training data and
    scored on the test data ``reps`` times; the ``reps`` scores are averaged.

    Returns:
        A list with one averaged accuracy per element of ``k_range``,
        in the same order.
    """

    def _score_once(neighbours):
        # One full fit/predict cycle with a brand-new classifier.
        model = KNNClassifier(neighbours)
        model.fit(X_train, y_train)
        return accuracy_score(y_test, model.predict(X_test))

    mean_scores = []
    for neighbours in k_range:
        total = sum(_score_once(neighbours) for _ in range(reps))
        mean_scores.append(total / reps)
    return mean_scores
def run_test(df,
             TRAIN_SIZE=6225,
             TEST_SIZE=500,
             BINARIO=False,
             NEGACIONES=False,
             NORMA_PESADA=False,
             IDF=False,
             STOP_WORDS=False):
    """Run one PCA + kNN experiment and return a list of accuracies.

    The data is split with ``get_instances``, vectorized with ``vectorizar``,
    projected with a 1500-component PCA and reduced with ``tomar_porcentaje``.
    A ``KNNClassifier`` then produces a prediction matrix through ``testearK``
    (or ``testearK_weighted`` when ``NORMA_PESADA`` is set), and every column
    of that matrix is scored against the test labels.

    Returns:
        A list with one accuracy per column of the prediction matrix.
    """
    print("--------------------------------------------------------")
    print("Test empezado con:")
    settings = (
        ("Train size:", TRAIN_SIZE),
        ("Test size:", TEST_SIZE),
        ("Negaciones:", NEGACIONES),
        ("Binario:", BINARIO),
        ("Norma pesada:", NORMA_PESADA),
        ("Stop words:", STOP_WORDS),
        ("IDF:", IDF),
    )
    for caption, value in settings:
        print(caption, value)

    text_train, label_train, text_test, label_test = get_instances(
        df, TRAIN_SIZE, TEST_SIZE)
    X_train, y_train, X_test, y_test = vectorizar(
        text_train, label_train, text_test, label_test,
        BINARIO, IDF, NEGACIONES, STOP_WORDS)

    X_train = X_train.todense()
    var_total = np.std(X_train, axis=1).sum()

    pca = PCA(1500)
    pca.fit(X_train)
    X_train = pca.transform(X_train, 1500)
    X_train = tomar_porcentaje(X_train, 0.03, var_total)  # TO BE MODIFIED
    # Project the test set onto as many components as the training set kept.
    X_test = pca.transform(X_test, X_train.shape[1])
    print("ALPHA =", X_train.shape[1])

    clf = KNNClassifier(1)
    clf.fit(X_train, y_train)

    if NORMA_PESADA:
        y_centered = y_train - y_train.mean()
        y_std = np.std(y_train)
        n_features = X_train.shape[1]
        covarianzas = np.zeros(n_features)
        correlaciones = np.zeros(n_features)
        for j in range(n_features):
            column = X_train[:, j]
            covarianzas[j] = ((column - column.mean()) * y_centered).sum()
            correlaciones[j] = covarianzas[j] / (np.std(column) * y_std)
        # NOTE(review): correlaciones is computed but covarianzas is what is
        # passed as weights — confirm which one was intended.
        mat = clf.testearK_weighted(X_test, covarianzas)
    else:
        mat = clf.testearK(X_test)

    # One accuracy per column of the prediction matrix.
    return [accuracy_score(y_test, mat[:, col]) for col in range(len(mat[0]))]
def main(args):
    """Plot kNN accuracy against the value passed as third argument to ``get_instances``.

    The plot's x axis is labelled ``min_df``. For each value in
    ``np.arange(args._from, args.to, args.step)`` the first 6000 rows of
    ``data/imdb_small.csv`` are split/vectorized, projected with a
    50-component PCA and classified with a 500-neighbour kNN.

    Args:
        args: namespace providing ``_from``, ``to`` and ``step``.
    """
    # Fixed hyper-parameters of the experiment; defined before the loop so
    # the plot label below can use them even when the sweep range is empty.
    k = 500
    alpha = 50

    # Train and test datasets.
    df = pd.read_csv("data/imdb_small.csv")

    x_poda_frec = np.arange(args._from, args.to, args.step)
    y_poda_frec = []

    df = df[:6000]

    for i in x_poda_frec:
        X_train, y_train, X_test, y_test = get_instances(df, 0.9, i)

        # PCA
        pca = PCA(alpha)
        pca.fit(X_train.toarray())
        X_train_aux = pca.transform(X_train)
        X_test_aux = pca.transform(X_test)

        # kNN on the PCA projections; the original fit/predicted on the raw
        # matrices, so the PCA step had no effect on the result.
        clf = KNNClassifier(k)
        clf.fit(X_train_aux, y_train)
        y_pred = clf.predict(X_test_aux)

        y_poda_frec.append(accuracy_score(y_test, y_pred))

    fig, ax1 = plt.subplots()
    plt.plot(x_poda_frec, y_poda_frec,
             label='k = {}, alpha = {}'.format(k, alpha))
    plt.xlabel('min_df')
    plt.ylabel('accuaracy')
    plt.legend()
    plt.savefig('results/min_df_accuaracy-{}'.format(
        time.strftime("%Y%m%d-%H%M%S")))
    plt.show()
# Command-line classifier: trains a kNN on data/imdb_small.csv and writes the
# predicted label for every row of the given test file to the output CSV.
if len(sys.argv) != 3:
    print("Uso: python classify archivo_de_test archivo_salida")
    # Use sys.exit with a non-zero status: the bare exit() helper is injected
    # by the site module (not guaranteed to exist) and reported success.
    sys.exit(1)

test_path = sys.argv[1]
out_path = sys.argv[2]

df = pd.read_csv("data/imdb_small.csv")
df_test = pd.read_csv(test_path)

print("Vectorizando datos...")
X_train, y_train, X_test, ids_test = get_instances(df, df_test)

# Train the kNN classifier.
clf = KNNClassifier(1120)
clf.fit(X_train, y_train)

# Predict the test labels.
print("Prediciendo etiquetas...")
y_pred = clf.predict(X_test).reshape(-1)

# Predictions equal to 1 are written as 'pos', anything else as 'neg'.
labels = ['pos' if val == 1 else 'neg' for val in y_pred]

df_out = pd.DataFrame({"id": ids_test, "label": labels})
df_out.to_csv(out_path, index=False)

print("Salida guardada en {}".format(out_path))
def run_test(df,
             TRAIN_SIZE=6225,
             TEST_SIZE=500,
             ALPHA=None,
             K=None,
             BINARIO=False,
             NEGACIONES=False,
             NORMA_PESADA=False,
             IDF=False,
             STOP_WORDS=False):
    """Run one kNN experiment (optionally with PCA) and return its accuracy.

    The data is split with ``get_instances`` and vectorized with
    ``vectorizar``. When ``ALPHA`` is given, both matrices are projected onto
    ``ALPHA`` principal components. A ``KNNClassifier(K)`` is then fit; with
    ``NORMA_PESADA`` the prediction uses ``predict_weighted`` with the
    absolute feature/label correlations as weights, otherwise ``predict``.

    Returns:
        The value of ``accuracy_score(y_test, y_pred)``.
    """
    print("--------------------------------------------------------")
    print("Test empezado con:")
    print("Train size:", TRAIN_SIZE)
    print("Test size:", TEST_SIZE)
    print("Alpha:", ALPHA)
    print("K:", K)
    print("Negaciones:", NEGACIONES)
    print("Binario:", BINARIO)
    print("Norma pesada:", NORMA_PESADA)
    print("Stop words:", STOP_WORDS)
    print("IDF:", IDF)

    text_train, label_train, text_test, label_test = get_instances(
        df, TRAIN_SIZE, TEST_SIZE)

    print("Vectorizando...")
    X_train, y_train, X_test, y_test = vectorizar(
        text_train, label_train, text_test, label_test,
        BINARIO, IDF, NEGACIONES, STOP_WORDS)

    # A single flag decides both whether PCA runs and how columns are read
    # below (the original mixed `!= None` with a truthiness test).
    use_pca = ALPHA is not None
    if use_pca:
        print("Obteniendo componentes principales...")
        pca = PCA(ALPHA)
        pca.fit(X_train.todense())
        X_train = pca.transform(X_train, ALPHA)
        X_test = pca.transform(X_test, ALPHA)

    clf = KNNClassifier(K)
    clf.fit(X_train, y_train)

    print("Prediciendo...")
    if not NORMA_PESADA:
        y_pred = clf.predict(X_test)
    else:
        y_train_norm = y_train - y_train.mean()
        ystd = np.std(y_train)

        n_features = X_train.shape[1]
        covarianzas = np.zeros(n_features)
        correlaciones = np.zeros(n_features)

        # Without PCA the matrix is densified ONCE here; the original
        # converted the whole matrix twice for every single feature.
        columns = X_train if use_pca else X_train.todense()
        for i in range(n_features):
            column = columns[:, i]
            covarianzas[i] = ((column - X_train[:, i].mean()) *
                              y_train_norm).sum()
            correlaciones[i] = covarianzas[i] / (np.std(column) * ystd)

        y_pred = clf.predict_weighted(X_test, np.abs(correlaciones))

    print("Test finalizado")
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy: {}".format(acc))
    return acc
# Fragment (truncated mid-expression at its end): loads the test CSV, splits
# and vectorizes the texts, optionally projects both matrices onto ALPHA PCA
# components, fits KNNClassifier(K) and predicts. When NORMA_PESADA is set it
# starts computing, per feature, the covariance and correlation between the
# training column and the centered labels.
# NOTE(review): PCA is gated by `ALPHA != None` but the column access below is
# gated by the truthiness of ALPHA — confirm both are meant to agree.
df_test = pd.read_csv(test_path) print("Vectorizando datos...") text_train, label_train, text_test, ids_test = get_instances( df_train, df_test, TRAIN_SIZE) X_train, y_train, X_test = vectorizar(text_train, label_train, text_test, BINARIO, IDF, NEGACIONES, STOP_WORDS) if ALPHA != None: print("Obteniendo componentes principales...") pca = PCA(ALPHA) pca.fit(X_train.todense()) X_train = pca.transform(X_train, ALPHA) X_test = pca.transform(X_test, ALPHA) clf = KNNClassifier(K) clf.fit(X_train, y_train) print("Prediciendo...") print(X_test.shape) if not NORMA_PESADA: y_pred = clf.predict(X_test) else: y_train_norm = y_train - y_train.mean() ystd = np.std(y_train) covarianzas = np.zeros(X_train.shape[1]) correlaciones = np.zeros(X_train.shape[1]) for i in range(X_train.shape[1]): if ALPHA: covarianzas[i] = ((X_train[:, i] - X_train[:, i].mean()) * y_train_norm).sum() correlaciones[i] = covarianzas[i] / (np.std(X_train[:, i]) *
# Sweep the number of PCA components and record the kNN accuracy for each.
alpha_range = np.arange(50, 150, 30)

y_alpha = []
# NOTE(review): y_time is never filled and t0 is never read in the visible
# code — presumably the elapsed time per alpha was meant to be appended here;
# confirm against the code that uses the second axis.
y_time = []

for alpha in alpha_range:
    t0 = time.time()

    # PCA
    pca = PCA(alpha)
    pca.fit(X_train.toarray())
    X_train_aux = pca.transform(X_train)
    X_test_aux = pca.transform(X_test)

    # kNN on the PCA projections; the original fit/predicted on the raw
    # matrices, which made every alpha produce the same accuracy.
    clf = KNNClassifier(k)
    clf.fit(X_train_aux, y_train)
    y_pred = clf.predict(X_test_aux)

    acc = accuracy_score(y_test, y_pred)
    y_alpha.append(acc)

fig, ax1 = plt.subplots()

color = 'tab:red'
ax1.set_xlabel('Alpha')
ax1.set_ylabel('Accuaracy', color=color)
ax1.plot(alpha_range, y_alpha, color=color)
ax1.tick_params(axis='y', labelcolor=color)
plt.xticks(rotation=90)

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
# Fragment (begins mid-call and is cut off inside the inner loop): fits a
# 1601-component PCA once, then loops while ALPHA < 1601, projecting the data
# onto ALPHA components, fitting KNNClassifier(1) and obtaining a prediction
# matrix via testearK, or — with NORMA_PESADA — starting the per-feature
# covariance computation against the centered labels.
# NOTE(review): X_train and X_test are overwritten with their own projection
# inside the loop, so from the second iteration on pca.transform receives
# already-projected data — confirm this is intended.
STOP_WORDS) print("Obteniendo componentes principales...") pca = PCA(1601) pca.fit(X_train.toarray()) ALPHA = 800 saltoALPHA = 100 mAcc = [] while ALPHA < 1601: print("ALPHA = ", ALPHA) X_train = pca.transform(X_train, ALPHA) X_test = pca.transform(X_test, ALPHA) clf = KNNClassifier(1) clf.fit(X_train, y_train) mat = [] if not NORMA_PESADA: mat = clf.testearK(X_test) else: y_train_norm = y_train - y_train.mean() ystd = np.std(y_train) print(X_train.shape) covarianzas = np.zeros(ALPHA) correlaciones = np.zeros(ALPHA) for i in range(ALPHA): covarianzas[i] = ((X_train[:, i] - X_train[:, i].mean()) * y_train_norm).sum()
# Vectorize the texts as bag-of-words counts; labels become booleans
# (True where the label equals 'pos').
logging.info('Vectorizando los datos')
vectorizer = CountVectorizer(max_df=0.90, min_df=0.01, max_features=5000)
vectorizer.fit(text_train)

X_train = vectorizer.transform(text_train)
y_train = (label_train == 'pos').values
X_test = vectorizer.transform(text_test)
y_test = (label_test == 'pos').values
# The matrices are deliberately left as returned by the vectorizer
# (the conversion to dense is disabled).

# ------------------------------------------------------------------------------
# Training
# ------------------------------------------------------------------------------
logging.info(f'Entranando el clasificador. (K={K})')

time_start = process_time()
clf = KNNClassifier(K)
clf.fit(X_train, y_train)
time_finish = process_time()

logging.info(f'Clasificador entrenado en {time_finish - time_start:.4f}s')

# ------------------------------------------------------------------------------
# Measuring prediction
# ------------------------------------------------------------------------------
# N_test == 0 means "use every test element".
if N_test == 0:
    N_test = X_test.shape[0]

logging.info(f'Midiendo tiempos para {N_test} elementos de testing.')

time_start = process_time()
clf.predict(X_test[:N_test])
time_finish = process_time()
# Experiment 5: accuracy of the kNN prediction matrix as the number of
# training rows grows from 1000 to TOTAL_TRAIN in steps of 1000.
df = pd.read_csv("../data/imdb_small.csv")
# Encode the label column as integers: 1 where it equals 'pos', else 0.
df['label'] = (df['label'] == 'pos').astype('int')

TOTAL_TRAIN = 6000
text_train, label_train, text_test, label_test = get_instances(
    df, TOTAL_TRAIN, TEST_SIZE)

print("Vectorizando...")
X_train, y_train, X_test, y_test = vectorizar(
    text_train, label_train, text_test, label_test,
    BINARIO, IDF, NEGACIONES, STOP_WORDS)

TRAIN = 1000
saltoTRAIN = 1000
# mAcc[j] holds the accuracies (one per column of the prediction matrix)
# obtained with the j-th training size.
mAcc = []

while TRAIN <= TOTAL_TRAIN:
    print(TRAIN)
    clf = KNNClassifier(1)
    clf.fit(X_train[:TRAIN], y_train[:TRAIN])

    mat = clf.testearK(X_test)
    vAcc = [accuracy_score(y_test, mat[:, i]) for i in range(len(mat[0]))]
    mAcc.append(vAcc)

    TRAIN += saltoTRAIN

# The context manager flushes and closes the file; the original left the
# handle open after dumping.
with open("resultados_exp5.pkl", "wb") as fout:
    pickle.dump(mAcc, fout)