def gridSearchNGram(X, X_train, y_train, learner, ngrams, k=3, return_errors=False, random_state=42, method='kfold'):
    """Grid-search over candidate n-gram settings for a bag-of-words model.

    For each setting in *ngrams*, fits a BagOfWords on *X*, transforms
    *X_train*, and cross-validates *learner* on the result.

    Parameters
    ----------
    X : corpus used to fit the BagOfWords vocabulary.
    X_train : documents transformed and scored.
    y_train : labels aligned with X_train.
    learner : estimator passed to the scoring helper.
    ngrams : iterable of n-gram settings to try.
    k : folds for 'kfold' scoring (ignored for 'five2').
    return_errors : if True, return error rates (1 - score) instead of scores.
    random_state : base seed; incremented once per candidate.
    method : 'kfold' or 'five2'.

    Returns
    -------
    (best_setting, scores_or_errors_array, tried_settings)

    Raises
    ------
    ValueError : if *method* is not 'kfold' or 'five2'.
    """
    g_ngrams, scores = [], []
    rs = random_state
    total = len(ngrams)
    # BUG FIX: int(total / 10) is 0 when total < 10, making `% step` raise
    # ZeroDivisionError. Clamp to at least 1 so the progress check is safe.
    step = max(1, total // 10)
    start = time.time()
    for ng in ngrams:
        rs += 1  # fresh seed per candidate so CV splits differ
        bow = BagOfWords(ng).fit(X)
        XX = bow.transform(X_train)
        if method == 'kfold':
            scores.append(KFold_score(XX, y_train, learner, k=k, random_state=rs))
        elif method == 'five2':
            scores.append(five2_score(XX, y_train, learner, random_state=rs))
        else:
            # BUG FIX: previously an unknown method silently skipped the score,
            # desynchronizing `scores` from `g_ngrams` and corrupting argmax.
            raise ValueError("method must be 'kfold' or 'five2', got {!r}".format(method))
        g_ngrams.append(ng)
        if (rs - random_state) % step == 0:
            now = time.time()
            # BUG FIX: message previously said 'gridSearch2D' (copy-paste from
            # another helper); label now matches this function.
            print(' +gridSearchNGram : {}% t:{:.2f}'.format(int((rs - random_state) / total * 100), now - start))
            start = now
    best = np.argmax(scores)
    bests = g_ngrams[best]
    if return_errors:
        return bests, (1 - np.array(scores)), g_ngrams
    else:
        return bests, np.array(scores), g_ngrams
# Try n-gram orders 2 through 5 with ngramXP, recording for each the best
# (low, high) frequency-cutoff pair and the best score it achieved.
# NOTE(review): assumes `ngs`, `ans`, `scores` are (empty) lists created
# earlier in the file — confirm upstream.
for ng in range(2, 6):
    res = ngramXP(X, y, ng)
    ngs.append(ng)
    ans.append(res[0])
    scores.append(np.max(res[1]))

# Pick the winning order and its cutoff pair, then report all four stats.
max_ = np.argmax(scores)
best_ng = ngs[max_]
best_low, best_high = ans[max_]
for value in (scores[max_], best_ng, best_low, best_high):
    print(value)

# Refit the vocabulary on the full corpus with the winning configuration,
# train a linear SVM on the train slice, and print test accuracy.
X = X_train + X_test
bow = BagOfWords(best_ng).fit(X)
XX = bow.transform(X, best_low, best_high)
XX_train = XX[:len(X_train)]
XX_test = XX[len(X_train):]
learner = LinearSVC().fit(XX_train, y_train)  # fit returns the estimator itself
print((learner.predict(XX_test) == y_test).mean())