def gridSearchNGram(X, X_train, y_train, learner, ngrams, k=3, return_errors=False, random_state=42, method='kfold'):
    """Grid-search over n-gram sizes for a bag-of-words + learner pipeline.

    For each n-gram size in `ngrams`, fits a BagOfWords on `X`, transforms
    `X_train`, and scores `learner` via the chosen validation `method`
    ('kfold' uses KFold_score with `k` folds; 'five2' uses five2_score).

    Parameters
    ----------
    X : corpus used to fit the BagOfWords vocabulary
    X_train, y_train : training documents and targets used for scoring
    learner : estimator passed to the scoring helpers
    ngrams : iterable of n-gram sizes to try
    k : number of folds for the 'kfold' method
    return_errors : if True, the score array is returned as errors (1 - score)
    random_state : base seed; incremented once per candidate for variety
    method : 'kfold' or 'five2' (any other value silently records no score)

    Returns
    -------
    (best_ngram, scores_or_errors, evaluated_ngrams)
    """
    g_ngrams, scores = [], []
    rs = random_state
    total = len(ngrams)
    # max(1, ...): with fewer than 10 candidates int(total / 10) is 0 and the
    # progress modulo below would raise ZeroDivisionError.
    step = max(1, int(total / 10))
    start = time.time()
    for ng in ngrams:
        rs += 1
        bow = BagOfWords(ng).fit(X)
        XX = bow.transform(X_train)
        if method == 'kfold':
            scores.append(KFold_score(XX, y_train, learner, k=k, random_state=rs))
        elif method == 'five2':
            scores.append(five2_score(XX, y_train, learner, random_state=rs))
        g_ngrams.append(ng)
        # Periodic progress report (~every 10% of candidates).
        if (rs - random_state) % step == 0:
            now = time.time()
            print(' +gridSearch2D : {}% t:{:.2f}'.format(int((rs - random_state) / total * 100), now - start))
            start = now
    best = np.argmax(scores)
    bests = g_ngrams[best]
    if return_errors:
        return bests, (1 - np.array(scores)), g_ngrams
    else:
        return bests, np.array(scores), g_ngrams
def get_top(data, target, p, n, ngram=1):
    """Return the `n` most frequent n-grams among documents of class `p`.

    Fits a BagOfWords on the subset of `data` whose target equals `p` and
    ranks its `freqs_` table by descending frequency.

    Returns
    -------
    (top_keys, top_values) : two aligned numpy arrays of length <= n
    """
    # Keep only the documents belonging to class `p`.
    p_data = [doc for doc, label in zip(data, target) if label == p]
    freqs = BagOfWords(ngram).fit(p_data).freqs_
    keys, values = list(zip(*(freqs.items())))
    # Indices of the n largest frequencies, most frequent first.
    order = np.argsort(values)[::-1][:n]
    return np.array(keys)[order], np.array(values)[order]
def ngramXP(X, y, ngram):
    """Run a 2D grid search over (low, high) cut bounds for one n-gram size.

    Builds a BagOfWords(ngram) + LinearSVC pipeline, grid-searches the cut
    bounds with the 'five2' validation method, saves the grid plot and dumps
    the raw result to 'results-<ngram>.json'.

    Returns the full `gridSearch2D` result tuple.
    """
    print(' +ngram : begin with ngram {}'.format(ngram))
    bow = BagOfWords(ngram).fit(X)
    learner = LinearSVC()
    mixed_learner = BowAndLearner(bow, learner)
    lowBounds = np.arange(0, 30, 1)
    highBounds = np.arange(20, 70, 1)
    ans = gridSearch2D(X, y, mixed_learner, lowBounds, highBounds, condition, method='five2')
    save2DGridSearch(ans[1], ans[2], ans[3], 'borne basse', 'borne haute', ngram)
    # BUG FIX: json.dump writes str, so the file must be opened in text
    # mode ('w'); 'wb' raises TypeError on Python 3.
    with open('results-{}.json'.format(ngram), 'w') as f:
        json.dump('{}'.format(ans), f)
    return ans
def transform_cut(self, X):
    """Apply the parent cut-transform with the stored bounds, then re-encode.

    First filters `X` through the parent class' transform_cut using the
    fitted `low_`/`high_` bounds, then transforms the result with a fresh
    BagOfWords(ngram_) fitted on the stored corpus `X_`.
    """
    filtered = super(BowPreFilter, self).transform_cut(X, self.low_, self.high_)
    refit_bow = BagOfWords(self.ngram_).fit(self.X_)
    return refit_bow.transform_cut(filtered)
if __name__ == '__main__':
    # Guard clause: a data file name is required on the command line.
    if len(sys.argv) < 2:
        print(' >> you need to give data')
        exit()
    file = sys.argv[1]

    # --- simple bow: inspect the most frequent words per class ---
    scores = []
    X, y = load_data(os.path.join(DATA, file))
    y, reverse = target2int(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=42)
    low = 18
    high = 53
    X = X_train + X_test
    XX = BagOfWords().fit(X).transform_cut(X, low, high)
    print(get_top(XX, y, 0, 5))
    print(get_top(XX, y, 1, 5))
    print(reverse)
# --- bow with cut bounds: k-fold evaluation on a train/test split ---
scores = []
X, y = load_data(os.path.join(DATA, file))
y, reverse = target2int(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=42)
low = 18
high = 53
X = X_train + X_test
XX = BagOfWords().fit(X).transform(X, low, high)
XX_train = XX[:len(X_train)]
XX_test = XX[len(X_train):]
learner = LinearSVC()
scores.append(KFold_score(XX_train, y_train, learner, get_scores=True))

# --- bow ngram: grid-search the n-gram size, then evaluate the best ---
X, y = load_data(os.path.join(DATA, file))
y, reverse = target2int(y)
# NOTE(review): the original source was truncated in the middle of this call;
# the arguments below are reconstructed from the identical splits used
# elsewhere in this file — confirm against the original.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=42)
# NOTE(review): the initialisation of ngs/ans (and the reset of scores) fell
# in the truncated span; reconstructed — TODO confirm.
ngs, ans, scores = [], [], []
for ng in range(2, 6):
    ret = ngramXP(X, y, ng)
    ngs.append(ng)
    ans.append(ret[0])
    scores.append(np.max(ret[1]))
max_ = np.argmax(scores)
best_ng = ngs[max_]
best_low, best_high = ans[max_]
print(scores[max_])
print(best_ng)
print(best_low)
print(best_high)
# Refit on the full corpus with the best n-gram size and bounds, then
# report test accuracy.
X = X_train + X_test
bow = BagOfWords(best_ng).fit(X)
XX = bow.transform(X, best_low, best_high)
learner = LinearSVC()
XX_train = XX[:len(X_train)]
XX_test = XX[len(X_train):]
learner = learner.fit(XX_train, y_train)
print((learner.predict(XX_test) == y_test).mean())
if __name__ == '__main__':
    # Guard clause: a data file name is required on the command line.
    if len(sys.argv) < 2:
        print(' >> you need to give data')
        exit()
    file = sys.argv[1]

    # Grid-search the (low, high) cut bounds for a unigram bag-of-words
    # + LinearSVC pipeline and report the best configuration.
    X, y = load_data(os.path.join(DATA, file))
    y, reverse = target2int(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=42)
    bow = BagOfWords().fit(X)
    learner = LinearSVC()
    mixed_learner = BowAndLearner(bow, learner)
    lowBounds = np.arange(0, 30, 1)
    highBounds = np.arange(20, 70, 1)
    ans = gridSearch2D(X_train, y_train, mixed_learner, lowBounds, highBounds, condition)
    # display2DGridSearch(ans[1], ans[2], ans[3], 'borne basse', 'borne haute')
    best_low, best_high = ans[0]
    print(ans[0])
    print(np.max(ans[1]))