Example #1
import time

import numpy as np

# BagOfWords, KFold_score and five2_score are the project's own helpers.


def gridSearchNGram(X, X_train, y_train, learner, ngrams, k=3,
                    return_errors=False, random_state=42, method='kfold'):
    g_ngrams, scores = [], []

    rs = random_state

    total = len(ngrams)
    step = max(1, total // 10)  # report progress roughly every 10%

    start = time.time()

    for ng in ngrams:
        rs += 1
        # fit a bag-of-words model for the current n-gram size on the full corpus
        bow = BagOfWords(ng).fit(X)
        XX = bow.transform(X_train)
        # cross-validated score of the learner on this representation
        if method == 'kfold':
            scores.append(KFold_score(XX, y_train, learner, k=k, random_state=rs))
        elif method == 'five2':
            scores.append(five2_score(XX, y_train, learner, random_state=rs))
        g_ngrams.append(ng)
        if (rs - random_state) % step == 0:
            now = time.time()
            print(' +gridSearchNGram : {}% t:{:.2f}'.format(
                int((rs - random_state) / total * 100), now - start))
            start = now

    best = np.argmax(scores)
    bests = g_ngrams[best]

    if return_errors:
        # optionally return error rates (1 - score) instead of raw scores
        return bests, (1 - np.array(scores)), g_ngrams
    else:
        return bests, np.array(scores), g_ngrams
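
A minimal way to call gridSearchNGram, sketched under assumptions: the data loading and split mirror Example #5 below, and the candidate n-gram sizes are purely illustrative.

X, y = load_data(os.path.join(DATA, file))   # project helpers, as in Example #5
y, reverse = target2int(y)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.50,
                                                    random_state=42)

# the candidate n-gram sizes below are an assumption, not taken from the project
best_ng, ng_scores, tried = gridSearchNGram(X, X_train, y_train,
                                            LinearSVC(), range(1, 6),
                                            k=3, method='kfold')
print(best_ng, ng_scores.max())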
Example #2
def get_top(data, target, p, n, ngram=1):
    # keep only the documents whose label equals class p
    p_data = []

    for i, d in enumerate(data):
        if target[i] == p:
            p_data.append(d)

    # n-gram frequencies computed on that class only
    freqs = BagOfWords(ngram).fit(p_data).freqs_

    keys, values = list(zip(*(freqs.items())))

    # indices of the n most frequent n-grams, in decreasing order of frequency
    top = np.argsort(values)[::-1][:n]
    return np.array(keys)[top], np.array(values)[top]
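
A possible call to get_top, mirroring the usage at the end of Example #5; the integer class indices come from target2int, and the choice of the five most frequent unigrams is illustrative.

top_words_0, top_counts_0 = get_top(X, y, 0, 5)   # five most frequent unigrams of class 0
top_words_1, top_counts_1 = get_top(X, y, 1, 5)
print(top_words_0, top_counts_0)
print(top_words_1, top_counts_1)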
Example #3
def ngramXP(X, y, ngram):

    print(' +ngram : begin with ngram {}'.format(ngram))

    bow = BagOfWords(ngram).fit(X)
    learner = LinearSVC()

    mixed_learner = BowAndLearner(bow, learner)

    lowBounds = np.arange(0, 30, 1)
    highBounds = np.arange(20, 70, 1)

    ans = gridSearch2D(X,
                       y,
                       mixed_learner,
                       lowBounds,
                       highBounds,
                       condition,
                       method='five2')
    save2DGridSearch(ans[1], ans[2], ans[3], 'lower bound', 'upper bound',
                     ngram)

    # json.dump writes text, so the results file must be opened in text mode
    with open('results-{}.json'.format(ngram), 'w') as f:
        json.dump('{}'.format(ans), f)

    return ans
Example #4
    def transform_cut(self, X):
        # cut X with the parent's transform_cut using the fitted bounds, then run
        # the result through a BagOfWords fitted on the stored corpus
        newX = super(BowPreFilter, self).transform_cut(X, self.low_,
                                                       self.high_)
        bow = BagOfWords(self.ngram_).fit(self.X_)
        return bow.transform_cut(newX)
Example #5
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print(' >> you need to give a data file')
        exit()
    else:
        file = sys.argv[1]
    """
    simple bow
    """

    scores = []

    X, y = load_data(os.path.join(DATA, file))
    y, reverse = target2int(y)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.50,
                                                        random_state=42)

    low = 18
    high = 53

    X = X_train + X_test

    XX = BagOfWords().fit(X).transform_cut(X, low, high)

    print(get_top(XX, y, 0, 5))
    print(get_top(XX, y, 1, 5))
    print(reverse)
Example #6
    scores = []

    X, y = load_data(os.path.join(DATA, file))
    y, reverse = target2int(y)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.50,
                                                        random_state=42)

    low = 18
    high = 53

    X = X_train + X_test

    XX = BagOfWords().fit(X).transform(X, low, high)

    XX_train = XX[:len(X_train)]
    XX_test = XX[len(X_train):]

    learner = LinearSVC()
    scores.append(KFold_score(XX_train, y_train, learner, get_scores=True))
    """
    bow ngram
    """

    X, y = load_data(os.path.join(DATA, file))
    y, reverse = target2int(y)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.50,
                                                        random_state=42)
Example #7
    # run the n-gram experiment for sizes 2..5 and keep each best result
    for ng in range(2, 6):
        ret = ngramXP(X, y, ng)
        ngs.append(ng)
        ans.append(ret[0])
        scores.append(np.max(ret[1]))

    max_ = np.argmax(scores)

    best_ng = ngs[max_]
    best_low, best_high = ans[max_]

    print(scores[max_])
    print(best_ng)
    print(best_low)
    print(best_high)

    X = X_train + X_test

    bow = BagOfWords(best_ng).fit(X)
    XX = bow.transform(X, best_low, best_high)

    learner = LinearSVC()

    XX_train = XX[:len(X_train)]
    XX_test = XX[len(X_train):]

    # refit on the best configuration and report accuracy on the test split
    learner = learner.fit(XX_train, y_train)

    print((learner.predict(XX_test) == y_test).mean())
Example #8
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print(' >> you need to give a data file')
        exit()
    else:
        file = sys.argv[1]

    X, y = load_data(os.path.join(DATA, file))
    y, reverse = target2int(y)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.50,
                                                        random_state=42)

    bow = BagOfWords().fit(X)
    learner = LinearSVC()

    mixed_learner = BowAndLearner(bow, learner)

    lowBounds = np.arange(0, 30, 1)
    highBounds = np.arange(20, 70, 1)

    ans = gridSearch2D(X_train, y_train, mixed_learner, lowBounds, highBounds,
                       condition)
    # display2DGridSearch(ans[1], ans[2], ans[3], 'lower bound', 'upper bound')

    best_low, best_high = ans[0]

    print(ans[0])
    print(np.max(ans[1]))
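
    # Possible follow-up, mirroring Example #7 (illustrative, not part of the
    # original script): refit with the bounds found above and score the test split.
    X = X_train + X_test
    bow = BagOfWords().fit(X)
    XX = bow.transform(X, best_low, best_high)

    XX_train = XX[:len(X_train)]
    XX_test = XX[len(X_train):]

    learner = LinearSVC().fit(XX_train, y_train)
    print((learner.predict(XX_test) == y_test).mean())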