def find_best_weights(train, test, clf, thresh=0.5):
    """Grid-search the ensemble mixing weights that maximise F1 on ``test``.

    ``run_cm``, ``run_tfidf``, ``run_nblcr`` and ``run_we`` are each run once.
    Every sample is then scored as::

        a1 * cm + a2 * tfidf + a3 * nblcr + (1 - a1 - a2 - a3) * we

    for every ``(a1, a2, a3)`` taken from a 21-point grid on [0, 1] with
    ``a1 + a2 + a3 <= 1``; a sample is labelled 1 when its score is at least
    ``thresh`` and 0 otherwise.

    Args:
        train: Training data, passed through unchanged to the four runners.
        test: Test data; must support ``test['label']`` (the ground truth
            handed to ``f1_score``).
        clf: Classifier forwarded to the TF-IDF, NBLCR and embedding runners.
        thresh: Decision threshold applied to the blended score.

    Returns:
        ``(best_weights, max_f1)`` where ``best_weights`` is the list
        ``[a1, a2, a3]`` with the highest F1 (first one found wins ties) and
        ``max_f1`` is that score. If no combination scores above 0 the
        result is ``([0, 0, 0], 0)``.
    """
    y_pred_cm = run_cm(train, test, '../data/ADR-lexicon.txt')
    _, y_prob_tfidf = run_tfidf(train, test, grams='123', n_dim=40000, clf=clf)
    _, y_prob_nblcr = run_nblcr(train, test, '../data/nblcr', grams='123', clf=clf)
    _, y_prob_we = run_we(train, test, '../data/w2v_150.txt', 150, clf=clf)

    # Pull out the per-sample vectors once instead of indexing element by
    # element inside the grid search. Only the first len(y_pred_cm) rows are
    # used, exactly as the original index loop did.
    # NOTE(review): column 1 is presumably the positive-class probability
    # (predict_proba layout) - confirm against the run_* helpers.
    n_samples = len(y_pred_cm)
    cm = np.asarray(y_pred_cm, dtype=np.float64)
    tfidf = np.asarray(y_prob_tfidf)[:n_samples, 1]
    nblcr = np.asarray(y_prob_nblcr)[:n_samples, 1]
    we = np.asarray(y_prob_we)[:n_samples, 1]
    y_true = test['label']

    alphas = np.float32(np.linspace(0, 1, 21))
    max_f1 = 0
    best_weights = [0, 0, 0]
    for alpha1 in alphas:
        score_1 = alpha1 * cm
        for alpha2 in alphas:
            # alphas is ascending and non-negative, so once the running sum
            # exceeds 1 every later value overshoots as well.
            if alpha1 + alpha2 > 1:
                break
            score_12 = score_1 + alpha2 * tfidf
            for alpha3 in alphas:
                if alpha1 + alpha2 + alpha3 > 1:
                    break
                # The embedding model receives whatever weight is left over.
                remainder = 1 - alpha1 - alpha2 - alpha3
                scores = score_12 + alpha3 * nblcr + remainder * we
                y_pred = (scores >= thresh).astype(int)
                f1 = f1_score(y_true, y_pred)
                # Strict '>' keeps the first-encountered weights on ties.
                if f1 > max_f1:
                    best_weights = [alpha1, alpha2, alpha3]
                    max_f1 = f1
    return best_weights, max_f1
def find_best_weights(train, test, clf, thresh=0.5):
    """Grid-search the ensemble mixing weights that maximise F1 on ``test``.

    ``run_cm``, ``run_tfidf``, ``run_nblcr`` and ``run_we`` are each run once.
    Every sample is then scored as::

        a1 * cm + a2 * tfidf + a3 * nblcr + (1 - a1 - a2 - a3) * we

    for every ``(a1, a2, a3)`` taken from a 21-point grid on [0, 1] with
    ``a1 + a2 + a3 <= 1``; a sample is labelled 1 when its score is at least
    ``thresh`` and 0 otherwise.

    Args:
        train: Training data, passed through unchanged to the four runners.
        test: Test data; must support ``test['label']`` (the ground truth
            handed to ``f1_score``).
        clf: Classifier forwarded to the TF-IDF, NBLCR and embedding runners.
        thresh: Decision threshold applied to the blended score.

    Returns:
        ``(best_weights, max_f1)`` where ``best_weights`` is the list
        ``[a1, a2, a3]`` with the highest F1 (first one found wins ties) and
        ``max_f1`` is that score. If no combination scores above 0 the
        result is ``([0, 0, 0], 0)``.
    """
    y_pred_cm = run_cm(train, test, '../data/ADR-lexicon.txt')
    _, y_prob_tfidf = run_tfidf(train, test, grams='123', n_dim=40000, clf=clf)
    _, y_prob_nblcr = run_nblcr(train, test, '../data/nblcr', grams='123', clf=clf)
    _, y_prob_we = run_we(train, test, '../data/w2v_150.txt', 150, clf=clf)

    # Pull out the per-sample vectors once instead of indexing element by
    # element inside the grid search. Only the first len(y_pred_cm) rows are
    # used, exactly as the original index loop did.
    # NOTE(review): column 1 is presumably the positive-class probability
    # (predict_proba layout) - confirm against the run_* helpers.
    n_samples = len(y_pred_cm)
    cm = np.asarray(y_pred_cm, dtype=np.float64)
    tfidf = np.asarray(y_prob_tfidf)[:n_samples, 1]
    nblcr = np.asarray(y_prob_nblcr)[:n_samples, 1]
    we = np.asarray(y_prob_we)[:n_samples, 1]
    y_true = test['label']

    alphas = np.float32(np.linspace(0, 1, 21))
    max_f1 = 0
    best_weights = [0, 0, 0]
    for alpha1 in alphas:
        score_1 = alpha1 * cm
        for alpha2 in alphas:
            # alphas is ascending and non-negative, so once the running sum
            # exceeds 1 every later value overshoots as well.
            if alpha1 + alpha2 > 1:
                break
            score_12 = score_1 + alpha2 * tfidf
            for alpha3 in alphas:
                if alpha1 + alpha2 + alpha3 > 1:
                    break
                # The embedding model receives whatever weight is left over.
                remainder = 1 - alpha1 - alpha2 - alpha3
                scores = score_12 + alpha3 * nblcr + remainder * we
                y_pred = (scores >= thresh).astype(int)
                f1 = f1_score(y_true, y_pred)
                # Strict '>' keeps the first-encountered weights on ties.
                if f1 > max_f1:
                    best_weights = [alpha1, alpha2, alpha3]
                    max_f1 = f1
    return best_weights, max_f1
def run_ensemble(train, test, weights, clf, thresh=0.5):
    """Predict labels for ``test`` with a fixed-weight blend of four models.

    ``run_cm``, ``run_tfidf``, ``run_nblcr`` and ``run_we`` are each run once
    and every sample is scored as::

        w[0] * cm + w[1] * tfidf + w[2] * nblcr + (1 - w[0] - w[1] - w[2]) * we

    Args:
        train: Training data, passed through unchanged to the four runners.
        test: Test data, passed through unchanged to the four runners.
        weights: Indexable of at least three numbers giving the weights of
            the ``run_cm``, ``run_tfidf`` and ``run_nblcr`` outputs; the
            ``run_we`` output gets the remainder up to 1.
        clf: Classifier forwarded to the TF-IDF, NBLCR and embedding runners.
        thresh: Decision threshold applied to the blended score.

    Returns:
        A list with one int per sample: 1 where the blended score is at
        least ``thresh``, otherwise 0.
    """
    y_pred_cm = run_cm(train, test, '../data/ADR-lexicon.txt')
    _, y_prob_tfidf = run_tfidf(train, test, grams='123', n_dim=40000, clf=clf)
    _, y_prob_nblcr = run_nblcr(train, test, '../data/nblcr', grams='123', clf=clf)
    _, y_prob_we = run_we(train, test, '../data/w2v_150.txt', 150, clf=clf)

    # Work on whole vectors rather than one sample at a time. Only the first
    # len(y_pred_cm) rows are used, matching the original index loop.
    # NOTE(review): column 1 is presumably the positive-class probability
    # (predict_proba layout) - confirm against the run_* helpers.
    n_samples = len(y_pred_cm)
    cm = np.asarray(y_pred_cm, dtype=np.float64)
    tfidf = np.asarray(y_prob_tfidf)[:n_samples, 1]
    nblcr = np.asarray(y_prob_nblcr)[:n_samples, 1]
    we = np.asarray(y_prob_we)[:n_samples, 1]

    # The embedding model receives whatever weight is left over.
    remainder = 1 - weights[0] - weights[1] - weights[2]
    scores = (weights[0] * cm + weights[1] * tfidf + weights[2] * nblcr
              + remainder * we)
    # tolist() hands callers a plain list of Python ints, as before.
    return (scores >= thresh).astype(int).tolist()
def run_ensemble(train, test, weights, clf, thresh=0.5):
    """Predict labels for ``test`` with a fixed-weight blend of four models.

    ``run_cm``, ``run_tfidf``, ``run_nblcr`` and ``run_we`` are each run once
    and every sample is scored as::

        w[0] * cm + w[1] * tfidf + w[2] * nblcr + (1 - w[0] - w[1] - w[2]) * we

    Args:
        train: Training data, passed through unchanged to the four runners.
        test: Test data, passed through unchanged to the four runners.
        weights: Indexable of at least three numbers giving the weights of
            the ``run_cm``, ``run_tfidf`` and ``run_nblcr`` outputs; the
            ``run_we`` output gets the remainder up to 1.
        clf: Classifier forwarded to the TF-IDF, NBLCR and embedding runners.
        thresh: Decision threshold applied to the blended score.

    Returns:
        A list with one int per sample: 1 where the blended score is at
        least ``thresh``, otherwise 0.
    """
    y_pred_cm = run_cm(train, test, '../data/ADR-lexicon.txt')
    _, y_prob_tfidf = run_tfidf(train, test, grams='123', n_dim=40000, clf=clf)
    _, y_prob_nblcr = run_nblcr(train, test, '../data/nblcr', grams='123', clf=clf)
    _, y_prob_we = run_we(train, test, '../data/w2v_150.txt', 150, clf=clf)

    # Work on whole vectors rather than one sample at a time. Only the first
    # len(y_pred_cm) rows are used, matching the original index loop.
    # NOTE(review): column 1 is presumably the positive-class probability
    # (predict_proba layout) - confirm against the run_* helpers.
    n_samples = len(y_pred_cm)
    cm = np.asarray(y_pred_cm, dtype=np.float64)
    tfidf = np.asarray(y_prob_tfidf)[:n_samples, 1]
    nblcr = np.asarray(y_prob_nblcr)[:n_samples, 1]
    we = np.asarray(y_prob_we)[:n_samples, 1]

    # The embedding model receives whatever weight is left over.
    remainder = 1 - weights[0] - weights[1] - weights[2]
    scores = (weights[0] * cm + weights[1] * tfidf + weights[2] * nblcr
              + remainder * we)
    # tolist() hands callers a plain list of Python ints, as before.
    return (scores >= thresh).astype(int).tolist()