# get_X_y, bootstrap_indexes, texify_most_informative_features and
# plot_Features are project helpers defined elsewhere in this repository.
import cPickle  # Python 2; this file also relies on the pre-0.18 scikit-learn API

import numpy as np
import matplotlib.pyplot as plt
from sklearn import cross_validation, grid_search
from sklearn.linear_model import LogisticRegression, SGDClassifier


# Bootstrap confidence intervals for L2 logistic-regression coefficients;
# results are written under BS_NConPR/.
def con_Interval():
    X, y, vectorizer = get_X_y()
    #BS_PR.sort_ratio(X,y,vectorizer)
    n_samples = 1000
    bs_indexes = bootstrap_indexes(X, n_samples)
    w_lists = np.zeros((n_samples, X.shape[1]))

    # Tune C with a grid search (5-fold stratified CV, scored by AUC).
    lr = LogisticRegression(penalty="l2", fit_intercept=True, class_weight='auto')
    kf = cross_validation.StratifiedKFold(y, n_folds=5, shuffle=True)
    parameters = {"C": [100, 10, 1.0, .1, .01, .001, 0.0001]}
    clf0 = grid_search.GridSearchCV(lr, parameters, scoring='roc_auc', cv=kf)
    clf0.fit(X, y)
    best_C = clf0.best_params_['C']

    # Refit at best_C on each bootstrap resample, collecting coefficient vectors.
    for i in range(n_samples):
        train_X = X[bs_indexes[i]]
        train_Y = y[bs_indexes[i]]
        lr = LogisticRegression(penalty="l2", fit_intercept=True,
                                class_weight='auto', C=best_C)
        lr.fit(train_X, train_Y)
        w_lists[i] = lr.coef_
        print('iteration', i)

    # Normal-approximation 95% interval for each coefficient.
    mean = np.mean(w_lists, axis=0)
    std = np.std(w_lists, axis=0)
    p_lower = mean - 1.96 * std
    p_upper = mean + 1.96 * std
    sort_p_lower = sorted(zip(p_lower.tolist(), vectorizer.get_feature_names(),
                              range(len(mean))), reverse=True)
    sort_p_upper = sorted(zip(p_upper.tolist(), vectorizer.get_feature_names(),
                              range(len(mean))))

    save_dict = {}
    save_dict["w_list"] = w_lists
    save_dict["sort_p_lower"] = sort_p_lower
    save_dict["sort_p_upper"] = sort_p_upper
    save_dict["mean"] = list(mean)
    dict_file = open("BS_NConPR/coefficient.pkl", "wb")
    cPickle.dump(save_dict, dict_file, cPickle.HIGHEST_PROTOCOL)
    dict_file.close()
    #set break point here
    texify_most_informative_features(sort_p_lower, sort_p_upper)

    feature_names = vectorizer.get_feature_names()

    # Draw top features for positive instances.
    for i in range(5):
        values = w_lists[:, sort_p_lower[i][2]]
        plt.hist(values, bins=20)
        plt.title(feature_names[sort_p_lower[i][2]])
        plt.xlabel("Value")
        plt.ylabel("Frequency")
        plt.savefig("BS_NConPR/" + feature_names[sort_p_lower[i][2]] + "_L2.png")
        plt.clf()

    # Draw top features for negative instances.
    for i in range(5):
        plt.clf()
        values = w_lists[:, sort_p_upper[i][2]]
        plt.hist(values, bins=20)
        plt.title(feature_names[sort_p_upper[i][2]])
        plt.xlabel("Value")
        plt.ylabel("Frequency")
        plt.savefig("BS_NConPR/" + feature_names[sort_p_upper[i][2]] + "_L2.png")
        plt.clf()

    plot_Features(sort_p_lower, sort_p_upper, X, y, vectorizer)
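# NOTE: bootstrap_indexes() is not defined in this file. A minimal sketch that
# is consistent with how it is used above (each entry indexes one resampled
# training set) might look like this -- an assumption, not the original helper:
def bootstrap_indexes(X, n_samples=1000):
    """Draw n_samples with-replacement resamples of the row indices of X."""
    n_rows = X.shape[0]
    return [np.random.randint(0, n_rows, n_rows) for _ in range(n_samples)]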
# Variant: each bootstrap fit uses SGDClassifier with log loss in place of
# LogisticRegression; results are written under reuters/.
def con_Interval():
    X, y, vectorizer = get_X_y()
    n_samples = 1000
    bs_indexes = bootstrap_indexes(X, n_samples)
    w_lists = np.zeros((n_samples, X.shape[1]))

    # Tune C with a grid search (5-fold stratified CV, scored by AUC).
    lr = LogisticRegression(penalty="l2", fit_intercept=True, class_weight='auto')
    kf = cross_validation.StratifiedKFold(y, n_folds=5, shuffle=True)
    parameters = {"C": [100, 10, 1.0, .1, .01, .001, 0.0001]}
    clf0 = grid_search.GridSearchCV(lr, parameters, scoring='roc_auc', cv=kf)
    clf0.fit(X, y)
    best_C = clf0.best_params_['C']
    print "best AUC score is: " + str(clf0.best_score_)

    # Refit on each bootstrap resample; SGD with log loss stands in for the
    # L2 logistic regression above (regularization strength taken as 1/C).
    for i in range(n_samples):
        train_X = X[bs_indexes[i]]
        train_Y = y[bs_indexes[i]]
        #lr = LogisticRegression(penalty="l2", fit_intercept=True, class_weight='auto', C=best_C)
        #lr.fit(train_X, train_Y)
        clf = SGDClassifier(loss="log", alpha=1.0 / best_C,
                            n_iter=int(np.ceil(10.0 ** 6 / train_X.shape[0])),
                            class_weight="auto").fit(train_X, train_Y)
        w_lists[i] = clf.coef_
        print('iteration', i)

    CI_hash_pos = {}  # unused in this version
    CI_hash_neg = {}  # unused in this version

    # Normal-approximation 95% interval for each coefficient.
    mean = np.mean(w_lists, axis=0)
    std = np.std(w_lists, axis=0)
    p_lower = mean - 1.96 * std
    p_upper = mean + 1.96 * std
    sort_p_lower = sorted(zip(p_lower.tolist(), vectorizer.get_feature_names(),
                              range(len(mean))), reverse=True)
    sort_p_upper = sorted(zip(p_upper.tolist(), vectorizer.get_feature_names(),
                              range(len(mean))))

    save_dict = {}
    save_dict["w_list"] = w_lists
    save_dict["sort_p_lower"] = sort_p_lower
    save_dict["sort_p_upper"] = sort_p_upper
    dict_file = open("reuters/coefficient.pkl", "wb")
    cPickle.dump(save_dict, dict_file, cPickle.HIGHEST_PROTOCOL)
    dict_file.close()
    texify_most_informative_features(sort_p_lower, sort_p_upper)

    #draw top features for positive instances
    plot_index = [1, 2, 4, 8]
    plt.figure(1)
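# NOTE: get_X_y() is also defined elsewhere. Judging from its use, it returns a
# (sparse) document-term matrix, binary labels, and the fitted vectorizer. A
# hypothetical sketch -- load_corpus() is a placeholder, not a real helper:
def get_X_y():
    from sklearn.feature_extraction.text import CountVectorizer
    docs, labels = load_corpus()  # placeholder: raw texts and 0/1 labels
    vectorizer = CountVectorizer(min_df=2, binary=True)
    X = vectorizer.fit_transform(docs)
    return X, np.array(labels), vectorizer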
# Variant: wider C grid, plus a cross-validated AUC check at best_C before the
# bootstrap loop; results are written under BS_NConPR/.
def con_Interval():
    X, y, vectorizer = get_X_y()
    #BS_PR.sort_ratio(X,y,vectorizer)
    n_samples = 1000
    bs_indexes = bootstrap_indexes(X, n_samples)
    w_lists = np.zeros((n_samples, X.shape[1]))

    # Tune C with a grid search (5-fold stratified CV, scored by AUC).
    lr = LogisticRegression(penalty="l2", fit_intercept=True, class_weight='auto')
    kf = cross_validation.StratifiedKFold(y, n_folds=5, shuffle=True)
    parameters = {"C": [100, 10, 1.0, .1, .01, .001, 0.0001,
                        0.00001, 0.000001, 0.0000001]}
    clf0 = grid_search.GridSearchCV(lr, parameters, scoring='roc_auc', cv=kf)
    clf0.fit(X, y)
    best_C = clf0.best_params_['C']

    # Sanity check: cross-validated AUC of a fresh model at best_C.
    lr1 = LogisticRegression(penalty="l2", fit_intercept=True,
                             class_weight='auto', C=best_C)
    scores = cross_validation.cross_val_score(lr1, X, y, cv=kf, scoring='roc_auc')
    print "best C is: " + str(best_C)
    print scores

    # Refit at best_C on each bootstrap resample, collecting coefficient vectors.
    for i in range(n_samples):
        train_X = X[bs_indexes[i]]
        train_Y = y[bs_indexes[i]]
        lr = LogisticRegression(penalty="l2", fit_intercept=True,
                                class_weight='auto', C=best_C)
        lr.fit(train_X, train_Y)
        w_lists[i] = lr.coef_
        print('iteration', i)

    # Normal-approximation 95% interval for each coefficient.
    mean = np.mean(w_lists, axis=0)
    std = np.std(w_lists, axis=0)
    p_lower = mean - 1.96 * std
    p_upper = mean + 1.96 * std
    sort_p_lower = sorted(zip(p_lower.tolist(), vectorizer.get_feature_names(),
                              range(len(mean))), reverse=True)
    sort_p_upper = sorted(zip(p_upper.tolist(), vectorizer.get_feature_names(),
                              range(len(mean))))

    save_dict = {}
    save_dict["w_list"] = w_lists
    save_dict["sort_p_lower"] = sort_p_lower
    save_dict["sort_p_upper"] = sort_p_upper
    save_dict["mean"] = list(mean)
    dict_file = open("BS_NConPR/coefficient.pkl", "wb")
    cPickle.dump(save_dict, dict_file, cPickle.HIGHEST_PROTOCOL)
    dict_file.close()
    #set break point here
    texify_most_informative_features(sort_p_lower, sort_p_upper)

    feature_names = vectorizer.get_feature_names()

    # Draw top features for positive instances.
    for i in range(5):
        values = w_lists[:, sort_p_lower[i][2]]
        plt.hist(values, bins=20)
        plt.title(feature_names[sort_p_lower[i][2]])
        plt.xlabel("Value")
        plt.ylabel("Frequency")
        plt.savefig("BS_NConPR/" + feature_names[sort_p_lower[i][2]] + "_L2.png")
        plt.clf()

    # Draw top features for negative instances.
    for i in range(5):
        plt.clf()
        values = w_lists[:, sort_p_upper[i][2]]
        plt.hist(values, bins=20)
        plt.title(feature_names[sort_p_upper[i][2]])
        plt.xlabel("Value")
        plt.ylabel("Frequency")
        plt.savefig("BS_NConPR/" + feature_names[sort_p_upper[i][2]] + "_L2.png")
        plt.clf()

    plot_Features(sort_p_lower, sort_p_upper, X, y, vectorizer)
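# NOTE: texify_most_informative_features() is another external helper. A
# plausible sketch, assuming it prints a LaTeX table of the top-k features
# ranked by the CI bounds computed above (an assumption about its behavior):
def texify_most_informative_features(sort_p_lower, sort_p_upper, k=10):
    print r"\begin{tabular}{r l | r l}"
    print r"lower bound & feature & upper bound & feature \\ \hline"
    for (lo, f_pos, _), (hi, f_neg, _) in zip(sort_p_lower[:k], sort_p_upper[:k]):
        print r"%.3f & %s & %.3f & %s \\" % (lo, f_pos, hi, f_neg)
    print r"\end{tabular}"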