예제 #1
0
def con_Interval():
    """Bootstrap L2 logistic-regression coefficients and save per-feature bands.

    Steps, in order:

    1. Load ``X``, ``y`` and ``vectorizer`` from ``get_X_y()``.
    2. Choose ``C`` with a 5-fold stratified grid search scored on ROC AUC.
    3. Refit a logistic regression with that ``C`` on each of ``n_samples``
       bootstrap index sets and store the coefficient vector of every refit.
    4. Compute ``mean - 1.96 * std`` and ``mean + 1.96 * std`` of each
       coefficient across the refits and rank the features by each bound.
    5. Pickle the results to ``BS_NConPR/coefficient.pkl``, pass the rankings
       to ``texify_most_informative_features`` and ``plot_Features``, and save
       one histogram of bootstrap values per top-ranked feature.

    Takes no arguments and returns ``None``. The ``BS_NConPR`` directory must
    already exist, since neither ``open`` nor ``savefig`` creates it.
    """
    X, y, vectorizer = get_X_y()
    n_samples = 1000
    n_top = 5  # number of features per ranking that get a histogram
    # assumes bootstrap_indexes returns something indexable by resample
    # number, each entry being row indices into X -- TODO confirm
    bs_indexes = bootstrap_indexes(X, n_samples)
    w_lists = np.zeros((n_samples, X.shape[1]))

    # Model selection: pick the regularisation strength once, on the full data.
    lr = LogisticRegression(penalty="l2", fit_intercept=True, class_weight='auto')
    kf = cross_validation.StratifiedKFold(y, n_folds=5, shuffle=True)
    parameters = {"C": [100, 10, 1.0, .1, .01, .001, 0.0001]}
    clf0 = grid_search.GridSearchCV(lr, parameters, scoring='roc_auc', cv=kf)
    clf0.fit(X, y)
    best_C = clf0.best_params_['C']

    # One refit per bootstrap resample; row i of w_lists holds its coefficients.
    for i in range(n_samples):
        sample = bs_indexes[i]
        lr = LogisticRegression(penalty="l2", fit_intercept=True,
                                class_weight='auto', C=best_C)
        lr.fit(X[sample], y[sample])
        w_lists[i] = lr.coef_
        # Formatted explicitly so the output is "iteration N" whether print is
        # a statement or a function (the old form printed a tuple repr under
        # the Python 2 print statement).
        print("iteration %d" % i)

    mean = np.mean(w_lists, axis=0)
    std = np.std(w_lists, axis=0)
    p_lower = mean - 1.96 * std
    p_upper = mean + 1.96 * std

    # Fetch the feature names once instead of rebuilding the list per use.
    feature_names = vectorizer.get_feature_names()
    feature_ids = range(len(mean))
    # Each entry is (bound, feature name, column index into w_lists).
    # Largest lower bound first / smallest upper bound first.
    sort_p_lower = sorted(zip(p_lower.tolist(), feature_names, feature_ids),
                          reverse=True)
    sort_p_upper = sorted(zip(p_upper.tolist(), feature_names, feature_ids))

    save_dict = {
        "w_list": w_lists,
        "sort_p_lower": sort_p_lower,
        "sort_p_upper": sort_p_upper,
        "mean": list(mean),
    }
    # The context manager closes the file even if pickling raises.
    with open("BS_NConPR/coefficient.pkl", "wb") as dict_file:
        cPickle.dump(save_dict, dict_file, cPickle.HIGHEST_PROTOCOL)

    texify_most_informative_features(sort_p_lower, sort_p_upper)

    # Histogram the bootstrap distribution of the top features of each ranking
    # (lower-bound ranking first, then upper-bound ranking). Slicing rather
    # than indexing keeps this working when there are fewer than n_top features.
    for ranking in (sort_p_lower, sort_p_upper):
        for _, name, column in ranking[:n_top]:
            plt.hist(w_lists[:, column], bins=20)
            plt.title(name)
            plt.xlabel("Value")
            plt.ylabel("Frequency")
            plt.savefig("BS_NConPR/" + name + "_L2.png")
            plt.clf()  # start the next histogram on an empty figure

    plot_Features(sort_p_lower, sort_p_upper, X, y, vectorizer)
def con_Interval():
    """Bootstrap SGD logistic-regression coefficients and save per-feature bands.

    Steps, in order:

    1. Load ``X``, ``y`` and ``vectorizer`` from ``get_X_y()``.
    2. Choose ``C`` for an L2 ``LogisticRegression`` with a 5-fold stratified
       grid search scored on ROC AUC, and print the best score.
    3. For each of ``n_samples`` bootstrap index sets, fit an
       ``SGDClassifier(loss="log")`` with ``alpha = 1 / C`` and store its
       coefficient vector.
    4. Compute ``mean - 1.96 * std`` and ``mean + 1.96 * std`` of each
       coefficient across the fits and rank the features by each bound.
    5. Pickle the results to ``reuters/coefficient.pkl`` and pass the rankings
       to ``texify_most_informative_features``.

    Takes no arguments and returns ``None``. The ``reuters`` directory must
    already exist, since ``open`` does not create it.
    """
    X, y, vectorizer = get_X_y()
    n_samples = 1000
    # assumes bootstrap_indexes returns something indexable by resample
    # number, each entry being row indices into X -- TODO confirm
    bs_indexes = bootstrap_indexes(X, n_samples)
    w_lists = np.zeros((n_samples, X.shape[1]))

    # Model selection: pick the regularisation strength once, on the full data.
    lr = LogisticRegression(penalty="l2",
                            fit_intercept=True,
                            class_weight='auto')
    kf = cross_validation.StratifiedKFold(y, n_folds=5, shuffle=True)
    parameters = {"C": [100, 10, 1.0, .1, .01, .001, 0.0001]}
    clf0 = grid_search.GridSearchCV(lr, parameters, scoring='roc_auc', cv=kf)
    clf0.fit(X, y)
    best_C = clf0.best_params_['C']
    print("best AUC score is: " + str(clf0.best_score_))

    # One SGD fit per bootstrap resample; row i of w_lists holds its weights.
    for i in range(n_samples):
        sample = bs_indexes[i]
        train_X = X[sample]
        train_Y = y[sample]
        # Enough passes to visit about 10**6 samples in total. The division is
        # forced to float so the ceiling really rounds up (integer division
        # would floor first), and the result is cast because n_iter is a count.
        n_passes = int(np.ceil(10 ** 6 / float(train_X.shape[0])))
        clf = SGDClassifier(loss="log",
                            alpha=1.0 / best_C,
                            n_iter=n_passes,
                            class_weight="auto")
        clf.fit(train_X, train_Y)
        w_lists[i] = clf.coef_
        # Formatted explicitly: under the Python 2 print statement the old
        # parenthesised form printed a tuple repr instead of the message.
        print("iteration %d" % i)

    mean = np.mean(w_lists, axis=0)
    std = np.std(w_lists, axis=0)
    p_lower = mean - 1.96 * std
    p_upper = mean + 1.96 * std

    # Fetch the feature names once instead of rebuilding the list per use.
    feature_names = vectorizer.get_feature_names()
    feature_ids = range(len(mean))
    # Each entry is (bound, feature name, column index into w_lists).
    # Largest lower bound first / smallest upper bound first.
    sort_p_lower = sorted(zip(p_lower.tolist(), feature_names, feature_ids),
                          reverse=True)
    sort_p_upper = sorted(zip(p_upper.tolist(), feature_names, feature_ids))

    save_dict = {
        "w_list": w_lists,
        "sort_p_lower": sort_p_lower,
        "sort_p_upper": sort_p_upper,
    }
    # The context manager closes the file even if pickling raises.
    with open("reuters/coefficient.pkl", "wb") as dict_file:
        cPickle.dump(save_dict, dict_file, cPickle.HIGHEST_PROTOCOL)

    texify_most_informative_features(sort_p_lower, sort_p_upper)

    # NOTE(review): plot_index is never read and nothing is drawn on the
    # figure in this function as written -- confirm whether the plotting code
    # that was meant to follow is missing.
    plot_index = [1, 2, 4, 8]
    plt.figure(1)
def con_Interval():
    """Estimate bootstrap bands for SGD logistic-regression feature weights.

    The function loads ``X``, ``y`` and ``vectorizer`` from ``get_X_y()``,
    selects ``C`` for an L2 ``LogisticRegression`` by 5-fold stratified grid
    search on ROC AUC (printing the best score), then fits an
    ``SGDClassifier(loss="log", alpha=1 / C)`` on each of ``n_samples``
    bootstrap index sets. From the stored weight vectors it derives
    ``mean - 1.96 * std`` and ``mean + 1.96 * std`` per feature, ranks the
    features by each bound, pickles everything to ``reuters/coefficient.pkl``
    and forwards the rankings to ``texify_most_informative_features``.

    Takes no arguments and returns ``None``. The ``reuters`` directory must
    already exist, since ``open`` does not create it.
    """
    X, y, vectorizer = get_X_y()
    n_samples = 1000
    # assumes bootstrap_indexes returns something indexable by resample
    # number, each entry being row indices into X -- TODO confirm
    bs_indexes = bootstrap_indexes(X, n_samples)
    w_lists = np.zeros((n_samples, X.shape[1]))

    # Hyper-parameter search is done once, on the full data set.
    lr = LogisticRegression(penalty="l2", fit_intercept=True, class_weight="auto")
    kf = cross_validation.StratifiedKFold(y, n_folds=5, shuffle=True)
    parameters = {"C": [100, 10, 1.0, 0.1, 0.01, 0.001, 0.0001]}
    clf0 = grid_search.GridSearchCV(lr, parameters, scoring="roc_auc", cv=kf)
    clf0.fit(X, y)
    best_C = clf0.best_params_["C"]
    print("best AUC score is: " + str(clf0.best_score_))

    for i in range(n_samples):
        rows = bs_indexes[i]
        train_X = X[rows]
        train_Y = y[rows]
        # Number of passes needed to see roughly 10**6 samples. True division
        # makes np.ceil round up (integer division would already have
        # floored), and int() gives n_iter the integer count it expects.
        n_passes = int(np.ceil(10 ** 6 / float(train_X.shape[0])))
        clf = SGDClassifier(
            loss="log", alpha=1.0 / best_C, n_iter=n_passes, class_weight="auto"
        )
        clf.fit(train_X, train_Y)
        w_lists[i] = clf.coef_  # row i = weights from resample i
        # Explicit formatting: the old parenthesised form printed a tuple
        # repr when print is a statement.
        print("iteration %d" % i)

    mean = np.mean(w_lists, axis=0)
    std = np.std(w_lists, axis=0)
    p_lower = mean - 1.96 * std
    p_upper = mean + 1.96 * std

    # Build the name list once; it is needed for both rankings.
    feature_names = vectorizer.get_feature_names()
    feature_ids = range(len(mean))
    # Entries are (bound, feature name, column index into w_lists).
    sort_p_lower = sorted(zip(p_lower.tolist(), feature_names, feature_ids), reverse=True)
    sort_p_upper = sorted(zip(p_upper.tolist(), feature_names, feature_ids))

    save_dict = {
        "w_list": w_lists,
        "sort_p_lower": sort_p_lower,
        "sort_p_upper": sort_p_upper,
    }
    # "with" guarantees the handle is closed even when dump() fails.
    with open("reuters/coefficient.pkl", "wb") as dict_file:
        cPickle.dump(save_dict, dict_file, cPickle.HIGHEST_PROTOCOL)

    texify_most_informative_features(sort_p_lower, sort_p_upper)

    # NOTE(review): plot_index is never read and the figure created below is
    # left empty in this function as written -- confirm whether plotting code
    # is missing here.
    plot_index = [1, 2, 4, 8]
    plt.figure(1)
def con_Interval():
    """Bootstrap L2 logistic-regression coefficients and save per-feature bands.

    Steps, in order:

    1. Load ``X``, ``y`` and ``vectorizer`` from ``get_X_y()``.
    2. Choose ``C`` with a 5-fold stratified grid search scored on ROC AUC,
       then print the chosen ``C`` and the per-fold ROC AUC of a model using it.
    3. Refit a logistic regression with that ``C`` on each of ``n_samples``
       bootstrap index sets and store the coefficient vector of every refit.
    4. Compute ``mean - 1.96 * std`` and ``mean + 1.96 * std`` of each
       coefficient across the refits and rank the features by each bound.
    5. Pickle the results to ``BS_NConPR/coefficient.pkl``, pass the rankings
       to ``texify_most_informative_features`` and ``plot_Features``, and save
       one histogram of bootstrap values per top-ranked feature.

    Takes no arguments and returns ``None``. The ``BS_NConPR`` directory must
    already exist, since neither ``open`` nor ``savefig`` creates it.
    """
    X, y, vectorizer = get_X_y()
    n_samples = 1000
    n_top = 5  # number of features per ranking that get a histogram
    # assumes bootstrap_indexes returns something indexable by resample
    # number, each entry being row indices into X -- TODO confirm
    bs_indexes = bootstrap_indexes(X, n_samples)
    w_lists = np.zeros((n_samples, X.shape[1]))

    # Model selection: pick the regularisation strength once, on the full data.
    lr = LogisticRegression(penalty="l2", fit_intercept=True, class_weight='auto')
    kf = cross_validation.StratifiedKFold(y, n_folds=5, shuffle=True)
    parameters = {"C": [100, 10, 1.0, .1, .01, .001, 0.0001,
                        0.00001, 0.000001, 0.0000001]}
    clf0 = grid_search.GridSearchCV(lr, parameters, scoring='roc_auc', cv=kf)
    clf0.fit(X, y)
    best_C = clf0.best_params_['C']

    # Report how the selected C scores on each fold.
    lr1 = LogisticRegression(penalty="l2", fit_intercept=True,
                             class_weight='auto', C=best_C)
    scores = cross_validation.cross_val_score(lr1, X, y, cv=kf, scoring='roc_auc')
    print("best C is: " + str(best_C))
    print(scores)

    # One refit per bootstrap resample; row i of w_lists holds its coefficients.
    for i in range(n_samples):
        sample = bs_indexes[i]
        lr = LogisticRegression(penalty="l2", fit_intercept=True,
                                class_weight='auto', C=best_C)
        lr.fit(X[sample], y[sample])
        w_lists[i] = lr.coef_
        # Formatted explicitly: under the Python 2 print statement the old
        # parenthesised form printed a tuple repr instead of the message.
        print("iteration %d" % i)

    mean = np.mean(w_lists, axis=0)
    std = np.std(w_lists, axis=0)
    p_lower = mean - 1.96 * std
    p_upper = mean + 1.96 * std

    # Fetch the feature names once instead of rebuilding the list per use.
    feature_names = vectorizer.get_feature_names()
    feature_ids = range(len(mean))
    # Each entry is (bound, feature name, column index into w_lists).
    # Largest lower bound first / smallest upper bound first.
    sort_p_lower = sorted(zip(p_lower.tolist(), feature_names, feature_ids),
                          reverse=True)
    sort_p_upper = sorted(zip(p_upper.tolist(), feature_names, feature_ids))

    save_dict = {
        "w_list": w_lists,
        "sort_p_lower": sort_p_lower,
        "sort_p_upper": sort_p_upper,
        "mean": list(mean),
    }
    # The context manager closes the file even if pickling raises.
    with open("BS_NConPR/coefficient.pkl", "wb") as dict_file:
        cPickle.dump(save_dict, dict_file, cPickle.HIGHEST_PROTOCOL)

    texify_most_informative_features(sort_p_lower, sort_p_upper)

    # Histogram the bootstrap distribution of the top features of each ranking
    # (lower-bound ranking first, then upper-bound ranking). Slicing rather
    # than indexing keeps this working when there are fewer than n_top features.
    for ranking in (sort_p_lower, sort_p_upper):
        for _, name, column in ranking[:n_top]:
            plt.hist(w_lists[:, column], bins=20)
            plt.title(name)
            plt.xlabel("Value")
            plt.ylabel("Frequency")
            plt.savefig("BS_NConPR/" + name + "_L2.png")
            plt.clf()  # start the next histogram on an empty figure

    plot_Features(sort_p_lower, sort_p_upper, X, y, vectorizer)