def main(argv):
	X = np.load('trainingdata2.npy')
	y = np.load('trainingdatalabel2.npy')
	labels = np.unique(y)
	logreg = LogisticRegression(C=1e5)
	logreg.fit(X, y)
	score1 = logreg.score(X, y)
	newval = createceps(argv[1])  # use the argv parameter instead of re-reading sys.argv
	outp = logreg.predict(newval)
	clf = svm.SVC(kernel='rbf', C=1.0)
	clf.fit(X, y)
	score2 = clf.score(X, y)
	confidence1 = logreg.decision_function(newval)
	confidence2 = clf.decision_function(newval)
	outp2 = clf.predict(newval)
	# map each class label to its message and image, instead of duplicating
	# the same if/elif chain for both classifiers
	emotions = {
		1: ("Speaker is angry", 'angry.png'),
		2: ("Speaker is scared", 'scared.png'),
		3: ("Speaker is happy", 'happy.jpg'),
		4: ("Speaker is neutral", 'neutral.png'),
		5: ("Speaker is sad", 'sad.jpg'),
	}
	for prediction in (outp[0], outp2[0]):
		if prediction in emotions:
			message, image_file = emotions[prediction]
			print(message)
			Image.open(image_file).show()
	print("Accuracy of logistic regression %f" % score1)
	print(confidence1[0])
	print("Accuracy of SVM classifier %f" % score2)
	print(confidence2[0])
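# Hypothetical helper (not part of the original script): the raw
# decision_function margins printed above are signed distances, which are hard
# to compare across classes; a softmax turns one row of margins into
# normalized confidences.
import numpy as np

def softmax_confidence(margins):
    """Map one row of multi-class decision_function output to values summing to 1."""
    e = np.exp(margins - np.max(margins))
    return e / e.sum()

# e.g. print(softmax_confidence(confidence1[0])) inside main()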
def test_thresholded_scorers():
    """Test scorers that take thresholds."""
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = SCORERS['roc_auc'](clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = SCORERS['log_loss'](clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = SCORERS['roc_auc'](clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    assert_raises(ValueError, SCORERS['roc_auc'], clf, X_test, y_test)
Example #3
class LogisticRegressionAdaptive(LogisticRegression):

    def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0,
                 fit_intercept=True, intercept_scaling=1, class_weight=None,
                 random_state=None):

        super(LogisticRegressionAdaptive,self).__init__(
            penalty=penalty, dual=dual, tol=tol, C=C,
            fit_intercept=fit_intercept, intercept_scaling=intercept_scaling,
            class_weight=class_weight, random_state=random_state)

        self.clf = LogisticRegression(
            penalty=penalty, dual=dual, tol=tol, C=10,
            fit_intercept=fit_intercept, intercept_scaling=intercept_scaling,
            class_weight=class_weight, random_state=random_state)
        self.c_average = []

    def fit(self, X, y):

        kcv = StratifiedKFold(y=y, n_folds=5, shuffle=False, random_state=None)

        # Set the parameters by cross-validation
        tuned_parameters = [{'C': [pow(10,x) for x in range(-2,3)]}]   #[0.001, 0.01, 0.1, 1, 10, 100, 1000]

        score = 'accuracy'

        clf = GridSearchCV(self.clf, tuned_parameters, scoring=score, cv=kcv)
        clf.fit(X, y)
        self.clf.C = clf.best_estimator_.C
        self.clf.fit(X, y)
        self.C = clf.best_estimator_.C
        self.c_average.append(self.C)
        # print "best:",
        super(LogisticRegressionAdaptive,self).fit(X,y)
        return self

    def get_c_ave(self):
        # import numpy as np
        return self.c_average

    def predict(self, X):
        return self.clf.predict(X)

    def predict_proba(self,X):
        return self.clf.predict_proba(X)

    def decision_function(self, X):
        return self.clf.decision_function(X)

    def transform(self, X, threshold=None):
        return self.clf.transform(X, threshold=threshold)

    def __repr__(self):
        return "%s - %s" % (self.__class__.__name__,self.clf)
Example #4
	def lr_classify(self):
		print("Logistic Regression")

		clf = LogisticRegression()
		clf.fit(self.descr, self.target)
		mean = clf.score(self.test_descr, self.test_target)
		print("Mean : %3f" % mean)
		print("Coefficients ", clf.coef_)
		print("Intercept ", clf.intercept_)
		print("Confidence Score ", clf.decision_function(self.descr))
		print("Predict Probability ", clf.predict_proba(self.descr))
		print("Transform ", clf.transform(self.descr))  # estimator transform() was removed in later scikit-learn; SelectFromModel replaces it
def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = get_scorer('neg_log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    with pytest.raises(ValueError, match="multiclass format is not supported"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # test error is raised with a single class present in model
    # (predict_proba shape is not suitable for binary auc)
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, np.zeros_like(y_train))
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # for proba scorers
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('neg_log_loss')(clf, X_test, y_test)
def train_custom_one_vs_all(X_train,X_test,Y_train,topk):

    #convert the label matrix to CSC format for efficient column slicing
    Y_train = Y_train.tocsc()
    tag_classifiers = []
    num_training,numclasses = Y_train.shape
    num_test_examples = X_test.shape[0]


    # hold a vector mxk, containing top k prediction classes for each example, maintain m heaps for that
    num_examples = X_test.shape[0]
    num_classes = len(tag_classifiers)
    topk_class_distances = []
    for i in range(num_examples):
        heap = []
        topk_class_distances += [heap]
    

    for j in range(numclasses):
        # train on each class label for all the training examples
        y = numpy.ravel(Y_train.getcol(j).todense())

        clf = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.8, fit_intercept=True, intercept_scaling=1)

        clf.fit(X_train, y)
        print("Trained for class", j)
        # get the decision for all test examples
        decision = clf.decision_function(X_test)
        # for each test example add its decision value to the heap of top k decision values
        for i in range(num_test_examples):
            h = topk_class_distances[i]
            if len(h) < topk: heapq.heappush(h, (decision[i], j))
            else:             heapq.heappushpop(h, (decision[i], j))
        print("Predicted for class", j)

    #clean the decision values and store the class labels
    class_label_indices = []
    for i in range(num_examples):
        topk_labels = [label for dist,label in topk_class_distances[i]]
        class_label_indices += [topk_labels]

    return class_label_indices
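# The heap bookkeeping above in miniature: keep the k best (score, label)
# pairs with a fixed-size min-heap, exactly as done per test example in the
# function (standalone toy data, not from the original):
import heapq

scores = [0.3, -1.2, 2.5, 0.9, 1.7]
heap, k = [], 3
for label, s in enumerate(scores):
    if len(heap) < k:
        heapq.heappush(heap, (s, label))
    else:
        heapq.heappushpop(heap, (s, label))  # evicts the current smallest score
print(sorted(heap, reverse=True))  # top-3 scores with their class labels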
def main():
    fpath_train = "/Users/archana/Desktop/PhD/Code/PrivacyAlert/data/CurrentProcessingFiles/FinalProductionFiles/TestTrainData/MaritalTrainData.txt"
    bunch_train = bunchcreator.LoadFileAsBunch(fpath_train, ["NoMarital", "Marital"])
    fpath_test = "/Users/archana/Desktop/PhD/Code/PrivacyAlert/data/CurrentProcessingFiles/FinalProductionFiles/TestTrainData/MaritalTestData.txt"
    bunch_test = bunchcreator.LoadFileAsBunch(fpath_test, ["NoMarital", "Marital"])

    print("Done with Bunching");
    
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(bunch_train.data)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    X_test_counts = count_vect.transform(bunch_test.data)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)
    print "Done with TFIDF"
    clf = LogisticRegression()
    clf.fit(X_train_tfidf, bunch_train.target)
    
    preds_int = clf.predict(X_test_tfidf)
    preds_float = clf.decision_function(X_test_tfidf)
    y_true = np.array(bunch_test.target)
    filepath = "/Users/archana/Desktop/PhD/Code/PrivacyAlert/data/CurrentProcessingFiles/FinalProductionFiles/OutputFiles/GT_Pred.txt"
    fw = open(filepath, 'w')
    for i in range(len(bunch_test.target)):
        fw.write(str(bunch_test.target[i])+":"+str(preds_int[i])+":"+str(preds_float[i])+"\n")

    fpr, tpr, _ = metrics.roc_curve(y_true, preds_float)
    
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve ' )
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
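# The plot above labels the curve but omits the area under it; the AUC can be
# computed from the fpr/tpr arrays that roc_curve already produced:
roc_auc = metrics.auc(fpr, tpr)
print("AUC: %0.3f" % roc_auc)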
Example #8
                            (y_p_cv_train, y_purturb_cv_train), axis=0)
                        scaler = StandardScaler().fit(X_cv)
                        X_cv_transformed = scaler.transform(X_cv)
                        X_pu_cv_test_transformed = scaler.transform(
                            X_pu_cv_test)

                        clf = LogisticRegression(penalty="l2",
                                                 C=c,
                                                 class_weight={
                                                     -1: 1,
                                                     1: r
                                                 },
                                                 random_state=i)
                        clf.fit(X_cv_transformed, y_cv)

                        scores = clf.decision_function(
                            X_pu_cv_test_transformed)
                        #print("scores.shape:", scores.shape)
                        #next: accumulate the score and count properly, that's why we use ShuffleSplit
                        accScores[test_bstrp_index] += scores
                        timesClassified[test_bstrp_index] += 1
                        #print("Log: finished %d/%d, time elapsed: %.2f" %(i, T, elapsed_time) )
                        nUnclassified = np.sum(timesClassified == 0)
                    avgScores = accScores / timesClassified
                    orderAvgScores = np.argsort(
                        -avgScores)  #sort in descending order
                    topNIndex = orderAvgScores[:topN]
                    truePosIndex = np.array(
                        range(y_p_cv_val.shape[0])
                    )  #they are the first N rows in the concatenated validation set
                    truePosRecall = np.intersect1d(topNIndex,
                                                   truePosIndex,
# In order to do this, we can lower the threshold for predicting class 1.

# This will reduce our false negative rate to 0, but at the expense of our false positive rate.
Y_pp['pred_class_thresh10'] = [1 if x >= 0.10 else 0 for x in Y_pp.class_1_pp.values]
print(Y_pp.iloc[0:10])


from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
# plt.style.use('seaborn-white')
%matplotlib inline



Y_score = logreg.decision_function(X_test)

FPR = dict()
TPR = dict()
ROC_AUC = dict()

# For class 1, find the area under the curve
FPR[1], TPR[1], _ = roc_curve(Y_test, Y_score)
ROC_AUC[1] = auc(FPR[1], TPR[1])

# Plot of a ROC curve for class 1 (has_cancer)
plt.figure(figsize=[11,9])
plt.plot(FPR[1], TPR[1], label='ROC curve (area = %0.2f)' % ROC_AUC[1], linewidth=4)
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
train_data, test_data = products.random_split(.8, seed=1)

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')

train_matrix = vectorizer.fit_transform(train_data['review_clean'])
test_matrix = vectorizer.transform(test_data['review_clean'])
print(test_matrix[0])


model = LogisticRegression()
model.fit(train_matrix, train_data['sentiment'])

sample_test_matrix = vectorizer.transform(['ammazing wow wow'])
print(sample_test_matrix)

model.decision_function(sample_test_matrix)

from sframe import SArray
def my_predictions(model, test_matrix):
    return SArray([+1 if s >= 0 else -1 for s in model.decision_function(test_matrix)])

print (my_predictions(model, sample_test_matrix))
print (SArray(model.predict(sample_test_matrix)))

#import pickle
#pickle.dumps(model)


import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23; the standalone joblib package replaces it
joblib.dump(model, 'yelp_model.pkl') 
joblib.dump(vectorizer.vocabulary_, 'yelp_vocabulary_.pkl')
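# Round-trip sketch (assumes the two files saved above): joblib.load restores
# the model, and a CountVectorizer rebuilt from the saved vocabulary can
# transform new text without refitting.
model_loaded = joblib.load('yelp_model.pkl')
vocab_loaded = joblib.load('yelp_vocabulary_.pkl')
vec_loaded = CountVectorizer(token_pattern=r'\b\w+\b', vocabulary=vocab_loaded)
print(model_loaded.decision_function(vec_loaded.transform(['ammazing wow wow'])))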
Example #11
from sklearn import metrics
from itertools import cycle

if __name__ == '__main__':
    np.random.seed(0)
    pd.set_option('display.width', 300)
    np.set_printoptions(suppress=True, linewidth=200)
    n = 300
    x = np.random.randn(n, 50)
    y = np.array([0] * 100 + [1] * 100 + [2] * 100)
    n_class = 3
    print('Before = \n', y)

    clf = LogisticRegression(penalty='l2', C=1)
    clf.fit(x, y)
    y_score = clf.decision_function(x)
    y = label_binarize(y, classes=np.arange(n_class))
    print('After = \n', y)
    colors = cycle('gbc')
    fpr = dict()
    tpr = dict()
    auc = np.empty(n_class + 2)
    mpl.rcParams['font.sans-serif'] = 'SimHei'
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(7, 6), facecolor='w')
    for i, color in zip(np.arange(n_class), colors):
        fpr[i], tpr[i], thresholds = metrics.roc_curve(y[:, i], y_score[:, i])
        auc[i] = metrics.auc(fpr[i], tpr[i])
        plt.plot(fpr[i],
                 tpr[i],
                 c=color,
count = 0
for co in coefs:
	if co >= 0:
		count += 1

print "Number of non negative coeffs ", count

sample_test_data = test_data[10:13]
print(sample_test_data)

def probability(score):
	return (1 / (1 + numpy.exp(-score)))
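# probability() above is the standard logistic sigmoid; SciPy ships the same
# function as scipy.special.expit, which is numerically stable for large
# negative scores:
from scipy.special import expit
assert abs(probability(2.5) - expit(2.5)) < 1e-12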

sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print(scores)
print(sentiment_model.predict(sample_test_matrix))

test_set_scores = sentiment_model.decision_function(test_matrix)
names = test_data["name"]
name_predictions = dict(zip(names, test_set_scores))

sorted_reviews = sorted(name_predictions.items(), key=operator.itemgetter(1), reverse=True)

most_positive_reviews = sorted_reviews[:20]
print(most_positive_reviews)

most_negative_reviews = sorted_reviews[-1:-22:-1]
print(most_negative_reviews)
Example #13
def runLogis(label, Xdata, ydata, XNoLabel, testcase, debug):
    print("-------------- ")
    print(label)
    print("---------------")
    print("X(shape):  ", Xdata.shape)
    print("y(shape):  ", ydata.shape)
    if debug == 1:
        print("type(X):   ", type(Xdata))
        print("type(y):   ", type(ydata))
    print("-------------------------")
    if debug == 1:
        print("X:")
        print(Xdata)
        print("-------------------------")
        print("y:")
        print(ydata)
        print("-------------------------")
    print("-------------------------")
    lr = LogisticRegression(C=1.0)
    lr.fit(Xdata, ydata)
    print("\n")
    print(lr)  # show the fitted estimator's parameters without fitting a second time
    print("-------------------------")
    print("prediction probabilities of X:")
    print("The returned estimates for all classes are ordered by the label of classes")
    print("number of samples x number of classes (2 if 0-1)")
    print(lr.predict_proba(Xdata))
    print("-------------------------")
    print("Predict confidence scores for samples.")
    print("The confidence score for a sample is the signed distance of that sample to the hyperplane")
    print("Confidence scores per (sample, class) combination. In the binary case, ")
    print("confidence score for self.classes_[1] where >0 means this class would be predicted.")
    print(" ")
    print(lr.decision_function(Xdata))
    print("-------------------------")
    print("regression coefficients shape[n_classes-1, n_features]")
    #print(lr.coef_)
    print("-------------------------")
    print("params: ")
    print(lr.get_params(deep=True))
    print("-------------------------")
    print("fit_transform: ")
    print("Fits transformer to X and y with optional parameters fit_params and returns a transformed version of X")
    print(lr.fit_transform(Xdata, ydata))
    print("-------------------------")
    print("scores, Returns the mean accuracy on the given data and labels: ")
    print(lr.score(Xdata, ydata))
    print("-------------------------")
    if testcase == 1:
        print("XNoLabel:")
        #print("shape(XNoLabel)", shape.XNoLabel )
        print(XNoLabel)
        predY = lr.predict(XNoLabel)

        print("-------------------------")
        print("Predict class labels for samples in X ")
        print("len(lr.predict(XNoLabel))", len(predY) )
        print("type(lr.predict(XNoLabel))", type(predY) )
        print(" ")
        print("predY:")
        print(predY[0:20])
        return predY
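# Self-contained sanity check of the "signed distance" description printed
# above: decision_function equals X @ coef_.T + intercept_ for a fitted model
# (synthetic data, not from the original):
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_chk, y_chk = make_classification(n_samples=60, n_features=4, random_state=0)
lr_chk = LogisticRegression().fit(X_chk, y_chk)
manual = X_chk @ lr_chk.coef_.T + lr_chk.intercept_
assert np.allclose(manual.ravel(), lr_chk.decision_function(X_chk))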
Example #14
def predictiveModeling():
    print ("training a predictive model...")
    try:
        # split the data into a training set and a test set
        train_split = int(len(data) * 4.0 / 5.0)

        X_train = X[:train_split]
        X_test = X[train_split:]
        y_train = y[:train_split]
        y_test = y[train_split:]

        # if you wanted to use a different model, you'd specify that here
        clf = LogisticRegression(penalty="l2")
        clf.fit(X_train, y_train)

        print "score", clf.score(X_test, y_test)

        # first, let's find the model score for every dress in our dataset
        probs = list(zip(clf.decision_function(X), raw_data))

        # each entry is (score, (color_data, grade, filename)); Python 3 lambdas
        # cannot unpack tuple parameters, so index into the tuples instead
        prettiest_liked_things = sorted(probs, key=lambda t: (0 if t[1][1] == "like" else 1, t[0]))
        prettiest_disliked_things = sorted(probs, key=lambda t: (0 if t[1][1] == "dislike" else 1, t[0]))
        ugliest_liked_things = sorted(probs, key=lambda t: (0 if t[1][1] == "like" else 1, -t[0]))
        ugliest_disliked_things = sorted(probs, key=lambda t: (0 if t[1][1] == "dislike" else 1, -t[0]))
        in_between_things = sorted(probs, key=lambda t: abs(t[0]))

        # and let's look at the most and least extreme dresses
        cd = list(zip(X, raw_data))
        least_extreme_things = sorted(cd, key=lambda t: sum(abs(c) for c in t[0]))
        most_extreme_things = sorted(cd, key=lambda t: sum(abs(c) for c in t[0]), reverse=True)

        least_interesting_things = sorted(cd, key=lambda t: max(abs(c) for c in t[0]))
        most_interesting_things = sorted(cd, key=lambda t: min(abs(c) for c in t[0]), reverse=True)

        directory = "results/notableDresses/"
        makeFolder(directory)

        for i in range(min(N_COMPONENTS_TO_SHOW, numComponents)):
            Image.open(prettiest_liked_things[i][1][2]).save(directory + "prettiest_pretty_" + str(i) + ".png")
            Image.open(prettiest_disliked_things[i][1][2]).save(directory + "prettiest_ugly_" + str(i) + ".png")
            Image.open(ugliest_liked_things[i][1][2]).save(directory + "ugliest_pretty_" + str(i) + ".png")
            Image.open(ugliest_disliked_things[i][1][2]).save(directory + "directoryugliest_ugly_" + str(i) + ".png")
            Image.open(in_between_things[i][1][2]).save(directory + "neither_pretty_nor_ugly_" + str(i) + ".png")
            Image.open(least_extreme_things[i][1][2]).save(directory + "least_extreme_" + str(i) + ".png")
            Image.open(most_extreme_things[i][1][2]).save(directory + "most_extreme_" + str(i) + ".png")
            Image.open(least_interesting_things[i][1][2]).save(directory + "least_interesting_" + str(i) + ".png")
            Image.open(most_interesting_things[i][1][2]).save(directory + "most_interesting_" + str(i) + ".png")

        # and now let's look at precision-recall
        probs = list(zip(clf.decision_function(X_test), raw_data[train_split:]))
        num_dislikes = len([c for c in y_test if c == 1])
        num_likes = len([c for c in y_test if c == 0])
        lowest_score = round(min([p[0] for p in probs]), 1) - 0.1
        highest_score = round(max([p[0] for p in probs]), 1) + 0.1
        INTERVAL = 0.1

        # first do the likes
        score = lowest_score
        while score <= highest_score:
            true_positives = len([p for p in probs if p[0] <= score and p[1][1] == "like"])
            false_positives = len([p for p in probs if p[0] <= score and p[1][1] == "dislike"])
            positives = true_positives + false_positives
            precision = np.float64(1.0 * true_positives) / positives
            recall = np.float64(1.0 * true_positives) / num_likes
            print "likes", score, precision, recall
            score += INTERVAL

        # then do the dislikes
        score = highest_score
        while score >= lowest_score:
            true_positives = len([p for p in probs if p[0] >= score and p[1][1] == "dislike"])
            false_positives = len([p for p in probs if p[0] >= score and p[1][1] == "like"])
            positives = true_positives + false_positives
            precision = np.float64(1.0 * true_positives) / positives
            recall = np.float64(1.0 * true_positives) / num_dislikes
            print "dislikes", score, precision, recall
            score -= INTERVAL

        # now do both
        score = lowest_score
        while score <= highest_score:
            likes = len([p for p in probs if p[0] <= score and p[1][1] == "like"])
            dislikes = len([p for p in probs if p[0] <= score and p[1][1] == "dislike"])
            print(score, likes, dislikes)
            score += INTERVAL
    except Exception:
        print("the model could not be trained.")
vectorizer = CountVectorizer(token_pattern=r"\b\w+\b")
train_matrix = vectorizer.fit_transform(train_data["review_clean"])
test_matrix = vectorizer.transform(test_data["review_clean"])
words = vectorizer.get_feature_names()

# Create a logistic regression model
sentiment_model = LogisticRegression()
sentiment_model.fit(train_matrix, train_data["sentiment"])

# Create a SFrame with words and their corresponding coefficient
sentiment_model_coef_table = sframe.SFrame({"word": words, "coefficient": sentiment_model.coef_.flatten()})

# Sanity check using some sample data
sample_test_data = test_data[10:13]
sample_test_matrix = vectorizer.transform(sample_test_data["review_clean"])
sample_test_scores = sentiment_model.decision_function(sample_test_matrix)
sample_test_probabilities = sigmoid(sample_test_scores)

# Apply the logistic regression model on the test matrix
# Compute scores, compute probabilities, compute predicted sentiment
test_scores = sentiment_model.decision_function(test_matrix)
test_probabilities = sigmoid(test_scores)
test_data["probability"] = test_probabilities
test_data["predicted_score"] = test_scores
test_data["predicted_sentiment"] = test_data["predicted_score"].apply(lambda score: +1 if score > 0.0 else -1)

# Sort the test data on the predicted probability
# Get the likely products for the most positive and most negative reviews
test_data.sort("probability", ascending=False)["name"][0:20]
test_data.sort("probability", ascending=True)["name"][0:20]
from itertools import cycle


if __name__ == '__main__':
    np.random.seed(0)
    pd.set_option('display.width', 300)
    np.set_printoptions(suppress=True, linewidth=200)
    n = 300
    x = np.random.randn(n, 50)
    y = np.array([0]*100+[1]*100+[2]*100)
    n_class = 3
    print('Before = \n', y)

    clf = LogisticRegression(penalty='l2', C=1)
    clf.fit(x, y)
    y_score = clf.decision_function(x)
    y = label_binarize(y, classes=np.arange(n_class))
    print('After = \n', y)
    colors = cycle('gbc')
    fpr = dict()
    tpr = dict()
    auc = np.empty(n_class+2)
    mpl.rcParams['font.sans-serif'] = u'SimHei'
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(7, 6), facecolor='w')
    for i, color in zip(np.arange(n_class), colors):
        fpr[i], tpr[i], thresholds = metrics.roc_curve(y[:, i], y_score[:, i])
        auc[i] = metrics.auc(fpr[i], tpr[i])
        plt.plot(fpr[i], tpr[i], c=color, lw=1.5, alpha=0.7, label=u'AUC=%.3f' % auc[i])
    # micro
    fpr['micro'], tpr['micro'], thresholds = metrics.roc_curve(y.ravel(), y_score.ravel())
Example #17
def predictive_modeling(raw_data, y):
    print("logistic regression...")
    directory = "results/notableDresses/"
    make_folder(directory)

    # split the data into a training set and a test set
    train_split = int(len(raw_data) * 4.0 / 5.0)

    x_train = X[:train_split]
    x_test = X[train_split:]
    y_train = y[:train_split]
    y_test = y[train_split:]

    # if you wanted to use a different model, you'd specify that here
    clf = LogisticRegression(penalty='l2')
    clf.fit(x_train, y_train)

    print "score", clf.score(x_test, y_test)

    # first, let's find the model score for every dress in our dataset
    probs = list(zip(clf.decision_function(X), raw_data))

    # entries are (score, (color_data, grade, filename)); index into the tuples,
    # since Python 3 lambdas cannot unpack tuple parameters
    prettiest_liked_things = sorted(probs, key=lambda t: (0 if t[1][1] == LIKE else 1, t[0]))
    prettiest_disliked_things = sorted(probs, key=lambda t: (0 if t[1][1] == DISLIKE else 1, t[0]))
    ugliest_liked_things = sorted(probs, key=lambda t: (0 if t[1][1] == LIKE else 1, -t[0]))
    ugliest_disliked_things = sorted(probs, key=lambda t: (0 if t[1][1] == DISLIKE else 1, -t[0]))
    in_between_things = sorted(probs, key=lambda t: abs(t[0]))

    # and let's look at the most and least extreme dresses
    cd = list(zip(X, raw_data))
    least_extreme_things = sorted(cd, key=lambda t: sum(abs(c) for c in t[0]))
    most_extreme_things = sorted(cd, key=lambda t: sum(abs(c) for c in t[0]), reverse=True)

    least_interesting_things = sorted(cd, key=lambda t: max(abs(c) for c in t[0]))
    most_interesting_things = sorted(cd, key=lambda t: min(abs(c) for c in t[0]), reverse=True)

    for i in range(10):
        open_image_from_url(prettiest_liked_things[i][1][2]).save(directory + "prettiest_pretty_" + str(i) + ".png")
        open_image_from_url(prettiest_disliked_things[i][1][2]).save(directory + "prettiest_ugly_" + str(i) + ".png")
        open_image_from_url(ugliest_liked_things[i][1][2]).save(directory + "ugliest_pretty_" + str(i) + ".png")
        open_image_from_url(ugliest_disliked_things[i][1][2]).save(
            directory + "directoryugliest_ugly_" + str(i) + ".png")
        open_image_from_url(in_between_things[i][1][2]).save(directory + "neither_pretty_nor_ugly_" + str(i) + ".png")
        open_image_from_url(least_extreme_things[i][1][2]).save(directory + "least_extreme_" + str(i) + ".png")
        open_image_from_url(most_extreme_things[i][1][2]).save(directory + "most_extreme_" + str(i) + ".png")
        open_image_from_url(least_interesting_things[i][1][2]).save(directory + "least_interesting_" + str(i) + ".png")
        open_image_from_url(most_interesting_things[i][1][2]).save(directory + "most_interesting_" + str(i) + ".png")

    # and now let's look at precision-recall
    probs = list(zip(clf.decision_function(x_test), raw_data[train_split:]))
    num_dislikes = len([c for c in y_test if c == 1])
    num_likes = len([c for c in y_test if c == 0])
    lowest_score = round(min([p[0] for p in probs]), 1) - 0.1
    highest_score = round(max([p[0] for p in probs]), 1) + 0.1
    INTERVAL = 0.1

    # first do the likes
    score = lowest_score
    while score <= highest_score:
        true_positives = len([p for p in probs if p[0] <= score and p[1][1] == LIKE])
        false_positives = len([p for p in probs if p[0] <= score and p[1][1] == DISLIKE])
        positives = true_positives + false_positives
        if positives > 0:
            precision = 1.0 * true_positives / positives
            recall = 1.0 * true_positives / num_likes
            print "likes", score, precision, recall
        score += INTERVAL

    # then do the dislikes
    score = highest_score
    while score >= lowest_score:
        true_positives = len([p for p in probs if p[0] >= score and p[1][1] == DISLIKE])
        false_positives = len([p for p in probs if p[0] >= score and p[1][1] == LIKE])
        positives = true_positives + false_positives
        if positives > 0:
            precision = 1.0 * true_positives / positives
            recall = 1.0 * true_positives / num_dislikes
            print "dislikes", score, precision, recall
        score -= INTERVAL

    # now do both
    score = lowest_score
    while score <= highest_score:
        likes = len([p for p in probs if p[0] <= score and p[1][1] == LIKE])
        dislikes = len([p for p in probs if p[0] <= score and p[1][1] == DISLIKE])
        print(score, likes, dislikes)
        score += INTERVAL

        

    ### text feature
    clf_t = LinearSVC(C=0.04)
    clf_t.fit(Xtr_t[:,:,thres-1],ytrain)
    ### audio feature
    clf_a = LogisticRegression(C=0.001)
    clf_a.fit(Xtr_a[:,:,thres-1],ytrain)
    ### video feature
    clf_v = SVC(gamma=0.001,C=10)
    clf_v.fit(Xtr_v[:,:,thres-1],ytrain)

    ypr_a = clf_a.predict(Xts_a[:,:,thres-1])
    yscore_a = clf_a.decision_function(Xts_a[:,:,thres-1])
    ypr_v = clf_v.predict(Xts_v[:,:,thres-1])
    yscore_v = clf_v.decision_function(Xts_v[:,:,thres-1])
    ypr_t = clf_t.predict(Xts_t[:,:,thres-1])

    """
    ### feature for fusion
    ### audio RL feature
    clf_a = LogisticRegression(C=0.001)
    ypreds_a, yscores_a, ytests_a = UncertaintyStats(Xtr_a,ytrain,pred_sec_lst,clf_a)
    ### video RL feature
    clf_v = SVC(gamma=0.001,C=10)
    ypreds_v, yscores_v, ytests_v = UncertaintyStats(Xtr_v,ytrain,pred_sec_lst,clf_v)
    Xfuse = np.c_[ypreds_a,yscores_a,ypreds_v,yscores_v]
    yfuse = ytests_v
    clf_fuse = LinearSVC(C=1)
train_matrix = vectorizer.fit_transform(train_data["review_clean"])
test_matrix = vectorizer.transform(test_data["review_clean"])

#train LogisticRegression model
train_y = train_data["sentiment"]
model = LogisticRegression()
model.fit(train_matrix, train_y)
coefficients = model.coef_
#Quiz question: How many weights are >= 0?
num = len([weight for weight in coefficients[0] if weight >= 0])
print("Number of non-negative weights: %d" % num, "\n")

#test prediction for 3 sample dataset
sample_test_data = test_data[10:13]
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = model.decision_function(sample_test_matrix)
#Quiz question: Of the three data points in sample_test_data, which one
#(first, second, or third) has the lowest probability of being classified
#as a positive review?
probs = prob_prediction(scores)
print("The probabilities: ", probs, "\n")

#find the 20 reviews in the entire test_data with the highest probability
#of being classified as a positive review.
scores = model.decision_function(test_matrix)
probs = prob_prediction(scores)
index_probs = list(zip(test_index, probs))  # materialize: zip objects cannot be sorted in place
index_probs.sort(key=lambda x: x[1])
most_positive_reviews_index = [a for (a, b) in index_probs[-20:]]
most_positive_reviews = products.iloc[most_positive_reviews_index, 0]
print("Most positive reviews: ", most_positive_reviews, "\n")
Example #20
from sklearn import svm
X = [[0, 0,0.5], [1, 1,1.5],[2,3,4]]
y = [0,1,1]
clf = svm.SVC()
clf.fit(X,y)



# -*- coding: utf-8 -*-
from sklearn import svm
X = [[0, 0,0.5], [1,1,1.5],[2,3,4],[0,0,0.4]]
y = [0,1,2,0]
clf = svm.SVC()
clf.fit(X,y)
dec = clf.decision_function([[1,1,1.5]])
print(dec)

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA  # sklearn.lda was replaced by sklearn.discriminant_analysis
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2],[4,3]])
y = np.array([1, 1, 1, 2, 2, 3,3])
clf = LDA()
clf.fit(X, y)  # returns the fitted estimator, e.g. LDA(n_components=None, priors=None)
print(clf.predict([[-0.8, -1], [3, 4]]))

# SIFT algorithm
import numpy as np
import cv2
from matplotlib import pyplot as plt
Example #21
# In[34]:


predict_y=clf.predict(test_x)


# In[35]:


# confidence scores for the test samples


# In[36]:


score_y=clf.decision_function(test_x)


# In[37]:


# compute the confusion matrix and display it


# In[38]:


cm=confusion_matrix(test_y,predict_y)


# In[39]:
Example #22
# create a sample dataset for classification practice
x, y = make_classification(n_samples=16,
                           n_features=2,
                           n_informative=2,
                           n_redundant=0,
                           random_state=0)
# n_samples: number of samples, n_features: number of features, n_informative: number of informative features (correlated with the target), n_redundant: number of features formed as linear combinations of other features, random_state: fixes the random seed
# print(x)  #[[ 2.03418291 -0.38437236]  [ 4.06377686  0.17863836] ...
# print(y)  #[0 1 0 1 1 0 0 0 1 0 1 0 1 1 0 1]  actual values

model = LogisticRegression().fit(x, y)
y_hat = model.predict(x)
print('y_hat :', y_hat)  # predicted values

f_value = model.decision_function(x)  # decision (discriminant) function: measures how certain each prediction is
print(f_value)  #[ 0.37829565  1.6336573  -1.42938156  1.21967832  ....
print()

df = pd.DataFrame(np.vstack([f_value, y_hat, y]).T, columns=['f', 'yhat', 'y'])
print(df)  # yhat predicts 0 where f < 0 and 1 where f > 0
print()

# ROC curve
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y, y_hat, labels=[1, 0]))

recall = 7 / (
    7 + 1
)  # sensitivity = recall = true positive rate (TPR) = TP / (TP + FN): the fraction of actual positives correctly predicted -> becomes the y-axis
fallout = 1 / (
#print(X_train,y_train)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
log = LogisticRegression()
log.fit(X_train, y_train)
predict_y_test = log.predict(X_test)
print(predict_y_test)
print(accuracy_score(y_test, predict_y_test))

#confussion matrix
from sklearn.metrics import confusion_matrix
con = confusion_matrix(y_test, predict_y_test)
print(con)

predict_prob_y_test = log.decision_function(X_test)
# keep probabilities for the positive outcome only (or) both can be used, with the same results
#predict_prob_y_test = log.predict_proba(X_test)
#predict_prob_y_test =predict_prob_y_test[:, 1]

#data for the roc curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
false_positive_rate, true_positive_rate, threshold = roc_curve(
    y_test, predict_prob_y_test)
#plot the roc-auc curve
plt.plot(false_positive_rate, true_positive_rate)
plt.plot([0, 1], [0, 1])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.xlim(0.0, 1.0)
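# As the comment above says, decision_function scores and positive-class
# probabilities yield the same ROC/AUC: the sigmoid is monotonic and ROC
# depends only on the ranking of scores (assumes binary y_test):
assert abs(roc_auc_score(y_test, log.decision_function(X_test))
           - roc_auc_score(y_test, log.predict_proba(X_test)[:, 1])) < 1e-12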
Example #24
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')
ax.set_title('CC Testing Data CM Using Logistic Regression\n'
             'Accuracy Score: {0}'.format(lrc_acc_score_test), size=15)
plt.show()

print('LR Metrics Class Wise')
report = classification_report(y_test, lbl_predictions_test)
print(report)

print("Logistic Regression Training Data accuracy {0:.2f}".format(
    lrc_acc_score_train))
print("Logistic Regression Testing Data accuracy {0:.2f}".format(
    lrc_acc_score_test))

y_score_test = clf.decision_function(X_test)
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_score_test)

y_score_train = clf.decision_function(X_train)
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_score_train)

#Plotting ROC Curve
plt.figure()
lw = 2
plt.plot(fpr_train, tpr_train, color='darkorange', lw=lw, label='train')
plt.plot(fpr_test, tpr_test, color='navy', lw=lw, linestyle='--', label='test')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve LR')
Example #25
    html += '<tr>'
    if pct >= 0:    
        html += '<td width="50%"></td><td width="50%"><div style="width:' + width + ';background-color:rgb(' + str(r) + "," + str(g) + "," + str(b) + ')">' + str(pct) + "</td></div>"
    else:
        html += '<td width="50%"><div style="text-align:right;float:right;width:' + width + ';background-color:rgb(' + str(r) + "," + str(g) + "," + str(b) + ')">' + str(pct) + '</td></div><td width="50%"></td>'
    html += '</tr>'
html += "</table></body></html>"

f = open("html.html","w")
f.write(html)
f.close()

# and now some qualitative results

# first, let's find the model score for every shirt in our dataset
probs = list(zip(clf.decision_function(X), data))

# entries are (score, (color_data, gender, filename)); index into the tuples,
# since Python 3 lambdas cannot unpack tuple parameters
girliest_girl_shirt = sorted(probs, key=lambda t: (t[1][1], t[0]))[0]
girliest_boy_shirt = sorted(probs, key=lambda t: (-t[1][1], t[0]))[0]
boyiest_girl_shirt = sorted(probs, key=lambda t: (t[1][1], -t[0]))[0]
boyiest_boy_shirt = sorted(probs, key=lambda t: (-t[1][1], -t[0]))[0]
most_androgynous_shirt = sorted(probs, key=lambda t: abs(t[0]))[0]
blandest = sorted(probs, key=lambda t: sum(t[1][0]))[0]
coloriest = sorted(probs, key=lambda t: -sum(t[1][0]))[0]

# and now let's look at precision-recall
probs = list(zip(clf.decision_function(X_test), data[train_split:]))
num_boys = len([c for c in y_test if c == 1])
num_girls = len([c for c in y_test if c == 0])
lowest_score = round(min([p[0] for p in probs]),1) - 0.1
highest_score = round(max([p[0] for p in probs]),1) + 0.1
# %%
print(accuracy_score(y_test, y_lr_pred))
print(precision_score(y_test, y_lr_pred))
print(recall_score(y_test, y_lr_pred))
print(f1_score(y_test, y_lr_pred))

# %%

# %%
compute_plot_grid_coords(x, 2)

# %%
lr_clf.predict(compute_plot_grid_coords(x, 2))

# %%
lr_clf.decision_function(compute_plot_grid_coords(x, 2))

# %%
lr_clf.decision_function(compute_plot_grid_coords(x, 2)) > -3  # noqa

# %%
lr_clf.predict_proba(compute_plot_grid_coords(x, 2))

# %%
lr_clf.predict_proba(compute_plot_grid_coords(x, 2))[:, 1]

# %%
grid_x, grid_y = compute_plot_grid(x, 0.02)
grid_x.shape, grid_y.shape

# %%
Example #27
'''
Created on May 27, 2012

@author: sijin
'''

import numpy as np
from sklearn.preprocessing import StandardScaler  # the old Scaler class is now StandardScaler
from sklearn.linear_model import LogisticRegression

with open('../../data/quora/input00.txt') as f:
    mn = f.readline().split(' ')
    N, M = int(mn[0]), int(mn[1])
    
    print('M, N = {}, {}'.format(M, N))
    
    X = np.zeros((N, M))
    Y = np.zeros(N, int)  # np.int was removed in NumPy 1.24; use the built-in int
    for row in range(N):
        training_data = f.readline().strip().split(' ')
        Y[row] = 1 if training_data[1] == '+1' else -1
        for col in range(2, 2+M):
            X[row, col-2] = training_data[col].split(':')[1]
    
    
    X = StandardScaler().fit_transform(X)
    clf_l1_LR = LogisticRegression(C=0.01, penalty='l1', tol=0.01)
    clf_l1_LR.fit(X, Y)
    print(clf_l1_LR.decision_function(X))
      
if __name__ == '__main__':
    pass
Example #28
        fillstyle="none",
        c='k',
        mew=2)
ax.plot(precision, recall, label='precision recall curve')
ax.set_xlabel('Precision')
ax.set_ylabel('Recall')
#ax.set_aspect(1)
#ax.axis([0,1,0,1])
'''
ROC - Receiver operating characteristic
'''

from sklearn.metrics import roc_curve

fpr, tpr, threshold = roc_curve(y_test,
                                logreg.decision_function(X_test),
                                drop_intermediate=False)

plt.plot(fpr, tpr, label='ROC Curve')
plt.xlabel('FPR')
plt.ylabel('TPR')

close_zero = np.argmin(np.abs(threshold))
plt.plot(fpr[close_zero],
         tpr[close_zero],
         'o',
         markersize=10,
         label='threshold',
         fillstyle='none',
         c='k',
         mew=2)
#Calculate the number of positive (>= 0, nonnegative) coeffs 
cntnonneg = np.sum(sentiment_model.coef_ >= 0) + np.sum(sentiment_model.intercept_ >= 0)

#Making predictions with logistic regression
#Take the 11th, 12th, and 13th data points in the test data and save them to 
#sample_test_data
sample_test_data = test_data.iloc[10:13]
sample_test_data.iloc[0]['review']
sample_test_data.iloc[1]['review']

#The sentiment_model should predict +1 if the sentiment is positive
#-1 if the sentiment is negative
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
#calculate the score of each data point with decision_function()
scores = sentiment_model.decision_function(sample_test_matrix)  # the raw linear score w^T h(x)
print (scores)

#Prediciting Sentiment
#make class predictions from scores
def predictions(scores):
    """ make class predictions
    """
    preds = []
    for score in scores:
        if score > 0:
            pred = 1
        else:
            pred = -1
        preds.append(pred)
    return preds
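# The loop above can be collapsed into one vectorized expression with the same
# mapping (score > 0 -> +1, otherwise -1):
import numpy as np
assert list(np.where(np.array([1.2, -0.4, 0.0]) > 0, 1, -1)) == predictions([1.2, -0.4, 0.0])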
def regressionModel(df_sig_train, df_bkg_train, df_sig_test, df_bkg_test):

    # Reminder:
    # LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
    #           intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

    #df_sig_train['X1X2'] = df_sig_train['PTS']*df_sig_train['AST']
    #df_sig_train['X1X1'] = df_sig_train['PTS']*df_sig_train['PTS']
    #df_sig_train['X2X2'] = df_sig_train['AST']*df_sig_train['AST']

    #df_bkg_train['X1X2'] = df_bkg_train['PTS']*df_bkg_train['AST']
    #df_bkg_train['X1X1'] = df_bkg_train['PTS']*df_bkg_train['PTS']
    #df_bkg_train['X2X2'] = df_bkg_train['AST']*df_bkg_train['AST']

    # '---------- Prepare Training ----------'

    X_sig = np.array(df_sig_train)
    y_sig = np.array(X_sig.shape[0] * [1])
    X_bkg = np.array(df_bkg_train)
    y_bkg = np.array(X_bkg.shape[0] * [0])

    X = np.concatenate((X_sig, X_bkg))
    y = np.concatenate((y_sig, y_bkg))

    print('X_sig.shape: ', X_sig.shape)
    print('y_sig.shape: ', y_sig.shape)
    print('X_bkg.shape: ', X_bkg.shape)
    print('y_bkg.shape: ', y_bkg.shape)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    # '---------- Prepare Testing ----------'

    X_sig_test = np.array(df_sig_test)
    y_sig_test = np.array(X_sig_test.shape[0] * [1])
    X_bkg_test = np.array(df_bkg_test)
    y_bkg_test = np.array(X_bkg_test.shape[0] * [0])

    X_test = np.concatenate((X_sig_test, X_bkg_test))
    y_test = np.concatenate((y_sig_test, y_bkg_test))

    print('X_sig_test.shape: ', X_sig_test.shape)
    print('y_sig_test.shape: ', y_sig_test.shape)
    print('X_bkg_test.shape: ', X_bkg_test.shape)
    print('y_bkg_test.shape: ', y_bkg_test.shape)
    print('X_test.shape: ', X_test.shape)
    print('y_test.shape: ', y_test.shape)


    #C = 10.0 ** np.arange(-10, 10)
    #for c in C:

    #    print c
    # '---------- Model ----------'

    # first way of doing preprocessing
    #X = preprocessing.scale(X)

    # second way of doing preprocessing
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    model = LogisticRegression(C=1000, penalty='l1', solver='liblinear')  # newer scikit-learn requires a solver that supports the L1 penalty
    model.fit(X, y)



    print('---------- Training/Testing info ----------')

    print('Accuracy (training): ', model.score(X, y))
    print('Null Error Rate (training): ', y.mean())

    X_test = scaler.transform(X_test)
    predicted_test = model.predict(X_test)

    predicted_test_clever = (predicted_test + y_test).tolist()
    error_test = float(predicted_test_clever.count(1)) / float(len(predicted_test_clever))
    print "Error: ", error_test

    print "Accuracy (testing): ", metrics.accuracy_score(y_test, predicted_test)
    print "Recall (testing): ",   metrics.recall_score(y_test, predicted_test)
    print "F1 score (testing): ", metrics.f1_score(y_test, predicted_test)
    print "ROC area under curve (testing): ", metrics.roc_auc_score(y_test, predicted_test) 

    #'PTS','AST','REB','STL','BLK','FG_PCT','FG3_PCT','FT_PCT','MIN','EFF','WL'
    # note the extra brackets: transform expects a 2-D array of shape (1, n_features)
    user_input = scaler.transform(np.array([[10, 1, 2, 0, 2, 0.3, 0.3, 0.3, 10, 5, 1]], dtype=float))
    #user_input = scaler.transform(np.array([[10,1,2,2,2,2,2,2,2,2,1]], dtype=float))
    #user_input = scaler.transform(np.array([[10,1,2]], dtype=float))
    print('Score (user input): ', model.decision_function(user_input))
    result = model.predict_proba(user_input)
    print('Probability of 1 (user input): ', result)



    # 3A. Examine the coefficients
    #print "Coefficients: ", pd.DataFrame(zip(X, np.transpose(model.coef_)))

    # 3B. Calculating Error
    #predicted_train = model.predict(X)
    #print predicted_train
    #predicted_train_clever = (predicted_train + y).tolist()    
    #error = float(predicted_train_clever.count(1)) / float(len(predicted_train_clever))
    #print "Error: ", error_train

    # 4. Cross-validation

    #scores = cross_val_score(LogisticRegression(), X , y, 'accuracy', 4)
    #print "Cross-validation: ", scores
    #print "Cross-validation mean: ", scores.mean()


    # '--------- Visualization -----------'

    Classifier_training_S = model.decision_function(X[y>0.5]).ravel()
    Classifier_training_B = model.decision_function(X[y<0.5]).ravel()
    Classifier_testing_S = model.decision_function(X_test[y_test>0.5]).ravel()
    Classifier_testing_B = model.decision_function(X_test[y_test<0.5]).ravel()

    (h_test_s, h_test_b) =  visualSigBkg("Logistic Regression", Classifier_training_S, Classifier_training_B, Classifier_testing_S, Classifier_testing_B)

    return (model, X, y, result, model.score(X, y), error_test, h_test_s, h_test_b)
recall = recall_score(y_true=test_data['sentiment'].to_numpy(),
                      y_pred=model.predict(test_matrix))
print "Recall on test data: %s" % recall

print(model.classes_)
# column ordering of output matrix from predict_proba() is the same as output from model.classes_
score_after_sigmoid = pd.DataFrame(model.predict_proba(test_matrix))

threshold_values = np.linspace(0.5, 1, num=100)

precision_all = []
recall_all = []

for threshold in threshold_values:
    prediction = apply_threshold(pd.DataFrame(model.predict_proba(test_matrix)[:,1]), threshold)
    
    precision_all.append(precision_score(y_true=test_data['sentiment'].to_numpy(),
        y_pred=prediction.to_numpy()[:,0]))  # to_numpy() replaces the removed as_matrix()

    recall_all.append(recall_score(y_true=test_data['sentiment'].to_numpy(),
        y_pred=prediction.to_numpy()[:,0]))

prediction_98 = apply_threshold(pd.DataFrame(model.predict_proba(test_matrix)[:,1]), 0.98)
print_confusion_matrix(test_data['sentiment'].to_numpy(), prediction_98.to_numpy()[:,0], model)


baby_reviews = test_data[test_data['name'].apply(lambda x: 'baby' in x.lower())]
baby_matrix = vectorizer.transform(baby_reviews['review_clean'])
probabilities = model.decision_function(baby_matrix)  # note: decision_function returns raw scores, not probabilities

Example #32
print(lr.predict(train_bream_smelt[:5]))
print(lr.predict_proba(train_bream_smelt[:5]))
print(lr.classes_)  # ['Bream', 'Smelt']  Bream: negative class (0), Smelt: positive class (1)
'''
Let's inspect the coefficients (weights) learned by the logistic regression
'''
print('coefficients: ', lr.coef_)  # [[-0.4037798  -0.57620209 -0.66280298 -1.01290277 -0.73168947]]
print('intercept: ', lr.intercept_)  # [-2.16155132]
'''
The equation the logistic regression learned:
-0.404 * weight - 0.576 * length - 0.663 * diagonal - 1.013 * height - 0.732 * width - 2.162 ==> z
lr's decision_function() computes this z value for the positive class (smelt)
'''

z = lr.decision_function(train_bream_smelt[:5])
print('z values: ', z)
'''
Passing these z values through the sigmoid function yields probabilities
'''
s = 1 / (1 + np.exp(-z))
print(s)
'''
SciPy provides the sigmoid function as expit()
'''
from scipy.special import expit
print(expit(z))
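# Self-contained check of the z -> expit(z) relationship described above
# (synthetic data; assumes the numpy and LogisticRegression imports already
# in scope for this example, and the fish dataset is not reproduced here):
from sklearn.datasets import make_classification
X_z, y_z = make_classification(n_samples=40, n_features=5, random_state=1)
lr_z = LogisticRegression().fit(X_z, y_z)
print(np.allclose(expit(lr_z.decision_function(X_z)), lr_z.predict_proba(X_z)[:, 1]))  # True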
'''
Multiclass classification with logistic regression (classifying 7 fish species)
LogisticRegression applies L2 regularization, controlled by the C parameter (default: 1)
                    ==> the smaller C is, the stronger the regularization
Example #33
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, f1_score, auc

digits = datasets.load_digits()
X = digits.data
y = digits.target.copy()
y[digits.target == 9] = 1
y[digits.target != 9] = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_predict = log_reg.predict(X_test)
print("old f1_score =", f1_score(y_test, y_predict))
decision_score = log_reg.decision_function(X_test)  # raw scores that feed into the sigmoid
precisions, recalls, thresholds = precision_recall_curve(y_test, decision_score)
for i in range(thresholds.shape[0]):
    if precisions[i] == recalls[i]:
        y_predict = np.array(decision_score > thresholds[i], dtype=int)
        print("thresholds =", thresholds[i])
        print("new f1_score =", f1_score(y_test, y_predict))

plt.figure("Threshold Precision Recall")
plt.plot(thresholds, precisions[:-1], label="precision")  # last thresholds, precisions=1, recalls=0
plt.plot(thresholds, recalls[:-1], label="recall")
plt.legend()

plt.figure("PR")
plt.plot(recalls, precisions)
Example #34
alice_list = []
alice_text = (val_df.iloc[0][6]).split()

count = 0
alice_mapped_num_list = []
for word in alice_text:
    alice_mapped_num = create_bitstring_sha224(word)
    alice_mapped_num_list.append(alice_mapped_num)
    count = count + 1

print("Alice's total features: ", count)

file1_alice = open(DATA61_ROOT + "Input-P0-0", "w")
for feature in alice_mapped_num_list:
    file1_alice.write(feature + " ")
file1_alice.close()

# Predict the label and get the accuracy using 5-fold cross validation
y_predicted = LR_model.predict(X_val)
print("Predicted_label: ", y_predicted)
predicted_dist = LR_model.decision_function(X_val)
print("Predicted_distance: ", predicted_dist)
mean_accuracy = cross_val_score(LR_model,
                                X_train,
                                Y_train,
                                scoring='accuracy',
                                cv=10).mean()
print("mean_cross_val", mean_accuracy)
print("Accuracy: %.2f" % accuracy_score(Y_val, y_predicted))
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    random_state=42)

gbrt = GradientBoostingClassifier(learning_rate=0.01, random_state=0)
gbrt.fit(X_train, y_train)
# decision_function
print(gbrt.decision_function(X_test).shape)
print(gbrt.decision_function(X_test)[:6, :])
# predict_proba
print(gbrt.predict_proba(X_test)[:6, :])

# reproduce the predictions by taking the argmax
print(np.argmax(gbrt.decision_function(X_test), axis=1))
print(np.argmax(gbrt.predict_proba(X_test), axis=1))
print(gbrt.predict(X_test))
#classification with logistic regression
logreg = LogisticRegression()
named_target = iris.target_names[y_train]
logreg.fit(X_train, named_target)
print(logreg.classes_)
print(logreg.predict(X_test)[:10])
argmax_dec_func = np.argmax(logreg.decision_function(X_test), axis=1)
print(argmax_dec_func[:10])
print(logreg.classes_[argmax_dec_func][:10])
print(
    np.all(
        logreg.classes_[argmax_dec_func][:10] == logreg.predict(X_test)[:10]))
clr = LogisticRegression(solver="lbfgs", penalty='none', random_state=42)
clr.fit(X, y)

# Output coefficients

# In[6]:

print("[Intercept] ", X.columns)
print(clr.intercept_, clr.coef_)

# Prediction and scoring

# In[7]:

yp = clr.predict(X)
y_score = clr.decision_function(X)
print(y_score)

# ### Performance Metrics

# In[8]:

tn, fp, fn, tp = confusion_matrix(y, yp).ravel()

# In[9]:

print("Confusion Matrix:")
print("%32s" % "Predicted")
print("%17s" % " ", "%8s" % "UNC", "%8s" % "Duke")
print("%8s" % "Actual", "%8s" % "UNC", "%8i" % tp, "%8i" % fn)
print("%8s" % " ", "%8s" % "Duke", "%8i" % fp, "%8i" % tn)
Example #37
	ax.set_title('calibration curve for '+str(num_hrs)+'hours prediction')
	print('\n')
	print("logistic regression:")
	confusion = confusion_matrix(Y_test, final_predicted_logit)
	TP = confusion[1, 1]
	TN = confusion[0, 0]
	FP = confusion[0, 1]
	FN = confusion[1, 0]
	print("accuracy of logistic regression ",(TP + TN) / float(TP + TN + FP + FN))
	print(confusion_matrix(Y_test, final_predicted_logit))
	print(classification_report(Y_test, final_predicted_logit))
	sensitivity = TP / float(FN + TP)
	print("sensitivity of logistic regression ", sensitivity)
	specificity = TN / float(TN + FP)
	print("specificity of logistic regression ",specificity)
	prob_pos = clf_logi.decision_function(X_test)
	prob_pos = (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
	fraction_of_positives, mean_predicted_value = calibration_curve(Y_test, prob_pos, n_bins=10, normalize=True)
	ax.plot([0, 1], [0, 1], linestyle='--', color='black', label='BASE')
	ax.plot(fraction_of_positives, mean_predicted_value, 'r--', label='LogisticRegression')
	print('\n\n')

	print("ExtraTreesClassifier :")
	confusion = confusion_matrix(Y_test, final_predicted_xtrees)
	TP = confusion[1, 1]
	TN = confusion[0, 0]
	FP = confusion[0, 1]
	FN = confusion[1, 0]
	print("accuracy of extratrees ",(TP + TN) / float(TP + TN + FP + FN))
	print(confusion_matrix(Y_test, final_predicted_xtrees))
	print(classification_report(Y_test, final_predicted_xtrees))
sess = rt.InferenceSession(onx.SerializeToString())
res = sess.run(None, {'float_input': X_test.astype(numpy.float32)})
print("skl", clr.predict_proba(X_test[:1]))
print("onnx", res[1][:2])

###################################
# Raw scores and decision_function
# ++++++++++++++++++++++++++++++++
#

initial_type = [('float_input', FloatTensorType([None, 4]))]
options = {id(clr): {'raw_scores': True}}
onx2 = convert_sklearn(clr,
                       initial_types=initial_type,
                       options=options,
                       target_opset=12)

sess2 = rt.InferenceSession(onx2.SerializeToString())
res2 = sess2.run(None, {'float_input': X_test.astype(numpy.float32)})
print("skl", clr.decision_function(X_test[:1]))
print("onnx", res2[1][:2])

#################################
# **Versions used for this example**

print("numpy:", numpy.__version__)
print("scikit-learn:", sklearn.__version__)
print("onnx: ", onnx.__version__)
print("onnxruntime: ", rt.__version__)
print("skl2onnx: ", skl2onnx.__version__)
Example #39
def stacking(clf, train_x, train_y, test_x, clf_name, class_num=1):
    train = np.zeros((train_x.shape[0], class_num))
    test = np.zeros((test_x.shape[0], class_num))
    test_pre = np.zeros((folds, test_x.shape[0], class_num))
    cv_scores = []
    for i, (train_index, test_index) in enumerate(kf):
        tr_x = train_x[train_index]
        tr_y = train_y[train_index]
        te_x = train_x[test_index]
        te_y = train_y[test_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'multiclass',
                'metric': 'multi_logloss',
                'min_child_weight': 1.5,
                'num_leaves': 2**5,
                'lambda_l2': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.5,
                'colsample_bylevel': 0.5,
                'learning_rate': 0.1,
                'scale_pos_weight': 20,
                'seed': 2018,
                'nthread': 16,
                'num_class': class_num,
                'silent': True,
            }

            num_round = 2000
            early_stopping_rounds = 100

            model = clf.train(params,
                              train_matrix,
                              num_round,
                              valid_sets=test_matrix,
                              early_stopping_rounds=early_stopping_rounds)

            pre = model.predict(te_x,
                                num_iteration=model.best_iteration).reshape(
                                    (te_x.shape[0], class_num))
            pred = model.predict(test_x,
                                 num_iteration=model.best_iteration).reshape(
                                     (test_x.shape[0], class_num))
        if clf_name == "lr":
            model = LogisticRegression(C=4, dual=False)
            model.fit(tr_x, tr_y)
            pre = model.predict_proba(te_x)
            pred = model.predict_proba(test_x)

        if clf_name == "svm":
            model = svm.LinearSVC()
            model.fit(tr_x, tr_y)
            pre = model.decision_function(te_x)
            pred = model.decision_function(test_x)

        train[test_index] = pre

        test_pre[i, :] = pred
        cv_scores.append(
            f1_score(te_y,
                     np.argmax(pre, axis=1),
                     labels=range(0, 19),
                     average='macro'))

        print("%s now score is:" % clf_name, cv_scores)
    test[:] = test_pre.mean(axis=0)
    with open("score_cv.txt", "a") as f:
        f.write("%s now score is:" % clf_name + str(cv_scores) + "\n")
        f.write("%s_score_mean:" % clf_name + str(np.mean(cv_scores)) + "\n")
    return train.reshape(-1, class_num), test.reshape(
        -1, class_num), np.mean(cv_scores)
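
# A minimal usage sketch (assumes `kf` yields (train_idx, test_idx) splits,
# `folds` counts them, and numpy arrays train_x/train_y/test_x exist, as the
# function above expects). For "lgb" the `clf` argument is the lightgbm module
# itself; for "lr" and "svm" it is ignored:
# import lightgbm as lgb
# lgb_tr, lgb_te, lgb_cv = stacking(lgb, train_x, train_y, test_x, "lgb", class_num=19)
# lr_tr, lr_te, lr_cv = stacking(None, train_x, train_y, test_x, "lr", class_num=19)
# meta_features = np.hstack([lgb_tr, lr_tr])  # inputs for a second-level model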
# sklearn's built-in metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

precision = precision_score(y_test, y_log_predict)
recall = recall_score(y_test, y_log_predict)

def f1_score(precision, recall):
    # harmonic mean of precision and recall; returns the exception text
    # instead of raising when precision + recall == 0
    try:
        return 2 * precision * recall / (precision + recall)
    except Exception as e:
        return repr(e)

f1 = f1_score(precision, recall)
print(precision, recall, f1)
dec_score = log_reg.decision_function(x_test)  # decision scores for the test samples
print(np.max(dec_score), np.min(dec_score))
y_predict2 = np.array(dec_score >= 5, dtype='int')
# visualize how precision and recall trade off as the threshold moves
precisions = []
recalls = []
thresholds = np.arange(np.min(dec_score), np.max(dec_score), 0.1)
for threshold in thresholds:
    y_predict = np.array(dec_score >= threshold, dtype='int')
    precisions.append(precision_score(y_test, y_predict))
    recalls.append(recall_score(y_test, y_predict))
plt.plot(thresholds, precisions)
plt.plot(thresholds, recalls)
plt.show()
plt.plot(precisions, recalls)
plt.show()
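
# The manual threshold sweep above can be reproduced with sklearn's built-in
# helper; a short equivalent sketch (the returned precision/recall arrays are
# one element longer than the thresholds, hence the [:-1]):
from sklearn.metrics import precision_recall_curve
precisions2, recalls2, thresholds2 = precision_recall_curve(y_test, dec_score)
plt.plot(thresholds2, precisions2[:-1])
plt.plot(thresholds2, recalls2[:-1])
plt.show()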
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
import plda
import keras
import os

def adaptive_snorm(scores,
                   scores_enr,
                   scores_test,
                   n_cohort_enr=200,
                   n_cohort_test=200):
    scores_enr = -np.sort(-scores_enr, axis=1)[:, :n_cohort_enr]
    scores_test = -np.sort(-scores_test, axis=1)[:, :n_cohort_test]
    mean_enr = np.tile(np.expand_dims(np.mean(scores_enr, axis=1), axis=1),
                       (1, scores.shape[1]))
    mean_test = np.tile(np.expand_dims(np.mean(scores_test, axis=1), axis=0),
                        (scores.shape[0], 1))
    std_enr = np.tile(np.expand_dims(np.std(scores_enr, axis=1), axis=1),
                      (1, scores.shape[1]))
    std_test = np.tile(np.expand_dims(np.std(scores_test, axis=1), axis=0),
                       (scores.shape[0], 1))
    return 0.5 * ((scores - mean_enr) / std_enr +
                  (scores - mean_test) / std_test)
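
# A quick shape sketch with random matrices (hypothetical sizes): `scores`
# holds enrollment-by-test trial scores, while `scores_enr`/`scores_test`
# hold cohort scores for each enrollment model and each test segment.
rng = np.random.RandomState(0)
normalized = adaptive_snorm(rng.randn(10, 8), rng.randn(10, 500), rng.randn(8, 500))
print(normalized.shape)  # -> (10, 8)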

def load_ivector(filename):
    utt = np.loadtxt(filename,
Exemple #42
# creating testing and training set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

# train scikit learn model
clf = LogisticRegression()
clf.fit(X_train, Y_train)
print('score Scikit learn: ', clf.score(X_test, Y_test))

# visualize data, uncomment "show()" to run it
pos = where(Y == 1)
neg = where(Y == 0)
scatter(X[pos, 0], X[pos, 1], marker='o', c='b')
scatter(X[neg, 0], X[neg, 1], marker='x', c='r')
xlabel('Exam 1 score')
ylabel('Exam 2 score')
legend(['Not Admitted', 'Admitted'])
#0.807551618	-0.75983985
#0.531634773	-0.479185022

filename = 'finalized_model.sav'
pickle.dump(clf, open(filename, 'wb'))

y = clf.predict([[-0.869144323, 0.389309751]])
z = clf.decision_function([[-0.869144323, 0.389309751]])
print(y)
print(z)
scatter(-0.869144323, 0.389309751, marker='+', c='g')

show()
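
# Round-trip sketch: reload the pickled model and confirm it reproduces the
# prediction printed above (same `filename` as saved with pickle.dump).
loaded_model = pickle.load(open(filename, 'rb'))
print(loaded_model.predict([[-0.869144323, 0.389309751]]))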
Exemple #43
#logistic regression
model_lr = LogisticRegression(C=1000)
score_list_lr = []
for train_index, test_index in kf.split(features):
    start_time = time.time()
    #train
    model_lr.fit(features[train_index], label[train_index])
    #test
    score_list_lr.append(model_lr.score(features[test_index], label[test_index]))
    pred_lr = model_lr.predict(features[test_index])
    print('Time spent in each fold:')
    print(time.time() - start_time)

#plot ROC
y_score_lr = model_lr.decision_function(features[test_index])
fpr_l,tpr_l,_ = rc(label[test_index], y_score_lr)
fig_21 = plt.figure()
lw = 1
plt.plot(fpr_l, tpr_l, color='black', linestyle='-.',
         lw=lw, label='ROC curve (LogisticR)')
plt.plot(fpr_m, tpr_m, color='aqua', linestyle=':',
         lw=lw, label='ROC curve (Multinomial)')
plt.plot(fpr_g, tpr_g, color='cornflowerblue', lw=lw,
         label='ROC curve (Gaussian)', linestyle=':')
plt.plot(fpr_h, tpr_h, color='darkorange',
         lw=lw, label='ROC curve (Soft SVM)')
plt.plot(fpr_s, tpr_s, color='deeppink', lw=lw,
         label='ROC curve (Hard SVM)')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
y_hat2 = model2.predict(X)


from sklearn.metrics import confusion_matrix
print(confusion_matrix(y, y_hat1))

print(confusion_matrix(y, y_hat2))

# Judged by their confusion matrices alone, the two models perform identically

from sklearn.metrics import classification_report
print(classification_report(y, model1.predict(X)))
print(classification_report(y, model2.predict(X)))


# ROC curve 
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
fpr1, tpr1, thresholds1 = roc_curve(y, model1.decision_function(X))
fpr2, tpr2, thresholds2 = roc_curve(y, model2.decision_function(X))

plt.rc('font', family="D2Coding")
plt.plot(fpr1, tpr1, 'o-', ms=2, label="Logistic Regression")
plt.plot(fpr2, tpr2, 'o-', ms=2, label="Kernel SVM")
plt.legend()
plt.plot([0, 1], [0, 1], 'k--', label="random guess")
plt.xlabel('False Positive Rate (Fall-Out)')
plt.ylabel('True Positive Rate (Recall)')
plt.title('ROC Curve')
plt.show()
Exemple #45
cnf_matrix = confusion_matrix(Y_test,y_predict)

param_grid = [
    {
        'C':[0.01,0.1,1,10,100],
        'penalty':['l2','l1'],
        'class_weight':['balanced',None]
    }
]
grid_search = GridSearchCV(lg,param_grid,cv=10,n_jobs=-1)
grid_search.fit(X_train,Y_train)
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

decision_scores = lg.decision_function(x_test)

from sklearn.metrics import precision_recall_curve

precisions,recalls,thresholds = precision_recall_curve(y_test,decision_scores)
plt.plot(thresholds,precisions[:-1])
plt.plot(thresholds,recalls[:-1])
plt.grid()
plt.show()

def plot_cnf_matrix(cnf_matrix, description):
    class_names = [0, 1]
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
Exemple #46
ss = ShuffleSplit(n_splits=1,test_size=0.2, train_size=0.8, random_state=0)

train_index, test_index = next(ss.split(X,y))

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

clf.C = 1e-3
clf.fit(X_train, y_train)
clf.score(X_test, y_test)


X_test_value = clf.decision_function(X_test)

sorted_va  = np.sort(X_test_value)

plt.plot(X_test_value)
plt.plot([0,120],[0,0], linestyle='--')

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

plt.plot(sigmoid(sorted_va))
plt.plot([0,120], [0.5, 0.5], linestyle='--')
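
# For LogisticRegression, the sigmoid of the decision function is exactly the
# positive-class probability, so the curve above can be cross-checked against
# predict_proba (a one-line sanity sketch):
np.testing.assert_allclose(sigmoid(X_test_value), clf.predict_proba(X_test)[:, 1])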
def probabilisticFreeChoicePilotTask_logisticRegression(reward1, target1, trial1, reward3, target3, trial3, stim_trials):

    
    '''
    Previous rewards and no rewards
    '''
    fc_target_low_block1 = []
    fc_target_high_block1 = []
    fc_prob_low_block1 = []
    prev_reward1_block1 = []
    prev_reward2_block1 = []
    prev_reward3_block1 = []
    prev_reward4_block1 = []
    prev_reward5_block1 = []
    prev_noreward1_block1 = []
    prev_noreward2_block1 = []
    prev_noreward3_block1 = []
    prev_noreward4_block1 = []
    prev_noreward5_block1 = []
    prev_stim_block1 = []

    fc_target_low_block3 = []
    fc_target_high_block3 = []
    fc_prob_low_block3 = []
    prev_reward1_block3 = []
    prev_reward2_block3 = []
    prev_reward3_block3 = []
    prev_reward4_block3 = []
    prev_reward5_block3 = []
    prev_noreward1_block3 = []
    prev_noreward2_block3 = []
    prev_noreward3_block3 = []
    prev_noreward4_block3 = []
    prev_noreward5_block3 = []
    prev_stim1_block3 = []
    prev_stim2_block3 = []
    prev_stim3_block3 = []
    prev_stim4_block3 = []
    prev_stim5_block3 = []

    for i in range(5,len(trial1)):
        if trial1[i] == 2:
            fc_target_low_block1.append(2 -target1[i])   # = 1 if selected low-value, = 0 if selected high-value
            fc_target_high_block1.append(target1[i] - 1)  # = 1 if selected high-value, =  0 if selected low-value
            prev_reward1_block1.append((2*target1[i-1] - 3)*reward1[i-1])  # = -1 if selected low-value and rewarded, = 1 if selected high-value and rewarded
            prev_reward2_block1.append((2*target1[i-2] - 3)*reward1[i-2])  # = -1 if selected low-value and rewarded, = 1 if selected high-value and rewarded
            prev_reward3_block1.append((2*target1[i-3] - 3)*reward1[i-3])  # = -1 if selected low-value and rewarded, = 1 if selected high-value and rewarded
            prev_reward4_block1.append((2*target1[i-4] - 3)*reward1[i-4])  # = -1 if selected low-value and rewarded, = 1 if selected high-value and rewarded
            prev_reward5_block1.append((2*target1[i-5] - 3)*reward1[i-5])  # = -1 if selected low-value and rewarded, = 1 if selected high-value and rewarded
            prev_noreward1_block1.append((2*target1[i-1] - 3)*(1 - reward1[i-1]))  # = -1 if selected low-value and not rewarded, = 1 if selected high-value and not rewarded
            prev_noreward2_block1.append((2*target1[i-2] - 3)*(1 - reward1[i-2]))  # = -1 if selected low-value and not rewarded, = 1 if selected high-value and not rewarded
            prev_noreward3_block1.append((2*target1[i-3] - 3)*(1 - reward1[i-3]))  # = -1 if selected low-value and not rewarded, = 1 if selected high-value and not rewarded
            prev_noreward4_block1.append((2*target1[i-4] - 3)*(1 - reward1[i-4]))  # = -1 if selected low-value and not rewarded, = 1 if selected high-value and not rewarded
            prev_noreward5_block1.append((2*target1[i-5] - 3)*(1 - reward1[i-5]))  # = -1 if selected low-value and not rewarded, = 1 if selected high-value and not rewarded
            prev_stim_block1.append(0)
    num_block3 = len(trial3)
    for i in range(5,num_block3):
        if (trial3[i] == 2):
            fc_target_low_block3.append(2 - target3[i])   # = 1 if selected low-value, = 0 if selected high-value
            fc_target_high_block3.append(target3[i] - 1)
            prev_reward1_block3.append((2*target3[i-1] - 3)*reward3[i-1])  # = -1 if selected low-value and rewarded, = 1 if selected high-value and rewarded
            prev_reward2_block3.append((2*target3[i-2] - 3)*reward3[i-2])  # = -1 if selected low-value and rewarded, = 1 if selected high-value and rewarded
            prev_reward3_block3.append((2*target3[i-3] - 3)*reward3[i-3])  # = -1 if selected low-value and rewarded, = 1 if selected high-value and rewarded
            prev_reward4_block3.append((2*target3[i-4] - 3)*reward3[i-4])  # = -1 if selected low-value and rewarded, = 1 if selected high-value and rewarded
            prev_reward5_block3.append((2*target3[i-5] - 3)*reward3[i-5])  # = -1 if selected low-value and rewarded, = 1 if selected high-value and rewarded
            prev_noreward1_block3.append((2*target3[i-1] - 3)*(1 - reward3[i-1]))  # = -1 if selected low-value and not rewarded, = 1 if selected high-value and not rewarded
            prev_noreward2_block3.append((2*target3[i-2] - 3)*(1 - reward3[i-2]))  # = -1 if selected low-value and not rewarded, = 1 if selected high-value and not rewarded
            prev_noreward3_block3.append((2*target3[i-3] - 3)*(1 - reward3[i-3]))  # = -1 if selected low-value and not rewarded, = 1 if selected high-value and not rewarded
            prev_noreward4_block3.append((2*target3[i-4] - 3)*(1 - reward3[i-4]))  # = -1 if selected low-value and not rewarded, = 1 if selected high-value and not rewarded
            prev_noreward5_block3.append((2*target3[i-5] - 3)*(1 - reward3[i-5]))  # = -1 if selected low-value and not rewarded, = 1 if selected high-value and not rewarded
            prev_stim1_block3.append(2*stim_trials[i - 1] - 1)  # = 1 if stim was delivered and = -1 if stim was not delivered
            prev_stim2_block3.append(2*stim_trials[i - 2] - 1)
            prev_stim3_block3.append(2*stim_trials[i - 3] - 1)
            prev_stim4_block3.append(2*stim_trials[i - 4] - 1)
            prev_stim5_block3.append(2*stim_trials[i - 5] - 1)


    '''
    Turn everything into an array
    '''
    fc_target_low_block1 = np.array(fc_target_low_block1)
    fc_target_high_block1 = np.array(fc_target_high_block1)
    prev_reward1_block1 = np.array(prev_reward1_block1)
    prev_reward2_block1 = np.array(prev_reward2_block1)
    prev_reward3_block1 = np.array(prev_reward3_block1)
    prev_reward4_block1 = np.array(prev_reward4_block1)
    prev_reward5_block1 = np.array(prev_reward5_block1)
    prev_noreward1_block1 = np.array(prev_noreward1_block1)
    prev_noreward2_block1 = np.array(prev_noreward2_block1)
    prev_noreward3_block1 = np.array(prev_noreward3_block1)
    prev_noreward4_block1 = np.array(prev_noreward4_block1)
    prev_noreward5_block1 = np.array(prev_noreward5_block1)
    prev_stim_block1 = np.array(prev_stim_block1)

    fc_target_low_block3 = np.array(fc_target_low_block3)
    fc_target_high_block3 = np.array(fc_target_high_block3)
    prev_reward1_block3 = np.array(prev_reward1_block3)
    prev_reward2_block3 = np.array(prev_reward2_block3)
    prev_reward3_block3 = np.array(prev_reward3_block3)
    prev_reward4_block3 = np.array(prev_reward4_block3)
    prev_reward5_block3 = np.array(prev_reward5_block3)
    prev_noreward1_block3 = np.array(prev_noreward1_block3)
    prev_noreward2_block3 = np.array(prev_noreward2_block3)
    prev_noreward3_block3 = np.array(prev_noreward3_block3)
    prev_noreward4_block3 = np.array(prev_noreward4_block3)
    prev_noreward5_block3 = np.array(prev_noreward5_block3)
    prev_stim1_block3 = np.array(prev_stim1_block3)
    prev_stim2_block3 = np.array(prev_stim2_block3)
    prev_stim3_block3 = np.array(prev_stim3_block3)
    prev_stim4_block3 = np.array(prev_stim4_block3)
    prev_stim5_block3 = np.array(prev_stim5_block3)

    const_logit_block1 = np.ones(fc_target_low_block1.size)
    const_logit_block3 = np.ones(fc_target_low_block3.size)

    
    '''
    Organize data and regress with GLM
    '''
    x = np.vstack((prev_reward1_block1,prev_reward2_block1,prev_reward3_block1,prev_reward4_block1,prev_reward5_block1,
        prev_noreward1_block1,prev_noreward2_block1,prev_noreward3_block1,prev_noreward4_block1,prev_noreward5_block1))
    x = np.transpose(x)
    x = sm.add_constant(x, prepend=False)

    y = np.vstack((prev_reward1_block3,prev_reward2_block3,prev_reward3_block3,prev_reward4_block3,prev_reward5_block3,
        prev_noreward1_block3,prev_noreward2_block3,prev_noreward3_block3,prev_noreward4_block3,prev_noreward5_block3,
        prev_stim1_block3, prev_stim2_block3, prev_stim3_block3, prev_stim4_block3, prev_stim5_block3))
    y = np.transpose(y)
    y = sm.add_constant(y, prepend=False)

    model_glm_block1 = sm.GLM(fc_target_low_block1,x,family = sm.families.Binomial())
    model_glm_block3 = sm.GLM(fc_target_low_block3,y,family = sm.families.Binomial())
    fit_glm_block1 = model_glm_block1.fit()
    fit_glm_block3 = model_glm_block3.fit()
    print(fit_glm_block1.predict())
    
    '''
    Organize data and regress with LogisticRegression
    '''
    
    d_block1 = {'target_selection': fc_target_high_block1, 
            'prev_reward1': prev_reward1_block1, 
            'prev_reward2': prev_reward2_block1, 
            'prev_reward3': prev_reward3_block1, 
            'prev_reward4': prev_reward4_block1, 
            'prev_reward5': prev_reward5_block1, 
            'prev_noreward1': prev_noreward1_block1, 
            'prev_noreward2': prev_noreward2_block1,
            'prev_noreward3': prev_noreward3_block1, 
            'prev_noreward4': prev_noreward4_block1, 
            'prev_noreward5': prev_noreward5_block1}

    df_block1 = pd.DataFrame(d_block1)

    y_block1, X_block1 = dmatrices('target_selection ~ prev_reward1 + prev_reward2 + prev_reward3 + \
                                    prev_reward4 + prev_reward5 + prev_noreward1 + prev_noreward2 + \
                                    prev_noreward3 + prev_noreward4 + prev_noreward5', df_block1,
                                    return_type = "dataframe")
    
    #print X_block1.columns
    # flatten y_block1 into 1-D array
    y_block1 = np.ravel(y_block1)
    
    d_block3 = {'target_selection': fc_target_high_block3, 
            'prev_reward1': prev_reward1_block3, 
            'prev_reward2': prev_reward2_block3, 
            'prev_reward3': prev_reward3_block3, 
            'prev_reward4': prev_reward4_block3, 
            'prev_reward5': prev_reward5_block3, 
            'prev_noreward1': prev_noreward1_block3, 
            'prev_noreward2': prev_noreward2_block3,
            'prev_noreward3': prev_noreward3_block3, 
            'prev_noreward4': prev_noreward4_block3, 
            'prev_noreward5': prev_noreward5_block3, 
            'prev_stim1': prev_stim1_block3,
            'prev_stim2': prev_stim2_block3,
            'prev_stim3': prev_stim3_block3,
            'prev_stim4': prev_stim4_block3,
            'prev_stim5': prev_stim5_block3}
    df_block3 = pd.DataFrame(d_block3)

    y_block3, X_block3 = dmatrices('target_selection ~ prev_reward1 + prev_reward2 + prev_reward3 + \
                                    prev_reward4 + prev_reward5 + prev_noreward1 + prev_noreward2 + \
                                    prev_noreward3 + prev_noreward4 + prev_noreward5 + prev_stim1 + \
                                    prev_stim2 + prev_stim3 + prev_stim4 + prev_stim5', df_block3,
                                    return_type = "dataframe")
    
    # flatten y_block3 into 1-D array
    y_block3 = np.ravel(y_block3)

    # Split data into train and test sets
    X_block1_train, X_block1_test, y_block1_train, y_block1_test = train_test_split(X_block1,y_block1,test_size = 0.3, random_state = 0)
    X_block3_train, X_block3_test, y_block3_train, y_block3_test = train_test_split(X_block3,y_block3,test_size = 0.3, random_state = 0)

    # instantiate a logistic regression model, and fit with X and y training sets
    model_block1 = LogisticRegression()
    model_block3 = LogisticRegression()
    model_block1 = model_block1.fit(X_block1_train, y_block1_train)
    model_block3 = model_block3.fit(X_block3_train, y_block3_train)
    y_block1_score = model_block1.decision_function(X_block1_test)
    y_block3_score = model_block3.decision_function(X_block3_test)

    y_block1_nullscore = np.ones(len(y_block1_score))
    y_block3_nullscore = np.ones(len(y_block3_score))


    # Compute ROC curve and ROC area for each class (low value and high value)
    '''
    fpr_block1 = dict()
    tpr_block1 = dict()
    fpr_block3 = dict()
    tpr_block3 = dict()
    roc_auc_block1 = dict()
    roc_auc_block3 = dict()
    '''
    
    
    fpr_block1, tpr_block1, thresholds_block1 = roc_curve(y_block1_test,y_block1_score)
    roc_auc_block1 = auc(fpr_block1,tpr_block1)
    fpr_block3, tpr_block3, thresholds_block3 = roc_curve(y_block3_test,y_block3_score)
    roc_auc_block3 = auc(fpr_block3,tpr_block3)
    fpr_null_block1, tpr_null_block1, thresholds_null_block1 = roc_curve(y_block1_test,y_block1_nullscore)
    roc_nullauc_block1 = auc(fpr_null_block1,tpr_null_block1)
    fpr_null_block3, tpr_null_block3, thresholds_null_block3 = roc_curve(y_block3_test,y_block3_nullscore)
    roc_nullauc_block3 = auc(fpr_null_block3,tpr_null_block3)

    plt.figure()
    plt.plot(fpr_block1,tpr_block1,'r',label="Block 1 (area = %0.2f)" % roc_auc_block1)
    plt.plot(fpr_null_block1,tpr_null_block1,'r--',label="Block 1 - Null (area = %0.2f)" % roc_nullauc_block1)
    plt.plot(fpr_block3,tpr_block3,'m',label="Block 3 (area = %0.2f)" % roc_auc_block3)
    plt.plot(fpr_null_block3,tpr_null_block3,'m--',label="Block 3 - Null (area = %0.2f)" % roc_nullauc_block3)
    plt.plot([0,1],[0,1],'b--')
    #plt.plot(fpr_block1[1],tpr_block1[1],label="Class HV (area = %0.2f)" % roc_auc_block1[1])
    plt.xlim([0.0,1.0])
    plt.ylim([0.0,1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC')
    plt.legend(loc=4)
    plt.show()

    # Predict class labels for the test set
    predicted_block1 = model_block1.predict(X_block1_test)
    probs_block1 = model_block1.predict_proba(X_block1_test)
    predicted_block3 = model_block3.predict(X_block3_test)
    probs_block3 = model_block3.predict_proba(X_block3_test)

    # Generate evaluation metrics
    print "Block 1 accuracy:", metrics.accuracy_score(y_block1_test, predicted_block1)
    print "Block 1 ROC area under curve:", metrics.roc_auc_score(y_block1_test, probs_block1[:,1])
    print 'Null accuracy rate for Block 1:',np.max([y_block1_test.mean(),1 - y_block1_test.mean()])
    
    print "Block 3 accuracy:", metrics.accuracy_score(y_block3_test, predicted_block3)
    print "Block 3 ROC area under curve:", metrics.roc_auc_score(y_block3_test, probs_block3[:,1])
    print 'Null accuracy rate for Block 3:',np.max([y_block3_test.mean(),1 - y_block3_test.mean()])
    
    
    # Model evaluation using 10-fold cross-validation
    scores_block1 = cross_val_score(LogisticRegression(),X_block1,y_block1,scoring='accuracy',cv=10)
    scores_block3 = cross_val_score(LogisticRegression(),X_block3,y_block3,scoring='accuracy',cv=10)
    print "Block 1 CV scores:", scores_block1
    print "Block 1 Avg CV score:", scores_block1.mean()
    print "Block 3 CV scores:", scores_block3
    print "Block 3 Avg CV score:", scores_block3.mean()

    '''
    # check the accuracy on the training set
    print 'Model accuracy for Block1:',model_block1.score(X_block1, y_block1)
    print 'Null accuracy rate for Block1:',np.max([y_block1.mean(),1 - y_block1.mean()])

    print 'Model accuracy for Block3:',model_block3.score(X_block3, y_block3)
    print 'Null accuracy rate for Block3:',np.max([y_block3.mean(),1 - y_block3.mean()])
    '''

    # examine the coefficients
    print(pd.DataFrame(list(zip(X_block1.columns, np.transpose(model_block1.coef_)))))
    print(pd.DataFrame(list(zip(X_block3.columns, np.transpose(model_block3.coef_)))))
    
    #return fit_glm_block1, fit_glm_block3
    return model_block1, model_block3, predicted_block1, predicted_block3
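
# A synthetic smoke-test sketch for the function above (hypothetical data:
# targets coded 1/2, rewards and stim flags coded 0/1, a trial code of 2
# marking free-choice trials):
# rng = np.random.RandomState(0)
# n = 200
# trial = np.full(n, 2)
# m1, m3, p1, p3 = probabilisticFreeChoicePilotTask_logisticRegression(
#     rng.randint(0, 2, n), rng.randint(1, 3, n), trial,
#     rng.randint(0, 2, n), rng.randint(1, 3, n), trial, rng.randint(0, 2, n))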
# Feature selection
target = np.array(data.Class.tolist())
features = data.drop(['Time', 'Class'], axis=1).values
# Split into training and test sets
train_x, test_x, train_y, test_y = train_test_split(features, target, test_size=0.1, random_state=33)


# Train a logistic regression model
lg = LogisticRegression()
lg.fit(train_x, train_y)
predict_y = lg.predict(test_x)


# Model evaluation
# confidence scores for the test samples
score_y = lg.decision_function(test_x)
# compute and display the confusion matrix
cm = confusion_matrix(test_y, predict_y)
plt.figure()
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix') 
plt.colorbar()
tick_marks = [0, 1]
plt.xticks(tick_marks, rotation=0)
plt.yticks(tick_marks)
thresh = cm.max() / 2
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 
    plt.text(j, i, cm[i, j], 
        horizontalalignment = 'center', 
        color = 'white' if cm[i, j] > thresh else 'black') 
plt.tight_layout() 
class MITLLStringMatcher(BaseEstimator,ClassifierMixin):
    """
    MIT-LL String Matcher as Sklearn Estimator:

     String Matching Techniques:
       - Levenshtein Distance
       - Jaro-Winkler 
       - Soft TF-IDF
    """

    # Logging
    LOG_LEVEL = logging.INFO
    logging.basicConfig(level=LOG_LEVEL,
                                format='%(asctime)s %(levelname)-8s %(message)s',
                                                    datefmt='%a, %d %b %Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    def __init__(self,algorithm='jw', stf_thresh=0.6, idf_model=None, text_normalizer = None):
        """ Initialize dict containing hyperparameters """

        self.algorithm = algorithm
        self.stf_thresh = stf_thresh
        self.idf_model = idf_model
        self.text_normalizer = text_normalizer


    #
    # Basic String Matching Functions
    #
    def levenshtein_similarity(self,s,t):
        """ Levenshtein Similarity """

        Ns, Nt = len(s), len(t)

        lev_sim = 1.0 - (jellyfish.levenshtein_distance(s,t))/float(max(Ns,Nt))

        return lev_sim


    def jaro_winkler_similarity(self,s,t):
        """ Jaro-Winkler Similarity """

        jw_sim = jellyfish.jaro_winkler(s,t)

        return jw_sim


    def soft_tfidf_similarity(self,s,t):
        """
        Soft TFIDF Similarity:

        This similarity measure is only meaningful when you have multi-word strings. 
        For single words, this measure will return 0.0
        """
        stf = self.hyparams['matcher'] #soft tf-idf object

        tfidf_sim = 0.5*(stf.score(s,t)+stf.score(t,s))

        return tfidf_sim


    #
    # Utility Functions
    #
    def init_hyparams(self):
        """ Initialize hyper-parameters dict """

        self.hyparams = dict()
        self.hyparams['match_fcn'] = None
        self.hyparams['algo'] = self.algorithm
        self.hyparams['txt_normer'] = self.text_normalizer

        if self.algorithm == 'lev': #levenshtein
            self.hyparams['match_fcn'] = self.levenshtein_similarity

        elif self.algorithm== 'jw': #jaro-winkler
            self.hyparams['match_fcn'] = self.jaro_winkler_similarity

        elif self.algorithm== 'stf': #softtfidf
            self.hyparams['match_fcn'] = self.soft_tfidf_similarity
            self.hyparams['stf_thresh'] = self.stf_thresh
            self.hyparams['idf_model'] = self.idf_model


    def validate_hyparams(self):
        """ Basic hyperparameter input validation"""
        
        if self.hyparams['algo'] not in set(['lev','jw','stf']):
            raise ValueError("Value of algorithm has to be either 'lev','jw' or 'stf'. Got {0}".format(self.hyparams['algo']))

        if self.hyparams['txt_normer'] not in set(['latin',None]):
            raise ValueError("The only value of txt_normer currently support is 'latin' (or None)")

        if self.hyparams['algo'] == 'stf':
            if (self.hyparams['stf_thresh'] < 0) | (self.hyparams['stf_thresh'] > 1):
                raise ValueError("Value of soft tf-idf's internal jaro-winkler threshold", \
                        "must be [0,1].")

            if self.hyparams['idf_model']:
                if set(self.hyparams['idf_model'].keys()) != set(['idf','corpus_vocab','oov_idf_val']):
                    raise ValueError("IDF model provided must contain only the following keys: ", \
                            "'idf', 'corpus_vocab', and 'oov_idf_val'.")

                if (not isinstance(self.hyparams['idf_model']['idf'],np.ndarray)) or \
                        (self.hyparams['idf_model']['idf'].dtype.type is not np.float64):
                    raise ValueError("idf_model['idf'] must be an np.ndarray of dtype np.float64")

                if not isinstance(self.hyparams['idf_model']['corpus_vocab'],dict):
                    raise ValueError("idf_model['corpus_vocab'] must be a dict.")

                if not isinstance(self.hyparams['idf_model']['oov_idf_val'],float):
                    raise ValueError("idf_model['oov_idf_val'] must be a float.")


    def init_algorithm(self):
        """ Validate hyperparameter inputs, init matcher object if neccessary"""
        
        self.validate_hyparams()

        # Initialize Soft TF-IDF matcher if needed
        if self.hyparams['algo'] == 'stf': #softtfidf
            self.hyparams['matcher'] = Softtfidf(self.hyparams['stf_thresh'],self.hyparams['idf_model'])

        if self.hyparams['txt_normer'] == 'latin':
            self.normalizer = normutils.latin_normalization.MITLLLatinNormalizer()
        else:
            self.normalizer = normutils.text_normalization.MITLLTextNormalizer() #generic normer

    
    def get_raw_similarities(self, X, y=None):
        """ Convert input to raw similarities """

        #make sure we have [0,1] class encoding in y
        if y:
            if set(y) != set((0,1)):
                raise ValueError("y expects class labels to be from {0,1}") 

        similarities = list()

        for i in xrange(len(X)):
            pair = X[i]
            s = unicode(self.normalizer.normalize(pair[0]),'utf-8')
            t = unicode(self.normalizer.normalize(pair[1]),'utf-8')

            if (len(s) > 0) and (len(t) > 0):
                sim = self.hyparams['match_fcn'](s,t)
                similarities.append(sim)
            else:
                similarities.append(0.0)
                if y: y[i] = -1 #set y-value of non-conforming pair to -1

        sims_array = np.asarray(similarities).reshape(-1,1)
        
        if y:
            return (sims_array,y)
        else:
            return sims_array


    def save_model(self,fnameout):
        """ Save model parameters out after fitting. """
        
        if self.lr_:
            model_out = dict()
            model_out['algo'] = self.hyparams['algo']
            model_out['txt_normer'] = self.hyparams['txt_normer']
            model_out['calibration'] = self.lr_
            if self.hyparams['algo'] == 'stf':
                model_out['stf_thresh'] = self.hyparams['stf_thresh']
                model_out['idf_model'] = self.hyparams['idf_model']

            pickle.dump(model_out,open(fnameout,"wb"))
            return self
        else:
            raise ValueError("save_model failed: No model has yet been fit or loaded.")


    def load_model(self,fnamein):
        """ Load model parameters. """
        model_in = pickle.load(open(fnamein,'rb')) # will throw I/O error if file not found

        self.init_hyparams() #initialize hyper-parameter dict

        self.hyparams['algo'] = model_in['algo']
        self.hyparams['txt_normer'] = model_in['txt_normer']
        self.lr_ = model_in['calibration']
        if model_in['algo'] == 'stf':
            self.hyparams['stf_thresh'] = model_in['stf_thresh']
            self.hyparams['idf_model'] = model_in['idf_model']

        self.init_algorithm() #validate hyparams (we assume object not fit when load_model called)

        return self


    #
    # Learning
    #
    def fit(self,X,y):
        """ Fit string matching models to training data
        Assuming X is list of tuples: (('s1',t1'),...,('sN',tN'))
        """
        y = y[:] #shallow copy y, b/c in-place operations to follow

        # Initialize hyper-parameter dict then algorithm
        self.init_hyparams(); self.init_algorithm()

        # Get string match scores
        (s,y) = self.get_raw_similarities(X,y)

        # Get rid of any non-conforming pairs
        data = zip(s,y)
        for pair in reversed(data): #iterate backwards to remove items from "data" 
                                    #so as not to mess up internal indexing of for-loop
            if pair[1] == -1: 
                data.remove(pair)

        (s,y) = zip(*data) 
        
        # Do Platt Scaling 
        self.lr_ = LR(penalty='l1',class_weight='balanced')
        self.lr_.fit(s,y)

        return self


    #
    # Inference
    # 
    def decision_function(self,X):
        """ Take input data, turn into decision """
        s = self.get_raw_similarities(X)

        return self.lr_.decision_function(s)


    def predict(self,X):
        """ Class predictions """
        s = self.get_raw_similarities(X)

        return self.lr_.predict(s)


    def predict_proba(self,X):
        """ Posterior match probabilities (need this for log-loss for CV """
        s = self.get_raw_similarities(X)

        return self.lr_.predict_proba(s)

    #
    # Evaluate
    #
    def score(self,X,y,sample_weight=None):
        """ Score matcher """
        return roc_auc_score(y,self.predict(X),sample_weight=sample_weight)
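
# A hypothetical usage sketch (assumes the jellyfish and normutils dependencies
# used above are installed, plus Python 2 to match the class; not the original
# authors' test code):
# pairs = [(u'john smith', u'jon smyth'), (u'alice jones', u'robert brown')]
# labels = [1, 0]
# matcher = MITLLStringMatcher(algorithm='jw').fit(pairs, labels)
# print(matcher.predict_proba(pairs))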
Exemple #50
                  columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
                  )
df["target"] = dataset.target
print(df.head())
print(df.tail())

print(df.info())
print(df.shape)

X = df.iloc[: , :-1]
y = df.iloc[: , -1]
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=160)
model = LogisticRegression()
model.fit(X_train, y_train)

fpr0, tpr0, thresholds0 = roc_curve(y_test, model.decision_function(X_test)[:, 0], pos_label=0)
fpr1, tpr1, thresholds1 = roc_curve(y_test, model.decision_function(X_test)[:, 1], pos_label=1)
fpr2, tpr2, thresholds2 = roc_curve(y_test, model.decision_function(X_test)[:, 2], pos_label=2)

print(fpr0, tpr0, thresholds0)

plt.plot(fpr0, tpr0, "r-", label="class 0 ")
plt.plot(fpr1, tpr1, "g-", label="class 1")
plt.plot(fpr2, tpr2, "b-", label="class 2")
plt.plot([0, 1], [0, 1], 'k--', label="random guess")
plt.xlim(-0.05, 1.0)
plt.ylim(0, 1.05)
plt.xlabel('False Positive Rate (Fall-Out)')
plt.ylabel('True Positive Rate (Recall)')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
fpr_c, tpr_c, th_c = roc_curve(test["los"], test_cart_prob[::,1])
fpr_s, tpr_s, th_s = roc_curve(test["los"], test_ad_prob[::,1])
fpr_n, tpr_n, th_n = roc_curve(test["los"], test_naive_prob[::,1])
plt.plot(fpr_l, tpr_l)
plt.plot(fpr_c, tpr_c)
plt.plot(fpr_s, tpr_s)
plt.plot(fpr_n, tpr_n)
plt.plot(sorted(np.random.uniform(0, 1, len(test['los']))), sorted(np.random.uniform(0, 1, len(test['los']))), 'k--')
plt.legend(['LogisticRegress', 'CART', 'AdBoosting', 'NaiveBayes','Randomness'], loc='lower right')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

#precision-recall
#log
precision_l, recall_l, thresholds_l = precision_recall_curve(test["los"], log.decision_function(test_variables))
pl.plot(recall_l, precision_l)
pl.xlabel("precision")
pl.ylabel("recall")
pl.title("LogisticRegression")
pl.show()
#cart
precision_c, recall_c, thresholds_c = precision_recall_curve(test["los"], test_cart_prob[::,1])
pl.plot(recall_c, precision_c)
pl.xlabel("precision")
pl.ylabel("recall")
pl.title("CART")
pl.show()
#ad
precision_ad, recall_ad, thresholds_ad = precision_recall_curve(test["los"], ad.decision_function(test_variables))
pl.plot(recall_ad, precision_ad)
Exemple #52
class TagSelector:
    def __init__(self, trainfeatures, tags):
        words, samples, targets = [], [], []
        positive = 0
        for word, sample, target in self.parse_features(trainfeatures, True):
            words.append(word)
            samples.append(sample)
            targets.append(target)

            # balanced set 
            #if target == 1:
                #words.append(word)
                #samples.append(sample)
                #targets.append(target)
                #positive += 1
            #elif positive > 0:
                #words.append(word)
                #samples.append(sample)
                #targets.append(target)
                #positive = max(positive-1, 0)
                
        samples, targets = np.array(samples), np.array(targets)

        #self.scaler = StandardScaler()
        #samples = self.scaler.fit_transform(samples)

        start_t = time()
        self.logit_fit = LogisticRegression().fit(samples, targets)
        end_t = time()
        print("Logit fitted in (%f s)" % (end_t-start_t))

        start_t = time()
        self.svm_fit = svm.LinearSVC().fit(samples, targets)
        end_t = time()
        print("SVM fitted in (%f s)" % (end_t-start_t))

        start_t = time()
        self.nb_fit = GaussianNB().fit(samples, targets)
        end_t = time()
        print("NB fitted in (%f s)" % (end_t-start_t))

        print("Training set size: %d" % len(samples))

    def parse_features(self, featuresfile, is_training):
        with open(featuresfile, 'r') as f:
            for line in f:
                vec = line.split("\t")
                word = vec[0] 
                sample = [float(vec[i]) for i in range(1, len(vec)-1)]
                if is_training:
                    target = int(vec[-1])
                    yield word, sample, target 
                else:
                    id = int(vec[-1])
                    yield word, sample, id 

    def next_sample(self, featurefile):
        cur_id = 0
        samples, words = [], []
        for word, vec, id in self.parse_features(featurefile, False):
            if id != cur_id:
                if cur_id != 0:
                    yield samples, words, cur_id 
                cur_id = id
                samples, words = [vec], [word]
            else:
                samples.append(vec)
                words.append(word)
        if cur_id != 0:
            yield samples, words, cur_id 

    def rank(self, samples, words):
        samples, preds = np.array(samples), []
        # no need to rank if 3 or less candidates
        if len(words) > 3:
            start_t = time()
            #samples = self.scaler.transform(samples)

            preds_lgt = self.logit_fit.decision_function(samples)
            preds_svm = self.svm_fit.decision_function(samples)
            #preds_nb = self.nb_fit.predict_proba(samples)
            #preds_nb = np.array([x[1] for x in preds_nb])

            preds = 0.7*preds_svm + 0.3*preds_lgt # + 0.05*preds_nb

            end_t = time()
            #print("Predictions made in (%f s)" % (end_t-start_t))

            #results = zip(words, preds_nb)
            #return [w for w, _ in sorted(results, key = lambda x: x[1][1], reverse=True)]

            results = zip(words, preds)
            return [w for w, _ in sorted(results, key = lambda x: x[1], reverse=True)]

        return words
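
# A minimal sketch of the tab-separated feature file TagSelector expects
# (word, float feature columns, integer target last; hypothetical values):
# with open('train_features.tsv', 'w') as f:
#     f.write('alpha\t0.9\t0.1\t1\n')
#     f.write('beta\t0.2\t0.8\t0\n')
#     f.write('gamma\t0.7\t0.3\t1\n')
#     f.write('delta\t0.1\t0.6\t0\n')
# selector = TagSelector('train_features.tsv', tags=None)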
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')  # this token pattern keeps single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])



print "Start logisitc regression. Don't worry it will take some time"

#Train a sentiment classifier with logistic regression
from sklearn.linear_model import LogisticRegression

majority_model = LogisticRegression()
majority_model.fit(train_matrix, train_data['sentiment'])

#Find the most positive (and negative) review
scores_test = majority_model.decision_function(test_matrix)
pred_test = pred(scores_test)


#Compute accuracy of the classifier
print "Compute accuracy of the classifier. It will take a while so take a break"
accuracy_test = compute_accuracy(test_data['sentiment'], pred_test)  
print("The accuracy on the test data is: %.3f" %accuracy_test)

#Answer 0.843
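
# The helpers `pred` and `compute_accuracy` are not shown above; plausible
# one-liners consistent with +1/-1 sentiment labels (an assumption, not the
# original definitions):
# pred = lambda scores: np.where(scores > 0, 1, -1)
# compute_accuracy = lambda truth, predicted: np.mean(truth == predicted)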

Exemple #54
    X_test_unlabeled_pool = X_test[:1992, :]
    X_test_test = X_test[1992:, :]
    y_test_unlabeled_pool = y_test[:1992, -1]
    y_test_test = y_test[1992:, -1]

    acc = []
    train_acc = []
    dim = []

    for k in range(0, 10, 1):
        clf = LogisticRegression()
        print('Size of X:', len(X_train), X_train.shape, type(X_train))
        clf.fit(X_train, y_train[:, -1])

        preds = clf.decision_function(X_test_unlabeled_pool)

        values = []
        positions = []
        for i in range(0, len(X_test_unlabeled_pool), 1):
            values.append(abs(preds[i]))
            positions.append(i)

        for i in range(10):
            pos = np.array(values).argmin()
            # print np.array(values).min()
            X_train_new = np.zeros(((X_train.shape[0] + 1), X_train.shape[1]))
            y_train_new = np.zeros(((y_train[:, -1].shape[0] + 1), 1))

            X_train_new[:X_train.shape[0]] = X_train
            X_train_new[X_train.shape[0]:] = X_test_unlabeled_pool[pos, :]
Exemple #55
    show_dataset(X, Y)

    # Split dataset
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)

    # Create and train Gaussian Naive Bayes classifier
    gnb = GaussianNB()
    gnb.fit(X_train, Y_train)

    # Create and train a Logistic regressor (for comparison)
    lr = LogisticRegression()
    lr.fit(X_train, Y_train)

    # Compute ROC Curve
    Y_gnb_score = gnb.predict_proba(X_test)
    Y_lr_score = lr.decision_function(X_test)

    fpr_gnb, tpr_gnb, thresholds_gnb = roc_curve(Y_test, Y_gnb_score[:, 1])
    fpr_lr, tpr_lr, thresholds_lr = roc_curve(Y_test, Y_lr_score)

    # Plot ROC Curve
    plt.figure(figsize=(30, 25))

    plt.plot(fpr_gnb,
             tpr_gnb,
             color='red',
             label='Naive Bayes (AUC: %.2f)' % auc(fpr_gnb, tpr_gnb))
    plt.plot(fpr_lr,
             tpr_lr,
             color='green',
             label='Logistic Regression (AUC: %.2f)' % auc(fpr_lr, tpr_lr))
cm_logistic


# #### 14. What does the Confusion Matrix tell us?

'''Our model produces a higher proportion of false negatives than false positives;
it handles negatives better. This is troubling because in a disaster, all else
being equal, we would rather err toward overestimating survivors (false positives)
while minimizing false negatives. A headline about government waste (preparing for
survivors who did not materialize) is preferable to one about government tragedy
(deaths that better preparation could have prevented).'''

# #### 15. Plot the ROC curve

Y_score_lr = lr.decision_function(X_test)

FPR_logistic = dict() #false positive rate. X-axis for ROC Curve
TPR_logistic = dict() #true positive rate. Y-axis for ROC curve
ROC_AUC = dict()

FPR_logistic[1], TPR_logistic[1], thresholds_logistic = metrics.roc_curve(y_test, Y_score_lr)
ROC_AUC[1] = metrics.auc(FPR_logistic[1], TPR_logistic[1])

plt.figure()
plt.plot(FPR_logistic[1], TPR_logistic[1], label='ROC curve (area = %0.2f)' % ROC_AUC[1])
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')