Example No. 1
import numpy as np
from sklearn import metrics


def interpolated_precision_recall_curve(queries_ranking, queries_similarities, relevants):
    """Average the 11-point interpolated precision over all queries and return
    it together with the area under that interpolated curve."""
    
    queries_count = np.shape(queries_ranking)[0]
    interpolated_precision = np.zeros(11,dtype = np.float128) 
    
    for qindex in range(0,queries_count):

        tp = 0
        precision, recall = [0],[0]
        
        relevants_count = np.shape(np.nonzero(relevants[qindex]))[1]
        retrieved_count = 1
        
        for ranki in queries_ranking[qindex]:
            if (queries_similarities[qindex][ranki] > 0) and (relevants[qindex][ranki] == 1):
                tp += 1
                
            precisioni = tp / retrieved_count
            if relevants_count == 0:
                recalli = 1
            else:
                recalli = tp / relevants_count
            
            retrieved_count += 1

            precision += [precisioni]
            recall += [recalli]
              
        # 11-point interpolated precision for this query:
        # precision_levels[i] is the maximum precision at recall >= i/10
        precision_levels = []
        
        for i in range(0,11):
            prec_ati = 0
            for j in range(0,len(recall)):
                if i <= recall[j]*10:
                    prec_ati =  max(prec_ati,precision[j])
                    
            precision_levels.append(prec_ati)
            interpolated_precision[i] += prec_ati/queries_count
            
        del precision
        del recall
                 
    
    auc = float("{0:1.4f}".format(metrics.auc([0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],interpolated_precision)))
    
    return interpolated_precision, auc
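A minimal usage sketch (not part of the original example): the rankings, similarity scores and relevance labels below are made up purely to show the expected shapes.

# two hypothetical queries over four documents
queries_ranking = np.array([[0, 1, 2, 3],
                            [3, 2, 1, 0]])            # document indices, best match first
queries_similarities = np.array([[0.9, 0.7, 0.2, 0.0],
                                 [0.1, 0.3, 0.8, 0.6]])
relevants = np.array([[1, 0, 1, 0],
                      [0, 1, 1, 0]])                  # binary relevance judgements

interp_prec, pr_auc = interpolated_precision_recall_curve(
    queries_ranking, queries_similarities, relevants)
print(interp_prec)   # 11-point interpolated precision averaged over the queries
print(pr_auc)        # area under the interpolated curve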
Example No. 2
    def draw_roc(self, label_sets, title='', save_path='', show_plot=False):
        # requires: from matplotlib import pyplot
        #           from sklearn.metrics import roc_curve, auc
        # Compute the ROC curve and the area under it for each fold
        pyplot.clf()

        for i, (labels, probas) in enumerate(label_sets):
            fpr, tpr, _ = roc_curve(labels, probas[:, 1])
            roc_auc = auc(fpr, tpr)

            # Plot ROC curve
            pyplot.plot(fpr,
                        tpr,
                        label='Training fold {0} (area = {1})'.format(
                            i + 1, round(roc_auc, 2)))

        pyplot.plot([0, 1], [0, 1], 'k--')
        pyplot.xlim([0.0, 1.0])
        pyplot.ylim([0.0, 1.0])
        pyplot.xlabel('False Positive Rate')
        pyplot.ylabel('True Positive Rate')
        pyplot.title(title)
        pyplot.legend(loc="lower right")
        if save_path: pyplot.savefig(save_path)
        if show_plot: pyplot.show()
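A self-contained sketch of building the label_sets argument; the synthetic data, LogisticRegression model and 3-fold split below are assumptions standing in for the caller's real folds and classifier.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=300, random_state=0)
label_sets = []
for train_idx, test_idx in StratifiedKFold(n_splits=3).split(X, y):
    clf = LogisticRegression(max_iter=1000).fit(X[train_idx], y[train_idx])
    label_sets.append((y[test_idx], clf.predict_proba(X[test_idx])))
# each entry is (labels, probas) with probas of shape (n_samples, 2),
# matching the probas[:, 1] indexing done in draw_roc above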
Example No. 3
from sklearn import metrics

# read one floating-point prediction per line
with open("../output/splitpred1.csv") as pred_file:
    numarray = [float(line) for line in pred_file if line.strip()]

# read the corresponding true labels, one per line
with open("../output/answers.csv") as answer_file:
    answerarray = [float(line) for line in answer_file if line.strip()]

fpr, tpr, thresholds = metrics.roc_curve(answerarray, numarray, pos_label=1)
auc = metrics.auc(fpr, tpr)
print(auc)
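The same computation can also be written with NumPy's loadtxt; this sketch assumes each file still holds one floating-point value per line.

import numpy as np
from sklearn import metrics

scores = np.loadtxt("../output/splitpred1.csv")
answers = np.loadtxt("../output/answers.csv")
fpr, tpr, _ = metrics.roc_curve(answers, scores, pos_label=1)
print(metrics.auc(fpr, tpr))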
Example No. 4
# (train, train_input, test, metrics and RandomForestClassifier are defined or imported earlier in the original script, not shown here)
train_output = train[:, 4]

print("train in and out", train_input.shape, train_output.shape)

test_input = test[:, 0:4]
test_output = test[:, 4]

print("test in and out", test_input.shape, test_output.shape)

# train a classifier - a RandomForestClassifier here, or an SVC (Support Vector Classifier)

# create the classifier
classifier = RandomForestClassifier()  # or SVC()
# learn the data
classifier.fit(train_input, train_output)
print(test_output)
# predict the output of the test input
predicted = classifier.predict(test_input)

print(predicted)

# Calculate the ROC curve
fpr, tpr, thresholds = metrics.roc_curve(test_output, predicted, pos_label=2)
# Calculate the area under the ROC curve
auc = metrics.auc(fpr, tpr)

print(auc)

# predict the test set values
# get our AUC and accuracy
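Feeding roc_curve hard class predictions yields only one interior threshold; a sketch of the same evaluation using the predicted probability of the positive class instead (same classifier and variables as above, assuming class 2 appears in the training labels):

# probability assigned to class 2, the pos_label used above
scores = classifier.predict_proba(test_input)[:, list(classifier.classes_).index(2)]
fpr, tpr, thresholds = metrics.roc_curve(test_output, scores, pos_label=2)
print(metrics.auc(fpr, tpr))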
Example No. 5
File: util.py Project: jcrudy/higgs
def roc(y, p_hat):
    fpr, tpr, thresholds = roc_curve(y, p_hat)
    roc_auc = auc(fpr, tpr)
    
    return roc_auc, fpr, tpr
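A toy call of the helper above, with made-up labels and scores (the imports are the ones util.py is assumed to rely on):

import numpy as np
from sklearn.metrics import roc_curve, auc

y = np.array([0, 0, 1, 1])
p_hat = np.array([0.1, 0.4, 0.35, 0.8])
roc_auc, fpr, tpr = roc(y, p_hat)
print(roc_auc)   # 0.75 for these made-up scores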
Example No. 6
def evaluate(y_true, y_pred):
    y_pred = np.array(y_pred)
    y_true = np.array(y_true)
    fpr, tpr, _thresholds = metrics.roc_curve(y_true, y_pred, pos_label=1)
    return metrics.auc(fpr, tpr)
Example No. 7
def doCV():
    SEED = 42
    rnd = np.random.RandomState(SEED)

    model_lr = linear_model.LogisticRegression(C=3)
    model_rf = ensemble.RandomForestClassifier(
        n_estimators=10, min_samples_split=10, compute_importances=False, n_jobs=2, random_state=rnd, verbose=2
    )

    print "loading data for random forest..."
    y, X = data_io.load_data_pd("train_orig.csv", use_labels=True)
    _, X_test = data_io.load_data_pd("test_orig.csv", use_labels=False)

    xtrain = getRFX(X)
    xtest = getRFX_test(X_test)
    xtrain = xtrain[:, 1:]
    xtest = xtest[:, 1:]

    xtrain.dump("num_train.dat")
    xtest.dump("num_test.dat")
    print "dumped..!"
    print "loading data for logistic regression..."
    ysp, Xsp = data_io.load_data("train_orig.csv")
    y_testsp, X_testsp = data_io.load_data("test_orig.csv", use_labels=False)
    # === one-hot encoding === #
    # we want to encode the category IDs encountered both in
    # the training and the test set, so we fit the encoder on both
    encoder = preprocessing.OneHotEncoder()
    # print Xsp.shape, X_testsp.shape
    encoder.fit(np.vstack((Xsp, X_testsp)))
    Xsp = encoder.transform(Xsp)  # Returns a sparse matrix (see numpy.sparse)
    X_testsp = encoder.transform(X_testsp)

    print "starting cross validation..."
    nSamples = X.shape[0]  # number of training rows; ShuffleSplit is indexed over samples
    niter = 10
    cv = cross_validation.ShuffleSplit(nSamples, n_iter=niter, test_size=0.2, random_state=rnd)
    mean_auc = 0.0
    i = 0
    for train, test in cv:
        xtrain = X.ix[train]
        ytrain = y[train]
        xtest = X.ix[test]
        ytest = y[test]

        xtrain_sp = Xsp[train]
        xtest_sp = Xsp[test]  # validation rows must come from the encoded training matrix, not X_testsp
        ytrainsp = ysp[train]

        xtrain = getRFX(xtrain)
        xtest = getRFX_test(xtest)
        xtrain = xtrain[:, 1:]
        xtest = xtest[:, 1:]

        print "fitting random forest...."
        model_rf.fit(xtrain, ytrain)
        preds_rf = model_rf.predict_proba(xtest)[:, 1]

        print "fitting logistic regression..."
        model_lr.fit(xtrain_sp, ytrainsp)
        preds_lr = model_lr.predict_proba(xtest_sp)[:, 1]

        preds = [np.mean(x) for x in zip(preds_rf, preds_lr)]

        fpr, tpr, _ = metrics.roc_curve(ytest, preds)
        roc_auc = metrics.auc(fpr, tpr)
        print "AUC (fold %d/%d): %f" % (i + 1, niter, roc_auc)
        mean_auc += roc_auc
        i += 1
    print "Mean AUC: ", mean_auc / niter
Example No. 8
    def get_auc(self, labels_true, labels_prob):
        fpr, tpr, _ = roc_curve(labels_true, labels_prob)
        return auc(fpr, tpr)
    def interpolated_precision_recall_curve(self,queries_ranking, queries_similarities, relevants):
        
        queries_count = shape(queries_ranking)[0]
        interpolated_precision = zeros(11,dtype = np.float128) 
        
        for qindex in range(0,queries_count):

            tp = 0
            precision, recall = [0],[0]
            
            relevants_count = shape(nonzero(relevants[qindex]))[1]
            retrieved_count = 1
            
            for ranki in queries_ranking[qindex]:
                if (queries_similarities[qindex][ranki] > 0) and (relevants[qindex][ranki] == 1):
                    tp += 1
                    
                precisioni = tp / retrieved_count
                if relevants_count == 0:
                    recalli = 1
                else:
                    recalli = tp / relevants_count
                
                retrieved_count += 1
                
                precision += [precisioni]
                recall += [recalli]
                  
            # 11-point interpolated precision for this query:
            # precision_levels[i] is the maximum precision at recall >= i/10
            precision_levels = []
            
            for i in range(0,11):
                prec_ati = 0
                for j in range(0,len(recall)):
                    if i <= recall[j]*10:
                        prec_ati =  max(prec_ati,precision[j])
                        
                precision_levels.append(prec_ati)
                interpolated_precision[i] += prec_ati/queries_count
                
            del precision
            del recall
                  
        auc = float("{0:1.4f}".format(metrics.auc([0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],interpolated_precision)))
        
        return interpolated_precision, auc
Example No. 10
def blTopic():
    (options, args) = parser.parse_args(sys.argv[1:])  #@UnusedVariable
    dataset = options.dataset

    kernelType = options.kernelType
    nFold = options.nFold
    nCodeword = options.nCodeword

    dataPath = rootDir + dataset + bofDir
    catmap = getCatMap(dataset)
    catList = catmap.keys()
    dataext = str(nCodeword) + bofext
    nCategory = len(catList)

    perfMean = np.zeros(nCategory)
    perfStd = np.zeros(nCategory)

    for iCat, catname in enumerate(catList):
        print catname
        # read the data of the current category, which will be the positive class
        fname = dataPath + catname + dataext
        catpos = np.genfromtxt(fname, dtype=np.int)  # catpos
        catpos = catpos[:, :nCodeword + 1]
        catpos[:, nCodeword] = 1
        # read the data of the remaining categories, which form the negative class
        firstvisit = True
        for cats in catList:
            if (cats != catname):
                fname = dataPath + cats + dataext  # use the negative category's own file
                if (firstvisit):
                    catneg = np.genfromtxt(fname, dtype=np.int)
                    firstvisit = False
                else:
                    catneg = np.concatenate(
                        (catneg, np.genfromtxt(fname, dtype=np.int)), axis=0)
        # sample the negative data so it has the same number of rows as the positive data
        nPos = catpos.shape[0]
        nNeg = catneg.shape[0]
        catneg = catneg[np.random.randint(0, nNeg, nPos), :]  #catneg
        catneg = catneg[:, :nCodeword + 1]
        catneg[:, nCodeword] = 0
        #combine positive and negative data
        data = np.concatenate((catpos, catneg), axis=0)
        #shuffle the rows to aid in random selection of train and test
        np.random.shuffle(data)

        X = data[:, :nCodeword]
        y = data[:, nCodeword]

        clfParamList = {
            'kernel': kernelType,
            'gamma': 1e-3,
            'C': 1,
            'degree': 4,
            'probability': True,
            'shrinking': True,
            'cache_size': 1000
        }
        classifier = SVC(**clfParamList)
        cv = StratifiedKFold(y, k=nFold)
        avgprec = np.zeros(nFold)
        for icv, (train, test) in enumerate(cv):
            clf = classifier.fit(X[train], y[train])
            probas_ = clf.predict_proba(X[test])
            precision, recall, thresholds = precision_recall_curve(
                y[test], probas_[:, 1])  #@UnusedVariable
            avgprec[icv] = auc(recall, precision)
        perfMean[iCat] = np.mean(avgprec)
        perfStd[iCat] = np.std(avgprec)
    return [perfMean, perfStd]
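For reference, the per-fold quantity auc(recall, precision) is closely related to what sklearn.metrics.average_precision_score computes directly; a small self-contained sketch with made-up labels and scores:

import numpy as np
from sklearn.metrics import precision_recall_curve, auc, average_precision_score

y_true = np.array([0, 1, 1, 0, 1])
y_score = np.array([0.5, 0.8, 0.3, 0.4, 0.9])
precision, recall, _ = precision_recall_curve(y_true, y_score)
print(auc(recall, precision))                    # trapezoidal area under the precision-recall curve
print(average_precision_score(y_true, y_score))  # step-wise average precision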