def calculate_roc(truth, predictions):
    # n_classes is assumed to be defined at module level
    lb_truth = label_binarize(truth.iloc[:, -1].astype(int), classes=np.arange(n_classes))
    lb_prediction = label_binarize(predictions.iloc[:, -1].astype(int), classes=np.arange(n_classes))

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):  # assumes len(letter_set) == n_classes, as the micro/macro steps below do
        fpr[i], tpr[i], _ = roc_curve(lb_truth[:, i], lb_prediction[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(lb_truth.ravel(), lb_prediction.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])  # scipy's interp; np.interp is equivalent

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    
    return fpr, tpr, roc_auc
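calculate_roc leans on module-level globals (n_classes, letter_set). A minimal self-contained sketch of the same micro-average computation, with purely illustrative 3-class labels:

import numpy as np
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

# hypothetical 3-class ground truth and hard predictions
truth = np.array([0, 1, 2, 2, 1, 0])
preds = np.array([0, 1, 2, 1, 1, 0])

lb_truth = label_binarize(truth, classes=np.arange(3))
lb_preds = label_binarize(preds, classes=np.arange(3))

# micro-average: pool all class columns into a single binary problem
fpr, tpr, _ = roc_curve(lb_truth.ravel(), lb_preds.ravel())
print(auc(fpr, tpr))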
Example #2
def PersonWorker(person):
    print('starting on person: ', str(person))

    #data = 40 videos x 32 alpha(csp channel)
    (X_train, y_train, X_test, y_test) = DL.loadPersonEpochDimRedu(person=person,
        featureFunc = featureFunc,
    )
    
    #http://stackoverflow.com/questions/26963454/lda-ignoring-n-components => only 1 feature :(
    print(np.shape(X_train))

    svm = LinearSVC()
    svm.fit(X_train, y_train)
    
    y = svm.predict(X_train)
    y = label_binarize(y, classes=[0, 1, 2, 3])
    train_auc = UT.auc(y, y_train)

    y = svm.predict(X_test)
    y = label_binarize(y, classes=[0, 1, 2, 3])
    test_auc = UT.auc(y, y_test)


    print('person: ', person, 
        ' - train auc: ', str(train_auc),
        ' - test auc: ' , str(test_auc)
    )

    return [train_auc, test_auc]
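Given its name and per-subject signature, PersonWorker is presumably fanned out over subjects with multiprocessing; a hedged sketch (the subject range 1..32 is an assumption, suggested by the 32-channel comment above):

from multiprocessing import Pool

if __name__ == '__main__':
    with Pool(processes=4) as pool:
        results = pool.map(PersonWorker, range(1, 33))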
    def fit(self, X, y):
        self.init_params(X, y)
        self.paths = self.construct_paths()
        num = len(self.paths[0])
        swarm_paths = [sorted(list(set([s[i] for s in self.paths if s[i] is not None]))) for i in xrange(num)]
        W = self.init_network()
        self.W_swarms = [[[s for s in self.swarms if s.path[j] == i] for i in swarm_paths[j]] for j in xrange(num)]

        X_train, X_valid, y_train, y_valid = cv.train_test_split(X, y, test_size=self.validation_size,
                                                                 random_state=self.random_state)

        # binarize true values
        if len(self.classes_) > 2:
            y_train = label_binarize(y_train, self.classes_)
            y_valid = label_binarize(y_valid, self.classes_)
        else:
            y_train = self.mlb.fit_transform(label_binarize(y_train, self.classes_))
            y_valid = self.mlb.fit_transform(label_binarize(y_valid, self.classes_))

        j = 0
        tmp = [1e3 - float(x * 1e3)/self.window for x in xrange(self.window)]
        window = deque(tmp, maxlen=(self.window * 5))
        self.num_evals = 0
        best_score = np.inf

        if self.verbose:
            print "Fitting network {0}-{1}-{2} with {3} paths".format(self.n_in, self.n_hidden, self.n_out, len(self.swarms))

        while True:
            j += 1
            for s in self.swarms:
                for p_index in xrange(self.num_particles):
                    self.num_evals += 1

                    # evaluate each swarm
                    score = s.evaluate(W, X_train, y_train, p_index)

                    # reconstruct gvn
                    Wn = self.reconstruct_gvn(W)

                    # update
                    s.update(self.w, self.c1, self.c2, p_index)

                    # evaluate gvn
                    y_pred = self.forward(Wn, X_valid)
                    score = self.cost(y_valid, y_pred)
                    if score < best_score:
                        W = Wn[:]
                        best_score = score

            window.append(best_score)
            r = linregress(range(self.window), list(window)[-self.window:])
            if self.verbose:
                print j, best_score

            if r[0] >= 0 or best_score < 1e-3:
                self.W = W
                self.num_generations = j
                return self
def test_sensitivity_specificity_error_multilabels():
    y_true = [1, 3, 3, 2]
    y_pred = [1, 1, 3, 2]
    y_true_bin = label_binarize(y_true, classes=np.arange(5))
    y_pred_bin = label_binarize(y_pred, classes=np.arange(5))

    with pytest.raises(ValueError):
        sensitivity_score(y_true_bin, y_pred_bin)
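For contrast, a hedged sketch of the call that does work: imblearn's sensitivity_score accepts the original label-encoded targets directly (the averaging choice is an assumption):

from imblearn.metrics import sensitivity_score

y_true = [1, 3, 3, 2]
y_pred = [1, 1, 3, 2]
print(sensitivity_score(y_true, y_pred, average='macro'))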
 def __init__(self, file_path, number_features):
     dataset = self.load_dataset(file_path, number_features)
     xs = dataset[:, 0:number_features + 1]
     ys = dataset[:, number_features + 1]
     self.xs, self.xs_test, ys, ys_test = train_test_split(xs, ys, train_size=0.6)
     self.ys = np.transpose(label_binarize(ys, classes=[0, 1, 2]))
     self.ys_test = np.transpose(label_binarize(ys_test, classes=[0, 1, 2]))
     self.m = self.xs.shape[0]
     self.test_set_size = self.xs_test.shape[0]
def getROCScore(X_train, y_train, X_test, y_test, classifierName, depth=None, Cvalue=1, alphaValue=0.0):
    # Binarize the output
    y_train = label_binarize(y_train, classes=[3, 4, 5, 6, 7, 8, 9, 12, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 999])
    n_classes = y_train.shape[1]
    y_test = label_binarize(y_test, classes=[3, 4, 5, 6, 7, 8, 9, 12, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 999])

    # Learn to predict each class against the other
    if classifierName=='DecisionTree':
        classifier=OneVsRestClassifier(tree.DecisionTreeClassifier(max_depth=depth))
    elif classifierName=='LogisticRegression':
        classifier = OneVsRestClassifier(linear_model.LogisticRegression(C=Cvalue))
    elif classifierName=='LinearSVC':
        classifier= OneVsRestClassifier(LinearSVC(C=Cvalue))
    elif classifierName=='NaiveBayes':
        classifier= OneVsRestClassifier(MultinomialNB(alpha=alphaValue))
    elif classifierName=='Bagging':
        estimator= tree.DecisionTreeClassifier()
        classifier=OneVsRestClassifier(BaggingClassifier(base_estimator=estimator))

    
    y_score = classifier.fit(X_train, y_train).predict(X_test)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Compute macro-average ROC curve and ROC area

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    return (roc_auc["micro"],roc_auc["macro"],classifier)
def xval(clf, x, y, train_index, test_index):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(x_train, y_train)
    y_pred = clf.predict_proba(x_test)
    if len(clf.classes_) > 2:
        mse = mean_squared_error(label_binarize(y_test, clf.classes_), y_pred)
    else:
        mlb = MultiLabelBinarizer()
        mse = mean_squared_error(mlb.fit_transform(label_binarize(y_test, clf.classes_)), y_pred)
    acc = accuracy_score(y_test, y_pred.argmax(axis=1))
    evals = clf.get_num_evals()
    return mse, acc, evals
def PR_multi_class(data_train, data_test, data_test_vectors):
    # Binarize the output
    y_train_label = label_binarize(data_train.target, classes=[0, 1, 2])
    n_classes = y_train_label.shape[1]
    
    random_state = np.random.RandomState(0)
    
    # shuffle and split training and test sets
    # (data_train_vectors, the vectorized training documents, is assumed to be
    # defined at module level, mirroring the data_test_vectors argument)
    X_train, X_test, y_train, y_test = train_test_split(data_train_vectors, y_train_label, test_size=.5,
                                                        random_state=random_state)
    
    # Learn to predict each class against the other
    classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=random_state))
    classifier.fit(X_train, y_train)
    y_pred_score = classifier.decision_function(data_test_vectors)
    
    y_test_label = label_binarize(data_test.target, classes=[0, 1, 2])
    
    # Compute Precision-Recall and plot curve
    precision = dict()
    recall = dict()
    average_precision = dict()
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_test_label[:, i], y_pred_score[:, i])
        average_precision[i] = average_precision_score(y_test_label[:, i], y_pred_score[:, i])
    
    # Compute micro-average precision-recall curve and average precision
    precision["micro"], recall["micro"], _ = precision_recall_curve(y_test_label.ravel(), y_pred_score.ravel())
    average_precision["micro"] = average_precision_score(y_test_label, y_pred_score, average="micro")
    
    # Plot Precision-Recall curve for each class
    plt.clf()
#    plt.plot(recall["micro"], precision["micro"],
#             label='micro-average PR curve (area = {0:0.2f})'
#                   ''.format(average_precision["micro"]))
    for i in range(n_classes):
        plt.plot(recall[i], precision[i],
                 label='PR curve of class {0} (area = {1:0.2f})'
                       ''.format(i, average_precision[i]))
    
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall curve of multi-class')
    plt.legend(loc="lower right")
    plt.show()
    return 0
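The same pipeline as a self-contained sketch, using the iris dataset as stand-in data (an assumption; the original operates on vectorized documents):

from sklearn import datasets, svm
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize
from sklearn.metrics import average_precision_score

iris = datasets.load_iris()
Y = label_binarize(iris.target, classes=[0, 1, 2])
X_train, X_test, y_train, y_test = train_test_split(iris.data, Y, test_size=.5, random_state=0)
clf = OneVsRestClassifier(svm.SVC(kernel='linear')).fit(X_train, y_train)
print(average_precision_score(y_test, clf.decision_function(X_test), average='micro'))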
def gensim_classifier():
  logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
  label_list = get_labels()
  tweet_list = get_labelled_tweets()

  # split all sentences to list of words
  sentences = []
  for tweet in tweet_list:
    temp_doc = tweet.split()
    sentences.append(temp_doc)

  # parameters for model
  num_features = 100
  min_word_count = 1
  num_workers = 4
  context = 2
  downsampling = 1e-3

  # Initialize and train the model
  w2v_model = Word2Vec(sentences, workers=num_workers, \
              size=num_features, min_count = min_word_count, \
              window = context, sample = downsampling, seed=1)

  index_value, train_set, test_set = train_test_split(0.80, sentences)
  train_vector = getAvgFeatureVecs(train_set, w2v_model, num_features)
  test_vector = getAvgFeatureVecs(test_set, w2v_model, num_features)
  train_vector = Imputer().fit_transform(train_vector)
  test_vector = Imputer().fit_transform(test_vector)

  # train model and predict
  model = LinearSVC()
  classifier_fitted = OneVsRestClassifier(model).fit(train_vector, label_list[:index_value])
  result = classifier_fitted.predict(test_vector)

  # output result to csv
  create_directory('data')
  result.tofile("data/w2v_linsvc.csv", sep=',')

  # store the model to mmap-able files
  create_directory('model')
  joblib.dump(model, 'model/%s.pkl' % 'w2v_linsvc')

  # evaluation
  label_score = classifier_fitted.decision_function(test_vector)
  binarise_result = label_binarize(result, classes=class_list)
  binarise_labels = label_binarize(label_list, classes=class_list)

  evaluate(binarise_result, binarise_labels[index_value:], label_score, 'w2v_linsvc')
def roc(features_trunc, labels, categories, classifier):
	"""
	compute and plot the roc curve for the given classifier
		features_trunc - features matrix truncated to the k best features
		labels - the classes of the data
		categories - different possible categories (66 for subcategories or 14 for categories)
		classifier - MultinomialNB or lda
	"""
	# divide the data into training and test set
	features_train, features_test, categoryids_train, categoryids_test = train_test_split(features_trunc, labels, test_size=.1,random_state=0)
	# define the OneVsRestClassifier with the given classifier (LDA or Naive Bayes)
	clf = OneVsRestClassifier(classifier)
	# train the classifier and compute the probabilities for the test data labels
	clf_fit = clf.fit(features_train, categoryids_train)
	labels_score = clf_fit.predict_proba(features_test)
	# binarize the labels (necessary for the roc curve)
	categoryids_test = label_binarize(categoryids_test, classes=categories)
	# compute the false positive rate, true positive rate and the thresholds
	fpr, tpr, thresholds = metrics.roc_curve(categoryids_test.ravel(), labels_score.ravel())
	# compute the area under the curve
	roc_auc = metrics.auc(fpr, tpr)
	# plot the roc curve
	pl.clf()
	pl.plot(fpr, tpr, 'r', label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc), linewidth=2)
	pl.plot([0, 1], [0, 1], 'k--', linewidth=2)
	pl.xlim([0.0, 1.0])
	pl.ylim([0.0, 1.05])
	pl.xlabel('false positive rate')
	pl.ylabel('true positive rate')
	pl.title('Receiver operating characteristic for micro-averaged classification scores')
	pl.legend(loc="lower right")
	pl.show()
def model(train_data, train_label, test_data, test_label, n_classes):
    # Binarize the output
    train_label = label_binarize(train_label, classes=list(np.arange(n_classes)))
    test_label = label_binarize(test_label, classes=list(np.arange(n_classes)))

    # Basic classifier
    # basic_clf = LogisticRegression(C=1.0)
    # basic_clf = SVC()
    # basic_clf = KNeighborsClassifier()
    basic_clf = GaussianNB()
    # Multi-class
    classifier = OneVsRestClassifier(basic_clf)
    classifier.fit(train_data, train_label)
    # test_score = classifier.decision_function(test_data)
    test_score = classifier.predict_proba(test_data)
    return test_score, test_label
Example #12
def prepare_features(df):
    df['Age'].fillna(df['Age'].mean(), inplace = True)
    df['Fare'].fillna(df['Fare'].mean(), inplace = True)

    df['Sex'] = label_binarize(df['Sex'], classes=['male', 'female'])
    
    # disabled: the transformation did not prove useful
    # df['Fare'] = df['Fare'].apply(lambda x: int(round(math.log(x+1))))

    df_embarked = pd.DataFrame(label_binarize(df['Embarked'], classes = ['C', 'Q', 'S']), columns = ['Embarked_C',
                                                                                                     'Embarked_Q',
                                                                                                     'Embarked_S'])

    df = pd.concat([df, df_embarked], axis = 1, copy = False)

    return df
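Why the df['Sex'] assignment above works: with exactly two classes, label_binarize returns a single 0/1 column, with the second listed class mapped to 1. A minimal sketch:

from sklearn.preprocessing import label_binarize

print(label_binarize(['male', 'female', 'female'], classes=['male', 'female']))
# [[0]
#  [1]
#  [1]]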
Example #13
def multiclass_AUC(clf, X, Y):
    # Binarize the output
    X, Y = np.array(X), np.array(Y)
    Y = label_binarize(Y, classes=list(set(Y)))
    n_classes = Y.shape[1]

    # shuffle and split training and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.5,
                                                        random_state=0)
    # Learn to predict each class against the other
    classifier = OneVsRestClassifier(clf)
    Y_score = classifier.fit(X_train, Y_train).predict(X_test)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(Y_test[:, i], Y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(Y_test.ravel(), Y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    print "AUC for multiclass {}: {}".format(clf.__class__.__name__, roc_auc["micro"])
def evaluateOneEpoch(inputCoor, inputGraph, inputLabel, para, sess, trainOperaion):
    test_loss = []
    test_acc = []
    test_predict = []
    for i in range(len(inputCoor)):
        xTest, graphTest, labelTest = inputCoor[i], inputGraph[i], inputLabel[i]
        graphTest = graphTest.tocsr()
        labelBinarize = label_binarize(labelTest, classes=[i for i in range(para.outputClassN)])
        test_batch_size = para.testBatchSize
        for testBatchID in range(len(labelTest) // test_batch_size):
            start = testBatchID * test_batch_size
            end = start + test_batch_size
            batchCoor, batchGraph, batchLabel = get_mini_batch(xTest, graphTest, labelBinarize, start, end)
            batchWeight = uniform_weight(batchLabel)
            batchGraph = batchGraph.todense()

            feed_dict = {trainOperaion['inputPC']: batchCoor, trainOperaion['inputGraph']: batchGraph,
                         trainOperaion['outputLabel']: batchLabel, trainOperaion['weights']: batchWeight,
                         trainOperaion['keep_prob_1']: 1.0, trainOperaion['keep_prob_2']: 1.0}

            predict, loss_test, acc_test = sess.run(
                [trainOperaion['predictLabels'], trainOperaion['loss'], trainOperaion['acc']], feed_dict=feed_dict)
            test_loss.append(loss_test)
            test_acc.append(acc_test)
            test_predict.append(predict)

    test_average_loss = np.mean(test_loss)
    test_average_acc = np.mean(test_acc)

    return test_average_loss, test_average_acc, test_predict
Example #15
    def compute_rocauc(self):
        """

        :return:
        """
        # Binarize the output
        y_test = label_binarize(self.y_test, classes=list(range(self.n_classes)))

        # Compute ROC curve and ROC area for each class
        y_score = self.clf.predict_proba(self.X_test)
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(self.n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        # Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

        self.report["roc_auc"] = dict(
            fpr={str(k): v.tolist() for k, v in fpr.items()},
            tpr={str(k): v.tolist() for k, v in tpr.items()},
            roc_auc={str(k): v.tolist() for k, v in roc_auc.items()}
        )
def trainModel(data):
    model = Sequential()
    model.add(Dense(400, input_dim=(data.shape[1] - 1), init="uniform"))
    model.add(Activation("relu"))
    model.add(Dropout(0.5))
    model.add(Dense(500, init="uniform"))
    model.add(Activation("relu"))
    model.add(Dropout(0.5))
    model.add(Dense(39, init="uniform"))
    model.add(Activation("softmax"))

    cb = EarlyStopping(monitor="val_loss", patience=3, verbose=0, mode="auto")

    output = label_binarize(data[0:, 0], range(0, 39))
    print (output.shape)
    # optim = Adam(lr=0.1, beta_1=0.2, beta_2=0.7, epsilon=1e-6)
    # model.compile(loss='categorical_crossentropy',optimizer=optim)
    # model.fit(data[0:,1:].astype(np.float32),output,nb_epoch=30,batch_size=16,show_accuracy=True,validation_split=0.5,callbacks=[cb])
    # optim = Adam(lr=0.01, beta_1=0.5, beta_2=0.8, epsilon=1e-07)
    # model.compile(loss='categorical_crossentropy',optimizer=optim)
    # model.fit(data[0:,1:].astype(np.float32),output,nb_epoch=30,batch_size=16,show_accuracy=True,validation_split=0.3,callbacks=[cb])
    optim = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07)  # Keras Adam takes beta_1, not beta_l
    model.compile(loss="categorical_crossentropy", optimizer=optim)
    model.fit(
        data[0:, 1:].astype(np.float64),
        output,
        nb_epoch=30,
        batch_size=16,
        show_accuracy=True,
        validation_split=0.1,
        callbacks=[cb],
    )
    return model
Example #17
    def set_shared_variables(self, dataset, index, enable_time):
        c = np.zeros((self.batch_size, self.max_seqlen), dtype=np.int32)
        q = np.zeros((self.batch_size, ), dtype=np.int32)
        y = np.zeros((self.batch_size, self.num_classes), dtype=np.int32)
        c_pe = np.zeros((self.batch_size, self.max_seqlen, self.max_sentlen, self.embedding_size), dtype=theano.config.floatX)
        q_pe = np.zeros((self.batch_size, 1, self.max_sentlen, self.embedding_size), dtype=theano.config.floatX)
        # c_pe = np.ones((self.batch_size, self.max_seqlen, self.max_sentlen, self.embedding_size), dtype=theano.config.floatX)
        # q_pe = np.ones((self.batch_size, 1, self.max_sentlen, self.embedding_size), dtype=theano.config.floatX)

        indices = range(index*self.batch_size, (index+1)*self.batch_size)
        for i, row in enumerate(dataset['C'][indices]):
            row = row[:self.max_seqlen]
            c[i, :len(row)] = row

        q[:len(indices)] = dataset['Q'][indices]  # list of the question rows for this batch
        # The loop below builds the position-encoding adjustment matrix for one batch
        for key, mask in [('C', c_pe), ('Q', q_pe)]:
            for i, row in enumerate(dataset[key][indices]):
                sentences = self.S[row].reshape((-1, self.max_sentlen))  # expand each sentence id into its word indices, zero-padded
                for ii, word_idxs in enumerate(sentences):
                    J = np.count_nonzero(word_idxs)
                    for j in np.arange(J):
                        mask[i, ii, j, :] = (1 - (j+1)/J) - ((np.arange(self.embedding_size)+1)/self.embedding_size)*(1 - 2*(j+1)/J)

        # c_pe=np.not_equal(c_pe,0)
        # q_pe=np.not_equal(q_pe,0)

        # y[:len(indices), 1:self.num_classes] = self.lb.transform(dataset['Y'][indices])  # turns y into one-hot vectors, each of vocabulary length
        y[:len(indices), 1:self.num_classes] = label_binarize(dataset['Y'][indices], classes=self.vocab)  # turn y into one-hot vectors, each of vocabulary length
        # y[:len(indices), 1:self.embedding_size] = self.mem_layers[0].A[[self.word_to_idx(i) for i in list(dataset['Y'][indices])]]
        self.c_shared.set_value(c)
        self.q_shared.set_value(q)
        self.a_shared.set_value(y)
        self.c_pe_shared.set_value(c_pe)
        self.q_pe_shared.set_value(q_pe)
Example #18
 def transform(self, X, y=None):
     f = np.vectorize(self._replace_label)
     X_t = f(X).reshape(len(X), 1)
     if self.binarize:
         return label_binarize(X_t, classes=self.labels)
     else:
         return X_t
def cross_validate(nb_class, X, y, nb_epoch, task, labels, avg_scores, n_folds=10, evaluate=False, max_words=100,
                   stateful=False, convolutional=True, pretrained=None):
    skf = StratifiedKFold(y, n_folds=n_folds, shuffle=False, random_state=None)  # already shuffled
    for i, (train_ix, test_ix) in enumerate(skf):
        print ("Cross-validation fold: %d/%d" % (i + 1, n_folds))
        model = None  # Clearing the NN.
        model = models.construct_cnn_lstm(stateful=stateful, convolutional=convolutional, nb_class=nb_class,
                                          max_words=max_words, pretrained_embedding=copy.deepcopy(pretrained))
        # pretrained_embedding=pre_model.layers[0])
        X_train, X_test = X[train_ix], X[test_ix]
        y_train, y_test = y[train_ix], y[test_ix]
        models.train_model(model, X_train, preprocessing.label_binarize(y_train, classes=labels),
                           nb_epoch=nb_epoch,
                           evaluate=evaluate, max_words=max_words)
        if evaluate is False:
            X_test = models.pad(X_test, max_words=max_words)
        y_test_pred = model.predict_classes(X_test)
        y_test_pred = [labels[i] for i in y_test_pred]
        cm = confusion_matrix(y_test, y_test_pred, labels=labels)
        print(", ".join(labels))
        print("confusion matrix:")
        print(cm)
        scores = precision_recall_fscore_support(y_test, y_test_pred, average=None, labels=labels)
        print("precision, recall, fscore and support values for each class:")
        print(", ".join(labels))
        for x, label in enumerate(["precision", "recall", "fscore", "support"]):
            print(label, scores[x])
            for j, k in enumerate(scores[x]):
                avg_scores[x][j] += k
        print(", ".join(labels))
        cPickle.dump(cm, open("data/scores/%s_cm_cross_%d.pkl" % (task, i), "w"))
        cPickle.dump(cm, open("data/scores/%s_precision_recall_fscore_support_pos_cross_%d.pkl" % (task, i), "w"))
Example #20
def WeekDaysBinarization(column):
    column1 = [0] * len(column)
    for i in range(0, len(column)):
        r = 7
        if column[i] in weekDays.keys():
            r = weekDays[column[i]]

        column1[i] = r

    myset = set(column1)
    mm = list(myset)
    r1 = label_binarize(column1, classes=mm)
    r1 = r1[:,0:7]
    r1 = np.column_stack((r1, column1))

    weekDay = [0] * len(column1)

    for i in range(0, len(column1)):
        weekDay[i] = 0
        if column1[i] == 0 or column1[i] == 6:
            weekDay[i] = 1

    r1 = np.column_stack((r1, weekDay))

    return r1
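A hedged usage sketch, assuming the module-level weekDays dict maps day names to 0-6 with Sunday = 0 (consistent with the weekend check above); the sample column is hypothetical:

weekDays = {'Sunday': 0, 'Monday': 1, 'Tuesday': 2, 'Wednesday': 3,
            'Thursday': 4, 'Friday': 5, 'Saturday': 6}

# one-hot day columns, then the ordinal day, then the weekend flag
print(WeekDaysBinarization(['Monday', 'Saturday', 'Holiday']))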
Example #21
    def fit(self, X, y):
        if self.activation is None:
            # Useful to quantify the impact of the non-linearity
            self._activate = lambda x: x
        else:
            self._activate = self.activations[self.activation]
        rng = check_random_state(self.random_state)

        # one-of-K coding for output values
        self.classes_ = unique_labels(y)
        Y = label_binarize(y, self.classes_)

        # set hidden layer parameters randomly
        n_features = X.shape[1]
        if self.rank is None:
            if self.density == 1:
                self.weights_ = rng.randn(n_features, self.n_hidden)
            else:
                self.weights_ = sparse_random_matrix(
                    self.n_hidden, n_features, density=self.density,
                    random_state=rng).T
        else:
            # Low rank weight matrix
            self.weights_u_ = rng.randn(n_features, self.rank)
            self.weights_v_ = rng.randn(self.rank, self.n_hidden)
        self.biases_ = rng.randn(self.n_hidden)

        # map the input data through the hidden layer
        H = self.transform(X)

        # fit the linear model on the hidden layer activation
        self.beta_ = np.dot(pinv2(H), Y)
        return self
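The snippet stops at training; a hedged sketch of the matching decode step (the predict method is not shown in the original, so its exact form is an assumption):

    def predict(self, X):
        # map the input through the hidden layer, then the fitted linear read-out
        H = self.transform(X)
        scores = np.dot(H, self.beta_)
        # undo the one-of-K coding: pick the highest-scoring class
        return self.classes_[np.argmax(scores, axis=1)]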
Example #22
    def fit(self, X, y):
        '''
        Trains the model
        Arguments:
            X is a n-by-d numpy array
            y is an n-dimensional numpy array
        '''
        n, d = X.shape
        # transform y into an n-by-10 numpy array (unique_y = 10)
        num_unique_y = len(np.unique(y))
        binary_y = label_binarize(y, classes = np.unique(y))

        self.all_layers_info = np.append(np.append(d, self.layers), num_unique_y)
        # print self.all_layers_info
        self.L = len(self.all_layers_info)

        np.random.seed(28)
        # Initialize theta
        for l in range(self.L - 1):
            self.theta[l + 1] = np.random.uniform(low=-self.epsilon, high=self.epsilon, size=(self.all_layers_info[l + 1], (self.all_layers_info[l] + 1)))
            # print self.theta[l+1][0]
        # loop through epochs
        for i in range(self.numEpochs):
            self._forwardPropagation_(X)
            self._backPropagation_(binary_y)
    def logfit(self, X, y, C=1e5, tol=1e-1):
        """
        Method    :  logfit(X, y, C=1e5, tol=1e-1)
        Input     :  X: array of the shape [n_samples, nrow*ncol], it contains
                        features of every sample.
                     y: array of the shape [n_samples], it contains targets
                        (keys) of every sample.
                     C: inverse of regularization strength, must be a positive
                        float, and default value is 1e5.
                     tol: tolerance for stopping criteria, must be a positive
                        float too.
        Output    :  self, the estimator object
        """
        from sklearn import metrics
        from sklearn import preprocessing
        from sklearn import multiclass as mc
        from sklearn import linear_model as lm

        print "Start training the Logistic Regression model ..."
        # Binarize the output
        y = preprocessing.label_binarize(y, classes=range(10))
        classifier = mc.OneVsRestClassifier(lm.LogisticRegression(C=C, tol=tol))
        model = classifier.fit(X, y)
        print "Model training is done!"
        return model
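The core of logfit as a self-contained sketch on sklearn's digits data (the dataset choice is an assumption):

from sklearn.datasets import load_digits
from sklearn import preprocessing
from sklearn import multiclass as mc
from sklearn import linear_model as lm

digits = load_digits()
y_bin = preprocessing.label_binarize(digits.target, classes=range(10))
model = mc.OneVsRestClassifier(lm.LogisticRegression(C=1e5, tol=1e-1)).fit(digits.data, y_bin)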
def k_fold_model_select(features, labels, raw_classifiers, n_folds=10, weigh_samples_fn=None):
    # weigh_samples_fn is explained below
    # assumes that the raw_classifier output is in probability

    # split into training and test data
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.3,
                                                        stratify=labels,
                                                        random_state=0)


    # use stratified k-fold cross validation to select the model
    skf = StratifiedKFold(y_train, n_folds=n_folds)

    best_classifier = None
    best_score = float('-inf')

    for train_index, validation_index in skf:
        for raw_classifier in raw_classifiers:
            classifier = skl_clone(raw_classifier)
            classifier = classifier.fit(X_train[train_index], y_train[train_index])

            if weigh_samples_fn is not None:
                y_pred = classifier.predict(X_train[validation_index])
                sample_weight = weigh_samples_fn(y_train[validation_index], y_pred)
            else:
                sample_weight = None

            score = accuracy_score(classifier.predict(X_train[validation_index]), y_train[validation_index],
                                     sample_weight=sample_weight)

            if score > best_score:
                best_classifier = classifier
                best_score = score

    # compute the confusion matrix
    y_pred = best_classifier.predict(X_test)
    conf_mat = confusion_matrix(y_test, y_pred)

    # now compute the score for the test data of the best found classifier
    if weigh_samples_fn is not None:
        sample_weight = weigh_samples_fn(y_test, y_pred)
    else:
        sample_weight = None
    test_score = accuracy_score(best_classifier.predict(X_test), y_test, sample_weight=sample_weight)

    # obtain the classification report
    report = classification_report(y_test, y_pred, target_names=['cat', 'dog'], sample_weight=sample_weight)

    # obtain ROC curve
    y_test_bin = label_binarize(y_test, classes=[0, 1])
    y_prob = best_classifier.predict_proba(X_test)

    #fpr, tpr, _ = roc_curve(y_test_bin[:, 1], y_prob[:, 1])
    fpr, tpr, _ = roc_curve(y_test_bin, y_prob[:, 1])
    roc_info = (best_classifier.__class__.__name__, (fpr, tpr))

    return (test_score, report, conf_mat, roc_info, best_classifier)
def action_to_vector(x, n_classes, p=0):  # x has shape (batch_size, path_length)
    # p is the probability that a position's label is kept as a normal one-hot vector
    result = np.zeros([x.shape[0], x.shape[1], n_classes])
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            if np.random.rand()<p and j!=x.shape[1]-1:
                result[i, j] = label_binarize([int(x[i, j])], classes=range(n_classes))[0]
    return np.int32(result)
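A hedged usage sketch with illustrative values; with p=1 every position except the last in each path gets a one-hot row:

import numpy as np

actions = np.array([[0, 2, 1], [3, 3, 0]])  # batch of 2 paths, length 3
vecs = action_to_vector(actions, n_classes=4, p=1.0)
print(vecs.shape)  # (2, 3, 4)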
def plot_roc_curve(X_test_label,test_predicted,class_names):
    X_test_label_binary = label_binarize(X_test_label, classes=class_names)
    test_predicted_binary = label_binarize(test_predicted, classes=class_names)
    # note: roc_curve expects binary targets, so this works only when class_names has two entries
    false_positive_rate, true_positive_rate, thresholds = roc_curve(X_test_label_binary, test_predicted_binary)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.figure()
    plt.title('Receiver Operating Characteristic')
    #plt.imshow(cmap=plt.cm.GnBu)
    plt.plot(false_positive_rate, true_positive_rate, 'b',
             label='ROC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
def questions_to_y(qs, topic_tags, parents=False):
    if parents:
        class_indices = range(len(unique_parents(qs)))
    else:
        class_indices = range(len(topic_tags))

    return label_binarize(questions_to_topic_index(qs, topic_tags, parents),
                          class_indices)
Example #28
def roc(y_true, y_score, ax=None):
    """
    Plot ROC curve.

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Correct target values (ground truth).
    y_score : array-like, shape = [n_samples] or [n_samples, 2] for binary
              classification or [n_samples, n_classes] for multiclass

        Target scores (estimator predictions).
    ax: matplotlib Axes
        Axes object to draw the plot onto, otherwise uses current Axes

    Notes
    -----
    It is assumed that the y_score parameter columns are in order. For example,
    if ``y_true = [2, 2, 1, 0, 0, 1, 2]``, then the first column in y_score
    must contain the scores for class 0, second column for class 1 and so on.


    Returns
    -------
    ax: matplotlib Axes
        Axes containing the plot

    Examples
    --------
    .. plot:: ../../examples/roc.py

    """
    if ax is None:
        ax = plt.gca()

    # get the number of classes based on the shape of y_score
    y_score_is_vector = is_column_vector(y_score) or is_row_vector(y_score)
    if y_score_is_vector:
        n_classes = 2
    else:
        _, n_classes = y_score.shape

    # check data shape?

    if n_classes > 2:
        # convert y_true to binary format
        y_true_bin = label_binarize(y_true, classes=np.unique(y_true))
        _roc_multi(y_true_bin, y_score, ax=ax)
        for i in range(n_classes):
            _roc(y_true_bin[:, i], y_score[:, i], ax=ax)
    else:
        if y_score_is_vector:
            _roc(y_true, y_score, ax)
        else:
            _roc(y_true, y_score[:, 1], ax)

    # raise error if n_classes = 1?
    return ax
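A hedged usage sketch with an illustrative 3-class score matrix (assumes the module's _roc and _roc_multi helpers are importable alongside roc):

import numpy as np
import matplotlib.pyplot as plt

y_true = np.array([0, 1, 2, 2, 1, 0])
y_score = np.array([[.8, .1, .1], [.2, .6, .2], [.1, .2, .7],
                    [.2, .3, .5], [.3, .5, .2], [.6, .3, .1]])
ax = roc(y_true, y_score)
plt.show()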
Example #29
def plotroc(traindata, trainlabel, testdata, testlabel, labels, rocfilename, cmfilename):
    print('# plot ROC curve')
    print('## train data shape: %s' % (traindata.shape,))
    #clf = LogisticRegression(C=0.0005)
    clf = RandomForestClassifier(10, oob_score=True, n_jobs=-1)
    clf.fit(traindata, trainlabel)
    print('## test data shape: %s' % (testdata.shape,))
    predlabel = clf.predict(testdata)
    predprob = clf.predict_proba(testdata)
    cm = confusion_matrix(testlabel, predlabel)
    print(cm)
    plotconfusionmatrix(cm, labels, cmfilename)
    print(classification_report(testlabel, predlabel, target_names=labels))

    testlabel = label_binarize(testlabel, classes=range(1,13))
    predlabel = label_binarize(predlabel, classes=range(1,13))
    nclasses = predlabel.shape[1]
    fpr = dict()
    tpr = dict()
    rocauc = dict()
    for i in xrange(nclasses):
        fpr[i], tpr[i], _ = roc_curve(testlabel[:,i], predprob[:,i])
        rocauc[i] = auc(fpr[i], tpr[i])

    fpr["micro"], tpr["micro"], _ = roc_curve(testlabel.ravel(), predprob.ravel())
    rocauc["micro"] = auc(fpr["micro"], tpr["micro"])

    plt.figure()
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'.format(rocauc["micro"]))
    for i in range(nclasses):
        plt.plot(fpr[i], tpr[i],
                 label='{0} (area = {1:0.2f})'.format(labels[i], rocauc[i]))

    # axis decoration and output belong outside the per-class loop;
    # save before show() so the written figure is not empty
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig(rocfilename)
    plt.show()
def analyze_pipeline(model, X, y, folds=3):
	# X, y, X_test = load() # Load model with own load function
	# y = y # Reload as numpy
	# y = np.array([Y.score for Y in y])
	y = [Y.score for Y in y]
	y = np.array(y)
	print y.shape
	y = label_binarize(y, classes=[0, 1, 2, 3, 4])
	print y.shape
	# y = label_binarize(y, classes=[0, 1, 2, 3, 4])
	# BINARIZE HERE
	# X = np.array # Reload as numpy
	# if not model: # If no model is specified, call load_model function
 #    	model = load_model()

	# Manual x-validation to accumulate actual
	# print y.shape
	cv_skf = KFold(5, n_folds=folds, shuffle=True, random_state=123)  # old-style KFold(n, ...); n=5 looks like it should be len(y)
	print cv_skf
	# y = np.array(y)
	# Creates stratified test set from training set
	scores = [] # Actual scores
	conf_mat = np.zeros((2, 2)) # Binary classification, confusion matrix
	false_pos = Set() # False positive set
	false_neg = Set() # False negative set

	for train_i, val_i in cv_skf:
	    X_train, X_val = X[train_i], X[val_i]
	    y_train, y_val = y[train_i], y[val_i]

	    print "Fitting fold..."
	    model.fit(X_train, y_train)

	    print "Predicting fold..."

	    y_pprobs = model.predict_proba(X_val)       # Predicted probabilities
	    y_plabs = np.squeeze(model.predict(X_val))  # Predicted class labels
	    print y_val
	    scores.append(roc_auc_score(y_val, y_pprobs[:, 1]))
	    confusion = confusion_matrix(y_val, y_plabs)
	    conf_mat += confusion

	    # Collect indices of false positive and negatives
	    fp_i = np.where((y_plabs==1) & (y_val==0))[0]
	    fn_i = np.where((y_plabs==0) & (y_val==1))[0]
	    false_pos.update(val_i[fp_i])
	    false_neg.update(val_i[fn_i])

	    print "Fold score: ", scores[-1]
	    print "Fold CM: \n", confusion

	print "\nMean score: %0.2f (+/- %0.2f)" % (np.mean(scores), np.std(scores) * 2)
	conf_mat /= folds
	print "Mean CM: \n", conf_mat
	print "\nMean classification measures: \n"
	pprint(class_report(conf_mat))
	return scores, conf_mat, {'fp': sorted(false_pos), 'fn': sorted(false_neg)}
Example #31
def classwise_reliability_diagram(probs, labels, class_idx, bins=15):
    assert labels.shape[0] == probs.shape[0], 'Label/probs shape mismatch'

    batch_size, num_classes = probs.shape
    onehot_labels = torch.from_numpy(
        label_binarize(labels, classes=np.arange(num_classes))).float()

    # Predicted probabilities / one-hot labels for the given class
    class_probs = probs[:, class_idx]
    class_labels = onehot_labels[:, class_idx]

    counts, bin_edges = np.histogram(class_probs, bins=bins, range=[0., 1.])
    indices = np.digitize(class_probs, bin_edges, right=True)
    bin_probs = np.array([
        torch.mean(class_probs[indices == j]).item()
        for j in range(1, bins + 1)
    ])
    bin_proportions = np.array([
        torch.mean(class_labels[indices == i]).item()
        for i in range(1, bins + 1)
    ])

    this_class_ece = (1. / batch_size) * np.sum([
        counts[i] * np.abs(bin_probs[i] - bin_proportions[i])
        for i in range(bins) if counts[i] > 0
    ])

    # ---- Setting up figure
    plt.rcParams.update({'font.size': 14})

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.set_xlabel('Class Score')
    ax.set_ylabel('Accuracy')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xticks(np.linspace(0, 1, 6))
    ax.set_yticks(np.linspace(0, 1, 6))

    ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
    # ----- Plotting data
    for i in range(bins):
        x = [bin_edges[i], bin_edges[i], bin_edges[i + 1], bin_edges[i + 1]]
        y = [0, bin_proportions[i], bin_proportions[i], 0]
        ax.fill(x, y, 'b', alpha=0.6, edgecolor='black')

    ax.text(0.01,
            .875,
            'Class {} CE: {:.3f}'.format(class_idx, this_class_ece),
            size=15)

    score_hist_ax = ax.twinx()
    score_hist_ax.hist(class_probs,
                       density=True,
                       label='Score distr.',
                       color='orange',
                       alpha=0.7,
                       bins=20)
    score_hist_ax.set_ylim(0, 35)
    score_hist_ax.legend(loc='upper left')
    score_hist_ax.set_yticks([])

    return fig
Example #32
def test(test_loader, featureExtractor, model, epoch, device, cv):
    model.eval()  # switch to eval mode
    groundTruth = []
    prediction_max = []
    prediction_prob = []
    test_correct = 0
    test_total = 0
    with torch.no_grad():
        for i, videoFrames in enumerate(tqdm(test_loader)):
            label = videoFrames['label'].to(device)
            videoFrames = torch.squeeze(videoFrames['videoFrames']).to(device)
            length = videoFrames.shape[0]
            Outputs = []

            if length < 16:
                lack = 16 - length
                repeat_frames = videoFrames[-1:, ...].repeat(lack, 1, 1, 1)
                videoFrames = torch.cat((videoFrames, repeat_frames), 0)

            circle = int(length / 8) - 1
            for k in range(circle):
                start = 0 + 8 * k
                end = 16 + 8 * k
                features = featureExtractor(videoFrames[start:end,
                                                        ...].float())
                output, hidden = model(features.unsqueeze(0))
                output_mean = torch.mean(output,
                                         dim=0)  # one serie of frames = 16
                Outputs.append(output_mean.data.cpu().numpy().tolist()
                               )  # All series of frames

            Outputs = torch.Tensor(Outputs)

            if Outputs.shape[0] > 1:
                outputs_average = torch.mean(Outputs, dim=0).unsqueeze(
                    0)  # average of all series' outputs
            else:
                outputs_average = Outputs  # a single series: use it as-is

            groundTruth.append(label.item())
            _, predicted = torch.max(outputs_average.data, 1)
            prediction_max.append(predicted.item())
            prediction_prob_b = F.softmax(outputs_average.data, dim=1)
            prediction_prob.append(
                prediction_prob_b.data.numpy().reshape(6).tolist())

            test_total += label.size(0)
            test_correct += (predicted == label.data.cpu()).sum().item()

        accuracy = accuracy_score(prediction_max, groundTruth)
        f1 = f1_score(prediction_max, groundTruth, average="weighted")
        label = label_binarize(groundTruth, classes=list(range(6)))
        auc = roc_auc_score(label, prediction_prob, average='micro')
        print(
            f"CV {cv}/10, Epoch {epoch}/100, accuracy = {accuracy}, F1-Score = {f1}, AUC = {auc}",
        )

        test_accuracy = 100 * test_correct / test_total
        print(
            'CV = %d, Epoch %d, Accuracy of the network on the Test images: %d'
            % (cv, epoch, test_accuracy))

        # Raw
        df = pd.DataFrame(data={
            "pnn_prediction": prediction_max,
            "pnn_groundtruth": groundTruth
        })
        df.to_csv(
            f"./Prediction_202106/CV_{cv}_Epoch_{epoch}_ACC_{test_accuracy}_eval_pnn_2.csv"
        )

        pro = np.array(prediction_prob)
        df2 = pd.DataFrame(pro)
        df2.to_csv(
            f"./Prediction_202106/CV_{cv}_Epoch_{epoch}_ACC_{test_accuracy}_Categorical_lstm_6pnn_202106_2.csv"
        )
        print(f"save cv {cv}")
        return [accuracy, f1, auc]
                    clf = BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=3),
                                            n_estimators=10,
                                            max_samples=0.5,
                                            max_features=0.5)
                elif j == 3:
                    clf = BaggingClassifier(base_estimator=MLPClassifier(hidden_layer_sizes=(100),
                                            activation='relu', solver='adam', batch_size=128,
                                            alpha=1e-4, learning_rate_init=1e-3, learning_rate='adaptive',
                                            tol=1e-4, max_iter=200),
                                            n_estimators=10,
                                            max_samples=0.5,
                                            max_features=0.5)
                elif j == 4:
                    clf = BaggingClassifier(base_estimator=LinearSVC(penalty='l2', random_state=0, tol=1e-4),
                                            n_estimators=10,
                                            max_samples=0.5,
                                            max_features=0.5)
                skf = StratifiedKFold(n_splits=10)
                skf_accuracy = []
                for train, test in skf.split(X, y):
                    clf.fit(X[train], y[train])
                    if n_classes.size < 3:
                        skf_accuracy.append(roc_auc_score(y[test], clf.predict_proba(X[test])[:, 1] if j != 4 else clf.decision_function(X[test]), average='micro'))
                    else:
                        ytest_one_hot = label_binarize(y[test], classes=n_classes)
                        skf_accuracy.append(roc_auc_score(ytest_one_hot, clf.predict_proba(X[test]) if j != 4 else clf.decision_function(X[test]), average='micro'))
                accuracy = np.mean(skf_accuracy)
                of.write(f'{accuracy:.6f}|')
                print(f'{time.time() - start_time:.3f}s')
            of.write('\n')
Example #34
def plot_roc(y_true,
             y_score,
             text='',
             linestyle='-',
             classes=None,
             detail=False):
    """
    plot roc, support for multi-class
    details: http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html

    :param y_true: shape=[n_samples]
    :param y_score: shape=[n_samples, n_classes]
    :param classes: list
    :return:
    """
    unique_classes = set(y_true)
    if not classes:
        classes = unique_classes

    # Binarize the output
    y_true = label_binarize(y_true, classes=classes)
    n_classes = y_true.shape[1]

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    ##############################################################################
    # Plot ROC curves for the multiclass problem

    # Compute macro-average ROC curve and ROC area

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    #    plt.figure()
    #    plt.plot(fpr["micro"], tpr["micro"],
    #             label=text + ' micro-average ROC curve (area = {0:0.8f})'
    #                          ''.format(roc_auc["micro"]),
    #             linewidth=2)

    plt.plot(fpr["macro"],
             tpr["macro"],
             linestyle,
             label=text + ' (area = {0:0.8f})'
             ''.format(roc_auc["macro"]),
             linewidth=2)

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(
        'Some extension of Receiver operating characteristic to multi-class')
    plt.legend(loc="lower right")
    #    plt.show()

    if detail:
        for i in range(n_classes):
            plt.plot(fpr[i],
                     tpr[i],
                     label='ROC curve of class {0} (area = {1:0.8f})'
                     ''.format(classes[i], roc_auc[i]))

        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(
            'Some extension of Receiver operating characteristic to multi-class'
        )
        plt.legend(loc="lower right")
        plt.show()
Example #35
def plot_model(X_train_scaled, y_train, X_test_scaled, y_test, clf):

    y_predicted = clf.predict(X_test_scaled)
    y_train_preds = clf.predict(X_train_scaled)

    unique_classes = [1, 2, 10, 15]

    probabilities = clf.predict_proba(X_test_scaled)

    # Binarize the output
    y_test_binarized = label_binarize(y_test, classes=[1, 2, 10, 15])
    n_classes = y_test_binarized.shape[1]

    print(clf)
    print("\n Classification report : \n",
          classification_report(y_test, y_predicted))
    print("Test Accuracy   Score : {:.4f}".format(
        accuracy_score(y_test, y_predicted)))
    #confusion matrix
    conf_matrix = confusion_matrix(y_test, y_predicted)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    thresholds = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], thresholds[i] = roc_curve(y_test_binarized[:, i],
                                                  probabilities[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    ## Calculate MultiClass AUC
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    print("Multi-Class Area Under the Curve: {:.4f}".format(roc_auc['macro']))
    print("\n")

    print('Classwise Area Under the Curves')
    for i in range(len(unique_classes)):
        if i == 0:
            auc_name = 'Normal- '
        else:
            auc_name = 'Disease Type ' + str(i) + '- '

        temp_auc = round(roc_auc[i], 2)
        print(auc_name + str(temp_auc))
    print('\n')

    #plot confusion matrix
    trace1 = go.Heatmap(z=conf_matrix,
                        x=[
                            "No Disease", 'Disease Class 1', 'Disease Class 2',
                            'Disease Class 3'
                        ],
                        y=[
                            "No Disease", 'Disease Class 1', 'Disease Class 2',
                            'Disease Class 3'
                        ],
                        showscale=False,
                        colorscale="Picnic",
                        name="matrix")

    #subplots
    fig = tls.make_subplots(rows=3,
                            cols=2,
                            specs=[[{}, None], [{}, {}], [{}, {}]],
                            subplot_titles=('Confusion Matrix', 'ROC 1',
                                            'ROC 2', 'ROC 3', 'ROC 4'))

    # fig = tls.make_subplots(rows=3, cols=2)
    fig.append_trace(trace1, 1, 1)

    for i in range(n_classes):
        trace2_temp = go.Scatter(x=fpr[i],
                                 y=tpr[i],
                                 name="Roc : " + str(roc_auc[i]),
                                 mode='lines+text',
                                 text=['AUC: ' + str(round(roc_auc[i], 2))],
                                 textposition='top right',
                                 textfont=dict(family="sans serif",
                                               size=18,
                                               color="DarkSeaGreen"),
                                 line=dict(color=('rgb(22, 96, 167)'),
                                           width=2))
        trace3_temp = go.Scatter(x=[0, 1],
                                 y=[0, 1],
                                 line=dict(color=('rgb(205, 12, 24)'),
                                           width=2,
                                           dash='dot'))
        if i == 0:
            fig.append_trace(trace2_temp, 2, 1)
            fig.append_trace(trace3_temp, 2, 1)
        elif i == 1:
            fig.append_trace(trace2_temp, 2, 2)
            fig.append_trace(trace3_temp, 2, 2)
        elif i == 2:
            fig.append_trace(trace2_temp, 3, 1)
            fig.append_trace(trace3_temp, 3, 1)
        else:
            fig.append_trace(trace2_temp, 3, 2)
            fig.append_trace(trace3_temp, 3, 2)

    fig['layout'].update(showlegend=False,
                         title="Model performance",
                         autosize=False,
                         height=900,
                         width=800,
                         plot_bgcolor='rgba(240,240,240, 0.95)',
                         paper_bgcolor='rgba(240,240,240, 0.95)',
                         margin=dict(b=195))
    for i in [2, 3, 4, 5]:
        fig["layout"]["xaxis" + str(i)].update(
            dict(title="false positive rate"))
        fig["layout"]["yaxis" + str(i)].update(
            dict(title="true positive rate"))

    iplot(fig)

from sklearn.datasets import load_digits
from sklearn.metrics import roc_curve, auc
import numpy as np
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in newer releases

digits = load_digits()
x = digits.data
y = digits.target
y = label_binarize(y, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
n_classes = y.shape[1]
for i in range(0, 5):
    print()
    print("round:", i)
    X_train, X_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=.2,
                                                        random_state=0)
    classifier = svm.SVC(probability=True)
    parameters = {
        'kernel': ('rbf', 'linear', 'poly', 'sigmoid'),
        'C': [1, 10, 100, 1000],
        'degree': np.arange(2, 11),
        'gamma': np.arange(1e-4, 1e-2),
import numpy as np
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from sklearn.metrics import roc_auc_score

# import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target

# binarize the output
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]

# Add noisy features to make the problem harder
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.5,
                                                    random_state=0)

# learn to predict each class against the other
classifier = OneVsRestClassifier(
    svm.SVC(kernel='linear', probability=True, random_state=random_state))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)
from sklearn import svm
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize

digits = load_digits()

x, y = digits.data, digits.target
print(y)
y = label_binarize(y, classes=list(range(10)))
print('------------------------------')
print(y)
x_train, x_test, y_train, y_test = train_test_split(x, y)
model = OneVsRestClassifier(svm.SVC(kernel='linear'))
clf = model.fit(x_train, y_train)
print(clf.score(x_train, y_train))
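A natural follow-up, using the same objects as above, is to score the held-out split as well:

print(clf.score(x_test, y_test))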
Example #39
y_true = []
y_pred = []

#----------- create y_true -------------

for allow in os.listdir(allow_path):  # allow
    for i in os.listdir(allow_path + '/' + allow):
        y_true.append(anchor_name.index(allow))

for reject in os.listdir(reject_path):  # reject
    for i in os.listdir(reject_path + '/' + reject):
        y_true.append(5)

y_true = np.array(y_true)
Y_true = label_binarize(y_true, classes=[i for i in range(nb_classes)])

print('k=', y_true.shape)

#----------- create y_pred -------------

ori_img_array = []

origin_model = load_model(
    './model_with5/SiameseResnet_mc2_model/SiameseResnet_mc2_stable_.h5')

fix_model = create_CaptureFeature_model((1, 32, 32))
fix_model.set_weights(origin_model.get_weights())  #.layers[3]
relation_model = load_model('./model_with5/test_by_test.h5')

#--- fix_feature ---
Example #40
 def transformer_binarize(y_true):
     return label_binarize(y_true, classes=classes)
Example #41
         k_scores2,
         label='gini',
         color='cornflowerblue',
         linestyle=':',
         linewidth=4)

# plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlabel('number of estimators')
plt.ylabel('Cross-Validated F1')
plt.title('Random Forest')
plt.legend(loc="lower right")
plt.show()

iris = datasets.load_iris()
X = iris.data
y = label_binarize(iris.target, classes=[0, 1, 2])
n_classes = y.shape[1]
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape

# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.5,
                                                    random_state=0)
matrix = np.array([[635, 35], [17, 324]])
# Learn to predict each class against the other
classifier = OneVsRestClassifier(
    svm.SVC(kernel='linear', probability=True, random_state=random_state))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)
Example #42
X = train[[str(i) for i in range(4096)]].values
Y = np.array(train['label'].values, dtype=np.int32)

test = pd.read_csv('ts2d3dnew.csv')
#test = pd.read_csv('tsIITD.csv')

X_test = test[[str(i) for i in range(4096)]].values
Y_test = np.array(test['label'].values, dtype=np.int32)
print(Y_test)
#Y_test = label_binarize(Y_test, classes=[i for i in range(230)])

#Y = label_binarize(Y, classes=[i for i in range(230)])
#n_classes = 230

Y_test = label_binarize(Y_test, classes=[i for i in range(177)])

Y = label_binarize(Y, classes=[i for i in range(177)])
n_classes = 177

random_state = np.random.RandomState(0)
n_samples, n_features = X.shape

classifier = OneVsRestClassifier(
    svm.SVC(kernel='poly', probability=True, random_state=random_state))
y_score = classifier.fit(X, Y).decision_function(X_test)

fpr = dict()
tpr = dict()
roc_auc = dict()
Example #43
def run_prob_based_train_test_kfold_roc_curve_plot(classifier,
                                                   x,
                                                   y,
                                                   is_plot_enabled=True,
                                                   discard_low_pred=False):
    min_discard_prob = 0.2
    max_discard_prob = 0.8
    n_splits = 10
    y = label_binarize(y, classes=[0, 1])
    x, y = shuffle(x, y)
    cv = StratifiedKFold(n_splits=n_splits)
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    i = 0
    logger.info("### " + str(n_splits) + "-fold started ###")
    cum_f1_score = 0
    try:
        cnt = 0

        for train, test in cv.split(x, y):

            logger.info("## fold: " + str(i + 1) + "started")
            x = np.array(x)
            y = np.array(y)
            X_train = x[train]
            y_train = y[train]
            X_test = x[test]
            y_test = y[test]

            classifier.fit(X_train, y_train)
            probas_ = classifier.predict_proba(X_test)

            # probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
            # Compute ROC curve and area the curve
            y_pred = probas_[:, 1]
            if discard_low_pred:
                y_test, y_pred = discard_low_pred_prob_prediction_couple(
                    y_test, y_pred, min_discard_prob, max_discard_prob)
            print_false_predicted_entries(X_test, y_pred, y_test, True)
            cum_f1_score += print_evaluation_stats(y_test, y_pred, True)

            fpr, tpr, thresholds = roc_curve(y_test, y_pred)
            tprs.append(interp(mean_fpr, fpr, tpr))
            tprs[-1][0] = 0.0
            roc_auc = auc(fpr, tpr)
            aucs.append(roc_auc)
            if is_plot_enabled:
                plt.plot(fpr,
                         tpr,
                         lw=1,
                         alpha=0.3,
                         label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

            i += 1
            logger.info("## fold: " + str(i) + " completed")

        logger.info("Average weighted F1-score: " +
                    str(cum_f1_score / n_splits))
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)
        logger.info("Mean AUC: " + str(mean_auc))
        if is_plot_enabled:
            plt.plot([0, 1], [0, 1],
                     linestyle='--',
                     lw=2,
                     color='r',
                     label='Luck',
                     alpha=.8)
            plt.plot(mean_fpr,
                     mean_tpr,
                     color='b',
                     label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' %
                     (mean_auc, std_auc),
                     lw=2,
                     alpha=.8)

            std_tpr = np.std(tprs, axis=0)
            tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
            tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
            plt.fill_between(mean_fpr,
                             tprs_lower,
                             tprs_upper,
                             color='grey',
                             alpha=.2,
                             label=r'$\pm$ 1 std. dev.')

            plt.xlim([-0.05, 1.05])
            plt.ylim([-0.05, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Stratified k-fold with k=' + str(n_splits))
            plt.legend(loc="lower right")
            plt.show()
    except Exception as e:
        logger.error(e)
Example #44

from kernels import chi_square_kernel, histogram_intersection_kernel, zero_kernel, multichannel_wrapper

if __name__ == '__main__':
    bowFilename = 'lab4/bow.pkl'

    if not os.path.exists(bowFilename):
        raise IOError("No such file '%s'." % bowFilename)

    x, y, tag = cPickle.load(open(bowFilename, 'rb'))

    x = np.array(x, dtype=float)
    y = np.array(y, dtype=int) - 1

    y = label_binarize(y, classes=range(7))
    n_classes = 7
    # train_test_split returns six arrays when given three inputs
    X_train, X_test, y_train, y_test, tag_train, tag_test = train_test_split(
        x, y, tag)

    print("Training SVM")
    TIMES = 10
    l = []
    for i in range(TIMES):
        print('\rFitting %d/%d ' % (i, TIMES), end='')
        sys.stdout.flush()

        # resampling
        classifier = OneVsRestClassifier(
            svm.SVC(kernel=multichannel_wrapper(2, chi_square_kernel),
                    probability=True))
        X_train, X_test, y_train, y_test, tag_train, tag_test = train_test_split(
            x, y, tag)
Example #45
    print(i)
    data = data_r[[i, 'label']]

    MU = data['label'] == "group1"
    WT = data['label'] == "group2"

    #ADDITIONAL CODE
    X_MU = data[MU].drop('label', axis=1)
    X_WT = data[WT].drop('label', axis=1)
    group1SampleSize = len(X_MU)
    group2SampleSize = len(X_WT)

    y_MU = data[MU].label
    y_WT = data[WT].label

    y_MU = label_binarize(y_MU, classes=["group1", "group2"])
    y_WT = label_binarize(y_WT, classes=["group1", "group2"])

    #ADD NOISY FEATURES TO MAKE THE PROBLEM HARDER
    random_state = np.random.RandomState(0)
    n_samples, n_features = X_MU.shape
    X_MU = np.c_[X_MU, random_state.randn(n_samples, 1 * n_features)]

    n_samples, n_features = X_WT.shape
    X_WT = np.c_[X_WT, random_state.randn(n_samples, 1 * n_features)]

    #SMOTE PARAMETERIZATION
    X_MU_train, X_MU_test, y_MU_train, y_MU_test = train_test_split(
        X_MU, y_MU, test_size=0.3)
    X_WT_train, X_WT_test, y_WT_train, y_WT_test = train_test_split(
        X_WT, y_WT, test_size=0.3)
Example #46
r1 = algo.score(x_test, y_test)
neighbors = nbs.NearestNeighbors(n_neighbors=3)
neighbors.fit(x_train, y_train)

myneighbors = neighbors.kneighbors(x_train, 3, return_distance=True)
print('*' * 100)
print(myneighbors)

#Logistic Regression
logistic = LogisticRegression(penalty='l2', fit_intercept=True, max_iter=100)
logistic.fit(x_train, y_train)
r2 = logistic.score(x_test, y_test)
print('Logistic regression accuracy: %f' % r2)

predit2 = logistic.predict(x_test)
y_label = label_binarize(y_test, classes=[1, 2, 3])
print(y_label)
fpr, tpr, _ = roc_curve(y_label.ravel(), algo.predict_proba(x_test).ravel())
aucValue = auc(fpr, tpr)
print(aucValue)

fpr_log, tpr_log, _ = roc_curve(y_label.ravel(),
                                logistic.predict_proba(x_test).ravel())
auc_log = auc(fpr_log, tpr_log)
x_test_len = np.arange(len(x_test))
# plt.plot(x_test_len, y_test, 'ro', markersize=7, label='true values')
# plt.plot(x_test_len, predit, 'bo', markersize=5, label='KNN predictions')
# plt.plot(x_test_len, predit2, 'ko', markersize=3, label='logistic predictions')
# plt.title('Iris classification accuracy: KNN=%f Logis=%f' % (r1, r2))
# plt.legend(loc='lower right')
Example #47
    def validate(self, verbose=True, roc=False):
        self.network.eval()

        if self._test_loader is None:
            with torch.no_grad():
                self._test_loader = self._patch_loader(
                    self.args.dataset_path + VALIDATION_PATH, False)

        val_loss = 0
        correct = 0
        classes = len(LABELS)

        tp = [0] * classes
        tpfp = [0] * classes
        tpfn = [0] * classes
        precision = [0] * classes
        recall = [0] * classes
        f1 = [0] * classes

        if verbose:
            print('\nEvaluating....')

        labels_true = []
        labels_pred = np.empty((0, classes))

        for images, labels in self._test_loader:

            if self.args.cuda:
                images, labels = images.cuda(), labels.cuda()

            with torch.no_grad():
                output = self.network(Variable(images))

            val_loss += F.nll_loss(output,
                                   Variable(labels),
                                   size_average=False).data.item()
            _, predicted = torch.max(output.data, 1)
            correct += torch.sum(predicted == labels)

            labels_true = np.append(labels_true, labels)
            labels_pred = np.append(labels_pred,
                                    torch.exp(output.data).cpu().numpy(),
                                    axis=0)

            for label in range(classes):
                t_labels = labels == label
                p_labels = predicted == label
                # count true positives directly (clearer than the original
                # t_labels == (p_labels * 2 - 1) comparison trick)
                tp[label] += torch.sum(t_labels & p_labels)
                tpfp[label] += torch.sum(p_labels)
                tpfn[label] += torch.sum(t_labels)

        for label in range(classes):
            precision[label] += (tp[label] / (tpfp[label] + 1e-8))
            recall[label] += (tp[label] / (tpfn[label] + 1e-8))
            f1[label] = 2 * precision[label] * recall[label] / (
                precision[label] + recall[label] + 1e-8)

        val_loss /= len(self._test_loader.dataset)
        acc = 100. * correct / len(self._test_loader.dataset)

        if roc:
            labels_true = label_binarize(labels_true, classes=range(classes))
            for lbl in range(classes):
                fpr, tpr, _ = roc_curve(labels_true[:, lbl], labels_pred[:,
                                                                         lbl])
                roc_auc = auc(fpr, tpr)
                plt.plot(fpr,
                         tpr,
                         lw=2,
                         label='{} (AUC: {:.1f})'.format(
                             LABELS[lbl], roc_auc * 100))

            plt.xlim([0, 1])
            plt.ylim([0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.legend(loc="lower right")
            plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
            plt.title('Receiver Operating Characteristic')
            plt.show()

        if verbose:
            print('Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
                val_loss, correct, len(self._test_loader.dataset), acc))

            for label in range(classes):
                print(
                    '{}:  \t Precision: {:.2f},  Recall: {:.2f},  F1: {:.2f}'.
                    format(LABELS[label], precision[label], recall[label],
                           f1[label]))

            print('')

        return acc
def permission_roc():
    # get data
    train_data, permission_list = db_tool.get_new_train_data()
    y = train_data['target']

    # sklearn.cross_validation was removed; model_selection provides train_test_split
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        train_data['permission-data'],
        train_data['target'],
        test_size=0.3,
        random_state=1)

    selector = SelectKBest(chi2, k=15)
    x_train = selector.fit_transform(x_train, y_train)
    x_test = selector.transform(x_test)

    y_train = label_binarize(
        y_train,
        classes=['music-audio', 'personalization', 'social', 'communication'])
    y_test = label_binarize(
        y_test,
        classes=['music-audio', 'personalization', 'social', 'communication'])
    n_classes = 4

    clss = [OneVsRestClassifier(MultinomialNB())]
    for cls in clss:
        model = cls.fit(x_train, y_train)
        # # valid the model
        y_score = model.predict_proba(x_test)

        fpr = dict()
        tpr = dict()
        roc_auc = dict()

        for i in range(4):
            fpr[i], tpr[i], _ = metrics.roc_curve(y_test[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(),
                                                  y_score.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

        # # Then interpolate all ROC curves at these points
        # mean_tpr = np.zeros_like(all_fpr)
        # for i in range(n_classes):
        # mean_tpr += interp(all_fpr, fpr[i], tpr[i])
        #
        # # Finally average it and compute AUC
        # mean_tpr /= n_classes
        #
        # fpr["macro"] = all_fpr
        # tpr["macro"] = mean_tpr
        # roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
        #
        # plt.figure()
        # plt.plot(fpr["micro"], tpr["micro"],
        # label='micro-average ROC curve (area = {0:0.2f})'
        # ''.format(roc_auc["micro"]),
        # linewidth=2)
        #
        # plt.plot(fpr["macro"], tpr["macro"],
        # label='macro-average ROC curve (area = {0:0.2f})'
        #                ''.format(roc_auc["macro"]),
        #          linewidth=2)

        for i in range(4):
            plt.plot(fpr[i],
                     tpr[i],
                     label='ROC curve of class {0} (area = {1:0.2f})'
                     ''.format(i, roc_auc[i]))

        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(
            'Some extension of Receiver operating characteristic to multi-class'
        )
        plt.legend(loc="lower right")
        plt.show()
Example #49
import numpy as np
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import label_binarize

def average_precision(prob_np, target_np):
    num_class = prob_np.shape[1]
    label = label_binarize(target_np, classes=list(range(num_class)))
    with np.errstate(divide='ignore', invalid='ignore'):
        return average_precision_score(label, prob_np, average=None)  # per-class AP
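
A minimal usage sketch (the random inputs below are illustrative assumptions, not part of the original): the helper expects an (n_samples, n_classes) probability matrix plus integer targets and returns one average-precision value per class.

rng = np.random.RandomState(0)
probs = rng.dirichlet(np.ones(3), size=100)  # fake 3-class probability rows
targets = rng.randint(0, 3, size=100)        # fake integer labels
print(average_precision(probs, targets))     # array of 3 per-class AP scores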
Example #50
def main():

    mainData = pickle.load(open("../../../Data/XY_ARXIV.p", "rb"))

    X = mainData[0]

    Y = mainData[1]

    X_test = mainData[2]

    Y_test = mainData[3]

    del mainData

    nb = pickle.load(open("../../../Data/nbARXIVModel.p", "rb"))

    # Make prediction
    print("MAKING PREDICTIONS")
    Y_pred = nb.predict(X_test)

    y_score = nb.predict_proba(X_test)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    for i in range(len(LABELS)):
        fpr[i], tpr[i], _ = metrics.roc_curve(Y_test,
                                              y_score[:, i],
                                              pos_label=i)
        roc_auc[i] = metrics.auc(fpr[i], tpr[i])

    plt.figure()
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    plt.title(
        'ARXIV: Naive Bayes Model Receiver operating characteristic curve')

    # Plot of a ROC curve for a specific class
    for i in range(len(LABELS)):
        plt.plot(fpr[i],
                 tpr[i],
                 label='ROC curve for label ' + str(i + 1) + " " +
                 list(LABELS.keys())[i] + ' (area = %0.2f)' % roc_auc[i])

    plt.legend(loc="lower right")

    plt.show()

    with open("../../../Data/nbARXIVPredicted.p", "wb") as handle:

        pickle.dump(Y_pred, handle)

    with open("../../../Data/ROC_Curves/NB ARXIV.p", "wb") as handle:
        curve = metrics.roc_curve(
            label_binarize(Y_test, classes=list(LABELS.values())).ravel(),
            y_score.ravel())
        auc = metrics.roc_auc_score(
            label_binarize(Y_test, classes=list(LABELS.values())),
            label_binarize(Y_pred, classes=list(LABELS.values())),
            average="micro")
        pickle.dump((curve, auc), handle)

    # print(Y_pred.tolist())

    # Calculate accuracy, precision, and recall
    print("PRINTING STATISTICS")
    acc = accuracy_score(y_true=Y_test, y_pred=Y_pred)
    print("accuracy = " + str(acc))
    print("Macro Averging")
    prec = precision_score(y_true=Y_test, y_pred=Y_pred, average="macro")
    recall = recall_score(y_true=Y_test, y_pred=Y_pred, average="macro")
    print("F1 score = " +
          str(metrics.f1_score(Y_test, Y_pred, average="macro")))
    print("precision = " + str(prec))
    print("recall = " + str(recall))
    print("Micro Averging")
    prec = precision_score(y_true=Y_test, y_pred=Y_pred, average="micro")
    recall = recall_score(y_true=Y_test, y_pred=Y_pred, average="micro")
    print("F1 score = " +
          str(metrics.f1_score(Y_test, Y_pred, average="micro")))
    print("precision = " + str(prec))
    print("recall = " + str(recall))
Example #51
def main(args):

    with tf.Graph().as_default():

        with tf.Session() as sess:

            np.random.seed(seed=args.seed)

            if args.use_split_dataset:
                dataset_tmp = facenet.get_dataset(args.data_dir)
                train_set, test_set = split_dataset(
                    dataset_tmp, args.min_nrof_images_per_class,
                    args.nrof_train_images_per_class)
                if (args.mode == 'TRAIN'):
                    dataset = train_set
                elif (args.mode == 'CLASSIFY'):
                    dataset = test_set
            else:
                dataset = facenet.get_dataset(args.data_dir)

            # Check that there is at least one training image per class
            for cls in dataset:
                # an assert on a parenthesized tuple is always truthy;
                # the message belongs after a comma in the assert statement
                assert len(cls.image_paths) > 0, \
                    'There must be at least one image for each class in the dataset'

            paths, labels = facenet.get_image_paths_and_labels(dataset)

            print('Number of classes: %d' % len(dataset))
            print('Number of images: %d' % len(paths))

            # Load the model
            print('Loading feature extraction model')
            facenet.load_model(args.model)

            # Get input and output tensors
            images_placeholder = tf.get_default_graph().get_tensor_by_name(
                "input:0")
            embeddings = tf.get_default_graph().get_tensor_by_name(
                "embeddings:0")
            phase_train_placeholder = tf.get_default_graph(
            ).get_tensor_by_name("phase_train:0")
            embedding_size = embeddings.get_shape()[1]

            # Run forward pass to calculate embeddings
            print('Calculating features for images')
            nrof_images = len(paths)
            nrof_batches_per_epoch = int(
                math.ceil(1.0 * nrof_images / args.batch_size))
            emb_array = np.zeros((nrof_images, embedding_size))
            for i in range(nrof_batches_per_epoch):
                start_index = i * args.batch_size
                end_index = min((i + 1) * args.batch_size, nrof_images)
                paths_batch = paths[start_index:end_index]
                images = facenet.load_data(paths_batch, False, False,
                                           args.image_size)
                feed_dict = {
                    images_placeholder: images,
                    phase_train_placeholder: False
                }
                emb_array[start_index:end_index, :] = sess.run(
                    embeddings, feed_dict=feed_dict)

            classifier_filename_exp = os.path.expanduser(
                args.classifier_filename)

            #embfilename='20180402-114759'

            #if not os.path.exists('D:\\facenet\\descriptors\\'+embfilename):
            # os.mkdir('D:\\facenet\\descriptors\\'+embfilename)
            #np.savetxt('D:\\facenet\\descriptors\\'+embfilename+'\\log1.gz', emb_array, fmt='%.32f', delimiter=',', newline='\n')
            #print('Saved feature embeddings to file "%s"' % embfilename)

            if (args.mode == 'TRAIN'):
                # Train classifier
                print('Training classifier')
                model = SVC(kernel='linear', probability=True)
                model.fit(emb_array, labels)

                # Create a list of class names
                class_names = [cls.name.replace('_', ' ') for cls in dataset]

                # Saving classifier model
                with open(classifier_filename_exp, 'wb') as outfile:
                    pickle.dump((model, class_names), outfile)
                print('Saved classifier model to file "%s"' %
                      classifier_filename_exp)

            elif (args.mode == 'CLASSIFY'):
                # Classify images
                print('Testing classifier')
                with open(classifier_filename_exp, 'rb') as infile:
                    (model, class_names) = pickle.load(infile)

                print('Loaded classifier model from file "%s"' %
                      classifier_filename_exp)

                predictions = model.predict_proba(emb_array)
                best_class_indices = np.argmax(predictions, axis=1)
                best_class_probabilities = predictions[
                    np.arange(len(best_class_indices)), best_class_indices]

                for i in range(len(best_class_indices)):
                    print('%4d  %s: %.3f' %
                          (i, class_names[best_class_indices[i]],
                           best_class_probabilities[i]))

                accuracy = np.mean(np.equal(best_class_indices, labels))
                print('Accuracy: %.3f' % accuracy)

                labels = label_binarize(np.array(labels), classes=range(1, 21))
                best_class_indices = label_binarize(
                    np.array(best_class_indices), classes=range(1, 21))
                precision, recall, _ = precision_recall_curve(
                    labels.ravel(), best_class_indices.ravel())
                average_precision = average_precision_score(labels,
                                                            best_class_indices,
                                                            average="micro")
                print(
                    'Average precision score, micro-averaged over all classes: {0:0.2f}'
                    .format(average_precision))
                plt.figure()
                plt.step(recall, precision, color='b', alpha=0.2, where='post')
                plt.fill_between(recall,
                                 precision,
                                 step='post',
                                 alpha=0.2,
                                 color='b')
                plt.xlabel('Recall')
                plt.ylabel('Precision')
                plt.ylim([0.0, 1.05])
                plt.xlim([0.0, 1.0])
                plt.title(
                    'Average precision score, micro-averaged over all classes: AP={0:0.2f}'
                    .format(average_precision))
Example #52
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
#from sklearn.multiclass import OneVsOneClassifier
from scipy import interp
import os

x_train = np.loadtxt('D:/SJTU Lessons/X_train.txt', delimiter=' ')
x_test = np.loadtxt('D:/SJTU Lessons/X_test.txt', delimiter=' ')
# binarize the labels
Y_train = np.loadtxt('D:/SJTU Lessons/Y_train.txt', delimiter=' ')
Y_train = label_binarize(Y_train, classes=[0, 1, 2, 3])
Y_test = np.loadtxt('D:/SJTU Lessons/Y_test.txt', delimiter=' ')
Y_test = label_binarize(Y_test, classes=[0, 1, 2, 3])
# set the number of classes
n_classes = 4

# train the model and predict
# random seed
random_state = np.random.RandomState(0)
# dimensions
n_samples = 3532
n_features = 641

# Learn to predict each class against the other
# kernel options: 'linear', 'poly', 'rbf'
classifier = OneVsRestClassifier(
Example #53
def plot_precision_recall(y_truth, y_score, labels=None, pos_label=None):
    """ Plot precision recall curve

    Parameters
    -------------------------------------
    y_truth: array
        True labels for the belonging class. If labels are not
        {0, 1, ..., N}, then pos_label should be explicitly given.

    y_score: array
        Estimated probabilities or decision function.

    pos_label : int or str
        The label of the positive class. When pos_label=None,
        if y_true is in {0, 1, ..., N}, pos_label is set to 1,
        otherwise an error will be raised.

    Returns
    -------------------------------------
    out: matplotlib.figure.Figure
        Plot containing the precision recall curves
    """
    # get number of classes
    n_classes = len(np.unique(y_truth))

    if (labels is None and n_classes > 2) or (labels and len(labels) != n_classes):
        labels = []
        for i_class in range(n_classes):
            labels.append(f'class{i_class}')

    res = plt.figure()
    if n_classes <= 2:
        precision, recall, _ = precision_recall_curve(
            y_truth, y_score, pos_label=pos_label)
        plt.step(recall, precision, color='b', alpha=0.2, where='post')
        plt.fill_between(recall, precision, alpha=0.2, color='b', step='post')
        average_precision = average_precision_score(y_truth, y_score)
        plt.title(
            f'2-class Precision-Recall curve: AP={average_precision:0.2f}')
    else:
        cmap = plt.cm.get_cmap('tab10')
        precision, recall = (dict() for i_dict in range(2))
        # convert multi-class labels to multi-labels to obtain a curve for each class
        y_truth_multi = label_binarize(y_truth, classes=range(n_classes))
        for clas, lab in enumerate(labels):
            precision[clas], recall[clas], _ = precision_recall_curve(
                y_truth_multi[:, clas], y_score[:, clas], pos_label=pos_label)
            plt.step(recall[clas], precision[clas], color=cmap(clas), lw=1, where='post',
                     label=lab)
        # compute also micro average
        precision['micro'], recall['micro'], _ = precision_recall_curve(
            y_truth_multi.ravel(), y_score.ravel())
        plt.step(recall['micro'], precision['micro'], color='black', where='post',
                 linestyle='--', lw=1, label='average')
        average_precision = average_precision_score(
            y_truth_multi, y_score, average='micro')
        plt.title(
            f'Average precision score, micro-averaged over all classes: {average_precision:0.2f}')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    if n_classes > 2:
        plt.legend(loc='lower left')
        plt.grid()
    return res
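
A usage sketch under assumptions not in the original (iris data and a LogisticRegression named clf): pass the raw integer labels as y_truth and the full predict_proba matrix as y_score.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
fig = plot_precision_recall(y_te, clf.predict_proba(X_te),
                            labels=['setosa', 'versicolor', 'virginica'])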
Example #54
        print('auc: ', score)
        
        print("Features importance...")
        gain = model.feature_importance('gain')
        feat_imp = pd.DataFrame({'feature': model.feature_name(), 
                         'split': model.feature_importance('split'), 
                         'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
        print('Top 50 features:\n', feat_imp.head(50))
        
        del x_train, x_val, y_train, y_val, train_set, val_set
        gc.collect()
        
    return y_pred, oof_pred


y_pred, oof_pred = run_lgb(train, test, use_features)

y_one_hot = label_binarize(train['label'], classes=np.arange(4))
oof_one_hot = label_binarize(oof_pred.argmax(axis=1), classes=np.arange(4))
score = roc_auc_score(y_one_hot, oof_one_hot)
print('auc: ', score)

submission = pd.read_csv(DATA_DIR+'sample_submission.csv')
submission.label = y_pred.argmax(axis=1)
submission.label = submission.label.map(mapping_dict_inv)
submission.head()

submission.to_csv('submission_lgb.csv', index=False)

np.save('y_pred_lgb', y_pred)
np.save('oof_pred_lgb', oof_pred)
	for train_index, test_index in k_fold.split(train):
		#Get the fold train, train target, test, test target
		fold_train = train[train_index]
		fold_test = train[test_index]
		fold_target_train = target[train_index]
		fold_target_test = target[test_index]
		#Create the classifier model
		random_forest_classifier = RandomForestClassifier(n_estimators = experiment[0], max_features = experiment[1], n_jobs = cpu_count)
		#Fit the classifier model on the train data
		random_forest_classifier.fit(fold_train, fold_target_train)
		#Predict the results for test data
		predictions = random_forest_classifier.predict(fold_test)
		#Get the probability estimates used for AUROC
		scores = random_forest_classifier.predict_proba(fold_test)
		#Binarize the output target since it is a multi-class model and we get probability estimates for each available class
		binarized_outputs = label_binarize(fold_target_test, classes=output_classes)
		#Calculate the false positive rate and the true positive rate for each of the labels
		false_positive_rate = dict()
		true_positive_rate = dict()
		roc_auc = dict()
		for i in range(len(output_classes)):
			false_positive_rate[i], true_positive_rate[i], _ = roc_curve(binarized_outputs[:, i], scores[:, i])
			roc_auc[i] = auc(false_positive_rate[i], true_positive_rate[i])

		#Calculate the micro rates
		false_positive_rate["micro"], true_positive_rate["micro"], _ = roc_curve(binarized_outputs.ravel(), scores.ravel())
		roc_auc["micro"] = auc(false_positive_rate["micro"], true_positive_rate["micro"])
		all_false_positive_rate = numpy.unique(numpy.concatenate([false_positive_rate[i] for i in range(len(output_classes))]))

		#Interpolate all ROC curves of the different labels at these points
		mean_true_positive_rate = numpy.zeros_like(all_false_positive_rate)
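		#Assumed continuation (not in the source): interpolate each label's
		#curve on the aggregated FPR grid and average, mirroring the
		#macro-average pattern used elsewhere in this document
		for i in range(len(output_classes)):
			mean_true_positive_rate += numpy.interp(all_false_positive_rate, false_positive_rate[i], true_positive_rate[i])
		mean_true_positive_rate /= len(output_classes)
		false_positive_rate["macro"] = all_false_positive_rate
		true_positive_rate["macro"] = mean_true_positive_rate
		roc_auc["macro"] = auc(all_false_positive_rate, mean_true_positive_rate)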
        model.forward(is_train=False)
        raw_output = model.outputs[0].asnumpy()
        pred = Softmax(raw_output)
        if count == 1:
            ypred = raw_output
            ytrue = label.asnumpy()
        else:
            ypred = np.vstack((ypred, raw_output))
            ytrue = np.hstack((ytrue, label.asnumpy()))
        count = count + 1



colors = cycle(['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal'])
lw = 2
ytrue = label_binarize(ytrue, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
precision = dict()
recall = dict()
average_precision = dict()
precision["micro"], recall["micro"], _ = precision_recall_curve(ytrue.ravel(),ypred.ravel())
average_precision["micro"] = average_precision_score(ytrue, ypred,average="micro")
plt.clf()
plt.plot(recall["micro"], precision["micro"], color='gold', lw=lw,
label='Normal Training (area = {0:0.2f})'.format(average_precision["micro"]))



# # Dropout Training

result_config = []
for c in config:
    print('The following configuration will be used: {}'.format(c))

    result_cv = []
    # Go for LOPO cross-validation
    for idx_lopo_cv in range(len(id_patient_list)):

        # Display some information about the LOPO-CV
        print('Round #{} of the LOPO-CV'.format(idx_lopo_cv + 1))

        # Get the testing data
        testing_data = np.atleast_2d(data[idx_lopo_cv]).T
        testing_data = np.nan_to_num(testing_data)
        testing_label = label_binarize(label[idx_lopo_cv], [0, 255])
        print('Create the testing set ...')

        # Create the training data and label
        training_data = [
            arr for idx_arr, arr in enumerate(data) if idx_arr != idx_lopo_cv
        ]
        training_label = [
            arr for idx_arr, arr in enumerate(label) if idx_arr != idx_lopo_cv
        ]
        # Concatenate the data
        training_data = np.atleast_2d(np.hstack(training_data)).T
        training_data = np.nan_to_num(training_data)
        training_label = label_binarize(
            np.hstack(training_label).astype(int), [0, 255])
        print('Create the training set ...')
Example #58
# initialize thetas with labeled data
thetas = [[], [], []]  # (mean,var^-1,weight) per class
# Gaussian Bayes Classifier: MLE for feature distribution + w_y
for y in range(10):
    mask = y_train_lbl == y
    # [μ_y, Σ_y, w_y]
    thetas[0].append(np.mean(X_train_lbl[mask], axis=0))  # μ_y (128-dim)
    thetas[1].append(np.linalg.inv(np.cov(
        X_train_lbl[mask], rowvar=False)))  # Σ_y^-1 (128x128-dim)
    thetas[2].append(
        counts[y] / labeled_rows
    )  #w_y = mean(P(z=y | x, Σ, μ)) = #y-labeled data/#labeled data

gammas = np.zeros((labeled_rows + unlabeled_rows, 10),
                  dtype='float64')  #TODO: empty?
gammas[0:labeled_rows] = preprocessing.label_binarize(
    y_train_lbl, classes=range(10))  # TODO: sparse_output=True?
predicted_class = np.empty((labeled_rows + unlabeled_rows))
predicted_class[0:labeled_rows] = y_train_lbl

# https://people.duke.edu/~ccc14/sta-663/EMAlgorithm.html
tol = 0.001
max_iter = 100
# n = all_rows = X_train_all.shape[0]
# for P(x)
gm = GaussianMixture(max_iter=1,
                     n_components=10,
                     weights_init=thetas[2],
                     means_init=thetas[0],
                     precisions_init=thetas[1])
gm.fit(X_train_lbl)  #needed for predict
print(gm.get_params()['means_init'] == thetas[0])
Example #59
def plot_roc(model, x_train, y_train):
    x_train = np.array(x_train, ndmin=2)
    y_train = np.array(y_train, ndmin=2)
    if (x_train.shape[0] != y_train.shape[0]):
        y_train = y_train.T
    if (x_train.shape[0] != y_train.shape[0]):
        print("x_train and y_train do not match in lenght: ", x_train.shape,
              " vs ", y_train.shape)

    x_train, x_test, y_train, y_test = train_test_split(x_train,
                                                        y_train,
                                                        test_size=0.2,
                                                        random_state=123)

    # binarize classes

    classes = list(range(int(np.max(y_train)) + 1))

    y_test_bin = label_binarize(y_test, classes=classes)
    n_classes = len(classes)

    predictions = model.predict(x_test)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], predictions[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test_bin.ravel(),
                                              predictions.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure()
    plt.plot(fpr["micro"],
             tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
             ''.format(roc_auc["micro"]),
             color='deeppink',
             linestyle=':',
             linewidth=4)

    plt.plot(fpr["macro"],
             tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
             ''.format(roc_auc["macro"]),
             color='navy',
             linestyle=':',
             linewidth=4)

    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i],
                 tpr[i],
                 color=color,
                 lw=2,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                 ''.format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(
        'Some extension of Receiver operating characteristic to multi-class')
    plt.legend(loc="lower right")
    plt.show()
Example #60
def print_model_scores(model, y_test, X_test_scaled, X_train_scaled, y_train):

    from sklearn.metrics import classification_report, confusion_matrix

    y_predicted_test = model.predict(X_test_scaled)
    y_predicted_train = model.predict(X_train_scaled)

    print("Train Accuracy : %.4f " % (model.score(X_train_scaled, y_train)))
    print("Test Accuracy : %.4f " % (model.score(X_test_scaled, y_test)))

    print("Confusion matrix Train: ")
    print(confusion_matrix(y_train, y_predicted_train))

    print("Confusion matrix Test: ")
    print(confusion_matrix(y_test, y_predicted_test))

    unique_classes = [1, 2, 10, 15]

    probabilities_test = model.predict_proba(X_test_scaled)
    probabilities_train = model.predict_proba(X_train_scaled)

    # Binarize the output
    y_test_binarized = label_binarize(y_test, classes=[1, 2, 10, 15])
    y_train_binarized = label_binarize(y_train, classes=[1, 2, 10, 15])
    n_classes = y_test_binarized.shape[1]

    # Compute ROC curve and ROC area for each class
    fpr_train = dict()
    tpr_train = dict()
    roc_auc_train = dict()
    fpr_test = dict()
    tpr_test = dict()
    roc_auc_test = dict()
    for i in range(n_classes):
        fpr_test[i], tpr_test[i], _ = roc_curve(y_test_binarized[:, i],
                                                probabilities_test[:, i])
        roc_auc_test[i] = auc(fpr_test[i], tpr_test[i])

        fpr_train[i], tpr_train[i], _ = roc_curve(y_train_binarized[:, i],
                                                  probabilities_train[:, i])
        roc_auc_train[i] = auc(fpr_train[i], tpr_train[i])

    ## Calculate MultiClass AUC
    # First aggregate all false positive rates
    all_fpr_train = np.unique(
        np.concatenate([fpr_train[i] for i in range(n_classes)]))
    all_fpr_test = np.unique(
        np.concatenate([fpr_test[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr_train = np.zeros_like(all_fpr_train)
    for i in range(n_classes):
        mean_tpr_train += interp(all_fpr_train, fpr_train[i], tpr_train[i])
    mean_tpr_test = np.zeros_like(all_fpr_test)
    for i in range(n_classes):
        mean_tpr_test += interp(all_fpr_test, fpr_test[i], tpr_test[i])

    # Finally average it and compute AUC
    mean_tpr_train /= n_classes
    mean_tpr_test /= n_classes

    fpr_train["macro"] = all_fpr_train
    tpr_train["macro"] = mean_tpr_train
    roc_auc_train["macro"] = auc(fpr_train["macro"], tpr_train["macro"])
    print("AUC Train: {:.4f}".format(roc_auc_train['macro']))

    fpr_test["macro"] = all_fpr_test
    tpr_test["macro"] = mean_tpr_test
    roc_auc_test["macro"] = auc(fpr_test["macro"], tpr_test["macro"])
    print("AUC Test: {:.4f}".format(roc_auc_test['macro']))
    print("\n")

    return ({
        'Test_Accuracy': round(model.score(X_test_scaled, y_test), 4),
        'Train_Accuracy': round(model.score(X_train_scaled, y_train), 4),
        'Train AUC': round(roc_auc_train['macro'], 4),
        'Test AUC': round(roc_auc_test['macro'], 4)
    })
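
A hypothetical call sketch on synthetic data (assumes the module-level names the function relies on — np, label_binarize, roc_curve, auc, interp — are in scope, as in the snippets above):

from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = rng.randn(400, 5)
y = rng.choice([1, 2, 10, 15], size=400)  # matches the hard-coded class list
model = LogisticRegression(max_iter=1000).fit(X[:300], y[:300])
scores = print_model_scores(model, y[300:], X[300:], X[:300], y[:300])
print(scores)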